def readSegFile():
	jieba.load_userdict("../../data/jieba_userdict.txt")
	infile = open('../../data/all_cn_seg_nwi_clean.txt','rb')
	outfile = open('../../data/all_word.txt','wb')
	stopword_set = text_process.getStopword('../../data/stopword.txt')
	word_set = set([])
	word_fre_dict = {}
	row_counter = 0
	for row in infile:
		row_counter += 1
		print row_counter
		row = row.strip().decode('utf-8')
		items = row.split('<@>')
		app_name = items[1]
		brief_seg = items[2].split()
		title_seg = jieba.cut(app_name)
		for title in title_seg:
			if text_process.isChinese(title) and title not in stopword_set:
				word_set.add(title)
				word_fre_dict.setdefault(title,0)
				word_fre_dict[title] += 1
		for brief in brief_seg:
			if text_process.isChinese(brief) and brief not in stopword_set:
				word_set.add(brief)
				word_fre_dict.setdefault(brief,0)
				word_fre_dict[brief] += 1

	sorted_list = sorted(word_fre_dict.items(),key=lambda p:p[1],reverse=True)
	for val in sorted_list:
		if val[1] >= 10:
			outfile.write(val[0]+','+str(val[1])+'\r\n')
def readSegFile():
	infile = open('../../data/all_cn_seg_nwi_clean.txt','rb')
	outfile = open('../../data/candidate_title_word.txt','wb')
	stopword_set = text_process.getStopword('../../data/stopword.txt')
	word_set = set([])
	word_fre_dict = {}
	row_counter = 0
	for row in infile:
		row_counter += 1
		print row_counter
		row = row.strip().decode('utf-8')
		items = row.split('<@>')
		app_name = items[1]
		brief_seg = items[2].split()
		title_seg = jieba.cut(app_name)
		for title in title_seg:
			if text_process.isChinese(title) and title not in stopword_set:
				word_set.add(title)
				word_fre_dict.setdefault(title,0)
				word_fre_dict[title] += 1
		# for brief in brief_seg:
		# 	word_set.add(brief)
	for word in word_fre_dict.keys():
		if word_fre_dict[word] >= 10:
			outfile.write(word+'\r\n')
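
# The examples in this listing call into a local helper module named text_process
# that is not shown here.  A hypothetical minimal sketch of the two helpers they
# rely on (getStopword and isChinese), written to match how they are called above:
def getStopword(stopword_path):
	# one stopword per line, utf-8 encoded
	stopword_set = set([])
	for line in open(stopword_path, 'rb'):
		word = line.strip().decode('utf-8')
		if word:
			stopword_set.add(word)
	return stopword_set

def isChinese(word):
	# non-empty and every character falls in the CJK Unified Ideographs block
	if len(word) == 0:
		return False
	for ch in word:
		if not (u'\u4e00' <= ch <= u'\u9fff'):
			return False
	return True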
Example #3
def recommendTag(category_parent_dict):
    outfile = open('tag_recommend_result.txt', 'wb')
    print 'loading jieba userdict'
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    print 'loading stopword'
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    print 'reading app json'
    infile = open('../data/' + category_path + '.json', 'rb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["soft_id"])
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]
        app_download = int(json_obj["download_times"])
        outfile.write(
            str(app_id) + '<@>' + app_name + '<@>' + app_brief + '<@>')
        tag_recommend_set = set([])
        for category in category_parent_dict.keys():
            if category in app_name or category in app_brief:
                for parent_tuple in category_parent_dict[category]:
                    if parent_tuple[1] == 0:
                        tag_recommend_set.add(parent_tuple[0])
                    else:
                        tag_recommend_set.add(category)
                    if parent_tuple[1] == 2:
                        tag_recommend_set.add(parent_tuple[0])

        outfile.write(','.join(tag_recommend_set))
        outfile.write('\r\n')
Example #4
def recommendTag(category_parent_dict):
	outfile = open('tag_recommend_result.txt','wb')
	print 'loading jieba userdict'
	jieba.load_userdict('../../../data/jieba_userdict.txt')
	print 'loading stopword'
	stopword_set = text_process.getStopword('../../../data/stopword.txt')
	print 'reading app json'
	infile = open('../data/'+category_path+'.json','rb')
	for row in infile:
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["soft_id"])
		app_name = json_obj["soft_name"]
		app_brief = json_obj["soft_brief"]
		app_download = int(json_obj["download_times"])
		outfile.write(str(app_id)+'<@>'+app_name+'<@>'+app_brief+'<@>')
		tag_recommend_set = set([])
		for category in category_parent_dict.keys():
			if category in app_name or category in app_brief:
				for parent_tuple in category_parent_dict[category]:
					if parent_tuple[1] == 0:
						tag_recommend_set.add(parent_tuple[0])
					else:
						tag_recommend_set.add(category)
					if parent_tuple[1] == 2:
						tag_recommend_set.add(parent_tuple[0])

		outfile.write(','.join(tag_recommend_set))
		outfile.write('\r\n')
Example #5
def readSegFile():
    jieba.load_userdict("../../data/jieba_userdict.txt")
    infile = open('../../data/all_cn_seg_nwi_clean.txt', 'rb')
    outfile = open('../../data/all_word.txt', 'wb')
    stopword_set = text_process.getStopword('../../data/stopword.txt')
    word_set = set([])
    word_fre_dict = {}
    row_counter = 0
    for row in infile:
        row_counter += 1
        print row_counter
        row = row.strip().decode('utf-8')
        items = row.split('<@>')
        app_name = items[1]
        brief_seg = items[2].split()
        title_seg = jieba.cut(app_name)
        for title in title_seg:
            if text_process.isChinese(title) and title not in stopword_set:
                word_set.add(title)
                word_fre_dict.setdefault(title, 0)
                word_fre_dict[title] += 1
        for brief in brief_seg:
            if text_process.isChinese(brief) and brief not in stopword_set:
                word_set.add(brief)
                word_fre_dict.setdefault(brief, 0)
                word_fre_dict[brief] += 1

    sorted_list = sorted(word_fre_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for val in sorted_list:
        if val[1] >= 10:
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')
Example #6
def mineAbbreviation():
    print 'mining abbreviation'
    jieba.load_userdict("../../../data/jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    word2vec_model = Word2Vec.load('../../../data/word2vec.model')
    word_set = getWords()
    word_syn_dict = {}
    for word in word_set:
        word_syn_dict.setdefault(word, set([word]))
        if len(word) != 2:
            continue
        try:
            for simi_word_tuple in word2vec_model.most_similar(positive=[word],
                                                               topn=20):
                simi_word = simi_word_tuple[0]
                simi_value = simi_word_tuple[1]
                reverse_word = word[1] + word[0]
                if reverse_word == simi_word:
                    pass
                else:
                    if len(set(word) & set(simi_word)) != len(
                            word
                    ) or simi_value < 0.5 or word in simi_word or reverse_word in simi_word:
                        continue
                word_syn_dict[word].add(simi_word)
        except:
            pass
            # print word

    outfile = open('abbreviation.txt', 'wb')
    for word in word_syn_dict.keys():
        if len(word_syn_dict[word]) >= 2:
            outfile.write(word + '@' + ','.join(word_syn_dict[word]) + '\r\n')
def clean(category_id,category_crawl_dict,category_set):
	print 'cleaning'
	stop_word_set = text_process.getStopword(data_path+'stopword.txt')
	for category in category_crawl_dict.keys():
		word_fre_dict = {}
		outfile = open('wiki_search/'+str(category_id)+'_'+category+'.txt','wb')
		print category
		for page in category_crawl_dict[category]:
			abstract = page['abstract']
			stat(6,category,abstract,word_fre_dict,category_set,stop_word_set)
			abstract_link = page['abstract_link']
			stat2(10,category,abstract_link,word_fre_dict,category_set,stop_word_set)
			abstract_bold = page['abstract_bold']
			stat2(8,category,abstract_bold,word_fre_dict,category_set,stop_word_set)
			if 'wiki_category' in page.keys():
				wiki_category = page['wiki_category']
				stat2(20,category,wiki_category,word_fre_dict,category_set,stop_word_set)
			if 'content' in page.keys():
				content = page['content']
				stat(1,category,content,word_fre_dict,category_set,stop_word_set)
				content_link = page['content_link']
				stat2(4,category,content_link,word_fre_dict,category_set,stop_word_set)
				content_bold = page['content_bold']
				stat2(2,category,content_bold,word_fre_dict,category_set,stop_word_set)

		sorted_list = sorted(word_fre_dict.items(),key=lambda p:p[1],reverse=True)
		for val in sorted_list:
			outfile.write(val[0]+','+str(val[1])+'\r\n')
def mineAbbreviation():
	print 'mining abbreviation'
	jieba.load_userdict("../../../data/jieba_userdict.txt")
	stopword_set = text_process.getStopword('../../../data/stopword.txt')
	word2vec_model = Word2Vec.load('../../../data/word2vec.model')
	word_set = getWords()
	word_syn_dict = {}
	for word in word_set:
		word_syn_dict.setdefault(word,set([word]))
		if len(word) != 2:
			continue
		try:
			for simi_word_tuple in word2vec_model.most_similar(positive=[word],topn=20):
				simi_word = simi_word_tuple[0]
				simi_value = simi_word_tuple[1]
				reverse_word = word[1]+word[0]
				if reverse_word == simi_word:
					pass
				else:	
					if len(set(word)&set(simi_word)) != len(word) or simi_value < 0.5 or word in simi_word or reverse_word in simi_word:
						continue
				word_syn_dict[word].add(simi_word)
		except:
			pass
			# print word

	outfile = open('abbreviation.txt','wb')
	for word in word_syn_dict.keys():
		if len(word_syn_dict[word])>=2:
			outfile.write(word+'@'+','.join(word_syn_dict[word])+'\r\n')	
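
# mineAbbreviation() loads a pre-trained gensim model from
# ../../../data/word2vec.model, but the training step is not part of this
# listing.  A plausible sketch of how such a model could be built from the
# segmented corpus used by the other examples; the corpus path and
# hyper-parameters are assumptions, not the authors' actual settings.
from gensim.models import Word2Vec

def trainWord2vec():
	sentences = []
	for row in open('../../data/all_cn_seg_nwi_clean.txt', 'rb'):
		items = row.strip().decode('utf-8').split('<@>')
		if len(items) >= 3:
			# items[2] holds the pre-segmented, space-separated brief text
			sentences.append(items[2].split())
	# gensim < 4.0 API, matching the most_similar() calls above
	model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
	model.save('../../../data/word2vec.model')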
Example #9
def getTrainTest(category_name,category_parent_dict,category_child_dict,category_synonyms_dict,indicator_set,comment_category_set,ambiguation_dict):
	# main category name (u"软件" = "software")
	main_category = u"软件"

	jieba.load_userdict('../../../data/jieba_userdict.txt')
	stopword_set = text_process.getStopword('../../../data/stopword.txt')

	node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)
	candidate_tag_set,candidate_delegate_tag_set = rule_base.getCandidateTag(main_category,node_children_dict,category_synonyms_dict)
	level_category_dict = rule_base.createLevelCategoryDict(main_category,candidate_tag_set,category_parent_dict,category_child_dict,category_synonyms_dict)
	# for level in level_category_dict.keys():
	# 	print level
	# 	print ' '.join(level_category_dict[level])

	dictionary = corpora.Dictionary([list(candidate_delegate_tag_set)])
	valcabulary_size = len(dictionary)

	# iterate over the apps under the main category
	infile = open('../data/'+category_name+'.json','rb')
	X_train = []
	X_test = []
	X_test_info = []
	all_counter = 0
	train_counter = 0
	for row in infile:

		all_counter += 1
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["id"])
		app_name = json_obj["title"]
		app_brief = json_obj["brief"]
		app_tag = json_obj["tags"]
		app_download = int(json_obj["download_times"])
		app_brief_seg = [word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)]
		app_name_brief = app_name+" "+app_brief
		app_name_brief += " "+rule_base.grabEnglish(app_name_brief)

		tag_recommend_set = set([])

		for tag in candidate_tag_set:
			if tag in app_name_brief:
				tag_recommend_set.add(category_synonyms_dict[tag][0])

		doc = dictionary.doc2bow(list(tag_recommend_set))
		x = [0 for i in range(valcabulary_size)]
		for val in doc:
			index = val[0]
			x[index] = val[1]
		if u"视频" in app_tag or u"音乐" in app_tag and app_download >= 1000:
			train_counter += 1
			X_train.append(x)
		else:
			X_test.append(x)
			X_test_info.append([app_name,' '.join(app_brief_seg)])

	print 1.0*train_counter/all_counter
	return X_train,X_test,X_test_info
def generateCandidateCategory(category_path):
    print 'loading file'
    jieba.load_userdict(data_path + "jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')

    print 'reading file'
    word_title_dict = {}
    word_brief_dict = {}
    word_all_dict = {}
    infile = open('../data/' + category_path + '.json', 'rb')
    outfile = open('candidate_category/' + str(category_path) + '.txt', 'wb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]

        seg_title_list = jieba.cut(app_name)
        seg_brief_list = jieba.cut(app_brief)

        for seg_title in seg_title_list:
            if text_process.isChinese(
                    seg_title) and seg_title not in stopword_set:
                word_title_dict.setdefault(seg_title, 0)
                word_title_dict[seg_title] += 1

        for seg_brief in seg_brief_list:
            if text_process.isChinese(
                    seg_brief) and seg_brief not in stopword_set:
                word_brief_dict.setdefault(seg_brief, 0)
                word_brief_dict[seg_brief] += 1

    print 'sorting'
    sorted_list = sorted(word_title_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for item in sorted_list:
        if item[1] >= 10:
            word_all_dict.setdefault(item[0], 0)
            word_all_dict[item[0]] += item[1]
            # outfile.write(item[0]+','+str(item[1])+'\r\n')

    sorted_list = sorted(word_brief_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for item in sorted_list:
        if item[1] >= 50:
            word_all_dict.setdefault(item[0], 0)
            word_all_dict[item[0]] += item[1]
            # outfile.write(item[0]+','+str(item[1])+'\r\n')

    sorted_list = sorted(word_all_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for item in sorted_list:
        outfile.write(item[0] + ',' + str(item[1]) + '\r\n')
def calculateCoverage(category_parent_dict, category_stat_dict):
    print 'loading jieba userdict'
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    print 'loading stopword'
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    print 'reading app json'
    infile = open('../data/' + category_path + '.json', 'rb')
    all_app_counter = 0
    print u'download-times filter threshold: ' + str(download_times_filter)
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["soft_id"])
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]
        app_download = int(json_obj["download_times"])

        if app_download < download_times_filter:
            continue

        all_app_counter += 1

        # if u'设备' in app_brief:
        # 	print app_brief

        for delegate_category in category_stat_dict.keys():
            for relevant_category in category_stat_dict[delegate_category][0]:
                if relevant_category in app_name or relevant_category in app_brief:
                    if relevant_category != delegate_category:
                        # if strongly connected, the root node does not need to appear explicitly
                        if isStrongConnect(1, delegate_category,
                                           relevant_category,
                                           category_parent_dict):
                            category_stat_dict[delegate_category][1].add(
                                app_id)
                            break
                        elif delegate_category in app_name or delegate_category in app_brief:
                            category_stat_dict[delegate_category][1].add(
                                app_id)
                            break
                    else:
                        category_stat_dict[delegate_category][1].add(app_id)
                        break

    print u'total apps after filtering: ' + str(all_app_counter)

    top_coverage_category_info_dict = {}
    for iter_num in range(100):
        print 'iteration: ' + str(iter_num)
        coverage_ratio = rankTopCoverage(top_coverage_category_info_dict,
                                         category_stat_dict, all_app_counter)
        # stop once the cumulative coverage ratio is reached
        if coverage_ratio >= 0.99:
            break
Example #12
def clean(category_path, category_crawl_dict, category_set):
    print 'cleaning'
    stop_word_set = text_process.getStopword(data_path + 'stopword.txt')
    for category in category_crawl_dict.keys():
        word_score_dict = {}
        outfile = open('../clean_data/' + str(category_path) + '_' + category,
                       'wb')
        print category
        for page in category_crawl_dict[category]:
            offset_weight = 1.0 * (5 - int(page['offset'])) / 5

            title = page['title']
            statRawText(10 * offset_weight, category, title, word_score_dict,
                        category_set, stop_word_set)

            if 'abstract' in page.keys():
                abstract = page['abstract']
                statRawText(6 * offset_weight, category, abstract,
                            word_score_dict, category_set, stop_word_set)
            if 'abstract_link' in page.keys():
                abstract_link = page['abstract_link']
                statTextList(10 * offset_weight, category, abstract_link,
                             word_score_dict, category_set, stop_word_set)
            if 'abstract_bold' in page.keys():
                abstract_bold = page['abstract_bold']
                statTextList(8 * offset_weight, category, abstract_bold,
                             word_score_dict, category_set, stop_word_set)

            if 'wiki_category' in page.keys():
                wiki_category = page['wiki_category']
                statTextList(10 * offset_weight, category, wiki_category,
                             word_score_dict, category_set, stop_word_set)

            if 'content' in page.keys():
                content = page['content']
                statRawText(1 * offset_weight, category, content,
                            word_score_dict, category_set, stop_word_set)
            if 'content_link' in page.keys():
                content_link = page['content_link']
                statTextList(4 * offset_weight, category, content_link,
                             word_score_dict, category_set, stop_word_set)
            if 'content_bold' in page.keys():
                content_bold = page['content_bold']
                statTextList(2 * offset_weight, category, content_bold,
                             word_score_dict, category_set, stop_word_set)

        sorted_list = sorted(word_score_dict.items(),
                             key=lambda p: p[1],
                             reverse=True)
        for val in sorted_list:
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')
def getCorpus(category_name):

	app_lable_dict = {10743:1,1002128:1,47:1,498:1,550:-1,48:-1,490:-1,761:-1,101108:-1,101916:-1}

	x_train = []
	y_train = []
	x_test = []

	jieba.load_userdict('../../../data/jieba_userdict.txt')
	stopword_set = text_process.getStopword('../../../data/stopword.txt')

	doc_app_id = []
	docs = []
	id_name_dict = {}
	infile = open('corpus/'+category_name+'.json','rb')
	for row in infile:
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["id"])
		app_name = json_obj["title"]
		app_brief = json_obj["brief"]
		app_download = int(json_obj["download_times"])
		app_brief_seg = [word for word in jieba.cut(app_name+" "+app_brief) if word not in stopword_set and text_process.isChinese(word)]

		if len(app_brief_seg) <= 10 and app_download <= 100:
			continue

		doc_app_id.append(app_id)
		id_name_dict[app_id] = app_name
		docs.append(app_brief_seg)

	dictionary = corpora.Dictionary(docs)
	corpus = [dictionary.doc2bow(text) for text in docs]

	for i in range(len(corpus)):
		doc = corpus[i]
		x = [0 for n in range(len(dictionary))]
		for val in doc:
			x[val[0]] = val[1]

		app_id = doc_app_id[i]
		if app_id in app_lable_dict.keys():
			x_train.append(x)
			if app_lable_dict[app_id] == 1:
				y_train.append(1)
			else:
				y_train.append(-1)
		else:
			x_test.append(x)

	return x_train,x_test,y_train,doc_app_id,id_name_dict
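
# getCorpus() returns dense bag-of-words vectors plus +1/-1 labels for the seed
# apps in app_lable_dict, so its output can be fed straight into an ordinary
# classifier.  A minimal usage sketch with scikit-learn; the linear-kernel SVC
# is an illustrative choice, not the classifier actually used by the author.
from sklearn.svm import SVC

def classifyApps(category_name):
	x_train, x_test, y_train, doc_app_id, id_name_dict = getCorpus(category_name)
	clf = SVC(kernel='linear')
	clf.fit(x_train, y_train)
	predictions = clf.predict(x_test)
	print '%d labelled apps, %d apps classified' % (len(y_train), len(predictions))
	return predictions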
Example #14
def clean(category_path,category_crawl_dict,category_set):
	print 'cleaning'
	stop_word_set = text_process.getStopword(data_path+'stopword.txt')
	for category in category_crawl_dict.keys():
		word_score_dict = {}
		outfile = open('../clean_data/'+str(category_path)+'/'+category,'wb')
		print category
		for page in category_crawl_dict[category]:
			offset_weight = 1.0*(5-int(page['offset']))/5
			
			title = page['title']
			content = page['content']
			abstract = page['abstract']

			# content_seg_list = jieba.cut(content)
			# abstract_seg_list = jieba.cut(abstract)
			# all_seg_set = set(content_seg_list) | set(abstract_seg_list)
			# intersec_num = 1.0*len(all_seg_set & category_set)/len(category_set)
			# print '------------'
			# print title
			# print 'abstract: '+abstract
			# print 'content: '+content
			# print ' '.join(all_seg_set & category_set)
			# print intersec_num
			# print '------------'
			# offset_weight = offset_weight*intersec_num
			# if intersec_num <= 0.01:
			# 	continue

			statRawText(10*offset_weight,category,title,word_score_dict,category_set,stop_word_set)
			
			statRawText(6*offset_weight,category,abstract,word_score_dict,category_set,stop_word_set)
			abstract_link = page['abstract_link']
			statTextList(10*offset_weight,category,abstract_link,word_score_dict,category_set,stop_word_set)
			abstract_bold = page['abstract_bold']
			statTextList(8*offset_weight,category,abstract_bold,word_score_dict,category_set,stop_word_set)
			
			if 'tags' in page.keys():
				tags = page['tags']
				statTextList(10*offset_weight,category,tags,word_score_dict,category_set,stop_word_set)
			
			statRawText(1*offset_weight,category,content,word_score_dict,category_set,stop_word_set)
			content_link = page['content_link']
			statTextList(4*offset_weight,category,content_link,word_score_dict,category_set,stop_word_set)
			content_bold = page['content_bold']
			statTextList(2*offset_weight,category,content_bold,word_score_dict,category_set,stop_word_set)

		sorted_list = sorted(word_score_dict.items(),key=lambda p:p[1],reverse=True)
		for val in sorted_list:
			outfile.write(val[0]+','+str(val[1])+'\r\n')
Example #15
def mineKeywordCombination(category_id, query_keyword):

    # main category name
    main_category = idToName(category_id)

    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')

    combination_fre_dict = {}

    outfile = open('keyword_combination.txt', 'wb')
    # iterate over the apps under the main category
    infile = open('../data/' + str(category_id) + '.json', 'rb')
    for row in infile:

        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_name_seg = [
            word for word in jieba.cut(app_name)
            if word not in stopword_set and text_process.isChinese(word)
        ]
        app_brief_seg = [
            word for word in jieba.cut(app_brief)
            if word not in stopword_set and text_process.isChinese(word)
        ]
        app_name_brief = app_name + " " + app_brief

        app_name_combination_dict = combineNeighborWord(
            app_name_seg, query_keyword)
        for word in app_name_combination_dict.keys():
            combination_fre_dict.setdefault(word, 0)
            combination_fre_dict[word] += app_name_combination_dict[word]

        app_brief_combination_dict = combineNeighborWord(
            app_brief_seg, query_keyword)
        for word in app_brief_combination_dict.keys():
            combination_fre_dict.setdefault(word, 0)
            combination_fre_dict[word] += app_brief_combination_dict[word]

    sorted_list = sorted(combination_fre_dict.items(),
                         key=lambda p: p[1],
                         reverse=True)
    for val in sorted_list:
        if val[1] >= 2:
            print val[0] + ',' + str(val[1])
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')
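
# combineNeighborWord() is an external helper that is never defined in this
# listing.  Judging from how mineKeywordCombination() uses its result, it
# returns a {combined word: count} dict; the version below is only a guess at
# that behaviour (concatenating the query keyword with each neighbouring
# segment), not the original implementation.
def combineNeighborWord(seg_list, query_keyword):
    combination_dict = {}
    for i in range(len(seg_list)):
        if seg_list[i] != query_keyword:
            continue
        if i > 0:
            left = seg_list[i - 1] + query_keyword
            combination_dict[left] = combination_dict.get(left, 0) + 1
        if i + 1 < len(seg_list):
            right = query_keyword + seg_list[i + 1]
            combination_dict[right] = combination_dict.get(right, 0) + 1
    return combination_dict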
def calculateCoverage(category_parent_dict,category_stat_dict):
	print 'loading jieba userdict'
	jieba.load_userdict('../../../data/jieba_userdict.txt')
	print 'loading stopword'
	stopword_set = text_process.getStopword('../../../data/stopword.txt')
	print 'reading app json'
	infile = open('../data/'+category_path+'.json','rb')
	all_app_counter = 0
	print u'download-times filter threshold: '+str(download_times_filter)
	for row in infile:
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["id"])
		app_name = json_obj["title"]
		app_brief = json_obj["brief"]
		app_download = int(json_obj["download_times"])

		if app_download < download_times_filter:
			continue

		all_app_counter += 1

		# if u'设备' in app_brief:
		# 	print app_brief

		for delegate_category in category_stat_dict.keys():
			for relevant_category in category_stat_dict[delegate_category][0]:
				if relevant_category in app_name or relevant_category in app_brief:
					if relevant_category != delegate_category:
						# if strongly connected, the root node does not need to appear explicitly
						if isStrongConnect(1,delegate_category,relevant_category,category_parent_dict):
							category_stat_dict[delegate_category][1].add(app_id)
							break
						elif delegate_category in app_name or delegate_category in app_brief:
							category_stat_dict[delegate_category][1].add(app_id)
							break
					else:
						category_stat_dict[delegate_category][1].add(app_id)
						break


	print u'total apps after filtering: '+str(all_app_counter)

	top_coverage_category_info_dict = {}
	for iter_num in range(100):
		print 'iteration: '+str(iter_num)
		coverage_ratio = rankTopCoverage(top_coverage_category_info_dict,category_stat_dict,all_app_counter)
		# stop once the cumulative coverage ratio is reached
		if coverage_ratio >= 0.90:
			break
Example #17
def tf(category_id, category_path, query_category, category_set,
       app_category_dict, app_tag_dict):
    print '-extracting feature'
    infile = open(data_path + 'all_cn_seg_nwi_clean.txt', 'rb')
    stopword_set = text_process.getStopword(data_path + 'stopword.txt')
    outfile_title = open('title_tf/' + str(category_path) + '.csv', 'wb')
    outfile_tag = open('tag_tf/' + str(category_path) + '.csv', 'wb')
    title_tf_dict = {}
    tag_tf_dict = {}
    for category in category_set:
        title_tf_dict.setdefault(category, 0)
        tag_tf_dict.setdefault(category, 0)
    row_index = 0
    for row in infile:
        row_index += 1
        items = row.strip().split("<@>")
        try:
            app_id = int(items[0])
            app_name = items[1].decode('utf-8')
            seg_brief_list = items[2].decode('utf-8').split()
        except:
            continue
        if app_category_dict[app_id][1] != category_id:
            continue
        if query_category != "":
            if not isRelevant(query_category, app_name, seg_brief_list):
                continue

        title(app_name, title_tf_dict)
        # brief(seg_brief_list,tf_dict)
        tag(app_id, app_tag_dict, tag_tf_dict)

    max_title_tf = max(title_tf_dict.values())
    print 'sorting'
    title_sorted_list = sorted(title_tf_dict.items(),
                               key=lambda p: p[1],
                               reverse=True)
    for val in title_sorted_list:
        outfile_title.write(val[0] + ',' + str(1.0 * val[1] / max_title_tf) +
                            '\r\n')

    max_tag_tf = max(tag_tf_dict.values())
    tag_sorted_list = sorted(tag_tf_dict.items(),
                             key=lambda p: p[1],
                             reverse=True)
    for val in tag_sorted_list:
        outfile_tag.write(val[0] + ',' + str(1.0 * val[1] / max_tag_tf) +
                          '\r\n')
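
# title() and tag() are external helpers not included in this listing.  A
# hypothetical sketch of what they might do, inferred only from how tf()
# initialises and normalises the dictionaries they fill: count a candidate
# category whenever it occurs in the app title or in the app's tag list.
def title(app_name, title_tf_dict):
    for category in title_tf_dict.keys():
        if category in app_name:
            title_tf_dict[category] += 1

def tag(app_id, app_tag_dict, tag_tf_dict):
    for app_tag in app_tag_dict.get(app_id, []):
        if app_tag in tag_tf_dict:
            tag_tf_dict[app_tag] += 1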
def generateCandidateCategory(category_path):
	print 'loading file'
	jieba.load_userdict(data_path+"jieba_userdict.txt")
	stopword_set = text_process.getStopword('../../../data/stopword.txt')

	print 'reading file'
	word_title_dict = {}
	word_brief_dict = {}
	word_all_dict = {}
	infile = open('../data/'+category_path+'.json','rb')
	outfile = open('candidate_category/'+str(category_path)+'.txt','wb')
	for row in infile:
		json_obj = json.loads(row.strip())
		app_name = json_obj["soft_name"]
		app_brief = json_obj["soft_brief"]

		seg_title_list = jieba.cut(app_name)
		seg_brief_list = jieba.cut(app_brief)

		for seg_title in seg_title_list:
			if text_process.isChinese(seg_title) and seg_title not in stopword_set:
				word_title_dict.setdefault(seg_title,0)
				word_title_dict[seg_title] += 1

		for seg_brief in seg_brief_list:
			if text_process.isChinese(seg_brief) and seg_brief not in stopword_set: 
				word_brief_dict.setdefault(seg_brief,0)
				word_brief_dict[seg_brief] += 1

	print 'sorting'
	sorted_list = sorted(word_title_dict.items(),key=lambda p:p[1],reverse=True)
	for item in sorted_list:
		if item[1] >= 10:
			word_all_dict.setdefault(item[0],0)
			word_all_dict[item[0]] += item[1]
			# outfile.write(item[0]+','+str(item[1])+'\r\n')

	sorted_list = sorted(word_brief_dict.items(),key=lambda p:p[1],reverse=True)
	for item in sorted_list:
		if item[1] >= 50:
			word_all_dict.setdefault(item[0],0)
			word_all_dict[item[0]] += item[1]
			# outfile.write(item[0]+','+str(item[1])+'\r\n')

	sorted_list = sorted(word_all_dict.items(),key=lambda p:p[1],reverse=True)
	for item in sorted_list:
		outfile.write(item[0]+','+str(item[1])+'\r\n')
Example #19
def classify(category_name,category_parent_dict,category_child_dict,category_synonyms_dict,indicator_set,comment_category_set,ambiguation_dict):
	# main category name (u"软件" = "software")
	main_category = u"软件"

	jieba.load_userdict('../../../data/jieba_userdict.txt')
	stopword_set = text_process.getStopword('../../../data/stopword.txt')

	node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)
	candidate_tag_set,candidate_delegate_tag_set = rule_base.getCandidateTag(main_category,node_children_dict,category_synonyms_dict)
	level_category_dict = rule_base.createLevelCategoryDict(main_category,candidate_tag_set,category_parent_dict,category_child_dict,category_synonyms_dict)
	for level in level_category_dict.keys():
		print level
		print ' '.join(level_category_dict[level])

	# iterate over the apps under the main category
	infile = open('../data/'+category_name+'.json','rb')
	outfile_classification = open('../data/'+ category_name+'_classification.json','wb')

	for row in infile:
		
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["id"])
		app_name = json_obj["title"]
		app_brief = json_obj["brief"]
		app_download = int(json_obj["download_times"])
		app_brief_seg = [word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)]
		app_name_brief = app_name+" "+app_brief
		app_name_brief += " "+rule_base.grabEnglish(app_name_brief)

		tag_recommend_set = set([])

		for tag in candidate_tag_set:
			if tag in app_name_brief:
				tag_recommend_set.add(category_synonyms_dict[tag][0])
	
		if len(level_category_dict[1] & tag_recommend_set) != 0:
			candidate_main_level_set = level_category_dict[1] & tag_recommend_set
			candidate_main_level_score_dict = {}
			for candidate_main_level in candidate_main_level_set:
				score = len(node_children_dict[candidate_main_level] & tag_recommend_set)
				candidate_main_level_score_dict.setdefault(score,set([])).add(candidate_main_level)
			max_score = max(candidate_main_level_score_dict.keys())
			if max_score >= 3:
				final_category_list = list(candidate_main_level_score_dict[max_score])
				if final_category_list[0] != category_name:
					outfile_classification.write(str(app_id)+"->"+final_category_list[0]+"->"+app_name+"<@>"+" ".join(app_brief_seg)+'\r\n')
def tf(category_id,category_path,query_category,category_set,app_category_dict,app_tag_dict):
	print '-extracting feature'
	infile = open(data_path+'all_cn_seg_nwi_clean.txt','rb')
	stopword_set = text_process.getStopword(data_path+'stopword.txt')
	outfile_title = open('title_tf/'+str(category_path)+'.csv','wb')
	outfile_tag = open('tag_tf/'+str(category_path)+'.csv','wb')
	title_tf_dict = {}
	tag_tf_dict = {}
	for category in category_set:
		title_tf_dict.setdefault(category,0)
		tag_tf_dict.setdefault(category,0)
	row_index = 0
	for row in infile:
		row_index += 1
		items = row.strip().split("<@>")
		try:
			app_id = int(items[0])
			app_name = items[1].decode('utf-8')
			seg_brief_list = items[2].decode('utf-8').split()
		except:
			continue
		if app_category_dict[app_id][1] != category_id:
			continue
		if query_category != "":
			if not isRelevant(query_category,app_name,seg_brief_list):
				continue
				
		title(app_name,title_tf_dict)
		# brief(seg_brief_list,tf_dict)
		tag(app_id,app_tag_dict,tag_tf_dict)

	max_title_tf = max(title_tf_dict.values())
	print 'sorting'
	title_sorted_list = sorted(title_tf_dict.items(),key=lambda p:p[1],reverse=True)
	for val in title_sorted_list:
		outfile_title.write(val[0]+','+str(1.0*val[1]/max_title_tf)+'\r\n')

	max_tag_tf = max(tag_tf_dict.values())
	tag_sorted_list = sorted(tag_tf_dict.items(),key=lambda p:p[1],reverse=True)
	for val in tag_sorted_list:
		outfile_tag.write(val[0]+','+str(1.0*val[1]/max_tag_tf)+'\r\n')
def calculateCoverage(category_stat_dict, synonyms_set_list):
    print 'loading file'
    jieba.load_userdict(data_path + "jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')

    print 'reading file'
    infile = open('../data/' + category_path + '.json', 'rb')
    all_app_counter = 0
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["soft_id"])
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]
        app_download = int(json_obj["download_times"])

        if app_download < 100:
            continue

        all_app_counter += 1

        seg_title_list = jieba.cut(app_name)
        seg_brief_list = jieba.cut(app_brief)

        for seg_title in seg_title_list:
            if text_process.isChinese(
                    seg_title) and seg_title not in stopword_set:
                for main_category in category_stat_dict.keys():
                    if seg_title in category_stat_dict[main_category][0]:
                        category_stat_dict[main_category][1].add(app_id)

        for seg_brief in seg_brief_list:
            if text_process.isChinese(
                    seg_brief) and seg_brief not in stopword_set:
                for main_category in category_stat_dict.keys():
                    if seg_brief in category_stat_dict[main_category][0]:
                        category_stat_dict[main_category][1].add(app_id)

    top_coverage_category_info_dict = {}
    for iter_num in range(20):
        stat(top_coverage_category_info_dict, category_stat_dict,
             all_app_counter, synonyms_set_list)
Example #22
def clean(category_path, category_crawl_dict, category_set):
    print "cleaning"
    stop_word_set = text_process.getStopword(data_path + "stopword.txt")
    for category in category_crawl_dict.keys():
        word_score_dict = {}
        outfile = open("../clean_data/" + str(category_path) + "_" + category, "wb")
        print category
        for page in category_crawl_dict[category]:
            offset_weight = 1.0 * (5 - int(page["offset"])) / 5

            title = page["title"]
            statRawText(10 * offset_weight, category, title, word_score_dict, category_set, stop_word_set)

            if "abstract" in page.keys():
                abstract = page["abstract"]
                statRawText(6 * offset_weight, category, abstract, word_score_dict, category_set, stop_word_set)
            if "abstract_link" in page.keys():
                abstract_link = page["abstract_link"]
                statTextList(10 * offset_weight, category, abstract_link, word_score_dict, category_set, stop_word_set)
            if "abstract_bold" in page.keys():
                abstract_bold = page["abstract_bold"]
                statTextList(8 * offset_weight, category, abstract_bold, word_score_dict, category_set, stop_word_set)

            if "wiki_category" in page.keys():
                wiki_category = page["wiki_category"]
                statTextList(10 * offset_weight, category, wiki_category, word_score_dict, category_set, stop_word_set)

            if "content" in page.keys():
                content = page["content"]
                statRawText(1 * offset_weight, category, content, word_score_dict, category_set, stop_word_set)
            if "content_link" in page.keys():
                content_link = page["content_link"]
                statTextList(4 * offset_weight, category, content_link, word_score_dict, category_set, stop_word_set)
            if "content_bold" in page.keys():
                content_bold = page["content_bold"]
                statTextList(2 * offset_weight, category, content_bold, word_score_dict, category_set, stop_word_set)

        sorted_list = sorted(word_score_dict.items(), key=lambda p: p[1], reverse=True)
        for val in sorted_list:
            outfile.write(val[0] + "," + str(val[1]) + "\r\n")
def mineKeywordCombination(category_id,query_keyword):

	# main category name
	main_category = idToName(category_id)

	jieba.load_userdict('../../../data/jieba_userdict.txt')
	stopword_set = text_process.getStopword('../../../data/stopword.txt')

	combination_fre_dict = {}

	outfile = open('keyword_combination.txt','wb')
	# iterate over the apps under the main category
	infile = open('../data/'+str(category_id)+'.json','rb')
	for row in infile:
		
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["id"])
		app_name = json_obj["title"]
		app_brief = json_obj["brief"]
		app_download = int(json_obj["download_times"])
		app_name_seg = [word for word in jieba.cut(app_name) if word not in stopword_set and text_process.isChinese(word)]
		app_brief_seg = [word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)]
		app_name_brief = app_name+" "+app_brief

		app_name_combination_dict = combineNeighborWord(app_name_seg,query_keyword)
		for word in app_name_combination_dict.keys():
			combination_fre_dict.setdefault(word,0)
			combination_fre_dict[word] += app_name_combination_dict[word]
		
		app_brief_combination_dict = combineNeighborWord(app_brief_seg,query_keyword)
		for word in app_brief_combination_dict.keys():
			combination_fre_dict.setdefault(word,0)
			combination_fre_dict[word] += app_brief_combination_dict[word]


	sorted_list = sorted(combination_fre_dict.items(),key=lambda p:p[1],reverse=True)
	for val in sorted_list:
		if val[1] >= 2:
			print val[0]+','+str(val[1])
			outfile.write(val[0]+','+str(val[1])+'\r\n')
def calculateCoverage(category_stat_dict,synonyms_set_list):
	print 'loading file'
	jieba.load_userdict(data_path+"jieba_userdict.txt")
	stopword_set = text_process.getStopword('../../../data/stopword.txt')

	print 'reading file'
	infile = open('../data/'+category_path+'.json','rb')
	all_app_counter = 0
	for row in infile:
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["soft_id"])
		app_name = json_obj["soft_name"]
		app_brief = json_obj["soft_brief"]
		app_download = int(json_obj["download_times"])

		if app_download < 100:
			continue

		all_app_counter += 1

		seg_title_list = jieba.cut(app_name)
		seg_brief_list = jieba.cut(app_brief)

		for seg_title in seg_title_list:
			if text_process.isChinese(seg_title) and seg_title not in stopword_set:
				for main_category in category_stat_dict.keys():
					if seg_title in category_stat_dict[main_category][0]:
						category_stat_dict[main_category][1].add(app_id)

		for seg_brief in seg_brief_list:
			if text_process.isChinese(seg_brief) and seg_brief not in stopword_set: 
				for main_category in category_stat_dict.keys():
					if seg_brief in category_stat_dict[main_category][0]:
						category_stat_dict[main_category][1].add(app_id)
	
	top_coverage_category_info_dict = {}
	for iter_num in range(20):
		stat(top_coverage_category_info_dict,category_stat_dict,all_app_counter,synonyms_set_list)
Example #25
def clean(category_id, category_crawl_dict, category_set):
    print 'cleaning'
    stop_word_set = text_process.getStopword(data_path + 'stopword.txt')
    for category in category_crawl_dict.keys():
        word_fre_dict = {}
        outfile = open(
            'wiki_search/' + str(category_id) + '_' + category + '.txt', 'wb')
        print category
        for page in category_crawl_dict[category]:
            abstract = page['abstract']
            stat(6, category, abstract, word_fre_dict, category_set,
                 stop_word_set)
            abstract_link = page['abstract_link']
            stat2(10, category, abstract_link, word_fre_dict, category_set,
                  stop_word_set)
            abstract_bold = page['abstract_bold']
            stat2(8, category, abstract_bold, word_fre_dict, category_set,
                  stop_word_set)
            if 'wiki_category' in page.keys():
                wiki_category = page['wiki_category']
                stat2(20, category, wiki_category, word_fre_dict, category_set,
                      stop_word_set)
            if 'content' in page.keys():
                content = page['content']
                stat(1, category, content, word_fre_dict, category_set,
                     stop_word_set)
                content_link = page['content_link']
                stat2(4, category, content_link, word_fre_dict, category_set,
                      stop_word_set)
                content_bold = page['content_bold']
                stat2(2, category, content_bold, word_fre_dict, category_set,
                      stop_word_set)

        sorted_list = sorted(word_fre_dict.items(),
                             key=lambda p: p[1],
                             reverse=True)
        for val in sorted_list:
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')
Example #26
def recommendTag(category_name, category_parent_dict, category_child_dict,
                 category_synonyms_dict, indicator_set, comment_category_set,
                 ambiguation_dict):
    # main category name
    main_category = category_name

    # apps that were not matched
    others_app = {}
    outfile_json = open('tag_recommend_result.json', 'wb')
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)

    candidate_tag_set, candidate_delegate_tag_set = rule_base.getCandidateTag(
        main_category, node_children_dict, category_synonyms_dict)
    level_category_dict = rule_base.createLevelCategoryDict(
        main_category, candidate_tag_set, category_parent_dict,
        category_child_dict, category_synonyms_dict)
    # level_category_dict[0] = set([main_category])
    for level in level_category_dict.keys():
        print level
        print ' '.join(level_category_dict[level])

    match_counter = 0
    all_app_counter = 0

    # iterate over the apps under the main category
    infile = open('../data/' + category_name + '.json', 'rb')
    outfile_match = open('../data/' + category_name + '_match.json', 'wb')
    outfile_unmatch = open('../data/' + category_name + '_unmatch.json', 'wb')

    for row in infile:
        all_app_counter += 1

        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_brief_seg = [
            word for word in jieba.cut(app_brief)
            if word not in stopword_set and text_process.isChinese(word)
        ]
        app_name_brief = app_name + " " + app_brief
        app_name_brief += " " + rule_base.grabEnglish(app_name_brief)

        output_dict = {}
        output_dict["id"] = app_id
        output_dict["content"] = {}
        tag_recommend_set = set([])

        # sentiment-word matching; synonym relations between sentiment words are not handled yet
        for comment_word in [
                comment_word for comment_word in comment_category_set
                if comment_word in app_name_brief
        ]:
            output_dict.setdefault("character", []).append(comment_word)

        # bottom-up matching
        for depth in reversed(range(0, max(level_category_dict.keys()) + 1)):
            if depth not in level_category_dict.keys():
                continue
            current_level_category_set = level_category_dict[depth]
            for current_level_category in current_level_category_set:
                if current_level_category in app_name_brief and not rule_base.isAmbiguous(
                        current_level_category, ambiguation_dict,
                        app_name_brief):
                    category_delegate = category_synonyms_dict[
                        current_level_category][0]
                    tag_recommend_set.add(category_delegate)
                    # strong rule: also add ancestors on the strong path
                    strong_parent_set = rule_base.getNodeListOnStrongPath(
                        category_parent_dict[category_delegate],
                        category_parent_dict, set([]))
                    tag_recommend_set = tag_recommend_set | (
                        strong_parent_set & candidate_tag_set)

            current_level_unmatch_category_set = current_level_category_set - tag_recommend_set
            for unmatch_category in current_level_unmatch_category_set:
                if unmatch_category in indicator_set:
                    continue
                unmatch_category = category_synonyms_dict[unmatch_category][0]
                unmatch_category_children = node_children_dict[
                    unmatch_category]
                match_children = unmatch_category_children & tag_recommend_set
                if len(match_children) >= 3:
                    tag_recommend_set.add(unmatch_category)

        # hidden nodes
        for tag in tag_recommend_set:
            if u'(' in tag and u')' in tag:
                hidden_node_next_level = rule_base.getNextLevelCategorySet(
                    category_synonyms_dict, category_child_dict, tag)
                for hidden_node_next_level_item in hidden_node_next_level:
                    hidden_node_next_level_item = category_synonyms_dict[
                        hidden_node_next_level_item][0]
                    if hidden_node_next_level_item in tag_recommend_set:
                        output_dict.setdefault(
                            tag, []).append(hidden_node_next_level_item)
        # remove indicator words
        tag_recommend_set = tag_recommend_set - indicator_set

        # build the output dict
        content = outputJson(main_category, category_parent_dict,
                             category_child_dict, category_synonyms_dict,
                             tag_recommend_set)
        output_dict['content'] = content

        if len(content.keys()) != 0:
            outfile_match.write(row)
            match_counter += 1
            if app_download >= 10000000:
                continue
            outfile_json.write(
                json.dumps(output_dict, ensure_ascii=False) + '\r\n')
        else:
            outfile_unmatch.write(row)
            if app_download <= 500:
                continue
            others_app.setdefault(app_name,
                                  [app_download, ' '.join(app_brief_seg)])
    print "覆盖率: " + str(1.0 * match_counter / all_app_counter)

    # sort the remaining unmatched apps by download count and write them out
    other_title_fre = {}
    sorted_list = sorted(others_app.items(),
                         key=lambda p: p[1][0],
                         reverse=True)
    outfile_others = open('others.txt', 'wb')
    for val in sorted_list:
        title_seg = jieba.cut(val[0])
        for title in title_seg:
            if text_process.isChinese(title) and title not in stopword_set:
                other_title_fre.setdefault(title, 0)
                other_title_fre[title] += 1
        outfile_others.write(val[0] + '<@>' + val[1][1] + '\r\n')

    sorted_list = sorted(other_title_fre.items(),
                         key=lambda p: p[1],
                         reverse=True)
    outfile_others_title = open('others_title.txt', 'wb')
    for val in sorted_list:
        outfile_others_title.write(val[0] + '<@>' + str(val[1]) + '\r\n')
def recommendTag(
    category_name,
    category_parent_dict,
    category_child_dict,
    category_synonyms_dict,
    indicator_set,
    comment_category_set,
    ambiguation_dict,
):
    # main category name
    main_category = category_name

    # apps that were not matched
    others_app = {}
    outfile_json = open("tag_recommend_result.json", "wb")
    jieba.load_userdict("../../../data/jieba_userdict.txt")
    stopword_set = text_process.getStopword("../../../data/stopword.txt")
    node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)

    candidate_tag_set, candidate_delegate_tag_set = rule_base.getCandidateTag(
        main_category, node_children_dict, category_synonyms_dict
    )
    level_category_dict = rule_base.createLevelCategoryDict(
        main_category, candidate_tag_set, category_parent_dict, category_child_dict, category_synonyms_dict
    )
    # level_category_dict[0] = set([main_category])
    for level in level_category_dict.keys():
        print level
        print " ".join(level_category_dict[level])

    match_counter = 0
    all_app_counter = 0

    # iterate over the apps under the main category
    infile = open("../data/" + category_name + ".json", "rb")
    outfile_match = open("../data/" + category_name + "_match.json", "wb")
    outfile_unmatch = open("../data/" + category_name + "_unmatch.json", "wb")

    for row in infile:
        all_app_counter += 1

        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_brief_seg = [
            word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)
        ]
        app_name_brief = app_name + " " + app_brief
        app_name_brief += " " + rule_base.grabEnglish(app_name_brief)

        output_dict = {}
        output_dict["id"] = app_id
        output_dict["content"] = {}
        tag_recommend_set = set([])

        # sentiment-word matching; synonym relations between sentiment words are not handled yet
        for comment_word in [comment_word for comment_word in comment_category_set if comment_word in app_name_brief]:
            output_dict.setdefault("character", []).append(comment_word)

        # bottom-up matching
        for depth in reversed(range(0, max(level_category_dict.keys()) + 1)):
            if depth not in level_category_dict.keys():
                continue
            current_level_category_set = level_category_dict[depth]
            for current_level_category in current_level_category_set:
                if current_level_category in app_name_brief and not rule_base.isAmbiguous(
                    current_level_category, ambiguation_dict, app_name_brief
                ):
                    category_delegate = category_synonyms_dict[current_level_category][0]
                    tag_recommend_set.add(category_delegate)
                    # strong rule: also add ancestors on the strong path
                    strong_parent_set = rule_base.getNodeListOnStrongPath(
                        category_parent_dict[category_delegate], category_parent_dict, set([])
                    )
                    tag_recommend_set = tag_recommend_set | (strong_parent_set & candidate_tag_set)

            current_level_unmatch_category_set = current_level_category_set - tag_recommend_set
            for unmatch_category in current_level_unmatch_category_set:
                if unmatch_category in indicator_set:
                    continue
                unmatch_category = category_synonyms_dict[unmatch_category][0]
                unmatch_category_children = node_children_dict[unmatch_category]
                match_children = unmatch_category_children & tag_recommend_set
                if len(match_children) >= 3:
                    tag_recommend_set.add(unmatch_category)

        # hidden nodes
        for tag in tag_recommend_set:
            if u"(" in tag and u")" in tag:
                hidden_node_next_level = rule_base.getNextLevelCategorySet(
                    category_synonyms_dict, category_child_dict, tag
                )
                for hidden_node_next_level_item in hidden_node_next_level:
                    hidden_node_next_level_item = category_synonyms_dict[hidden_node_next_level_item][0]
                    if hidden_node_next_level_item in tag_recommend_set:
                        output_dict.setdefault(tag, []).append(hidden_node_next_level_item)
        # remove indicator words
        tag_recommend_set = tag_recommend_set - indicator_set

        # build the output dict
        content = outputJson(
            main_category, category_parent_dict, category_child_dict, category_synonyms_dict, tag_recommend_set
        )
        output_dict["content"] = content

        if len(content.keys()) != 0:
            outfile_match.write(row)
            match_counter += 1
            if app_download >= 10000000:
                continue
            outfile_json.write(json.dumps(output_dict, ensure_ascii=False) + "\r\n")
        else:
            outfile_unmatch.write(row)
            if app_download <= 500:
                continue
            others_app.setdefault(app_name, [app_download, " ".join(app_brief_seg)])
    print "覆盖率: " + str(1.0 * match_counter / all_app_counter)

    # sort the remaining unmatched apps by download count and write them out
    other_title_fre = {}
    sorted_list = sorted(others_app.items(), key=lambda p: p[1][0], reverse=True)
    outfile_others = open("others.txt", "wb")
    for val in sorted_list:
        title_seg = jieba.cut(val[0])
        for title in title_seg:
            if text_process.isChinese(title) and title not in stopword_set:
                other_title_fre.setdefault(title, 0)
                other_title_fre[title] += 1
        outfile_others.write(val[0] + "<@>" + val[1][1] + "\r\n")

    sorted_list = sorted(other_title_fre.items(), key=lambda p: p[1], reverse=True)
    outfile_others_title = open("others_title.txt", "wb")
    for val in sorted_list:
        outfile_others_title.write(val[0] + "<@>" + str(val[1]) + "\r\n")
def readJson(word2vec_model):
	print 'parsing json'
	stopword_set = text_process.getStopword('../../../data/stopword.txt')
	outfile = open('baidu_baike_definition.txt','wb')
	infile = open('../../scrapy/baidu_baike_definition/crawl_data/definition.json','rb')
	row_index = 0
	for row in infile:

		json_str = row.strip()
		json_str = json_str.lstrip('[')
		json_str = json_str.rstrip(',')
		json_str = json_str.rstrip(']')
		json_obj = json.loads(json_str)

		query_word = json_obj['query_category']
		is_only = json_obj['is_only']
		ambiguous_tips = json_obj['ambiguous_tips']
		title = json_obj['title']
		title_note = json_obj['title_note']
		structure_tag = json_obj['structure_tag']
		abstract = json_obj['abstract']
		content = json_obj['content']

		word_synonyms_set = set([query_word])
		
		if is_only:
			word_synonyms_set.add(title)
	
		alias_list = []
		alias_list_clean = []
		for tag_key in structure_tag.keys():
			tag_key = tag_key.decode('utf-8')
			tag_value = structure_tag[tag_key].decode('utf-8')
			tag_key_clean = tag_key.replace(u' ','')
			if tag_key_clean in baidu_tag_keys:
				if tag_value != u"无":
					alias_list.append(tag_value)
		for alias in alias_list:
			alias = regular_template.cleanNote(alias)
			if regular_template.isEnglishPhrase(alias):
				print alias
				continue
			for word in alias.replace(u","," ").replace(u"、"," ").replace(u","," ").replace(u";"," ").replace(u";"," ").split():
				word = word.replace(u"“","").replace(u"”","").replace(u" ","").rstrip(u"等")
				alias_list_clean.append(regular_template.cleanDefinition(word))

		alias_text = ' '.join(alias_list_clean)

		if is_only:
			word_synonyms_set = word_synonyms_set | set(alias_list_clean)

		
		ambiguous_tips = regular_template.cleanNote(ambiguous_tips)
		
		abstract_definition_text_set,abstract_definition_set = regular_template.main(abstract,query_word)
		abstract_definition_text = ' '.join(abstract_definition_text_set)

		title_note_definition_text_set,title_note_definition_set = regular_template.main(title_note,query_word)
		title_note_definition_text = ' '.join(title_note_definition_text_set)

		try:
			top_simi_words = [simi_word_tuple[0] for simi_word_tuple in word2vec_model.most_similar(positive=[query_word],topn=80)]
			for simi_word in top_simi_words:
				if len(simi_word)==1:
					continue
				if simi_word in alias_text or simi_word in abstract_definition_text or simi_word in ambiguous_tips or simi_word in title_note_definition_text or simi_word in title:
					if not text_process.isSubsetGeneral(query_word,simi_word):
						word_synonyms_set.add(simi_word)
			for pair in itertools.combinations(word_synonyms_set,2):
				new_word = ''.join(pair)
				if new_word not in  word_synonyms_set and (new_word in abstract_definition_text or new_word in title):
					word_synonyms_set.add(new_word)

			if len([word for word in word_synonyms_set if len(word)>0]) >= 2:
				outfile.write(query_word+'@'+','.join([word for word in word_synonyms_set if len(word)>0])+'\r\n')
		except:
			print 'not in vocabulary '+query_word
Example #29
def getCorpus(category_name):

    app_lable_dict = {
        10743: 1,
        1002128: 1,
        47: 1,
        498: 1,
        550: -1,
        48: -1,
        490: -1,
        761: -1,
        101108: -1,
        101916: -1
    }

    x_train = []
    y_train = []
    x_test = []

    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')

    doc_app_id = []
    docs = []
    id_name_dict = {}
    infile = open('corpus/' + category_name + '.json', 'rb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_brief_seg = [
            word for word in jieba.cut(app_name + " " + app_brief)
            if word not in stopword_set and text_process.isChinese(word)
        ]

        if len(app_brief_seg) <= 10 and app_download <= 100:
            continue

        doc_app_id.append(app_id)
        id_name_dict[app_id] = app_name
        docs.append(app_brief_seg)

    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(text) for text in docs]

    for i in range(len(corpus)):
        doc = corpus[i]
        x = [0 for n in range(len(dictionary))]
        for val in doc:
            x[val[0]] = val[1]

        app_id = doc_app_id[i]
        if app_id in app_lable_dict.keys():
            x_train.append(x)
            if app_lable_dict[app_id] == 1:
                y_train.append(1)
            else:
                y_train.append(-1)
        else:
            x_test.append(x)

    return x_train, x_test, y_train, doc_app_id, id_name_dict
Example #30
def readJson(word2vec_model):
    print 'parsing json'
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    outfile = open('baidu_baike_definition.txt', 'wb')
    infile = open(
        '../../scrapy/baidu_baike_definition/crawl_data/definition.json', 'rb')
    row_index = 0
    for row in infile:

        json_str = row.strip()
        json_str = json_str.lstrip('[')
        json_str = json_str.rstrip(',')
        json_str = json_str.rstrip(']')
        json_obj = json.loads(json_str)

        query_word = json_obj['query_category']
        is_only = json_obj['is_only']
        ambiguous_tips = json_obj['ambiguous_tips']
        title = json_obj['title']
        title_note = json_obj['title_note']
        structure_tag = json_obj['structure_tag']
        abstract = json_obj['abstract']
        content = json_obj['content']

        word_synonyms_set = set([query_word])

        if is_only:
            word_synonyms_set.add(title)

        alias_list = []
        alias_list_clean = []
        for tag_key in structure_tag.keys():
            tag_key = tag_key.decode('utf-8')
            tag_value = structure_tag[tag_key].decode('utf-8')
            tag_key_clean = tag_key.replace(u' ', '')
            if tag_key_clean in baidu_tag_keys:
                if tag_value != u"无":
                    alias_list.append(tag_value)
        for alias in alias_list:
            alias = regular_template.cleanNote(alias)
            if regular_template.isEnglishPhrase(alias):
                print alias
                continue
            for word in alias.replace(u",", " ").replace(u"、", " ").replace(
                    u",", " ").replace(u";", " ").replace(u";", " ").split():
                word = word.replace(u"“",
                                    "").replace(u"”",
                                                "").replace(u" ",
                                                            "").rstrip(u"等")
                alias_list_clean.append(regular_template.cleanDefinition(word))

        alias_text = ' '.join(alias_list_clean)

        if is_only:
            word_synonyms_set = word_synonyms_set | set(alias_list_clean)

        ambiguous_tips = regular_template.cleanNote(ambiguous_tips)

        abstract_definition_text_set, abstract_definition_set = regular_template.main(
            abstract, query_word)
        abstract_definition_text = ' '.join(abstract_definition_text_set)

        title_note_definition_text_set, title_note_definition_set = regular_template.main(
            title_note, query_word)
        title_note_definition_text = ' '.join(title_note_definition_text_set)

        try:
            top_simi_words = [
                simi_word_tuple[0] for simi_word_tuple in
                word2vec_model.most_similar(positive=[query_word], topn=80)
            ]
            for simi_word in top_simi_words:
                if len(simi_word) == 1:
                    continue
                if simi_word in alias_text or simi_word in abstract_definition_text or simi_word in ambiguous_tips or simi_word in title_note_definition_text or simi_word in title:
                    if not text_process.isSubsetGeneral(query_word, simi_word):
                        word_synonyms_set.add(simi_word)
            for pair in itertools.combinations(word_synonyms_set, 2):
                new_word = ''.join(pair)
                if new_word not in word_synonyms_set and (
                        new_word in abstract_definition_text
                        or new_word in title):
                    word_synonyms_set.add(new_word)

            if len([word for word in word_synonyms_set if len(word) > 0]) >= 2:
                outfile.write(query_word + '@' + ','.join(
                    [word
                     for word in word_synonyms_set if len(word) > 0]) + '\r\n')
        except:
            print 'not in vocabulary ' + query_word
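
# Usage sketch for readJson(): it expects an already-loaded gensim word2vec
# model.  The model path below mirrors the one used by the other examples in
# this listing and is an assumption, not a documented entry point.
if __name__ == '__main__':
    from gensim.models import Word2Vec
    readJson(Word2Vec.load('../../../data/word2vec.model'))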