def readSegFile():
    jieba.load_userdict("../../data/jieba_userdict.txt")
    infile = open('../../data/all_cn_seg_nwi_clean.txt', 'rb')
    outfile = open('../../data/all_word.txt', 'wb')
    stopword_set = text_process.getStopword('../../data/stopword.txt')
    word_set = set([])
    word_fre_dict = {}
    row_counter = 0
    for row in infile:
        row_counter += 1
        print row_counter
        row = row.strip().decode('utf-8')
        items = row.split('<@>')
        app_name = items[1]
        brief_seg = items[2].split()
        title_seg = jieba.cut(app_name)
        for title in title_seg:
            if text_process.isChinese(title) and title not in stopword_set:
                word_set.add(title)
                word_fre_dict.setdefault(title, 0)
                word_fre_dict[title] += 1
        for brief in brief_seg:
            if text_process.isChinese(brief) and brief not in stopword_set:
                word_set.add(brief)
                word_fre_dict.setdefault(brief, 0)
                word_fre_dict[brief] += 1
    sorted_list = sorted(word_fre_dict.items(), key=lambda p: p[1], reverse=True)
    for val in sorted_list:
        if val[1] >= 10:
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')

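# The snippets in this section all lean on a shared text_process module (getStopword,
# isChinese) plus jieba/json/gensim/itertools imports that are not reproduced here.
# Below is a hypothetical, minimal reconstruction of those two helpers, assuming
# stopwords are stored one per line in UTF-8 and that "Chinese" means every character
# lies in the CJK Unified Ideographs block.  The setdefaultencoding hack is likewise
# only an assumption that would explain why unicode strings are written directly to
# files opened in 'wb' mode throughout these scripts.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')  # assumption, see note above


def getStopword(stopword_path):
    # one stopword per line, UTF-8 encoded
    stopword_set = set([])
    for line in open(stopword_path, 'rb'):
        word = line.strip().decode('utf-8')
        if word:
            stopword_set.add(word)
    return stopword_set


def isChinese(word):
    # True only if the token is non-empty and every character is a CJK ideograph
    for ch in word:
        if not (u'\u4e00' <= ch <= u'\u9fff'):
            return False
    return len(word) > 0
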
def readSegFile():
    infile = open('../../data/all_cn_seg_nwi_clean.txt', 'rb')
    outfile = open('../../data/candidate_title_word.txt', 'wb')
    stopword_set = text_process.getStopword('../../data/stopword.txt')
    word_set = set([])
    word_fre_dict = {}
    row_counter = 0
    for row in infile:
        row_counter += 1
        print row_counter
        row = row.strip().decode('utf-8')
        items = row.split('<@>')
        app_name = items[1]
        brief_seg = items[2].split()
        title_seg = jieba.cut(app_name)
        for title in title_seg:
            if text_process.isChinese(title) and title not in stopword_set:
                word_set.add(title)
                word_fre_dict.setdefault(title, 0)
                word_fre_dict[title] += 1
        # for brief in brief_seg:
        #     word_set.add(brief)
    for word in word_fre_dict.keys():
        if word_fre_dict[word] >= 10:
            outfile.write(word + '\r\n')

def recommendTag(category_parent_dict):
    outfile = open('tag_recommend_result.txt', 'wb')
    print 'loading jieba userdict'
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    print 'loading stopword'
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    print 'reading app json'
    infile = open('../data/' + category_path + '.json', 'rb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["soft_id"])
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]
        app_download = int(json_obj["download_times"])
        outfile.write(str(app_id) + '<@>' + app_name + '<@>' + app_brief + '<@>')
        tag_recommend_set = set([])
        for category in category_parent_dict.keys():
            if category in app_name or category in app_brief:
                for parent_tuple in category_parent_dict[category]:
                    if parent_tuple[1] == 0:
                        tag_recommend_set.add(parent_tuple[0])
                    else:
                        tag_recommend_set.add(category)
                        if parent_tuple[1] == 2:
                            tag_recommend_set.add(parent_tuple[0])
        outfile.write(','.join(tag_recommend_set))
        outfile.write('\r\n')

def mineAbbreviation():
    print 'mining abbreviation'
    jieba.load_userdict("../../../data/jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    word2vec_model = Word2Vec.load('../../../data/word2vec.model')
    word_set = getWords()
    word_syn_dict = {}
    for word in word_set:
        word_syn_dict.setdefault(word, set([word]))
        if len(word) != 2:
            continue
        try:
            for simi_word_tuple in word2vec_model.most_similar(positive=[word], topn=20):
                simi_word = simi_word_tuple[0]
                simi_value = simi_word_tuple[1]
                reverse_word = word[1] + word[0]
                if reverse_word == simi_word:
                    pass
                else:
                    if len(set(word) & set(simi_word)) != len(word) or simi_value < 0.5 or word in simi_word or reverse_word in simi_word:
                        continue
                    word_syn_dict[word].add(simi_word)
        except:
            pass
            # print word
    outfile = open('abbreviation.txt', 'wb')
    for word in word_syn_dict.keys():
        if len(word_syn_dict[word]) >= 2:
            outfile.write(word + '@' + ','.join(word_syn_dict[word]) + '\r\n')

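# Hypothetical sketch of the getWords() helper that mineAbbreviation() calls; the real
# implementation is not part of this section.  The assumption is that it reads the
# 'word,frequency' lines produced by readSegFile() above and returns the word column;
# the path is likewise an assumption.
def getWords():
    word_set = set([])
    for line in open('../../../data/all_word.txt', 'rb'):
        items = line.strip().decode('utf-8').split(',')
        if items[0]:
            word_set.add(items[0])
    return word_set
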
def clean(category_id, category_crawl_dict, category_set):
    print 'cleaning'
    stop_word_set = text_process.getStopword(data_path + 'stopword.txt')
    for category in category_crawl_dict.keys():
        word_fre_dict = {}
        outfile = open('wiki_search/' + str(category_id) + '_' + category + '.txt', 'wb')
        print category
        for page in category_crawl_dict[category]:
            abstract = page['abstract']
            stat(6, category, abstract, word_fre_dict, category_set, stop_word_set)
            abstract_link = page['abstract_link']
            stat2(10, category, abstract_link, word_fre_dict, category_set, stop_word_set)
            abstract_bold = page['abstract_bold']
            stat2(8, category, abstract_bold, word_fre_dict, category_set, stop_word_set)
            if 'wiki_category' in page.keys():
                wiki_category = page['wiki_category']
                stat2(20, category, wiki_category, word_fre_dict, category_set, stop_word_set)
            if 'content' in page.keys():
                content = page['content']
                stat(1, category, content, word_fre_dict, category_set, stop_word_set)
                content_link = page['content_link']
                stat2(4, category, content_link, word_fre_dict, category_set, stop_word_set)
                content_bold = page['content_bold']
                stat2(2, category, content_bold, word_fre_dict, category_set, stop_word_set)
        sorted_list = sorted(word_fre_dict.items(), key=lambda p: p[1], reverse=True)
        for val in sorted_list:
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')

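# Hypothetical sketches of the stat()/stat2() scoring helpers used by the clean()
# variants in this section; the real implementations are not shown.  The assumption is
# that stat() segments a raw text with jieba and adds `weight` to every Chinese,
# non-stopword token (skipping the seed category itself), while stat2() applies the
# same scoring to each element of a list of short texts (links, bold spans, wiki
# categories).
def stat(weight, category, text, word_fre_dict, category_set, stop_word_set):
    for word in jieba.cut(text):
        if word == category or word in stop_word_set:
            continue
        if text_process.isChinese(word):
            word_fre_dict.setdefault(word, 0)
            word_fre_dict[word] += weight


def stat2(weight, category, text_list, word_fre_dict, category_set, stop_word_set):
    for text in text_list:
        stat(weight, category, text, word_fre_dict, category_set, stop_word_set)
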
def getTrainTest(category_name, category_parent_dict, category_child_dict, category_synonyms_dict, indicator_set, comment_category_set, ambiguation_dict):
    # main category name
    main_category = u"软件"
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)
    candidate_tag_set, candidate_delegate_tag_set = rule_base.getCandidateTag(main_category, node_children_dict, category_synonyms_dict)
    level_category_dict = rule_base.createLevelCategoryDict(main_category, candidate_tag_set, category_parent_dict, category_child_dict, category_synonyms_dict)
    # for level in level_category_dict.keys():
    #     print level
    #     print ' '.join(level_category_dict[level])
    dictionary = corpora.Dictionary([list(candidate_delegate_tag_set)])
    valcabulary_size = len(dictionary)
    # iterate over the apps under the main category
    infile = open('../data/' + category_name + '.json', 'rb')
    X_train = []
    X_test = []
    X_test_info = []
    all_counter = 0
    train_counter = 0
    for row in infile:
        all_counter += 1
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_tag = json_obj["tags"]
        app_download = int(json_obj["download_times"])
        app_brief_seg = [word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)]
        app_name_brief = app_name + " " + app_brief
        app_name_brief += " " + rule_base.grabEnglish(app_name_brief)
        tag_recommend_set = set([])
        for tag in candidate_tag_set:
            if tag in app_name_brief:
                tag_recommend_set.add(category_synonyms_dict[tag][0])
        doc = dictionary.doc2bow(list(tag_recommend_set))
        x = [0 for i in range(valcabulary_size)]
        for val in doc:
            index = val[0]
            x[index] = val[1]
        if u"视频" in app_tag or u"音乐" in app_tag and app_download >= 1000:
            train_counter += 1
            X_train.append(x)
        else:
            X_test.append(x)
            X_test_info.append([app_name, ' '.join(app_brief_seg)])
    print 1.0 * train_counter / all_counter
    return X_train, X_test, X_test_info

def generateCandidateCategory(category_path):
    print 'loading file'
    jieba.load_userdict(data_path + "jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    print 'reading file'
    word_title_dict = {}
    word_brief_dict = {}
    word_all_dict = {}
    infile = open('../data/' + category_path + '.json', 'rb')
    outfile = open('candidate_category/' + str(category_path) + '.txt', 'wb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]
        seg_title_list = jieba.cut(app_name)
        seg_brief_list = jieba.cut(app_brief)
        for seg_title in seg_title_list:
            if text_process.isChinese(seg_title) and seg_title not in stopword_set:
                word_title_dict.setdefault(seg_title, 0)
                word_title_dict[seg_title] += 1
        for seg_brief in seg_brief_list:
            if text_process.isChinese(seg_brief) and seg_brief not in stopword_set:
                word_brief_dict.setdefault(seg_brief, 0)
                word_brief_dict[seg_brief] += 1
    print 'sorting'
    sorted_list = sorted(word_title_dict.items(), key=lambda p: p[1], reverse=True)
    for item in sorted_list:
        if item[1] >= 10:
            word_all_dict.setdefault(item[0], 0)
            word_all_dict[item[0]] += item[1]
            # outfile.write(item[0]+','+str(item[1])+'\r\n')
    sorted_list = sorted(word_brief_dict.items(), key=lambda p: p[1], reverse=True)
    for item in sorted_list:
        if item[1] >= 50:
            word_all_dict.setdefault(item[0], 0)
            word_all_dict[item[0]] += item[1]
            # outfile.write(item[0]+','+str(item[1])+'\r\n')
    sorted_list = sorted(word_all_dict.items(), key=lambda p: p[1], reverse=True)
    for item in sorted_list:
        outfile.write(item[0] + ',' + str(item[1]) + '\r\n')

def calculateCoverage(category_parent_dict, category_stat_dict):
    print 'loading jieba userdict'
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    print 'loading stopword'
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    print 'reading app json'
    infile = open('../data/' + category_path + '.json', 'rb')
    all_app_counter = 0
    print u'下载次数过滤阈值: ' + str(download_times_filter)
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["soft_id"])
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]
        app_download = int(json_obj["download_times"])
        if app_download < download_times_filter:
            continue
        all_app_counter += 1
        # if u'设备' in app_brief:
        #     print app_brief
        for delegate_category in category_stat_dict.keys():
            for relevant_category in category_stat_dict[delegate_category][0]:
                if relevant_category in app_name or relevant_category in app_brief:
                    if relevant_category != delegate_category:
                        # if the two are strongly connected, the root node does not need to appear itself
                        if isStrongConnect(1, delegate_category, relevant_category, category_parent_dict):
                            category_stat_dict[delegate_category][1].add(app_id)
                            break
                        elif delegate_category in app_name or delegate_category in app_brief:
                            category_stat_dict[delegate_category][1].add(app_id)
                            break
                    else:
                        category_stat_dict[delegate_category][1].add(app_id)
                        break
    print u'过滤之后的app总数: ' + str(all_app_counter)
    top_coverage_category_info_dict = {}
    for iter_num in range(100):
        print '循环次数: ' + str(iter_num)
        coverage_ratio = rankTopCoverage(top_coverage_category_info_dict, category_stat_dict, all_app_counter)
        # stop once the cumulative coverage reaches the threshold
        if coverage_ratio >= 0.99:
            break

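# Hypothetical sketch of the greedy rankTopCoverage() step that calculateCoverage()
# iterates; the real helper is defined elsewhere.  The assumption is that each call
# picks the category whose matched-app set adds the most apps not yet covered, records
# it in top_coverage_category_info_dict, and returns the cumulative coverage ratio.
def rankTopCoverage(top_coverage_category_info_dict, category_stat_dict, all_app_counter):
    covered_app_set = set([])
    for info in top_coverage_category_info_dict.values():
        covered_app_set = covered_app_set | info[0]
    best_category = None
    best_gain = set([])
    for category in category_stat_dict.keys():
        if category in top_coverage_category_info_dict:
            continue
        gain = category_stat_dict[category][1] - covered_app_set
        if len(gain) > len(best_gain):
            best_gain = gain
            best_category = category
    if best_category is not None:
        top_coverage_category_info_dict[best_category] = [best_gain, 1.0 * len(best_gain) / all_app_counter]
        covered_app_set = covered_app_set | best_gain
    return 1.0 * len(covered_app_set) / all_app_counter
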
def clean(category_path, category_crawl_dict, category_set):
    print 'cleaning'
    stop_word_set = text_process.getStopword(data_path + 'stopword.txt')
    for category in category_crawl_dict.keys():
        word_score_dict = {}
        outfile = open('../clean_data/' + str(category_path) + '_' + category, 'wb')
        print category
        for page in category_crawl_dict[category]:
            offset_weight = 1.0 * (5 - int(page['offset'])) / 5
            title = page['title']
            statRawText(10 * offset_weight, category, title, word_score_dict, category_set, stop_word_set)
            if 'abstract' in page.keys():
                abstract = page['abstract']
                statRawText(6 * offset_weight, category, abstract, word_score_dict, category_set, stop_word_set)
            if 'abstract_link' in page.keys():
                abstract_link = page['abstract_link']
                statTextList(10 * offset_weight, category, abstract_link, word_score_dict, category_set, stop_word_set)
            if 'abstract_bold' in page.keys():
                abstract_bold = page['abstract_bold']
                statTextList(8 * offset_weight, category, abstract_bold, word_score_dict, category_set, stop_word_set)
            if 'wiki_category' in page.keys():
                wiki_category = page['wiki_category']
                statTextList(10 * offset_weight, category, wiki_category, word_score_dict, category_set, stop_word_set)
            if 'content' in page.keys():
                content = page['content']
                statRawText(1 * offset_weight, category, content, word_score_dict, category_set, stop_word_set)
            if 'content_link' in page.keys():
                content_link = page['content_link']
                statTextList(4 * offset_weight, category, content_link, word_score_dict, category_set, stop_word_set)
            if 'content_bold' in page.keys():
                content_bold = page['content_bold']
                statTextList(2 * offset_weight, category, content_bold, word_score_dict, category_set, stop_word_set)
        sorted_list = sorted(word_score_dict.items(), key=lambda p: p[1], reverse=True)
        for val in sorted_list:
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')

def getCorpus(category_name):
    app_lable_dict = {10743: 1, 1002128: 1, 47: 1, 498: 1, 550: -1, 48: -1, 490: -1, 761: -1, 101108: -1, 101916: -1}
    x_train = []
    y_train = []
    x_test = []
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    doc_app_id = []
    docs = []
    id_name_dict = {}
    infile = open('corpus/' + category_name + '.json', 'rb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_brief_seg = [word for word in jieba.cut(app_name + " " + app_brief) if word not in stopword_set and text_process.isChinese(word)]
        if len(app_brief_seg) <= 10 and app_download <= 100:
            continue
        doc_app_id.append(app_id)
        id_name_dict[app_id] = app_name
        docs.append(app_brief_seg)
    dictionary = corpora.Dictionary(docs)
    corpus = [dictionary.doc2bow(text) for text in docs]
    for i in range(len(corpus)):
        doc = corpus[i]
        x = [0 for n in range(len(dictionary))]
        for val in doc:
            x[val[0]] = val[1]
        app_id = doc_app_id[i]
        if app_id in app_lable_dict.keys():
            x_train.append(x)
            if app_lable_dict[app_id] == 1:
                y_train.append(1)
            else:
                y_train.append(-1)
        else:
            x_test.append(x)
    return x_train, x_test, y_train, doc_app_id, id_name_dict

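# Example of how the bag-of-words matrices returned by getCorpus() could be fed into a
# classifier to label the unannotated apps.  LinearSVC and the labelApps() wrapper are
# assumptions for illustration; the original training code is not included in this
# section.
from sklearn.svm import LinearSVC


def labelApps(category_name):
    x_train, x_test, y_train, doc_app_id, id_name_dict = getCorpus(category_name)
    classifier = LinearSVC()
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    print predictions
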
def clean(category_path, category_crawl_dict, category_set):
    print 'cleaning'
    stop_word_set = text_process.getStopword(data_path + 'stopword.txt')
    for category in category_crawl_dict.keys():
        word_score_dict = {}
        outfile = open('../clean_data/' + str(category_path) + '/' + category, 'wb')
        print category
        for page in category_crawl_dict[category]:
            offset_weight = 1.0 * (5 - int(page['offset'])) / 5
            title = page['title']
            content = page['content']
            abstract = page['abstract']
            # content_seg_list = jieba.cut(content)
            # abstract_seg_list = jieba.cut(abstract)
            # all_seg_set = set(content_seg_list) | set(abstract_seg_list)
            # intersec_num = 1.0*len(all_seg_set & category_set)/len(category_set)
            # print '------------'
            # print title
            # print 'abstract: '+abstract
            # print 'content: '+content
            # print ' '.join(all_seg_set & category_set)
            # print intersec_num
            # print '------------'
            # offset_weight = offset_weight*intersec_num
            # if intersec_num <= 0.01:
            #     continue
            statRawText(10 * offset_weight, category, title, word_score_dict, category_set, stop_word_set)
            statRawText(6 * offset_weight, category, abstract, word_score_dict, category_set, stop_word_set)
            abstract_link = page['abstract_link']
            statTextList(10 * offset_weight, category, abstract_link, word_score_dict, category_set, stop_word_set)
            abstract_bold = page['abstract_bold']
            statTextList(8 * offset_weight, category, abstract_bold, word_score_dict, category_set, stop_word_set)
            if 'tags' in page.keys():
                tags = page['tags']
                statTextList(10 * offset_weight, category, tags, word_score_dict, category_set, stop_word_set)
            statRawText(1 * offset_weight, category, content, word_score_dict, category_set, stop_word_set)
            content_link = page['content_link']
            statTextList(4 * offset_weight, category, content_link, word_score_dict, category_set, stop_word_set)
            content_bold = page['content_bold']
            statTextList(2 * offset_weight, category, content_bold, word_score_dict, category_set, stop_word_set)
        sorted_list = sorted(word_score_dict.items(), key=lambda p: p[1], reverse=True)
        for val in sorted_list:
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')

def mineKeywordCombination(category_id, query_keyword):
    # main category name
    main_category = idToName(category_id)
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    combination_fre_dict = {}
    outfile = open('keyword_combination.txt', 'wb')
    # iterate over the apps under the main category
    infile = open('../data/' + str(category_id) + '.json', 'rb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_name_seg = [word for word in jieba.cut(app_name) if word not in stopword_set and text_process.isChinese(word)]
        app_brief_seg = [word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)]
        app_name_brief = app_name + " " + app_brief
        app_name_combination_dict = combineNeighborWord(app_name_seg, query_keyword)
        for word in app_name_combination_dict.keys():
            combination_fre_dict.setdefault(word, 0)
            combination_fre_dict[word] += app_name_combination_dict[word]
        app_brief_combination_dict = combineNeighborWord(app_brief_seg, query_keyword)
        for word in app_brief_combination_dict.keys():
            combination_fre_dict.setdefault(word, 0)
            combination_fre_dict[word] += app_brief_combination_dict[word]
    sorted_list = sorted(combination_fre_dict.items(), key=lambda p: p[1], reverse=True)
    for val in sorted_list:
        if val[1] >= 2:
            print val[0] + ',' + str(val[1])
            outfile.write(val[0] + ',' + str(val[1]) + '\r\n')

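# Hypothetical sketch of combineNeighborWord(); the real helper is not shown here.
# The assumption is that every time the query keyword appears in the segmented text,
# it is glued to its left and right neighbours to form candidate compound keywords,
# whose occurrence counts are returned.
def combineNeighborWord(seg_list, query_keyword):
    combination_dict = {}
    for i in range(len(seg_list)):
        if seg_list[i] != query_keyword:
            continue
        if i > 0:
            left = seg_list[i - 1] + query_keyword
            combination_dict.setdefault(left, 0)
            combination_dict[left] += 1
        if i + 1 < len(seg_list):
            right = query_keyword + seg_list[i + 1]
            combination_dict.setdefault(right, 0)
            combination_dict[right] += 1
    return combination_dict
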
def calculateCoverage(category_parent_dict, category_stat_dict):
    print 'loading jieba userdict'
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    print 'loading stopword'
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    print 'reading app json'
    infile = open('../data/' + category_path + '.json', 'rb')
    all_app_counter = 0
    print u'下载次数过滤阈值: ' + str(download_times_filter)
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        if app_download < download_times_filter:
            continue
        all_app_counter += 1
        # if u'设备' in app_brief:
        #     print app_brief
        for delegate_category in category_stat_dict.keys():
            for relevant_category in category_stat_dict[delegate_category][0]:
                if relevant_category in app_name or relevant_category in app_brief:
                    if relevant_category != delegate_category:
                        # if the two are strongly connected, the root node does not need to appear itself
                        if isStrongConnect(1, delegate_category, relevant_category, category_parent_dict):
                            category_stat_dict[delegate_category][1].add(app_id)
                            break
                        elif delegate_category in app_name or delegate_category in app_brief:
                            category_stat_dict[delegate_category][1].add(app_id)
                            break
                    else:
                        category_stat_dict[delegate_category][1].add(app_id)
                        break
    print u'过滤之后的app总数: ' + str(all_app_counter)
    top_coverage_category_info_dict = {}
    for iter_num in range(100):
        print '循环次数: ' + str(iter_num)
        coverage_ratio = rankTopCoverage(top_coverage_category_info_dict, category_stat_dict, all_app_counter)
        # stop once the cumulative coverage reaches the threshold
        if coverage_ratio >= 0.90:
            break

def tf(category_id, category_path, query_category, category_set, app_category_dict, app_tag_dict):
    print '-extracting feature'
    infile = open(data_path + 'all_cn_seg_nwi_clean.txt', 'rb')
    stopword_set = text_process.getStopword(data_path + 'stopword.txt')
    outfile_title = open('title_tf/' + str(category_path) + '.csv', 'wb')
    outfile_tag = open('tag_tf/' + str(category_path) + '.csv', 'wb')
    title_tf_dict = {}
    tag_tf_dict = {}
    for category in category_set:
        title_tf_dict.setdefault(category, 0)
        tag_tf_dict.setdefault(category, 0)
    row_index = 0
    for row in infile:
        row_index += 1
        items = row.strip().split("<@>")
        try:
            app_id = int(items[0])
            app_name = items[1].decode('utf-8')
            seg_brief_list = items[2].decode('utf-8').split()
        except:
            continue
        if app_category_dict[app_id][1] != category_id:
            continue
        if query_category != "":
            if not isRelevant(query_category, app_name, seg_brief_list):
                continue
        title(app_name, title_tf_dict)
        # brief(seg_brief_list,tf_dict)
        tag(app_id, app_tag_dict, tag_tf_dict)
    max_title_tf = max(title_tf_dict.values())
    print 'sorting'
    title_sorted_list = sorted(title_tf_dict.items(), key=lambda p: p[1], reverse=True)
    for val in title_sorted_list:
        outfile_title.write(val[0] + ',' + str(1.0 * val[1] / max_title_tf) + '\r\n')
    max_tag_tf = max(tag_tf_dict.values())
    tag_sorted_list = sorted(tag_tf_dict.items(), key=lambda p: p[1], reverse=True)
    for val in tag_sorted_list:
        outfile_tag.write(val[0] + ',' + str(1.0 * val[1] / max_tag_tf) + '\r\n')

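# Hypothetical sketches of the title()/tag() counters called by tf(); the actual
# helpers are defined elsewhere.  The assumption is that title() bumps the count of
# every candidate category that occurs as a substring of the app name, and tag() does
# the same for the categories found in the app's existing tag list.
def title(app_name, title_tf_dict):
    for category in title_tf_dict.keys():
        if category in app_name:
            title_tf_dict[category] += 1


def tag(app_id, app_tag_dict, tag_tf_dict):
    for app_tag in app_tag_dict.get(app_id, []):
        if app_tag in tag_tf_dict:
            tag_tf_dict[app_tag] += 1
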
def classify(category_name, category_parent_dict, category_child_dict, category_synonyms_dict, indicator_set, comment_category_set, ambiguation_dict):
    # main category name
    main_category = u"软件"
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)
    candidate_tag_set, candidate_delegate_tag_set = rule_base.getCandidateTag(main_category, node_children_dict, category_synonyms_dict)
    level_category_dict = rule_base.createLevelCategoryDict(main_category, candidate_tag_set, category_parent_dict, category_child_dict, category_synonyms_dict)
    for level in level_category_dict.keys():
        print level
        print ' '.join(level_category_dict[level])
    # iterate over the apps under the main category
    infile = open('../data/' + category_name + '.json', 'rb')
    outfile_classification = open('../data/' + category_name + '_classification.json', 'wb')
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_brief_seg = [word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)]
        app_name_brief = app_name + " " + app_brief
        app_name_brief += " " + rule_base.grabEnglish(app_name_brief)
        tag_recommend_set = set([])
        for tag in candidate_tag_set:
            if tag in app_name_brief:
                tag_recommend_set.add(category_synonyms_dict[tag][0])
        if len(level_category_dict[1] & tag_recommend_set) != 0:
            candidate_main_level_set = level_category_dict[1] & tag_recommend_set
            candidate_main_level_score_dict = {}
            for candidate_main_level in candidate_main_level_set:
                score = len(node_children_dict[candidate_main_level] & tag_recommend_set)
                candidate_main_level_score_dict.setdefault(score, set([])).add(candidate_main_level)
            max_score = max(candidate_main_level_score_dict.keys())
            if max_score >= 3:
                final_category_list = list(candidate_main_level_score_dict[max_score])
                if final_category_list[0] != category_name:
                    outfile_classification.write(str(app_id) + "->" + final_category_list[0] + "->" + app_name + "<@>" + " ".join(app_brief_seg) + '\r\n')

def calculateCoverage(category_stat_dict, synonyms_set_list):
    print 'loading file'
    jieba.load_userdict(data_path + "jieba_userdict.txt")
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    print 'reading file'
    infile = open('../data/' + category_path + '.json', 'rb')
    all_app_counter = 0
    for row in infile:
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["soft_id"])
        app_name = json_obj["soft_name"]
        app_brief = json_obj["soft_brief"]
        app_download = int(json_obj["download_times"])
        if app_download < 100:
            continue
        all_app_counter += 1
        seg_title_list = jieba.cut(app_name)
        seg_brief_list = jieba.cut(app_brief)
        for seg_title in seg_title_list:
            if text_process.isChinese(seg_title) and seg_title not in stopword_set:
                for main_category in category_stat_dict.keys():
                    if seg_title in category_stat_dict[main_category][0]:
                        category_stat_dict[main_category][1].add(app_id)
        for seg_brief in seg_brief_list:
            if text_process.isChinese(seg_brief) and seg_brief not in stopword_set:
                for main_category in category_stat_dict.keys():
                    if seg_brief in category_stat_dict[main_category][0]:
                        category_stat_dict[main_category][1].add(app_id)
    top_coverage_category_info_dict = {}
    for iter_num in range(20):
        stat(top_coverage_category_info_dict, category_stat_dict, all_app_counter, synonyms_set_list)

def recommendTag(category_name, category_parent_dict, category_child_dict, category_synonyms_dict, indicator_set, comment_category_set, ambiguation_dict):
    # main category name
    main_category = category_name
    # apps that were not matched
    others_app = {}
    outfile_json = open('tag_recommend_result.json', 'wb')
    jieba.load_userdict('../../../data/jieba_userdict.txt')
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    node_children_dict = rule_base.createNodeChildrenDict(category_child_dict)
    candidate_tag_set, candidate_delegate_tag_set = rule_base.getCandidateTag(main_category, node_children_dict, category_synonyms_dict)
    level_category_dict = rule_base.createLevelCategoryDict(main_category, candidate_tag_set, category_parent_dict, category_child_dict, category_synonyms_dict)
    # level_category_dict[0] = set([main_category])
    for level in level_category_dict.keys():
        print level
        print ' '.join(level_category_dict[level])
    match_counter = 0
    all_app_counter = 0
    # iterate over the apps under the main category
    infile = open('../data/' + category_name + '.json', 'rb')
    outfile_match = open('../data/' + category_name + '_match.json', 'wb')
    outfile_unmatch = open('../data/' + category_name + '_unmatch.json', 'wb')
    for row in infile:
        all_app_counter += 1
        json_obj = json.loads(row.strip())
        app_id = int(json_obj["id"])
        app_name = json_obj["title"]
        app_brief = json_obj["brief"]
        app_download = int(json_obj["download_times"])
        app_brief_seg = [word for word in jieba.cut(app_brief) if word not in stopword_set and text_process.isChinese(word)]
        app_name_brief = app_name + " " + app_brief
        app_name_brief += " " + rule_base.grabEnglish(app_name_brief)
        output_dict = {}
        output_dict["id"] = app_id
        output_dict["content"] = {}
        tag_recommend_set = set([])
        # sentiment-word matching; synonym relations between sentiment words are not handled yet
        for comment_word in [comment_word for comment_word in comment_category_set if comment_word in app_name_brief]:
            output_dict.setdefault("character", []).append(comment_word)
        # bottom-up matching
        for depth in reversed(range(0, max(level_category_dict.keys()) + 1)):
            if depth not in level_category_dict.keys():
                continue
            current_level_category_set = level_category_dict[depth]
            for current_level_category in current_level_category_set:
                if current_level_category in app_name_brief and not rule_base.isAmbiguous(current_level_category, ambiguation_dict, app_name_brief):
                    category_delegate = category_synonyms_dict[current_level_category][0]
                    tag_recommend_set.add(category_delegate)
                    # strong rules
                    strong_parent_set = rule_base.getNodeListOnStrongPath(category_parent_dict[category_delegate], category_parent_dict, set([]))
                    tag_recommend_set = tag_recommend_set | (strong_parent_set & candidate_tag_set)
            current_level_unmatch_category_set = current_level_category_set - tag_recommend_set
            for unmatch_category in current_level_unmatch_category_set:
                if unmatch_category in indicator_set:
                    continue
                unmatch_category = category_synonyms_dict[unmatch_category][0]
                unmatch_category_children = node_children_dict[unmatch_category]
                match_children = unmatch_category_children & tag_recommend_set
                if len(match_children) >= 3:
                    tag_recommend_set.add(unmatch_category)
        # hidden nodes
        for tag in tag_recommend_set:
            if u'(' in tag and u')' in tag:
                hidden_node_next_level = rule_base.getNextLevelCategorySet(category_synonyms_dict, category_child_dict, tag)
                for hidden_node_next_level_item in hidden_node_next_level:
                    hidden_node_next_level_item = category_synonyms_dict[hidden_node_next_level_item][0]
                    if hidden_node_next_level_item in tag_recommend_set:
                        output_dict.setdefault(tag, []).append(hidden_node_next_level_item)
        # remove the indicator (derived) words
        tag_recommend_set = tag_recommend_set - indicator_set
        # build the output dictionary
        content = outputJson(main_category, category_parent_dict, category_child_dict, category_synonyms_dict, tag_recommend_set)
        output_dict['content'] = content
        if len(content.keys()) != 0:
            outfile_match.write(row)
            match_counter += 1
            if app_download >= 10000000:
                continue
            outfile_json.write(json.dumps(output_dict, ensure_ascii=False) + '\r\n')
        else:
            outfile_unmatch.write(row)
            if app_download <= 500:
                continue
            others_app.setdefault(app_name, [app_download, ' '.join(app_brief_seg)])
    print "覆盖率: " + str(1.0 * match_counter / all_app_counter)
    # sort the remaining unmatched apps by download count and write them out
    other_title_fre = {}
    sorted_list = sorted(others_app.items(), key=lambda p: p[1][0], reverse=True)
    outfile_others = open('others.txt', 'wb')
    for val in sorted_list:
        title_seg = jieba.cut(val[0])
        for title in title_seg:
            if text_process.isChinese(title) and title not in stopword_set:
                other_title_fre.setdefault(title, 0)
                other_title_fre[title] += 1
        outfile_others.write(val[0] + '<@>' + val[1][1] + '\r\n')
    sorted_list = sorted(other_title_fre.items(), key=lambda p: p[1], reverse=True)
    outfile_others_title = open('others_title.txt', 'wb')
    for val in sorted_list:
        outfile_others_title.write(val[0] + '<@>' + str(val[1]) + '\r\n')

def readJson(word2vec_model):
    print 'parsing json'
    stopword_set = text_process.getStopword('../../../data/stopword.txt')
    outfile = open('baidu_baike_definition.txt', 'wb')
    infile = open('../../scrapy/baidu_baike_definition/crawl_data/definition.json', 'rb')
    row_index = 0
    for row in infile:
        json_str = row.strip()
        json_str = json_str.lstrip('[')
        json_str = json_str.rstrip(',')
        json_str = json_str.rstrip(']')
        json_obj = json.loads(json_str)
        query_word = json_obj['query_category']
        is_only = json_obj['is_only']
        ambiguous_tips = json_obj['ambiguous_tips']
        title = json_obj['title']
        title_note = json_obj['title_note']
        structure_tag = json_obj['structure_tag']
        abstract = json_obj['abstract']
        content = json_obj['content']
        word_synonyms_set = set([query_word])
        if is_only:
            word_synonyms_set.add(title)
        alias_list = []
        alias_list_clean = []
        for tag_key in structure_tag.keys():
            tag_key = tag_key.decode('utf-8')
            tag_value = structure_tag[tag_key].decode('utf-8')
            tag_key_clean = tag_key.replace(u' ', '')
            if tag_key_clean in baidu_tag_keys:
                if tag_value != u"无":
                    alias_list.append(tag_value)
        for alias in alias_list:
            alias = regular_template.cleanNote(alias)
            if regular_template.isEnglishPhrase(alias):
                print alias
                continue
            for word in alias.replace(u",", " ").replace(u"、", " ").replace(u",", " ").replace(u";", " ").replace(u";", " ").split():
                word = word.replace(u"“", "").replace(u"”", "").replace(u" ", "").rstrip(u"等")
                alias_list_clean.append(regular_template.cleanDefinition(word))
        alias_text = ' '.join(alias_list_clean)
        if is_only:
            word_synonyms_set = word_synonyms_set | set(alias_list_clean)
        ambiguous_tips = regular_template.cleanNote(ambiguous_tips)
        abstract_definition_text_set, abstract_definition_set = regular_template.main(abstract, query_word)
        abstract_definition_text = ' '.join(abstract_definition_text_set)
        title_note_definition_text_set, title_note_definition_set = regular_template.main(title_note, query_word)
        title_note_definition_text = ' '.join(title_note_definition_text_set)
        try:
            top_simi_words = [simi_word_tuple[0] for simi_word_tuple in word2vec_model.most_similar(positive=[query_word], topn=80)]
            for simi_word in top_simi_words:
                if len(simi_word) == 1:
                    continue
                if simi_word in alias_text or simi_word in abstract_definition_text or simi_word in ambiguous_tips or simi_word in title_note_definition_text or simi_word in title:
                    if not text_process.isSubsetGeneral(query_word, simi_word):
                        word_synonyms_set.add(simi_word)
            for pair in itertools.combinations(word_synonyms_set, 2):
                new_word = ''.join(pair)
                if new_word not in word_synonyms_set and (new_word in abstract_definition_text or new_word in title):
                    word_synonyms_set.add(new_word)
            if len([word for word in word_synonyms_set if len(word) > 0]) >= 2:
                outfile.write(query_word + '@' + ','.join([word for word in word_synonyms_set if len(word) > 0]) + '\r\n')
        except:
            print 'not in vocabulary ' + query_word
