def convert_raw_file_to_indexed(infile, outfile, word_hashing_file):
    r'''
    Two-column variant: each input line is url\t title (lines must start with
    'http'); each output line is a space-separated list of word_idx:weight pairs.
    '''
    wh_model = utils.load_obj_from_file(word_hashing_file)
    with codecs.open(infile, 'r', 'utf-8') as rd:
        with codecs.open(outfile, 'w', 'utf-8') as wt:
            while True:
                line = rd.readline()
                if not line:
                    break
                words = line.strip().split('\t')
                if len(words) != 2 or not line.startswith('http'):
                    continue
                tokens = utils.clean_str(words[1]).split(' ')
                if len(tokens) < 2:
                    continue
                # term counts, restricted to the word-hashing vocabulary
                cur_word_dict = Counter(tokens)
                cur_word_list = [(wh_model.word2idx[k], v)
                                 for k, v in cur_word_dict.items()
                                 if k in wh_model.word2idx]
                if not cur_word_list:
                    continue
                cur_word_list.sort()
                doc_word_cnt = sum(p[1] for p in cur_word_list) * 1.0
                if doc_word_cnt <= 0.001:
                    continue
                # weight = tf * log2(100000 / word_frequency)
                for p in cur_word_list:
                    wt.write('{0}:{1:.2f} '.format(
                        p[0],
                        p[1] * 1.0 / doc_word_cnt * math.log2(
                            100000 * 1.0 /
                            wh_model.word2freq[wh_model.idx2word[p[0]]])))
                wt.write('\n')
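# The three-column variant further below delegates the per-line TF-IDF computation
# to utils.convert_line_to_tfidf, whose implementation lives in utils and is not
# shown in this file. The function below is only a minimal sketch of what that
# helper presumably does, inferred from the inline logic above (clean_str ->
# Counter -> tf * log2(N / freq)); the name, the vocabulary-size constant, and the
# norm branch (guessed to be L2 normalization) are assumptions. It relies on the
# same module-level imports (Counter, math, utils) as the functions in this file.
def _convert_line_to_tfidf_sketch(line, wh_model, norm=False):
    tokens = utils.clean_str(line).split(' ')
    if len(tokens) < 2:
        return None
    cur_word_dict = Counter(tokens)
    cur_word_list = [(wh_model.word2idx[k], v)
                     for k, v in cur_word_dict.items() if k in wh_model.word2idx]
    if not cur_word_list:
        return None
    cur_word_list.sort()
    doc_word_cnt = float(sum(p[1] for p in cur_word_list))
    if doc_word_cnt <= 0.001:
        return None
    # tf * idf, with idf approximated as log2(corpus_size / word_frequency)
    weighted = [(idx, cnt / doc_word_cnt *
                 math.log2(100000.0 / wh_model.word2freq[wh_model.idx2word[idx]]))
                for idx, cnt in cur_word_list]
    if norm:
        # assumed: L2-normalize the weight vector when norm=True
        total = math.sqrt(sum(w * w for _, w in weighted))
        if total > 0:
            weighted = [(idx, w / total) for idx, w in weighted]
    return weighted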
def load_wordhash_data(filename):
    '''Load a word-hashing model and print the first few (word, frequency) pairs as a sanity check.'''
    wh_model = utils.load_obj_from_file(filename)
    cnt = 0
    for p in wh_model.word2freq.items():
        print(p)
        cnt += 1
        if cnt > 10:
            break
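# The word-hashing object loaded above is only ever accessed through three
# attributes: word2idx, idx2word, and word2freq. The class below is a hypothetical
# stand-in that illustrates that assumed shape; the real object is whatever
# utils.load_obj_from_file deserializes and may carry additional fields.
class WordHashModelSketch(object):
    def __init__(self, word2idx, idx2word, word2freq):
        self.word2idx = word2idx    # token -> integer index
        self.idx2word = idx2word    # integer index -> token
        self.word2freq = word2freq  # token -> corpus frequency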
def print_top_words():
    '''Interactively sample random documents and compare their top TF-IDF words with the autoencoder's predictions.'''
    infile = r'Y:\BingNews\Zhongxia\articles.txt'
    word_hashing_file = r'Y:\BingNews\Zhongxia\my\articles_wordhashing_3w.obj'
    doc2title, doc2category = load_documents(infile)
    wh_model = utils.load_obj_from_file(word_hashing_file)
    #doc2tfidf = convert_doc_tfidf(doc2title, wh_model)
    ae_model = load_autoencoder()
    doc_ids = list(doc2title.keys())
    while True:
        try:
            doc_id = random.choice(doc_ids)
            cur_title = doc2title[doc_id]
            cur_tfidf = utils.convert_line_to_tfidf(cur_title, wh_model, norm=True)
            if not cur_tfidf:
                continue
            print('docid: {0}\t cate: {2} \t title: {1}'.format(
                doc_id, doc2title[doc_id], doc2category[doc_id]))
            # ground truth: top-k TF-IDF words of the title
            sorted_tfidf = sorted(cur_tfidf, key=itemgetter(1), reverse=True)
            k = min(10, len(sorted_tfidf))
            str_gt = [
                '{0}:{1:.2f}'.format(wh_model.idx2word[p[0]], p[1])
                for p in sorted_tfidf[0:k]
            ]
            print('gt: ' + ' '.join(str_gt))
            # autoencoder predictions on the full input and on the masked input
            data_for_ae = wrap_ae_data(cur_tfidf, 0.2, True)
            pred, masked_pred = ae_model.get_predictions(*data_for_ae, 1)
            pred = list(enumerate(np.reshape(pred, [-1]).tolist()))
            masked_pred = list(
                enumerate(np.reshape(masked_pred, [-1]).tolist()))
            masked_pred = [p for p in masked_pred if p[1] > 0.001]
            pred = sorted(pred, key=itemgetter(1), reverse=True)
            masked_pred = sorted(masked_pred, key=itemgetter(1), reverse=True)
            k = min(10, len(masked_pred))
            str_pred_all = [
                '{0}:{1:.2f}'.format(wh_model.idx2word[p[0]], p[1])
                for p in pred[0:k]
            ]
            str_pred_masked = [
                '{0}:{1:.2f}'.format(wh_model.idx2word[p[0]], p[1])
                for p in masked_pred[0:k]
            ]
            print('pred_all: ' + ' '.join(str_pred_all))
            print('pred_masked: ' + ' '.join(str_pred_masked))
            input("press enter to continue... ")
        except KeyboardInterrupt:
            break
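# load_documents is defined elsewhere in the project; print_top_words only needs
# it to return two dicts keyed by document id. A minimal sketch, assuming the same
# id \t category \t title layout documented in the three-column
# convert_raw_file_to_indexed variant below; the helper name is hypothetical.
def _load_documents_sketch(filename):
    doc2title, doc2category = {}, {}
    with codecs.open(filename, 'r', 'utf-8') as rd:
        for line in rd:
            words = line.strip().split('\t')
            if len(words) != 3:
                continue
            doc_id, category, title = words
            doc2title[doc_id] = title
            doc2category[doc_id] = category
    return doc2title, doc2category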
def convert_raw_file_to_indexed(infile, outfile, word_hashing_file, norm=False):
    r'''
    input format:  id\t category\t title
    output format: id\t word_idx:weight ...
    '''
    wh_model = utils.load_obj_from_file(word_hashing_file)
    with codecs.open(infile, 'r', 'utf-8') as rd:
        with codecs.open(outfile, 'w', 'utf-8') as wt:
            while True:
                line = rd.readline()
                if not line:
                    break
                words = line.strip().split('\t')
                if len(words) != 3:
                    continue
                # the inline tokenize / Counter / TF-IDF logic of the two-column
                # variant above is now handled by utils.convert_line_to_tfidf
                cur_word_list = utils.convert_line_to_tfidf(words[2], wh_model, norm)
                if not cur_word_list:
                    continue
                wt.write(words[0] + '\t')
                for p in cur_word_list:
                    wt.write('{0}:{1:.2f} '.format(p[0], p[1]))
                wt.write('\n')
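# Hypothetical driver showing how the three-column converter above would be
# invoked: the input and word-hashing paths mirror print_top_words, while the
# output path is made up for illustration.
def _demo_convert_raw_file_to_indexed():
    convert_raw_file_to_indexed(
        r'Y:\BingNews\Zhongxia\articles.txt',
        r'Y:\BingNews\Zhongxia\my\articles_indexed.txt',  # hypothetical output path
        r'Y:\BingNews\Zhongxia\my\articles_wordhashing_3w.obj',
        norm=True)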