Example #1
import codecs
import math
from collections import Counter

import utils  # project-local helper module


def convert_raw_file_to_indexed(infile, outfile, word_hashing_file):
    # Convert lines of the form "url\ttitle" into sparse "index:tfidf ..." lines.
    wh_model = utils.load_obj_from_file(word_hashing_file)
    with codecs.open(infile, 'r', 'utf-8') as rd:
        with codecs.open(outfile, 'w', 'utf-8') as wt:
            for line in rd:
                words = line.strip().split('\t')
                # Expect exactly "url\ttitle"; skip malformed lines.
                if len(words) != 2 or not line.startswith('http'):
                    continue
                tokens = utils.clean_str(words[1]).split(' ')
                if len(tokens) < 2:
                    continue
                cur_word_dict = Counter(tokens)
                # Keep only in-vocabulary words, mapped to their indices.
                cur_word_list = [(wh_model.word2idx[k], v)
                                 for k, v in cur_word_dict.items()
                                 if k in wh_model.word2idx]
                if not cur_word_list:
                    continue
                cur_word_list.sort()
                doc_word_cnt = sum(p[1] for p in cur_word_list) * 1.0
                if doc_word_cnt <= 0.001:
                    continue
                # Weight each word by tf * idf: tf is the word's share of the
                # document's tokens, idf is log2(100000 / corpus frequency).
                for p in cur_word_list:
                    wt.write('{0}:{1:.2f} '.format(
                        p[0], p[1] * 1.0 / doc_word_cnt * math.log2(
                            100000 * 1.0 /
                            wh_model.word2freq[wh_model.idx2word[p[0]]])))
                wt.write('\n')
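The weight written for each word is a plain tf-idf: the word's share of the document's tokens times a log-scaled inverse corpus frequency, with 100000 acting as a fixed scaling constant (presumably an approximate corpus size). A minimal standalone sketch of the same computation; tfidf_weight and scale are illustrative names, not part of the examples' utils module:

import math

def tfidf_weight(count, doc_word_cnt, word_freq, scale=100000):
    # tf: the word's share of the document's tokens.
    # idf: log2 of the scaled inverse corpus frequency.
    return count / doc_word_cnt * math.log2(scale / word_freq)

# A word seen twice in a 10-token title with corpus frequency 500:
# tfidf_weight(2, 10, 500) == 0.2 * math.log2(200), roughly 1.53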
Example #2
import utils  # project-local helper module


def load_wordhash_data(filename):
    # Load a serialized word-hashing model and peek at the first few
    # (word, frequency) pairs as a sanity check.
    wh_model = utils.load_obj_from_file(filename)
    for cnt, p in enumerate(wh_model.word2freq.items()):
        print(p)
        if cnt >= 10:
            break
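Invocation is just the path to the serialized model; the file below is the same one Example #3 loads:

# Same word-hashing model file as in Example #3.
load_wordhash_data(r'Y:\BingNews\Zhongxia\my\articles_wordhashing_3w.obj')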
Example #3
import random
from operator import itemgetter

import numpy as np

import utils  # project-local helper module


def print_top_words():
    # load_documents, load_autoencoder and wrap_ae_data are defined elsewhere
    # in the source module; they are not shown in these examples.
    infile = r'Y:\BingNews\Zhongxia\articles.txt'
    word_hashing_file = r'Y:\BingNews\Zhongxia\my\articles_wordhashing_3w.obj'
    doc2title, doc2category = load_documents(infile)
    wh_model = utils.load_obj_from_file(word_hashing_file)
    #doc2tfidf = convert_doc_tfidf(doc2title, wh_model)

    ae_model = load_autoencoder()

    doc_ids = list(doc2title.keys())
    while True:
        try:
            # Sample a random document and compute its normalized tf-idf vector.
            doc_id = random.choice(doc_ids)
            cur_title = doc2title[doc_id]
            cur_tfidf = utils.convert_line_to_tfidf(cur_title,
                                                    wh_model,
                                                    norm=True)
            if not cur_tfidf:
                continue
            print('docid: {0}\t cate: {2} \t title: {1}'.format(
                doc_id, doc2title[doc_id], doc2category[doc_id]))
            # Ground truth: the document's own top-k tf-idf terms.
            sorted_tfidf = sorted(cur_tfidf, key=itemgetter(1), reverse=True)
            k = min(10, len(sorted_tfidf))
            str_gt = [
                '{0}:{1:.2f}'.format(wh_model.idx2word[p[0]], p[1])
                for p in sorted_tfidf[0:k]
            ]
            print('gt: ' + ' '.join(str_gt))

            # Mask 20% of the input and ask the autoencoder to reconstruct it.
            data_for_ae = wrap_ae_data(cur_tfidf, 0.2, True)
            pred, masked_pred = ae_model.get_predictions(*data_for_ae, 1)
            pred = list(enumerate(np.reshape(pred, [-1]).tolist()))
            masked_pred = list(
                enumerate(np.reshape(masked_pred, [-1]).tolist()))
            masked_pred = [p for p in masked_pred if p[1] > 0.001]

            # Compare the model's top terms with and without masking.
            pred = sorted(pred, key=itemgetter(1), reverse=True)
            masked_pred = sorted(masked_pred, key=itemgetter(1), reverse=True)
            k = min(10, len(masked_pred))
            str_pred_all = [
                '{0}:{1:.2f}'.format(wh_model.idx2word[p[0]], p[1])
                for p in pred[0:k]
            ]
            str_pred_masked = [
                '{0}:{1:.2f}'.format(wh_model.idx2word[p[0]], p[1])
                for p in masked_pred[0:k]
            ]
            print('pred_all: ' + ' '.join(str_pred_all))
            print('pred_masked: ' + ' '.join(str_pred_masked))

            input("press enter to continue... ")
        except KeyboardInterrupt:
            break
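Neither convert_line_to_tfidf nor wrap_ae_data is defined in these examples. Based on the computation inlined in Example #1, a plausible reconstruction of the tf-idf helper might look like the following; the norm branch and the scale constant are assumptions, and the real utils implementation may differ:

import math
from collections import Counter

import utils  # assumes the same project-local module as above


def convert_line_to_tfidf(line, wh_model, norm=False, scale=100000):
    # Tokenize and count in-vocabulary words (mirrors Example #1).
    tokens = utils.clean_str(line).split(' ')
    if len(tokens) < 2:
        return []
    counts = [(wh_model.word2idx[k], v)
              for k, v in Counter(tokens).items()
              if k in wh_model.word2idx]
    if not counts:
        return []
    counts.sort()
    total = float(sum(v for _, v in counts))
    if total <= 0.001:
        return []
    # Same tf-idf weighting as Example #1.
    weights = [(i, v / total *
                math.log2(scale / wh_model.word2freq[wh_model.idx2word[i]]))
               for i, v in counts]
    if norm:
        # Assumed rescaling to unit L1 norm; not confirmed by the source.
        s = sum(w for _, w in weights)
        weights = [(i, w / s) for i, w in weights]
    return weights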
Example #4
import codecs

import utils  # project-local helper module


def convert_raw_file_to_indexed(infile, outfile, word_hashing_file, norm=False):
    r'''
    input format: id\t category\t title
    output format: id\t word:weight ...
    '''
    wh_model = utils.load_obj_from_file(word_hashing_file)
    with codecs.open(infile, 'r', 'utf-8') as rd:
        with codecs.open(outfile, 'w', 'utf-8') as wt:
            for line in rd:
                words = line.strip().split('\t')
                # Expect exactly "id\tcategory\ttitle"; skip malformed lines.
                if len(words) != 3:
                    continue
                # The inline tf-idf computation from Example #1 has been
                # factored out into utils.convert_line_to_tfidf.
                cur_word_list = utils.convert_line_to_tfidf(words[2],
                                                            wh_model, norm)
                if not cur_word_list:
                    continue
                wt.write(words[0] + '\t')
                for p in cur_word_list:
                    wt.write('{0}:{1:.2f} '.format(p[0], p[1]))
                wt.write('\n')
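A typical invocation, reusing the corpus and model paths from Example #3; the output path is hypothetical:

convert_raw_file_to_indexed(
    r'Y:\BingNews\Zhongxia\articles.txt',
    r'Y:\BingNews\Zhongxia\my\articles_indexed.txt',  # hypothetical output path
    r'Y:\BingNews\Zhongxia\my\articles_wordhashing_3w.obj',
    norm=True)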