def generate_newsid2pubtime(corpus_file):
    '''
        Build a mapping from each article id to its publish time (converted to
        a Unix timestamp) from the raw corpus.
    '''
    print 'run generate_newsid2pubtime...'
    id2pubtime = {}
    f = open(corpus_file, 'r')
    line = f.readline()
    while line:
        parts = line.strip().split('\t')
        id_ = parts[1].strip()
        pubtime_str = parts[5].strip()
        if pubtime_str.lower() == 'null':
            line = f.readline()
            continue
        print pubtime_str
        if ':' not in pubtime_str:
            #some records have no time-of-day part; pad with 00:00 to match the format
            pubtime_str += '00:00'
        pub_time = datetime.strptime(unicode2str(pubtime_str), unicode2str(u'%Y年%m月%d日%H:%M'))
        pub_time = int(time.mktime(pub_time.timetuple()))
        id2pubtime[id_] = pub_time
        line = f.readline()
    f.close()
    print 'finish generating %s id2pubtime elements' % len(id2pubtime)
    return id2pubtime
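
# Every snippet on this page calls unicode2str(), a project helper that is not
# shown here. A minimal sketch of what it presumably does in Python 2: encode
# unicode objects to UTF-8 byte strings and pass plain str through unchanged.
def unicode2str(text, encoding='utf-8'):
    # assumption: the real helper only normalizes unicode -> UTF-8 encoded str
    if isinstance(text, unicode):
        return text.encode(encoding)
    return text
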
def segment_train_data(train_data_path, saved_file):
    '''
        Segment the title and content fields, remove stopwords, and write the
        records (de-duplicated by newsid) to data_dir + saved_file.
    '''

    #filter_stopwords = lambda x: text_segment(x)
    stopwords = load_stopwords()

    def filter_stopwords(words):
        #import pdb;pdb.set_trace()
        return [r for r in words if r not in stopwords]

    print 'run segment_train_data...'
    titles = []
    f = open(train_data_path, 'r')
    line = f.readline()
    start_time = time.time()
    res = []
    cnt = 0
    newsids = set()
    round_start = time.time()
    while line:
        #line = unicode2str(line)
        parts = line.strip().split('\t')
        if parts[1].strip() in newsids:
            line = f.readline()
            continue

        newsids.add(parts[1].strip())
        cnt += 1
        parts[3] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[3], is_ret_utf8=True))))
        parts[4] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[4], is_ret_utf8=True))))
        res.append('\t\t'.join(parts))

        if cnt % 1000 == 0:
            round_cost = (time.time() - round_start)
            round_start = time.time()
            print 'segmenting %s, cost %.3fs, aver=%.3fs' % (
                cnt, round_cost, round_cost / 1000.0)

        line = f.readline()

    f.close()
    end_time = time.time()
    total_cost = (end_time - start_time) / 60.0
    aver_cost = total_cost / float(cnt)

    print 'segmenting all %s records, total cost=%.3fmin, average=%.3fmin' % (
        cnt, total_cost, aver_cost)

    fw = open(data_dir + saved_file, 'w+')
    fw.write('\n'.join(res))
    fw.close()
    print 'res is saved in %s' % (saved_file)
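
# text_segment() and load_stopwords() used above are project helpers that are
# not part of this page. A minimal sketch under the assumption that the
# segmentation is done with the jieba library and that stopwords live in a
# one-word-per-line UTF-8 file (the path below is only a placeholder):
import jieba

def load_stopwords(stopwords_path='stopwords.txt'):
    # keep stopwords as UTF-8 byte strings so they compare equal to the
    # is_ret_utf8=True tokens that filter_stopwords() checks against
    with open(stopwords_path) as f:
        return set(line.strip() for line in f)

def text_segment(text, is_ret_utf8=False):
    # jieba.cut() yields unicode tokens; optionally re-encode them to UTF-8
    if isinstance(text, str):
        text = text.decode('utf-8')
    words = jieba.cut(text)
    return [w.encode('utf-8') if is_ret_utf8 else w for w in words]
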
def segment_train_data(train_data_path, saved_original_file,
                       saved_unique_file):
    '''
        Segment the title and content fields and remove stopwords.
        Two files are produced:
            1. a segmented result that keeps the original data structure;
            2. a copy de-duplicated by newsid (unique_news), used when building the index.
    '''
    stopwords = load_stopwords()

    def filter_stopwords(words):
        return [r for r in words if r not in stopwords]

    print 'run segment_train_data...'
    f = open(train_data_path, 'r')
    line = f.readline()
    nid2tc, original_news, unique_news, newsids = {}, [], [], set()
    start_time, round_start, cnt = time.time(), time.time(), 0
    while line:
        parts = line.strip().split('\t')
        original_news.append(parts)
        nid = parts[1].strip()
        if nid in newsids:
            line = f.readline()
            continue
        newsids.add(nid)
        cnt += 1

        parts[3] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[3], is_ret_utf8=True))))
        parts[4] = unicode2str(' '.join(
            filter_stopwords(text_segment(parts[4], is_ret_utf8=True))))
        unique_news.append('\t'.join(parts))
        nid2tc[parts[1]] = (parts[3], parts[4])

        if cnt % 1000 == 0:
            round_cost = (time.time() - round_start)
            round_start = time.time()
            print 'segmenting %s, cost %.3fs, aver=%.3fs' % (
                cnt, round_cost, round_cost / 1000.0)

        line = f.readline()
    f.close()
    end_time = time.time()
    total_cost = (end_time - start_time) / 60.0
    aver_cost = total_cost / float(cnt)
    print 'segmenting all %s records(cnt=%s), total cost=%.3fmin, average=%.3fmin' % (
        len(unique_news), cnt, total_cost, aver_cost)

    save_original_data(original_news, nid2tc, saved_original_file)

    save_unique_data(unique_news, saved_unique_file)
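
# save_original_data() and save_unique_data() are not shown on this page.
# A minimal sketch of save_unique_data() under the assumption that it simply
# dumps one tab-joined record per line (save_original_data() presumably also
# looks the segmented title/content up in nid2tc for the duplicated rows):
def save_unique_data(unique_news, saved_unique_file):
    # unique_news already holds '\t'-joined, segmented records; the real
    # helper may prefix a data directory as the other snippets do
    fw = open(saved_unique_file, 'w+')
    fw.write('\n'.join(unique_news))
    fw.close()
    print 'unique news saved in %s' % saved_unique_file
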
def cut(self, text):
    '''Tokenize text with the SAE analyzer and yield (word, start, end) offsets.'''
    text = unicode2str(text)
    content = self._request(text)
    words = json.loads(content, encoding='UTF-8')
    start_pos = 0
    for word in words:
        try:
            yield word['word'], start_pos, start_pos + len(word['word'])
            start_pos += len(word['word'])
        except Exception, e:
            # sae analyzer unknown exception
            logging.warn('sae tokenizer error: %s', e)
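
# A small, self-contained illustration of the offset bookkeeping in cut(),
# with a hard-coded token list standing in for the analyzer response (the
# real class and its _request() call are not shown on this page):
def _demo_offsets(words):
    start_pos = 0
    for w in words:
        yield w, start_pos, start_pos + len(w)
        start_pos += len(w)

for w, s, e in _demo_offsets([u'今天', u'天气', u'不错']):
    print w, s, e   # each token with its [start, end) character span
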
def get_recommend_news_by_tfidf_sim():
    '''
        Using the tfidf-based user profile and each article's keywords (topN,
        set to 20), pick the topN most similar articles from the user's
        candidate articles and return them.
    '''
    topN = 20
    uids = os.listdir(user_keywords_by_tfidf)
    uid2can_newsids = get_user_candidate_newsids(user_candidate_newsids_path)
    user_recommend_res = []
    #recommend_res_path = recommend_res_path.replace('.csv', '_by_tfidf.csv')
    cnt = 0
    for uid in uids:
        cnt += 1
        if cnt % 100 == 0:
            print 'recommend %d user: %s' % (cnt, uid)
        user_terms = get_user_tfidf_terms(
            os.path.join(user_keywords_by_tfidf, uid), topN)
        candidate_newsids = uid2can_newsids.get(uid, [])
        if not candidate_newsids:
            continue
        candidate_news_top_terms = get_news_top_terms(candidate_newsids, topN)
        #can_news_vectors is aligned one-to-one with the nids in candidate_newsids
        user_vector, can_news_vectors = generate_feature_vectors(
            user_terms, candidate_news_top_terms, topN)
        #the sklearn call computes the cosine distance between the user and all news at once
        #note: the returned value is 1 - dot(v1, v2), so a smaller value means more similar (smaller distance)
        user_news_distances = distance(user_vector,
                                       Y=can_news_vectors,
                                       metric='cosine')
        user_news_distances = zip(candidate_newsids,
                                  user_news_distances.tolist()[0])
        user_news_distances = sorted(user_news_distances, key=lambda d: d[1])

        user_recommend_res.append(
            (uid, [nid for nid, d in user_news_distances][:REC_NUM]))

    fw = open(recommend_res_path, 'w+')
    fw.write('userid,newsid\n')
    cnt = 0
    for uid, rec_news in user_recommend_res:
        #import pdb;pdb.set_trace()
        cnt += 1
        if cnt % 100 == 0:
            print 'finish %d user: %s, %s' % (cnt, uid, ' '.join(rec_news))
        fw.write('\n'.join(
            [','.join((uid, unicode2str(nid))) for nid in rec_news]))
        fw.write('\n')
    fw.close()
    print 'finish recommending, res saved in %s' % recommend_res_path
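
# The distance() call above appears to wrap sklearn's pairwise-distance API.
# A small self-contained check of the "smaller distance = more similar"
# convention, assuming distance is sklearn.metrics.pairwise.pairwise_distances:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances

user_vector = np.array([[1.0, 0.0, 1.0]])
news_vectors = np.array([[1.0, 0.0, 1.0],    # identical direction
                         [0.0, 1.0, 0.0]])   # orthogonal
d = pairwise_distances(user_vector, Y=news_vectors, metric='cosine')
print d.tolist()[0]   # ~[0.0, 1.0]: cosine distance = 1 - cosine similarity
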
def generate_tfidf(corpus_path, tfidf_dir):
    '''
        Read the corpus and document ids from the segmented, stopword-filtered
        document collection, compute tf-idf weights, and save each document's
        result in a file named after its document id.
    '''
    newsids, corpus = generate_copurs_from_file(corpus_path)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    terms = vectorizer.get_feature_names()
    weights = tfidf.toarray()
    tfidf_res = []
    for i, id_ in enumerate(newsids):
        each_tf_idf = []
        save_file = tfidf_dir + id_
        if i % 100 == 0:
            print i
        for j, term in enumerate(terms):
            if weights[i][j] > 0.0001:
                each_tf_idf.append((terms[j], weights[i][j]))
        each_tf_idf = sorted(each_tf_idf, key=lambda d: d[1], reverse=True)
        fw = open(save_file, 'w+')
        fw.write('\n'.join(['%s,%s' % (unicode2str(t), w) for t, w in each_tf_idf]))
        fw.close()
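
# vectorizer and transformer are module-level globals that this page does not
# show. A minimal sketch of the usual sklearn setup they presumably refer to
# (a CountVectorizer feeding a TfidfTransformer):
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer = CountVectorizer()    # the corpus lines are already space-separated tokens
transformer = TfidfTransformer()
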
def get_recommend_news_by_xapian():
    '''
        Search with xapian using each user's keywords; the number of results
        retrieved per user is the user's read count + N (tentatively 5).
    '''
    print 'run get_recommend_news...'
    user_keywords = get_user_keywords()
    uid2newsids = get_user_read_list()

    def generate_query_str(keywords):
        return ' '.join(['title:%s content:%s' % (k, k) for k in keywords])

    rec_res = []
    for uid, keywords in user_keywords.items():
        if not keywords:
            #rec_res.append(uid, [])
            continue
        if uid not in uid2newsids:
            continue
        query_str = generate_query_str(keywords)
        read_news = set(uid2newsids.get(uid, []))
        read_num = len(read_news)
        search_res = search(indexed_file_path,
                            query_str,
                            ret_num=read_num + REC_NUM)
        user_rec_news = [r for r in search_res if r not in read_news]
        rec_res.append((uid, user_rec_news[:REC_NUM]))

    f = open(recommend_res_path, 'w+')
    f.write('userid,newsid\n')
    for uid, rec_news in rec_res:
        print uid, rec_news
        f.write('\n'.join(
            [','.join((uid, unicode2str(nid))) for nid in rec_news]))
        f.write('\n')

    f.close()
    print 'finish recommending, res saved in %s' % recommend_res_path
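
# search() is a project helper that is not shown here. A hedged sketch of what
# it might look like with the xapian Python bindings, assuming the index maps
# the 'title'/'content' fields to term prefixes and stores the newsid as the
# document data (the prefix strings below are assumptions):
import xapian

def search(index_path, query_str, ret_num=10):
    db = xapian.Database(index_path)
    parser = xapian.QueryParser()
    parser.set_database(db)
    parser.add_prefix('title', 'S')       # assumed prefix for titles
    parser.add_prefix('content', 'XC')    # assumed prefix for article bodies
    query = parser.parse_query(query_str)
    enquire = xapian.Enquire(db)
    enquire.set_query(query)
    return [match.document.get_data() for match in enquire.get_mset(0, ret_num)]
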
def save_uid_terms(saved_file, uid_terms_weights):
    fw = open(saved_file, 'w+')
    fw.write('\n'.join(['%s,%s' % (unicode2str(t), w) for t, w in uid_terms_weights]))
    fw.close()