Example #1
def coll_sentence_hash():
    logger_9965.info("Begin to collect sentence...")
    exist_set = get_exist_nids()
    limit = 10000
    offset = 10000
    pool = Pool(30)
    while True:
        conn, cursor = get_postgredb_query()
        cursor.execute(cal_sql2, (ignore_cname, limit, offset))
        rows = cursor.fetchall()
        conn.close()
        offset += limit
        if len(rows) == 0:
            break
        all_set = set()
        for r in rows:
            all_set.add(r[0])
        need_to_cal_set = all_set - exist_set
        if len(need_to_cal_set) == 0:
            continue
        same_dict = get_relate_same_news(need_to_cal_set)
        pool.apply_async(cal_process, args=(need_to_cal_set, None, 3, 3, same_dict)) # duplicate threshold is 3; take news from within 2 days

    pool.close()
    pool.join()

    logger_9965.info("Congratulations! Finish to collect sentences.")
Example #2
def check_del_sub_subject(sub_id1, sub_id2):
    nids1 = set()
    nids2 = set()
    conn, cursor = get_postgredb_query()
    cursor.execute(check_sql, (sub_id1, ))
    rows = cursor.fetchall()
    for r in rows:
        nids1.add(r[0])

    cursor.execute(check_sql, (sub_id2, ))
    rows = cursor.fetchall()
    for r in rows:
        nids2.add(r[0])

    cursor.close()
    conn.close()
    if (nids1 | nids2 == nids1) or (nids1 | nids2 == nids2):  # one nid set contains the other
        delete_id = sub_id1 if len(nids1) < len(nids2) else sub_id2
        logger_sub.info('delete subject {}.'.format(delete_id))
        logger_sub.info('    {} :{}'.format(sub_id1, nids1))
        logger_sub.info('    {} :{}'.format(sub_id2, nids2))
        data = {'id':delete_id}
        requests.post(delete_sub_url, data=data, cookies=cookie)
        return delete_id
    return None
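The union comparison above is just a containment test; the same check can be written with Python's subset operators (a sketch, assuming nids1 and nids2 are ordinary sets as built above):

def one_set_contains_other(nids1, nids2):
    # True when either set is a subset of the other, matching
    # (nids1 | nids2 == nids1) or (nids1 | nids2 == nids2)
    return nids1 <= nids2 or nids2 <= nids1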
Example #3
def random_predict_nids():
    sql = "select nid from newslist_v2 nv inner join channellist_v2 cl on nv.chid=cl.id where cl.cname in %s order by nid desc limit 50"
    conn, cursor = doc_process.get_postgredb_query()
    #print cursor.mogrify(sql, (tuple(chnl_newsnum_dict.keys()),))
    cursor.execute(sql, (tuple(chnl_newsnum_dict.keys()), ))
    rows = cursor.fetchall()
    conn.close()
    return kmeans_predict(list(rows))
Example #4
def get_exist_nids():
    conn, cursor = get_postgredb_query()
    cursor.execute(s_nid_sql)
    rows = cursor.fetchall()
    nid_set = set()
    for r in rows:
        nid_set.add(r[0])
    conn.close()
    return nid_set
Example #5
def get_chname_id_dict():
    global chname_id_dict
    chname_id_sql = "select id, cname from channellist_v2"
    conn, cursor = doc_process.get_postgredb_query()
    cursor.execute(chname_id_sql)
    rows = cursor.fetchall()
    for r in rows:
        chname_id_dict[r[1]] = r[0]
    cursor.close()
    conn.close()
Example #6
def get_newest_topic_v():
    topic_sql = "select model_v from user_topics_v2 group by model_v"
    conn, cursor = get_postgredb_query()
    cursor.execute(topic_sql)
    rows = cursor.fetchall()
    topic_vs = []
    for row in rows:
        topic_vs.append(row[0])
    conn.close()
    return max(topic_vs)
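Since only the newest version is needed, the aggregation could also be pushed into SQL instead of computed in Python (a sketch against the same table, using the same connection helper as above):

def get_newest_topic_v_sql():
    conn, cursor = get_postgredb_query()
    cursor.execute("select max(model_v) from user_topics_v2")
    newest = cursor.fetchone()[0]
    conn.close()
    return newest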
Example #7
def add_cover_to_sub():
    conn, cursor = get_postgredb_query()
    cover_sql = "select id from topiclist where type=1 and cover=''"
    cursor.execute(cover_sql)
    rows = cursor.fetchall()
    for r in rows:
        data = {'id': r[0], 'cover': subject_cover}
        requests.put(modify_url, data=data, cookies=cookie)
    cursor.close()
    conn.close()
Example #8
def get_active_user_info(min_interval=1, min_click=1):
    '''
    Get active user information.
    :param min_interval: only users with click activity within this many days count as active
    :param min_click: minimum number of clicks per user
    :return:
    '''
    nt = datetime.datetime.now()
    t = nt.strftime('%Y-%m-%d %H:%M:%S')
    # get active users
    user_sql = "select uid from newsrecommendclick " \
               "where ctime > to_timestamp('{}', 'yyyy-mm-dd hh24:mi:ss') - interval '{} day' " \
               "group by uid HAVING \"count\"(*)>={}"
    conn, cursor = get_postgredb_query()
    cursor.execute(user_sql.format(t, min_interval, min_click))
    rows = cursor.fetchall()
    active_users = [r[0] for r in rows]

    # first, get user features
    user_device_sql = "select uid, brand,device_size,network,ctype,province,city,area " \
                      "from user_device " \
                      "where uid in ({})"
    user_raw_info = dict()
    cursor.execute(
        user_device_sql.format(','.join(str(u) for u in active_users)))
    user_raw = cursor.fetchall()
    for u in user_raw:
        user_raw_info[u[0]] = [u[1], u[2], u[3], u[4], u[5], u[6], u[7]]

    # get each user's active hours and clicked news
    user_time_sql = "select nid, ctime from newsrecommendclick " \
                    "where uid={} and " \
                    "ctime > to_timestamp('{}', 'yyyy-mm-dd hh24:mi:ss') - interval '{} day' "
    user_click_dict = dict()
    for u in user_raw_info.keys():
        hour_dict = dict()  # record the number of clicks per hour
        cursor.execute(user_time_sql.format(u, t, min_interval))
        rows = cursor.fetchall()
        user_click_dict[u] = []
        for r in rows:
            h = r[1].hour
            hour_dict[h] = 1 if h not in hour_dict else hour_dict[h] + 1
            user_click_dict[u].append(r[0])
        user_raw_info[u].append(hour_dict.keys())

    user_csv = pd.Series(user_raw_info).to_csv('user_feature.csv')
    user_click_csv = pd.Series(user_click_dict).to_csv('user_click.csv')
    print 'finished!!'
    # take negative samples

    cursor.close()
    conn.close()
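get_active_user_info builds its SQL with str.format. If the cursor comes from psycopg2 (an assumption here; get_postgredb_query is not shown), the same queries can take parameters directly, which also handles quoting of the IN list. A sketch of the user_device query only:

def fetch_user_devices(cursor, active_users):
    # assumes a psycopg2 cursor: a Python tuple is adapted to a SQL value list for IN
    sql = ("select uid, brand, device_size, network, ctype, province, city, area "
           "from user_device where uid in %s")
    cursor.execute(sql, (tuple(active_users),))
    return cursor.fetchall()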
Example #9
    def get(self):
        try:
            print '-------------deal old clicks begin!-------------'
            from graphlab_lda import topic_model_model
            s = "select uid, nid, ctime from newsrecommendclick where ctime > now() - interval '30 day'"
            from util import doc_process
            conn, cursor = doc_process.get_postgredb_query()
            cursor.execute(s)
            rows = cursor.fetchall()
            clicks = tuple(rows)
            topic_model_model.predict_clicks(clicks)
            print '-------------deal old clicks finish!-------------'
        except:
            traceback.print_exc()
Example #10
def deal_old_nids(nid_list):
    try:
        s = 'select nid from news_topic_v2 where nid in {}'
        conn, cursor = get_postgredb_query()
        cursor.execute(s.format(tuple(nid_list)))
        rows = cursor.fetchall()
        nid_set = set(nid_list)
        exist_set = set(r[0] for r in rows)  # rows are 1-tuples, so unpack the nid
        to_deal_set = nid_set - exist_set
        predict_nids(list(to_deal_set))
        conn.close()
    except:
        conn.close()
        raise
Example #11
def get_old_news(interval=2.0):
    old_news_sql = "select ns.nid, hash_val from news_simhash ns " \
                   "inner join newslist_v2 nv on ns.nid=nv.nid " \
                   "where (ns.ctime > now() - interval '{0} day') and nv.state=0 " \
                   "and nv.chid != 44"
    conn, cursor = doc_process.get_postgredb_query()
    cursor.execute(old_news_sql.format(interval))
    rows = cursor.fetchall()
    nids_hash_dict = dict()
    for r in rows:
        nids_hash_dict[r[0]] = long(r[1])
    cursor.close()
    conn.close()
    return nids_hash_dict
Example #12
def choose_subject_name(name_list):
    # first, drop subject names that already exist
    check_exist_sql = "select id from topiclist where " \
                      "create_time > now() - interval '7 day' and " \
                      "type = 1 and name=%s"
    conn, cursor = get_postgredb_query()
    #for name in name_list:
    for n in name_list:
        logger_sub.info('    before name_list {}'.format(n))
    i = 0
    while i < len(name_list):
        cursor.execute(check_exist_sql, (name_list[i], ))
        row = cursor.fetchone()
        if row:
            logger_sub.info('    {} remove {}'.format(row[0], name_list[i]))
            name_list.remove(name_list[i])
            continue
        i += 1
    conn.close()

    #logger_sub.info('after name_list: {}'.format(name_list))
    if len(name_list) == 0:
        logger_sub.info('all invalid!!!!')
        #raise ValueError('all subject names have existed!')
        return None
    for n in name_list:
        logger_sub.info('    after name_list {}'.format(n))

    word_doc_freq = dict()  # document frequency of each word
    name_ws = []
    name_num = len(name_list)
    for name in name_list:
        ws = set(cut_pos_ltp(name, return_str=False))
        name_ws.append(ws)
        for w in ws:
            if w in word_doc_freq:
                word_doc_freq[w] += 1
            else:
                word_doc_freq[w] = 1
    words_matter = []
    for item in word_doc_freq.items():
        if item[1] > name_num / 2:
            words_matter.append(item[0])
    words_matter_ratio = []
    for name in name_ws:
        name_matter = name & set(words_matter)
        words_matter_ratio.append(len(name_matter) / float(len(name)))
    index, value = max(enumerate(words_matter_ratio), key=operator.itemgetter(1))
    return name_list[index]
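The final selection step picks the candidate name with the largest share of "majority" words. A tiny worked illustration of that max-by-ratio idiom, using made-up ratio values:

import operator

ratios = [0.5, 0.8, 0.6]           # hypothetical words_matter_ratio values
index, value = max(enumerate(ratios), key=operator.itemgetter(1))
assert (index, value) == (1, 0.8)  # the second candidate name would be chosen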
Example #13
def get_hashval():
    sql = "select nid, hash_val from news_simhash where ctime > now() - interval '2 day'"
    conn, cursor = get_postgredb_query()
    cursor.execute(sql)
    rows = cursor.fetchall()
    t0 = datetime.datetime.now()
    print 'compare with {}'.format(len(rows))
    hashval = 3255685376439667788
    same = []
    for r in rows:
        if simhash.dif_bit(hashval, long(r[1])) <= 12:
            same.append(r[0])

    t1 = datetime.datetime.now()
    print len(same)
    print 'it takes {} sec'.format((t1 - t0).total_seconds())
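simhash.dif_bit is not shown here; it presumably returns the Hamming distance between two 64-bit hash values. A self-contained sketch under that assumption:

def dif_bit(h1, h2):
    # number of differing bits between two 64-bit hash values
    x = (h1 ^ h2) & 0xFFFFFFFFFFFFFFFF
    count = 0
    while x:
        x &= x - 1   # clear the lowest set bit
        count += 1
    return count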
Example #14
def get_news_interval(h, interval = 9999):
    '''
    Find news items that may be duplicates within a given time window.
    :param h:
    :param interval:
    :return:
    '''
    fir, sec, thi, fou, fir2, sec2, thi2, fou2 = get_4_segments(h.__long__())
    conn, cursor = doc_process.get_postgredb_query()
    cursor.execute(hash_sql.format(interval, fir, sec, thi, fou, fir2, sec2, thi2, fou2))
    rows = cursor.fetchall()
    nid_hv_list = []
    for r in rows:
        nid_hv_list.append((r[0], r[1]))
    conn.close()
    return nid_hv_list
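get_4_segments appears to split the 64-bit simhash into 16-bit blocks so that hash_sql can prefilter candidates: by the pigeonhole principle, two hashes within a small Hamming distance must agree on at least one block. A sketch of that split (the second group of four values returned by the real function is not reconstructed here):

def split_into_16bit_blocks(h):
    # four 16-bit blocks of a 64-bit hash, lowest block first
    mask = 0xFFFF
    return [(h >> (16 * i)) & mask for i in range(4)]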
Example #15
def coll_news_proc(save_dir, chnl, doc_num_per_chnl, csv_path):
    try:
        logger.info('    start to collect {} ......'.format(chnl))
        #f = open(os.path.join(save_dir, chnl), 'w') # per-channel output file
        conn, cursor = doc_process.get_postgredb_query()
        if chnl in channel_for_topic_dict.keys():
            num = channel_for_topic_dict[chnl]
        else:
            num = doc_num_per_chnl
        logger.info('    {} num is {}'.format(chnl, num))
        cursor.execute(channle_sql, (chnl, num))
        logger.info('        finish to query {} '. format(chnl))
        rows = cursor.fetchall()
        print len(rows)
        df = pd.DataFrame(columns=csv_columns)
        for row in rows:
            title = row[0]
            content_list = row[1]
            txt = ''
            for content in content_list:
                if 'txt' in content.keys():
                    txt += content['txt'].encode('utf-8')
            total_txt = title*3 + txt
            data = {'nid':[row[2]], 'doc':[''.join(total_txt.split())]} # split() mainly strips carriage returns (\r); otherwise pandas.read_csv fails
            df_local = pd.DataFrame(data, columns=csv_columns)
            df = df.append(df_local, ignore_index=True)
            '''
            total_list = doc_process.filter_html_stopwords_pos(total_txt, remove_num=True, remove_single_word=True)
            if len(total_list) < doc_min_len:  # discard if too few words
                continue
            # second-pass filtering based on tf-idf
            total_list = doc_process.jieba_extract_keywords(' '.join(total_list), min(50, len(total_list)/5))
            for w in total_list:
                f.write(w.encode('utf-8') + ' ')
            f.write('\n')
            #f.write(' '.join(total_list).encode('utf-8') + '\n')
            del content_list
            '''
        df.to_csv(csv_path, index=False)
        cursor.close()
        conn.close()
        #f.close()
        logger.info('    finished to collect {} ......'.format(chnl))
    except:
        traceback.print_exc()
        logger.exception(traceback.format_exc())
Example #16
def get_relate_same_news(nid_set):
    if len(nid_set) == 0:
        return dict()
    conn, cursor = get_postgredb_query()
    nid_tuple = tuple(nid_set)
    cursor.execute(same_sql, (nid_tuple, nid_tuple))
    same_dict = {}
    rows = cursor.fetchall()
    for r in rows:
        if r[0] not in same_dict.keys():
            same_dict[r[0]] = []
        if r[1] not in same_dict.keys():
            same_dict[r[1]] = []
        same_dict[r[0]].append(r[1])
        same_dict[r[1]].append(r[0])

    conn.close()
    return same_dict
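The membership checks against same_dict.keys() can be dropped by using collections.defaultdict; a behaviour-equivalent sketch of the dictionary construction:

from collections import defaultdict

def build_same_dict(rows):
    # rows are (nid_a, nid_b) pairs; record the relation in both directions
    same_dict = defaultdict(list)
    for a, b in rows:
        same_dict[a].append(b)
        same_dict[b].append(a)
    return dict(same_dict)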
Example #17
def is_sentence_ads(hash_val, fir_16, sec_16, thi_16, fou_16, fir2_16, sec2_16, thi2_16, fou2_16, pname):
    conn, cursor = get_postgredb_query()
    cursor.execute(check_ads_sql, (fir_16, sec_16, thi_16, fou_16, fir2_16, sec2_16, thi2_16, fou2_16))
    rows = cursor.fetchall()
    for r in rows:
        if hash_val.hamming_distance_with_val(long(r[1])) <= 3:
            exist = False
            if r[2]:
                spnames = r[2].split(',')
                if len(spnames) == 0 or (pname in spnames):
                    exist = True
            else:
                exist = True
            if exist:
                conn.close()
                return True
    conn.close()
    return False
Example #18
def update_sub_name_on_nids(sub_id, nids):
    conn, cursor = get_postgredb_query()
    sql = "select title from newslist_v2 where nid in ({}) and nid not in (select news from topicnews where topic={})"
    nid_str = ', '.join(str(i) for i in nids)
    cursor.execute(sql.format(nid_str, sub_id))
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    logger_sub.info('    choose from {}'.format(nids))
    for r in rows:
        logger_sub.info('    choose from {}'.format(r[0]))
    mod_name = choose_subject_name([r[0] for r in rows])
    if not mod_name:
        return
    data = {'id': sub_id, 'name': mod_name}
    respond = requests.put(modify_url, data=data, cookies=cookie)
    logger_sub.info('response:  {}'.format(respond.content))
    logger_sub.info('update {} sub name to {}'.format(sub_id, mod_name))
Example #19
def del_nid_of_fewer_comment(nid, n, log=logger):
    try:
        conn, cursor = doc_process.get_postgredb_query()
        # first check whether either news item has been manually recommended; if so, delete the one that has not
        cursor.execute(recommend_sql, (nid, n))
        rs = cursor.fetchall()
        if len(rs) == 1: # exactly one of them was manually published
            for r in rs:
                rnid = r[0]
            if rnid == n:
                del_nid = nid
                stay_nid = n
            else:
                del_nid = n
                stay_nid = nid
            #cursor.execute(offonline_sql.format(del_nid))
            #conn.commit()
            data = {}
            data['nid'] = del_nid
            response = requests.post(url, data=data)
            cursor.close()
            conn.close()
            log.info('{0} has been recommended, so offline {1}'.format(stay_nid, del_nid))
            return del_nid

        cursor.execute(get_comment_num_sql.format(nid, n))
        rows = cursor.fetchall()
        nid_goal = []
        for r in rows:
            nid_goal.append((r[0], goal_to_del(r[2], r[1])))  # compute a score for each of the two news items
        if len(nid_goal) == 0:  # query failed, just delete the old news item
            return n
        sorted_goal = sorted(nid_goal, key=lambda goal:goal[1])
        del_nid = sorted_goal[0][0]

        data = {}
        data['nid'] = del_nid
        response = requests.post(url, data=data)
        cursor.close()
        conn.close()
        log.info('{0} vs {1},  offline {2}'.format(nid, n, del_nid))
        return del_nid
    except Exception as e:
        log.error(traceback.format_exc())
Example #20
def predict_click(click_info, model_v=None):
    try:
        if not model_v:
            model_v = os.path.split(get_newest_dir(model_base_path))[-1]
        uid = click_info[0]
        nid = click_info[1]
        if isinstance(click_info[2], basestring):
            time_str = click_info[2]
            ctime = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        else:
            ctime = click_info[2]
            time_str = ctime.strftime('%Y-%m-%d %H:%M:%S')
        logger_9990.info("consume click: uid={}, nid={}, time_str={}".format(
            uid, nid, time_str))
        valid_time = ctime + timedelta(days=30)  # validity window is 30 days
        fail_time = valid_time.strftime('%Y-%m-%d %H:%M:%S')
        conn, cursor = get_postgredb_query()
        cursor.execute(nt_sql.format(nid, model_v))  # get the candidate topics for this nid
        rows = cursor.fetchall()
        for r in rows:
            topic_id = r[0]
            probability = r[1]
            conn2, cursor2 = get_postgredb()
            cursor2.execute(ut_sql.format(uid, model_v, topic_id))
            rows2 = cursor2.fetchone()
            if rows2:  # the user already follows this topic_id, so just update the probability
                new_prop = probability + rows2[0]
                logger_9990.info('update: uid={}, topic_id={}'.format(
                    uid, topic_id))
                cursor2.execute(
                    ut_update_sql.format(new_prop, time_str, fail_time, uid,
                                         model_v, topic_id))
            else:
                cursor2.execute(
                    user_topic_insert_sql.format(uid, model_v, topic_id,
                                                 probability, time_str,
                                                 fail_time))
            conn2.commit()
            conn2.close()
        cursor.close()
        conn.close()
    except:
        traceback.print_exc()
Example #21
def get_news_words(nid_list):
    conn, cursor = doc_process.get_postgredb_query()
    nids_str = ','.join([str(i) for i in nid_list])
    cursor.execute(news_word_sql.format(nids_str))
    rows = cursor.fetchall()
    conn.close()
    nid_words_dict = {}
    for r in rows:
        nid = r[0]
        title = r[1]
        paragraphs = r[2]
        txt = ''
        for para in paragraphs:
            if 'txt' in para.keys():
                txt += para['txt']
        total_txt = title + txt.encode('utf-8')
        word_list = doc_process.filter_html_stopwords_pos(total_txt, remove_num=True, remove_single_word=True)
        nid_words_dict[nid] = ' '.join(word_list)
    return nid_words_dict
Example #22
def coll_user_topics(model_v):
    # uid=0 is the old app version, which has no precise uid; all old-app users have id 0
    if TEST_FLAG:
        user_topic_prop_sql = '''select uid, topic_id, probability from user_topics_v2 
                             where model_v = '{}' and uid != 0 and 
                             create_time > now() - interval '10 minute' '''
    else:
        user_topic_prop_sql = '''select uid, topic_id, probability from user_topics_v2 
                             where model_v = '{}' and uid != 0 and 
                             create_time > now() - interval '3 day' '''

    try:
        log_cf.info('    coll_user_topics begin ...')
        conn, cursor = get_postgredb_query()
        cursor.execute(user_topic_prop_sql.format(model_v))
        rows = cursor.fetchall()
        user_ids = []
        topic_ids = []
        props = []
        log_cf.info('    query user topic finished. {} item found.'.format(len(rows)))
        user_topic_prop_dict = {}
        for r in rows:
            user_ids.append(r[0])
            topic_ids.append(r[1])
            props.append(r[2])
            if r[0] not in user_topic_prop_dict:
                user_topic_prop_dict[r[0]] = dict()
            user_topic_prop_dict[r[0]][r[1]] = r[2]

        log_cf.info('    coll_user_topics end')
        del rows
        cursor.close()
        conn.close()
        if TEST_FLAG:
            f = os.path.join(real_dir_path, 'data', 'user_topic.csv')
            df = {'user':user_ids, 'topic':topic_ids, 'prop':props}
            pd.DataFrame(df).to_csv(f, columns=('user', 'topic', 'prop'))
        return user_topic_prop_dict, user_ids, topic_ids, props
    except:
        traceback.print_exc()
        log_cf.exception(traceback.format_exc())
Example #23
def predict_chnl_news(chnl_name, num_limit=None):
    '''
    Predict news for a specific channel.
    :param chnl_name: channel name
    :param num_limit: limit on the number of news items
    :return:
    '''
    logger_chnl.info('begin to predict {}'.format(chnl_name))
    conn, cursor = get_postgredb_query()
    if num_limit:
        chnl_sql = '''select nid from info_news a inner join channellist_v2 cl
                  on a.chid=cl.id where cl.cname='{}' limit {}'''
        logger_chnl.info(cursor.mogrify(chnl_sql.format(chnl_name, num_limit)))
        cursor.execute(chnl_sql.format(chnl_name, num_limit))
    else:
        chnl_sql = '''select nid from info_news a inner join channellist_v2 cl
                  on a.chid=cl.id where cl.cname=\'{}\''''
        logger_chnl.info(cursor.mogrify(chnl_sql.format(chnl_name)))
        cursor.execute(chnl_sql.format(chnl_name))

    rows = cursor.fetchall()
    nids = [r[0] for r in rows]
    l = len(nids)
    logger_chnl.info('len of nids is {}'.format(l))
    # predict in chunks
    if l < 1000:
        kmeans_predict(nids, logger_chnl)
    else:
        n = 0
        while (n + 1000) < l:
            kmeans_predict(nids[n:n + 1000], logger_chnl)
            n += 1000
            logger_chnl.info('{} of {} finished!'.format(n, l))
        kmeans_predict(nids[n:l], logger_chnl)

    logger_chnl.info('predict {} finished!'.format(chnl_name))

    cursor.close()
    conn.close()
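The 1000-at-a-time loop at the end is a generic chunking pattern; a small sketch that would replace the manual index arithmetic (predict stands in for kmeans_predict with its logger argument already bound):

def predict_in_chunks(nids, predict, chunk_size=1000):
    # call predict on successive slices of at most chunk_size ids
    for start in range(0, len(nids), chunk_size):
        predict(nids[start:start + chunk_size])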
Example #24
    def get(self):
        try:
            print '----deal old news and click----'
            from graphlab_lda import topic_model_model
            from redis_process import nid_queue
            from util import doc_process
            conn, cursor = doc_process.get_postgredb_query()
            nid_queue.clear_queue_click()
            nid_queue.clear_queue_lda()  # clear old nids
            s_new = "select nid from newslist_v2 where ctime > now() - interval '10 day' and chid not in (28, 23, 21, 44) and state=0"
            cursor.execute(s_new)
            rows = cursor.fetchall()
            nids = []
            for r in rows:
                nids.append(r[0])
            l = len(nids)

            if len(nids) < 1000:
                topic_model_model.predict_nids(nids)
            else:
                n = 0
                while (n + 1000) < len(nids):
                    topic_model_model.predict_nids(nids[n:n + 1000])
                    n += 1000
                    print('{} of {} finished!'.format(n, l))
                topic_model_model.predict_nids(nids[n:len(nids)])  # the remaining partial chunk

            print '    ----- finish to predict news, begin to predict click-----'

            s_click = "select uid, nid, ctime from newsrecommendclick where (ctime > now() - interval '10 day') and (ctime < now() - interval '1.5 day') "
            cursor.execute(s_click)
            clicks = tuple(cursor.fetchall())
            topic_model_model.predict_clicks(clicks)
            print '----------- finish to predict clicks--------'

            conn.close()
        except:
            traceback.print_exc()
Example #25
def test_special_space():
    from util.doc_process import get_postgredb_query
    sql = "select title, content from newslist_v2 where nid = 13282986"
    conn, cursor = get_postgredb_query()
    cursor.execute(sql)
    rows = cursor.fetchall()
    for row in rows:
        title = row[0]
        content_list = row[1]
        txt = ''
        for content in content_list:
            if 'txt' in content.keys():
                txt += content['txt'] + ' '  #unicode

        soup = BeautifulSoup(txt, 'lxml')
        txt = soup.get_text()
        total_txt = title + ' ' + txt.encode('utf-8')
        print total_txt
        total_txt = ''.join(total_txt.split())

    print total_txt
    total_txt = total_txt.replace('\xe2\x80\x8b', '')
    total_txt = total_txt.replace('\xe2\x80\x8c', '')
    total_txt = total_txt.replace('\xe2\x80\x8d', '')
    from pyltp import Postagger
    poser = Postagger()
    poser.load('/Users/a000/git/ltp_data/pos.model')
    from pyltp import Segmentor
    segmentor = Segmentor()
    segmentor.load('/Users/a000/git/ltp_data/cws.model')
    ws = segmentor.segment(total_txt)
    wspos = poser.postag(ws)
    for k, i in enumerate(wspos):
        print ws[k]
        print i
        if k > 300:
            break
Example #26
def create_subject_class(sub_id):
    time = datetime.datetime.now()
    #class_name = str(time.month) + '.' + str(time.day) + '.' + str(time.hour) + '.' + str(time.minute)
    class_name = str(time.month) + '.' + str(time.day)
    # check whether the class already exists
    #check_class_ex = "select id, name, order from topicclasslist where topic=%s and name=%s"
    check_class_ex = "select id, name, \"order\" from topicclasslist where topic=%s"
    conn, cursor = get_postgredb_query()
    cursor.execute(check_class_ex, (sub_id, ))
    rows = cursor.fetchall()
    conn.close()
    new_order = -1
    for row in rows:
        if row[1] == class_name:
            return row[0]
        new_order = max(new_order, row[2])

    data = {'topic': sub_id, 'name': class_name, 'order': new_order + 1}
    try:
        response = requests.post(topic_class_url, data=data, cookies=cookie)
        return json.loads(response.content)['id']
    except:
        logger_sub.exception(response.content)
        raise
Example #27
def kmeans_predict(nid_list):
    global g_channel_kmeans_model_dict, chname_id_dict
    print "****************************************************" + model_v
    if len(g_channel_kmeans_model_dict) == 0:
        load_newest_models()
    if (len(chname_id_dict)) == 0:
        get_chname_id_dict()
    nid_info = {}
    for nid in nid_list:
        conn, cursor = doc_process.get_postgredb_query()
        cursor.execute(nid_sql, [nid])
        row = cursor.fetchone()
        if not row:
            print 'Error: do not get info of nid: ' + str(nid)
            continue
        title = row[0]
        content_list = row[1]
        chanl_name = row[2]

        if chanl_name not in g_channel_kmeans_model_dict:
            continue

        txt = ''
        for content in content_list:
            if 'txt' in content.keys():
                txt += content['txt']
        total_txt = title + txt.encode('utf-8')
        #word_list = doc_process.filter_html_stopwords_pos(total_txt, remove_num=True, remove_single_word=True)
        total_txt = cut_pos_ltp(total_txt)
        nid_info[nid] = [chanl_name, total_txt]
        cursor.close()
        conn.close()

    ch_pred_dict = {}
    for chname in g_channel_kmeans_model_dict.keys():
        clstid_nid_dict = {}
        print 'predict ---- ' + chname
        nids = []
        doc_list = []
        for nid in nid_info.keys():
            if nid_info[nid][0] == chname:
                nids.append(nid)
                doc_list.append(nid_info[nid][1])

        print 'news num of ' + chname + ' is ' + str(len(nids))
        if len(nids) == 0:
            continue
        logger_update.info('type of doc_list is {}'.format(type(doc_list[0])))
        ws = gl.SArray(doc_list)
        docs = gl.SFrame(data={'X1': ws})
        docs = gl.text_analytics.count_words(docs['X1'])
        docs = gl.SFrame(docs)
        pred = g_channel_kmeans_model_dict[chname].predict(
            docs, output_type='cluster_id')
        print pred
        logger_update.info('result : {}'.format(pred))
        if len(nids) != len(pred):
            print 'len(nids) != len(pred)'
            return
        for i in xrange(0, len(pred)):
            if pred[i] not in clstid_nid_dict.keys():
                clstid_nid_dict[pred[i]] = []
            clstid_nid_dict[pred[i]].append(nids[i])
        ch_pred_dict[chname] = clstid_nid_dict
    #print clstid_nid_dict
    return ch_pred_dict
Example #28
def cal_process(nid_set, log=None, same_t=3, news_interval=3, same_dict = {}):
    log = logger_9965
    log.info('there are {} news to calculate'.format(len(nid_set)))
    ttt1 = datetime.datetime.now()
    try:
        nid_sents_dict, nid_para_links_dict, nid_pname_dict = get_nids_sentences(nid_set)
        kkkk = 0
        for item in nid_sents_dict.items(): # each news item
            # holds subjects; each element contains two lists: key sentences and news ids
            # e.g. [[['abc', 'aaa'], [123, 231]], [['bcd', 'bbb'], [542, 126]] ]
            subject_sentence_nids = []
            kkkk += 1
            n = 0
            nid = item[0]
            log.info('    cal {} sentences...'.format(nid))
            #log.info('--- consume :{}'.format(nid))
            t = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            para_sent_dict = item[1]

            sen_len = 0   # total number of sentences in the article
            for pa in para_sent_dict.items(): # each paragraph
                sen_len += len(pa[1])
            for pa in para_sent_dict.items():
                para_num = pa[0]  # paragraph number
                sents = pa[1]
                conn, cursor = get_postgredb()
                conn_query, cursor_query = get_postgredb_query()
                for s in sents:  # each sentence
                    n += 1
                    #ts1 = datetime.datetime.now()
                    #print '-------1'
                    #print ts1
                    str_no_html, wl = filter_html_stopwords_pos(s, False, True, True, False)
                    #if len(wl) == 0 or len(str_no_html) <= 2: # drop one-character sentences, many of which are special characters
                    #if len(wl) == 0 or len(str_no_html) <= 15: # drop one-character sentences, many of which are special characters
                    #if len(wl) == 10 or len(str_no_html) <= 15: # drop one-character sentences, many of which are special characters
                    if len(wl) <= 10: # drop very short sentences, many of which are special characters
                        continue
                    #ts2 = datetime.datetime.now()
                    #print '-------2'
                    #print ts2
                    h = simhash.simhash(wl)
                    check_exist_sql = "select nid from news_sentence_hash_cache where nid=%s and hash_val=%s" # this news item already contains this sentence, i.e. a duplicate exists
                    cursor_query.execute(check_exist_sql, (nid, h.__str__()))
                    #ts3 = datetime.datetime.now()
                    #print '-------3'
                    #print ts3
                    if len(cursor_query.fetchall()) != 0:
                        #log.info('sentence has existed in this news: {}'.format(str_no_html.encode("utf-8")))
                        continue
                    fir, sec, thi, fou, fir2, sec2, thi2, fou2 = simhash.get_4_segments(h.__long__())
                    if is_sentence_ads(h, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nid_pname_dict[nid]):  # found in the ads DB
                        #  drop the ad sentence
                        #log.info('find ads of {0}  : {1} '.format(nid, str_no_html.encode("utf-8")))
                        continue
                    #ts4 = datetime.datetime.now()
                    #print '-------4'
                    #print ts4
                    cursor_query.execute(query_sen_sql_interval, (str(fir), str(sec), str(thi), str(fou), str(fir2), str(sec2), str(thi2), str(fou2), news_interval))
                    #print cursor.mogrify(query_sen_sql_interval, (str(fir), str(sec), str(thi), str(fou), str(fir2), str(sec2), str(thi2), str(fou2), news_interval))
                    rows = cursor_query.fetchall()  # all potentially identical sentences
                    #print 'len of potential same sentence is {}'.format(len(rows))
                    if len(rows) == 0:  # no similar sentences
                        # store all sentences in the DB
                        cursor.execute(insert_sentence_hash, (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                        #logger_9965.info('len of potential same sentence is 0')
                        continue
                    #else:
                        #logger_9965.info('len of potential same sentence is {}'.format(len(rows)))

                    #ts5 = datetime.datetime.now()
                    #print '-------5'
                    #print ts5
                    same_sentence_sql_para = []
                    nids_for_ads = set()
                    for r in rows:
                        #if len(nids_for_ads) >= 15:
                            #break
                        # distance too large, or it is the same news item
                        if h.hamming_distance_with_val(long(r[1])) > same_t or (nid in same_dict.keys() and r[0] in same_dict[nid]) or nid == r[0]:
                            #logger_9965.info('distance is too big or same news of {} and {}'.format(nid, r[0]))
                            continue
                        cursor_query.execute(same_sql2, (r[0], r[1]))
                        rs = cursor_query.fetchall()
                        for r2 in rs:
                            sen = r2[0].decode('utf-8')
                            sen_without_html = filter_tags(sen)
                            if len(sen) == 1 or len(sen_without_html) > len(str_no_html)*1.5 or len(str_no_html) > len(sen_without_html)*1.5:
                                #logger_9965.info('sentence len mismatch: {} ----{}'.format(str_no_html.encode('utf-8'), sen_without_html))
                                continue
                            wl1 = jieba.cut(str_no_html)
                            set1 = set(wl1)
                            l1 = len(set1)
                            wl2 = jieba.cut(sen_without_html)
                            set2 = set(wl2)
                            set_same = set1 & set2
                            l2 = len(set2)
                            l3 = len(set_same)
                            if l3 < max(l1, l2) * 0.6:  # the overlap ratio must reach 0.6
                                continue
                            nids_for_ads.add(str(r[0]))
                            same_sentence_sql_para.append((nid, r[0], str_no_html, sen, t))
                            #cursor.execute(insert_same_sentence, (nid, r[0], str_no_html, sen, t))
                            #print cursor.mogrify(insert_same_sentence, (nid, r[0], str_no_html, sen_without_html, t))
                    #ts6 = datetime.datetime.now()
                    #print '-------6'
                    #print ts6
                    if len(nids_for_ads) == 0:  # no potentially identical sentences; run ad detection on these sentences first
                        cursor.execute(insert_sentence_hash, (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                        conn.commit()
                        continue

                    is_new_ads = False
                    not_ads_but_ignore = False   # not an ad, but skip the duplicate computation
                    PNAME_T = 3
                    nid_pn = {}
                    pname_set = set()
                    chid_set = set()
                    ctime_list = []
                    #print cursor.mogrify(get_pname, (tuple(nids_for_ads),))
                    cursor_query.execute(get_pname, (tuple(nids_for_ads),))
                    rows2 = cursor_query.fetchall()
                    for rk in rows2:
                        pname_set.add(rk[0])
                        chid_set.add(rk[1])
                        ctime_list.append(rk[2])
                        nid_pn[rk[3]] = rk[0]
                    if len(nids_for_ads) / float(len(pname_set)) > 3: # added 2017.06.13
                        is_new_ads = True
                    if len(nids_for_ads) >= 10:
                        # handle potential same-source ads first
                        if len(pname_set) <= PNAME_T or (len(pname_set) > 5 and len(chid_set) < 4):
                            #if n > sen_len * .2 and n < sen_len * .8:
                            if float(n) < float(sen_len * .2) or float(n) > float(sen_len * .8):
                                min_time = ctime_list[0]
                                max_time = ctime_list[0]
                                for kkk in xrange(1, len(ctime_list)):
                                    if ctime_list[kkk] > max_time:
                                        max_time = ctime_list[kkk]
                                    if ctime_list[kkk] < min_time:
                                        min_time = ctime_list[kkk]
                                if (max_time - min_time).days > 2:  # not hot news within two days
                                    is_new_ads = True
                            '''
                            nid_links = nid_para_links_dict[nid]
                            sum_own_links = 0  # number of paragraphs with links
                            for kk in xrange(para_num, len(nid_links)):
                                if len(nid_links[kk]):
                                    sum_own_links += 1
                            if sum_own_links > (len(nid_links) - para_num) * 0.8: # many links in the later paragraphs, so treat it as an ad
                                is_new_ads = True
                        elif len(pname_set) > 5 and len(chid_set) < 4:   # from multiple sources; check whether they concentrate in a few channels, and if so treat as an ad
                            # need to verify these news items were not all added within 3 days, otherwise it may not be an ad
                            min_time = ctime_list[0]
                            max_time = ctime_list[0]
                            for kkk in xrange(1, len(ctime_list)):
                                if ctime_list[kkk] > max_time:
                                    max_time = ctime_list[kkk]
                                if ctime_list[kkk] < min_time:
                                    min_time = ctime_list[kkk]
                            if (max_time - min_time).days > 2:  # not hot news within three days
                                is_new_ads = True
                             '''
                        else:
                            not_ads_but_ignore = True
                    #ts7 = datetime.datetime.now()
                    #print '-------7'
                    #print ts7
                    nids_str = ','.join(nids_for_ads)
                    if is_new_ads:  # is it a new ad?
                        if len(pname_set) <= PNAME_T:  # few sources
                            pname_str = ','.join(pname_set)
                        else:
                            pname_str = ""
                        cursor.execute(ads_insert, (str_no_html, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nids_str, 0, pname_str))
                        #log.info('find new ads : {0}'.format(str_no_html.encode("utf-8")))
                    else:
                        #if len(same_sentence_sql_para) < 5:  # too many identical sentences detected but not an ad; likely a false positive, skip
                        if not_ads_but_ignore:  # too many identical sentences, treated as a false positive; add to the ads DB with state=1 (not a real ad) to cut computation next time
                            cursor.execute(ads_insert, (str_no_html, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nids_str, 1, "" ))
                        else:
                            cursor.executemany(insert_same_sentence, same_sentence_sql_para)  # valid duplicate sentences
                            #log.info('get same sentence map :{}'.format(str_no_html.encode('utf-8')))
                            # multiple viewpoints: 1. sentence length > 30; 2. different sources; 3. skip the first/last sentences
                            if len(str_no_html) > 15 and n > 2 and (n < sen_len-2):
                            #if len(str_no_html) > 15:
                                sub_nids_set = set()
                                for same in same_sentence_sql_para:
                                    nn = same[1]  #nid
                                    if nid_pname_dict[nid] != nid_pn[nn]:
                                        ctime_sql = "select nid, ctime from info_news where nid = %s or nid=%s"
                                        cursor_query.execute(ctime_sql, (same[0], same[1]))
                                        ctimes = cursor_query.fetchall()
                                        ctime_dict = {}
                                        for ct in ctimes:
                                            ctime_dict[str(ct[0])] = ct[1]
                                        cursor.execute(multo_vp_insert_sql, (str(same[0]), same[2], str(same[1]), same[3], t, ctime_dict[str(same[0])], ctime_dict[str(same[1])]))
                                        log.info('      get multi viewpoint :{}'.format(str_no_html.encode('utf-8')))
                                        sub_nids_set.add(same[0])
                                        sub_nids_set.add(same[1])
                                        subject_queue.product_simhash2((same[0], same[1]))
                                #log.info("num of mvp is {}".format(sub_nids_set))
                                if len(sub_nids_set) >= 2:  ## enqueue subject news
                                    log.info('      generate subject for {}'.format(sub_nids_set))
                                    #for i in sub_nids_set:
                                    #    subject_nids.add(i)
                                    key_sents = [str_no_html.encode('utf-8'), ]
                                    sub_nids = []
                                    for i in sub_nids_set:
                                        sub_nids.append(i)
                                    subject_sentence_nids.append([key_sents, sub_nids])
                                    #subject_queue.product_subject(tuple(nid_set))

                    # store all sentences in the DB
                    cursor.execute(insert_sentence_hash, (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                conn.commit()
                cursor.close()
                conn.close()
                cursor_query.close()
                conn_query.close()
            if len(subject_sentence_nids) > 0 and len(subject_sentence_nids) < 3:
                #log.info("before merge : {}".format(subject_sentence_nids))
                subs = merge_subs(subject_sentence_nids)
                #log.info("after merge : {}".format(subs))
                for sub in subs:
                    subject_queue.product_subject(sub)
                #log.info('generate subject for {} ------ {}'.format(nid, subject_nids))
                #subject_queue.product_subject(tuple(subject_nids))

        ttt2 = datetime.datetime.now()
        log.info('it takes {}'.format((ttt2-ttt1).total_seconds()))
        del nid_sents_dict
        del nid_para_links_dict
    except:
        log.exception(traceback.format_exc())
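The word-overlap test buried in cal_process (two sentences count as "the same" when their shared cut words cover at least 60% of the larger word set) can be isolated into a small helper; a sketch assuming jieba is available:

import jieba

def looks_like_same_sentence(s1, s2, ratio=0.6):
    # tokenize both sentences and compare the shared-word proportion
    set1 = set(jieba.cut(s1))
    set2 = set(jieba.cut(s2))
    shared = set1 & set2
    return len(shared) >= max(len(set1), len(set2)) * ratio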