Example #1
    def create(self):
        #logger_9987.info('TopicModel::create begin ...')
        docs_sframe = gl.SFrame.read_csv(self.data_path, header=True)
        docs = gl.text_analytics.count_words(docs_sframe['doc'])
        docs = gl.text_analytics.trim_rare_words(docs,
                                                 threshold=30,
                                                 delimiters=None)
        self.model = gl.topic_model.create(docs,
                                           num_iterations=1000,
                                           num_burnin=100,
                                           num_topics=5000)

        sf = self.model.get_topics(num_words=20, output_type='topic_words')
        conn, cursor = get_postgredb()
        for i in xrange(0, len(sf)):
            try:
                keys_words_jsonb = json.dumps(sf[i]['words'])
                cursor.execute(
                    save_model_sql,
                    [self.version, str(i), keys_words_jsonb])
                conn.commit()
            except Exception:
                conn.rollback()  # reset the aborted transaction so later inserts can still run
                print 'save model to db error'
        conn.close()
        del docs_sframe
        del docs
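
Note: the loop above writes one row per topic (model version, topic id, JSON-encoded top words). Below is a minimal read-back sketch; the table and column names (topic_model, model_v, topic_id, key_words) are assumptions, since save_model_sql is defined elsewhere and not shown in this snippet.

import json

def load_topic_words(cursor, version):
    # Illustrative only: table and column names are assumed, not taken from save_model_sql.
    cursor.execute(
        "select topic_id, key_words from topic_model where model_v = %s",
        (version, ))
    topic_words = {}
    for topic_id, key_words in cursor.fetchall():
        # psycopg2 usually decodes a jsonb column to a Python list already;
        # fall back to json.loads for a plain text column.
        words = key_words if isinstance(key_words, list) else json.loads(key_words)
        topic_words[int(topic_id)] = words
    return topic_words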
Example #2
def generate_subject2(sub_nids):
    prefix = 'http://fez.deeporiginalx.com:9001'
    create_url = prefix + '/topics'
    cookie = {'Authorization': 'f76f3276c1ac832b935163c451f62a2abf5b253c'}
    # use the title of one of the news items as the subject name
    sql = "select title from newslist_v2 where nid=%s"
    conn, cursor = get_postgredb()
    cursor.execute(sql, (sub_nids[0], ))
    row = cursor.fetchone()
    sub_name = row[0]
    conn.close()
    data = {'name': sub_name}
    response = requests.post(create_url, data=data, cookies=cookie)
    print response.content
    content = json.loads(response.content)
    id = content['id']

    topic_class_url = prefix + '/topic_classes'
    data = {'topic': id, 'name': 'random'}
    response = requests.post(topic_class_url, data=data, cookies=cookie)
    class_id = json.loads(response.content)['id']

    add_nid_url = prefix + '/topic_news'
    for nid in sub_nids:
        data = {'topic_id': id, 'news_id': nid, 'topic_class_id': class_id}
        requests.post(add_nid_url, data=data, cookies=cookie)
Example #3
def detect_multivp_and_ads(nid_list):
    conn, cursor = get_postgredb()
    cursor.execute(sql, (tuple(nid_list), ))
    rows = cursor.fetchall()
    all_nids = set()
    # look up the pname (source name) for these nids
    for r in rows:
        all_nids.add(r[0])
        all_nids.add(r[1])
    nids = list(all_nids)
    cursor.execute(ch_sql, (tuple(nids), ))
    rows2 = cursor.fetchall()
    nid_pname_dict = {}
    for r in rows2:
        nid_pname_dict[r[0]] = r[1]

    for r in rows:
        nid1 = r[0]
        pname1 = nid_pname_dict[nid1]
        nid2 = r[1]
        pname2 = nid_pname_dict[nid2]
        if pname1 == pname2:  # same source: may be an advertisement
            pass
        else:  # otherwise, check further whether this is a multi-viewpoint pair
            pass
Example #4
def save_subject_sentences(sub_id, sents):
    # record the subject's key sentences
    conn, cursor = get_postgredb()
    sub_sents_sql = "insert into topic_sentences (topic_id, sentences) values (%s, %s)"
    cursor.execute(sub_sents_sql, (sub_id, json.dumps(sents)))
    conn.commit()
    conn.close()
Example #5
def update_sub(old_sub_id, sub):
    # first get the class id of old_sub_id
    logger_sub.info('    update_sub {} to: {}'.format(old_sub_id, sub))
    conn, cursor = get_postgredb()
    # create a new class_id
    class_id = create_subject_class(old_sub_id)
    # add the news items
    add_news_to_subject(old_sub_id, class_id, sub[1])
    # update the subject name
    update_sub_name_on_nids(old_sub_id, sub[1])

    # add the key sentences to the topic
    sent_sql = "select sentences from topic_sentences where topic_id=%s"
    cursor.execute(sent_sql, (old_sub_id, ))
    row = cursor.fetchone()
    if row:
        old_sents = row[0]
    else:
        old_sents = []
    added_sen = set(sub[0]) - set(old_sents)
    if len(added_sen) > 0:
        old_sents.extend(added_sen)
    update_sql = "update topic_sentences set sentences=%s where topic_id=%s"
    cursor.execute(update_sql, (json.dumps(old_sents), old_sub_id))
    conn.commit()
    conn.close()
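
Only sentences not already stored get appended: the new key sentences are diffed against the stored JSONB list before the update. A tiny sketch of that merge, with made-up values:

import json

old_sents = ['a', 'b']                 # as decoded from the jsonb column
new_sents = ['b', 'c']                 # sub[0] in the snippet above
added = set(new_sents) - set(old_sents)
if added:
    old_sents.extend(added)
print(json.dumps(old_sents))           # ["a", "b", "c"]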
Example #6
def check_same_news(nid1, nid2):
    conn, cursor = doc_process.get_postgredb()
    check_state = "select state from newslist_v2 where nid in ({}, {}) and state=0"
    cursor.execute(check_state.format(nid1, nid2))
    rs = cursor.fetchall()
    if len(list(rs)) < 2:
        return
    words_list1 = doc_process.get_words_on_nid(nid1)  # segmented words of news 1
    words_list2 = doc_process.get_words_on_nid(nid2)  # segmented words of news 2
    h1 = simhash(words_list1)  # simhash of news 1
    h2 = simhash(words_list2)  # simhash of news 2
    diff_bit = h1.hamming_distance(h2)
    if diff_bit > 12:  # more than 12 differing bits: cannot be the same article
        return
    title_sql = "select title from newslist_v2 where nid in ({}, {})"
    cursor.execute(title_sql.format(nid1, nid2))
    rows = cursor.fetchall()
    titles = [r[0] for r in rows]
    if doc_process.get_sentence_similarity(titles[0], titles[1]) > 0.3:  # title similarity above 0.3
        off_nid = del_nid_of_fewer_comment(nid1, nid2, log=logger_sen)
        t0 = datetime.datetime.now()
        cursor.execute(insert_same_sql.format(nid1, nid2, diff_bit, t0.strftime('%Y-%m-%d %H:%M:%S'), off_nid))  # record the dedup operation
    conn.commit()
    cursor.close()
    conn.close()
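
check_same_news treats two articles as possible duplicates only when their simhash values differ in at most 12 bits. The simhash class here is project code; a minimal sketch of the underlying Hamming-distance test on plain 64-bit integers:

def hamming_distance(h1, h2):
    # Number of differing bit positions between two 64-bit hash values.
    x = (h1 ^ h2) & ((1 << 64) - 1)
    bits = 0
    while x:
        x &= x - 1   # clear the lowest set bit
        bits += 1
    return bits

# Two hashes differing in 3 bit positions pass the `diff_bit > 12` filter above.
assert hamming_distance(0b1011010, 0b0011001) == 3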
Example #7
def cal_save_simhash_proc(nids, t):
    conn, cursor = doc_process.get_postgredb()
    for nid in nids:
        words_list = doc_process.get_words_on_nid(nid)  # segmented words of the article
        h = simhash(words_list)  # simhash of the article
        fir, sec, thi, fou, fir2, sec2, thi2, fou2 = get_4_segments(h.__long__())  # split the hash value into segments
        cursor.execute(insert_news_simhash_sql.format(nid, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2))  # store the article's simhash
        conn.commit()
    cursor.close()
    conn.close()
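
Storing fixed-width segments of the hash next to the full value (the fir ... fou2 columns) lets later queries fetch candidate duplicates by exact segment equality instead of scanning every row: if two 64-bit hashes differ in at most k bits, they must agree on at least one of k+1 disjoint segments (pigeonhole). get_4_segments is project code and its exact layout is not shown; the sketch below only illustrates the banding idea with eight 8-bit bands.

def split_into_bands(h, band_bits=8, bands=8):
    # Split a 64-bit hash into `bands` disjoint bands of `band_bits` bits each.
    # Hashes within Hamming distance (bands - 1) share at least one band exactly,
    # so candidates can be pre-filtered with equality predicates on band columns.
    mask = (1 << band_bits) - 1
    return [(h >> (i * band_bits)) & mask for i in range(bands)]

print(split_into_bands(0x0123456789ABCDEF))
# [239, 205, 171, 137, 103, 69, 35, 1]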
Example #8
def clear_data():
    try:
        log_cf_clear_data.info('begin clear data...')
        conn, cursor = get_postgredb()
        cursor.execute(clear_sql)
        cursor.execute(clear_sql2)
        conn.commit()
        cursor.close()
        conn.close()
        log_cf_clear_data.info('finish clearing data...')
    except Exception:
        pass  # clearing is best-effort; errors are deliberately ignored
Example #9
def test_jsonb():
    conn, cursor = get_postgredb()
    id = 0
    sentence = ['测试句子1', '测试句子2']
    insert_sql = "insert into topic_sentences (topic_id, sentences) values (%s, %s)"
    print json.dumps(sentence)
    #cursor.execute(insert_sql, (id, json.dumps(sentence)))
    #conn.commit()
    query_sql = "select sentences from topic_sentences where topic_id=%s"
    cursor.execute(query_sql, (id, ))
    row = cursor.fetchone()
    print row[0]
    print type(row[0])
    conn.close()
Example #10
def move_sentence_data():
    try:
        nt = datetime.datetime.now()
        t = nt.strftime('%Y-%m-%d %H:%M:%S')
        logger_9963.info('move_sentence_data--- {}'.format(t))
        conn, cursor = get_postgredb()
        # cursor.execute(move_sentenct_sql, (t, ))
        # logger_9963.info('move finished')
        cursor.execute(del_sentenct_sql, (t, ))
        conn.commit()
        conn.close()
        nt2 = datetime.datetime.now()
        logger_9963.info('finished to move_sentence_data. it takes {} s'.format((nt2 - nt).total_seconds()))
    except:
        logger_9963.info(traceback.format_exc())
Example #11
def predict_click(click_info, model_v=None):
    try:
        if not model_v:
            model_v = os.path.split(get_newest_dir(model_base_path))[-1]
        uid = click_info[0]
        nid = click_info[1]
        if isinstance(click_info[2], basestring):
            time_str = click_info[2]
            ctime = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        else:
            ctime = click_info[2]
            time_str = ctime.strftime('%Y-%m-%d %H:%M:%S')
        logger_9990.info("consume click: uid={}, nid={}, time_str={}".format(
            uid, nid, time_str))
        valid_time = ctime + timedelta(days=30)  # the click stays valid for 30 days
        fail_time = valid_time.strftime('%Y-%m-%d %H:%M:%S')
        conn, cursor = get_postgredb_query()
        cursor.execute(nt_sql.format(nid, model_v))  # fetch the candidate topics of this nid
        rows = cursor.fetchall()
        for r in rows:
            topic_id = r[0]
            probability = r[1]
            conn2, cursor2 = get_postgredb()
            cursor2.execute(ut_sql.format(uid, model_v, topic_id))
            rows2 = cursor2.fetchone()
            if rows2:  # the user already has this topic_id; just update the probability
                new_prop = probability + rows2[0]
                logger_9990.info('update: uid={}, topic_id={}'.format(
                    uid, topic_id))
                cursor2.execute(
                    ut_update_sql.format(new_prop, time_str, fail_time, uid,
                                         model_v, topic_id))
            else:
                cursor2.execute(
                    user_topic_insert_sql.format(uid, model_v, topic_id,
                                                 probability, time_str,
                                                 fail_time))
            conn2.commit()
            conn2.close()
        cursor.close()
        conn.close()
    except:
        traceback.print_exc()
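
Each consumed click either adds the topic probability onto an existing (uid, model_v, topic_id) row or inserts a new row, always refreshing the 30-day expiry. A tiny in-memory sketch of the same accumulate-or-insert rule, handy for testing the update logic without a database (the function name is illustrative):

def apply_click(user_topics, topic_id, probability):
    # user_topics maps topic_id -> accumulated probability for one user.
    user_topics[topic_id] = user_topics.get(topic_id, 0.0) + probability
    return user_topics

state = {}
apply_click(state, 42, 0.3)
apply_click(state, 42, 0.2)
print(state)   # {42: 0.5}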
Example #12
def get_clicks_5m():
    logger_9989.info('news epoch...')
    global last_time
    now = datetime.datetime.now()
    if last_time > now:
        logger_9989.info('    **** time error! {}'.format(last_time))
        last_time = now - timedelta(seconds=3)

    conn, cursor = get_postgredb()
    #cursor.execute(click_sql.format(channels, period))
    cursor.execute(click_sql.format(channels, last_time.strftime('%Y-%m-%d %H:%M:%S.%f')))
    rows = cursor.fetchall()
    for r in rows:
        if r[3] > now:
            continue
        last_time = r[3]
        ctime_str = r[2].strftime('%Y-%m-%d %H:%M:%S')
        logger_9989.info('    produce {}--{}--{}'.format(r[0], r[1], ctime_str))
        nid_queue.produce_user_click_lda(r[0], r[1], ctime_str)
    cursor.close()
    conn.close()
Example #13
def get_potential_topic(user_topic_prop_dict, user_neighbours, model_v, time):
    log_cf.info('    begin to get_potential_topic...')
    potential_utp_dict = dict()  # per user: topics recommended by neighbours and their accumulated scores
    for it in user_neighbours.items():
        u = it[0]
        potential_utp_dict[u] = dict()
        for nei_sim in it[1]:  # each neighbour
            nei = nei_sim[0]
            sim = nei_sim[1]
            if sim == 1.0:  # an identical user needs no further comparison
                continue
            nei_topics_prop = user_topic_prop_dict[nei]  # all topics of the neighbour
            for tp in nei_topics_prop.items():
                if tp[0] not in user_topic_prop_dict[u]:  # topics the original user has not interacted with
                    potential_utp_dict[u][tp[0]] = potential_utp_dict[u].setdefault(tp[0], 0) + sim * tp[1]

    user_potential_topic_sql = "insert into user_topic_cf (uid, model_v, topic_id, property, ctime) VALUES ({}, '{}', {}, {}, '{}')"
    if TEST_FLAG:
        us, ts, ps = [], [], []
        for item in potential_utp_dict.items():
            for i in item[1].items():
                us.append(item[0])
                ts.append(i[0])
                ps.append(i[1])
        f = os.path.join(real_dir_path, 'data', 'final_recommend.csv')
        pd.DataFrame({'user':us, 'topic':ts, 'prop':ps}).to_csv(f, columns=('user', 'topic', 'prop'))
        print '    finished get_potential_topic...'
    else:
        conn, cursor = get_postgredb()
        for item in potential_utp_dict.items():
            u = item[0]
            topic_score = item[1]
            #sorted_topic_score = sorted(topic_score.items(), key=lambda d: d[1], reverse=True)[:30]
            sorted_topic_score = nlargest(30, topic_score.items(), key=itemgetter(1))
            for it in sorted_topic_score:
                if it[1] > 0.1:
                    cursor.execute(user_potential_topic_sql.format(u, model_v, it[0], it[1], time))
        conn.commit()
        conn.close()
    log_cf.info('    finished get_potential_topic...')
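
The score of a candidate topic for user u is the sum over u's neighbours of (similarity x the neighbour's probability for that topic), restricted to topics u has no behaviour on; only the 30 largest scores above 0.1 are persisted. A worked mini example of the accumulation, with made-up numbers:

user_topic_prop_dict = {
    'u1': {'t1': 0.9},                 # u1 already follows t1
    'u2': {'t1': 0.4, 't2': 0.6},
    'u3': {'t2': 0.5, 't3': 0.7},
}
user_neighbours = {'u1': [('u2', 0.8), ('u3', 0.5)]}

scores = {}
for nei, sim in user_neighbours['u1']:
    for topic, prop in user_topic_prop_dict[nei].items():
        if topic not in user_topic_prop_dict['u1']:
            scores[topic] = scores.get(topic, 0) + sim * prop

print(scores)   # {'t2': 0.8*0.6 + 0.5*0.5 = 0.73, 't3': 0.5*0.7 = 0.35}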
Example #14
def create_subject(nids):
    try:
        logger_sub.info('create subject for {}'.format(nids))
        conn, cursor = get_postgredb()
        sql = "select title from newslist_v2 where nid in ({})"
        nid_str = ', '.join(str(i) for i in nids)
        cursor.execute(sql.format(nid_str))
        rows = cursor.fetchall()
        conn.close()
        sub_name = choose_subject_name([r[0] for r in rows])
        if not sub_name:
            return

        data = {'name': sub_name, 'type': 1, 'cover': subject_cover}
        logger_sub.info('create subject "{}"'.format(sub_name))
        response = requests.post(create_url, data=data, cookies=cookie)
        content = json.loads(response.content)
        if 'id' not in content:
            logger_sub.info('error to create subject : {}'.format(content))
            return
        return content['id']
    except:
        logger_sub.exception(traceback.format_exc())
        return
Example #15
def del_same_old_news(nid, nid_hash_dict):
    '''
        Compare the nid's hash directly against every cached hash.
    '''
    nid = int(nid)
    if nid not in nid_hash_dict:
        return
    conn, cursor = doc_process.get_postgredb()
    hash_val = nid_hash_dict[nid]
    for n, hv in nid_hash_dict.items():
        if n == nid:
            continue

        diff_bit = dif_bit(hash_val, hv)
        if diff_bit <= 6:
            offnid = del_nid_of_fewer_comment(nid, n)
            t0 = datetime.datetime.now()
            cursor.execute(insert_same_sql.format(nid, n, diff_bit, t0.strftime('%Y-%m-%d %H:%M:%S'), offnid))  # record the dedup operation
            nid_hash_dict.pop(offnid)
            break  # assume at most one duplicate exists; the data supports handling it this way

    conn.commit()
    cursor.close()
    conn.close()
Example #16
def add_news_to_subject(sub_id, class_id, nids):
    conn, cursor = get_postgredb()
    sub_nids_sql = "select news from topicnews where topic=%s"
    cursor.execute(sub_nids_sql, (sub_id, ))
    rows = cursor.fetchall()
    old_sub_nids_set = set()
    for r in rows:
        old_sub_nids_set.add(r[0])
    sub_nids_set = set(nids)

    # insert the news into the subject
    added_nids = sub_nids_set - old_sub_nids_set
    for nid in added_nids:
        data = {'topic_id':sub_id, 'news_id':nid, 'topic_class_id':class_id}
        requests.post(add_nid_url, data=data, cookies=cookie)

    # 2017-05-22: check whether the new subject's news overlaps an existing subject; if so, delete the smaller one
    potential_same_sub_sql = "select topic from topicnews tn inner join topiclist tl on tn.topic=tl.id where news in ({}) and topic != {} group by topic"
    nid_str = ', '.join(str(i) for i in (sub_nids_set | old_sub_nids_set))
    cursor.execute(potential_same_sub_sql.format(nid_str, sub_id))
    rows = cursor.fetchall()
    for r in rows:
        del_id = check_del_sub_subject(sub_id, r[0])
        if del_id and del_id == sub_id:  # an existing subject covers sub_id; sub_id has been deleted, so return
            return


    # query the subject's topic distribution
    sub_topic_sql = "select model_v, topic_id, probability from subject_topic where subject_id=%s"
    cursor.execute(sub_topic_sql, (sub_id, ))
    sub_topic_dict = dict()
    topic_model_v = ''
    rows = cursor.fetchall()
    for r in rows:
        topic_model_v = r[0]
        sub_topic_dict[r[1]] = r[2]
    old_topics = sub_topic_dict.keys()
    # compute the topics of the added news
    news_topic_sql = "select topic_id, probability, model_v from news_topic_v2 where nid=%s"
    topic_model_set = set()
    news_topics_dict = dict()
    for nid in added_nids:
        cursor.execute(news_topic_sql, (nid, ))
        rows2 = cursor.fetchall()
        for r in rows2:
            topic_model_set.add(r[2])
            if r[0] in news_topics_dict:
                news_topics_dict[r[0]] += r[1]
            else:
                news_topics_dict[r[0]] = r[1]
    if len(topic_model_set) != 1 or \
       (topic_model_v != '' and topic_model_v != list(topic_model_set)[0]):  # topic info is missing or spans multiple model versions
        conn.close()
        return
    if topic_model_v == '':
        topic_model_v = list(topic_model_set)[0]
    # update the subject's topic distribution
    for item in news_topics_dict.items():
        if item[0] in sub_topic_dict:
            sub_topic_dict[item[0]] += item[1]/len(added_nids)
        else:
            sub_topic_dict[item[0]] = item[1]/len(added_nids)
    sub_topic_sort = sorted(sub_topic_dict.items(), key=lambda d:d[1], reverse=True)
    time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    update_sub_topic = "update subject_topic set probability=%s, ctime=%s where subject_id=%s and model_v=%s and topic_id=%s"
    insert_sub_topic = "insert into subject_topic (subject_id, model_v, topic_id, probability, ctime) values (%s, %s, %s, %s, %s)"
    for i in range(0, min(len(sub_topic_sort), 10)):
        tid = sub_topic_sort[i][0]
        tp = sub_topic_sort[i][1]
        if tid in old_topics:
            cursor.execute(update_sub_topic, (tp, time, sub_id, topic_model_v, tid))
        else:
            cursor.execute(insert_sub_topic, (sub_id, topic_model_v, tid, tp, time))

    # auto-publish the subject once it holds at least 5 news items
    all_nids = old_sub_nids_set | sub_nids_set
    logger_sub.info('    sub {} :{} -change to - {}'.format(sub_id, old_sub_nids_set, all_nids))
    if len(all_nids) >= 5:
        data = {'zt_id': sub_id, 'online': 0}
        requests.get(online_url, params=data, cookies=cookie)

    conn.commit()
    cursor.close()
    conn.close()
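
The subject's topic distribution is refreshed by adding, for every topic, the probability summed over the newly added news divided by the number of added news; only the ten largest entries are then written back. A small sketch of that averaging step, with illustrative numbers:

sub_topic_dict = {'t1': 0.50}                  # existing subject-topic probabilities
news_topics_dict = {'t1': 0.30, 't2': 0.80}    # summed over the added news
added_count = 2                                # len(added_nids)

for topic, summed in news_topics_dict.items():
    sub_topic_dict[topic] = sub_topic_dict.get(topic, 0.0) + summed / float(added_count)

print(sorted(sub_topic_dict.items(), key=lambda d: d[1], reverse=True))
# [('t1', 0.65), ('t2', 0.4)]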
Example #17
def generate_subject(sub):
    try:
        sub_sents = sub[0]
        sub_nids = sub[1]
        conn, cursor = get_postgredb()
        state_sql = 'select nid from newslist_v2 where nid in ({}) and state=0'
        cursor.execute(state_sql.format(','.join(str(n) for n in sub_nids)))
        rs = cursor.fetchall()
        sub_nids = list(r[0] for r in rs)
        if len(sub_nids) <= 1:
            conn.close()
            return
        logger_sub.info('******prepare to create subject for {}'.format(sub_nids))
        ############## decide whether to create a new subject or merge into an old one ###
        if len(sub_nids) > 4:  # only subs with more than 4 news items may be merged into another subject
            oldsub_nid_dict = dict()  # old subject -> nids it shares with this sub
            nid_old_sub_sql = "select tn.topic, tn.news from topicnews tn " \
                              "inner join topiclist tl on tn.topic=tl.id " \
                              "where news in %s and tl.type=1"
            cursor.execute(nid_old_sub_sql, (tuple(sub_nids), ))
            rows = cursor.fetchall()
            for r in rows:
                if r[0] in oldsub_nid_dict:
                    oldsub_nid_dict[r[0]].append(r[1])
                else:
                    oldsub_nid_dict[r[0]] = [r[1], ]
            update = False
            for item in oldsub_nid_dict.items():
                if float(len(item[1])) >= 0.5 * len(sub_nids):  # at least half of the sub's nids are already in this old subject, so merge the sub into it
                    update_sub(item[0], sub)
                    update = True
            if update:
                conn.close()
                return


        ############## a new subject is needed #######################
        '''
        create_url = prefix + '/topics'
        #set subject name as one title of one piece of news
        sql = "select title from newslist_v2 where nid=%s"
        cursor.execute(sql, (sub_nids[0],))
        rows = cursor.fetchall()
        sub_name = choose_subject_name([r[0] for r in rows])

        data = {'name': sub_name, 'type': 1}
        logger_sub.info('create subject {}'.format(sub_name))
        response = requests.post(create_url, data=data, cookies=cookie)
        content = json.loads(response.content)
        if 'id' not in content:
            logger_sub.info('error to create subject : {}'.format(content))
            return
        sub_id = content['id']
        '''
        sub_id = create_subject(sub_nids)
        if not sub_id:
            return

        '''
        topic_class_url = prefix + '/topic_classes'
        time = datetime.datetime.now()
        class_name = str(time.month) + '.' + str(time.day)
        data = {'topic': sub_id, 'name': class_name}
        response = requests.post(topic_class_url, data=data, cookies=cookie)
        class_id = json.loads(response.content)['id']
        '''
        class_id = create_subject_class(sub_id)

        '''
        add_nid_url = prefix + '/topic_news'
        for nid in sub_nids:
            data = {'topic_id':sub_id, 'news_id':nid, 'topic_class_id':class_id}
            requests.post(add_nid_url, data=data, cookies=cookie)
        '''
        add_news_to_subject(sub_id, class_id, sub_nids)

        '''
        # record the subject's key sentences
        sub_sents_sql = "insert into topic_sentences (topic_id, sentences) values (%s, %s)"
        cursor.execute(sub_sents_sql, (sub_id, json.dumps(sub_sents)))
        '''
        save_subject_sentences(sub_id, sub_sents)
        conn.commit()
        conn.close()
    except:
        logger_sub.exception(traceback.format_exc())
Example #18
def predict(model, nid_list):
    logger_9988.info('predict {}'.format(nid_list))
    t0 = datetime.datetime.now()
    nid_words_dict = get_news_words(nid_list)
    nids = []
    doc_list = []
    for item in nid_words_dict.items():
        nids.append(item[0])
        doc_list.append(item[1])
    ws = gl.SArray(doc_list)
    docs = gl.SFrame(data={'X1': ws})
    docs = gl.text_analytics.count_words(docs['X1'])
    pred = model.model.predict(docs, output_type='probability', num_burnin=50)
    # pred holds, for each doc, the probability over every topic
    props_list = []  # topic-probability pairs of all documents
    for doc_index in xrange(len(pred)):  # take each doc's distribution
        doc_props = pred[doc_index]
        index_val_dict = {}
        for k in xrange(len(doc_props)):
            if doc_props[k] > 0.1:
                index_val_dict[k] = doc_props[k]  # { topic1: 0.3, topic2: 0.2, ... }
        sort_prop = sorted(index_val_dict.items(),
                           key=lambda d: d[1],
                           reverse=True)
        props = []  # this doc's topic-probability pairs, e.g. [(5, 0.3), (3, 0.2), ...]
        for i in xrange(min(3, len(sort_prop))):
            if i == 0:
                props.append(sort_prop[i])
            else:
                if sort_prop[i][1] > 0.5 * sort_prop[i - 1][1]:  # above 0.1 and at least half of the previous probability
                    props.append(sort_prop[i])
                else:
                    break

        props_list.append(props)  # [ [(5, 0.3), (3, 0.2)..], ....  ]
    # write the results to the database
    insert_list = []
    str_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    res_dict_list = []
    for n in xrange(len(nids)):
        for m in xrange(len(props_list[n])):
            topic_id = props_list[n][m][0]
            prop = props_list[n][m][1]
            insert_list.append(
                (nids[n], model.version, topic_id, prop, str_time))
            '''
            sf = model.model.get_topics(num_words=20,
                                       output_type='topic_words')
            info_dict = {}
            info_dict['nid'] = nids[n]
            info_dict['model_v'] = model_version
            info_dict['topic_id'] = topic_id
            info_dict['probability'] = prop
            info_dict['topic_words'] = sf[topic_id]['words']
            res_dict_list.append(info_dict)
            '''
    conn, cursor = get_postgredb()
    cursor.executemany(insert_sql, insert_list)
    conn.commit()
    conn.close()
    t1 = datetime.datetime.now()
    logger_9988.info('prediction takes {}s'.format((t1 - t0).total_seconds()))
    return res_dict_list
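
Per document, the code keeps at most three topics: every candidate must exceed 0.1, and after the first one a topic survives only while it stays above half of the previously kept probability. A standalone sketch of that selection rule (the function name is illustrative):

def select_topics(topic_probs, min_prob=0.1, max_topics=3):
    # topic_probs: list of probabilities indexed by topic id.
    candidates = [(i, p) for i, p in enumerate(topic_probs) if p > min_prob]
    candidates.sort(key=lambda d: d[1], reverse=True)
    kept = []
    for i in range(min(max_topics, len(candidates))):
        if i == 0 or candidates[i][1] > 0.5 * candidates[i - 1][1]:
            kept.append(candidates[i])
        else:
            break
    return kept

print(select_topics([0.05, 0.40, 0.30, 0.12]))
# [(1, 0.4), (2, 0.3)] -- 0.12 is dropped because it is below half of 0.3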
Example #19
def cal_process(nid_set, log=None, same_t=3, news_interval=3, same_dict = {}):
    log = logger_9965
    log.info('there are {} news to calculate'.format(len(nid_set)))
    ttt1 = datetime.datetime.now()
    try:
        nid_sents_dict, nid_para_links_dict, nid_pname_dict = get_nids_sentences(nid_set)
        kkkk = 0
        for item in nid_sents_dict.items():  # each news article
            # holds subjects; each element is two lists: key sentences and news ids,
            # e.g. [[['abc', 'aaa'], [123, 231]], [['bcd', 'bbb'], [542, 126]] ]
            subject_sentence_nids = []
            kkkk += 1
            n = 0
            nid = item[0]
            log.info('    cal {} sentences...'.format(nid))
            #log.info('--- consume :{}'.format(nid))
            t = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            para_sent_dict = item[1]

            sen_len = 0   # total number of sentences in the article
            for pa in para_sent_dict.items():  # each paragraph
                sen_len += len(pa[1])
            for pa in para_sent_dict.items():
                para_num = pa[0]  # paragraph number
                sents = pa[1]
                conn, cursor = get_postgredb()
                conn_query, cursor_query = get_postgredb_query()
                for s in sents:  # each sentence
                    n += 1
                    #ts1 = datetime.datetime.now()
                    #print '-------1'
                    #print ts1
                    str_no_html, wl = filter_html_stopwords_pos(s, False, True, True, False)
                    #if len(wl) == 0 or len(str_no_html) <= 2: # drop one-character sentences, many are just special characters
                    #if len(wl) == 0 or len(str_no_html) <= 15: # drop one-character sentences, many are just special characters
                    #if len(wl) == 10 or len(str_no_html) <= 15: # drop one-character sentences, many are just special characters
                    if len(wl) <= 10:  # drop very short sentences, many are just special characters
                        continue
                    #ts2 = datetime.datetime.now()
                    #print '-------2'
                    #print ts2
                    h = simhash.simhash(wl)
                    check_exist_sql = "select nid from news_sentence_hash_cache where nid=%s and hash_val=%s"  # the sentence already appears in this news article, i.e. a duplicate within the article
                    cursor_query.execute(check_exist_sql, (nid, h.__str__()))
                    #ts3 = datetime.datetime.now()
                    #print '-------3'
                    #print ts3
                    if len(cursor_query.fetchall()) != 0:
                        #log.info('sentence has existed in this news: {}'.format(str_no_html.encode("utf-8")))
                        continue
                    fir, sec, thi, fou, fir2, sec2, thi2, fou2 = simhash.get_4_segments(h.__long__())
                    if is_sentence_ads(h, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nid_pname_dict[nid]):  # already in the ads database
                        # skip the ad sentence
                        #log.info('find ads of {0}  : {1} '.format(nid, str_no_html.encode("utf-8")))
                        continue
                    #ts4 = datetime.datetime.now()
                    #print '-------4'
                    #print ts4
                    cursor_query.execute(query_sen_sql_interval, (str(fir), str(sec), str(thi), str(fou), str(fir2), str(sec2), str(thi2), str(fou2), news_interval))
                    #print cursor.mogrify(query_sen_sql_interval, (str(fir), str(sec), str(thi), str(fou), str(fir2), str(sec2), str(thi2), str(fou2), news_interval))
                    rows = cursor_query.fetchall()  # all potentially identical sentences
                    #print 'len of potential same sentence is {}'.format(len(rows))
                    if len(rows) == 0:  # no similar sentences
                        # store the sentence hash
                        cursor.execute(insert_sentence_hash, (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                        #logger_9965.info('len of potential same sentence is 0')
                        continue
                    #else:
                        #logger_9965.info('len of potential same sentence is {}'.format(len(rows)))

                    #ts5 = datetime.datetime.now()
                    #print '-------5'
                    #print ts5
                    same_sentence_sql_para = []
                    nids_for_ads = set()
                    for r in rows:
                        #if len(nids_for_ads) >= 15:
                            #break
                        # distance too large, or the same article
                        if h.hamming_distance_with_val(long(r[1])) > same_t or (nid in same_dict and r[0] in same_dict[nid]) or nid == r[0]:
                            #logger_9965.info('distance is too big or same news of {} and {}'.format(nid, r[0]))
                            continue
                        cursor_query.execute(same_sql2, (r[0], r[1]))
                        rs = cursor_query.fetchall()
                        for r2 in rs:
                            sen = r2[0].decode('utf-8')
                            sen_without_html = filter_tags(sen)
                            if len(sen) == 1 or len(sen_without_html) > len(str_no_html)*1.5 or len(str_no_html) > len(sen_without_html)*1.5:
                                #logger_9965.info('sentence len mismatch: {} ----{}'.format(str_no_html.encode('utf-8'), sen_without_html))
                                continue
                            wl1 = jieba.cut(str_no_html)
                            set1 = set(wl1)
                            l1 = len(set1)
                            wl2 = jieba.cut(sen_without_html)
                            set2 = set(wl2)
                            set_same = set1 & set2
                            l2 = len(set2)
                            l3 = len(set_same)
                            if l3 < max(l1, l2) * 0.6:  # require at least 60% overlap of the larger word set
                                continue
                            nids_for_ads.add(str(r[0]))
                            same_sentence_sql_para.append((nid, r[0], str_no_html, sen, t))
                            #cursor.execute(insert_same_sentence, (nid, r[0], str_no_html, sen, t))
                            #print cursor.mogrify(insert_same_sentence, (nid, r[0], str_no_html, sen_without_html, t))
                    #ts6 = datetime.datetime.now()
                    #print '-------6'
                    #print ts6
                    if len(nids_for_ads) == 0:  # no potentially identical sentences; just cache the sentence hash
                        cursor.execute(insert_sentence_hash, (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                        conn.commit()
                        continue

                    is_new_ads = False
                    not_ads_but_ignore = False   # not an ad, but duplicate handling should be skipped
                    PNAME_T = 3
                    nid_pn = {}
                    pname_set = set()
                    chid_set = set()
                    ctime_list = []
                    #print cursor.mogrify(get_pname, (tuple(nids_for_ads),))
                    cursor_query.execute(get_pname, (tuple(nids_for_ads),))
                    rows2 = cursor_query.fetchall()
                    for rk in rows2:
                        pname_set.add(rk[0])
                        chid_set.add(rk[1])
                        ctime_list.append(rk[2])
                        nid_pn[rk[3]] = rk[0]
                    if len(nids_for_ads) / float(len(pname_set)) > 3:  # added 2017-06-13
                        is_new_ads = True
                    if len(nids_for_ads) >= 10:
                        # first handle potential same-source ads
                        if len(pname_set) <= PNAME_T or (len(pname_set) > 5 and len(chid_set) < 4):
                            #if n > sen_len * .2 and n < sen_len * .8:
                            if float(n) < float(sen_len * .2) or float(n) > float(sen_len * .8):
                                min_time = ctime_list[0]
                                max_time = ctime_list[0]
                                for kkk in xrange(1, len(ctime_list)):
                                    if ctime_list[kkk] > max_time:
                                        max_time = ctime_list[kkk]
                                    if ctime_list[kkk] < min_time:
                                        min_time = ctime_list[kkk]
                                if (max_time - min_time).days > 2:  # not a hot story concentrated within two days
                                    is_new_ads = True
                            '''
                            nid_links = nid_para_links_dict[nid]
                            sum_own_links = 0  # number of paragraphs containing links
                            for kk in xrange(para_num, len(nid_links)):
                                if len(nid_links[kk]):
                                    sum_own_links += 1
                            if sum_own_links > (len(nid_links) - para_num) * 0.8: # many of the later paragraphs carry links: treat as an ad
                                is_new_ads = True
                        elif len(pname_set) > 5 and len(chid_set) < 4:   # from many sources; if concentrated in only a few channels, treat as an ad
                            # also require that the articles were not all ingested within 3 days, otherwise it may not be an ad
                            min_time = ctime_list[0]
                            max_time = ctime_list[0]
                            for kkk in xrange(1, len(ctime_list)):
                                if ctime_list[kkk] > max_time:
                                    max_time = ctime_list[kkk]
                                if ctime_list[kkk] < min_time:
                                    min_time = ctime_list[kkk]
                            if (max_time - min_time).days > 2:  # not a hot story within three days
                                is_new_ads = True
                             '''
                        else:
                            not_ads_but_ignore = True
                    #ts7 = datetime.datetime.now()
                    #print '-------7'
                    #print ts7
                    nids_str = ','.join(nids_for_ads)
                    if is_new_ads:  # a new ad was detected
                        if len(pname_set) <= PNAME_T:  # few distinct sources
                            pname_str = ','.join(pname_set)
                        else:
                            pname_str = ""
                        cursor.execute(ads_insert, (str_no_html, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nids_str, 0, pname_str))
                        #log.info('find new ads : {0}'.format(str_no_html.encode("utf-8")))
                    else:
                        #if len(same_sentence_sql_para) < 5:  # too many identical sentences detected, yet not an ad; probably a false positive, skip
                        if not_ads_but_ignore:  # too many identical sentences, treated as a false positive; store in the ads table with state=1 (not a real ad) so future hits cost less
                            cursor.execute(ads_insert, (str_no_html, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nids_str, 1, "" ))
                        else:
                            cursor.executemany(insert_same_sentence, same_sentence_sql_para)  # valid duplicate sentences
                            #log.info('get same sentence map :{}'.format(str_no_html.encode('utf-8')))
                            # multi-viewpoint: 1. sentence length > 30  2. different sources  3. not the first or last sentences
                            if len(str_no_html) > 15 and n > 2 and (n < sen_len-2):
                            #if len(str_no_html) > 15:
                                sub_nids_set = set()
                                for same in same_sentence_sql_para:
                                    nn = same[1]  #nid
                                    if nid_pname_dict[nid] != nid_pn[nn]:
                                        ctime_sql = "select nid, ctime from info_news where nid = %s or nid=%s"
                                        cursor_query.execute(ctime_sql, (same[0], same[1]))
                                        ctimes = cursor_query.fetchall()
                                        ctime_dict = {}
                                        for ct in ctimes:
                                            ctime_dict[str(ct[0])] = ct[1]
                                        cursor.execute(multo_vp_insert_sql, (str(same[0]), same[2], str(same[1]), same[3], t, ctime_dict[str(same[0])], ctime_dict[str(same[1])]))
                                        log.info('      get multi viewpoint :{}'.format(str_no_html.encode('utf-8')))
                                        sub_nids_set.add(same[0])
                                        sub_nids_set.add(same[1])
                                        subject_queue.product_simhash2((same[0], same[1]))
                                #log.info("num of mvp is {}".format(sub_nids_set))
                                if len(sub_nids_set) >= 2:  ## enqueue these news ids for subject creation
                                    log.info('      generate subject for {}'.format(sub_nids_set))
                                    #for i in sub_nids_set:
                                    #    subject_nids.add(i)
                                    key_sents = [str_no_html.encode('utf-8'), ]
                                    sub_nids = []
                                    for i in sub_nids_set:
                                        sub_nids.append(i)
                                    subject_sentence_nids.append([key_sents, sub_nids])
                                    #subject_queue.product_subject(tuple(nid_set))

                    # store the sentence hash in the cache
                    cursor.execute(insert_sentence_hash, (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                conn.commit()
                cursor.close()
                conn.close()
                cursor_query.close()
                conn_query.close()
            if len(subject_sentence_nids) > 0 and len(subject_sentence_nids) < 3:
                #log.info("before merge : {}".format(subject_sentence_nids))
                subs = merge_subs(subject_sentence_nids)
                #log.info("after merge : {}".format(subs))
                for sub in subs:
                    subject_queue.product_subject(sub)
                #log.info('generate subject for {} ------ {}'.format(nid, subject_nids))
                #subject_queue.product_subject(tuple(subject_nids))

        ttt2 = datetime.datetime.now()
        log.info('it takes {}'.format((ttt2-ttt1).total_seconds()))
        del nid_sents_dict
        del nid_para_links_dict
    except:
        log.exception(traceback.format_exc())
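
Candidate duplicates found by the segment lookup are confirmed with a word-set overlap test: both sentences are segmented with jieba and the intersection has to cover at least 60% of the larger set. A self-contained sketch of that check, with a plain whitespace split standing in for jieba so it runs on its own:

def is_overlapping(sent_a, sent_b, ratio=0.6):
    # Whitespace split stands in for jieba segmentation in this sketch.
    set_a = set(sent_a.split())
    set_b = set(sent_b.split())
    same = set_a & set_b
    return len(same) >= max(len(set_a), len(set_b)) * ratio

print(is_overlapping("the cat sat on the mat", "the cat sat on a mat"))   # True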