def create(self):
    #logger_9987.info('TopicModel::create begin ...')
    docs_sframe = gl.SFrame.read_csv(self.data_path, header=True)
    docs = gl.text_analytics.count_words(docs_sframe['doc'])
    docs = gl.text_analytics.trim_rare_words(docs, threshold=30, delimiters=None)
    self.model = gl.topic_model.create(docs, num_iterations=1000, num_burnin=100, num_topics=5000)
    sf = self.model.get_topics(num_words=20, output_type='topic_words')
    conn, cursor = get_postgredb()
    for i in xrange(0, len(sf)):
        try:
            keys_words_jsonb = json.dumps(sf[i]['words'])
            cursor.execute(save_model_sql, [self.version, str(i), keys_words_jsonb])
            conn.commit()
        except Exception:
            print 'save model to db error'
    conn.close()
    del docs_sframe
    del docs

def generate_subject2(sub_nids):
    prefix = 'http://fez.deeporiginalx.com:9001'
    create_url = prefix + '/topics'
    cookie = {'Authorization': 'f76f3276c1ac832b935163c451f62a2abf5b253c'}
    # set the subject name to the title of one of the news items
    sql = "select title from newslist_v2 where nid=%s"
    conn, cursor = get_postgredb()
    cursor.execute(sql, (sub_nids[0], ))
    row = cursor.fetchone()
    sub_name = row[0]
    data = {'name': sub_name}
    response = requests.post(create_url, data=data, cookies=cookie)
    print response.content
    content = json.loads(response.content)
    id = content['id']
    topic_class_url = prefix + '/topic_classes'
    data = {'topic': id, 'name': 'random'}
    response = requests.post(topic_class_url, data=data, cookies=cookie)
    class_id = json.loads(response.content)['id']
    add_nid_url = prefix + '/topic_news'
    for nid in sub_nids:
        data = {'topic_id': id, 'news_id': nid, 'topic_class_id': class_id}
        requests.post(add_nid_url, data=data, cookies=cookie)
    conn.close()

def detect_multivp_and_ads(nid_list):
    conn, cursor = get_postgredb()
    cursor.execute(sql, (tuple(nid_list), ))
    rows = cursor.fetchall()
    all_nids = set()
    for r in rows:
        all_nids.add(r[0])
        all_nids.add(r[1])
    nids = []
    for i in all_nids:
        nids.append(i)
    # look up the pname (source name) of these nids
    cursor.execute(ch_sql, (tuple(nids), ))
    rows2 = cursor.fetchall()
    nid_pname_dict = {}
    for r in rows2:
        nid_pname_dict[r[0]] = r[1]
    for r in rows:
        nid1 = r[0]
        pname1 = nid_pname_dict[nid1]
        nid2 = r[1]
        pname2 = nid_pname_dict[nid2]
        if pname1 == pname2:
            # same source: may be an advertisement
            pass
        else:
            # check further whether this is a multi-viewpoint pair
            pass
    conn.close()

def save_subject_sentences(sub_id, sents):
    # record the subject's key sentences
    conn, cursor = get_postgredb()
    sub_sents_sql = "insert into topic_sentences (topic_id, sentences) values (%s, %s)"
    cursor.execute(sub_sents_sql, (sub_id, json.dumps(sents)))
    conn.commit()
    conn.close()

def update_sub(old_sub_id, sub):
    logger_sub.info(' update_sub {} to: {}'.format(old_sub_id, sub))
    conn, cursor = get_postgredb()
    # create a new class_id for the existing subject
    class_id = create_subject_class(old_sub_id)
    # add the news items
    add_news_to_subject(old_sub_id, class_id, sub[1])
    # update the subject name
    update_sub_name_on_nids(old_sub_id, sub[1])
    # add the key sentences to the topic
    sent_sql = "select sentences from topic_sentences where topic_id=%s"
    cursor.execute(sent_sql, (old_sub_id, ))
    row = cursor.fetchone()
    if row:
        old_sents = row[0]
    else:
        old_sents = []
    added_sen = set(sub[0]) - set(old_sents)
    if len(added_sen) > 0:
        old_sents.extend(added_sen)
        update_sql = "update topic_sentences set sentences=%s where topic_id=%s"
        cursor.execute(update_sql, (json.dumps(old_sents), old_sub_id))
        conn.commit()
    conn.close()

def check_same_news(nid1, nid2):
    conn, cursor = doc_process.get_postgredb()
    check_state = "select state from newslist_v2 where nid in ({}, {}) and state=0"
    cursor.execute(check_state.format(nid1, nid2))
    rs = cursor.fetchall()
    if len(list(rs)) < 2:
        return
    words_list1 = doc_process.get_words_on_nid(nid1)  # word segmentation of the first news item
    words_list2 = doc_process.get_words_on_nid(nid2)  # word segmentation of the second news item
    h1 = simhash(words_list1)  # simhash of the first news item
    h2 = simhash(words_list2)  # simhash of the second news item
    diff_bit = h1.hamming_distance(h2)
    if diff_bit > 12:
        # more than 12 differing bits: assume they cannot be the same news item
        return
    title_sql = "select title from newslist_v2 where nid in ({}, {})"
    cursor.execute(title_sql.format(nid1, nid2))
    rows = cursor.fetchall()
    titles = [r[0] for r in rows]
    if doc_process.get_sentence_similarity(titles[0], titles[1]) > 0.3:  # title similarity above 0.3
        off_nid = del_nid_of_fewer_comment(nid1, nid2, log=logger_sen)
        t0 = datetime.datetime.now()
        # record the dedup operation
        cursor.execute(insert_same_sql.format(nid1, nid2, diff_bit, t0.strftime('%Y-%m-%d %H:%M:%S'), off_nid))
        conn.commit()
    cursor.close()
    conn.close()

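# check_same_news above treats two articles as candidate duplicates when their simhash
# fingerprints differ in at most 12 of 64 bits and the title similarity is above 0.3.
# Below is a minimal, self-contained illustration of the bit-distance part using plain
# ints; it is a sketch only, and assumes the project's simhash class wraps an equivalent
# XOR/popcount computation on a 64-bit value.
def hamming_distance_sketch(a, b):
    """Number of differing bits between two 64-bit fingerprints (illustrative helper)."""
    return bin((a ^ b) & 0xFFFFFFFFFFFFFFFF).count('1')

# Example: fingerprints differing only in the low 4 bits are well under the 12-bit
# threshold and would proceed to the title-similarity check:
# hamming_distance_sketch(0x0123456789ABCDEF, 0x0123456789ABCDE0) == 4
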
def cal_save_simhash_proc(nids, t):
    conn, cursor = doc_process.get_postgredb()
    for nid in nids:
        words_list = doc_process.get_words_on_nid(nid)  # word segmentation of the news item
        h = simhash(words_list)  # simhash of this news item
        fir, sec, thi, fou, fir2, sec2, thi2, fou2 = get_4_segments(h.__long__())  # split the hash value into segments
        # store the news simhash together with its segments
        cursor.execute(insert_news_simhash_sql.format(nid, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2))
    conn.commit()
    cursor.close()
    conn.close()

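# The fir..fou / fir2..fou2 values stored above come from get_4_segments, defined
# elsewhere in this repo. A minimal sketch of the underlying idea (an assumption,
# not the actual implementation): split the 64-bit simhash into 16-bit bands, so
# that two fingerprints within a small Hamming distance (<= 3 differing bits) must
# agree on at least one band, letting the SQL lookups prefilter candidates by band
# equality. The second set of four segments is assumed here to be a rotated split.
def get_4_segments_sketch(hash_val):
    """Hypothetical helper: four 16-bit bands of a 64-bit simhash, plus the four
    bands of the value rotated by 8 bits (illustrative assumption only)."""
    bands = [(hash_val >> (16 * i)) & 0xFFFF for i in range(4)]
    rotated = ((hash_val >> 8) | ((hash_val & 0xFF) << 56)) & 0xFFFFFFFFFFFFFFFF
    bands2 = [(rotated >> (16 * i)) & 0xFFFF for i in range(4)]
    return tuple(bands) + tuple(bands2)
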
def clear_data():
    try:
        log_cf_clear_data.info('begin clear data...')
        conn, cursor = get_postgredb()
        cursor.execute(clear_sql)
        cursor.execute(clear_sql2)
        conn.commit()
        cursor.close()
        conn.close()
        log_cf_clear_data.info('finish clearing data...')
    except:
        log_cf_clear_data.info(traceback.format_exc())

def test_jsonb():
    conn, cursor = get_postgredb()
    id = 0
    sentence = ['测试句子1', '测试句子2']
    insert_sql = "insert into topic_sentences (topic_id, sentences) values (%s, %s)"
    print json.dumps(sentence)
    #cursor.execute(insert_sql, (id, json.dumps(sentence)))
    #conn.commit()
    query_sql = "select sentences from topic_sentences where topic_id=%s"
    cursor.execute(query_sql, (id, ))
    row = cursor.fetchone()
    print row[0]
    print type(row[0])
    conn.close()

def move_sentence_data():
    try:
        nt = datetime.datetime.now()
        t = nt.strftime('%Y-%m-%d %H:%M:%S')
        logger_9963.info('move_sentence_data--- {}'.format(t))
        conn, cursor = get_postgredb()
        # cursor.execute(move_sentenct_sql, (t, ))
        # logger_9963.info('move finished')
        cursor.execute(del_sentenct_sql, (t, ))
        conn.commit()
        conn.close()
        nt2 = datetime.datetime.now()
        logger_9963.info('finished to move_sentence_data. it takes {} s'.format((nt2 - nt).total_seconds()))
    except:
        logger_9963.info(traceback.format_exc())

def predict_click(click_info, model_v=None):
    try:
        if not model_v:
            model_v = os.path.split(get_newest_dir(model_base_path))[-1]
        uid = click_info[0]
        nid = click_info[1]
        if isinstance(click_info[2], basestring):
            time_str = click_info[2]
            ctime = datetime.datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')
        else:
            ctime = click_info[2]
            time_str = ctime.strftime('%Y-%m-%d %H:%M:%S')
        logger_9990.info("consume click: uid={}, nid={}, time_str={}".format(uid, nid, time_str))
        valid_time = ctime + timedelta(days=30)  # validity window is 30 days
        fail_time = valid_time.strftime('%Y-%m-%d %H:%M:%S')
        conn, cursor = get_postgredb_query()
        cursor.execute(nt_sql.format(nid, model_v))  # candidate topics for this nid
        rows = cursor.fetchall()
        for r in rows:
            topic_id = r[0]
            probability = r[1]
            conn2, cursor2 = get_postgredb()
            cursor2.execute(ut_sql.format(uid, model_v, topic_id))
            rows2 = cursor2.fetchone()
            if rows2:
                # the user already follows this topic_id; just update the probability
                new_prop = probability + rows2[0]
                logger_9990.info('update: uid={}, topic_id={}'.format(uid, topic_id))
                cursor2.execute(ut_update_sql.format(new_prop, time_str, fail_time, uid, model_v, topic_id))
            else:
                cursor2.execute(user_topic_insert_sql.format(uid, model_v, topic_id, probability, time_str, fail_time))
            conn2.commit()
            conn2.close()
        cursor.close()
        conn.close()
    except:
        traceback.print_exc()

def get_clicks_5m():
    logger_9989.info('news epoch...')
    global last_time
    now = datetime.datetime.now()
    if last_time > now:
        logger_9989.info(' **** time error! {}'.format(last_time))
        last_time = now - timedelta(seconds=3)
    conn, cursor = get_postgredb()
    #cursor.execute(click_sql.format(channels, period))
    cursor.execute(click_sql.format(channels, last_time.strftime('%Y-%m-%d %H:%M:%S.%f')))
    rows = cursor.fetchall()
    for r in rows:
        if r[3] > now:
            continue
        last_time = r[3]
        ctime_str = r[2].strftime('%Y-%m-%d %H:%M:%S')
        logger_9989.info(' produce {}--{}--{}'.format(r[0], r[1], ctime_str))
        nid_queue.produce_user_click_lda(r[0], r[1], ctime_str)
    cursor.close()
    conn.close()

def get_potential_topic(user_topic_prop_dict, user_neighbours, model_v, time):
    log_cf.info(' begin to get_potential_topic...')
    potential_utp_dict = dict()  # for each user, the topics recommended by neighbours and their scores
    for it in user_neighbours.items():
        u = it[0]
        potential_utp_dict[u] = dict()
        for nei_sim in it[1]:  # each neighbour
            nei = nei_sim[0]
            sim = nei_sim[1]
            if sim == 1.0:  # identical users need no further comparison
                continue
            nei_topics_prop = user_topic_prop_dict[nei]  # all topics of the neighbour
            for tp in nei_topics_prop.items():
                # if tp[0] not in user_topic_prop_dict[u]:  # only topics the original user has not interacted with
                potential_utp_dict[u][tp[0]] = potential_utp_dict[u].setdefault(tp[0], 0) + sim * tp[1]
    user_potential_topic_sql = "insert into user_topic_cf (uid, model_v, topic_id, property, ctime) VALUES ({}, '{}', {}, {}, '{}')"
    if TEST_FLAG:
        us, ts, ps = [], [], []
        for item in potential_utp_dict.items():
            for i in item[1].items():
                us.append(item[0])
                ts.append(i[0])
                ps.append(i[1])
        f = os.path.join(real_dir_path, 'data', 'final_recommend.csv')
        pd.DataFrame({'user': us, 'topic': ts, 'prop': ps}).to_csv(f, columns=('user', 'topic', 'prop'))
        print ' finished get_potential_topic...'
    else:
        conn, cursor = get_postgredb()
        for item in potential_utp_dict.items():
            u = item[0]
            topic_score = item[1]
            #sorted_topic_score = sorted(topic_score.items(), key=lambda d: d[1], reverse=True)[:30]
            sorted_topic_score = nlargest(30, topic_score.items(), key=itemgetter(1))
            for it in sorted_topic_score:
                if it[1] > 0.1:
                    cursor.execute(user_potential_topic_sql.format(u, model_v, it[0], it[1], time))
        conn.commit()
        conn.close()
    log_cf.info(' finished get_potential_topic...')

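# get_potential_topic accumulates, for each user, sum(similarity * neighbour's topic
# probability) over all neighbours, then keeps the 30 largest scores above 0.1.
# A tiny worked example under assumed inputs (names and numbers are illustrative only):
# user 'u1' has neighbours ('u2', sim 0.8) and ('u3', sim 0.5); u2 has topic 7 with
# probability 0.4 and u3 has topic 7 with probability 0.2. The accumulated score for
# (u1, topic 7) is 0.8 * 0.4 + 0.5 * 0.2 = 0.42, which passes the 0.1 cut-off and
# would be written to user_topic_cf.
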
def create_subject(nids):
    try:
        logger_sub.info('create subject for {}'.format(nids))
        conn, cursor = get_postgredb()
        sql = "select title from newslist_v2 where nid in ({})"
        nid_str = ', '.join(str(i) for i in nids)
        cursor.execute(sql.format(nid_str))
        rows = cursor.fetchall()
        conn.close()
        sub_name = choose_subject_name([r[0] for r in rows])
        if not sub_name:
            return
        data = {'name': sub_name, 'type': 1, 'cover': subject_cover}
        logger_sub.info('create subject "{}"'.format(sub_name))
        response = requests.post(create_url, data=data, cookies=cookie)
        content = json.loads(response.content)
        if 'id' not in content:
            logger_sub.info('error to create subject : {}'.format(content))
            return
        return content['id']
    except:
        logger_sub.exception(traceback.format_exc())
        return

def del_same_old_news(nid, nid_hash_dict):
    '''Compare the news item's hash directly against the cached hashes of older news.'''
    nid = int(nid)
    if nid not in nid_hash_dict:
        return
    conn, cursor = doc_process.get_postgredb()
    hash_val = nid_hash_dict[nid]
    for n, hv in nid_hash_dict.items():
        if n == nid:
            continue
        diff_bit = dif_bit(hash_val, hv)
        if diff_bit <= 6:
            offnid = del_nid_of_fewer_comment(nid, n)
            t0 = datetime.datetime.now()
            # record the dedup operation
            cursor.execute(insert_same_sql.format(nid, n, diff_bit, t0.strftime('%Y-%m-%d %H:%M:%S'), offnid))
            nid_hash_dict.pop(offnid)
            break  # assume at most one duplicate exists; the data supports this
    conn.commit()
    cursor.close()
    conn.close()

def add_news_to_subject(sub_id, class_id, nids):
    conn, cursor = get_postgredb()
    sub_nids_sql = "select news from topicnews where topic=%s"
    cursor.execute(sub_nids_sql, (sub_id, ))
    rows = cursor.fetchall()
    old_sub_nids_set = set()
    for r in rows:
        old_sub_nids_set.add(r[0])
    sub_nids_set = set(nids)
    # insert the new news items into the subject
    added_nids = sub_nids_set - old_sub_nids_set
    for nid in added_nids:
        data = {'topic_id': sub_id, 'news_id': nid, 'topic_class_id': class_id}
        requests.post(add_nid_url, data=data, cookies=cookie)
    # 2017.05.22: check whether the subject's news overlaps an existing subject; if so, delete the smaller one
    potential_same_sub_sql = "select topic from topicnews tn inner join topiclist tl on tn.topic=tl.id where news in ({}) and topic != {} group by topic"
    nid_str = ', '.join(str(i) for i in (sub_nids_set | old_sub_nids_set))
    cursor.execute(potential_same_sub_sql.format(nid_str, sub_id))
    rows = cursor.fetchall()
    for r in rows:
        del_id = check_del_sub_subject(sub_id, r[0])
        if del_id and del_id == sub_id:
            # an existing subject already covers sub_id; sub_id was deleted, so return
            conn.close()
            return
    # query the subject's current topic distribution
    sub_topic_sql = "select model_v, topic_id, probability from subject_topic where subject_id=%s"
    cursor.execute(sub_topic_sql, (sub_id, ))
    sub_topic_dict = dict()
    topic_model_v = ''
    rows = cursor.fetchall()
    for r in rows:
        topic_model_v = r[0]
        sub_topic_dict[r[1]] = r[2]
    old_topics = sub_topic_dict.keys()
    # compute the topics of the added news
    news_topic_sql = "select topic_id, probability, model_v from news_topic_v2 where nid=%s"
    topic_model_set = set()
    news_topics_dict = dict()
    for nid in added_nids:
        cursor.execute(news_topic_sql, (nid, ))
        rows2 = cursor.fetchall()
        for r in rows2:
            topic_model_set.add(r[2])
            if r[0] in news_topics_dict:
                news_topics_dict[r[0]] += r[1]
            else:
                news_topics_dict[r[0]] = r[1]
    if len(topic_model_set) == 0 or len(topic_model_set) != 1 or \
            (topic_model_v != '' and topic_model_v != list(topic_model_set)[0]):
        # the topic info spans multiple model versions
        conn.close()
        return
    if topic_model_v == '':
        topic_model_v = list(topic_model_set)[0]
    # update the subject's topic distribution
    for item in news_topics_dict.items():
        if item[0] in sub_topic_dict:
            sub_topic_dict[item[0]] += item[1] / len(added_nids)
        else:
            sub_topic_dict[item[0]] = item[1] / len(added_nids)
    sub_topic_sort = sorted(sub_topic_dict.items(), key=lambda d: d[1], reverse=True)
    time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    update_sub_topic = "update subject_topic set probability=%s, ctime=%s where subject_id=%s and model_v=%s and topic_id=%s"
    insert_sub_topic = "insert into subject_topic (subject_id, model_v, topic_id, probability, ctime) values (%s, %s, %s, %s, %s)"
    for i in range(0, min(len(sub_topic_sort), 10)):
        tid = sub_topic_sort[i][0]
        tp = sub_topic_sort[i][1]
        if tid in old_topics:
            cursor.execute(update_sub_topic, (tp, time, sub_id, topic_model_v, tid))
        else:
            cursor.execute(insert_sub_topic, (sub_id, topic_model_v, tid, tp, time))
    # put the subject online automatically once it contains at least 5 news items
    all_nids = old_sub_nids_set | sub_nids_set
    logger_sub.info(' sub {} :{} -change to - {}'.format(sub_id, old_sub_nids_set, all_nids))
    if len(all_nids) >= 5:
        data = {'zt_id': sub_id, 'online': 0}
        requests.get(online_url, params=data, cookies=cookie)
    conn.commit()
    cursor.close()
    conn.close()

def generate_subject(sub):
    try:
        sub_sents = sub[0]
        sub_nids = sub[1]
        conn, cursor = get_postgredb()
        state_sql = 'select nid from newslist_v2 where nid in ({}) and state=0'
        cursor.execute(state_sql.format(','.join(str(n) for n in sub_nids)))
        rs = cursor.fetchall()
        sub_nids = list(r[0] for r in rs)
        if len(sub_nids) <= 1:
            conn.close()
            return
        logger_sub.info('******prepare to create subject for {}'.format(sub_nids))
        # decide whether to create a new subject or merge into an existing one
        if len(sub_nids) > 4:  # only subjects with more than 4 news items may be merged into others
            oldsub_nid_dict = dict()  # old subject -> nids it shares with this sub
            nid_old_sub_sql = "select tn.topic, tn.news from topicnews tn " \
                              "inner join topiclist tl on tn.topic=tl.id " \
                              "where news in %s and tl.type=1"
            cursor.execute(nid_old_sub_sql, (tuple(sub_nids), ))
            rows = cursor.fetchall()
            for r in rows:
                if r[0] in oldsub_nid_dict:
                    oldsub_nid_dict[r[0]].append(r[1])
                else:
                    oldsub_nid_dict[r[0]] = [r[1], ]
            update = False
            for item in oldsub_nid_dict.items():
                if float(len(item[1])) >= 0.5 * len(sub_nids):
                    # at least half of this sub's nids are already in an old subject: merge into it
                    update_sub(item[0], sub)
                    update = True
            if update:
                conn.close()
                return
        # otherwise create a new subject
        sub_id = create_subject(sub_nids)
        if not sub_id:
            return
        class_id = create_subject_class(sub_id)
        add_news_to_subject(sub_id, class_id, sub_nids)
        save_subject_sentences(sub_id, sub_sents)
        conn.commit()
        conn.close()
    except:
        logger_sub.exception(traceback.format_exc())

def predict(model, nid_list):
    logger_9988.info('predict {}'.format(nid_list))
    t0 = datetime.datetime.now()
    nid_words_dict = get_news_words(nid_list)
    nids = []
    doc_list = []
    for item in nid_words_dict.items():
        nids.append(item[0])
        doc_list.append(item[1])
    ws = gl.SArray(doc_list)
    docs = gl.SFrame(data={'X1': ws})
    docs = gl.text_analytics.count_words(docs['X1'])
    # pred holds each document's probability over all topics
    pred = model.model.predict(docs, output_type='probability', num_burnin=50)
    props_list = []  # topic-probability pairs for every document
    for doc_index in xrange(len(pred)):  # distribution of each doc
        doc_props = pred[doc_index]
        index_val_dict = {}
        for k in xrange(len(doc_props)):
            if doc_props[k] > 0.1:
                index_val_dict[k] = doc_props[k]  # {topic1: 0.3, topic2: 0.2, ...}
        sort_prop = sorted(index_val_dict.items(), key=lambda d: d[1], reverse=True)
        props = []  # topic-probability pairs for this document, e.g. [(5, 0.3), (3, 0.2), ...]
        for i in xrange(min(3, len(sort_prop))):
            if i == 0:
                props.append(sort_prop[i])
            else:
                if sort_prop[i][1] > 0.5 * sort_prop[i - 1][1]:  # above 0.1 and at least half of the previous probability
                    props.append(sort_prop[i])
                else:
                    break
        props_list.append(props)  # [[(5, 0.3), (3, 0.2), ...], ...]
    # write results to the DB
    insert_list = []
    str_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    res_dict_list = []
    for n in xrange(len(nids)):
        for m in xrange(len(props_list[n])):
            topic_id = props_list[n][m][0]
            prop = props_list[n][m][1]
            insert_list.append((nids[n], model.version, topic_id, prop, str_time))
            '''
            sf = model.model.get_topics(num_words=20, output_type='topic_words')
            info_dict = {}
            info_dict['nid'] = nids[n]
            info_dict['model_v'] = model_version
            info_dict['topic_id'] = topic_id
            info_dict['probability'] = prop
            info_dict['topic_words'] = sf[topic_id]['words']
            res_dict_list.append(info_dict)
            '''
    conn, cursor = get_postgredb()
    cursor.executemany(insert_sql, insert_list)
    conn.commit()
    conn.close()
    t1 = datetime.datetime.now()
    logger_9988.info('prediction takes {}s'.format((t1 - t0).total_seconds()))
    return res_dict_list

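# Worked example of the topic-selection rule in predict(), with assumed numbers: if a
# document's probabilities above 0.1 are {12: 0.35, 40: 0.20, 3: 0.15}, the sort gives
# [(12, 0.35), (40, 0.20), (3, 0.15)]. Topic 12 is always kept; topic 40 is kept because
# 0.20 > 0.5 * 0.35; topic 3 is kept because 0.15 > 0.5 * 0.20; at most three topics per
# document are written to the DB.
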
def cal_process(nid_set, log=None, same_t=3, news_interval=3, same_dict={}):
    log = logger_9965
    log.info('there are {} news to calculate'.format(len(nid_set)))
    ttt1 = datetime.datetime.now()
    try:
        nid_sents_dict, nid_para_links_dict, nid_pname_dict = get_nids_sentences(nid_set)
        kkkk = 0
        for item in nid_sents_dict.items():  # each news item
            # subjects to create: each element holds a list of key sentences and a list of news ids,
            # e.g. [[['abc', 'aaa'], [123, 231]], [['bcd', 'bbb'], [542, 126]]]
            subject_sentence_nids = []
            kkkk += 1
            n = 0
            nid = item[0]
            log.info(' cal {} sentences...'.format(nid))
            #log.info('--- consume :{}'.format(nid))
            t = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            para_sent_dict = item[1]
            sen_len = 0  # total number of sentences in the article
            for pa in para_sent_dict.items():  # each paragraph
                sen_len += len(pa[1])
            for pa in para_sent_dict.items():
                para_num = pa[0]  # paragraph number
                sents = pa[1]
                conn, cursor = get_postgredb()
                conn_query, cursor_query = get_postgredb_query()
                for s in sents:  # each sentence
                    n += 1
                    str_no_html, wl = filter_html_stopwords_pos(s, False, True, True, False)
                    if len(wl) <= 10:  # drop very short sentences; many are just special characters
                        continue
                    h = simhash.simhash(wl)
                    check_exist_sql = "select nid from news_sentence_hash_cache where nid=%s and hash_val=%s"
                    # the sentence already exists in this news item, i.e. a duplicate within the article
                    cursor_query.execute(check_exist_sql, (nid, h.__str__()))
                    if len(cursor_query.fetchall()) != 0:
                        #log.info('sentence has existed in this news: {}'.format(str_no_html.encode("utf-8")))
                        continue
                    fir, sec, thi, fou, fir2, sec2, thi2, fou2 = simhash.get_4_segments(h.__long__())
                    if is_sentence_ads(h, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nid_pname_dict[nid]):
                        # already in the ads DB: drop the ad sentence
                        #log.info('find ads of {0} : {1} '.format(nid, str_no_html.encode("utf-8")))
                        continue
                    cursor_query.execute(query_sen_sql_interval, (str(fir), str(sec), str(thi), str(fou), str(fir2), str(sec2), str(thi2), str(fou2), news_interval))
                    rows = cursor_query.fetchall()  # all candidate matching sentences
                    if len(rows) == 0:  # no similar sentences: just cache this sentence's hash
                        cursor.execute(insert_sentence_hash, (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                        #logger_9965.info('len of potential same sentence is 0')
                        continue
                    #else:
                        #logger_9965.info('len of potential same sentence is {}'.format(len(rows)))
                    same_sentence_sql_para = []
                    nids_for_ads = set()
                    for r in rows:
                        #if len(nids_for_ads) >= 15:
                            #break
                        # skip if the distance is too large, or it is the same news item
                        if h.hamming_distance_with_val(long(r[1])) > same_t or (nid in same_dict.keys() and r[0] in same_dict[nid]) or nid == r[0]:
                            #logger_9965.info('distance is too big or same news of {} and {}'.format(nid, r[0]))
                            continue
                        cursor_query.execute(same_sql2, (r[0], r[1]))
                        rs = cursor_query.fetchall()
                        for r2 in rs:
                            sen = r2[0].decode('utf-8')
                            sen_without_html = filter_tags(sen)
                            if len(sen) == 1 or len(sen_without_html) > len(str_no_html) * 1.5 or len(str_no_html) > len(sen_without_html) * 1.5:
                                #logger_9965.info('sentence len mismatch: {} ----{}'.format(str_no_html.encode('utf-8'), sen_without_html))
                                continue
                            wl1 = jieba.cut(str_no_html)
                            set1 = set(wl1)
                            l1 = len(set1)
                            wl2 = jieba.cut(sen_without_html)
                            set2 = set(wl2)
                            set_same = set1 & set2
                            l2 = len(set2)
                            l3 = len(set_same)
                            if l3 < max(l1, l2) * 0.6:  # the word-overlap ratio must reach 0.6
                                continue
                            nids_for_ads.add(str(r[0]))
                            same_sentence_sql_para.append((nid, r[0], str_no_html, sen, t))
                            #cursor.execute(insert_same_sentence, (nid, r[0], str_no_html, sen, t))
                    if len(nids_for_ads) == 0:  # no potentially identical sentences: cache the hash and move on
                        cursor.execute(insert_sentence_hash, (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                        conn.commit()
                        continue
                    is_new_ads = False
                    not_ads_but_ignore = False  # not an ad, but the duplicate computation should be skipped
                    PNAME_T = 3
                    nid_pn = {}
                    pname_set = set()
                    chid_set = set()
                    ctime_list = []
                    cursor_query.execute(get_pname, (tuple(nids_for_ads), ))
                    rows2 = cursor_query.fetchall()
                    for rk in rows2:
                        pname_set.add(rk[0])
                        chid_set.add(rk[1])
                        ctime_list.append(rk[2])
                        nid_pn[rk[3]] = rk[0]
                    if len(nids_for_ads) / float(len(pname_set)) > 3:  # added 2017.06.13
                        is_new_ads = True
                    if len(nids_for_ads) >= 10:
                        # handle potential same-source ads first
                        if len(pname_set) <= PNAME_T or (len(pname_set) > 5 and len(chid_set) < 4):
                            #if n > sen_len * .2 and n < sen_len * .8:
                            if float(n) < float(sen_len * .2) or float(n) > float(sen_len * .8):
                                min_time = ctime_list[0]
                                max_time = ctime_list[0]
                                for kkk in xrange(1, len(ctime_list)):
                                    if ctime_list[kkk] > max_time:
                                        max_time = ctime_list[kkk]
                                    if ctime_list[kkk] < min_time:
                                        min_time = ctime_list[kkk]
                                if (max_time - min_time).days > 2:  # not a hot story concentrated within two days
                                    is_new_ads = True
                            '''
                            nid_links = nid_para_links_dict[nid]
                            sum_own_links = 0  # number of paragraphs containing links
                            for kk in xrange(para_num, len(nid_links)):
                                if len(nid_links[kk]):
                                    sum_own_links += 1
                            if sum_own_links > (len(nid_links) - para_num) * 0.8:  # many links in the later paragraphs: treat as an ad
                                is_new_ads = True
                            elif len(pname_set) > 5 and len(chid_set) < 4:
                                # from multiple sources but concentrated in a few channels: treat as an ad,
                                # provided the ingestion times are not concentrated within 3 days
                                min_time = ctime_list[0]
                                max_time = ctime_list[0]
                                for kkk in xrange(1, len(ctime_list)):
                                    if ctime_list[kkk] > max_time:
                                        max_time = ctime_list[kkk]
                                    if ctime_list[kkk] < min_time:
                                        min_time = ctime_list[kkk]
                                if (max_time - min_time).days > 2:  # not a hot story concentrated within three days
                                    is_new_ads = True
                            '''
                        else:
                            not_ads_but_ignore = True
                    nids_str = ','.join(nids_for_ads)
                    if is_new_ads:  # a new ad
                        if len(pname_set) <= PNAME_T:  # sources
                            pname_str = ','.join(pname_set)
                        else:
                            pname_str = ""
                        cursor.execute(ads_insert, (str_no_html, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nids_str, 0, pname_str))
                        #log.info('find new ads : {0}'.format(str_no_html.encode("utf-8")))
                    else:
                        #if len(same_sentence_sql_para) < 5:  # too many identical sentences and not an ad: probably a false positive, skip
                        if not_ads_but_ignore:
                            # too many identical sentences: treat as a false positive and insert into the ads DB
                            # with state=1 (not a real ad), to reduce computation the next time it is seen
                            cursor.execute(ads_insert, (str_no_html, h.__str__(), t, fir, sec, thi, fou, fir2, sec2, thi2, fou2, nids_str, 1, ""))
                        else:
                            cursor.executemany(insert_same_sentence, same_sentence_sql_para)  # valid duplicate sentences
                            #log.info('get same sentence map :{}'.format(str_no_html.encode('utf-8')))
                            # multi-viewpoint criteria: 1. sentence length > 30; 2. different sources; 3. skip the first/last sentences
                            if len(str_no_html) > 15 and n > 2 and (n < sen_len - 2):
                            #if len(str_no_html) > 15:
                                sub_nids_set = set()
                                for same in same_sentence_sql_para:
                                    nn = same[1]  # nid
                                    if nid_pname_dict[nid] != nid_pn[nn]:
                                        ctime_sql = "select nid, ctime from info_news where nid = %s or nid=%s"
                                        cursor_query.execute(ctime_sql, (same[0], same[1]))
                                        ctimes = cursor_query.fetchall()
                                        ctime_dict = {}
                                        for ct in ctimes:
                                            ctime_dict[str(ct[0])] = ct[1]
                                        cursor.execute(multo_vp_insert_sql, (str(same[0]), same[2], str(same[1]), same[3], t, ctime_dict[str(same[0])], ctime_dict[str(same[1])]))
                                        log.info(' get multi viewpoint :{}'.format(str_no_html.encode('utf-8')))
                                        sub_nids_set.add(same[0])
                                        sub_nids_set.add(same[1])
                                        subject_queue.product_simhash2((same[0], same[1]))
                                #log.info("num of mvp is {}".format(sub_nids_set))
                                if len(sub_nids_set) >= 2:
                                    # enqueue the subject news
                                    log.info(' generate subject for {}'.format(sub_nids_set))
                                    key_sents = [str_no_html.encode('utf-8'), ]
                                    sub_nids = []
                                    for i in sub_nids_set:
                                        sub_nids.append(i)
                                    subject_sentence_nids.append([key_sents, sub_nids])
                                    #subject_queue.product_subject(tuple(nid_set))
                    # finally cache this sentence's hash
                    cursor.execute(insert_sentence_hash, (nid, str_no_html, n, h.__str__(), fir, sec, thi, fou, t, fir2, sec2, thi2, fou2))
                conn.commit()
                cursor.close()
                conn.close()
                cursor_query.close()
                conn_query.close()
            if len(subject_sentence_nids) > 0 and len(subject_sentence_nids) < 3:
                #log.info("before merge : {}".format(subject_sentence_nids))
                subs = merge_subs(subject_sentence_nids)
                #log.info("after merge : {}".format(subs))
                for sub in subs:
                    subject_queue.product_subject(sub)
        ttt2 = datetime.datetime.now()
        log.info('it takes {}'.format((ttt2 - ttt1).total_seconds()))
        del nid_sents_dict
        del nid_para_links_dict
    except:
        log.exception(traceback.format_exc())