def get_wechat_news(name_set):
    """Run ``get_ads_proc`` for every account name using a 10-process pool.

    Args:
        name_set: iterable of account names; each one is handed to
            ``get_ads_proc`` as its single positional argument.

    Returns:
        None. The call blocks until every queued task has finished.
    """
    # No database connection is opened here: the previous one was never
    # used by this function and was never closed. Each worker obtains its
    # own connection inside get_ads_proc.
    pool = Pool(10)
    try:
        for name in name_set:
            # ``args`` must be a tuple. Passing the bare string makes the
            # pool unpack it character by character, so the worker call
            # fails for any name longer than one character.
            pool.apply_async(get_ads_proc, (name,))
    finally:
        # Without close() + join() the function returns while tasks are
        # still queued and the pool can be torn down before they run.
        pool.close()
        pool.join()
def _ad_matches_paragraph(content_list, ad):
    """Tell whether the ad text ``ad[1]`` matches the paragraph at index ``ad[0]``."""
    index = int(ad[0])
    # Negative indices count from the end of the article; anything outside
    # the valid range cannot match and must not raise IndexError.
    if not -len(content_list) <= index < len(content_list):
        return False
    return is_sentenses_same(filter_tags(content_list[index]), ad[1])


def get_ads_on_nid(nid):
    """Find which known ad paragraphs of an account occur in one article.

    The article's ``content`` and ``pname`` are read from ``newslist_v2``
    and the candidate ads are looked up in the module-level ``ads_dict``
    under ``pname``. Each candidate is a sequence whose first item is a
    paragraph index (negative values count from the end) and whose second
    item is the ad text.

    Args:
        nid: id of the article, interpolated into the SQL statement.

    Returns:
        ``{nid: result}`` where ``result`` may hold the keys ``'End Ads:'``
        (paragraph matched at a negative index) and ``'Begin Ads:'``
        (paragraph matched at a non-negative index). ``result`` is empty
        when the article is not found, the account has no entry in
        ``ads_dict``, or nothing matches.
    """
    sql_get_news = "select content,pname from newslist_v2 where nid = {0}"
    conn, cursor = get_postgredb()
    try:
        cursor.execute(sql_get_news.format(nid))
        rows = cursor.fetchall()
    finally:
        # The connection is only needed for this one query.
        conn.close()
    if not rows:
        return {nid: {}}

    contents, pname = rows[0][0], rows[0][1]
    content_list = get_para_list(contents)
    try:
        ads_paras = ads_dict[pname]
    except KeyError:
        ads_paras = []
    sorted_ads = sorted(ads_paras, key=lambda d: d[0])
    print(sorted_ads)

    remove_para_list = []
    total = len(sorted_ads)
    i = 0
    while i < total:
        ad = sorted_ads[i]
        if ad[0] < 0:
            matched = _ad_matches_paragraph(content_list, ad)
            # Always advance: the previous code could spin forever on an
            # out-of-range index or on a match at the last entry.
            i += 1
            if matched:
                remove_para_list.append(ad[0])
                # After the first matching end-ad, skip the remaining
                # negative entries and continue with the begin-ads.
                while i < total and sorted_ads[i][0] < 0:
                    i += 1
        else:
            # Remaining entries are non-negative: scan from the largest
            # index downwards and keep only the first match.
            for k in range(total - 1, i - 1, -1):
                candidate = sorted_ads[k]
                if _ad_matches_paragraph(content_list, candidate):
                    remove_para_list.append(candidate[0])
                    break
            break
    print(remove_para_list)

    result = {}
    for index in remove_para_list:
        key = 'End Ads:' if int(index) < 0 else 'Begin Ads:'
        result[key] = content_list[int(index)]
    return {nid: result}
def get_wechatnum_name():
    """Return the set of names produced by the ``sql_get_wechat`` query.

    The first column of every row is split on ``';'`` and only the part
    before the first separator is kept.
    """
    conn, cursor = get_postgredb()
    cursor.execute(sql_get_wechat)
    names = {record[0].split(';')[0] for record in cursor.fetchall()}
    conn.close()
    print('get wechat finished')
    return names
def test():
    """Manual smoke test: compare the paragraphs of two fixed articles.

    Loads the content of two hard-coded nids, splits each into paragraphs,
    strips tags from every paragraph and runs ``get_same_paras`` on the two
    paragraph lists. Nothing is returned or printed.
    """
    # An earlier run used the nid pair (7057708, 7058238).
    query = "select content from newslist_v2 where nid in (7088831, 7088826)"
    conn, cursor = get_postgredb()
    cursor.execute(query)
    fetched = cursor.fetchall()
    first_content = fetched[0][0]
    second_content = fetched[1][0]

    first_paras = get_para_list(first_content)
    second_paras = get_para_list(second_content)

    # Tag-free versions are computed but not used afterwards.
    first_plain = [filter_tags(para) for para in first_paras]
    second_plain = [filter_tags(para) for para in second_paras]

    outcome = get_same_paras(first_paras, second_paras)
def get_ads_proc(wechat_name):
    """Detect recurring paragraphs of one account and append them to the ads file.

    Up to 30 articles with ``pname == wechat_name`` are loaded and compared
    pairwise with ``get_same_paras``. Every shared paragraph reported by a
    comparison is counted; paragraphs whose count exceeds a third of the
    number of loaded articles are written through ``write_to_file`` as one
    line: ``<name> <index>:<text-without-whitespace> ... \\n``.

    Args:
        wechat_name: account name, interpolated into the SQL statement and
            used as the line prefix.

    Returns:
        None.
    """
    print('process of  %s' % (wechat_name,))
    sql_get_wechat_news = "select nid, title, content from newslist_v2 where pname = \'{0}\' LIMIT 30"
    conn, cursor = get_postgredb()
    try:
        cursor.execute(sql_get_wechat_news.format(wechat_name))
        rows = cursor.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    nids = [news[0] for news in rows]
    para_list_list = [get_para_list(news[2]) for news in rows]
    print(nids)

    num = len(para_list_list)  # article count before any removal
    ads_dict = {}  # "<index>\t|<text>" -> number of comparisons sharing it
    i = 0
    while i < len(para_list_list):
        paras = para_list_list[i]
        k = i + 1
        while k < len(para_list_list):
            bSameNews, same_dict = get_same_paras(paras, para_list_list[k])
            for elem in same_dict.items():
                el = elem[0] + '\t|' + elem[1]
                ads_dict[el] = ads_dict.get(el, 0) + 1
            if bSameNews or len(same_dict):
                # Remove the compared article exactly once. Deleting a
                # second time, or advancing k after a delete, would drop
                # or skip the article that just moved into slot k.
                del para_list_list[k]
            else:
                k += 1
        i += 1

    sorted_dict = sorted(ads_dict.items(), key=lambda d: d[1], reverse=True)
    tmp_list = []
    for key, count in sorted_dict:
        # NOTE(review): ``num / 3`` is integer division unless the file
        # enables true division - confirm which is intended.
        if float(count) > float(num / 3):
            # maxsplit=1 keeps paragraph text that itself contains '\t|'.
            para = key.split('\t|', 1)
            tmp_list.append((int(para[0]), para[1]))
    sorted_list = sorted(tmp_list, key=lambda d: d[0])

    pieces = [wechat_name + ' ']
    for para_num, para_text in sorted_list:
        pieces.append(str(para_num) + ':' + ''.join(para_text.split()) + ' ')
    pieces.append('\n')
    write_to_file(''.join(pieces))
def get_wechat_news(name_set):
    """Find recurring paragraphs per account and write them to ``ads_data_file``.

    For every name a directory ``ads_data_dir + name + '/'`` is created if
    missing, up to 30 of the account's articles are loaded and compared
    pairwise with ``get_same_paras``, and paragraphs whose count exceeds a
    third of the loaded article count are kept. One line per account is
    written to ``ads_data_file``; articles flagged as the same news are
    logged to ``same_news.txt``, one record per line.

    Args:
        name_set: iterable of account names.

    Returns:
        dict mapping each name to a list of ``(paragraph_index, text)``
        tuples sorted by paragraph index.
    """
    sql_get_wechat_news = "select nid, title, content from newslist_v2 where pname = \'{0}\' LIMIT 30"
    wechat_para_dict = {}
    conn, cursor = get_postgredb()
    try:
        # ``with`` closes both files even if a query or comparison raises.
        with open(ads_data_file, 'w') as f, open('same_news.txt', 'w') as f2:
            for name in name_set:
                print(name)
                wechat_dir = ads_data_dir + name + '/'
                if not os.path.exists(wechat_dir):
                    os.mkdir(wechat_dir)

                cursor.execute(sql_get_wechat_news.format(name))
                rows = cursor.fetchall()
                nids = [news[0] for news in rows]
                para_list_list = [get_para_list(news[2]) for news in rows]
                print(nids)

                # Kept in step with para_list_list so the logged ids stay
                # correct after articles are removed.
                live_nids = list(nids)
                num = len(para_list_list)  # article count before removal
                ads_dict = {}  # "<index>\t|<text>" -> occurrence count
                i = 0
                while i < len(para_list_list):
                    paras = para_list_list[i]
                    k = i + 1
                    while k < len(para_list_list):
                        bSameNews, same_dict = get_same_paras(
                            paras, para_list_list[k])
                        if bSameNews:
                            # Newline added so records do not run together.
                            f2.write(str(live_nids[i]) + ' --- same with ---'
                                     + str(live_nids[k]) + '\n')
                        for elem in same_dict.items():
                            el = elem[0] + '\t|' + elem[1]
                            ads_dict[el] = ads_dict.get(el, 0) + 1
                        if bSameNews or len(same_dict):
                            # Remove the compared article exactly once; a
                            # second delete or a k += 1 here would drop or
                            # skip the article that moved into slot k.
                            del para_list_list[k]
                            del live_nids[k]
                        else:
                            k += 1
                    i += 1

                sorted_dict = sorted(ads_dict.items(), key=lambda d: d[1],
                                     reverse=True)
                tmp_list = []
                for key, count in sorted_dict:
                    # NOTE(review): ``num / 3`` is integer division unless
                    # the file enables true division - confirm intent.
                    if float(count) > float(num / 3):
                        # maxsplit=1 keeps text that contains '\t|'.
                        para = key.split('\t|', 1)
                        tmp_list.append((int(para[0]), para[1]))
                sorted_list = sorted(tmp_list, key=lambda d: d[0])

                f.write(name + ' ')
                for para_num, para_text in sorted_list:
                    f.write(str(para_num) + ':'
                            + ''.join(para_text.split()) + ' ')
                f.write('\n')
                wechat_para_dict[name] = list(sorted_list)
    finally:
        conn.close()
    return wechat_para_dict