Пример #1
0
def get_wechat_news(name_set):
    """Run ``get_ads_proc`` once per account name in a pool of 10 workers.

    Args:
        name_set: iterable of WeChat official-account names
            (e.g. name = "美国咖").

    The call blocks until every submitted task has finished.  Results of
    the tasks are not collected, so an exception raised inside a worker is
    not propagated to the caller.
    """
    pool = Pool(10)
    try:
        for name in name_set:
            # ``args`` must be a tuple; a bare string would be unpacked
            # character by character into separate positional arguments.
            pool.apply_async(get_ads_proc, (name,))
    finally:
        # Without close()/join() the function could return (and the pool be
        # torn down) before the queued tasks have run.
        pool.close()
        pool.join()
Пример #2
0
def _is_ad_at(content_list, position, ad_text):
    """Tell whether the paragraph at ``position`` matches ``ad_text``.

    ``position`` may be negative to count from the end of ``content_list``.
    A position outside the list counts as "no match" instead of raising.
    """
    index = int(position)
    if not -len(content_list) <= index < len(content_list):
        return False
    return is_sentenses_same(filter_tags(content_list[index]), ad_text)


def get_ads_on_nid(nid):
    """Locate the leading and trailing ad paragraphs of one news item.

    The news content and its ``pname`` are read from ``newslist_v2``; the
    candidate ad paragraphs come from the module-level ``ads_dict``, which
    is indexed by ``pname`` and holds ``(position, text)`` entries
    (negative positions count from the end of the article).

    Args:
        nid: id of the news row to inspect.

    Returns:
        ``{nid: found}`` where ``found`` may contain the matched paragraph
        under the keys ``'End Ads:'`` and/or ``'Begin Ads:'``.  ``found`` is
        empty when the news row does not exist, when there are no known ads
        for its ``pname``, or when nothing matched.
    """
    # NOTE(review): uses the DB-API "%s" placeholder accepted by the common
    # PostgreSQL drivers -- confirm against what get_postgredb() returns.
    sql_get_news = "select content,pname from newslist_v2 where nid = %s"
    conn, cursor = get_postgredb()
    try:
        cursor.execute(sql_get_news, (nid,))
        rows = cursor.fetchall()
    finally:
        conn.close()
    if not rows:
        return {nid: {}}

    contents, pname = rows[0][0], rows[0][1]
    content_list = get_para_list(contents)
    sorted_ads = sorted(ads_dict.get(pname, ()), key=lambda d: d[0])
    print(sorted_ads)

    # After sorting, every negative (tail) position precedes the others.
    tail_ads = [para for para in sorted_ads if para[0] < 0]
    head_ads = [para for para in sorted_ads if not para[0] < 0]

    remove_para_list = []
    # Tail: the first match in ascending order is the one farthest from the
    # end of the article; the remaining tail candidates are skipped.
    for para in tail_ads:
        if _is_ad_at(content_list, para[0], para[1]):
            remove_para_list.append(para[0])
            break
    # Head: scan from the largest position downwards and keep the first
    # match, i.e. the one farthest from the start of the article.
    for para in reversed(head_ads):
        if _is_ad_at(content_list, para[0], para[1]):
            remove_para_list.append(para[0])
            break
    print(remove_para_list)

    result = {}
    for position in remove_para_list:
        label = 'End Ads:' if int(position) < 0 else 'Begin Ads:'
        result[label] = content_list[int(position)]
    return {nid: result}
Пример #3
0
def get_wechatnum_name():
    """Return the set of account names produced by ``sql_get_wechat``.

    Only the first column of each row is used, and of that value only the
    part before the first ';' is kept.
    """
    conn, cursor = get_postgredb()
    cursor.execute(sql_get_wechat)
    names = set(record[0].split(';')[0] for record in cursor.fetchall())
    conn.close()
    print('get wechat finished')
    return names
Пример #4
0
def test():
    """Ad-hoc smoke check of the paragraph helpers on two fixed news rows.

    Fetches the content of two hard-coded nids, splits each into
    paragraphs, strips the tags from every paragraph and compares the two
    paragraph lists with ``get_same_paras``.  Nothing is returned; the
    function only exercises the helpers.
    """
    # Alternative pair of rows used previously: (7057708, 7058238)
    sql_test = "select content from newslist_v2 where nid in (7088831, 7088826)"
    conn, cursor = get_postgredb()
    try:
        cursor.execute(sql_test)
        rows = cursor.fetchall()
    finally:
        # The rows are fully fetched, so the connection can be released
        # before the comparison work starts.
        conn.close()
    c1 = rows[0][0]
    c2 = rows[1][0]
    p1 = get_para_list(c1)
    p2 = get_para_list(c2)
    p1_no_html = [filter_tags(s) for s in p1]
    p2_no_html = [filter_tags(s) for s in p2]
    r = get_same_paras(p1, p2)
Пример #5
0
def get_ads_proc(wechat_name):
    """Detect the recurring (ad) paragraphs of one account and record them.

    Up to 30 news rows of ``wechat_name`` are fetched and compared pairwise
    with ``get_same_paras``.  Each shared paragraph is counted; the ones
    counted more than ``num / 3`` times (``num`` = number of fetched
    articles) are written, ordered by position, as one line
    ``"<name> <pos>:<text> <pos>:<text> ...\\n"`` via ``write_to_file``.

    Args:
        wechat_name: the ``pname`` of the account to analyse.
    """
    print('process of  %s' % wechat_name)
    # NOTE(review): uses the DB-API "%s" placeholder accepted by the common
    # PostgreSQL drivers -- confirm against what get_postgredb() returns.
    # Binding the name keeps a quote inside it from breaking the statement.
    sql_get_wechat_news = ("select nid, title, content from newslist_v2 "
                           "where pname = %s LIMIT 30")
    conn, cursor = get_postgredb()
    try:
        cursor.execute(sql_get_wechat_news, (wechat_name,))
        rows = cursor.fetchall()
    finally:
        conn.close()

    nids = [news[0] for news in rows]
    para_list_list = [get_para_list(news[2]) for news in rows]
    print(nids)

    # Occurrence count of every shared paragraph of this account, keyed by
    # (position, text).  get_same_paras appears to return
    # (is_same_news, {position: text}) -- inferred from how it is used here.
    ads_count = {}
    num = len(para_list_list)
    i = 0
    while i < len(para_list_list):
        paras = para_list_list[i]
        k = i + 1
        while k < len(para_list_list):
            is_same_news, same_dict = get_same_paras(paras, para_list_list[k])
            for position, text in same_dict.items():
                key = (position, text)
                ads_count[key] = ads_count.get(key, 0) + 1
            if is_same_news or same_dict:
                # Drop the compared article exactly once; the next article
                # slides into slot k, so k must not advance.
                del para_list_list[k]
            else:
                k += 1
        i += 1

    by_count = sorted(ads_count.items(), key=lambda d: d[1], reverse=True)
    # count * 3 > num  <=>  count > num / 3, without integer-division effects.
    frequent = [(int(position), text)
                for (position, text), count in by_count
                if count * 3 > num]
    sorted_list = sorted(frequent, key=lambda d: d[0])

    pieces = [str(para_num) + ':' + ''.join(para_text.split()) + ' '
              for para_num, para_text in sorted_list]
    write_to_file(wechat_name + ' ' + ''.join(pieces) + '\n')
Пример #6
0
def get_wechat_news(name_set):
    """Detect the recurring (ad) paragraphs of every account in ``name_set``.

    For each name, up to 30 news rows are fetched and compared pairwise with
    ``get_same_paras``.  Paragraphs counted more than ``num / 3`` times
    (``num`` = number of fetched articles) are treated as ads.

    Side effects:
        * ``ads_data_file`` is rewritten with one line per account:
          ``"<name> <pos>:<text> <pos>:<text> ...\\n"``.
        * ``same_news.txt`` is rewritten with one line per pair of articles
          reported as the same news by ``get_same_paras``.
        * the directory ``ads_data_dir + name + '/'`` is created if missing.

    Args:
        name_set: iterable of account names (e.g. name = "美国咖").

    Returns:
        dict mapping each name to its list of ``(position, text)`` ad
        paragraphs, ordered by position.
    """
    wechat_para_dict = {}
    # NOTE(review): uses the DB-API "%s" placeholder accepted by the common
    # PostgreSQL drivers -- confirm against what get_postgredb() returns.
    sql_get_wechat_news = ("select nid, title, content from newslist_v2 "
                           "where pname = %s LIMIT 30")
    conn, cursor = get_postgredb()
    try:
        with open(ads_data_file, 'w') as f, open('same_news.txt', 'w') as f2:
            for name in name_set:
                print(name)
                wechat_dir = ads_data_dir + name + '/'
                if not os.path.exists(wechat_dir):
                    os.mkdir(wechat_dir)
                cursor.execute(sql_get_wechat_news, (name,))
                rows = cursor.fetchall()

                nids = [news[0] for news in rows]
                para_list_list = [get_para_list(news[2]) for news in rows]
                print(nids)

                # Occurrence count of every shared paragraph of this
                # account, keyed by (position, text).
                ads_count = {}
                num = len(para_list_list)
                i = 0
                while i < len(para_list_list):
                    paras = para_list_list[i]
                    k = i + 1
                    while k < len(para_list_list):
                        is_same_news, same_dict = get_same_paras(
                            paras, para_list_list[k])
                        if is_same_news:
                            # Log before deleting so nids[k] still refers
                            # to the article that was just compared.
                            f2.write(str(nids[i]) + ' --- same with ---'
                                     + str(nids[k]) + '\n')
                        for position, text in same_dict.items():
                            key = (position, text)
                            ads_count[key] = ads_count.get(key, 0) + 1
                        if is_same_news or same_dict:
                            # Drop the compared article exactly once and
                            # keep nids aligned with para_list_list; the
                            # next article slides into slot k.
                            del para_list_list[k]
                            del nids[k]
                        else:
                            k += 1
                    i += 1

                by_count = sorted(ads_count.items(),
                                  key=lambda d: d[1],
                                  reverse=True)
                # count * 3 > num  <=>  count > num / 3, without
                # integer-division effects.
                frequent = [(int(position), text)
                            for (position, text), count in by_count
                            if count * 3 > num]
                sorted_list = sorted(frequent, key=lambda d: d[0])

                wechat_para_dict[name] = list(sorted_list)
                pieces = [
                    str(para_num) + ':' + ''.join(para_text.split()) + ' '
                    for para_num, para_text in sorted_list
                ]
                f.write(name + ' ' + ''.join(pieces) + '\n')
    finally:
        conn.close()
    return wechat_para_dict