Exemplo n.º 1
0
def heat_quantity2():
    """
    热议文章和优质文章的交集,评论点赞量常量比,这个比值可以根据各个公众号进行拟合
    :return:
    """

    sql = "select biz, nickname from bizinfo"
    result = dbutil.query_with_sql(conn, sql)
    total = 0

    for (biz, nickname) in result:
        print("公众号昵称", nickname)
        heat_id = set()
        sql = "SELECT AVG((comment_num+commentlike_num/200)/readnum) as mean FROM content WHERE biz = '%s'" % biz
        result = dbutil.query_with_sql_one(conn, sql)
        mean = result[0]

        sql = "SELECT id, (comment_num+commentlike_num/200)/readnum as num FROM content WHERE biz = '%s'" % biz
        result = dbutil.query_with_sql(conn, sql)

        for (id, num) in result:
            if num and num > mean:
                heat_id.add(id)

        print(len(heat_id))

        quanlity_id = set()
        sql = "SELECT AVG(likenum/readnum) as mean FROM content WHERE biz = '%s'" % biz
        result = dbutil.query_with_sql_one(conn, sql)
        mean = result[0]

        sql = "SELECT id, likenum/readnum as num FROM content WHERE biz = '%s'" % biz
        result = dbutil.query_with_sql(conn, sql)

        for (id, num) in result:
            if num and num > mean:
                quanlity_id.add(id)

        print(len(quanlity_id))

        cross = heat_id & quanlity_id
        total += len(cross)

        plt.figure(figsize=(4, 4))
        venn2(subsets=[heat_id, quanlity_id], set_labels=('heat', 'quanlity'))
        plt.savefig("./heat/" + '%s.png' % nickname)
        plt.close()
    print("总量", total)
Exemplo n.º 2
0
def publish_daily(biz, nickname):
    """
    时间维度和发文数维度。公众号每日发文折线图,找出发文数量突增的点
    :param filename: 公众号昵称
    :return:
    """
    num_map = collections.OrderedDict()  # key为日期如20190102,value为该日发布的文章数
    sql = "SELECT FROM_UNIXTIME(datetime) as stamp, COUNT(*) as num  FROM content where biz = '%s' group by datetime ORDER BY datetime" % biz

    result = dbutil.query_with_sql(conn, sql)
    for (stamp, num) in result:
        ss = str(stamp)
        day = ss.split(' ')[0]  # split取出日期
        print(day)
        num_map.setdefault(day, 0)
        num_map[day] += num

    x = list(range(1, len(num_map.values()) + 1))
    # x = list(num_map.keys())   # 不能以此为坐标轴,画图太稠密

    y = list(num_map.values())
    print(x)
    print(y)
    title = '%s发文数量统计图' % nickname
    xlabel = '时间/天'
    ylabel = '发文数量'
    path = './%s_punish_daily.png' % nickname

    pltutil.plot_single_figure(x, y, title, xlabel, ylabel, path)
Exemplo n.º 3
0
def read_daily(figurename):
    num_map = collections.OrderedDict()  # key为日期如20190102,value为该日的阅读量

    sql = "SELECT DISTINCT(datetime) FROM test where datetime between 1546272000 and 1577808000 ORDER BY datetime"
    result, rowcount = dbutil.query_with_sql_rowcount(conn, sql)
    for (datetime, ) in result:
        time_local = time.localtime(datetime)
        time_format = time.strftime('%Y%m%d %H:%M:%S', time_local)
        day = time_format.split(' ')[0]  # split取出日期
        num_map.setdefault(day, 0)
        sql2 = "SELECT readnum FROM test WHERE readnum!='' and datetime = %s" % datetime
        result2 = dbutil.query_with_sql(conn, sql2)
        for (readnum, ) in result2:
            if '万' in readnum:
                readnum = float(readnum.split('万')[0]) * 10000
            readnum = int(readnum)
            num_map[day] += readnum

    # 开始画图 每日阅读量随时间变化图
    plt.title('Read Daily Information Analysis')
    x = list(num_map.keys())
    y = np.array(list(num_map.values())) / 100000
    plt.xticks(np.arange(0, len(x), 30))
    plt.plot(x, y)
    # plt.scatter(x, y)
    plt.legend()  # 显示图例
    plt.savefig('%s_read_daily.png' % figurename)
    # plt.show()
    plt.close()
Exemplo n.º 4
0
def publish_time(figurename):
    """
    时间维度,公众号发布时间变化率,可以大致看出发布文章的规律
    :param filename: 图例保存名称
    :return:
    """
    date = []

    sql = "SELECT datetime FROM test where datetime between 1546272000 and 1577808000 GROUP BY datetime ORDER BY datetime"
    result = dbutil.query_with_sql(conn, sql)
    for (datetime,) in result:
        tmp = round((datetime - START_TIME) / 3600, 1)  # 以小时为单位
        date.append(tmp)  # 减去最小值
    print(date)
    delta = []  # 变化率
    for i in range(1, len(date)):
        delta.append(round(date[i] - date[i - 1], 1))

    x = list(range(0, len(delta)))
    y = delta
    print(delta)
    plt.scatter(x, y)  # 这里画散点图比较好,
    plt.legend()
    plt.savefig('%s_delta.png' % figurename)
    plt.close()
Exemplo n.º 5
0
def extract_comment():
    sql = "SELECT id, biz, title, digest, content, strong_content, color_content,`comment` FROM content WHERE datetime BETWEEN UNIX_TIMESTAMP('2019-06-01 00:00:00') and UNIX_TIMESTAMP('2019-07-01 00:00:00') "
    result = dbutil.query_with_sql(conn, sql)
    id_list = []
    biz_list = []
    title_list = []
    digest_list = []
    content_list = []
    strong_content_list = []
    color_content_list = []
    for (id, biz, title, digest, content, strong_content, color_content,
         comment) in result:
        if content:
            contents = content.split('\n')
            print('段落长度', len(contents))
            if len(contents) == 1:
                tmp = contents[0].split('。')
                if len(tmp) < 4:
                    content = contents[0]
                    content = re.sub("\n", "。", content)
                else:
                    content = "。".join(i for i in tmp[0:4])  # 前四句
            else:
                content = contents[0] + contents[1]  # 前两段
        id_list.append(id)
        biz_list.append(biz)
        title_list.append(title)
        digest_list.append(digest)
        content_list.append(content)
        if strong_content and len(strong_content.split('。')) > 4:
            tmp = strong_content.split('。')
            strong_content = '。'.join(i for i in tmp[0:4])
        if color_content and len(color_content.split('。')) > 4:
            tmp = color_content.split('。')
            color_content = '。'.join(i for i in tmp[0:4])
        strong_content_list.append(strong_content)
        color_content_list.append(color_content)
    dataframe = pd.DataFrame({
        'id': id_list,
        'biz': biz_list,
        'title': title_list,
        'digest': digest_list,
        'content': content_list,
        'strong_content': strong_content_list,
        'color_content': color_content_list
    })
    dataframe.to_csv("trainsix_wechat.csv",
                     index=False,
                     sep=',',
                     quotechar='\"')
Exemplo n.º 6
0
def write_content_to_file_with_sql(sql):
    """
    把文章内容写入文件
    :return:
    """
    time0 = time.time()
    result = dbutil.query_with_sql(conn, sql)
    file = open("content.txt", "w")
    for (content,) in result:
        if content:
            file.write(content)
            file.write("\n")
    file.close()
    print('用时   ', time.time()-time0)
Exemplo n.º 7
0
def write_content_to_file(table):
    """
    把文章内容写入文件
    :return:
    """
    time0 = time.time()
    sql = "SELECT id,content FROM %s" % table
    result = dbutil.query_with_sql(conn, sql)
    file = open("content.txt", "w")
    for (id, content) in result:
        if content:
            file.write(content)
            file.write("\n---------------------------\n")
    file.close()
    print('用时   ', time.time()-time0)
Exemplo n.º 8
0
def publish_minute(figurename):
    """
    时间维度和发文数维度。公众号发布分钟统计图
    :param figurename:
    :return:
    """
    map = collections.OrderedDict()  # key 为分钟[0000,2400),value 为该分钟发文数

    sql = "SELECT datetime, COUNT(*) as num FROM test where datetime between 1546272000 and 1577808000 GROUP BY datetime ORDER BY datetime"
    result = dbutil.query_with_sql(conn, sql)
    for (datetime, num) in result:
        time_local = time.localtime(datetime)
        time_format = time.strftime('%Y%m%d %H%M', time_local)
        minute = time_format.split(' ')[1]
        map.setdefault(minute, 0)
        map[minute] += num

    # 按照键值排序
    x = []
    y = []
    abnormal = []
    aa = set()
    for i in sorted(map):
        print((i, map[i]), end=" ")
        x.append(i)
        aa.add(map[i])
        y.append(map[i])
        if map[i] < 7:
            abnormal.append(i)

    print('\n')
    print(abnormal)

    print(len(abnormal))
    print(aa)

    print('\n不同的分钟数 %d ' % len(x))
    # print(x)
    # print(y)
    plt.xticks(np.arange(0, 2400, 30))
    plt.plot(x, y)
    # plt.scatter(x, y)
    plt.legend()
    plt.savefig('%s_minute.png' % figurename)
    plt.show()
    plt.close()
Exemplo n.º 9
0
def publish_hour(biz, figurename):
    """
    时间维度 和 发布数维度。公众号每小时发文折线图,找出不在规律内的;误差在正负30分钟之内,进行了四舍五入
    :param figurename: 图例名称
    :return:
    """

    map = {}  # key为小时,取值范围为[0,24),value为小时数的统计

    sql = "SELECT FROM_UNIXTIME(datetime) as stamp, COUNT(*) as num  FROM content where biz = '%s' group by datetime ORDER BY datetime" % biz
    result = dbutil.query_with_sql(conn, sql)
    total = 0
    for (stamp, num) in result:
        total += num
        ss = str(stamp)
        tmp = ss.split(' ')[1].split(':')  # 取出时分秒
        hour = int(tmp[0])
        if int(tmp[1]) > 30:
            hour += 1
        if hour == 24:
            hour = 0
        map.setdefault(hour, 0)
        map[hour] += num

    # 按照键值排序
    x = []
    y = []

    for i in sorted(map):
        print((i, map[i]), end=" ")
        x.append(i)
        y.append(map[i])

    print('\n--------------')

    # print(sorted(map.items(), key=lambda kv: (kv[1], kv[0])))

    print(x)
    print(y)
    title = '%s发布时间分布图' % nickname
    xlabel = '时间/h'
    ylabel = '发文数量'
    path = './%s_punish_hour.png' % nickname

    pltutil.plot_single_figure(x,y, title,xlabel,ylabel,path)
Exemplo n.º 10
0
def delete_white_line():
    """
    去除内容冗余的空行和前后空格
    """
    time0 = time.time()
    sql = "SELECT id,digest FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for (id, digest) in result:
        # 不为空
        if digest:
            # 去除多余的空行,只保留一个空行
            content = digest.lstrip("\n").rstrip("\n")
            content = re.sub("\n{2,}", "\n", content)
            digest = content.strip()
        sql = "update content set digest = '{}' where id = {}"\
            .format(pymysql.escape_string(digest), id)
        dbutil.exec_sql(conn, sql)
        # print(digest)
    print('用时   ', time.time()-time0)
Exemplo n.º 11
0
def index_each_biz():
    """
    索引位置对各个公众号的影响:计算不同索引位置的平均阅读量和点赞量
    :return:
    """
    sql = "select biz, nickname from bizinfo"
    result = dbutil.query_with_sql(conn, sql)

    for (biz, nickname) in result:
        print("公众号昵称", nickname)
        read_map = {}  # 阅读量
        read_id = {}  # 文章标识
        like_map = {}  # 点赞量

        sql2 = "SELECT id, contenturl, readnum, likenum, title, digest FROM content where biz = '%s'" % biz
        result2, rowcount = dbutil.query_with_sql_rowcount(conn, sql2)
        for (id, contenturl, readnum, likenum, title, digest) in result2:
            index = contenturl.find('&idx=')
            idx = int(contenturl[index + 5:index + 6])
            read_map.setdefault(idx, [])
            read_map[idx].append(readnum)
            read_id.setdefault(idx, [])
            read_id[idx].append(id)
            like_map.setdefault(idx, [])
            like_map[idx].append(likenum)

        idx_read_list = []  # 索引位置平均阅读量
        idx_like_list = []  # 索引位置平均点赞量
        x = list(range(1, len(read_map) + 1))

        for i in x:
            idx_read_list.append(
                round(sum(read_map[i]) / (len(read_map[i]) * 10000),
                      2))  # 以万为单位
            idx_like_list.append(round(sum(like_map[i]) / len(like_map[i]), 1))

        # 分布图
        plot_single_figure(x, idx_read_list, "索引位置对阅读量的影响", "索引位置", "阅读量/万",
                           nickname + "_read")
        plot_single_figure(x, idx_like_list, "索引位置对点赞量的影响", "索引位置", "点赞量",
                           nickname + "_like")
Exemplo n.º 12
0
def count_comment_nickname():
    dict = {}
    sql = "SELECT `comment` FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for (comment, ) in result:
        if len(comment) > 2:
            comment = comment[2:len(comment) - 2]
            aa = comment.split('}, {')
            comment_list = []
            for a in aa:
                a = "{%s}" % a
                comment_list.append(a)
            for discuss in comment_list:
                # print(discuss)
                di = eval(discuss)  # 字符串转数组,该函数不安全
                tmp = di.get(
                    'nickname'
                )  # 评论点赞量也有过万的,在数据持久化的时候,'elected'存放的数据不太规范,有String类型,把无数据的当做0整型数据存放了
                dict.setdefault(tmp, 0)
                dict[tmp] += 1

    # jsObj = json.dumps(dict)
    # fileObject = open('jsonFile.json', 'w')
    # fileObject.write(jsObj)
    # fileObject.close()
    # csv_file = "nickname_output.csv"
    # csv_columns = ['Nickname', 'Count']
    # with open(csv_file, 'w') as csvfile:
    #     writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
    #     writer.writeheader()
    #     for key, val in dict:
    #         writer.writerow([key, val])
    filename = "nickname_output.csv"
    with open(filename, 'w') as csv_file:
        # with open(filename, 'w', encoding='utf_8_sig') as csv_file: 或者这样
        csv_file.write(codecs.BOM_UTF8.decode())  # 不加会导致中文乱码
        writer = csv.writer(csv_file)
        for key, val in dict.items():
            writer.writerow([key, val])
Exemplo n.º 13
0
def delete_white_space():
    """
    去除内容冗余的空行和前后空格
    """
    time0 = time.time()
    sql = "SELECT id,strong_content, color_content FROM content"
    result = dbutil.query_with_sql(conn, sql)
    for (id, strong_content, color_content) in result:
        # 不为空
        if strong_content:
            # 去除多余的空行,只保留一个空行
            content = strong_content.lstrip("。").rstrip("。")
            content = re.sub("。{2,}", "。", content)
            strong_content = content.strip()
        if color_content:
            # 去除多余的空行,只保留一个空行
            content = color_content.lstrip("。").rstrip("。")
            content = re.sub("。{2,}", "。", content)
            color_content = content.strip()
        sql = "update content set strong_content = '{}' , color_content='{}' where id = {}"\
            .format(pymysql.escape_string(strong_content), pymysql.escape_string(color_content), id)
        dbutil.exec_sql(conn, sql)
        # print(digest)
    print('用时   ', time.time()-time0)
Exemplo n.º 14
0
        total += len(cross)

        plt.figure(figsize=(4, 4))
        venn2(subsets=[heat_id, quanlity_id], set_labels=('heat', 'quanlity'))
        plt.savefig("./heat/" + '%s.png' % nickname)
        plt.close()
    print("总量", total)


if __name__ == '__main__':

    headlines = set()
    quans = set()
    hot = set()
    sql = 'select id from content where idx=1'
    result = dbutil.query_with_sql(conn, sql)
    for (id, ) in result:
        headlines.add(id)

    # data = pd.read_excel('优质和热议的文章.xls')
    # mean_quan = 106.8859
    # mean_hot = 84.6293
    # quans = data['id'][data['quan']>mean_quan]
    # print(quans)

    sql = 'select id from content where likenum/readnum > 0.0106886'
    result = dbutil.query_with_sql(conn, sql)
    for (id, ) in result:
        quans.add(id)

    sql = 'select id from content where (comment_num+0.004*commentlike_num)/readnum > 0.0008463'