def insert_mysql(year, month, day):
    """Bulk-load one day's mongo/OSS-log comparison file into MySQL.

    Reads ``target_file/mongo_compare_osslog<year><month><day>.txt`` and
    inserts one row per non-blank line into
    ``statistics_mongo_compare_osslog``.  Each line must contain at least six
    whitespace-separated fields, in this order: ``_id``, ``media``, ``posit``,
    ``url``, ``flag`` (integer) and ``file_size`` (integer).  Rows are sent in
    batches of 2000 and committed after every batch.

    Args:
        year: Year as a string; used in the file name and the stored date.
        month: Month as a string.
        day: Day as a string.

    Raises:
        IndexError: If a non-blank line has fewer than six fields.
        ValueError: If ``flag`` or ``file_size`` is not an integer.
    """
    batch_size = 2000
    date_str = year + '-' + month + '-' + day
    path = "target_file/mongo_compare_osslog" + year + month + day + ".txt"
    sql = ("insert into {}(_id, media, posit, url, flag, file_size, datetime)"
           " values (%s, %s, %s, %s, %s, %s, %s)"
           ).format("statistics_mongo_compare_osslog")

    db = get_mysql_db()
    try:
        cursor = db.cursor()
        try:
            # Stream the file line by line instead of loading it whole, and
            # let the context manager close it even if an insert fails.
            with open(path, "r") as source:
                batch = []
                for line in source:
                    fields = line.split()
                    if not fields:
                        # Blank lines (e.g. a trailing newline) carry no row.
                        continue
                    batch.append((fields[0], fields[1], fields[2], fields[3],
                                  int(fields[4]), int(fields[5]), date_str))
                    if len(batch) >= batch_size:
                        cursor.executemany(sql, batch)
                        db.commit()
                        batch = []
                # Flush whatever is left over from the last partial batch.
                if batch:
                    cursor.executemany(sql, batch)
                    db.commit()
        finally:
            cursor.close()
    finally:
        db.close()
예제 #2
0
def insert_mysql_only_osslog(year, month, day):
    """Bulk-load one day's "only in OSS log" file into MySQL.

    Reads ``target_file/only_osslog<year><month><day>.txt`` and inserts one
    row per non-blank line into ``statistics_only_osslog``.  Each line must
    contain at least three whitespace-separated fields: ``url``,
    ``file_size`` (integer) and a raw timestamp that is passed through
    ``format_time`` before being stored.  Rows are sent in batches of 2000 and
    committed after every batch.

    Args:
        year: Year as a string; used in the file name and the stored date.
        month: Month as a string.
        day: Day as a string.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If ``file_size`` is not an integer.
    """
    batch_size = 2000
    date_str = year + '-' + month + '-' + day
    path = "target_file/only_osslog" + year + month + day + ".txt"
    sql = ("insert into {}(url, file_size, timestamps, datetime)"
           " values (%s, %s, %s, %s)").format("statistics_only_osslog")

    db = get_mysql_db()
    try:
        cursor = db.cursor()
        try:
            # The context manager guarantees the file is closed on error.
            with open(path, "r") as source:
                batch = []
                for line in source:
                    fields = line.split()
                    if not fields:
                        # Blank lines (e.g. a trailing newline) carry no row.
                        continue
                    batch.append((fields[0], int(fields[1]),
                                  format_time(fields[2]), date_str))
                    if len(batch) >= batch_size:
                        cursor.executemany(sql, batch)
                        db.commit()
                        batch = []
                # Flush whatever is left over from the last partial batch.
                if batch:
                    cursor.executemany(sql, batch)
                    db.commit()
        finally:
            cursor.close()
    finally:
        db.close()
예제 #3
0
def compare_table(year, month, day, count_dict, only_osslog_count):
    """Store one day's comparison counters in ``statistics_compare_table``.

    Args:
        year: Year as a string.
        month: Month as a string.
        day: Day as a string.
        count_dict: Mapping that provides ``"common_count"`` and
            ``"only_mongo_count"``.
        only_osslog_count: Value stored in the ``only_osslog_count`` column.
    """
    # The row is assembled before connecting so that a missing key in
    # ``count_dict`` fails without opening a connection.
    date_str = year + '-' + month + '-' + day
    row = (
        count_dict["common_count"],
        count_dict["only_mongo_count"],
        only_osslog_count,
        date_str,
    )

    connection = get_mysql_db()
    cur = connection.cursor()

    table = "statistics_compare_table"
    statement = (
        'insert into {}(common_count, only_mongo_count, only_osslog_count, datetime)'
        ' values(%s, %s, %s, %s)'
    ).format(table)

    cur.execute(statement, row)
    connection.commit()

    cur.close()
    connection.close()
예제 #4
0
def delete_mysql_table(save_days):
    """Purge one expired day of rows from the per-day detail tables.

    Computes the date ``save_days + 1`` days before now and deletes the rows
    whose ``datetime`` column equals that date from
    ``statistics_mongo_compare_osslog`` and ``statistics_only_osslog``.  Only
    that exact date is removed; older rows are not touched.  A commit and a
    warning-level log entry are issued for each table from which at least one
    row was deleted.

    Args:
        save_days: Number of days of data to retain.
    """
    time_str = (datetime.datetime.now() -
                datetime.timedelta(days=save_days + 1)).strftime("%Y-%m-%d")
    tables_list = ['statistics_mongo_compare_osslog', 'statistics_only_osslog']

    db = get_mysql_db()
    try:
        cursor = db.cursor()
        try:
            for table in tables_list:
                sql = 'delete from {} where datetime=%s'.format(table)
                # Bind the value as a one-element sequence: that is the
                # DB-API form every driver accepts (a bare string is not).
                result = cursor.execute(sql, (time_str,))
                if result > 0:
                    db.commit()
                    logging.warning("Delete %s success! datetime: %s ...",
                                    table, time_str)
        finally:
            cursor.close()
    finally:
        # Release the connection, as the other table helpers do.
        db.close()
예제 #5
0
def size_table(year, month, day, common_size, only_osslog_size,
               danews_data_size, donews_test1_size, wangleilog_size):
    """Store one day's size figures in ``statistics_size_table``.

    Args:
        year: Year as a string.
        month: Month as a string.
        day: Day as a string.
        common_size: Value for the ``common_size`` column.
        only_osslog_size: Value for the ``only_osslog_size`` column.
        danews_data_size: Value for the ``danews_data_size`` column.
        donews_test1_size: Value for the ``donews_test1_size`` column.
        wangleilog_size: Value for the ``wangleilog_size`` column.
    """
    connection = get_mysql_db()
    cur = connection.cursor()

    table = "statistics_size_table"
    statement = (
        'insert into {}(common_size, only_osslog_size, datetime,'
        ' danews_data_size, donews_test1_size, wangleilog_size)'
        ' values(%s, %s, %s, %s, %s, %s)'
    ).format(table)

    # Value order follows the column list above (datetime is third).
    date_str = year + '-' + month + '-' + day
    row = (
        common_size,
        only_osslog_size,
        date_str,
        danews_data_size,
        donews_test1_size,
        wangleilog_size,
    )

    cur.execute(statement, row)
    connection.commit()

    cur.close()
    connection.close()
예제 #6
0
def insert_mysql(year, month, day):
    """Bulk-load the ``_id`` column of one day's mongo export into MySQL.

    Reads ``target_file/mongo_to_file<year><month><day>.txt`` and inserts the
    first whitespace-separated field of every non-blank line into
    ``log_to_mongo0911_id``.  Rows are sent in batches of 2000 and committed
    after every batch.

    Args:
        year: Year as a string; used only to build the file name.
        month: Month as a string.
        day: Day as a string.
    """
    batch_size = 2000
    path = "target_file/mongo_to_file" + year + month + day + ".txt"
    sql = ("insert into {}(_id)"
           " values (%s)").format("log_to_mongo0911_id")

    db = get_mysql_db()
    try:
        cursor = db.cursor()
        try:
            # The context manager guarantees the file is closed on error.
            with open(path, "r") as source:
                batch = []
                for line in source:
                    fields = line.split()
                    if not fields:
                        # Blank lines (e.g. a trailing newline) carry no id.
                        continue
                    batch.append((fields[0], ))
                    if len(batch) >= batch_size:
                        cursor.executemany(sql, batch)
                        db.commit()
                        batch = []
                # Flush whatever is left over from the last partial batch.
                if batch:
                    cursor.executemany(sql, batch)
                    db.commit()
        finally:
            cursor.close()
    finally:
        # Release the connection as well; it was previously left open.
        db.close()
예제 #7
0
def export(year, month, day):
    """Export one day's ``statistics_result_table`` report to an .xlsx file.

    Runs three report queries for the given date and writes each result set
    to its own worksheet (``table_statistics_result_table1`` .. ``3``) of
    ``csv_file/statistics_result_table<year>-<month>-<day>.xlsx``.  Row 0 of
    every sheet holds the column names reported by the cursor.

    Cell values whose text contains a ``.`` or equals ``'0'`` are converted to
    ``decimal.Decimal`` (after removing thousands separators) so that the
    strings produced by MySQL ``format()`` land in Excel as numbers.  Values
    that match that test but are not numeric are written unchanged.

    Args:
        year: Year as a string.
        month: Month as a string.
        day: Day as a string.
    """
    table_name = 'statistics_result_table'
    date_str = year + '-' + month + '-' + day

    # The date is bound as a parameter in both places it appears (sub-query
    # and outer query) rather than being spliced into the SQL text.
    sql1 = """
        select
            media as 库名,
            news_count as 新闻数,
            total_count as 总资源数,
            total_count_1 as Mongo总资源数,
            format(total_size/1024/1024/1024, 2) as OSS日志存储总量G,
            format(total_size_1/1024/1024/1024, 2) as Mongo共有数据存储总量G,
            format(avg_size_1/1024/1024, 2) as 平均大小M,
            format(total_size_1*100/(select sum(total_size_1) from statistics_result_table where
             datetime = %s), 2) as 百分占比
        from statistics_result_table where datetime = %s;
    """

    sql2 = """
        select
        media as 库名,
        news_count as 新闻数,
        img_location_count_1 as 图片数据量,
        format(img_location_size_1/1024/1024/1024, 2) as 图片大小G,
        small_img_location_count_1 as 缩略图数据量,
        format(small_img_location_size_1/1024/1024/1024, 2) as 缩略图大小G,
        video_location_count_1 as 视频数据量,
        format(video_location_size_1/1024/1024/1024, 2) as 视频大小G
      from {} where datetime = %s;
    """.format(table_name)

    sql3 = """
            select
            media as 库名,
            ifnull(format(img_location_count_1/news_count, 2), 0) as 平均图片数量,
            ifnull(format(small_img_location_count_1/news_count, 2), 0) as 平均缩略图数量,
            ifnull(format(video_location_count_1/news_count, 2), 0) as 平均视频数量,
            ifnull(format(img_location_size_1/img_location_count_1/1024, 2), 0) as 单条数据平均图片大小kb,
            ifnull(format(small_img_location_size_1/small_img_location_count_1/1024, 2), 0) as 单条数据平均缩略图大小kb,
            ifnull(format(video_location_size_1/video_location_count_1/1024, 2), 0) as 单条数据视频大小kb,
            format(ifnull(img_location_size_1/img_location_count_1/1024, 0) +
            ifnull(small_img_location_size_1/small_img_location_count_1/1024, 0) +
            ifnull(video_location_size_1/video_location_count_1/1024, 0), 2) as 平均单条数据
        from {} where datetime = %s;
    """.format(table_name)

    queries = [
        (sql1, (date_str, date_str)),
        (sql2, (date_str, )),
        (sql3, (date_str, )),
    ]

    conn = get_mysql_db()
    try:
        cursor = conn.cursor()
        try:
            workbook = xlsxwriter.Workbook('csv_file/statistics_result_table' +
                                           year + '-' + month + '-' + day +
                                           '.xlsx')
            for count, (sql, args) in enumerate(queries, 1):
                cursor.execute(sql, args)
                # Fetch every row of the result set.
                results = cursor.fetchall()
                # Column metadata; the first item of each entry is the name.
                fields = cursor.description
                sheet = workbook.add_worksheet('table_' + table_name +
                                               str(count))

                # Header row.
                for col, field in enumerate(fields):
                    sheet.write(0, col, field[0])

                # Data rows start at row 1, below the header.
                for row, record in enumerate(results, 1):
                    for col in range(len(fields)):
                        value = record[col]
                        text = str(value)
                        if "." in text or text == '0':
                            try:
                                value = decimal.Decimal(text.replace(",", ''))
                            except decimal.InvalidOperation:
                                # Not a number after all (e.g. a name that
                                # contains a dot): keep the original value.
                                pass
                        sheet.write(row, col, value)
            # Closed only on success so a failed run leaves no partial report.
            workbook.close()
        finally:
            cursor.close()
    finally:
        conn.close()
예제 #8
0
def export(year, month, day):
    """Export one day's ``statistics_result_table`` report with xlwt.

    Runs three report queries for the given date and writes each result set
    to its own sheet (``table_statistics_result_table1`` .. ``3``).  Row 0 of
    every sheet holds the column names reported by the cursor; every data cell
    is written as text.  The workbook is saved once, after all sheets are
    filled, to ``csv_file/statistics_result_table<year>-<month>-<day>.csv``.

    NOTE(review): the output is an xlwt workbook even though the file name
    ends in ``.csv``; the name is kept as-is because callers may depend on it.

    Args:
        year: Year as a string.
        month: Month as a string.
        day: Day as a string.
    """
    table_name = 'statistics_result_table'
    date_str = year + '-' + month + '-' + day

    # The date is bound as a parameter in both places it appears (sub-query
    # and outer query) rather than being spliced into the SQL text.
    sql1 = """
        select
            media as 库名,
            news_count as 新闻数,
            total_count as 总资源数,
            total_count_1 as Mongo总资源数,
            format(total_size/1024/1024/1024, 2) as OSS日志存储总量G,
            format(total_size_1/1024/1024/1024, 2) as Mongo共有数据存储总量G,
            format(avg_size_1/1024/1024, 2) as 平均大小M,
            format(total_size_1*100/(select sum(total_size_1) from statistics_result_table where
             datetime = %s), 2) as 百分占比
        from statistics_result_table where datetime = %s;
    """

    sql2 = """
        select
        media as 库名,
        news_count as 新闻数,
        img_location_count_1 as 图片数据量,
        format(img_location_size_1/1024/1024/1024, 2) as 图片大小G,
        small_img_location_count_1 as 缩略图数据量,
        format(small_img_location_size_1/1024/1024/1024, 2) as 缩略图大小G,
        video_location_count_1 as 视频数据量,
        format(video_location_size_1/1024/1024/1024, 2) as 视频大小G
      from {} where datetime = %s;
    """.format(table_name)

    sql3 = """
            select
            media as 库名,
            ifnull(format(img_location_count_1/news_count, 2), 0) as 平均图片数量,
            ifnull(format(small_img_location_count_1/news_count, 2), 0) as 平均缩略图数量,
            ifnull(format(video_location_count_1/news_count, 2), 0) as 平均视频数量,
            ifnull(format(img_location_size_1/img_location_count_1/1024, 2), 0) as 单条数据平均图片大小kb,
            ifnull(format(small_img_location_size_1/small_img_location_count_1/1024, 2), 0) as 单条数据平均缩略图大小kb,
            ifnull(format(video_location_size_1/video_location_count_1/1024, 2), 0) as 单条数据视频大小kb,
            format(ifnull(img_location_size_1/img_location_count_1/1024, 0) +
            ifnull(small_img_location_size_1/small_img_location_count_1/1024, 0) +
            ifnull(video_location_size_1/video_location_count_1/1024, 0), 2) as 平均单条数据
        from {} where datetime = %s;
    """.format(table_name)

    queries = [
        (sql1, (date_str, date_str)),
        (sql2, (date_str, )),
        (sql3, (date_str, )),
    ]

    conn = get_mysql_db()
    try:
        cursor = conn.cursor()
        try:
            workbook = xlwt.Workbook()
            for count, (sql, args) in enumerate(queries, 1):
                cursor.execute(sql, args)
                # Fetch every row of the result set.
                results = cursor.fetchall()
                # Column metadata; the first item of each entry is the name.
                fields = cursor.description
                sheet = workbook.add_sheet('table_' + table_name + str(count),
                                           cell_overwrite_ok=True)

                # Header row.
                for col, field in enumerate(fields):
                    sheet.write(0, col, field[0])

                # Data rows start at row 1; every value is stored as text.
                for row, record in enumerate(results, 1):
                    for col in range(len(fields)):
                        sheet.write(row, col, u'%s' % record[col])

            # Save once after all sheets are complete instead of rewriting
            # the file after every query.
            workbook.save('csv_file/statistics_result_table' + year + '-' +
                          month + '-' + day + '.csv')
        finally:
            cursor.close()
    finally:
        conn.close()
예제 #9
0
def result_table(year, month, day, _id_media):
    """Aggregate per-media resource statistics and store them for one day.

    The media names are the collection names of the ``dmt_jh_data`` and
    ``crawled_TTH_web_page`` Mongo databases, with ``miaopai`` excluded from
    the latter.  For every media name, two aggregations over
    ``statistics_mongo_compare_osslog`` are run for the given date, grouped
    by ``posit``:

    * rows with ``flag=1``  -> keys suffixed with ``_1``
    * all rows              -> keys without a suffix

    For the three known posits (``img_location``, ``small_img_location``,
    ``video_location``) the count and size default to 0 when the query
    returns no group.  Totals are derived for both aggregations.  Averages and
    percentages are derived for the ``flag=1`` aggregation only, rounded to 4
    decimals, and are 0 when the denominator is not positive.  One row per
    media is then inserted into ``statistics_result_table`` and committed.

    Args:
        year: Year as a string.
        month: Month as a string.
        day: Day as a string.
        _id_media: Mapping from media name to the value stored in the
            ``news_count`` column.

    Raises:
        KeyError: If a media name is missing from ``_id_media``.
    """
    date_str = year + '-' + month + '-' + day
    posits = ("img_location", "small_img_location", "video_location")

    select_sql = ('select media, posit, count(*) as count,'
                  ' sum(file_size) as size from {} where media=%s and flag=1 and datetime=%s group by posit;'
                  ).format("statistics_mongo_compare_osslog")

    select_sql_other = ('select media, posit, count(*) as count,'
                        ' sum(file_size) as size from {} where media=%s and datetime=%s group by posit;'
                        ).format("statistics_mongo_compare_osslog")

    # Built once; it does not depend on the media being inserted.
    insert_sql = ("insert into {}(media, news_count, total_count, total_count_1, total_size, total_size_1,"
                  " avg_size_1, img_location_count, img_location_count_1, img_location_size, img_location_size_1,"
                  " percent_img_location_1, avg_img_location_size_1,"
                  " small_img_location_count, small_img_location_count_1, small_img_location_size,"
                  " small_img_location_size_1, percent_small_img_location_1, "
                  "avg_small_img_location_size_1, video_location_count, video_location_count_1,"
                  " video_location_size, video_location_size_1, percent_video_location_1,"
                  " avg_video_location_size_1, datetime)"
                  " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s,"
                  " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
                  ).format("statistics_result_table")

    # Stats keys in the same order as the insert column list (the trailing
    # ``datetime`` column is appended separately).
    insert_keys = (
        "media",
        "news_count",
        "total_count",
        "total_count_1",
        "total_size",
        "total_size_1",
        "avg_size_1",
        "img_location_count",
        "img_location_count_1",
        "img_location_size",
        "img_location_size_1",
        "percent_img_location_1",
        "avg_img_location_size_1",
        "small_img_location_count",
        "small_img_location_count_1",
        "small_img_location_size",
        "small_img_location_size_1",
        "percent_small_img_location_1",
        "avg_small_img_location_size_1",
        "video_location_count",
        "video_location_count_1",
        "video_location_size",
        "video_location_size_1",
        "percent_video_location_1",
        "avg_video_location_size_1",
    )

    def ratio(numerator, denominator):
        # Quotient rounded to 4 decimals, or 0 for a non-positive denominator.
        if denominator > 0:
            return float('%.4f' % (float(numerator) / denominator))
        return 0

    def load_posit_stats(cursor, sql, media, suffix, stats):
        # Run one aggregation and record per-posit counts/sizes plus totals
        # under keys ending in ``suffix``.
        cursor.execute(sql, (media, date_str))
        for item in cursor.fetchall():
            stats[item[1] + "_count" + suffix] = item[2]
            stats[item[1] + "_size" + suffix] = float(item[3])

        total_count = 0
        total_size = 0
        for posit in posits:
            # Posits with no rows for this media/date default to zero.
            stats.setdefault(posit + "_count" + suffix, 0)
            stats.setdefault(posit + "_size" + suffix, 0)
            total_count += stats[posit + "_count" + suffix] or 0
            total_size += stats[posit + "_size" + suffix] or 0
        stats["total_count" + suffix] = total_count
        stats["total_size" + suffix] = total_size

    db = get_mysql_db()
    try:
        cursor = db.cursor()
        try:
            client = get_mongo(MONGO_PARA_N)
            dmt_list = client['dmt_jh_data'].collection_names()
            # Filter instead of list.remove() so that a missing "miaopai"
            # collection does not raise ValueError.
            crawled_list = [
                name
                for name in client['crawled_TTH_web_page'].collection_names()
                if name != u"miaopai"
            ]
            media_list = dmt_list + crawled_list

            params = {}
            for media in media_list:
                stats = params[media] = {}

                # flag=1 rows: counts, sizes, totals and derived ratios.
                load_posit_stats(cursor, select_sql, media, "_1", stats)
                total_count_1 = stats["total_count_1"]
                stats["avg_size_1"] = ratio(stats["total_size_1"],
                                            total_count_1)
                for posit in posits:
                    count_1 = stats[posit + "_count_1"] or 0
                    size_1 = stats[posit + "_size_1"] or 0
                    stats["percent_" + posit + "_1"] = ratio(count_1,
                                                             total_count_1)
                    stats["avg_" + posit + "_size_1"] = ratio(size_1, count_1)

                # All rows regardless of flag: counts, sizes and totals only.
                load_posit_stats(cursor, select_sql_other, media, "", stats)

            # Inserts start only after every aggregation has succeeded.
            for media_item, stats in params.items():
                stats["media"] = media_item
                stats["news_count"] = _id_media[media_item]
                row = tuple(stats[key] for key in insert_keys) + (date_str, )
                cursor.execute(insert_sql, row)
                db.commit()
        finally:
            cursor.close()
    finally:
        db.close()
예제 #10
0
def percent_table(year, month, day):
    """Compute each media's share of the daily totals and store it.

    Reads ``news_count``, ``total_count_1``, ``img_location_count_1``,
    ``small_img_location_count_1`` and ``video_location_count_1`` for every
    media from ``statistics_result_table`` for the given date.  Each value is
    divided by the sum of its column over all fetched rows and rounded to 4
    decimals.  One row per media is inserted into
    ``statistics_percent_table`` and committed.  A column whose total is zero
    yields a share of 0.0 instead of raising ``ZeroDivisionError``.

    Args:
        year: Year as a string.
        month: Month as a string.
        day: Day as a string.
    """
    date_str = year + '-' + month + '-' + day

    sql = ('select media, news_count, total_count_1, img_location_count_1, small_img_location_count_1, '
           'video_location_count_1 from {} where datetime=%s'
           ).format('statistics_result_table')

    insert_sql = ('insert into {}(media, news_percent, data_percent, img_location_percent,'
                  ' small_img_location_percent, video_location_percent, datetime) values(%s, %s, %s, %s, %s, %s, %s)'
                  ).format("statistics_percent_table")

    def share(count, total):
        # Fraction of ``total`` rounded to 4 decimals; 0.0 if total is zero.
        if not total:
            return 0.0
        return float('%.4f' % (float(count) / total))

    db = get_mysql_db()
    try:
        cursor = db.cursor()
        try:
            cursor.execute(sql, (date_str, ))
            result = cursor.fetchall()

            # Column order in both containers: news, data, img, small_img,
            # video -- the same order as the insert statement's percentages.
            totals = [0, 0, 0, 0, 0]
            media_counts = {}
            for item in result:
                counts = tuple(item[1:6])
                media_counts[item[0]] = counts
                for index, count in enumerate(counts):
                    totals[index] += count

            for media, counts in media_counts.items():
                # Each count is divided by the total of its own column; the
                # image share now uses the image count, not the overall one.
                shares = tuple(
                    share(count, total)
                    for count, total in zip(counts, totals))
                cursor.execute(insert_sql, (media, ) + shares + (date_str, ))
                db.commit()
        finally:
            cursor.close()
    finally:
        db.close()