def insert_mysql(year, month, day):
    """Load the mongo-vs-osslog comparison file for one day into MySQL in batches."""
    db = get_mysql_db()
    cursor = db.cursor()
    sql = "insert into {}(_id, media, posit, url, flag, file_size, datetime)" \
          " values (%s, %s, %s, %s, %s, %s, %s)".format("statistics_mongo_compare_osslog")
    f = open("target_file/mongo_compare_osslog" + year + month + day + ".txt", "r")
    content = f.readlines()
    lis = list()
    for line in content:
        line_list = line.split()
        _id = line_list[0].strip()
        media = line_list[1].strip()
        posit = line_list[2].strip()
        url = line_list[3].strip()
        flag = int(line_list[4].strip())
        file_size = int(line_list[5].strip())
        param = (_id, media, posit, url, flag, file_size, year + '-' + month + '-' + day)
        lis.append(param)
        # Flush to MySQL in batches of 2000 rows to keep memory and packet size bounded.
        if len(lis) >= 2000:
            cursor.executemany(sql, lis)
            db.commit()
            lis = list()
    # Insert whatever is left in the final, partially filled batch.
    if lis:
        cursor.executemany(sql, lis)
        db.commit()
    cursor.close()
    db.close()
    f.close()
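# get_mysql_db() and format_time() are project helpers defined elsewhere; the
# sketch below covers only the connection helper, assuming pymysql and
# placeholder credentials (host, user, password and database name are not part
# of this file and are illustrative only).
import pymysql


def get_mysql_db():
    # Return a fresh connection; callers commit and close it themselves.
    return pymysql.connect(host="127.0.0.1", port=3306, user="statistics",
                           password="********", db="statistics", charset="utf8mb4")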
def insert_mysql_only_osslog(year, month, day):
    """Load the osslog-only records for one day into MySQL in batches."""
    db = get_mysql_db()
    cursor = db.cursor()
    sql = "insert into {}(url, file_size, timestamps, datetime)" \
          " values (%s, %s, %s, %s)".format("statistics_only_osslog")
    f = open("target_file/only_osslog" + year + month + day + ".txt", "r")
    content = f.readlines()
    lis = list()
    for line in content:
        line_list = line.split()
        url = line_list[0].strip()
        file_size = int(line_list[1].strip())
        timestamp = format_time(line_list[2])
        param = (url, file_size, timestamp, year + '-' + month + '-' + day)
        lis.append(param)
        # Flush in batches of 2000 rows.
        if len(lis) >= 2000:
            cursor.executemany(sql, lis)
            db.commit()
            lis = list()
    if lis:
        cursor.executemany(sql, lis)
        db.commit()
    cursor.close()
    db.close()
    f.close()
def compare_table(year, month, day, count_dict, only_osslog_count):
    params = (count_dict["common_count"], count_dict["only_mongo_count"], only_osslog_count,
              year + '-' + month + '-' + day)
    db = get_mysql_db()
    cursor = db.cursor()
    sql = 'insert into {}(common_count, only_mongo_count, only_osslog_count, datetime)' \
          ' values(%s, %s, %s, %s)'.format("statistics_compare_table")
    cursor.execute(sql, params)
    db.commit()
    cursor.close()
    db.close()
import datetime
import logging


def delete_mysql_table(save_days):
    """Delete rows older than the retention window from the per-day detail tables."""
    time_str = (datetime.datetime.now() - datetime.timedelta(days=save_days + 1)).strftime("%Y-%m-%d")
    db = get_mysql_db()
    cursor = db.cursor()
    tables_list = ['statistics_mongo_compare_osslog', 'statistics_only_osslog']
    for table in tables_list:
        sql = 'delete from {} where datetime=%s'.format(table)
        result = cursor.execute(sql, (time_str,))
        if result > 0:
            db.commit()
            logging.warning("Delete %s success! datetime: %s ...", table, time_str)
    cursor.close()
    db.close()
def size_table(year, month, day, common_size, only_osslog_size, danews_data_size,
               donews_test1_size, wangleilog_size):
    db = get_mysql_db()
    cursor = db.cursor()
    insert_sql = 'insert into {}(common_size, only_osslog_size, datetime,' \
                 ' danews_data_size, donews_test1_size, wangleilog_size) values(%s, %s, %s, %s, %s, %s)'\
        .format("statistics_size_table")
    params = (common_size, only_osslog_size, year + '-' + month + '-' + day,
              danews_data_size, donews_test1_size, wangleilog_size)
    cursor.execute(insert_sql, params)
    db.commit()
    cursor.close()
    db.close()
def insert_mysql(year, month, day):
    """Load the exported Mongo _id list for one day into MySQL in batches."""
    db = get_mysql_db()
    cursor = db.cursor()
    sql = "insert into {}(_id)" \
          " values (%s)".format("log_to_mongo0911_id")
    f = open("target_file/mongo_to_file" + year + month + day + ".txt", "r")
    content = f.readlines()
    lis = list()
    for line in content:
        line_list = line.split()
        _id = line_list[0].strip()
        param = (_id, )
        lis.append(param)
        if len(lis) >= 2000:
            cursor.executemany(sql, lis)
            db.commit()
            lis = list()
    if lis:
        cursor.executemany(sql, lis)
        db.commit()
    cursor.close()
    db.close()
    f.close()
import decimal

import xlsxwriter


def export(year, month, day):
    """Export the day's statistics_result_table views to an .xlsx workbook, one sheet per query."""
    table_name = 'statistics_result_table'
    date_str = year + '-' + month + '-' + day
    conn = get_mysql_db()
    cursor = conn.cursor()
    sql1 = """
        select media as 库名, news_count as 新闻数, total_count as 总资源数, total_count_1 as Mongo总资源数,
        format(total_size/1024/1024/1024, 2) as OSS日志存储总量G,
        format(total_size_1/1024/1024/1024, 2) as Mongo共有数据存储总量G,
        format(avg_size_1/1024/1024, 2) as 平均大小M,
        format(total_size_1*100/(select sum(total_size_1) from statistics_result_table
        where datetime ='{}'), 2) as 百分占比
        from statistics_result_table where datetime = %s;
    """.format(date_str)
    sql2 = """
        select media as 库名, news_count as 新闻数,
        img_location_count_1 as 图片数据量, format(img_location_size_1/1024/1024/1024, 2) as 图片大小G,
        small_img_location_count_1 as 缩略图数据量, format(small_img_location_size_1/1024/1024/1024, 2) as 缩略图大小G,
        video_location_count_1 as 视频数据量, format(video_location_size_1/1024/1024/1024, 2) as 视频大小G
        from {} where datetime = %s;
    """.format("statistics_result_table")
    sql3 = """
        select media as 库名,
        ifnull(format(img_location_count_1/news_count, 2), 0) as 平均图片数量,
        ifnull(format(small_img_location_count_1/news_count, 2), 0) as 平均缩略图数量,
        ifnull(format(video_location_count_1/news_count, 2), 0) as 平均视频数量,
        ifnull(format(img_location_size_1/img_location_count_1/1024, 2), 0) as 单条数据平均图片大小kb,
        ifnull(format(small_img_location_size_1/small_img_location_count_1/1024, 2), 0) as 单条数据平均缩略图大小kb,
        ifnull(format(video_location_size_1/video_location_count_1/1024, 2), 0) as 单条数据视频大小kb,
        format(ifnull(img_location_size_1/img_location_count_1/1024, 0)
        + ifnull(small_img_location_size_1/small_img_location_count_1/1024, 0)
        + ifnull(video_location_size_1/video_location_count_1/1024, 0), 2) as 平均单条数据
        from {} where datetime = %s;
    """.format("statistics_result_table")
    workbook = xlsxwriter.Workbook('csv_file/statistics_result_table' + date_str + '.xlsx')
    count = 1
    for sql in [sql1, sql2, sql3]:
        cursor.execute(sql, (date_str,))
        # fetch all rows of the result set
        results = cursor.fetchall()
        # column names come from the cursor description
        fields = cursor.description
        sheet = workbook.add_worksheet('table_' + table_name + str(count))
        count += 1
        # header row: one cell per column name
        for field in range(0, len(fields)):
            sheet.write(0, field, fields[field][0])
        # data rows; MySQL format() returns comma-grouped strings, so strip the
        # commas and convert numeric-looking values to Decimal before writing
        for row in range(1, len(results) + 1):
            for col in range(0, len(fields)):
                value = results[row - 1][col]
                if str(value).find(".") != -1 or str(value) == '0':
                    if str(value).find(",") != -1:
                        value = decimal.Decimal(str(value).replace(",", ''))
                    value = decimal.Decimal(value)
                sheet.write(row, col, value)
    workbook.close()
    cursor.close()
    conn.close()
import xlwt


def export(year, month, day):
    """Export the same three queries with xlwt instead of xlsxwriter."""
    table_name = 'statistics_result_table'
    date_str = year + '-' + month + '-' + day
    conn = get_mysql_db()
    cursor = conn.cursor()
    sql1 = """
        select media as 库名, news_count as 新闻数, total_count as 总资源数, total_count_1 as Mongo总资源数,
        format(total_size/1024/1024/1024, 2) as OSS日志存储总量G,
        format(total_size_1/1024/1024/1024, 2) as Mongo共有数据存储总量G,
        format(avg_size_1/1024/1024, 2) as 平均大小M,
        format(total_size_1*100/(select sum(total_size_1) from statistics_result_table
        where datetime ='{}'), 2) as 百分占比
        from statistics_result_table where datetime = %s;
    """.format(date_str)
    sql2 = """
        select media as 库名, news_count as 新闻数,
        img_location_count_1 as 图片数据量, format(img_location_size_1/1024/1024/1024, 2) as 图片大小G,
        small_img_location_count_1 as 缩略图数据量, format(small_img_location_size_1/1024/1024/1024, 2) as 缩略图大小G,
        video_location_count_1 as 视频数据量, format(video_location_size_1/1024/1024/1024, 2) as 视频大小G
        from {} where datetime = %s;
    """.format("statistics_result_table")
    sql3 = """
        select media as 库名,
        ifnull(format(img_location_count_1/news_count, 2), 0) as 平均图片数量,
        ifnull(format(small_img_location_count_1/news_count, 2), 0) as 平均缩略图数量,
        ifnull(format(video_location_count_1/news_count, 2), 0) as 平均视频数量,
        ifnull(format(img_location_size_1/img_location_count_1/1024, 2), 0) as 单条数据平均图片大小kb,
        ifnull(format(small_img_location_size_1/small_img_location_count_1/1024, 2), 0) as 单条数据平均缩略图大小kb,
        ifnull(format(video_location_size_1/video_location_count_1/1024, 2), 0) as 单条数据视频大小kb,
        format(ifnull(img_location_size_1/img_location_count_1/1024, 0)
        + ifnull(small_img_location_size_1/small_img_location_count_1/1024, 0)
        + ifnull(video_location_size_1/video_location_count_1/1024, 0), 2) as 平均单条数据
        from {} where datetime = %s;
    """.format("statistics_result_table")
    workbook = xlwt.Workbook()
    count = 1
    for sql in [sql1, sql2, sql3]:
        cursor.execute(sql, (date_str,))
        # reset the cursor position if re-running a query
        # cursor.scroll(0, mode='absolute')
        # fetch all rows of the result set
        results = cursor.fetchall()
        # column names come from the cursor description
        fields = cursor.description
        sheet = workbook.add_sheet('table_' + table_name + str(count), cell_overwrite_ok=True)
        count += 1
        # header row: one cell per column name
        for field in range(0, len(fields)):
            sheet.write(0, field, fields[field][0])
        # data rows, written as unicode strings
        for row in range(1, len(results) + 1):
            for col in range(0, len(fields)):
                value = u'%s' % results[row - 1][col]
                sheet.write(row, col, value)
    # xlwt writes a binary .xls workbook even though this output path ends in .csv
    workbook.save('csv_file/statistics_result_table' + date_str + '.csv')
    cursor.close()
    conn.close()
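# get_mongo() and the MONGO_PARA_N constant used by result_table() below are
# defined elsewhere in the project. A minimal sketch, assuming MONGO_PARA_N is a
# MongoDB connection URI accepted by pymongo's MongoClient (the URI below is a
# placeholder, not the real address):
from pymongo import MongoClient

MONGO_PARA_N = "mongodb://127.0.0.1:27017/"  # placeholder URI


def get_mongo(mongo_para):
    # Return a client whose databases (dmt_jh_data, crawled_TTH_web_page) are
    # enumerated by result_table().
    return MongoClient(mongo_para)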
def result_table(year, month, day, _id_media):
    db = get_mysql_db()
    cursor = db.cursor()
    params = {}
    # Rows present in both Mongo and the OSS log (flag=1), grouped by position.
    select_sql = 'select media, posit, count(*) as count,' \
                 ' sum(file_size) as size from {} where media=%s and flag=1 and datetime=%s group by posit;'\
        .format("statistics_mongo_compare_osslog")
    # All rows for the media, regardless of flag, grouped by position.
    select_sql_other = 'select media, posit, count(*) as count,' \
                       ' sum(file_size) as size from {} where media=%s and datetime=%s group by posit;'\
        .format("statistics_mongo_compare_osslog")
    client = get_mongo(MONGO_PARA_N)
    dmt_list = client['dmt_jh_data'].collection_names()
    crawled_list = client['crawled_TTH_web_page'].collection_names()
    crawled_list.remove(u"miaopai")
    media_list = dmt_list + crawled_list
    for media in media_list:
        para = (media, year + '-' + month + '-' + day)
        cursor.execute(select_sql, para)
        result = cursor.fetchall()
        params[media] = {}
        for item in result:
            params[media][item[1] + "_count_1"] = item[2]
            params[media][item[1] + "_size_1"] = float(item[3])
        for posit in ["img_location_count_1", "small_img_location_count_1", "video_location_count_1"]:
            if posit not in params[media]:
                params[media][posit] = 0
        for posit in ["img_location_size_1", "small_img_location_size_1", "video_location_size_1"]:
            if posit not in params[media]:
                params[media][posit] = 0
        img_location_size_1 = params[media].get("img_location_size_1") or 0
        img_location_count_1 = params[media].get("img_location_count_1") or 0
        small_img_location_size_1 = params[media].get("small_img_location_size_1") or 0
        small_img_location_count_1 = params[media].get("small_img_location_count_1") or 0
        video_location_size_1 = params[media].get("video_location_size_1") or 0
        video_location_count_1 = params[media].get("video_location_count_1") or 0
        params[media]["total_count_1"] = img_location_count_1 + small_img_location_count_1 + video_location_count_1
        params[media]["total_size_1"] = img_location_size_1 + small_img_location_size_1 + video_location_size_1
        if params[media]["total_count_1"] > 0:
            params[media]["avg_size_1"] = \
                float('%.4f' % (float(params[media]["total_size_1"]) / params[media]["total_count_1"]))
            params[media]["percent_img_location_1"] = \
                float('%.4f' % (float(img_location_count_1) / params[media]["total_count_1"]))
            params[media]["percent_small_img_location_1"] = \
                float('%.4f' % (float(small_img_location_count_1) / params[media]["total_count_1"]))
            params[media]["percent_video_location_1"] = \
                float('%.4f' % (float(video_location_count_1) / params[media]["total_count_1"]))
        else:
            params[media]["avg_size_1"] = 0
            params[media]["percent_img_location_1"] = 0
            params[media]["percent_small_img_location_1"] = 0
            params[media]["percent_video_location_1"] = 0
        if img_location_count_1 > 0:
            params[media]["avg_img_location_size_1"] = \
                float('%.4f' % (float(img_location_size_1) / img_location_count_1))
        else:
            params[media]["avg_img_location_size_1"] = 0
        if small_img_location_count_1 > 0:
            params[media]["avg_small_img_location_size_1"] = \
                float('%.4f' % (float(small_img_location_size_1) / small_img_location_count_1))
        else:
            params[media]["avg_small_img_location_size_1"] = 0
        if video_location_count_1 > 0:
            params[media]["avg_video_location_size_1"] = \
                float('%.4f' % (float(video_location_size_1) / video_location_count_1))
        else:
            params[media]["avg_video_location_size_1"] = 0
    for media_other in media_list:
        para = (media_other, year + '-' + month + '-' + day)
        cursor.execute(select_sql_other, para)
        result = cursor.fetchall()
        for item in result:
            params[media_other][item[1] + "_count"] = item[2]
            params[media_other][item[1] + "_size"] = float(item[3])
        for posit in ["img_location_count", "small_img_location_count", "video_location_count"]:
            if posit not in params[media_other]:
                params[media_other][posit] = 0
        for posit in ["img_location_size", "small_img_location_size", "video_location_size"]:
            if posit not in params[media_other]:
                params[media_other][posit] = 0
        img_location_size = params[media_other].get("img_location_size") or 0
        img_location_count = params[media_other].get("img_location_count") or 0
        small_img_location_size = params[media_other].get("small_img_location_size") or 0
        small_img_location_count = params[media_other].get("small_img_location_count") or 0
        video_location_size = params[media_other].get("video_location_size") or 0
        video_location_count = params[media_other].get("video_location_count") or 0
        params[media_other]["total_count"] = img_location_count + small_img_location_count + video_location_count
        params[media_other]["total_size"] = img_location_size + small_img_location_size + video_location_size
    insert_sql = "insert into {}(media, news_count, total_count, total_count_1, total_size, total_size_1," \
                 " avg_size_1, img_location_count, img_location_count_1, img_location_size, img_location_size_1," \
                 " percent_img_location_1, avg_img_location_size_1," \
                 " small_img_location_count, small_img_location_count_1, small_img_location_size," \
                 " small_img_location_size_1, percent_small_img_location_1," \
                 " avg_small_img_location_size_1, video_location_count, video_location_count_1," \
                 " video_location_size, video_location_size_1, percent_video_location_1," \
                 " avg_video_location_size_1, datetime)" \
                 " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s," \
                 " %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)".format("statistics_result_table")
    for media_item in params:
        params[media_item]["media"] = media_item
        params[media_item]["news_count"] = _id_media[media_item]
        temp_params = (
            params[media_item]["media"],
            params[media_item]["news_count"],
            params[media_item]["total_count"],
            params[media_item]["total_count_1"],
            params[media_item]["total_size"],
            params[media_item]["total_size_1"],
            params[media_item]["avg_size_1"],
            params[media_item]["img_location_count"],
            params[media_item]["img_location_count_1"],
            params[media_item]["img_location_size"],
            params[media_item]["img_location_size_1"],
            params[media_item]["percent_img_location_1"],
            params[media_item]["avg_img_location_size_1"],
            params[media_item]["small_img_location_count"],
            params[media_item]["small_img_location_count_1"],
            params[media_item]["small_img_location_size"],
            params[media_item]["small_img_location_size_1"],
            params[media_item]["percent_small_img_location_1"],
            params[media_item]["avg_small_img_location_size_1"],
            params[media_item]["video_location_count"],
            params[media_item]["video_location_count_1"],
            params[media_item]["video_location_size"],
            params[media_item]["video_location_size_1"],
            params[media_item]["percent_video_location_1"],
            params[media_item]["avg_video_location_size_1"],
            year + '-' + month + '-' + day,
        )
        cursor.execute(insert_sql, temp_params)
        db.commit()
    cursor.close()
    db.close()
def percent_table(year, month, day):
    db = get_mysql_db()
    cursor = db.cursor()
    sql = 'select media, news_count, total_count_1, img_location_count_1, small_img_location_count_1, ' \
          'video_location_count_1 from {} where datetime=%s'.format('statistics_result_table')
    insert_sql = 'insert into {}(media, news_percent, data_percent, img_location_percent,' \
                 ' small_img_location_percent, video_location_percent, datetime) values(%s, %s, %s, %s, %s, %s, %s)'\
        .format("statistics_percent_table")
    cursor.execute(sql, (year + '-' + month + '-' + day,))
    result = cursor.fetchall()
    total_news_count = 0
    total_data_count = 0
    total_img_location_count = 0
    total_small_img_location_count = 0
    total_video_location_count = 0
    media_count_dict = {}
    for item in result:
        media_count_dict[item[0]] = {}
        total_news_count += item[1]
        media_count_dict[item[0]]["news_count"] = item[1]
        total_data_count += item[2]
        media_count_dict[item[0]]["total_count"] = item[2]
        total_img_location_count += item[3]
        media_count_dict[item[0]]["img_location_count"] = item[3]
        total_small_img_location_count += item[4]
        media_count_dict[item[0]]["small_img_location_count"] = item[4]
        total_video_location_count += item[5]
        media_count_dict[item[0]]["video_location_count"] = item[5]
    for key in media_count_dict:
        news_percent = '%.4f' % (float(media_count_dict[key]["news_count"]) / total_news_count)
        data_percent = '%.4f' % (float(media_count_dict[key]["total_count"]) / total_data_count)
        # each media's image count as a share of the overall image count
        img_location_percent = \
            '%.4f' % (float(media_count_dict[key]["img_location_count"]) / total_img_location_count)
        small_img_location_percent = \
            '%.4f' % (float(media_count_dict[key]["small_img_location_count"]) / total_small_img_location_count)
        video_location_percent = \
            '%.4f' % (float(media_count_dict[key]["video_location_count"]) / total_video_location_count)
        params = (
            key,
            float(news_percent),
            float(data_percent),
            float(img_location_percent),
            float(small_img_location_percent),
            float(video_location_percent),
            year + '-' + month + '-' + day,
        )
        cursor.execute(insert_sql, params)
        db.commit()
    cursor.close()
    db.close()
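# A minimal, hypothetical driver for one day's run. It only chains the routines
# that need no precomputed inputs; compare_table() and result_table() also need
# the count_dict / _id_media values produced by the upstream comparison step, so
# they are omitted here. The retention window of 30 days is an assumption.
if __name__ == "__main__":
    import datetime as _dt
    _today = _dt.date.today()
    _year, _month, _day = _today.strftime("%Y"), _today.strftime("%m"), _today.strftime("%d")
    insert_mysql_only_osslog(_year, _month, _day)  # load the osslog-only file for today
    percent_table(_year, _month, _day)             # derive per-media percentages from the result table
    export(_year, _month, _day)                    # dump the result table to a spreadsheet
    delete_mysql_table(save_days=30)               # drop detail rows older than the retention window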