def scrapy_text(conn):
    '''Build a report comparing each industry spider's daily crawl volume
    against its 7-day average.

    Parameter doc
    @conn : MySQL database connection instance
    @return text -- HTML fragment, one line per task type
    '''
    sql = '''
        SELECT task_type, num, num7_avg, stauts
        FROM ind_sp_task_exec_info_st
        WHERE up_date = '%s'
    ''' % (today)
    rows = select_data(conn, 'ind_sp_task_exec_info_st', sql)
    report_lines = [
        ''' %s : yesterday: %s avg: %s %s <br> '''
        % (str(row[0]), str(row[1]), str(row[2]), str(row[3]))
        for row in rows
    ]
    return '\n'.join(report_lines)
def get_dis_shop_time(conn):
    '''Fetch the time windows of pending 'shop' jobs that still need
    de-duplication (stauts = 0).

    Parameter doc
    @conn : MySQL database connection instance
    @return list of dicts, one per pending job window (all values as str)
    '''
    sql = '''
        SELECT start_time, end_time, start_date, end_date, task_type, add_time
        FROM spider_job_stat
        WHERE stauts = 0 AND task_type = 'shop'
    '''
    columns = ('start_time', 'end_time', 'start_date',
               'end_date', 'task_type', 'add_time')
    rows = select_data(conn, 'spider_job_stat', sql)
    return [dict(zip(columns, [str(value) for value in row])) for row in rows]
def sdian_text(conn):
    '''Report on multi-shop product price crawling: today's captured price
    rows vs. yesterday's distinct product ids.

    Parameter doc
    @conn : MySQL database connection instance
    @return text -- HTML fragment; empty string when the query returns nothing
    '''
    sql = '''
        SELECT a.dt, a.num num1, b.num2 num2,
               CASE WHEN b.num2 is NULL THEN 'is OK!'
                    WHEN a.num > b.num2*0.8 THEN 'is OK!'
                    ELSE '<font color="red"> <b>is not OK! </b></font>'
               END stauts
        FROM ( SELECT count(1) num,dt FROM tb_item_price WHERE dt = '%s' ) a
        INNER JOIN ( SELECT count(DISTINCT num_iid) num2 FROM tb_ware_list_get where dt='%s' ) b
    ''' % (today, yestday)
    rows = select_data(conn, 'tb_ware_list_get', sql)
    text = ''
    # NOTE(review): only the last row survives the loop; the join appears to
    # yield a single row, so this matches the original behavior -- confirm.
    if rows:
        for row in rows:
            text = '''today 需要抓取的商品id数量: %s 实际抓取到的商品id数量: %s ... %s<br> ''' % (str(row[2]), str(row[1]), str(row[3]))
    return text
def judge_ydhy_job(conn, spider):
    '''Check whether unexecuted tasks remain in the job list for a spider.

    Parameter doc
    @conn : MySQL database connection instance
    @spider : spider name (task_type)
    @return int -- number of rows still pending (stauts = 9)
    '''
    sql = ("SELECT count(1) FROM spider_jobs WHERE stauts = 9 "
           "AND task_type = '%s' " % (spider))
    print(sql)
    rows = select_data(conn, 'spider_jobs', sql)
    return int(rows[0][0])
def xtep_shops_tb_text(conn):
    '''Build the report string for Taobao multi-shop price crawling
    (items needing a price vs. items whose price is still empty).

    Parameter doc
    @conn : MySQL database connection instance
    @return text -- HTML fragment; empty string when the query returns nothing
    '''
    sqls = xtep_shops_tb_sql()  # helper returns a list of prepared statements
    rows = select_data(conn, 'tb_ware_list_get', sqls[0])
    text = ''
    # only the last row survives the loop, matching the original behavior
    for row in rows:
        text = '''today 需要抓取价格的商品id数量: %s 价格为空商品id数量: %s ... %s<br> ''' % (str(row[1]), str(row[2]), str(row[3]))
    return text
def ydhy_text(conn):
    '''Build the report string for Taobao sports-industry item crawling
    (expected vs. actually captured items, plus items missing a style no.).

    Parameter doc
    @conn : MySQL database connection instance
    @return text -- HTML fragment; empty string when the query returns nothing
    '''
    sqls = ydhy_sql()  # helper returns a list of prepared statements
    rows = select_data(conn, 'shop_get_item_list_new', sqls[0])
    text = ''
    # only the last row survives the loop, matching the original behavior
    for row in rows:
        text = '''today 需要抓取的商品数量: %s 实际抓取到的商品数量: %s 未抓取到货号的商品数量 %s ... %s<br> ''' % (str(row[1]), str(row[2]), str(row[3]), str(row[4]))
    return text
def shops_sale_info_stat_text(conn, key_type):
    '''Shop sales statistics report: today's authorized-shop sales share
    per brand, compared against that brand's 7-day average share.

    Parameter doc
    @conn : MySQL database connection instance
    @key_type: omni-channel type (e.g. xtep, mg) -- not used by the current
               query; kept for interface compatibility
    @return text -- HTML fragment, one line per brand
    '''
    # FIX: the 7-day average subquery selected brand alongside avg(sale0_pr)
    # with no GROUP BY, so every brand joined against one arbitrary global
    # row; GROUP BY brand makes it match the ON a.brand = b.brand join.
    sql = '''
        SELECT a.sale, a.sale_b0, a.sale_s0, a.sale0_pr, a.brand, b.avg7pr
        FROM shops_sale_info_stat a
        LEFT JOIN (
            SELECT brand, round(avg(sale0_pr),0) avg7pr
            FROM shops_sale_info_stat
            WHERE up_date > '%s'
            GROUP BY brand
        ) b ON a.brand = b.brand
        WHERE up_date = '%s'
    ''' % (ys_8_day, today)
    datas = select_data(conn, 'shops_sale_info_stat', sql)
    lines = []
    for data in datas:
        sale = str(data[0])
        sale_b0 = str(data[1])
        sale_s0 = str(data[2])
        sale0_pr = str(data[3])
        brand = str(data[4])
        # threshold is 80% of the 7-day average share; 0 when no history
        avg7pr = 0 if str(data[5]) == 'None' else int(data[5]) * 0.8
        if sale0_pr != '0':
            # compare today's share (strip the trailing '%') to the threshold
            stauts = 'is OK!' if int(sale0_pr.strip('%')) > avg7pr else '<font color="red"><b>is not OK!</b></font>'
        else:
            stauts = '<font color="red"><b>is not OK!</b></font>'
        # avg7pr*1.25 undoes the 0.8 factor: the raw 7-day average is displayed
        lines.append('''<font color="Green"><b>店铺销量统计</b></font><br> %s : shop sale today 总销量: %s 授权店铺销量: %s 未授权店铺销量: %s 授权店铺销量占比 %s %s 授权店铺7天平均销量占比 %s <br> ''' % (brand, sale, sale_b0, sale_s0, sale0_pr, stauts, str(avg7pr * 1.25) + '%'))
    # FIX: the loop previously overwrote one variable, so only the last
    # brand's line was ever returned; report every brand instead.
    return '\n'.join(lines)
def lost_itemno_text(conn, table):
    '''Report, per known fetch_api, the count of omni-channel product rows
    with a missing or placeholder item number ('' or '11111111') today,
    compared against the prior 7-day average count.

    Parameter doc
    @conn : MySQL database connection instance
    @table: table name interpolated into the query
    @return text -- HTML fragment, one line per known fetch_api with data
    '''
    # spiders to include in the report; other fetch_api values are ignored
    f_list = frozenset([
        'qqd_amazon_search', 'qqd_dd_search', 'qqd_jd_search',
        'qqd_sn_search', 'qqd_tbwx_search', 'qqd_tb_search',
        'qqd_t_itemno_search', 'qqd_yhd_search'])
    # NOTE(review): the inner ORDER BY isTmall references a column outside the
    # select list -- looks like a leftover; kept to avoid changing the query.
    sql = '''
        SELECT a.fetch_api, a.num, b.num num_avg7,
               CASE WHEN b.num is NULL THEN 'is OK!'
                    WHEN a.fetch_api in ('qqd_t_itemno_search','qqd_tb_search') AND a.num < b.num+1000 THEN 'is OK!'
                    WHEN a.num < b.num+100 THEN 'is OK!'
                    ELSE '<font color="red"> <b>is not OK! </b></font>'
               END stauts
        FROM (
            SELECT count(itemno) num, fetch_api
            FROM %s
            WHERE adddate = '%s' AND (itemno ='' OR itemno = '11111111')
            GROUP BY fetch_api ORDER BY 1
        ) a
        LEFT JOIN (
            SELECT round(avg(a.num),2) num, a.fetch_api fetch_api
            FROM (
                SELECT count(itemno) num, fetch_api
                FROM %s
                WHERE adddate > '%s' AND adddate < '%s'
                  AND (itemno ='' OR itemno = '11111111')
                GROUP BY adddate,fetch_api ORDER BY isTmall
            ) a
            GROUP BY a.fetch_api ORDER BY 1
        ) b ON a.fetch_api = b.fetch_api
    ''' % (table, today, table, ys_8_day, today)
    datas = select_data(conn, table, sql)
    # FIX: removed a large block of commented-out dead code (a disabled
    # missing-platform branch duplicated live in keywords_search_info_text).
    strs_pk = []
    for data in datas:
        fetch_api = str(data[0])
        if fetch_api in f_list:
            strs_pk.append(''' %s : today: %s avg: %s %s <br> ''' % (
                fetch_api, str(data[1]), str(data[2]), str(data[3])))
    return '\n'.join(strs_pk)
def keywords_search_info_text(conn, table):
    '''Compare each spider's crawl count for today against its 7-day average
    in the omni-channel product table, and flag spiders with no data today.

    Parameter doc
    @conn : MySQL database connection instance
    @table: table name interpolated into the query
    @return text -- HTML fragment (per-spider lines, then missing spiders)
    '''
    # full spider roster we expect to see in today's data
    expected = ['qqd_amazon_search', 'qqd_dd_search', 'qqd_jd_search',
                'qqd_sn_search', 'qqd_tbwx_search', 'qqd_tb_search',
                'qqd_t_itemno_search', 'qqd_yhd_search']
    sql_1 = '''
        SELECT c.adddate, c.fetch_api, c.num, b.b_num,
               CASE WHEN b.b_num is NULL THEN 'is OK!'
                    WHEN c.num > b.b_num * 0.7 THEN 'is OK!'
                    ELSE '<font color="red"> <b>is not OK! </b></font>'
               END stauts
        FROM (SELECT adddate, fetch_api, count(1) num FROM %s WHERE adddate = '%s' GROUP BY adddate, fetch_api) c
        LEFT JOIN (SELECT fetch_api,round(avg(num),2) b_num FROM ( SELECT adddate, fetch_api, count(1) num FROM %s WHERE adddate > '%s' AND adddate < '%s' GROUP BY adddate,fetch_api) a GROUP BY fetch_api ORDER BY 1) b
        ON c.fetch_api = b.fetch_api
    ''' % (table, today, table, ys_8_day, today)
    reported = []  # report lines for spiders that returned data
    seen = []      # fetch_api values actually reported
    for row in select_data(conn, table, sql_1):
        api = str(row[1])
        if api in expected:
            reported.append(''' %s : today: %s avg: %s %s <br> ''' % (
                api, str(row[2]), str(row[3]), str(row[4])))
            seen.append(api)
    missing_text = ''
    if len(seen) < len(expected):  # some spiders produced no rows today
        missing_text = '\n'.join(
            ''' %s : today: <font color="red"> <b>0 </b></font>,<font color="red"> <b>is not OK! </b></font><br>''' % (api)
            for api in set(expected).difference(set(seen)))
    return ('\n'.join(reported)) + missing_text
def insert_spider_job_stat(conn):
    '''Aggregate, for the last 10 days, each spider round's duration, row
    count, and per-task timing stats (avg/max/min), and insert-ignore the
    results into spider_job_stat.

    Parameter doc
    @conn : MySQL database connection instance
    '''
    # window start: 10 days ago (old local name said "30day"; the window has
    # always been 10 days -- renamed to match)
    pre_10day = time.strftime(
        "%Y-%m-%d %H:%M:%S",
        time.localtime((int(time.time()) - 3600 * 24 * 10)))
    job_list = ['shop', 'search', 'bitem', 'citem']
    for job in job_list:
        sql = ''' SELECT task_type, count(1), min(update_time), max(end_time), add_time FROM spider_jobs WHERE task_type = '%s' AND add_time > '%s' GROUP BY add_time ''' % (job, pre_10day)
        print(sql)
        data = select_data(conn, 'spider_jobs', sql)
        items = []
        if len(data) > 1:
            # drop the last round: it is still in progress
            for d in data[:-1]:
                item = {}
                item['task_type'] = str(d[0])
                item['task_count'] = str(d[1])
                item['start_time'] = str(d[2])
                item['end_time'] = str(d[3])
                item['start_date'] = str(d[2]).split(' ')[0]
                item['end_date'] = str(d[3]).split(' ')[0]
                item['add_time'] = str(d[4])
                job_time = time_differ(item['end_time'], item['start_time'])
                item['job_time'] = round(job_time.total_seconds() / 60, 2)
                if int(item['task_count']) > 50:
                    # enough tasks in the round for per-task timing stats
                    sql = get_spider_1_job_time_sql(item['start_time'],
                                                    item['end_time'],
                                                    item['task_type'])
                    data1 = select_data(conn, 'spider_jobs', sql)
                    item['avg_1_job_time'] = str(data1[0][0])
                    item['max_1_job_time'] = str(data1[0][1])
                    item['min_1_job_time'] = str(data1[0][2])
                else:
                    # FIX: was `elif task_count < 50`, so a round of exactly
                    # 50 tasks got no timing/num fields and produced a broken
                    # insert row. A few special bitem rounds are tiny, so
                    # timing stats are meaningless here -- zero them.
                    item['avg_1_job_time'] = 0
                    item['max_1_job_time'] = 0
                    item['min_1_job_time'] = 0
                # FIX: the grab-count query was duplicated verbatim in both
                # branches above; hoisted to run once per round.
                sql = get_spider_grab_num_sql(item['start_time'],
                                              item['end_time'],
                                              item['start_date'],
                                              item['end_date'],
                                              item['task_type'])
                num_data = select_data(conn, item['task_type'], sql)
                item['num'] = str(num_data[0][0])
                items.append(item)
        insert_ignore_dicts(items, 'spider_job_stat', conn)
def ind_sp_task_exec_info_st(conn):
    '''Collect overall spider-task execution stats, enrich each task type
    with its detail query results, and insert-ignore everything into the
    ind_sp_task_exec_info_st table.

    Parameter doc
    @conn : MySQL database connection instance
    '''
    sql_sp_jobs = sql_sp_jobs_statistics()
    # detail query per task type, plus whether its first column is shop_id
    detail_sqls = {
        'citem': (sql_citem_statistics(), False),
        'bitem': (sql_bitem_statistics(), False),
        'shop': (sql_shop_statistics(), True),
        'search': (sql_search_statistics(), False),
    }
    up_time = time.strftime('%Y-%m-%d %H:%M:%S')
    up_date = time.strftime('%Y-%m-%d')
    # industry spider task execution summary, one row per task type
    rows = select_data(conn, 'spider_jobs', sql_sp_jobs)
    print(sql_sp_jobs)
    stat_keys = ('task_type', 'task_success', 'task_fail', 'task_total',
                 'task_long', 'task_short', 'avg_time', 'max_time', 'min_time')
    items = []
    for row in rows:
        item = {'up_time': up_time, 'up_date': up_date}
        for key, value in zip(stat_keys, row):
            item[key] = str(value)
        items.append(item)
    for item in items:
        entry = detail_sqls.get(item['task_type'])
        if entry is None:
            continue  # unknown task type: leave the summary row as-is
        detail_sql, leads_with_shop_id = entry
        details = select_data(conn, item['task_type'], detail_sql)
        print(detail_sql)
        if details:
            first = details[0]
            if leads_with_shop_id:
                # 'shop' detail rows carry shop_id as their first column
                item['shop_id'] = str(first[0])
                first = first[1:]
            else:
                item['shop_id'] = 0
            item['key_word'] = str(first[0])
            item['num'] = str(first[1])
            item['num7_avg'] = str(first[2])
            item['stauts'] = str(first[3])
    if items:
        insert_ignore_dicts(items, 'ind_sp_task_exec_info_st', conn)
    else:
        print('not items!')