Пример #1
0
def scrapy_text(conn):
	'''
	Function doc
	统计 行业爬虫 每天抓取数据量 与 七天抓取的平均数量进行比较
	@return text

	Parameter doc
	@conn : Mysql数据库 连接实例
	'''
	sql = '''
		SELECT 
		task_type,
		num,
		num7_avg,
		stauts 
		FROM ind_sp_task_exec_info_st WHERE up_date = '%s' ''' % (today) 
	datas = select_data(conn,'ind_sp_task_exec_info_st',sql)
	strs_pks = []

	for data in datas:
		task_type 	= str(data[0])
		num 		= str(data[1])
		num7_avg 	= str(data[2])
		stauts 	    = str(data[3])
		s_pk = ''' %s : yesterday: %s    avg: %s %s <br> ''' % (task_type,num,num7_avg,stauts)
		strs_pks.append(s_pk)

	return '\n'.join(strs_pks)
Пример #2
0
def get_dis_shop_time(conn):
    '''
	Function doc
	获取需要去重的时间
	Parameter doc
	@conn Mysql 数据库连接实例
	return list
	'''
    sql = '''
	SELECT
		start_time,
		end_time,
		start_date,
		end_date,
		task_type,
		add_time
		FROM spider_job_stat WHERE stauts = 0 AND task_type = 'shop'
		'''
    data = select_data(conn, 'spider_job_stat', sql)
    items = []

    for d in data:
        item = {}
        item['start_time'] = str(d[0])
        item['end_time'] = str(d[1])
        item['start_date'] = str(d[2])
        item['end_date'] = str(d[3])
        item['task_type'] = str(d[4])
        item['add_time'] = str(d[5])
        items.append(item)
    return items
Пример #3
0
def sdian_text(conn):
	'''
	Function doc
	统计 多店 商品价格抓取情况
	@return text

	Parameter doc
	@conn : Mysql数据库 连接实例
	'''
	sql = '''
		SELECT
		a.dt,
		a.num num1,
		b.num2 num2,
		CASE
			WHEN b.num2 is NULL THEN 'is OK!'
			WHEN a.num > b.num2*0.8 THEN 'is OK!'
			ELSE '<font color="red"> <b>is not OK! </b></font>'
		END stauts
		FROM
		( SELECT count(1) num,dt FROM tb_item_price WHERE dt = '%s' ) a
		INNER JOIN
		( SELECT count(DISTINCT num_iid) num2 FROM tb_ware_list_get where dt='%s' ) b
		''' % (today,yestday)
	datas = select_data(conn,'tb_ware_list_get',sql)
	s_pk = ''
	if datas:
		for data in datas:
			num1 	= str(data[1])
			num2 	= str(data[2])
			stauts 	= str(data[3])	
			s_pk = '''today 需要抓取的商品id数量: %s 实际抓取到的商品id数量: %s ... %s<br> 
			''' % (num2,num1,stauts)

	return s_pk
Пример #4
0
def judge_ydhy_job(conn, spider):
    '''
    Fuction doc
    判断任务列表还有有运动行业的未执行任务
    Parameter doc
    @conn Mysql数据库连接实例
    @spider 爬虫的名字
    ruturn 返回查询出来的条数
    '''
    sql = "SELECT count(1) FROM spider_jobs WHERE stauts = 9 AND task_type = '%s' " % (
        spider)
    print sql
    data = select_data(conn, 'spider_jobs', sql)
    return int(data[0][0])
Пример #5
0
def xtep_shops_tb_text(conn):
	'''
	Function doc
	统计 淘宝多店 商品价格抓取情况 拼接成字符串
	@return text

	Parameter doc
	@conn : Mysql数据库 连接实例
	'''
	sqls = xtep_shops_tb_sql() #list
	datas = select_data(conn,'tb_ware_list_get',sqls[0])
	s_pk = ''

	for data in datas:
		num1 	= str(data[1])
		num2 	= str(data[2])
		stauts 	= str(data[3])	
		s_pk 	= '''today 需要抓取价格的商品id数量: %s 价格为空商品id数量: %s ... %s<br> 
		''' % (num1,num2,stauts)

	return s_pk
Пример #6
0
def ydhy_text(conn):
	'''
	Function doc
	统计 淘宝运动行业 商品抓取情况 拼接成字符串
	@return text

	Parameter doc
	@conn : Mysql数据库 连接实例
	'''
	sqls = ydhy_sql() #list
	datas = select_data(conn,'shop_get_item_list_new',sqls[0])
	s_pk = ''

	for data in datas:
		num1 	= str(data[1])
		num2 	= str(data[2])
		num3 	= str(data[3])
		stauts 	= str(data[4])	
		s_pk = '''today 需要抓取的商品数量: %s 实际抓取到的商品数量: %s 未抓取到货号的商品数量 %s ... %s<br> 
		''' % (num1,num2,num3,stauts)

	return s_pk
Пример #7
0
def shops_sale_info_stat_text(conn,key_type):
	'''
	Function doc
	店铺销量统计 
	@return text

	Parameter doc
	@conn : Mysql数据库 连接实例
	@key_type: 全渠道类型(如:xtep,mg)
	'''
	sql = '''
		SELECT a.sale, a.sale_b0, a.sale_s0, a.sale0_pr, a.brand,b.avg7pr 
		FROM shops_sale_info_stat a
		LEFT JOIN
		( SELECT brand,round(avg(sale0_pr),0) avg7pr FROM shops_sale_info_stat WHERE up_date > '%s' ) b
		ON a.brand = b.brand
		WHERE up_date = '%s' ''' % (ys_8_day,today)
	datas = select_data(conn,'shops_sale_info_stat',sql)
	s_pk = ''

	for data in datas:
		sale 		= str(data[0])
		sale_b0 	= str(data[1])
		sale_s0 	= str(data[2])
		sale0_pr 	= str(data[3])
		brand 		= str(data[4])
		avg7pr		= 0 if str(data[5]) == 'None' else int(data[5])*0.8

		if sale0_pr != '0' :
			stauts = 'is OK!' if int(sale0_pr.strip('%')) > avg7pr else '<font color="red"><b>is not OK!</b></font>'
		else:
			stauts = '<font color="red"><b>is not OK!</b></font>'
				
		s_pk = '''<font color="Green"><b>店铺销量统计</b></font><br> 
		%s : shop sale today 总销量: %s 授权店铺销量: %s 未授权店铺销量: %s  授权店铺销量占比 %s  %s 授权店铺7天平均销量占比 %s <br> 
		''' % (brand,sale,sale_b0,sale_s0,sale0_pr,stauts,str(avg7pr*1.25)+'%')

	return s_pk
Пример #8
0
def lost_itemno_text(conn,table):
	'''
	Function doc
	统计 全渠道 商品信息表 没货号商品数量
	@return text

	Parameter doc
	@conn : Mysql数据库 连接实例
	@table: 表名
	'''
	f_list = [
		'qqd_amazon_search',
		'qqd_dd_search',
		'qqd_jd_search',
		'qqd_sn_search',
		'qqd_tbwx_search',
		'qqd_tb_search',
		'qqd_t_itemno_search',
		'qqd_yhd_search']

	sql = '''
		SELECT
		a.fetch_api,
		a.num,
		b.num num_avg7,
		CASE 
			WHEN b.num is NULL THEN 'is OK!'
			WHEN a.fetch_api in ('qqd_t_itemno_search','qqd_tb_search') AND a.num < b.num+1000 THEN 'is OK!'
			WHEN a.num < b.num+100 THEN 'is OK!'
			ELSE '<font color="red"> <b>is not OK! </b></font>'
		END stauts
		FROM
		( SELECT  count(itemno) num, fetch_api FROM %s WHERE adddate = '%s' AND (itemno ='' OR itemno = '11111111')  GROUP BY fetch_api ORDER BY 1 ) a
		LEFT JOIN
		( SELECT round(avg(a.num),2) num, a.fetch_api fetch_api 
			FROM ( SELECT  count(itemno) num, fetch_api FROM %s WHERE adddate > '%s' AND adddate < '%s' AND (itemno ='' OR itemno = '11111111')  GROUP BY adddate,fetch_api ORDER BY  isTmall) a 
			GROUP BY a.fetch_api ORDER BY 1 ) b
		ON a.fetch_api = b.fetch_api ''' % (table,today,table,ys_8_day,today)
	datas = select_data(conn,table,sql)
	strs_pk = []
	# f_list2 = []

	for data in datas:
		fetch_api 	= str(data[0])
		count 		= str(data[1])
		count_avg 	= str(data[2])
		stauts 	    = str(data[3])
		if fetch_api in f_list:
			s_pk = ''' %s : today: %s    avg: %s %s <br> ''' % (fetch_api,count,count_avg,stauts)
			strs_pk.append(s_pk)
		# f_list2.append(fetch_api)

	# strs_pk2 = []
	# s_fs = ''

	# if len(f_list2) < len(f_list):#当平台没数据的时候
	# 	f_list3 = list(set(f_list).difference(set(f_list2))) #求list 差集
	# 	for i in f_list3:
	# 		s_f = ''' %s : today: <font color="red"> <b>0 </b></font>,<font color="red"> <b>is not OK! </b></font><br>''' % (i)
	# 		strs_pk2.append(s_f)
	# 	s_fs = '\n'.join(strs_pk2)

	return '\n'.join(strs_pk) 
Пример #9
0
def keywords_search_info_text(conn,table):
	'''
	Function doc
	统计 全渠道 商品信息表 
		每个爬虫当天抓取的数据量 与 7天平均抓取的数据量进行对比
	@return text

	Parameter doc
	@conn : Mysql数据库 连接实例
	@table: 表名
	'''
	f_list = [
		'qqd_amazon_search',
		'qqd_dd_search',
		'qqd_jd_search',
		'qqd_sn_search',
		'qqd_tbwx_search',
		'qqd_tb_search',
		'qqd_t_itemno_search',
		'qqd_yhd_search'] # 爬虫列表
	sql_1 = '''
		SELECT
		c.adddate,
		c.fetch_api,
		c.num,
		b.b_num,
		CASE
		WHEN b.b_num is NULL THEN
			'is OK!'
		WHEN c.num > b.b_num * 0.7 THEN
			'is OK!'
		ELSE
			'<font color="red"> <b>is not OK! </b></font>'
		END stauts
		FROM 
			(SELECT adddate, fetch_api, count(1) num FROM %s WHERE adddate = '%s' GROUP BY adddate, fetch_api) c
		LEFT JOIN 
			(SELECT fetch_api,round(avg(num),2) b_num FROM ( SELECT adddate, fetch_api, count(1) num FROM %s WHERE adddate > '%s' AND adddate < '%s' GROUP BY adddate,fetch_api) a GROUP BY fetch_api ORDER BY 1) b 
		ON c.fetch_api = b.fetch_api ''' % (table,today,table,ys_8_day,today)
	datas = select_data(conn,table,sql_1)
	strs_pk = []
	f_list2 = []

	for data in datas:
		fetch_api 	= str(data[1])
		count 		= str(data[2])
		count_avg 	= str(data[3])
		stauts 	    = str(data[4])
		if fetch_api in f_list:
			s_pk = ''' %s : today: %s    avg: %s %s <br> ''' % (fetch_api,count,count_avg,stauts)
			strs_pk.append(s_pk)
		f_list2.append(fetch_api)
	
	strs_pk2 = []
	s_fs = ''

	if len(f_list2) < len(f_list):#当平台没数据的时候
		f_list3 = list(set(f_list).difference(set(f_list2))) #求list 差集
		for i in f_list3:
			s_f = ''' %s : today: <font color="red"> <b>0 </b></font>,<font color="red"> <b>is not OK! </b></font><br>''' % (i)
			strs_pk2.append(s_f)
		s_fs = '\n'.join(strs_pk2)

	return ('\n'.join(strs_pk)) + s_fs
Пример #10
0
def insert_spider_job_stat(conn):
    '''
	Function doc
	统计10天内爬虫任务每轮用时,抓取的数据量,和每轮每次执行时间的平均值,最大值,最小值
	insert igore 插入数据库表 spider_job_stat
	Parameter doc
	@conn : Mysql 数据库连接实例
	'''
    start = time.time()
    pre_30day = time.strftime(
        "%Y-%m-%d %H:%M:%S", time.localtime(
            (int(time.time()) - 3600 * 24 * 10)))
    job_list = ['shop', 'search', 'bitem', 'citem']

    for job in job_list:
        sql = '''
			SELECT 
			task_type,
			count(1),
			min(update_time),
			max(end_time),
			add_time
			FROM spider_jobs WHERE task_type = '%s' AND add_time > '%s' GROUP BY add_time
			''' % (job, pre_30day)
        print sql
        data = select_data(conn, 'spider_jobs', sql)
        items = []

        if len(data) > 1:
            for d in data[:-1]:
                item = {}
                item['task_type'] = str(d[0])
                item['task_count'] = str(d[1])
                item['start_time'] = str(d[2])
                item['end_time'] = str(d[3])
                item['start_date'] = str(d[2]).split(' ')[0]
                item['end_date'] = str(d[3]).split(' ')[0]
                item['add_time'] = str(d[4])
                job_time = time_differ(item['end_time'], item['start_time'])
                item['job_time'] = round(job_time.total_seconds() / 60, 2)
                if int(item['task_count']) > 50:
                    sql = get_spider_1_job_time_sql(item['start_time'],
                                                    item['end_time'],
                                                    item['task_type'])
                    data1 = select_data(conn, 'spider_jobs', sql)

                    item['avg_1_job_time'] = str(data1[0][0])
                    item['max_1_job_time'] = str(data1[0][1])
                    item['min_1_job_time'] = str(data1[0][2])
                    sql = get_spider_grab_num_sql(item['start_time'],
                                                  item['end_time'],
                                                  item['start_date'],
                                                  item['end_date'],
                                                  item['task_type'])
                    data = select_data(conn, item['task_type'], sql)
                    item['num'] = str(data[0][0])
                elif int(item['task_count']) < 50:  #bitem有几个特殊任务运行时间很短
                    item['avg_1_job_time'] = 0
                    item['max_1_job_time'] = 0
                    item['min_1_job_time'] = 0
                    sql = get_spider_grab_num_sql(item['start_time'],
                                                  item['end_time'],
                                                  item['start_date'],
                                                  item['end_date'],
                                                  item['task_type'])
                    data = select_data(conn, item['task_type'], sql)
                    item['num'] = str(data[0][0])

                items.append(item)
            insert_ignore_dicts(items, 'spider_job_stat', conn)
Пример #11
0
def ind_sp_task_exec_info_st(conn):
    '''
	Function doc
	统计爬虫任务执行的所有情况插入ind_sp_task_exec_info_st表
	Parameter doc
	@conn Mysql 数据库连接实例
	'''
    sql_citem = sql_citem_statistics()
    sql_bitem = sql_bitem_statistics()
    sql_search = sql_search_statistics()
    sql_shop = sql_shop_statistics()
    sql_sp_jobs = sql_sp_jobs_statistics()
    up_time = time.strftime('%Y-%m-%d %H:%M:%S')
    up_date = time.strftime('%Y-%m-%d')

    datas = select_data(conn, 'spider_jobs', sql_sp_jobs)  #统计行业爬虫任务执行情况
    print sql_sp_jobs
    items = []
    for data in datas:  #将每只爬虫的结果保存到items
        item = {}
        item['up_time'] = up_time
        item['up_date'] = up_date
        item['task_type'] = str(data[0])
        item['task_success'] = str(data[1])
        item['task_fail'] = str(data[2])
        item['task_total'] = str(data[3])
        item['task_long'] = str(data[4])
        item['task_short'] = str(data[5])
        item['avg_time'] = str(data[6])
        item['max_time'] = str(data[7])
        item['min_time'] = str(data[8])
        items.append(item)
    for item in items:  #遍历结果
        if item['task_type'] == 'citem':  #查询的结果整合到爬虫任务信息中
            datas = select_data(conn, 'citem', sql_citem)
            print sql_citem
            if datas:
                item['shop_id'] = 0
                item['key_word'] = str(datas[0][0])
                item['num'] = str(datas[0][1])
                item['num7_avg'] = str(datas[0][2])
                item['stauts'] = str(datas[0][3])
        elif item['task_type'] == 'bitem':  #查询的结果整合到爬虫任务信息中
            datas = select_data(conn, 'bitem', sql_bitem)
            print sql_bitem
            if datas:
                item['shop_id'] = 0
                item['key_word'] = str(datas[0][0])
                item['num'] = str(datas[0][1])
                item['num7_avg'] = str(datas[0][2])
                item['stauts'] = str(datas[0][3])
        elif item['task_type'] == 'shop':  #查询的结果整合到爬虫任务信息中
            datas = select_data(conn, 'shop', sql_shop)
            print sql_shop
            if datas:
                item['shop_id'] = str(datas[0][0])
                item['key_word'] = str(datas[0][1])
                item['num'] = str(datas[0][2])
                item['num7_avg'] = str(datas[0][3])
                item['stauts'] = str(datas[0][4])
        elif item['task_type'] == 'search':  #查询的结果整合到爬虫任务信息中
            datas = select_data(conn, 'search', sql_search)
            print sql_search
            if datas:
                item['shop_id'] = 0
                item['key_word'] = str(datas[0][0])
                item['num'] = str(datas[0][1])
                item['num7_avg'] = str(datas[0][2])
                item['stauts'] = str(datas[0][3])

    if items:
        insert_ignore_dicts(items, 'ind_sp_task_exec_info_st', conn)
    else:
        print 'not items!'