def get_last_id(table):
    """Return the largest ``scgs_id`` in *table*, or ``None`` if it is empty.

    :param table: table name interpolated into the SQL text -- table names
        cannot be bound as query parameters, so *table* must come from
        trusted code, never from user input.
    :return: int id of the newest row, or ``None`` when no rows exist.
    """
    store = AmazonStorePro()
    try:
        sql_select = "select scgs_id from {} order by scgs_id desc LIMIT 1".format(table)
        row = store.execute_sql(sql_select)
        row_id = None
        if row:
            row_id = row[0]['scgs_id']
            print(row_id)
        # Fix: the original computed row_id but never returned it.
        return row_id
    finally:
        # Release the connection even when execute_sql raises.
        store.close()
def scan_database(rds, conf):
    """Queue Amazon review-crawl start URLs for recent task-center rows.

    Reads listing/product/site rows from ``bi_rc_task_center`` created four
    days ago, dedupes them through a Redis set, and pushes one review-page
    URL per product onto ``conf.REDIS_START_URLS``.

    :param rds: project Redis wrapper (is_member / add_set / get_hash_field / ...)
    :param conf: config object holding Redis key names and the COMMENT entry tag
    :return: bool -- NOTE(review): ``sign`` is never set True, so this always
        returns False; confirm whether callers depend on the return value.
    """
    sign = False
    try:
        store = AmazonStorePro(**MYSQL_CONFIG_LOCAL)  # switch between server/local DB
        sql_select = (
            "select listing_id, product_id, site from bi_rc_task_center "
            "where create_time=%s and platform=%s")
        today = datetime.datetime.strptime(time.strftime("%Y-%m-%d"), "%Y-%m-%d")
        # NOTE(review): despite the name, this is four days ago, not yesterday.
        yesterday = today - datetime.timedelta(days=4)
        rows = store.execute_sql(sql_select, yesterday, 'amazon')
        # Marketplace code -> amazon.* domain suffix.
        sites = {
            'US': 'com', 'UK': 'co.uk', 'FR': 'fr', 'DE': 'de', 'ES': 'es',
            'CA': 'ca', 'IT': 'it', 'IN': 'in', 'JP': 'co.jp', 'AU': 'com.au',
            'MX': 'com.mx'
        }
        for row in rows:
            listing_id = row['listing_id']
            product_id = row['product_id']
            site = row['site']
            unique_comment = '{}{}'.format(product_id, site)
            # Skip products already queued during this scan.
            if rds.is_member(conf.REDIS_REVIEW_ASIN, unique_comment):
                continue
            rds.add_set(conf.REDIS_REVIEW_ASIN, unique_comment)
            date_limit = ''
            # Only crawl reviews newer than the last one we stored for this product.
            latest_comment_date = rds.get_hash_field(conf.REDIS_REVIEW_DATE, unique_comment)
            if latest_comment_date:
                if isinstance(latest_comment_date, bytes):
                    latest_comment_date = latest_comment_date.decode('utf-8')
                print(latest_comment_date)
                date_limit = latest_comment_date
            suffix = sites[site]
            page_url = "https://www.amazon.{suffix}/product-reviews/{asin}/?&pageNumber=1" \
                       "&pageSize=50&sortBy=recent".format(suffix=suffix, asin=product_id)
            mp = {
                'entry': conf.COMMENT,
                'page_url': page_url,
                'asin': product_id,
                'listing_id': listing_id,
                'date': date_limit
            }
            # NOTE(review): pushes a dict object, not JSON -- the consumer must
            # parse the repr; verify against the queue reader.
            rds.rds.rpush(conf.REDIS_START_URLS, mp)
        # The dedup set only covers a single scan; drop it afterwards.
        rds.delete_key(conf.REDIS_REVIEW_ASIN)
        store.close()
    except Exception as err:
        print('scan_database raise a error: {!r}'.format(err))
    return sign
def data_insert(rds):
    """Drain the Redis data queue and insert each item into MySQL.

    Pops JSON items shaped ``{"table": ..., "data": {...}}`` from
    ``MysqlDataList`` and inserts them with either the relevance or the sku
    statement.  Items that fail to insert are pushed to ``MysqlDataError``
    with the error text attached, so nothing is silently lost.

    :param rds: project Redis wrapper exposing ``exists_key`` and a raw
        ``rc`` client (rpop/lpush)
    """
    if rds.exists_key(MysqlDataList):
        store = AmazonStorePro()
        while rds.exists_key(MysqlDataList):
            item = rds.rc.rpop(MysqlDataList)
            item_json = json.loads(item)
            table = item_json['table']
            print(table)
            data = item_json['data']
            try:
                if table == RelevanceTable:
                    store.execute_sql(
                        sql_relevance, data['wtr_sku_uuid'], data['wtr_sku_rank'],
                        data['wtr_sku_url'], data['wtr_task_id'], data['wtr_task_type'],
                        data['wtr_task_info'], data['wtr_platform'],
                        data['wtr_crawl_time'], data['wtr_create_time'])
                else:
                    # Any non-relevance table is assumed to use the sku column set.
                    store.execute_sql(
                        sql_sku.format(table), data['scgs_uuid'], data['scgs_products_id'],
                        data['scgs_url_id'], data['scgs_brand'], data['scgs_product_url'],
                        data['scgs_name'], data['scgs_firstTitle'], data['scgs_secondTitle'],
                        data['scgs_original_price'], data['scgs_price'], data['scgs_max_price'],
                        data['scgs_discount'], data['scgs_dispatch'], data['scgs_shipping'],
                        data['scgs_currency'], data['scgs_attribute'], data['scgs_version_urls'],
                        data['scgs_review_count'], data['scgs_grade_count'], data['scgs_sales_total'],
                        data['scgs_total_inventory'], data['scgs_favornum'], data['scgs_image_url'],
                        data['scgs_extra_image_urls'], data['scgs_description'], data['scgs_category'],
                        data['scgs_category_url'], data['scgs_tags'], data['scgs_shop_name'],
                        data['scgs_shop_url'], data['scgs_generation_time'], data['scgs_platform'],
                        data['scgs_platform_url'], data['scgs_crawl_time'], data['scgs_create_time'],
                        data['scgs_status'], data['scgs_questions'], data['scgs_is_delete'],
                        data['scgs_reserve_field_1'], data['scgs_reserve_field_2'],
                        data['scgs_reserve_field_3'], data['scgs_reserve_field_4'],
                        data['scgs_reserve_field_5'], data['scgs_reserve_field_6'],
                        data['scgs_reserve_field_7'])
            except Exception as exp:
                # Failed items are parked on the error list for later inspection.
                traceback.print_exc()
                item_json['error'] = '{!r}'.format(exp)
                rds.rc.lpush(MysqlDataError, json.dumps(item_json))
        print('finished insert')
        store.close()
    else:
        print('no item')
        time.sleep(30)
def scan_database(rds, conf):
    """Scan bi_rc_task_center and enqueue Amazon review start URLs.

    Rows created four days ago are deduped via a Redis set and one
    review-page URL per product is pushed onto conf.REDIS_START_URLS.
    Returns a bool flag (always False in the current implementation).
    """
    result_flag = False
    # Marketplace code -> amazon.* domain suffix.
    domain_by_site = {'US': 'com', 'UK': 'co.uk', 'FR': 'fr', 'DE': 'de',
                      'ES': 'es', 'CA': 'ca', 'IT': 'it', 'IN': 'in',
                      'JP': 'co.jp', 'AU': 'com.au', 'MX': 'com.mx'}
    try:
        store = AmazonStorePro(**MYSQL_CONFIG_LOCAL)  # server/local DB switch
        query = ("select listing_id, product_id, site from bi_rc_task_center "
                 "where create_time=%s and platform=%s")
        midnight = datetime.datetime.strptime(time.strftime("%Y-%m-%d"), "%Y-%m-%d")
        target_day = midnight - datetime.timedelta(days=4)
        for record in store.execute_sql(query, target_day, 'amazon'):
            asin = record['product_id']
            marketplace = record['site']
            member = '{}{}'.format(asin, marketplace)
            # Already queued during this scan -- skip.
            if rds.is_member(conf.REDIS_REVIEW_ASIN, member):
                continue
            rds.add_set(conf.REDIS_REVIEW_ASIN, member)
            # Crawl only reviews newer than the last stored comment date.
            cutoff = ''
            newest = rds.get_hash_field(conf.REDIS_REVIEW_DATE, member)
            if newest:
                if isinstance(newest, bytes):
                    newest = newest.decode('utf-8')
                print(newest)
                cutoff = newest
            review_url = ("https://www.amazon.{suffix}/product-reviews/{asin}/?&pageNumber=1"
                          "&pageSize=50&sortBy=recent").format(
                              suffix=domain_by_site[marketplace], asin=asin)
            rds.rds.rpush(conf.REDIS_START_URLS,
                          {'entry': conf.COMMENT,
                           'page_url': review_url,
                           'asin': asin,
                           'listing_id': record['listing_id'],
                           'date': cutoff})
        # Dedup set only lives for one scan.
        rds.delete_key(conf.REDIS_REVIEW_ASIN)
        store.close()
    except Exception as err:
        print('scan_database raise a error: {!r}'.format(err))
    return result_flag
def change_status(status, task_id):
    """Persist a task-center status change, via HTTP (Temp) or direct MySQL."""
    if Temp:
        # Remote mode: delegate to the PostData HTTP helper.
        PostData().update(wtcPlatform="amazon", wtcStatus=status, wtcId=task_id)
        return
    # Direct mode: update the row ourselves and stamp the crawl time.
    update_sql = ("update crawler_wcs_task_center set wtc_status=%s, "
                  "wtc_crawl_time=now() where wtc_id=%s")
    db = AmazonStorePro()
    db.execute_sql(update_sql, status, task_id)
    db.close()
def data_insert(rds):
    """Drain Config.REDIS_DATA_LIST and insert sku-track rows into MySQL.

    Pops JSON items shaped ``{"table": ..., "data": {...}}``; only items
    whose table is ``Config.MYSQL_TABLE_SKU_TRACK`` are inserted.  Failed
    inserts are pushed onto ``Config.REDIS_DATA_ERROR`` with the error text.

    :param rds: project Redis wrapper exposing ``exists_key`` and a raw
        ``rds`` client (rpop/lpush)
    """
    if rds.exists_key(Config.REDIS_DATA_LIST):
        store = AmazonStorePro(**MYSQL_CONFIG_SERVER)  # switch between server/local DB
        while rds.exists_key(Config.REDIS_DATA_LIST):
            item = rds.rds.rpop(Config.REDIS_DATA_LIST)
            item_json = json.loads(item)
            table = item_json['table']
            print(table)
            data = item_json['data']
            try:
                # NOTE(review): items whose table is not MYSQL_TABLE_SKU_TRACK
                # are popped and silently discarded -- confirm this is intended.
                if table == Config.MYSQL_TABLE_SKU_TRACK:
                    store.execute_sql(
                        sql_sku.format(table), data['scgs_uuid'], data['scgs_products_id'],
                        data['scgs_url_id'], data['scgs_brand'], data['scgs_product_url'],
                        data['scgs_name'], data['scgs_firstTitle'], data['scgs_secondTitle'],
                        data['scgs_original_price'], data['scgs_price'], data['scgs_max_price'],
                        data['scgs_discount'], data['scgs_dispatch'], data['scgs_shipping'],
                        data['scgs_currency'], data['scgs_attribute'], data['scgs_version_urls'],
                        data['scgs_review_count'], data['scgs_grade_count'], data['scgs_sales_total'],
                        data['scgs_total_inventory'], data['scgs_favornum'], data['scgs_image_url'],
                        data['scgs_extra_image_urls'], data['scgs_description'], data['scgs_category'],
                        data['scgs_category_url'], data['scgs_tags'], data['scgs_shop_name'],
                        data['scgs_shop_url'], data['scgs_generation_time'], data['scgs_platform'],
                        data['scgs_platform_url'], data['scgs_crawl_time'], data['scgs_create_time'],
                        data['scgs_status'], data['scgs_questions'], data['scgs_is_delete'],
                        data['scgs_reserve_field_1'], data['scgs_reserve_field_2'],
                        data['scgs_reserve_field_3'], data['scgs_reserve_field_4'],
                        data['scgs_reserve_field_5'], data['scgs_reserve_field_6'],
                        data['scgs_reserve_field_7'])
            except Exception as exp:
                # Park failed items on the error list for later inspection.
                traceback.print_exc()
                item_json['error'] = '{!r}'.format(exp)
                rds.rds.lpush(Config.REDIS_DATA_ERROR, json.dumps(item_json))
        print('finished insert')
        store.close()
    else:
        print('no item')
        time.sleep(30)
def select_asin(rds):
    """Queue detail-page crawl tasks for every active tracked ASIN.

    First caches the set of active (category, type) pairs from
    ``crawler_amazon_track_task``, then walks the active rows of
    ``crawler_amazon_sku_track_asin``, deduping per (category, entry,
    suffix, asin).  Already-seen ASINs are routed to the repeat queue;
    fresh ones are pushed to the ``amazon:di:cy:detail`` queue.

    :param rds: project Redis wrapper (is_member/add_set + raw ``rds`` client)
    """
    store = AmazonStorePro(**MYSQL_CONFIG_SERVER)  # switch between server/local DB
    add_task_cate = ("select wtc_task_category, wtc_task_type from crawler_amazon_track_task "
                     "where wtc_status=%s and wtc_is_delete=%s")
    lines = store.execute_sql(add_task_cate, 1, 0)
    # Set of active "category@type" pairs; rows outside it are skipped below.
    task_cate = 'amazon:di:cy:taskcate'
    for line in lines:
        cate_type = '{}@{}'.format(line['wtc_task_category'], line['wtc_task_type'])
        rds.add_set(task_cate, cate_type)
    sql_select_asin = (
        "select scgs_id, scgs_products_id, scgs_category, scgs_category_url, scgs_generation_time, scgs_platform_url, "
        "scgs_type from crawler_amazon_sku_track_asin where scgs_status=%s and scgs_is_delete=%s ")
    rows = store.execute_sql(sql_select_asin, 1, 0)
    create_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    for row in rows:
        task_type = 1  # detail-page entry type
        category_entry = row['scgs_type']
        task_category = row['scgs_category']
        category_url = row['scgs_category_url']
        suffix = row['scgs_platform_url']
        asin = row['scgs_products_id']
        cate_asin = '{}@{}@{}@{}'.format(task_category, category_entry, suffix, asin)
        asin_cate_type = '{}@{}'.format(task_category, category_entry)
        # Skip rows whose category/type is not currently active.
        if not rds.is_member(task_cate, asin_cate_type):
            print('category out')
            continue
        # Already queued for this exact category+entry+site combination.
        if rds.is_member(Config.REDIS_CATE_ASIN, cate_asin):
            print('exist asin')
            continue
        unique_asin = '{}@{}'.format(asin, suffix)
        # ASIN crawled before (any category): route to the repeat queue so
        # existing detail data can be reused instead of re-crawled.
        if rds.is_member(Config.REDIS_UNIQUE_ASIN, unique_asin):
            product_url = 'https://www.amazon.{}/dp/{}'.format(suffix, asin)
            repeat_mp = {'page_url': product_url, 'entry': 1, 'rank': 101,
                         'category_info': task_category, 'category_url': category_url,
                         'category_entry': category_entry, 'create_time': create_time}
            rds.rds.lpush(Config.REDIS_REPEAT_ASIN, repeat_mp)
            print('repeat asin')
            rds.add_set(Config.REDIS_CATE_ASIN, cate_asin)
            continue
        rds.add_set(Config.REDIS_CATE_ASIN, cate_asin)
        print(row['scgs_id'])
        page_url = 'https://www.amazon.{}/dp/{}'.format(suffix, asin)
        mp = {'entry': task_type, 'page_url': page_url, 'category_info': task_category,
              'category_entry': category_entry, 'category_url': category_url,
              'rank': 101, 'create_time': create_time}
        rds.rds.rpush('amazon:di:cy:detail', mp)
    store.close()
def scan_database():
    """Poll the wcs task center and dispatch one new task, or re-dispatch due tracked tasks.

    Two phases:
      1. If a status-0 (new) row exists, validate it per task type, build the
         start-URL payload, register single-shot / recurring bookkeeping in
         Redis, mark the row status 1, and push it onto ``StartUrls``.
      2. Otherwise, walk status-1 (tracking) rows and re-queue any whose next
         track time has arrived; expired tracks are marked status 2.

    ``Temp`` switches between the PostData HTTP API and direct MySQL access.
    Sleeps ``WaitSec`` seconds when there is nothing at all to do.
    """
    print(time.strftime("%Y-%m-%d %H:%M:%S"))
    flag = False
    store = AmazonStorePro()
    pd = PostData()
    if Temp:
        rows_0 = pd.select(wtcPlatform="amazon", wtcStatus=0, limit=1)
    else:
        sql_select = (
            "select wtc_id, wtc_task_type, wtc_task_frequency, wtc_task_period, wtc_task_info,wtc_task_category,"
            "wtc_task_product_id, wtc_task_site from crawler_wcs_task_center where wtc_status=%s and wtc_platform=%s"
            "and wtc_is_delete=%s limit 1")
        rows_0 = store.execute_sql(sql_select, 0, 'amazon', 0)
    if rows_0:
        row_dct = rows_0[0]
        print(row_dct)
        if Temp:
            task_id = row_dct['wtcId']
            try:
                task_type = row_dct['wtcTaskType']
                task_frequency = row_dct['wtcTaskFrequency']
                task_period = row_dct['wtcTaskPeriod']
                task_info = row_dct['wtcTaskInfo']
                task_category = row_dct['wtcTaskCategory']
                task_asin = row_dct['wtcTaskProductId']
                task_site = row_dct['wtcTaskSite']
            except KeyError:
                # Malformed API row: mark the task failed and bail out.
                print("KeyError")
                change_status(-1, task_id)
                return
        else:
            task_id = row_dct['wtc_id']
            task_type = row_dct['wtc_task_type']
            task_frequency = row_dct['wtc_task_frequency']
            task_period = row_dct['wtc_task_period']
            task_info = row_dct['wtc_task_info']
            task_category = row_dct['wtc_task_category']
            task_asin = row_dct['wtc_task_product_id']
            task_site = row_dct['wtc_task_site']
        if task_type == DetailTag:  # 1: single product detail page
            if not (task_asin and task_site):
                change_status(-1, task_id)
                return
            task_asin = task_asin.strip()
            task_site = task_site.strip()
            # Deterministic uuid per (asin, site) pair for the relevance record.
            _uuid = str(uuid.uuid5(uuid.NAMESPACE_DNS, task_asin + task_site)).replace('-', '')
            product_url = 'https://www.amazon.{}/dp/{}'.format(task_site, task_asin)
            amazon_store = AmazonStore()
            amazon_store.insert_wcs_task_relevance(_uuid, 0, product_url, task_id, DetailTag, '', 'amazon')
            amazon_store.close()
            page_url = 'https://www.amazon.{}/dp/{}'.format(task_site, task_asin)
        elif task_type == KeyWordTag:  # 3: keyword search
            # A keyword task must carry a site and plain-text keywords (not a URL).
            if not (task_site and task_info) or (task_info.strip().startswith('http')):
                change_status(-1, task_id)
                return
            task_site = task_site.strip()
            task_info = task_info.strip()
            keyword = '+'.join(task_info.split())
            if task_category:
                task_category = task_category.strip()
                # Map the human category to Amazon's search-alias code.
                search_box = SearchBox.get(task_category, None)
                if search_box:
                    page_url = 'https://www.amazon.{}/s/?url=search-alias%3D{}&field-keywords={}'.format(
                        task_site, search_box, keyword)
                else:
                    change_status(-1, task_id)
                    return
            else:
                # No category: search across all departments ("aps").
                page_url = 'https://www.amazon.{}/s/?url=search-alias%3Daps&field-keywords={}'.format(
                    task_site, keyword)
        elif task_type in (ListTag, BestSellersTag, NewReleasesTag):  # 2, 4, 5: URL-based tasks
            if not task_info or (not task_info.strip().startswith('http')):
                change_status(-1, task_id)
                return
            # Keyword-search URLs are not valid list/bestseller inputs.
            if "keywords" in task_info:
                change_status(-1, task_id)
                return
            page_url = task_info.strip()
        else:
            change_status(-1, task_id)
            return
        mp = {'entry': task_type, 'page_url': page_url, 'task_id': task_id}
        if task_category:
            if task_type in (BestSellersTag, NewReleasesTag):
                mp['task_category'] = task_category.strip()
            if task_type == KeyWordTag:
                mp['search_box'] = task_category.strip()
        # Single-shot collection.
        if task_frequency == task_period == 1:
            RedisA.set_hash(OneTask, {'is_track': 0, 'task_id': task_id})
        # First pass of a recurring collection (bestsellers/new releases only).
        elif task_period > task_frequency and task_type in (BestSellersTag, NewReleasesTag):
            RedisA.set_hash(OneTask, {'is_track': 1, 'task_id': task_id})
            key = RedisSpace + str(task_id)
            now_time = time.strftime("%Y-%m-%d %H:%M:%S")
            RedisA.set_hash(key, {'start_track_time': now_time, 'last_track_time': now_time})
        else:
            change_status(-1, task_id)
            return
        change_status(1, task_id)
        RedisA.rc.rpush(StartUrls, mp)
        # Extra copy of the payload for Haomin's testing queue.
        RedisA.rc.rpush('amz_test', mp)
        update_proxy_ip(Que)
    else:
        # No new task: look for recurring (status-1) tasks that are due.
        if Temp:
            rows_1 = pd.select(wtcPlatform="amazon", wtcStatus=1, limit=1000)
        else:
            sql_select_track = (
                "select wtc_id, wtc_task_type, wtc_task_frequency, wtc_task_period, wtc_task_info,wtc_task_category,"
                "wtc_task_product_id, wtc_task_site from crawler_wcs_task_center where wtc_status=%s and "
                "wtc_platform=%s and wtc_is_delete=%s")
            rows_1 = store.execute_sql(sql_select_track, 1, 'amazon', 0)
        for row_1 in rows_1:
            row_dct = row_1
            if Temp:
                task_id = row_dct['wtcId']
                task_type = row_dct['wtcTaskType']
                task_frequency = row_dct['wtcTaskFrequency']
                task_period = row_dct['wtcTaskPeriod']
                task_info = row_dct['wtcTaskInfo']
                task_category = row_dct['wtcTaskCategory']
            else:
                task_id = row_dct['wtc_id']
                task_type = row_dct['wtc_task_type']
                task_frequency = row_dct['wtc_task_frequency']
                task_period = row_dct['wtc_task_period']
                task_info = row_dct['wtc_task_info']
                task_category = row_dct['wtc_task_category']
            key = RedisSpace + str(task_id)
            if RedisA.exists_key(key):
                start_track_time = RedisA.get_hash_field(key, 'start_track_time')
                last_track_time = RedisA.get_hash_field(key, 'last_track_time')
                if isinstance(start_track_time, bytes):
                    start_track_time = start_track_time.decode('utf-8')
                    last_track_time = last_track_time.decode('utf-8')
                start_track_time_dt = datetime.datetime.strptime(start_track_time, "%Y-%m-%d %H:%M:%S")
                last_track_time_dt = datetime.datetime.strptime(last_track_time, "%Y-%m-%d %H:%M:%S")
                # Track window end and next due time, in whole days.
                end_track_time = start_track_time_dt + datetime.timedelta(days=task_period)
                next_track_time = last_track_time_dt + datetime.timedelta(days=task_frequency)
                now_time = datetime.datetime.strptime(time.strftime("%Y-%m-%d %H:%M:%S"), "%Y-%m-%d %H:%M:%S")
                if next_track_time > end_track_time:
                    # Tracking period exhausted: mark done and drop the bookkeeping key.
                    change_status(2, task_id)
                    RedisA.delete_key(key)
                if now_time > next_track_time:
                    page_url = task_info.strip()
                    mp = {'entry': task_type, 'page_url': page_url, 'task_id': task_id}
                    if task_category:
                        mp['task_category'] = task_category.strip()
                    RedisA.set_hash(key, {'last_track_time': now_time})
                    change_status(1, task_id)
                    RedisA.rc.rpush(StartUrls, mp)
                    print('track: %s' % task_id)
                    update_proxy_ip(Que)
                    # Only one tracked task is re-queued per scan.
                    break
                print('not track time: %s' % task_id)
        else:
            flag = True
    store.close()
    if flag:
        print('no task, waiting for {} sec.'.format(WaitSec))
        time.sleep(WaitSec)
from store import AmazonStorePro, AmazonRedis
from settings import MYSQL_CONFIG_LOCAL, MYSQL_CONFIG_SERVER, REDIS_CONFIG_LOCAL

# Maintenance script: reconcile amazon_top_category rows with
# crawler_amazon_track_task entries for the 'com' site.
amazon_store = AmazonStorePro(**MYSQL_CONFIG_LOCAL)
sql_1 = "select category, entry, url from amazon_top_category"
sql_2 = "select wtc_id from crawler_amazon_track_task where wtc_task_category=%s and wtc_task_type=%s " \
        "and wtc_task_site='com'"
sql_3 = "update crawler_amazon_track_task set wtc_task_info=%s where wtc_task_category=%s and wtc_task_type=%s " \
        "and wtc_task_site='com'"
sql_4 = "insert into crawler_amazon_track_task(wtc_task_category, wtc_task_type, wtc_task_info, wtc_task_site, " \
        "wtc_status, wtc_is_delete, wtc_create_time)values(%s,%s,%s,'com', 3, 0, now())"
sql_5 = "select wtc_id, wtc_task_category,wtc_task_type from crawler_amazon_track_task where wtc_task_site='com'"
sql_6 = "select id from amazon_top_category where category=%s and entry=%s"
sql_7 = "update crawler_amazon_track_task set wtc_status=5 where wtc_id=%s"
sql_truncate = "truncate amazon_top_category"
rows = amazon_store.execute_sql(sql_1)
# Dedupe top-category rows on the (category, entry) pair.
rows_set = set()
for row in rows:
    key = '{}{}'.format(row['category'], row['entry'])
    if key in rows_set:
        continue
    rows_set.add(key)
    # NOTE(review): the loop body appears truncated at this chunk boundary --
    # the per-row sync logic (sql_2/sql_3/sql_4) presumably follows.
def scan_database():
    """Poll the wcs task center (user-aware variant) and dispatch tasks.

    Like the other ``scan_database`` but also carries ``wtc_user_id``,
    supports comma-separated multi-ASIN detail tasks, resolves keyword
    search aliases from MySQL instead of a local dict, and reports
    three-argument statuses (status, error code, task id) where the error
    codes are: 601 keyword task malformed, 602 unknown search category,
    603 keyword/bestseller URL given to the wrong task type.
    """
    print(time.strftime("%Y-%m-%d %H:%M:%S"))
    flag = False
    store = AmazonStorePro()
    pd = PostData()
    if Temp:
        rows_0 = pd.select(wtcPlatform="amazon", wtcStatus=0, limit=1)
    else:
        sql_select = (
            "select wtc_id, wtc_user_id, wtc_task_type, wtc_task_frequency, wtc_task_period, wtc_task_info,wtc_task_category,"
            "wtc_task_product_id, wtc_task_site from crawler_wcs_task_center where wtc_status=%s and wtc_platform=%s"
            "and wtc_is_delete=%s limit 1")
        rows_0 = store.execute_sql(sql_select, 0, 'amazon', 0)
    if rows_0:
        row_dct = rows_0[0]
        print(row_dct)
        if Temp:
            task_id = row_dct['wtcId']
            try:
                user_id = row_dct['wtcUserId']
                task_type = row_dct['wtcTaskType']
                task_frequency = row_dct['wtcTaskFrequency']
                task_period = row_dct['wtcTaskPeriod']
                task_info = row_dct['wtcTaskInfo']
                task_category = row_dct['wtcTaskCategory']
                task_asin = row_dct['wtcTaskProductId']
                task_site = row_dct['wtcTaskSite']
            except KeyError:
                # Malformed API row: mark failed and bail out.
                print("KeyError")
                change_status(-1, '', task_id)
                return
        else:
            task_id = row_dct['wtc_id']
            user_id = row_dct['wtc_user_id']
            task_type = row_dct['wtc_task_type']
            task_frequency = row_dct['wtc_task_frequency']
            task_period = row_dct['wtc_task_period']
            task_info = row_dct['wtc_task_info']
            task_category = row_dct['wtc_task_category']
            task_asin = row_dct['wtc_task_product_id']
            task_site = row_dct['wtc_task_site']
        # Detail pages (possibly several ASINs per task).
        task_num = 1
        if task_type == DetailTag:
            if not (task_asin and task_site):
                change_status(-1, '', task_id)  # already validated by the frontend
                return
            task_site = task_site.strip()
            # Split on commas, normalising fullwidth ',' to ',' first.
            task_asin_set = {item.strip() for item in task_asin.strip().replace('，', ',').split(',') if item}
            task_num = len(task_asin_set)
            for task_asin in task_asin_set:
                print(task_asin)
                page_url = 'https://www.amazon.{}/dp/{}'.format(task_site, task_asin)
                mp = {'entry': task_type, 'page_url': page_url, 'task_id': task_id}
                RedisA.rc.rpush(StartUrls, mp)
        # Keyword search.
        elif task_type == KeyWordTag:
            if not (task_site and task_info) or (task_info.strip().startswith('http')):
                change_status(-1, '601', task_id)  # error code 601
                return
            task_site = task_site.strip()
            task_info = task_info.strip()
            keyword = '+'.join(task_info.split())
            task_category = task_category.strip()
            # Resolve Amazon's search-alias code for this site+category from MySQL.
            sql_alias = ("select search_alias from crawler_wcs_amazon_search_category where site=%s and "
                         "search_category=%s")
            search_alias = store.execute_sql(sql_alias, task_site, task_category)
            if search_alias:
                search_alias = search_alias[0]['search_alias']
                page_url = 'https://www.amazon.{}/s/?url=search-alias%3D{}&field-keywords={}'.format(
                    task_site, search_alias, keyword)
            else:
                change_status(-1, '602', task_id)
                return
            mp = {'entry': task_type, 'page_url': page_url, 'task_id': task_id,
                  'search_box': task_category.strip()}
            RedisA.rc.rpush(StartUrls, mp)
        # List, best sellers, and new releases.
        elif task_type in (ListTag, BestSellersTag, NewReleasesTag):  # 2, 4, 5
            if not task_info or (not task_info.strip().startswith('http')):
                change_status(-1, '', task_id)  # already validated by the frontend
                return
            if "keywords" in task_info:  # 603: search URL given to a list task
                change_status(-1, '603', task_id)
                return
            # Plain list tasks must not point at bestseller/new-release pages.
            if task_type == ListTag and (
                    'Best-Sellers' in task_info or 'bestsellers' in task_info or
                    'best-sellers' in task_info or 'new-releases' in task_info):
                change_status(-1, '603', task_id)
                return
            page_url = task_info.strip()
            mp = {'entry': task_type, 'page_url': page_url, 'task_id': task_id,
                  'task_category': task_category}
            RedisA.rc.rpush(StartUrls, mp)
        else:
            change_status(-1, '', task_id)  # already validated by the frontend
            return
        # Single-shot collection.
        if task_frequency == task_period:
            hash_mp = {'is_track': 0, 'task_id': task_id, 'user_id': user_id, 'task_num': task_num}
            RedisA.set_hash(OneTask, hash_mp)
        # First pass of a recurring collection.
        elif task_type in (BestSellersTag, NewReleasesTag):  # and task_period > task_frequency
            RedisA.set_hash(OneTask, {'is_track': 1, 'task_id': task_id, 'user_id': user_id})
            key = RedisSpace + str(task_id)
            now_time = time.strftime("%Y-%m-%d %H:%M:%S")
            RedisA.set_hash(key, {'start_track_time': now_time, 'last_track_time': now_time})
        else:
            change_status(-1, '', task_id)  # already validated by the frontend
            return
        change_status(1, '', task_id)
        update_proxy_ip(Que)
    else:
        # No new task: look for recurring (status-1) tasks that are due.
        if Temp:
            rows_1 = pd.select(wtcPlatform="amazon", wtcStatus=1, limit=1000)
        else:
            sql_select_track = (
                "select wtc_id, wtc_task_type, wtc_task_frequency, wtc_task_period, wtc_task_info,wtc_task_category,"
                "wtc_task_product_id, wtc_task_site from crawler_wcs_task_center where wtc_status=%s and "
                "wtc_platform=%s and wtc_is_delete=%s")
            rows_1 = store.execute_sql(sql_select_track, 1, 'amazon', 0)
        for row_1 in rows_1:
            row_dct = row_1
            if Temp:
                task_id = row_dct['wtcId']
                task_type = row_dct['wtcTaskType']
                task_frequency = row_dct['wtcTaskFrequency']
                task_period = row_dct['wtcTaskPeriod']
                task_info = row_dct['wtcTaskInfo']
                task_category = row_dct['wtcTaskCategory']
            else:
                task_id = row_dct['wtc_id']
                task_type = row_dct['wtc_task_type']
                task_frequency = row_dct['wtc_task_frequency']
                task_period = row_dct['wtc_task_period']
                task_info = row_dct['wtc_task_info']
                task_category = row_dct['wtc_task_category']
            key = RedisSpace + str(task_id)
            if RedisA.exists_key(key):
                start_track_time = RedisA.get_hash_field(key, 'start_track_time')
                last_track_time = RedisA.get_hash_field(key, 'last_track_time')
                if isinstance(start_track_time, bytes):
                    start_track_time = start_track_time.decode('utf-8')
                    last_track_time = last_track_time.decode('utf-8')
                start_track_time_dt = datetime.datetime.strptime(start_track_time, "%Y-%m-%d %H:%M:%S")
                last_track_time_dt = datetime.datetime.strptime(last_track_time, "%Y-%m-%d %H:%M:%S")
                # Track window end and next due time, in whole days.
                end_track_time = start_track_time_dt + datetime.timedelta(days=task_period)
                next_track_time = last_track_time_dt + datetime.timedelta(days=task_frequency)
                now_time = datetime.datetime.strptime(time.strftime("%Y-%m-%d %H:%M:%S"), "%Y-%m-%d %H:%M:%S")
                if next_track_time > end_track_time:
                    # Tracking period exhausted: mark done and drop the bookkeeping key.
                    change_status(2, '', task_id)
                    RedisA.delete_key(key)
                if now_time > next_track_time:
                    page_url = task_info.strip()
                    mp = {'entry': task_type, 'page_url': page_url, 'task_id': task_id}
                    if task_category:
                        mp['task_category'] = task_category.strip()
                    RedisA.set_hash(key, {'last_track_time': now_time})
                    change_status(1, '', task_id)
                    RedisA.rc.rpush(StartUrls, mp)
                    print('track: %s' % task_id)
                    update_proxy_ip(Que)
                    # Only one tracked task is re-queued per scan.
                    break
                print('not track time: %s' % task_id)
        else:
            flag = True
    store.close()
    if flag:
        print('no task, waiting for {} sec.'.format(WaitSec))
        time.sleep(WaitSec)
def update_time(ut):
    """Run the module-level ``sql_asin`` update with *ut* bound as its parameter.

    :param ut: value bound into ``sql_asin`` (defined at module level).
    """
    store = AmazonStorePro(**MYSQL_CONFIG_SERVER)  # switch between server/local DB
    try:
        store.execute_sql(sql_asin, ut)
    finally:
        # Fix: release the connection even when execute_sql raises.
        store.close()
import random
import re

import requests
from lxml import etree

from settings import HEADERS
from store import AmazonStorePro
from parse_product_html import SiteType

# Script: scrape the search-department dropdown from an Amazon page and
# persist (site, category name, search alias) triples.
sql_insert = (
    "insert into crawler_wcs_amazon_search_category(site, search_category, search_alias, create_time)"
    "values(%s,%s,%s,now())")
amazon_store = AmazonStorePro()


def get_searchword(url):
    """Scrape the search-category dropdown of an Amazon front page into MySQL.

    :param url: an Amazon home-page URL (e.g. https://www.amazon.co.uk);
        the domain suffix is extracted and stored as the site.
    """
    suffix = re.findall(r'www.amazon.(.+)', url)[0]
    print(suffix)
    headers = {'user-agent': random.choice(HEADERS)}
    req = requests.get(url, headers=headers)
    if req.status_code == 200:
        sel = etree.HTML(req.text)
        options = sel.xpath('//select[@id="searchDropdownBox"]/option')
        for op in options:
            key_word = op.xpath('./text()')[0].strip()
            # Option value looks like "search-alias=<alias>"; keep the alias part.
            parm = op.xpath('./@value')[0].split('=')[1].strip()
            print(key_word, parm)
            amazon_store.execute_sql(sql_insert, suffix, key_word, parm)


if __name__ == '__main__':
    # NOTE(review): the script body is truncated at this chunk boundary.
def select_asin(rds):
    """Rebuild sku rows for repeated ASINs from their last stored crawl.

    Drains ``Config.REDIS_REPEAT_ASIN``; for each repeated ASIN, copies the
    most recent sku record from MySQL, overrides the category/rank/tag
    fields with the current task's values, and forwards the result through
    ``push_data_into_redis`` for insertion.

    :param rds: project Redis wrapper exposing ``exists_key``, ``rds.rpop``
    """
    if rds.exists_key(Config.REDIS_REPEAT_ASIN):
        store = AmazonStorePro(**MYSQL_CONFIG_SERVER)  # switch between server/local DB
        while rds.exists_key(Config.REDIS_REPEAT_ASIN):
            item = rds.rds.rpop(Config.REDIS_REPEAT_ASIN)
            # SECURITY: eval() on queue data executes arbitrary code if the
            # queue is ever writable by untrusted parties -- prefer
            # ast.literal_eval or JSON; flagged for review, not changed here.
            asin_mp = eval(item)
            product_url = asin_mp['page_url']
            asin = re.findall(r'dp/(.+)', product_url)[0]
            print(asin)
            rank = asin_mp['rank']
            category = asin_mp['category_info']
            category_entry = asin_mp['category_entry']
            create_time = asin_mp['create_time']
            category_url = asin_mp['category_url']
            suffix = re.findall(r'www.amazon.(.*?)/', category_url)[0]
            # Entry 4 means a best-sellers list; anything else is new releases.
            if int(category_entry) == 4:
                tags = 'BestSellers'
            else:
                tags = 'NewReleases'
            crawl_time = create_time.split()[0]
            rst = store.execute_sql(sql_select_asin, asin, suffix)
            if rst:
                rst = rst[0]
                # Clone the stored record, overriding task-specific fields
                # (category, rank-as-favornum, tags, times).
                sku_mp = {
                    'scgs_uuid': rst['scgs_uuid'],
                    'scgs_products_id': asin,
                    'scgs_url_id': rst['scgs_url_id'],
                    'scgs_brand': rst['scgs_brand'],
                    'scgs_product_url': product_url,
                    'scgs_name': rst['scgs_name'],
                    'scgs_firstTitle': rst['scgs_firstTitle'],
                    'scgs_secondTitle': rst['scgs_secondTitle'],
                    'scgs_original_price': rst['scgs_original_price'],
                    'scgs_price': rst['scgs_price'],
                    'scgs_max_price': rst['scgs_max_price'],
                    'scgs_discount': rst['scgs_discount'],
                    'scgs_dispatch': '',
                    'scgs_shipping': '',
                    'scgs_currency': rst['scgs_currency'],
                    'scgs_attribute': rst['scgs_attribute'],
                    'scgs_version_urls': '',
                    'scgs_review_count': rst['scgs_review_count'],
                    'scgs_grade_count': rst['scgs_grade_count'],
                    'scgs_sales_total': '',
                    'scgs_total_inventory': '',
                    'scgs_favornum': rank,
                    'scgs_image_url': rst['scgs_image_url'],
                    'scgs_extra_image_urls': rst['scgs_extra_image_urls'],
                    'scgs_description': rst['scgs_description'],
                    'scgs_category': category,
                    'scgs_category_url': category_url,
                    'scgs_tags': tags,
                    'scgs_shop_name': rst['scgs_shop_name'],
                    'scgs_shop_url': rst['scgs_shop_url'],
                    'scgs_generation_time': rst['scgs_generation_time'].strftime("%Y-%m-%d"),
                    'scgs_platform': 'amazon',
                    'scgs_platform_url': suffix,
                    'scgs_crawl_time': crawl_time,
                    'scgs_create_time': create_time,
                    'scgs_status': 0,
                    'scgs_questions': rst['scgs_questions'],
                    'scgs_is_delete': 0,
                    'scgs_reserve_field_1': rst['scgs_reserve_field_1'],
                    'scgs_reserve_field_2': rst['scgs_reserve_field_2'],
                    'scgs_reserve_field_3': rst['scgs_reserve_field_3'],
                    'scgs_reserve_field_4': rst['scgs_reserve_field_4'],
                    'scgs_reserve_field_5': rst['scgs_reserve_field_5'],
                    'scgs_reserve_field_6': rst['scgs_reserve_field_6'],
                    'scgs_reserve_field_7': rst['scgs_reserve_field_7'],
                }
                data_mp = {"table": Config.MYSQL_TABLE_SKU_TRACK, "data": sku_mp}
                push_data_into_redis(rds, Config, data_mp)
            else:
                print('no exist asin')
        print('push repeat done')
        store.close()
    else:
        print('no repeat asin')
def scan_database(rds, conf):
    """Re-queue due tracked tasks from crawler_amazon_track_task_copy.

    First-time tasks (no ``wtc_crawl_time`` yet) are queued immediately;
    tasks already crawled are queued when their next track time has passed,
    and marked status 2 once the track window ends.  Status updates to the
    DB are currently commented out -- only the Redis pushes happen.

    :param rds: project Redis wrapper (raw ``rds`` client for rpush)
    :param conf: config object holding ``REDIS_START_URLS``
    :return: True when at least one task was queued, else False.
    """
    # NOTE(review): a large commented-out block for scraping jollychic
    # shipping fees used to live here; removed for readability (see VCS
    # history if it is ever needed again).
    sign = False
    try:
        store = AmazonStorePro(**MYSQL_CONFIG_LOCAL)  # switch between server/local DB
        sql_update_status = "update crawler_amazon_track_task set wtc_status=%s, wtc_crawl_time=now() where wtc_id=%s"
        sql_select_track = (
            "select wtc_id, wtc_task_type, wtc_task_frequency, wtc_task_period, wtc_task_info,wtc_task_category, "
            "wtc_crawl_time, wtc_create_time, wtc_task_site from crawler_amazon_track_task_copy "
            "where wtc_status=%s and wtc_is_delete=%s")
        rows = store.execute_sql(sql_select_track, 1, 0)
        for row in rows:
            task_id = row['wtc_id']
            task_type = row['wtc_task_type']
            task_frequency = row['wtc_task_frequency']
            task_period = row['wtc_task_period']
            task_info = row['wtc_task_info']
            task_category = row['wtc_task_category']
            start_track_time = row['wtc_create_time']
            last_track_time = row['wtc_crawl_time']
            # Not the first collection: check track window and next due time.
            if last_track_time is not None:
                end_track_time = start_track_time + datetime.timedelta(
                    days=task_period)
                next_track_time = last_track_time + datetime.timedelta(
                    days=task_frequency)
                now_time = datetime.datetime.strptime(
                    time.strftime("%Y-%m-%d %H:%M:%S"), "%Y-%m-%d %H:%M:%S")
                if next_track_time > end_track_time:
                    # Track window exhausted: mark the task finished.
                    store.execute_sql(sql_update_status, 2, task_id)
                if now_time > next_track_time:
                    sign = True
                    page_url = task_info.strip()
                    mp = {
                        'entry': task_type,
                        'page_url': page_url,
                        'task_category': task_category
                    }
                    # Status update deliberately disabled in this copy-table variant.
                    #store.execute_sql(sql_update_status, 1, task_id)
                    rds.rds.rpush(conf.REDIS_START_URLS, mp)
                    print('track: %s' % task_id)
            else:
                # First collection: queue unconditionally.
                sign = True
                page_url = task_info.strip()
                mp = {
                    'entry': task_type,
                    'page_url': page_url,
                    'task_category': task_category
                }
                # Status update deliberately disabled in this copy-table variant.
                #store.execute_sql(sql_update_status, 1, task_id)
                rds.rds.rpush(conf.REDIS_START_URLS, mp)
                print('track: %s' % task_id)
        store.close()
    except Exception as err:
        print('scan_database raise a error: {!r}'.format(err))
    return sign
def data_insert(rds):
    """Drain Config.REDIS_DATA_LIST and insert items by table type.

    Dispatches each popped ``{"table": ..., "data": {...}}`` item to the
    sku, top-category, grade, or review insert statement; unknown tables
    are reported and dropped.  Failed inserts are pushed onto
    ``Config.REDIS_DATA_ERROR`` with the error text attached.

    :param rds: project Redis wrapper exposing ``exists_key`` and a raw
        ``rds`` client (rpop/lpush)
    """
    if rds.exists_key(Config.REDIS_DATA_LIST):
        store = AmazonStorePro(**MYSQL_CONFIG_LOCAL)  # switch between server/local DB
        while rds.exists_key(Config.REDIS_DATA_LIST):
            item = rds.rds.rpop(Config.REDIS_DATA_LIST)
            item_json = json.loads(item)
            table = item_json['table']
            print(table)
            data = item_json['data']
            try:
                if table in (Config.MYSQL_TABLE_SKU, Config.MYSQL_TABLE_SKU_TRACK):
                    store.execute_sql(sql_sku.format(table), data['scgs_uuid'],
                                      data['scgs_products_id'], data['scgs_url_id'],
                                      data['scgs_brand'], data['scgs_product_url'],
                                      data['scgs_name'], data['scgs_firstTitle'],
                                      data['scgs_secondTitle'], data['scgs_original_price'],
                                      data['scgs_price'], data['scgs_max_price'],
                                      data['scgs_discount'], data['scgs_dispatch'],
                                      data['scgs_shipping'], data['scgs_currency'],
                                      data['scgs_attribute'], data['scgs_version_urls'],
                                      data['scgs_review_count'], data['scgs_grade_count'],
                                      data['scgs_sales_total'], data['scgs_total_inventory'],
                                      data['scgs_favornum'], data['scgs_image_url'],
                                      data['scgs_extra_image_urls'], data['scgs_description'],
                                      data['scgs_category'], data['scgs_category_url'],
                                      data['scgs_tags'], data['scgs_shop_name'],
                                      data['scgs_shop_url'], data['scgs_generation_time'],
                                      data['scgs_platform'], data['scgs_platform_url'],
                                      data['scgs_crawl_time'], data['scgs_create_time'],
                                      data['scgs_status'], data['scgs_questions'],
                                      data['scgs_is_delete'], data['scgs_reserve_field_1'],
                                      data['scgs_reserve_field_2'], data['scgs_reserve_field_3'],
                                      data['scgs_reserve_field_4'], data['scgs_reserve_field_5'],
                                      data['scgs_reserve_field_6'], data['scgs_reserve_field_7'])
                elif table == Config.MYSQL_TABLE_TOP_CATE:
                    store.execute_sql(sql_cate.format(table), data['category'], data['url'],
                                      data['entry'], data['rank'])
                elif table == Config.MYSQL_TABLE_GRADE:
                    store.execute_sql(sql_grade.format(table), data['rc_listing_id'],
                                      data['rc_product_id'], data['rc_product_url'],
                                      data['rc_platform'], data['rc_site'],
                                      data['rc_grade_overall'], data['rc_grade_1_count'],
                                      data['rc_grade_2_count'], data['rc_grade_3_count'],
                                      data['rc_grade_4_count'], data['rc_grade_5_count'],
                                      data['rc_reviews_count'], data['rc_reviews_url'],
                                      data['is_delete'], data['rc_reserve_1'],
                                      data['rc_reserve_2'], )
                elif table == Config.MYSQL_TABLE_REVIEW:
                    store.execute_sql(sql_review.format(table), data['rc_listing_id'],
                                      data['rc_product_id'], data['rc_product_url'],
                                      data['rc_platform'], data['rc_site'],
                                      data['rc_review_id'], data['rc_review_time'],
                                      data['rc_customer'], data['rc_customer_email'],
                                      data['rc_review_title'], data['rc_review_content'],
                                      data['rc_review_image'], data['rc_review_grade'],
                                      data['rc_review_grade_detail'], data['is_delete'],
                                      data['rc_reserve_1'], data['rc_reserve_2'],
                                      data['rc_reserve_3'], )
                else:
                    print('wrong table name')
            except Exception as exp:
                # Park failed items on the error list for later inspection.
                traceback.print_exc()
                item_json['error'] = '{!r}'.format(exp)
                rds.rds.lpush(Config.REDIS_DATA_ERROR, json.dumps(item_json))
        print('finished insert')
        store.close()
    else:
        print('no item')
        time.sleep(30)
# -*- coding: utf-8 -*- from store import AmazonStorePro from settings import MYSQL_CONFIG_SERVER import re store_server = AmazonStorePro(**MYSQL_CONFIG_SERVER) sql_insert = ( "insert into crawler_amazon_track_task(wtc_task_type, wtc_task_frequency, wtc_task_period," "wtc_task_info, wtc_task_category,wtc_task_site,wtc_status, wtc_is_delete, wtc_create_time)" "values(%s,%s,%s,%s,%s,%s,%s,%s,now())") sql_select = ( "select wtc_id from crawler_amazon_track_task where wtc_task_category=%s and wtc_task_type=%s " "and wtc_task_site=%s") category_lst = [] with open('category', encoding='utf-8') as f: for line in f: if len(line) > 0: line = '>'.join([item.strip() for item in line.strip().split('>')]) category_lst.append(line) url_lst = [] with open('url', encoding='utf-8') as f: for line in f: if len(line) > 0: line = line.strip() url_lst.append(line) print(len(category_lst)) print(len(url_lst))
def scan_database(rds, conf):
    """Scan crawler_amazon_track_task for due tracking tasks and enqueue them.

    Selects every task with wtc_status=1 and wtc_is_delete=0 from the server
    database.  A task never crawled before (wtc_crawl_time is NULL) is
    enqueued immediately; otherwise it is enqueued once ``wtc_task_frequency``
    days have elapsed since the last crawl, and marked finished (status 2)
    when the next run would fall outside the ``wtc_task_period`` window.

    :param rds:  redis wrapper exposing a raw client at ``rds.rds``
    :param conf: config object providing ``REDIS_START_URLS``
    :return: True if at least one task was enqueued; False otherwise (errors
             are printed, not raised, and also yield False).
    """
    sign = False
    try:
        store = AmazonStorePro(**MYSQL_CONFIG_SERVER)  # switch between server-side / local DB config
        sql_update_status = "update crawler_amazon_track_task set wtc_status=%s, wtc_crawl_time=now() where wtc_id=%s"
        sql_select_track = (
            "select wtc_id, wtc_task_type, wtc_task_frequency, wtc_task_period, wtc_task_info,wtc_task_category, "
            "wtc_crawl_time, wtc_create_time, wtc_task_site from crawler_amazon_track_task "
            "where wtc_status=%s and wtc_is_delete=%s")

        def _enqueue(task_id, task_type, task_info, task_category):
            # Shared tail of both branches (was duplicated verbatim): mark the
            # task running and push the crawl request onto the start-URL queue.
            # NOTE(review): ``mp`` is a dict; stock redis-py rpush only accepts
            # bytes/str/int/float, so either ``rds.rds`` serializes values or
            # this should be json.dumps(mp) — confirm against the worker side.
            page_url = task_info.strip()
            mp = {
                'entry': task_type,
                'page_url': page_url,
                'task_category': task_category
            }
            store.execute_sql(sql_update_status, 1, task_id)
            rds.rds.rpush(conf.REDIS_START_URLS, mp)
            print('track: %s' % task_id)

        rows = store.execute_sql(sql_select_track, 1, 0)
        for row in rows:
            task_id = row['wtc_id']
            task_type = row['wtc_task_type']
            task_frequency = row['wtc_task_frequency']
            task_period = row['wtc_task_period']
            task_info = row['wtc_task_info']
            task_category = row['wtc_task_category']
            start_track_time = row['wtc_create_time']
            last_track_time = row['wtc_crawl_time']
            if last_track_time is not None:
                # Already crawled at least once: compute the tracking window.
                end_track_time = start_track_time + datetime.timedelta(
                    days=task_period)
                next_track_time = last_track_time + datetime.timedelta(
                    days=task_frequency)
                # strptime(strftime(...)) truncates to whole seconds.
                now_time = datetime.datetime.strptime(
                    time.strftime("%Y-%m-%d %H:%M:%S"), "%Y-%m-%d %H:%M:%S")
                if next_track_time > end_track_time:
                    # Next run falls outside the task period: mark finished.
                    # NOTE(review): the task may still be enqueued just below
                    # (re-setting status to 1) when now_time > next_track_time;
                    # confirm whether this branch should skip the task instead.
                    store.execute_sql(sql_update_status, 2, task_id)
                if now_time > next_track_time:
                    sign = True
                    _enqueue(task_id, task_type, task_info, task_category)
            else:
                # First collection: enqueue immediately.
                sign = True
                _enqueue(task_id, task_type, task_info, task_category)
        store.close()
    except Exception as err:
        print('scan_database raise a error: {!r}'.format(err))
    return sign
# -*- coding: utf-8 -*-
from store import AmazonStorePro
from settings import MYSQL_CONFIG_SERVER
import re

# Setup script: load the 'category' and 'url' files so the rows can be
# registered as tracking tasks in crawler_amazon_track_task.
store_server = AmazonStorePro(**MYSQL_CONFIG_SERVER)

sql_insert = ("insert into crawler_amazon_track_task(wtc_task_type, wtc_task_frequency, wtc_task_period,"
              "wtc_task_info, wtc_task_category,wtc_task_site,wtc_status, wtc_is_delete, wtc_create_time)"
              "values(%s,%s,%s,%s,%s,%s,%s,%s,now())")
sql_select = ("select wtc_id from crawler_amazon_track_task where wtc_task_category=%s and wtc_task_type=%s "
              "and wtc_task_site=%s")

category_lst = []
with open('category', encoding='utf-8') as f:
    for line in f:
        # Collapse whitespace around the '>' breadcrumb separators.
        line = '>'.join(item.strip() for item in line.strip().split('>'))
        # BUG FIX: ``len(line) > 0`` on the raw line is always true for lines
        # read from a file (they keep their newline); blank lines therefore
        # used to land in the list as empty strings. Filter after stripping.
        if line:
            category_lst.append(line)

url_lst = []
with open('url', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # skip blank lines instead of collecting ''
            url_lst.append(line)

# Both counts should match: one URL per category row.
print(len(category_lst))
print(len(url_lst))
from store import AmazonStorePro, AmazonRedis
from settings import MYSQL_CONFIG_LOCAL, MYSQL_CONFIG_SERVER, REDIS_CONFIG_LOCAL

# Maintenance script: reconcile amazon_top_category rows with the
# crawler_amazon_track_task table for the 'com' (amazon.com) site.
amazon_store = AmazonStorePro(**MYSQL_CONFIG_LOCAL)

# All known top categories with their entry type and URL.
sql_1 = "select category, entry, url from amazon_top_category"
# Look up an existing track task by category/type on site 'com'.
sql_2 = "select wtc_id from crawler_amazon_track_task where wtc_task_category=%s and wtc_task_type=%s " \
        "and wtc_task_site='com'"
# Refresh the task URL for an existing category/type pair.
sql_3 = "update crawler_amazon_track_task set wtc_task_info=%s where wtc_task_category=%s and wtc_task_type=%s " \
        "and wtc_task_site='com'"
# Register a new task (status 3, not deleted) for a category/type pair.
sql_4 = "insert into crawler_amazon_track_task(wtc_task_category, wtc_task_type, wtc_task_info, wtc_task_site, " \
        "wtc_status, wtc_is_delete, wtc_create_time)values(%s,%s,%s,'com', 3, 0, now())"
# Every existing 'com' task, for the reverse direction of the reconciliation.
sql_5 = "select wtc_id, wtc_task_category,wtc_task_type from crawler_amazon_track_task where wtc_task_site='com'"
# Does a category/entry pair still exist in amazon_top_category?
sql_6 = "select id from amazon_top_category where category=%s and entry=%s"
# Retire a task (status 5) whose category has disappeared.
sql_7 = "update crawler_amazon_track_task set wtc_status=5 where wtc_id=%s"
sql_truncate = "truncate amazon_top_category"

rows = amazon_store.execute_sql(sql_1)
rows_set = set()
for row in rows:
    # Deduplicate on (category, entry).
    # NOTE(review): ``rows_set`` is never populated anywhere in the visible
    # code, so this check is currently a no-op — the script appears to be
    # truncated here (or ``rows_set.add(key)`` is missing); confirm against
    # the full file before relying on the dedup behavior.
    key = '{}{}'.format(row['category'], row['entry'])
    if key in rows_set:
        continue
def scan_database(rds, conf):
    """Scan crawler_amazon_track_task_copy for due tracking tasks and enqueue them.

    Local-DB variant of the track scanner: selects every task with
    wtc_status=1 and wtc_is_delete=0 from ``crawler_amazon_track_task_copy``.
    A task never crawled before is enqueued immediately; otherwise it is
    enqueued once ``wtc_task_frequency`` days have elapsed since the last
    crawl, and marked finished (status 2) when the next run would fall outside
    the ``wtc_task_period`` window.  Unlike the server variant, the
    status-1 update on enqueue is deliberately commented out here.

    :param rds:  redis wrapper exposing a raw client at ``rds.rds``
    :param conf: config object providing ``REDIS_START_URLS``
    :return: True if at least one task was enqueued; False otherwise (errors
             are printed, not raised, and also yield False).
    """
    sign = False
    try:
        store = AmazonStorePro(**MYSQL_CONFIG_LOCAL)  # switch between server-side / local DB config
        sql_update_status = "update crawler_amazon_track_task set wtc_status=%s, wtc_crawl_time=now() where wtc_id=%s"
        sql_select_track = (
            "select wtc_id, wtc_task_type, wtc_task_frequency, wtc_task_period, wtc_task_info,wtc_task_category, "
            "wtc_crawl_time, wtc_create_time, wtc_task_site from crawler_amazon_track_task_copy "
            "where wtc_status=%s and wtc_is_delete=%s")

        def _enqueue(task_id, task_type, task_info, task_category):
            # Shared tail of both branches (was duplicated verbatim): push the
            # crawl request onto the start-URL queue.
            # NOTE(review): ``mp`` is a dict; stock redis-py rpush only accepts
            # bytes/str/int/float — confirm ``rds.rds`` serializes values.
            page_url = task_info.strip()
            mp = {'entry': task_type, 'page_url': page_url,
                  'task_category': task_category}
            # store.execute_sql(sql_update_status, 1, task_id)
            # ^ status update intentionally disabled in this variant
            #   (disabled in the original code as well).
            rds.rds.rpush(conf.REDIS_START_URLS, mp)
            print('track: %s' % task_id)

        rows = store.execute_sql(sql_select_track, 1, 0)
        for row in rows:
            task_id = row['wtc_id']
            task_type = row['wtc_task_type']
            task_frequency = row['wtc_task_frequency']
            task_period = row['wtc_task_period']
            task_info = row['wtc_task_info']
            task_category = row['wtc_task_category']
            start_track_time = row['wtc_create_time']
            last_track_time = row['wtc_crawl_time']
            if last_track_time is not None:
                # Already crawled at least once: compute the tracking window.
                end_track_time = start_track_time + datetime.timedelta(days=task_period)
                next_track_time = last_track_time + datetime.timedelta(days=task_frequency)
                # strptime(strftime(...)) truncates to whole seconds.
                now_time = datetime.datetime.strptime(time.strftime("%Y-%m-%d %H:%M:%S"), "%Y-%m-%d %H:%M:%S")
                if next_track_time > end_track_time:
                    # Next run falls outside the task period: mark finished.
                    # NOTE(review): the task may still be enqueued just below
                    # when now_time > next_track_time — confirm whether this
                    # branch should skip the task instead.
                    store.execute_sql(sql_update_status, 2, task_id)
                if now_time > next_track_time:
                    sign = True
                    _enqueue(task_id, task_type, task_info, task_category)
            else:
                # First collection: enqueue immediately.
                sign = True
                _enqueue(task_id, task_type, task_info, task_category)
        store.close()
    except Exception as err:
        print('scan_database raise a error: {!r}'.format(err))
    return sign