def start_requests(self):
    # Claim profileIDs and mark them as in progress
    # (currently only crawling the top-10000 reviewers of a single site).
    mysql_server = Mysql_server()
    cursor = mysql_server.get_cursor()
    cursor.execute(
        "select profileID,country from product_toplistreviews where state=0 and country='jp' limit 300")
    task_list = cursor.fetchall()
    print(task_list)
    for task in task_list:
        if task[0] != "":
            task = {'profileID': task[0], 'country': task[-1]}
            params = (task['profileID'], task['country'])
            update_sql = """update product_toplistreviews set state=1 where profileID=%s and country=%s"""
            cursor.execute(update_sql, params)
    mysql_server.conn.commit()
    mysql_server.close()
    for task in task_list:
        if task[0] != "":
            task = {'profileID': task[0], "country": task[-1]}
            # e.g. https://www.amazon.com/hz/gamification/api/contributor/dashboard/amzn1.account.AFQ7TVKKSLR6C5MSZDWAYMR2OPCA
            url = ("https://www.amazon.%s/" % self.country_site[task['country']]
                   + "hz/gamification/api/contributor/dashboard/amzn1.account.%s" % task["profileID"])
            self.headers['Referer'] = url
            self.headers['cookie'] = self.cookie_dict[task['country']]
            yield scrapy.Request(url,
                                 meta={'country': task['country'],
                                       'profileID': task['profileID']},
                                 headers=self.headers,
                                 callback=self.parse,
                                 dont_filter=True)
def start_requests(self):
    # Claim sellerIDs and mark them as in progress.
    mysql_server = Mysql_server()
    cursor = mysql_server.get_cursor()
    cursor.execute(
        "select sellerID,country from product_detail where state=0 and seller_type != 1 and sellerID !=''")
    task_list = cursor.fetchall()
    # task_list = (("AODNN2DNYDROD", "us"),)
    print(task_list)
    for task in task_list:
        if task[0] != "":
            task = {'sellerID': task[0], 'country': task[-1]}
            params = (task['sellerID'], task['country'])
            update_sql = """update product_detail set state=1 where sellerID=%s and country=%s"""
            cursor.execute(update_sql, params)
    mysql_server.conn.commit()
    mysql_server.close()
    # Build and send the request for each claimed seller.
    for task in task_list:
        if task[0] != "":
            task = {'sellerID': task[0], "country": task[-1]}
            url = ("https://www.amazon.%s/sp" % self.country_site[task['country']]
                   + "?seller=%s&th=1&psc=1&language=en_US" % task["sellerID"])
            self.headers['Referer'] = url
            self.headers['cookie'] = self.cookie_dict[task['country']]
            yield scrapy.Request(url,
                                 meta={'country': task['country'],
                                       'sellerID': task['sellerID'],
                                       'retry_number': 0},
                                 callback=self.parse,
                                 dont_filter=True)
def start_requests(self):
    # Fetch tasks and update their state.
    for country in self.country_list:
        mysql_server = Mysql_server()
        cursor = mysql_server.get_cursor()
        table_name = country + "_asins"
        cursor.execute(f"select distinct(asin) from {table_name} where state=5")
        task_list = cursor.fetchall()
        print(task_list)
        # task_list = (("B0753H1Z7L",),)  # test case
        for task in task_list:
            task = {'asin': task[0], 'countrycode': country}
            params = (task['asin'],)
            update_sql = f"""update {table_name} set state=6 where asin=%s"""
            cursor.execute(update_sql, params)
        mysql_server.conn.commit()
        mysql_server.conn.close()
        for task in task_list:
            task = {'asin': task[0], 'countrycode': country}
            page_num = 1
            url = (f'https://{self.countryCodeArr[task["countrycode"]]}/product-reviews/{task["asin"]}'
                   f'/ref=cm_cr_arp_d_viewopt_fmt?reviewerType=all_reviews&pageNumber={page_num}'
                   f'&formatType=current_format&filterByStar=critical&language=en_US')
            if self.task_type == 1:
                url = (f'https://{self.countryCodeArr[task["countrycode"]]}/product-reviews/{task["asin"]}'
                       f'?reviewerType=all_reviews&pageNumber={page_num}#reviews-filter-bar&language=en_US')
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                meta={'page_num': page_num,
                      'countrycode': task['countrycode'],
                      'asin': task['asin'],
                      'table_name': table_name},
                callback=self.parse,
                dont_filter=True)
def start_requests(self):
    # Domain suffix for each marketplace.
    self.web_country = {
        "us": "com", 'uk': "co.uk", 'es': 'es', 'fr': 'fr', 'it': 'it',
        'au': "com.au", 'ca': "ca", 'jp': 'co.jp', 'de': 'de'
    }
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        "cookie": "session-id=145-5209209-9478023; i18n-prefs=USD; ubid-main=133-8276981-7688751; x-wl-uid=1PCOyx0JI1kz7vWchZyMhRWJtqj1XoQoE0UNJPLhOT/Q8+kepq170hFhtVj1OBOSit46HW9f+Rz8=; lc-main=en_US; session-id-time=2082787201l; session-token=3TtwIpr/LCK/R5dUusiKqRfu1FQJmG80o4BC0knm7brPg8aelaJ+f/B16GedWlTyDSjn8qQo3s3PmGmw5mHywT8RWHthFHuduD76fCQKbeUHR0G/OJ4sj2eZxXUoxgcWn+a+xbKm+Rpj5ciXMPsk4ObS1HmuF5NFMFttjbT4ZsWQBxh5Ak9x1hxbsqNIrrrW; csm-hit=tb:0YBA58R18R2BQ1H4SWX6+b-0YBA58R18R2BQ1H4SWX6|1592453272955&t:1592453272955&adb:adblk_yes"
    }
    # Claim keyword tasks, mark them as in progress, then send the first request.
    mysql_server = Mysql_server()
    cursor = mysql_server.get_cursor()
    cursor.execute("select keyword, country from keywords where state=0")
    word_list = cursor.fetchall()
    for task in word_list:
        task = task[0]
        params = (task,)
        update_sql = """update keywords set state=1 where keyword=%s"""
        cursor.execute(update_sql, params)
    mysql_server.conn.commit()
    mysql_server.close()
    # Build the initial search-results URL for each keyword.
    for word in word_list:
        keyword = word[0]
        country = word[-1]
        self.headers['cookie'] = self.cookie_dict[country]
        url_word = '+'.join(keyword.split())
        headers = self.headers
        headers['origin'] = 'https://www.amazon.' + self.web_country[country]
        url = "https://www.amazon.{}/s?k={}&qid=1551418823".format(
            self.web_country[country], url_word)
        self.headers['referer'] = "https://www.amazon." + self.web_country[country]
        yield scrapy.Request(url=url,
                             meta={"page": 1,
                                   "keyword": keyword,
                                   'absolute_position': 0,
                                   'country': country,
                                   'retries': 0,
                                   'result_num': 0},
                             callback=self.parse,
                             dont_filter=True)
def start_requests(self):
    # For each marketplace, claim ASIN tasks and flip state 0 -> 1 (being crawled).
    for country in self.country_list:
        mysql_server = Mysql_server()
        cursor = mysql_server.get_cursor()
        table_name = country + '_asins'
        cursor.execute(f"select asin from {table_name} where state=0 limit 10")
        task_list = cursor.fetchall()
        print(task_list)
        for task in task_list:
            task = {'asin': task[0], 'country': country}
            params = (task['asin'],)
            update_sql = f"""update {table_name} set state=1 where asin=%s"""
            cursor.execute(update_sql, params)
        mysql_server.conn.commit()
        mysql_server.close()
        # Test cases:
        # print(task_list)
        # asin_list = [{'country': 'it', 'asin': 'B07VB8WXNF'}]
        # asin_list = [{'countrycode': 'us', 'asin': 'B07FJGGWJL'}, {'countrycode': 'de', 'asin': 'B00Y211AFM'},
        #              {'countrycode': 'fr', 'asin': 'B00GS19NWG'}, {'countrycode': 'uk', 'asin': 'B0000E5SEQ'},
        #              {'countrycode': 'it', 'asin': 'B07VMRB2K1'}, {'countrycode': 'es', 'asin': 'B07FPFKL4X'},
        #              {'countrycode': 'ca', 'asin': 'B00XMD7KPU'}, {'countrycode': 'au', 'asin': 'B075FQY5BN'},
        #              {'countrycode': 'jp', 'asin': 'B07RPVQY62'}]
        # Normalise the data and send the detail-page requests.
        for task in task_list:
            task = {'asin': task[0], 'country': country}
            if len(task['asin']) > 10 and '?' in task['asin']:
                asin = task['asin'].split('/')[-1].split('?')[0]
                if len(asin) == 10:
                    asin = asin.upper()
            elif len(task['asin']) == 10:
                asin = task['asin'].upper()
            url = ("https://www.amazon.%s/dp/" % self.country_site[task['country']]
                   + task['asin'] + '?th=1&psc=1&language=en_US')
            # e.g. https://www.amazon.co.uk/dp/B071GYJTST?th=1&psc=1&language=en_US
            # self.headers['Referer'] = url
            # self.headers['cookie'] = self.cookie_dict[country]
            yield scrapy.Request(url,
                                 meta={'country': task['country'],
                                       'asin': task['asin'],
                                       'type': "init",
                                       'retry_number': 0,
                                       'table_name': table_name},
                                 callback=self.parse,
                                 dont_filter=True)
class SorftimePipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        cursor = self.mysql_server.get_cursor()
        params = (item['asin'], item['category_name'], item['rank'], item['bsr_url'],
                  item['category_id'], item['sales'], item['country'], item['brand'],
                  item['SalePrice'], item['product_url'], item['level'])
        sql = """insert into sorftime_sales_other
                 (id, asin, category_name, `rank`, bsr_url, category_id, sales, country,
                  brand, SalePrice, product_url, level)
                 values (0, %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                 on duplicate key update sales=values(sales)"""
        cursor.execute(sql, params)
        # timestamp = int(time.time())
        # table_name = item['country'] + '_asins'
        # params = (item['asin'], 1, item['rank'],
        #           'bsr_' + item['category_id'], timestamp, 0)
        # sql = f"""insert into {table_name} (id, asin, pageNum, positionNum, keyword, timestamp, state, ad)
        #           values (0, %s, %s, %s, %s, %s, 0, %s)"""
        # cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
class SellerspritePipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        cursor = self.mysql_server.get_cursor()
        for sale_data in item['total_dict'].items():
            # m_add (2020-10-07): only keep the current month's data.
            if sale_data[0] != '2020-10':
                continue
            params = (item['asin'], sale_data[-1], sale_data[0])
            sql = """insert into product_sales (id, asin, sales, date) values (0, %s,%s,%s)
                     on duplicate key update sales=values(sales)"""
            cursor.execute(sql, params)
        for rankdata in item['rankHistory'].items():
            params = (item['asin'], rankdata[-1], rankdata[0])
            sql = """insert into product_sales (id, asin, sales, date) values (0, %s, %s, %s)
                     on duplicate key update sales=values(sales)"""
            # sql = """insert into product_rank (id, asin, `rank`, date) values (0, %s,%s,%s)
            #          on duplicate key update `rank`=values(`rank`)"""
            # print(sql % params, '======================')
            cursor.execute(sql, params)
        params = (item['asin'],)
        sql = f"""update {item['table_name']} set state=4 where asin=%s"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
def parse_keepa(self, response):
    # Extract the 30-day average rank from keepa.
    print('extracting 30-day average rank from keepa')
    retry_number = response.meta["retry_number"]
    item = response.meta["item"]
    table_name = response.meta["table_name"]
    status = response.status
    url = response.url
    asin = response.meta["asin"]
    country = response.meta['country']
    data = json.loads(response.body)
    # item['avg30'] = data['products'][0]["stats"].get("avg30", ['', '', '', ''])[3]
    avg_data = data['products'][0].get("stats", {})
    if avg_data is not None:
        item['avg30'] = avg_data.get("avg30", ['', '', '', ''])[3]
    else:
        item['avg30'] = 0
    timedata = data['products'][0].get("trackingSince", 0)
    if timedata != 0:
        timestamp = (timedata + 21564000) * 60
        if country in ['us', 'jp', 'fr', 'uk', 'es']:
            item['product_info']["Date First Available"] = \
                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp)).split(' ')[0]
        else:
            item['product_info']["Im Angebot von Amazon.de seit"] = \
                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp)).split(' ')[0]
        yield item
    else:
        yield item
    mysql_server = Mysql_server()
    cursor = mysql_server.get_cursor()
    params = (asin,)
    update_sql = f"""update {table_name} set state=2 where asin=%s"""
    cursor.execute(update_sql, params)
    mysql_server.conn.commit()
    mysql_server.close()
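# Note on the conversion above: keepa's trackingSince field is expressed in "keepa minutes",
# and the spider shifts it onto the unix epoch with the fixed offset 21564000 before
# multiplying by 60. A minimal standalone sketch of that conversion (the sample value in the
# comment below is made up):
import time

def keepa_minutes_to_date(keepa_minutes):
    """Convert a keepa minute timestamp (e.g. trackingSince) to a YYYY-MM-DD string."""
    unix_seconds = (keepa_minutes + 21564000) * 60  # same offset as parse_keepa above
    return time.strftime('%Y-%m-%d', time.localtime(unix_seconds))

# keepa_minutes_to_date(5000000)  # hypothetical value, lands roughly in mid-2020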
class AmzToplistReviewsPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        timestamp = int(int(time.time()) / 86400) * 86400
        cursor = self.mysql_server.get_cursor()
        params = (item['profileID'], item['country'], timestamp)
        sql = """insert ignore into product_toplistreviews (id, profileID, country, timestamp)
                 values (0, %s, %s, %s)"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
class AmzReviewsPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        timestamp = int(int(time.time()) / 86400) * 86400
        cursor = self.mysql_server.get_cursor()
        params = (item['reviewID'], item['review_time'], item['review_raiting'],
                  item['helpful_num'], item['review_title'], item['review_body'],
                  item['is_VP'], item['asin'], item['profileID'], item['country'],
                  timestamp)
        sql = """insert into product_reviews
                 (reviewID, review_time, review_raiting, helpful_num, review_title,
                  review_body, is_VP, asin, profileID, country, timestamp)
                 values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
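# The exact schema of product_reviews is not shown in this repo. A minimal sketch that matches
# the columns the pipeline inserts (column names, including the `review_raiting` spelling, are
# taken from the insert statement above; all types and the primary key are assumptions):
#
#   CREATE TABLE product_reviews (
#       reviewID       VARCHAR(64) PRIMARY KEY,
#       review_time    DATE,
#       review_raiting FLOAT,
#       helpful_num    INT,
#       review_title   TEXT,
#       review_body    TEXT,
#       is_VP          TINYINT,
#       asin           VARCHAR(16),
#       profileID      VARCHAR(64),
#       country        VARCHAR(8),
#       timestamp      BIGINT
#   );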
class AmzProductPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()
        self.client = pymongo.MongoClient('127.0.0.1', 27017, maxPoolSize=100)
        self.db = self.client.amazon
        self.collection = self.db.detail_info

    def process_item(self, item, spider):
        timestamp = int(time.time())
        cursor = self.mysql_server.get_cursor()
        params = (item['asin'], item['seller_type'], item['seller_num'], item['brand'],
                  item['price'], item['listing_rating'], item['ratings'],
                  item['stock_status'], item['QA_num'], timestamp, item['sellerName'],
                  item['sellerID'], item['country'], item['reviews'],
                  item['actual_reviews'], item['critical'], item['vp_num'],
                  item['product_style'], item["avg30"])
        detail_sql = """insert into product_detail
                        (id, asin, seller_type, seller_num, brand, price, listing_rating,
                         ratings, stock_status, QA_num, timestamp, sellerName, sellerID,
                         country, reviews, actual_reviews, critical, vp_num, product_style, avg30)
                        values (0, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        cursor.execute(detail_sql, params)
        self.mysql_server.conn.commit()
        rank_sql = """insert into product_rankinfo
                      (id, asin, categoryID, category, `rank`, timestamp, country)
                      values (0, %s, %s, %s, %s, %s, %s)"""
        rank_list = item['rank_list']['ranks']
        for data in rank_list:
            params = (item['asin'], data['catId'], data['name'], int(data['rank']),
                      timestamp, item['country'])
            cursor.execute(rank_sql, params)
        self.mysql_server.conn.commit()
        target_data = {"asin": item['asin'],
                       "frequently_bought_asins": item['frequently_bought_asins'],
                       "title": item['title'],
                       "img_list": item['img_list'],
                       "variant_list": item['variant_list'],
                       "parentasin": item['parentasin'],
                       "vari_num": item['vari_num'],
                       "feature": item['feature'],
                       "product_info": json.dumps(item['product_info']),
                       "product_descript": json.dumps(item['product_descript']),
                       "compare_info": json.dumps(item['compare_info']),
                       "other_info": item['other_info'],
                       'country': item['country'],
                       "timestamp": timestamp}
        str_data = json.dumps(target_data, ensure_ascii=False)
        target_data = json.loads(str_data)
        self.collection.insert_one(target_data)
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
        self.client.close()
class AmzKeywordsPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        timestamp = int(time.time())
        cursor = self.mysql_server.get_cursor()
        table_name = item['country'] + '_asins'
        params = (item['asin'], item['pageNum'], item['positionNum'],
                  item['keyword'], timestamp, item['ad'])
        sql = f"""insert into {table_name}
                  (id, asin, pageNum, positionNum, keyword, timestamp, state, ad)
                  values (0, %s, %s, %s, %s, %s, 0, %s)"""
        # print(sql, params)
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
class AmzProfilePipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        cursor = self.mysql_server.get_cursor()
        params = (item['profileID'],)
        sql = """update product_toplistreviews set state=2 where profileID=%s"""
        cursor.execute(sql, params)
        params = (item['profileID'], item['helpfulVotes'], item['reviews'],
                  item['location'], item['occupation'], item['facebook'],
                  item['twitter'], item['instagram'], item['youtube'],
                  item['country'], item['rank'], item['name'])
        sql = """insert into profile_info
                 (id, profileID, helpfulVotes, reviews, location, occupation, facebook,
                  twitter, instagram, youtube, country, `rank`, name)
                 values (0, %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                 on duplicate key update reviews=values(reviews), `rank`=values(`rank`)"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
class AmzSellerPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        item = item['data']
        timestamp = int(time.time())
        cursor = self.mysql_server.get_cursor()
        params = (item['sellerID'], item['negative_lifetime'],
                  item['count_lifetime'], timestamp, item['country'])
        sql = """insert into seller_info
                 (id, sellerID, negative_lifetime, count_lifetime, timestamp, country)
                 values (0, %s, %s, %s, %s, %s)"""
        # print(sql, params)
        cursor.execute(sql, params)
        params = (item['sellerID'],)
        sql = """update product_detail set state=2 where sellerID=%s"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
class AmzbsrPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        # Round the timestamp down to a 30-day bucket.
        timestamp = int(int(time.time()) / 86400 / 30) * 86400 * 30
        cursor = self.mysql_server.get_cursor()
        if item['flag'] == 1:
            params = (item['category_id'], item['category_name'],
                      item['category_id'] + f'_{timestamp}', item['level'],
                      item['parent_id'], 0, item['bsr_url'], item['country'])
            sql = """insert into amz_category
                     (id, category_id, category_name, node_name, uid, level, parent_id, state, bsr_url, country)
                     values (0, %s, '', %s, %s, %s, %s, %s, %s, %s)"""
            # params = (item['category_name'], item['category_id'], item['country'])
            # sql = """update amz_category set node_name=%s where category_id=%s and country=%s"""
            # print('----')
        elif item['flag'] == 0:
            return
            # Unreachable: this branch is disabled by the early return above.
            timestamp = int(time.time())
            table_name = item['data']['country'] + '_asins'
            params = (item['data']['asin'], 1, item['data']['rank'],
                      'bsr_' + item['data']['categoryId'], timestamp, 0)
            sql = f"""insert into {table_name}
                      (id, asin, pageNum, positionNum, keyword, timestamp, state, ad)
                      values (0, %s, %s, %s, %s, %s, 0, %s)"""
        elif item['flag'] == 2:
            print(f'======{item["category_name"]}=======')
            params = (item['category_name'], item['category_id'], item['country'])
            sql = """update amz_category set category_name=%s where category_id=%s and country=%s"""
        # try:
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        # except:
        #     pass
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
def start_requests(self):
    # Claim tasks per marketplace, normalise the data, update the state and send the requests.
    for country in self.country_list:
        mysql_server = Mysql_server()
        cursor = mysql_server.get_cursor()
        table_name = 'amz_category'
        cursor.execute(
            f"select category_id, category_name, level, bsr_url from {table_name} "
            f"where state=1 and country='{country}' and level<10 limit 10")
        task_list = cursor.fetchall()
        print(len(task_list), '=====')
        for task in task_list:
            task = {'category_id': task[0], 'category_name': task[1],
                    'level': task[2], 'country': country}
            params = (task['category_id'], task['level'])
            update_sql = f"""update {table_name} set state=2 where category_id=%s and level=%s"""
            cursor.execute(update_sql, params)
        mysql_server.conn.commit()
        mysql_server.close()
        for task in task_list:
            task = {'category_id': task[0], 'category_name': task[1], 'level': task[2],
                    'country': country, 'category_url': task[3]}
            # if task['level'] == 1:
            #     if country == 'us':
            #         if task['category_name'] == 'Cell Phones & Accessories':
            #             task['category_id'] = 'wireless'
            #         elif task['category_name'] == 'Electronics':
            #             task['category_id'] = 'electronics'
            #         elif task['category_name'] == 'Home & Kitchen':
            #             task['category_id'] = 'home-garden'
            #         else:
            #             task['category_id'] = '541966'
            #     elif country in ('uk', 'de', 'jp', 'es', 'fr'):
            #         if country == 'uk':
            #             if task['category_name'] == 'Electronics & Photo':
            #                 task['category_id'] = 'electronics'
            #             elif task['category_name'] == 'Home & Kitchen':
            #                 task['category_id'] = 'home-garden'
            #             else:
            #                 task['category_id'] = 'computers'
            #         if country == 'de':
            #             if task['category_name'] == 'Elektronik & Foto':
            #                 task['category_id'] = 'ce-de'
            #             else:
            #                 task['category_id'] = 'computers'
            #         if country == 'fr':
            #             if task['category_name'] == 'High-Tech':
            #                 task['category_id'] = 'electronics'
            #             else:
            #                 task['category_id'] = 'computers'
            #         if country == 'it':
            #             if task['category_name'] == 'Elettronica':
            #                 task['category_id'] = 'electronics'
            #             else:
            #                 task['category_id'] = 'computers'
            #         if country == 'es':
            #             if task['category_name'] == 'Electrónica':
            #                 task['category_id'] = 'electronics'
            #             else:
            #                 task['category_id'] = 'computers'
            #         if country == 'jp':
            #             if task['category_name'] == 'Electronics':
            #                 task['category_id'] = 'electronics'
            #             else:
            #                 task['category_id'] = 'computers'
            #     elif country in ('it',):
            #         task['category_id'] = 'pc'
            site_dict = {
                'us': '01', 'uk': '02', 'de': '03', 'fr': '04',
                'jp': '07', 'es': '08', 'it': '09',
            }
            url = (f'https://plug.sorftime.com/FlowCircle/QueryProductByNodeId'
                   f'?site={site_dict[country]}&token=Zkg4Q1Y4VllTUytIRWhiWFNpVGx4Zz09')
            self.headers['Referer'] = url
            data = (f'delive=0&ebc=0&bbx=0&NodeId={task["category_id"]}'
                    f'&times=0&ProductId=&Order=SaleCount&OrderType=desc')
            yield scrapy.Request(url,
                                 body=json.dumps(data),
                                 method='POST',
                                 meta={'country': task['country'],
                                       'category_id': task['category_id'],
                                       'category_name': task['category_name'],
                                       'category_url': task['category_url'],
                                       'level': task['level'],
                                       'retry_number': 0,
                                       'table_name': table_name},
                                 headers=self.headers,
                                 callback=self.parse,
                                 dont_filter=True)
def parse(self, response):
    # Update proxy status: a working proxy gets a high score (front of the pool),
    # a failing one gets a low score (back of the pool).
    proxy = response.meta.get('proxy', '').replace('https://', '').replace('http://', '')
    proxy_data = {"proxy": proxy, "fail_count": 0, "region": "", "type": "",
                  "source": "spider", "check_count": 20, "last_status": 0, "last_time": ""}
    page_num = response.meta['page_num']
    table_name = response.meta['table_name']
    asin = response.meta['asin']
    countrycode = response.meta['countrycode']
    # Captcha page: mark the proxy as bad and retry.
    if len(response.body) < 10000 or response.status == 503:
        proxy_data['fail_count'] = 18
        self.collection.hset(name="useful_proxy", key=proxy, value=json.dumps(proxy_data))
        yield scrapy.Request(url=response.url,
                             headers=self.headers,
                             meta={'page_num': page_num,
                                   'asin': asin,
                                   'table_name': table_name,
                                   'countrycode': countrycode},
                             callback=self.parse,
                             dont_filter=True)
        return
    elif response.status == 404:
        # The ASIN no longer exists; stop crawling it.
        # Todo: the task state should be reset/flagged here.
        return
    self.collection.hset(name="useful_proxy", key=proxy, value=json.dumps(proxy_data))
    # Extract the review list.
    data_list = response.xpath('//div[@id="cm_cr-review_list"]/div')
    if len(data_list) > 0:
        for data in data_list:
            item = {}
            item['reviewID'] = data.xpath('./@id').extract_first()  # review ID
            try:
                # reviewer profile ID
                item['profileID'] = \
                    data.xpath('./div/div/div//a[@class="a-profile"]/@href').extract_first().split('.')[2].split('/')[0]
            except:
                continue
            # review date
            item['review_time'] = data.xpath(
                './div/div/span[@data-hook="review-date"]/text()').extract_first()  # .split('on ')[-1]
            # review star rating
            item['review_raiting'] = float(
                data.xpath('.//div/div/div/a/@title').extract_first().split(' ')[0].replace(',', '.'))
            # review title
            item['review_title'] = data.xpath(
                './/div/div//a[@data-hook="review-title"]/span/text()').extract_first()
            if item['review_title'] is None:
                item['review_title'] = ""
            # review body
            item['review_body'] = data.xpath('string(.//span[@data-hook="review-body"]/span)').extract_first()
            # "helpful" vote count
            helpful_str = data.xpath('./div/div/div/div/span/div/span/text()').extract_first()
            if helpful_str is None:
                item['helpful_num'] = 0
            elif helpful_str.startswith('One') or helpful_str.startswith('Une') or \
                    helpful_str.startswith('A una') or helpful_str.startswith('Una') or \
                    helpful_str.startswith('Eine'):
                item['helpful_num'] = 1
            else:
                try:
                    item['helpful_num'] = int(helpful_str.split(' ')[1].replace(',', ''))
                except:
                    item['helpful_num'] = int(helpful_str.split(' ')[0].replace(',', ''))
                # item['helpful_num'] = helpful_str
            # Whether this is a Verified Purchase review.
            vp_str = data.xpath('./div/div/div/span/a/span/text()').extract_first()
            if vp_str in ['Verified Purchase', 'Verifizierter Kauf', 'Amazonで購入',
                          'Achat vérifié', 'Acquisto verificato', 'Compra verificada', '']:
                item['is_VP'] = 1  # VP review
            elif vp_str is None:
                item['is_VP'] = 0  # not a VP review
            else:
                item['is_VP'] = 2  # other cases, e.g. Early Reviewer Program reviews
            # ------------------------------ date parsing -----------------------------------
            review_time = item['review_time']
            if review_time != '':
                if countrycode == 'us' or countrycode == 'ca' or countrycode == 'jp':
                    review_time = review_time.split('on ')[-1]
                    review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%B %d, %Y"))
                elif countrycode == 'fr':
                    fr_month = ["janvier", "février", "mars", "avril", "mai", "juin", "juillet",
                                "août", "septembre", "octobre", "novembre", "décembre"]
                    review_time = review_time.split('le ')[-1]  # .decode('utf-8').encode("latin-1")
                    for each in range(12):
                        if fr_month[each] in review_time:
                            review_time = review_time.replace(fr_month[each], self.us_month[each])
                            break
                    review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d %B %Y"))
                elif countrycode == 'de':
                    de_month = ["Januar", "Februar", "März", "April", "Mai", "Juni", "Juli",
                                "August", "September", "Oktober", "November", "Dezember"]
                    review_time = review_time.split('vom ')[-1]  # .decode('utf-8').encode("latin-1")
                    for each in range(12):
                        if de_month[each] in review_time:
                            review_time = review_time.replace(de_month[each], self.us_month[each])
                            break
                    review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d. %B %Y"))
                elif countrycode == 'es':
                    es_month = ["enero", "febrero", "marzo", "abril", "mayo", "junio", "julio",
                                "agosto", "septiembre", "octubre", "noviembre", "diciembre"]
                    review_time = review_time.split('el ')[-1].replace('de ', '')  # .decode('utf-8').encode("latin-1")
                    for each in range(12):
                        if es_month[each] in review_time:
                            review_time = review_time.replace(es_month[each], self.us_month[each])
                            break
                    review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d %B %Y"))
                elif countrycode == 'it':
                    it_month = ["gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio",
                                "agosto", "settembre", "ottobre", "novembre", "dicembre"]
                    review_time = review_time.split('il ')[-1]  # .decode('utf-8').encode("latin-1")
                    for each in range(12):
                        if it_month[each] in review_time:
                            review_time = review_time.replace(it_month[each], self.us_month[each])
                            break
                    review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d %B %Y"))
                # elif countrycode == 'jp':
                #     review_time = '-'.join(re.findall('\d+', review_time))
                #     review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%Y-%m-%d"))
                elif countrycode == 'uk' or countrycode == 'au':
                    review_time = review_time.split('on ')[-1]
                    review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d %B %Y"))
            item['review_time'] = review_time
            item['asin'] = asin
            item['country'] = countrycode
            # ------------------------------ end date parsing -----------------------------------
            yield item
    if len(data_list) >= 10 and page_num == 1:
        # On the first page, extract the total review count and fan out the remaining pages concurrently.
        try:
            if countrycode == 'fr':
                reviews_count = \
                    response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                reviews_count = re.findall('sur ([0-9]+)', reviews_count)[0]
            elif countrycode == 'it':
                reviews_count = \
                    response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                # print(reviews_count, '------')
                reviews_count = re.findall('su ([0-9]+)', reviews_count)[0]
            elif countrycode == 'de':  # or countrycode == 'uk':
                reviews_count = \
                    response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                # print(reviews_count, '------')
                reviews_count = re.findall('von ([0-9]+)', reviews_count)[0]
            elif countrycode == 'uk':
                reviews_count = \
                    response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                # print(reviews_count, '------')
                reviews_count = re.findall('of ([0-9]+)', reviews_count)[0]
            elif countrycode == 'es':
                reviews_count = \
                    response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                # print(reviews_count, '------')
                reviews_count = re.findall('de ([0-9]+)', reviews_count)[0]
            else:
                reviews_count = \
                    response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().split(' ')[-2].replace(',', '')
        except:
            reviews_count = response.xpath('//div[@id="filter-info-section"]//span/text()').extract()
            if reviews_count != []:
                reviews_count = re.findall('\| (.*?) ', reviews_count[-1])[0]
                reviews_count = reviews_count.replace(',', '').replace('.', '').replace(' ', '')
            else:
                reviews_count = 0
        page_count = int(reviews_count) // 10 + 2  # 10 reviews per page; +2 covers the last partial page
        for page_num in range(2, page_count):
            url = (f'https://{self.countryCodeArr[countrycode]}/product-reviews/{asin}'
                   f'/ref=cm_cr_arp_d_viewopt_fmt?reviewerType=all_reviews&pageNumber={page_num}'
                   f'&formatType=current_format&filterByStar=critical&language=en_US')
            if self.task_type == 1:
                url = (f'https://{self.countryCodeArr[countrycode]}/product-reviews/{asin}'
                       f'?reviewerType=all_reviews&pageNumber={page_num}#reviews-filter-bar&language=en_US')
            yield scrapy.Request(
                url=url,
                headers=self.headers,
                meta={'page_num': page_num,
                      'asin': asin,
                      'countrycode': countrycode,
                      'table_name': table_name},
                callback=self.parse,
                dont_filter=True)
        mysql_server = Mysql_server()
        cursor = mysql_server.get_cursor()
        params = (asin,)
        # Once the parallel page requests have been scheduled, mark the task as crawled;
        # we do not track when the fan-out actually finishes.
        update_sql = f"""update {table_name} set state=7 where asin=%s"""
        cursor.execute(update_sql, params)
        mysql_server.conn.commit()
        mysql_server.close()
def start_requests(self):
    # Claim ASINs, build the requests and mark the tasks as in progress.
    for country in self.country_list:
        mysql_server = Mysql_server()
        cursor = mysql_server.get_cursor()
        table_name = country + '_asins'
        cursor.execute(f"select asin from {table_name} where state=2 limit 100")
        task_list = cursor.fetchall()
        print(task_list)
        for task in task_list:
            task = {'asin': task[0], 'country': country}
            params = (task['asin'],)
            update_sql = f"""update {table_name} set state=3 where asin=%s"""
            cursor.execute(update_sql, params)
        mysql_server.conn.commit()
        mysql_server.close()
        for task in task_list:
            task = {'asin': task[0], 'country': country}
            if len(task['asin']) > 10 and '?' in task['asin']:
                asin = task['asin'].split('/')[-1].split('?')[0]
                if len(asin) == 10:
                    asin = asin.upper()
            elif len(task['asin']) == 10:
                asin = task['asin'].upper()
            item = {}
            item['asin'] = task['asin']
            item['country'] = country
            item['table_name'] = table_name
            item['total_dict'] = {}
            item['rankHistory'] = {}
            # Marketplace suffix used by the amzscout API.
            code_dir = {"us": "COM", "uk": "CO_UK", "de": "DE", "fr": "FR",
                        "it": "IT", 'es': "ES", 'jp': "CO_JP"}
            url = f"https://www.amzscout.net/extensions/scoutpro/v1/products/{code_dir[country]}"
            # Per-marketplace account cookies / request headers.
            headers = {
                'us': {
                    "Host": "amzscout.net",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                    "x-instance-id": "a3fa3f86-9edd-4743-90f6-26dace59202b",
                    "x-signature": "5bb36a25caf0e17720e9a4a9e5cfa3ff",
                    "Cookie": "_ga=GA1.2.1638731990.1595486255; _gid=GA1.2.1082995224.1595486255; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS",
                    "Content-Type": "application/json", "Accept": "*/*", "Cache-Control": "no-cache",
                    "Postman-Token": "e21cd0f2-623c-4d26-9101-61e784be19f3",
                    "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Content-Length": "23",
                },
                'uk': {
                    "Host": "amzscout.net",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                    "x-instance-id": "a3fa3f86-9edd-4743-90f6-26dace59202b",
                    "x-signature": "8fad7d23e214780b9b8a5bf6e1c36887",
                    "Cookie": "_ga=GA1.2.1638731990.1595486255; _gid=GA1.2.1082995224.1595486255; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS",
                    "Content-Type": "application/json", "Accept": "*/*", "Cache-Control": "no-cache",
                    "Postman-Token": "e21cd0f2-623c-4d26-9101-61e784be19f3",
                    "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Content-Length": "23",
                },
                'de': {
                    "Host": "amzscout.net",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                    "x-instance-id": "a3fa3f86-9edd-4743-90f6-26dace59202b",
                    "x-signature": "7c0e3563c17a8dad639895474b4c8c9c",
                    "Cookie": "_ga=GA1.2.1638731990.1595486255; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS; _ym_uid=1595493952224708321; _ym_d=1595493952; _gid=GA1.2.1387564678.1595814486",
                    "Content-Type": "application/json", "Accept": "*/*", "Cache-Control": "no-cache",
                    "Postman-Token": "e21cd0f2-623c-4d26-9101-61e784be19f3",
                    "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Content-Length": "23",
                },
                'fr': {
                    "Host": "amzscout.net",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                    "x-instance-id": "a3fa3f86-9edd-4743-90f6-26dace59202b",
                    "x-signature": "68b2abf9af33e951fecdf240fe897b1d",
                    "Cookie": "_ga=GA1.2.1638731990.1595486255; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS; _ym_uid=1595493952224708321; _ym_d=1595493952; _gid=GA1.2.1387564678.1595814486",
                    "Content-Type": "application/json", "Accept": "*/*", "Cache-Control": "no-cache",
                    "Postman-Token": "e21cd0f2-623c-4d26-9101-61e784be19f3",
                    "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Content-Length": "23",
                },
                'it': {
                    "Host": "amzscout.net",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                    "x-instance-id": "a3fa3f86-9edd-4743-90f6-26dace59202b",
                    "x-signature": "54be64e0f25f50ffa05638afbf6811a3",
                    "Cookie": "_ga=GA1.2.1638731990.1595486255; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS; _ym_uid=1595493952224708321; _ym_d=1595493952; _gid=GA1.2.1387564678.1595814486",
                    "Content-Type": "application/json", "Accept": "*/*", "Cache-Control": "no-cache",
                    "Postman-Token": "e21cd0f2-623c-4d26-9101-61e784be19f3",
                    "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Content-Length": "23",
                },
                'es': {
                    "Host": "amzscout.net",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                    "x-instance-id": "a3fa3f86-9edd-4743-90f6-26dace59202b",
                    "x-signature": "dbb302cf60e8efa77bc33e596a24b6b2",
                    "Cookie": "_ga=GA1.2.1638731990.1595486255; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS; _ym_uid=1595493952224708321; _ym_d=1595493952; _gid=GA1.2.114546357.1596414342",
                    "Content-Type": "application/json", "Accept": "*/*", "Cache-Control": "no-cache",
                    "Postman-Token": "e21cd0f2-623c-4d26-9101-61e784be19f3",
                    "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Content-Length": "23",
                },
                'jp': {
                    "Host": "amzscout.net",
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                    "x-instance-id": "8e86ea27-d5fa-4f8c-b347-96a659338078",
                    "x-signature": "7615fd3657994cba7e376594101849ab",
                    "Cookie": "cid=20212472; mindboxDeviceUUID=486b8129-af83-42e9-a5cf-b70776d8b1e6; directCrm-session=%7B%22deviceGuid%22%3A%22486b8129-af83-42e9-a5cf-b70776d8b1e6%22%7D; h=bgXLkUopN26n8fEvlgHn",
                    "Content-Type": "application/json", "Accept": "*/*", "Cache-Control": "no-cache",
                    "Postman-Token": "e21cd0f2-623c-4d26-9101-61e784be19f3",
                    "Accept-Encoding": "gzip, deflate, br", "Connection": "keep-alive", "Content-Length": "23",
                },
            }
            post = [{"asin": task['asin']}]
            # This POST request cannot easily be made through scrapy, so the data is fetched
            # with requests first; a dummy request to baidu is then made so the item still
            # flows through the normal scrapy pipeline.
            # Todo: reimplement this request with scrapy directly.
            while True:  # the proxy may fail; loop until the call succeeds
                try:
                    response = requests.post(url, data=json.dumps(post),
                                             headers=headers[country], timeout=10,
                                             verify=False)
                    chartData = json.loads(response.text)
                    # Extract roughly the last year of sales and rank data; downstream only
                    # one record is kept unless a date is specified.
                    if len(chartData) > 0:
                        item['total_dict']['2020-10'] = chartData[0].get('estSales', 0)
                        salesHistory = chartData[0].get('salesHistory', [])
                        time_int = int(time.time())
                        salesHistory.reverse()
                        # Historical sales data.
                        for saledata in salesHistory:
                            time_int -= 86400
                            if saledata is not None:
                                timeArray = time.localtime(time_int)
                                time_str = time.strftime("%Y-%m-%d", timeArray)
                                item['total_dict'][time_str] = saledata
                        rankHistory = chartData[0].get('rankHistory', [])
                        rankHistory.reverse()
                        time_int = int(time.time())
                        # Historical rank data.
                        for rankdata in rankHistory:
                            time_int -= 86400
                            if rankdata is not None:
                                timeArray = time.localtime(time_int)
                                time_str = time.strftime("%Y-%m-%d", timeArray)
                                item['rankHistory'][time_str] = rankdata
                    else:
                        item['total_dict']['2020-10'] = 0
                    break
                except:
                    time.sleep(3)
            url = 'https://www.baidu.com'
            yield scrapy.Request(
                url,
                method='GET',
                # body=json.dumps(post),
                meta={'country': country,
                      'asin': task['asin'],
                      'retry_number': 0,
                      'table_name': table_name,
                      "item": item},
                headers=headers[country],
                callback=self.parse_total,
                dont_filter=True,
            )
def parse(self, response):
    # Update proxy status: a working proxy gets a high score (front of the pool),
    # a failing one gets a low score (back of the pool).
    proxy = response.meta.get('proxy', '').replace('https://', '').replace('http://', '')
    proxy_data = {"proxy": proxy, "fail_count": 0, "region": "", "type": "",
                  "source": "spider", "check_count": 20, "last_status": 0, "last_time": ""}
    retry_number = response.meta["retry_number"]
    # Give up on the page after 10 failed extraction attempts.
    if retry_number > 10:
        return
    table_name = response.meta["table_name"]
    status = response.status
    url = response.url
    asin = response.meta["asin"]
    country = response.meta['country']
    if status == 200 or status == 503:
        try:
            # Captcha page: mark the proxy as bad and retry.
            if len(response.body) < 10000 or status == 503:
                proxy_data['fail_count'] = 18
                self.collection.hset(name="useful_proxy", key=proxy, value=json.dumps(proxy_data))
                yield scrapy.Request(url,
                                     meta={'country': country,
                                           'asin': asin,
                                           'type': "init",
                                           'retry_number': retry_number,
                                           'table_name': table_name},
                                     callback=self.parse,
                                     dont_filter=True)
                return
            # Not a captcha page: parse the product data.
            self.product_object.initial_data = response.text
            self.product_object.countryCode = country
            item = {}
            item['asin'] = asin
            item['country'] = country
            self.product_object.best_seller_rank()
            self.product_object.seller_info()
            self.product_object.revews_rating()
            item['frequently_bought_asins'] = self.product_object.frequently_bought_asins  # frequently bought together
            # item['rank_list'] = self.product_object.rank_list
            item['seller_type'] = int(self.product_object.seller_type)  # seller type
            item['seller_num'] = int(self.product_object.seller_num)  # number of sellers (buy-box competitors)
            # item['merchant'] = self.product_object.merchant
            item['brand'] = self.product_object.brand  # brand
            item['price'] = float(self.product_object.price)  # price
            item['ratings'] = int(self.product_object.reviews)  # number of ratings
            item['listing_rating'] = float(self.product_object.rating)  # listing rating
            item['title'] = str(self.product_object.title)  # title
            item['stock_status'] = int(self.product_object.availability[0])  # stock status
            item['QA_num'] = int(self.product_object.QA_num)  # number of Q&As
            item['img_list'] = self.product_object.img_list  # image URLs
            variant_data = self.product_object.variant_list
            if len(variant_data) == 3:
                item['variant_list'] = variant_data[0]  # variant list
                item['parentasin'] = variant_data[1]  # parent ASIN
                item['vari_num'] = variant_data[-1]  # number of variants
            else:
                item['variant_list'] = ''
                item['parentasin'] = ''
                item['vari_num'] = ''
            item['rank_list'] = self.product_object.rank_list  # rank info
            item['feature'] = self.product_object.description['feature']  # bullet points
            item['sellerName'] = self.product_object.merchantName  # seller name
            item['sellerID'] = self.product_object.merchantUrl  # seller ID
            item['product_info'] = self.product_object.product_info  # A+ page info
            item['compare_info'] = self.product_object.compare_info  # comparison info
            item['product_descript'] = self.product_object.product_descript  # technical details
            item['other_info'] = self.product_object.other_info  # other info
            # One of Amazon's anti-scraping tricks: the fields below are withheld while the
            # rest of the page is intact, so the page must be re-crawled. Some products
            # (e.g. virtual products) genuinely lack them, so only retry a limited number of times.
            if len(item['product_descript']['product_dict']) == 0 and \
                    len(item['product_descript']['product_info']) == 0 and \
                    len(item['product_descript']['other_info']) == 0 and retry_number < 1:
                raise Exception
            else:
                url = f"https://www.amazon.{self.country_site[item['country']]}/product-reviews/{item['asin']}?formatType=current_format"
                yield scrapy.Request(url,
                                     meta={'country': item['country'],
                                           'asin': item['asin'],
                                           'type': "init",
                                           'retry_number': retry_number,
                                           'table_name': table_name,
                                           "item": item,
                                           "flag": 0},
                                     callback=self.parse_reviews,
                                     dont_filter=True)
        except Exception as e:
            # Extraction failed; retry.
            print('extraction failed, retrying', e)
            retry_number += 1
            yield scrapy.Request(url,
                                 meta={'country': country,
                                       'asin': asin,
                                       'type': "init",
                                       'retry_number': retry_number,
                                       "table_name": table_name},
                                 callback=self.parse,
                                 dont_filter=True)
        finally:
            # Update the proxy record.
            self.collection.hset(name="useful_proxy", key=proxy, value=json.dumps(proxy_data))
    elif status == 404:
        # The product was removed or changed; flag the task accordingly.
        self.collection.hset(name="useful_proxy", key=proxy, value=json.dumps(proxy_data))
        mysql_server = Mysql_server()
        cursor = mysql_server.get_cursor()
        params = (asin,)
        update_sql = f"""update {table_name} set state=-1 where asin=%s"""
        cursor.execute(update_sql, params)
        mysql_server.conn.commit()
def start_requests(self):
    # lev1_list = [
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers/zgbs/amazon-devices/', 'category_name': 'Amazon Devices & Accessories', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Amazon-Launchpad/zgbs/boost/', 'category_name': 'Amazon Launchpad', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Prime-Pantry/zgbs/pantry/', 'category_name': 'Amazon Pantry', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Appliances/zgbs/appliances/', 'category_name': 'Appliances', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Appstore-Android/zgbs/mobile-apps/', 'category_name': 'Apps & Games', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Arts-Crafts-Sewing/zgbs/arts-crafts/', 'category_name': 'Arts, Crafts & Sewing', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Audible-Audiobooks/zgbs/audible/', 'category_name': 'Audible Books & Originals', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Automotive/zgbs/automotive/', 'category_name': 'Automotive', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Baby/zgbs/baby-products/', 'category_name': 'Baby', 'level': 1},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Beauty/zgbs/beauty/', 'category_name': 'Beauty & Personal Care', 'level': 1, "category_id": 2001911},
    #     # {'cat_url': 'https://www.amazon.com/best-sellers-books-Amazon/zgbs/books/', 'category_name': 'Books', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/best-sellers-music-albums/zgbs/music/', 'category_name': 'CDs & Vinyl', 'level': 1},
    #     {'cat_url': 'https://www.amazon.com/best-sellers-camera-photo/zgbs/photo/', 'category_name': 'Camera & Photo', 'level': 1, "category_id": 2001912},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers/zgbs/wireless/', 'category_name': 'Cell Phones & Accessories', 'level': 1, "category_id": 2001913},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers/zgbs/fashion/', 'category_name': 'Clothing, Shoes & Jewelry', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Collectible-Coins/zgbs/coins/', 'category_name': 'Collectible Currencies', 'level': 1},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Computers-Accessories/zgbs/pc/', 'category_name': 'Computers & Accessories', 'level': 1, "category_id": 2001914},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-MP3-Downloads/zgbs/dmusic/', 'category_name': 'Digital Music', 'level': 1},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/', 'category_name': 'Electronics', 'level': 1, "category_id": 2001915},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Entertainment-Collectibles/zgbs/entertainment-collectibles/', 'category_name': 'Entertainment Collectibles', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Gift-Cards/zgbs/gift-cards/', 'category_name': 'Gift Cards', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Grocery-Gourmet-Food/zgbs/grocery/', 'category_name': 'Grocery & Gourmet Food', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Handmade/zgbs/handmade/', 'category_name': 'Handmade Products', 'level': 1},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Health-Personal-Care/zgbs/hpc/', 'category_name': 'Health & Household', 'level': 1, "category_id": 2001916},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/', 'category_name': 'Home & Kitchen', 'level': 1, "category_id": 2001917},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Industrial-Scientific/zgbs/industrial/', 'category_name': 'Industrial & Scientific', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Kindle-Store/zgbs/digital-text/', 'category_name': 'Kindle Store', 'level': 1},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Kitchen-Dining/zgbs/kitchen/', 'category_name': 'Kitchen & Dining', 'level': 1, "category_id": 2001918},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Magazines/zgbs/magazines/', 'category_name': 'Magazine Subscriptions', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/best-sellers-movies-TV-DVD-Blu-ray/zgbs/movies-tv/', 'category_name': 'Movies & TV', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Musical-Instruments/zgbs/musical-instruments/', 'category_name': 'Musical Instruments', 'level': 1},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Office-Products/zgbs/office-products/', 'category_name': 'Office Products', 'level': 1, "category_id": 2001919},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Garden-Outdoor/zgbs/lawn-garden/', 'category_name': 'Patio, Lawn & Garden', 'level': 1, "category_id": 2001920},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Pet-Supplies/zgbs/pet-supplies/', 'category_name': 'Pet Supplies', 'level': 1, "category_id": 2001921},
    #     # {'cat_url': 'https://www.amazon.com/best-sellers-software/zgbs/software/', 'category_name': 'Software', 'level': 1},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Sports-Outdoors/zgbs/sporting-goods/', 'category_name': 'Sports & Outdoors', 'level': 1, "category_id": 2001922},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Sports-Collectibles/zgbs/sports-collectibles/', 'category_name': 'Sports Collectibles', 'level': 1},
    #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Home-Improvement/zgbs/hi/', 'category_name': 'Tools & Home Improvement', 'level': 1, "category_id": 2001923},
    #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/', 'category_name': 'Toys & Games', 'level': 1},
    #     # {'cat_url': 'https://www.amazon.com/best-sellers-video-games/zgbs/videogames/', 'category_name': 'Video Games', 'level': 1}
    # ]
    # code = data['code']
    # if code == 0:
    #     task_list = data['data']
    #     for data in task_list:
    #         try:
    #             categoryId = data['categoryId']
    #             categoryId = data['category_id']
    #             if categoryId != 0:
    #                 if data['level'] == 1:
    #                     category_name = data['category_name']
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        "cookie": "session-id=145-5209209-9478023; i18n-prefs=USD; ubid-main=133-8276981-7688751; x-wl-uid=1PCOyx0JI1kz7vWchZyMhRWJtqj1XoQoE0UNJPLhOT/Q8+kepq170hFhtVj1OBOSit46HW9f+Rz8=; lc-main=en_US; session-id-time=2082787201l; session-token=3TtwIpr/LCK/R5dUusiKqRfu1FQJmG80o4BC0knm7brPg8aelaJ+f/B16GedWlTyDSjn8qQo3s3PmGmw5mHywT8RWHthFHuduD76fCQKbeUHR0G/OJ4sj2eZxXUoxgcWn+a+xbKm+Rpj5ciXMPsk4ObS1HmuF5NFMFttjbT4ZsWQBxh5Ak9x1hxbsqNIrrrW; csm-hit=tb:0YBA58R18R2BQ1H4SWX6+b-0YBA58R18R2BQ1H4SWX6|1592453272955&t:1592453272955&adb:adblk_yes"
    }
    # Fetch categories with state=0 from MySQL and crawl their subcategories and ASINs.
    mysql_server = Mysql_server()
    cursor = mysql_server.get_cursor()
    cursor.execute(
        "select category_id, category_name, level, bsr_url,country from amz_category "
        "where state=0 and country='de' limit 10")
    task_list = cursor.fetchall()
    print(task_list)
    start_list = []
    # Normalise the rows and mark the categories as in progress.
    for task in task_list:
        task = {'cat_url': task[3].split('ref')[0],
                'category_name': task[1],
                'level': task[2],
                "category_id": task[0],
                'country': task[4]}
        params = (task['category_id'], task['level'], task['country'])
        update_sql = """update amz_category set state=1 where category_id=%s and level=%s and country=%s"""
        cursor.execute(update_sql, params)
        # Add the prepared task to the work list.
        start_list.append(task)
    mysql_server.conn.commit()
    mysql_server.close()
    # Send the requests and start crawling.
    for data in start_list:
        req_url = data['cat_url'] + 'ref='
        yield scrapy.Request(
            req_url,
            headers=self.headers,
            dont_filter=True,
            callback=self.parse,
            meta={'data': data,
                  'req_url': req_url,
                  'page_num': 1,
                  # 'cookiejar': response.meta['cookiejar']
                  })
def open_spider(self, spider):
    self.mysql_server = Mysql_server()
class TranslateScript(object):
    def __init__(self, table, fromLang='auto', toLang='zh'):
        env_dist = os.environ
        self.table = table
        self.toLang = toLang
        self.fromLang = fromLang
        self.mysql = Mysql_server()
        self.cursor = self.mysql.get_cursor()
        self.appid = env_dist.get('baidufanyi_appid')  # your Baidu Translate appid
        self.secretKey = env_dist.get('baidufanyi_secretKey')  # your Baidu Translate secret key

    def translate(self, q='/'):
        httpClient = None
        toLang = self.toLang
        fromLang = self.fromLang
        myurl = '/api/trans/vip/translate'
        salt = random.randint(32768, 65536)
        sign = self.appid + str(q) + str(salt) + self.secretKey
        sign = hashlib.md5(sign.encode()).hexdigest()
        myurl = (myurl + '?appid=' + self.appid + '&q=' + urllib.parse.quote(q)
                 + '&from=' + fromLang + '&to=' + toLang + '&salt=' + str(salt)
                 + '&sign=' + sign)
        try:
            httpClient = http.client.HTTPConnection('api.fanyi.baidu.com')
            httpClient.request('GET', myurl)
            # response is an HTTPResponse object
            response = httpClient.getresponse()
            result_all = response.read().decode("utf-8")
            result = json.loads(result_all)
            return result.get('trans_result')[0].get('dst')
        except Exception as e:
            print(e)
        finally:
            if httpClient:
                httpClient.close()

    def get_data(self):
        select_sql = "select id, review_title, review_body from {} where state=0 limit 10".format(self.table)
        self.cursor.execute(select_sql)
        data = self.cursor.fetchall()
        for record in data:
            params = (record[0],)
            update_sql = "update {} set state=1 where id=%s".format(self.table)
            self.cursor.execute(update_sql, params)
        self.mysql.conn.commit()
        return data

    def update_data(self, item):
        params = (item['translate_review_title'], item['translate_review_body'], item['id'])
        update_sql = ("update {} set translate_review_title=%s, translate_review_body=%s, state=2 "
                      "where id=%s".format(self.table))
        self.cursor.execute(update_sql, params)
        self.mysql.conn.commit()
        print('{} updated successfully'.format(item['id']))

    def close_link(self):
        self.mysql.conn.close()

    def start(self):
        data = self.get_data()
        for record in data:
            translate_review_title = ''
            translate_review_body = ''
            if record[2]:
                translate_review_body = self.translate(q=record[2]) or ''
                time.sleep(2)
            if record[1]:
                translate_review_title = self.translate(q=record[1]) or ''
                time.sleep(1)
            item = {
                'id': record[0],
                'translate_review_title': translate_review_title,
                'translate_review_body': translate_review_body,
            }
            self.update_data(item)
        self.close_link()
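# A minimal usage sketch, assuming the review table is named "product_reviews" (hypothetical)
# and that the baidufanyi_appid / baidufanyi_secretKey environment variables are set; run it
# periodically to translate newly scraped reviews in batches of 10:
if __name__ == '__main__':
    translator = TranslateScript(table='product_reviews', fromLang='auto', toLang='zh')
    translator.start()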
def open_spider(self, spider):
    self.mysql_server = Mysql_server()
    self.client = pymongo.MongoClient('127.0.0.1', 27017, maxPoolSize=100)
    self.db = self.client.amazon
    self.collection = self.db.detail_info
def parse(self, response):
    keyword = response.meta['keyword']
    country = response.meta['country']
    # Update proxy status: a working proxy gets a high score (front of the pool),
    # a failing one gets a low score (back of the pool).
    proxy = response.meta.get('proxy', '').replace('https://', '').replace('http://', '')
    proxy_data = {"proxy": proxy, "fail_count": 0, "region": "", "type": "",
                  "source": "spider", "check_count": 20, "last_status": 0, "last_time": ""}
    # Captcha page: mark the proxy as bad and retry.
    if len(response.body) < 10000 or response.status == 503:
        proxy_data['fail_count'] = 18
        self.collection.hset(name="useful_proxy", key=proxy, value=json.dumps(proxy_data))
        meta = response.meta
        self.headers['referer'] = response.url
        self.headers['origin'] = 'https://www.amazon.' + self.web_country[meta['country']]
        headers = self.headers
        yield scrapy.Request(url=response.url, meta=meta, callback=self.parse, dont_filter=True)
    else:
        # No captcha: record the proxy as healthy.
        self.collection.hset(name="useful_proxy", key=proxy, value=json.dumps(proxy_data))
        # country = response.meta['country']
        result_num = response.meta['result_num']
        results = response.xpath('//span[@data-component-type="s-search-results"]/div/div[@data-asin]')
        # ab_position = response.meta['absolute_position']
        metas = response.meta
        if len(results) == 0 and metas['retries'] < 30:
            # No ASIN list found on the page; retry.
            metas['retries'] += 1
            self.headers['referer'] = response.url
            self.headers['origin'] = 'https://www.amazon.' + self.web_country[country]
            headers = self.headers
            yield scrapy.Request(url=response.url, meta=metas, callback=self.parse, dont_filter=True)
            return
        # Collect the ASIN data on this page.
        # page_num = response.meta['page']
        item = []  # used to check whether the 300-ASIN quota has been reached
        position_num = 0
        for result in results:
            result_dict = {}
            result_dict['pageNum'] = response.meta['page']  # page the ASIN appears on
            asin = result.xpath('@data-asin').extract_first()  # extract the ASIN
            if asin != '':
                result_dict['asin'] = asin
                position_num += 1
                result_dict['positionNum'] = position_num  # position on the page
                # Is this a sponsored (ad) slot?
                # Todo: the UK site sometimes shows no sponsored slots at all.
                ad_str = result.xpath('./div//span/span/span/span/text()').extract_first()
                print(ad_str, "====")
                if ad_str in ["Sponsored", "Sponsorisé", "Sponsorizzato", "Patrocinado",
                              "スポンサー プロダクト", "Gesponsert"]:
                    result_dict['ad'] = 1
                else:
                    result_dict['ad'] = 0
                result_dict['keyword'] = response.meta['keyword']
                result_dict['country'] = country
                item.append(result_dict)
                yield result_dict
        result_num += len(item)
        # URL of the next results page.
        temp_url = response.xpath('//ul[@class="a-pagination"]/li[@class="a-last"]/a/@href').extract_first()
        # Only the first ~300 ASINs are needed; the cutoff is 400 so at least 300 remain
        # after de-duplication.
        if result_num <= 400 and temp_url is not None:
            next_url = "https://www.amazon." + self.web_country[response.meta['country']] + temp_url
            self.headers['referer'] = response.url
            self.headers['origin'] = 'https://www.amazon.' + self.web_country[country]
            headers = self.headers
            yield scrapy.Request(url=next_url,
                                 meta={"page": response.meta['page'] + 1,
                                       "keyword": keyword,
                                       # 'absolute_position': ab_position,
                                       'country': country,
                                       'retries': 0,
                                       "result_num": result_num},
                                 callback=self.parse,
                                 dont_filter=True)
        else:
            mysql_server = Mysql_server()
            cursor = mysql_server.get_cursor()
            sql = """update keywords set state=2 where keyword=%s and country=%s"""
            params = (keyword, country)
            cursor.execute(sql, params)
            mysql_server.conn.commit()
            mysql_server.close()