Example #1
 def start_requests(self):
     # Claim profileIDs and mark them as in progress (currently only top-10000 reviewer data from a single marketplace is crawled)
     mysql_server = Mysql_server()
     cursor = mysql_server.get_cursor()
     cursor.execute(f"select profileID,country from product_toplistreviews where state=0 and country='jp' limit 300")
     task_list = cursor.fetchall()
     print(task_list)
     for task in task_list:
         if task[0] != "":
             task = {'profileID': task[0], 'country': task[-1]}
             parmas = (task['profileID'], task['country'])
             update_sql = f"""update product_toplistreviews set state=1 where profileID=%s and country=%s"""
             cursor.execute(update_sql, parmas)
     mysql_server.conn.commit()
     mysql_server.close()
     for task in task_list:
         if task[0] != "":
             task = {'profileID': task[0], "country": task[-1]}
             # https://www.amazon.com/hz/gamification/api/contributor/dashboard/amzn1.account.AFQ7TVKKSLR6C5MSZDWAYMR2OPCA
             url = "https://www.amazon.%s/" % self.country_site[
                 task['country']] + "hz/gamification/api/contributor/dashboard/amzn1.account.%s" % task["profileID"]
             self.headers['Referer'] = url
             self.headers['cookie'] = self.cookie_dict[task['country']]
             yield scrapy.Request(url, meta={'country': task['country'], 'profileID': task['profileID'], },
                                  headers=self.headers, callback=self.parse, dont_filter=True)
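All of these snippets drive their task queues through a Mysql_server helper exposing get_cursor(), a conn attribute and close(). The helper itself is not shown anywhere in this listing; the sketch below is only an assumed minimal implementation on top of pymysql (host, user, password and database are placeholders, not the project's real settings), included to make the interface explicit.

import pymysql

class Mysql_server(object):
    # Minimal stand-in for the helper used throughout these examples.
    def __init__(self, host='127.0.0.1', port=3306, user='root', password='', db='amazon'):
        # Connection settings here are placeholders.
        self.conn = pymysql.connect(host=host, port=port, user=user,
                                    password=password, database=db, charset='utf8mb4')

    def get_cursor(self):
        # Plain cursor; callers run execute()/fetchall() and commit via self.conn
        return self.conn.cursor()

    def close(self):
        self.conn.close()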
Example #2
 def start_requests(self):
     # Claim sellerIDs and mark them as in progress
     mysql_server = Mysql_server()
     cursor = mysql_server.get_cursor()
     cursor.execute(
         f"select sellerID,country from product_detail where state=0 and seller_type != 1 and sellerID !=''"
     )
     task_list = cursor.fetchall()
     # task_list = (("AODNN2DNYDROD","us"),)
     print(task_list)
     for task in task_list:
         if task[0] != "":
             task = {'sellerID': task[0], 'country': task[-1]}
             parmas = (task['sellerID'], task['country'])
             update_sql = f"""update product_detail set state=1 where sellerID=%s and country=%s"""
             cursor.execute(update_sql, parmas)
     mysql_server.conn.commit()
     mysql_server.close()
     # Assemble the data and send the requests
     for task in task_list:
         if task[0] != "":
             task = {'sellerID': task[0], "country": task[-1]}
             url = "https://www.amazon.%s/sp" % self.country_site[task[
                 'country']] + "?seller=%s&th=1&psc=1&language=en_US" % task[
                     "sellerID"]
             self.headers['Referer'] = url
             self.headers['cookie'] = self.cookie_dict[task['country']]
             yield scrapy.Request(url,
                                  meta={
                                      'country': task['country'],
                                      'sellerID': task['sellerID'],
                                      'retry_number': 0
                                  },
                                  callback=self.parse,
                                  dont_filter=True)
Example #3
 def start_requests(self):
     # Fetch tasks and update their state
     for country in self.country_list:
         mysql_server = Mysql_server()
         cursor = mysql_server.get_cursor()
         table_name = country + "_asins"
         cursor.execute(f"select distinct(asin) from {table_name} where state=5")
         task_list = cursor.fetchall()
         print(task_list)
         # task_list = (("B0753H1Z7L",),)  # test fixture
         for task in task_list:
             task = {'asin': task[0], 'countrycode': country}
             parmas = (task['asin'],)
             update_sql = f"""update {table_name} set state=6 where asin=%s"""
             cursor.execute(update_sql, parmas)
         mysql_server.conn.commit()
         mysql_server.conn.close()
         for task in task_list:
             task = {'asin': task[0], 'countrycode': country}
             page_num = 1
             url = f'https://{self.countryCodeArr[task["countrycode"]]}/product-reviews/{task["asin"]}/ref=cm_cr_arp_d_viewopt_fmt?reviewerType=all_reviews&pageNumber={page_num}&formatType=current_format&filterByStar=critical&language=en_US'
             if self.task_type == 1:
                 url = f'https://{self.countryCodeArr[task["countrycode"]]}/product-reviews/{task["asin"]}?reviewerType=all_reviews&pageNumber={page_num}#reviews-filter-bar&language=en_US'
             yield scrapy.Request(
                 url=url,
                 headers=self.headers,
                 meta={
                     'page_num': page_num,
                     'countrycode': task['countrycode'],
                     'asin': task['asin'],
                     'table_name': table_name},
                 callback=self.parse,
                 dont_filter=True
             )
Example #4
 def __init__(self, table, fromLang='auto', toLang='zh'):
     env_dist = os.environ
     self.table = table
     self.toLang = toLang
     self.fromLang = fromLang
     self.mysql = Mysql_server()
     self.cursor = self.mysql.get_cursor()
     self.appid = env_dist.get('baidufanyi_appid')  # your Baidu Translate API appid
     self.secretKey = env_dist.get('baidufanyi_secretKey')  # your Baidu Translate API secret key
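The appid and secretKey read here are credentials for the Baidu Fanyi translation API. As a rough illustration of how they are typically used (a sketch of the public API's usual signing scheme, not this class's actual translate method), a request is signed with the MD5 of appid + query + salt + secretKey:

import hashlib
import random
import requests

def baidu_translate(query, appid, secret_key, from_lang='auto', to_lang='zh'):
    # Sketch of a signed call to the public Baidu translate endpoint; error handling omitted.
    salt = str(random.randint(32768, 65536))
    sign = hashlib.md5((appid + query + salt + secret_key).encode('utf-8')).hexdigest()
    params = {'q': query, 'from': from_lang, 'to': to_lang,
              'appid': appid, 'salt': salt, 'sign': sign}
    resp = requests.get('https://fanyi-api.baidu.com/api/trans/vip/translate',
                        params=params, timeout=10)
    return resp.json()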
Example #5
    def start_requests(self):
        # Domain suffix for each marketplace
        self.web_country = {
            "us": "com",
            'uk': "co.uk",
            'es': 'es',
            'fr': 'fr',
            'it': 'it',
            'au': "com.au",
            'ca': "ca",
            'jp': 'co.jp',
            'de': 'de'
        }
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            "cookie":
            "session-id=145-5209209-9478023; i18n-prefs=USD; ubid-main=133-8276981-7688751; x-wl-uid=1PCOyx0JI1kz7vWchZyMhRWJtqj1XoQoE0UNJPLhOT/Q8+kepq170hFhtVj1OBOSit46HW9f+Rz8=; lc-main=en_US; session-id-time=2082787201l; session-token=3TtwIpr/LCK/R5dUusiKqRfu1FQJmG80o4BC0knm7brPg8aelaJ+f/B16GedWlTyDSjn8qQo3s3PmGmw5mHywT8RWHthFHuduD76fCQKbeUHR0G/OJ4sj2eZxXUoxgcWn+a+xbKm+Rpj5ciXMPsk4ObS1HmuF5NFMFttjbT4ZsWQBxh5Ak9x1hxbsqNIrrrW; csm-hit=tb:0YBA58R18R2BQ1H4SWX6+b-0YBA58R18R2BQ1H4SWX6|1592453272955&t:1592453272955&adb:adblk_yes"
        }

        # Fetch tasks, update their state, and send requests
        mysql_server = Mysql_server()
        cursor = mysql_server.get_cursor()
        cursor.execute("select keyword, country from keywords where state=0")
        word_list = cursor.fetchall()
        for task in word_list:
            task = task[0]
            parmas = (task, )
            update_sql = """update keywords set state=1 where keyword=%s"""
            cursor.execute(update_sql, parmas)
        mysql_server.conn.commit()
        mysql_server.close()
        # Build the initial search-results URL
        for word in word_list:
            keyword = word[0]
            country = word[-1]
            self.headers['cookie'] = self.cookie_dict[country]
            url_word = '+'.join(keyword.split())
            headers = self.headers
            headers[
                'origin'] = 'https://www.amazon.' + self.web_country[country]
            url = "https://www.amazon.{}/s?k={}&qid=1551418823".format(
                self.web_country[country], url_word)
            self.headers[
                'referer'] = "https://www.amazon." + self.web_country[country]
            yield scrapy.Request(url=url,
                                 meta={
                                     "page": 1,
                                     "keyword": keyword,
                                     'absolute_position': 0,
                                     'country': country,
                                     'retries': 0,
                                     'result_num': 0
                                 },
                                 callback=self.parse,
                                 dont_filter=True)
Example #6
    def start_requests(self):
        # Fetch task ASINs for each marketplace and flip state from 0 to 1 to mark them as being crawled
        for country in self.country_list:
            mysql_server = Mysql_server()
            cursor = mysql_server.get_cursor()
            table_name = country + '_asins'
            cursor.execute(
                f"select asin from {table_name} where state=0 limit 10")
            task_list = cursor.fetchall()
            print(task_list)
            for task in task_list:
                task = {'asin': task[0], 'country': country}
                parmas = (task['asin'], )
                update_sql = f"""update {table_name} set state=1 where asin=%s"""
                cursor.execute(update_sql, parmas)
            mysql_server.conn.commit()
            mysql_server.close()
            # Test fixtures
            # print(task_list)
            # asin_list = [{'country': 'it', 'asin': 'B07VB8WXNF'}]
            # asin_list = [{'countrycode': 'us', 'asin': 'B07FJGGWJL'}, {'countrycode': 'de', 'asin': 'B00Y211AFM'},
            #              {'countrycode': 'fr', 'asin': 'B00GS19NWG'}, {'countrycode': 'uk', 'asin': 'B0000E5SEQ'},
            #              {'countrycode': 'it', 'asin': 'B07VMRB2K1'}, {'countrycode': 'es', 'asin': 'B07FPFKL4X'},
            #              {'countrycode': 'ca', 'asin': 'B00XMD7KPU'}, {'countrycode': 'au', 'asin': 'B075FQY5BN'},]
            # {'countrycode': 'jp', 'asin': 'B07RPVQY62'}]

            # Assemble the data and send requests
            for task in task_list:
                task = {'asin': task[0], 'country': country}
                if len(task['asin']) > 10 and '?' in task['asin']:
                    asin = task['asin'].split('/')[-1].split('?')[0]
                    if len(asin) == 10:
                        asin = asin.upper()
                elif len(task['asin']) == 10:
                    asin = task['asin'].upper()
                url = "https://www.amazon.%s/dp/" % self.country_site[task[
                    'country']] + task['asin'] + '?th=1&psc=1&language=en_US'
                # https://www.amazon.co.uk/dp/B071GYJTST?th=1&psc=1&language=en_US
                # self.headers['Referer'] = url
                # self.headers['cookie'] = self.cookie_dict[country]
                yield scrapy.Request(url,
                                     meta={
                                         'country': task['country'],
                                         'asin': task['asin'],
                                         'type': "init",
                                         'retry_number': 0,
                                         'table_name': table_name
                                     },
                                     callback=self.parse,
                                     dont_filter=True)
Example #7
class SorftimePipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        cursor = self.mysql_server.get_cursor()
        params = (
            item['asin'], item['category_name'], item['rank'], item['bsr_url'], item['category_id'], item['sales'],
            item['country'], item['brand'], item['SalePrice'], item['product_url'], item['level'])
        sql = f"""insert into sorftime_sales_other (id, asin, category_name, `rank`, bsr_url, category_id, sales, country,
                              brand,SalePrice,product_url,level
         ) values(0, %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update sales=values(sales)"""
        cursor.execute(sql, params)
        # timestamp = int(time.time())
        # table_name = item['country'] + '_asins'
        # params = (item['asin'], 1, item['rank'],
        # 'bsr_' + item['category_id'], timestamp, 0)
        # sql = f"""insert into {table_name} (id, asin, pageNum, positionNum, keyword, timestamp,state, ad)
        #            values (0, %s, %s, %s, %s, %s, 0, %s)"""
        # cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
Example #8
class SellerspritePipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        cursor = self.mysql_server.get_cursor()
        for sale_data in item['total_dict'].items():
            # m_add: only keep the current month's data (2020-10-07)
            if sale_data[0] != '2020-10':
                continue
            params = (item['asin'], sale_data[-1], sale_data[0])
            sql = f"""	insert into product_sales (id, asin, sales, date) values 
                        (0, %s,%s,%s) 
                        on duplicate key update sales=values(sales)"""
            cursor.execute(sql, params)
        for rankdata in item['rankHistory'].items():
            params = (item['asin'], rankdata[-1], rankdata[0])
            sql = f"""	insert into product_rank (id, asin, `rank`, date) values
                        (0, %s,%s,%s)
                        on duplicate key update `rank`=values(`rank`)"""
            # print(sql % params, '======================')
            cursor.execute(sql, params)
        params = (item['asin'],)
        sql = f"""update {item['table_name']} set state=4 where asin=%s"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
Example #9
 def parse_keepa(self, response):
     # Extract the 30-day average rank from keepa
     print('Extracting the 30-day average rank from keepa')
     retry_number = response.meta["retry_number"]
     item = response.meta["item"]
     table_name = response.meta["table_name"]
     status = response.status
     url = response.url
     asin = response.meta["asin"]
     country = response.meta['country']
     data = json.loads(response.body)
     # item['avg30'] = data['products'][0]["stats"].get("avg30", ['', '', '', ''])[3]
     avg_data = data['products'][0].get("stats", {})
     if avg_data != None:
         item['avg30'] = avg_data.get("avg30", ['', '', '', ''])[3]
     else:
         item['avg30'] = 0
     timedata = data['products'][0].get("trackingSince", 0)
     if timedata != 0:
         timestamp = (timedata + 21564000) * 60  # Keepa "trackingSince" is in Keepa minutes; adding 21564000 and multiplying by 60 converts it to a Unix timestamp in seconds
         if country in ['us', 'jp', 'fr', 'uk', 'es']:
             item['product_info']["Date First Available"] = \
                 time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp)).split(' ')[0]
         else:
             item['product_info']["Im Angebot von Amazon.de seit"] = \
                 time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(timestamp)).split(' ')[0]
         yield item
     else:
         yield item
     mysql_server = Mysql_server()
     cursor = mysql_server.get_cursor()
     params = (asin, )
     update_sql = f"""update {table_name} set state=2 where asin=%s"""
     cursor.execute(update_sql, params)
     mysql_server.conn.commit()
     mysql_server.close()
Example #10
class AmzToplistReviewsPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        timestamp = int(int(time.time()) / 86400) * 86400
        cursor = self.mysql_server.get_cursor()
        params = (item['profileID'], item['country'], timestamp)
        sql = """insert ignore into product_toplistreviews (id, profileID, country, timestamp)
                     values (0, %s, %s, %s)"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
Example #11
class AmzReviewsPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        timestamp = int(int(time.time()) / 86400) * 86400
        cursor = self.mysql_server.get_cursor()
        params = (item['reviewID'], item['review_time'], item['review_raiting'],
                  item['helpful_num'], item['review_title'], item['review_body'],
                  item['is_VP'], item['asin'], item['profileID'], item['country'], timestamp)
        sql = """insert into product_reviews (reviewID, review_time, review_raiting, helpful_num,
                             review_title, review_body, is_VP, asin, profileID, country, timestamp)
                     values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s,%s)"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
Example #12
class AmzProductPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()
        self.client = pymongo.MongoClient('127.0.0.1', 27017, maxPoolSize=100)
        self.db = self.client.amazon
        self.collection = self.db.detail_info

    def process_item(self, item, spider):
        timestamp = int(time.time())
        cursor = self.mysql_server.get_cursor()
        params = (item['asin'], item['seller_type'], item['seller_num'],
                  item['brand'], item['price'], item['listing_rating'],
                  item['ratings'], item['stock_status'], item['QA_num'], timestamp, item['sellerName'],
                  item['sellerID'], item['country'], item['reviews'], item['actual_reviews'], item['critical'],
                  item['vp_num'], item['product_style'], item["avg30"])
        detail_sql = """insert into product_detail (id, asin, seller_type, seller_num, brand,
                             price, listing_rating, ratings, stock_status, QA_num, timestamp,
                              sellerName, sellerID, country, reviews, actual_reviews, critical, vp_num, product_style, avg30)
                     values (0, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"""
        cursor.execute(detail_sql, params)
        self.mysql_server.conn.commit()
        rank_sql = f"""insert into product_rankinfo (id, asin, categoryID,
                       category, `rank`, timestamp, country) values (0, %s, %s, %s, %s, %s, %s)"""
        rank_list = item['rank_list']['ranks']
        for data in rank_list:
            params = (item['asin'], data['catId'], data['name'], int(data['rank']), timestamp, item['country'])
            cursor.execute(rank_sql, params)
            self.mysql_server.conn.commit()
        target_data = {"asin": item['asin'], "frequently_bought_asins": item['frequently_bought_asins'],
                       "title": item['title'], "img_list": item['img_list'], "variant_list": item['variant_list'],
                       "parentasin": item['parentasin'], "vari_num": item['vari_num'],
                       "feature": item['feature'], "product_info": json.dumps(item['product_info']),
                       "product_descript": json.dumps(item['product_descript']),
                       "compare_info": json.dumps(item['compare_info']),
                       "other_info": item['other_info'], 'country': item['country'], "timestamp": timestamp}
        str_data = json.dumps(target_data, ensure_ascii=False)
        target_data = json.loads(str_data)
        self.collection.insert_one(target_data)
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
        self.client.close()
Example #13
class AmzKeywordsPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        timestamp = int(time.time())
        cursor = self.mysql_server.get_cursor()
        table_name = item['country'] + '_asins'
        params = (item['asin'], item['pageNum'], item['positionNum'],
                  item['keyword'], timestamp, item['ad'])
        sql = f"""insert into {table_name} (id, asin, pageNum, positionNum, keyword, timestamp,state, ad)
                     values (0, %s, %s, %s, %s, %s, 0, %s)"""
        # print(sql,params)
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
Example #14
class AmzProfilePipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        cursor = self.mysql_server.get_cursor()
        params = (item['profileID'],)
        sql = """update product_toplistreviews set state=2 where profileID=%s"""
        cursor.execute(sql, params)
        params = (item['profileID'], item['helpfulVotes'], item['reviews'], item['location'], item['occupation'],
                  item['facebook'], item['twitter'], item['instagram'], item['youtube'], item['country'], item['rank'],
                  item['name'])
        sql = """insert into profile_info (id, profileID, helpfulVotes, reviews, location, occupation, facebook, twitter,
                instagram, youtube, country, `rank`, name) values(0, %s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update reviews=values(reviews), `rank`=values(`rank`)"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
Example #15
class AmzSellerPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        item = item['data']
        timestamp = int(time.time())
        cursor = self.mysql_server.get_cursor()
        params = (item['sellerID'], item['negative_lifetime'], item['count_lifetime'],
                  timestamp, item['country'])
        sql = f"""insert into seller_info (id, sellerID, negative_lifetime, count_lifetime, timestamp,country)
                     values (0, %s, %s, %s, %s, %s)"""
        # print(sql,params)
        cursor.execute(sql, params)
        params = (item['sellerID'],)
        sql = f"""update product_detail set state=2 where sellerID=%s"""
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
Example #16
class AmzbsrPipeline(object):
    def open_spider(self, spider):
        self.mysql_server = Mysql_server()

    def process_item(self, item, spider):
        timestamp = int(int(time.time()) / 86400 / 30) * 86400 * 30
        cursor = self.mysql_server.get_cursor()
        if item['flag'] == 1:
            params = (item['category_id'], item['category_name'], item['category_id'] + f'_{timestamp}', item['level'],
                      item['parent_id'], 0, item['bsr_url'], item['country'])
            sql = """insert into amz_category (id, category_id, category_name, node_name, uid, level, parent_id, state, bsr_url, country)
                         values (0, %s, '',%s, %s, %s, %s, %s, %s, %s)"""
            # params = (item['category_name'], item['category_id'], item['country'])
            # sql = """update amz_category set node_name=%s where category_id=%s and country=%s"""
            # print('----')
        elif item['flag'] == 0:
            return
            timestamp = int(time.time())
            table_name = item['data']['country'] + '_asins'
            params = (item['data']['asin'], 1, item['data']['rank'],
                      'bsr_' + item['data']['categoryId'], timestamp, 0)
            sql = f"""insert into {table_name} (id, asin, pageNum, positionNum, keyword, timestamp,state, ad)
                         values (0, %s, %s, %s, %s, %s, 0, %s)"""
        elif item['flag'] == 2:
            print(f'======{item["category_name"]}=======')
            params = (item['category_name'], item['category_id'], item['country'])
            sql = """update amz_category set category_name=%s where category_id=%s and country=%s"""
        # try:
        cursor.execute(sql, params)
        self.mysql_server.conn.commit()
        # except:
        #     pass
        return item

    def close_spider(self, spider):
        self.mysql_server.conn.close()
Example #17
    def start_requests(self):
        # Claim tasks for each marketplace, assemble the data, update state, and send requests
        for country in self.country_list:
            mysql_server = Mysql_server()
            cursor = mysql_server.get_cursor()
            table_name = 'amz_category'
            cursor.execute(
                f"select category_id, category_name, level, bsr_url from {table_name} where state=1 and country='{country}' and level<10 limit 10")
            task_list = cursor.fetchall()
            print(len(task_list), '=====')
            for task in task_list:
                task = {'category_id': task[0], 'category_name': task[1], 'level': task[2], 'country': country}
                parmas = (task['category_id'], task['level'])
                update_sql = f"""update {table_name} set state=2 where category_id=%s and level=%s"""
                cursor.execute(update_sql, parmas)
            mysql_server.conn.commit()
            mysql_server.close()
            for task in task_list:
                task = {'category_id': task[0], 'category_name': task[1], 'level': task[2], 'country': country,
                        'category_url': task[3]}
                # if task['level'] == 1:
                #     if country == 'us':
                #         if task['category_name'] == 'Cell Phones & Accessories':
                #             task['category_id'] = 'wireless'
                #         elif task['category_name'] == 'Electronics':
                #             task['category_id'] = 'electronics'
                #         elif task['category_name'] == 'Home & Kitchen':
                #             task['category_id'] = 'home-garden'
                #         else:
                #             task['category_id'] = '541966'
                #     elif country in ('uk', 'de', 'jp', 'es', 'fr'):
                #         if country == 'uk':
                #             if task['category_name'] == 'Electronics & Photo':
                #                 task['category_id'] = 'electronics'
                #             elif task['category_name'] == 'Home & Kitchen':
                #                 task['category_id'] = 'home-garden'
                #             else:
                #                 task['category_id'] = 'computers'
                #         if country == 'de':
                #             if task['category_name'] == 'Elektronik & Foto':
                #                 task['category_id'] = 'ce-de'
                #             else:
                #                 task['category_id'] = 'computers'
                #         if country == 'fr':
                #             if task['category_name'] == 'High-Tech':
                #                 task['category_id'] = 'electronics'
                #             else:
                #                 task['category_id'] = 'computers'
                #         if country == 'it':
                #             if task['category_name'] == 'Elettronica':
                #                 task['category_id'] = 'electronics'
                #             else:
                #                 task['category_id'] = 'computers'
                #         if country == 'es':
                #             if task['category_name'] == 'Electrónica':
                #                 task['category_id'] = 'electronics'
                #             else:
                #                 task['category_id'] = 'computers'
                #         if country == 'jp':
                #             if task['category_name'] == 'Electronics':
                #                 task['category_id'] = 'electronics'
                #             else:
                #                 task['category_id'] = 'computers'
                #     elif country in ('it',):
                #         task['category_id'] = 'pc'

                site_dict = {
                    'us': '01', 'uk': '02', 'de': '03', 'fr': '04', 'jp': '07', 'es': '08', 'it': '09',
                }
                url = f'https://plug.sorftime.com/FlowCircle/QueryProductByNodeId?site={site_dict[country]}&token=Zkg4Q1Y4VllTUytIRWhiWFNpVGx4Zz09'
                self.headers['Referer'] = url
                data = f'delive=0&ebc=0&bbx=0&NodeId={task["category_id"]}&times=0&ProductId=&Order=SaleCount&OrderType=desc'
                yield scrapy.Request(url, body=json.dumps(data), method='POST',
                                     meta={'country': task['country'], 'category_id': task['category_id'],
                                           'category_name': task['category_name'], 'category_url': task['category_url'],
                                           'level': task['level'],
                                           'retry_number': 0, 'table_name': table_name},
                                     headers=self.headers, callback=self.parse, dont_filter=True)
Example #18
    def parse(self, response):
        # Update the proxy's score: a working proxy gets a high score to move it to the front of the pool, otherwise a low score pushes it to the back
        proxy = response.meta.get('proxy', '').replace('https://', '').replace('http://', '')
        proxy_data = {"proxy": proxy,
                      "fail_count": 0, "region": "", "type": "",
                      "source": "spider",
                      "check_count": 20, "last_status": 0,
                      "last_time": ""}
        page_num = response.meta['page_num']
        table_name = response.meta['table_name']
        asin = response.meta['asin']
        countrycode = response.meta['countrycode']
        # Check whether a captcha page came back; if so, penalize the proxy and retry
        if len(response.body) < 10000 or response.status == 503:
            proxy_data['fail_count'] = 18
            self.collection.hset(name="useful_proxy", key=proxy, value=json.dumps(proxy_data))
            yield scrapy.Request(url=response.url,
                                 headers=self.headers,
                                 meta={
                                     'page_num': page_num,
                                     'asin': asin,
                                     'table_name': table_name,
                                     'countrycode': countrycode},
                                 callback=self.parse,
                                 dont_filter=True
                                 )
            return
        elif response.status == 404:
            # If the ASIN no longer exists, stop crawling it    # TODO: the task state should be reset/flagged once this is detected
            return
        self.collection.hset(name="useful_proxy", key=proxy, value=json.dumps(proxy_data))

        # Extract the review list
        data_list = response.xpath('//div[@id="cm_cr-review_list"]/div')
        if len(data_list) > 0:
            for data in data_list:
                item = {}
                item['reviewID'] = data.xpath('./@id').extract_first()  # review ID
                try:
                    # reviewer profile ID
                    item['profileID'] = \
                        data.xpath('./div/div/div//a[@class="a-profile"]/@href').extract_first().split('.')[2].split(
                            '/')[0]
                except:
                    continue

                # review date
                item['review_time'] = data.xpath(
                    './div/div/span[@data-hook="review-date"]/text()').extract_first()  # .split('on ')[-1]
                # review rating
                item['review_raiting'] = float(
                    data.xpath('.//div/div/div/a/@title').extract_first().split(' ')[0].replace(',', '.'))
                # review title
                item['review_title'] = data.xpath(
                    './/div/div//a[@data-hook="review-title"]/span/text()').extract_first()
                if item['review_title'] == None:
                    item['review_title'] = ""
                # review body
                item['review_body'] = data.xpath('string(.//span[@data-hook="review-body"]/span)').extract_first()
                # helpful-vote count
                helpful_str = data.xpath('./div/div/div/div/span/div/span/text()').extract_first()
                if helpful_str is None:
                    item['helpful_num'] = 0
                elif helpful_str.startswith('One') or helpful_str.startswith('Une') or helpful_str.startswith(
                        'A una') or helpful_str.startswith('Una') or helpful_str.startswith('Eine'):
                    item['helpful_num'] = 1
                else:
                    try:
                        item['helpful_num'] = int(helpful_str.split(' ')[1].replace(',', ''))
                    except:
                        item['helpful_num'] = int(helpful_str.split(' ')[0].replace(',', ''))
                    # item['helpful_num'] = helpful_str

                # whether the review is a Verified Purchase (VP)
                vp_str = data.xpath('./div/div/div/span/a/span/text()').extract_first()
                if vp_str in ['Verified Purchase', 'Verifizierter Kauf', 'Amazonで購入',
                              'Achat vérifié', 'Acquisto verificato', 'Compra verificada', '']:
                    item['is_VP'] = 1  # Verified Purchase
                elif vp_str is None:
                    item['is_VP'] = 0  # not a Verified Purchase
                else:
                    item['is_VP'] = 2  # other cases, e.g. Early Reviewer Program reviews
                # ------------------------------ date parsing -----------------------------------
                review_time = item['review_time']
                # Date parsing
                if review_time != '':
                    if countrycode == 'us' or countrycode == 'ca' or countrycode == 'jp':
                        review_time = review_time.split('on ')[-1]
                        review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%B %d, %Y"))
                    elif countrycode == 'fr':
                        fr_month = ["janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août",
                                    "septembre", "octobre", "novembre", "décembre"]
                        review_time = review_time.split('le ')[-1]  # .decode('utf-8').encode("latin-1")
                        for each in range(12):
                            if fr_month[each] in review_time:
                                review_time = review_time.replace(fr_month[each], self.us_month[each])
                                break
                        review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d %B %Y"))
                    elif countrycode == 'de':
                        de_month = ["Januar", "Februar", "März", "April", "Mai", "Juni", "Juli", "August", "September",
                                    "Oktober", "November", "Dezember"]
                        review_time = review_time.split('vom ')[-1]  # .decode('utf-8').encode("latin-1")
                        for each in range(12):
                            if de_month[each] in review_time:
                                review_time = review_time.replace(de_month[each], self.us_month[each])
                                break
                        review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d. %B %Y"))
                    elif countrycode == 'es':
                        self.es_month = ["enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto",
                                         "septiembre", "octubre", "noviembre", "diciembre"]
                        review_time = review_time.split('el ')[-1].replace('de ',
                                                                           '')  # .decode('utf-8').encode("latin-1")
                        for each in range(12):
                            if self.es_month[each] in review_time:
                                review_time = review_time.replace(self.es_month[each], self.us_month[each])
                                break
                        review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d %B %Y"))
                    elif countrycode == 'it':
                        it_month = ["gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto",
                                    "settembre", "ottobre", "novembre", "dicembre"]
                        review_time = review_time.split('il ')[-1]  # .decode('utf-8').encode("latin-1")
                        for each in range(12):
                            if it_month[each] in review_time:
                                review_time = review_time.replace(it_month[each], self.us_month[each])
                                break
                        review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d %B %Y"))
                    # elif countrycode == 'jp':
                    #     review_time = '-'.join(re.findall('\d+', review_time))
                    #     review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%Y-%m-%d"))
                    elif countrycode == 'uk' or countrycode == 'au':
                        review_time = review_time.split('on ')[-1]
                        review_time = time.strftime("%Y-%m-%d", time.strptime(review_time, "%d %B %Y"))
                item['review_time'] = review_time
                item['asin'] = asin
                item['country'] = countrycode
                # ------------------------------ end of date parsing -----------------------------------
                yield item

        try:
            if len(data_list) >= 10 and page_num == 1:
                # After the total review count is known, crawl the remaining pages concurrently
                if countrycode == 'fr':
                    reviews_count = \
                        response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                    reviews_count = re.findall('sur ([0-9]+)', reviews_count)[0]
                elif countrycode == 'it':
                    reviews_count = \
                        response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                    # print(reviews_count, '------')
                    reviews_count = re.findall('su ([0-9]+)', reviews_count)[0]
                elif countrycode == 'de':  # or countrycode == 'uk':
                    reviews_count = \
                        response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                    # print(reviews_count, '------')
                    reviews_count = re.findall('von ([0-9]+)', reviews_count)[0]
                elif countrycode == 'uk':
                    reviews_count = \
                        response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                    # print(reviews_count, '------')
                    reviews_count = re.findall('of ([0-9]+)', reviews_count)[0]
                elif countrycode == 'es':
                    reviews_count = \
                        response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().replace(',', '')
                    # print(reviews_count, '------')
                    reviews_count = re.findall('de ([0-9]+)', reviews_count)[0]
                else:
                    reviews_count = \
                        response.xpath('//div[@id="filter-info-section"]/span/text()').extract_first().split(' ')[
                            -2].replace(
                            ',', '')
        except:
            reviews_count = \
                response.xpath('//div[@id="filter-info-section"]//span/text()').extract()
            if reviews_count != []:
                reviews_count = re.findall('\| (.*?) ', reviews_count[-1])[0]
                reviews_count = reviews_count.replace(',', '').replace('.', '').replace(' ', '')
            else:
                reviews_count = 0

            page_count = int(reviews_count) // 10 + 2  # 10 reviews per page; floor-divide by 10 and add 2 so the last page is always covered
            for page_num in range(2, page_count):
                url = f'https://{self.countryCodeArr[countrycode]}/product-reviews/{asin}/ref=cm_cr_arp_d_viewopt_fmt?reviewerType=all_reviews&pageNumber={page_num}&formatType=current_format&filterByStar=critical&language=en_US'
                if self.task_type == 1:
                    url = f'https://{self.countryCodeArr[countrycode]}/product-reviews/{asin}?reviewerType=all_reviews&pageNumber={page_num}#reviews-filter-bar&language=en_US'
                yield scrapy.Request(
                    url=url,
                    headers=self.headers,
                    meta={
                        'page_num': page_num,
                        'asin': asin,
                        'countrycode': countrycode,
                        "table_name": table_name},
                    callback=self.parse,
                    dont_filter=True
                )
            mysql_server = Mysql_server()
            cursor = mysql_server.get_cursor()
            parmas = (asin,)
            # Once the parallel crawl has started, mark the ASIN as crawled; we do not track when it actually finishes
            update_sql = f"""update {table_name} set state=7 where asin=%s"""
            cursor.execute(update_sql, parmas)
            mysql_server.conn.commit()
            mysql_server.close()
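The per-marketplace date handling above repeats the same replace-then-strptime pattern for every locale. A more compact variant (a sketch, not the spider's actual code; the month lists simply mirror the ones defined in the parser above) keeps a single mapping from localized month names to English:

import time

US_MONTHS = ["January", "February", "March", "April", "May", "June", "July",
             "August", "September", "October", "November", "December"]
LOCAL_MONTHS = {
    'fr': ["janvier", "février", "mars", "avril", "mai", "juin", "juillet", "août",
           "septembre", "octobre", "novembre", "décembre"],
    'de': ["Januar", "Februar", "März", "April", "Mai", "Juni", "Juli", "August",
           "September", "Oktober", "November", "Dezember"],
    'es': ["enero", "febrero", "marzo", "abril", "mayo", "junio", "julio", "agosto",
           "septiembre", "octubre", "noviembre", "diciembre"],
    'it': ["gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio", "agosto",
           "settembre", "ottobre", "novembre", "dicembre"],
}

def normalize_review_date(raw, countrycode):
    # Strip the localized prefix, swap the month name for English, then parse with one format per locale.
    prefixes = {'us': 'on ', 'ca': 'on ', 'jp': 'on ', 'uk': 'on ', 'au': 'on ',
                'fr': 'le ', 'de': 'vom ', 'es': 'el ', 'it': 'il '}
    formats = {'us': "%B %d, %Y", 'ca': "%B %d, %Y", 'jp': "%B %d, %Y",
               'uk': "%d %B %Y", 'au': "%d %B %Y", 'fr': "%d %B %Y",
               'de': "%d. %B %Y", 'es': "%d %B %Y", 'it': "%d %B %Y"}
    text = raw.split(prefixes[countrycode])[-1]
    if countrycode == 'es':
        text = text.replace('de ', '')
    for local, english in zip(LOCAL_MONTHS.get(countrycode, []), US_MONTHS):
        if local in text:
            text = text.replace(local, english)
            break
    return time.strftime("%Y-%m-%d", time.strptime(text, formats[countrycode]))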
Example #19
 def start_requests(self):
     # Fetch ASINs, assemble the data, send requests, and update their state
     for country in self.country_list:
         mysql_server = Mysql_server()
         cursor = mysql_server.get_cursor()
         table_name = country + '_asins'
         cursor.execute(
             f"select asin from {table_name} where state=2 limit 100")
         task_list = cursor.fetchall()
         print(task_list)
         for task in task_list:
             task = {'asin': task[0], 'country': country}
             parmas = (task['asin'], )
             update_sql = f"""update {table_name} set state=3 where asin=%s"""
             cursor.execute(update_sql, parmas)
         mysql_server.conn.commit()
         mysql_server.close()
         for task in task_list:
             task = {'asin': task[0], 'country': country}
             if len(task['asin']) > 10 and '?' in task['asin']:
                 asin = task['asin'].split('/')[-1].split('?')[0]
                 if len(asin) == 10:
                     asin = asin.upper()
             elif len(task['asin']) == 10:
                 asin = task['asin'].upper()
             item = {}
             item['asin'] = task['asin']
             item['country'] = country
             item['table_name'] = table_name
             item['total_dict'] = {}
             item['rankHistory'] = {}
             # Marketplace-to-site-code mapping
             code_dir = {
                 "us": "COM",
                 "uk": "CO_UK",
                 "de": "DE",
                 "fr": "FR",
                 "it": "IT",
                 'es': "ES",
                 'jp': "CO_JP"
             }
             url = f"https://www.amzscout.net/extensions/scoutpro/v1/products/{code_dir[country]}"
             # Per-marketplace account cookies and request headers
             headers = {
                 'us': {
                     "Host": "amzscout.net",
                     "User-Agent":
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                     "x-instance-id":
                     "a3fa3f86-9edd-4743-90f6-26dace59202b",
                     "x-signature": "5bb36a25caf0e17720e9a4a9e5cfa3ff",
                     "Cookie":
                     "_ga=GA1.2.1638731990.1595486255; _gid=GA1.2.1082995224.1595486255; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS",
                     "Content-Type": "application/json",
                     "Accept": "*/*",
                     "Cache-Control": "no-cache",
                     "Postman-Token":
                     "e21cd0f2-623c-4d26-9101-61e784be19f3",
                     "Accept-Encoding": "gzip, deflate, br",
                     "Connection": "keep-alive",
                     "Content-Length": "23",
                 },
                 'uk': {
                     "Host": "amzscout.net",
                     "User-Agent":
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                     "x-instance-id":
                     "a3fa3f86-9edd-4743-90f6-26dace59202b",
                     "x-signature": "8fad7d23e214780b9b8a5bf6e1c36887",
                     "Cookie":
                     "_ga=GA1.2.1638731990.1595486255; _gid=GA1.2.1082995224.1595486255; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS",
                     "Content-Type": "application/json",
                     "Accept": "*/*",
                     "Cache-Control": "no-cache",
                     "Postman-Token":
                     "e21cd0f2-623c-4d26-9101-61e784be19f3",
                     "Accept-Encoding": "gzip, deflate, br",
                     "Connection": "keep-alive",
                     "Content-Length": "23",
                 },
                 'de': {
                     "Host": "amzscout.net",
                     "User-Agent":
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                     "x-instance-id":
                     "a3fa3f86-9edd-4743-90f6-26dace59202b",
                     "x-signature": "7c0e3563c17a8dad639895474b4c8c9c",
                     "Cookie":
                     "_ga=GA1.2.1638731990.1595486255; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS; _ym_uid=1595493952224708321; _ym_d=1595493952; _gid=GA1.2.1387564678.1595814486",
                     "Content-Type": "application/json",
                     "Accept": "*/*",
                     "Cache-Control": "no-cache",
                     "Postman-Token":
                     "e21cd0f2-623c-4d26-9101-61e784be19f3",
                     "Accept-Encoding": "gzip, deflate, br",
                     "Connection": "keep-alive",
                     "Content-Length": "23",
                 },
                 'fr': {
                     "Host": "amzscout.net",
                     "User-Agent":
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                     "x-instance-id":
                     "a3fa3f86-9edd-4743-90f6-26dace59202b",
                     "x-signature": "68b2abf9af33e951fecdf240fe897b1d",
                     "Cookie":
                     "_ga=GA1.2.1638731990.1595486255; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS; _ym_uid=1595493952224708321; _ym_d=1595493952; _gid=GA1.2.1387564678.1595814486",
                     "Content-Type": "application/json",
                     "Accept": "*/*",
                     "Cache-Control": "no-cache",
                     "Postman-Token":
                     "e21cd0f2-623c-4d26-9101-61e784be19f3",
                     "Accept-Encoding": "gzip, deflate, br",
                     "Connection": "keep-alive",
                     "Content-Length": "23",
                 },
                 'it': {
                     "Host": "amzscout.net",
                     "User-Agent":
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                     "x-instance-id":
                     "a3fa3f86-9edd-4743-90f6-26dace59202b",
                     "x-signature": "54be64e0f25f50ffa05638afbf6811a3",
                     "Cookie":
                     "_ga=GA1.2.1638731990.1595486255; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS; _ym_uid=1595493952224708321; _ym_d=1595493952; _gid=GA1.2.1387564678.1595814486",
                     "Content-Type": "application/json",
                     "Accept": "*/*",
                     "Cache-Control": "no-cache",
                     "Postman-Token":
                     "e21cd0f2-623c-4d26-9101-61e784be19f3",
                     "Accept-Encoding": "gzip, deflate, br",
                     "Connection": "keep-alive",
                     "Content-Length": "23",
                 },
                 'es': {
                     "Host": "amzscout.net",
                     "User-Agent":
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                     "x-instance-id":
                     "a3fa3f86-9edd-4743-90f6-26dace59202b",
                     "x-signature": "dbb302cf60e8efa77bc33e596a24b6b2",
                     "Cookie":
                     "_ga=GA1.2.1638731990.1595486255; directCrm-session=%7B%22deviceGuid%22%3A%227a11484f-1338-47a3-863e-747203d8f968%22%7D; mindboxDeviceUUID=7a11484f-1338-47a3-863e-747203d8f968; cid=19907047; G_ENABLED_IDPS=google; h=AJ8zGal7cqMwMFWwJ1JS; _ym_uid=1595493952224708321; _ym_d=1595493952; _gid=GA1.2.114546357.1596414342",
                     "Content-Type": "application/json",
                     "Accept": "*/*",
                     "Cache-Control": "no-cache",
                     "Postman-Token":
                     "e21cd0f2-623c-4d26-9101-61e784be19f3",
                     "Accept-Encoding": "gzip, deflate, br",
                     "Connection": "keep-alive",
                     "Content-Length": "23",
                 },
                 'jp': {
                     "Host": "amzscout.net",
                     "User-Agent":
                     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36",
                     "x-instance-id":
                     "8e86ea27-d5fa-4f8c-b347-96a659338078",
                     "x-signature": "7615fd3657994cba7e376594101849ab",
                     "Cookie":
                     "cid=20212472; mindboxDeviceUUID=486b8129-af83-42e9-a5cf-b70776d8b1e6; directCrm-session=%7B%22deviceGuid%22%3A%22486b8129-af83-42e9-a5cf-b70776d8b1e6%22%7D; h=bgXLkUopN26n8fEvlgHn",
                     "Content-Type": "application/json",
                     "Accept": "*/*",
                     "Cache-Control": "no-cache",
                     "Postman-Token":
                     "e21cd0f2-623c-4d26-9101-61e784be19f3",
                     "Accept-Encoding": "gzip, deflate, br",
                     "Connection": "keep-alive",
                     "Content-Length": "23",
                 },
             }
             post = [{"asin": task['asin']}]
             # This POST could not be made to work through scrapy, so requests fetches the data,
             # then one extra request to baidu is issued so the result still goes through scrapy's normal flow
             # TODO: reimplement this request with scrapy (see the sketch after this example)
             while True:
                 # The proxy may have gone stale; keep retrying until the request succeeds
                 try:
                     response = requests.post(url,
                                              data=json.dumps(post),
                                              headers=headers[country],
                                              timeout=10,
                                              verify=False)
                     chartData = json.loads(response.text)
                     # Extract roughly a year of sales and rank history; when saving, only one record is kept unless configured otherwise
                     if len(chartData) > 0:
                         item['total_dict']['2020-10'] = chartData[0].get(
                             'estSales', 0)
                         salesHistory = chartData[0].get('salesHistory', [])
                         time_int = int(time.time())
                         salesHistory.reverse()
                         # Historical sales data
                         for saledata in salesHistory:
                             time_int -= 86400
                             if saledata is not None:
                                 timeArray = time.localtime(time_int)
                                 time_str = time.strftime(
                                     "%Y-%m-%d", timeArray)
                                 item['total_dict'][time_str] = saledata
                         rankHistory = chartData[0].get('rankHistory', [])
                         rankHistory.reverse()
                         time_int = int(time.time())
                         # Historical rank data
                         for rankdata in rankHistory:
                             time_int -= 86400
                             if rankdata is not None:
                                 timeArray = time.localtime(time_int)
                                 time_str = time.strftime(
                                     "%Y-%m-%d", timeArray)
                                 item['rankHistory'][time_str] = rankdata
                     else:
                         item['total_dict']['2020-10'] = 0
                     break
                 except:
                     time.sleep(3)
                     pass
             url = 'https://www.baidu.com'
             yield scrapy.Request(
                 url,
                 method='get',  # body=json.dumps(post),
                 meta={
                     'country': country,
                     'asin': task['asin'],
                     'retry_number': 0,
                     'table_name': table_name,
                     "item": item
                 },
                 headers=headers,
                 callback=self.parse_total,
                 dont_filter=True,
             )
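The TODO above notes that this POST goes through requests plus a throwaway baidu request only so the result re-enters scrapy's flow. Scrapy can issue the POST itself; a possible replacement fragment is sketched below (assuming the endpoint accepts the same JSON body and headers), with the callback then parsing the JSON response instead of receiving a pre-filled item:

yield scrapy.Request(
    url,
    method='POST',
    body=json.dumps(post),
    headers=headers[country],
    meta={'country': country, 'asin': task['asin'], 'retry_number': 0,
          'table_name': table_name},
    callback=self.parse_total,  # the callback would then call json.loads(response.text) itself
    dont_filter=True,
)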
Example #20
    def parse(self, response):
        # Update the proxy's score: a working proxy gets a high score to move it to the front of the pool, otherwise a low score pushes it to the back
        proxy = response.meta.get('proxy',
                                  '').replace('https://',
                                              '').replace('http://', '')
        proxy_data = {
            "proxy": proxy,
            "fail_count": 0,
            "region": "",
            "type": "",
            "source": "spider",
            "check_count": 20,
            "last_status": 0,
            "last_time": ""
        }
        retry_number = response.meta["retry_number"]

        # Give up on this task after more than 10 failed extraction attempts
        if retry_number > 10:
            return
        table_name = response.meta["table_name"]
        status = response.status
        url = response.url
        asin = response.meta["asin"]
        country = response.meta['country']
        if status == 200 or status == 503:
            try:
                # If this is a captcha page, update the proxy's score and retry
                if len(response.body) < 10000 or status == 503:
                    proxy_data['fail_count'] = 18
                    self.collection.hset(name="useful_proxy",
                                         key=proxy,
                                         value=json.dumps(proxy_data))
                    yield scrapy.Request(url,
                                         meta={
                                             'country': country,
                                             'asin': asin,
                                             'type': "init",
                                             'retry_number': retry_number,
                                             'table_name': table_name
                                         },
                                         callback=self.parse,
                                         dont_filter=True)
                    return

                # Not a captcha page, so parse the product data
                self.product_object.initial_data = response.text
                self.product_object.countryCode = country
                item = {}
                item['asin'] = asin
                item['country'] = country
                self.product_object.best_seller_rank()
                self.product_object.seller_info()
                self.product_object.revews_rating()
                item['frequently_bought_asins'] = self.product_object.frequently_bought_asins  # frequently bought together
                # item['rank_list'] = self.product_object.rank_list
                item['seller_type'] = int(self.product_object.seller_type)  # seller type
                item['seller_num'] = int(self.product_object.seller_num)  # number of sellers (follow-sellers)
                # item['merchant'] = self.product_object.merchant
                item['brand'] = self.product_object.brand  # brand
                item['price'] = float(self.product_object.price)  # price
                item['ratings'] = int(self.product_object.reviews)  # number of ratings
                item['listing_rating'] = float(self.product_object.rating)  # listing rating
                item['title'] = str(self.product_object.title)  # title
                item['stock_status'] = int(self.product_object.availability[0])  # stock status
                item['QA_num'] = int(self.product_object.QA_num)  # number of Q&As
                item['img_list'] = self.product_object.img_list  # list of image URLs
                variant_data = self.product_object.variant_list
                if len(variant_data) == 3:
                    item['variant_list'] = variant_data[0]  # variant list
                    item['parentasin'] = variant_data[1]  # parent ASIN
                    item['vari_num'] = variant_data[-1]  # number of variants
                else:
                    item['variant_list'] = ''
                    item['parentasin'] = ''
                    item['vari_num'] = ''
                item['rank_list'] = self.product_object.rank_list  # ranking info
                item['feature'] = self.product_object.description['feature']  # bullet-point description
                item['sellerName'] = self.product_object.merchantName  # seller name
                item['sellerID'] = self.product_object.merchantUrl  # seller ID
                item['product_info'] = self.product_object.product_info  # A+ page info
                item['compare_info'] = self.product_object.compare_info  # product comparison info
                item['product_descript'] = self.product_object.product_descript  # product technical info
                item['other_info'] = self.product_object.other_info  # other info
                # One of Amazon's anti-scraping tricks: the fields below are withheld while everything else on the page looks complete.
                # The page then has to be re-crawled, but some listings (e.g. digital products) genuinely lack this info, so only retry a limited number of times.
                if (len(item['product_descript']['product_dict']) == 0
                        and len(item['product_descript']['product_info']) == 0
                        and len(item['product_descript']['other_info']) == 0
                        and retry_number < 1):
                    raise Exception
                else:
                    url = f"https://www.amazon.{self.country_site[item['country']]}/product-reviews/{item['asin']}?formatType=current_format"
                    yield scrapy.Request(url,
                                         meta={
                                             'country': item['country'],
                                             'asin': item['asin'],
                                             'type': "init",
                                             'retry_number': retry_number,
                                             'table_name': table_name,
                                             "item": item,
                                             "flag": 0
                                         },
                                         callback=self.parse_reviews,
                                         dont_filter=True)

            except Exception as e:
                # Extracting the information failed, so retry
                print('extraction failed, retrying', e)
                retry_number += 1
                yield scrapy.Request(url,
                                     meta={
                                         'country': country,
                                         'asin': asin,
                                         'type': "init",
                                         'retry_number': retry_number,
                                         "table_name": table_name
                                     },
                                     callback=self.parse,
                                     dont_filter=True)
            finally:
                # Update the proxy info
                self.collection.hset(name="useful_proxy",
                                     key=proxy,
                                     value=json.dumps(proxy_data))

        elif status == 404:
            # The product was removed or changed, so update its state
            self.collection.hset(name="useful_proxy",
                                 key=proxy,
                                 value=json.dumps(proxy_data))
            mysql_server = Mysql_server()
            cursor = mysql_server.get_cursor()
            params = (asin, )
            update_sql = f"""update {table_name} set state=-1 where asin=%s"""
            cursor.execute(update_sql, params)
            mysql_server.conn.commit()
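The proxy bookkeeping in this example keeps one JSON record per proxy in the Redis hash useful_proxy: fail_count stays 0 while the proxy is healthy and is bumped to 18 when a captcha or 503 comes back, which pushes that proxy to the back of the queue. A standalone sketch of the same pattern, assuming a local Redis instance (the connection details and helper name are not from the source):

import json
import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)

def mark_proxy(proxy, failed=False):
    # Same record layout as proxy_data above; a high fail_count demotes the proxy
    data = {
        "proxy": proxy,
        "fail_count": 18 if failed else 0,
        "region": "",
        "type": "",
        "source": "spider",
        "check_count": 20,
        "last_status": 0,
        "last_time": ""
    }
    r.hset(name="useful_proxy", key=proxy, value=json.dumps(data))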
Пример #21
0
    def start_requests(self):
        # lev1_list = [
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers/zgbs/amazon-devices/',
        #     #  'category_name': 'Amazon Devices & Accessories', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Amazon-Launchpad/zgbs/boost/',
        #     #  'category_name': 'Amazon Launchpad', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Prime-Pantry/zgbs/pantry/', 'category_name': 'Amazon Pantry',
        #     #  'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Appliances/zgbs/appliances/', 'category_name': 'Appliances',
        #     #  'level': 1}, {'cat_url': 'https://www.amazon.com/Best-Sellers-Appstore-Android/zgbs/mobile-apps/',
        #     #                'category_name': 'Apps & Games', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Arts-Crafts-Sewing/zgbs/arts-crafts/',
        #     #  'category_name': 'Arts, Crafts & Sewing', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Audible-Audiobooks/zgbs/audible/',
        #     #  'category_name': 'Audible Books & Originals', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Automotive/zgbs/automotive/', 'category_name': 'Automotive',
        #     #  'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Baby/zgbs/baby-products/', 'category_name': 'Baby', 'level': 1},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Beauty/zgbs/beauty/', 'category_name': 'Beauty & Personal Care',
        #      'level': 1, "category_id": 2001911},
        #     # {'cat_url': 'https://www.amazon.com/best-sellers-books-Amazon/zgbs/books/', 'category_name': 'Books',
        #     #  'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/best-sellers-music-albums/zgbs/music/', 'category_name': 'CDs & Vinyl',
        #     #  'level': 1},
        #     {'cat_url': 'https://www.amazon.com/best-sellers-camera-photo/zgbs/photo/', 'category_name': 'Camera & Photo',
        #      'level': 1,"category_id": 2001912},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers/zgbs/wireless/', 'category_name': 'Cell Phones & Accessories',
        #      'level': 1,"category_id": 2001913},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers/zgbs/fashion/', 'category_name': 'Clothing, Shoes & Jewelry',
        #     #  'level': 1}, {'cat_url': 'https://www.amazon.com/Best-Sellers-Collectible-Coins/zgbs/coins/',
        #     #                'category_name': 'Collectible Currencies', 'level': 1},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Computers-Accessories/zgbs/pc/',
        #      'category_name': 'Computers & Accessories', 'level': 1, "category_id": 2001914},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-MP3-Downloads/zgbs/dmusic/', 'category_name': 'Digital Music',
        #     #  'level': 1},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Electronics/zgbs/electronics/', 'category_name': 'Electronics',
        #      'level': 1, "category_id": 2001915},
        #     # {
        #     #     'cat_url': 'https://www.amazon.com/Best-Sellers-Entertainment-Collectibles/zgbs/entertainment-collectibles/',
        #     #     'category_name': 'Entertainment Collectibles', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Gift-Cards/zgbs/gift-cards/', 'category_name': 'Gift Cards',
        #     #  'level': 1}, {'cat_url': 'https://www.amazon.com/Best-Sellers-Grocery-Gourmet-Food/zgbs/grocery/',
        #     #                'category_name': 'Grocery & Gourmet Food', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Handmade/zgbs/handmade/', 'category_name': 'Handmade Products',
        #     #  'level': 1},
        # {'cat_url': 'https://www.amazon.com/Best-Sellers-Health-Personal-Care/zgbs/hpc/',
        #                    'category_name': 'Health & Household', 'level': 1, "category_id": 2001916},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Home-Kitchen/zgbs/home-garden/',
        #      'category_name': 'Home & Kitchen', 'level': 1,"category_id": 2001917},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Industrial-Scientific/zgbs/industrial/',
        #     #  'category_name': 'Industrial & Scientific', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Kindle-Store/zgbs/digital-text/',
        #     #  'category_name': 'Kindle Store', 'level': 1},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Kitchen-Dining/zgbs/kitchen/',
        #      'category_name': 'Kitchen & Dining', 'level': 1,"category_id": 2001918},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Magazines/zgbs/magazines/',
        #     #  'category_name': 'Magazine Subscriptions', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/best-sellers-movies-TV-DVD-Blu-ray/zgbs/movies-tv/',
        #     #  'category_name': 'Movies & TV', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Musical-Instruments/zgbs/musical-instruments/',
        #     #  'category_name': 'Musical Instruments', 'level': 1},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Office-Products/zgbs/office-products/',
        #      'category_name': 'Office Products', 'level': 1,"category_id": 2001919},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Garden-Outdoor/zgbs/lawn-garden/',
        #      'category_name': 'Patio, Lawn & Garden', 'level': 1, "category_id": 2001920},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Pet-Supplies/zgbs/pet-supplies/',
        #      'category_name': 'Pet Supplies', 'level': 1,"category_id": 2001921},
        #     # {'cat_url': 'https://www.amazon.com/best-sellers-software/zgbs/software/', 'category_name': 'Software',
        #     #  'level': 1},
        # {'cat_url': 'https://www.amazon.com/Best-Sellers-Sports-Outdoors/zgbs/sporting-goods/',
        #                    'category_name': 'Sports & Outdoors', 'level': 1,"category_id": 2001922},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Sports-Collectibles/zgbs/sports-collectibles/',
        #     #  'category_name': 'Sports Collectibles', 'level': 1},
        #     {'cat_url': 'https://www.amazon.com/Best-Sellers-Home-Improvement/zgbs/hi/',
        #      'category_name': 'Tools & Home Improvement', 'level': 1, "category_id": 2001923},
        #     # {'cat_url': 'https://www.amazon.com/Best-Sellers-Toys-Games/zgbs/toys-and-games/',
        #     #  'category_name': 'Toys & Games', 'level': 1},
        #     # {'cat_url': 'https://www.amazon.com/best-sellers-video-games/zgbs/videogames/', 'category_name': 'Video Games',
        #     #  'level': 1}
        # ]

        # code = data['code']
        # if code == 0:
        #     task_list = data['data']
        # for data in task_list:
        # try:
        # categoryId = data['categoryId']
        # categoryId = data['category_id']
        # if categoryId != 0:
        #     if data['level'] == 1:
        #         category_name = data['category_name']
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
            "cookie":
            "session-id=145-5209209-9478023; i18n-prefs=USD; ubid-main=133-8276981-7688751; x-wl-uid=1PCOyx0JI1kz7vWchZyMhRWJtqj1XoQoE0UNJPLhOT/Q8+kepq170hFhtVj1OBOSit46HW9f+Rz8=; lc-main=en_US; session-id-time=2082787201l; session-token=3TtwIpr/LCK/R5dUusiKqRfu1FQJmG80o4BC0knm7brPg8aelaJ+f/B16GedWlTyDSjn8qQo3s3PmGmw5mHywT8RWHthFHuduD76fCQKbeUHR0G/OJ4sj2eZxXUoxgcWn+a+xbKm+Rpj5ciXMPsk4ObS1HmuF5NFMFttjbT4ZsWQBxh5Ak9x1hxbsqNIrrrW; csm-hit=tb:0YBA58R18R2BQ1H4SWX6+b-0YBA58R18R2BQ1H4SWX6|1592453272955&t:1592453272955&adb:adblk_yes"
        }
        # Fetch categories with state=0 from MySQL and crawl their sub-categories and ASINs
        mysql_server = Mysql_server()
        cursor = mysql_server.get_cursor()
        cursor.execute(
            f"select category_id, category_name, level, bsr_url,country from amz_category where state=0 and country='de' limit 10"
        )
        task_list = cursor.fetchall()
        print(task_list)
        start_list = []
        # Normalize the data and mark each category as claimed
        for task in task_list:
            task = {
                'cat_url': task[3].split('ref')[0],
                'category_name': task[1],
                'level': task[2],
                "category_id": task[0],
                'country': task[4]
            }
            params = (task['category_id'], task['level'], task['country'])
            update_sql = """update amz_category set state=1 where category_id=%s and level=%s and country=%s"""
            cursor.execute(update_sql, params)
            # Add the prepared task to the task list
            start_list.append(task)
        mysql_server.conn.commit()
        mysql_server.close()
        # Send the requests and start crawling
        for data in start_list:
            req_url = data['cat_url'] + 'ref='
            yield scrapy.Request(
                req_url,
                headers=self.headers,
                dont_filter=True,
                callback=self.parse,
                meta={
                    'data': data,
                    'req_url': req_url,
                    'page_num': 1,
                    # 'cookiejar': response.meta['cookiejar']
                })
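Every snippet in this collection builds a Mysql_server wrapper whose definition is not included here. A plausible minimal version, reconstructed only from the attributes and methods the spiders actually call (get_cursor, conn.commit, close); host, credentials and database name are placeholders:

import pymysql

class Mysql_server(object):
    def __init__(self, host='127.0.0.1', port=3306, user='root',
                 password='password', db='amazon'):
        # One pymysql connection per instance; callers commit via self.conn
        self.conn = pymysql.connect(host=host, port=port, user=user,
                                    password=password, db=db, charset='utf8mb4')

    def get_cursor(self):
        return self.conn.cursor()

    def close(self):
        self.conn.close()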
Пример #22
0
 def open_spider(self, spider):
     self.mysql_server = Mysql_server()
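In these snippets the state column behaves like a lightweight task queue: rows start at 0, are set to 1 when a spider claims them, and later move on to 2 (finished) or -1 (product gone). If a run dies after claiming its tasks, the rows stay stuck at 1; a hypothetical maintenance helper (not part of the source) to release them:

def release_stuck_tasks(table_name):
    # Put claimed-but-unfinished rows back into the pending state
    mysql_server = Mysql_server()
    cursor = mysql_server.get_cursor()
    cursor.execute(f"update {table_name} set state=0 where state=1")
    mysql_server.conn.commit()
    mysql_server.close()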
Пример #23
0
class TranslateScript(object):
    def __init__(self, table, fromLang='auto', toLang='zh'):
        env_dist = os.environ
        self.table = table
        self.toLang = toLang
        self.fromLang = fromLang
        self.mysql = Mysql_server()
        self.cursor = self.mysql.get_cursor()
        self.appid = env_dist.get('baidufanyi_appid')  # your Baidu Translate appid
        self.secretKey = env_dist.get('baidufanyi_secretKey')  # your Baidu Translate secret key

    def translate(self, q='/'):
        httpClient = None
        toLang = self.toLang
        fromLang = self.fromLang
        myurl = '/api/trans/vip/translate'
        salt = random.randint(32768, 65536)
        sign = self.appid + str(q) + str(salt) + self.secretKey
        sign = hashlib.md5(sign.encode()).hexdigest()
        myurl = myurl + '?appid=' + self.appid + '&q=' + urllib.parse.quote(
            q) + '&from=' + fromLang + '&to=' + toLang + '&salt=' + str(
                salt) + '&sign=' + sign
        try:
            httpClient = http.client.HTTPConnection('api.fanyi.baidu.com')
            httpClient.request('GET', myurl)
            # response is an http.client.HTTPResponse object
            response = httpClient.getresponse()
            result_all = response.read().decode("utf-8")
            result = json.loads(result_all)
            return result.get('trans_result')[0].get('dst')
        except Exception as e:
            print(e)
        finally:
            if httpClient:
                httpClient.close()

    def get_data(self):
        select_sql = "select id, review_title, review_body from {} where state=0 limit 10".format(
            self.table)
        self.cursor.execute(select_sql)
        data = self.cursor.fetchall()
        for record in data:
            params = (record[0],)
            update_sql = "update {} set state=1 where id=%s".format(self.table)
            self.cursor.execute(update_sql, params)
        self.mysql.conn.commit()
        return data

    def update_data(self, item):
        params = (item['translate_review_title'],
                  item['translate_review_body'], item['id'])
        update_sql = "update {} set translate_review_title=%s, translate_review_body=%s, state=2 where id=%s".format(
            self.table)
        self.cursor.execute(update_sql, params)
        self.mysql.conn.commit()
        print('{} updated successfully'.format(item['id']))

    def close_link(self):
        self.mysql.conn.close()

    def start(self):
        data = self.get_data()
        for record in data:
            translate_review_title = ''
            translate_review_body = ''
            if record[2]:
                translate_review_body = self.translate(q=record[2]) or ''
                time.sleep(2)
            if record[1]:
                translate_review_title = self.translate(q=record[1]) or ''
                time.sleep(1)
            item = {
                'id': record[0],
                'translate_review_title': translate_review_title,
                'translate_review_body': translate_review_body,
            }
            self.update_data(item)
        self.close_link()
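The Baidu Translate request above signs each query as md5(appid + q + salt + secretKey), and start() claims up to 10 reviews with state=0, translates title and body, and writes them back with state=2. A usage sketch (the table name is a placeholder; appid/secret are read from the environment, as in __init__):

if __name__ == '__main__':
    # export baidufanyi_appid=... ; export baidufanyi_secretKey=...
    script = TranslateScript(table='us_reviews', fromLang='en', toLang='zh')
    script.start()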
Пример #24
0
 def open_spider(self, spider):
     self.mysql_server = Mysql_server()
     self.client = pymongo.MongoClient('127.0.0.1', 27017, maxPoolSize=100)
     self.db = self.client.amazon
     self.collection = self.db.detail_info
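open_spider above only opens the MySQL and MongoDB connections; a pipeline of this shape usually also implements process_item and close_spider. A hedged sketch of those counterparts (only the attribute names come from the snippet; the write to detail_info is an assumption):

 def process_item(self, item, spider):
     # Store the scraped detail record in the detail_info collection
     self.collection.insert_one(dict(item))
     return item

 def close_spider(self, spider):
     self.mysql_server.close()
     self.client.close()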
Пример #25
0
    def parse(self, response):
        keyword = response.meta['keyword']
        country = response.meta['country']

        # Update the proxy score: if the proxy works, give it a high score to move it to the front; otherwise give a low score to push it back
        proxy = response.meta.get('proxy',
                                  '').replace('https://',
                                              '').replace('http://', '')
        proxy_data = {
            "proxy": proxy,
            "fail_count": 0,
            "region": "",
            "type": "",
            "source": "spider",
            "check_count": 20,
            "last_status": 0,
            "last_time": ""
        }

        # A captcha appeared: lower the proxy's score and retry
        if len(response.body) < 10000 or response.status == 503:
            proxy_data['fail_count'] = 18
            self.collection.hset(name="useful_proxy",
                                 key=proxy,
                                 value=json.dumps(proxy_data))
            meta = response.meta
            self.headers['referer'] = response.url
            self.headers['origin'] = 'https://www.amazon.' + self.web_country[
                meta['country']]
            headers = self.headers
            yield scrapy.Request(url=response.url,
                                 meta=meta,
                                 headers=headers,
                                 callback=self.parse,
                                 dont_filter=True)

        else:
            # No captcha: raise the proxy's score
            self.collection.hset(name="useful_proxy",
                                 key=proxy,
                                 value=json.dumps(proxy_data))
            # country = response.meta['country']
            result_num = response.meta['result_num']
            results = response.xpath(
                '//span[@data-component-type="s-search-results"]/div/div[@data-asin]'
            )
            # ab_position = response.meta['absolute_position']
            metas = response.meta
            if len(results) == 0 and metas['retries'] < 30:
                # Retry if no ASIN results were found on the page
                metas['retries'] += 1
                self.headers['referer'] = response.url
                self.headers['origin'] = 'https://www.amazon.' + self.web_country[
                    country]
                headers = self.headers
                yield scrapy.Request(
                    url=response.url,
                    meta=metas,
                    headers=headers,
                    callback=self.parse,
                    dont_filter=True,
                )
                return
            # Extract the ASIN data on this page
            # page_num = response.meta['page']
            item = []  # used to check whether the 300-ASIN target has been reached
            position_num = 0
            for result in results:
                result_dict = {}
                result_dict['pageNum'] = response.meta['page']  # page number where the ASIN was found
                asin = result.xpath('@data-asin').extract_first()  # extract the ASIN
                if asin != '':
                    result_dict['asin'] = asin
                    position_num += 1
                    result_dict['positionNum'] = position_num  # position on the page
                    # Check whether this is a sponsored (ad) slot   # TODO: the UK site sometimes shows pages with no sponsored slots at all
                    ad_str = result.xpath(
                        './div//span/span/span/span/text()').extract_first()
                    print(ad_str, "====")
                    if ad_str in [
                            "Sponsored", "Sponsorisé", "Sponsorizzato",
                            "Patrocinado", "スポンサー プロダクト", "Gesponsert"
                    ]:
                        result_dict['ad'] = 1
                    else:
                        result_dict['ad'] = 0

                    result_dict['keyword'] = response.meta['keyword']
                    result_dict['country'] = country
                    item.append(result_dict)
                    yield result_dict
            result_num += len(item)

            # Get the next-page URL
            temp_url = response.xpath(
                '//ul[@class="a-pagination"]/li[@class="a-last"]/a/@href'
            ).extract_first()

            # Only the first 300 ASINs are needed; to make sure at least 300 remain after deduplication, the cutoff is set to 400
            if result_num <= 400 and temp_url is not None:
                next_url = "https://www.amazon." + self.web_country[
                    response.meta['country']] + temp_url
                self.headers['referer'] = response.url
                self.headers['origin'] = 'https://www.amazon.' + self.web_country[
                    country]
                headers = self.headers
                yield scrapy.Request(
                    url=next_url,
                    meta={
                        "page": response.meta['page'] + 1,
                        "keyword": keyword,
                        # 'absolute_position': ab_position,
                        'country': country,
                        'retries': 0,
                        "result_num": result_num
                    },
                    headers=headers,
                    callback=self.parse,
                    dont_filter=True)
            else:
                # Enough ASINs collected (or no next page): mark the keyword as finished
                mysql_server = Mysql_server()
                cursor = mysql_server.get_cursor()
                sql = """update keywords set state=2 where keyword=%s and country=%s"""
                params = (keyword, country)
                cursor.execute(sql, params)
                mysql_server.conn.commit()
                mysql_server.close()
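The 400 cutoff above exists because the raw per-page results contain duplicates (sponsored slots repeat organic ASINs), so roughly 400 collected rows are needed to still have 300 unique ASINs after deduplication. A small standalone sketch of that trimming step (the function and variable names are hypothetical):

def trim_asins(rows, limit=300):
    # Keep the first occurrence of each ASIN (best position), then cap at `limit`
    seen = set()
    unique = []
    for row in rows:  # rows are dicts shaped like result_dict above
        if row['asin'] not in seen:
            seen.add(row['asin'])
            unique.append(row)
    return unique[:limit]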