Example #1
    def start_requests(self):
        urls_mba = []
        headers = get_random_headers(self.marketplace)
        # case: use a CSV with search terms
        if not self.df_search_terms.empty:
            for i, df_row in self.df_search_terms.iterrows():
                search_term = df_row["search_term"]
                url_mba = url_creator.main([search_term, self.marketplace, self.pod_product, self.sort])
                url_mba_page = url_mba + "&page=1"#+"&ref=sr_pg_"+str(page_number)
                urls_mba.append(url_mba_page)
        else:
            url_mba = url_creator.main([self.keyword, self.marketplace, self.pod_product, self.sort])
            send_msg(self.target, "Start scraper {} marketplace {} with {} pages and start page {} and sort {}".format(self.name, self.marketplace, self.pages, self.start_page, self.sort), self.api_key)
            # if start_page is other than one, the crawler should start from a different page
            until_page = 401

            if self.pages != 0:
                until_page = self.start_page + self.pages
            for page_number in np.arange(self.start_page, until_page, 1):
                if page_number <= 400:
                    url_mba_page = url_mba + "&page="+str(page_number)#+"&ref=sr_pg_"+str(page_number)
                    urls_mba.append(url_mba_page)
        for i, url_mba in enumerate(urls_mba):
            page = i + self.start_page
            # if self.marketplace == "com": 
            #     url_change_zip_code = "https://www.amazon.com/gp/delivery/ajax/address-change.html"
            #     yield scrapy.http.JsonRequest(url=url_change_zip_code, callback=self.change_zip_code, headers=headers, priority=i, data=self.change_zip_code_post_data,
            #                         errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, 'page': page, "url": url_mba, "headers": headers})
            # else:
            yield scrapy.Request(url=url_mba, callback=self.parse, headers=headers, priority=i,
                                    errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, 'page': page, "url": url_mba, "headers": headers})
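A short aside on the page loop above: when no CSV of search terms is supplied, one search-results URL is built per page, and Amazon search results stop at page 400 (hence until_page = 401 and the page_number <= 400 guard). A minimal sketch of the same clamped range, with hypothetical start_page/pages values and a placeholder base URL:

import numpy as np

start_page, pages = 5, 10
until_page = start_page + pages if pages != 0 else 401  # pages == 0 means "crawl up to the cap"
page_numbers = np.arange(start_page, min(until_page, 401), 1)  # never go past page 400
urls_mba = [f"https://www.amazon.com/s?k=shirt&page={p}" for p in page_numbers]  # placeholder URL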
Example #2
 def send_request_again(self, url, asin):
     headers = get_random_headers(self.marketplace)
     # send new request with high priority
     request = scrapy.Request(url=url,
                              callback=self.parse,
                              headers=headers,
                              priority=0,
                              dont_filter=True,
                              errback=self.errback_httpbin,
                              meta={"asin": asin})
     yield request
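Note that send_request_again is itself a generator (it yields the retry request), so a callback has to pass its output on to Scrapy rather than just calling it; dont_filter=True is what lets the already-seen URL through the dupefilter. A minimal usage sketch, assuming a parse callback shaped like the ones in the other examples:

 def parse(self, response):
     asin = response.meta["asin"]
     if self.is_captcha_required(response):
         # delegate to the retry helper; "yield from" re-yields its scrapy.Request
         yield from self.send_request_again(response.url, asin)
         return
     # ... normal parsing of the product page would continue here ...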
Example #3
 def start_requests(self):
     self.reset_was_banned_every_hour()
     urls = pd.read_csv(self.url_data_path)["url"].tolist()
     asins = pd.read_csv(self.url_data_path)["asin"].tolist()
     for i, url in enumerate(urls):
         #proxies = proxy_handler.get_random_proxy_url_dict()
         headers = get_random_headers(self.marketplace)
         asin = asins[i]
         yield scrapy.Request(url=url,
                              callback=self.parse,
                              headers=headers,
                              priority=1,
                              errback=self.errback_httpbin,
                              meta={
                                  "asin": asin,
                                  "max_proxies_to_try": 20
                              })  # "proxy": proxies["http"],
 def start_requests(self):
     urls = pd.read_csv(
         "mba_crawler/url_data/urls_mba_daily_de.csv")["url"].tolist()
     asins = pd.read_csv(
         "mba_crawler/url_data/urls_mba_daily_de.csv")["asin"].tolist()
     send_msg(
         self.target,
         "Start scraper {} with {} products".format(self.name, len(urls)),
         self.api_key)
     for i, url in enumerate(urls):
         #proxies = proxy_handler.get_random_proxy_url_dict()
         headers = get_random_headers(self.marketplace)
         asin = asins[i]
         yield scrapy.Request(url=url,
                              callback=self.parse,
                              headers=headers,
                              errback=self.errback_httpbin,
                              meta={"asin":
                                    asin})  # "proxy": proxies["http"],
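Both start_requests variants above read the same CSV twice and then walk two parallel lists. A sketch of an equivalent loop that reads the file once, assuming the same url/asin columns and the spider attributes used elsewhere in these examples:

 def start_requests(self):
     df = pd.read_csv(self.url_data_path)  # read once instead of twice
     for row in df.itertuples(index=False):
         headers = get_random_headers(self.marketplace)
         yield scrapy.Request(url=row.url,
                              callback=self.parse,
                              headers=headers,
                              priority=1,
                              errback=self.errback_httpbin,
                              meta={"asin": row.asin, "max_proxies_to_try": 20})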
Example #5
 def start_requests(self):
     self.reset_was_banned_every_hour()
     urls = pd.read_csv(self.url_data_path)["url"].tolist()
     asins = pd.read_csv(self.url_data_path)["asin"].tolist()
     send_msg(
         self.target, "Start scraper {} daily {} with {} products".format(
             self.name, self.daily, len(urls)), self.api_key)
     for i, url in enumerate(urls):
         #proxies = proxy_handler.get_random_proxy_url_dict()
         headers = get_random_headers(self.marketplace)
         asin = asins[i]
         yield scrapy.Request(url=url,
                              callback=self.parse,
                              headers=headers,
                              priority=1,
                              errback=self.errback_httpbin,
                              meta={
                                  "asin": asin,
                                  "max_proxies_to_try": 20
                              })  # "proxy": proxies["http"],
 def start_requests(self):
     self.reset_was_banned_every_hour()
     df = pd.read_gbq(
         'SELECT asin FROM `mba-pipeline.mba_de.products_no_mba_shirt` WHERE asin = "B07X6399HF"',
         project_id="mba-pipeline")
     urls = df["asin"].apply(
         lambda asin: f"https://www.amazon.{self.marketplace}/dp/{asin}"
     ).tolist()
     asins = df["asin"].tolist()
     for i, url in enumerate(urls):
         #proxies = proxy_handler.get_random_proxy_url_dict()
         headers = get_random_headers(self.marketplace)
         asin = asins[i]
         yield scrapy.Request(url=url,
                              callback=self.parse,
                              headers=headers,
                              priority=1,
                              errback=self.errback_httpbin,
                              meta={
                                  "asin": asin,
                                  "max_proxies_to_try": 20
                              })  # "proxy": proxies["http"],
    def parse(self, response):
        asin = response.meta["asin"]
        proxy = self.get_proxy(response)

        url = response.url
        #send_msg(self.target, "Response catched: {} with proxy {}".format(url,proxy), self.api_key)
        if self.is_captcha_required(response):
            #self.response_is_ban(request, response, is_ban=True)
            print("Captcha required for proxy: " + proxy)
            self.captcha_count = self.captcha_count + 1
            self.update_ban_count(proxy)
            #send_msg(self.target, "Captcha: " + url, self.api_key)

            headers = get_random_headers(self.marketplace)
            # send new request with high priority
            request = scrapy.Request(url=url,
                                     callback=self.parse,
                                     headers=headers,
                                     priority=0,
                                     dont_filter=True,
                                     errback=self.errback_httpbin,
                                     meta={"asin": asin})
            yield request
        # do not proceed if it's not an MBA shirt
        elif not self.is_mba_shirt(response):
            self.df_products_no_mba_shirt = self.df_products_no_mba_shirt.append(
                pd.DataFrame(
                    data={
                        "asin": [asin],
                        "url": [url],
                        "timestamp": [datetime.datetime.now()]
                    }))
        else:
            self.asin_list_remove_from_blacklist.append(asin)

            self.status_update()
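The examples accumulate rows with DataFrame.append, which is deprecated and was removed in pandas 2.0. A sketch of the same bookkeeping that collects plain dicts and builds the frame once; the buffer attribute, helper method, and closed() hook are hypothetical additions to the spider class:

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.rows_no_mba_shirt = []  # hypothetical buffer of row dicts

    def note_no_mba_shirt(self, asin, url):
        # called from parse() instead of DataFrame.append
        self.rows_no_mba_shirt.append({
            "asin": asin,
            "url": url,
            "timestamp": datetime.datetime.now()
        })

    def closed(self, reason):
        # build the DataFrame once when the spider finishes
        self.df_products_no_mba_shirt = pd.DataFrame(self.rows_no_mba_shirt)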
Example #8
    def parse(self, response):
        proxy = self.get_proxy(response)
        url = response.url
        page = response.meta["page"]
        image_urls = []
        asins = []
        url_mba_lowqs = []

        #self.get_zip_code_location(response)
        #self.get_count_results(response)

        if self.is_captcha_required(response):
            #self.response_is_ban(request, response, is_ban=True)
            print("Captcha required for proxy: " + proxy)
            self.captcha_count = self.captcha_count + 1
            self.update_ban_count(proxy)            
            headers = get_random_headers(self.marketplace)
            # send new request with high priority
            request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0, dont_filter=True,
                                    errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, "page": page})
            yield request
        else:
            
            if self.should_zip_code_be_changed(response):
                print("Proxy does not get all .com results: " + proxy)
                self.update_ban_count(proxy)   
                headers = get_random_headers(self.marketplace)
                # send new request with high priority
                request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0, dont_filter=True,
                                        errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, "page": page})
                yield request
                # change zip code
                # meta_dict = {"max_proxies_to_try": 30, 'page': page, "url": url, "headers": response.meta["headers"]}
                # url_change_zip_code = "https://www.amazon.com/gp/delivery/ajax/address-change.html"
                # if self.is_perfect_privacy_proxy(response):
                #     proxy = "http://*****:*****@" + response.meta["download_slot"] + ":3128"
                # meta_dict.update({"proxy": proxy, "_rotating_proxy": False})
                # yield scrapy.http.JsonRequest(url=url_change_zip_code, callback=self.change_zip_code, headers=response.meta["headers"], priority=0, data=self.change_zip_code_post_data,
                #                     errback=self.errback_httpbin, meta=meta_dict, dont_filter=True)
            else:
                self.ip_addresses.append(response.ip_address.compressed)
                shirts = response.css('div.sg-col-inner')
                shirt_number_page = 0
                for i, shirt in enumerate(shirts):
                    if not self.is_shirt(shirt):
                        continue
                    shirt_number_page = shirt_number_page + 1
                    try:
                        price = self.get_price(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        title = self.get_title(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        brand = self.get_brand(shirt)
                    except Exception as e:
                        print("Could not get brand of shirt: ",title)
                        brand = None
                        # its possible that amazon does not show brand on overview page. Therefore raise is not neccessary.
                        #self.save_content(response, url)
                        #send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        #raise e
                    try:
                        url_product = self.get_url_product(shirt, url)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        url_image_lowq, url_image_q2, url_image_q3, url_image_q4, url_image_hq = self.get_img_urls(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        asin = self.get_asin(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                    try:
                        uuid = self.get_uuid(shirt)
                    except Exception as e:
                        self.save_content(response, url)
                        send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                        raise e
                        
                    crawlingdate = datetime.datetime.now()
                    # append to general crawler
                    df_products = pd.DataFrame(data={"title":[title],"brand":[brand],"url_product":[url_product],"url_image_lowq":[url_image_lowq],"url_image_hq":[url_image_hq],"price":[price],"asin":[asin],"uuid":[uuid], "timestamp":[crawlingdate]})
                    df_mba_images = pd.DataFrame(data={"asin":[asin],"url_image_lowq":[url_image_lowq],"url_image_q2":[url_image_q2], "url_image_q3":[url_image_q3], "url_image_q4":[url_image_q4],"url_image_hq":[url_image_hq], "timestamp":[crawlingdate]})
                    shirt_number = int(shirt_number_page + ((int(page)-1)*self.shirts_per_page))
                    df_mba_relevance = pd.DataFrame(data={"asin":[asin],"sort":[self.sort],"number":[shirt_number],"timestamp":[crawlingdate]})

                    self.df_products = self.df_products.append(df_products)
                    self.df_mba_images = self.df_mba_images.append(df_mba_images)
                    self.df_mba_relevance = self.df_mba_relevance.append(df_mba_relevance)

                    # crawl only image if not already crawled
                    if asin not in self.products_images_already_downloaded:
                        image_urls.append(url_image_hq)
                        asins.append(asin)
                        url_mba_lowqs.append(url_image_lowq)

                # crawl images
                image_item = MbaCrawlerItem()
                image_item["image_urls"] = image_urls
                image_item["asins"] = asins
                image_item["url_mba_lowqs"] = url_mba_lowqs
                image_item["marketplace"] = self.marketplace
                if self.marketplace in ["com", "de"]:
                    yield image_item
                
                self.page_count = self.page_count + 1
                self.status_update()


                #url_next = "/".join(url.split("/")[0:3]) + response.css("ul.a-pagination li.a-last a::attr(href)").get()
                
Example #9
    def parse(self, response):
        asin = response.meta["asin"]
        proxy = self.get_proxy(response)

        url = response.url
        #send_msg(self.target, "Response catched: {} with proxy {}".format(url,proxy), self.api_key)
        if self.is_captcha_required(response):
            #self.response_is_ban(request, response, is_ban=True)
            print("Captcha required for proxy: " + proxy)
            self.captcha_count = self.captcha_count + 1
            self.update_ban_count(proxy)
            #send_msg(self.target, "Captcha: " + url, self.api_key)

            headers = get_random_headers(self.marketplace)
            # send new request with high priority
            request = scrapy.Request(url=url,
                                     callback=self.parse,
                                     headers=headers,
                                     priority=0,
                                     dont_filter=True,
                                     errback=self.errback_httpbin,
                                     meta={"asin": asin})
            yield request
            '''
            raise Exception("Captcha required")
            send_msg(self.target, "Captcha required" + " | asin: " + asin, self.api_key)
            self.captcha_count = self.captcha_count + 1
            # add download delay if a captcha happens
            self.settings.attributes["DOWNLOAD_DELAY"].value = self.settings.attributes["DOWNLOAD_DELAY"].value + 3
            if self.captcha_count > self.settings.attributes["MAX_CAPTCHA_NUMBER"].value:
                raise CloseSpider(reason='Too many captchas received')
            raise Exception("Captcha required")
            '''
        # do not proceed if it's not an MBA shirt
        elif not self.is_mba_shirt(response):
            self.df_products_no_mba_shirt = self.df_products_no_mba_shirt.append(
                pd.DataFrame(
                    data={
                        "asin": [asin],
                        "url": [url],
                        "timestamp": [datetime.datetime.now()]
                    }))
        else:
            self.ip_addresses.append(response.ip_address.compressed)
            try:
                price_str, price = self.get_price(response)
            except Exception as e:
                #self.save_content(response, asin)
                #send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                price_str, price = "", 0.0
            try:
                mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = self.get_bsr(
                    response)
            except Exception as e:
                self.save_content(response, asin)
                #send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                if "no bsr" in str(e):
                    self.df_products_no_bsr = self.df_products_no_bsr.append(
                        pd.DataFrame(
                            data={
                                "asin": [asin],
                                "url": [url],
                                "timestamp": [datetime.datetime.now()]
                            }))
                if self.daily:
                    raise e
                else:
                    # Cases exist like https://www.amazon.com/dp/B0855BCBZ6, which should have a BSR but don't contain it in the HTML
                    # Therefore, we want to crawl it just once (if not a daily crawl)
                    mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = "", 0, [], []
            try:
                customer_review_score_mean, customer_review_score, customer_review_count = self.get_customer_review(
                    response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target,
                         str(e) + " | asin: " + asin, self.api_key)
                raise e
            # if not the daily crawler, more data from the website needs to be crawled
            if not self.daily:
                try:
                    title = self.get_title(response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    brand, url_brand = self.get_brand_infos(response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    fit_types = self.get_fit_types(response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    array_color_names, color_count = self.get_color_infos(
                        response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    array_product_feature = self.get_product_features(response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e
                try:
                    description = self.get_description(response)
                except Exception as e:
                    #self.save_content(response, asin)
                    #send_msg(self.target, str(e) + "| asin: " + asin, self.api_key)
                    #raise e
                    description = ""
                try:
                    weight = self.get_weight(response)
                except Exception as e:
                    weight = "not found"
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                try:
                    upload_date_str, upload_date = self.get_upload_date(
                        response)
                except Exception as e:
                    self.save_content(response, asin)
                    send_msg(self.target,
                             str(e) + " | asin: " + asin, self.api_key)
                    raise e

            crawlingdate = datetime.datetime.now()
            if not self.daily:
                # append to general crawler
                df = pd.DataFrame(
                    data={
                        "asin": [asin],
                        "title": [title],
                        "brand": [brand],
                        "url_brand": [url_brand],
                        "price": [price_str],
                        "fit_types": [fit_types],
                        "color_names": [array_color_names],
                        "color_count": [color_count],
                        "product_features": [array_product_feature],
                        "description": [description],
                        "weight": [weight],
                        "upload_date_str": [upload_date_str],
                        "upload_date": [upload_date],
                        "customer_review_score": [customer_review_score],
                        "customer_review_count": [customer_review_count],
                        "mba_bsr_str": [mba_bsr_str],
                        "mba_bsr": [array_mba_bsr],
                        "mba_bsr_categorie": [array_mba_bsr_categorie],
                        "timestamp": [crawlingdate]
                    })
                self.df_products_details = self.df_products_details.append(df)

            # append to daily crawler
            df = pd.DataFrame(
                data={
                    "asin": [asin],
                    "price": [price],
                    "price_str": [price_str],
                    "bsr": [mba_bsr],
                    "bsr_str": [mba_bsr_str],
                    "array_bsr": [array_mba_bsr],
                    "array_bsr_categorie": [array_mba_bsr_categorie],
                    "customer_review_score_mean": [customer_review_score_mean],
                    "customer_review_score": [customer_review_score],
                    "customer_review_count": [customer_review_count],
                    "timestamp": [crawlingdate]
                })
            self.df_products_details_daily = self.df_products_details_daily.append(
                df)

            self.status_update()
Example #10
    def parse(self, response):
        asin = response.meta["asin"]
        proxy = self.get_proxy(response)

        url = response.url
        #send_msg(self.target, "Response catched: {} with proxy {}".format(url,proxy), self.api_key)
        if self.is_captcha_required(response):
            #self.response_is_ban(request, response, is_ban=True)
            print("Captcha required for proxy: " + proxy)
            self.captcha_count = self.captcha_count + 1
            self.update_ban_count(proxy)
            #send_msg(self.target, "Captcha: " + url, self.api_key)

            headers = get_random_headers(self.marketplace)
            # send new request with high priority
            request = scrapy.Request(url=url,
                                     callback=self.parse,
                                     headers=headers,
                                     priority=0,
                                     dont_filter=True,
                                     errback=self.errback_httpbin,
                                     meta={"asin": asin})
            yield request
            '''
            raise Exception("Captcha required")
            send_msg(self.target, "Captcha required" + " | asin: " + asin, self.api_key)
            self.captcha_count = self.captcha_count + 1
            # add download delay if a captcha happens
            self.settings.attributes["DOWNLOAD_DELAY"].value = self.settings.attributes["DOWNLOAD_DELAY"].value + 3
            if self.captcha_count > self.settings.attributes["MAX_CAPTCHA_NUMBER"].value:
                raise CloseSpider(reason='Too many captchas received')
            raise Exception("Captcha required")
            '''
        else:
            self.ip_addresses.append(response.ip_address.compressed)
            try:
                # TODO: This script was not further developed, since image references are not the same on the product page and the overview page
                import ast
                image_urls_string = response.css(
                    'div#imgTagWrapperId img::attr(data-a-dynamic-image)').get(
                    )
                image_urls_dict = ast.literal_eval(image_urls_string)
                # NOTE: unpacking five image URLs from self.get_price() looks like a leftover
                # placeholder; the image_urls_dict parsed above is never used afterwards.
                url_image_lowq, url_image_q2, url_image_q3, url_image_q4, url_image_hq = self.get_price(
                    response)
            except Exception as e:
                #self.save_content(response, asin)
                #send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e

            crawlingdate = datetime.datetime.now()

            df_mba_images = pd.DataFrame(
                data={
                    "asin": [asin],
                    "url_image_lowq": [url_image_lowq],
                    "url_image_q2": [url_image_q2],
                    "url_image_q3": [url_image_q3],
                    "url_image_q4": [url_image_q4],
                    "url_image_hq": [url_image_hq],
                    "timestamp": [crawlingdate]
                })
            self.df_mba_images = self.df_mba_images.append(df_mba_images)
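Example #10's TODO never actually fills the five image-quality URLs: the data-a-dynamic-image attribute is a JSON object mapping candidate image URLs to [width, height], so the quality levels have to be derived from it. A sketch of picking the low- and high-quality URLs by pixel area, assuming the attribute is present on the page; the intermediate q2-q4 levels would need a similar size-based choice:

    import json

    image_urls_string = response.css(
        'div#imgTagWrapperId img::attr(data-a-dynamic-image)').get()
    image_urls_dict = json.loads(image_urls_string)  # {url: [width, height], ...}
    # sort candidate URLs by pixel area, smallest first
    urls_by_size = sorted(image_urls_dict,
                          key=lambda u: image_urls_dict[u][0] * image_urls_dict[u][1])
    url_image_lowq = urls_by_size[0]
    url_image_hq = urls_by_size[-1]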