def start_requests(self):
    urls_mba = []
    headers = get_random_headers(self.marketplace)
    # case: use a csv with search terms
    if not self.df_search_terms.empty:
        for i, df_row in self.df_search_terms.iterrows():
            search_term = df_row["search_term"]
            url_mba = url_creator.main([search_term, self.marketplace, self.pod_product, self.sort])
            url_mba_page = url_mba + "&page=1"  # +"&ref=sr_pg_"+str(page_number)
            urls_mba.append(url_mba_page)
    else:
        url_mba = url_creator.main([self.keyword, self.marketplace, self.pod_product, self.sort])
        send_msg(self.target,
                 "Start scraper {} marketplace {} with {} pages and start page {} and sort {}".format(
                     self.name, self.marketplace, self.pages, self.start_page, self.sort),
                 self.api_key)
        # if start_page is other than one, the crawler should start from a different page
        until_page = 401
        if self.pages != 0:
            until_page = self.start_page + self.pages
        for page_number in np.arange(self.start_page, until_page, 1):
            if page_number <= 400:
                url_mba_page = url_mba + "&page=" + str(page_number)  # +"&ref=sr_pg_"+str(page_number)
                urls_mba.append(url_mba_page)
    for i, url_mba in enumerate(urls_mba):
        page = i + self.start_page
        # if self.marketplace == "com":
        #     url_change_zip_code = "https://www.amazon.com/gp/delivery/ajax/address-change.html"
        #     yield scrapy.http.JsonRequest(url=url_change_zip_code, callback=self.change_zip_code, headers=headers, priority=i, data=self.change_zip_code_post_data,
        #                                   errback=self.errback_httpbin, meta={"max_proxies_to_try": 30, 'page': page, "url": url_mba, "headers": headers})
        # else:
        yield scrapy.Request(url=url_mba, callback=self.parse, headers=headers, priority=i,
                             errback=self.errback_httpbin,
                             meta={"max_proxies_to_try": 30, "page": page, "url": url_mba, "headers": headers})
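# --- Hedged sketch (not part of the original spider) ---
# `url_creator.main` is assumed to return an Amazon search URL for the given
# search term, marketplace and sort order; start_requests then appends "&page=N".
# Everything below (function name, sort keys, query layout) is illustrative only.
from urllib.parse import quote_plus

def build_search_url_sketch(keyword: str, marketplace: str, sort: str = "newest") -> str:
    # assumed sort keys; Amazon uses e.g. "date-desc-rank" for newest arrivals
    sort_map = {"newest": "date-desc-rank", "best_seller": "exact-aware-popularity-rank"}
    base = "https://www.amazon.{}/s?k={}".format(marketplace, quote_plus(keyword))
    return base + "&s=" + sort_map.get(sort, sort)

# usage mirroring start_requests: one request per result page
# url = build_search_url_sketch("dog shirt", "com", "newest")
# urls_mba = [url + "&page=" + str(page) for page in range(1, 4)]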
def send_request_again(self, url, asin):
    headers = get_random_headers(self.marketplace)
    # send a new request with high priority
    request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0,
                             dont_filter=True, errback=self.errback_httpbin, meta={"asin": asin})
    yield request
def start_requests(self):
    self.reset_was_banned_every_hour()
    df_urls = pd.read_csv(self.url_data_path)
    urls = df_urls["url"].tolist()
    asins = df_urls["asin"].tolist()
    for i, url in enumerate(urls):
        # proxies = proxy_handler.get_random_proxy_url_dict()
        headers = get_random_headers(self.marketplace)
        asin = asins[i]
        yield scrapy.Request(url=url, callback=self.parse, headers=headers, priority=1,
                             errback=self.errback_httpbin,
                             meta={"asin": asin, "max_proxies_to_try": 20})  # "proxy": proxies["http"],
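# --- Hedged sketch (not part of the original spider) ---
# The file behind self.url_data_path is assumed to be a CSV with at least the
# columns "url" and "asin" read above. Illustrative rows only; the real file is
# produced elsewhere in the pipeline.
import pandas as pd

def write_example_url_csv_sketch(path: str) -> None:
    df_urls = pd.DataFrame({
        "asin": ["B07X6399HF"],
        "url": ["https://www.amazon.de/dp/B07X6399HF"],
    })
    df_urls.to_csv(path, index=False)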
def start_requests(self):
    df_urls = pd.read_csv("mba_crawler/url_data/urls_mba_daily_de.csv")
    urls = df_urls["url"].tolist()
    asins = df_urls["asin"].tolist()
    send_msg(self.target,
             "Start scraper {} with {} products".format(self.name, len(urls)),
             self.api_key)
    for i, url in enumerate(urls):
        # proxies = proxy_handler.get_random_proxy_url_dict()
        headers = get_random_headers(self.marketplace)
        asin = asins[i]
        yield scrapy.Request(url=url, callback=self.parse, headers=headers,
                             errback=self.errback_httpbin, meta={"asin": asin})  # "proxy": proxies["http"],
def start_requests(self):
    self.reset_was_banned_every_hour()
    df_urls = pd.read_csv(self.url_data_path)
    urls = df_urls["url"].tolist()
    asins = df_urls["asin"].tolist()
    send_msg(self.target,
             "Start scraper {} daily {} with {} products".format(self.name, self.daily, len(urls)),
             self.api_key)
    for i, url in enumerate(urls):
        # proxies = proxy_handler.get_random_proxy_url_dict()
        headers = get_random_headers(self.marketplace)
        asin = asins[i]
        yield scrapy.Request(url=url, callback=self.parse, headers=headers, priority=1,
                             errback=self.errback_httpbin,
                             meta={"asin": asin, "max_proxies_to_try": 20})  # "proxy": proxies["http"],
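# --- Hedged sketch (not part of the original spider) ---
# `reset_was_banned_every_hour` is called above but not defined in this section.
# A plausible standalone sketch, assuming it clears per-proxy ban bookkeeping on
# an hourly timer; the attribute name `was_banned` is an assumption.
import threading

def reset_was_banned_every_hour_sketch(spider, reset_time_sec: int = 60 * 60) -> None:
    spider.was_banned = {}  # assumed: maps proxy -> ban info
    # re-arm the timer so the reset repeats while the spider is running
    timer = threading.Timer(reset_time_sec, reset_was_banned_every_hour_sketch, [spider, reset_time_sec])
    timer.daemon = True
    timer.start()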
def start_requests(self):
    self.reset_was_banned_every_hour()
    df = pd.read_gbq(
        'SELECT asin FROM `mba-pipeline.mba_de.products_no_mba_shirt` WHERE asin = "B07X6399HF"',
        project_id="mba-pipeline")
    urls = df["asin"].apply(
        lambda asin: f"https://www.amazon.{self.marketplace}/dp/{asin}").tolist()
    asins = df["asin"].tolist()
    for i, url in enumerate(urls):
        # proxies = proxy_handler.get_random_proxy_url_dict()
        headers = get_random_headers(self.marketplace)
        asin = asins[i]
        yield scrapy.Request(url=url, callback=self.parse, headers=headers, priority=1,
                             errback=self.errback_httpbin,
                             meta={"asin": asin, "max_proxies_to_try": 20})  # "proxy": proxies["http"],
def parse(self, response):
    asin = response.meta["asin"]
    proxy = self.get_proxy(response)
    url = response.url
    # send_msg(self.target, "Response caught: {} with proxy {}".format(url, proxy), self.api_key)
    if self.is_captcha_required(response):
        # self.response_is_ban(request, response, is_ban=True)
        print("Captcha required for proxy: " + proxy)
        self.captcha_count = self.captcha_count + 1
        self.update_ban_count(proxy)
        # send_msg(self.target, "Captcha: " + url, self.api_key)
        headers = get_random_headers(self.marketplace)
        # send a new request with high priority
        request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0,
                                 dont_filter=True, errback=self.errback_httpbin, meta={"asin": asin})
        yield request
    # do not proceed if it is not an MBA shirt
    elif not self.is_mba_shirt(response):
        self.df_products_no_mba_shirt = self.df_products_no_mba_shirt.append(
            pd.DataFrame(data={"asin": [asin], "url": [url], "timestamp": [datetime.datetime.now()]}))
    else:
        self.asin_list_remove_from_blacklist.append(asin)
    self.status_update()
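# --- Hedged sketch (not part of the original spider) ---
# `is_captcha_required` is used throughout this file but not defined in this
# section. A minimal standalone sketch, assuming it only checks the response
# body for Amazon's robot-check markers; the marker strings are assumptions.
def is_captcha_required_sketch(response) -> bool:
    markers = ["api-services-support@amazon.com", "Enter the characters you see below"]
    body = response.text
    return any(marker in body for marker in markers)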
def parse(self, response):
    proxy = self.get_proxy(response)
    url = response.url
    page = response.meta["page"]
    image_urls = []
    asins = []
    url_mba_lowqs = []
    # self.get_zip_code_location(response)
    # self.get_count_results(response)
    if self.is_captcha_required(response):
        # self.response_is_ban(request, response, is_ban=True)
        print("Captcha required for proxy: " + proxy)
        self.captcha_count = self.captcha_count + 1
        self.update_ban_count(proxy)
        headers = get_random_headers(self.marketplace)
        # send a new request with high priority
        request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0,
                                 dont_filter=True, errback=self.errback_httpbin,
                                 meta={"max_proxies_to_try": 30, "page": page})
        yield request
    else:
        if self.should_zip_code_be_changed(response):
            print("Proxy does not get all .com results: " + proxy)
            self.update_ban_count(proxy)
            headers = get_random_headers(self.marketplace)
            # send a new request with high priority
            request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0,
                                     dont_filter=True, errback=self.errback_httpbin,
                                     meta={"max_proxies_to_try": 30, "page": page})
            yield request
            # change zip code
            # meta_dict = {"max_proxies_to_try": 30, 'page': page, "url": url, "headers": response.meta["headers"]}
            # url_change_zip_code = "https://www.amazon.com/gp/delivery/ajax/address-change.html"
            # if self.is_perfect_privacy_proxy(response):
            #     proxy = "http://*****:*****@" + response.meta["download_slot"] + ":3128"
            #     meta_dict.update({"proxy": proxy, "_rotating_proxy": False})
            # yield scrapy.http.JsonRequest(url=url_change_zip_code, callback=self.change_zip_code, headers=response.meta["headers"], priority=0, data=self.change_zip_code_post_data,
            #                               errback=self.errback_httpbin, meta=meta_dict, dont_filter=True)
        else:
            self.ip_addresses.append(response.ip_address.compressed)
            shirts = response.css('div.sg-col-inner')
            shirt_number_page = 0
            for i, shirt in enumerate(shirts):
                if not self.is_shirt(shirt):
                    continue
                shirt_number_page = shirt_number_page + 1
                try:
                    price = self.get_price(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    title = self.get_title(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    brand = self.get_brand(shirt)
                except Exception as e:
                    print("Could not get brand of shirt: ", title)
                    brand = None
                    # Amazon sometimes omits the brand on the overview page, so raising is not necessary.
                    # self.save_content(response, url)
                    # send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    # raise e
                try:
                    url_product = self.get_url_product(shirt, url)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    url_image_lowq, url_image_q2, url_image_q3, url_image_q4, url_image_hq = self.get_img_urls(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    asin = self.get_asin(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                try:
                    uuid = self.get_uuid(shirt)
                except Exception as e:
                    self.save_content(response, url)
                    send_msg(self.target, str(e) + " | url: " + url, self.api_key)
                    raise e
                crawlingdate = datetime.datetime.now()
                # append to general crawler
                df_products = pd.DataFrame(data={"title": [title], "brand": [brand], "url_product": [url_product],
                                                 "url_image_lowq": [url_image_lowq], "url_image_hq": [url_image_hq],
                                                 "price": [price], "asin": [asin], "uuid": [uuid],
                                                 "timestamp": [crawlingdate]})
                df_mba_images = pd.DataFrame(data={"asin": [asin], "url_image_lowq": [url_image_lowq],
                                                   "url_image_q2": [url_image_q2], "url_image_q3": [url_image_q3],
                                                   "url_image_q4": [url_image_q4], "url_image_hq": [url_image_hq],
                                                   "timestamp": [crawlingdate]})
                shirt_number = int(shirt_number_page + ((int(page) - 1) * self.shirts_per_page))
                df_mba_relevance = pd.DataFrame(data={"asin": [asin], "sort": [self.sort],
                                                      "number": [shirt_number], "timestamp": [crawlingdate]})
                self.df_products = self.df_products.append(df_products)
                self.df_mba_images = self.df_mba_images.append(df_mba_images)
                self.df_mba_relevance = self.df_mba_relevance.append(df_mba_relevance)
                # crawl the image only if it has not been crawled already
                if asin not in self.products_images_already_downloaded:
                    image_urls.append(url_image_hq)
                    asins.append(asin)
                    url_mba_lowqs.append(url_image_lowq)
            # crawl images
            image_item = MbaCrawlerItem()
            image_item["image_urls"] = image_urls
            image_item["asins"] = asins
            image_item["url_mba_lowqs"] = url_mba_lowqs
            image_item["marketplace"] = self.marketplace
            if self.marketplace in ["com", "de"]:
                yield image_item
            self.page_count = self.page_count + 1
            self.status_update()
            # url_next = "/".join(url.split("/")[0:3]) + response.css("ul.a-pagination li.a-last a::attr(href)").get()
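# --- Hedged sketch (not part of the original spider) ---
# `MbaCrawlerItem` is assumed to be a regular Scrapy item whose field names match
# the keys set in parse() above; the class body itself is an assumption.
import scrapy

class MbaCrawlerItemSketch(scrapy.Item):
    image_urls = scrapy.Field()      # consumed by Scrapy's ImagesPipeline
    images = scrapy.Field()          # assumed: filled by the pipeline after download
    asins = scrapy.Field()
    url_mba_lowqs = scrapy.Field()
    marketplace = scrapy.Field()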
def parse(self, response):
    asin = response.meta["asin"]
    proxy = self.get_proxy(response)
    url = response.url
    # send_msg(self.target, "Response caught: {} with proxy {}".format(url, proxy), self.api_key)
    if self.is_captcha_required(response):
        # self.response_is_ban(request, response, is_ban=True)
        print("Captcha required for proxy: " + proxy)
        self.captcha_count = self.captcha_count + 1
        self.update_ban_count(proxy)
        # send_msg(self.target, "Captcha: " + url, self.api_key)
        headers = get_random_headers(self.marketplace)
        # send a new request with high priority
        request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0,
                                 dont_filter=True, errback=self.errback_httpbin, meta={"asin": asin})
        yield request
        '''
        raise Exception("Captcha required")
        send_msg(self.target, "Captcha required" + " | asin: " + asin, self.api_key)
        self.captcha_count = self.captcha_count + 1
        # add download delay if a captcha happens
        self.settings.attributes["DOWNLOAD_DELAY"].value = self.settings.attributes["DOWNLOAD_DELAY"].value + 3
        if self.captcha_count > self.settings.attributes["MAX_CAPTCHA_NUMBER"].value:
            raise CloseSpider(reason='Too many captchas received')
        raise Exception("Captcha required")
        '''
    # do not proceed if it is not an MBA shirt
    elif not self.is_mba_shirt(response):
        self.df_products_no_mba_shirt = self.df_products_no_mba_shirt.append(
            pd.DataFrame(data={"asin": [asin], "url": [url], "timestamp": [datetime.datetime.now()]}))
    else:
        self.ip_addresses.append(response.ip_address.compressed)
        try:
            price_str, price = self.get_price(response)
        except Exception as e:
            # self.save_content(response, asin)
            # send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            price_str, price = "", 0.0
        try:
            mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = self.get_bsr(response)
        except Exception as e:
            self.save_content(response, asin)
            # send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            if "no bsr" in str(e):
                self.df_products_no_bsr = self.df_products_no_bsr.append(
                    pd.DataFrame(data={"asin": [asin], "url": [url], "timestamp": [datetime.datetime.now()]}))
            if self.daily:
                raise e
            else:
                # Cases exist like https://www.amazon.com/dp/B0855BCBZ6 which should have a BSR
                # but do not contain it in the html.
                # Therefore, we want to crawl it just once (if not a daily crawl).
                mba_bsr_str, mba_bsr, array_mba_bsr, array_mba_bsr_categorie = "", 0, [], []
        try:
            customer_review_score_mean, customer_review_score, customer_review_count = self.get_customer_review(response)
        except Exception as e:
            self.save_content(response, asin)
            send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            raise e
        # if not the daily crawler, more data of the website needs to be crawled
        if not self.daily:
            try:
                title = self.get_title(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                brand, url_brand = self.get_brand_infos(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                fit_types = self.get_fit_types(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                array_color_names, color_count = self.get_color_infos(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                array_product_feature = self.get_product_features(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
            try:
                description = self.get_description(response)
            except Exception as e:
                # self.save_content(response, asin)
                # send_msg(self.target, str(e) + "| asin: " + asin, self.api_key)
                # raise e
                description = ""
            try:
                weight = self.get_weight(response)
            except Exception as e:
                weight = "not found"
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            try:
                upload_date_str, upload_date = self.get_upload_date(response)
            except Exception as e:
                self.save_content(response, asin)
                send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
                raise e
        crawlingdate = datetime.datetime.now()
        if not self.daily:
            # append to general crawler
            df = pd.DataFrame(data={
                "asin": [asin], "title": [title], "brand": [brand], "url_brand": [url_brand],
                "price": [price_str], "fit_types": [fit_types], "color_names": [array_color_names],
                "color_count": [color_count], "product_features": [array_product_feature],
                "description": [description], "weight": [weight], "upload_date_str": [upload_date_str],
                "upload_date": [upload_date], "customer_review_score": [customer_review_score],
                "customer_review_count": [customer_review_count], "mba_bsr_str": [mba_bsr_str],
                "mba_bsr": [array_mba_bsr], "mba_bsr_categorie": [array_mba_bsr_categorie],
                "timestamp": [crawlingdate]})
            self.df_products_details = self.df_products_details.append(df)
        # append to daily crawler
        df = pd.DataFrame(data={
            "asin": [asin], "price": [price], "price_str": [price_str], "bsr": [mba_bsr],
            "bsr_str": [mba_bsr_str], "array_bsr": [array_mba_bsr],
            "array_bsr_categorie": [array_mba_bsr_categorie],
            "customer_review_score_mean": [customer_review_score_mean],
            "customer_review_score": [customer_review_score],
            "customer_review_count": [customer_review_count], "timestamp": [crawlingdate]})
        self.df_products_details_daily = self.df_products_details_daily.append(df)
        self.status_update()
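# --- Hedged sketch (not part of the original spider) ---
# `get_price` on the product page returns (price_str, price). A standalone sketch
# of how such a helper could look; the CSS selector and the parsing rules are
# assumptions, not the project's actual implementation.
import re

def get_price_sketch(response):
    # assumed selector; Amazon has used several price containers over time
    price_str = response.css("#priceblock_ourprice::text").get()
    if not price_str:
        raise ValueError("Could not find price on product page")
    price_str = price_str.strip()
    # accept "," as decimal separator, e.g. "19,99 €"
    match = re.search(r"\d+[.,]\d+", price_str)
    price = float(match.group(0).replace(",", ".")) if match else 0.0
    return price_str, price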
def parse(self, response):
    asin = response.meta["asin"]
    proxy = self.get_proxy(response)
    url = response.url
    # send_msg(self.target, "Response caught: {} with proxy {}".format(url, proxy), self.api_key)
    if self.is_captcha_required(response):
        # self.response_is_ban(request, response, is_ban=True)
        print("Captcha required for proxy: " + proxy)
        self.captcha_count = self.captcha_count + 1
        self.update_ban_count(proxy)
        # send_msg(self.target, "Captcha: " + url, self.api_key)
        headers = get_random_headers(self.marketplace)
        # send a new request with high priority
        request = scrapy.Request(url=url, callback=self.parse, headers=headers, priority=0,
                                 dont_filter=True, errback=self.errback_httpbin, meta={"asin": asin})
        yield request
        '''
        raise Exception("Captcha required")
        send_msg(self.target, "Captcha required" + " | asin: " + asin, self.api_key)
        self.captcha_count = self.captcha_count + 1
        # add download delay if a captcha happens
        self.settings.attributes["DOWNLOAD_DELAY"].value = self.settings.attributes["DOWNLOAD_DELAY"].value + 3
        if self.captcha_count > self.settings.attributes["MAX_CAPTCHA_NUMBER"].value:
            raise CloseSpider(reason='Too many captchas received')
        raise Exception("Captcha required")
        '''
    else:
        self.ip_addresses.append(response.ip_address.compressed)
        try:
            # TODO: This script was not developed further, since image references are not
            # the same on the product page and the overview page.
            import ast
            image_urls_string = response.css('div#imgTagWrapperId img::attr(data-a-dynamic-image)').get()
            image_urls_dict = ast.literal_eval(image_urls_string)
            # Note: the original code calls self.get_price here, which does not return image URLs;
            # this is part of the unfinished state described in the TODO above.
            url_image_lowq, url_image_q2, url_image_q3, url_image_q4, url_image_hq = self.get_price(response)
        except Exception as e:
            # self.save_content(response, asin)
            # send_msg(self.target, str(e) + " | asin: " + asin, self.api_key)
            raise e
        crawlingdate = datetime.datetime.now()
        df_mba_images = pd.DataFrame(data={
            "asin": [asin], "url_image_lowq": [url_image_lowq], "url_image_q2": [url_image_q2],
            "url_image_q3": [url_image_q3], "url_image_q4": [url_image_q4],
            "url_image_hq": [url_image_hq], "timestamp": [crawlingdate]})
        self.df_mba_images = self.df_mba_images.append(df_mba_images)
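# --- Hedged sketch (not part of the original spider) ---
# The TODO above leaves the parsed data-a-dynamic-image attribute unused. The
# attribute holds a JSON object mapping image URLs to [width, height]; below is a
# sketch of deriving low/high quality URLs from it (the bucketing is an assumption).
import json

def extract_image_urls_by_size_sketch(image_urls_string: str):
    image_urls_dict = json.loads(image_urls_string)
    # sort URLs by pixel area, smallest first
    sorted_urls = sorted(image_urls_dict, key=lambda u: image_urls_dict[u][0] * image_urls_dict[u][1])
    return sorted_urls[0], sorted_urls[-1]  # (url_image_lowq, url_image_hq)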