def start_requests(self):
    for st in self.searchterms:
        yield Request(
            self.url_formatter.format(
                self.SEARCH_URL,
                search_term=urllib.quote_plus(st.encode('utf-8')),
                page_num=1,
            ),
            meta={'search_term': st, 'remaining': self.quantity},
        )

    if self.product_url:
        prod = SiteProductItem()
        prod['is_single_result'] = True
        prod['url'] = self.product_url
        prod['search_term'] = ''
        yield Request(self.product_url,
                      self._parse_single_product,
                      meta={'product': prod})

    if self.products_url:
        urls = self.products_url.split('||||')
        for url in urls:
            prod = SiteProductItem()
            prod['url'] = url
            prod['search_term'] = ''
            # Request each URL from the list rather than self.product_url.
            yield Request(url,
                          self._parse_single_product,
                          meta={'product': prod})
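# products_url is expected to carry one or more detail-page URLs joined by
# '||||'. With Scrapy's -a spider-argument syntax that would look roughly like
# the line below (spider name and URLs are placeholders, not from the source):
#
#   scrapy crawl some_spider -a products_url="https://example.com/car/1||||https://example.com/car/2"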
def _scrape_product_links(self, response):
    product_links = []
    search_term = response.meta['search_term']
    link_domain = "https://www.carmax.com/car/"
    try:
        product_data = json.loads(response.body).get('Results', {})
        for data in product_data:
            param = str(data.get('StockNumber'))
            product_links.append(link_domain + param)
    except Exception as e:
        self.log("Error while parsing the product links {}".format(e))

    if product_links:
        for link in product_links:
            prod_item = SiteProductItem()
            req = Request(
                url=link,
                callback=self.parse_product,
                meta={
                    'product': prod_item,
                    'search_term': search_term,
                    'remaining': sys.maxint,
                },
                dont_filter=True,
                headers={"User-Agent": self.agent})
            yield req, prod_item
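# The loop above builds detail-page URLs by appending each listing's stock
# number to https://www.carmax.com/car/, so the search endpoint is assumed to
# return JSON shaped roughly like the structure below (the stock numbers are
# illustrative, not real listings):
EXAMPLE_SEARCH_RESPONSE = {
    'Results': [
        {'StockNumber': 12345678},
        {'StockNumber': 23456789},
    ],
}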
def _start_requests(self, response):
    apiKey = re.search('"key":"(.*?)"}', response.body)
    if not self.product_url and apiKey:
        apiKey = apiKey.group(1)
        for st in self.searchterms:
            yield Request(
                url=self.API_SEARCH_URL.format(
                    search_term=st,
                    start_index=(self.current_page - 1) * 20,
                    page_num=self.current_page,
                    apiKey=apiKey),
                meta={
                    'search_term': st,
                    'apiKey': apiKey,
                    'remaining': sys.maxint
                },
                dont_filter=True,
                headers=self.HEADERS,
            )
    elif self.product_url:
        prod = SiteProductItem()
        prod['url'] = self.product_url
        prod['search_term'] = ''
        yield Request(
            self.product_url,
            meta={'product': prod},
            callback=self._parse_single_product,
            headers={"User-Agent": self.agent},
        )
def _scrape_product_links(self, response):
    products_container = response.xpath(
        '//div[@id="threadslist"]//div[@class="trow text-center"]'
        '//div[@class="tcell alt1 text-left"]')
    for product in products_container:
        dealer_ship = product.xpath(
            './/span[@style="color:blue"]/b').extract()
        link = product.xpath(
            './/a[contains(@id, "title")]/@href').extract()
        sold_status = product.xpath(
            './/span[@class="highlight alert"]/strong').extract()

        product_item = SiteProductItem()
        cond_set_value(product_item, 'url',
                       'https://rennlist.com/forums/' + link[0])
        # Dealer name and sold banner are optional on a listing row.
        cond_set_value(product_item, 'dealer_ship',
                       dealer_ship[0] if dealer_ship else '')
        cond_set_value(product_item, 'sold_status',
                       sold_status[0] if sold_status else '')
        yield 'https://rennlist.com/forums/' + link[0], product_item
def _start_requests(self, response):
    # cookies = []
    # cookies = response.headers['Set-Cookie']
    # c = Cookie.SimpleCookie(cookies)
    # self.cookie = c['incap_ses_534_984766'].value
    if not self.product_url:
        for st in self.searchterms:
            yield Request(
                url=self.SEARCH_URL.format(
                    search_term=st,
                    start_index=(self.current_page - 1) * 20,
                    page_num=self.current_page),
                meta={
                    'search_term': st,
                    'remaining': sys.maxint
                },
                cookies={
                    # "visid_incap_984766": "L4pKYVELTiGpHf1vVyxGjeO22VoAAAAAQkIPAAAAAACArqGDAdWIXo83eSJ2rHCkeVxVa362DIvv",
                    # "__utmz": "114059806.1524215273.1.1.",
                    # "utmcsr": "(direct)|utmccn=(direct)|utmcmd=(none)",
                    # "__gads": "=ID=d3dc5ca0d446d107:T=1524217574:S=ALNI_MbpZii_me-AyJXoThp97Vg5eJpHIw; AMCV_653F60B351E568560A490D4D%40AdobeOrg=1766948455%7CMCMID%7C64157379492117918583861199695401241584%7CMCAID%7CNONE%7CMCAAMLH-1524621134%7C6%7CMCAAMB-1524621134%7Cj8Odv6LonN4r3an7LhD3WZrU1bUpAkFkkiY1ncBR96t2PTI",
                    # "fitracking_12": "no",
                    # "__qca": "P0-88327587-1524016340616; s_fid=78FC1AD688A7D176-2A5AF404FA2C8C06; c_code_2016=DE; U=35824841115ad6a4d867f7f7a4cee3; nlbi_984766=TfMKZQ3te3He+uwEqlAkVQAAAADFe45X8+obTbeWAkWEkh/h; s_cc=true",
                    # "__utmc": "114059806",
                    "incap_ses_534_984766": self.cookies_dict.get('incap_ses_534_984766'),
                    # "__utma": "114059806.204852835.1524215273.1524215273.1524215273.1",
                    # "__utmt": "1",
                    # "fiutm": "direct|direct||||",
                    # "__utmb": "114059806.1.10.1524215273",
                },
                dont_filter=True,
                headers=self.HEADERS,
            )
    elif self.product_url:
        prod = SiteProductItem()
        prod['url'] = self.product_url
        prod['search_term'] = ''
        yield Request(
            self.product_url,
            meta={'product': prod},
            callback=self._parse_single_product,
            headers={"User-Agent": self.agent},
        )
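# How self.cookies_dict gets filled is not part of this snippet; the
# commented-out lines at the top of _start_requests suggest the Incapsula
# session cookie is read from the landing response's Set-Cookie headers with
# Cookie.SimpleCookie. A minimal sketch of that idea, assuming a dedicated
# callback on the landing request (the _parse_landing_cookies name is
# hypothetical, not part of the original spider):
import Cookie  # Python 2 stdlib module used by the commented-out code above

def _parse_landing_cookies(self, response):
    # Collect every cookie set on the landing page into a name -> value map
    # so _start_requests can look up 'incap_ses_534_984766' later.
    self.cookies_dict = {}
    for header in response.headers.getlist('Set-Cookie'):
        jar = Cookie.SimpleCookie()
        jar.load(header)
        for name, morsel in jar.items():
            self.cookies_dict[name] = morsel.value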
def _scrape_product_links(self, response):
    products = response.xpath(
        '//div[@id="inventory-list-container"]'
        '//div[@class="vehicle-listing-ymm"]/a/@href')
    st = response.meta['search_term']
    for product in products:
        link = product.extract()
        prod_item = SiteProductItem()
        req = Request(
            url=self.HOME_URL.format(search_term=st) + link,
            callback=self.parse_product,
            meta={
                'product': prod_item,
                'search_term': st,
                'remaining': sys.maxint,
            },
            dont_filter=True,
            headers={"User-Agent": self.agent})
        yield req, prod_item
def _scrape_product_links(self, response):
    data = json.loads(response.body)
    products = data['listings']
    st = response.meta['search_term']
    for product in products:
        title = product.get('title')
        vin = product.get('vin')
        try:
            price = product.get('derivedPrice').replace("$", "").replace(",", "")
        except Exception:
            price = 0
        cond = product.get('listingType')
        # Skip new-vehicle listings; only used inventory is collected here.
        if cond.lower().find('new') > -1:
            continue
        msrp = product.get('msrp')
        try:
            trim = product['trim']
        except Exception:
            trim = ''
        modelCode = product.get('modelCode')
        description = product.get('description')
        exterior_color = product.get('colorExteriorSimple')
        try:
            model_detail = (modelCode + ' ' + trim).strip()
        except Exception:
            model_detail = ''
        link_zip = product.get('vdpSeoUrl')
        try:
            # Drop the trailing "&zip=..." query fragment from the listing URL.
            link = re.search('(.*)&zip', link_zip).group(1)
        except Exception:
            link = link_zip

        prod_item = SiteProductItem()
        # prod_item['listing_title'] = title
        prod_item['vin'] = vin
        prod_item['price'] = int(price)
        prod_item['condition'] = cond
        prod_item['listing_model'] = modelCode
        prod_item['listing_description'] = description
        prod_item['listing_color'] = exterior_color
        prod_item['listing_model_detail'] = model_detail
        prod_item['url'] = self.HOME_URL + link
        req = Request(
            url=self.HOME_URL + link,
            callback=self.parse_product,
            meta={
                'product': prod_item,
                'search_term': st,
                'remaining': sys.maxint,
            },
            dont_filter=True,
            headers={"User-Agent": self.agent})
        yield req, prod_item
def _scrape_product_links(self, response):
    products = response.xpath('//div[@id="sortable-results"]//ul/li')
    search_term = response.meta['search_term']
    product_shortcode = re.search('https://(.*)\.craigslist',
                                  response.url).group(1)

    # Flatten the city / subregion / neighborhood hierarchy into a single
    # name -> id/shortcode lookup for the state this results page belongs to.
    neighborhoods = {}
    for item in self.city_obj:
        if item['state'] == self.state_shortcodes[product_shortcode]:
            if len(item['subregions']) > 0:
                for subregion in item['subregions']:
                    if len(subregion['neighborhoods']) > 0:
                        for neighborhood in subregion['neighborhoods']:
                            neighborhoods[neighborhood['name'].lower()] = \
                                neighborhood['id']
                    else:
                        neighborhoods[subregion['name'].lower()] = \
                            subregion['shortcode']
            else:
                neighborhoods[item['name'].lower()] = item['shortcode']

    for product in products:
        link = product.xpath('a/@href')[0].extract()
        listing_date = ''
        try:
            listing_date = product.xpath(
                'p[@class="result-info"]/time/@datetime')[0].extract()
            price = product.xpath(
                'p[@class="result-info"]/span/span[@class="result-price"]/text()'
            )[0].extract()
            price = re.search('\$(.*)', price).group(1)
            price = int(price)
        except Exception as err:
            print(err)
            price = 0
        try:
            city = product.xpath(
                'p[@class="result-info"]/span/span[@class="result-hood"]/text()'
            )[0].extract()
            city = re.search('\((.*?)\)', city).group(1)
            # Only keep hoods that map to a known neighborhood for this state.
            if neighborhoods.get(city.lower()) is None:
                city = ''
        except Exception as err:
            print(err)
            city = ''

        prod_item = SiteProductItem()
        if listing_date not in (None, ''):
            dt = datetime.datetime.strptime(listing_date, '%Y-%m-%d %H:%M')
            # listing_date = dt.strftime('%m-%d-%Y')
            prod_item['listing_date'] = dt
        prod_item['price'] = price
        prod_item['city'] = city
        prod_item['state'] = self.state
        req = Request(
            url=link,
            callback=self.parse_product,
            meta={
                'product': prod_item,
                'search_term': search_term,
                'remaining': sys.maxint,
            },
            dont_filter=True,
            headers={"User-Agent": self.agent})
        yield req, prod_item
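# The neighborhood flattening above assumes self.state_shortcodes and
# self.city_obj look roughly like the structures below. The key names are the
# ones the code indexes ('state', 'name', 'shortcode', 'subregions',
# 'neighborhoods', 'id'); every concrete value here is illustrative only:
EXAMPLE_STATE_SHORTCODES = {'sfbay': 'CA'}
EXAMPLE_CITY_OBJ = [
    {
        'state': 'CA',
        'name': 'san francisco bay area',
        'shortcode': 'sfbay',
        'subregions': [
            {
                'name': 'city of san francisco',
                'shortcode': 'sfc',
                'neighborhoods': [
                    {'name': 'mission district', 'id': 18},
                ],
            },
        ],
    },
]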
def _scrape_product_links(self, response):
    products = response.xpath(
        '//div[@id="resultdata"]//div[@class="rs-inner col-md-12 web-result"]')
    search_term = response.meta['search_term']
    for product in products:
        link = product.xpath('a/@href')[0].extract()
        # Defaults in case any of the optional fields is missing on a result.
        title = city = state = price_str = ''
        try:
            city_state = product.xpath(
                'div[@class="col-xs-7 col-md-9 result-text"]/h4/text()'
            )[0].extract()
            city_state_match = re.match('(.*),\s(.*)', city_state)
            city, state = city_state_match.groups()
            title = product.xpath(
                'a/h3[@class="rs-headline"]/text()')[0].extract()
            price_str = product.xpath(
                'div/h3[@class="rs-headline class_price"]/text()')[0].extract()
            price = re.sub(r'[^\d.]', '', price_str)
            if price == '':
                price = 0
            else:
                price = int(price)
        except Exception as err:
            print(err)
            price = 0

        seller_type_str = self._clean_text(''.join(product.xpath(
            'div[@class="col-xs-7 col-md-9 result-text"]'
            '//div[@style="padding-top:5px;"]/text()').extract()))
        if seller_type_str == '':
            # No seller text: an "Auction" price label marks an auction,
            # everything else is treated as a dealership.
            if price_str == "Auction":
                seller_type = 'Auction'
            else:
                seller_type = 'Dealership'
        else:
            if seller_type_str == 'Private Seller':
                seller_type = 'Private Party'
            else:
                seller_type = 'Dealership'

        prod_item = SiteProductItem()
        prod_item['listing_title'] = title
        prod_item['price'] = price
        prod_item['city'] = city
        prod_item['state'] = state
        prod_item['seller_type'] = seller_type
        prod_item['url'] = self.HOME_URL + link
        req = Request(
            url=self.HOME_URL + link,
            callback=self.parse_product,
            meta={
                'product': prod_item,
                'search_term': search_term,
                'remaining': sys.maxint,
            },
            dont_filter=True,
            headers={"User-Agent": self.agent})
        yield req, prod_item