Example #1
    def start_requests(self):

        for st in self.searchterms:
            yield Request(
                self.url_formatter.format(
                    self.SEARCH_URL,
                    search_term=urllib.quote_plus(st.encode('utf-8')),
                    page_num=1,
                ),
                meta={
                    'search_term': st,
                    'remaining': self.quantity
                },
            )

        if self.product_url:
            prod = SiteProductItem()
            prod['is_single_result'] = True
            prod['url'] = self.product_url
            prod['search_term'] = ''
            yield Request(self.product_url,
                          self._parse_single_product,
                          meta={'product': prod})

        if self.products_url:
            # Multiple product URLs arrive as one '||||'-delimited string.
            urls = self.products_url.split('||||')
            for url in urls:
                prod = SiteProductItem()
                prod['url'] = url
                prod['search_term'] = ''
                yield Request(url,
                              self._parse_single_product,
                              meta={'product': prod})
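
All of these snippets are Python 2 (note `urllib.quote_plus` and `sys.maxint`). A minimal sketch of the Python 3 equivalents, assuming the surrounding spider is ported as well:

import sys
from urllib.parse import quote_plus  # Python 3 home of quote_plus

search_term = u'porsche 911'
encoded = quote_plus(search_term.encode('utf-8'))  # 'porsche+911'

# sys.maxint no longer exists in Python 3; sys.maxsize is the usual stand-in.
remaining = sys.maxsize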
Example #2
    def _scrape_product_links(self, response):
        product_links = []
        search_term = response.meta['search_term']
        link_domain = "https://www.carmax.com/car/"
        try:
            # 'Results' is expected to be a list of vehicle dicts, so an
            # empty list is the type-correct fallback for the loop below.
            product_data = json.loads(response.body).get('Results', [])
            for data in product_data:
                param = str(data.get('StockNumber'))
                product_links.append(link_domain + param)

        except Exception as e:
            self.log("Error while parsing the product links {}".format(e))
        if product_links:
            for link in product_links:
                prod_item = SiteProductItem()
                req = Request(url=link,
                              callback=self.parse_product,
                              meta={
                                  'product': prod_item,
                                  'search_term': search_term,
                                  'remaining': sys.maxint,
                              },
                              dont_filter=True,
                              headers={"User-Agent": self.agent})
                yield req, prod_item
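
The `yield req, prod_item` pairs suggest a base spider that consumes (Request, Item) tuples; that consumer isn't shown here. What can be demonstrated standalone is the JSON handling above, using an illustrative response body:

import json

body = '{"Results": [{"StockNumber": "12345678"}]}'  # illustrative sample
results = json.loads(body).get('Results', [])
links = ['https://www.carmax.com/car/' + str(r.get('StockNumber'))
         for r in results]
print(links)  # ['https://www.carmax.com/car/12345678']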
Example #3
    def _start_requests(self, response):
        # The API key is embedded in the landing page's inline JavaScript.
        apiKey = re.search('"key":"(.*?)"}', response.body)

        if not self.product_url and apiKey:
            apiKey = apiKey.group(1)
            for st in self.searchterms:
                yield Request(
                    url=self.API_SEARCH_URL.format(
                        search_term=st,
                        start_index=(self.current_page - 1) * 20,
                        page_num=self.current_page,
                        apiKey=apiKey),
                    meta={
                        'search_term': st,
                        'apiKey': apiKey,
                        'remaining': sys.maxint
                    },
                    dont_filter=True,
                    headers=self.HEADERS,
                )
        elif self.product_url:
            prod = SiteProductItem()
            prod['url'] = self.product_url
            prod['search_term'] = ''

            yield Request(
                self.product_url,
                meta={
                    'product': prod,
                },
                callback=self._parse_single_product,
                headers={"User-Agent": self.agent},
            )
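
The same key-scraping regex can be exercised in isolation; the page fragment below is a hypothetical stand-in for the real response body:

import re

body = 'window.config = {"api":{"key":"abc123XYZ"}}'  # hypothetical fragment
match = re.search('"key":"(.*?)"}', body)
if match:
    print(match.group(1))  # abc123XYZ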
Example #4
    def _scrape_product_links(self, response):
        products_container = response.xpath(
            '//div[@id="threadslist"]//div[@class="trow text-center"]//div[@class="tcell alt1 text-left"]'
        )
        for product in products_container:
            dealer_ship = product.xpath(
                './/span[@style="color:blue"]/b').extract()
            link = product.xpath(
                './/a[contains(@id, "title")]/@href').extract()
            sold_status = product.xpath(
                './/span[@class="highlight alert"]/strong').extract()
            product_item = SiteProductItem()

            cond_set_value(product_item, 'url',
                           'https://rennlist.com/forums/' + link[0])

            try:
                cond_set_value(product_item, 'dealer_ship', dealer_ship[0])
            except IndexError:
                cond_set_value(product_item, 'dealer_ship', '')

            try:
                cond_set_value(product_item, 'sold_status', sold_status[0])
            except IndexError:
                cond_set_value(product_item, 'sold_status', '')
            yield 'https://rennlist.com/forums/' + link[0], product_item
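
`cond_set_value` is imported from elsewhere in the project and isn't shown. Judging from how it is called above, it presumably assigns a field only when a value was extracted and nothing is stored yet; a rough reconstruction under that assumption:

def cond_set_value(item, key, value, conv=None):
    # Hypothetical reconstruction: only set the field if a value was
    # extracted and nothing has been stored under the key yet.
    if value is not None and item.get(key) is None:
        item[key] = conv(value) if conv else value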
Example #5
    def _start_requests(self, response):
        # The Incapsula session cookie was previously parsed here out of
        # response.headers['Set-Cookie'] via Cookie.SimpleCookie; it is now
        # taken from self.cookies_dict below.
        if not self.product_url:
            for st in self.searchterms:
                yield Request(
                    url=self.SEARCH_URL.format(
                        search_term=st,
                        start_index=(self.current_page - 1) * 20,
                        page_num=self.current_page),
                    meta={
                        'search_term': st,
                        'remaining': sys.maxint
                    },
                    cookies={
                        # Only the Incapsula session cookie is needed; the
                        # tracking cookies once listed here (__utm*, __gads,
                        # AMCV_*, visid_incap_*, etc.) have been dropped.
                        "incap_ses_534_984766":
                        self.cookies_dict.get('incap_ses_534_984766'),
                    },
                    dont_filter=True,
                    headers=self.HEADERS,
                )
        elif self.product_url:
            prod = SiteProductItem()
            prod['url'] = self.product_url
            prod['search_term'] = ''

            yield Request(
                self.product_url,
                meta={
                    'product': prod,
                },
                callback=self._parse_single_product,
                headers={"User-Agent": self.agent},
            )
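
The commented-out lines at the top of Example #5 hint at how `self.cookies_dict` was originally populated: by parsing the `Set-Cookie` header with the Python 2 `Cookie` module. A sketch of that approach (the header value is illustrative):

import Cookie  # http.cookies in Python 3

def extract_incap_cookie(set_cookie_header):
    # Pull only the Incapsula session cookie out of a Set-Cookie header.
    jar = Cookie.SimpleCookie()
    jar.load(set_cookie_header)
    morsel = jar.get('incap_ses_534_984766')
    return {'incap_ses_534_984766': morsel.value} if morsel else {}

print(extract_incap_cookie('incap_ses_534_984766=abc123; Path=/'))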
Example #6
    def _scrape_product_links(self, response):
        products = response.xpath(
            '//div[@id="inventory-list-container"]//div[@class="vehicle-listing-ymm"]/a/@href'
        )

        for product in products:
            st = response.meta['search_term']
            link = product.extract()
            prod_item = SiteProductItem()

            req = Request(url=self.HOME_URL.format(search_term=st) + link,
                          callback=self.parse_product,
                          meta={
                              'product': prod_item,
                              'search_term': st,
                              'remaining': sys.maxint,
                          },
                          dont_filter=True,
                          headers={"User-Agent": self.agent})
            yield req, prod_item
Example #7
    def _scrape_product_links(self, response):
        data = json.loads(response.body)
        products = data['listings']

        for product in products:
            title = product.get('title')
            vin = product.get('vin')
            try:
                price = product.get('derivedPrice').replace("$", "").replace(",", "")
            except Exception as e:
                price = 0

            cond = product.get('listingType')

            # Skip new-vehicle listings; guard against a missing listingType.
            if cond and 'new' in cond.lower():
                continue

            msrp = product.get('msrp')
            try:
                trim = product['trim']
            except Exception as e:
                trim = ''

            modelCode = product.get('modelCode')
            description = product.get('description')
            exterior_color = product.get('colorExteriorSimple')
            try:
                model_detail = (modelCode + ' ' + trim).strip()
            except Exception as e:
                model_detail = ''
            st = response.meta['search_term']
            link_zip = product.get('vdpSeoUrl')

            try:
                link = re.search('(.*)&zip', link_zip).group(1)
            except Exception as e:
                link = link_zip

            prod_item = SiteProductItem()
            prod_item['listing_title'] = title
            prod_item['vin'] = vin
            prod_item['price'] = int(price)
            prod_item['condition'] = cond
            prod_item['listing_model'] = modelCode
            prod_item['listing_description'] = description
            prod_item['listing_color'] = exterior_color
            prod_item['listing_model_detail'] = model_detail
            prod_item['url'] = self.HOME_URL + link

            req = Request(
                url=self.HOME_URL + link,
                callback=self.parse_product,
                meta={
                    'product': prod_item,
                    'search_term': st,
                    'remaining': sys.maxint,
                },
                dont_filter=True,
                headers={"User-Agent": self.agent}
            )
            yield req, prod_item
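
The `(.*)&zip` regex trims a trailing zip parameter off the listing URL; a standalone run with an illustrative URL:

import re

link_zip = '/cars-for-sale/vehicledetail.xhtml?listingId=1&zip=90210'  # sample
match = re.search('(.*)&zip', link_zip)
link = match.group(1) if match else link_zip
print(link)  # /cars-for-sale/vehicledetail.xhtml?listingId=1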
Example #8
    def _scrape_product_links(self, response):
        products = response.xpath('//div[@id="sortable-results"]//ul/li')
        search_term = response.meta['search_term']
        product_shortcode = re.search(r'https://(.*)\.craigslist',
                                      response.url).group(1)
        neighborhoods = {}

        # Build a lookup of neighborhood/subregion names for this state.
        for item in self.city_obj:
            if item['state'] == self.state_shortcodes[product_shortcode]:
                if item['subregions']:
                    for subregion in item['subregions']:
                        if subregion['neighborhoods']:
                            for neighborhood in subregion['neighborhoods']:
                                name = neighborhood['name'].lower()
                                neighborhoods[name] = neighborhood['id']
                        else:
                            name = subregion['name'].lower()
                            neighborhoods[name] = subregion['shortcode']
                else:
                    neighborhoods[item['name'].lower()] = item['shortcode']

        for product in products:
            link = product.xpath('a/@href')[0].extract()

            # Default so the item assignment below cannot hit an unbound
            # name when extraction fails.
            listing_date = None
            try:
                listing_date = product.xpath(
                    'p[@class="result-info"]/time/@datetime')[0].extract()
                price = product.xpath(
                    'p[@class="result-info"]/span/span[@class="result-price"]/text()'
                )[0].extract()
                price = int(re.search(r'\$(.*)', price).group(1))
            except Exception as err:
                print(err)
                price = 0

            try:
                city = product.xpath(
                    'p[@class="result-info"]/span/span[@class="result-hood"]/text()'
                )[0].extract()
                city = re.search(r'\((.*?)\)', city).group(1)
                if neighborhoods.get(city.lower()) is None:
                    city = ''
            except Exception as err:
                print(err)
                city = ''

            prod_item = SiteProductItem()
            dt = None
            if listing_date not in (None, ''):
                dt = datetime.datetime.strptime(listing_date, '%Y-%m-%d %H:%M')

            prod_item['listing_date'] = dt
            prod_item['price'] = price
            prod_item['city'] = city
            prod_item['state'] = self.state

            req = Request(url=link,
                          callback=self.parse_product,
                          meta={
                              'product': prod_item,
                              'search_term': search_term,
                              'remaining': sys.maxint,
                          },
                          dont_filter=True,
                          headers={"User-Agent": self.agent})
            yield req, prod_item
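
Craigslist `@datetime` attributes look like `2018-04-20 14:30`, which is exactly what the `%Y-%m-%d %H:%M` format string above parses:

import datetime

dt = datetime.datetime.strptime('2018-04-20 14:30', '%Y-%m-%d %H:%M')
print(dt)                       # 2018-04-20 14:30:00
print(dt.strftime('%m-%d-%Y'))  # 04-20-2018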
Example #9
    def _scrape_product_links(self, response):
        products = response.xpath(
            '//div[@id="resultdata"]//div[@class="rs-inner col-md-12 web-result"]'
        )

        search_term = response.meta['search_term']

        for product in products:
            link = product.xpath('a/@href')[0].extract()

            # Defaults so the item assignments below cannot hit unbound
            # names when any extraction fails.
            city = state = title = price_str = ''
            price = 0

            try:
                city_state = product.xpath(
                    'div[@class="col-xs-7 col-md-9 result-text"]/h4/text()'
                )[0].extract()
                city, state = re.match(r'(.*),\s(.*)', city_state).groups()

                title = product.xpath(
                    'a/h3[@class="rs-headline"]/text()')[0].extract()
                price_str = product.xpath(
                    'div/h3[@class="rs-headline class_price"]/text()'
                )[0].extract()
                price = re.sub(r'[^\d.]', '', price_str)
                price = int(float(price)) if price else 0
            except Exception as err:
                print(err)
                price = 0

            seller_type_str = self._clean_text(''.join(
                product.xpath(
                    'div[@class="col-xs-7 col-md-9 result-text"]//div[@style="padding-top:5px;"]/text()'
                ).extract()))

            if seller_type_str == '':
                if price_str == "Auction":
                    seller_type = 'Auction'
                else:
                    seller_type = 'Dealership'
            else:
                if seller_type_str == 'Private Seller':
                    seller_type = 'Private Party'
                else:
                    seller_type = 'Dealership'
            prod_item = SiteProductItem()

            prod_item['listing_title'] = title
            prod_item['price'] = price
            prod_item['city'] = city
            prod_item['state'] = state
            prod_item['seller_type'] = seller_type
            prod_item['url'] = self.HOME_URL + link
            req = Request(url=self.HOME_URL + link,
                          callback=self.parse_product,
                          meta={
                              'product': prod_item,
                              'search_term': search_term,
                              'remaining': sys.maxint,
                          },
                          dont_filter=True,
                          headers={"User-Agent": self.agent})
            yield req, prod_item
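
The price cleanup in Example #9 keeps digits and the decimal point, so `int()` alone would fail on a value like `$20,998.00`; routing through `float()` first (as done in the loop above) handles both cases:

import re

price_str = '$20,998.00'  # illustrative listing price
digits = re.sub(r'[^\d.]', '', price_str)  # '20998.00'
price = int(float(digits)) if digits else 0
print(price)  # 20998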