def __init__(self, *args, **kwargs):
    """Set up the name matcher and load the MTS stock-code rows."""
    super(KwikFitSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    # Every row of the stock-code CSV becomes one tyre-size search entry.
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
def __init__(self, *args, **kwargs):
    """Set up the matcher, load stock codes and reset crawl bookkeeping."""
    super(MyTyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    # Every row of the stock-code CSV becomes one tyre-size search entry.
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    self.errors = []
    self.search_history = set()
def __init__(self, *args, **kwargs):
    """Load the stock-code rows and the manufacturer-mark lookup table."""
    super(TyreDriveSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    # Map each manufacturer code to its canonical mark.
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks.update(
            (row['code'], row['manufacturer_mark'])
            for row in csv.DictReader(marks_file))
    self.errors = []
def __init__(self, *args, **kwargs):
    """Load stock codes, manufacturer marks and custom fitment marks.

    The custom marks map fitment suffixes seen in product names to the
    canonical manufacturer mark (e.g. 'J' for Jaguar, 'AO' for Audi).
    """
    super(EtyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.tyre_sizes.append(row)
    with open(os.path.join(HERE, 'manmarks.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.all_man_marks[row['code']] = row['manufacturer_mark']
    # One dict literal instead of sixteen separate assignments.
    # NOTE: leading spaces/dashes in the keys are significant -- they
    # match the raw suffix text as it appears in product names.
    self.custom_man_marks.update({
        ' JAGUAR FITMENT': 'J',
        ' RAV4 FITMENT': '',
        ' NISSAN JUKE FITMENT': '',
        ' (PORSCHE FITMENT)': 'N0',
        ' LEXUS FITMENT': '',
        ' PRIUS FITMENT': '',
        ' TOYOTA AURIS FITMENT': '',
        ' - TOYOTA RAV4 FITMENT': '',
        ' BMW MINI FITMENT': '*',
        ' AUDI FITMENT': 'AO',
        ' JAG FITMENT': 'J',
        ' FERRARI MASERATI FITMENT': '',
        ' MASERATI FITMENT': '',
        ' - BMW FITMENT': '*',
        ' ASTON MARTIN FITMENT': '',
        ' MERCEDES & RENAULT FITMENT': 'MO',
    })
def __init__(self, *args, **kwargs):
    """Load stock codes, manufacturer marks and brand-spelling fixes."""
    super(BestBuyTyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.tyre_sizes.append(row)
    with open(os.path.join(HERE, 'manmarks.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.all_man_marks[row['code']] = row['manufacturer_mark']
    # Misspellings/abbreviations of brand names, one update call instead
    # of one assignment per brand.
    self.brand_fixes.update({
        'Bridgestone': ["b'stone", 'b/stone', 'bridestone', 'bridgestohne',
                        'brridgestone'],
        'Continental': ['conti', 'contiental', 'continenal', 'continntal',
                        'contintenal'],
        'Dunlop': ['dlp'],
        'Goodyear': ['g’year', 'g’yr', 'g/year', 'goodyea', 'gy', 'gyr'],
        'Michelin': ['mich'],
        'Pirelli': ['pir', 'pire', 'pireelli'],
        # 'Uniroyal': ['uni'],
    })
    self.custom_man_marks = {
        '(LEXUS FITMENT)': '',
        '()': '',
        '(BMW FITMENT)': '*',
        '(RAV 4)': '',
        '(BMW)': '*'
    }
    self.errors = []
def __init__(self, *args, **kwargs):
    """Load search data: tyre sizes, manufacturer marks and brand list."""
    super(PointSSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks.update(
            (entry['code'], entry['manufacturer_mark'])
            for entry in csv.DictReader(marks_file))
    # One brand entry per stock-code row, in file order.
    self.brands = [entry['Brand'] for entry in self.tyre_sizes]
    self.processed_rows = {}
def __init__(self, *args, **kwargs):
    """Load the lego catalogue and back up the previous crawl output."""
    super(EcraterSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    # Map CSV column 2 to the UTF-8 decoded column 3 for every record.
    with open(os.path.join(HERE, 'lego.csv')) as lego_file:
        self.products = {
            record[2]: record[3].decode('utf8')
            for record in csv.reader(lego_file)
        }
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    # Keep a .bak copy of the last run's output, if one exists.
    previous_output = os.path.join(HERE, 'ecrater_products.csv')
    if os.path.exists(previous_output):
        shutil.copy(previous_output, previous_output + '.bak')
    # Errors
    self.errors = []
def __init__(self, *args, **kwargs):
    """Configure eBay.com matching options for the Lego USA feed."""
    super(LegoUsaEbaySpider, self).__init__()
    # Input feed and target site.
    self._csv_file = os.path.join(self.HERE, 'lego.csv')
    self._ebay_url = 'http://www.ebay.com'
    # Feed column indices used for searching and for item metadata.
    self._search_fields = [3, 2]
    self._meta_fields = [('sku', 2), ('name', 3), ('price', 4),
                         ('category', 1)]
    self._match_fields = ('sku', 'identifier')
    # Matching behaviour flags.
    self._converted_price = True
    self._all_vendors = True
    self._look_related = False
    self._check_valid_item = self._valid_item_
    self._re_sku = re.compile(r'(\d{3,})')
    self._check_diff_ratio = True
    # self._ratio_accuracy = 60
    self.matcher = Matcher(self.log)
def __init__(self, *args, **kwargs):
    """Load tyre sizes and manufacturer marks from the bundled CSVs."""
    super(EvenTyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    self.manually_matched = []
    # Keep an independent copy of every stock-code row.
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes = [row.copy() for row in csv.DictReader(stock_file)]
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks = {
            row['code']: row['manufacturer_mark']
            for row in csv.DictReader(marks_file)
        }
    self.errors = []
def __init__(self, *args, **kwargs):
    """Load search data and register the idle handler that feeds searches."""
    super(TyreGiantSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks.update(
            (row['code'], row['manufacturer_mark'])
            for row in csv.DictReader(marks_file))
    self.brands = [row['Brand'] for row in self.tyre_sizes]
    self.search_history = set()
    self.finished = False
    # New searches are dispatched from the spider_idle signal handler.
    dispatcher.connect(self.spider_idle, signals.spider_idle)
def __init__(self, *args, **kwargs):
    """Load stock codes plus the IP codes cached by earlier runs."""
    super(MyTyresSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    # IP codes collected previously, keyed by product identifier.
    self.ip_codes = {}
    self.ip_codes_filename = os.path.join(HERE, 'mytyres_ip_codes.csv')
    if os.path.exists(self.ip_codes_filename):
        with open(self.ip_codes_filename) as cache_file:
            self.ip_codes = {
                row['identifier']: row['ip_code']
                for row in csv.DictReader(cache_file)
            }
    self.errors = []
    self.search_history = set()
def __init__(self, *args, **kwargs):
    """Load manufacturer marks and the cached product-image mapping."""
    super(LoveTyresSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks = {
            row['code']: row['manufacturer_mark']
            for row in csv.DictReader(marks_file)
        }
    # Image URLs cached from a previous run, keyed by product URL.
    self.images = {}
    if os.path.exists(self.images_filename):
        with open(self.images_filename) as images_file:
            self.images = {
                row['product_url']: row['image_url']
                for row in csv.DictReader(images_file)
            }
    self.errors = []
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, *args, **kwargs):
    """Load stock codes, manufacturer marks, cached IP codes and the
    custom fitment-suffix marks used by etyres product names."""
    super(EtyresSpider, self).__init__(*args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.matcher = Matcher(self.log)
    self.all_man_marks = {}
    self.custom_man_marks = {}
    self.tyre_sizes = []
    self.tyre_widths = {}
    self.tyre_profiles = {}
    self.tyre_rims = {}
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.tyre_sizes.append(row)
    with open(os.path.join(HERE, 'manmarks.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            self.all_man_marks[row['code']] = row['manufacturer_mark']
    # IP codes collected on previous runs, keyed by product identifier.
    self.ip_codes = {}
    self.ip_codes_filename = os.path.join(HERE, 'etyres_ip_codes.csv')
    if os.path.exists(self.ip_codes_filename):
        with open(self.ip_codes_filename) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.ip_codes[row['identifier']] = row['ip_code']
    # One dict literal instead of sixteen separate assignments.
    # NOTE: leading spaces/dashes in the keys are significant -- they
    # match the raw suffix text as it appears in product names.
    self.custom_man_marks.update({
        ' JAGUAR FITMENT': 'J',
        ' RAV4 FITMENT': '',
        ' NISSAN JUKE FITMENT': '',
        ' (PORSCHE FITMENT)': 'N0',
        ' LEXUS FITMENT': '',
        ' PRIUS FITMENT': '',
        ' TOYOTA AURIS FITMENT': '',
        ' - TOYOTA RAV4 FITMENT': '',
        ' BMW MINI FITMENT': '*',
        ' AUDI FITMENT': 'AO',
        ' JAG FITMENT': 'J',
        ' FERRARI MASERATI FITMENT': '',
        ' MASERATI FITMENT': '',
        ' - BMW FITMENT': '*',
        ' ASTON MARTIN FITMENT': '',
        ' MERCEDES & RENAULT FITMENT': 'MO',
    })
def __init__(self, *args, **kwargs):
    """Configure eBay.de matching options for the Husqvarna feed."""
    super(HusqvarnaDEEbaySpider, self).__init__()
    self._ebay_url = 'http://www.ebay.de'
    self._search_fields = ['brand', 'sku']
    self._all_vendors = True
    self._meta_fields = [('name', 'name'), ('price', 'price'),
                         ('brand', 'brand'), ('category', 'category')]
    self._match_fields = ('sku', )
    self._check_valid_item = self.__valid_item_
    self._converted_price = False
    # Was assigned twice in the original code; once is enough.
    self._check_diff_ratio = True
    self._re_sku = re.compile(r'(\d{3,})')
    self._look_related = False
    self.__collected_items = set()
    self.matcher = Matcher(self.log)
def __init__(self, *args, **kwargs):
    """Load stock codes (processed in reverse file order) and marks."""
    super(TyrebookersSpider, self).__init__(*args, **kwargs)
    self.matcher = Matcher(self.log)
    with open(os.path.join(HERE, 'mtsstockcodes.csv')) as stock_file:
        self.tyre_sizes.extend(csv.DictReader(stock_file))
    # Work through the rows in reverse order.
    self.tyre_sizes = list(reversed(self.tyre_sizes))
    with open(os.path.join(HERE, 'manmarks.csv')) as marks_file:
        self.all_man_marks.update(
            (row['code'], row['manufacturer_mark'])
            for row in csv.DictReader(marks_file))
    self.already_processed = []
    # NOTE: leading spaces in the keys are significant.
    self.custom_man_marks.update({
        ' Merc': 'MO',
        ' BMW': '*',
        ' Audi': 'AO',
    })
class OponeoSpider(BaseSpider):
    """Scrapes tyre offers from oponeo.co.uk, one search per stock-code row."""

    name = 'oponeo.co.uk_test'
    allowed_domains = ['oponeo.co.uk']
    start_urls = ('http://www.oponeo.co.uk', )
    tyre_sizes = []
    all_man_marks = {}
    download_delay = 1

    def __init__(self, *args, **kwargs):
        """Load the stock-code rows and the manufacturer-mark table."""
        super(OponeoSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.errors = []

    def start_requests(self):
        """Issue one tyre-finder search per stock-code row.

        Each request gets its own cookiejar so the searches do not share
        session state.  (The unused ``requests`` list from the original
        code has been removed.)
        """
        cookie = 1
        self.log("[OPONEO] Row to process: %d" % len(self.tyre_sizes))
        for i, row in enumerate(self.tyre_sizes, 1):
            self.log("[OPONEO] Searching for tyre %d: %s, MTS code: %s" %
                     (i, row['Full Tyre Size'], row['MTS Stockcode']))
            search = str(row['Width']) + '/' + str(row['Aspect Ratio']) + \
                str(row['Speed rating']) + str(row['Rim'])
            meta = {'row': row, 'search': search, 'cookiejar': cookie}
            cookie += 1
            search_url = 'http://www.oponeo.co.uk/tyre-finder/s=2/summer,all-season/t=1/car/r=1/{Width}-{Aspect Ratio}-r{Rim}'.format(**row)
            yield Request(
                search_url,
                meta=meta,
                headers={
                    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0'
                })

    def parse(self, response):
        """Follow pagination (ASP.NET postback) and each product page."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        products = hxs.select('//*[@id="productList"]//div[@itemprop="offers"]')
        next_page = hxs.select(
            '//li[contains(@class, "next") and contains(@class, "nextItem")]/a/@id'
        ).extract()
        if next_page:
            next_page_id = next_page[0]
            # Pagination is an ASP.NET async postback, hence the form
            # submission with __EVENTTARGET instead of a plain GET.
            req = FormRequest.from_response(
                response,
                formname='form1',
                formdata={
                    '__ASYNCPOST': 'true',
                    '__EVENTTARGET': next_page_id,
                    '__EVENTARGUMENT': ''
                },
                headers={
                    'X-MicrosoftAjax': 'Delta=true',
                    'X-Requested-With': 'XMLHttpRequest',
                    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:30.0) Gecko/20100101 Firefox/30.0'
                },
                meta=response.meta,
                callback=self.parse_next_page,
                dont_filter=True)
            yield req
        for product in products:
            product_url = product.select(
                './/div[@class="productName"]//a/@href')[0].extract()
            yield Request(urljoin(base_url, product_url),
                          callback=self.parse_product,
                          meta=response.meta)

    def parse_next_page(self, response):
        yield Request(response.url, dont_filter=True, meta=response.meta)

    def retry_request(self, response):
        """Re-request a product page, giving up after max_retry_count tries."""
        try_no = response.meta.get('try', 1)
        if try_no < self.max_retry_count:
            meta = {'try': try_no + 1}
            meta['recache'] = True
            self.log("[WARNING] Retrying. Failed to scrape product page: %s" %
                     response.url)
            yield Request(response.url,
                          meta=meta,
                          callback=self.parse_product,
                          dont_filter=True)
        else:
            self.log("[WARNING] Gave up. Failed to scrape product page: %s" %
                     response.url)
            self.errors.append("Failed to scrape product page: %s" %
                               response.url)

    def parse_product(self, response):
        """Extract one product plus its Micheldever metadata."""
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        # The full name of the tyre (product_name) is used to extract
        # metadata (i.e. run flat, xl); the pattern should be set as the
        # product's name.
        fitting_method = 'Delivered'
        loader.add_value('url', response.url)
        image_url = hxs.select('//img[@itemprop="image"]/@src').extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url[0]))
        identifier = hxs.select('//form[@name="form1"]/@action').extract()
        if not identifier:
            # BUGFIX: retry_request is a generator -- re-yield the requests
            # it produces instead of yielding the generator object itself.
            for req in self.retry_request(response):
                yield req
            return
        identifier = identifier[0]
        loader.add_value('identifier', identifier)
        price = hxs.select(
            '//*[@class="price"]/*[@class="mainPrice"]/text()')[0].extract()
        loader.add_value('price', price)
        if not loader.get_output_value('price'):
            loader.add_value('stock', 0)
        brand = hxs.select(
            '//div[@class="hidden"]/input[@class="producerName"]/@value'
        ).extract()
        if not brand:
            # BUGFIX: same generator re-yield as above.
            for req in self.retry_request(response):
                yield req
            return
        brand = brand[0].strip()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category',
                         find_brand_segment(loader.get_output_value('brand')))
        # Normalise the Polish 'e with ogonek' before stripping the brand
        # out of the product name.
        brand = re.sub(u'\u0119', u'e', brand)
        product_name = hxs.select(
            '//h1[@itemprop="name"]/text()')[0].extract().strip()
        product_name = re.sub(u'[:\u2122]', u'', product_name)
        product_name = product_name.replace(brand, '').strip()
        data = parse_pattern(product_name)
        if not data:
            log.msg('ERROR parsing "{}" [{}]'.format(product_name,
                                                     response.url))
            self.errors.append('ERROR parsing "{}" [{}]'.format(
                product_name, response.url))
            return
        loader.add_value('name', data['Name'])
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating'] or ''
        metadata['alternative_speed_rating'] = ''
        xl = 'XL' in product_name
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = 'run on flat' in product_name.lower(
        ) or 'run flat' in product_name.lower()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'
        manufacturer_mark = [
            mark for mark in self.all_man_marks.keys()
            if mark in product_name.split(' ')
        ]
        # Use '' (not []) as the 'no mark' sentinel; both are falsy, so
        # the behaviour of the checks below is unchanged.
        manufacturer_mark = manufacturer_mark[0].strip(
        ) if manufacturer_mark else ''
        metadata['manufacturer_mark'] = self.all_man_marks.get(
            manufacturer_mark, '') if manufacturer_mark else ''
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))
        # metadata['alternative_speed_rating']))
        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            return
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating
        yield product

    def match_name(self, search_name, new_item, match_threshold=80,
                   important_words=None):
        """Return True when the matcher ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class EvenTyresSpider(Spider):
    """Scrapes fitted tyre prices from event-tyres.co.uk."""

    name = 'event-tyres.co.uk'
    allowed_domains = ['event-tyres.co.uk']
    website_url = 'http://www.event-tyres.co.uk/'
    postal_code = 'WA5 7ZB'
    price_discount = False  # extract multiple tyre discount price?

    def __init__(self, *args, **kwargs):
        """Load tyre sizes and manufacturer marks from the bundled CSVs."""
        super(EvenTyresSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        self.tyre_sizes = []
        self.all_man_marks = {}
        self.manually_matched = []
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                new_row = row.copy()
                self.tyre_sizes.append(new_row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.errors = []

    def start_requests(self):
        """Open the site once per tyre size, each in its own cookiejar."""
        for i, row in enumerate(self.tyre_sizes):
            yield Request(self.website_url,
                          callback=self.next_search,
                          meta={
                              'row': row,
                              'cookiejar': str(i)
                          },
                          dont_filter=True)

    def next_search(self, response):
        """Submit the search form with the CSRF token from the page."""
        form_token = response.xpath(
            '//input[@id="search_form__token"]/@value').extract()[0]
        row = response.meta['row']
        params = {
            'search_form[width]': row['Width'],
            'search_form[profile]': row['Aspect Ratio'],
            'search_form[size]': row['Rim'],
            'search_form[postcode]': self.postal_code,
            'search_form[_token]': form_token,
            'search_form[search]': '',
        }
        r = FormRequest(url=self.website_url,
                        meta={'cookiejar': response.meta['cookiejar']},
                        formdata=params)
        yield r

    def parse(self, response):
        """Walk result pages and yield one product per listed tyre."""
        pages = set(
            response.xpath(
                '//*[contains(@class, "pagination__item")]/a[not(contains(@class, "pagination__current"))]/@href'
            ).extract())
        for page_url in pages:
            yield Request(response.urljoin(page_url), meta=response.meta)
        products = response.xpath(
            '//article[@itemtype="http://schema.org/Product"]')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            brand = product_el.xpath(
                './/*[@itemprop="brand"]//*[@itemprop="name"]/text()').extract(
                )[0].strip()
            if brand.upper() in brands_substitute:
                brand = brands_substitute[brand.upper()]
            full_name = product_el.xpath(
                './/*[contains(@class, "product__title") and @itemprop="name"]/text()'
            ).extract()[0]
            try:
                # The title is "<size> <brand> <pattern>"; splitting on the
                # brand separates size from pattern name.
                tyre_size, name = re.split(brand, full_name, flags=re.I)
            except ValueError:
                self.log(
                    "[[TESTING]] Can not split tyre '%s' with brand '%s'" %
                    (full_name, brand))
                continue
            # tyre_size, name = full_name.split(brand)
            loader.add_value('name', name)
            winter_tyre = product_el.xpath(
                './/*[@class="product__info"]//*[@data-icon="S" and contains(text(), "Winter")]'
            )
            if not winter_tyre:
                loader.add_value('brand', unify_brand(brand))
                loader.add_value(
                    'category',
                    find_brand_segment(loader.get_output_value('brand')))
                identifier = self.get_identifier(product_el)
                out_of_stock = product_el.xpath(
                    './/*[@itemprop="availability" and contains(@content, "Out")]'
                )
                if out_of_stock:
                    loader.add_value('stock', 0)
                loader.add_value('url', response.url)
                image_url = product_el.xpath(
                    './/img[@itemprop="image"]/@src').extract()
                if image_url:
                    loader.add_value('image_url',
                                     response.urljoin(image_url[0]))
                loader.add_value('identifier', identifier)
                price = product_el.xpath('@data-price').extract()[0]
                loader.add_value('price', price)
                metadata = MicheldeverMeta()
                res = parse_pattern(tyre_size)
                if not res:
                    continue
                width, ratio, rim, load_rating, speed_rating = res
                metadata['aspect_ratio'] = ratio
                metadata['rim'] = rim
                metadata['speed_rating'] = speed_rating
                metadata['load_rating'] = load_rating
                metadata['width'] = width
                metadata['fitting_method'] = 'Fitted'
                metadata['alternative_speed_rating'] = ''
                xl = bool(
                    product_el.xpath(
                        './/*[@class="product__info"]//*[@data-icon="XL"]'))
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat_found = is_run_flat(full_name)
                run_flat = bool(
                    product_el.xpath(
                        './/*[@class="product__info"]//*[@data-icon="RF"]'))
                if not run_flat:
                    run_flat = ' RFT' in name
                metadata[
                    'run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'
                man_code = self._get_manufacturer_code(full_name)
                metadata['manufacturer_mark'] = man_code
                metadata['full_tyre_size'] = '/'.join(
                    (metadata['width'], metadata['aspect_ratio'],
                     metadata['rim'], metadata['load_rating'],
                     metadata['speed_rating']))
                try:
                    fuel, grip, noise = product_el.xpath(
                        './/li[contains(@class, "product__meta-item--")]/text()'
                    ).extract()
                except ValueError:
                    # BUGFIX: was a bare `except:`; only the 3-way unpack
                    # is expected to fail here (wrong number of items).
                    fuel, grip, noise = ('', '', '')
                metadata['fuel'] = fuel
                metadata['grip'] = grip
                metadata['noise'] = noise
                product = loader.load_item()
                # The website is defaulting to 2 tyres with a discount of £10
                if product.get('price') and (not self.price_discount):
                    product['price'] += Decimal('10')
                product['metadata'] = metadata
                if not is_product_correct(product):
                    continue
                product['metadata'][
                    'mts_stock_code'] = self.find_mts_stock_code(product)
                yield product

    # Please don't remove this method. This method is overridden by the children.
    def find_mts_stock_code(self, product):
        return find_mts_stock_code(product, spider_name=self.name,
                                   log=self.log)

    # Please don't remove this method. This method is overridden by the children.
    def get_identifier(self, selector):
        return selector.xpath('@data-product').extract()[0]

    def _get_manufacturer_code(self, name):
        """Return the mark whose code appears as a word in the name."""
        name = name.upper()
        for code, manufacturer_mark in self.all_man_marks.items():
            if code not in name:
                continue
            if code in name.split(' ') or code == '*':
                return manufacturer_mark
        return ''

    def match_name(self, search_name, new_item, match_threshold=90,
                   important_words=None):
        """Return True when the matcher ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class BlackcirclesSpider(Spider):
    """Scrapes fitted tyre prices from blackcircles.com search results."""

    name = 'blackcircles.com'
    allowed_domains = ['blackcircles.com']
    start_urls = ('http://www.blackcircles.com', )
    tyre_sizes = []
    errors = []
    seen_ids = set()

    def __init__(self, *args, **kwargs):
        """Load the MTS stock-code rows used to drive the searches."""
        super(BlackcirclesSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)

    def start_requests(self):
        """Search once per distinct (profile, rim, width) combination."""
        search_seen = set()
        for row in self.tyre_sizes:
            formdata = {
                'profile': row['Aspect Ratio'],
                'rim': row['Rim'],
                'speed': 'Any',
                'width': row['Width'],
                'displayall': '999',
                'delivery': '0',
            }
            search_key = '{}:{}:{}'.format(row['Aspect Ratio'], row['Rim'],
                                           row['Width'])
            if search_key not in search_seen:
                yield FormRequest(
                    'http://www.blackcircles.com/order/tyres/search',
                    dont_filter=True,
                    formdata=formdata,
                    meta={'row': row},
                    callback=self.parse)
                search_seen.add(search_key)
            else:
                self.log('Duplicate search: {}'.format(search_key))

    def parse(self, response):
        """Parse the embedded JsonObject payload; yield cheapest variants."""
        row = response.meta['row']
        json_data = None
        # The result set is embedded in the page as "JsonObject = {...};".
        for line in response.body.split('\n'):
            if "JsonObject = " in line:
                json_data = json.loads(
                    line.replace('JsonObject = ', '').replace('; \r', ''))
        if json_data is None:
            # BUGFIX: previously fell through and crashed with a TypeError
            # on json_data['Rest'] when the payload was missing.
            self.log('No JsonObject found in search response: {}'.format(
                response.url))
            self.errors.append('No JsonObject found in search response: {}'.format(
                response.url))
            return
        products = json_data['Rest'] + json_data['Deals']
        collected_products = []
        self.log('Results found {} {}'.format(len(products), response.meta))
        for product_info in products:
            # skip winter tyres
            if product_info['WinterTyre']:
                continue
            loader = ProductLoader(item=Product(), selector=product_info)
            loader.add_value('name', product_info['ModelName'])
            brand = product_info['Manufacturer']
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_info['PrimaryId']
            fitting_method = 'Fitted'
            if str(identifier) + '-' + fitting_method in self.seen_ids:
                continue
            url = '/catalogue' + product_info[
                'CatalogueUrl'] + '/f?tyre=' + str(product_info['PrimaryId'])
            loader.add_value('url', response.urljoin(url))
            image_url = product_info.get('ModelImageLarge')
            if not image_url:
                image_url = product_info.get('ModelImage')
            if image_url:
                # The field holds an <img> snippet; pull out the src value.
                image_url = image_url.split('src="')[-1].split('"')[0]
                loader.add_value('image_url', response.urljoin(image_url))
            spec = product_info['SpecificationName']
            metadata = MicheldeverMeta()
            # metadata['mts_stock_code'] = row['MTS Stockcode']
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            metadata['speed_rating'] = spec.split()[-1]
            metadata['width'] = row['Width']
            load_rating = product_info['LoadRatingName']
            metadata['load_rating'] = load_rating
            metadata['alternative_speed_rating'] = ''
            xl = product_info['Reinforced']
            metadata['xl'] = 'Yes' if xl else 'No'
            run_flat_found = is_run_flat(product_info['ModelName'])
            run_flat = product_info['RunFlat']
            metadata[
                'run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'
            manufacturer_mark = product_info['Variant']
            if manufacturer_mark:
                manufacturer_mark = manufacturer_mark.split()[0].strip()
            full_tyre_size = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            # MOE Exception for this product
            if manufacturer_mark and 'MO EXTENDED' in product_info['Variant'].upper()\
                    and product_info['ModelName'] == 'Potenza S001' and full_tyre_size == '245/40/18/97/Y':
                metadata['manufacturer_mark'] = 'MOE'
            else:
                metadata['manufacturer_mark'] = find_man_mark(
                    manufacturer_mark) if manufacturer_mark else ''
            metadata['full_tyre_size'] = full_tyre_size
            try:
                metadata['fuel'] = product_info['TyreLabelFuel']['Score']
            except Exception:
                metadata['fuel'] = ''
            try:
                metadata['grip'] = product_info['TyreLabelWet']['Score']
            except Exception:
                metadata['grip'] = ''
            try:
                metadata['noise'] = product_info['TyreLabelNoise'][
                    'NoiseLevel']
            except Exception:
                metadata['noise'] = ''
            product = loader.load_item()
            product['metadata'] = metadata
            product['price'] = product_info['FullyFittedPrice']
            fitting_method = 'Fitted'
            product['identifier'] = str(identifier) + '-' + fitting_method
            product['metadata']['fitting_method'] = fitting_method
            t1 = time.time()
            if not is_product_correct(product):
                self.log('Search: {}'.format(str(response.meta)))
                self.seen_ids.add(str(identifier) + '-' + fitting_method)
                self.log('PRODUCT IS NOT CORRECT => %r' % product)
                continue
            t2 = time.time()
            self.log('Time taken by product correct: {}'.format(t2 - t1))
            t1 = time.time()
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)
            t2 = time.time()
            self.log('Time taken by mts stock: {}'.format(t2 - t1))
            collected_products.append(product)
        # Keep only the cheapest product per (brand, name, fitting, size,
        # xl, run-flat, mark) combination.
        min_price_products = {}
        for product in collected_products:
            key = "%s-%s-%s-%s-%s-%s-%s" % (
                product['brand'], product['name'],
                product['metadata']['fitting_method'],
                product['metadata']['full_tyre_size'],
                product['metadata']['xl'], product['metadata']['run_flat'],
                product['metadata']['manufacturer_mark'])
            if key in min_price_products:
                if product['price'] < min_price_products[key]['price']:
                    min_price_products[key] = product
            else:
                min_price_products[key] = product
        for product in min_price_products.values():
            self.seen_ids.add(product['identifier'])
            yield product

    def match_name(self, search_name, new_item, match_threshold=90,
                   important_words=None):
        """Return True when the matcher ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class KwikFitSpider(BaseSpider):
    """Scrapes fitted tyre prices from kwik-fit.com searches."""

    name = 'kwik-fit.com_test'
    allowed_domains = ['kwik-fit.com']
    start_urls = ('http://www.kwik-fit.com',)
    tyre_sizes = []
    download_delay = 0.1

    def __init__(self, *args, **kwargs):
        """Load the MTS stock-code rows used to drive the searches."""
        super(KwikFitSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)

    @staticmethod
    def _search_formdata(search):
        """Build the tyre-search form payload for one size string.

        Extracted helper: the original built this same 11-key dict twice.
        """
        return {
            'dts': search,
            'sop': 'TyreSize',
            'ssq': '3',
            'tsf': search,
            'tsr': search,
            'MobileQuote': 'false',
            'ShowSummerTyres': 'true',
            'ShowTyresForBookOnline': 'true',
            'ShowTyresForQuotation': 'true',
            'ShowWinterTyres': 'true',
            'Stage': '2',
        }

    def start_requests(self):
        """Search every size; repeat with the alternative speed rating."""
        for row in self.tyre_sizes:
            search = row['Width'] + '/' + row['Aspect Ratio'] + \
                row['Speed rating'] + row['Rim']
            yield FormRequest('http://www.kwik-fit.com/tyre-search.asp',
                              dont_filter=True,
                              formdata=self._search_formdata(search),
                              meta={'row': row, 'search': search},
                              callback=self.parse)
            if row['Alt Speed']:
                search = row['Width'] + '/' + row['Aspect Ratio'] + \
                    row['Alt Speed'] + row['Rim']
                yield FormRequest('http://www.kwik-fit.com/tyre-search.asp',
                                  dont_filter=True,
                                  formdata=self._search_formdata(search),
                                  meta={'row': row, 'search': search},
                                  callback=self.parse)

    def parse(self, response):
        """Yield one product per non-winter tyre in the result list."""
        hxs = HtmlXPathSelector(response)
        products = hxs.select('//div[contains(@id,"Tyre") and contains(@class, "tyre-list-tyre")]')
        for product in products:
            loader = ProductLoader(item=Product(), selector=product)
            loader.add_xpath('name', 'div//div[@class="manufacturerText"]/p/strong/text()')
            brand = ''.join(product.select('div//div[@class="manufacturerImage"]/img/@alt').extract()).split(' - ')[0]
            winter_tyre = product.select('div//img[@alt="Winter Tyre"]')
            if not winter_tyre:
                loader.add_value('brand', unify_brand(brand))
                loader.add_value('category', find_brand_segment(loader.get_output_value('brand')))
                identifier = product.select('div//div[@class="pricingAddToOrder clearfix"]/input/@value').extract()[0]
                loader.add_value('url', '')
                image_url = product.select('div[@class="image"]/img/@src').extract()
                if image_url:
                    loader.add_value('image_url', urljoin(get_base_url(response), image_url[0]))
                loader.add_value('identifier', identifier)
                price = product.select('div//div[contains(@class, "pricingSelection")]//a/strong/text()').extract()
                # BUGFIX: the decimal point must be escaped; the original
                # r"\d+.\d+" let '.' match any character.
                price = re.findall(r"\d+\.\d+", price[0]) if price else '0.0'
                loader.add_value('price', price)
                tyresize_text = product.select('.//div[contains(@class, "manufacturerText")]/p/span/text()').extract()[0].strip()
                width, aspect, speed_rating, rim = re.search(r'tyre size (\d+)\/(\d+)(\w{1})(\d+)', tyresize_text, re.I).groups()
                fitting_method = 'Fitted'
                metadata = MicheldeverMeta()
                metadata['aspect_ratio'] = aspect
                metadata['rim'] = rim
                metadata['speed_rating'] = speed_rating
                metadata['width'] = width
                metadata['fitting_method'] = fitting_method
                load_rating = product.select('div//li/a[@rel="load-index-description"]/text()').extract()
                metadata['load_rating'] = load_rating[0].split(': ')[-1] if load_rating else ''
                metadata['alternative_speed_rating'] = ''
                xl = product.select('div//img[@title="Reinforced"]/@title').extract()
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat = product.select('div//img[@title="Run Flat"]').extract()
                metadata['run_flat'] = 'Yes' if run_flat else 'No'
                manufacturer_mark = product.select('div//img[contains(@title, "Homologated for fitment to certai")]/@title').extract()
                manufacturer_mark = manufacturer_mark[0].replace('Homologated for fitment to certain ', '').replace(' cars.', '') if manufacturer_mark else ''
                metadata['manufacturer_mark'] = find_man_mark(manufacturer_mark) if manufacturer_mark else ''
                metadata['full_tyre_size'] = '/'.join((metadata['width'],
                                                       metadata['aspect_ratio'],
                                                       metadata['rim'],
                                                       metadata['load_rating'],
                                                       metadata['speed_rating']))
                # metadata['alternative_speed_rating']))
                # Renamed from 'product' to avoid shadowing the loop variable.
                item = loader.load_item()
                item['metadata'] = metadata
                if not is_product_correct(item):
                    continue
                item['metadata']['mts_stock_code'] = find_mts_stock_code(item, spider_name=self.name, log=self.log)
                new_speed_rating = get_speed_rating(item)
                new_alt_speed = get_alt_speed(item)
                item['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                    item['metadata']['speed_rating'] if item['metadata']['speed_rating'] != new_speed_rating else ''
                item['metadata']['speed_rating'] = new_speed_rating
                yield item

    def match_name(self, search_name, new_item, match_threshold=80,
                   important_words=None):
        """Return True when the matcher ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class TyreGiantSpider(BaseSpider):
    """Scrapes fitted tyre prices from tyregiant.com.

    Searches are issued one at a time: each search is driven from the
    mtsstockcodes.csv size rows, and the spider_idle signal is used to
    restart parsing until every row has been searched (sequential crawl).
    """
    name = 'tyregiant.com_test'
    allowed_domains = ['tyregiant.com']
    start_urls = ('http://www.tyregiant.com/', )
    # NOTE(review): these are class-level mutable attributes, shared across
    # instances; populated per-instance in __init__.
    tyre_sizes = []
    brands = []
    manually_matched = []
    all_man_marks = {}
    download_delay = 0.1

    def __init__(self, *args, **kwargs):
        """Load tyre size rows and manufacturer-mark codes from CSV fixtures."""
        super(TyreGiantSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.brands = [row['Brand'] for row in self.tyre_sizes]
        # Keys of searches already issued, to avoid repeating a size/speed combo.
        self.search_history = set()
        self.finished = False
        # Re-enter parse() on idle until next_search() exhausts all rows.
        dispatcher.connect(self.spider_idle, signals.spider_idle)

    def _get_history_key(self, search_params):
        """Build the dedup key for a search-parameter dict."""
        key = "%(width)s-%(rim)s-%(aspect_ratio)s-%(speed_rating)s" % search_params
        return key

    def check_in_history(self, search_params):
        """Return True when this width/rim/aspect/speed combo was already searched."""
        if self._get_history_key(search_params) in self.search_history:
            return True
        return False

    def add_to_history(self, search_params):
        """Record a search-parameter combo as done."""
        self.search_history.add(self._get_history_key(search_params))

    def spider_idle(self, spider):
        """Keep the spider alive, scheduling the next search, until finished."""
        if not self.finished:
            request = Request(self.start_urls[0], dont_filter=True, callback=self.parse)
            self._crawler.engine.crawl(request, self)
            raise DontCloseSpider

    def parse(self, response):
        # Entry point simply delegates to the search generator.
        for r in self.next_search():
            yield r

    def next_search(self):
        """Yield at most one search Request for the first not-yet-searched combo.

        Tries the primary speed rating then the alternative one for each CSV
        row; when every combo has been searched, flags the crawl as finished.
        """
        request_sent = False
        for i, row in enumerate(self.tyre_sizes, 1):
            for speed_rating in [row['Speed rating'], row['Alt Speed']]:
                if not speed_rating:
                    continue
                search_params = {
                    'width': row['Width'],
                    'aspect_ratio': row['Aspect Ratio'],
                    'speed_rating': speed_rating,
                    'rim': row['Rim']
                }
                if self.check_in_history(search_params):
                    continue
                self.log("Checking row: %s" % str({
                    'width': row['Width'],
                    'aspect_ratio': row['Aspect Ratio'],
                    'speed_rating': row['Speed rating'],
                    'rim': row['Rim']
                }))
                self.add_to_history(search_params)
                url = 'http://www.tyregiant.com/%(width)s-%(aspect_ratio)s-%(rim)s?speed=%(speed_rating)s' % \
                    search_params
                yield Request(url,
                              dont_filter=True,
                              meta={'search_params': search_params},
                              callback=self.parse_search)
                request_sent = True
                break
            if request_sent:
                break
        else:
            # for/else: no request was sent -> every combo is in history.
            self.finished = True
            return

    def parse_search(self, response):
        """After a size search, fetch the first AJAX page of tyre results."""
        meta = response.meta
        url = 'http://www.tyregiant.com/update-tyres/1'
        meta['page'] = 1
        yield Request(url, dont_filter=True, callback=self.parse_products, meta=meta)

    def parse_products(self, response):
        """Parse one JSON-wrapped HTML page of tyres; paginate while non-empty."""
        html_response = json.loads(response.body)['display_tyres']
        hxs = HtmlXPathSelector(text=html_response)
        search_params = response.meta['search_params']
        products = hxs.select('//div[contains(@class, "tyre_container")]')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            brand = product_el.select(
                './/form/span[@class="tyre_brand_text"]/text()').extract()
            brand = brand[0] if brand else ''
            # NOTE(review): winter check is made against the whole page (hxs),
            # not product_el — presumably winter searches render differently;
            # confirm against live markup.
            winter_tyre = hxs.select(
                '/div/div/div[@class="winter_img"]').extract()
            if not winter_tyre:
                # Normalise brand capitalisation against the known brand list.
                for tyre_brand in self.brands:
                    if tyre_brand.upper() == brand.strip().upper():
                        brand = tyre_brand
                full_name = product_el.select(
                    './/form/span[@class="tyre_brand_text"]/text()').extract()[-1]
                loader.add_value('name', full_name)
                loader.add_value('brand', unify_brand(brand))
                loader.add_value(
                    'category',
                    find_brand_segment(loader.get_output_value('brand')))
                identifier = product_el.select(
                    './/input[@name="tyre"]/@value').extract()
                loader.add_value('identifier', identifier)
                loader.add_value('url', 'http://www.tyregiant.com')
                image_url = product_el.select(
                    './/img[@class="tyre_image"]/@src').extract()
                if image_url:
                    loader.add_value(
                        'image_url', urljoin(get_base_url(response), image_url[0]))
                price = product_el.select(
                    './/*[@class="tyre_price"]/span/text()').extract()
                if not price:
                    # Missing price means out of stock.
                    loader.add_value('stock', 0)
                loader.add_value('price', price)
                metadata = MicheldeverMeta()
                metadata['aspect_ratio'] = search_params['aspect_ratio']
                metadata['rim'] = search_params['rim']
                tyre_details = product_el.select(
                    './/form/p[@class="tyre_details"]/text()').extract()[0]
                # e.g. " 91V " -> load rating "91", speed rating "V".
                speed = re.search('(\s\d+\w+\s)', tyre_details)
                load_rating = speed.group().strip()[:-1] if speed else ''
                speed_rating = speed.group().strip()[-1] if speed else ''
                metadata['speed_rating'] = speed_rating
                metadata['load_rating'] = load_rating
                metadata['width'] = search_params['width']
                metadata['fitting_method'] = 'Fitted'
                metadata['alternative_speed_rating'] = ''
                xl = product_el.select(
                    './/img[@class="xl_img"]/@src').extract()
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat = product_el.select(
                    './/img[@class="rf_img"]/@src').extract()
                metadata['run_flat'] = 'Yes' if run_flat else 'No'
                metadata['manufacturer_mark'] = self._get_manufacturer_code(
                    full_name)
                metadata['full_tyre_size'] = '/'.join(
                    (search_params['width'], search_params['aspect_ratio'],
                     search_params['rim'], metadata['load_rating'],
                     metadata['speed_rating']))
                # metadata['alternative_speed_rating']))
                product = loader.load_item()
                product['metadata'] = metadata
                if not is_product_correct(product):
                    continue
                product['metadata']['mts_stock_code'] = find_mts_stock_code(
                    product, spider_name=self.name, log=self.log)
                # Reconcile speed rating with the canonical stock-code data;
                # keep the scraped rating as the alternative when they differ.
                new_speed_rating = get_speed_rating(product)
                new_alt_speed = get_alt_speed(product)
                product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
                    product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
                product['metadata']['speed_rating'] = new_speed_rating
                yield product
        if products:
            # Non-empty page: request the next AJAX page.
            meta = response.meta
            next_page = meta['page'] + 1
            next_url = 'http://www.tyregiant.com/update-tyres/%s' % str(
                next_page)
            meta['page'] = next_page
            yield Request(next_url,
                          dont_filter=True,
                          callback=self.parse_products,
                          meta=meta)

    def _get_manufacturer_code(self, name):
        """Return the manufacturer mark whose code appears as a word in name.

        '*' is special-cased because it never survives a whitespace split.
        """
        name = name.upper()
        for code, manufacturer_mark in self.all_man_marks.items():
            if code not in name:
                continue
            if code in name.split(' ') or code == '*':
                return manufacturer_mark
        return ''

    def match_name(self, search_name, new_item, match_threshold=90, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class TyreDriveSpider(BaseSpider):
    """Scrapes delivered tyre prices from tyredrive.co.uk.

    One search request per mtsstockcodes.csv row; result pages link to
    individual product pages which are parsed for price and tyre metadata.
    """
    name = 'micheldever-tyredrive.co.uk_test'
    allowed_domains = ['tyredrive.co.uk']
    start_urls = ('http://www.tyredrive.co.uk', )
    # Class-level mutable attributes, populated per-instance in __init__.
    tyre_sizes = []
    all_man_marks = {}

    def __init__(self, *args, **kwargs):
        """Load tyre size rows and manufacturer-mark codes from CSV fixtures."""
        super(TyreDriveSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        # Collected extraction errors (also logged as they occur).
        self.errors = []

    def start_requests(self):
        """Issue one site search per CSV size row (speed/brand left unfiltered)."""
        for row in self.tyre_sizes:
            search = str(row['Width']) + '/' + str(row['Aspect Ratio']) + \
                str(row['Speed rating']) + str(row['Rim'])
            parameters = {
                'section': row['Width'],
                'profile': row['Aspect Ratio'],
                'rim': row['Rim'],
                'speed': '0',
                'tyre_brand': '0',
                'submit': 'SEARCH'
            }
            yield Request('http://www.tyredrive.co.uk/search.php?' +
                          urllib.urlencode(parameters),
                          meta={
                              'row': row,
                              'search': search
                          },
                          callback=self.parse)

    def parse(self, response):
        """Follow pagination links and each product link on a result page."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        row = response.meta['row']
        products = hxs.select('//td[@class="tyreinfo"]/a/@href').extract()
        log.msg('Products found: {!s} items [{}]'.format(
            len(products), response.url))
        if not products:
            log.msg('No products: [{}]'.format(response.url))
        pages = hxs.select('//a[contains(@href,"pagpage")]/@href').extract()
        for page in pages:
            yield Request(urljoin(base_url, page), meta=response.meta)
        for url in products:
            yield Request(urljoin(base_url, url),
                          callback=self.parse_product,
                          meta=response.meta)

    def parse_product(self, response):
        """Extract one product and its tyre metadata from a product page."""
        hxs = HtmlXPathSelector(response)
        loader = ProductLoader(item=Product(), selector=hxs)
        # The full name of the tyre (name variable) is used to extract metadata
        # (i.e. run flat, xl); the pattern should be set as the product's name.
        name = hxs.select('//td[@class="tread"]/text()').extract()
        if not name:
            msg = "No name found on page: %s" % response.url
            self.errors.append(msg)
            self.log("[ERROR] %s" % msg)
            return
        loader.add_value('name', name[0])
        brand = hxs.select(
            '//table[@class="single searchresults"]//td[@class="tyreinfo"]/b/text()'
        ).extract()[0].strip()
        loader.add_value('brand', unify_brand(brand))
        loader.add_value('category', find_brand_segment(brand))
        fitting_method = 'Delivered'
        loader.add_value('url', response.url)
        out_of_stock = hxs.select(
            '//table[@class="single searchresults"]//span[@class="outofstock"]'
        )
        if out_of_stock:
            loader.add_value('stock', 0)
        image_url = hxs.select(
            '//table[@class="single searchresults"]//td[@class="logo-pic"]/img/@src'
        ).extract()
        if image_url:
            loader.add_value('image_url',
                             urljoin(get_base_url(response), image_url[0]))
        # NOTE(review): [0] on these selectors raises IndexError when the
        # element is missing — page-format changes fail loudly here.
        identifier = hxs.select(
            '//table[@class="single searchresults"]//form/input[@name="pid"]/@value'
        )[0].extract()
        loader.add_value('identifier', identifier)
        price = hxs.select(
            '//table[@class="single searchresults"]//td[@class="netprice"]/text()'
        )[0].extract()
        loader.add_value('price', price)
        # Re-bind name to the tyre-size pattern string (e.g. "205/55 R16 91V").
        name = hxs.select(
            '//table[@class="single searchresults"]//td[@class="tyreinfo"]/span/text()'
        )[0].extract()
        data = parse_pattern(name)
        if not data:
            log.msg('ERROR parsing "{}" [{}]'.format(name, response.url))
            self.errors.append('ERROR parsing "{}" [{}]'.format(
                name, response.url))
            return
        metadata = MicheldeverMeta()
        metadata['aspect_ratio'] = data['Aspect_Ratio']
        metadata['rim'] = data['Rim']
        metadata['speed_rating'] = data['Speed_Rating']
        metadata['width'] = data['Width']
        metadata['fitting_method'] = fitting_method
        metadata['load_rating'] = data['Load_Rating']
        metadata['alternative_speed_rating'] = ''
        xl = 'XL' in name
        metadata['xl'] = 'Yes' if xl else 'No'
        run_flat = 'rflat' in name.lower()
        metadata['run_flat'] = 'Yes' if run_flat else 'No'
        if '*' in name:
            # '*' (BMW mark) never survives a whitespace split, so special-case it.
            manufacturer_mark = '*'
        else:
            manufacturer_mark = [
                mark for mark in self.all_man_marks.keys()
                if mark in name.split(' ')
            ]
            # NOTE: falsy placeholder is [] rather than '' — only ever used
            # through the truthiness guard below.
            manufacturer_mark = manufacturer_mark[0].strip(
            ) if manufacturer_mark else []
        metadata['manufacturer_mark'] = self.all_man_marks.get(manufacturer_mark, '') if manufacturer_mark \
            else ''
        metadata['mts_stock_code'] = ''
        metadata['full_tyre_size'] = '/'.join(
            (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
             metadata['load_rating'], metadata['speed_rating']))
        # metadata['alternative_speed_rating']))
        product = loader.load_item()
        product['metadata'] = metadata
        if not is_product_correct(product):
            return
        product['metadata']['mts_stock_code'] = find_mts_stock_code(
            product, spider_name=self.name, log=self.log)
        # Reconcile speed rating with canonical data; keep the scraped rating
        # as the alternative when they differ.
        new_speed_rating = get_speed_rating(product)
        new_alt_speed = get_alt_speed(product)
        product['metadata']['alternative_speed_rating'] = new_alt_speed if new_alt_speed else \
            product['metadata']['speed_rating'] if product['metadata']['speed_rating'] != new_speed_rating else ''
        product['metadata']['speed_rating'] = new_speed_rating
        yield product

    def match_name(self, search_name, new_item, match_threshold=80, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class LoveTyresSpider(BaseSpider):
    """Scrapes delivered tyre prices from lovetyres.com.

    Product image URLs are cached in lovetyres_images.csv between runs so a
    product page only has to be fetched when its image is not yet known;
    the cache is rewritten on spider close.
    """
    name = 'lovetyres.com'
    allowed_domains = ['lovetyres.com']
    start_urls = ['http://www.lovetyres.com']
    images_filename = os.path.join(HERE, 'lovetyres_images.csv')

    def __init__(self, *args, **kwargs):
        """Load manufacturer marks and the persisted image-URL cache."""
        super(LoveTyresSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        self.images = {}  # product_url -> image_url cache
        self.all_man_marks = {}
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        if os.path.exists(self.images_filename):
            with open(self.images_filename) as f:
                reader = csv.DictReader(f)
                for row in reader:
                    self.images[row['product_url']] = row['image_url']
        self.errors = []
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        """Persist the image-URL cache back to CSV."""
        with open(self.images_filename, 'w') as f:
            writer = csv.DictWriter(f, ['product_url', 'image_url'])
            writer.writeheader()
            for product_url, image_url in self.images.items():
                writer.writerow({
                    'product_url': product_url,
                    'image_url': image_url
                })

    def start_requests(self):
        """Build one deduplicated search request per width/aspect/rim combo."""
        requests = []
        urls = set()
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                search = str(row['Width']) + '/' + str(row['Aspect Ratio']) + \
                    str(row['Speed rating']) + str(row['Rim'])
                search_url = 'http://www.lovetyres.com/search/tyres/%(Width)s-%(Aspect Ratio)s-%(Rim)s' % row
                if search_url not in urls:
                    self.log(search_url)
                    urls.add(search_url)
                    requests.append(
                        Request(search_url,
                                meta={'search': search},
                                callback=self.parse))
        self.log('TOTAL SEARCH REQUESTS: %s' % len(requests))
        return requests

    def parse(self, response):
        """Parse a search-results page, yielding products (or image requests)."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        products = hxs.select('//tr[contains(@class,"tyre-search-row")]')
        # NOTE(review): next_page is hard-coded empty, so this branch is dead —
        # pagination appears intentionally disabled.
        next_page = []
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[0]),
                          meta=response.meta)
        not_found_count = 0
        for product in products:
            url = product.select('.//td/b/a/@href')[0].extract()
            winter_tyre = product.select('.//td/b/a/text()')[0].extract()
            winter_tyre = 'winter' in winter_tyre.lower()
            if not winter_tyre:
                # Brand is derived from the brand-logo image path.
                brand = product.select('.//a/img/@src')[0].extract()
                brand = re.search('/public/brands/(.*?)(-tyres)?\.',
                                  brand).group(1).replace('-', ' ').title()
                product_name = product.select('.//td/b/a/text()')[0].extract()
                product_name = re.sub(brand, '', product_name).strip()
                fitting_method = 'Delivered'
                identifier = product.select(
                    './/input[@name="item_id"]/@value').extract()
                if not identifier:
                    # Out-of-stock rows expose the id via the stock-alert link.
                    identifier = product.select('.//a/@href').re(
                        'email_me_stock/(.*)')
                    if not identifier:
                        continue
                try:
                    # EU tyre-label values; any shape mismatch leaves them blank.
                    fuel, grip, noise = map(
                        unicode.strip,
                        product.select(
                            './/img[contains(@alt, "Tyre Label")]/following-sibling::text()'
                        ).extract())
                except:
                    fuel = ''
                    grip = ''
                    noise = ''
                price = product.select("td[3]/b/text()").extract()
                loader = ProductLoader(item=Product(), selector=hxs)
                loader.add_value('identifier', identifier[0])
                loader.add_value('name', product_name)
                loader.add_value('brand', unify_brand(brand))
                loader.add_value(
                    'category',
                    find_brand_segment(loader.get_output_value('brand')))
                loader.add_value('url', url)
                if price:
                    loader.add_value('price', price[0])
                else:
                    loader.add_value('price', '0.00')
                    loader.add_value('stock', 0)
                pattern_name = product.select('.//i/text()').extract()
                if not pattern_name:
                    continue
                pattern_name = pattern_name[0]
                # e.g. "205/55 R16 V (91)" -> width/aspect/rim/speed/load.
                data = re.search(
                    '(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) (?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
                    pattern_name)
                if data:
                    data = data.groupdict()
                else:
                    msg = 'ERROR parsing "{}" [{}]'.format(
                        pattern_name, response.url)
                    self.log(msg)
                    continue
                metadata = MicheldeverMeta()
                metadata['aspect_ratio'] = data['Aspect_Ratio']
                metadata['rim'] = data['Rim']
                metadata['speed_rating'] = data['Speed_Rating'].upper()
                metadata['width'] = data['Width']
                metadata['fitting_method'] = fitting_method
                metadata['load_rating'] = data['Load_Rating'] or ''
                metadata['alternative_speed_rating'] = ''
                xl = 'XL' in pattern_name
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat_found = is_run_flat(pattern_name)
                run_flat = 'run flat' in pattern_name.lower(
                ) or 'runflat' in pattern_name.lower() or run_flat_found
                metadata['run_flat'] = 'Yes' if run_flat else 'No'
                manufacturer_mark = [
                    mark for mark in self.all_man_marks.keys()
                    if mark in pattern_name.split(' ')
                ]
                # Falsy placeholder is [] rather than ''; guarded below.
                manufacturer_mark = manufacturer_mark[0].strip(
                ) if manufacturer_mark else []
                metadata['manufacturer_mark'] = find_man_mark(
                    manufacturer_mark) if manufacturer_mark else ''
                metadata['full_tyre_size'] = '/'.join(
                    (metadata['width'], metadata['aspect_ratio'],
                     metadata['rim'], metadata['load_rating'],
                     metadata['speed_rating']))
                metadata['fuel'] = fuel
                metadata['grip'] = grip
                metadata['noise'] = noise
                product = loader.load_item()
                product['metadata'] = metadata
                if not is_product_correct(product):
                    not_found_count += 1
                    self.log('%s - PRODUCT IS NOT CORRECT: %r' %
                             (not_found_count, product))
                    continue
                product['metadata']['mts_stock_code'] = find_mts_stock_code(
                    product, spider_name=self.name, log=self.log)
                if product['url'] in self.images:
                    # Image known from the cache: no extra page fetch needed.
                    product['image_url'] = self.images[product['url']]
                    yield product
                else:
                    yield Request(product['url'],
                                  callback=self.parse_image,
                                  meta={'product': product},
                                  dont_filter=True)

    def parse_image(self, response):
        """Fetch a product page only to pick up its image URL, then yield."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        loader = ProductLoader(item=response.meta['product'],
                               selector=response)
        image_url = hxs.select('//div[@class="item"]/a/img/@src').extract()
        if image_url:
            loader.add_value('image_url', urljoin_rfc(base_url, image_url[0]))
        product = loader.load_item()
        if 'image_url' in product and product['image_url'].strip():
            self.images[product['url']] = product['image_url']
        yield product

    def match_name(self, search_name, new_item, match_threshold=80, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class PointSSpider(Spider):
    """Scrapes fitted tyre prices from point-s.co.uk.

    One (deduplicated) search URL per mtsstockcodes.csv row, optionally a
    second one for the alternative speed rating; result pages are paginated
    via a data-url "next" link.
    """
    name = 'micheldever-point-s.co.uk'
    allowed_domains = ['point-s.co.uk']
    start_urls = ('http://www.point-s.co.uk/', )
    # Class-level mutable attributes, populated per-instance in __init__.
    tyre_sizes = []
    brands = []
    all_man_marks = {}

    def __init__(self, *args, **kwargs):
        """Load tyre size rows and manufacturer-mark codes from CSV fixtures."""
        super(PointSSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.brands = [row['Brand'] for row in self.tyre_sizes]
        # Row keys already searched (deduplication across CSV rows).
        self.processed_rows = {}

    def start_requests(self):
        """Yield search requests per unique size row (plus Alt Speed variant)."""
        for row in self.tyre_sizes:
            if self.check_row_is_processed(row):
                continue
            self.add_row_to_history(row)
            meta = {'row': row}
            xl = ''
            if row['XL'] == 'XL':
                xl = 'Y'
                meta['xl'] = True
            run_flat = ''
            if row['Run Flat'] == 'RF':
                run_flat = 'Y'
                meta['run_flat'] = True
            url = 'http://www.point-s.co.uk/tyres?s=&width=' + row[
                'Width'] + '&profile=' + row['Aspect Ratio'] + '&size=' + row[
                    'Rim'] + '&speed=' + row[
                        'Speed rating'] + '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl
            yield Request(url, dont_filter=True, meta=meta)
            if row['Alt Speed']:
                url = 'http://www.point-s.co.uk/tyres?s=&width=' + row[
                    'Width'] + '&profile=' + row[
                        'Aspect Ratio'] + '&size=' + row['Rim'] + '&speed=' + row[
                            'Alt Speed'] + '&paginate=true&runflat=' + run_flat + '&extra_load=' + xl
                yield Request(url, dont_filter=True, meta=meta)

    def get_row_key(self, row):
        """Tuple of the row fields that identify a unique search."""
        fields_to_save = [
            'Width', 'Rim', 'Aspect Ratio', 'Speed rating', 'Alt Speed', 'XL',
            'Run Flat'
        ]
        return tuple([row[x] for x in fields_to_save])

    def check_row_is_processed(self, row):
        """Return True if an identical row was already searched."""
        key = self.get_row_key(row)
        if self.processed_rows.get(key):
            return True
        return False

    def add_row_to_history(self, row):
        """Mark a row as searched."""
        key = self.get_row_key(row)
        self.processed_rows[key] = True

    def parse(self, response):
        """Parse a results page (recommended + regular products) and paginate."""
        row = response.meta['row']
        products = response.xpath(
            '//div[contains(@class, "product-recommended")]')
        products += response.xpath(
            '//div[@class="product-section"]/div[contains(@class, "product")]')
        for product_el in products:
            loader = ProductLoader(item=Product(), selector=product_el)
            brand = product_el.xpath(
                './/input[@name="brand"]/@value').extract()
            brand = brand[0] if brand else ''
            # Normalise brand capitalisation against the known brand list.
            for tyre_brand in self.brands:
                if tyre_brand.upper() == brand.strip().upper():
                    brand = tyre_brand
            full_name = ''.join(product_el.xpath('.//h2/text()').extract())
            if not full_name:
                continue
            # Heading is "<tyre code> <brand> <pattern name>"; split on brand.
            full_name_splt = re.split(brand, full_name, flags=re.I)
            tyre_code = full_name_splt[0]
            name = ' '.join(full_name_splt[1:]).strip()
            tyre_code = tyre_code.strip()
            name = name.strip()
            loader.add_value('name', name)
            # loader.add_value('name', full_name.split(brand)[-1])
            loader.add_value('brand', unify_brand(brand))
            loader.add_value(
                'category',
                find_brand_segment(loader.get_output_value('brand')))
            identifier = product_el.xpath(
                './/input[@name="prodCode"]/@value').extract()
            if identifier:
                identifier = identifier[0]
            else:
                # NOTE(review): `return` (not continue) — a single product
                # without identifier abandons the whole page. Confirm intended.
                self.log('Product without identifier')
                search_params = '/'.join([
                    row['Aspect Ratio'], row['Rim'], row['Width'],
                    row['Alt Speed']
                ])
                self.log('Search parameters: ' + search_params)
                return
            loader.add_value('url', response.url)
            image_url = product_el.xpath(
                './/div[contains(@class, "product-im")]/img/@src').extract()
            if image_url:
                loader.add_value('image_url', response.urljoin(image_url[0]))
            loader.add_value('identifier', identifier)
            price = ''.join(
                product_el.xpath('.//*[@class="price"]//text()').re(
                    r'[\d\.,]+'))
            if not price:
                continue
            loader.add_value('price', price)
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = row['Aspect Ratio']
            metadata['rim'] = row['Rim']
            # e.g. " 91V " in the heading -> load "91", speed "V".
            speed = re.search('(\s\d+\w+\s)', full_name)
            speed_rating = speed.group().strip()[-1] if speed else ''
            load_rating = speed.group().strip()[:-1] if speed else ''
            metadata['speed_rating'] = speed_rating
            metadata['load_rating'] = load_rating
            metadata['width'] = row['Width']
            metadata['fitting_method'] = 'Fitted'
            metadata['alternative_speed_rating'] = ''
            metadata['xl'] = 'Yes' if 'XL' in full_name.upper() else 'No'
            run_flat_found = is_run_flat(full_name)
            metadata['run_flat'] = 'Yes' if 'RUNFLAT' in full_name.upper(
            ) or run_flat_found else 'No'
            metadata['manufacturer_mark'] = self._get_manufacturer_code(
                full_name)
            metadata['full_tyre_size'] = '/'.join(
                (row['Width'], row['Aspect Ratio'], row['Rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            try:
                # EU tyre-label values; shape mismatch leaves them blank.
                fuel, grip, noise = map(
                    unicode.strip,
                    product_el.xpath(
                        './/div[contains(@class, "feature-image") or contains(@class, "feature-block")]'
                        '//span[@class="icon-text"]/text()').extract())
            except:
                fuel = ''
                grip = ''
                noise = ''
            metadata['fuel'] = fuel
            metadata['grip'] = grip
            metadata['noise'] = noise
            product = loader.load_item()
            product['metadata'] = metadata
            if not is_product_correct(product):
                continue
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)
            yield product
        next_page = response.xpath(
            u'//ul[@class="pagination"]//a[contains(text(), ">")]/@data-url'
        ).extract()
        if next_page:
            yield Request(next_page[0], dont_filter=True, meta=response.meta)

    def _get_manufacturer_code(self, name):
        """Return the manufacturer mark whose code appears as a word in name."""
        name = name.upper().strip()
        for code, manufacturer_mark in self.all_man_marks.items():
            if code not in name:
                continue
            if code in map(unicode.strip, name.split(' ')) or code == '*':
                return manufacturer_mark
        return ''

    def match_name(self, search_name, new_item, match_threshold=90, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class KwikFitSpider(BaseSpider):
    """Scrapes fitted tyre prices from kwik-fit.com.

    Searches by URL path segments (width/aspect/rim/speed) per CSV row, with
    a second request for the alternative speed rating where present. Product
    attributes come mostly from data-* attributes on the result tiles.
    """
    name = 'kwik-fit.com'
    allowed_domains = ['kwik-fit.com']
    start_urls = ('http://www.kwik-fit.com', )
    # Class-level mutable attribute, populated per-instance in __init__.
    tyre_sizes = []
    download_delay = 0.1

    def __init__(self, *args, **kwargs):
        """Load tyre size rows from the shared CSV fixture."""
        super(KwikFitSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)

    def start_requests(self):
        """Yield one search per row, plus one for the Alt Speed when present."""
        for row in self.tyre_sizes:
            search = row['Width'] + '/' + row['Aspect Ratio'] + row[
                'Speed rating'] + row['Rim']
            yield Request(
                'https://www.kwik-fit.com/tyres/search/results/%(Width)s/%(Aspect Ratio)s/%(Rim)s/%(Speed rating)s' % row,
                dont_filter=True,
                meta={
                    'row': row,
                    'search': search
                },
                callback=self.parse)
            if row['Alt Speed']:
                search = row['Width'] + '/' + row['Aspect Ratio'] + row[
                    'Alt Speed'] + row['Rim']
                yield Request(
                    'https://www.kwik-fit.com/tyres/search/results/%(Width)s/%(Aspect Ratio)s/%(Rim)s/%(Alt Speed)s' % row,
                    dont_filter=True,
                    meta={
                        'row': row,
                        'search': search
                    },
                    callback=self.parse)

    def parse(self, response):
        """Extract products from the grid-view result tiles (winter excluded)."""
        products = response.xpath(
            '//div[contains(@class, "tyres_search_results_tyre") and @data-viewtype="grid"]'
        )
        for product in products:
            winter_tyre = product.xpath(
                '@data-filter-season').extract()[0] == 'Winter'
            if not winter_tyre:
                name = product.xpath(
                    './/div[contains(@class, "tyre-model text-center")]/text()'
                ).extract()[0]
                brand = product.xpath('@data-filter-brand').extract()[0]
                loader = ProductLoader(item=Product(), selector=product)
                loader.add_value('name', brand + ' ' + name)
                loader.add_value('brand', unify_brand(brand))
                loader.add_value(
                    'category',
                    find_brand_segment(loader.get_output_value('brand')))
                identifier = product.xpath('@data-tyreid').extract()[0]
                loader.add_value('identifier', identifier)
                loader.add_value('url', response.url)
                image_url = product.xpath(
                    './/div[contains(@class, "tyre-image")]//img/@src'
                ).extract()
                if image_url:
                    loader.add_value(
                        'image_url', urljoin(get_base_url(response),
                                             image_url[0]))
                price = product.xpath(
                    './/div[contains(@class, "tyre-pricing-information")]/div/text()'
                ).re(r'[\d,.]+')
                price = price[0] if price else '0.00'
                loader.add_value('price', price)
                tyresize_text = product.xpath(
                    './/div[contains(@class, "tyre-size")]/text()').extract(
                    )[0].strip()
                # Prefer the form with a load index, e.g. "205/55V16 (91)";
                # fall back to the size-only form without a load rating.
                try:
                    width, aspect, speed_rating, rim, load_rating = re.search(
                        r'(\d+)\/(\d+)(\w{1})(\d+)\s\((\d+)\)', tyresize_text,
                        re.I).groups()
                except:
                    width, aspect, speed_rating, rim = re.search(
                        r'(\d+)\/(\d+)(\w{1})(\d+)', tyresize_text,
                        re.I).groups()
                    load_rating = ''
                fitting_method = 'Fitted'
                metadata = MicheldeverMeta()
                metadata['aspect_ratio'] = aspect
                metadata['rim'] = rim
                metadata['speed_rating'] = speed_rating
                metadata['width'] = width
                metadata['fitting_method'] = fitting_method
                metadata['load_rating'] = load_rating
                metadata['alternative_speed_rating'] = ''
                xl = product.xpath(
                    '@data-filter-reinforced').extract()[0] == 'Y'
                metadata['xl'] = 'Yes' if xl else 'No'
                run_flat_found = is_run_flat(loader.get_output_value('name'))
                run_flat = product.xpath(
                    '@data-filter-runflat').extract()[0] == 'Y'
                metadata[
                    'run_flat'] = 'Yes' if run_flat or run_flat_found else 'No'
                manufacturer_mark = product.xpath('.//span[contains(@title, "Homologated for fitment to certai")]/@title')\
                    .re(r'Homologated for fitment to certain (.*) cars\.')
                metadata['manufacturer_mark'] = find_man_mark(
                    manufacturer_mark[0]) if manufacturer_mark else ''
                metadata['full_tyre_size'] = '/'.join(
                    (metadata['width'], metadata['aspect_ratio'],
                     metadata['rim'], metadata['load_rating'],
                     metadata['speed_rating']))
                # EU label values; the xpath union preserves document order —
                # presumably r/g/d attribute order matches fuel/grip/noise.
                fuel, grip, noise = product.xpath('@data-filter-tyreefficiencyr'
                                                  '|@data-filter-tyreefficiencyg'
                                                  '|@data-filter-tyreefficiencyd')\
                    .extract()
                metadata['fuel'] = fuel
                metadata['grip'] = grip
                metadata['noise'] = noise
                product = loader.load_item()
                product['metadata'] = metadata
                if not is_product_correct(product):
                    continue
                product['metadata']['mts_stock_code'] = find_mts_stock_code(
                    product, spider_name=self.name, log=self.log)
                yield product

    def match_name(self, search_name, new_item, match_threshold=80, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class TrovaprezziSpider(BaseSpider):
    """Scrapes trovaprezzi.it price-comparison listings for a product list.

    Matching items are accumulated in self.items during the crawl; when the
    spider goes idle, closing_parse() runs once more and yields only the
    cheapest item per SKU.
    """
    name = u'trovaprezzi.it'
    allowed_domains = [u'trovaprezzi.it']
    start_urls = [
        u'http://www.trovaprezzi.it/prezzi_elettronica-elettricita.aspx'
    ]
    # Class-level mutable accumulator of matched candidate items.
    items = []

    def __init__(self, *args, **kwargs):
        super(TrovaprezziSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        # Trigger the final cheapest-per-SKU pass once the crawl drains.
        dispatcher.connect(self.spider_idle, signals.spider_idle)

    def start_requests(self):
        """Search by brand+model when a model exists, else by product name."""
        with open(os.path.join(HERE, 'product_list.csv')) as f:
            reader = csv.DictReader(cStringIO.StringIO(f.read()))
            for row in reader:
                meta = {
                    'sku': row['model'],
                    'identifier': row['ean'],
                    'brand': row['brand'],
                    'name': row['name']
                }
                if row['model']:
                    search = row['brand'] + '+' + row['model']
                    # Allows parse() to retry by name if the model search is empty.
                    meta['model_search'] = True
                    url = 'http://www.trovaprezzi.it/categoria.aspx?libera=' + search + '&id=-1&prezzomin=&prezzomax='
                else:
                    url = 'http://www.trovaprezzi.it/categoria.aspx?libera=' + row[
                        'name'].replace(' ', '+') + '&id=-1&prezzomin=&prezzomax='
                yield Request(url, meta=meta)

    def spider_idle(self, spider):
        """Schedule the final processing request if any items were collected."""
        if self.items:
            request = Request(self.start_urls[0],
                              dont_filter=True,
                              callback=self.closing_parse)
            self._crawler.engine.crawl(request, self)

    def closing_parse(self, response):
        """Yield the lowest-priced collected item for each SKU."""
        self.log("Processing items after finish")
        items_dict = {}
        items = sorted(self.items, key=lambda x: x['sku'])
        for item in items:
            if item['sku'] in items_dict:
                old_item = items_dict[item['sku']]
                if item['price'] < old_item['price']:
                    items_dict[item['sku']] = item
            else:
                items_dict[item['sku']] = item
        self.items = []
        for sku, item in items_dict.items():
            loader = ProductLoader(item=Product(), response=response)
            loader.add_value('name', item['name'])
            loader.add_value('url', item['url'])
            loader.add_value('price', item['price'])
            loader.add_value('sku', item['sku'])
            loader.add_value('category', item['category'])
            loader.add_value('brand', item['brand'])
            loader.add_value('identifier', item['identifier'])
            loader.add_value('dealer', item['dealer'])
            loader.add_value('image_url', item['image_url'])
            product = loader.load_item()
            yield product

    def parse(self, response):
        """Collect the cheapest fuzzy-matching listing on a category page.

        Also recurses into related category links; falls back to a name
        search when a brand+model search returned no product rows.
        """
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        meta = response.meta
        relevant_categories = hxs.select(
            '//div[@class="catsMI"]/div/a/@href').extract()
        for category in relevant_categories:
            yield Request(urljoin_rfc(base_url, category), meta=meta)
        products = hxs.select('//table[@id="productlist-table"]/tbody/tr')
        if not products and meta.get('model_search', False):
            # Model search found nothing: retry once with the free-text name.
            url = 'http://www.trovaprezzi.it/categoria.aspx?libera=' + meta[
                'name'].replace(' ', '+') + '&id=-1&prezzomin=&prezzomax='
            meta['model_search'] = False
            yield Request(url, meta=meta)
        else:
            category = hxs.select(
                '//div[@id="divTitle"]/h1/text()').extract()[0]
            # pr tracks the loader with the lowest price among name matches.
            pr = None
            for product in products:
                name = product.select(
                    'td[@class="descCol"]/a/b/text()').extract()[0]
                if self.match_name(meta['name'], name, match_threshold=70):
                    loader = ProductLoader(item=Product(), selector=product)
                    image_url = product.select(
                        'td[@class="imgCol"]/a/img/@src').extract()
                    if image_url:
                        image_url = urljoin_rfc(base_url, image_url[0])
                    else:
                        image_url = ''
                    loader.add_value('image_url', image_url)
                    loader.add_xpath('dealer', 'td[@class="mercCol"]/a/img/@alt')
                    loader.add_xpath('name', 'td[@class="descCol"]/a/b/text()')
                    loader.add_value('category', category)
                    loader.add_value('sku', response.meta.get('sku'))
                    url = product.select(
                        'td[@class="descCol"]/a/@href').extract()[0]
                    loader.add_value('url', urljoin_rfc(base_url, url))
                    # Italian number format: strip thousands '.' then ',' -> '.'.
                    price = product.select('td[@class="prodListPrezzo"]/text()'
                                           ).extract()[0].strip().replace(
                                               '.', '').replace(',', '.')
                    loader.add_value('price', price)
                    shipping_cost = product.select(
                        'td[@class="prodListPrezzo"]/' +
                        'span[@class="deliveryCost nobr"]/' +
                        'text()').extract()[0].strip().replace('.', '').replace(
                            ',', '.')
                    loader.add_value('shipping_cost', shipping_cost)
                    loader.add_value('identifier',
                                     response.meta.get('identifier'))
                    if loader.get_output_value('price') and (
                            pr is None or pr.get_output_value('price') >
                            loader.get_output_value('price')):
                        pr = loader
            if pr:
                item = pr.load_item()
                if not item in self.items:
                    self.items.append(item)

    def match_name(self, search_name, new_item, match_threshold=90, important_words=None):
        """Fuzzy-match two product names; True when ratio >= threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class LegoUsaEbaySpider(BaseeBaySpider):
    """eBay spider for LEGO USA products, driven by lego.csv.

    Item validity is decided by _valid_item_: name/category filters, fuzzy
    name match against the search row, SKU extraction from the listing title
    and a price-ceiling check for multi-SKU (lot) listings.
    """
    HERE = os.path.abspath(os.path.dirname(__file__))
    name = 'legousa-ebay.com'
    # Map deviation screenshot feature
    map_deviation_detection = True
    map_deviation_csv = os.path.join(HERE, 'ebay_map_deviation.csv')
    map_screenshot_method = 'scrapy_response'
    map_screenshot_html_files = {}  # identifier -> saved HTML path

    def __init__(self, *args, **kwargs):
        """Configure the base eBay spider for the LEGO search run.

        NOTE(review): *args/**kwargs are accepted but not forwarded to the
        base __init__ — presumably intentional; confirm against BaseeBaySpider.
        """
        super(LegoUsaEbaySpider, self).__init__()
        self._csv_file = os.path.join(self.HERE, 'lego.csv')
        self._converted_price = True
        self._ebay_url = 'http://www.ebay.com'
        # CSV column indices used to build the search string.
        self._search_fields = [3, 2]
        self._all_vendors = True
        self._look_related = False
        # (meta key, CSV column index) pairs copied into request meta.
        self._meta_fields = [('sku', 2), ('name', 3), ('price', 4),
                             ('category', 1)]
        self._match_fields = ('sku', 'identifier')
        self._check_valid_item = self._valid_item_
        # SKUs are runs of 3+ digits found in listing titles.
        self._re_sku = re.compile(r'(\d{3,})')
        self._check_diff_ratio = True
        # self._ratio_accuracy = 60
        self.matcher = Matcher(self.log)

    def match_text(self, text, item_field, match_threshold=90, important_words=None):
        """Fuzzy-match text against a field, logging the computed ratio."""
        r = self.matcher.match_ratio(text, item_field, important_words)
        self.log('Searching for %s in %s: %s' % (text, item_field, r))
        return r >= match_threshold

    def start_requests(self):
        """Yield two eBay searches per CSV row: name+model, then LEGO+sku."""
        with open(self._csv_file) as f:
            reader = csv.reader(cStringIO.StringIO(f.read()))
            number = 0
            for row in reader:
                number += 1
                meta = dict(
                    dict((m_k, row[m_f]) for m_k, m_f in self._meta_fields))
                search = ' '.join(row[field].strip()
                                  for field in self._search_fields)
                # Always prefix searches with the brand keyword.
                if not 'lego' in search.lower():
                    search = 'LEGO ' + search
                meta.update({'search': search})
                # Get URL
                search = self._clean_search(search)  # Clean search
                url = self._get_url_search(search)
                self.log('Item %s | SKU: %s | Search by: %s' %
                         (number, meta.get('sku', None), search))
                yield self._search(url, meta)
                # Second, narrower search by SKU column only.
                search = 'LEGO ' + row[2]
                meta.update({'search': search})
                # Get URL
                search = self._clean_search(search)  # Clean search
                url = self._get_url_search(search)
                self.log('Item %s | SKU: %s | Search by: %s' %
                         (number, meta.get('sku', None), search))
                yield self._search(url, meta)

    def load_item(self, *args, **kwargs):
        """Force brand to LEGO and save the response HTML for screenshots."""
        product_loader = super(LegoUsaEbaySpider,
                               self).load_item(*args, **kwargs)
        product_loader.replace_value('brand', 'LEGO')
        identifier = product_loader.get_output_value('identifier')
        # Base class passes the response as the last positional argument.
        response = args[-1]
        html_path = os.path.join('/tmp', 'ebay_%s.html' % identifier)
        with open(html_path, 'w') as f_html:
            f_html.write(response.body)
        self.map_screenshot_html_files[identifier] = html_path
        return product_loader

    def _valid_item_(self, item_loader, response):
        """Decide whether an eBay listing matches the searched LEGO product."""
        item_name = item_loader.get_output_value('name').lower()
        if not self._check_exclude_terms(item_name):
            return False
        name = item_loader.get_output_value('name')
        search_sku = item_loader.get_output_value('sku')
        # Collect digit runs both with and without spaces removed, so SKUs
        # split across words are still found.
        sku = self._re_sku.findall(name.replace(' ', ''))
        sku.extend(self._re_sku.findall(name))
        category = item_loader.get_output_value('category')
        if not self._check_name_valid(name):
            return False
        if not self._check_category_valid(category):
            return False
        sku = set(sku)
        search_name = response.meta['item_meta']['name'].decode('utf-8')
        if not self.match_text(search_name, name, match_threshold=70):
            return False
        if sku:
            search_price = response.meta['item_meta'].get('price')
            price = item_loader.get_output_value('price')
            # Multiple SKUs in a title suggests a lot listing; only accept it
            # when the price stays within the allowed ceiling.
            if not len(sku) > 1 or self._check_max_price(search_price, price):
                match_sku = search_sku in sku
                self.log('SKU %s in %s ? %s' % (search_sku, sku, match_sku))
                return match_sku
            else:
                self.log('Reject lot of products => %s' %
                         item_loader.get_output_value('url'))
                return False
        return True

    def _check_name_valid(self, name):
        """Reject listings for loose sets or figures rather than boxed sets.

        >>> spider = LegoUsaEbaySpider()
        >>> spider._check_name_valid("Lego 123")
        True
        >>> spider._check_name_valid("Lego 123 figure")
        False
        """
        if (self.match_text('mini figures from', name)
                or self.match_text('mini figures only', name)
                or self.match_text('mini figures', name)
                or self.match_text('mini figure', name)
                or self.match_text('minifigures', name)
                or self.match_text('minifigure', name)
                or self.match_text('figure', name)
                or self.match_text('loose', name)
                or self.match_text('no box', name)
                or self.match_text('nobox', name)):
            return False
        return True

    def _check_category_valid(self, category):
        """Reject figure categories.

        >>> spider = LegoUsaEbaySpider()
        >>> spider._check_category_valid('asd')
        True
        >>> spider._check_category_valid("figures")
        False
        >>> spider._check_category_valid("figure")
        False
        """
        if category and (self.match_text('figure', category)):
            return False
        return True

    def _check_valid_price(self, search_price, price):
        ''' Checks price variation '''
        # Accept prices down to 50% below the searched price.
        price_diff = 0.5
        search_price = Decimal(search_price)
        diff = Decimal(search_price) * Decimal(price_diff)
        return search_price - diff <= Decimal(price)

    def _check_max_price(self, search_price, price):
        ''' Checks price variation '''
        # Accept prices up to 50% above the searched price.
        price_diff = 0.5
        search_price = Decimal(search_price)
        diff = Decimal(search_price) * Decimal(price_diff)
        return Decimal(price) <= search_price + diff

    def _check_exclude_terms(self, item_name):
        ''' [([<list terms to exclude>], [<list exceptions>]),
             ([...], [...]),
             ([...], [...])] '''
        # Reject names containing an excluded term unless an exception term
        # is also present (item_name is expected lowercased by the caller).
        exclude_ = [(['NO MINIFIG'], []),
                    (['MINIFIG', 'MINIFG'], ['MINIFIGURES'])]
        for values, exceptions in exclude_:
            for w in values:
                if w.lower() in item_name:
                    itsvalid = False
                    for e in exceptions:
                        if e.lower() in item_name:
                            itsvalid = True
                            break
                    if not itsvalid:
                        return False
        return True
class EcraterSpider(BaseSpider):
    """ecrater.com spider for LEGO products: re-crawls previously found
    product URLs, then crawls the category/search listings."""

    name = 'legousa-ecrater.com'
    allowed_domains = ['ecrater.com']
    start_urls = (
        'http://www.ecrater.com/filter.php?cid=542133&keywords=lego&slocation=d&new=1',
        'http://www.ecrater.com/filter.php?cid=542133&slocation=d&new=1')
    # LEGO set numbers: 4 or 5 digits (raw string added; was a plain string).
    _re_sku = re.compile(r'(\d\d\d\d\d?)')

    # Map deviation screenshot feature
    map_deviation_detection = True
    map_deviation_csv = os.path.join(HERE, 'ecrater_map_deviation.csv')

    def __init__(self, *args, **kwargs):
        super(EcraterSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        # sku -> official product name, used to confirm fuzzy matches.
        with open(os.path.join(HERE, 'lego.csv')) as f:
            reader = csv.reader(f)
            self.products = {
                prod[2]: prod[3].decode('utf8')
                for prod in reader
            }
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        # Back up the previous crawl's product list before it is replaced.
        if os.path.exists(os.path.join(HERE, 'ecrater_products.csv')):
            shutil.copy(os.path.join(HERE, 'ecrater_products.csv'),
                        os.path.join(HERE, 'ecrater_products.csv.bak'))
        # Errors
        self.errors = []

    def spider_closed(self, spider):
        """Persist this crawl's products so ``parse_default`` can re-crawl
        them next run.

        NOTE(review): the original copied to 'toysrus_products.csv', which
        looks like a copy/paste slip from another spider — ``__init__``
        backs up and ``parse_default`` reads 'ecrater_products.csv'.
        """
        shutil.copy('data/%s_products.csv' % spider.crawl_id,
                    os.path.join(HERE, 'ecrater_products.csv'))

    def start_requests(self):
        # Parse default items and then start_urls
        yield Request('http://www.ecrater.com', self.parse_default)

    def parse_default(self, response):
        """Re-crawl product URLs discovered in the previous run, then the
        regular start URLs."""
        with open(os.path.join(HERE, 'ecrater_products.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                yield Request(row['url'], self.parse_product)
        # Scrape start urls
        for url in self.start_urls:
            yield Request(url)

    def parse(self, response):
        """Follow pagination and yield product-page requests."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        # was: `next` -- renamed to avoid shadowing the builtin
        next_page = hxs.select(
            '//ul[@class="pagination-controls nav"]/li/a[@title="Next Page"]/@href'
        ).extract()
        if next_page:
            yield Request(urljoin_rfc(base_url, next_page[-1]),
                          callback=self.parse)
        products = hxs.select(
            '//div[@class="product-details"]/h2/a/@href').extract()
        for product in products:
            # Non-keyword listings are filtered by 'lego' in the URL slug.
            if 'keywords=lego' in response.url or 'lego' in product:
                yield Request(urljoin_rfc(base_url, product),
                              callback=self.parse_product)
        if not products:
            self.errors.append('WARNING: No products in %s' % response.url)

    def parse_product(self, response):
        """Extract one Product from an ecrater product page."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        seller = hxs.select(
            '//a[@class="seller-username"]/text()').extract()[0]
        cart_url = hxs.select(
            '//div[@id="product-title-actions"]/a/@href').extract()[0]
        # Prefer the cart 'pid' parameter; fall back to the URL path.
        identifier = url_query_parameter(urljoin_rfc(base_url, cart_url),
                                         'pid', None)
        if not identifier:
            identifier_regex = re.search(r'p/(\d+)/', response.url)
            if not identifier_regex:
                self.errors.append('WARNING: No identifier in %s'
                                   % response.url)
                return
            else:
                identifier = identifier_regex.groups()[0]
        name = hxs.select('//div[@id="product-title"]/h1/text()').extract()[0]
        sku = self._re_sku.findall(name)
        sku = sku[0] if sku else ''
        loader = ProductLoader(item=Product(), response=response)
        # Same product can be sold by several sellers; key on both.
        loader.add_value('identifier', identifier + '-' + seller)
        loader.add_value('name', name)
        loader.add_value('brand', 'LEGO')
        loader.add_xpath(
            'category',
            '//ul[@class="breadcrumb"]/li/a[@class="active"]/text()')
        loader.add_value('url', response.url)
        price = hxs.select(
            '//div[@id="product-title-actions"]/span/text()').extract()[0]
        loader.add_value('price', price)
        image_url = hxs.select(
            '//img[@id="product-image-display"]/@src').extract()
        if image_url:
            loader.add_value('image_url', image_url[0])
        stock = hxs.select('//p[@id="product-quantity"]/text()').extract()
        if stock:
            stock = re.findall(r"\d+", stock[0])
            stock = stock[0] if stock else 0
            loader.add_value('stock', stock)
        shipping = hxs.select(
            '//p[a[@href="#shipping-rates"]]/text()').extract()
        if shipping:
            # was: "\d+.\d+" -- the unescaped dot matched any character;
            # escape it to match a decimal amount only
            shipping = re.findall(r"\d+\.\d+", shipping[0])
            shipping = shipping[0] if shipping else 0
            loader.add_value('shipping_cost', shipping)
        loader.add_value('dealer', seller)
        # was: `sku in self.products.keys()` -- O(n) list scan in Python 2
        if sku in self.products:
            if self.match_name(self.products[sku], name):
                loader.add_value('sku', sku)
            else:
                log.msg('###########################')
                log.msg(response.url)
                log.msg('###########################')
        else:
            loader.add_value('sku', sku)
        yield loader.load_item()

    def match_name(self, search_name, new_item, match_threshold=90,
                   important_words=None):
        """Return True when the fuzzy match ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
class LoveTyresSpider(BaseSpider):
    """lovetyres.com spider: searches every tyre size from mtsstockcodes.csv
    and extracts tyre products with Micheldever metadata."""

    name = 'lovetyres.com_test'
    allowed_domains = ['lovetyres.com']
    start_urls = ('http://www.lovetyres.com',)
    tyre_sizes = []
    all_man_marks = {}

    def __init__(self, *args, **kwargs):
        super(LoveTyresSpider, self).__init__(*args, **kwargs)
        self.matcher = Matcher(self.log)
        # Tyre sizes to search for.
        with open(os.path.join(HERE, 'mtsstockcodes.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.tyre_sizes.append(row)
        # code -> manufacturer mark (e.g. '*' for BMW).
        with open(os.path.join(HERE, 'manmarks.csv')) as f:
            reader = csv.DictReader(f)
            for row in reader:
                self.all_man_marks[row['code']] = row['manufacturer_mark']
        self.errors = []

    def start_requests(self):
        """Yield one search request per tyre size."""
        for row in self.tyre_sizes:
            search = str(row['Width']) + '/' + str(row['Aspect Ratio']) + \
                str(row['Speed rating']) + str(row['Rim'])
            yield Request(
                'http://www.lovetyres.com/search/tyres/{Width}-{Aspect Ratio}-{Rim}'.format(**row),
                meta={'search': search}, callback=self.parse)

    def parse(self, response):
        """Yield a product-page request for every non-winter tyre row."""
        hxs = HtmlXPathSelector(response)
        base_url = get_base_url(response)
        products = hxs.select('//tr[contains(@class,"tyre-search-row")]')
        # NOTE(review): a stubbed pagination branch (`next_page = []` then
        # `if next_page:`) was dead code and has been removed.
        for product in products:
            url = product.select('.//td/b/a/@href')[0].extract()
            winter_tyre = product.select('.//td/b/a/text()')[0].extract()
            winter_tyre = 'winter' in winter_tyre.lower()
            if not winter_tyre:
                # Brand is only available from the logo image filename.
                brand = product.select('.//a/img/@src')[0].extract()
                brand = re.search(r'/public/brands/(.*?)(-tyres)?\.',
                                  brand).group(1).replace('-', ' ').title()
                meta = response.meta
                meta['brand'] = brand
                price = product.select("td[3]/b/text()").extract()
                if price:
                    meta['price'] = price[0]
                yield Request(urljoin(base_url, url),
                              callback=self.parse_product, meta=meta)

    def parse_product(self, response):
        """Extract one Product per purchase option on a tyre page."""
        hxs = HtmlXPathSelector(response)
        base_loader = ProductLoader(item=Product(), selector=hxs)
        # the full name of the tyre (name variable) is used to extract
        # metadata (i.e. run flat, xl), the pattern should be set as the
        # product's name
        brand = response.meta.get('brand') or ''
        product_name = hxs.select(
            '//h2[@class="heading black"]/text()')[0].extract().strip()
        # was: re.sub(brand, ...) -- brand was used as a regex pattern, so a
        # brand containing a metacharacter would misbehave; escape it
        product_name = re.sub(re.escape(brand), '', product_name).strip()
        fitting_method = 'Delivered'
        base_loader.add_value('url', response.url)
        image_url = hxs.select('//div[@class="item"]/a/img/@src').extract()
        options = hxs.select('//div[@style="background: #fff; padding: 6px; "]')
        for option in options:
            loader = ProductLoader(item=Product(), selector=hxs)
            loader.add_value('name', product_name)
            loader.add_value('brand', unify_brand(brand))
            loader.add_value('category',
                             find_brand_segment(loader.get_output_value('brand')))
            loader.add_value('url', response.url)
            if image_url:
                loader.add_value('image_url',
                                 urljoin(get_base_url(response), image_url[0]))
            identifier = option.select(
                '../input[@type="hidden" and @name="item_id"]/@value').extract()
            if not identifier:
                # Out-of-stock options only expose an "email me" link.
                identifier = option.select('./a/@href').re('email_me_stock/(.*)')
            if not identifier:
                continue
            loader.add_value('identifier', identifier[0])
            price = option.select(
                './strong[@class="price" and not(contains(text(),"On Backorder"))]/text()'
            ).extract()
            if price:
                loader.add_value('price', price[0])
            else:
                # No buyable price => backorder: fall back to the listing
                # price (or 0.00) and mark as out of stock.
                if response.meta.get('price'):
                    loader.add_value('price', response.meta['price'])
                else:
                    loader.add_value('price', '0.00')
                loader.add_value('stock', 0)
            pattern_name = option.select('./p/strong/text()').extract()
            if not pattern_name:
                pattern_name = option.select('./strong/text()').extract()
            pattern_name = pattern_name[0]
            data = re.search(
                r'(?P<Width>\d+)/(?P<Aspect_Ratio>\d+) R(?P<Rim>\d+) '
                r'(?P<Speed_Rating>[A-Za-z]{1,2}) \((?P<Load_Rating>\d+).*?\)',
                pattern_name)
            if data:
                data = data.groupdict()
            else:
                msg = 'ERROR parsing "{}" [{}]'.format(pattern_name,
                                                       response.url)
                log.msg(msg)
                self.errors.append(msg)
                continue
            metadata = MicheldeverMeta()
            metadata['aspect_ratio'] = data['Aspect_Ratio']
            metadata['rim'] = data['Rim']
            metadata['speed_rating'] = data['Speed_Rating'].upper()
            metadata['width'] = data['Width']
            metadata['fitting_method'] = fitting_method
            metadata['load_rating'] = data['Load_Rating'] or ''
            metadata['alternative_speed_rating'] = ''
            xl = 'XL' in pattern_name
            metadata['xl'] = 'Yes' if xl else 'No'
            run_flat = ('run flat' in pattern_name.lower()
                        or 'runflat' in pattern_name.lower())
            metadata['run_flat'] = 'Yes' if run_flat else 'No'
            manufacturer_mark = [mark for mark in self.all_man_marks.keys()
                                 if mark in pattern_name.split(' ')]
            # was: `else []` -- '' is the natural "no mark" sentinel (both
            # are falsy, behavior unchanged)
            manufacturer_mark = (manufacturer_mark[0].strip()
                                 if manufacturer_mark else '')
            metadata['manufacturer_mark'] = (find_man_mark(manufacturer_mark)
                                             if manufacturer_mark else '')
            metadata['full_tyre_size'] = '/'.join(
                (metadata['width'], metadata['aspect_ratio'], metadata['rim'],
                 metadata['load_rating'], metadata['speed_rating']))
            # metadata['alternative_speed_rating']))
            product = loader.load_item()
            product['metadata'] = metadata
            if not is_product_correct(product):
                continue
            product['metadata']['mts_stock_code'] = find_mts_stock_code(
                product, spider_name=self.name, log=self.log)
            new_speed_rating = get_speed_rating(product)
            new_alt_speed = get_alt_speed(product)
            product['metadata']['alternative_speed_rating'] = \
                new_alt_speed if new_alt_speed else \
                product['metadata']['speed_rating'] \
                if product['metadata']['speed_rating'] != new_speed_rating else ''
            product['metadata']['speed_rating'] = new_speed_rating
            yield product

    def match_name(self, search_name, new_item, match_threshold=80,
                   important_words=None):
        """Return True when the fuzzy match ratio reaches the threshold."""
        r = self.matcher.match_ratio(search_name, new_item, important_words)
        return r >= match_threshold
def __init__(self, *args, **kwargs):
    """Set up the spider: hook the idle signal and create the name matcher."""
    super(TrovaprezziSpider, self).__init__(*args, **kwargs)
    # Order of the next two statements is immaterial: spider_idle cannot
    # fire while __init__ is still running.
    dispatcher.connect(self.spider_idle, signals.spider_idle)
    self.matcher = Matcher(self.log)
""" >>> find_man_mark('bmw') '*' >>> find_man_mark('Mercedes') 'MO' >>> find_man_mark('por') 'N0' """ man_marks = load_manufacturers_marks() for code, manufacturer_mark in man_marks.items(): if mark.lower() in code.lower(): return manufacturer_mark return '' matcher = Matcher(logging.error) def match_name(search_name, new_item, match_threshold=90, important_words=None): r = matcher.match_ratio(search_name, new_item, important_words) return r >= match_threshold def match_pattern(pattern, name, match_threshold=70): """ >>> match_pattern('B 250 ECOPIA', 'B250ECO') True >>> match_pattern('DSPORT', 'Dueler Sport')