Пример #1
0
 def parse_content_sitemap(self, parse_url):
     """Download the sitemap located at ``parse_url``.

     Only the download step and its error path are visible in this excerpt.
     When the request fails with ``urllib2.HTTPError`` the error is printed
     and ``(None, [descriptor])`` is returned, where ``descriptor`` is a dict
     holding the same ``parse_url`` together with ``'force': True``.
     """
     log('start sitemap', parse_url)
     try:
         data_string = self.opener.open(parse_url).read()
     except urllib2.HTTPError, e:
         print 'ERROR while sitmap loading', e, parse_url
         # No data; hand the URL back in the "urls" slot flagged with force=True.
         # NOTE(review): presumably the caller re-queues it for a retry -- confirm.
         return None, [{'force': True, 'parse_url': parse_url}]
Пример #2
0
    def save_data(self, data):
        """Persist one parsed article and refresh the stored crawl status.

        When ``self.config.update_current`` is set, the existing ``Article``
        row for this URL is updated in place (under ``self.update_lock``);
        otherwise a new ``Article`` is created from ``data``.

        :param data: dict of ``Article`` field values; must contain
            ``'shop_url'``.
        """
        shop_url = data['shop_url']

        if self.config.update_current:
            log('update', shop_url)
            with self.update_lock:
                # Look the row up by its URL column. The previous lookup
                # compared the ``shop`` column against the article URL, which
                # cannot match a shop name, so nothing was ever updated.
                Article.objects.filter(shop_url=shop_url).update(**data)
        else:
            log('save', shop_url)
            Article.objects.bulk_create([Article(**data)])
        self.articles_found += 1

        # Keep the persisted status record in sync with the new counter.
        self.status.data = self.get_status()
        self.status.save()
Пример #3
0
    def parse_content_microdata(self, parse_url):
        """Fetch ``parse_url`` and return its microdata items as JSON values.

        :param parse_url: URL of the page to download.
        :return: ``(data, urls)`` where ``data`` is a list with the ``json()``
            form of every microdata item found and ``urls`` is always an
            empty list (this parser does not follow links).
        """
        # https://developers.google.com/structured-data/testing-tool/
        # not working with http://www.bonprix.de/produkt/maxi-jerseykleid-dunkelblau-bedruckt-958483/
        # which is good microformat according to google

        log('parse', parse_url)

        response = urllib.urlopen(parse_url)
        try:
            data = [item.json() for item in microdata.get_items(response)]
        finally:
            # Release the connection even if parsing fails; the response
            # object is not closed automatically.
            response.close()

        return data, []
Пример #4
0
    def handle_noargs(self, **options):
        """Remove articles whose image is shared with another article.

        Articles are grouped by ``image``; for every non-null image used by
        more than one article, all rows but the first one returned by the
        query are deleted. Progress is logged after each processed image.
        """
        from crawl.helpers import log
        from articles.models import Article

        # One row per image value that occurs on more than one article.
        shared_images = (
            Article.objects.values('image')
            .annotate(Count('id'))
            .order_by()
            .filter(id__count__gt=1, image__isnull=False)
        )

        processed = 0
        for processed, group in enumerate(shared_images, 1):
            same_image = Article.objects.filter(image=group['image'])
            # Skip the first id so that one article per image survives.
            surplus_ids = same_image.values_list('id', flat=True)[1:]
            Article.objects.filter(id__in=surplus_ids).delete()

            log(processed, 'duplicates found ...')

        log('end', processed, 'duplicates found')
0
    def run(self):
        """Run one complete crawl and record its progress in ``self.status``.

        Creates and saves an ``IndexStatus`` record, queues the configured
        start URL, waits for completion, then stores the final status data
        and marks the record as finished.
        """
        self.start_time = time.time()

        shop = self.get_name()
        initial_data = self.get_status()
        original_config = self.config.get_original()
        self.status = IndexStatus(shop=shop, data=initial_data, config=original_config)
        self.status.save()

        log('start', self.get_name())

        # Seed the crawl with the start URL and wait until it has finished.
        self.add_urls([self.config.start_url])
        self.wait_completion()

        self.status.data = self.get_status()
        finished_name = self.get_name()
        summary = ', '.join(
            '%s: %s' % (key, value) for key, value in self.status.data.items()
        )
        log('end crawling', finished_name, ' - ', summary)

        self.status.finished = True
        self.status.save()
Пример #6
0
        def f(*args, **kwargs):
            """Call ``function`` with the given arguments and log its duration.

            ``function`` and ``name`` are free variables taken from the
            enclosing scope, which is not part of this excerpt.
            """
            started = time.time()
            result = function(*args, **kwargs)
            elapsed_ms = (time.time() - started) * 1000
            # round(..., -1) snaps the value to the nearest 10 ms.
            duration = int(round(elapsed_ms, -1))

            # Reporting the timing to Google Analytics used to happen here;
            # that code path is disabled.

            log("%s took \x1b[32m%sms\x1b[0;39m" % (name, duration))
            return result
Пример #7
0
    def run(self):
        """Worker loop: take URLs off the queue and process them forever.

        Each URL is handed to ``self.parser.parse``; non-empty data goes to
        ``self.save_data_func`` and non-empty link lists to
        ``self.add_urls_func``. Any ``Exception`` is logged with its
        traceback and the loop carries on with the next URL.
        """
        while True:
            job_url = self.queue.get()

            try:
                article, links = self.parser.parse(job_url)

                if article:
                    self.save_data_func(article)

                if links:
                    self.add_urls_func(links)

            except Exception as e:
                log('ERROR', job_url, e, '\n', traceback.format_exc())

            # Runs after success and after a logged failure alike.
            gc.collect()
            self.queue.task_done()
Пример #8
0
    def parse_content_sitemap(self, parse_url):
        """Download a sitemap and return the valid URLs it lists.

        URLs ending in ``.gz`` are gunzipped first. Every ``LOC_RE`` match is
        passed through ``self.sanitize_url`` and kept when
        ``self.is_valid_href`` accepts it.

        :param parse_url: URL of the (optionally gzipped) sitemap.
        :return: ``(None, urls)`` -- a sitemap never yields article data.
        """
        log('start sitemap', parse_url)
        payload = self.opener.open(parse_url).read()

        if parse_url.endswith('.gz'):
            # GzipFile needs a file-like object, so wrap the downloaded bytes.
            compressed = StringIO.StringIO(payload)
            payload = gzip.GzipFile(fileobj=compressed, mode='rb').read()

        sanitized = (self.sanitize_url(loc) for loc in LOC_RE.findall(payload))
        urls = [href for href in sanitized if self.is_valid_href(href)]

        log('end sitemap', parse_url, len(urls))

        return None, urls
Пример #9
0
 def handle_noargs(self, **options):
     from crawl.helpers import log
     print log('update index ...')
     call_command('update_index', interactive=False)
     print log('update index finished')
Пример #10
0
    def parse_content_html(self, parse_url):
        """Fetch an HTML page, extract article fields and collect links.

        Article extraction runs when no ``config.article_url_regexp`` is set
        or when it matches ``parse_url``. Every selector in
        ``config.field_match`` may carry a pseudo suffix (``::html``,
        ``::href``, ``::all``, ``::attr``) that decides how the value is read
        from the matched tag. The article is dropped unless ``title``,
        ``description`` and ``price`` were all found.

        Links matching ``config.link_selector`` are collected unless
        ``config.leaf_only`` is set.

        :param parse_url: URL of the page to download and parse.
        :return: ``(data, urls)`` -- ``data`` is the article dict (empty when
            the page did not yield a complete article), ``urls`` the list of
            sanitized links found on the page.
        """
        log('parse', parse_url)

        urls = []
        data = {}

        data_string = self.opener.open(parse_url).read()
        soup = BeautifulSoup(data_string, "lxml")

        if not self.config.article_url_regexp or self.config.article_url_regexp.findall(parse_url):

            data = {'shop_url': parse_url, 'shop': self.config.shop_name}

            for k, v in self.config.field_match.items():

                # Each call hands back the selector plus a flag/argument for
                # one pseudo suffix; the returned selector feeds the next call.
                v, use_html = Parser.use_pseudo(v, '::html')
                v, use_href = Parser.use_pseudo(v, '::href')
                v, use_all = Parser.use_pseudo(v, '::all')
                v, custom_attr = Parser.use_pseudo_with_arg(v, '::attr')

                def get_attr(tag, attr):
                    # An explicit ::attr argument overrides the default attribute.
                    return tag.attrs.get(custom_attr if custom_attr else attr, '')

                hits = soup.select(v)
                if not hits:
                    continue

                tag = hits[0]

                if tag.name == 'img':
                    data[k] = urlparse.urljoin(self.config.base_url, get_attr(tag, 'src'))
                elif use_href:
                    data[k] = get_attr(tag, 'href')
                elif use_html:
                    data[k] = removetags(Parser.purify_text('%s' % tag), 'img script style span br')
                elif use_all:
                    data[k] = [Parser.purify_text(h.text.replace(',', ' ')) for h in hits if
                               self.valid_tag(h.text)]
                    if k == 'tags' and self.config.generic_tag:
                        data[k].append(self.config.generic_tag)
                else:
                    # Fallback chain: content attribute, then text, then href.
                    text = get_attr(tag, 'content')
                    if not text:
                        text = tag.text
                    if not text:
                        text = tag.attrs.get('href', '')

                    if k == 'image':
                        text = urlparse.urljoin(self.config.base_url, text)

                    data[k] = Parser.purify_text(text)

            # ``in`` replaces the deprecated dict.has_key().
            if not all(field in data for field in ('title', 'description', 'price')):
                data = {}

        if not self.config.leaf_only:

            # respect <base href="http://www.base.com/shop/"/> head tag
            sanitize_url_kwargs = {}
            base = soup.select('base')
            if base:
                sanitize_url_kwargs['base'] = base[0].attrs.get('href')

            for tag in soup.select(self.config.link_selector):
                href = tag.attrs.get('href')
                if href and self.is_valid_href(href):
                    href = self.sanitize_url(href, **sanitize_url_kwargs)
                    urls.append(href)

        # Drop the parse tree and raw page explicitly before returning.
        soup.decompose()
        del soup
        del data_string

        return data, urls
Пример #11
0
    def handle_noargs(self, **options):
        """Crawl the configured shops, or delete one shop's articles.

        Options read from ``options``:

        * ``delete`` -- shop name; when given, that shop's articles are
          deleted and nothing is crawled.
        * ``dummy`` -- crawl only the ``DUMMY`` shop.
        * ``shop_filter`` -- comma-separated shop names to crawl; when empty,
          every shop except ``DUMMY`` is crawled.
        * ``update_current`` / ``keep_current`` -- choose between
          ``update_current()``, ``skip_current()`` and the default
          ``clean_current()`` on the spider pool before it runs.
        * ``spider_count`` -- converted to ``int`` and passed to
          ``SpiderPool``.
        """

        delete = options.get('delete')
        if delete:
            from articles.models import Article

            log('delete', Article.objects.filter(shop=delete).count())
            Article.objects.filter(shop=delete).delete()
            return

        if options.get('dummy', False):
            shop_filter = DUMMY
        else:
            shop_filter = options.get('shop_filter', None)

        update_current = options.get('update_current', False)
        keep_current = options.get('keep_current', False)
        spider_count = int(options.get('spider_count'))

        shops_config = {

            'mondovino': {
                'startUrl': 'https://www.mondovino.ch/sitemap/sitemap_de.xml',
                'ignoreUrlRegexp': r'/sortiment|mailto|javascript',
                'leafOnly': True,
                'genericTag': 'Wein',
                'fieldMatch':
                    {
                        'title': '.mod_product_detail__title',
                        'description': '.mod_product_detail__description_text',
                        'price': 'span.mod_product_detail__price_box_price',
                        'image': '.mod_product_detail__product_image noscript img',
                        'tags': 'a[href="#grapes"]::all'
                    }
            },

            'c-and-a': {
                'startUrl': 'http://www.c-and-a.com/ch/de/sitemap/sitemap.xml',
                'ignoreUrlRegexp': r'/information|/service|/corporate|/blog|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/de/de/|.jpg|mailto|javascript|/land/',
                'articleUrlRegexp': r'/ch/de/',
                'fieldMatch':
                    {

                        'title': '#productDetail h1',
                        'description': '#productDetail .list::html',
                        'price': '.price .normal span',
                        'image': '.productImage a::href',
                        'tags': '.breadcrumb a::all'
                    }
            },

            'interdiscount': {
                'startUrl': 'http://www.interdiscount.ch/idshop/index.jsf',
                'ignoreUrlRegexp': r'/idshop/eneCategory/_/detail.jsf|prospect.jsf|/page/|/pages/|jsessionid|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|/land/',
                'articleUrlRegexp': r'/product/',
                'fieldMatch':
                    {
                        'title': '.innercontent .productNameLine h1',
                        'description': '.innercontent .features::html',
                        'price': '.innercontent .productPrice',
                        'image': '.innercontent .largeImage',
                        'availability': '.availabilityIcon',
                        'tags': '.breadcrumb a::all'
                    }
            },

            'microspot': {
                'startUrl': 'http://www.microspot.ch/sitemap_index.xml',
                'leafOnly': True,
                'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/de/cat-|prospect.jsf|/page/|/pages/|jsessionid|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|selectedLanguage=it|selectedLanguage=fr|printProduct.jsf',
                'fieldMatch':
                    {
                        'title': '.productName h1',
                        'description': '.rf-tab-cnt::html',
                        'price': '.productList_price',
                        'image': '.mainProductPicture img',
                        'availability': '.deliveryCheckImage',
                        'tags': '.breadcrumb a::all'
                    }
            },

            'galaxus': {
                # NOTE(review): this start URL points at microspot.ch rather
                # than a galaxus host -- looks like a copy-paste slip; confirm
                # the intended sitemap URL.
                'startUrl': 'http://www.microspot.ch/sitemap.xml.gz',
                'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/brand/|/producttype/',

                'leafOnly': True,
                'fieldMatch': get_rdf_selectors()
            },

            'oswald': {
                'startUrl': 'http://www.oswald.ch/xmlsitemaps/ch_de/sitemap.xml',
                'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/rezepte/',
                'articleUrlRegexp': r'/de/',
                'linkSelector': '.category-view a',

                'fieldMatch': {
                    'title': '.product-name h1',
                    'description': '#marketing-text',
                    'price': '.regular-price',
                    'image': '.product-image img',
                    'availability': '.deliveryCheckImage',
                    'tags': '[itemtype="http://data-vocabulary.org/Breadcrumb"] a span::all'
                }

            },

            'globus': {
                'startUrl': 'https://www.globus.ch/sitemap.xml',
                'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|sitemap-globus-fr',

                'leafOnly': True,
                'fieldMatch': {
                    'title': '[itemprop="name"]',
                    'description': '[itemprop="description"]',
                    'price': '[itemprop="price"]',
                    'image': 'img.js_pdimage',
                    'tags': '[itemprop="category"] a::all',
                    'availability': '[itemprop="availability"]',
                }

            },

            'bonprix': {
                'startUrl': 'http://www.bonprix.de/sitemap.xml',
                'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|video.xml|editorial.xml',

                'leafOnly': True,
                'fieldMatch': {
                    'title': '.product-name',
                    'description': '#product-info',
                    'price': '[itemtype="http://schema.org/Product"] [itemprop="price"]',
                    'image': '#productimage::attr/data-main-image',
                    'tags': '[itemtype="http://data-vocabulary.org/Breadcrumb"] a::all',
                    'availability': '[itemtype="http://schema.org/Product"] [itemprop="availability"]',
                }
            },

            'zalando': {
                'startUrl': 'https://www.zalando.de/sitemap.xml',
                'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/alle/',
                'linkSelector': 'a.catalogArticlesList_productBox',

                'fieldMatch': {
                    'title': '[itemprop="name"]',
                    'description': '#productDetails .content',
                    'price': '.price',
                    'image': '.articleMedia_imagePlaceholder',
                    'tags': '.breadcrumbs_link::all',
                    'availability': '[itemprop="availability"]',
                    'shop_id': '[itemprop="identifier"]'

                }

            },

            'impo': {
                'startUrl': 'http://www.impo.ch/images/sitemap/sitemap_de.xml',
                'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/addToBasket/|/de/cat-|prospect.jsf|/page/|/pages/|jsessionid|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|selectedLanguage=it|selectedLanguage=fr|printProduct.jsf',
                'linkSelector': '.main-center-catalog a',

                'fieldMatch': {
                    'title': '.text-overview-title',
                    'description': '.text-overview-desc',
                    'price': '#productPrice',
                    'image': '#product-pic-variants',
                    'tags': '.breadcrumb-item a::all',
                    'shop_id': '#zobjectid'

                }
            },

            'ochsner': {
                'startUrl': 'http://shop.ochsner-sport.ch/CH/de/shop/sitemap.xml',
                'ignoreUrlRegexp': r'sport.ch#|sport.ch/#|/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/addToBasket/|/de/cat-|prospect.jsf|/page/|/pages/|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|selectedLanguage=it|selectedLanguage=fr|printProduct.jsf',
                'linkSelector': '.thumbname a',
                'articleUrlRegexp': r'/CH/de/shop/',
                'fieldMatch': {
                    'title': '#m_product_facts_name',
                    'description': '.m_product_config .tabContent::html',
                    'price': '#m_product_facts_price',
                    'image': '.img-zoom img',
                    'tags': '#ariadne a::all',
                    'shop_id': '#product.details.product.code'

                }
            },

            'bauundhobby': {
                'startUrl': 'http://www.bauundhobby.ch/sitemap/sitemap_de.xml',
                'ignoreUrlRegexp': r't.ch#|/bauho/|/coop/|.ch/#|/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/addToBasket/|/de/cat-|prospect.jsf|/page/|/pages/|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|selectedLanguage=it|selectedLanguage=fr|printProduct.jsf',
                'linkSelector': '#content-container .product-link',
                'fieldMatch': {
                    'title': '.product-detail-information h1',
                    'description': '.product-details::html',
                    'price': '[itemprop="price"]',
                    'image': '[itemprop="image"]',
                    'availability': '.btn-shopping-cart .btn-text',
                    'tags': '.breadcrumb a::all',

                }
            },

            'chain': {
                'startUrl': 'http://www.chainreactioncycles.com/products-sitemap-index.xml.gz',
                'leafOnly': True,
                'suppressUA': True,
                # Spelled 'genericTag' like every other shop entry; this one
                # was the only snake_case key in the table.
                'genericTag': 'Bike',
                'fieldMatch':
                    {
                        'title': '.product_title',
                        'description': '.short_desc::html',
                        'price': '#crc_product_rp',
                        'image': '#s7_zoomviewer_staticImage',
                        'availability': '.inventory',
                        'tags': '.breadcrumb a::all'
                    }
            },

            'rei': {
                'startUrl': 'http://www.rei.com/sitemap.xml',
                'ignoreUrlRegexp': r'/smartwool/|/stores/|/b/',
                'articleUrlRegexp': r'/product/',
                'leafOnly': True,
                'fieldMatch': get_rdf_selectors({'tags': '.breadcrumb .itemTitle::all'})
            },

            'otto': {
                'startUrl': 'https://www.otto.de/product/sitemap_index.xml',
                'leafOnly': True,
                'fieldMatch': get_rdf_selectors({})
            },

            DUMMY: {
                'startUrl': 'http://localhost:8888/',
                'genericTag': 'Dummy',
                'fieldMatch':
                    {
                        'title': '.title',
                        'description': '.description::html',
                        'price': '.price',
                        'tags': '.tag::all',
                        'image': 'img',
                    }
            }
        }

        if shop_filter:
            # Match whole shop names. Testing ``name in shop_filter`` against
            # the raw string is a substring test, so a shop whose name is
            # contained in another requested name would be crawled as well.
            selected_shops = set(part.strip() for part in shop_filter.split(','))
        else:
            # No filter given: crawl every real shop, but never the dummy one.
            selected_shops = set(k for k in shops_config if k != DUMMY)

        for name, config in shops_config.items():
            if name not in selected_shops:
                continue

            c = SpiderPool(config, spider_count)
            if update_current:
                c.update_current()
            elif keep_current:
                c.skip_current()
            else:
                c.clean_current()

            c.run()