def parse_content_sitemap(self, parse_url): log('start sitemap', parse_url) try: data_string = self.opener.open(parse_url).read() except urllib2.HTTPError, e: print 'ERROR while sitmap loading', e, parse_url return None, [{'force': True, 'parse_url': parse_url}]
def save_data(self, data):
    """Persist one parsed article and refresh the stored crawl status.

    ``data`` is a dict of ``Article`` field values; it must contain
    ``'shop_url'``.

    When ``self.config.update_current`` is set, the existing row(s) with the
    same ``shop_url`` are updated under ``self.update_lock``; otherwise a new
    ``Article`` is inserted. In both cases ``self.articles_found`` is
    incremented and the status record is saved.
    """
    if self.config.update_current:
        log('update', data['shop_url'])
        with self.update_lock:
            # Match on the article URL. The previous lookup compared the
            # ``shop`` column (which holds the shop name) against the URL,
            # so it could never match and the update silently did nothing.
            Article.objects.filter(shop_url=data['shop_url']).update(**data)
    else:
        log('save', data['shop_url'])
        Article.objects.bulk_create([Article(**data)])
    self.articles_found += 1
    self.status.data = self.get_status()
    self.status.save()
def parse_content_microdata(self, parse_url):
    """Extract microdata items from the page at ``parse_url``.

    Returns ``(data, urls)`` where ``data`` is a list with the ``json()``
    form of every item found and ``urls`` is always an empty list.
    """
    # Reference validator:
    #   https://developers.google.com/structured-data/testing-tool/
    # Known gap: this does not work with
    #   http://www.bonprix.de/produkt/maxi-jerseykleid-dunkelblau-bedruckt-958483/
    # although Google reports that page as good microformat.
    log('parse', parse_url)
    response = urllib.urlopen(parse_url)
    found_items = microdata.get_items(response)
    data = []
    for item in found_items:
        data.append(item.json())
    return data, []
def handle_noargs(self, **options):
    """Delete articles that share an image with another article.

    For every non-null ``image`` value used by more than one ``Article``,
    all rows with that image except the first one returned by the id query
    are deleted. Progress is logged per image value, and a final count of
    processed image values is logged at the end.
    """
    from crawl.helpers import log
    from articles.models import Article

    # One row per image value that occurs on more than one article.
    duplicated_images = (
        Article.objects.values('image')
        .annotate(Count('id'))
        .order_by()
        .filter(id__count__gt=1, image__isnull=False)
    )

    processed = 0
    for processed, row in enumerate(duplicated_images, 1):
        ids_with_image = Article.objects.filter(image=row['image']).values_list('id', flat=True)
        # Skip the first id so one article per image survives.
        Article.objects.filter(id__in=ids_with_image[1:]).delete()
        log(processed, 'duplicates found ...')
    log('end', processed, 'duplicates found')
def run(self):
    """Crawl from the configured start URL and record the outcome.

    Creates and saves an ``IndexStatus`` row, queues
    ``self.config.start_url``, waits for completion, then stores the final
    status data, logs a ``key: value`` summary and marks the status row
    as finished.
    """
    self.start_time = time.time()
    self.status = IndexStatus(
        shop=self.get_name(),
        data=self.get_status(),
        config=self.config.get_original(),
    )
    self.status.save()

    log('start', self.get_name())
    self.add_urls([self.config.start_url])
    self.wait_completion()

    self.status.data = self.get_status()
    summary = ', '.join(
        ['%s: %s' % (key, value) for key, value in self.status.data.items()]
    )
    log('end crawling', self.get_name(), ' - ', summary)

    self.status.finished = True
    self.status.save()
def f(*args, **kwargs):
    """Call the wrapped ``function``, log its duration and return its result.

    ``function`` and ``name`` come from the enclosing scope. The duration
    is logged in milliseconds, rounded to the nearest 10 ms.
    """
    started = time.time()
    result = function(*args, **kwargs)
    elapsed_ms = (time.time() - started) * 1000
    # round(..., -1) snaps to a multiple of ten before truncating to int.
    duration = int(round(elapsed_ms, -1))
    # A disabled variant used to report (name, duration) to the Google
    # Analytics collect endpoint when settings.GOOGLE_ANALYTICS_CODE was set.
    message = "%s took \x1b[32m%sms\x1b[0;39m" % (name, duration)
    log(message)
    return result
def run(self):
    """Worker loop: take URLs from the queue forever and process each one.

    For every URL, ``self.parser.parse(url)`` is expected to return a
    ``(data, urls)`` pair. Truthy ``data`` is passed to
    ``self.save_data_func`` and truthy ``urls`` to ``self.add_urls_func``.
    Any ``Exception`` raised while doing so is logged with its traceback
    and the loop carries on with the next URL.
    """
    while True:
        url = self.queue.get()
        try:
            data, urls = self.parser.parse(url)
            if data:
                self.save_data_func(data)
            if urls:
                self.add_urls_func(urls)
        except Exception as e:
            log('ERROR', url, e, '\n', traceback.format_exc())
        finally:
            # Always acknowledge the item, even if logging itself fails or a
            # non-Exception error escapes; otherwise anything joining the
            # queue would wait forever for this item.
            gc.collect()
            self.queue.task_done()
def parse_content_sitemap(self, parse_url):
    """Download a sitemap and return the valid URLs it lists.

    The body is gunzipped first when ``parse_url`` ends in ``.gz``. Every
    ``LOC_RE`` match is passed through ``self.sanitize_url`` and kept if
    ``self.is_valid_href`` accepts the sanitized value.

    Returns ``(None, urls)``; a sitemap yields no article data itself.
    """
    log('start sitemap', parse_url)
    data_string = self.opener.open(parse_url).read()

    if parse_url.endswith('.gz'):
        # Wrap the raw bytes in a file-like object so GzipFile can read them.
        compressed = StringIO.StringIO(data_string)
        data_string = gzip.GzipFile(fileobj=compressed, mode='rb').read()

    sanitized = (self.sanitize_url(loc) for loc in LOC_RE.findall(data_string))
    urls = [href for href in sanitized if self.is_valid_href(href)]

    log('end sitemap', parse_url, len(urls))
    return None, urls
def handle_noargs(self, **options):
    """Run the ``update_index`` management command non-interactively.

    A message is logged before and after the command; whatever ``log``
    returns is also printed.
    """
    from crawl.helpers import log

    print(log('update index ...'))
    call_command('update_index', interactive=False)
    print(log('update index finished'))
def parse_content_html(self, parse_url):
    """Scrape one HTML page for article fields and follow-up links.

    Article extraction runs when no ``article_url_regexp`` is configured or
    when it matches ``parse_url``. Each ``field_match`` entry maps a field
    name to a CSS selector that may carry pseudo suffixes:

    * ``::html`` stores the first hit's markup after ``purify_text`` and
      ``removetags``.
    * ``::href`` stores the first hit's ``href`` attribute.
    * ``::all``  stores a list with the purified text of every valid hit.
    * ``::attr`` supplies an attribute name that overrides the default one
      wherever an attribute is read through the local helper.

    An ``img`` first hit always yields its source joined onto ``base_url``.
    If ``title``, ``description`` or ``price`` is missing, the article data
    is discarded.

    Unless ``leaf_only`` is set, links matching ``link_selector`` are
    collected, honouring a ``<base href>`` tag when present.

    Returns ``(data, urls)``; ``data`` is ``{}`` when no complete article
    was found.
    """
    log('parse', parse_url)
    urls = []
    data = {}
    data_string = self.opener.open(parse_url).read()
    soup = BeautifulSoup(data_string, "lxml")

    url_pattern = self.config.article_url_regexp
    if not url_pattern or url_pattern.findall(parse_url):
        data = {'shop_url': parse_url, 'shop': self.config.shop_name}

        for field, selector in self.config.field_match.items():
            # Strip the pseudo suffixes one by one; each call returns the
            # remaining selector plus what it found.
            selector, use_html = Parser.use_pseudo(selector, '::html')
            selector, use_href = Parser.use_pseudo(selector, '::href')
            selector, use_all = Parser.use_pseudo(selector, '::all')
            selector, custom_attr = Parser.use_pseudo_with_arg(selector, '::attr')

            def read_attr(node, default_name):
                # A '::attr' argument wins over the default attribute name.
                return node.attrs.get(custom_attr or default_name, '')

            matches = soup.select(selector)
            if not matches:
                continue

            first = matches[0]
            if first.name == 'img':
                data[field] = urlparse.urljoin(self.config.base_url, read_attr(first, 'src'))
            elif use_href:
                data[field] = read_attr(first, 'href')
            elif use_html:
                markup = Parser.purify_text('%s' % first)
                data[field] = removetags(markup, 'img script style span br')
            elif use_all:
                values = []
                for hit in matches:
                    if self.valid_tag(hit.text):
                        values.append(Parser.purify_text(hit.text.replace(',', ' ')))
                if field == 'tags' and self.config.generic_tag:
                    values.append(self.config.generic_tag)
                data[field] = values
            else:
                # Fall back from the content attribute to the tag text,
                # then to the href attribute.
                text = read_attr(first, 'content') or first.text or first.attrs.get('href', '')
                if field == 'image':
                    text = urlparse.urljoin(self.config.base_url, text)
                data[field] = Parser.purify_text(text)

        required_fields = ('title', 'description', 'price')
        if not all(name in data for name in required_fields):
            data = {}

    if not self.config.leaf_only:
        # respect <base href="http://www.base.com/shop/"/> head tag
        sanitize_url_kwargs = {}
        base_tags = soup.select('base')
        if base_tags:
            sanitize_url_kwargs['base'] = base_tags[0].attrs.get('href')

        for anchor in soup.select(self.config.link_selector):
            href = anchor.attrs.get('href')
            if not href or not self.is_valid_href(href):
                continue
            urls.append(self.sanitize_url(href, **sanitize_url_kwargs))

    # Release the parse tree and the raw page eagerly.
    soup.decompose()
    del soup
    del data_string
    return data, urls
def handle_noargs(self, **options):
    """Crawl the configured shops, or delete one shop's articles.

    Options read:

    * ``delete``         -- shop name; its articles are deleted and the
      command returns without crawling.
    * ``dummy``          -- if truthy, only the ``DUMMY`` shop is crawled.
    * ``shop_filter``    -- comma-separated shop names to crawl; when empty,
      every shop except ``DUMMY`` is crawled.
    * ``update_current`` -- call ``update_current()`` on the pool before
      running.
    * ``keep_current``   -- otherwise call ``skip_current()``; if neither is
      set, ``clean_current()`` is called.
    * ``spider_count``   -- passed through ``int()`` and handed to each
      ``SpiderPool``.
    """
    delete = options.get('delete')
    if delete:
        from articles.models import Article
        log('delete', Article.objects.filter(shop=delete).count())
        Article.objects.filter(shop=delete).delete()
        return

    if options.get('dummy', False):
        shop_filter = DUMMY
    else:
        shop_filter = options.get('shop_filter', None)
    update_current = options.get('update_current', False)
    keep_current = options.get('keep_current', False)
    spider_count = int(options.get('spider_count'))

    shops_config = {
        'mondovino': {
            'startUrl': 'https://www.mondovino.ch/sitemap/sitemap_de.xml',
            'ignoreUrlRegexp': r'/sortiment|mailto|javascript',
            'leafOnly': True,
            'genericTag': 'Wein',
            'fieldMatch': {
                'title': '.mod_product_detail__title',
                'description': '.mod_product_detail__description_text',
                'price': 'span.mod_product_detail__price_box_price',
                'image': '.mod_product_detail__product_image noscript img',
                'tags': 'a[href="#grapes"]::all'
            }
        },
        'c-and-a': {
            'startUrl': 'http://www.c-and-a.com/ch/de/sitemap/sitemap.xml',
            'ignoreUrlRegexp': r'/information|/service|/corporate|/blog|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/de/de/|.jpg|mailto|javascript|/land/',
            'articleUrlRegexp': r'/ch/de/',
            'fieldMatch': {
                'title': '#productDetail h1',
                'description': '#productDetail .list::html',
                'price': '.price .normal span',
                'image': '.productImage a::href',
                'tags': '.breadcrumb a::all'
            }
        },
        'interdiscount': {
            'startUrl': 'http://www.interdiscount.ch/idshop/index.jsf',
            'ignoreUrlRegexp': r'/idshop/eneCategory/_/detail.jsf|prospect.jsf|/page/|/pages/|jsessionid|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|/land/',
            'articleUrlRegexp': r'/product/',
            'fieldMatch': {
                'title': '.innercontent .productNameLine h1',
                'description': '.innercontent .features::html',
                'price': '.innercontent .productPrice',
                'image': '.innercontent .largeImage',
                'availability': '.availabilityIcon',
                'tags': '.breadcrumb a::all'
            }
        },
        'microspot': {
            'startUrl': 'http://www.microspot.ch/sitemap_index.xml',
            'leafOnly': True,
            'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/de/cat-|prospect.jsf|/page/|/pages/|jsessionid|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|selectedLanguage=it|selectedLanguage=fr|printProduct.jsf',
            'fieldMatch': {
                'title': '.productName h1',
                'description': '.rf-tab-cnt::html',
                'price': '.productList_price',
                'image': '.mainProductPicture img',
                'availability': '.deliveryCheckImage',
                'tags': '.breadcrumb a::all'
            }
        },
        'galaxus': {
            # NOTE(review): this start URL points at microspot.ch, not at a
            # galaxus host -- looks like a copy/paste slip; confirm the
            # intended sitemap before changing it.
            'startUrl': 'http://www.microspot.ch/sitemap.xml.gz',
            'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/brand/|/producttype/',
            'leafOnly': True,
            'fieldMatch': get_rdf_selectors()
        },
        'oswald': {
            'startUrl': 'http://www.oswald.ch/xmlsitemaps/ch_de/sitemap.xml',
            'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/rezepte/',
            'articleUrlRegexp': r'/de/',
            'linkSelector': '.category-view a',
            'fieldMatch': {
                'title': '.product-name h1',
                'description': '#marketing-text',
                'price': '.regular-price',
                'image': '.product-image img',
                'availability': '.deliveryCheckImage',
                'tags': '[itemtype="http://data-vocabulary.org/Breadcrumb"] a span::all'
            }
        },
        'globus': {
            'startUrl': 'https://www.globus.ch/sitemap.xml',
            'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|sitemap-globus-fr',
            'leafOnly': True,
            'fieldMatch': {
                'title': '[itemprop="name"]',
                'description': '[itemprop="description"]',
                'price': '[itemprop="price"]',
                'image': 'img.js_pdimage',
                'tags': '[itemprop="category"] a::all',
                'availability': '[itemprop="availability"]',
            }
        },
        'bonprix': {
            'startUrl': 'http://www.bonprix.de/sitemap.xml',
            'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|video.xml|editorial.xml',
            'leafOnly': True,
            'fieldMatch': {
                'title': '.product-name',
                'description': '#product-info',
                'price': '[itemtype="http://schema.org/Product"] [itemprop="price"]',
                'image': '#productimage::attr/data-main-image',
                'tags': '[itemtype="http://data-vocabulary.org/Breadcrumb"] a::all',
                'availability': '[itemtype="http://schema.org/Product"] [itemprop="availability"]',
            }
        },
        'zalando': {
            'startUrl': 'https://www.zalando.de/sitemap.xml',
            'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/alle/',
            'linkSelector': 'a.catalogArticlesList_productBox',
            'fieldMatch': {
                'title': '[itemprop="name"]',
                'description': '#productDetails .content',
                'price': '.price',
                'image': '.articleMedia_imagePlaceholder',
                'tags': '.breadcrumbs_link::all',
                'availability': '[itemprop="availability"]',
                'shop_id': '[itemprop="identifier"]'
            }
        },
        'impo': {
            'startUrl': 'http://www.impo.ch/images/sitemap/sitemap_de.xml',
            'ignoreUrlRegexp': r'/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/addToBasket/|/de/cat-|prospect.jsf|/page/|/pages/|jsessionid|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|selectedLanguage=it|selectedLanguage=fr|printProduct.jsf',
            'linkSelector': '.main-center-catalog a',
            'fieldMatch': {
                'title': '.text-overview-title',
                'description': '.text-overview-desc',
                'price': '#productPrice',
                'image': '#product-pic-variants',
                'tags': '.breadcrumb-item a::all',
                'shop_id': '#zobjectid'
            }
        },
        'ochsner': {
            'startUrl': 'http://shop.ochsner-sport.ch/CH/de/shop/sitemap.xml',
            'ignoreUrlRegexp': r'sport.ch#|sport.ch/#|/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/addToBasket/|/de/cat-|prospect.jsf|/page/|/pages/|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|selectedLanguage=it|selectedLanguage=fr|printProduct.jsf',
            'linkSelector': '.thumbname a',
            'articleUrlRegexp': r'/CH/de/shop/',
            'fieldMatch': {
                'title': '#m_product_facts_name',
                'description': '.m_product_config .tabContent::html',
                'price': '#m_product_facts_price',
                'image': '.img-zoom img',
                'tags': '#ariadne a::all',
                'shop_id': '#product.details.product.code'
            }
        },
        'bauundhobby': {
            'startUrl': 'http://www.bauundhobby.ch/sitemap/sitemap_de.xml',
            'ignoreUrlRegexp': r't.ch#|/bauho/|/coop/|.ch/#|/en/|/fr/|/it/|/es/|/nl/|/pl/|/be/|/at/|/addToBasket/|/de/cat-|prospect.jsf|/page/|/pages/|FulltextSearch|mailto|javascript|atwork|__HYBRIS__|selectedLanguage=it|selectedLanguage=fr|printProduct.jsf',
            'linkSelector': '#content-container .product-link',
            'fieldMatch': {
                'title': '.product-detail-information h1',
                'description': '.product-details::html',
                'price': '[itemprop="price"]',
                'image': '[itemprop="image"]',
                'availability': '.btn-shopping-cart .btn-text',
                'tags': '.breadcrumb a::all',
            }
        },
        'chain': {
            'startUrl': 'http://www.chainreactioncycles.com/products-sitemap-index.xml.gz',
            'leafOnly': True,
            'suppressUA': True,
            # NOTE(review): every other shop spells this key 'genericTag';
            # verify that the config loader also picks up 'generic_tag'.
            'generic_tag': 'Bike',
            'fieldMatch': {
                'title': '.product_title',
                'description': '.short_desc::html',
                'price': '#crc_product_rp',
                'image': '#s7_zoomviewer_staticImage',
                'availability': '.inventory',
                'tags': '.breadcrumb a::all'
            }
        },
        'rei': {
            'startUrl': 'http://www.rei.com/sitemap.xml',
            'ignoreUrlRegexp': r'/smartwool/|/stores/|/b/',
            'articleUrlRegexp': r'/product/',
            'leafOnly': True,
            'fieldMatch': get_rdf_selectors({'tags': '.breadcrumb .itemTitle::all'})
        },
        'otto': {
            'startUrl': 'https://www.otto.de/product/sitemap_index.xml',
            'leafOnly': True,
            'fieldMatch': get_rdf_selectors({})
        },
        DUMMY: {
            'startUrl': 'http://localhost:8888/',
            'genericTag': 'Dummy',
            'fieldMatch': {
                'title': '.title',
                'description': '.description::html',
                'price': '.price',
                'tags': '.tag::all',
                'image': 'img',
            }
        }
    }

    # Resolve the filter to an exact set of shop names. Testing
    # ``name in "a,b,c"`` is a substring match, which would also select any
    # shop whose key merely occurs inside another requested name.
    if not shop_filter:
        selected = set(k for k in shops_config if k != DUMMY)
    elif hasattr(shop_filter, 'split'):
        selected = set(part.strip() for part in shop_filter.split(','))
    else:
        # Already a collection of names; membership was exact before too.
        selected = set(shop_filter)

    for name, config in shops_config.items():
        if name not in selected:
            continue
        c = SpiderPool(config, spider_count)
        if update_current:
            c.update_current()
        elif keep_current:
            c.skip_current()
        else:
            c.clean_current()
        c.run()