def parse_state(self, response):
    """Yields a scrapy.Request object for each city with a store in the state."""
    state_url = 'stores.joann.com/{}*'.format(response.meta['state'])
    extractor = LinkExtractor(allow=state_url)
    for link in extractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_city, headers=HEADERS)
def parse_sesja(self, response):
    # resolutions (uchwaly)
    uchwaly_le = LinkExtractor(allow=FindReportsSpider.UCHWALA_RE, restrict_xpaths="//table")
    links = uchwaly_le.extract_links(response)
    self.print_links("uchwaly", links)
    cnt = 0
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_uchwala)
        k = items.PageItem()
        k["text"] = link.text.encode("utf8")
        k["url"] = link.url
        k["ref"] = response.url
        k["order"] = cnt
        yield k
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1

    # files (voting records, attendance -- glosowania, obecnosc)
    le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
    links = le.extract_links(response)
    self.print_links("glosowania", links)
    cnt = 0
    for link in links:
        fi = items.FiledownloadItem()
        fi["file_urls"] = [link.url]
        fi["text"] = link.text.encode("utf8")
        fi["url"] = link.url
        fi["ref"] = response.url
        fi["order"] = cnt
        yield fi
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
def parse(self, response):
    extractor = LinkExtractor(allow="/article/*")
    links = extractor.extract_links(response)
    for link in links:
        item = XiubaiItem()
        req = Request(link.url, self.parse_detail_page)
        req.meta['item'] = item
        yield req
def parse_link(self, response):
    # log
    self.logger.info('Hi, this is an item page! %s', response.url)
    # parse links
    link_extractor = LinkExtractor(allow=r".+\.shtml", restrict_css='div.list > ul', unique=True)
    links = link_extractor.extract_links(response)
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_content)
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, self.parse_link, meta={
            'splash': {
                'args': {'har': 1, 'html': 0},
            }
        })
def parse(self, response):
    if response.status != 200 or response.body == "":
        return

    ads_links = response.xpath("//a[img]")
    for ads_link in ads_links:
        link_href = ads_link.xpath("@href").extract_first()
        if self._from_same_site(response.url, link_href):
            continue
        ads_profile = AdsProfileItem()
        ads_profile["ads_host"] = response.url
        ads_profile["ads_present_mode"] = "normal_1"
        ads_profile["ads_target_url"] = link_href
        img_src = response.urljoin(ads_link.xpath("img/@src").extract_first())
        ads_profile["ads_content_url"] = img_src
        ads_profile["ads_content_frame"] = ""
        ads_profile["ads_host_domain"] = urlparse(response.url).netloc
        ads_profile["ads_target_domain"] = urlparse(link_href).netloc
        yield ads_profile

    if isinstance(response, SplashJsonResponse):
        if "childFrames" in response.data:
            frames = self._get_all_child_frames(response)
            self.logger.info("Got %s childFrames in %s", len(frames), response.url)
            for frame_response in frames:
                if not self._is_valid_frame(frame_response.url):
                    continue
                ads_links = frame_response.xpath("//a[img]")
                for ads_link in ads_links:
                    link_href = ads_link.xpath("@href").extract_first()
                    if self._from_same_site(response.url, link_href):
                        continue
                    ads_profile = AdsProfileItem()
                    ads_profile["ads_host"] = response.url
                    ads_profile["ads_present_mode"] = "normal_1"
                    ads_profile["ads_target_url"] = link_href
                    img_src = frame_response.urljoin(ads_link.xpath("img/@src").extract_first())
                    ads_profile["ads_content_url"] = img_src
                    ads_profile["ads_content_frame"] = frame_response.url
                    ads_profile["ads_host_domain"] = urlparse(response.url).netloc
                    ads_profile["ads_target_domain"] = urlparse(link_href).netloc
                    yield ads_profile

    link_extractor = LinkExtractor()
    all_links = link_extractor.extract_links(response)
    for link in all_links:
        request = SplashRequest(
            response.urljoin(link.url),
            self.parse,
            endpoint="render.json",
            slot_policy=SlotPolicy.PER_DOMAIN,
            args={"html": 1, "iframes": 1},
        )
        request.headers.setdefault("User-Agent", self.ua_generater.get_user_agent())
        yield request
def parse(self, response):
    e = LinkExtractor()
    urls = [link.url for link in e.extract_links(response)]
    for url in urls:
        parsed = urlparse.urlsplit(url)
        qs = urlparse.parse_qs(parsed.query)
        if qs and 'Url' in qs:
            event_url = qs['Url'][0]
            yield self.add_url(event_url)
def parse_code(self, response):
    # Extract the URL of the source code file
    # le = LinkExtractor(restrict_css='div.bodywrapper p', allow='matplotlib.org/examples')
    # link = le.extract_links(response)
    le = LinkExtractor(restrict_css='a.reference.external')
    link = le.extract_links(response)
    file = FilesItem()
    file['file_urls'] = [link[0].url]
    return file
def parse(self, response):
    le = LinkExtractor()
    user_profiles = []
    for link in le.extract_links(response):
        result = re.search(r'.*(http://www.last.fm/user/.*)', link.url)
        if result:
            user_profiles.append(result.group(1))
    for user_profile in user_profiles:
        print(user_profile)
def parse(self, response):
    link_extractor = LinkExtractor()
    links = link_extractor.extract_links(response)
    for link in links:
        item = DomainItem()
        item['link'] = link.url
        item['domain'] = self.getHost(link.url)
        yield item
    for link in links:
        if not db.scrapy_items.find_one({'link': link.url}):
            yield scrapy.Request(link.url, callback=self.parse)
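# A minimal sketch of the getHost() helper used above (an assumption; the original
# implementation is not shown): it just pulls the network location out of a URL.
from urllib.parse import urlparse

def getHost(self, url):
    return urlparse(url).netloc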
def parse(self, response):
    name = 'example'
    lx = LinkExtractor()
    lst = lx.extract_links(response)  # list of job links found on the page
    # Compare lst against what is stored in MongoDB; returns a boolean
    flag = compare(name, lst)
    # If True, send a notification email to users
    if flag:
        notify(name)
    else:
        print("No Update")
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        yield SplashRequest(
            link.url,
            self.parse_link,
            endpoint='render.json',
            args={
                'har': 1,
                'html': 1,
            }
        )
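# A minimal sketch (an assumption, not shown in the snippet above) of the
# parse_link callback: with the render.json endpoint, scrapy-splash returns a
# SplashJsonResponse whose .data attribute holds the decoded JSON, so the HAR
# record and rendered HTML can be read from it.
def parse_link(self, response):
    har = response.data.get('har', {})
    html = response.data.get('html', '')
    yield {
        'url': response.url,
        'har_entries': len(har.get('log', {}).get('entries', [])),
        'html_length': len(html),
    }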
def parse(self, response):
    e = LinkExtractor()
    urls = [link.url for link in e.extract_links(response)]
    for url in urls:
        if response.url != url:
            yield self.add_url(url)
    if urls:
        qs = urlparse.parse_qs(urlparse.urlparse(response.url).query)
        qs = dict((k, v[0]) for (k, v) in qs.iteritems())
        qs['p'] = int(qs['p']) + 1
        url = 'http://comeon5678.com/event/list'
        yield scrapy.Request('%s?%s' % (url, urllib.urlencode(qs)))
def parse(self, response):
    # Extract the link to each book on the listing page
    le = LinkExtractor(restrict_css='article.product_pod h3')
    for link in le.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_book)

    # Extract the next-page link
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
             process_value=None, deny_extensions=None):
    LinkExtractor.__init__(
        self,
        allow=allow,
        deny=deny,
        allow_domains=allow_domains,
        deny_domains=deny_domains,
        restrict_xpaths=restrict_xpaths,
        tags=tags,
        attrs=attrs,
        canonicalize=canonicalize,
        unique=unique,
        process_value=self.process_value,
        deny_extensions=deny_extensions,
    )
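# A minimal sketch (an assumption) of the kind of LinkExtractor subclass the
# __init__ above belongs to: it forwards every argument to the base class but
# always installs its own process_value hook. The CustomLinkExtractor name and
# the process_value body are hypothetical.
class CustomLinkExtractor(LinkExtractor):

    # ... the __init__ shown above would sit here ...

    def process_value(self, value):
        # e.g. drop URL fragments before the value is turned into a Link
        return value.split('#')[0] if value else value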
class MySpider(scrapy.Spider):
    # Your spider definition
    name = "fetch_data"

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]
        self.link_extractor = LinkExtractor()
        urls = self.start_urls

    def parse(self, response):
        item = WebpageScraperItem()
        item['key'] = self.start_urls
        item['title'] = response.xpath('//title/text()').extract()
        item['paragraphs'] = response.xpath('//p/text()').extract()
        item['headings'] = response.xpath('//h1/text()').extract()
        links = self.link_extractor.extract_links(response)
        item['links'] = [x.url for x in links]
        img_urls = []
        img_url = response.xpath('//img/@src').extract()
        for img in img_url:
            parse_url = urlparse.urlparse(img)
            parsed_url = parse_url._replace(**{"scheme": "http"})
            img_urls.append(parsed_url.geturl())
        item['image_urls'] = img_urls
        return item
class BCSpider(Spider):
    name = 'bc'

    def __init__(self, *args, **kwargs):
        super(BCSpider, self).__init__(*args, **kwargs)
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return
        for link in self.le.extract_links(response):
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BCSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        self.log("Spider idle signal caught.")
        raise DontCloseSpider
def __init__(self, *args, **kwargs):
    super(GeneralSpider, self).__init__(*args, **kwargs)
    with open("seeds_es_smp.txt") as f:
        la = [urlparse(url.strip()).netloc for url in f.readlines()]
    self.la = la
    self.le = LinkExtractor()
def print_url(self, response):
    """
    @url http://www.ura.org.hk/en/schemes-and-policies/redevelopment/ura-implemented-projects/reimbursement.aspx
    @returns items 1 1
    @returns requests 0 0
    @scrapes title link html text last_updated file_urls
    """
    l = ItemLoader(item=UrbanRenewalItem(), response=response)
    l.add_xpath('title', '//title')
    l.add_value('link', response.url)
    l.add_xpath('text', '//div[@id="content"]')
    l.add_xpath('html', '/html')
    l.add_xpath('last_updated', '//div[@class="lastUpdated"]')
    lx = LinkExtractor(allow=['\.' + ext for ext in file_extension], deny_extensions=())
    l.add_value('file_urls', [link.url for link in lx.extract_links(response)])
    return l.load_item()
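# The docstring above is a Scrapy contract (@url / @returns / @scrapes). Assuming
# the spider is registered in a Scrapy project, the contract can be exercised with
# the built-in check command:
#
#     scrapy check <spider_name>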
def parse_main(self, response):
    le = LinkExtractor(allow=KADENCJA_RE)
    links = le.extract_links(response)
    self.print_links("kadencje", links)
    cnt = 0
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_kadencja)
        k = items.PageItem()
        k["text"] = link.text.encode("utf8")
        k["url"] = link.url
        k["ref"] = response.url
        k["order"] = cnt
        yield k
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
def parse_uchwala(self, response):
    # generate list of files to download
    le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
    links = le.extract_links(response)
    self.print_links("files", links)
    cnt = 0
    for link in links:
        fi = items.FiledownloadItem()
        fi["file_urls"] = [link.url]
        fi["text"] = link.text.encode("utf8")
        fi["url"] = link.url
        fi["ref"] = response.url
        fi["order"] = cnt
        yield fi
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
def parse(self, response):
    for sel in response.css('article.product_pod'):
        book = BookstoresItem()
        book['name'] = sel.xpath('./h3/a/@title').extract_first()
        book['price'] = sel.css('p.price_color::text').extract_first()
        yield book

    # Extract the next-page link
    # next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
    # if next_url:
    #     next_url = response.urljoin(next_url)
    #     yield scrapy.Request(next_url, callback=self.parse)
    le = LinkExtractor(restrict_css='ul.pager li.next')
    links = le.extract_links(response)
    if links:
        next_url = links[0].url
        yield scrapy.Request(next_url, callback=self.parse)
def parse_kadencja(self, response):
    # 'LIX Sesja Rady Miasta 24 września 2014 r.'
    # 'http://www.bip.olsztyn.eu/bip/dokument/305103/lix_sesja_rady_miasta_24_wrzesnia_2014_r_/'
    le = LinkExtractor(allow=FindReportsSpider.SESJA_RE)
    links = le.extract_links(response)
    self.print_links("sesje", links)
    cnt = 0
    for link in links:
        yield scrapy.Request(link.url, callback=self.parse_sesja)
        k = items.PageItem()
        k["text"] = link.text.encode("utf8")
        k["url"] = link.url
        k["ref"] = response.url
        k["order"] = cnt
        yield k
        if cnt >= DEBUG_CNT and DEBUG:
            break
        cnt += 1
def __init__(self, **kw):
    super(FollowAllSpider, self).__init__(**kw)
    url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://%s/' % url
    self.url = url
    self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
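# A minimal sketch (an assumption; the original method is not shown here) of the
# parse callback that would pair with the __init__ above: it simply follows every
# link found by self.link_extractor within the allowed domains.
import scrapy

def parse(self, response):
    for link in self.link_extractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse)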
def parse(self, response):
    print(response.url)

    # Extract internal links from the page
    IGNORED_EXTENSIONS.append('gz')
    IGNORED_EXTENSIONS.append('tar')
    urlextract = LinkExtractor(allow_domains=self.allowed_domains)

    # Store internal links
    links = urlextract.extract_links(response)
    links = [l.url for l in links]
    if response.url not in self.data:
        self.data[response.url] = links
        yield

    # Follow internal links
    for url in links:
        yield scrapy.Request(url, self.parse)
def __init__(self, **kw):
    super(FollowAllSpider, self).__init__(**kw)
    url = kw.get('url') or kw.get('domain') or 'https://zh.wikipedia.org/wiki/%E5%9C%9F%E8%B1%86%E7%BD%91'
    if not url.startswith('http://') and not url.startswith('https://'):
        url = 'http://%s/' % url
    self.url = url
    self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
    self.link_extractor = LinkExtractor()
    self.cookies_seen = set()
def parse_item(self, response):
    self.write_response(response.url, response)
    print("----------------------------------", response.real_url, response.url)
    le = LinkExtractor()
    for link in le.extract_links(response):
        splash_request = SplashRequest(
            link.url,
            self.parse_item,
            endpoint='render.html',
            args={
                'wait': 0.8,
                'html': 1,
            }
        )
        yield splash_request
def parse(self, response):
    self.write_response(response.url, response)
    # Skip Cisco documentation and registration pages
    url = response.url.lower()
    if ("cisco.com/en/us/docs" in url
            or "cisco.com/c/en/us/td/docs" in url
            or "register" in url):
        return
    le = LinkExtractor()
    for link in le.extract_links(response):
        splash_request = SplashRequest(
            link.url,
            self.parse,
            endpoint='render.html',
            args={
                'wait': 0.8,
                'html': 1,
            }
        )
        yield splash_request
def parse_item(self, response):
    internal_item = InternalItem()
    internal_item["url"] = response.url
    yield internal_item

    # Use the built-in LinkExtractor to find URLs, filtering out internal URLs
    extractor_external = LinkExtractor(deny_domains=self.allowed_domains)
    external_links = extractor_external.extract_links(response)
    for link in external_links:
        external_item = ExternalItem()
        external_item["url"] = link.url
        yield external_item

    for src in response.css("img::attr('src')"):
        asset_item = AssetItem()
        asset_item["url"] = response.urljoin(src.extract())
        yield asset_item

    for src in response.css("script::attr('src')"):
        asset_item = AssetItem()
        asset_item["url"] = response.urljoin(src.extract())
        yield asset_item
def extract_links(response, xpaths, tag=None, attr=None):
    """Extract links on a page matching the given XPaths.

    :param response: Scrapy response whose body contains links to extract
    :type response: :class:`scrapy.http.Response`
    :param xpaths: a single XPath or an iterable of XPaths matching links to extract
    :type xpaths: `unicode` or `iterable` of `unicode`
    :param tag: tag name from which to extract links
    :type tag: `unicode`
    :param attr: attribute name in :data:`tag` tags from which to extract links
    :type attr: `unicode`
    :yield: extracted links (canonicalized URLs), directly usable as
        :data:`scrapy.http.Request.url` parameters
    :rtype: `generator` of `unicode`
    """
    # Construct LinkExtractor parameters
    extractor_attrs = {
        'restrict_xpaths': xpaths,
        'canonicalize': True,
    }
    if tag:
        extractor_attrs['tags'] = (tag,)
    if attr:
        extractor_attrs['attrs'] = (attr,)

    # Extract links
    link_extractor = LinkExtractor(**extractor_attrs)
    links = link_extractor.extract_links(response)

    # Generate links
    for link in links:
        yield link.url
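# A minimal usage sketch (not from the original source) for the extract_links()
# helper above: a spider callback that feeds the yielded URLs back into new
# requests. The XPath and the parse_item callback are illustrative assumptions.
import scrapy

def parse(self, response):
    for url in extract_links(response, "//div[@class='listing']", tag='a', attr='href'):
        yield scrapy.Request(url, callback=self.parse_item)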
class KaggleSpider(CrawlSpider):
    name = "kaggle"
    allowed_domains = [
        "blog.kaggle.com",
    ]
    start_urls = [
        'http://blog.kaggle.com',
    ]
    __queue = []
    rules = [
        Rule(
            LinkExtractor(
                allow=[],
                deny=__queue,
                restrict_xpaths=[
                    '//*[@class="back-link"]',
                    '//*[@class="post clearfix"]/h1/a[1]',
                    # '//?page=\d+',
                    # '//\w+/\w+/\w+/w+'
                ]),
            callback='parse_extract_data',
            follow=True)
    ]

    def parse_extract_data(self, response):
        if response.xpath('//*[@class="back-link"]') and 'Bandwidth exceeded' in response.body:
            raise CloseSpider('Exit')
        item = CrawlBlogItem()
        res = Selector(response)
        # import ipdb; ipdb.set_trace()
        # title = res.xpath('//*[@id="ctl01"]/div[5]/div[3]/div/div[1]/div[2]/div/div[1]/div[2]/h1/text()').extract()
        # item['title'] = ''.join(title).strip()
        # item['author'] = ''.join(
        #     response.xpath('//span[@class="author vcard"]/a/text()').extract())
        item['name'] = ''.join(
            response.xpath(
                '//div[@class="article-header-inside"]/h1/text()').extract())
        date_time = ''.join(
            response.xpath('//span[@class="entry-date"]/a[2]/@href').extract())
        if date_time:
            item['datetime'] = date_time[-11:]
        item['url'] = response.url
        content = enumerate(
            response.xpath('//div[@class="entry-content"]/node()'))
        content_data = {}
        check_point = 'Summary'
        for index, data in content:
            _data = data.extract()
            if check_point not in content_data:
                content_data[check_point] = []
            if '<p>' in _data or '\n' in _data or 'attachment' in _data:
                content_data[check_point].append(data.extract())
            if '<h2>' in _data:
                check_point = BeautifulSoup(_data).text
        item['content'] = content_data
        if 'name' in item and item['name']:
            return item
class NetEaseSpider(CrawlSpider):
    # Spider name
    name = 'netease'
    # URLs to start crawling from
    start_urls = [
        'https://news.163.com/domestic/',
        'https://news.163.com/world/'
    ]
    # start_urls = ['https://money.163.com/']
    # Allowed crawl domains
    allowed_domains = ['news.163.com']
    # allowed_domains = ['money.163.com']
    # Crawl rules
    # The date range is given as arguments to get_base_url
    rules = [
        Rule(LinkExtractor(allow=r'({0})\d+/.*?html'.format(base_url)),
             callback='parse_item',
             follow=True) for base_url in get_base_url('20200508', '20200508')
    ]

    def parse_item(self, response):
        item = NewsSpiderItem()
        item['news_thread'] = response.url.strip().split('/')[-1][:-5]
        self.get_source(response, item)
        self.get_source_url(response, item)
        self.get_url(response, item)
        self.get_time(response, item)
        self.get_title(response, item)
        self.get_text(response, item)
        return item

    def get_text(self, response, item):
        text = response.css('.post_text p::text').extract()
        if text:
            print('text:{}'.format(text).replace(' ', ''))
            new_text = list()
            for line in text:
                if line:
                    new_text.append(
                        line.replace(' ', '').replace('\n', '').replace('\t', ''))
            item['news_text'] = new_text

    def get_url(self, response, item):
        url = response.url
        print(url)
        if url:
            item['news_url'] = url

    def get_title(self, response, item):
        title = response.css('title::text').extract()
        if title:
            print('title:{}'.format(title[0]))
            item['news_title'] = title[0]

    def get_time(self, response, item):
        time = response.css('div.post_time_source::text').extract()
        if time:
            print('time:{}'.format(time[0].strip().replace('来源', '').replace(
                '\u3000', '')))
            item['news_time'] = time[0].strip().replace('来源', '').replace(
                '\u3000', '')

    def get_source(self, response, item):
        source = response.css('#ne_article_source::text').extract()
        if source:
            print('source:{}'.format(source[0]))
            item['news_source'] = source[0]

    def get_source_url(self, response, item):
        source_url = response.css('#ne_article_source::attr(href)').extract()
        if source_url:
            print('source_url:{}'.format(source_url[0]))
            item['news_source_url'] = source_url[0]
def parse(self, response): print("RESPONSE ", response) xlink = LinkExtractor() yield ScrapyLink(link=xlink.extract_links(response))
class LagouSpider(CrawlSpider): name = 'lagou' allowed_domains = ['www.lagou.com'] start_urls = ['https://www.lagou.com/'] rules = ( Rule(LinkExtractor(allow=('zhaopin/.*', )), follow=True), Rule(LinkExtractor(allow=r'gongsi/j\d+.html'), follow=True), Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True), ) # headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0", # "HOST": "www.lagou.com"} # # def start_requests(self): # yield scrapy.Request(url=self.start_urls[0], headers=self.headers, callback=self.parse_job, dont_filter=True) custom_settings = { "COOKIES_ENABLED": False, "DOWNLOAD_DELAY": 1, 'DEFAULT_REQUEST_HEADERS': { 'Accept': 'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.8', 'Connection': 'keep-alive', 'Cookie': 'user_trace_token=20171015132411-12af3b52-3a51-466f-bfae-a98fc96b4f90; LGUID=20171015132412-13eaf40f-b169-11e7-960b-525400f775ce; SEARCH_ID=070e82cdbbc04cc8b97710c2c0159ce1; ab_test_random_num=0; X_HTTP_TOKEN=d1cf855aacf760c3965ee017e0d3eb96; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DsXIrWUxpNGLE2g_bKzlUCXPTRJMHxfCs6L20RqgCpUq%26wd%3D%26eqid%3Dee53adaf00026e940000000559e354cc; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_hotjob; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAAFCAAEG50060B788C4EED616EB9D1BF30380575; _gat=1; _ga=GA1.2.471681568.1508045060; LGSID=20171015203008-94e1afa5-b1a4-11e7-9788-525400f775ce; LGRID=20171015204552-c792b887-b1a6-11e7-9788-525400f775ce', 'Host': 'www.lagou.com', 'Origin': 'https://www.lagou.com', 'Referer': 'https://www.lagou.com/', 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', } } def parse_job(self, response): item_loader = LagouJobItemLoader(item=LagouJobItem(), response=response) # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract() # i['name'] = response.xpath('//div[@id="name"]').extract() # i['description'] = response.xpath('//div[@id="description"]').extract() item_loader.add_css("title", ".job-name::attr(title)") item_loader.add_value("url", response.url) item_loader.add_value("url_object_id", get_md5(response.url)) item_loader.add_css("salary", ".job_request p span.salary::text") item_loader.add_xpath("job_city", "//dd[@class='job_request']/p/span[2]/text()") item_loader.add_xpath("work_years", "//dd[@class='job_request']/p/span[3]/text()") item_loader.add_xpath("degree_need", "//dd[@class='job_request']/p/span[4]/text()") item_loader.add_xpath("job_type", "//dd[@class='job_request']/p/span[5]/text()") item_loader.add_css("publish_time", ".job_request p.publish_time::text") item_loader.add_css("job_advantage", ".job-advantage p::text") item_loader.add_css("job_desc", ".job_bt div p") item_loader.add_css("job_addr", ".work_addr") item_loader.add_css("tags", ".position-label.clearfix li::text") item_loader.add_css("company_name", ".job_company dt a img::attr(alt)") item_loader.add_css("company_url", ".job_company dt a::attr(href)") item_loader.add_value("crawl_time", datetime.datetime.now()) # item_loader.add_css("crawl_update_time",".work_addr") lagou_item = item_loader.load_item() return lagou_item
class CrawlerSpider(AutoExtractSpider): """ Crawler Spider discovers links and returns AutoExtract items too. Required params: * seeds: one, or more seed URLs (as YAML list) Example: > -a seeds=http://example.com/ Or: > -a seeds='[http://blog.example.com/, http://shop.example.com/]' The mandatory "page-type" param from the parent AutoExtract Spider is also required. Optional params: * seeds-file-url: an optional URL to a plain text file with a list of seed URLs; * max-items: how many items (articles, or products) should the spider extract, per host; When the items are extracted, the spider stops. default: 100; * max-pages: how many pages should the spider follow per host, when discovering links; default: 1000; * count-limits: a YAML dict with page or item max count; example: {page_count: 90, item_count: 10} * extract-rules: a YAML dict with allowed and denied hosts and patterns; They will be used to initialize a scrapy.linkextractors.LinkExtractor; example: {allow: "/en/items/", deny: ["/privacy-?policy/?$", "/about-?(us)?$"]} * same-domain: limit the discovery of links to the same domains as the seeds; default: True * discovery-only: discover the links and return them, without AutoExtract items; default: False Extra options: * DEPTH_LIMIT: maximum depth that will be allowed to crawl; default: 1. * CLOSESPIDER_TIMEOUT: if the spider is running for more than that number of seconds, it will be automatically closed. default: 21600 seconds. """ # name = 'crawler' only_discovery = False same_origin = True seed_urls = None seeds_file_url = None count_limits = DEFAULT_COUNT_LIMITS rules = [ Rule(LinkExtractor(), process_links='_rule_process_links', process_req_resp='_rule_process_req_resp', follow=True), ] @classmethod def update_settings(cls, settings): super().update_settings(settings) update_redirect_middleware(settings) @classmethod def from_crawler(cls, crawler, *args, **kwargs): spider = super().from_crawler(crawler, *args, **kwargs) spider.main_callback = spider.parse_page spider.main_errback = spider.errback_page for rule in spider.rules: rule._compile(spider) # Discovery only for seeds, without items if spider.get_arg('discovery-only'): spider.only_discovery = yaml.load(spider.get_arg('discovery-only')) # Limit requests to the same domain if spider.get_arg('same-domain'): spider.same_origin = yaml.load(spider.get_arg('same-domain')) # Seed URLs if getattr(spider, 'seeds', None): seeds = spider.seeds if isinstance(seeds, str): try: spider.seed_urls = yaml.load(seeds) except Exception as err: raise ValueError('Invalid seed URLs: %s %s', seeds, err) elif isinstance(seeds, (list, tuple)): spider.seed_urls = seeds del spider.seeds if spider.seed_urls: spider.seed_urls = arg_to_iter(spider.seed_urls) # Seeds file URL if spider.get_arg('seeds-file-url'): spider.seeds_file_url = spider.get_arg('seeds-file-url') # Domains allowed to be crawled, for OffsiteMiddleware and others if spider.same_origin and spider.seed_urls: if not hasattr(spider, 'allowed_domains'): spider.allowed_domains = DEFAULT_ALLOWED_DOMAINS spider.allowed_domains.extend( urlsplit(u).netloc.lower() for u in spider.seed_urls) crawler.signals.connect(spider.open_spider, signals.spider_opened) return spider def open_spider(self): # noqa: C901 """ Parse command line args. 
""" super().open_spider() # JSON count limits for pages or items if self.get_arg('count-limits'): limits = self.get_arg('count-limits') try: self.count_limits = yaml.load(limits) if not isinstance( limits, dict) else limits except Exception as err: raise ValueError('Invalid count limits: %s %s', limits, err) # JSON link extraction rules if self.get_arg('extract-rules'): rules = self.get_arg('extract-rules') try: self.extract_rules = yaml.load(rules) if not isinstance( rules, dict) else rules except Exception as err: raise ValueError('Invalid extraction rules: %s %s', rules, err) else: self.extract_rules = {} # Shortcut to limit global requests if self.get_arg('max-pages'): max_pages = int(self.get_arg('max-pages')) self.count_limits['page_host_count'] = max_pages if self.seed_urls: self.count_limits['page_count'] = max_pages * len( self.seed_urls) * 2 else: self.count_limits['page_count'] = max_pages * 2 if self.get_arg('max-items'): max_items = int(self.get_arg('max-items')) self.count_limits['item_host_count'] = max_items if self.seed_urls: self.count_limits['item_count'] = max_items * len( self.seed_urls) * 2 else: self.count_limits['item_count'] = max_items * 2 if self.count_limits: self.logger.debug('Using count limits: %s', self.count_limits) # Shortcut to allow and ignore links if self.get_arg('allow-links'): try: self.extract_rules['allow'] = yaml.load( self.get_arg('allow-links')) except Exception as err: raise ValueError('Invalid allow-links: %s', err) if self.get_arg('ignore-links'): try: self.extract_rules['deny'] = yaml.load( self.get_arg('ignore-links')) except Exception as err: raise ValueError('Invalid ignore-links: %s', err) if self.extract_rules: self.logger.debug('Using extract rules: %s', self.extract_rules) if self.only_discovery: self.logger.debug('Discovery ONLY mode enabled') return self @crawlera_session.init_start_requests def start_requests(self): """ The main function. """ # Process exact item URLs for Articles, or Products (if any) yield from super().start_requests() # Discover links and process the items yield from self._process_seeds() def _process_seeds(self) -> str: """ Seeds are website URLs (can be JSON, JL, TXT, or CSV with 1 column) Because the list is expected to be small, the input can be one, or more URLs. Seed URLs will be crawled deeply, trying to find articles, or products. """ if self.seeds_file_url: yield Request(self.seeds_file_url, meta={'source_url': self.seeds_file_url}, callback=self.parse_seeds_file, errback=self.main_errback, dont_filter=True) if not self.seed_urls: return self.logger.info('Using seeds: %s', self.seed_urls) yield from self._schedule_seed_urls(self.seed_urls) def parse_seeds_file(self, response): """ Process seeds file url response and schedule seed urls for processing. """ if not isinstance(response, TextResponse): return seeds = response.text.split() yield from self._schedule_seed_urls(seeds) def _schedule_seed_urls(self, seed_urls): """ A helper to process seed urls and yield appropriate requests. """ for url in seed_urls: url = url.strip() if not is_valid_url(url): self.logger.warning('Ignoring invalid seed URL: %s', url) continue # Initial request to the seed URL self.crawler.stats.inc_value('x_request/seeds') yield Request(url, meta={'source_url': url}, callback=self.main_callback, errback=self.main_errback, dont_filter=True) def parse_page(self, response): """ Parse the spider response. 
""" if not isinstance(response, TextResponse): return # Try to parse the AutoExtract response (if available) and return the correct Item is_autoextract_response = is_autoextract_request(response) if not self.only_discovery: if is_autoextract_response: yield from self.parse_item(response) else: # For discovery-only mode, return only the URLs item = {'url': response.url} item['scraped_at'] = utc_iso_date() if response.meta.get('source_url'): item['source_url'] = response.meta['source_url'] if response.meta.get('link_text'): item['link_text'] = response.meta['link_text'].strip() yield item # Cycle and follow links # Currently AutoExtract responses don't contain the full page HTML, # so there are no links and nothing to follow if response.body and not is_autoextract_response: for request in self._requests_to_follow(response): yield crawlera_session.init_request(request) elif is_autoextract_response: # Make another request to fetch the full page HTML # Risk of being banned self.crawler.stats.inc_value('x_request/discovery') request = Request(response.url, meta={'source_url': response.meta['source_url']}, callback=self.main_callback, errback=self.main_errback, dont_filter=True) yield crawlera_session.init_request(request) def _rule_process_links(self, links): """ Simple helper used by the default Rule to drop links, when the same-origin option is enabled. """ if not self.same_origin: return links valid_links = [] for lnk in links: host = urlsplit(lnk.url).netloc.lower() if not hasattr(self, 'allowed_domains') or host in self.allowed_domains: valid_links.append(lnk) return valid_links def _rule_process_req_resp(self, request, response): """ Simple helper used by the default Rule to fix the current request. """ for m in META_TO_KEEP: if response.meta.get(m): request.meta[m] = response.meta[m] request.meta['scraped_at'] = utc_iso_date() request.callback = self.parse_page request.errback = self.errback_page return request def _requests_to_follow(self, response): seen = set() for n, rule in enumerate(self.rules): links = [ lnk for lnk in rule.link_extractor.extract_links(response) if lnk.url not in seen ] if links and callable(rule.process_links): links = rule.process_links(links) for link in links: seen.add(link.url) meta = {'rule': n, 'link_text': link.text} request = self.make_extract_request(link.url, meta=meta) if not request: continue if callable(rule.process_req_resp): request = rule.process_req_resp(request, response) yield request def errback_page(self, failure): if failure.check(IgnoreRequest, DropItem): return request = getattr(failure, 'request', None) if request: self.logger.warning('Page %s failed: %s', request.body, failure) self.crawler.stats.inc_value('error/failed_page')
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//meta[@property='og:title']/@content",
    'price': "//tr/td/div[@class='giasanpham']",
    'category': "//div[@class='dvtitleproduct_kind']/table/tr/td[2]/a",
    'description': "//div[@class='producthotcatend']/div[@id='p1']/table",
    'images': "//img[@id='anhchinh']/@src",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'hongha.asia'
allowed_domains = ['hongha.asia']
start_urls = ['http://hongha.asia/main/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/product/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/cat/', '/page+\d+\.html']), 'parse'),
    # Rule(LinkExtractor(), 'parse_item_and_links'),
]
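# A minimal sketch (not part of the generated file) of the kind of parse_item
# callback that could consume the XPATH mapping above; the dict-style item output
# is an illustrative assumption.
def parse_item(self, response):
    item = {}
    for field, xpath in XPATH.items():
        if xpath:
            # getall() returns every match; downstream code can join or clean up.
            item[field] = response.xpath(xpath).getall()
    yield item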
class exhibitMeishujiaSpider(CrawlSpider): name = 'exhibit.meishujia' catid = 6 typeid = 0 sysadd = 1 status = 99 # allowed_domains = ['artist.meishujia.cn'] start_urls = [ "http://exhibit.meishujia.cn/index.php?page=1&act=app&appid=4099" ] # 设置下载延时 download_delay = 1 custom_settings = { 'ITEM_PIPELINES': { 'baby.pipelines.exhibitPipeline': 300, # 'baby.pipelines.JsonWriterPipeline': 350, # 'baby.pipelines.MultiImagesPipeline': 400, # 'baby.pipelines.MysqlWriterPipeline': 500, }, } rules = ( # 地址分页 # Rule(LinkExtractor(allow=('/index.php?page=1&act=pps&smid=2'), allow_domains=('meishujia.cn'),restrict_xpaths=('//ul[@class="sert"]'))), # 详情页1 # Rule(LinkExtractor(restrict_xpaths=('//li[@class="i42c"]/div[@class="i42ck"]'))), # 详情页 2 /?act=usite&usid=[0-9]{1,10}&inview=[a-z-0-9-]+&said=528 /?act=usite&usid=8646&inview=appid-241-mid-619&said=528 #只有一个规则的时候,后面的“,”要加上,不然报 TypeError: 'Rule' object is not iterable 错误 Rule(LinkExtractor(restrict_xpaths=( '//dd[re:test(@class,"theme_body_1609")]//ul[@class="srre"]//div[@class="srremap"]/a' )), callback='parse_item'), ) def detail_lik(self, links): yield links def parse_item(self, response): # http://blog.51cto.com/pcliuyang/1543031 l = DefaultItemLoader(item=exhibitMeishujiaItem(), selector=response) l.add_value('spider_link', get_base_url(response)) l.add_xpath( 'spider_img', '//dd[re:test(@class,"theme_body_1611")]//ul[re:test(@class,"zl_r_af")]//img[@src]' ) l.add_value('spider_imgs', '//*[@id="photos"]//div[@class="panel"]') l.add_xpath( 'title', 'normalize-space(//dd[re:test(@class,"theme_body_1611")]//h1)') l.add_xpath('attr', '//dd[re:test(@class,"theme_body_1611")]/ol//text()') l.add_value('attr_value', []) l.add_xpath( 'content', '//dd[re:test(@class,"theme_body_1611")]//ul[re:test(@class,"zl_r_b zl_r_bt")]/node()' ) l.add_value('keywords', '') l.add_value('description', '') l.add_value('thumbs', '') l.add_value('catid', self.catid) l.add_value('status', self.status) l.add_value('sysadd', self.sysadd) l.add_value('typeid', self.typeid) l.add_value('inputtime', int(time.time())) l.add_value('updatetime', int(time.time())) l.add_value('create_time', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) l.add_value('update_time', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')) # l.add_xpath('content', '//dd[re:test(@class,"theme_body_4656")]//table[2]//tr[3]/td') # l.add_xpath('content', '//dd[re:test(@class,"theme_body_4656")]//table[2]//tr[3]/td//text()') # l.add_xpath('attr', '//dd[re:test(@class,"theme_body_1611")]/ol/span/text()') # l.add_xpath('attr_value', '//dd[re:test(@class,"theme_body_1611")]/ol/text()') d = l.load_item() # print(d) yield d def parse_content_item(self, selector): pass
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//body/div[@id='container']/div[@id='content']/h1",
    'price': "//div[@id='content']/div[@class='product-info']/div[@class='right']/div[@class='price']/text()",
    'category': "//div[@id='container']/div[@id='content']/div[@class='breadcrumb']/a",
    'description': "//body/div[@id='container']/div[@id='content']/div[@id='tab-attribute']",
    'images': "//div[@class='left']/div[@class='image']/a[@class='colorbox cboxElement']/img/@src",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "//base/@href",
    'brand': ""
}
name = 'laptopnew.vn'
allowed_domains = ['laptopnew.vn']
start_urls = ['http://laptopnew.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['']), 'parse_item'),
    Rule(LinkExtractor(allow=['']), 'parse'),
    # Rule(LinkExtractor(), 'parse_item_and_links'),
]
import furl
from scrapy.linkextractors import LinkExtractor

from ..items import WebToonItem

link_extractor = LinkExtractor(
    r'/webtoon/detail.nhn\?titleId=\d+&no=\d+&weekday=\w+$',
    restrict_xpaths=".//a[contains(@onclick, 'lst.title')]")


def _extract_title(response):
    xpath = "//div[@class='comicinfo']/div[@class='detail']/h2/text()"
    return response.xpath(xpath).get().strip()


def _extract_description(response):
    xpath = "//div[@class='comicinfo']/div[@class='detail']/p/text()"
    return '\n'.join([desc.get() for desc in response.xpath(xpath)])


def _extract_thumbnail_src(response, titleId, **kwargs):
    contain = f"https://shared-comic.pstatic.net/thumb/webtoon/{titleId}/thumbnail/"
    for thumb in response.xpath(f"//img[contains(@src, '{contain}')]/@src"):
        return thumb.get()


def _extract_author(response):
    xpath = ".//span[@class='wrt_nm']/text()"
    return response.xpath(xpath).get().strip()
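# A minimal sketch (an assumption, not from the original module) of how the
# link_extractor and the helper functions above might be wired together in spider
# callbacks; the parse_episode callback name and the WebToonItem field names are
# hypothetical.
import scrapy

def parse(self, response):
    # Follow every episode link matched by the module-level extractor.
    for link in link_extractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_episode)

def parse_episode(self, response):
    yield WebToonItem(
        title=_extract_title(response),          # hypothetical field names
        description=_extract_description(response),
        author=_extract_author(response),
    )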
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='product-info']/div[@class='info']/h1[@class='p-name cufon']",
    'price': "//div[@class='price cufon']/span[@class='num']",
    'category': "//div[@id='navation']/nav/ul/li/a/span",
    'description': "",
    'images': "//div[@class='pic-thumb']/span[@class='wp-pic']/img[@class='zoom-pic']/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'nonson.vn'
allowed_domains = ['nonson.vn']
start_urls = ['http://www.nonson.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(), 'parse_item'),
    Rule(LinkExtractor(), 'parse'),
    # Rule(LinkExtractor(), 'parse_item_and_links'),
]
def process_response(self, response): """ Process the given scrapy response. Extract new URLs, HTTP parameters, new network locations, cookies and code comments. :return: a set of URLs that shall be crawled in the future """ if response.status == 404: return set() # store response HTTP code if not redirect if not (response.status == 301 or response.status == 302): if response.url not in self.crawled_urls: self.crawled_urls[response.url] = response.status # some colorful printing if self.verbose: code = str(response.status) extra_print = "" if code[0] == "2": color = util.GREEN elif code[0] == "3": color = util.BRIGHT_CYAN extra_print = (util.BRIGHT_CYAN + " --> " + util.SANE + response.headers["Location"].decode()) elif code[0] == "4": color = util.RED elif code[0] == "5": color = util.MAGENTA else: color = util.SANE print_str = " [" + color + str( response.status ) + util.SANE + "] " + response.url + extra_print util.printit(print_str) # extract cookies and their paths from HTTP response header cookie_paths = self.extract_cookies( response.headers.getlist("Set-Cookie"), response.url) cookie_urls = set() for path in cookie_paths: cookie_urls.add(self.to_absolute_url(path, response.urljoin)) # use scrapy's lxml linkextractor to extract links / URLs scrapy_urls = set() try: # extract <base> URL's domain if a <base> tag exists base_domain = "" base_tag_sels = response.xpath("//base") for base_tag_sel in base_tag_sels: href_sels = base_tag_sel.xpath("@href") if href_sels: href = href_sels.extract_first() base_domain = urllib.parse.urlparse(href).netloc break # setup allowed domains and extract new links allowed_domains = [self.domain, "%s:%s" % (self.domain, self.port)] if base_domain: allowed_domains.append(base_domain) raw_scrapy_links = LinkExtractor( allow_domains=allowed_domains, tags=("a", "area", "script", "link", "source", "img"), attrs=("src", "href"), deny_extensions=set()).extract_links(response) raw_scrapy_urls = [link.url for link in raw_scrapy_links] # copy discovered URLs and additionally insert initial network location scrapy_urls = raw_scrapy_urls.copy() if base_domain and base_domain != allowed_domains[ 0] and base_domain != allowed_domains[1]: orig_netloc = urllib.parse.urlparse(response.url).netloc for scrapy_url in raw_scrapy_urls: parsed_scrapy_url = list(urllib.parse.urlsplit(scrapy_url)) parsed_scrapy_url[1] = orig_netloc scrapy_urls.append( urllib.parse.urlunsplit(parsed_scrapy_url)) scrapy_urls = set(scrapy_urls) except (AttributeError, scrapy.exceptions.NotSupported) as e: if str(e) == "Response content isn't text": # stop processing and return no new URLs return set() raise e # run the different URL / link discovery mechanisms linkfinder_urls, dynamic_urls, form_urls, sub_urls = set(), set(), set( ), set() if self.config["use_linkfinder"].lower() == "true": linkfinder_urls = self.run_linkfinder(response.text, response.urljoin) if self.config["use_selenium"].lower() == "true": dynamic_urls = self.extract_dynamic_urls(response.url) if self.config["extract_info_from_forms"].lower() == "true": form_data = extract_form_data(response) # extract new URLs and HTTP parameters from parsed form data form_urls = self.process_form_data(form_data, response.urljoin) # extract sub URLs, i.e. 
URLs with parent paths sub_urls = extract_sub_urls(response.url) # extract comments if configured if self.config["extract_comments"].lower() == "true": self.extract_comments(response) # unite discovered URLs urls = set() urls |= cookie_urls urls |= scrapy_urls urls |= linkfinder_urls urls |= dynamic_urls urls |= form_urls urls |= sub_urls # store info about redirect and add redirect URL to discovered URLs if response.status == 301 or response.status == 302: location = response.headers["Location"].decode() self.redirects[response.url] = { "code": response.status, "to": location } urls.add(self.to_absolute_url(location, response.urljoin)) # process all the discovered URLs, i.e. extract new information and decide which to crawl yield_urls = set() for url in urls: # strip anchor if "#" in url: url = url[:url.rfind("#")] # replace entities and parse URL url = url.replace("&", "&") url = url.replace("&", "&") parsed_url = urllib.parse.urlparse(url) # disregard information about directory listing sorting if parsed_url.path.endswith("/") and re.match( "C=[A-Z];O=[A-Z]", parsed_url.query): continue # extract GET parameters and cut URL if option is configured params = {} if parsed_url.query: if self.config["crawl_parameter_links"].lower() != "true": url = "%s://%s/%s" % (parsed_url.scheme, parsed_url.netloc, parsed_url.path) params = get_query_params(parsed_url.query) elif url.endswith("?"): url = url[:-1] # add URL as instance of its path if self.url_has_netloc(url) and params: self.add_path_instance(parsed_url.path, params, {}, {}) # skip already crawled URLs if url in self.found_urls: continue self.found_urls.add(url) # skip URLs with different network location if not self.url_has_netloc(url): continue if url == response.url: continue # skip paths that are excluded from crawling if self.exclude_paths and url.count("/") > 2: check_str = "/" + "/".join(url.split("/")[3:]) if any( re_path.match(check_str) for re_path in self.exclude_paths): continue # check whether to add this URL to the to-be-crawled URLs if url not in yield_urls: # limit the crawling depth max_depth = int(self.config["max_depth"]) if max_depth > 0: depth = parsed_url.path.count("/") if depth > max_depth: continue # limit the number of times a path can be crawled to avoid endless # crawling upon GET parameter variation if parsed_url.path not in self.crawled_paths: self.crawled_paths[parsed_url.path] = 0 self.crawled_paths[parsed_url.path] += 1 if self.crawled_paths[parsed_url.path] > int( self.config["max_path_visits"]): continue yield_urls.add(url) return yield_urls
class ITjuziSpider(RedisCrawlSpider): name = 'itjuzi' allowed_domains = ['www.itjuzi.com'] # start_urls = ['http://www.itjuzi.com/company'] redis_key = 'itjuzispider:start_urls' rules = [ # 获取每一页的链接 Rule(link_extractor=LinkExtractor(allow=('/company\?page=\d+'))), # 获取每一个公司的详情 Rule(link_extractor=LinkExtractor(allow=('/company/\d+')), callback='parse_item') ] def parse_item(self, response): soup = BeautifulSoup(response.body, 'lxml') # 开头部分: //div[@class="infoheadrow-v2 ugc-block-item"] cpy1 = soup.find('div', class_='infoheadrow-v2') if cpy1: # 公司名称://span[@class="title"]/b/text()[1] company_name = cpy1.find( class_='title').b.contents[0].strip().replace('\t', '').replace( '\n', '') # 口号: //div[@class="info-line"]/p slogan = cpy1.find(class_='info-line').p.get_text() # 分类:子分类//span[@class="scope c-gray-aset"]/a[1] scope_a = cpy1.find(class_='scope c-gray-aset').find_all('a') # 分类://span[@class="scope c-gray-aset"]/a[1] scope = scope_a[0].get_text().strip() if len(scope_a) > 0 else '' # 子分类:# //span[@class="scope c-gray-aset"]/a[2] sub_scope = scope_a[1].get_text().strip( ) if len(scope_a) > 1 else '' # 城市+区域://span[@class="loca c-gray-aset"]/a city_a = cpy1.find(class_='loca c-gray-aset').find_all('a') # 城市://span[@class="loca c-gray-aset"]/a[1] city = city_a[0].get_text().strip() if len(city_a) > 0 else '' # 区域://span[@class="loca c-gray-aset"]/a[2] area = city_a[1].get_text().strip() if len(city_a) > 1 else '' # 主页://a[@class="weblink marl10"]/@href home_page = cpy1.find(class_='weblink marl10')['href'] # 标签://div[@class="tagset dbi c-gray-aset"]/a tags = cpy1.find(class_='tagset dbi c-gray-aset').get_text().strip( ).strip().replace('\n', ',') #基本信息://div[@class="block-inc-info on-edit-hide"] cpy2 = soup.find('div', class_='block-inc-info on-edit-hide') if cpy2: # 公司简介://div[@class="block-inc-info on-edit-hide"]//div[@class="des"] company_intro = cpy2.find(class_='des').get_text().strip() # 公司全称:成立时间:公司规模:运行状态://div[@class="des-more"] cpy2_content = cpy2.find(class_='des-more').contents # 公司全称://div[@class="des-more"]/div[1] company_full_name = cpy2_content[1].get_text().strip( )[len('公司全称:'):] if cpy2_content[1] else '' # 成立时间://div[@class="des-more"]/div[2]/span[1] found_time = cpy2_content[3].contents[1].get_text().strip( )[len('成立时间:'):] if cpy2_content[3] else '' # 公司规模://div[@class="des-more"]/div[2]/span[2] company_size = cpy2_content[3].contents[3].get_text().strip( )[len('公司规模:'):] if cpy2_content[3] else '' #运营状态://div[@class="des-more"]/div[3] company_status = cpy2_content[5].get_text().strip( ) if cpy2_content[5] else '' # 主体信息: main = soup.find('div', class_='main') # 投资情况://table[@class="list-round-v2 need2login"] # 投资情况,包含获投时间、融资阶段、融资金额、投资公司 tz = main.find('table', 'list-round-v2') tz_list = [] if tz: all_tr = tz.find_all('tr') for tr in all_tr: tz_dict = {} all_td = tr.find_all('td') tz_dict['tz_time'] = all_td[0].span.get_text().strip() tz_dict['tz_round'] = all_td[1].get_text().strip() tz_dict['tz_finades'] = all_td[2].get_text().strip() tz_dict['tz_capital'] = all_td[3].get_text().strip().replace( '\n', ',') tz_list.append(tz_dict) # 团队信息:成员姓名、成员职称、成员介绍 tm = main.find('ul', class_='list-prodcase limited-itemnum') tm_list = [] if tm: for li in tm.find_all('li'): tm_dict = {} tm_dict['tm_m_name'] = li.find('span', class_='c').get_text().strip() tm_dict['tm_m_title'] = li.find( 'span', class_='c-gray').get_text().strip() tm_dict['tm_m_intro'] = li.find( 'p', class_='mart10 person-des').get_text().strip() tm_list.append(tm_dict) # 产品信息:产品名称、产品类型、产品介绍 pdt = main.find('ul', 
class_='list-prod limited-itemnum') pdt_list = [] if pdt: for li in pdt.find_all('li'): pdt_dict = {} pdt_dict['pdt_name'] = li.find('h4').b.get_text().strip() pdt_dict['pdt_type'] = li.find( 'span', class_='tag yellow').get_text().strip() pdt_dict['pdt_intro'] = li.find( class_='on-edit-hide').p.get_text().strip() pdt_list.append(pdt_dict) item = JuziItem() item['info_id'] = response.url.split('/')[-1:][0] item['company_name'] = company_name item['slogan'] = slogan item['scope'] = scope item['sub_scope'] = sub_scope item['city'] = city item['area'] = area item['home_page'] = home_page item['tags'] = tags item['company_intro'] = company_intro item['company_full_name'] = company_full_name item['found_time'] = found_time item['company_size'] = company_size item['company_status'] = company_status item['tz_info'] = tz_list item['tm_info'] = tm_list item['pdt_info'] = pdt_list return item
class BjdfundSpider(CrawlSpider):
    name = 'bjdall'
    source = "京报网"
    allowed_domains = ["bjd.com.cn"]
    reg = 'jx'
    start_urls = [
        'http://www.bjd.com.cn/jx/toutiao/',
        'http://www.bjd.com.cn/jx/jj/'
    ]
    rules = (
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
        Rule(LinkExtractor(allow='_[0-9].\.html')),
    )

    def printcn(uni):
        for i in uni:
            print uni.encode('utf-8')

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # remember to return the item after parsing
        return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath('//div[@class="tit"]/text()').extract()
        if title:
            item['title'] = ''.join(title).strip()

    def get_date(self, response, item):
        date = response.xpath('//div[@class="info"]/span[1]/text()').extract()
        if date:
            item['date'] = ''.join(date).replace(u'-', u'').replace(u':', u'').replace(u' ', u'').strip()

    def get_body(self, response, item):
        paras = response.xpath('//div[@class="TRS_Editor"]/p')
        if not paras:
            paras = response.xpath('//div[@class="TRS_Editor"]')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    # print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1[@class='h1Title']",
    'price': "//div[@class='row_infoP']/span[@class='dt_price']",
    'category': "//div[@class='path flt']/a/span",
    'description': "//div[@id='tabs_detail_content']/div[@class='section'][1]",
    'images': "//img[@id='mainImage']/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'amazona.vn'
allowed_domains = ['amazona.vn']
start_urls = ['http://amazona.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/san-pham/[a-zA-Z0-9-]+\.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=['/danh-muc/[a-zA-Z0-9-]+\.html($|\?page=\d+)']), 'parse'),
    # Rule(LinkExtractor(), 'parse_item_and_links'),
]
class EastmoneySpider(CrawlSpider):
    name = 'eastmoney'
    source = "东方财富网"
    allowed_domains = ["eastmoney.com"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
    reg = yesterday
    start_urls = [
        'http://finance.eastmoney.com/news/ccjdd.html',
        'http://finance.eastmoney.com/news/cywjh.html',
        'http://finance.eastmoney.com/news/chgjj.html',
        'http://finance.eastmoney.com/news/cjrzb.html',
        'http://finance.eastmoney.com/news/ccyjj.html',
        'http://finance.eastmoney.com/news/cssgs.html',
        'http://finance.eastmoney.com/news/cgnjj.html',
        'http://finance.eastmoney.com/news/cgjjj.html',
        'http://finance.eastmoney.com/news/ccjxw.html',
        'http://finance.eastmoney.com/news/cjjsp.html',
        'http://finance.eastmoney.com/news/ccyts.html',
        'http://finance.eastmoney.com/news/csygc.html',
        'http://finance.eastmoney.com/news/czfgy.html',
        'http://finance.eastmoney.com/news/csyjy.html',
        'http://finance.eastmoney.com/news/cjjxr.html',
        'http://finance.eastmoney.com/news/csxy.html',
        'http://finance.eastmoney.com/news/czsdc.html',
        'http://finance.eastmoney.com/news/crdsm.html',
        'http://stock.eastmoney.com/news/cgsxw.html'
    ]
    rules = (
        Rule(LinkExtractor(allow=reg, deny='data.eastmoney.com'),
             callback="parse_news",
             follow=True),
        # Rule(LinkExtractor(allow='_[0-9]+.html'))
        Rule(LinkExtractor(allow='_[1-6].html')))

    def printcn(uni):
        for i in uni:
            print uni.encode('utf-8')

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # remember to return the item after parsing
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath(
            '//div[@class="newsContent"]/h1/text()').extract()
        if title:
            item['title'] = title

    def get_date(self, response, item):
        date = response.xpath('//div[@class="time"]/text()').extract()
        if date:
            item['date'] = ''.join(date).replace(u'年', u'').replace(
                u'月', u'').replace(u'日', u'').replace(u':', u'').replace(
                    u' ', u'') + '00'

    def get_body(self, response, item):
        abstract = response.xpath('//div[@class="b-review"]/text()').extract()
        paras = response.xpath('//*[@id="ContentBody"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    # print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = ''.join(abstract) + '_|_' + news_body.replace(
            '_|__|_', '_|_')
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='product-col-desc']/h1[@class='title']",
    'price': "//div[@class='product-price']/strong[@class='colorh']",
    'category': "//div[@class='BreadcrumbText']/a",
    'description': "//div[@class='content-responsive']/div[@class='content news']",
    'images': "//div[@class='clearfix']/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'lanopearl.com.vn'
allowed_domains = ['lanopearl.com.vn']
start_urls = ['http://lanopearl.com.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/san-pham/[a-zA-Z0-9-/]+\.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+/$']), 'parse'),
    # Rule(LinkExtractor(), 'parse_item_and_links'),
]
class TruliaSpider(scrapy.Spider):
    name = 'sold_150'
    allowed_domains = ['trulia.com']
    custom_settings = {
        'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()),
                                 'data/iterate/sold_%(start)s_%(time)s.jl'),
        'FEED_FORMAT': 'jsonlines'
    }

    def __init__(self, state='IL', city='Chicago', start=150, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start = start
        self.start_urls = [
            'https://www.trulia.com/sold/{city},{state}/'.format(state=state, city=city)
        ]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/p/')

    def parse(self, response):
        # N = 598  # trulia.TruliaSpider.get_number_of_pages_to_scrape(response)
        M = self.start
        N = M + 50
        self.logger.info(
            "Searching between index page {M} and index page {N}".format(N=N, M=M))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(M, N + 1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        item_loader = TruliaItemLoader(item=TruliaItem(), response=response)
        trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response)
        details = item_loader.nested_css('.homeDetailsHeading')
        taxes = details.nested_xpath(
            './/*[text() = "Property Taxes and Assessment"]/parent::div')
        taxes.add_xpath(
            'property_tax_assessment_year',
            './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()')
        taxes.add_xpath(
            'property_tax',
            './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()')
        taxes.add_xpath(
            'property_tax_assessment_land',
            './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()')
        taxes.add_xpath(
            'property_tax_assessment_improvements',
            './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()')
        taxes.add_xpath(
            'property_tax_assessment_total',
            './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()')
        taxes.add_xpath(
            'property_tax_market_value',
            './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()')
        item = item_loader.load_item()
        trulia.TruliaSpider.post_process(item=item)
        return item
# Auto generated by generator.py. Delete this line if you make modification. from scrapy.spiders import Rule from scrapy.linkextractors import LinkExtractor XPATH = { 'name' : "//div[@class='container']/form/div/div/p", 'price' : "//div[@class='container']/form/div/div/label", 'category' : "//div[@class='content left']/a", 'description' : "//div[@class='center-content']/div[@class='center-content']/div[@class='container']/div[@class='container']", 'images' : "//a[@class='gallery']/img/@src", 'canonical' : "", 'base_url' : "", 'brand' : "" } name = 'anhemfeather.vn' allowed_domains = ['anhemfeather.vn'] start_urls = ['http://anhemfeather.vn/'] tracking_url = '' sitemap_urls = [''] sitemap_rules = [('', 'parse_item')] sitemap_follow = [] rules = [ Rule(LinkExtractor(allow = ['/task,view/']), 'parse_item'), Rule(LinkExtractor(allow = ['/task,cat/']), 'parse'), #Rule(LinkExtractor(), 'parse_item_and_links'), ]
class zhiLianSpider(RedisCrawlSpider):
    name = "zhilian_1"
    start_urls = 'https://sou.zhaopin.com/?jl=489'

    def __init__(self, *args, **kwargs):
        self.allowed_domains = ['zhaopin.com', 'baidu.com']
        self.start_urls = 'https://sou.zhaopin.com/?jl=489'
        super().__init__(*args, **kwargs)
        # for x in self.create_2_request("123"):
        #     pass

    manu_conn_redis1.lpush("zhilian_1:start_urls", start_urls)

    rules = (
        Rule(LinkExtractor(allow=(r"zhaopin\.com/\?jl")),
             callback='process_item_c1', follow=True),
        Rule(LinkExtractor(
            allow=(r"https://fe-api\.zhaopin\.com/c/i/sou/\?cityId.+")),
            callback='process_item_c1', follow=True),
    )

    def test_rule_is_have_use(self, response):
        manu_conn_redis1.lpush("test_content", response.text)

    # The Redis-based spider insists on having a parse callback defined, so define one.
    def process_item_c1(self, response):
        print("I am the kumanxuan")
        # print(response.url)
        self.handle_json_2_item(response)

    # def parse(self,response):
    #     yield scrapy.Request("https://www.baidu.com")

    # Zhaopin's front end has switched to an API-driven model, so the response is
    # treated as a client-side JSON payload instead of an HTML page.
    def handle_json_2_item(self, response):
        # Parse the JSON body into a dict. response exposes .text as well as .xpath;
        # .xpath is the more common accessor, but here the raw body is what we need.
        pass
        response_dict = json.loads(response.text)
        # items = SpiderZhilianItem()
        for x in response_dict['data']['results']:
            items = {}
            items['city_name'] = x['city']['display']
            items['city'] = x['city']['items'][0]['code']
            items['company_name'] = x['company']['name']
            items['number'] = x['company']['number']
            items['education'] = x['eduLevel']['name']
            items['experience'] = x['workingExp']['name']
            items['salary'] = x['salary']
            items['job_name'] = x['jobName']
            # Each time a record is built, also save the dict into Redis.
            manu_conn_redis1.lpush('res_' + str(items['city']), items)
            yield items
            # return items

    # Create the batch of requests. The concrete province information still has to be
    # parsed out here. This should only run once, since it merely generates the batch
    # of URLs; it can simply be called from __init__ to do the initialisation.
    def create_2_request(self, response):
        # Reuse the neighbouring helper to obtain the province info;
        # for now, assume the hot-city list is what we crawl.
        hot_city = return_province_info()['hot_citys']
        # Build a large batch of URLs and hand them back to Scrapy through Redis,
        # pairing each city code with the URL template.
        redis_conn = manu_conn_redis1
        redis_conn.lpush("zhilian_1:start_urls", self.start_urls)
        for x in hot_city:
            for x1 in crawler_zhilian(x['code']):
                for x2 in x1:
                    print(x2)
                    yield redis_conn.lpush("zhilian_1:start_urls", x2)
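# The spider above is driven through Redis: start URLs are LPUSHed onto the
# "zhilian_1:start_urls" list that scrapy-redis polls, rather than being scheduled
# directly. A minimal seeding sketch with redis-py; the host, port and helper name
# are assumptions, not the project's configuration.
import redis

def seed_start_urls(urls, key='zhilian_1:start_urls', host='localhost', port=6379):
    conn = redis.Redis(host=host, port=port)
    for url in urls:
        conn.lpush(key, url)  # scrapy-redis pops these and turns them into requests
    return conn.llen(key)

# seed_start_urls(['https://sou.zhaopin.com/?jl=489'])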
class PlymouthSpider(CrawlSpider): name = 'worcester_s' allowed_domains = ['www.worcester.ac.uk'] start_urls = [] base_url = 'https://www.worcester.ac.uk%s' Lists = [ '/journey/uwic-programme-accountancy-business-marketing.html', '/journey/accounting-and-finance-ba-hons-wbs.html', '/journey/advancing-practice-msc.html', '/journey/allied-health-studies-mphil-phd.html', '/journey/animal-biology-bsc-hons.html', '/journey/animal-biology-mbiol-integrated-masters.html', '/journey/animal-biology-mphil-phd.html', '/journey/animal-biology-degrees.html', '/journey/animation-ba-hons.html', '/journey/animation-degrees.html', '/journey/applied-health-social-care-ba-top-up.html', '/journey/applied-sport-science-msc.html', '/journey/applied-sports-performance-analysis-msc.html', '/journey/arabic-module.html', '/journey/archaeology-heritage-studies-degrees.html', '/journey/archaeology-mphil-phd.html', '/journey/master-research-archaeology.html', '/journey/archaeology-and-heritage-studies-ba-hons.html', '/journey/art-and-design-mphil-phd.html', '/journey/uwic-programme-art-design-creative-media.html', '/journey/atmospheric-sciences-mphil-phd.html', '/journey/biochemistry-bsc-hons.html', '/journey/biochemistry-mbiol-integrated-masters.html', '/journey/biochemistry-mphil-phd.html', '/journey/biology-degrees.html', '/journey/biology-bsc-hons.html', '/journey/biology-mbiol-integrated-masters.html', '/journey/master-research-biology.html', '/journey/biomedical-sciences-bsc-hons.html', '/journey/birth-and-beyond-ba-top-up.html', '/journey/birth-beyond-fda.html', '/journey/business-and-accountancy-ba-hons.html', '/journey/business-and-digital-communications-ba-hons.html', '/journey/business-and-enterprise-ba-hons.html', '/journey/business-and-finance-ba-hons.html', '/journey/business-and-human-resource-management-ba-hons.html', '/journey/business-and-marketing-ba-hons.html', '/journey/uwic-programme-pg-business.html', '/journey/business-administration-ba-hons.html', '/journey/business-information-technology-bsc-hons.html', '/journey/business-mphil-phd.html', '/journey/business-management-ba-hons-top-up.html', '/journey/business-management-ba-hons-wbs.html', '/journey/business-management-hnd.html', '/journey/business-management-degrees.html', '/journey/business-psychology-bsc-hons.html', '/journey/business-studies-ba-hons.html', '/journey/business-economics-finance-ba-hons-wbs.html', '/journey/online-celta-course.html', '/journey/chartered-manager-degree-apprenticeship.html', '/journey/child-adolescent-mental-health-fdsc.html', '/journey/child-adolescent-mental-health-bsc-hons-top-up.html', '/journey/mres-clinical-education.html', '/journey/clinical-psychology-bsc-hons.html', '/journey/collaborative-working-with-children-young-people-families-fda.html', '/journey/computer-games-design-development-bsc-hons.html', '/journey/computing-bsc-hons.html', '/journey/computing-hnd.html', '/journey/computing-mphil-phd.html', '/journey/computing-degrees.html', '/journey/counselling-fdsc.html', '/journey/counselling-msc.html', '/journey/counselling-psychology-bsc-hons.html', '/journey/creative-professional-writing-ba-hons.html', '/journey/creative-professional-writing-degrees.html', '/journey/creative-digital-media-mphil-phd.html', '/journey/creative-digital-media-degrees.html', '/journey/creative-digital-media-ba-hons.html', '/journey/creative-media-ma.html', '/journey/cricket-coaching-management-bsc-hons.html', '/journey/criminology-ba-hons.html', '/journey/criminology-mphil-phd.html', 
'/journey/criminology-with-policing-ba-hons.html', '/journey/dance-hnd.html', '/journey/dance-and-community-practice-ba-hons.html', '/journey/dementia-studies-mphil-phd.html', '/journey/dental-technology-fdsc.html', '/journey/design-mres.html', '/journey/developmental-psychology-bsc-hons.html', '/journey/diploma-in-education-and-training.html', '/journey/doctor-business-administration-dba.html', '/journey/doctor-education-edd.html', '/journey/drama-performance-degrees.html', '/journey/drama-performance-ma.html', '/journey/drama-performance-ba-hons.html', '/journey/drama-and-performance-mphil-phd.html', '/journey/msc-emdr-therapy.html', '/journey/early-childhood-professional-practice-ba-hons.html', '/journey/early-modern-studies-mres.html', '/journey/early-years-foundation-degree-flexible-distributed-learning-pathway.html', '/journey/early-years-sector-endorsed-fda.html', '/journey/ecology-bsc-hons.html', '/journey/ecology-mphil-phd.html', '/journey/mres-ecology-environmental-management.html', '/journey/ecology-degrees.html', '/journey/education-ma.html', '/journey/education-mphil-phd.html', '/journey/mres-education.html', '/journey/education-studies-ba-hons.html', '/journey/education-studies-degrees.html', '/journey/english-language-studies-ba-hons.html', '/journey/english-language-degrees.html', '/journey/english-literature-ba-hons.html', '/journey/english-literature-and-language-mphil-phd.html', '/journey/english-literature-degrees.html', '/journey/entrepreneurship-ba-hons-wbs.html', '/journey/environmental-science-bsc-hons.html', '/journey/environmental-science-degrees.html', '/journey/environmental-studies-science-mphil-phd.html', '/journey/film-production-ba-hons.html', '/journey/film-production-degrees.html', '/journey/film-studies-ba-hons.html', '/journey/film-studies-mphil-phd.html', '/journey/film-studies-degrees.html', '/journey/fine-art-practice-ba-hons.html', '/journey/fine-art-mres.html', '/journey/fine-art-degrees.html', '/journey/football-business-management-coaching-fdsc.html', '/journey/forensic-psychology-bsc-hons.html', '/journey/forensic-and-applied-biology-bsc-hons.html', '/journey/free-general-english-classes.html', '/journey/french-module.html', '/journey/game-art-ba-hons.html', '/journey/game-art-degrees.html', '/journey/general-english-classes-advanced.html', '/journey/general-english-classes-english-foreign-language.html', '/journey/geography-bsc-hons.html', '/journey/geography-degrees.html', '/journey/german-module.html', '/journey/graphic-design-ba-hons.html', '/journey/graphic-design-multimedia-degrees.html', '/journey/green-media-mres.html', '/journey/health-sciences-bsc-hons.html', '/journey/health-and-social-care-fdsc.html', '/journey/higher-education-mapgdippgcert.html', '/journey/history-ba-hons.html', '/journey/history-mphil-phd.html', '/journey/history-mres.html', '/journey/history-degrees.html', '/journey/human-biology-bsc-hons.html', '/journey/human-biology-mbiol-integrated-masters.html', '/journey/human-biology-mphil-phd.html', '/journey/human-biology-degrees.html', '/journey/human-geography-ba-hons.html', '/journey/huma-geography-mphil-phd.html', '/journey/human-nutrition-bsc-hons.html', '/journey/human-nutrition-degrees.html', '/journey/human-resource-management-ma.html', '/journey/management-human-resources-msc.html', '/journey/ielts-preparation-classes-english-foreign-language.html', '/journey/illustration-ba-hons.html', '/journey/illustration-degrees.html', '/journey/language-module-improving-english-in-academic-language.html', 
'/journey/language-module-improving-english-in-academic-writing-non-native-speakers.html', '/journey/integrated-working-children-families-ba-hons-top-up-degree.html', '/journey/integrative-counselling-ba-hons.html', '/journey/integrative-counselling-fda.html', '/journey/international-business-management-ba-hons-wbs.html', '/journey/international-finance-ba-hons-top-up-wbs.html', '/journey/management-msc.html', '/journey/international-sport-management-msc.html', '/journey/introduction-to-teaching-english-foreign-language.html', '/journey/intro-teaching-english-as-a-foreign-language-lang1001.html', '/journey/introduction-to-tefl-language-awareness-lang1012.html', '/journey/introduction-to-tefl-lang1013.html', '/journey/italian-module.html', '/journey/japanese-module.html', '/journey/journalism-ba-hons.html', '/journey/journalism-degrees.html', '/journey/21020.html', '/journey/language-awareness-and-analysis-tefl-module.html', '/journey/uwic-programme-law.html', '/journey/law-llb-hons.html', '/journey/law-with-criminology-llb-hons.html', '/journey/law-with-forensic-psychology-llb-hons.html', '/journey/leadership-and-management-fda-ba.html', '/journey/leading-culture-change-in-safeguarding-pgcert.html', '/journey/leading-early-years-practice-pgcert.html', '/journey/learning-support-fda.html', '/journey/learning-and-development-early-years-to-adolescence-fda.html', '/journey/teaching-and-learning-in-higher-education-pg-cert.html', '/journey/master-of-business-administration-mba.html', '/journey/mba-executive-leadership-and-management-part-time.html', '/journey/marketing-ba-hons-wbs.html', '/journey/marketing-advertising-and-public-relations-ba-hons-wbs.html', '/journey/mathematics-mphil-phd.html', '/journey/mathematics-bsc-hons.html', '/journey/media-and-cultural-studies-ba-hons.html', '/journey/media-culture-degrees.html', '/journey/media-and-cultural-studies-mphil-phd.html', '/journey/mental-health-fdsc.html', '/journey/mentoring-in-early-childhood-pgcert.html', '/journey/midwifery-bsc-hons.html', '/journey/midwifery-mphil-phd.html', '/journey/music-education-ba-mmus.html', '/journey/national-award-senco-nasc-special-educational-needs-coordination-pg-cert.html', '/journey/nursing-bsc-hons.html', '/journey/nursing-mphil-phd.html', '/journey/nursing-studies-bsc-hons.html', '/journey/nutrition-and-health-access-module.html', '/journey/nutritional-therapy-msc.html', '/journey/occupational-therapy-bsc-hons.html', '/journey/occupational-therapy-mphil-phd.html', '/journey/business-psychology-msc-occupational-psychology-msc.html', '/journey/open-short-language-courses.html', '/journey/outdoor-adventure-leadership-management-bsc-hons.html', '/journey/outdoor-education-ma.html', '/journey/supervision-pgcert.html', '/journey/postgraduate-certificate-in-education-pgce-primary.html', '/journey/pgce-primary-mathematics.html', '/journey/pgce-primary-physical-education.html', '/journey/pgce-school-direct-primary.html', '/journey/pgce-school-direct-secondary.html', '/journey/postgraduate-certificate-in-education-pgce-secondary.html', '/journey/paramedic-science-bsc-hons.html', '/journey/pharmacology-bsc-hons.html', '/journey/physical-education-bsc-hons.html', '/journey/physical-education-and-dance-ba-hons.html', '/journey/physical-education-and-outdoor-education-bsc-hons.html', '/journey/physical-education-degrees.html', '/journey/physical-geography-bsc-hons.html', '/journey/physical-geography-mphil-phd.html', '/journey/physician-associate-msc.html', '/journey/physiotherapy-bsc-hons.html', 
'/journey/plant-biology-mphil-phd.html', '/journey/politics-ba-hons.html', '/journey/positive-psychology-coaching-pgcert.html', '/journey/primary-initial-teacher-education-ba-hons.html', '/journey/primary-outdoor-education-ba-hons.html', '/journey/professional-practice-ba-hons-top-up-degree .html', '/journey/psychology-bsc-hons.html', '/journey/psychology-mphil-phd.html', '/journey/psychology-msc.html', '/journey/psychology-degrees.html', '/journey/public-health-msc.html', '/journey/religion-philosophy-and-values-in-education-ba-hons.html', '/journey/master-research-river-science.html', '/journey/uwic-programme-science-health-social-science.html', '/journey/screenwriting-ba-hons.html', '/journey/screenwriting-degrees.html', '/journey/social-work-social-policy-mphil-phd.html', '/journey/social-work-ba-hons.html', '/journey/social-work-ma.html', '/journey/master-research-socio-cultural-studies-sport-exercise.html', '/journey/sociology-ba-hons.html', '/journey/sociology-mphil-phd.html', '/journey/sociology-degrees.html', '/journey/special-educational-needs-disabilities-inclusion-ba-hons.html', '/journey/sport-exercise-psychology-bsc-hons.html', '/journey/uwic-programme-sport-exercise-science.html', '/journey/sport-exercise-science-bsc-hons.html', '/journey/sport-exercise-science-mphil-phd.html', '/journey/sport-business-management-bsc-hons.html', '/journey/sport-development-coaching-ba-hons.html', '/journey/sport-coaching-physical-education-hnd.html', '/journey/sports-coaching-msc.html', '/journey/sports-coaching-science-bsc-hons.html', '/journey/sports-coaching-science-degrees.html', '/journey/sports-coaching-science-with-disability-sport-bsc-hons.html', '/journey/sports-studies-bsc-hons.html', '/journey/sports-studies-degrees.html', '/journey/sports-therapy-bsc-hons.html', '/journey/subject-knowledge-enhancement-ske.html', '/journey/teaching-english-celta-course.html', '/journey/teaching-and-learning-fda.html', '/journey/theatre-performance-mres.html', '/journey/mtheatre-touring-theatre-integrated-masters.html', '/journey/understanding-domestic-and-sexual-violence-ma.html', '/journey/university-diploma-in-leadership-and-management.html', '/journey/university-diploma-in-academic-tutoring.html', '/journey/web-development-bsc-hons.html' ] for i in Lists: fullurl = base_url % i start_urls.append(fullurl) rules = ( Rule(LinkExtractor(allow=(r'.*'), restrict_xpaths=('//*[@id="aToZ"]/ul/li/a')), callback='parse_item', follow=True), # Rule(LinkExtractor(allow=r'/courses/.*'),callback='parse_item',follow=True), Rule(LinkExtractor( allow=(r'.*'), restrict_xpaths=( '//*[@class="box__inner box__inner--purple"]//a')), callback='parse_item', follow=False), ) # def parse(self, response): # if self.start_urls == 'https://www.worcester.ac.uk/courses/archaeology-heritage-studies-and-art-design-ba-hons.html': # link_list = response.xpath('//*[@id="#content"]/div/div/section//a/@href') # print("======================++++++++++++++++++++++++++++++++") # print(link_list) # print("======================++++++++++++++++++++++++++++++++") # for i in link_list: # link = "https://www.worcester.ac.uk" + str(i) # yield scrapy.Request(link, callback=self.parse_item) # else: # print('错误页面') def parse_item(self, response): print('==================================', response.url) item = HooliItem() url = response.url print(1, url) university = 'University of Worcester' print(2, university) country = 'UK' city = 'NULL' website = 'https://www.worcester.ac.uk' department = 'NULL' programme_s = response.xpath( 
'//*[@id="content"]/div/h1//text()').extract() # programme = response.xpath('//section[@class="pageHead"]/h1/text()').extract() programme_s = ''.join(programme_s) if len(programme_s) > 0: programme = programme_s else: programme = 'NULL' print(3, programme) ucas_code = 'NULL' degree_level = '' degree_type = response.xpath( '//*[@id="content"]/div/h1//text()').extract() # degree_type = response.xpath('//section[@class="pageHead"]/h1/text()').extract() degree_type = ''.join(degree_type) degree_type = self.getDegree_type(degree_type) print(4, degree_type) start_date = 'NULL' # start_date = ''.join(start_date) # print(5,start_date) degree_description = 'NULL' overview = response.xpath( '//div[@class="left logo-bg"]//text()').extract() # overview = response.xpath('//div[@class="body-copy"]/ul/li/text()').extract() overview = ''.join(overview) print(5, overview) mode = 'NULL' # mode = ''.join(mode).replace('\r\n','') # mode = mode.replace('\n','') # mode = mode.replace(' ','') # print(7,mode) duration = 'NULL' # duration = ''.join(duration).replace('\r\n','') # duration = duration.replace('\n','') # duration = duration.replace(' ','') # print(8,duration) modules = response.xpath( '//*[@id="section-3"]/div/table/tbody//text()').extract() modules = ''.join(modules) # modules = modules.replace('\n','') print(6, modules) teaching = response.xpath( '//*[@id="section-4"]/div/div//text()').extract() teaching = ''.join(teaching) print(7.7, teaching) assessment = response.xpath( '//*[@id="section-4"]/div/div//text()').extract() assessment = ''.join(assessment) print(7, assessment) career = response.xpath( '//*[@id="section-5"]/div/div//text()').extract() career = ''.join(career) print(8, career) application_date = 'NULL' deadline = 'NULL' application_fee = 'NULL' tuition_fee_s = response.xpath( '//*[@id="section-6"]//text()').extract()[33:37] tuition_fee_s = ''.join(tuition_fee_s) tuition_fee_s = tuition_fee_s.replace('\r\n', '') tuition_fee_s = tuition_fee_s.replace(' ', '') tuition_fee_s = self.getTuition_fee(tuition_fee_s) try: if tuition_fee_s > 0: tuition_fee = tuition_fee_s else: tuition_fee = "NULL" except: tuition_fee = "报错!" print(9, tuition_fee) location = 'worcester' # location = ''.join(location) # print(13,location) GPA = 'NULL' ATAS = 'NULL' average_score = 'NULL' accredited_university = 'NULL' Alevel = 'NULL' IB = 'NULL' IELTS_s = response.xpath('//*[@id="section-2"]//p/text()').extract() IELTS_s = ''.join(IELTS_s) # IELTS = re.findall('(IELTS:|IELTS)? (.*){0,5} \d?.\d? .{0,70}',IELTS) try: if " IELTS" in IELTS_s: start = IELTS_s.find(" IELTS") IELTS = IELTS_s[start:] IELTS = IELTS[:100] item["IELTS"] = IELTS else: IELTS = "NULL" except: IELTS = "报错!" print(10, IELTS) # IELTS = ''.join(IELTS) # # IELTS = re.findall('(IELTS:|IELTS)? (.*){0,5} \d?.\d? 
.{0,70}',IELTS) # print(10, IELTS) IELTS_L = 'NULL' IELTS_S = 'NULL' IELTS_R = 'NULL' IELTS_W = 'NULL' TOEFL = 'NULL' TOEFL_L = 'NULL' TOEFL_S = 'NULL' TOEFL_R = 'NULL' TOEFL_W = 'NULL' GRE = 'NULL' GMAT = 'NULL' LSAT = 'NULL' MCAT = 'NULL' working_experience = 'NULL' interview = 'NULL' portfolio = 'NULL' application_documents = 'NULL' how_to_apply = response.xpath('//*[@id="section-7"]//text()').extract() how_to_apply = ''.join(how_to_apply) print(11, how_to_apply) entry_requirements = response.xpath( '//*[@id="section-2"]/div//text()').extract() entry_requirements = ''.join(entry_requirements) print(12, entry_requirements) chinese_requirements = 'NULL' school_test = 'NULL' SATI = 'NULL' SATII = 'NULL' SAT_code = 'NULL' ACT = 'NULL' ACT_code = 'NULL' other = 'NULL' create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(15, create_time) item["url"] = url item["university"] = university item["country"] = country item["city"] = city item["website"] = website item["department"] = department item["programme"] = programme item["ucas_code"] = ucas_code item["degree_level"] = degree_level item["degree_type"] = degree_type item["start_date"] = start_date item["degree_description"] = degree_description item["overview"] = overview item["mode"] = mode item["duration"] = duration item["modules"] = modules item["teaching"] = teaching item["assessment"] = assessment item["career"] = career item["application_date"] = application_date item["deadline"] = deadline item["application_fee"] = application_fee item["tuition_fee"] = tuition_fee item["location"] = location item["ATAS"] = ATAS item["GPA"] = GPA item["average_score"] = average_score item["accredited_university"] = accredited_university item["Alevel"] = Alevel item["IB"] = IB item["IELTS"] = IELTS item["IELTS_L"] = IELTS_L item["IELTS_S"] = IELTS_S item["IELTS_R"] = IELTS_R item["IELTS_W"] = IELTS_W item["TOEFL"] = TOEFL item["TOEFL_L"] = TOEFL_L item["TOEFL_S"] = TOEFL_S item["TOEFL_R"] = TOEFL_R item["TOEFL_W"] = TOEFL_W item["GRE"] = GRE item["GMAT"] = GMAT item["LSAT"] = LSAT item["MCAT"] = MCAT item["working_experience"] = working_experience item["interview"] = interview item["portfolio"] = portfolio item["application_documents"] = application_documents item["how_to_apply"] = how_to_apply item["entry_requirements"] = entry_requirements item["chinese_requirements"] = chinese_requirements item["school_test"] = school_test item["SATI"] = SATI item["SATII"] = SATII item["SAT_code"] = SAT_code item["ACT"] = ACT item["ACT_code"] = ACT_code item["other"] = other item["create_time"] = create_time yield item def getTuition_fee(self, tuition_fee): allfee = re.findall(r'\d+,\d+', tuition_fee) # print(allfee) for index in range(len(allfee)): fee = allfee[index].split(",") allfee[index] = ''.join(fee) # print(allfee[index]) # print(allfee) maxfee = 0 for fee in allfee: if int(fee) >= maxfee: maxfee = int(fee) return maxfee def getDegree_type(self, degree_type): try: if "BSc" in degree_type: degree_type = 'Bsc' elif "MSc" in degree_type: degree_type = "MSc" elif "BA" in degree_type: degree_type = 'BA' elif "MNSW" in degree_type: degree_type = 'MNSW' elif "PGCert" in degree_type: degree_type = 'PGCert' elif "MBA" in degree_type: degree_type = 'MBA' elif "MA" in degree_type: degree_type = 'MA' elif "MComp" in degree_type: degree_type = 'MComp' elif "PhD" in degree_type: degree_type = 'PhD' elif "FdA" in degree_type: degree_type = 'FdA' elif "PGCE" in degree_type: degree_type = 'PGCE' elif "IFP" in degree_type: degree_type = 'IFP' elif "LLB" 
in degree_type: degree_type = 'LLB' elif "MHealth Res" in degree_type: degree_type = 'MHealth Res' elif "MRes" in degree_type: degree_type = 'MRes' elif "MMed" in degree_type: degree_type = 'MMed' elif "MSci" in degree_type: degree_type = 'MSci' elif "MCh" in degree_type: degree_type = 'MCh' elif "LLM" in degree_type: degree_type = "LLM" elif "Y2QF" in degree_type: degree_type = "Y2QF" elif "Y2QG" in degree_type: degree_type = "Y2QG" elif "HND" in degree_type: degree_type = 'HND' elif len(degree_type) == 0: degree_type = 'NULL' else: degree_type = 'Ordinary degree' except: degree_type = "NULL" return degree_type
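# getTuition_fee above pulls every comma-grouped number out of the fee text and keeps
# the largest one. The same idea as a compact, self-contained helper; the function
# name and the sample string are illustrative only.
import re

def max_fee(text):
    """Return the largest comma-grouped amount found in `text`, or 0 if none."""
    amounts = [int(m.replace(',', '')) for m in re.findall(r'\d+,\d+', text)]
    return max(amounts, default=0)

# max_fee('Home: 9,250 GBP per year / International: 13,100 GBP per year')  -> 13100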
from scrapy.linkextractors import LinkExtractor XPATH = { 'name': "//h1[@itemprop='name']", 'price': "//p[@class='price']/span[@class='woocommerce-Price-amount amount']", 'category': "", 'description': "", 'images': "//div[@class='images']/a[@itemprop='image']/@href", 'canonical': "//link[@rel='canonical']/@href", 'base_url': "", 'brand': "", 'in_stock': "", 'guarantee': "", 'promotion': "" } name = 'ilahui.vn' allowed_domains = ['ilahui.vn'] start_urls = ['http://ilahui.vn/'] tracking_url = '' sitemap_urls = [''] sitemap_rules = [('', 'parse_item')] sitemap_follow = [''] rules = [ Rule( LinkExtractor(allow=['/san-pham/[a-zA-Z0-9-]+/$'], deny=['/san-pham/($|page/\d+/$)']), 'parse_item'), Rule(LinkExtractor(allow=['/san-pham/($|page/\d+/$)']), 'parse'), #Rule(LinkExtractor(), 'parse_item_and_links'), ]
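# The template above relies on `allow` matching product detail URLs while `deny`
# filters the /san-pham/ listing and its pagination. Such rule pairs can be checked
# offline against a small in-memory response, as sketched here; the anchor URLs in
# the sample HTML are made up.
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = (
    b'<a href="http://ilahui.vn/san-pham/binh-giu-nhiet-500ml/">detail</a>'
    b'<a href="http://ilahui.vn/san-pham/page/2/">listing page</a>'
)
response = HtmlResponse(url='http://ilahui.vn/san-pham/', body=html, encoding='utf-8')
le = LinkExtractor(allow=[r'/san-pham/[a-zA-Z0-9-]+/$'],
                   deny=[r'/san-pham/($|page/\d+/$)'])
print([link.url for link in le.extract_links(response)])
# Expected: only the product detail URL survives the deny pattern.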
class X10jqkaSpider(CrawlSpider):
    name = '10jqka'
    source = "同花顺财经"
    allowed_domains = ["10jqka.com.cn"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
    reg = yesterday
    start_urls = [
        'http://news.10jqka.com.cn/today_list/',
        'http://news.10jqka.com.cn/cjzx_list/',
        'http://news.10jqka.com.cn/cjkx_list/',
        'http://news.10jqka.com.cn/guojicj_list/',
        'http://news.10jqka.com.cn/jrsc_list/',
        'http://news.10jqka.com.cn/fssgsxw_list/',
        'http://news.10jqka.com.cn/region_list/',
        'http://news.10jqka.com.cn/gat_list/',
        'http://news.10jqka.com.cn/fortune_list/',
        'http://news.10jqka.com.cn/cjrw_list/',
        'http://news.10jqka.com.cn/dzxf_list/'
    ]
    rules = (
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
        Rule(LinkExtractor(allow='_[1-2]+.shtml')),
        #Rule(LinkExtractor(allow='_[0-9]+.shtml'))
    )

    def printcn(self, uni):
        for i in uni:
            print(i.encode('utf-8'))

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # NOTE: remember to return the item after parsing
        return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath('//div[@class="atc-head"]/h1/text()').extract()
        if title:
            item['title'] = ''.join(title).strip()

    def get_date(self, response, item):
        date = response.xpath('//*[@id="pubtime_baidu"]/text()').extract()
        if date:
            item['date'] = ''.join(date).replace(u'-', u'').replace(
                u':', u'').replace(u' ', u'').strip()

    def get_body(self, response, item):
        paras = response.xpath('//div[@class="atc-content"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    # print(line.encode('utf-8'))
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
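# get_body above flattens each <p> with string(.), strips the line breaks inside it,
# and joins paragraphs with a '_|_' separator, presumably so a later stage can split
# them again. Roughly the same transformation as a stand-alone helper; the name and
# the separator default are illustrative.
def join_paragraphs(paragraph_selectors, sep='_|_'):
    parts = []
    for p in paragraph_selectors:
        text = ''.join(p.xpath('string(.)').getall())
        compact = ''.join(line.strip() for line in text.splitlines())
        if compact:
            parts.append(compact)
    return sep.join(parts)

# Inside a callback:
# item['body'] = join_paragraphs(response.xpath('//div[@class="atc-content"]/p'))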
class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']
    """
    LinkExtractor is the main mechanism for extracting the URLs to follow.
    """
    rules = (
        # Rule(LinkExtractor(allow=("zhaopin/.*",)), follow = True),
        # Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'), callback='parse_job', follow=True),
    )
    """
    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    # def start_requests(self):
    #     # Use selenium to simulate a login, then hand the cookies to scrapy's requests
    #     # 1. Log in via selenium
    #     # Read the cookies from a file
    #     cookies = []
    #     driver_path = BASE_DIR + "/LagouSpider/Driver/chromedriver"
    #     browser = webdriver.Chrome(executable_path=driver_path)
    #     browser.get("https://passport.lagou.com/login/login.html")
    #     if os.path.exists(BASE_DIR + "/LagouSpider/cookies/lagou.cookie"):
    #         cookies = pickle.load(open(BASE_DIR + "/cookies/lagou.cookie", "rb"))
    #         for cookie in cookies:
    #             browser.add_cookie(cookie)
    #         browser.get("https://www.lagou.com/")
    #
    #     if not cookies:
    #         browser.get("https://passport.lagou.com/login/login.html")
    #         browser.find_element_by_css_selector(".form_body .input.input_white").send_keys("*****@*****.**")
    #         browser.find_element_by_css_selector('.form_body input[type="password"]').send_keys("123456")
    #         browser.find_element_by_css_selector('div[data-view="passwordLogin"] input.btn_lg').click()
    #         import time
    #         time.sleep(15)
    #         cookies = browser.get_cookies()
    #         # Write the cookies back to the file
    #         pickle.dump(cookies, open(BASE_DIR + "/LagouSpider/cookies/lagou.cookie", "wb"))
    #     cookie_dict = {}
    #     for cookie in cookies:
    #         cookie_dict[cookie["name"]] = cookie["value"]
    #
    #     for url in self.start_urls:
    #         yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
    """

    def parse_job(self, response):
        """
        Callback for job detail pages.
        :param response:
        :return:
        """
        # Create the ItemLoader that defines how the fields are parsed.
        item_loader = LagouItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css('title', '.job-name::attr(title)')  # job title
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('salary', '.job_request .salary::text')
        item_loader.add_xpath('job_city', '//*[@class="job_request"]/h3/span[2]/text()')
        item_loader.add_xpath('work_years', '//*[@class="job_request"]/h3/span[3]/text()')
        item_loader.add_xpath('degree_need', '//*[@class="job_request"]/h3/span[4]/text()')
        item_loader.add_xpath('job_type', '//*[@class="job_request"]/h3/span[5]/text()')
        item_loader.add_css('tags', '.position-label li::text')
        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_css('job_advantage', '.job-advantage p::text')
        item_loader.add_css('job_desc', '.job_bt div')
        item_loader.add_css('job_addr', '.work_addr')
        item_loader.add_css('company_name', '.job_company dt a img::attr(alt)')
        item_loader.add_value('crawl_time', datetime.now())
        job_item = item_loader.load_item()
        return job_item
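# parse_job above fills a project-specific ItemLoader field by field and then calls
# load_item(). A stripped-down sketch of that ItemLoader pattern with a generic item;
# the item class, processors and field set here are assumptions, not the project's own.
import scrapy
from scrapy.loader import ItemLoader
from itemloaders.processors import MapCompose, TakeFirst


class DemoJobItem(scrapy.Item):
    title = scrapy.Field()
    salary = scrapy.Field()
    url = scrapy.Field()


class DemoJobLoader(ItemLoader):
    default_item_class = DemoJobItem
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()


def parse_job(response):
    loader = DemoJobLoader(response=response)
    loader.add_css('title', '.job-name::attr(title)')
    loader.add_css('salary', '.job_request .salary::text')
    loader.add_value('url', response.url)
    return loader.load_item()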
# Auto generated by generator.py. Delete this line if you make modification. from scrapy.spiders import Rule from scrapy.linkextractors import LinkExtractor XPATH = { 'name' : "//div[@id='colcenter']/div[@id='content']/div/h1/a/span", 'price' : "//div[@class='contentContainer']/div[@class='contentText']/h2/span/span/span", 'category' : "//div[@id='midle']/div[@class='navigation']/a", 'description' : "//div[@class='boxproduct']/div[@class='boxproductcontent']/p | //div[@id='ViEtDeVdIvId']/p | //div[@id='ViEtDeVdIvId']/table", 'images' : "//div[@class='contentText']/div[@id='piGal']/div/a/@href | //div[@class='contentText']/div[@id='piGal']/div/a/img/@src | //div/a[@class='fancyLink']/img/@src | //div[@id='piGal']/div/a/@href | //div[@id='piGal']/div/a/img/@src", 'canonical' : "", 'base_url' : "", 'brand' : "" } name = 'sieuthimayvanphong.com.vn' allowed_domains = ['sieuthimayvanphong.com.vn'] start_urls = ['http://sieuthimayvanphong.com.vn/'] tracking_url = '' sitemap_urls = [''] sitemap_rules = [('', 'parse_item')] sitemap_follow = [] rules = [ Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+-p+\d+\.qmc']), 'parse_item'), Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+-c+\d+\.qmc']), 'parse'), #Rule(LinkExtractor(), 'parse_item_and_links'), ]
class TheiconicComAu(BasePortiaSpider): name = "www.theiconic.com.au" allowed_domains = [u'www.theiconic.com.au'] start_urls = [ u'http://www.theiconic.com.au/nike-power-legend-women-s-high-rise-training-tights-488916.html' ] rules = [ Rule(LinkExtractor(allow=(u'.html', u'/sale/'), deny=()), callback='parse_item', follow=True) ] items = [[ Item(PortiaItem, None, u'.product-information', [ Field( u'Cat_1', '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(1) > .ga-track-link-click *::text', []), Field( u'Cat_2', '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(2) > .ga-track-link-click *::text', []), Field( u'Cat_3', '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(3) > .ga-track-link-click *::text', []), Field( u'Image', '.medium-7 > .product-gallery > div:nth-child(3) > .small-12 > .main-image-frame > .img > .owl-wrapper-outer > .owl-wrapper > div:nth-child(1) > .image-wrapper > .image-frame > img::attr(src)', []), Field( u'Brand', '.medium-5 > .main > .item-details > .product-info > .product-title > .small-12 > .product-name > .brand-title > a *::text', []), Field( u'Name', '.medium-5 > .main > .item-details > .product-info > .product-title > .small-12 > .product-name > span:nth-child(2) *::text', []), Field( u'Price', '.medium-5 > .main > .item-details > .product-info > .product-price > .small-12 > .price *::text', [], True), Field( u'Colour', '.medium-5 > .main > .item-details > .add-to-bag > .row > .small-12 > .ti-dropdown > .dropdown > span > .color-name *::text', []), Field( u'Size', '.medium-5 > .main > .item-details > .add-to-bag > .ng-pristine > div:nth-child(1) > .small-7 > .ti-dropdown > .f-dropdown *::text', []), Field( u'Description', '.medium-5 > .main > .item-details > .accordion > dd *::text', []) ]), Item(PortiaItem, None, u'.product-information', [ Field( u'Cat_1', '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(1) > .ga-track-link-click *::text', []), Field( u'Cat_2', '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(2) > .ga-track-link-click *::text', []), Field( u'Cat_3', '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(3) > .ga-track-link-click *::text', []), Field( u'Image', '.medium-7 > .product-gallery > div:nth-child(3) > .small-12 > .main-image-frame > .img > .owl-wrapper-outer > .owl-wrapper > div:nth-child(1) > .image-wrapper > .image-frame > img::attr(src)', []), Field( u'Name', '.medium-5 > .main > .item-details > .product-info > .product-title > .small-12 > .product-name > span:nth-child(2) *::text', []), Field( u'Colour', '.medium-5 > .main > .item-details > .product-info > .product-price > .small-12 *::text', []), Field( u'PriceRrp', '.medium-5 > .main > .item-details > .product-info > .product-price > .small-12 > .original *::text', []), Field( u'PriceSale', '.medium-5 > .main > .item-details > .product-info > .product-price > .small-12 > .final *::text', []), Field( u'Size', '.medium-5 > .main > .item-details > .add-to-bag > .ng-pristine > div:nth-child(1) > .small-7 > .ti-dropdown > .f-dropdown *::text', []), Field(u'Description', '.medium-5 > .main > .item-details > .accordion *::text', []) ]) ]]
# Auto generated by generator.py. Delete this line if you make modification. from scrapy.spiders import Rule from scrapy.linkextractors import LinkExtractor XPATH = { 'name': "//div[@class='product-title']/h1", 'price': "//div[@class='Row Price']/div[@class='ProductPrice VariationProductPrice']", 'category': "//div[@class='Breadcrumb']/ul/li/a", 'description': "//div[@class='ProductTabs']/div[@class='Block Panel ProductDescription']", 'images': "//div[@class='ProductThumb']/div[@class='ProductThumbImage']/a/@href", 'canonical': "//link[@rel='canonical']/@href", 'base_url': "", 'brand': "" } name = 'aligro.vn' allowed_domains = ['aligro.vn'] start_urls = ['http://aligro.vn/'] tracking_url = '' sitemap_urls = [''] sitemap_rules = [('', 'parse_item')] sitemap_follow = [] rules = [ Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+-\d+\.html$']), 'parse_item'), Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+-b\d+\.html($|\?pn=\d+)']), 'parse'), #Rule(LinkExtractor(), 'parse_item_and_links'), ]
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='sanpham']/div/h4/strong",
    'price': "//div[@class='pro_detail']/div[@class='cat_pirce']/h4",
    'category': "//div[@class='link_menu']/a",
    'description': "//div[@class='sanpham']/div[@class='noidung_sp']",
    'images': "//div/div[@class='sanpham']/div[@class='img_detail_view']/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'bibicare.vn'
allowed_domains = ['bibicare.vn']
start_urls = ['http://bibicare.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/detail-product-view-+[a-zA-Z0-9-_]+\.html']), 'parse_item'),
    Rule(LinkExtractor(allow=['/menu-product-+[a-zA-Z0-9-_]+\.html']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
# Auto generated by generator.py. Delete this line if you make modification. from scrapy.spiders import Rule from scrapy.linkextractors import LinkExtractor XPATH = { 'name': "//h3[@class='productname']", 'price': "//div[@class='price']/h3[@id='our_price_display']", 'category': "//ol[@class='breadcrumb']/li/a", 'description': "//div[@class='description']", 'images': "//div[@id='thumbs_list']/ul//a/@href", 'canonical': "", 'base_url': "", 'brand': "" } name = 'coophomeshopping.vn' allowed_domains = ['coophomeshopping.vn'] start_urls = ['http://coophomeshopping.vn/'] tracking_url = '' sitemap_urls = [''] sitemap_rules = [('', 'parse_item')] sitemap_follow = [] rules = [ Rule(LinkExtractor(allow=['/[a-zA-Z0-9-/]+\.html$']), 'parse_item'), Rule(LinkExtractor(allow=['/[a-zA-Z0-9-/]+($|/#/page-\d+)']), 'parse'), #Rule(LinkExtractor(), 'parse_item_and_links'), ]
class DogSpider(CrawlSpider): name = 'environment' allowed_domains = [ 'topontiki.gr', 'popaganda.gr', 'lifo.gr', 'naftemporiki.gr', 'kathimerini.gr', 'cnn.gr', 'protagon.gr', 'iefimerida.gr', ] url = [ 'https://popaganda.gr/newstrack/environment/', 'https://www.naftemporiki.gr/green', 'https://www.cnn.gr/', 'https://www.protagon.gr/epikairotita/', 'https://www.iefimerida.gr', ] topontiki_urls = [ 'http://www.topontiki.gr/category/perivallon?page={}'.format(x) for x in range(0, TOPONTIKI_VARS['ENVIRONMENT_PAGES']) ] lifo_urls = [ 'https://www.lifo.gr/now/perivallon/page:{}'.format(x) for x in range(1, LIFO_VARS['ENVIRONMENT_PAGES']) ] kathimerini_urls = [ 'https://www.kathimerini.gr/box-ajax?id=b1_1885015423_1194114316&page={}' .format(x) for x in range(0, KATHIMERINI_VARS['ENVIRONMENT_PAGES']) ] urls = url + kathimerini_urls + lifo_urls + topontiki_urls start_urls = urls[:] rules = ( Rule(LinkExtractor(allow=('topontiki.gr/article/'), deny=('binteo', 'videos', 'gallery', 'eikones', 'twit')), callback='parse_topontiki', follow=True, process_request='process_topontiki'), Rule(LinkExtractor(allow=(r'popaganda\.gr.+newstrack/'), deny=('binteo', 'videos', 'gallery', 'eikones', 'twit', 'comment')), callback='parse_popaganda', follow=True, process_request='process_popaganda'), Rule(LinkExtractor(allow=(r'www\.lifo\.gr.+perivallon'), deny=('binteo', 'videos', 'gallery', 'eikones', 'twit', 'comment')), callback='parse_lifo', follow=True, process_request='process_lifo'), Rule(LinkExtractor(allow=(r'www\.lifo\.gr.+environment_articles'), deny=('binteo', 'videos', 'gallery', 'eikones', 'twit', 'comment')), callback='parse_lifo', follow=True, process_request='process_lifo'), Rule(LinkExtractor( allow=(r'\.naftemporiki\.gr/story|\.naftemporiki\.gr/storypn'), deny=('binteo', 'videos', 'gallery', 'eikones', 'twit')), callback='parse_naftemporiki', follow=True, process_request='process_naftemporiki'), Rule(LinkExtractor( allow=(r"\.kathimerini\.gr.+epikairothta/perivallon/"), deny=('binteo', 'videos', 'gallery', 'eikones', 'twit')), callback='parse_kathimerini', follow=True, process_request='process_kathimerini'), Rule(LinkExtractor(allow=('https://www.iefimerida.gr/green'), deny=('binteo', 'videos', 'gallery', 'eikones', 'twit')), callback='parse_iefimerida', follow=True, process_request='process_iefimerida'), Rule(LinkExtractor(allow=('cnn.gr/news/perivallon')), callback='parse_cnn', follow=True, process_request='process_cnn'), Rule(LinkExtractor(allow=('protagon.gr/epikairotita/'), deny=('binteo', 'videos', 'gallery', 'eikones', 'twit')), callback='parse_protagon', follow=True, process_request='process_protagon'), ) def parse_cnn(self, response): global cnn_counter #check if we are in an articles url title = response.xpath('//h1[@class="story-title"]/text()').get() if title is not None and cnn_counter < 300: #get the article's text text = response.xpath( '//div[@class="story-content"]//p/text()|//div[@class="story-content"]//strong/text()|//div[@class="story-content"]//a/text()' ).getall() list_to_string = " ".join(" ".join(text)) markspaces = re.sub(" ", "space", list_to_string) uneede_spaces = re.sub(" ", "", markspaces) final_text = re.sub("space", " ", uneede_spaces) clear_characters = re.sub("\xa0", "", final_text) date = re.sub( r'\n|\t', "", response.xpath( '//div[@class="story-date story-credits icon icon-time"]/text()' ).get()) final_date = formatdate(date) url = response.url if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH']: cnn_counter += 1 yield { "subtopic": 
GENERAL_CATEGORIES['ENVIRONMENT'], "website": CNN_VARS['WEBSITE'], "title": title, "article_date": final_date, "author": re.sub( r'\n|\t', "", response.xpath( '//div[@class="story-author"]/text()').get()), "article_body": re.sub(r'\n|\t', "", clear_characters), "url": url, } def process_cnn(self, request): global cnn_counter if cnn_counter < 300: return request def parse_protagon(self, response): global protagon_counter #check if we are in an articles url title = response.xpath('//h1[@class="entry-title"]/text()').get() if title is not None and protagon_counter < 300: #check if we are in the correct category sub = response.xpath('//span[@class="s_roumpr"]/a/text()').get() if sub == PROTAGON_VARS['ENVIRONMENT']: #get the article's text text = response.xpath( '//div[@class="left-single-column "]//p/text()|//div[@class="left-single-column "]//strong/text()|//div[@class="left-single-column "]//p/*/text()' ).getall() list_to_string = " ".join(" ".join(text)) markspaces = re.sub(" ", "space", list_to_string) uneede_spaces = re.sub(" ", "", markspaces) final_text = re.sub("space", " ", uneede_spaces) clear_characters = re.sub("\xa0", "", final_text) #flag to see later on if we have tweets ect flag = re.search(r"@", clear_characters) url = response.url author = re.findall( r"(\w+).(\w+)", response.xpath( '//strong[@class="generalbold uppercase"]/a/text()'). get()) list_to_tuple = author[0] author = ' '.join(list_to_tuple) date = response.xpath( '//span[@class="generalight uppercase"]/text()').get() final_date = formatdate(date) #check if we are in an article and that it doesn't have images if len(clear_characters) > GENERAL_CATEGORIES[ 'ALLOWED_LENGTH'] and flag is None: protagon_counter += 1 yield { "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'], "website": PROTAGON_VARS['WEBSITE'], "title": title, "article_date": final_date, "author": author, "article_body": re.sub(r'\s\s\s', "", clear_characters), "url": url, } def process_protagon(self, request): global protagon_counter if protagon_counter < 300: return request def parse_iefimerida(self, response): global iefimerida_counter #check if we are in an articles url title = response.xpath('//h1/span/text()').get() if title is not None and iefimerida_counter < 300: #get the article's text text = response.xpath( '//div[@class="field--name-body on-container"]//p/text()|//div[@class="field--name-body on-container"]/strong/text()|//div[@class="field--name-body on-container"]//p/*/text()|//div[@class="field--name-body on-container"]//p//li/text()' ).getall() list_to_string = " ".join(" ".join(text)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) final_text = re.sub("space", " ", uneeded_spaces) clear_characters = re.sub("\xa0", "", final_text) #flag to see later on if we have tweets ect flag = re.search(r"@", clear_characters) url = response.url date = response.xpath('//span[@class="created"]/text()').get() final_date = formatdate(date) #check if we are in an article and that it doesn't have images if len(final_text ) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None: iefimerida_counter += 1 yield { "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'], "website": IEFIMERIDA_VARS['AUTHOR'], "title": title, "article_date": final_date, "author": IEFIMERIDA_VARS['AUTHOR'], "article_body": re.sub(r'\s\s\s|\n', "", final_text), "url": url, } def process_iefimerida(self, request): global iefimerida_counter if iefimerida_counter < 300: return request def parse_kathimerini(self, response): global kathimerini_counter 
#check if we are in an articles url title = response.xpath('//h2[@class="item-title"]/text()').get() if title is not None and kathimerini_counter < 300: list_to_string = " ".join(" ".join(title)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) put_spaces_back = re.sub("space", " ", uneeded_spaces) final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back) #get the article's text text = response.xpath( '//div[@class="freetext"]//p/text()|//div[@class="freetext"]//strong/text()|//div[@class="freetext"]//h3/text()|//div[@class="freetext"]//p/*/text()' ).getall() list_to_string = " ".join(" ".join(text)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) final_text = re.sub("space", " ", uneeded_spaces) clear_characters = re.sub("\xa0", "", final_text) date = response.xpath('//time/text()').get() final_date = formatdate(date) #flag to see later on if we have tweets ect flag = re.search(r"@", clear_characters) url = response.url author = response.xpath( '//span[@class="item-author"]/a/text()').get() if author == KATHIMERINI_VARS['CATEGORY_AUTHOR']: author = KATHIMERINI_VARS['AUTHOR'] #check if we are in an article and that it doesn't have images if len(final_text ) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None: kathimerini_counter += 1 yield { "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'], "website": KATHIMERINI_VARS['AUTHOR'], "title": final_title, "article_date": final_date, "author": author, "article_body": re.sub(r'\s\s\s|\n', "", final_text), "url": url, } def process_kathimerini(self, request): global kathimerini_counter if kathimerini_counter < 300: return request def parse_naftemporiki(self, response): global naftemporiki_counter #check if we are in an articles url title = response.xpath('//h2[@id="sTitle"]/text()').get() if title is not None and naftemporiki_counter < 300: #check if we are in the correct category subtopic = response.xpath( '//span[@itemprop="articleSection"]/text()').get() if subtopic == NAFTEMPORIKI_VARS['CATEGORY_ENVIRONMENT']: #fix the title's format list_to_string = " ".join(" ".join(title)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) put_spaces_back = re.sub("space", " ", uneeded_spaces) final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back) #get the article's text text = response.xpath( '//div[@class="entityMain article"]//p/text()|//div[@class="entityMain article"]/p/strong/text()|//div[@class="entityMain article"]//h3/text()|//div[@class="entityMain article"]//p/*/text()' ).getall() list_to_string = " ".join(" ".join(text)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) final_text = re.sub("space", " ", uneeded_spaces) clear_characters = re.sub("\xa0", "", final_text) date = response.xpath('//div[@class="Date"]/text()').get() final_date = formatdate(date) #flag to see later on if we have tweets ect flag = re.search(r"@", clear_characters) url = response.url #check if we are in an article and that it doesn't have images if len(final_text) > GENERAL_CATEGORIES[ 'ALLOWED_LENGTH'] and flag is None: naftemporiki_counter += 1 yield { "subtopic": response.xpath( '//div[@class="Breadcrumb"]/a[2]/text()').get(), "website": NAFTEMPORIKI_VARS['AUTHOR'], "title": final_title, "article_date": final_date, "author": NAFTEMPORIKI_VARS['AUTHOR'], "article_body": re.sub(r'\s\s\s|\n', "", final_text), "url": url, } def process_naftemporiki(self, request): global naftemporiki_counter if 
naftemporiki_counter < 300: return request def parse_lifo(self, response): global lifo_counter #check if we are in an articles url title = response.xpath( '//h1[@itemprop="headline"]/text()|//meta[@itemprop="headline"]/text()|//h1/*/text()' ).get() if title is not None and lifo_counter < 300: #fix the title's format list_to_string = " ".join(" ".join(title)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) put_spaces_back = re.sub("space", " ", uneeded_spaces) final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back) date = response.xpath('//time/text()').get() final_date = formatdate(date) #get the article's text text = response.xpath( '//div[@class="clearfix wide bodycontent"]//p/text()|//div[@class="clearfix wide bodycontent"]/p/strong/text()|//div[@class="clearfix wide bodycontent"]//h3/text()|//div[@class="clearfix wide bodycontent"]//p/*/text()' ).getall() list_to_string = " ".join(" ".join(text)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) final_text = re.sub("space", " ", uneeded_spaces) clear_characters = re.sub("\xa0", "", final_text) author = response.xpath( '//div[@class="author"]/a/text()|//div[@itemprop="author"]/*/text()' ).get() if author == None: author = LIFO_VARS['AUTHOR'] #flag to see later on if we have tweets ect flag = re.search(r"@", clear_characters) url = response.url #check if we are in an article and that it doesn't have images if len(clear_characters ) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None: lifo_counter += 1 yield { "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'], "website": LIFO_VARS['AUTHOR'], "title": final_title, "article_date": final_date, "author": author, "article_body": re.sub(r'\s\s\s|\n', "", clear_characters), "url": url, } def process_lifo(self, request): global lifo_counter if lifo_counter < 300: return request def parse_popaganda(self, response): global popaganda_counter #check if we are in an articles url title = response.xpath('//h1/text()').get() if title is not None and popaganda_counter < 300: #check if we are in the correct category category = response.xpath( '//div[@class="category"]/a/text()').get() if category == POPAGANDA_VARS['CATEGORY_ENVIRONMENT']: #fix the title's format list_to_string = " ".join(" ".join(title)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) put_spaces_back = re.sub("space", " ", uneeded_spaces) final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back) #get the article's text text = response.xpath( '//div[@class="post-content newstrack-post-content"]//p/text()|//div[@class="post-content newstrack-post-content"]/p/strong/text()|//div[@class="post-content newstrack-post-content"]//h3/text()|//div[@class="post-content newstrack-post-content"]//p/*/text()' ).getall() list_to_string = " ".join(" ".join(text)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) final_text = re.sub("space", " ", uneeded_spaces) clear_characters = re.sub("\xa0", "", final_text) author = response.xpath( '//div[@class="author"]/a/text()|//div[@itemprop="author"]/*/text()' ).get() if author == None: author = POPAGANDA_VARS['WEBSITE'] date = response.xpath( '//div[@class="date"]/text()|//div[@class="fullscreen-date"]/text()' ).get() final_date = formatdate(date) #flag to see later on if we have tweets ect flag = re.search(r"@", clear_characters) url = response.url #check if we are in an article and that it doesn't have images if 
len(clear_characters) > GENERAL_CATEGORIES[ 'ALLOWED_LENGTH'] and flag is None: popaganda_counter += 1 yield { "subtopic": POPAGANDA_VARS['ENVIRONMENT'], "website": POPAGANDA_VARS['WEBSITE'], "title": final_title, "article_date": final_date, "author": POPAGANDA_VARS['WEBSITE'], "article_body": re.sub(r'\s\s\s|\n', "", clear_characters), "url": url, } def process_popaganda(self, request): global popaganda_counter if popaganda_counter < 300: return request def parse_topontiki(self, response): global topontiki_counter #check if we are in an articles url title = response.xpath('//h1/text()').get() if title is not None and topontiki_counter < 300: #check if we are in the correct category sub = response.xpath('//h2/a/text()').get() if sub == TOPONTIKI_VARS['CATEGORY_ENVIRONMENT']: #fix the title's format list_to_string = " ".join(" ".join(title)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) put_spaces_back = re.sub("space", " ", uneeded_spaces) final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back) #get the article's text text = response.xpath( '//div[@class="field-item even"]//p/text()|//div[@class="field-item even"]//p/*/text()|//div[@class="field-item even"]//p//span/text()' ).getall() list_to_string = " ".join(" ".join(text)) markspaces = re.sub(" ", "space", list_to_string) uneeded_spaces = re.sub(" ", "", markspaces) final_text = re.sub("space", " ", uneeded_spaces) clear_characters = final_text.replace("\xa0", "") date = response.xpath('//span[@class="date"]/text()').get() final_date = formatdate(date) #flag to see later on if we have tweets ect flag = re.search(r"@", clear_characters) url = response.url #check if we are in an article and that it doesn't have images if len(clear_characters) > GENERAL_CATEGORIES[ 'ALLOWED_LENGTH'] and flag is None: topontiki_counter += 1 yield { "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'], "website": TOPONTIKI_VARS['WEBSITE'], "title": final_title, "article_date": final_date, "author": TOPONTIKI_VARS['WEBSITE'], "article_body": re.sub(r'\s\s\s|\n', "", clear_characters), "url": url, } def process_topontiki(self, request): global topontiki_counter if topontiki_counter < 300: return request
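# Every parser above repeats the same cleanup dance: join the extracted text nodes,
# mark spaces with a placeholder, strip the leftover whitespace, restore the spaces,
# and remove non-breaking spaces. The intent appears to be plain whitespace
# normalisation; a compact helper that does that directly is sketched below (the
# function name is illustrative, and whether it matches the placeholder version
# byte-for-byte depends on the exact space patterns used).
import re

def normalize_article_text(text_nodes):
    """Join extracted text nodes, drop non-breaking spaces, collapse whitespace."""
    text = ' '.join(text_nodes).replace('\xa0', '')
    return re.sub(r'\s+', ' ', text).strip()

# normalize_article_text(['First  paragraph\n', 'second \xa0 paragraph'])
# -> 'First paragraph second paragraph'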
class MenHoodiesSpider(SplashCrawlSpider): name: str = "men_hoodies" allowed_domains: List[str] = ["www.dresslily.com"] start_urls: List[str] = [ "https://www.dresslily.com/hoodies-c-181-page-1.html", ] item_reviews_page_url: str = "https://www.dresslily.com/m-review-a-view_review-goods_id-{product_id}-page-{page_num}.html" rules: Tuple[Rule, ...] = ( Rule( LinkExtractor(allow=r"page-[0-9]+"), ), Rule( LinkExtractor(allow=r"product[0-9]+"), callback="parse_hoodie", ), ) splash_args: Dict[str, Any] = { "wait": 3.0, "images": 0, } # Hoodie selectors NAME_XPATH: str = "//h1/span[@class='goodtitle']/text()" DISCOUNT_CSS: str = "span.off.js-dl-cutoff > span ::text" INFO_KEYS_XPATH: str = "//div[@class='xxkkk']/div//strong/text()" INFO_VALUES_XPATH: str = "//div[@class='xxkkk20']/text()" TOTAL_REVIEWS_XPATH: str = "//*[@id='js_reviewCountText']/text()" ORIGINAL_PRICE_WITHOUT_DISCOUNT_CSS: str = ( "span.curPrice.my-shop-price.js-dl-curPrice ::attr(data-orgp)" ) ORIGINAL_PRICE_WITH_DISCOUNT_CSS: str = "span.js-dl-marketPrice.marketPrice.my-shop-price.dl-has-rrp-tag > span.dl-price ::text" # Hoodie reviews selectors REVIEWS_LIST_XPATH: str = "//div[@class='reviewinfo']" RATING_SELECTED_STARS_XPATH: str = ( ".//p[@class='starscon_b dib']/i[@class='icon-star-black']" ) TIMESTAMP_XPATH: str = ".//span[@class='reviewtime']/text()" TEXT_XPATH: str = ".//p[@class='reviewcon']/text()" SIZE_XPATH: str = ".//p[@class='color-size']/span[1]/text()" COLOR_XPATH: str = ".//p[@class='color-size']/span[2]/text()" TIMESTAMP_FORMAT: str = "%b,%d %Y %H:%M:%S" REVIEWS_BY_PAGE_COUNT: int = 6 def parse_hoodie( self, response: HtmlResponse ) -> Union[Iterable[Item], Iterable[ScrapyRequest]]: product_url: str = response.meta["real_url"] product_id: int = int( product_url.split("product")[-1].replace(".html", "") ) name: str = response.xpath(self.NAME_XPATH).get("") original_price: float = float( response.css(self.ORIGINAL_PRICE_WITHOUT_DISCOUNT_CSS).get(0.0) ) discounted_price: float = 0.0 discount: int = int(response.css(self.DISCOUNT_CSS).get(0)) if discount: discounted_price = original_price original_price = float( response.css(self.ORIGINAL_PRICE_WITH_DISCOUNT_CSS).getall()[ -1 ] ) product_info_keys: List[str] = response.xpath( self.INFO_KEYS_XPATH ).getall() product_info_values: List[str] = response.xpath( self.INFO_VALUES_XPATH ).getall()[1::2] product_info: str = "".join( [ f"{k.strip()}{v.strip()};" for (k, v) in zip(product_info_keys, product_info_values) ] ) total_reviews: int = int( response.xpath(self.TOTAL_REVIEWS_XPATH).get(0) ) yield MenHoodieItem( product_id=product_id, product_url=product_url, name=name, discount=discount, discounted_price=discounted_price, original_price=original_price, total_reviews=total_reviews, product_info=product_info, ) if total_reviews > 0: yield from self.parse_reviews_pages( product_id=product_id, total_reviews=total_reviews, ) def parse_reviews_pages( self, product_id: int, total_reviews: int ) -> Iterable[ScrapyRequest]: reviews_left: int = total_reviews page_num: int = 1 while reviews_left > 0: # No need in loading js yield ScrapyRequest( url=self.item_reviews_page_url.format( product_id=product_id, page_num=page_num, ), callback=self.parse_reviews, cb_kwargs={"product_id": product_id}, ) reviews_left -= self.REVIEWS_BY_PAGE_COUNT page_num += 1 def parse_reviews( self, response: HtmlResponse, product_id: int ) -> Iterable[Item]: reviews: List[HtmlResponse] = response.xpath(self.REVIEWS_LIST_XPATH) for review in reviews: rating: int = len( 
review.xpath(self.RATING_SELECTED_STARS_XPATH).getall() ) time: str = review.xpath(self.TIMESTAMP_XPATH).get("") timestamp: float = ( mktime( datetime.strptime(time, self.TIMESTAMP_FORMAT).timetuple() ) if time else 0.0 ) text: str = review.xpath(self.TEXT_XPATH).get("") size: str = review.xpath(self.SIZE_XPATH).get(": ").split(": ")[-1] color: str = ( review.xpath(self.COLOR_XPATH).get(": ").split(": ")[-1] ) yield ReviewItem( product_id=product_id, rating=rating, timestamp=timestamp, text=text, size=size, color=color, )
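# parse_reviews_pages above walks the review pages by decrementing reviews_left by
# the fixed page size (6 reviews per page), which amounts to requesting
# ceil(total_reviews / REVIEWS_BY_PAGE_COUNT) pages. A small worked illustration
# (the helper name is not part of the spider):
from math import ceil

REVIEWS_BY_PAGE_COUNT = 6

def review_page_numbers(total_reviews):
    return list(range(1, ceil(total_reviews / REVIEWS_BY_PAGE_COUNT) + 1))

# review_page_numbers(13) -> [1, 2, 3]   (6 + 6 + 1 reviews)
# review_page_numbers(6)  -> [1]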