def parse_items_links(self, response):
    categories_links_extractor = LinkExtractor(
        restrict_xpaths='.//div[@class="categoryListContainer"]')
    cat_links = categories_links_extractor.extract_links(response)
    for link in cat_links:
        yield Request(url=link.url, callback=self.parse_items_links)

    items_links_extractor = LinkExtractor(
        restrict_xpaths='.//div[@class="directory-listing"]/h3')
    items_links = items_links_extractor.extract_links(response)
    for link in items_links:
        yield Request(url=link.url, callback=self.parse_item)

    pagination_link = cond_set_value(
        response.xpath('.//a[@class="more"]/@href').extract())
    if pagination_link:
        full_pagination_link = urljoin(self.start_urls[0], pagination_link)
        yield Request(url=full_pagination_link, callback=self.parse_items_links)
def parse_categories(self, response):
    request_again = self.error_handler(response)
    if request_again:
        yield request_again
        return

    categories_extractor = LinkExtractor(
        restrict_xpaths='.//ul[@class="popTermsList"]')
    categories_links = categories_extractor.extract_links(response)
    for link in categories_links:
        yield Request(url=link.url, callback=self.get_items_and_pagination)

    letters_extractor = LinkExtractor(
        restrict_xpaths='.//div[@class="popTermsNavBar"]')
    letters_links = letters_extractor.extract_links(response)
    for link in letters_links:
        yield Request(url=link.url, callback=self.parse_categories)
def parse_newspage(self, response):
    # self.log('We are parsing news page: %s' % response.url)
    linkextractor = LinkExtractor(allow_domains=self.allowed_domains,
                                  restrict_xpaths=self.LINK_XPATHLIST)
    for link in linkextractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_myarticle)
def _parse_category(self, response):
    category = response.meta['category']
    parent = response.meta.get('parent', {})

    category['catid'] = self._get_catid()
    category['url'] = response.url
    category['parent_text'] = parent.get('text')
    category['parent_url'] = parent.get('url')
    category['parent_catid'] = parent.get('catid')
    category['grandparent_text'] = parent.get('parent_text')
    category['grandparent_url'] = parent.get('parent_url')
    category['level'] = parent.get('level', 0) + 1
    category['department_text'] = response.meta['department']['text']
    category['department_url'] = response.meta['department']['url']
    category['department_id'] = response.meta['department']['catid']

    description_text = first(
        response.xpath(self._xpath_description_text).extract())
    if description_text:
        category['description_text'] = description_text
        category['description_wc'] = len(Utils.normalize_text(description_text))

    keywords = first(response.xpath(self._xpath_keywords).extract())
    if description_text and keywords:
        (category['keyword_count'],
         category['keyword_density']) = Utils.phrases_freq(keywords,
                                                           description_text)

    if category.get('nr_products') is None:
        nr_products = re_find(
            r'\d+',
            first(response.css(self._css_product_numbers_text).extract()))
        category['nr_products'] = int(nr_products) if nr_products is not None else None

    subcategory_links = LinkExtractor(restrict_xpaths=self._xpath_category_links)
    for link in subcategory_links.extract_links(response):
        text, nr_products = re.search(r'(.+?) \((\d+)\) *', link.text).groups()
        child = CategoryItem(text=text, nr_products=int(nr_products))
        meta = {'category': child,
                'department': response.meta['department'],
                'parent': category}
        yield Request(link.url, callback=self._parse_category, meta=meta)

    yield category
def get_items_and_pagination(self, response):
    request_again = self.error_handler(response)
    if request_again:
        yield request_again
        return

    items_extractor = LinkExtractor(deny=[r'\/image\/', r'\/map'],
                                    restrict_xpaths='.//div[@class="itemInfo"]/h2')
    items_links = items_extractor.extract_links(response)
    for link in items_links:
        yield Request(url=link.url, callback=self.parse_item)

    if response.xpath('.//a[@class="next"]').extract():
        total_quantity = response.xpath(
            '(.//div[@class="pageResults"]/span[@class="results"]'
            '/text()[normalize-space()])[2]').re(r'\d+')
        if total_quantity:
            total_quantity = int(total_quantity[0])
            # 25 results per page; integer division keeps range() happy
            pages = total_quantity // 25
            page_range = range(1, pages + 2)
            category = cond_set_value(response.xpath(
                './/input[@id="FrmWho"]/@value').extract())
            quoted_category = quote_plus(category)
            for page in page_range:
                next_url = self.pagination_pattern.format(prase=quoted_category,
                                                          page=page)
                yield Request(url=next_url, headers=self.pagination_headers,
                              dont_filter=True, method='POST',
                              callback=self.parse_pagination)
def get_companies_links(self, response):
    companies_link_extractor = LinkExtractor(allow=r'\/company_\d{5,7}')
    companies_links = companies_link_extractor.extract_links(response)
    for link in companies_links:
        yield Request(url=link.url,
                      callback=self.parse_item,
                      # cookies=None,
                      meta={'category': response.meta.get('category')})
def _extract_links(self, response, params):
    """
    Parse links from a response.
    @return list of scrapy.link.Link objects
    """
    params['allow_domains'] = tuple(self.allowed_domains)
    link_extractor = LinkExtractor(**params)
    return link_extractor.extract_links(response)
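# Hypothetical caller for the helper above (the callback name and allow
# pattern are assumptions, not from the source), showing how the params
# dict is forwarded straight into LinkExtractor:
#
#     def parse(self, response):
#         for link in self._extract_links(response, {'allow': [r'/item_\d+']}):
#             yield Request(link.url, callback=self.parse_item)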
class FollowAllSpider(Spider):

    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse, dont_filter=True)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = Page(url=response.url,
                    size=str(len(response.body)),
                    referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = Selector(response).xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0]
                       for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
def parse(self, response):
    le = LinkExtractor()
    for link in le.extract_links(response):
        url = urljoin(response.url, link.url)
        yield scrapy.Request(url, self.parse_link, meta={
            'splash': {
                'args': {'har': 1, 'html': 0},
            }
        })
def extract_links(self, response):
    # The parent class can do most of the work for us
    links = LinkExtractor.extract_links(self, response)
    try:
        # keep only links whose anchor text is purely numeric
        good_links = [link for link in links if link.text.isdigit()]
    except TypeError:
        return None
    return good_links
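# A minimal self-contained sketch (assumed context, not from the source) of
# how a filtering LinkExtractor subclass like the one above is typically
# wired up: CrawlSpider rules accept any object exposing
# extract_links(response), so the subclass drops in where a plain
# LinkExtractor would go. The spider name and URL are hypothetical.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DigitLinkExtractor(LinkExtractor):
    """Keep only links whose anchor text is purely numeric."""

    def extract_links(self, response):
        links = LinkExtractor.extract_links(self, response)
        return [link for link in links if link.text and link.text.isdigit()]


class PagesSpider(CrawlSpider):  # hypothetical spider
    name = 'pages'
    start_urls = ['http://example.com/']
    rules = (Rule(DigitLinkExtractor(), callback='parse_page', follow=True),)

    def parse_page(self, response):
        yield {'url': response.url}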
def get_categories(self, response):
    # e.g. http://www.construction.co.uk/double-glazing-repairs/category_33.htm
    link_extractor = LinkExtractor(allow=r'\/category_\d+')
    links = link_extractor.extract_links(response)
    for link in links:
        category = link.text
        yield Request(url=link.url,
                      callback=self.get_companies_links_by_letter,
                      meta={'category': category})
def _populate_related_products(self, response, product):
    xpath = ('//ul[contains(@class, "might_like")]/li/'
             'div[contains(@class, "product_description")]/a')
    extractor = LinkExtractor(restrict_xpaths=xpath)
    products = [
        RelatedProduct(url=urljoin(response.url, link.url),
                       title=link.text.strip())
        for link in extractor.extract_links(response)
    ]
    cond_set_value(product, 'related_products',
                   {'You might also like': products})
def get_companies_links_by_letter(self, response):
    # e.g. http://www.construction.co.uk/heating-contractors-and-consultants/22_A.htm
    letter_link_extractor = LinkExtractor(allow=r'\/\d+_[A-Z]\.htm')
    links_by_letter = letter_link_extractor.extract_links(response)
    if links_by_letter:
        for link in links_by_letter:
            yield Request(url=link.url,
                          callback=self.get_companies_links,
                          meta={'category': response.meta.get('category')})
    else:
        # there is no letter pagination on this page
        for request in self.get_companies_links(response):
            yield request
class startPageSpider(Spider):

    name = 'startPageSpider'

    def __init__(self, taskId, *a, **kw):
        """Constructor"""
        self.name += '_' + str(taskId)
        super(startPageSpider, self).__init__(*a, **kw)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.redis = redis.Redis(connection_pool=pool)
        self.dbUtils = db.dbUtils()
        self.taskId = int(taskId)
        self.project = None
        self.domain = None
        self.hasCrawlSet = set()
        self.hasInsertSet = set()
        project = self.dbUtils.queryRow(
            'SELECT * FROM project_setting WHERE iStatus=1 AND iPid=%d'
            % self.taskId)
        if project:
            self.project = project
            self.start_urls = str(project['szStartUrl']).split('~')
            self.domain = ".".join(
                urlparse(project['szDomain']).hostname.split(".")[-2:])

    def parse(self, response):
        print 'startPageSpider==========================>', response.url
        listQuqueCount = self.redis.llen(
            'scrapy:startPageSpider:listQuque:%s' % self.taskId)
        if listQuqueCount == 1:
            self._crawler.signals.send_catch_log('writeListQuque')
        elif listQuqueCount == 0:
            self._crawler.signals.send_catch_log('emptyListQuque')
            print 'startPageSpider---------send_catch_log->emptyListQuque'
        if response.url not in self.hasCrawlSet:
            pattern = re.compile(r'%s' % self.project['szStartUrlReg'])
            self.hasCrawlSet.add(response.url)
            if pattern.match(response.url) and response.url not in self.hasInsertSet:
                title = "|".join(
                    response.xpath('/html/head/title/text()').extract())
                insertSql = ('INSERT INTO project_start_page'
                             '(iPid, szUrl, szTitle, dtLastScrapyTime) '
                             'VALUES(%d, "%s", "%s", "%s")'
                             % (self.taskId, response.url, title,
                                time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime(time.time()))))
                self.dbUtils.insert(insertSql)
                self.hasInsertSet.add(response.url)
                self.redis.lpush(
                    'scrapy:startPageSpider:listQuque:%s' % self.taskId,
                    response.url)
                log.msg(format='spider=startPageSpider iPid=%(i)s, title=%(t)s url=%(u)s',
                        i=self.taskId, t=title, u=response.url)
            _allow = (_allow for _allow in self.project['szStartUrlReg'].split('~'))
            self.linkExtractor = LinkExtractor(allow_domains=self.domain,
                                               allow=_allow)
            links = [link for link in self.linkExtractor.extract_links(response)
                     if link.url not in self.hasCrawlSet]
            for link in links:
                yield self.make_requests_from_url(link.url)
def parse(self, response):
    link = LinkExtractor(
        allow='/baojie/.*',
        restrict_xpaths='//dl[@class="selitem selitem-area clearfix"]'
                        '/dd[@class="posrelative w-area"]/a')
    links = link.extract_links(response)
    for i in links:
        self.city_name = re.split(r'\/', i.url)[-2]
        yield Request(i.url,
                      callback=self.get_index,
                      meta={'city_name': self.city_name,
                            'dont_redirect': True},
                      dont_filter=True)
def get_index(self, response):
    city_name = response.meta['city_name']
    link = LinkExtractor(
        restrict_xpaths='//div[@class="leftBox"]//div[@class="list"]'
                        '/ul/li[@class="list-img"]/div[@class="pic"]/a')
    links = link.extract_links(response)
    for i in links:
        yield Request(i.url,
                      callback=self.get_message,
                      meta={'city_name': city_name,
                            'dont_redirect': True},
                      dont_filter=True)
def parse(self, response):
    # restrict links to the allowed domain and to URLs containing vt=1
    ln_extractor = LinkExtractor(allow_domains=("news.sina.cn",),
                                 allow=(".*vt=1.*",))
    links = ln_extractor.extract_links(response)
    urls = []
    items = []
    for i in links:
        urls.append(i.url)
        # every not-yet-visited URL goes into the container and the queue
        if i.url not in self.g_container_urls:
            self.g_queue_urls.put(i.url)
            self.g_container_urls.add(i.url)
    # build the requests for everything in the queue; each URL is requested
    # twice, once with the page-parsing callback and once with the default one
    for j in range(self.g_queue_urls.qsize()):
        tp_url = self.g_queue_urls.get()
        items.append(self.make_requests_from_url(tp_url)
                     .replace(callback=self.parse_page))
        items.append(self.make_requests_from_url(tp_url))
    return items
def parse(self, response):
    xlink = LinkExtractor()
    itemre = re.compile(self.itemurl_re)
    for link in xlink.extract_links(response):
        if itemre.search(link.url):
            yield Request(url=link.url, callback=self.parse_item)
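# Aside (not from the source): LinkExtractor can apply the URL regex itself
# via its allow parameter, which is matched with re.search against each
# absolute URL, so the manual filter above folds into the extractor:
#
#     xlink = LinkExtractor(allow=self.itemurl_re)
#     for link in xlink.extract_links(response):
#         yield Request(url=link.url, callback=self.parse_item)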
def parse(self, response):
    # extract_links must be called on a LinkExtractor instance,
    # not on the class itself
    self.links.extend(LinkExtractor().extract_links(response))
def parse_index(self, response):
    l = LinkExtractor(allow=(r'/wiki/.*\.html',),
                      restrict_xpaths=('//div[@id="wrapper"]//div[@id="content"]',))
    for link in l.extract_links(response):
        yield Request(link.url, callback=self.parse_pages)
def parse(self, response):
    dpt_links = LinkExtractor(restrict_xpaths=self._xpath_department_links)
    for link in dpt_links.extract_links(response):
        category = CategoryItem(text=link.text.strip(' \t\n'))
        yield Request(link.url, callback=self._parse_category,
                      meta={'category': category, 'department': category})
def parse_level3_contents(self, response):
    baseurl = response.xpath('//base/@href').extract()[0]  # note: unused below
    le = LinkExtractor()
    for link in le.extract_links(response):
        if self.allowed_domains[0] in link.url:
            yield Request(link.url, callback=self.final_contents)
class TopicalFinder(SplashSpiderBase):

    name = 'topical_finder'
    save_html = None
    use_splash = None

    def __init__(self, seed_urls=None, save_html=1, use_splash=1,
                 screenshot_dir='/memex-pinterest/ui/static/images/screenshots',
                 op_time=10, **kwargs):
        '''
        Constructs a spider instance from the command line or the scrapyd daemon.

        :param seed_urls: comma-separated list of URLs; if empty, the crawler
            follows not-yet-crawled URLs from storage
        :param save_html: boolean 0/1
        :param use_splash: boolean 0/1
        :param screenshot_dir: used only when use_splash=1
        :param op_time: operating time in minutes; negative disables the constraint
        '''
        super(TopicalFinder, self).__init__(screenshot_dir=screenshot_dir, **kwargs)
        self.screenshot_dir = screenshot_dir
        log.msg("SCREENSHOT DIR IS SET TO: %s" % str(screenshot_dir),
                _level=log.DEBUG)
        if seed_urls:
            self.start_urls = [add_scheme_if_missing(url)
                               for url in seed_urls.split(',')]
        self.ranker = Ranker.load()
        self.linkextractor = LinkExtractor()
        self.save_html = bool(save_html)
        self.use_splash = bool(use_splash)
        self.operating_time = int(op_time) * 60
        self.start_time = datetime.utcnow()
        self.finishing = False

    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url, is_seed=True)

    def make_requests_from_url(self, url, is_seed=False):
        if self.use_splash:
            r = self._splash_request(url)
        else:
            r = super(TopicalFinder, self).make_requests_from_url(url)
        r.meta['score'] = 0.0
        r.meta['is_seed'] = False
        if is_seed:
            r.meta['is_seed'] = True
            r.meta['score'] = 1.0  # maximum score value for seeds
        log.msg("Making request to %s with meta: %s" % (r.url, str(r.meta)),
                _level=log.DEBUG)
        return r

    def set_crawler(self, crawler):
        super(TopicalFinder, self).set_crawler(crawler)
        self.crawler.signals.connect(self.spider_idle,
                                     signal=signals.spider_idle)

    def spider_idle(self):
        log.msg("Spider idle signal caught.", _level=log.DEBUG)
        raise DontCloseSpider

    def parse(self, response):
        ld = self._load_webpage_item(response, is_seed=response.meta['is_seed'])
        if self.use_splash:
            self._process_splash_response(response, ld)
        yield ld.load_item()

        if self.finishing:
            return

        now = datetime.utcnow()
        if self.operating_time > 0 and \
                (now - self.start_time).total_seconds() > self.operating_time:
            log.msg("Reached operating time constraint. "
                    "Waiting for Scrapy queue to exhaust.")
            self.finishing = True
            self.crawler.stop()
            return

        if not isinstance(response, TextResponse):
            return

        body = response.body_as_unicode().strip().encode('utf8') or '<html/>'
        score = self.ranker.score_html(body)
        log.msg("TC: %s has score=%f" % (response.url, score), _level=log.DEBUG)
        if score > 0.5:
            # FIXME: for some reason this is returning the raw Splash response
            # JSON and not the rendered HTML from Splash. Can something like
            # the sketch after this class fix it? Seems like a hack...
            for link in self.linkextractor.extract_links(response):
                log.msg("****---LINK EXTRACTED: %s----***" % str(link.url),
                        _level=log.DEBUG)
                if self.use_splash:
                    r = self._splash_request(url=link.url)
                else:
                    r = Request(url=link.url)
                external = is_external_url(response.url, link.url)
                depth = response.meta.get('link_depth', 0)
                r.meta.update({
                    'link': {'url': link.url,
                             'text': link.text,
                             'fragment': link.fragment,
                             'nofollow': link.nofollow},
                    'link_depth': 0 if external else depth + 1,
                    'referrer_depth': depth,
                    'referrer_url': response.url,
                })
                url_parts = urlparse_cached(r)
                path_parts = url_parts.path.split('/')
                r.meta['score'] = 1.0 / len(path_parts)
                r.meta['is_seed'] = False
                yield r

    def _load_webpage_item(self, response, is_seed):
        depth = response.meta.get('link_depth', 0)
        ld = WebpageItemLoader(response=response)
        ld.add_value('url', response.url)
        ld.add_value('host', get_domain(response.url))
        ld.add_xpath('title', '//title/text()')
        ld.add_value('depth', depth)
        ld.add_value('total_depth', response.meta.get('depth'))
        ld.add_value('crawled_at', datetime.utcnow())
        ld.add_value('is_seed', is_seed)
        ld.add_value('crawler_score', response.meta['score'])
        if self.save_html:
            ld.add_value('html', response.body_as_unicode())
        if 'link' in response.meta:
            link = response.meta['link']
            ld.add_value('link_text', link['text'])
            ld.add_value('link_url', link['url'])
            ld.add_value('referrer_url', response.meta['referrer_url'])
            ld.add_value('referrer_depth', response.meta['referrer_depth'])
        return ld
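# A hedged sketch (not from the source) addressing the FIXME above: when
# Splash returns a JSON result, link extraction has to run against the
# rendered HTML carried inside that JSON, not against the raw JSON body.
# Assuming the Splash request included 'html': 1 in its args, one option is
# to rebuild an HtmlResponse around the 'html' field before extracting.
import json

from scrapy.http import HtmlResponse


def links_from_splash_json(linkextractor, response):
    """Extract links from the rendered HTML inside a Splash JSON reply."""
    data = json.loads(response.body_as_unicode())
    html = data.get('html')
    if not html:
        return []  # Splash was not asked for HTML; nothing to extract from
    rendered = HtmlResponse(url=response.url, body=html,
                            encoding='utf8', request=response.request)
    return linkextractor.extract_links(rendered)

# Hypothetical use inside parse():
#     for link in links_from_splash_json(self.linkextractor, response):
#         ...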
def parse(self, response):
    # item2: extract all detail-page links on this page and enqueue them
    item2 = LinkExtractor(allow=r'\/detail', allow_domains='cheng95.com')
    for link in item2.extract_links(response):
        yield Request(url=link.url, priority=2, callback=self.parse,
                      meta={'dont_redirect': True})

    # item1: extract all index-page links on this page and enqueue them
    item1 = LinkExtractor(allow=r'\/positions', allow_domains='cheng95.com',
                          deny=(r'\/detail', r'\/analyze'))
    for link in item1.extract_links(response):
        yield Request(url=link.url, priority=1, callback=self.parse,
                      meta={'dont_redirect': True})

    # If the current page is an index page, parse its total page count
    # (totalpage) and enqueue every remaining page.
    if 'search' in response.url:
        self.logger.debug('index page')
        pattern = re.compile(r'totalpage: (\d*),', re.S)
        total_count = re.findall(pattern, response.text)
        if total_count:
            if 'page' not in response.request.url:
                count = int(total_count[0])
                if count > 500:
                    count = 500
                # print('count:', count)
                for page in range(2, count + 1):
                    next_url = response.request.url + '&page=' + str(page)
                    yield Request(url=next_url, priority=1, callback=self.parse,
                                  meta={'dont_redirect': True})
        else:
            self.logger.debug('index page: IP banned')

    # If the current page is a detail page, parse the fields with XPath.
    elif 'detail' in response.url:
        self.logger.debug('detail page')
        item = Cheng95Item()
        if response.xpath('//div[@class="basic-inner"]/h1/span[1]/text()'
                          ).extract_first() is not None:
            item['url'] = response.request.url
            item['title'] = response.xpath(
                '//div[@class="basic-inner"]/h1/span[1]/text()').extract_first()
            item['company'] = response.xpath(
                '//h2[@class="company-name"]/text()').extract_first('')
            item['salary'] = response.xpath(
                '//div[@class="basic-inner"]/h1/span[2]/text()').extract_first('')
            others = response.xpath(
                '//p[@class="extra-info clearfix"]/span/text()').extract()
            if others:
                # These regexes match Chinese page content: "hiring N people"
                # and the education levels, respectively.
                pattern1 = re.compile(r'招聘\d+人', re.S)
                pattern2 = re.compile(r'中专|高中|本科|专科|硕士|博士', re.S)
                if re.findall(pattern1, str(others)):
                    item['need'] = re.findall(pattern1, str(others))[0]
                else:
                    item['need'] = None
                if re.findall(pattern2, str(others)):
                    item['education'] = re.findall(pattern2, str(others))[0]
                else:
                    item['education'] = None
                item['come_from'] = others[-3]
                item['release_time'] = others[-1]
                item['address'] = others[-5]
            head = response.xpath(
                '//div[@class="position-module position-detail"]'
                '/div[@class="module-hd"]/h3/text()').extract()
            content = response.xpath(
                '//div[@class="position-module position-detail"]'
                '/div[@class="module-bd"]')
            for _ in range(len(head)):
                word = head.pop(0)
                if word == '工作内容':  # "job description"
                    q = content.pop(0).xpath('text()').extract()
                    item['job_content'] = "".join([part.strip() for part in q])
                elif word == '职位要求':  # "job requirements"
                    q = content.pop(0).xpath('text()').extract()
                    item['job_requirement'] = "".join([part.strip() for part in q])
                elif word == '工作地点':  # "job location"
                    item['detail_address'] = content.pop(0).xpath(
                        'text()').extract_first('')
            yield item
        else:
            self.logger.debug('detail page: IP banned')
class listPageSpider(Spider):

    name = 'listPageSpider'

    def __init__(self, taskId, *a, **kw):
        """Constructor"""
        self.name += '_' + str(taskId)
        super(listPageSpider, self).__init__(*a, **kw)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.redis = redis.Redis(connection_pool=pool)
        self.dbUtils = db.dbUtils()
        self.taskId = int(taskId)
        self.domain = None
        self.project = None
        self.hasCrawlSet = set()
        self.hasInsertSet = set()
        self.isExit = 0
        project = self.dbUtils.queryRow(
            'SELECT * FROM project_setting WHERE iStatus=1 AND iPid=%d'
            % self.taskId)
        if project:
            self.project = project
            self.domain = ".".join(
                urlparse(project['szDomain']).hostname.split(".")[-2:])

    def stopSpider(self):
        self.isExit = 1

    def getStartUrl(self):
        url = self.redis.rpop('scrapy:startPageSpider:listQuque')
        if not url:
            # keep polling until a URL appears; the recursive result must be
            # returned, otherwise the caller gets None
            return self.getStartUrl()
        return url

    def start_requests(self):
        while True:
            url = self.redis.rpop(
                'scrapy:startPageSpider:listQuque:%s' % self.taskId)
            if url:
                yield self.make_requests_from_url(url)

    def parse(self, response):
        if response.url not in self.hasCrawlSet:
            self.hasCrawlSet.add(response.url)
            _allow = (_allow for _allow in self.project['szUrlReg'].split('~'))
            self.linkExtractor = LinkExtractor(allow_domains=self.domain,
                                               allow=_allow)
            links = [link for link in self.linkExtractor.extract_links(response)
                     if link.url not in self.hasInsertSet]
            for link in links:
                if link.url in self.hasInsertSet:
                    continue
                insertSql = ('INSERT INTO project_list_page'
                             '(iPid, szUrl, szTitle, szSourceUrl, dtLastScrapyTime) '
                             'VALUES(%d, "%s", "%s", "%s", "%s")'
                             % (self.taskId, link.url, link.text, response.url,
                                time.strftime('%Y-%m-%d %H:%M:%S',
                                              time.localtime(time.time()))))
                self.dbUtils.insert(insertSql)
                self.hasInsertSet.add(link.url)
                log.msg(format='spider=listPageSpider iPid=%(i)s, title=%(t)s url=%(u)s',
                        i=self.taskId, t=link.text, u=link.url)
def parse_urls(self, response):
    # the class name must be a quoted string literal inside contains()
    extractor = LinkExtractor(
        restrict_xpaths=('//table[contains(@class, "ibm-data-table")]/tbody',))
    links = extractor.extract_links(response)
    for link in links:
        yield Request(link.url, callback=self.parse_items)
def parse_categories(self, response):
    l = LinkExtractor(restrict_xpaths='.//div[@class="categoryListContainer"]')
    links = l.extract_links(response)
    for link in links:
        yield Request(url=link.url, callback=self.parse_items_links)
def parse_sitemap(self, response):
    # self.log('We are parsing site map: %s' % response.url)
    linkextractor = LinkExtractor(allow_domains=self.allowed_domains,
                                  restrict_xpaths=self.NEWS_CATEGORY_XPATH)
    for link in linkextractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_newspage)
def _scrape_product_links(self, response):
    link_extractor = LinkExtractor(restrict_xpaths='//div[@id="prodtitle"]')
    links = link_extractor.extract_links(response)
    return [(l.url, SiteProductItem()) for l in links]
def parse(self, response):
    # self.log('We are at URL: %s' % response.url)
    yield scrapy.Request(response.url + 'robots.txt', callback=self.parse_robot)
    linkextractor = LinkExtractor(allow_domains=self.allowed_domains,
                                  restrict_xpaths=self.SITEMAP_XPATH)
    for link in linkextractor.extract_links(response):
        yield scrapy.Request(link.url, callback=self.parse_sitemap)
def parse(self, response):
    ret = []
    url = response.url
    if "login.php" in url:
        # don't crawl login pages
        return None

    domres1 = tldextract.extract(url)
    dm1 = domres1.domain
    path = url.replace(" ", "").replace("/", "_").replace(":", "").replace(".", "")
    title = path
    if response.body:
        content = response.body_as_unicode()
    else:
        content = "NONE"

    # collect every href plus the distinct fully-qualified domains they point to
    links = []
    domains = []
    for x in response.xpath('//*//@href'):
        y = x.extract()
        links.append(y)
        domres = tldextract.extract(y)
        sd = domres.subdomain
        dm = domres.domain
        tld = domres.suffix
        if dm:
            d = "{0}.{1}.{2}".format(sd, dm, tld)
            if d not in domains:
                domains.append(d)

    p = Page(title=title, path=path, url=url, content=content,
             links=links, domains=domains)
    ret.append(p)

    # follow only same-domain links, still skipping login pages
    l = LinkExtractor()
    for x in l.extract_links(response):
        domres = tldextract.extract(x.url)
        dm = domres.domain
        if dm.lower() == dm1.lower():
            print "extracted", x
            if "login.php" in x.url:
                print "skipping login"
            else:
                ret.append(scrapy.http.Request(x.url))
        else:
            print "Skip", domres, domres1, x
    return ret
def parse_urls(self, response):
    extractor = LinkExtractor(
        restrict_xpaths=('//div[contains(@class, "news_type2")]/h2',))
    links = extractor.extract_links(response)
    for link in links:
        yield Request(link.url, callback=self.parse_items)