def parse_items_links(self, response):
        categories_links_extractor = LinkExtractor(restrict_xpaths='.//div[@class="categoryListContainer"]')
        cat_links = categories_links_extractor.extract_links(response)
        for link in cat_links:
            yield Request(url=link.url, callback=self.parse_items_links)

        items_links_extractor = LinkExtractor(restrict_xpaths='.//div[@class="directory-listing"]/h3')
        items_links = items_links_extractor.extract_links(response)
        for link in items_links:
            yield Request(url=link.url, callback=self.parse_item)

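        # follow the relative "more" link, if present, to the next page of this listing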
        pagination_link = cond_set_value(response.xpath('.//a[@class="more"]/@href').extract())
        if pagination_link:
            full_pagination_link = urljoin(self.start_urls[0], pagination_link)
            yield Request(url=full_pagination_link, callback=self.parse_items_links)
Example #2
 def parse_categories(self, response):
     request_again = self.error_handler(response)
     if request_again:
         yield request_again
         return
     categories_extractor = LinkExtractor(
         restrict_xpaths='.//ul[@class="popTermsList"]')
     categories_links = categories_extractor.extract_links(response)
     for link in categories_links:
         yield Request(url=link.url, callback=self.get_items_and_pagination)
     letters_extractor = LinkExtractor(
         restrict_xpaths='.//div[@class="popTermsNavBar"]')
     letters_links = letters_extractor.extract_links(response)
     for link in letters_links:
         yield Request(url=link.url, callback=self.parse_categories)
Example #3
	def parse_newspage(self, response):
		#self.log('We are parsing news page: %s' % response.url)
		
		#self.log(str(xpathlist))
		linkextractor = LinkExtractor(allow_domains=self.allowed_domains, restrict_xpaths = self.LINK_XPATHLIST)
		for link in linkextractor.extract_links(response):
			yield scrapy.Request(link.url, callback=self.parse_myarticle)
Example #4
 def _parse_category(self, response):
     category = response.meta['category']
     parent = response.meta.get('parent', {})
     category['catid'] = self._get_catid()
     category['url'] = response.url
     category['parent_text'] = parent.get('text')
     category['parent_url'] = parent.get('url')
     category['parent_catid'] = parent.get('catid')
     category['grandparent_text'] = parent.get('parent_text')
     category['grandparent_url'] = parent.get('parent_url')
     category['level'] = parent.get('level', 0) + 1
     category['department_text'] = response.meta['department']['text']
     category['department_url'] = response.meta['department']['url']
     category['department_id'] = response.meta['department']['catid']
     #category['description_text'] = self._description_text.first(response)
     description_text = first(response.xpath(self._xpath_description_text).extract())
     if description_text:
         category['description_wc'] = len(Utils.normalize_text(description_text))
     keywords = first(response.xpath(self._xpath_keywords).extract())
     if description_text:
         category['description_text'] = description_text
     if description_text and keywords:
         (category['keyword_count'], category['keyword_density']) = Utils.phrases_freq(keywords, description_text)
     if category.get('nr_products') is None:
         nr_products = re_find('\d+', first(response.css(self._css_product_numbers_text).extract()))
         category['nr_products'] = int(nr_products) if nr_products is not None else None
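     # subcategory link text has the form "Name (count)"; split it into the name and product count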
     subcategory_links = LinkExtractor(restrict_xpaths=self._xpath_category_links)
     for link in subcategory_links.extract_links(response):
         text, nr_products = re.search('(.+?) \((\d+)\) *', link.text).groups()
         nr_products = int(nr_products)
         child = CategoryItem(text=text, nr_products=nr_products)
         meta = {'category': child, 'department': response.meta['department'], 'parent': category}
         yield Request(link.url, callback=self._parse_category, meta=meta)
     yield category
Example #5
    def get_items_and_pagination(self, response):
        request_again = self.error_handler(response)
        if request_again:
            yield request_again
            return
        items_extractor = LinkExtractor(deny=[r'\/image\/', r'\/map'],
                                        restrict_xpaths='.//div[@class="itemInfo"]/h2')
        items_links = items_extractor.extract_links(response)
        for link in items_links:
            yield Request(url=link.url, callback=self.parse_item)
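        # paginate only when a "next" link exists; page requests are POSTed
        # directly to the search endpoint (the code assumes 25 results per page)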
        if response.xpath('.//a[@class="next"]').extract():
            total_quantity = response.xpath(
                '(.//div[@class="pageResults"]/span[@class="results"]'
                '/text()[normalize-space()])[2]').re(r'\d+')
            page_range = []
            if total_quantity:
                total_quantity = int(total_quantity[0])
                pages = total_quantity // 25  # 25 results per page
                page_range = range(1, pages + 2)

            category = cond_set_value(response.xpath(
                './/input[@id="FrmWho"]/@value').extract())
            quoted_category = quote_plus(category)
            for page in page_range:
                next_url = self.pagination_pattern.format(prase=quoted_category,
                                                          page=page)
                yield Request(url=next_url, headers=self.pagination_headers,
                              dont_filter=True, method='POST',
                              callback=self.parse_pagination)
 def get_companies_links(self, response):
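     # company profile links match /company_ followed by a 5-7 digit id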
     companies_link_extractor = LinkExtractor(allow=r'\/company_\d{5,7}')
     companies_links = companies_link_extractor.extract_links(response)
     for link in companies_links:
         yield Request(url=link.url,
                       callback=self.parse_item,
                       # cookies=None,
                       meta={'category': response.meta.get('category')})
    def _extract_links(self, response, params):
        """ parse links from response
            @return hrefs
        """

        params['allow_domains'] = tuple(self.allowed_domains)
        link_extractor = LinkExtractor(**params)
        return link_extractor.extract_links(response)
Example #8
class FollowAllSpider(Spider):

    name = 'followall'

    def __init__(self, **kw):
        super(FollowAllSpider, self).__init__(**kw)
        url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
        if not url.startswith('http://') and not url.startswith('https://'):
            url = 'http://%s/' % url
        self.url = url
        self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
        self.link_extractor = LinkExtractor()
        self.cookies_seen = set()

    def start_requests(self):
        return [Request(self.url, callback=self.parse, dont_filter=True)]

    def parse(self, response):
        """Parse a PageItem and all requests to follow

        @url http://www.scrapinghub.com/
        @returns items 1 1
        @returns requests 1
        @scrapes url title foo
        """
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):
        item = Page(url=response.url, size=str(len(response.body)),
            referer=response.request.headers.get('Referer'))
        self._set_title(item, response)
        self._set_new_cookies(item, response)
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, HtmlResponse):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_title(self, page, response):
        if isinstance(response, HtmlResponse):
            title = Selector(response).xpath("//title/text()").extract()
            if title:
                page['title'] = title[0]

    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0] for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
Example #9
 def parse(self, response):
     le = LinkExtractor()
     for link in le.extract_links(response):
         url = urljoin(response.url, link.url)
         yield scrapy.Request(url, self.parse_link, meta={
             'splash': {
                 'args': {'har': 1, 'html': 0},
             }
         })
    def extract_links(self, response):
        # The parent can do most of it for us
        links = LinkExtractor.extract_links(self, response)
        try:
            good_links = [link for link in links if link.text.isdigit()]
        except TypeError:
            return None

        return good_links
 def get_categories(self, response):
     # http://www.construction.co.uk/double-glazing-repairs/category_33.htm
     link_extractor = LinkExtractor(allow=r'\/category_\d+')
     links = link_extractor.extract_links(response)
     for link in links:
         category = link.text
         yield Request(url=link.url,
                       callback=self.get_companies_links_by_letter,
                       meta={'category':category})
Example #12
 def _populate_related_products(self, response, product):
     xpath = '//ul[contains(@class, "might_like")]/li/' \
         'div[contains(@class, "product_description")]/a'
     extractor = LinkExtractor(restrict_xpaths=xpath)
     products = [
         RelatedProduct(url=urljoin(response.url, link.url),
                        title=link.text.strip())
         for link in extractor.extract_links(response)
     ]
     cond_set_value(product, 'related_products',
                    {'You might also like': products})
 def get_companies_links_by_letter(self, response):
     # http://www.construction.co.uk/heating-contractors-and-consultants/22_A.htm
     letter_link_extractor = LinkExtractor(allow=r'\/\d+_[A-Z].htm')
     links_by_letter = letter_link_extractor.extract_links(response)
     if links_by_letter:
         for link in links_by_letter:
             yield Request(url=link.url,
                           callback=self.get_companies_links,
                           meta={'category': response.meta.get('category')})
     else:
         # there is no letters pagination at the page
         for request in self.get_companies_links(response):
             yield request
Example #15
class startPageSpider(Spider):

    name = 'startPageSpider'

    def __init__(self, taskId, *a, **kw):
        """Constructor"""
        self.name +='_'+str(taskId)
        super(startPageSpider, self).__init__(*a, **kw)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.redis = redis.Redis(connection_pool=pool)
        self.dbUtils = db.dbUtils()
        self.taskId = int(taskId)

        self.project = None
        self.domain = None
        self.hasCrawlSet = set()
        self.hasInsertSet = set()
        

        project = self.dbUtils.queryRow('SELECT * FROM project_setting WHERE iStatus=1 AND iPid=%d' % self.taskId)
        if project:
            self.project = project
            self.start_urls = str(project['szStartUrl']).split('~')
            self.domain = ".".join(urlparse(project['szDomain']).hostname.split(".")[-2:])

    def parse(self, response):
        print 'startPageSpider==========================>',response.url
#         log.msg(format='%(iPid)s, %(url)s, %(project)s ', iPid = self.taskId, url = response.url, project=self.project)
        listQuqueCount = self.redis.llen('scrapy:startPageSpider:listQuque:%s' % self.taskId)
        if listQuqueCount == 1:
            self._crawler.signals.send_catch_log('writeListQuque')
        elif listQuqueCount == 0:
            self._crawler.signals.send_catch_log('emptyListQuque')
            print 'startPageSpider---------send_catch_log->emptyListQuque'
        if response.url not in self.hasCrawlSet:
            pattern = re.compile(r'%s' % self.project['szStartUrlReg'])
            self.hasCrawlSet.add(response.url)
            if pattern.match(response.url) and response.url not in self.hasInsertSet:
                title = "|".join(response.xpath('/html/head/title/text()').extract())
                insertSql = 'INSERT INTO project_start_page(iPid, szUrl, szTitle,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s")' % (self.taskId, response.url,  title,  time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                self.dbUtils.insert(insertSql)
                self.hasInsertSet.add(response.url)
                self.redis.lpush('scrapy:startPageSpider:listQuque:%s' % self.taskId, response.url)
                #self.redis.sadd('scrapy:startPageSpider:startPage:2', response.url)
                log.msg(format='spider=startPageSpider iPid=%(i)s, title=%(t)s url=%(u)s', i = self.taskId, t=title, u=response.url)

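            # build allow patterns from the project's '~'-separated URL regexes
            # and follow only links that have not been crawled yet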
            _allow = ( _allow for _allow in self.project['szStartUrlReg'].split('~'))
            self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=_allow)
            links = [ link for link in self.linkExtractor.extract_links(response) if link.url not in self.hasCrawlSet ]
            for link in links:
                yield self.make_requests_from_url(link.url)
Example #16
 def parse(self, response):
     link = LinkExtractor(
         allow='/baojie/.*',
         restrict_xpaths=
         '//dl[@class="selitem selitem-area clearfix"]/dd[@class="posrelative w-area"]/a'
     )
     links = link.extract_links(response)
     for i in links:
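         # derive the city slug from the link URL's path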
         self.city_name = re.split('\/', i.url)[-2]
         yield Request(i.url,
                       callback=self.get_index,
                       meta={
                           'city_name': self.city_name,
                           'dont_redirect': True
                       },
                       dont_filter=True)
Example #17
    def get_index(self, response):
        city_name = response.meta['city_name']
        link = LinkExtractor(
            restrict_xpaths=
            '//div[@class="leftBox"]//div[@class="list"]/ul/li[@class="list-img"]/div[@class="pic"]/a'
        )
        links = link.extract_links(response)

        for i in links:
            yield Request(i.url,
                          callback=self.get_message,
                          meta={
                              'city_name': city_name,
                              'dont_redirect': True
                          },
                          dont_filter=True)
Example #18
	def parse(self, response):
		# set the allowed domains in link
		ln_extractor = LinkExtractor(allow_domains=("news.sina.cn"),
			 allow = (".*vt=1.*"))
		# get the links from the response
		links = ln_extractor.extract_links(response)
		urls = []
		items = []
		for i in links:
			urls.append(i.url)
			# all the not visited urls are put into container and queue.
			if i.url not in self.g_container_urls:
				self.g_queue_urls.put(i.url)
				self.g_container_urls.add(i.url)
		# make all the request in the queue
		for j in range(self.g_queue_urls.qsize()):
			tp_url = self.g_queue_urls.get()
			items.append(self.make_requests_from_url(tp_url).
				replace(callback=self.parse_page))
			items.append(self.make_requests_from_url(tp_url))
		return items
Example #19
 def parse(self, response):
     xlink = LinkExtractor()
     itemre = re.compile(self.itemurl_re)
     for link in xlink.extract_links(response):
         if itemre.search(link.url):
             yield Request(url=link.url, callback=self.parse_item)
Example #20
 def parse(self, response):
     self.links.extend(LinkExtractor().extract_links(response))
Example #21
	def parse_index(self, response):
		l = LinkExtractor(allow=(r'/wiki/.*\.html'), restrict_xpaths=('//div[@id="wrapper"]//div[@id="content"]'))
		for link in l.extract_links(response):
			yield Request(link.url, callback=self.parse_pages)
Example #22
 def parse(self, response):
     dpt_links = LinkExtractor(restrict_xpaths=self._xpath_department_links)
     for link in dpt_links.extract_links(response):
         category = CategoryItem(text=link.text.strip(' \t\n'))
         yield Request(link.url, callback=self._parse_category, meta={'category': category, 'department': category})
Example #23
	def parse_level3_contents(self, response):
		baseurl = response.xpath('//base/@href').extract()[0]
		le = LinkExtractor()
		for link in le.extract_links(response):
			if self.allowed_domains[0] in link.url:
				yield Request(link.url, callback=self.final_contents)
Example #24
class TopicalFinder(SplashSpiderBase):
    name = 'topical_finder'

    save_html = None
    use_splash = None

    def __init__(self, seed_urls=None, save_html=1, use_splash=1, screenshot_dir='/memex-pinterest/ui/static/images/screenshots', op_time=10, **kwargs):
        '''
        Constructs a spider instance from the command line or the scrapyd daemon.

        :param seed_urls: comma-separated list of URLs; if empty, the crawler follows not-yet-crawled URLs from storage
        :param save_html: boolean 0/1
        :param use_splash: boolean 0/1
        :param screenshot_dir: used only when use_splash=1
        :param op_time: operating time in minutes; a negative value disables the constraint
        :param kwargs:
        :return:
        '''
        super(TopicalFinder, self).__init__(screenshot_dir=screenshot_dir, **kwargs)
        self.screenshot_dir = screenshot_dir
        log.msg("SCREENSHOT DIR IS SET TO: %s" % str(screenshot_dir), _level=log.DEBUG)

        if seed_urls:
            self.start_urls = [add_scheme_if_missing(url) for url in seed_urls.split(',')]
        self.ranker = Ranker.load()
        self.linkextractor = LinkExtractor()
        self.save_html = bool(save_html)
        self.use_splash = bool(use_splash)
        self.operating_time = int(op_time) * 60

        self.start_time = datetime.utcnow()
        self.finishing = False

    def start_requests(self):
        for url in self.start_urls:
            yield self.make_requests_from_url(url, is_seed=True)

    def make_requests_from_url(self, url, is_seed=False):
        if self.use_splash:
            r = self._splash_request(url)
        else:
            r = super(TopicalFinder, self).make_requests_from_url(url)
        r.meta['score'] = 0.0
        r.meta['is_seed'] = False

        if is_seed:
            r.meta['is_seed'] = True
            r.meta['score'] = 1.0  # setting maximum score value for seeds

        log.msg("Making request to %s with meta: %s" % (r.url, str(r.meta)), _level=log.DEBUG)

        return r

    def set_crawler(self, crawler):
        super(TopicalFinder, self).set_crawler(crawler)
        self.crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)

    def spider_idle(self):
        log.msg("Spider idle signal caught.", _level=log.DEBUG)
        raise DontCloseSpider

    def parse(self, response):
        ld = self._load_webpage_item(response, is_seed=response.meta['is_seed'])
        if self.use_splash:
            self._process_splash_response(response, ld)
        yield ld.load_item()

        if self.finishing:
            return

        now = datetime.utcnow()
        if self.operating_time > 0 and (now - self.start_time).total_seconds() > self.operating_time:
            log.msg("Reached operating time constraint. Waiting for Scrapy queue to exhaust.")
            self.finishing = True
            self.crawler.stop()
            return

        if not isinstance(response, TextResponse):
            return

        body = response.body_as_unicode().strip().encode('utf8') or '<html/>'
        score = self.ranker.score_html(body)
        log.msg("TC: %s has score=%f" % (response.url, score), _level=log.DEBUG)

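        # follow outlinks only from pages the ranker scores as on-topic (score > 0.5)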
        if score > 0.5:

            #!for some reason this is returning the raw splash response JSON
            #!and not the rendered HTML from splash
            #log.msg(u"\n\n\n****---Response body:\n %s----***\n\n\n" % response.body_as_unicode(), _level=log.DEBUG)

            #for link in self.linkextractor.extract_links(response):
            #can something like the line below fix it? Seems like a hack...
            for link in self.linkextractor.extract_links(response):

                log.msg("****---LINK EXTRACTED: %s----***" % str(link.url), _level=log.DEBUG)

                if self.use_splash:
                    r = self._splash_request(url=link.url)
                else:
                    r = Request(url=link.url)

                external = is_external_url(response.url, link.url)
                depth = response.meta.get('link_depth', 0)
                r.meta.update({
                    'link': {
                        'url': link.url,
                        'text': link.text,
                        'fragment': link.fragment,
                        'nofollow': link.nofollow},
                    'link_depth': 0 if external else depth + 1,
                    'referrer_depth': depth,
                    'referrer_url': response.url,
                })

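                # score heuristic: URLs with shallower paths receive a higher score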
                url_parts = urlparse_cached(r)
                path_parts = url_parts.path.split('/')
                r.meta['score'] = 1.0 / len(path_parts)
                r.meta['is_seed'] = False
                yield r

    def _load_webpage_item(self, response, is_seed):
        depth = response.meta.get('link_depth', 0)
        ld = WebpageItemLoader(response=response)
        ld.add_value('url', response.url)
        ld.add_value('host', get_domain(response.url))
        ld.add_xpath('title', '//title/text()')
        ld.add_value('depth', depth)
        ld.add_value('total_depth', response.meta.get('depth'))
        ld.add_value('crawled_at', datetime.utcnow())
        ld.add_value('is_seed', is_seed)
        ld.add_value('crawler_score', response.meta['score'])

        if self.save_html:
            ld.add_value('html', response.body_as_unicode())

        if 'link' in response.meta:
            link = response.meta['link']
            ld.add_value('link_text', link['text'])
            ld.add_value('link_url', link['url'])
            ld.add_value('referrer_url', response.meta['referrer_url'])
            ld.add_value('referrer_depth', response.meta['referrer_depth'])
        return ld
Example #25
 def parse(self, response):
     # item2: extract all detail-page links on this page and add them to the request queue
     item2 = LinkExtractor(allow='\/detail', allow_domains='cheng95.com')
     for link in item2.extract_links(response):
         yield Request(url=link.url,
                       priority=2,
                       callback=self.parse,
                       meta={'dont_redirect': True})
     # item1: extract all index-page links on this page and add them to the request queue
     item1 = LinkExtractor(allow='\/positions',
                           allow_domains='cheng95.com',
                           deny=('\/detail', '\/analyze'))
     for link in item1.extract_links(response):
         yield Request(url=link.url,
                       priority=1,
                       callback=self.parse,
                       meta={'dont_redirect': True})
     # if the current page is an index page, parse its total page count and queue requests for every result page
     if 'search' in response.url:
         self.logger.debug('index page')
         pattern = re.compile('totalpage: (\d*),', re.S)
         total_count = re.findall(pattern, response.text)
         if total_count:
             if 'page' not in response.request.url:
                 count = int(total_count[0])
                 if count > 500:
                     count = 500
                 # print('page count:', count)
                 for page in range(2, count + 1):
                     next_url = response.request.url + '&page=' + str(page)
                     yield Request(url=next_url,
                                   priority=1,
                                   callback=self.parse,
                                   meta={'dont_redirect': True})
         else:
             self.logger.debug('index page: IP banned')
     # when the current page is a detail page, parse the fields with XPath
     elif 'detail' in response.url:
         self.logger.debug('detail page')
         item = Cheng95Item()
         if response.xpath('//div[@class="basic-inner"]/h1/span[1]/text()'
                           ).extract_first() != None:
             item['url'] = response.request.url
             item['title'] = response.xpath(
                 '//div[@class="basic-inner"]/h1/span[1]/text()'
             ).extract_first()
             item['company'] = response.xpath(
                 '//h2[@class="company-name"]/text()').extract_first('')
             item['salary'] = response.xpath(
                 '//div[@class="basic-inner"]/h1/span[2]/text()'
             ).extract_first('')
             others = response.xpath(
                 '//p[@class="extra-info clearfix"]/span/text()').extract()
             if others:
                 pattern1 = re.compile('招聘\d+人', re.S)
                 pattern2 = re.compile('中专|高中|本科|专科|硕士|博士', re.S)
                 if re.findall(pattern1, str(others)):
                     item['need'] = re.findall(pattern1, str(others))[0]
                 else:
                     item['need'] = None
                 if re.findall(pattern2, str(others)):
                     item['education'] = re.findall(pattern2,
                                                    str(others))[0]
                 else:
                     item['education'] = None
                 item['come_from'] = others[-3]
                 item['release_time'] = others[-1]
                 item['address'] = others[-5]
             head = response.xpath(
                 '//div[@class="position-module position-detail"]/div[@class="module-hd"]/h3/text()'
             ).extract()
             content = response.xpath(
                 '//div[@class="position-module position-detail"]/div[@class="module-bd"]'
             )
             for i in range(len(head)):
                 word = head.pop(0)
                 if word == '工作内容':
                     q = content.pop(0).xpath('text()').extract()
                     item['job_content'] = "".join([i.strip() for i in q])
                 elif word == '职位要求':
                     q = content.pop(0).xpath('text()').extract()
                     item['job_requirement'] = "".join(
                         [i.strip() for i in q])
                 elif word == '工作地点':
                     item['detail_address'] = content.pop(0).xpath(
                         'text()').extract_first('')
                 else:
                     pass
             yield item
         else:
             self.logger.debug('detail page: IP banned')
     else:
         pass
Example #29
class listPageSpider(Spider):

    name = 'listPageSpider'

    def __init__(self, taskId, *a, **kw):
        """Constructor"""
        self.name +='_'+str(taskId)
        super(listPageSpider, self).__init__(*a, **kw)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.redis = redis.Redis(connection_pool=pool)
        self.dbUtils = db.dbUtils()
        self.taskId = int(taskId)
        self.domain = None
        self.project = None
        self.hasCrawlSet = set()
        self.hasInsertSet = set()
        self.isExit = 0
        project = self.dbUtils.queryRow('SELECT * FROM project_setting WHERE iStatus=1 AND iPid=%d' % self.taskId)
        if project:
            self.project = project
            self.domain = ".".join(urlparse(project['szDomain']).hostname.split(".")[-2:])
#             self.start_urls = ['http://www.ty2016.com/cn/2.html', 'http://www.ty2016.com/cn/3.html', 'http://www.ty2016.com/cn/4.html']


    def stopSpider(self):
        self.isExit = 1

    def getStartUrl(self):
        url = self.redis.rpop('scrapy:startPageSpider:listQuque')
        if not url:
            return self.getStartUrl()
        return url


    def start_requests(self):
#         url = self.getStartUrl()
#         print '=====================>',url
#         yield self.make_requests_from_url(url)
        while True :
            #if self._crawler.engine is not None:
                #if self._crawler.engine.paused: break
                #if not self._crawler.engine.running: break
            url = self.redis.rpop('scrapy:startPageSpider:listQuque:%s' % self.taskId)
            #print 'listPageSpider==========================>',url
            if url:
                #self.redis.sadd('scrapy:startPageSpider:startPage:1', url)
                yield self.make_requests_from_url(url)
            #else:
                #self._crawler.signals.send_catch_log('emptyListQuque')
                #print 'listPageSpider---------send_catch_log->emptyListQuque'


    def parse(self, response):
        #self.redis.sadd('scrapy:startPageSpider:startPage:3', response.url)
        if response.url not in self.hasCrawlSet:
            #self.redis.sadd('scrapy:startPageSpider:startPage:4', response.url)
            self.hasCrawlSet.add(response.url)
            _allow = ( _allow for _allow in self.project['szUrlReg'].split('~'))
            self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=_allow)
            links = [ link for link in self.linkExtractor.extract_links(response) if link.url not in self.hasInsertSet ]
            #self.redis.hset('scrapy:startPageSpider:listPage:count', response.url, len(links))
            for link in links:
                if link.url in self.hasInsertSet : continue
                insertSql = 'INSERT INTO project_list_page(iPid, szUrl, szTitle, szSourceUrl,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s", "%s")' % (self.taskId,link.url, link.text, response.url, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
                self.dbUtils.insert(insertSql)
                self.hasInsertSet.add(link.url)
                log.msg(format='spider=listPageSpider iPid=%(i)s, title=%(t)s url=%(u)s', i = self.taskId, t=link.text, u=link.url)
Example #30
 def parse_urls(self, response):
     extractor = LinkExtractor(restrict_xpaths=('//table[contains(@class, "ibm-data-table")]/tbody',))
     links = extractor.extract_links(response)
     for link in links:
         url = link.url
         yield Request(url, callback=self.parse_items)
 def parse_categories(self, response):
     l = LinkExtractor(restrict_xpaths='.//div[@class="categoryListContainer"]')
     links = l.extract_links(response)
     for link in links:
         yield Request(url=link.url, callback=self.parse_items_links)
Example #32
	def parse_sitemap(self, response):
		#self.log('We are parsing site map: %s' % response.url)
		linkextractor = LinkExtractor(allow_domains=self.allowed_domains, restrict_xpaths=self.NEWS_CATEGORY_XPATH)
		for link in linkextractor.extract_links(response):
			yield scrapy.Request(link.url, callback=self.parse_newspage)
Example #33
 def _scrape_product_links(self, response):
     link_extractor = LinkExtractor(
         restrict_xpaths='//div[@id="prodtitle"]')
     links = link_extractor.extract_links(response)
     links = [(l.url, SiteProductItem()) for l in links]
     return links
Example #34
	def parse(self, response):
		#self.log('We are at URL: %s' % response.url)
		yield scrapy.Request(response.url+'robots.txt', callback=self.parse_robot)
		linkextractor = LinkExtractor(allow_domains=self.allowed_domains, restrict_xpaths=self.SITEMAP_XPATH)
		for link in linkextractor.extract_links(response):
			yield scrapy.Request(link.url, callback=self.parse_sitemap)
Example #35
    def parse(self, response):
        ret = []
        url = response.url

        if "login.php" in url:
            # don't go to login pages
            return None

        domres1 = tldextract.extract(url)
        #sd = domres.subdomain
        dm1 = domres1.domain
        #tld = domres.suffix

        path = url.replace(" ", "").replace("/", "_").replace(":", "").replace(".", "")
        title = path
        if response.body:
            content = response.body_as_unicode()
        else:
            content = "NONE"
        links = []
        domains = []

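        # collect every href on the page and the distinct domains they point to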
        for x in response.xpath('//*//@href'):
            y = x.extract()
            links.append(y)
            domres = tldextract.extract(y)
            sd = domres.subdomain
            dm = domres.domain
            tld = domres.suffix
            if dm:
                #print domres
                d = "{0}.{1}.{2}".format(sd, dm, tld)
                if d not in domains:
                    domains.append(d)

        p = Page(title=title,
                 path=path,
                 url=url,
                 content=content,
                 links=links,
                 domains=domains)
        ret.append(p)

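        # follow extracted links only when they stay on the same domain as this page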
        l = LinkExtractor()
        for x in l.extract_links(response):
            domres = tldextract.extract(x.url)
            dm = domres.domain
            if dm.lower() == dm1.lower():
                print "extracted", x

                if "login.php" in x.url:
                    print "skipping login"
                else:
                    ret.append(scrapy.http.Request(x.url))
            else:
                print "Skip", domres, domres1, x
                pass

        # # spider the other local pages
        # for sel in response.xpath('//*/@href'):
        #     link = sel.extract()
        #     desc = sel.xpath('text()').extract()
        #     if link:
        #         l = url + link[0]
        #         print "LINK",l, desc
        #         ret.append(scrapy.http.Request(l))

        return ret
Example #36
 def parse_urls(self, response):
     extractor = LinkExtractor(restrict_xpaths=('//div[contains(@class, "news_type2")]/h2',))
     links = extractor.extract_links(response)
     for link in links:
         url = link.url
         yield Request(url, callback=self.parse_items)