Example #1
File: spiders.py Project: kyotaw/Conata
class RawHtmlSpider(CrawlSpider):
    name = 'yahoo_news'
    allowed_domains = ['news.yahoo.co.jp', 'headlines.yahoo.co.jp']
    start_urls = [
        #		'http://news.yahoo.co.jp/list/?c=domestic',
        #		'http://news.yahoo.co.jp/list/?c=world',
        #		'http://news.yahoo.co.jp/list/?c=economy',
        #		'http://news.yahoo.co.jp/list/?c=entertainment',
        #		'http://news.yahoo.co.jp/list/?c=sports',
        #		'http://news.yahoo.co.jp/list/?c=computer',
        #		'http://news.yahoo.co.jp/list/?c=science',
        'http://news.yahoo.co.jp/list/?c=local',
    ]

    rules = [
        Rule(
            LxmlLinkExtractor(restrict_xpaths="//a[@class='next']",
                              unique=True)),
        Rule(LxmlLinkExtractor(allow=(r'pickup'), unique=True)),
        Rule(LxmlLinkExtractor(
            restrict_xpaths="//div[@class='headlineTxt']/a[@class='newsLink']"
        ),
             callback="parse_article"),
    ]

    def parse_article(self, response):
        htmlRes = HtmlResponse(url=response.url, body=response.body)
        item = RawHtmlItem()
        item["url"] = htmlRes.url
        item["body"] = htmlRes.body
        item["encoding"] = htmlRes.encoding
        return item
Example #2
 def __init__(self, **kw):
     super(TheCrawler, self).__init__(**kw)
     
     self.channel = kw.get('channel')
     self.domain = kw.get('domain')
     
     full_config_path = '%s%s' % (SITE_CONFIG_PATH, self.channel)
     
     self.config_path = "%s/%s.txt" % (full_config_path, self.domain)
     self.config_items = self._parse_config(self.config_path)
     
     try:
         self.url = self.config_items['start_url']
     except KeyError:
         self.url = 'http://%s/' % self.domain
         
     try:
         self.link_extractor = LxmlLinkExtractor(restrict_xpaths=self.config_items['crawl_areas'], unique=True)
     except KeyError:
         self.link_extractor = LxmlLinkExtractor(unique=True)
                     
     self.real_domain = urlparse(self.url).hostname.lstrip('www.')
     self.allowed_domains = [urlparse(self.url).hostname.lstrip('www.')]
     
     self.cookies_seen = set()
Example #3
File: habr.py Project: kreedz/Scrapy
class HabrSpider(CrawlSpider):
    name = 'habr'
    allowed_domains = ['habrahabr.ru']
    start_urls = ['http://habrahabr.ru/']

    rules = (
        Rule(LxmlLinkExtractor(restrict_xpaths=('.//h1/a[@class="post_title"]')), callback='parse_item'),
        Rule(LxmlLinkExtractor(restrict_xpaths=('.//*[@id="nav-pages"]/li/a')), follow=True),
    )

    def __init__(self, category=None, *args, **kwargs):
        super(HabrSpider, self).__init__(*args, **kwargs)
        log.ScrapyFileLogObserver(open('debug.log', 'w'), level=log.DEBUG).start()
        log.ScrapyFileLogObserver(open('error.log', 'w'), level=log.ERROR).start()

    def parse_item(self, response):
        xpath = './/div[@class="content_left"]'
        sel = response.xpath(xpath)
        if not sel:
            return
        l = ItemLoader(item=HabrahabrItem(), selector=sel, response=response)
        l.add_xpath('title', '//h1/span/text()')
        l.add_xpath('image_urls', '//div[@class="content html_format"]/img/@src')
        comments_items = []
        comments = sel.xpath('//div[starts-with(@class, "message html_format")]').extract()
        for comment in comments:
            comment_item = ItemLoader(item=HabrahabrComment(), selector=sel, response=response)
            comment_item.add_value('comment', comment)
            comments_items.append(comment_item.load_item())
        l.add_value('comments', comments_items)
        yield l.load_item()
Example #4
class YellowSpider(CrawlSpider):
    name = 'yellow'
    allowed_domains = ['yellow.co.nz']
    # rules = (Rule(LxmlLinkExtractor(allow=(r'\/([A-Z])([A-Z0-9]{9})'),deny=('')),callback='parse_item'),Rule(LxmlLinkExtractor(allow=(''))),),)

    rules = (Rule(
        LxmlLinkExtractor(
            allow=(r'https://yellow.co.nz/canterbury-region/plumbers/page/.*',
                   r'.*what=plumbers&where=Canterbury+Region.*')),
        follow=True,
    ),
             Rule(
                 LxmlLinkExtractor(allow=(r'https://yellow.co.nz/y/.*'),
                                   deny=(r'.*more', r'.*Other')),
                 callback='parse_business',
                 follow=False,
             ), Rule(
                 LxmlLinkExtractor(allow=('')),
                 follow=False,
             ))

    def __init__(self, *args, **kwargs):
        super(YellowSpider, self).__init__(*args, **kwargs)
        start_url = 'https://yellow.co.nz/canterbury-region/plumbers/page/1?what=plumbers&where=Canterbury+Region'
        self.start_urls = [start_url]

    def parse_business(self, response):
        item = YellowItem()
        print "\n\n---------------------START-----------------------"
        print "\n\n---------------------START-----------------------"
        print "\n\n---------------------START-----------------------"
        print response.url
        item["Company"] = response.xpath(
            '//*[@id="businessDetailsPrimary"]/div[1]/div[3]/h1/span').extract(
            )
        item["PhoneNumber"] = response.xpath(
            '//*[@id="businessDetailsPrimary"]/div[2]/div/span[1]/a[1]'
        ).extract()
        item["MailingAddress"] = response.xpath(
            '//*[@id="detailSectionSecondary"]/div[2]/section[3]/div[2]/p'
        ).extract()
        item["email"] = response.xpath(
            '//*[@id="businessDetailsPrimary"]/div[2]/div/meta').extract()
        item["url"] = response.url
        print item
        yield item

    def process_links(self, links):
        print "\n       LINKS"
        links_list = []
        for i in links:
            if "https://www.tripadvisor.com/Attraction_Review" in i.url:
                links_list.append(i)
                print i.url
        return links_list
Example #5
class AmazonSpider(CrawlSpider):
    name = 'aragog'
    allowed_domains = ['amazon.in']
    rules = (
        Rule(LxmlLinkExtractor(allow=(r'\/([A-Z])([A-Z0-9]{9})'),
                               deny=(r'product\-reviews', r'offer\-listing',
                                     r'ebook')),
             callback='parse_item'),
        Rule(LxmlLinkExtractor(allow=(''))),
    )

    def __init__(
            self,
            start_url='http://www.amazon.in/Laptops/b/ref=nav_shopall_computers_laptop?ie=UTF8&node=1375424031',
            *args,
            **kwargs):
        super(AmazonSpider, self).__init__(*args, **kwargs)
        self.start_urls = [start_url]

    def parse_item(self, response):
        # print(str(response.url))
        item = AmazonscrapingMongodbItem()
        try:
            name = response.xpath(
                '//*[@id="productTitle"]/text()').extract()[0].encode(
                    'ascii', 'ignore')
            item['name'] = name.strip().split("\n")
            item['reviews'] = response.xpath(
                '//*[@id="acrCustomerReviewText"]/text()').extract()[0].encode(
                    'ascii', 'ignore')
            item['url'] = response.url
            # print(response.xpath('//*[@id="avgRating"]/span/text()').extract())
            item['rating'] = response.xpath(
                '//*[@id="avgRating"]/span/text()').extract()[0].encode(
                    'ascii', 'ignore').replace('\n', ' ').strip()
            item['pid'] = response.url.split('/ref=')[0].split('/')[-1].encode(
                'ascii', 'ignore')
            item['price'] = [
                response.xpath(
                    '//*[@id="price"]/table//span[starts-with(@id,"priceblock")]//text()'
                ).extract()[1].encode('ascii', 'ignore').strip()
            ]
            item['desc'] = [
                desc.encode('ascii', 'ignore') for desc in response.xpath(
                    '//*[@id="feature-bullets"]/ul/li/span/text()').extract()
            ]
            item['timestamp'] = [str(datetime.datetime.now())]
            print(item)
        except:
            print('Not a product!')
            item = None
        yield item

    def dummy(self, response):
        print(str(response.url))
Example #6
class SearchdisconnectSpider(CrawlSpider):
    name = "searchdisconnect"
    allowed_domains = ["https://search.disconnect.me/"]
    start_urls = ('https://search.disconnect.me/', )

    rules = [
        Rule(LxmlLinkExtractor(restrict_xpaths=["//div[@class='pagination']"]),
             callback="parse_links")
    ]

    ITEM_CLASS = SearchdisconnectcrawlerItem

    def __init__(self, keyword="p**n", *args, **kwargs):
        super(SearchdisconnectSpider, self).__init__(*args, **kwargs)
        self._query = keyword

    def parse_start_url(self, response):
        return FormRequest.from_response(response,
                                         formdata={"query": self._query},
                                         callback=self.parse_links)

    def parse_links(self, response):
        item = SearchdisconnectcrawlerItem()
        item["keyword"] = self._query
        urls = response.css("a.title::attr(href)").extract()
        for url in urls:
            item["url"] = url
            yield item
        pagination_links = response.css(
            "div.pagination a::attr(href)").extract()
        for link in pagination_links:
            yield Request(self.start_urls[0] + "searchTerms/" + link[2:],
                          callback=self.parse_links)
Example #7
class LbColdDriedFruits(LocalBanya2Crawler):
    name = "lb_colddriedfruits"
    start_urls = ['http://www.localbanya.com/products/Fruits-&-Vegetables/Cold-Dried-Fruits/180/234']
    rules = (
        Rule(LxmlLinkExtractor(allow='product-details/Fruits---Vegetables/Cold-Dried-Fruits'), callback='parse_product',
             follow=True),
    )
Example #8
class SwhSpider(CrawlSpider):
    name = 'swh'
    allowed_domains = ['www.smh.com.au']
    start_urls = ['http://www.smh.com.au/']
    '''def start_requests(self):
		for url in self.start_urls:
			yield SplashRequest(url, self.parse,meta={
				'splash':{
					'endpoint':'render.html',
					'args':{'wait': 0.5},
				}
			})'''
    def splash_request(self, request):
        return SplashRequest(request.url,
                             self.parse_page,
                             args={
                                 'wait': 10,
                                 'timeout': 3600
                             },
                             meta={'real_url': request.url})

    rules = (Rule(LxmlLinkExtractor(allow=(), deny=()),
                  callback="parse_page",
                  process_request="splash_request",
                  follow=True), )

    def parse_page(self, response):
        t = str(response.css('title::text').extract()[0])
        nt = t + '.text'
        c = ' '.join(response.css('._1665V').xpath(
            './/p//text()').extract()).encode('utf-8')
        if c:
            with open(os.path.join(dest, nt), 'wb') as f:
                f.write(c)
            yield {'title': t}
Example #9
class CalendarSpiderSpider(CrawlSpider):
    name = "calendar-spider"
    allowed_domains = ["uvic.ca"]
    start_urls = [
        'http://web.uvic.ca/calendar2015-09/CDs/CSC/CTs.html',
        'http://web.uvic.ca/calendar2015-09/CDs/CSC/466.html'
    ]

    #(http://web.uvic.ca/calendar2015-09/CDs/)(CSC|MATH|SENG).+')

    rules = [
        Rule(LxmlLinkExtractor(
            allow=('(http://web.uvic.ca/calendar2015-09/CDs/).+'),
            restrict_xpaths=('//div[@id="CDpage"]', '//ul[@class="CDTL"]')),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):

        item = WebItem()
        item['title'] = response.xpath(
            '//title/text()').extract().pop().encode('utf-8')
        item['url'] = response.url

        return item
Example #10
class MySpider(Spider):
    name = 'example'
    link_extractor = LxmlLinkExtractor()

    def parse(self, response):
        soup = BeautifulSoup(response.body)
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()

        response.meta.update(score=KeywordScorer.score(text))
        response.meta.update(content_hash=xxhash.xxh64(
            text.encode('ascii', 'ignore')).intdigest())

        for link in self.link_extractor.extract_links(response):
            request = Request(url=link.url)
            request.meta.update(link_text=link.text)
            link_score = KeywordScorer.score(link.text)
            request.meta.update(score=link_score)
            yield request

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = cls(*args, **kwargs)
        crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        raise DontCloseSpider
Example #11
class WwwExpansysComSgCrawler(CrawlSpider):
    name = 'www_expansys_com_sg_crawler'
    allowed_domains = ['expansys.com.sg']
    start_urls = [
        'http://www.expansys.com.sg/'
    ]

    rules = [
        Rule(LinkExtractor(allow='page=\d+#listing'), follow=True),
        Rule(LxmlLinkExtractor(allow=(r'.+/\S+\d+/',), deny=(r'.+/.filter',)),
             callback='parse_item',
             follow=True),
    ]

    def parse_item(self, response):
        items = list()

        for sel in response.xpath('//div[@id="product"]'):
            item = ExpansysItem()
            item['url'] = response.url or None
            item['sku'] = sel.xpath('//div[@id="prod_core"]/ul/li[1]/span/text()').extract()
            item['title'] = sel.xpath('//div[@id="title"]/h1/text()').extract()
            item['description'] = sel.xpath('//div[@id="description"]/h2/text()').extract()
            item['price'] = sel.xpath('//div[@id="prod_core"]/span/ul[@class="details"]/li[@class="price"]/p[@id="price"]/strong/span/text()').extract()
            item['ean'] = sel.xpath('//div[@id="prod_core"]/ul/li[2]/span/text()').extract()
            item['mpn'] = sel.xpath('//div[@id="prod_core"]/ul/li[3]/span/text()').extract()
            item['brand'] = sel.xpath('//div[@id="prod_core"]/ul/li[4]/a/text()').extract()
            item['currency'] = sel.xpath('//p[@id="price"]/meta/@content').extract()
            item['img_urls'] = sel.xpath('//div[@id="prod_left"]/div[2]/a/img/@src').extract()
            item['categories'] = sel.xpath('//li[@id="n_audio"]/div/div[1]/ul/li/a/text()').extract()
            item['availability'] = sel.xpath('//li[@id="stock"]/text()').extract()
            item['rating'] = sel.xpath('//div[@id="review_avg"]/span[1]/text()').extract()
            items.append(item)
            yield item
Example #12
class DmozSpiderSpider(CrawlSpider):
    name = "dmoz-spider"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        'http://www.dmoz.org/Computers/', 'http://www.dmoz.org/Society/',
        'http://www.dmoz.org/Sports/'
    ]

    rules = [
        Rule(LxmlLinkExtractor(allow=('(http://www.dmoz.org/).+')),
             callback='parse_item',
             follow=True)
    ]

    #LOG_FILE = "data/scrapy_%s.log" % datetime.now().strftime('%Y-%m-%dZ%H-%M')
    #logfile = open(LOG_FILE, 'w')
    #log_observer = ScrapyFileLogObserver(logfile, level=logging.INFO)
    #log_observer.start()

    def parse_item(self, response):

        item = WebItem()

        item['title'] = response.xpath(
            '//title/text()').extract().pop().encode('utf-8')
        item['url'] = response.url

        return item
Example #13
class CuponomiaScrapper(BaseCouponsCrawler):
    """crawler for site http://www.coupondunia.in"""

    name = 'test'
    allowed_domains = ["promotionalcodes.com"]
    start_urls = ['http://www.promotionalcodes.com/stores-by-letter/m']
    rules = [
        Rule(
            LxmlLinkExtractor(allow=(
                # '1-800-mobiles-coupons',
                'http://promotionalcodes.com/macys-coupons')),
            callback='parse_items',
            follow=False)
    ]

    def __init__(self, *args, **kwargs):
        super(CuponomiaScrapper, self).__init__(*args, **kwargs)

    store_name_path = '//*[@id="bodywrap"]/div/div[3]/div[1]/h2/a/text()'
    store_homepage_path = '//*[@id="bodywrap"]/div/div[3]/div[1]/h2/a/@href'
    store_logo_path = '//*[@id="bodywrap"]/div/div[1]/div/div/img/@src'
    store_description_path = '//*[@id="bodywrap"]/div/div[3]/div[1]/p/text()'

    coupon_name_path = './h3/a/text()'
    coupon_description_path = './p/text()'
    coupon_code_path = '/html/body/div[3]/div[1]/div[5]/div[2]/div[1]/div/article[1]/div[1]/div[1]/div[2]/div[2]/p/a/span[2]/text()'
    coupons_selector_css = 'div.coupon_box.widecoupon:not(.widecoupon-expired):not(.widecoupon-addCouponForm) > div.coupon_content > div.coupon_main_column'
Example #14
class DmozSpider(CrawlSpider):
    name = "buzzfeedNews"
    allowed_domains = ["buzzfeed.com"]
    start_urls = ["http://www.buzzfeed.com/"]

    rules = (
        # Follow any link within buzzfeed.com and parse it with the spider's parse_item method
        Rule(LxmlLinkExtractor(allow_domains=('buzzfeed.com')),
             callback='parse_item'), )

    def parse_item(self, response):
        items = []
        depth = response.meta["depth"]
        referring_url = response.request.headers.get('Referer', None)
        current_url = response.url
        title = response.xpath('//div[@id="buzz_header"]//h1/text()').extract()
        for link in response.xpath(
                '//div[@id="buzz_sub_buzz"]//div[not(contains(@class,"share-box"))]//a[not(@rel="nofollow")]/@href[not(contains(text(),"buzzfeed") or contains(text(),"buzzfed"))]'
        ):
            l = link.extract()
            if str(l) != "javascript:;":
                item = BuzzlinksItem()
                item["depth"] = depth
                item["current_url"] = current_url
                item["referring_url"] = referring_url
                item["link"] = link.extract()
                item["article_title"] = title
                parsed_uri = urlparse(link.extract())
                domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
                item["link_domain"] = domain

                items.append(item)
        return items
Example #15
class CouponduniaScrapper(BaseCouponsCrawler):
    """crawler for site http://www.coupondunia.in"""

    name = 'coupondunia'
    allowed_domains = ["coupondunia.in"]
    start_urls = ['http://www.coupondunia.in/stores']
    rules = [
        Rule(
            LxmlLinkExtractor(
                allow=(
                    # '1-800-mobiles-coupons',
                    # 'http://promotionalcodes.com/macys-coupons',
                    # 'zoffio',
                    # 'ebay',
                ),
                deny=('coupondunia.in/stores'),
                restrict_xpaths=('/html/body/div[2]/div', )),
            callback='parse_items',
            follow=False)
    ]

    def __init__(self, *args, **kwargs):
        super(CouponduniaScrapper, self).__init__(*args, **kwargs)

    store_name_path = '/html/body/div[2]/div/div/div[2]/div[1]/h1/span[1]/text()'
    store_homepage_path = '/html/body/div[2]/div/div/div[1]/div/div[1]/div/div/div/a/@href'
    store_logo_path = '/html/body/div[2]/div/div/div[1]/div/div[1]/div/div/img/@src'
    # store_description_path = '//*[@id="bodywrap"]/div/div[3]/div[1]/p/text()'

    coupon_name_path = './div/div[@class="offer-title"]/a/text()'
    coupon_description_path = './div/div[@class="offer-description-full"]/text()'
    coupon_code_path = './div/div[@class="offer-getcode"]/div/@data-code'
    coupons_selector_css = 'html body div.page-content.dark div.container div.row div.col-19 div#coupon_container.row.margin-left-right-none div.offer-big.offer.sms-parent.col-24'
Example #16
class ActualSpider(CrawlSpider):
    name = 'newsite'
    allowed_domains = ['edition.cnn.com', 'economictimes.indiatimes.com']
    start_urls = [
        'https://edition.cnn.com/', 'https://economictimes.indiatimes.com/'
    ]
    '''def abs_link(value):
        return urlparse.urljoin(response.url, value.strip())'''
    # Note: the callback name must not be 'parse', since CrawlSpider uses parse() internally
    rules = (Rule(LxmlLinkExtractor(
        allow=('https://edition.cnn.com/',
               'https://economictimes.indiatimes.com/'),
        deny=('https://plus.google.com/', )),
                  callback="parse_page",
                  follow=True), )

    def parse_page(self, response):
        site = response.meta['download_slot']
        t = str(response.css('title::text').extract()[0])
        nt = t + '.text'
        if site == "edition.cnn.com":
            sel = '.zn-body__paragraph *::text'
            dest = '/home/pannaga/work/extraction/extraction/CNN'
        elif site == "economictimes.indiatimes.com":
            sel = '.Normal *::text'
            dest = '/home/pannaga/work/extraction/extraction/ET'
        c = ' '.join(response.css(sel).extract()).encode('utf-8')
        if c:
            with open(os.path.join(dest, nt), 'w') as f:
                f.write(c)
            #yield {'title':t}
            yield {'desti': dest}
Example #17
class LbFruits(LocalBanya2Crawler):
    name = "lb_exoticfruitsveg"
    start_urls = [
        'http://www.localbanya.com/products/Fruits-&-Vegetables/Exotic-Fruits-&-Vegetables/180/55'
    ]
    rules = (Rule(
        LxmlLinkExtractor(allow='product-details/Fruits---Vegetables/Fruits-'),
        callback='parse_product',
        follow=True), )
Example #18
class Century21Spider(CrawlSpider):

    name = 'century21'
    allowed_domains = ['century21.fr']
    start_urls = [
        URL_TEMPLATE % postcode
        for postcode in "75010 75011 75012 75018 75019 75020".split(' ')
    ]
    regex = r'http://www\.century21\.fr/trouver_logement/detail/\d+/'
    f = lambda link: re.match(
        r'(http://www\.century21\.fr/trouver_logement/detail/\d+/).*',
        link).groups()[0]
    rules = [
        Rule(LinkExtractor(allow=regex), 'parse_ad'),
        Rule(
            LxmlLinkExtractor(
                allow='.*',
                restrict_xpaths=
                "//div[contains(@class,'btnSUIV_PREC suivant')]/a[contains(text(), 'suivant')]",
                process_value=f))
    ]

    def parse_ad(self, response):
        pty = Property()
        pty['url'] = response.url
        pty['listed_on'] = self.name

        # Price
        price = ' '.join(
            response.css('section.tarif span b').xpath('text()').extract())
        pty['price'] = int(
            re.sub('\s+', ' ', price).replace(
                u'\xa0', '').rstrip().strip(u' \u20ac').replace(' ', ''))

        # Surface
        details = ' '.join(
            response.css('section.precision p').xpath("text()").extract())
        pty['size'] = float(
            re.search(r'(\d+,?\d*) ?[mM][2\xb2]',
                      details).groups()[0].replace(',', '.'))

        # Post code
        filariane = ' '.join(
            response.css('div#filAriane div a span').xpath("text()").extract())
        pty['postcode'] = int(re.search(r'(750\d{2})', filariane).groups()[0])

        # Content
        pty['title'] = ' '.join(
            response.css('h1.h1_page').xpath('text()').extract())
        pty['description'] = ' '.join(
            response.css('div#descTextAnnonce.descriptionLongue p').xpath(
                'text()').extract())

        # Price per square meter
        pty['ppsqm'] = float(pty['price']) / pty['size']

        return pty
Example #19
class LbLeafies(LocalBanya2Crawler):
    name = "lb_leafies"
    start_urls = [
        'http://www.localbanya.com/products/Fruits-&-Vegetables/Leafies/180/244'
    ]
    rules = (Rule(
        LxmlLinkExtractor(allow='product-details/Fruits---Vegetables/Leafies'),
        callback='parse_product',
        follow=True), )
Example #20
class NeteaseNewsSpider(CrawlSpider):
    name = "netease_news_spider"
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com/']

    # http://news.163.com/17/0823/20/CSI5PH3Q000189FH.html
    url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/\d+/(\w+)\.html'
    rules = [
        Rule(LxmlLinkExtractor(allow=[url_pattern]),
             callback='parse_news',
             follow=True)
    ]

    def parse_news(self, response):
        sel = Selector(response)
        pattern = re.match(self.url_pattern, str(response.url))
        source = 'news.163.com'
        if sel.xpath('//div[@class="post_time_source"]/text()'):
            time = sel.xpath('//div[@class="post_time_source"]/text()'
                             ).extract_first().split()[0] + ' ' + sel.xpath(
                                 '//div[@class="post_time_source"]/text()'
                             ).extract_first().split()[1]
        else:
            time = 'unknown'
        date = '20' + pattern.group(2) + '/' + pattern.group(
            3)[0:2] + '/' + pattern.group(3)[2:]
        newsId = pattern.group(4)
        url = response.url
        title = sel.xpath("//h1/text()").extract()[0]
        contents = ListCombiner(sel.xpath('//p/text()').extract()[2:-3])
        comment_url = 'http://comment.news.163.com/api/v1/products/a2869674571f77b5a0867c3d71db5856/threads/{}'.format(
            newsId)
        yield Request(comment_url,
                      self.parse_comment,
                      meta={
                          'source': source,
                          'date': date,
                          'newsId': newsId,
                          'url': url,
                          'title': title,
                          'contents': contents,
                          'time': time
                      })

    def parse_comment(self, response):
        result = json.loads(response.text)
        item = NewsItem()
        item['source'] = response.meta['source']
        item['date'] = response.meta['date']
        item['newsId'] = response.meta['newsId']
        item['url'] = response.meta['url']
        item['title'] = response.meta['title']
        item['contents'] = response.meta['contents']
        item['comments'] = result['cmtAgainst'] + result['cmtVote'] + result[
            'rcount']
        item['time'] = response.meta['time']
        return item
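
As a quick illustration (not part of the original snippet), the url_pattern above can be checked against the sample URL from the comment; its capture groups are exactly the pieces parse_news uses to build date and newsId:

import re

url_pattern = r'(http://news\.163\.com)/(\d{2})/(\d{4})/\d+/(\w+)\.html'
m = re.match(url_pattern, 'http://news.163.com/17/0823/20/CSI5PH3Q000189FH.html')
# m.groups() -> ('http://news.163.com', '17', '0823', 'CSI5PH3Q000189FH')
date = '20' + m.group(2) + '/' + m.group(3)[0:2] + '/' + m.group(3)[2:]  # '2017/08/23'
newsId = m.group(4)  # 'CSI5PH3Q000189FH'
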
Example #21
    def parse_item(self, response):
        sel = Selector(response)
        items = LinkParser.extract_page_links(sel)
        num_onsite_links = 0
        num_offsite_links = 0

        page_id = ObjectId()

        for item in items:
            item['page_id'] = page_id
            item['domain'] = ""
            item['org_id'] = self.org
            item['referer'] = response.meta.get('Referer')

            if 'uri' in item:
                parse_uri = urlparse(item['uri'])
                item['domain'] = parse_uri[1]

            item['onsite'] = False
            for dom in self.allowed_domains:
                if item['domain'] == "" or item['domain'] in dom:
                    item['onsite'] = True
                    num_onsite_links = num_onsite_links + 1

            if item['onsite'] == False:
                num_offsite_links = num_offsite_links + 1

            yield item

        page = LinkParser.get_page_data(response)
        page['page_id'] = page_id
        page['useragent'] = response.meta.get('User-Agent')
        page['referer'] = response.meta.get('Referer')
        page['org_id'] = self.org
        page['num_offsite_links'] = num_offsite_links
        page['num_onsite_links'] = num_onsite_links

        yield page

        #limit page depth
        if self.pages_crawled >= settings.PAGES_PER_DOMAIN:
            return

        for link in LxmlLinkExtractor(
                unique=True,
                allow_domains=self.allowed_domains).extract_links(response):
            if not link.url in self.already_crawled and self.pages_crawled <= settings.PAGES_PER_DOMAIN:
                self.already_crawled.add(link.url)
                self.pages_crawled = self.pages_crawled + 1
                print "yielding request for ", link.url
                yield WebdriverRequest(link.url, callback=self.parse_item)
            elif self.pages_crawled >= settings.PAGES_PER_DOMAIN:
                print "reached max crawl"
                return
            else:
                print "avoiding duplicate request for: ", link.url
Example #22
class NbadraftSpider(CrawlSpider):

    name = "nbadraftnet"
    allowed_domains = ["nbadraft.net"]
    start_urls = (
        'http://www.nbadraft.net/articles',
    )

    calendar = parsedatetime.Calendar()

    rules = (
        Rule(LxmlLinkExtractor(restrict_xpaths="//div[@id='content']//td/a"), callback='parse_article'),
        Rule(LxmlLinkExtractor(allow='/articles')),
        Rule(LxmlLinkExtractor(allow='/players/'), callback='parse_article', cb_kwargs={'base_relevance': 100})
    )

    def parse_article(self, response, **kwargs):

        relevance = kwargs.get('base_relevance',0)

        content_selector = response.css('#content-area .content')
        images = content_selector.xpath("//img/@src").extract()
        base = get_base_url(response)

        #Fri, 07/27/2012 - 4:16pm
        parsed = self.calendar.parse(response.css('.date::text').extract()[0])
        date = datetime.datetime(*parsed[0][:7])
        main_content = content_selector.extract()[0]

        #replace relative image links
        for link in images:
            if link[0] == '/':
                main_content = main_content.replace(link, base + link)

        title = response.xpath('//h1/text()').extract()[0]

        yield ArticleItem(
            title=title,
            date=date,
            content=main_content,
            relevance=relevance,
            url=response.url
        )
Example #23
class TheSpider(CrawlSpider):
    name = 'khana'
    allowed_domains = ['fr.khanacademy.org']
    start_urls = ['https://fr.khanacademy.org']
    rules = (Rule(LxmlLinkExtractor(allow_domains=(['fr.khanacademy.org/math/','fr.khanacademy.org/science/','fr.khanacademy.org/computing/'])), callback='parse_url', follow=True), )
    def parse_url(self, response):
        item = ScraperItems()
        item['links'] = response.xpath('//a[contains(@class, "topic-list-item")]/@href').extract() + response.xpath('//link[contains(@rel, "image_src")]/@href').extract()
        for i in item['links']:
            yield Request(urlparse.urljoin(response.url, i[1:]))
Example #24
class SinaNewsSpider(CrawlSpider):
    name = "sina_news_spider"
    allowed_domains = ['news.sina.com.cn']
    start_urls = ['http://news.sina.com.cn']
    # http://finance.sina.com.cn/review/hgds/2017-08-25/doc-ifykkfas7684775.shtml
    # url_pattern = r'(http://(?:\w+\.)*news\.sina\.com\.cn)/.*/(\d{4}-\d{2}-\d{2})/doc-(.*)\.shtml'
    today_date = time.strftime('%Y-%m-%d',time.localtime(time.time()))
    url_pattern = r'(http://(?:\w+\.)*news\.sina\.com\.cn)/.*/({})/doc-(.*)\.shtml'.format(today_date)

    rules = [
        Rule(LxmlLinkExtractor(allow=[url_pattern]), callback='parse_news', follow=True)
    ]

    def parse_news(self, response):
        sel = Selector(response)
        if sel.xpath("//h1[@id='artibodyTitle']/text()"):
            title = sel.xpath("//h1[@id='artibodyTitle']/text()").extract()[0]
            pattern = re.match(self.url_pattern, str(response.url))
            source = 'sina'
            date = pattern.group(2).replace('-','/')
            if sel.xpath('//span[@class="time-source"]/text()'):
                time_ = sel.xpath('//span[@class="time-source"]/text()').extract_first().split()[0]
            else:
                time_ = 'unknown'
            newsId = pattern.group(3)
            url = response.url
            contents = ListCombiner(sel.xpath('//p/text()').extract()[:-3])
            comment_elements = sel.xpath("//meta[@name='sudameta']").xpath('@content').extract()[1]
            comment_channel = comment_elements.split(';')[0].split(':')[1]
            comment_id = comment_elements.split(';')[1].split(':')[1]
            comment_url = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel={}&newsid={}'.format(comment_channel,comment_id)
            yield Request(comment_url, self.parse_comment, meta={'source':source,
                                                                 'date':date,
                                                                 'newsId':newsId,
                                                                 'url':url,
                                                                 'title':title,
                                                                 'contents':contents,
                                                                 'time':time_
                                                                })

    def parse_comment(self, response):
        if re.findall(r'"total": (\d*)\,', response.text):
            comments = re.findall(r'"total": (\d*)\,', response.text)[0]
        else:
            comments = 0
        item = NewsItem()
        item['comments'] = comments
        item['title'] = response.meta['title']
        item['url'] = response.meta['url']
        item['contents'] = response.meta['contents']
        item['source'] = response.meta['source']
        item['date'] = response.meta['date']
        item['newsId'] = response.meta['newsId']
        item['time'] = response.meta['time']
        return item
Example #25
 def parse(self, response):
     items = []
     for link in LxmlLinkExtractor(allow=self.allowed_domains).extract_links(response):
         item = Gentest1Item()
         item['url'] = link.url
         item['document_name'] = response.meta['document_name']
         items.append(item)
         requests = self.make_requests_from_url(link.url)
         requests.meta['document_name'] = response.meta['document_name']
         items.append(requests)
     return items
Example #26
File: pap.py Project: blagarde/realescrape
class PAPSpider(CrawlSpider):

    name = 'pap'
    allowed_domains = ['pap.fr']
    start_urls = [
        URL_TEMPLATE % (37767 + int(postcode))
        for postcode in "10 11 12 18 19 20".split(' ')
    ]
    regex = r'http://www.pap.fr/annonce/vente-appartements-paris-.*-r\d{9}'
    rules = [
        Rule(LinkExtractor(allow=regex), 'parse_ad'),
        Rule(
            LxmlLinkExtractor(
                allow='.*',
                restrict_xpaths=
                "//ul[contains(@class,'pagination')]/li[contains(@class,'next')]/a[contains(text(), 'Suivante')]"
            ))
    ]

    def parse_ad(self, response):
        pty = Property()
        pty['url'] = response.url
        pty['listed_on'] = 'pap'
        # Price
        prices = response.css('h1 span.prix').xpath('text()').extract()
        assert len(prices) == 1
        pty['price'] = int(prices[0].rstrip(u' $\u20ac').replace('.', ''))

        # Surface
        li = response.css('.footer-descriptif ul').xpath(
            "//li[contains(span//text(), 'Surface')]").xpath("text()")
        # >>> li.xpath("text()").extract()
        # [u'\n\t\t\t\t\t\t\t\t', u'\n\t\t\t\t\t\t\t\t40\xa0', u'\t\t\t\t\t\t\t']
        assert len(li) == 3
        pty['size'] = float(li[1].extract().strip())

        # Post code
        titles = response.css('.text-annonce h2').xpath("text()").extract()
        assert len(titles) == 1
        match = re.search(r'\d{5}', titles[0])
        assert match is not None
        pty['postcode'] = int(match.group())

        # Content
        pty['title'] = response.css('h1 span.title').xpath(
            'text()').extract()[0] + ' - ' + titles[0]
        pty['description'] = ' '.join(
            response.css('div.text-annonce p').xpath('text()').extract())

        # Price per square meter
        pty['ppsqm'] = float(pty['price']) / pty['size']

        return pty
Example #27
class recruitSpider(CrawlSpider):
    name = "tencentRecruitSpider"
    allowed_domains = ["tencent.com"]
    # entry-point URL for the crawler
    start_urls = ["http://hr.tencent.com/position.php"]
    # crawl rule derived from the URL pattern of any job-listing page (http://hr.tencent.com/position.php?&start=1370#a)
    rules = [
        Rule(LxmlLinkExtractor(allow=('/position.php\?&start=\d{,4}#a')),
             follow=True,
             callback='parseItem')
    ]

    # callback that extracts the page data into Items
    def parseItem(self, response):
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)
        sites_even = sel.css('table.tablelist tr.even')
        for site in sites_even:
            item = PositionItem()
            item['name'] = site.css('.l.square a').xpath(
                'text()').extract()[0].encode('gbk')
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['positionLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workPlace'] = site.css(
                'tr > td:nth-child(4)::text').extract()[0]
            item['number'] = site.css(
                'tr > td:nth-child(3)::text').extract()[0]
            item['releaseTime'] = site.css(
                'tr > td:nth-child(5)::text').extract()[0]
            items.append(item)

        sites_odd = sel.css('table.tablelist tr.odd')
        for site in sites_odd:
            item = PositionItem()
            item['name'] = site.css('.l.square a').xpath('text()').extract()[0]
            relative_url = site.css('.l.square a').xpath('@href').extract()[0]
            item['positionLink'] = urljoin_rfc(base_url, relative_url)
            item['catalog'] = site.css('tr > td:nth-child(2)::text').extract()
            item['workPlace'] = site.css(
                'tr > td:nth-child(4)::text').extract()[0]
            item['number'] = site.css(
                'tr > td:nth-child(3)::text').extract()[0]
            item['releaseTime'] = site.css(
                'tr > td:nth-child(5)::text').extract()[0]
            items.append(item)
        info('parsed ' + str(response))
        return items

    def _process_request(self, request):
        info('process ' + str(request))
        return request
Example #28
    def parse_page(self, response):

        # ------products_paths------

        products_paths = response.xpath("//div[@id='articles']/div/a/@href").extract()

        for product_path in products_paths:
            item = ProductItem()
            item["path"] = product_path

            request = Request(url="http://www.madeleine.de" + product_path, callback=self.parse_product)
            request.meta['item'] = item

            yield request


        extr = LxmlLinkExtractor(allow="seite-\d+")
        links = extr.extract_links(response)

        for link in links:
            yield Request(url=link.url, callback=self.parse_page)
Example #29
class GoogleSpider(CrawlSpider):
	name = 'google'
	allowed_domains = ['google.com']
	# rules = (Rule(LxmlLinkExtractor(allow=(r'\/([A-Z])([A-Z0-9]{9})'),deny=('')),callback='parse_item'),Rule(LxmlLinkExtractor(allow=(''))),),)

	# rules = (Rule(LxmlLinkExtractor(allow=(r'https://www.tripadvisor.com/Attraction_Review.*')),callback='parse_trip', process_links='process_links'),)
	rules = (Rule(LxmlLinkExtractor(allow=(r'https://www.google.com/.*')), callback='parse_search', follow=False), Rule(LxmlLinkExtractor(allow=(''))))

	def __init__(self,*args, **kwargs):
		super(GoogleSpider, self).__init__(*args, **kwargs)
		start_url='https://www.tripadvisor.com/Attractions-g187337-Activities-Frankfurt_Hesse.html'
		# start_url='https://www.tripadvisor.com/'
		self.start_urls = [start_url]
	

	def parse_trip(self,response):
		item = GoogleItem()
		print "\n\n---------------------START-----------------------"
		print response.url
		# print response.xpath('//a/@href').extract()
		# try:
		item['name'] = response.xpath('//*[@id="HEADING"]/text()').extract()[0].encode('ascii','ignore')
		# item['rating'] = parsing_rating(response.xpath('//*[@id="HEADING_GROUP"]/div/div[2]/div[1]/div/span/img').extract())
		# item['neighborhood'] = response.xpath('//*[@id="MAP_AND_LISTING"]/div[2]/div/div[2]/div/div[1]/div/address/span/span').extract()
		# item['classification'] = response.xpath('//*[@id="HEADING_GROUP"]/div/div[3]/div[2]/div').extract()
		item['url'] = response.url
		# item['price'] = response.xpath('//*[@id="ABOVE_THE_FOLD"]/div[2]/div[1]/div/div[2]/div/div[1]/div/div[2]/div[1]/text()').extract()
		# item['hours'] = response.xpath('//*[@id="MAP_AND_LISTING"]/div[2]/div/div[2]/div/div[4]/div/div[2]/div').extract()
		# item['desc'] = response.xpath('//*[@id="OVERLAY_CONTENTS"]/div/p/text()').extract()
		# item['desc'] = [desc.encode('ascii','ignore') for desc in response.xpath('//*[@id="feature-bullets"]/ul/li/span/text()').extract() ]
		# usernames = response.xpath('//*[@class="username mo"]').extract()
		# reviews = response.xpath('//*[@class="partial_entry"]/text()').extract()
		# item['reviews'] = zip(usernames,reviews)
		print "\n\n---------------------------------------------------"
		print(item)

		# except:
		# 	print('Not a product!')
		# 	item = None
		yield item

	def process_links(self,links):
		print "\n       LINKS"
		links_list = []
		for i in links:
			if "https://www.tripadvisor.com/Attraction_Review" in i.url:
				links_list.append(i)
				print i.url
		return links_list

	def dummy(self,response):
		print(str(response.url))
Example #30
    def parse_page(self, response):

        # ------products_paths------

        products_paths = response.xpath(
            "//div[@id='articles']/div/a/@href").extract()

        for product_path in products_paths:
            item = ProductItem()
            item["path"] = product_path

            request = Request(url="http://www.madeleine.de" + product_path,
                              callback=self.parse_product)
            request.meta['item'] = item

            yield request

        extr = LxmlLinkExtractor(allow="seite-\d+")
        links = extr.extract_links(response)

        for link in links:
            yield Request(url=link.url, callback=self.parse_page)
Example #31
class LittleBrotherSpider(CrawlSpider):
    name = "little_brother"
    allowed_domains = ["camara.gov.br"]
    start_urls = (
        #This list should contain all the deputies from this period of time (legislatura 54) -- change this number to get deputies from other periods
        'http://www.camara.gov.br/internet/deputado/Dep_Lista.asp?Legislatura=54&Partido=QQ&SX=QQ&Todos=None&UF=QQ&condic=QQ&forma=lista&nome=&ordem=nome&origem=None',
        #Example: 'http://www.camara.leg.br/Internet/Deputado/dep_Detalhe.asp?id=141463',
    )
    rules = ( Rule(LxmlLinkExtractor(allow=(".*Dep_Detalhe\.asp", ),), callback="parse_deputy", follow= True,),\
              Rule(LxmlLinkExtractor(allow=(".*RelVotacoes\.asp", ),), callback="parse_voting",  follow= True, ),
            )

    def parse_deputy(self, response):
        basic_info = response.xpath("//div[@class='bloco clearedBox']/ul/li")

        item = DeputyItem()
        for sel in basic_info:
            strong = sel.xpath("strong/text()").extract()
            if strong and "nome civil" in strong[0].lower():
                item["name"] = sel.xpath("text()").extract()[0].strip()

            if strong and "partido" in strong[0].lower():
                info = sel.xpath("text()").extract()[0].strip().split("/")
                item["party"] = info[0].strip()
                item["state"] = info[1].strip()
                item["active"] = info[2].strip()

            gid = re.match(".*id=(?P<id>\w*)", response.url)
            item["deputy_id"] = gid.group("id")
            #item["deputy_register"] =

        yield item

    def parse_voting(self, response):
        print "Parsing:", response.url
        pass

    def filter_deputy(self, response):
        pass
Example #32
class Jdv2Spider(CrawlSpider):
    name = "linkextrator"
    allowed_domains = ["lagou.com"]
    start_urls = ('http://www.lagou.com/zhaopin/Java?labelWords=label', )
    rules = [
        Rule(LxmlLinkExtractor(allow=("http://www.lagou.com/jobs/")),
             callback='parse_item',
             follow=True)
    ]

    def parse_item(self, response):
        item = CategoryItem()
        item['category'] = response.css(".job_bt").extract()
        return item
Example #33
class LinkProcedure(BaseProcedure):
    """
    A link extractor built on scrapy's LxmlLinkExtractor.
    link xpath
    xpath: string | array, see LxmlLinkExtractor's restrict_xpaths
    """
    def __init__(self, *args):
        xpath = args[0]
        self._extractor = LxmlLinkExtractor(restrict_xpaths=xpath)

    def do(self, input_, **kwargs):
        if isinstance(input_, Response):
            links = self._extractor.extract_links(input_)
            return [i.url.strip() for i in links]
        else:
            raise Exception('link input error')
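
LinkProcedure is a thin wrapper: it builds an LxmlLinkExtractor restricted to the given XPath and, given a scrapy Response, returns the stripped URLs of the links found there. A minimal usage sketch, assuming LinkProcedure (and its BaseProcedure base class) can be imported from the surrounding project; the page body and XPath below are invented for illustration:

from scrapy.http import HtmlResponse

# hypothetical page body; only the link inside <nav> should be extracted
body = b'<nav><a href="/a">A</a></nav><footer><a href="/b">B</a></footer>'
response = HtmlResponse(url='http://example.com/', body=body, encoding='utf-8')

proc = LinkProcedure('//nav')   # restrict extraction to the <nav> block
print(proc.do(response))        # expected: ['http://example.com/a']
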
Example #34
class TheCrawler(Spider):

    name = 'TheCrawlerEngineV1'

    def __init__(self, **kw):
        super(TheCrawler, self).__init__(**kw)
        
        self.channel = kw.get('channel')
        self.domain = kw.get('domain')
        
        full_config_path = '%s%s' % (SITE_CONFIG_PATH, self.channel)
        
        self.config_path = "%s/%s.txt" % (full_config_path, self.domain)
        self.config_items = self._parse_config(self.config_path)
        
        try:
            self.url = self.config_items['start_url']
        except KeyError:
            self.url = 'http://%s/' % self.domain
            
        try:
            self.link_extractor = LxmlLinkExtractor(restrict_xpaths=self.config_items['crawl_areas'], unique=True)
        except KeyError:
            self.link_extractor = LxmlLinkExtractor(unique=True)
                        
        self.real_domain = urlparse(self.url).hostname.lstrip('www.')
        self.allowed_domains = [urlparse(self.url).hostname.lstrip('www.')]
        
        self.cookies_seen = set()
        
    def start_requests(self):
        return [Request(self.url, callback=self.parse)]

    def parse(self, response):
        page = self._get_item(response)
        r = [page]
        r.extend(self._extract_requests(response))
        return r

    def _get_item(self, response):        
        item = ContentAttributes(
            url=response.url,
            size=str(len(response.body)),
            referer=response.request.headers.get('Referer')
        )

        self._set_content_data(item, response)
        self._set_new_cookies(item, response)
        
        return item

    def _extract_requests(self, response):
        r = []
        if isinstance(response, Response):
            links = self.link_extractor.extract_links(response)
            r.extend(Request(x.url, callback=self.parse) for x in links)
        return r

    def _set_content_data(self, page, response):
        if isinstance(response, Response):
            title = Selector(response).xpath(self.config_items['title']).extract()
            content = Selector(response).xpath(self.config_items['body']).extract()
            
            try:
                published_date = Selector(response).xpath(self.config_items['publish_date']).extract()
            except KeyError:
                published_date = None
            
            try:
                images = Selector(response).xpath(self.config_items['image']).extract()
            except KeyError:
                images = Selector(response).xpath('%s//img/@src' % self.config_items['body']).extract()

            image_urls = []
            for img_url in images:
                image_url_hostname = urlparse(img_url).hostname   
                image_url_scheme = urlparse(img_url).scheme
                
                if image_url_hostname is None:
                    img_url = "http://%s%s" % (self.real_domain, img_url)
                    
                if image_url_scheme is None:
                    img_url = "http://%s" % img_url
                    
                image_urls.append(img_url)
                        
            if title:
                page['title'] = title[0]
                
            if content:
                page['content'] = content[0]

            pubdate = 0
            if published_date is not None:
                if published_date:
                    day_pattern = self.config_items['publish_date_day_pattern']
                    month_pattern = self.config_items['publish_date_month_pattern']
                    year_pattern = self.config_items['publish_date_year_pattern']
                    time_pattern = self.config_items['publish_date_time_pattern']
                    
                    pubdate = self._translate_publish_date_to_timestamp(published_date[0], day_pattern, month_pattern, year_pattern, time_pattern)
            
            page['publish_date'] = str(pubdate)
            page['image_urls'] = image_urls 
            page['channel'] = self.channel
            page['domain'] = self.real_domain
        
    def _set_new_cookies(self, page, response):
        cookies = []
        for cookie in [x.split(';', 1)[0] for x in response.headers.getlist('Set-Cookie')]:
            if cookie not in self.cookies_seen:
                self.cookies_seen.add(cookie)
                cookies.append(cookie)
        if cookies:
            page['newcookies'] = cookies
            
    def _parse_config(self, filename):
        config = {}
        f = open(filename, 'r')

        for line in f.readlines():
            config_per_line = line.split(': ')
            if len(config_per_line) > 1:
                if config_per_line[0] != 'test_url':
                    config[config_per_line[0]] = config_per_line[1].rstrip()

                if config_per_line[0] == 'publish_date_time_pattern':
                    config[config_per_line[0]] = ':'.join(config_per_line[1:]).rstrip()
        f.close()
        
        return config
    
    def _translate_publish_date_to_timestamp(self, pubdate, day_pattern, month_pattern, year_pattern, time_pattern):
        try:
            re_day = re.compile(day_pattern)
            re_month = re.compile(month_pattern)
            re_year = re.compile(year_pattern)
            re_time = re.compile(time_pattern)

            try:
                day = re_day.findall(pubdate)[0]
            except IndexError:
                day = "00"

            try:
                month = re_month.findall(pubdate)[0].lower()
            except IndexError:
                month = "00"

            try:
                year = re_year.findall(pubdate)[0]
            except IndexError:
                year = "0000"

            try:
                parsed_time = re_time.findall(pubdate)[0]
            except IndexError:
                parsed_time = "00:00"
            try:
                pubdate_timestamp = convert_datetime_to_unix_timestamp("%s/%s/%s %s" % (year, MONTH_DICTIONARY["%s%s" % (month[0].upper(), month[1:])], day, parsed_time))
            except KeyError:
                pubdate_timestamp = 0
            
            return pubdate_timestamp
        except AttributeError:
            return 0
Example #35
 def __init__(self, *args):
     xpath = args[0]
     self._extractor = LxmlLinkExtractor(restrict_xpaths=xpath)