Example #1
    def parse_state(self, response):
        """ Yields a scrapy.Request object for each city with a store in the state """
        state_url = 'stores.joann.com/{}*'.format(response.meta['state'])
        extractor = LinkExtractor(allow=state_url)

        for link in extractor.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_city, headers=HEADERS)
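For context, the allow argument is a regular expression matched against each candidate URL, and extract_links() returns Link objects exposing .url and .text. A minimal, self-contained sketch of that API (the HTML and URLs below are invented for illustration):

from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor

html = b'<a href="https://stores.joann.com/fl/miami/">Miami</a>'
response = HtmlResponse(url='https://stores.joann.com/fl/', body=html, encoding='utf-8')

# allow is applied as a regex search against the absolute URL of each link
for link in LinkExtractor(allow=r'stores\.joann\.com/fl/').extract_links(response):
    print(link.url, link.text)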
Example #2
File: find_reports.py Project: orian/umo
    def parse_sesja(self, response):
        # uchwaly (resolutions)
        uchwaly_le = LinkExtractor(allow=FindReportsSpider.UCHWALA_RE, restrict_xpaths="//table")
        links = uchwaly_le.extract_links(response)
        self.print_links("uchwaly", links)
        cnt = 0
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_uchwala)
            k = items.PageItem()
            k["text"] = link.text.encode("utf8")
            k["url"] = link.url
            k["ref"] = response.url
            k["order"] = cnt
            yield k
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1

        # files (glosowania = votes, obecnosc = attendance)
        le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
        links = le.extract_links(response)
        self.print_links("glosowania", links)
        cnt = 0
        for link in links:
            fi = items.FiledownloadItem()
            fi["file_urls"] = [link.url]
            fi["text"] = link.text.encode("utf8")
            fi["url"] = link.url
            fi["ref"] = response.url
            fi["order"] = cnt
            yield fi
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1
Example #3
 def parse(self,response):
     extractor = LinkExtractor(allow="/article/*")
     links = extractor.extract_links(response)
     for link in links:
         item = XiubaiItem()
         req = Request(link.url, self.parse_detail_page)
         req.meta['item'] = item
         yield req
Example #4
 def parse_link(self, response):
     # log
     self.logger.info('Hi, this is an item page! %s', response.url)
     # parse link
     linkExtractor = LinkExtractor(allow=r".+\.shtml", restrict_css='div.list > ul', unique=True)
     links = linkExtractor.extract_links(response)
     for link in links:
         yield scrapy.Request(link.url, callback=self.parse_content)
Example #5
 def parse(self, response):
     le = LinkExtractor()
     for link in le.extract_links(response):
         yield scrapy.Request(link.url, self.parse_link, meta={
             'splash': {
                 'args': {'har': 1, 'html': 0},
             }
         })
Example #6
    def parse(self, response):
        if response.status != 200 or not response.body:
            return

        ads_links = response.xpath("//a[img]")
        for ads_link in ads_links:
            link_href = ads_link.xpath("@href").extract_first()
            if self._from_same_site(response.url, link_href):
                continue

            ads_profile = AdsProfileItem()
            ads_profile["ads_host"] = response.url
            ads_profile["ads_present_mode"] = "normal_1"
            ads_profile["ads_target_url"] = link_href
            img_src = response.urljoin(ads_link.xpath("img/@src").extract_first())
            ads_profile["ads_content_url"] = img_src
            ads_profile["ads_content_frame"] = ""
            ads_profile["ads_host_domain"] = urlparse(response.url).netloc
            ads_profile["ads_target_domain"] = urlparse(link_href).netloc
            yield ads_profile

        if isinstance(response, SplashJsonResponse):
            if "childFrames" in response.data:
                frames = self._get_all_child_frames(response)
                print "Get %s childFrames in %s" % (len(frames), response.url)
                for frame_response in frames:
                    if not self._is_valid_frame(frame_response.url):
                        continue
                    ads_links = frame_response.xpath("//a[img]")
                    for ads_link in ads_links:
                        link_href = ads_link.xpath("@href").extract_first()
                        if self._from_same_site(response.url, link_href):
                            continue

                        ads_profile = AdsProfileItem()
                        ads_profile["ads_host"] = response.url
                        ads_profile["ads_present_mode"] = "normal_1"
                        ads_profile["ads_target_url"] = link_href
                        img_src = frame_response.urljoin(ads_link.xpath("img/@src").extract_first())
                        ads_profile["ads_content_url"] = img_src
                        ads_profile["ads_content_frame"] = frame_response.url
                        ads_profile["ads_host_domain"] = urlparse(response.url).netloc
                        ads_profile["ads_target_domain"] = urlparse(link_href).netloc
                        yield ads_profile

        link_extractor = LinkExtractor()
        all_links = link_extractor.extract_links(response)
        for link in all_links:
            request = SplashRequest(
                response.urljoin(link.url),
                self.parse,
                endpoint="render.json",
                slot_policy=SlotPolicy.PER_DOMAIN,
                args={"html": 1, "iframes": 1},
            )
            request.headers.setdefault("User-Agent", self.ua_generater.get_user_agent())
            yield request
Example #7
 def parse(self, response):
     e = LinkExtractor()
     urls = [link.url for link in e.extract_links(response)]
     for url in urls:
         parsed = urlparse.urlsplit(url)
         qs = urlparse.parse_qs(parsed.query)
         if qs and 'Url' in qs:
             event_url = qs['Url'][0]
             yield self.add_url(event_url)
Example #8
    def parse_code(self, response):
        #提取source code的url
#        le = LinkExtractor(restrict_css='div.bodywrapper p', allow='matplotlib.org/examples')
#        link = le.extract_links(response)
        le = LinkExtractor(restrict_css='a.reference.external')
        link = le.extract_links(response)
        
        file = FilesItem()
        file['file_urls'] = [link[0].url]
        return file
Example #9
    def parse(self, response):
        le = LinkExtractor()
        user_profiles = []
        for link in le.extract_links(response):
            result = re.search(r'.*(http://www.last.fm/user/.*)', link.url)
            if result:
                user_profiles.append(result.group(1))

        for user_profile in user_profiles:
            print(user_profile)
Example #10
 def parse(self, response):
     link_extractor = LinkExtractor()
     links = link_extractor.extract_links(response)
     for link in links:
         item = DomainItem()
         item['link'] = link.url
         item['domain'] = self.getHost(link.url)
         yield item
     for link in links:
         if (not db.scrapy_items.find_one({'link': link.url})):
             yield scrapy.Request(link.url, callback=self.parse)
Example #11
 def parse(self, response):
     name = 'example'
     lx = LinkExtractor()
     lst = lx.extract_links(response)  # List contains the list of jobs
     # Call the function which compares between lst and MongoDB. Return Boolean Value
     flag = compare(name, lst)
     # if True, call the function which send an email to users
     if flag:
         notify(name)
     else:
         print("No Update")
Example #12
 def parse(self, response):
     le = LinkExtractor()
     for link in le.extract_links(response):
         yield SplashRequest(
             link.url,
             self.parse_link,
             endpoint='render.json',
             args={
                 'har': 1,
                 'html': 1,
             }
         )
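SplashRequest above comes from the scrapy-splash package, which only works once its middlewares are enabled in the project settings. For reference, a typical configuration as described in the scrapy-splash README looks roughly like this (the SPLASH_URL value is a placeholder for your own Splash instance):

# settings.py -- scrapy-splash boilerplate; SPLASH_URL is a placeholder
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'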
Example #13
 def parse(self, response):
     e = LinkExtractor()
     urls = [link.url for link in e.extract_links(response)]
     for url in urls:
         if response.url != url:
             yield self.add_url(url)
     if urls:
         qs = urlparse.parse_qs(urlparse.urlparse(response.url).query)
         qs = dict((k, v[0]) for (k, v) in qs.iteritems())
         qs['p'] = int(qs['p']) + 1
         url = 'http://comeon5678.com/event/list'
         yield scrapy.Request('%s?%s' % (url, urllib.urlencode(qs)))
Example #14
File: book.py Project: daguanqiao/gitt1
 def parse(self, response):
     # extract the link to each book on the listing page
     le = LinkExtractor(restrict_css='article.product_pod h3')
     for link in le.extract_links(response):
         yield scrapy.Request(link.url, callback=self.parse_book)
         
     # extract the next-page link
     le = LinkExtractor(restrict_css='ul.pager li.next')
     links = le.extract_links(response)
     if links:
         next_url = links[0].url
         yield scrapy.Request(next_url, callback=self.parse)
Example #15
 def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
              tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True, process_value=None,
              deny_extensions=None):
     # attrs must be an iterable of attribute names; ('href') is just the string
     # 'href', so the trailing comma is needed to make it a one-element tuple.
     LinkExtractor.__init__(self, allow=allow,
         deny=deny,
         allow_domains=allow_domains,
         deny_domains=deny_domains,
         restrict_xpaths=restrict_xpaths,
         tags=tags,
         attrs=attrs,
         canonicalize=canonicalize,
         unique=unique,
         process_value=self.process_value,
         deny_extensions=deny_extensions,
     )
Example #16
class MySpider(scrapy.Spider):
    # Your spider definition
    name = "fetch_data"

    def __init__(self, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = [kwargs.get('start_url')]
        self.link_extractor = LinkExtractor()
        urls = self.start_urls

    def parse(self, response):
        item = WebpageScraperItem()

        item['key'] = self.start_urls
        item['title'] = response.xpath('//title/text()').extract()
        item['paragraphs'] = response.xpath('//p/text()').extract()
        item['headings'] = response.xpath('//h1/text()').extract()

        links = self.link_extractor.extract_links(response)
        item['links'] = [x.url for x in links]

        img_urls = []
        img_url = response.xpath('//img/@src').extract()
        for img in img_url:
            parse_url = urlparse.urlparse(img)
            parsed_url = parse_url._replace(**{"scheme": "http"})
            img_urls.append(parsed_url.geturl())

        item['image_urls'] = img_urls
        return item
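Since __init__ reads the start URL from a keyword argument, this spider is presumably launched with Scrapy's -a spider-argument option, along the lines of (the URL is a placeholder):

scrapy crawl fetch_data -a start_url=http://example.com/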
Example #17
File: bc.py Project: scrapinghub/frontera
class BCSpider(Spider):
    name = 'bc'

    def __init__(self, *args, **kwargs):
        super(BCSpider, self).__init__(*args, **kwargs)
        self.le = LinkExtractor()

    def parse(self, response):
        if not isinstance(response, HtmlResponse):
            return

        for link in self.le.extract_links(response):
            r = Request(url=link.url)
            r.meta.update(link_text=link.text)
            yield r

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(BCSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider._set_crawler(crawler)
        spider.crawler.signals.connect(spider.spider_idle, signal=signals.spider_idle)
        return spider

    def spider_idle(self):
        self.log("Spider idle signal caught.")
        raise DontCloseSpider
Example #18
 def __init__(self, *args, **kwargs):
     super(GeneralSpider, self).__init__(*args, **kwargs)
     f = open("seeds_es_smp.txt")
     la = [urlparse(url.strip()).netloc for url in f.readlines()]
     f.close()
     self.la = la
     self.le = LinkExtractor()
 def print_url(self, response):
     """
         @url http://www.ura.org.hk/en/schemes-and-policies/redevelopment/ura-implemented-projects/reimbursement.aspx
         @returns items 1 1
         @returns requests 0 0
         @scrapes title link html text last_updated file_urls
     """
     l = ItemLoader(item=UrbanRenewalItem(), response=response)
     l.add_xpath('title', '//title')
     l.add_value('link', response.url)
     l.add_xpath('text', '//div[@id="content"]')
     l.add_xpath('html', '/html')
     l.add_xpath('last_updated', '//div[@class="lastUpdated"]')
     lx = LinkExtractor(allow=['\.' + ext for ext in file_extension],
                        deny_extensions=())
     l.add_value('file_urls', [link.url for link in lx.extract_links(response)])
     return l.load_item()
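The docstring of print_url uses Scrapy's spider contracts (@url, @returns, @scrapes). Callbacks annotated this way can be exercised with Scrapy's built-in contracts runner; <spider_name> below is a placeholder for the spider's name attribute, which is not shown in this snippet:

scrapy check <spider_name>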
Example #20
File: find_reports.py Project: orian/umo
    def parse_main(self, response):
        le = LinkExtractor(allow=KADENCJA_RE)
        links = le.extract_links(response)
        self.print_links("kadencje", links)
        cnt = 0

        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_kadencja)
            k = items.PageItem()
            k["text"] = link.text.encode("utf8")
            k["url"] = link.url
            k["ref"] = response.url
            k["order"] = cnt
            yield k
            if cnt >= DEBUG_CNT and DEBUG:
                break
            cnt += 1
Example #21
File: find_reports.py Project: orian/umo
 def parse_uchwala(self, response):
     # generate list of files to download
     le = LinkExtractor(allow=FindReportsSpider.PLIK_RE)
     links = le.extract_links(response)
     self.print_links("files", links)
     cnt = 0
     for link in links:
         fi = items.FiledownloadItem()
         fi["file_urls"] = [link.url]
         fi["text"] = link.text.encode("utf8")
         fi["url"] = link.url
         fi["ref"] = response.url
         fi["order"] = cnt
         yield fi
         if cnt >= DEBUG_CNT and DEBUG:
             break
         cnt += 1
Example #22
    def parse(self, response):
        for sel in response.css('article.product_pod'):
            book = BookstoresItem()
            book['name'] = sel.xpath('./h3/a/@title').extract_first()
            book['price'] = sel.css('p.price_color::text').extract_first()
            yield book
            
        # extract the next-page link
#        next_url = response.css('ul.pager li.next a::attr(href)').extract_first()
#        if next_url:
#            next_url = response.urljoin(next_url)
#            yield scrapy.Request(next_url,callback=self.parse)
        le = LinkExtractor(restrict_css='ul.pager li.next')
        links = le.extract_links(response)
        if links:
            next_url = links[0].url
            yield scrapy.Request(next_url, callback=self.parse)
Example #23
File: find_reports.py Project: orian/umo
 def parse_kadencja(self, response):
     #    'LIX Sesja Rady Miasta 24 września 2014 r.'
     #    'http://www.bip.olsztyn.eu/bip/dokument/305103/lix_sesja_rady_miasta_24_wrzesnia_2014_r_/'
     le = LinkExtractor(allow=FindReportsSpider.SESJA_RE)
     links = le.extract_links(response)
     self.print_links("sesje", links)
     cnt = 0
     for link in links:
         yield scrapy.Request(link.url, callback=self.parse_sesja)
         k = items.PageItem()
         k["text"] = link.text.encode("utf8")
         k["url"] = link.url
         k["ref"] = response.url
         k["order"] = cnt
         yield k
         if cnt >= DEBUG_CNT and DEBUG:
             break
         cnt += 1
Example #24
 def __init__(self, **kw):
     super(FollowAllSpider, self).__init__(**kw)
     url = kw.get('url') or kw.get('domain') or 'http://scrapinghub.com/'
     if not url.startswith('http://') and not url.startswith('https://'):
         url = 'http://%s/' % url
     self.url = url
     self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
     self.link_extractor = LinkExtractor()
     self.cookies_seen = set()
Example #25
    def parse(self, response):
        print(response.url)

        # Extract internal links from webpage
        IGNORED_EXTENSIONS.append('gz')
        IGNORED_EXTENSIONS.append('tar')
        urlextract = LinkExtractor(allow_domains=self.allowed_domains)

        # Store internal links
        links = urlextract.extract_links(response)
        links = [l.url for l in links]
        if response.url not in self.data:
            self.data[response.url] = links
        yield

        # Follow internal links
        for url in links:
            yield scrapy.Request(url, self.parse)
Example #26
 def __init__(self, **kw):
     super(FollowAllSpider, self).__init__(**kw)
     url = kw.get('url') or kw.get('domain') or 'https://zh.wikipedia.org/wiki/%E5%9C%9F%E8%B1%86%E7%BD%91'
     if not url.startswith('http://') and not url.startswith('https://'):
         url = 'http://%s/' % url
     self.url = url
     self.allowed_domains = [re.sub(r'^www\.', '', urlparse(url).hostname)]
     self.link_extractor = LinkExtractor()
     self.cookies_seen = set()
Example #27
File: quotes.py Project: gentlekai/Economy
 def parse_item(self, response):
     self.write_response(response.url, response)
     
     print("----------------------------------", response.real_url, response.url)
     
     le = LinkExtractor()
     for link in le.extract_links(response):
         splashRequestObj = SplashRequest(
             link.url,
             self.parse_item,
             endpoint='render.html',
             args={
                 'wait':0.8,
                 'html': 1,
             }
         )
         
         yield splashRequestObj
Example #28
File: quotes.py Project: gentlekai/Economy
 def parse(self, response):
     self.write_response(response.url, response)
     
     if (response.url.lower().find(r"cisco.com/en/us/docs") != -1
             or response.url.lower().find(r"cisco.com/c/en/us/td/docs") != -1
             or response.url.lower().find(r"register") != -1):
         return
     
     le = LinkExtractor()
     for link in le.extract_links(response):
         splashRequestObj = SplashRequest(
             link.url,
             self.parse,
             endpoint='render.html',
             args={
                 'wait':0.8,
                 'html': 1,
             }
         )
         
         yield splashRequestObj
Example #29
  def parse_item(self, response):
    internal_item = InternalItem()
    internal_item["url"] = response.url
    yield internal_item

    #Use the inbuilt LinkExtractor to find urls, filtering out internal urls
    extractor_external = LinkExtractor(deny_domains=self.allowed_domains)
    external_links = extractor_external.extract_links(response)
    for link in external_links:
      external_item = ExternalItem()
      external_item["url"] = link.url
      yield external_item

    for src in response.css("img::attr('src')"):
      asset_item = AssetItem()
      asset_item["url"] = response.urljoin(src.extract())
      yield asset_item

    for src in response.css("script::attr('src')"):
      asset_item = AssetItem()
      asset_item["url"] = response.urljoin(src.extract())
      yield asset_item
Example #30
def extract_links(response, xpaths, tag=None, attr=None):
    """Extract links on a page matching given XPaths.

    :param response:    Scrapy response whose body contains links to extract
    :type response:     :class:`scrapy.http.Response`
    :param xpaths:      a single XPath or an iterable of XPaths matching links
                        to extract
    :type xpaths:       `unicode` or `iterable` of `unicode`
    :param tag:         tag name from which to extract links
    :type tag:          `unicode`
    :param attr:        attribute name in :data:`tag` tags from which to
                        extract links
    :type attr:         `unicode`
    :yield:             extracted links (canonicalized URLs), directly usable
                        as :data:`scrapy.http.Request.url` parameters
    :rtype:             `generator` of `unicode`

    """
    # Construct LinkExtractor parameters
    extractor_attrs = {
        'restrict_xpaths': xpaths,
        'canonicalize': True,
        }
    if tag:
        extractor_attrs['tags'] = (tag,)
    if attr:
        extractor_attrs['attrs'] = (attr,)

    # Extract links
    link_extractor = LinkExtractor(**extractor_attrs)
    links = link_extractor.extract_links(response)

    # Generate links
    for link in links:
        yield link.url
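A minimal usage sketch of the helper above, assuming it is imported from the module that defines it; the HTML, URL, and XPath are invented for illustration:

from scrapy.http import HtmlResponse

html = b'<div id="nav"><a href="/products/1">One</a><a href="/products/2">Two</a></div>'
response = HtmlResponse(url='http://example.com/', body=html, encoding='utf-8')

# Only anchors inside the nav <div> are considered, and only their href attributes
for url in extract_links(response, xpaths='//div[@id="nav"]', tag='a', attr='href'):
    print(url)  # e.g. http://example.com/products/1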
Example #31
class KaggleSpider(CrawlSpider):
    name = "kaggle"
    allowed_domains = [
        "blog.kaggle.com",
    ]

    start_urls = [
        'http://blog.kaggle.com',
    ]

    __queue = []

    rules = [
        Rule(
            LinkExtractor(
                allow=[],
                deny=__queue,
                restrict_xpaths=[
                    '//*[@class="back-link"]',
                    '//*[@class="post clearfix"]/h1/a[1]',
                    # '//?page=\d+',
                    # '//\w+/\w+/\w+/w+'
                ]),
            callback='parse_extract_data',
            follow=True)
    ]

    def parse_extract_data(self, response):

        if response.xpath('//*[@class="back-link"]'
                          ) and 'Bandwidth exceeded' in response.body:
            raise CloseSpider('Exit')
        item = CrawlBlogItem()
        res = Selector(response)
        # import ipdb; ipdb.set_trace()
        # 		title = res.xpath('//*[@id="ctl01"]/div[5]/div[3]/div/div[1]/div[2]/div/div[1]/div[2]/h1/text()').extract()
        # 		item['title'] = ''.join(title).strip()
        #
        item['author'] = ''.join(
            response.xpath('//span[@class="author vcard"]/a/text()').extract())
        item['name'] = ''.join(
            response.xpath(
                '//div[@class="article-header-inside"]/h1/text()').extract())

        date_time = ''.join(
            response.xpath('//span[@class="entry-date"]/a[2]/@href').extract())
        if date_time:
            item['datetime'] = date_time[-11:]

        item['url'] = response.url

        content = enumerate(
            response.xpath('//div[@class="entry-content"]/node()'))
        content_data = {}
        check_point = 'Summary'

        for index, data in content:
            _data = data.extract()
            if check_point not in content_data:
                content_data[check_point] = []
            if '<p>' in _data or '\n' in _data or 'attachment' in _data:
                content_data[check_point].append(data.extract())
            if '<h2>' in _data:
                check_point = BeautifulSoup(_data).text

        item['content'] = content_data

        if 'name' in item and item['name']:
            return item
Example #32
class NetEaseSpider(CrawlSpider):
    # spider name
    name = 'netease'
    # start URLs to crawl
    start_urls = [
        'https://news.163.com/domestic/', 'https://news.163.com/world/'
    ]
    # start_urls = ['https://money.163.com/']
    # allowed domains for the crawl
    allowed_domains = ['news.163.com']
    # allowed_domains = ['money.163.com']

    # crawl rules
    # the date range is passed to get_base_url below
    rules = [
        Rule(LinkExtractor(allow=r'({0})\d+/.*?html'.format(bace_url)),
             callback='parse_item',
             follow=True) for bace_url in get_base_url('20200508', '20200508')
    ]

    def parse_item(self, response):
        item = NewsSpiderItem()
        item['news_thread'] = response.url.strip().split('/')[-1][:-5]
        self.get_source(response, item)
        self.get_source_url(response, item)
        self.get_url(response, item)
        self.get_time(response, item)
        self.get_title(response, item)
        self.get_text(response, item)
        return item

    def get_text(self, response, item):
        text = response.css('.post_text p::text').extract()
        if text:
            print('text:{}'.format(text).replace(' ', ''))
            new_text = list()
            for line in text:
                if line:
                    new_text.append(
                        line.replace(' ', '').replace('\n',
                                                      '').replace('\t', ''))
            item['news_text'] = new_text

    def get_url(self, response, item):
        url = response.url
        print(url)
        if url:
            item['news_url'] = url

    def get_title(self, response, item):
        title = response.css('title::text').extract()
        if title:
            print('title:{}'.format(title[0]))
            item['news_title'] = title[0]

    def get_time(self, response, item):
        time = response.css('div.post_time_source::text').extract()
        if time:
            print('time:{}'.format(time[0].strip().replace('来源', '').replace(
                '\u3000', '')))
            item['news_time'] = time[0].strip().replace('来源', '').replace(
                '\u3000', '')

    def get_source(self, response, item):
        source = response.css('#ne_article_source::text').extract()
        if source:
            print('source:{}'.format(source[0]))
            item['news_source'] = source[0]

    def get_source_url(self, response, item):
        source_url = response.css('#ne_article_source::attr(href)').extract()
        if source_url:
            print('source_url:{}'.format(source_url[0]))
            item['news_source_url'] = source_url[0]
Example #33
 def parse(self, response):
     print("RESPONSE ", response)
     xlink = LinkExtractor()
     yield ScrapyLink(link=xlink.extract_links(response))
Example #34
class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']

    rules = (
        Rule(LinkExtractor(allow=('zhaopin/.*', )), follow=True),
        Rule(LinkExtractor(allow=r'gongsi/j\d+.html'), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'),
             callback='parse_job',
             follow=True),
    )
    # headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0",
    #            "HOST": "www.lagou.com"}
    #
    # def start_requests(self):
    #     yield scrapy.Request(url=self.start_urls[0], headers=self.headers, callback=self.parse_job, dont_filter=True)
    custom_settings = {
        "COOKIES_ENABLED": False,
        "DOWNLOAD_DELAY": 1,
        'DEFAULT_REQUEST_HEADERS': {
            'Accept':
            'application/json, text/javascript, */*; q=0.01',
            'Accept-Encoding':
            'gzip, deflate, br',
            'Accept-Language':
            'zh-CN,zh;q=0.8',
            'Connection':
            'keep-alive',
            'Cookie':
            'user_trace_token=20171015132411-12af3b52-3a51-466f-bfae-a98fc96b4f90; LGUID=20171015132412-13eaf40f-b169-11e7-960b-525400f775ce; SEARCH_ID=070e82cdbbc04cc8b97710c2c0159ce1; ab_test_random_num=0; X_HTTP_TOKEN=d1cf855aacf760c3965ee017e0d3eb96; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DsXIrWUxpNGLE2g_bKzlUCXPTRJMHxfCs6L20RqgCpUq%26wd%3D%26eqid%3Dee53adaf00026e940000000559e354cc; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; index_location_city=%E5%85%A8%E5%9B%BD; TG-TRACK-CODE=index_hotjob; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAAFCAAEG50060B788C4EED616EB9D1BF30380575; _gat=1; _ga=GA1.2.471681568.1508045060; LGSID=20171015203008-94e1afa5-b1a4-11e7-9788-525400f775ce; LGRID=20171015204552-c792b887-b1a6-11e7-9788-525400f775ce',
            'Host':
            'www.lagou.com',
            'Origin':
            'https://www.lagou.com',
            'Referer':
            'https://www.lagou.com/',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        }
    }

    def parse_job(self, response):
        item_loader = LagouJobItemLoader(item=LagouJobItem(),
                                         response=response)
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        item_loader.add_css("title", ".job-name::attr(title)")
        item_loader.add_value("url", response.url)
        item_loader.add_value("url_object_id", get_md5(response.url))
        item_loader.add_css("salary", ".job_request p span.salary::text")
        item_loader.add_xpath("job_city",
                              "//dd[@class='job_request']/p/span[2]/text()")
        item_loader.add_xpath("work_years",
                              "//dd[@class='job_request']/p/span[3]/text()")
        item_loader.add_xpath("degree_need",
                              "//dd[@class='job_request']/p/span[4]/text()")
        item_loader.add_xpath("job_type",
                              "//dd[@class='job_request']/p/span[5]/text()")
        item_loader.add_css("publish_time",
                            ".job_request p.publish_time::text")
        item_loader.add_css("job_advantage", ".job-advantage p::text")
        item_loader.add_css("job_desc", ".job_bt div p")
        item_loader.add_css("job_addr", ".work_addr")
        item_loader.add_css("tags", ".position-label.clearfix li::text")
        item_loader.add_css("company_name", ".job_company dt a img::attr(alt)")
        item_loader.add_css("company_url", ".job_company dt a::attr(href)")
        item_loader.add_value("crawl_time", datetime.datetime.now())
        # item_loader.add_css("crawl_update_time",".work_addr")
        lagou_item = item_loader.load_item()
        return lagou_item
Example #35
class CrawlerSpider(AutoExtractSpider):
    """
    Crawler Spider discovers links and returns AutoExtract items too.

    Required params:
    * seeds: one, or more seed URLs (as YAML list)
    Example:
    > -a seeds=http://example.com/
    Or:
    > -a seeds='[http://blog.example.com/, http://shop.example.com/]'

    The mandatory "page-type" param from the parent AutoExtract Spider is also required.

    Optional params:
    * seeds-file-url: an optional URL to a plain text file with a list of seed URLs;
    * max-items: how many items (articles, or products) should the spider extract, per host;
        When the items are extracted, the spider stops. default: 100;
    * max-pages: how many pages should the spider follow per host, when discovering links;
        default: 1000;
    * count-limits: a YAML dict with page or item max count;
        example: {page_count: 90, item_count: 10}
    * extract-rules: a YAML dict with allowed and denied hosts and patterns;
        They will be used to initialize a scrapy.linkextractors.LinkExtractor;
        example: {allow: "/en/items/", deny: ["/privacy-?policy/?$", "/about-?(us)?$"]}
    * same-domain: limit the discovery of links to the same domains as the seeds;
        default: True
    * discovery-only: discover the links and return them, without AutoExtract items;
        default: False

    Extra options:
    * DEPTH_LIMIT: maximum depth that will be allowed to crawl; default: 1.
    * CLOSESPIDER_TIMEOUT: if the spider is running for more than that number of seconds,
        it will be automatically closed. default: 21600 seconds.
    """
    # name = 'crawler'
    only_discovery = False
    same_origin = True
    seed_urls = None
    seeds_file_url = None
    count_limits = DEFAULT_COUNT_LIMITS
    rules = [
        Rule(LinkExtractor(),
             process_links='_rule_process_links',
             process_req_resp='_rule_process_req_resp',
             follow=True),
    ]

    @classmethod
    def update_settings(cls, settings):
        super().update_settings(settings)
        update_redirect_middleware(settings)

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        spider.main_callback = spider.parse_page
        spider.main_errback = spider.errback_page

        for rule in spider.rules:
            rule._compile(spider)

        # Discovery only for seeds, without items
        if spider.get_arg('discovery-only'):
            spider.only_discovery = yaml.load(spider.get_arg('discovery-only'))
        # Limit requests to the same domain
        if spider.get_arg('same-domain'):
            spider.same_origin = yaml.load(spider.get_arg('same-domain'))

        # Seed URLs
        if getattr(spider, 'seeds', None):
            seeds = spider.seeds
            if isinstance(seeds, str):
                try:
                    spider.seed_urls = yaml.load(seeds)
                except Exception as err:
                    raise ValueError('Invalid seed URLs: %s %s', seeds, err)
            elif isinstance(seeds, (list, tuple)):
                spider.seed_urls = seeds
            del spider.seeds
        if spider.seed_urls:
            spider.seed_urls = arg_to_iter(spider.seed_urls)
        # Seeds file URL
        if spider.get_arg('seeds-file-url'):
            spider.seeds_file_url = spider.get_arg('seeds-file-url')

        # Domains allowed to be crawled, for OffsiteMiddleware and others
        if spider.same_origin and spider.seed_urls:
            if not hasattr(spider, 'allowed_domains'):
                spider.allowed_domains = DEFAULT_ALLOWED_DOMAINS
            spider.allowed_domains.extend(
                urlsplit(u).netloc.lower() for u in spider.seed_urls)

        crawler.signals.connect(spider.open_spider, signals.spider_opened)
        return spider

    def open_spider(self):  # noqa: C901
        """
        Parse command line args.
        """
        super().open_spider()

        # JSON count limits for pages or items
        if self.get_arg('count-limits'):
            limits = self.get_arg('count-limits')
            try:
                self.count_limits = yaml.load(limits) if not isinstance(
                    limits, dict) else limits
            except Exception as err:
                raise ValueError('Invalid count limits: %s %s', limits, err)
        # JSON link extraction rules
        if self.get_arg('extract-rules'):
            rules = self.get_arg('extract-rules')
            try:
                self.extract_rules = yaml.load(rules) if not isinstance(
                    rules, dict) else rules
            except Exception as err:
                raise ValueError('Invalid extraction rules: %s %s', rules, err)
        else:
            self.extract_rules = {}

        # Shortcut to limit global requests
        if self.get_arg('max-pages'):
            max_pages = int(self.get_arg('max-pages'))
            self.count_limits['page_host_count'] = max_pages
            if self.seed_urls:
                self.count_limits['page_count'] = max_pages * len(
                    self.seed_urls) * 2
            else:
                self.count_limits['page_count'] = max_pages * 2
        if self.get_arg('max-items'):
            max_items = int(self.get_arg('max-items'))
            self.count_limits['item_host_count'] = max_items
            if self.seed_urls:
                self.count_limits['item_count'] = max_items * len(
                    self.seed_urls) * 2
            else:
                self.count_limits['item_count'] = max_items * 2
        if self.count_limits:
            self.logger.debug('Using count limits: %s', self.count_limits)

        # Shortcut to allow and ignore links
        if self.get_arg('allow-links'):
            try:
                self.extract_rules['allow'] = yaml.load(
                    self.get_arg('allow-links'))
            except Exception as err:
                raise ValueError('Invalid allow-links: %s', err)
        if self.get_arg('ignore-links'):
            try:
                self.extract_rules['deny'] = yaml.load(
                    self.get_arg('ignore-links'))
            except Exception as err:
                raise ValueError('Invalid ignore-links: %s', err)
        if self.extract_rules:
            self.logger.debug('Using extract rules: %s', self.extract_rules)

        if self.only_discovery:
            self.logger.debug('Discovery ONLY mode enabled')

        return self

    @crawlera_session.init_start_requests
    def start_requests(self):
        """
        The main function.
        """
        # Process exact item URLs for Articles, or Products (if any)
        yield from super().start_requests()
        # Discover links and process the items
        yield from self._process_seeds()

    def _process_seeds(self) -> str:
        """
        Seeds are website URLs (can be JSON, JL, TXT, or CSV with 1 column)
        Because the list is expected to be small, the input can be one, or more URLs.
        Seed URLs will be crawled deeply, trying to find articles, or products.
        """
        if self.seeds_file_url:
            yield Request(self.seeds_file_url,
                          meta={'source_url': self.seeds_file_url},
                          callback=self.parse_seeds_file,
                          errback=self.main_errback,
                          dont_filter=True)

        if not self.seed_urls:
            return

        self.logger.info('Using seeds: %s', self.seed_urls)
        yield from self._schedule_seed_urls(self.seed_urls)

    def parse_seeds_file(self, response):
        """
        Process seeds file url response and schedule seed urls for processing.
        """
        if not isinstance(response, TextResponse):
            return
        seeds = response.text.split()
        yield from self._schedule_seed_urls(seeds)

    def _schedule_seed_urls(self, seed_urls):
        """
        A helper to process seed urls and yield appropriate requests.
        """
        for url in seed_urls:
            url = url.strip()
            if not is_valid_url(url):
                self.logger.warning('Ignoring invalid seed URL: %s', url)
                continue
            # Initial request to the seed URL
            self.crawler.stats.inc_value('x_request/seeds')
            yield Request(url,
                          meta={'source_url': url},
                          callback=self.main_callback,
                          errback=self.main_errback,
                          dont_filter=True)

    def parse_page(self, response):
        """
        Parse the spider response.
        """
        if not isinstance(response, TextResponse):
            return

        # Try to parse the AutoExtract response (if available) and return the correct Item
        is_autoextract_response = is_autoextract_request(response)
        if not self.only_discovery:
            if is_autoextract_response:
                yield from self.parse_item(response)
        else:
            # For discovery-only mode, return only the URLs
            item = {'url': response.url}
            item['scraped_at'] = utc_iso_date()
            if response.meta.get('source_url'):
                item['source_url'] = response.meta['source_url']
            if response.meta.get('link_text'):
                item['link_text'] = response.meta['link_text'].strip()
            yield item

        # Cycle and follow links
        # Currently AutoExtract responses don't contain the full page HTML,
        # so there are no links and nothing to follow
        if response.body and not is_autoextract_response:
            for request in self._requests_to_follow(response):
                yield crawlera_session.init_request(request)
        elif is_autoextract_response:
            # Make another request to fetch the full page HTML
            # Risk of being banned
            self.crawler.stats.inc_value('x_request/discovery')
            request = Request(response.url,
                              meta={'source_url': response.meta['source_url']},
                              callback=self.main_callback,
                              errback=self.main_errback,
                              dont_filter=True)
            yield crawlera_session.init_request(request)

    def _rule_process_links(self, links):
        """
        Simple helper used by the default Rule to drop links,
        when the same-origin option is enabled.
        """
        if not self.same_origin:
            return links
        valid_links = []
        for lnk in links:
            host = urlsplit(lnk.url).netloc.lower()
            if not hasattr(self,
                           'allowed_domains') or host in self.allowed_domains:
                valid_links.append(lnk)
        return valid_links

    def _rule_process_req_resp(self, request, response):
        """
        Simple helper used by the default Rule to fix the current request.
        """
        for m in META_TO_KEEP:
            if response.meta.get(m):
                request.meta[m] = response.meta[m]
        request.meta['scraped_at'] = utc_iso_date()
        request.callback = self.parse_page
        request.errback = self.errback_page
        return request

    def _requests_to_follow(self, response):
        seen = set()
        for n, rule in enumerate(self.rules):
            links = [
                lnk for lnk in rule.link_extractor.extract_links(response)
                if lnk.url not in seen
            ]
            if links and callable(rule.process_links):
                links = rule.process_links(links)
            for link in links:
                seen.add(link.url)
                meta = {'rule': n, 'link_text': link.text}
                request = self.make_extract_request(link.url, meta=meta)
                if not request:
                    continue
                if callable(rule.process_req_resp):
                    request = rule.process_req_resp(request, response)
                yield request

    def errback_page(self, failure):
        if failure.check(IgnoreRequest, DropItem):
            return
        request = getattr(failure, 'request', None)
        if request:
            self.logger.warning('Page %s failed: %s', request.body, failure)
            self.crawler.stats.inc_value('error/failed_page')
Example #36
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//meta[@property='og:title']/@content",
    'price' : "//tr/td/div[@class='giasanpham']",
    'category' : "//div[@class='dvtitleproduct_kind']/table/tr/td[2]/a",
    'description' : "//div[@class='producthotcatend']/div[@id='p1']/table",
    'images' : "//img[@id='anhchinh']/@src",
    'canonical' : "//link[@rel='canonical']/@href",
    'base_url' : "",
    'brand' : ""
}
name = 'hongha.asia'
allowed_domains = ['hongha.asia']
start_urls = ['http://hongha.asia/main/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/product/']), 'parse_item'),
    Rule(LinkExtractor(allow=['/cat/','/page+\d+\.html']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #37
class exhibitMeishujiaSpider(CrawlSpider):
    name = 'exhibit.meishujia'
    catid = 6
    typeid = 0
    sysadd = 1
    status = 99

    # allowed_domains = ['artist.meishujia.cn']
    start_urls = [
        "http://exhibit.meishujia.cn/index.php?page=1&act=app&appid=4099"
    ]
    # set the download delay
    download_delay = 1
    custom_settings = {
        'ITEM_PIPELINES': {
            'baby.pipelines.exhibitPipeline': 300,
            # 'baby.pipelines.JsonWriterPipeline': 350,
            # 'baby.pipelines.MultiImagesPipeline': 400,
            # 'baby.pipelines.MysqlWriterPipeline': 500,
        },
    }
    rules = (
        # listing pagination
        # Rule(LinkExtractor(allow=('/index.php?page=1&act=pps&smid=2'), allow_domains=('meishujia.cn'),restrict_xpaths=('//ul[@class="sert"]'))),
        # detail page 1
        # detail page 2: /?act=usite&usid=[0-9]{1,10}&inview=[a-z-0-9-]+&said=528  /?act=usite&usid=8646&inview=appid-241-mid-619&said=528
        # Rule(LinkExtractor(restrict_xpaths=('//li[@class="i42c"]/div[@class="i42ck"]'))),
        # when there is only one rule, keep the trailing ",", otherwise you get: TypeError: 'Rule' object is not iterable
        Rule(LinkExtractor(restrict_xpaths=(
            '//dd[re:test(@class,"theme_body_1609")]//ul[@class="srre"]//div[@class="srremap"]/a'
        )),
             callback='parse_item'), )

    def detail_lik(self, links):
        yield links

    def parse_item(self, response):
        # http://blog.51cto.com/pcliuyang/1543031
        l = DefaultItemLoader(item=exhibitMeishujiaItem(), selector=response)
        l.add_value('spider_link', get_base_url(response))
        l.add_xpath(
            'spider_img',
            '//dd[re:test(@class,"theme_body_1611")]//ul[re:test(@class,"zl_r_af")]//img[@src]'
        )
        l.add_value('spider_imgs', '//*[@id="photos"]//div[@class="panel"]')
        l.add_xpath(
            'title',
            'normalize-space(//dd[re:test(@class,"theme_body_1611")]//h1)')

        l.add_xpath('attr',
                    '//dd[re:test(@class,"theme_body_1611")]/ol//text()')
        l.add_value('attr_value', [])

        l.add_xpath(
            'content',
            '//dd[re:test(@class,"theme_body_1611")]//ul[re:test(@class,"zl_r_b zl_r_bt")]/node()'
        )
        l.add_value('keywords', '')
        l.add_value('description', '')
        l.add_value('thumbs', '')
        l.add_value('catid', self.catid)
        l.add_value('status', self.status)
        l.add_value('sysadd', self.sysadd)
        l.add_value('typeid', self.typeid)
        l.add_value('inputtime', int(time.time()))
        l.add_value('updatetime', int(time.time()))
        l.add_value('create_time',
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        l.add_value('update_time',
                    datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

        # l.add_xpath('content', '//dd[re:test(@class,"theme_body_4656")]//table[2]//tr[3]/td')
        # l.add_xpath('content', '//dd[re:test(@class,"theme_body_4656")]//table[2]//tr[3]/td//text()')
        # l.add_xpath('attr', '//dd[re:test(@class,"theme_body_1611")]/ol/span/text()')
        # l.add_xpath('attr_value', '//dd[re:test(@class,"theme_body_1611")]/ol/text()')

        d = l.load_item()
        # print(d)
        yield d

    def parse_content_item(self, selector):
        pass
Example #38
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//body/div[@id='container']/div[@id='content']/h1",
    'price' : "//div[@id='content']/div[@class='product-info']/div[@class='right']/div[@class='price']/text()",
    'category' : "//div[@id='container']/div[@id='content']/div[@class='breadcrumb']/a",
    'description' : "//body/div[@id='container']/div[@id='content']/div[@id='tab-attribute']",
    'images' : "//div[@class='left']/div[@class='image']/a[@class='colorbox cboxElement']/img/@src",
    'canonical' : "//link[@rel='canonical']/@href",
    'base_url' : "//base/@href",
    'brand' : ""
}
name = 'laptopnew.vn'
allowed_domains = ['laptopnew.vn']
start_urls = ['http://laptopnew.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['']), 'parse_item'),
    Rule(LinkExtractor(allow=['']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #39
import furl
from scrapy.linkextractors import LinkExtractor

from ..items import WebToonItem

link_extractor = LinkExtractor(
    r'/webtoon/detail.nhn\?titleId=\d+&no=\d+&weekday=\w+$',
    restrict_xpaths=".//a[contains(@onclick, 'lst.title')]")


def _extract_title(response):
    xpath = "//div[@class='comicinfo']/div[@class='detail']/h2/text()"
    return response.xpath(xpath).get().strip()


def _extract_description(response):
    xpath = "//div[@class='comicinfo']/div[@class='detail']/p/text()"
    return '\n'.join([desc.get() for desc in response.xpath(xpath)])


def _extract_thumbnail_src(response, titleId, **kwargs):
    contain = f"https://shared-comic.pstatic.net/thumb/webtoon/{titleId}/thumbnail/"
    for thumb in response.xpath(f"//img[contains(@src, '{contain}')]/@src"):
        return thumb.get()


def _extract_author(response):
    xpath = ".//span[@class='wrt_nm']/text()"
    return response.xpath(xpath).get().strip()

Example #40
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name':
    "//div[@class='product-info']/div[@class='info']/h1[@class='p-name cufon']",
    'price': "//div[@class='price cufon']/span[@class='num']",
    'category': "//div[@id='navation']/nav/ul/li/a/span",
    'description': "",
    'images':
    "//div[@class='pic-thumb']/span[@class='wp-pic']/img[@class='zoom-pic']/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'nonson.vn'
allowed_domains = ['nonson.vn']
start_urls = ['http://www.nonson.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(), 'parse_item'),
    Rule(LinkExtractor(), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
Example #41
File: crawler.py Project: Anandtamil/avain
    def process_response(self, response):
        """
        Process the given scrapy response. Extract new URLs, HTTP parameters,
        new network locations, cookies and code comments.

        :return: a set of URLs that shall be crawled in the future
        """

        if response.status == 404:
            return set()

        # store response HTTP code if not redirect
        if not (response.status == 301 or response.status == 302):
            if response.url not in self.crawled_urls:
                self.crawled_urls[response.url] = response.status

        # some colorful printing
        if self.verbose:
            code = str(response.status)
            extra_print = ""
            if code[0] == "2":
                color = util.GREEN
            elif code[0] == "3":
                color = util.BRIGHT_CYAN
                extra_print = (util.BRIGHT_CYAN + " --> " + util.SANE +
                               response.headers["Location"].decode())
            elif code[0] == "4":
                color = util.RED
            elif code[0] == "5":
                color = util.MAGENTA
            else:
                color = util.SANE
            print_str = "  [" + color + str(
                response.status
            ) + util.SANE + "]  " + response.url + extra_print
            util.printit(print_str)

        # extract cookies and their paths from HTTP response header
        cookie_paths = self.extract_cookies(
            response.headers.getlist("Set-Cookie"), response.url)
        cookie_urls = set()
        for path in cookie_paths:
            cookie_urls.add(self.to_absolute_url(path, response.urljoin))

        # use scrapy's lxml linkextractor to extract links / URLs
        scrapy_urls = set()
        try:
            # extract <base> URL's domain if a <base> tag exists
            base_domain = ""
            base_tag_sels = response.xpath("//base")
            for base_tag_sel in base_tag_sels:
                href_sels = base_tag_sel.xpath("@href")
                if href_sels:
                    href = href_sels.extract_first()
                    base_domain = urllib.parse.urlparse(href).netloc
                    break

            # setup allowed domains and extract new links
            allowed_domains = [self.domain, "%s:%s" % (self.domain, self.port)]
            if base_domain:
                allowed_domains.append(base_domain)
            raw_scrapy_links = LinkExtractor(
                allow_domains=allowed_domains,
                tags=("a", "area", "script", "link", "source", "img"),
                attrs=("src", "href"),
                deny_extensions=set()).extract_links(response)
            raw_scrapy_urls = [link.url for link in raw_scrapy_links]

            # copy discovered URLs and additionally insert initial network location
            scrapy_urls = raw_scrapy_urls.copy()
            if base_domain and base_domain != allowed_domains[
                    0] and base_domain != allowed_domains[1]:
                orig_netloc = urllib.parse.urlparse(response.url).netloc
                for scrapy_url in raw_scrapy_urls:
                    parsed_scrapy_url = list(urllib.parse.urlsplit(scrapy_url))
                    parsed_scrapy_url[1] = orig_netloc
                    scrapy_urls.append(
                        urllib.parse.urlunsplit(parsed_scrapy_url))
            scrapy_urls = set(scrapy_urls)
        except (AttributeError, scrapy.exceptions.NotSupported) as e:
            if str(e) == "Response content isn't text":
                # stop processing and return no new URLs
                return set()
            raise e

        # run the different URL / link discovery mechanisms
        linkfinder_urls, dynamic_urls, form_urls, sub_urls = set(), set(), set(), set()
        if self.config["use_linkfinder"].lower() == "true":
            linkfinder_urls = self.run_linkfinder(response.text,
                                                  response.urljoin)
        if self.config["use_selenium"].lower() == "true":
            dynamic_urls = self.extract_dynamic_urls(response.url)
        if self.config["extract_info_from_forms"].lower() == "true":
            form_data = extract_form_data(response)
            # extract new URLs and HTTP parameters from parsed form data
            form_urls = self.process_form_data(form_data, response.urljoin)

        # extract sub URLs, i.e. URLs with parent paths
        sub_urls = extract_sub_urls(response.url)

        # extract comments if configured
        if self.config["extract_comments"].lower() == "true":
            self.extract_comments(response)

        # unite discovered URLs
        urls = set()
        urls |= cookie_urls
        urls |= scrapy_urls
        urls |= linkfinder_urls
        urls |= dynamic_urls
        urls |= form_urls
        urls |= sub_urls

        # store info about redirect and add redirect URL to discovered URLs
        if response.status == 301 or response.status == 302:
            location = response.headers["Location"].decode()
            self.redirects[response.url] = {
                "code": response.status,
                "to": location
            }
            urls.add(self.to_absolute_url(location, response.urljoin))

        # process all the discovered URLs, i.e. extract new information and decide which to crawl
        yield_urls = set()
        for url in urls:
            # strip anchor
            if "#" in url:
                url = url[:url.rfind("#")]

            # replace entities and parse URL
            url = url.replace("&amp;", "&")
            url = url.replace("&#038;", "&")
            parsed_url = urllib.parse.urlparse(url)

            # disregard information about directory listing sorting
            if parsed_url.path.endswith("/") and re.match(
                    "C=[A-Z];O=[A-Z]", parsed_url.query):
                continue

            # extract GET parameters and cut URL if option is configured
            params = {}
            if parsed_url.query:
                if self.config["crawl_parameter_links"].lower() != "true":
                    url = "%s://%s/%s" % (parsed_url.scheme, parsed_url.netloc,
                                          parsed_url.path)
                params = get_query_params(parsed_url.query)
            elif url.endswith("?"):
                url = url[:-1]

            # add URL as instance of its path
            if self.url_has_netloc(url) and params:
                self.add_path_instance(parsed_url.path, params, {}, {})

            # skip already crawled URLs
            if url in self.found_urls:
                continue
            self.found_urls.add(url)

            # skip URLs with different network location
            if not self.url_has_netloc(url):
                continue
            if url == response.url:
                continue

            # skip paths that are excluded from crawling
            if self.exclude_paths and url.count("/") > 2:
                check_str = "/" + "/".join(url.split("/")[3:])
                if any(
                        re_path.match(check_str)
                        for re_path in self.exclude_paths):
                    continue

            # check whether to add this URL to the to-be-crawled URLs
            if url not in yield_urls:
                # limit the crawling depth
                max_depth = int(self.config["max_depth"])
                if max_depth > 0:
                    depth = parsed_url.path.count("/")
                    if depth > max_depth:
                        continue

                # limit the number of times a path can be crawled to avoid endless
                # crawling upon GET parameter variation
                if parsed_url.path not in self.crawled_paths:
                    self.crawled_paths[parsed_url.path] = 0
                self.crawled_paths[parsed_url.path] += 1
                if self.crawled_paths[parsed_url.path] > int(
                        self.config["max_path_visits"]):
                    continue

                yield_urls.add(url)

        return yield_urls
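
# A minimal sketch (added here, not part of the original spider) of what the
# get_query_params() helper used above presumably does: turn a raw query
# string into a dict mapping parameter names to their values. The exact
# return shape of the real helper is an assumption.
import urllib.parse

def get_query_params_sketch(query):
    # parse "a=1&b=2&b=3" into {"a": ["1"], "b": ["2", "3"]}
    params = {}
    for key, value in urllib.parse.parse_qsl(query, keep_blank_values=True):
        params.setdefault(key, []).append(value)
    return params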
예제 #42
0
class ITjuziSpider(RedisCrawlSpider):
    name = 'itjuzi'
    allowed_domains = ['www.itjuzi.com']
    # start_urls = ['http://www.itjuzi.com/company']
    redis_key = 'itjuzispider:start_urls'
    rules = [
        # 获取每一页的链接
        Rule(link_extractor=LinkExtractor(allow=('/company\?page=\d+'))),
        # 获取每一个公司的详情
        Rule(link_extractor=LinkExtractor(allow=('/company/\d+')),
             callback='parse_item')
    ]

    def parse_item(self, response):
        soup = BeautifulSoup(response.body, 'lxml')

        # header block: //div[@class="infoheadrow-v2 ugc-block-item"]
        cpy1 = soup.find('div', class_='infoheadrow-v2')
        if cpy1:
            # company name: //span[@class="title"]/b/text()[1]
            company_name = cpy1.find(
                class_='title').b.contents[0].strip().replace('\t',
                                                              '').replace(
                                                                  '\n', '')

            # slogan: //div[@class="info-line"]/p
            slogan = cpy1.find(class_='info-line').p.get_text()

            # category / sub-category: //span[@class="scope c-gray-aset"]/a
            scope_a = cpy1.find(class_='scope c-gray-aset').find_all('a')
            # category: //span[@class="scope c-gray-aset"]/a[1]
            scope = scope_a[0].get_text().strip() if len(scope_a) > 0 else ''
            # sub-category: //span[@class="scope c-gray-aset"]/a[2]
            sub_scope = scope_a[1].get_text().strip(
            ) if len(scope_a) > 1 else ''

            # city + district: //span[@class="loca c-gray-aset"]/a
            city_a = cpy1.find(class_='loca c-gray-aset').find_all('a')
            # city: //span[@class="loca c-gray-aset"]/a[1]
            city = city_a[0].get_text().strip() if len(city_a) > 0 else ''
            # district: //span[@class="loca c-gray-aset"]/a[2]
            area = city_a[1].get_text().strip() if len(city_a) > 1 else ''

            # homepage: //a[@class="weblink marl10"]/@href
            home_page = cpy1.find(class_='weblink marl10')['href']
            # tags: //div[@class="tagset dbi c-gray-aset"]/a
            tags = cpy1.find(class_='tagset dbi c-gray-aset').get_text(
            ).strip().replace('\n', ',')

        # basic info: //div[@class="block-inc-info on-edit-hide"]
        cpy2 = soup.find('div', class_='block-inc-info on-edit-hide')
        if cpy2:

            # company profile: //div[@class="block-inc-info on-edit-hide"]//div[@class="des"]
            company_intro = cpy2.find(class_='des').get_text().strip()

            # full name / founding date / company size / operating status: //div[@class="des-more"]
            cpy2_content = cpy2.find(class_='des-more').contents

            # full company name: //div[@class="des-more"]/div[1]
            company_full_name = cpy2_content[1].get_text().strip(
            )[len('公司全称:'):] if cpy2_content[1] else ''

            # founding date: //div[@class="des-more"]/div[2]/span[1]
            found_time = cpy2_content[3].contents[1].get_text().strip(
            )[len('成立时间:'):] if cpy2_content[3] else ''

            # company size: //div[@class="des-more"]/div[2]/span[2]
            company_size = cpy2_content[3].contents[3].get_text().strip(
            )[len('公司规模:'):] if cpy2_content[3] else ''

            # operating status: //div[@class="des-more"]/div[3]
            company_status = cpy2_content[5].get_text().strip(
            ) if cpy2_content[5] else ''

        # main content block:
        main = soup.find('div', class_='main')

        # funding info: //table[@class="list-round-v2 need2login"]
        # includes funding date, funding round, amount and investors
        tz = main.find('table', 'list-round-v2')
        tz_list = []
        if tz:
            all_tr = tz.find_all('tr')
            for tr in all_tr:
                tz_dict = {}
                all_td = tr.find_all('td')
                tz_dict['tz_time'] = all_td[0].span.get_text().strip()
                tz_dict['tz_round'] = all_td[1].get_text().strip()
                tz_dict['tz_finades'] = all_td[2].get_text().strip()
                tz_dict['tz_capital'] = all_td[3].get_text().strip().replace(
                    '\n', ',')
                tz_list.append(tz_dict)

        # team info: member name, title and introduction
        tm = main.find('ul', class_='list-prodcase limited-itemnum')
        tm_list = []
        if tm:
            for li in tm.find_all('li'):
                tm_dict = {}
                tm_dict['tm_m_name'] = li.find('span',
                                               class_='c').get_text().strip()
                tm_dict['tm_m_title'] = li.find(
                    'span', class_='c-gray').get_text().strip()
                tm_dict['tm_m_intro'] = li.find(
                    'p', class_='mart10 person-des').get_text().strip()
                tm_list.append(tm_dict)

        # product info: product name, type and introduction
        pdt = main.find('ul', class_='list-prod limited-itemnum')
        pdt_list = []
        if pdt:
            for li in pdt.find_all('li'):
                pdt_dict = {}
                pdt_dict['pdt_name'] = li.find('h4').b.get_text().strip()
                pdt_dict['pdt_type'] = li.find(
                    'span', class_='tag yellow').get_text().strip()
                pdt_dict['pdt_intro'] = li.find(
                    class_='on-edit-hide').p.get_text().strip()
                pdt_list.append(pdt_dict)

        item = JuziItem()
        item['info_id'] = response.url.split('/')[-1:][0]
        item['company_name'] = company_name
        item['slogan'] = slogan
        item['scope'] = scope
        item['sub_scope'] = sub_scope
        item['city'] = city
        item['area'] = area
        item['home_page'] = home_page
        item['tags'] = tags
        item['company_intro'] = company_intro
        item['company_full_name'] = company_full_name
        item['found_time'] = found_time
        item['company_size'] = company_size
        item['company_status'] = company_status
        item['tz_info'] = tz_list
        item['tm_info'] = tm_list
        item['pdt_info'] = pdt_list
        return item
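
# Hedged note (added): as a RedisCrawlSpider, ITjuziSpider does not read a
# start_urls attribute; it pops start URLs from the Redis list named by
# redis_key. A minimal way to seed the crawl, assuming a local Redis server
# and the redis-py client:
import redis

r = redis.Redis(host='localhost', port=6379)
r.lpush('itjuzispider:start_urls', 'http://www.itjuzi.com/company')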
예제 #43
0
class BjdfundSpider(CrawlSpider):
    name='bjdall'
    source = "京报网"
    allowed_domains = ["bjd.com.cn"]
    reg='jx'
    start_urls = [
        'http://www.bjd.com.cn/jx/toutiao/',
        'http://www.bjd.com.cn/jx/jj/'
    ]
    rules=(
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
        Rule(LinkExtractor(allow='_[0-9].\.html')),
    )
    def printcn(self, uni):
        print(uni.encode('utf-8'))
    def parse_news(self,response):
        item = GenericItem()
        self.get_id(response,item)
        self.get_url(response,item)
        self.get_source(response,item)
        self.get_title(response,item)
        self.get_date(response,item)
        self.get_body(response,item)
        # remember to return the item after parsing
        return item

    def get_id(self,response,item):
        id=uuid.uuid4()
        if id:
            item['id']=id
    def get_url(self,response,item):
        news_url=response.url
        if news_url:
            item['url']=news_url
    def get_source(self,response,item):
        source=self.source
        if source:
            item['source']=source
    def get_title(self,response,item):
        title=response.xpath('//div[@class="tit"]/text()').extract()
        if title:
            item['title']=''.join(title).strip()
    def get_date(self,response,item):
        date=response.xpath('//div[@class="info"]/span[1]/text()').extract()
        if date:
            item['date']=''.join(date).replace(u'-',u'').replace(u':',u'').replace(u' ',u'').strip()
    def get_body(self,response,item):
        paras = response.xpath('//div[@class="TRS_Editor"]/p')
        if not paras:
            paras = response.xpath('//div[@class="TRS_Editor"]')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    #   print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_','_|_')
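
# Illustration (added, not from the original source) of the body-joining
# convention used in get_body(): every paragraph is appended with a '_|_'
# separator, and the final replace collapses the doubled separator left
# behind when a paragraph contributes no text.
paras = ['First paragraph.', '', 'Second paragraph.']
news_body = ''
for para in paras:
    news_body += para + '_|_'
print(news_body.replace('_|__|_', '_|_'))
# -> First paragraph._|_Second paragraph._|_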
예제 #44
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1[@class='h1Title']",
    'price': "//div[@class='row_infoP']/span[@class='dt_price']",
    'category': "//div[@class='path flt']/a/span",
    'description': "//div[@id='tabs_detail_content']/div[@class='section'][1]",
    'images': "//img[@id='mainImage']/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'amazona.vn'
allowed_domains = ['amazona.vn']
start_urls = ['http://amazona.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/san-pham/[a-zA-Z0-9-]+\.html$']),
         'parse_item'),
    Rule(LinkExtractor(allow=['/danh-muc/[a-zA-Z0-9-]+\.html($|\?page=\d+)']),
         'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
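
# Hedged sketch (added): modules like the one above are configuration only;
# the project that generated them presumably feeds XPATH, rules, start_urls
# and the other module attributes into a shared base spider. The factory
# below is an illustration of that idea, not the actual generator code.
from scrapy.spiders import CrawlSpider


def make_spider(cfg):
    """Build a CrawlSpider class from a generated config module."""

    class ConfiguredSpider(CrawlSpider):
        name = cfg.name
        allowed_domains = cfg.allowed_domains
        start_urls = cfg.start_urls
        rules = cfg.rules

        def parse_item(self, response):
            # apply each configured XPath and keep whatever it extracts
            return {
                field: response.xpath(xpath).getall()
                for field, xpath in cfg.XPATH.items() if xpath
            }

    return ConfiguredSpider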
예제 #45
0
class EastmoneySpider(CrawlSpider):
    name = 'eastmoney'
    source = "东方财富网"
    allowed_domains = ["eastmoney.com"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
    reg = yesterday
    start_urls = [
        'http://finance.eastmoney.com/news/ccjdd.html',
        'http://finance.eastmoney.com/news/cywjh.html',
        'http://finance.eastmoney.com/news/chgjj.html',
        'http://finance.eastmoney.com/news/cjrzb.html',
        'http://finance.eastmoney.com/news/ccyjj.html',
        'http://finance.eastmoney.com/news/cssgs.html',
        'http://finance.eastmoney.com/news/cgnjj.html',
        'http://finance.eastmoney.com/news/cgjjj.html',
        'http://finance.eastmoney.com/news/ccjxw.html',
        'http://finance.eastmoney.com/news/cjjsp.html',
        'http://finance.eastmoney.com/news/ccyts.html',
        'http://finance.eastmoney.com/news/csygc.html',
        'http://finance.eastmoney.com/news/czfgy.html',
        'http://finance.eastmoney.com/news/csyjy.html',
        'http://finance.eastmoney.com/news/cjjxr.html',
        'http://finance.eastmoney.com/news/csxy.html',
        'http://finance.eastmoney.com/news/czsdc.html',
        'http://finance.eastmoney.com/news/crdsm.html',
        'http://stock.eastmoney.com/news/cgsxw.html'
    ]
    rules = (
        Rule(LinkExtractor(allow=reg, deny='data.eastmoney.com'),
             callback="parse_news",
             follow=True),
        # Rule(LinkExtractor(allow='_[0-9]+.html'))
        Rule(LinkExtractor(allow='_[1-6].html')))

    def printcn(self, uni):
        print(uni.encode('utf-8'))

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # remember to return the item after parsing
        if item['body']:
            return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath(
            '//div[@class="newsContent"]/h1/text()').extract()
        if title:
            item['title'] = title

    def get_date(self, response, item):
        date = response.xpath('//div[@class="time"]/text()').extract()
        if date:
            item['date'] = ''.join(date).replace(u'年', u'').replace(
                u'月', u'').replace(u'日', u'').replace(u':', u'').replace(
                    u' ', u'') + '00'

    def get_body(self, response, item):
        abstract = response.xpath('//div[@class="b-review"]/text()').extract()
        paras = response.xpath('//*[@id="ContentBody"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    #   print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = ''.join(abstract) + '_|_' + news_body.replace(
            '_|__|_', '_|_')
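
# Note (added): reg above is simply yesterday's date stamp, so the first Rule
# only follows links whose URL contains that string, i.e. articles published
# yesterday, assuming the site embeds the publication date in its article URLs.
import datetime

reg = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y%m%d')
print(reg)  # e.g. '20240101'; LinkExtractor(allow=reg) matches any URL
            # containing this substring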
예제 #46
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='product-col-desc']/h1[@class='title']",
    'price': "//div[@class='product-price']/strong[@class='colorh']",
    'category': "//div[@class='BreadcrumbText']/a",
    'description':
    "//div[@class='content-responsive']/div[@class='content news']",
    'images': "//div[@class='clearfix']/a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'lanopearl.com.vn'
allowed_domains = ['lanopearl.com.vn']
start_urls = ['http://lanopearl.com.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/san-pham/[a-zA-Z0-9-/]+\.html$']),
         'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-]+/$']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
예제 #47
0
class TruliaSpider(scrapy.Spider):
    name = 'sold_150'
    allowed_domains = ['trulia.com']
    custom_settings = {
        'FEED_URI':
        os.path.join(os.path.dirname(closest_scrapy_cfg()),
                     'data/iterate/sold_%(start)s_%(time)s.jl'),
        'FEED_FORMAT':
        'jsonlines'
    }

    def __init__(self, state='IL', city='Chicago', start=150, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start = start
        self.start_urls = [
            'https://www.trulia.com/sold/{city},{state}/'.format(state=state,
                                                                 city=city)
        ]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/p/')

    def parse(self, response):
        #N = 598 #trulia.TruliaSpider.get_number_of_pages_to_scrape(response)
        M = self.start
        N = M + 50
        self.logger.info(
            "Seaching between index page {M} and index page {N} ".format(N=N,
                                                                         M=M))
        for url in [
                response.urljoin("{n}_p/".format(n=n))
                for n in range(M, N + 1)
        ]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url,
                                 callback=self.parse_property_page)

    def parse_property_page(self, response):
        item_loader = TruliaItemLoader(item=TruliaItem(), response=response)
        trulia.TruliaSpider.load_common_fields(item_loader=item_loader,
                                               response=response)

        details = item_loader.nested_css('.homeDetailsHeading')
        taxes = details.nested_xpath(
            './/*[text() = "Property Taxes and Assessment"]/parent::div')
        taxes.add_xpath(
            'property_tax_assessment_year',
            './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()'
        )
        taxes.add_xpath(
            'property_tax',
            './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()'
        )
        taxes.add_xpath(
            'property_tax_assessment_land',
            './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()'
        )
        taxes.add_xpath(
            'property_tax_assessment_improvements',
            './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()'
        )
        taxes.add_xpath(
            'property_tax_assessment_total',
            './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()'
        )
        taxes.add_xpath(
            'property_tax_market_value',
            './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()'
        )

        item = item_loader.load_item()
        trulia.TruliaSpider.post_process(item=item)
        return item
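
    # Added note: parse() builds index-page URLs by joining "{n}_p/" onto the
    # start URL, so with the defaults (Chicago, IL, start=150) it requests
    # https://www.trulia.com/sold/Chicago,IL/150_p/ through .../200_p/, and
    # parse_index_page() then follows every /p/ property link on each page.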
예제 #48
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@class='container']/form/div/div/p",
    'price' : "//div[@class='container']/form/div/div/label",
    'category' : "//div[@class='content left']/a",
    'description' : "//div[@class='center-content']/div[@class='center-content']/div[@class='container']/div[@class='container']",
    'images' : "//a[@class='gallery']/img/@src",
    'canonical' : "",
    'base_url' : "",
    'brand' : ""
}
name = 'anhemfeather.vn'
allowed_domains = ['anhemfeather.vn']
start_urls = ['http://anhemfeather.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow = ['/task,view/']), 'parse_item'),
    Rule(LinkExtractor(allow = ['/task,cat/']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
예제 #49
0
class zhiLianSpider(RedisCrawlSpider):
    name = "zhilian_1"
    start_urls = 'https://sou.zhaopin.com/?jl=489'

    def __init__(self, *args, **kwargs):
        self.allowed_domains = ['zhaopin.com', 'baidu.com']
        self.start_urls = 'https://sou.zhaopin.com/?jl=489'
        super().__init__(*args, **kwargs)

    #     for x in self.create_2_request("123"):
    #         pass

    manu_conn_redis1.lpush("zhilian_1:start_urls", start_urls)

    rules = (
        Rule(LinkExtractor(allow=(r"zhaopin\.com/\?jl")),
             callback='process_item_c1',
             follow=True),
        Rule(LinkExtractor(
            allow=(r"https://fe-api\.zhaopin\.com/c/i/sou/\?cityId.+")),
             callback='process_item_c1',
             follow=True),
    )

    def test_rule_is_have_use(self, response):
        manu_conn_redis1.lpush("test_content", response.text)

    # RedisSpider requires a parse callback to be defined, so define one here.
    def process_item_c1(self, response):
        print("I am the kumanxuan")
        # print(response.url)
        return self.handle_json_2_item(response)

    # def parse(self,response):

    #     yield scrapy.Request("https://www.baidu.com")

# Zhaopin's front end has switched to a client-rendered model, so a special approach is needed: treat the crawler as an API client and parse the JSON directly

    def handle_json_2_item(self, response):
        # parse the JSON body into a dict; response exposes .text here
        # (xpath is the more common approach, but this endpoint returns JSON)
        response_dict = json.loads(response.text)
        # items = SpiderZhilianItem()
        for x in response_dict['data']['results']:
            items = {}
            items['city_name'] = x['city']['display']
            items['city'] = x['city']['items'][0]['code']
            items['company_name'] = x['company']['name']
            items['number'] = x['company']['number']

            items['education'] = x['eduLevel']['name']
            items['experience'] = x['workingExp']['name']
            items['salary'] = x['salary']
            items['job_name'] = x['jobName']

            # each time an item is completed, also store its dict in redis
            manu_conn_redis1.lpush('res_' + str(items['city']), items)

            yield items
        # return  items

    # create the requests in bulk;
    # the concrete province info still needs to be resolved here

    # this should only run once, since it merely generates the batch of start URLs;
    # it can simply be called from __init__ to complete the initialisation
    def create_2_request(self, response):
        # to get the province info, just reuse the neighbouring helper

        # assume for now that the hot-city list is used
        hot_city = return_province_info()['hot_citys']

        # build a large batch of URLs and submit them back to scrapy,
        # pairing each city code with its URL
        redis_conn = manu_conn_redis1

        redis_conn.lpush("zhilian_1:start_urls", self.start_urls)

        for x in hot_city:
            for x1 in crawler_zhilian(x['code']):
                for x2 in x1:
                    print(x2)
                    yield redis_conn.lpush("zhilian_1:start_urls", x2)
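
# Added, hedged note: handle_json_2_item() assumes the sou API responds with
# JSON shaped roughly like the sketch below. Only the keys the code reads are
# taken from the code itself; everything else about the payload is an
# assumption.
#
# {
#   "data": {
#     "results": [
#       {
#         "city": {"display": "...", "items": [{"code": "..."}]},
#         "company": {"name": "...", "number": "..."},
#         "eduLevel": {"name": "..."},
#         "workingExp": {"name": "..."},
#         "salary": "...",
#         "jobName": "..."
#       }
#     ]
#   }
# }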
예제 #50
0
class PlymouthSpider(CrawlSpider):
    name = 'worcester_s'
    allowed_domains = ['www.worcester.ac.uk']
    start_urls = []
    base_url = 'https://www.worcester.ac.uk%s'

    Lists = [
        '/journey/uwic-programme-accountancy-business-marketing.html',
        '/journey/accounting-and-finance-ba-hons-wbs.html',
        '/journey/advancing-practice-msc.html',
        '/journey/allied-health-studies-mphil-phd.html',
        '/journey/animal-biology-bsc-hons.html',
        '/journey/animal-biology-mbiol-integrated-masters.html',
        '/journey/animal-biology-mphil-phd.html',
        '/journey/animal-biology-degrees.html',
        '/journey/animation-ba-hons.html', '/journey/animation-degrees.html',
        '/journey/applied-health-social-care-ba-top-up.html',
        '/journey/applied-sport-science-msc.html',
        '/journey/applied-sports-performance-analysis-msc.html',
        '/journey/arabic-module.html',
        '/journey/archaeology-heritage-studies-degrees.html',
        '/journey/archaeology-mphil-phd.html',
        '/journey/master-research-archaeology.html',
        '/journey/archaeology-and-heritage-studies-ba-hons.html',
        '/journey/art-and-design-mphil-phd.html',
        '/journey/uwic-programme-art-design-creative-media.html',
        '/journey/atmospheric-sciences-mphil-phd.html',
        '/journey/biochemistry-bsc-hons.html',
        '/journey/biochemistry-mbiol-integrated-masters.html',
        '/journey/biochemistry-mphil-phd.html',
        '/journey/biology-degrees.html', '/journey/biology-bsc-hons.html',
        '/journey/biology-mbiol-integrated-masters.html',
        '/journey/master-research-biology.html',
        '/journey/biomedical-sciences-bsc-hons.html',
        '/journey/birth-and-beyond-ba-top-up.html',
        '/journey/birth-beyond-fda.html',
        '/journey/business-and-accountancy-ba-hons.html',
        '/journey/business-and-digital-communications-ba-hons.html',
        '/journey/business-and-enterprise-ba-hons.html',
        '/journey/business-and-finance-ba-hons.html',
        '/journey/business-and-human-resource-management-ba-hons.html',
        '/journey/business-and-marketing-ba-hons.html',
        '/journey/uwic-programme-pg-business.html',
        '/journey/business-administration-ba-hons.html',
        '/journey/business-information-technology-bsc-hons.html',
        '/journey/business-mphil-phd.html',
        '/journey/business-management-ba-hons-top-up.html',
        '/journey/business-management-ba-hons-wbs.html',
        '/journey/business-management-hnd.html',
        '/journey/business-management-degrees.html',
        '/journey/business-psychology-bsc-hons.html',
        '/journey/business-studies-ba-hons.html',
        '/journey/business-economics-finance-ba-hons-wbs.html',
        '/journey/online-celta-course.html',
        '/journey/chartered-manager-degree-apprenticeship.html',
        '/journey/child-adolescent-mental-health-fdsc.html',
        '/journey/child-adolescent-mental-health-bsc-hons-top-up.html',
        '/journey/mres-clinical-education.html',
        '/journey/clinical-psychology-bsc-hons.html',
        '/journey/collaborative-working-with-children-young-people-families-fda.html',
        '/journey/computer-games-design-development-bsc-hons.html',
        '/journey/computing-bsc-hons.html', '/journey/computing-hnd.html',
        '/journey/computing-mphil-phd.html', '/journey/computing-degrees.html',
        '/journey/counselling-fdsc.html', '/journey/counselling-msc.html',
        '/journey/counselling-psychology-bsc-hons.html',
        '/journey/creative-professional-writing-ba-hons.html',
        '/journey/creative-professional-writing-degrees.html',
        '/journey/creative-digital-media-mphil-phd.html',
        '/journey/creative-digital-media-degrees.html',
        '/journey/creative-digital-media-ba-hons.html',
        '/journey/creative-media-ma.html',
        '/journey/cricket-coaching-management-bsc-hons.html',
        '/journey/criminology-ba-hons.html',
        '/journey/criminology-mphil-phd.html',
        '/journey/criminology-with-policing-ba-hons.html',
        '/journey/dance-hnd.html',
        '/journey/dance-and-community-practice-ba-hons.html',
        '/journey/dementia-studies-mphil-phd.html',
        '/journey/dental-technology-fdsc.html', '/journey/design-mres.html',
        '/journey/developmental-psychology-bsc-hons.html',
        '/journey/diploma-in-education-and-training.html',
        '/journey/doctor-business-administration-dba.html',
        '/journey/doctor-education-edd.html',
        '/journey/drama-performance-degrees.html',
        '/journey/drama-performance-ma.html',
        '/journey/drama-performance-ba-hons.html',
        '/journey/drama-and-performance-mphil-phd.html',
        '/journey/msc-emdr-therapy.html',
        '/journey/early-childhood-professional-practice-ba-hons.html',
        '/journey/early-modern-studies-mres.html',
        '/journey/early-years-foundation-degree-flexible-distributed-learning-pathway.html',
        '/journey/early-years-sector-endorsed-fda.html',
        '/journey/ecology-bsc-hons.html', '/journey/ecology-mphil-phd.html',
        '/journey/mres-ecology-environmental-management.html',
        '/journey/ecology-degrees.html', '/journey/education-ma.html',
        '/journey/education-mphil-phd.html', '/journey/mres-education.html',
        '/journey/education-studies-ba-hons.html',
        '/journey/education-studies-degrees.html',
        '/journey/english-language-studies-ba-hons.html',
        '/journey/english-language-degrees.html',
        '/journey/english-literature-ba-hons.html',
        '/journey/english-literature-and-language-mphil-phd.html',
        '/journey/english-literature-degrees.html',
        '/journey/entrepreneurship-ba-hons-wbs.html',
        '/journey/environmental-science-bsc-hons.html',
        '/journey/environmental-science-degrees.html',
        '/journey/environmental-studies-science-mphil-phd.html',
        '/journey/film-production-ba-hons.html',
        '/journey/film-production-degrees.html',
        '/journey/film-studies-ba-hons.html',
        '/journey/film-studies-mphil-phd.html',
        '/journey/film-studies-degrees.html',
        '/journey/fine-art-practice-ba-hons.html',
        '/journey/fine-art-mres.html', '/journey/fine-art-degrees.html',
        '/journey/football-business-management-coaching-fdsc.html',
        '/journey/forensic-psychology-bsc-hons.html',
        '/journey/forensic-and-applied-biology-bsc-hons.html',
        '/journey/free-general-english-classes.html',
        '/journey/french-module.html', '/journey/game-art-ba-hons.html',
        '/journey/game-art-degrees.html',
        '/journey/general-english-classes-advanced.html',
        '/journey/general-english-classes-english-foreign-language.html',
        '/journey/geography-bsc-hons.html', '/journey/geography-degrees.html',
        '/journey/german-module.html', '/journey/graphic-design-ba-hons.html',
        '/journey/graphic-design-multimedia-degrees.html',
        '/journey/green-media-mres.html',
        '/journey/health-sciences-bsc-hons.html',
        '/journey/health-and-social-care-fdsc.html',
        '/journey/higher-education-mapgdippgcert.html',
        '/journey/history-ba-hons.html', '/journey/history-mphil-phd.html',
        '/journey/history-mres.html', '/journey/history-degrees.html',
        '/journey/human-biology-bsc-hons.html',
        '/journey/human-biology-mbiol-integrated-masters.html',
        '/journey/human-biology-mphil-phd.html',
        '/journey/human-biology-degrees.html',
        '/journey/human-geography-ba-hons.html',
        '/journey/huma-geography-mphil-phd.html',
        '/journey/human-nutrition-bsc-hons.html',
        '/journey/human-nutrition-degrees.html',
        '/journey/human-resource-management-ma.html',
        '/journey/management-human-resources-msc.html',
        '/journey/ielts-preparation-classes-english-foreign-language.html',
        '/journey/illustration-ba-hons.html',
        '/journey/illustration-degrees.html',
        '/journey/language-module-improving-english-in-academic-language.html',
        '/journey/language-module-improving-english-in-academic-writing-non-native-speakers.html',
        '/journey/integrated-working-children-families-ba-hons-top-up-degree.html',
        '/journey/integrative-counselling-ba-hons.html',
        '/journey/integrative-counselling-fda.html',
        '/journey/international-business-management-ba-hons-wbs.html',
        '/journey/international-finance-ba-hons-top-up-wbs.html',
        '/journey/management-msc.html',
        '/journey/international-sport-management-msc.html',
        '/journey/introduction-to-teaching-english-foreign-language.html',
        '/journey/intro-teaching-english-as-a-foreign-language-lang1001.html',
        '/journey/introduction-to-tefl-language-awareness-lang1012.html',
        '/journey/introduction-to-tefl-lang1013.html',
        '/journey/italian-module.html', '/journey/japanese-module.html',
        '/journey/journalism-ba-hons.html', '/journey/journalism-degrees.html',
        '/journey/21020.html',
        '/journey/language-awareness-and-analysis-tefl-module.html',
        '/journey/uwic-programme-law.html', '/journey/law-llb-hons.html',
        '/journey/law-with-criminology-llb-hons.html',
        '/journey/law-with-forensic-psychology-llb-hons.html',
        '/journey/leadership-and-management-fda-ba.html',
        '/journey/leading-culture-change-in-safeguarding-pgcert.html',
        '/journey/leading-early-years-practice-pgcert.html',
        '/journey/learning-support-fda.html',
        '/journey/learning-and-development-early-years-to-adolescence-fda.html',
        '/journey/teaching-and-learning-in-higher-education-pg-cert.html',
        '/journey/master-of-business-administration-mba.html',
        '/journey/mba-executive-leadership-and-management-part-time.html',
        '/journey/marketing-ba-hons-wbs.html',
        '/journey/marketing-advertising-and-public-relations-ba-hons-wbs.html',
        '/journey/mathematics-mphil-phd.html',
        '/journey/mathematics-bsc-hons.html',
        '/journey/media-and-cultural-studies-ba-hons.html',
        '/journey/media-culture-degrees.html',
        '/journey/media-and-cultural-studies-mphil-phd.html',
        '/journey/mental-health-fdsc.html',
        '/journey/mentoring-in-early-childhood-pgcert.html',
        '/journey/midwifery-bsc-hons.html',
        '/journey/midwifery-mphil-phd.html',
        '/journey/music-education-ba-mmus.html',
        '/journey/national-award-senco-nasc-special-educational-needs-coordination-pg-cert.html',
        '/journey/nursing-bsc-hons.html', '/journey/nursing-mphil-phd.html',
        '/journey/nursing-studies-bsc-hons.html',
        '/journey/nutrition-and-health-access-module.html',
        '/journey/nutritional-therapy-msc.html',
        '/journey/occupational-therapy-bsc-hons.html',
        '/journey/occupational-therapy-mphil-phd.html',
        '/journey/business-psychology-msc-occupational-psychology-msc.html',
        '/journey/open-short-language-courses.html',
        '/journey/outdoor-adventure-leadership-management-bsc-hons.html',
        '/journey/outdoor-education-ma.html',
        '/journey/supervision-pgcert.html',
        '/journey/postgraduate-certificate-in-education-pgce-primary.html',
        '/journey/pgce-primary-mathematics.html',
        '/journey/pgce-primary-physical-education.html',
        '/journey/pgce-school-direct-primary.html',
        '/journey/pgce-school-direct-secondary.html',
        '/journey/postgraduate-certificate-in-education-pgce-secondary.html',
        '/journey/paramedic-science-bsc-hons.html',
        '/journey/pharmacology-bsc-hons.html',
        '/journey/physical-education-bsc-hons.html',
        '/journey/physical-education-and-dance-ba-hons.html',
        '/journey/physical-education-and-outdoor-education-bsc-hons.html',
        '/journey/physical-education-degrees.html',
        '/journey/physical-geography-bsc-hons.html',
        '/journey/physical-geography-mphil-phd.html',
        '/journey/physician-associate-msc.html',
        '/journey/physiotherapy-bsc-hons.html',
        '/journey/plant-biology-mphil-phd.html',
        '/journey/politics-ba-hons.html',
        '/journey/positive-psychology-coaching-pgcert.html',
        '/journey/primary-initial-teacher-education-ba-hons.html',
        '/journey/primary-outdoor-education-ba-hons.html',
        '/journey/professional-practice-ba-hons-top-up-degree .html',
        '/journey/psychology-bsc-hons.html',
        '/journey/psychology-mphil-phd.html', '/journey/psychology-msc.html',
        '/journey/psychology-degrees.html', '/journey/public-health-msc.html',
        '/journey/religion-philosophy-and-values-in-education-ba-hons.html',
        '/journey/master-research-river-science.html',
        '/journey/uwic-programme-science-health-social-science.html',
        '/journey/screenwriting-ba-hons.html',
        '/journey/screenwriting-degrees.html',
        '/journey/social-work-social-policy-mphil-phd.html',
        '/journey/social-work-ba-hons.html', '/journey/social-work-ma.html',
        '/journey/master-research-socio-cultural-studies-sport-exercise.html',
        '/journey/sociology-ba-hons.html', '/journey/sociology-mphil-phd.html',
        '/journey/sociology-degrees.html',
        '/journey/special-educational-needs-disabilities-inclusion-ba-hons.html',
        '/journey/sport-exercise-psychology-bsc-hons.html',
        '/journey/uwic-programme-sport-exercise-science.html',
        '/journey/sport-exercise-science-bsc-hons.html',
        '/journey/sport-exercise-science-mphil-phd.html',
        '/journey/sport-business-management-bsc-hons.html',
        '/journey/sport-development-coaching-ba-hons.html',
        '/journey/sport-coaching-physical-education-hnd.html',
        '/journey/sports-coaching-msc.html',
        '/journey/sports-coaching-science-bsc-hons.html',
        '/journey/sports-coaching-science-degrees.html',
        '/journey/sports-coaching-science-with-disability-sport-bsc-hons.html',
        '/journey/sports-studies-bsc-hons.html',
        '/journey/sports-studies-degrees.html',
        '/journey/sports-therapy-bsc-hons.html',
        '/journey/subject-knowledge-enhancement-ske.html',
        '/journey/teaching-english-celta-course.html',
        '/journey/teaching-and-learning-fda.html',
        '/journey/theatre-performance-mres.html',
        '/journey/mtheatre-touring-theatre-integrated-masters.html',
        '/journey/understanding-domestic-and-sexual-violence-ma.html',
        '/journey/university-diploma-in-leadership-and-management.html',
        '/journey/university-diploma-in-academic-tutoring.html',
        '/journey/web-development-bsc-hons.html'
    ]

    for i in Lists:
        fullurl = base_url % i
        start_urls.append(fullurl)

    rules = (
        Rule(LinkExtractor(allow=(r'.*'),
                           restrict_xpaths=('//*[@id="aToZ"]/ul/li/a')),
             callback='parse_item',
             follow=True),
        # Rule(LinkExtractor(allow=r'/courses/.*'),callback='parse_item',follow=True),
        Rule(LinkExtractor(
            allow=(r'.*'),
            restrict_xpaths=(
                '//*[@class="box__inner box__inner--purple"]//a')),
             callback='parse_item',
             follow=False),
    )

    # def parse(self, response):
    #     if self.start_urls == 'https://www.worcester.ac.uk/courses/archaeology-heritage-studies-and-art-design-ba-hons.html':
    #         link_list = response.xpath('//*[@id="#content"]/div/div/section//a/@href')
    #         print("======================++++++++++++++++++++++++++++++++")
    #         print(link_list)
    #         print("======================++++++++++++++++++++++++++++++++")
    #         for i in link_list:
    #             link = "https://www.worcester.ac.uk" + str(i)
    #             yield scrapy.Request(link, callback=self.parse_item)
    #     else:
    #         print('错误页面')

    def parse_item(self, response):
        print('==================================', response.url)
        item = HooliItem()

        url = response.url
        print(1, url)

        university = 'University of Worcester'
        print(2, university)

        country = 'UK'

        city = 'NULL'

        website = 'https://www.worcester.ac.uk'

        department = 'NULL'

        programme_s = response.xpath(
            '//*[@id="content"]/div/h1//text()').extract()
        # programme = response.xpath('//section[@class="pageHead"]/h1/text()').extract()
        programme_s = ''.join(programme_s)
        if len(programme_s) > 0:
            programme = programme_s
        else:
            programme = 'NULL'
        print(3, programme)

        ucas_code = 'NULL'

        degree_level = ''

        degree_type = response.xpath(
            '//*[@id="content"]/div/h1//text()').extract()
        # degree_type = response.xpath('//section[@class="pageHead"]/h1/text()').extract()
        degree_type = ''.join(degree_type)
        degree_type = self.getDegree_type(degree_type)
        print(4, degree_type)

        start_date = 'NULL'
        # start_date = ''.join(start_date)
        # print(5,start_date)

        degree_description = 'NULL'

        overview = response.xpath(
            '//div[@class="left logo-bg"]//text()').extract()
        # overview = response.xpath('//div[@class="body-copy"]/ul/li/text()').extract()
        overview = ''.join(overview)
        print(5, overview)

        mode = 'NULL'
        # mode = ''.join(mode).replace('\r\n','')
        # mode = mode.replace('\n','')
        # mode = mode.replace('      ','')
        # print(7,mode)

        duration = 'NULL'
        # duration = ''.join(duration).replace('\r\n','')
        # duration = duration.replace('\n','')
        # duration = duration.replace('    ','')
        # print(8,duration)

        modules = response.xpath(
            '//*[@id="section-3"]/div/table/tbody//text()').extract()
        modules = ''.join(modules)
        # modules = modules.replace('\n','')
        print(6, modules)

        teaching = response.xpath(
            '//*[@id="section-4"]/div/div//text()').extract()
        teaching = ''.join(teaching)
        print(7.7, teaching)

        assessment = response.xpath(
            '//*[@id="section-4"]/div/div//text()').extract()
        assessment = ''.join(assessment)
        print(7, assessment)

        career = response.xpath(
            '//*[@id="section-5"]/div/div//text()').extract()
        career = ''.join(career)
        print(8, career)

        application_date = 'NULL'

        deadline = 'NULL'

        application_fee = 'NULL'

        tuition_fee_s = response.xpath(
            '//*[@id="section-6"]//text()').extract()[33:37]
        tuition_fee_s = ''.join(tuition_fee_s)
        tuition_fee_s = tuition_fee_s.replace('\r\n', '')
        tuition_fee_s = tuition_fee_s.replace('    ', '')
        tuition_fee_s = self.getTuition_fee(tuition_fee_s)
        try:
            if tuition_fee_s > 0:
                tuition_fee = tuition_fee_s
            else:
                tuition_fee = "NULL"
        except:
            tuition_fee = "报错!"

        print(9, tuition_fee)

        location = 'worcester'
        # location = ''.join(location)
        # print(13,location)

        GPA = 'NULL'

        ATAS = 'NULL'

        average_score = 'NULL'

        accredited_university = 'NULL'

        Alevel = 'NULL'

        IB = 'NULL'

        IELTS_s = response.xpath('//*[@id="section-2"]//p/text()').extract()
        IELTS_s = ''.join(IELTS_s)
        # IELTS = re.findall('(IELTS:|IELTS)? (.*){0,5} \d?.\d? .{0,70}',IELTS)
        try:
            if " IELTS" in IELTS_s:
                start = IELTS_s.find(" IELTS")
                IELTS = IELTS_s[start:]
                IELTS = IELTS[:100]
                item["IELTS"] = IELTS
            else:
                IELTS = "NULL"
        except:
            IELTS = "报错!"
        print(10, IELTS)

        # IELTS = ''.join(IELTS)
        # # IELTS = re.findall('(IELTS:|IELTS)? (.*){0,5} \d?.\d? .{0,70}',IELTS)
        # print(10, IELTS)
        IELTS_L = 'NULL'
        IELTS_S = 'NULL'
        IELTS_R = 'NULL'
        IELTS_W = 'NULL'

        TOEFL = 'NULL'
        TOEFL_L = 'NULL'
        TOEFL_S = 'NULL'
        TOEFL_R = 'NULL'
        TOEFL_W = 'NULL'

        GRE = 'NULL'

        GMAT = 'NULL'

        LSAT = 'NULL'

        MCAT = 'NULL'

        working_experience = 'NULL'

        interview = 'NULL'

        portfolio = 'NULL'

        application_documents = 'NULL'

        how_to_apply = response.xpath('//*[@id="section-7"]//text()').extract()
        how_to_apply = ''.join(how_to_apply)
        print(11, how_to_apply)

        entry_requirements = response.xpath(
            '//*[@id="section-2"]/div//text()').extract()
        entry_requirements = ''.join(entry_requirements)
        print(12, entry_requirements)

        chinese_requirements = 'NULL'

        school_test = 'NULL'

        SATI = 'NULL'

        SATII = 'NULL'

        SAT_code = 'NULL'

        ACT = 'NULL'

        ACT_code = 'NULL'

        other = 'NULL'

        create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(15, create_time)

        item["url"] = url
        item["university"] = university
        item["country"] = country
        item["city"] = city
        item["website"] = website
        item["department"] = department
        item["programme"] = programme
        item["ucas_code"] = ucas_code
        item["degree_level"] = degree_level
        item["degree_type"] = degree_type
        item["start_date"] = start_date
        item["degree_description"] = degree_description
        item["overview"] = overview
        item["mode"] = mode
        item["duration"] = duration
        item["modules"] = modules
        item["teaching"] = teaching
        item["assessment"] = assessment
        item["career"] = career
        item["application_date"] = application_date
        item["deadline"] = deadline
        item["application_fee"] = application_fee
        item["tuition_fee"] = tuition_fee
        item["location"] = location
        item["ATAS"] = ATAS
        item["GPA"] = GPA
        item["average_score"] = average_score
        item["accredited_university"] = accredited_university
        item["Alevel"] = Alevel
        item["IB"] = IB
        item["IELTS"] = IELTS
        item["IELTS_L"] = IELTS_L
        item["IELTS_S"] = IELTS_S
        item["IELTS_R"] = IELTS_R
        item["IELTS_W"] = IELTS_W
        item["TOEFL"] = TOEFL
        item["TOEFL_L"] = TOEFL_L
        item["TOEFL_S"] = TOEFL_S
        item["TOEFL_R"] = TOEFL_R
        item["TOEFL_W"] = TOEFL_W
        item["GRE"] = GRE
        item["GMAT"] = GMAT
        item["LSAT"] = LSAT
        item["MCAT"] = MCAT
        item["working_experience"] = working_experience
        item["interview"] = interview
        item["portfolio"] = portfolio
        item["application_documents"] = application_documents
        item["how_to_apply"] = how_to_apply
        item["entry_requirements"] = entry_requirements
        item["chinese_requirements"] = chinese_requirements
        item["school_test"] = school_test
        item["SATI"] = SATI
        item["SATII"] = SATII
        item["SAT_code"] = SAT_code
        item["ACT"] = ACT
        item["ACT_code"] = ACT_code
        item["other"] = other
        item["create_time"] = create_time

        yield item

    def getTuition_fee(self, tuition_fee):
        allfee = re.findall(r'\d+,\d+', tuition_fee)
        # print(allfee)
        for index in range(len(allfee)):
            fee = allfee[index].split(",")
            allfee[index] = ''.join(fee)
            # print(allfee[index])
        # print(allfee)
        maxfee = 0
        for fee in allfee:
            if int(fee) >= maxfee:
                maxfee = int(fee)
        return maxfee
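
    # Added usage note: getTuition_fee() pulls every "digits,digits" group out
    # of the scraped fee text, strips the commas and returns the largest value
    # as an int, e.g. getTuition_fee("£12,100 (UK) / £13,300 (international)")
    # returns 13300.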

    def getDegree_type(self, degree_type):
        try:
            if "BSc" in degree_type:
                degree_type = 'Bsc'
            elif "MSc" in degree_type:
                degree_type = "MSc"
            elif "BA" in degree_type:
                degree_type = 'BA'
            elif "MNSW" in degree_type:
                degree_type = 'MNSW'
            elif "PGCert" in degree_type:
                degree_type = 'PGCert'
            elif "MBA" in degree_type:
                degree_type = 'MBA'
            elif "MA" in degree_type:
                degree_type = 'MA'
            elif "MComp" in degree_type:
                degree_type = 'MComp'
            elif "PhD" in degree_type:
                degree_type = 'PhD'
            elif "FdA" in degree_type:
                degree_type = 'FdA'
            elif "PGCE" in degree_type:
                degree_type = 'PGCE'
            elif "IFP" in degree_type:
                degree_type = 'IFP'
            elif "LLB" in degree_type:
                degree_type = 'LLB'
            elif "MHealth Res" in degree_type:
                degree_type = 'MHealth Res'
            elif "MRes" in degree_type:
                degree_type = 'MRes'
            elif "MMed" in degree_type:
                degree_type = 'MMed'
            elif "MSci" in degree_type:
                degree_type = 'MSci'
            elif "MCh" in degree_type:
                degree_type = 'MCh'
            elif "LLM" in degree_type:
                degree_type = "LLM"
            elif "Y2QF" in degree_type:
                degree_type = "Y2QF"
            elif "Y2QG" in degree_type:
                degree_type = "Y2QG"
            elif "HND" in degree_type:
                degree_type = 'HND'
            elif len(degree_type) == 0:
                degree_type = 'NULL'
            else:
                degree_type = 'Ordinary degree'
        except:
            degree_type = "NULL"
        return degree_type
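
    # Added usage note: getDegree_type() maps a programme title to a coarse
    # award label by keyword, e.g. "Animal Biology BSc (Hons)" gives 'Bsc' and
    # "Creative Media MA" gives 'MA'; titles without a known keyword fall back
    # to 'Ordinary degree'.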
예제 #51
0
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h1[@itemprop='name']",
    'price':
    "//p[@class='price']/span[@class='woocommerce-Price-amount amount']",
    'category': "",
    'description': "",
    'images': "//div[@class='images']/a[@itemprop='image']/@href",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': "",
    'in_stock': "",
    'guarantee': "",
    'promotion': ""
}
name = 'ilahui.vn'
allowed_domains = ['ilahui.vn']
start_urls = ['http://ilahui.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = ['']
rules = [
    Rule(
        LinkExtractor(allow=['/san-pham/[a-zA-Z0-9-]+/$'],
                      deny=['/san-pham/($|page/\d+/$)']), 'parse_item'),
    Rule(LinkExtractor(allow=['/san-pham/($|page/\d+/$)']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
예제 #52
0
class X10jqkaSpider(CrawlSpider):
    name = '10jqka'
    source = "同花顺财经"
    allowed_domains = ["10jqka.com.cn"]
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    yesterday = yesterday.strftime('%Y%m%d')
    reg = yesterday
    start_urls = [
        'http://news.10jqka.com.cn/today_list/',
        'http://news.10jqka.com.cn/cjzx_list/',
        'http://news.10jqka.com.cn/cjkx_list/',
        'http://news.10jqka.com.cn/guojicj_list/',
        'http://news.10jqka.com.cn/jrsc_list/',
        'http://news.10jqka.com.cn/fssgsxw_list/',
        'http://news.10jqka.com.cn/region_list/',
        'http://news.10jqka.com.cn/gat_list/',
        'http://news.10jqka.com.cn/fortune_list/',
        'http://news.10jqka.com.cn/cjrw_list/',
        'http://news.10jqka.com.cn/dzxf_list/'
    ]
    rules = (
        Rule(LinkExtractor(allow=reg), callback="parse_news", follow=True),
        Rule(LinkExtractor(allow='_[1-2]+.shtml')),
        #Rule(LinkExtractor(allow='_[0-9]+.shtml'))
    )

    def printcn(self, uni):
        print(uni.encode('utf-8'))

    def parse_news(self, response):
        item = GenericItem()
        self.get_id(response, item)
        self.get_url(response, item)
        self.get_source(response, item)
        self.get_title(response, item)
        self.get_date(response, item)
        self.get_body(response, item)
        # remember to return the item after parsing
        return item

    def get_id(self, response, item):
        id = uuid.uuid4()
        if id:
            item['id'] = id

    def get_url(self, response, item):
        news_url = response.url
        if news_url:
            item['url'] = news_url

    def get_source(self, response, item):
        source = self.source
        if source:
            item['source'] = source

    def get_title(self, response, item):
        title = response.xpath('//div[@class="atc-head"]/h1/text()').extract()
        if title:
            item['title'] = ''.join(title).strip()

    def get_date(self, response, item):
        date = response.xpath('//*[@id="pubtime_baidu"]/text()').extract()
        if date:
            item['date'] = ''.join(date).replace(u'-', u'').replace(
                u':', u'').replace(u' ', u'').strip()

    def get_body(self, response, item):
        paras = response.xpath('//div[@class="atc-content"]/p')
        news_body = ''
        for p in paras:
            data = p.xpath('string(.)').extract()
            if data:
                body = ''
                for line in ''.join(data).splitlines():
                    #   print entry.encode('utf-8')
                    body += line.strip()
                news_body += body + '_|_'
        item['body'] = news_body.replace('_|__|_', '_|_')
예제 #53
0
class LagouSpider(CrawlSpider):
    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    start_urls = ['https://www.lagou.com/']
    """
    LinkExtractor URL抽取主要方法
    """
    rules = (
        # Rule(LinkExtractor(allow=("zhaopin/.*",)), follow = True),
        # Rule(LinkExtractor(allow=("gongsi/j\d+.html",)), follow=True),
        Rule(LinkExtractor(allow=r'jobs/\d+.html'),
             callback='parse_job',
             follow=True), )
    """
    # def parse_start_url(self, response):
    #     return []
    #
    # def process_results(self, response, results):
    #     return results

    # def start_requests(self):
    #     # use selenium to simulate login, then hand the cookies to scrapy's requests
    #     # 1. simulate the login via selenium
    #     # read cookies from file
    #     cookies = []
    #     driver_path = BASE_DIR + "/LagouSpider/Driver/chromedriver"
    #     browser = webdriver.Chrome(executable_path=driver_path)
    #     browser.get("https://passport.lagou.com/login/login.html")
    #     if os.path.exists(BASE_DIR + "/LagouSpider/cookies/lagou.cookie"):
    #         cookies = pickle.load(open(BASE_DIR + "/cookies/lagou.cookie", "rb"))
    #         for cookie in cookies:
    #             browser.add_cookie(cookie)
    #         browser.get("https://www.lagou.com/")
    #
    #     if not cookies:
    #         browser.get("https://passport.lagou.com/login/login.html")
    #         browser.find_element_by_css_selector(".form_body .input.input_white").send_keys("*****@*****.**")
    #         browser.find_element_by_css_selector('.form_body input[type="password"]').send_keys("123456")
    #         browser.find_element_by_css_selector('div[data-view="passwordLogin"] input.btn_lg').click()
    #         import time
    #         time.sleep(15)
    #         cookies = browser.get_cookies()
    #         # write the cookies to a file
    #         pickle.dump(cookies, open(BASE_DIR + "/LagouSpider/cookies/lagou.cookie", "wb"))
    #     cookie_dict = {}
    #     for cookie in cookies:
    #         cookie_dict[cookie["name"]] = cookie["value"]
    #
    #     for url in self.start_urls:
    #         yield scrapy.Request(url, dont_filter=True, cookies=cookie_dict)
    """
    def parse_job(self, response):
        """
        回掉函数
        :param response:
        :return:
        """
        # 创建ItemLoader的格式
        item_loader = LagouItemLoader(item=LagouJobItem(), response=response)
        item_loader.add_css('title', '.job-name::attr(title)')  # 职位标题
        item_loader.add_value('url', response.url)
        item_loader.add_value('url_object_id', get_md5(response.url))
        item_loader.add_css('salary', '.job_request .salary::text')

        item_loader.add_xpath('job_city',
                              '//*[@class="job_request"]/h3/span[2]/text()')
        item_loader.add_xpath('work_years',
                              '//*[@class="job_request"]/h3/span[3]/text()')
        item_loader.add_xpath('degree_need',
                              '//*[@class="job_request"]/h3/span[4]/text()')
        item_loader.add_xpath('job_type',
                              '//*[@class="job_request"]/h3/span[5]/text()')

        item_loader.add_css('tags', '.position-label li::text')
        item_loader.add_css('publish_time', '.publish_time::text')
        item_loader.add_css('job_advantage', '.job-advantage p::text')
        item_loader.add_css('job_desc', '.job_bt div')
        item_loader.add_css('job_addr', '.work_addr')
        item_loader.add_css('company_name', '.job_company dt a img::attr(alt)')
        item_loader.add_value('crawl_time', datetime.now())

        job_item = item_loader.load_item()

        return job_item
예제 #54
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name' : "//div[@id='colcenter']/div[@id='content']/div/h1/a/span",
    'price' : "//div[@class='contentContainer']/div[@class='contentText']/h2/span/span/span",
    'category' : "//div[@id='midle']/div[@class='navigation']/a",
    'description' : "//div[@class='boxproduct']/div[@class='boxproductcontent']/p | //div[@id='ViEtDeVdIvId']/p | //div[@id='ViEtDeVdIvId']/table",
    'images' : "//div[@class='contentText']/div[@id='piGal']/div/a/@href | //div[@class='contentText']/div[@id='piGal']/div/a/img/@src | //div/a[@class='fancyLink']/img/@src | //div[@id='piGal']/div/a/@href | //div[@id='piGal']/div/a/img/@src",
    'canonical' : "",
    'base_url' : "",
    'brand' : ""
}
name = 'sieuthimayvanphong.com.vn'
allowed_domains = ['sieuthimayvanphong.com.vn']
start_urls = ['http://sieuthimayvanphong.com.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+-p+\d+\.qmc']), 'parse_item'),
    Rule(LinkExtractor(allow=['/[a-zA-Z0-9-_]+-c+\d+\.qmc']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
예제 #55
0
class TheiconicComAu(BasePortiaSpider):
    name = "www.theiconic.com.au"
    allowed_domains = [u'www.theiconic.com.au']
    start_urls = [
        u'http://www.theiconic.com.au/nike-power-legend-women-s-high-rise-training-tights-488916.html'
    ]
    rules = [
        Rule(LinkExtractor(allow=(u'.html', u'/sale/'), deny=()),
             callback='parse_item',
             follow=True)
    ]
    items = [[
        Item(PortiaItem, None, u'.product-information', [
            Field(
                u'Cat_1',
                '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(1) > .ga-track-link-click *::text',
                []),
            Field(
                u'Cat_2',
                '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(2) > .ga-track-link-click *::text',
                []),
            Field(
                u'Cat_3',
                '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(3) > .ga-track-link-click *::text',
                []),
            Field(
                u'Image',
                '.medium-7 > .product-gallery > div:nth-child(3) > .small-12 > .main-image-frame > .img > .owl-wrapper-outer > .owl-wrapper > div:nth-child(1) > .image-wrapper > .image-frame > img::attr(src)',
                []),
            Field(
                u'Brand',
                '.medium-5 > .main > .item-details > .product-info > .product-title > .small-12 > .product-name > .brand-title > a *::text',
                []),
            Field(
                u'Name',
                '.medium-5 > .main > .item-details > .product-info > .product-title > .small-12 > .product-name > span:nth-child(2) *::text',
                []),
            Field(
                u'Price',
                '.medium-5 > .main > .item-details > .product-info > .product-price > .small-12 > .price *::text',
                [], True),
            Field(
                u'Colour',
                '.medium-5 > .main > .item-details > .add-to-bag > .row > .small-12 > .ti-dropdown > .dropdown > span > .color-name *::text',
                []),
            Field(
                u'Size',
                '.medium-5 > .main > .item-details > .add-to-bag > .ng-pristine > div:nth-child(1) > .small-7 > .ti-dropdown > .f-dropdown *::text',
                []),
            Field(
                u'Description',
                '.medium-5 > .main > .item-details > .accordion > dd *::text',
                [])
        ]),
        Item(PortiaItem, None, u'.product-information', [
            Field(
                u'Cat_1',
                '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(1) > .ga-track-link-click *::text',
                []),
            Field(
                u'Cat_2',
                '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(2) > .ga-track-link-click *::text',
                []),
            Field(
                u'Cat_3',
                '.medium-7 > .product-gallery > div:nth-child(1) > .small-12 > .breadcrumbs > li:nth-child(3) > .ga-track-link-click *::text',
                []),
            Field(
                u'Image',
                '.medium-7 > .product-gallery > div:nth-child(3) > .small-12 > .main-image-frame > .img > .owl-wrapper-outer > .owl-wrapper > div:nth-child(1) > .image-wrapper > .image-frame > img::attr(src)',
                []),
            Field(
                u'Name',
                '.medium-5 > .main > .item-details > .product-info > .product-title > .small-12 > .product-name > span:nth-child(2) *::text',
                []),
            Field(
                u'Colour',
                '.medium-5 > .main > .item-details > .product-info > .product-price > .small-12 *::text',
                []),
            Field(
                u'PriceRrp',
                '.medium-5 > .main > .item-details > .product-info > .product-price > .small-12 > .original *::text',
                []),
            Field(
                u'PriceSale',
                '.medium-5 > .main > .item-details > .product-info > .product-price > .small-12 > .final *::text',
                []),
            Field(
                u'Size',
                '.medium-5 > .main > .item-details > .add-to-bag > .ng-pristine > div:nth-child(1) > .small-7 > .ti-dropdown > .f-dropdown *::text',
                []),
            Field(u'Description',
                  '.medium-5 > .main > .item-details > .accordion *::text', [])
        ])
    ]]
예제 #56
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='product-title']/h1",
    'price':
    "//div[@class='Row Price']/div[@class='ProductPrice VariationProductPrice']",
    'category': "//div[@class='Breadcrumb']/ul/li/a",
    'description':
    "//div[@class='ProductTabs']/div[@class='Block Panel ProductDescription']",
    'images':
    "//div[@class='ProductThumb']/div[@class='ProductThumbImage']/a/@href",
    'canonical': "//link[@rel='canonical']/@href",
    'base_url': "",
    'brand': ""
}
name = 'aligro.vn'
allowed_domains = ['aligro.vn']
start_urls = ['http://aligro.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=[r'/[a-zA-Z0-9-]+-\d+\.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=[r'/[a-zA-Z0-9-]+-b\d+\.html($|\?pn=\d+)']),
         'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
예제 #57
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//div[@class='sanpham']/div/h4/strong",
    'price': "//div[@class='pro_detail']/div[@class='cat_pirce']/h4",
    'category': "//div[@class='link_menu']/a",
    'description': "//div[@class='sanpham']/div[@class='noidung_sp']",
    'images':
    "//div/div[@class='sanpham']/div[@class='img_detail_view']/a/img/@src",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'bibicare.vn'
allowed_domains = ['bibicare.vn']
start_urls = ['http://bibicare.vn']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=[r'/detail-product-view-+[a-zA-Z0-9-_]+\.html']),
         'parse_item'),
    Rule(LinkExtractor(allow=[r'/menu-product-+[a-zA-Z0-9-_]+\.html']),
         'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
예제 #58
0
# Auto generated by generator.py. Delete this line if you make modification.
from scrapy.spiders import Rule
from scrapy.linkextractors import LinkExtractor

XPATH = {
    'name': "//h3[@class='productname']",
    'price': "//div[@class='price']/h3[@id='our_price_display']",
    'category': "//ol[@class='breadcrumb']/li/a",
    'description': "//div[@class='description']",
    'images': "//div[@id='thumbs_list']/ul//a/@href",
    'canonical': "",
    'base_url': "",
    'brand': ""
}
name = 'coophomeshopping.vn'
allowed_domains = ['coophomeshopping.vn']
start_urls = ['http://coophomeshopping.vn/']
tracking_url = ''
sitemap_urls = ['']
sitemap_rules = [('', 'parse_item')]
sitemap_follow = []
rules = [
    Rule(LinkExtractor(allow=[r'/[a-zA-Z0-9-/]+\.html$']), 'parse_item'),
    Rule(LinkExtractor(allow=[r'/[a-zA-Z0-9-/]+($|/#/page-\d+)']), 'parse'),
    #Rule(LinkExtractor(), 'parse_item_and_links'),
]
예제 #59
0
class DogSpider(CrawlSpider):
    name = 'environment'
    allowed_domains = [
        'topontiki.gr',
        'popaganda.gr',
        'lifo.gr',
        'naftemporiki.gr',
        'kathimerini.gr',
        'cnn.gr',
        'protagon.gr',
        'iefimerida.gr',
    ]
    url = [
        'https://popaganda.gr/newstrack/environment/',
        'https://www.naftemporiki.gr/green',
        'https://www.cnn.gr/',
        'https://www.protagon.gr/epikairotita/',
        'https://www.iefimerida.gr',
    ]
    topontiki_urls = [
        'http://www.topontiki.gr/category/perivallon?page={}'.format(x)
        for x in range(0, TOPONTIKI_VARS['ENVIRONMENT_PAGES'])
    ]
    lifo_urls = [
        'https://www.lifo.gr/now/perivallon/page:{}'.format(x)
        for x in range(1, LIFO_VARS['ENVIRONMENT_PAGES'])
    ]
    kathimerini_urls = [
        'https://www.kathimerini.gr/box-ajax?id=b1_1885015423_1194114316&page={}'
        .format(x) for x in range(0, KATHIMERINI_VARS['ENVIRONMENT_PAGES'])
    ]
    urls = url + kathimerini_urls + lifo_urls + topontiki_urls
    start_urls = urls[:]

    rules = (
        Rule(LinkExtractor(allow=('topontiki.gr/article/'),
                           deny=('binteo', 'videos', 'gallery', 'eikones',
                                 'twit')),
             callback='parse_topontiki',
             follow=True,
             process_request='process_topontiki'),
        Rule(LinkExtractor(allow=(r'popaganda\.gr.+newstrack/'),
                           deny=('binteo', 'videos', 'gallery', 'eikones',
                                 'twit', 'comment')),
             callback='parse_popaganda',
             follow=True,
             process_request='process_popaganda'),
        Rule(LinkExtractor(allow=(r'www\.lifo\.gr.+perivallon'),
                           deny=('binteo', 'videos', 'gallery', 'eikones',
                                 'twit', 'comment')),
             callback='parse_lifo',
             follow=True,
             process_request='process_lifo'),
        Rule(LinkExtractor(allow=(r'www\.lifo\.gr.+environment_articles'),
                           deny=('binteo', 'videos', 'gallery', 'eikones',
                                 'twit', 'comment')),
             callback='parse_lifo',
             follow=True,
             process_request='process_lifo'),
        Rule(LinkExtractor(
            allow=(r'\.naftemporiki\.gr/story|\.naftemporiki\.gr/storypn'),
            deny=('binteo', 'videos', 'gallery', 'eikones', 'twit')),
             callback='parse_naftemporiki',
             follow=True,
             process_request='process_naftemporiki'),
        Rule(LinkExtractor(
            allow=(r"\.kathimerini\.gr.+epikairothta/perivallon/"),
            deny=('binteo', 'videos', 'gallery', 'eikones', 'twit')),
             callback='parse_kathimerini',
             follow=True,
             process_request='process_kathimerini'),
        Rule(LinkExtractor(allow=('https://www.iefimerida.gr/green'),
                           deny=('binteo', 'videos', 'gallery', 'eikones',
                                 'twit')),
             callback='parse_iefimerida',
             follow=True,
             process_request='process_iefimerida'),
        Rule(LinkExtractor(allow=('cnn.gr/news/perivallon')),
             callback='parse_cnn',
             follow=True,
             process_request='process_cnn'),
        Rule(LinkExtractor(allow=('protagon.gr/epikairotita/'),
                           deny=('binteo', 'videos', 'gallery', 'eikones',
                                 'twit')),
             callback='parse_protagon',
             follow=True,
             process_request='process_protagon'),
    )

    def parse_cnn(self, response):
        global cnn_counter
        #check if we are on an article url
        title = response.xpath('//h1[@class="story-title"]/text()').get()
        if title is not None and cnn_counter < 300:
            #get the article's text
            text = response.xpath(
                '//div[@class="story-content"]//p/text()|//div[@class="story-content"]//strong/text()|//div[@class="story-content"]//a/text()'
            ).getall()
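            # Whitespace trick reused in every parse_* method below: the inner
            # join merges the extracted fragments, the outer join re-joins the
            # result character by character so that only real spaces appear as
            # multi-space runs; those runs are marked with the token "space",
            # all remaining single spaces (artificial ones) are stripped, and
            # the markers are turned back into spaces.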
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = re.sub(
                r'\n|\t', "",
                response.xpath(
                    '//div[@class="story-date story-credits icon icon-time"]/text()'
                ).get())
            final_date = formatdate(date)

            url = response.url

            if len(clear_characters) > GENERAL_CATEGORIES['ALLOWED_LENGTH']:
                cnn_counter += 1
                yield {
                    "subtopic":
                    GENERAL_CATEGORIES['ENVIRONMENT'],
                    "website":
                    CNN_VARS['WEBSITE'],
                    "title":
                    title,
                    "article_date":
                    final_date,
                    "author":
                    re.sub(
                        r'\n|\t', "",
                        response.xpath(
                            '//div[@class="story-author"]/text()').get()),
                    "article_body":
                    re.sub(r'\n|\t', "", clear_characters),
                    "url":
                    url,
                }

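    # process_request hook for the cnn.gr rule: once the 300-article cap is
    # reached it returns None, which makes the CrawlSpider drop the request.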
    def process_cnn(self, request):
        global cnn_counter
        if cnn_counter < 300:
            return request

    def parse_protagon(self, response):
        global protagon_counter
        #check if we are on an article url
        title = response.xpath('//h1[@class="entry-title"]/text()').get()
        if title is not None and protagon_counter < 300:
            #check if we are in the correct category
            sub = response.xpath('//span[@class="s_roumpr"]/a/text()').get()
            if sub == PROTAGON_VARS['ENVIRONMENT']:
                #get the article's text
                text = response.xpath(
                    '//div[@class="left-single-column "]//p/text()|//div[@class="left-single-column "]//strong/text()|//div[@class="left-single-column "]//p/*/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("  ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = re.sub("\xa0", "", final_text)

                #flag to see later on if we have tweets etc
                flag = re.search(r"@", clear_characters)
                url = response.url

                author = re.findall(
                    r"(\w+).(\w+)",
                    response.xpath(
                        '//strong[@class="generalbold uppercase"]/a/text()').
                    get())
                list_to_tuple = author[0]
                author = ' '.join(list_to_tuple)

                date = response.xpath(
                    '//span[@class="generalight uppercase"]/text()').get()
                final_date = formatdate(date)

                #check if we are in an article and that it doesn't have images
                if len(clear_characters) > GENERAL_CATEGORIES[
                        'ALLOWED_LENGTH'] and flag is None:
                    protagon_counter += 1
                    yield {
                        "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'],
                        "website": PROTAGON_VARS['WEBSITE'],
                        "title": title,
                        "article_date": final_date,
                        "author": author,
                        "article_body": re.sub(r'\s\s\s', "",
                                               clear_characters),
                        "url": url,
                    }

    def process_protagon(self, request):
        global protagon_counter
        if protagon_counter < 300:
            return request

    def parse_iefimerida(self, response):
        global iefimerida_counter
        #check if we are on an article url
        title = response.xpath('//h1/span/text()').get()
        if title is not None and iefimerida_counter < 300:
            #get the article's text
            text = response.xpath(
                '//div[@class="field--name-body on-container"]//p/text()|//div[@class="field--name-body on-container"]/strong/text()|//div[@class="field--name-body on-container"]//p/*/text()|//div[@class="field--name-body on-container"]//p//li/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            #flag to see later on if we have tweets etc
            flag = re.search(r"@", clear_characters)
            url = response.url

            date = response.xpath('//span[@class="created"]/text()').get()
            final_date = formatdate(date)

            #check if we are in an article and that it doesn't have images
            if len(final_text
                   ) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                iefimerida_counter += 1
                yield {
                    "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'],
                    "website": IEFIMERIDA_VARS['AUTHOR'],
                    "title": title,
                    "article_date": final_date,
                    "author": IEFIMERIDA_VARS['AUTHOR'],
                    "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                    "url": url,
                }

    def process_iefimerida(self, request):
        global iefimerida_counter
        if iefimerida_counter < 300:
            return request

    def parse_kathimerini(self, response):
        global kathimerini_counter
        #check if we are on an article url
        title = response.xpath('//h2[@class="item-title"]/text()').get()
        if title is not None and kathimerini_counter < 300:
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("       ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

            #get the article's text
            text = response.xpath(
                '//div[@class="freetext"]//p/text()|//div[@class="freetext"]//strong/text()|//div[@class="freetext"]//h3/text()|//div[@class="freetext"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            date = response.xpath('//time/text()').get()
            final_date = formatdate(date)

            #flag to see later on if we have tweets etc
            flag = re.search(r"@", clear_characters)
            url = response.url

            author = response.xpath(
                '//span[@class="item-author"]/a/text()').get()
            if author == KATHIMERINI_VARS['CATEGORY_AUTHOR']:
                author = KATHIMERINI_VARS['AUTHOR']

            #check if we are in an article and that it doesn't have images
            if len(final_text
                   ) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                kathimerini_counter += 1
                yield {
                    "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'],
                    "website": KATHIMERINI_VARS['AUTHOR'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": author,
                    "article_body": re.sub(r'\s\s\s|\n', "", final_text),
                    "url": url,
                }

    def process_kathimerini(self, request):
        global kathimerini_counter
        if kathimerini_counter < 300:
            return request

    def parse_naftemporiki(self, response):
        global naftemporiki_counter
        #check if we are on an article url
        title = response.xpath('//h2[@id="sTitle"]/text()').get()
        if title is not None and naftemporiki_counter < 300:
            #check if we are in the correct category
            subtopic = response.xpath(
                '//span[@itemprop="articleSection"]/text()').get()
            if subtopic == NAFTEMPORIKI_VARS['CATEGORY_ENVIRONMENT']:
                #fix the title's format
                list_to_string = " ".join(" ".join(title))
                markspaces = re.sub("       ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                put_spaces_back = re.sub("space", " ", uneeded_spaces)
                final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

                #get the article's text
                text = response.xpath(
                    '//div[@class="entityMain article"]//p/text()|//div[@class="entityMain article"]/p/strong/text()|//div[@class="entityMain article"]//h3/text()|//div[@class="entityMain article"]//p/*/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("  ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = re.sub("\xa0", "", final_text)

                date = response.xpath('//div[@class="Date"]/text()').get()
                final_date = formatdate(date)

                #flag to see later on if we have tweets etc
                flag = re.search(r"@", clear_characters)
                url = response.url

                #check if we are in an article and that it doesn't have images
                if len(final_text) > GENERAL_CATEGORIES[
                        'ALLOWED_LENGTH'] and flag is None:
                    naftemporiki_counter += 1
                    yield {
                        "subtopic":
                        response.xpath(
                            '//div[@class="Breadcrumb"]/a[2]/text()').get(),
                        "website":
                        NAFTEMPORIKI_VARS['AUTHOR'],
                        "title":
                        final_title,
                        "article_date":
                        final_date,
                        "author":
                        NAFTEMPORIKI_VARS['AUTHOR'],
                        "article_body":
                        re.sub(r'\s\s\s|\n', "", final_text),
                        "url":
                        url,
                    }

    def process_naftemporiki(self, request):
        global naftemporiki_counter
        if naftemporiki_counter < 300:
            return request

    def parse_lifo(self, response):
        global lifo_counter
        #check if we are on an article url
        title = response.xpath(
            '//h1[@itemprop="headline"]/text()|//meta[@itemprop="headline"]/text()|//h1/*/text()'
        ).get()
        if title is not None and lifo_counter < 300:
            #fix the title's format
            list_to_string = " ".join(" ".join(title))
            markspaces = re.sub("       ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            put_spaces_back = re.sub("space", " ", uneeded_spaces)
            final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

            date = response.xpath('//time/text()').get()
            final_date = formatdate(date)

            #get the article's text
            text = response.xpath(
                '//div[@class="clearfix wide bodycontent"]//p/text()|//div[@class="clearfix wide bodycontent"]/p/strong/text()|//div[@class="clearfix wide bodycontent"]//h3/text()|//div[@class="clearfix wide bodycontent"]//p/*/text()'
            ).getall()
            list_to_string = " ".join(" ".join(text))
            markspaces = re.sub("  ", "space", list_to_string)
            uneeded_spaces = re.sub(" ", "", markspaces)
            final_text = re.sub("space", " ", uneeded_spaces)
            clear_characters = re.sub("\xa0", "", final_text)

            author = response.xpath(
                '//div[@class="author"]/a/text()|//div[@itemprop="author"]/*/text()'
            ).get()
            if author is None:
                author = LIFO_VARS['AUTHOR']

            #flag to see later on if we have tweets etc
            flag = re.search(r"@", clear_characters)
            url = response.url

            #check if we are in an article and that it doesn't have images
            if len(clear_characters
                   ) > GENERAL_CATEGORIES['ALLOWED_LENGTH'] and flag is None:
                lifo_counter += 1
                yield {
                    "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'],
                    "website": LIFO_VARS['AUTHOR'],
                    "title": final_title,
                    "article_date": final_date,
                    "author": author,
                    "article_body": re.sub(r'\s\s\s|\n', "", clear_characters),
                    "url": url,
                }

    def process_lifo(self, request):
        global lifo_counter
        if lifo_counter < 300:
            return request

    def parse_popaganda(self, response):
        global popaganda_counter
        #check if we are on an article url
        title = response.xpath('//h1/text()').get()
        if title is not None and popaganda_counter < 300:
            #check if we are in the correct category
            category = response.xpath(
                '//div[@class="category"]/a/text()').get()
            if category == POPAGANDA_VARS['CATEGORY_ENVIRONMENT']:
                #fix the title's format
                list_to_string = " ".join(" ".join(title))
                markspaces = re.sub("       ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                put_spaces_back = re.sub("space", " ", uneeded_spaces)
                final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

                #get the article's text
                text = response.xpath(
                    '//div[@class="post-content newstrack-post-content"]//p/text()|//div[@class="post-content newstrack-post-content"]/p/strong/text()|//div[@class="post-content newstrack-post-content"]//h3/text()|//div[@class="post-content newstrack-post-content"]//p/*/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("  ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = re.sub("\xa0", "", final_text)

                author = response.xpath(
                    '//div[@class="author"]/a/text()|//div[@itemprop="author"]/*/text()'
                ).get()
                if author is None:
                    author = POPAGANDA_VARS['WEBSITE']

                date = response.xpath(
                    '//div[@class="date"]/text()|//div[@class="fullscreen-date"]/text()'
                ).get()
                final_date = formatdate(date)

                #flag to see later on if we have tweets etc
                flag = re.search(r"@", clear_characters)
                url = response.url

                #check if we are in an article and that it doesn't have images
                if len(clear_characters) > GENERAL_CATEGORIES[
                        'ALLOWED_LENGTH'] and flag is None:
                    popaganda_counter += 1
                    yield {
                        "subtopic": POPAGANDA_VARS['ENVIRONMENT'],
                        "website": POPAGANDA_VARS['WEBSITE'],
                        "title": final_title,
                        "article_date": final_date,
                        "author": POPAGANDA_VARS['WEBSITE'],
                        "article_body": re.sub(r'\s\s\s|\n', "",
                                               clear_characters),
                        "url": url,
                    }

    def process_popaganda(self, request):
        global popaganda_counter
        if popaganda_counter < 300:
            return request

    def parse_topontiki(self, response):
        global topontiki_counter
        #check if we are on an article url
        title = response.xpath('//h1/text()').get()
        if title is not None and topontiki_counter < 300:
            #check if we are in the correct category
            sub = response.xpath('//h2/a/text()').get()
            if sub == TOPONTIKI_VARS['CATEGORY_ENVIRONMENT']:
                #fix the title's format
                list_to_string = " ".join(" ".join(title))
                markspaces = re.sub("       ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                put_spaces_back = re.sub("space", " ", uneeded_spaces)
                final_title = re.sub(r'\n|\s\s\s', "", put_spaces_back)

                #get the article's text
                text = response.xpath(
                    '//div[@class="field-item even"]//p/text()|//div[@class="field-item even"]//p/*/text()|//div[@class="field-item even"]//p//span/text()'
                ).getall()
                list_to_string = " ".join(" ".join(text))
                markspaces = re.sub("  ", "space", list_to_string)
                uneeded_spaces = re.sub(" ", "", markspaces)
                final_text = re.sub("space", " ", uneeded_spaces)
                clear_characters = final_text.replace("\xa0", "")

                date = response.xpath('//span[@class="date"]/text()').get()
                final_date = formatdate(date)

                #flag to see later on if we have tweets etc
                flag = re.search(r"@", clear_characters)
                url = response.url

                #check if we are in an article and that it doesn't have images
                if len(clear_characters) > GENERAL_CATEGORIES[
                        'ALLOWED_LENGTH'] and flag is None:
                    topontiki_counter += 1
                    yield {
                        "subtopic": GENERAL_CATEGORIES['ENVIRONMENT'],
                        "website": TOPONTIKI_VARS['WEBSITE'],
                        "title": final_title,
                        "article_date": final_date,
                        "author": TOPONTIKI_VARS['WEBSITE'],
                        "article_body": re.sub(r'\s\s\s|\n', "",
                                               clear_characters),
                        "url": url,
                    }

    def process_topontiki(self, request):
        global topontiki_counter
        if topontiki_counter < 300:
            return request
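
The inline join/sub sequence repeated in each parse_* callback above is easier to follow in isolation. Below is a minimal standalone illustration of the same steps; the normalise helper is for illustration only and is not part of the spider.

import re

def normalise(fragments):
    # same steps as the inline code in the parse_* callbacks above
    list_to_string = " ".join(" ".join(fragments))       # char-join: real spaces become runs of spaces
    markspaces = re.sub("  ", "space", list_to_string)   # mark those runs
    no_spaces = re.sub(" ", "", markspaces)               # strip the artificial single spaces
    return re.sub("space", " ", no_spaces)                # turn the markers back into spaces

print(normalise(["Hello", "green  world"]))  # -> Hello green  world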
예제 #60
0
class MenHoodiesSpider(SplashCrawlSpider):
    name: str = "men_hoodies"

    allowed_domains: List[str] = ["www.dresslily.com"]
    start_urls: List[str] = [
        "https://www.dresslily.com/hoodies-c-181-page-1.html",
    ]
    item_reviews_page_url: str = "https://www.dresslily.com/m-review-a-view_review-goods_id-{product_id}-page-{page_num}.html"

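    # The first rule only follows listing pagination ("...page-N.html");
    # the second hands product pages ("productNNNNNN.html") to parse_hoodie.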
    rules: Tuple[Rule, ...] = (
        Rule(
            LinkExtractor(allow=r"page-[0-9]+"),
        ),
        Rule(
            LinkExtractor(allow=r"product[0-9]+"),
            callback="parse_hoodie",
        ),
    )

    splash_args: Dict[str, Any] = {
        "wait": 3.0,
        "images": 0,
    }

    # Hoodie selectors
    NAME_XPATH: str = "//h1/span[@class='goodtitle']/text()"
    DISCOUNT_CSS: str = "span.off.js-dl-cutoff > span ::text"
    INFO_KEYS_XPATH: str = "//div[@class='xxkkk']/div//strong/text()"
    INFO_VALUES_XPATH: str = "//div[@class='xxkkk20']/text()"
    TOTAL_REVIEWS_XPATH: str = "//*[@id='js_reviewCountText']/text()"
    ORIGINAL_PRICE_WITHOUT_DISCOUNT_CSS: str = (
        "span.curPrice.my-shop-price.js-dl-curPrice ::attr(data-orgp)"
    )
    ORIGINAL_PRICE_WITH_DISCOUNT_CSS: str = "span.js-dl-marketPrice.marketPrice.my-shop-price.dl-has-rrp-tag > span.dl-price ::text"

    # Hoodie reviews selectors
    REVIEWS_LIST_XPATH: str = "//div[@class='reviewinfo']"
    RATING_SELECTED_STARS_XPATH: str = (
        ".//p[@class='starscon_b dib']/i[@class='icon-star-black']"
    )
    TIMESTAMP_XPATH: str = ".//span[@class='reviewtime']/text()"
    TEXT_XPATH: str = ".//p[@class='reviewcon']/text()"
    SIZE_XPATH: str = ".//p[@class='color-size']/span[1]/text()"
    COLOR_XPATH: str = ".//p[@class='color-size']/span[2]/text()"

    TIMESTAMP_FORMAT: str = "%b,%d %Y %H:%M:%S"
    REVIEWS_BY_PAGE_COUNT: int = 6

    def parse_hoodie(
        self, response: HtmlResponse
    ) -> Union[Iterable[Item], Iterable[ScrapyRequest]]:
        product_url: str = response.meta["real_url"]
        product_id: int = int(
            product_url.split("product")[-1].replace(".html", "")
        )
        name: str = response.xpath(self.NAME_XPATH).get("")
        original_price: float = float(
            response.css(self.ORIGINAL_PRICE_WITHOUT_DISCOUNT_CSS).get(0.0)
        )
        discounted_price: float = 0.0
        discount: int = int(response.css(self.DISCOUNT_CSS).get(0))

        if discount:
            discounted_price = original_price
            original_price = float(
                response.css(self.ORIGINAL_PRICE_WITH_DISCOUNT_CSS).getall()[
                    -1
                ]
            )

        product_info_keys: List[str] = response.xpath(
            self.INFO_KEYS_XPATH
        ).getall()

        product_info_values: List[str] = response.xpath(
            self.INFO_VALUES_XPATH
        ).getall()[1::2]

        product_info: str = "".join(
            [
                f"{k.strip()}{v.strip()};"
                for (k, v) in zip(product_info_keys, product_info_values)
            ]
        )

        total_reviews: int = int(
            response.xpath(self.TOTAL_REVIEWS_XPATH).get(0)
        )

        yield MenHoodieItem(
            product_id=product_id,
            product_url=product_url,
            name=name,
            discount=discount,
            discounted_price=discounted_price,
            original_price=original_price,
            total_reviews=total_reviews,
            product_info=product_info,
        )

        if total_reviews > 0:
            yield from self.parse_reviews_pages(
                product_id=product_id,
                total_reviews=total_reviews,
            )

    def parse_reviews_pages(
        self, product_id: int, total_reviews: int
    ) -> Iterable[ScrapyRequest]:
        reviews_left: int = total_reviews
        page_num: int = 1

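        # each review page holds at most REVIEWS_BY_PAGE_COUNT (6) reviews,
        # so ceil(total_reviews / 6) pages are requested in total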
        while reviews_left > 0:
            # no need to load js for these review pages
            yield ScrapyRequest(
                url=self.item_reviews_page_url.format(
                    product_id=product_id,
                    page_num=page_num,
                ),
                callback=self.parse_reviews,
                cb_kwargs={"product_id": product_id},
            )
            reviews_left -= self.REVIEWS_BY_PAGE_COUNT
            page_num += 1

    def parse_reviews(
        self, response: HtmlResponse, product_id: int
    ) -> Iterable[Item]:
        reviews: List[HtmlResponse] = response.xpath(self.REVIEWS_LIST_XPATH)

        for review in reviews:
            rating: int = len(
                review.xpath(self.RATING_SELECTED_STARS_XPATH).getall()
            )
            time: str = review.xpath(self.TIMESTAMP_XPATH).get("")
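            # e.g. "Jan,05 2021 13:45:00" (TIMESTAMP_FORMAT) is converted to
            # Unix seconds via a local-time struct_time; 0.0 when missing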
            timestamp: float = (
                mktime(
                    datetime.strptime(time, self.TIMESTAMP_FORMAT).timetuple()
                )
                if time
                else 0.0
            )
            text: str = review.xpath(self.TEXT_XPATH).get("")
            size: str = review.xpath(self.SIZE_XPATH).get(": ").split(": ")[-1]
            color: str = (
                review.xpath(self.COLOR_XPATH).get(": ").split(": ")[-1]
            )

            yield ReviewItem(
                product_id=product_id,
                rating=rating,
                timestamp=timestamp,
                text=text,
                size=size,
                color=color,
            )