Example #1
    def parse(self, response):

        # follow pagination links
        if response.xpath(
                "//a[@class='results']/span[@class='next']").extract_first():
            next_page = response.xpath("//a[@class='results']/@href").extract()[-1]
            full_url = urlparse.urljoin(response.url, next_page)
            yield scrapy.Request(url=full_url,
                                 headers=self.headers,
                                 callback=self.parse,
                                 dont_filter=True)

        l = ItemLoader(item=BabiesrusItem(), response=response)
        l.add_xpath('name', "//a[contains(@class, 'prodtitle')]/text()")
        l.add_xpath('price', "//span[contains(@class, 'ourPrice2')]/text()")
        l.add_xpath('ratings', "//span[contains(@class, 'pr-rounded')]/text()")
        l.add_xpath('producturl', "//a[contains(@class, 'prodtitle')]/@href")
        l.add_value('pageurl', response.url)

        names = l.get_output_value('name')
        prices = l.get_output_value('price')
        ratingss = l.get_output_value('ratings')
        producturls = l.get_output_value('producturl')
        pageurl = response.url

        for i in range(len(names)):
            yield {
                'name': names[i],
                'price': prices[i],
                'ratings': ratingss[i],
                'producturl': producturls[i],
                'pageurl': pageurl
            }
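Since the four get_output_value calls return parallel lists, the index-based loop above can also be written with zip, which stops at the shortest list instead of raising IndexError when one field yields fewer matches; a minimal alternative sketch:

        # Alternative to the range(len(...)) loop: zip pairs the parallel
        # lists and silently truncates to the shortest one.
        for name, price, rating, producturl in zip(names, prices, ratingss,
                                                   producturls):
            yield {
                'name': name,
                'price': price,
                'ratings': rating,
                'producturl': producturl,
                'pageurl': pageurl
            }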
Example #2
    def details(self, response):
        item = EducationItem()
        # print "!!!!!!!!!!!!!!!!!!!!!!!!",response.url
        l = ItemLoader(item=EducationItem(), response=response)
        l.add_xpath('title', '//h1[@id="firstHeading"]')
        title = l.get_output_value('title')
        # print "++++++++++++++++++++++++++++++",title

        l.add_xpath('details', '//div[@class="mw-parser-output"]/p')
        details = l.get_output_value('details')
        # print "==================",details

        return l.load_item()
Example #3
    def parse_post(self, response):
        new = ItemLoader(item=PostItem(),
                         response=response,
                         parent=response.meta['item'])
        new.add_xpath(
            'source',
            "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href, 'post_id')]/strong/text()"
        )
        new.add_xpath('date', '//div/div/abbr/text()')
        new.add_xpath(
            'text',
            '//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()'
        )
        new.add_xpath(
            'reactions',
            "//a[contains(@href, 'reaction/profile')]/div/div/text()")
        if new.get_output_value('comments'):
            yield scrapy.Request(response.urljoin(
                response.meta['item'].get_output_value('url')),
                                 callback=self.parse_comments,
                                 dont_filter=True,
                                 meta={'item': new})

        reactions = response.xpath(
            "//div[contains(@id, 'sentence')]/a[contains(@href, 'reaction/profile')]/@href"
        )
        reactions = response.urljoin(reactions[0].extract())
        yield scrapy.Request(reactions,
                             callback=self.parse_reactions,
                             dont_filter=True,
                             meta={'item': new})
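Note: in the standard ItemLoader implementation, a loader constructed with parent= shares the parent's item and collected values while extracting against the new response, which is how the partially filled loader stashed in response.meta['item'] keeps accumulating fields across callbacks.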
Example #4
File: tiki.py  Project: nvvu99/FlashSale
    def parse_category(self, response):
        meta = response.meta
        categories_dom_xpath = '/html/body/header/div[@class="main-nav"]/div/nav/ul/li'
        category_url_xpath = './a/@href'
        category_name_xpath = './a/span/text()'

        categories_dom = response.xpath(categories_dom_xpath)
        for category_dom in categories_dom:
            category_loader = ItemLoader(item=CategoryItem(),
                                         selector=category_dom)
            category_loader.add_xpath('category_id', category_url_xpath)
            category_loader.add_xpath('category_name', category_name_xpath)
            yield category_loader.load_item()

            category_id = category_loader.get_output_value('category_id')
            page = 1

            # request to onsale product page
            yield scrapy.Request(url=self.sale_urls[ONSALE].format(
                category_id, page),
                                 callback=self.parse_product,
                                 meta={
                                     'time': ONSALE,
                                     'category_id': category_id
                                 })

            # request to coming sale product page
            yield scrapy.Request(url=self.sale_urls[COMING].format(
                category_id, page),
                                 callback=self.parse_product,
                                 meta={
                                     'time': COMING,
                                     'category_id': category_id
                                 })
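This spider references ONSALE, COMING and self.sale_urls, which are defined elsewhere in the project; a plausible sketch of those definitions, assuming simple URL templates (the endpoints below are illustrative placeholders, not Tiki's real API):

    # Hypothetical constants and spider attribute; the real values live
    # elsewhere in the FlashSale project.
    ONSALE = 0
    COMING = 1

    sale_urls = {
        ONSALE: 'https://example.com/api/onsale?category={}&page={}',
        COMING: 'https://example.com/api/coming-soon?category={}&page={}',
    }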
Example #5
 def parse(self, response, **kwargs):
     loader = ItemLoader(item=YelpItem(), response=response)
     loader.default_output_processor = TakeFirst()
     for script in response.css('script').getall():
         if '{"gaConfig' in script:
             detail_json = json.loads(
                 re.search(r'({"gaConfig.*?)-->', script).group(1))
     loader.add_value('direct_url', detail_json['staticUrl'])
     loader.add_value(
         'business_id', detail_json['bizDetailsPageProps']
         ['bizContactInfoProps']['businessId'])
     loader.add_value(
         'categories', detail_json['gaConfig']['dimensions']['www']
         ['second_level_categories'][1])
     loader.add_value(
         'site', detail_json['bizDetailsPageProps']['bizContactInfoProps']
         ['businessWebsite']['linkText'])
     loader.add_value('title',
                      detail_json['bizDetailsPageProps']['businessName'])
     loader.add_value(
         'review_count', detail_json['bizDetailsPageProps']
         ['ratingDetailsProps']['numReviews'])
     yield scrapy.Request(
         'https://www.yelp.com/biz_attribute?biz_id={}'.format("".join(
             loader.get_output_value('business_id'))),
         method='GET',
         callback=self.linkedData,
         meta={'item': loader.load_item()})
Example #6
    def parse_table(self, response):
        data = ItemLoader(item=ParsetauntondeedsItem(), response=response)

        data.add_xpath(
            'date',
            '//table[@class="grid"]//tr[@onmouseout="this.className=this.originalClass;"]/'
            'td[2]/text()')
        data.add_xpath(
            'type',
            '//table[@class="grid"]//tr[@onmouseout="this.className=this.originalClass;"]/'
            'td[3]/text()')
        data.add_xpath(
            'book',
            '//table[@class="grid"]//tr[@onmouseout="this.className=this.originalClass;"]/'
            'td[4]/text()')
        data.add_xpath(
            'page_num',
            '//table[@class="grid"]//tr[@onmouseout="this.className=this.originalClass;"]/'
            'td[5]/text()')
        data.add_xpath(
            'doc_num',
            '//table[@class="grid"]//tr[@onmouseout="this.className=this.originalClass;"]/'
            'td[6]/text()')
        data.add_xpath(
            'city',
            '//table[@class="grid"]//tr[@onmouseout="this.className=this.originalClass;"]/'
            'td[7]/text()')
        data.add_xpath(
            'description',
            '//table[@class="grid"]//tr[@onmouseout="this.className=this.originalClass;"]/'
            'td[8]/span/text()')
        data.add_value(
            'cost',
            parse_functions.get_cost(data.get_output_value('description')),
            MapCompose(float))
        data.add_value(
            'street_address',
            parse_functions.get_street_address(
                data.get_output_value('description')))
        data.add_value(
            'state',
            parse_functions.get_state(data.get_output_value('description')))
        data.add_value(
            'zip',
            parse_functions.get_zip(data.get_output_value('description')))

        return data.load_item()
Example #7
 def test_get_output_value_list(self):
     """Getting output value must not remove value from item"""
     input_item = self.item_class(name=["foo", "bar"])
     il = ItemLoader(item=input_item)
     self.assertEqual(il.get_output_value("name"), ["foo", "bar"])
     loaded_item = il.load_item()
     self.assertIsInstance(loaded_item, self.item_class)
     self.assertEqual(loaded_item, dict({"name": ["foo", "bar"]}))
Example #8
 def test_get_output_value_singlevalue(self):
     """Getting output value must not remove value from item"""
     input_item = self.item_class(name='foo')
     il = ItemLoader(item=input_item)
     self.assertEqual(il.get_output_value('name'), ['foo'])
     loaded_item = il.load_item()
     self.assertIsInstance(loaded_item, self.item_class)
     self.assertEqual(ItemAdapter(loaded_item).asdict(), dict({'name': ['foo']}))
Example #9
 def test_get_output_value_list(self):
     """Getting output value must not remove value from item"""
     input_item = self.item_class(name=['foo', 'bar'])
     il = ItemLoader(item=input_item)
     self.assertEqual(il.get_output_value('name'), ['foo', 'bar'])
     loaded_item = il.load_item()
     self.assertIsInstance(loaded_item, self.item_class)
     self.assertEqual(loaded_item, dict({'name': ['foo', 'bar']}))
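Together these tests pin down the contract that get_output_value is read-only: it runs the field's output processor over the collected values without assigning anything to the item. A self-contained check of the same behaviour, assuming the standalone itemloaders package and its default dict item:

    from itemloaders import ItemLoader

    il = ItemLoader(item={'name': ['foo', 'bar']})
    # The output processor runs, but nothing is written back to the item...
    assert il.get_output_value('name') == ['foo', 'bar']
    # ...so load_item() still sees the collected values afterwards.
    assert il.load_item() == {'name': ['foo', 'bar']}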
Example #10
    def _article_requests(self, response):
        index = self.order

        request_list = []  # holds the Request objects to be returned
        loader = ItemLoader(item=XfjyItemItem(), response=response)
        loader.add_xpath("tags_list", "string(//div[@class='weizhi']//td)")

        tags_list = loader.get_output_value("tags_list")

        # parse out each article's title, date and url, then crawl the articles
        tr_tags = response.xpath(
            "//div[@class='main_nei_you_baio_content']//tr[@height='20']")
        for tr_tag in tr_tags:
            # extract the article url and join it into an absolute link
            url = tr_tag.xpath(".//a//@href").extract_first()
            if url:
                url = response.urljoin(url)
            else:
                self.log("没有解析到文章url,板块链接:%s" % response.url,
                         level=logging.ERROR)

            # extract the title and normalise its whitespace
            title = tr_tag.xpath(".//a//@title").extract_first()
            if title:
                title = "".join(title.split())
            else:
                self.log("没有解析到title,板块链接:%s" % response.url,
                         level=logging.WARNING)

            # extract the date and convert it to a timestamp
            date = tr_tag.xpath(
                ".//span[@class='timestyle44007']//text()").extract_first()
            if date:
                date = int(
                    time.mktime(
                        time.strptime("".join(date.split()), "%Y年%m月%d日")))
            else:
                self.log("没有解析到date,板块链接:%s" % response.url,
                         level=logging.WARNING)
                date = None

            exist = self.filter.filter(url)
            if not exist:
                request = Request(
                    url,
                    meta={
                        "title": title,
                        "date": date,
                        "tags_list": tags_list,
                        "type": "article",
                        "index": index
                    },
                    callback=self.parse_article,
                )
                request_list.append(request)
                index += 1
        return request_list
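The dates on this site embed the CJK characters for year (年), month (月) and day (日); strptime matches those literal characters in the format string, so the conversion above only needs the whitespace stripped out first. A small worked check:

    import time

    # "2021年3月5日" is "March 5, 2021"; the result is a local-time POSIX timestamp.
    ts = int(time.mktime(time.strptime("2021年3月5日", "%Y年%m月%d日")))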
Example #11
 def getAmenities(self, response):
     loader = ItemLoader(item=response.meta['item'], response=response)
     response_json = json.loads(response.text)[0]
     if response_json['data']['business']['organizedProperties']:
         amenities = [amenity['displayText'] for amenity in response_json['data']['business']['organizedProperties'][0]['properties']]
         loader.add_value('amenities', amenities)
     yield scrapy.Request(
         'https://www.yelp.com/biz/{}/props'.format(
             "".join(loader.get_output_value('business_id'))),
         method='GET',
         headers={
             'Content-Type': 'application/json',
             'X-Requested-With': 'XMLHttpRequest',
             'Accept': 'application/json',
             'Referer': 'https://www.yelp.com/biz/fog-harbor-fish-house-san-francisco-2',
         },
         callback=self.getAbout,
         meta={'item': loader.load_item()})
Example #12
    def parse(self, response):
        #l = ItemLoader(item=Product(), response=response)
        # Data from xpath1 is extracted, and passed through the input processor of the name field. The result of the
        # input processor is collected and kept in the Item Loader (but not yet assigned to the item).
        #l.add_css("title", "h1.post-title::text")

        #for i in response.css('title::text').extract():
        #l.add_css("title", i.encode(utf-8))
        #l.add_xpath('name', '//div[@class="product_title"]')
        #l.add_css("title", 'title::text')

        #l.add_css("img", "a > .wp-post-image::attr(src)")
        #l.add_css("tags", ".bs-cat a::text, .bs-tags a::text")

        #l.add_css("text", ".pf-content")
        #l.add_css("date_of_publish", '.rp-date')
        #l.add_css("source", '.ai-info h6 a::text')
        #l.add_value("url", response.url)
        #yield l.load_item()

        l = ItemLoader(item=Product(), response=response)
        l.add_css("author", '.bauthor::text')
        l.add_css("title", ".btitle::text")
        l.add_css("date_of_publish", '.bdate::text')
        yield l.load_item()

        with open('blog_data.txt', 'a') as f:
            title_list = l.get_output_value('title')
            author_list = l.get_output_value('author')
            data_list = l.get_output_value('date_of_publish')

            for author in author_list:
                f.write('author: {0}\n'.format(author.encode('utf-8')))

            for title in title_list:

                f.write('title: {0}\n'.format(title.encode('utf-8')))

            for data in data_list:

                f.write('data: {0}\n'.format(data.encode('utf-8')))
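The commented-out notes above describe ItemLoader's two-phase flow: add_css/add_xpath/add_value run the field's input processor immediately and only collect the result, while get_output_value and load_item run the output processor, and only load_item assigns to the item. A minimal sketch of that flow, with hypothetical processors on a plain dict item:

    from itemloaders import ItemLoader
    from itemloaders.processors import MapCompose, TakeFirst

    class DemoLoader(ItemLoader):
        # Hypothetical processors for a 'title' field.
        title_in = MapCompose(str.strip)  # runs at add_value time; results are collected
        title_out = TakeFirst()           # runs at get_output_value / load_item time

    loader = DemoLoader(item={})
    loader.add_value('title', ['  foo ', ' bar '])
    print(loader.get_output_value('title'))  # 'foo' -- the item is still untouched
    print(loader.load_item())                # {'title': 'foo'}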
Example #13
 def parse(self, response):
     for recipe in response.xpath("//div[@class='recipe-page']"):
         loader = ItemLoader(item=TitleItem(), selector=recipe)
         loader.add_xpath('title', "//div[@class='recipe-title']/h1")
         yield {
             'title':
             loader.get_output_value('title'),
             'ingredients':
             response.xpath(
                 '//div[contains(@class, "ingredients-card")]//li//text()').
             getall(),
             'link':
             response.url
         }
Example #14
    def parse_item(self, response):
        """
        @url http://splash:8050/render.html?&url=http://www.nettruyenco.com/truyen-tranh/boyfriend-17550&wait=1
        @scrapes name source image_src total_chap description chapters web_source full
        """
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        manga.add_xpath("unicode_name", '//h1[@class="title-detail"]/text()')
        manga.add_value("name",
                        unidecode(manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath("image_src",
                        '//*[@class="col-xs-4 col-image"]/img/@src')
        manga.add_xpath("description",
                        '//*[@class="detail-content"]/p//text()', Join("\n"))
        chapter_xpath = '//*[@id="nt_listchapter"]/nav/ul/li[not(contains (@class, "row heading"))]/div[1]/a'
        chapter_source = manga.get_xpath(chapter_xpath + "/@href")
        chapter_name = manga.get_xpath(chapter_xpath + "/text()")
        chapters = zip(chapter_name, chapter_source)

        if "Hoàn thành" in manga.get_xpath(
                '//*[@class="status row"]/p[2]/text()'):
            manga.add_value("full", True)
            manga.add_value(
                "total_chap",
                manga.get_xpath(
                    chapter_xpath + "/text()",
                    MapCompose(lambda x: re.findall(r"\d+", x)),
                    MapCompose(int),
                )[0],
            )
        else:
            manga.add_value("full", False)
            manga.add_value(
                "total_chap",
                manga.get_xpath(
                    "//title/text()",
                    MapCompose(
                        lambda x: re.findall(r" Chapter \d+| Chap \d+", x)),
                    MapCompose(lambda x: re.findall(r"\d+", x)),
                    MapCompose(float),
                    MapCompose(int),
                    TakeFirst(),
                ),
            )

        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "nettruyen")
        print(manga.load_item())

        return manga.load_item()
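The @url and @scrapes lines in the docstring are Scrapy spider contracts: `scrapy check` fetches the @url page, runs this callback on it, and fails unless the returned item populates every field listed after @scrapes. Examples #17, #19 and #20 below use the same convention.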
Example #15
 def parse_feed(self, response):
     if response.status == 200:
         loader = ItemLoader(item=Feed(), response=response)
         loader.add_value('id', response.meta['id'])
         loader.add_value('user_id', response.meta['user_id'])
         loader.add_value('feed_id', response.meta['feed_id'])
         loader.add_value('feed_url', response.url)
         loader.add_xpath(
             'post_time',
             "//div[starts-with(@data-ft,'{\"tn\":')]/div/abbr/text()")
         loader.add_xpath('content', '//title/text()')
         content = loader.get_output_value('content')
          if content in ('Photo', 'Profile Pictures', 'Cover Photos'):
             # type = 'photo'
             loader.add_value('type', 'photo')
         else:
             if content == 'Comments':
                 # type = 'comments'
                 loader.add_value('type', 'comments')
             else:
                 # type = 'regular'
                 loader.add_value('type', 'regular')
             headline = response.xpath(
                 'string((//div[@id="root"]//table'
                 '[@role="presentation"])[1]//h3)').extract_first()
             if headline:
                 loader.add_value('headline', headline)
                 loader.add_xpath(
                     'links',
                     '(//div[@id="root"]//table[@role="presentation"]'
                     ')[1]//strong/following-sibling::a/@href')
                  location_selector = response.xpath(
                      "//div[starts-with(@data-ft,'{\"tn\":')]"
                      "/div/abbr/following-sibling::a")
                 if location_selector:
                     loader.add_value(
                         'location', {
                             'location':
                             location_selector.xpath(
                                 './text()').extract_first(),
                             'url':
                             location_selector.xpath(
                                 './@href').extract_first()
                         })
         loader.add_value('timestamp', datetime.datetime.now())
         return loader.load_item()
     else:
         pass
Example #16
 def parse(self, response):
     data = re.findall(r"global.document.metadata=(.+?);\n", response.body.decode("utf-8"), re.S)
     data_dict = json.loads(data[0])
     if data_dict:
         if data_dict['contentType'] == 'books':
             loader = ItemLoader(item=Book(), selector=response)
             loader.default_output_processor = Join()
             loader.add_value('title', data_dict['title'])
             loader.add_value('author', [i['name'] for i in data_dict['authors']])
             loader.add_value('publisher', data_dict['publisher'])
             loader.add_value('chapters', '0')
             loader.add_value('abstract', data_dict['abstract'])
             loader.add_value('doi', data_dict['doi'])
             loader.add_value('ISBN', [i['value'] for i in data_dict['isbn']][1])
             loader.add_value('url', self.start_urls[0])
             loader.add_value('ID', loader.get_output_value('author').split(' ')[0])
             loader.add_value('ENTRYTYPE', 'Book')
         elif data_dict['contentType'] == 'conferences' or data_dict['contentType'] == 'chapter':
             loader = ItemLoader(item=ConferencePaper(), selector=response)
             loader.default_output_processor = Join()
             loader.add_value('title', data_dict['title'])
             loader.add_value('author', [i['name'] for i in data_dict['authors']])
             loader.add_value('booktitle', data_dict['publicationTitle'])
             loader.add_value('publisher', data_dict['publisher'])
             loader.add_value('year', data_dict['publicationYear'])
             loader.add_value('abstract', data_dict['abstract'])
             loader.add_value('doi', data_dict['doi'])
             loader.add_value('timestamp', data_dict['publicationDate'])
             loader.add_value('url', self.start_urls[0])
             loader.add_value('ENTRYTYPE', 'paper')
             loader.add_value('ID', load_id(loader))
         else:
             loader = ItemLoader(item=Article(), selector=response)
             loader.default_output_processor = Join()
             loader.add_value('author', [i['name'] for i in data_dict['authors']])
             loader.add_value('title', data_dict['title'])
             loader.add_value('journal', data_dict['publicationTitle'])
             loader.add_value('publisher', data_dict['publisher'])
             loader.add_value('abstract', data_dict['abstract'])
             loader.add_value('year', data_dict['publicationYear'])
             loader.add_value('timestamp', data_dict['publicationDate'])
             loader.add_value('doi', data_dict['doi'])
             loader.add_value('url', self.start_urls[0])
             loader.add_value('ENTRYTYPE', 'article')
             loader.add_value('ID', load_id(loader))
         yield loader.load_item()
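This example (and #24 below, from the same project) calls a load_id helper that isn't part of the snippet; judging from the Book branch, which instead derives the ID from the joined author string, a plausible sketch, purely an assumption about the project's code:

    def load_id(loader):
        # Hypothetical reconstruction: build a citation key from the first
        # token of the joined author string plus the year, e.g. 'Alice2020'.
        first_author = loader.get_output_value('author').split(' ')[0]
        year = loader.get_output_value('year')
        return '{}{}'.format(first_author, year)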
Example #17
    def parse_item(self, response):
        """
        @url https://doctruyen3q.info/truyen-tranh/dao-hai-tac/77
        @scrapes name source image_src total_chap description chapters web_source full unicode_name
        """
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        category = manga.get_xpath("//*[@class='category row']/p[2]//text()")
        categories = re.sub(r'\s+', '', "".join(category))
        if any(i in unidecode(categories).lower() for i in ["18+", "smut", "yaoi", "ntr", "yuri", 'adult', 'dammy']):
            return
        manga.add_xpath("unicode_name", '//h1[@class="title-manga"]/text()')
        manga.add_value("name", unidecode(
            manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath(
            "image_src", '//*[@class="image-comic"]/@src')
        manga.add_xpath(
            "description", '//*[@class="detail-summary"]/text()'
        )
        chapter_xpath = '//*[@id="list-chapter-dt"]/nav/ul/li/div[1]/a'
        chapter_source = manga.get_xpath(chapter_xpath + "/@href")
        chapter_name = manga.get_xpath(chapter_xpath + "/text()")
        chapters = zip(chapter_name, chapter_source)

        if "Đã hoàn thành" in manga.get_xpath('//*[@class="status row"]//text()'):
            manga.add_value("full", True)
        else:
            manga.add_value("full", False)

        manga.add_value(
            "total_chap",
            manga.get_xpath(
                '//*[@id="list-chapter-dt"]/nav/ul/li[1]/div[1]/a/text()',
                MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)),
                MapCompose(float),
                MapCompose(int),
                TakeFirst(),
            ),
        )

        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "doctruyen3q")
        print(manga.load_item())

        return manga.load_item()
Example #18
    def parse_homepage(self, response):
        loader = ItemLoader(item=FacebookProfile())
        parsed = urlparse(response.url)
        base_url = '{}://{}/{}'.format(
            parsed.scheme, parsed.netloc,
            [p for p in parsed.path.split('/') if p][0])
        if 'id=' in parsed.query and '/profile.php' in parsed.path:
            loader.add_value(
                'profile_url',
                base_url + '?id=' + parse_qs(parsed.query)['id'][0])
            base_url = base_url + '?id=' + \
                parse_qs(parsed.query)['id'][0] + '&'
            # loader.add_value('user_id', parse_qs(parsed.query)['id'])
        else:
            loader.add_value('profile_url', base_url)
            base_url = base_url + '?'
            loader.add_value('user_name', parsed.path[1:])
            # parse about page
        # get id in the database
        # print loader.get_output_value('profile_url')
        id = get_id(loader.get_output_value('profile_url'))
        loader.add_value('id', id)

        yield Request(url=base_url + 'v=info',
                      callback=self.parse_about_page,
                      priority=1000,
                      meta={
                          'loader':
                          loader,
                          'base_url':
                          base_url,
                          'search_friends_depth':
                          response.meta.get(
                              'search_friends_depth',
                              self.settings.get('SEARCH_FRIENDS_DEPTH', 1)),
                          'id':
                          id,
                          'friend_with':
                          response.meta.get('friend_with', None),
                          'enable_selenium':
                          True,
                          'title':
                          response.xpath('//title/text()').extract_first()
                      })
Example #19
    def parse_item(self, response):
        """
        @url http://splash:8050/render.html?&url=https://vlogtruyen.net/bokutachi-wa-hanshoku-wo-yameta.html&wait=1
        @scrapes name unicode_name source image_src total_chap description chapters web_source full
        """

        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        manga.add_xpath("unicode_name",
                        '//h1[@class="title-commic-detail"]/text()')
        manga.add_value("name",
                        unidecode(manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath("image_src", '//meta[@property="og:image"]/@content')
        manga.add_xpath("description",
                        '//*[@class="desc-commic-detail"]/text()', Join("\n"))
        chapter_xpath = '//*[@class="ul-list-chaper-detail-commic"]/li/a'
        chapter_source = manga.get_xpath(chapter_xpath + "/@href")
        chapter_name = manga.get_xpath(chapter_xpath + "/h3/text()")
        chapters = zip(chapter_name, chapter_source)

        if "Đã hoàn thành" in manga.get_xpath(
                '//*[@class="manga-status"]/p/text()'):
            manga.add_value("full", True)
        else:
            manga.add_value("full", False)

        manga.add_value(
            "total_chap",
            manga.get_xpath(
                '//*[@class="ul-list-chaper-detail-commic"]/li[1]/a/h3/text()',
                MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)),
                TakeFirst(),
            ),
        )
        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "vlogtruyen")

        return manga.load_item()
Example #20
    def parse_item(self, response):
        """
        @url https://mangasee123.com/manga/Kingdom
        @scrapes name source image_src total_chap description chapters web_source full
        """
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        manga.add_xpath(
            "unicode_name",
            "//div[@class='container MainContainer']//li[1]/h1/text()")
        manga.add_value("name",
                        unidecode(manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath("image_src", '//meta[@property="og:image"]/@content')
        manga.add_xpath("description", "//div[@class='top-5 Content']/text()",
                        Join("\n"))

        if "Complete (Publish)" in manga.get_xpath(
                '//*[@class="PublishStatus"]/text()'):
            manga.add_value("full", True)
        else:
            manga.add_value("full", False)

        rss = manga.get_xpath("//a[normalize-space()='RSS Feed']/@href")
        rss_url = BASE_URL + rss[0]

        feed = feedparser.parse(rss_url, agent="Mozilla/5.0")

        manga.add_value(
            "total_chap",
            re.findall(r"\d+", feed['entries'][0]['title'])[0],
        )

        chapters = [(i['title'], i['link']) for i in feed['entries']]
        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "mangaseeonline")

        return manga.load_item()
Example #21
    def parse(self, response, **kwargs):
        loader = ItemLoader(item=YelpItem(), response=response)
        for script in response.css('script').getall():
            if '{"gaConfig' in script:
                detail_json = json.loads(re.search(r'({"gaConfig.*?)-->', script).group(1))
        loader.add_value('direct_url', detail_json['staticUrl'])
        loader.add_value('business_id', detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessId'])
        loader.add_value('categories', detail_json['gaConfig']['dimensions']['www']['second_level_categories'][1])
        if detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']:
            loader.add_value('site', detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']['linkText'])
        loader.add_value('title', detail_json['bizDetailsPageProps']['businessName'])
        loader.add_value('review_count', detail_json['bizDetailsPageProps']['ratingDetailsProps']['numReviews'])
        #TODO: find way to not use hardcoded documentIds
        post_data = [{"operationName":"getLocalBusinessJsonLinkedData","variables":{"BizEncId": "".join(loader.get_output_value('business_id'))},"extensions":{"documentId":"1cf362b8e8f9b3dae26d9f55e7204acd8355c916348a038f913845670139f60a"}}]

        yield scrapy.Request('https://www.yelp.com/gql/batch', method='POST', body=json.dumps(post_data),
            headers={'Content-Type': 'application/json'}, callback=self.linkedData, meta={'item': loader.load_item()})
Example #22
    def parse_filing_documents(self, response):
        request_manager = response.meta['request_manager']
        docket_loader = response.meta['docket_loader']

        table_rows = response.xpath(
            "//div[@id='apexir_DATA_PANEL']//table[@class='apexir_WORKSHEET_DATA']//"
            "tr[@class='even' or @class='odd']")

        if table_rows:
            for row in table_rows:
                filing_loader = ItemLoader(item=Filing(),
                                           response=response,
                                           selector=row)
                filing_loader.add_xpath('description',
                                        'td[@headers="DESCRIPTION"]/text()')
                print("d. {} ".format(
                    filing_loader.get_output_value('description')))
                filing_loader.add_xpath('filled_on',
                                        'td[@headers="FILING_DATE"]/text()')
                filing_loader.add_xpath(
                    'types', 'td[@headers="DOCUMENT_TYPE"]//u/text()')
                filing_loader.add_xpath('filing_parties',
                                        "td[@headers='FILED_BY']/text()")
                document_link = row.xpath(
                    "td[@headers='DOCUMENT_TYPE']/a/@href").get()

                if document_link != 'http://www.cpuc.ca.gov/orderadocument/':
                    request_parameters = {
                        'document_link': document_link,
                        'docket_loader': docket_loader,
                        'filing_loader': filing_loader
                    }
                    request_manager.filing_requests.append(request_parameters)

            next_btn = response.xpath(
                '//*[@id="apexir_DATA_PANEL"]/table/tr[1]/td/span/a/@href'
            ).get()

            if next_btn:
                next_btn = next_btn.split("'")[1]
                if response.xpath(
                        "//*[@name='p_instance']/@value").get() is None:
                    p_instance = response.meta['p_instance']
                else:
                    p_instance = response.xpath(
                        "//*[@name='p_instance']/@value").get()
                formdata = {
                    'p_request':
                    'APXWGT',
                    'p_instance':
                    p_instance,
                    'p_flow_id':
                    '401',
                    'p_flow_step_id':
                    '57',
                    'p_widget_num_return':
                    '100',
                    'p_widget_name':
                    'worksheet',
                    'p_widget_mod':
                    'ACTION',
                    'p_widget_action':
                    'PAGE',
                    'p_widget_action_mod':
                    next_btn,
                    'x01':
                    response.xpath(
                        '//input[@id="apexir_WORKSHEET_ID"]/@value').get(),
                    'x02':
                    response.xpath(
                        '//input[@id="apexir_REPORT_ID"]/@value').get(),
                }
                yield scrapy.FormRequest(
                    'https://apps.cpuc.ca.gov/apex/wwv_flow.show',
                    formdata=formdata,
                    method="POST",
                    callback=self.parse_filing_documents,
                    meta={
                        'docket_loader': docket_loader,
                        'cookiejar': response.meta['cookiejar'],
                        'request_manager': request_manager,
                        'p_instance': p_instance
                    })
            else:
                if request_manager.filing_requests:
                    request_parameters = request_manager.filing_requests.pop()
                    print("length of request_manager {} ".format(
                        len(request_manager.filing_requests)))
                    # yield response.follow(request_parameters['document_link'],
                    #                       meta={
                    #                             'dont_merge_cookies': True,
                    #                             'docket_loader': request_parameters['docket_loader'],
                    #                             'filing_loader': request_parameters['filing_loader'],
                    #                             'request_manager': request_manager},
                    #                       callback=self.parse_document_page
                    #                       )
        else:
            return docket_loader.load_item()
Example #23
 def getBusinessHours(self, response):
     loader = ItemLoader(item=response.meta['item'], response=response)
     response_json = json.loads(response.text)[0]
     schedule = dict()
     if response_json['data']['business']['operationHours']:
         for day in response_json['data']['business']['operationHours']['regularHoursMergedWithSpecialHoursForCurrentWeek']:
             schedule[day['dayOfWeekShort']] = "".join(day['regularHours'])
     loader.add_value('schedule', schedule)
     post_data = [{"operationName":"GetBizPageProperties","variables":{"BizEncId":"".join(loader.get_output_value('business_id'))},"extensions":{"documentId":"f06d155f02e55e7aadb01d6469e34d4bad301f14b6e0eba92a31e635694ebc21"}}]
     yield scrapy.Request('https://www.yelp.com/gql/batch', method='POST', body=json.dumps(post_data),
                          headers={'Content-Type': 'application/json'}, callback=self.getAmenities,
                          meta={'item': loader.load_item()})
Example #24
    def parse(self, response):
        r"""
               Parse the page

           * The type of the publication is found out from meta tag og:type.
           * The fields are extracted from the web-page from javascript variable global.document.metadata, selector is the response object itself and loaded into Article or Book or ConferencePaper Item depend on the contentType
           * The javascript variable is extracted using regex r"global.document.metadata=(.+?);" and saved as a json object
           * title,
           * author,
           * Journal,
           * publisher,
           * year,
           * abstract,
           * doi,
           * timestamp,
           * url,
           * booktitle,
           * ENTRYTYPE,
           * ID, (The ID populated from the function load_id))

           :return: Itemloader (item= Article or Conference paper depending on the type)

        """
        data = re.findall(r"global.document.metadata=(.+?);\n", response.body.decode("utf-8"), re.S)
        data_dict = json.loads(data[0])
        if data_dict:
            if data_dict['contentType'] == 'books':
                loader = ItemLoader(item=Book(), selector=response)
                loader.default_output_processor = Join()
                loader.add_value('title', data_dict['title'])
                loader.add_value('author', [i['name'] for i in data_dict['authors']])
                loader.add_value('publisher', data_dict['publisher'])
                loader.add_value('chapters', '0')
                loader.add_value('abstract', data_dict['abstract'])
                loader.add_value('doi', data_dict['doi'])
                loader.add_value('ISBN', [i['value'] for i in data_dict['isbn']][1])
                loader.add_value('url', self.start_urls[0])
                loader.add_value('ID', loader.get_output_value('author').split(' ')[0])
                loader.add_value('ENTRYTYPE', 'Book')
            elif data_dict['contentType'] == 'conferences' or data_dict['contentType'] == 'chapter':
                loader = ItemLoader(item=ConferencePaper(), selector=response)
                loader.default_output_processor = Join()
                loader.add_value('title', data_dict['title'])
                loader.add_value('author', [i['name'] for i in data_dict['authors']])
                loader.add_value('booktitle', data_dict['publicationTitle'])
                loader.add_value('publisher', data_dict['publisher'])
                loader.add_value('year', data_dict['publicationYear'])
                loader.add_value('abstract', data_dict['abstract'])
                loader.add_value('doi', data_dict['doi'])
                loader.add_value('timestamp', data_dict['publicationDate'])
                loader.add_value('url', self.start_urls[0])
                loader.add_value('ENTRYTYPE', 'paper')
                loader.add_value('ID', load_id(loader))
            else:
                loader = ItemLoader(item=Article(), selector=response)
                loader.default_output_processor = Join()
                loader.add_value('author', [i['name'] for i in data_dict['authors']])
                loader.add_value('title', data_dict['title'])
                loader.add_value('journal', data_dict['publicationTitle'])
                loader.add_value('publisher', data_dict['publisher'])
                loader.add_value('abstract', data_dict['abstract'])
                loader.add_value('year', data_dict['publicationYear'])
                loader.add_value('timestamp', data_dict['journalDisplayDateOfPublication'])
                loader.add_value('doi', data_dict['doi'])
                loader.add_value('url', self.start_urls[0])
                loader.add_value('ENTRYTYPE', 'article')
                loader.add_value('ID', load_id(loader))
            yield loader.load_item()
Example #25
    def parse(self, response):
        """
            Parse the page.

        * The type of the publication is found out from meta tag og:type
        * The fields are extracted from the web-page from meta tag , selector is the response object itself and loaded into Article Item
        * title,(//div[@class='page-title']/h1/text())
        * author, (//span[@class='authors-affiliations__name']/text())
        * Journal,
        * publisher,(//span[@id='publisher-name']/text())
        * chapters, (//span[@class='c-tabs__deemphasize']/text())
        * year, (//meta[@name='citation_publication_date']/@content)
        * abstract, (//meta[@name='description']/@content)
        * doi, (//input[@name='doi']/@value)
        * timestamp, (//meta[@name='citation_publication_date']/@content)
        * url, (//meta[@property='og:url']/@content)
        * booktitle, (//meta[@name='citation_inbook_title']/@content)
        * ENTRYTYPE, (//meta[@property='og:type']/@content)
        * ID, (The ID populated from the function load_id))

        :return: Itemloader (item= Article or Conference paper depending on the type)

        """
        if response.xpath("//meta[@property='og:type']/@content").extract():
            type_of_article = response.xpath("//meta[@property='og:type']/@content").extract()[0]
        elif response.xpath("//span[@class='test-content-type']/text()"):
            type_of_article = 'Book'
        else:
            return None

        if type_of_article == 'Book':
            book = response.xpath("//body")
            loader = ItemLoader(item=Book(), selector=book)
            loader.default_output_processor = Join()
            loader.add_xpath('title', "//div[@class='page-title']/h1/text()")
            loader.add_xpath('author', "//span[@class='authors-affiliations__name']/text()")
            loader.add_xpath('publisher', "//span[@id='publisher-name']/text()")
            loader.add_xpath('chapters', "//span[@class='c-tabs__deemphasize']/text()")
            loader.add_xpath('abstract', "//meta[@name='description']/@content")
            loader.add_xpath('doi', "//input[@name='doi']/@value")
            loader.add_xpath('ISBN', "//span[@id='electronic-isbn']/text()")
            loader.add_value('url', self.start_urls[0])
            loader.add_value('ID', loader.get_output_value('author').split(' ')[0])
            loader.add_value('ENTRYTYPE', 'Book')
        elif type_of_article == 'Paper':
            details = response
            loader = ItemLoader(item=ConferencePaper(), selector=details)
            loader.default_output_processor = Join()
            loader.add_xpath('title', "//meta[@name='citation_title']/@content")
            loader.add_xpath('author', "//meta[@name='citation_author']/@content")
            loader.add_xpath('booktitle', "//meta[@name='citation_inbook_title']/@content")
            loader.add_xpath('publisher', "//meta[@name='citation_publisher']/@content")
            loader.add_xpath('year', "//meta[@name='citation_publication_date']/@content")
            loader.add_xpath('abstract', "//meta[@name='description']/@content")
            loader.add_xpath('doi', "//meta[@name='citation_doi']/@content")
            loader.add_xpath('timestamp', "//meta[@name='citation_publication_date']/@content")
            loader.add_xpath('url', "//meta[@property='og:url']/@content")
            loader.add_value('ENTRYTYPE', type_of_article)
            loader.add_value('ID', load_id(loader))
        else:
            details = response
            loader = ItemLoader(item=Article(), selector=details)
            loader.default_output_processor = Join()
            loader.add_xpath('author', "//meta[@name='citation_author']/@content")
            loader.add_xpath('title', "//meta[@name='citation_title']/@content")
            loader.add_xpath('journal', "//meta[@name='citation_journal_title']/@content")
            loader.add_xpath('publisher', "//meta[@name='dc.publisher']/@content")
            loader.add_xpath('abstract', "//div[@class='c-article-section__content']/p/text()")
            loader.add_xpath('year', "//meta[@name='citation_publication_date']/@content")
            loader.add_xpath('timestamp', "//meta[@name='dc.date']/@content")
            loader.add_xpath('timestamp', "//meta[@name='citation_publication_date']/@content")
            loader.add_xpath('doi', "//meta[@name='citation_doi']/@content")
            loader.add_xpath('url', "//meta[@name='prism.url']/@content")
            loader.add_value('ENTRYTYPE', type_of_article)
            loader.add_value('ID', load_id(loader))
        yield loader.load_item()
Example #26
    def linkedData(self, response):
        loader = ItemLoader(item=response.meta['item'], response=response)

        address = {}
        response_json = json.loads(response.text)[0]
        address['street'] = "{}, {}".format(response_json['data']['business']['location']['address']['addressLine1'],
                                            response_json['data']['business']['location']['address']['addressLine2'],
                                            response_json['data']['business']['location']['address']['addressLine3'])
        address['city'] = response_json['data']['business']['location']['address']['city']
        address['stateprov'] = response_json['data']['business']['location']['address']['regionCode']
        address['country'] = response_json['data']['business']['location']['country']['code']
        address['postalCode'] = response_json['data']['business']['location']['address']['postalCode']
        loader.add_value('main_img_url', response_json['data']['business']['primaryPhoto']['photoUrl']['url'])
        loader.add_value('phone', response_json['data']['business']['phoneNumber']['formatted'])
        loader.add_value('average_rating', response_json['data']['business']['rating'])
        loader.add_value('address', address)
        post_data = [{"operationName":"GetBusinessHours","variables":{"BizEncId":"".join(loader.get_output_value('business_id'))},"extensions":{"documentId":"35437a3b2abdff32ea1f4d018dbfe66f58fcfb4c804b7ae1c7e341389e9de873"}}]
        yield scrapy.Request('https://www.yelp.com/gql/batch', method='POST', body=json.dumps(post_data),
            headers={'Content-Type': 'application/json'}, callback=self.getBusinessHours, meta={'item': loader.load_item()})
Example #27
    def parse_mr_sqlite(self, response):
        loader = ItemLoader(item=OnsiteItemSqlite(), response=response)
        loader.add_css('property_id',
                       'ul.amenities-detail li:nth-child(2)::text')
        loader.add_css('last_update',
                       'ul.amenities-detail li:nth-child(4)::text')
        loader.add_css('suburb',
                       'ul.amenities-detail li:nth-child(7) strong::text',
                       MapCompose(str.strip))
        loader.add_css('agency', 'img.sidebarAgentLogo::attr(alt)',
                       TakeFirst())
        loader.add_css('agent', 'div.pgl-agent-info h3 a::text', TakeFirst())
        loader.add_css('title',
                       'div.pgl-detail div.row div.col-sm-12 h1::text')
        loader.add_css('price',
                       'div.pgl-detail div.row div.col-sm-12 h2::text',
                       TakeFirst())
        loader.add_css('income', '#collapseOne ul li:nth-child(2)::text',
                       TakeFirst())
        loader.add_css('unit_price', '#collapseOne ul li:nth-child(3)::text',
                       TakeFirst())
        loader.add_css('multiplier',
                       '#collapseOne ul li:nth-child(4)::text',
                       re=r'\s(\d+[.]\d+)')
        loader.add_css('letting',
                       '#collapseTwo  li:nth-child(1)::text',
                       re=r'\s(\d+).*')
        loader.add_css('owner_occupy',
                       '#collapseTwo  li:nth-child(2)::text',
                       re=r'\s(\d+).*')
        loader.add_css('look_ups',
                       '#collapseTwo  li:nth-child(3)::text',
                       re=r'\s(\d+).*')
        loader.add_css('outside_agents',
                       '#collapseTwo  li:nth-child(4)::text',
                       re=r'\s(\d+).*')
        loader.add_css('total_unit',
                       '#collapseTwo  li:nth-child(5)::text',
                       re=r'\s(\d+).*')
        loader.add_css('remuneration', '#collapseThree  li:nth-child(1)::text',
                       TakeFirst())
        loader.add_css('agreement_term',
                       '#collapseThree  li:nth-child(2)::text',
                       MapCompose(str.strip),
                       re=r'(\d+)')
        loader.add_css('agreement_remain',
                       '#collapseThree  li:nth-child(3)::text',
                       MapCompose(str.strip),
                       re=r'(\d+)')
        loader.add_css('agreement_age',
                       '#collapseThree  li:nth-child(4)::text',
                       MapCompose(str.strip),
                       re=r'(\d+)')
        loader.add_css('office_hour', '#collapseThree  li:nth-child(5)::text')
        loader.add_css('complex_feature',
                       '#collapseThree  li:nth-child(6)::text')
        loader.add_css('manager_bed', '#collapseFour  li:nth-child(1)::text',
                       Compose(lambda v: v[1], str.strip, stop_on_none=True))
        loader.add_css('manager_bathroom',
                       '#collapseFour  li:nth-child(1)::text',
                       Compose(lambda v: v[2], str.strip, stop_on_none=True))
        loader.add_css('manager_car', '#collapseFour  li:nth-child(3)::text')
        loader.add_css('office',
                       '#collapseFour  li:nth-child(4)::text',
                       re=r'\s(\d+).*')
        loader.add_css('pets', '#collapseFour  li:nth-child(5)::text',
                       MapCompose(str.strip))
        loader.add_css('unit_feature', '#collapseFour  li:nth-child(6)::text')
        loader.add_css('description',
                       'div.pgl-detail div.row div.col-sm-12 p::text')
        #loader.add_value('description','tmp description')
        loader.add_value('url', response.url)
        loader.add_value('crawl_date', datetime.date.today())
        price = loader.get_output_value('price')[0]
        #self.logger.info('get out_put price={0}'.format(price))
        unit_price = loader.get_output_value('unit_price')[0]

        try:
            if price != 0:
                loader.add_value('unit_percentage',
                                 round(unit_price / price, 2))
        except Exception as e:
            print('error when calculating unit percentage: {0}'.format(e))
            loader.add_value('unit_percentage', 0)

        try:
            income = loader.get_output_value('income')[0]
            remuneration = loader.get_output_value('remuneration')[0]
            total_unit = loader.get_output_value('total_unit')[0]
            letting = loader.get_output_value('letting')[0]
            if total_unit != 0:
                loader.add_value('wage_per_unit',
                                 round(remuneration / total_unit, 2))
            else:
                loader.add_value('wage_per_unit', 0)
            if letting != 0:
                loader.add_value('income_per_letting',
                                 round((income - remuneration) / letting, 2))
            else:
                loader.add_value('income_per_letting', 0)
        except Exception as e:
            print('error when calculating income percentage: {0}'.format(e))
            loader.add_value('wage_per_unit', 0)
            loader.add_value('income_per_letting', 0)

        item = loader.load_item()

        return item
Example #28
    def parse_statistics(self, response):
        driver = response.meta['driver']
        nav_urls = response.meta['nav_urls']
        parent_loader = response.meta['loader']
        loader = ItemLoader(parent=parent_loader, response=response)
        fiftytwo_week_high = response.xpath(
            "//tr/td/span[text()='52 Week High']/parent::td/following-sibling::td[1]/text()"
        ).get()
        loader.add_value('fiftytwo_week_high', fiftytwo_week_high)
        previous_close = locale.atof(loader.get_output_value('previous_close'))
        one_year_target_est = locale.atof(
            loader.get_output_value('one_year_target_est'))
        diff_to_52_week_high = 1 - (previous_close -
                                    locale.atof(fiftytwo_week_high))
        diff_to_1y_target_est = 1 - (one_year_target_est - previous_close)
        loader.add_value(
            'diff_to_52_week_high',
            f"{self._round_off_2_decimal(diff_to_52_week_high)}%")
        loader.add_value(
            'diff_to_1y_target_est',
            f"{self._round_off_2_decimal(diff_to_1y_target_est)}%")

        forward_pe = self._wait_and_find_elem(
            driver,
            "//tr/td/span[text()='Forward P/E']/parent::td/following-sibling::td[1]"
        ).text
        loader.add_value('forward_pe', forward_pe)
        market_cap = response.xpath(
            "//tr/td/span[contains(text(), 'Market Cap')]/parent::td/following-sibling::td[1]/text()"
        ).get()
        unit = market_cap[-1]
        if unit == 'B':
            multiplier = 1000
        elif unit == 'T':
            multiplier = 1000000
        else:
            multiplier = 1
        market_cap = float(market_cap[0:-1]) * multiplier
        loader.add_value('market_cap', market_cap)
        peg_ratio = response.xpath("//tr/td/span[contains(text(), 'PEG Ratio')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'PEG Ratio')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('peg_ratio', peg_ratio)
        loader.add_xpath(
            'price_over_sales',
            "//tr/td/span[contains(text(), 'Price/Sales')]/parent::td/following-sibling::td[1]/text()"
        )
        price_over_book = response.xpath("//tr/td/span[contains(text(), 'Price/Book')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Price/Book')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('price_over_book', price_over_book)
        return_on_assets = response.xpath("//tr/td/span[contains(text(), 'Return on Assets')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Return on Assets')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('return_on_assets', return_on_assets)
        return_on_equity = response.xpath("//tr/td/span[contains(text(), 'Return on Equity')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Return on Equity')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('return_on_equity', return_on_equity)
        loader.add_xpath(
            'diluted_eps',
            "//tr/td/span[contains(text(), 'Diluted EPS')]/parent::td/following-sibling::td[1]/text()"
        )
        quarterly_earnings_growth = response.xpath("//tr/td/span[contains(text(), 'Quarterly Earnings Growth')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Quarterly Earnings Growth')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('quarterly_earnings_growth',
                         quarterly_earnings_growth)
        fwd_annual_dividend_rate = response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Rate')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Rate')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('fwd_annual_dividend_rate', fwd_annual_dividend_rate)
        fwd_annual_dividend_yield = response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Yield')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Forward Annual Dividend Yield')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('fwd_annual_dividend_yield',
                         fwd_annual_dividend_yield)
        ex_dividend_date = response.xpath("//tr/td/span[contains(text(), 'Ex-Dividend Date')]/parent::td/following-sibling::td[1]/text()").get() or \
            response.xpath("//tr/td/span[contains(text(), 'Ex-Dividend Date')]/parent::td/following-sibling::td[1]/span/text()").get()
        loader.add_value('ex_dividend_date', ex_dividend_date)

        yield SeleniumRequest(url=nav_urls['profile_url'],
                              callback=self.parse_profile,
                              previous_response=response,
                              meta={
                                  "loader": loader,
                                  "nav_urls": nav_urls
                              })