def parse(self, response):
    # Follow pagination links.
    if response.xpath("//a[@class='results']/span[@class='next']").extract_first():
        next_page = response.xpath("//a[@class='results']/@href").extract()[-1]
        # Python 2's urlparse module; on Python 3 use urllib.parse.urljoin.
        full_url = urlparse.urljoin(response.url, next_page)
        yield scrapy.Request(url=full_url, headers=self.headers,
                             callback=self.parse, dont_filter=True)

    l = ItemLoader(item=BabiesrusItem(), response=response)
    l.add_xpath('name', "//a[contains(@class, 'prodtitle')]/text()")
    l.add_xpath('price', "//span[contains(@class, 'ourPrice2')]/text()")
    l.add_xpath('ratings', "//span[contains(@class, 'pr-rounded')]/text()")
    l.add_xpath('producturl', "//a[contains(@class, 'prodtitle')]/@href")
    l.add_value('pageurl', response.url)

    names = l.get_output_value('name')
    prices = l.get_output_value('price')
    ratings = l.get_output_value('ratings')
    producturls = l.get_output_value('producturl')
    pageurl = response.url
    for i in range(len(names)):
        yield {
            'name': names[i],
            'price': prices[i],
            'ratings': ratings[i],
            'producturl': producturls[i],
            'pageurl': pageurl,
        }
def details(self, response):
    l = ItemLoader(item=EducationItem(), response=response)
    l.add_xpath('title', '//h1[@id="firstHeading"]')
    title = l.get_output_value('title')
    l.add_xpath('details', '//div[@class="mw-parser-output"]/p')
    details = l.get_output_value('details')
    return l.load_item()
def parse_post(self, response):
    new = ItemLoader(item=PostItem(), response=response,
                     parent=response.meta['item'])
    new.add_xpath(
        'source',
        "//td/div/h3/strong/a/text() | //span/strong/a/text() | "
        "//div/div/div/a[contains(@href, 'post_id')]/strong/text()")
    new.add_xpath('date', '//div/div/abbr/text()')
    new.add_xpath(
        'text',
        '//div[@data-ft]//p//text() | '
        '//div[@data-ft]/div[@class]/div[@class]/text()')
    new.add_xpath(
        'reactions',
        "//a[contains(@href, 'reaction/profile')]/div/div/text()")

    if new.get_output_value('comments'):
        yield scrapy.Request(
            response.urljoin(response.meta['item'].get_output_value('url')),
            callback=self.parse_comments,
            dont_filter=True,
            meta={'item': new})

    reactions = response.xpath(
        "//div[contains(@id, 'sentence')]/a[contains(@href, 'reaction/profile')]/@href")
    reactions = response.urljoin(reactions[0].extract())
    yield scrapy.Request(reactions,
                         callback=self.parse_reactions,
                         dont_filter=True,
                         meta={'item': new})
def parse_category(self, response):
    meta = response.meta
    categories_dom_xpath = '/html/body/header/div[@class="main-nav"]/div/nav/ul/li'
    category_url_xpath = './a/@href'
    category_name_xpath = './a/span/text()'
    categories_dom = response.xpath(categories_dom_xpath)
    for category_dom in categories_dom:
        category_loader = ItemLoader(item=CategoryItem(), selector=category_dom)
        category_loader.add_xpath('category_id', category_url_xpath)
        category_loader.add_xpath('category_name', category_name_xpath)
        yield category_loader.load_item()

        category_id = category_loader.get_output_value('category_id')
        page = 1
        # Request the on-sale product page.
        yield scrapy.Request(
            url=self.sale_urls[ONSALE].format(category_id, page),
            callback=self.parse_product,
            meta={'time': ONSALE, 'category_id': category_id})
        # Request the coming-sale product page.
        yield scrapy.Request(
            url=self.sale_urls[COMING].format(category_id, page),
            callback=self.parse_product,
            meta={'time': COMING, 'category_id': category_id})
def parse(self, response, **kwargs):
    loader = ItemLoader(item=YelpItem(), response=response)
    loader.default_output_processor = TakeFirst()
    for script in response.css('script').getall():
        if '{"gaConfig' in script:
            detail_json = json.loads(
                re.search(r'({"gaConfig.*?)-->', script).group(1))
            loader.add_value('direct_url', detail_json['staticUrl'])
            loader.add_value(
                'business_id',
                detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessId'])
            loader.add_value(
                'categories',
                detail_json['gaConfig']['dimensions']['www']['second_level_categories'][1])
            loader.add_value(
                'site',
                detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']['linkText'])
            loader.add_value('title', detail_json['bizDetailsPageProps']['businessName'])
            loader.add_value(
                'review_count',
                detail_json['bizDetailsPageProps']['ratingDetailsProps']['numReviews'])
    yield scrapy.Request(
        'https://www.yelp.com/biz_attribute?biz_id={}'.format(
            "".join(loader.get_output_value('business_id'))),
        method='GET',
        callback=self.linkedData,
        meta={'item': loader.load_item()})
def parse_table(self, response):
    data = ItemLoader(item=ParsetauntondeedsItem(), response=response)
    # Rows of the results grid share this prefix.
    row = ('//table[@class="grid"]//tr'
           '[@onmouseout="this.className=this.originalClass;"]/')
    data.add_xpath('date', row + 'td[2]/text()')
    data.add_xpath('type', row + 'td[3]/text()')
    data.add_xpath('book', row + 'td[4]/text()')
    data.add_xpath('page_num', row + 'td[5]/text()')
    data.add_xpath('doc_num', row + 'td[6]/text()')
    data.add_xpath('city', row + 'td[7]/text()')
    data.add_xpath('description', row + 'td[8]/span/text()')
    data.add_value(
        'cost',
        parse_functions.get_cost(data.get_output_value('description')),
        MapCompose(float))
    data.add_value(
        'street_address',
        parse_functions.get_street_address(data.get_output_value('description')))
    data.add_value(
        'state',
        parse_functions.get_state(data.get_output_value('description')))
    data.add_value(
        'zip',
        parse_functions.get_zip(data.get_output_value('description')))
    return data.load_item()
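# The snippet above shows the corpus's recurring derive-from-collected-field
# pattern: get_output_value('description') returns the processed description
# without consuming it, helper functions pull substrings out of it, and
# add_value collects the results (optionally with an extra processor such as
# MapCompose(float)). A minimal, self-contained sketch of the same idea; the
# sample deed string and the ad-hoc regex standing in for the parse_functions
# helpers are illustrative assumptions:
import re

from itemloaders import ItemLoader
from itemloaders.processors import MapCompose

loader = ItemLoader(item={})
loader.add_value('description', 'DEED JOHN DOE $125,000.00 TAUNTON')
# Read the collected value (list under the default Identity output processor).
cost = re.search(r'\$([\d,.]+)', loader.get_output_value('description')[0])
loader.add_value('cost', cost.group(1).replace(',', ''), MapCompose(float))
print(loader.load_item())
# {'description': ['DEED JOHN DOE $125,000.00 TAUNTON'], 'cost': [125000.0]}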
def test_get_output_value_list(self):
    """Getting output value must not remove value from item"""
    input_item = self.item_class(name=["foo", "bar"])
    il = ItemLoader(item=input_item)
    self.assertEqual(il.get_output_value("name"), ["foo", "bar"])
    loaded_item = il.load_item()
    self.assertIsInstance(loaded_item, self.item_class)
    self.assertEqual(loaded_item, dict({"name": ["foo", "bar"]}))
def test_get_output_value_singlevalue(self):
    """Getting output value must not remove value from item"""
    input_item = self.item_class(name='foo')
    il = ItemLoader(item=input_item)
    self.assertEqual(il.get_output_value('name'), ['foo'])
    loaded_item = il.load_item()
    self.assertIsInstance(loaded_item, self.item_class)
    self.assertEqual(ItemAdapter(loaded_item).asdict(), dict({'name': ['foo']}))
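# A minimal, self-contained sketch of the behaviour these tests assert:
# get_output_value() runs the field's output processor over the collected
# values without consuming them, so a later load_item() still sees all of
# them. NameItem and the Join(' ') processor choice are illustrative
# assumptions, not part of the test suite above.
import scrapy
from itemloaders.processors import Join
from scrapy.loader import ItemLoader


class NameItem(scrapy.Item):
    name = scrapy.Field(output_processor=Join(' '))


loader = ItemLoader(item=NameItem())
loader.add_value('name', ['foo', 'bar'])
print(loader.get_output_value('name'))  # 'foo bar' -- processed view only
print(loader.load_item())               # {'name': 'foo bar'} -- values intact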
def _article_requests(self, response):
    index = self.order
    request_list = []  # Holds the Request objects to return.
    loader = ItemLoader(item=XfjyItemItem(), response=response)
    loader.add_xpath("tags_list", "string(//div[@class='weizhi']//td)")
    tags_list = loader.get_output_value("tags_list")
    # Parse out each article's title, date, and URL, then crawl the article.
    tr_tags = response.xpath(
        "//div[@class='main_nei_you_baio_content']//tr[@height='20']")
    for tr_tag in tr_tags:
        # Extract the article URL and join it into an absolute link.
        url = tr_tag.xpath(".//a//@href").extract_first()
        if url:
            url = response.urljoin(url)
        else:
            self.log("No article URL parsed, section link: %s" % response.url,
                     level=logging.ERROR)
        # Extract and normalize the title.
        title = tr_tag.xpath(".//a//@title").extract_first()
        if title:
            title = "".join(title.split())
        else:
            self.log("No title parsed, section link: %s" % response.url,
                     level=logging.WARNING)
        # Extract the date and convert it to a Unix timestamp.
        date = tr_tag.xpath(
            ".//span[@class='timestyle44007']//text()").extract_first()
        if date:
            date = int(time.mktime(
                time.strptime("".join(date.split()), "%Y年%m月%d日")))
        else:
            self.log("No date parsed, section link: %s" % response.url,
                     level=logging.WARNING)
            date = None
        exist = self.filter.filter(url)
        if not exist:
            request = Request(
                url,
                meta={
                    "title": title,
                    "date": date,
                    "tags_list": tags_list,
                    "type": "article",
                    "index": index,
                },
                callback=self.parse_article,
            )
            request_list.append(request)
            index += 1
    return request_list
def getAmenities(self, response):
    loader = ItemLoader(item=response.meta['item'], response=response)
    response_json = json.loads(response.text)[0]
    if response_json['data']['business']['organizedProperties']:
        amenities = [
            amenity['displayText']
            for amenity in response_json['data']['business']
            ['organizedProperties'][0]['properties']
        ]
        loader.add_value('amenities', amenities)
    yield scrapy.Request(
        'https://www.yelp.com/biz/{}/props'.format(
            "".join(loader.get_output_value('business_id'))),
        method='GET',
        headers={
            'Content-Type': 'application/json',
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json',
            'Referer': 'https://www.yelp.com/biz/fog-harbor-fish-house-san-francisco-2',
        },
        callback=self.getAbout,
        meta={'item': loader.load_item()})
def parse(self, response):
    # Data extracted by add_css is passed through the input processor of the
    # field. The result of the input processor is collected and kept in the
    # ItemLoader (but not yet assigned to the item).
    l = ItemLoader(item=Product(), response=response)
    l.add_css("author", '.bauthor::text')
    l.add_css("title", ".btitle::text")
    l.add_css("date_of_publish", '.bdate::text')
    yield l.load_item()

    with open('blog_data.txt', 'a') as f:
        title_list = l.get_output_value('title')
        author_list = l.get_output_value('author')
        data_list = l.get_output_value('date_of_publish')
        for author in author_list:
            f.write('author: {0}\n'.format(author.encode('utf-8')))
        for title in title_list:
            f.write('title: {0}\n'.format(title.encode('utf-8')))
        for data in data_list:
            f.write('data: {0}\n'.format(data.encode('utf-8')))
def parse(self, response):
    for recipe in response.xpath("//div[@class='recipe-page']"):
        loader = ItemLoader(item=TitleItem(), selector=recipe)
        # Use a relative XPath (.//) so the query stays scoped to this
        # recipe node instead of searching the whole document.
        loader.add_xpath('title', ".//div[@class='recipe-title']/h1")
        yield {
            'title': loader.get_output_value('title'),
            'ingredients': response.xpath(
                '//div[contains(@class, "ingredients-card")]//li//text()').getall(),
            'link': response.url,
        }
def parse_item(self, response): """ @url http://splash:8050/render.html?&url=http://www.nettruyenco.com/truyen-tranh/boyfriend-17550&wait=1 @scrapes name source image_src total_chap description chapters web_source full """ manga = ItemLoader(item=MangaCrawlerItem(), response=response) manga.add_xpath("unicode_name", '//h1[@class="title-detail"]/text()') manga.add_value("name", unidecode(manga.get_output_value("unicode_name")[0])) manga.add_value("source", response.url) manga.add_xpath("image_src", '//*[@class="col-xs-4 col-image"]/img/@src') manga.add_xpath("description", '//*[@class="detail-content"]/p//text()', Join("\n")) chapter_xpath = '//*[@id="nt_listchapter"]/nav/ul/li[not(contains (@class, "row heading"))]/div[1]/a' chapter_source = manga.get_xpath(chapter_xpath + "/@href") chapter_name = manga.get_xpath(chapter_xpath + "/text()") chapters = zip(chapter_name, chapter_source) if "Hoàn thành" in manga.get_xpath( '//*[@class="status row"]/p[2]/text()'): manga.add_value("full", True) manga.add_value( "total_chap", manga.get_xpath( chapter_xpath + "/text()", MapCompose(lambda x: re.findall(r"\d+", x)), MapCompose(int), )[0], ) else: manga.add_value("full", False) manga.add_value( "total_chap", manga.get_xpath( "//title/text()", MapCompose( lambda x: re.findall(r" Chapter \d+| Chap \d+", x)), MapCompose(lambda x: re.findall(r"\d+", x)), MapCompose(float), MapCompose(int), TakeFirst(), ), ) manga.add_value("chapters", chapters) manga.add_value("web_source", "nettruyen") print(manga.load_item()) return manga.load_item()
def parse_feed(self, response):
    if response.status == 200:
        loader = ItemLoader(item=Feed(), response=response)
        loader.add_value('id', response.meta['id'])
        loader.add_value('user_id', response.meta['user_id'])
        loader.add_value('feed_id', response.meta['feed_id'])
        loader.add_value('feed_url', response.url)
        loader.add_xpath(
            'post_time',
            "//div[starts-with(@data-ft,'{\"tn\":')]/div/abbr/text()")
        loader.add_xpath('content', '//title/text()')
        content = loader.get_output_value('content')
        # The original `content == 'Photo' or 'Profile Pictures' or ...`
        # is always truthy; test membership instead.
        if content in ('Photo', 'Profile Pictures', 'Cover Photos'):
            loader.add_value('type', 'photo')
        elif content == 'Comments':
            loader.add_value('type', 'comments')
        else:
            loader.add_value('type', 'regular')
        headline = response.xpath(
            'string((//div[@id="root"]//table'
            '[@role="presentation"])[1]//h3)').extract_first()
        if headline:
            loader.add_value('headline', headline)
        loader.add_xpath(
            'links',
            '(//div[@id="root"]//table[@role="presentation"]'
            ')[1]//strong/following-sibling::a/@href')
        location_selector = response.xpath(
            "//div[starts-with(@data-ft,'{\"tn\":')]"
            "/div/abbr/following-sibling::a")
        if location_selector:
            loader.add_value('location', {
                'location': location_selector.xpath('./text()').extract_first(),
                'url': location_selector.xpath('./@href').extract_first(),
            })
        loader.add_value('timestamp', datetime.datetime.now())
        return loader.load_item()
def parse(self, response):
    data = re.findall(r"global.document.metadata=(.+?);\n",
                      response.body.decode("utf-8"), re.S)
    data_dict = json.loads(data[0])
    if data_dict:
        if data_dict['contentType'] == 'books':
            loader = ItemLoader(item=Book(), selector=response)
            loader.default_output_processor = Join()
            loader.add_value('title', data_dict['title'])
            loader.add_value('author', [i['name'] for i in data_dict['authors']])
            loader.add_value('publisher', data_dict['publisher'])
            loader.add_value('chapters', '0')
            loader.add_value('abstract', data_dict['abstract'])
            loader.add_value('doi', data_dict['doi'])
            loader.add_value('ISBN', [i['value'] for i in data_dict['isbn']][1])
            loader.add_value('url', self.start_urls[0])
            loader.add_value('ID', loader.get_output_value('author').split(' ')[0])
            loader.add_value('ENTRYTYPE', 'Book')
        elif data_dict['contentType'] in ('conferences', 'chapter'):
            loader = ItemLoader(item=ConferencePaper(), selector=response)
            loader.default_output_processor = Join()
            loader.add_value('title', data_dict['title'])
            loader.add_value('author', [i['name'] for i in data_dict['authors']])
            loader.add_value('booktitle', data_dict['publicationTitle'])
            loader.add_value('publisher', data_dict['publisher'])
            loader.add_value('year', data_dict['publicationYear'])
            loader.add_value('abstract', data_dict['abstract'])
            loader.add_value('doi', data_dict['doi'])
            loader.add_value('timestamp', data_dict['publicationDate'])
            loader.add_value('url', self.start_urls[0])
            loader.add_value('ENTRYTYPE', 'paper')
            loader.add_value('ID', load_id(loader))
        else:
            loader = ItemLoader(item=Article(), selector=response)
            loader.default_output_processor = Join()
            loader.add_value('author', [i['name'] for i in data_dict['authors']])
            loader.add_value('title', data_dict['title'])
            loader.add_value('journal', data_dict['publicationTitle'])
            loader.add_value('publisher', data_dict['publisher'])
            loader.add_value('abstract', data_dict['abstract'])
            loader.add_value('year', data_dict['publicationYear'])
            loader.add_value('timestamp', data_dict['publicationDate'])
            loader.add_value('doi', data_dict['doi'])
            loader.add_value('url', self.start_urls[0])
            loader.add_value('ENTRYTYPE', 'article')
            loader.add_value('ID', load_id(loader))
        yield loader.load_item()
def parse_item(self, response): """ @url https://doctruyen3q.info/truyen-tranh/dao-hai-tac/77 @scrapes name source image_src total_chap description chapters web_source full unicode_name """ manga = ItemLoader(item=MangaCrawlerItem(), response=response) category = manga.get_xpath("//*[@class='category row']/p[2]//text()") categories = re.sub(r'\s+', '', "".join(category)) if any(i in unidecode(categories).lower() for i in ["18+", "smut", "yaoi", "ntr", "yuri", 'adult', 'dammy']): return manga.add_xpath("unicode_name", '//h1[@class="title-manga"]/text()') manga.add_value("name", unidecode( manga.get_output_value("unicode_name")[0])) manga.add_value("source", response.url) manga.add_xpath( "image_src", '//*[@class="image-comic"]/@src') manga.add_xpath( "description", '//*[@class="detail-summary"]/text()' ) chapter_xpath = '//*[@id="list-chapter-dt"]/nav/ul/li/div[1]/a' chapter_source = manga.get_xpath(chapter_xpath + "/@href") chapter_name = manga.get_xpath(chapter_xpath + "/text()") chapters = zip(chapter_name, chapter_source) if "Đã hoàn thành" in manga.get_xpath('//*[@class="status row"]//text()'): manga.add_value("full", True) else: manga.add_value("full", False) manga.add_value( "total_chap", manga.get_xpath( '//*[@id="list-chapter-dt"]/nav/ul/li[1]/div[1]/a/text()', MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)), MapCompose(float), MapCompose(int), TakeFirst(), ), ) manga.add_value("chapters", chapters) manga.add_value("web_source", "doctruyen3q") print(manga.load_item()) return manga.load_item()
def parse_homepage(self, response):
    loader = ItemLoader(item=FacebookProfile())
    parsed = urlparse(response.url)
    # First non-empty path segment; subscripting filter() directly only
    # works on Python 2, so use a list comprehension instead.
    first_segment = [p for p in parsed.path.split('/') if p][0]
    base_url = '{}://{}/{}'.format(parsed.scheme, parsed.netloc, first_segment)
    if 'id=' in parsed.query and '/profile.php' in parsed.path:
        loader.add_value('profile_url',
                         base_url + '?id=' + parse_qs(parsed.query)['id'][0])
        base_url = base_url + '?id=' + parse_qs(parsed.query)['id'][0] + '&'
    else:
        loader.add_value('profile_url', base_url)
        base_url = base_url + '?'
    loader.add_value('user_name', parsed.path[1:])

    # Parse the about page; look up the id in the database.
    id = get_id(loader.get_output_value('profile_url'))
    loader.add_value('id', id)
    yield Request(
        url=base_url + 'v=info',
        callback=self.parse_about_page,
        priority=1000,
        meta={
            'loader': loader,
            'base_url': base_url,
            'search_friends_depth': response.meta.get(
                'search_friends_depth',
                self.settings.get('SEARCH_FRIENDS_DEPTH', 1)),
            'id': id,
            'friend_with': response.meta.get('friend_with', None),
            'enable_selenium': True,
            'title': response.xpath('//title/text()').extract_first(),
        })
def parse_item(self, response): """ @url http://splash:8050/render.html?&url=https://vlogtruyen.net/bokutachi-wa-hanshoku-wo-yameta.html&wait=1 @scrapes name unicode_name source image_src total_chap description chapters web_source full """ manga = ItemLoader(item=MangaCrawlerItem(), response=response) manga.add_xpath("unicode_name", '//h1[@class="title-commic-detail"]/text()') manga.add_value("name", unidecode(manga.get_output_value("unicode_name")[0])) manga.add_value("source", response.url) manga.add_xpath("image_src", '//meta[@property="og:image"]/@content') manga.add_xpath("description", '//*[@class="desc-commic-detail"]/text()', Join("\n")) chapter_xpath = '//*[@class="ul-list-chaper-detail-commic"]/li/a' chapter_source = manga.get_xpath(chapter_xpath + "/@href") chapter_name = manga.get_xpath(chapter_xpath + "/h3/text()") chapters = zip(chapter_name, chapter_source) if "Đã hoàn thành" in manga.get_xpath( '//*[@class="manga-status"]/p/text()'): manga.add_value("full", True) else: manga.add_value("full", False) manga.add_value( "total_chap", manga.get_xpath( '//*[@class="ul-list-chaper-detail-commic"]/li[1]/a/h3/text()', MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)), TakeFirst(), ), ) manga.add_value("chapters", chapters) manga.add_value("web_source", "vlogtruyen") return manga.load_item()
def parse_item(self, response): """ @url https://mangasee123.com/manga/Kingdom @scrapes name source image_src total_chap description chapters web_source full """ manga = ItemLoader(item=MangaCrawlerItem(), response=response) manga.add_xpath( "unicode_name", "//div[@class='container MainContainer']//li[1]/h1/text()") manga.add_value("name", unidecode(manga.get_output_value("unicode_name")[0])) manga.add_value("source", response.url) manga.add_xpath("image_src", '//meta[@property="og:image"]/@content') manga.add_xpath("description", "//div[@class='top-5 Content']/text()", Join("\n")) if "Complete (Publish)" in manga.get_xpath( '//*[@class="PublishStatus"]/text()'): manga.add_value("full", True) else: manga.add_value("full", False) rss = manga.get_xpath("//a[normalize-space()='RSS Feed']/@href") rss_url = BASE_URL + rss[0] feed = feedparser.parse(rss_url, agent="Mozilla/5.0") manga.add_value( "total_chap", re.findall(r"\d+", feed['entries'][0]['title'])[0], ) chapters = [(i['title'], i['link']) for i in feed['entries']] manga.add_value("chapters", chapters) manga.add_value("web_source", "mangaseeonline") return manga.load_item()
def parse(self, response, **kwargs):
    loader = ItemLoader(item=YelpItem(), response=response)
    for script in response.css('script').getall():
        if '{"gaConfig' in script:
            detail_json = json.loads(
                re.search(r'({"gaConfig.*?)-->', script).group(1))
            loader.add_value('direct_url', detail_json['staticUrl'])
            loader.add_value(
                'business_id',
                detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessId'])
            loader.add_value(
                'categories',
                detail_json['gaConfig']['dimensions']['www']['second_level_categories'][1])
            if detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']:
                loader.add_value(
                    'site',
                    detail_json['bizDetailsPageProps']['bizContactInfoProps']['businessWebsite']['linkText'])
            loader.add_value('title', detail_json['bizDetailsPageProps']['businessName'])
            loader.add_value(
                'review_count',
                detail_json['bizDetailsPageProps']['ratingDetailsProps']['numReviews'])
    # TODO: find a way to not use hardcoded documentIds.
    post_data = [{
        "operationName": "getLocalBusinessJsonLinkedData",
        "variables": {
            "BizEncId": "".join(loader.get_output_value('business_id')),
        },
        "extensions": {
            "documentId": "1cf362b8e8f9b3dae26d9f55e7204acd8355c916348a038f913845670139f60a",
        },
    }]
    yield scrapy.Request(
        'https://www.yelp.com/gql/batch',
        method='POST',
        body=json.dumps(post_data),
        headers={'Content-Type': 'application/json'},
        callback=self.linkedData,
        meta={'item': loader.load_item()})
def parse_filing_documents(self, response):
    request_manager = response.meta['request_manager']
    docket_loader = response.meta['docket_loader']
    table_rows = response.xpath(
        "//div[@id='apexir_DATA_PANEL']//table[@class='apexir_WORKSHEET_DATA']//"
        "tr[@class='even'] | //tr[@class='odd']")
    if table_rows:
        for row in table_rows:
            filing_loader = ItemLoader(item=Filing(), response=response,
                                       selector=row)
            filing_loader.add_xpath('description',
                                    'td[@headers="DESCRIPTION"]/text()')
            print("d. {} ".format(filing_loader.get_output_value('description')))
            filing_loader.add_xpath('filled_on',
                                    'td[@headers="FILING_DATE"]/text()')
            filing_loader.add_xpath('types',
                                    'td[@headers="DOCUMENT_TYPE"]//u/text()')
            filing_loader.add_xpath('filing_parties',
                                    "td[@headers='FILED_BY']/text()")
            document_link = row.xpath("td[@headers='DOCUMENT_TYPE']/a/@href").get()
            if document_link != 'http://www.cpuc.ca.gov/orderadocument/':
                request_parameters = {
                    'document_link': document_link,
                    'docket_loader': docket_loader,
                    'filing_loader': filing_loader,
                }
                request_manager.filing_requests.append(request_parameters)
        next_btn = response.xpath(
            '//*[@id="apexir_DATA_PANEL"]/table/tr[1]/td/span/a/@href').get()
        if next_btn:
            next_btn = next_btn.split("'")[1]
            if response.xpath("//*[@name='p_instance']/@value").get() is None:
                p_instance = response.meta['p_instance']
            else:
                # The original had a trailing comma here, which made
                # p_instance a one-element tuple.
                p_instance = response.xpath("//*[@name='p_instance']/@value").get()
            formdata = {
                'p_request': 'APXWGT',
                'p_instance': p_instance,
                'p_flow_id': '401',
                'p_flow_step_id': '57',
                'p_widget_num_return': '100',
                'p_widget_name': 'worksheet',
                'p_widget_mod': 'ACTION',
                'p_widget_action': 'PAGE',
                'p_widget_action_mod': next_btn,
                'x01': response.xpath(
                    '//input[@id="apexir_WORKSHEET_ID"]/@value').get(),
                'x02': response.xpath(
                    '//input[@id="apexir_REPORT_ID"]/@value').get(),
            }
            yield scrapy.FormRequest(
                'https://apps.cpuc.ca.gov/apex/wwv_flow.show',
                formdata=formdata,
                method="POST",
                callback=self.parse_filing_documents,
                meta={
                    'docket_loader': docket_loader,
                    'cookiejar': response.meta['cookiejar'],
                    'request_manager': request_manager,
                    'p_instance': p_instance,
                })
        else:
            if request_manager.filing_requests:
                request_parameters = request_manager.filing_requests.pop()
                print("length of request_manager {} ".format(
                    len(request_manager.filing_requests)))
                # yield response.follow(request_parameters['document_link'],
                #                       meta={
                #                           'dont_merge_cookies': True,
                #                           'docket_loader': request_parameters['docket_loader'],
                #                           'filing_loader': request_parameters['filing_loader'],
                #                           'request_manager': request_manager},
                #                       callback=self.parse_document_page)
            else:
                return docket_loader.load_item()
def getBusinessHours(self, response):
    loader = ItemLoader(item=response.meta['item'], response=response)
    response_json = json.loads(response.text)[0]
    schedule = dict()
    if response_json['data']['business']['operationHours']:
        for day in response_json['data']['business']['operationHours'][
                'regularHoursMergedWithSpecialHoursForCurrentWeek']:
            schedule[day['dayOfWeekShort']] = "".join(day['regularHours'])
        loader.add_value('schedule', schedule)
    post_data = [{
        "operationName": "GetBizPageProperties",
        "variables": {
            "BizEncId": "".join(loader.get_output_value('business_id')),
        },
        "extensions": {
            "documentId": "f06d155f02e55e7aadb01d6469e34d4bad301f14b6e0eba92a31e635694ebc21",
        },
    }]
    yield scrapy.Request(
        'https://www.yelp.com/gql/batch',
        method='POST',
        body=json.dumps(post_data),
        headers={'Content-Type': 'application/json'},
        callback=self.getAmenities,
        meta={'item': loader.load_item()})
def parse(self, response): r""" Parse the page * The type of the publication is found out from meta tag og:type. * The fields are extracted from the web-page from javascript variable global.document.metadata, selector is the response object itself and loaded into Article or Book or ConferencePaper Item depend on the contentType * The javascript variable is extracted using regex r"global.document.metadata=(.+?);" and saved as a json object * title, * author, * Journal, * publisher, * year, * abstract, * doi, * timestamp, * url, * booktitle, * ENTRYTYPE, * ID, (The ID populated from the function load_id)) :return: Itemloader (item= Article or Conference paper depending on the type) """ data = re.findall(r"global.document.metadata=(.+?);\n", response.body.decode("utf-8"), re.S) data_dict = json.loads(data[0]) if data_dict: if data_dict['contentType'] == 'books': loader = ItemLoader(item=Book(), selector=response) loader.default_output_processor = Join() loader.add_value('title', data_dict['title']) loader.add_value('author', [i['name'] for i in data_dict['authors']]) loader.add_value('publisher', data_dict['publisher']) loader.add_value('chapters', '0') loader.add_value('abstract', data_dict['abstract']) loader.add_value('doi', data_dict['doi']) loader.add_value('ISBN', [i['value'] for i in data_dict['isbn']][1]) loader.add_value('url', self.start_urls[0]) loader.add_value('ID', loader.get_output_value('author').split(' ')[0]) loader.add_value('ENTRYTYPE', 'Book') elif data_dict['contentType'] == 'conferences' or data_dict['contentType'] == 'chapter': loader = ItemLoader(item=ConferencePaper(), selector=response) loader.default_output_processor = Join() loader.add_value('title', data_dict['title']) loader.add_value('author', [i['name'] for i in data_dict['authors']]) loader.add_value('booktitle', data_dict['publicationTitle']) loader.add_value('publisher', data_dict['publisher']) loader.add_value('year', data_dict['publicationYear']) loader.add_value('abstract', data_dict['abstract']) loader.add_value('doi', data_dict['doi']) loader.add_value('timestamp', data_dict['publicationDate']) loader.add_value('url', self.start_urls[0]) loader.add_value('ENTRYTYPE', 'paper') loader.add_value('ID', load_id(loader)) else: loader = ItemLoader(item=Article(), selector=response) loader.default_output_processor = Join() loader.add_value('author', [i['name'] for i in data_dict['authors']]) loader.add_value('title', data_dict['title']) loader.add_value('journal', data_dict['publicationTitle']) loader.add_value('publisher', data_dict['publisher']) loader.add_value('abstract', data_dict['abstract']) loader.add_value('year', data_dict['publicationYear']) loader.add_value('timestamp', data_dict['journalDisplayDateOfPublication']) loader.add_value('doi', data_dict['doi']) loader.add_value('url', self.start_urls[0]) loader.add_value('ENTRYTYPE', 'article') loader.add_value('ID', load_id(loader)) yield loader.load_item()
def parse(self, response): """ Parse the page. * The type of the publication is found out from meta tag og:type * The fields are extracted from the web-page from meta tag , selector is the response object itself and loaded into Article Item * title,(//div[@class='page-title']/h1/text()) * author, (//span[@class='authors-affiliations__name']/text()) * Journal, * publisher,(//span[@id='publisher-name']/text()) * chapters, (//span[@class='c-tabs__deemphasize']/text()) * year, (//meta[@name='citation_publication_date']/@content) * abstract, (//meta[@name='description']/@content) * doi, (//input[@name='doi']/@value) * timestamp, (//meta[@name='citation_publication_date']/@content) * url, (//meta[@property='og:url']/@content) * booktitle, (//meta[@name='citation_inbook_title']/@content) * ENTRYTYPE, (//meta[@property='og:type']/@content) * ID, (The ID populated from the function load_id)) :return: Itemloader (item= Article or Conference paper depending on the type) """ if response.xpath("//meta[@property='og:type']/@content").extract(): type_of_article = response.xpath("//meta[@property='og:type']/@content").extract()[0] elif response.xpath("//span[@class='test-content-type']/text()"): type_of_article = 'Book' else: return None if type_of_article == 'Book': book = response.xpath("//body") loader = ItemLoader(item=Book(), selector=book) loader.default_output_processor = Join() loader.add_xpath('title', "//div[@class='page-title']/h1/text()") loader.add_xpath('author', "//span[@class='authors-affiliations__name']/text()") loader.add_xpath('publisher', "//span[@id='publisher-name']/text()") loader.add_xpath('chapters', "//span[@class='c-tabs__deemphasize']/text()") loader.add_xpath('abstract', "//meta[@name='description']/@content") loader.add_xpath('doi', "//input[@name='doi']/@value") loader.add_xpath('ISBN', "//span[@id='electronic-isbn']/text()") loader.add_value('url', self.start_urls[0]) loader.add_value('ID', loader.get_output_value('author').split(' ')[0]) loader.add_value('ENTRYTYPE', 'Book') elif type_of_article == 'Paper': details = response loader = ItemLoader(item=ConferencePaper(), selector=details) loader.default_output_processor = Join() loader.add_xpath('title', "//meta[@name='citation_title']/@content") loader.add_xpath('author', "//meta[@name='citation_author']/@content") loader.add_xpath('booktitle', "//meta[@name='citation_inbook_title']/@content") loader.add_xpath('publisher', "//meta[@name='citation_publisher']/@content") loader.add_xpath('year', "//meta[@name='citation_publication_date']/@content") loader.add_xpath('abstract', "//meta[@name='description']/@content") loader.add_xpath('doi', "//meta[@name='citation_doi']/@content") loader.add_xpath('timestamp', "//meta[@name='citation_publication_date']/@content") loader.add_xpath('url', "//meta[@property='og:url']/@content") loader.add_value('ENTRYTYPE', type_of_article) loader.add_value('ID', load_id(loader)) else: details = response loader = ItemLoader(item=Article(), selector=details) loader.default_output_processor = Join() loader.add_xpath('author', "//meta[@name='citation_author']/@content") loader.add_xpath('title', "//meta[@name='citation_title']/@content") loader.add_xpath('journal', "//meta[@name='citation_journal_title']/@content") loader.add_xpath('publisher', "//meta[@name='dc.publisher']/@content") loader.add_xpath('abstract', "//div[@class='c-article-section__content']/p/text()") loader.add_xpath('year', "//meta[@name='citation_publication_date']/@content") loader.add_xpath('timestamp', 
"//meta[@name='dc.date']/@content") loader.add_xpath('timestamp', "//meta[@name='citation_publication_date']/@content") loader.add_xpath('doi', "//meta[@name='citation_doi']/@content") loader.add_xpath('url', "//meta[@name='prism.url']/@content") loader.add_value('ENTRYTYPE', type_of_article) loader.add_value('ID', load_id(loader)) yield loader.load_item()
def linkedData(self, response):
    loader = ItemLoader(item=response.meta['item'], response=response)
    address = {}
    response_json = json.loads(response.text)[0]
    location = response_json['data']['business']['location']
    # The original format string had only two placeholders for three
    # address lines; include all three.
    address['street'] = "{}, {}, {}".format(
        location['address']['addressLine1'],
        location['address']['addressLine2'],
        location['address']['addressLine3'])
    address['city'] = location['address']['city']
    address['stateprov'] = location['address']['regionCode']
    address['country'] = location['country']['code']
    address['postalCode'] = location['address']['postalCode']
    loader.add_value(
        'main_img_url',
        response_json['data']['business']['primaryPhoto']['photoUrl']['url'])
    loader.add_value(
        'phone', response_json['data']['business']['phoneNumber']['formatted'])
    loader.add_value('average_rating', response_json['data']['business']['rating'])
    loader.add_value('address', address)
    post_data = [{
        "operationName": "GetBusinessHours",
        "variables": {
            "BizEncId": "".join(loader.get_output_value('business_id')),
        },
        "extensions": {
            "documentId": "35437a3b2abdff32ea1f4d018dbfe66f58fcfb4c804b7ae1c7e341389e9de873",
        },
    }]
    yield scrapy.Request(
        'https://www.yelp.com/gql/batch',
        method='POST',
        body=json.dumps(post_data),
        headers={'Content-Type': 'application/json'},
        callback=self.getBusinessHours,
        meta={'item': loader.load_item()})
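# Each Yelp callback above repeats one hand-off pattern: wrap the partially
# built item from response.meta in a fresh ItemLoader, add a few fields, read
# an already-collected field with get_output_value() (which does not consume
# it), and forward loader.load_item() to the next callback. A minimal sketch
# of one such hop; next_callback, the 'phone' JSON path, and the spider class
# are illustrative assumptions, not code from the project above:
import json

import scrapy
from scrapy.loader import ItemLoader


class YelpChainSketchSpider(scrapy.Spider):
    name = 'yelp_chain_sketch'

    def follow_up(self, response):
        # Re-wrap the item built by the previous callback; its existing
        # values become the loader's collected values.
        loader = ItemLoader(item=response.meta['item'], response=response)
        loader.add_value('phone', json.loads(response.text)[0]['data']['phone'])
        # business_id was collected by an earlier callback.
        biz_id = "".join(loader.get_output_value('business_id'))
        yield scrapy.Request(
            'https://www.yelp.com/biz/{}/props'.format(biz_id),
            callback=self.next_callback,
            meta={'item': loader.load_item()})  # pass the grown item along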
def parse_mr_sqlite(self, response):
    loader = ItemLoader(item=OnsiteItemSqlite(), response=response)
    loader.add_css('property_id', 'ul.amenities-detail li:nth-child(2)::text')
    loader.add_css('last_update', 'ul.amenities-detail li:nth-child(4)::text')
    loader.add_css('suburb', 'ul.amenities-detail li:nth-child(7) strong::text',
                   MapCompose(str.strip))
    loader.add_css('agency', 'img.sidebarAgentLogo::attr(alt)', TakeFirst())
    loader.add_css('agent', 'div.pgl-agent-info h3 a::text', TakeFirst())
    loader.add_css('title', 'div.pgl-detail div.row div.col-sm-12 h1::text')
    loader.add_css('price', 'div.pgl-detail div.row div.col-sm-12 h2::text',
                   TakeFirst())
    loader.add_css('income', '#collapseOne ul li:nth-child(2)::text', TakeFirst())
    loader.add_css('unit_price', '#collapseOne ul li:nth-child(3)::text',
                   TakeFirst())
    loader.add_css('multiplier', '#collapseOne ul li:nth-child(4)::text',
                   re=r'\s(\d+[.]\d+)')
    loader.add_css('letting', '#collapseTwo li:nth-child(1)::text', re=r'\s(\d+).*')
    loader.add_css('owner_occupy', '#collapseTwo li:nth-child(2)::text',
                   re=r'\s(\d+).*')
    loader.add_css('look_ups', '#collapseTwo li:nth-child(3)::text', re=r'\s(\d+).*')
    loader.add_css('outside_agents', '#collapseTwo li:nth-child(4)::text',
                   re=r'\s(\d+).*')
    loader.add_css('total_unit', '#collapseTwo li:nth-child(5)::text',
                   re=r'\s(\d+).*')
    loader.add_css('remuneration', '#collapseThree li:nth-child(1)::text',
                   TakeFirst())
    loader.add_css('agreement_term', '#collapseThree li:nth-child(2)::text',
                   MapCompose(str.strip), re=r'(\d+)')
    loader.add_css('agreement_remain', '#collapseThree li:nth-child(3)::text',
                   MapCompose(str.strip), re=r'(\d+)')
    loader.add_css('agreement_age', '#collapseThree li:nth-child(4)::text',
                   MapCompose(str.strip), re=r'(\d+)')
    loader.add_css('office_hour', '#collapseThree li:nth-child(5)::text')
    loader.add_css('complex_feature', '#collapseThree li:nth-child(6)::text')
    loader.add_css('manager_bed', '#collapseFour li:nth-child(1)::text',
                   Compose(lambda v: v[1], str.strip, stop_on_none=True))
    loader.add_css('manager_bathroom', '#collapseFour li:nth-child(1)::text',
                   Compose(lambda v: v[2], str.strip, stop_on_none=True))
    loader.add_css('manager_car', '#collapseFour li:nth-child(3)::text')
    loader.add_css('office', '#collapseFour li:nth-child(4)::text', re=r'\s(\d+).*')
    loader.add_css('pets', '#collapseFour li:nth-child(5)::text',
                   MapCompose(str.strip))
    loader.add_css('unit_feature', '#collapseFour li:nth-child(6)::text')
    loader.add_css('description', 'div.pgl-detail div.row div.col-sm-12 p::text')
    loader.add_value('url', response.url)
    loader.add_value('crawl_date', datetime.date.today())

    # Derive ratio fields from the values collected above.
    price = loader.get_output_value('price')[0]
    unit_price = loader.get_output_value('unit_price')[0]
    try:
        if price != 0:
            loader.add_value('unit_percentage', round(unit_price / price, 2))
    except Exception as e:
        print('error when calculating unit percentage: {0}'.format(e))
        loader.add_value('unit_percentage', 0)
    try:
        income = loader.get_output_value('income')[0]
        remuneration = loader.get_output_value('remuneration')[0]
        total_unit = loader.get_output_value('total_unit')[0]
        letting = loader.get_output_value('letting')[0]
        if total_unit != 0:
            loader.add_value('wage_per_unit', round(remuneration / total_unit, 2))
        else:
            loader.add_value('wage_per_unit', 0)
        if letting != 0:
            loader.add_value('income_per_letting',
                             round((income - remuneration) / letting, 2))
        else:
            loader.add_value('income_per_letting', 0)
    except Exception as e:
        print('error when calculating income percentage: {0}'.format(e))
        loader.add_value('wage_per_unit', 0)
        loader.add_value('income_per_letting', 0)
    item = loader.load_item()
    return item
def parse_statistics(self, response):
    driver = response.meta['driver']
    nav_urls = response.meta['nav_urls']
    parent_loader = response.meta['loader']
    loader = ItemLoader(parent=parent_loader, response=response)

    fiftytwo_week_high = response.xpath(
        "//tr/td/span[text()='52 Week High']/parent::td/following-sibling::td[1]/text()"
    ).get()
    loader.add_value('fiftytwo_week_high', fiftytwo_week_high)

    previous_close = locale.atof(loader.get_output_value('previous_close'))
    one_year_target_est = locale.atof(
        loader.get_output_value('one_year_target_est'))
    diff_to_52_week_high = 1 - (previous_close - locale.atof(fiftytwo_week_high))
    diff_to_1y_target_est = 1 - (one_year_target_est - previous_close)
    loader.add_value('diff_to_52_week_high',
                     f"{self._round_off_2_decimal(diff_to_52_week_high)}%")
    loader.add_value('diff_to_1y_target_est',
                     f"{self._round_off_2_decimal(diff_to_1y_target_est)}%")

    forward_pe = self._wait_and_find_elem(
        driver,
        "//tr/td/span[text()='Forward P/E']/parent::td/following-sibling::td[1]"
    ).text
    # forward_pe already holds the extracted text, so collect it with
    # add_value (the original passed it to add_xpath).
    loader.add_value('forward_pe', forward_pe)

    market_cap = response.xpath(
        "//tr/td/span[contains(text(), 'Market Cap')]/parent::td/following-sibling::td[1]/text()"
    ).get()
    # Normalize the B/T unit suffix to millions.
    unit = market_cap[-1]
    if unit == 'B':
        multiplier = 1000
    elif unit == 'T':
        multiplier = 1000000
    else:
        multiplier = 1
    market_cap = float(market_cap[0:-1]) * multiplier
    loader.add_value('market_cap', market_cap)

    def stat(label):
        # Some cells wrap the value in a <span>, so try both layouts.
        return (response.xpath(
            f"//tr/td/span[contains(text(), '{label}')]/parent::td/following-sibling::td[1]/text()"
        ).get() or response.xpath(
            f"//tr/td/span[contains(text(), '{label}')]/parent::td/following-sibling::td[1]/span/text()"
        ).get())

    loader.add_value('peg_ratio', stat('PEG Ratio'))
    loader.add_xpath(
        'price_over_sales',
        "//tr/td/span[contains(text(), 'Price/Sales')]/parent::td/following-sibling::td[1]/text()")
    loader.add_value('price_over_book', stat('Price/Book'))
    loader.add_value('return_on_assets', stat('Return on Assets'))
    loader.add_value('return_on_equity', stat('Return on Equity'))
    loader.add_xpath(
        'diluted_eps',
        "//tr/td/span[contains(text(), 'Diluted EPS')]/parent::td/following-sibling::td[1]/text()")
    loader.add_value('quarterly_earnings_growth',
                     stat('Quarterly Earnings Growth'))
    loader.add_value('fwd_annual_dividend_rate',
                     stat('Forward Annual Dividend Rate'))
    loader.add_value('fwd_annual_dividend_yield',
                     stat('Forward Annual Dividend Yield'))
    loader.add_value('ex_dividend_date', stat('Ex-Dividend Date'))

    yield SeleniumRequest(url=nav_urls['profile_url'],
                          callback=self.parse_profile,
                          previous_response=response,
                          meta={
                              "loader": loader,
                              "nav_urls": nav_urls,
                          })