def get_contact(self, response: Response) -> dict:
    """
    Gets the contact information.

    :param response: the response object
    :return: a dict containing the phone and email
    """
    contact = {'email': '', 'phone': '', 'website': response.url, 'meet': ''}
    for result in response.xpath(
            'string(//*[@id="formTechPub1"]/div/table/tr/td[3])').getall():
        phone = extract_phone(result)
        if len(phone) > 0:
            contact['phone'] = phone[0]
    for text in response.xpath(
            '//*[@id="formTechPub1"]/div/table/tr/td[3]//a/@href').getall():
        if text.startswith('mailto:'):
            contact['email'] = text.split(':')[-1]
            break
    return contact
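# `extract_phone` is a shared helper used by several of the get_contact
# parsers in this collection but is not shown here; the sketch below is only
# an assumption about its behaviour (regex-based phone extraction), not the
# project's actual implementation.
import re
from typing import List, Optional


def extract_phone(text: Optional[str]) -> List[str]:
    """Return every phone-like number found in ``text`` (empty list if none)."""
    if not text:
        return []
    # loose pattern: optional country code, then digit groups separated by
    # spaces, dots, dashes, or parentheses
    pattern = re.compile(r'(?:\+?\d{1,3}[\s.-]?)?\(?\d{2,4}\)?[\s.-]?\d{3}[\s.-]?\d{3,4}')
    return [match.group(0).strip() for match in pattern.finditer(text)]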
def response_transform(self, response: Response) -> Iterable[Thread]:
    """ Makes a list of items from the response """
    forum_id = self.extract_forum_id_from_url(response.url)
    print(str(forum_id))
    thread_strings = response.xpath(
        '//tbody/tr[@class="thread"]/@id').extract()  # gives 'thread#######'
    thread_authors = response.xpath(
        '//tbody//td[@class="author"]/a/text()').extract()
    titles = response.xpath('//a[@class="thread_title"]/text()').extract()
    views = response.xpath('//td[@class="views"]/text()').extract()
    replies = response.xpath('//td[@class="replies"]/text()').extract()
    # parse everything (assumes the forum lists 40 threads per page)
    for i in range(0, 40):
        thnum = re.search(r'(\d{7})', thread_strings[i]).group(0)
        author = thread_authors[i]
        title = titles[i]
        vw = views[i]
        reply = replies[i]
        if vw == '-' or reply == '-':
            # admin/announcement threads carry no counts; skip them
            continue
        # print(str([thread_authors, titles, views, replies]))
        item = Thread(int(thnum), title, author, int(vw), int(reply), int(forum_id))
        yield item
def parse(self, response: Response) -> Iterator[Optional[Dict[str, Any]]]:
    """Parse response from IseWan Vessel Traffic Service Centre website."""
    reported_date = response.xpath('//div[@class="_inner"]/p/text()').extract_first()

    # holds the sequential list of vessel lineups for advanced parsing
    events = []
    table = response.xpath('//table[@class="generalTB"]')
    for row_idx, row in enumerate(table.xpath('.//tr')):
        # first row of the source table is always the header
        if row_idx == 0:
            headers = row.xpath('.//th/text()').extract()
            continue

        # subsequent rows are exclusively vessel movements
        raw_item = row_to_dict(row, headers)
        # contextualise item with meta info
        raw_item.update(provider_name=self.provider, reported_date=reported_date)
        # standardize character width
        for key, value in raw_item.items():
            raw_item[key] = may_strip(_standardize_char_width(value))
        event = normalize.process_item(raw_item)
        if event:
            events.append(event)

    # combine arrival and departure events into a single 'PortCall' datatype
    for event in events:
        yield from normalize.combine_event(event, events)
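# `row_to_dict`, `may_strip`, `_standardize_char_width`, and the `normalize`
# module are project utilities that are not shown in this collection. As a
# rough idea of what `row_to_dict` is assumed to do (map a table row's cells
# onto the header names), a minimal sketch might be:
def row_to_dict(row, headers):
    """Zip the string value of each cell in ``row`` against ``headers``."""
    cells = [cell.xpath('string(.)').get('') for cell in row.xpath('./td')]
    return dict(zip(headers, cells))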
def parse(self, response: Response):
    # collect every link on the directory listing page
    all_link = response.xpath('//a/@href')
    links = []
    for link in all_link:
        url = link.extract()
        url = response.url + url
        links.append(url)
        log.info(url)

    # entries whose last whitespace-separated field is '-' are treated as directories
    is_dirs = []
    all_text = response.xpath('/html/body/pre/text()')
    for _text in all_text:
        text: str = _text.extract()
        text = text.strip()
        laststr = text.split(' ')[-1]
        print("-->" + laststr + "<--")
        is_dirs.append(laststr == '-')

    for i in range(len(all_link)):
        item = PackagesItem()
        url = links[i]
        if '../' in url:
            continue
        is_dir = is_dirs[i]
        item['url'] = url
        item['is_dir'] = is_dir
        if is_dir:
            # recurse into sub-directories; yield leaf entries as items
            yield scrapy.Request(url, callback=self.parse)
        else:
            yield item
def ads_item_parse(self, response: Response):
    db = self.data_base_client['db_habr']
    collection = db['habr_blog']
    title = response.xpath('//span[@class="post__title-text"]/text()').extract_first()
    url_stat = response.request.url
    name_autor = response.xpath(
        '//span[contains(@class, "user-info__nickname")]//text()').extract_first()
    href_autor = response.xpath(
        '//header[contains(@class, "post__meta")]//a//@href').extract_first()
    images = response.xpath('//img/@src').getall()
    item = {
        'title': title,
        'url_stat': url_stat,
        'name_autor': name_autor,
        'href_autor': href_autor,
        'images': images,
    }
    # insert a copy so MongoDB's added '_id' does not leak into the yielded item
    collection.insert_one(dict(item))
    yield item
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for link in response.xpath(
            "//div[@class='view-content']/div[contains(@class,'views-row')]/div/h3/a"):
        text = link.xpath("text()").get()
        url = link.xpath("@href").get()
        self.log("find technology {}/{}".format(text, url), level=logging.INFO)
        patent_links.append(url)

    # for next page
    next_page = response.xpath("//li[@class='pager-next']/a/@href").get()
    if next_page is not None:
        self.log('process page {}'.format(next_page), level=logging.INFO)
        yield response.follow(
            url=next_page,
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)

    for p in patent_links:
        name = self.parse_name_from_url(p)
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def get_contact(self, response: Response) -> dict:
    """
    Gets the contact information.

    :param response: the response object
    :return: a dict containing the phone and email
    """
    contact = {'email': '', 'phone': '', 'website': response.url, 'meet': ''}
    # phone number
    for text in response.xpath("//div[@class='information']/ul/li/text()").getall():
        result = extract_phone(text)
        if len(result) > 0:
            contact['phone'] = result[0]
            break
    # email address
    for text in response.xpath("//div[@class='information']/ul/li/a/@href").getall():
        if text.startswith('mailto:'):
            contact['email'] = text.split(':')[-1]
            break
    return contact
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for link in response.xpath("//a[@class='lead-in']/@href").getall():
        patent_links.append(link)

    # for next page
    next_page = response.xpath("//div[@class='nav-previous']/a/@href").get()
    if next_page is not None:
        self.log('process page {}'.format(next_page), level=logging.INFO)
        yield response.follow(
            url=next_page,
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)

    for p in patent_links:
        name = p.split('/')[-2]
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def extract_table(response: Response) -> Generator[Tuple[str, str], None, None]:
    versions = response.xpath(TPLinkGPL.XPATH['table_device_version']).extract()
    links = response.xpath(TPLinkGPL.XPATH['table_device_link']).extract()
    for version, link in zip(versions, links):
        yield version.strip(), link.strip()
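# `TPLinkGPL.XPATH` is a mapping of named XPath expressions defined elsewhere
# in the project and also used by `extract_multi_firmware` further down. The
# entries below are placeholders that only illustrate the assumed shape of the
# mapping, not the selectors actually used against tp-link.com.
class TPLinkGPL:
    XPATH = {
        # columns of the GPL code-center device table (hypothetical)
        'table_device_version': '//table//tr/td[1]/text()',
        'table_device_link': '//table//tr/td[2]/a/@href',
        # entries on multi-firmware device pages (hypothetical)
        'device_names_multi': '//ul/li/a/text()',
        'device_links_multi': '//ul/li/a/@href',
    }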
def parse_thread(self, response: Response) -> Iterator[Request]:
    """
    Parse a thread, gathering the URLs within.

    A thread consists of:
      - Email URLs
      - Next Page Pagination URL

    See https://marc.info/?t=111957107900001&r=1&w=2

    :param response: Scrapy response
    :return: Request object callback to sub-parsing functions.
    """
    self.logger.info(f"parse_thread: {response.url}")
    email_urls = response.xpath("//a[contains(@href, '&m=')]/@href").extract()
    email_urls = [self.marc_url + u for u in email_urls]
    for url in email_urls:
        yield Request(url, self.parse_email)

    next_page_url = response.xpath("//pre//a[contains(text(), 'Next')][1]//@href").extract()
    if any(next_page_url):
        next_url = self.marc_url + next_page_url[0]
        yield Request(next_url, callback=self.parse_thread)
def response_transform(response: Response):
    """ Makes a list of items from the response """
    forum_id = extract_forum_id_from_url(response.url)
    thread_strings = response.xpath(
        '//tbody/tr[contains(@class,"thread")]/@id').extract()  # gives 'thread#######'
    thread_authors = response.xpath(
        '//tbody/tr[@id]/td[@class="author"]/a/text()').extract()
    thread_author_ids = BQThreadSpider.get_thread_author_ids(response)
    titles = response.xpath('//a[@class="thread_title"]/text()').extract()
    if not (len(titles) == len(thread_author_ids)
            == len(thread_authors) == len(thread_strings)):
        log.warning(
            "Extracted components do not match on page %s -- "
            "titles: %d, author ids: %d, authors: %d, thread ids: %d",
            response.url, len(titles), len(thread_author_ids),
            len(thread_authors), len(thread_strings))
    # parse everything
    for i in range(0, len(thread_strings)):
        thnum = re.search(r'(\d{7})', thread_strings[i]).group(0)
        author = thread_authors[i]
        title = titles[i]
        aid = thread_author_ids[i]
        item = ThreadAvro(int(forum_id), int(thnum), title, author, int(aid), False)
        yield item
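# `BQThreadSpider.get_thread_author_ids` is referenced above but not included
# in this collection; the sketch below is only a guess at its behaviour
# (pulling a numeric profile id out of each author link on the same rows) and
# is written as a standalone helper purely for illustration.
import re

from scrapy.http import Response


def get_thread_author_ids(response: Response) -> list:
    """Return the numeric id found in each thread row's author profile link."""
    hrefs = response.xpath('//tbody/tr[@id]/td[@class="author"]/a/@href').extract()
    ids = []
    for href in hrefs:
        match = re.search(r'(\d+)', href)
        ids.append(match.group(0) if match else '0')
    return ids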
def get_contact(self, response: Response) -> (dict, dict):
    """
    Gets the contact information.

    :param response: the response object
    :return: a tuple of two dicts, one for the user and the other for the
        contact information
    """
    contact = {'email': '', 'phone': '', 'website': response.url, 'meet': ''}
    # manager
    name = response.xpath("//dd[@class='manager']/a/text()").get()
    link = response.xpath("//dd[@class='manager']/a/@href").get()
    manager = create_user()
    manager['name'] = name
    manager['ref'] = link
    tag = response.xpath("//dd[@class='manager']/div/em[1]/text()").get()
    if tag is not None and isinstance(tag, str):
        manager['tag'] = remove_empty_string_from_array(tag.split(', '))
    contact['phone'] = response.xpath("//dd[@class='manager']/div/em[2]/text()").get()
    manager['contact'] = contact
    manager['contact']['website'] = link
    self.log('find manager {} with contact {}'.format(manager, contact),
             level=logging.DEBUG)
    return manager, contact
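# `create_user` and `remove_empty_string_from_array` are project helpers that
# are not shown in this collection. The sketches below only capture the
# behaviour the parser above appears to rely on (a blank user record and a
# list filter); they are assumptions, not the project's implementations.
def create_user() -> dict:
    """Return an empty user record with the fields the spiders fill in."""
    return {'name': '', 'ref': '', 'tag': [], 'contact': {}}


def remove_empty_string_from_array(values: list) -> list:
    """Drop empty or whitespace-only strings from a list."""
    return [v for v in values if v and v.strip()]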
def parse_forum(self, response: Response):
    forum_url_query = urllib.parse.urlparse(response.url).query
    forum_id = int(urllib.parse.parse_qs(forum_url_query)['id'][0])
    forum_title = response.xpath('//div[@id="pun-main"]/h1/span/text()').get()
    section_id = self.forums[forum_id]
    yield ForumItem(id=forum_id, title=forum_title, section_id=section_id)

    for topic in response.xpath(
            '//div[@class="forum"]/div[@class="container"]//tbody//tr'
            '/td[@class="tcl"]/div[@class="tclcon"]'):
        topic_url = topic.xpath('a/@href').get()
        topic_url_query = urllib.parse.urlparse(topic_url).query
        topic_id = int(urllib.parse.parse_qs(topic_url_query)['id'][0])
        self.topics[topic_id] = forum_id
        yield response.follow(topic_url, self.parse_topic)

    next_page_url = response.xpath('//div[@class="pagelink"]/a[@class="next"]/@href').get()
    if next_page_url:
        yield response.follow(next_page_url, callback=self.parse_forum)
def scrape_chapter(response: Response):
    book_num = response.meta['book_num']
    file_name = response.url.replace("https://practicalguidetoevil.wordpress.com", "")
    file_name = file_name.strip("/").replace("/", "_")
    chapter_title = response.css("header > h1").xpath("text()").extract_first()
    paragraphs = response.xpath('//*[starts-with(@id, "post")]/div/div/p')

    # readable plain-text copy; this drops markup such as italics/bold
    with open(f"chapters/Book {book_num}/{file_name}.txt", "w", encoding="utf8") as file:
        file.write(f"---{chapter_title}---")
        file.write("\r\n\r\n")
        for p in paragraphs:
            p_str = " ".join(p.xpath("*//text()").extract())
            if p_str == "":
                p_str = " ".join(p.xpath("text()").extract())
            file.write(p_str)
            file.write("\r\n")

    # raw HTML source copy
    with open(f"sources/Book {book_num}/{file_name}.html", "w", encoding="utf8") as file:
        file.write(response.xpath('//*[starts-with(@id, "post")]/div/div').extract_first())

    print(f"Scraped Book {book_num}: {chapter_title}")
def parse(self, response: Response):
    for row in response.xpath(
            '//div[@class="usertable"]//div[@class="container"]//tbody/tr'):
        profile_url = row.xpath('td[@class="tcl"]//a/@href').get()
        yield response.follow(profile_url, callback=self.parse_profile)

    next_page_url = response.xpath('//div[@class="pagelink"]/a[@class="next"]/@href').get()
    if next_page_url:
        yield response.follow(next_page_url, callback=self.parse)
def parse_detail(self, response: Response):
    XPATH_TITLE = "//div[@class='text']//h4[1]/text()"
    XPATH_COURSE = "//div[@class='childtitle']//p/text()"
    XPATH_VIDEO = "//video/@src"
    title = response.xpath(XPATH_TITLE).get()
    course = response.xpath(XPATH_COURSE).get()
    video_url = response.urljoin(response.xpath(XPATH_VIDEO).get())
    return XdvideoItem(title=title,
                       course=course,
                       file_urls=[video_url],
                       episode=response.meta["n"])
def _get_lyrics_text(response: Response) -> str:
    selectors: List[Selector] = response.xpath("//pre")
    if not selectors:
        try:
            return response.body.decode("latin1")
        except Exception:
            raise OhhlaException(
                f"Skipping {response.url}; could not decode into 'latin1' encoding.")
    elif len(selectors) > 1:
        raise OhhlaException(
            f"Skipping {response.url}; non-conformant for a song page.")
    else:
        return selectors[0].root.text
def parse(self, response: Response, **kwargs): if self.url_to_crawl: yield response.follow(url=self.url_to_crawl, callback=self.parse_residences) else: residences = response.xpath("//a[contains(@class,'detalii-proprietate')][contains(.,'Vezi detalii')]/@href").getall() residences = list(set(residences)) yield from response.follow_all(urls=residences, callback=self.parse_residences) next_page = response.xpath("//a[@class='inainte butonpaginare']/@href").get() if next_page: yield response.follow(url=next_page, callback=self.parse)
def single_parse(self, response: Response):
    self.debug(response)
    if not response.xpath('//h1[@id="title"]/*/text()').get('').strip():
        # the page did not render a title; retry it (a generator callback must
        # yield the request rather than return it, or Scrapy never schedules it)
        yield Request(response.url, self.single_parse, dont_filter=True,
                      errback=self.errors('single_parse'))
        return
    log('product')
    now = dt.now(timezone('Asia/Tokyo'))
    product = AmazonItem()
    product['time'] = now.strftime('%Y-%m-%dT%H-%M-%S')
    product['title'] = response.xpath('//h1[@id="title"]/*/text()').get('').strip()
    product['url'] = response.url
    review = response.css('span#acrCustomerReviewText::text').get('')
    product['review_num'] = review[0:-4] if review else 0
    product['description'] = '\n'.join([
        x.strip() for x in response.css('#feature-bullets > ul > li *::text').getall()
        if x.strip() not in ('', 'モデル番号を入力してください', 'これが適合するか確認:')
    ])

    seller = response.css('a#sellerProfileTriggerId')
    if seller:
        shop_name = seller.css('*::text').get('')
        seller_id = get_query_val(seller.attrib['href'], 'seller')
        shop_url = f'{amazon_url}/sp?seller={seller_id}' if seller_id else ''
    elif response.xpath('//*[@id="merchant-info"]/a'):
        shop_name = 'Amazon.co.jp'
        shop_url = 'https://www.amazon.co.jp/gp/help/customer/display.html?nodeId=202008070'
    else:
        shop_name = '-'
        shop_url = ''
    product['shop_name'] = shop_name
    product['shop_url'] = shop_url
    product['categories'] = ' > '.join([
        el.get().strip() for el in response.css(
            '#wayfinding-breadcrumbs_feature_div > ul > li > span > a::text')
    ])

    if shop_url == 'https://www.amazon.co.jp/gp/help/customer/display.html?nodeId=202008070':
        # sold directly by Amazon.co.jp: use its registered address
        product['shop_address'] = '〒153-0064 東京都目黒区下目黒1-8-1 日本'
        yield product
    elif shop_url:
        yield Request(shop_url, self.shop_parse, meta={'product': product},
                      dont_filter=True,
                      errback=self.errors('single_parse', response.url))
    else:
        product['shop_address'] = '---'
        yield product
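# `get_query_val` is a small helper used above to read one query-string
# parameter out of a URL; a minimal sketch with the standard library (an
# assumption about the helper, not its actual source) could look like this:
from urllib.parse import parse_qs, urlparse


def get_query_val(url: str, key: str) -> str:
    """Return the first value of ``key`` in ``url``'s query string, or ''."""
    values = parse_qs(urlparse(url).query).get(key, [])
    return values[0] if values else ''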
def _document(self, response: Response, **kwargs: dict) -> Generator[dict, None, None]:
    link = kwargs['url']
    title = response.xpath('//h1[@class="documentFirstHeading"]/text()').get()
    paragraphs = response.xpath(
        '//div[@class="field-item even"]//p[string-length(text()) > 3 and not(@class)]/text()'
    ).getall()
    img = response.xpath(
        '//div[@class="field-item even"]//a[not(@class) and @target="_blank"]/img/@src'
    ).get()
    yield {'url': link, 'title': title, 'body': paragraphs, 'img': img}
def parse(self, response: Response, **kwargs):
    a_tags = response.xpath(
        "//div[@class='search-lists-container']//div[@class='car-name-left']/h4/a")
    for a in a_tags:
        url = a.xpath('./@href').get()
        yield Request(url=response.urljoin(url), callback=self.parse_detail)

    next_page_link = response.xpath(
        "(//li[@class='pagination-li pag-next']/a/@href)[1]").get()
    if next_page_link is not None:
        yield Request(url=response.urljoin(next_page_link), callback=self.parse)
def readScript(self, response: Response):
    if self.num < self.maxNum:
        script = response.xpath("//td[@class='scrtext']/pre").extract_first()
        if not script:
            script = response.xpath("//td[@class='scrtext']").extract_first()
        if script:
            if not os.path.exists("data"):
                os.mkdir("data")
            with open("data/script" + str(self.num) + ".txt", "w+") as f:
                f.write(remove_tags(script))
            self.num += 1
def get_contact(self, response: Response) -> dict:
    """
    Gets the contact information.

    :param response: the response object
    :return: a dict containing the phone and email
    """
    contact = {'email': '', 'phone': '', 'website': response.url, 'meet': ''}
    contact['email'] = response.xpath("string(//*[@id='email'])").get()
    phone = extract_phone(response.xpath("//*[@id='PhoneNumber']/@onclick").get())
    if len(phone) > 0:
        contact['phone'] = phone[0]
    self.log('Found contact {}'.format(contact), level=logging.DEBUG)
    return contact
def parse_item(self, response: Response):
    item = PhotoSetItem()
    item['title'] = response.xpath('//div[@class="title"]/h1/text()').extract()[0]
    info = response.xpath('//div[@class="info"]/left/text()').extract()[0]
    # each info field carries a three-character label prefix; drop it
    item['datetime_published'], item['author'], item['source'] = [
        e[3:] for e in re.split(r'\xa0+', info)
    ]
    item['url'] = response.url
    item['tags'] = '/'.join(
        response.xpath('//div[@class="tags"]/a[@target="_blank"]/text()').extract())
    item['description'] = response.xpath('//div[@class="text"]/text()').extract()[0]
    yield item
def add_keywords(self, response: Response) -> list:
    """
    Obtain the keywords of the patent.

    :param response: response
    :return: list of keywords
    """
    categories = response.xpath(
        "//div[@class='ncd-data otherdata-categories display-block indented ']//a/text()"
    ).getall()
    try:
        # the keywords block may not exist, in which case .get() returns None
        categories.extend(
            response.xpath(
                "//div[@class='ncd-data otherdata-keywords']/p/text()").get().split(', '))
    except AttributeError:
        pass
    return categories
def parse_list(self, response: Response):
    # wait for page to load
    # wait for the redirect to finish.
    patent_links = []
    for link in response.xpath('//*[@id="formTechPub1"]/div/table[2]/tr/td/a'):
        text = link.xpath("text()").get()
        url = link.xpath("@href").get()
        self.log("find technology {}/{}".format(text, url), level=logging.INFO)
        patent_links.append(url)

    # for next page
    total_result = self.statictics(response)
    self.page += 1
    if self.page * self.item_per_page < total_result:
        self.log('process page {}'.format(self.page), level=logging.INFO)
        yield response.follow(
            url=self.next_page_template.format(self.page),
            callback=self.parse_list,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)

    for p in patent_links:
        name = self.parse_name_from_url(p)
        if os.path.exists(os.path.join(self.work_directory, name + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get()} if self.with_proxy else {},
            errback=self.handle_failure)
def get_contact(self, response: Response) -> dict:
    """
    Gets the contact information.

    :param response: the response object
    :return: the contact information
    """
    contact = {"website": "", "meet": "", "email": "", "phone": ""}
    email = response.xpath("//div[@class='c_tp_contact']/a/text()").get()
    if email is not None:
        contact['email'] = email
    phone = extract_phone(response.xpath("string(//div[@class='c_tp_contact'])").get())
    if len(phone) > 0:
        contact['phone'] = phone[0]
    return contact
def parse_content(self, response: Response):
    if response.status == 200:
        # keyword arguments to .xpath() are bound as XPath variables ($id)
        contents = response.xpath('//*[@id=$id]//*/text()', id='text110').extract()
        title = response.meta.get('title')
        return {'title': title, 'contents': contents}
def parse(self, response: Response, **kwargs):
    for script in response.xpath('//script/text()').getall():
        # look for the specific script tag we want
        if 'INITIAL_STATE' in script:
            # extract the interesting part from the script tag
            m = re.match(r'window\.INITIAL_STATE\s+=\s+({[\s\S]+});', script)
            # decode it, handling the unicode escapes emitted by the site renderer
            custom_demjson = CustomJSON(
                json_options=demjson.json_options(compactly=False))
            decoded = custom_demjson.decode(m.group(1), encoding='unicode-escape')
            # write a proper valid JSON file out
            # with open('example.json', 'w', encoding='utf-8') as file:
            #     file.write(custom_demjson.encode(decoded))
            raw_data = decoded['searchData']
            word = Word.from_raw(data=raw_data)
            urls = word.get_urls()
            new = urls - self.queue
            self.queue.update(new)
            if len(new) > 0:
                print(f'Found {len(new)} more URLs.')
            return response.follow_all(new)
def extract_multi_firmware(response: Response) -> Generator[Tuple[str, str], None, None]:
    names = response.xpath(TPLinkGPL.XPATH['device_names_multi']).extract()
    links = response.xpath(TPLinkGPL.XPATH['device_links_multi']).extract()
    for device, link in zip(names, links):
        yield device.strip(), (
            'https://www.tp-link.com/phppage/gpl-res-list.html'
            f'{link.strip()}&appPath=de')