def handle_page(self, response: Response) -> TorrentFileItem: torrents = response.css( 'a[href^="forum.php?mod=attachment"]:contains("torrent")::attr(href)' ).extract() page_links = response.css( 'a[href^="imc_attachad-ad.html"]:contains("torrent")::attr(href)' ).extract() if len(torrents) < 1 and len(page_links) < 1: return for torrent in torrents: request = DownloadRequest( url=response.urljoin(torrent), # relative url to absolute callback=self.handle_item) request.meta['from_url'] = response.url yield request regex = re.compile(r'aid=(\w+)') for page_link in page_links: match = regex.search(page_link) if not match: continue id = match.group(1) request = DownloadRequest( url=response.urljoin('forum.php?mod=attachment&aid=%s' % id), # relative url to absolute callback=self.handle_item, dont_filter=True) request.meta['from_url'] = response.url yield request
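# DownloadRequest is not defined in these snippets; a minimal sketch, assuming
# it is a thin scrapy.Request subclass used to mark torrent-file downloads
# (the class body here is an assumption, not the project's actual definition):
import scrapy

class DownloadRequest(scrapy.Request):
    """Request whose response body should be stored as a .torrent file."""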
def parse(self, response: Response, **kwargs):
    a_tags = response.xpath(
        "//div[@class='search-lists-container']//div[@class='car-name-left']/h4/a"
    )
    for a in a_tags:
        url = a.xpath('./@href').get()
        yield Request(url=response.urljoin(url), callback=self.parse_detail)
    next_page_link = response.xpath(
        "(//li[@class='pagination-li pag-next']/a/@href)[1]").get()
    if next_page_link is not None:
        yield Request(url=response.urljoin(next_page_link), callback=self.parse)
def getScript(self, response: Response):
    if self.num < self.maxNum:
        scriptLink = response.xpath(
            "//a[starts-with(@href, '/scripts/')]/@href").extract_first()
        if scriptLink and scriptLink.endswith("html"):  # extract_first may return None
            yield Request(response.urljoin(scriptLink), callback=self.readScript)
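# readScript, the callback above, is not included in these snippets; a minimal
# sketch, assuming it counts the visit and extracts the script text (the field
# names and the //pre selector are assumptions):
def readScript(self, response: Response):
    self.num += 1
    yield {
        "url": response.url,
        "text": "\n".join(response.xpath("//pre//text()").getall()),
    }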
def parse(self, response: Response) -> Iterable[Union[Request, Mapping]]:
    yield from self.follow_pages(response)
    for ingredient in response.css("a.promo__ingredient"):
        yield {
            "name": ingredient.css("h3::text").get(),
            "url": response.urljoin(ingredient.attrib["href"]),
        }
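# follow_pages is not shown here; a minimal sketch, assuming the site exposes
# a conventional rel="next" pagination link (the selector is an assumption):
def follow_pages(self, response: Response) -> Iterable[Request]:
    next_href = response.css('a[rel="next"]::attr(href)').get()
    if next_href is not None:
        yield Request(response.urljoin(next_href), callback=self.parse)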
def parse_product(self, response: Response,
                  device_name: str) -> Generator[Request, None, None]:
    software_page = response.xpath(self.x_path['software_exists']).get()
    if software_page:
        yield Request(url=response.urljoin(software_page),
                      callback=self.parse_versions,
                      cb_kwargs=dict(device_name=device_name))
def parse(self, response: Response) -> Iterator[Union[Case, Request]]:
    for case in response.css("div.announcement"):
        # if we have cases, this is not the last page; the duplicate next-page
        # requests yielded per case are collapsed by Scrapy's dupefilter
        offset = response.meta.get("offset", 0)
        yield self.request_offset(offset + 1)
        # get main data from the case
        title = case.css("h4 a")
        href = title.attrib["href"]
        name = title.css("::text").get()
        if not name:
            continue
        # parse textual data from the case
        contents = tuple(
            line.strip() for line in case.css("::text").getall() if line.strip()
        )
        kwargs = {field: parser(contents) for field, parser in PARSERS.items()}
        yield Case(
            name=name,
            url=response.urljoin(href),
            full_text="\n".join(contents),
            **kwargs,
        )
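# request_offset is referenced above but not defined in these snippets; a
# minimal sketch, assuming the listing is paginated through an "offset" query
# parameter (the listing_url attribute is hypothetical):
def request_offset(self, offset: int) -> Request:
    request = Request(self.listing_url.format(offset=offset),  # assumed attribute
                      callback=self.parse)
    request.meta["offset"] = offset
    return request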
def parse(self, response: Response):
    links = response.xpath('//main//a[@class="article"]/@href').extract()
    for link in links:
        absolute_url = response.urljoin(link)
        yield scrapy.Request(absolute_url, callback=self.parse_concert,
                             dont_filter=True)
def parse(self, response: Response):
    self.num = 0
    for nextPage in response.xpath(
            "//a[@title and starts-with(@href, '/Movie Scripts/')]/@href"
    ).extract():
        if self.num < self.maxNum:
            yield Request(response.urljoin(nextPage), callback=self.getScript)
def parse(self, response: Response) -> Generator[Request, None, None]:
    for product_url, device_name in zip(
            response.xpath(self.x_path['product_urls']).extract(),
            response.xpath(self.x_path['device_names']).extract()):
        yield Request(url=response.urljoin(product_url),
                      callback=self.parse_product,
                      cb_kwargs=dict(device_name=device_name))
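# The x_path mapping shared by parse and parse_product is not included; a
# minimal sketch of its expected shape (the expressions are placeholders, not
# the real selectors):
x_path = {
    'product_urls': "//a[@class='product']/@href",
    'device_names': "//a[@class='product']/text()",
    'software_exists': "//a[contains(@href, 'software')]/@href",
}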
def parse(self, response: Response) -> Generator[Request, None, None]:
    links_declassified = response.xpath(
        '//a[starts-with(@href,"collection") and (parent::h3 | parent::h2)]/@href'
    ).getall()
    for link in links_declassified:
        yield response.follow(link, callback=self._document,
                              cb_kwargs={'url': response.urljoin(link)})
def get_csv_files(self, response: Response):
    # get all <a> items with 'Descargar CSV'
    a_buttons = response.xpath(
        '//table[@class="table-fill"]//a[contains(@title, "Descargar CSV")]')
    for btn in a_buttons:
        href = btn.xpath('@href').extract_first()
        # join url to download csv
        url = response.urljoin(href)
        # yield request to process
        yield Request(url=url, callback=self.decompress)
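# The decompress callback is not part of this snippet; a minimal sketch,
# assuming each download is a ZIP archive wrapping one or more CSV files
# (the archive format and item fields are assumptions):
import io
import zipfile

def decompress(self, response: Response):
    with zipfile.ZipFile(io.BytesIO(response.body)) as archive:
        for name in archive.namelist():
            if name.endswith('.csv'):
                yield {'file_name': name, 'content': archive.read(name)}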
def _parse_news_list(self, response: Response, depth=10):
    """
    Handle the raw HTML of a news list page.

    :param depth: maximum depth we should search for articles
    :param response: the top-level news response
    """
    log.debug("Parsing news list link: {}".format(response.url))
    for link in self._article_links(response):
        link = response.urljoin(link)
        yield scrapy.Request(url=link, callback=self._parse_article_link)
    # if a next link exists and the depth is not exceeded, visit it and yield
    # its results; we keep recursing until the maximum depth is reached
    next_page = response.css(
        self._config['next_page_selector']).extract_first()
    if next_page is not None and depth > 0:
        next_page = response.urljoin(next_page)
        yield scrapy.Request(
            url=next_page,
            callback=lambda list_response: self._parse_news_list(
                list_response, depth - 1))
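# The lambda callback above works, but lambdas cannot be serialized, so crawls
# resumed via JOBDIR persistence would fail on this request; an equivalent
# sketch of the final yield using cb_kwargs (supported since Scrapy 1.7):
if next_page is not None and depth > 0:
    next_page = response.urljoin(next_page)
    yield scrapy.Request(
        url=next_page,
        callback=self._parse_news_list,
        cb_kwargs={'depth': depth - 1},
    )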
def parse_index_page(self, response: Response):
    #
    # 1. Find all the individual opinions on this index page
    #    and request a parse for each.
    #
    opinion_paths = response.xpath(
        "//td[contains(@class, 'views-field-title')]/a/@href").getall()
    for url in [response.urljoin(p) for p in opinion_paths]:
        yield Request(url, callback=self.parse_opinion_page)
    #
    # 2. Go to the next index page, if there is one.
    #
    next_page_path = response.xpath(
        "//a[contains(@title, 'Go to next page')]/@href").get()
    if next_page_path is not None:
        yield Request(response.urljoin(next_page_path),
                      callback=self.parse_index_page)
def parse_detail(self, response: Response):
    XPATH_TITLE = "//div[@class='text']//h4[1]/text()"
    XPATH_COURSE = "//div[@class='childtitle']//p/text()"
    XPATH_VIDEO = "//video/@src"
    title = response.xpath(XPATH_TITLE).get()
    course = response.xpath(XPATH_COURSE).get()
    video_url = response.urljoin(response.xpath(XPATH_VIDEO).get())
    return XdvideoItem(title=title,
                       course=course,
                       file_urls=[video_url],
                       episode=response.meta["n"])
def handle_page(self, response: Response) -> Iterator[DownloadRequest]:
    torrents = response.css(
        'a[href^="attachment.php?aid="]:contains("torrent")::attr(href)'
    ).extract()
    if len(torrents) < 1:
        return
    for torrent in torrents:
        request = DownloadRequest(
            url=response.urljoin(torrent),  # relative URL to absolute
            callback=self.handle_item)
        request.meta['from_url'] = response.url
        yield request
def handle_list(self, response: Response) -> Iterator[DownloadRequest]:
    torrents = response.css('td a[href$=torrent]::attr(href)').extract()
    if len(torrents) < 1:
        return
    for torrent in torrents:
        request = DownloadRequest(
            url=response.urljoin(torrent),  # relative URL to absolute
            callback=self.handle_item,
            dont_filter=True)
        request.meta['from_url'] = response.url
        yield request
def handle_page(self, response: Response) -> Iterator[DownloadRequest]:
    torrents = response.css(
        'a[href^="/download.php?id="]::attr(href)').extract()
    if len(torrents) < 1:
        return
    for torrent in torrents:
        request = DownloadRequest(
            url=response.urljoin(torrent),  # relative URL to absolute
            callback=self.handle_item,
            dont_filter=True)
        request.meta['from_url'] = response.url
        yield request
def _get_ingredient(self, response: Response, ingredient: Any) -> Mapping:
    url = ingredient.css("a::attr(href)").get()
    if url is not None:
        url = response.urljoin(url)
    description = "".join(ingredient.css("::text").getall())
    return {
        "description": description,
        "url": url,
    }
def get_contact(self, response: Response) -> list:
    """
    Gets the contact information.

    :param response: the response object
    :return: a list of contacts
    """
    users = []
    for row in response.xpath("//div[@class='associate-item']/div"):
        user = create_user()
        user['ref'] = response.urljoin(row.xpath("a/@href").get())
        user['contact']['website'] = user['ref']
        user['logo'] = response.urljoin(row.xpath("a/img/@src").get())
        user['name'] = row.xpath("h4[@class='team-name']/a/text()").get()
        user['abs'] = row.xpath("strong[@class='team-position']/text()").get()
        user['exp']['exp']['title'] = user['abs']
        user['exp']['exp']['company'] = self.name
        # query relative to the row, consistent with the other fields above
        user['contact']['email'] = row.xpath(
            "ul/li[@class='bottom-item bottom-email']/a/@href").get()
        user['contact']['phone'] = row.xpath(
            "ul/li[@class='bottom-item bottom-phone']/a/text()").get()
        users.append(user)
    return users
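# create_user is assumed to return a nested default structure matching the
# fields written above; a minimal sketch (the layout is inferred from usage):
def create_user() -> dict:
    return {
        'name': '', 'ref': '', 'logo': '', 'abs': '',
        'contact': {'website': '', 'email': '', 'phone': ''},
        'exp': {'exp': {'title': '', 'company': ''}},
    }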
def parse_product(
    self, response: Response
) -> Union[Generator[FirmwareItem, None, None], Generator[Request, None, None]]:
    path = response.request.url.split('/')[:-1]
    if path[-1] == 'fritz.os':
        yield from self.parse_firmware(response=response, device_name=path[-3])
    else:
        for sub_directory in self.extract_links(response=response,
                                                ignore=('recover', '..')):
            yield Request(url=response.urljoin(sub_directory),
                          callback=self.parse_product)
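# The extract_links helper (with its ignore list) used above is not shown; a
# minimal sketch, assuming it walks an Apache-style directory listing and
# skips the ignored entries (the heuristic is an assumption):
def extract_links(self, response: Response, ignore: tuple = ()) -> List[str]:
    hrefs = response.xpath('//a/@href').extract()
    return [href for href in hrefs
            if not any(entry in href for entry in ignore)]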
def parse_nav_page(cls, response: Response, selector: dict):
    items = []
    for key, xpath in selector.items():
        data = []
        values = response.xpath(xpath).extract()
        for value in values:
            value = value.strip()
            if key == 'infor_url':
                value = response.urljoin(value)
            data.append({key: value})
        items.append(data)
    # zip the per-field columns back into one dict per row
    for item in zip(*items):
        merged = dict()
        for part in item:
            merged.update(part)
        yield merged
def parse(self, response: Response):
    item = SicrawlerItem()
    item['url'] = response.url
    item['body'] = response.css('p::text').getall()
    yield item
    links = response.css('a::attr(href)').getall()
    for a in links:
        if a == '/':
            continue
        yield scrapy.Request(response.urljoin(a), callback=self.parse)
def parse(self, response: Response):
    for index, row in enumerate(response.css("table tr")):
        if index < 2:  # the first two rows are headers, not data
            continue
        loader = ItemLoader(item=IPItem(), selector=row)
        loader.add_value('source', 'ip66')
        loader.add_value('protocol', 'http')
        loader.add_css('ip', 'td:nth-child(1)::text')
        loader.add_css('port', 'td:nth-child(2)::text')
        loader.add_css('remark', 'td:nth-child(3)::text')
        yield loader.load_item()
    if self.page < self.MAX_PAGE:
        self.page += 1
        next_page = response.css(
            '#PageList a:last-child::attr("href")').extract_first()
        if next_page:  # guard against a missing pagination link
            yield Request(url=response.urljoin(next_page))
def parse(self, response: Response):
    directory = urlparse(response.url).path.split("/")[-1].split(".")[0]
    requests = []
    for table in response.xpath("//table[not(.//table)]"):
        if not self.is_table(table):
            continue
        links = table.xpath(".//td/a/@href").extract()
        for link in links:
            url = response.urljoin(link)
            requests.append(Request(url, self.parse_chapter,
                                    meta={"directory": directory}))
    # keep the remaining chapter requests; parse_chapter pulls the next one
    # whenever a chapter turns out to be empty
    self._requests = iter(requests)
    for _ in range(self.start_index):  # skip already-processed chapters
        next(self._requests, None)
    yield next(self._requests, None)
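# is_table is not defined in these snippets; a minimal sketch, assuming a
# chapter table is recognized by containing at least one linked cell (the
# heuristic is an assumption):
def is_table(self, table: Selector) -> bool:
    return bool(table.xpath(".//td/a/@href"))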
def parse(self, response: Response) -> Iterator[Union[Case, Request]]:
    for case in response.css("div.announcement"):
        # if we have cases, this is not the last page; duplicate next-page
        # requests are collapsed by Scrapy's dupefilter
        offset = response.meta.get("offset", 0)
        yield self.request_offset(offset + 1)
        # get main data from the case
        title = case.css("h4 a")
        href = title.attrib["href"]
        name = title.css("::text").get()
        if not name:
            continue
        # parse textual data from the case
        contents = tuple(line.strip()
                         for line in case.css("::text").getall()
                         if line.strip())
        kwargs = {field: parser(contents) for field, parser in PARSERS.items()}
        # derive the age at occurrence when only the dates are available
        if (kwargs["age_at_occurrence"] is None
                and kwargs["dob"] is not None
                and kwargs["missing_since"] is not None):
            dob = datetime.strptime(str(kwargs["dob"]), "%Y-%m-%d")
            missing_since = datetime.strptime(str(kwargs["missing_since"]),
                                              "%Y-%m-%d")
            kwargs["age_at_occurrence"] = missing_since.year - dob.year
        last_seen_at = kwargs.pop("last_seen_at")
        last_seen_at = str(last_seen_at) if last_seen_at else ""
        kwargs.update(self.normalize_city_for(last_seen_at))
        yield Case(
            name=name,
            url=response.urljoin(href),
            full_text="\n".join(contents),
            **kwargs,
        )
def parse_chapter(self, response: Response):
    filename = urlparse(response.url).path.split("/")[-1].split(".")[0]
    el: Selector = next(
        iter(response.xpath("//a[img[@oncontextmenu='return false']]")), None)
    if el is None:
        # empty chapter: move on to the next queued chapter request
        yield next(self._requests, None)
        return
    imgurl = el.xpath("img/@src").extract_first()
    imgname = urlparse(imgurl).path.split("/")[-1]
    filename = filename + " " + imgname
    meta = response.meta.copy()
    meta["filename"] = filename
    yield Request(imgurl, callback=self.download_image, meta=meta)
    nexturl = response.urljoin(el.xpath("@href").extract_first())
    yield Request(nexturl, callback=self.parse_chapter, meta=meta,
                  dont_filter=True)
    self.logger.debug("%s %s %s", filename, imgurl, nexturl)
def parse(self, response: Response):
    for index, row in enumerate(response.css('table#ip_list tr')):
        if index == 0:  # the first row is the table header; skip it
            continue
        loader = ItemLoader(item=IPItem(), selector=row)
        loader.add_value('source', 'xicidaili')
        loader.add_css('ip', 'td:nth-child(2)::text')
        loader.add_css('port', 'td:nth-child(3)::text')
        loader.add_css('remark', 'td:nth-child(4) a::text')
        loader.add_css('protocol', 'td:nth-child(6)::text')
        item = loader.load_item()
        if not item.get('ip'):  # some rows occasionally carry no valid data
            continue
        yield item
    if self.page < self.MAX_PAGE:
        self.page += 1
        next_page = 'https://www.xicidaili.com/nt/{}'.format(self.page)
        # next_page is already absolute, so urljoin is a no-op here
        yield Request(url=response.urljoin(next_page), callback=self.parse)
def get_contact(self, response: Response) -> dict:
    """
    Gets the contact information.

    :param response: the response object
    :return: the contact information
    """
    user = create_user()
    user['name'] = response.xpath(
        "//div[@class='case-manager']/a/text()").get()
    user['ref'] = response.urljoin(
        response.xpath("//div[@class='case-manager']/a/@href").get())
    user['contact']['website'] = user['ref']
    user['contact']['email'] = response.xpath(
        "//div[@class='case-manager']/span/a/text()").get()
    if user['contact']['email'] is None:
        user['contact']['email'] = ''
    phone = extract_phone(
        response.xpath("string(//div[@class='case-manager'])").get())
    if len(phone) > 0:
        user['contact']['phone'] = phone[0]
    return user
def parse(self, response: Response):
    for row in response.css('table tr')[1:]:
        # the IP is obfuscated: rot13 first, then base64
        secret_ip = row.css('td:nth-child(1) script::text').re_first(
            r'rot13\(\"(.*?)\"')
        if secret_ip is None:
            continue
        decoded_by_rot13 = codecs.decode(secret_ip, 'rot13')
        ip = base64.b64decode(decoded_by_rot13).decode()
        loader = ItemLoader(item=IPItem(), selector=row)
        loader.add_value('source', 'cool-proxy')
        loader.add_value('ip', ip)
        loader.add_value('protocol', 'http')
        loader.add_css('port', "td:nth-child(2)::text")
        loader.add_css('remark', 'td:nth-child(4)::text')
        yield loader.load_item()
    next_page = response.css('span.next a::attr("href")').extract_first()
    if next_page:
        yield Request(response.urljoin(next_page), callback=self.parse)
def extract_links(response: Response) -> List[str]:
    return [
        response.urljoin(p)
        for p in response.xpath(AVMGPL.XPATH['links']).extract()
    ]
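# AVMGPL is an external constants holder; a minimal sketch of the shape this
# helper expects (the XPath expression itself is a placeholder):
class AVMGPL:
    XPATH = {'links': '//a/@href'}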