Example #1
    def handle_page(self, response: Response) -> Iterator[DownloadRequest]:
        torrents = response.css(
            'a[href^="forum.php?mod=attachment"]:contains("torrent")::attr(href)'
        ).extract()
        page_links = response.css(
            'a[href^="imc_attachad-ad.html"]:contains("torrent")::attr(href)'
        ).extract()
        if len(torrents) < 1 and len(page_links) < 1:
            return

        for torrent in torrents:
            request = DownloadRequest(
                url=response.urljoin(torrent),  # relative url to absolute
                callback=self.handle_item)
            request.meta['from_url'] = response.url
            yield request

        regex = re.compile(r'aid=(\w+)')
        for page_link in page_links:
            match = regex.search(page_link)
            if not match:
                continue
            aid = match.group(1)
            request = DownloadRequest(
                url=response.urljoin(
                    'forum.php?mod=attachment&aid=%s' % aid),  # relative url to absolute
                callback=self.handle_item,
                dont_filter=True)
            request.meta['from_url'] = response.url
            yield request
Example #2
 def parse(self, response: Response, **kwargs):
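     # follow each car detail link, then queue the next results page if one exists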
     a_tags = response.xpath(
         "//div[@class='search-lists-container']//div[@class='car-name-left']/h4/a"
     )
     for a in a_tags:
         url = a.xpath('./@href').get()
         yield Request(url=response.urljoin(url),
                       callback=self.parse_detail)
     next_page_link = response.xpath(
         "(//li[@class='pagination-li pag-next']/a/@href)[1]").get()
     if next_page_link is not None:
         yield Request(url=response.urljoin(next_page_link),
                       callback=self.parse)
Example #3
 def getScript(self, response: Response):
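     # while the counter is below maxNum, follow the first /scripts/ link if it is an html page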
     if self.num < self.maxNum:
         scriptLink = response.xpath(
             "//a[starts-with(@href, '/scripts/')]/@href").extract_first()
         if scriptLink and scriptLink.endswith("html"):
             yield Request(response.urljoin(scriptLink),
                           callback=self.readScript)
Example #4
 def parse(self, response: Response) -> Iterable[Union[Request, Mapping]]:
     yield from self.follow_pages(response)
     for ingredient in response.css("a.promo__ingredient"):
         yield {
             "name": ingredient.css("h3::text").get(),
             "url": response.urljoin(ingredient.attrib["href"]),
         }
Example #5
 def parse_product(self, response: Response,
                   device_name: str) -> Generator[Request, None, None]:
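     # if the product page links to a software section, follow it to parse available versions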
     software_page = response.xpath(self.x_path['software_exists']).get()
     if software_page:
         yield Request(url=response.urljoin(software_page),
                       callback=self.parse_versions,
                       cb_kwargs=dict(device_name=device_name))
Example #6
    def parse(self, response: Response) -> Iterator[Union[Case, Request]]:
        for case in response.css("div.announcement"):

            # if we have cases, this is not the last page
            offset = response.meta.get("offset", 0)
            yield self.request_offset(offset + 1)

            # get main data from the case
            title = case.css("h4 a")
            href = title.attrib["href"]
            name = title.css("::text").get()
            if not name:
                continue

            # parse textual data from the case
            contents = tuple(
                line.strip() for line in case.css("::text").getall() if line.strip()
            )

            kwargs = {name: parser(contents) for name, parser in PARSERS.items()}
            yield Case(
                name=name,
                url=response.urljoin(href),
                full_text="\n".join(contents),
                **kwargs,
            )
Example #7
 def parse(self, response: Response):
     links = response.xpath('//main//a[@class="article"]/@href').extract()
     for link in links:
         absolute_url = response.urljoin(link)
         yield scrapy.Request(absolute_url,
                              callback=self.parse_concert,
                              dont_filter=True)
Example #8
 def parse(self, response: Response):
     self.num = 0
     for nextPage in response.xpath(
             "//a[@title and starts-with(@href, '/Movie Scripts/')]/@href"
     ).extract():
         if self.num < self.maxNum:
             yield Request(response.urljoin(nextPage),
                           callback=self.getScript)
Example #9
 def parse(self, response: Response) -> Generator[Request, None, None]:
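     # pair each product URL with its device name and request the product detail page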
     for product_url, device_name in zip(
             response.xpath(self.x_path['product_urls']).extract(),
             response.xpath(self.x_path['device_names']).extract()):
         yield Request(url=response.urljoin(product_url),
                       callback=self.parse_product,
                       cb_kwargs=dict(device_name=device_name))
Example #10
    def parse(self, response: Response) -> Generator[Request, None, None]:
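        # follow each declassified collection link, passing its absolute URL to the _document callback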
        links_declassified = response.xpath(
            '//a[starts-with(@href,"collection") and (parent::h3 | parent::h2)]/@href'
        ).getall()

        for link in links_declassified:
            yield response.follow(link,
                                  callback=self._document,
                                  cb_kwargs={'url': response.urljoin(link)})
Example #11
 def get_csv_files(self, response: Response):
     # get all <a> items with 'Descargar CSV'
     a_buttons = response.xpath('//table[@class="table-fill"]//a[contains(@title, "Descargar CSV")]')
     for btn in a_buttons:
         href = btn.xpath('@href').extract_first()
         # join url to download csv
         url = response.urljoin(href)
         # yield request to process
         yield Request(url=url, callback=self.decompress)
Example #12
    def _parse_news_list(self, response: Response, depth=10):
        """
        handle the raw html
        :param depth: maximum depth we should search for articles
        :param response: the top level news response
        """
        log.debug("Parsing news list link: {}".format(response.url))
        for link in self._article_links(response):
            link = response.urljoin(link)
            yield scrapy.Request(url=link, callback=self._parse_article_link)
        # if next link exists and depth not exceeded, visit next link and yield results.
        next_page = response.css(
            self._config['next_page_selector']).extract_first()

        # we keep iterating through until our maximum depth is reached.
        if next_page is not None and depth > 0:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(
                url=next_page,
                callback=lambda list_response: self._parse_news_list(
                    list_response, depth - 1))
Example #13
    def parse_index_page(self, response: Response):
        #
        # 1. Find all the individual opinions on this index page
        # and request a parse for each.
        #
        opinion_paths = response.xpath(
            "//td[contains(@class, 'views-field-title')]/a/@href").getall()

        for url in [response.urljoin(p) for p in opinion_paths]:
            yield Request(url, callback=self.parse_opinion_page)

        #
        # 2. Go to the next index page, if there is one.
        #
        next_page_path = response.xpath(
            "//a[contains(@title, 'Go to next page')]/@href").get()

        if next_page_path is not None:
            yield Request(response.urljoin(next_page_path),
                          callback=self.parse_index_page)
Example #14
 def parse_detail(self, response: Response):
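     # build an XdvideoItem from the page title, course name, absolute video URL and the episode number in meta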
     XPATH_TITLE = "//div[@class='text']//h4[1]/text()"
     XPATH_COURSE = "//div[@class='childtitle']//p/text()"
     XPATH_VIDEO = "//video/@src"
     title = response.xpath(XPATH_TITLE).get()
     course = response.xpath(XPATH_COURSE).get()
     video_url = response.urljoin(response.xpath(XPATH_VIDEO).get())
     return XdvideoItem(title=title,
                        course=course,
                        file_urls=[video_url],
                        episode=response.meta["n"])
Example #15
 def handle_page(self, response: Response) -> Iterator[DownloadRequest]:
     torrents = response.css('a[href^="attachment.php?aid="]:contains("torrent")::attr(href)').extract()
     if len(torrents) < 1:
         return
     for torrent in torrents:
         request = DownloadRequest(
             url=response.urljoin(torrent),  # relative url to absolute
             callback=self.handle_item
         )
         request.meta['from_url'] = response.url
         yield request
Example #16
 def handle_list(self, response: Response) -> Iterator[DownloadRequest]:
     torrents = response.css('td a[href$=torrent]::attr(href)').extract()
     if len(torrents) < 1:
         return
     for torrent in torrents:
         request = DownloadRequest(
             url=response.urljoin(torrent),  # relative url to absolute
             callback=self.handle_item,
             dont_filter=True)
         request.meta['from_url'] = response.url
         yield request
Example #17
 def handle_page(self, response: Response) -> Iterator[DownloadRequest]:
     torrents = response.css(
         'a[href^="/download.php?id="]::attr(href)').extract()
     if len(torrents) < 1:
         return
     for torrent in torrents:
         request = DownloadRequest(
             url=response.urljoin(torrent),  # relative url to absolute
             callback=self.handle_item,
             dont_filter=True)
         request.meta['from_url'] = response.url
         yield request
Example #18
    def _get_ingredient(self, response: Response, ingredient: Any) -> Mapping:
        url = ingredient.css("a::attr(href)").get()
        if url is not None:
            url = response.urljoin(url)
        description = "".join(ingredient.css("::text").getall())

        result = {
            "description": description,
            "url": url,
        }

        return result
Example #19
    def get_contact(self, response: Response) -> list:
        """
        Gets the contact information.

        :param response: the response object
        :return: a list of contacts
        """
        users = []
        for row in response.xpath("//div[@class='associate-item']/div"):
            user = create_user()
            user['ref'] = response.urljoin(row.xpath("a/@href").get())
            user['contact']['website'] = user['ref']
            user['logo'] = response.urljoin(row.xpath("a/img/@src").get())
            user['name'] = row.xpath("h4[@class='team-name']/a/text()").get()
            user['abs'] = row.xpath("strong[@class='team-position']/text()").get()
            user['exp']['exp']['title'] = user['abs']
            user['exp']['exp']['company'] = self.name
            user['contact']['email'] = row.xpath("ul/li[@class='bottom-item bottom-email']/a/@href").get()
            user['contact']['phone'] = row.xpath("ul/li[@class='bottom-item bottom-phone']/a/text()").get()
            users.append(user)
        return users
Example #20
File: avm.py  Project: mmg1/FirmwareScraper
 def parse_product(
     self, response: Response
 ) -> Union[Generator[FirmwareItem, None, None], Generator[Request, None,
                                                           None]]:
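     # listings under a 'fritz.os' directory contain firmware files; any other listing is recursed into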
     path = response.request.url.split('/')[:-1]
     if path[-1] == 'fritz.os':
         yield from self.parse_firmware(response=response,
                                        device_name=path[-3])
     else:
         for sub_directory in self.extract_links(response=response,
                                                 ignore=('recover', '..')):
             yield Request(url=response.urljoin(sub_directory),
                           callback=self.parse_product)
Example #21
 def parse_nav_page(cls, response: Response, selector: dict):
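     # extract each selector field, make 'infor_url' values absolute, then zip the columns into one dict per row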
     items = list()
     for k, v in selector.items():
         data = []
         values = response.xpath(v).extract()
         for value in values:
             if k == 'infor_url':
                 value = response.urljoin(value.strip())
             data.append({k: value.strip()})
         items.append(data)
     for item in zip(*items):
         temp_d = dict()
         for i in item:
             temp_d.update(i)
         yield temp_d
Example #22
    def parse(self, response: Response):
        # urlfile = open('urls', 'w', encoding='utf-8')

        item = SicrawlerItem()
        item['url'] = response.url
        item['body'] = response.css('p::text').getall()
        # item['body'] = response.body
        yield item

        links = response.css('a::attr(href)').getall()
        # print(f'The length of links is: {len(links)}')
        # print(response.urljoin(links[0]))
        for a in links:
            if a == '/': continue
            urljoined = response.urljoin(a)
            # urlfile.write(urljoined+'\n')
            yield scrapy.Request(urljoined, callback=self.parse)
Example #23
    def parse(self, response: Response):
        for index, row in enumerate(response.css("table tr")):
            if index < 2:
                # the first two rows are not valid data
                continue
            loader = ItemLoader(item=IPItem(), selector=row)
            loader.add_value('source', 'ip66')
            loader.add_value('protocol', 'http')
            loader.add_css('ip', 'td:nth-child(1)::text')
            loader.add_css('port', 'td:nth-child(2)::text')
            loader.add_css('remark', 'td:nth-child(3)::text')
            yield loader.load_item()

        if self.page < self.MAX_PAGE:
            self.page += 1
            next_page = response.css('#PageList a:last-child::attr("href")').extract_first()
            yield Request(
                url=response.urljoin(next_page)
            )
Example #24
    def parse(self, response: Response):
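        # gather chapter links from every innermost table, then start yielding requests from start_index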
        # scrapy.shell.inspect_response(response, self)
        directory = urlparse(response.url).path.split("/")[-1].split(".")[0]
        requests = []
        for table in response.xpath("//table[not(.//table)]"):
            if not self.is_table(table):
                continue

            links = table.xpath(".//td/a/@href").extract()
            for link in links:
                url = response.urljoin(link)
                requests.append(Request(url, self.parse_chapter, meta={"directory": directory}))

        self._requests = iter(requests)

        for i in range(self.start_index):
            next(self._requests)

        yield next(self._requests, None)
Example #25
    def parse(self, response: Response) -> Iterator[Union[Case, Request]]:
        for case in response.css("div.announcement"):

            # if we have cases, this is not the last page
            offset = response.meta.get("offset", 0)
            yield self.request_offset(offset + 1)

            # get main data from the case
            title = case.css("h4 a")
            href = title.attrib["href"]
            name = title.css("::text").get()
            if not name:
                continue

            # parse textual data from the case
            contents = tuple(line.strip()
                             for line in case.css("::text").getall()
                             if line.strip())

            kwargs = {
                name: parser(contents)
                for name, parser in PARSERS.items()
            }

            if (kwargs["age_at_occurrence"] is None
                    and kwargs["dob"] is not None
                    and kwargs["missing_since"] is not None):
                dob = datetime.strptime(str(kwargs["dob"]), "%Y-%m-%d")
                missing_since = datetime.strptime(str(kwargs["missing_since"]),
                                                  "%Y-%m-%d")
                kwargs["age_at_occurrence"] = missing_since.year - dob.year

            last_seen_at = kwargs.pop("last_seen_at")
            last_seen_at = str(last_seen_at) if last_seen_at else ""
            kwargs.update(self.normalize_city_for(last_seen_at))

            yield Case(
                name=name,
                url=response.urljoin(href),
                full_text="\n".join(contents),
                **kwargs,
            )
Example #26
    def parse_chapter(self, response: Response):
        # scrapy.shell.inspect_response(response, self)
        filename = urlparse(response.url).path.split("/")[-1].split(".")[0]
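        # each chapter page shows one image wrapped in an <a> whose href points at the next page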
        el: Selector = next(iter(response.xpath("//a[img[@oncontextmenu='return false']]")), None)
        if el is None:
            yield next(self._requests, None)
            return

        imgurl = el.xpath("img/@src").extract_first()
        imgname = urlparse(imgurl).path.split("/")[-1]
        filename = filename + " " + imgname

        meta = response.meta.copy()
        meta["filename"] = filename

        yield Request(imgurl, callback=self.download_image, meta=meta)

        nexturl = response.urljoin(el.xpath("@href").extract_first())
        yield Request(nexturl, callback=self.parse_chapter, meta=meta, dont_filter=True)
        print("{} {} {}".format(filename, imgurl, nexturl))
Example #27
    def parse(self, response: Response):
        for index, row in enumerate(response.css('table#ip_list tr')):
            if index == 0:
                # the first row is the table header; skip it
                continue
            loader = ItemLoader(item=IPItem(), selector=row)
            loader.add_value('source', 'xicidaili')
            loader.add_css('ip', 'td:nth-child(2)::text')
            loader.add_css('port', 'td:nth-child(3)::text')
            loader.add_css('remark', 'td:nth-child(4) a::text')
            loader.add_css('protocol', 'td:nth-child(6)::text')

            item = loader.load_item()
            if not item.get('ip'):
                # some rows occasionally contain invalid data
                continue
            yield item

        if self.page < self.MAX_PAGE:
            self.page += 1
            next_page = 'https://www.xicidaili.com/nt/{}'.format(self.page)
            yield Request(url=response.urljoin(next_page), callback=self.parse)
Example #28
    def get_contact(self, response: Response) -> dict:
        """
        Gets the contact information.

        :param response: the response object
        :return: the contact information
        """
        user = create_user()
        user['name'] = response.xpath(
            "//div[@class='case-manager']/a/text()").get()
        user['ref'] = response.urljoin(
            response.xpath("//div[@class='case-manager']/a/@href").get())
        user['contact']['website'] = user['ref']
        user['contact']['email'] = response.xpath(
            "//div[@class='case-manager']/span/a/text()").get()
        if user['contact']['email'] is None:
            user['contact']['email'] = ''
        phone = extract_phone(
            response.xpath("string(//div[@class='case-manager'])").get())
        if len(phone) > 0:
            user['contact']['phone'] = phone[0]
        return user
Example #29
    def parse(self, response: Response):
        for row in response.css('table tr')[1:]:
            # decrypt the IP (rot13, then base64 decode)
            secret_ip = row.css('td:nth-child(1) script::text').re_first(
                r'rot13\(\"(.*?)\"')
            if secret_ip is None:
                continue
            decoded_by_rot13 = codecs.decode(secret_ip, 'rot13')
            ip = base64.b64decode(decoded_by_rot13).decode()

            loader = ItemLoader(item=IPItem(), selector=row)
            loader.add_value('source', 'cool-proxy')
            loader.add_value('ip', ip)
            loader.add_value('protocol', 'http')
            loader.add_css('port', "td:nth-child(2)::text")
            loader.add_css('remark', 'td:nth-child(4)::text')

            yield loader.load_item()

        next_page = response.css('span.next a::attr("href")').extract_first()
        if next_page:
            yield Request(response.urljoin(next_page), callback=self.parse)
Example #30
 def extract_links(response: Response) -> List[str]:
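     # resolve every link matched by the AVMGPL 'links' XPath against the response URL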
     return [
         response.urljoin(p)
         for p in response.xpath(AVMGPL.XPATH['links']).extract()
     ]