Example #1
File: site.py Project: raspi/scrapy-AMP
    def parse(self, response: scrapy.http.Response):
        """
        Get list of tunes
        """

        u: SplitResult = urlsplit(response.url)
        q: dict = dict(queryparse(u.query))

        for tune in response.xpath(
                "//div[@id='result']/table/tr/th[@colspan='6']/../../tr[@class]"
        ):
            artist = "".join(tune.xpath("./td[2]//text()").getall()).strip()
            title = "".join(tune.xpath("./td[1]//text()").getall()).strip()
            link = tune.xpath("./td[1]/a/@href").get()
            if link is None:
                # Skip rows without a download link
                continue
            link = link.strip()
            fileformat = "".join(
                tune.xpath("./td[3]//text()").getall()).strip().lower()

            # Download tune
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.download_mod,
                meta={
                    "tune": {
                        "id": q['view'],
                        "artist": artist,
                        "title": title,
                        "format": fileformat,
                    }
                },
            )
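
These snippets use urlsplit and a queryparse helper without showing their imports. A plausible reconstruction, assuming queryparse aliases urllib.parse.parse_qsl (consistent with the parsed values being used as plain strings, e.g. q['view']):

from urllib.parse import SplitResult, urlencode, urlsplit
from urllib.parse import parse_qsl as queryparse  # assumed alias
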
Example #2
    def parse_bid(self, response: scrapy.http.Response):
        rq = dict(queryparse(urlsplit(response.url).query))

        for link in response.xpath("//a"):
            href = link.xpath("./@href").get()
            if href is None:
                continue
            q = dict(queryparse(urlsplit(href).query))
            if not q:
                continue

            if ('doctype' in q) or ('docid' in q):
                yield scrapy.Request(
                    response.urljoin(href),
                    meta={
                        "name": response.meta["name"],
                        "id": rq[' bid'],
                    },
                    callback=self.dl_doc,
                )
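
The dl_doc callback is not shown. A minimal sketch of what it could look like, assuming the spider simply writes each document to disk; the file-naming scheme below is hypothetical:

    def dl_doc(self, response: scrapy.http.Response):
        """
        Hypothetical sketch: persist a downloaded document.
        """
        # Hypothetical file name built from the meta fields carried along
        fname = f"{response.meta['name']}-{response.meta['id']}.bin"
        with open(fname, "wb") as f:
            f.write(response.body)
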
Example #3
    def parse_search_result(self, response: scrapy.http.Response):
        for link in response.xpath("//a"):
            href = link.xpath("./@href").get()
            if href is None:
                continue
            q = dict(queryparse(urlsplit(href).query))
            if not q:
                continue

            if ' bid' in q:  # the parameter key literally starts with a space
                yield scrapy.Request(
                    response.urljoin(href),
                    meta={
                        "name": response.meta["name"],
                        "dont_cache": True,
                    },
                    callback=self.parse_bid,
                )
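
parse_search_result expects name to already be present in response.meta, so an earlier request has to seed it. A sketch of such an entry point; self.names and self.search_url are hypothetical attributes (dont_cache is a standard Scrapy HttpCacheMiddleware meta key):

    def start_requests(self):
        # self.names and self.search_url are hypothetical attributes
        for name in self.names:
            yield scrapy.Request(
                self.search_url + "?" + urlencode({"q": name}),
                meta={"name": name, "dont_cache": True},
                callback=self.parse_search_result,
            )
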
Example #4
    def parse_motherboard(self, response: scrapy.http.Response):
        query = dict(queryparse(urlsplit(response.url).query))
        current_page = int(query['page'])

        data = json.loads(response.body)
        for memmodule in data['results']:

            if 'url' in memmodule:
                memmodule['url'] = response.urljoin(memmodule['url'])

            # Keys that are dropped from the output unconditionally
            remove_keys = [
                'stock',
                'priceRange',
                'availableForPickup',
            ]

            # Also drop keys whose values are None
            for k, v in memmodule.items():
                if v is None:
                    remove_keys.append(k)

            for k in remove_keys:
                if k in memmodule:
                    del memmodule[k]

            yield Memory({
                '_manufacturer': self.manufacturer,
                '_model': response.meta['model'],
                'memory': memmodule,
            })

        if current_page == 0 and data['pagination']['numberOfPages'] > 1:
            for pnum in range(1, data['pagination']['numberOfPages']):
                query['page'] = str(pnum)

                # Call the same page with increased page number
                yield scrapy.Request(
                    response.urljoin("?" + urlencode(query)),
                    callback=self.parse_motherboard,
                    meta={
                        'model': response.meta['model'],
                    },
                )
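
Memory is constructed from a plain dict, which matches a scrapy.Item subclass. A minimal sketch of the declaration this snippet implies; the field set is inferred, not confirmed:

import scrapy

class Memory(scrapy.Item):
    # Fields inferred from the dict passed to Memory() above
    _manufacturer = scrapy.Field()
    _model = scrapy.Field()
    memory = scrapy.Field()
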
Example #5
    def _dl_url(self, url: str,
                cachetime: Optional[datetime.timedelta] = None) -> dict:
        """
        Download JSON from given URL and cache the result
        """

        # Reduce the URL to its path and query string
        urlS = urlsplit(url)
        urltmp: str = urlS.path.lstrip("/").replace("/", "-")
        urlQ: dict = dict(queryparse(urlS.query))
        if 'app_id' in urlQ:
            del urlQ['app_id']
        if 'app_key' in urlQ:
            del urlQ['app_key']

        urltmp += self._qstr(urlQ)

        if cachetime is None:
            cachetime = self.defaultCacheTime

        cachefile = os.path.join(self.cacheDir, urltmp + ".json")

        if os.path.isfile(cachefile):
            # Cache file exists
            now = datetime.datetime.now()
            fmodtime = datetime.datetime.fromtimestamp(
                os.path.getmtime(cachefile))

            if (now - fmodtime) <= cachetime:
                # Cache is not expired yet. Read from cache
                self.log.debug(f"Getting <URL: {url} > from cache")
                with open(cachefile, "r", encoding="utf8") as f:
                    return json.loads(f.read())

        data: dict = {}

        self.log.debug(f"Getting <URL: {url} >")
        try:
            # Get from HTTP
            sleep(0.2)
            with urllib.request.urlopen(url) as response:
                if response.code != 200:
                    raise ValueError("url couldn't be loaded")
                if response.headers.get_content_type() != "application/json":
                    raise ValueError("invalid content type")

                resp: dict = json.loads(response.read())

                if os.path.isfile(cachefile):
                    # Destroy stale cache
                    os.unlink(cachefile)

                # Save to temporary file
                tmpf = NamedTemporaryFile("w",
                                          prefix="yle-areena-cli-",
                                          suffix=".json",
                                          encoding="utf8",
                                          delete=False)
                with tmpf as f:
                    json.dump(resp, f)
                    f.flush()

                # Rename file
                newpath = move(tmpf.name, cachefile)
                self.log.debug(f"Renamed {tmpf.name} to {newpath}")

                data = resp
        except HTTPError:
            # Propagate HTTP errors unchanged
            raise

        return data
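
The _qstr helper is not shown; it only needs to turn the remaining query parameters into a filesystem-safe suffix for the cache file name. A hypothetical sketch (sorting keeps the name stable across calls):

    def _qstr(self, q: dict) -> str:
        """
        Hypothetical helper: serialize query parameters into a
        filesystem-safe cache-key suffix.
        """
        if not q:
            return ""
        return "-" + "-".join(f"{k}={v}" for k, v in sorted(q.items()))
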
Example #6
    def parse(self, response: scrapy.http.Response):
        """
        Get file list
        :param response:
        :return:
        """

        u: SplitResult = urlsplit(response.url)
        q: dict = dict(queryparse(u.query))

        # Page-number inputs in the pager; the largest value is the last page
        max_page = max(
            int(v) for v in response.xpath(
                "/html/body/table/tr/td[@class='button']/form/input[@name='page']/@value"
            ).getall())
        current_page = int(q['page'])

        if current_page < max_page:
            # Call next page
            q['page'] = str(current_page + 1)
            yield scrapy.Request(
                response.urljoin("?" + urlencode(q)),
                callback=self.parse,
            )

        for row in response.xpath(
                "/html/body/table[@width='100%']/tr[@class='newfiles']"):
            # Iterate through uploaded files
            link = row.xpath("./td[4]/a/@href").get()
            if link is None:
                # Skip rows without a download link
                continue

            song = row.xpath("./td[4]/a/text()").get()
            if song is None:
                # Fall back to the file name from the link
                song = os.path.basename(urlsplit(link).path)

            sequencer = row.xpath("./td[5]/text()").get()
            if sequencer is None:
                sequencer = "Unknown"

            game = row.xpath("./td[3]/text()").get()
            if game is None:
                game = "Unknown"

            uploadtime = datetime.strptime(
                row.xpath("./td[1]/text()").get(), "%Y-%m-%d %H:%M:%S")
            system = row.xpath("./td[2]/text()").get()

            yield scrapy.Request(
                response.urljoin(link),
                callback=self.dl_midi,
                meta={
                    "tune":
                    Tune(
                        artist=sequencer,
                        title=song,
                        system=system,
                        game=game,
                        uploadtime=uploadtime,
                        data=None,
                    ),
                },
            )
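
The Tune container is not shown. Its keyword arguments suggest a declaration along these lines (a sketch; the real class may differ):

from dataclasses import dataclass
from datetime import datetime
from typing import Optional

@dataclass
class Tune:
    # Fields inferred from the keyword arguments used above
    artist: str
    title: str
    system: str
    game: str
    uploadtime: datetime
    data: Optional[bytes]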