def parse(self, response: scrapy.http.Response):
    """Parse the tune listing page and schedule a download per tune.

    :param response: listing page; its URL query string is expected to
        carry a ``view`` parameter identifying the listing
        (presumably the listing id — confirm against the site).
    """
    u: SplitResult = urlsplit(response.url)
    q: dict = dict(queryparse(u.query))
    for tune in response.xpath(
            "//div[@id='result']/table/tr/th[@colspan='6']/../../tr[@class]"):
        artist = "".join(tune.xpath("./td[2]//text()").getall()).strip()
        title = "".join(tune.xpath("./td[1]//text()").getall()).strip()
        link = tune.xpath("./td[1]/a/@href").get()
        if not link:
            # Row without a download link: nothing to fetch. Previously
            # this crashed with AttributeError (None.strip()).
            continue
        link = link.strip()
        fileformat = "".join(
            tune.xpath("./td[3]//text()").getall()).strip().lower()
        # Download tune
        yield scrapy.Request(
            response.urljoin(link),
            callback=self.download_mod,
            meta={
                "tune": {
                    "id": q['view'],
                    "artist": artist,
                    "title": title,
                    "format": fileformat,
                }
            },
        )
def parse_bid(self, response: scrapy.http.Response):
    """Follow every document link (``doctype``/``docid``) on a bid page."""
    bid_query = dict(queryparse(urlsplit(response.url).query))
    for anchor in response.xpath("//a"):
        href = anchor.xpath("./@href").get()
        link_query = dict(queryparse(urlsplit(href).query))
        if not link_query:
            continue
        if 'doctype' in link_query or 'docid' in link_query:
            yield scrapy.Request(
                response.urljoin(href),
                meta={
                    "name": response.meta["name"],
                    # NOTE: the query key really does start with a space.
                    "id": bid_query[' bid'],
                },
                callback=self.dl_doc,
            )
def parse_search_result(self, response: scrapy.http.Response):
    """Scan a search-result page and follow every link carrying a bid id."""
    for anchor in response.xpath("//a"):
        href = anchor.xpath("./@href").get()
        link_query = dict(queryparse(urlsplit(href).query))
        if not link_query:
            continue
        # NOTE: the query key really does start with a space.
        if ' bid' not in link_query:
            continue
        yield scrapy.Request(
            response.urljoin(href),
            meta={
                "name": response.meta["name"],
                "dont_cache": True,
            },
            callback=self.parse_bid,
        )
def parse_motherboard(self, response: scrapy.http.Response):
    """Parse one JSON page of memory modules for a motherboard model.

    Yields one ``Memory`` item per module. From the first page only
    (``page=0``), schedules requests for all remaining result pages so
    pagination is fanned out exactly once.
    """
    query = dict(queryparse(urlsplit(response.url).query))
    current_page = int(query['page'])
    data = json.loads(response.body)
    for memmodule in data['results']:
        if 'url' in memmodule:
            # Make the module URL absolute.
            memmodule['url'] = response.urljoin(memmodule['url'])
        # Drop noise fields plus every null-valued key.
        # (The original also re-checked/deleted 'stock' a second time
        # after this loop — dead code, removed. It also used
        # enumerate() with an unused index.)
        remove_keys = ['stock', 'priceRange', 'availableForPickup']
        remove_keys.extend(k for k, v in memmodule.items() if v is None)
        for k in remove_keys:
            memmodule.pop(k, None)
        yield Memory({
            '_manufacturer': self.manufacturer,
            '_model': response.meta['model'],
            'memory': memmodule,
        })
    if current_page == 0 and data['pagination']['numberOfPages'] > 1:
        for pnum in range(1, data['pagination']['numberOfPages']):
            query['page'] = str(pnum)
            # Call the same page with increased page number
            yield scrapy.Request(
                response.urljoin("?" + urlencode(query)),
                callback=self.parse_motherboard,
                meta={
                    'model': response.meta['model'],
                },
            )
def _dl_url(self, url: str, cachetime: datetime.timedelta = None) -> dict:
    """Download JSON from the given URL, caching the result on disk.

    :param url: full request URL. ``app_id``/``app_key`` query
        parameters are stripped from the cache-file name so credentials
        don't leak into file names.
    :param cachetime: maximum acceptable cache age; defaults to
        ``self.defaultCacheTime``.
    :return: the decoded JSON document.
    :raises ValueError: on a non-200 response or a non-JSON content type.
    :raises HTTPError: propagated from urllib.
    """
    # Build a cache key from the URL path plus its sanitized query.
    parts = urlsplit(url)
    cachekey: str = parts.path.lstrip("/").replace("/", "-")
    qdict: dict = dict(queryparse(parts.query))
    # Strip API credentials from the cache key.
    qdict.pop('app_id', None)
    qdict.pop('app_key', None)
    cachekey += self._qstr(qdict)
    if cachetime is None:
        cachetime = self.defaultCacheTime
    cachefile = os.path.join(self.cacheDir, cachekey + ".json")
    if os.path.isfile(cachefile):
        # Cache file exists: serve it if it is still fresh.
        now = datetime.datetime.now()
        fmodtime = datetime.datetime.fromtimestamp(
            os.path.getmtime(cachefile))
        # BUG FIX: age is (now - fmodtime). The original compared
        # (fmodtime - now), which is negative for any past mtime and
        # therefore always <= cachetime — the cache never expired.
        if (now - fmodtime) <= cachetime:
            self.log.debug(f"Getting <URL: {url} > from cache")
            with open(cachefile, "r", encoding="utf8") as f:
                return json.loads(f.read())
    self.log.debug(f"Getting <URL: {url} >")
    # Be polite to the remote server.
    sleep(0.2)
    # (The original wrapped this in `except HTTPError as e: raise e`,
    # which is a no-op re-raise — removed; HTTPError still propagates.)
    with urllib.request.urlopen(url) as response:
        if response.code != 200:
            raise ValueError("url couldn't be loaded")
        if response.headers.get_content_type() != "application/json":
            raise ValueError("invalid content type")
        resp: dict = json.loads(response.read())
    if os.path.isfile(cachefile):
        # Destroy stale cache
        os.unlink(cachefile)
    # Save atomically: write a temp file, then rename into place.
    tmpf = NamedTemporaryFile("w", prefix="yle-areena-cli-",
                              suffix=".json", encoding="utf8",
                              delete=False)
    with tmpf as f:
        json.dump(resp, f)
        f.flush()
    newpath = move(tmpf.name, cachefile)
    self.log.debug(f"Renamed {tmpf.name} to {newpath}")
    return resp
def parse(self, response: scrapy.http.Response):
    """Parse one page of the uploaded-file listing.

    Schedules the next listing page when one exists, then yields a
    download request (with Tune metadata) for every file row.

    :param response: listing page; its URL query string carries ``page``.
    """
    split_url: SplitResult = urlsplit(response.url)
    params: dict = dict(queryparse(split_url.query))
    # Highest page number advertised by the pager form inputs.
    page_inputs = response.xpath(
        "/html/body/table/tr/td[@class='button']/form/input[@name='page']/@value"
    ).getall()
    last_page = max(int(v) for v in page_inputs)
    page_now = int(params['page'])
    if page_now < last_page:
        # Queue the next listing page.
        params['page'] = str(page_now + 1)
        yield scrapy.Request(
            response.urljoin("?" + urlencode(params)),
            callback=self.parse,
        )
    # Iterate through uploaded files.
    rows = response.xpath(
        "/html/body/table[@width='100%']/tr[@class='newfiles']")
    for row in rows:
        link = row.xpath("./td[4]/a/@href").get()
        song = row.xpath("./td[4]/a/text()").get()
        if song is None:
            # No link text: fall back to the file name from the URL path.
            song = os.path.basename(urlsplit(link).path)
        sequencer = row.xpath("./td[5]/text()").get()
        if sequencer is None:
            sequencer = "Unknown"
        game = row.xpath("./td[3]/text()").get()
        if game is None:
            game = "Unknown"
        uploaded = datetime.strptime(
            row.xpath("./td[1]/text()").get(), "%Y-%m-%d %H:%M:%S")
        system = row.xpath("./td[2]/text()").get()
        yield scrapy.Request(
            response.urljoin(link),
            callback=self.dl_midi,
            meta={
                "tune": Tune(
                    artist=sequencer,
                    title=song,
                    system=system,
                    game=game,
                    uploadtime=uploaded,
                    data=None,
                ),
            },
        )