def parse(self, resp):
    """Yield a Request for the 'Next' pagination link, then a MangaItem
    per anchor in the first column of the listing table."""
    sel = Selector(resp)
    base_url = get_base_url(resp)
    # Follow pagination recursively before emitting items from this page.
    for anchor in sel.css("ul.pager a"):
        label, href = extract_link(anchor)
        if label.endswith("Next"):
            yield Request(urljoin_rfc(base_url, href), self.parse)
    for manga in sel.xpath("//table[@class='listing']/tr/td[1]/a"):
        entry = MangaItem()
        entry['name'], entry['link'] = extract_link(manga)
        yield entry
def parse(self, resp):
    """Yield one MangaItem per tooltip anchor on the page."""
    sel = Selector(resp)
    for anchor in sel.css("a.tooltip_manga"):
        entry = MangaItem()
        entry['name'], entry['link'] = extract_link(anchor)
        yield entry
def crawler():
    """Download lyrics for every artist page in config.FULL_URLS.

    For each album page, scrape the song links, then fetch and save each
    lyric to a per-writer folder under config.LYRICS_FOLDER, skipping songs
    already on disk. Sleeps a randomized interval between downloads to avoid
    being rate-limited. Stops entirely on IndexError, which the site's
    access-denied page manifests as during parsing.
    """
    counter = 1
    for url_ref in config.FULL_URLS:
        resp = requests.get(url_ref)
        if resp.status_code != 200:
            LOG.warning(f"Unable to load: {url_ref}")
            continue
        _, name = get_name(url_ref)
        # Ensure the artist's folder exists (fixed typo: folter -> folder).
        folder_path = create_folder([config.LYRICS_FOLDER, name])
        # Collect every song link on the artist page.
        parsed_html = BeautifulSoup(resp.content, features='html.parser')
        lyrics_links = parsed_html.select('.listalbum-item a')
        LOG.info(f"Number of {name.upper()} songs: {len(lyrics_links)}")
        lyric_paths = [extract_link(link) for link in lyrics_links]
        for lyric_path in lyric_paths:
            try:
                writer, song_name = get_name(lyric_path)
                # Songs credited to a different writer go in that
                # writer's folder instead of the current artist's.
                if name != writer:
                    alt_folder = create_folder(
                        [config.LYRICS_FOLDER, writer])
                    lyrics_file = alt_folder.joinpath(song_name + '.txt')
                else:
                    lyrics_file = folder_path.joinpath(song_name + '.txt')
                if not lyrics_file.is_file():
                    text = get_lyrics(lyric_path).strip()
                    LOG.info("Downloading (" + str(counter).zfill(3) +
                             f") [{writer}]: {song_name}")
                    counter += 1
                    with open(lyrics_file, "w") as f:
                        f.write(text)
                    # Randomized delay between requests to stay polite.
                    time.sleep(config.CRAWLER_WAIT +
                               config.CRAWLER_WAIT * random.random())
            except IndexError:
                LOG.error(
                    f"Access denied while scraping: {lyric_path} \n"
                    f"Try increasing the waiting time.\n"
                    f"Finishing the scrapping for the moment. Try to access on your browser to unblock access"
                )
                return
            except Exception as err:
                # Log through LOG for consistency with the other branches
                # (was a bare print to stdout).
                LOG.error(f"ERROR: {lyric_path}: {err}")
def parse(self, resp):
    """Yield a MangaChapterItem per data row of the #listing table.

    Rows without <td> cells are skipped. Extraction of the anchor and the
    date text is guarded with IndexError (as the sibling parsers do) so
    header or malformed rows no longer abort the whole parse.
    """
    hxs = Selector(resp)
    for row in hxs.xpath("//table[@id='listing']//tr"):
        cells = row.xpath("td")
        if not cells:
            continue
        item = MangaChapterItem()
        try:
            item["name"], item["link"] = extract_link(cells[0].xpath("a"))
            # Last cell holds the chapter date; extract()[0] raises
            # IndexError when the cell has no text node.
            item["date"] = self.parsedate(
                cells[-1].xpath("text()").extract()[0])
        except IndexError:
            continue
        yield item
def parse(self, resp):
    """Yield a MangaChapterItem per chapter-list entry; entries missing
    an anchor or a date span are skipped."""
    sel = Selector(resp)
    for entry in sel.css("ul.chapterlistfull > li"):
        chapter = MangaChapterItem()
        try:
            chapter["name"], chapter["link"] = extract_link(
                entry.xpath("a")[0])
            raw_date = entry.css("span.date::text").extract()[0]
            chapter["date"] = self.parsedate(raw_date)
        except IndexError:
            # Entry lacks an anchor or date text — not a chapter row.
            continue
        yield chapter
def parse(self, resp):
    """Yield a MangaChapterItem per detail-list entry, taking the link
    from the first span and the date from the last."""
    sel = Selector(resp)
    for entry in sel.css("div.detail_list > ul > li"):
        spans = entry.xpath("span")
        if not spans:
            continue
        record = MangaChapterItem()
        try:
            record['name'], record['link'] = extract_link(
                spans[0].xpath("a"))
            record['date'] = self.parsedate(
                spans[-1].xpath('text()').extract()[0])
        except IndexError:
            # No anchor or no date text — skip this entry.
            continue
        yield record
def run(D):
    """Check every link found in the files listed in L, marking each file
    as visited in D and skipping lines inside ``` code fences.

    NOTE(review): L, d, ut and check are module-level names defined
    elsewhere in this file.
    """
    for fn in L:
        D[fn] = True
        with open(d + fn, 'r') as fh:
            data = fh.read()
        # Bug fix: fence state must reset per file. Previously `active`
        # was set once before the loop, so an unterminated ``` block in
        # one file suppressed link checking in every later file.
        active = True
        for line in data.strip().split('\n'):
            # Toggle on fence delimiters; don't read quoted blocks.
            if line.strip() == '```':
                active = not active
            if not active:
                continue
            link = ut.extract_link(line)
            if link:
                check(fn, link, D)
def parse(self, resp):
    """Yield a MangaChapterItem per data row of the listing table,
    skipping rows without cells or with unparsable anchor/date content."""
    sel = Selector(resp)
    for row in sel.xpath("//table[@class='listing']//tr"):
        cells = row.xpath("td")
        if not cells:
            continue
        chapter = MangaChapterItem()
        try:
            chapter['name'], chapter['link'] = extract_link(
                cells.xpath("a")[0])
            raw_date = cells.xpath("text()")[-1].extract()
            chapter["date"] = self.parsedate(raw_date)
        except (IndexError, ValueError):
            # Missing anchor/date node, or a date parsedate rejects.
            continue
        yield chapter
def parse(self, resp):
    """Yield one MangaItem per series anchor in the alphabetical list."""
    sel = Selector(resp)
    for anchor in sel.css("ul.series_alpha > li > a"):
        entry = MangaItem()
        entry["name"], entry["link"] = extract_link(anchor)
        yield entry