def handle_url(url, session, res): """Parse one search result page.""" print("Parsing", url, file=sys.stderr) try: data = get_page(url, session).text except IOError as msg: print("ERROR:", msg, file=sys.stderr) return for match in url_matcher.finditer(data): comicurl = match.group(2) name = format_name(match.group(3)) if name in exclude_comics: continue if contains_case_insensitive(res, name): # we cannot handle two comics that only differ in case print("INFO: skipping possible duplicate", repr(name), file=sys.stderr) continue try: if "/d/" not in comicurl: check_robotstxt(comicurl + "d/", session) else: check_robotstxt(comicurl, session) except IOError: print("INFO: robots.txt denied for keenspot", repr(name)) continue res[name] = comicurl
def handle_url(url, session, res): """Parse one search result page.""" print("Parsing", url, file=sys.stderr) try: data = get_page(url, session).text except IOError as msg: print("ERROR:", msg, file=sys.stderr) return for match in url_matcher.finditer(data): url = match.group(1) + '/' name = format_name(match.group(2)) if name in exclude_comics: continue if contains_case_insensitive(res, name): # we cannot handle two comics that only differ in case print("INFO: skipping possible duplicate", repr(name), file=sys.stderr) continue # find out how many images this comic has end = match.end() mo = num_matcher.search(data[end:]) if not mo: print("ERROR:", repr(data[end:end + 300]), file=sys.stderr) continue num = int(mo.group(1)) url = url_overrides.get(name, url) try: if "/d/" not in url: check_robotstxt(url + "d/", session) else: check_robotstxt(url, session) except IOError: print("INFO: robots.txt denied for comicgenesis", repr(name)) continue else: res[name] = (url, num)
def get_url(self, url, expand=True):
    """Get an HTML page and parse it with LXML."""
    print("Parsing", url, file=sys.stderr)
    try:
        data = html.document_fromstring(get_page(url, self.session).text)
        if expand:
            data.make_links_absolute(url)
        return data
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        raise

def get_url(self, url, expand=True, robot=True):
    """Get an HTML page and parse it with LXML."""
    print("Parsing", url, file=sys.stderr)
    try:
        pagetext = get_page(url, self.session, robot).text
        data = lxml.html.document_fromstring(pagetext)
        if expand:
            data.make_links_absolute(url)
        if self.sleep > 0:
            # be nice to the server: optionally pause between requests
            time.sleep(self.sleep)
        return data
    except IOError as msg:
        print("ERROR:", msg, file=sys.stderr)
        raise
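
# Usage sketch, not from the original module: a scraper method built on
# get_url might collect comic links like this. The URL and the XPath
# expression are hypothetical placeholders; robots.txt is honored because
# robot defaults to True.
def collect_results(self):
    page = self.get_url("https://example.com/comiclist/")
    for link in page.xpath("//a[@class='comic']"):
        print(format_name(link.text), link.get("href"))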