def test(url, debug=True):
    """Fetch a single URL through SiteArchiver and print what came back.

    Builds a throwaway ``db.WebPages`` row for *url* (with the site root
    derived from the URL's scheme and netloc), runs it through
    ``SiteArchiver.taskProcess``, and prints HTML- or RSS-specific details
    depending on which keys the returned mapping contains.

    :param url: URL to fetch and process.
    :param debug: when True, print the job row, the archiver, and the
        response's keys along the way.
    """
    parts = urllib.parse.urlparse(url)
    site_root = urllib.parse.urlunparse((parts[0], parts[1], "", "", "", ""))

    job = db.WebPages(
        url       = url,
        starturl  = site_root,
        netloc    = parts.netloc,
        distance  = 50000,
        is_text   = True,
        priority  = 500000,
        type      = 'unknown',
        fetchtime = datetime.datetime.now(),
    )
    if debug:
        print(job)

    archiver = SiteArchiver(None)
    ret = archiver.taskProcess(job_test=job)

    if debug:
        print(archiver)
        print(ret.keys())

    # Both link lists present -> looks like a HTML page; print its info.
    if "plainLinks" in ret and "rsrcLinks" in ret:
        print_html_response(archiver, job, ret)

    # Feed content present -> print the RSS-specific info.
    if "rss-content" in ret:
        print_rss_response(archiver, job, ret)
def test(url, debug=True, rss_debug=False):
    """Run a single URL through SiteArchiver without printing the response.

    Same job-construction flow as the other ``test`` helper, plus an
    optional RSS debug switch that sets ``flags.RSS_DEBUG``.

    NOTE(review): this redefines ``test()`` declared earlier in the file,
    shadowing it at import time — confirm which definition is intended.

    :param url: URL to fetch and process.
    :param debug: when True, print the constructed job row.
    :param rss_debug: when True, announce and enable RSS debugging via
        ``flags.RSS_DEBUG``.
    """
    if rss_debug:
        print("Debugging RSS")
        flags.RSS_DEBUG = True

    parts = urllib.parse.urlparse(url)
    site_root = urllib.parse.urlunparse((parts[0], parts[1], "", "", "", ""))

    job = db.WebPages(
        url       = url,
        starturl  = site_root,
        netloc    = parts.netloc,
        distance  = 50000,
        is_text   = True,
        priority  = 500000,
        type      = 'unknown',
        fetchtime = datetime.datetime.now(),
    )
    if debug:
        print(job)

    SiteArchiver(None).taskProcess(job_test=job)