import logging

# Crawler and LinkConstraint are assumed to be provided by the surrounding
# pyoogle crawling package; their exact import paths are not shown in this excerpt.


def crawl_mathy():
    # Build a constraint that describes which outgoing WebNode links to follow
    constraint = LinkConstraint('http', 'www.math.kit.edu')

    # Prevent downloading links with these endings.
    # Frequent candidates: '.png', '.jpg', '.jpeg', '.pdf', '.ico', '.doc', '.txt', '.gz', '.zip', '.tar', '.ps',
    # '.docx', '.tex', '.gif', '.ppt', '.m', '.mw', '.mp3', '.wav', '.mp4'
    forbidden_endings = ['.pdf', '.png', '.ico', '#top']  # for fast exclusion
    constraint.add_rule(
        lambda link: not any(link.lower().endswith(ending) for ending in forbidden_endings))

    # Forbid any dot in the last path segment, as such a link most likely points
    # to a file we are not interested in
    def rule_no_point_in_last_path_segment(link_parsed):
        split = link_parsed.path.split("/")
        return len(split) == 0 or "." not in split[-1]

    constraint.add_rule(rule_no_point_in_last_path_segment, parsed_link=True)

    # Start the crawler from a start domain, optionally loading already existing nodes
    from pyoogle.config import DATABASE_PATH
    path = DATABASE_PATH
    c = Crawler(path, constraint)
    c.start("http://www.math.kit.edu", clear_store=False)

    # Wait for the crawler to finish
    c.join()
    webnet = c.web_net
    logging.info("DONE, webnet contains %d nodes", len(webnet))
    return path, webnet
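
# Hedged sketch (not part of the original module): a stand-alone illustration of
# how the parsed-link rule in crawl_mathy classifies URLs. It assumes that rules
# registered with parsed_link=True receive a urllib.parse.ParseResult, which is
# consistent with the .path attribute used above. The example URLs are made up.
def _demo_no_point_in_last_path_segment():
    from urllib.parse import urlparse

    examples = [
        "http://www.math.kit.edu/lehre/seite/vorlesungen",  # no dot in last segment -> follow
        "http://www.math.kit.edu/media/script.pdf",         # dot suggests a file -> skip
    ]
    for url in examples:
        last_segment = urlparse(url).path.split("/")[-1]
        verdict = "follow" if "." not in last_segment else "skip"
        print(url, "->", verdict)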
def crawl_spon():
    # Build the link constraint for Spiegel Online; the empty scheme presumably
    # allows any scheme (http and https)
    constraint = LinkConstraint('', 'www.spiegel.de')

    # Forbid any dot in the last path segment unless it is an .html/.htm page,
    # as other links most likely point to files we are not interested in
    def rule_no_point_in_last_path_segment(link_parsed):
        split = link_parsed.path.split("/")
        return (len(split) == 0
                or "." not in split[-1]
                or split[-1].lower().endswith((".html", ".htm")))

    constraint.add_rule(rule_no_point_in_last_path_segment, parsed_link=True)

    path = "/home/daniel/PycharmProjects/PageRank/spon.db"
    c = Crawler(path, constraint)
    c.start("http://www.spiegel.de", clear_store=False)

    # Wait for the crawler to finish
    c.join()
    webnet = c.web_net
    logging.info("DONE, webnet contains %d nodes", len(webnet))
    return path, webnet
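
# Hedged usage sketch: a minimal entry point that runs one of the crawls above.
# The logging setup and the choice of crawl_mathy are assumptions for
# illustration; both crawl functions return the database path and the
# resulting web net, as seen in their return statements.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    db_path, net = crawl_mathy()
    logging.info("Crawl finished, database at %s, %d nodes", db_path, len(net))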