def export_graph_data():
    # Dump the link graph as two tab-separated files: an edge list
    # (Source -> Target) and a node list (Id, Label).
    data_file = open(EXPORT_EDGES_FILE, "w")
    db = db_utils.getDBInstance()
    nodes = set([])
    data_file.write("Source\tTarget\n")
    pages = db.webpages.find()
    for page in pages:
        ilinks = page["incoming_links"]
        for link in ilinks:
            # Skip javascript: pseudo-links on either end of the edge.
            if link.startswith("javascript"):
                continue
            if page["url"].startswith("javascript"):
                continue
            # Strip commas so URLs do not break the exported file.
            link = link.replace(",", "")
            nodes.add(link)
            page["url"] = page["url"].replace(",", "")
            nodes.add(page["url"])
            data_file.write("%s\t%s\n" % (link, page["url"]))
    nodes_file = open(EXPORT_NODES_FILE, "w")
    nodes_file.write("Id\tLabel\n")
    for node in nodes:
        nodes_file.write("%s\t%s\n" % (node, utils.domainOf(node)))
    nodes_file.close()
    data_file.close()
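# Illustrative helper, not part of the original module: prints the first few rows
# of the edges file written by export_graph_data() so the tab-separated
# Source/Target output can be sanity-checked. Assumes the same EXPORT_EDGES_FILE
# constant used above and only the standard-library csv module.
def preview_exported_edges(limit=5):
    import csv
    with open(EXPORT_EDGES_FILE) as edges_file:
        reader = csv.reader(edges_file, delimiter="\t")
        for i, row in enumerate(reader):
            if i > limit:
                break
            # The first row is the "Source"/"Target" header; the rest are edge pairs.
            print row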
def regex_domain_match(db, url):
    # Return True if the URL matches any stored domain regex; otherwise derive
    # the base domain, persist it as a new pattern, and return False.
    visited_domain_patterns = db.domain_regex_patterns.find_one({'domain_regexes': 'regex'})
    regexes = []
    if visited_domain_patterns:
        regexes = visited_domain_patterns["regexes"]
    for regex in regexes:
        if re.search(regex, url, re.I | re.M):
            return True
    domain = utils.domainOf(url)
    if domain.startswith("www."):
        domain = domain.replace("www.", "")
    if domain.count('.') > 1:
        # Keep only the last two labels, e.g. "blog.example.com" -> "example.com".
        spli = domain.split(".")
        domain = spli[-2] + "." + spli[-1]
    regexes.append(domain)
    if not visited_domain_patterns:
        visited_domain_patterns = {'domain_regexes': "regex"}
    visited_domain_patterns["regexes"] = list(set(regexes))
    if "_id" in visited_domain_patterns:
        visited_domain_patterns.pop("_id")
    query = {'domain_regexes': 'regex'}
    db.domain_regex_patterns.find_and_modify(query=query, update=visited_domain_patterns,
                                             upsert=True, new=True)
    return False
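# Example for regex_domain_match() above (illustrative): for
# url = "http://blog.example.com/post/1", assuming utils.domainOf() returns
# "blog.example.com", the function reduces it to "example.com", stores that
# string as a new pattern, and returns False; a later call with any URL
# matching "example.com" then returns True.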
def prepareURLsForNextCrawl(urls):
    new_urls = []
    for url in urls:
        url = utils.sanitize_url(url)
        # For direct links to scripts or executables, crawl the domain instead.
        if url.endswith(".js") or url.endswith(".exe"):
            url = utils.domainOf(url)
            url = utils.sanitize_url(url)
        new_urls.append(url)
    # Drop whitelisted URLs and de-duplicate the rest.
    new_urls = [url for url in new_urls if not isWhiteListedURL(url)]
    return list(set(new_urls))
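# Example for prepareURLsForNextCrawl() above (illustrative):
# prepareURLsForNextCrawl(["http://example.com/setup.exe"]) collapses the direct
# file link to its sanitized domain, drops it if isWhiteListedURL() (defined
# elsewhere) returns True, and de-duplicates the result.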
def persistURLsForNextCrawlInDB(db, urls):
    urls = prepareURLsForNextCrawl(urls)
    crawl_queue = db.crawl_queue
    visited_urls = db.visited_urls
    for url in urls:
        domain = utils.domainOf(url)
        domain_hash = utils.get_url_hash(domain)
        url_hash = utils.get_url_hash(url)
        db_query = {domain_hash: domain}
        update = {url_hash: url}
        url_obj = visited_urls.find_one(db_query)
        if not url_obj:
            crawl_queue.insert(update)
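# Note on persistURLsForNextCrawlInDB() above: visited_urls is queried by the
# domain hash, so a URL is queued only when its whole domain has not been
# visited yet, not merely when that exact URL is unseen.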
def _domain(self):
    return utils.domainOf(self.url)
    if obj:
        print "Updating for: ", out_url
        query = {'url_hash': url_hash}
        obj["incoming_links"].append(url)
        obj["incoming_links"] = list(set(obj["incoming_links"]))
        if "_id" in obj:
            obj.pop("_id")
        new_obj = db.webpages.find_and_modify(query=query, update=obj,
                                              new=True, upsert=True)
    else:
        new_webpage = WebPage(out_url)
        new_webpage.incoming_links.add(url)
        db.webpages.insert(new_webpage.json())

    print "Marking as visited"
    domain = utils.domainOf(url)
    domain_hash = utils.get_url_hash(domain)
    db_query = {domain_hash: domain}
    update = {url_hash: url}
    vis = db.visited_urls.find_and_modify(query=db_query, update=db_query,
                                          upsert=True, new=True)
    print vis
    print "Updating crawl queue"
    persistURLsForNextCrawlInDB(db, all_urls)
    print "Updated"


if __name__ == '__main__':
    # crawl_url("http://sugarudyog.com/index.htm?adasd=asdas&sadgas=afs", headless=True,
    #           save_into_db=True)
    print