def crawl(in_file, html_dir, status_dir, agent):
    urls = set()
    url_objects = []
    with open(in_file) as lines:
        for line in lines:
            values = line.strip("\n").split("\t")
            url_object = {"url_meta": {}}
            if len(values) == 4:
                url_object["url_meta"]["topic"] = values[0]
                url_object["url_meta"]["site"] = values[1]
                url = URLUtility.normalize(values[2])
                url_object["url"] = url
                url_object["url_meta"]["subtopic"] = values[3]
            else:
                url = URLUtility.normalize(values[0])
                url_object["url"] = url
            if url not in urls:
                urls.add(url)
                url_objects.append(url_object)

    jobs = []
    for i in range(Config.PROCESS_NUMBER):
        p = Process(target=crawlprocess, args=(url_objects, i, html_dir, status_dir, agent))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
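# Usage sketch for crawl (not part of the original module). Each line of in_file is either a
# single url, or four tab-separated fields (topic, site, url, subtopic), as parsed above.
# The file names, directories and agent key below are hypothetical.
#
# Example in_file contents:
#   topic1<TAB>example.com<TAB>http://example.com/<TAB>subtopic1
#   http://example.org/
#
# crawl("seeds.tsv", "data/html", "data/status", "desktop")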
def extract_links_bs(url, html):
    '''
    Extract links from html source using beautiful soup
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: extracted links
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html)
    links = set()
    for tag in soup.findAll('a', href=True):
        link = tag['href']
        try:
            link = urlparse.urljoin(url, link)
        except:
            continue
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
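# Usage sketch (hypothetical url and html). Relative hrefs are resolved against the page url;
# the exact output strings depend on URLUtility.validate_link and URLUtility.normalize.
#
# html = '<a href="/about">About</a> <a href="http://other.org/x">x</a>'
# links = extract_links_bs("http://example.com/index.html", html)
# # roughly: ["http://example.com/about", "http://other.org/x"], order not guaranteed (set-based)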
def expand(indir, output_file):
    files = os.listdir(indir)
    uniq_links = set()  # many seed urls come from the same site, so there exist duplicate outlinks from seed urls
    out = open(output_file, "w")
    for f in files:
        if f.split(".")[-1] != "json":  # make sure this is a json file
            continue
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                try:
                    data = json.loads(line)
                    url = data['url']
                    url = URLUtility.normalize(url)
                    html_content = data['html']
                    #links = HTMLParser.extract_links(url, html_content)
                    links = HTMLParser.extract_links_bs(url, html_content)
                    for link in links:
                        if URLUtility.is_same_site(url, link):
                            if link not in uniq_links:
                                uniq_links.add(link)
                                out.write(link.encode('utf-8') + "\n")
                    if url not in links:
                        out.write(url.encode('utf-8') + "\n")
                except:
                    traceback.print_exc()
                    continue
    out.close()
def extract_external_links(self, url, html):
    '''
    Extract external outlinks, that link to different websites
    Returns:
        - list of unique urls
    '''
    try:
        soup = BeautifulSoup(html, 'lxml')
        links = set()
        tld = URLUtility.get_tld(url)
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            values = urlparse.urlparse(link)
            if (values.netloc == "") or (values.netloc == tld) or (tld in values.netloc):
                continue
            link = URLUtility.validate_link(link)
            if link:
                link = URLUtility.normalize(link)
                if link:
                    links.add(link)
        return list(links)
    except:
        traceback.print_exc()
        return []
def extract_insite_links(self, url, html):
    '''
    Returns:
        - list of insite urls that are different from the input url
    '''
    try:
        soup = BeautifulSoup(html, 'html.parser')
        #soup = BeautifulSoup(html, 'lxml')  # Couldn't parse http://www.gunsinternational.com/
        links = set()
        tld = URLUtility.get_tld(url)
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            try:
                link = urlparse.urljoin(url, link)
            except:
                traceback.print_exc()
                continue
            values = urlparse.urlparse(link)
            if tld in values.netloc:
                link = URLUtility.validate_link(link)
                if link:
                    link = URLUtility.normalize(link)
                    if link and link != url:
                        links.add(link)
        return list(links)
    except:
        print "Parsing with BeautifulSoup failed"
        return []
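# Usage sketch (hypothetical instance and page): extract_insite_links keeps only links whose
# netloc contains the page's top-level domain, while extract_external_links above keeps the
# rest, so the two roughly partition a page's outlinks.
#
# parser = HTMLParser()   # hypothetical instantiation of the class holding these methods
# insite = parser.extract_insite_links(url, html)
# external = parser.extract_external_links(url, html)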
def get_text(self, text_type):
    """
    Return extracted text from the html. Extract the text if necessary.
    NOTE: this function's flow can be confusing because it not only performs extraction
    but also caches the extracted text in different scenarios.

    Parameters:
    -----------
    text_type: string
    """
    if not self.html:
        return ''
    if text_type == 'body':
        if not self.body:
            self.body = Text_Extractor.extract_body(self.html)
            self.body = URLUtility.clean_text(self.body)
        return self.body
    elif text_type == 'meta':
        if not self.meta:
            self.meta = Text_Extractor.extract_body(self.html)
            self.meta = URLUtility.clean_text(self.meta)
        return self.meta
    elif text_type == 'title':
        if not self.title:
            self.title = Text_Extractor.extract_body(self.html)
            self.title = URLUtility.clean_text(self.title)
        return self.title
    else:
        print "Wrong text_type"
        return ''
def extract_links_bs(self, url, html):
    '''
    Extract all outlinks from html using beautiful soup. Return list of links
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: list of outlinks
    '''
    try:
        soup = BeautifulSoup(html, 'lxml')
    except:
        print "Parsing with beautiful soup failed"
        return []
    links = set()
    for tag in soup.findAll('a', href=True):
        link = tag['href']
        try:
            link = urlparse.urljoin(url, link)
        except:
            continue
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
def evaluate_recall(result_file, test_file):
    """ Count how many hosts from the test file were found in the result file """
    test_host = set()
    with open(test_file) as lines:
        for line in lines:
            url = line.strip()
            #url = url_normalize(url)
            host = URLUtility.get_tld(url)
            test_host.add(host)

    found_host = set()
    with open(result_file) as lines:
        for line in lines:
            values = line.strip().split()
            url = values[1]
            #url = url_normalize(url)
            host = URLUtility.get_tld(url)
            found_host.add(host)

    found = 0
    for host in found_host:
        if host in test_host:
            found += 1
            print host, found
    print found, len(test_host)
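# Input shapes assumed by evaluate_recall, inferred from the parsing above (the sample lines
# are hypothetical): test_file has one url per line, result_file has the url in its second
# whitespace-separated column; recall is counted over URLUtility.get_tld hosts.
#
# test_file:    http://example.com/some/page
# result_file:  <score-or-id> http://example.com/other/page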
def _read_ac_result_file(result_file, max_pages):
    """ Load all sites from the result file of ACHE """
    count = 0
    sites = set()
    with open(result_file) as lines:
        for line in lines:
            count += 1
            url = line.split()[0]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
def search_backlinks(self, url, limit=5):
    """
    Return a list of urls
    Args:
        limit: maximum number of results to return
    """
    urls = []
    try:
        results = self.client.links(url, scope="page_to_page", sort="page_authority", filters=["external"], limit=limit)
        #results = self.client.links(url, scope="page_to_page", sort="spam_score", filters=["external"], limit=limit)
        #results = self.client.links(url, scope="page_to_page", sort="page_authority")
        for res in results:
            if 'uu' in res:
                url = URLUtility.normalize(res['uu'])
                if url:
                    urls.append(url)
            else:
                print "Error: key does not exist"
                print res
    except:
        traceback.print_exc()
    return urls
def search(self, keyword, k):
    """
    Search for a keyword and return top matched urls
    Reference: https://developers.google.com/custom-search/json-api/v1/reference/cse/list
    Args:
        k: Number of search results to return.
    """
    k = min(k, self.max_results)
    urls = []
    index = 1
    while index <= k:
        try:
            res = self.service.cse().list(q=keyword, cx=self.cse_id, num=10, start=index).execute()  # maximum 10 results for each query
            if 'items' in res:
                res = res['items']
                for item in res:
                    url = URLUtility.normalize(item['link'])
                    if url:
                        urls.append(url)
                if len(res) < 10:  # Early stop paging
                    break
            else:
                print res
                break  # No more results, stop paging
        except:
            traceback.print_exc()
            break
        index += 10
    return urls
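# Usage sketch (hypothetical instance and keyword): paging walks the Custom Search results
# 10 at a time until k results, an empty page, or a short page is reached.
#
# searcher = GoogleSearch()                    # hypothetical name of the class defining search()
# urls = searcher.search("example keyword", 20)   # returns at most min(20, searcher.max_results) urls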
def test():
    fetcher = Fetcher("test/fetcher_test_data")
    urls = URLUtility.load_urls("test/data/urls.txt")
    sites = fetcher.fetch(urls)
    for site in sites:
        for page in site:
            print page.get_text('body')[:100].replace("\n", "")
def _read_ac_result_file(result_file):
    urls = []
    with open(result_file) as lines:
        for line in lines:
            url = line.split()[0]
            url = URLUtility.normalize(url)
            urls.append(url)
    return urls
def _read_sf_result_file(result_file, max_pages):
    """ Load all sites from the result file of SEEDFINDER """
    sites = set()
    count = 0
    with open(result_file) as lines:
        for line in lines:
            count += 1
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
def _read_sf_result_file(result_file):
    urls = []
    with open(result_file) as lines:
        for line in lines:
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            urls.append(url)
    return urls
def run(infile, outdir):
    urls = set([])
    with open(infile) as lines:
        for line in lines:
            url = line.strip().split("\t")[0]
            url = URLUtility.normalize(url)
            urls.add(url)
    urls = list(urls)
    Download.download(urls, outdir)
def load_blacklist(f):
    bl = set()
    with open(f) as lines:
        for line in lines:
            line = line.strip().lower()
            host = URLUtility.get_host(line)
            print host
            bl.add(host)
    return bl
def search_site(url_file, out_file, keyword):
    """
    Write results as json line objects into out_file
    Format of each json object:
        list<str>: list of urls. The first url is the main site
    """
    urls = URLUtility.load_urls(url_file)
    site2urls = read_json(out_file)
    k = 10
    out = open(out_file, "a+")
    for i, url in enumerate(urls):
        site = URLUtility.get_host(url)
        if site not in site2urls:
            results = bing_search.search_site(keyword, url, k)
            results = [site, url] + results
            json.dump(results, out)
            out.write("\n")
    out.close()
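# Shape of each json line appended by search_site, as constructed above: a flat list whose
# first element is the host, second the seed url, followed by the search results. The
# concrete urls below are hypothetical; only the [site, url, result...] layout is implied.
#
# ["example.com", "http://example.com/", "http://example.com/forum", "http://other.org/page"]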
def extract_links(url, html):
    '''
    Extract links from html source using regular expression
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: extracted (normalized and validated) links
    '''
    match = HTMLParser.LINK_PATTERN.findall(html)
    links = set([])
    for link in match:
        link = urlparse.urljoin(url, link)
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
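# Usage sketch (hypothetical url; requests call for illustration only): the two extractors are
# interchangeable from the caller's side, both returning absolute, normalized links. The regex
# variant relies on HTMLParser.LINK_PATTERN, which is defined elsewhere in the class, and is
# typically faster, while extract_links_bs tolerates messier markup.
#
# import requests
# html = requests.get("http://example.com/", timeout=5).text
# links_regex = HTMLParser.extract_links("http://example.com/", html)
# links_bs = HTMLParser.extract_links_bs("http://example.com/", html)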
def count(infile):
    sites = set([])
    with open(infile) as lines:
        for line in lines:
            obj = json.loads(line)
            url = obj['url']
            site = URLUtility.get_host(url)
            sites.add(site)
    for site in sites:
        print site
    print len(sites)
def _read_relev_file(clf_file):
    """
    Load all relevant sites (label 1) from the classification file
    Note that all classification files from different discovery tools have the same format
    """
    sites = set()
    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ''.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)
                if label != -1 and label != 1:
                    print "Parsed label is incorrect"
                if label == 1:
                    sites.add(site)
            except:
                traceback.print_exc()
    return sites
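# Expected shape of a classification-file line, inferred from the parsing above (the sample
# lines are hypothetical): everything before the final comma is the url, the last field is an
# integer label, 1 for relevant and -1 for irrelevant.
#
# http://example.com/page,1
# http://example.org/listing,-1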
def update_seeds(self, seed_urls):
    '''Update seed urls in the current seed list. Fetch the seed urls'''
    new_seed_urls = []
    for url in seed_urls:
        host = URLUtility.get_tld(url)
        if host not in self.host:
            self.host.add(host)
            new_seed_urls.append(url)
    urls, text = self.fetcher.fetch_urls(new_seed_urls)
    self.similarity.update_seeds(urls, text)
    self.K = max(len(self.similarity.seed_pages.keys())/2, 10)
def __init__(self, url):
    #url = url_normalize.url_normalize(url)
    self.host = URLUtility.get_host(url)
    self.pages = []
    self.word_set = set()
    self.vsm = None
    self.jaccard = None  # Don't use this if seeds are updated
    self.cosine = None   # Don't use this if seeds are updated
    self.clf_vsm = None
    self.bstf_vsm = None
    self.bsbin_vsm = None
    self.cs_vsm = None
def crawlprocess(url_objects, start, html_dir, status_dir, agent):
    status_file = open(status_dir + "/status_temp_" + str(start) + ".json", "w")
    content_file = None
    if Config.DATA_FORMAT == "ONE_FILE":
        content_file = open(html_dir + "/html_" + str(start) + ".json", "a+")
        save_content = save_content_one_file(content_file)
    elif Config.DATA_FORMAT == "MULTI_FILE":
        save_content = save_content_multi_file(html_dir)

    for i in range(start, len(url_objects), Config.PROCESS_NUMBER):
        url_obj = url_objects[i]
        url = url_obj["url"]
        try:
            if Config.USE_TOR:
                res = requests.get(url, headers=Config.HEADERS[agent], proxies=TOR_PROXY, verify=False, timeout=5)
            else:
                res = requests.get(url, headers=Config.HEADERS[agent], verify=False, timeout=5)
            if Config.SAVE_HTML:
                save_content(url, res)
            save_response(url, URLUtility.encode(url), str(res.status_code), None, res.headers, agent, url_obj, status_file)
        except requests.ConnectionError:
            # In the event of a network problem (e.g. DNS failure, refused connection, etc.)
            save_response(url, URLUtility.encode(url), None, "ConnectionError", None, agent, url_obj, status_file)
        except requests.HTTPError:
            # In the rare event of an invalid HTTP response
            save_response(url, URLUtility.encode(url), None, "HTTPError", None, agent, url_obj, status_file)
        except requests.Timeout:
            save_response(url, URLUtility.encode(url), None, "Timeout", None, agent, url_obj, status_file)
        except requests.TooManyRedirects:
            save_response(url, URLUtility.encode(url), None, "TooManyRedirects", None, agent, url_obj, status_file)
        except Exception:
            save_response(url, URLUtility.encode(url), None, "OtherExceptions", None, agent, url_obj, status_file)

    status_file.close()
    if content_file:
        content_file.close()
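# Minimal sketch of the Config attributes that crawl/crawlprocess rely on. The attribute names
# come from the code above; the concrete values (and the "desktop" agent key) are assumptions
# for illustration. TOR_PROXY is a module-level constant defined elsewhere.
#
# class Config:
#     PROCESS_NUMBER = 8                   # number of crawl processes
#     DATA_FORMAT = "ONE_FILE"             # or "MULTI_FILE"
#     USE_TOR = False                      # route requests through TOR_PROXY if True
#     SAVE_HTML = True                     # persist fetched html via save_content
#     HEADERS = {"desktop": {"User-Agent": "Mozilla/5.0"}}   # keyed by the `agent` argument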
def deduplicate(outfile, indirs):
    writer = open(outfile, 'w')
    cached_urls = set()
    for indir in indirs:
        for fname in os.listdir(indir):
            print "Reading", indir + '/' + fname
            for line in open(indir + '/' + fname):
                data = json.loads(line)
                url = URLUtility.normalize(data['url'])
                if url in cached_urls:
                    continue
                cached_urls.add(url)
                writer.write(line)
    writer.close()
def export_host(indir, outfile):
    urls = ExportURL.load_urls(indir)
    uniq_hosts = set([])
    out = open(outfile, "w")
    for url in urls:
        try:
            host = URLUtility.get_host(url)
            if host not in uniq_hosts:
                uniq_hosts.add(host)
                out.write(host.encode('utf-8') + "\n")
        except:
            traceback.print_exc()
    out.close()
def _read_urls_from_json(url_file):
    urls = set()
    with open(url_file) as lines:
        for line in lines:
            try:
                jsonobj = json.loads(line)
                for url in jsonobj[1:]:
                    url = URLUtility.normalize(url)
                    if url:
                        urls.add(url)
            except:
                traceback.print_exc()
    print "Number of urls read from json file: ", len(urls)
    return list(urls)
def __init__(self, seed_file, result_file, data_dir):
    """
    Args:
        seed_file: contains list of seed urls
        data_dir: stores crawled data
        result_file: stores urls and their scores
    """
    self.train_urls = URLUtility.load_urls(seed_file)
    # Note: Fetcher contains Bing Search but does not use it (just for website ranking evaluation)
    self.fetcher = Fetcher(data_dir, None, False)
    self.result_file = result_file
    self.ranked_result_file = result_file + ".rank"
    self.searcher = Search_APIs(data_dir, self.fetcher)
def run_filter(infile, outfile):
    blacklists = load_blacklist("blacklist.txt")
    out = open(outfile, "w")
    counter = {}
    with open(infile) as lines:
        for line in lines:
            url = line.strip()
            if is_filter(url, blacklists, counter):
                continue
            else:
                host = URLUtility.get_host(url)
                out.write(line)
    out.close()
def _read_clf_file(clf_file):
    url2label = {}
    site2label = {}
    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ''.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)
                if label > 0:
                    url2label[url] = True
                    site2label[site] = True
                else:
                    url2label[url] = False
                    if site not in site2label:
                        site2label[site] = False
            except:
                print line
    return url2label, site2label
def expand(indir, output_file):
    files = os.listdir(indir)
    uniq_links = set()  # many seed urls come from the same site, so there exist duplicate outlinks from seed urls
    out = open(output_file, "w")
    for f in files:
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                data = json.loads(line)
                url = data['url']
                url = URLUtility.normalize(url)
                html_content = data['text']
                #links = HTMLParser.extract_links(url, html_content)
                links = HTMLParser.extract_links_bs(url, html_content)
                for link in links:
                    if URLUtility.is_same_site(url, link):
                        if link not in uniq_links:
                            uniq_links.add(link)
                            out.write(link.encode('utf-8') + "\n")
                if url not in links:
                    out.write(url.encode('utf-8') + "\n")
    out.close()
def custom_recrawl(status_file, supplement_status_file, html_dir, status_dir, agent):
    # Read the status_file, then recrawl the urls that caused exceptions
    downloaded_urls = set()
    with open(status_file) as lines:
        for line in lines:
            try:
                json_data = json.loads(line)
                if "exception" in json_data:
                    continue
                downloaded_urls.add(json_data['url'])
            except:
                print "recrawl exception"
                traceback.print_exc()
                continue

    url_objects = []
    with open(supplement_status_file) as lines:
        for line in lines:
            try:
                values = line.strip("\n").split("\t")
                url = URLUtility.normalize(values[2])
                if url not in downloaded_urls:
                    url_object = {"url_meta": {}}
                    url_object["url_meta"]["topic"] = values[0]
                    url_object["url_meta"]["site"] = values[1]
                    url_object["url_meta"]["subtopic"] = values[3]
                    url_object["url"] = url
                    url_objects.append(url_object)
            except:
                print "custom recrawl exception"
                traceback.print_exc()
                continue

    print "Number of urls to download: " + str(len(url_objects))
    jobs = []
    for i in range(Config.PROCESS_NUMBER):
        p = Process(target=crawlprocess, args=(url_objects, i, html_dir, status_dir, agent))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
def save_content(url, res):
    # Note: html_dir is not a parameter; it comes from the enclosing scope
    # (this is the saver used when Config.DATA_FORMAT == "MULTI_FILE", see crawlprocess)
    html_filename = html_dir + "/" + URLUtility.encode(url) + ".html"
    html_file = open(html_filename, "w")
    text = res.text.encode('utf-8')
    html_file.write(text)
    html_file.close()