def crawl(in_file, html_dir, status_dir, agent):
    # Read seed urls (optionally tab-separated with topic, site and subtopic metadata),
    # de-duplicate them, then crawl them with multiple processes.
    urls = set()
    url_objects = []
    with open(in_file) as lines:
        for line in lines:
            values = line.strip("\n").split("\t")
            url_object = {"url_meta": {}}
            if len(values) == 4:
                url_object["url_meta"]["topic"] = values[0]
                url_object["url_meta"]["site"] = values[1]
                url = URLUtility.normalize(values[2])
                url_object["url"] = url
                url_object["url_meta"]["subtopic"] = values[3]
            else:
                url = URLUtility.normalize(values[0])
                url_object["url"] = url
            if url not in urls:
                urls.add(url)
                url_objects.append(url_object)
    jobs = []
    for i in range(Config.PROCESS_NUMBER):
        p = Process(target=crawlprocess, args=(url_objects, i, html_dir, status_dir, agent))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
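# The crawl() and custom_recrawl() functions hand the actual fetching off to a
# crawlprocess() worker that is not included in this excerpt. The sketch below is a
# hypothetical stand-in, not the project's implementation: it assumes each process takes
# every Config.PROCESS_NUMBER-th url_object and writes one JSON line per fetched page
# plus one JSON status line (with an "exception" key on failure, matching what
# custom_recrawl() checks for).
def crawlprocess(url_objects, pid, html_dir, status_dir, agent):
    import json
    import urllib2
    html_out = open("%s/html_%d.json" % (html_dir, pid), "w")
    status_out = open("%s/status_%d.json" % (status_dir, pid), "w")
    for i in range(pid, len(url_objects), Config.PROCESS_NUMBER):
        url_object = url_objects[i]
        url = url_object["url"]
        status = {"url": url, "url_meta": url_object["url_meta"]}
        try:
            request = urllib2.Request(url, headers={"User-Agent": agent})
            html = urllib2.urlopen(request, timeout=10).read()
            page = {"url": url, "html": html.decode("utf-8", "ignore")}
            html_out.write(json.dumps(page) + "\n")
        except Exception as e:
            status["exception"] = str(e)
        status_out.write(json.dumps(status) + "\n")
    html_out.close()
    status_out.close()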
def expand(indir, output_file):
    files = os.listdir(indir)
    uniq_links = set()  # many seed urls come from the same site, so there are duplicated outlinks from seed urls
    out = open(output_file, "w")
    for f in files:
        if f.split(".")[-1] != "json":  # make sure this is a json file
            continue
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                try:
                    data = json.loads(line)
                    url = data['url']
                    url = URLUtility.normalize(url)
                    html_content = data['html']
                    #links = HTMLParser.extract_links(url, html_content)
                    links = HTMLParser.extract_links_bs(url, html_content)
                    for link in links:
                        if URLUtility.is_same_site(url, link):
                            if link not in uniq_links:
                                uniq_links.add(link)
                                out.write(link.encode('utf-8') + "\n")
                    if url not in links:
                        out.write(url.encode('utf-8') + "\n")
                except:
                    traceback.print_exc()
                    continue
    out.close()
def extract_links_bs(url, html):
    '''
    Extract links from html source using Beautiful Soup
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: extracted links
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html)
    links = set()
    for tag in soup.findAll('a', href=True):
        link = tag['href']
        try:
            link = urlparse.urljoin(url, link)
        except:
            continue
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
def search_backlinks(self, url, limit=5):
    """ Return a list of urls
    Args:
        limit: maximum number of results to return
    """
    urls = []
    try:
        results = self.client.links(url, scope="page_to_page", sort="page_authority", filters=["external"], limit=limit)
        #results = self.client.links(url, scope="page_to_page", sort="spam_score", filters=["external"], limit=limit)
        #results = self.client.links(url, scope="page_to_page", sort="page_authority")
        for res in results:
            if 'uu' in res:
                url = URLUtility.normalize(res['uu'])
                if url:
                    urls.append(url)
            else:
                print "Error: key does not exist"
                print res
    except:
        traceback.print_exc()
    return urls
def extract_external_links(self, url, html):
    '''
    Extract external outlinks, i.e. links that point to different websites
    Returns:
        - list of unique urls
    '''
    try:
        soup = BeautifulSoup(html, 'lxml')
        links = set()
        tld = URLUtility.get_tld(url)
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            values = urlparse.urlparse(link)
            if (values.netloc == "") or (values.netloc == tld) or (tld in values.netloc):
                continue
            link = URLUtility.validate_link(link)
            if link:
                link = URLUtility.normalize(link)
                if link:
                    links.add(link)
        return list(links)
    except:
        traceback.print_exc()
        return []
def extract_insite_links(self, url, html):
    '''
    Returns:
        - list of insite urls that are different from the input url
    '''
    try:
        soup = BeautifulSoup(html, 'html.parser')
        #soup = BeautifulSoup(html, 'lxml')  # Couldn't parse http://www.gunsinternational.com/
        links = set()
        tld = URLUtility.get_tld(url)
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            try:
                link = urlparse.urljoin(url, link)
            except:
                traceback.print_exc()
                continue
            values = urlparse.urlparse(link)
            if tld in values.netloc:
                link = URLUtility.validate_link(link)
                if link:
                    link = URLUtility.normalize(link)
                    if link and link != url:
                        links.add(link)
        return list(links)
    except:
        print "Parsing with BeautifulSoup failed"
        return []
def extract_links_bs(self, url, html):
    '''
    Extract all outlinks from html using Beautiful Soup. Return list of links
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: list of outlinks
    '''
    try:
        soup = BeautifulSoup(html, 'lxml')
    except:
        print "Parsing with Beautiful Soup failed"
        return []
    links = set()
    for tag in soup.findAll('a', href=True):
        link = tag['href']
        try:
            link = urlparse.urljoin(url, link)
        except:
            continue
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
def search(self, keyword, k):
    """ Search for a keyword and return top matched urls
    Reference: https://developers.google.com/custom-search/json-api/v1/reference/cse/list
    Args:
        k: Number of search results to return.
    """
    k = min(k, self.max_results)
    urls = []
    index = 1
    while index <= k:
        try:
            res = self.service.cse().list(q=keyword, cx=self.cse_id, num=10, start=index).execute()  # maximum 10 results for each query
            if 'items' in res:
                res = res['items']
                for item in res:
                    url = URLUtility.normalize(item['link'])
                    if url:
                        urls.append(url)
                if len(res) < 10:  # Early stop paging
                    break
            else:
                print res
                break  # No more results, stop paging
        except:
            traceback.print_exc()
            break
        index += 10
    return urls
def _read_ac_result_file(result_file):
    urls = []
    with open(result_file) as lines:
        for line in lines:
            url = line.split()[0]
            url = URLUtility.normalize(url)
            urls.append(url)
    return urls
def run(infile, outdir):
    urls = set([])
    with open(infile) as lines:
        for line in lines:
            url = line.strip().split("\t")[0]
            url = URLUtility.normalize(url)
            urls.add(url)
    urls = list(urls)
    Download.download(urls, outdir)
def _read_sf_result_file(result_file):
    urls = []
    with open(result_file) as lines:
        for line in lines:
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            urls.append(url)
    return urls
def deduplicate(outfile, indirs):
    writer = open(outfile, 'w')
    cached_urls = set()
    for indir in indirs:
        for fname in os.listdir(indir):
            print "Reading", indir + '/' + fname
            for line in open(indir + '/' + fname):
                data = json.loads(line)
                url = URLUtility.normalize(data['url'])
                if url in cached_urls:
                    continue
                cached_urls.add(url)
                writer.write(line)
    writer.close()
def _read_urls_from_json(url_file):
    urls = set()
    with open(url_file) as lines:
        for line in lines:
            try:
                jsonobj = json.loads(line)
                for url in jsonobj[1:]:
                    url = URLUtility.normalize(url)
                    urls.add(url)
            except:
                traceback.print_exc()
    print "Number of urls read from json file: ", len(urls)
    return list(urls)
def _read_ac_result_file(result_file, max_pages):
    """ Load all sites from the result file of ACHE """
    count = 0
    sites = set()
    with open(result_file) as lines:
        for line in lines:
            count += 1
            url = line.split()[0]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
def search(self, query_term, count=10):
    """
    Reference: https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters
    Args:
        count: The number of search results to return in the response.
               If count is greater than 50, paging is used to fetch the results,
               since each query returns at most 50 results.
    """
    if self.cache and self.cache.contains(query_term):
        urls = self.cache.get(query_term)
        return [url for url in urls if self.is_valid(url)]
    urls = []
    offset = 0
    while count > 0:
        params = urllib.urlencode({
            # Request parameters
            'q': query_term,
            'count': str(min(count, 50)),
            'offset': str(offset),
            'mkt': 'en-us',
            'safesearch': 'Moderate'})
        try:
            conn = httplib.HTTPSConnection('api.cognitive.microsoft.com')
            #conn.request("GET", "/bing/v5.0/search?%s" % params, "{body}", headers)
            conn.request("GET", "/bing/v7.0/search?%s" % params, "{body}", self.headers)
            response = conn.getresponse()
            data = response.read()
            obj = json.loads(data)
            if 'webPages' in obj:
                webPages = obj['webPages']
                values = webPages['value']
                for value in values:
                    if self.is_valid(value['url']):
                        url = URLUtility.normalize(value['url'])
                        if url:
                            urls.append(url)
            conn.close()
        except:
            traceback.print_exc()
        count -= 50
        offset += 50  # offset counts results to skip, so advance by the page size
    if self.cache:
        self.cache.add(query_term, urls)
    return urls
def _read_sf_result_file(result_file, max_pages):
    """ Load all sites from the result file of SEEDFINDER """
    sites = set()
    count = 0
    with open(result_file) as lines:
        for line in lines:
            count += 1
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
def extract_links(url, html):
    '''
    Extract links from html source using regular expression
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: extracted (normalized and validated) links
    '''
    match = HTMLParser.LINK_PATTERN.findall(html)
    links = set([])
    for link in match:
        link = urlparse.urljoin(url, link)
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
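# HTMLParser.LINK_PATTERN is referenced above but not defined in this excerpt. A plausible
# definition is sketched below purely as an assumption about what the class-level regex
# might look like (capture the quoted href value of anchor tags); the project's actual
# pattern may differ.
import re

LINK_PATTERN = re.compile(r'<a\s[^>]*?href\s*=\s*["\']([^"\'#]+)["\']', re.IGNORECASE)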
def custom_recrawl(status_file, supplement_status_file, html_dir, status_dir, agent):
    # Read the status file, then recrawl the urls that raised an exception or were never downloaded
    downloaded_urls = set()
    with open(status_file) as lines:
        for line in lines:
            try:
                json_data = json.loads(line)
                if "exception" in json_data:
                    continue
                downloaded_urls.add(json_data['url'])
            except:
                print "recrawl exception"
                traceback.print_exc()
                continue
    url_objects = []
    with open(supplement_status_file) as lines:
        for line in lines:
            try:
                values = line.strip("\n").split("\t")
                url = URLUtility.normalize(values[2])
                if url not in downloaded_urls:
                    url_object = {"url_meta": {}}
                    url_object["url_meta"]["topic"] = values[0]
                    url_object["url_meta"]["site"] = values[1]
                    url_object["url_meta"]["subtopic"] = values[3]
                    url_object["url"] = url
                    url_objects.append(url_object)
            except:
                print "custom recrawl exception"
                traceback.print_exc()
                continue
    print "Number of urls to download: " + str(len(url_objects))
    jobs = []
    for i in range(Config.PROCESS_NUMBER):
        p = Process(target=crawlprocess, args=(url_objects, i, html_dir, status_dir, agent))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
def _read_relev_file(clf_file):
    """ Load all relevant sites (label == 1) from the classification file
    Note that all classification files from different discovery tools have the same format
    """
    sites = set()
    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ''.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)
                if label != -1 and label != 1:
                    print "Parsed label is incorrect"
                if label == 1:
                    sites.add(site)
            except:
                traceback.print_exc()
    return sites
def _read_clf_file(clf_file):
    # Map each url and each site to its relevance label parsed from the classification file.
    # A site is marked relevant if any of its urls is labeled relevant.
    url2label = {}
    site2label = {}
    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ''.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)
                if label > 0:
                    url2label[url] = True
                    site2label[site] = True
                else:
                    url2label[url] = False
                    if site not in site2label:
                        site2label[site] = False
            except:
                print line
    return url2label, site2label
def expand(indir, output_file):
    files = os.listdir(indir)
    uniq_links = set()  # many seed urls come from the same site, so there are duplicated outlinks from seed urls
    out = open(output_file, "w")
    for f in files:
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                data = json.loads(line)
                url = data['url']
                url = URLUtility.normalize(url)
                html_content = data['text']
                #links = HTMLParser.extract_links(url, html_content)
                links = HTMLParser.extract_links_bs(url, html_content)
                for link in links:
                    if URLUtility.is_same_site(url, link):
                        if link not in uniq_links:
                            uniq_links.add(link)
                            out.write(link.encode('utf-8') + "\n")
                if url not in links:
                    out.write(url.encode('utf-8') + "\n")
    out.close()
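# For illustration only: a hedged sketch of how the pieces above might be chained into one
# discovery round. The file names, directory layout, and user agent string are assumptions,
# not taken from the original project.
if __name__ == "__main__":
    agent = "Mozilla/5.0 (compatible; research-crawler)"
    crawl("seeds.tsv", "data/html", "data/status", agent)   # fetch the seed pages
    expand("data/html", "data/outlinks.txt")                # collect same-site outlinks for the next crawl round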