def crawl(in_file, html_dir, status_dir, agent):
    urls = set()
    url_objects = []
    with open(in_file) as lines:
        for line in lines:
            values = line.strip("\n").split("\t")
            url_object = {"url_meta": {}}
            if len(values) == 4:
                url_object["url_meta"]["topic"] = values[0]
                url_object["url_meta"]["site"] = values[1]
                url = URLUtility.normalize(values[2])
                url_object["url"] = url
                url_object["url_meta"]["subtopic"] = values[3]

            else:
                url = URLUtility.normalize(values[0])
                url_object["url"] = url
            if url not in urls:
                urls.add(url)
                url_objects.append(url_object)
    jobs = []
    for i in range(Config.PROCESS_NUMBER):
        p = Process(target=crawlprocess,
                    args=(url_objects, i, html_dir, status_dir, agent))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
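crawl deduplicates the seed urls and then fans them out to Config.PROCESS_NUMBER worker processes; each worker walks url_objects with a stride equal to the process count (see crawlprocess further down). A minimal standalone sketch of that striding, with illustrative values in place of the real config and url objects:

# Standalone sketch: worker i takes indices i, i + N, i + 2N, ... of url_objects,
# where N stands in for Config.PROCESS_NUMBER used by crawlprocess.
PROCESS_NUMBER = 3
url_objects = [{"url": "http://example.com/%d" % n} for n in range(8)]

for i in range(PROCESS_NUMBER):
    batch = [url_objects[j]["url"] for j in range(i, len(url_objects), PROCESS_NUMBER)]
    print("worker %d handles %s" % (i, batch))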
def extract_links_bs(url, html):
    '''
    Extract links from html source using beautiful soup
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: extracted links
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html)
    links = set()
    for tag in soup.findAll('a', href=True):
        link = tag['href']
        try:
            link = urlparse.urljoin(url, link)
        except:
            continue
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
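A self-contained sketch of the same extraction pattern on a small HTML snippet; it resolves relative links with urljoin but skips the project's URLUtility validation and normalization:

from bs4 import BeautifulSoup
try:
    from urlparse import urljoin       # Python 2
except ImportError:
    from urllib.parse import urljoin   # Python 3

html = '<a href="/contact">Contact</a><a href="http://other.example/x">X</a>'
soup = BeautifulSoup(html, "html.parser")
links = set()
for tag in soup.find_all("a", href=True):
    links.add(urljoin("http://example.com/index.html", tag["href"]))
print(sorted(links))  # ['http://example.com/contact', 'http://other.example/x']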
Example #3
def expand(indir, output_file):
    files = os.listdir(indir)
    # many seed urls come from the same site, so there may be duplicated outlinks from seed urls
    uniq_links = set()
    out = open(output_file, "w")
    for f in files:
        if f.split(".")[-1] != "json":
            #make sure this is json file
            continue
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                try:
                    data = json.loads(line)
                    url = data['url']
                    url = URLUtility.normalize(url)
                    html_content = data['html'] 
                    #links = HTMLParser.extract_links(url, html_content)
                    links = HTMLParser.extract_links_bs(url, html_content)
                    for link in links:
                        if URLUtility.is_same_site(url, link):
                            if link not in uniq_links:
                                uniq_links.add(link)
                                out.write(link.encode('utf-8') + "\n")
                    if url not in links:
                        out.write(url.encode('utf-8') + "\n")
                except:
                    traceback.print_exc()
                    continue
    out.close()
    def extract_external_links(self, url, html):
        '''
        Extract external outlinks, that link to different websites
        Returns: 
            - list of unique urls
        '''
        try:
            soup = BeautifulSoup(html, 'lxml')
            links = set()
            tld = URLUtility.get_tld(url)

            for tag in soup.findAll('a', href=True):
                link = tag['href']
                values = urlparse.urlparse(link)
                if values.netloc == "" or values.netloc == tld or tld in values.netloc:
                    continue
                link = URLUtility.validate_link(link)
                if link:
                    link = URLUtility.normalize(link)
                    if link:
                        links.add(link)
            return list(links)
        except:
            traceback.print_exc()
            return []
    def extract_insite_links(self, url, html):
        '''
        Returns:
            - list of insite urls that are different from the input url
        '''
        try:
            soup = BeautifulSoup(html, 'html.parser')
            #soup = BeautifulSoup(html, 'lxml') # Couldn't parse http://www.gunsinternational.com/
            links = set()
            tld = URLUtility.get_tld(url)
            for tag in soup.findAll('a', href=True):
                link = tag['href']
                try:
                    link = urlparse.urljoin(url, link)
                except:
                    traceback.print_exc()
                    continue
                values = urlparse.urlparse(link)
                if tld in values.netloc:
                    link = URLUtility.validate_link(link)
                    if link:
                        link = URLUtility.normalize(link)
                        if link and link != url:
                            links.add(link)
            return list(links)
        except:
            print "Parsing with BeautifulSoup failed"
            return []
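Both extractors above decide between external and in-site links by comparing the link's netloc against the page's top-level domain. A standalone sketch of that test, with an illustrative domain in place of URLUtility.get_tld(url):

try:
    from urlparse import urlparse       # Python 2
except ImportError:
    from urllib.parse import urlparse   # Python 3

tld = "example.com"  # stand-in for URLUtility.get_tld(url)
for link in ["/about", "http://example.com/a", "http://sub.example.com/b",
             "http://other.org/c"]:
    netloc = urlparse(link).netloc
    is_external = netloc != "" and tld not in netloc
    print("%-30s external=%s insite=%s" % (link, is_external, tld in netloc))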
Example #6
    def get_text(self, text_type):
        """
        Return extracted text from the html. Extract text if neccessary
        NOTE: this function's flow can be confusing cause it does not only serve as extraction
        but also cache the extracted text in different scenarios. 

        Parameters:
        -----------
        text_type: string, optional

        """
        if not self.html:
            return ''

        if text_type == 'body':
            if not self.body:
                self.body = Text_Extractor.extract_body(self.html)
                self.body = URLUtility.clean_text(self.body)
            return self.body
        elif text_type == 'meta':
            if not self.meta:
                self.meta = Text_Extractor.extract_body(self.html)
                self.meta = URLUtility.clean_text(self.meta)
            return self.meta
        elif text_type == 'title':
            if not self.title:
                self.title = Text_Extractor.extract_body(self.html)
                self.title = URLUtility.clean_text(self.title)
            return self.title

        else:
            print "Wrong text_type"
            return ''
    def extract_links_bs(self, url, html):
        '''
        Extract all outlinks from html using beautiful soup. Return list of links

        Args:
            - url: url of the html source, used to construct absolute url from relative url
            - html: html source
        Returns:
            - links: list of outlinks
        '''
        try:
            soup = BeautifulSoup(html, 'lxml')
        except:
            print "Parsing with beautiful soup failed"
            return []
        links = set()
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            try:
                link = urlparse.urljoin(url, link)
            except:
                continue
            link = URLUtility.validate_link(link)
            if link:
                link = URLUtility.normalize(link)
                if link:
                    links.add(link)
        return list(links)
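get_text above computes each requested text field at most once and caches it on the object. A minimal sketch of that cache-on-first-use pattern, with a stand-in extractor since Text_Extractor and URLUtility are project code not shown here:

class CachedText(object):
    """Illustrative only: compute a field on first access, then reuse the cached value."""

    def __init__(self, html):
        self.html = html
        self.body = None

    def get_body(self):
        if not self.body:
            # stand-in for Text_Extractor.extract_body followed by URLUtility.clean_text
            self.body = " ".join(self.html.split())
        return self.body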
Example #8
def evaluate_recall(result_file, test_file):
    """ Count number of urls in test found"""
    test_host = set()
    with open(test_file) as lines:
        for line in lines:
            url = line.strip()
            #url = url_normalize(url)
            host = URLUtility.get_tld(url)
            test_host.add(host)

    found_host = set()
    with open(result_file) as lines:
        for line in lines:
            values = line.strip().split()
            url = values[1]
            #url = url_normalize(url)
            host = URLUtility.get_tld(url)
            found_host.add(host)

    found = 0
    for host in found_host:
        if host in test_host:
            found += 1
            print host, found

    print found, len(test_host)
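The final loop is equivalent to intersecting the two host sets; a short sketch of host-level recall computed that way (the sets here are illustrative):

test_host = {"a.com", "b.com", "c.com", "d.com"}
found_host = {"b.com", "d.com", "e.com"}

found = len(found_host & test_host)
recall = float(found) / len(test_host)
print("%d / %d = %.2f" % (found, len(test_host), recall))  # 2 / 4 = 0.50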
Example #9
def crawl(in_file, html_dir, status_dir, agent):
    urls = set()
    url_objects = []
    with open(in_file) as lines:
        for line in lines:
            values = line.strip("\n").split("\t")
            url_object = {"url_meta":{}}
            if len(values) == 4:
                url_object["url_meta"]["topic"] = values[0]
                url_object["url_meta"]["site"] = values[1]
                url = URLUtility.normalize(values[2])
                url_object["url"] = url 
                url_object["url_meta"]["subtopic"] = values[3]

            else:
                url = URLUtility.normalize(values[0])
                url_object["url"] = url 
            if url not in urls:
                urls.add(url)
                url_objects.append(url_object)              
    jobs = []
    for i in range(Config.PROCESS_NUMBER):
        p = Process(target=crawlprocess, args=(url_objects, i, html_dir, status_dir, agent))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
Example #10
def _read_ac_result_file(result_file, max_pages):
    """
    Load all sites from the result file of ACHE 
    """
    count = 0
    sites = set()
    with open(result_file) as lines:
        for line in lines:
            count += 1
            url = line.split()[0]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
Example #11
    def search_backlinks(self, url, limit=5):
        """
        Return a list of urls
        Args:
            limit: maximum number of results to return
        """
        urls = []
        try:
            results = self.client.links(url,
                                        scope="page_to_page",
                                        sort="page_authority",
                                        filters=["external"],
                                        limit=limit)
            #results = self.client.links(url, scope="page_to_page", sort="spam_score", filters=["external"], limit=limit)
            #results = self.client.links(url, scope="page_to_page", sort="page_authority")

            for res in results:
                if 'uu' in res:
                    url = URLUtility.normalize(res['uu'])
                    if url:
                        urls.append(url)
                else:
                    print "Error: key does not exisit"
                    print res
        except:
            traceback.print_exc()

        return urls
    def search(self, keyword, k):
        """
        Search for a keyword and return top matched urls
        Reference: https://developers.google.com/custom-search/json-api/v1/reference/cse/list

        Args:
            k: Number of search results to return. 
        """
        k = min(k, self.max_results) 
        urls = []
        index = 1
        while index <= k:
            try:
                # maximum 10 results for each query
                res = self.service.cse().list(q=keyword, cx=self.cse_id, num=10,
                                              start=index).execute()
                if 'items' in res:
                    res = res['items']
                    for item in res:
                        url = URLUtility.normalize(item['link'])
                        if url:
                            urls.append(url)
                    if len(res) < 10:
                        # Early stop paging
                        break
                else:
                    print res
                    break  # No more results, stop paging
            except:
                traceback.print_exc()
                break

            index += 10

        return urls
Example #13
def test():
    fetcher = Fetcher("test/fetcher_test_data")
    urls = URLUtility.load_urls("test/data/urls.txt")
    sites = fetcher.fetch(urls)
    for site in sites:
        for page in site:
            print page.get_text('body')[:100].replace("\n", "")
Example #14
def _read_ac_result_file(result_file):
    urls = []
    with open(result_file) as lines:
        for line in lines:
            url = line.split()[0]
            url = URLUtility.normalize(url)
            urls.append(url)
    return urls
Example #15
def _read_sf_result_file(result_file, max_pages):
    """
    Load all sites from the result file of SEEDFINDER 
    """
    sites = set()
    count = 0
    with open(result_file) as lines:
        for line in lines:
            count += 1
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
Example #16
def _read_sf_result_file(result_file):
    urls = []
    with open(result_file) as lines:
        for line in lines:
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            urls.append(url)
    return urls
def run(infile, outdir):
    urls = set([])
    with open(infile) as lines:
        for line in lines:
            url = line.strip().split("\t")[0]
            url = URLUtility.normalize(url)
            urls.add(url)
    urls = list(urls)
    Download.download(urls, outdir)
def load_blacklist(f):
    bl = set()
    with open(f) as lines:
        for line in lines:
            line = line.strip().lower()
            host = URLUtility.get_host(line)
            print host
            bl.add(host)
    return bl
Example #19
def search_site(url_file, out_file, keyword):
    """
    Write results as json line objects into out_file
    Format of each json object:
        list<str>: list of urls. First url is the main site
    """
    urls = URLUtility.load_urls(url_file)
    site2urls = read_json(out_file)
    k = 10

    out = open(out_file, "a+")
    for i, url in enumerate(urls):
        site = URLUtility.get_host(url)
        if site not in site2urls:
            results = bing_search.search_site(keyword, url, 10)
            results = [site, url] + results
            json.dump(results, out)
            out.write("\n")
    out.close()
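Each appended line is a JSON array whose first element is the site host, followed by the urls; this is the same shape that _read_urls_from_json (further down) consumes by slicing off the first element. An illustrative line:

import json

line = json.dumps(["example.com", "http://example.com/", "http://example.com/contact"])
print(line)
print(json.loads(line)[1:])  # just the urls, skipping the leading site host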
def extract_links(url, html):
    '''
    Extract links from html source using regular expression
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: extracted (normalized and validated) links
    '''
    match = HTMLParser.LINK_PATTERN.findall(html)
    links = set([])
    for link in match:
        link = urlparse.urljoin(url, link)
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
def count(infile):
    sites = set([])
    with open(infile) as lines:
        for line in lines:
            obj = json.loads(line)
            url = obj['url']
            site = URLUtility.get_host(url)
            sites.add(site)
    for site in sites:
        print site
    print len(sites)
Example #22
def _read_relev_file(clf_file):
    """
    Load all sites from the classification file
    Note that all classification files from different discovery tools have the same format
    """
    sites = set()
    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ','.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)
                if label != -1 and label != 1:
                    print "Parsed label is incorrect"
                if label == 1:
                    sites.add(site)
            except:
                traceback.print_exc()
    return sites
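Each classification line has the form '<url>,<label>' with label 1 or -1, and the url itself may contain commas, which is why the code splits on ',' and rejoins all but the last field. An equivalent parse with rsplit, on an illustrative line:

line = "http://example.com/page?ids=1,2,3,1"
url, label = line.strip().rsplit(",", 1)
print(url)         # http://example.com/page?ids=1,2,3
print(int(label))  # 1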
Example #23
    def update_seeds(self, seed_urls):
        '''Update seed urls in the current seed list.
        Fetch the seed urls'''
        new_seed_urls = []
        for url in seed_urls:
            host = URLUtility.get_tld(url)
            if host not in self.host:
                self.host.add(host)
                new_seed_urls.append(url)
        urls, text = self.fetcher.fetch_urls(new_seed_urls)
        self.similarity.update_seeds(urls, text)
        self.K = max(len(self.similarity.seed_pages.keys()) / 2, 10)
Example #24
    def __init__(self, url):
        #url = url_normalize.url_normalize(url)
        self.host = URLUtility.get_host(url)
        self.pages = []
        self.word_set = set()
        self.vsm = None
        self.jaccard = None  # Don't use this if seeds are updated
        self.cosine = None  # Don't use this if seeds are updated
        self.clf_vsm = None
        self.bstf_vsm = None
        self.bsbin_vsm = None
        self.cs_vsm = None
Example #25
def crawlprocess(url_objects, start, html_dir, status_dir, agent):
    status_file = open(status_dir + "/status_temp_" + str(start) + ".json", "w")
    content_file = None
    if Config.DATA_FORMAT == "ONE_FILE":
        content_file = open(html_dir + "/html_" + str(start) + ".json", "a+")
        save_content = save_content_one_file(content_file)
    elif Config.DATA_FORMAT == "MULTI_FILE":
        save_content = save_content_multi_file(html_dir)

    for i in range(start, len(url_objects), Config.PROCESS_NUMBER):
        url_obj = url_objects[i]
        url = url_obj["url"] 
        try:
            if Config.USE_TOR:
                res = requests.get(url, headers=Config.HEADERS[agent], proxies=TOR_PROXY, verify=False, timeout=5)
            else:
                res = requests.get(url, headers=Config.HEADERS[agent], verify=False, timeout=5)
            if Config.SAVE_HTML:
                save_content(url, res)
            save_response(url, URLUtility.encode(url), str(res.status_code), None, res.headers, agent, url_obj, status_file)
        except requests.ConnectionError:
            #In the event of a network problem (e.g. DNS failure, refused connection, etc)
            save_response(url, URLUtility.encode(url), None, "ConnectionError", None, agent, url_obj, status_file)
        except requests.HTTPError:
            #In the rare event of an invalid HTTP response
            save_response(url, URLUtility.encode(url), None, "HTTPError", None, agent, url_obj, status_file)
        except requests.Timeout:
            save_response(url, URLUtility.encode(url), None, "Timeout", None, agent, url_obj, status_file)
        except requests.TooManyRedirects:
            save_response(url, URLUtility.encode(url), None, "TooManyRedirects", None, agent, url_obj, status_file)
        except Exception:
            save_response(url, URLUtility.encode(url), None, "OtherExceptions", None, agent, url_obj, status_file)
    status_file.close()
    if content_file:
        content_file.close()
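save_content_one_file and save_content_multi_file are not shown in this section; a hedged sketch of the closure-factory shape crawlprocess expects, assuming the one-file variant appends one JSON record per fetched page with the 'url' and 'html' keys that expand later reads:

import json

def save_content_one_file(content_file):
    # Assumed shape, not the project's actual implementation.
    def save_content(url, res):
        record = {"url": url, "html": res.text}
        content_file.write(json.dumps(record) + "\n")
    return save_content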
Example #26
def deduplicate(outfile, indirs):
    writer = open(outfile, 'w')
    cached_urls = set()
    for indir in indirs:
        for fname in os.listdir(indir):
            print "Reading", indir + '/' + fname
            for line in open(indir + '/' + fname):
                data = json.loads(line)
                url = URLUtility.normalize(data['url'])
                if url in cached_urls:
                    continue
                cached_urls.add(url)
                writer.write(line)
    writer.close()
def export_host(indir, outfile):
    urls = ExportURL.load_urls(indir)
    uniq_hosts = set([])
    out = open(outfile, "w")
    for url in urls:
        try:
            host = URLUtility.get_host(url)
            if host not in uniq_hosts:
                uniq_hosts.add(host)
                out.write(host.encode('utf-8') + "\n")
        except:
            traceback.print_exc()
    out.close()
Example #28
def _read_urls_from_json(url_file):
    urls = set()
    with open(url_file) as lines:
        for line in lines:
            try:
                jsonobj = json.loads(line)
                for url in jsonobj[1:]:
                    url = URLUtility.normalize(url)
                    urls.add(url)
            except:
                traceback.print_exc()

    print "Number of urls read from json file: ", len(urls)
    return list(urls)
    def __init__(self, seed_file, result_file, data_dir):
        """
        Args:
            seed_file: contains list of seed urls
            data_dir: stores crawled data
            result_file: stores urls and their scores
        """
        self.train_urls = URLUtility.load_urls(seed_file)
        # Note: Fetcher contains Bing Search but does not use it (just for website ranking evaluation)
        self.fetcher = Fetcher(data_dir, None, False)
        self.result_file = result_file
        self.ranked_result_file = result_file + ".rank"
        self.searcher = Search_APIs(data_dir, self.fetcher)
def run_filter(infile, outfile):
    blacklists = load_blacklist("blacklist.txt")
    out = open(outfile, "w")
    counter = {}
    with open(infile) as lines:
        for line in lines:
            url = line.strip()
            if is_filter(url, blacklists, counter):
                continue
            else:
                host = URLUtility.get_host(url)
                out.write(line)
    out.close()
Example #31
def _read_clf_file(clf_file):
    url2label = {}
    site2label = {}

    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ','.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)

                if label > 0:
                    url2label[url] = True
                    site2label[site] = True
                else:
                    url2label[url] = False
                    if site not in site2label:
                        site2label[site] = False
            except:
                print line
    return url2label, site2label
def expand(indir, output_file):
    files = os.listdir(indir)
    # many seed urls come from the same site, so there may be duplicated outlinks from seed urls
    uniq_links = set()
    out = open(output_file, "w")
    for f in files:
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                data = json.loads(line)
                url = data['url']
                url = URLUtility.normalize(url)
                html_content = data['text']
                #links = HTMLParser.extract_links(url, html_content)
                links = HTMLParser.extract_links_bs(url, html_content)
                for link in links:
                    if URLUtility.is_same_site(url, link):
                        if link not in uniq_links:
                            uniq_links.add(link)
                            out.write(link.encode('utf-8') + "\n")
                if url not in links:
                    out.write(url.encode('utf-8') + "\n")

    out.close()
Example #33
def custom_recrawl(status_file, supplement_status_file, html_dir, status_dir, agent):
    # Read the status file, then recrawl the urls that previously raised an exception (i.e. were not successfully downloaded)

    downloaded_urls = set()
    with open(status_file) as lines:
        for line in lines:
            try:
                json_data = json.loads(line)
                if "exception" in json_data:
                    continue
                downloaded_urls.add(json_data['url'])
            except:
                print "recrawl exception"
                traceback.print_exc()
                continue
 
    url_objects = []
    with open(supplement_status_file) as lines:
        for line in lines:
            try:
                values = line.strip("\n").split("\t")
                url = URLUtility.normalize(values[2])
                if url not in downloaded_urls:
                    url_object = {"url_meta":{}}
                    url_object["url_meta"]["topic"] = values[0]
                    url_object["url_meta"]["site"] = values[1]
                    url_object["url_meta"]["subtopic"] = values[3]
                    url_object["url"] = url 
                    url_objects.append(url_object)
            except:
                print "custom recrawl exception"
                traceback.print_exc()
                continue
    print "Number of urls to download: " + str(len(url_objects))
    jobs = []
    for i in range(Config.PROCESS_NUMBER):
        p = Process(target=crawlprocess, args=(url_objects, i, html_dir, status_dir, agent))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()   
Example #34
def save_content(url, res):
    html_filename = html_dir + "/" + URLUtility.encode(url) + ".html"
    html_file = open(html_filename, "w")
    text = res.text.encode('utf-8')
    html_file.write(text)
    html_file.close()
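URLUtility.encode, used above to derive a filename from a url, is project code; a common stand-in is to hash the url, as in this hypothetical helper:

import hashlib

def encode_url_for_filename(url):
    # Hypothetical stand-in for URLUtility.encode: hash the url to get a filesystem-safe name.
    return hashlib.md5(url.encode("utf-8")).hexdigest()

print(encode_url_for_filename("http://example.com/a?b=c"))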