Example No. 1
def evaluate_recall(result_file, test_file):
    """ Count number of urls in test found"""
    test_host = set()
    with open(test_file) as lines:
        for line in lines:
            url = line.strip()
            #url = url_normalize(url)
            host = URLUtility.get_tld(url)
            test_host.add(host)

    found_host = set()
    with open(result_file) as lines:
        for line in lines:
            values = line.strip().split()
            url = values[1]  # the url is the second whitespace-separated field
            #url = url_normalize(url)
            host = URLUtility.get_tld(url)
            found_host.add(host)

    found = 0
    for host in found_host:
        if host in test_host:
            found += 1
            print host, found

    print found, len(test_host)  # hosts found vs. total hosts in the test set
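
For reference, the code above implies the input formats: test_file has one URL per line, and each line of result_file is whitespace-separated with the URL as the second field. A minimal, hypothetical invocation (the file names below are placeholders, not from the original project) would be:

# test_urls.txt:     one url per line
# crawl_results.txt: "<rank_or_score> <url>" per line
evaluate_recall('crawl_results.txt', 'test_urls.txt')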
Example No. 2
    def extract_external_links(self, url, html):
        '''
        Extract external outlinks, i.e. links that point to a different website
        Returns:
            - list of unique urls
        '''
        try:
            soup = BeautifulSoup(html, 'lxml')
            links = set()
            tld = URLUtility.get_tld(url)

            for tag in soup.find_all('a', href=True):
                link = tag['href']
                values = urlparse.urlparse(link)
                # Skip relative links and links that stay on the same site
                if values.netloc == "" or values.netloc == tld or tld in values.netloc:
                    continue
                link = URLUtility.validate_link(link)
                if link:
                    link = URLUtility.normalize(link)
                    if link:
                        links.add(link)
            return list(links)
        except Exception:
            traceback.print_exc()
            return []
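
To make the filtering rule above concrete: a link is kept as external only when its netloc is non-empty and does not contain the top-level domain of the page being parsed. A small standalone sketch of that check (the URLs and the page_tld value are made-up examples of what URLUtility.get_tld is assumed to return):

import urlparse  # urllib.parse in Python 3

page_tld = 'example.com'
for href in ['/about.html',                   # relative link: empty netloc, skipped
             'http://blog.example.com/post',  # same site: contains page_tld, skipped
             'http://other.org/page']:        # different site: kept as external
    netloc = urlparse.urlparse(href).netloc
    is_external = netloc != '' and page_tld not in netloc
    print('%s -> external: %s' % (href, is_external))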
Example No. 3
    def extract_insite_links(self, url, html):
        '''
        Returns:
            - list of insite urls that are different from the input url
        '''
        try:
            soup = BeautifulSoup(html, 'html.parser')
            # soup = BeautifulSoup(html, 'lxml')  # couldn't parse http://www.gunsinternational.com/
            links = set()
            tld = URLUtility.get_tld(url)
            for tag in soup.find_all('a', href=True):
                link = tag['href']
                try:
                    # Resolve relative links against the page url
                    link = urlparse.urljoin(url, link)
                except Exception:
                    traceback.print_exc()
                    continue
                values = urlparse.urlparse(link)
                if tld in values.netloc:
                    link = URLUtility.validate_link(link)
                    if link:
                        link = URLUtility.normalize(link)
                        if link and link != url:
                            links.add(link)
            return list(links)
        except Exception:
            print "Parsing with BeautifulSoup failed"
            return []
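
The two extraction methods above belong to a parser class that is not shown in these excerpts. Assuming an instance of that class called extractor and a page fetched with urllib2 (the URL and variable names are illustrative only), usage would presumably look like:

import urllib2  # urllib.request in Python 3

url = 'http://www.example.com/'
html = urllib2.urlopen(url).read()

# 'extractor' stands in for an instance of the class that defines the methods above
external_links = extractor.extract_external_links(url, html)
insite_links = extractor.extract_insite_links(url, html)
print('%d external links, %d insite links' % (len(external_links), len(insite_links)))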
Example No. 4
    def update_seeds(self, seed_urls):
        '''Update the current seed list with new seed urls and fetch them.'''
        new_seed_urls = []
        for url in seed_urls:
            host = URLUtility.get_tld(url)
            if host not in self.host:
                self.host.add(host)
                new_seed_urls.append(url)
        urls, text = self.fetcher.fetch_urls(new_seed_urls)
        self.similarity.update_seeds(urls, text)
        self.K = max(len(self.similarity.seed_pages) / 2, 10)  # half the seed pages, but at least 10
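
For context on the last line: with 30 fetched seed pages, K becomes max(30 / 2, 10) = 15, while with only 8 seed pages it is max(4, 10) = 10, so K presumably never drops below 10 reference pages.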
Example No. 5
    def get_tld(self):
        return URLUtility.get_tld(self.url)
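
URLUtility.get_tld itself is not shown in these excerpts. Judging from how it is used (host-level deduplication and same-site checks), it presumably returns something like the registered domain of a URL. A rough standalone approximation, not the project's actual implementation, could be built on the third-party tldextract package:

import tldextract  # pip install tldextract

def get_tld(url):
    # Approximation of what URLUtility.get_tld appears to return:
    # e.g. 'gunsinternational.com' for 'http://www.gunsinternational.com/guns'
    parts = tldextract.extract(url)
    return parts.domain + '.' + parts.suffix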