from urllib.parse import urljoin


def join_urls(baseurl, url):
    # protocol-relative URLs: assume http
    if url.startswith('//'):
        url = 'http:' + url
        return norms(url)
    # scheme-less host names: prepend http://
    elif url.startswith('www.'):
        http_url = SCHEME_HTTP + url
        if is_absolute(http_url):
            return norms(http_url)
        # falls through to an implicit None if the synthesized URL is not absolute
    elif is_absolute(url):
        return norms(url)
    # everything else is treated as relative to baseurl
    else:
        return norms(urljoin(baseurl, url))
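# norms, SCHEME_HTTP and is_absolute are not defined in this snippet. A minimal
# sketch of plausible definitions, purely as assumptions for illustration:

from urllib.parse import urlsplit

SCHEME_HTTP = 'http://'  # assumed constant, not shown in the original


def is_absolute(url):
    # assumed helper: treat a URL as absolute if it carries a scheme
    return bool(urlsplit(url).scheme)


def norms(url):
    # assumed stand-in for urlnorm.norms; the real normalizer does more
    return url.strip()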
import urlnorm


def normalize(self, item):
    normal = urlnorm.norms(item)
    # remove anchor references
    normal = normal.split('#', 1)[0]
    # remove Apache directory-listing sort parameters, i.e. ?[NDMS]=[AD]
    parm = normal.split('?', 1)[-1]
    if parm and len(parm) == 3 and parm[1] == '=' and parm[2] in 'AD' and parm[0] in 'NDMS':
        normal = normal[:-4]  # drop the four-character "?N=A"-style suffix
    return normal
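# Illustrative behavior, assuming urlnorm.norms returns this URL unchanged
# (normalize is called here as a plain function for the sake of the example):
#
#   normalize(obj, 'http://example.com/docs/?N=D#top')
#   -> 'http://example.com/docs/'
#
# Both the '#top' fragment and the Apache column-sort suffix '?N=D' are
# removed, while ordinary query strings (e.g. '?page=2') are left intact.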
import urlnorm
from urllib.parse import urljoin


def parseForUrlsInHtml(data, location):
    # search for a possible base reference
    bases = searchInForTag(BasePattern, data)
    baseRef = None
    if len(bases) >= 1:
        baseRef = bases[0][0]
    if len(bases) > 1:
        print("more than one base tag found")
    # search for link tags and add the URLs they contain to the result set
    ret = {}
    for pattern in LinkPatterns:
        urls = searchInForTag(pattern, data)
        for url, name in urls:
            url = urlnorm.norms(urljoin(get_absolute_url(url, baseRef, location), url))
            ret[url] = 1
    return list(ret.keys())
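# searchInForTag, BasePattern, LinkPatterns and get_absolute_url are defined
# elsewhere in the project; the sketch below shows one plausible regex-based
# shape for them, as an assumption only:

import re

BasePattern = re.compile(r'<base\s+[^>]*href="([^"]*)"', re.IGNORECASE)
LinkPatterns = [
    re.compile(r'<a\s+[^>]*href="([^"]*)"[^>]*>(.*?)</a>', re.IGNORECASE | re.DOTALL),
]


def searchInForTag(pattern, data):
    # assumed helper: return a list of (url, name) tuples, one per match;
    # single-group patterns are padded so callers can index [0][0]
    matches = pattern.findall(data)
    return [(m, '') if isinstance(m, str) else m for m in matches]


def get_absolute_url(url, baseRef, location):
    # assumed helper: prefer the page's <base href>, else the page location
    return baseRef if baseRef else location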
import re

import urlnorm
from bs4 import BeautifulSoup


def parse_page(dbinfo, html):
    soup = BeautifulSoup(html, "lxml")
    blog_count = 0
    done = False
    for table in soup.find_all('table'):
        rank = None
        href = None
        auth = None
        for tr in table.find_all('tr'):
            # the rank appears as a bare string such as "42."
            for string in tr.strings:
                if re.match(rank_regex, string) is not None:
                    rank = string.strip()[:-1]
            for td in tr.find_all('td'):
                td_class = td.get('class')
                if td_class is not None and 'site-details' in td_class:
                    site_details = parse_site_details(td)
                    if site_details is not None:
                        href = site_details
                if td_class is not None and 'statistics' in td_class:
                    statistics = parse_statistics(td)
                    if statistics is not None:
                        auth = statistics
        if rank is not None or href is not None or auth is not None:
            print((rank, href, auth))
        if rank is not None and href is not None and auth is not None:
            if int(auth) > AUTH_SCORE_THRESHOLD:
                blog = {
                    'link': urlnorm.norms(href),
                    # Alternative construction for archive.org URLs:
                    # blog = {'link': 'https://web.archive.org' + urlnorm.norms(href),
                    'rank': rank,
                    'auth_score': auth,
                }
                store_blog_ranking(dbinfo, blog)
                blog_count += 1
            else:
                done = True
    return (blog_count, done)
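# rank_regex, AUTH_SCORE_THRESHOLD, parse_site_details, parse_statistics and
# store_blog_ranking live elsewhere in the project; plausible shapes, purely
# as assumptions for illustration:

import re

rank_regex = re.compile(r'^\s*\d+\.\s*$')  # assumed: ranks render as "42."
AUTH_SCORE_THRESHOLD = 0                   # assumed cutoff


def parse_site_details(td):
    # assumed helper: first link inside a 'site-details' cell
    a = td.find('a')
    return a.get('href') if a is not None else None


def parse_statistics(td):
    # assumed helper: numeric authority score inside a 'statistics' cell
    text = td.get_text(strip=True)
    return text if text.isdigit() else None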
import urlnorm
from urllib.parse import urlsplit, urlunsplit


def clean_url(url):
    # normalize, then drop the fragment; scheme, host, path and query survive
    (scheme, netloc, path, query, frag) = urlsplit(urlnorm.norms(url))
    return urlunsplit((scheme, netloc, path, query, ''))
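# Illustrative behavior, assuming urlnorm.norms returns this URL unchanged:
#
#   clean_url('http://example.com/a/b?page=2#section-3')
#   -> 'http://example.com/a/b?page=2'
#
# Only the fragment is stripped; contrast with normalize() above, which also
# removes Apache sort parameters.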
import urlnorm
from urllib.parse import urlsplit


def mycanonicalization(urllink):
    # keep only scheme, host and path; query and fragment are dropped
    # (the path already starts with "/", so no extra separator is needed)
    parsedurl = urlsplit(urllink)
    return urlnorm.norms(parsedurl.scheme + "://" + parsedurl.netloc + parsedurl.path)
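# Illustrative behavior, assuming urlnorm.norms lowercases the host and
# otherwise returns the URL unchanged:
#
#   mycanonicalization('https://Example.COM/a/b?q=1#x')
#   -> 'https://example.com/a/b'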