Example #1
import validator  # project-local helper module used by both examples

def get_links(site_url, tags):
    links = []
    for link in tags.find_all('a'):
        url = link.get('href')
        # check whether we have a bunch of relative links, or tags, or scripts, or even just garbage
        if validator.non_shitty_link(url):
            url = validator.clean_crappy_link(url)
            # resolve relative links (and anything without an http/https scheme) against the site URL
            if validator.relative_link(url) or not (validator.has_http(url) or validator.has_https(url)):
                url = validator.make_non_relative_link(site_url, url)  # this seems inefficient...
            links.append(url)
    return links
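
Both examples iterate tags.find_all('a'), so tags is expected to behave like a parsed BeautifulSoup document, and both lean on the project-local validator module. A minimal usage sketch under those assumptions, with requests standing in as a hypothetical fetch layer and example.com as a placeholder URL:

import requests
from bs4 import BeautifulSoup

site_url = 'https://example.com'  # hypothetical target site
# parse the fetched page so find_all('a') yields its anchor tags
tags = BeautifulSoup(requests.get(site_url).text, 'html.parser')
for url in get_links(site_url, tags):
    print(url)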
Example #2
def get_links(site_url, tags):
    links = []
    top_url = validator.get_top_level_url(site_url)
    for link in tags.find_all('a'):
        url = link.get('href')
        # exclude empty hrefs, links back into the same top-level site,
        # direct file links, and anything the validator says to skip
        if url and str(top_url) not in str(url) and not is_a_file(url) and not validator.skip_this_link(url):
            # check whether we have a bunch of relative links, or tags, or scripts, or even just garbage
            if validator.non_shitty_link(url):
                url = validator.clean_crappy_link(url)
                # unlike Example #1, relative links (anything without an http/https
                # scheme) are skipped here rather than resolved against site_url
                if validator.relative_link(url) or not (validator.has_http(url) or validator.has_https(url)):
                    continue
                links.append(url)
    return links
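
Example #2 additionally filters through is_a_file and validator.skip_this_link, neither of which is shown in these snippets. A hypothetical sketch of what is_a_file might look like, assuming a simple extension check; the real helper's logic is not shown here:

def is_a_file(url):
    # hypothetical: drop any query string, then treat common
    # document/binary extensions as direct file links
    return url.lower().rsplit('?', 1)[0].endswith(
        ('.pdf', '.jpg', '.jpeg', '.png', '.gif', '.zip', '.mp3', '.mp4'))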