def get_links(site_url, tags):  # site_url added to the signature: the body below needs it to resolve relative links
    links = []
    for link in tags.find_all('a'):
        url = link.get('href')
        # check whether we have a bunch of relative links, or tags, or scripts or even just garbage
        if validator.non_shitty_link(url):
            url = validator.clean_crappy_link(url)
            if validator.relative_link(url) or not (validator.has_http(url) or validator.has_https(url)):
                url = validator.make_non_relative_link(site_url, url)  # this seems inefficient...
            links.append(url)
    return links
def get_links(site_url, tags):
    links = []
    top_url = validator.get_top_level_url(site_url)
    for link in tags.find_all('a'):
        url = link.get('href')
        # print("Top:%s ---- and siteURL: %s ---- " % (top_url, site_url))
        if url and str(top_url) not in str(url) and not is_a_file(url) and not validator.skip_this_link(url):
            # check whether we have a bunch of relative links, or tags, or scripts or even just garbage
            if validator.non_shitty_link(url):
                url = validator.clean_crappy_link(url)
                if validator.relative_link(url) or not (validator.has_http(url) or validator.has_https(url)):
                    continue  # url = validator.make_non_relative_link(site_url, url)
                links.append(url)
        else:
            pass  # print("EXCLUDING %s for similarity/file issue. " % url)
    return links
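For context, a minimal usage sketch. It assumes the page is fetched with requests and parsed with BeautifulSoup (which is what the tags.find_all('a') call implies), and that the validator module and the is_a_file helper are defined elsewhere in this project; the example URL is a placeholder.

# Minimal usage sketch, assuming `validator` and `is_a_file` are importable
# from elsewhere in the project.
import requests
from bs4 import BeautifulSoup

site_url = 'https://example.com/'               # placeholder URL
response = requests.get(site_url)               # fetch the page
soup = BeautifulSoup(response.text, 'html.parser')  # soup supports find_all('a')
links = get_links(site_url, soup)               # extract the outbound links
print(links)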