def get_soup_impressum(website_links_list):
    """Return the parsed page for the first link that looks like an
    imprint/legal/contact page, or None if no link matches.

    Keywords are tried in priority order ("impressum" first), so a site
    with both an "impressum" and a "contact" link yields the imprint page.

    Args:
        website_links_list: iterable of URL strings to scan.

    Returns:
        The soup returned by ``get_soup`` for the first matching link,
        or ``None`` when no link contains any keyword.
    """
    impressumlist = [
        "impressum", "legal", "imprint", "disclaimer", "kontakt", "contact"
    ]
    for impressum in impressumlist:
        for link in website_links_list:
            # Case-insensitive substring match anywhere in the URL.
            if impressum in link.lower():
                print(link)
                # Original had an unreachable `break` after this return;
                # removed as dead code.
                return get_soup(link)
    return None
def get_soup_northdata(firma):
    """Fetch and validate the northdata.de search page for a company name.

    The company name's space-separated tokens are joined with "+" to form
    the search URL (e.g. "Foo Bar GmbH" -> ".../Foo+Bar+GmbH").

    Args:
        firma: company name as a plain string.

    Returns:
        Whatever ``test_northdata`` returns for the fetched search page
        (presumably a validated soup or None — confirm against callers).
    """
    base_url = "https://www.northdata.de/"
    # "+".join replaces the original manual loop that accumulated tokens
    # into a one-element list and stripped a trailing "+"; output is
    # byte-identical (consecutive spaces still yield consecutive "+").
    query = "+".join(firma.split(" "))
    search_url = base_url + query
    soup = get_soup(search_url)
    return test_northdata(soup)
def GetLinks(url):
    """Return a de-duplicated list of hyperlinks found on *url*.

    Relative hrefs (those not containing "http") are prefixed with *url*
    (with any trailing slash stripped); hrefs shorter than 4 characters
    are ignored.

    Bug fixed: anchors with no ``href`` attribute previously produced the
    literal string "None" (``str(link_html.get('href'))``), which passed
    the length filter and was appended as ``url + "None"``. Such anchors
    are now skipped.

    Args:
        url: page URL to fetch and scan for <a> tags.

    Returns:
        list of unique link strings (order unspecified due to set()).
    """
    raw_links = []
    soup = get_soup(url)
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Skip anchors without an href attribute (was the "None" bug).
        if href is None:
            continue
        link = str(href)
        if len(link) >= 4:
            # Avoid a double slash when joining relative links.
            if url[-1] == "/":
                url = url[:-1]
            # NOTE(review): substring test misclassifies hrefs like
            # "/http-guide"; urllib.parse.urljoin would be more robust,
            # but that would change output for some inputs — left as-is.
            if 'http' not in link:
                link = url + link
            raw_links.append(link)
    return list(set(raw_links))