# Shared imports assumed throughout this section (abbreviated as in the original
# source): datetime.datetime, requests, pydantic's HttpUrl, plus the project-local
# modules pyd (models), gen (generators), fetch (simulation), datsav (persistence
# helpers) and s (settings).

def generate_existing_internal_url(url: pyd.Url) -> pyd.Url:
    """Re-emit an already-known internal URL, keeping its pagerank and discovery date."""
    return pyd.Url(
        url=url.url,
        fqdn=url.fqdn,
        url_pagerank=url.url_pagerank,
        url_discovery_date=url.url_discovery_date,
    )
def generate_new_internal_url(url: pyd.Url) -> pyd.Url:
    """Derive a previously unseen URL on the same FQDN, with fresh pagerank and discovery date."""
    return pyd.Url(
        url=gen.get_similar_url(url).url,
        fqdn=gen.get_fqdn_from_url(url),
        url_pagerank=gen.random_pagerank(),
        url_discovery_date=datetime.now(),
    )
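# gen.get_similar_url and gen.get_fqdn_from_url are used above but not shown in
# this section. A minimal sketch of what they might look like, reconstructed
# from how they are called (an assumption, not the project's actual code):
from urllib.parse import urlparse

def get_fqdn_from_url(url: pyd.Url) -> str:
    # Extract the host part (the fully qualified domain name) from the URL string.
    return urlparse(str(url.url)).netloc

def get_similar_url(url: pyd.Url) -> pyd.Url:
    # Keep the FQDN but swap in a new random path, yielding a "similar" URL
    # on the same host.
    fqdn = get_fqdn_from_url(url)
    return pyd.Url(
        url="http://{}/{}{}".format(
            fqdn, get_random_german_text(), get_random_web_filename()
        ),
        fqdn=fqdn,
    )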
def generate_random_url(fqdn=None) -> pyd.Url:
    """Build a brand-new URL, either on the given FQDN or on a random one."""
    applied_fqdn = get_random_fqdn() if fqdn is None else fqdn
    return pyd.Url(
        url="http://{}/{}{}".format(
            applied_fqdn, get_random_german_text(), get_random_web_filename()
        ),
        fqdn=applied_fqdn,
        url_pagerank=random_pagerank(),
        url_discovery_date=datetime.now(),
    )
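# get_random_fqdn, get_random_german_text, get_random_web_filename and
# random_pagerank live elsewhere in the generator module. A rough, hypothetical
# sketch of their shape, for readability only:
import random
import string

def get_random_fqdn() -> str:
    # e.g. "www.abcdefgh.de" -- a random host label under a fixed TLD.
    label = "".join(random.choices(string.ascii_lowercase, k=8))
    return "www.{}.de".format(label)

def get_random_german_text() -> str:
    # A plausible German path segment; the real generator presumably draws
    # from a larger corpus.
    return random.choice(["impressum", "kontakt", "ueber-uns", "produkte"])

def get_random_web_filename() -> str:
    # A plausible file suffix for the generated path.
    return random.choice([".html", ".php", ".htm", ".aspx"])

def random_pagerank() -> float:
    # A small positive score, matching the magnitudes used in the tests below.
    return random.uniform(1e-8, 1e-4)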
def generate_existing_external_url(session: requests.Session, fqdn: str = None) -> pyd.Url:
    """Pick an already-known URL (optionally restricted to one FQDN) via the scheduler API."""
    url = gen.get_random_existing_url(session=session, fqdn=fqdn)
    return pyd.Url(
        url=url.url,
        fqdn=url.fqdn,
        url_pagerank=gen.random_pagerank(),
        url_discovery_date=datetime.now(),
    )
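# Taken together, the four generators cover the cases a simulated parser has to
# produce: re-visited internal links, newly discovered internal links, brand-new
# hosts, and known external links. A hypothetical driver tying them together
# (name and weights are illustrative, not taken from the project):
def generate_outlink(url: pyd.Url, session: requests.Session) -> pyd.Url:
    roll = random.random()
    if roll < 0.5:
        return generate_new_internal_url(url)       # new page on the same host
    if roll < 0.75:
        return generate_existing_internal_url(url)  # link back to a known page
    if roll < 0.9:
        return generate_random_url()                # link to a brand-new host
    return generate_existing_external_url(session)  # link to a known external page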
def test_url_to_dict():
    test_url = pyd.Url(url="https://www.example.com/abcefg", fqdn="www.example.com")
    assert datsav.url_dict(test_url) == {
        "url": "https://www.example.com/abcefg",
        "fqdn": "www.example.com",
        "url_discovery_date": None,
        "url_last_visited": None,
        "url_blacklisted": None,
        "url_bot_excluded": None,
        "url_pagerank": None,
    }
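# datsav.url_dict is exercised but not defined here. Assuming pyd.Url is a
# pydantic model, it is presumably little more than a thin wrapper -- a sketch:
def url_dict(url: pyd.Url) -> dict:
    # .dict() serialises every field, with unset optional fields as None,
    # which is exactly the shape the assertion above expects.
    return url.dict()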
def test_simulate_short_term_fetch():
    short_term_frontier = pyd.Frontier(
        fqdn="www.example.de",
        tld="de",
        fqdn_last_ipv4="123.456.78.91",
        fqdn_last_ipv6="2001:DB8::1234",
        fqdn_pagerank=0.00001,
        fqdn_crawl_delay=None,
        fqdn_url_count=2,
        url_list=[
            pyd.Url(
                url="http://www.example.de/html/index",
                fqdn="www.example.de",
                url_discovery_date=None,
                url_last_visited="2020-01-01T06:00:00",
                url_blacklisted=False,
                url_bot_excluded=False,
            ),
            pyd.Url(
                url="http://www.example.de/html/contact",
                fqdn="www.example.de",
                url_discovery_date=None,
                url_last_visited="2020-01-01T07:00:00",
                url_blacklisted=False,
                url_bot_excluded=False,
            ),
        ],
    )
    short_term_fetch_result = fetch.simulate_short_term_fetch(short_term_frontier)
    assert isinstance(short_term_fetch_result, list)
    # Every fetched URL yields itself plus max_links_per_page simulated outlinks.
    assert len(short_term_fetch_result) == len(short_term_frontier.url_list) * (
        s.max_links_per_page + 1
    )
    for result_url in short_term_fetch_result:
        assert isinstance(result_url, pyd.Url)
        assert isinstance(result_url.url, HttpUrl)
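# fetch.simulate_short_term_fetch is the unit under test. From the size
# assertion above, each input URL must come back together with its
# s.max_links_per_page generated outlinks. A sketch under that assumption
# (simulate_parse_url is the per-URL step tested further below; session is
# assumed to be a module-level requests.Session, matching the call in
# test_simulate_parse_url):
def simulate_short_term_fetch(frontier: pyd.Frontier) -> list:
    fetched_urls = []
    for url in frontier.url_list:
        # simulate_parse_url returns the re-visited URL plus its outlinks.
        fetched_urls.extend(simulate_parse_url(url, session))
    return fetched_urls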
def get_random_existing_url(session: requests.Session, fqdn: str = None) -> pyd.Url:
    """Ask the scheduler for a random known URL; fall back to a freshly generated one."""
    if fqdn is None:
        random_url = session.get(s.websch_random_urls_endpoint).json()
    else:
        random_url = session.get(
            "{}?fqdn={}".format(s.websch_random_urls_endpoint, fqdn)
        ).json()
    if len(random_url["url_list"]) == 0:
        # The scheduler knows no matching URL yet, so synthesise one instead,
        # honouring the requested fqdn in the fallback as well.
        new_url = generate_random_url(fqdn)
        random_url["url_list"].append(dict(url=new_url.url, fqdn=new_url.fqdn))
    return pyd.Url(
        url=random_url["url_list"][0]["url"],
        fqdn=random_url["url_list"][0]["fqdn"],
    )
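# Example use against a running scheduler instance (the endpoint comes from the
# settings module s; reusing one requests.Session keeps connection overhead low):
session = requests.Session()
known_url = get_random_existing_url(session=session, fqdn="www.example.de")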
def test_simulate_parse_url():
    url = pyd.Url(
        url="http://www.example.de/html/index",
        fqdn="www.example.de",
        url_discovery_date=None,
        url_last_visited="2020-01-01T06:00:00",
        url_blacklisted=False,
        url_bot_excluded=False,
    )
    parsed_list = fetch.simulate_parse_url(url, session)
    # Element 0 is the re-visited input URL: its last-visited timestamp moves on
    # while its discovery date stays unset. Later elements are discovered links.
    assert isinstance(parsed_list[0].url, HttpUrl)
    assert parsed_list[0].url_discovery_date is None
    assert parsed_list[0].url_last_visited != "2020-01-01T06:00:00"
    assert isinstance(parsed_list[1].url, HttpUrl)
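# fetch.simulate_parse_url itself is not shown. From the assertions above, a
# plausible sketch: return the input URL with a refreshed visit timestamp,
# followed by s.max_links_per_page simulated outlinks (generate_outlink is the
# hypothetical driver sketched earlier):
def simulate_parse_url(url: pyd.Url, session: requests.Session) -> list:
    visited_url = pyd.Url(
        url=url.url,
        fqdn=url.fqdn,
        url_discovery_date=url.url_discovery_date,
        url_last_visited=datetime.now(),  # the test checks this has changed
        url_blacklisted=url.url_blacklisted,
        url_bot_excluded=url.url_bot_excluded,
    )
    outlinks = [generate_outlink(url, session) for _ in range(s.max_links_per_page)]
    return [visited_url] + outlinks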
def test_simulate_fetch():
    frontier_partition = pyd.FrontierResponse(
        uuid="12345678-90ab-cdef-0000-000000000000",
        response_url="http://www.example.com/submit",
        latest_return="2020-10-10T23:00:00.000000",
        url_frontiers_count=2,
        urls_count=2,
        url_frontiers=[
            pyd.Frontier(
                fqdn="www.example.de",
                tld="de",
                fqdn_last_ipv4="123.456.78.90",
                fqdn_last_ipv6="2001:DB8::1234",
                fqdn_pagerank=0.00001,
                fqdn_crawl_delay=5,
                fqdn_url_count=1,
                url_list=[
                    pyd.Url(
                        url="http://www.example.de/html/index",
                        fqdn="www.example.de",
                        url_discovery_date=None,
                        url_last_visited="2020-01-01T06:00:00",
                        url_blacklisted=False,
                        url_bot_excluded=False,
                    ),
                ],
            ),
            pyd.Frontier(
                fqdn="www.example.com",
                tld="com",
                fqdn_last_ipv4="123.456.78.90",
                fqdn_last_ipv6="2001:DB8::1234",
                fqdn_pagerank=0.00001,
                fqdn_crawl_delay=5,
                fqdn_url_count=1,
                url_list=[
                    pyd.Url(
                        url="http://www.example.com/html/index",
                        fqdn="www.example.com",
                        url_discovery_date=None,
                        url_last_visited="2020-01-01T06:00:00",
                        url_blacklisted=False,
                        url_bot_excluded=False,
                    ),
                ],
            ),
        ],
    )
    processed_list = fetch.simulate_full_fetch(frontier_partition)
    assert processed_list.uuid == frontier_partition.uuid
    assert isinstance(processed_list.url_count, int)
    # As above: every input URL produces itself plus max_links_per_page outlinks.
    assert processed_list.url_count == frontier_partition.urls_count * (
        s.max_links_per_page + 1
    )
    for processed_url in processed_list.urls:
        assert isinstance(processed_url.url, HttpUrl)
        assert (
            isinstance(processed_url.url_discovery_date, datetime)
            or processed_url.url_discovery_date is None
        )
        assert (
            isinstance(processed_url.url_last_visited, datetime)
            or processed_url.url_last_visited is None
        )
        # Every URL must carry at least one of the two timestamps.
        assert isinstance(processed_url.url_discovery_date, datetime) or isinstance(
            processed_url.url_last_visited, datetime
        )
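# fetch.simulate_full_fetch is only exercised, not defined, in this section.
# It plausibly flattens the per-frontier fetches into one submit payload; the
# response model's name and fields (uuid, url_count, urls) are inferred from
# the assertions above, not taken from the project's code:
def simulate_full_fetch(partition: pyd.FrontierResponse):
    urls = []
    for frontier in partition.url_frontiers:
        urls.extend(simulate_short_term_fetch(frontier))
    # "UrlSubmit" is a hypothetical name for whatever model carries these fields.
    return pyd.UrlSubmit(uuid=partition.uuid, url_count=len(urls), urls=urls)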