# Imports assumed to match the surrounding module; the model and helper imports
# (UrlList, UrlListReport, relevant_urls_at_timepoint_urllist, ...) are project-specific.
from copy import deepcopy
from datetime import datetime
import logging

import pytz
from deepdiff import DeepDiff

log = logging.getLogger(__name__)


def rate_urllist_on_moment(urllist: UrlList, when: datetime = None, prevent_duplicates: bool = True):
    # If there is no time slicing, then it's today.
    if not when:
        when = datetime.now(pytz.utc)

    log.info("Creating report for urllist %s on %s" % (urllist, when))

    if UrlListReport.objects.all().filter(urllist=urllist, at_when=when).exists():
        log.debug("UrllistReport already exists for %s on %s. Not overwriting." % (urllist, when))
        return

    urls = relevant_urls_at_timepoint_urllist(urllist=urllist, when=when)
    all_url_ratings = get_latest_urlratings_fast(urls, when)
    calculation = aggegrate_url_rating_scores(
        all_url_ratings, only_include_issues=urllist_report_content[urllist.scan_type])

    try:
        last = UrlListReport.objects.filter(urllist=urllist, at_when__lte=when).latest('at_when')
    except UrlListReport.DoesNotExist:
        last = UrlListReport()  # create a dummy one for comparison

    calculation['name'] = urllist.name

    if prevent_duplicates:
        if not DeepDiff(last.calculation, calculation, ignore_order=True, report_repetition=True):
            log.warning("The report for %s on %s is the same as the report from %s. Not saving." % (
                urllist, when, last.at_when))
            return

    log.info("The calculation for %s on %s has changed, so we're saving this rating." % (urllist, when))

    # Remove urls and name from the scores object, so it can be used as initialization parameters (saves lines).
    # The dict is passed by reference, so the calculation would be affected if we didn't work on a clone.
    init_scores = deepcopy(calculation)
    del init_scores['name']
    del init_scores['urls']

    report = UrlListReport(**init_scores)
    report.urllist = urllist
    report.at_when = when
    report.average_internet_nl_score = sum_internet_nl_scores_over_rating(calculation)
    report.calculation = calculation
    report.save()
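# The `prevent_duplicates` guard above relies on DeepDiff returning an empty, falsy result
# for structurally identical calculations. A minimal standalone sketch of that mechanism,
# using hypothetical report dicts (not real report data):

from deepdiff import DeepDiff

previous = {"high": 19, "urls": [{"url": "apple.com", "high": 19}]}
unchanged = {"high": 19, "urls": [{"url": "apple.com", "high": 19}]}
changed = {"high": 18, "urls": [{"url": "apple.com", "high": 18}]}

# An empty DeepDiff is falsy, so `not DeepDiff(...)` reads as "nothing changed":
assert not DeepDiff(previous, unchanged, ignore_order=True, report_repetition=True)

# A real change yields a truthy diff describing what was altered:
assert "values_changed" in DeepDiff(previous, changed, ignore_order=True, report_repetition=True)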
def test_report_upgrade(db, monkeypatch) -> None:
    # Create a urllist with a lot of unscannable domains; only apple.com is scannable.
    # megaupload.com will never be scannable, and the rest can have an endpoint and might be in the report
    # already because of this (but without endpoints).
    urls = ['akamaihd.net', 'apple.com', 'bp.blogspot.com', 'clickbank.net', 'cocolog-nifty.com', 'fda.gov',
            'geocities.jp', 'ggpht.com', 'googleusercontent.com', 'megaupload.com', 'nhk.or.jp',
            'ssl-images-amazon.com', 'ytimg.com']

    # Create the list; code taken from the domain management tests:
    account, created = Account.objects.all().get_or_create(name="test")
    urllist = UrlList()
    urllist.name = "upgrade"
    urllist.account = account
    urllist.save()

    scan = AccountInternetNLScan()
    scan.urllist = urllist
    scan.account = account
    scan.save()

    for url in urls:
        new_url = Url()
        new_url.url = url
        new_url.save()
        urllist.urls.add(new_url)
        urllist.save()

    # Fake a report on these domains, without any upgrades, taken from the acc environment:
    fake_calculation = {
        "high": 19, "medium": 4, "low": 3, "ok": 15,
        "total_urls": 1, "high_urls": 1, "medium_urls": 0, "low_urls": 0, "ok_urls": 0,
        "explained_high": 0, "explained_medium": 0, "explained_low": 0,
        "explained_high_endpoints": 0, "explained_medium_endpoints": 0, "explained_low_endpoints": 0,
        "explained_high_urls": 0, "explained_medium_urls": 0, "explained_low_urls": 0,
        "explained_total_url_issues": 0, "explained_url_issues_high": 0,
        "explained_url_issues_medium": 0, "explained_url_issues_low": 0,
        "explained_total_endpoint_issues": 0, "explained_endpoint_issues_high": 0,
        "explained_endpoint_issues_medium": 0, "explained_endpoint_issues_low": 0,
        "total_endpoints": 1, "high_endpoints": 1, "medium_endpoints": 0, "low_endpoints": 0,
        "ok_endpoints": 0,
        "total_url_issues": 0, "total_endpoint_issues": 26,
        "url_issues_high": 0, "url_issues_medium": 0, "url_issues_low": 0,
        "endpoint_issues_high": 19, "endpoint_issues_medium": 4, "endpoint_issues_low": 3,
        "urls": [
            {
                "url": "apple.com",
                "ratings": [],
                "endpoints": [
                    {
                        "id": 4599,
                        "concat": "dns_a_aaaa/0 IPv0",
                        "ip": 0,
                        "ip_version": 0,
                        "port": 0,
                        "protocol": "dns_a_aaaa",
                        "v4": False,
                        "ratings": [
                            {
                                "type": "internet_nl_web_ipv6_ws_address",
                                "explanation": "Test internet_nl_web_ipv6_ws_address resulted in failed.",
                                "since": "2020-01-15T13:00:01.116013+00:00",
                                "last_scan": "2020-01-15T13:00:01.116689+00:00",
                                "high": 1, "medium": 0, "low": 0, "ok": 0,
                                "not_testable": False,
                                "not_applicable": False,
                                "error_in_test": False,
                                "is_explained": False,
                                "comply_or_explain_explanation": "",
                                "comply_or_explain_explained_on": "",
                                "comply_or_explain_explanation_valid_until": "",
                                "comply_or_explain_valid_at_time_of_report": False,
                                "scan": 114575,
                                "scan_type": "internet_nl_web_ipv6_ws_address"
                            },
                            {
                                "type": "internet_nl_web_dnssec_valid",
                                "explanation": "Test internet_nl_web_dnssec_valid resulted in failed.",
                                "since": "2020-01-15T13:00:00.684906+00:00",
                                "last_scan": "2020-01-15T13:00:00.685193+00:00",
                                "high": 1, "medium": 0, "low": 0, "ok": 0,
                                "not_testable": False,
                                "not_applicable": False,
                                "error_in_test": False,
                                "is_explained": False,
                                "comply_or_explain_explanation": "",
                                "comply_or_explain_explained_on": "",
                                "comply_or_explain_explanation_valid_until": "",
                                "comply_or_explain_valid_at_time_of_report": False,
                                "scan": 114556,
                                "scan_type": "internet_nl_web_dnssec_valid"
                            },
                        ],
                        "high": 19, "medium": 4, "low": 3, "ok": 15,
                        "explained_high": 0, "explained_medium": 0, "explained_low": 0
                    }
                ],
                "total_issues": 26,
                "high": 19, "medium": 4, "low": 3, "ok": 15,
                "total_endpoints": 1, "high_endpoints": 1, "medium_endpoints": 0, "low_endpoints": 0,
                "ok_endpoints": 0,
                "total_url_issues": 0, "url_issues_high": 0, "url_issues_medium": 0, "url_issues_low": 0,
                "url_ok": 0,
                "total_endpoint_issues": 26, "endpoint_issues_high": 19, "endpoint_issues_medium": 4,
                "endpoint_issues_low": 3,
                "explained_total_issues": 0, "explained_high": 0, "explained_medium": 0, "explained_low": 0,
                "explained_high_endpoints": 0, "explained_medium_endpoints": 0, "explained_low_endpoints": 0,
                "explained_total_url_issues": 0, "explained_url_issues_high": 0,
                "explained_url_issues_medium": 0, "explained_url_issues_low": 0,
                "explained_total_endpoint_issues": 0, "explained_endpoint_issues_high": 0,
                "explained_endpoint_issues_medium": 0, "explained_endpoint_issues_low": 0
            }
        ],
        "total_issues": 26,
        "name": "Unscannable Web + one scannable"
    }

    fake_report = UrlListReport()
    fake_report.calculation = fake_calculation
    fake_report.urllist = urllist
    fake_report.at_when = timezone.now()
    fake_report.save()

    # First check if we are removing the comply_or_explain keys, mainly to save data:
    remove_comply_or_explain(fake_calculation)
    assert "explained_endpoint_issues_high" not in fake_calculation['urls'][0]
    assert "comply_or_explain_explanation" not in fake_calculation['urls'][0]['endpoints'][0]["ratings"][0]

    # Now add ratings keyed by type, which makes direct access possible:
    add_keyed_ratings(fake_calculation)
    assert "ratings_by_type" in fake_calculation['urls'][0]['endpoints'][0]
    assert "internet_nl_web_ipv6_ws_address" in fake_calculation['urls'][0]['endpoints'][0]['ratings_by_type']

    # Add graph statistics, so the graphs can be created instantly from report data:
    add_statistics_over_ratings(fake_calculation)
    assert "statistics_per_issue_type" in fake_calculation
    assert "internet_nl_web_ipv6_ws_address" in fake_calculation["statistics_per_issue_type"]

    # todo: we can add some tests here to see if the aggregation is correct

    # Add some percentages over all these metrics:
    add_percentages_to_statistics(fake_calculation)
    assert "pct_ok" in fake_calculation["statistics_per_issue_type"]["internet_nl_web_ipv6_ws_address"]

    # And make sure the report is complete: all requested urls must be present, even though they
    # could not be scanned. So a top 100 stays a top 100.
    assert len(fake_calculation['urls']) == 1
    upgrade_report_with_unscannable_urls(fake_report.id, scan.id)
    fake_report = UrlListReport.objects.all().first()
    assert len(fake_report.calculation['urls']) == len(urls)

    # The first url should still be apple.com:
    assert fake_report.calculation['urls'][0]['url'] == "apple.com"
def rate_urllist_on_moment(urllist: UrlList, when: datetime = None, prevent_duplicates: bool = True):
    """
    :param urllist: The list of urls to report on.
    :param when: A moment in time of which data should be aggregated.
    :param prevent_duplicates: If the last report had the same data, don't save a new report but
                               return the last report instead.
    :return: UrlListReport
    """
    # If there is no time slicing, then it's today.
    if not when:
        when = datetime.now(pytz.utc)

    log.info("Creating report for urllist %s on %s" % (urllist, when))

    if UrlListReport.objects.all().filter(urllist=urllist, at_when=when).exists():
        log.debug("UrllistReport already exists for %s on %s. Not overwriting." % (urllist, when))
        existing_report = UrlListReport.objects.all().filter(urllist=urllist, at_when=when).first()
        return existing_report

    urls = relevant_urls_at_timepoint_urllist(urllist=urllist, when=when)
    all_url_ratings = get_latest_urlratings_fast(urls, when)

    # Clean the url_ratings to only include the content we need: only the issues that belong
    # in this report and only the endpoint types for this scan type.
    for urlrating in all_url_ratings:
        calculation = remove_issues_from_calculation(
            urlrating.calculation, urllist_report_content[urllist.scan_type])

        # Some endpoint types use the same ratings, such as dns_soa and dns_mx... This means that not
        # all endpoints will be removed for internet.nl. We need the following endpoints per scan:
        # -> note: urllist stores web/mail; these mean web and mail_dashboard respectively.
        endpoint_types_per_scan = {"web": "dns_a_aaaa", "mail": "dns_soa"}
        calculation = only_include_endpoint_protocols(
            calculation, [endpoint_types_per_scan[urllist.scan_type]])

        # This already overrides endpoint statistics; use the calculation you get from this.
        calculation, amount_of_issues = statistics_over_url_calculation(calculation)
        # Overwrite the rest of the statistics.
        calculation = add_statistics_to_calculation(calculation, amount_of_issues)

        urlrating.calculation = calculation

    calculation = aggegrate_url_rating_scores(all_url_ratings)

    try:
        last = UrlListReport.objects.filter(urllist=urllist, at_when__lte=when).latest('at_when')
    except UrlListReport.DoesNotExist:
        last = UrlListReport()  # create a dummy one for comparison

    calculation['name'] = urllist.name

    if prevent_duplicates:
        if not DeepDiff(last.calculation, calculation, ignore_order=True, report_repetition=True):
            log.warning("The report for %s on %s is the same as the report from %s. Not saving." % (
                urllist, when, last.at_when))
            return last

    log.info("The calculation for %s on %s has changed, so we're saving this rating." % (urllist, when))

    # Remove urls and name from the scores object, so it can be used as initialization parameters (saves lines).
    # The dict is passed by reference, so the calculation would be affected if we didn't work on a clone.
    init_scores = deepcopy(calculation)
    del init_scores['name']
    del init_scores['urls']

    report = UrlListReport(**init_scores)
    report.urllist = urllist
    report.at_when = when
    report.average_internet_nl_score = sum_internet_nl_scores_over_rating(calculation)
    report.calculation = calculation
    report.save()
    return report
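# The per-scan-type endpoint filter above keeps a single endpoint protocol (dns_a_aaaa for web,
# dns_soa for mail). `only_include_endpoint_protocols` is project code; this is a sketch of the
# filtering it implies, assuming the per-url calculation carries an `endpoints` list whose
# entries have a `protocol` field (as in the test fixture above):

def only_include_endpoint_protocols_sketch(url_calculation: dict, allowed_protocols: list) -> dict:
    # Drop every endpoint whose protocol is irrelevant for this scan type.
    url_calculation["endpoints"] = [
        endpoint for endpoint in url_calculation.get("endpoints", [])
        if endpoint.get("protocol") in allowed_protocols
    ]
    return url_calculation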
def rate_urllist_on_moment(urllist: UrlList, when: datetime = None, prevent_duplicates: bool = True,
                           scan_type: str = "web") -> int:
    """
    :param urllist: The list of urls to report on.
    :param when: A moment in time of which data should be aggregated.
    :param prevent_duplicates: If the last report had the same data, don't save a new report but
                               return the last report instead.
    :param scan_type: One of "web", "mail" or "mail_dashboard"; determines the report type.
    :return: UrlListReport id
    """
    # If there is no time slicing, then it's today.
    if not when:
        when = datetime.now(pytz.utc)

    log.info(f"Creating report for urllist {urllist} on {when}")

    if UrlListReport.objects.all().filter(urllist=urllist, at_when=when).exists():
        log.debug(f"UrllistReport already exists for {urllist} on {when}. Not overwriting.")
        existing_report = UrlListReport.objects.all().filter(urllist=urllist, at_when=when).first()
        return int(existing_report.id)

    urls = relevant_urls_at_timepoint_urllist(urllist=urllist, when=when)
    log.debug(f'Found {len(urls)} urls to be relevant at this moment.')

    calculation = create_calculation_on_urls(urls, when, scan_type=scan_type)

    try:
        last = UrlListReport.objects.filter(urllist=urllist, at_when__lte=when).latest('at_when')
    except UrlListReport.DoesNotExist:
        last = UrlListReport()  # create a dummy one for comparison

    calculation['name'] = urllist.name

    if prevent_duplicates:
        if not DeepDiff(last.calculation, calculation, ignore_order=True, report_repetition=True):
            log.info(f"The report for {urllist} on {when} is the same as the report from {last.at_when}. "
                     f"Not saving.")
            return int(last.id)

    log.info(f"The calculation for {urllist} on {when} has changed, so we're saving this rating.")

    # Remove urls and name from the scores object, so it can be used as initialization parameters (saves lines).
    # The dict is passed by reference, so the calculation would be affected if we didn't work on a clone.
    init_scores = deepcopy(calculation)
    del init_scores['name']
    del init_scores['urls']

    # The urllist stores web/mail internally; external report types only distinguish web and mail.
    external_scan_type = {"web": "web", "mail": "mail", "mail_dashboard": "mail"}

    report = UrlListReport(**init_scores)
    report.urllist = urllist
    report.report_type = external_scan_type[scan_type]
    report.at_when = when
    report.average_internet_nl_score = sum_internet_nl_scores_over_rating(calculation)
    report.calculation = calculation
    report.save()
    return int(report.id)
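# This version returns a plain id rather than a model instance, which keeps the return value
# trivially serializable (useful if the function runs as an async task; that rationale is an
# assumption). A hypothetical call site, assuming a saved `urllist` as created in the test above:

report_id = rate_urllist_on_moment(urllist, prevent_duplicates=True, scan_type="web")
report = UrlListReport.objects.get(id=report_id)
log.info(f"Report {report.id}: average internet.nl score {report.average_internet_nl_score}")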