def rate_urllist_on_moment(urllist: UrlList, when: datetime = None, prevent_duplicates: bool = True):
    """Aggregate the latest url ratings of a list into a stored UrlListReport.

    When *when* is omitted the report is generated for the current moment (UTC).
    With *prevent_duplicates* set, no new report is saved if the most recent
    stored report holds an identical calculation.
    """
    # No time slice requested: report on "now".
    when = when if when else datetime.now(pytz.utc)

    log.info("Creating report for urllist %s on %s" % (urllist, when, ))

    # A report at this exact moment is never overwritten.
    if UrlListReport.objects.all().filter(urllist=urllist, at_when=when).exists():
        log.debug("UrllistReport already exists for %s on %s. Not overwriting." % (urllist, when))
        return

    relevant_urls = relevant_urls_at_timepoint_urllist(urllist=urllist, when=when)
    url_ratings = get_latest_urlratings_fast(relevant_urls, when)
    calculation = aggegrate_url_rating_scores(
        url_ratings, only_include_issues=urllist_report_content[urllist.scan_type])

    # Newest stored report, used for duplicate detection. An empty dummy
    # stands in when no report exists yet.
    try:
        previous_report = UrlListReport.objects.filter(
            urllist=urllist, at_when__lte=when).latest('at_when')
    except UrlListReport.DoesNotExist:
        previous_report = UrlListReport()  # create a dummy one for comparison

    calculation['name'] = urllist.name

    if prevent_duplicates and not DeepDiff(previous_report.calculation, calculation,
                                           ignore_order=True, report_repetition=True):
        log.warning("The report for %s on %s is the same as the report from %s. Not saving." % (
            urllist, when, previous_report.at_when))
        return

    log.info("The calculation for %s on %s has changed, so we're saving this rating." % (urllist, when))

    # The calculation doubles as constructor arguments. Work on a clone: the
    # deletions below would otherwise reach the dict stored on the report.
    constructor_kwargs = deepcopy(calculation)
    del constructor_kwargs['name']
    del constructor_kwargs['urls']

    report = UrlListReport(**constructor_kwargs)
    report.urllist = urllist
    report.at_when = when
    report.average_internet_nl_score = sum_internet_nl_scores_over_rating(calculation)
    report.calculation = calculation
    report.save()
def rate_urllist_on_moment(urllist: UrlList, when: datetime = None, prevent_duplicates: bool = True):
    """
    Aggregate the latest url ratings of this list into a single UrlListReport.

    :param urllist: the UrlList to build a report for
    :param when: A moment in time of which data should be aggregated; defaults to now (UTC)
    :param prevent_duplicates: If the last report had the same data, don't save a new report but
                               return the last report instead.
    :return: UrlListReport
    """
    # If there is no time slicing, then it's today.
    if not when:
        when = datetime.now(pytz.utc)

    log.info("Creating report for urllist %s on %s" % (urllist, when, ))

    # An existing report at this exact moment is reused, never overwritten.
    if UrlListReport.objects.all().filter(urllist=urllist, at_when=when).exists():
        log.debug("UrllistReport already exists for %s on %s. Not overwriting." % (urllist, when))
        existing_report = UrlListReport.objects.all().filter(urllist=urllist, at_when=when).first()
        return existing_report

    urls = relevant_urls_at_timepoint_urllist(urllist=urllist, when=when)
    all_url_ratings = get_latest_urlratings_fast(urls, when)

    # Clean the url_ratings to only include the content we need, only the content (being removed)
    # and only the endpoint types. Each urlrating.calculation is rewritten in place below.
    for urlrating in all_url_ratings:
        calculation = remove_issues_from_calculation(
            urlrating.calculation, urllist_report_content[urllist.scan_type])

        # Some endpoint types use the same ratings, such as dns_soa and dns_mx... This means that not
        # all endpoints will be removed for internet.nl. We need the following endpoints per scan:
        # -> note: urllist stores web/mail, they mean: web and mail_dashboard.
        # NOTE(review): this dict is loop-invariant and could be hoisted above the loop — harmless here.
        endpoint_types_per_scan = {"web": "dns_a_aaaa", "mail": "dns_soa"}
        calculation = only_include_endpoint_protocols(
            calculation, [endpoint_types_per_scan[urllist.scan_type]])

        # This already overrides endpoint statistics, use the calculation you get from this.
        calculation, amount_of_issues = statistics_over_url_calculation(calculation)
        # overwrite the rest of the statistics.
        calculation = add_statistics_to_calculation(calculation, amount_of_issues)

        urlrating.calculation = calculation

    # Aggregate the cleaned per-url ratings into one list-level calculation.
    calculation = aggegrate_url_rating_scores(all_url_ratings)

    # Newest stored report, used for duplicate detection; a dummy stands in when none exists.
    try:
        last = UrlListReport.objects.filter(urllist=urllist, at_when__lte=when).latest('at_when')
    except UrlListReport.DoesNotExist:
        last = UrlListReport()  # create a dummy one for comparison

    calculation['name'] = urllist.name

    if prevent_duplicates:
        if not DeepDiff(last.calculation, calculation, ignore_order=True, report_repetition=True):
            log.warning("The report for %s on %s is the same as the report from %s. Not saving." % (
                urllist, when, last.at_when))
            return last

    log.info("The calculation for %s on %s has changed, so we're saving this rating." % (urllist, when))

    # remove urls and name from scores object, so it can be used as initialization parameters (saves lines)
    # this is by reference, meaning that the calculation will be affected if we don't work on a clone.
    init_scores = deepcopy(calculation)
    del(init_scores['name'])
    del(init_scores['urls'])

    report = UrlListReport(**init_scores)
    report.urllist = urllist
    report.at_when = when
    report.average_internet_nl_score = sum_internet_nl_scores_over_rating(calculation)
    report.calculation = calculation
    report.save()
    return report
def rate_urllist_on_moment(urllist: UrlList, when: datetime = None,
                           prevent_duplicates: bool = True, scan_type: str = "web") -> int:
    """
    Create (or reuse) a UrlListReport for this list at a given moment.

    :param urllist: the UrlList to build a report for
    :param when: A moment in time of which data should be aggregated; defaults to now (UTC)
    :param prevent_duplicates: If the last report had the same data, don't save a new report but
                               return the last report instead.
    :param scan_type: key into the report_type mapping: 'web', 'mail' or 'mail_dashboard'
    :return: UrlListReport id
    """
    # If there is no time slicing, then it's today.
    if not when:
        when = datetime.now(pytz.utc)

    log.info(f"Creating report for urllist {urllist} on {when}")

    # A report at this exact moment is reused, never overwritten. A single
    # first() replaces the former exists() + first() pair: one query less.
    existing_report = UrlListReport.objects.filter(urllist=urllist, at_when=when).first()
    if existing_report:
        log.debug(f"UrllistReport already exists for {urllist} on {when}. Not overwriting.")
        return int(existing_report.id)

    urls = relevant_urls_at_timepoint_urllist(urllist=urllist, when=when)
    log.debug(f'Found {len(urls)} to be relevant at this moment.')

    calculation = create_calculation_on_urls(urls, when, scan_type=scan_type)

    # Newest stored report, used for duplicate detection; a dummy stands in
    # when no report exists yet.
    try:
        last = UrlListReport.objects.filter(urllist=urllist, at_when__lte=when).latest('at_when')
    except UrlListReport.DoesNotExist:
        last = UrlListReport()  # create a dummy one for comparison

    calculation['name'] = urllist.name

    if prevent_duplicates:
        if not DeepDiff(last.calculation, calculation, ignore_order=True, report_repetition=True):
            log.info(f"The report for {urllist} on {when} is the same as the report from {last.at_when}. Not saving.")
            return int(last.id)

    log.info(f"The calculation for {urllist} on {when} has changed, so we're saving this rating.")

    # remove urls and name from scores object, so it can be used as initialization parameters (saves lines)
    # this is by reference, meaning that the calculation will be affected if we don't work on a clone.
    init_scores = deepcopy(calculation)
    del init_scores['name']
    del init_scores['urls']

    # urllist stores 'web'/'mail'/'mail_dashboard'; stored reports only distinguish web vs mail.
    external_scan_type = {"web": "web", "mail": "mail", "mail_dashboard": "mail"}

    report = UrlListReport(**init_scores)
    report.urllist = urllist
    report.report_type = external_scan_type[scan_type]
    report.at_when = when
    report.average_internet_nl_score = sum_internet_nl_scores_over_rating(calculation)
    report.calculation = calculation
    report.save()
    return int(report.id)