def check_legalcode(args):
    """Link-check every legalcode HTML file.

    For each legalcode page found by ``get_legalcode(args)``, fetch its HTML
    (from the local checkout when ``args.local`` is set, otherwise from
    GitHub), collect all ``<a>`` tags, validate the scrapable links (using
    the memoization cache to skip already-checked URLs), and report failures
    via ``write_response``.

    Args:
        args: parsed CLI arguments; this function reads ``args.log_level``
            and ``args.local`` directly and passes ``args`` through to the
            helper functions.

    Returns:
        tuple: ``(license_names, errors_total, exit_status)`` where
        ``license_names`` is the list of checked filenames, ``errors_total``
        is the summed error count, and ``exit_status`` is 0 on success or
        1 if any errors were caught.
    """
    print("\n\nChecking LegalCode License...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[: -len(".html")]
        base_url = create_base_link(args, filename)
        context = f"\n\nChecking: legalcode\nURL: {base_url}"
        if args.local:
            source_html = request_local_text(LICENSE_LOCAL_PATH, license_name)
        else:
            page_url = "{}{}".format(LICENSE_GITHUB_BASE, license_name)
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_found = license_soup.find_all("a")
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args, base_url, links_found, context, context_printed
        )
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links
                )
                responses = []
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                    rs, exception_handler=exception_handler
                ):
                    # Failed requests yield objects without status_code
                    # (e.g. None); record them as-is. Guard ONLY the
                    # attribute read so a response can never be appended
                    # twice (the original also wrapped close(), so an
                    # AttributeError there double-appended the response).
                    try:
                        status = response.status_code
                    except AttributeError:
                        responses.append(response)
                    else:
                        responses.append(status)
                        response.close()
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
                stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
                context,
                context_printed,
            )
        if caught_errors:
            errors_total += caught_errors
            exit_status = 1
    return license_names, errors_total, exit_status
def check_rdfs(args, index=False):
    """Link-check RDF files, or the objects/sections of index.rdf.

    For each RDF object returned by ``get_index_rdf(args)`` (when ``index``
    is True) or ``get_rdf(args)`` (otherwise), extract its links with
    ``get_links_from_rdf``, validate the scrapable ones (using the
    memoization cache to skip already-checked URLs), and report failures
    via ``write_response``.

    Args:
        args: parsed CLI arguments; ``args.log_level`` is read directly and
            ``args`` is passed through to the helper functions.
        index: when True, check the objects/sections of index.rdf instead
            of individual per-license RDF files.

    Returns:
        tuple: ``(rdf_obj_list, errors_total, exit_status)`` where
        ``rdf_obj_list`` is the list of checked RDF objects,
        ``errors_total`` is the summed error count, and ``exit_status``
        is 0 on success or 1 if any errors were caught.
    """
    if index:
        print("\n\nChecking index.rdf...\n\n")
        rdf_obj_list = get_index_rdf(args)
    else:
        print("\n\nChecking RDFs...\n\n")
        rdf_obj_list = get_rdf(args)
    if args.log_level <= INFO:
        if not index:
            print("Number of RDF files to be checked:", len(rdf_obj_list))
        else:
            print(
                "Number of RDF objects/sections to be checked in index.rdf:",
                len(rdf_obj_list),
            )
    errors_total = 0
    exit_status = 0
    for rdf_obj in rdf_obj_list:
        caught_errors = 0
        context_printed = False
        # index.rdf entries are addressed directly by rdf:about; individual
        # files live at the rdf:about URL with an "rdf" suffix appended.
        rdf_url = (
            rdf_obj["rdf:about"] if index else f"{rdf_obj['rdf:about']}rdf"
        )
        links_found = get_links_from_rdf(rdf_obj)
        checking = "URL" if not index else "RDF_ABOUT"
        context = f"\n\nChecking: \n{checking}: {rdf_url}"
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        base_url = rdf_url
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args,
            base_url,
            links_found,
            context,
            context_printed,
            rdf=True,
        )
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity,
                    # we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links
                )
                responses = []
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                    rs, exception_handler=exception_handler
                ):
                    # Failed requests yield objects without status_code
                    # (e.g. None); record them as-is. Guard ONLY the
                    # attribute read so a response can never be appended
                    # twice (the original also wrapped close(), so an
                    # AttributeError there double-appended the response).
                    try:
                        status = response.status_code
                    except AttributeError:
                        responses.append(response)
                    else:
                        responses.append(status)
                        response.close()
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
                stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                rdf_url,
                rdf_obj,
                stored_anchors,
                context,
                context_printed,
            )
        if caught_errors:
            errors_total += caught_errors
            exit_status = 1
    return rdf_obj_list, errors_total, exit_status
def check_deeds(args):
    """Link-check the deed page of every license.

    For each legalcode file found by ``get_legalcode(args)``, derive the
    deed URL via ``create_base_link(..., for_deeds=True)``, scrape the live
    deed page's ``<a>`` tags, validate the scrapable links (using the
    memoization cache to skip already-checked URLs), and report failures
    via ``write_response``. Licenses for which no deed URL can be derived
    are skipped.

    Args:
        args: parsed CLI arguments; ``args.log_level`` is read directly and
            ``args`` is passed through to the helper functions.

    Returns:
        tuple: ``(license_names, errors_total, exit_status)`` where
        ``license_names`` is the list of checked filenames, ``errors_total``
        is the summed error count, and ``exit_status`` is 0 on success or
        1 if any errors were caught.
    """
    print("\n\nChecking Deeds...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[: -len(".html")]
        deed_base_url = create_base_link(args, filename, for_deeds=True)
        # Deeds template:
        # https://github.com/creativecommons/cc.engine/blob/master/cc/engine/templates/licenses/standard_deed.html
        # Scrapping the html found on the active site
        if deed_base_url:
            context = f"\n\nChecking: deed\nURL: {deed_base_url}"
            page_url = deed_base_url
            source_html = request_text(page_url)
            license_soup = BeautifulSoup(source_html, "lxml")
            links_found = license_soup.find_all("a")
            link_count = len(links_found)
            if args.log_level <= INFO:
                print(f"{context}\nNumber of links found: {link_count}")
                context_printed = True
            base_url = deed_base_url
            valid_anchors, valid_links, context_printed = get_scrapable_links(
                args, base_url, links_found, context, context_printed
            )
            if valid_links:
                memoized_results = get_memoized_result(
                    valid_links, valid_anchors
                )
                stored_links = memoized_results[0]
                stored_anchors = memoized_results[1]
                stored_result = memoized_results[2]
                check_links = memoized_results[3]
                check_anchors = memoized_results[4]
                if check_links:
                    rs = (
                        # Since we're only checking for validity,
                        # we can retrieve
                        # only the headers/metadata
                        grequests.head(link, timeout=REQUESTS_TIMEOUT)
                        for link in check_links
                    )
                    responses = []
                    # Explicitly close connections to free up file handles
                    # and avoid Connection Errors per:
                    # https://stackoverflow.com/a/22839550
                    for response in grequests.map(
                        rs, exception_handler=exception_handler
                    ):
                        # Failed requests yield objects without status_code
                        # (e.g. None); record them as-is. Guard ONLY the
                        # attribute read so a response can never be appended
                        # twice (the original also wrapped close(), so an
                        # AttributeError there double-appended the response).
                        try:
                            status = response.status_code
                        except AttributeError:
                            responses.append(response)
                        else:
                            responses.append(status)
                            response.close()
                    memoize_result(check_links, responses)
                    stored_anchors += check_anchors
                    stored_result += responses
                    stored_links += check_links
                caught_errors = write_response(
                    args,
                    stored_links,
                    stored_result,
                    base_url,
                    license_name,
                    stored_anchors,
                    context,
                    context_printed,
                )
        if caught_errors:
            errors_total += caught_errors
            exit_status = 1
    return license_names, errors_total, exit_status