def print_canonical(args):
    license_names = get_legalcode(args)
    grouped = [
        set(),  # 0: by* 4.0 licenses
        set(),  # 1: by* 3.0 licenses
        set(),  # 2: by* 2.5 licenses
        set(),  # 3: by* 2.1 licenses
        set(),  # 4: by* 2.0 licenses
        set(),  # 5: by* 1.x licenses
        set(),  # 6: miscellaneous licenses
        set(),  # 7: zero 1.0 public domain
        set(),  # 8: miscellaneous public domain
    ]
    for license_name in license_names:
        if not args.include_gnu:
            testname = license_name.lower()
            if testname.startswith("gpl") or testname.startswith("lgpl"):
                continue
        filename = license_name[:-len(".html")]
        url = create_base_link(args, filename, for_canonical=True)
        parts = url.split("/")
        bystar_starts = ("by", "nc", "nd", "sa")
        if parts[3] == "licenses" and parts[4].startswith(bystar_starts):
            if parts[5].startswith("4"):
                grouped[0].add(url)
            elif parts[5].startswith("3"):
                grouped[1].add(url)
            elif parts[5] == "2.5":
                grouped[2].add(url)
            elif parts[5] == "2.1":
                grouped[3].add(url)
            elif parts[5] == "2.0":
                grouped[4].add(url)
            elif parts[5].startswith("1"):
                grouped[5].add(url)
            else:
                grouped[6].add(url)
        elif parts[3] == "publicdomain" and parts[4] == "zero":
            grouped[7].add(url)
        else:
            grouped[8].add(url)
    for urls in grouped:
        for url in sorted(urls):
            print(url)
    return [], 0, 0
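
# The grouping above assumes canonical URLs of the form
# https://creativecommons.org/licenses/<code>/<version>/ (or
# https://creativecommons.org/publicdomain/zero/1.0/), so that after
# url.split("/") parts[3] is "licenses" or "publicdomain", parts[4] is the
# license code, and parts[5] is the version. A minimal illustration with a
# hypothetical URL (the real value comes from create_base_link()):
def _example_split_canonical_url():
    parts = "https://creativecommons.org/licenses/by-nc-sa/4.0/".split("/")
    assert parts[3] == "licenses"
    assert parts[4] == "by-nc-sa"
    assert parts[5] == "4.0"
    return parts
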
def check_legalcode(args):
    print("\n\nChecking LegalCode License...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        base_url = create_base_link(args, filename)
        context = f"\n\nChecking: legalcode\nURL: {base_url}"
        if args.local:
            source_html = request_local_text(LICENSE_LOCAL_PATH, license_name)
        else:
            page_url = "{}{}".format(LICENSE_GITHUB_BASE, license_name)
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_found = license_soup.find_all("a")
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args, base_url, links_found, context, context_printed
        )
        if valid_links:
            memoized_results = get_memoized_result(valid_links, valid_anchors)
            stored_links = memoized_results[0]
            stored_anchors = memoized_results[1]
            stored_result = memoized_results[2]
            check_links = memoized_results[3]
            check_anchors = memoized_results[4]
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links
                )
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                    rs, exception_handler=exception_handler
                ):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
                stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
                context,
                context_printed,
            )
        if caught_errors:
            errors_total += caught_errors
            exit_status = 1
    return license_names, errors_total, exit_status
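
# The `exception_handler` passed to grequests.map() above is defined elsewhere
# in this module; grequests invokes it as handler(request, exception) for any
# request that fails. A minimal sketch of such a callback (hypothetical name,
# not the module's actual implementation) is:
def _example_exception_handler(request, exception):
    # Return the exception itself so the failed request still occupies a slot
    # in the responses list; the AttributeError fallback in the loop above
    # then records that value in place of a status code.
    return exception
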
def check_deeds(args):
    print("\n\nChecking Deeds...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        deed_base_url = create_base_link(args, filename, for_deeds=True)
        # Deeds template:
        # https://github.com/creativecommons/cc.engine/blob/master/cc/engine/templates/licenses/standard_deed.html
        # Scraping the HTML found on the live site
        if deed_base_url:
            context = f"\n\nChecking: deed\nURL: {deed_base_url}"
            page_url = deed_base_url
            source_html = request_text(page_url)
            license_soup = BeautifulSoup(source_html, "lxml")
            links_found = license_soup.find_all("a")
            link_count = len(links_found)
            if args.log_level <= INFO:
                print(f"{context}\nNumber of links found: {link_count}")
                context_printed = True
            base_url = deed_base_url
            valid_anchors, valid_links, context_printed = get_scrapable_links(
                args, base_url, links_found, context, context_printed
            )
            if valid_links:
                memoized_results = get_memoized_result(
                    valid_links, valid_anchors
                )
                stored_links = memoized_results[0]
                stored_anchors = memoized_results[1]
                stored_result = memoized_results[2]
                check_links = memoized_results[3]
                check_anchors = memoized_results[4]
                if check_links:
                    rs = (
                        # Since we're only checking for validity, we can
                        # retrieve only the headers/metadata
                        grequests.head(link, timeout=REQUESTS_TIMEOUT)
                        for link in check_links
                    )
                    responses = list()
                    # Explicitly close connections to free up file handles
                    # and avoid Connection Errors per:
                    # https://stackoverflow.com/a/22839550
                    for response in grequests.map(
                        rs, exception_handler=exception_handler
                    ):
                        try:
                            responses.append(response.status_code)
                            response.close()
                        except AttributeError:
                            responses.append(response)
                    memoize_result(check_links, responses)
                    stored_anchors += check_anchors
                    stored_result += responses
                    stored_links += check_links
                caught_errors = write_response(
                    args,
                    stored_links,
                    stored_result,
                    base_url,
                    license_name,
                    stored_anchors,
                    context,
                    context_printed,
                )
        if caught_errors:
            errors_total += caught_errors
            exit_status = 1
    return license_names, errors_total, exit_status
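
# get_memoized_result() and memoize_result() are defined elsewhere in this
# module. A minimal sketch of the caching pattern implied by how they are
# used above (hypothetical names and a module-level dict; not the module's
# actual implementation):
_EXAMPLE_CACHE = {}


def _example_get_memoized_result(valid_links, valid_anchors):
    # Split links into those with a cached result and those still to check,
    # mirroring the five-element result unpacked in the checkers above.
    stored_links, stored_anchors, stored_result = [], [], []
    check_links, check_anchors = [], []
    for link, anchor in zip(valid_links, valid_anchors):
        if link in _EXAMPLE_CACHE:
            stored_links.append(link)
            stored_anchors.append(anchor)
            stored_result.append(_EXAMPLE_CACHE[link])
        else:
            check_links.append(link)
            check_anchors.append(anchor)
    return (
        stored_links,
        stored_anchors,
        stored_result,
        check_links,
        check_anchors,
    )


def _example_memoize_result(check_links, responses):
    # Record the freshly fetched status code (or error) for each checked link.
    _EXAMPLE_CACHE.update(zip(check_links, responses))
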