Example #1
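This example and the two that follow are excerpts from a larger link-checker module; they assume helpers such as get_legalcode(), create_base_link(), get_scrapable_links(), write_response(), request_text(), and request_local_text(), module constants like INFO, REQUESTS_TIMEOUT, LICENSE_GITHUB_BASE, and LICENSE_LOCAL_PATH, and imports of BeautifulSoup (from bs4) and grequests.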
def print_canonical(args):
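    """Print the canonical URL for each license, grouped and sorted."""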
    license_names = get_legalcode(args)
    grouped = [
        set(),  # 0: by* 4.0 licenses
        set(),  # 1: by* 3.0 licenses
        set(),  # 2: by* 2.5 licenses
        set(),  # 3: by* 2.1 licenses
        set(),  # 4: by* 2.0 licenses
        set(),  # 5: by* 1.x licenses
        set(),  # 6: miscellaneous licenses
        set(),  # 7: zero 1.0 public domain
        set(),  # 8: miscellaneous public domain
    ]
    for license_name in license_names:
        if not args.include_gnu:
            testname = license_name.lower()
            if testname.startswith("gpl") or testname.startswith("lgpl"):
                continue
        filename = license_name[:-len(".html")]
        url = create_base_link(args, filename, for_canonical=True)
        parts = url.split("/")
        bystar_starts = ("by", "nc", "nd", "sa")
        if parts[3] == "licenses" and parts[4].startswith(bystar_starts):
            if parts[5].startswith("4"):
                grouped[0].add(url)
            elif parts[5].startswith("3"):
                grouped[1].add(url)
            elif parts[5] == "2.5":
                grouped[2].add(url)
            elif parts[5] == "2.1":
                grouped[3].add(url)
            elif parts[5] == "2.0":
                grouped[4].add(url)
            elif parts[5].startswith("1"):
                grouped[5].add(url)
            else:
                grouped[6].add(url)
        elif parts[3] == "publicdomain" and parts[4] == "zero":
            grouped[7].add(url)
        else:
            grouped[8].add(url)
    for urls in grouped:
        for url in sorted(urls):
            print(url)
    # Return shape matches the check_* functions:
    # (license_names, errors_total, exit_status)
    return [], 0, 0
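The grouping above keys entirely off positional URL segments. As a quick standalone illustration (the URL below is illustrative, not taken from the function), splitting a typical legalcode URL on "/" yields exactly the segments print_canonical() inspects:

url = "https://creativecommons.org/licenses/by-nc/4.0/legalcode"
parts = url.split("/")
# parts[3] == "licenses"  ("publicdomain" for CC0 and other PD marks)
# parts[4] == "by-nc"     (starts with one of "by", "nc", "nd", "sa")
# parts[5] == "4.0"       (version; a leading "4" routes it into grouped[0])
assert parts[3] == "licenses"
assert parts[4].startswith(("by", "nc", "nd", "sa"))
assert parts[5].startswith("4")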
Example #2
def check_legalcode(args):
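    """Check all links found in each license's legalcode page."""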
    print("\n\nChecking LegalCode License...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        base_url = create_base_link(args, filename)
        context = f"\n\nChecking: legalcode\nURL: {base_url}"
        if args.local:
            source_html = request_local_text(LICENSE_LOCAL_PATH, license_name)
        else:
            page_url = "{}{}".format(LICENSE_GITHUB_BASE, license_name)
            source_html = request_text(page_url)
        license_soup = BeautifulSoup(source_html, "lxml")
        links_found = license_soup.find_all("a")
        link_count = len(links_found)
        if args.log_level <= INFO:
            print(f"{context}\nNumber of links found: {link_count}")
            context_printed = True
        valid_anchors, valid_links, context_printed = get_scrapable_links(
            args, base_url, links_found, context, context_printed)
        if valid_links:
            (
                stored_links,
                stored_anchors,
                stored_result,
                check_links,
                check_anchors,
            ) = get_memoized_result(valid_links, valid_anchors)
            if check_links:
                rs = (
                    # Since we're only checking for validity, we can retrieve
                    # only the headers/metadata
                    grequests.head(link, timeout=REQUESTS_TIMEOUT)
                    for link in check_links)
                responses = list()
                # Explicitly close connections to free up file handles and
                # avoid Connection Errors per:
                # https://stackoverflow.com/a/22839550
                for response in grequests.map(
                        rs, exception_handler=exception_handler):
                    try:
                        responses.append(response.status_code)
                        response.close()
                    except AttributeError:
                        responses.append(response)
                memoize_result(check_links, responses)
                stored_anchors += check_anchors
                stored_result += responses
            stored_links += check_links
            caught_errors = write_response(
                args,
                stored_links,
                stored_result,
                base_url,
                license_name,
                stored_anchors,
                context,
                context_printed,
            )

        if caught_errors:
            errors_total += caught_errors
            exit_status = 1

    return license_names, errors_total, exit_status
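check_legalcode() leans on a link-result cache so that each unique URL is requested at most once across all files: get_memoized_result() splits the candidate links into entries that already have a stored status (stored_*) and those still to be fetched (check_*), and memoize_result() records the fresh responses. The helpers' internals are not shown in these excerpts; the following is one plausible dict-backed implementation consistent with the 5-tuple usage above (the cache name and structure are assumptions):

_MEMOIZED = {}  # assumed module-level store: link -> status code/exception

def get_memoized_result(valid_links, valid_anchors):
    stored_links, stored_anchors, stored_result = [], [], []
    check_links, check_anchors = [], []
    for link, anchor in zip(valid_links, valid_anchors):
        if link in _MEMOIZED:
            # Seen before: reuse the stored result instead of re-requesting
            stored_links.append(link)
            stored_anchors.append(anchor)
            stored_result.append(_MEMOIZED[link])
        else:
            # Not yet checked: hand back for a fresh HEAD request
            check_links.append(link)
            check_anchors.append(anchor)
    return (stored_links, stored_anchors, stored_result,
            check_links, check_anchors)

def memoize_result(check_links, responses):
    # Record fresh results so links repeated in later files are skipped
    _MEMOIZED.update(zip(check_links, responses))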
Example #3
def check_deeds(args):
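    """Check all links found in each license's deed page."""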
    print("\n\nChecking Deeds...\n\n")
    license_names = get_legalcode(args)
    if args.log_level <= INFO:
        print("Number of files to be checked:", len(license_names))
    errors_total = 0
    exit_status = 0
    for license_name in license_names:
        caught_errors = 0
        context_printed = False
        filename = license_name[:-len(".html")]
        deed_base_url = create_base_link(args, filename, for_deeds=True)
        # Deeds template:
        # https://github.com/creativecommons/cc.engine/blob/master/cc/engine/templates/licenses/standard_deed.html

        # Scraping the HTML found on the live site
        if deed_base_url:
            context = f"\n\nChecking: deed\nURL: {deed_base_url}"
            page_url = deed_base_url
            source_html = request_text(page_url)
            license_soup = BeautifulSoup(source_html, "lxml")
            links_found = license_soup.find_all("a")
            link_count = len(links_found)
            if args.log_level <= INFO:
                print(f"{context}\nNumber of links found: {link_count}")
                context_printed = True
            base_url = deed_base_url
            valid_anchors, valid_links, context_printed = get_scrapable_links(
                args, base_url, links_found, context, context_printed)
            if valid_links:
                (
                    stored_links,
                    stored_anchors,
                    stored_result,
                    check_links,
                    check_anchors,
                ) = get_memoized_result(valid_links, valid_anchors)
                if check_links:
                    rs = (
                        # Since we're only checking for validity, we can
                        # retrieve only the headers/metadata
                        grequests.head(link, timeout=REQUESTS_TIMEOUT)
                        for link in check_links)
                    responses = list()
                    # Explicitly close connections to free up file handles and
                    # avoid Connection Errors per:
                    # https://stackoverflow.com/a/22839550
                    for response in grequests.map(
                            rs, exception_handler=exception_handler):
                        try:
                            responses.append(response.status_code)
                            response.close()
                        except AttributeError:
                            responses.append(response)
                    memoize_result(check_links, responses)
                    stored_anchors += check_anchors
                    stored_result += responses
                stored_links += check_links
                caught_errors = write_response(
                    args,
                    stored_links,
                    stored_result,
                    base_url,
                    license_name,
                    stored_anchors,
                    context,
                    context_printed,
                )

            if caught_errors:
                errors_total += caught_errors
                exit_status = 1

    return license_names, errors_total, exit_status
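Both checkers hand an exception_handler into grequests.map(), but its definition is not included in these excerpts. grequests invokes the handler as handler(request, exception) for each request that fails, and the AttributeError fallback above (response.status_code raising on a non-Response value) suggests the handler returns the exception itself so it gets recorded in responses in place of a status code. A minimal handler along those lines (an assumption, not the original definition):

def exception_handler(request, exception):
    # Assumed behavior: return the exception so the caller's AttributeError
    # branch appends it to `responses` instead of an HTTP status code.
    return exception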