Exemplo n.º 1
0
 def test_get_language_for_jurisdiction(self):
     """Check the jurisdiction default lookup and its fallback argument."""
     fallback = "ar"
     # Jurisdiction "be" has a default language ("fr"), which wins over the
     # supplied fallback.
     belgium_language = get_default_language_for_jurisdiction(
         "be", fallback)
     self.assertEqual("fr", belgium_language)
     # Jurisdiction "xx" has no default language, so the supplied fallback
     # is returned instead.
     unknown_language = get_default_language_for_jurisdiction(
         "xx", fallback)
     self.assertEqual("ar", unknown_language)
Exemplo n.º 2
0
def build_license_url(license_code, version, jurisdiction_code, language_code):
    """
    Build the URL path of the legal code page for a license.

    jurisdiction_code may be empty (and must be empty for version "4.0").
    language_code is a CC language code and must be non-empty; it is only
    appended to the URL (as a ".<language_code>" suffix) when it differs
    from the applicable default language, or for the 3.0 "es", "ca" and
    "ch" jurisdictions, whose URLs always carry the language.
    """
    # UGH. Is there any way we could do this with a simple url 'reverse'? The
    # URL regex would be complicated, but we have unit tests to determine if
    # we've got it right. See test_templatetags.py.
    assert language_code
    if version == "4.0":
        assert not jurisdiction_code

    prefix = f"/licenses/{license_code}/{version}"

    if not jurisdiction_code:
        # No jurisdiction: compare against the site-wide default language.
        if language_code == DEFAULT_LANGUAGE_CODE or not language_code:
            return f"{prefix}/legalcode"
        return f"{prefix}/legalcode.{language_code}"

    ported_url = f"{prefix}/{jurisdiction_code}/legalcode"
    jurisdiction_default = get_default_language_for_jurisdiction(
        jurisdiction_code)
    # A few exceptions to how URLs are formed: these 3.0 jurisdictions
    # always get the language suffix, even for their default language.
    always_suffixed = version == "3.0" and jurisdiction_code in (
        "es", "ca", "ch")
    if always_suffixed or language_code != jurisdiction_default:
        return f"{ported_url}.{language_code}"
    return ported_url
Exemplo n.º 3
0
def parse_legal_code_filename(filename):
    """
    Derive legal tool metadata from the name of a legal code HTML file.

    filename must be a bare file name (no path); a trailing ".html" is
    stripped if present. The rest is split on "_" into a unit, a version
    and, optionally, a jurisdiction and a language code.

    Returns None when the unit is neither a public domain unit nor a
    license unit. Otherwise returns a dict with the keys category, unit,
    version, jurisdiction_code, language_code, canonical_url,
    deprecated_on and deed_only.

    Raises IndexError when the name has no "_"-separated version part and
    ValueError when no language can be determined or the language code is
    not listed in settings.LANG_INFO.

    Partially based on:
    https://github.com/creativecommons/cc-link-checker/blob/a255d2b5d72df31b3e750b34dac2ac6effe7c792/link_checker/utils.py#L419-L469  # noqa: E501
    """
    stem = filename[:-5] if filename.endswith(".html") else filename
    parts = stem.split("_")

    # Map the file name spelling of the sampling+ units to the unit name.
    file_unit = parts.pop(0)
    unit = {
        "samplingplus": "sampling+",
        "nc-samplingplus": "nc-sampling+",
    }.get(file_unit, file_unit)

    version = parts.pop(0)

    lt_models = legal_tools.models
    deed_only = unit in lt_models.UNITS_DEED_ONLY
    deprecated_on = (
        lt_models.UNITS_DEPRECATED[unit]
        if unit in lt_models.UNITS_DEPRECATED
        else None
    )

    jurisdiction = None
    if unit in lt_models.UNITS_PUBLIC_DOMAIN or unit == "zero":
        category = "publicdomain"
        if unit == "certification":
            jurisdiction = "us"
    elif unit in lt_models.UNITS_LICENSES:
        category = "licenses"
        # Only versions below 4.0 carry a jurisdiction part in the name.
        if parts and float(version) < 4.0:
            jurisdiction = parts.pop(0)
    else:
        return None

    # Set and validate language_code: an explicit code in the file name
    # wins, then the jurisdiction default, then the site default.
    language_code = (
        map_legacy_to_django_language_code(parts.pop(0)) if parts else None
    )
    if not language_code:
        if jurisdiction:
            language_code = get_default_language_for_jurisdiction(
                jurisdiction, "")
        else:
            language_code = settings.LANGUAGE_CODE
    if not language_code:
        raise ValueError(f"What language? filename={filename}")
    if language_code not in settings.LANG_INFO:
        # Valid Django language_codes are extended in settings with the
        # defaults in:
        # https://github.com/django/django/blob/main/django/conf/global_settings.py
        raise ValueError(f"{filename}: Invalid language_code={language_code}")

    return {
        "category": category,
        "unit": unit,
        "version": version,
        "jurisdiction_code": jurisdiction or "",
        "language_code": language_code,
        "canonical_url": compute_canonical_url(
            category, unit, version, jurisdiction),
        "deprecated_on": deprecated_on,
        "deed_only": deed_only,
    }
Exemplo n.º 4
0
    def handle(self, input_directory, **options):
        """
        Import legal code HTML files from input_directory and write the
        extracted messages out as .po/.mo translation files.

        Only files whose names start with "by" or "zero_1.0" and end with
        ".html" are considered, and of those only CC0 and BY 3.0/4.0
        licenses are imported. A License and a LegalCode object are found
        or created for each imported file. Ported 3.0 licenses only get
        their HTML saved on the LegalCode; no translation files are written
        for any 3.0 license.

        Options read:
        - "versions": comma-separated versions to import; falsy means all.
        - "languages": comma-separated CC language codes to import; falsy
          means all. English ("en") is always added to the given set.
        - "unwrapped": if true, wrapping of messages in the .po files is
          effectively disabled.

        Raises ValueError for an unknown language code or when there is no
        English legal code to import, and NotImplementedError for license
        codes/versions that have no import routine.
        """
        if options["versions"]:
            versions_to_include = options["versions"].split(",")
        else:
            versions_to_include = None
        if options["languages"]:
            # English is always included: it is imported first and its text
            # is used as the message key for every other language.
            languages_to_include = {"en"} | set(options["languages"].split(","))
        else:
            languages_to_include = None
        self.unwrapped = options["unwrapped"]

        legalcodes_to_import = []

        # Get list of html filenames for CC0 and any BY license (any version).
        # We'll filter out the filenames for unwanted versions later.
        html_filenames = sorted(
            f
            for f in os.listdir(input_directory)
            if f.startswith(("by", "zero_1.0")) and f.endswith(".html")
        )
        for filename in html_filenames:
            metadata = parse_legalcode_filename(filename)

            basename = os.path.splitext(filename)[0]
            fullpath = os.path.join(input_directory, filename)

            license_code = metadata["license_code"]
            version = metadata["version"]
            jurisdiction_code = metadata["jurisdiction_code"]
            cc_language_code = metadata[
                "cc_language_code"
            ] or get_default_language_for_jurisdiction(jurisdiction_code)
            # Make sure this is a valid language code (one we know about)
            django_language_code = cc_to_django_language_code(cc_language_code)
            if django_language_code not in settings.LANG_INFO:
                raise ValueError(f"Invalid language_code={cc_language_code}")

            # Just CC0, BY 3.0, & 4.0, and apply any command line options
            include = (
                (
                    (license_code in BY_LICENSE_CODES and version in {"3.0", "4.0"})
                    or license_code in CC0_LICENSE_CODES
                )
                and (versions_to_include is None or version in versions_to_include)
                and (
                    languages_to_include is None
                    or cc_language_code in languages_to_include
                )
            )
            if not include:
                continue

            about_url = metadata["about_url"]

            # These are valid for BY only
            license_code_parts = license_code.split("-")
            if "by" in license_code_parts:
                # NOTE(review): reproduction, distribution and sharing are all
                # tied to the absence of "nd" here, exactly like derivative
                # works -- confirm this is intended.
                no_nd_clause = "nd" not in license_code_parts
                permits_derivative_works = no_nd_clause
                permits_reproduction = no_nd_clause
                permits_distribution = no_nd_clause
                permits_sharing = no_nd_clause
                requires_share_alike = "sa" in license_code_parts
                requires_notice = True
                requires_attribution = True
                requires_source_code = False  # GPL, LGPL only, I think
                prohibits_commercial_use = "nc" in license_code_parts
                prohibits_high_income_nation_use = False  # Not any BY 4.0 license
            elif license_code == "CC0":
                # permits anything, requires nothing, prohibits nothing
                permits_derivative_works = True
                permits_reproduction = True
                permits_distribution = True
                permits_sharing = True
                requires_share_alike = False
                requires_notice = False
                requires_attribution = False
                requires_source_code = False
                prohibits_commercial_use = False
                prohibits_high_income_nation_use = False
            else:
                raise NotImplementedError(basename)

            # Find or create a License object
            license, _ = License.objects.get_or_create(
                about=about_url,
                defaults=dict(
                    license_code=license_code,
                    version=version,
                    jurisdiction_code=jurisdiction_code,
                    permits_derivative_works=permits_derivative_works,
                    permits_reproduction=permits_reproduction,
                    permits_distribution=permits_distribution,
                    permits_sharing=permits_sharing,
                    requires_share_alike=requires_share_alike,
                    requires_notice=requires_notice,
                    requires_attribution=requires_attribution,
                    requires_source_code=requires_source_code,
                    prohibits_commercial_use=prohibits_commercial_use,
                    prohibits_high_income_nation_use=prohibits_high_income_nation_use,
                ),
            )
            # Find or create a LegalCode object
            legalcode, _ = LegalCode.objects.get_or_create(
                license=license,
                language_code=cc_language_code,
                defaults=dict(
                    html_file=fullpath,
                ),
            )
            legalcodes_to_import.append(legalcode)

        # NOW parse the HTML and output message files
        legalcodes_to_import = LegalCode.objects.filter(
            pk__in=[lc.pk for lc in legalcodes_to_import]
        )

        # What are the language codes we have HTML files for?
        cc_language_codes = sorted({lc.language_code for lc in legalcodes_to_import})

        english_by_license_code_version = {}

        # We have to do English first. Django gets confused if you try to load
        # another language and it can't find English, I guess it's looking for
        # something to fall back to.
        if "en" not in cc_language_codes:
            # If english isn't in this list, something is wrong
            raise ValueError(
                "No English legal code among the files to import; English "
                "is required and must be imported first."
            )
        cc_language_codes.remove("en")
        for cc_language_code in ["en"] + cc_language_codes:
            for legalcode in legalcodes_to_import.filter(
                language_code=cc_language_code,
            ).order_by(
                "-license__version",
                "license__license_code",
                "license__jurisdiction_code",
            ):
                license = legalcode.license
                license_code = license.license_code
                version = license.version
                with open(legalcode.html_file, "r", encoding="utf-8") as f:
                    content = f.read()

                if version == "4.0":
                    messages_text = self.import_by_40_license_html(
                        content=content,
                        legalcode=legalcode,
                    )
                elif version == "3.0":
                    if license.jurisdiction_code:
                        # Ported license: we just save the HTML for now
                        legalcode.html = self.import_by_30_ported_license_html(
                            content=content,
                            legalcode=legalcode,
                        )
                        legalcode.save()
                        continue
                    else:
                        # Unported license: we parse out the messages like 4.0
                        messages_text = self.import_by_30_unported_license_html(
                            content=content,
                            legalcode=legalcode,
                        )
                elif license_code == "CC0":
                    messages_text = self.import_cc0_license_html(
                        content=content,
                        legalcode=legalcode,
                    )
                else:
                    raise NotImplementedError(
                        f"Have not implemented parsing for {license_code} {version} licenses."
                    )

                if version != "3.0":
                    # 3.0 doesn't have any translation files - might be the same for other versions
                    key = f"{license_code}|{version}"
                    if cc_language_code == "en":
                        english_by_license_code_version[key] = messages_text
                    english_messages = english_by_license_code_version[key]

                    pofile = POFile()
                    # The syntax used to wrap messages in a .po file is difficult if you ever
                    # want to copy/paste the messages, so if --unwrapped was passed, set a
                    # wrap width that will essentially disable wrapping.
                    if self.unwrapped:
                        pofile.wrapwidth = 999999
                    pofile.metadata = {
                        "Project-Id-Version": f"{license_code}-{version}",
                        "Language": cc_language_code,
                        "MIME-Version": "1.0",
                        "Content-Type": "text/plain; charset=utf-8",
                        "Content-Transfer-Encoding": "8bit",
                    }

                    # Use the English message text as the message key
                    for internal_key, translation in messages_text.items():
                        if cc_language_code == "en":
                            message_key = translation.strip()
                            message_value = ""
                        else:
                            # WORKAROUND - by-nc-nd 4.0 NL has an extra item under s3a.
                            # https://github.com/creativecommons/creativecommons.org/pull/1160
                            if (
                                internal_key == "s3a4_if_you_share_adapted_material"
                                and internal_key not in english_messages
                            ):
                                message_key = (
                                    "If You Share Adapted Material You produce, the Adapter's "
                                    "License You apply must not prevent recipients of the Adapted "
                                    "Material from complying with this Public License."
                                )
                            else:
                                message_key = english_messages[internal_key]
                            message_value = translation

                        pofile.append(
                            POEntry(
                                msgid=clean_string(message_key),
                                msgstr=clean_string(message_value),
                            )
                        )

                    po_filename = legalcode.translation_filename()
                    # exist_ok avoids the check-then-create race of a separate
                    # isdir() test and does not shadow the builtin dir().
                    po_directory = os.path.dirname(po_filename)
                    os.makedirs(po_directory, exist_ok=True)
                    # Save mofile ourself. We could call 'compilemessages' but it wants to
                    # compile everything, which is both overkill and can fail if the venv
                    # or project source is not writable. We know this dir is writable, so
                    # just save this pofile and mofile ourselves.
                    save_pofile_as_pofile_and_mofile(pofile, po_filename)
Exemplo n.º 5
0
def parse_legalcode_filename(filename):
    """
    Work out license metadata from the name of a legal code HTML file.

    filename must be a bare file name (no path); a trailing ".html" is
    stripped if present. The rest is split on "_" into a license code, a
    version and, optionally, a jurisdiction (versions below 4.0 only) and
    a language.

    Returns a dict with the keys license_code, version, jurisdiction_code,
    cc_language_code, url and about_url.

    Raises IndexError when the name has no "_"-separated version part and
    ValueError when no language can be determined or the language is not
    listed in settings.LANG_INFO.

    COPIED FROM
    https://github.com/creativecommons/cc-link-checker/blob/6bb2eae4151c5f7949b73f8d066c309f2413c4a5/link_checker.py#L231
    and modified a great deal.
    """
    stem = filename[:-5] if filename.endswith(".html") else filename
    parts = stem.split("_")

    # Map the file name spelling of the sampling+ licenses to the code
    # used in URLs.
    file_code = parts.pop(0)
    url_license_code = {
        "samplingplus": "sampling+",
        "nc-samplingplus": "nc-sampling+",
    }.get(file_code, file_code)

    version = parts.pop(0)

    jurisdiction = None
    if url_license_code.startswith("zero"):
        returned_license_code = "CC0"
        path_base = "publicdomain"
    else:
        returned_license_code = url_license_code
        path_base = "licenses"
        if parts and float(version) < 4.0:
            jurisdiction = parts.pop(0)

    language = parts.pop(0) if parts else None

    # Collect the URL path segments, then join them in one go.
    segments = [path_base, url_license_code, version]
    if jurisdiction:
        segments.append(jurisdiction)
        cc_language_code = language or get_default_language_for_jurisdiction(
            jurisdiction, "")
    else:
        cc_language_code = language or DEFAULT_LANGUAGE_CODE
    if language:
        segments.append(f"legalcode.{language}")

    url = posixpath.join("http://creativecommons.org", *segments)
    if not language:
        # Without an explicit language the URL ends in a trailing slash.
        url = f"{url}/"

    if not cc_language_code:
        raise ValueError(f"What language? filename={filename}")

    # Make sure this is a valid language code (one we know about)
    django_language_code = cc_to_django_language_code(cc_language_code)
    if django_language_code not in settings.LANG_INFO:
        raise ValueError(
            f"Invalid language_code={cc_language_code} dj={django_language_code}"
        )

    return {
        "license_code": returned_license_code,
        "version": version,
        "jurisdiction_code": jurisdiction or "",
        "cc_language_code": cc_language_code,
        "url": url,
        "about_url": compute_about_url(
            url_license_code, version, jurisdiction or ""),
    }