示例#1
0
 def test_clean_string(self):
     # Table-driven check of clean_string: each key is a raw input and its
     # value is the string clean_string is expected to return for it.
     cases = {
         "foo": "foo",
         "foo bar": "foo bar",
         # Runs of spaces are expected to collapse to a single space.
         "foo  bar": "foo bar",
         "foo   bar": "foo bar",
         # Surrounding whitespace is expected to be removed.
         " x ": "x",
         # A newline is expected to become a single space.
         "one\ntwo": "one two",
     }
     for raw, want in cases.items():
         # One subTest per input so a failure names the offending string.
         with self.subTest(raw):
             self.assertEqual(want, clean_string(raw))
示例#2
0
    def handle(self, input_directory, **options):
        """
        Import CC0 and BY 3.0/4.0 legalcode HTML files from ``input_directory``.

        The work is done in two passes:

        1. For every matching ``*.html`` file, find or create the ``License``
           and ``LegalCode`` rows described by its filename.
        2. Read each selected file (English first) and parse it. For everything
           except version 3.0, write the extracted messages out as a .po/.mo
           pair at ``legalcode.translation_filename()``.

        Options read from ``options``:
            versions: comma-separated versions to keep; a falsy value applies
                no extra version filter.
            languages: comma-separated language codes to keep ("en" is always
                added); a falsy value applies no extra language filter.
            unwrapped: when true, .po files are written with a very large wrap
                width so messages are effectively not wrapped.

        Raises:
            ValueError: if a file's language code is not in
                ``settings.LANG_INFO``, or if no English legalcode was selected.
            NotImplementedError: if a selected license code/version has no
                handling in this method.
        """
        versions_to_include = (
            options["versions"].split(",") if options["versions"] else None
        )
        # English is always included because the other languages use the
        # English text as their message keys (see the second pass).
        languages_to_include = (
            {"en"} | set(options["languages"].split(","))
            if options["languages"]
            else None
        )
        self.unwrapped = options["unwrapped"]

        legalcodes_to_import = []

        # Get list of html filenames for CC0 and any BY license (any version).
        # We'll filter out the filenames for unwanted versions later.
        html_filenames = sorted(
            f
            for f in os.listdir(input_directory)
            if f.startswith(("by", "zero_1.0")) and f.endswith(".html")
        )
        for filename in html_filenames:
            metadata = parse_legalcode_filename(filename)

            basename = os.path.splitext(filename)[0]
            fullpath = os.path.join(input_directory, filename)

            license_code = metadata["license_code"]
            version = metadata["version"]
            jurisdiction_code = metadata["jurisdiction_code"]
            cc_language_code = metadata[
                "cc_language_code"
            ] or get_default_language_for_jurisdiction(jurisdiction_code)
            # Make sure this is a valid language code (one we know about)
            django_language_code = cc_to_django_language_code(cc_language_code)
            if django_language_code not in settings.LANG_INFO:
                raise ValueError(f"Invalid language_code={cc_language_code}")

            # Just CC0, BY 3.0, & 4.0, and apply any command line options
            is_supported_license = (
                license_code in BY_LICENSE_CODES and version in {"3.0", "4.0"}
            ) or license_code in CC0_LICENSE_CODES
            version_wanted = versions_to_include is None or version in versions_to_include
            language_wanted = (
                languages_to_include is None or cc_language_code in languages_to_include
            )
            if not (is_supported_license and version_wanted and language_wanted):
                continue

            about_url = metadata["about_url"]

            # These are valid for BY only
            license_code_parts = license_code.split("-")
            if "by" in license_code_parts:
                # NOTE(review): every permits_* flag below is tied to the absence
                # of "nd"; confirm that is intended for reproduction,
                # distribution and sharing as well as derivative works.
                no_nd = "nd" not in license_code_parts
                permits_derivative_works = no_nd
                permits_reproduction = no_nd
                permits_distribution = no_nd
                permits_sharing = no_nd
                requires_share_alike = "sa" in license_code_parts
                requires_notice = True
                requires_attribution = True
                requires_source_code = False  # GPL, LGPL only, I think
                prohibits_commercial_use = "nc" in license_code_parts
                prohibits_high_income_nation_use = False  # Not any BY 4.0 license
            elif license_code == "CC0":
                # permits anything, requires nothing, prohibits nothing
                permits_derivative_works = True
                permits_reproduction = True
                permits_distribution = True
                permits_sharing = True
                requires_share_alike = False
                requires_notice = False
                requires_attribution = False
                requires_source_code = False
                prohibits_commercial_use = False
                prohibits_high_income_nation_use = False
            else:
                raise NotImplementedError(basename)

            # Find or create a License object. The flags are only applied when
            # the row is created; an existing row is returned untouched.
            license_obj, _created = License.objects.get_or_create(
                about=about_url,
                defaults=dict(
                    license_code=license_code,
                    version=version,
                    jurisdiction_code=jurisdiction_code,
                    permits_derivative_works=permits_derivative_works,
                    permits_reproduction=permits_reproduction,
                    permits_distribution=permits_distribution,
                    permits_sharing=permits_sharing,
                    requires_share_alike=requires_share_alike,
                    requires_notice=requires_notice,
                    requires_attribution=requires_attribution,
                    requires_source_code=requires_source_code,
                    prohibits_commercial_use=prohibits_commercial_use,
                    prohibits_high_income_nation_use=prohibits_high_income_nation_use,
                ),
            )
            # Find or create a LegalCode object
            legalcode, _created = LegalCode.objects.get_or_create(
                license=license_obj,
                language_code=cc_language_code,
                defaults=dict(
                    html_file=fullpath,
                ),
            )
            legalcodes_to_import.append(legalcode)

        # NOW parse the HTML and output message files.
        # Re-select the collected objects as a queryset so they can be filtered
        # and ordered per language below.
        legalcodes_to_import = LegalCode.objects.filter(
            pk__in=[lc.pk for lc in legalcodes_to_import]
        )

        # What are the language codes we have HTML files for?
        cc_language_codes = sorted({lc.language_code for lc in legalcodes_to_import})

        # English messages per "license_code|version", used as msgids for the
        # other languages.
        english_by_license_code_version = {}

        # We have to do English first. Django gets confused if you try to load
        # another language and it can't find English, I guess it's looking for
        # something to fall back to.
        if "en" not in cc_language_codes:
            # If english isn't in this list, something is wrong
            raise ValueError(
                "No English ('en') legalcode was selected for import; "
                "English must be imported before any other language."
            )
        cc_language_codes.remove("en")

        for cc_language_code in ["en"] + cc_language_codes:
            for legalcode in legalcodes_to_import.filter(
                language_code=cc_language_code,
            ).order_by(
                "-license__version",
                "license__license_code",
                "license__jurisdiction_code",
            ):
                license_obj = legalcode.license
                license_code = license_obj.license_code
                version = license_obj.version
                with open(legalcode.html_file, "r", encoding="utf-8") as f:
                    content = f.read()

                if version == "4.0":
                    messages_text = self.import_by_40_license_html(
                        content=content,
                        legalcode=legalcode,
                    )
                elif version == "3.0":
                    if license_obj.jurisdiction_code:
                        # Ported license: we just save the HTML for now
                        legalcode.html = self.import_by_30_ported_license_html(
                            content=content,
                            legalcode=legalcode,
                        )
                        legalcode.save()
                    else:
                        # Unported license: the messages are parsed like 4.0,
                        # but the result is not used here because no
                        # translation file is written for 3.0.
                        self.import_by_30_unported_license_html(
                            content=content,
                            legalcode=legalcode,
                        )
                    # 3.0 doesn't have any translation files - might be the
                    # same for other versions
                    continue
                elif license_code == "CC0":
                    messages_text = self.import_cc0_license_html(
                        content=content,
                        legalcode=legalcode,
                    )
                else:
                    raise NotImplementedError(
                        f"Have not implemented parsing for {license_code} {version} licenses."
                    )

                key = f"{license_code}|{version}"
                if cc_language_code == "en":
                    english_by_license_code_version[key] = messages_text
                english_messages = english_by_license_code_version[key]

                pofile = POFile()
                # The syntax used to wrap messages in a .po file is difficult if you ever
                # want to copy/paste the messages, so if --unwrapped was passed, set a
                # wrap width that will essentially disable wrapping.
                if self.unwrapped:
                    pofile.wrapwidth = 999999
                # Optional headers such as Report-Msgid-Bugs-To, POT-Creation-Date,
                # PO-Revision-Date, Last-Translator and Language-Team are not set.
                pofile.metadata = {
                    "Project-Id-Version": f"{license_code}-{version}",
                    "Language": cc_language_code,
                    "MIME-Version": "1.0",
                    "Content-Type": "text/plain; charset=utf-8",
                    "Content-Transfer-Encoding": "8bit",
                }

                # Use the English message text as the message key
                for internal_key, translation in messages_text.items():
                    if cc_language_code == "en":
                        message_key = translation.strip()
                        message_value = ""
                    else:
                        # WORKAROUND - by-nc-nd 4.0 NL has an extra item under s3a.
                        # https://github.com/creativecommons/creativecommons.org/pull/1160
                        if (
                            internal_key == "s3a4_if_you_share_adapted_material"
                            and internal_key not in english_messages
                        ):
                            message_key = (
                                "If You Share Adapted Material You produce, the Adapter's "
                                "License You apply must not prevent recipients of the Adapted "
                                "Material from complying with this Public License."
                            )
                        else:
                            message_key = english_messages[internal_key]
                        message_value = translation

                    pofile.append(
                        POEntry(
                            msgid=clean_string(message_key),
                            msgstr=clean_string(message_value),
                        )
                    )

                po_filename = legalcode.translation_filename()
                # exist_ok avoids the check-then-create race of testing for the
                # directory first and creating it afterwards.
                os.makedirs(os.path.dirname(po_filename), exist_ok=True)
                # Save mofile ourself. We could call 'compilemessages' but it wants to
                # compile everything, which is both overkill and can fail if the venv
                # or project source is not writable. We know this dir is writable, so
                # just save this pofile and mofile ourselves.
                save_pofile_as_pofile_and_mofile(pofile, po_filename)
示例#3
0
    def import_cc0_license_html(self, *, content, legalcode):
        """
        Pull the translatable messages out of a CC0 1.0 legalcode HTML page.

        ``content`` is the raw HTML and ``legalcode`` is the object whose
        license must be CC0 version 1.0. The page title is stored on
        ``legalcode.title`` and saved; the returned dict maps internal message
        keys to the text extracted for each part of the page.
        """
        cc0 = legalcode.license
        assert cc0.version == "1.0", f"{cc0.version} is not '1.0'"
        assert cc0.license_code == "CC0", f"{cc0.license_code} is not 'CC0'"

        # Parse the raw HTML to a BeautifulSoup object.
        soup = BeautifulSoup(content, "lxml")
        main = soup.find(id="deed-main-content")

        # The title is persisted right away, before the rest is parsed.
        title = inner_html(soup.find(id="deed-license").h2)
        messages = {"license_medium": title}
        legalcode.title = title
        legalcode.save()

        # Big disclaimer (all caps)
        messages["disclaimer"] = clean_string(nested_text(main.blockquote))

        # Statement of Purpose section: "<h3><em>Statement of Purpose</em></h3>"
        messages["statement_of_purpose"] = nested_text(main.h3)

        # SOP section is formatted as paragraphs
        paras = main.find_all("p")

        # First 3 paragraphs in the SOP section are just text
        for n in (1, 2, 3):
            messages[f"sop_p{n}"] = nested_text(paras[n - 1])

        # Next paragraph is a bold term, and its definition
        # <p><strong>1. Copyright and Related Rights.</strong>
        # A Work... </p>
        s1_para = paras[3]
        s1 = name_and_text(s1_para)
        messages["s1_title"] = s1["name"]
        messages["s1_par"] = s1["text"]

        # Followed by an ordered list with 7 items
        s1_list = s1_para.find_next_sibling("ol")
        messages.update(
            {
                f"s1_item{idx}": nested_text(item)
                for idx, item in enumerate(s1_list.find_all("li"))
            }
        )

        # Then two more numbered paragraphs that are definitions
        # <p><strong>2. Waiver.</strong> To the ...</p>
        s2 = name_and_text(paras[4])
        messages["s2_title"] = s2["name"]
        messages["s2_text"] = s2["text"]

        # <p><strong>3. Public License Fallback.</strong> Should...</p>
        s3 = name_and_text(paras[5])
        messages["s3_title"] = s3["name"]
        messages["s3_text"] = s3["text"]

        # Finally the Limitations header, no intro text, and an ol with 4 items.
        # <p><strong>4. Limitations and Disclaimers.</strong></p>
        s4_heading = paras[6]
        messages["s4_title"] = nested_text(s4_heading)

        # In English, s4 is followed by an ol with 4 items.
        # In .el, s4 is followed by a <p class="tab"> with
        # 3 <br/> dividing the 4 parts.
        s4_list = s4_heading.find_next_sibling("ol")
        if not s4_list:
            s4_fallback = s4_heading.find_next_sibling("p", class_="tab")
            s4_parts = [
                str(piece) for piece in nested_text(s4_fallback).split("<br />")
            ]
        else:
            s4_parts = [nested_text(item) for item in s4_list.find_all("li")]
        for idx, piece in enumerate(s4_parts):
            messages[f"s4_part_{idx}"] = piece

        # And that's it. The CC0 "license" is relatively short.

        validate_dictionary_is_all_text(messages)

        return messages