Пример #1
0
 def test_validate_dictionary_is_all_text(self):
     """
     Check which dictionaries validate_dictionary_is_all_text accepts,
     which it rejects with ValueError, and what it returns.
     """
     # A dictionary whose values are all plain strings passes without error.
     validate_dictionary_is_all_text({"1": "a", "2": "b"})

     # An int, a float, or an arbitrary object as a value must be rejected.
     for not_text in (1, 3.14, object()):
         with self.assertRaises(ValueError):
             validate_dictionary_is_all_text({"1": "a", "2": not_text})

     # A string pulled out of parsed markup, whether directly as a value or
     # inside a list, must come back comparing equal to plain text; a nested
     # dictionary of plain text must come back equal to itself.
     markup = BeautifulSoup("<span>foo</span>", "lxml")
     span_text = markup.span.string
     for expected, given in (
         ({"a": "foo"}, {"a": span_text}),
         ({"a": ["foo"]}, {"a": [span_text]}),
         ({"a": {"b": "foo"}}, {"a": {"b": "foo"}}),
     ):
         self.assertEqual(expected, validate_dictionary_is_all_text(given))
Пример #2
0
    def import_by_30_unported_license_html(self, *, content, legalcode):
        """
        Extract the translatable strings of a BY 3.0 unported legal code page.

        Parses ``content`` as HTML, stores the license title found in the page
        on ``legalcode.title`` (and saves ``legalcode``), then consumes the
        top-level paragraphs and ordered lists of the "deed-main-content"
        element one after another, in the order they are needed.

        Returns a dictionary mapping our internal keys to strings.
        """
        # Some translators changed 'strong' to 'b' (lower or upper case) for
        # some unknown reason; normalize so we only ever deal with 'strong'.
        raw_html = content
        for opening, closing in (("<b>", "</b>"), ("<B>", "</B>")):
            raw_html = raw_html.replace(opening, "<strong>")
            raw_html = raw_html.replace(closing, "</strong>")

        # Parse the raw HTML to a BeautifulSoup object.
        soup = BeautifulSoup(raw_html, "lxml")
        title = inner_html(soup.find(id="deed-license").h2)
        messages = {"license_medium": title}
        legalcode.title = title
        legalcode.save()

        main = soup.find(id="deed-main-content")

        messages["not_a_law_firm"] = nested_text(main.blockquote)
        # <h3><em>License</em></h3>
        messages["license"] = nested_text(main.h3)

        def top_level(tag_name):
            # Lazily walk the direct children of the main content with this tag.
            yield from direct_children_with_tag(main, tag_name)

        def store_name_and_text(name_key, text_key, element):
            # Split an element into its leading name and the remaining text.
            parsed = name_and_text(element)
            name, text = parsed["name"], parsed["text"]
            messages[name_key] = name
            messages[text_key] = text

        def store_list_items(prefix):
            # Take the next top-level ordered list and store each item's text.
            items = direct_children_with_tag(next(ordered_lists), "li")
            for index, item in enumerate(items):
                messages[f"{prefix}{index}"] = nested_text(item)

        paragraphs = top_level("p")
        ordered_lists = top_level("ol")

        # Two paragraphs of introduction
        for key in ("par1", "par2"):
            messages[key] = nested_text(next(paragraphs))

        # <p><strong>1. Definitions</strong></p>
        messages["definitions"] = nested_text(next(paragraphs))

        # An ordered list of definitions
        definitions = direct_children_with_tag(next(ordered_lists), "li")
        for index, item in enumerate(definitions):
            store_name_and_text(f"def{index}name", f"def{index}text", item)

        # <p><strong>2. Fair Dealing Rights.</strong> Nothing ... </p>
        store_name_and_text(
            "fair_dealing_rights", "fair_dealing_rights_text", next(paragraphs)
        )

        # <p><strong>3. License Grant.</strong> Subject ... </p>
        store_name_and_text("grant", "grant_text", next(paragraphs))

        # another ol
        store_list_items("grant")

        messages["par5"] = nested_text(next(paragraphs))

        # <p><strong>4. Restrictions.</strong> The ... </p>
        store_name_and_text("restrictions", "restrictions_text", next(paragraphs))

        restrictions = direct_children_with_tag(next(ordered_lists), "li")
        for index, item in enumerate(restrictions):
            # Most of these li's just have text.
            # one has a <p></p> followed by another ordered list
            intro = item.p
            if not intro:
                messages[f"restrictions{index}"] = nested_text(item)
                continue
            messages["restrictions avoid doubt"] = nested_text(intro)
            sub_items = direct_children_with_tag(item.ol, "li")
            for sub_index, sub_item in enumerate(sub_items):
                store_name_and_text(
                    f"restrictions name {index};{sub_index}",
                    f"restrictions text {index};{sub_index}",
                    sub_item,
                )

        # <p><strong>5. Representations, Warranties and Disclaimer</strong></p>
        messages["reps_and_disclaimer"] = nested_text(next(paragraphs))
        messages["unless_mutual"] = nested_text(next(paragraphs))

        # <p><strong>6. Limitation on Liability.</strong> EXCEPT ...</p>
        store_name_and_text("Limitation", "Limitation_text", next(paragraphs))

        # <p><strong>7. Termination</strong></p>
        messages["termination"] = nested_text(next(paragraphs))
        store_list_items("termination")

        # <p><strong>8. Miscellaneous</strong></p>
        messages["misc"] = nested_text(next(paragraphs))
        store_list_items("misc")

        # That's it for the license. The rest is disclaimer that we're handling
        # elsewhere.

        validate_dictionary_is_all_text(messages)

        return messages
Пример #3
0
    def import_by_40_license_html(self, *, content, legalcode):
        """
        Extract the translatable strings of a BY* 4.0 legal code page.

        Parses ``content`` as HTML and picks the text of each section out of
        elements looked up by their id ("s1", "s2a", "s3a1A", ...). As a side
        effect, the license title found in the page is stored on
        ``legalcode.title`` and ``legalcode`` is saved.

        Returns a dictionary mapping our internal keys to strings.

        Raises ``AssertionError`` if the license version is not "4.0" or its
        license code does not start with "by". Raises ``ValueError`` if the
        title of section 2a, 2b or 3a cannot be located in the markup; the
        final ``validate_dictionary_is_all_text`` call may also reject the
        collected messages.
        """
        license_obj = legalcode.license
        license_code = license_obj.license_code
        language_code = legalcode.language_code
        assert license_obj.version == "4.0", f"{license_obj.version} is not '4.0'"
        assert license_obj.license_code.startswith("by")

        messages = {}
        raw_html = content
        # Some trivial making consistent - some translators changed 'strong' to 'b'
        # for some unknown reason.
        raw_html = raw_html.replace("<b>", "<strong>").replace("</b>", "</strong>")
        raw_html = raw_html.replace("<B>", "<strong>").replace("</B>", "</strong>")

        # Parse the raw HTML to a BeautifulSoup object.
        soup = BeautifulSoup(raw_html, "lxml")

        # Get the license titles and intro text.

        deed_main_content = soup.find(id="deed-main-content")

        messages["license_medium"] = inner_html(soup.find(id="deed-license").h2)
        legalcode.title = messages["license_medium"]
        legalcode.save()
        messages["license_long"] = inner_html(deed_main_content.h3)
        messages["license_intro"] = inner_html(
            deed_main_content.h3.find_next_sibling("p")
        )

        # Section 1 – Definitions.

        # We're going to work out a list of what definitions we expect in this license,
        # and in what order.
        # Start with the definitions common to all the BY 4.0 licenses
        expected_definitions = [
            "adapted_material",
            "copyright_and_similar_rights",
            "effective_technological_measures",
            "exceptions_and_limitations",
            "licensed_material",
            "licensed_rights",
            "licensor",
            "share",
            "sui_generis_database_rights",
            "you",
        ]

        # now insert the optional ones
        def insert_after(after_this, what_to_insert):
            i = expected_definitions.index(after_this)
            expected_definitions.insert(i + 1, what_to_insert)

        if license_code == "by-sa":
            insert_after("adapted_material", "adapters_license")
            insert_after("adapters_license", "by_sa_compatible_license")
            insert_after("exceptions_and_limitations", "license_elements_sa")
            # See https://github.com/creativecommons/creativecommons.org/issues/1153
            # BY-SA 4.0 for "pt" has an extra definition. Work around for now.
            if language_code == "pt":
                insert_after("you", "you2")
        elif license_code == "by":
            insert_after("adapted_material", "adapters_license")
        elif license_code == "by-nc":
            insert_after("adapted_material", "adapters_license")
            insert_after("licensor", "noncommercial")
        elif license_code == "by-nd":
            pass
        elif license_code == "by-nc-nd":
            insert_after("licensor", "noncommercial")
        elif license_code == "by-nc-sa":
            insert_after("adapted_material", "adapters_license")
            insert_after("exceptions_and_limitations", "license_elements_nc_sa")
            insert_after("adapters_license", "by_nc_sa_compatible_license")
            insert_after("licensor", "noncommercial")

        # definitions are in an "ol" that is the next sibling of the id=s1 element.
        s1 = soup.find(id="s1")
        messages["s1_definitions_title"] = inner_html(s1.strong)
        for i, definition in enumerate(s1.find_next_siblings("ol")[0].find_all("li")):
            thing = name_and_text(definition)
            # The i-th definition in the page is stored under the i-th expected key.
            defn_key = expected_definitions[i]
            messages[
                f"s1_definitions_{defn_key}"
            ] = f"""<span style="text-decoration: underline;">{thing['name']}</span> {thing['text']}"""

        # Section 2 – Scope.
        messages["s2_scope"] = inner_html(soup.find(id="s2").strong)

        # Section 2a - License Grant
        # translation of "License grant"
        s2a = soup.find(id="s2a")
        if s2a.strong:
            messages["s2a_license_grant_title"] = inner_html(s2a.strong)
        elif s2a.b:
            messages["s2a_license_grant_title"] = inner_html(s2a.b)
        else:
            # Raise instead of exiting the process, the same way the s2b and
            # s3a title lookups further down report markup they cannot handle.
            raise ValueError(f"How do I handle {s2a}?")

        # s2a1: rights
        # Only the first child of each of these elements is the text we want.
        messages["s2a_license_grant_intro"] = str(list(soup.find(id="s2a1"))[0]).strip()

        messages["s2a_license_grant_share"] = str(
            list(soup.find(id="s2a1A"))[0]
        ).strip()
        messages["s2a_license_grant_adapted"] = str(
            list(soup.find(id="s2a1B"))[0]
        ).strip()

        # s2a2: exceptions and limitations
        nt = name_and_text(soup.find(id="s2a2"))
        messages[
            "s2a2_license_grant_exceptions"
        ] = f"<strong>{nt['name']}</strong>{nt['text']}"

        # s2a3: term
        nt = name_and_text(soup.find(id="s2a3"))
        messages[
            "s2a3_license_grant_term"
        ] = f"<strong>{nt['name']}</strong>{nt['text']}"

        # s2a4: media
        nt = name_and_text(soup.find(id="s2a4"))
        messages[
            "s2a4_license_grant_media"
        ] = f"<strong>{nt['name']}</strong>{nt['text']}"

        # s2a5: scope/grant/downstream
        # The title is just the prefix to the list of items, which are in their
        # own div, so this is slightly messy. Using the name from name_and_text
        # will get us the text we want without wrappings.
        s2a5 = soup.find(id="s2a5")
        nt = name_and_text(s2a5)
        messages["s2a5_license_grant_downstream_title"] = nt["name"]

        expected_downstreams = [
            "offer",
            "no_restrictions",
        ]
        if license_code in ["by-sa", "by-nc-sa"]:
            expected_downstreams.insert(1, "adapted_material")

        # Process top-level "li" elements under the ol
        for i, li in enumerate(s2a5.div.ol.find_all("li", recursive=False)):
            key = expected_downstreams[i]
            thing = name_and_text(li)
            messages[f"s2a5_license_grant_downstream_{key}_name"] = thing["name"]
            messages[f"s2a5_license_grant_downstream_{key}_text"] = thing["text"]

        nt = name_and_text(soup.find(id="s2a6"))
        messages["s2a6_license_grant_no_endorsement_name"] = nt["name"]
        messages["s2a6_license_grant_no_endorsement_text"] = nt["text"]

        # s2b: other rights
        # The title may be in <p><strong>, in a bare <p>, or in a bare <strong>.
        s2b = soup.find(id="s2b")
        if s2b.p and s2b.p.strong:
            messages["s2b_other_rights_title"] = nested_text(s2b.p.strong)
        elif s2b.p:
            messages["s2b_other_rights_title"] = nested_text(s2b.p)
        elif s2b.strong:
            messages["s2b_other_rights_title"] = nested_text(s2b.strong)
        else:
            print(str(s2b))
            raise ValueError("Where is s2b's title?")
        list_items = s2b.ol.find_all("li", recursive=False)
        assert list_items[0].name == "li"
        messages["s2b1_other_rights_moral"] = nested_text(list_items[0])
        messages["s2b2_other_rights_patent"] = nested_text(list_items[1])
        messages["s2b3_other_rights_waive"] = nested_text(list_items[2])

        # Section 3: conditions
        s3 = soup.find(id="s3")
        messages["s3_conditions_title"] = nested_text(s3)
        messages["s3_conditions_intro"] = nested_text(s3.find_next_sibling("p"))

        # <p id="s3"><strong>Section 3 – License Conditions.</strong></p>
        #
        #      <p>Your exercise of the Licensed Rights is expressly made subject to the following conditions.</p>
        #
        #      <ol type="a">
        #          <li id="s3a"><p><strong>Attribution</strong>.</p>
        #          <ol>

        s3a = soup.find(id="s3a")
        inside = str(inner_html(s3a))
        if inside.startswith(" "):  # ar translation takes liberties with whitespace
            # Re-parse the stripped inner markup so the lookups below work on it.
            s3a = BeautifulSoup(inside.strip(), "lxml")
        if s3a.p and s3a.p.strong:
            messages["s3_conditions_attribution"] = nested_text(s3a.p.strong)
        elif s3a.strong:
            messages["s3_conditions_attribution"] = nested_text(s3a.strong)
        else:
            print(str(s3a))
            raise ValueError("Fix s3a's attribution string")

        messages["s3_conditions_if_you_share"] = text_up_to(soup.find(id="s3a1"), "ol")

        messages["s3_conditions_retain_the_following"] = text_up_to(
            soup.find(id="s3a1A"), "ol"
        )
        messages["s3a1Ai_conditions_identification"] = inner_html(
            soup.find(id="s3a1Ai")
        )
        messages["s3a1Aii_conditions_copyright"] = inner_html(soup.find(id="s3a1Aii"))
        messages["s3a1Aiii_conditions_license"] = inner_html(soup.find(id="s3a1Aiii"))
        messages["s3a1Aiv_conditions_disclaimer"] = inner_html(soup.find(id="s3a1Aiv"))
        messages["s3a1Av_conditions_link"] = inner_html(soup.find(id="s3a1Av"))
        messages["s3a1B_conditions_modified"] = inner_html(soup.find(id="s3a1B"))
        messages["s3a1C_conditions_licensed"] = inner_html(soup.find(id="s3a1C"))
        messages["s3a2_conditions_satisfy"] = inner_html(soup.find(id="s3a2"))
        messages["s3a3_conditions_remove"] = inner_html(soup.find(id="s3a3"))
        s3a4 = soup.find(id="s3a4")
        if s3a4:
            # Only present if neither SA or ND.
            # OR in the NL translation of by-nc-nd, go figure...
            messages["s3a4_if_you_share_adapted_material"] = nested_text(s3a4)

        # share-alike is only in some licenses
        if license_code.endswith("-sa"):
            s3b = soup.find(id="s3b")
            messages["sharealike_name"] = nested_text(s3b.strong)
            messages["sharealike_intro"] = nested_text(s3b.p)

            messages["s3b1"] = nested_text(soup.find(id="s3b1"))
            messages["s3b2"] = nested_text(soup.find(id="s3b2"))
            messages["s3b3"] = nested_text(soup.find(id="s3b3"))

        # Section 4: Sui generis database rights
        s4 = soup.find(id="s4")
        messages["s4_sui_generics_database_rights_titles"] = nested_text(s4)
        messages["s4_sui_generics_database_rights_intro"] = s4.find_next_sibling(
            "p"
        ).string

        # The key depends on whether this is a non-commercial license.
        s4a = nested_text(soup.find(id="s4a"))
        if "nc" in license_code:
            messages["s4_sui_generics_database_rights_extract_reuse_nc"] = s4a
        else:
            messages["s4_sui_generics_database_rights_extract_reuse"] = s4a

        # The key depends on whether this is a share-alike license.
        s4b = nested_text(soup.find(id="s4b"))
        if license_code.endswith("-sa"):
            messages["s4_sui_generics_database_rights_adapted_material_sa"] = s4b
        else:
            messages["s4_sui_generics_database_rights_adapted_material"] = s4b
        messages["s4_sui_generics_database_rights_comply_s3a"] = nested_text(
            soup.find(id="s4c")
        )
        # The next text comes after the 'ol' after s4, but isn't inside a tag itself!
        # Walk s4's siblings: skip up to s4, skip on to the first 'ol' after it,
        # then collect everything until the next paragraph.
        s4_seen = False
        take_rest = False
        parts = []
        for item in s4.parent.children:
            if take_rest:
                if item.name == "p":
                    # Stop at the next paragraph
                    break
                parts.append(str(item))
            elif not s4_seen:
                if isinstance(item, Tag) and item.get("id") == "s4":
                    s4_seen = True
            elif item.name == "ol":
                # already seen s4, this is the ol, so the next child is our text
                take_rest = True
        messages["s4_sui_generics_database_rights_postscript"] = " ".join(parts)

        # Section 5: Disclaimer
        messages["s5_disclaimer_title"] = soup.find(id="s5").string
        messages["s5_a"] = soup.find(id="s5a").string  # bold
        messages["s5_b"] = soup.find(id="s5b").string  # bold
        messages["s5_c"] = soup.find(id="s5c").string  # not bold

        # Section 6: Term and Termination
        messages["s6_termination_title"] = nested_text(soup.find(id="s6"))
        messages["s6_termination_applies"] = nested_text(soup.find(id="s6a"))
        s6b = soup.find(id="s6b")
        if s6b.p:
            # most languages put the introductory text in a paragraph, making it easy
            messages["s6_termination_reinstates_where"] = s6b.p.get_text()
        else:
            # if they don't, we have to pick out the text from the beginning of s6b's
            # content until the beginning of the "ol" inside it.
            leading = []
            for child in s6b:
                if child.name == "ol":
                    break
                leading.append(str(child))
            messages["s6_termination_reinstates_where"] = "".join(leading)
        messages["s6_termination_reinstates_automatically"] = soup.find(
            id="s6b1"
        ).get_text()
        messages["s6_termination_reinstates_express"] = soup.find(id="s6b2").get_text()

        # The postscript is taken from fixed child positions 4 through 6 of s6b.
        children_of_s6b = list(s6b.children)
        messages["s6_termination_reinstates_postscript"] = (
            "".join(str(x) for x in children_of_s6b[4:7])
        ).strip()
        messages["s6_separate_terms"] = inner_html(soup.find(id="s6c"))
        messages["s6_survival"] = inner_html(soup.find(id="s6d"))

        # Section 7: Other terms and conditions
        messages["s7_other_terms_title"] = soup.find(id="s7").string
        messages["s7_a"] = soup.find(id="s7a").string
        messages["s7_b"] = soup.find(id="s7b").string

        # Section 8: Interpretation
        messages["s8_interpretation_title"] = soup.find(id="s8").string
        for key in ["s8a", "s8b", "s8c", "s8d"]:
            messages[key] = inner_html(soup.find(id=key))

        validate_dictionary_is_all_text(messages)

        return messages
Пример #4
0
    def import_cc0_license_html(self, *, content, legalcode):
        """
        Extract the translatable strings of a CC0 1.0 legal code page.

        Parses ``content`` as HTML, stores the title found in the page on
        ``legalcode.title`` (and saves ``legalcode``), then reads the
        disclaimer, the statement of purpose and the four numbered sections
        from the "deed-main-content" element.

        Returns a dictionary mapping our internal keys to strings. Asserts
        that the license is version "1.0" with license code "CC0".
        """
        cc0 = legalcode.license
        assert cc0.version == "1.0", f"{cc0.version} is not '1.0'"
        assert cc0.license_code == "CC0", f"{cc0.license_code} is not 'CC0'"

        # Parse the raw HTML to a BeautifulSoup object.
        soup = BeautifulSoup(content, "lxml")
        main = soup.find(id="deed-main-content")
        title = inner_html(soup.find(id="deed-license").h2)
        messages = {"license_medium": title}
        legalcode.title = title
        legalcode.save()

        def store_titled(paragraph, title_key, body_key):
            # A paragraph made of a bold term followed by its definition.
            parsed = name_and_text(paragraph)
            messages[title_key] = parsed["name"]
            messages[body_key] = parsed["text"]

        # Big disclaimer (all caps)
        messages["disclaimer"] = clean_string(nested_text(main.blockquote))

        # Statement of Purpose section: "<h3><em>Statement of Purpose</em></h3>"
        messages["statement_of_purpose"] = nested_text(main.h3)

        # SOP section is formatted as paragraphs
        paragraphs = main.find_all("p")

        # First 3 paragraphs in the SOP section are just text
        for number in (1, 2, 3):
            messages[f"sop_p{number}"] = nested_text(paragraphs[number - 1])

        # <p><strong>1. Copyright and Related Rights.</strong>
        # A Work... </p>
        rights = paragraphs[3]
        store_titled(rights, "s1_title", "s1_par")

        # Followed by an ordered list with 7 items
        rights_items = rights.find_next_sibling("ol").find_all("li")
        for index, item in enumerate(rights_items):
            messages[f"s1_item{index}"] = nested_text(item)

        # <p><strong>2. Waiver.</strong> To the ...</p>
        store_titled(paragraphs[4], "s2_title", "s2_text")

        # <p><strong>3. Public License Fallback.</strong> Should...</p>
        store_titled(paragraphs[5], "s3_title", "s3_text")

        # Finally the Limitations header, no intro text, and its 4 parts.
        # <p><strong>4. Limitations and Disclaimers.</strong></p>
        limitations = paragraphs[6]
        messages["s4_title"] = nested_text(limitations)

        limitations_list = limitations.find_next_sibling("ol")
        if not limitations_list:
            # In .el, s4 is followed by a <p class="tab"> with
            # 3 <br/> dividing the 4 parts.
            tabbed = limitations.find_next_sibling("p", class_="tab")
            for index, part in enumerate(nested_text(tabbed).split("<br />")):
                messages[f"s4_part_{index}"] = str(part)
        else:
            # In English, s4 is followed by an ol with 4 items.
            for index, item in enumerate(limitations_list.find_all("li")):
                messages[f"s4_part_{index}"] = nested_text(item)

        # And that's it. The CC0 "license" is relatively short.

        validate_dictionary_is_all_text(messages)

        return messages