def test_clean_string(self):
    """Check clean_string() against a table of sample inputs."""
    # Each pair is (raw text handed to clean_string, text expected back).
    cases = (
        ("foo", "foo"),
        ("foo bar", "foo bar"),
        ("foo bar", "foo bar"),
        ("foo bar", "foo bar"),
        (" x ", "x"),
        ("one\ntwo", "one two"),
    )
    for raw, want in cases:
        # One sub-test per sample so a failure reports the offending input.
        with self.subTest(raw):
            self.assertEqual(want, clean_string(raw))
def handle(self, input_directory, **options):
    """Import CC0 and BY 3.0/4.0 legal code HTML files and write message catalogs.

    Scans ``input_directory`` for files named ``by*.html`` or
    ``zero_1.0*.html``, finds or creates a ``License`` and a ``LegalCode``
    record for every file that passes the filters, then parses each selected
    file. Parsed messages are saved as .po/.mo files; ported 3.0 licenses
    only get their HTML stored on the ``LegalCode`` record.

    Args:
        input_directory: Directory containing the legal code HTML files.
        **options: Command options. Keys read here:
            ``versions``: comma-separated versions to include, or falsy
            for no version filter.
            ``languages``: comma-separated language codes to include
            (English is always added), or falsy for no language filter.
            ``unwrapped``: if true, use a huge wrap width so .po messages
            are effectively not wrapped.

    Raises:
        ValueError: If a file's language code is not in
            ``settings.LANG_INFO``, or if no English legal code was
            selected for import.
        NotImplementedError: If a selected file is neither a BY license nor
            CC0, or no parser exists for its license code and version.
    """
    versions_option = options["versions"]
    versions_to_include = versions_option.split(",") if versions_option else None

    languages_option = options["languages"]
    if languages_option:
        # English is always imported: its text becomes the msgid of every
        # translated message further down.
        languages_to_include = {"en"} | set(languages_option.split(","))
    else:
        languages_to_include = None

    self.unwrapped = options["unwrapped"]

    # Get list of html filenames for CC0 and any BY license (any version).
    # Unwanted versions are filtered out inside the loop below.
    html_filenames = sorted(
        name
        for name in os.listdir(input_directory)
        if name.startswith(("by", "zero_1.0")) and name.endswith(".html")
    )

    selected_legalcodes = []
    for filename in html_filenames:
        metadata = parse_legalcode_filename(filename)

        basename = os.path.splitext(filename)[0]
        fullpath = os.path.join(input_directory, filename)

        license_code = metadata["license_code"]
        version = metadata["version"]
        jurisdiction_code = metadata["jurisdiction_code"]
        cc_language_code = metadata[
            "cc_language_code"
        ] or get_default_language_for_jurisdiction(jurisdiction_code)

        # Make sure this is a valid language code (one we know about).
        # This runs before the filters, so it also applies to files that
        # would be skipped.
        django_language_code = cc_to_django_language_code(cc_language_code)
        if django_language_code not in settings.LANG_INFO:
            raise ValueError(f"Invalid language_code={cc_language_code}")

        # Just CC0, BY 3.0, & 4.0, and apply any command line options
        is_supported_license = (
            license_code in BY_LICENSE_CODES and version in {"3.0", "4.0"}
        ) or license_code in CC0_LICENSE_CODES
        include = (
            is_supported_license
            and (versions_to_include is None or version in versions_to_include)
            and (
                languages_to_include is None
                or cc_language_code in languages_to_include
            )
        )
        if not include:
            continue

        about_url = metadata["about_url"]

        license_code_parts = license_code.split("-")
        if "by" in license_code_parts:
            # These are valid for BY only
            permits_derivative_works = "nd" not in license_code_parts
            permits_reproduction = "nd" not in license_code_parts
            permits_distribution = "nd" not in license_code_parts
            permits_sharing = "nd" not in license_code_parts
            requires_share_alike = "sa" in license_code_parts
            requires_notice = True
            requires_attribution = True
            requires_source_code = False  # GPL, LGPL only, I think
            prohibits_commercial_use = "nc" in license_code_parts
            prohibits_high_income_nation_use = False  # Not any BY 4.0 license
        elif license_code == "CC0":
            # permits anything, requires nothing, prohibits nothing
            permits_derivative_works = True
            permits_reproduction = True
            permits_distribution = True
            permits_sharing = True
            requires_share_alike = False
            requires_notice = False
            requires_attribution = False
            requires_source_code = False
            prohibits_commercial_use = False
            prohibits_high_income_nation_use = False
        else:
            raise NotImplementedError(basename)

        # Find or create a License object
        license_obj, _ = License.objects.get_or_create(
            about=about_url,
            defaults=dict(
                license_code=license_code,
                version=version,
                jurisdiction_code=jurisdiction_code,
                permits_derivative_works=permits_derivative_works,
                permits_reproduction=permits_reproduction,
                permits_distribution=permits_distribution,
                permits_sharing=permits_sharing,
                requires_share_alike=requires_share_alike,
                requires_notice=requires_notice,
                requires_attribution=requires_attribution,
                requires_source_code=requires_source_code,
                prohibits_commercial_use=prohibits_commercial_use,
                prohibits_high_income_nation_use=prohibits_high_income_nation_use,
            ),
        )
        # Find or create a LegalCode object
        legalcode, _ = LegalCode.objects.get_or_create(
            license=license_obj,
            language_code=cc_language_code,
            defaults=dict(
                html_file=fullpath,
            ),
        )
        selected_legalcodes.append(legalcode)

    # NOW parse the HTML and output message files
    legalcodes_to_import = LegalCode.objects.filter(
        pk__in=[lc.pk for lc in selected_legalcodes]
    )

    # What are the language codes we have HTML files for?
    cc_language_codes = sorted({lc.language_code for lc in legalcodes_to_import})

    english_by_license_code_version = {}

    # We have to do English first. Django gets confused if you try to load
    # another language and it can't find English, I guess it's looking for
    # something to fall back to.
    if "en" not in cc_language_codes:
        # If english isn't in this list, something is wrong
        raise ValueError(
            "No English legal code was selected for import; English must be "
            "imported first because it supplies the message ids."
        )
    cc_language_codes.remove("en")

    for cc_language_code in ["en"] + cc_language_codes:
        for legalcode in legalcodes_to_import.filter(
            language_code=cc_language_code,
        ).order_by(
            "-license__version",
            "license__license_code",
            "license__jurisdiction_code",
        ):
            license_obj = legalcode.license
            license_code = license_obj.license_code
            version = license_obj.version

            with open(legalcode.html_file, encoding="utf-8") as html_fp:
                content = html_fp.read()

            if version == "4.0":
                messages_text = self.import_by_40_license_html(
                    content=content,
                    legalcode=legalcode,
                )
            elif version == "3.0":
                if license_obj.jurisdiction_code:
                    # Ported license: we just save the HTML for now
                    legalcode.html = self.import_by_30_ported_license_html(
                        content=content,
                        legalcode=legalcode,
                    )
                    legalcode.save()
                    continue
                # Unported license: we parse out the messages like 4.0
                messages_text = self.import_by_30_unported_license_html(
                    content=content,
                    legalcode=legalcode,
                )
            elif license_code == "CC0":
                messages_text = self.import_cc0_license_html(
                    content=content,
                    legalcode=legalcode,
                )
            else:
                raise NotImplementedError(
                    f"Have not implemented parsing for {license_code} {version} licenses."
                )

            # NOTE(review): the nesting of this guard was reconstructed from a
            # whitespace-collapsed source. It is assumed to cover only the
            # English bookkeeping, with the catalog still written for unported
            # 3.0 - confirm against the original file.
            if version != "3.0":
                # 3.0 doesn't have any translation files - might be the same
                # for other versions
                key = f"{license_code}|{version}"
                if cc_language_code == "en":
                    english_by_license_code_version[key] = messages_text
                english_messages = english_by_license_code_version[key]

            pofile = POFile()
            # The syntax used to wrap messages in a .po file is difficult if
            # you ever want to copy/paste the messages, so if --unwrapped was
            # passed, set a wrap width that will essentially disable wrapping.
            if self.unwrapped:
                pofile.wrapwidth = 999999
            # Other optional headers (Report-Msgid-Bugs-To, POT-Creation-Date,
            # PO-Revision-Date, Last-Translator, Language-Team) are not set.
            pofile.metadata = {
                "Project-Id-Version": f"{license_code}-{version}",
                "Language": cc_language_code,
                "MIME-Version": "1.0",
                "Content-Type": "text/plain; charset=utf-8",
                "Content-Transfer-Encoding": "8bit",
            }

            # Use the English message text as the message key
            for internal_key, translation in messages_text.items():
                if cc_language_code == "en":
                    message_key = translation.strip()
                    message_value = ""
                elif (
                    internal_key == "s3a4_if_you_share_adapted_material"
                    and internal_key not in english_messages
                ):
                    # WORKAROUND - by-nc-nd 4.0 NL has an extra item under s3a.
                    # https://github.com/creativecommons/creativecommons.org/pull/1160
                    message_key = (
                        "If You Share Adapted Material You produce, the Adapter's "
                        "License You apply must not prevent recipients of the Adapted "
                        "Material from complying with this Public License."
                    )
                    message_value = translation
                else:
                    message_key = english_messages[internal_key]
                    message_value = translation

                pofile.append(
                    POEntry(
                        msgid=clean_string(message_key),
                        msgstr=clean_string(message_value),
                    )
                )

            po_filename = legalcode.translation_filename()
            # exist_ok avoids the check-then-create race of a separate
            # isdir() test.
            os.makedirs(os.path.dirname(po_filename), exist_ok=True)
            # Save mofile ourself. We could call 'compilemessages' but it
            # wants to compile everything, which is both overkill and can fail
            # if the venv or project source is not writable. We know this dir
            # is writable, so just save this pofile and mofile ourselves.
            save_pofile_as_pofile_and_mofile(pofile, po_filename)
def import_cc0_license_html(self, *, content, legalcode):
    """Pull the translatable messages out of a CC0 1.0 legal code page.

    ``content`` is the raw HTML text and ``legalcode`` is the record it
    belongs to. As a side effect the page title is stored on
    ``legalcode.title`` and the record is saved. Returns a dict mapping
    internal message keys to the text found in the page.
    """
    cc0 = legalcode.license
    assert cc0.version == "1.0", f"{cc0.version} is not '1.0'"
    assert cc0.license_code == "CC0", f"{cc0.license_code} is not 'CC0'"

    # Parse the raw HTML to a BeautifulSoup object.
    soup = BeautifulSoup(content, "lxml")
    main = soup.find(id="deed-main-content")

    title = inner_html(soup.find(id="deed-license").h2)
    messages = {"license_medium": title}
    legalcode.title = title
    legalcode.save()

    # Big disclaimer (all caps)
    messages["disclaimer"] = clean_string(nested_text(main.blockquote))

    # Statement of Purpose section: "<h3><em>Statement of Purpose</em></h3>"
    messages["statement_of_purpose"] = nested_text(main.h3)

    # The rest of the document is addressed by paragraph position.
    paras = main.find_all("p")

    # First 3 paragraphs in the SOP section are just text
    for index in range(3):
        messages[f"sop_p{index + 1}"] = nested_text(paras[index])

    # Next paragraph is a bold term, and its definition
    # <p><strong>1. Copyright and Related Rights.</strong> A Work... </p>
    section_1 = name_and_text(paras[3])
    messages["s1_title"] = section_1["name"]
    messages["s1_par"] = section_1["text"]

    # Followed by an ordered list with 7 items
    s1_list = paras[3].find_next_sibling("ol")
    messages.update(
        {
            f"s1_item{index}": nested_text(item)
            for index, item in enumerate(s1_list.find_all("li"))
        }
    )

    # Then two more numbered paragraphs that are definitions
    # <p><strong>2. Waiver.</strong> To the ...</p>
    # <p><strong>3. Public License Fallback.</strong> Should...</p>
    for section_number, para_index in ((2, 4), (3, 5)):
        section = name_and_text(paras[para_index])
        messages[f"s{section_number}_title"] = section["name"]
        messages[f"s{section_number}_text"] = section["text"]

    # Finally the Limitations header, no intro text, and an ol with 4 items.
    # <p><strong>4. Limitations and Disclaimers.</strong></p>
    limitations = paras[6]
    messages["s4_title"] = nested_text(limitations)

    # In English, s4 is followed by an ol with 4 items.
    # In .el, s4 is followed by a <p class="tab"> with
    # 3 <br/> dividing the 4 parts.
    s4_list = limitations.find_next_sibling("ol")
    if not s4_list:
        tab_paragraph = limitations.find_next_sibling("p", class_="tab")
        s4_parts = [
            str(chunk) for chunk in nested_text(tab_paragraph).split("<br />")
        ]
    else:
        s4_parts = [nested_text(item) for item in s4_list.find_all("li")]
    for index, part in enumerate(s4_parts):
        messages[f"s4_part_{index}"] = part

    # And that's it. The CC0 "license" is relatively short.
    validate_dictionary_is_all_text(messages)
    return messages