def testRepresentSyntaxError(self):
    # Check the string form of a TranslationFormatSyntaxError.  Most of
    # that code is shared with TranslationFormatInvalidInputError, so
    # there is no need to test quite as extensively here.

    # Without any details, a generic description is used.
    bare_error = TranslationFormatSyntaxError()
    self.assertEqual(str(bare_error), "Unknown syntax error")

    # With a filename and a message, both show up in the string.
    located_error = TranslationFormatSyntaxError(
        filename="foo", message="x")
    self.assertEqual(str(located_error), "foo: x")
def _parseHeaderFields(self):
    """Update this object's attributes from the parsed header fields.

    Walks `self._header_dictionary` and stores the values of the keys it
    recognizes on `self`; nothing is returned.

    * 'plural-forms': sets `number_plural_forms` and
      `plural_form_expression`.  A missing nplurals means a single form
      with expression '0'; the placeholder value 'INTEGER' leaves both
      attributes untouched.
    * 'pot-creation-date': sets `template_creation_date`, but only if
      `_parseOptionalDate` returned something true.
    * 'po-revision-date': sets `translation_revision_date`.
    * 'last-translator': sets `_last_translator`.
    * 'language-team': sets `language_team`.
    * 'x-launchpad-export-date' / 'x-rosetta-export-date': sets
      `launchpad_export_date`.

    Any other key is ignored.

    :raises TranslationFormatSyntaxError: if nplurals is not a number, or
        is zero or negative.
    :raises TooManyPluralFormsError: if nplurals exceeds
        `TranslationConstants.MAX_PLURAL_FORMS`.
    """
    # items() instead of iteritems(): same pairs, and it also exists on
    # Python 3 dictionaries.
    for key, value in self._header_dictionary.items():
        if key == 'plural-forms':
            parts = self._parseAssignments(value)
            nplurals = parts.get('nplurals')
            if nplurals is None:
                # Number of plurals not specified.  Default to single
                # form.
                self.number_plural_forms = 1
                self.plural_form_expression = '0'
            elif nplurals != 'INTEGER':
                # We found something different than gettext's default
                # value.
                try:
                    self.number_plural_forms = int(nplurals)
                except (TypeError, ValueError):
                    # There are some po files with bad headers that have
                    # a non numeric value here and sometimes an empty
                    # value.  Those are rejected as syntax errors.
                    raise TranslationFormatSyntaxError(
                        message="Invalid nplurals declaration in header: "
                                "'%s' (should be a number)." % nplurals)

                if self.number_plural_forms <= 0:
                    text = "Number of plural forms is impossibly low."
                    raise TranslationFormatSyntaxError(message=text)

                max_forms = TranslationConstants.MAX_PLURAL_FORMS
                if self.number_plural_forms > max_forms:
                    raise TooManyPluralFormsError()

                self.plural_form_expression = parts.get('plural', '0')
            else:
                # Plurals declaration contains default text.  This is
                # probably a template, so leave the text as it is.
                pass
        elif key == 'pot-creation-date':
            date = self._parseOptionalDate(value)
            if date:
                self.template_creation_date = date
        elif key == 'po-revision-date':
            self.translation_revision_date = self._parseOptionalDate(
                value)
        elif key == 'last-translator':
            self._last_translator = value
        elif key == 'language-team':
            self.language_team = value
        elif key in ('x-launchpad-export-date', 'x-rosetta-export-date'):
            # The key we use right now to note the export date is
            # X-Launchpad-Export-Date but we need to accept the old one
            # too so old exports will still work.
            self.launchpad_export_date = self._parseOptionalDate(value)
        else:
            # We don't use the other keys.
            pass
def addTranslation(self, plural_form, translation):
    """See `ITranslationMessageData`.

    Store `translation` as the translation for `plural_form` in
    `self._translations`.

    Unlike msgids, groups of translations cannot be assumed to be
    contiguous: translations may arrive for plural forms 0 and 2 but not
    1.  When `plural_form` lies beyond the end of the list, the gap is
    filled with None.

    :param plural_form: Index of the plural form; must not be None.
    :param translation: The translation to store.
    :raises TranslationFormatSyntaxError: if a different, non-None
        translation is already stored for `plural_form`.  Storing the
        same translation again is accepted.
    """
    assert plural_form is not None, 'plural_form cannot be None!'

    # Work on the one backing list throughout, so the bounds check and
    # the padding below cannot disagree about its length.
    translations = self._translations
    known_forms = len(translations)

    if plural_form < known_forms:
        existing = translations[plural_form]
        if existing is not None and existing != translation:
            error = (
                "Message has more than one translation for plural form %d."
                % plural_form)
            raise TranslationFormatSyntaxError(message=error)
    else:
        # There is a hole in the list of translations so we fill it with
        # None.
        translations.extend([None] * (1 + plural_form - known_forms))

    translations[plural_form] = translation
def __init__(self, content):
    """Initialize: parse `content` as a manifest file.

    Only lines whose first word is 'locale' and that consist of exactly
    four words become entries; shorter or longer 'locale' lines are
    logged and ignored, and all other lines are skipped silently.  Of
    several entries with the same path only the first is kept.  The
    result is stored, sorted by `manifest_entry_sort_key`, in
    `self._locales`.

    :param content: Text of the manifest file.
    :raises TranslationFormatSyntaxError: if `content` starts with a
        newline.
    """
    if content.startswith('\n'):
        raise TranslationFormatSyntaxError(
            message="Manifest begins with newline.")

    entries = []
    for line in content.splitlines():
        fields = line.split()
        if not fields or fields[0] != 'locale':
            # Blank line, or not a locale line at all.
            continue

        field_count = len(fields)
        if field_count < 4:
            logging.info("Ignoring short manifest line: '%s'" % line)
        elif field_count > 4:
            logging.info("Ignoring long manifest line: '%s'" % line)
        else:
            entries.append(ManifestEntry(fields[1], fields[2], fields[3]))

    # Eliminate duplicates, keeping the first entry seen for each path.
    seen_paths = set()
    unique_entries = []
    for entry in entries:
        assert entry.path.endswith('/'), "Manifest path lost its slash"
        if entry.path in seen_paths:
            logging.info("Duplicate paths in manifest: '%s'" % entry.path)
        else:
            seen_paths.add(entry.path)
            unique_entries.append(entry)

    self._locales = sorted(unique_entries, key=manifest_entry_sort_key)
def testNonAsciiSyntaxError(self):
    # Test against non-ascii characters: both the filename and the
    # message contain one.
    error = TranslationFormatSyntaxError(
        filename=u"khor-khai-\u0e01", line_number=4,
        message=u"khor-khai-\u0e02")
    expected_text = "khor-khai-\\u0e01, line 4: khor-khai-\u0e02"
    self.assertEqual(str(error), expected_text)
def _parseLine(self, original_line):
    """Parse one input line and append its text to the current section.

    Increments `self._lineno`, then skips blank lines.  Unless the
    previous line ended in an escaped line break, the line is first
    handed to `_parseFreshLine`; if that yields nothing, parsing of the
    line stops there.  Whatever remains is parsed as a quoted string and
    appended to `self._parsed_content`.

    :param original_line: The line as read from the input.
    :raises TranslationFormatSyntaxError: if text is found while the
        current section is not one that holds text.
    """
    self._lineno += 1

    # Skip empty lines.
    remainder = original_line.strip()
    if not remainder:
        return

    if not self._escaped_line_break:
        remainder = self._parseFreshLine(remainder, original_line)
        if not remainder:
            # Either None or an empty string: no quoted text to parse.
            return

    text = self._parseQuotedString(remainder)

    if self._section not in ('msgctxt', 'msgid', 'msgid_plural', 'msgstr'):
        raise TranslationFormatSyntaxError(
            line_number=self._lineno,
            message='Invalid content: %r' % original_line)

    self._parsed_content += text
def getLastTranslator(self):
    """See `ITranslationHeaderData`.

    Scans `self._raw_content` for contributor elements and returns a
    `(name, email)` tuple for the last one that parses as a non-empty
    name plus an address containing '@', or `(None, None)` if there is
    no such element.

    :raises TranslationFormatSyntaxError: if parsing the content raises
        a SyntaxError; the error is reported against 'install.rdf'.
    """
    contributor_tag = "{http://www.mozilla.org/2004/em-rdf#}contributor"
    translator = (None, None)

    # Both cElementTree and elementtree fail when trying to parse
    # proper unicode strings.  Use our raw input instead.
    try:
        elements = cElementTree.iterparse(StringIO(self._raw_content))
        for _event, element in elements:
            if element.tag != contributor_tag:
                continue
            # An XPI header can list multiple contributors, but here we
            # care only about the latest one listed as a well-formed
            # name and email address.
            name, email = parseaddr(element.text)
            if name != '' and '@' in email:
                translator = (name, email)
    except SyntaxError as exception:
        raise TranslationFormatSyntaxError(
            filename='install.rdf', line_number=exception.lineno,
            message=exception.msg)

    return translator
def parse(self, content):
    """Parse given content as a property file.

    Once the parse is done, self.messages has a list of the available
    `ITranslationMessageData`s.

    Each `key=value` line becomes one message whose msgid is the key,
    whose context is `self.chrome_path` and whose singular translation is
    the stripped value.  A comment is attached to a message as its source
    comment only when it sits on the same line as the message or on the
    line just before it.  A multi-line comment containing
    `self.license_block_text` is discarded.

    :param content: The file's content as a UTF-8 encoded byte string.
    :raises TranslationFormatInvalidInputError: if `content` is not valid
        UTF-8, or a line cannot be unescaped.
    :raises TranslationFormatSyntaxError: if a key is rejected by
        `valid_property_msgid`.
    """
    # .properties files are supposed to be unicode-escaped, but we know
    # that there are some .xpi language packs that instead, use UTF-8.
    # That's against the specification, but Mozilla applications accept
    # it anyway, so we try to support it too.
    # To do this support, we read the text as being in UTF-8
    # because unicode-escaped looks like ASCII files.
    try:
        content = content.decode('utf-8')
    except UnicodeDecodeError:
        raise TranslationFormatInvalidInputError(
            'Content is not valid unicode-escaped text')

    line_num = 0
    is_multi_line_comment = False
    last_comment = None
    last_comment_line_num = 0
    ignore_comment = False
    is_message = False
    translation = u''
    for line in content.splitlines():
        # Count the line before unescaping it, so that an unescaping
        # error is reported against this line and not the previous one.
        line_num += 1

        # Now, to "normalize" all to the same encoding, we encode to
        # unicode-escape first, and then decode it to unicode
        # XXX: Danilo 2006-08-01: we _might_ get performance
        # improvements if we reimplement this to work directly,
        # though, it will be hard to beat C-based de/encoder.
        # This call unescapes everything so we don't need to care about
        # quotes escaping.
        try:
            string = line.encode('raw-unicode_escape')
            line = string.decode('unicode_escape')
        except UnicodeDecodeError as exception:
            raise TranslationFormatInvalidInputError(
                filename=self.filename, line_number=line_num,
                message=str(exception))

        if not is_multi_line_comment:
            # Remove any white space before the useful data, like
            # ' # foo'.
            line = line.lstrip()
            if len(line) == 0:
                # It's an empty line. Reset any previous comment we have.
                last_comment = None
                last_comment_line_num = 0
                ignore_comment = False
            elif line.startswith((u'#', u'//')):
                # It's a whole line comment.
                ignore_comment = False
                # Drop the whole comment marker: one character for '#',
                # two for '//'.
                marker_length = 2 if line.startswith(u'//') else 1
                line = line[marker_length:].strip()
                if last_comment:
                    last_comment += line
                elif len(line) > 0:
                    last_comment = line

                if last_comment and not last_comment.endswith('\n'):
                    # Comments must end always with a new line.
                    last_comment += '\n'

                last_comment_line_num = line_num
                continue

        # Unescaped URLs are a common mistake: the "//" starts an
        # end-of-line comment.  To work around that, treat "://" as
        # a special case.
        just_saw_colon = False

        while line:
            if is_multi_line_comment:
                if line.startswith(u'*/'):
                    # The comment ended, we jump the closing tag and
                    # continue with the parsing.
                    line = line[2:]
                    is_multi_line_comment = False
                    last_comment_line_num = line_num
                    if ignore_comment:
                        last_comment = None
                        ignore_comment = False
                    elif last_comment is not None:
                        # Comments must end always with a new line.  An
                        # empty comment ("/**/") leaves nothing to
                        # terminate.
                        last_comment += '\n'
                elif line.startswith(self.license_block_text):
                    # It's a comment with a licence notice, this
                    # comment can be ignored.
                    ignore_comment = True
                    # Jump the whole tag
                    line = line[len(self.license_block_text):]
                else:
                    # Store the character.
                    if last_comment is None:
                        last_comment = line[0]
                    elif last_comment_line_num == line_num:
                        last_comment += line[0]
                    else:
                        last_comment = u'%s\n%s' % (last_comment, line[0])
                    # The comment now extends onto this line.
                    last_comment_line_num = line_num
                    # Jump the processed char.
                    line = line[1:]
                continue
            elif line.startswith(u'/*'):
                # It's a multi line comment
                is_multi_line_comment = True
                ignore_comment = False
                last_comment_line_num = line_num
                # Jump the comment starting tag
                line = line[2:]
                continue
            elif line.startswith(u'//') and not just_saw_colon:
                # End-of-line comment.
                last_comment = '%s\n' % line[2:].strip()
                last_comment_line_num = line_num
                # On to next line.
                break
            elif is_message:
                # Store the char and continue.
                head_char = line[0]
                translation += head_char
                line = line[1:]
                just_saw_colon = (head_char == ':')
                continue
            elif u'=' in line:
                # Looks like a message string.
                (key, value) = line.split('=', 1)
                # Remove leading and trailing white spaces.
                key = key.strip()

                if valid_property_msgid(key):
                    is_message = True
                    # Jump the msgid, control chars and leading white
                    # space.
                    line = value.lstrip()
                    continue
                else:
                    raise TranslationFormatSyntaxError(
                        line_number=line_num,
                        message=u"invalid msgid: '%s'" % key)
            else:
                # Got a line that is not a valid message nor a valid
                # comment. Ignore it because main en-US.xpi catalog from
                # Firefox has such line/error. We follow the 'be strict
                # with what you export, be permisive with what you import'
                # policy.
                break

        if is_message:
            # We just parsed a message, so we need to add it to the list
            # of messages.
            if ignore_comment or last_comment_line_num < line_num - 1:
                # We must ignore the comment or either the comment is not
                # the last thing before this message or is not in the same
                # line as this message.
                last_comment = None
                ignore_comment = False

            message = TranslationMessageData()
            message.msgid_singular = key
            message.context = self.chrome_path
            message.file_references_list = [
                "%s:%d(%s)" % (self.filename, line_num, key)]
            value = translation.strip()
            message.addTranslation(
                TranslationConstants.SINGULAR_FORM, value)
            message.singular_text = value
            message.source_comment = last_comment
            self.messages.append(message)

            # Reset status vars.
            last_comment = None
            last_comment_line_num = 0
            is_message = False
            translation = u''
def _parseFreshLine(self, line, original_line):
    """Parse a new line (not a continuation after escaped newline).

    Handles obsolete markers ('#~'), comment lines and the section
    keywords (msgctxt, msgid, msgid_plural, msgstr).  A comment, msgid
    or msgctxt line that follows a msgstr section closes the current
    message and starts a new one.

    :param line: Remaining part of input line.
    :param original_line: Line as it originally was on input.
    :return: If there is one, the first line of a quoted string belonging
        to the line's section (possibly empty, after a warning).  None
        for lines that are consumed entirely here, such as comments.
    :raises TranslationFormatSyntaxError: on a keyword that is not
        allowed in the current section, on a bad or unterminated
        `msgstr[...]` index, or on content outside of any section.
    """
    is_obsolete = False
    if line.startswith('#~'):
        if line.startswith('#~|'):
            # This is an old msgid for an obsolete message.
            return None
        else:
            is_obsolete = True
            line = line[2:].lstrip()
            if len(line) == 0:
                return None

    # If we get a comment line after a msgstr or a line starting with
    # msgid or msgctxt, this is a new entry.
    starts_entry = line.startswith(('#', 'msgid', 'msgctxt'))
    if starts_entry and self._section == 'msgstr':
        if self._message is None:
            # first entry - do nothing.
            pass
        elif self._message.msgid_singular:
            self._dumpCurrentSection()
            self._storeCurrentMessage()
        elif self._translation_file.header is None:
            # When there is no msgid in the parsed message, it's the
            # header for this file.
            self._dumpCurrentSection()
            self._parseHeader(
                self._message.translations[
                    TranslationConstants.SINGULAR_FORM],
                self._message.comment)
        else:
            self._emitSyntaxWarning("We got a second header.")

        # Start a new message.
        self._message = TranslationMessageData()
        self._message_lineno = self._lineno
        self._section = None
        self._plural_case = None
        self._parsed_content = u''

    if self._message is not None:
        # Record whether the message is obsolete.
        self._message.is_obsolete = is_obsolete

    if line[0] == '#':
        # Record flags
        if line[:2] == '#,':
            new_flags = [flag.strip() for flag in line[2:].split(',')]
            self._message.flags.update(new_flags)
            return None
        # Record file references
        if line[:2] == '#:':
            if self._message.file_references:
                # There is already a file reference, let's split it from
                # the new one with a new line char.
                self._message.file_references += '\n'
            self._message.file_references += line[2:].strip()
            return None
        # Record source comments
        if line[:2] == '#.':
            self._message.source_comment += line[2:].strip() + '\n'
            return None
        # Record comments
        self._message.comment += line[1:] + '\n'
        return None

    # Now we are in a msgctxt or msgid section, output previous section
    if line.startswith('msgid_plural'):
        if self._section != 'msgid':
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message="Unexpected keyword: msgid_plural")
        self._dumpCurrentSection()
        self._section = 'msgid_plural'
        line = line[len('msgid_plural'):]
    elif line.startswith('msgctxt'):
        if (self._section is not None and
            (self._section == 'msgctxt' or
             self._section.startswith('msgid'))):
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message="Unexpected keyword: msgctxt")
        self._section = 'msgctxt'
        line = line[len('msgctxt'):]
    elif line.startswith('msgid'):
        if (self._section is not None and
            self._section.startswith('msgid')):
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message="Unexpected keyword: msgid")
        if self._section is not None:
            self._dumpCurrentSection()
        self._section = 'msgid'
        line = line[len('msgid'):]
        self._plural_case = None
    # Now we are in a msgstr section
    elif line.startswith('msgstr'):
        self._dumpCurrentSection()
        self._section = 'msgstr'
        line = line[len('msgstr'):]
        # XXX kiko 2005-08-19: if line is empty, it means we got an msgstr
        # followed by a newline; that may be critical, but who knows?
        if line.startswith('['):
            # Plural case.  partition() rather than split(): a missing
            # closing bracket must become a syntax error, not a
            # ValueError from unpacking.
            index_text, closing_bracket, line = line[1:].partition(']')
            if closing_bracket:
                try:
                    new_plural_case = int(index_text)
                except ValueError:
                    # Trigger "invalid plural case number" error.
                    new_plural_case = -1
            else:
                # Unterminated "msgstr[": same error.
                new_plural_case = -1

            if new_plural_case < 0:
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Invalid plural case number.")
            elif new_plural_case >= TranslationConstants.MAX_PLURAL_FORMS:
                raise TranslationFormatSyntaxError(
                    line_number=self._lineno,
                    message="Unsupported plural case number.")

            if (self._plural_case is not None) and (
                    new_plural_case != self._plural_case + 1):
                self._emitSyntaxWarning("Bad plural case number.")
            if new_plural_case != self._plural_case:
                self._plural_case = new_plural_case
            else:
                self._emitSyntaxWarning(
                    "msgstr[] repeats same plural case number.")
        else:
            self._plural_case = TranslationConstants.SINGULAR_FORM
    elif self._section is None:
        raise TranslationFormatSyntaxError(
            line_number=self._lineno,
            message='Invalid content: %r' % original_line)
    else:
        # This line could be the continuation of a previous section.
        pass

    line = line.strip()
    if len(line) == 0:
        self._emitSyntaxWarning(
            "Line has no content; this is not supported by some "
            "implementations of msgfmt.")
    return line
def _parseQuotedString(self, string):
    r"""Parse a quoted string, interpreting escape sequences.

      >>> parser = POParser()
      >>> parser._parseQuotedString(u'\"abc\"')
      u'abc'
      >>> parser._parseQuotedString(u'\"abc\\ndef\"')
      u'abc\ndef'
      >>> parser._parseQuotedString(u'\"ab\x63\"')
      u'abc'
      >>> parser._parseQuotedString(u'\"ab\143\"')
      u'abc'

    After the string has been converted to unicode, the backslash
    escaped sequences are still in the encoding that the charset header
    specifies. Such quoted sequences will be converted to unicode by
    this method.

    We don't know the encoding of the escaped characters and cannot be
    just recoded as Unicode so it's a TranslationFormatInvalidInputError

      >>> utf8_string = u'"view \\302\\253${version_title}\\302\\273"'
      >>> parser._parseQuotedString(utf8_string)
      Traceback (most recent call last):
      ...
      TranslationFormatInvalidInputError: Could not decode escaped string: (\302\253)

    Now, we note the original encoding so we get the right Unicode
    string.

      >>> class FakeHeader:
      ...     charset = 'UTF-8'
      >>> parser._translation_file = TranslationFileData()
      >>> parser._translation_file.header = FakeHeader()
      >>> parser._parseQuotedString(utf8_string)
      u'view \xab${version_title}\xbb'

    Let's see that we raise a TranslationFormatInvalidInputError
    exception when we have an escaped char that is not valid in the
    declared encoding of the original string:

      >>> iso8859_1_string = u'"foo \\xf9"'
      >>> parser._parseQuotedString(iso8859_1_string)
      Traceback (most recent call last):
      ...
      TranslationFormatInvalidInputError: Could not decode escaped string as UTF-8: (\xf9)

    An error will be raised if the entire string isn't contained in
    quotes properly:

      >>> parser._parseQuotedString(u'abc')
      Traceback (most recent call last):
      ...
      TranslationFormatSyntaxError: String is not quoted
      >>> parser._parseQuotedString(u'\"ab')
      Traceback (most recent call last):
      ...
      TranslationFormatSyntaxError: String not terminated
      >>> parser._parseQuotedString(u'\"ab\"x')
      Traceback (most recent call last):
      ...
      TranslationFormatSyntaxError: Extra content found after string: (x)
    """
    remaining = string
    if self._escaped_line_break:
        # Continuing a line after an escaped newline.  Strip indentation;
        # there is no opening quote to expect.
        remaining = remaining.lstrip()
        self._escaped_line_break = False
    elif remaining[0] == '"':
        # Regular string: swallow the opening quote.
        remaining = remaining[1:]
    else:
        raise TranslationFormatSyntaxError(
            line_number=self._lineno, message="String is not quoted")

    output = ''
    while remaining:
        head = remaining[0]
        if head == '"':
            # Reached the end of the quoted string.  It's rare, but there
            # may be another quoted string on the same line.  It should
            # be suffixed to what we already have, with any whitespace
            # between the strings removed.
            remaining = remaining[1:].lstrip()
            if not remaining:
                # End of line, end of string: the normal case.
                break
            if remaining[0] != '"':
                # Anything but a re-opening quote here is non-string
                # data, which is not allowed.
                if not remaining.isspace():
                    raise TranslationFormatSyntaxError(
                        line_number=self._lineno,
                        message=("Extra content found after string: (%s)"
                                 % remaining))
                break
            # Start of a new string.  The closing quote and the
            # intervening whitespace are gone already; swallow the
            # re-opening quote and carry on as if the string just went
            # on normally.
            remaining = remaining[1:]
        elif head == '\\':
            if len(remaining) == 1:
                # A lone backslash at the end of the line: the string
                # continues on the next line.
                self._escaped_line_break = True
                remaining = ''
                break
            escape_char = remaining[1]
            if escape_char in ESCAPE_MAP:
                # We got one of the special escaped chars we know about.
                # Unescape it using the mapping table.
                output += ESCAPE_MAP[escape_char]
                remaining = remaining[2:]
            else:
                unescaped, remaining = (
                    self._unescapeNumericCharSequence(remaining))
                output += unescaped
        else:
            # Normal text.  Eat up as much as we can in one go.
            text_run = re.match(STRAIGHT_TEXT_RUN, remaining)
            output += text_run.group()
            remaining = remaining[text_run.end():]
    else:
        # We ran out of input without finding the ending quote char.
        raise TranslationFormatSyntaxError(
            line_number=self._lineno, message="String not terminated")

    return output
def _unescapeNumericCharSequence(self, string):
    """Unescape leading sequence of escaped numeric character codes.

    This is for characters given in hexadecimal or octal escape notation.

    :param string: Text that may start with numeric escape sequences.
    :return: a tuple: first, any leading part of `string` as an unescaped
        string (empty if `string` did not start with a numeric escape
        sequence), and second, the remainder of `string` after the
        leading numeric escape sequences have been parsed.
    :raises TranslationFormatSyntaxError: on an escape sequence that is
        neither numeric nor listed in `ESCAPE_MAP`, or on a numeric
        escape sequence that cannot be unescaped.
    :raises TranslationFormatInvalidInputError: if the unescaped bytes
        cannot be decoded in the file's charset (ASCII when no header
        has been parsed yet).
    """
    position = 0
    length = len(string)
    while position + 1 < length and string[position] == '\\':
        # Handle escaped characters given as numeric character codes.
        # These will still be in the original encoding.  We extract the
        # whole sequence of escaped chars to recode them later into
        # Unicode in a single call.
        lead_char = string[position + 1]
        if lead_char == 'x':
            # Hexadecimal escape.
            position += 4
        elif lead_char.isdigit():
            # Octal escape.
            position += 2
            # Up to two more octal digits.  The escape may be the very
            # last thing in the string, so stay within bounds.
            for i in range(2):
                if position < length and string[position].isdigit():
                    position += 1
                else:
                    break
        elif lead_char in ESCAPE_MAP:
            # It's part of our mapping table, we ignore it here.
            break
        else:
            raise TranslationFormatSyntaxError(
                line_number=self._lineno,
                message=("Unknown escape sequence %s" %
                         string[position:position + 2]))

    if position == 0:
        # No escaping to be done.
        return '', string

    # We found some text escaped that should be recoded to Unicode.
    # First, we unescape it.
    escaped_string, string = string[:position], string[position:]
    try:
        unescaped_string = escaped_string.decode('string-escape')
    except ValueError:
        # The codec rejects malformed sequences such as a truncated hex
        # escape; report that as bad input syntax.
        raise TranslationFormatSyntaxError(
            line_number=self._lineno,
            message="Invalid escape sequence %s" % escaped_string)

    if (self._translation_file is not None and
        self._translation_file.header is not None):
        # There is a header, so we know the original encoding for
        # the given string.
        charset = self._translation_file.header.charset
        know_charset = True
    else:
        # We don't know the original encoding of the imported file so we
        # cannot get the right values.  We try ASCII.
        # XXX JeroenVermeulen 2008-02-08: might as well try UTF-8 here.
        # It's a superset, and anything that's not UTF-8 is very unlikely
        # to validate as UTF-8.
        charset = 'ascii'
        know_charset = False

    try:
        decoded_text = unescaped_string.decode(charset)
    except UnicodeDecodeError:
        if know_charset:
            message = ("Could not decode escaped string as %s: (%s)"
                       % (charset, escaped_string))
        else:
            message = ("Could not decode escaped string: (%s)"
                       % escaped_string)
        raise TranslationFormatInvalidInputError(
            line_number=self._lineno, message=message)

    return decoded_text, string
def parse(self, content_text):
    """Parse string as a PO file.

    The header is parsed first, line by line, with each line decoded
    using the charset that `parse_charset` finds in `content_text`.
    Whatever is left in `self._pending_unichars` is then parsed in one
    go.

    :param content_text: The PO file's content.
    :return: `self._translation_file`, a `TranslationFileData`.
    :raises TranslationFormatSyntaxError: if the file has no messages
        and no header, has no header at all, or ends in the middle of a
        message.
    """
    # Initialize the parser.
    self._translation_file = TranslationFileData()
    self._messageids = set()
    self._pending_chars = content_text
    self._pending_unichars = u''
    self._lineno = 0
    # Message specific variables.
    self._message = TranslationMessageData()
    self._message_lineno = self._lineno
    self._section = None
    self._plural_case = None
    self._parsed_content = u''

    # First thing to do is to get the charset used in the content_text.
    encoding = parse_charset(content_text)

    # Now, parse the header, inefficiently.  It ought to be short, so
    # this isn't disastrous.
    while True:
        header_line = self._getHeaderLine()
        if header_line is None:
            break
        self._parseLine(header_line.decode(encoding))
        header_found = self._translation_file.header is not None
        if header_found or self._message.msgid_singular:
            # Either found the header already or it's a message with a
            # non empty msgid which means is not a header.
            break

    if header_line is None:
        # The input ran out while we were still looking for the header.
        header_missing = self._translation_file.header is None
        if header_missing and not self._message.msgid_singular:
            # This file contains no actual messages.
            self._dumpCurrentSection()

            # It may contain a header though.
            if not self._message.translations:
                raise TranslationFormatSyntaxError(
                    message="File contains no messages.")
            header_text = self._message.translations[
                TranslationConstants.SINGULAR_FORM]
            self._parseHeader(header_text, self._message.comment)

        # There is nothing left to parse.
        return self._translation_file

    # Parse anything left all in one go.
    for body_line in re.split(r'\n|\r\n|\r', self._pending_unichars):
        self._parseLine(body_line)

    if self._translation_file.header is None:
        raise TranslationFormatSyntaxError(
            message='No header found in this pofile')

    if self._message is None or self._section is None:
        # Either there is no message pending, or it has no content or is
        # just a comment: nothing to dump.
        return self._translation_file

    if self._section != 'msgstr':
        raise TranslationFormatSyntaxError(
            line_number=self._lineno,
            message='Got a truncated message!')

    # We need to dump the latest message.
    self._dumpCurrentSection()
    self._storeCurrentMessage()
    return self._translation_file
def error(self, msg):
    """Report `msg` as a syntax error in this object's file.

    :param msg: Description of the problem.
    :raises TranslationFormatSyntaxError: always, carrying
        `self.filename` and `msg`.
    """
    syntax_error = TranslationFormatSyntaxError(
        filename=self.filename, message=msg)
    raise syntax_error