def _build_text_filter(self):
    """ Build a suitable TextFilter object. """
    text_filter = TextFilter(logger=self.logger)
    self.log(u"Created TextFilter object")
    for key, cls, param_name in [
            (gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX, TextFilterIgnoreRegex, "regex"),
            (gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP, TextFilterTransliterate, "map_file_path")
    ]:
        cls_name = cls.__name__
        param_value = gf.safe_get(self.parameters, key, None)
        if param_value is not None:
            self.log([u"Creating %s object...", cls_name])
            params = {param_name: param_value, "logger": self.logger}
            try:
                inner_filter = cls(**params)
                text_filter.add_filter(inner_filter)
                self.log([u"Creating %s object... done", cls_name])
            except ValueError as exc:
                self.log_exc(u"Creating %s object failed" % (cls_name), exc, False, None)
    return text_filter
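# A sketch of the parameter values the loop above reacts to; the regex value
# and the unset transliteration map are hypothetical examples, not values from
# the library.  A key that is missing or maps to None is skipped, so here only
# TextFilterIgnoreRegex would be instantiated and added to the composite filter.
import aeneas.globalconstants as gc

parameters = {
    gc.PPN_TASK_IS_TEXT_FILE_IGNORE_REGEX: u"\\[.*?\\]",   # hypothetical: ignore bracketed spans
    gc.PPN_TASK_IS_TEXT_FILE_TRANSLITERATE_MAP: None,      # None => no transliteration filter added
}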
def test_safe_get(self):
    tests = [
        (None, None, u"default", u"default"),
        (None, u"key", u"default", u"default"),
        ({}, None, u"default", u"default"),
        ({}, u"key", u"default", u"default"),
        ([], u"key", u"default", u"default"),
        ({u"key": u"value"}, None, u"default", u"default"),
        ({u"key": u"value"}, u"key", u"default", u"value"),
    ]
    for test in tests:
        self.assertEqual(gf.safe_get(test[0], test[1], test[2]), test[3])
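# A minimal sketch of a safe_get helper consistent with the test cases above;
# it is reconstructed from that behavior and is not necessarily identical to
# the real aeneas.globalfunctions.safe_get.
def safe_get(dictionary, key, default_value, can_return_none=True):
    """ Return dictionary[key], or default_value if the lookup fails. """
    try:
        return_value = dictionary[key]
    except (KeyError, TypeError):
        # KeyError: key not present; TypeError: dictionary is None or not a mapping
        return default_value
    if (return_value is None) and (not can_return_none):
        # treat an explicit None value as missing when requested
        return default_value
    return return_value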
def _get_id_format(self):
    """ Return the id regex from the parameters """
    id_format = gf.safe_get(
        self.parameters,
        gc.PPN_TASK_OS_FILE_ID_REGEX,
        self.DEFAULT_ID_FORMAT,
        can_return_none=False
    )
    try:
        identifier = id_format % 1
    except (TypeError, ValueError) as exc:
        self.log_exc(u"String '%s' is not a valid id format" % (id_format), exc, True, ValueError)
    return id_format
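# A hedged illustration of the printf-style check above: an id format is
# accepted when it interpolates a single integer without raising.  The
# candidate formats are examples only, not the class DEFAULT_ID_FORMAT.
for candidate in [u"f%06d", u"line%03d", u"f", u"%d-%d"]:
    try:
        print(u"%s -> %s" % (candidate, candidate % 1))   # e.g. u"f%06d" -> u"f000001"
    except (TypeError, ValueError) as exc:
        print(u"%s rejected: %s" % (candidate, exc))      # u"f" and u"%d-%d" are rejected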
def read(self, sync_map_format, input_file_path, parameters=None):
    """
    Read sync map fragments from the given file in the specified format,
    and add them to the current (this) sync map.

    Return ``True`` if the call succeeded,
    ``False`` if an error occurred.

    :param sync_map_format: the format of the sync map
    :type  sync_map_format: :class:`~aeneas.syncmap.SyncMapFormat`
    :param string input_file_path: the path to the input file to read
    :param dict parameters: additional parameters (e.g., for ``SMIL`` input)
    :raises: ValueError: if ``sync_map_format`` is ``None`` or it is not an allowed value
    :raises: OSError: if ``input_file_path`` does not exist
    """
    if sync_map_format is None:
        self.log_exc(u"Sync map format is None", None, True, ValueError)
    if sync_map_format not in SyncMapFormat.CODE_TO_CLASS:
        self.log_exc(u"Sync map format '%s' is not allowed" % (sync_map_format), None, True, ValueError)
    if not gf.file_can_be_read(input_file_path):
        self.log_exc(u"Cannot read sync map file '%s'. Wrong permissions?" % (input_file_path), None, True, OSError)
    self.log([u"Input format: '%s'", sync_map_format])
    self.log([u"Input path: '%s'", input_file_path])
    self.log([u"Input parameters: '%s'", parameters])
    reader = (SyncMapFormat.CODE_TO_CLASS[sync_map_format])(
        variant=sync_map_format,
        parameters=parameters,
        rconf=self.rconf,
        logger=self.logger
    )
    # open file for reading
    self.log(u"Reading input file...")
    with io.open(input_file_path, "r", encoding="utf-8") as input_file:
        input_text = input_file.read()
    reader.parse(input_text=input_text, syncmap=self)
    self.log(u"Reading input file... done")
    # overwrite language if requested
    language = gf.safe_get(parameters, gc.PPN_SYNCMAP_LANGUAGE, None)
    if language is not None:
        self.log([u"Overwriting language to '%s'", language])
        for fragment in self.fragments:
            fragment.text_fragment.language = language
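# A hedged usage sketch of the read() method above: load an existing SRT sync
# map and force the language of every fragment.  The path is a placeholder,
# and SyncMapFormat.SRT plus the "eng" code are assumptions about the
# surrounding library rather than values taken from this snippet.
import aeneas.globalconstants as gc
from aeneas.syncmap import SyncMap, SyncMapFormat

syn = SyncMap()
syn.read(
    SyncMapFormat.SRT,
    "/path/to/existing.srt",                       # placeholder path
    parameters={gc.PPN_SYNCMAP_LANGUAGE: "eng"}    # exercises the language override branch
)
print(len(syn.fragments))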
def _mplain_word_separator(self):
    """
    Get the word separator to split words in mplain format.

    :rtype: string
    """
    word_separator = gf.safe_get(self.parameters, gc.PPN_TASK_IS_TEXT_MPLAIN_WORD_SEPARATOR, u" ")
    if (word_separator is None) or (word_separator == "space"):
        return u" "
    elif word_separator == "equal":
        return u"="
    elif word_separator == "pipe":
        return u"|"
    elif word_separator == "tab":
        return u"\u0009"
    return word_separator
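# How the resolved separator is typically consumed; this is a sketch, not the
# library's actual call site.  The sample line and the choice of the "pipe"
# symbolic value are made-up examples.
separator = u"|"
line = u"From|fairest|creatures"
words = [w for w in line.split(separator) if len(w) > 0]
# words == [u"From", u"fairest", u"creatures"]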
def _read_unparsed(self, lines):
    """
    Read text fragments from an unparsed format text file.

    :param list lines: the lines of the unparsed text file
    """
    from bs4 import BeautifulSoup

    def filter_attributes():
        """ Return a dict with the bs4 filter parameters """
        attributes = {}
        for attribute_name, filter_name in [
                ("class", gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX),
                ("id", gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX)
        ]:
            if filter_name in self.parameters:
                regex_string = self.parameters[filter_name]
                if regex_string is not None:
                    self.log([u"Regex for %s: '%s'", attribute_name, regex_string])
                    regex = re.compile(r".*\b" + regex_string + r"\b.*")
                    attributes[attribute_name] = regex
        return attributes
    #
    # TODO better and/or parametric parsing,
    # for example, removing tags but keeping text, etc.
    #
    self.log(u"Parsing fragments from unparsed text format")
    # transform text in a soup object
    self.log(u"Creating soup")
    soup = BeautifulSoup("\n".join(lines), "lxml")
    # extract according to class_regex and id_regex
    text_from_id = {}
    ids = []
    filter_attributes = filter_attributes()
    self.log([u"Finding elements matching attributes '%s'", filter_attributes])
    nodes = soup.findAll(attrs=filter_attributes)
    for node in nodes:
        try:
            f_id = gf.safe_unicode(node["id"])
            f_text = gf.safe_unicode(node.text)
            text_from_id[f_id] = f_text
            ids.append(f_id)
        except KeyError:
            self.log_warn(u"KeyError while parsing a node")
    # sort by ID as requested
    id_sort = gf.safe_get(
        dictionary=self.parameters,
        key=gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT,
        default_value=IDSortingAlgorithm.UNSORTED,
        can_return_none=False
    )
    self.log([u"Sorting text fragments using '%s'", id_sort])
    sorted_ids = IDSortingAlgorithm(id_sort).sort(ids)
    # append to fragments
    self.log(u"Appending fragments")
    self._create_text_fragments([(key, [text_from_id[key]]) for key in sorted_ids])
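# A hedged illustration of the input the unparsed reader above expects.  The
# HTML lines and the two regexes are made-up examples: elements whose class
# and id attributes match the regexes built from
# PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX and PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX
# are collected, then sorted by id.
lines = [
    u'<div id="f001" class="ra">From fairest creatures</div>',
    u'<div id="f002" class="ra">we desire increase</div>',
    u'<div id="skip">front matter with no matching attributes</div>',
]
# with class regex u"ra" and id regex u"f[0-9]+", only f001 and f002 are kept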
def format(self, syncmap):
    # check for required parameters
    for key in [
        gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF,
        gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF
    ]:
        if gf.safe_get(self.parameters, key, None) is None:
            self.log_exc(u"Parameter %s must be specified for format %s" % (key, self.variant), None, True, SyncMapMissingParameterError)

    from lxml import etree

    # we are sure we have them
    text_ref = self.parameters[gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF]
    audio_ref = self.parameters[gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF]

    # namespaces
    smil_ns = "http://www.w3.org/ns/SMIL"
    epub_ns = "http://www.idpf.org/2007/ops"
    ns_map = {None: smil_ns, "epub": epub_ns}

    # build tree
    smil_elem = etree.Element("{%s}smil" % smil_ns, nsmap=ns_map)
    smil_elem.attrib["version"] = "3.0"
    body_elem = etree.SubElement(smil_elem, "{%s}body" % smil_ns)
    seq_elem = etree.SubElement(body_elem, "{%s}seq" % smil_ns)
    seq_elem.attrib["id"] = u"seq000001"
    seq_elem.attrib["{%s}textref" % epub_ns] = text_ref

    if syncmap.is_single_level:
        # single level
        for i, fragment in enumerate(syncmap.fragments, 1):
            text = fragment.text_fragment
            par_elem = etree.SubElement(seq_elem, "{%s}par" % smil_ns)
            par_elem.attrib["id"] = "par%06d" % (i)
            text_elem = etree.SubElement(par_elem, "{%s}text" % smil_ns)
            text_elem.attrib["src"] = "%s#%s" % (text_ref, text.identifier)
            audio_elem = etree.SubElement(par_elem, "{%s}audio" % smil_ns)
            audio_elem.attrib["src"] = audio_ref
            audio_elem.attrib["clipBegin"] = self.format_time_function(fragment.begin)
            audio_elem.attrib["clipEnd"] = self.format_time_function(fragment.end)
    else:
        # TODO support generic multiple levels
        # multiple levels
        for par_index, par_child in enumerate(syncmap.fragments_tree.children_not_empty, 1):
            par_seq_elem = etree.SubElement(seq_elem, "{%s}seq" % smil_ns)
            # COMMENTED par_seq_elem.attrib["id"] = "p%06d" % (par_index)
            par_seq_elem.attrib["{%s}type" % epub_ns] = "paragraph"
            par_seq_elem.attrib["{%s}textref" % epub_ns] = text_ref + "#" + par_child.value.text_fragment.identifier
            for sen_index, sen_child in enumerate(par_child.children_not_empty, 1):
                sen_seq_elem = etree.SubElement(par_seq_elem, "{%s}seq" % smil_ns)
                # COMMENTED sen_seq_elem.attrib["id"] = par_seq_elem.attrib["id"] + "s%06d" % (sen_index)
                sen_seq_elem.attrib["{%s}type" % epub_ns] = "sentence"
                sen_seq_elem.attrib["{%s}textref" % epub_ns] = text_ref + "#" + sen_child.value.text_fragment.identifier
                for wor_index, wor_child in enumerate(sen_child.children_not_empty, 1):
                    fragment = wor_child.value
                    text = fragment.text_fragment
                    wor_seq_elem = etree.SubElement(sen_seq_elem, "{%s}seq" % smil_ns)
                    # COMMENTED wor_seq_elem.attrib["id"] = sen_seq_elem.attrib["id"] + "w%06d" % (wor_index)
                    wor_seq_elem.attrib["{%s}type" % epub_ns] = "word"
                    wor_seq_elem.attrib["{%s}textref" % epub_ns] = text_ref + "#" + text.identifier
                    wor_par_elem = etree.SubElement(wor_seq_elem, "{%s}par" % smil_ns)
                    text_elem = etree.SubElement(wor_par_elem, "{%s}text" % smil_ns)
                    text_elem.attrib["src"] = "%s#%s" % (text_ref, text.identifier)
                    audio_elem = etree.SubElement(wor_par_elem, "{%s}audio" % smil_ns)
                    audio_elem.attrib["src"] = audio_ref
                    audio_elem.attrib["clipBegin"] = self.format_time_function(fragment.begin)
                    audio_elem.attrib["clipEnd"] = self.format_time_function(fragment.end)
    return self._tree_to_string(smil_elem, xml_declaration=False)
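# Shape of the single-level SMIL produced by format() above, sketched for one
# fragment with identifier "f000001"; the page ref, audio ref, clip times, and
# the exact clock-value format (which depends on format_time_function) are
# placeholder assumptions.
#
# <smil xmlns="http://www.w3.org/ns/SMIL"
#       xmlns:epub="http://www.idpf.org/2007/ops" version="3.0">
#   <body>
#     <seq id="seq000001" epub:textref="page.xhtml">
#       <par id="par000001">
#         <text src="page.xhtml#f000001"/>
#         <audio src="audio.mp3" clipBegin="0:00:00.000" clipEnd="0:00:01.234"/>
#       </par>
#     </seq>
#   </body>
# </smil>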