Пример #1
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from an XML document made of ``TIME_SLOT`` and
     ``ALIGNABLE_ANNOTATION`` elements, and add them to ``syncmap``.

     Each ``ALIGNABLE_ANNOTATION`` becomes one fragment: its interval
     comes from the two time slots it references, and its lines are
     the texts of its ``ANNOTATION_VALUE`` descendants.

     :param input_text: the XML document to be parsed
     :param syncmap: the sync map to which the fragments are added
     :raises KeyError: if an annotation references an unknown time slot
     """
     from lxml import etree
     root = etree.fromstring(gf.safe_bytes(input_text))
     # map each time slot ID to its parsed TIME_VALUE divided by 1000
     # (presumably milliseconds -> seconds; confirm against the format)
     time_slots = {
         slot.get("TIME_SLOT_ID"): gf.time_from_ssmmm(slot.get("TIME_VALUE")) / 1000
         for slot in root.iter("TIME_SLOT")
     }
     # one fragment per alignable annotation, in document order
     for alignable in root.iter("ALIGNABLE_ANNOTATION"):
         identifier = gf.safe_unicode(alignable.get("ANNOTATION_ID"))
         begin = time_slots[alignable.get("TIME_SLOT_REF1")]
         end = time_slots[alignable.get("TIME_SLOT_REF2")]
         lines = [
             gf.safe_unicode(value.text)
             for value in alignable.iter("ANNOTATION_VALUE")
         ]
         self._add_fragment(
             syncmap=syncmap,
             identifier=identifier,
             lines=lines,
             begin=begin,
             end=end
         )
Пример #2
0
 def parse(self, input_text, syncmap):
     """
     Parse an XML document containing ``TIME_SLOT`` and
     ``ALIGNABLE_ANNOTATION`` elements, adding one fragment
     per annotation to ``syncmap``.

     The begin and end of a fragment are looked up from the time slots
     referenced by ``TIME_SLOT_REF1`` and ``TIME_SLOT_REF2``;
     its lines are the texts of its ``ANNOTATION_VALUE`` descendants.

     :param input_text: the XML document to be parsed
     :param syncmap: the sync map to which the fragments are added
     :raises KeyError: if an annotation references an unknown time slot
     """
     from lxml import etree
     root = etree.fromstring(gf.safe_bytes(input_text))

     # index the time slots by ID; the parsed TIME_VALUE is divided by 1000
     # (presumably milliseconds -> seconds; confirm against the format)
     time_slots = dict()
     for slot in root.iter("TIME_SLOT"):
         slot_time = gf.time_from_ssmmm(slot.get("TIME_VALUE")) / 1000
         time_slots[slot.get("TIME_SLOT_ID")] = slot_time

     # emit the annotations in document order
     for alignable in root.iter("ALIGNABLE_ANNOTATION"):
         identifier = gf.safe_unicode(alignable.get("ANNOTATION_ID"))
         begin = time_slots[alignable.get("TIME_SLOT_REF1")]
         end = time_slots[alignable.get("TIME_SLOT_REF2")]
         lines = []
         for value in alignable.iter("ANNOTATION_VALUE"):
             lines.append(gf.safe_unicode(value.text))
         self._add_fragment(syncmap=syncmap,
                            identifier=identifier,
                            lines=lines,
                            begin=begin,
                            end=end)
Пример #3
0
    def _read_unparsed(self, lines):
        """
        Build text fragments from the lines of an unparsed format text file.

        The lines are joined and parsed into a soup; the elements matching
        the ``class`` and/or ``id`` regexes found in ``self.parameters``
        are collected by their ``id`` attribute, ordered with the requested
        ID sorting algorithm, and passed to ``_create_text_fragments``.
        Matching elements without an ``id`` attribute are skipped
        with a warning.

        :param list lines: the lines of the unparsed text file
        """
        def build_attribute_filters():
            """ Return a dict mapping attribute names to compiled regexes, used as bs4 filter """
            filters = {}
            candidates = [
                ("class", gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX),
                ("id", gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX)
            ]
            for attribute_name, parameter_name in candidates:
                if parameter_name not in self.parameters:
                    continue
                pattern = self.parameters[parameter_name]
                if pattern is None:
                    continue
                self.log([u"Regex for %s: '%s'", attribute_name, pattern])
                filters[attribute_name] = re.compile(r".*\b" + pattern + r"\b.*")
            return filters
        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from unparsed text format")

        # parse the whole text into a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")

        # select the elements matching the class and/or id regexes
        attribute_filters = build_attribute_filters()
        self.log([u"Finding elements matching attributes '%s'", attribute_filters])
        text_from_id = {}
        ids = []
        for node in soup.findAll(attrs=attribute_filters):
            try:
                node_id = gf.safe_unicode(node["id"])
                node_text = gf.safe_unicode(node.text)
                text_from_id[node_id] = node_text
                ids.append(node_id)
            except KeyError:
                # the element has no id attribute
                self.log_warn(u"KeyError while parsing a node")

        # order the IDs as requested
        id_sort = gf.safe_get(
            dictionary=self.parameters,
            key=gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT,
            default_value=IDSortingAlgorithm.UNSORTED,
            can_return_none=False
        )
        self.log([u"Sorting text fragments using '%s'", id_sort])
        sorter = IDSortingAlgorithm(id_sort)
        sorted_ids = sorter.sort(ids)

        # each fragment has a single line: the text of its element
        self.log(u"Appending fragments")
        pairs = [(key, [text_from_id[key]]) for key in sorted_ids]
        self._create_text_fragments(pairs)
Пример #4
0
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        The first argument is the text format (or ``list``),
        the second one is the text file path (or the text itself,
        for ``list``). The options are validated against the format
        before building and printing the text file.

        :rtype: int
        """
        if len(self.actual_arguments) < 2:
            return self.print_help()
        text_format = gf.safe_unicode(self.actual_arguments[0])
        is_list = (text_format == u"list")
        if (not is_list) and (text_format not in TextFileFormat.ALLOWED_VALUES):
            return self.print_help()
        if is_list:
            text = gf.safe_unicode(self.actual_arguments[1])
        else:
            text = self.actual_arguments[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE

        # read the options, in the same order as they are documented
        l1_id_regex = self.has_option_with_value(u"--l1-id-regex")
        l2_id_regex = self.has_option_with_value(u"--l2-id-regex")
        l3_id_regex = self.has_option_with_value(u"--l3-id-regex")
        id_regex = self.has_option_with_value(u"--id-regex")
        id_format = self.has_option_with_value(u"--id-format")
        class_regex = self.has_option_with_value(u"--class-regex")
        sort = self.has_option_with_value(u"--sort")
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
            gc.PPN_TASK_OS_FILE_ID_REGEX: id_format
        }

        # munparsed requires all the three level regexes
        if text_format == TextFileFormat.MUNPARSED:
            if (l1_id_regex is None) or (l2_id_regex is None) or (l3_id_regex is None):
                self.print_error(u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format")
                return self.ERROR_EXIT_CODE
        # unparsed requires at least one of the two regexes
        if text_format == TextFileFormat.UNPARSED:
            if (id_regex is None) and (class_regex is None):
                self.print_error(u"You must specify --id-regex and/or --class-regex for unparsed format")
                return self.ERROR_EXIT_CODE
        # the id format, if given, must be able to format an integer
        if (id_format is not None) and (text_format in [TextFileFormat.PLAIN, TextFileFormat.SUBTITLES]):
            try:
                probe = id_format % 1
            except (TypeError, ValueError):
                self.print_error(u"The given string '%s' is not a valid id format" % id_format)
                return self.ERROR_EXIT_CODE

        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(u"Unable to build a TextFile from the given parameters")
            return self.ERROR_EXIT_CODE
        if len(text_file) == 0:
            self.print_error(u"No text fragments found")
            return self.ERROR_EXIT_CODE
        self.print_generic(text_file.__unicode__())
        return self.NO_ERROR_EXIT_CODE
Пример #5
0
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        Help is printed when fewer than two arguments are given or
        when the text format is neither ``list`` nor an allowed value.
        Otherwise the options are checked for the given format,
        the text file is built, and it is printed if it has fragments.

        :rtype: int
        """
        arguments = self.actual_arguments
        if len(arguments) < 2:
            return self.print_help()
        text_format = gf.safe_unicode(arguments[0])
        if text_format == u"list":
            text = gf.safe_unicode(arguments[1])
        elif text_format in TextFileFormat.ALLOWED_VALUES:
            text = arguments[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE
        else:
            return self.print_help()

        l1_id_regex = self.has_option_with_value(u"--l1-id-regex")
        l2_id_regex = self.has_option_with_value(u"--l2-id-regex")
        l3_id_regex = self.has_option_with_value(u"--l3-id-regex")
        id_regex = self.has_option_with_value(u"--id-regex")
        id_format = self.has_option_with_value(u"--id-format")
        class_regex = self.has_option_with_value(u"--class-regex")
        sort = self.has_option_with_value(u"--sort")
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
            gc.PPN_TASK_OS_FILE_ID_REGEX: id_format
        }

        # option checks depending on the text format
        missing_level_regex = (l1_id_regex is None) or (l2_id_regex is None) or (l3_id_regex is None)
        if (text_format == TextFileFormat.MUNPARSED) and missing_level_regex:
            self.print_error(u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format")
            return self.ERROR_EXIT_CODE
        missing_unparsed_regex = (id_regex is None) and (class_regex is None)
        if (text_format == TextFileFormat.UNPARSED) and missing_unparsed_regex:
            self.print_error(u"You must specify --id-regex and/or --class-regex for unparsed format")
            return self.ERROR_EXIT_CODE
        if (text_format in [TextFileFormat.PLAIN, TextFileFormat.SUBTITLES]) and (id_format is not None):
            # the id format must be usable to format an integer
            try:
                formatted = id_format % 1
            except (TypeError, ValueError):
                self.print_error(u"The given string '%s' is not a valid id format" % id_format)
                return self.ERROR_EXIT_CODE

        # build the text file and print it
        exit_code = self.ERROR_EXIT_CODE
        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(u"Unable to build a TextFile from the given parameters")
        elif len(text_file) == 0:
            self.print_error(u"No text fragments found")
        else:
            self.print_generic(text_file.__unicode__())
            exit_code = self.NO_ERROR_EXIT_CODE
        return exit_code
Пример #6
0
def main():
    """
    Run ``aeneas.cew``, reading input text from file and writing audio and interval data to file.

    The command line arguments are, in order:
    QUIT_AFTER BACKWARDS TEXT_FILE_PATH AUDIO_FILE_PATH DATA_FILE_PATH.
    Return ``1`` if fewer than five arguments are given.
    An error raised while synthesizing or writing the data file
    is printed, not propagated.
    """
    # make sure we have enough parameters
    if len(sys.argv) < 6:
        print("You must pass five arguments: QUIT_AFTER BACKWARDS TEXT_FILE_PATH AUDIO_FILE_PATH DATA_FILE_PATH")
        return 1

    # read parameters
    quit_after_arg, backwards_arg, text_file_path, audio_file_path, data_file_path = sys.argv[1:6]
    c_quit_after = float(quit_after_arg)  # NOTE: cew needs float, not TimeValue
    c_backwards = int(backwards_arg)

    # read (voice_code, text) from file, one pair per line,
    # separated by the first space of the line
    s_text = []
    with io.open(text_file_path, "r", encoding="utf-8") as text:
        for raw_line in text:
            # NOTE: not using strip() to avoid removing trailing blank characters
            line = raw_line.replace(u"\n", u"").replace(u"\r", u"")
            f_voice_code, separator, f_text = line.partition(" ")
            # skip lines without a space, or starting with a space
            if separator and f_voice_code:
                s_text.append((f_voice_code, f_text))

    # convert to bytes/unicode as required by subprocess:
    # byte strings on Python 2, Unicode strings otherwise
    convert = gf.safe_bytes if gf.PY2 else gf.safe_unicode
    c_text = [(convert(f_voice_code), convert(f_text)) for f_voice_code, f_text in s_text]

    try:
        import aeneas.cew.cew
        sr, sf, intervals = aeneas.cew.cew.synthesize_multiple(
            audio_file_path,
            c_quit_after,
            c_backwards,
            c_text
        )
        interval_lines = [u"%.3f %.3f" % (i[0], i[1]) for i in intervals]
        with io.open(data_file_path, "w", encoding="utf-8") as data:
            data.write(u"%d\n" % (sr))
            data.write(u"%d\n" % (sf))
            data.write(u"\n".join(interval_lines))
    except Exception as exc:
        print(u"Unexpected error: %s" % str(exc))
Пример #7
0
def main():
    """
    Run ``aeneas.cew``, reading input text from file and writing audio and interval data to file.

    Expected command line arguments:
    QUIT_AFTER BACKWARDS TEXT_FILE_PATH AUDIO_FILE_PATH DATA_FILE_PATH.
    ``1`` is returned when fewer than five arguments are passed.
    Errors raised by the synthesis or by writing the data file
    are printed instead of being propagated.
    """
    arguments = sys.argv

    # make sure we have enough parameters
    if len(arguments) < 6:
        print(
            "You must pass five arguments: QUIT_AFTER BACKWARDS TEXT_FILE_PATH AUDIO_FILE_PATH DATA_FILE_PATH"
        )
        return 1

    # read parameters
    c_quit_after = float(arguments[1])  # NOTE: cew needs float, not TimeValue
    c_backwards = int(arguments[2])
    text_file_path = arguments[3]
    audio_file_path = arguments[4]
    data_file_path = arguments[5]

    # read (voice_code, text) from file
    s_text = []
    with io.open(text_file_path, "r", encoding="utf-8") as text:
        for line in text.readlines():
            # NOTE: not using strip() to avoid removing trailing blank characters
            line = line.replace(u"\n", u"").replace(u"\r", u"")
            idx = line.find(" ")
            if idx <= 0:
                # no space in the line, or the line starts with a space
                continue
            s_text.append((line[:idx], line[idx + 1:]))

    # convert to bytes/unicode as required by subprocess
    if gf.PY2:
        # Python 2 => byte strings
        c_text = [(gf.safe_bytes(code), gf.safe_bytes(body)) for code, body in s_text]
    else:
        # Python 3 => Unicode strings
        c_text = [(gf.safe_unicode(code), gf.safe_unicode(body)) for code, body in s_text]

    try:
        import aeneas.cew.cew
        sr, sf, intervals = aeneas.cew.cew.synthesize_multiple(
            audio_file_path, c_quit_after, c_backwards, c_text)
        with io.open(data_file_path, "w", encoding="utf-8") as data:
            data.write(u"%d\n" % (sr))
            data.write(u"%d\n" % (sf))
            formatted = []
            for interval in intervals:
                formatted.append(u"%.3f %.3f" % (interval[0], interval[1]))
            data.write(u"\n".join(formatted))
    except Exception as exc:
        print(u"Unexpected error: %s" % str(exc))
Пример #8
0
 def test_safe_unicode(self):
     """ ``gf.safe_unicode`` returns ``None`` for ``None`` and the expected Unicode string otherwise. """
     self.assertIsNone(gf.safe_unicode(None))
     # (input, expected) pairs: plain string literals first, Unicode literals then
     cases = (
         ("", u""),
         ("foo", u"foo"),
         ("foà", u"foà"),
         (u"", u""),
         (u"foo", u"foo"),
         (u"foà", u"foà"),
     )
     for given, expected in cases:
         self.assertEqual(gf.safe_unicode(given), expected)
Пример #9
0
 def test_safe_unicode(self):
     """ Check ``gf.safe_unicode`` on ``None``, plain string literals, and Unicode literals. """
     expected_values = [u"", u"foo", u"foà"]
     plain_inputs = ["", "foo", "foà"]
     unicode_inputs = [u"", u"foo", u"foà"]
     self.assertIsNone(gf.safe_unicode(None))
     # each input must be converted to the expected value at the same position
     for inputs in (plain_inputs, unicode_inputs):
         for given, expected in zip(inputs, expected_values):
             self.assertEqual(gf.safe_unicode(given), expected)
Пример #10
0
 def _get_lines_from_node_text(cls, node):
     """
     Given an ``lxml`` node, get lines from ``node.text``,
     where the line separator is ``<br xmlns=... />``.

     The candidate parts are, in document order: the node text,
     then the serialization (without tail) and the tail of each child,
     then the node tail. Parts that are absent (``None``), that start
     with ``<br ``, or that are empty after stripping are discarded.

     :param node: the ``lxml`` node
     :rtype: list of Unicode strings
     """
     # TODO more robust parsing
     from lxml import etree
     parts = [node.text]
     # iterating the node yields its children
     # (getchildren() is deprecated in lxml)
     for child in node:
         parts.append(etree.tostring(child, with_tail=False))
         parts.append(child.tail)
     parts.append(node.tail)
     lines = []
     for part in parts:
         part = gf.safe_unicode(part)
         # text and tails are None when absent from the document:
         # skip them instead of failing on None.startswith()
         if part is None:
             continue
         if part.startswith(u"<br "):
             continue
         part = part.strip()
         if len(part) > 0:
             lines.append(part)
     return lines
Пример #11
0
    def parse(self, input_text, syncmap):
        """
        Read from SMIL file.

        Limitations:
        1. parses only ``<par>`` elements, in order
        2. timings must have ``hh:mm:ss.mmm`` or ``ss.mmm`` format (autodetected)
        3. both ``clipBegin`` and ``clipEnd`` attributes of ``<audio>`` must be populated
        4. a ``<par>`` without a ``<text>`` child, without an ``<audio>`` child,
           or with a missing clip attribute is skipped

        :param input_text: the SMIL document to be parsed
        :param syncmap: the sync map to which the fragments are added
        """
        from lxml import etree

        # sentinel marking a value not found in the current <par>
        missing = object()

        def clip_time(value):
            """ Parse a clip attribute value, or return the sentinel if it is absent """
            if value is None:
                return missing
            if ":" in value:
                return gf.time_from_hhmmssmmm(value)
            return gf.time_from_ssmmm(value)

        smil_ns = "{http://www.w3.org/ns/SMIL}"
        root = etree.fromstring(gf.safe_bytes(input_text))
        for par in root.iter(smil_ns + "par"):
            # reset for every <par>, so that the values of a previous <par>
            # are never reused for an incomplete one
            identifier = begin = end = missing
            for child in par:
                if child.tag == (smil_ns + "text"):
                    identifier = gf.safe_unicode(gf.split_url(child.get("src"))[1])
                elif child.tag == (smil_ns + "audio"):
                    begin = clip_time(child.get("clipBegin"))
                    end = clip_time(child.get("clipEnd"))
            if (identifier is missing) or (begin is missing) or (end is missing):
                # incomplete <par>: no fragment can be built from it
                continue
            # TODO read text from additional text_file?
            self._add_fragment(
                syncmap=syncmap,
                identifier=identifier,
                lines=[u""],
                begin=begin,
                end=end
            )
Пример #12
0
    def check_raw_string(self, string, is_bstring=True):
        """
        Check whether the given string
        is properly UTF-8 encoded (if ``is_bstring`` is ``True``),
        it is not empty, and
        it does not contain reserved characters.

        The checks are run in this order, stopping at the first one
        that does not pass. No check is run if the safety checks
        are disabled for ``check_raw_string``.

        :param string string: the byte string or Unicode string to be checked
        :param bool is_bstring: if True, string is a byte string
        :rtype: :class:`~aeneas.validator.ValidatorResult`
        """
        self.log(u"Checking the given byte string")
        self.result = ValidatorResult()
        if self._are_safety_checks_disabled(u"check_raw_string"):
            return self.result
        if is_bstring:
            # a byte string must be valid UTF-8 before it can be converted
            self._check_utf8_encoding(string)
            if not self.result.passed:
                return self.result
            string = gf.safe_unicode(string)
        self._check_not_empty(string)
        if self.result.passed:
            self._check_reserved_characters(string)
        return self.result
Пример #13
0
    def json_string(self):
        """
        Return a JSON representation of the sync map.

        The result is a JSON object with a single ``fragments`` key,
        listing the non-empty children of the fragments tree;
        each fragment lists its own non-empty children under ``children``.

        :rtype: string

        .. versionadded:: 1.3.1
        """
        def serialize_children(node):
            """ Return the list of dicts for the non-empty children of node, recursively """
            return [
                {
                    "id": child.value.text_fragment.identifier,
                    "language": child.value.text_fragment.language,
                    "lines": child.value.text_fragment.lines,
                    "begin": gf.time_to_ssmmm(child.value.begin),
                    "end": gf.time_to_ssmmm(child.value.end),
                    "children": serialize_children(child)
                }
                for child in node.children_not_empty
            ]
        document = {"fragments": serialize_children(self.fragments_tree)}
        serialized = json.dumps(document, indent=1, sort_keys=True)
        return gf.safe_unicode(serialized)
Пример #14
0
    def check_config_txt(self, contents, is_config_string=False):
        """
        Check whether the given TXT config file contents
        (if ``is_config_string`` is ``False``) or
        TXT config string (if ``is_config_string`` is ``True``)
        is well-formed and it has all the required parameters.

        Byte strings are first checked with ``check_raw_string``
        and converted to Unicode. No check is run if the safety checks
        are disabled for ``check_config_txt``.

        :param string contents: the TXT config file contents or TXT config string
        :param bool is_config_string: if ``True``, contents is a config string
        :rtype: :class:`~aeneas.validator.ValidatorResult`
        """
        self.log(u"Checking contents TXT config file")
        self.result = ValidatorResult()
        if self._are_safety_checks_disabled(u"check_config_txt"):
            return self.result
        if gf.is_bytes(contents):
            self.log(u"Checking that contents is well formed")
            self.check_raw_string(contents, is_bstring=True)
            if not self.result.passed:
                return self.result
            contents = gf.safe_unicode(contents)
        if is_config_string:
            config_string = contents
        else:
            self.log(u"Converting file contents to config string")
            config_string = gf.config_txt_to_string(contents)
        self.log(u"Checking required parameters")
        parsed_parameters = gf.config_string_to_dict(config_string, self.result)
        self._check_required_parameters(self.TXT_REQUIRED_PARAMETERS, parsed_parameters)
        self.log([u"Checking contents: returning %s", self.result.passed])
        return self.result
Пример #15
0
    def json_string(self):
        """
        Return a JSON representation of the sync map.

        The top-level object has a ``fragments`` key holding the
        non-empty children of the fragments tree, each one carrying
        its own non-empty children in its ``children`` list.

        :rtype: string

        .. versionadded:: 1.3.1
        """
        def fragment_to_dict(tree_node):
            """ Return the dict for the fragment stored in tree_node, including its descendants """
            fragment = tree_node.value
            text = fragment.text_fragment
            entry = {}
            entry["id"] = text.identifier
            entry["language"] = text.language
            entry["lines"] = text.lines
            entry["begin"] = gf.time_to_ssmmm(fragment.begin)
            entry["end"] = gf.time_to_ssmmm(fragment.end)
            entry["children"] = [
                fragment_to_dict(child) for child in tree_node.children_not_empty
            ]
            return entry

        top_level = [
            fragment_to_dict(child) for child in self.fragments_tree.children_not_empty
        ]
        return gf.safe_unicode(
            json.dumps({"fragments": top_level}, indent=1, sort_keys=True)
        )
Пример #16
0
 def _get_lines_from_node_text(cls, node):
     """
     Given an ``lxml`` node, get lines from ``node.text``,
     where the line separator is ``<br xmlns=... />``.

     The node text, the serialization (without tail) and the tail
     of each child, and the node tail are considered, in this order.
     Absent (``None``) parts, parts starting with ``<br ``,
     and parts that are empty after stripping are dropped.

     :param node: the ``lxml`` node
     :rtype: list of Unicode strings
     """
     # TODO more robust parsing
     from lxml import etree
     # NOTE: list(node) lists the children, replacing the deprecated getchildren()
     child_parts = chain.from_iterable(
         (etree.tostring(child, with_tail=False), child.tail)
         for child in list(node)
     )
     parts = [node.text] + list(child_parts) + [node.tail]
     # text and tails are None when absent: drop them before calling str methods
     converted = [gf.safe_unicode(part) for part in parts]
     present = [part for part in converted if part is not None]
     stripped = [part.strip() for part in present if not part.startswith(u"<br ")]
     return [part for part in stripped if len(part) > 0]
Пример #17
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from an XML document and add them to ``syncmap``.

     Each child of the root element is a fragment, whose ``id``,
     ``begin`` and ``end`` attributes give its identifier and interval,
     and whose ``line`` children give its lines.

     :param input_text: the XML document to be parsed
     :param syncmap: the sync map to which the fragments are added
     """
     from lxml import etree
     document = etree.fromstring(gf.safe_bytes(input_text))
     for element in document:
         identifier = gf.safe_unicode(element.get("id"))
         begin = gf.time_from_ssmmm(element.get("begin"))
         end = gf.time_from_ssmmm(element.get("end"))
         # only the <line> children contribute text
         lines = [
             gf.safe_unicode(child.text)
             for child in element
             if child.tag == "line"
         ]
         self._add_fragment(
             syncmap=syncmap,
             identifier=identifier,
             lines=lines,
             begin=begin,
             end=end
         )
Пример #18
0
    def _synthesize_single_c_extension(self, text, voice_code, output_file_path):
        """
        Synthesize a single text fragment, using the cew extension.

        If enabled in the runtime configuration, ``aeneas.cewsubprocess``
        is tried first; if it is disabled, fails, or returns ``None``,
        ``aeneas.cew`` is called directly.

        Return the duration of the synthesized text, in seconds.

        :rtype: (bool, (:class:`~aeneas.timevalue.TimeValue`, ))
        """
        self.log(u"Synthesizing using C extension...")

        duration = None
        use_subprocess = self.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED]
        if use_subprocess:
            self.log(u"Using cewsubprocess to call aeneas.cew")
            try:
                self.log(u"Importing aeneas.cewsubprocess...")
                from aeneas.cewsubprocess import CEWSubprocess
                self.log(u"Importing aeneas.cewsubprocess... done")
                self.log(u"Calling aeneas.cewsubprocess...")
                cewsub = CEWSubprocess(rconf=self.rconf, logger=self.logger)
                duration = cewsub.synthesize_single(output_file_path, voice_code, text)
                self.log(u"Calling aeneas.cewsubprocess... done")
            except Exception as exc:
                # NOTE not critical, try calling aeneas.cew directly
                self.log_exc(u"An unexpected error occurred while running cewsubprocess", exc, False, None)

        if duration is not None:
            # the subprocess produced a result: no direct call is needed
            self.log(u"Synthesizing using C extension... done")
            return (True, (duration, ))

        self.log(u"Preparing c_text...")
        # Python 2 => pass byte strings, Python 3 => pass Unicode strings
        c_text = gf.safe_bytes(text) if gf.PY2 else gf.safe_unicode(text)
        self.log(u"Preparing c_text... done")

        self.log(u"Calling aeneas.cew directly")
        try:
            self.log(u"Importing aeneas.cew...")
            import aeneas.cew.cew
            self.log(u"Importing aeneas.cew... done")
            self.log(u"Calling aeneas.cew...")
            synthesized = aeneas.cew.cew.synthesize_single(
                output_file_path,
                voice_code,
                c_text
            )
            # the returned triple is (sr, begin, end): only end is used
            sr, begin, raw_end = synthesized
            duration = TimeValue(raw_end)
            self.log(u"Calling aeneas.cew... done")
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running cew", exc, False, None)
            return (False, None)

        self.log(u"Synthesizing using C extension... done")
        return (True, (duration, ))
Пример #19
0
 def _tree_to_string(cls, root_element, xml_declaration=True, pretty_print=True):
     """
     Serialize an ``lxml`` tree as UTF-8 XML and return it as a Unicode string.

     :param root_element: the root element of the tree
     :param bool xml_declaration: passed to ``etree.tostring``
     :param bool pretty_print: passed to ``etree.tostring``
     """
     from lxml import etree
     serialized = etree.tostring(
         root_element,
         encoding="UTF-8",
         method="xml",
         xml_declaration=xml_declaration,
         pretty_print=pretty_print
     )
     return gf.safe_unicode(serialized)
Пример #20
0
 def _tree_to_string(cls, root_element, xml_declaration=True, pretty_print=True):
     """
     Return the given ``lxml`` tree, serialized as UTF-8 XML,
     as a Unicode string.

     :param root_element: the root element of the tree
     :param bool xml_declaration: forwarded to ``etree.tostring``
     :param bool pretty_print: forwarded to ``etree.tostring``
     """
     from lxml import etree
     options = {
         "encoding": "UTF-8",
         "method": "xml",
         "xml_declaration": xml_declaration,
         "pretty_print": pretty_print,
     }
     return gf.safe_unicode(etree.tostring(root_element, **options))
Пример #21
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from an XML document and add them to ``syncmap``.

     Each child of the root element is a fragment, described by its
     ``identifier``, ``start`` and ``end`` children.
     A fragment lacking any of these three children is skipped.

     :param input_text: the XML document to be parsed
     :param syncmap: the sync map to which the fragments are added
     """
     from lxml import etree
     # sentinel marking a value not found in the current fragment
     missing = object()
     root = etree.fromstring(gf.safe_bytes(input_text))
     for frag in root:
         # reset for every fragment, so that the values of a previous
         # fragment are never reused for an incomplete one
         identifier = begin = end = missing
         for child in frag:
             if child.tag == "identifier":
                 identifier = gf.safe_unicode(child.text)
             elif child.tag == "start":
                 begin = gf.time_from_ssmmm(child.text)
             elif child.tag == "end":
                 end = gf.time_from_ssmmm(child.text)
         if (identifier is missing) or (begin is missing) or (end is missing):
             # incomplete fragment: nothing to add
             continue
         # TODO read text from additional text_file?
         self._add_fragment(syncmap=syncmap,
                            identifier=identifier,
                            lines=[u""],
                            begin=begin,
                            end=end)
Пример #22
0
 def parse(self, input_text, syncmap):
     """
     Read fragments from a TTML document and add them to ``syncmap``.

     The language is read from the ``xml:lang`` attribute of the root
     element and applied to every fragment; each TTML ``p`` element
     becomes one fragment.

     :param input_text: the TTML document to be parsed
     :param syncmap: the sync map to which the fragments are added
     """
     from lxml import etree
     ttml_ns = "{http://www.w3.org/ns/ttml}"
     xml_ns = "{http://www.w3.org/XML/1998/namespace}"
     document = etree.fromstring(gf.safe_bytes(input_text))
     language = document.get(xml_ns + "lang")
     paragraph_tag = ttml_ns + "p"
     id_attribute = xml_ns + "id"
     for paragraph in document.iter(paragraph_tag):
         identifier = gf.safe_unicode(paragraph.get(id_attribute))
         begin = gf.time_from_ttml(paragraph.get("begin"))
         end = gf.time_from_ttml(paragraph.get("end"))
         self._add_fragment(
             syncmap=syncmap,
             identifier=identifier,
             language=language,
             lines=self._get_lines_from_node_text(paragraph),
             begin=begin,
             end=end
         )
Пример #23
0
 def format(self, syncmap):
     """
     Return a JSON string with the identifiers (``smil_ids``) and
     the identifier/begin/end triples (``smil_data``)
     of the fragments of the given sync map.

     :param syncmap: the sync map to be formatted
     :rtype: Unicode string
     """
     smil_data = []
     for fragment in syncmap.fragments:
         entry = {
             "id": fragment.text_fragment.identifier,
             "begin": gf.time_to_ssmmm(fragment.begin),
             "end": gf.time_to_ssmmm(fragment.end)
         }
         smil_data.append(entry)
     # the identifiers are listed in the same order as the data entries
     smil_ids = [entry["id"] for entry in smil_data]
     payload = {"smil_ids": smil_ids, "smil_data": smil_data}
     return gf.safe_unicode(json.dumps(obj=payload, indent=1, sort_keys=True))
Пример #24
0
 def parse(self, input_text, syncmap):
     """
     Parse an XML document whose root children are fragments,
     each described by ``identifier``, ``start`` and ``end`` children,
     and add them to ``syncmap``.

     Fragments lacking any of these three children are skipped.

     :param input_text: the XML document to be parsed
     :param syncmap: the sync map to which the fragments are added
     """
     from lxml import etree
     root = etree.fromstring(gf.safe_bytes(input_text))
     for frag in root:
         # collect the values found in this fragment only: nothing is
         # carried over from a previous fragment
         found = {}
         for child in frag:
             if child.tag == "identifier":
                 found["identifier"] = gf.safe_unicode(child.text)
             elif child.tag == "start":
                 found["begin"] = gf.time_from_ssmmm(child.text)
             elif child.tag == "end":
                 found["end"] = gf.time_from_ssmmm(child.text)
         if len(found) < 3:
             # incomplete fragment: nothing to add
             continue
         # TODO read text from additional text_file?
         self._add_fragment(
             syncmap=syncmap,
             identifier=found["identifier"],
             lines=[u""],
             begin=found["begin"],
             end=found["end"]
         )
Пример #25
0
    def check_configuration_string(self, config_string, is_job=True, external_name=False):
        """
        Check whether the given job or task configuration string
        is well-formed (if it is a byte string)
        and it has all the required parameters.

        No check is run if the safety checks are disabled
        for ``check_configuration_string``.

        :param string config_string: the byte string or Unicode string to be checked
        :param bool is_job: if ``True``, ``config_string`` is a job config string
        :param bool external_name: if ``True``, the task name is provided externally,
                                   and it is not required to appear
                                   in the config string
        :rtype: :class:`~aeneas.validator.ValidatorResult`
        """
        self.log(u"Checking job configuration string" if is_job else u"Checking task configuration string")
        self.result = ValidatorResult()
        if self._are_safety_checks_disabled(u"check_configuration_string"):
            return self.result

        # select the set of required parameters
        if is_job:
            required_parameters = self.JOB_REQUIRED_PARAMETERS
        else:
            if external_name:
                required_parameters = self.TASK_REQUIRED_PARAMETERS_EXTERNAL_NAME
            else:
                required_parameters = self.TASK_REQUIRED_PARAMETERS

        # a byte string is checked and converted to Unicode first
        if gf.is_bytes(config_string):
            self.log(u"Checking that config_string is well formed")
            self.check_raw_string(config_string, is_bstring=True)
            if not self.result.passed:
                return self.result
            config_string = gf.safe_unicode(config_string)

        self.log(u"Checking required parameters")
        parsed_parameters = gf.config_string_to_dict(config_string, self.result)
        self._check_required_parameters(required_parameters, parsed_parameters)
        self.log([u"Checking config_string: returning %s", self.result.passed])
        return self.result
Пример #26
0
 def parse(self, input_text, syncmap):
     """
     Parse a TTML document, adding one fragment to ``syncmap``
     for each TTML ``p`` element.

     Every fragment gets the language given by the ``xml:lang``
     attribute of the root element.

     :param input_text: the TTML document to be parsed
     :param syncmap: the sync map to which the fragments are added
     """
     from lxml import etree
     namespaces = {
         "ttml": "{http://www.w3.org/ns/ttml}",
         "xml": "{http://www.w3.org/XML/1998/namespace}",
     }
     root = etree.fromstring(gf.safe_bytes(input_text))
     language = root.get(namespaces["xml"] + "lang")
     for elem in root.iter(namespaces["ttml"] + "p"):
         fragment_arguments = {
             "syncmap": syncmap,
             "language": language,
         }
         fragment_arguments["identifier"] = gf.safe_unicode(elem.get(namespaces["xml"] + "id"))
         fragment_arguments["begin"] = gf.time_from_ttml(elem.get("begin"))
         fragment_arguments["end"] = gf.time_from_ttml(elem.get("end"))
         fragment_arguments["lines"] = self._get_lines_from_node_text(elem)
         self._add_fragment(**fragment_arguments)
Пример #27
0
 def format(self, syncmap):
     """
     Serialize the fragments of ``syncmap`` as a JSON string.

     The JSON object has two keys: ``smil_ids``, the list of the
     fragment identifiers, and ``smil_data``, a list with one object
     per fragment holding its ``id`` and its ``begin`` and ``end``
     values formatted with ``gf.time_to_ssmmm``.
     The output is indented by one space and has sorted keys.

     :param syncmap: the object whose ``fragments`` are serialized
     :returns: the JSON text, passed through ``gf.safe_unicode``
     """
     entries = []
     for fragment in syncmap.fragments:
         identifier = fragment.text_fragment.identifier
         entry = {
             "id": identifier,
             "begin": gf.time_to_ssmmm(fragment.begin),
             "end": gf.time_to_ssmmm(fragment.end)
         }
         entries.append(entry)
     payload = {
         "smil_ids": [entry["id"] for entry in entries],
         "smil_data": entries
     }
     serialized = json.dumps(obj=payload, indent=1, sort_keys=True)
     return gf.safe_unicode(serialized)
Пример #28
0
 def format(self, syncmap):
     """
     Serialize the fragments of ``syncmap`` as a TextGrid document,
     using the ``tgt`` module.

     All the fragments are stored as intervals of a single tier
     named ``Token``; a fragment with empty text is labelled ``SIL``.
     The long TextGrid format is produced when ``self.variant``
     equals ``self.DEFAULT``, the short one otherwise.

     :param syncmap: the object whose ``fragments`` are serialized
     :returns: the TextGrid text, passed through ``gf.safe_unicode``
     """
     try:
         import tgt
     except ImportError as exc:
         self.log_exc(u"Python module tgt is not installed", exc, True, ImportError)
     # from https://github.com/hbuschme/TextGridTools/blob/master/tgt/io.py
     textgrid = tgt.TextGrid()
     token_tier = tgt.IntervalTier(name="Token")
     for fragment in syncmap.fragments:
         start = float(fragment.begin)
         stop = float(fragment.end)
         label = fragment.text_fragment.text
         if label == u"":
             # empty fragments get an explicit placeholder label
             label = u"SIL"
         token_tier.add_interval(tgt.Interval(start, stop, text=label))
     textgrid.add_tier(token_tier)
     if self.variant == self.DEFAULT:
         exporter = tgt.io.export_to_long_textgrid
     else:
         exporter = tgt.io.export_to_short_textgrid
     return gf.safe_unicode(exporter(textgrid))
Пример #29
0
    def _read_munparsed(self, lines):
        """
        Read text fragments from an munparsed format text file.

        The lines are joined and parsed with BeautifulSoup (``lxml`` parser).
        Nodes are selected on three nested levels (l1, l2, l3)
        by matching their ``id`` attribute against the regexes
        stored in ``self.parameters``.
        Every l1 node containing at least one l3 node is added to the tree
        as a paragraph, with one child per l2 node (sentence)
        and one grandchild per l3 node (word).
        The resulting tree is stored in ``self.fragments_tree``.

        :param list lines: the lines of the unparsed text file
        """
        def nodes_at_level(root, level):
            """
            Return the nodes found under ``root`` whose ``id`` attribute
            matches the regex configured for the given level (1, 2, or 3).
            """
            LEVEL_TO_REGEX_MAP = [
                None,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX,
            ]
            attribute_name = "id"
            regex_string = self.parameters[LEVEL_TO_REGEX_MAP[level]]
            # indent the log message according to the nesting level
            indent = u" " * 2 * (level - 1)
            self.log([u"%sRegex for %s: '%s'", indent, attribute_name, regex_string])
            # the configured regex must match a whole word inside the id value
            regex = re.compile(r".*\b" + regex_string + r"\b.*")
            return root.findAll(attrs={attribute_name: regex})
        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from munparsed text format")
        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")
        # extract the nodes according to the id regex of each level
        self.log(u"Finding l1 elements")
        tree = Tree()
        for l1_node in nodes_at_level(soup, 1):
            # becomes True as soon as one l3 node is found under this l1 node
            has_word = False
            try:
                l1_id = gf.safe_unicode(l1_node["id"])
                self.log([u"Found l1 node with id:   '%s'", l1_id])
                paragraph_node = Tree()
                paragraph_text = []
                for l2_node in nodes_at_level(l1_node, 2):
                    l2_id = gf.safe_unicode(l2_node["id"])
                    self.log([u"  Found l2 node with id:   '%s'", l2_id])
                    sentence_node = Tree()
                    paragraph_node.add_child(sentence_node)
                    sentence_text = []
                    for l3_node in nodes_at_level(l2_node, 3):
                        l3_id = gf.safe_unicode(l3_node["id"])
                        l3_text = gf.safe_unicode(l3_node.text)
                        self.log([u"    Found l3 node with id:   '%s'", l3_id])
                        self.log([u"    Found l3 node with text: '%s'", l3_text])
                        word_fragment = TextFragment(
                            identifier=l3_id,
                            lines=[l3_text],
                            filtered_lines=[l3_text]
                        )
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        sentence_text.append(l3_text)
                        has_word = True
                    # the text of a sentence is the space-joined text of its words
                    sentence_text = u" ".join(sentence_text)
                    paragraph_text.append(sentence_text)
                    sentence_node.value = TextFragment(
                        identifier=l2_id,
                        lines=[sentence_text],
                        filtered_lines=[sentence_text]
                    )
                    self.log([u"  Found l2 node with text: '%s'", sentence_text])
                if has_word:
                    # the text of a paragraph is the space-joined text of its sentences
                    paragraph_text = u" ".join(paragraph_text)
                    paragraph_node.value = TextFragment(
                        identifier=l1_id,
                        lines=[paragraph_text],
                        filtered_lines=[paragraph_text]
                    )
                    tree.add_child(paragraph_node)
                    self.log([u"Found l1 node with text: '%s'", paragraph_text])
                else:
                    self.log(u"Found l1 node but it has no words, skipping")
            except KeyError:
                self.log_warn(u"KeyError while parsing a l1 node")
        # store the tree of fragments
        self.log(u"Storing tree")
        self.fragments_tree = tree
Пример #30
0
    def _read_munparsed(self, lines):
        """
        Read text fragments from an munparsed format text file.

        The joined lines are parsed with BeautifulSoup (``lxml`` parser),
        and nodes are looked up on three nested levels (l1, l2, l3),
        each level having its own regex for the ``id`` attribute
        in ``self.parameters``.
        An l1 node is kept only if at least one l3 node is found in it;
        in that case it is added to the tree as a paragraph,
        whose children are its l2 nodes (sentences),
        whose children in turn are their l3 nodes (words).
        The tree is finally stored in ``self.fragments_tree``.

        :param list lines: the lines of the unparsed text file
        """
        from bs4 import BeautifulSoup

        def nodes_at_level(root, level):
            """
            Return the nodes found under ``root`` whose ``id`` attribute
            matches the regex configured for the given level (1, 2, or 3).
            """
            LEVEL_TO_REGEX_MAP = [
                None,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX,
                gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX,
            ]
            attribute_name = "id"
            regex_string = self.parameters[LEVEL_TO_REGEX_MAP[level]]
            # deeper levels are logged with a larger indentation
            indent = u" " * 2 * (level - 1)
            self.log([
                u"%sRegex for %s: '%s'", indent, attribute_name, regex_string
            ])
            # the configured regex must match a whole word inside the id value
            regex = re.compile(r".*\b" + regex_string + r"\b.*")
            return root.findAll(attrs={attribute_name: regex})

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from munparsed text format")
        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")
        # select the nodes of each level according to its id regex
        self.log(u"Finding l1 elements")
        tree = Tree()
        for l1_node in nodes_at_level(soup, 1):
            # set once an l3 node has been found inside this l1 node
            has_word = False
            try:
                l1_id = gf.safe_unicode(l1_node["id"])
                self.log([u"Found l1 node with id:   '%s'", l1_id])
                paragraph_node = Tree()
                paragraph_text = []
                for l2_node in nodes_at_level(l1_node, 2):
                    l2_id = gf.safe_unicode(l2_node["id"])
                    self.log([u"  Found l2 node with id:   '%s'", l2_id])
                    sentence_node = Tree()
                    paragraph_node.add_child(sentence_node)
                    sentence_text = []
                    for l3_node in nodes_at_level(l2_node, 3):
                        l3_id = gf.safe_unicode(l3_node["id"])
                        l3_text = gf.safe_unicode(l3_node.text)
                        self.log([u"    Found l3 node with id:   '%s'", l3_id])
                        self.log(
                            [u"    Found l3 node with text: '%s'", l3_text])
                        word_fragment = TextFragment(identifier=l3_id,
                                                     lines=[l3_text],
                                                     filtered_lines=[l3_text])
                        word_node = Tree(value=word_fragment)
                        sentence_node.add_child(word_node)
                        sentence_text.append(l3_text)
                        has_word = True
                    # a sentence is the space-separated sequence of its words
                    sentence_text = u" ".join(sentence_text)
                    paragraph_text.append(sentence_text)
                    sentence_node.value = TextFragment(
                        identifier=l2_id,
                        lines=[sentence_text],
                        filtered_lines=[sentence_text])
                    self.log(
                        [u"  Found l2 node with text: '%s'", sentence_text])
                if has_word:
                    # a paragraph is the space-separated sequence of its sentences
                    paragraph_text = u" ".join(paragraph_text)
                    paragraph_node.value = TextFragment(
                        identifier=l1_id,
                        lines=[paragraph_text],
                        filtered_lines=[paragraph_text])
                    tree.add_child(paragraph_node)
                    self.log(
                        [u"Found l1 node with text: '%s'", paragraph_text])
                else:
                    self.log(u"Found l1 node but it has no words, skipping")
            except KeyError:
                self.log_warn(u"KeyError while parsing a l1 node")
        # store the tree of fragments
        self.log(u"Storing tree")
        self.fragments_tree = tree
Пример #31
0
    def read_properties(self, audio_file_path):
        """
        Read the properties of an audio file
        and return them as a dictionary.

        Example: ::

            d["index"]=0
            d["codec_name"]=mp3
            d["codec_long_name"]=MP3 (MPEG audio layer 3)
            d["profile"]=unknown
            d["codec_type"]=audio
            d["codec_time_base"]=1/44100
            d["codec_tag_string"]=[0][0][0][0]
            d["codec_tag"]=0x0000
            d["sample_fmt"]=s16p
            d["sample_rate"]=44100
            d["channels"]=1
            d["channel_layout"]=mono
            d["bits_per_sample"]=0
            d["id"]=N/A
            d["r_frame_rate"]=0/0
            d["avg_frame_rate"]=0/0
            d["time_base"]=1/14112000
            d["start_pts"]=0
            d["start_time"]=0.000000
            d["duration_ts"]=1545083190
            d["duration"]=109.487188
            d["bit_rate"]=128000
            d["max_bit_rate"]=N/A
            d["bits_per_raw_sample"]=N/A
            d["nb_frames"]=N/A
            d["nb_read_frames"]=N/A
            d["nb_read_packets"]=N/A
            d["DISPOSITION:default"]=0
            d["DISPOSITION:dub"]=0
            d["DISPOSITION:original"]=0
            d["DISPOSITION:comment"]=0
            d["DISPOSITION:lyrics"]=0
            d["DISPOSITION:karaoke"]=0
            d["DISPOSITION:forced"]=0
            d["DISPOSITION:hearing_impaired"]=0
            d["DISPOSITION:visual_impaired"]=0
            d["DISPOSITION:clean_effects"]=0
            d["DISPOSITION:attached_pic"]=0

        The ``key=value`` lines printed by ``ffprobe`` on stdout
        are collected up to the end of the first stream.
        The duration is converted to a ``TimeValue``;
        if that fails, the stderr output is scanned for a duration instead.
        The channels, codec name, duration, and sample rate keys
        are always present in the returned dictionary.

        :param string audio_file_path: the path of the audio file to analyze
        :rtype: dict
        :raises: TypeError: if ``audio_file_path`` is None
        :raises: OSError: if the file at ``audio_file_path`` cannot be read
        :raises: FFPROBEParsingError: if the call to ``ffprobe`` does not produce any output
        :raises: FFPROBEPathError: if the path to the ``ffprobe`` executable cannot be called
        :raises: FFPROBEUnsupportedFormatError: if the file has a format not supported by ``ffprobe``
        """

        # test if we can read the file at audio_file_path
        if audio_file_path is None:
            self.log_exc(u"The audio file path is None", None, True, TypeError)
        if not gf.file_can_be_read(audio_file_path):
            self.log_exc(u"Input file '%s' cannot be read" % (audio_file_path), None, True, OSError)

        # call ffprobe
        arguments = [self.rconf[RuntimeConfiguration.FFPROBE_PATH]]
        arguments.extend(self.FFPROBE_PARAMETERS)
        arguments.append(audio_file_path)
        self.log([u"Calling with arguments '%s'", arguments])
        try:
            proc = subprocess.Popen(
                arguments,
                stdout=subprocess.PIPE,
                stdin=subprocess.PIPE,
                stderr=subprocess.PIPE
            )
            (stdoutdata, stderrdata) = proc.communicate()
            proc.stdout.close()
            proc.stdin.close()
            proc.stderr.close()
        except OSError as exc:
            self.log_exc(u"Unable to call the '%s' ffprobe executable" % (self.rconf[RuntimeConfiguration.FFPROBE_PATH]), exc, True, FFPROBEPathError)
        self.log(u"Call completed")

        # check there is some output
        # NOTE(review): the emptiness test is on stderr, not on stdout;
        # presumably ffprobe is expected to always write something there -- confirm
        if (stdoutdata is None) or (len(stderrdata) == 0):
            self.log_exc(u"ffprobe produced no output", None, True, FFPROBEParsingError)

        # decode stdoutdata and stderrdata to Unicode string
        try:
            stdoutdata = gf.safe_unicode(stdoutdata)
            stderrdata = gf.safe_unicode(stderrdata)
        except UnicodeDecodeError as exc:
            self.log_exc(u"Unable to decode ffprobe out/err", exc, True, FFPROBEParsingError)

        # dictionary for the results
        results = {
            self.STDOUT_CHANNELS: None,
            self.STDOUT_CODEC_NAME: None,
            self.STDOUT_DURATION: None,
            self.STDOUT_SAMPLE_RATE: None
        }

        # scan the first audio stream the ffprobe stdout output
        # TODO more robust parsing
        # TODO deal with multiple audio streams
        for line in stdoutdata.splitlines():
            if line == self.STDOUT_END_STREAM:
                self.log(u"Reached end of the stream")
                break
            # split each line only once;
            # lines that do not contain exactly one "=" are ignored
            fields = line.split("=")
            if len(fields) == 2:
                key, value = fields
                results[key] = value
                self.log([u"Found property '%s'='%s'", key, value])

        try:
            self.log([u"Duration found in stdout: '%s'", results[self.STDOUT_DURATION]])
            results[self.STDOUT_DURATION] = TimeValue(results[self.STDOUT_DURATION])
            self.log(u"Valid duration")
        except Exception:
            # any conversion error means the stdout duration is unusable;
            # Exception (not a bare except) is caught so that
            # KeyboardInterrupt and SystemExit are not swallowed here
            self.log_warn(u"Invalid duration")
            results[self.STDOUT_DURATION] = None
            # try scanning ffprobe stderr output
            for line in stderrdata.splitlines():
                match = self.STDERR_DURATION_REGEX.search(line)
                if match is not None:
                    self.log([u"Found matching line '%s'", line])
                    results[self.STDOUT_DURATION] = gf.time_from_hhmmssmmm(line)
                    self.log([u"Extracted duration '%.3f'", results[self.STDOUT_DURATION]])
                    break

        if results[self.STDOUT_DURATION] is None:
            self.log_exc(u"No duration found in stdout or stderr. Unsupported audio file format?", None, True, FFPROBEUnsupportedFormatError)

        # return dictionary
        self.log(u"Returning dict")
        return results
Пример #32
0
    def _analyze_txt_config(self, config_string=None):
        """
        Build the job described by a TXT configuration.

        When ``config_string`` is ``None``, the configuration is read
        from the TXT config entry of the container, and the directory
        of that entry is used as the base for the configured prefixes;
        otherwise the given string is used, with an empty base directory.

        One task is added to the job for each text/audio pair found
        among the container entries, following the flat or paged
        hierarchy type declared in the configuration.

        :param string config_string: the configuration string
        :rtype: :class:`~aeneas.job.Job`
        """
        self.log(u"Analyzing container with TXT config string")

        if config_string is not None:
            self.log([
                u"Analyzing container with TXT config string '%s'",
                config_string
            ])
            config_dir = ""
        else:
            self.log(u"Analyzing container with TXT config file")
            config_entry = self.container.entry_config_txt
            self.log([u"Found TXT config entry '%s'", config_entry])
            config_dir = os.path.dirname(config_entry)
            self.log([u"Directory of TXT config entry: '%s'", config_dir])
            self.log([u"Reading TXT config entry: '%s'", config_entry])
            raw_contents = self.container.read_entry(config_entry)
            self.log(u"Converting config contents to config string")
            config_string = gf.config_txt_to_string(
                gf.safe_unicode(raw_contents))

        self.log(u"Creating the Job object")
        job = Job(config_string)

        self.log(u"Getting entries")
        entries = self.container.entries

        self.log(u"Converting config string into config dict")
        parameters = gf.config_string_to_dict(config_string)

        self.log(u"Calculating the path of the tasks root directory")
        tasks_root = gf.norm_join(
            config_dir, parameters[gc.PPN_JOB_IS_HIERARCHY_PREFIX])
        self.log([u"Path of the tasks root directory: '%s'", tasks_root])

        self.log(u"Calculating the path of the sync map root directory")
        sync_map_root = gf.norm_join(
            config_dir, parameters[gc.PPN_JOB_OS_HIERARCHY_PREFIX])
        os_hierarchy_type = parameters[gc.PPN_JOB_OS_HIERARCHY_TYPE]
        self.log([u"Path of the sync map root directory: '%s'", sync_map_root])

        text_relative_path = parameters[gc.PPN_JOB_IS_TEXT_FILE_RELATIVE_PATH]
        self.log([u"Relative path for text file: '%s'", text_relative_path])
        text_regex_source = parameters[gc.PPN_JOB_IS_TEXT_FILE_NAME_REGEX]
        text_regex = re.compile(r"" + text_regex_source)
        self.log([u"Regex for text file: '%s'", text_regex_source])

        audio_relative_path = parameters[gc.PPN_JOB_IS_AUDIO_FILE_RELATIVE_PATH]
        self.log([u"Relative path for audio file: '%s'", audio_relative_path])
        audio_regex_source = parameters[gc.PPN_JOB_IS_AUDIO_FILE_NAME_REGEX]
        audio_regex = re.compile(r"" + audio_regex_source)
        self.log([u"Regex for audio file: '%s'", audio_regex_source])

        def find_text_and_audio(directory):
            """ Return the text files and the audio files found in the given directory """
            found_text = self._find_files(
                entries, directory, text_relative_path, text_regex)
            self.log([u"Found text files: '%s'", found_text])
            found_audio = self._find_files(
                entries, directory, audio_relative_path, audio_regex)
            self.log([u"Found audio files: '%s'", found_audio])
            return found_text, found_audio

        def add_task(task_info):
            """ Create a task from the given task info and add it to the job """
            self.log([u"Creating task: '%s'", str(task_info)])
            job.add_task(self._create_task(
                task_info, config_string, sync_map_root, os_hierarchy_type))

        hierarchy_type = parameters[gc.PPN_JOB_IS_HIERARCHY_TYPE]

        if hierarchy_type == HierarchyType.FLAT:
            self.log(u"Looking for text/audio pairs in flat hierarchy")
            text_files, audio_files = find_text_and_audio(tasks_root)

            self.log(u"Matching files in flat hierarchy...")
            matched_tasks = self._match_files_flat_hierarchy(
                text_files, audio_files)
            self.log(u"Matching files in flat hierarchy... done")

            for task_info in matched_tasks:
                add_task(task_info)

        if hierarchy_type == HierarchyType.PAGED:
            self.log(u"Looking for text/audio pairs in paged hierarchy")
            # find all subdirectories of the tasks root directory
            # that match gc.PPN_JOB_IS_TASK_DIRECTORY_NAME_REGEX
            matched_directories = self._match_directories(
                entries, tasks_root,
                parameters[gc.PPN_JOB_IS_TASK_DIRECTORY_NAME_REGEX])
            for matched_directory in matched_directories:
                # rebuild the full path
                full_path = gf.norm_join(tasks_root, matched_directory)
                self.log([
                    u"Looking for text/audio pairs in directory '%s'",
                    full_path
                ])

                # look for text and audio files there
                text_files, audio_files = find_text_and_audio(full_path)
                text_count = len(text_files)
                audio_count = len(audio_files)

                # a Task is created only if exactly one text file
                # and exactly one audio file have been found
                if (text_count == 1) and (audio_count == 1):
                    self.log([
                        u"Exactly one text file and one audio file in '%s'",
                        matched_directory
                    ])
                    add_task([matched_directory, text_files[0], audio_files[0]])
                elif text_count > 1:
                    self.log([
                        u"More than one text file in '%s'", matched_directory
                    ])
                elif audio_count > 1:
                    self.log([
                        u"More than one audio file in '%s'", matched_directory
                    ])
                else:
                    self.log(
                        [u"No text nor audio file in '%s'", matched_directory])

        return job
Пример #33
0
    def _analyze_txt_config(self, config_string=None):
        """
        Analyze the container according to a TXT configuration
        and return the resulting job.

        If ``config_string`` is ``None``, the configuration string
        is obtained from the TXT config entry inside the container,
        and the directory of that entry is the base directory
        for the configured prefixes; otherwise the base directory is empty.

        Text and audio files are searched among the container entries:
        in a flat hierarchy they are searched in the tasks root directory
        and then matched with each other; in a paged hierarchy
        each matching subdirectory yields a task
        only if it holds exactly one text file and one audio file.

        :param string config_string: the configuration string
        :rtype: :class:`~aeneas.job.Job`
        """
        self.log(u"Analyzing container with TXT config string")

        if config_string is None:
            self.log(u"Analyzing container with TXT config file")
            txt_entry = self.container.entry_config_txt
            self.log([u"Found TXT config entry '%s'", txt_entry])
            config_dir = os.path.dirname(txt_entry)
            self.log([u"Directory of TXT config entry: '%s'", config_dir])
            self.log([u"Reading TXT config entry: '%s'", txt_entry])
            txt_contents = self.container.read_entry(txt_entry)
            self.log(u"Converting config contents to config string")
            txt_contents = gf.safe_unicode(txt_contents)
            config_string = gf.config_txt_to_string(txt_contents)
        else:
            self.log([u"Analyzing container with TXT config string '%s'", config_string])
            config_dir = ""

        self.log(u"Creating the Job object")
        job = Job(config_string)

        self.log(u"Getting entries")
        entries = self.container.entries

        self.log(u"Converting config string into config dict")
        parameters = gf.config_string_to_dict(config_string)

        self.log(u"Calculating the path of the tasks root directory")
        tasks_root_directory = gf.norm_join(config_dir, parameters[gc.PPN_JOB_IS_HIERARCHY_PREFIX])
        self.log([u"Path of the tasks root directory: '%s'", tasks_root_directory])

        self.log(u"Calculating the path of the sync map root directory")
        sync_map_root_directory = gf.norm_join(config_dir, parameters[gc.PPN_JOB_OS_HIERARCHY_PREFIX])
        job_os_hierarchy_type = parameters[gc.PPN_JOB_OS_HIERARCHY_TYPE]
        self.log([u"Path of the sync map root directory: '%s'", sync_map_root_directory])

        # relative paths and file name regexes, first for text, then for audio
        text_path = parameters[gc.PPN_JOB_IS_TEXT_FILE_RELATIVE_PATH]
        self.log([u"Relative path for text file: '%s'", text_path])
        text_regex = re.compile(r"" + parameters[gc.PPN_JOB_IS_TEXT_FILE_NAME_REGEX])
        self.log([u"Regex for text file: '%s'", parameters[gc.PPN_JOB_IS_TEXT_FILE_NAME_REGEX]])
        audio_path = parameters[gc.PPN_JOB_IS_AUDIO_FILE_RELATIVE_PATH]
        self.log([u"Relative path for audio file: '%s'", audio_path])
        audio_regex = re.compile(r"" + parameters[gc.PPN_JOB_IS_AUDIO_FILE_NAME_REGEX])
        self.log([u"Regex for audio file: '%s'", parameters[gc.PPN_JOB_IS_AUDIO_FILE_NAME_REGEX]])

        if parameters[gc.PPN_JOB_IS_HIERARCHY_TYPE] == HierarchyType.FLAT:
            self.log(u"Looking for text/audio pairs in flat hierarchy")
            text_files = self._find_files(entries, tasks_root_directory, text_path, text_regex)
            self.log([u"Found text files: '%s'", text_files])
            audio_files = self._find_files(entries, tasks_root_directory, audio_path, audio_regex)
            self.log([u"Found audio files: '%s'", audio_files])

            self.log(u"Matching files in flat hierarchy...")
            matched_tasks = self._match_files_flat_hierarchy(text_files, audio_files)
            self.log(u"Matching files in flat hierarchy... done")

            for task_info in matched_tasks:
                self.log([u"Creating task: '%s'", str(task_info)])
                job.add_task(self._create_task(
                    task_info,
                    config_string,
                    sync_map_root_directory,
                    job_os_hierarchy_type
                ))

        if parameters[gc.PPN_JOB_IS_HIERARCHY_TYPE] == HierarchyType.PAGED:
            self.log(u"Looking for text/audio pairs in paged hierarchy")
            # find all subdirectories of tasks_root_directory
            # that match gc.PPN_JOB_IS_TASK_DIRECTORY_NAME_REGEX
            for directory_name in self._match_directories(
                    entries,
                    tasks_root_directory,
                    parameters[gc.PPN_JOB_IS_TASK_DIRECTORY_NAME_REGEX]
            ):
                # rebuild the full path
                directory_path = gf.norm_join(tasks_root_directory, directory_name)
                self.log([u"Looking for text/audio pairs in directory '%s'", directory_path])

                # look for text and audio files there
                text_files = self._find_files(entries, directory_path, text_path, text_regex)
                self.log([u"Found text files: '%s'", text_files])
                audio_files = self._find_files(entries, directory_path, audio_path, audio_regex)
                self.log([u"Found audio files: '%s'", audio_files])

                text_count = len(text_files)
                audio_count = len(audio_files)

                # anything but exactly one text file and one audio file
                # is only logged, and the directory is skipped
                if (text_count != 1) or (audio_count != 1):
                    if text_count > 1:
                        self.log([u"More than one text file in '%s'", directory_name])
                    elif audio_count > 1:
                        self.log([u"More than one audio file in '%s'", directory_name])
                    else:
                        self.log([u"No text nor audio file in '%s'", directory_name])
                    continue

                # exactly one text file and one audio file: create a Task
                self.log([u"Exactly one text file and one audio file in '%s'", directory_name])
                task_info = [directory_name, text_files[0], audio_files[0]]
                self.log([u"Creating task: '%s'", str(task_info)])
                job.add_task(self._create_task(
                    task_info,
                    config_string,
                    sync_map_root_directory,
                    job_os_hierarchy_type
                ))

        return job
Пример #34
0
    def run(self, arguments, show_help=True):
        """
        Program entry point.

        Please note that the first item in ``arguments`` is discarded,
        as it is assumed to be the script/invocation name;
        pass a "dumb" placeholder if you call this method with
        an argument different that ``sys.argv``.

        :param arguments: the list of arguments
        :type  arguments: list
        :param show_help: if ``False``, do not show help on ``-h`` and ``--help``
        :type  show_help: bool
        :rtype: int
        """
        # convert arguments into Unicode strings
        if self.use_sys:
            # check that sys.stdin.encoding and sys.stdout.encoding are set to utf-8
            if not gf.FROZEN:
                if sys.stdin.encoding not in ["UTF-8", "UTF8"]:
                    self.print_warning(u"The default input encoding is not UTF-8.")
                    self.print_warning(u"You might want to set 'PYTHONIOENCODING=UTF-8' in your shell.")
                if sys.stdout.encoding not in ["UTF-8", "UTF8"]:
                    self.print_warning(u"The default output encoding is not UTF-8.")
                    self.print_warning(u"You might want to set 'PYTHONIOENCODING=UTF-8' in your shell.")
            # decode using sys.stdin.encoding
            args = [gf.safe_unicode_stdin(arg) for arg in arguments]
        else:
            # decode using utf-8 (but you should pass Unicode strings as parameters anyway)
            args = [gf.safe_unicode(arg) for arg in arguments]

        if show_help:
            if u"-h" in args:
                return self.print_help(short=True)

            if u"--help" in args:
                return self.print_help(short=False)

            if u"--version" in args:
                return self.print_name_version()

        # store formal arguments
        self.formal_arguments_raw = arguments
        self.formal_arguments = args

        # to obtain the actual arguments,
        # remove the first one and "special" switches
        args = args[1:]
        set_args = set(args)

        # set verbosity, if requested
        for flag in set([u"-v", u"--verbose"]) & set_args:
            self.verbose = True
            args.remove(flag)
        for flag in set([u"-vv", u"--very-verbose"]) & set_args:
            self.verbose = True
            self.very_verbose = True
            args.remove(flag)

        # set RuntimeConfiguration string, if specified
        for flag in [u"-r", u"--runtime-configuration"]:
            rconf_string = self.has_option_with_value(flag, actual_arguments=False)
            if rconf_string is not None:
                self.rconf = RuntimeConfiguration(rconf_string)
                args.remove("%s=%s" % (flag, rconf_string))

        # set log file path, if requested
        log_path = None
        for flag in [u"-l", u"--log"]:
            log_path = self.has_option_with_value(flag, actual_arguments=False)
            if log_path is not None:
                args.remove("%s=%s" % (flag, log_path))
            elif flag in set_args:
                handler, log_path = gf.tmp_file(suffix=u".log", root=self.rconf[RuntimeConfiguration.TMP_PATH])
                args.remove(flag)
            if log_path is not None:
                self.log_file_path = log_path

        # if no actual arguments left, print help
        if (len(args) < 1) and (show_help):
            return self.print_help(short=True)

        # store actual arguments
        self.actual_arguments = args

        # create logger
        self.logger = Logger(tee=self.verbose, tee_show_datetime=self.very_verbose)
        self.log([u"Formal arguments: %s", self.formal_arguments])
        self.log([u"Actual arguments: %s", self.actual_arguments])
        self.log([u"Runtime configuration: '%s'", self.rconf.config_string()])

        # perform command
        exit_code = self.perform_command()
        self.log([u"Execution completed with code %d", exit_code])

        # output log if requested
        if self.log_file_path is not None:
            self.log([u"User requested saving log to file '%s'", self.log_file_path])
            self.logger.write(self.log_file_path)
            if self.use_sys:
                self.print_info(u"Log written to file '%s'" % self.log_file_path)

        return self.exit(exit_code)
Пример #35
0
    def perform_command(self):
        """
        Synthesize the given text and return the appropriate exit code.

        Four positional arguments are required, in this order:
        the text format (``list`` or one of ``TextFileFormat.ALLOWED_VALUES``),
        the text (for ``list``) or the path of the input text file,
        the language, and the path of the output file.
        With fewer arguments, or an unknown text format, the help is printed.

        :rtype: int
        """
        if len(self.actual_arguments) < 4:
            return self.print_help()

        # first positional argument: the text format
        text_format = gf.safe_unicode(self.actual_arguments[0])
        is_list = (text_format == u"list")
        if (not is_list) and (text_format not in TextFileFormat.ALLOWED_VALUES):
            return self.print_help()

        # second positional argument: the text itself or an input file path
        if is_list:
            text = gf.safe_unicode(self.actual_arguments[1])
        else:
            text = self.actual_arguments[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE

        # optional switches
        level1_regex = self.has_option_with_value(u"--l1-id-regex")
        level2_regex = self.has_option_with_value(u"--l2-id-regex")
        level3_regex = self.has_option_with_value(u"--l3-id-regex")
        id_regex = self.has_option_with_value(u"--id-regex")
        class_regex = self.has_option_with_value(u"--class-regex")
        sort = self.has_option_with_value(u"--sort")
        backwards = self.has_option([u"-b", u"--backwards"])
        quit_after = gf.safe_float(self.has_option_with_value(u"--quit-after"), None)
        first_index = gf.safe_int(self.has_option_with_value(u"--start"), None)
        last_index = gf.safe_int(self.has_option_with_value(u"--end"), None)

        # parameters forwarded to get_text_file()
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: level1_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: level2_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: level3_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
        }

        # munparsed needs all three level regexes
        if text_format == TextFileFormat.MUNPARSED:
            if any(regex is None for regex in (level1_regex, level2_regex, level3_regex)):
                self.print_error(u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format")
                return self.ERROR_EXIT_CODE

        # unparsed needs at least one of the id/class regexes
        if text_format == TextFileFormat.UNPARSED:
            if (id_regex is None) and (class_regex is None):
                self.print_error(u"You must specify --id-regex and/or --class-regex for unparsed format")
                return self.ERROR_EXIT_CODE

        # third and fourth positional arguments
        language = gf.safe_unicode(self.actual_arguments[2])
        output_file_path = self.actual_arguments[3]
        if not self.check_output_file(output_file_path):
            return self.ERROR_EXIT_CODE

        # build the text file and select the requested slice
        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(u"Unable to build a TextFile from the given parameters")
            return self.ERROR_EXIT_CODE
        if len(text_file) == 0:
            self.print_error(u"No text fragments found")
            return self.ERROR_EXIT_CODE
        text_file.set_language(language)
        self.print_info(u"Read input text with %d fragments" % (len(text_file)))
        if first_index is not None:
            self.print_info(u"Slicing from index %d" % (first_index))
        if last_index is not None:
            self.print_info(u"Slicing to index %d" % (last_index))
        text_slice = text_file.get_slice(first_index, last_index)
        self.print_info(u"Synthesizing %d fragments" % (len(text_slice)))

        if quit_after is not None:
            self.print_info(u"Stop synthesizing upon reaching %.3f seconds" % (quit_after))

        try:
            synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
            synthesizer.synthesize(
                text_slice,
                output_file_path,
                quit_after=quit_after,
                backwards=backwards
            )
            self.print_success(u"Created file '%s'" % output_file_path)
            synthesizer.clear_cache()
            return self.NO_ERROR_EXIT_CODE
        except ImportError as exc:
            # give an installation hint for the TTS wrappers
            # that depend on an optional Python module
            tts = self.rconf[RuntimeConfiguration.TTS]
            if tts == Synthesizer.AWS:
                hints = (
                    u"You need to install Python module boto3 to use the AWS Polly TTS API wrapper. Run:",
                    u"$ pip install boto3",
                    u"or, to install for all users:",
                    u"$ sudo pip install boto3",
                )
            elif tts == Synthesizer.NUANCE:
                hints = (
                    u"You need to install Python module requests to use the Nuance TTS API wrapper. Run:",
                    u"$ pip install requests",
                    u"or, to install for all users:",
                    u"$ sudo pip install requests",
                )
            else:
                hints = ()
                self.print_error(u"An unexpected error occurred while synthesizing text:")
                self.print_error(u"%s" % exc)
            for hint in hints:
                self.print_error(hint)
        except Exception as exc:
            self.print_error(u"An unexpected error occurred while synthesizing text:")
            self.print_error(u"%s" % exc)

        # reached only when one of the handlers above ran
        return self.ERROR_EXIT_CODE
Пример #36
0
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        Informational switches (``--examples``, ``--examples-all``,
        ``--list-parameters``, ``--list-values``) are served first
        and return immediately.

        Otherwise a Task is built either from one of the examples
        in ``self.DEMOS`` or from the four positional arguments
        (audio file path or YouTube URL, text file path, config string,
        sync map file path), then it is executed and its sync map is written.
        Optionally, an HTML file is written next to the sync map
        (``--output-html``) and fragment reports are printed
        (``--zero``, ``--rate``, ``--faster-rate``).

        :rtype: int
        """
        if len(self.actual_arguments) < 1:
            return self.print_help()

        # informational switches: print and return
        if self.has_option([u"-e", u"--examples"]):
            return self.print_examples(False)

        if self.has_option(u"--examples-all"):
            return self.print_examples(True)

        if self.has_option([u"--list-parameters"]):
            return self.print_parameters()

        # "--list-values=PARAM" lists the values of PARAM,
        # a bare "--list-values" is served with the u"?" placeholder
        parameter = self.has_option_with_value(u"--list-values")
        if parameter is not None:
            return self.print_values(parameter)
        elif self.has_option(u"--list-values"):
            return self.print_values(u"?")

        # NOTE list() is needed for Python3, where keys() is not a list!
        demo = self.has_option(list(self.DEMOS.keys()))
        demo_parameters = u""
        download_from_youtube = self.has_option([u"-y", u"--youtube"])
        largest_audio = self.has_option(u"--largest-audio")
        keep_audio = self.has_option(u"--keep-audio")
        output_html = self.has_option(u"--output-html")
        validate = not self.has_option(u"--skip-validator")
        print_faster_rate = self.has_option(u"--faster-rate")
        print_rates = self.has_option(u"--rate")
        print_zero = self.has_option(u"--zero")
        presets_word = self.has_option(u"--presets-word")

        if demo:
            # examples skip the validation of the config string
            validate = False
            # only the first example switch found (in DEMOS iteration order) is used
            for key in self.DEMOS:
                if self.has_option(key):
                    demo_parameters = self.DEMOS[key]
                    audio_file_path = demo_parameters[u"audio"]
                    text_file_path = demo_parameters[u"text"]
                    config_string = demo_parameters[u"config"]
                    sync_map_file_path = demo_parameters[u"syncmap"]
                    # TODO allow injecting rconf options directly from DEMOS options field
                    if key == u"--example-cewsubprocess":
                        self.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = True
                    elif key == u"--example-ctw-espeak":
                        self.rconf[RuntimeConfiguration.TTS] = "custom"
                        self.rconf[RuntimeConfiguration.TTS_PATH] = self.CTW_ESPEAK
                    elif key == u"--example-ctw-speect":
                        self.rconf[RuntimeConfiguration.TTS] = "custom"
                        self.rconf[RuntimeConfiguration.TTS_PATH] = self.CTW_SPEECT
                    elif key == u"--example-festival":
                        self.rconf[RuntimeConfiguration.TTS] = "festival"
                    elif key == u"--example-mws":
                        self.rconf[RuntimeConfiguration.MFCC_WINDOW_LENGTH] = "1.500"
                        self.rconf[RuntimeConfiguration.MFCC_WINDOW_SHIFT] = "0.500"
                    elif key == u"--example-multilevel-tts":
                        self.rconf[RuntimeConfiguration.TTS_L1] = "festival"
                        self.rconf[RuntimeConfiguration.TTS_L2] = "festival"
                        self.rconf[RuntimeConfiguration.TTS_L3] = "espeak"
                    elif key == u"--example-words-festival-cache":
                        self.rconf[RuntimeConfiguration.TTS] = "festival"
                        self.rconf[RuntimeConfiguration.TTS_CACHE] = True
                    elif key == u"--example-faster-rate":
                        print_faster_rate = True
                    elif key == u"--example-no-zero":
                        print_zero = True
                    elif key == u"--example-py":
                        self.rconf[RuntimeConfiguration.C_EXTENSIONS] = False
                    elif key == u"--example-rate":
                        print_rates = True
                    elif key == u"--example-remove-nonspeech-rateaggressive":
                        print_rates = True
                    elif key == u"--example-youtube":
                        download_from_youtube = True
                    break
        else:
            # regular invocation: four positional arguments are required
            if len(self.actual_arguments) < 4:
                return self.print_help()
            audio_file_path = self.actual_arguments[0]
            text_file_path = self.actual_arguments[1]
            config_string = self.actual_arguments[2]
            sync_map_file_path = self.actual_arguments[3]

        if presets_word:
            self.print_info(u"Preset for word-level alignment")
            self.rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True
            self.rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True

        # the HTML output refers to the audio file, hence keep it
        html_file_path = None
        if output_html:
            keep_audio = True
            html_file_path = sync_map_file_path + u".html"

        # when downloading, the first argument is a URL, not a file path
        if download_from_youtube:
            youtube_url = gf.safe_unicode(audio_file_path)

        if (not download_from_youtube) and (not self.check_input_file(audio_file_path)):
            return self.ERROR_EXIT_CODE
        if not self.check_input_file(text_file_path):
            return self.ERROR_EXIT_CODE
        if not self.check_output_file(sync_map_file_path):
            return self.ERROR_EXIT_CODE
        if (html_file_path is not None) and (not self.check_output_file(html_file_path)):
            return self.ERROR_EXIT_CODE

        self.check_c_extensions()

        if demo:
            msg = []
            msg.append(u"Running example task with arguments:")
            if download_from_youtube:
                msg.append(u"  YouTube URL:   %s" % youtube_url)
            else:
                msg.append(u"  Audio file:    %s" % audio_file_path)
            msg.append(u"  Text file:     %s" % text_file_path)
            msg.append(u"  Config string: %s" % config_string)
            msg.append(u"  Sync map file: %s" % sync_map_file_path)
            if len(demo_parameters[u"options"]) > 0:
                msg.append(u"  Options:       %s" % demo_parameters[u"options"])
            self.print_info(u"\n".join(msg))

        if validate:
            self.print_info(u"Validating config string (specify --skip-validator to bypass)...")
            validator = Validator(logger=self.logger)
            result = validator.check_configuration_string(config_string, is_job=False, external_name=True)
            if not result.passed:
                self.print_error(u"The given config string is not valid:")
                self.print_generic(result.pretty_print())
                return self.ERROR_EXIT_CODE
            self.print_info(u"Validating config string... done")

        if download_from_youtube:
            try:
                self.print_info(u"Downloading audio from '%s' ..." % youtube_url)
                downloader = Downloader(logger=self.logger)
                # from here on, audio_file_path is the path returned by the downloader
                audio_file_path = downloader.audio_from_youtube(
                    youtube_url,
                    download=True,
                    output_file_path=None,
                    largest_audio=largest_audio
                )
                self.print_info(u"Downloading audio from '%s' ... done" % youtube_url)
            except ImportError:
                self.print_no_dependency_error()
                return self.ERROR_EXIT_CODE
            except Exception as exc:
                self.print_error(u"An unexpected error occurred while downloading audio from YouTube:")
                self.print_error(u"%s" % exc)
                return self.ERROR_EXIT_CODE
        else:
            # warn (but go on) if the audio file extension is unusual:
            # the user might have swapped the audio and text arguments
            audio_extension = gf.file_extension(audio_file_path)
            if audio_extension.lower() not in AudioFile.FILE_EXTENSIONS:
                self.print_warning(u"Your audio file path has extension '%s', which is uncommon for an audio file." % audio_extension)
                self.print_warning(u"Attempting at executing your Task anyway.")
                self.print_warning(u"If it fails, you might have swapped the first two arguments.")
                self.print_warning(u"The audio file path should be the first argument, the text file path the second.")

        try:
            self.print_info(u"Creating task...")
            task = Task(config_string, logger=self.logger)
            task.audio_file_path_absolute = audio_file_path
            task.text_file_path_absolute = text_file_path
            task.sync_map_file_path_absolute = sync_map_file_path
            self.print_info(u"Creating task... done")
        except Exception as exc:
            self.print_error(u"An unexpected error occurred while creating the task:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE

        try:
            self.print_info(u"Executing task...")
            executor = ExecuteTask(task=task, rconf=self.rconf, logger=self.logger)
            executor.execute()
            self.print_info(u"Executing task... done")
        except Exception as exc:
            self.print_error(u"An unexpected error occurred while executing the task:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE

        try:
            self.print_info(u"Creating output sync map file...")
            path = task.output_sync_map_file()
            self.print_info(u"Creating output sync map file... done")
            self.print_success(u"Created file '%s'" % path)
        except Exception as exc:
            self.print_error(u"An unexpected error occurred while writing the sync map file:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE

        if output_html:
            try:
                parameters = {}
                parameters[gc.PPN_TASK_OS_FILE_FORMAT] = task.configuration["o_format"]
                parameters[gc.PPN_TASK_OS_FILE_EAF_AUDIO_REF] = task.configuration["o_eaf_audio_ref"]
                parameters[gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF] = task.configuration["o_smil_audio_ref"]
                parameters[gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF] = task.configuration["o_smil_page_ref"]
                self.print_info(u"Creating output HTML file...")
                task.sync_map.output_html_for_tuning(audio_file_path, html_file_path, parameters)
                self.print_info(u"Creating output HTML file... done")
                self.print_success(u"Created file '%s'" % html_file_path)
            except Exception as exc:
                self.print_error(u"An unexpected error occurred while writing the HTML file:")
                self.print_error(u"%s" % exc)
                return self.ERROR_EXIT_CODE

        # NOTE(review): the error returns above exit before this cleanup,
        # so a downloaded file is left on disk when a later step fails;
        # confirm whether that is intended
        if download_from_youtube:
            if keep_audio:
                self.print_info(u"Option --keep-audio set: keeping downloaded file '%s'" % audio_file_path)
            else:
                gf.delete_file(None, audio_file_path)

        if print_zero:
            zero_duration = [leaf for leaf in task.sync_map_leaves(SyncMapFragment.REGULAR) if leaf.begin == leaf.end]
            if len(zero_duration) > 0:
                self.print_warning(u"Fragments with zero duration:")
                for fragment in zero_duration:
                    self.print_generic(u"  %s" % (fragment.pretty_print))

        if print_rates:
            self.print_info(u"Fragments with rates:")
            for fragment in task.sync_map_leaves(SyncMapFragment.REGULAR):
                self.print_generic(u"  %s\t%.3f" % (fragment.pretty_print, fragment.rate or 0.0))

        if print_faster_rate:
            max_rate = task.configuration["aba_rate_value"]
            if max_rate is not None:
                # the rate of a fragment may be None (see the "or 0.0" fallbacks
                # in this method): skip such fragments, since ordering None
                # against a number raises TypeError on Python 3
                faster = [
                    leaf for leaf in task.sync_map_leaves(SyncMapFragment.REGULAR)
                    if (leaf.rate is not None) and (leaf.rate >= max_rate + Decimal("0.001"))
                ]
                if len(faster) > 0:
                    self.print_warning(u"Fragments with rate greater than %.3f:" % max_rate)
                    for fragment in faster:
                        self.print_generic(u"  %s\t%.3f" % (fragment.pretty_print, fragment.rate or 0.0))

        return self.NO_ERROR_EXIT_CODE
Пример #37
0
    def _read_unparsed(self, lines):
        """
        Read text fragments from an unparsed format text file.

        The lines are joined with newlines and parsed with BeautifulSoup
        (``lxml`` parser). The elements whose ``class`` and/or ``id``
        attributes match the regexes found in ``self.parameters``
        are selected, and the text of each selected element
        is stored under the value of its ``id`` attribute.
        The ids are then sorted with the requested algorithm
        and one text fragment is created for each of them.

        :param list lines: the lines of the unparsed text file
        """
        from bs4 import BeautifulSoup

        #
        # TODO better and/or parametric parsing,
        #      for example, removing tags but keeping text, etc.
        #
        self.log(u"Parsing fragments from unparsed text format")

        # transform text in a soup object
        self.log(u"Creating soup")
        soup = BeautifulSoup("\n".join(lines), "lxml")

        # build the bs4 attribute filter from class_regex and id_regex:
        # a regex that is absent or None adds no constraint
        attrs_filter = {}
        for attr_name, param_key in (
            ("class", gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX),
            ("id", gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX),
        ):
            if param_key not in self.parameters:
                continue
            pattern = self.parameters[param_key]
            if pattern is None:
                continue
            self.log([u"Regex for %s: '%s'", attr_name, pattern])
            # the given regex must match a whole word inside the attribute value
            attrs_filter[attr_name] = re.compile(r".*\b" + pattern + r"\b.*")

        # collect id and text of the matching elements, in document order
        self.log([u"Finding elements matching attributes '%s'", attrs_filter])
        text_by_id = {}
        found_ids = []
        for node in soup.findAll(attrs=attrs_filter):
            try:
                node_id = gf.safe_unicode(node["id"])
                node_text = gf.safe_unicode(node.text)
            except KeyError:
                # the node is skipped (e.g., it has no "id" attribute)
                self.log_warn(u"KeyError while parsing a node")
            else:
                text_by_id[node_id] = node_text
                found_ids.append(node_id)

        # sort by ID as requested
        sort_algorithm = gf.safe_get(
            dictionary=self.parameters,
            key=gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT,
            default_value=IDSortingAlgorithm.UNSORTED,
            can_return_none=False
        )
        self.log([u"Sorting text fragments using '%s'", sort_algorithm])
        ordered_ids = IDSortingAlgorithm(sort_algorithm).sort(found_ids)

        # append to fragments: each fragment has a single line of text
        self.log(u"Appending fragments")
        self._create_text_fragments(
            [(node_id, [text_by_id[node_id]]) for node_id in ordered_ids]
        )
Пример #38
0
    def perform_command(self):
        """
        Perform command and return the appropriate exit code.

        :rtype: int
        """
        if len(self.actual_arguments) < 1:
            return self.print_help()

        if self.has_option([u"-e", u"--examples"]):
            return self.print_examples(False)

        if self.has_option(u"--examples-all"):
            return self.print_examples(True)

        if self.has_option([u"--list-parameters"]):
            return self.print_parameters()

        parameter = self.has_option_with_value(u"--list-values")
        if parameter is not None:
            return self.print_values(parameter)
        elif self.has_option(u"--list-values"):
            return self.print_values(u"?")

        # NOTE list() is needed for Python3, where keys() is not a list!
        demo = self.has_option(list(self.DEMOS.keys()))
        demo_parameters = u""
        download_from_youtube = self.has_option([u"-y", u"--youtube"])
        largest_audio = self.has_option(u"--largest-audio")
        keep_audio = self.has_option(u"--keep-audio")
        output_html = self.has_option(u"--output-html")
        validate = not self.has_option(u"--skip-validator")
        print_faster_rate = self.has_option(u"--faster-rate")
        print_rates = self.has_option(u"--rate")
        print_zero = self.has_option(u"--zero")
        presets_word = self.has_option(u"--presets-word")

        if demo:
            validate = False
            for key in self.DEMOS:
                if self.has_option(key):
                    demo_parameters = self.DEMOS[key]
                    audio_file_path = demo_parameters[u"audio"]
                    text_file_path = demo_parameters[u"text"]
                    config_string = demo_parameters[u"config"]
                    sync_map_file_path = demo_parameters[u"syncmap"]
                    # TODO allow injecting rconf options directly from DEMOS options field
                    if key == u"--example-cewsubprocess":
                        self.rconf[
                            RuntimeConfiguration.CEW_SUBPROCESS_ENABLED] = True
                    elif key == u"--example-ctw-espeak":
                        self.rconf[RuntimeConfiguration.TTS] = "custom"
                        self.rconf[
                            RuntimeConfiguration.TTS_PATH] = self.CTW_ESPEAK
                    elif key == u"--example-ctw-speect":
                        self.rconf[RuntimeConfiguration.TTS] = "custom"
                        self.rconf[
                            RuntimeConfiguration.TTS_PATH] = self.CTW_SPEECT
                    elif key == u"--example-festival":
                        self.rconf[RuntimeConfiguration.TTS] = "festival"
                    elif key == u"--example-mws":
                        self.rconf[
                            RuntimeConfiguration.MFCC_WINDOW_LENGTH] = "1.500"
                        self.rconf[
                            RuntimeConfiguration.MFCC_WINDOW_SHIFT] = "0.500"
                    elif key == u"--example-multilevel-tts":
                        self.rconf[RuntimeConfiguration.TTS_L1] = "festival"
                        self.rconf[RuntimeConfiguration.TTS_L2] = "festival"
                        self.rconf[RuntimeConfiguration.TTS_L3] = "espeak"
                    elif key == u"--example-words-festival-cache":
                        self.rconf[RuntimeConfiguration.TTS] = "festival"
                        self.rconf[RuntimeConfiguration.TTS_CACHE] = True
                    elif key == u"--example-faster-rate":
                        print_faster_rate = True
                    elif key == u"--example-no-zero":
                        print_zero = True
                    elif key == u"--example-py":
                        self.rconf[RuntimeConfiguration.C_EXTENSIONS] = False
                    elif key == u"--example-rate":
                        print_rates = True
                    elif key == u"--example-remove-nonspeech-rateaggressive":
                        print_rates = True
                    elif key == u"--example-youtube":
                        download_from_youtube = True
                    break
        else:
            if len(self.actual_arguments) < 4:
                return self.print_help()
            audio_file_path = self.actual_arguments[0]
            text_file_path = self.actual_arguments[1]
            config_string = self.actual_arguments[2]
            sync_map_file_path = self.actual_arguments[3]

        if presets_word:
            self.print_info(u"Preset for word-level alignment")
            self.rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH] = True
            self.rconf[RuntimeConfiguration.MFCC_MASK_NONSPEECH_L3] = True

        html_file_path = None
        if output_html:
            keep_audio = True
            html_file_path = sync_map_file_path + u".html"

        if download_from_youtube:
            youtube_url = gf.safe_unicode(audio_file_path)

        if (not download_from_youtube) and (
                not self.check_input_file(audio_file_path)):
            return self.ERROR_EXIT_CODE
        if not self.check_input_file(text_file_path):
            return self.ERROR_EXIT_CODE
        if not self.check_output_file(sync_map_file_path):
            return self.ERROR_EXIT_CODE
        if (html_file_path
                is not None) and (not self.check_output_file(html_file_path)):
            return self.ERROR_EXIT_CODE

        self.check_c_extensions()

        if demo:
            msg = []
            msg.append(u"Running example task with arguments:")
            if download_from_youtube:
                msg.append(u"  YouTube URL:   %s" % youtube_url)
            else:
                msg.append(u"  Audio file:    %s" % audio_file_path)
            msg.append(u"  Text file:     %s" % text_file_path)
            msg.append(u"  Config string: %s" % config_string)
            msg.append(u"  Sync map file: %s" % sync_map_file_path)
            if len(demo_parameters[u"options"]) > 0:
                msg.append(u"  Options:       %s" %
                           demo_parameters[u"options"])
            self.print_info(u"\n".join(msg))

        if validate:
            self.print_info(
                u"Validating config string (specify --skip-validator to bypass)..."
            )
            validator = Validator(logger=self.logger)
            result = validator.check_configuration_string(config_string,
                                                          is_job=False,
                                                          external_name=True)
            if not result.passed:
                self.print_error(u"The given config string is not valid:")
                self.print_generic(result.pretty_print())
                return self.ERROR_EXIT_CODE
            self.print_info(u"Validating config string... done")

        if download_from_youtube:
            try:
                self.print_info(u"Downloading audio from '%s' ..." %
                                youtube_url)
                downloader = Downloader(logger=self.logger)
                audio_file_path = downloader.audio_from_youtube(
                    youtube_url,
                    download=True,
                    output_file_path=None,
                    largest_audio=largest_audio)
                self.print_info(u"Downloading audio from '%s' ... done" %
                                youtube_url)
            except ImportError:
                self.print_no_dependency_error()
                return self.ERROR_EXIT_CODE
            except Exception as exc:
                self.print_error(
                    u"An unexpected error occurred while downloading audio from YouTube:"
                )
                self.print_error(u"%s" % exc)
                return self.ERROR_EXIT_CODE
        else:
            audio_extension = gf.file_extension(audio_file_path)
            if audio_extension.lower() not in AudioFile.FILE_EXTENSIONS:
                self.print_warning(
                    u"Your audio file path has extension '%s', which is uncommon for an audio file."
                    % audio_extension)
                self.print_warning(
                    u"Attempting at executing your Task anyway.")
                self.print_warning(
                    u"If it fails, you might have swapped the first two arguments."
                )
                self.print_warning(
                    u"The audio file path should be the first argument, the text file path the second."
                )

        try:
            self.print_info(u"Creating task...")
            task = Task(config_string, logger=self.logger)
            task.audio_file_path_absolute = audio_file_path
            task.text_file_path_absolute = text_file_path
            task.sync_map_file_path_absolute = sync_map_file_path
            self.print_info(u"Creating task... done")
        except Exception as exc:
            self.print_error(
                u"An unexpected error occurred while creating the task:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE

        try:
            self.print_info(u"Executing task...")
            executor = ExecuteTask(task=task,
                                   rconf=self.rconf,
                                   logger=self.logger)
            executor.execute()
            self.print_info(u"Executing task... done")
        except Exception as exc:
            self.print_error(
                u"An unexpected error occurred while executing the task:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE

        try:
            self.print_info(u"Creating output sync map file...")
            path = task.output_sync_map_file()
            self.print_info(u"Creating output sync map file... done")
            self.print_success(u"Created file '%s'" % path)
        except Exception as exc:
            self.print_error(
                u"An unexpected error occurred while writing the sync map file:"
            )
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE

        if output_html:
            try:
                parameters = {}
                parameters[gc.PPN_TASK_OS_FILE_FORMAT] = task.configuration[
                    "o_format"]
                parameters[
                    gc.PPN_TASK_OS_FILE_EAF_AUDIO_REF] = task.configuration[
                        "o_eaf_audio_ref"]
                parameters[
                    gc.PPN_TASK_OS_FILE_SMIL_AUDIO_REF] = task.configuration[
                        "o_smil_audio_ref"]
                parameters[
                    gc.PPN_TASK_OS_FILE_SMIL_PAGE_REF] = task.configuration[
                        "o_smil_page_ref"]
                self.print_info(u"Creating output HTML file...")
                task.sync_map.output_html_for_tuning(audio_file_path,
                                                     html_file_path,
                                                     parameters)
                self.print_info(u"Creating output HTML file... done")
                self.print_success(u"Created file '%s'" % html_file_path)
            except Exception as exc:
                self.print_error(
                    u"An unexpected error occurred while writing the HTML file:"
                )
                self.print_error(u"%s" % exc)
                return self.ERROR_EXIT_CODE

        if download_from_youtube:
            if keep_audio:
                self.print_info(
                    u"Option --keep-audio set: keeping downloaded file '%s'" %
                    audio_file_path)
            else:
                gf.delete_file(None, audio_file_path)

        if print_zero:
            zero_duration = [
                l for l in task.sync_map_leaves(SyncMapFragment.REGULAR)
                if l.begin == l.end
            ]
            if len(zero_duration) > 0:
                self.print_warning(u"Fragments with zero duration:")
                for fragment in zero_duration:
                    self.print_generic(u"  %s" % (fragment.pretty_print))

        if print_rates:
            self.print_info(u"Fragments with rates:")
            for fragment in task.sync_map_leaves(SyncMapFragment.REGULAR):
                self.print_generic(
                    u"  %s\t%.3f" %
                    (fragment.pretty_print, fragment.rate or 0.0))

        if print_faster_rate:
            max_rate = task.configuration["aba_rate_value"]
            if max_rate is not None:
                faster = [
                    l for l in task.sync_map_leaves(SyncMapFragment.REGULAR)
                    if l.rate >= max_rate + Decimal("0.001")
                ]
                if len(faster) > 0:
                    self.print_warning(
                        u"Fragments with rate greater than %.3f:" % max_rate)
                    for fragment in faster:
                        self.print_generic(
                            u"  %s\t%.3f" %
                            (fragment.pretty_print, fragment.rate or 0.0))

        return self.NO_ERROR_EXIT_CODE
Пример #39
0
    def perform_command(self):
        """
        Synthesize the text given on the command line into an audio file.

        The four positional arguments are, in order: the text format
        (``list`` or one of ``TextFileFormat.ALLOWED_VALUES``),
        the text itself (or the path of the text file),
        the language, and the path of the output audio file.

        Help is printed if fewer than four arguments are given
        or if the text format is not recognized.

        :rtype: int
        """
        arguments = self.actual_arguments
        if len(arguments) < 4:
            return self.print_help()

        # the first argument selects how the second one is interpreted
        text_format = gf.safe_unicode(arguments[0])
        is_list = (text_format == u"list")
        if (not is_list) and (text_format not in TextFileFormat.ALLOWED_VALUES):
            return self.print_help()
        if is_list:
            text = gf.safe_unicode(arguments[1])
        else:
            text = arguments[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE

        # optional switches
        option_value = self.has_option_with_value
        l1_id_regex = option_value(u"--l1-id-regex")
        l2_id_regex = option_value(u"--l2-id-regex")
        l3_id_regex = option_value(u"--l3-id-regex")
        id_regex = option_value(u"--id-regex")
        class_regex = option_value(u"--class-regex")
        sort = option_value(u"--sort")
        backwards = self.has_option([u"-b", u"--backwards"])
        quit_after = gf.safe_float(option_value(u"--quit-after"), None)
        start_fragment = gf.safe_int(option_value(u"--start"), None)
        end_fragment = gf.safe_int(option_value(u"--end"), None)
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
        }

        # munparsed needs all three level regexes
        if text_format == TextFileFormat.MUNPARSED:
            level_regexes = (l1_id_regex, l2_id_regex, l3_id_regex)
            if any(regex is None for regex in level_regexes):
                self.print_error(
                    u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format"
                )
                return self.ERROR_EXIT_CODE
        # unparsed needs at least one of the two regexes
        if text_format == TextFileFormat.UNPARSED:
            if (id_regex is None) and (class_regex is None):
                self.print_error(
                    u"You must specify --id-regex and/or --class-regex for unparsed format"
                )
                return self.ERROR_EXIT_CODE

        language = gf.safe_unicode(arguments[2])

        output_file_path = arguments[3]
        if not self.check_output_file(output_file_path):
            return self.ERROR_EXIT_CODE

        # build the text file and select the requested slice
        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(
                u"Unable to build a TextFile from the given parameters")
            return self.ERROR_EXIT_CODE
        if len(text_file) == 0:
            self.print_error(u"No text fragments found")
            return self.ERROR_EXIT_CODE
        text_file.set_language(language)
        fragment_count = len(text_file)
        self.print_info(u"Read input text with %d fragments" % fragment_count)
        if start_fragment is not None:
            self.print_info(u"Slicing from index %d" % start_fragment)
        if end_fragment is not None:
            self.print_info(u"Slicing to index %d" % end_fragment)
        text_slice = text_file.get_slice(start_fragment, end_fragment)
        slice_count = len(text_slice)
        self.print_info(u"Synthesizing %d fragments" % slice_count)

        if quit_after is not None:
            self.print_info(
                u"Stop synthesizing upon reaching %.3f seconds" % quit_after)

        try:
            synthesizer = Synthesizer(rconf=self.rconf, logger=self.logger)
            synthesizer.synthesize(
                text_slice,
                output_file_path,
                quit_after=quit_after,
                backwards=backwards
            )
            self.print_success(u"Created file '%s'" % output_file_path)
            synthesizer.clear_cache()
            return self.NO_ERROR_EXIT_CODE
        except ImportError as exc:
            # a missing optional module is reported with install instructions
            # when the configured TTS wrapper is known to require one
            tts = self.rconf[RuntimeConfiguration.TTS]
            if tts == Synthesizer.AWS:
                for message in (
                    u"You need to install Python module boto3 to use the AWS Polly TTS API wrapper. Run:",
                    u"$ pip install boto3",
                    u"or, to install for all users:",
                    u"$ sudo pip install boto3",
                ):
                    self.print_error(message)
            elif tts == Synthesizer.NUANCE:
                for message in (
                    u"You need to install Python module requests to use the Nuance TTS API wrapper. Run:",
                    u"$ pip install requests",
                    u"or, to install for all users:",
                    u"$ sudo pip install requests",
                ):
                    self.print_error(message)
            else:
                self.print_error(
                    u"An unexpected error occurred while synthesizing text:")
                self.print_error(u"%s" % exc)
        except Exception as exc:
            self.print_error(
                u"An unexpected error occurred while synthesizing text:")
            self.print_error(u"%s" % exc)

        # reached only when one of the handlers above ran
        return self.ERROR_EXIT_CODE
Пример #40
0
    def _synthesize_multiple_c_extension(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize all the fragments of ``text_file`` into a single
        audio file by calling the ``aeneas.cfw`` C extension.

        Return ``(True, (anchors, current_time, num_chars))`` on success,
        or ``(False, None)`` if importing or calling the extension fails.

        :param text_file: the object exposing the ``fragments`` to synthesize
        :param output_file_path: the path passed to the extension as output file
        :param quit_after: converted to ``float`` for the extension;
                           ``None`` is passed as ``0.0``
        :param bool backwards: if true, ``1`` is passed as backwards flag and
                               the fragments are walked in reverse order
                               when building the anchors
        :rtype: (bool, (list, :class:`~aeneas.exacttiming.TimeValue`, int))
        """
        self.log(u"Synthesizing using C extension...")

        # translate the Python parameters into the values expected by the C code
        try:
            c_quit_after = float(quit_after)
        except TypeError:
            c_quit_after = 0.0
        c_backwards = 1 if backwards else 0
        self.log([u"output_file_path: %s", output_file_path])
        self.log([u"c_quit_after:     %.3f", c_quit_after])
        self.log([u"c_backwards:      %d", c_backwards])

        # one (voice code, text) pair per fragment, as Unicode strings
        self.log(u"Preparing u_text...")
        fragments = text_file.fragments
        u_text = []
        for fragment in fragments:
            language = fragment.language
            text = fragment.filtered_text
            if language is None:
                language = self.DEFAULT_LANGUAGE
            voice_code = self.VOICE_CODE_TO_SUBPROCESS[self._language_to_voice_code(language)]
            u_text.append((voice_code, u"" if text is None else text))
        self.log(u"Preparing u_text... done")

        # Python 2 => pass byte strings, Python 3 => pass Unicode strings
        self.log(u"Preparing c_text...")
        convert = gf.safe_bytes if gf.PY2 else gf.safe_unicode
        c_text = [(convert(pair[0]), convert(pair[1])) for pair in u_text]
        self.log(u"Preparing c_text... done")

        # call C extension
        self.log(u"Calling aeneas.cfw directly")
        try:
            self.log(u"Importing aeneas.cfw...")
            import aeneas.cfw.cfw
            self.log(u"Importing aeneas.cfw... done")
            self.log(u"Calling aeneas.cfw...")
            sr, sf, intervals = aeneas.cfw.cfw.synthesize_multiple(
                output_file_path, c_quit_after, c_backwards, c_text
            )
            self.log(u"Calling aeneas.cfw... done")
        except Exception as exc:
            self.log_exc(u"An unexpected error occurred while running cfw", exc, False, None)
            return (False, None)

        self.log([u"sr: %d", sr])
        self.log([u"sf: %d", sf])

        # create output: one anchor for each of the first sf fragments
        # NOTE anchors do not make sense if backwards == True
        ordered = fragments[::-1] if backwards else fragments
        anchors = []
        num_chars = 0
        current_time = TimeValue("0.000")
        for index in range(sf):
            fragment = ordered[index]
            begin = TimeValue(intervals[index][0])
            anchors.append([begin, fragment.identifier, fragment.filtered_text])
            num_chars += fragment.characters
            # the end of the latest interval is the total time so far
            current_time = TimeValue(intervals[index][1])

        # return output
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Synthesizing using C extension... done")
        return (True, (anchors, current_time, num_chars))
Пример #41
0
    def read_properties(self, audio_file_path):
        """
        Read the properties of an audio file
        and return them as a dictionary.

        The properties are the ``key=value`` lines printed by ``ffprobe``
        on its standard output, up to the end-of-stream marker.
        The duration is converted to a ``TimeValue``; if the value found
        in stdout cannot be converted, the stderr output is scanned for it.

        Example: ::

            d["index"]=0
            d["codec_name"]=mp3
            d["codec_long_name"]=MP3 (MPEG audio layer 3)
            d["profile"]=unknown
            d["codec_type"]=audio
            d["codec_time_base"]=1/44100
            d["codec_tag_string"]=[0][0][0][0]
            d["codec_tag"]=0x0000
            d["sample_fmt"]=s16p
            d["sample_rate"]=44100
            d["channels"]=1
            d["channel_layout"]=mono
            d["bits_per_sample"]=0
            d["id"]=N/A
            d["r_frame_rate"]=0/0
            d["avg_frame_rate"]=0/0
            d["time_base"]=1/14112000
            d["start_pts"]=0
            d["start_time"]=0.000000
            d["duration_ts"]=1545083190
            d["duration"]=109.487188
            d["bit_rate"]=128000
            d["max_bit_rate"]=N/A
            d["bits_per_raw_sample"]=N/A
            d["nb_frames"]=N/A
            d["nb_read_frames"]=N/A
            d["nb_read_packets"]=N/A
            d["DISPOSITION:default"]=0
            d["DISPOSITION:dub"]=0
            d["DISPOSITION:original"]=0
            d["DISPOSITION:comment"]=0
            d["DISPOSITION:lyrics"]=0
            d["DISPOSITION:karaoke"]=0
            d["DISPOSITION:forced"]=0
            d["DISPOSITION:hearing_impaired"]=0
            d["DISPOSITION:visual_impaired"]=0
            d["DISPOSITION:clean_effects"]=0
            d["DISPOSITION:attached_pic"]=0

        :param string audio_file_path: the path of the audio file to analyze
        :rtype: dict
        :raises: TypeError: if ``audio_file_path`` is None
        :raises: OSError: if the file at ``audio_file_path`` cannot be read
        :raises: FFPROBEParsingError: if the call to ``ffprobe`` does not produce any output
        :raises: FFPROBEPathError: if the path to the ``ffprobe`` executable cannot be called
        :raises: FFPROBEUnsupportedFormatError: if the file has a format not supported by ``ffprobe``
        """

        # test if we can read the file at audio_file_path
        if audio_file_path is None:
            self.log_exc(u"The audio file path is None", None, True, TypeError)
        if not gf.file_can_be_read(audio_file_path):
            self.log_exc(u"Input file '%s' cannot be read" % (audio_file_path),
                         None, True, OSError)

        # call ffprobe
        arguments = [self.rconf[RuntimeConfiguration.FFPROBE_PATH]]
        arguments.extend(self.FFPROBE_PARAMETERS)
        arguments.append(audio_file_path)
        self.log([u"Calling with arguments '%s'", arguments])
        try:
            proc = subprocess.Popen(arguments,
                                    stdout=subprocess.PIPE,
                                    stdin=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            (stdoutdata, stderrdata) = proc.communicate()
            proc.stdout.close()
            proc.stdin.close()
            proc.stderr.close()
        except OSError as exc:
            self.log_exc(
                u"Unable to call the '%s' ffprobe executable" %
                (self.rconf[RuntimeConfiguration.FFPROBE_PATH]), exc, True,
                FFPROBEPathError)
        self.log(u"Call completed")

        # check there is some output
        # NOTE(review): this tests stdout for None but stderr for emptiness;
        # confirm whether the asymmetry is intended
        if (stdoutdata is None) or (len(stderrdata) == 0):
            self.log_exc(u"ffprobe produced no output", None, True,
                         FFPROBEParsingError)

        # decode stdoutdata and stderrdata to Unicode string
        try:
            stdoutdata = gf.safe_unicode(stdoutdata)
            stderrdata = gf.safe_unicode(stderrdata)
        except UnicodeDecodeError as exc:
            self.log_exc(u"Unable to decode ffprobe out/err", exc, True,
                         FFPROBEParsingError)

        # dictionary for the results
        results = {
            self.STDOUT_CHANNELS: None,
            self.STDOUT_CODEC_NAME: None,
            self.STDOUT_DURATION: None,
            self.STDOUT_SAMPLE_RATE: None
        }

        # scan the first audio stream of the ffprobe stdout output
        # TODO more robust parsing
        # TODO deal with multiple audio streams
        for line in stdoutdata.splitlines():
            if line == self.STDOUT_END_STREAM:
                self.log(u"Reached end of the stream")
                break
            # only lines with exactly one "=" are treated as properties
            pieces = line.split("=")
            if len(pieces) == 2:
                key, value = pieces
                results[key] = value
                self.log([u"Found property '%s'='%s'", key, value])

        try:
            self.log([
                u"Duration found in stdout: '%s'",
                results[self.STDOUT_DURATION]
            ])
            results[self.STDOUT_DURATION] = TimeValue(
                results[self.STDOUT_DURATION])
            self.log(u"Valid duration")
        except Exception:
            # any conversion failure triggers the stderr fallback;
            # KeyboardInterrupt and SystemExit are deliberately not caught
            self.log_warn(u"Invalid duration")
            results[self.STDOUT_DURATION] = None
            # try scanning ffprobe stderr output
            for line in stderrdata.splitlines():
                match = self.STDERR_DURATION_REGEX.search(line)
                if match is not None:
                    self.log([u"Found matching line '%s'", line])
                    results[self.STDOUT_DURATION] = gf.time_from_hhmmssmmm(
                        line)
                    self.log([
                        u"Extracted duration '%.3f'",
                        results[self.STDOUT_DURATION]
                    ])
                    break

        if results[self.STDOUT_DURATION] is None:
            self.log_exc(
                u"No duration found in stdout or stderr. Unsupported audio file format?",
                None, True, FFPROBEUnsupportedFormatError)

        # return dictionary
        self.log(u"Returning dict")
        return results
Пример #42
0
    def perform_command(self):
        """
        Detect the audio interval for the given text in the given
        audio file, and print the result.

        The four positional arguments are, in order: the text format
        (``list`` or one of ``TextFileFormat.ALLOWED_VALUES``),
        the text itself (or the path of the text file),
        the language, and the path of the audio file.

        Help is printed if fewer than four arguments are given
        or if the text format is not recognized.

        :rtype: int
        """
        arguments = self.actual_arguments
        if len(arguments) < 4:
            return self.print_help()

        # the first argument selects how the second one is interpreted
        text_format = gf.safe_unicode(arguments[0])
        is_list = (text_format == u"list")
        if (not is_list) and (text_format not in TextFileFormat.ALLOWED_VALUES):
            return self.print_help()
        if is_list:
            text = gf.safe_unicode(arguments[1])
        else:
            text = arguments[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE

        # optional switches controlling how the text is parsed
        option_value = self.has_option_with_value
        l1_id_regex = option_value(u"--l1-id-regex")
        l2_id_regex = option_value(u"--l2-id-regex")
        l3_id_regex = option_value(u"--l3-id-regex")
        id_regex = option_value(u"--id-regex")
        class_regex = option_value(u"--class-regex")
        sort = option_value(u"--sort")
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_id_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_id_regex,
            gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
            gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
            gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT: sort
        }

        # munparsed needs all three level regexes
        if text_format == TextFileFormat.MUNPARSED:
            level_regexes = (l1_id_regex, l2_id_regex, l3_id_regex)
            if any(regex is None for regex in level_regexes):
                self.print_error(u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format")
                return self.ERROR_EXIT_CODE
        # unparsed needs at least one of the two regexes
        if text_format == TextFileFormat.UNPARSED:
            if (id_regex is None) and (class_regex is None):
                self.print_error(u"You must specify --id-regex and/or --class-regex for unparsed format")
                return self.ERROR_EXIT_CODE

        language = gf.safe_unicode(arguments[2])

        audio_file_path = arguments[3]
        if not self.check_input_file(audio_file_path):
            return self.ERROR_EXIT_CODE

        # build the text file
        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(u"Unable to build a TextFile from the given parameters")
            return self.ERROR_EXIT_CODE
        if len(text_file) == 0:
            self.print_error(u"No text fragments found")
            return self.ERROR_EXIT_CODE
        text_file.set_language(language)
        fragment_count = len(text_file)
        self.print_info(u"Read input text with %d fragments" % fragment_count)

        # load the audio file
        self.print_info(u"Reading audio...")
        try:
            audio_file_mfcc = AudioFileMFCC(
                audio_file_path,
                rconf=self.rconf,
                logger=self.logger
            )
        except AudioFileConverterError:
            ffmpeg_path = self.rconf[RuntimeConfiguration.FFMPEG_PATH]
            self.print_error(u"Unable to call the ffmpeg executable '%s'" % ffmpeg_path)
            self.print_error(u"Make sure the path to ffmpeg is correct")
            return self.ERROR_EXIT_CODE
        except (AudioFileUnsupportedFormatError, AudioFileNotInitializedError):
            self.print_error(u"Cannot read file '%s'" % audio_file_path)
            self.print_error(u"Check that its format is supported by ffmpeg")
            return self.ERROR_EXIT_CODE
        except Exception as exc:
            self.print_error(u"An unexpected error occurred while reading the audio file:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE
        self.print_info(u"Reading audio... done")

        self.print_info(u"Running VAD...")
        audio_file_mfcc.run_vad()
        self.print_info(u"Running VAD... done")

        # optional head/tail bounds, None when not given on the command line
        min_head, max_head, min_tail, max_tail = [
            gf.safe_float(option_value(switch), None)
            for switch in (u"--min-head", u"--max-head", u"--min-tail", u"--max-tail")
        ]

        self.print_info(u"Detecting audio interval...")
        detector = SD(audio_file_mfcc, text_file, rconf=self.rconf, logger=self.logger)
        start, end = detector.detect_interval(min_head, max_head, min_tail, max_tail)
        self.print_info(u"Detecting audio interval... done")

        self.print_result(audio_file_mfcc.audio_length, start, end)
        return self.NO_ERROR_EXIT_CODE
Пример #43
0
    def run(self, arguments, show_help=True):
        """
        Program entry point.

        Decode the arguments, handle the "special" switches
        (help, version, verbosity, runtime configuration, log file),
        store the remaining arguments in ``self.actual_arguments``,
        call ``self.perform_command()`` and exit with its return code.

        Please note that the first item in ``arguments`` is discarded,
        as it is assumed to be the script/invocation name;
        pass a "dumb" placeholder if you call this method with
        an argument different than ``sys.argv``.

        :param arguments: the list of arguments
        :type  arguments: list
        :param show_help: if ``False``, do not show help on ``-h`` and ``--help``
        :type  show_help: bool
        :rtype: int
        """
        # convert arguments into Unicode strings
        if self.use_sys:
            # check that sys.stdin.encoding and sys.stdout.encoding are set to utf-8
            if not gf.FROZEN:
                # the interpreter may report the encoding name in lower case
                # (e.g. 'utf-8'), so compare case-insensitively;
                # a missing (None) encoding still triggers the warning
                utf8_names = ["UTF-8", "UTF8"]
                if (sys.stdin.encoding or "").upper() not in utf8_names:
                    self.print_warning(
                        u"The default input encoding is not UTF-8.")
                    self.print_warning(
                        u"You might want to set 'PYTHONIOENCODING=UTF-8' in your shell."
                    )
                if (sys.stdout.encoding or "").upper() not in utf8_names:
                    self.print_warning(
                        u"The default output encoding is not UTF-8.")
                    self.print_warning(
                        u"You might want to set 'PYTHONIOENCODING=UTF-8' in your shell."
                    )
            # decode using sys.stdin.encoding
            args = [gf.safe_unicode_stdin(arg) for arg in arguments]
        else:
            # decode using utf-8 (but you should pass Unicode strings as parameters anyway)
            args = [gf.safe_unicode(arg) for arg in arguments]

        if show_help:
            if u"-h" in args:
                return self.print_help(short=True)

            if u"--help" in args:
                return self.print_help(short=False)

            if u"--version" in args:
                return self.print_name_version()

        # store formal arguments
        self.formal_arguments_raw = arguments
        self.formal_arguments = args

        # to obtain the actual arguments,
        # remove the first one and "special" switches
        args = args[1:]
        set_args = set(args)

        # set verbosity, if requested
        for flag in set([u"-v", u"--verbose"]) & set_args:
            self.verbose = True
            args.remove(flag)
        for flag in set([u"-vv", u"--very-verbose"]) & set_args:
            self.verbose = True
            self.very_verbose = True
            args.remove(flag)

        # set RuntimeConfiguration string, if specified
        for flag in [u"-r", u"--runtime-configuration"]:
            rconf_string = self.has_option_with_value(flag,
                                                      actual_arguments=False)
            if rconf_string is not None:
                self.rconf = RuntimeConfiguration(rconf_string)
                args.remove("%s=%s" % (flag, rconf_string))

        # set log file path, if requested:
        # "flag=path" logs to the given path,
        # a bare flag logs to a newly created temporary file
        log_path = None
        for flag in [u"-l", u"--log"]:
            log_path = self.has_option_with_value(flag, actual_arguments=False)
            if log_path is not None:
                args.remove("%s=%s" % (flag, log_path))
            elif flag in set_args:
                # NOTE(review): the handler returned by gf.tmp_file is not
                # used or closed here -- confirm whether it should be
                handler, log_path = gf.tmp_file(
                    suffix=u".log",
                    root=self.rconf[RuntimeConfiguration.TMP_PATH])
                args.remove(flag)
            if log_path is not None:
                self.log_file_path = log_path

        # if no actual arguments left, print help
        if (len(args) < 1) and (show_help):
            return self.print_help(short=True)

        # store actual arguments
        self.actual_arguments = args

        # create logger
        self.logger = Logger(tee=self.verbose,
                             tee_show_datetime=self.very_verbose)
        self.log([u"Formal arguments: %s", self.formal_arguments])
        self.log([u"Actual arguments: %s", self.actual_arguments])
        self.log([u"Runtime configuration: '%s'", self.rconf.config_string()])

        # perform command
        exit_code = self.perform_command()
        self.log([u"Execution completed with code %d", exit_code])

        # output log if requested
        if self.log_file_path is not None:
            self.log([
                u"User requested saving log to file '%s'", self.log_file_path
            ])
            self.logger.write(self.log_file_path)
            if self.use_sys:
                self.print_info(u"Log written to file '%s'" %
                                self.log_file_path)

        return self.exit(exit_code)
Пример #44
0
    def _synthesize_multiple_c_extension(self, text_file, output_file_path, quit_after=None, backwards=False):
        """
        Synthesize all the fragments of ``text_file`` into a single
        audio file by calling the ``aeneas.cew`` C extension.

        If ``CEW_SUBPROCESS_ENABLED`` is set in the runtime configuration,
        the extension is first called through ``aeneas.cewsubprocess``;
        if that attempt fails, or if it is not enabled,
        ``aeneas.cew`` is called directly.

        Return ``(True, (anchors, current_time, num_chars))`` on success,
        or ``(False, None)`` if the direct call fails.

        :param text_file: the object exposing the ``fragments`` to synthesize
        :param output_file_path: the path passed to the extension as output file
        :param quit_after: converted to ``float`` for the extension;
                           ``None`` is passed as ``0.0``
        :param bool backwards: if true, ``1`` is passed as backwards flag and
                               the fragments are walked in reverse order
                               when building the anchors
        :rtype: (bool, (list, :class:`~aeneas.timevalue.TimeValue`, int))
        """
        self.log(u"Synthesizing using C extension...")

        # translate the Python parameters into the values expected by the C code
        try:
            c_quit_after = float(quit_after)
        except TypeError:
            c_quit_after = 0.0
        c_backwards = 1 if backwards else 0
        self.log([u"output_file_path: %s", output_file_path])
        self.log([u"c_quit_after:     %.3f", c_quit_after])
        self.log([u"c_backwards:      %d", c_backwards])

        # one (voice code, text) pair per fragment, as Unicode strings
        self.log(u"Preparing u_text...")
        fragments = text_file.fragments
        u_text = []
        for fragment in fragments:
            language = fragment.language
            text = fragment.filtered_text
            if language is None:
                language = self.DEFAULT_LANGUAGE
            voice_code = self._language_to_voice_code(language)
            u_text.append((voice_code, u"" if text is None else text))
        self.log(u"Preparing u_text... done")

        # call C extension; sr stays None until one of the calls succeeds
        sr = sf = intervals = None
        if self.rconf[RuntimeConfiguration.CEW_SUBPROCESS_ENABLED]:
            self.log(u"Using cewsubprocess to call aeneas.cew")
            try:
                self.log(u"Importing aeneas.cewsubprocess...")
                from aeneas.cewsubprocess import CEWSubprocess
                self.log(u"Importing aeneas.cewsubprocess... done")
                self.log(u"Calling aeneas.cewsubprocess...")
                cewsub = CEWSubprocess(rconf=self.rconf, logger=self.logger)
                sr, sf, intervals = cewsub.synthesize_multiple(
                    output_file_path, c_quit_after, c_backwards, u_text
                )
                self.log(u"Calling aeneas.cewsubprocess... done")
            except Exception as exc:
                # NOTE not critical, try calling aeneas.cew directly
                self.log_exc(u"An unexpected error occurred while running cewsubprocess", exc, False, None)

        if sr is None:
            # Python 2 => pass byte strings, Python 3 => pass Unicode strings
            self.log(u"Preparing c_text...")
            convert = gf.safe_bytes if gf.PY2 else gf.safe_unicode
            c_text = [(convert(pair[0]), convert(pair[1])) for pair in u_text]
            self.log(u"Preparing c_text... done")

            self.log(u"Calling aeneas.cew directly")
            try:
                self.log(u"Importing aeneas.cew...")
                import aeneas.cew.cew
                self.log(u"Importing aeneas.cew... done")
                self.log(u"Calling aeneas.cew...")
                sr, sf, intervals = aeneas.cew.cew.synthesize_multiple(
                    output_file_path, c_quit_after, c_backwards, c_text
                )
                self.log(u"Calling aeneas.cew... done")
            except Exception as exc:
                self.log_exc(u"An unexpected error occurred while running cew", exc, False, None)
                return (False, None)

        self.log([u"sr: %d", sr])
        self.log([u"sf: %d", sf])

        # create output: one anchor for each of the first sf fragments
        # NOTE anchors do not make sense if backwards == True
        ordered = fragments[::-1] if backwards else fragments
        anchors = []
        num_chars = 0
        current_time = TimeValue("0.000")
        for index in range(sf):
            fragment = ordered[index]
            begin = TimeValue(intervals[index][0])
            anchors.append([begin, fragment.identifier, fragment.filtered_text])
            num_chars += fragment.characters
            # the end of the latest interval is the total time so far
            current_time = TimeValue(intervals[index][1])

        # return output
        self.log([u"Returning %d time anchors", len(anchors)])
        self.log([u"Current time %.3f", current_time])
        self.log([u"Synthesized %d characters", num_chars])
        self.log(u"Synthesizing using C extension... done")
        return (True, (anchors, current_time, num_chars))
Пример #45
0
    def perform_command(self):
        """
        Run the audio interval detection and print its result.

        The first four positional arguments are read, in order, as:
        the text format, the text (an inline string when the format
        is ``list``, otherwise the path of an input file), the
        language, and the path of the audio file.

        The value of ``print_help()`` is returned when fewer than
        four arguments are given or the text format is not allowed.
        The error exit code is returned when an input file check
        fails, a required regex option is missing, no text fragments
        are available, or the audio file cannot be read.

        :rtype: int
        """
        args = self.actual_arguments
        if len(args) < 4:
            return self.print_help()

        # resolve the text source: inline string or path of a checked file
        text_format = gf.safe_unicode(args[0])
        if text_format == u"list":
            text = gf.safe_unicode(args[1])
        elif text_format not in TextFileFormat.ALLOWED_VALUES:
            return self.print_help()
        else:
            text = args[1]
            if not self.check_input_file(text):
                return self.ERROR_EXIT_CODE

        # collect the optional text parsing parameters
        option_value = self.has_option_with_value
        l1_regex = option_value(u"--l1-id-regex")
        l2_regex = option_value(u"--l2-id-regex")
        l3_regex = option_value(u"--l3-id-regex")
        id_regex = option_value(u"--id-regex")
        class_regex = option_value(u"--class-regex")
        sort = option_value(u"--sort")
        parameters = {
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX: l1_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX: l2_regex,
            gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX: l3_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX: class_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX: id_regex,
            gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT: sort,
        }

        # munparsed needs all three level regexes
        if text_format == TextFileFormat.MUNPARSED:
            if any(regex is None for regex in (l1_regex, l2_regex, l3_regex)):
                self.print_error(
                    u"You must specify --l1-id-regex and --l2-id-regex and --l3-id-regex for munparsed format"
                )
                return self.ERROR_EXIT_CODE
        # unparsed needs at least one of the id/class regexes
        if text_format == TextFileFormat.UNPARSED:
            if (id_regex is None) and (class_regex is None):
                self.print_error(
                    u"You must specify --id-regex and/or --class-regex for unparsed format"
                )
                return self.ERROR_EXIT_CODE

        language = gf.safe_unicode(args[2])

        audio_file_path = args[3]
        if not self.check_input_file(audio_file_path):
            return self.ERROR_EXIT_CODE

        # build the text file and make sure it has fragments
        text_file = self.get_text_file(text_format, text, parameters)
        if text_file is None:
            self.print_error(
                u"Unable to build a TextFile from the given parameters")
            return self.ERROR_EXIT_CODE
        if len(text_file) == 0:
            self.print_error(u"No text fragments found")
            return self.ERROR_EXIT_CODE
        text_file.set_language(language)
        self.print_info(
            u"Read input text with %d fragments" % (len(text_file)))

        # load the audio, mapping each failure to its own messages
        self.print_info(u"Reading audio...")
        try:
            audio_file_mfcc = AudioFileMFCC(
                audio_file_path,
                rconf=self.rconf,
                logger=self.logger
            )
        except AudioFileConverterError:
            ffmpeg_path = self.rconf[RuntimeConfiguration.FFMPEG_PATH]
            self.print_error(
                u"Unable to call the ffmpeg executable '%s'" % (ffmpeg_path))
            self.print_error(u"Make sure the path to ffmpeg is correct")
            return self.ERROR_EXIT_CODE
        except (AudioFileUnsupportedFormatError, AudioFileNotInitializedError):
            self.print_error(u"Cannot read file '%s'" % (audio_file_path))
            self.print_error(u"Check that its format is supported by ffmpeg")
            return self.ERROR_EXIT_CODE
        except Exception as exc:
            self.print_error(
                u"An unexpected error occurred while reading the audio file:")
            self.print_error(u"%s" % exc)
            return self.ERROR_EXIT_CODE
        else:
            self.print_info(u"Reading audio... done")

        self.print_info(u"Running VAD...")
        audio_file_mfcc.run_vad()
        self.print_info(u"Running VAD... done")

        # head/tail bounds, in the positional order detect_interval receives them
        bound_options = (
            u"--min-head",
            u"--max-head",
            u"--min-tail",
            u"--max-tail",
        )
        bounds = [
            gf.safe_float(option_value(name), None) for name in bound_options
        ]

        self.print_info(u"Detecting audio interval...")
        detector = SD(
            audio_file_mfcc,
            text_file,
            rconf=self.rconf,
            logger=self.logger
        )
        begin, finish = detector.detect_interval(*bounds)
        self.print_info(u"Detecting audio interval... done")

        self.print_result(audio_file_mfcc.audio_length, begin, finish)
        return self.NO_ERROR_EXIT_CODE