Example #1
def steps(step):
    """ Go through steps """
    if step == 1:
        session.clear()
        session["temp_dir"] = mkdtemp()
        temp_dir = session["temp_dir"]
        langs, lang_names = getLangs()
        return render_template(
            "upload.html",
            uploaded=uploaded_files(temp_dir),
            maps=[{"code": m, "name": lang_names[m]} for m in langs],
        )
    elif step == 2:
        return render_template("preview.html")
    elif step == 3:
        if "audio" not in session or "text" not in session:
            log = "Sorry, it looks like something is wrong with your audio or text. Please try again"
        else:
            flags = ["--force-overwrite"]
            for option in ["--closed-captioning", "--save-temps", "--text-grid"]:
                if session["config"].get(option, False):
                    flags.append(option)
            if session["text"].endswith("txt"):
                flags.append("--text-input")
                flags.append("--language")
                flags.append(session["config"]["lang"])
            timestamp = str(int(datetime.now().timestamp()))
            output_base = "aligned" + timestamp
            args = (
                ["readalongs", "align"]
                + flags
                + [
                    session["text"],
                    session["audio"],
                    os.path.join(session["temp_dir"], output_base),
                ]
            )
            LOGGER.warning(args)
            _, audio_ext = os.path.splitext(session["audio"])
            data = {"audio_ext": audio_ext, "base": output_base}
            if session["config"].get("show-log", False):
                log = run(args, capture_output=True, check=False)
                data["log"] = log
            else:
                run(args, check=False)
            data["audio_path"] = os.path.join(
                session["temp_dir"], output_base, output_base + audio_ext
            )
            data["audio_fn"] = f"/file/{output_base}" + audio_ext
            data["text_path"] = os.path.join(
                session["temp_dir"], output_base, output_base + ".xml"
            )
            data["text_fn"] = f"/file/{output_base}" + ".xml"
            data["smil_path"] = os.path.join(
                session["temp_dir"], output_base, output_base + ".smil"
            )
            data["smil_fn"] = f"/file/{output_base}" + ".smil"
        return render_template("export.html", data=data)
    else:
        abort(404)
Example #2
def mute_section(audio: AudioSegment, start: int, end: int) -> AudioSegment:
    """ Given an AudioSegment, reduce the gain between a given interval by 120db.
        Effectively, make it silent.

    Parameters
    ----------
    audio : AudioSegment
        audio segment to mute
    start : int
        start timestamp of audio (ms)
    end : int
        end timestamp of audio (ms)

    Returns
    -------
    AudioSegment
        A muted audio segment
    """
    try:
        return audio[:start] + audio[start:end].apply_gain(-120) + audio[end:]
    except IndexError:
        LOGGER.error(
            f"Tried to mute audio between {start} and {end}, but audio is only {len(audio)}ms long. \
                     Returning unmuted audio instead.")
        return audio
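A minimal usage sketch (assuming pydub is available and "speech.mp3" is a placeholder local file):

from pydub import AudioSegment

# Load a clip and silence the interval from 1.5 s to 2.5 s (times are in ms)
clip = AudioSegment.from_file("speech.mp3")
muted = mute_section(clip, 1500, 2500)
muted.export("speech_muted.mp3", format="mp3")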
Example #3
def make_fsg(word_elements, filename):
    name = slugify(os.path.splitext(os.path.basename(filename))[0])
    data = {
        "name": name,  # If name includes special characters, pocketsphinx throws a RuntimeError: new_Decoder returned -1
        "states": [],
        "num_states": 0,
    }

    for e in word_elements:
        if "id" not in e.attrib:  # don't put in elements with no id
            continue
        if not e.text or not e.text.strip():
            LOGGER.warning("No text in node %s", e.attrib["id"])
            continue
        text = e.text.strip()
        data["states"].append(
            {
                "id": e.attrib["id"] if text else "",
                "current": data["num_states"],
                "next": data["num_states"] + 1,
            }
        )
        data["num_states"] += 1

    data["final_state"] = data["num_states"]
    data["num_states"] += 1

    return chevron.render(FSG_TEMPLATE, data)
Example #4
 def test_align_removed(self):
     """Try aligning section with removed audio"""
     # Process Audio
     removed_segment = remove_section(self.noisy_segment, 1500, 2500)
     audio_output_path = os.path.join(self.tempdir, "removed_sample.mp3")
     with open(audio_output_path, "wb") as f:
         removed_segment.export(f)
     # Align
     input_text_path = os.path.join(self.data_dir, "audio_sample.txt")
     input_audio_path = audio_output_path
     flags = ["-l", "eng"]
     output_path = os.path.join(self.tempdir, "output_removed")
     process = self.align(input_text_path, input_audio_path, output_path,
                          flags)
     if process.returncode != 0:
         LOGGER.error("Subprocess readalongs align failed: %s",
                      process.stderr)
     # Check Result
     smilpath = Path(output_path)
     smil_files = smilpath.glob("*.smil")
     self.assertTrue(
         next(smil_files, False),
         "No *.smil files found; "
         "pip install --force-reinstall --upgrade might be required "
         "if dependencies changed.",
     )
Example #5
def extract_section(audio: AudioSegment, start: Union[None, int],
                    end: Union[None, int]) -> AudioSegment:
    """ Given an AudioSegment, extract and keep only the [start, end) interval

    Args:
        audio (AudioSegment): audio segment to extract a section from
        start (Union[None,int]): start timestamp of audio to extract (ms)
            (None means beginning of audio)
        end (Union[None,int]): end timestamp of audio to extract (ms)
            (None means end of audio)

    Returns:
        AudioSegment: the extracted audio segment
    """
    # Optimization: don't copy the data if we're extracting from None to None
    if start is None and end is None:
        return audio

    try:
        return audio[start:end]
    except IndexError:
        LOGGER.error(
            f"Tried to extract audio between {start} and {end}, but audio is only "
            f"{len(audio)}ms long. Returning whole audio instead.")
        return audio
Example #6
def return_temp_file(fname):
    fn, ext = os.path.splitext(fname)
    LOGGER.warn(session["temp_dir"])
    path = os.path.join(session["temp_dir"], fn, fname)
    if os.path.exists(path):
        return send_file(path)
    else:
        abort(404, "Sorry, we couldn't find that file.")
Example #7
 def test_generate_output_name(self):
     input_file = os.path.join(self.tempdir, "someinput.txt")
     copyfile(os.path.join(self.data_dir, "fra.txt"), input_file)
     results = self.runner.invoke(prepare, ["-l", "fra", input_file])
     LOGGER.warning("Output: {}".format(results.output))
     LOGGER.warning("Exception: {}".format(results.exception))
     self.assertEqual(results.exit_code, 0)
     self.assertRegex(results.stdout, "Wrote.*someinput[.]xml")
     self.assertTrue(os.path.exists(os.path.join(self.tempdir, "someinput.xml")))
Example #8
def load_tsv(input_path, labels):
    results = []
    with open(input_path, "r", encoding="utf-8") as fin:
        for i, line in enumerate(fin, start=1):
            pieces = line.strip("\n").strip(" ").split("\t")
            if len(pieces) > len(labels):
                LOGGER.error("More columns than labels on line %s" % i)
                continue
            results.append(OrderedDict(zip(labels, pieces)))
    return results
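A hypothetical usage sketch (the file name and labels are invented for illustration):

# lexicon.tsv would contain tab-separated lines such as:  hello<TAB>HH AH L OW
entries = load_tsv("lexicon.tsv", ["word", "pronunciation"])
for entry in entries:
    # rows shorter than the label list simply omit the trailing keys
    print(entry["word"], "->", entry.get("pronunciation", ""))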
Example #9
def remove_section(audio: AudioSegment, start: int, end: int) -> AudioSegment:
    """ Given an AudioSement, remove the section between start (ms) and end (ms)
    """
    try:
        return audio[:start] + audio[end:]
    except IndexError:
        LOGGER.error(
            f"Tried to remove audio between {start} and {end}, but audio is only "
            f"{len(audio)}ms long. Returning unchanged audio instead.")
        return audio
Example #10
def tokenize_xml(xml):
    tokenizer = XMLTokenizer()
    xml = deepcopy(xml)
    # FIXME: different langs have different normalizations, is this necessary?
    unicode_normalize_xml(xml)
    words = xml.xpath(".//w")
    if words:
        LOGGER.info("Words (<w>) already present; skipping tokenization")
        return xml
    LOGGER.info("Words (<w>) not present; tokenizing")
    return tokenizer.add_word_children(xml)
Example #11
def tokenize_xml(xml):
    """Returns a deep copy of xml with all words wrapped in a "w" XML element"""
    xml = deepcopy(xml)
    # FIXME: different langs have different normalizations, is this necessary?
    unicode_normalize_xml(xml)
    words = xml.xpath(".//w")
    if words:
        LOGGER.info("Words (<w>) already present; skipping tokenization")
        return xml
    LOGGER.info("Words (<w>) not present; tokenizing")
    return tokenize_xml_in_place(xml)
Example #12
def join_section(audio: AudioSegment, audio_to_insert: AudioSegment,
                 start: int) -> AudioSegment:
    """ Given two AudioSegments, insert the second into the first at start (ms)
    """
    try:
        return audio[:start] + audio_to_insert + audio[start:]
    except IndexError:
        LOGGER.error(
            f"Tried to insert audio at {start}, but audio is only {len(audio)}ms long. "
            "Returning unchanged audio instead.")
        return audio
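A sketch of how the section helpers above might be combined (assumes pydub and a placeholder "sample.wav"):

from pydub import AudioSegment

audio = AudioSegment.from_file("sample.wav")
intro = extract_section(audio, None, 3000)   # keep only the first 3 seconds
trimmed = remove_section(audio, 1000, 2000)  # drop the 1 s - 2 s interval
rebuilt = join_section(trimmed, intro, 0)    # re-insert the intro at the start
rebuilt.export("rebuilt.wav", format="wav")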
Example #13
    def convert_word(word: str, lang: str):
        """Convert one individual word through the specified cascade of g2p mappings.

        Args:
            word (str): input word to map through g2p
            lang (str): the language code to use to attempt the g2p mapping

        Returns:
            g2p_text (str), valid(bool):
              - g2p_text is the word mapping from lang to output_orthography
              - valid is a flag indicating whether g2p conversion yielded valid
                output, which includes making sure IPA output was valid IPA and
                ARPABET output was valid ARPABET, at all intermediate steps as
                well as in the final output.
        """

        if lang == "eng":
            # Hack to use old English LexiconG2P
            # Note: adding eng_ prefix to vars that are used in both blocks to make mypy
            # happy. Since the two sides of the if are in the same scope, it complains about
            # type checking otherwise.
            assert output_orthography == "eng-arpabet"
            eng_converter = getLexiconG2P(
                os.path.join(os.path.dirname(LEXICON_PATH), "cmu_sphinx.metadata.json")
            )
            try:
                eng_text, _ = eng_converter.convert(word)
                eng_valid = is_arpabet(eng_text)
            except KeyError as e:
                if verbose_warnings:
                    LOGGER.warning(f'Could not g2p "{word}" as English: {e.args[0]}')
                eng_text = word
                eng_valid = False
            return eng_text, eng_valid
        else:
            try:
                converter = make_g2p(lang, output_orthography)
            except InvalidLanguageCode as e:
                raise ValueError(
                    f'Could not g2p "{word}" as "{lang}": invalid language code. '
                    f"Use one of {getLangs()[0]}"
                ) from e
            except NoPath as e:
                raise ValueError(
                    f'Could not g2p "{word}" as "{lang}": no path to "{output_orthography}". '
                    f"Use one of {getLangs()[0]}"
                ) from e
            tg = converter(word)
            text = tg.output_string.strip()
            valid = converter.check(tg, shallow=True)
            if not valid and verbose_warnings:
                converter.check(tg, shallow=False, display_warnings=verbose_warnings)
            return text, valid
Example #14
 def align(self, input_text_path, input_audio_path, output_path, flags):
     args = [
         "readalongs",
         "align",
         input_text_path,
         input_audio_path,
         output_path,
     ] + flags
     LOGGER.info(
         f"Aligning {input_text_path} and {input_audio_path}, outputting to {output_path}"
     )
     return run(args, capture_output=True)
Example #15
def get_sequences(xml,
                  xml_filename,
                  unit="w",
                  anchor="anchor") -> List[WordSequence]:
    """Return the list of anchor-separated word sequences in xml

    Args:
        xml (etree): xml structure in which to search for words and anchors
        xml_filename (str): filename, used for error messages only
        unit (str): element tag of the word units
        anchor (str): element tag of the anchors

    Returns:
        List[WordSequence]: all sequences found in xml
    """

    sequences: List[WordSequence] = []
    start = None
    words = []
    all_good = True
    for e in xml.xpath(f".//{unit} | .//{anchor}"):
        if e.tag == unit:
            words.append(e)
        else:
            assert e.tag == anchor
            try:
                end = parse_time(e.attrib["time"])
            except KeyError:
                LOGGER.error(
                    f'Invalid {anchor} element in {xml_filename}: missing "time" attribute'
                )
                all_good = False
                continue
            except ValueError as err:
                LOGGER.error(
                    f'Invalid {anchor} element in {xml_filename}: invalid "time" '
                    f'attribute "{e.attrib["time"]}": {err}')
                all_good = False
                continue
            if words:
                sequences.append(WordSequence(start, end, words))
            words = []
            start = end
    if words:
        sequences.append(WordSequence(start, None, words))

    if not all_good:
        raise RuntimeError(
            f"Could not parse all anchors in {xml_filename}, please make sure each anchor "
            'element is properly formatted, e.g., <anchor time="34.5s"/>.  Aborting.'
        )

    return sequences
Example #16
 def align(self, input_text_path, input_audio_path, output_path, flags):
     """Wrapper for invoking readalongs align via subprocess.run"""
     args = [
         "readalongs",
         "align",
         input_text_path,
         input_audio_path,
         output_path,
     ] + flags
     LOGGER.info(
         f"Aligning {input_text_path} and {input_audio_path}, outputting to {output_path}"
     )
     return run(args, capture_output=True, check=False, encoding="utf-8")
Example #17
def encode_from_path(path: str) -> str:
    """Encode file from bytes to b64 string with data and mime signature

    Args:
        path (str): path to file

    Returns:
        str: base64 string with data and mime signature
    """
    import requests  # Defer expensive import

    with open(path, "rb") as f:
        path_bytes = f.read()
    if path.endswith("xml"):
        root = etree.fromstring(path_bytes)
        for img in root.xpath("//graphic"):
            url = img.get("url")
            res = requests.get(url) if url.startswith("http") else None
            mime = guess_type(url)
            if os.path.exists(url):
                with open(url, "rb") as f:
                    img_bytes = f.read()
                img_b64 = str(b64encode(img_bytes), encoding="utf8")
            elif res and res.status_code == 200:
                img_b64 = str(b64encode(res.content), encoding="utf8")
            else:
                LOGGER.warning(
                    f"The image declared at {url} could not be found. Please check that it exists."
                )
                continue
            img.attrib["url"] = f"data:{mime[0]};base64,{img_b64}"
        path_bytes = etree.tostring(root)
    b64 = str(b64encode(path_bytes), encoding="utf8")
    mime = guess_type(path)
    if path.endswith(
            ".m4a"
    ):  # hack to get around guess_type choosing the wrong mime type for .m4a files
        # TODO: Check other popular audio formats, .wav, .mp3, .ogg, etc...
        mime_type = "audio/mp4"
    elif mime[0]:
        mime_type = mime[0].replace(
            "video", "audio"
        )  # Hack: until we properly extract audio from video files, force any video-based mime type to be read as audio
    else:
        mime_type = "application"
        LOGGER.warning(
            f"We could not guess the mime type of file at {path}, we will try the generic mime type 'application', but this might not work with some files"
        )
    return f"data:{mime_type};base64,{b64}"
Example #18
def make_dict(word_elements, input_filename, unit="m"):
    data = {"items": []}
    nwords = 0
    for e in word_elements:
        if "id" not in e.attrib:
            LOGGER.error("%s-type element without id in file %s" %
                         (unit, input_filename))
        text = e.attrib.get("ARPABET", "").strip()
        if not text:
            continue
        nwords += 1
        data["items"].append({"id": e.attrib["id"], "pronunciation": text})
    if nwords == 0:
        raise RuntimeError("No words in dictionary!")
    return chevron.render(DICT_TEMPLATE, data)
Example #19
def process_src_attrib(src_text, id_prefix, mimetypes):
    filename = src_text.split("#")[0]
    filename_without_ext, ext = os.path.splitext(filename)
    ext = ext.strip(".")
    if ext not in mimetypes:
        LOGGER.warning("Unknown extension in SMIL: %s", ext)
        return None
    entry = {
        "origin_path": filename,
        "dest_path": filename,
        "ext": ext.lower(),
        "id": id_prefix + os.path.basename(filename_without_ext),
        "mimetype": mimetypes[ext],
    }
    return entry
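A hypothetical call, with an illustrative extension-to-mimetype mapping (not necessarily the one used elsewhere in the package):

mimetypes = {"mp3": "audio/mpeg", "xml": "application/xml", "smil": "application/smil+xml"}
entry = process_src_attrib("audio/story.mp3#seg1", "media-", mimetypes)
# entry["id"] is "media-story" and entry["mimetype"] is "audio/mpeg"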
Example #20
 def test_align_sample(self):
     """ Sanity check that test audio should align
     """
     # Align
     input_text_path = os.path.join(self.data_path, "audio_sample.txt")
     input_audio_path = os.path.join(self.data_path, "audio_sample.ogg")
     flags = ["-i", "-l", "eng"]
     output_path = os.path.join(self.tempdir, "output")
     log = self.align(input_text_path, input_audio_path, output_path, flags)
     LOGGER.info(str(log))
     # Check Result
     smilpath = Path(output_path)
     smil_files = smilpath.glob("*.smil")
     self.assertGreaterEqual(len([x for x in smil_files]), 1)
     self.assertFalse("error" in str(log).lower())
Example #21
class BasicTestCase(TestCase):
    """A Basic Unittest build block class that comes bundled with
    a temporary directory (tempdir), and access to an app runner
    (self.runner)
    """

    LOGGER.setLevel("DEBUG")
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    # Set this to True to keep the temp dirs after running, for manual inspection
    # but please don't push a commit setting this to True!
    keep_temp_dir_after_running = False

    def setUp(self):
        """Create a temporary directory, self.tempdir, and a test runner, self.runner"""
        app.logger.setLevel("DEBUG")
        self.runner = app.test_cli_runner()
        tempdir_prefix = f"tmpdir_{type(self).__name__}_"
        if not self.keep_temp_dir_after_running:
            self.tempdirobj = tempfile.TemporaryDirectory(
                prefix=tempdir_prefix, dir=".")
            self.tempdir = self.tempdirobj.name
        else:
            # Alternative tempdir code keeps it after running, for manual inspection:
            self.tempdir = tempfile.mkdtemp(prefix=tempdir_prefix, dir=".")
            print("tmpdir={}".format(self.tempdir))

    def tearDown(self):
        """Clean up the temporary directory"""
        if not self.keep_temp_dir_after_running:
            self.tempdirobj.cleanup()
Example #22
class TestForceAlignment(unittest.TestCase):
    LOGGER.setLevel("DEBUG")
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    def testAlign(self):
        xml_path = os.path.join(self.data_dir, "ej-fra.xml")
        wav_path = os.path.join(self.data_dir, "ej-fra.m4a")
        results = align_audio(xml_path, wav_path, unit="w")

        # Verify that the same IDs are in the output
        converted_path = os.path.join(self.data_dir, "ej-fra-converted.xml")
        xml = etree.parse(converted_path).getroot()
        words = results["words"]
        xml_words = xml.xpath(".//w")
        self.assertEqual(len(words), len(xml_words))
        for w, xw in zip(words, xml_words):
            self.assertEqual(xw.attrib["id"], w["id"])

    def testAlignText(self):
        txt_path = os.path.join(self.data_dir, "ej-fra.txt")
        wav_path = os.path.join(self.data_dir, "ej-fra.m4a")
        # tempfh, temp_fn = create_input_xml(txt_path, text_language='git', save_temps="unit")
        tempfh, temp_fn = create_input_tei(input_file_name=txt_path,
                                           text_language="fra",
                                           save_temps=None)
        results = align_audio(temp_fn, wav_path, unit="w", save_temps=None)

        # Verify that the same IDs are in the output
        converted_path = os.path.join(self.data_dir, "ej-fra-converted.xml")
        xml = etree.parse(converted_path).getroot()
        words = results["words"]
        xml_words = xml.xpath(".//w")
        self.assertEqual(len(words), len(xml_words))
        for w, xw in zip(words, xml_words):
            self.assertEqual(xw.attrib["id"], w["id"])
Example #23
def run_tests(suite):
    if suite == "e2e":
        suite = TestSuite(e2e_tests)
    elif suite == "dev":
        suite = TestSuite(indices_tests + other_tests + e2e_tests)
    elif suite == "prod" or suite == "all":
        suite = loader.discover(os.path.dirname(__file__))
    elif suite == "other":
        suite = TestSuite(other_tests)
    else:
        LOGGER.error(
            "Sorry, you need to select a Test Suite to run, like 'dev' or 'prod'"
        )
        return None  # don't fall through and try to run an invalid suite value

    runner = TextTestRunner(verbosity=3)
    return runner.run(suite)
Example #24
def run_tests(suite):
    """Run the specified test suite"""

    if suite == "e2e":
        suite = TestSuite(e2e_tests)
    elif suite == "dev":
        suite = TestSuite(indices_tests + other_tests + e2e_tests)
    elif suite in ("prod", "all"):
        suite = loader.discover(os.path.dirname(__file__))
    elif suite == "other":
        suite = TestSuite(other_tests)
    else:
        LOGGER.error("Sorry, you need to select a Test Suite to run, one of: "
                     "dev, all (or prod), e2e, other")
        sys.exit(1)

    runner = TextTestRunner(verbosity=3)
    return runner.run(suite)
Example #25
def create_web_component_html(
    text_path: str,
    alignment_path: str,
    audio_path: str,
    title="Title goes here",
    header="Header goes here",
    subheader="Subheader goes here",
    theme="light",
) -> str:
    import requests  # Defer expensive import

    js = requests.get(JS_BUNDLE_URL)
    fonts = requests.get(FONTS_BUNDLE_URL)
    if js.status_code != 200:
        LOGGER.warning(
            f"Sorry, the JavaScript bundle that is supposed to be at {JS_BUNDLE_URL} returned a {js.status_code}. Your ReadAlong will be bundled using a version that may not be up-to-date. Please check your internet connection."
        )
        with open(os.path.join(os.path.dirname(__file__), "bundle.js"),
                  encoding="utf8") as f:
            js_raw = f.read()
    else:
        js_raw = js.text
    if fonts.status_code != 200:
        LOGGER.warning(
            f"Sorry, the fonts bundle that is supposed to be at {FONTS_BUNDLE_URL} returned a {fonts.status_code}. Your ReadAlong will be bundled using a version that may not be up-to-date. Please check your internet connection."
        )
        with open(os.path.join(os.path.dirname(__file__), "bundle.css"),
                  encoding="utf8") as f:
            fonts_raw = f.read()
    else:
        fonts_raw = fonts.text

    return BASIC_HTML.format(
        text=encode_from_path(text_path),
        alignment=encode_from_path(alignment_path),
        audio=encode_from_path(audio_path),
        js=js_raw,
        fonts=fonts_raw,
        title=title,
        header=header,
        subheader=subheader,
        theme=theme,
    )
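A sketch of how this might be called after an alignment run (file names are placeholders):

html = create_web_component_html(
    text_path="aligned.xml",
    alignment_path="aligned.smil",
    audio_path="aligned.mp3",
    title="My ReadAlong",
)
with open("readalong.html", "w", encoding="utf8") as f:
    f.write(html)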
Example #26
 def test_align_removed(self):
     """ Try aligning section with removed audio
     """
     # Process Audio
     removed_segment = remove_section(self.noisy_segment, 1500, 2500)
     audio_output_path = os.path.join(self.tempdir, "removed_sample.mp3")
     removed_segment.export(audio_output_path)
     # Align
     input_text_path = os.path.join(self.data_path, "audio_sample.txt")
     input_audio_path = audio_output_path
     flags = ["-i", "-l", "eng"]
     output_path = os.path.join(self.tempdir, "output_removed")
     log = self.align(input_text_path, input_audio_path, output_path, flags)
     LOGGER.info(str(log))
     # Check Result
     smilpath = Path(output_path)
     smil_files = smilpath.glob("*.smil")
     self.assertGreaterEqual(len([x for x in smil_files]), 1)
     self.assertFalse("error" in str(log).lower())
Example #27
class TestTokenizeCli(TestCase):
    LOGGER.setLevel("DEBUG")
    data_dir = os.path.join(os.path.dirname(__file__), "data")

    def setUp(self):
        app.logger.setLevel("DEBUG")
        self.runner = app.test_cli_runner()
        self.tempdirobj = tempfile.TemporaryDirectory(
            prefix="test_tokenize_cli_tmpdir", dir=".")
        self.tempdir = self.tempdirobj.name
        # Alternative tempdir code keeps it after running, for manual inspection:
        # self.tempdir = tempfile.mkdtemp(prefix="test_tokenize_cli_tmpdir", dir=".")
        # print('tmpdir={}'.format(self.tempdir))

        self.xmlfile = os.path.join(self.tempdir, "fra.xml")
        _ = self.runner.invoke(prepare, [
            "-l", "fra",
            os.path.join(self.data_dir, "fra.txt"), self.xmlfile
        ])

    def tearDown(self):
        self.tempdirobj.cleanup()

    def test_invoke_tok(self):
        results = self.runner.invoke(
            tokenize,
            [self.xmlfile, os.path.join(self.tempdir, "delme")])
        self.assertEqual(results.exit_code, 0)
        self.assertTrue(os.path.exists(os.path.join(self.tempdir,
                                                    "delme.xml")))

    def test_generate_output_name(self):
        results = self.runner.invoke(tokenize, [self.xmlfile])
        self.assertEqual(results.exit_code, 0)
        self.assertTrue(
            os.path.exists(os.path.join(self.tempdir, "fra.tokenized.xml")))

    def test_with_stdin(self):
        with io.open(self.xmlfile) as f:
            inputtext = f.read()
        results = self.runner.invoke(tokenize, "-", input=inputtext)
        self.assertEqual(results.exit_code, 0)
        self.assertIn("<s><w>Ceci</w> <w>est</w> <w>une</w> <w>phrase</w>",
                      results.output)

    def test_file_already_exists(self):
        results = self.runner.invoke(tokenize, [self.xmlfile, self.xmlfile])
        self.assertNotEqual(results.exit_code, 0)
        self.assertIn("use -f to overwrite", results.output)

    def test_bad_input(self):
        results = self.runner.invoke(tokenize, "- -", input="this is not XML!")
        self.assertNotEqual(results.exit_code, 0)
        self.assertIn("Error parsing", results.output)
Example #28
 def test_align_sample(self):
     """Sanity check that test audio should align"""
     # Align
     input_text_path = os.path.join(self.data_dir, "audio_sample.txt")
     input_audio_path = os.path.join(self.data_dir, "audio_sample.ogg")
     flags = ["-l", "eng"]
     output_path = os.path.join(self.tempdir, "output")
     process = self.align(input_text_path, input_audio_path, output_path,
                          flags)
     if process.returncode != 0:
         LOGGER.error("Subprocess readalongs align failed: %s",
                      process.stderr)
     # Check Result
     smilpath = Path(output_path)
     smil_files = smilpath.glob("*.smil")
     self.assertTrue(
         next(smil_files, False),
         "No *.smil files found; "
         "pip install --force-reinstall --upgrade might be required "
         "if dependencies changed.",
     )
Example #29
    def __init__(self, metadata_path):
        self.metadata = load_json(metadata_path)
        self.in_lang = self.metadata["in_metadata"]["lang"]
        self.out_lang = self.metadata["out_metadata"]["lang"]

        dirname = os.path.dirname(metadata_path)
        if "src" not in self.metadata:
            LOGGER.error("File %s does not specify a source document",
                         metadata_path)
            return
        self.src_path = os.path.join(dirname, self.metadata["src"])

        self.entries = defaultdict(list)
        if "src_format" not in self.metadata:
            LOGGER.error(
                "File %s lacking a source format ('src_format') attribute",
                metadata_path,
            )
            return

        if self.metadata["src_format"] not in LEXICON_LOADERS:
            LOGGER.error(
                "File %s references an unknown lexicon format: %s",
                metadata_path,
                self.metadata["src_format"],
            )

        self.loader = LEXICON_LOADERS[self.metadata["src_format"]]
Exemplo n.º 30
0
def create_epub(input_path, output_path, unpacked=False):
    if os.path.isdir(output_path):
        shutil.rmtree(output_path)
    ensure_dirs(output_path)
    input_dirname = os.path.dirname(input_path)
    if unpacked:
        os.mkdir(output_path)
        copy = copy_file_to_dir
        save = save_txt_to_dir
    else:
        copy = copy_file_to_zip
        save = save_txt_zip

    # mimetype file
    copy(output_path, MIMETYPE_ORIGIN_PATH, MIMETYPE_DEST_PATH)

    # container.xml file
    container_template = load_txt(CONTAINER_ORIGIN_PATH)
    container_txt = pystache.render(container_template,
                                    {"package_path": PACKAGE_DEST_PATH})
    save(output_path, CONTAINER_DEST_PATH, container_txt)

    # the SMIL and all the files referenced in the SMIL
    package_data = extract_files_from_SMIL(input_path)
    package_template = load_txt(PACKAGE_ORIGIN_PATH)
    package_txt = pystache.render(package_template, package_data)
    save(output_path, PACKAGE_DEST_PATH, package_txt)

    for entry in package_data["media"]:
        origin_path = os.path.join(input_dirname, entry["origin_path"])
        if not os.path.exists(origin_path):
            LOGGER.warning("Cannot find file %s to copy into EPUB file",
                           origin_path)
            continue
        dest_path = os.path.join(EPUB_PATH, entry["dest_path"])
        copy(output_path, origin_path, dest_path)

    # CSS file
    copy(output_path, STYLESHEET_ORIGIN_PATH, STYLESHEET_DEST_PATH)