Пример #1
0
def test_write_to_sqlite(input_file):
    """
    Test production of sqlite output

    GIVEN an input file
    WHEN writing to sqlite
    THEN check output exists and contains content

    The database connection is always closed and the generated file is
    always removed, even when an assertion fails, so a failing run does not
    leave an open handle or a stray ``.db`` file behind.
    """

    logging.info("test_write_to_sqlite")

    # GIVEN an input file
    # WHEN writing to sqlite
    output_filename = Path(f"{uuid4().hex}.db")

    try:
        tscribe.write(input_file, save_as=output_filename, format="sqlite")

        # THEN check output exists and contains content
        assert output_filename.is_file(), "Output file should exist"

        conn = sqlite3.connect(str(output_filename))
        try:
            c = conn.cursor()
            c.execute("SELECT * FROM transcript")
            query = c.fetchall()
        finally:
            # Release the handle before the file is removed in teardown
            conn.close()

        data = tscribe.load_json_as_dict(input_file)
        df = tscribe.decode_transcript_to_dataframe(data)

        assert len(query) == len(df), "Database table should be length of dataframe"

    finally:
        # Teardown: guard on existence so a failed write is not masked by
        # a FileNotFoundError raised from the cleanup itself
        if output_filename.is_file():
            os.remove(output_filename)
Пример #2
0
def test_calculate_confidence_statistics(input_file):
    """
    Check the shape of the confidence statistics data model

    GIVEN a data dict loaded from the input file
    WHEN calling calculate_confidence_statistics(...)
    THEN the result is a dict holding every expected key
    """

    logging.info("test_calculate_confidence_statistics")

    # GIVEN a data dict
    transcript = tscribe.load_json_as_dict(input_file)

    # WHEN calling calculate_confidence_statistics(...)
    model = tscribe.calculate_confidence_statistics(transcript)

    # THEN return the data model with the right components
    assert isinstance(model, dict), "Stats should be of dict type"

    # Checked in this order so the first missing key is the one reported
    expected_keys = (
        "timestamps",
        "9.8",
        "9",
        "8",
        "7",
        "6",
        "5",
        "4",
        "3",
        "2",
        "1",
        "0",
    )
    for key in expected_keys:
        assert key in model, f"Data model should include {key}"
Пример #3
0
def test_write_to_csv(input_file):
    """
    Test production of csv output

    GIVEN an input file
    WHEN writing to csv
    THEN check output exists and contains content

    The generated file is removed in a ``finally`` block so that a failing
    assertion does not leave a stray ``.csv`` file in the working directory.
    """

    logging.info("test_write_to_csv")

    # GIVEN an input file
    # WHEN writing to csv
    output_filename = Path(f"{uuid4().hex}.csv")

    try:
        tscribe.write(input_file, save_as=output_filename, format="csv")

        # THEN check output exists and contains content
        assert output_filename.is_file(), "Output file should exist"

        with open(output_filename, "r") as file:
            lines = file.readlines()

        data = tscribe.load_json_as_dict(input_file)
        df = tscribe.decode_transcript_to_dataframe(data)

        assert len(lines) == len(df) + 1, "CSV should be length of dataframe + headers"

    finally:
        # Teardown: guard on existence so a failed write is not masked by
        # a FileNotFoundError raised from the cleanup itself
        if output_filename.is_file():
            os.remove(output_filename)
Пример #4
0
def test_sample_files(sample):
    """Check each sample exists, is a json file and has a placeholder account id"""
    logging.info("test_sample_files")

    sample_path = Path(sample)
    assert sample_path.is_file(), "Sample file should exist"
    assert sample_path.suffix == ".json", "Sample files should be json files"

    # The account id is expected to be the all-X placeholder value
    contents = tscribe.load_json_as_dict(sample)
    assert contents["accountId"] == "XXXXXXXXXXXX"
Пример #5
0
def test_decode_transcript_to_dataframe(input_file):
    """
    Test transcript decoding function

    GIVEN a data dict
    WHEN calling decode_transcript_to_dataframe(...)
    THEN a four-column pandas DataFrame is returned and, for
         sample_multiple.json, it has one row per speaker segment
    """

    logging.info("test_decode_transcript_to_dataframe")

    # GIVEN a data dict
    data = tscribe.load_json_as_dict(input_file)

    # WHEN calling decode_transcript_to_dataframe(...)
    frame = tscribe.decode_transcript_to_dataframe(data)

    # THEN
    assert isinstance(
        frame, pandas.DataFrame
    ), "decode_transcript_to_dataframe should return a Pandas Data Frame"

    row_count, column_count = frame.shape
    assert column_count == 4, "Dataframe should have four columns"

    # TODO: add row-count expectations for sample_single.json

    if input_file == "sample_multiple.json":
        segments = data["results"]["speaker_labels"]["segments"]
        assert row_count == len(segments), "Rows should match number of segments"
Пример #6
0
def test_make_graph_png(input_file):
    """
    Test that a chart image is produced from confidence stats

    GIVEN confidence stats from an input file
    WHEN calling make_graph_png(...)
    THEN produce chart.png
    """

    logging.info("test_make_graph_png")
    chart_path = Path("chart.png")

    # Remove any stale chart so a leftover file cannot satisfy the assertion
    if chart_path.is_file():
        os.remove(chart_path)

    # GIVEN confidence stats from an input file
    stats = tscribe.calculate_confidence_statistics(
        tscribe.load_json_as_dict(input_file)
    )

    # WHEN calling make_graph_png(...)
    tscribe.make_graph_png(stats, "./")

    # THEN produce chart.png
    assert chart_path.is_file(), "chart.png should be created"

    # Teardown
    os.remove(chart_path)
Пример #7
0
def test_write_to_docx(input_file):
    """
    Test production of docx output

    GIVEN an input file
    WHEN writing to docx
    THEN check output exists and contains content

    The generated file is removed in a ``finally`` block so that a failing
    assertion does not leave a stray ``.docx`` file in the working directory.
    """

    logging.info("test_write_to_docx")

    # GIVEN an input file
    # WHEN writing to docx
    output_filename = Path(f"{uuid4().hex}.docx")

    try:
        tscribe.write(input_file, save_as=output_filename, format="docx")

        # THEN check output exists and contains content
        assert output_filename.is_file(), "Output file should exist"

        document = Document(output_filename)

        assert (
            len(document.tables) == 2
        ), "Document should contain two tables, stats and transcript"

        stats_table = document.tables[0]
        transcript_table = document.tables[1]

        # First table: confidence statistics
        stats_headers = tuple(stats_table.cell(0, col).text for col in range(3))
        assert stats_headers == (
            "Confidence",
            "Count",
            "Percentage",
        ), "First table should be stats headers"
        assert len(stats_table.rows) == 12, "Stats table should hold 12 rows"

        # Second table: the transcript itself, one row per dataframe row
        transcript_headers = tuple(
            transcript_table.cell(0, col).text for col in range(3)
        )
        assert transcript_headers == (
            "Time",
            "Speaker",
            "Content",
        ), "Second table should be transcript headers"
        data = tscribe.load_json_as_dict(input_file)
        df = tscribe.decode_transcript_to_dataframe(data)
        assert (
            len(transcript_table.rows) == len(df) + 1
        ), "Second table should be length of dataframe + headers"

        # The chart is looked up via the paragraph's underlying XML element
        assert (
            "chart.png" in document.paragraphs[6]._p.xml
        ), "Chart should be in paragraph six"

    finally:
        # Teardown: guard on existence so a failed write is not masked by
        # a FileNotFoundError raised from the cleanup itself
        if output_filename.is_file():
            os.remove(output_filename)
Пример #8
0
def test_load_json_as_dict(input_file):
    """
    Test json to dict function

    GIVEN a sample json file
    WHEN calling tscribe.load_json_as_dict(...)
    THEN return a dict
    """

    logging.info("test_load_json_as_dict")

    # GIVEN a sample json file
    # provided through the input_file argument

    # WHEN calling tscribe.load_json_as_dict(...)
    data = tscribe.load_json_as_dict(input_file)

    # THEN return a dict
    assert isinstance(data, dict), "Data should be of dict type"
Пример #9
0
def test_write_to_vtt(input_file):
    """
    Test production of vtt format

    GIVEN an input file
    WHEN writing to vtt
    THEN check output exists and contains content

    The generated file is removed in a ``finally`` block so that a failing
    assertion does not leave a stray ``.vtt`` file in the working directory.
    """

    logging.info("test_write_to_vtt")

    # GIVEN an input file
    # WHEN writing to vtt
    output_filename = Path(f"{uuid4().hex}.vtt")

    try:
        tscribe.write(input_file, save_as=output_filename, format="vtt")

        # THEN check output exists and contains content
        # Same existence check as the other writer tests, so a missing file
        # is reported clearly instead of as a read error
        assert output_filename.is_file(), "Output file should exist"

        vtt = webvtt.read(output_filename)

        data = tscribe.load_json_as_dict(input_file)
        df = tscribe.decode_transcript_to_dataframe(data)
        assert len(vtt.captions) == len(
            df
        ), "vtt file should have equal captions to df rows"

        for caption in vtt.captions:

            assert hasattr(caption, "start"), "each caption should have a start_time"
            assert hasattr(caption, "end"), "each caption should have a end_time"
            assert hasattr(caption, "text"), "each caption should have text"
            # Lower bound on line count implied by an 80-character line limit
            assert (
                len(caption.lines) >= len(caption.text) / 80
            ), "text should be split into max 80 long lines"
            if input_file != "sample_single.json":
                assert hasattr(
                    caption, "identifier"
                ), "each caption should have an identifier"

    finally:
        # Teardown: guard on existence so a failed write is not masked by
        # a FileNotFoundError raised from the cleanup itself
        if output_filename.is_file():
            os.remove(output_filename)