Example #1
def test_json_to_stm_conversion():
    " execute json to stm tests "

    transcript = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.json")
    convert_and_test_it_loads(transcript, f"{test_dir}/json_to_stm_test_1.stm")

    transcript = time_aligned_text(f"{sample_dir}/simple_test.json")
    convert_and_test_it_loads(transcript, f"{test_dir}/json_to_stm_test_2.stm")
Example #2
def test_conversion():
  " execute single test "

  reference_file = time_aligned_text("../samples/BillGatesTEDTalk.stm")
  transcript_file = time_aligned_text("../samples/BillGatesTEDTalk_intentionally_poor_transcription.txt")

  # test fixed precision output of wer calculation
  assert "{:5.3f}".format(wer(reference_file.text(), transcript_file.text(), True)) == "3.332"
Example #3
def convert_and_test_it_loads(transcript_obj, output_filename):
    """
    Tests that conversion works
    Tests that the output file can be reloaded
    Removes the temporary output file
    """
    transcript_obj.write(output_filename)
    time_aligned_text(output_filename)

    os.remove(output_filename)
Example #4
def align_json(ref_txt, json_file, filename=None):
    """
    CLI for forced alignment tools
    Using a reference txt file and a hypothesis gk json
        file, this time-aligns the reference txt file
        and outputs an STM file
    Input
      ref_txt, str - reference text file containing ground truth
      json_file, str - hypothesis gk JSON file
      filename, str - output STM filename
    """

    ref_tokens = preprocess_txt.parse_transcript(ref_txt)
    gk_json = preprocess_gk_json.preprocess_transcript(json_file)
    segments = align(gk_json, ref_tokens)

    if filename is None:
        filename = basename(sanitize(strip_extension(ref_txt))) + ".stm"

    # fix segment filename and speaker
    for seg in segments:
        seg.filename = strip_extension(filename)
        seg.speaker = strip_extension(filename) + "UnknownSpeaker"

    output = time_aligned_text()
    output.segments = segments
    output.write(filename)
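For reference, a minimal usage sketch of the align_json helper defined above; the file paths are placeholders rather than files from this repository, and the helper modules it relies on are assumed to be importable as in the snippet.

# Hypothetical usage: with filename=None the aligned STM is written as
# "<sanitized reference basename>.stm" in the current directory.
align_json("meeting_notes.txt", "meeting_notes_hypothesis.json")
# Or choose the output path explicitly (the "aligned/" directory must exist):
align_json("meeting_notes.txt", "meeting_notes_hypothesis.json",
           filename="aligned/meeting_notes.stm")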
Example #5
def test_json_to_rttm_conversion_without_speaker():
    """
    execute json to rttm test
    """
    transcript = time_aligned_text(f"{test_dir}/no_speaker.json")

    convert_and_test_it_loads(transcript, f"{test_dir}/no_speaker.rttm")
Example #6
def split_audio_file(source_audio_file, source_transcript, target_directory):
    """
    Execute the split logic
    """
    source_audio = audio_file(source_audio_file)
    transcript = time_aligned_text(source_transcript)
    source_audio.split(transcript, target_directory)
Example #7
def validate_sample(ext, expected_transcripts, out_segments):
    base_output = f"{test_dir}/good"
    convert(f"{sample_dir}/invalid.stm", base_output + ext)
    validated_transcript = time_aligned_text(base_output + ext)
    assert len(validated_transcript.segments) == out_segments
    for seg, expected_text in zip(validated_transcript.segments, expected_transcripts):
        assert seg.text == expected_text
Example #8
def test_json_to_stm_conversion():
  " execute json to stm tests "

  input_file = time_aligned_text("../samples/BillGatesTEDTalk.json")
  reference_sha = hashlib.sha1(
    open("../samples/BillGatesTEDTalk_transcribed.stm", 'r', encoding='utf8').read().encode()
  ).hexdigest()
  input_file.write("json_to_stm_test_1.stm")
  new_sha = hashlib.sha1(open("json_to_stm_test_1.stm", 'r', encoding='utf8').read().encode()).hexdigest()
  assert reference_sha == new_sha

  input_file = time_aligned_text("../samples/simple_test.json")
  reference_sha = hashlib.sha1(open("../samples/simple_test.stm", 'r', encoding='utf8').read().encode()).hexdigest()
  input_file.write("json_to_stm_test_2.stm")
  new_sha = hashlib.sha1(open("json_to_stm_test_2.stm", 'r', encoding='utf8').read().encode()).hexdigest()
  assert reference_sha == new_sha
Example #9
    def __init__(self, *args, **kwargs):
        """
        Initialize from location and populate list of
        SPH, WAV, or MP3 audio files
        and STM files into segments
        """
        for dictionary in args:
            if isinstance(dictionary, dict):
                for key in dictionary:
                    setattr(self, key, dictionary[key])
        for key in kwargs:
            setattr(self, key, kwargs[key])

        # only if not defined above should we search for exemplars
        # based on location
        if not self.exemplars:
            # instantiate exemplars for this object to override
            # static class variable
            self.exemplars = []

            audio_extensions_to_try = ["sph", "wav", "mp3"][::-1]
            self.exemplars += [
                exemplar({
                    "audio_file": audio_file(fl),
                    "transcript_file": time_aligned_text(strip_extension(fl) + ".stm"),
                })
                for audio_extension in audio_extensions_to_try
                for fl in (get_files(self.location, audio_extension) if self.location else [])
                if os.path.exists(strip_extension(fl) + ".stm")
            ]

            # gather all exemplars from /stm and /sph subdirectories if present
            self.exemplars += [
                exemplar({
                    "audio_file": audio_file(fl),
                    "transcript_file": time_aligned_text(
                        self.location + "/stm/" + basename(strip_extension(fl)) + ".stm"
                    ),
                })
                for audio_extension in audio_extensions_to_try
                for fl in (get_files(self.location + "/sph/", audio_extension) if self.location else [])
                if os.path.exists(self.location + "/stm/" + basename(strip_extension(fl)) + ".stm")
            ]
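A hedged construction sketch for the snippet above, assuming this __init__ belongs to a corpus-style container; the corpus class name, its import path, and the directory below are assumptions, not taken from this listing.

from asrtoolkit.data_structures.corpus import corpus  # assumed home of the class owning __init__ above

# placeholder directory expected to hold matching audio (.sph/.wav/.mp3) and .stm files
data = corpus({"location": "data/ted_talks"})
print(len(data.exemplars), "audio/transcript pairs found")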
Example #10
def test_json_to_txt_conversion():
  " execute json to txt test "

  input_file = time_aligned_text("../samples/simple_test.json")
  reference_sha = hashlib.sha1(open("../samples/simple_test.txt", 'r', encoding='utf8').read().encode()).hexdigest()
  input_file.write("json_to_txt_test.txt")
  new_sha = hashlib.sha1(open("json_to_txt_test.txt", 'r', encoding='utf8').read().encode()).hexdigest()
  assert reference_sha == new_sha
Example #11
def validate_sample(ext, expected_transcripts, out_segments):
    base_output = 'tests/good'
    convert('samples/invalid.stm', base_output + ext)
    validated_transcript = time_aligned_text(base_output + ext)
    assert len(validated_transcript.segments) == out_segments
    for seg, expected_text in zip(validated_transcript.segments,
                                  expected_transcripts):
        assert seg.text == expected_text
Example #12
def test_txt_initialization():
    " execute single test "

    input_dict = json.load(open("samples/BillGatesTEDTalk.json"))
    text = time_aligned_text(input_dict)
    text.file_extension = "txt"

    text_object = time_aligned_text(text.__str__())

    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk_transcribed.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.txt")
    new_sha = hashlib.sha1(
        open("tests/file_conversion_test.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha
Example #13
def split_audio_file(source_audio_file, source_transcript, target_directory):
    """
    Split source audio file into segments denoted by transcript file
    into target_directory
    Results in stm and sph files in target directory
    """
    source_audio = audio_file(source_audio_file)
    transcript = time_aligned_text(source_transcript)
    source_audio.split(transcript, target_directory)
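A minimal usage sketch of split_audio_file as defined above; the paths are placeholders.

# Hypothetical paths: one long recording plus its STM transcript are split into
# per-segment audio and STM files under the target directory.
split_audio_file("recordings/interview.sph",
                 "recordings/interview.stm",
                 "recordings/segments")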
Example #14
def test_stm_to_html_conversion():
  " execute stm to html test "

  input_file = time_aligned_text("../samples/BillGatesTEDTalk.stm")
  input_file.write("stm_to_html_test.html")
  reference_sha = hashlib.sha1(open("../samples/BillGatesTEDTalk.html", 'r',
                                    encoding='utf8').read().encode()).hexdigest()
  new_sha = hashlib.sha1(open("stm_to_html_test.html", 'r', encoding='utf8').read().encode()).hexdigest()
  assert reference_sha == new_sha
Example #15
def main():
  parser = argparse.ArgumentParser(description='convert between text file formats')
  parser.add_argument('input_file', metavar='input_file', type=str, help='input stm file')
  args = parser.parse_args()

  # after reading in, only valid lines will remain
  input_file = time_aligned_text(args.input_file)

  # write back to original file name
  input_file.write(args.input_file)
Example #16
def test_stm_to_txt_conversion():
    " execute stm to txt test "

    input_file = time_aligned_text("samples/BillGatesTEDTalk.stm")
    input_file.write("tests/stm_to_txt_test.txt")
    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    new_sha = hashlib.sha1(
        open("tests/stm_to_txt_test.txt", "r",
             encoding="utf8").read().encode()).hexdigest()
    assert reference_sha == new_sha
Example #17
def test_conversion():
    " execute single test "

    from asrtoolkit.data_structures.time_aligned_text import time_aligned_text
    input_file = time_aligned_text("../samples/BillGatesTEDTalk.stm")
    input_file.write("file_conversion_test.txt")
    reference_sha = hashlib.sha1(
        open("../samples/BillGatesTEDTalk.txt", 'r',
             encoding='utf8').read().encode()).hexdigest()
    new_sha = hashlib.sha1(
        open("file_conversion_test.txt", 'r',
             encoding='utf8').read().encode()).hexdigest()
    assert reference_sha == new_sha
Example #18
def test_conversion():
    " execute single test "

    from asrtoolkit.data_structures.time_aligned_text import time_aligned_text
    input_file = time_aligned_text("../samples/BillGatesTEDTalk.stm")
    input_file.write("file_conversion_test.txt")
    reference_sha = subprocess.Popen(
        ["sha1sum", "../samples/BillGatesTEDTalk.txt"],
        stdout=subprocess.PIPE).stdout.read().decode().split()[0]
    new_sha = subprocess.Popen(
        ["sha1sum", "file_conversion_test.txt"],
        stdout=subprocess.PIPE).stdout.read().decode().split()[0]
    assert reference_sha == new_sha
Example #19
def test_json_initialization():
    " execute single test "

    input_dict = json.load(open("samples/BillGatesTEDTalk.json"))
    text_object = time_aligned_text(input_dict)

    reference_sha = hashlib.sha1(
        open("samples/BillGatesTEDTalk_transcribed.stm", 'r',
             encoding='utf8').read().encode()).hexdigest()
    text_object.write("tests/file_conversion_test.stm")
    new_sha = hashlib.sha1(
        open("tests/file_conversion_test.stm", 'r',
             encoding='utf8').read().encode()).hexdigest()
    assert reference_sha == new_sha
Example #20
def main():
    parser = argparse.ArgumentParser(
        description='convert between text file formats')
    parser.add_argument('input_file',
                        metavar='input_file',
                        type=str,
                        help='input file')
    parser.add_argument('output_file',
                        metavar='output_file',
                        type=str,
                        help='output file')
    args = parser.parse_args()

    input_file = time_aligned_text(args.input_file)
    input_file.write(args.output_file)
Example #21
def main():
  parser = argparse.ArgumentParser(
    description='Compares a reference and transcript file and calculates word error rate (WER) between these two files'
  )
  parser.add_argument('reference_file', metavar='reference_file', type=str, help='reference "truth" file')
  parser.add_argument(
    'transcript_file', metavar='transcript_file', type=str, help='transcript possibly containing errors'
  )
  parser.add_argument(
    "--char-level", help="calculate character error rate instead of word error rate", action="store_true"
  )
  parser.add_argument("--ignore-nsns", help="ignore non silence noises like um, uh, etc.", action="store_true")

  # parse arguments
  args = parser.parse_args()

  # read files from arguments
  ref = time_aligned_text(args.reference_file)
  hyp = time_aligned_text(args.transcript_file)

  if args.char_level:
    print("CER: {:5.3f}%".format(cer(ref, hyp, args.ignore_nsns)))
  else:
    print("WER: {:5.3f}%".format(wer(ref, hyp, args.ignore_nsns)))
Example #22
    def __init__(self, input_dict=None):
        """
        Initialize from location and populate list of SPH and STM files into segments
        """
        self.__dict__.update(input_dict if input_dict else {})
        if not self.exemplars:
            audio_files = [
                audio_file(_) for _ in sorted(get_files(self.location, "sph"))
            ]
            transcript_files = [
                time_aligned_text(_)
                for _ in sorted(get_files(self.location, "stm"))
            ]
            self.exemplars = [
                exemplar({
                    "audio_file": af,
                    "transcript_file": tf
                }) for af, tf in zip(audio_files, transcript_files)
            ]
Example #23
def test_json_to_rttm_conversion():
    """
    execute json to rttm test
    """
    transcript = time_aligned_text(f"{sample_dir}/simple_test.json")
    convert_and_test_it_loads(transcript, f"{test_dir}/json_to_rttm_test.rttm")
Example #24
def test_json_to_txt_conversion():
    " execute json to txt test "

    transcript = time_aligned_text(f"{sample_dir}/simple_test.json")
    convert_and_test_it_loads(transcript, f"{test_dir}/json_to_txt_test.txt")
Example #25
def test_stm_to_srt_conversion():
    " execute stm to srt test "

    transcript = time_aligned_text(f"{sample_dir}/BillGatesTEDTalk.stm")
    convert_and_test_it_loads(transcript, f"{test_dir}/stm_to_srt_test.srt")
Example #26
def check_transcript(transcript):
    " return a time_aligned_text object for a valid transcript file, otherwise log an error and exit "
    if valid_input_file(transcript):
        return time_aligned_text(input_data=transcript)
    else:
        LOGGER.error("Invalid transcript file {}".format(transcript))
        sys.exit(1)


def assign_if_valid(file_name):
    " returns a time_aligned_text object if valid else None "
    from asrtoolkit.data_structures.time_aligned_text import time_aligned_text

    return time_aligned_text(file_name) if valid_input_file(file_name) else None
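A short usage sketch of the two guards above; the sample path is a placeholder.

# check_transcript() exits the process on an invalid file, while
# assign_if_valid() simply returns None, so the latter is easier to probe here.
transcript = assign_if_valid("samples/BillGatesTEDTalk.stm")
if transcript is not None:
    print(len(transcript.segments), "segments loaded")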