# Derive the per-recording output paths from the input file name, i.e. the
# part of the path between "system-output/" and ".txt".
name_match = re.search(r"(?<=system-output\/)(.*?)(?=\.txt)", infile)
if name_match is None:
    # Fail with a readable message instead of an AttributeError on None.
    raise SystemExit(
        "Expected an input path like .../system-output/<name>.txt, got: "
        + infile
    )
filename_prep = name_match.group(0)
outfile = "./results/google/universal/google-" + filename_prep + ".csv"
trans_file = (
    "./results/google/system-trans-text/google-" + filename_prep + "-trans.txt"
)

# Setting the initial utterance to a sentinel, as jiwer can't handle empty
# strings.
# tsotf = the start of the file.
prev = "tsotf"
utt = ""

# Google specific processing.
# Each "Transcript" line is a new hypothesis; it is processed together with
# the most recently seen "Time" and "Finished" values.
# Simultaneously, finalised hypotheses are stored for final WER calculations.
#
# The field patterns deliberately do not require a trailing newline: "."
# never matches "\n", so "(.*)" already stops at the end of the line, and the
# last line of a file that lacks a final newline is still parsed.
with open(infile, 'r') as f:
    for line in f:
        if line.startswith("Finished"):
            fin = re.search(r"(?<=Finished: )(.*)", line).group(0)
        elif line.startswith("Time"):
            time = re.search(r"(?<=Time: )(.*)", line).group(0)
        elif line.startswith("Transcript"):
            utt = re.search(r"(?<=Transcript: )(.*)", line).group(0)
            utt = utt.replace(".", "")
            # Every hypothesis, interim or final, is written to the
            # universal output, compared against the previous hypothesis.
            process(outfile, time, prev, utt)
            if fin == "False":
                # Interim hypothesis: the next one is compared against it.
                prev = utt
            else:
                # Final hypothesis: keep it for the WER transcript and
                # restart the comparison chain from the sentinel.
                add_trans_chunk(trans_file, utt.lower())
                prev = "tsotf"

# Universal output finalised.
clean_csv(outfile)
import re
import sys

from universal import add_trans_chunk

# Given a system output and which channel it is from, the final transcript is
# stored. This is to calculate the WER.
#
# Usage: <script> <system-output-file> <left|right>

if len(sys.argv) < 3:
    sys.exit("Usage: " + sys.argv[0] + " <system-output-file> <left|right>")

infile = sys.argv[1]
side = sys.argv[2]

if side not in ("left", "right"):
    # Stop here: without a valid side no output file name can be derived.
    print("Which side?")
    sys.exit(1)

# The recording name is the part of the path between "<side>/" and ".txt".
name_match = re.search(r"(?<=" + side + r"\/)(.*?)(?=\.txt)", infile)
if name_match is None:
    sys.exit(
        "Expected an input path like .../" + side + "/<name>.txt, got: "
        + infile
    )
filename_prep = name_match.group(0)

trans_file = (
    "./results/msoft/split-system-trans-text/"
    + side + "-" + filename_prep + "-trans.txt"
)

with open(infile, 'r') as f:
    for line in f:
        # Only lines starting with "JSON" are inspected for a result.
        if line.startswith("JSON"):
            transcript = re.search(r"(?<=DisplayText\":\")(.*?)(?=\")", line)
            # Lines without a DisplayText field are skipped.
            if transcript:
                add_trans_chunk(trans_file, transcript.group(0).lower())
# IBM specific processing.
# Each new hypothesis is extracted with its time and processed.
# Simultaneously, finalised hypotheses are stored for final WER calculations.
with open(infile, 'r') as f:
    for line in f:
        # Copy of the line without spaces and hesitation markers; used only
        # to recognise what kind of line this is.
        check = line.replace(" ", "").replace("%HESITATION", "")

        # First number on the line, if any. It is looked up per line so a
        # file whose first line has no number no longer hits an unbound
        # variable.
        number_match = re.search(r"[+-]?([0-9]*[.])?[0-9]+", line)
        number = number_match.group(0) if number_match else None

        if check.startswith("\"transcript"):
            utt = re.search(r"(?<=transcript\"\: \")(.*?)(?=\")", line).group(0)
            utt = utt.lower().replace("%hesitation", "")

        # A line that starts with a number and contains no comma is taken to
        # be a timestamp line.
        if number is not None and check.startswith(number) and "," not in line:
            time = check.replace("\n", "")

        if check.startswith("\"final"):
            # Skip blank hypotheses. strip() also rejects the empty string,
            # which isspace() lets through.
            if utt.strip():
                process(outfile, time, prev, utt)
                if "false" in line:
                    # Interim hypothesis: the next one is compared against it.
                    prev = utt
                else:
                    # Final hypothesis: keep it for the WER transcript and
                    # restart the comparison chain from the sentinel.
                    add_trans_chunk(trans_file, utt)
                    prev = "tsotf"

# Universal output finalised.
clean_csv(outfile)