def load_stats_tsv(statspath):
    """
    Load a tab-separated stats file produced by etl_transcribe_stats.py.

    :param statspath: Path to the tsv file that is generated by etl_transcribe_stats.py
    :return: List of keys ordered as appearing in tsv file, and a dictionary
             containing the tsv data keyed by the first column.
    """
    key_list = []
    stats_dict = {}
    header = None
    with open(statspath, 'r') as tsv_fp:
        for row in csv.reader(tsv_fp, delimiter='\t'):
            # First non-empty row is the column-name header.
            if not header:
                header = row
                log_kv("header", header)
                continue
            key = row[0]
            key_list.append(key)
            if key in stats_dict:
                # Duplicate keys are reported but not overwritten.
                logging.error("Key already encountered previously: %s", key)
                logging.error("Previous entry: %s New entry: %s", stats_dict[key], row[1:])
            else:
                # Pair each remaining column with its header name.
                stats_dict[key] = {header[ii]: row[ii] for ii in range(1, len(header))}
    return key_list, stats_dict
def load_txt(path):
    """Read a text file and return its contents as a string.

    Logs an error and returns {} (the historical sentinel) when the path
    does not exist.
    """
    log_kv("Loading", path)
    if not os.path.exists(path):
        logging.error("Not exist: %s", path)
        return {}
    with open(path) as fp:
        return fp.read()
def fetch_rows_for_column(con, cur, tablename, column, value):
    """
    Fetch all rows of <tablename> where <column> equals <value>.

    :param con: open DB-API connection (committed after the query)
    :param cur: cursor belonging to con
    :param tablename: table to query; must be a trusted identifier
    :param column: column to filter on; must be a trusted identifier
    :param value: value to match against <column>
    :return: list of matching rows from cur.fetchall()
    """
    # Bind <value> as a query parameter instead of splicing it into the SQL
    # text: the old '"%s"' interpolation broke on embedded quotes and was
    # open to SQL injection.  Identifiers (table/column) cannot be bound,
    # so they are still interpolated and must come from trusted code.
    query = 'SELECT * FROM {tn} WHERE {cc} = ?'.format(tn=tablename, cc=column)
    log_kv("query", query)
    cur.execute(query, (value,))
    con.commit()
    return cur.fetchall()
def insert_from_tsv(con, tablename, path):
    """
    Bulk-insert the rows of a tab-separated file into <tablename>.

    The first row of the file is taken as the column-name header.

    :param con: open DB-API connection; <tablename> must be a trusted identifier
    :param tablename: destination table
    :param path: path to the tsv file
    """
    with open(path, 'r') as fp:
        log_kv("Loading %s from" % tablename, path)
        reader = csv.reader(fp, delimiter='\t')
        header = next(reader)
        # Use "?" placeholders rather than str(tuple(rec)):
        #  - str(tuple(...)) broke on values containing quotes (injection risk),
        #  - a single-column header produced "('col',)" with a trailing comma,
        #    which is invalid SQL.
        placeholders = ",".join("?" * len(header))
        query = "insert into %s (%s) VALUES (%s)" % (
            tablename, ",".join(header), placeholders)
        for rec in reader:
            con.execute(query, rec)
def fetch_rows_columns_for_column(con, cur, columns, tablename, column, value):
    """
    Fetch selected <columns> of <tablename> where <column> equals <value>.

    :param con: open DB-API connection (committed after the query)
    :param cur: cursor belonging to con
    :param columns: iterable of column names to select; trusted identifiers
    :param tablename: table to query; must be a trusted identifier
    :param column: column to filter on; must be a trusted identifier
    :param value: value to match against <column>
    :return: list of matching rows from cur.fetchall()
    """
    columns_string = ",".join(columns)
    # Bind <value> as a parameter instead of the old '"%s"' interpolation,
    # which broke on embedded quotes and allowed SQL injection.  Identifier
    # names cannot be bound and remain interpolated (trusted input only).
    query = 'SELECT {xx} FROM {tn} WHERE {cc} = ?'.format(
        xx=columns_string, tn=tablename, cc=column)
    log_kv("query", query)
    cur.execute(query, (value,))
    con.commit()
    return cur.fetchall()
def load_json(path):
    """Deserialize a JSON file and return the resulting object.

    Logs an error and returns {} when the path does not exist.
    """
    log_kv("Loading", path)
    if not os.path.exists(path):
        logging.error("Not exist: %s", path)
        return {}
    with open(path) as file1:
        return json.load(file1)
def calc_transcript_counts(ibm_stats_path, google_stats_path): """ :param ibm_stats_path: to file containing index of IBM transcripts :param google_stats_path: to file containing index of Google transcripts :return: Counts number of IBM transcripts Counts number of Google transcripts Counts portion of IBM transcripts within processed folders that have Google transcript Counts portion of Google transcripts within processed folders that have IBM transcript """ ibm_stats = load_json(ibm_stats_path) google_stats = load_json(google_stats_path) log_kv("Number of IBM Transcripts", len(ibm_stats)) log_kv("Numberof Google Transcripts", len(google_stats)) i_set = set([os.path.dirname(x).replace(IBM_PATH,'').replace(BASE_PATH,'') for x in ibm_stats]) g_set = set([os.path.dirname(x).replace(GOOGLE_PATH,'').replace(BASE_PATH,'') for x in google_stats]) print i_top_level_folders = sorted(set([xx.split("/")[0] for xx in i_set])) print "IBM folders in %s :\n%s " % (ibm_stats_path, i_top_level_folders) print g_top_level_folders = sorted(set([xx.split("/")[0] for xx in g_set])) print "Google folders in %s :\n%s" % (google_stats_path, g_top_level_folders) i_count = 0 i_in_g = 0 for xx in i_set: if xx.split("/")[0] not in g_top_level_folders: continue i_count += 1 if xx in g_set: i_in_g += 1 i_portion = float(i_in_g) / i_count g_count = 0 g_in_i = 0 for xx in g_set: if xx.split("/")[0] not in i_top_level_folders: continue g_count += 1 if xx in i_set: g_in_i += 1 g_portion = float(g_in_i) / g_count print;print print "IBM also in Google: %d/%d (%.2f) " % (i_in_g, i_count, i_portion) print "Google also in IBM: %d/%d (%.2f) " % (g_in_i, g_count, g_portion)
def etl_transcripts(log_stats_path, word_counts_path, api, ext=".out"):
    """
    Merges the two datasets using a canonicalized key.  If the two share a
    field then the value in <word_counts_path> will overwrite the value from
    <log_stats_path>.

    :param log_stats_path: gives processing time
    :param word_counts_path: gives word counts
    :param api: "ibm" or "google"
    :param ext: extension stripped from basename
    :return: dict
    """
    base_prefix = API_META[api]["base"] + "/"

    def strip_base(key):
        # Canonicalize a key by dropping the api-specific base-folder prefix.
        return key.replace(base_prefix, "") if key.startswith(base_prefix) else key

    result = {}
    loaded = {}
    if os.path.isfile(log_stats_path):
        with open(log_stats_path) as fp:
            log_kv("Loading(%s log stats)" % api, log_stats_path)
            loaded = json.load(fp)
    if loaded and type(loaded) is dict:
        log_kv("Count (%s log stats)" % api, len(loaded))
    else:
        logging.error("Expected log stats data.")
    for key, row in loaded.items():
        result[strip_base(key)] = row

    counts = {}
    if os.path.isfile(word_counts_path):
        with open(word_counts_path) as fp:
            log_kv("Loading(%s word counts)" % api, word_counts_path)
            counts = json.load(fp)
    if counts and type(counts) is dict:
        log_kv("Count (%s word counts)" % api, len(counts))
    else:
        logging.error("Expected word counts data.")
    for key, row in counts.items():
        rec_id = strip_base(key)
        # Strip the transcript-file suffix and then the extension so the key
        # lines up with the one derived from the log stats.
        for suffix in API_META[api]["transcripts"]:
            if rec_id.endswith("/" + suffix):
                rec_id = re.sub(re.escape("/" + suffix) + r"$", "", rec_id)
        if rec_id.endswith(ext):
            rec_id = re.sub(re.escape(ext) + r"$", "", rec_id)
        if rec_id in result:
            # Word-count fields overwrite colliding log-stat fields.
            for x, y in row.items():
                result[rec_id][x] = y
        else:
            result[rec_id] = row
    return result
def calc_bleu_scores(google_results, ibm_results, verbose=False):
    """
    :param google_results: basic stats about google transcripts
    :param ibm_results: basic stats about ibm transcripts
    :param verbose: prints warnings when bleu is averaged with jaccard, which
        is done when hypothesis word count falls below threshold.
    :return: first two arguments, supplemented with bleu and ratcliff
    """
    logging.info("=== Processing Google transcripts ===")
    time2 = time.time()
    google_results = do_comparisons(google_results, verbose)
    logging.info("(%.2f min)" % ((time.time() - time2) / 60.0))
    logging.info("=== Processing IBM transcripts ===")
    time3 = time.time()
    # NOTE(review): verbose was not forwarded here in the original; behavior
    # preserved as-is — confirm whether that was intentional.
    ibm_results = do_comparisons(ibm_results)
    logging.info("(%.2f min)" % ((time.time() - time3) / 60.0))

    def _count_and_avg(results, field):
        # Average <field> over entries that have it.  Returns (count, avg);
        # avg is 0.0 when no entry has the field — the previous inline code
        # raised ZeroDivisionError in that case.
        vals = [results[x][field] for x in results if field in results[x]]
        return len(vals), (sum(vals) / float(len(vals)) if vals else 0.0)

    ibm_bleu_count, ibm_avg_bleu = _count_and_avg(ibm_results, "bleu")
    google_bleu_count, google_avg_bleu = _count_and_avg(google_results, "bleu")
    print
    log_kv("ibm bleu count", ibm_bleu_count)
    log_kv("google bleu count", google_bleu_count)
    log_kv("ibm avg bleu", "%.5f" % ibm_avg_bleu)
    log_kv("google avg bleu", "%.5f" % google_avg_bleu)
    ibm_ratcliff_count, ibm_avg_ratcliff = _count_and_avg(ibm_results, "ratcliff")
    google_ratcliff_count, google_avg_ratcliff = _count_and_avg(google_results, "ratcliff")
    print
    log_kv("ibm ratcliff count", ibm_ratcliff_count)
    log_kv("google ratcliff count", google_ratcliff_count)
    log_kv("ibm avg ratcliff", "%.5f" % ibm_avg_ratcliff)
    log_kv("google avg ratcliff", "%.5f" % google_avg_ratcliff)
    return google_results, ibm_results
def do_comparisons(stats, verbose=False):
    """
    Calculates bleu and ratcliff similarity between reference and hypothesis transcripts.
    :param stats: dict containing pointers to reference and hypothesis transcripts.
    :param verbose: when True, logs details for short-reference / short-hypothesis cases.
    :return: first argument, supplemented with bleu and ratcliff stats.
    """
    count = 0
    for key in stats:
        # Only entries that point at both transcript files are scored.
        if "reference_path" in stats[key] and "transcript_path" in stats[key]:
            if not os.path.exists(stats[key]["reference_path"]):
                raise ValueError("Expected path to exist: %s", stats[key]["reference_path"])
            if not os.path.exists(stats[key]["transcript_path"]):
                raise ValueError("Expected path to exist: %s", stats[key]["transcript_path"])
            with open(stats[key]["reference_path"], "r") as fp1:
                reference_string = fp1.read()
            with open(stats[key]["transcript_path"], "r") as fp2:
                hypothesis_string = fp2.read()
            # Ratcliff/Obershelp string similarity on the raw text.
            stats[key]["ratcliff"] = ratcliff_obershelp_similarity(
                reference_string, hypothesis_string)
            # tokenize/jaccard_score are project helpers defined elsewhere in
            # this file; presumably whitespace/word tokenization — confirm.
            reference_tokens = tokenize(reference_string)
            hypothesis_tokens = tokenize(hypothesis_string)
            if len(reference_tokens) < 7:
                # Very short reference: bigram-weighted BLEU only.
                bleu_score = nltk.translate.bleu_score.sentence_bleu(
                    reference_tokens, hypothesis_tokens, weights=(0.5, 0.5))
                if verbose:
                    logging.warn(
                        "Short reference: %2d words. Hypothesis:%5d words. Bleu: %.5f",
                        len(reference_tokens), len(hypothesis_tokens), bleu_score)
                    if len(hypothesis_tokens) > 2 * len(reference_tokens):
                        logging.warn("Reference path : %s", stats[key]["reference_path"])
                        logging.warn("Hypothesis path: %s", stats[key]["transcript_path"])
            elif len(hypothesis_tokens) < 15:
                # Short hypothesis: blend BLEU with Jaccard, weighted by the
                # unique-token set sizes of each side.
                bleu_score = nltk.translate.bleu_score.sentence_bleu(
                    reference_tokens, hypothesis_tokens, weights=(0.5, 0.5))
                jaccard = jaccard_score(reference_tokens, hypothesis_tokens)
                size_h = len(set(hypothesis_tokens))
                size_r = len(set(reference_tokens))
                size_b = size_h + size_r
                avg_bleu_score = (bleu_score * size_h / size_b) + (jaccard * size_r / size_b)
                if verbose:
                    logging.warn(
                        "Short hypothesis. Using avg(bleu,jaccard). "
                        "Reference:%5d words/%5d set. Hypothesis:%5d words/%5d set. "
                        "Bleu: %.5f Jaccard: %.5f Avg: %.5f",
                        len(reference_tokens), size_r, len(hypothesis_tokens), size_h,
                        bleu_score, jaccard, avg_bleu_score)
                # Sanity check: a convex combination must not exceed its parts.
                if avg_bleu_score > max(bleu_score, jaccard):
                    print
                    logging.error("Avg bleu (%.5f) > max(bleu, jaccard).", avg_bleu_score)
                    logging.warn(
                        "avg_bleu_score = (bleu_score * size_h/size_b + (jaccard * size_r/size_b))"
                    )
                    logging.warn(
                        " %.5f = ( %.5f * %d/%d ) + ( %.5f * %d/%d )",
                        avg_bleu_score, bleu_score, size_h, size_b, jaccard, size_r, size_b)
                    print
                bleu_score = avg_bleu_score
            else:
                # Normal case: default 4-gram BLEU.
                bleu_score = nltk.translate.bleu_score.sentence_bleu(
                    reference_tokens, hypothesis_tokens)
            stats[key]['bleu'] = bleu_score
            stats[key]['word_count'] = len(reference_tokens)
            count += 1
            if count % 50 == 0:
                log_kv("completed", count)
            # # if count<11:
            #     break
        else:
            continue
    log_kv("done", count)
    return stats
'-s', action='store', default=STATS_FILEPATH, help='TSV file containing transcription stats ') parser.add_argument('--api', '-a', action='store', default="ibm", help='API. Default=ibm') parser.add_argument('--load', '-L', action='store_true', help='Load previously stored results.') args = parser.parse_args() log_kv("Running", __file__) log_kv("From", os.path.dirname(os.path.realpath(__file__))) references_path = os.path.realpath(os.path.expanduser(args.reference)) log_kv("references folder", references_path) google_path = os.path.realpath(os.path.expanduser(args.google)) log_kv("google path", google_path) ibm_path = os.path.realpath(os.path.expanduser(args.ibm)) log_kv("ibm path", ibm_path) outpath = os.path.realpath(os.path.expanduser(args.outfolder)) log_kv("outpath", outpath) # Loads transcript statistics file
# Command-line entry for the Sqlite Helper: resolve input/output folders and
# announce the intermediate stats files it will work from.
parser = argparse.ArgumentParser(description='Sqlite Helper')
parser.add_argument('--infolder', '-i', action='store', default='.',
                    help='folder containing previous ETL files')
parser.add_argument('--outfolder', '-o', action='store', default='./output',
                    help='output directory')
args = parser.parse_args()
log_kv("Running", __file__)
log_kv("From", os.path.dirname(os.path.realpath(__file__)))
print
# Resolve both folders to absolute paths (defaults guard against empty args).
inpath = os.path.realpath(args.infolder if args.infolder else os.getcwd())
log_kv("inpath", inpath)
outpath = os.path.realpath(
    args.outfolder if args.outfolder else u'./output')
log_kv("outpath", outpath)
make_dir(outpath)
log_kv("")
log_kv("Audio stats file", AUDIO_STATS_TSV)
log_kv("Transcript stats", TRANSCRIBED_STATS_TSV)
log_kv("")
def process_transcript_stats(inpath, basepath, outpath, args):
    """
    Incrementally tally word/char counts for every unique transcript under
    <inpath>, reusing counts from a previously written stats JSON, and write
    the merged result back to the per-API stats file in <outpath>.

    :param inpath: folder to walk for transcript files
    :param basepath: prefix stripped from paths to form result keys
    :param outpath: folder receiving the stats JSON
    :param args: parsed CLI args; uses args.google (API selector) and args.max
    """
    # Pick the per-API output file.
    file_stats_path = os.path.join(outpath, IBM_TRANSCRIPT_STATS_FILENAME)
    if args.google:
        file_stats_path = os.path.join(outpath, GOOGLE_TRANSCRIPT_STATS_FILENAME)
    log_kv("file stats", file_stats_path)
    previous_results = {}
    result_dict = {}
    # Seed results with any previously computed stats so reruns are incremental.
    if os.path.exists(file_stats_path):
        log_kv("Loading file", file_stats_path)
        with open(file_stats_path) as file1:
            previous_results = json.load(file1)
        log_kv("Count(previous)", len(previous_results))
        for xx in previous_results:
            result_dict[xx] = previous_results[xx]
    print
    print 105 * "="
    print
    num_processed = 0
    num_skipped = 0
    num_done = 0
    # Gets list of transcript filepaths
    file_list = walk_files(folder=inpath + "/", basepath=basepath)
    uniques = set()
    for x, y in file_list:
        uniques = get_uniques(x, uniques)
    log_kv("Number Transcriptions", len(file_list))
    log_kv("Unique Transcriptions", len(uniques))
    print
    print 105 * "="
    print
    num_dictated = 0
    sum_word_count = 0
    sum_char_count = 0
    for uu in uniques:
        # Result key is the path relative to basepath.
        id = uu.replace(basepath, '').lstrip("/")
        if args.max and num_processed >= args.max:
            log_kv("Max met", args.max)
            break
        num_dictated += 1 if uu.endswith(".dictated") else 0
        num_processed += 1
        if id in previous_results:
            # Already counted in an earlier run — reuse, but verify integrity.
            num_skipped += 1
            logging.debug("Skipping %s", uu)
            if result_dict[id] != previous_results[id]:
                logging.error("Mismatch")
                raise Exception('Expected %s , but encountered %s', result_dict[id], previous_results[id])
            sum_word_count += result_dict[id]["word_count"]
            sum_char_count += result_dict[id]["char_count"]
        else:
            num_done += 1
            logging.debug("Doing %s", uu)
            word_count, char_count = calc_transcription_counts(
                uu, basepath, args)
            sum_word_count += word_count
            sum_char_count += char_count
            result_dict[id] = {
                "word_count": word_count,
                "char_count": char_count
            }
    log_kv("Previous", len(previous_results))
    log_kv("Processed", num_processed)
    log_kv("Dictated", num_dictated)
    log_kv("Done", num_done)
    log_kv("Skipped", num_skipped)
    log_kv("Result count", len(result_dict))
    print
    # NOTE(review): raises ZeroDivisionError if result_dict is empty — confirm
    # callers always have at least one transcript.
    running_avg_word_count = (float(sum_word_count) / len(result_dict))
    running_avg_char_count = (float(sum_char_count) / len(result_dict))
    log_kv("Avg Word count", "%.1f" % running_avg_word_count)
    log_kv("Avg Char count", "%.1f" % running_avg_char_count)
    log_kv("Writing", file_stats_path)
    with open(file_stats_path, 'w') as outfile:
        json.dump(result_dict, outfile, indent=2)
if __name__ == '__main__':
    # Entry point: compare Google vs IBM STT using the intermediate JSON
    # files produced by earlier pipeline stages under --folder.
    start_time = time.time()
    parser = argparse.ArgumentParser(description='Compare Google STT vs IBM STT')
    parser.add_argument('--folder', '-f', action='store', default='/tmp/transcription/text2stats',
                        help='text2stats.py output directory')
    parser.add_argument('--verbose', '-v', action='store_true', help='Spew logs profusely.')
    args = parser.parse_args()
    if args.verbose:
        print "Relies on the following intermediate result files under %s :" % args.folder
        print ", ".join([IBM_TRANSCRIPT_STATS_FILENAME, GOOGLE_TRANSCRIPT_STATS_FILENAME,
                         IBM_PROCESSED_STATS_FILENAME, GOOGLE_PROCESSED_STATS_FILENAME, RESULT_FILENAME])
    log_kv("Running", __file__)
    log_kv("From", os.path.dirname(os.path.realpath(__file__)))
    folder = args.folder
    log_kv("--folder", folder)
    path = os.path.realpath(folder)
    # Fail fast if the stats folder is missing.
    if not os.path.isdir(path):
        raise IOError("Path not found: %s" % path)
    # Paths to the five intermediate result files consumed below.
    ibm_stats_path = os.path.join(path, IBM_TRANSCRIPT_STATS_FILENAME)
    google_stats_path = os.path.join(path, GOOGLE_TRANSCRIPT_STATS_FILENAME)
    ibm_pstats_path = os.path.join(path, IBM_PROCESSED_STATS_FILENAME)
    google_pstats_path = os.path.join(path, GOOGLE_PROCESSED_STATS_FILENAME)
    audio_stats_path = os.path.join(path, RESULT_FILENAME)
# Command-line entry for the transcribe-rate analysis: resolve folders and
# announce the log-stats / word-count files the analysis reads.
parser = argparse.ArgumentParser(description='Analyze transcribe rate')
parser.add_argument('--infolder', '-i', action='store', default='.',
                    help='folder containing previous ETL files')
parser.add_argument('--outfolder', '-o', action='store', default='./output',
                    help='output directory')
args = parser.parse_args()
log_kv("Running", __file__)
log_kv("From", os.path.dirname(os.path.realpath(__file__)))
print
# Resolve both folders to absolute paths (defaults guard against empty args).
inpath = os.path.realpath(args.infolder if args.infolder else os.getcwd())
log_kv("inpath", inpath)
outpath = os.path.realpath(
    args.outfolder if args.outfolder else u'./output')
log_kv("outpath", outpath)
make_dir(outpath)
log_kv("")
log_kv("IBM log stats", IBM_LOG_STATS_FILEPATH)
log_kv("IBM transcript stats", IBM_WORD_COUNT_FILEPATH)
log_kv("Google log stats", GOOGLE_LOG_STATS_FILEPATH)
log_kv("Google transcript stats", GOOGLE_WORD_COUNT_FILEPATH)
def run_query(con, cur, query):
    """Execute an arbitrary SQL <query> on <cur>, commit via <con>,
    and return every resulting row."""
    log_kv("query", query)
    cur.execute(query)
    con.commit()
    rows = cur.fetchall()
    return rows
'-k', action='store_true', help= 'Do not overwrite previously converted audio files, or results folder already containing hypotheses.txt.' ) parser.add_argument( '--google', '-g', action='store_true', help='Analyze Google transcripts instead of IBM Watson.') args = parser.parse_args() if not args.basefolder: args.basefolder = args.infolder log_kv("Running", __file__) log_kv("From", os.path.dirname(os.path.realpath(__file__))) print inpath = os.path.realpath(args.infolder if args.infolder else os.getcwd()) log_kv("inpath", inpath) basepath = os.path.realpath(args.basefolder if args.basefolder else u'/') log_kv("basepath", basepath) if inpath.startswith(basepath): if inpath == basepath: logging.warn("inpath == basepath. Are you sure? [Y/n]") choice = raw_input().lower() if choice not in set(['yes', 'y']): logging.info("Quitting")
def calc_transcript_words_per_minute(ibm_stats_path, google_stats_path, ibm_pstats_path, google_pstats_path, audio_stats_path):
    """
    Report words-per-minute and processing-time-per-minute of audio for the
    IBM and Google transcripts, using the per-API suffix/prefix conventions
    to join transcript stats with the audio stats index.
    """
    ibm_stats = load_json(ibm_stats_path)
    google_stats = load_json(google_stats_path)
    ibm_pstats = load_json(ibm_pstats_path)
    google_pstats = load_json(google_pstats_path)
    audio_stats = load_json(audio_stats_path)
    count_processed_ibm = len(ibm_pstats)
    count_processed_google = len(google_pstats)
    count_transcribed_ibm = len(ibm_stats)
    count_transcribed_google = len(google_stats)
    print
    # Sanity checks: every transcribed file should also appear as processed.
    if count_processed_ibm < count_transcribed_ibm:
        logging.error("count_processed_ibm < count_transcribed_ibm")
    log_kv("IBM Transcribed/Processed", "%d/%d" % (count_transcribed_ibm, count_processed_ibm))
    if count_processed_google < count_transcribed_google:
        logging.error("count_processed_google < count_transcribed_google")
    log_kv("Google Transcribed/Processed", "%d/%d" % (count_transcribed_google, count_processed_google))
    print
    log_kv("Num audio files", len(audio_stats))
    print
    print "==============================================================="
    print "Calculating number of IBM transcript words per minute of audio"
    print "==============================================================="
    # IBM key convention: ibm_stt/<id>.out/hypotheses.txt[.dictated]
    suffix = ".out/hypotheses.txt.dictated"
    suffix2 = ".out/hypotheses.txt"
    prefix = "ibm_stt/"
    i_words_per_min = calc_stat_per_minute(ibm_stats, audio_stats, prefix, suffix, suffix2, "word_count")
    print "==============================================================="
    print "IBM wpm tallied: %d" % len(i_words_per_min)
    print "==============================================================="
    print
    print "==============================================================="
    print "Calculating IBM processing time per minute of audio"
    print "==============================================================="
    i_proc_per_min = calc_stat_per_minute(ibm_pstats, audio_stats, prefix, suffix, suffix2, "transcribe_seconds")
    print "==============================================================="
    print "IBM ppm tallied: %d" % len(i_proc_per_min)
    print "==============================================================="
    print
    print "==============================================================="
    print "Calculating number of Google transcript words per minute of audio"
    print "==============================================================="
    # Google key convention: google_stt/<id>.out/transcript.txt[.dictated]
    suffix = ".out/transcript.txt.dictated"
    suffix2 = ".out/transcript.txt"
    prefix = "google_stt/"
    g_words_per_min = calc_stat_per_minute(google_stats, audio_stats, prefix, suffix, suffix2, "word_count")
    print "==============================================================="
    print "Google wpm tallied: %d" % len(g_words_per_min)
    print "==============================================================="
    print
    print "==============================================================="
    print "Calculating Google processing time per minute of audio"
    print "==============================================================="
    # NOTE(review): i_proc_per_min is reused here for the Google tally,
    # clobbering the IBM value — misleading name, though no later code in
    # this function reads the IBM value again.
    i_proc_per_min = calc_stat_per_minute(google_pstats, audio_stats, prefix, suffix, suffix2, "transcribe_seconds")
    print "==============================================================="
    print "Google ppm tallied: %d" % len(i_proc_per_min)
    print "==============================================================="
    print
    print
def analyze_transcribe_time(inpath, basepath, outpath, ext=".out", logname="sttclient.log"): result = {} if args.google: processed_filepath = os.path.join(outpath, GOOGLE_PROCESSED_STATS_FILENAME) else: processed_filepath = os.path.join(outpath, IBM_PROCESSED_STATS_FILENAME) if os.path.isfile(processed_filepath): with open(processed_filepath) as file1: loaded = json.load(file1) log_kv("Loaded", processed_filepath) if loaded and type(loaded) is dict: result = loaded log_kv("Count(previous)", len(result)) logs = walk_logs(inpath, basepath, logname) cumulative_time = 0.0 count = 0 prev = 0 skipped = 0 total = 0 for xx, yy in logs: total += 1 # The actual key used to store the result. # If basepath==inpath, then keys in google result match keys in ibm result for easier cross-reference. # If basepath<inpath, then keys in google and ibm results retain their distinction for easier merge and safety id = yy.replace((ext + "/" + logname), '') if args.keep and id in result: prev += 1 count += 1 try: cumulative_time += result[id]["transcribe_seconds"] except Exception as e: ValueError("Missing field transcribe_seconds : %s", yy) continue unixmtime = os.path.getmtime(xx) birthtime = os.stat(xx).st_birthtime diff = unixmtime - birthtime if unixmtime: if id not in result: result[id] = {"unixmtime": unixmtime} else: result[id]["unixmtime"] = unixmtime if diff > 3600 or diff < 10: logging.warn("Skipped transcription time: %s", id) skipped += 1 continue else: cumulative_time += diff count += 1 if id not in result: result[id] = {"transcribe_seconds": diff} else: result[id]["transcribe_seconds"] = diff log_kv("Skipped", skipped) log_kv("Result size", len(result)) if result: if DRYRUN: log_kv("Warning", "Dry run only") else: log_kv("Writing", processed_filepath) with open(processed_filepath, 'w') as outfile: json.dump(result, outfile, indent=2) print "\n\n" print "Transcription Processing Time (estimated) \n" if skipped: print "Previous: %d Skipped: %d Total: %d" % (prev, skipped, 
total) print "Count: %s Avg transcribe time: %.2f minutes" % ( count, float(cumulative_time) / 60.0 / count if count else 0) print
logging.info("")


if __name__ == '__main__':
    # Entry point: tally audio file specs from --infolder into --outfolder.
    start_time = time.time()
    parser = argparse.ArgumentParser(description='Tally audio file specs')
    parser.add_argument('--infolder', '-i', action='store', default='.',
                        help='folder containing audio files')
    parser.add_argument('--basefolder', '-b', action='store', default=DEFAULT_BASE_PATH,
                        help='base directory')
    parser.add_argument('--outfolder', '-o', action='store', default='/tmp/transcription/text2stats_dev',
                        help='output directory')
    parser.add_argument('--verbose', '-v', action='store_true', help='Spew logs profusely.')
    parser.add_argument('--keep', '-k', action='store_true',
                        help='Do not reprocess files already in previous result.')
    args = parser.parse_args()
    log_kv("Running", __file__)
    log_kv("From", os.path.dirname(os.path.realpath(__file__)))
    log_kv("--infolder", args.infolder)
    # Normalize the input folder to an absolute path with a trailing slash,
    # so downstream prefix-stripping works consistently.
    inpath = os.path.realpath(os.path.expanduser(args.infolder))
    inpath = inpath if inpath.endswith("/") else inpath+"/"
    log_kv("inpath", inpath)
    log_kv("--outfolder", args.outfolder)
    outpath = os.path.realpath(os.path.expanduser(args.outfolder))
    log_kv("outpath", outpath)
    log_kv("--basefolder", args.basefolder)
    basepath = os.path.realpath(os.path.expanduser(args.basefolder))
    log_kv("basepath", basepath)
    result_filepath = os.path.join(outpath, RESULT_FILENAME)
    log_kv('result_filepath', result_filepath)
    make_dir(outpath)