# NOTE(review): this line is a fragment whose statements were collapsed onto a
# single physical line; it begins inside an if/else whose `if` header is not
# visible here, so the original indentation/nesting must be restored from the
# full file before this can run.
#
# What the visible statements do, in order:
#   * First (partially visible) branch: target lines are stripped and
#     lower-cased. `else` branch: source and target lines are stripped only.
#   * When `args.max_size` is not None, both lists are cut to that many items.
#   * When the two lists differ in length, a warning is written to stderr and
#     both are trimmed to the shorter length.
#   * `tercom_statistics(hypotheses, references)` (defined outside this view)
#     returns a pair; only the second element, `stats`, is used here, reading
#     the 'TER' entry of each item.
#   * The mean of those TER values and their sample variance (n - 1
#     denominator) are computed, and the script prints
#     "mean +/- t * sqrt(variance / n)" with three decimals.
#   * `t` is looked up in `ts` by `args.p`; only the keys 0.01, 0.05 and 0.10
#     are accepted, anything else raises a bare `Exception`.
#
# NOTE(review): the values in `ts` look like Student-t critical values for one
# fixed sample size -- TODO confirm which degrees of freedom they assume.
# NOTE(review): with plain Python numbers, an empty `ters` makes the mean
# divide by zero, and a single TER value makes the variance divide by
# `len(ters) - 1 == 0`; neither case is guarded.
# NOTE(review): `raise Exception` carries no message, so an unsupported
# `args.p` fails without saying which values are allowed.
references = [line.strip().lower() for line in trg_file] else: hypotheses = [line.strip() for line in src_file] references = [line.strip() for line in trg_file] if args.max_size is not None: hypotheses = hypotheses[:args.max_size] references = references[:args.max_size] if len(hypotheses) != len(references): sys.stderr.write( 'warning: source and target don\'t have the same length\n') size = min(len(hypotheses), len(references)) hypotheses = hypotheses[:size] references = references[:size] avg_stats, stats = tercom_statistics(hypotheses, references) ters = [stats_['TER'] for stats_ in stats] mean = sum(ters) / len(ters) variance = sum((ter - mean)**2 for ter in ters) / (len(ters) - 1) ts = {0.01: 2.5841, 0.05: 1.9639, 0.10: 1.6474} t = ts.get(args.p) if t is None: raise Exception d = t * np.sqrt(variance / len(ters)) print('{:.3f} +/- {:.3f}'.format(mean, d))
parser.add_argument('--average', nargs='+', type=int)

if __name__ == '__main__':
    args = parser.parse_args()

    # Reference file: one whitespace-stripped entry per line.
    with open(args.ref_file) as ref_handle:
        references = [ref_line.strip() for ref_line in ref_handle]

    # Each hypothesis file is loaded the same way, in command-line order.
    hypotheses = []
    for hyp_file in args.hyp_files:
        with open(hyp_file) as hyp_handle:
            hypotheses.append([hyp_line.strip() for hyp_line in hyp_handle])

    # Keep only the first element returned by tercom_statistics for each
    # hypothesis set; --reverse swaps the argument order of the call.
    if not args.reverse:
        scores = [tercom_statistics(hyp, references)[0] for hyp in hypotheses]
    else:
        scores = [tercom_statistics(references, hyp)[0] for hyp in hypotheses]

    # N counts the --average entries when that option is given (and
    # non-empty); otherwise it counts the hypothesis files.
    N = len(args.average or args.hyp_files)
    ind = np.arange(N)

    op_name_mapping = {
        'ins': 'Insertions',
        'del': 'Deletions',
        'sub': 'Substitutions',
        'shift': 'Shifts',
    }

    ref_words = np.array([entry["REF_WORDS"] for entry in scores])

    bars = []
    legend = []
    bottom = np.zeros(N)
    colors = ['#e66101', '#fdb863', '#b2abd2', '#5e3c99']
# NOTE(review): this line is a fragment whose statements were collapsed onto a
# single physical line, and it stops at a dangling `else:` whose body is not
# visible. `i`, `fields`, `src_file`, `trg_file` and `args` come from outside
# this view. The nesting described below is inferred from statement order --
# TODO confirm against the properly indented file.
#
# What the visible statements do, in order:
#   * `src_file` and `trg_file` are consumed in batches of up to n = 1000
#     lines via `islice`; the loop ends as soon as either batch is empty.
#     `i` is incremented once per iteration (it must already be initialised).
#   * Lines are stripped and passed to
#     `tercom_statistics(hypotheses, references, not args.case_insensitive)`;
#     only the second element of its result, `stats`, is used.
#   * `avg_length` is computed only while it is still 0 (so normally from the
#     first batch) as the mean 'REF_WORDS' of that batch, and is then reused
#     unchanged for later batches.
#   * For every entry in `stats`: 'DEL', 'INS', 'SUB' and 'WORD_SHIFT' are
#     divided by that entry's 'REF_WORDS'; 'REF_WORDS' is then replaced by its
#     relative deviation from `avg_length`; 'TER' is divided by 100.
#   * When `args.output` is falsy, each entry is printed as one comma-separated
#     row of the values named in `fields`, rounded to `args.precision`.
#
# NOTE(review): the per-field division and the `avg_length` normalisation are
# unguarded -- presumably an entry with 'REF_WORDS' == 0 (e.g. an empty
# reference line) would raise ZeroDivisionError; verify what
# tercom_statistics returns in that case.
n = 1000 avg_length = 0 while True: i += 1 hypotheses = list(islice(src_file, n)) references = list(islice(trg_file, n)) if not hypotheses or not references: break hypotheses = [line.strip() for line in hypotheses] references = [line.strip() for line in references] _, stats = tercom_statistics(hypotheses, references, not args.case_insensitive) if avg_length == 0: avg_length = sum(stats_['REF_WORDS'] for stats_ in stats) / len(stats) for stats_ in stats: for field in ('DEL', 'INS', 'SUB', 'WORD_SHIFT'): stats_[field] /= stats_['REF_WORDS'] stats_['REF_WORDS'] = (stats_['REF_WORDS'] - avg_length) / avg_length stats_['TER'] /= 100 if not args.output: print('\n'.join(','.join(str(round(stats_[k], args.precision)) for k in fields) for stats_ in stats)) else: