Пример #1
0
def main(args):
    """Map every reference position to a source index and pickle the result.

    Reads the pickle stream at ``args.infile`` (hyps, refs, four objects that
    are skipped, then alignments).  For each utterance the hyp is aligned to
    the ref with ``ed``, the aligned pair is turned into a list of hyp indices
    (one per entry returned by ``ref_to_hyp``), and each index is looked up in
    that utterance's alignment.  The per-utterance lists are written to
    ``args.outfile`` as a single pickled ``np.array``.

    Args:
        args: object exposing ``infile`` and ``outfile`` path attributes.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file; only feed this dumps produced by a trusted decoding run.
    """
    # NOTE Make sure synced with order dumped in runDecode.py
    with open(args.infile, 'rb') as fid:
        hyps = pickle.load(fid)
        print('Loaded hyps')
        refs = pickle.load(fid)
        print('Loaded refs')
        # The next four objects are not needed here but must still be read
        # to advance the stream to the alignments.
        pickle.load(fid)  # hypscores
        pickle.load(fid)  # refscores
        pickle.load(fid)  # numphones
        pickle.load(fid)  # subsets
        alignments = pickle.load(fid)
        print('Loaded alignments')
    print('Loaded data')

    ref2source = []
    pbar = ProgressBar(maxval=len(hyps)).start()

    for done, (hyp, ref, align) in enumerate(zip(hyps, refs, alignments), 1):
        # Only the aligned ("corrected") sequences from ed() are used.
        _, _, _, _, _, _, hyp_corr, ref_corr = ed(hyp, ref)
        r2h = ref_to_hyp(hyp_corr, ref_corr)

        n_align = len(align)
        if n_align == 0:  # empty hyp
            # No alignment to index into: every position maps to 0.
            r2s = [0] * len(r2h)
        else:
            # edge case: an index one past the end is clamped to the last
            # alignment entry; any other index is used unchanged.
            r2s = [align[ind - 1 if ind == n_align else ind] for ind in r2h]
        ref2source.append(r2s)

        pbar.update(done)

    # Close out the progress bar so later output starts on a fresh line.
    pbar.finish()

    print('%d alignments computed' % len(ref2source))
    with open(args.outfile, 'wb') as fid:
        # NOTE(review): rows presumably differ in length between utterances;
        # confirm the installed numpy still accepts ragged input here.
        pickle.dump(np.array(ref2source), fid)
Пример #2
0
def main(args):
    """Map every reference position to a source index and pickle the result.

    Reads the pickle stream at ``args.infile`` (hyps, refs, four objects that
    are skipped, then alignments).  For each utterance the hyp is aligned to
    the ref with ``ed``, the aligned pair is turned into a list of hyp indices
    (one per entry returned by ``ref_to_hyp``), and each index is looked up in
    that utterance's alignment.  The per-utterance lists are written to
    ``args.outfile`` as a single pickled ``np.array``.

    Args:
        args: object exposing ``infile`` and ``outfile`` path attributes.

    NOTE(review): ``pickle.load`` can execute arbitrary code embedded in the
    file; only feed this dumps produced by a trusted decoding run.
    """
    # NOTE Make sure synced with order dumped in runDecode.py
    with open(args.infile, 'rb') as fid:
        hyps = pickle.load(fid)
        print('Loaded hyps')
        refs = pickle.load(fid)
        print('Loaded refs')
        # The next four objects are not needed here but must still be read
        # to advance the stream to the alignments.
        pickle.load(fid)  # hypscores
        pickle.load(fid)  # refscores
        pickle.load(fid)  # numphones
        pickle.load(fid)  # subsets
        alignments = pickle.load(fid)
        print('Loaded alignments')
    print('Loaded data')

    ref2source = []
    pbar = ProgressBar(maxval=len(hyps)).start()

    for done, (hyp, ref, align) in enumerate(zip(hyps, refs, alignments), 1):
        # Only the aligned ("corrected") sequences from ed() are used.
        _, _, _, _, _, _, hyp_corr, ref_corr = ed(hyp, ref)
        r2h = ref_to_hyp(hyp_corr, ref_corr)

        n_align = len(align)
        if n_align == 0:  # empty hyp
            # No alignment to index into: every position maps to 0.
            r2s = [0] * len(r2h)
        else:
            # edge case: an index one past the end is clamped to the last
            # alignment entry; any other index is used unchanged.
            r2s = [align[ind - 1 if ind == n_align else ind] for ind in r2h]
        ref2source.append(r2s)

        pbar.update(done)

    # Close out the progress bar so later output starts on a fresh line.
    pbar.finish()

    print('%d alignments computed' % len(ref2source))
    with open(args.outfile, 'wb') as fid:
        # NOTE(review): rows presumably differ in length between utterances;
        # confirm the installed numpy still accepts ragged input here.
        pickle.dump(np.array(ref2source), fid)
Пример #3
0
    # Read in hyps and refs
    # (Body fragment: the enclosing function header is not part of this
    # excerpt. `args` must provide the pickle paths `pk1` and `pk2`.)

    # First dump: the first pickled object holds the hyps, the second the refs.
    # NOTE(review): pickle.load can execute code from the file; trusted dumps only.
    fid = open(args.pk1, 'rb')
    hyps1 = np.array(pickle.load(fid))
    refs = np.array(pickle.load(fid))
    fid.close()

    # Second dump: only its first object (the competing hyps) is read; the
    # refs loaded from pk1 are reused for both systems.
    fid = open(args.pk2, 'rb')
    hyps2 = np.array(pickle.load(fid))
    fid.close()

    # Tallies of utterances where one system has the strictly smaller dist;
    # ties are counted for neither.
    hyp1_better = 0
    hyp2_better = 0
    for (h1, h2, ref) in zip(hyps1, hyps2, refs):
        # ed() returns an 8-tuple; only dist and the aligned hyp/ref pair
        # (last two items) are used below.
        dist1, eq1, ins1, dels1, subs1, errs_by_pos1, hyp_corr1, ref_corr1 = ed(
            h1, ref)
        dist2, eq2, ins2, dels2, subs2, errs_by_pos2, hyp_corr2, ref_corr2 = ed(
            h2, ref)

        if dist1 < dist2:
            hyp1_better += 1
        elif dist2 < dist1:
            hyp2_better += 1

        # FIXME Just display cases where hyp2 is better
        if dist2 < dist1:
            # Show system 1's alignment first, then system 2's, followed by a
            # blank separator line (bare Python 2 print statement).
            disp_err_corr(hyp_corr1, ref_corr1)
            disp_err_corr(hyp_corr2, ref_corr2)
            print

    # NOTE(review): hyp2_better is tallied above but no report of it is
    # visible in this excerpt -- confirm whether the snippet is truncated.
    print '%d cases hyp1 better' % hyp1_better
Пример #4
0
def compute_and_display_stats(hyps, refs, hypscores, refscores, numphones, subsets, subset=None, display=False):
    """Print aggregate edit-distance statistics for a set of utterances.

    Args:
        hyps, refs: parallel sequences of hypothesis / reference sequences.
        hypscores, refscores: parallel per-utterance scores; only their
            averages are reported.
        numphones: per-utterance counts; their sum is the CER denominator.
        subsets: per-utterance labels.  When ``subset`` is given,
            ``subsets == subset`` is used as a mask to index the other
            arguments, so they must support that (presumably numpy arrays --
            confirm against the caller).
        subset: if truthy, restrict the statistics to this label.
        display: if true, show every aligned hyp/ref pair with
            ``disp_err_corr``.

    Returns:
        None.  Results are printed, and the per-position error rates are
        handed to ``disp_errs_by_pos`` together with the name
        ``err_by_pos.<subset or 'all'>.png``.
    """
    # Filter by subset
    if subset:
        print('USING SUBSET: %s' % subset)
        filt = subsets == subset
        hyps = hyps[filt]
        refs = refs[filt]
        hypscores = hypscores[filt]
        refscores = refscores[filt]
        numphones = numphones[filt]

    if len(hyps) == 0:
        # max() and the averages below are undefined without any utterance;
        # report that instead of failing with an unrelated exception.
        print('No utterances to score')
        return

    # ---- Compute stats ----

    hyp_lens = [len(s) for s in hyps]
    ref_lens = [len(s) for s in refs]

    # Per-position accumulators sized by the longest hypothesis.
    max_hyp_len = max(hyp_lens)
    tot_errs_by_pos = np.zeros(max_hyp_len)
    counts_by_pos = np.zeros(max_hyp_len, dtype=np.int32)

    tot_dist = tot_eq = tot_ins = tot_dels = tot_subs = 0.0
    num_sents_correct = 0
    correct_sents_len = 0

    # The scores stay in the zip so the iteration is truncated exactly as
    # before; list() keeps reversed() working where zip() returns an iterator.
    utterances = list(zip(hyps, refs, hypscores, refscores))
    for hyp, ref, _hypscore, _refscore in reversed(utterances):
        dist, eq, ins, dels, subs, errs_by_pos, hyp_corr, ref_corr = ed(hyp, ref)
        tot_dist += dist
        tot_eq += eq
        tot_ins += ins
        tot_dels += dels
        tot_subs += subs

        # Accumulate errors (and how many utterances reached each position).
        n_pos = errs_by_pos.shape[0]
        tot_errs_by_pos[:n_pos] += errs_by_pos
        counts_by_pos[:n_pos] += 1

        if dist == 0:
            num_sents_correct += 1
            correct_sents_len += len(ref)

        if display:
            disp_err_corr(hyp_corr, ref_corr)
            print('')

    # ---- Display aggregate stats ----

    print('avg len hyp: %f' % np.mean(hyp_lens))
    print('avg len ref: %f' % np.mean(ref_lens))
    print('avg num phones: %f' % np.mean(numphones))

    print('avg ref score: %f' % (sum(refscores) / len(refscores)))
    print('avg hyp score: %f' % (sum(hypscores) / len(hypscores)))

    # Normalise operation counts by the longer of hyp/ref for each utterance.
    tot_comp_len = float(np.sum([max(h, r) for (h, r) in zip(hyp_lens, ref_lens)]))
    print('frac eq: %f ins: %f del: %f sub: %f' %
          tuple(np.array([tot_eq, tot_ins, tot_dels, tot_subs]) / tot_comp_len))

    print('CER: %f' % (100.0 * tot_dist / np.sum(numphones)))

    print('%d/%d sents correct' % (num_sents_correct, len(hyps)))
    # With no fully correct sentence the average is undefined; report nan
    # rather than dividing by zero.
    if num_sents_correct:
        avg_correct_len = correct_sents_len / float(num_sents_correct)
    else:
        avg_correct_len = float('nan')
    print('avg len of correct sent: %f' % avg_correct_len)

    disp_errs_by_pos(tot_errs_by_pos / counts_by_pos, 'err_by_pos.%s.png' % ('all' if not subset else subset))
Пример #5
0
    # Read in hyps and refs
    # (Body fragment: the enclosing function header is not part of this
    # excerpt. `args` must provide the pickle paths `pk1` and `pk2`.)

    # First dump: the first pickled object holds the hyps, the second the refs.
    # NOTE(review): pickle.load can execute code from the file; trusted dumps only.
    fid = open(args.pk1, 'rb')
    hyps1 = np.array(pickle.load(fid))
    refs = np.array(pickle.load(fid))
    fid.close()

    # Second dump: only its first object (the competing hyps) is read; the
    # refs loaded from pk1 are reused for both systems.
    fid = open(args.pk2, 'rb')
    hyps2 = np.array(pickle.load(fid))
    fid.close()

    # Tallies of utterances where one system has the strictly smaller dist;
    # ties are counted for neither.
    hyp1_better = 0
    hyp2_better = 0
    for (h1, h2, ref) in zip(hyps1, hyps2, refs):
        # ed() returns an 8-tuple; only dist and the aligned hyp/ref pair
        # (last two items) are used below.
        dist1, eq1, ins1, dels1, subs1, errs_by_pos1, hyp_corr1, ref_corr1 = ed(h1, ref)
        dist2, eq2, ins2, dels2, subs2, errs_by_pos2, hyp_corr2, ref_corr2 = ed(h2, ref)

        if dist1 < dist2:
            hyp1_better += 1
        elif dist2 < dist1:
            hyp2_better += 1

        # FIXME Just display cases where hyp2 is better
        if dist2 < dist1:
            # Show system 1's alignment first, then system 2's, followed by a
            # blank separator line (bare Python 2 print statement).
            disp_err_corr(hyp_corr1, ref_corr1)
            disp_err_corr(hyp_corr2, ref_corr2)
            print

    # Final report: number of utterances won by each system.
    print '%d cases hyp1 better' % hyp1_better
    print '%d cases hyp2 better' % hyp2_better
Пример #6
0
def compute_and_display_stats(hyps, refs, hypscores, refscores, numphones, subsets, subset=None, display=False):
    """Print aggregate edit-distance statistics for a set of utterances.

    Args:
        hyps, refs: parallel sequences of hypothesis / reference sequences.
        hypscores, refscores: parallel per-utterance scores; only their
            averages are reported.
        numphones: per-utterance counts; their sum is the CER denominator.
        subsets: per-utterance labels.  When ``subset`` is given,
            ``subsets == subset`` is used as a mask to index the other
            arguments, so they must support that (presumably numpy arrays --
            confirm against the caller).
        subset: if truthy, restrict the statistics to this label.
        display: if true, show every aligned hyp/ref pair with
            ``disp_err_corr``.

    Returns:
        None.  Results are printed, and the per-position error rates are
        handed to ``disp_errs_by_pos`` together with the name
        ``err_by_pos.<subset or 'all'>.png``.
    """
    # Filter by subset
    if subset:
        print("USING SUBSET: %s" % subset)
        filt = subsets == subset
        hyps = hyps[filt]
        refs = refs[filt]
        hypscores = hypscores[filt]
        refscores = refscores[filt]
        numphones = numphones[filt]

    if len(hyps) == 0:
        # max() and the averages below are undefined without any utterance;
        # report that instead of failing with an unrelated exception.
        print("No utterances to score")
        return

    # ---- Compute stats ----

    hyp_lens = [len(s) for s in hyps]
    ref_lens = [len(s) for s in refs]

    # Per-position accumulators sized by the longest hypothesis.
    max_hyp_len = max(hyp_lens)
    tot_errs_by_pos = np.zeros(max_hyp_len)
    counts_by_pos = np.zeros(max_hyp_len, dtype=np.int32)

    tot_dist = tot_eq = tot_ins = tot_dels = tot_subs = 0.0
    num_sents_correct = 0
    correct_sents_len = 0

    # The scores stay in the zip so the iteration is truncated exactly as
    # before; list() keeps reversed() working where zip() returns an iterator.
    utterances = list(zip(hyps, refs, hypscores, refscores))
    for hyp, ref, _hypscore, _refscore in reversed(utterances):
        dist, eq, ins, dels, subs, errs_by_pos, hyp_corr, ref_corr = ed(hyp, ref)
        tot_dist += dist
        tot_eq += eq
        tot_ins += ins
        tot_dels += dels
        tot_subs += subs

        # Accumulate errors (and how many utterances reached each position).
        n_pos = errs_by_pos.shape[0]
        tot_errs_by_pos[:n_pos] += errs_by_pos
        counts_by_pos[:n_pos] += 1

        if dist == 0:
            num_sents_correct += 1
            correct_sents_len += len(ref)

        if display:
            disp_err_corr(hyp_corr, ref_corr)
            print("")

    # ---- Display aggregate stats ----

    print("avg len hyp: %f" % np.mean(hyp_lens))
    print("avg len ref: %f" % np.mean(ref_lens))
    print("avg num phones: %f" % np.mean(numphones))

    print("avg ref score: %f" % (sum(refscores) / len(refscores)))
    print("avg hyp score: %f" % (sum(hypscores) / len(hypscores)))

    # Normalise operation counts by the longer of hyp/ref for each utterance.
    tot_comp_len = float(np.sum([max(h, r) for (h, r) in zip(hyp_lens, ref_lens)]))
    print("frac eq: %f ins: %f del: %f sub: %f" % tuple(np.array([tot_eq, tot_ins, tot_dels, tot_subs]) / tot_comp_len))

    print("CER: %f" % (100.0 * tot_dist / np.sum(numphones)))

    print("%d/%d sents correct" % (num_sents_correct, len(hyps)))
    # With no fully correct sentence the average is undefined; report nan
    # rather than dividing by zero.
    if num_sents_correct:
        avg_correct_len = correct_sents_len / float(num_sents_correct)
    else:
        avg_correct_len = float("nan")
    print("avg len of correct sent: %f" % avg_correct_len)

    disp_errs_by_pos(tot_errs_by_pos / counts_by_pos, "err_by_pos.%s.png" % ("all" if not subset else subset))