def nlp(disc_clsdict, gold_clsdict, fragments_within, fragments_cross, dest, verbose, n_jobs): if verbose: print banner('NLP') nc, cc = _nlp_sub(disc_clsdict, gold_clsdict, fragments_cross, 'cross', verbose, n_jobs) nw, cw = _nlp_sub(disc_clsdict, gold_clsdict, fragments_within, 'within', verbose, n_jobs) # calculating the pairs/clusters found in the discovery algoritms, # it's stored on the 'nlp' output file, used to compare diff algoritms nclust = len(disc_clsdict.items()) npairs = sum([nCr(len(v[1]), 2) for v in disc_clsdict.items()]) with open(path.join(dest, 'nlp'), 'w') as fid: fid.write(pretty_score_nlp(nc, cc, 'NLP total', len(fragments_cross), sum(map(len, fragments_cross)), nclust, npairs)) fid.write('\n') fid.write(pretty_score_nlp(nw, cw, 'NLP within-speaker only', len(fragments_within), sum(map(len, fragments_within)), nclust, npairs))
def token_type(disc_clsdict, wrd_corpus, fragments_within, fragments_cross, dest, verbose, n_jobs): if verbose: print banner('TOKEN/TYPE') ptoc, rtoc, ptyc, rtyc = _token_type_sub(disc_clsdict, wrd_corpus, fragments_cross, 'cross', verbose, n_jobs) ftoc = np.fromiter((fscore(ptoc[i], rtoc[i]) for i in xrange(ptoc.shape[0])), dtype=np.double) ftyc = np.fromiter((fscore(ptyc[i], rtyc[i]) for i in xrange(ptyc.shape[0])), dtype=np.double) ptow, rtow, ptyw, rtyw = _token_type_sub(disc_clsdict, wrd_corpus, fragments_within, 'within', verbose, n_jobs) ftow = np.fromiter((fscore(ptow[i], rtow[i]) for i in xrange(ptow.shape[0])), dtype=np.double) ftyw = np.fromiter((fscore(ptyw[i], rtyw[i]) for i in xrange(rtyw.shape[0])), dtype=np.double) with open(path.join(dest, 'token_type'), 'w') as fid: fid.write(pretty_score_f(ptoc, rtoc, ftoc, 'token total', len(fragments_cross), sum(map(len, fragments_cross)))) fid.write('\n') fid.write(pretty_score_f(ptyc, rtyc, ftyc, 'type total', len(fragments_cross), sum(map(len, fragments_cross)))) fid.write('\n') fid.write(pretty_score_f(ptow, rtow, ftow, 'token within-speaker only', len(fragments_within), sum(map(len, fragments_within)))) fid.write('\n') fid.write(pretty_score_f(ptyw, rtyw, ftyw, 'type within-speaker only', len(fragments_within), sum(map(len, fragments_within))))
def make_typeset(pclus, verbose, debug): with verb_print('constructing typeset', verbose, True, True): ts = list(typeset(pclus)) if debug: print banner('TYPESET ({0})'.format(len(ts))) print pformat(ts) print return ts
def make_weights(pclus, verbose, debug): with verb_print('constructing weights', verbose, True, True): ws = weights(pclus) if debug: print banner('WEIGHTS') print pformat(ws) print return ws
def make_psubs_nmatch(psubs, verbose, debug): with verb_print('making psubs nmatch', verbose, True, True): psubs_nmatch = nmatch(psubs) if debug: print banner('NMATCH(PSUBS)') print pformat(psubs_nmatch) print return psubs_nmatch
def make_pgoldclus_nmatch(pgoldclus, verbose, debug): with verb_print('constructing pgoldclus_nmatch', verbose, True, True): pgoldclus_nmatch = nmatch(pgoldclus) if debug: print banner('NMATCH(PGOLDCLUS)') print pformat(pgoldclus_nmatch) print return pgoldclus_nmatch
def make_pdisc(disc_clsdict, verbose, debug): with verb_print('constructing pdisc set', verbose, True, True): pdisc = list(Pclus(disc_clsdict)) if debug: print banner('PDISC ({0})'.format(len(pdisc))) print pretty_pairs(pdisc) print return pdisc
def make_typeset(psubs, verbose, debug): with verb_print('making typeset', verbose, True, True): ts = list(typeset(psubs)) if debug: print banner('TYPES(PSUBS) ({0})'.format(len(ts))) print pformat(ts) print return ts
def make_weights(psubs, verbose, debug): with verb_print('making weights', verbose, True, True): ws = weights(psubs) if debug: print banner('WEIGHTS(PSUBS) ({0})'.format(len(ws))) print pformat(ws) print return ws
def make_pgold(gold_clsdict, verbose, debug): with verb_print('constructing pgold set', verbose, True, True): pgold = list(Pclus(gold_clsdict)) if debug: print banner('PGOLD ({0})'.format(len(pgold))) print pretty_pairs(pgold) print return pgold
def make_pgold_nmatch(pgold, verbose, debug): with verb_print('constructing nmatch_gold', verbose, True, True): nmatch_gold = nmatch(pgold) if debug: print banner('nmatch_gold') for k, v in nmatch_gold.iteritems(): print k, v return nmatch_gold
def make_pgoldclus(disc_clsdict, verbose, debug): with verb_print('constructing pgoldclus', verbose, True, True): pgoldclus = list(Pgoldclus(disc_clsdict)) if debug: pgoldclus = list(pgoldclus) print banner('PGOLDCLUS ({0})'.format(len(pgoldclus))) print pretty_pairs(pgoldclus) print return pgoldclus
def make_psubs_pgold_nmatch(pgold, psubs, verbose, debug): with verb_print('making psubs/pgold nmatch', verbose, True, True): psubs_pgold_intersect = intersection(pgold, psubs) psubs_pgold_nmatch = nmatch(psubs_pgold_intersect) if debug: print banner('NMATCH(PSUBS/PGOLD)') print pformat(psubs_pgold_nmatch) print return psubs_pgold_nmatch
def make_pclus_pgoldclus_nmatch(pclus, pgoldclus, verbose, debug): with verb_print('making pclus/pgoldclus nmatch', verbose, True, True): pclus_pgoldclus_intersect = list(intersection(pclus, pgoldclus)) pclus_pgoldclus_nmatch = nmatch(pclus_pgoldclus_intersect) if debug: print banner('NMATCH(PCLUS/PGOLDCLUS)') print pformat(pclus_pgoldclus_nmatch) print return pclus_pgoldclus_nmatch
def make_psubs(disc_clsdict, corpus, minlength, maxlength, verbose, debug): with verb_print('constructing psubs set', verbose, True, True): psubs = list(Psubs(disc_clsdict, corpus, minlength=minlength, maxlength=maxlength)) if debug: print banner('PSUBS ({0})'.format(len(psubs))) print pretty_pairs(psubs) print return psubs
def make_pclus(disc_clsdict, verbose, debug): with verb_print('constructing pclus', verbose, True, True): pclus = list(tuple(sorted((f1, f2), key=lambda f: (f.name, f.interval.start))) for f1, f2 in Pclus_single(disc_clsdict)) if debug: print banner('PCLUS ({0})'.format(len(pclus))) print pretty_pairs(pclus) print return pclus
def boundary(disc_clsdict, corpus, fragments_cross, dest, verbose, n_jobs): if verbose: print banner('BOUNDARY') pc, rc = _boundary_sub(disc_clsdict, corpus, fragments_cross, 'cross', verbose, n_jobs) fc = np.fromiter((fscore(pc[i], rc[i]) for i in xrange(pc.shape[0])), dtype=np.double) with open(path.join(dest, 'boundary'), 'w') as fid: fid.write(pretty_score(pc, rc, fc, 'boundary total', sum(map(len, fragments_cross))))
def make_psubs(disc_clsdict, corpus, minlength, maxlength, verbose, debug): with verb_print('constructing psubs set', verbose, True, True): psubs = list( Psubs(disc_clsdict, corpus, minlength=minlength, maxlength=maxlength)) if debug: print banner('PSUBS ({0})'.format(len(psubs))) print pretty_pairs(psubs) print return psubs
def group(disc_clsdict, fragments_all, dest, verbose, n_jobs): if verbose: print banner('GROUP') #TODO CHECK SCORE ACROSS/WITHIN! pc, rc = _group_sub(disc_clsdict, fragments_all, 'all', verbose, n_jobs) fc = np.fromiter((fscore(pc[i], rc[i]) for i in xrange(pc.shape[0])), dtype=np.double) #pw, rw = _group_sub(disc_clsdict, fragments_within, 'within', verbose, n_jobs) #fw = np.fromiter((fscore(pw[i], rw[i]) for i in xrange(pw.shape[0])), dtype=np.double) with open(path.join(dest, 'group'), 'w') as fid: fid.write(pretty_score(pc, rc, fc, 'group total', sum(map(len, fragments_all))))
def group(disc_clsdict, fragments_within, fragments_cross, dest, verbose, n_jobs): if verbose: print banner('GROUP') pc, rc = _group_sub(disc_clsdict, fragments_cross, 'cross', verbose, n_jobs) fc = np.fromiter((fscore(pc[i], rc[i]) for i in xrange(pc.shape[0])), dtype=np.double) pw, rw = _group_sub(disc_clsdict, fragments_within, 'within', verbose, n_jobs) fw = np.fromiter((fscore(pw[i], rw[i]) for i in xrange(pw.shape[0])), dtype=np.double) with open(path.join(dest, 'group'), 'w') as fid: fid.write(pretty_score_f(pc, rc, fc, 'group total', len(fragments_cross), sum(map(len, fragments_cross)))) fid.write('\n') fid.write(pretty_score_f(pw, rw, fw, 'group within-speaker only', len(fragments_within), sum(map(len, fragments_within))))
def nlp(disc_clsdict, gold_clsdict, fragments_within, fragments_cross, dest, verbose, n_jobs): if verbose: print banner('NLP') nc, cc = _nlp_sub(disc_clsdict, gold_clsdict, fragments_cross, 'cross', verbose, n_jobs) nw, cw = _nlp_sub(disc_clsdict, gold_clsdict, fragments_within, 'within', verbose, n_jobs) with open(path.join(dest, 'nlp'), 'w') as fid: fid.write(pretty_score_nlp(nc, cc, 'NLP total', len(fragments_within), sum(map(len, fragments_within)))) fid.write('\n') fid.write(pretty_score_nlp(nw, cw, 'NLP within-speaker only', len(fragments_cross), sum(map(len, fragments_cross))))
def boundary_wrd(disc_clsdict, corpus, fragments_within, fragments_cross, dest, verbose, n_jobs, threshold=0.03): if verbose: print banner('BOUNDARY (WRD)') pc, rc = _boundary_sub(disc_clsdict, corpus, fragments_cross, 'cross', verbose, n_jobs, threshold=threshold) fc = np.fromiter((fscore(pc[i], rc[i]) for i in xrange(pc.shape[0])), dtype=np.double) pw, rw = _boundary_sub(disc_clsdict, corpus, fragments_within, 'within', verbose, n_jobs, threshold=threshold) fw = np.fromiter((fscore(pw[i], rw[i]) for i in xrange(pw.shape[0])), dtype=np.double) with open(path.join(dest, 'boundary_wrd'), 'w') as fid: fid.write(pretty_score_f(pc, rc, fc, 'boundary total', len(fragments_cross), sum(map(len, fragments_cross)))) fid.write('\n') fid.write(pretty_score_f(pw, rw, fw, 'boundary within-speaker only', len(fragments_within), sum(map(len, fragments_within))))
def match(disc_clsdict, gold_clsdict, phn_corpus, fragments_within, fragments_cross, dest, verbose, n_jobs): if verbose: print banner('MATCHING') pc, rc = _match_sub(disc_clsdict, gold_clsdict, phn_corpus, fragments_cross, 'cross', verbose, n_jobs) fc = np.fromiter((fscore(pc[i], rc[i]) for i in xrange(pc.shape[0])), dtype=np.double) pw, rw = _match_sub(disc_clsdict, gold_clsdict, phn_corpus, fragments_within, 'within', verbose, n_jobs) fw = np.fromiter((fscore(pw[i], rw[i]) for i in xrange(pw.shape[0])), dtype=np.double) with open(path.join(dest, 'matching'), 'w') as fid: fid.write(pretty_score_f(pc, rc, fc, 'match total', len(fragments_cross), sum(map(len, fragments_cross)))) fid.write('\n') fid.write(pretty_score_f(pw, rw, fw, 'match within-speaker only', len(fragments_within), sum(map(len, fragments_within))))
def load_disc(fname, corpus, split_file, truncate, verbose):
    """Load and validate the discovered fragment classes from `fname`.

    Fragments are checked against `corpus` and the valid splits listed in
    `split_file`. If `truncate` is falsy, any invalid fragment is
    reported (capped at the first 100) and the program exits; if truthy,
    invalid intervals are truncated/skipped instead. Returns the class
    dictionary of discovered fragments.
    """
    with verb_print(' loading discovered classes', verbose, True, True, True):
        split_mapping = load_split(split_file)
        disc, errors = _load_classes(fname, corpus, split_mapping)
        if not truncate:
            errors_found = len(errors) > 0
            if len(errors) > 100:
                # cap the report at the first 100 errors
                print 'There were more than 100 interval errors found.'
                print 'Printing only the first 100.'
                print
                errors = errors[:100]
            # deterministic order: by filename, then interval start
            for fragment in sorted(errors,
                                   key=lambda x: (x.name, x.interval.start)):
                print ' error: {0} [{1:.3f}, {2:.3f}]'.format(
                    fragment.name, fragment.interval.start,
                    fragment.interval.end)
            if not truncate and errors_found:
                print 'There were errors in {0}. Use option -f to'\
                    ' automatically skip invalid intervals.'.format(fname)
                sys.exit()
    if truncate:
        # NOTE(review): unlike the other call sites, verb_print is called
        # here without the verbose flag — confirm this is intended.
        with verb_print(' checking discovered classes and truncating'):
            disc, filename_errors, interval_errors = \
                truncate_intervals(disc, corpus, split_mapping)
    else:
        with verb_print(' checking discovered classes', verbose, True, True,
                        True):
            filename_errors, interval_errors = \
                check_intervals(disc, split_mapping)
    if not truncate:
        # deterministic reporting order
        filename_errors = sorted(filename_errors,
                                 key=lambda x: (x.name, x.interval.start))
        interval_errors = sorted(interval_errors,
                                 key=lambda x: (x.name, x.interval.start))
    interval_error = len(interval_errors) > 0
    filename_error = len(filename_errors) > 0
    errors_found = filename_error or interval_error
    if interval_error:
        print banner('intervals found in {0} outside of valid'
                     ' splits'.format(fname))
        if len(interval_errors) > 100:
            # cap the report at the first 100 errors
            print 'There were more than 100 interval errors found.'
            print 'Printing only the first 100.'
            print
            interval_errors = interval_errors[:100]
        for fragment in sorted(interval_errors,
                               key=lambda x: (x.name, x.interval.start)):
            print ' error: {0} [{1:.3f}, {2:.3f}]'.format(
                fragment.name, fragment.interval.start,
                fragment.interval.end)
    if filename_error:
        print banner('unknown filenames found in {0}'.format(fname))
        if len(filename_errors) > 100:
            # cap the report at the first 100 errors
            print 'There were more than 100 filename errors found.'
            print 'Printing only the first 100.'
            print
            filename_errors = filename_errors[:100]
        for fragment in sorted(filename_errors,
                               key=lambda x: (x.name, x.interval.start)):
            print ' error: {0}'.format(fragment.name)
    if not truncate and errors_found:
        # abort: input has invalid fragments and automatic truncation
        # was not requested
        print 'There were errors in {0}. Use option -f to automatically skip invalid intervals.'.format(
            fname)
        sys.exit()
    return disc
'xitsonga.intervals.within') gold_clsfile = path.join(resource_dir, 'xitsonga.classes') phn_corpus_file = path.join(resource_dir, 'xitsonga.phn') wrd_corpus_file = path.join(resource_dir, 'xitsonga.wrd') split_file = path.join(resource_dir, 'xitsonga.split') if verbose: print 'xitsonga_eval2 version {0}'.format(VERSION) print '--------------------------' print 'dataset: xitsonga' print 'inputfile: {0}'.format(disc_clsfile) print 'destination: {0}'.format(dest) print if verbose: print banner('LOADING FILES') wrd_corpus = load_wrd_corpus(wrd_corpus_file, verbose) phn_corpus = load_phn_corpus(phn_corpus_file, verbose) fragments_cross = load_fragments_cross(fragments_cross_file, verbose) fragments_within = load_fragments_within(fragments_within_file, verbose) truncate = args['truncate'] disc_clsdict = load_disc(disc_clsfile, phn_corpus, split_file, truncate, verbose) gold_clsdict = load_gold(gold_clsfile, phn_corpus, verbose) try: os.makedirs(dest) except OSError:
def load_disc(fname, corpus, split_file, truncate, verbose):
    """Load and validate the discovered fragment classes from `fname`.

    Fragments are checked against `corpus` and the valid splits listed in
    `split_file`. If `truncate` is falsy, any invalid fragment is
    reported (capped at the first 100) and the program exits; if truthy,
    invalid intervals are truncated/skipped instead. Returns the class
    dictionary of discovered fragments.

    NOTE(review): a near-identical `load_disc` appears earlier — likely
    these come from separate evaluation scripts.
    """
    with verb_print(' loading discovered classes', verbose, True, True, True):
        split_mapping = load_split(split_file)
        disc, errors = _load_classes(fname, corpus, split_mapping)
        if not truncate:
            errors_found = len(errors) > 0
            if len(errors) > 100:
                # cap the report at the first 100 errors
                print 'There were more than 100 interval errors found.'
                print 'Printing only the first 100.'
                print
                errors = errors[:100]
            # deterministic order: by filename, then interval start
            for fragment in sorted(errors,
                                   key=lambda x: (x.name, x.interval.start)):
                print ' error: {0} [{1:.3f}, {2:.3f}]'.format(
                    fragment.name, fragment.interval.start,
                    fragment.interval.end)
            if not truncate and errors_found:
                print 'There were errors in {0}. Use option -f to'\
                    ' automatically skip invalid intervals.'.format(fname)
                sys.exit()
    if truncate:
        # NOTE(review): unlike the other call sites, verb_print is called
        # here without the verbose flag — confirm this is intended.
        with verb_print(' checking discovered classes and truncating'):
            disc, filename_errors, interval_errors = \
                truncate_intervals(disc, corpus, split_mapping)
    else:
        with verb_print(' checking discovered classes', verbose, True, True,
                        True):
            filename_errors, interval_errors = \
                check_intervals(disc, split_mapping)
    if not truncate:
        # deterministic reporting order
        filename_errors = sorted(filename_errors,
                                 key=lambda x: (x.name, x.interval.start))
        interval_errors = sorted(interval_errors,
                                 key=lambda x: (x.name, x.interval.start))
    interval_error = len(interval_errors) > 0
    filename_error = len(filename_errors) > 0
    errors_found = filename_error or interval_error
    if interval_error:
        print banner('intervals found in {0} outside of valid'
                     ' splits'.format(fname))
        if len(interval_errors) > 100:
            # cap the report at the first 100 errors
            print 'There were more than 100 interval errors found.'
            print 'Printing only the first 100.'
            print
            interval_errors = interval_errors[:100]
        for fragment in sorted(interval_errors,
                               key=lambda x: (x.name, x.interval.start)):
            print ' error: {0} [{1:.3f}, {2:.3f}]'.format(
                fragment.name, fragment.interval.start,
                fragment.interval.end)
    if filename_error:
        print banner('unknown filenames found in {0}'
                     .format(fname))
        if len(filename_errors) > 100:
            # cap the report at the first 100 errors
            print 'There were more than 100 filename errors found.'
            print 'Printing only the first 100.'
            print
            filename_errors = filename_errors[:100]
        for fragment in sorted(filename_errors,
                               key=lambda x: (x.name, x.interval.start)):
            print ' error: {0}'.format(fragment.name)
    if not truncate and errors_found:
        # abort: input has invalid fragments and automatic truncation
        # was not requested
        print 'There were errors in {0}. Use option -f to automatically skip invalid intervals.'.format(fname)
        sys.exit()
    return disc
# Driver body for the globalphone NED evaluation. Presumably executed
# inside a main() that defines `lang`, `match_fn`, `dest`, `verbose` and
# `n_jobs` — TODO confirm against the enclosing script.
rdir = path.dirname(path.realpath(__file__))
resource_dir = path.join(rdir, 'resources')
prefix = 'globalphone-' + lang
# gold phone transcription for the selected language
phn_corpus_file = path.join(resource_dir, prefix + '.phn')
if verbose:
    print 'globalphone_eval2 version {0}'.format(VERSION)
    print '----------------------------'
    print 'dataset: globalphone-' + lang
    print 'inputfile: {0}'.format(match_fn)
    print 'destination: {0}'.format(dest)
    print
if verbose:
    print banner('Loading phone corpus.')
phn_corpus = load_corpus_txt(phn_corpus_file)
if verbose:
    print banner('Loading matches from master_match.')
matches = load_match_file(match_fn, phn_corpus)
# per-match NED and DTW scores, written one pair per line
ned_scores, dtw_scores = ned_sub(matches, verbose, n_jobs)
with open(dest, 'w') as f:
    for ned_score, dtw_score in zip(ned_scores, dtw_scores):
        f.write("%.4f %.4f\n" % (ned_score, dtw_score))
# sns.jointplot(np.array(dtw_scores), np.array(ned_scores), kind='kde')
# plt.show()
fragments_within_file = path.join(resource_dir, 'sample.intervals.within') gold_clsfile = path.join(resource_dir, 'sample.classes') phn_corpus_file = path.join(resource_dir, 'sample.phn') wrd_corpus_file = path.join(resource_dir, 'sample.wrd') split_file = path.join(resource_dir, 'sample.split') if verbose: print 'sample_eval2 version {0}'.format(VERSION) print '--------------------------' print 'dataset: sample' print 'inputfile: {0}'.format(disc_clsfile) print 'destination: {0}'.format(dest) print if verbose: print banner('LOADING FILES') wrd_corpus = load_wrd_corpus(wrd_corpus_file, verbose) phn_corpus = load_phn_corpus(phn_corpus_file, verbose) fragments_cross = load_fragments_cross(fragments_cross_file, verbose) fragments_within = load_fragments_within(fragments_within_file, verbose) truncate = args['truncate'] disc_clsdict = load_disc(disc_clsfile, phn_corpus, split_file, truncate, verbose) gold_clsdict = load_gold(gold_clsfile, phn_corpus, verbose) try: os.makedirs(dest) except OSError: