def process(h5file, ratio):
    """Aggregate the ``/pp_nonpolar`` tables per isomer and write summaries.

    For each of "scyllo", "chiro" and "water", every node under
    ``/pp_nonpolar`` whose name matches the isomer (and, except for water,
    ``ratio``) is read. Column 1 of each table is stacked into a
    rows-by-tables matrix, summarised with ``utils.summary_statistics`` and
    smoothed with ``utils.smooth``.

    Args:
        h5file: HDF5 handle providing ``listNodes(where=...)``; it is also
            handed to ``myh5.getTableAsMatrix``.
        ratio: String embedded in the table-name regex and used as the
            prefix of both output file names.

    Returns:
        The ``numpy.hstack`` of the smoothed average and smoothed standard
        deviation arrays of all isomers, in isomer order.

    Raises:
        ValueError: If no table could be read for one of the isomers.

    Side effects:
        Writes ``<ratio>_pp_nonpolar_smoothed.txt.gz`` and
        ``<ratio>_avg_pp_nonpolar_contact.txt`` and prints progress output.
    """
    isomerlist = ["scyllo", "chiro", "water"]
    group = '/pp_nonpolar'
    table_suffix = "pp_nonpolar_vs_t.xvg"
    plot_data = []
    mean_contact_list = []
    std_contact_list = []

    # Read in the tables for each system and aggregate them.
    for iso in isomerlist:
        print("processing %s" % iso)
        # NOTE(review): the suffix is interpolated unescaped, so its "."
        # matches any character; left as is because stored table names may
        # rely on that.
        if iso == "water":
            # The pattern for water leaves out the ratio.
            pattern = re.compile(
                r"%(iso)s.*%(format)s" % {'iso': iso, 'format': table_suffix})
        else:
            pattern = re.compile(
                r"%(iso)s.*%(ratio)s.*%(format)s"
                % {'iso': iso, 'ratio': ratio, 'format': table_suffix})

        datalist = []
        for table in h5file.listNodes(where=group):
            if not pattern.search(table.name):
                continue
            # HDF5 node paths always use "/", so the path is built by hand
            # instead of with the platform-dependent os.path.join.
            table_path = group + '/' + table.name
            data = myh5.getTableAsMatrix(h5file, table_path)
            if data is None:
                print("no data was read in")
                continue
            data = data.astype('float')
            datalist.append(data[0:config.LASTFRAME, 1])

        print("datalist %s" % (datalist,))
        if not datalist:
            # numpy.vstack would raise a ValueError as well, but without
            # saying which isomer had no data.
            raise ValueError(
                "no %s table could be read for isomer %r" % (group, iso))

        # One column per table after the transpose.
        data_matrix = numpy.transpose(numpy.vstack(datalist))
        print("data_matrix %s %s" % (data_matrix, data_matrix.shape))

        avg, std = utils.summary_statistics(data_matrix, sum_across="columns")

        # Per-table average over the analysis window, then mean and spread
        # of those averages across tables.
        avg_contacts = numpy.average(
            data_matrix[config.STARTFRAME:config.LASTFRAME], axis=0)
        mean_contact = numpy.average(avg_contacts)
        std_contact = numpy.std(avg_contacts)
        print(mean_contact)
        print(std_contact)
        mean_contact_list.append(mean_contact)
        std_contact_list.append(std_contact)

        # NOTE(review): time_present differs between the two calls below;
        # this mirrors the original code -- confirm it is intended.
        avg_smoothed = utils.smooth(
            avg / config.NMOLECULES, 500, time_present=False, timestep=2)
        std_smoothed = utils.smooth(
            std / config.NMOLECULES, 500, time_present=True, timestep=2)
        plot_data.append(avg_smoothed)
        plot_data.append(std_smoothed)

    timeseries_matrix = numpy.hstack(plot_data)
    print("timeseries_matrix %s %s"
          % (timeseries_matrix, timeseries_matrix.shape))
    print("time %s" % (timeseries_matrix[:, 0],))

    numpy.savetxt(ratio + "_pp_nonpolar_smoothed.txt.gz",
                  timeseries_matrix, fmt='%0.3f')
    utils.savetxt(ratio + "_avg_pp_nonpolar_contact.txt",
                  "#scyllo chiro water",
                  numpy.vstack([mean_contact_list, std_contact_list]),
                  fmt='%0.3f')
    return timeseries_matrix
def paired_bootstrap_resampling(ref, out, otherout, num_samples,
                                sample_percent, dummy_ref, dummy_out,
                                dummy_otherout, verbose=False):
    """Compute paired bootstrap BLEU scores for two systems.

    In each round the same random subset of sentence indices (drawn with
    replacement by ``np.random.choice``) is taken from the reference and
    from both outputs, written to the ``dummy_*`` files, and scored with
    ``bleu_score``.

    Args:
        ref: Reference sentences, or whatever ``load_if_file`` accepts.
        out: Output of the first system (same convention as ``ref``).
        otherout: Output of the second system (same convention as ``ref``).
        num_samples: Number of bootstrap rounds.
        sample_percent: Size of each subset as a percentage of the corpus.
        dummy_ref: Scratch file path for the resampled reference.
        dummy_out: Scratch file path for the resampled first output.
        dummy_otherout: Scratch file path for the resampled second output.
        verbose: If true, print progress roughly every 10% of the rounds.

    Returns:
        ``np.ndarray`` with one ``[bleu_out, bleu_otherout]`` row per round.

    Raises:
        AssertionError: If the three inputs differ in length.

    Note:
        The scratch files are overwritten every round and are not removed
        here.
    """
    ref = load_if_file(ref)
    out = load_if_file(out)
    otherout = load_if_file(otherout)

    n = len(ref)
    assert n == len(out), 'Mismatched reference and output file size'
    assert n == len(
        otherout), 'Mismatched reference and other output file size'

    k = int(sample_percent * n / 100)
    # Clamp to 1: with fewer than ten rounds ``num_samples // 10`` is zero
    # and would make the modulo below raise ZeroDivisionError.
    report_every = max(1, num_samples // 10)

    bleus = []
    for i in range(num_samples):
        subset = np.random.choice(n, k)
        savetxt(dummy_out, out[subset])
        savetxt(dummy_otherout, otherout[subset])
        savetxt(dummy_ref, ref[subset])
        bleu1, _ = bleu_score(dummy_ref, dummy_out)
        bleu2, _ = bleu_score(dummy_ref, dummy_otherout)
        bleus.append([bleu1, bleu2])

        done = i + 1
        if verbose and done % report_every == 0:
            # Report the true completed fraction so the figure never
            # exceeds 100 when num_samples is not a multiple of ten.
            print('%d%% done' % (100 * done // num_samples))
            sys.stdout.flush()

    return np.asarray(bleus)
def txtsave(sims, output):
    """Write formatted 'high', 'low' and 'high/low' values for each simulation.

    For every simulation the accumulated ``output`` at the final time point
    is evaluated for the 'high' and 'low' selectors, formatted, and saved
    with ``utils.savetxt`` under ``<config.path['data']>/values/``. A third
    file holds the high/low ratio.

    Args:
        sims: Mapping of simulation name to simulation object. Each object
            must provide ``outputs``, ``model.select`` and ``t``.
        output: Name of the output to save; must be a key of the formatter
            table below ('prevalence', 'C' or 'ratio').

    Raises:
        KeyError: If ``output`` has no formatter (only when ``sims`` is
            non-empty).
    """
    def vfun(sim, output, selector):
        # Accumulated output for the selected group, sliced at the last
        # time point of the simulation.
        return modelutils.taccum(
            sim.outputs[output],
            **sim.model.select[selector]).islice(t=sim.t[-1])

    def fname(name, output, selector):
        # <data>/values/<shortname>-<output>-<selector>.txt
        return os.path.join(
            config.path['data'], 'values',
            '-'.join([shortname(name), output, selector]) + '.txt')

    fmts = {
        # The emitted text ends in a literal backslash + percent sign
        # (presumably for LaTeX). The backslash is escaped explicitly
        # because '\%' is an invalid escape sequence.
        'prevalence': lambda x: '{:.0f}\\%'.format(100 * float(x)),
        'C': lambda x: '{:.1f}'.format(float(x)),
        'ratio': lambda x: '{:.1f}'.format(float(x)),
    }

    for name, sim in sims.items():
        # Each value is computed once and reused for the ratio.
        high = vfun(sim, output, 'high')
        utils.savetxt(fname(name, output, 'high'), fmts[output](high))
        low = vfun(sim, output, 'low')
        utils.savetxt(fname(name, output, 'low'), fmts[output](low))
        utils.savetxt(fname(name, output, 'ratio'),
                      fmts['ratio'](high / low))
def eval_user_adaptation(opt):
    """Compare baseline and user-adapted translations with BLEU.

    For every (source, target) file pair listed in ``opt.usr_file_list`` the
    data is split into train and test parts, the model is reloaded and its
    user vector reset, the test part is translated with the unadapted model,
    and then translated again via ``adapt_user`` using at most
    ``opt.max_n_train`` training pairs. Translations of all users are pooled,
    scored with BLEU, and compared by paired bootstrap resampling.

    Args:
        opt: Experiment options object. Attributes read directly here:
            ``verbose``, ``usr_file_list``, ``update_mode``,
            ``user_recognizer``, ``n_test``, ``beam_size``, ``max_n_train``,
            ``bootstrap_num_samples`` and ``bootstrap_sample_size``.

    Raises:
        SystemExit: If ``update_mode`` is 'mixture_weights' while
            ``user_recognizer`` is not 'fact_voc'.

    Side effects:
        Writes the gold/base/adapt temp files (left on disk) and the
        bootstrap temp files (removed before returning), and logs the
        scores.
    """
    log = utils.Logger(opt.verbose)
    # NOTE(review): the timer is never read below; the call is kept in case
    # constructing it matters.
    timer = utils.Timer()
    # Read vocabs
    lexicon = helpers.get_lexicon(opt)
    # Read data
    filepairs = load_user_filepairs(opt.usr_file_list)
    # No target language model is passed to the model builder
    lang_model = None
    # Load model
    s2s = helpers.build_model(opt, lexicon, lang_model, test=True)
    if (opt.update_mode == 'mixture_weights'
            and opt.user_recognizer != 'fact_voc'):
        log.info('Updating only the mixture weights doesn\'t make sense here')
        # ``exit()`` is a helper injected by the ``site`` module and is not
        # guaranteed to exist; raising SystemExit ends the program the same
        # way.
        raise SystemExit()
    s2s.lm = lexicon.trg_unigrams
    # Trainer
    trainer = helpers.get_trainer(opt, s2s)
    # Print config
    if opt.verbose:
        options.print_config(opt,
                             src_dict_size=len(lexicon.w2ids),
                             trg_dict_size=len(lexicon.w2idt))

    # These will store translations and gold sentences across all users
    base_translations = []
    adapt_translations = []
    gold = []

    for src_file, trg_file in filepairs:
        log.info('Evaluating on files %s'
                 % os.path.basename(src_file).split()[0])
        # Load file pair
        src_data = data.read_corpus(src_file, lexicon.w2ids, raw=True)
        trg_data = data.read_corpus(trg_file, lexicon.w2idt, raw=True)
        # Split train/test (the fifth returned value is not needed here)
        train_src, test_src, train_trg, test_trg, _ = split_user_data(
            src_data, trg_data, n_test=opt.n_test)
        # Convert train data to indices
        train_src = lexicon.sents_to_ids(train_src)
        train_trg = lexicon.sents_to_ids(train_trg, trg=True)
        # Save test data
        gold.extend(' '.join(s) for s in test_trg)
        # Reset model so every user starts from the same state
        s2s.load()
        s2s.reset_usr_vec()
        # Translate with baseline model
        base_translations.extend(
            evaluate_model(s2s, test_src, opt.beam_size))
        # Adapt on at most max_n_train pairs, then translate the test set
        n_train = opt.max_n_train
        adapt_translations.extend(
            adapt_user(s2s, trainer, train_src[:n_train],
                       train_trg[:n_train], test_src, opt))

    # Temp files
    temp_gold = utils.exp_temp_filename(opt, 'gold.txt')
    temp_base = utils.exp_temp_filename(opt, '%s_base.txt' % opt.update_mode)
    temp_adapt = utils.exp_temp_filename(opt,
                                         '%s_adapt.txt' % opt.update_mode)
    utils.savetxt(temp_gold, gold)
    utils.savetxt(temp_base, base_translations)
    utils.savetxt(temp_adapt, adapt_translations)

    # Evaluate base translations
    bleu, details = evaluation.bleu_score(temp_gold, temp_base)
    log.info('Base BLEU score: %.2f' % bleu)
    # Evaluate adapted translations
    bleu, details = evaluation.bleu_score(temp_gold, temp_adapt)
    log.info('Adaptation BLEU score: %.2f' % bleu)

    # Compare both
    temp_bootstrap_gold = utils.exp_temp_filename(opt, 'bootstrap_gold.txt')
    temp_bootstrap_base = utils.exp_temp_filename(opt, 'bootstrap_base.txt')
    temp_bootstrap_adapt = utils.exp_temp_filename(opt, 'bootstrap_adapt.txt')
    try:
        bleus = evaluation.paired_bootstrap_resampling(
            temp_gold, temp_base, temp_adapt,
            opt.bootstrap_num_samples, opt.bootstrap_sample_size,
            temp_bootstrap_gold, temp_bootstrap_base, temp_bootstrap_adapt)
        evaluation.print_paired_stats(bleus)
    finally:
        # Remove the scratch files even if resampling fails part-way; a
        # file may be missing if the failure happened before it was written.
        for scratch in (temp_bootstrap_gold, temp_bootstrap_base,
                        temp_bootstrap_adapt):
            if os.path.exists(scratch):
                os.remove(scratch)
def save_element(output, select, tmax, phi, tau):
    """Save one accumulated output value as text when saving is enabled.

    Does nothing unless ``config.save`` is truthy. Otherwise the output is
    accumulated with ``modelutils.taccum`` using ``select`` as keyword
    arguments, sliced at ``t=tmax``, converted to ``float`` and written with
    ``utils.savetxt`` to the path given by ``fname_element``.

    Args:
        output: Output object; its ``name`` is part of the file name.
        select: Mapping of selection keywords that also has a ``name``
            attribute.
        tmax: Time at which the accumulated output is sliced.
        phi: Passed to ``fname_element`` to build the file name.
        tau: Passed to ``fname_element`` to build the file name.
    """
    if not config.save:
        return
    target = fname_element(output.name, select.name, phi=phi, tau=tau)
    accumulated = modelutils.taccum(output, **select)
    utils.savetxt(target, float(accumulated.islice(t=tmax)))
def process(h5file, ratio, format="p2p_vs_t.dat"):
    """Build smoothed per-isomer line-plot data from the ``/polar`` tables.

    For each of "scyllo", "chiro" and "water", every node under ``/polar``
    whose name matches the isomer (and, except for water, ``ratio``) and
    ``format`` is read. Columns 1 and 2 of each table are collected as the
    "inter" and "intra" series, summarised with ``utils.summary_statistics``
    and smoothed with ``utils.smooth``.

    Args:
        h5file: HDF5 handle providing ``listNodes(where=...)``; it is also
            handed to ``myh5.getTableAsMatrix``.
        ratio: Key into ``config.RATIO``; also embedded in the table-name
            regex and used as the prefix of the output file names.
        format: Table-name suffix inserted into the regex.

    Returns:
        ``(datalist, labellist)``: the smoothed plot matrix of each isomer
        and the matching plot labels, in isomer order.

    Raises:
        ValueError: If no table matched for one of the isomers.
        AssertionError: If a table does not have more rows than columns.

    Side effects:
        Writes ``<ratio>_p2p_vs_t.txt``, ``<ratio>_p2p_vs_t_smoothed.txt``
        and ``<ratio>_avg_contacts_w_err.txt`` and prints progress output.
    """
    header = "# time average_inter std_inter average_intra std_intra"
    group = '/polar'
    isomerlist = ["scyllo", "chiro", "water"]
    datalist = []
    labellist = []
    mean_contact_list = []
    std_contact_list = []
    ratiolabel = config.RATIO[ratio]

    for iso in isomerlist:
        print("processing %s" % iso)
        if iso == "water":
            # The pattern for water leaves out the ratio.
            pattern = re.compile(
                r"%(iso)s.*%(format)s" % {'iso': iso, 'format': format})
        else:
            pattern = re.compile(
                r"%(iso)s.*%(ratio)s.*%(format)s"
                % {'iso': iso, 'ratio': ratio, 'format': format})

        data_inter = []
        data_intra = []
        # Time column of the last matching table. It is tracked explicitly
        # so a stale table from a previous isomer can never be used.
        time = None
        for table in h5file.listNodes(where=group):
            if not pattern.search(table.name):
                continue
            print("processing %s" % table.name)
            # HDF5 node paths always use "/", so the path is built by hand
            # instead of with the platform-dependent os.path.join.
            table_path = group + '/' + table.name
            data = myh5.getTableAsMatrix(h5file, table_path,
                                         dtype=numpy.int32)
            data = data.astype('float')
            print("converted to float32 %s" % (data,))
            nrows, ncols = data.shape
            assert nrows > ncols
            print("Test data read in dimensions %s %s"
                  % (data.shape, data.dtype))
            data_inter.append(data[0:config.LASTFRAME, 1])
            data_intra.append(data[0:config.LASTFRAME, 2])
            time = data[0:config.LASTFRAME, 0]

        if time is None:
            raise ValueError(
                "no %s table matched isomer %r" % (group, iso))

        # Compute summary statistics
        print("summarizing statistics ... ")
        inter_matrix = utils.array_list_to_matrix(data_inter)
        intra_matrix = utils.array_list_to_matrix(data_intra)
        average_inter, std_inter = utils.summary_statistics(inter_matrix)
        average_intra, std_intra = utils.summary_statistics(intra_matrix)

        # Average of the inter matrix along axis 0, then the mean and
        # spread of those averages.
        avg_contacts = numpy.average(inter_matrix, axis=0)
        mean_contact = numpy.average(avg_contacts)
        std_contact = numpy.std(avg_contacts)
        mean_contact_list.append(mean_contact)
        std_contact_list.append(std_contact)
        print("%s %s" % (mean_contact, std_contact))

        plotdata = utils.array_list_to_matrix(
            [time, average_inter, std_inter, average_intra, std_intra])
        print("plotdata %s" % (plotdata,))
        print("Test: dimensions of plotdata for %s %s %s"
              % (iso, ratio, plotdata.shape))

        plotdata_smoothed = utils.smooth(plotdata, 500,
                                         time_present=True, timestep=2)
        print(plotdata_smoothed)
        datalist.append(plotdata_smoothed)
        print("smoothed data %s %s"
              % (plotdata_smoothed, plotdata_smoothed.shape))

        if iso == "water":
            labellist.append("water")
        else:
            labellist.append("%(iso)s (%(ratiolabel)s)"
                             % {'iso': iso, 'ratiolabel': ratiolabel})

        # NOTE(review): these file names depend only on ``ratio``, so each
        # isomer overwrites the previous one and only the last isomer's data
        # remains on disk -- confirm whether the isomer belongs in the name.
        utils.savetxt('%(ratio)s_p2p_vs_t.txt' % {'ratio': ratio},
                      header, plotdata, fmt='%0.2f')
        utils.savetxt('%(ratio)s_p2p_vs_t_smoothed.txt' % {'ratio': ratio},
                      header, plotdata_smoothed, fmt='%0.2f')

    utils.savetxt('%(ratio)s_avg_contacts_w_err.txt' % {'ratio': ratio},
                  "#scyllo chiro water",
                  numpy.vstack([mean_contact_list, std_contact_list]),
                  fmt='%0.2f')
    return (datalist, labellist)