示例#1
0
def process(h5file, ratio):
    """Aggregate the ``/pp_nonpolar`` tables of ``h5file`` for each isomer.

    For each of "scyllo", "chiro" and "water", every node under
    ``/pp_nonpolar`` whose name matches the isomer (and, except for water,
    ``ratio``) followed by ``pp_nonpolar_vs_t.xvg`` is read with
    ``myh5.getTableAsMatrix``.  Column 1 of each table, truncated to
    ``config.LASTFRAME`` rows, becomes one column of a per-isomer matrix.

    From that matrix the function derives:

    * ``utils.summary_statistics`` average/std, divided by
      ``config.NMOLECULES`` and passed through ``utils.smooth``;
    * the mean and standard deviation of the per-column averages taken over
      rows ``config.STARTFRAME:config.LASTFRAME``.

    Side effects: prints progress/debug output and writes
    ``<ratio>_pp_nonpolar_smoothed.txt.gz`` and
    ``<ratio>_avg_pp_nonpolar_contact.txt`` in the current directory.

    Args:
        h5file: HDF5 file object exposing ``listNodes(where=...)``.
        ratio: string identifying the system; it is interpolated (unescaped)
            into the table-name regular expression and used as the prefix of
            the output file names.

    Returns:
        The matrix obtained by horizontally stacking the smoothed average and
        smoothed std of every isomer, in isomer order.

    Raises:
        ValueError: if no readable table matched for one of the isomers.
    """
    isomers = ["scyllo", "chiro", "water"]
    group = '/pp_nonpolar'
    # NOTE: interpolated into the regex as-is, so the "." matches any char.
    suffix = "pp_nonpolar_vs_t.xvg"

    plot_data = []
    mean_contact_list = []
    std_contact_list = []

    for iso in isomers:
        print("processing %s" % iso)
        # Water tables carry no ratio in their name.
        if iso == "water":
            pattern = re.compile(r"%s.*%s" % (iso, suffix))
        else:
            pattern = re.compile(r"%s.*%s.*%s" % (iso, ratio, suffix))

        columns = []
        for table in h5file.listNodes(where=group):
            if not pattern.search(table.name):
                continue
            data = myh5.getTableAsMatrix(
                h5file, os.path.join(group, table.name))
            if data is None:
                print("no data was read in")
                continue
            columns.append(data.astype('float')[0:config.LASTFRAME, 1])

        print("datalist %s" % (columns,))
        if not columns:
            # numpy.vstack([]) would raise a ValueError with an opaque
            # message; fail with the same type but say what was missing.
            raise ValueError(
                "no readable table under %s matched %r"
                % (group, pattern.pattern))

        # One row per frame, one column per matching table.
        data_matrix = numpy.transpose(numpy.vstack(columns))
        print("data_matrix %s %s" % (data_matrix, data_matrix.shape))

        avg, std = utils.summary_statistics(data_matrix, sum_across="columns")

        # Per-table time averages over the analysis window, then their
        # mean and spread across tables.
        window = data_matrix[config.STARTFRAME:config.LASTFRAME]
        avg_contacts = numpy.average(window, axis=0)
        mean_contact = numpy.average(avg_contacts)
        std_contact = numpy.std(avg_contacts)
        print(mean_contact)
        print(std_contact)
        mean_contact_list.append(mean_contact)
        std_contact_list.append(std_contact)

        # NOTE(review): the two calls pass different ``time_present`` flags
        # for inputs built the same way -- confirm against utils.smooth that
        # this is intended (column 0 is printed as "time" below).
        avg_smoothed = utils.smooth(
            avg / config.NMOLECULES, 500, time_present=False, timestep=2)
        std_smoothed = utils.smooth(
            std / config.NMOLECULES, 500, time_present=True, timestep=2)
        plot_data.append(avg_smoothed)
        plot_data.append(std_smoothed)

    timeseries_matrix = numpy.hstack(plot_data)
    print("timeseries_matrix %s %s"
          % (timeseries_matrix, timeseries_matrix.shape))
    print("time %s" % (timeseries_matrix[:, 0],))
    numpy.savetxt(ratio + "_pp_nonpolar_smoothed.txt.gz",
                  timeseries_matrix, fmt='%0.3f')
    utils.savetxt(ratio + "_avg_pp_nonpolar_contact.txt",
                  "#scyllo chiro water",
                  numpy.vstack([mean_contact_list, std_contact_list]),
                  fmt='%0.3f')

    return timeseries_matrix
示例#2
0
def paired_bootstrap_resampling(ref,
                                out,
                                otherout,
                                num_samples,
                                sample_percent,
                                dummy_ref,
                                dummy_out,
                                dummy_otherout,
                                verbose=False):
    """Score two system outputs with BLEU on repeated random subsets.

    ``ref``, ``out`` and ``otherout`` are passed through ``load_if_file``;
    the results must have equal length and support indexing with an array of
    indices.  In each of ``num_samples`` rounds, ``k`` indices are drawn with
    ``np.random.choice(n, k)``, the corresponding entries of the three
    collections are written to the scratch files ``dummy_ref``, ``dummy_out``
    and ``dummy_otherout`` (overwritten every round), and ``bleu_score`` is
    evaluated for each output against the sampled reference.

    Args:
        ref: reference sentences (or whatever ``load_if_file`` accepts).
        out: first system output.
        otherout: second system output.
        num_samples: number of resampling rounds.
        sample_percent: size of each sample as a percentage of the data size.
        dummy_ref: scratch file receiving the sampled references.
        dummy_out: scratch file receiving the sampled first output.
        dummy_otherout: scratch file receiving the sampled second output.
        verbose: when true, print progress roughly every 10% of the rounds.

    Returns:
        ``np.ndarray`` holding one ``[bleu_out, bleu_otherout]`` row per
        round.

    Raises:
        AssertionError: if the three collections differ in length.
    """
    ref = load_if_file(ref)
    out = load_if_file(out)
    otherout = load_if_file(otherout)
    n = len(ref)
    assert n == len(out), 'Mismatched reference and output file size'
    assert n == len(
        otherout), 'Mismatched reference and other output file size'
    k = int(sample_percent * n / 100)
    # Report about ten times per run.  Clamp to 1 so that fewer than ten
    # rounds does not lead to a modulo by zero.
    report_every = max(1, num_samples // 10)
    bleus = []
    for i in range(num_samples):
        subset = np.random.choice(n, k)
        savetxt(dummy_out, out[subset])
        savetxt(dummy_otherout, otherout[subset])
        savetxt(dummy_ref, ref[subset])
        bleu1, _ = bleu_score(dummy_ref, dummy_out)
        bleu2, _ = bleu_score(dummy_ref, dummy_otherout)
        bleus.append([bleu1, bleu2])
        if verbose and (i + 1) % report_every == 0:
            # True completed fraction: never exceeds 100%, and equals the
            # former 10/20/.../100 steps when num_samples is a multiple of 10.
            print('%d%% done' % (100 * (i + 1) // num_samples))
            sys.stdout.flush()
    return np.asarray(bleus)
示例#3
0
def txtsave(sims, output):
    """Write the final 'high', 'low' and high/low values of ``output``.

    For every ``(name, sim)`` in ``sims`` three text files are written via
    ``utils.savetxt`` under ``config.path['data']/values``, named
    ``<shortname(name)>-<output>-<selector>.txt`` with selector ``high``,
    ``low`` and ``ratio``.  Each value is the time-accumulated output
    (``modelutils.taccum``) for the selection, sliced at the last time point
    ``sim.t[-1]``.

    Args:
        sims: mapping from simulation name to simulation object; each object
            must provide ``outputs``, ``model.select`` and ``t``.
        output: name of the output to save; also selects the number format
            (one of ``'prevalence'``, ``'C'``, ``'ratio'``).

    Raises:
        KeyError: if ``output`` has no entry in the format table (only
            raised when ``sims`` is non-empty).
    """
    def vfun(sim, output, selector):
        # Accumulated output for one selection at the final time point.
        return modelutils.taccum(
            sim.outputs[output],
            **sim.model.select[selector]).islice(t=sim.t[-1])

    def fname(name, output, selector):
        return os.path.join(
            config.path['data'], 'values',
            '-'.join([shortname(name), output, selector]) + '.txt')

    fmts = {
        # Emits a literal backslash before the percent sign (presumably for
        # LaTeX).  Written as '\\%' because '\%' is an invalid escape.
        'prevalence': lambda x: '{:.0f}\\%'.format(100 * float(x)),
        'C': lambda x: '{:.1f}'.format(float(x)),
        'ratio': lambda x: '{:.1f}'.format(float(x)),
    }
    for name, sim in sims.items():
        fmt = fmts[output]
        # Compute each accumulation once and reuse it for the ratio; the
        # write order (high, low, ratio) is kept.
        high = vfun(sim, output, 'high')
        utils.savetxt(fname(name, output, 'high'), fmt(high))
        low = vfun(sim, output, 'low')
        utils.savetxt(fname(name, output, 'low'), fmt(low))
        utils.savetxt(fname(name, output, 'ratio'),
                      fmts['ratio'](high / low))
示例#4
0
def eval_user_adaptation(opt):
    """Compare baseline and user-adapted translations with BLEU.

    Steps, all driven by the attributes of ``opt``:

    1. Build the lexicon, load the user file pairs listed in
       ``opt.usr_file_list`` and build the model and trainer.
    2. For every file pair: split into train/test, reload the model and
       reset its user vector, translate the test sources with the baseline
       model, then collect the translations returned by ``adapt_user`` for
       at most ``opt.max_n_train`` training pairs.
    3. Write gold, baseline and adapted sentences to experiment temp files,
       log both BLEU scores and print paired bootstrap statistics.

    The three bootstrap scratch files are always removed, even when
    resampling fails; the gold/base/adapt files are left in place.

    Args:
        opt: experiment options object (project-defined).

    Raises:
        SystemExit: if ``opt.update_mode`` is ``'mixture_weights'`` while
            ``opt.user_recognizer`` is not ``'fact_voc'``.
    """
    import sys

    log = utils.Logger(opt.verbose)
    timer = utils.Timer()  # noqa: F841 -- unused here; construction kept as-is
    # Read vocabs
    lexicon = helpers.get_lexicon(opt)
    # Read data
    filepairs = load_user_filepairs(opt.usr_file_list)
    # No target language model is passed to the model builder
    lang_model = None
    # Load model
    s2s = helpers.build_model(opt, lexicon, lang_model, test=True)
    if (opt.update_mode == 'mixture_weights'
            and opt.user_recognizer != 'fact_voc'):
        log.info("Updating only the mixture weights doesn't make sense here")
        # sys.exit rather than the interactive-only ``exit`` helper from
        # ``site``; the exit status is unchanged.
        sys.exit()
    s2s.lm = lexicon.trg_unigrams
    # Trainer
    trainer = helpers.get_trainer(opt, s2s)
    # Print config
    if opt.verbose:
        options.print_config(opt,
                             src_dict_size=len(lexicon.w2ids),
                             trg_dict_size=len(lexicon.w2idt))
    # These will store translations and gold sentences
    base_translations = []
    adapt_translations = []
    gold = []
    # Run the per-user evaluation
    for src_file, trg_file in filepairs:
        # NOTE(review): split() splits on whitespace, so this normally logs
        # the whole basename -- confirm whether split('.') was intended.
        log.info('Evaluating on files %s' %
                 os.path.basename(src_file).split()[0])
        # Load file pair
        src_data = data.read_corpus(src_file, lexicon.w2ids, raw=True)
        trg_data = data.read_corpus(trg_file, lexicon.w2idt, raw=True)
        # Split train/test (the returned ordering is not needed here)
        train_src, test_src, train_trg, test_trg, _ = split_user_data(
            src_data, trg_data, n_test=opt.n_test)
        # Convert train data to indices
        train_src = lexicon.sents_to_ids(train_src)
        train_trg = lexicon.sents_to_ids(train_trg, trg=True)
        # Save test data
        gold.extend(' '.join(s) for s in test_trg)
        # Reset model
        s2s.load()
        s2s.reset_usr_vec()
        # Translate with baseline model
        base_translations.extend(evaluate_model(s2s, test_src, opt.beam_size))
        # Adapt on the first n_train pairs and translate the test set
        n_train = opt.max_n_train
        adapt_translations.extend(
            adapt_user(s2s, trainer, train_src[:n_train], train_trg[:n_train],
                       test_src, opt))

    # Temp files
    temp_gold = utils.exp_temp_filename(opt, 'gold.txt')
    temp_base = utils.exp_temp_filename(opt, '%s_base.txt' % opt.update_mode)
    temp_adapt = utils.exp_temp_filename(opt, '%s_adapt.txt' % opt.update_mode)
    utils.savetxt(temp_gold, gold)
    utils.savetxt(temp_base, base_translations)
    utils.savetxt(temp_adapt, adapt_translations)
    # Evaluate base translations
    bleu, _ = evaluation.bleu_score(temp_gold, temp_base)
    log.info('Base BLEU score: %.2f' % bleu)
    # Evaluate adapted translations
    bleu, _ = evaluation.bleu_score(temp_gold, temp_adapt)
    log.info('Adaptation BLEU score: %.2f' % bleu)
    # Compare both
    temp_bootstrap_gold = utils.exp_temp_filename(opt, 'bootstrap_gold.txt')
    temp_bootstrap_base = utils.exp_temp_filename(opt, 'bootstrap_base.txt')
    temp_bootstrap_adapt = utils.exp_temp_filename(opt, 'bootstrap_adapt.txt')
    try:
        bleus = evaluation.paired_bootstrap_resampling(
            temp_gold, temp_base, temp_adapt, opt.bootstrap_num_samples,
            opt.bootstrap_sample_size, temp_bootstrap_gold,
            temp_bootstrap_base, temp_bootstrap_adapt)
        evaluation.print_paired_stats(bleus)
    finally:
        # Clean up scratch files even on failure.  They may not exist if no
        # resampling round ran, so check before removing.
        for scratch in (temp_bootstrap_gold, temp_bootstrap_base,
                        temp_bootstrap_adapt):
            if os.path.exists(scratch):
                os.remove(scratch)
示例#5
0
def save_element(output, select, tmax, phi, tau):
    """Save the accumulated value of ``output`` at time ``tmax`` as text.

    Nothing happens unless ``config.save`` is truthy.  Otherwise the target
    file name is built by ``fname_element`` from the names of ``output`` and
    ``select`` together with ``phi`` and ``tau``, and the value of
    ``modelutils.taccum(output, **select)`` sliced at ``t=tmax`` is written
    as a float via ``utils.savetxt``.

    Args:
        output: model output; must have a ``name`` attribute.
        select: selection mapping (expanded as keyword arguments) that also
            has a ``name`` attribute.
        tmax: time at which the accumulated output is sliced.
        phi: forwarded to ``fname_element``.
        tau: forwarded to ``fname_element``.
    """
    if not config.save:
        return
    target = fname_element(output.name, select.name, phi=phi, tau=tau)
    accumulated = modelutils.taccum(output, **select)
    utils.savetxt(target, float(accumulated.islice(t=tmax)))
示例#6
0
def process(h5file, ratio, format="p2p_vs_t.dat"):
    """Build smoothed line-plot data and labels from the ``/polar`` tables.

    For each of "scyllo", "chiro" and "water", every node under ``/polar``
    whose name matches the isomer (and, except for water, ``ratio``)
    followed by ``format`` is read with ``myh5.getTableAsMatrix``.
    Columns 1 and 2 of each table, truncated to ``config.LASTFRAME`` rows,
    are collected as the "inter" and "intra" series; column 0 of the last
    matching table is used as the time axis.

    Side effects: prints progress/debug output and writes
    ``<ratio>_p2p_vs_t.txt``, ``<ratio>_p2p_vs_t_smoothed.txt`` and
    ``<ratio>_avg_contacts_w_err.txt`` in the current directory.

    Args:
        h5file: HDF5 file object exposing ``listNodes(where=...)``.
        ratio: string key into ``config.RATIO``; also interpolated
            (unescaped) into the table-name regular expression and used as
            the prefix of the output file names.
        format: table-name suffix to match, interpolated (unescaped) into
            the regular expression.

    Returns:
        ``(datalist, labellist)``: the smoothed plot matrix of every isomer
        and the matching list of labels.

    Raises:
        ValueError: if no table matched for one of the isomers.
        AssertionError: if a table does not have more rows than columns.
    """
    header = "# time average_inter std_inter average_intra std_intra"
    group = '/polar'
    datalist = []
    labellist = []
    mean_contact_list = []
    std_contact_list = []

    for iso in ["scyllo", "chiro", "water"]:
        print("processing %s" % iso)
        # Water tables carry no ratio in their name.
        if iso == "water":
            pattern = re.compile(r"%s.*%s" % (iso, format))
        else:
            pattern = re.compile(r"%s.*%s.*%s" % (iso, ratio, format))

        data_inter = []
        data_intra = []
        time = None
        for table in h5file.listNodes(where=group):
            if not pattern.search(table.name):
                continue
            print("processing %s" % table.name)
            data = myh5.getTableAsMatrix(
                h5file, os.path.join(group, table.name), dtype=numpy.int32)
            data = data.astype('float')
            print("converted to float32 %s" % (data,))

            # Sanity check: rows are expected to be frames.
            nrows, ncols = data.shape
            assert nrows > ncols
            print("Test data read in dimensions %s %s"
                  % (data.shape, data.dtype))
            data_inter.append(data[0:config.LASTFRAME, 1])
            data_intra.append(data[0:config.LASTFRAME, 2])
            # Time axis comes from the last matching table.
            time = data[0:config.LASTFRAME, 0]

        if time is None:
            # Without this guard the code below hit a NameError on the first
            # isomer, or silently reused the previous isomer's table.
            raise ValueError(
                "no table under %s matched %r" % (group, pattern.pattern))

        # Compute summary statistics
        print("summarizing statistics ... ")
        inter_matrix = utils.array_list_to_matrix(data_inter)
        intra_matrix = utils.array_list_to_matrix(data_intra)
        average_inter, std_inter = utils.summary_statistics(inter_matrix)
        average_intra, std_intra = utils.summary_statistics(intra_matrix)

        # Average of the inter matrix along axis 0, then the mean and std
        # of those averages.
        avg_contacts = numpy.average(inter_matrix, axis=0)
        mean_contact = numpy.average(avg_contacts)
        std_contact = numpy.std(avg_contacts)
        mean_contact_list.append(mean_contact)
        std_contact_list.append(std_contact)
        print("%s %s" % (mean_contact, std_contact))

        plotdata = utils.array_list_to_matrix(
            [time, average_inter, std_inter, average_intra, std_intra])
        print("plotdata %s" % (plotdata,))
        print("Test: dimensions of plotdata for %s %s %s"
              % (iso, ratio, plotdata.shape))
        plotdata_smoothed = utils.smooth(
            plotdata, 500, time_present=True, timestep=2)
        print(plotdata_smoothed)

        datalist.append(plotdata_smoothed)
        print("smoothed data %s %s"
              % (plotdata_smoothed, plotdata_smoothed.shape))

        ratiolabel = config.RATIO[ratio]
        if iso == "water":
            labellist.append("water")
        else:
            labellist.append("%s (%s)" % (iso, ratiolabel))

    # NOTE(review): at this point plotdata / plotdata_smoothed hold only the
    # last isomer ("water"), so these two files contain water alone --
    # confirm whether per-isomer files were intended.
    utils.savetxt('%s_p2p_vs_t.txt' % (ratio,), header, plotdata,
                  fmt='%0.2f')
    utils.savetxt('%s_p2p_vs_t_smoothed.txt' % (ratio,), header,
                  plotdata_smoothed, fmt='%0.2f')
    utils.savetxt('%s_avg_contacts_w_err.txt' % (ratio,),
                  "#scyllo chiro water",
                  numpy.vstack([mean_contact_list, std_contact_list]),
                  fmt='%0.2f')

    return (datalist, labellist)