import glob
import numpy as np
import test_flu


def load_polarizer_data(params, metric='nuc'):
    analysis_folder = test_flu.flu_analysis_folder
    # construct file mask and load files
    base_name, name_mod = test_flu.get_fname(params)
    fmask = analysis_folder + '_'.join([base_name, 'polarizer', metric]) + '.dat'
    print "loading", fmask
    flist = glob.glob(fmask)

    predictions = []
    for fname in flist:
        # the prediction year is encoded in the file name, right after the folder prefix
        year = int(fname.split('_')[len(analysis_folder.split('_')) + 1])
        predictions.append((year, np.loadtxt(fname)))
    # sort predictions by year
    predictions.sort(key=lambda x: x[0])

    # make a list of all years for which we have predictions
    years = np.array([a[0] for a in predictions])
    # calculate averages over the 50 replicates done for each year and normalize to random picks
    mean_distance = np.array([a[1][:, 0].mean() for a in predictions])
    minimal_distance = np.array([a[1][:, 1].mean() for a in predictions]) / mean_distance
    average_distances = np.array([a[1].mean(axis=0) / mean_distance[ai]
                                  for ai, a in enumerate(predictions)])
    # normalize the observed distances such that random = 1 and best = 0
    normed_distances = ((average_distances[:, 1:].T - minimal_distance)
                        / (1 - minimal_distance)).T
    return years, average_distances, normed_distances
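# Hedged usage sketch, not part of the original module: plot the normalized
# polarizer prediction quality per year. It assumes `params` comes from
# test_flu.make_flu_parser() and that normed_distances has one column per
# polarizer memory time scale.
def example_plot_polarizer(params):
    from matplotlib import pyplot as plt
    years, average_distances, normed_distances = load_polarizer_data(params, metric='nuc')
    plt.plot(years, normed_distances, 'o-')
    plt.xlabel('prediction year')
    plt.ylabel('normalized distance (0 = optimal pick, 1 = random pick)')
    plt.show()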
def load_date_distribution(params, top_strain_method):
    '''
    returns the sampling dates of all predicted strains, measured in days
    relative to Jan 1st of the year preceding the prediction year
    '''
    from datetime import date
    # construct file mask and load files
    analysis_folder = test_flu.flu_analysis_folder
    base_name, name_mod = test_flu.get_fname(params)
    fmask = analysis_folder + '_'.join([base_name, name_mod, top_strain_method, 'topstrains.dat'])
    flist = glob.glob(fmask)

    sampling_dates = {}
    for fname in flist:
        tmp_dates = []
        # the prediction year is encoded in the file name, right after the folder prefix
        year = int(fname.split('_')[len(analysis_folder.split('_')) + 1])
        base_line = date(year - 1, 1, 1).toordinal()
        with open(fname, 'r') as infile:
            for line in infile:
                # the sampling date is the last column, formatted as YYYY-MM-DD
                strain_year, strain_month, strain_day = map(int, line.split()[-1].split('-'))
                tmp_dates.append(date(strain_year, strain_month, strain_day).toordinal() - base_line)
        sampling_dates[year] = tmp_dates
    return sampling_dates
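# Hedged usage sketch, not part of the original module: summarize how late in
# the season the top-ranked strains were sampled. The top_strain_method value
# mirrors the one used by the prediction script below.
def example_summarize_sampling_dates(params):
    sampling_dates = load_date_distribution(params, top_strain_method='mean_fitness')
    for year in sorted(sampling_dates):
        dates = sampling_dates[year]
        print year, "strains:", len(dates), "median sampling day:", np.median(dates)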
def load_prediction_data(params, metric='nuc'):
    # `methods`, `boot_strap`, and `laessig_years` are module-level names
    # (see the sketch after this function)
    analysis_folder = test_flu.flu_analysis_folder
    boost = params.boost
    # construct file mask and load files
    base_name, name_mod = test_flu.get_fname(params)
    fmask = analysis_folder + '_'.join([base_name, name_mod, metric]) + '.dat'
    print "loading", fmask
    flist = glob.glob(fmask)

    predictions = []
    for fname in flist:
        year = int(fname.split('_')[len(analysis_folder.split('_')) + 1])
        predictions.append((year, np.loadtxt(fname)))
    # sort predictions by year
    predictions.sort(key=lambda x: x[0])

    # make a list of all years for which we have predictions
    years = np.array([a[0] for a in predictions])
    # calculate averages over the 50 replicates done for each year and normalize to random picks
    average_distance = np.array([a[1][:, 0].mean() for a in predictions])
    minimal_distance = np.array([a[1][:, 1].mean() for a in predictions]) / average_distance
    prediction_distances = {(method, boost, label):
                                np.array([a[1][:, methodi].mean() for a in predictions]) / average_distance
                            for methodi, method, label in methods}
    prediction_distances[('average', boost, 'average')] = average_distance
    prediction_distances[('minimal', boost, 'minimal')] = minimal_distance

    # normalize the observed distances such that random = 1 and best = 0,
    # and bootstrap the mean over years
    normed_distances = {method: (np.mean((preds - minimal_distance) / (1 - minimal_distance)),
                                 boot_strap((preds - minimal_distance) / (1 - minimal_distance), 1000))
                        for method, preds in prediction_distances.iteritems()}

    # the L&L predictions sit in column 2 and are evaluated only for the years
    # covered by the L&L analysis
    LL_key = ('L&L', boost, 'L\&L')
    prediction_distances[LL_key] = np.array([a[1][:, 2].mean() for a in predictions]) / average_distance
    LL_normed = ((prediction_distances[LL_key] - minimal_distance)
                 / (1 - minimal_distance))[laessig_years(years)]
    normed_distances[LL_key] = (np.mean(LL_normed), boot_strap(LL_normed, 1000))
    return years, prediction_distances, normed_distances
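# load_prediction_data relies on three module-level names defined elsewhere in
# the original analysis code: `methods` (a list of (column_index, method_name,
# label) tuples), `boot_strap`, and `laessig_years`. The sketch below shows
# plausible implementations; these are assumptions, not the originals.

def boot_strap(values, n_boot=1000):
    # resample the per-year values with replacement and return the standard
    # deviation of the bootstrapped means as a simple error estimate
    values = np.asarray(values)
    boot_means = np.array([values[np.random.randint(0, len(values), len(values))].mean()
                           for ii in xrange(n_boot)])
    return boot_means.std()

# placeholder bounds: the actual years covered by the published
# Luksza & Laessig predictions must be taken from the original analysis
FIRST_LL_YEAR, LAST_LL_YEAR = 2003, 2013

def laessig_years(years):
    # boolean mask selecting the years for which L&L predictions exist
    return (years >= FIRST_LL_YEAR) & (years <= LAST_LL_YEAR)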
import tree_utils
import test_flu
import flu
import numpy as np
from scipy import stats
import glob, pickle, gzip, os, argparse
from datetime import date

analysis_folder = test_flu.flu_analysis_folder

# parse the command line arguments
parser = test_flu.make_flu_parser()
params = parser.parse_args()
# carets are used as space placeholders on the command line
params.pred = params.pred.replace('^', ' ')
params.test = params.test.replace('^', ' ')
params.subsample = 0.7

# get run specific file names
fname_base, name_mod = test_flu.get_fname(params)
top_strain_method = 'mean_fitness'

# allocate arrays to save the predictions
nuc_dist_array = np.zeros((params.nreps, 12))
epi_dist_array = np.zeros((params.nreps, 12))
top_strains = []
for ii in xrange(params.nreps):
    # set up the prediction and pass all parameters to the wrapper function
    prediction = test_flu.predict_params(['mean_fitness',
                                          'expansion_score',
                                          'depth',
                                          'polarizer',
                                          flu.combined_ranking_internal,
                                          flu.combined_ranking_external],
                                         params)

    # define the methods for which the predictions are to be evaluated
    methods = [('mean_fitness', '_ext', prediction.terminals),
import test_flu
from matplotlib import pyplot as plt
import numpy as np
from scipy import stats
import glob, pickle, gzip, os, argparse
from datetime import date

plt.rcParams.update(test_flu.mpl_params)
tree_figure_folder = '../figures_trees/'
analysis_folder = test_flu.flu_analysis_folder

# parse the command line arguments
parser = test_flu.make_flu_parser()
parser.add_argument('--tau', default=1.0, type=float,
                    help='memory time scale of the tree polarizer')
params = parser.parse_args()
# get name snippets to link output files to run parameters
base_name, name_mod = test_flu.get_fname(params)
params.gamma = 1.0
params.diffusion = 1.0

# set up the prediction and pass all parameters to the wrapper function
prediction = test_flu.predict_params(['polarizer'], params)
prediction.calculate_polarizers(params.tau)

# define the methods for which the predictions are to be evaluated
methods = [('polarizer', '_ext', prediction.terminals),
           ('polarizer', '_int', prediction.non_terminals)]
distances, distances_epi, test_data = test_flu.evaluate(prediction, methods, params)

# calculate the fitness differentials for each internal branch and associate them
# with the different types of mutations that happen on these branches
dfit = []
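# Hedged sketch of the truncated continuation: accumulate polarizer
# differentials across internal branches and pair each with the mutations on
# that branch. The node attributes used here (`clades` for children,
# `polarizer` for the ranking score, `mutations` for branch annotations) are
# assumptions about the tree API, not confirmed by the excerpt above.
for node in prediction.non_terminals:
    for child in node.clades:
        if hasattr(node, 'polarizer') and hasattr(child, 'polarizer'):
            dfit.append((child.polarizer - node.polarizer,
                         getattr(child, 'mutations', [])))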