def make_test_set(prediction, params):
    """Select the sequences against which a prediction is tested.

    The time window of the test set is derived from ``params.test``, a
    comma-separated list of region names:

    * if one of the regions is ``"oceania"``: March 1 to September 30 of
      ``params.year``;
    * else, if ``params.test`` contains ``'toy'``: a window of width
      ``0.5 * params.dt`` centered on ``params.gen + params.valdt * params.dt``
      with a fixed sample size of 50;
    * otherwise the regions are assumed to be northern hemisphere: October 1
      of ``params.year`` to March 31 of the following year.

    Parameters
    ----------
    prediction
        Object whose ``data`` attribute supplies ``aln_file_name``,
        ``outgroup``, ``annotation`` and ``cds``; these are handed on to
        ``PF.flu_alignment`` so the test set is drawn from the same alignment.
    params
        Parameter object. Attributes read here: ``test``, ``collapse`` and,
        depending on the branch, ``year`` and ``sample_size`` or ``gen``,
        ``valdt`` and ``dt``.

    Returns
    -------
    tuple
        ``(test_data, test_set)`` where ``test_data`` is the object returned
        by ``PF.flu_alignment`` (built with ``build_tree=False``) and
        ``test_set`` is a dict with the keys ``'start'``, ``'stop'``,
        ``'regions'`` and ``'sample_size'``.
    """
    test_regions = params.test.split(',')

    # define the test data set by choosing start and stop dates
    if "oceania" in test_regions:
        year = params.year
        test_set = {'start': date(year, 3, 1),
                    'stop': date(year, 9, 30),
                    'regions': test_regions,
                    'sample_size': params.sample_size}
    elif 'toy' in params.test:
        # simulated data: time is measured in generations, not calendar dates
        test_set = {'start': params.gen + (params.valdt - 0.25) * params.dt,
                    'stop': params.gen + (params.valdt + 0.25) * params.dt,
                    'regions': test_regions,
                    'sample_size': 50}
    else:
        # assume northern hemisphere
        year = params.year
        test_set = {'start': date(year, 10, 1),
                    'stop': date(year + 1, 3, 31),
                    'regions': test_regions,
                    'sample_size': params.sample_size}

    # one selection criterion per region, all sharing window and sample size
    criteria = [[test_set['start'], test_set['stop'], [reg], test_set['sample_size']]
                for reg in test_set['regions']]

    # load data to test
    test_data = PF.flu_alignment(prediction.data.aln_file_name,
                                 prediction.data.outgroup,
                                 prediction.data.annotation,
                                 cds=prediction.data.cds,
                                 criteria=criteria,
                                 collapse=params.collapse,
                                 build_tree=False)
    return test_data, test_set
def make_combined_data(prediction, test_data, otherseqsnames=None, collapse = False):
    """Build one alignment holding the prediction and the test sequences.

    The combined data set is needed for plotting.

    Parameters
    ----------
    prediction
        Object whose ``data.aln`` is iterable over sequence records with a
        ``name`` attribute and provides ``get_alignment_length()``.
    test_data
        Test data set; its ``aln`` is used like ``prediction.data.aln`` and
        its ``aln_file_name``, ``outgroup``, ``annotation`` and ``cds`` are
        handed to ``PF.flu_alignment``.
    otherseqsnames
        Optional iterable of additional sequence names. Names not already
        present are appended in the order given; duplicates are skipped.
    collapse
        Passed through to ``PF.flu_alignment``.

    Returns
    -------
    The object returned by ``PF.flu_alignment`` for the combined name list.

    Raises
    ------
    AssertionError
        If the prediction and test alignments differ in length.
    """
    assert prediction.data.aln.get_alignment_length() == test_data.aln.get_alignment_length(), \
        "predict_and_test: prediction and test alignment have different length"

    # prediction names first, then test names (kept as-is, no de-duplication)
    seqname_list = [seq.name for seq in prediction.data.aln]
    seqname_list.extend([seq.name for seq in test_data.aln])

    if otherseqsnames:
        # set lookup avoids rescanning the whole list for every extra name
        known_names = set(seqname_list)
        for seqname in otherseqsnames:
            if seqname not in known_names:
                known_names.add(seqname)
                seqname_list.append(seqname)

    return PF.flu_alignment(test_data.aln_file_name,
                            test_data.outgroup,
                            test_data.annotation,
                            cds=test_data.cds,
                            seq_names=seqname_list,
                            collapse=collapse)
def predict_params(methods, params):
    """Assemble the prediction data set, run the ranking and return it.

    The time window of the prediction set is derived from ``params.pred``, a
    comma-separated list of region names:

    * if one of the regions is ``"oceania"``: October 1 of ``params.year - 2``
      to September 30 of ``params.year - 1``;
    * else, if one of the regions is ``"toy"``: a window of width
      ``params.dt`` centered on ``params.gen``;
    * otherwise the regions are assumed to be northern hemisphere: May 1 of
      the previous year to February 28 of ``params.year`` (February 28 is
      used in leap years as well).

    Parameters
    ----------
    methods
        Passed unchanged to ``PF.flu_ranking`` as ``methods``.
    params
        Parameter object. Attributes read here: ``pred``, ``sample_size``,
        ``subsample``, ``collapse``, ``eps_branch``, ``boost``, ``diffusion``,
        ``omega``, ``gamma`` and, depending on the branch, ``year`` or
        ``gen`` and ``dt``. It is also handed to the helper that looks up the
        alignment file, annotation, outgroup and cds.

    Returns
    -------
    The ``PF.flu_ranking`` object after its ``predict()`` method has run.
    """
    prediction_regions = params.pred.split(',')
    pseudo_count = 5

    # define the prediction data set by choosing start and stop dates
    if "oceania" in prediction_regions:
        year = params.year
        prediction_set = {'start': date(year - 2, 10, 1),
                          'stop': date(year - 1, 9, 30),
                          'regions': prediction_regions,
                          'sample_size': params.sample_size}
    elif "toy" in prediction_regions:
        # simulated data: time is measured in generations, not calendar dates
        prediction_set = {'start': params.gen - 0.5 * params.dt,
                          'stop': params.gen + 0.5 * params.dt,
                          'regions': prediction_regions,
                          'sample_size': params.sample_size}
    else:
        # assume northern hemisphere:
        # begin previous year on May 1st, end this year on Feb 28
        year = params.year
        prediction_set = {'start': date(year - 1, 5, 1),
                          'stop': date(year, 2, 28),
                          'regions': prediction_regions,
                          'sample_size': params.sample_size}

    if 'toy' in params.pred:
        aln_fname, annotation, outgroup, cds = get_aln_annotation_outgroup_cds_toy(params)
        tbins = None
    else:
        aln_fname, annotation, outgroup, cds = get_aln_annotation_outgroup_cds(params)
        # define 105 day intervals to estimate changing clade frequencies.
        # The bin boundaries are counted backwards from the stop date, so the
        # last boundary is always the stop date itself; for the May-Feb
        # window this gives three boundaries.
        bin_dt = 105
        stop_ordinal = prediction_set['stop'].toordinal()
        start_ordinal = prediction_set['start'].toordinal()
        n_steps = (stop_ordinal - start_ordinal) // bin_dt
        tbins = [date.fromordinal(stop_ordinal - ii * bin_dt)
                 for ii in range(n_steps, -1, -1)]

    # one selection criterion per region, all sharing window and sample size
    criteria = [[prediction_set['start'], prediction_set['stop'], [reg],
                 prediction_set['sample_size']]
                for reg in prediction_set['regions']]

    prediction_data = PF.flu_alignment(aln_fname, outgroup, annotation, cds=cds,
                                       subsample_factor=params.subsample,
                                       criteria=criteria,
                                       collapse=params.collapse)

    # PREDICT
    prediction = PF.flu_ranking(prediction_data,
                                eps_branch_length=params.eps_branch,
                                boost=params.boost,
                                time_bins=tbins,
                                methods=methods,
                                D=params.diffusion,
                                samp_frac=params.omega,
                                distance_scale=params.gamma,
                                pseudo_count=pseudo_count)
    prediction.predict()
    # parenthesised single-argument form prints the same text on Python 2
    # and is also valid Python 3
    print("prediction done")
    return prediction