def make_test_set(prediction, params):
    """Build the test data set matching a prediction run.

    Chooses a start/stop window for the test sequences — the southern
    hemisphere flu season for oceania, the northern hemisphere season
    otherwise, or a generation-based window around the validation time
    for toy data — then loads the matching sequences via PF.flu_alignment.

    Parameters
    ----------
    prediction : object whose ``.data`` holds aln_file_name, outgroup,
        annotation and cds of the prediction alignment.
    params : run parameters; uses test, year, sample_size, gen, valdt,
        dt and collapse.

    Returns
    -------
    (test_data, test_set) : the loaded alignment and the dict of
        selection criteria (start, stop, regions, sample_size) used.
    """
    # year is only needed for the real-data branches; use the same
    # substring test as the toy branch below so toy runs (e.g.
    # "toy,na") never require params.year.  The original guard was
    # `'toy'!=params.test`, inconsistent with `'toy' in params.test`.
    if 'toy' not in params.test:
        year = params.year
    test_regions = params.test.split(',')
    # define test data sets by choosing start and stop dates
    if "oceania" in test_regions:
        # southern hemisphere season: March through September
        test_set = {'start': date(year, 3, 1), 'stop': date(year, 9, 30),
                    'regions': test_regions, 'sample_size': params.sample_size}
    elif 'toy' in params.test:
        # toy data: quarter-generation window around the validation time
        test_set = {}
        test_set['start'] = params.gen + (params.valdt - 0.25)*params.dt
        test_set['stop'] = params.gen + (params.valdt + 0.25)*params.dt
        test_set['regions'] = test_regions
        test_set['sample_size'] = 50
    else:  # assume northern hemisphere: October through March
        test_set = {'start': date(year, 10, 1), 'stop': date(year+1, 3, 31),
                    'regions': test_regions, 'sample_size': params.sample_size}

    # load data to test: one (start, stop, [region], sample_size)
    # criterion per test region; no tree is needed for scoring
    test_data = PF.flu_alignment(prediction.data.aln_file_name, prediction.data.outgroup,
                                 prediction.data.annotation, cds=prediction.data.cds,
                                 criteria=[[test_set['start'], test_set['stop'],
                                            [reg], test_set['sample_size']]
                                           for reg in test_set['regions']],
                                 collapse=params.collapse, build_tree=False)
    return test_data, test_set
def make_combined_data(prediction, test_data, otherseqsnames=None, collapse = False):
    """Assemble one alignment covering prediction, test and any extra
    sequences (needed for plotting).

    Both input alignments must have identical length; extra names are
    appended only if not already present, preserving their order.
    """
    assert prediction.data.aln.get_alignment_length()==test_data.aln.get_alignment_length(),\
        "predict_and_test: prediction and test alignment have different length"
    # every sequence name from the prediction alignment followed by
    # every name from the test alignment, in order
    all_names = [record.name
                 for aln in (prediction.data.aln, test_data.aln)
                 for record in aln]
    if otherseqsnames:
        # append only names not yet seen; a set keeps the membership
        # test cheap while preserving the original append order
        seen = set(all_names)
        for extra in otherseqsnames:
            if extra not in seen:
                all_names.append(extra)
                seen.add(extra)

    combined_data = PF.flu_alignment(test_data.aln_file_name, test_data.outgroup,
                                     test_data.annotation, cds=test_data.cds,
                                     seq_names=all_names, collapse=collapse)
    return combined_data
def predict_params(methods, params):
    flutype = params.flutype
    if 'toy'!=params.pred:
        year=params.year
    prediction_regions = params.pred.split(',')
    test_regions = params.test.split(',')
    pseudo_count = 5
        
    # define prediction and test data sets by choosing start and stop dates 
    if "oceania" in prediction_regions:
        prediction_set={'start': date(year-2, 10,1), 'stop':date(year-1, 9,30),
                        'regions':prediction_regions, 'sample_size':params.sample_size}
    elif "toy" in prediction_regions:
        prediction_set={}
        prediction_set['start'] = params.gen-0.5*params.dt
        prediction_set['stop'] =  params.gen+0.5*params.dt
        prediction_set['regions'] = prediction_regions
        prediction_set['sample_size'] = params.sample_size
    else:  # assume northern hemisphere
        # begin previous year on may 1st, end this year on Feb 28
        prediction_set={'start': date(year-1,5,1), 'stop':date(year, 2,28),
                        'regions':prediction_regions, 'sample_size':params.sample_size}
    
    if 'toy' in params.pred:
        aln_fname, annotation, outgroup,cds = get_aln_annotation_outgroup_cds_toy(params)
        tbins = None
    else:
        aln_fname, annotation, outgroup,cds = get_aln_annotation_outgroup_cds(params)
        # define 105 day intervals to estimate changing clade frequencies.
        # chosen to have 3 intervals between May and Feb
        bin_dt = 105
        tbins = [ date.fromordinal(prediction_set['stop'].toordinal()-ii*bin_dt) for ii in range(
                (prediction_set['stop'].toordinal()-prediction_set['start'].toordinal())//bin_dt,-1,-1)]

    prediction_data = PF.flu_alignment(aln_fname,outgroup,annotation,cds=cds, subsample_factor = params.subsample,
                                      criteria = [[prediction_set['start'], prediction_set['stop'], 
                                    [reg],prediction_set['sample_size']] for reg in prediction_set['regions']], collapse=params.collapse)

    # PREDICT
    prediction = PF.flu_ranking(prediction_data, eps_branch_length = params.eps_branch, boost = params.boost,
                                time_bins = tbins, methods=methods, D=params.diffusion,samp_frac = params.omega,
                                 distance_scale = params.gamma,pseudo_count = pseudo_count)
    prediction.predict()
    print "prediction done"
    return prediction