def __init__(self, **kwargs):
    '''
    Congruent with other nextstrain builds, dengue_process is a catch-all
    class that initially holds the input data paths and params arguments.

    Expects kwargs['serotype'] (e.g. '1'..'4' or 'all'); locates the newest
    matching fasta under ../fauna/data/ and configures a `process` object.
    '''
    super(process, self).__init__()

    ##### Handle serotype-specific file input/output. #####
    self.serotype = kwargs['serotype']
    self.lineage = 'dengue_%s' % self.serotype
    if self.serotype == 'all':
        # For all-serotype build, use dengue 4 outgroup and look for files
        # like dengue_all.fasta
        self.reference_fname = './dengue/metadata/dengue_denv4_outgroup.gb'
        newest_sequence_file = sorted(
            glob('../fauna/data/%s.fasta' % self.lineage),
            key=lambda f: os.path.getmtime(f))[-1]
    else:
        self.reference_fname = './dengue/metadata/%s_outgroup.gb' % self.lineage
        try:  # Look for a serotype-specific fasta
            newest_sequence_file = sorted(
                glob('../fauna/data/%s*.fasta' % self.lineage),
                key=lambda f: os.path.getmtime(f))[-1]
        except IndexError:
            # sorted(...)[-1] raises IndexError when the glob matched
            # nothing: pull serotype-specific sequences out of the
            # all-serotype fasta instead (warn the user of this behavior).
            newest_sequence_file = select_serotype(
                '../fauna/data/dengue_all.fasta', '../fauna/data/',
                self.serotype)
            # Warning names dengue_all.fasta — the file actually read above.
            print(
                'WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from all-serotype fasta file %s\nWrote these to file %s'
                % (self.serotype, '../fauna/data/dengue_all.fasta',
                   newest_sequence_file))

    # Derived input/output path stems used by the process object below.
    self.input_data_path = newest_sequence_file.split('.fasta')[0]
    self.sequence_fname = newest_sequence_file
    self.store_data_path = 'store/' + self.lineage + '_'
    self.build_data_path = 'build/' + self.lineage + '_'
    self.proteins = [
        'C', 'M', 'E', 'NS1', 'NS2A', 'NS2B', 'NS3', 'NS4A', '2K', 'NS4B',
        'NS5'
    ]

    ##### Initialize process object #####
    self.dengue = process(
        input_data_path=self.input_data_path,
        store_data_path=self.store_data_path,
        build_data_path=self.build_data_path,
        proteins=self.proteins,
        reference=self.reference_fname,
        method='SLSQP',
        lat_long_fname='../fauna/source-data/geo_lat_long.tsv')
"dir": "mumps", "in": prepared_json, "newick_tree_options": { "nthreads": 4 }, "clock_filter": { "n_iqd": 4, }, "geo_inference": geo_inference, "auspice": { ## settings for auspice JSON export "color_options": color_options, "defaults": defaults }, "clean": clean } if __name__ == "__main__": params = parser.parse_args() jsons = glob.glob( "prepared/*.json") if "all" in params.jsons else params.jsons for prepared_json in jsons: print("Processing {}".format(prepared_json)) runner = process(make_config(prepared_json, params.clean, params)) runner.align() runner.build_tree() runner.timetree_setup_filter_run() runner.run_geo_inference() runner.save_as_nexus() runner.auspice_export()
"newick_tree_options": {"nthreads": 4}, "clock_filter": { "n_iqd": 4, }, "geo_inference": ['country', 'region'], # what traits to perform this on "geo_inference_options": { "root_state": { "region": "southeast_asia", "country": "vietnam", }, }, "auspice": { ## settings for auspice JSON export "color_options": { "country":{"key":"country", "legendTitle":"Country", "menuItem":"country", "type":"discrete"}, "region":{"key":"region", "legendTitle":"Region", "menuItem":"region", "type":"discrete"}, }, "controls": {'authors':['authors']} } } if __name__=="__main__": params = parser.parse_args() if params.clean: config["clean"] = True runner = process(config) runner.align() runner.build_tree() runner.timetree_setup_filter_run() runner.run_geo_inference() runner.save_as_nexus() runner.auspice_export()
def __init__(self, **kwargs):
    '''
    Configure serotype-specific paths and run the dengue build pipeline.

    Expects kwargs['serotype'] (e.g. '1'..'4' or 'any'); locates the newest
    matching fasta under ../fauna/data/, builds a `process` object, and —
    unless --load was given — loads, filters, subsamples, aligns, builds and
    annotates the tree, then exports for auspice.
    '''
    super(process, self).__init__()

    self.serotype = kwargs['serotype']
    if self.serotype == 'any':
        # For any-serotype build, use dengue 3 outgroup and look for files
        # like dengue.fasta
        self.lineage = 'dengue'
        self.reference_fname = './dengue/metadata/dengue_%s_outgroup.gb' % '3'
        newest_sequence_file = sorted(
            glob('../fauna/data/%s.fasta' % self.lineage),
            key=lambda f: os.path.getmtime(f))[-1]
    else:
        self.lineage = 'dengue_%s' % self.serotype
        # For serotype-specific build, use the corresponding outgroup
        self.reference_fname = './dengue/metadata/dengue_%s_outgroup.gb' % self.serotype
        try:  # Look for a serotype-specific fasta
            newest_sequence_file = sorted(
                glob('../fauna/data/%s*.fasta' % self.lineage),
                key=lambda f: os.path.getmtime(f))[-1]
        except IndexError:
            # sorted(...)[-1] raises IndexError when the glob matched
            # nothing: pull serotype-specific sequences out of the
            # any-serotype fasta (warn the user of this behavior).
            newest_sequence_file = select_serotype(
                '../fauna/data/dengue.fasta', '../fauna/data/',
                self.serotype)
            print(
                'WARNING: Did not find serotype-specific fasta file.\nPulled sequences with serotype %s from any-serotype fasta file %s\nWrote these to file %s'
                % (self.serotype, '../fauna/data/dengue.fasta',
                   newest_sequence_file))

    # Derived input/output path stems used by the process object below.
    self.input_data_path = newest_sequence_file.split('.fasta')[0]
    self.sequence_fname = newest_sequence_file
    self.store_data_path = 'store/' + self.lineage + '_'
    self.build_data_path = 'build/' + self.lineage + '_'
    self.proteins = [
        'C', 'M', 'E', 'NS1', 'NS2A', 'NS2B', 'NS3', 'NS4A', '2K', 'NS4B',
        'NS5'
    ]

    self.dengue = process(input_data_path=self.input_data_path,
                          store_data_path=self.store_data_path,
                          build_data_path=self.build_data_path,
                          proteins=self.proteins,
                          reference=self.reference_fname,
                          method='SLSQP')

    # NOTE(review): `params` and `attribute_nesting` are names defined
    # outside this method (module level, presumably argparse results and an
    # export-controls mapping) — confirm they exist wherever this class is
    # instantiated.
    if params.load:
        self.dengue.load()
    else:
        # Column positions in the fauna fasta header -> attribute names.
        self.fasta_fields = {
            0: 'strain',
            1: 'accession',
            2: 'date',
            3: 'region',
            4: 'country',
            5: 'division',
            6: 'location'
        }
        self.dengue.load_sequences(fields=self.fasta_fields)
        # Drop short sequences and any explicitly excluded strains.
        self.dengue.seqs.filter(lambda s: len(s.seq) >= 5000)
        self.dropped_strains = []
        self.dengue.seqs.filter(lambda s: s.id not in self.dropped_strains)
        # Subsample evenly across (region, year, month) categories.
        self.dengue.seqs.subsample(
            category=lambda x: (x.attributes['region'],
                                x.attributes['date'].year,
                                x.attributes['date'].month),
            threshold=params.viruses_per_month)
        self.dengue.align()
        self.dengue.build_tree()
        self.dengue.clock_filter(n_iqd=3, plot=True)
        self.dengue.annotate_tree(Tc=0.005, timetree=True, reroot='best')
        self.dengue.tree.geo_inference('region')
        self.dengue.export(controls=attribute_nesting)
# Tail of the argparse setup for the ebola build; this excerpt begins inside
# an earlier parser.add_argument(...) call whose opening is not visible.
default=1.0,
help='number of hours raxml is run')
parser.add_argument('--load', action='store_true', help='recover from file')
params = parser.parse_args()

lineage = 'ebola'
input_data_path = '../fauna/data/' + lineage
store_data_path = 'store/' + lineage + '_'
build_data_path = 'build/' + lineage + '_'

ebola = process(
    input_data_path=input_data_path,
    store_data_path=store_data_path,
    build_data_path=build_data_path,
    reference='ebola/metadata/ebola_outgroup.gb',
    proteins=['NP', 'VP35', 'VP40', 'GP', 'sGP', 'VP30', 'VP24', 'L'],
    method='SLSQP')

# NOTE(review): `if not params.load:` would be the idiomatic spelling.
if params.load == False:
    # Column positions in the fauna fasta header -> attribute names.
    fasta_fields = {
        0: 'strain',
        2: 'accession',
        3: 'date',
        4: 'region',
        5: 'country',
        6: 'division',
        8: 'db',
        10: 'authors'
    }
"menuItem": "date", "type": "continuous" }, "gt": { "key": "genotype", "legendTitle": "Genotype", "menuItem": "genotype", "type": "discrete" } } } HA = process(input_data_path=params["HA"]["input_data"], store_data_path='store/' + params["HA"]["lineage"] + '_', build_data_path='build/' + params["HA"]["lineage"] + '_', reference=params["HA"]["reference_fname"], lat_long_fname='../fauna/source-data/geo_lat_long.tsv', proteins=params["HA"]['proteins'], method='SLSQP', verbose=0) NA = process(input_data_path=params["NA"]["input_data"], store_data_path='store/' + params["NA"]["lineage"] + '_', build_data_path='build/' + params["NA"]["lineage"] + '_', reference=params["NA"]["reference_fname"], lat_long_fname='../fauna/source-data/geo_lat_long.tsv', proteins=params["NA"]['proteins'], method='SLSQP', verbose=0) segments = [HA, NA] segmentNames = ["HA", "NA"]
# Parse the requested time interval and build evenly spaced frequency
# pivots (ppy = pivot points per year).
time_interval = [
    datetime.strptime(x, '%Y-%m-%d').date() for x in params.time_interval
]
pivots = np.arange(
    time_interval[0].year + (time_interval[0].month - 1) / 12.0,
    time_interval[1].year + time_interval[1].month / 12.0, 1.0 / ppy)

# load data from all segments
segment_names = ['pb1', 'pb2', 'pa', 'ha', 'np', 'na', 'ma', 'ns']
segments = {}
viruses = defaultdict(list)  # strain -> list of segments it appears in
for seg in segment_names:
    input_data_path = '../fauna/data/' + params.lineage + '_' + seg
    # NOTE(review): 'm' never occurs in segment_names ('ma' does), so this
    # branch looks dead — confirm which segment label the fauna files use.
    if seg == 'm':
        input_data_path += 'p'
    store_data_path = 'store/' + params.lineage + '_' + params.resolution + '_' + seg + '_'
    build_data_path = 'build/' + params.lineage + '_' + params.resolution + '_' + seg + '_'
    flu = process(input_data_path=input_data_path,
                  store_data_path=store_data_path,
                  build_data_path=build_data_path,
                  reference='flu/metadata/' + params.lineage + '_' + seg + '_outgroup.gb',
                  proteins=['SigPep', 'HA1', 'HA2'],
                  method='SLSQP',
                  inertia=np.exp(-1.0 / ppy),
                  stiffness=2. * ppy)
    # Column positions in the fauna fasta header -> attribute names.
    flu.load_sequences(fields={0: 'strain', 2: 'isolate_id', 3: 'date',
                               4: 'region', 5: 'country', 7: "city",
                               12: "subtype", 13: 'lineage'})
    print("## loading data for segment %s, found %d number of sequences" %
          (seg, len(flu.seqs.all_seqs)))
    for sequence in flu.seqs.all_seqs:
        viruses[sequence].append(seg)
    segments[seg] = flu

# Determine strains that are complete (all segments present). Materialized
# as a set: under Python 3, `filter` returns a one-shot iterator that the
# first membership test below would exhaust, and set lookup is O(1) for the
# repeated `in` checks.
complete_strains = {
    name for name, segs in viruses.items() if len(segs) == len(segment_names)
}

# Filter every segment down to the sequences for which all other segments exist
segments['ha'].seqs.filter(lambda s: s.name in complete_strains)
"estimate_tree_frequencies": not args.no_tree_freqs, "clean": args.clean, "pivot_spacing": 1.0 / 12, "timetree_options": { "Tc": 0.03 } } if __name__ == "__main__": args = collect_args() jsons = glob.glob("prepared/*.json") if "all" in args.jsons else args.jsons for prepared_json in jsons: pprint("Processing {}".format(prepared_json)) runner = process(make_config(prepared_json, args)) runner.align() # estimate mutation frequencies here. # While this could be in a wrapper, it is hopefully more readable this way! if runner.config["estimate_mutation_frequencies"]: pivots = runner.get_pivots_via_spacing() runner.estimate_mutation_frequencies(pivots=pivots, min_freq=0.02, inertia=np.exp(-1.0 / 12), stiffness=0.8 * 12) acronyms = set( [x[1] for x in runner.info["regions"] if x[1] != ""]) region_groups = { str(x): [str(y[0]) for y in runner.info["regions"] if y[1] == x]
# Argparse tail and setup for the zika build (earlier parser arguments are
# outside this excerpt).
parser.add_argument('--load', action='store_true', help='recover from file')
params = parser.parse_args()

lineage = 'zika'
input_data_path = '../fauna/data/' + lineage
store_data_path = 'store/' + lineage + '_'
build_data_path = 'build/' + lineage + '_'

zika = process(input_data_path=input_data_path,
               store_data_path=store_data_path,
               build_data_path=build_data_path,
               reference='zika/metadata/zika_outgroup.gb',
               lat_long_fname='../fauna/source-data/geo_lat_long.tsv',
               proteins=[
                   'CA', 'PRO', 'MP', 'ENV', 'NS1', 'NS2A', 'NS2B', 'NS3',
                   'NS4A', 'NS4B', 'NS5'
               ],
               method='SLSQP',
               verbose=params.verbose)

if params.load:
    zika.load()
else:
    # Column positions in the fauna fasta header -> attribute names
    # (the dict continues past this excerpt).
    fasta_fields = {
        0: 'strain',
        2: 'accession',
        3: 'date',
        4: 'region',
        5: 'country',
        6: 'division',
# Command-line interface and setup for the zika build.
parser = argparse.ArgumentParser(
    description='Process virus sequences, build tree, and prepare of web visualization')
parser.add_argument('-v', '--viruses_per_month', type=int, default=100,
                    help='number of viruses sampled per month')
parser.add_argument('-r', '--raxml_time_limit', type=float, default=1.0,
                    help='number of hours raxml is run')
parser.add_argument('--load', action='store_true', help='recover from file')
params = parser.parse_args()

lineage = 'zika'
input_data_path = '../fauna/data/' + lineage
store_data_path = 'store/' + lineage + '_'
build_data_path = 'build/' + lineage + '_'

zika = process(input_data_path=input_data_path,
               store_data_path=store_data_path,
               build_data_path=build_data_path,
               reference='zika/metadata/zika_outgroup.gb',
               proteins=['CA', 'PRO', 'MP', 'ENV', 'NS1', 'NS2A', 'NS2B',
                         'NS3', 'NS4A', 'NS4B', 'NS5'],
               method='SLSQP')

if params.load:
    zika.load()
else:
    # Column positions in the fauna fasta header -> attribute names.
    fasta_fields = {0: 'strain', 2: 'accession', 3: 'date', 4: 'region',
                    5: 'country', 6: 'division', 8: 'db', 10: 'authors',
                    11: 'latitude', 12: 'longitude'}
    zika.load_sequences(fields=fasta_fields)
    # Keep sequences sampled in [2012-01-01, 2017-01-01) that are at least
    # 2 kb long.
    zika.seqs.filter(
        lambda s: s.attributes['date'] >= datetime(2012, 1, 1).date() and
        s.attributes['date'] < datetime(2017, 1, 1).date())
    zika.seqs.filter(lambda s: len(s.seq) >= 2000)
    # Exclusion list (continues past this excerpt).
    dropped_strains = [
        "THA/PLCal_ZV/2013", "PLCal_ZV",  # true strains, too basal for analysis