def add_older_new_viruses(self, dt=3, dtref=None,
                          ref_strains_dir='/Users/yujiazhou/Documents/nextflu/H9_nextflu-master/augur/source-data/'):
    '''
    adds "new" strains and reference strains sampled shortly before the
    time interval to self.viruses.

    A strain from self.new_strains is appended when its numerical date lies
    in [time_interval[0] - dt, time_interval[0]) and its name is not already
    in self.viruses. Reference strains are then loaded from
    <ref_strains_dir><virus_type>_ref_strains.json, stored in
    self.reference_viruses, and appended under the same rule with the
    window width dtref. Reference strains whose name occurs in
    self.new_strains are never added by this second pass.

    Parameters:
        dt              -- width of the window before the interval start
                           used for self.new_strains
        dtref           -- width of the window used for reference strains,
                           defaults to dt*0.5
        ref_strains_dir -- directory prefix (with trailing separator) of the
                           reference strain file. NOTE(review): the default
                           is a user-specific absolute path kept only for
                           backward compatibility; pass the project's
                           source-data directory explicitly.

    If the reference file cannot be opened or is not valid JSON a message
    is printed and the reference pass is skipped.
    '''
    from date_util import numerical_date
    from json import load as jload

    interval_start = self.time_interval[0]
    # names already present; kept in sync with self.viruses below so the
    # membership test does not rebuild a list for every candidate
    known_strains = set(x['strain'] for x in self.viruses)

    for v in self.new_strains:
        if v['strain'] in known_strains:
            continue
        tmp_date = numerical_date(v['date'])
        if interval_start - dt <= tmp_date < interval_start:
            self.viruses.append(v)
            known_strains.add(v['strain'])
            print("adding ",v['strain'], v['date'], tmp_date, self.time_interval)
        else:
            print("skipping ",v['strain'], v['date'], tmp_date, self.time_interval)

    new_strain_names = set(v['strain'] for v in self.new_strains)
    if dtref is None:
        dtref = dt*0.5

    # only the file access is guarded: a missing or unreadable file is an
    # expected condition, anything else should not be reported as such
    try:
        with open(ref_strains_dir+self.virus_type+'_ref_strains.json', 'r') as infile:
            self.reference_viruses = jload(infile)
    except (IOError, OSError, ValueError):
        print("can't find reference_viruses")
        return

    for v in self.reference_viruses:
        tmp_strain = v['strain']
        if tmp_strain in known_strains:
            continue
        tmp_date = numerical_date(v['date'])
        print(tmp_strain)
        if tmp_strain in new_strain_names:
            continue
        if interval_start - dtref <= tmp_date < interval_start:
            self.viruses.append(v)
            known_strains.add(tmp_strain)
            print("adding ",v['strain'], v['date'], tmp_date, self.time_interval)
        else:
            print("skipping ",v['strain'], v['date'], tmp_date, self.time_interval)
def calc_time_censcored_tree_frequencies(self):
    '''
    estimates clade frequencies separately for every season in self.seasons
    and stores, on each tree node, the clipped logit frequencies and the
    slope of a linear fit through them.

    For every node n and season s this sets
        n.fit_frequencies[s] -- n.logit_freq["global_fit"] clipped to
                                [-25, 25]; when the node has no estimate
                                (None) the value of its parent node is used
        n.freq_slope[s]      -- slope returned by linregress over the pivots
                                from index 2 onwards
    Both dictionaries are reset at the start of the call. When the
    regression cannot be computed a warning is printed and
    n.freq_slope[s] is left unset for that node.
    Finally self.tree.seed_node.pivots is reset to self.pivots.
    '''
    print("fitting clade frequencies for seasons")
    region = "global_fit"
    freq_cutoff = 25.0
    total_pivots = 12
    pivots_fit = 2
    freq_window = 0.0
    from date_util import numerical_date

    for n in self.tree.preorder_node_iter():
        n.fit_frequencies = {}
        n.freq_slope = {}

    for s in self.seasons:
        time_interval = [numerical_date(s[0]) - freq_window, numerical_date(s[1])]
        pivots = np.linspace(time_interval[0], time_interval[1], total_pivots)
        self.estimate_tree_frequencies(pivots=pivots, threshold = 40, regions=None,
                                       region_name = region, time_interval=time_interval)
        # preorder traversal: a parent is processed before its children, so
        # the parent's value exists when a child has to fall back to it
        for n in self.tree.preorder_node_iter():
            if n.logit_freq[region] is not None:
                n.fit_frequencies[s] = np.minimum(freq_cutoff, np.maximum(-freq_cutoff, n.logit_freq[region]))
            else:
                n.fit_frequencies[s] = n.parent_node.fit_frequencies[s]
            try:
                # linregress is expected in module scope -- TODO confirm it
                # is scipy.stats.linregress; only the slope is used
                slope = linregress(pivots[pivots_fit:], n.fit_frequencies[s][pivots_fit:])[0]
                n.freq_slope[s] = slope
            except (ValueError, TypeError, IndexError) as err:
                # previously this dropped into an ipdb session, which blocks
                # (or fails on import) in non-interactive runs
                print("could not fit frequency slope for season %r: %s" % (s, err))

    # reset pivots in tree to global pivots
    self.tree.seed_node.pivots = self.pivots
def add_older_vaccine_viruses(self, dt = 3, dtref = None):
    '''
    adds additional vaccine viruses prior to the time interval to provide
    phylogenetic context

    A strain from self.vaccine_strains is appended to self.viruses when its
    numerical date lies in [time_interval[0] - dt, time_interval[0]) and its
    name is not already in self.viruses. Reference strains are then loaded
    from source-data/<virus_type>_ref_strains.json, stored in
    self.reference_viruses, and appended under the same rule with the
    window width dtref. Reference strains whose name occurs in
    self.vaccine_strains are never added by this second pass.

    Parameters:
        dt    -- width of the window before the interval start used for
                 vaccine strains
        dtref -- width of the window used for reference strains, defaults
                 to dt*0.5

    If the reference file cannot be opened or is not valid JSON a message
    is printed and the reference pass is skipped.
    '''
    from date_util import numerical_date
    from json import load as jload

    interval_start = self.time_interval[0]
    # names already present; kept in sync with self.viruses below so the
    # membership test does not rebuild a list for every candidate
    known_strains = set(x['strain'] for x in self.viruses)

    for v in self.vaccine_strains:
        if v['strain'] in known_strains:
            continue
        tmp_date = numerical_date(v['date'])
        if interval_start - dt <= tmp_date < interval_start:
            self.viruses.append(v)
            known_strains.add(v['strain'])
            print("adding ",v['strain'], v['date'], tmp_date, self.time_interval)
        else:
            print("skipping ",v['strain'], v['date'], tmp_date, self.time_interval)

    vaccine_strain_names = set(v['strain'] for v in self.vaccine_strains)
    if dtref is None:
        dtref = dt*0.5

    # only the file access is guarded: a missing or unreadable file is an
    # expected condition, anything else should not be reported as such
    try:
        with open('source-data/'+self.virus_type+'_ref_strains.json', 'r') as infile:
            self.reference_viruses = jload(infile)
    except (IOError, OSError, ValueError):
        print("can't find reference_viruses")
        return

    for v in self.reference_viruses:
        tmp_strain = v['strain']
        if tmp_strain in known_strains:
            continue
        tmp_date = numerical_date(v['date'])
        print(tmp_strain)
        if tmp_strain in vaccine_strain_names:
            continue
        if interval_start - dtref <= tmp_date < interval_start:
            self.viruses.append(v)
            known_strains.add(tmp_strain)
            print("adding ",v['strain'], v['date'], tmp_date, self.time_interval)
        else:
            print("skipping ",v['strain'], v['date'], tmp_date, self.time_interval)
def auto_outgroup_blast(self):
    '''
    chooses an outgroup and additional reference strains by running the
    BLAST command built with NcbiblastxCommandline for a random sample of
    self.viruses against the database std_outgroup_file_blast.

    Side effects:
        - writes representatives.fasta and outgroup_blast.xml to self.run_dir
        - appends well matching standard outgroups to self.viruses
        - sets self.outgroup (its strain name gets the suffix 'OG' unless it
          contains 'A/California/07/2009') and self.cds
        - sets self.midpoint_rooting = True when no sufficiently old and
          similar outgroup is found

    numerical_date, np, SeqRecord, Seq, SeqIO, defaultdict and
    std_outgroup_file_blast are expected in module scope.
    '''
    from random import sample
    from Bio.Blast.Applications import NcbiblastxCommandline
    from Bio.Blast import NCBIXML

    self.make_run_dir()
    nvir = 10           # number of randomly sampled viruses used as queries
    max_ref_seqs = 5    # candidates past this rank are no longer examined

    tmp_dates = []
    for v in self.viruses:
        try:
            tmp_dates.append(numerical_date(v["date"]))
        except Exception:
            print("Can't parse date for",v['strain'], v['date'])
    earliest_date = np.min(tmp_dates)

    representatives = [SeqRecord(Seq(v['seq']), id=v['strain'])
                       for v in sample(self.viruses, min(nvir, len(self.viruses)))]
    standard_outgroups = self.load_standard_outgroups()
    SeqIO.write(representatives, self.run_dir+'representatives.fasta', 'fasta')

    blast_out = self.run_dir+"outgroup_blast.xml"
    blast_cline = NcbiblastxCommandline(query=self.run_dir+"representatives.fasta",
                                        db=std_outgroup_file_blast, evalue=0.01,
                                        outfmt=5, out=blast_out)
    blast_cline()

    # collect, per outgroup, one tuple for every HSP:
    # (query, score, score per aligned position, fraction of identities)
    by_og = defaultdict(list)
    with open(blast_out, 'r') as bfile:
        for rep in NCBIXML.parse(bfile):
            for hit in rep.alignments:
                for aln in hit.hsps:
                    by_og[hit.hit_def].append((rep.query, aln.score, aln.score/aln.align_length,
                                               1.0*aln.identities/aln.align_length))

    # sort by number of hits, then mean score
    by_og = sorted(by_og.items(),
                   key = lambda x:(len(x[1]), np.mean([y[1] for y in x[1]])),
                   reverse=True)

    # candidates dated more than 5 before the earliest sampled virus;
    # 'A/California/07/2009' is accepted regardless of its date
    outgroups_older_than_sample = [(og, hits) for (og, hits) in by_og
                                   if (numerical_date(standard_outgroups[og]['date'])<earliest_date-5)
                                   or ('A/California/07/2009' in standard_outgroups[og]['strain'])]

    if outgroups_older_than_sample and np.mean([y[-1] for y in outgroups_older_than_sample[0][1]])>0.8:
        outgroup = outgroups_older_than_sample[0][0]
    else:
        outgroup = by_og[0][0]
        self.midpoint_rooting = True
        print("will root at midpoint")

    # the identity threshold rises with the rank of the candidate
    for oi, (ref, hits) in enumerate(by_og):
        if (np.max([y[-1] for y in hits])>0.9+oi*0.02) and ref!=outgroup:
            self.viruses.append(standard_outgroups[ref])
            print("including reference strain ",ref, [y[-1] for y in hits])
        if oi>max_ref_seqs:
            break

    self.outgroup = standard_outgroups[outgroup]
    if 'A/California/07/2009' not in self.outgroup['strain']:
        self.outgroup['strain']+='OG'
    # coding region: from the start up to the first stop codon, capped at
    # the sequence length
    prot = Seq(self.outgroup['seq']).translate(to_stop=True)
    self.cds = [0,min(len(prot)*3,len(self.outgroup['seq']))]
    print("chosen outgroup",self.outgroup['strain'])
def unique_date(self):
    '''
    add a unique numerical date to each leaf. uniqueness is achieved by
    adding a small number

    The record of the outgroup (looked up in self.sequence_lookup by the
    outgroup's strain name) gets num_date = numerical_date(date). Every
    entry of self.viruses that has a `date` attribute gets
    num_date = numerical_date(date, self.date_format['fields']) + 1e-7*(ii+1)
    where ii is its position in self.viruses. Whenever the conversion
    fails "cannot parse date" is printed and num_date is set to the string
    "undefined". Entries without a `date` attribute are left untouched.
    '''
    from date_util import numerical_date

    og = self.sequence_lookup[self.outgroup['strain']]
    if hasattr(og, 'date'):
        try:
            og.num_date = numerical_date(og.date)
        except Exception:
            print("cannot parse date")
            og.num_date = "undefined"

    for ii, v in enumerate(self.viruses):
        if hasattr(v, 'date'):
            try:
                # the position dependent offset makes equal dates distinct
                v.num_date = numerical_date(v.date, self.date_format['fields']) + 1e-7*(ii+1)
            except Exception:
                print("cannot parse date")
                v.num_date = "undefined"
def add_older_vaccine_viruses(self, dt=3, dtref=None):
    '''
    adds additional vaccine viruses prior to the time interval to provide
    phylogenetic context

    A strain from self.vaccine_strains is appended to self.viruses when its
    numerical date lies in [time_interval[0] - dt, time_interval[0]) and its
    name is not already in self.viruses. Reference strains are then loaded
    from source-data/<virus_type>_ref_strains.json, stored in
    self.reference_viruses, and appended under the same rule with the
    window width dtref. Reference strains whose name occurs in
    self.vaccine_strains are never added by this second pass.

    Parameters:
        dt    -- width of the window before the interval start used for
                 vaccine strains
        dtref -- width of the window used for reference strains, defaults
                 to dt * 0.5

    If the reference file cannot be opened or is not valid JSON a message
    is printed and the reference pass is skipped.
    '''
    from date_util import numerical_date
    from json import load as jload

    interval_start = self.time_interval[0]
    # names already present; kept in sync with self.viruses below so the
    # membership test does not rebuild a list for every candidate
    known_strains = set(x['strain'] for x in self.viruses)

    for v in self.vaccine_strains:
        if v['strain'] in known_strains:
            continue
        tmp_date = numerical_date(v['date'])
        if interval_start - dt <= tmp_date < interval_start:
            self.viruses.append(v)
            known_strains.add(v['strain'])
            print("adding ", v['strain'], v['date'], tmp_date, self.time_interval)
        else:
            print("skipping ", v['strain'], v['date'], tmp_date, self.time_interval)

    vaccine_strain_names = set(v['strain'] for v in self.vaccine_strains)
    if dtref is None:
        dtref = dt * 0.5

    # only the file access is guarded: a missing or unreadable file is an
    # expected condition, anything else should not be reported as such
    try:
        with open('source-data/' + self.virus_type + '_ref_strains.json', 'r') as infile:
            self.reference_viruses = jload(infile)
    except (IOError, OSError, ValueError):
        print("can't find reference_viruses")
        return

    for v in self.reference_viruses:
        tmp_strain = v['strain']
        if tmp_strain in known_strains:
            continue
        tmp_date = numerical_date(v['date'])
        print(tmp_strain)
        if tmp_strain in vaccine_strain_names:
            continue
        if interval_start - dtref <= tmp_date < interval_start:
            self.viruses.append(v)
            known_strains.add(tmp_strain)
            print("adding ", v['strain'], v['date'], tmp_date, self.time_interval)
        else:
            print("skipping ", v['strain'], v['date'], tmp_date, self.time_interval)
def auto_outgroup_blast(self):
    '''
    chooses an outgroup and additional reference strains by running the
    BLAST command built with NcbiblastxCommandline for a random sample of
    self.viruses against the database std_outgroup_file_blast.

    Side effects:
        - writes representatives.fasta and outgroup_blast.xml to self.run_dir
        - appends well matching standard outgroups to self.viruses
        - sets self.outgroup (its strain name gets the suffix 'OG' unless it
          contains 'A/California/07/2009') and self.cds
        - sets self.midpoint_rooting = True when no sufficiently old and
          similar outgroup is found

    numerical_date, np, SeqRecord, Seq, SeqIO, defaultdict and
    std_outgroup_file_blast are expected in module scope.
    '''
    from random import sample
    from Bio.Blast.Applications import NcbiblastxCommandline
    from Bio.Blast import NCBIXML

    self.make_run_dir()
    nvir = 10           # number of randomly sampled viruses used as queries
    max_ref_seqs = 5    # candidates past this rank are no longer examined

    tmp_dates = []
    for v in self.viruses:
        try:
            tmp_dates.append(numerical_date(v["date"]))
        except Exception:
            print("Can't parse date for", v['strain'], v['date'])
    earliest_date = np.min(tmp_dates)

    representatives = [
        SeqRecord(Seq(v['seq']), id=v['strain'])
        for v in sample(self.viruses, min(nvir, len(self.viruses)))
    ]
    standard_outgroups = self.load_standard_outgroups()
    SeqIO.write(representatives, self.run_dir + 'representatives.fasta', 'fasta')

    blast_out = self.run_dir + "outgroup_blast.xml"
    blast_cline = NcbiblastxCommandline(query=self.run_dir + "representatives.fasta",
                                        db=std_outgroup_file_blast,
                                        evalue=0.01,
                                        outfmt=5,
                                        out=blast_out)
    blast_cline()

    # collect, per outgroup, one tuple for every HSP:
    # (query, score, score per aligned position, fraction of identities)
    by_og = defaultdict(list)
    with open(blast_out, 'r') as bfile:
        for rep in NCBIXML.parse(bfile):
            for hit in rep.alignments:
                for aln in hit.hsps:
                    by_og[hit.hit_def].append(
                        (rep.query, aln.score, aln.score / aln.align_length,
                         1.0 * aln.identities / aln.align_length))

    # sort by number of hits, then mean score
    by_og = sorted(by_og.items(),
                   key=lambda x: (len(x[1]), np.mean([y[1] for y in x[1]])),
                   reverse=True)

    # candidates dated more than 5 before the earliest sampled virus;
    # 'A/California/07/2009' is accepted regardless of its date
    outgroups_older_than_sample = [
        (og, hits) for (og, hits) in by_og
        if (numerical_date(standard_outgroups[og]['date']) < earliest_date - 5)
        or ('A/California/07/2009' in standard_outgroups[og]['strain'])
    ]

    if outgroups_older_than_sample and np.mean(
            [y[-1] for y in outgroups_older_than_sample[0][1]]) > 0.8:
        outgroup = outgroups_older_than_sample[0][0]
    else:
        outgroup = by_og[0][0]
        self.midpoint_rooting = True
        print("will root at midpoint")

    # the identity threshold rises with the rank of the candidate
    for oi, (ref, hits) in enumerate(by_og):
        if (np.max([y[-1] for y in hits]) > 0.9 + oi * 0.02) and ref != outgroup:
            self.viruses.append(standard_outgroups[ref])
            print("including reference strain ", ref, [y[-1] for y in hits])
        if oi > max_ref_seqs:
            break

    self.outgroup = standard_outgroups[outgroup]
    if 'A/California/07/2009' not in self.outgroup['strain']:
        self.outgroup['strain'] += 'OG'
    # coding region: from the start up to the first stop codon, capped at
    # the sequence length
    prot = Seq(self.outgroup['seq']).translate(to_stop=True)
    self.cds = [0, min(len(prot) * 3, len(self.outgroup['seq']))]
    print("chosen outgroup", self.outgroup['strain'])