import os
import sys
import subprocess
import tempfile
from csv import DictReader
from datetime import date
from StringIO import StringIO

from Bio import Phylo

# NOTE: the following imports are assumptions; "dateup" is taken to be an alias
# for dateutil's parser module, and PyPhy/Beauti are assumed local wrapper modules.
from dateutil import parser as dateup
from pyphy import PyPhy
from beauti import Beauti


class Anchre:
    def __init__(self, csv=None, iso_format=True, origin=date(1970, 1, 1),
                 delimiter='_', field=-1, ft2path=None, Rpath=None, java=None,
                 tmpfile='anchre-tmp', beast_xml_template=None):
        self.csv = csv
        self.iso_format = iso_format
        self.origin = origin
        self.delimiter = delimiter
        self.field = field

        # paths to binaries
        self.ft2path = ft2path
        self.Rpath = Rpath
        self.java = java

        self.pyphy = PyPhy(os.getcwd(), 1)  # instance of HyPhy
        self.beauti = Beauti(beast_xml_template)

        # if given, parse dates from csv
        self.dates = {}
        if self.csv is not None:
            self.parse_dates_csv()

        # store sequence records
        self.fasta = {}
        self.last_date = None

        self.tmp = tempfile.gettempdir()
        self.tmpfile = os.path.join(self.tmp, tmpfile)
        self.test()

    def test(self):
        """
        Check whether expected binaries are accessible.
        :return:
        """
        if not os.path.exists(self.ft2path):
            print 'ERROR: Failed to detect FastTree2 at', self.ft2path
            sys.exit()
        if not os.path.exists(self.Rpath):
            print 'ERROR: Failed to detect R at', self.Rpath
            sys.exit()

    def parse_date(self, date_str):
        """
        Convert a string representation of a sample collection date into an
        integer value (number of days since some time in the past).
        """
        if self.iso_format:
            try:
                days = (dateup.parse(date_str).date() - self.origin).days
            except ValueError:
                print 'ERROR: Failed to parse date', date_str
                raise
        else:
            # expressed as number of days since some time in the past (BEAST style)
            try:
                days = int(date_str)
            except ValueError:
                print 'ERROR: Expected integer value for sequence date, found', date_str
                raise
        return days

    def parse_dates_csv(self):
        """
        Parse dates from a CSV with "header" and "date" columns.
        """
        reader = DictReader(open(self.csv, 'rU'))
        for row in reader:
            self.dates.update({row['header']: self.parse_date(row['date'])})

    def read(self, handle):
        """
        Parse open file as FASTA.  Clean sequence labels.
        """
        self.fasta = {}  # reset container
        h = None
        sequence = ''
        count = 0
        for line in handle:
            if line.startswith('$'):  # skip comments
                continue
            if line.startswith('>') or line.startswith('#'):
                if sequence:
                    # create record for the previous entry
                    days = self.get_date(h)
                    self.fasta.update({h: {'header': '%d_%d' % (count, days),
                                           'sequence': sequence,
                                           'days': days}})
                    sequence = ''  # reset container
                h = line.strip('>#\n')
                count += 1
            else:
                sequence += line.strip('\n').upper()

        # append last entry
        days = self.get_date(h)
        self.fasta.update({h: {'header': '%d_%d' % (count, days),
                               'sequence': sequence,
                               'days': days}})

        # determine most recent sample date
        all_dates = [v['days'] for v in self.fasta.itervalues()]
        assert len(set(all_dates)) > 1, 'ERROR: Only one sample date in data'
        all_dates.sort(reverse=True)
        self.last_date = all_dates[0]

    def get_date(self, h):
        """
        If dates were provided as a CSV input file, then return the date
        associated with the sequence header supplied as the first argument.
        Otherwise, parse the date field from the sequence header in the FASTA
        object.
        """
        if self.csv:
            try:
                # this will always be days since X
                return self.dates[h]
            except KeyError:
                print 'ERROR: sequence header', h, 'not found in dates parsed from CSV'
                raise
        # otherwise, parse date from sequence headers
        date_field = h.split(self.delimiter)[self.field]
        return self.parse_date(date_field)

    def newick2phylo(self, nwk):
        """
        Convert a Newick tree string into a Phylo object.
        """
        handle = StringIO(nwk)
        phy = Phylo.read(handle, 'newick')
        return phy

    def phylo2newick(self, t):
        """
        Convert Phylo into Newick tree string.
        """
        output = StringIO()
        Phylo.write(t, output, 'newick')
        return output.getvalue()

    def plurality_consensus(self, column, alphabet='ACGT', resolve=False):
        """
        Plurality consensus - nucleotide with highest frequency.
        In case of tie, report mixtures.
""" mixture_dict = { "W": "AT", "R": "AG", "K": "GT", "Y": "CT", "S": "CG", "M": "AC", "V": "AGC", "H": "ATC", "D": "ATG", "B": "TGC", "N": "ATGC", "-": "ATGC", } ambig_dict = dict(("".join(sorted(v)), k) for k, v in mixture_dict.iteritems()) freqs = {} for char in alphabet: freqs.update({char: 0}) # freqs = {"A": 0, "T": 0, "C": 0, "G": 0, "-": 0} for char in column: if char in alphabet: freqs[char] += 1 elif mixture_dict.has_key(char): # handled ambiguous nucleotides with equal weighting resolutions = mixture_dict[char] for char2 in resolutions: freqs[char2] += 1.0 / len(resolutions) else: # unrecognized nucleotide character pass base = max(freqs, key=lambda n: freqs[n]) max_count = freqs[base] possib = filter(lambda n: freqs[n] == max_count, freqs) if len(possib) == 1: return possib[0] elif "-" in possib: if resolve: possib.remove("-") if len(possib) == 0: return "-" elif len(possib) == 1: return possib[0] else: return ambig_dict["".join(sorted(possib))] else: # gap character overrides ties return "-" else: return ambig_dict["".join(sorted(possib))] def consensus(self, seqs, alphabet="ACGT", resolve=False): """ Return plurality consensus of alignment. """ # transpose the alignment n_columns = len(seqs[0]) columns = [] for c in range(n_columns): columns.append([s[c] for s in seqs]) consen = [] for column in columns: consen.append(self.plurality_consensus(column, alphabet=alphabet, resolve=resolve)) return "".join(consen) def earliest_sample(self): # determine the earliest sample date dates = [v["days"] for v in self.fasta.itervalues()] dates.sort() # defaults to increasing order earliest_date = dates[0] # retrieve all sequences with this date first_sample = [v["sequence"] for k, v in self.fasta.iteritems() if v["days"] == earliest_date] return first_sample def consensus_earliest(self): """ Return the consensus of sequences from the earliest sample. :param fasta: :return: """ if not self.fasta: # no sequences have been parsed return None sample = self.earliest_sample() # list of sequences return self.consensus(sample) def consensus_all(self): """ Return the consensus of all sequences. :return: """ all_seqs = [v["sequence"] for v in self.fasta.itervalues()] return self.consensus(all_seqs) def output_fasta(self): """ Write contents of self.fasta to temporary file :return: Absolute path to temporary file """ with open(self.tmpfile, "w") as f: for i, (h, data) in enumerate(self.fasta.iteritems()): f.write(">%s\n%s\n" % (data["header"], data["sequence"])) def call_fasttree2(self, raw=False): """ Call FastTree2 on FASTA file :param raw: if True, retain original sequence headers :return: """ self.output_fasta() # writes to self.tmpfile p = subprocess.Popen( [self.ft2path, "-quiet", "-nosupport", "-nt", "-gtr"], stdin=open(self.tmpfile, "rU"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, ) stdout, stderr = p.communicate() return stdout def call_rtt(self, tree): """ Call an R script that implements Rosemary's rtt() function for re-rooting a tree based on tip dates. 
        :param tree: Newick tree string
        :return: the re-rooted Newick tree string
        """
        os.chdir('R/')
        p = subprocess.Popen([self.Rpath, 'rtt.r', tree], stdout=subprocess.PIPE)
        stdout, stderr = p.communicate()

        # clean kludge from R stdout
        # rooted_tree, dated_tree = map(lambda s: s.replace('[1] "', '').replace('NA;"', '0:0;'),
        #                               stdout.split('\n')[:2])
        # res = {'rooted': rooted_tree, 'dated': dated_tree}
        rooted_tree = stdout.replace('[1] ', '').strip('"\n')
        os.chdir('../')
        return rooted_tree

    def call_root2tip(self, tree):
        """
        Call jar file that implements a modified version of Andrew Rambaut's
        root-to-tip method (Path-O-Gen).
        :param tree: a Newick tree string
        :return: a dictionary that includes the time-scaled tree
        """
        # write tree to temporary file
        with open(self.tmpfile, 'w') as handle:
            handle.write(tree)

        out1 = os.path.join(self.tmp, 'anchre.r2t.timetree')
        out2 = os.path.join(self.tmp, 'anchre.r2t.csv')
        p = subprocess.check_call([self.java, '-jar', 'java/RLRootToTip.jar',
                                   '-timetree', out1, '-newick', self.tmpfile, out2],
                                  stdout=subprocess.PIPE)

        # read outputs
        with open(out1, 'rU') as handle:
            timetree = Phylo.read(handle, 'nexus')
        with open(out2, 'rU') as handle:
            coef = handle.readlines()

        # convert NEXUS to Newick string
        newick = self.phylo2newick(timetree)
        res = {'timetree': newick}

        # attach regression coefficients from the CSV header and value rows
        values = coef[1].strip('\n').split(',')
        for i, key in enumerate(coef[0].strip('\n').split(',')):
            res.update({key: values[i]})
        return res

    def call_hyphy_ancre(self, tree, model_spec='010010', is_codon=False):
        """
        Ancestral reconstruction with HyPhy.
        :param tree: Newick tree string
        :param is_codon: if True, interpret alignment as codon sequences
        :return: [ancseq] is a dictionary of header/sequence pairs; "Node0" keys
                 the root node.  [lf] is a serialization of the likelihood function.
        """
        # cast Newick tree string as Phylo object to extract tip labels
        phy = self.newick2phylo(tree)
        tips = phy.get_terminals()
        tipnames = [tip.name for tip in tips]
        tipnames.sort()

        # make sure the tree labels match the sequence headers
        headers = [v['header'] for v in self.fasta.itervalues()]
        headers.sort()
        if headers != tipnames:
            print 'ERROR: tree labels do not match FASTA in call_hyphy_ancre()'
            print set(headers).difference(set(tipnames))
            sys.exit()

        ancseq, lf = self.pyphy.ancre(fasta=self.fasta, newick=tree,
                                      model_spec=model_spec, is_codon=is_codon)
        return dict(ancseq), lf

    def call_beast(self, chain_length=1E6, screen_step=1E5, log_step=1E4,
                   treelog_step=1E4, sample_size=100, root_height=None):
        """
        Use BEAST to sample trees from the posterior density under a strict
        molecular clock model.  If you want different settings, modify the
        template XML file.
        :return: posterior traces and a list of Newick tree strings
        """
        log, treelog = self.beauti.populate(fasta=self.fasta,
                                            stem=os.path.join(self.tmp, 'beast'),
                                            chain_length=chain_length,
                                            screen_step=screen_step,
                                            log_step=log_step,
                                            treelog_step=treelog_step,
                                            root_height=root_height)
        self.beauti.write(self.tmpfile)

        # this was tested on version 1.8.1
        # 1.8.1 has a bug that results in zombie processes that fail to terminate - use 1.8.2
        p = subprocess.Popen([self.java, '-Xms64m', '-Xmx256m', '-jar', 'java/beast.jar',
                              '-beagle_off', '-overwrite', self.tmpfile],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             bufsize=0)
        for i, line in enumerate(p.stdout):
            if i % 10 == 0:
                sys.stdout.write('.')  # progress monitor
        sys.stdout.write('\n')

        with open(log, 'rU') as f:
            traces = self.beauti.parse_log(f)
        with open(treelog, 'rU') as f:
            trees = self.beauti.parse_treelog(f, sample_size=sample_size)

        return traces, trees
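
# ---------------------------------------------------------------------------
# Example usage: a minimal sketch, not part of the original module.  The binary
# paths, the input file "patient.fasta", and the BEAST template "template.xml"
# are placeholders to adapt to the local installation; sequence headers are
# assumed to end in "_<days>" so dates can be parsed without a CSV.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    anchre = Anchre(iso_format=False,              # header dates are integer day offsets
                    ft2path='/usr/local/bin/FastTree',
                    Rpath='/usr/bin/Rscript',
                    java='/usr/bin/java',
                    beast_xml_template='template.xml')

    # parse a longitudinal FASTA sample
    with open('patient.fasta', 'rU') as handle:
        anchre.read(handle)

    # consensus of the earliest time point
    print anchre.consensus_earliest()

    # approximate ML tree, re-rooted on tip dates, then ancestral reconstruction
    nwk = anchre.call_fasttree2()
    rooted = anchre.call_rtt(nwk)
    ancseq, lf = anchre.call_hyphy_ancre(rooted)
    print ancseq.get('Node0')  # reconstructed root sequence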