Пример #1
0
    def __init__(self,
                 csv=None,
                 iso_format=True,
                 origin=date(1970, 1, 1),
                 delimiter='_',
                 field=-1,
                 ft2path=None,
                 Rpath=None,
                 java=None,
                 tmpfile='anchre-tmp',
                 beast_xml_template=None):
        """
        Store settings, create the HyPhy/BEAUti helpers and check binaries.
        :param csv: optional path to a CSV file of sample dates; when given,
                    it is parsed immediately into self.dates
        :param iso_format: stored for the date parser
        :param origin: stored as the reference date for day counts
        :param delimiter: stored; separator for fields in sequence headers
        :param field: stored; index of the header field holding the date
        :param ft2path: path to the FastTree2 binary
        :param Rpath: path to the R binary
        :param java: path to the java binary
        :param tmpfile: file name placed inside the system temp directory
        :param beast_xml_template: passed on to Beauti()
        """
        self.csv = csv
        self.iso_format = iso_format
        self.origin = origin
        self.delimiter = delimiter
        self.field = field

        # paths to binaries
        self.ft2path = ft2path
        self.Rpath = Rpath
        self.java = java

        self.pyphy = PyPhy(os.getcwd(), 1)  # instance of HyPhy
        self.beauti = Beauti(beast_xml_template)

        # if given, parse dates from csv
        self.dates = {}
        if self.csv is not None:
            # the loader is defined as parse_dates_csv (plural); the former
            # call to parse_date_csv raised AttributeError
            self.parse_dates_csv()

        # store sequence records
        self.fasta = {}
        self.last_date = None

        # scratch file lives in the system temp directory
        self.tmp = tempfile.gettempdir()
        self.tmpfile = os.path.join(self.tmp, tmpfile)

        # verify that the external binaries can be found
        self.test()
Пример #2
0
    def __init__(
        self,
        csv=None,
        iso_format=True,
        origin=date(1970, 1, 1),
        delimiter="_",
        field=-1,
        ft2path=None,
        Rpath=None,
        java=None,
        tmpfile="anchre-tmp",
        beast_xml_template=None,
    ):
        """
        Store settings, create the HyPhy/BEAUti helpers and check binaries.
        :param csv: optional path to a CSV file of sample dates; when given,
                    it is parsed immediately into self.dates
        :param iso_format: stored for the date parser
        :param origin: stored as the reference date for day counts
        :param delimiter: stored; separator for fields in sequence headers
        :param field: stored; index of the header field holding the date
        :param ft2path: path to the FastTree2 binary
        :param Rpath: path to the R binary
        :param java: path to the java binary
        :param tmpfile: file name placed inside the system temp directory
        :param beast_xml_template: passed on to Beauti()
        """
        self.csv = csv
        self.iso_format = iso_format
        self.origin = origin
        self.delimiter = delimiter
        self.field = field

        # paths to binaries
        self.ft2path = ft2path
        self.Rpath = Rpath
        self.java = java

        self.pyphy = PyPhy(os.getcwd(), 1)  # instance of HyPhy
        self.beauti = Beauti(beast_xml_template)

        # if given, parse dates from csv
        self.dates = {}
        if self.csv is not None:
            # the loader is defined as parse_dates_csv (plural); the former
            # call to parse_date_csv raised AttributeError
            self.parse_dates_csv()

        # store sequence records
        self.fasta = {}
        self.last_date = None

        # scratch file lives in the system temp directory
        self.tmp = tempfile.gettempdir()
        self.tmpfile = os.path.join(self.tmp, tmpfile)

        # verify that the external binaries can be found
        self.test()
Пример #3
0
class Anchre:
    """
    Driver that parses dated FASTA records and hands them to external
    programs (FastTree2, an R rtt script, a root-to-tip jar, HyPhy, BEAST).

    Statements are written so that they parse under both Python 2 and
    Python 3 (single-argument print calls, dict.items()/values(), "in").
    """

    # IUPAC ambiguity codes and the nucleotides each one stands for.  A gap
    # is spread over all four nucleotides, exactly like "N".
    _MIXTURES = {
        "W": "AT",
        "R": "AG",
        "K": "GT",
        "Y": "CT",
        "S": "CG",
        "M": "AC",
        "V": "AGC",
        "H": "ATC",
        "D": "ATG",
        "B": "TGC",
        "N": "ATGC",
        "-": "ATGC",
    }

    # Reverse lookup: sorted nucleotides -> ambiguity code.  "-" is left out
    # so that a four-way tie is always reported as "N" instead of depending
    # on dictionary iteration order ("N" and "-" share the same expansion).
    _AMBIGUITY_CODES = dict(("".join(sorted(v)), k) for k, v in _MIXTURES.items() if k != "-")

    def __init__(
        self,
        csv=None,
        iso_format=True,
        origin=date(1970, 1, 1),
        delimiter="_",
        field=-1,
        ft2path=None,
        Rpath=None,
        java=None,
        tmpfile="anchre-tmp",
        beast_xml_template=None,
    ):
        """
        Store settings, create the HyPhy/BEAUti helpers and check binaries.
        :param csv: optional path to a CSV file with "header" and "date"
                    columns; when given it is parsed into self.dates
        :param iso_format: if True, dates are parsed as date strings,
                           otherwise as integer day counts
        :param origin: date from which day counts are measured
        :param delimiter: separator for fields in sequence headers
        :param field: index of the header field holding the date
        :param ft2path: path to the FastTree2 binary
        :param Rpath: path to the R binary
        :param java: path to the java binary
        :param tmpfile: file name placed inside the system temp directory
        :param beast_xml_template: passed on to Beauti()
        """
        self.csv = csv
        self.iso_format = iso_format
        self.origin = origin
        self.delimiter = delimiter
        self.field = field

        # paths to binaries
        self.ft2path = ft2path
        self.Rpath = Rpath
        self.java = java

        self.pyphy = PyPhy(os.getcwd(), 1)  # instance of HyPhy
        self.beauti = Beauti(beast_xml_template)

        # if given, parse dates from csv
        self.dates = {}
        if self.csv is not None:
            # was self.parse_date_csv(), which does not exist
            self.parse_dates_csv()

        # store sequence records
        self.fasta = {}
        self.last_date = None

        self.tmp = tempfile.gettempdir()
        self.tmpfile = os.path.join(self.tmp, tmpfile)
        self.test()

    def test(self):
        """
        Check whether expected binaries are accessible.  Prints an error and
        exits with status 1 when FastTree2 or R cannot be found.
        :return:
        """
        # an unset (None/empty) path counts as "not found" instead of
        # letting os.path.exists() raise TypeError
        if not self.ft2path or not os.path.exists(self.ft2path):
            print("ERROR: Failed to detect FastTree2 at %s" % (self.ft2path,))
            sys.exit(1)
        if not self.Rpath or not os.path.exists(self.Rpath):
            # report the R path (the FastTree2 path was printed here before)
            print("ERROR: Failed to detect R at %s" % (self.Rpath,))
            sys.exit(1)

    def parse_date(self, date_str):
        """
        Convert a string representation of a sample collection date into
        an integer value (number of days since some time in the past).
        :param date_str: date string, or integer string if not iso_format
        :return: integer number of days
        :raises ValueError: if the string cannot be interpreted
        """
        if self.iso_format:
            try:
                # dateup is imported at file level (not visible in this
                # block); its parse() result is reduced to a date
                return (dateup.parse(date_str).date() - self.origin).days
            except ValueError:
                print("ERROR: Failed to parse date %s" % (date_str,))
                raise

        # expressed as number of days since some time in the past (BEAST style)
        try:
            return int(date_str)
        except (TypeError, ValueError):
            print("ERROR: Expected integer value for sequence date, found %s" % (date_str,))
            raise

    def parse_dates_csv(self):
        """
        Parse dates from CSV.  Each row must provide a "header" and a "date"
        column; results are stored in self.dates keyed by header.
        """
        # context manager so the file handle is closed after parsing
        with open(self.csv, "rU") as handle:
            for row in DictReader(handle):
                self.dates[row["header"]] = self.parse_date(row["date"])

    def _store_record(self, h, count, sequence):
        """
        Add one sequence record to self.fasta under its original header.
        """
        days = self.get_date(h)
        self.fasta[h] = {"header": "%d_%d" % (count, days), "sequence": sequence, "days": days}

    def read(self, handle):
        """
        Parse open file as FASTA.  Clean sequence labels.

        Lines starting with "$" are skipped as comments; lines starting with
        ">" or "#" start a new record.  Each record is stored in self.fasta
        under its original header with a generated "<count>_<days>" label.
        Sequence lines that appear before the first header are ignored.
        :param handle: iterable of text lines
        :raises AssertionError: if fewer than two distinct sample dates
                                are present (including empty input)
        """
        self.fasta = {}  # reset container
        h = None
        chunks = []  # pieces of the current sequence
        count = 0
        for line in handle:
            if line.startswith("$"):  # skip comments
                continue

            if line.startswith((">", "#")):
                # a header without any sequence lines is not stored
                if chunks and h is not None:
                    self._store_record(h, count, "".join(chunks))
                chunks = []  # reset container

                h = line.strip(">#\n")
                count += 1
            else:
                piece = line.strip("\n").upper()
                if piece:
                    chunks.append(piece)

        # append last entry (nothing to store if no header was ever seen)
        if h is not None:
            self._store_record(h, count, "".join(chunks))

        # determine most recent sample date
        all_dates = [v["days"] for v in self.fasta.values()]
        if len(set(all_dates)) <= 1:
            # raised explicitly so the check survives "python -O"
            raise AssertionError("ERROR: Only one sample date in data")
        self.last_date = max(all_dates)

    def get_date(self, h):
        """
        If dates were provided as a CSV input file, then return the
        date associated with the sequence header supplied as the first argument.
        Otherwise, parse date field from sequence header in FASTA object.
        :param h: sequence header
        :return: integer number of days
        :raises KeyError: if a CSV was given and the header is not in it
        """
        if self.csv:
            try:
                # this will always be days since X
                return self.dates[h]
            except KeyError:
                print("ERROR: sequence header %s not found in dates parsed from CSV" % (h,))
                raise

        # otherwise, parse date from sequence headers
        date_field = h.split(self.delimiter)[self.field]
        return self.parse_date(date_field)

    def newick2phylo(self, nwk):
        """
        Convert Newick tree string into a Phylo tree object.
        """
        return Phylo.read(StringIO(nwk), "newick")

    def phylo2newick(self, t):
        """
        Convert Phylo into Newick tree string.
        """
        output = StringIO()
        Phylo.write(t, output, "newick")
        return output.getvalue()

    def plurality_consensus(self, column, alphabet="ACGT", resolve=False):
        """
        Plurality consensus - nucleotide with highest frequency.
        In case of tie, report mixtures.
        :param column: iterable of characters from one alignment column
        :param alphabet: characters that are counted directly
        :param resolve: if True, a tie that involves "-" is resolved in
                        favour of the remaining characters; if False the
                        gap wins the tie
        :return: a single character
        """
        freqs = dict((char, 0) for char in alphabet)
        for char in column:
            if char in alphabet:
                freqs[char] += 1
            elif char in self._MIXTURES:
                # handled ambiguous nucleotides with equal weighting
                resolutions = self._MIXTURES[char]
                for char2 in resolutions:
                    freqs[char2] += 1.0 / len(resolutions)
            # any other character is unrecognized and ignored

        max_count = max(freqs.values())
        possib = [n for n in freqs if freqs[n] == max_count]
        if len(possib) == 1:
            return possib[0]

        if "-" in possib:
            if not resolve:
                # gap character overrides ties
                return "-"
            # at least one other character remains, since this was a tie
            possib.remove("-")
            if len(possib) == 1:
                return possib[0]

        return self._AMBIGUITY_CODES["".join(sorted(possib))]

    def consensus(self, seqs, alphabet="ACGT", resolve=False):
        """
        Return plurality consensus of alignment.
        :param seqs: list of aligned sequences; the first one sets the length
        :return: consensus string ("" for an empty list)
        """
        if not seqs:
            return ""

        # walk the alignment column by column
        n_columns = len(seqs[0])
        return "".join(
            self.plurality_consensus([s[c] for s in seqs], alphabet=alphabet, resolve=resolve)
            for c in range(n_columns)
        )

    def earliest_sample(self):
        """
        Return the sequences that carry the earliest sample date.
        :return: list of sequences (empty if nothing has been parsed)
        """
        if not self.fasta:
            return []

        # determine the earliest sample date
        earliest_date = min(v["days"] for v in self.fasta.values())

        # retrieve all sequences with this date
        return [v["sequence"] for v in self.fasta.values() if v["days"] == earliest_date]

    def consensus_earliest(self):
        """
        Return the consensus of sequences from the earliest sample.
        :return: consensus string, or None if no sequences have been parsed
        """
        if not self.fasta:
            # no sequences have been parsed
            return None

        sample = self.earliest_sample()  # list of sequences
        return self.consensus(sample)

    def consensus_all(self):
        """
        Return the consensus of all sequences.
        :return: consensus string, or None if no sequences have been parsed
        """
        if not self.fasta:
            # same convention as consensus_earliest()
            return None

        all_seqs = [v["sequence"] for v in self.fasta.values()]
        return self.consensus(all_seqs)

    def output_fasta(self):
        """
        Write contents of self.fasta to temporary file, using the generated
        "<count>_<days>" labels as headers.
        :return:  Absolute path to temporary file
        """
        with open(self.tmpfile, "w") as f:
            for data in self.fasta.values():
                f.write(">%s\n%s\n" % (data["header"], data["sequence"]))
        return self.tmpfile

    def call_fasttree2(self, raw=False):
        """
        Call FastTree2 on FASTA file
        :param raw: currently unused; the generated headers are always written
        :return: standard output of FastTree2
        """
        self.output_fasta()  # writes to self.tmpfile

        # context manager so the stdin handle is closed once FastTree2 is done
        with open(self.tmpfile, "rU") as handle:
            p = subprocess.Popen(
                [self.ft2path, "-quiet", "-nosupport", "-nt", "-gtr"],
                stdin=handle,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
            )
            stdout, stderr = p.communicate()
        return stdout

    def call_rtt(self, tree):
        """
        Call an R script that implements Rosemary's rtt() function for re-rooting
        a tree based on tip dates.
        :param tree: Newick tree string
        :return: rooted tree as a Newick string
        """
        start_dir = os.getcwd()
        os.chdir("R/")
        try:
            p = subprocess.Popen([self.Rpath, "rtt.r", tree], stdout=subprocess.PIPE)
            stdout, stderr = p.communicate()
        finally:
            # always return to where we started, even if the call fails
            os.chdir(start_dir)

        # clean kludge from R stdout
        return stdout.replace("[1] ", "").strip('"\n')

    def call_root2tip(self, tree):
        """
        Call jar file that implements a modified version of Andrew Rambaut's
        root-to-tip method (Path-O-Gen).
        :param tree: a Newick tree string
        :return: a dictionary that includes the time-scaled tree under
                 "timetree" plus the columns of the CSV written by the jar
        """
        # write tree to temporary file
        with open(self.tmpfile, "w") as handle:
            handle.write(tree)

        out1 = os.path.join(self.tmp, "anchre.r2t.timetree")
        out2 = os.path.join(self.tmp, "anchre.r2t.csv")

        # discard the jar's stdout; a PIPE that nobody reads can block
        # check_call once the pipe buffer fills up
        with open(os.devnull, "w") as devnull:
            subprocess.check_call(
                [self.java, "-jar", "java/RLRootToTip.jar", "-timetree", out1, "-newick", self.tmpfile, out2],
                stdout=devnull,
            )

        # read outputs
        with open(out1, "rU") as handle:
            timetree = Phylo.read(handle, "nexus")
        with open(out2, "rU") as handle:
            coef = handle.readlines()

        # convert NEXUS to Newick string
        res = {"timetree": self.phylo2newick(timetree)}

        # first CSV line holds the column names, second line the values
        values = coef[1].strip("\n").split(",")
        for i, key in enumerate(coef[0].strip("\n").split(",")):
            res[key] = values[i]

        return res

    def call_hyphy_ancre(self, tree, model_spec="010010", is_codon=False):
        """
        Ancestral reconstruction with HyPhy
        :param tree: Newick tree string
        :param model_spec: passed through to PyPhy.ancre()
        :param is_codon: if True, interpret alignment as codon sequences
        :return: [ancseq] is a dictionary of header/sequence pairs.
                   "Node0" keys the root node.
                 [lf] is a serialization of the likelihood function.
        """

        # cast Newick tree string as Phylo object to extract tip labels
        phy = self.newick2phylo(tree)
        tipnames = sorted(tip.name for tip in phy.get_terminals())

        # make sure the tree labels match the sequence headers
        headers = sorted(v["header"] for v in self.fasta.values())
        if headers != tipnames:
            print("Warning: tree labels do not match FASTA in call_hyphy_ancre()")
            print(set(headers).difference(set(tipnames)))
            sys.exit(1)

        ancseq, lf = self.pyphy.ancre(fasta=self.fasta, newick=tree, model_spec=model_spec, is_codon=is_codon)
        return dict(ancseq), lf

    def call_beast(
        self, chain_length=1e6, screen_step=1e5, log_step=1e4, treelog_step=1e4, sample_size=100, root_height=None
    ):
        """
        Use BEAST to sample trees from the posterior density under a
        strict molecular clock model.  If you want different settings,
        modify the template XML file.
        :return: tuple (traces, trees) as parsed from the BEAST log and
                 tree log by the Beauti helper
        """
        log, treelog = self.beauti.populate(
            fasta=self.fasta,
            stem=os.path.join(self.tmp, "beast"),
            chain_length=chain_length,
            screen_step=screen_step,
            log_step=log_step,
            treelog_step=treelog_step,
            root_height=root_height,
        )

        self.beauti.write(self.tmpfile)
        # this was tested on version 1.8.1
        # 1.8.1 has a bug that results in zombie processes that fail to terminate - use 1.8.2
        # stderr goes to the null device: it was never read, and an unread
        # pipe can stall BEAST once its buffer is full
        with open(os.devnull, "w") as devnull:
            p = subprocess.Popen(
                [self.java, "-Xms64m", "-Xmx256m", "-jar", "java/beast.jar", "-beagle_off", "-overwrite", self.tmpfile],
                stdout=subprocess.PIPE,
                stderr=devnull,
                bufsize=0,
            )
            for i, line in enumerate(p.stdout):
                if i % 10 == 0:
                    sys.stdout.write(".")  # progress monitor
            p.stdout.close()
            p.wait()  # reap the child before reading its output files

        sys.stdout.write("\n")

        with open(log, "rU") as f:
            traces = self.beauti.parse_log(f)

        with open(treelog, "rU") as f:
            trees = self.beauti.parse_treelog(f, sample_size=sample_size)

        return traces, trees
Пример #4
0
class Anchre:
    """
    Driver that parses dated FASTA records and hands them to external
    programs (FastTree2, an R rtt script, a root-to-tip jar, HyPhy, BEAST).

    Statements are written so that they parse under both Python 2 and
    Python 3 (single-argument print calls, dict.items()/values(), "in").
    """

    # IUPAC ambiguity codes and the nucleotides each one stands for.  A gap
    # is spread over all four nucleotides, exactly like "N".
    _MIXTURES = {
        'W': 'AT',
        'R': 'AG',
        'K': 'GT',
        'Y': 'CT',
        'S': 'CG',
        'M': 'AC',
        'V': 'AGC',
        'H': 'ATC',
        'D': 'ATG',
        'B': 'TGC',
        'N': 'ATGC',
        '-': 'ATGC'
    }

    # Reverse lookup: sorted nucleotides -> ambiguity code.  "-" is left out
    # so that a four-way tie is always reported as "N" instead of depending
    # on dictionary iteration order ("N" and "-" share the same expansion).
    _AMBIGUITY_CODES = dict(
        (''.join(sorted(v)), k) for k, v in _MIXTURES.items() if k != '-')

    def __init__(self,
                 csv=None,
                 iso_format=True,
                 origin=date(1970, 1, 1),
                 delimiter='_',
                 field=-1,
                 ft2path=None,
                 Rpath=None,
                 java=None,
                 tmpfile='anchre-tmp',
                 beast_xml_template=None):
        """
        Store settings, create the HyPhy/BEAUti helpers and check binaries.
        :param csv: optional path to a CSV file with "header" and "date"
                    columns; when given it is parsed into self.dates
        :param iso_format: if True, dates are parsed as date strings,
                           otherwise as integer day counts
        :param origin: date from which day counts are measured
        :param delimiter: separator for fields in sequence headers
        :param field: index of the header field holding the date
        :param ft2path: path to the FastTree2 binary
        :param Rpath: path to the R binary
        :param java: path to the java binary
        :param tmpfile: file name placed inside the system temp directory
        :param beast_xml_template: passed on to Beauti()
        """
        self.csv = csv
        self.iso_format = iso_format
        self.origin = origin
        self.delimiter = delimiter
        self.field = field

        # paths to binaries
        self.ft2path = ft2path
        self.Rpath = Rpath
        self.java = java

        self.pyphy = PyPhy(os.getcwd(), 1)  # instance of HyPhy
        self.beauti = Beauti(beast_xml_template)

        # if given, parse dates from csv
        self.dates = {}
        if self.csv is not None:
            # was self.parse_date_csv(), which does not exist
            self.parse_dates_csv()

        # store sequence records
        self.fasta = {}
        self.last_date = None

        self.tmp = tempfile.gettempdir()
        self.tmpfile = os.path.join(self.tmp, tmpfile)
        self.test()

    def test(self):
        """
        Check whether expected binaries are accessible.  Prints an error and
        exits with status 1 when FastTree2 or R cannot be found.
        :return:
        """
        # an unset (None/empty) path counts as "not found" instead of
        # letting os.path.exists() raise TypeError
        if not self.ft2path or not os.path.exists(self.ft2path):
            print('ERROR: Failed to detect FastTree2 at %s' %
                  (self.ft2path, ))
            sys.exit(1)
        if not self.Rpath or not os.path.exists(self.Rpath):
            # report the R path (the FastTree2 path was printed here before)
            print('ERROR: Failed to detect R at %s' % (self.Rpath, ))
            sys.exit(1)

    def parse_date(self, date_str):
        """
        Convert a string representation of a sample collection date into
        an integer value (number of days since some time in the past).
        :param date_str: date string, or integer string if not iso_format
        :return: integer number of days
        :raises ValueError: if the string cannot be interpreted
        """
        if self.iso_format:
            try:
                # dateup is imported at file level (not visible in this
                # block); its parse() result is reduced to a date
                return (dateup.parse(date_str).date() - self.origin).days
            except ValueError:
                print('ERROR: Failed to parse date %s' % (date_str, ))
                raise

        # expressed as number of days since some time in the past (BEAST style)
        try:
            return int(date_str)
        except (TypeError, ValueError):
            print('ERROR: Expected integer value for sequence date, found %s'
                  % (date_str, ))
            raise

    def parse_dates_csv(self):
        """
        Parse dates from CSV.  Each row must provide a "header" and a "date"
        column; results are stored in self.dates keyed by header.
        """
        # context manager so the file handle is closed after parsing
        with open(self.csv, 'rU') as handle:
            for row in DictReader(handle):
                self.dates[row['header']] = self.parse_date(row['date'])

    def _store_record(self, h, count, sequence):
        """
        Add one sequence record to self.fasta under its original header.
        """
        days = self.get_date(h)
        self.fasta[h] = {
            'header': '%d_%d' % (count, days),
            'sequence': sequence,
            'days': days
        }

    def read(self, handle):
        """
        Parse open file as FASTA.  Clean sequence labels.

        Lines starting with "$" are skipped as comments; lines starting with
        ">" or "#" start a new record.  Each record is stored in self.fasta
        under its original header with a generated "<count>_<days>" label.
        Sequence lines that appear before the first header are ignored.
        :param handle: iterable of text lines
        :raises AssertionError: if fewer than two distinct sample dates
                                are present (including empty input)
        """
        self.fasta = {}  # reset container
        h = None
        chunks = []  # pieces of the current sequence
        count = 0
        for line in handle:
            if line.startswith('$'):  # skip comments
                continue

            if line.startswith(('>', '#')):
                # a header without any sequence lines is not stored
                if chunks and h is not None:
                    self._store_record(h, count, ''.join(chunks))
                chunks = []  # reset container

                h = line.strip('>#\n')
                count += 1
            else:
                piece = line.strip('\n').upper()
                if piece:
                    chunks.append(piece)

        # append last entry (nothing to store if no header was ever seen)
        if h is not None:
            self._store_record(h, count, ''.join(chunks))

        # determine most recent sample date
        all_dates = [v['days'] for v in self.fasta.values()]
        if len(set(all_dates)) <= 1:
            # raised explicitly so the check survives "python -O"
            raise AssertionError('ERROR: Only one sample date in data')
        self.last_date = max(all_dates)

    def get_date(self, h):
        """
        If dates were provided as a CSV input file, then return the
        date associated with the sequence header supplied as the first argument.
        Otherwise, parse date field from sequence header in FASTA object.
        :param h: sequence header
        :return: integer number of days
        :raises KeyError: if a CSV was given and the header is not in it
        """
        if self.csv:
            try:
                # this will always be days since X
                return self.dates[h]
            except KeyError:
                print(
                    'ERROR: sequence header %s not found in dates parsed from CSV'
                    % (h, ))
                raise

        # otherwise, parse date from sequence headers
        date_field = h.split(self.delimiter)[self.field]
        return self.parse_date(date_field)

    def newick2phylo(self, nwk):
        """
        Convert Newick tree string into a Phylo tree object.
        """
        return Phylo.read(StringIO(nwk), 'newick')

    def phylo2newick(self, t):
        """
        Convert Phylo into Newick tree string.
        """
        output = StringIO()
        Phylo.write(t, output, 'newick')
        return output.getvalue()

    def plurality_consensus(self, column, alphabet='ACGT', resolve=False):
        """
        Plurality consensus - nucleotide with highest frequency.
        In case of tie, report mixtures.
        :param column: iterable of characters from one alignment column
        :param alphabet: characters that are counted directly
        :param resolve: if True, a tie that involves "-" is resolved in
                        favour of the remaining characters; if False the
                        gap wins the tie
        :return: a single character
        """
        freqs = dict((char, 0) for char in alphabet)
        for char in column:
            if char in alphabet:
                freqs[char] += 1
            elif char in self._MIXTURES:
                # handled ambiguous nucleotides with equal weighting
                resolutions = self._MIXTURES[char]
                for char2 in resolutions:
                    freqs[char2] += 1. / len(resolutions)
            # any other character is unrecognized and ignored

        max_count = max(freqs.values())
        possib = [n for n in freqs if freqs[n] == max_count]
        if len(possib) == 1:
            return possib[0]

        if '-' in possib:
            if not resolve:
                # gap character overrides ties
                return '-'
            # at least one other character remains, since this was a tie
            possib.remove('-')
            if len(possib) == 1:
                return possib[0]

        return self._AMBIGUITY_CODES[''.join(sorted(possib))]

    def consensus(self, seqs, alphabet='ACGT', resolve=False):
        """
        Return plurality consensus of alignment.
        :param seqs: list of aligned sequences; the first one sets the length
        :return: consensus string ("" for an empty list)
        """
        if not seqs:
            return ''

        # walk the alignment column by column
        n_columns = len(seqs[0])
        return ''.join(
            self.plurality_consensus([s[c] for s in seqs],
                                     alphabet=alphabet,
                                     resolve=resolve)
            for c in range(n_columns))

    def earliest_sample(self):
        """
        Return the sequences that carry the earliest sample date.
        :return: list of sequences (empty if nothing has been parsed)
        """
        if not self.fasta:
            return []

        # determine the earliest sample date
        earliest_date = min(v['days'] for v in self.fasta.values())

        # retrieve all sequences with this date
        return [
            v['sequence'] for v in self.fasta.values()
            if v['days'] == earliest_date
        ]

    def consensus_earliest(self):
        """
        Return the consensus of sequences from the earliest sample.
        :return: consensus string, or None if no sequences have been parsed
        """
        if not self.fasta:
            # no sequences have been parsed
            return None

        sample = self.earliest_sample()  # list of sequences
        return self.consensus(sample)

    def consensus_all(self):
        """
        Return the consensus of all sequences.
        :return: consensus string, or None if no sequences have been parsed
        """
        if not self.fasta:
            # same convention as consensus_earliest()
            return None

        all_seqs = [v['sequence'] for v in self.fasta.values()]
        return self.consensus(all_seqs)

    def output_fasta(self):
        """
        Write contents of self.fasta to temporary file, using the generated
        "<count>_<days>" labels as headers.
        :return:  Absolute path to temporary file
        """
        with open(self.tmpfile, 'w') as f:
            for data in self.fasta.values():
                f.write('>%s\n%s\n' % (data['header'], data['sequence']))
        return self.tmpfile

    def call_fasttree2(self, raw=False):
        """
        Call FastTree2 on FASTA file
        :param raw: currently unused; the generated headers are always written
        :return: standard output of FastTree2
        """
        self.output_fasta()  # writes to self.tmpfile

        # context manager so the stdin handle is closed once FastTree2 is done
        with open(self.tmpfile, 'rU') as handle:
            p = subprocess.Popen(
                [self.ft2path, '-quiet', '-nosupport', '-nt', '-gtr'],
                stdin=handle,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
        return stdout

    def call_rtt(self, tree):
        """
        Call an R script that implements Rosemary's rtt() function for re-rooting
        a tree based on tip dates.
        :param tree: Newick tree string
        :return: rooted tree as a Newick string
        """
        start_dir = os.getcwd()
        os.chdir('R/')
        try:
            p = subprocess.Popen([self.Rpath, 'rtt.r', tree],
                                 stdout=subprocess.PIPE)
            stdout, stderr = p.communicate()
        finally:
            # always return to where we started, even if the call fails
            os.chdir(start_dir)

        # clean kludge from R stdout
        return stdout.replace('[1] ', '').strip('"\n')

    def call_root2tip(self, tree):
        """
        Call jar file that implements a modified version of Andrew Rambaut's
        root-to-tip method (Path-O-Gen).
        :param tree: a Newick tree string
        :return: a dictionary that includes the time-scaled tree under
                 "timetree" plus the columns of the CSV written by the jar
        """
        # write tree to temporary file
        with open(self.tmpfile, 'w') as handle:
            handle.write(tree)

        out1 = os.path.join(self.tmp, 'anchre.r2t.timetree')
        out2 = os.path.join(self.tmp, 'anchre.r2t.csv')

        # discard the jar's stdout; a PIPE that nobody reads can block
        # check_call once the pipe buffer fills up
        with open(os.devnull, 'w') as devnull:
            subprocess.check_call([
                self.java, '-jar', 'java/RLRootToTip.jar', '-timetree', out1,
                '-newick', self.tmpfile, out2
            ],
                                  stdout=devnull)

        # read outputs
        with open(out1, 'rU') as handle:
            timetree = Phylo.read(handle, 'nexus')
        with open(out2, 'rU') as handle:
            coef = handle.readlines()

        # convert NEXUS to Newick string
        res = {'timetree': self.phylo2newick(timetree)}

        # first CSV line holds the column names, second line the values
        values = coef[1].strip('\n').split(',')
        for i, key in enumerate(coef[0].strip('\n').split(',')):
            res[key] = values[i]

        return res

    def call_hyphy_ancre(self, tree, model_spec='010010', is_codon=False):
        """
        Ancestral reconstruction with HyPhy
        :param tree: Newick tree string
        :param model_spec: passed through to PyPhy.ancre()
        :param is_codon: if True, interpret alignment as codon sequences
        :return: [ancseq] is a dictionary of header/sequence pairs.
                   "Node0" keys the root node.
                 [lf] is a serialization of the likelihood function.
        """

        # cast Newick tree string as Phylo object to extract tip labels
        phy = self.newick2phylo(tree)
        tipnames = sorted(tip.name for tip in phy.get_terminals())

        # make sure the tree labels match the sequence headers
        headers = sorted(v['header'] for v in self.fasta.values())
        if headers != tipnames:
            print(
                'Warning: tree labels do not match FASTA in call_hyphy_ancre()'
            )
            print(set(headers).difference(set(tipnames)))
            sys.exit(1)

        ancseq, lf = self.pyphy.ancre(fasta=self.fasta,
                                      newick=tree,
                                      model_spec=model_spec,
                                      is_codon=is_codon)
        return dict(ancseq), lf

    def call_beast(self,
                   chain_length=1E6,
                   screen_step=1E5,
                   log_step=1E4,
                   treelog_step=1E4,
                   sample_size=100,
                   root_height=None):
        """
        Use BEAST to sample trees from the posterior density under a
        strict molecular clock model.  If you want different settings,
        modify the template XML file.
        :return: tuple (traces, trees) as parsed from the BEAST log and
                 tree log by the Beauti helper
        """
        log, treelog = self.beauti.populate(fasta=self.fasta,
                                            stem=os.path.join(
                                                self.tmp, 'beast'),
                                            chain_length=chain_length,
                                            screen_step=screen_step,
                                            log_step=log_step,
                                            treelog_step=treelog_step,
                                            root_height=root_height)

        self.beauti.write(self.tmpfile)
        # this was tested on version 1.8.1
        # 1.8.1 has a bug that results in zombie processes that fail to terminate - use 1.8.2
        # stderr goes to the null device: it was never read, and an unread
        # pipe can stall BEAST once its buffer is full
        with open(os.devnull, 'w') as devnull:
            p = subprocess.Popen([
                self.java, '-Xms64m', '-Xmx256m', '-jar', 'java/beast.jar',
                '-beagle_off', '-overwrite', self.tmpfile
            ],
                                 stdout=subprocess.PIPE,
                                 stderr=devnull,
                                 bufsize=0)
            for i, line in enumerate(p.stdout):
                if i % 10 == 0:
                    sys.stdout.write('.')  # progress monitor
            p.stdout.close()
            p.wait()  # reap the child before reading its output files

        sys.stdout.write('\n')

        with open(log, 'rU') as f:
            traces = self.beauti.parse_log(f)

        with open(treelog, 'rU') as f:
            trees = self.beauti.parse_treelog(f, sample_size=sample_size)

        return traces, trees