def _producer(self, replicated_num):
        """Create one marker-jackknifed alignment and infer a tree from it.

        Parameters
        ----------
        replicated_num : int
          Unique replicate number; it is embedded in every output file name.

        Returns
        -------
        bool
          Always True.
        """

        rep_id = str(replicated_num)

        def rep_path(stem, extension):
            # every file of a replicate is written to the replicate directory
            return os.path.join(self.replicate_dir, stem + rep_id + extension)

        output_msa = rep_path('jk_markers.msa.', '.faa')
        output_tree = rep_path('jk_markers.tree.', '.tre')
        fast_tree_output = rep_path('jk_markers.fasttree.', '.out')

        # the jackknifed alignment must exist before FastTree is started
        self.jackknife_alignment(self.msa, self.perc_markers_to_keep,
                                 self.marker_lengths, output_msa)

        tree_builder = FastTree(multithreaded=False)
        tree_builder.run(output_msa, 'prot', self.model, output_tree,
                         fast_tree_output)

        return True
示例#2
0
    def infer(self, options):
        """Infer an unrooted tree from an MSA using FastTree.

        Parameters
        ----------
        options : object
          Option container; the attributes read here are msa_file, out_dir,
          cpus, prot_model, prefix and suffix.
        """

        self.logger.warning("Tree inference is still under development!")

        check_file_exists(options.msa_file)
        make_sure_path_exists(options.out_dir)

        # more than one CPU selects the multithreaded FastTree executable
        use_multithreading = options.cpus > 1
        required_program = 'FastTreeMP' if use_multithreading else 'FastTree'
        check_dependencies([required_program])

        self.logger.info('Inferring tree with FastTree using %s+GAMMA.' %
                         options.prot_model)
        fasttree = FastTree(multithreaded=use_multithreading)

        tree_file_name = options.prefix + options.suffix + '.unrooted.tree'
        tree_unrooted_output = os.path.join(options.out_dir, tree_file_name)
        tree_log = os.path.join(options.out_dir, options.prefix + '.tree.log')
        tree_output_log = os.path.join(options.out_dir, 'fasttree.log')
        fasttree.run(options.msa_file, 'prot', options.prot_model,
                     tree_unrooted_output, tree_log, tree_output_log)

        self.logger.info('Done.')
示例#3
0
    def _producer(self, replicated_num):
        """Create one bootstrapped alignment and infer a tree from it.

        Parameters
        ----------
        replicated_num : int
          Unique replicate number; it is embedded in every output file name.

        Returns
        -------
        bool
          Always True.
        """

        rep_id = str(replicated_num)

        def rep_path(stem, extension):
            # every file of a replicate is written to the replicate directory
            return os.path.join(self.replicate_dir, stem + rep_id + extension)

        output_msa = rep_path('bootstrap_msa.r_', '.fna')
        output_tree = rep_path('bootstrap_tree.r_', '.tree')
        fast_tree_output = rep_path('bootstrap_fasttree.r_', '.out')

        # the bootstrapped alignment must exist before FastTree is started
        bootstrap_alignment(self.msa, output_msa, frac=self.frac)

        tree_builder = FastTree(multithreaded=False)
        tree_builder.run(output_msa, self.base_type, self.model, output_tree,
                         fast_tree_output)

        return True
示例#4
0
    def _producer(self, replicated_num):
        """Infer tree from bootstrapped multiple sequence alignment.

        A replicate is skipped when its bootstrapped alignment or its
        FastTree output file already exists and is non-empty, so that a
        previous run can be resumed.

        Parameters
        ----------
        replicated_num : int
          Unique replicate number.

        Returns
        -------
        bool
          Always True, whether the replicate was computed or skipped.
        """

        def is_non_empty_file(path):
            # an existing file with content is treated as already produced
            return os.path.exists(path) and os.path.getsize(path) > 0

        output_msa = os.path.join(
            self.replicate_dir,
            'bootstrap_msa.r_' + str(replicated_num) + '.fna')
        # NOTE(review): a run interrupted after the alignment was written but
        # before the tree was finished would also be skipped here - confirm
        # that this is the intended resume behaviour.
        if is_non_empty_file(output_msa):
            self.logger.warning(
                'Skipping {} as it already exists.'.format(output_msa))
            return True

        output_tree = os.path.join(
            self.replicate_dir,
            'bootstrap_tree.r_' + str(replicated_num) + '.tree')
        fast_tree_output = os.path.join(
            self.replicate_dir,
            'bootstrap_fasttree.r_' + str(replicated_num) + '.out')
        if is_non_empty_file(fast_tree_output):
            self.logger.warning(
                'Skipping {} as it already exists.'.format(fast_tree_output))
            return True

        bootstrap_alignment(self.msa, output_msa, frac=self.frac)
        fast_tree = FastTree(multithreaded=False)
        # the return value of run() was never used, so it is no longer bound
        fast_tree.run(output_msa, self.base_type, self.model, self.gamma,
                      output_tree, fast_tree_output)

        return True
示例#5
0
    def _producer(self, replicated_num):
        """Create one bootstrapped alignment and infer a tree from it.

        Parameters
        ----------
        replicated_num : int
          Unique replicate number; it is embedded in every output file name.

        Returns
        -------
        bool
          Always True.
        """

        rep_id = str(replicated_num)

        def rep_path(stem, extension):
            # every file of a replicate is written to the replicate directory
            return os.path.join(self.replicate_dir, stem + rep_id + extension)

        output_msa = rep_path('bootstrap_msa.r_', '.fna')
        output_tree = rep_path('bootstrap_tree.r_', '.tree')
        fast_tree_output = rep_path('bootstrap_fasttree.r_', '.out')

        # the bootstrapped alignment must exist before FastTree is started
        bootstrap_alignment(self.msa, output_msa, frac=self.frac)

        tree_builder = FastTree(multithreaded=False)
        tree_builder.run(output_msa,
                         self.base_type,
                         self.model,
                         self.gamma,
                         output_tree,
                         fast_tree_output)

        return True
示例#6
0
    def _producer(self, replicated_num):
        """Create one taxa-jackknifed alignment and infer a tree from it.

        Parameters
        ----------
        replicated_num : int
          Unique replicate number; it is embedded in every output file name.

        Returns
        -------
        bool
          Always True.
        """

        rep_id = str(replicated_num)

        def rep_path(stem, extension):
            # every file of a replicate is written to the replicate directory
            return os.path.join(self.replicate_dir, stem + rep_id + extension)

        output_msa = rep_path('jk_taxa.msa.', '.fna')
        output_tree = rep_path('jk_taxa.tree.', '.tre')
        fast_tree_output = rep_path('jk_taxa.fasttree.', '.out')

        # the jackknifed alignment must exist before FastTree is started
        self.jackknife_taxa(self.msa,
                            self.perc_taxa_to_keep,
                            self.outgroup_ids,
                            output_msa)

        tree_builder = FastTree(multithreaded=False)
        tree_builder.run(output_msa, 'prot', self.model, output_tree, fast_tree_output)

        return True
示例#7
0
    def run(self, genome_id_file, marker_id_file, model, output_dir):
        """Identify phylogenetic tree.

        Fetches the marker gene HMMs, aligns the marker genes of the
        requested genomes, concatenates the alignments, infers a tree with
        FastTree and writes a short summary report.

        Parameters
        ----------
        genome_id_file : str
            File specifying unique ids of genomes to include in tree.
        marker_id_file : str
            File specifying unique ids of marker genes  to use for inference.
        model : str ['wag' or 'jtt']
            Model of evolution to use.
        output_dir : str
            Directory to store results.
        """

        time_keeper = TimeKeeper()

        output_alignment_dir = os.path.join(output_dir, 'alignments')
        make_sure_path_exists(output_alignment_dir)

        output_model_dir = os.path.join(output_dir, 'hmm_models')
        make_sure_path_exists(output_model_dir)

        # read directory for each genome
        genome_dirs = read_genome_dir_file(self.genome_dir_file)

        # read genomes within the ingroup
        ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
        genome_ids = ncbi_genome_ids.union(user_genome_ids)
        self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
        self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
        self.logger.info('User genomes: %d' % len(user_genome_ids))

        # get marker genes
        self.logger.info('Reading marker genes.')
        marker_genes = read_marker_id_file(marker_id_file)
        self.logger.info('Read %d marker genes.' % len(marker_genes))

        # gather all single-copy HMMs into a single model file
        hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
        hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
        self.logger.info('Generating marker gene HMM model files.')
        self._fetch_marker_models(marker_genes, hmm_model_out, hmm_info_out,
                                  output_model_dir)

        # align gene sequences
        align_markers = AlignMarkers(self.cpus)
        align_markers.run(genome_ids, genome_dirs, marker_genes, True,
                          output_alignment_dir, output_model_dir)

        # create concatenated alignment file
        self.logger.info('Concatenating alignments.')
        concatenated_alignment_file = os.path.join(
            output_dir, 'concatenated_alignment.faa')
        marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
        create_concatenated_alignment(genome_ids, marker_genes,
                                      output_alignment_dir,
                                      concatenated_alignment_file, marker_file)

        # create concatenated genome tree
        self.logger.info('Inferring concatenated genome tree.')
        concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
        concatenated_tree_log = os.path.join(output_dir,
                                             'concatenated.tree.log')
        log_file = os.path.join(output_dir, 'fasttree.log')
        fast_tree = FastTree(multithreaded=True)
        fast_tree.run(concatenated_alignment_file, 'prot', model,
                      concatenated_tree, concatenated_tree_log, log_file)

        # generate summary report; the context manager closes the file even
        # if one of the writes fails
        report_out = os.path.join(output_dir, 'infer_workflow.log')
        with open(report_out, 'w') as fout:
            fout.write('[infer]\n')
            fout.write('Genome Id file: %s\n' % genome_id_file)
            fout.write('Marker Id file: %s\n' % marker_id_file)
            fout.write('Model of evolution: %s\n' % model)
            fout.write(time_keeper.get_time_stamp())
示例#8
0
    def run(self, msa_file, tree_program, prot_model, skip_rooting,
            output_dir):
        """Infer tree.

        Parameters
        ----------
        msa_file : str
          Multiple sequence alignment in fasta format.
        tree_program : str
          Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
          Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        skip_rooting : bool
          If true, the unrooted tree is returned as is and no rooted tree
          file is written.
        output_dir : str
          Directory to store results.

        Returns
        -------
        str
          Path to the rooted tree file, or to the unrooted tree when
          rooting is skipped.

        Raises
        ------
        SystemExit
          If the MSA contains two or fewer sequences, or if tree_program is
          not one of the supported programs.
        """

        num_seqs = sum(1 for _, _ in seq_io.read_seq(msa_file))
        if num_seqs <= 2:
            self.logger.error(
                'Insufficient number of sequences in MSA to infer tree.')
            raise SystemExit('Tree inference failed.')

        output_file = ntpath.basename(msa_file)
        dot_index = output_file.rfind('.')
        # rfind() returns -1 for a name without an extension; slicing with
        # that value would silently drop the last character of the name
        prefix = output_file[:dot_index] if dot_index != -1 else output_file

        if tree_program == 'fasttree':
            self.logger.info(
                'Inferring gene tree with FastTree using %s+GAMMA.' %
                prot_model)
            fasttree = FastTree(multithreaded=(self.cpus > 1))

            tree_unrooted_output = os.path.join(output_dir,
                                                prefix + '.unrooted.tree')
            tree_log = os.path.join(output_dir, prefix + '.tree.log')
            tree_output_log = os.path.join(output_dir, 'fasttree.log')
            fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output,
                         tree_log, tree_output_log)
        elif tree_program == 'raxml':
            self.logger.info(
                'Inferring gene tree with RAxML using PROTGAMMA%s.' %
                prot_model)

            # create phylip MSA file
            phylip_msa_file = msa_file.replace('.faa', '.phyx')
            cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
            os.system(cmd)

            # run RAxML
            raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))

            raxml = RAxML(self.cpus)
            tree_unrooted_output = raxml.run(phylip_msa_file, prot_model,
                                             raxml_dir)
        else:
            # fail with a clear message instead of an unbound-variable error
            # further down
            self.logger.error('Unsupported tree program: %s' % tree_program)
            raise SystemExit('Tree inference failed.')

        if skip_rooting:
            return tree_unrooted_output

        # The tree is always loaded so it can be written out even when it is
        # too small to be rooted at the midpoint.
        tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        # root tree at midpoint
        seqs = seq_io.read(msa_file)
        if len(seqs) > 2:
            self.logger.info('Rooting tree at midpoint.')
            tree.reroot_at_midpoint(update_bipartitions=False)

        tree_output = os.path.join(output_dir, prefix + '.rooted.tree')
        tree.write_to_path(tree_output,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        return tree_output
示例#9
0
    def run(self, gene_dirs, min_per_gene, min_per_bps, tree_program,
            prot_model, split_chars, output_dir):
        """Infer concatenated gene tree.

        Concatenates the per-gene alignments of all taxa passing the
        filters, infers a tree, roots it at the midpoint, decorates it with
        tax2tree and writes an ARB metadata file.

        Parameters
        ----------
        gene_dirs : list
            GeneTreeTk output directories with information for individual genes.
        min_per_gene : float
            Minimum percentage of genes required to retain taxa.
        min_per_bps : float
            Minimum percentage of base pairs required to retain taxa.
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        split_chars : object
            Passed unchanged to _split_ids() to split a sequence identifier
            into its taxon and gene identifiers.
        output_dir : str
            Directory to store results.
        """

        # read MSA files
        concat = defaultdict(lambda: defaultdict(list))
        msa_length = 0
        gene_lengths = {}
        for gene_dir in gene_dirs:
            homologs = os.path.join(gene_dir, 'homologs.trimmed.aligned.faa')

            # length of this gene's alignment, taken from its last sequence;
            # stays 0 when the file contains no sequences
            gene_length = 0
            for seq_id, seq in seq_io.read_seq(homologs):
                taxon_id, _ = self._split_ids(seq_id, split_chars)
                if not taxon_id:
                    self.logger.error('Failed to split identifier: %s' %
                                      seq_id)
                    sys.exit(-1)

                concat[taxon_id][gene_dir].append(seq)
                gene_length = len(seq)

            msa_length += gene_length
            gene_lengths[gene_dir] = gene_length

        # filter taxon
        mc_filter = set()
        min_per_gene_filter = set()
        min_per_bps_filter = set()
        for taxon_id in concat:
            taxon_genes = concat[taxon_id]

            # check if multiple copy
            missing = 0
            taxon_msa_len = 0
            for gene_dir in gene_dirs:
                if gene_dir not in taxon_genes:
                    missing += 1
                    continue

                if len(taxon_genes[gene_dir]) > 1:
                    mc_filter.add(taxon_id)
                    break

                taxon_msa_len += len(taxon_genes[gene_dir][0])

            if taxon_id not in mc_filter:
                if missing > len(gene_dirs) * (1.0 -
                                               float(min_per_gene) / 100.0):
                    min_per_gene_filter.add(taxon_id)
                elif taxon_msa_len < msa_length * float(min_per_bps) / 100.0:
                    min_per_bps_filter.add(taxon_id)

        min_req_genes = math.ceil(len(gene_dirs) * float(min_per_gene) / 100.0)

        filtered_taxa = mc_filter.union(min_per_gene_filter).union(
            min_per_bps_filter)
        remaining_taxa = set(concat) - filtered_taxa
        self.logger.info('No. genes: %d' % len(gene_dirs))
        self.logger.info('No. taxa across all genes: %d' % len(concat))
        self.logger.info('Total filtered taxa: %d' % len(filtered_taxa))
        self.logger.info('  Due to multi-copy genes: %d' % len(mc_filter))
        self.logger.info('  Due to having <%d of the genes: %d' %
                         (min_req_genes, len(min_per_gene_filter)))
        self.logger.info('  Due to an insufficient number of base pairs: %d' %
                         len(min_per_bps_filter))
        self.logger.info('Remaining taxa: %d' % len(remaining_taxa))
        self.logger.info('Length of concatenated MSA: %d' % msa_length)

        # create the multiple sequences alignment
        msa_file = os.path.join(output_dir, 'concatenated.faa')
        with open(msa_file, 'w') as fout:
            for taxon_id in remaining_taxa:
                taxon_genes = concat[taxon_id]

                # a missing gene is represented by a run of gaps of that
                # gene's alignment length
                segments = []
                for gene_dir in gene_dirs:
                    if gene_dir not in taxon_genes:
                        segments.append('-' * gene_lengths[gene_dir])
                    else:
                        segments.append(taxon_genes[gene_dir][0])

                fout.write('>%s\n' % taxon_id)
                fout.write('%s\n' % ''.join(segments))

        # read all taxonomy files
        # (assumes taxonomy is the same for taxa across all genes)
        taxonomy = {}
        for gene_dir in gene_dirs:
            taxonomy_file = os.path.join(gene_dir, 'taxonomy.tsv')
            t = Taxonomy().read(taxonomy_file)
            # items() is used instead of the Python 2 only iteritems()
            for label, taxa_str in t.items():
                taxon_id, _ = self._split_ids(label, split_chars)
                taxonomy[taxon_id] = taxa_str

        # create taxonomy file for retained taxa
        self.logger.info('Creating taxonomy file for retained taxa.')
        output_taxonomy_file = os.path.join(output_dir, 'taxonomy.tsv')
        with open(output_taxonomy_file, 'w') as fout:
            for taxon_id in remaining_taxa:
                if taxon_id in taxonomy:  # query genomes will generally be missing
                    fout.write('%s\t%s\n' %
                               (taxon_id, ';'.join(taxonomy[taxon_id])))

        # infer tree
        if tree_program == 'fasttree':
            self.logger.info(
                'Inferring gene tree with FastTree using %s+GAMMA.' %
                prot_model)
            fasttree = FastTree(multithreaded=(self.cpus > 1))

            tree_unrooted_output = os.path.join(output_dir,
                                                'concatenated.unrooted.tree')
            tree_log = os.path.join(output_dir, 'concatenated.tree.log')
            tree_output_log = os.path.join(output_dir, 'fasttree.log')
            fasttree.run(msa_file, 'prot', prot_model, tree_unrooted_output,
                         tree_log, tree_output_log)
        elif tree_program == 'raxml':
            self.logger.info(
                'Inferring gene tree with RAxML using PROTGAMMA%s.' %
                prot_model)

            # create phylip MSA file
            phylip_msa_file = msa_file.replace('.faa', '.phyx')
            cmd = 'seqmagick convert %s %s' % (msa_file, phylip_msa_file)
            os.system(cmd)

            # run RAxML
            raxml_dir = os.path.abspath(os.path.join(output_dir, 'raxml'))

            raxml = RAxML(self.cpus)
            tree_unrooted_output = raxml.run(phylip_msa_file, prot_model,
                                             raxml_dir)
        else:
            # fail with a clear message instead of an unbound-variable error
            # when the tree is loaded below
            self.logger.error('Unsupported tree program: %s' % tree_program)
            sys.exit(-1)

        # root tree at midpoint
        self.logger.info('Rooting tree at midpoint.')
        tree = dendropy.Tree.get_from_path(tree_unrooted_output,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)
        if len(remaining_taxa) > 2:
            tree.reroot_at_midpoint(update_bipartitions=False)
        tree_output = os.path.join(output_dir, 'concatenated.rooted.tree')
        tree.write_to_path(tree_output,
                           schema='newick',
                           suppress_rooting=True,
                           unquoted_underscores=True)

        # create tax2tree consensus map and decorate tree
        t2t_tree = os.path.join(output_dir, 'concatenated.tax2tree.tree')
        cmd = 't2t decorate -m %s -t %s -o %s' % (output_taxonomy_file,
                                                  tree_output, t2t_tree)
        os.system(cmd)

        # setup metadata for ARB file
        src_dir = os.path.dirname(os.path.realpath(__file__))

        metadata = {}
        with open(os.path.join(src_dir, 'VERSION')) as version_file:
            metadata['genetreetk_version'] = version_file.read().strip()

        metadata['genetreetk_tree_program'] = tree_program
        metadata['genetreetk_tree_prot_model'] = prot_model

        # create ARB metadata file
        self.logger.info('Creating ARB metadata file.')
        arb_metadata_file = os.path.join(output_dir, 'arb.metadata.txt')
        self.create_arb_metadata(msa_file, taxonomy, metadata,
                                 arb_metadata_file)
示例#10
0
    def run(self, genome_id_file,
                    marker_id_file,
                    model,
                    output_dir):
        """Identify phylogenetic tree.

        Fetches the marker gene HMMs, aligns the marker genes of the
        requested genomes, concatenates the alignments, infers a tree with
        FastTree and writes a short summary report.

        Parameters
        ----------
        genome_id_file : str
            File specifying unique ids of genomes to include in tree.
        marker_id_file : str
            File specifying unique ids of marker genes  to use for inference.
        model : str ['wag' or 'jtt']
            Model of evolution to use.
        output_dir : str
            Directory to store results.
        """

        time_keeper = TimeKeeper()

        output_alignment_dir = os.path.join(output_dir, 'alignments')
        make_sure_path_exists(output_alignment_dir)

        output_model_dir = os.path.join(output_dir, 'hmm_models')
        make_sure_path_exists(output_model_dir)

        # read directory for each genome
        genome_dirs = read_genome_dir_file(self.genome_dir_file)

        # read genomes within the ingroup
        ncbi_genome_ids, user_genome_ids = read_genome_id_file(genome_id_file)
        genome_ids = ncbi_genome_ids.union(user_genome_ids)
        self.logger.info('Inferring tree for %d genomes.' % len(genome_ids))
        self.logger.info('NCBI genomes: %d' % len(ncbi_genome_ids))
        self.logger.info('User genomes: %d' % len(user_genome_ids))

        # get marker genes
        self.logger.info('Reading marker genes.')
        marker_genes = read_marker_id_file(marker_id_file)
        self.logger.info('Read %d marker genes.' % len(marker_genes))

        # gather all single-copy HMMs into a single model file
        hmm_model_out = os.path.join(output_dir, 'phylo.hmm')
        hmm_info_out = os.path.join(output_dir, 'phylo.tsv')
        self.logger.info('Generating marker gene HMM model files.')
        self._fetch_marker_models(marker_genes, hmm_model_out, hmm_info_out, output_model_dir)

        # align gene sequences
        align_markers = AlignMarkers(self.cpus)
        align_markers.run(genome_ids, genome_dirs, marker_genes, True, output_alignment_dir, output_model_dir)

        # create concatenated alignment file
        self.logger.info('Concatenating alignments.')
        concatenated_alignment_file = os.path.join(output_dir, 'concatenated_alignment.faa')
        marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
        create_concatenated_alignment(genome_ids, marker_genes, output_alignment_dir, concatenated_alignment_file, marker_file)

        # create concatenated genome tree
        self.logger.info('Inferring concatenated genome tree.')
        concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
        concatenated_tree_log = os.path.join(output_dir, 'concatenated.tree.log')
        log_file = os.path.join(output_dir, 'fasttree.log')
        fast_tree = FastTree(multithreaded=True)
        fast_tree.run(concatenated_alignment_file, 'prot', model, concatenated_tree, concatenated_tree_log, log_file)

        # generate summary report; the context manager closes the file even
        # if one of the writes fails
        report_out = os.path.join(output_dir, 'infer_workflow.log')
        with open(report_out, 'w') as fout:
            fout.write('[infer]\n')
            fout.write('Genome Id file: %s\n' % genome_id_file)
            fout.write('Marker Id file: %s\n' % marker_id_file)
            fout.write('Model of evolution: %s\n' % model)
            fout.write(time_keeper.get_time_stamp())
示例#11
0
    def run(self, genome_ids,
                    marker_genes,
                    hmm_model_file,
                    min_support,
                    min_per_taxa,
                    perc_markers_to_jackknife,
                    gene_tree_dir,
                    alignment_dir,
                    output_dir):
        """Identify gene trees which do not recover well-support, internal splits in a jackknifed genome tree.

        Parameters
        ----------
        genome_ids : iterable
            Genomes of interest.
        marker_genes : iterable
            Unique ids of marker genes.
        hmm_model_file : str
            File containing HMMs for each marker gene (not read by this method).
        min_support : float
            Minimum jackknife support of splits to use during LGT filtering [0, 1].
        min_per_taxa : float
            Minimum percentage of taxa required to consider a split during LGT filtering [0, 1].
        perc_markers_to_jackknife : float
            Percentage of markers to keep during marker jackknifing [0, 1].
        gene_tree_dir : str
            Directory containing gene trees.
        alignment_dir : str
            Directory containing multiple sequence alignments.
        output_dir : str
            Output directory.

        Returns
        -------
        tuple
            (distances, num_internal_nodes, num_major_splits, well_supported_major_splits)
            where distances maps each marker gene id to the tuple
            (percentage of recovered splits, percentage of compatible splits,
            normalized compatible edge length, Manhattan distance, Euclidean distance).
        """

        output_dir = os.path.join(output_dir, 'jackknife_markers')
        make_sure_path_exists(output_dir)

        # create concatenated alignment file
        self.logger.info('Concatenating alignments.')
        concatenated_alignment_file = os.path.join(output_dir, 'concatenated_alignment.faa')
        marker_file = os.path.join(output_dir, 'concatenated_markers.tsv')
        create_concatenated_alignment(genome_ids, marker_genes, alignment_dir, concatenated_alignment_file, marker_file)

        # create concatenated genome tree
        self.logger.info('Inferring concatenated genome tree.')
        concatenated_tree = os.path.join(output_dir, 'concatenated.tree')
        concatenated_tree_log = os.path.join(output_dir, 'concatenated.tree.log')
        log_file = os.path.join(output_dir, 'concatenated.fasttree.log')
        fast_tree = FastTree(multithreaded=True)
        fast_tree.run(concatenated_alignment_file, 'prot', 'wag', concatenated_tree, concatenated_tree_log, log_file)

        # calculate jackknife support values
        self.logger.info('Calculating jackknife marker support values.')
        jackknife_markers = JackknifeMarkers(self.cpus)
        jackknife_tree = jackknife_markers.run(concatenated_tree, concatenated_alignment_file, marker_file, perc_markers_to_jackknife, 100, 'wag', output_dir)
        # jackknife_tree = os.path.join(output_dir, 'concatenated.jk_markers.tree')

        # identify well-support, internal splits
        self.logger.info('Identifying well-support, internal splits.')
        tree = dendropy.Tree.get_from_path(jackknife_tree, schema='newick', rooting='force-unrooted', preserve_underscores=True)
        num_leaves = len(tree.leaf_nodes())

        num_internal_nodes = 0
        num_major_splits = 0
        well_supported_major_splits = 0

        splits = []
        for node in tree.internal_nodes():
            num_internal_nodes += 1

            # a split is 'major' when both of its sides hold enough taxa
            num_node_leaves = len(node.leaf_nodes())
            if min(num_node_leaves, num_leaves - num_node_leaves) >= max(min_per_taxa * num_leaves, 2):
                num_major_splits += 1

                if int(node.label) > (min_support * 100.0):
                    well_supported_major_splits += 1
                    split = set(x.taxon.label for x in node.leaf_nodes())
                    splits.append((split, node.edge_length))

        self.logger.info('# internal nodes: %d' % num_internal_nodes)
        self.logger.info('# major splits: %d' % num_major_splits)
        self.logger.info('# well-supported, major splits: %d' % well_supported_major_splits)

        # these denominators are identical for every gene tree, so they are
        # computed once; both may be zero and are guarded below
        num_splits = len(splits)
        total_split_edge_length = sum(edge_length for _, edge_length in splits)
        if not splits:
            self.logger.warning('No well-supported, major splits identified; split statistics are reported as zero.')

        # filter gene trees that do not recover well-support, internal splits
        self.logger.info('Filtering gene trees.')

        distances = {}
        for i, mg in enumerate(sorted(marker_genes)):
            sys.stdout.write('==> Processed %d of %d (%.2f) gene trees.\r' % (i + 1, len(marker_genes), (i + 1) * 100.0 / len(marker_genes)))
            sys.stdout.flush()

            # read gene tree
            f = mg + '.tree'
            gene_tree_file = os.path.join(gene_tree_dir, f)
            gene_tree = dendropy.Tree.get_from_path(gene_tree_file, schema='newick', rooting='force-unrooted', preserve_underscores=True)

            # prune gene tree so each genome is present exactly once
            processed_genome_ids = set()
            taxa_to_prune = []
            for node in gene_tree.leaf_nodes():
                genome_id = node.taxon.label.split(DefaultValues.SEQ_CONCAT_CHAR)[0]

                if genome_id in processed_genome_ids or genome_id not in genome_ids:
                    taxa_to_prune.append(node.taxon)

                processed_genome_ids.add(genome_id)

            gene_tree.prune_taxa(taxa_to_prune)

            # rename nodes to contain only genome id
            gene_tree_taxa_set = set()
            for node in gene_tree.leaf_nodes():
                genome_id = node.taxon.label.split(DefaultValues.SEQ_CONCAT_CHAR)[0]
                node.taxon.label = genome_id
                gene_tree_taxa_set.add(genome_id)

            # re-encode the split system over the new taxon namespace
            gene_tree.migrate_taxon_namespace(dendropy.TaxonNamespace(gene_tree_taxa_set))
            gene_tree.encode_bipartitions()
            split_bitmasks = set(b.split_bitmask for b in gene_tree.bipartition_encoding)

            # determine number of splits recovered by or compatible with this gene tree
            recovered_splits = 0
            compatible_splits = 0
            compatible_edge_length = 0
            for split, edge_length in splits:
                common_taxa_labels = split.intersection(gene_tree_taxa_set)

                common_split = gene_tree.taxon_namespace.taxa_bitmask(labels=common_taxa_labels)
                normalized_split = dendropy.Bipartition.normalize_bitmask(
                                    bitmask=common_split,
                                    fill_bitmask=gene_tree.taxon_namespace.all_taxa_bitmask(),
                                    lowest_relevant_bit=1)

                if normalized_split in split_bitmasks:
                    recovered_splits += 1

                if gene_tree.is_compatible_with_bipartition(dendropy.Bipartition(bitmask=normalized_split, is_rooted=False)):
                    compatible_splits += 1
                    compatible_edge_length += edge_length

            # avoid a ZeroDivisionError when there is nothing to compare against
            if num_splits:
                perc_recovered_splits = recovered_splits * 100.0 / num_splits
                perc_comp_splits = compatible_splits * 100.0 / num_splits
            else:
                perc_recovered_splits = 0.0
                perc_comp_splits = 0.0

            if total_split_edge_length:
                norm_comp_edge_length = float(compatible_edge_length) / total_split_edge_length
            else:
                norm_comp_edge_length = 0.0

            # calculate weighted Robinson-Foulds (Manhattan) and Felsenstein's Euclidean
            # distances to the concatenated genome tree
            pruned_tree = tree.clone(depth=2)
            pruned_tree.retain_taxa_with_labels(gene_tree.taxon_namespace.labels())
            pruned_tree.migrate_taxon_namespace(gene_tree.taxon_namespace)
            pruned_tree.encode_bipartitions()

            # both trees are scaled to a total edge length of one before comparison
            pruned_tree_edge_len = sum([e.length for e in pruned_tree.edges() if e.length])
            gene_tree_edge_len = sum([e.length for e in gene_tree.edges() if e.length])
            pruned_tree.scale_edges(1.0 / pruned_tree_edge_len)
            gene_tree.scale_edges(1.0 / gene_tree_edge_len)

            manhattan = dendropy.calculate.treecompare.weighted_robinson_foulds_distance(pruned_tree, gene_tree)
            euclidean = dendropy.calculate.treecompare.euclidean_distance(pruned_tree, gene_tree)

            distances[mg] = (perc_recovered_splits, perc_comp_splits, norm_comp_edge_length, manhattan, euclidean)

        return distances, num_internal_nodes, num_major_splits, well_supported_major_splits