Пример #1
0
 def get_decomp(self, method='MDS', **kwargs):
     """
     Decompose the distance matrix stored in ``self.dm``.

     :param method: (string) 'MDS' or 'spectral'
     :param kwargs: forwarded to the spectral decomposition only; they are
         ignored when method is 'MDS'
     :return: whatever ``Clustering.mds_decomp`` / ``spectral_decomp`` returns
     """
     optioncheck(method, ['MDS', 'spectral'])
     clustering = Clustering(self.dm)

     if method == 'MDS':
         decomp = clustering.mds_decomp()
     elif method == 'spectral':
         decomp = clustering.spectral_decomp(**kwargs)
     else:
         # Only reachable if optioncheck lets an unlisted value through.
         decomp = None
     return decomp
Пример #2
0
    def __init__(
        self,
        records=None,
        input_dir=None,
        param_dir=None,
        file_format='fasta',
        compression=None,
        header_grep=None,
    ):
        """
        Build the collection from a list of records or a directory of
        alignment files. ``records`` takes precedence when both are given.

        :param records: pre-built records to store as-is
        :param input_dir: directory to read alignments from
        :param param_dir: if given, passed to ``self.read_parameters``
        :param file_format: 'fasta' or 'phylip' (checked only when reading
            from input_dir)
        :param compression: forwarded to ``self.read_alignments``
        :param header_grep: forwarded to ``self.read_alignments``
        :raises Exception: if neither records nor input_dir is provided
        :raises NoRecordsError: if the resulting record list is empty
        """
        self._records = None
        self._input_files = None

        # Guard clause: at least one source of records is required.
        if records is None and input_dir is None:
            raise Exception('Provide a list of records, '
                            'or the path to a set of alignments')

        if records is not None:
            self.records = records
        else:
            input_dir = os.path.abspath(input_dir)
            directorycheck(input_dir)
            optioncheck(file_format, ['fasta', 'phylip'])
            loaded = self.read_alignments(input_dir, file_format,
                                          header_grep, compression)
            self.records = loaded

        if param_dir is not None:
            self.read_parameters(param_dir)

        if not self.records:
            raise NoRecordsError(file_format, input_dir, compression)
Пример #3
0
    def __init__(
            self,
            records=None,
            input_dir=None,
            param_dir=None,
            file_format='fasta',
            compression=None,
            header_grep=None,
    ):
        """
        Initialise from existing records, or by reading alignments from disk.

        If ``records`` is not None it is used directly; otherwise
        ``input_dir`` is made absolute, checked, and read with
        ``self.read_alignments``.

        :param records: list of records to use directly
        :param input_dir: path to a directory of alignment files
        :param param_dir: optional path handed to ``self.read_parameters``
        :param file_format: 'fasta' or 'phylip'
        :param compression: passed through to ``self.read_alignments``
        :param header_grep: passed through to ``self.read_alignments``
        :raises Exception: when both records and input_dir are None
        :raises NoRecordsError: when no records were obtained
        """
        self._records = None
        self._input_files = None

        have_records = records is not None
        have_directory = input_dir is not None

        if not (have_records or have_directory):
            raise Exception('Provide a list of records, '
                            'or the path to a set of alignments')

        if have_records:
            self.records = records
        else:
            input_dir = os.path.abspath(input_dir)
            directorycheck(input_dir)
            optioncheck(file_format, ['fasta', 'phylip'])
            self.records = self.read_alignments(
                input_dir, file_format, header_grep, compression)

        if param_dir is not None:
            self.read_parameters(param_dir)

        if not self.records:
            raise NoRecordsError(file_format, input_dir, compression)
Пример #4
0
    def embedding(self, dimensions, method, **kwargs):
        """
        Embed the distance matrix in a coordinate space.

        Available methods:
            cmds: Classical MultiDimensional Scaling
            kpca: Kernel Principal Components Analysis
            mmds: Metric MultiDimensional Scaling
            nmmds: Non-Metric MultiDimensional Scaling
            spectral: Spectral decomposition of Laplacian matrix

        Keyword arguments (forwarded only to kpca, nmmds and spectral):
            kpca: affinity_matrix - a precomputed array of affinities
                  sigma - the value of sigma to use when computing the
                          affinity matrix via the Radial Basis Function
            nmmds: initial_coords - a set of coordinates to refine. NMMDS
                                    works very badly without this
            spectral: affinity_matrix, sigma
                      unit_length - scale the coordinates to unit length, so
                                    points sit on the surface of the unit
                                    sphere

        :param dimensions: (int) number of coordinate axes to use
        :param method: (string) one of cmds, kpca, mmds, nmmds, spectral
        :param kwargs: unit_length (bool), affinity_matrix (np.array),
            sigma (float), initial_coords (np.array)
        :return: coordinate matrix (np.array)
        """
        optioncheck(method, ['cmds', 'kpca', 'mmds', 'nmmds', 'spectral'])

        # Embedders that take only the number of dimensions.
        plain_embedders = {
            'cmds': '_embedding_classical_mds',
            'mmds': '_embedding_metric_mds',
        }
        # Embedders that additionally receive the caller's kwargs.
        kwarg_embedders = {
            'kpca': '_embedding_kernel_pca',
            'nmmds': '_embedding_nonmetric_mds',
            'spectral': '_embedding_spectral',
        }

        if method in plain_embedders:
            embed = getattr(self, plain_embedders[method])
            return embed(dimensions)
        if method in kwarg_embedders:
            embed = getattr(self, kwarg_embedders[method])
            return embed(dimensions, **kwargs)
Пример #5
0
    def autocorrelated_relaxed_clock(self,
                                     root_rate,
                                     autocorrel,
                                     distribution='lognormal'):
        """
        Attach a ``rate`` attribute to every node, visiting nodes in preorder.

        Follows the autocorrelated lognormal model from Kishino et al.(2001),
        or an autocorrelated exponential.

        :param root_rate: rate given to the seed node (and to every node when
            autocorrel == 0)
        :param autocorrel: passed to ``logn_correlated_rate`` in the lognormal
            case; 0 short-circuits to a uniform rate
        :param distribution: 'lognormal' or 'exponential'
        """
        optioncheck(distribution, ['exponential', 'lognormal'])

        if autocorrel == 0:
            # Every node simply receives the root rate.
            for node in self.preorder_node_iter():
                node.rate = root_rate
            return

        use_lognormal = distribution == 'lognormal'
        for node in self.preorder_node_iter():
            if node == self.seed_node:
                node.rate = root_rate
                continue

            # Preorder guarantees the parent's rate was set already.
            parent_rate = node.parent_node.rate
            branch_length = node.edge_length
            if use_lognormal:
                node.rate = logn_correlated_rate(parent_rate, branch_length,
                                                 autocorrel)
            else:
                node.rate = np.random.exponential(parent_rate)
Пример #6
0
    def __init__(
        self,
        collection,
        analysis,
        lsf=False,
        max_guidetrees=10,
        tmpdir=None,
        datatype=None,
        verbosity=0,
        populate_cache=True,
        debug=False,
    ):
        """
        Store the scoring configuration and optionally fill the cache.

        :param collection: object supplying fallback ``datatype`` and
            ``tmpdir`` values
        :param analysis: one of ANALYSES, 'tc' or 'TreeCollection'; 'tc' is
            stored as 'TreeCollection'
        :param lsf: stored on the instance
        :param max_guidetrees: stored on the instance
        :param tmpdir: working directory; defaults to ``collection.tmpdir``
        :param datatype: 'protein' or 'dna'; defaults to
            ``collection.datatype``
        :param verbosity: stored on the instance
        :param populate_cache: when true, call ``self.populate_cache()``
        :param debug: stored on the instance
        """
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        # 'tc' is shorthand for 'TreeCollection'.
        self.analysis = 'TreeCollection' if analysis == 'tc' else analysis

        self.collection = collection
        self.lsf = lsf
        self.max_guidetrees = max_guidetrees
        self.verbosity = verbosity

        self.datatype = datatype or collection.datatype
        optioncheck(self.datatype, ['protein', 'dna'])

        self.tmpdir = tmpdir or collection.tmpdir
        directorymake(self.tmpdir)

        self.cache = {}
        self.history = []
        self.debug = debug

        if populate_cache:
            self.populate_cache()
Пример #7
0
 def __init__(
     self,
     class_list,
     permutations_list,
     nspecies,
     subst_model,
     rate_model,
     master_tree_generator_method="yule",
     master_tree=None,
     class_tree_permuter="nni",
     gene_length_kappa=1.7719,
     gene_length_theta=279.9,
     gene_length_min=10,
     gamma_rate_param=None,
     outdir="./",
     autocorrelated_relaxed_clock=False,
     uncorrelated_relaxed_clock=False,
     scale_rates=False,
     verbosity=0,
 ):
     """
     Configure the simulator, then generate class trees, ALF directories
     and ALF parameter files.

     :param class_list: sequence of per-class gene counts; its length is the
         number of classes and its sum the number of genes
     :param permutations_list: stored on the instance
     :param nspecies: number of species for a generated master tree
     :param subst_model: not used in this constructor body
     :param rate_model: not used in this constructor body
     :param master_tree_generator_method: 'yule', 'coal', 'rtree' or 'custom'
     :param master_tree: user-supplied tree; required when the generator
         method is 'custom'
     :param class_tree_permuter: 'nni', 'spr', 'lgt' or 'genetree'
     :param gene_length_kappa: passed to ``set_gene_lengths``
     :param gene_length_theta: passed to ``set_gene_lengths``
     :param gene_length_min: passed to ``set_gene_lengths``
     :param gamma_rate_param: stored on the instance
     :param outdir: stored on the instance
     :param autocorrelated_relaxed_clock: stored on the instance
     :param uncorrelated_relaxed_clock: stored on the instance
     :param scale_rates: stored on the instance
     :param verbosity: stored on the instance
     :raises Exception: if method is 'custom' but no master_tree is given
     """
     # default
     errors.optioncheck(master_tree_generator_method, ["yule", "coal", "rtree", "custom"])
     errors.optioncheck(class_tree_permuter, ["nni", "spr", "lgt", "genetree"])
     if master_tree is None and master_tree_generator_method == "custom":
         raise Exception("No custom tree was specified")
     self.num_classes = len(class_list)
     self.num_genes = sum(class_list)
     self.class_list = class_list
     self._master_tree = None
     self.verbosity = verbosity
     self.autocorrelated_relaxed_clock = autocorrelated_relaxed_clock
     self.uncorrelated_relaxed_clock = uncorrelated_relaxed_clock
     self.scale_rates = scale_rates
     self.gene_trees = list()
     if master_tree is None:
         tree = self.generate_master_tree(master_tree_generator_method, nspecies)
         self.master_tree = tree
         self.num_species = nspecies
     else:
         self.master_tree = master_tree
         supplied_taxa = len(master_tree)
         if supplied_taxa != nspecies:
             msg = [
                 "Warning: supplied tree has {0} taxa.".format(supplied_taxa),
                 "Required number is {0}.\n".format(nspecies),
                 "Resetting number of species to match the supplied tree.",
             ]
             print("".join(msg))
         # Always take the species count from the supplied tree. Previously
         # it was left unset when the sizes agreed, and set to the *requested*
         # count (contradicting the warning) when they did not.
         self.num_species = supplied_taxa
     self.set_gene_lengths(gene_length_kappa, gene_length_theta, gene_length_min)
     self.gamma_rate_param = gamma_rate_param
     self.permuter = class_tree_permuter
     self.permutations_list = permutations_list
     # NOTE(review): `datatype` and `tmpdir` are not parameters of this
     # constructor; unless they are module-level names these two lines raise
     # NameError. Confirm the intended source of both values.
     self.datatype = datatype
     self.tmpdir = errors.directorymake(tmpdir)
     self.outdir = outdir
     self.generate_class_trees()  # sets self.class_trees dict
     self.make_alf_dirs()  # sets self.alf_dirs dict
     self.write_alf_params()
     self.get_true_partition()
Пример #8
0
    def uncorrelated_relaxed_clock(self, root_rate, variance,
                                   distribution='lognormal'):
        """
        Attach an independently drawn ``rate`` to every node.

        The seed node receives ``root_rate``. Every other node draws from:
            lognormal: ``np.random.lognormal(mu, sigma)`` with
                ``mu = log(root_rate) - variance / 2`` and
                ``sigma = sqrt(variance)``, so the expected rate equals
                root_rate
            exponential: ``np.random.exponential(root_rate)``

        :param root_rate: rate of the seed node and mean of the sampled rates
        :param variance: variance of the log-rate (lognormal only; unused for
            the exponential distribution)
        :param distribution: 'lognormal' or 'exponential'
        """
        optioncheck(distribution, ['exponential', 'lognormal'])

        use_lognormal = distribution == 'lognormal'
        if use_lognormal:
            # numpy's second argument is the standard deviation of the
            # underlying normal, not its variance. Passing the variance
            # directly made the -variance/2 mean correction inconsistent.
            mu = np.log(root_rate) - 0.5 * variance
            sigma = np.sqrt(variance)

        for node in self.preorder_node_iter():
            if node == self.seed_node:
                node.rate = root_rate
            elif use_lognormal:
                node.rate = np.random.lognormal(mu, sigma)
            else:
                node.rate = np.random.exponential(root_rate)
Пример #9
0
    def uncorrelated_relaxed_clock(self,
                                   root_rate,
                                   variance,
                                   distribution='lognormal'):
        """
        Attach an independently drawn ``rate`` to every node.

        The seed node receives ``root_rate``. Every other node draws from:
            lognormal: ``np.random.lognormal(mu, sigma)`` with
                ``mu = log(root_rate) - variance / 2`` and
                ``sigma = sqrt(variance)``, so the expected rate equals
                root_rate
            exponential: ``np.random.exponential(root_rate)``

        :param root_rate: rate of the seed node and mean of the sampled rates
        :param variance: variance of the log-rate (lognormal only; unused for
            the exponential distribution)
        :param distribution: 'lognormal' or 'exponential'
        """
        optioncheck(distribution, ['exponential', 'lognormal'])

        use_lognormal = distribution == 'lognormal'
        if use_lognormal:
            # numpy's second argument is the standard deviation of the
            # underlying normal, not its variance. Passing the variance
            # directly made the -variance/2 mean correction inconsistent.
            mu = np.log(root_rate) - 0.5 * variance
            sigma = np.sqrt(variance)

        for node in self.preorder_node_iter():
            if node == self.seed_node:
                node.rate = root_rate
            elif use_lognormal:
                node.rate = np.random.lognormal(mu, sigma)
            else:
                node.rate = np.random.exponential(root_rate)
Пример #10
0
    def embedding_plotter(
        self,
        coordinates,
        dimensions,
        partition=None,
        add_sphere=False,
        xlab='PCo1',
        ylab='PCo2',
        zlab='PCo3',
        title='Trees embedded in dimension-reduced space',
        outfile=False,
    ):
        """ Scatter-plot embedded coordinates in 2 or 3 dimensions.

        Points are coloured according to cluster membership specified
        by Partition object (a single all-zero Partition is used if none is
        specified).

        :param coordinates: array whose transpose is unpacked into
            ``ax.scatter``
        :param dimensions: 2 or 3
        :param partition: Partition object giving cluster membership
        :param add_sphere: in 3D, pass the axes through ``self.sphere``
        :param xlab: x-axis label
        :param ylab: y-axis label
        :param zlab: z-axis label (3D only)
        :param title: plot title
        :param outfile: if truthy, figure is saved to '<outfile>.pdf'
        :return: the matplotlib figure
        """

        optioncheck(dimensions, [2, 3])
        partition = (partition or Partition(tuple([0] * len(coordinates))))

        # One colour per index in range(len(partition)), cycling through
        # 'bgrcmyk'. The former zip(*zip(...))[1] idiom fails on Python 3,
        # where zip objects cannot be subscripted.
        colours = tuple(
            itertools.islice(itertools.cycle('bgrcmyk'), len(partition)))
        colour_mapping = np.array(
            [colours[i - 1] for i in partition.partition_vector])
        fig = plt.figure()

        if dimensions == 3:
            ax = fig.add_subplot(111,
                                 projection='3d',
                                 xlabel=xlab,
                                 ylabel=ylab,
                                 zlabel=zlab,
                                 title=title)
            if add_sphere:
                ax = self.sphere(ax)

        else:
            ax = fig.add_subplot(111, xlabel=xlab, ylabel=ylab, title=title)

        ax.scatter(*coordinates.T, color=colour_mapping)

        if outfile:
            fig.savefig('{0}.pdf'.format(outfile))

        return fig
Пример #11
0
    def __init__(
        self,
        records=None,
        input_dir=None,
        trees_dir=None,
        file_format='fasta',
        datatype=None,
        tmpdir=TMPDIR,
        calc_distances=False,
        compression=None,
        debug=False,
    ):
        """
        Build the collection from records or from a directory of alignments.

        A truthy ``records`` wins over ``input_dir``.

        :param records: list of records; each gets ``tmpdir`` assigned
        :param input_dir: directory read via ``self.read_alignments``
        :param trees_dir: if truthy, passed to ``self.read_trees``
        :param file_format: 'fasta' or 'phylip' (checked for input_dir only)
        :param datatype: 'dna' or 'protein'; with records it falls back to
            the first record's datatype
        :param tmpdir: working directory, created via ``directorymake``
        :param calc_distances: when true, call ``self.calc_distances()``
        :param compression: forwarded to ``self.read_alignments``
        :param debug: stored on the instance
        :raises Exception: if neither records nor input_dir is truthy
        :raises NoRecordsError: if no records were obtained
        """
        self.tmpdir = directorymake(tmpdir)
        self._records = None
        self.debug = debug

        if not records and not input_dir:
            raise Exception('Provide a list of records, '
                  'or the path to a set of alignments')

        if records:
            self.records = records
            self.datatype = datatype or records[0].datatype
            optioncheck(self.datatype, ['dna', 'protein'])
            for record in records:
                record.tmpdir = self.tmpdir
        else:
            directorycheck(input_dir)
            # Relies on optioncheck returning the validated option.
            self.datatype = optioncheck(datatype, ['dna', 'protein'])
            optioncheck(file_format, ['fasta', 'phylip'])
            self.records = self.read_alignments(
                input_dir, file_format, compression)

        self.taxon_set = TaxonSet()
        if trees_dir:
            self.read_trees(trees_dir, self.taxon_set)

        if not self.records:
            raise NoRecordsError(file_format, input_dir, compression)

        if calc_distances:
            self.calc_distances()
Пример #12
0
    def __init__(self, nclusters, collection, tmpdir=TMPDIR,
                 analysis='nj', initial_assignment=None, scorer=None):
        """
        Set up the optimiser state and score the initial assignment.

        :param nclusters: number of clusters; key under which the initial
            score and assignment are stored
        :param collection: collection whose records are scored
        :param tmpdir: working directory, also passed to a newly built Scorer
        :param analysis: one of ANALYSES, 'tc' or 'TreeCollection'; 'tc' is
            stored as 'TreeCollection'
        :param initial_assignment: starting Partition; defaults to an
            all-zero Partition of len(collection)
        :param scorer: existing Scorer instance to reuse; anything else
            causes a new Scorer to be built
        """
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        # Compare the argument, not self.analysis: the attribute does not
        # exist yet at this point, so reading it raised AttributeError.
        if analysis == 'tc':
            self.analysis = 'TreeCollection'
        else:
            self.analysis = analysis

        self.Collection = collection

        # Only the first record is inspected to decide whether trees exist.
        if not self.Collection.records[0].tree:
            print('Calculating {} trees for collection...'.format(analysis))
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        if scorer is not None and isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection.records,
                                 analysis=analysis,
                                 datatype=self.datatype,
                                 tmpdir=tmpdir)

        self.nclusters = nclusters
        self.tmpdir = tmpdir

        print('Calculating initial scores...')
        if initial_assignment is None:
            initial_assignment = Partition(tuple([0]*len(collection)))

        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment, history=True)
        self.global_best_assignments[self.nclusters] = initial_assignment

        # Run counters, all starting at zero.
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0
Пример #13
0
 def __new__(
         cls,
         trees,
         metric,
         tmpdir=TMPDIR,
         dtype=float,
         add_noise=False,
         normalise=False,
         dec_places=None,
         lsf=False,
 ):
     """
     Build a distance matrix from trees and return it as a view of ``cls``.

     :param trees: trees handed to ``get_distance_matrix``
     :param metric: 'euc', 'geo', 'rf' or 'wrf'
     :param tmpdir: working directory; also stored on the result
     :param dtype: dtype used when converting to an array
     :param add_noise: if true, return ``obj.add_noise()`` instead of obj
     :param normalise: forwarded to ``get_distance_matrix``
     :param dec_places: forwarded to ``get_distance_matrix``
     :param lsf: forwarded to ``get_distance_matrix``
     :return: array view of type cls with ``metric`` and ``tmpdir`` set
     """
     optioncheck(metric, ['euc', 'geo', 'rf', 'wrf'])
     distances = get_distance_matrix(
         trees,
         metric,
         tmpdir,
         normalise=normalise,
         dec_places=dec_places,
         lsf=lsf,
     )
     matrix = np.asarray(distances, dtype).view(cls)
     matrix.metric = metric
     matrix.tmpdir = tmpdir
     return matrix.add_noise() if add_noise else matrix
Пример #14
0
    def make_new_assignment(self, sample, scores, assignment, nreassign=1,
                            choose='max'):
        """
        Make a new partition by reassigning records between clusters.

        :param sample: maps a row index of ``scores`` to a position in the
            partition vector
        :param scores: 2D array; each row's argmax is that row's new cluster
        :param assignment: current Partition (its ``partition_vector`` is
            copied, not modified)
        :param nreassign: number of rows to reassign
        :param choose: 'max' reassigns the rows with the largest row-maximum
            of the row-normalised scores; 'min' those with the smallest
            row-minimum
        :return: new Partition
        """
        optioncheck(choose, ('max', 'min'))

        best_cluster = scores.argmax(axis=1)
        row_totals = scores.sum(axis=1)
        normalised = scores / row_totals[:, np.newaxis]

        if choose == 'max':
            ranked = normalised.max(axis=1).argsort()
            reassignments = ranked[-nreassign:]
        else:
            ranked = normalised.min(axis=1).argsort()
            reassignments = ranked[:nreassign]

        updated = list(assignment.partition_vector)
        for row in reassignments:
            # Cluster numbers run over [1, x], argmax indices over [0, x-1].
            updated[sample[row]] = best_cluster[row] + 1

        return Partition(tuple(updated))
Пример #15
0
    def __init__(self, nclusters, collection, tmpdir=TMPDIR,
                 analysis='nj', initial_assignment=None, scorer=None):
        """
        Set up the optimiser state and score the initial assignment.

        :param nclusters: number of clusters; key under which the initial
            score and assignment are stored
        :param collection: collection to score
        :param tmpdir: working directory, stored on the instance
        :param analysis: one of ANALYSES, 'tc' or 'TreeCollection'; 'tc' is
            stored as 'TreeCollection'
        :param initial_assignment: starting Partition; defaults to an
            all-zero Partition of len(collection)
        :param scorer: existing Scorer instance to reuse; anything else
            causes ``Scorer(collection)`` to be built
        """
        optioncheck(analysis, ANALYSES + ['tc', 'TreeCollection'])
        # Compare the argument, not self.analysis: the attribute does not
        # exist yet at this point, so reading it raised AttributeError.
        if analysis == 'tc':
            self.analysis = 'TreeCollection'
        else:
            self.analysis = analysis

        self.Collection = collection

        # Only the first record is inspected to decide whether trees exist.
        if not self.Collection.records[0].tree:
            print('Calculating {} trees for collection...'.format(analysis))
            self.Collection.calc_NJ_trees()

        self.datatype = collection.datatype
        if scorer is not None and isinstance(scorer, Scorer):
            self.scorer = scorer
        else:
            self.scorer = Scorer(self.Collection)

        self.nclusters = nclusters
        self.tmpdir = tmpdir

        print('Calculating initial scores...')
        if initial_assignment is None:
            initial_assignment = Partition(tuple([0] * len(collection)))

        self.global_best_scores = {}
        self.global_best_assignments = {}
        self.global_best_scores[self.nclusters] = self.scorer.score(
            initial_assignment, history=True)
        self.global_best_assignments[self.nclusters] = initial_assignment

        # Run counters, all starting at zero.
        self.done_worse = 0
        self.stayed_put = 0
        self.i = 0
        self.resets = 0
        self.merges = 0
Пример #16
0
    def calc_phyml_trees(self, analysis='nj', lsf=False, strategy='dynamic',
                         minmem=256, bootstraps=None, add_originals=False,
                         verbosity=0):
        """ Calculate a tree for each record using phyml.

        :param analysis: one of ANALYSES
        :param lsf: run through ``runLSFPhyml`` instead of ``runPhyml``
        :param strategy: forwarded to ``runLSFPhyml``
        :param minmem: forwarded to ``runLSFPhyml``
        :param bootstraps: if given, number of bootstrap samples drawn from
            each record; trees are then built for the samples
        :param add_originals: with bootstraps, also build trees for the
            original records
        :param verbosity: forwarded to the runner
        :return: list of trees when bootstraps is given, otherwise None
        """
        optioncheck(analysis, ANALYSES)

        bootstrapping = bootstraps is not None
        if bootstrapping:
            bootstraps = int(isnumbercheck(bootstraps))
            # All samples of the first record, then of the second, and so on.
            records = [rec.bootstrap_sample(str(rep))
                       for rec in self
                       for rep in range(bootstraps)]
            if add_originals:
                records.extend(self.records)
        else:
            records = self.records

        if not lsf:
            for rec in records:
                runPhyml(rec, self.tmpdir, analysis=analysis,
                         verbosity=verbosity, taxon_set=self.taxon_set)
                rec.tree = TrClTree.cast(rec.tree)
        else:
            trees = runLSFPhyml(records,
                                self.tmpdir,
                                analysis=analysis,
                                verbosity=verbosity,
                                strategy=strategy,
                                minmem=minmem,
                                taxon_set=self.taxon_set,
                                debug=self.debug)
            for rec, tree in zip(records, trees):
                rec.tree = TrClTree.cast(tree)

        if verbosity == 1:
            print()

        if bootstrapping:
            return [rec.tree for rec in records]
Пример #17
0
    def simulate(self, index_tuple, model=None, lsf=False, ntimes=1):
        """ Simulate a group of sequence alignments using ALF. Uses one of
        {(GCB, JTT, LG, WAG - protein), (CPAM, ECM and ECMu - DNA)}.
        The default model is WAG for protein data and GTR otherwise.
        TO DO: add parameterised models when I have a robust (probably
        PAML) method of estimating them from alignment+tree

        NOTE(review): the code visible here only selects and validates the
        model; index_tuple, lsf and ntimes are not used in this part.

        :param index_tuple: not used in the visible part of the method
        :param model: substitution model name, or None for the default
        :param lsf: not used in the visible part of the method
        :param ntimes: not used in the visible part of the method
        :return: None (after printing a message) if a non-protein model
            fails validation
        """

        if self.datatype == 'protein':  # set some defaults
            model = model or 'WAG'
            optioncheck(model, [
                'CPAM',
                'ECM',
                'ECMu',
                'WAG',
                'JTT',
                'GCB',
                'LG',
                ])
        else:
            model = model or 'GTR'
            try:
                optioncheck(model, ['CPAM', 'ECM', 'ECMu', 'GTR'])
            # 'as' form works on Python 2.6+ and 3; the comma form is a
            # syntax error on Python 3.
            except OptionError as e:
                print('Choose a DNA-friendly model for simulation:\n', e)
                return
Пример #18
0
    def autocorrelated_relaxed_clock(self, root_rate, autocorrel,
                                     distribution='lognormal'):
        """
        Set ``node.rate`` on every node of the tree, walking it in preorder.

        Implements the autocorrelated lognormal model from
        Kishino et al.(2001), or an autocorrelated exponential.

        :param root_rate: rate of the seed node; used for all nodes when
            autocorrel is 0
        :param autocorrel: handed to ``logn_correlated_rate`` (lognormal case)
        :param distribution: 'lognormal' or 'exponential'
        """
        optioncheck(distribution, ['exponential', 'lognormal'])

        uniform = autocorrel == 0
        if uniform:
            for node in self.preorder_node_iter():
                node.rate = root_rate
            return

        for node in self.preorder_node_iter():
            is_root = node == self.seed_node
            if is_root:
                node.rate = root_rate
                continue

            # The parent was visited earlier, so its rate is already set.
            parent_rate = node.parent_node.rate
            bl = node.edge_length
            if distribution != 'lognormal':
                node.rate = np.random.exponential(parent_rate)
            else:
                node.rate = logn_correlated_rate(parent_rate, bl,
                                                 autocorrel)
Пример #19
0
    def read_alignments(self, input_dir, file_format, compression=None):
        """ Load the alignment files found in an input directory.

        Only *.fa, *.fas and *.fasta (fasta) or *.phy (phylip) files are
        considered, with the compression suffix appended when one is given.

        :param input_dir: directory to search
        :param file_format: 'fasta' or 'phylip'; any other value matches no
            extensions
        :param compression: None, 'gz' or 'bz2'
        :return: list of TrClSeq objects, one per file, ordered by SORT_KEY
        """

        optioncheck(compression, [None, 'gz', 'bz2'])

        if file_format == 'fasta':
            extensions = ['fa', 'fas', 'fasta']

        elif file_format == 'phylip':
            extensions = ['phy']

        else:
            # Previously `extensions` stayed unbound here and the next use
            # raised UnboundLocalError.
            extensions = []

        if compression:
            extensions = ['.'.join([x, compression]) for x in extensions]

        files = fileIO.glob_by_extensions(input_dir, extensions)
        files.sort(key=SORT_KEY)

        return [TrClSeq(f, file_format=file_format, datatype=self.datatype,
                        name=fileIO.strip_extensions(f),
                        tmpdir=self.tmpdir)
                for f in files]
Пример #20
0
    def decomp_to_coords(self, decomp, dimensions, normalise=False):
        """
        Extract 2D or 3D coordinates from a decomposition.

        :param decomp: object providing ``coords_by_dimension``
        :param dimensions: 2 or 3
        :param normalise: if true, return ``coords.normalise_rows()``
        :return: first element of ``decomp.coords_by_dimension(dimensions)``,
            optionally row-normalised
        """
        optioncheck(dimensions, [2, 3])

        by_dimension = decomp.coords_by_dimension(dimensions)
        coords = by_dimension[0]
        if normalise:
            return coords.normalise_rows()
        return coords
Пример #21
0
 def __init__(
     self,
     class_list,
     permutations_list,
     nspecies,
     tmpdir,
     datatype='protein',
     master_tree_generator_method='yule',
     master_tree=None,
     class_tree_permuter='nni',
     gene_length_kappa=1.7719,
     gene_length_theta=279.9,
     gene_length_min=10,
     gamma_rate_param=None,
     outdir='./',
     autocorrelated_relaxed_clock=False,
     uncorrelated_relaxed_clock=False,
     scale_rates=False,
     verbosity=0,
     ):
     """
     Configure the simulator, then generate class trees, ALF directories
     and ALF parameter files.

     :param class_list: sequence of per-class gene counts; its length is the
         number of classes and its sum the number of genes
     :param permutations_list: stored on the instance
     :param nspecies: number of species for a generated master tree
     :param tmpdir: working directory, created via ``errors.directorymake``
     :param datatype: stored on the instance
     :param master_tree_generator_method: 'yule', 'coal', 'rtree' or 'custom'
     :param master_tree: user-supplied tree; required when the generator
         method is 'custom'
     :param class_tree_permuter: 'nni', 'spr', 'lgt' or 'genetree'
     :param gene_length_kappa: passed to ``set_gene_lengths``
     :param gene_length_theta: passed to ``set_gene_lengths``
     :param gene_length_min: passed to ``set_gene_lengths``
     :param gamma_rate_param: stored on the instance
     :param outdir: stored on the instance
     :param autocorrelated_relaxed_clock: stored on the instance
     :param uncorrelated_relaxed_clock: stored on the instance
     :param scale_rates: stored on the instance
     :param verbosity: stored on the instance
     :raises Exception: if method is 'custom' but no master_tree is given
     """
     # default
     errors.optioncheck(master_tree_generator_method, ['yule', 'coal',
                        'rtree', 'custom'])
     errors.optioncheck(class_tree_permuter, ['nni', 'spr', 'lgt', 'genetree'
                        ])
     if master_tree is None and master_tree_generator_method == 'custom':
         raise Exception('No custom tree was specified')
     self.num_classes = len(class_list)
     self.num_genes = sum(class_list)
     self.class_list = class_list
     self.verbosity = verbosity
     self.autocorrelated_relaxed_clock = autocorrelated_relaxed_clock
     self.uncorrelated_relaxed_clock = uncorrelated_relaxed_clock
     self.scale_rates = scale_rates
     self.gene_trees = list()
     if master_tree is None:
         tree = self.generate_master_tree(master_tree_generator_method,
                 nspecies)
         self.master_tree = tree
         self.num_species = nspecies
     else:
         self.master_tree = master_tree
         supplied_taxa = len(master_tree)
         if supplied_taxa != nspecies:
             msg = [
                 'Warning: supplied tree has {0} taxa.'.format(
                     supplied_taxa),
                 'Required number is {0}.\n'.format(nspecies),
                 'Resetting number of species to match the supplied tree.'
             ]
             print(''.join(msg))
         # Always take the species count from the supplied tree. Previously
         # it was left unset when the sizes agreed, and set to the *requested*
         # count (contradicting the warning) when they did not.
         self.num_species = supplied_taxa
     self.set_gene_lengths(gene_length_kappa, gene_length_theta,
                           gene_length_min)
     self.gamma_rate_param = gamma_rate_param
     self.permuter = class_tree_permuter
     self.permutations_list = permutations_list
     self.datatype = datatype
     self.tmpdir = errors.directorymake(tmpdir)
     self.outdir = outdir
     self.generate_class_trees() # sets self.class_trees dict
     self.make_alf_dirs() # sets self.alf_dirs dict
     self.write_alf_params()
     self.get_true_partition()
Пример #22
0
 def __init__(
     self,
     class_list,
     permutations_list,
     nspecies,
     subst_model,
     rate_model,
     master_tree_generator_method='yule',
     master_tree=None,
     class_tree_permuter='nni',
     gene_length_kappa=1.7719,
     gene_length_theta=279.9,
     gene_length_min=10,
     gamma_rate_param=None,
     outdir='./',
     autocorrelated_relaxed_clock=False,
     uncorrelated_relaxed_clock=False,
     scale_rates=False,
     verbosity=0,
 ):
     """
     Configure the simulator, then generate class trees, ALF directories
     and ALF parameter files.

     :param class_list: sequence of per-class gene counts; its length is the
         number of classes and its sum the number of genes
     :param permutations_list: stored on the instance
     :param nspecies: number of species for a generated master tree
     :param subst_model: not used in this constructor body
     :param rate_model: not used in this constructor body
     :param master_tree_generator_method: 'yule', 'coal', 'rtree' or 'custom'
     :param master_tree: user-supplied tree; required when the generator
         method is 'custom'
     :param class_tree_permuter: 'nni', 'spr', 'lgt' or 'genetree'
     :param gene_length_kappa: passed to ``set_gene_lengths``
     :param gene_length_theta: passed to ``set_gene_lengths``
     :param gene_length_min: passed to ``set_gene_lengths``
     :param gamma_rate_param: stored on the instance
     :param outdir: stored on the instance
     :param autocorrelated_relaxed_clock: stored on the instance
     :param uncorrelated_relaxed_clock: stored on the instance
     :param scale_rates: stored on the instance
     :param verbosity: stored on the instance
     :raises Exception: if method is 'custom' but no master_tree is given
     """
     # default
     errors.optioncheck(master_tree_generator_method,
                        ['yule', 'coal', 'rtree', 'custom'])
     errors.optioncheck(class_tree_permuter,
                        ['nni', 'spr', 'lgt', 'genetree'])
     if master_tree is None and master_tree_generator_method == 'custom':
         raise Exception('No custom tree was specified')
     self.num_classes = len(class_list)
     self.num_genes = sum(class_list)
     self.class_list = class_list
     self._master_tree = None
     self.verbosity = verbosity
     self.autocorrelated_relaxed_clock = autocorrelated_relaxed_clock
     self.uncorrelated_relaxed_clock = uncorrelated_relaxed_clock
     self.scale_rates = scale_rates
     self.gene_trees = list()
     if master_tree is None:
         tree = self.generate_master_tree(master_tree_generator_method,
                                          nspecies)
         self.master_tree = tree
         self.num_species = nspecies
     else:
         self.master_tree = master_tree
         supplied_taxa = len(master_tree)
         if supplied_taxa != nspecies:
             msg = [
                 'Warning: supplied tree has {0} taxa.'.format(
                     supplied_taxa),
                 'Required number is {0}.\n'.format(nspecies),
                 'Resetting number of species to match the supplied tree.'
             ]
             print(''.join(msg))
         # Always take the species count from the supplied tree. Previously
         # it was left unset when the sizes agreed, and set to the *requested*
         # count (contradicting the warning) when they did not.
         self.num_species = supplied_taxa
     self.set_gene_lengths(gene_length_kappa, gene_length_theta,
                           gene_length_min)
     self.gamma_rate_param = gamma_rate_param
     self.permuter = class_tree_permuter
     self.permutations_list = permutations_list
     # NOTE(review): `datatype` and `tmpdir` are not parameters of this
     # constructor; unless they are module-level names these two lines raise
     # NameError. Confirm the intended source of both values.
     self.datatype = datatype
     self.tmpdir = errors.directorymake(tmpdir)
     self.outdir = outdir
     self.generate_class_trees()  # sets self.class_trees dict
     self.make_alf_dirs()  # sets self.alf_dirs dict
     self.write_alf_params()
     self.get_true_partition()
Пример #23
0
    def read_alignments(self, input_dir, file_format, header_grep=None, compression=None):
        """ Load alignment files from an input directory (*.fa, *.fas,
        *.fasta or *.phy, plus the compression suffix when one is given).

        The sorted file list is stored in self._input_files.

        :param input_dir: directory to search
        :param file_format: 'fasta' or 'phylip'; other values match nothing
        :param header_grep: optional callable applied to every sequence
            header; the record is rebuilt from the renamed sequences
        :param compression: None, 'gz' or 'bz2'
        :return: list of Alignment records named after their files
        :raises TypeError: if header_grep cannot be applied to a header
        """

        optioncheck(compression, [None, 'gz', 'bz2'])

        known_extensions = {
            'fasta': ['fa', 'fas', 'fasta'],
            'phylip': ['phy'],
        }
        extensions = known_extensions.get(file_format, [])

        if compression:
            extensions = ['.'.join([ext, compression]) for ext in extensions]

        files = fileIO.glob_by_extensions(input_dir, extensions)
        files.sort(key=SORT_KEY)
        self._input_files = files

        def parse(path):
            # Try with the third argument True first, fall back to False
            # when that attempt raises RuntimeError.
            try:
                return Alignment(path, file_format, True)
            except RuntimeError:
                return Alignment(path, file_format, False)

        records = []
        pbar = setup_progressbar("Loading files", len(files), simple_progress=True)
        pbar.start()

        for i, f in enumerate(files):
            if compression is None:
                record = parse(f)
            else:
                # Decompress into a temporary file and parse that copy.
                with fileIO.TempFile() as tmpfile:
                    with fileIO.freader(f, compression) as reader, fileIO.fwriter(tmpfile) as writer:
                        for line in reader:
                            writer.write(line)
                    record = parse(tmpfile)

            if header_grep:
                try:
                    datatype = 'dna' if record.is_dna() else 'protein'
                    renamed = [(header_grep(x), y) for (x, y) in record.get_sequences()]
                    record = Alignment(renamed, datatype)

                except TypeError:
                    raise TypeError("Couldn't apply header_grep to header\n"
                                    "alignment number={}, name={}\n"
                                    "header_grep={}".format(i, fileIO.strip_extensions(f), header_grep))
                except RuntimeError:
                    print('RuntimeError occurred processing alignment number={}, name={}'
                          .format(i, fileIO.strip_extensions(f)))
                    raise

            record.name = (fileIO.strip_extensions(f))
            records.append(record)
            pbar.update(i)

        pbar.finish()
        return records
Пример #24
0
    def read_alignments(self,
                        input_dir,
                        file_format,
                        header_grep=None,
                        compression=None):
        """ Read every alignment file in an input directory.

        Files are matched by extension: *.fa, *.fas, *.fasta for fasta and
        *.phy for phylip, with the compression suffix appended when one is
        given. The sorted file list is kept in self._input_files.

        :param input_dir: directory to search
        :param file_format: 'fasta' or 'phylip'; other values match nothing
        :param header_grep: optional callable used to rewrite each sequence
            header
        :param compression: None, 'gz' or 'bz2'
        :return: list of Alignment records named after their files
        :raises TypeError: if header_grep cannot be applied to a header
        """

        optioncheck(compression, [None, 'gz', 'bz2'])

        if file_format == 'phylip':
            extensions = ['phy']
        elif file_format == 'fasta':
            extensions = ['fa', 'fas', 'fasta']
        else:
            extensions = []

        if compression:
            extensions = ['.'.join([stem, compression])
                          for stem in extensions]

        files = fileIO.glob_by_extensions(input_dir, extensions)
        files.sort(key=SORT_KEY)
        self._input_files = files

        def load_alignment(source):
            # First attempt passes True as the third argument; a
            # RuntimeError triggers a retry with False.
            try:
                loaded = Alignment(source, file_format, True)
            except RuntimeError:
                loaded = Alignment(source, file_format, False)
            return loaded

        pbar = setup_progressbar("Loading files",
                                 len(files),
                                 simple_progress=True)
        pbar.start()

        records = []
        for i, f in enumerate(files):
            if compression is not None:
                # Copy the decompressed content to a temporary file first.
                with fileIO.TempFile() as tmpfile:
                    with fileIO.freader(f, compression) as reader, \
                            fileIO.fwriter(tmpfile) as writer:
                        for line in reader:
                            writer.write(line)
                    record = load_alignment(tmpfile)
            else:
                record = load_alignment(f)

            if header_grep:
                try:
                    datatype = 'dna' if record.is_dna() else 'protein'
                    relabelled = [(header_grep(x), y)
                                  for (x, y) in record.get_sequences()]
                    record = Alignment(relabelled, datatype)

                except TypeError:
                    raise TypeError("Couldn't apply header_grep to header\n"
                                    "alignment number={}, name={}\n"
                                    "header_grep={}".format(
                                        i, fileIO.strip_extensions(f),
                                        header_grep))
                except RuntimeError:
                    print(
                        'RuntimeError occurred processing alignment number={}, name={}'
                        .format(i, fileIO.strip_extensions(f)))
                    raise

            record.name = (fileIO.strip_extensions(f))
            records.append(record)
            pbar.update(i)

        pbar.finish()
        return records