def __init__(
    self,
    records=None,
    input_dir=None,
    file_format='fasta',
    datatype=None,
    tmpdir='/tmp',
    calc_distances=False,
    compression=None,
    analysis=None,
):
    """
    Build the collection either from ready-made records or from a
    directory of alignment files.

    records        - list of record objects; used in preference to
                     input_dir when non-empty
    input_dir      - directory to read alignments from (via read_files)
                     when no records are given
    file_format    - 'fasta' or 'phylip' (checked only when reading files)
    datatype       - 'dna' or 'protein'; when records are given and this
                     is empty, the datatype of the first record is used
    tmpdir         - working directory, passed through directorycheck
    calc_distances - if true, call self.calc_distances() once loaded
    compression    - forwarded to read_files
    analysis       - accepted but not used by this constructor

    Raises NoRecordsError when no records were supplied or found,
    including the case where neither records nor input_dir is given.
    """
    self.tmpdir = directorycheck(tmpdir)

    # Bind the attribute before branching: previously the "nothing
    # supplied" path reached the emptiness check with self.records unset
    # and died with an AttributeError instead of NoRecordsError.
    self.records = []

    if records:
        self.records = records
        self.datatype = datatype or records[0].datatype
        optioncheck(self.datatype, ['dna', 'protein'])
    elif input_dir:
        directorycheck(input_dir)
        self.datatype = optioncheck(datatype, ['dna', 'protein'])
        optioncheck(file_format, ['fasta', 'phylip'])
        self.records = self.read_files(input_dir, file_format, compression)
    else:
        # Single parenthesised string: same output under the Python 2
        # print statement and the Python 3 print function.
        print('Provide a list of records, or the path to a set of alignments')

    if not self.records:
        raise NoRecordsError(file_format, input_dir, compression)

    if calc_distances:
        self.calc_distances()
def embedding_plotter(
    self,
    coordinates,
    dimensions,
    partition=None,
    add_sphere=False,
    xlab='PCo1',
    ylab='PCo2',
    zlab='PCo3',
    title='Trees embedded in dimension-reduced space',
    outfile=False,
):
    """
    Scatter-plot embedded points in 2 or 3 dimensions and return the figure.

    Points are coloured according to cluster membership specified by the
    Partition object (or all black if no Partition is specified). Only
    seven colour codes are available, so cluster indices beyond that
    reuse colours cyclically.

    coordinates - array whose transpose unpacks into the scatter axes
                  (assumed to have `dimensions` columns - TODO confirm)
    dimensions  - 2 or 3
    partition   - object exposing `partition_vector` (one index per point)
    add_sphere  - 3D only: pass the axes through self.sphere()
    xlab, ylab, zlab, title - axis labels and plot title
    outfile     - if truthy, the figure is saved to '<outfile>.pdf'
    """
    optioncheck(dimensions, [2, 3])

    # Index 6 is 'k' (black): the default puts every record in one black
    # cluster.
    partition = partition or Partition(
        tuple([6] * len(self.collection.records)))

    colours = 'bgrcmyk'
    ncolours = len(colours)
    # Wrap with modulo so a partition with more than seven clusters no
    # longer raises IndexError; indices -7..6 map exactly as before.
    colour_mapping = np.array(
        [colours[i % ncolours] for i in partition.partition_vector])

    fig = plt.figure()
    if dimensions == 3:
        ax = fig.add_subplot(111, projection='3d', xlabel=xlab,
                             ylabel=ylab, zlabel=zlab, title=title)
        if add_sphere:
            ax = self.sphere(ax)
    else:
        ax = fig.add_subplot(111, xlabel=xlab, ylabel=ylab, title=title)

    ax.scatter(*coordinates.T, color=colour_mapping)

    if outfile:
        fig.savefig('{0}.pdf'.format(outfile))

    return fig
def get_decomp(self, method='MDS', **kwargs):
    """
    Decompose the distance matrix held in self.dm.

    method - 'MDS' (default) or 'spectral'
    kwargs - forwarded to the spectral decomposition only; the MDS
             decomposition is called without arguments
    """
    optioncheck(method, ['MDS', 'spectral'])
    clustering = Clustering(self.dm)

    if method == 'spectral':
        return clustering.spectral_decomp(**kwargs)

    if method == 'MDS':
        return clustering.MDS_decomp()
def read_files(self, input_dir, file_format, compression=None):
    """
    Load the alignment files of one format found in an input directory.

    Files are selected by extension through fileIO.glob_by_extensions:
    fa, fas and fasta for 'fasta'; phy for 'phylip'. When compression is
    given ('gz' or 'bz2') it is appended to each extension, e.g. 'fa.gz'.

    Returns a list of TrClSeq objects, one per file, ordered by sort_key.
    The list is returned to the caller; nothing is stored on self.

    An unsupported file_format or compression is rejected by optioncheck.
    """
    optioncheck(compression, [None, 'gz', 'bz2'])
    # Reject unknown formats here; otherwise `extensions` would never be
    # bound and the failure would surface as an UnboundLocalError.
    optioncheck(file_format, ['fasta', 'phylip'])

    if file_format == 'fasta':
        extensions = ['fa', 'fas', 'fasta']
    elif file_format == 'phylip':
        extensions = ['phy']

    if compression:
        extensions = ['.'.join([x, compression]) for x in extensions]

    files = fileIO.glob_by_extensions(input_dir, extensions)
    files.sort(key=sort_key)

    return [
        TrClSeq(f, file_format=file_format, datatype=self.datatype,
                name=get_name(f), tmpdir=self.tmpdir)
        for f in files
    ]
def __new__(
    cls,
    trees,
    metric,
    tmpdir='/tmp',
    dtype=float,
    add_noise=False,
    normalise=False,
):
    """
    Create a distance-matrix array from a set of trees.

    trees     - passed to get_distance_matrix
    metric    - one of 'euc', 'geo', 'rf', 'wrf'
    tmpdir    - passed to get_distance_matrix and kept on the instance
    dtype     - element type of the resulting array
    add_noise - if true, return the result of the instance's add_noise()
    normalise - forwarded to get_distance_matrix
    """
    optioncheck(metric, ['euc', 'geo', 'rf', 'wrf'])

    distances = get_distance_matrix(trees, metric, tmpdir,
                                    normalise=normalise)

    # View the plain array as this class so the extra attributes can be
    # attached to it.
    matrix = np.asarray(distances, dtype).view(cls)
    matrix.metric = metric
    matrix.tmpdir = tmpdir

    if not add_noise:
        return matrix
    return matrix.add_noise()
def embedding_plotter(
    self,
    coordinates,
    dimensions,
    partition=None,
    add_sphere=False,
    xlab='PCo1',
    ylab='PCo2',
    zlab='PCo3',
    title='Trees embedded in dimension-reduced space',
    outfile=False,
):
    """
    Draw a 2D or 3D scatter plot of embedded points; return the figure.

    Points are coloured according to cluster membership specified by the
    Partition object (or all black if no Partition is specified). The
    palette has seven colour codes and is cycled when a cluster index
    exceeds it.

    coordinates - array whose transpose unpacks into the scatter axes
                  (assumed to have `dimensions` columns - TODO confirm)
    dimensions  - 2 or 3
    partition   - object exposing `partition_vector` (one index per point)
    add_sphere  - 3D only: pass the axes through self.sphere()
    xlab, ylab, zlab, title - axis labels and plot title
    outfile     - if truthy, the figure is saved to '<outfile>.pdf'
    """
    optioncheck(dimensions, [2, 3])

    if not partition:
        # Default: every record in cluster 6, which maps to 'k' (black).
        partition = Partition(tuple([6] * len(self.collection.records)))

    colours = 'bgrcmyk'
    # `% len(colours)` prevents an IndexError for partitions with more
    # than seven clusters while leaving indices -7..6 unchanged.
    colour_mapping = np.array([
        colours[index % len(colours)]
        for index in partition.partition_vector
    ])

    fig = plt.figure()
    if dimensions == 3:
        ax = fig.add_subplot(111, projection='3d', xlabel=xlab,
                             ylabel=ylab, zlabel=zlab, title=title)
        if add_sphere:
            ax = self.sphere(ax)
    else:
        ax = fig.add_subplot(111, xlabel=xlab, ylabel=ylab, title=title)

    ax.scatter(*coordinates.T, color=colour_mapping)

    if outfile:
        fig.savefig('{0}.pdf'.format(outfile))

    return fig
def __init__(
    self,
    records,
    analysis,
    max_guidetrees=10,
    tmpdir=None,
    datatype=None,
    verbosity=0,
):
    """
    Store the records and the settings for an analysis run.

    records        - sequence of record objects; the first one supplies
                     datatype / tmpdir when those are not given
    analysis       - one of 'ml', 'nj', 'TreeCollection'
    max_guidetrees - stored as given
    tmpdir         - working directory; defaults to records[0].tmpdir
    datatype       - 'protein' or 'dna'; defaults to records[0].datatype
    verbosity      - stored as given
    """
    self.analysis = optioncheck(analysis, ['ml', 'nj', 'TreeCollection'])
    self.max_guidetrees = max_guidetrees
    self.records = records

    # Only look at the first record when no explicit datatype was passed.
    if datatype:
        self.datatype = datatype
    else:
        self.datatype = records[0].datatype
    self.verbosity = verbosity
    optioncheck(self.datatype, ['protein', 'dna'])

    # Same fallback rule for the working directory.
    if tmpdir:
        self.tmpdir = tmpdir
    else:
        self.tmpdir = records[0].tmpdir

    self.concats = dict()
    self.history = list()
def read_files(self, input_dir, file_format, compression=None):
    """
    Read every alignment file of the given format from input_dir.

    Matching is by file extension (through fileIO.glob_by_extensions):
    'fasta' selects fa, fas and fasta; 'phylip' selects phy. A compression
    of 'gz' or 'bz2' is appended to each extension (e.g. 'phy.bz2').

    Returns the files, in sort_key order, as a list of TrClSeq objects.
    The result is returned rather than stored on the instance.

    optioncheck rejects an unsupported file_format or compression.
    """
    optioncheck(compression, [None, 'gz', 'bz2'])
    # Validate the format up front: an unknown value used to leave
    # `extensions` unbound and fail later with an UnboundLocalError.
    optioncheck(file_format, ['fasta', 'phylip'])

    extensions_by_format = {
        'fasta': ['fa', 'fas', 'fasta'],
        'phylip': ['phy'],
    }
    extensions = extensions_by_format[file_format]

    if compression:
        extensions = ['.'.join([ext, compression]) for ext in extensions]

    files = fileIO.glob_by_extensions(input_dir, extensions)
    files.sort(key=sort_key)

    return [TrClSeq(f, file_format=file_format, datatype=self.datatype,
                    name=get_name(f), tmpdir=self.tmpdir)
            for f in files]
def simulate(self, index_list, model=None):
    """
    Simulate a group of sequence alignments using ALF.

    Uses one of {(GCB, JTT, LG, WAG - protein), (CPAM, ECM and ECMu - DNA)},
    WAG by default for protein data and ECM by default otherwise.

    The code shown here only chooses the default model and validates it:
    for protein data an unsupported model is rejected by optioncheck; for
    any other datatype an unsupported model is reported on stdout and the
    method returns None. index_list is not used by this part.

    TO DO: add parameterised models when I have a robust (probably PAML)
    method of estimating them from alignment+tree
    """
    if self.datatype == 'protein':
        # set some defaults
        model = model or 'WAG'
        optioncheck(model, [
            'CPAM',
            'ECM',
            'ECMu',
            'WAG',
            'JTT',
            'GCB',
            'LG',
        ])
    else:
        model = model or 'ECM'
        try:
            optioncheck(model, ['CPAM', 'ECM', 'ECMu'])
        except OptionError as e:
            # `except ... as` and a single-argument print run on both
            # Python 2.6+ and Python 3; the text written is the same as
            # the old `print '...\n', e` (no space follows a newline).
            print('Choose a DNA-friendly model for simulation:\n' + str(e))
            return
import argparse

# Command-line options for the clustering script.
parser = argparse.ArgumentParser(description='Clustering optimiser')
parser.add_argument('-n', '--nclusters', type=int)
parser.add_argument('-f', '--format', default='phylip')
parser.add_argument('-d', '--datatype', default='protein')
parser.add_argument('-i', '--input_dir', default='./')
parser.add_argument('-c', '--compression', default=None)
parser.add_argument('-t', '--tmpdir', default='/tmp/')
parser.add_argument('-m', '--method', default='s')
parser.add_argument('-o', '--output', default=None)
args = parser.parse_args()

# Each method is accepted under a short and a long spelling.
optioncheck(
    args.method,
    [
        's', 'spectral',
        'h', 'hierarchical',
        'k', 'kmedoids',
        'MDS', 'mds',
    ],
)


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a string of `size` characters picked at random from `chars`."""
    picked = [random.choice(chars) for _ in range(size)]
    return ''.join(picked)


# Work inside a fresh temporary directory created under the requested one.
new_tmpdir = tempfile.mkdtemp(prefix='tmpwrap_mgp_', dir=args.tmpdir)

# Load the alignments, build NJ trees, then set up the clustering on the
# 'euc' distance matrix.
c = Collection(
    input_dir=args.input_dir,
    compression=args.compression,
    file_format=args.format,
    datatype=args.datatype,
    tmpdir=new_tmpdir,
)
c.calc_NJ_trees()
dm = c.distance_matrix('euc')
cl = Clustering(dm)
def decomp_to_coords(self, decomp, dimensions, normalise=False):
    """
    Extract coordinates for a 2D or 3D embedding from a decomposition.

    decomp     - object providing coords_by_dimension()
    dimensions - 2 or 3
    normalise  - if true, return coords.normalise_rows() instead of the
                 raw coordinates
    """
    optioncheck(dimensions, [2, 3])

    # Only the first element of what coords_by_dimension returns is used.
    coords = decomp.coords_by_dimension(dimensions)[0]

    if normalise:
        return coords.normalise_rows()
    return coords
def decomp_to_coords(self, decomp, dimensions, normalise=False):
    """
    Return the coordinates of a decomposition in 2 or 3 dimensions.

    decomp     - object providing coords_by_dimension()
    dimensions - 2 or 3
    normalise  - when true the coordinates are passed through their
                 normalise_rows() method before being returned
    """
    optioncheck(dimensions, [2, 3])

    by_dimension = decomp.coords_by_dimension(dimensions)
    # The first item of the result holds the coordinates.
    coords = by_dimension[0]

    if not normalise:
        return coords
    return coords.normalise_rows()