def permuted_copy(self, partition=None):
    """
    Return a copy of the collection with all alignment columns permuted.

    For every index tuple returned by ``partition.get_membership()`` the
    selected alignments are joined with ``Concatenation``, the sites
    (columns) of the joined alignment are shuffled in place with
    ``random.shuffle``, and the shuffled columns are then dealt back out
    into new alignments whose lengths follow ``concat.lengths``.

    :param partition: object exposing ``get_membership()``. When None,
        ``Partition([1] * len(self))`` is used (presumably a single group
        holding every alignment -- confirm against Partition).
    :return: a new instance of ``self.__class__`` built from the permuted
        alignments, sorted by ``SORT_KEY`` applied to each alignment name.

    The shuffle uses the module-level ``random`` state, so results are only
    reproducible if the caller seeds ``random`` beforehand.
    """
    def take(n, iterable):
        # Use the builtin next(): the iterator method .next() exists only
        # on Python 2, whereas next(it) works on Python 2.6+ and Python 3.
        return [next(iterable) for _ in range(n)]

    if partition is None:
        partition = Partition([1] * len(self))

    index_tuples = partition.get_membership()

    alignments = []
    for ix in index_tuples:
        concat = Concatenation(self, ix)
        sites = concat.alignment.get_sites()
        random.shuffle(sites)

        # Transpose the shuffled sites back into one row per sequence and
        # keep an iterator per sequence name, so successive take() calls
        # consume consecutive, non-overlapping stretches of each row.
        d = dict(zip(concat.alignment.get_names(),
                     [iter(x) for x in zip(*sites)]))

        # One list of (name, sequence) pairs per original alignment length.
        new_seqs = [[(k, ''.join(take(l, d[k]))) for k in d]
                    for l in concat.lengths]

        for seqs, datatype, name in zip(new_seqs, concat.datatypes, concat.names):
            alignment = Alignment(seqs, datatype)
            alignment.name = name
            alignments.append(alignment)

    return self.__class__(records=sorted(alignments,
                                         key=lambda x: SORT_KEY(x.name)))
def permuted_copy(self, partition=None):
    """
    Return a copy of the collection with all alignment columns permuted.

    Each group of indices yielded by ``partition.get_membership()`` is
    concatenated via ``Concatenation``; the sites of that concatenated
    alignment are shuffled with ``random.shuffle`` and then split back
    into alignments using the lengths, datatypes and names recorded on
    the concatenation.

    :param partition: object providing ``get_membership()``. Defaults to
        ``Partition([1] * len(self))`` when None (presumably one group
        containing all alignments -- verify against Partition).
    :return: new ``self.__class__`` instance whose records are the permuted
        alignments ordered by ``SORT_KEY(alignment.name)``.

    Shuffling relies on the global ``random`` state; seed it beforehand if
    a reproducible permutation is required.
    """
    def take(n, iterable):
        # next(it) replaces it.next(): the method form is Python 2 only and
        # raises AttributeError on Python 3 iterators.
        return [next(iterable) for _ in range(n)]

    if partition is None:
        partition = Partition([1] * len(self))

    index_tuples = partition.get_membership()

    alignments = []
    for ix in index_tuples:
        concat = Concatenation(self, ix)
        sites = concat.alignment.get_sites()
        random.shuffle(sites)

        # zip(*sites) turns the shuffled columns back into per-sequence
        # rows; wrapping each row in iter() lets take() walk along it in
        # consecutive chunks, one chunk per entry of concat.lengths.
        d = dict(
            zip(concat.alignment.get_names(), [iter(x) for x in zip(*sites)]))
        new_seqs = [[(k, ''.join(take(l, d[k]))) for k in d]
                    for l in concat.lengths]

        for seqs, datatype, name in zip(new_seqs, concat.datatypes,
                                        concat.names):
            alignment = Alignment(seqs, datatype)
            alignment.name = name
            alignments.append(alignment)

    return self.__class__(
        records=sorted(alignments, key=lambda x: SORT_KEY(x.name)))
def read_alignments(self, input_dir, file_format, header_grep=None, compression=None):
    """
    Load the alignment files found in `input_dir` and return them as a list.

    Extensions searched for are 'fa', 'fas' and 'fasta' when `file_format`
    is 'fasta', 'phy' when it is 'phylip', and none for any other value.
    When `compression` is set it is appended as a further extension
    (e.g. 'fa.gz'). The sorted list of matching paths is stored in
    self._input_files.

    :param input_dir: directory handed to fileIO.glob_by_extensions.
    :param file_format: format name, also passed on to Alignment.
    :param header_grep: optional callable applied to every sequence header;
        the alignment is rebuilt from the rewritten headers.
    :param compression: None, 'gz' or 'bz2' (checked with optioncheck).
    :return: list of Alignment objects, each with `name` set to
        fileIO.strip_extensions of its path.
    :raises TypeError: when applying `header_grep` raises TypeError.
    """
    def load(source):
        # First attempt passes True as Alignment's third argument, and a
        # RuntimeError triggers a retry with False.
        # NOTE(review): presumably a DNA-vs-protein switch -- confirm.
        try:
            return Alignment(source, file_format, True)
        except RuntimeError:
            return Alignment(source, file_format, False)

    optioncheck(compression, [None, 'gz', 'bz2'])

    extensions = []
    if file_format == 'fasta':
        extensions = ['fa', 'fas', 'fasta']
    elif file_format == 'phylip':
        extensions = ['phy']

    if compression:
        extensions = ['.'.join((ext, compression)) for ext in extensions]

    files = fileIO.glob_by_extensions(input_dir, extensions)
    files.sort(key=SORT_KEY)
    self._input_files = files

    pbar = setup_progressbar("Loading files", len(files), simple_progress=True)
    pbar.start()

    records = []
    for index, path in enumerate(files):
        if compression is None:
            record = load(path)
        else:
            # Copy the file through the compression-aware reader into a
            # temporary file, then parse it while the temp file still exists.
            with fileIO.TempFile() as tmpfile:
                with fileIO.freader(path, compression) as reader, fileIO.fwriter(tmpfile) as writer:
                    for line in reader:
                        writer.write(line)
                record = load(tmpfile)

        if header_grep:
            try:
                if record.is_dna():
                    datatype = 'dna'
                else:
                    datatype = 'protein'
                renamed = [(header_grep(header), sequence)
                           for (header, sequence) in record.get_sequences()]
                record = Alignment(renamed, datatype)
            except TypeError:
                raise TypeError("Couldn't apply header_grep to header\n"
                                "alignment number={}, name={}\n"
                                "header_grep={}".format(index, fileIO.strip_extensions(path), header_grep))
            except RuntimeError:
                print('RuntimeError occurred processing alignment number={}, name={}'
                      .format(index, fileIO.strip_extensions(path)))
                raise

        record.name = fileIO.strip_extensions(path)
        records.append(record)
        pbar.update(index)

    pbar.finish()
    return records
def read_alignments(self, input_dir, file_format, header_grep=None, compression=None):
    """
    Read alignment files from an input directory into Alignment objects.

    Files are selected by extension: 'fa', 'fas', 'fasta' for the 'fasta'
    format and 'phy' for 'phylip'; any other `file_format` selects nothing.
    A truthy `compression` is added as a trailing extension. The sorted
    file list is kept in self._input_files and the loaded alignments are
    returned.

    :param input_dir: directory searched through fileIO.glob_by_extensions.
    :param file_format: 'fasta' or 'phylip'; forwarded to Alignment.
    :param header_grep: optional callable used to rewrite sequence headers.
    :param compression: None, 'gz' or 'bz2' (checked with optioncheck).
    :return: list of Alignment objects named after their source files.
    :raises TypeError: if `header_grep` raises TypeError on a header.
    """
    optioncheck(compression, [None, 'gz', 'bz2'])

    if file_format == 'fasta':
        suffixes = ['fa', 'fas', 'fasta']
    elif file_format == 'phylip':
        suffixes = ['phy']
    else:
        suffixes = []

    if compression:
        compressed = []
        for suffix in suffixes:
            compressed.append('.'.join([suffix, compression]))
        suffixes = compressed

    files = fileIO.glob_by_extensions(input_dir, suffixes)
    files.sort(key=SORT_KEY)
    self._input_files = files

    loaded = []
    progress = setup_progressbar("Loading files", len(files), simple_progress=True)
    progress.start()

    for number, filename in enumerate(files):
        if compression is not None:
            with fileIO.TempFile() as tmpfile:
                # Stream the file through the compression-aware reader into
                # the temp file; both handles are closed before parsing.
                with fileIO.freader(filename, compression) as reader:
                    with fileIO.fwriter(tmpfile) as writer:
                        for line in reader:
                            writer.write(line)
                # Parse while the temp file is still alive. A RuntimeError
                # on the first attempt triggers a retry with False as the
                # third Alignment argument.
                try:
                    aln = Alignment(tmpfile, file_format, True)
                except RuntimeError:
                    aln = Alignment(tmpfile, file_format, False)
        else:
            try:
                aln = Alignment(filename, file_format, True)
            except RuntimeError:
                aln = Alignment(filename, file_format, False)

        if header_grep:
            try:
                datatype = 'dna' if aln.is_dna() else 'protein'
                pairs = []
                for (header, sequence) in aln.get_sequences():
                    pairs.append((header_grep(header), sequence))
                aln = Alignment(pairs, datatype)
            except TypeError:
                message = ("Couldn't apply header_grep to header\n"
                           "alignment number={}, name={}\n"
                           "header_grep={}")
                raise TypeError(message.format(
                    number, fileIO.strip_extensions(filename), header_grep))
            except RuntimeError:
                notice = 'RuntimeError occurred processing alignment number={}, name={}'
                print(notice.format(number, fileIO.strip_extensions(filename)))
                raise

        aln.name = fileIO.strip_extensions(filename)
        loaded.append(aln)
        progress.update(number)

    progress.finish()
    return loaded