def test_empty_file(self):
    "Guessing the format of an empty file must raise UnknownFormatError"
    empty_fhand = StringIO()
    try:
        guess_format(empty_fhand)
    except UnknownFormatError:
        # This is the only acceptable outcome for an empty handle.
        return
    self.fail("UnknownFormatError expected")
def test_unkown(self):
    "Content that matches no known format must raise UnknownFormatError"
    unknown_fhand = StringIO("xseq\nACTC\n")
    try:
        guess_format(unknown_fhand)
    except UnknownFormatError:
        # The content was rejected, as expected.
        return
    self.fail("UnknownFormatError expected")
def test_long_illumina(self):
    "A 400 bp read with illumina-looking qualities can not be decided"
    read_length = 400
    # One fastq record: title, sequence, separator and quality lines.
    record_lines = ["@read", "T" * read_length, "+", "@" * read_length, ""]
    long_fhand = StringIO("\n".join(record_lines))
    try:
        guess_format(long_fhand)
    except UndecidedFastqVersionError:
        return
    self.fail("UndecidedFastqVersionError expected")
def test_fasta(self):
    "It tells fasta files apart from qual files"
    # Three qual records, each one holding eight quality values of 30.
    eight_quals = " ".join(["30"] * 8)
    multi_qual = "".join(
        ">seq%d\n%s\n" % (number, eight_quals) for number in (1, 2, 3)
    )

    cases = [
        (">seq\nACTC\n", "fasta"),
        (">seq\n10 20\n", "qual"),
        (multi_qual, "qual"),
    ]
    for content, expected_format in cases:
        assert guess_format(StringIO(content)) == expected_format
def seqio(in_fhands, out_fhands, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    The format of every input file is guessed with guess_format and then
    one of these strategies is used:

      - One input file that already is in out_format and whose handle has
        a name: the content is copied to out_fhands[0] or, if
        copy_if_same_format is False, a relative symlink is created from
        the input path to the output path.
      - One input file and one output file: SeqIO.convert is used.
      - One input file, two output files and out_format 'fasta': the
        sequences are written to out_fhands[0] and the qualities, in qual
        format, to out_fhands[1].

    NOTE(review): no branch visible here handles any other combination of
    inputs and outputs; in that case nothing is written.

    A ValueError that error_quality_disagree recognizes is re-raised as a
    MalformedFile, any other ValueError is propagated untouched.
    '''
    in_formats = [guess_format(fhand) for fhand in in_fhands]

    if (len(in_formats) == 1 and in_formats[0] == out_format and
            hasattr(in_fhands[0], 'name')):
        # Nothing to convert, the file is already in the requested format.
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhands[0])
        else:
            rel_symlink(in_fhands[0].name, out_fhands[0].name)
    elif len(in_fhands) == 1 and len(out_fhands) == 1:
        try:
            SeqIO.convert(in_fhands[0], in_formats[0], out_fhands[0],
                          out_format)
        except ValueError as error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            raise
    elif (len(in_fhands) == 1 and len(out_fhands) == 2 and
            out_format == 'fasta'):
        try:
            for seq in read_seqrecords([in_fhands[0]]):
                SeqIO.write([seq], out_fhands[0], out_format)
                SeqIO.write([seq], out_fhands[1], 'qual')
        # "except X as name" instead of the old "except X, name": same
        # behaviour, consistent with the branch above and valid in python 3.
        except ValueError as error:
            if error_quality_disagree(error):
                raise MalformedFile(str(error))
            raise
def test_fastq(self):
    "It guesses the format of illumina and sanger fastq files"
    # illumina
    txt = "@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    txt += "TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n"
    txt += "+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    # The backslash is one of the quality characters, so it is escaped
    # explicitly instead of relying on the invalid escape sequence "\]".
    txt += "efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\\]][]\n"
    fhand = StringIO(txt)
    assert guess_format(fhand) == "fastq-illumina"

    # A truncated record can not be guessed. No assertion is made on the
    # returned value: any return at all is a failure.
    fhand = StringIO("@HWI-EAS209\n@")
    try:
        guess_format(fhand)
        self.fail("UnknownFormatError expected")
    except UnknownFormatError:
        pass

    # sanger
    txt = "@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    txt += "TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n"
    txt += "+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n"
    txt += "000000000000000000000000000000000000000000000000000000000000\n"
    fhand = StringIO(txt)
    assert guess_format(fhand) == "fastq"
def count_seqs_in_files(fhands, file_format=GUESS_FORMAT):
    '''It counts the seqs in the given files.

    fhands is an iterable of open sequence files. If file_format is
    GUESS_FORMAT or None the format of each file is guessed independently,
    otherwise the given format is used for all the files.

    It returns the total number of sequences found in all the files.
    '''
    guess_each_file = file_format == GUESS_FORMAT or file_format is None

    count = 0
    for fhand in fhands:
        # The format is kept in a local variable. Overwriting file_format
        # here would make every file after the first one reuse the format
        # guessed for the first one.
        if guess_each_file:
            fmt = guess_format(fhand)
        else:
            fmt = file_format

        if fmt == 'fasta':
            count += _count_seqs_in_fasta(fhand)
        elif 'fastq' in fmt:
            count += length(QualityIO.FastqGeneralIterator(fhand))
        else:
            # The format is handed over so it is not guessed again and an
            # explicitly requested format is honored.
            count += length(read_seqrecords([fhand], file_format=fmt))
    return count
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the title line as the key and not just the id.

    fpath is the path to the sequence file. If file_format is None the
    format is guessed from the file content.

    It returns the index created by index(fpath, format=file_format).
    '''
    if file_format is None:
        # The handle is only needed to guess the format, index() is given
        # the path, so it is closed as soon as the format is known.
        with open(fpath) as fhand:
            file_format = guess_format(fhand)

    # pylint: disable=W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    for fastq_format in ('fastq', 'fastq-sanger', 'fastq-solexa',
                         'fastq-illumina'):
        accessor[fastq_format] = FastqRandomAccess

    try:
        file_index = index(fpath, format=file_format)
    finally:
        # The original accessors are restored even if the indexing fails.
        _index._FormatToRandomAccess = old_accessor
    return file_index
def _seqrecords_in_fhand(fhand, fmt):
    'It returns the seqrecord iterator for one file of the given format'
    # fasta, qual and fastq files are parsed with title2ids.
    if fmt == 'fasta':
        return FastaIO.FastaIterator(fhand, title2ids=title2ids)
    if fmt == 'qual':
        return QualityIO.QualPhredIterator(fhand, title2ids=title2ids)
    if fmt in ('fastq', 'fastq-sanger'):
        return QualityIO.FastqPhredIterator(fhand, title2ids=title2ids)
    if fmt == 'fastq-solexa':
        return QualityIO.FastqSolexaIterator(fhand, title2ids=title2ids)
    if fmt == 'fastq-illumina':
        return QualityIO.FastqIlluminaIterator(fhand, title2ids=title2ids)
    # Any other format is left to the generic Biopython parser.
    return SeqIO.parse(fhand, fmt)


def read_seqrecords(fhands, file_format=GUESS_FORMAT):
    '''It returns an iterator with the seqrecords of all the given files.

    If file_format is GUESS_FORMAT or None the format of each file is
    guessed, otherwise the given format is used for all of them.
    '''
    should_guess = file_format == GUESS_FORMAT or file_format is None

    per_file_iters = []
    for seq_fhand in fhands:
        fmt = guess_format(seq_fhand) if should_guess else file_format
        per_file_iters.append(_seqrecords_in_fhand(seq_fhand, fmt))
    return chain.from_iterable(per_file_iters)
print 'hello', error parser.error(error) new_out_fhands.append(out_f) out_fhand = new_out_fhands else: try: out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) out_format = parsed_args.out_format # The default format is the same as the first file if not out_format: if in_format == GUESS_FORMAT: out_format = guess_format(wrapped_fhands[0]) else: out_format = in_format # The original fhands should be stored, because otherwise they would be # closed args = {'out_fhand': out_fhand, 'in_fhands': wrapped_fhands, 'out_format': out_format, 'original_in_fhands': in_fhands, 'in_format': in_format} return args, parsed_args def parse_basic_process_args(parser): 'It parses the command line and it returns a dict with the arguments.' args, parsed_args = parse_basic_args(parser) args['processes'] = parsed_args.processes return args, parsed_args