def test_fastq(self):
    'It guesses the fastq flavour: illumina, undecided and sanger inputs'
    title_line = '@HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
    seq_line = 'TTAATTGGTAAATAAATCTCCTAATAGCTTAGATNTTACCTTNNNNNNNNNNTAGTTTCT\n'
    plus_line = '+HWI-EAS209_0006_FC706VJ:5:58:5894:21141#ATCACG/1\n'
    illumina_quals = (
        'efcfffffcfeefffcffffffddf`feed]`]_Ba_^__[YBBBBBBBBBBRTT\]][]\n')
    sanger_quals = (
        '000000000000000000000000000000000000000000000000000000000000\n')

    # One record with the sequence and the qualities in one line each
    one_line_record = ''.join([title_line, seq_line, plus_line,
                               illumina_quals])
    fhand = StringIO(one_line_record)
    assert get_format(fhand) == 'fastq-illumina'

    # The sequence and the qualities are repeated in two lines each and the
    # whole record is given twice
    two_line_record = ''.join([title_line, seq_line, seq_line, plus_line,
                               illumina_quals, illumina_quals])
    fhand = StringIO(two_line_record + two_line_record)
    assert get_format(fhand) == 'fastq-illumina'

    # With this input get_format is expected to raise instead of returning
    fhand = StringIO('@HWI-EAS209\n@')
    try:
        assert get_format(fhand) == 'fasta'
    except UndecidedFastqVersionError:
        pass
    else:
        self.fail('UndecidedFastqVersionError expected')

    # sanger
    sanger_record = ''.join([title_line, seq_line, plus_line, sanger_quals])
    fhand = StringIO(sanger_record)
    assert get_format(fhand) == 'fastq'
def test_empty_file(self):
    'An empty file should make get_format raise FileIsEmptyError'
    empty_fhand = StringIO()
    try:
        get_format(empty_fhand)
    except FileIsEmptyError:
        # This is the expected outcome
        return
    self.fail('FileIsEmptyError expected')
def test_unkown(self):
    'An unrecognized content should make get_format raise UnknownFormatError'
    unknown_fhand = StringIO('xseq\nACTC\n')
    try:
        get_format(unknown_fhand)
    except UnknownFormatError:
        pass
    else:
        self.fail('UnknownFormatError expected')
def test_long_illumina(self):
    'A 400 base read with "@" qualities should leave the version undecided'
    read_len = 400
    lines = ['@read\n',
             'T' * read_len + '\n',
             '+\n',
             '@' * read_len + '\n']
    fhand = StringIO(''.join(lines))
    try:
        get_format(fhand)
    except UndecidedFastqVersionError:
        pass
    else:
        self.fail('UndecidedFastqVersionError expected')
def parse_basic_args(parser):
    '''It parses the command line and prepares the input and output fhands.

    parser is the object whose parse_args() method is called and whose
    error() method is used to report a failed output compression.

    It raises a WrongFormatError if an input format was given explicitly and
    it does not match the format guessed for the first input file.

    NOTE(review): only the first part of this function is visible here, the
    creation of the returned value is not shown.
    '''
    parsed_args = parser.parse_args()
    # we have to wrap the file in a BufferedReader to allow peeking into stdin
    wrapped_fhands = []
    # if input is stdin it will be a fhand not a list of fhands.
    # we have to convert to a list
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]
    for fhand in in_fhands:
        fhand = wrap_in_buffered_reader(fhand)
        fhand = uncompress_if_required(fhand)
        wrapped_fhands.append(fhand)

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if in_format == GUESS_FORMAT:
        # The returned format is ignored, get_format is called for every
        # input presumably to have its format registered -- TODO confirm
        for wrapped_fhand in wrapped_fhands:
            get_format(wrapped_fhand)
    else:
        # Only the first input is checked against the requested format
        if in_format != get_format(wrapped_fhands[0]):
            msg = 'The given input format does not correspond to the input'
            msg += ' file'
            raise WrongFormatError(msg)

        if 'fastq' in in_format:
            # For the fastq inputs the format is guessed file by file
            # instead of trusting the one given in the command line
            for wrapped_fhand in wrapped_fhands:
                get_format(wrapped_fhand)
        else:
            # we dont set the first one because already did in the previous
            # checking
            for wrapped_fhand in wrapped_fhands[1:]:
                set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)
    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        # Every output fhand is replaced by the one returned by
        # compress_fhand
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                # The failure is reported through the parser
                parser.error(error)
            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
def _index_seq_file(fpath, file_format=None):
    '''It indexes a seq file using Biopython index.

    It uses the whole title line as the key and not just the id.

    fpath is the path to the sequence file. file_format is the format of the
    file, if it is None it is guessed from the file content. In both cases
    the format is passed through remove_multiline before indexing.

    It returns the object created by index().
    '''
    if file_format is None:
        # The fhand is only needed to guess the format, so it is closed as
        # soon as the guess is done
        with open(fpath) as fhand:
            file_format = get_format(fhand)

    file_format = remove_multiline(file_format)

    # pylint: disable=W0212
    # we monkey patch to be able to index using the whole title line and not
    # only the id. We need it because in a pair end file sequences with the
    # same id could be found
    accessor = _index._FormatToRandomAccess
    old_accessor = accessor.copy()
    fastq_formats = ('fastq', 'fastq-sanger', 'fastq-solexa',
                     'fastq-illumina')
    for fastq_format in fastq_formats:
        accessor[fastq_format] = FastqRandomAccess

    try:
        file_index = index(fpath, format=file_format)
    finally:
        # The original accessors are put back even when the indexing fails
        _index._FormatToRandomAccess = old_accessor

    return file_index
def _read_seqitems(fhands):
    'It chains the seq items (tuples of name and chunk) of all the fhands'

    def _seqitems_in_fhand(fhand):
        'It builds the seq item iterator for just one fhand'
        file_format = get_format(fhand)
        items = _itemize_fastx(fhand)
        return assing_kind_to_seqs(SEQITEM, items, file_format)

    # The list is built eagerly, so the format of every fhand is requested
    # before the chained iterator is returned
    per_fhand_iters = [_seqitems_in_fhand(fhand) for fhand in fhands]
    return chain.from_iterable(per_fhand_iters)
def test_get_format_stringio(self):
    "The format guessed for a StringIO is kept in the format inventory"
    fhand = StringIO(">seq\natgctacgacta\n")
    # The expected inventory key is the id of the fhand plus the sha224 of
    # the first 100 characters of its content
    content_head = fhand.getvalue()[:100]
    content_hash = hashlib.sha224(content_head).hexdigest()
    inventory_key = (id(fhand), content_hash)

    guessed_format = get_format(fhand)
    assert FILEFORMAT_INVENTORY[inventory_key] == guessed_format
def test_fasta(self):
    'It guesses the fasta and the qual formats'
    three_seqs_qual = (">seq1\n30 30 30 30 30 30 30 30\n>seq2\n30 30 30 30 30 30 30"
                       " 30\n>seq3\n30 30 30 30 30 30 30 30\n")
    # Pairs of file content and the format expected for it
    cases = [
        ('>seq\nACTC\n', 'fasta'),
        # multiline fasta
        ('>seq\nACTC\nACTG\n>seq2\nACTG\n', 'fasta'),
        # qual
        ('>seq\n10 20\n', 'qual'),
        # qual with several sequences
        (three_seqs_qual, 'qual'),
    ]
    for content, expected_format in cases:
        fhand = StringIO(content)
        assert get_format(fhand) == expected_format
def test_get_format_stringio(self):
    "The format guessed for a StringIO is kept in the format inventory"
    fhand = StringIO('>seq\natgctacgacta\n')
    # The expected inventory key is the id of the fhand plus the sha224 of
    # the first 100 characters of its content
    content_head = fhand.getvalue()[:100]
    content_hash = hashlib.sha224(content_head).hexdigest()
    inventory_key = (id(fhand), content_hash)

    guessed_format = get_format(fhand)
    assert FILEFORMAT_INVENTORY[inventory_key] == guessed_format
def test_get_format_fhand(self):
    "The format of a real file is stored once and set_format overrides it"
    tmp_fhand = NamedTemporaryFile()
    tmp_fhand.write(">seq\natgctacgacta\n")
    tmp_fhand.flush()
    # For a named file the inventory key is the id of the fhand and its name
    inventory_key = (id(tmp_fhand), tmp_fhand.name)

    first_guess = get_format(tmp_fhand)
    assert FILEFORMAT_INVENTORY[inventory_key] == first_guess

    # Asking again for the format should not add a new inventory entry
    entries_before = len(FILEFORMAT_INVENTORY)
    second_guess = get_format(tmp_fhand)
    assert FILEFORMAT_INVENTORY[inventory_key] == second_guess
    assert len(FILEFORMAT_INVENTORY) == entries_before

    # A format set by hand is the one returned, even for an empty file
    tmp_fhand = NamedTemporaryFile()
    set_format(tmp_fhand, "fasta")
    assert "fasta" == get_format(tmp_fhand)
def test_get_format_fhand(self):
    "The format of a real file is stored once and set_format overrides it"
    tmp_fhand = NamedTemporaryFile()
    tmp_fhand.write('>seq\natgctacgacta\n')
    tmp_fhand.flush()
    # For a named file the inventory key is the id of the fhand and its name
    inventory_key = (id(tmp_fhand), tmp_fhand.name)

    first_guess = get_format(tmp_fhand)
    assert FILEFORMAT_INVENTORY[inventory_key] == first_guess

    # Asking again for the format should not add a new inventory entry
    entries_before = len(FILEFORMAT_INVENTORY)
    second_guess = get_format(tmp_fhand)
    assert FILEFORMAT_INVENTORY[inventory_key] == second_guess
    assert len(FILEFORMAT_INVENTORY) == entries_before

    # A format set by hand is the one returned, even for an empty file
    tmp_fhand = NamedTemporaryFile()
    set_format(tmp_fhand, 'fasta')
    assert 'fasta' == get_format(tmp_fhand)
def sort_by_position_in_ref(in_fhands, ref_fpath, directory=None,
                            tempdir=None):
    '''It yields the reads as seq items after mapping and sorting them.

    The files of in_fhands are mapped as unpaired reads with bowtie2 against
    the index obtained for ref_fpath (directory is handed to the index
    lookup), the mapping is sorted into a temporary file (tempdir is handed
    to the sorting step) and every aligned read found in that file is
    converted with alignedread_to_seqitem.

    The format is taken only from the first fhand.
    '''
    read_fpaths = [in_fhand.name for in_fhand in in_fhands]
    file_format = get_format(in_fhands[0])

    # -f is added to the bowtie2 parameters only for the fasta inputs
    if 'fasta' in file_format:
        extra_params = ['-f']
    else:
        extra_params = []

    index_fpath = get_or_create_bowtie2_index(ref_fpath, directory)
    mapping = map_with_bowtie2(index_fpath, unpaired_fpaths=read_fpaths,
                               extra_params=extra_params)

    sorted_fhand = NamedTemporaryFile()
    sort_mapped_reads(mapping, sorted_fhand.name, tempdir=tempdir)

    for aligned_read in pysam.Samfile(sorted_fhand.name):
        yield alignedread_to_seqitem(aligned_read, file_format)
def parse_basic_args(parser):
    '''It parses the command line and prepares the input and output fhands.

    parser is the object whose parse_args() method is called and whose
    error() method is used to report a failed output compression.

    NOTE(review): only the first part of this function is visible here, the
    creation of the returned value is not shown.
    '''
    parsed_args = parser.parse_args()
    # we have to wrap the file in a BufferedReader to allow peeking into stdin
    wrapped_fhands = []
    # if input is stdin it will be a fhand not a list of fhands.
    # we have to convert to a list
    in_fhands = parsed_args.input
    if not isinstance(in_fhands, list):
        in_fhands = [in_fhands]
    for fhand in in_fhands:
        fhand = wrap_in_buffered_reader(fhand)
        fhand = uncompress_if_required(fhand)
        wrapped_fhands.append(fhand)

    # We have to add the one_line to the fastq files in order to get the
    # speed improvements of the seqitems
    in_format = parsed_args.in_format
    if in_format == GUESS_FORMAT:
        # The returned format is ignored, get_format is called for every
        # input presumably to have its format registered -- TODO confirm
        for wrapped_fhand in wrapped_fhands:
            get_format(wrapped_fhand)
    else:
        # The format given in the command line is set for every input
        # without checking it against the file content
        for wrapped_fhand in wrapped_fhands:
            set_format(wrapped_fhand, in_format)

    out_fhand = getattr(parsed_args, OUTFILE)
    comp_kind = get_requested_compression(parsed_args)
    if isinstance(out_fhand, list):
        # Every output fhand is replaced by the one returned by
        # compress_fhand
        new_out_fhands = []
        for out_f in out_fhand:
            try:
                out_f = compress_fhand(out_f, compression_kind=comp_kind)
            except RuntimeError, error:
                # The failure is reported through the parser
                parser.error(error)
            new_out_fhands.append(out_f)
        out_fhand = new_out_fhands
def sort_by_position_in_ref(in_fhand, index_fpath, directory=None,
                            tempdir=None):
    '''It yields the reads as seq items after mapping and sorting them.

    The file of in_fhand is mapped as unpaired reads with bowtie2 using the
    index found in index_fpath, the mapping is written as a sorted bam into
    a temporary file (tempdir is handed to that step) and every aligned read
    found in that file is converted with alignedread_to_seqitem.

    directory is accepted but it is not used by this function.
    '''
    # NOTE(review): an older comment here said that the mapper had been
    # changed to bwa mem and that the test did not work well, but the code
    # calls map_with_bowtie2 -- confirm which mapper is the intended one
    in_fpath = in_fhand.name

    # The file is reopened just to guess the format, so the new fhand is
    # closed as soon as the guess is done
    with open(in_fpath) as format_fhand:
        file_format = get_format(format_fhand)

    extra_params = ['--very-fast']
    if 'fasta' in file_format:
        extra_params.append('-f')

    bowtie2_process = map_with_bowtie2(index_fpath, paired_fpaths=None,
                                       unpaired_fpath=in_fpath,
                                       extra_params=extra_params)
    out_fhand = NamedTemporaryFile()
    map_process_to_sortedbam(bowtie2_process, out_fhand.name,
                             tempdir=tempdir)

    samfile = pysam.Samfile(out_fhand.name)
    for aligned_read in samfile:
        yield alignedread_to_seqitem(aligned_read)
def _sorted_mapped_reads(ref_fpath, paired_fpaths=None, unpaired_fpaths=None,
                         directory=None, file_format=None, min_seed_len=None,
                         tempdir=None):
    '''It maps the reads with bwa mem and returns them sorted by query name.

    ref_fpath is the reference, directory is handed to the bwa index lookup.
    The reads come from paired_fpaths or, if those are not given, from
    unpaired_fpaths. If file_format is None it is guessed from the first
    read file. If min_seed_len is given it is passed to bwa with -k.
    tempdir is the directory in which the temporary bam is created, None
    means the default temporary directory.

    It returns a pysam.Samfile opened on the sorted temporary bam.
    '''
    first_fpath = paired_fpaths[0] if paired_fpaths else unpaired_fpaths[0]
    # The fhand is only needed to set or guess the format, so it is closed
    # right after that
    with open(first_fpath) as fhand:
        if file_format is not None:
            set_format(fhand, file_format)
        else:
            file_format = get_format(fhand)

    index_fpath = get_or_create_bwa_index(ref_fpath, directory)
    extra_params = ['-a', '-M']
    if min_seed_len is not None:
        extra_params.extend(['-k', min_seed_len])
    bwa = map_with_bwamem(index_fpath, paired_fpaths=paired_fpaths,
                          unpaired_fpath=unpaired_fpaths,
                          extra_params=extra_params)

    # The directory used to be hard-coded to the home of one developer, now
    # it is configurable and defaults to the system temporary directory
    bam_fhand = NamedTemporaryFile(dir=tempdir)
    sort_mapped_reads(bwa, bam_fhand.name, key='queryname')
    bamfile = pysam.Samfile(bam_fhand.name)
    return bamfile
def _read_seqitems(fhands):
    '''It chains the seq items (tuples of name and chunk) of all the fhands.

    Only the fasta and the fastq formats without multiline in their name are
    supported, for any other format a NotImplementedError is raised.
    '''
    seq_iters = []
    for fhand in fhands:
        file_format = get_format(fhand)
        is_fasta = file_format == 'fasta'
        is_one_line_fastq = ('fastq' in file_format and
                             'multiline' not in file_format)

        if not is_fasta and not is_one_line_fastq:
            msg = 'Format not supported by the itemizers: ' + file_format
            raise NotImplementedError(msg)

        if is_fasta:
            seq_iter = _itemize_fasta(fhand)
        else:
            try:
                seq_iter = _itemize_fastq(fhand)
            except ValueError as error:
                # Only the quality related errors are translated, any other
                # ValueError is raised again untouched
                if not error_quality_disagree(error):
                    raise
                raise MalformedFile(str(error))

        seq_iters.append(assing_kind_to_seqs(SEQITEM, seq_iter, file_format))
    return chain.from_iterable(seq_iters)
def _read_seqrecords(fhands):
    '''It chains the seqrecords found in all the fhands.

    The formats listed in the table are read with their own iterator class,
    called with title2ids, any other format is handed to parse_into_seqrecs.
    '''
    iterator_for_format = {
        'fasta': FastaIterator,
        'qual': QualPhredIterator,
        'fastq': FastqPhredIterator,
        'fastq-sanger': FastqPhredIterator,
        'fastq-solexa': FastqSolexaIterator,
        'fastq-illumina': FastqIlluminaIterator,
    }
    seq_iters = []
    for fhand in fhands:
        fmt = get_format(fhand)
        iterator_class = iterator_for_format.get(fmt)
        if iterator_class is None:
            # There is no dedicated iterator, the generic parser is used
            seq_iter = parse_into_seqrecs(fhand, fmt)
        else:
            seq_iter = iterator_class(fhand, title2ids=title2ids)
        seq_iters.append(seq_iter)
    return chain.from_iterable(seq_iters)
def filter_chimeras(ref_fpath, out_fhand, chimeras_fhand, in_fhands, unknown_fhand, unpaired=False, paired_result=True, settings=get_setting('CHIMERAS_SETTINGS'), min_seed_len=None, directory=None): file_format = get_format(in_fhands[0]) if unpaired: unpaired_fpaths = [fhand.name for fhand in in_fhands] paired_fpaths = None else: f_fhand = NamedTemporaryFile() r_fhand = NamedTemporaryFile() seqs = read_seqs(in_fhands) deinterleave_pairs(seqs, f_fhand, r_fhand, file_format) paired_fpaths = [f_fhand.name, r_fhand.name] unpaired_fpaths = None bamfile = _sorted_mapped_reads(ref_fpath, paired_fpaths, unpaired_fpaths, directory, file_format, min_seed_len) total = 0 chimeric = 0 unknown = 0 for pair, kind in classify_mapped_reads(bamfile, settings=settings, paired_result=paired_result, file_format=file_format): if kind is NON_CHIMERIC: write_seqs(pair, out_fhand) elif kind is CHIMERA and chimeras_fhand is not None: write_seqs(pair, chimeras_fhand) chimeric += 1 elif kind is UNKNOWN and unknown_fhand is not None: write_seqs(pair, unknown_fhand) unknown += 1 total += 1 mapped = total - chimeric - unknown print 'Total pairs analyzed: ', total print 'Chimeric pairs filtered: ', chimeric, '\t', chimeric / float(total) print 'Unknown pairs found: ', unknown, '\t', unknown / float(total) print 'Non-chimeric pairs: ', mapped, '\t', mapped / float(total)
def seqio(in_fhands, out_fhand, out_format, copy_if_same_format=True):
    '''It converts sequence files between formats.

    If there is just one input file and it is already in the output format
    nothing is converted: the file is copied or, if copy_if_same_format is
    False, symlinked. Otherwise the inputs are read as seqrecords and
    written in out_format.

    It raises an IncompatibleFormatError if the output format is not
    supported or if the output requires qualities that are not available,
    and a MalformedFile if error_quality_disagree recognizes the writing
    error.
    '''
    supported_formats = get_setting('SUPPORTED_OUTPUT_FORMATS')
    if out_format not in supported_formats:
        raise IncompatibleFormatError("This output format is not supported")

    in_formats = []
    for in_fhand in in_fhands:
        in_formats.append(get_format(in_fhand))

    nothing_to_convert = len(in_fhands) == 1 and in_formats[0] == out_format
    if nothing_to_convert:
        if copy_if_same_format:
            copyfileobj(in_fhands[0], out_fhand)
        else:
            rel_symlink(in_fhands[0].name, out_fhand.name)
        return

    seqs = _read_seqrecords(in_fhands)
    try:
        write_seqrecs(seqs, out_fhand, out_format)
    except ValueError as error:
        if error_quality_disagree(error):
            raise MalformedFile(str(error))
        if 'No suitable quality scores' in str(error):
            raise IncompatibleFormatError(
                                 'No qualities available to write output file')
        # Any other ValueError is not ours to translate
        raise
def read_seqs(fhands, out_format=None, prefered_seq_classes=None):
    '''It returns a stream of seqs in different codings: seqrecords, seqitems...

    fhands are the input files, the input format is taken from the first
    one. out_format is the format in which the seqs are going to be written,
    None or GUESS_FORMAT if it does not matter. prefered_seq_classes is the
    list of seq classes to try, in order of preference, by default SEQITEM
    and then SEQRECORD. The given list is not modified.

    It returns an empty list if the first file is empty.

    It raises a ValueError if no seq class is left to try or if an unknown
    seq class is requested, and a RuntimeError if none of the classes
    tried is implemented for the input.
    '''
    if prefered_seq_classes:
        # We work with a copy to leave the list of the caller untouched
        seq_classes = list(prefered_seq_classes)
    else:
        seq_classes = [SEQITEM, SEQRECORD]

    try:
        in_format = get_format(fhands[0])
    except FileIsEmptyError:
        return []

    # seqitems is incompatible with different input and output formats
    # or when in_format != a fasta or fastq
    formats_differ = (out_format not in (None, GUESS_FORMAT) and
                      in_format != out_format)
    itemizable_formats = (('fasta',) + SANGER_FASTQ_FORMATS +
                          ILLUMINA_FASTQ_FORMATS)
    if formats_differ or in_format not in itemizable_formats:
        # Filtering does not fail when SEQITEM was not requested at all
        seq_classes = [seq_class for seq_class in seq_classes
                       if seq_class != SEQITEM]

    if not seq_classes:
        msg = 'No valid seq class left or prefered'
        raise ValueError(msg)

    for seq_class in seq_classes:
        if seq_class == SEQITEM:
            try:
                return _read_seqitems(fhands)
            except NotImplementedError:
                continue
        elif seq_class == SEQRECORD:
            try:
                seqs = _read_seqrecords(fhands)
                return assing_kind_to_seqs(SEQRECORD, seqs, None)
            except NotImplementedError:
                continue
        else:
            raise ValueError('Unknown class for seq: ' + str(seq_class))
    raise RuntimeError('We should not be here, fixme')
try: out_f = compress_fhand(out_f, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) new_out_fhands.append(out_f) out_fhand = new_out_fhands else: try: out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) # The default output format is the same as the first file if 'fastq' in in_format or in_format == GUESS_FORMAT: out_format = get_format(wrapped_fhands[0]) else: out_format = in_format # The original fhands should be stored, because otherwise they would be # closed args = { 'out_fhand': out_fhand, 'in_fhands': wrapped_fhands, 'out_format': out_format, 'original_in_fhands': in_fhands } return args, parsed_args def parse_basic_parallel_args(parser):
def test_with_long_desc(self):
    'get_format should not fail with a very long description line'
    long_desc_seq = '''>comp27222_c1_seq1 len=4926 path=[89166356:0-46 89167522:47-85 89315292:86-121 89170132:122-176 89377211:177-217 89377235:218-244 89172846:245-247 89172856:248-251 89173028:252-276 89174386:277-292 89174684:293-506 89377352:507-582 89183669:583-587 89183821:588-613 89184868:614-644 89185624:645-719 89187914:720-723 89187935:724-870 89191280:871-887 89377494:888-907 89191517:908-927 89193046:928-1071 89198507:1072-1109 89199632:1110-1170 89201544:1171-1194 89202607:1195-1247 89377606:1248-1252 89377611:1253-1591 89215759:1592-1606 89215815:1607-1636 89216359:1637-1664 89377693:1665-1678 88727916:1679-2152 88743802:2153-2171 88744738:2172-2623 88759485:2624-2648 88759762:2649-2953 88769199:2954-2971 88769596:2972-3657 88791809:3658-3665 88792014:3666-3723 88793720:3724-3731 88794381:3732-3812 88799277:3813-3813 88799328:3814-3996 88807093:3997-3999 88807177:4000-4215 88813164:4216-4246 88814188:4247-4287 88815355:4288-4308 88816198:4309-4352 88817845:4353-4369 88818294:4370-4403 88818879:4404-4465 88821150:4466-4469 88821188:4470-4925] GAAGGATCGATCGGCCTCGGCGGTGTTCCCAAAAATCTAAGAGCGTTTACTCCAAGCTTC'''
    # Nothing is asserted, the test passes if no exception is raised
    long_desc_fhand = StringIO(long_desc_seq)
    get_format(long_desc_fhand)
def test_with_long_desc(self):
    'get_format should not fail with a very long description line'
    long_desc_seq = '''>comp27222_c1_seq1 len=4926 path=[89166356:0-46 89167522:47-85 89315292:86-121 89170132:122-176 89377211:177-217 89377235:218-244 89172846:245-247 89172856:248-251 89173028:252-276 89174386:277-292 89174684:293-506 89377352:507-582 89183669:583-587 89183821:588-613 89184868:614-644 89185624:645-719 89187914:720-723 89187935:724-870 89191280:871-887 89377494:888-907 89191517:908-927 89193046:928-1071 89198507:1072-1109 89199632:1110-1170 89201544:1171-1194 89202607:1195-1247 89377606:1248-1252 89377611:1253-1591 89215759:1592-1606 89215815:1607-1636 89216359:1637-1664 89377693:1665-1678 88727916:1679-2152 88743802:2153-2171 88744738:2172-2623 88759485:2624-2648 88759762:2649-2953 88769199:2954-2971 88769596:2972-3657 88791809:3658-3665 88792014:3666-3723 88793720:3724-3731 88794381:3732-3812 88799277:3813-3813 88799328:3814-3996 88807093:3997-3999 88807177:4000-4215 88813164:4216-4246 88814188:4247-4287 88815355:4288-4308 88816198:4309-4352 88817845:4353-4369 88818294:4370-4403 88818879:4404-4465 88821150:4466-4469 88821188:4470-4925] GAAGGATCGATCGGCCTCGGCGGTGTTCCCAAAAATCTAAGAGCGTTTACTCCAAGCTTC'''
    # Nothing is asserted, the test passes if no exception is raised
    long_desc_fhand = StringIO(long_desc_seq)
    get_format(long_desc_fhand)
try: out_f = compress_fhand(out_f, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) new_out_fhands.append(out_f) out_fhand = new_out_fhands else: try: out_fhand = compress_fhand(out_fhand, compression_kind=comp_kind) except RuntimeError, error: parser.error(error) # The default output format is the same as the first file if 'fastq' in in_format or in_format == GUESS_FORMAT: out_format = get_format(wrapped_fhands[0]) else: out_format = in_format # The original fhands should be stored, because otherwise they would be # closed args = {'out_fhand': out_fhand, 'in_fhands': wrapped_fhands, 'out_format': out_format, 'original_in_fhands': in_fhands} return args, parsed_args def parse_basic_parallel_args(parser): 'It parses the command line and it returns a dict with the arguments.' args, parsed_args = parse_basic_args(parser) args['processes'] = parsed_args.processes return args, parsed_args