def match_pairs(reads, out_fhand, orphan_out_fhand, out_format, ordered=True,
                check_order_buffer_size=0, max_reads_memory=None,
                temp_dir=None):
    '''It writes the paired reads to one file and the orphans to another.

    The groups yielded by _get_paired_and_orphan are dispatched by size:
    groups of two reads go to out_fhand, groups of one read go to
    orphan_out_fhand. Groups of any other size are ignored.

    When ordered is true the names of the first check_order_buffer_size
    orphans are remembered in a KeyedSet. An orphan whose name shows up
    again raises ItemsNotSortedError.

    reads -- the reads, handed as they are to _get_paired_and_orphan
    out_fhand -- file for the matched pairs
    orphan_out_fhand -- file for the orphan reads
    out_format -- format handed to write_seqs
    ordered, max_reads_memory, temp_dir -- handed to _get_paired_and_orphan
    check_order_buffer_size -- how many orphan names are remembered
    '''
    not_sorted_msg = 'Reads are not ordered by pairs.Use unordered option'
    seen_orphan_names = KeyedSet()
    n_remembered = 0
    groups = _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                    temp_dir)
    for group in groups:
        group_size = len(group)
        if group_size == 2:
            write_seqs(group, out_fhand, out_format)
            continue
        if group_size != 1:
            continue

        write_seqs(group, orphan_out_fhand, out_format)
        orphan = group[0]
        try:
            name = _parse_pair_direction_and_name(orphan)[0]
        except PairDirectionError:
            # No pair direction in this read, the plain name is used
            name = get_name(orphan)

        if not ordered:
            continue

        if n_remembered < check_order_buffer_size:
            n_remembered += 1
            # presumably check_add returns a false value when the name was
            # already stored -- confirm against KeyedSet
            if not seen_orphan_names.check_add(name):
                raise ItemsNotSortedError(not_sorted_msg)
        elif n_remembered >= check_order_buffer_size:
            # The buffer is full: names are only looked up, not added
            if name in seen_orphan_names:
                raise ItemsNotSortedError(not_sorted_msg)

    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
def _test_filter_duplicates(paired_reads, n_seqs_packet):
    '''It checks that filter_duplicates drops the duplicated reads.

    A fastq built from FASTQ_NO_DUPS1 + FASTQ_DUPS + FASTQ_NO_DUPS2 +
    FASTQ_NO_DUPS3 is filtered and every pair read from the duplicate-free
    fastq must be found exactly once in the filtered output.

    paired_reads -- handed to filter_duplicates and to _read_pairs
    n_seqs_packet -- an int or None, handed to filter_duplicates
    '''
    assert isinstance(n_seqs_packet, int) or n_seqs_packet is None

    fastq_with_dups = (FASTQ_NO_DUPS1 + FASTQ_DUPS + FASTQ_NO_DUPS2 +
                       FASTQ_NO_DUPS3)

    # The temporary files are kept referenced (and open) until their content
    # has been read: a NamedTemporaryFile is removed as soon as it is closed.
    with NamedTemporaryFile() as tmp_in_fhand, \
            NamedTemporaryFile() as out_fhand:
        tmp_in_fhand.write(fastq_with_dups)
        tmp_in_fhand.flush()

        with open(tmp_in_fhand.name) as in_fhand:
            filter_duplicates([in_fhand], out_fhand, paired_reads,
                              n_seqs_packet)
        flush_fhand(out_fhand)

        with open(out_fhand.name) as result_fhand:
            filtered_pairs = list(_read_pairs([result_fhand], paired_reads))

    fastq_no_dups = FASTQ_NO_DUPS1 + FASTQ_NO_DUPS2 + FASTQ_NO_DUPS3
    expected_pairs = list(_read_pairs([StringIO(fastq_no_dups)],
                                      paired_reads))

    # NOTE(review): only the presence of every expected pair is checked, the
    # number of filtered pairs is not compared with the expected one.
    for expected_pair in expected_pairs:
        n_matches = sum(1 for filtered_pair in filtered_pairs
                        if _seqitem_pairs_equal(expected_pair, filtered_pair))
        assert n_matches == 1
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    Reads for which _parse_pair_direction_and_name raises PairDirectionError
    are written straight to the orphan file. Every other read is looked up,
    by name, in the buffer of the opposite direction:
      - no mate buffered: the read is stored in the buffer of its direction.
      - mate buffered: the pair is written to out_fhand. The reads buffered
        before the mate and all the buffered reads with the direction of
        the current read are written as orphans.
    Once the input is exhausted everything still buffered is an orphan.

    seqs -- iterable with the reads
    out_fhand -- file for the matched pairs
    orphan_out_fhand -- file for the orphan reads
    out_format -- format handed to write_seqs
    memory_limit -- max number of buffered reads, None disables the check

    It raises MaxNumReadsInMem when the buffered reads reach memory_limit.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    # mates_buf -> buffer with the reads of the opposite direction
    # same_buf -> buffer with the reads of the direction of the current read
    # These defaults are only used when no read could be parsed at all.
    mates_buf, same_buf = buf_fwd, buf_rev

    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        if direction == FWD:
            mates_buf, same_buf = buf_rev, buf_fwd
        else:
            mates_buf, same_buf = buf_fwd, buf_rev

        matching_seq_index = mates_buf['index'].get(seq_name)

        if matching_seq_index is None:
            # No mate yet, the read waits in the buffer of its direction
            same_buf['items'].append(seq)
            same_buf['index'][seq_name] = len(same_buf['items']) - 1

            n_buffered = len(mates_buf['items']) + len(same_buf['items'])
            if memory_limit is not None and n_buffered >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit'
                raise MaxNumReadsInMem(error_msg)
            continue

        # Everything buffered before the mate can not be paired any more
        orphan_seqs = mates_buf['items'][:matching_seq_index]
        matching_seq = mates_buf['items'][matching_seq_index]
        write_seqs(orphan_seqs, orphan_out_fhand, out_format)
        write_seqs([matching_seq, seq], out_fhand, out_format)

        # Drop the consumed reads. The index has to stay keyed by read name
        # (it is looked up with seq_name), so the surviving entries are
        # shifted instead of being rebuilt from the seq objects.
        n_consumed = matching_seq_index + 1
        mates_buf['items'] = mates_buf['items'][n_consumed:]
        mates_buf['index'] = {name: index - n_consumed
                              for name, index in mates_buf['index'].items()
                              if index >= n_consumed}

        # The buffered reads with the direction of the current read are
        # orphans too
        write_seqs(same_buf['items'], orphan_out_fhand, out_format)
        same_buf['items'] = []
        same_buf['index'] = {}

    orphan_seqs = mates_buf['items'] + same_buf['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    # NOTE(review): the orphan file is flushed directly while out_fhand goes
    # through flush_fhand -- confirm that the difference is intended.
    orphan_out_fhand.flush()
    flush_fhand(out_fhand)
def _test_filter_duplicates(paired_reads, n_seqs_packet):
    '''It checks filter_duplicates with and without the use_length option.

    First a fastq built from FASTQ_NO_DUPS1 + FASTQ_DUPS + FASTQ_NO_DUPS2 +
    FASTQ_NO_DUPS3 is filtered and every pair read from the duplicate-free
    fastq must be found exactly once in the filtered output.

    Then FASTQ_DUPS alone is filtered as unpaired reads with use_length=10
    and use_length=1, expecting 2 and 1 reads in the output respectively.

    paired_reads -- handed to filter_duplicates and to _read_pairs
    n_seqs_packet -- an int or None, handed to filter_duplicates
    '''
    assert isinstance(n_seqs_packet, int) or n_seqs_packet is None

    fastq_with_dups = (FASTQ_NO_DUPS1 + FASTQ_DUPS + FASTQ_NO_DUPS2 +
                       FASTQ_NO_DUPS3)

    # The temporary files are kept referenced (and open) until their content
    # has been read: a NamedTemporaryFile is removed as soon as it is closed.
    with NamedTemporaryFile() as tmp_in_fhand, \
            NamedTemporaryFile() as out_fhand:
        tmp_in_fhand.write(fastq_with_dups)
        tmp_in_fhand.flush()

        with open(tmp_in_fhand.name) as in_fhand:
            filter_duplicates([in_fhand], out_fhand, paired_reads,
                              n_seqs_packet)
        flush_fhand(out_fhand)

        with open(out_fhand.name) as result_fhand:
            filtered_pairs = list(_read_pairs([result_fhand], paired_reads))

    fastq_no_dups = FASTQ_NO_DUPS1 + FASTQ_NO_DUPS2 + FASTQ_NO_DUPS3
    expected_pairs = list(_read_pairs([StringIO(fastq_no_dups)],
                                      paired_reads))

    # NOTE(review): only the presence of every expected pair is checked, the
    # number of filtered pairs is not compared with the expected one.
    for expected_pair in expected_pairs:
        n_matches = sum(1 for filtered_pair in filtered_pairs
                        if _seqitem_pairs_equal(expected_pair, filtered_pair))
        assert n_matches == 1

    # use length
    # presumably use_length limits how much of every read is compared when
    # looking for duplicates -- confirm against filter_duplicates
    for use_length, expected_n_reads in ((10, 2), (1, 1)):
        with NamedTemporaryFile() as tmp_in_fhand, \
                NamedTemporaryFile() as out_fhand:
            tmp_in_fhand.write(FASTQ_DUPS)
            tmp_in_fhand.flush()

            with open(tmp_in_fhand.name) as in_fhand:
                filter_duplicates([in_fhand], out_fhand, paired_reads=False,
                                  n_seqs_packet=n_seqs_packet,
                                  use_length=use_length)
            flush_fhand(out_fhand)

            with open(out_fhand.name) as result_fhand:
                filtered_pairs = list(_read_pairs([result_fhand],
                                                  paired_reads=False))
        assert len(filtered_pairs) == expected_n_reads
if copy_if_same_format: copyfileobj(in_fhands[0], out_fhand) else: rel_symlink(in_fhands[0].name, out_fhand.name) else: seqs = _read_seqrecords(in_fhands) try: write_seqrecs(seqs, out_fhand, out_format) except ValueError, error: if error_quality_disagree(error): raise MalformedFile(str(error)) if 'No suitable quality scores' in str(error): msg = 'No qualities available to write output file' raise IncompatibleFormatError(msg) raise flush_fhand(out_fhand) def fastaqual_to_fasta(seq_fhand, qual_fhand, out_fhand): 'It converts a fasta and a qual file into a fastq format file' seqrecords = PairedFastaQualIterator(seq_fhand, qual_fhand) try: write_seqrecs(seqrecords, out_fhand.name, 'fastq') except ValueError, error: if error_quality_disagree(error): raise MalformedFile(str(error)) raise out_fhand.flush() def guess_seq_type(fhand):
def flush(self):
    'It flushes the wrapped stream by means of flush_fhand'
    stream = self.stream
    flush_fhand(stream)
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The input is expected to be sorted and interleaved. Reads for which
    _parse_pair_direction_and_name raises PairDirectionError are written
    straight to the orphan file. Every other read is looked up, by name, in
    the buffer of the opposite direction:
      - no mate buffered: the read is stored in the buffer of its direction.
      - mate buffered: the reads buffered before the mate are written as
        orphans and the pair is written to out_fhand. The mate has to be the
        last read of its buffer. Both buffers are emptied, the reads with
        the direction of the current read being written as orphans.
    Once the input is exhausted everything still buffered is an orphan.

    seqs -- iterable with the reads
    out_fhand -- file for the matched pairs
    orphan_out_fhand -- file for the orphan reads
    out_format -- format handed to write_seqs
    memory_limit -- max number of buffered reads, None disables the check

    It raises MaxNumReadsInMem when the buffered reads reach memory_limit
    and MalformedFile when a mate is found but it is not the last read of
    its buffer.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    # same_buf -> buffer for the reads with the same orientation as the
    #             current one
    # mates_buf -> buffer for the reads with the reverse orientation as the
    #              current one
    # These defaults are only used when no read could be parsed at all.
    same_buf, mates_buf = buf_rev, buf_fwd

    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        if direction == FWD:
            same_buf, mates_buf = buf_fwd, buf_rev
        else:
            same_buf, mates_buf = buf_rev, buf_fwd

        matching_seq_index = mates_buf['index'].get(seq_name)

        if matching_seq_index is None:
            # No mate yet, the read waits in the buffer of its direction
            same_buf['items'].append(seq)
            same_buf['index'][seq_name] = len(same_buf['items']) - 1

            n_buffered = len(mates_buf['items']) + len(same_buf['items'])
            if memory_limit is not None and n_buffered >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit.'
                error_msg += 'Are you sure that the reads are sorted and '
                error_msg += 'interleaved?. You could try with the unordered'
                error_msg += ' algorith'
                raise MaxNumReadsInMem(error_msg)
            continue

        # Everything buffered before the mate can not be paired any more
        orphan_seqs = mates_buf['items'][:matching_seq_index]
        matching_seq = mates_buf['items'][matching_seq_index]
        write_seqs(orphan_seqs, orphan_out_fhand, out_format)
        write_seqs([matching_seq, seq], out_fhand, out_format)

        # In a sorted and interleaved input nothing can be buffered after
        # the mate. The check is done after writing, as it has always been.
        if matching_seq_index != len(mates_buf['items']) - 1:
            msg = 'The given files are not sorted (ordered) and '
            msg += 'interleaved. You could try with the unordered algorithm'
            raise MalformedFile(msg)

        # The buffers are emptied in place, so buf_fwd and buf_rev keep
        # pointing to them and no rebinding is needed.
        mates_buf['items'] = []
        mates_buf['index'] = {}

        # The buffered reads with the direction of the current read are
        # orphans too
        write_seqs(same_buf['items'], orphan_out_fhand, out_format)
        same_buf['items'] = []
        same_buf['index'] = {}

    orphan_seqs = same_buf['items'] + mates_buf['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    # NOTE(review): the orphan file is flushed directly while out_fhand goes
    # through flush_fhand -- confirm that the difference is intended.
    orphan_out_fhand.flush()
    flush_fhand(out_fhand)
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    The input is expected to be sorted and interleaved. Reads for which
    _parse_pair_direction_and_name raises PairDirectionError are written
    straight to the orphan file. Every other read is looked up, by name, in
    the buffer of the opposite direction:
      - no mate buffered: the read is stored in the buffer of its direction.
      - mate buffered: the reads buffered before the mate are written as
        orphans and the pair is written to out_fhand. The mate has to be the
        last read of its buffer. Both buffers are emptied, the reads with
        the direction of the current read being written as orphans.
    Once the input is exhausted everything still buffered is an orphan.

    seqs -- iterable with the reads
    out_fhand -- file for the matched pairs
    orphan_out_fhand -- file for the orphan reads
    out_format -- format handed to write_seqs
    memory_limit -- max number of buffered reads, None disables the check

    It raises MaxNumReadsInMem when the buffered reads reach memory_limit
    and MalformedFile when a mate is found but it is not the last read of
    its buffer.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    # same_buf -> buffer for the reads with the same orientation as the
    #             current one
    # mates_buf -> buffer for the reads with the reverse orientation as the
    #              current one
    # These defaults are only used when no read could be parsed at all.
    same_buf, mates_buf = buf_rev, buf_fwd

    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        if direction == FWD:
            same_buf, mates_buf = buf_fwd, buf_rev
        else:
            same_buf, mates_buf = buf_rev, buf_fwd

        matching_seq_index = mates_buf['index'].get(seq_name)

        if matching_seq_index is None:
            # No mate yet, the read waits in the buffer of its direction
            same_buf['items'].append(seq)
            same_buf['index'][seq_name] = len(same_buf['items']) - 1

            n_buffered = len(mates_buf['items']) + len(same_buf['items'])
            if memory_limit is not None and n_buffered >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit.'
                error_msg += 'Are you sure that the reads are sorted and '
                error_msg += 'interleaved?. You could try with the unordered'
                error_msg += ' algorith'
                raise MaxNumReadsInMem(error_msg)
            continue

        # Everything buffered before the mate can not be paired any more
        orphan_seqs = mates_buf['items'][:matching_seq_index]
        matching_seq = mates_buf['items'][matching_seq_index]
        write_seqs(orphan_seqs, orphan_out_fhand, out_format)
        write_seqs([matching_seq, seq], out_fhand, out_format)

        # In a sorted and interleaved input nothing can be buffered after
        # the mate. The check is done after writing, as it has always been.
        if matching_seq_index != len(mates_buf['items']) - 1:
            msg = 'The given files are not sorted (ordered) and '
            msg += 'interleaved. You could try with the unordered algorithm'
            raise MalformedFile(msg)

        # The mate buffer has to be emptied in place: binding the local name
        # to a new dict would leave the already written reads inside
        # buf_fwd/buf_rev and they would be written again as orphans.
        mates_buf['items'] = []
        mates_buf['index'] = {}

        # The buffered reads with the direction of the current read are
        # orphans too
        write_seqs(same_buf['items'], orphan_out_fhand, out_format)
        same_buf['items'] = []
        same_buf['index'] = {}

    orphan_seqs = same_buf['items'] + mates_buf['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    # NOTE(review): the orphan file is flushed directly while out_fhand goes
    # through flush_fhand -- confirm that the difference is intended.
    orphan_out_fhand.flush()
    flush_fhand(out_fhand)