def write_merged_to_stdout():
    # We assume for now all records match. Generally, need to verify.
    log('About to merge ', in_fname1, in_fname2)
    with gene_lib.open_compressed(in_fname1, 'rt') as in_f1_handle:
        records1 = SeqIO.parse(in_f1_handle, format="fastq")
        with gene_lib.open_compressed(in_fname2, 'rt') as in_f2_handle:
            records2 = SeqIO.parse(in_f2_handle, format="fastq")
            merged = merged_paired_ends(records1, records2)
            # write while both handles are still open: SeqIO.parse and
            # merged_paired_ends are lazy generators
            Bio.SeqIO.write(merged, handle=sys.stdout, format='fastq')
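# Note: write_merged_to_stdout() reads its input paths from module-level globals
# (in_fname1, in_fname2) that are not bound anywhere in this excerpt. A plausible
# wiring, shown only as an assumption and not as the original script's code, is
#
#     if __name__ == '__main__':
#         [in_fname1, in_fname2] = sys.argv[1:]
#         write_merged_to_stdout()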
def fastq_zst_records(filename):
    # https://github.com/indygreg/python-zstandard - pip3 install zstandard
    import zstandard as zstd
    log(f"Reading {filename}...")
    with open(filename, 'rb') as fastq_zst_handle:
        fastq_handle = zstd.ZstdDecompressor().stream_reader(fastq_zst_handle)
        # TextIOWrapper adds support for .readline(), for line in ...
        fastq_text = io.TextIOWrapper(fastq_handle, encoding='ascii')
        for record in SeqIO.parse(fastq_text, "fastq"):
            yield record
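# Usage sketch (hypothetical helper, not part of the original scripts): because
# fastq_zst_records() is a generator, a .fastq.zst file can be scanned lazily,
# e.g. to count reads without holding them all in memory.
def count_zst_reads(filename):
    # Count FASTQ records in a zstd-compressed file by exhausting the generator.
    return sum(1 for _ in fastq_zst_records(filename))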
def write_fastq_to_stdout():
    log('About to transform ', in_fname)
    with gene_lib.open_compressed(in_fname, 'rt') as in_f1_handle:
        records = SeqIO.parse(in_f1_handle, format="fasta")
        for rec in records:
            string = str(rec.seq)
            # FASTA has no qualities; assign a constant phred quality of 30
            res_seq = SeqRecord(Seq(string), id=rec.id, name=rec.name,
                                description=rec.description,
                                letter_annotations={"phred_quality": [30 for i in range(len(string))]})
            Bio.SeqIO.write(res_seq, handle=sys.stdout, format='fastq')
def merged_paired_ends(records1, records2):
    tot_good = 0
    tot_great = 0
    tot = 0
    # log('in merged_paired_ends', records1, records2)
    for (rec1, rec2) in zip(records1, records2):
        tot += 1
        str1 = str(rec1.seq)
        str2 = str(rec2.seq.reverse_complement())
        # log('matching ', str1, '\n', str2)
        end1 = str1[-common_req:]
        re = tre.compile(end1, tre.EXTENDED)  # we expect small errors here
        res_seq = None
        match = re.search(str2, tre.Fuzzyness(maxerr=init_err))
        if match:
            tot_good += 1
            match_loc = match.groups()[0][0]
            to_search_len = match_loc + common_req
            fuzzyness = max(tot_err, ceil(0.1 * to_search_len))
            re = tre.compile(str1[-to_search_len:], tre.EXTENDED)
            match_tot = re.search(str2, tre.Fuzzyness(maxerr=fuzzyness))
            # log('step1: matched ', end1, ' at ', match_loc,
            #     ' testing prefix ', str2[:to_search_len], ' cost ', match.cost)
            if match_tot:
                # if (tot_good % 100 == 0):
                #     log('fuzzyness = ', fuzzyness)
                #     log('step2: matched ', str1[-to_search_len:], ' at ',
                #         match_tot.groups()[0][0], ' cost ', match.cost)
                tot_great += 1
                # An arbitrary decision: take the common string from r2
                res_str = str1[:-to_search_len] + str2
                # TODO: preserve qualities
                res_seq = SeqRecord(Seq(res_str), id=rec1.id, name=rec1.name,
                                    description=rec1.description,
                                    letter_annotations={"phred_quality": [30 for i in range(len(res_str))]})
                if (tot_great % step == 0):
                    log('nicely matched ', str1, '\n', str2, to_search_len,
                        match_tot.group(0), match.group(0), match_tot.cost, match.cost)
                    # log('result = ', str(res_seq.seq))
                yield res_seq
                continue
        res_str = str1 + ('N' * padding) + str2
        res_seq = SeqRecord(Seq(res_str), id=rec1.id, name=rec1.name,
                            description=rec1.description,
                            letter_annotations={"phred_quality": [30 for i in range(len(res_str))]})
        if (tot % step == 0):
            log(tot, tot_good, tot_great)
            # log('matched ', str1, '\n', str2, len(str1), len(str2))
            # log('result = ', str(res_seq.seq))
        yield res_seq
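# Minimal smoke-test sketch (not from the original code base) for merged_paired_ends().
# The function reads several module-level tuning globals that are not defined in this
# excerpt: common_req (length of the read-1 suffix searched for in read 2), init_err and
# tot_err (error budgets for the short and the full overlap match), padding (number of
# 'N's inserted between unmerged mates) and step (logging interval). The values assigned
# below are illustrative assumptions only, not the original settings; the sequences are
# synthetic.
def _demo_merged_paired_ends():
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    global common_req, init_err, tot_err, padding, step
    common_req, init_err, tot_err, padding, step = 8, 1, 2, 10, 1000  # assumed values
    fragment = "AAAACCCCGGGGTTTTACGTACGTGGGGCCCC"  # synthetic 32 bp insert
    read1 = SeqRecord(Seq(fragment[:20]), id="demo", name="demo", description="")
    # read 2 comes from the opposite strand, i.e. the reverse complement of the
    # fragment's last 20 bases; the reads overlap by 8 bases
    read2 = SeqRecord(Seq(fragment[-20:]).reverse_complement(),
                      id="demo", name="demo", description="")
    for merged in merged_paired_ends([read1], [read2]):
        log(str(merged.seq) == fragment)  # expected: True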
    re = tre.compile(sine_string[:sine_header], tre.EXTENDED)
    fuzziness = tre.Fuzzyness(maxerr=maxerr)
    for rec in records:
        if reverse_complement:
            cur_seq = rec.seq.reverse_complement()
        else:
            cur_seq = rec.seq
        match = re.search(str(cur_seq), fuzziness)
        if match:
            # log(rec.seq)
            # sine_location = match.groups()  # returns a tuple of tuples, e.g. ((2, 78),)
            SeqIO.write(rec, sys.stdout, 'fastq')  # Writes to stdout, uncompressed


[sine_fname, header_len, max_error, reverse_complement, merged_input_fname] = sys.argv[1:]
if reverse_complement not in {"forward", "rc"}:
    raise ValueError('reverse_complement arg must be "forward" or "rc"')
log(f"About to screen {merged_input_fname} ({reverse_complement}) for {sine_fname} "
    f"first {header_len} up to {max_error} err")
sine = gene_lib.get_sine_forward(sine_fname)  # "B1.fasta"
filter_potential_sines(in_fname=merged_input_fname, sine_string=sine,
                       sine_header=int(header_len), maxerr=int(max_error),
                       reverse_complement=(reverse_complement == "rc"))
# from Bio.SeqRecord import SeqRecord
import gene_lib
from gene_lib import log
from gene_lib import get_sine_forward


def filter_potential_sines(in_fname, sine_string, sine_header=67, maxerr=19):
    """
    Finds reads that approximately match a prefix of the SINE: the first sine_header
    bases of sine_string, allowing up to maxerr edits.
    To be used for preliminary screening (input for later steps).
    """
    with gene_lib.open_any(in_fname, 'rt') as in_file_handle:
        records = SeqIO.parse(in_file_handle, format="fastq")
        re = tre.compile(sine_string[:sine_header], tre.EXTENDED)
        fuzziness = tre.Fuzzyness(maxerr=maxerr)
        for rec in records:
            match = re.search(str(rec.seq), fuzziness)
            if match:
                # log(rec.seq)
                # sine_location = match.groups()  # returns a tuple of tuples, e.g. ((2, 78),)
                SeqIO.write(rec, sys.stdout, 'fastq')  # Writes to stdout, uncompressed


[sine_fname, merged_input_fname] = sys.argv[1:]
log('About to screen', merged_input_fname, 'for', sine_fname)
sine = gene_lib.get_sine_forward(sine_fname)  # "B1.fasta"
filter_potential_sines(merged_input_fname, sine)
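# Small illustration (made-up sequence and error budget, assuming `tre` is imported as
# in the scripts above) of the approximate-match objects these scripts rely on:
# match.groups() returns a tuple of (start, end) spans, match.group(0) the matched text,
# and match.cost the number of edits needed. This mirrors how filter_potential_sines()
# and merged_paired_ends() interrogate their matches.
def _demo_tre_match():
    pattern = tre.compile("ACGTACGT", tre.EXTENDED)
    match = pattern.search("TTACGTACCTGG", tre.Fuzzyness(maxerr=2))
    if match:
        start, end = match.groups()[0]  # span of the whole approximate match
        log('matched', match.group(0), 'at', (start, end), 'with cost', match.cost)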
        itertools.islice(shared_input_iterator, chunk_size - 1))


def split_recompress(in_fname, out_basename, skip=[]):
    """
    skip - indexes of parts to skip (still need to decompress but not compress)
    """
    with gzip.open(in_fname, 'rb') as in_fastq:
        for i, chunk_iter in enumerate(chunks(in_fastq, chunk_size=100_000_000)):
            out_fname = f'{out_basename}.part{i}e8.fastq.gz'
            t0 = time.time()
            if i in skip:
                log('Skipping', out_fname)
                max(chunk_iter)  # consume iterator, discarding values
            else:
                log('Writing', out_fname)
                with gzip.open(out_fname + '.tmp', 'wb', compresslevel=2) as out_fastq_gz:
                    # with open(out_fname, 'wb') as out_fastq_gz:
                    out_fastq_gz.writelines(chunk_iter)
                os.rename(out_fname + '.tmp', out_fname)
            t1 = time.time()
            log(t1 - t0, 'sec.')


# split_recompress('Old-lung/old_lung_R1_001.fastq.gz', 'Old-lung/old_lung_R1_001')

if __name__ == '__main__':
    [in_fname, out_basename, *skip] = sys.argv[1:]
    skip = [int(i) for i in skip]
    split_recompress(in_fname, out_basename, skip)
    log('REMOVING', in_fname)
    os.remove(in_fname)  # may fail for /dev/stdin, /dev/fd/... etc. but that's OK
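# The head of the chunks() helper is cut off in this excerpt; only its last line (the
# itertools.islice(...) call) survives above. A typical implementation consistent with
# that tail and with the call chunks(in_fastq, chunk_size=100_000_000) is sketched
# below as an assumption; it is a reconstruction, not the original code.
def chunks_sketch(lines, chunk_size):
    # Yield successive iterators of up to chunk_size items, all drawing from one
    # shared underlying iterator so nothing is read twice.
    import itertools
    shared_input_iterator = iter(lines)
    for first_line in shared_input_iterator:
        yield itertools.chain(
            [first_line],
            itertools.islice(shared_input_iterator, chunk_size - 1))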