def dope_w_badread_removal(input1, input2, output_prefix, **kwargs):
    """Run the paired-end mapping step, falling back to bad-read removal.

    ``_dope`` is called on the two inputs and returns a
    ``(success, number)`` pair. When it reports failure,
    ``_bad_reads_removal`` is asked to inspect a window of 3e4 reads that
    starts 1e4 reads before ``number / 2`` (presumably the failing position
    counted over both mates -- confirm against ``_dope``).

    The ``<output_prefix>_short_{1,2}.fastq`` files are removed before
    returning.
    """
    mapped_ok, fail_pos = _dope(input1, input2, output_prefix, **kwargs)
    if not mapped_ok:
        window_start = (fail_pos / 2) - 1e4
        _bad_reads_removal(window_start, 3e4, (input1, input2),
                           output_prefix, kwargs)
    for suffix in ('_short_1.fastq', '_short_2.fastq'):
        _tryrm(output_prefix + suffix)
def _bad_reads_removal(startnum, scope, fs, outpref, kwargs):
    """Drop the reads that make mapping fail, then re-run the mapping.

    A window of ``scope`` reads beginning at read ``startnum`` is pulled
    out of the input file(s) and handed to ``_find_fn_reads`` together with
    ``range(int(scope))``; that call fills ``ex_reads`` with window-relative
    read positions. Every input file is then copied without those reads
    (four lines per FASTQ record are assumed), the copies replace the
    ``<outpref>_short*.fastq`` files, and the matching ``*_w_badread_removal``
    entry point is invoked on them.

    Args:
        startnum: index of the first read of the suspicious window. Callers
            pass "failure position - 1e4", which is negative when the
            failure is within the first 1e4 reads; it is clamped to 0 here.
        scope: number of reads in the window.
        fs: tuple with one (single-end) or two (paired-end) FASTQ paths.
        outpref: prefix used for every temporary and output file.
        kwargs: dict of keyword arguments forwarded to the mapping calls.
    """
    # A negative start makes `head | tail` return a window that begins at
    # read 0, while the filter below would still shift indices by the
    # negative value and therefore drop the wrong reads. Clamping keeps the
    # extracted window and the filter aligned.
    # NOTE(review): a window running past the end of the file is still
    # misaligned (tail then returns the *last* reads) -- not handled here.
    startnum = max(startnum, 0)
    paired = len(fs) == 2

    rdss = _extract_suspected_sequences(outpref, startnum, scope, fs)
    ex_reads = set()
    _find_fn_reads(rdss, outpref + '_tmp', range(int(scope)), kwargs,
                   ex_reads)

    def _copy_kept_reads(src, dst):
        # Line index // 4 is the read index; floor division keeps this
        # correct regardless of the module's division mode.
        with open(src) as fin, open(dst, 'wb') as fout:
            fout.writelines(
                line for idx, line in enumerate(fin)
                if (idx // 4) - startnum not in ex_reads)

    if paired:
        _copy_kept_reads(fs[0], outpref + '_tmp_1.fastq')
        _copy_kept_reads(fs[1], outpref + '_tmp_2.fastq')
    else:
        _copy_kept_reads(fs[0], outpref + '_tmp.fastq')
    log_.info('Removed %s reads' % len(ex_reads))

    # The inputs may themselves be the previous '_short' files, so these
    # are only removed once the filtered copies are fully written.
    _tryrm(outpref + '_short.fastq')
    _tryrm(outpref + '_short_1.fastq')
    _tryrm(outpref + '_short_2.fastq')

    if paired:
        os.rename(outpref + '_tmp_1.fastq', outpref + '_short_1.fastq')
        os.rename(outpref + '_tmp_2.fastq', outpref + '_short_2.fastq')
        dope_w_badread_removal(outpref + '_short_1.fastq',
                               outpref + '_short_2.fastq', outpref, **kwargs)
    else:
        os.rename(outpref + '_tmp.fastq', outpref + '_short.fastq')
        dose_w_badread_removal(outpref + '_short.fastq', outpref, **kwargs)
def _extract_suspected_sequences(outpref, startnum, scope, fs):
    """Return the reads of a window of the input FASTQ file(s).

    For each input file, ``head``/``tail`` are run through
    ``_shell_command`` to write the last ``scope * 4`` lines of the first
    ``(startnum + scope) * 4`` lines into a temporary file. The temporary
    files are parsed with ``SeqIO.parse(..., 'fastq')`` and then removed.

    Returns:
        ``(reads1, reads2)`` when ``fs`` holds two paths, otherwise
        ``(reads1, )``; each element is a list of parsed records.
    """
    paired = len(fs) == 2
    if paired:
        sources = (fs[0], fs[1])
        tmp_names = (outpref + '_tmp_1.fastq', outpref + '_tmp_2.fastq')
    else:
        sources = (fs[0], )
        tmp_names = (outpref + '_tmp.fastq', )

    head_count = int((startnum + scope) * 4)
    tail_count = int(scope * 4)

    # Cut the window out of every input first ...
    for src, dst in zip(sources, tmp_names):
        command = 'head -n {hn} {f} | tail -n {tn} > {of}'.format(
            hn=head_count, f=src, tn=tail_count, of=dst)
        _shell_command(command)

    # ... then load all of them ...
    parsed = []
    for tmp_name in tmp_names:
        with open(tmp_name) as handle:
            parsed.append(list(SeqIO.parse(handle, 'fastq')))

    # ... and finally discard the temporary files.
    for tmp_name in tmp_names:
        _tryrm(tmp_name)

    return tuple(parsed)
def _do_one_half(rdss, o_prefix, pos, ex_reads, kwargs):
    """Try to map one group of reads and record the failing ones.

    The reads are written out with ``_write_reads`` and mapped with
    ``_dope`` (two files written) or ``_dose`` (otherwise). On failure, a
    group holding a single read has ``pos[0]`` added to ``ex_reads``; a
    larger group is passed on to ``_find_fn_reads``. The written read files
    and ``<o_prefix>.map`` are removed afterwards.

    Returns:
        ``True`` for an empty group, otherwise the success flag reported by
        the mapping call.
    """
    if len(rdss[0]) == 0:
        return True

    names = _write_reads(rdss, o_prefix)
    two_files = len(names) == 2
    if two_files:
        result = _dope(names[0], names[1], o_prefix, **kwargs)
    else:
        result = _dose(names[0], o_prefix, **kwargs)
    ok, _ = result

    if not ok:
        if len(rdss[0]) != 1:
            _find_fn_reads(rdss, o_prefix, pos, kwargs, ex_reads)
        else:
            ex_reads.add(pos[0])
            log_.debug('Removed one read')

    _tryrm(names[0])
    if two_files:
        _tryrm(names[1])
    _tryrm(o_prefix + '.map')
    return ok
def _combine_map_files(seqdict, map_fs, outfile, fq_f1, fq_f2=None):
    """Merge several map files into one JSON-lines file, ordered by read.

    The FASTQ file(s) are walked record by record; for every read, the rows
    at the head of each map file whose name matches the read id are folded
    into a ``SourceRead``. Each non-empty ``SourceRead`` is sorted and
    written as one ``ujson`` line. Output goes to ``outfile + '.tmp'`` and
    is moved to ``outfile`` at the end, after which the input map files are
    removed with ``_tryrm``.

    Args:
        seqdict: passed through to ``SourceRead._add_from_cells``; if it is
            a plain function it is called first and its result is used.
        map_fs: list of map file paths (rows are split on ``TAB``).
        outfile: final output path.
        fq_f1: first FASTQ path (opened with ``_open_gz_indif``).
        fq_f2: optional second FASTQ path for paired input.
    """
    # Keep the original paths: map_fs is rebound to open handles below.
    mapfscopy = [f for f in map_fs]
    if type(seqdict) is types.FunctionType:
        seqdict = seqdict()
    out_f = outfile + '.tmp'
    try:
        fq1, fq2, ot = _open_gz_indif(fq_f1), (_open_gz_indif(fq_f2) if fq_f2 is not None else None), opengz(
            out_f, 'wb')
        map_fs = [open(f) for f in map_fs]
        map_fs_iters = [_stripread(f) for f in map_fs]
        # Look-ahead row for every map file (first row read eagerly).
        map_new_cells = [f.next().split(TAB) for f in map_fs_iters]
        # Without a second FASTQ file, r2 comes from _none_iter()
        # (presumably an endless stream of None -- confirm).
        for r1, r2 in izip(
                SeqIO.parse(fq1, 'fastq'), (SeqIO.parse(fq2, 'fastq')
                                            if fq2 is not None else _none_iter())):
            curid = r1.id.replace('/1', '')
            curr = SourceRead(curid, \
                              [str(r1.seq)] if r2 is None else [str(r1.seq), str(r2.seq)], \
                              [r1.letter_annotations['phred_quality']] if r2 is None else \
                              [r1.letter_annotations['phred_quality'], \
                               r2.letter_annotations['phred_quality']])
            # Walk the map files backwards so that deleting an exhausted
            # entry does not shift the indices still to be visited.
            for i in range(len(map_fs_iters))[::-1]:
                # Consume every row belonging to the current read. The third
                # argument is 0 when the row name equals r1.description,
                # 1 when it equals r2.description, and 2 otherwise.
                # NOTE(review): if r2 is None and the name differs from
                # r1.description, r2.description raises AttributeError --
                # confirm this cannot happen for single-end input.
                while len(
                        map_new_cells[i]) > 0 and map_new_cells[i][0].replace(
                            '/1', '').replace('/2', '').split()[0] == curid:
                    curr._add_from_cells(map_new_cells[i], \
                                         0 if r1.description == map_new_cells[i][0] \
                                         else 1 if r2.description == map_new_cells[i][0] else 2, \
                                         seqdict)
                    try:
                        map_new_cells[i] = map_fs_iters[i].next().split(TAB)
                    except StopIteration:
                        # This map file is exhausted: forget it and stop
                        # looking at index i, which is no longer valid.
                        del map_new_cells[i]
                        del map_fs_iters[i]
                        break
            if len(curr) > 0:
                curr.sort()
                ujson.dump(curr.to_ser(), ot)
                ot.write('\n')
    finally:
        # Best-effort cleanup: any name below may be unbound, None, or
        # (for map_fs) still a list of path strings if an earlier step
        # failed, hence the blanket excepts.
        for f in map_fs:
            try:
                f.close()
            except:
                pass
        try:
            fq1.close()
        except:
            pass
        try:
            fq2.close()
        except:
            pass
        try:
            ot.close()
        except:
            pass
    # NOTE(review): placed after the try/finally, so these run only when no
    # exception escaped above -- confirm against the original layout.
    shutil.move(out_f, outfile)
    for f in mapfscopy:
        _tryrm(f)
def dose_w_badread_removal(input1, output_prefix, **kwargs):
    """Run the single-end mapping step, falling back to bad-read removal.

    ``_dose`` is called on the input and returns a ``(success, number)``
    pair. When it reports failure, ``_bad_reads_removal`` is asked to
    inspect a window of 3e4 reads that starts 1e4 reads before ``number``
    (presumably the failing read position -- confirm against ``_dose``).

    ``<output_prefix>_short.fastq`` is removed before returning.
    """
    mapped_ok, fail_pos = _dose(input1, output_prefix, **kwargs)
    if not mapped_ok:
        window_start = fail_pos - 1e4
        _bad_reads_removal(window_start, 3e4, (input1, ), output_prefix,
                           kwargs)
    _tryrm(output_prefix + '_short.fastq')