示例#1
0
def dope_w_badread_removal(input1, input2, output_prefix, **kwargs):
    """Run the paired-end ``_dope`` step, retrying without bad reads on failure.

    ``_dope`` returns a ``(success, number)`` pair.  When it reports failure,
    ``_bad_reads_removal`` is asked to examine a window of 3e4 reads that
    starts 1e4 reads before half of the returned number.
    (Presumably the number counts reads over both mate files, hence the
    halving -- confirm against ``_dope``.)

    The ``<output_prefix>_short_{1,2}.fastq`` files are removed afterwards
    in every case.

    :param input1: path of the first-mate FASTQ file.
    :param input2: path of the second-mate FASTQ file.
    :param output_prefix: prefix used for all output and scratch files.
    :param kwargs: forwarded unchanged to ``_dope`` (and to the retry).
    """
    ok, fail_count = _dope(input1, input2, output_prefix, **kwargs)
    if not ok:
        window_start = (fail_count / 2) - 1e4
        _bad_reads_removal(window_start, 3e4, (input1, input2),
                           output_prefix, kwargs)
    for suffix in ('_short_1.fastq', '_short_2.fastq'):
        _tryrm(output_prefix + suffix)
示例#2
0
def _bad_reads_removal(startnum, scope, fs, outpref, kwargs):
    """Drop the reads that break mapping inside a window, then re-run mapping.

    The window of ``scope`` reads starting at read ``startnum`` is extracted
    from every file in ``fs``; ``_find_fn_reads`` fills a set with the
    window-relative positions of the offending reads.  Each input is then
    copied to ``<outpref>_short*.fastq`` without those reads (four lines per
    read) and the matching ``*_w_badread_removal`` entry point is called on
    the filtered files.

    :param startnum: index of the first read of the window.  Values below
        zero (the callers subtract 1e4 from a read counter) are clamped to 0
        so that window-relative positions keep pointing at the right reads.
    :param scope: number of reads in the window.
    :param fs: tuple with one (single-end) or two (paired-end) FASTQ paths.
    :param outpref: prefix of all scratch and output files.
    :param kwargs: dict of keyword arguments forwarded to the mapping step.
    """
    # A negative start would make the extracted window begin at read 0 while
    # the filter below still subtracted the negative offset, so the wrong
    # reads (or none at all) would be dropped.
    first_read = int(max(startnum, 0))
    paired = len(fs) == 2

    rdss = _extract_suspected_sequences(outpref, first_read, scope, fs)

    ex_reads = set()
    _find_fn_reads(rdss, outpref + '_tmp', range(int(scope)), kwargs, ex_reads)
    # NOTE(review): if ex_reads stays empty the filtered files equal the
    # inputs and the recursive call below repeats the same work -- confirm
    # that _find_fn_reads always reports at least one read here.

    if paired:
        tmp_names = (outpref + '_tmp_1.fastq', outpref + '_tmp_2.fastq')
        final_names = (outpref + '_short_1.fastq', outpref + '_short_2.fastq')
    else:
        tmp_names = (outpref + '_tmp.fastq', )
        final_names = (outpref + '_short.fastq', )

    for src, dst in zip(fs, tmp_names):
        # Text mode on both sides; floor division keeps the read index an
        # integer regardless of the division semantics in effect.
        with open(src) as fin, open(dst, 'w') as fout:
            fout.writelines(line for i, line in enumerate(fin)
                            if (i // 4) - first_read not in ex_reads)
    log_.info('Removed %s reads' % len(ex_reads))

    # The inputs may themselves be the previous round's "_short" files, so
    # these are only removed after the filtered copies have been written.
    _tryrm(outpref + '_short.fastq')
    _tryrm(outpref + '_short_1.fastq')
    _tryrm(outpref + '_short_2.fastq')

    for tmp, final in zip(tmp_names, final_names):
        os.rename(tmp, final)
    if paired:
        dope_w_badread_removal(final_names[0], final_names[1], outpref,
                               **kwargs)
    else:
        dose_w_badread_removal(final_names[0], outpref, **kwargs)
示例#3
0
def _extract_suspected_sequences(outpref, startnum, scope, fs):
    """Parse the reads of the window ``[startnum, startnum + scope)``.

    For every input file the four-line FASTQ records belonging to the window
    are copied to a scratch file (``<outpref>_tmp.fastq`` or
    ``<outpref>_tmp_{1,2}.fastq``), parsed with ``SeqIO.parse`` and the
    scratch file is removed again.

    The lines are sliced in-process rather than with ``head | tail``: that
    pipeline returns the *last* ``scope`` reads of whatever ``head`` produced,
    so a window running past the end of the file (or starting below zero)
    was silently shifted relative to the positions the caller assumes.

    :param outpref: prefix for the scratch files.
    :param startnum: index of the first read of the window; clamped to 0.
    :param scope: number of reads in the window.
    :param fs: tuple with one or two FASTQ paths.
    :return: ``(reads1, reads2)`` for two inputs, otherwise ``(reads1, )``;
        each element is a list of parsed records.  A list is shorter than
        ``scope`` when the file ends inside the window.
    """
    # Local import so the block does not depend on the file's import list.
    from itertools import islice

    if len(fs) == 2:
        tmp_names = (outpref + '_tmp_1.fastq', outpref + '_tmp_2.fastq')
    else:
        tmp_names = (outpref + '_tmp.fastq', )

    first_read = max(startnum, 0)
    first_line = int(first_read * 4)
    last_line = int((first_read + scope) * 4)

    parsed = []
    for src, tmp in zip(fs, tmp_names):
        with open(src) as fin, open(tmp, 'w') as fout:
            fout.writelines(islice(fin, first_line, last_line))
        # SeqIO is given a real file handle, as in the original code.
        with open(tmp) as fin:
            parsed.append(list(SeqIO.parse(fin, 'fastq')))
        _tryrm(tmp)
    return tuple(parsed)
示例#4
0
def _do_one_half(rdss, o_prefix, pos, ex_reads, kwargs):
    if len(rdss[0]) == 0:
        return True
    else:
        ns = _write_reads(rdss, o_prefix)
        if len(ns) == 2:
            suc, _ = _dope(ns[0], ns[1], o_prefix, **kwargs)
        else:
            suc, _ = _dose(ns[0], o_prefix, **kwargs)
        if not suc:
            if len(rdss[0]) == 1:
                ex_reads.add(pos[0])
                log_.debug('Removed one read')
            else:
                _find_fn_reads(rdss, o_prefix, pos, kwargs, ex_reads)
        _tryrm(ns[0])
        if len(ns) == 2:
            _tryrm(ns[1])
        _tryrm(o_prefix + '.map')
        return suc
示例#5
0
def _combine_map_files(seqdict, map_fs, outfile, fq_f1, fq_f2=None):
    """Merge the mapping lines of several map files into one per-read file.

    The FASTQ file(s) are walked record by record.  For every read a
    ``SourceRead`` is built and fed all map lines, from every map file, whose
    first TAB-separated cell names that read.  Reads that end up non-empty
    are sorted and written as one JSON document per line.  The result is
    written to ``<outfile>.tmp`` and moved to ``outfile`` on success, after
    which the input map files are removed.

    Each map file is consumed strictly sequentially, so its lines presumably
    have to appear in the same read order as the FASTQ input -- confirm
    against the code that produces them.

    :param seqdict: sequence dictionary passed to ``_add_from_cells``, or a
        plain function that returns it (called once).
    :param map_fs: iterable of map file paths.
    :param outfile: path of the combined output file.
    :param fq_f1: path of the first (or only) FASTQ file.
    :param fq_f2: path of the second-mate FASTQ file, or ``None``.
    """
    map_paths = list(map_fs)
    if type(seqdict) is types.FunctionType:
        seqdict = seqdict()
    out_f = outfile + '.tmp'

    # Everything opened below is tracked explicitly so the cleanup does not
    # have to rely on swallowing NameError/AttributeError.
    fq1 = fq2 = ot = None
    map_handles = []
    try:
        fq1 = _open_gz_indif(fq_f1)
        if fq_f2 is not None:
            fq2 = _open_gz_indif(fq_f2)
        ot = opengz(out_f, 'wb')

        # Prime one pending line per map file.  The two lists stay index
        # aligned; a map file without any line contributes nothing instead
        # of aborting the merge with StopIteration.
        map_fs_iters = []
        map_new_cells = []
        for path in map_paths:
            handle = open(path)
            map_handles.append(handle)
            lines = _stripread(handle)
            try:
                first_line = next(lines)
            except StopIteration:
                continue
            map_fs_iters.append(lines)
            map_new_cells.append(first_line.split(TAB))

        mates = (SeqIO.parse(fq2, 'fastq')
                 if fq2 is not None else _none_iter())
        for r1, r2 in izip(SeqIO.parse(fq1, 'fastq'), mates):
            curid = r1.id.replace('/1', '')
            if r2 is None:
                seqs = [str(r1.seq)]
                quals = [r1.letter_annotations['phred_quality']]
            else:
                seqs = [str(r1.seq), str(r2.seq)]
                quals = [r1.letter_annotations['phred_quality'],
                         r2.letter_annotations['phred_quality']]
            curr = SourceRead(curid, seqs, quals)

            # Walk backwards so that deleting an exhausted map file does not
            # shift the indices that are still to be visited.
            for i in range(len(map_fs_iters) - 1, -1, -1):
                while len(
                        map_new_cells[i]) > 0 and map_new_cells[i][0].replace(
                            '/1', '').replace('/2', '').split()[0] == curid:
                    name = map_new_cells[i][0]
                    if r1.description == name:
                        which = 0
                    elif r2 is not None and r2.description == name:
                        # r2 is None for single-end input; it must not be
                        # dereferenced in that case.
                        which = 1
                    else:
                        which = 2
                    curr._add_from_cells(map_new_cells[i], which, seqdict)
                    try:
                        map_new_cells[i] = next(map_fs_iters[i]).split(TAB)
                    except StopIteration:
                        del map_new_cells[i]
                        del map_fs_iters[i]
                        break

            if len(curr) > 0:
                curr.sort()
                ujson.dump(curr.to_ser(), ot)
                ot.write('\n')
    finally:
        # Best-effort close, in the original order: map files, FASTQ inputs,
        # output.  Errors while closing are deliberately ignored.
        for handle in map_handles + [fq1, fq2, ot]:
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                pass
    # Only reached when the merge finished without an exception.
    shutil.move(out_f, outfile)
    for path in map_paths:
        _tryrm(path)
示例#6
0
def dose_w_badread_removal(input1, output_prefix, **kwargs):
    """Run the single-end ``_dose`` step, retrying without bad reads on failure.

    ``_dose`` returns a ``(success, number)`` pair.  When it reports failure,
    ``_bad_reads_removal`` is asked to examine a window of 3e4 reads that
    starts 1e4 reads before the returned number.

    ``<output_prefix>_short.fastq`` is removed afterwards in every case.

    :param input1: path of the FASTQ file.
    :param output_prefix: prefix used for all output and scratch files.
    :param kwargs: forwarded unchanged to ``_dose`` (and to the retry).
    """
    ok, fail_count = _dose(input1, output_prefix, **kwargs)
    if not ok:
        window_start = fail_count - 1e4
        _bad_reads_removal(window_start, 3e4, (input1, ), output_prefix,
                           kwargs)
    short_file = output_prefix + '_short.fastq'
    _tryrm(short_file)