예제 #1
0
def prog_umi_group_ec(args):
    """Group reads by UMI via ``run_umi_group_ec`` and write one record per UMI.

    Fastq records are read from ``args.i`` and handed to
    ``run_umi_group_ec`` together with ``args.k``, ``args.max_offset``,
    ``args.min_score`` and a fastq writer on
    ``<stem of args.o>.discarded<extension of args.o>``. Each entry of the
    returned mapping is written to ``args.o`` as a record named
    ``UMI:<umi>:<count>``.

    Args:
        args: object providing ``i``, ``o``, ``k``, ``max_offset`` and
            ``min_score``.

    Raises:
        AssertionError: if ``args.k`` is not positive or ``args.max_offset``
            is negative.
    """
    import os
    from fileio import zopen, FastqFormat
    from seq import QSequence

    assert args.k > 0
    assert args.max_offset >= 0

    fn, fext = os.path.splitext(args.o)
    # All handles live in ``with`` blocks so they are flushed and closed even
    # when grouping or writing raises. The discard file is opened first, as
    # before.
    with zopen(fn + ".discarded" + fext, 'w') as discard_file, \
            zopen(args.i, 'r') as input_file:
        umi_groups = run_umi_group_ec(
            FastqFormat.records_in(input_file), args.k,
            args.max_offset, args.min_score,
            FastqFormat.records_out(discard_file, None))

    with zopen(args.o, 'w') as output_file:
        FastqFormat.records_out(
            output_file,
            (QSequence("UMI:%s:%s" % (umi, cqs.count), cqs.seq, cqs.qual)
             for umi, cqs in umi_groups.iteritems()))
예제 #2
0
def prog_umi_group_ec(args):
    """Run UMI grouping/error correction and split the result over two files.

    Records come from ``args.i`` (or stdin when ``args.i`` is None) and are
    passed to ``run_umi_group_ec``. Every returned group is ranked by
    descending count, then by descending sum of quality values. The
    top-ranked member is written to ``args.o``; all other members go to
    ``<stem of args.o>.discarded<extension of args.o>``. Records are named
    ``UMI:<name>:<rank>/<group size>:<count>``.

    Args:
        args: object providing ``i``, ``o``, ``k``, ``max_offset``,
            ``min_score_for_offset`` and ``min_score_for_merge``.

    Raises:
        AssertionError: if ``k`` is not positive, ``max_offset`` is negative
            or either minimum score exceeds ``k``.
    """
    import os
    from fileio import zopen, FastqFormat
    from seq import QSequence

    assert args.k > 0
    assert args.max_offset >= 0
    assert args.min_score_for_offset <= args.k
    assert args.min_score_for_merge <= args.k

    stem, extension = os.path.splitext(args.o)
    discarded_handle = zopen(stem + '.discarded' + extension, 'w')
    discarded = FastqFormat.records_out(discarded_handle, None)

    if args.i is None:
        input_file = stdin
    else:
        input_file = zopen(args.i, 'r')

    _progress_indicator(0, 0, input_file)
    records = FastqFormat.records_in(input_file)
    umi_groups = run_umi_group_ec(
        records=records,
        k=args.k,
        moff=args.max_offset,
        min_score4offset=args.min_score_for_offset,
        min_score4merge=args.min_score_for_merge,
        discarded=discarded,
        callback=_progress_indicator)
    stdout.write('\n')

    out_handle = zopen(args.o, 'w')
    out = FastqFormat.records_out(out_handle, None)

    print('Writing results to file')
    for name, grouplist in umi_groups.iteritems():
        ranked = sorted(grouplist,
                        key=lambda member: (-member.count, -sum(member.qual)))
        group_size = len(ranked)

        # Rank 1 is the representative of the group and is the only member
        # that reaches the main output.
        winner = ranked[0]
        winner_rank = '%s/%s' % ('1', group_size)
        winner_name = 'UMI:%s:%s:%s' % (name, winner_rank, winner.count)
        out += QSequence(winner_name, winner.seq, winner.qual)

        # Everything ranked 2 and below is kept in the discard file.
        for rank, loser in enumerate(ranked[1:], 2):
            loser_rank = '%s/%s' % (rank, group_size)
            loser_name = 'UMI:%s:%s:%s' % (name, loser_rank, loser.count)
            discarded += QSequence(loser_name, loser.seq, loser.qual)
예제 #3
0
파일: umi_group_ec.py 프로젝트: uubram/RTCR
def prog_umi_group_ec(args):
    """Group reads by UMI and write the best member of every group.

    Input is ``args.i``, or stdin when that is None. ``run_umi_group_ec``
    returns a mapping from name to a list of members; each list is sorted
    by descending ``count`` and then descending ``sum(qual)``. The first
    member is written to ``args.o``, the remaining ones to
    ``<stem of args.o>.discarded<extension of args.o>``. Record names have
    the form ``UMI:<name>:<rank>/<group size>:<count>``.

    Args:
        args: object providing ``i``, ``o``, ``k``, ``max_offset``,
            ``min_score_for_offset`` and ``min_score_for_merge``.

    Raises:
        AssertionError: if ``k`` is not positive, ``max_offset`` is negative
            or either minimum score exceeds ``k``.
    """
    import os
    from fileio import zopen, FastqFormat
    from seq import QSequence

    assert args.k > 0
    assert args.max_offset >= 0
    assert args.min_score_for_offset <= args.k
    assert args.min_score_for_merge <= args.k

    def by_count_then_quality(member):
        # Negated so that an ascending sort puts the largest values first.
        return (-member.count, -sum(member.qual))

    def as_record(group_name, rank, total, member):
        # Builds the output record for one ranked member of a group.
        position = '%s/%s' % (rank, total)
        title = 'UMI:%s:%s:%s' % (group_name, position, member.count)
        return QSequence(title, member.seq, member.qual)

    base, suffix = os.path.splitext(args.o)
    discarded = FastqFormat.records_out(
        zopen(''.join((base, '.discarded', suffix)), 'w'), None)

    input_file = zopen(args.i, 'r') if args.i is not None else stdin
    _progress_indicator(0, 0, input_file)
    grouping_options = dict(
        records=FastqFormat.records_in(input_file),
        k=args.k,
        moff=args.max_offset,
        min_score4offset=args.min_score_for_offset,
        min_score4merge=args.min_score_for_merge,
        discarded=discarded,
        callback=_progress_indicator)
    umi_groups = run_umi_group_ec(**grouping_options)
    stdout.write('\n')

    out = FastqFormat.records_out(zopen(args.o, 'w'), None)

    print('Writing results to file')
    for name, grouplist in umi_groups.iteritems():
        ordered = sorted(grouplist, key=by_count_then_quality)
        total = len(ordered)
        head, tail = ordered[0], ordered[1:]

        out += as_record(name, 1, total, head)
        for offset, member in enumerate(tail):
            discarded += as_record(name, offset + 2, total, member)
예제 #4
0
def prog_checkout(args):
    """Demultiplex fastq records by adapter into one fastq file per sample.

    Adapters are read from ``args.barcodes`` as ``(sample_id, master, slave)``
    records. Every record of ``args.i`` (together with the record at the same
    position in ``args.i2`` when that is given) is tested against each
    adapter. The adapter whose master match has the smallest first element
    wins; on ties the earlier adapter is kept. The accepted record is
    appended to ``<sample_id>.fastq`` with ``UMI:<umi[0]>:<umi[1]>`` added to
    its header line. Progress is written to stdout at most about twice per
    second.

    Args:
        args: object providing ``reverse_complement``, ``barcodes``, ``i``,
            ``i2`` and ``max_mm``.

    Raises:
        AssertionError: if a record and its mate have different ids.
        ValueError: if an adapter has a slave but no mate file was given.
    """
    search_rc = args.reverse_complement
    max_mm = args.max_mm

    # list() drains the reader, so the barcode file can be closed right away.
    with open(args.barcodes, 'r') as barcodes_file:
        adapters = list(BarcodeFormat.records_in(barcodes_file))

    handles = []  # every file opened below; closed in the finally clause
    try:
        outfiles = {}
        for (sample_id, _master, _slave) in adapters:
            sample_file = open("%s.fastq" % sample_id, 'w')
            handles.append(sample_file)
            outfiles[sample_id] = sample_file

        fq1_file = zopen(args.i, 'r')
        handles.append(fq1_file)
        if isinstance(fq1_file, gzip.GzipFile):
            # Size and position are taken from the file underneath the gzip
            # wrapper.
            fq1_filesize = os.path.getsize(fq1_file.name)
            fq1_filepos = fq1_file.fileobj.tell
        else:
            fq1_filesize = filesize(fq1_file)
            fq1_filepos = fq1_file.tell

        fq1 = FastqFormat.records_in(fq1_file, encoding=None)
        if args.i2:
            fq2_file = zopen(args.i2, 'r')
            handles.append(fq2_file)
            fq2 = FastqFormat.records_in(fq2_file, encoding=None)
        else:
            fq2 = False

        n_accepted = 0
        prev_time = time()
        for i, r1 in enumerate(fq1):
            if time() - prev_time > .5:
                prev_time = time()
                frac = float(fq1_filepos()) / fq1_filesize
                # i is still 0 while the first record is being handled.
                if i:
                    pct_accepted = 100 * float(n_accepted) / i
                else:
                    pct_accepted = 0.0
                stdout.write(
                    term.EL(2) + term.CHA(0) +
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)" % (
                        i, frac * 100, n_accepted, pct_accepted))
                stdout.flush()

            r2 = None
            if fq2:
                r2 = next(fq2)
                assert (r1.id == r2.id)

            # Demultiplex
            # NOTE(review): r1/r2 are rebound inside the adapter loop (mate
            # swap, reverse complement), so later adapters and the final
            # write see the modified records -- confirm this is intended.
            best_match = None
            for (sample_id, master, slave) in adapters:
                matches, matches_rc = master.locate_in(
                    r1.seq, max_mm, search_rc)
                master_match, is_rc = get_best_match(matches, matches_rc)

                # look for master on the mate
                if (not master_match or
                        master_match[0] < len(master.seq)) and fq2:
                    # master not found or no full length match found
                    matches2, matches_rc2 = master.locate_in(
                        r2.seq, max_mm, search_rc)
                    master_match2, is_rc2 = get_best_match(
                        matches2, matches_rc2)

                    if not master_match2 and not master_match:
                        # master not found on r1 nor on r2
                        continue

                    if not master_match or (
                            master_match2 and
                            master_match2[0] < master_match[0]):
                        master_match = master_match2
                        is_rc = is_rc2
                        # apparently strands are swapped
                        r1, r2 = r2, r1

                if master_match is None:
                    continue

                if is_rc:
                    # Re-express the match position on the reverse
                    # complemented read.
                    master_match = list(master_match)
                    master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                    master_match = tuple(master_match)
                    r1 = FastqRecord(id=r1.id + " rc",
                                     desc=r1.desc,
                                     seq=revcomp(r1.seq),
                                     qual_str=r1.qual_str[::-1])
                    if fq2:
                        r2 = FastqRecord(id=r2.id + " rc",
                                         desc=r2.desc,
                                         seq=revcomp(r2.seq),
                                         qual_str=r2.qual_str[::-1])

                # Master adapter has been found, retrieve its UMI (if any)
                master_umi = ("", "")
                if master.has_UMI():
                    master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                                master_match[1])
                    if master.UMI_length != len(master_umi[0]):
                        # failed to retrieve UMI from master adapter
                        continue

                # Look for slave adapter
                slave_umi = ("", "")
                if slave:  # has slave adapter
                    if r2 is None:
                        raise ValueError(
                            "adapter for sample %s has a slave adapter but "
                            "no second fastq file was given" % sample_id)
                    slave_matches, slave_matches_rc = slave.locate_in(
                        r2.seq, max_mm, search_rc=False)
                    # get_best_match returns (match, is_rc); only the match
                    # carries the position needed by get_UMI.
                    slave_match, _slave_is_rc = get_best_match(
                        slave_matches, slave_matches_rc)

                    if slave.has_UMI():  # get umi
                        if not slave_match:
                            # no slave position, so no UMI can be extracted
                            continue
                        slave_umi = slave.get_UMI(r2.seq, r2.qual_str,
                                                  slave_match[1])
                        if slave.UMI_length != len(slave_umi[0]):
                            continue

                if not best_match or best_match[0][0] > master_match[0]:
                    umi = [x + y for x, y in zip(master_umi, slave_umi)]
                    best_match = (master_match, sample_id, umi)

            if best_match:
                master_match, sample_id, umi = best_match
                out = outfiles[sample_id]
                out.write("@%s UMI:%s:%s\n" % (r1.id, umi[0], umi[1]))
                out.write("%s\n+\n%s\n" % (r1.seq, r1.qual_str))
                n_accepted += 1
    finally:
        for handle in handles:
            handle.close()
예제 #5
0
파일: checkout.py 프로젝트: Rosemary94/RTCR
def checkout(fq1_fn, fq2_fn, adapters, max_mm, search_rc, paired=False):
    """Demultiplex one fastq file (or file pair) into per-sample fastq files.

    Every record of ``fq1_fn`` (together with the record at the same position
    in ``fq2_fn`` when given) is tested against each adapter. The adapter
    whose master match has the smallest first element wins; on equal master
    matches, an adapter with a slave match replaces one without, or one
    whose slave match has a larger first element. Accepted records are
    written with ``UMI:<R1|R12|R2>:<umi[0]>:<umi[1]>`` added to the header
    line. Progress is written to stdout.

    Args:
        fq1_fn: path of the first fastq file; must not be None.
        fq2_fn: path of the mate fastq file, or None.
        adapters: sequence of ``(sample_id, master, slave)`` tuples; it is
            iterated more than once.
        max_mm: forwarded to ``locate_in`` (presumably the maximum number of
            mismatches -- confirm against the adapter class).
        search_rc: forwarded to ``locate_in`` as its ``search_rc`` argument.
        paired: when true, first-file output goes to
            ``<sample_id>_R12.fastq`` instead of ``<sample_id>_R1.fastq`` and
            the slave adapter is searched on the same record as the master.
            Must not be combined with ``fq2_fn``.

    Raises:
        AssertionError: if ``fq1_fn`` is None, if ``paired`` is combined with
            ``fq2_fn``, or if a record and its mate have different ids.
        ValueError: if an adapter has a slave but there is no record to
            search it on.
    """
    assert fq1_fn is not None
    assert not (paired and fq2_fn is not None)

    print('Handling file(s): %s' % ''.join(
        [fq1_fn, '' if fq2_fn is None else ', %s' % fq2_fn]))

    handles = []  # every file opened below; closed in the finally clause
    try:
        fq1_file = zopen(fq1_fn, 'r')
        handles.append(fq1_file)
        if isinstance(fq1_file, gzip.GzipFile):
            # Size and position are taken from the file underneath the gzip
            # wrapper.
            fq1_filesize = os.path.getsize(fq1_file.name)
            fq1_filepos = fq1_file.fileobj.tell
        else:
            fq1_filesize = filesize(fq1_file)
            fq1_filepos = fq1_file.tell

        fq1 = FastqFormat.records_in(fq1_file, encoding=None)
        if fq2_fn is not None:
            fq2_file = zopen(fq2_fn, 'r')
            handles.append(fq2_file)
            fq2 = FastqFormat.records_in(fq2_file, encoding=None)
        else:
            fq2 = None

        outfiles = {}
        for (sample_id, _master, _slave) in adapters:
            if not paired:
                out1 = (open("%s_R1.fastq" % sample_id, 'w'), 'R1')
            else:
                out1 = (open("%s_R12.fastq" % sample_id, 'w'), 'R12')
            handles.append(out1[0])

            if fq2 is None:
                out2 = (None, None)
            else:
                out2 = (open("%s_R2.fastq" % sample_id, 'w'), 'R2')
                handles.append(out2[0])

            outfiles[sample_id] = {"out1": out1, "out2": out2}

        n_accepted = 0
        prev_time = time()
        for i, r1 in enumerate(fq1):
            if fq2:
                r2 = next(fq2)
                assert (r1.id == r2.id)
            else:
                r2 = None

            # Demultiplex
            # NOTE(review): r1/r2 are rebound inside the adapter loop (mate
            # swap, reverse complement), so later adapters and the final
            # write see the modified records -- confirm this is intended.
            best_match = None
            for (sample_id, master, slave) in adapters:
                matches, matches_rc = master.locate_in(
                    r1.seq, max_mm, search_rc)
                master_match, is_rc = get_best_match(matches, matches_rc)

                # look for master on the mate
                if (not master_match or
                        master_match[0] < len(master.seq)) and fq2:
                    # master not found or no full length match found
                    matches2, matches_rc2 = master.locate_in(
                        r2.seq, max_mm, search_rc)
                    master_match2, is_rc2 = get_best_match(
                        matches2, matches_rc2)

                    if not master_match2 and not master_match:
                        # master not found on r1 nor on r2
                        continue

                    if not master_match or (
                            master_match2 and
                            master_match2[0] < master_match[0]):
                        master_match = master_match2
                        is_rc = is_rc2
                        # apparently strands are swapped
                        r1, r2 = r2, r1

                if master_match is None:
                    continue

                if is_rc:
                    # Re-express the match position on the reverse
                    # complemented read.
                    master_match = list(master_match)
                    master_match[1] = \
                        len(r1.seq) - (master_match[1] + len(master.seq))
                    master_match = tuple(master_match)
                    r1 = FastqRecord(id=r1.id + " rc",
                                     desc=r1.desc,
                                     seq=revcomp(r1.seq),
                                     qual_str=r1.qual_str[::-1])
                    if fq2:
                        r2 = FastqRecord(id=r2.id + " rc",
                                         desc=r2.desc,
                                         seq=revcomp(r2.seq),
                                         qual_str=r2.qual_str[::-1])

                # Master adapter has been found, retrieve its UMI (if any)
                master_umi = ("", "")
                if master.has_UMI():
                    master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                                master_match[1])
                    if master.UMI_length != len(master_umi[0]):
                        # failed to retrieve UMI from master adapter
                        continue

                # Look for slave adapter
                slave_match = None

                slave_umi = ("", "")
                if slave:  # has slave adapter
                    if paired:
                        r = r1
                    else:
                        r = r2
                    if r is None:
                        raise ValueError(
                            "adapter for sample %s has a slave adapter but "
                            "there is no mate record to search it on"
                            % sample_id)

                    slave_matches, slave_matches_rc = slave.locate_in(
                        r.seq, max_mm, search_rc=search_rc)
                    slave_match, slave_is_rc = get_best_match(
                        slave_matches, slave_matches_rc)

                    if not slave_match:  # No slave found
                        continue

                    if slave.has_UMI():  # get umi
                        if slave_is_rc:
                            # The UMI is read from the reverse complemented
                            # record, so the position is mirrored first.
                            slave_umi_start = len(
                                r.seq) - (slave_match[1] + len(slave.seq))
                            slave_umi = slave.get_UMI(revcomp(r.seq),
                                                      r.qual_str[::-1],
                                                      slave_umi_start)
                        else:
                            slave_umi = slave.get_UMI(r.seq, r.qual_str,
                                                      slave_match[1])
                        if slave.UMI_length != len(slave_umi[0]):
                            continue

                if not best_match or best_match[0][0] > master_match[0] or \
                   (best_match[0][0] == master_match[0] and
                    slave_match and
                    (not best_match[1] or not best_match[1][0] or
                     best_match[1][0] > slave_match[0])):
                    umi = [x + y for x, y in zip(master_umi, slave_umi)]
                    best_match = (master_match, slave_match, sample_id, umi)

            if best_match:
                master_match, slave_match, sample_id, umi = best_match
                for (r, (out, typename)) in (
                        (r1, outfiles[sample_id]["out1"]),
                        (r2, outfiles[sample_id]["out2"])):
                    if not out:
                        continue
                    out.write("@%s UMI:%s:%s:%s\n" %
                              (r.id, typename, umi[0], umi[1]))
                    out.write("%s\n+\n%s\n" % (r.seq, r.qual_str))
                n_accepted += 1

            # Report at most about twice per second, plus once when the
            # input position reaches the end of the file.
            frac = float(fq1_filepos()) / fq1_filesize
            if time() - prev_time > .5 or frac == 1.0:
                prev_time = time()
                stdout.write(
                    term.EL(2) + term.CHA(0) +
                    "Processed %s records (%.2f%%), accepted %s (%.2f%%)" % (
                        i + 1, frac * 100, n_accepted,
                        (100 * float(n_accepted) / (i + 1))))
                stdout.flush()

        stdout.write('\n')
    finally:
        for handle in handles:
            handle.close()