Exemplo n.º 1
0
def process_self_align_into_seed(align_filename,
                                 seqids,
                                 reader_class,
                                 pCS=None,
                                 dun_use_partial=False):
    """
    Seed a preClusterSet from the hits of a self-alignment file.

    Ignore hits that are - strand or qID >= sID (self hit or already reported).
    Every remaining hit is classified by r.characterize(...) and recorded in
    pCS as a match, a partial, or a containment ("tuck"). Hits touching an id
    listed in the module-level `bug_ids` are traced to stdout.

    Parameters:
    align_filename -- passed as-is to reader_class
    seqids -- iterable of sequence ids; the starting orphan set
    reader_class -- callable; reader_class(align_filename) must be iterable and
                    yield records with qID, sID, strand and characterize()
    pCS -- preClusterSet to update in place (a new preClusterSet2 if None)
    dun_use_partial -- if True, hits classified 'partial' are skipped entirely
                       (their ids also stay in orphans)

    Returns:
    pCS -- preClusterSet
    orphans --- seqs that are neither in pCS nor tucked i.e. no align hits
                (ids that appeared in no hit that got past the filters above)
    """
    if pCS is None:
        pCS = preClusterSet2()

    orphans = set(seqids)
    for r in reader_class(align_filename):
        qid, sid = r.qID, r.sID
        if qid >= sid or r.strand == '-':
            continue
        s = r.characterize(30, 0.01, 30, 0.01, 30, 0.05, min_identity=0.99)
        if dun_use_partial and s == 'partial':
            continue

        # NOTE: the debug output uses single-argument print(...) so the same
        # text is produced under both Python 2 and Python 3.
        if s == 'match':
            pCS.add_seqid_match(qid, sid)
            if qid in bug_ids or sid in bug_ids:
                print("match: {0} {1}".format(qid, sid))
                print("after match: {0} {1}".format(pCS.seq_stat[qid],
                                                    pCS.seq_stat[sid]))
        elif s == 'partial':
            pCS.add_seqid_partial(qid, sid)
            if qid in bug_ids or sid in bug_ids:
                print("partial: {0} {1}".format(qid, sid))
                print("after partial: {0} {1}".format(pCS.seq_stat[qid],
                                                      pCS.seq_stat[sid]))
        elif s == 'q_contained' or s == 's_contained':
            # The contained sequence is tucked into the containing one; only
            # the direction differs between the two categories.
            if s == 'q_contained':
                inner, outer = qid, sid
            else:
                inner, outer = sid, qid
            traced = qid in bug_ids or sid in bug_ids
            if traced:
                print("tucking {0} into {1}".format(inner, outer))
                print("before: {0} {1}".format(pCS.seq_stat[inner],
                                               pCS.seq_stat[outer]))
            pCS.add_seqid_contained(inner, outer)
            if traced:
                print("after: {0} {1}".format(pCS.seq_stat[inner],
                                              pCS.seq_stat[outer]))

        # discard() is a no-op for ids that are absent (already removed, or
        # never in seqids) and, unlike a bare `except: pass`, does not hide
        # unrelated errors.
        orphans.discard(qid)
        orphans.discard(sid)
        #sanity_checking(pCS, orphans)

    #sanity_checking(pCS, orphans)

    return pCS, orphans
Exemplo n.º 2
0
def process_self_align_into_seed(align_filename,
                                 seqids,
                                 reader_class,
                                 pCS=None,
                                 dun_use_partial=False):
    """
    Seed a preClusterSet from the hits of a self-alignment file.

    Ignore hits that are - strand or qID >= sID (self hit or already reported).
    Every remaining hit is classified by r.characterize(...); 'match',
    'partial' and '*_contained' hits are all recorded through
    pCS.add_seqid_match.

    Parameters:
    align_filename -- passed as-is to reader_class
    seqids -- iterable of sequence ids; the starting orphan set
    reader_class -- callable; reader_class(align_filename) must be iterable and
                    yield records with qID, sID, strand and characterize()
    pCS -- preClusterSet to update in place (a new preClusterSet2 if None)
    dun_use_partial -- if True, hits classified 'partial' are skipped entirely
                       (their ids also stay in orphans)

    Returns:
    pCS -- preClusterSet
    orphans --- seqs that are neither in pCS nor tucked i.e. no align hits
                (ids that appeared in no hit that got past the filters above)
    """
    if pCS is None:
        pCS = preClusterSet2()

    orphans = set(seqids)
    for r in reader_class(align_filename):
        if r.qID >= r.sID or r.strand == '-':
            continue
        s = r.characterize(100, 0.05, 50, 0.02, 20, 0.01)
        if dun_use_partial and s == 'partial':
            continue

        # Liz note: currently, just add all to match because minimap sensitivity not enough to do "tuck" properly
        # (i.e. q_contained / s_contained hits are deliberately NOT routed to
        # pCS.add_seqid_contained here).
        if s in ('match', 'partial') or s.endswith('_contained'):
            pCS.add_seqid_match(r.qID, r.sID)

        # discard() is a no-op for ids that are absent (already removed, or
        # never in seqids) and, unlike a bare `except: pass`, does not hide
        # unrelated errors.
        orphans.discard(r.qID)
        orphans.discard(r.sID)
        #sanity_checking(pCS, orphans)

    #sanity_checking(pCS, orphans)

    return pCS, orphans
def read_seq_csv(csv_filename):
    """
    Rebuild a preClusterSet and an orphan set from a comma-delimited CSV.

    The file must have the column headers 'seqid' and 'stat'. A row whose
    'stat' is the string 'orphan' puts its seqid into orphans; any other
    'stat' is parsed with int() as a cluster id, the cluster is created in
    pCS.S on first sight, and the seqid is added to it.

    If the first data row lacks 'seqid' or 'stat', a message is written to
    stderr and the process exits with status -1. A file without data rows is
    not checked and yields an empty result.

    Returns:
    pCS -- preClusterSet
    orphans --- set of seqids whose stat is 'orphan'
    """
    orphans = set()
    pCS = preClusterSet2()

    # Open through a context manager so the handle is closed on every exit
    # path, including sys.exit() and a failed int() conversion.
    with open(csv_filename) as handle:
        # sanity check that "seqid" and "stat" are two valid column headers
        header_checked = False
        for r in DictReader(handle, delimiter=','):
            if not header_checked:
                if 'seqid' not in r or 'stat' not in r:
                    print("{0} must have the fields 'seqid' and 'stat'! Abort".format(csv_filename), file=sys.stderr)
                    sys.exit(-1)
                header_checked = True
            if r['stat'] == 'orphan':
                orphans.add(r['seqid'])
            else:
                cid = int(r['stat'])
                if cid not in pCS.S:
                    pCS.S[cid] = preCluster(cid=cid)
                pCS.add_seqid_to_cluster_by_cid(r['seqid'], cid)
    return pCS, orphans
Exemplo n.º 4
0
def process_self_align_into_seed(align_filename,
                                 seqids,
                                 reader_class,
                                 pCS=None):
    """
    Seed a preClusterSet from the hits of a self-alignment file.

    Ignore hits that are - strand or qID >= sID (self hit or already reported).
    Every remaining hit is classified by r.characterize(...). 'partial' hits
    are skipped entirely; 'match' hits go to pCS.add_seqid_match, and
    containment hits tuck the contained id into the containing one through
    pCS.add_seqid_contained.

    Parameters:
    align_filename -- passed as-is to reader_class
    seqids -- iterable of sequence ids; the starting orphan set
    reader_class -- callable; reader_class(align_filename) must be iterable and
                    yield records with qID, sID, strand and characterize()
    pCS -- preClusterSet to update in place (a new preClusterSet2 if None)

    Returns:
    pCS -- preClusterSet
    orphans --- seqs that are neither in pCS nor tucked i.e. no align hits
                (ids that appeared in no hit that got past the filters above)
    """
    if pCS is None:
        pCS = preClusterSet2()

    orphans = set(seqids)
    for r in reader_class(align_filename):
        if r.qID >= r.sID or r.strand == '-':
            continue
        s = r.characterize(400, 0.1, 400, 0.1, 100, 0.1)
        if s == 'partial':
            continue

        if s == 'match':
            pCS.add_seqid_match(r.qID, r.sID)
        elif s == 'q_contained':
            pCS.add_seqid_contained(r.qID, r.sID)
        elif s == 's_contained':
            pCS.add_seqid_contained(r.sID, r.qID)

        # discard() is a no-op for ids that are absent (already removed, or
        # never in seqids) and, unlike a bare `except: pass`, does not hide
        # unrelated errors.
        orphans.discard(r.qID)
        orphans.discard(r.sID)
        #sanity_checking(pCS, orphans)

    #sanity_checking(pCS, orphans)

    return pCS, orphans
Exemplo n.º 5
0
def process_self_align_into_seed(align_filename, seqids, reader_class, pCS=None, dun_use_partial=False):
    """
    Seed a preClusterSet from the hits of a self-alignment file.

    Ignore hits that are - strand or qID >= sID (self hit or already reported).
    Every remaining hit is classified by r.characterize(...); 'match',
    'partial' and '*_contained' hits are all recorded through
    pCS.add_seqid_match.

    Parameters:
    align_filename -- passed as-is to reader_class
    seqids -- iterable of sequence ids; the starting orphan set
    reader_class -- callable; reader_class(align_filename) must be iterable and
                    yield records with qID, sID, strand and characterize()
    pCS -- preClusterSet to update in place (a new preClusterSet2 if None)
    dun_use_partial -- if True, hits classified 'partial' are skipped entirely
                       (their ids also stay in orphans)

    Returns:
    pCS -- preClusterSet
    orphans --- seqs that are neither in pCS nor tucked i.e. no align hits
                (ids that appeared in no hit that got past the filters above)
    """
    if pCS is None:
        pCS = preClusterSet2()

    orphans = set(seqids)
    for r in reader_class(align_filename):
        if r.qID >= r.sID or r.strand == '-':
            continue
        s = r.characterize(400, 0.4, 100, 0.1, 50, 0.05)
        if dun_use_partial and s == 'partial':
            continue

        # Liz note: currently, just add all to match because minimap sensitivity not enough to do "tuck" properly
        # (i.e. q_contained / s_contained hits are deliberately NOT routed to
        # pCS.add_seqid_contained here).
        if s in ('match', 'partial') or s.endswith('_contained'):
            pCS.add_seqid_match(r.qID, r.sID)

        # discard() is a no-op for ids that are absent (already removed, or
        # never in seqids) and, unlike a bare `except: pass`, does not hide
        # unrelated errors.
        orphans.discard(r.qID)
        orphans.discard(r.sID)
        #sanity_checking(pCS, orphans)

    #sanity_checking(pCS, orphans)

    return pCS, orphans