Пример #1
0
def seqblock2alignment(aligned, name, seq):
    """Lay ``seq`` out along the positions recorded in ``aligned``.

    ``aligned`` is iterated as ``(i, j)`` pairs.  Every pair whose ``j`` is
    not None is stored as ``j -> i``, and ``i`` is later used as an index
    into ``seq`` (presumably ``i`` is the query index and ``j`` the subject
    position -- confirm against the caller).

    Characters of ``seq`` outside the aligned range are lower-cased and
    assigned consecutive positions on either side of it.  The returned string
    has one character per position in ``range(right)``: the sequence
    character for a mapped position, 'N' if the position maps to None, and
    '-' if the position is not mapped at all.

    If the mapped indices do not form one contiguous run, a message is
    printed and ``name`` is appended to the file 'hasDeletion'.
    """
    # position -> index into seq, skipping pairs without a position
    pos2idx = {j: i for i, j in aligned if j is not None}

    last_pos = max(pos2idx)
    first_pos = min(pos2idx)

    lowest_idx = min(pos2idx.values())
    highest_idx = max(pos2idx.values())
    if highest_idx - lowest_idx + 1 != len(pos2idx):
        print('detect deletion!')
        cmn.append_file(name + '\n', 'hasDeletion')

    chars = list(seq)

    # Walk left from the first aligned index down to index 0, lower-casing
    # each character and giving it the next position to the left.
    idx = pos2idx[first_pos]
    while idx != 0:
        idx -= 1
        first_pos -= 1
        chars[idx] = chars[idx].lower()
        pos2idx[first_pos] = idx

    # Same on the right, up to the final index of the sequence.
    idx = pos2idx[last_pos]
    final_idx = len(chars) - 1
    while idx < final_idx:
        idx += 1
        last_pos += 1
        chars[idx] = chars[idx].lower()
        pos2idx[last_pos] = idx

    # NOTE(review): range(last_pos) stops one short of last_pos itself, so
    # the character mapped to the final position is never emitted -- confirm
    # this is intended.
    columns = []
    for pos in range(last_pos):
        if pos not in pos2idx:
            columns.append('-')
            continue
        mapped = pos2idx[pos]
        columns.append('N' if mapped is None else chars[mapped])
    return ''.join(columns)
def parse_inserted_gap(ID, seq, label):
    """Try to repair ``seq`` from a blastn hit when a bait insertion was logged.

    Nothing is done (``seq`` is returned unchanged) unless the file
    'sampleRun_<ID>/bait_insertion' exists.  Otherwise ``seq`` -- with '-'
    turned into 'N' and the outer 'N's stripped -- is written to
    'tmpInput.fa' and searched with blastn; the tabular output goes to
    'tmpBr_<label>.txt'.

    The first hit whose (sstart, send, qstart) is (1, 658, 21) or
    (2, 655, 22) ends the scan.  Its query string, minus the columns where
    the subject string has '-', replaces ``seq[qstart - 1:qend]`` provided
    the remaining length is 658 or 654 respectively.  When no repair is made,
    "<ID>\\t<label>" is appended to 'cannot_fixed_indel.txt'.

    Returns the (possibly repaired) sequence.
    """
    if not cmn.filexist('sampleRun_%s/bait_insertion' % ID):
        return seq

    #check what is the right range of sequence
    print('runing blast to fix %s' % ID)
    trimmed = seq.replace('-', 'N').strip('N')
    query_path = 'tmpInput.fa'
    cmn.write_file('>input\n%s\n' % trimmed, query_path)
    hits_path = 'tmpBr_%s.txt' % label
    blast_cmd = (
        'blastn -query %s -db /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa ' % query_path
        + '-task blastn-short -dust no -outfmt \'6 qseqid sseqid qstart qend sstart send evalue pident qseq sseq\' -out %s' % hits_path
    )
    cmn.run(blast_cmd)

    # (sstart, send, qstart) of an acceptable hit -> required length of the
    # query string once subject-gap columns are removed
    accepted = {
        (1, 658, 21): 658,
        (2, 655, 22): 654,
    }

    repaired = False
    for hit in cmn.file2lines(hits_path):
        fields = hit.strip().split()
        qstart, qend, sstart, send = [int(field) for field in fields[2:6]]
        required = accepted.get((sstart, send, qstart))
        if required is None:
            continue

        qseq, sseq = fields[-2:]
        kept = ''.join(q for q, s in zip(qseq, sseq) if s != '-')
        if len(kept) == required:
            seq = seq[:qstart - 1] + kept + seq[qend:]
            print('solution found for %s' % ID)
            repaired = True
        # only the first acceptable hit is considered, repaired or not
        break

    if not repaired:
        cmn.append_file('%s\t%s\n' % (ID, label), 'cannot_fixed_indel.txt')
    return seq
def get_query_sequence(seqDict, genus, sp):
    """Pick one entry of ``seqDict`` for ``genus``/``sp``; return ``(fasta, qlen)``.

    ``fasta`` is ">name\\nseq\\n" for the chosen entry and ``qlen`` is the
    length of its sequence with every 'N' removed.

    Selection order:
      1. a name equal (case-insensitively) to "<genus>_<sp>" is returned
         immediately;
      2. otherwise names whose first '_'-separated token equals the genus
         (all names if there are none), narrowed to those that have ``sp`` as
         a token when more than one remains;
      3. if 'pickingLog.txt' exists after that step, the external
         auto_rebait.py script is run and the candidates are replaced by the
         lines of 'picked_bait.txt';
      4. names starting with '*' are preferred, then names containing '*';
      5. ``max`` over the sequence length chooses the final name.

    'pickingLog.txt' is removed first and then written with 'noGenus',
    'noSpecies' and/or 'pickClosed' depending on the path taken.
    """
    names = list(seqDict.keys())

    #try to look up the exact match first
    wanted = ('%s_%s' % (genus, sp)).upper()
    exact = [name for name in names if name.upper() == wanted]
    if exact:
        name = exact[0]
        print('found exact match %s' % name)
        seq = seqDict[name]
        return '>%s\n%s\n' % (name, seq), len(seq.replace('N', ''))

    #look it up in other files
    good_names = [
        name for name in names
        if name.upper().split('_')[0] == genus.upper()
    ]
    genus_known = bool(good_names)

    cmn.run('rm pickingLog.txt 2> /dev/null')
    if not good_names:  #sp is just 'sp'
        print('can not find barcode for genus keyword "%s"' % genus)
        good_names = names
        cmn.write_file('noGenus\n', 'pickingLog.txt')

    if len(good_names) > 1:
        #try to refine it
        same_species = [
            name for name in good_names
            if sp.upper() in name.upper().split('_')
        ]
        if same_species:
            good_names = same_species
        else:
            cmn.append_file('noSpecies\n', 'pickingLog.txt')

    # A log file at this point means genus and/or species was not found:
    # let the external script choose the candidates instead.
    if cmn.filexist('pickingLog.txt'):
        print('automatically pick bait by fastq similarity')
        rebait = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist '
        restrict = genus_known and not cmn.filexist('restricted_genus.info')
        cmn.run(rebait + genus if restrict else rebait)
        good_names = cmn.file2lines('picked_bait.txt')
        cmn.write_file('pickClosed\n', 'pickingLog.txt')

    #try to see if type species is there
    starred = ([name for name in good_names if name[0] == '*']
               or [name for name in good_names if '*' in name])
    if starred:
        good_names = starred

    # NOTE(review): replacing 'N' by '-' leaves the length unchanged, so this
    # ranks by raw sequence length -- confirm that is what was meant.
    def _length(candidate):
        return len(seqDict[candidate].replace('N', '-'))

    name = max(good_names, key=_length)
    seq = seqDict[name]
    fasta = '>%s\n%s\n' % (name, seq)
    qlen = len(seq.replace('N', ''))
    print('pick %s for %s %s' % (name, genus, sp))
    return fasta, qlen
Пример #4
0
# For every map file: group its two-column lines by protein and append the
# concatenated columns to '<outdir>/<protein>.fa' as two FASTA records.
for fmap in maplist:
    # label taken from the text before the first '_' of cmn.lastName(fmap)
    # (presumably the file's base name -- confirm in cmn)
    sp = cmn.lastName(fmap).split('_')[0]

    # protein -> list of (first column, second column), in file order
    prot_dict = {}
    with open(fmap) as fp:
        for i, line in enumerate(fp):
            try:
                prot = codingI[i]
            except KeyError:
                # this line index is not assigned to any protein
                continue

            #reach here if protein found
            a, b = line.strip().split()
            prot_dict.setdefault(prot, []).append((a, b))

    for prot in prot_dict:
        dn = '%s/%s.fa' % (outdir, prot)
        alist = prot_dict[prot]
        # one record per column: '<sp>_cp1' from column 0, '<sp>_cp2' from
        # column 1
        for i in range(2):
            seq = ''.join(each[i] for each in alist)
            name = '%s_cp%s' % (sp, i+1)
            fasta = '>%s\n%s\n' % (name, seq)
            cmn.append_file(fasta, dn)

    print('finish processing %s' % fmap)
        name, iii, subName = record2name(record)
        q = record.qual[i:j]
        s = record.seq[i:j]
        if iii == 1:
            s = reverse_strand(s)
            q = q[::-1]
        fq = '@%s\n%s\n+\n%s\n' % (subName, s, q)
        print(name, subName, i, j, record.seq, s)
        if name not in rdict:
            rdict[name] = [None, None]

        rdict[name][iii] = fq

    print('Nbad: %s; Ntotal %s;' % (Nbad, Ntotal))
    fq1 = '%s_R1.fq' % outlabel
    fq2 = '%s_R2.fq' % outlabel
    fsingle = '%s_singleton.fq' % outlabel
    for fn in [fq1, fq2, fsingle]:
        cmn.run('rm %s' % fn)

    for name in rdict:
        alist = rdict[name]
        if alist.count(None) == 0:
            cmn.append_file(alist[0], fq1)
            cmn.append_file(alist[1], fq2)
        else:
            for each in alist:
                if each != None:
                    cmn.append_file(each, fsingle)
    cmn.run('rm bait_insertion 2> /dev/null')
    hasInsertion = False
    for key in indel_dict:
        print(key, len(indel_dict[key]))
        leftI, rightI = key
        cov = (pCoverage[leftI] + pCoverage[rightI]) / 2.0
        indel_depth = len(indel_dict[key])
        insertion_info = [key, indel_depth, cov]
        if indel_depth > 0.5 * cov:
            hasInsertion = True
            count_dict = Counter(indel_dict[key])
            maxChar = max(count_dict, key=lambda x: count_dict[x])
            insertion_info.append(maxChar)
            insertion_info = '\t'.join(map(str, insertion_info))
            cmn.append_file(insertion_info + '\n', 'bait_insertion')

            print('insert between ', insertion_info)
            for name in bait_names:
                bait_dict[name][leftI] += maxChar

    #just undergo one round of adding gap
    if not hasInsertion:
        print('No need to re-run bwa because no insertion in query')
    else:
        N = cmn.cpu_check()
        print('re-run bwa due to insertion')
        frefs = update_baits(bait_dict)

        fq_groups = group_fq(cmn.file2lines('fqlist'))
def get_query_sequence(seqDict, genus, sp):
    """Select a sequence from ``seqDict`` for ``genus``/``sp``.

    Returns ``(fasta, qlen)``: ``fasta`` is ">name\\nseq\\n" for the selected
    entry and ``qlen`` is the sequence length after removing every 'N'.

    A name matching "<genus>_<sp>" case-insensitively is returned at once.
    Otherwise the candidates are the names whose first '_'-separated token is
    the genus (or every name when none qualifies), narrowed by ``sp`` when
    several remain, then narrowed to names starting with '*' or, failing
    that, containing '*'.  ``max`` over the sequence length picks the final
    one.

    'pickingLog.txt' is removed with ``cmn.run`` before the genus check and
    receives 'noGenus' / 'noSpecies' when the respective lookup fails.
    """
    all_names = list(seqDict.keys())

    #try to look up the exact match first
    target = ('%s_%s' % (genus, sp)).upper()
    hits = [candidate for candidate in all_names if candidate.upper() == target]
    if hits:
        chosen = hits[0]
        print('found exact match %s' % chosen)
        sequence = seqDict[chosen]
        record = '>%s\n%s\n' % (chosen, sequence)
        return record, len(sequence.replace('N', ''))

    #look it up in other files
    candidates = [
        candidate for candidate in all_names
        if candidate.upper().split('_')[0] == genus.upper()
    ]

    cmn.run('rm pickingLog.txt')
    if not candidates:  #sp is just 'sp'
        print('can not find barcode for genus keyword "%s"' % genus)
        candidates = all_names
        cmn.write_file('noGenus\n', 'pickingLog.txt')

    if len(candidates) > 1:
        #try to refine it
        with_species = [
            candidate for candidate in candidates
            if sp.upper() in candidate.upper().split('_')
        ]
        if with_species:
            candidates = with_species
        else:
            cmn.append_file('noSpecies\n', 'pickingLog.txt')

    #try to see if type species is there: leading '*' first, any '*' second
    for has_marker in (lambda n: n[0] == '*', lambda n: '*' in n):
        marked = [candidate for candidate in candidates if has_marker(candidate)]
        if marked:
            candidates = marked
            break

    # NOTE(review): swapping 'N' for '-' does not change the length, so the
    # key below is simply the raw sequence length -- confirm intent.
    chosen = max(candidates, key=lambda n: len(seqDict[n].replace('N', '-')))
    sequence = seqDict[chosen]
    record = '>%s\n%s\n' % (chosen, sequence)
    query_len = len(sequence.replace('N', ''))
    print('pick %s for %s %s' % (chosen, genus, sp))
    return record, query_len