示例#1
0
def make_ad_dict(fn):
    """Map each sample listed in *fn* to a sequence from the adaptor table.

    The adaptor table lives at a fixed path and is read with
    ``cmn.file2lines``; every line must hold three whitespace-separated
    fields: index, barcode, sequence.

    Each line of *fn* that splits into exactly two fields is read as
    ``<sample> <index-or-barcode>``. Lines with any other number of fields
    are skipped. An all-digit second field is looked up by index, anything
    else by barcode.

    Returns:
        dict mapping sample -> sequence.

    Exits:
        Prints an error and exits with status 1 when the index or barcode
        is absent from the adaptor table.
    """
    fi = '/work/biophysics/mtang/SNP_calling/scripts/data/adaptor/index_and_adaptor'
    # sequences keyed by the numeric index column
    by_number = {}
    # sequences keyed by the barcode column
    by_barcode = {}
    for line in cmn.file2lines(fi):
        index, barcode, seq = line.strip().split()
        by_number[index] = seq
        by_barcode[barcode] = seq

    rdict = {}
    for line in cmn.file2lines(fn):
        fields = line.strip().split()
        # Only "<sample> <index>" lines are meaningful; skip everything else
        # explicitly instead of hiding all errors behind a bare except.
        if len(fields) != 2:
            continue
        sp, index = fields

        table = by_number if index.isdigit() else by_barcode
        try:
            ad1 = table[index]
        except KeyError:
            # Same report for unknown numeric indices and unknown barcodes.
            print('Error! can not find index info for %s' % line)
            # Non-zero status so calling scripts can detect the failure.
            sys.exit(1)

        rdict[sp] = ad1

    return rdict
示例#2
0
def parse_popDef(fpopdef):
    """Build a population-name -> lines mapping from a list of files.

    Every entry obtained from ``cmn.file2lines(fpopdef)`` is used as the
    path of another file. The key is ``cmn.lastName(path)`` with every
    ``'IDs'`` substring removed; the value is whatever
    ``cmn.file2lines(path)`` returns. Each path is printed as it is read.
    """
    populations = {}
    for id_path in cmn.file2lines(fpopdef):
        print(id_path)
        label = cmn.lastName(id_path).replace('IDs', '')
        populations[label] = cmn.file2lines(id_path)
    return populations
示例#3
0
def parse_popDef(fpopdef, freceipt, inclusion):
    """Build a population-name -> ID-set mapping, restricted to *inclusion*.

    Every entry obtained from ``cmn.file2lines(fpopdef)`` is used as a file
    path and printed. A path whose ``cmn.lastName`` equals that of
    *freceipt* is skipped. For the others, the key is the last name with
    ``'IDs'`` removed and trailing underscores stripped; the value is the
    set of first whitespace-separated fields of the file's lines,
    intersected with *inclusion* via ``&``.
    """
    skip_name = cmn.lastName(freceipt)
    populations = {}
    for id_path in cmn.file2lines(fpopdef):
        print(id_path)
        if cmn.lastName(id_path) != skip_name:
            label = cmn.lastName(id_path).replace('IDs', '').rstrip('_')
            first_fields = {row.split()[0] for row in cmn.file2lines(id_path)}
            populations[label] = first_fields & inclusion
    return populations
示例#4
0
def read_indel_info(fn):
    """Read indel records into a dict keyed by a pair of integers.

    Each line must hold exactly four tab-separated fields. The first field
    is a bracketed pair such as ``(3, 7)``: its first and last characters
    are dropped and the rest is split on ``', '`` into two integers. The
    fourth field is stored as the value; the two middle fields are unused.
    """
    indels = {}
    for row in cmn.file2lines(fn):
        pair_text, _, _, char = row.strip().split('\t')
        first, second = [int(part) for part in pair_text[1:-1].split(', ')]
        indels[(first, second)] = char
    return indels
def read_baits():
    """Load every ``baits/bait*.fa`` file into a name -> character-list dict.

    Files are enumerated by passing ``ls baits/bait*.fa`` to
    ``cmn.cmd2lines``. Each file must yield exactly two lines: the first,
    minus its leading character, becomes the key, and the second is stored
    as a list of its characters.
    """
    bait_seqs = {}
    for path in cmn.cmd2lines('ls baits/bait*.fa'):
        header, residues = cmn.file2lines(path)
        bait_seqs[header[1:]] = list(residues)
    return bait_seqs
示例#6
0
def read_baits(fn):
    """Read bait records from *fn*, adding primers where needed.

    Non-blank lines must hold three whitespace-separated fields: sp, name,
    seq. The length of every sequence is printed. A sequence of length 698
    is kept as is; one of length 658 is passed through ``add_primer``; any
    other length prints an error and calls ``sys.exit()``.

    If at least one sequence was not 698 long, *fn* is rewritten through
    ``cmn.write_file`` with tab-separated records holding the updated
    sequences.

    Returns:
        (dict keyed by ``'<sp>_<name>'``, dict keyed by ``name``), both
        mapping to the (possibly updated) sequence.
    """
    by_sample_and_name = {}
    by_name = {}
    rewritten = []
    needs_rewrite = False
    for line in cmn.file2lines(fn):
        if not line.strip():
            continue
        sp, name, seq = line.split()
        seq_len = len(seq)
        print(seq_len)
        if seq_len == 658:
            # fixable: hand the sequence to add_primer
            needs_rewrite = True
            seq = add_primer(seq)
        elif seq_len != 698:
            print("Error! didn't recognize the length of the bait %s %s" %
                  (sp, name))
            sys.exit()
        rewritten.append('\t'.join((sp, name, seq)) + '\n')
        by_sample_and_name['_'.join((sp, name))] = seq
        by_name[name] = seq

    if needs_rewrite:
        print('revise the input baits to add primer...')
        cmn.write_file(''.join(rewritten), fn)

    return by_sample_and_name, by_name
def read_baits(fn):
    """Read three-column bait records into a ``'<sp>_<name>'`` -> seq dict.

    Every line from ``cmn.file2lines(fn)`` must split on whitespace into
    exactly three fields (sp, name, seq).
    """
    baits = {}
    for record in cmn.file2lines(fn):
        sp, name, seq = record.split()
        baits['_'.join((sp, name))] = seq
    return baits
def isSameGenus(fn):
    """Classify a comparison report as ',takenD', ',diffGenus' or ',unknown'.

    The first line of *fn* is skipped. For each remaining line the genus of
    the 4th and 5th whitespace-separated fields is obtained via
    ``guessGenus`` and the 3rd field is parsed as an integer.

    Decision order:
      1. As soon as a line whose first field contains ``'_denovo'`` has the
         same genus in both fields, return ``',takenD'``.
      2. Otherwise, if every line has an integer of at most 20, return
         ``',takenD'`` when all 4th-field genera agree and
         ``',diffGenus'`` when they differ.
      3. In every other case return ``',unknown'``.
    """
    rows = cmn.file2lines(fn)[1:]
    close_genera = []

    for row in rows:
        fields = row.strip().split()
        found = guessGenus(fields[3])
        ref = guessGenus(fields[4])
        count = int(fields[2])
        if '_denovo' in fields[0] and found == ref:
            return ',takenD'
        if count <= 20:
            close_genera.append(found)

    # The genus comparison only applies when no line was filtered out.
    if len(close_genera) != len(rows):
        return ',unknown'

    distinct = len(set(close_genera))
    if distinct == 1:
        return ',takenD'
    if distinct > 1:
        return ',diffGenus'
    return ',unknown'
def read_aln(fn):
    """Read two-column ``name seq`` lines from *fn* into a dict.

    Every line must split on whitespace into exactly two fields; a later
    line with the same name replaces the earlier one.
    """
    pairs = (row.strip().split() for row in cmn.file2lines(fn))
    return {name: residues for name, residues in pairs}
def read_baitInfo(fadd):
    """Read three-column bait lines into a parsed-defline -> seq dict.

    Every line must split on whitespace into exactly three fields. The
    first is ignored, the second is passed through ``parse_name`` to form
    the key, and the third is the stored value.
    """
    info = {}
    for row in cmn.file2lines(fadd):
        _, raw_defline, seq = row.split()
        info[parse_name(raw_defline)] = seq
    return info
示例#11
0
def parse_blast_output(fn, rset):
    """Pick the first acceptable hit per query from a tabular BLAST file.

    Columns, per the original author's note:
    qseqid sseqid pident evalue qlen qstart qend slen sstart send

    A hit is accepted when its 4th column (evalue) is at most 0.001 and its
    3rd column (identity) is at least 50. Only the first accepted hit of a
    query is kept. The orientation flag is True when ``sstart <= send``,
    and is inverted for queries contained in *rset*.

    Returns:
        dict mapping qseqid -> (sseqid, isForward).
    """
    best_hits = {}
    for row in cmn.file2lines(fn):
        fields = row.strip().split()
        if not fields:
            continue
        qseqid, sseqid = fields[:2]
        if qseqid in best_hits:
            continue
        # evalue filter first, then identity filter
        if float(fields[3]) > 0.001 or float(fields[2]) < 50:
            continue

        sstart, send = [int(value) for value in fields[8:10]]
        # forward unless coordinates are reversed; flipped again for rset members
        is_forward = (sstart <= send) != (qseqid in rset)
        best_hits[qseqid] = (sseqid, is_forward)
    return best_hits
示例#12
0
def nonGap_char(fn):
    """Count sequence characters in *fn* that are neither ``-`` nor ``N``.

    Blank lines and lines whose first character is ``>`` are ignored; the
    remaining lines are stripped before counting.
    """
    total = 0
    for line in cmn.file2lines(fn):
        body = line.strip()
        if body == '' or line[0] == '>':
            continue
        # 'N' is treated exactly like a gap character
        total += len(body) - body.count('-') - body.count('N')
    return total
示例#13
0
def read_genus_info(fn):
    """Map the first column of *fn* to the 2nd ``_``-separated piece of the second.

    Every line must split on whitespace into exactly two fields
    (sp, defline); the stored value is ``defline.split('_')[1]``.
    """
    genera = {}
    for row in cmn.file2lines(fn):
        sp, defline = row.split()
        genera[sp] = defline.split('_')[1]
    return genera
示例#14
0
def parse_receiptInfo(fn, alist):
    """Return the lines of *fn* for the samples in *alist*, in that order.

    Lines are indexed by their first whitespace-separated field (a later
    line with the same field replaces an earlier one). A sample missing
    from the file raises ``KeyError``.
    """
    by_sample = {row.strip().split()[0]: row for row in cmn.file2lines(fn)}
    return [by_sample[sample] for sample in alist]
示例#15
0
def parse_fastqlist(fn):
    """Group the lines of *fn* by sample.

    The sample of a line is the part of ``cmn.lastName(line)`` before its
    first underscore. Returns a dict mapping sample -> list of lines in
    file order.
    """
    grouped = {}
    for path in cmn.file2lines(fn):
        sample = cmn.lastName(path).split('_')[0]
        grouped.setdefault(sample, []).append(path)
    return grouped
示例#16
0
def read_correct_info(fn):
    """Read correction records into a zero-based position -> character dict.

    Each line is split on tabs and its last three fields are read as
    (position, character, extra); the position is converted from one-based
    to zero-based and the extra field is ignored.
    """
    # Example line given by the original author:
    #   3614_mito       5       C
    # NOTE(review): with exactly three fields the first of the trailing
    # three would be the scaffold name and int() would fail -- confirm the
    # real files carry one more trailing column.
    # TODO: assuming only one scaffold (mito) is being worked on
    corrections = {}
    for row in cmn.file2lines(fn):
        position, char, _ = row.strip().split('\t')[-3:]
        corrections[int(position) - 1] = char
    return corrections
示例#17
0
def check_NA(label):
    """Return True when the last line of ``<label>_stat.report`` looks wrong.

    The warning flag is raised when that line does not split into exactly
    ten whitespace-separated fields, or when any field equals ``'NA'``.
    """
    report_lines = cmn.file2lines(label + '_stat.report')
    fields = report_lines[-1].strip().split()
    return len(fields) != 10 or 'NA' in fields
示例#18
0
def read_aln(fn):
    """Read ``name seq`` lines from *fn*, skipping blank lines.

    Returns:
        (dict mapping name -> seq, length of the last sequence read).
        The length is 0 when no record was read.
    """
    alignment = {}
    last_len = 0
    for row in cmn.file2lines(fn):
        fields = row.strip().split()
        if not fields:
            continue
        name, residues = fields
        alignment[name] = residues
        last_len = len(residues)

    return alignment, last_len
示例#19
0
def read_length_info(fn):
    """Read ``scaffold length`` lines into a dict and report the last scaffold.

    Blank lines are skipped. Every other line must split on whitespace into
    exactly two fields, the second being an integer.

    Returns:
        (dict mapping scaffold -> int length, name of the last scaffold if
        it contains ``'mito'``, otherwise ``''``). An input with no records
        yields ``({}, '')``.
    """
    adict = {}
    # Bound up front so an input without records no longer raises
    # UnboundLocalError at the 'mito' check below.
    scaf = ''
    for line in cmn.file2lines(fn):
        fields = line.strip().split()
        if not fields:
            continue
        scaf, length = fields
        adict[scaf] = int(length)

    # assume the last scaffold is mito
    if 'mito' not in scaf:
        scaf = ''
    return adict, scaf
示例#20
0
def old_get_goodIDs():
    """Collect the first field of every ``compare.check`` line containing 'takenD'.

    Returns a set of those first whitespace-separated fields.
    """
    return {
        row.strip().split()[0]
        for row in cmn.file2lines('compare.check')
        if 'takenD' in row
    }
示例#21
0
def get_names():
    """Build a sample -> normalized-name dict from all ``*.sampleData`` files.

    Files are enumerated with ``ls -tr`` through ``cmn.cmd2lines`` and read
    in that order, so a later file overwrites earlier entries for the same
    sample. For each line, the sample is the part of the first field after
    its last ``-``. The stored name is the line with every occurrence of
    the first field replaced by the sample, ``-`` turned into ``_``,
    parentheses removed, and whitespace runs joined by ``_``.
    """
    listing = cmn.cmd2lines('ls -tr /project/biophysics/Nick_lab/wli/sequencing/scripts/data/*.sampleData')
    names = {}
    for path in listing:
        for raw in cmn.file2lines(path):
            text = raw.strip()
            first_field = text.split()[0]
            sp = first_field.split('-')[-1]
            text = text.replace(first_field, sp)
            for old, new in (('-', '_'), ('(', ''), (')', '')):
                text = text.replace(old, new)
            names[sp] = '_'.join(text.split())
    return names
示例#22
0
def parse_protein_query(fn):
    """List the identifiers of all ``>`` header lines in *fn*, in file order.

    The identifier is the first whitespace-separated token after the
    leading ``>``. Blank lines and non-header lines are ignored.
    """
    return [
        row[1:].split()[0]
        for row in cmn.file2lines(fn)
        if row.strip() != '' and row[0] == '>'
    ]
示例#23
0
def parse_blastDict(fn):
    """Map each gene (first column) to the result of ``find_start_and_end``.

    Columns, per the original author's note:
    qseqid sseqid evalue pident qlen qstart qend slen sstart send

    Columns 5-10 are parsed as integers and handed to
    ``find_start_and_end``, whose three results are stored as a tuple.
    A later line for the same gene replaces the earlier one.
    """
    hits = {}
    for row in cmn.file2lines(fn):
        fields = row.strip().split()
        gene = fields[0]
        qlen, qstart, qend, slen, sstart, send = [
            int(value) for value in fields[4:10]
        ]
        first, second, is_reverse = find_start_and_end(
            qlen, qstart, qend, slen, sstart, send)
        hits[gene] = (first, second, is_reverse)
    return hits
示例#24
0
def parse_bait_file(fbait):
    """Read ``ID sp seq`` lines and return (first record, all records).

    Both results are dicts keyed by the second column (sp) with the third
    column as value. The first dict holds only the record from the first
    line; the second holds every record, later lines replacing earlier
    ones with the same key. The ID column is not used.
    """
    first_only = {}
    everything = {}
    for row in cmn.file2lines(fbait):
        _, sp, seq = row.strip().split()
        if not everything:
            first_only[sp] = seq
        everything[sp] = seq
    return first_only, everything
def parse_indel_file(fn):
    """Return the summed length of the last field of every line in *fn*.

    Each line is stripped and split on whitespace; the length of its final
    field is added to the total.
    """
    return sum(
        len(row.strip().split()[-1])
        for row in cmn.file2lines(fn)
    )
示例#26
0
def read_letter_info(fn):
    """Read per-position letter records into a zero-based position dict.

    Example line given by the original author:
        3614_mito	107	T	C	63	3614_mito_1

    Blank lines are skipped. The first four fields are read as
    (scaffold, position, char1, char2) and the last field as the phase.
    The scaffold is ignored, so records are keyed by position only.

    Returns:
        dict mapping ``position - 1`` -> (char1, char2, phase).
    """
    letters = {}
    for row in cmn.file2lines(fn):
        fields = row.strip().split()
        if not fields:
            continue
        # TODO: currently assumes only mito is present
        _, position, char1, char2 = fields[:4]
        letters[int(position) - 1] = (char1, char2, fields[-1])
    return letters
示例#27
0
def parse_refTable(fn):
    """Map each sample to the largest size among its reference files.

    The first whitespace-separated field of a line is the sample; every
    following field is passed on its own, wrapped in a list, to
    ``compute_fileSize``. The stored value is the maximum of those sizes,
    or 0 when the line lists no references.
    """
    sizes = {}
    for row in cmn.file2lines(fn):
        fields = row.strip().split()
        sample = fields[0]
        largest = 0
        for ref_path in fields[1:]:
            largest = max(compute_fileSize([ref_path]), largest)
        sizes[sample] = largest
    return sizes
示例#28
0
def backup_vcf_coverage(wdir):
    """Copy ``*_vcf.cov`` files from *wdir* into the fixed archive directory.

    A file is considered only when its last line has exactly six
    whitespace-separated fields; others are reported as old format and
    skipped. If a file of the same name already exists in the archive, it
    is replaced only when the second-to-last field of the new file's last
    line is greater than that of the archived file's last line. Copies are
    performed by running ``cp`` through ``cmn.run``.
    """
    archive_dir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/check_vcf_coverage'

    for cov_path in cmn.cmd2lines('ls %s/*_vcf.cov' % wdir):
        print('processing %s...' % cov_path)
        last_line = cmn.file2lines(cov_path)[-1]
        # only the new six-column format is backed up
        if len(last_line.strip().split()) != 6:
            print('skip old format file %s' % cov_path)
            continue

        base = cmn.lastName(cov_path)
        archived = '%s/%s' % (archive_dir, base)
        if os.path.exists(archived):
            print('merging new and old data for %s' % base)
            old_cov = float(cmn.file2lines(archived)[-1].split()[-2])
            new_cov = float(last_line.split()[-2])
            # keep the archived copy unless the new value is strictly higher
            if not new_cov > old_cov:
                continue
        cmn.run('cp %s %s' % (cov_path, archived))
def read_nickMade_barcodes(fn):
    """Read a FASTA-like barcode file into a name -> sequence dict.

    Blank lines and lines whose first character is ``#`` are ignored. A
    line whose first character is ``>`` opens a record; its key is the
    header text after ``>``, stripped, up to the first underscore. The
    following lines are that record's sequence: they are stripped and
    concatenated. A later record with the same key replaces the earlier
    one. A header that is not followed by any sequence line produces no
    entry.

    Raises:
        ValueError: if a sequence line appears before any header.
    """
    adict = {}
    name = None
    starts_record = False
    for line in cmn.file2lines(fn):
        if line.strip() == '':
            continue
        if line[0] == '#':
            continue

        if line[0] == '>':
            name = line[1:].strip().split('_')[0]
            starts_record = True
            # The header is not sequence data: do not fall through and
            # store it as the record's sequence.
            continue

        if name is None:
            raise ValueError(
                'sequence line found before any ">" header in %s' % fn)

        seq = line.strip()
        if starts_record:
            # First sequence line of this record replaces any previous
            # record that used the same key.
            adict[name] = seq
            starts_record = False
        else:
            # Continuation line of a multi-line sequence.
            adict[name] += seq
    return adict
def parse_inserted_gap(ID, seq, label):
    """Try to repair *seq* with blastn when a bait-insertion file exists.

    Nothing happens unless ``sampleRun_<ID>/bait_insertion`` exists
    according to ``cmn.filexist``. Otherwise *seq*, with ``-`` replaced by
    ``N`` and outer ``N`` runs stripped, is written to ``tmpInput.fa`` and
    searched with blastn; the tabular output goes to ``tmpBr_<label>.txt``.

    The first hit whose (sstart, send, qstart) is (1, 658, 21) or
    (2, 655, 22) ends the search. For that hit, the query characters
    aligned to non-gap subject characters are collected; if their count is
    658 or 654 respectively, they replace ``seq[qstart - 1:qend]``.

    When no repair was made, ``<ID>\\t<label>`` is appended to
    ``cannot_fixed_indel.txt``.

    Returns:
        The repaired sequence, or *seq* unchanged.
    """
    fn = 'sampleRun_%s/bait_insertion' % ID
    if not cmn.filexist(fn):
        return seq

    # check what the right range of the sequence is
    print('runing blast to fix %s' % ID)
    fquery = 'tmpInput.fa'
    cmn.write_file('>input\n%s\n' % seq.replace('-', 'N').strip('N'), fquery)
    dn = 'tmpBr_%s.txt' % label
    cmd = (
        'blastn -query %s -db /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa ' % fquery
        + "-task blastn-short -dust no -outfmt '6 qseqid sseqid qstart qend sstart send evalue pident qseq sseq' -out %s" % dn
    )
    cmn.run(cmd)

    # (sstart, send, qstart) -> number of query characters the hit must keep
    accepted = {(1, 658, 21): 658, (2, 655, 22): 654}

    isFixed = False
    for line in cmn.file2lines(dn):
        items = line.strip().split()
        qstart, qend, sstart, send = [int(value) for value in items[2:6]]
        expected = accepted.get((sstart, send, qstart))
        if expected is None:
            continue

        qseq, sseq = items[-2:]
        # drop query characters that face a gap in the subject
        kept = ''.join(q for q, s in zip(qseq, sseq) if s != '-')
        if len(kept) == expected:
            seq = seq[:qstart - 1] + kept + seq[qend:]
            print('solution found for %s' % ID)
            isFixed = True
        # the first matching hit decides, whether or not it was usable
        break

    if not isFixed:
        cmn.append_file('%s\t%s\n' % (ID, label), 'cannot_fixed_indel.txt')
    return seq