示例#1
0
def alnDict2output(aln_dict, dn, order='sorting'):
    """Write an alignment dictionary to ``dn`` as left-justified text rows.

    Each row is ``<name padded to the longest name>    <alignment>``.
    Alignments are written exactly as stored; no gap padding to a common
    length is applied.

    Args:
        aln_dict: mapping of sequence name -> alignment (anything that
            ``''.join`` accepts).
        dn: path of the output file.
        order: ``'sorting'`` ranks names by ``number4sorting`` of their
            alignment, ``'grouping'`` ranks them by ``group_by_spnames`` of
            the name, any other value sorts the names themselves.

    Returns:
        None. For an empty ``aln_dict`` the output file is only touched.
    """
    if len(aln_dict) == 0:
        cmn.run('touch %s' % dn)
        return None

    # Column width of the name field is the longest key.
    row_template = '{:<%s}' % max(len(key) for key in aln_dict)

    if order == 'sorting':
        ordered = sorted(aln_dict,
                         key=lambda key: number4sorting(aln_dict[key]))
    elif order == 'grouping':
        # used to output inconsistent groups: rank by grouping of species IDs
        ordered = sorted(aln_dict, key=group_by_spnames)
    else:
        ordered = sorted(aln_dict)

    rows = [
        '%s    %s\n' % (row_template.format(key), ''.join(aln_dict[key]))
        for key in ordered
    ]
    cmn.write_file(''.join(rows), dn)
示例#2
0
def read_baits(fn):
    """Load a whitespace-separated bait table (species, name, sequence).

    Sequences of length 698 are taken as-is. Sequences of length 658 are
    passed through ``add_primer`` and the input file is rewritten with the
    extended sequences. Any other length prints an error and exits.

    Args:
        fn: path of the bait table; blank lines are ignored.

    Returns:
        A pair of dicts: ``'<species>_<name>' -> sequence`` and
        ``name -> sequence``.
    """
    by_species_and_name = {}
    by_name = {}
    rewritten_rows = []
    needs_rewrite = False

    for row in cmn.file2lines(fn):
        if not row.strip():
            continue
        sp, name, seq = row.split()
        print(len(seq))
        if len(seq) != 698:
            needs_rewrite = True
            if len(seq) != 658:
                print('Error! didn\'t recognize the length of the bait %s %s' %
                      (sp, name))
                sys.exit()
            # fixable: extend the 658-long sequence via add_primer
            seq = add_primer(seq)
        rewritten_rows.append('%s\t%s\t%s\n' % (sp, name, seq))
        by_species_and_name['%s_%s' % (sp, name)] = seq
        by_name[name] = seq

    if needs_rewrite:
        print('revise the input baits to add primer...')
        cmn.write_file(''.join(rewritten_rows), fn)

    return by_species_and_name, by_name
示例#3
0
def prune_tree(ftree, fseq):
    """Prune the tree in ``ftree`` down to the IDs found in FASTA ``fseq``.

    The IDs are whatever follows the first '>' on each defline, as extracted
    by ``grep``/``cut``. The pruned tree is written in ete3 format 1 to
    ``prune_tree.tre`` in the current directory.

    Returns:
        The output file name, ``'prune_tree.tre'``.
    """
    out_path = 'prune_tree.tre'
    tree = ete3.Tree(ftree)
    keep_ids = cmn.cmd2lines('grep ">" %s|cut -d ">" -f 2' % fseq)
    tree.prune(keep_ids)
    newick = tree.write(format=1)
    cmn.write_file(newick, out_path)
    return out_path
def makeBlastDatabase(seqDict):
    """Dump ``seqDict`` to ``db4picking.fa`` and build a nucleotide BLAST db.

    Entries whose sequence is empty after stripping leading/trailing
    'N', '-' and 'X' characters are left out of the FASTA file.

    Returns:
        The FASTA/database path, ``'db4picking.fa'``.
    """
    dn = 'db4picking.fa'
    records = []
    for name in seqDict:
        sequence = seqDict[name]
        if sequence.strip('N-X') == '':
            continue
        records.append('>%s\n%s\n' % (name, sequence))
    cmn.write_file(''.join(records), dn)
    cmn.run('module add blast; makeblastdb -dbtype=nucl -in=%s' % dn)
    return dn
示例#5
0
def do_barcode_blast(sequence):
    """BLAST a FASTA record against the species barcode mapping database.

    The query is written to ``/tmp/<label>.fa`` where the label is derived
    from the first whitespace-delimited token of ``sequence`` (leading
    character dropped, cut at '|' and '[', with '*' and quotes removed).
    The query file is left in place after the search.

    Args:
        sequence: FASTA text of the query record.

    Returns:
        Output lines of ``blastn`` in tabular format
        ``sseqid qlen slen length pident``.
    """
    fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'

    defline_token = sequence.split()[0]
    namelabel = defline_token[1:].split()[0].split('|')[0]
    namelabel = namelabel.replace('*', '').split('[')[0]
    namelabel = namelabel.replace('"', '').replace("'", '')

    fquery = '/tmp/%s.fa' % namelabel
    cmn.write_file(sequence, fquery)

    search = 'module add blast; blastn -query %s -db %s ' % (fquery, fdb)
    output_format = '-outfmt \'6 sseqid qlen slen length pident\''
    return cmn.cmd2lines(search + output_format)
def update_baits(bait_dict):
    """Write every bait to ``baits/bait<i>.fa`` and build a bwa index for it.

    Args:
        bait_dict: mapping of bait name -> sequence (anything ``''.join``
            accepts). The ``baits`` directory is not created here.

    Returns:
        Mapping of bait name -> path of its FASTA file.
    """
    fasta_by_name = {}
    for idx, name in enumerate(bait_dict):
        label = 'bait%s' % idx
        fasta_path = 'baits/%s.fa' % label
        record = '>%s\n%s\n' % (name, ''.join(bait_dict[name]))
        cmn.write_file(record, fasta_path)
        # index files are prefixed with the label (bwa -p)
        cmn.run('module add bwa; bwa index %s -p %s' % (fasta_path, label))
        fasta_by_name[name] = fasta_path
    return fasta_by_name
示例#7
0
def output_matrix(arr, fn=None):
    """Flatten an array into tab-separated ``index... value`` lines.

    Every element becomes one line holding its index along each axis
    followed by the element itself, all separated by tabs.

    Args:
        arr: a numpy array of any dimensionality.
        fn: optional output path. When given, the text is written there.

    Returns:
        The newline-joined text when ``fn`` is None, otherwise None.
    """
    lines = [
        # each index component is followed by a tab, then the value
        '%s%s' % (''.join('%s\t' % position for position in index), content)
        for index, content in np.ndenumerate(arr)
    ]
    text = '\n'.join(lines)
    if fn is not None:
        cmn.write_file(text, fn)
        return None
    return text
def parse_inserted_gap(ID, seq, label):
    """Try to repair an inserted gap in ``seq`` using a BLAST alignment.

    Nothing is done unless ``sampleRun_<ID>/bait_insertion`` exists. When it
    does, the gap-stripped sequence is searched with ``blastn-short`` against
    the barcode verification database, and the first hit with one of two
    expected coordinate patterns is used to rebuild the aligned region with
    the query characters that face a subject gap removed.

    Args:
        ID: sample identifier; used for the marker path and log messages.
        seq: sequence string to repair.
        label: tag used in the BLAST output file name and the failure log.

    Returns:
        The repaired sequence, or ``seq`` unchanged when no fix was applied.
        Failures are appended to ``cannot_fixed_indel.txt``.
    """
    fn = 'sampleRun_%s/bait_insertion' % ID
    #if cmn.filexist(fn) or ('N' in seq.replace('-', 'N').strip('N')):
    if cmn.filexist(fn):
        #lines = cmn.file2lines(fn)
        #lines = sorted(lines, key=lambda x: int(x.split(',')[0][1:]))
        #Ngap = 0
        #for line in lines:
        #    items = line.strip().split()
        #    Ngap += len(items[-1])

        #check what is the right range of sequence
        print('runing blast to fix %s' % ID)
        # Treat '-' as 'N' and trim leading/trailing N before searching.
        checkSeq = seq.replace('-', 'N').strip('N')
        # NOTE(review): the query file name is fixed, so calls running in the
        # same directory share it — confirm this is never run concurrently.
        fquery = 'tmpInput.fa'
        fasta = '>input\n%s\n' % checkSeq
        cmn.write_file(fasta, fquery)
        dn = 'tmpBr_%s.txt' % label
        cmd = 'blastn -query %s -db /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa ' % fquery
        cmd += '-task blastn-short -dust no -outfmt \'6 qseqid sseqid qstart qend sstart send evalue pident qseq sseq\' -out %s' % dn
        cmn.run(cmd)
        isFixed = False
        for line in cmn.file2lines(dn):
            items = line.strip().split()
            #print items
            # Columns 2-5 of the tabular output: qstart, qend, sstart, send.
            qstart, qend, sstart, send = list(map(int, items[2:6]))
            # Pattern 1: subject covered 1..658 with the query starting at 21.
            if sstart == 1 and send == 658 and qstart == 21:
                qseq, sseq = items[-2:]
                # Keep query characters only where the subject has no gap.
                new = [
                    char1 for char1, char2 in zip(qseq, sseq) if char2 != '-'
                ]
                if len(new) == 658:
                    # NOTE(review): qstart/qend index checkSeq, but are applied
                    # to seq here; they coincide only if nothing was stripped
                    # from the left of seq — confirm.
                    seq = seq[:qstart - 1] + ''.join(new) + seq[qend:]
                    print('solution found for %s' % ID)
                    isFixed = True
                # Stop at the first hit with these coordinates, fixed or not.
                break
            # Pattern 2: subject covered 2..655 with the query starting at 22.
            if sstart == 2 and send == 655 and qstart == 22:
                qseq, sseq = items[-2:]
                new = [
                    char1 for char1, char2 in zip(qseq, sseq) if char2 != '-'
                ]
                if len(new) == 654:
                    seq = seq[:qstart - 1] + ''.join(new) + seq[qend:]
                    print('solution found for %s' % ID)
                    isFixed = True
                # Stop at the first hit with these coordinates, fixed or not.
                break
        if not isFixed:
            # Record samples that could not be repaired.
            cmn.append_file('%s\t%s\n' % (ID, label), 'cannot_fixed_indel.txt')
    return seq
示例#9
0
def parse_ref(seqDict):
    """Write each reference to ``baits/bait<i>.fa`` and bwa-index it.

    Names are cleaned before use: '*' is removed and double quotes are
    replaced by single quotes. The ``baits`` directory is created first.

    Args:
        seqDict: mapping of reference name -> sequence string.

    Returns:
        Mapping of cleaned name -> path of its FASTA file.
    """
    cmn.mkdir('baits')

    fasta_by_name = {}
    for idx, raw_name in enumerate(seqDict):
        label = 'bait%s' % idx
        fasta_path = 'baits/%s.fa' % label
        clean_name = raw_name.replace('*', '').replace('"', "'")
        cmn.write_file('>%s\n%s\n' % (clean_name, seqDict[raw_name]),
                       fasta_path)
        cmn.run('module add bwa; bwa index %s -p %s' % (fasta_path, label))
        fasta_by_name[clean_name] = fasta_path
    return fasta_by_name
def do_barcode_blast(sequence, seqDict):
    """BLAST a FASTA record against a database built from ``seqDict``.

    The database is rebuilt with ``makeBlastDatabase`` on every call. The
    query is written to ``/tmp/<label>.fa`` (label derived from the first
    token of ``sequence``, with '/' replaced by '_') and removed afterwards.

    Args:
        sequence: FASTA text of the query record.
        seqDict: mapping of name -> sequence used as the search database.

    Returns:
        Output lines of ``blastn`` in tabular format
        ``sseqid qlen slen length pident``.
    """
    fdb = makeBlastDatabase(seqDict)

    defline_token = sequence.split()[0]
    namelabel = defline_token[1:].split()[0].split('|')[0]
    namelabel = namelabel.replace('*', '').split('[')[0]
    namelabel = namelabel.replace('"', '').replace("'", '')
    # '/' would otherwise be read as a directory separator in the temp path
    namelabel = namelabel.replace('/', '_')

    fquery = '/tmp/%s.fa' % namelabel
    cmn.write_file(sequence, fquery)

    search = 'module add blast; blastn -query %s -db %s ' % (fquery, fdb)
    output_format = '-outfmt \'6 sseqid qlen slen length pident\''
    hits = cmn.cmd2lines(search + output_format)

    cmn.run('rm %s' % fquery)
    return hits
def get_mash_file(name, seq):
    """Return the mash sketch path for ``name``, creating it on first use.

    Sketch paths are cached in the module-level ``mash_file_dict``. On a
    cache miss the sequence (with '-' and 'N' removed) is written to
    ``/tmp/<name>`` and sketched with ``cpu`` threads.

    Args:
        name: sequence identifier, also used as the temp file name.
        seq: sequence as a string or an iterable of strings.

    Returns:
        Path of the ``.msh`` sketch file.
    """
    try:
        return mash_file_dict[name]
    except KeyError:
        pass

    fasta_path = '/tmp/%s' % name
    cleaned = ''.join(seq).replace('-', '').replace('N', '')
    cmn.write_file('>%s\n%s\n' % (name, cleaned), fasta_path)
    cmn.run('/home2/wli/local/mash-Linux64-v1.1.1/mash sketch -n -p %s %s' % (
        cpu, fasta_path))

    sketch_path = fasta_path + '.msh'
    mash_file_dict[name] = sketch_path
    return sketch_path
示例#12
0
def do_barcode_blast(sequence):
    """Run an ungapped BLAST of a FASTA record against the verify database.

    The query goes to ``/tmp/<label>.fa`` and the result to
    ``/tmp/<label>.fa.br``; both files are removed once the result has been
    read. The label is the first token of ``sequence`` without its leading
    character, cut at '|', '[', '(', '/' and '\\', with '*' and quotes
    removed.

    Args:
        sequence: FASTA text of the query record.

    Returns:
        Lines of the tabular result with columns
        ``sseqid slen length pident qstart qend qseq sseq``.
    """
    fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa'

    defline_token = sequence.split()[0]
    namelabel = defline_token[1:].split()[0].split('|')[0]
    namelabel = namelabel.replace('*', '').split('[')[0]
    namelabel = namelabel.replace('"', '').replace("'", '')
    namelabel = namelabel.split('(')[0].split('/')[0].split('\\')[0]

    fquery = '/tmp/%s.fa' % namelabel
    fbr = fquery + '.br'
    cmn.write_file(sequence, fquery)

    pieces = [
        'module add blast; blastn -max_target_seqs 5000 -query %s -db %s -ungapped ' % (fquery, fdb),
        '-outfmt \'6 sseqid slen length pident qstart qend qseq sseq\'',
        ' -out %s ' % fbr,
    ]
    cmn.run(''.join(pieces))

    hits = cmn.file2lines(fbr)
    for leftover in (fquery, fbr):
        cmn.run('rm %s' % leftover)
    return hits
示例#13
0
def make_index_header(fvcf, flen):
    """Write ``index_header``: one ``scaffold<TAB>position`` row per base.

    Scaffolds are taken in order of first appearance among the lines of
    ``fvcf`` that start with 'scaffold'; positions run from 1 to the
    scaffold length given by ``read_length_info(flen)``.

    Args:
        fvcf: path of the VCF-like file listing scaffold names in column 1.
        flen: path passed to ``read_length_info`` to obtain scaffold lengths.
    """
    lendict = read_length_info(flen)

    seen = set()
    scaffolds_in_order = []
    with open(fvcf) as handle:
        for row in handle:
            if row.startswith('scaffold'):
                scaf = row.split()[0]
                if scaf in seen:
                    continue
                seen.add(scaf)
                scaffolds_in_order.append(scaf)

    rows = []
    for scaf in scaffolds_in_order:
        for position in range(1, lendict[scaf] + 1):
            rows.append('%s\t%s\n' % (scaf, position))

    cmn.write_file(''.join(rows), 'index_header')
示例#14
0
def check_difference(seq1, seq2):
    """Count the differences between two sequences.

    Equal-length inputs are compared position by position, ignoring any
    position where either character is in the module-level ``gapChars``.
    Otherwise the two sequences are aligned with ``blastn`` and the
    difference is taken from the first "Identities = a/b" figure.

    NOTE(review): the BLAST branch reads ``ID``, which is not a parameter of
    this function, so it must exist in an enclosing scope — confirm.

    Returns:
        The number of mismatching positions.
    """
    print(len(seq1), len(seq2))

    if len(seq1) != len(seq2):
        cmn.write_file(seq1, 'tmpSeq1.fa')
        cmn.write_file(seq2, 'tmpSeq2.fa')
        report = cmn.cmd2info('blastn -query tmpSeq1.fa -subject tmpSeq2.fa')
        # the report contains e.g. "Identities = 656/656 (100%)"
        fraction = cmn.find_between(report, 'Identities = ', ' (')
        matched, aligned = list(map(int, fraction.split('/')))
        cmn.write_file(report, 'checkTmp%s.br' % (ID))
        return aligned - matched

    mismatches = 0
    for left, right in zip(seq1, seq2):
        if left in gapChars or right in gapChars:
            continue
        if left != right:
            mismatches += 1
    return mismatches
示例#15
0
    # The only argument is the input FASTA alignment.
    try:
        fn = sys.argv[1]
    except IndexError:
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    N = 1000000  # number of alignment positions to sample
    # Integer division keeps the label at "1M"; true division would give
    # "1.0M" on Python 3 and carry the ".0" into the output file name.
    label = '%sM' % (N // 1000000)

    seqDict, length = read_fa(fn)

    # Sampling without replacement needs at least N positions.
    if length < N:
        print('sequence length is shorter than %s, exit!' % N)
        sys.exit()

    # Draw N distinct column indices, then restore left-to-right order so
    # every sequence is subsampled at the same columns in the same order.
    positions = sorted(random.sample(range(length), N))

    new = []
    for name in seqDict:
        seq = seqDict[name]

        newSeq = [seq[i] for i in positions]

        fasta = '>%s\n%s\n' % (name, ''.join(newSeq))
        new.append(fasta)

    # Output name: input base name without .fasta/.fa plus "_rd<label>.fa".
    dn = cmn.lastName(fn).replace('.fasta', '').replace(
        '.fa', '') + '_rd%s.fa' % (label)
    cmn.write_file(''.join(new), dn)
示例#16
0
    hasCombined = True
    if len(old_fastqs) == 0:  # no old data
        print('no old libs found for %s' % label)
        cmn.run('ln -s %s' % dn)
    else:  #has old data
        print('combining old libs for %s' % label)
        old_fastqs, dup_fastqs = remove_duplication(old_fastqs)
        cmn.run('cp %s %s' % (dn, wdir))
        log_info.append('%s\t%s\n' % (label, dn))
        comb_fn = '%s/%s' % (wdir, cmn.lastName(dn))
        for old_fastq in old_fastqs:
            cmn.run('cat %s >> %s' % (old_fastq, comb_fn))
            log_info.append('%s\t%s\n' % (label, old_fastq))

# Write the accumulated library log only when hasCombined is truthy.
if hasCombined:
    cmn.write_file(''.join(log_info), '%s/combined_libs.log' % wdir)

#make statistics for data amount

# group_fastq is defined outside this view; its result is used as a mapping
# of key -> sequence of fastq file names.
fastq_groups = group_fastq(fastqs)

# Build one check_fastq_size.py command line per group, passing the group key
# and a comma-separated list of its files.
new = []
for key in fastq_groups:
    fns = fastq_groups[key]
    cmd = 'python /work/biophysics/mtang/SNP_calling/scripts/check_fastq_size.py %s %s' % (
        key, ','.join(fns))
    new.append(cmd)

# An empty final entry is appended before the command list is written out.
new.append('')
cmn.write_lines(new, 'fastq_amount.cmds')
示例#17
0
            #for tacc, copy fastq to wdir
            print('copying fastq for %s...' % olabel)
            for fastq in fastqs:
                cmd = 'cp %s %s' % (fastq, wdir)
                cmn.run(cmd)

            fastqs = [cmn.lastName(fastq) for fastq in fastqs]
            cmd, fnewSams = make_bwa_cmds(fastqs, 'assembly_selfref_v2', wdir)
            info = info.replace('[WL_mapping_cmds]', cmd)
            #7. merge mapped sams
            cmd = 'python /work/00412/mtang/sequencing/scripts/merge_mapped_sams.py %s_step2.sam %s' % (
                olabel, ' '.join(fnewSams))
            info = info.replace('[WL_merge_sam_cmds]', cmd)

            #8. re-run gatk
            #this step has been fully included in the template
            fjob = 'job_files/gatk%s.job' % olabel
            cmn.write_file(info, fjob)

            fjobs.append(fjob)

    print(
        '##########################################################################'
    )
    print('please use the following cmd to submit unfinished jobs')
    print('cd job_files')
    print('\n'.join(['sbatch %s' % cmn.lastName(fjob) for fjob in fjobs]))
    print(
        '##########################################################################'
    )
def get_query_sequence(seqDict, genus, sp):
    """Pick the reference barcode to use as the query for ``genus`` ``sp``.

    Selection order:
      1. a name equal to ``<genus>_<sp>`` (case-insensitive) wins outright;
      2. otherwise names whose first '_'-separated token equals the genus,
         narrowed to those that also carry the species token;
      3. if the genus or the species could not be matched, ``auto_rebait.py``
         is run and the candidates are read from ``picked_bait.txt``;
      4. candidates marked with '*' are preferred, and the longest sequence
         among the remaining candidates is chosen.

    Side effects: removes, writes and appends to ``pickingLog.txt`` in the
    current directory and may run the external ``auto_rebait.py`` script.

    Args:
        seqDict: mapping of reference name -> sequence string.
        genus: genus keyword to look for.
        sp: species keyword to look for.

    Returns:
        ``(fasta, qlen)``: the chosen record as FASTA text and the length of
        its sequence with every 'N' removed.
    """
    #1. anything in Eudamine file has higher priority
    #fEud = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/Eudaminae-barcode-reference.txt'
    #cmd = 'grep %s %s' % (sp, fEud)
    #lines = cmn.cmd2lines(cmd)
    #if len(lines) == 1:
    #    name = lines[0].split()[0]
    #    seq = seqDict[name]
    #    fasta = '>%s\n%s\n' % (name, seq)
    #    qlen = len(seq.replace('N', ''))
    #    print 'pick %s for %s %s' % (name, genus, sp)
    #    return fasta, qlen

    names = list(seqDict.keys())
    #try to look up the exact match first
    expected_name = '%s_%s' % (genus, sp)
    tmp = [name for name in names
        if name.upper() == expected_name.upper()]

    if len(tmp) != 0:
        # The first exact match is returned immediately.
        name = tmp[0]
        print('found exact match %s' % name)
        seq = seqDict[name]
        fasta = '>%s\n%s\n' % (name, seq)
        qlen = len(seq.replace('N', ''))
        return fasta, qlen


    #look it up in other files
    # Candidates are names whose first '_'-separated token equals the genus.
    good_names = [name for name in names
    #        if genus.upper() in name.upper().split('_')]
            if genus.upper() == name.upper().split('_')[0]]

    useGenus = False
    if len(good_names) > 0:
        useGenus = True

    # Start from a clean log; its existence later triggers automatic picking.
    cmn.run('rm pickingLog.txt 2> /dev/null')
    if len(good_names) == 0:#sp is just 'sp'
        print('can not find barcode for genus keyword "%s"' % genus)
        # No genus match: fall back to every known name.
        good_names = names
        cmn.write_file('noGenus\n', 'pickingLog.txt')

    if len(good_names) > 1:
        #try to refine it
        tmp = [name for name in good_names
                if sp.upper() in name.upper().split('_')]
        if len(tmp) != 0:
            good_names = tmp
        else:
            cmn.append_file('noSpecies\n', 'pickingLog.txt')

    #############################################
    ####new here, auto pick sequences for those has no info
    #############################################
    if cmn.filexist('pickingLog.txt'):
        print('automatically pick bait by fastq similarity')
        fsp = 'restricted_genus.info'
        # The genus is passed to the script only when it matched and no
        # 'restricted_genus.info' file exists.
        if useGenus and (not cmn.filexist(fsp)):
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist %s' % genus
        else:
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist '
        cmn.run(cmd)
        # NOTE(review): the names read here are later used as seqDict keys —
        # confirm picked_bait.txt only ever lists names present in seqDict.
        good_names = cmn.file2lines('picked_bait.txt')
        # Overwrites whatever was logged before with a single marker line.
        cmn.write_file('pickClosed\n', 'pickingLog.txt')


    #############################################
    #############################################
    #############################################

    #try to see if type species is there
    # Names starting with '*' take priority over names merely containing it.
    tmp = [name for name in good_names
            if name[0] == '*']
    if len(tmp) != 0:
        good_names = tmp
    else:
        tmp = [name for name in good_names
            if '*' in name]
        if len(tmp) != 0:
            good_names = tmp

    #then randomly pick one, get the max length ones
    # max() returns the first longest sequence (replacing 'N' by '-' does not
    # change the length) and raises ValueError if good_names is empty.
    name = max(good_names, key=lambda x: len(seqDict[x].replace('N', '-')))
    #name = name.replace('/', '_')
    seq = seqDict[name]
    fasta = '>%s\n%s\n' % (name, seq)
    qlen = len(seq.replace('N', ''))
    print('pick %s for %s %s' % (name, genus, sp))
    return fasta, qlen
示例#19
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



if __name__=='__main__':
    #options=parse_options()
    # Exactly two positional arguments are expected: the tree and the FASTA.
    try:
        ftree, faln = sys.argv[1:]
    except ValueError:
        # Raised by the unpacking when the argument count is not two.
        print("Usage: *.py ftree fa", file=sys.stderr)
        sys.exit()

    tree = ete3.Tree(ftree)

    # Collect every record ID: the defline without '>' and surrounding space.
    with open(faln) as fp:
        takenIDs = [line[1:].strip() for line in fp if line.startswith('>')]

    # Keep only the leaves that are present in the alignment.
    tree.prune(takenIDs)

    # Output name: tree base name without '.tre' plus '_prune.tre'.
    dn = cmn.lastName(ftree).replace('.tre', '') + '_prune.tre'
    cmn.write_file(tree.write(), dn)
示例#20
0
        info = info.replace('[WL_sam_filelist]', dn)
        info = info.replace('[WL_preprocessing]', '\n'.join(step1cmds))

        #make snp call cmds
        #f_sam = merge_sams(sp, fsams)

        info = info.replace('5328', sp)
        info = info.replace('[WL_cwd]', os.getcwd())

        info2 = template2.replace('assembly_selfref', asslabel)
        info2 = info2.replace('5328', sp)
        info2 = info2.replace('[WL_cwd]', os.getcwd())

        os.chdir('..')
        fjob = 'job_files/s1_%s.job' % sp
        cmn.write_file(info, fjob)
        cmn.run('cd job_files; sbatch s1_%s.job' % sp)

        if sp not in step1_finished:
            step1_jobs.append(fjob)

        fjob = 'job_files/s2_%s.job' % sp
        cmn.write_file(info2, fjob)
        step2_jobs.append(fjob)

        #cmn.run('cd job_files; sbatch sg%s.job' % sp)

    info = ['bash %s\n' % each for each in step1_jobs]
    cmn.write_file(''.join(info), 'step1todo.cmds')

    info = ['sbatch %s\n' % each for each in step2_jobs]
示例#21
0
    except:
        print("Usage: *.py RAxML_bestTree.noGap", file=sys.stderr)
        sys.exit()

    nameDict = get_names()

    #t = ete3.Tree(cmn.txt_read(fn).replace('[&U]', ''))

    appear = {}
    table = []
    lines = cmn.file2lines(fn)
    info = []
    for line in lines:
        if line[0] == '>':
            sp = line[1:].split('_')[0]
            print(sp)
            try:
                newline = '>' + nameDict[sp].replace('\t', '_').replace(
                    ' ', '_')
            except:
                newline = line
            info.append(newline)
        else:
            info.append(line)

    info.append('')
    info = '\n'.join(info)
    dn = fn + '.renamed'
    cmn.write_file(info, dn)
    cmn.write_file(''.join(table), dn + '.nameTable')
                seq.append(char2)
            else:  #different characters
                if char1 == '-' and char2 == '-':
                    seq.append(char3.lower())
                elif char1 == '-':
                    seq.append(char2.lower())
                elif char2 == '-':
                    seq.append(char1.lower())
                else:
                    #different chars and not a gap
                    seq.append('X')
        fasta = '>%s\n%s\n' % (Id, ''.join(seq))
        refBaseDict[Id] = ''.join(seq)
        new.append(fasta)

    cmn.write_file(''.join(new), 'sum_barcodes.fa')

    #cmn.run('rm -r sampleRun_fake')

    #check denovo pipeline one
    fns = cmn.cmd2lines('ls sampleRun_*/denovo_barcode.fa')
    denovoDict = {}
    new = []
    for fn in fns:
        Id = cmn.find_between(fn, 'sampleRun_', '/')
        lines = cmn.file2lines(fn)
        seq = ''.join(lines[1:])
        if seq > 658:
            tmp = seq.replace('N', '')
            if len(tmp) == 658:
                seq = tmp
示例#23
0
    # Concatenate, per sequence name, the selected window of every gene.
    for gene in selected:
        scaf, i, j = gene
        fa = '../introgression/0_process_scaf/scaf2_fastas/%s.fa' % scaf
        seqDict = read_fa(fa)
        # Every %-argument needs its own placeholder, otherwise the
        # formatting raises TypeError.
        print('parsing %s (%s, %s)' % (fa, i, j))
        # Positions to drop for this scaffold; none when it has no entry.
        try:
            exclusion = repdict[scaf]
        except KeyError:
            exclusion = set([])

        for name in seqDict:
            # Take the window [i, j) and drop excluded positions; exclusion
            # holds scaffold coordinates, hence the index + i offset.
            seq = seqDict[name][i:j]
            seq = [char for index, char in enumerate(seq)
                    if (index+i) not in exclusion]
            # First gene for this name starts the list, later ones extend it.
            try:
                final[name] += seq
            except KeyError:
                final[name] = seq

    dn = 'sampled_seq_t%s.fa' % times
    fasta = ['>%s\n%s\n' % (name, ''.join(final[name]))
            for name in final]
    cmn.write_file(''.join(fasta), dn)





# cov_dict[exon][sp] holds the mean stack size for that exon and species.
cov_dict = {}
#get coverage first
for exon in stack_dict:
    cov_dict[exon] = {}
    for sp in stack_dict[exon]:
        stacks = stack_dict[exon][sp]
        # Number of entries stored under each key of this stack mapping.
        Nlist = [len(stacks[key]) for key in stacks]
        # Mean over all keys; raises ZeroDivisionError if stacks is empty.
        cov = float(sum(Nlist)) / len(Nlist)
        cov_dict[exon][sp] = cov

# One "species<TAB>exon<TAB>coverage" row per (exon, species) pair.
cov_info = [
    '%s\t%s\t%s\n' % (sp, exon, cov_dict[exon][sp]) for exon in cov_dict
    for sp in cov_dict[exon]
]

cmn.write_file(''.join(cov_info), 'cov_info.txt')

for exon in stack_dict:
    length = exon_lengths[exon]
    spDict = stack_dict[exon]
    for sp in spDict:
        stacks = spDict[sp]

        newSeq = [[], []]

        cov = cov_dict[exon][sp]

        for each in range(length):
            p = each + 1
            print(p, exon, sp)
            try:
示例#25
0
        defline = lines[0]
        seq = ''.join(lines[1:])
        adict[defline] = seq
    return adict


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':

    #fn = 'all_genomes_noGap.fa'
    #fn = 'all_genomes_charGap.fa'
    # The only argument is the multi-record FASTA to split.
    try:
        fn = sys.argv[1]
    except IndexError:
        print('*.py all_genomes_charGap.fa ')
        sys.exit()

    adict = read_fa(fn)

    # Output goes to splitS_<label>/<label>_<i>.fa, one record per file.
    fnlabel = cmn.lastName(fn).replace('.fa', '')
    outdir = 'splitS_%s' % fnlabel
    cmn.mkdir(outdir)
    for i, key in enumerate(adict):
        seq = adict[key]
        fasta = '>%s\n%s\n' % (key, seq)
        dn = '%s/%s_%s.fa' % (outdir, fnlabel, i)
        cmn.write_file(fasta, dn)
    #fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes.fasta'
    # NOTE(review): `fall` has no visible assignment (the one above is
    # commented out) — confirm it is defined elsewhere in the file.
    seqDict = read_fa(fall)
    #fadd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/addedBaits_fromPipeline.fa'
    #if cmn.filexist(fadd):
    #    seqDict.update(read_fa(fadd))
    #ftable = '/archive/biophysics/Nick_lab/wli/archive/barcodes/auto_tables/verified_barcodes.fa'
    #seqDict.update(read_autoTable(ftable))

    # For every sample line: choose a query barcode, BLAST it against the
    # references, pick baits from the hits and collect the formatted rows.
    info = []
    for line in cmn.file2lines(fn):
        #5077    Autochton zarex
        # '?' is treated as a separator before splitting into fields.
        items = line.strip().replace('?', ' ').split()
        # Lines without a third field fall back to the placeholder 'sp'.
        try:
            sample, genus, sp = items[:3]
        except:
            sample, genus = items[:2]
            sp = 'sp'
        # Keep only the part of the species field before the first '-' or '_'.
        sp = sp.split('-')[0].split('_')[0]
        genus = parse_name(genus)
        query_sequence, qlen = get_query_sequence(seqDict, genus, sp)
        br_result = do_barcode_blast(query_sequence, seqDict)
        print('\n'.join(br_result))
        baits = pick_barcode_baits(br_result, qlen, seqDict)
        info += format_baits(sample, baits)

    # Result file: input base name plus '.baits'.
    dn = cmn.lastName(fn) + '.baits'
    cmn.write_file(''.join(info), dn)



            taken += cmn.cmd2lines(cmd)

        taken += [each for each in alea_list
                if cmn.lastName(each).split('_')[0] == ID]

        if len(words) != 0:
            taken = [each for each in taken
                    if all([word in each for word in words])]
        faDict[ID] = taken

    #print taken
    all_fa = sum(list(faDict.values()), [])
    ass_count = count_ass_appearance(all_fa)
    best_ass = max(list(ass_count.keys()), key=lambda x: ass_count[x])
    print('the most common assembly is %s, only take fa mapped to this assembly' % best_ass)
    cmn.write_file(best_ass, 'best_assembly.txt')
    sys.exit()

    for ID in faDict:
        alist = faDict[ID]
        taken = [each for each in alist
                if best_ass in each.replace('_withMito', '')]

        #print ID, taken

        if best_ass == 'cne' and len(taken) == 0:
            taken += [each for each in alist
                    if '3574_assembly_v1' in each]

        if best_ass == '3574_assembly_v1' and len(taken) == 0:
            taken += [each for each in alist
示例#28
0
            good_reads.append(name1)
        elif (misM2 + 1) >= misM1:
            #bad one has more mismatch, good one is good!
            #good one can be 1 bp more than the bad one
            good_reads.append(name1)
        else:
            if identity >= identity_cut:
                good_reads.append(name1)
            else:
                bad_reads.append((name2, aln1))

    print('further classify overlapping reads into:')
    print('%s good reads' % len(good_reads))
    print('%s bad reads' % len(bad_reads))
    #sp2 = name2sp(name2)

    #add back the previous IDs
    good_reads.append('#' * 100)
    for ID in good_IDs:
        name = ID1mapping[ID]
        good_reads.append(name)

    cmn.write_lines(good_reads, 'good_reads.txt')

    #bad_reads.append('#' * 100)
    for ID in bad_IDs:
        name = ID2mapping[ID]
        bad_reads.append((name, seqDict2[name]))
    bad_alignments = ['%s    %s\n' % (each[0], each[1]) for each in bad_reads]
    cmn.write_file(''.join(bad_alignments), 'bad_reads_alignment.txt')
示例#29
0
        print("Usage: *.py fa Ncores", file=sys.stderr)
        sys.exit()

    #if the nodes are less than 4 taxa, produce a random tree
    cmd = "grep '>' %s" % (fn)
    lines = [
        each[1:].strip() for each in cmn.cmd2lines(cmd) if each.strip() != ''
    ]

    N = len(lines)
    if N < 4:
        print('Warning: fastme can not make tree of less than 4 taxa')
        print('Warning: so I make a fake tree...')
        dn = '%s.phylip.fastme.tre' % cmn.lastName(fn)
        if N == 1:
            info = '(%s);\n' % lines[0]
        if N == 2:
            a, b = lines
            info = '(%s,%s);\n' % (a, b)
        elif N == 3:
            a, b, c = lines
            info = '((%s,%s),%s);\n' % (a, b, c)
        cmn.write_file(info, dn)
        sys.exit()

    label = cmn.lastName(fn)
    cmd = 'rm RAxML_*.%s;' % label
    cmd += '/home2/wli/local/RAxML/raxmlHPC-PTHREADS-SSE3 -m GTRGAMMA -p 7112 -T %s -s %s -n %s' % (
        Ncores, label, label)
    cmn.run(cmd)
    except:
        print('usage: *.py fsam fass', file=sys.stderr)
        sys.exit()

    cmd = 'module add samtools; samtools faidx %s' % fass
    cmn.run(cmd)
    cmd = 'module add picard/1.117; java -jar $PICARD/CreateSequenceDictionary.jar R=%s O=%s.dict' % (
        fass, fass[:-3])
    cmn.run(cmd)

    template = cmn.txt_read(
        '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/template_gatk_bias_fromSam.job'
    )
    template = template.replace('[WL_ref]', fass)
    template = template.replace('[INPUT.sam]', fsam)

    sampleId = cmn.lastName(fsam).replace('highQ_', '').split('_')[0]

    dnlabel = '%s_%s' % (cmn.lastName(fsam).replace(
        '.sam', ''), cmn.lastName(fass).replace('.fa', ''))
    cmn.mkdir(dnlabel)
    os.chdir(dnlabel)

    cwd = os.getcwd()
    pre_cmds = 'cd %s\n' % cwd
    template = template.replace('5642', sampleId)
    template = template.replace('[WL_preprossing]', pre_cmds)

    cmn.write_file(template, 'gatk%s.job' % sampleId)
    #cmn.run('sbatch gatk%s.job' % sampleId)