示例#1
0
def read_rep():
    """Load, or build and cache, the per-scaffold sets of repeat positions.

    If the cache file ``rep.dict.pkl`` exists it is loaded and returned
    unchanged.  Otherwise every file listed by
    ``ls annotation_repeats/*.gff3`` is parsed: the first whitespace-separated
    field of each line is used as the scaffold name and fields 4 and 5 as
    integer coordinates ``i`` and ``j``; every position in ``range(i, j)`` is
    added to that scaffold's set.  The resulting dict is pickled to
    ``rep.dict.pkl`` before it is returned.

    Returns:
        dict mapping scaffold name -> set of int positions.

    Raises:
        ValueError: if a data line has fewer than five fields or its
            coordinate fields are not integers.
    """
    dn = 'rep.dict.pkl'
    if cmn.filexist(dn):
        print('loading repeats using precomputed data...')
        return cmn.pickle_read(dn)

    freps = cmn.cmd2lines('ls annotation_repeats/*.gff3')
    repdict = {}
    for frep in freps:
        # Bug fix: the original iterated over ``fn``, a name that is never
        # bound in this function, instead of the current file ``frep``.
        for line in cmn.file2lines(frep):
            items = line.strip().split()
            # Blank lines and '#' header/comment lines carry no coordinates.
            if not items or items[0].startswith('#'):
                continue
            scaf = items[0]
            i, j = map(int, items[3:5])
            # NOTE(review): range(i, j) leaves out the end coordinate j; this
            # is kept from the original -- confirm whether j should be
            # included.
            # In-place update avoids rebuilding the whole set for every line.
            repdict.setdefault(scaf, set()).update(range(i, j))
    cmn.pickle_write(repdict, dn)
    return repdict
示例#2
0
# Example input file name: 6188_3842_assembly_v2_snp_step2.vcf
# NOTE(review): ``vcf_dict`` is only defined in the commented-out line below
# within this excerpt; it must be provided by earlier code.
#vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns}
sps = list(vcf_dict)

# Derive the reference name from each VCF file label.  The label is
# "<sample>_<reference>", split at the first underscore.
#ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs)
ref_genomes = set()
refmapping = {}
for fn in fns:
    # e.g. ../../step3_gatk/5729_3614_assembly_v1/5729_3614_assembly_v1_snp_step2.vcf
    fnlabel = cmn.lastName(fn).replace('_snp_step2.vcf', '')
    items = fnlabel.split('_')
    sp, ref = items[0], '_'.join(items[1:])
    ref_genomes.add(ref)
    refmapping[fnlabel] = ref

# Persist the label -> reference mapping as a pickle and as a TSV file.
cmn.pickle_write(refmapping, 'ref_mapping.dict.pkl')
info = ['%s\t%s\n' % (sp, mapped_ref) for sp, mapped_ref in refmapping.items()]
cmn.write_file(''.join(info), 'ref_mapping.txt')

#3. make the length check
ref_dir = '/work/biophysics/mtang/SNP_calling/indexed_references'

# A reference still needs checking unless both of its index files exist.
unChecks = [
    ref for ref in ref_genomes
    if not (os.path.exists('%s/%s_scafLength.txt' % (ref_dir, ref))
            and os.path.exists('%s/%s_scaf.header' % (ref_dir, ref)))
]

#unChecks = ref_genomes

print('#######################################################')
示例#3
0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

if __name__ == '__main__':
    # Convert the FASTA file given as the first argument into a PHYLIP-style
    # file with short sequential IDs ("ID1", "ID2", ...), and save the
    # mapping from those IDs back to the original sequence names.
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit.
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    seqDict, length = read_fa(fn)
    nameDict = {}

    # Header line: number of sequences, then ``length`` as returned by read_fa.
    new = ['%s\t%s' % (len(seqDict), length)]
    for count, (name, seq) in enumerate(seqDict.items(), 1):
        newName = 'ID%s' % count
        nameDict[newName] = name
        # The ID is left-justified and padded to 10 characters before the
        # sequence is appended on the same line.
        new.append('%s%s' % ('{:<10}'.format(newName), seq))

    dn = cmn.lastName(fn) + '.phylip'
    cmn.write_lines(new, dn)

    # Mapping short ID -> original sequence name, for restoring names later.
    dn = cmn.lastName(fn) + '.phylipNames.dict.pkl'
    cmn.pickle_write(nameDict, dn)
示例#4
0


if __name__=='__main__':
    # Collect every position covered by the features listed in the file
    # given as the first argument and pickle them as a set of ints to
    # 'coding.indexes.pkl'.
    #options=parse_options()
    try:
        fn=sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; a bare ``except`` would
        # also swallow KeyboardInterrupt/SystemExit.
        print("Usage: *.py coding.gff", file=sys.stderr)
        sys.exit()

    # Build the set directly rather than growing a list that may hold many
    # duplicates and converting it at the end.
    coding_indexes = set()
    with open(fn) as fp:
        for line in fp:
            #scaffold1_cov14552_reverse	mitfi	trnM(cat)	23	89	1.809e-09	+	.
            items = line.strip().split()
            # Blank lines and '#' comment lines carry no coordinates.
            if not items or items[0].startswith('#'):
                continue
            first, second = map(int, items[3:5])
            # NOTE(review): for '-' rows the two coordinates are swapped, so
            # such a row only contributes positions when field 4 > field 5
            # -- confirm that this matches the input format.
            if items[6] == '-':
                i, j = second, first
            else:
                i, j = first, second
            # Both end points are included.
            coding_indexes.update(range(i, j + 1))

    dn = 'coding.indexes.pkl'
    cmn.pickle_write(coding_indexes, dn)



示例#5
0
        # Hits selected by find_bestE_reads for this input file and label.
        # (The enclosing loop header is outside this excerpt.)
        best_hits = find_bestE_reads(fn, label)

        # Group the hit lines by their second whitespace-separated field,
        # which is used as the exon key.
        #then separate it by exomes
        for line in best_hits:
            exon = line.strip().split()[1]#??
            # NOTE(review): the bare ``except`` is meant for the first time a
            # key is seen, but it also hides any other error.
            try:
                exome_dict[exon].append(line)
            except:
                exome_dict[exon] = [line]

        # Accumulate the hits per label.  On first sight the list object
        # itself is stored, so later ``+=`` calls extend that same list.
        # NOTE(review): bare ``except`` here as well.
        try:
            sp_dict[label] += best_hits
        except:
            sp_dict[label] = best_hits

    # Save both groupings (by exon and by label) as pickles.
    print('writting outputs... ')
    cmn.pickle_write(exome_dict, 'blastByExon.dict.pkl')
    cmn.pickle_write(sp_dict, 'blastBySp.dict.pkl')
    #output the exome blasts
    #for exon in exome_dict:
    #    lines = exome_dict[exon]
    #    dn = '%s/%s.br' % (outdir, exon)
    #    cmn.write_file(''.join(lines), dn)







示例#6
0
    # Collect the sequences of the reads whose IDs appear in ``good_IDs``
    # from every file in ``fns``, grouped by the value ``good_IDs`` stores
    # for each ID, and pickle the result to 'readsBySp.dict.pkl'.
    #get the reads and split them into exons
    #fns = cmn.getid(fn)

    # rdict: good_IDs value (``sp``) -> {read ID -> sequence string}
    rdict = {}
    for fn in fns:
        print('parsing ' + fn)
        with open(fn) as fp:
            # Lines are handled in groups of four (presumably FASTQ records
            # -- confirm): line 0 of a group is the ID, line 1 the sequence;
            # lines 2 and 3 are ignored.
            for i, line in enumerate(fp):
                if i % 4 == 0:
                    #record = []
                    # Spaces in the ID line become underscores before the
                    # lookup; the rest of the line is kept as-is.
                    ID = line.strip().replace(' ', '_')
                    #print 'checkID', ID
                    try:
                        sp = good_IDs[ID]
                        isGood = True
                    except KeyError:
                        isGood = False

                #record.append(line)

                if i % 4 == 1:
                    #record = ''.join(record)
                    seq = line.strip()
                    # ``isGood`` and ``sp`` were set on the preceding ID line.
                    if isGood:
                        if sp not in rdict:
                            rdict[sp] = {}
                        rdict[sp][ID] = seq

    dn = 'readsBySp.dict.pkl'
    cmn.pickle_write(rdict, dn)
示例#7
0
# Group the fastq files by sample; the sample name is the part of the file
# name before the first underscore.
for fastq in fastqs:
    if not os.path.exists(fastq):
        # NOTE(review): a missing file is only reported (on stdout); it is
        # still added to the mapping below.
        print('fastq file %s doesn\'t exist! please email to ask!' % fastq)
    sp = cmn.lastName(fastq).split('_')[0]
    qdict.setdefault(sp, []).append(fastq)

#3. check sp to see if refs are specified
# Keep only the samples that have an entry in ``refdict``.
goodSPs = []
for sp in qdict:
    if sp not in refdict:
        print('no reference genome found for sample %s, please email to ask' %
              sp,
              file=sys.stderr)
        continue
    goodSPs.append(sp)

#4. output the mapping relationship
# One tab-separated line per (sample, reference) pair:
# sample, comma-joined fastq paths, reference.
new = []
for sp in goodSPs:
    fastq_field = ','.join(qdict[sp])
    for ref in refdict[sp]:
        new.append('%s\t%s\t%s\n' % (sp, fastq_field, ref))

cmn.write_file(''.join(new), 'mapping_info.txt')

# ``requires`` is defined outside this excerpt.
dn = 'require_SNPs.dict.pkl'
cmn.pickle_write(requires, dn)