def read_rep():
    """Return the repeat positions of every scaffold.

    Every record of the files listed by ``ls annotation_repeats/*.gff3`` is
    split on whitespace; column 1 is used as the scaffold name and columns
    4 and 5 as integer coordinates ``i`` and ``j``.  The positions
    ``range(i, j)`` are added to the set kept for that scaffold.

    The finished dictionary is written to ``rep.dict.pkl`` with
    ``cmn.pickle_write``; when that file already exists it is returned via
    ``cmn.pickle_read`` instead of re-reading the annotations.

    Returns:
        dict: scaffold name -> set of int positions.
    """
    dn = 'rep.dict.pkl'
    if cmn.filexist(dn):
        print('loading repeats using precomputed data...')
        return cmn.pickle_read(dn)

    freps = cmn.cmd2lines('ls annotation_repeats/*.gff3')
    repdict = {}
    for frep in freps:
        # Read the annotation file of the current iteration.  The previous
        # code read ``fn``, a name that is never bound in this function.
        for line in cmn.file2lines(frep):
            items = line.strip().split()
            # Blank lines and '#' header/comment lines carry no coordinates.
            if not items or items[0].startswith('#'):
                continue
            scaf = items[0]
            i, j = map(int, items[3:5])
            # NOTE(review): the end column itself is excluded (range(i, j));
            # confirm whether the end coordinate should be included.
            # update() grows the set in place instead of rebuilding it.
            repdict.setdefault(scaf, set()).update(range(i, j))

    cmn.pickle_write(repdict, dn)
    return repdict
#6188_3842_assembly_v2_snp_step2.vcf
#vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns}
# NOTE(review): vcf_dict has to be bound before this point (the assignment
# just above is commented out) - confirm against the rest of the script.
sps = list(vcf_dict)

#ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs)
# Derive the reference of every VCF from its own label: the text before the
# first underscore is the sample, everything after it is the reference.
ref_genomes = set()
refmapping = {}
for fn in fns:
    #../../step3_gatk/5729_3614_assembly_v1/5729_3614_assembly_v1_snp_step2.vcf
    fnlabel = cmn.lastName(fn).replace('_snp_step2.vcf', '')
    items = fnlabel.split('_')
    sp, ref = items[0], '_'.join(items[1:])
    refmapping[fnlabel] = ref
    ref_genomes.add(ref)

# Store the label -> reference mapping both as a pickle and as a
# tab-separated text table.
cmn.pickle_write(refmapping, 'ref_mapping.dict.pkl')
info = ['%s\t%s\n' % pair for pair in refmapping.items()]
cmn.write_file(''.join(info), 'ref_mapping.txt')

#3. make the length check
ref_dir = '/work/biophysics/mtang/SNP_calling/indexed_references'
# A reference still needs checking unless both of its files are present.
unChecks = [
    ref for ref in ref_genomes
    if not (os.path.exists('%s/%s_scafLength.txt' % (ref_dir, ref))
            and os.path.exists('%s/%s_scaf.header' % (ref_dir, ref)))
]
#unChecks = ref_genomes
print('#######################################################')
def build_phylip_lines(seqDict, length):
    """Build the lines of a phylip file and the short-name lookup table.

    Sequences are renamed ``ID1``, ``ID2``, ... in iteration order of
    *seqDict*; each short name is left-justified to 10 characters and
    followed directly by the sequence.

    Args:
        seqDict: mapping of original sequence name -> sequence.
        length: value written after the sequence count on the header line.

    Returns:
        tuple: ``(lines, nameDict)`` where ``lines`` starts with the
        ``"<count>\\t<length>"`` header and ``nameDict`` maps each short
        name back to the original name.
    """
    new = ['%s\t%s' % (len(seqDict), length)]
    nameDict = {}
    for count, name in enumerate(seqDict, 1):
        newName = 'ID%s' % count
        nameDict[newName] = name
        new.append('%s%s' % ('{:<10}'.format(newName), seqDict[name]))
    return new, nameDict


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; report it with a
        # non-zero status so calling shells can detect the failure.
        print("Usage: *.py", file=sys.stderr)
        sys.exit(1)

    seqDict, length = read_fa(fn)
    new, nameDict = build_phylip_lines(seqDict, length)

    # The alignment with the short names.
    dn = cmn.lastName(fn) + '.phylip'
    cmn.write_lines(new, dn)

    # The short name -> original name table, needed to map names back.
    dn = cmn.lastName(fn) + '.phylipNames.dict.pkl'
    cmn.pickle_write(nameDict, dn)
def collect_coding_indexes(lines):
    """Return the set of positions covered by the records in *lines*.

    Each record is whitespace-separated with its two coordinates in
    columns 4 and 5, e.g.::

        scaffold1_cov14552_reverse mitfi trnM(cat) 23 89 1.809e-09 + .

    Both end points are included.  The two coordinates are put in
    ascending order before expanding, so a record contributes the same
    positions whichever order they are written in; swapping them only for
    '-' records silently dropped minus-strand records written with
    start <= end.  Blank lines and lines starting with '#' are skipped.

    Args:
        lines: iterable of text lines (an open file works).

    Returns:
        set: all covered integer positions.
    """
    coding_indexes = set()
    for line in lines:
        items = line.split()
        if not items or items[0].startswith('#'):
            continue
        i, j = sorted(map(int, items[3:5]))
        coding_indexes.update(range(i, j + 1))
    return coding_indexes


if __name__=='__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument is a usage error; report it with a
        # non-zero status so calling shells can detect the failure.
        print("Usage: *.py coding.gff", file=sys.stderr)
        sys.exit(1)

    with open(fn) as fp:
        coding_indexes = collect_coding_indexes(fp)

    dn = 'coding.indexes.pkl'
    cmn.pickle_write(coding_indexes, dn)
# NOTE(review): fn, label, exome_dict and sp_dict are bound before this
# fragment; the grouping part presumably runs once per input file - confirm
# the intended indentation when re-integrating.
best_hits = find_bestE_reads(fn, label)

#then separate it by exomes
for line in best_hits:
    exon = line.strip().split()[1]#??
    # Start a new list for an exon seen for the first time.  An explicit
    # default replaces the bare ``except:``, which also hid unrelated errors.
    exome_dict.setdefault(exon, []).append(line)

# Accumulate the hits per label; the first batch is stored as is.
if label in sp_dict:
    sp_dict[label] += best_hits
else:
    sp_dict[label] = best_hits

print('writting outputs... ')
cmn.pickle_write(exome_dict, 'blastByExon.dict.pkl')
cmn.pickle_write(sp_dict, 'blastBySp.dict.pkl')

#output the exome blasts
#for exon in exome_dict:
#    lines = exome_dict[exon]
#    dn = '%s/%s.br' % (outdir, exon)
#    cmn.write_file(''.join(lines), dn)
#get the reads and split them into exons
#fns = cmn.getid(fn)
# rdict: good_IDs value of a read -> {read ID -> sequence}
rdict = {}
for fn in fns:
    print('parsing ' + fn)
    with open(fn) as fp:
        for i, line in enumerate(fp):
            # Records are read in groups of four lines: line 0 of a group
            # is the ID line, line 1 is the sequence; the rest is ignored.
            position = i % 4
            if position == 0:
                #record = []
                ID = line.strip().replace(' ', '_')
                #print 'checkID', ID
                try:
                    sp = good_IDs[ID]
                except KeyError:
                    isGood = False
                else:
                    isGood = True
                #record.append(line)
            elif position == 1:
                #record = ''.join(record)
                seq = line.strip()
                # Keep the sequence only when its ID was found in good_IDs.
                if isGood:
                    rdict.setdefault(sp, {})[ID] = seq

dn = 'readsBySp.dict.pkl'
cmn.pickle_write(rdict, dn)
# Group the fastq files by sample: the sample is the part of the file's
# last name before the first underscore.  A missing file is reported but
# still recorded.
for fastq in fastqs:
    if not os.path.exists(fastq):
        print("fastq file %s doesn't exist! please email to ask!" % fastq)
    sp = cmn.lastName(fastq).split('_')[0]
    qdict.setdefault(sp, []).append(fastq)

#3. check sp to see if refs are specified
goodSPs = []
for sp in qdict:
    if sp not in refdict:
        print('no reference genome found for sample %s, please email to ask' % sp, file=sys.stderr)
        continue
    goodSPs.append(sp)

#4. output the mapping relationship
# One line per (sample, reference) pair: sample, comma-joined fastqs, ref.
new = [
    '%s\t%s\t%s\n' % (sp, ','.join(qdict[sp]), ref)
    for sp in goodSPs
    for ref in refdict[sp]
]
dn = 'mapping_info.txt'
cmn.write_file(''.join(new), dn)

dn = 'require_SNPs.dict.pkl'
cmn.pickle_write(requires, dn)