### MAIN ###

### fill tables by org
# NOTE(review): the indentation of this section was lost in the source text;
# the nesting below is reconstructed from statement order. Confirm in
# particular whether the commit after the genomic inserts was meant to run
# per row or once per organism (the stored data is the same either way).
for org in orgs:
    # Values are bound through placeholders instead of being %-formatted into
    # the SQL text. A name containing a quote can then no longer break the
    # statement, and a double-quoted value can no longer be resolved as a
    # column name (e.g. an org called "short" in `where short="short"`).
    # NOTE(review): '?' is the sqlite3 placeholder style; the chained
    # execute(...).fetchone() suggests a sqlite3 cursor -- confirm.
    c.execute('insert into org (short) values (?)', (org,))
    con.commit()
    orgid = c.execute('select id from org where short=?', (org,)).fetchone()[0]
    print('%i %s' % (orgid, org))

    # genomic contig sequences
    gen = '%s.genome.fa' % org
    if not os.path.exists(gen):
        # No genome FASTA for this organism: nothing further to load for it.
        continue
    gs = Fasta.FastaSeqs()
    gs.loadseqs([gen])
    print('%i genomic contig seqs' % len(gs.seqs))
    for s in gs.seqs.values():
        # int()/str() reproduce the conversions the old %i / %s formatting
        # applied before the values reached the database.
        c.execute(
            'insert into genomic(org,name,seq) values(?,?,?)',
            (int(orgid), str(s.name), str(s.seq)))
    con.commit()

    gff = '%s.gff' % org
    features = read_gff(gff)

    # CDS models (seems to be present and usable in all gffs)
    for cds in features['CDS'].values():
        # The statement that consumes `ex` lies beyond this section, so the
        # SQL string is still built exactly as before.
        # TODO(review): switch this insert to placeholders together with the
        # execute call that follows.
        ex = 'insert into cds (org,seq,pid,start,end,strand) values (%i,"%s","%s","%s","%s","%s")' % (
            orgid, cds.seq, cds.fid, cds.start, cds.end, cds.strand)
# Path of the FASTA file to scan, taken from the second command-line argument.
f = sys.argv[2]


def expandprimer(primer):
    """Return the sequences generated from the per-position choices of `primer`.

    Each character of `primer` is looked up in `degen_nucs` to obtain the
    choices for that position. `all_combinations_gen` is then called with an
    empty result list, which is returned afterwards.
    NOTE(review): presumably it appends one concrete sequence per combination
    of choices -- confirm against its definition.
    """
    seqs = []
    choices = [degen_nucs[n] for n in primer]
    all_combinations_gen(0, len(primer), [], seqs, choices)
    return seqs


primers = expandprimer(primer)
print('%i primers' % len(primers))
for p in primers:
    # print(p) rather than the Python 2 statement `print p`: same output on
    # Python 2, and no SyntaxError on Python 3.
    print(p)

fs = Fasta.FastaSeqs()
fs.loadseqs([f])
#primers.append('A')

# Maps each sequence key to a list of [primer, match start, ''] entries.
seqmatches = {}
for s in fs.seqs:
    seqmatches[s] = []
    for p in primers:
        # Case-insensitive search for the primer as written.
        prgx = re.compile(p, re.IGNORECASE)
        for match in prgx.finditer(fs.seqs[s].seq):
            seqmatches[s].append([p, match.start(), ''])
        # Pattern built from rvs_comp_str(p); it is used by code that follows
        # this section.
        rvscmp = re.compile(rvs_comp_str(p), re.IGNORECASE)
# Script entry point (shown here with its line breaks collapsed).
#
# Command-line arguments, as read below:
#   argv[1]  BLAST table file name (also the prefix of the output file names)
#   argv[2]  minimum percent identity, parsed as float
#   argv[3]  minimum alignment length, parsed as int
#   argv[4]  FASTA file loaded into a Fasta.FastaSeqs container
#
# Steps visible in this section:
#   1. Build a BLAST_nr object and call parseBLAST(table, minpident, minalnlen).
#   2. Create the 'BLASTn_grps' directory if it does not exist yet.
#   3. For every (key, mates) pair from app.items(): append the key's sequence
#      to '<table>.lng.<sfx>.fa', and write the key's sequence followed by each
#      mate's sequence to 'BLASTn_grps/<key>.grp.<sfx>.fa'. <sfx> is
#      '<int(minpident)>.<minalnlen>'.
#   4. Open '<table>.extras.<sfx>.fa' and start iterating over the loaded
#      sequences whose key is not in app.allids().
#
# NOTE(review): the body of the final `if not k in incids:` lies beyond this
# section; presumably it writes those sequences to the extras file -- confirm.
# NOTE(review): `os` is used but not imported in this section; it is assumed
# to be imported at the top of the file.
if __name__ == "__main__": import sys app = BLAST_nr() # args are blasttable, minpident, minalen bfnm = sys.argv[1] minpident = float(sys.argv[2]) minalnlen = int(sys.argv[3]) app.parseBLAST(bfnm,minpident,minalnlen) # print(app) # print(app.non_uni_clusters()) # print('\n'.join(app.keys())) grp_path = 'BLASTn_grps' if not os.path.exists(grp_path): os.mkdir(grp_path) import Fasta fa = Fasta.FastaSeqs() fa.loadseqs([sys.argv[4]]) sfx = '%i.%i' %(int(minpident),minalnlen) lngf = open('%s.lng.%s.fa' %(bfnm,sfx),'w') for k,mates in app.items(): lngf.write('>%s\n%s\n' %(k,fa.seqs[k].seq)) f = open('%s/%s.grp.%s.fa' %(grp_path,k,sfx),'w') f.write('>%s\n%s\n' %(k,fa.seqs[k].seq)) for m in mates: f.write('>%s\n%s\n' %(m,fa.seqs[m].seq)) f.close() lngf.close() exf = open('%s.extras.%s.fa' %(bfnm,sfx),'w') incids = app.allids() for k in fa.seqs: if not k in incids: