    def exportBlastMatches(self, hsps):
        raise NotImplementedError()
        # everything below is unreachable until this method is implemented

        def _defline_fields(fields, rec):
            """Create a string for FASTA defline with array fields"""
            # replace spaces in the formatted value (not in the space-free
            # format string) so each "/field=value" stays one defline token
            return " ".join([("/%s=%s" % (field, rec[field])).replace(" ", "_")
                             for field in fields])

        hsps = groupRecArray(hsps, "id_h")
        out = open(self.crArrBlastMatchFile, "w")
        blastDb = BlastDb(blastDataDir=self.opt.readsBlastDir)
        reader = blastDb.fastaReader(dbName=self.readsBlastDb,
                                     giFile=None,
                                     defLineTargetOnly=True,
                                     maxDegen=None)
        for rec in reader.records():
            id_db = rec.header().split()[1]
            if id_db in hsps:
                print id_db
                hsps_h = hsps[id_db]
                descr = ",".join([_defline_fields(("id_q", "begin_q", "end_q",
                                                   "begin_h", "end_h",
                                                   "forward", "mism"), hsp)
                                  for hsp in hsps_h])
                id = id_db
                seq = Seq(rec.sequence(), alphabet="dna")
                seq = SeqRecord(seq, id=id, description=descr)
                SeqIO.write([seq], out, "fasta")
        out.close()
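    ## Hedged sketch of the defline run _defline_fields() would produce
    ## (values are hypothetical; an HSP record is faked here with a plain
    ## dict, field names follow the tuple passed above):
    #
    #   hsp = dict(id_q=17, begin_q=0, end_q=100, begin_h=5, end_h=105,
    #              forward=1, mism=2)
    #   " ".join([("/%s=%s" % (f, hsp[f])).replace(" ", "_")
    #             for f in ("id_q", "begin_q", "end_q")])
    #   # -> '/id_q=17 /begin_q=0 /end_q=100'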
    def loadBlastDb(self, giIdFile):
        """Load specified subset of BLAST DB into HDF dataset.
        @param giIdFile - file with pairs 'gi id' on each line: gi will be used
        to pull sequence from BLAST DB, id will be used as 'id' field in HDF
        sequence index."""
        (outGi, giFile) = makeTmpFile(mode="w",
                                      bufsize=2 ** 20,
                                      dir=self.tmpDir,
                                      prefix="hdfSeqLoader_",
                                      suffix=".gi",
                                      createParents=True)
        giFile = os.path.abspath(giFile)
        inpGiId = open(giIdFile, "r", 2 ** 20)
        for line in inpGiId:
            outGi.write(line.split()[0] + "\n")
        inpGiId.close()
        outGi.close()
        blastAlias = self.blastSelAlias
        blastDb = BlastDb()
        blastDb.makeDbAlias(blastAlias, giFile)
        if self.debugFakeSequence:
            seqChecker = FakeSequenceChecker()
        # defLineTargetOnly is True because we should have already removed
        # duplicate records while collecting the sequence headers
        fastaInp = blastDb.fastaReader(dbName=blastAlias,
                                       giFile=giFile,
                                       defLineTargetOnly=True)
        inpGiId = open(giIdFile, "r", 2 ** 20)
        indBegin = 0
        chunkSize = 1024 ** 2
        iRec = 0
        for (rec, giIdLine) in izip(fastaInp.records(), inpGiId):
            gi = int(rec.getNCBI_Id())
            giCheck, id = giIdLine.split()
            giCheck = int(giCheck)
            id = int(id)
            assert gi == giCheck, \
                "GI mismatch between BLAST DB FASTA stream (%i) and " \
                "GI input list (%i)" % (gi, giCheck)
            nSeq = 0
            indRec = self.ind.row
            for chunk in rec.seqArrays(chunkSize=chunkSize):
                if self.debugFakeSequence:
                    seqChecker.checkArrayGi(gi=gi, seq=chunk)
                nSeq += len(chunk)
                self.seq.append(chunk)
            indRec["id"] = id
            indRec["begin"] = indBegin
            indRec["size"] = nSeq
            indRec.append()
            indBegin += nSeq
            iRec += 1
            if iRec % 10000 == 0:
                print "Done %s records of total length %s" % (iRec, indBegin)
        self.ind.flush()
        self.seq.flush()
        fastaInp.close()
        inpGiId.close()
        os.remove(giFile)
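    ## Input sketch for loadBlastDb() (the values below are hypothetical):
    ## giIdFile holds whitespace-separated 'gi id' pairs, one per line, e.g.
    #
    #   23455713 1
    #   15183 2
    #
    ## The first column is written to the temporary gi list that defines the
    ## BLAST alias DB; the second becomes the 'id' field of the HDF index.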
class TaxaCollector(MGTOptions):
    """Collects data about taxonomically known sequence for training the classifier.
    Processes the NCBI BLAST DB files by calling fastacmd, then aggregates,
    prioritizes and partially removes redundancy along the taxonomy ids."""

    def __init__(self, dbSql):
        MGTOptions.__init__(self)
        self.dbSql = dbSql
        self.blastDb = BlastDb()
        self.taxaTree = None
        self.taxaLevels = None
        self.taxaTreeDbStore = NodeStorageDb(db=self.dbSql,
                                             tableSfx=self.taxaTreeTableSfxMain)

    def rebuild(self):
        #self.loadGiTaxNumpy()
        #self.loadRefseqAcc()
        #self.loadTaxTables()
        #self.loadSeq()
        # we currently use act_seq instead of seq in loadSeqToHdf()
        # because act_seq is filtered by all_src and thus skips all
        # records that we drop while creating all_src
        #self.selectActiveSeq()
        #self.tmp_restart()
        #self.loadSeqToHdf()
        #self.indexHdfSeq()
        #self.indexHdfActiveSeq()
        # only for analysis:
        #self.loadTaxLevelsSql()
        #self.reportStat()
        pass

    def tmp_restart(self):
        db = self.dbSql
        self.selectTaxSet()
        self.excludePostSource()
        return
        # everything below is currently skipped by the return above
        db.createIndices(table="seq",
                         names=["gi"],
                         attrib={"gi": {"unique": True}})
        db.createIndices(table="seq_hdr",
                         names=["gi"],
                         attrib={"gi": {"unique": True}})
        db.ddl("analyze table seq", ifDialect="mysql")
        db.ddl("analyze table seq_hdr", ifDialect="mysql")
        self.createFullTextIndexSeqHeader()

    def loadTaxTables(self):
        """Create and fill all initial tables with taxa tree data.
        It only needs the NCBI taxonomy dump files. It assigns dummy zero
        values to the seq_len and seq_len_total attributes.
        Result: the original tree dump files as well as our tree object with
        nested set index etc. are saved in the SQL DB."""
        self.loadTaxCategories()
        self.loadTaxNodes()
        self.loadTaxNames()
        self.loadTaxLevels()
        self.taxaTree.setAttribute("seq_len", 0L)
        self.taxaTree.setAttribute("seq_len_tot", 0L)
        self.taxaTreeDbStore.save(self.taxaTree)

    def loadSeq(self):
        db = self.dbSql
        self.loadGiTaxPickled()
        self.createTableSeq()
        #sql = """
        #load data infile '/usr/local/scratch/atovtchi.mgt_seq.bulk.tmp'
        #into table seq
        #fields
        #terminated by '|'
        #optionally enclosed by '"'
        #"""
        #sql = """
        #copy into seq from '/usr/local/scratch/atovtchi.mgt_seq.bulk.tmp'
        #"""
        #db.ddl(sql)
        self.idGenSeq = IntIdGenerator()
        inserterSeq = db.makeBulkInserterFile(table='seq',
                                              bufLen=500000,
                                              workDir=self.tmpDir)
        inserterSeqMultiId = db.makeBulkInserterFile(table='seq_multi_id',
                                                     bufLen=500000,
                                                     workDir=self.tmpDir)
        inserterSeqHdr = db.makeBulkInserterFile(table='seq_hdr',
                                                 bufLen=500000,
                                                 workDir=self.tmpDir)
        # load each of the first five BLAST DBs returned by getDbs()
        for blastDbEntry in self.blastDb.getDbs()[:5]:
            self.loadSeqNCBI(blastDbEntry, inserterSeq, inserterSeqMultiId,
                             inserterSeqHdr)
        inserterSeq.flush()
        inserterSeqMultiId.flush()
        inserterSeqHdr.flush()
        db.createIndices(table="seq",
                         names=["gi", "taxid", "src_db", "kind", "project"],
                         primary="id",
                         compounds={"src": "taxid,src_db,kind,project"})
        db.ddl("analyze table seq", ifDialect="mysql")
        db.createIndices(table="seq_multi_id",
                         names=["gi", "acc_db"],
                         primary="id_seq")
        db.ddl("analyze table seq_multi_id", ifDialect="mysql")
        db.createIndices(table="seq_hdr",
                         names=["gi"],
                         primary="id_seq")
        db.ddl("analyze table seq_hdr", ifDialect="mysql")
        self.clearGiTax()
        self.delDuplicateGiFromSeq()
        db.ddl("analyze table seq", ifDialect="mysql")
        db.ddl("analyze table seq_hdr", ifDialect="mysql")
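        ## A minimal sketch of the bulk-inserter calling pattern used above
        ## (rows are buffered and written to the DB in batches; the 'rows'
        ## iterable below is hypothetical):
        #
        #   inserter = db.makeBulkInserterFile(table='seq', bufLen=500000,
        #                                      workDir=self.tmpDir)
        #   for values in rows:
        #       inserter(values)   # buffered append
        #   inserter.flush()       # write out the remaining buffer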
        self.createFullTextIndexSeqHeader()

    def clearGiTax(self):
        self.gi2taxa = None

    def createTableSeq(self):
        # we removed "auto_increment primary key" from id
        # to speed up bulk load
        self.dbSql.ddl("""
        create table seq
        (
        id integer,
        gi bigint,
        taxid integer,
        src_db char(1),
        project char(4),
        seq_len bigint,
        acc_db char(3),
        acc char(18),
        kind char(2)
        )
        """,
        dropList=["table seq"])
        self.dbSql.ddl("""
        create table seq_hdr
        (
        id_seq integer,
        gi bigint,
        hdr varchar(%i)
        )
        """ % (self.fastaHdrSqlLen,),
        dropList=["table seq_hdr"])
        ## If we get a header record with two gi's like:
        ## >gi|23455713|ref|NC_004301.1| Enterobacteria phage FI, complete genome >gi|15183|emb|X07489.1| Bacteriophage SP genomic RNA
        ## then we insert the second gi along with its db ("emb" here) into the
        ## seq_multi_id table, related to the record in 'seq' by
        ## "seq_multi_id.id_seq = seq.id".
        self.dbSql.ddl("""
        create table seq_multi_id
        (
        id_seq integer,
        gi bigint,
        acc_db char(3)
        )
        """,
        dropList=["table seq_multi_id"])

    def loadSeqNCBI(self, db, inserterSeq, inserterSeqMultiId, inserterSeqHdr):
        print "Processing BLAST DB " + db.db
        inp = self.blastDb.fastaReader(dbName=db.db, defLineTargetOnly=False)
        iRec = 0
        for rec in inp.records():
            idSeq = self.idGenSeq()
            title = rec.header()[1:-1]  # remove '>' and '\n'
            (gifld, gi, acc_db, acc, txt) = title.split('|', 4)
            assert gifld == 'gi'
            gi = int(gi)
            defLine2 = txt.split(">gi|")
            if len(defLine2) > 1:
                txt = defLine2[0]
                (gi2, acc_db2, dummy) = defLine2[1].split('|', 2)
                gi2 = int(gi2)
                inserterSeqMultiId((idSeq, gi2, acc_db2))
            try:
                taxid = int(self.gi2taxa[gi])
            except KeyError:
                print "Warning: Taxid not found for gi: %s" % (gi,)
                taxid = 0
            # We are not interested in taxonomically unassigned sequence.
            # If we later change this, we also need to modify
            # exportIdsForSeqDb() where we currently do the inner join
            # with the tax nodes table
            if taxid != 0:
                # Accession number formats are described here:
                # http://www.ncbi.nlm.nih.gov/Sequin/acc.html
                # WGS id might be 'AAAA00000000' or 'NZ_AAAA00000000',
                # both cases seen in wgs db
                kind = ''
                acc_sfx = acc
                if acc_sfx[2] == '_':
                    kind = acc_sfx[:2]
                    acc_sfx = acc_sfx[3:]
                project = ''
                if acc_sfx[:4].isalpha() and acc_sfx[5].isdigit():
                    project = acc_sfx[:4]
                seqLen = rec.seqLen()
                values = (idSeq, gi, taxid, db.id, project, seqLen,
                          acc_db, acc, kind)
                inserterSeq(values)
                values = (idSeq, gi, title[:self.fastaHdrSqlLen])
                inserterSeqHdr(values)
                if iRec % 50000 == 0:
                    print db.db, title, taxid, iRec, seqLen
                #if iRec >= 500000:
                #    break
                iRec += 1
        inp.close()

    def tmp_loadSeqHdr(self):
        idGenSeq = IntIdGenerator()
        db = self.dbSql
        db.ddl("""
        create table seq_hdr
        (
        id_seq integer,
        gi bigint,
        hdr varchar(%i)
        )
        """ % (self.fastaHdrSqlLen,),
        dropList=["table seq_hdr"])
        reCutLen = re.compile(r"(.*)\Wlen:[0-9]+")
        inserterSeqHdr = db.makeBulkInserterFile(table='seq_hdr',
                                                 bufLen=500000,
                                                 workDir=self.tmpDir)
        inp = open("all.hdr", 'r')
        for rec in inp:
            idSeq = idGenSeq()
            # remove '>' and '\n', then cut the " len:0000000" suffix
            title = reCutLen.search(rec[1:-1]).group(1)
            (gifld, gi, txt) = title.split('|', 2)
            assert gifld == 'gi'
            gi = int(gi)
            values = (idSeq, gi, title)
            inserterSeqHdr(values)
        inserterSeqHdr.flush()
        db.createIndices(table="seq_hdr",
                         names=["gi"],
                         primary="id_seq")
        db.ddl("analyze table seq_hdr", ifDialect="mysql")

    def createFullTextIndexSeqHeader(self):
        """Build a full text index for sequence FASTA headers in the 'seq_hdr'
        table (currently implemented only for the MySQL back-end).
        Warning: it took 1 hr for 28M records."""
        db = self.dbSql
        db.ddl("alter table seq_hdr add fulltext index hdr(hdr)",
               ifDialect="mysql",
               dropList=["index hdr on seq_hdr"])

    def loadRefseqAcc(self):
        self.dbSql.ddl("""\
        create table refseq_acc
        (
        prefix char(2),
        acc char(40),
        molecule char(10),
        method char(15),
        descr varchar(400)
        )
        """,
        dropList=["table refseq_acc"])
        self.dbSql.createIndices(names=["prefix"], table="refseq_acc")
        self.dbSql.executemany("""\
        insert into refseq_acc
        (prefix, acc, molecule, method, descr)
        values
        (%s, %s, %s, %s, %s)
        """, refseqAccFormat)

    def loadTaxCategories(self):
        self.dbSql.ddl("""\
        create table taxa_cat
        (cat char(1), taxid_ancestor integer, taxid integer)
        """,
        dropList=["table taxa_cat"])
        inp = open(self.taxaCatFile, 'r')
        inserter = self.dbSql.makeBulkInserterFile(table="taxa_cat",
                                                   bufLen=500000,
                                                   workDir=self.tmpDir)
        for rec in inp:
            vals = rec.split()
            vals[1] = int(vals[1])
            vals[2] = int(vals[2])
            inserter(vals)
        inp.close()
        inserter.flush()
        self.dbSql.createIndices(names=["cat", "taxid_ancestor"],
                                 table="taxa_cat",
                                 primary="taxid")

    def loadTaxNodes(self):
        self.dbSql.ddl("""\
        create table taxa_node
        (
        taxid integer,
        partaxid integer,
        rank char(20),
        embl_code char(2),
        divid integer,
        inh_div bool,
        gcode_id integer,
        inh_gc bool,
        mgcode_id integer,
        inhmgc bool,
        gbhidden bool,
        hidsubtree bool,
        comments char(40)
        )
        """,
        dropList=["table taxa_node"])
        inp = open(self.taxaNodesFile, 'r')
        inserter = self.dbSql.makeBulkInserterFile(table="taxa_node",
                                                   bufLen=500000,
                                                   workDir=self.tmpDir)
        for rec in inp:
            # The last '|' is behind the last field, so we cut it
            inserter([x.strip() for x in rec.split('|')][:-1])
        inserter.flush()
        inp.close()
        # Get rid of spaces in 'rank' field to simplify export of data later on
        self.dbSql.execute("update taxa_node set rank = replace(rank,' ','_')").close()
        self.dbSql.createIndices(names=["partaxid", "divid"],
                                 table="taxa_node",
                                 primary="taxid")

    def loadTaxNames(self):
        """We load only 'scientific name' entries."""
        self.dbSql.ddl("""\
        create table taxa_names
        (
        taxid integer,
        name varchar(180)
        )
        """,
        dropList=["table taxa_names"])
        inp = open(self.taxaNamesFile, 'r')
        inserter = self.dbSql.makeBulkInserterFile(table="taxa_names",
                                                   bufLen=500000,
                                                   workDir=self.tmpDir)
        for line in inp:
            rec = [x.strip() for x in line.split('|')]
            if rec[3] == "scientific name":
                inserter((int(rec[0]), rec[1]))
        inserter.flush()
        inp.close()
        self.dbSql.createIndices(table="taxa_names", primary="taxid")
        self.dbSql.executeAndAssertEmpty(
            "select taxid from taxa_node where taxid not in "
            "(select taxid from taxa_names)")

    def loadTaxLevels(self):
        self.loadTaxLevelsMem(withTaxTree=True)

    def loadTaxLevelsSql(self):
        self.loadTaxLevelsMem(withTaxTree=True)
        self.taxaLevels.loadTaxLevelsRows()
        self.taxaLevels.loadTaxLevelsColumns()
        self.taxaLevels.makeStatsTables()

    def loadTaxLevelsMem(self, withTaxTree=False):
        if self.taxaLevels is None:
            if withTaxTree:
                # that will also set level attributes inside tree nodes
                self.loadTaxNodesMem()
            self.taxaLevels = TaxaLevelsDb(self.dbSql, self.taxaTree)

    def loadTaxNodesMem(self):
        if self.taxaTree is None:
            self.taxaTree = \
                TaxaTree(NodeStorageNcbiDump(ncbiDumpFile=self.taxaNodesFile,
                                             ncbiNamesDumpFile=self.taxaNamesFile))
        #self.taxaTree.write(sys.stdout)

    def loadGiTaxSql(self):
        self.dbSql.dropTable("gi_taxa")
        self.dbSql.execute("""
        create table gi_taxa
        (
        gi bigint,
        taxid integer
        )
        """)
        #self.dbSql.executemany("insert into gi_tax (gi,taxid) values (%s,%s)",[(1,2),(3,4)])
        #print "Test done"
        #self.dbSql.execute("insert into gi_tax (gi,taxid) values ('1','2')")
        inp = file(self.taxaGiFile, 'r')
        iRow = 0
        inserter = self.dbSql.makeBulkInserterFile(table='gi_taxa',
                                                   bufLen=500000,
                                                   workDir=self.tmpDir)
        print "Start insert"
        for record in inp:
            inserter(record.split())
            iRow += 1
            if iRow % 100000 == 0:
                print record, iRow
        inserter.flush()
        inp.close()
        print "Start index build"
        self.dbSql.ddl("""\
        alter table gi_taxa
        ADD PRIMARY KEY gi(gi),
        add index taxid(taxid)
        """)

    def loadGiTaxBdb(self, inpFile):
        import anydbm, whichdb
        print whichdb.whichdb('/export/atovtchi/taxa.db')
        gi2taxa = anydbm.open('/export/atovtchi/taxa.db', 'c')
        inp = file(inpFile, 'r')
        iRow = 0
        buff = {}
        for line in inp:
            (gi, taxid) = line.split()
            buff[gi] = taxid
            if iRow % 100000 == 0:
                print gi, taxid, iRow
                gi2taxa.update(buff)
                buff = {}
            iRow += 1
        # flush whatever is left in the buffer after the last full batch
        gi2taxa.update(buff)
        inp.close()
        taxaCnt = {}
        for (gi, taxid) in gi2taxa.iteritems():
            taxaCnt[taxid] = taxaCnt.get(taxid, 0) + 1
        print sorted(((cnt, taxid) for (taxid, cnt) in taxaCnt.iteritems()))

    def loadGiTaxNumpy(self):
        if not os.path.exists(self.taxaPickled):
            makeGiTaxBin(self.taxaGiFiles, self.taxaPickled)

    def loadGiTaxPickled(self):
        self.gi2taxa = loadGiTaxBin(self.taxaPickled)
        #pdb.set_trace()

    def delDuplicateGiFromSeq(self):
        """Delete duplicate records (by gi) from the seq table, leaving only
        those with the smallest gi.
        Duplicate records appear because some sequences (with identical
        defline) are included in both the 'refseq_genomic' and 'other_genomic'
        BLAST databases."""
        db = self.dbSql
        db.createTableAs("tmp_gi2", """
        (select gi, count(*) as cnt from seq group by gi having count(*) > 1)
        """)
        db.createIndices(names=["gi"], table="tmp_gi2")
        db.createTableAs("tmp_id_gi2", """
        (select a.id, a.gi from seq a, tmp_gi2 b where a.gi = b.gi)
        """)
        db.createIndices(names=["id", "gi"], table="tmp_id_gi2")
        db.createTableAs("tmp_id_gi1", """
        (select a.id
        from tmp_id_gi2 a
        where a.id > SOME
        (select b.id from tmp_id_gi2 b where a.gi = b.gi))
        """)
        db.createIndices(names=["id"], table="tmp_id_gi1")
        db.ddl("""
        delete from seq where id in (select id from tmp_id_gi1)
        """)
        db.ddl("""
        delete from seq_hdr where id_seq in (select id from tmp_id_gi1)
        """)
        db.dropTables(("tmp_gi2", "tmp_id_gi2", "tmp_id_gi1"))
        db.createIndices(table="seq",
                         names=["gi"],
                         attrib={"gi": {"unique": True}})
        db.createIndices(table="seq_hdr",
                         names=["gi"],
                         attrib={"gi": {"unique": True}})

    def selectActiveSeq(self):
        """Select the 'active' source sequence set - the one that can be used
        for training.
        @post tables act_seq and act_src contain usable training sequence."""
        self.excludePreSource()
        self.selectSeqBySource()
        self.excludePostSource()

    def excludePreSource(self):
        """Exclude some sequence before selectTaxSource().
        @post table seq_excl has excluded idseq's.
        We exclude records that are likely to be outliers in composition:
        rRNA genes, plasmids and genomic islands.
        Then we exclude all NT records that are not a complete genome or a
        complete chromosome."""
        ## Currently we use MySQL-specific full text search
        db = self.dbSql
        db.createTableAs("seq_excl_1", """
        (select id_seq as id
        from seq_hdr
        where MATCH(hdr) AGAINST('"ribosomal RNA" "rRNA" "plasmid" "genomic island"'
        IN BOOLEAN MODE))
        """)
        db.createTableAs("seq_incl", """
        (select id_seq as id
        from seq_hdr
        where MATCH(hdr) AGAINST('"complete chromosome" "complete genome"'
        IN BOOLEAN MODE))
        """)
        db.createIndices(primary="id", table="seq_incl")
        db.ddl("""
        INSERT INTO seq_excl_1
        SELECT a.id
        FROM seq a
        WHERE a.src_db = 'n'
        AND a.id NOT IN (SELECT id FROM seq_incl)
        """)
        db.createTableAs("seq_excl", """
        (select DISTINCT id from seq_excl_1)
        """)
        db.createIndices(primary="id", table="seq_excl")

    def selectSeqBySource(self):
        """For each taxid, group available sequence by its source
        (refseq, wgs, htgs, nt) and select groups by priority.
        @post table all_src has all selected groups and a synthetic primary
        key 'id' for each group."""
        db = self.dbSql
        db.createTableAs("taxa_src_1", """
        (SELECT taxid,
        src_db,
        kind,
        project,
        0 AS priority,
        COUNT(*) AS cnt,
        SUM(seq_len) AS seq_len
        FROM seq
        WHERE id not in (SELECT id FROM seq_excl)
        GROUP BY taxid, src_db, kind, project)
        """)
        db.createTableAs("taxa_src", """
        (SELECT * FROM taxa_src_1)
        ORDER BY taxid, src_db, kind, project
        """)
        db.dropTable("taxa_src_1")
        #db.ddl("""ALTER TABLE taxa_src ADD id INTEGER auto_increment PRIMARY KEY""",ifDialect="mysql")
        db.ddl("""ALTER TABLE taxa_src ADD id INTEGER auto_increment PRIMARY KEY""")
        db.createIndices(names=["taxid", "src_db", "kind", "project"],
                         table="taxa_src")
        ## Drop "Alternative" full genomes (e.g. haplotypes)
        db.ddl("""
        DELETE FROM taxa_src WHERE kind = 'AC'
        """)
        db.createTableAs("wgs_src_1", """
        (SELECT * FROM taxa_src
        WHERE project <> ''
        AND kind <> '')
        """)
        db.ddl("""
        INSERT INTO wgs_src_1
        SELECT a.* FROM taxa_src a
        WHERE a.project <> ''
        AND a.taxid NOT IN (SELECT taxid FROM wgs_src_1)
        """)
        db.createTableAs("wgs_src_2", """
        (SELECT a.* FROM wgs_src_1 a
        WHERE a.seq_len >= ALL
        (SELECT b.seq_len FROM wgs_src_1 b WHERE a.taxid = b.taxid))
        """)
        db.createTableAs("wgs_src_3", """
        (SELECT a.* FROM wgs_src_2 a
        WHERE a.id >=
        (SELECT MAX(b.id) FROM wgs_src_2 b WHERE a.taxid = b.taxid))
        """)
        db.dropTable("wgs_src_2")
        db.createTableAs("wgs_src_nr", """
        (SELECT * FROM wgs_src_3)
        """)
        db.createTableAs("nc_src_1", """
        (SELECT a.* FROM taxa_src a
        WHERE a.kind = 'NC'
        AND a.seq_len * 2 > ALL
        (SELECT b.seq_len FROM wgs_src_nr b WHERE a.taxid = b.taxid))
        """)
        db.createTableAs("htg_over_nc_src_1", """
        (SELECT a.* FROM taxa_src a, nc_src_1 b
        WHERE a.src_db = 'h'
        AND a.kind <> 'NC'
        AND a.taxid = b.taxid
        AND a.seq_len > b.seq_len * 5
        AND b.seq_len < 500000
        AND a.taxid NOT IN (SELECT taxid FROM wgs_src_nr))
        """)
        db.createTableAs("nc_src_2", """
        (SELECT * FROM nc_src_1
        WHERE taxid NOT IN (SELECT taxid FROM htg_over_nc_src_1))
        """)
        db.createIndices(names=["taxid"], table="nc_src_2")
        db.createTableAs("nc_src_3", """
        (SELECT a.* FROM nc_src_2 a
        WHERE a.id >=
        (SELECT MAX(b.id) FROM nc_src_2 b WHERE a.taxid = b.taxid))
        """)
        db.createTableAs("nc_src", """
        (SELECT a.*, 'nc' AS stage FROM nc_src_3 a)
        """)
        db.createIndices(names=["taxid"], table="nc_src_3")
        db.createTableAs("wgs_src", """
        (SELECT a.*, 'wg' AS stage FROM wgs_src_nr a
        WHERE taxid NOT IN (SELECT taxid FROM nc_src))
        """)
        db.createIndices(names=["taxid"], table="wgs_src")
        db.createTableAs("htg_src_1", """
        (SELECT * FROM taxa_src
        WHERE (src_db = 'h' OR kind <> '')
        AND (taxid NOT IN (SELECT
        taxid FROM nc_src)
        AND taxid NOT IN (SELECT taxid FROM wgs_src)))
        """)
        db.createTableAs("htg_src", """
        (SELECT a.*, 'ht' AS stage FROM htg_src_1 a)
        """)
        db.createTableAs("gen_src", """
        (SELECT * FROM nc_src)
        """)
        db.ddl("""
        INSERT INTO gen_src
        SELECT * FROM wgs_src
        UNION
        SELECT * FROM htg_src
        """)
        db.createIndices(names=["taxid"], table="gen_src")
        db.createTableAs("nt_src", """
        (SELECT a.*, 'nt' AS stage FROM taxa_src a
        WHERE taxid NOT IN (SELECT taxid FROM gen_src))
        """)
        db.createTableAs("all_src_1", """
        (SELECT a.*, 'g' AS src_type FROM gen_src a)
        """)
        db.ddl("""
        INSERT INTO all_src_1
        SELECT a.*, 'o' AS src_type FROM nt_src a
        """)
        #def selectTaxSet(self):
        #db = self.dbSql
        tblNodes = self.taxaTreeDbStore.tblNodes
        ## We exclude records that are in subtrees of taxidDrop taxids,
        ## or have undefined taxonomy fields,
        ## or have a divid from the dividDrop list.
        db.createTableAs("excl_taxid", """
        (SELECT b.id
        FROM %(tblNodes)s b, %(tblNodes)s c
        WHERE %(isSub_B_C)s
        AND c.id in %(taxidDrop)s)
        """ % dict(isSub_B_C=sqlIsSubTree(aliasSub='b', aliasSup='c',
                                          withEquality=True),
                   tblNodes=tblNodes,
                   taxidDrop=sqlInList(self.taxidDrop)))
        db.ddl("ANALYZE TABLE excl_taxid", ifDialect="mysql")
        db.createIndices(table="excl_taxid", primary="id")
        db.createTableAs("all_src", """
        (SELECT a.*, b.cat, c.divid, c.rank
        FROM all_src_1 a
        LEFT JOIN taxa_cat b ON a.taxid = b.taxid
        LEFT JOIN taxa_node c ON a.taxid = c.taxid
        WHERE not (a.taxid IS NULL
        OR a.taxid IN (SELECT id FROM excl_taxid)
        OR a.taxid = 0
        OR c.divid IS NULL
        OR b.cat IS NULL
        OR c.divid IN %(dividDrop)s))
        """ % dict(dividDrop=sqlInList(self.dividDrop)))
        db.ddl("ANALYZE TABLE all_src", ifDialect="mysql")
        db.createIndices(names=["taxid", "src_db", "kind", "project",
                                "cat", "divid", "rank"],
                         table="all_src",
                         primary="id",
                         compounds={"src": "taxid,src_db,kind,project"},
                         attrib={"src": {"unique": True}})
        db.ddl("ANALYZE TABLE all_src", ifDialect="mysql")

    def excludePostSource(self):
        """Exclude some sequence after selectTaxSource().
        @post table act_seq is a mapping into seq and act_src
        @post table act_src is all_src with some records dropped and seq_len
        recomputed from seq_src.
        The combination of these two new tables allows filtering the initial
        sequence set described by all_src both at the individual sequence
        level (e.g. drop short sequences) and at the source id level (meaning
        origin, sequence type) (e.g. retain only the longest RefSeq strain for
        each species). Currently this method does not drop any sequence.
        After this method, act_seq and act_src can be used to randomly sample
        training and testing sets."""
        db = self.dbSql
        db.createTableAs("act_seq", """
        (SELECT a.id, a.gi, b.id as id_src, a.taxid, a.seq_len
        FROM seq a, all_src b
        WHERE a.taxid = b.taxid
        AND a.src_db = b.src_db
        AND a.kind = b.kind
        AND a.project = b.project
        AND a.taxid <> 0
        AND a.id not in (SELECT id FROM seq_excl))
        """)
        db.createIndices(names=["gi", "id_src", "taxid"],
                         primary="id",
                         table="act_seq",
                         attrib={"gi": {"unique": True}})
        db.ddl("ANALYZE TABLE act_seq", ifDialect="mysql")
        db.createTableAs("act_src", """
        (SELECT a.id, a.taxid, a.src_db, a.kind, a.project, a.cat, a.divid,
        b.seq_len
        FROM all_src a,
        (SELECT id_src, sum(seq_len) as seq_len FROM act_seq GROUP BY id_src) b
        WHERE a.id = b.id_src)
        """)
        db.ddl("ANALYZE TABLE act_src", ifDialect="mysql")
        db.createIndices(names=["taxid", "src_db", "kind", "project",
                                "cat", "divid"],
                         table="act_src",
                         primary="id",
                         compounds={"src": "taxid,src_db,kind,project"},
                         attrib={"src": {"unique": True}})
        db.ddl("ANALYZE TABLE act_src", ifDialect="mysql")
        # This is more for reporting - operational counts are based
        # on samples and created in the LabelTaxa module
        db.createTableAs("taxa_seq_len", """
        select taxid, sum(seq_len) as seq_len from act_src group by taxid
        """)
        db.createIndices(primary="taxid", table="taxa_seq_len")
        db.ddl("ANALYZE TABLE taxa_seq_len", ifDialect="mysql")
        # To analyze what sorts of NT records might be included, use something
        # like the query below (remember that currently we drop most of NT in
        # excludePreSource). Without that drop, there are things like RNA
        # spike collections and many genomic islands. The reasonable
        # candidates are large genomic clusters (100Kb), but currently we drop
        # them because they might be biased across subtrees. Also, we often
        # have for such things:
        # Species node (genome clusters) -> strain node (complete genome).
        #select b.*,avg(a.seq_len) as avg_len,sum(a.seq_len) as sum_len,max(a.gi)
        #from act_seq a, act_src b
        #where a.seq_len >= 150000 and a.id_src = b.id and b.src_db = 'n'
        #and b.cat = 'B' group by a.id_src having sum_len >= 100000
        #order by avg_len desc limit 100;

    def exportIdsForSeqDb(self, seqTable):
        """Write text files with sequence ids that will be used to create
        sequence HDF5 files.
        Ids are extracted from the act_seq table. Currently we load only NCBI
        sequence, but we want to decouple SeqDB from NCBI GIs in case we also
        use non-NCBI sequence in the future. Therefore, we write a file with
        (GI,ID) pairs where ID is our internal sequence id.
        The order is defined by our tree nested set index.
        That most closely corresponds to the order in which we will be
        traversing the tree most of the time."""
        treeTable = self.taxaTreeDbStore.tblNodes
        db = self.dbSql
        # GI,ID list for fastacmd input and HDF index id
        db.exportToFile(
            """SELECT a.gi,a.id""",
            """
            FROM %(seqTable)s a, %(treeTable)s b
            WHERE a.taxid = b.id
            ORDER BY b.lnest, a.id
            """ % {"seqTable": seqTable, "treeTable": treeTable},
            fileName=self.seqGiIdFile,
            fieldsTerm=' ',
            linesTerm=r'\n')

    def selectSeqIds(self):
        db = self.dbSql
        self.loadTaxLevelsMem()
        db.createTableAs("seq_sel", """
        (SELECT a.gi, a.taxid, b.id as src_id
        FROM seq a, all_src b
        WHERE a.taxid = b.taxid
        AND a.src_db = b.src_db
        AND a.kind = b.kind
        AND a.project = b.project
        AND a.taxid <> 0)
        """)
        ## Proved to be essential in MySQL for efficient planning of queries
        db.ddl("ANALYZE TABLE seq_sel", ifDialect="mysql")
        db.createIndices(names=["taxid", "src_id"],
                         table="seq_sel",
                         primary="gi")
        db.ddl("ANALYZE TABLE seq_sel", ifDialect="mysql")
        ## In MySQL the computation of acc_no_ver (ACC w/o '.X' suffix)
        ## would be just SUBSTRING_INDEX( a.acc , '.', 1 ) AS acc_no_ver,
        ## but we use an SQL standard conforming expression
        #db.createTableAs("seq_sel","""
        #(SELECT a.* ,
        #b.cat ,
        #b.stage ,
        #b.src_type ,
        #SUBSTRING(a.acc FROM 1 FOR
        #CASE
        #WHEN POSITION('.' IN a.acc) <> 0
        #THEN POSITION('.' IN a.acc) - 1
        #ELSE LENGTH(a.acc)
        #END)
        #AS acc_no_ver ,
        #b.divid ,
        #b.rank
        #FROM seq a,
        #all_src b
        #WHERE a.taxid = b.taxid
        #AND a.src_db = b.src_db
        #AND a.kind = b.kind
        #AND a.project = b.project
        #AND a.taxid <> 0
        #)
        #""")
        #db.createIndices(names=["taxid","src_db","kind","project","cat",
        #"stage","src_type","gi","acc","acc_no_ver","divid","rank"],
        #table="seq_sel",
        #primary="id")
        levelsComma = self.taxaLevels.getLevelColumnsComma(alias='b',
                                                           order='descend')
        ## save gis only into a file to be used as gi list for NCBI alias db
        db.exportToFile(
            """SELECT a.gi""",
            """FROM seq_sel a, taxa_level_col b
            WHERE a.taxid = b.taxid
            ORDER BY %s, a.taxid, a.gi
            """ % (levelsComma,),
            fileName=self.collectTaxaGiFile,
            fieldsTerm=' ',
            linesTerm=r'\n')
        ## Save all important fields for selected seq records into a flat file
        ## to be merged with fasta sequence data extracted from BLAST databases
        ## through the just created gi list.
        ## The order must match the preceding 'select' for gis.
        ## Full csv format will have something like OPTIONALLY ENCLOSED BY '"'
        #db.ddl("""
        #SELECT gi ,
        #taxid ,
        #src_db ,
        #kind ,
        #project ,
        #cat ,
        #stage ,
        #src_type,
        #id ,
        #seq_len ,
        #divid ,
        #rank
        #INTO OUTFILE '/home/atovtchi/scratch/mgtdata/phyla_sel.csv'
        #FIELDS TERMINATED BY ' '
        #LINES TERMINATED BY '\n'
        #FROM seq_sel
        #ORDER BY taxid,
        #id
        #""")

    def loadSeqToHdf(self):
        """Load sequence data for all records in the 'seq' table into an HDF
        dataset."""
        seqTable = "act_seq"
        self.exportIdsForSeqDb(seqTable=seqTable)
        ## CAUTION! mode="w" truncates the HDF file if it already exists.
## "a" opens file for writing w/o destroying existing data, ## "r+" is the same as "a" but file must exist seq_len_tot = long(self.dbSql.selectScalar("select sum(seq_len) from %s" % seqTable)) seq_cnt = long(self.dbSql.selectScalar("select count(*) from %s" % seqTable)) hdfFile = pt.openFile(self.hdfSeqFile,mode="w") hdfLoader = HdfSeqLoader(hdfFile=hdfFile, hdfGroup=self.hdfSeqGroup, seqSizeEstimate=seq_len_tot, seqNumEstimate=seq_cnt) hdfLoader.loadBlastDb(giIdFile=self.seqGiIdFile) #hdfLoader.close() hdfFile.close() def indexHdfSeq(self): """Create SQL tables that index sequence in HDF dataset.""" db = self.dbSql hdfFile = pt.openFile(self.hdfSeqFile,mode="r") hdfSeq = HdfSeqReaderSql(db=db,hdfFile=hdfFile,hdfGroup=self.hdfSeqGroup) hdfSeq.loadIndToSql() #hdfSeq.close() hdfFile.close() def indexHdfActiveSeq(self): """Create HDF index dataset for active sequence (from act_seq table).""" hdfFile = pt.openFile(self.hdfActSeqFile,mode="w") hdfMakeActiveSeqInd(db=self.dbSql,hdfFile=hdfFile,hdfPath=self.hdfActSeqInd) hdfFile.close() def reportStat(self): """Output a report that shows a high-level overview of data.""" db = self.dbSql ## Some tables to report statistics on data ## Group into a single taxid entry per source database db.createTableAs("src_db_stat",""" (SELECT src_db , taxid , COUNT(*) AS cnt FROM all_src GROUP BY src_db, taxid ) """) db.executeAndPrint(""" SELECT src_db, COUNT(*) FROM src_db_stat GROUP BY src_db """) db.createTableAs("taxa_stat",""" (SELECT src_type , stage, taxid, cat , COUNT(*) AS cnt FROM all_src GROUP BY src_type, stage , taxid , cat ) """) db.executeAndPrint(""" SELECT stage, COUNT(*) FROM taxa_stat GROUP BY stage """) db.executeAndPrint(""" SELECT stage, cat , COUNT(*) FROM taxa_stat GROUP BY stage, cat """) db.executeAndPrint(""" SELECT src_type, cat , COUNT(*) FROM taxa_stat GROUP BY src_type, cat """) db.executeAndPrint(""" SELECT COUNT(*) FROM all_src WHERE kind = 'NC' """) ## Human must be in NC_ only db.executeAndPrint(""" SELECT * FROM all_src WHERE taxid = 9606 """) db.executeAndPrint(""" SELECT rank, COUNT(*) FROM all_src GROUP BY rank """) db.executeAndPrint(""" SELECT stage, rank, COUNT(*) FROM all_src GROUP BY stage, rank """) db.executeAndPrint(""" SELECT divid, COUNT(*) FROM seq_sel GROUP BY divid """) db.executeAndPrint(""" SELECT rank, COUNT(*) FROM seq_sel GROUP BY rank """)