def getNodesDump_online(downloadD):
    """Fetch the NCBI taxonomy dump archive and pull out nodes.dmp.

    downloadD -- destination directory; created when it does not exist yet.
    Returns the path to the extracted nodes.dmp, as reported by
    pathoUtilsA.wget_download2.
    """
    if not os.path.exists(downloadD):
        os.makedirs(downloadD)
    base = 'taxdump'
    ext = 'tar.gz'
    # 'select' mode asks the downloader to extract only nodes.dmp
    # from the fetched tarball.
    ftp_url = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/%s.%s' % (base, ext)
    return pathoUtilsA.wget_download2(ftp_url, base, ext, downloadD,
                                      'select', 'nodes.dmp')
def getGi2TaxDump_online(downloadD):
    """Fetch NCBI's gi -> taxon-id mapping dump (nucleotide).

    downloadD -- destination directory; created when it does not exist yet.
    Returns the path to the extracted gi_taxid_nucl.dmp, as reported by
    pathoUtilsA.wget_download2.
    """
    if not os.path.exists(downloadD):
        os.makedirs(downloadD)
    base = 'gi_taxid_nucl.dmp'
    ext = 'gz'
    # 'select' mode asks the downloader to extract only the named file.
    ftp_url = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/%s.%s' % (base, ext)
    return pathoUtilsA.wget_download2(ftp_url, base, ext, downloadD,
                                      'select', 'gi_taxid_nucl.dmp')
def gb2prepare_load_data_file(MySqlConf,tiNtDfn,downloadD):
    """Build and bulk-load the pathoDB annotation tables from NCBI flat files.

    Pipeline (all steps visible below):
      1. Download GenBank (*.seq.gz), RefSeq genomic/rna (*.gbff.gz),
         RefSeq protein (*.gpff.gz) and TSA (*.gbff.gz) flat files into
         <downloadD>/gbff via pathoUtilsA.wget_download2.
      2. Gunzip each archive one at a time, parse every LOCUS..// entry with
         parse_ncbi_entry(), and stream the results into three tab-separated
         staging CSVs (giAnnoT2load.csv, delimT2load.csv, tax2load.csv).
      3. Bulk-load the CSVs into MySQL tables giAnnoT / giDelimT / cj_taxonT
         with LOAD DATA LOCAL INFILE and create their indexes.

    Parameters:
      MySqlConf -- MySQL connection settings passed to
                   dbUtils.init_mysql_innocentive (presumably indexable by the
                   HOST_NAME..DEFAULT_DB constants below -- TODO confirm).
      tiNtDfn   -- file handed to add_dbsize2taxonT() to attach database sizes
                   to taxon entries (exact format defined elsewhere).
      downloadD -- working directory; flat files go to <downloadD>/gbff.

    Side effects: creates directories, runs shell commands (rm/gunzip/mv via
    os.system), writes CSVs, and modifies the MySQL database. Returns None.

    NOTE(review): this function shells out with os.system on interpolated
    paths -- safe only for trusted, space-free directory names.
    """
    # Index names into MySqlConf (not referenced below -- kept for
    # documentation of the expected tuple layout; TODO confirm).
    HOST_NAME,MYSQL_PORT,USER,PASSWORD,DEFAULT_DB = range(5)
    #TODO[to develop to maintain pathoDB , differential updates]----------
    # Clean up the download directory first.
    downloadD_gff = downloadD + '/gbff'
    # The three archive suffixes we accept for processing.
    gbDnExt=['x','x','x']
    gbDnExt[0] = 'seq.gz'
    gbDnExt[1] = 'gbff.gz'
    gbDnExt[2] = 'protein.gpff.gz'
    if not os.path.exists(downloadD_gff):
        os.makedirs(downloadD_gff)
    else:
        # Directory already exists: remove leftovers from a previous run.
        cmd = 'rm -rf %s/*.gbff.gz*\n' % (downloadD_gff)
        cmd = '%srm -rf %s/*.gpff.gz*\n' % (cmd,downloadD_gff)
        cmd = '%srm -rf %s/*.seq.gz*\n' % (cmd,downloadD_gff)
        os.system(cmd)
    # Download GenBank flat format files.
    downExt='gz'
    downFbase='*.seq'
    #downFbase='gbest?.seq' #debug
    gbFlatFtpD='ftp://ftp.ncbi.nih.gov/genbank'
    gbFlatFtp=gbFlatFtpD+'/'+downFbase+'.'+downExt
    dummy=pathoUtilsA.wget_download2(gbFlatFtp,downFbase,downExt,downloadD_gff,'nothing','X')
    # Download RefSeq flat format files (genomic and rna).
    downExt='gz'
    downFbase='*.gbff'
    #downFbase='complete.?.*.gbff' #debug
    refSeqFlatFtpD='ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete'
    refSeqFlatFtp=refSeqFlatFtpD+'/'+downFbase+'.'+downExt
    dummy=pathoUtilsA.wget_download2(refSeqFlatFtp,downFbase,downExt,downloadD_gff,'nothing','X')
    # Download RefSeq flat format files (protein).
    downExt='gz'
    downFbase='*.gpff'
    #downFbase='complete.?.protein.gpff' #debug
    refSeqFlatFtpD='ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete'
    refSeqFlatFtp=refSeqFlatFtpD+'/'+downFbase+'.'+downExt
    dummy=pathoUtilsA.wget_download2(refSeqFlatFtp,downFbase,downExt,downloadD_gff,'nothing','X')
    # Download TSA (transcriptome shotgun assembly) flat format files.
    downExt='gz'
    downFbase='*.gbff'
    #downFbase='tsa.GAA?.1.gbff' #debug
    refSeqFlatFtpD='ftp://ftp.ncbi.nih.gov/genbank/tsa'
    refSeqFlatFtp=refSeqFlatFtpD+'/'+downFbase+'.'+downExt
    dummy=pathoUtilsA.wget_download2(refSeqFlatFtp,downFbase,downExt,downloadD_gff,'nothing','X')
    # Exclude master-record files from processing.
    cmd = 'rm -rf %s/*.mstr.gbff.gz' % downloadD_gff
    os.system(cmd)
    # The following two index tuples must stay consistent with the field
    # order produced by parse_ncbi_entry().
    GI,REF_NAME,TAXON_ID,ORGANISM,LINEAGE,PRODUCT,STBP,EDBP,SUB_GI=range(9)
    GI_SUB,STRAND_SUB,STBP_SUB,EDBP_SUB,GENE_SUB,LOCUS_TAG_SUB,PRODUCT_SUB,PROTEIN_ID_SUB=range(8)
    # "not available" placeholders (not referenced below -- TODO confirm
    # whether the csv_update_* helpers expect them).
    NAi=0
    NAs='X'
    ##################################################################$
    # Process GenBank flat files and transfer all annotation to MySQL.
    ##################################################################$
    ANNO_T,DELIM_T,TAX_T=range(3)
    gi_annoT_fn=downloadD_gff+'/giAnnoT2load.csv'
    delimT_fn=downloadD_gff+'/delimT2load.csv'
    taxT_fn=downloadD_gff+'/tax2load.csv'
    # Tracks taxon ids already written to tax2load.csv (dedup).
    h_taxLookup = {}
    # Read tax2load into the dictionary (debug path, disabled) ------------>
    if False:
        fp=open(taxT_fn,'r')
        for i in fp:
            words = i.split('\t')
            h_taxLookup[words[1]]=1
        fp.close()
    #<--------------------
    # One output handle per staging table, indexed by ANNO_T/DELIM_T/TAX_T.
    fps=[-1,-1,-1]
    fps[ANNO_T]=open(gi_annoT_fn,'w')
    fps[DELIM_T]=open(delimT_fn,'w')
    fps[TAX_T]=open(taxT_fn,'w')
    print 'transferring gene bank report to mysql...'
    # Scratch file each archive is gunzipped into before parsing.
    gbFlatTmp = '%s/gb2process.tmp' % downloadD_gff
    # Running primary keys for the three staging tables.
    pkey_anno = 1
    pkey_delim = 1
    pkey_ti = 1
    # Processed archives are moved here so a rerun skips them.
    doneGbD = downloadD_gff+'/completed_gbff'
    if not os.path.exists(doneGbD):
        os.makedirs(doneGbD)
    # Count the total number of gz files to process (F includes non-archive
    # entries too, so the [f/F] progress counter is an upper bound).
    F = len(os.listdir(downloadD_gff))
    f = 0
    for gbFlatFn in os.listdir(downloadD_gff):
        tick=time()
        if gbFlatFn.endswith(gbDnExt[0]) or gbFlatFn.endswith(gbDnExt[1]) or gbFlatFn.endswith(gbDnExt[2]):# or gbFlatFn!='gbcon208.seq.gz': #debug
            # Decompress to the scratch file, then archive the original.
            cmd='gunzip -c %s/%s > %s\n' % (downloadD_gff,gbFlatFn,gbFlatTmp)
            cmd='%smv %s/%s %s/%s\n' % (cmd,downloadD_gff,gbFlatFn,doneGbD,gbFlatFn)
            os.system(cmd)
            f+=1
        else:
            continue
        print 'processing %s[%d/%d]...' % (gbFlatFn,f,F)
        fp = open(gbFlatTmp,'r')
        # Skip the file header: read until the first "LOCUS" line.
        header=True
        while header: # Dump the header in the file
            tmp=fp.readline()
            if len(tmp)>5:
                header = (not tmp[:5] == "LOCUS")
        entry = [tmp] # Only collect the section between "LOCUS" ... "//".
        ti=-1
        for x in fp:
            if re.search(r'^//', x): # Every time we get to a //\n line, we parse the current entry and then start collecting a new one.
                gB = parse_ncbi_entry(entry)
                #print gB[0] #debug
                entry=[]
                #.............................................
                # 0) Check if the query gi has multiple sub gis.
                has_sub=0
                if len(gB[SUB_GI])>0:
                    has_sub=1
                # 1) Update the query gi annotation.
                #mysql_update_anno_gi(con,gB[GI],gB[REF_NAME],gB[EDBP],gB[TAXON_ID],gB[PRODUCT],has_sub)
                pkey_anno,fps[ANNO_T] = csv_update_anno_gi(fps[ANNO_T],gB[GI],gB[REF_NAME],gB[EDBP],gB[TAXON_ID],gB[PRODUCT],has_sub,pkey_anno)
                # 2) Update the query delimiter table.
                # mysql_update_delim(con,gB[GI],gB[GI],'+',gB[STBP],gB[EDBP])
                for s in gB[SUB_GI]:
                    # 3) Update sub_gi annotation.
                    #mysql_update_anno_sub_gi(con,s[GI],gB[REF_NAME],gB[TAXON_ID],s[PRODUCT_SUB],s[GENE_SUB],s[PROTEIN_ID_SUB])
                    # 4) Update sub_gi delimiter entry.
                    #mysql_update_delim(con,gB[GI],s[GI_SUB],s[GENE_SUB],s[PROTEIN_ID_SUB],s[STRAND_SUB],s[STBP_SUB],s[EDBP_SUB])
                    pkey_delim,fps[DELIM_T]=csv_update_delim(fps[DELIM_T],gB[GI],s[GI_SUB],s[GENE_SUB],s[LOCUS_TAG_SUB],s[PROTEIN_ID_SUB],s[STRAND_SUB],s[STBP_SUB],s[EDBP_SUB],pkey_delim)
                #ti=mysql_update_ti(con,gB[TAXON_ID],gB[ORGANISM],gB[LINEAGE])
                # Write each taxon id to the taxon staging file only once.
                if h_taxLookup.get(gB[TAXON_ID],-1) == -1:
                    pkey_ti,fps[TAX_T] = csv_update_ti(fps[TAX_T],gB[TAXON_ID],gB[ORGANISM],gB[LINEAGE],pkey_ti)
                    h_taxLookup[gB[TAXON_ID]]=1
                #.............................................
            else:
                entry.append(x)
        # for loop end (x in fp)
        fp.close()
        tock=time()
        elapsed=tock-tick
        print 'elasped time:[%g]' % elapsed
    # (gbFlatFn) for loop finished -- flush the staging CSVs.
    fps[0].close()
    fps[1].close()
    fps[2].close()
    # Bulk-load the staging CSVs into MySQL and index the tables.
    con = dbUtils.init_mysql_innocentive(MySqlConf,0)
    with con:
        print 'loading %s...' % (gi_annoT_fn)
        cur=con.cursor()
        mysql_load_cmd = 'load data local infile \'%s\' into table giAnnoT fields terminated by \'\\t\'' % gi_annoT_fn
        cur.execute(mysql_load_cmd)
        cur=con.cursor()
        mysql_idx_cmd = 'create unique index idx_gi on giAnnoT (gi)'
        cur.execute(mysql_idx_cmd)
        print 'done.'
        print 'loading %s...' % (delimT_fn)
        cur=con.cursor()
        mysql_load_cmd = 'load data local infile \'%s\' into table giDelimT fields terminated by \'\\t\'' % delimT_fn
        cur.execute(mysql_load_cmd)
        cur=con.cursor()
        mysql_idx_cmd = 'create index idx_subgi on giDelimT (gi,stbp,edbp)'
        cur.execute(mysql_idx_cmd)
        print 'done.'
        print 'computing database size for each taxon id...'
        if False:
            # Collect dbSize for each ti (disabled alternative).
            h_ti_dbSz = get_ti_db_size(gi_annoT_fn)
            update_taxT_fn(h_ti_dbSz,taxT_fn)
        else:
            add_dbsize2taxonT(tiNtDfn,taxT_fn)
        print 'done.'
        print 'loading %s...' % (taxT_fn)
        cur=con.cursor()
        mysql_load_cmd = 'load data local infile \'%s\' into table cj_taxonT fields terminated by \'\\t\'' % taxT_fn
        cur.execute(mysql_load_cmd)
        cur=con.cursor()
        mysql_idx_cmd = 'create unique index idx_taxon on cj_taxonT (taxon)'
        cur.execute(mysql_idx_cmd)
        print 'done.'
    dbUtils.mysql_close(con)
    print 'done'