def getNodesDump_online(downloadD):
	"""Fetch NCBI's taxonomy dump archive and return the local path to nodes.dmp.

	Creates downloadD if needed, downloads taxdump.tar.gz from the NCBI
	taxonomy FTP area via pathoUtilsA.wget_download2 (in 'select' mode so only
	nodes.dmp is kept), and returns the path reported by the downloader.
	"""
	if not os.path.exists(downloadD):
		os.makedirs(downloadD)
	baseName = 'taxdump'
	ext = 'tar.gz'
	ftpUrl = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/%s.%s' % (baseName, ext)
	return pathoUtilsA.wget_download2(ftpUrl, baseName, ext, downloadD, 'select', 'nodes.dmp')
def getGi2TaxDump_online(downloadD):
	"""Fetch NCBI's gi-to-taxon mapping dump and return its local path.

	Creates downloadD if needed, then downloads gi_taxid_nucl.dmp.gz from the
	NCBI taxonomy FTP area via pathoUtilsA.wget_download2 (in 'select' mode,
	keeping gi_taxid_nucl.dmp) and returns the path reported by the downloader.
	"""
	if not os.path.exists(downloadD):
		os.makedirs(downloadD)
	baseName = 'gi_taxid_nucl.dmp'
	ext = 'gz'
	ftpUrl = 'ftp://ftp.ncbi.nih.gov/pub/taxonomy/%s.%s' % (baseName, ext)
	return pathoUtilsA.wget_download2(ftpUrl, baseName, ext, downloadD, 'select', 'gi_taxid_nucl.dmp')
def gb2prepare_load_data_file(MySqlConf,tiNtDfn,downloadD):
	
	HOST_NAME,MYSQL_PORT,USER,PASSWORD,DEFAULT_DB = range(5)
	#TODO[to develop to maintain pathoDB , differential updates]----------
	#to clean up downloadD first
	downloadD_gff = downloadD + '/gbff'
	
	gbDnExt=['x','x','x']
	gbDnExt[0] = 'seq.gz'
	gbDnExt[1] = 'gbff.gz'
	gbDnExt[2] = 'protein.gpff.gz'
	
	
	if not os.path.exists(downloadD_gff):
		os.makedirs(downloadD_gff)
	else:
		cmd = 'rm -rf %s/*.gbff.gz*\n' % (downloadD_gff)
		cmd = '%srm -rf %s/*.gpff.gz*\n' % (cmd,downloadD_gff)
		cmd = '%srm -rf %s/*.seq.gz*\n' % (cmd,downloadD_gff)
		os.system(cmd)
			
	#download genabank flat format files
	downExt='gz'
	downFbase='*.seq'
	#downFbase='gbest?.seq' #debug
	gbFlatFtpD='ftp://ftp.ncbi.nih.gov/genbank'
	gbFlatFtp=gbFlatFtpD+'/'+downFbase+'.'+downExt
	
	dummy=pathoUtilsA.wget_download2(gbFlatFtp,downFbase,downExt,downloadD_gff,'nothing','X')
	
	#download refseq flat format files (genomic and rna)
	downExt='gz'
	downFbase='*.gbff'
	#downFbase='complete.?.*.gbff' #debug
	refSeqFlatFtpD='ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete'
	refSeqFlatFtp=refSeqFlatFtpD+'/'+downFbase+'.'+downExt
	dummy=pathoUtilsA.wget_download2(refSeqFlatFtp,downFbase,downExt,downloadD_gff,'nothing','X')
	
	
	#download refseq flat format files (protein)
	downExt='gz'
	downFbase='*.gpff'
	#downFbase='complete.?.protein.gpff' #debug
	refSeqFlatFtpD='ftp://ftp.ncbi.nlm.nih.gov/refseq/release/complete'
	refSeqFlatFtp=refSeqFlatFtpD+'/'+downFbase+'.'+downExt
	dummy=pathoUtilsA.wget_download2(refSeqFlatFtp,downFbase,downExt,downloadD_gff,'nothing','X')
			
	#download tsa flat format files
	downExt='gz'
	downFbase='*.gbff'
	#downFbase='tsa.GAA?.1.gbff' #debug
	refSeqFlatFtpD='ftp://ftp.ncbi.nih.gov/genbank/tsa'
	refSeqFlatFtp=refSeqFlatFtpD+'/'+downFbase+'.'+downExt
	dummy=pathoUtilsA.wget_download2(refSeqFlatFtp,downFbase,downExt,downloadD_gff,'nothing','X')
	#except the following format files
	cmd = 'rm -rf %s/*.mstr.gbff.gz' % downloadD_gff
	os.system(cmd)
	

	#the following two lines should be conistent with the one defined in parse_ncbi_entry()
	GI,REF_NAME,TAXON_ID,ORGANISM,LINEAGE,PRODUCT,STBP,EDBP,SUB_GI=range(9)
	GI_SUB,STRAND_SUB,STBP_SUB,EDBP_SUB,GENE_SUB,LOCUS_TAG_SUB,PRODUCT_SUB,PROTEIN_ID_SUB=range(8)
	
	NAi=0
	NAs='X'

	##################################################################$
	#processing genbank flat file and transfer all annotation to mysql
	##################################################################$
	ANNO_T,DELIM_T,TAX_T=range(3)
	
	gi_annoT_fn=downloadD_gff+'/giAnnoT2load.csv'
	delimT_fn=downloadD_gff+'/delimT2load.csv'
	taxT_fn=downloadD_gff+'/tax2load.csv'
	
	h_taxLookup = {}
	#read tax2load to dictionary (debug) ------------>
	if False:
		fp=open(taxT_fn,'r')
		for i in fp:
			words = i.split('\t')
			h_taxLookup[words[1]]=1
		fp.close()
	#<--------------------
	
	
	fps=[-1,-1,-1]
	fps[ANNO_T]=open(gi_annoT_fn,'w')
	fps[DELIM_T]=open(delimT_fn,'w')
	fps[TAX_T]=open(taxT_fn,'w')

	print 'transferring gene bank report to mysql...'
	gbFlatTmp = '%s/gb2process.tmp' % downloadD_gff
	pkey_anno = 1
	pkey_delim = 1
	pkey_ti = 1

	
	doneGbD = downloadD_gff+'/completed_gbff'
	if not os.path.exists(doneGbD):
		os.makedirs(doneGbD)

	#count a total # of gz to process

	F = len(os.listdir(downloadD_gff))
	f = 0
	for gbFlatFn in os.listdir(downloadD_gff):
		tick=time()
		
		if gbFlatFn.endswith(gbDnExt[0]) or gbFlatFn.endswith(gbDnExt[1]) or gbFlatFn.endswith(gbDnExt[2]):# or gbFlatFn!='gbcon208.seq.gz': #debug
			cmd='gunzip -c %s/%s > %s\n' % (downloadD_gff,gbFlatFn,gbFlatTmp)
			cmd='%smv %s/%s %s/%s\n' % (cmd,downloadD_gff,gbFlatFn,doneGbD,gbFlatFn)
			os.system(cmd)
			f+=1
		else:
			continue
		print 'processing %s[%d/%d]...' % (gbFlatFn,f,F)
		
		fp = open(gbFlatTmp,'r')
		#skipping header
		header=True
		while header: # Dump the header in the file
			tmp=fp.readline()
			if len(tmp)>5:
				header = (not tmp[:5] == "LOCUS")
		entry = [tmp]
		
		#only focus on the section between "LOCUS" ... "//"
		ti=-1
		for x in fp:
			if re.search(r'^//', x): # Every time we get to a //\n line, we read the current entry and then start collecting a new one.
				gB = parse_ncbi_entry(entry)
				#print gB[0] #debug
				entry=[]
				#.............................................
				#0) check if query gi has multiple sub gis
				has_sub=0
				if len(gB[SUB_GI])>0:
					has_sub=1

				#1) update query gi annotation
				#mysql_update_anno_gi(con,gB[GI],gB[REF_NAME],gB[EDBP],gB[TAXON_ID],gB[PRODUCT],has_sub)
				pkey_anno,fps[ANNO_T] = csv_update_anno_gi(fps[ANNO_T],gB[GI],gB[REF_NAME],gB[EDBP],gB[TAXON_ID],gB[PRODUCT],has_sub,pkey_anno)
				
				#2) update query delimit
				# mysql_update_delim(con,gB[GI],gB[GI],'+',gB[STBP],gB[EDBP])
				
				for s in gB[SUB_GI]:
					#3) update sub_gi annotation
					#mysql_update_anno_sub_gi(con,s[GI],gB[REF_NAME],gB[TAXON_ID],s[PRODUCT_SUB],s[GENE_SUB],s[PROTEIN_ID_SUB])
					#4) update sub_gi delimit
					#mysql_update_delim(con,gB[GI],s[GI_SUB],s[GENE_SUB],s[PROTEIN_ID_SUB],s[STRAND_SUB],s[STBP_SUB],s[EDBP_SUB])
					pkey_delim,fps[DELIM_T]=csv_update_delim(fps[DELIM_T],gB[GI],s[GI_SUB],s[GENE_SUB],s[LOCUS_TAG_SUB],s[PROTEIN_ID_SUB],s[STRAND_SUB],s[STBP_SUB],s[EDBP_SUB],pkey_delim)


				#ti=mysql_update_ti(con,gB[TAXON_ID],gB[ORGANISM],gB[LINEAGE])
				if h_taxLookup.get(gB[TAXON_ID],-1) == -1:
					pkey_ti,fps[TAX_T] = csv_update_ti(fps[TAX_T],gB[TAXON_ID],gB[ORGANISM],gB[LINEAGE],pkey_ti)
					h_taxLookup[gB[TAXON_ID]]=1
					
				#.............................................
			else:
				entry.append(x)
		#for loop end (x in fp)
		fp.close()
		tock=time()
		elapsed=tock-tick
		print 'elasped time:[%g]' % elapsed
	
	fps[0].close()
	fps[1].close()
	fps[2].close()
	#(gbFlatFn) finish for loop
	
	con = dbUtils.init_mysql_innocentive(MySqlConf,0)
	with con:

		print 'loading %s...' % (gi_annoT_fn)
		cur=con.cursor()
		mysql_load_cmd = 'load data local infile \'%s\' into table giAnnoT fields terminated by \'\\t\'' % gi_annoT_fn
		cur.execute(mysql_load_cmd)
		cur=con.cursor()
		mysql_idx_cmd = 'create unique index idx_gi on giAnnoT (gi)'
		cur.execute(mysql_idx_cmd)
		print 'done.'

		print 'loading %s...' % (delimT_fn)
		cur=con.cursor()
		mysql_load_cmd = 'load data local infile \'%s\' into table giDelimT fields terminated by \'\\t\'' % delimT_fn
		cur.execute(mysql_load_cmd)
		cur=con.cursor()
		mysql_idx_cmd = 'create index idx_subgi on giDelimT (gi,stbp,edbp)'
		cur.execute(mysql_idx_cmd)
		print 'done.'
		
		print 'computing database size for each taxon id...'
		if False:
			#collect dbSize for each ti
			h_ti_dbSz = get_ti_db_size(gi_annoT_fn)
			update_taxT_fn(h_ti_dbSz,taxT_fn)
		else:
			add_dbsize2taxonT(tiNtDfn,taxT_fn)
		print 'done.'
		
		print 'loading %s...' % (taxT_fn)
		cur=con.cursor()
		mysql_load_cmd = 'load data local infile \'%s\' into table cj_taxonT fields terminated by \'\\t\'' % taxT_fn
		cur.execute(mysql_load_cmd)
		cur=con.cursor()
		mysql_idx_cmd = 'create unique index idx_taxon on cj_taxonT (taxon)'
		cur.execute(mysql_idx_cmd)
		print 'done.'
		
	dbUtils.mysql_close(con)
	print 'done'