def get_ucsc2gid_df(self, tax_id): #mmusculus_gene_ensembl 10090 #rnorvegicus_gene_ensembl 10116 db_name = self.get_dsname_by_taxid(tax_id) if db_name is None: return pd.DataFrame() util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ucsc_files') file = SyncDB.DOWNLOAD_DIR() + "/ucsc_files/ucscid2gid_%s" % tax_id if not os.path.exists(file): con = db.get_con('UCSC') try: df = db.from_sql( con, "select name as source_id, value as gid from %s.knownToLocusLink" % db_name) except Exception as exp: return pd.DataFrame() df.to_csv(file, index=False) data = util.read_csv(file) data = data[data['gid'].notnull()] data = data[data['source_id'].notnull()] data[['gid']] = data[['gid']].astype(int) data['tax_id'] = tax_id return data
def get_summary_expression_data (self, id_map, id_type): #Tracer()() if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(),"proteinatlas.xml")): urllib.urlretrieve("http://www.proteinatlas.org/download/proteinatlas.xml.gz", os.path.join(SyncDB.DOWNLOAD_DIR(),"proteinatlas.xml.gz")) util.unix("gunzip " + os.path.join(SyncDB.DOWNLOAD_DIR(),"proteinatlas.xml.gz")) context = etree.iterparse(os.path.join(SyncDB.DOWNLOAD_DIR(),"proteinatlas.xml"), tag='entry') out = []; i = 0; for action, elem in context: i = i + 1; if elem.tag=='entry': row = self.process_gene_entry (elem); if row[id_type] in id_map: content = str(unicode(row["protein_expression_tissue"]).encode('utf-8')) content = content if content!='nan' else ''; for gid in id_map[row[id_type]]: out.append({'gid':gid, 'content':content, 'type_name': "Protein_Atlas_PROTEIN_EXPR_TISSUE" , 'annotation_field1':row['Gene']}); content = str(unicode(row["protein_expression_cancer"]).encode('utf-8')) content = content if content!='nan' else ''; for gid in id_map[row[id_type]]: out.append({'gid':gid, 'content':content, 'type_name': "Protein_Atlas_PROTEIN_EXPR_CANCER" , 'annotation_field1':row['Gene']}); # processing goes here pass elem.clear() while elem.getprevious() is not None: del elem.getparent()[0] if i%1000 == 0: print "processed " + str(i) + " genes"; return out;
def make_gid2taxid(self): taxidList = SyncDB.SUPPORTED_SPECIES for child in self.children: if type(child).__name__ == "Species": taxidList = child.supported_species break if not os.path.exists( os.path.join(SyncDB.DOWNLOAD_DIR(), "geneid2taxid.csv")): taxid_filter = "" if len(taxidList) != 0: taxid_filter = ['$1==\"' + t + '\"' for t in taxidList] taxid_filter = "if (" + "||".join(taxid_filter) + ")" # gene_id,tax_id cmd = "time zcat " + SyncDB.DOWNLOAD_DIR( ) + "/gene_info.gz | cut -f1,2 | sed 1d | awk 'BEGIN{FS=\"\\t\"; OFS=\"\\t\"}{" + taxid_filter + " print $2,$1;}' | sort -k1,1 -t $'\\t' >" + SyncDB.DOWNLOAD_DIR( ) + "/geneid2taxid.csv" print cmd util.unix(cmd) df = util.read_csv(SyncDB.DOWNLOAD_DIR() + "/geneid2taxid.csv", names=["gid", "tax_id"], sep=r'\t') self.gid2taxid = { str(df.ix[i, 'gid']): str(df.ix[i, 'tax_id']) for i in df.index }
def dump_all(self): print 'backup...' conn_info = db.get_con_info(self.CONNECTION_ID) if conn_info is None: print "cannot find connection for " + self.CONNECTION_ID exit() tables = [ 'annotation', 'annotation_type', 'gid2source_id', 'gid2terms', 'homologene', 'id_type', 'interaction', 'interaction_type', 'taxid2name', 'term', 'term_category', 'term2gids', 'term2term', 'statistics', ] cmd = [ 'mysqldump', '--host ' + conn_info["HOST"], '-u ' + conn_info["USR"], '-p' + conn_info["PWD"], self.DATABASE, ' '.join(tables), '>' + self.OUTPUT ] print cmd util.unix(cmd)
def ensembl_trans2gene_map_by_taxid(self, tax_id): if tax_id == 9606: db_name="hsapiens_gene_ensembl"; fname="ensembl_genes_human_trans2gid_map.csv"; elif tax_id == 10090: db_name="mmusculus_gene_ensembl"; fname="ensembl_genes_mouse_trans2gid_map.csv"; elif tax_id == 10116: db_name="rnorvegicus_gene_ensembl"; fname="ensembl_genes_rat_trans2gid_map.csv"; util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files'); ensembl_file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/%s"%fname; if not os.path.exists(ensembl_file ): cmd = 'wget -O ' + ensembl_file + ' \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\ encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\ formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default">\ <Attribute name = "ensembl_gene_id"/><Attribute name = "ensembl_transcript_id"/><Attribute name = "ensembl_peptide_id"/>\ </Dataset></Query>\''; util.unix(cmd); df=util.read_csv(ensembl_file, sep='\t', names=['Ensembl_Gene','Ensembl_Trans','Ensembl_Prot']); df=df.iloc[np.where(df["Ensembl_Trans"].notnull())[0]] df=df.drop_duplicates(); df["Tax_ID"]=tax_id; return df;
def get_ensembl2gid_map_ensembl(): if hasattr(GPUtils, 'ensembl2gid_map_ensembl'): return GPUtils.ensembl2gid_map_ensembl ensembl_file = path.join(SyncDB.DOWNLOAD_DIR(), "ensembl_genes_info.csv") if not os.path.exists(ensembl_file): cmd = 'wget -O ' + ensembl_file + ' - \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\ encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\ formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "hsapiens_gene_ensembl" interface = "default"\ ><Attribute name = "ensembl_gene_id"/><Attribute name = "entrezgene"/></Dataset></Query>\'' util.unix(cmd) ensembl_data = util.read_csv(ensembl_file, sep="\t", header=None, names=['ensembl_gene_id', 'gene_id']) ensembl_data = ensembl_data[ensembl_data['gene_id'].notnull()] ensembl_data[['gene_id']] = ensembl_data[['gene_id']].astype(int) GPUtils.ensembl2gid_map_ensembl = {} for i in ensembl_data.index: GPUtils.ensembl2gid_map_ensembl[ensembl_data.at[ i, 'ensembl_gene_id']] = GPUtils.ensembl2gid_map_ensembl.get( ensembl_data.at[i, 'ensembl_gene_id']) or [] GPUtils.ensembl2gid_map_ensembl[ensembl_data.at[ i, 'ensembl_gene_id']].append(ensembl_data.at[i, 'gene_id']) #Tracer()() return GPUtils.ensembl2gid_map_ensembl
def restore(self): print 'restoring...' if len(self.RESTORE_DB) == 0: print "You need to specify a database name to be restored using -r option." exit() if len(self.INPUT) == 0: print "You need to specify a .sql file name -i option." exit() conn_info = db.get_con_info(self.CONNECTION_ID) if conn_info is None: print "cannot find connection for " + self.CONNECTION_ID exit() cmd = [ 'mysql', '--host ' + conn_info["HOST"], '-u ' + conn_info["USR"], '-p' + conn_info["PWD"], '-e', '"create database {0}"'.format(self.RESTORE_DB), ] print ' '.join(cmd) util.unix(cmd) cmd = [ 'mysql', '--host ' + conn_info["HOST"], '-u ' + conn_info["USR"], '-p' + conn_info["PWD"], self.RESTORE_DB, '<' + self.INPUT ] print ' '.join(cmd) util.unix(cmd)
def __init__(self, xe=None): #return #DEBUG XmlClass.__init__(self,xe=xe) self.fn_dest =os.path.join(SyncDB.DOWNLOAD_DIR(),"isoform_idmap.csv") self.taxidList = SyncDB.SUPPORTED_SPECIES; for child in self.children: if type(child).__name__ == "Species": self.taxidList = child.supported_species break; util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files');
def sync_category_tables(self, dest_db): print 'synchronizing category tables of the gp_devel with the production gp.' con_prod = db.get_con_info("MYSQL01_RW") cmd = [ 'mysqldump', '--host ' + con_prod["HOST"], '-u ' + con_prod["USR"], '-p' + con_prod["PWD"], 'gp', ' '.join(['annotation_type', 'id_type', 'term_category']), '>category_tables.sql' ] print ' '.join(cmd) util.unix(cmd)
def process(self): if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(), "names.dmp")): urllib.urlretrieve( "ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip", os.path.join(SyncDB.DOWNLOAD_DIR(), "taxdmp.zip")) util.unix("unzip %s " % os.path.join(SyncDB.DOWNLOAD_DIR(), "taxdmp.zip")) df = util.read_csv( os.path.join(SyncDB.DOWNLOAD_DIR(), "names.dmp"), names=['tax_id', 'name', 'uname', 'type_name'], sep="\s*\|\s*", index_col=False).query( 'type_name in ["scientific name", "genbank common name"]') df = df.query('tax_id in [%s]' % ','.join(['"' + x + '"' for x in self.taxidList])) df = df.sort('tax_id') rows = [] last_taxid = None row = None for e in df.iterrows(): if e[1]['tax_id'] != last_taxid: if row is not None and 'scientific_name' in row: row['tax_id'] = last_taxid if 'common_name' not in row: row['common_name'] = row['scientific_name'] rows.append(row) row = {} last_taxid = e[1]['tax_id'] if 'scientific name' in e[1]['type_name'].strip(): row['scientific_name'] = e[1]['name'].strip() if 'genbank common name' in e[1]['type_name'].strip(): row['common_name'] = e[1]['name'].strip() if row is not None and 'scientific_name' in row: rows.append(row) if not os.path.exists(os.path.join(SyncDB.UPLOAD_DIR(), self.dest)): util.unix('mkdir -p %s' % os.path.join(SyncDB.UPLOAD_DIR(), self.dest)) pd.DataFrame(rows).to_csv( os.path.join(SyncDB.UPLOAD_DIR(), self.dest) + "/taxid2name.csv", index=False, header=True, columns=['tax_id', 'scientific_name', 'common_name'])
def do_update(self): self.prepare() if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(),"protein_atlas")): util.unix("mkdir " + os.path.join(SyncDB.DOWNLOAD_DIR(),"protein_atlas")) data = self.get_summary_expression_data(GPUtils.get_ensembl2gid_map(), "Ensembl") \ + self.get_protein_class_data(GPUtils.get_ensembl2gid_map(), "Ensembl") \ + self.get_subcellular_location() \ + self.get_rna_tissue_category(GPUtils.get_ensembl2gid_map(), "Ensembl") dff=pd.DataFrame(data); reload(sys) sys.setdefaultencoding('utf8') dff['content']=dff['content'].map(lambda x: unicode(x).encode('utf-8')) dff['content']=dff['content'].map(lambda x: x.replace('\r','').replace('\n','') if x!='nan' else '') dff['tax_id']='9606' dff.to_csv(self.fn_dest, index=False, sep=',');
def stat_js(rpath): import re con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE) #get last update date; dt = db.from_sql( con, "Select * FROM {0}.statistics order by history desc".format( SyncDB.DATABASE)) dt['history'] = dt['history'].apply(str) latest_build_date = max(dt['history']) #Tracer()() dt = dt.query("history=='" + latest_build_date + "'") jquery_str = "window.descriptionStatistics ={'latest_build':'" + latest_build_date + "','ds_counts':{\n" totals = [] for table in [['i_ann_', 'annotation'], ['i_gene_', 'gid2terms'], ['i_term_', 'term2gids'], ['i_interaction_', 'interaction']]: dt_type = dt.query("table_name=='" + table[1] + "'") for k, g in dt_type.groupby(by=['type_id']): total_in_one_type = g['total'].sum() totals.append("'" + table[0] + str(k) + "':" + str(total_in_one_type)) jquery_str += ',\n'.join(totals) + "}}" with open(rpath + "gp_description_stats.js", "w") as report_file: report_file.write(jquery_str) files = util.unix('cp ' + rpath + 'gp_description_stats.js ' + rpath + 'gp_description_stats_' + latest_build_date + '.js')
def biogrid(self, l_human_only=False): fn_source = os.path.join(SyncDB.DOWNLOAD_DIR(), "BIOGRID-ALL-3.4.134.tab2.txt") if not os.path.exists(fn_source): urllib.urlretrieve( "http://thebiogrid.org/downloads/archives/Release%20Archive/BIOGRID-3.4.134/BIOGRID-ALL-3.4.134.tab2.zip", os.path.join(SyncDB.DOWNLOAD_DIR(), "BIOGRID-ALL-3.4.134.tab2.zip")) cmd = "unzip " + os.path.join(SyncDB.DOWNLOAD_DIR(), "BIOGRID-ALL-3.4.134.tab2.zip" ) + " -d " + SyncDB.DOWNLOAD_DIR() print cmd util.unix(cmd) t = pd.read_table(fn_source, dtype=str) #print t.header() #['#BioGRID Interaction ID', 'Entrez Gene Interactor A', 'Entrez Gene Interactor B', 'BioGRID ID Interactor A', 'BioGRID ID Interactor B', 'Systematic Name Interactor A', 'Systematic Name Interactor B', 'Official Symbol Interactor A', 'Official Symbol Interactor B', 'Synonyms Interactor A', 'Synonyms Interactor B', 'Experimental System', 'Experimental System Type', 'Author', 'Pubmed ID', 'Organism Interactor A', 'Organism Interactor B', 'Throughput', 'Score', 'Modification', 'Phenotypes', 'Qualifications', 'Tags', 'Source Database'] print util.unique(t['Experimental System Type']) #t=t[(t['Organism Interactor A']=='9606') & (t['Organism Interactor B']=='9606')] t.rename2({ 'Entrez Gene Interactor A': 'gid_A', 'Entrez Gene Interactor B': 'gid_B', 'Experimental System Type': 'interaction_category', 'Experimental System': 'interaction_type', 'Pubmed ID': 'support', 'Source Database': 'ds', 'Organism Interactor A': 'tax_id_A', 'Organism Interactor B': 'tax_id_B', 'Score': 'score' }) #print t.header() t['interaction_type_id'] = 2 t = t[[ 'gid_A', 'gid_B', 'tax_id_A', 'tax_id_B', 'interaction_type_id', 'interaction_category', 'interaction_type', 'score', 'support', 'ds' ]] t = t[(t.gid_A != '-') & (t.gid_B != '-')] t['gid_A'] = t.gid_A.astype(int) t['gid_B'] = t.gid_B.astype(int) t['tax_id_A'] = t.tax_id_A.astype(int) t['tax_id_B'] = t.tax_id_B.astype(int) t['ds'] = 'BioGrid' t = t[(t.gid_A != t.gid_B) & (t.tax_id_A == t.tax_id_B) & (t.gid_A > 0) & (t.gid_B > 0)].copy() self.bio = t return self.bio
def __init__(self, xe=None): XmlClass.__init__(self, xe=xe) self.fn_dest_genego_annotations = os.path.join( SyncDB.DOWNLOAD_DIR(), "genego_annotations.csv") self.fn_dest_genego_terms = os.path.join(SyncDB.DOWNLOAD_DIR(), "genego_terms.csv") self.fn_dest_genego_term_gene_pair = os.path.join( SyncDB.DOWNLOAD_DIR(), "genego_term_gene_pair.csv") self.inputs = ['ds:genego'] self.taxidList = SyncDB.SUPPORTED_SPECIES for child in self.children: if type(child).__name__ == "Species": self.taxidList = child.supported_species break self.dir = SyncDB.DOWNLOAD_DIR() + '/genego_files' util.unix('mkdir -p ' + self.dir)
def optimize(s_file): if re.search(r'\.cdt$', s_file) is not None: s_file=re.sub(r'\.cdt$', '', s_file) S_cmd=[Clustering.BIN_OPT, "-m O -d P "+s_file, s_file+"Opt"] print(util.unix(" ".join(S_cmd))) #print S_cmd #Clustering._fix_missing(s_file+"Opt.cdt") Clustering._remove_temp_files(s_file)
def ensembl_idmapping_by_taxid(self, tax_id): if tax_id == 9606: db_name="hsapiens_gene_ensembl"; fname="ensembl_genes_human_idmap"; ucsc_attr='<Attribute name = "ucsc"/>' elif tax_id == 10090: db_name="mmusculus_gene_ensembl"; fname="ensembl_genes_mouse_idmap"; ucsc_attr='<Attribute name = "ucsc"/>' elif tax_id == 10116: db_name="rnorvegicus_gene_ensembl"; fname="ensembl_genes_rat_idmap"; ucsc_attr=''; ensembl_file = SyncDB.DOWNLOAD_DIR() + "/ensembl_files/%s"%fname; if not os.path.exists(ensembl_file + '_p1.csv'): cmd = 'wget -O ' + ensembl_file + '_p1.csv \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\ encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\ formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default">\ <Attribute name = "ensembl_gene_id"/><Attribute name = "ensembl_transcript_id"/><Attribute name = "ensembl_peptide_id"/><Attribute name = "refseq_mrna"/><Attribute name = "refseq_peptide"/>' + ucsc_attr + '</Dataset></Query>\''; util.unix(cmd); if not os.path.exists(ensembl_file + '_p2.csv'): cmd = 'wget -O ' + ensembl_file + '_p2.csv \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\ encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\ formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default">\ <Attribute name = "ensembl_transcript_id"/><Attribute name = "uniprot_swissprot"/><Attribute name = "entrezgene"/>\ </Dataset></Query>\''; util.unix(cmd); if ucsc_attr == '': df1=util.read_csv(ensembl_file + '_p1.csv', sep='\t', names=['Ensembl_Gene','Ensembl_Trans','Ensembl_Prot','Refseq_RNA','Refseq_Prot']); else: df1=util.read_csv(ensembl_file + '_p1.csv', sep='\t', names=['Ensembl_Gene','Ensembl_Trans','Ensembl_Prot','Refseq_RNA','Refseq_Prot','UCSC_ID']); df1["UCSC_ID"] = '-'; df2=util.read_csv(ensembl_file + '_p2.csv', sep='\t', names=['Ensembl_Trans','Uniprot_ID','GeneID']).drop_duplicates(subset='Ensembl_Trans', take_last=True); df = pd.merge(df1, df2, on='Ensembl_Trans', how='left'); df=df.iloc[np.where(df["GeneID"].notnull())[0]] df["Tax_ID"]=tax_id; df["Source"] = "Ensembl"; return df.drop_duplicates();
def get_gene2go(self): terms = util.read_csv(self.fn_dest_go_term, sep=r',', skiprows = 1).values #Tracer()() term_type_index = {terms[i][0]:terms[i][2] for i in range(len(terms))} term_description_index = {terms[i][0]:terms[i][1] for i in range(len(terms))} fn_source =os.path.join(SyncDB.DOWNLOAD_DIR(),"gene2go.gz") urllib.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz",fn_source) taxidList = SyncDB.SUPPORTED_SPECIES; for child in self.children: if type(child).__name__ == "Species": taxidList = child.supported_species break; taxid_filter = ""; if len(taxidList) !=0: taxid_filter = ['$1=="'+ t + '"' for t in taxidList] taxid_filter = "if ("+"||".join(taxid_filter) + ") " #cmd = "zegrep '^9606' %s |cut -f2,3,1,8,6 > %s" % ( fn_source, self.fn_gene2go_org) cmd = ('zcat %s | awk \'BEGIN {FS="\t"}{' + taxid_filter + ' print $0;}\' |cut -f2,3,1,8,6 > %s') % ( fn_source, self.fn_gene2go_org) print 'aaaaaaaaaaaaa' print cmd print 'aaaaaaaaaaaaa' util.unix(cmd) gene2go = util.read_csv(self.fn_gene2go_org, sep=r'\t', names=['tax_id','gene_id','term_id','type', 'description']) extended_terms = []; for k,g in gene2go.groupby('gene_id', as_index=False): extended_term_ids = []; #map gene_id to all the ancestors of t. self.extend_go_term(t) returns list of ancestors of t which includes t itself. for t in g['term_id']: extended_term_ids += self.extend_go_term(t); extended_term_ids = np.unique(extended_term_ids); taxId = g['tax_id'].values[0]; extended_terms +=[[taxId,k, t, term_type_index[t] if t in term_type_index else '', term_description_index[t] if t in term_description_index else ''] for t in extended_term_ids]; extended_terms_df = pd.DataFrame(extended_terms); extended_terms_df.columns = gene2go.columns; extended_terms_df.to_csv(self.fn_source_by_tax, index=False, sep='\t', header=False)
def dump_data_tables(self): conn_info = db.get_con_info(self.CONNECTION_ID) if conn_info is None: print "cannot find connection for " + self.CONNECTION_ID exit() tables = [ 'annotation', 'gid2source_id', 'gid2terms', 'homologene', 'term', 'term2gids', 'term2term' ] cmd = [ 'mysqldump', '--host ' + conn_info["HOST"], '-u ' + conn_info["USR"], '-p' + conn_info["PWD"], self.DATABASE, ' '.join(tables), '>' + self.OUTPUT ] util.unix(cmd) print " ".join(cmd)
def __init__(self, xe=None): XmlClass.__init__(self, xe=xe) EnsemblDownload.set_ENSEMBL_VERSION() self.tag = "EnsemblDownload" self.fn_annotations = os.path.join(SyncDB.DOWNLOAD_DIR(), "ensembl_annotations.csv") #self.fn_phenotype=os.path.join(SyncDB.DOWNLOAD_DIR(),"ensembl_phenotype_annotations.csv") self.fn_ensembl_id_map = os.path.join(SyncDB.DOWNLOAD_DIR(), "ensembl_id_map.csv") self.taxidList = SyncDB.SUPPORTED_SPECIES for child in self.children: if type(child).__name__ == "Species": self.taxidList = child.supported_species break self.chrList = [str(x + 1) for x in range(22)] self.chrList += ['X', 'Y', 'MT'] util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files') self.mart_ftp = "ftp://ftp.ensembl.org/pub/release-84/mysql/ensembl_mart_84/"
def uniprot_idmapping_data(self): if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(),"idmapping_selected.tab.gz")): urllib.urlretrieve("ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz", os.path.join(SyncDB.DOWNLOAD_DIR(),"idmapping_selected.tab.gz")) print "idmapping_selected.tab.gz file downloaded..."; else: print "idmapping_selected.tab.gz exists. start processing..."; #keep the lines that contain tax_id and gene_id from the file. if not os.path.exists(os.path.join(SyncDB.DOWNLOAD_DIR(),"uniprot_isoform_idmapping.dat")): print "extracting rows of desired species with awk command...." unix_cmd = "zcat " + os.path.join(SyncDB.DOWNLOAD_DIR(),"idmapping_selected.tab.gz") + " | awk -F'\t' '{if ($13 == \"9606\" || $13==\"10090\" || $13==\"10116\") print $1,$3,$4,$13,$19,$20,$21}' > " + os.path.join(SyncDB.DOWNLOAD_DIR(),"uniprot_isoform_idmapping.dat"); print unix_cmd util.unix(unix_cmd) print "extracting rows of desired species with grep command is done." df = util.read_csv(os.path.join(SyncDB.DOWNLOAD_DIR(),"uniprot_isoform_idmapping.dat"), sep="\t", header=None, names=["Uniprot_ID","GeneID","Refseq_Prot","Tax_ID","Ensembl_Gene","Ensembl_Trans","Ensembl_Prot"]); df["Source"] = "Uniprot"; df=df.iloc[np.where(df["GeneID"].notnull())[0]] print "Uniprot ID mapping data processed..." return df;
def create_ucsc_idmapping(self): if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "ucsc_refseq_to_id_map.csv")): cmd = 'mysql --host=genome-mysql.cse.ucsc.edu --user=genomep --password=password --database=hg38 -e "SELECT refLink.mrnaAcc as Refseq_RNA, kgXref.kgID as UCSC_ID from refLink, kgXref WHERE kgXref.refseq=refLink.mrnaAcc" > ' + path.join(SyncDB.DOWNLOAD_DIR(), "ucsc_refseq_to_id_map.csv"); util.unix(cmd); if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), "ucsc_id_to_ensemblid_map.csv")): cmd = 'mysql --host=genome-mysql.cse.ucsc.edu --user=genomep --password=password --database=hg38 -e "SELECT knownToEnsembl.name as UCSC_ID, knownToEnsembl.value as Ensembl_Trans from knownToEnsembl" > ' + path.join(SyncDB.DOWNLOAD_DIR(), "ucsc_id_to_ensemblid_map.csv"); util.unix(cmd); df=util.read_csv(SyncDB.DOWNLOAD_DIR()+'/ucsc_id_to_ensemblid_map.csv', sep='\t'); global ensemblTrans2ucscID; ensemblTrans2ucscID = {} for index, el in df.iterrows(): ensemblTrans2ucscID[el["Ensembl_Trans"]] = el["UCSC_ID"] df=util.read_csv(SyncDB.DOWNLOAD_DIR()+'/ucsc_refseq_to_id_map.csv', sep='\t'); global refseq2ucscID; refseq2ucscID = {} for index, el in df.iterrows(): refseq2ucscID[el["Refseq_RNA"]] = el["UCSC_ID"]
def ucsc_idmapping_data_by_taxid(self, tax_id): if tax_id == 9606: db_name="hg38"; fname="human_ucsc_isoform_map.csv"; elif tax_id == 10090: db_name="mm10"; fname="mouse_ucsc_isoform_map.csv"; else: print "species not supported"; return None; if not os.path.exists(path.join(SyncDB.DOWNLOAD_DIR(), fname)): cmd = 'mysql --host=genome-mysql.cse.ucsc.edu --user=genomep --password=password --database=' + db_name + ' -e "select a.*, refLink.protAcc as Refseq_Prot, knownToEnsembl.value as Ensembl_Trans from (SELECT kgXref.refseq as Refseq_RNA, knownToLocusLink.value as GeneID, knownGene.name as UCSC_ID, knownGene.proteinID as Uniprot_ID from kgXref,knownGene,knownToLocusLink WHERE knownToLocusLink.name = knownGene.name and kgXref.kgID=knownGene.name) a left join refLink on refLink.mrnaAcc = a.Refseq_RNA left join knownToEnsembl on a.UCSC_ID=knownToEnsembl.name" > ' + path.join(SyncDB.DOWNLOAD_DIR(), fname); util.unix(cmd); print "UCSC Query is done..." df=util.read_csv(path.join(SyncDB.DOWNLOAD_DIR(), fname), sep='\t'); df["Tax_ID"]=tax_id; df=df.iloc[np.where(df["GeneID"].notnull())[0]].drop_duplicates(); print "UCSC ID mapping data processed..." return df;
def dump_schema(self): tables_with_data = [ 'annotation', 'annotation_type', 'gid2source_id', 'gid2terms', 'homologene', 'id_type', 'interaction', 'interaction_type', 'server_status', 'session', 'statistics', 'taxid2name', 'term', 'term2gids', 'term2term', 'term_category' ] conn_info = db.get_con_info(self.CONNECTION_ID) if conn_info is None: print "cannot find connection for " + self.CONNECTION_ID exit() metascape_db_in_xxx = 'gp_metascape' #todo uncomment next line if len(tables_with_data): cmd = [ 'mysqldump', '--host ' + conn_info["HOST"], '-u ' + conn_info["USR"], '-p' + conn_info["PWD"], metascape_db_in_xxx, ' '.join(tables_with_data), '>' + self.OUTPUT + "_P2" ] print " ".join(cmd) util.unix(cmd) util.unix("cat " + self.OUTPUT + "_P2 >> " + self.OUTPUT) util.unix("rm " + self.OUTPUT + "_P2")
def get_ensembl2gid_df_web(self, tax_id, type): #mmusculus_gene_ensembl 10090 #rnorvegicus_gene_ensembl 10116 print "Get %s to gene id for %s" % (type, tax_id) attr = '<Attribute name = "' + type + '" />' db_name = self.get_dbname_by_taxid(tax_id) if db_name is None: return None fname = 'ensembl2gid_%s_%s' % (type, tax_id) util.unix('mkdir -p ' + SyncDB.DOWNLOAD_DIR() + '/ensembl_files') valid_files = [] for chr in self.chrList: ensembl_file = SyncDB.DOWNLOAD_DIR( ) + "/ensembl_files/%s_chr%s" % (fname, chr) print "downloading %s from %s for chr %s..." % (type, db_name, chr) if not os.path.exists(ensembl_file): cmd = 'wget -O ' + ensembl_file + ' \'http://www.ensembl.org/biomart/martservice?query=<?xml version="1.0"\ encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default"\ formatter = "TSV" header = "0" uniqueRows = "0" count = "" datasetConfigVersion = "0.6" ><Dataset name = "' + db_name + '" interface = "default"><Filter name="chromosome_name" value="' + chr + '" filter_list=""/>' + attr + '<Attribute name = "entrezgene"\ /></Dataset></Query>\'' util.unix(cmd) try: tdf = util.read_csv(ensembl_file, sep="\t", header=None, names=['source_id', 'gid'], nrows=1) except Exception as exp: tdf = pd.DataFrame() if len(tdf) != 0: valid_files.append(ensembl_file) if len(valid_files) == 0: return None cmd = 'cat %s >> %s' % (' '.join(valid_files), SyncDB.DOWNLOAD_DIR() + "/ensembl_files/" + fname) print cmd util.unix(cmd) print "downloading %s from %s done." % (type, db_name) ensembl_data = util.read_csv(SyncDB.DOWNLOAD_DIR() + "/ensembl_files/" + fname, sep="\t", header=None, names=['source_id', 'gid']) ensembl_data = ensembl_data[ensembl_data['gid'].notnull()] ensembl_data = ensembl_data[ensembl_data['source_id'].notnull()] ensembl_data[['gid']] = ensembl_data[['gid']].astype(int) ensembl_data['tax_id'] = tax_id ensembl_data['type_name'] = type return ensembl_data
def report_js(rpath): import re con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE) #get last update date; dt = db.from_sql( con, "Select * FROM {0}.statistics order by history desc".format( SyncDB.DATABASE)) dt['history'] = dt['history'].apply(str) cur_date = max(dt['history']) json_arr = [] for c in dt.columns: json_arr.append('"' + c + '":' + dt[c].to_json(orient='values')) with open(rpath + "gp_stats.js", "w") as report_file: report_file.write("window.buildLogStatistics=") report_file.write(dt.to_json(orient='records')) files = util.unix('cp ' + rpath + 'gp_stats.js ' + rpath + 'gp_stats_' + cur_date + '.js')
def get_go_term_and_go_term2term(self): fn_source =os.path.join(SyncDB.DOWNLOAD_DIR(),"go_daily-termdb-tables.tar.gz") dir = os.path.join(SyncDB.DOWNLOAD_DIR(),"go_daily-termdb-tables") urllib.urlretrieve("http://archive.geneontology.org/latest-termdb/go_daily-termdb-tables.tar.gz",fn_source) cmd = "tar -zxvf %s -C %s go_daily-termdb-tables/term.txt go_daily-termdb-tables/graph_path.txt go_daily-termdb-tables/term_definition.txt" % (fn_source,SyncDB.DOWNLOAD_DIR()) util.unix(cmd) fn_term = os.path.join(dir,"term.txt") df_term = util.read_csv(fn_term, sep=r'\t', skiprows = 1, names=['id','term_name','term_type','term_id','is_obsolete','is_root','is_relation']) df_term = df_term[(df_term.is_root == 0) & (df_term.is_relation == 0) ] df_term['term_type'] = df_term['term_type'].map({'biological_process':'BP', 'molecular_function':'MF', 'cellular_component':'CC'}) fn_term_description = os.path.join(dir,"term_definition.txt") #remove bad lines. util.unix("awk 'BEGIN{FS=\"\\t\"} {if(NF==5) print $0}' " + fn_term_description + " > " + fn_term_description + ".tmp") util.unix("mv " + fn_term_description + ".tmp " + fn_term_description) df_term_description = util.read_csv(fn_term_description, sep=r'\t', skiprows = 1, names=['id','description','dbxref_id','term_comment','reference'], usecols=['id','description']) df_term_final = pd.merge(df_term, df_term_description, on='id', how='left') df_term_final.drop(['id','is_obsolete','is_root','is_relation'],axis=1,inplace=True) df_term_final = df_term_final[df_term_final['term_id'].str.contains(r'^GO:')] df_term_final = df_term_final.reindex(columns=['term_id','term_name','term_type','description']) fn_gp = os.path.join(dir,"graph_path.txt") df_gp = util.read_csv(fn_gp, sep=r'\t', skiprows = 1, names=['tt_id','term1_id','term2_id','relationship_type_id','distance','relation_distance']) df_gp = pd.merge(df_gp,df_term,left_on='term1_id', right_on='id',how='left') util.rename2(df_gp, {'term_id':'parent_term_id'}) df_gp = df_gp.loc[:,['term2_id','distance','parent_term_id']] df_gp = pd.merge(df_gp,df_term,left_on='term2_id', right_on='id',how='left') df_gp = df_gp.loc[:,['parent_term_id','distance','term_id']] df_gp = df_gp[df_gp['parent_term_id'].str.contains(r'^GO:') & df_gp['term_id'].str.contains(r'^GO:')] df_gp.drop_duplicates(inplace=True) df_gp.to_csv(self.fn_dest_term2term,index=False) df_term_final.to_csv(self.fn_dest_go_term, index = False)
def report_html(rpath): import re con = db.get_con(SyncDB.CONNECTION_ID, db=SyncDB.DATABASE) #get last update date; dt = db.from_sql( con, "Select distinct history FROM {0}.statistics order by history desc" .format(SyncDB.DATABASE)) if (len(dt['history']) < 2): print "No previous build statistics found." return cur_date = str(dt['history'][0]) last_date = str(dt['history'][1]) query = """ select t.* from ( SELECT new.ds as data_source, new.table_name, new.type_name, new.total as size, (new.total - old.total) as growth, (new.total - old.total)/old.total *100 AS delta, 0 as new_missing FROM {2}.statistics old, {2}.statistics new WHERE old.history = '{0}' AND new.history = '{1}' AND old.table_name = new.table_name AND old.type_name = new.type_name UNION SELECT new.ds as data_source, new.table_name, new.type_name, new.total as size, new.total as growth, 101 AS delta, 1 as new_missing FROM {2}.statistics new left Join (select * from {2}.statistics where history = '{0}') old on old.table_name = new.table_name and old.type_name = new.type_name WHERE new.history = '{1}' AND old.table_name is NULL UNION SELECT old.ds as data_source, old.table_name, old.type_name, old.total as size, -old.total as growth, -101 AS delta, -1 as new_missing FROM {2}.statistics old left Join (select * from {2}.statistics where history = '{1}') new on old.table_name = new.table_name and old.type_name = new.type_name WHERE old.history = '{0}' AND old.table_name is NULL ) t ORDER BY data_source, table_name, delta DESC; """ query = query.format(last_date, cur_date, SyncDB.DATABASE) dt = db.from_sql(con, query) #Tracer()() missing_data = dt.query('new_missing < 0') new_data = dt.query('new_missing > 0') expanded_data = dt.query('growth > 0') reduced_data = dt.query('growth < 0') unchanged_data = dt.query('growth == 0 and new_missing==0') reports = [ r for r in sorted(os.listdir(rpath)) if "Report_" in r and r != "Report_" + cur_date + ".html" ] last_report = reports[len(reports) - 1] if len(reports) > 0 else None #import datetime #time_now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') html = '<!DOCTYPE html><html><head><title> GP Build Report ' + cur_date + '</title></head><body><h1> GP Build Report ' + cur_date + '</h1>' if last_report: html += '<span id="lastreportlink"><a href="' + last_report + '"><<</a></span>\n' html += '<span id="nextreportlink"></span>\n' html += '<div>' if (len(missing_data) > 0): html += '<h4> Missing Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th></tr>' for i in range(len(missing_data)): html += '<tr><td>' + missing_data.irow( i)['data_source'] + '</td><td>' + missing_data.irow( i)['table_name'] + '</td><td>' + missing_data.irow( i)['type_name'] + '</td><td>' + str( missing_data.irow(i)['growth']) + '</td></tr>' if (len(missing_data) > 0): html += '</table>' html += '</div><div>' if (len(new_data) > 0): html += '<h4> New Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th></tr>' for i in range(len(new_data)): html += '<tr><td>' + new_data.irow( i)['data_source'] + '</td><td>' + new_data.irow( i)['table_name'] + '</td><td>' + new_data.irow( i)['type_name'] + '</td><td>' + str( new_data.irow(i)['growth']) + '</td></tr>' if (len(new_data) > 0): html += '</table>' html += '</div><div>' if (len(expanded_data) > 0): html += '<h4> Expanded Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size Diff</th> <th>Ratio</th></tr>' for i in range(len(expanded_data)): html += '<tr><td>' + expanded_data.irow( i)['data_source'] + '</td><td>' + expanded_data.irow( i)['table_name'] + '</td><td>' + expanded_data.irow( i)['type_name'] + '</td><td>' + str( expanded_data.irow(i) ['growth']) + '</td><td>' + str( expanded_data.irow(i)['delta']) + '</td></tr>' if (len(expanded_data) > 0): html += '</table>' html += '</div><div>' if (len(reduced_data) > 0): html += '<h4> Reduced Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size Diff</th> <th>Ratio</th></tr>' for i in range(len(reduced_data)): html += '<tr><td>' + reduced_data.irow( i)['data_source'] + '</td><td>' + reduced_data.irow( i)['table_name'] + '</td><td>' + reduced_data.irow( i)['type_name'] + '</td><td>' + str( reduced_data.irow(i) ['growth']) + '</td><td>' + str( reduced_data.irow(i)['delta']) + '</td></tr>' if (len(reduced_data) > 0): html += '</table>' html += '</div><div>' if (len(unchanged_data) > 0): html += '<h4> Unchanged Data: </h4><table><tr><th>Data Source</th><th>Table Name</th><th>Type Name</th><th>Size</th> </tr>' for i in range(len(unchanged_data)): html += '<tr><td>' + unchanged_data.irow( i)['data_source'] + '</td><td>' + unchanged_data.irow( i)['table_name'] + '</td><td>' + unchanged_data.irow( i)['type_name'] + '</td><td>' + str( unchanged_data.irow(i)['size']) + '</td></tr>' if (len(unchanged_data) > 0): html += '</table>' html += '</div>' html += '</body></html>' with open(rpath + "Report.html", "w") as report_file: report_file.write(html) files = util.unix('cp ' + rpath + 'Report.html ' + rpath + 'Report_' + cur_date + '.html') if last_report: with open(rpath + last_report, "r") as last_report_file: last_html = last_report_file.read() last_html = re.sub( r"<span id=\"nextreportlink\">.*</span>", '<span id="nextreportlink"><a href="' + 'Report_' + cur_date + '.html">>></a></span>', last_html) with open(rpath + last_report, "w") as last_report_file: last_report_file.write(last_html)
def do_work(self): #get input objects print '***enter do_work', self.id #Tracer()(); if self.has_calculated is not None: print 'has calculated', self.id, 'return:', self.has_calculated return self.has_calculated print self.print_me() input_dirty = self.prepare_inputs() #check all the output is existing , len is not 0 output_dirty = any([self.is_dirty(f) for f in self.outputs]) print 'input, output dirty checking: ', input_dirty, output_dirty, 'in', self.id update = True if ((not input_dirty)\ and (not output_dirty)\ and not SyncDB.IS_REBUILD): update = False else: update = True if len(SyncDB.DEBUG_NODE_IDS): if self.id in SyncDB.DEBUG_NODE_IDS: update = True print 'input, output dirty checking: turn on by debug nodes' elif SyncDB.DEBUG_NODE_IDS_ONLY: update = False if not update: print '***do nothing,return from:', self.id r = False else: print '***calling do_update in ', self.id r = True if SyncDB.DONT_RECOVER: r = self.do_update() else: try: r = self.do_update() except Exception as exp: not_recovered = True import traceback err_msg = '!!!!!!!!!!!Error occurred while running %s (%s) \n (%s).' % ( self.id, exp, traceback.format_exc()) print err_msg SyncDB.ERROR_LOG.write(err_msg) if hasattr(SyncDB, 'RECOVERY_FOLDER') and os.path.exists( SyncDB.RECOVERY_FOLDER): #recover files from this folder for f in self.outputs: file = os.path.basename(f) if os.path.exists(SyncDB.RECOVERY_FOLDER + "/" + file): SyncDB.ERROR_LOG.write( '\tRecovered %s from folder %s\n' % (file, SyncDB.RECOVERY_FOLDER)) print '\tRecovered %s from folder %s\n' % ( file, SyncDB.RECOVERY_FOLDER) util.unix('cp ' + SyncDB.RECOVERY_FOLDER + "/" + file + ' ' + SyncDB.DOWNLOAD_DIR()) else: SyncDB.ERROR_LOG.write( '\tDid not recover %s from folder %s (not such file)\n' % (file, SyncDB.RECOVERY_FOLDER)) print '\tDid not recover %s from folder %s (not such file)\n' % ( file, SyncDB.RECOVERY_FOLDER) else: print '\tRecovery folder does not exist for %s' % (str( self.outputs)) SyncDB.ERROR_LOG.write( '\tRecovery folder does not exist\n') SyncDB.ERROR_LOG.write(str(self.outputs)) if r is None: r = True self.has_calculated = r return r
def create_idmapping(self): if not os.path.exists( os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_idmapping.dat.gz")): urllib.urlretrieve( "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping.dat.gz", os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_idmapping.dat.gz")) print "UniProt idmapping file downloaded..." else: print "uniprot_idmapping.dat.gz exists. start processing..." #keep the lines that contain tax_id and gene_id from the file. if not os.path.exists( os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_idmapping_cleaned.dat")): filter = '\|'.join(['NCBI_TaxID\\t' + x for x in self.taxidList]) print "extracting rows of desired species with grep command...." unix_cmd = "LC_ALL=C zgrep 'NCBI_TaxID\|GeneID' " + os.path.join(SyncDB.DOWNLOAD_DIR(),"uniprot_idmapping.dat.gz") \ + " | LC_ALL=C grep --no-group-separator -A 1 $'" + filter + "' > " \ + os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_idmapping_cleaned.dat") print unix_cmd util.unix(unix_cmd) print "extracting rows of desired species with grep command is done." df_itr = util.read_csv(os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_idmapping_cleaned.dat"), sep="\t", header=None, names=["uniprot_id", "type", "value"], iterator=True, chunksize=50000) data = [] chunks = 0 cur_uid = None cur_tax_id = None cur_gene_id = None for df in df_itr: for index, el in df.iterrows(): if cur_uid != el["uniprot_id"]: cur_uid = el["uniprot_id"] #discard previous tax_id and gene_id cur_tax_id = None cur_gene_id = None if el["type"] == "GeneID": cur_gene_id = el["value"] elif el["type"] == "NCBI_TaxID": cur_tax_id = el["value"] else: print "error in idmapping_clean.dat file. expecting 'GeneID' or 'TaxID', seen ", el[ "type"] if cur_gene_id is not None and cur_tax_id is not None: #gene_id and tax_id for the current uid is found if cur_tax_id in [int(x) for x in self.taxidList]: data.append({ "source_id": cur_uid, "gid": cur_gene_id, "tax_id": cur_tax_id }) #Tracer()() chunks += 1 print "processed ", chunks * 50000, " found ", len(data) del df #break; pd.DataFrame(data).to_csv(self.fn_dest_idmapping, index=False) print "UniProt idmapping file processed. ", len(data), " ids found"
def process_uniprot_xml(self): if not os.path.exists( os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_sprot.xml")): urllib.urlretrieve( "ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz", os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_sprot.xml.gz")) util.unix( "gunzip " + os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_sprot.xml.gz")) #http://stackoverflow.com/questions/7018326/lxml-iterparse-in-python-cant-handle-namespaces context = etree.iterparse(os.path.join(SyncDB.DOWNLOAD_DIR(), "uniprot_sprot.xml"), tag='{http://uniprot.org/uniprot}entry') out = [] i = 0 gid_not_found = [] multiple_gid = [] multiple_type = [] missing_type = [] missing_keyword = [] for action, elem in context: i = i + 1 if i % 10000 == 0: print "processed " + str(i) + " uniprot entries" if elem.tag == '{http://uniprot.org/uniprot}entry': row = self.process_uniprot_entry(elem) if row is None: #it's not a desired species. continue #Tracer()() if len(row['type']) == 0: if len(row['keyword']) != 0: #print "missing_type"; #Tracer()(); missing_type.append(row) continue if row['gid'] is None: gid_not_found.append(row) #print "gid not found"; #Tracer()(); continue if len(list(set(row['type']))) > 1: #print "multiple type"; #Tracer()(); multiple_type.append(row) if len(row['keyword']) == 0: #Tracer()(); missing_keyword.append(row) for t in list(set(row['type'])): content = "yes" if (t == "uniprot_transmembrane"): content = len(row['acc']) if (t == "uniprot_kinase_class"): content = ",".join(row['content']) out.append({ "gid": row["gid"], "type_name": t, "content": content, "annotation_field1": ",".join(row["acc"]), "tax_id": row["tax_id"] }) # processing goes here elem.clear() # second, delete previous siblings (records) while elem.getprevious() is not None: del elem.getparent()[0] df = pd.DataFrame(out) df.to_csv(self.fn_dest, index=False) print "Multiple type:", len(multiple_type) print "No GID:", len(gid_not_found) print "Missing type:", len(missing_type) print "Missing keyword:", len(missing_keyword) print "No GID:", gid_not_found return out