def testFakeDataSet1(self):
    # Feed one STRING interaction row (self.test_set_1) through
    # StringDB._process_protein_links and compare the resulting graph
    # with the expected gene-to-gene triple.
    # (Comments rather than a docstring, so unittest's reported test
    # description stays unchanged.)
    string_db = StringDB('rdf_graph', True)
    string_db.graph = RDFGraph(True)
    self.assertEqual(len(string_db.graph), 0)

    ensembl = Ensembl('rdf_graph', True)
    prot_map = ensembl.fetch_protein_gene_map('9606')
    # Wrap every gene id in a single-element list of ENSEMBL curies.
    # Only existing keys are reassigned, so mutating the dict while
    # iterating over it is safe.
    for protein, gene in prot_map.items():
        prot_map[protein] = ['ENSEMBL:' + gene]

    print("Finished fetching ENSP IDs, fetched {} proteins".format(len(prot_map)))

    dataframe = pd.DataFrame(data=self.test_set_1, columns=self.columns)
    string_db._process_protein_links(dataframe, prot_map, '9606')

    # g1 <interacts with> g2
    triples = """ ENSEMBL:ENSG00000001626 RO:0002434 ENSEMBL:ENSG00000004059 . """
    self.assertTrue(
        self.test_util.test_graph_equality(triples, string_db.graph))
def parse(self, limit=None):
    """
    Override Source.parse()

    For every taxon that has a 'protein_links' file, load the gzipped
    STRING links table, build a protein -> gene id map and pass both to
    self._process_protein_links().

    The map is read from the taxon's id map file (self.id_map_files)
    when one is configured, otherwise it is fetched from Ensembl.
    For taxon 9606 the map is additionally filled in from Ensembl for
    any protein that is not already present.

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    """
    if limit is not None:
        logger.info("Only parsing first %d rows", limit)

    protein_paths = self._get_file_paths(self.tax_ids, 'protein_links')

    for taxon in protein_paths:
        ensembl = Ensembl(self.graph_type, self.are_bnodes_skized)
        string_file_path = '/'.join(
            (self.rawdir, protein_paths[taxon]['file']))

        # 'with' guarantees the handle is closed even if parsing fails
        with gzip.open(string_file_path, 'rb') as fh:
            dataframe = pd.read_csv(fh, sep=r'\s+')

        p2gene_map = dict()

        if taxon in self.id_map_files:
            map_file = '/'.join(
                (self.rawdir, self.id_map_files[taxon]['file']))

            with open(map_file, 'r') as mfile_handle:
                if taxon == 9606:
                    # human map file: gene <tab> protein
                    for line in mfile_handle:
                        gene, prot = line.rstrip("\n").split("\t")
                        p2gene_map[prot.replace('9606.', '')] = \
                            "NCBIGene:{}".format(gene)
                else:
                    # other map files: protein <tab> gene, used verbatim
                    for line in mfile_handle:
                        prot, gene = line.rstrip("\n").split("\t")
                        p2gene_map[prot] = gene
        else:
            logger.info("Fetching ensembl proteins for taxon %s", taxon)
            p2gene_map = ensembl.fetch_protein_gene_map(taxon)
            # only existing keys are reassigned, safe while iterating
            for key in p2gene_map:
                p2gene_map[key] = "ENSEMBL:{}".format(p2gene_map[key])

        if taxon == 9606:
            # add Ensembl genes for proteins missing from the map
            temp_map = ensembl.fetch_protein_gene_map(taxon)
            for key in temp_map:
                if key not in p2gene_map:
                    p2gene_map[key] = "ENSEMBL:{}".format(temp_map[key])

        logger.info(
            "Finished fetching ENSP ID mappings, fetched %d proteins",
            len(p2gene_map))
        logger.info(
            "Fetching protein protein interactions for taxon %s", taxon)

        self._process_protein_links(dataframe, p2gene_map, taxon, limit)
def parse(self, limit=None):
    """
    Override Source.parse()

    For every taxon that has a 'protein_links' file, load the gzipped
    STRING links table, build a protein -> list-of-gene-curies map and
    pass both to self._process_protein_links().

    The map is read from the gzipped, tab separated id map file
    (self.id_map_files) when one is configured for the taxon,
    otherwise it is fetched from Ensembl.

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    Raises:
        SystemExit: (code -1) when the id map file does not start with
            the expected header line
    """
    if limit is not None:
        LOG.info("Only parsing first %d rows", limit)

    protein_paths = self._get_file_paths(self.tax_ids, 'protein_links')
    # column order of the id map file
    col = ['NCBI taxid', 'entrez', 'STRING']

    for taxon in protein_paths:
        ensembl = Ensembl(self.graph_type, self.are_bnodes_skized)
        string_file_path = '/'.join(
            (self.rawdir, protein_paths[taxon]['file']))

        with gzip.open(string_file_path, 'rb') as reader:
            dataframe = pd.read_csv(reader, sep=r'\s+')

        p2gene_map = dict()

        if taxon in self.id_map_files:
            LOG.info("Using string provided id_map files")
            map_file = '/'.join(
                (self.rawdir, self.id_map_files[taxon]['file']))

            with gzip.open(map_file, 'rt') as reader:
                line = next(reader).strip()
                if line != '# NCBI taxid / entrez / STRING':
                    LOG.error(
                        'Expected Headers:\t%s\nRecived Headers:\t%s\n',
                        col, line)
                    # Same effect as exit(-1), but does not depend on
                    # the helper the 'site' module injects for
                    # interactive sessions.
                    raise SystemExit(-1)

                # stream the rest instead of loading it all at once
                for line in reader:
                    row = line.rstrip('\n').split('\t')
                    # the 'NCBI taxid' column is not used
                    gene = row[col.index('entrez')].strip()
                    prot = row[col.index('STRING')].strip()
                    # one protein may list several '|' separated genes
                    genes = gene.split('|')
                    p2gene_map[prot.replace(taxon + '.', '')] = [
                        "NCBIGene:" + entrez_id for entrez_id in genes
                    ]
        else:
            LOG.info("Fetching ensembl proteins for taxon %s", taxon)
            p2gene_map = ensembl.fetch_protein_gene_map(taxon)
            # NOTE(review): assumes each value is a mutable list of
            # gene ids -- confirm against fetch_protein_gene_map
            for key in p2gene_map:
                for idx, gene in enumerate(p2gene_map[key]):
                    p2gene_map[key][idx] = "ENSEMBL:{}".format(gene)

        LOG.info(
            "Finished fetching ENSP ID mappings, fetched %i proteins",
            len(p2gene_map))
        LOG.info(
            "Fetching protein protein interactions for taxon %s", taxon)

        self._process_protein_links(dataframe, p2gene_map, taxon, limit)
def parse(self, limit=None):
    """
    Override Source.parse()

    Loads, per taxon, the gzipped STRING 'protein_links' table and a
    protein -> gene id map, then passes them on to
    self._process_protein_links().

    When the taxon has an entry in self.id_map_files the map is read
    from that tab separated file, otherwise it is fetched from Ensembl.
    Taxon 9606 is additionally topped up from Ensembl with proteins
    that are not yet in the map.

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    """
    if limit is not None:
        logger.info("Only parsing first %d rows", limit)

    protein_paths = self._get_file_paths(self.tax_ids, 'protein_links')

    for taxon in protein_paths:
        ensembl = Ensembl(self.graph_type, self.are_bnodes_skized)
        string_file_path = '/'.join((
            self.rawdir, protein_paths[taxon]['file']))

        # context manager closes the file even when read_csv raises
        with gzip.open(string_file_path, 'rb') as fh:
            dataframe = pd.read_csv(fh, sep=r'\s+')

        p2gene_map = dict()

        if taxon in self.id_map_files:
            map_file = '/'.join((
                self.rawdir, self.id_map_files[taxon]['file']))

            with open(map_file, 'r') as mfile_handle:
                # lazily split each line into its two columns
                rows = (
                    line.rstrip("\n").split("\t") for line in mfile_handle)
                if taxon == 9606:
                    # human file is ordered gene, protein
                    for gene, prot in rows:
                        p2gene_map[prot.replace('9606.', '')] = \
                            "NCBIGene:{}".format(gene)
                else:
                    # every other file is ordered protein, gene
                    for prot, gene in rows:
                        p2gene_map[prot] = gene
        else:
            logger.info("Fetching ensembl proteins for taxon %s", taxon)
            p2gene_map = ensembl.fetch_protein_gene_map(taxon)
            # values of existing keys are replaced in place
            for key in p2gene_map:
                p2gene_map[key] = "ENSEMBL:{}".format(p2gene_map[key])

        if taxon == 9606:
            # proteins absent from the map get their Ensembl gene
            temp_map = ensembl.fetch_protein_gene_map(taxon)
            for key in temp_map:
                if key not in p2gene_map:
                    p2gene_map[key] = "ENSEMBL:{}".format(temp_map[key])

        logger.info(
            "Finished fetching ENSP ID mappings, fetched %d proteins",
            len(p2gene_map))
        logger.info(
            "Fetching protein protein interactions for taxon %s", taxon)

        self._process_protein_links(dataframe, p2gene_map, taxon, limit)
def parse(self, limit=None):
    """
    Override Source.parse()

    Loads, per taxon, the gzipped STRING 'protein_links' table and a
    protein -> list-of-gene-curies map, then passes them on to
    self._process_protein_links().

    When the taxon has an entry in self.id_map_files the map is read
    from that gzipped, tab separated file (after validating its header
    line), otherwise it is fetched from Ensembl.

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    Raises:
        SystemExit: (code -1) when the id map file header is not the
            expected one
    """
    if limit is not None:
        LOG.info("Only parsing first %d rows", limit)

    protein_paths = self._get_file_paths(self.tax_ids, 'protein_links')
    # column order of the id map file; the lookups are loop invariant
    col = ['NCBI taxid', 'entrez', 'STRING']
    entrez_idx = col.index('entrez')
    string_idx = col.index('STRING')

    for taxon in protein_paths:
        ensembl = Ensembl(self.graph_type, self.are_bnodes_skized)
        string_file_path = '/'.join((
            self.rawdir, protein_paths[taxon]['file']))

        with gzip.open(string_file_path, 'rb') as reader:
            dataframe = pd.read_csv(reader, sep=r'\s+')

        p2gene_map = dict()

        if taxon in self.id_map_files:
            LOG.info("Using string provided id_map files")
            map_file = '/'.join(
                (self.rawdir, self.id_map_files[taxon]['file']))

            with gzip.open(map_file, 'rt') as reader:
                line = next(reader).strip()
                if line != '# NCBI taxid / entrez / STRING':
                    LOG.error(
                        'Expected Headers:\t%s\nRecived Headers:\t%s\n',
                        col, line)
                    # exit() is an interactive helper added by 'site';
                    # raising SystemExit directly always works
                    raise SystemExit(-1)

                # iterate the reader instead of readlines() so the
                # file is never held in memory as a whole
                for line in reader:
                    row = line.rstrip('\n').split('\t')
                    gene = row[entrez_idx].strip()
                    prot = row[string_idx].strip()
                    # several genes may be given, separated by '|'
                    p2gene_map[prot.replace(taxon + '.', '')] = [
                        "NCBIGene:" + entrez_id
                        for entrez_id in gene.split('|')]
        else:
            LOG.info("Fetching ensembl proteins for taxon %s", taxon)
            p2gene_map = ensembl.fetch_protein_gene_map(taxon)
            # NOTE(review): presumes the values are lists of gene ids;
            # verify against Ensembl.fetch_protein_gene_map
            for key in p2gene_map:
                for idx, gene in enumerate(p2gene_map[key]):
                    p2gene_map[key][idx] = "ENSEMBL:{}".format(gene)

        LOG.info(
            "Finished fetching ENSP ID mappings, fetched %i proteins",
            len(p2gene_map))
        LOG.info(
            "Fetching protein protein interactions for taxon %s", taxon)

        self._process_protein_links(dataframe, p2gene_map, taxon, limit)
def parse(self, limit=None):
    """
    Override Source.parse()

    For every taxon that has a 'protein_links' file, load the gzipped
    STRING links table, build a protein -> list-of-gene-curies map and
    pass both to self._process_protein_links().

    The map is read from the gzipped, tab separated id map file
    (self.id_map_files) when one is configured for the taxon,
    otherwise it is fetched from Ensembl.

    Args:
        :param limit (int, optional) limit the number of rows processed
    Returns:
        :return None
    """
    if limit is not None:
        LOG.info("Only parsing first %d rows", limit)

    protein_paths = self._get_file_paths(self.tax_ids, 'protein_links')
    # column order of the id map file
    col = ['NCBI taxid', 'entrez', 'STRING']

    for taxon in protein_paths:
        ensembl = Ensembl(self.graph_type, self.are_bnodes_skized)
        string_file_path = '/'.join(
            (self.rawdir, protein_paths[taxon]['file']))
        p2gene_map = dict()

        with gzip.open(string_file_path, 'rb') as reader:
            dataframe = pd.read_csv(reader, sep=r'\s+')

        if taxon in self.id_map_files:
            LOG.info("Using string provided id_map files")
            map_file = '/'.join(
                (self.rawdir, self.id_map_files[taxon]['file']))

            with gzip.open(map_file, 'rt') as reader:
                # Header looks like '# NCBI taxid / entrez / STRING'.
                # Strip surrounding whitespace first so the last column
                # name survives whatever the line terminator is, then
                # drop the leading '# '.
                row = next(reader).strip()[2:].split(' / ')
                # The outcome is deliberately ignored: a header
                # mismatch does not stop the ingest.
                self.check_fileheader(col, row)

                # stream the rest instead of loading it all at once
                for line in reader:
                    row = line.rstrip('\n').split('\t')
                    # the 'NCBI taxid' column is not used
                    gene = row[col.index('entrez')].strip()
                    prot = row[col.index('STRING')].strip()
                    # one protein may list several '|' separated genes
                    genes = gene.split('|')
                    p2gene_map[prot.replace(taxon + '.', '')] = [
                        "NCBIGene:" + entrez_id for entrez_id in genes
                    ]
        else:
            LOG.info(
                "Fetching ensembl protein_gene dict for NCBITaxon:%s",
                taxon)
            p2gene_map = ensembl.fetch_protein_gene_map(taxon)
            # wrap each gene id in a one element list of ENSEMBL curies
            p2gene_map.update(
                {k: ['ENSEMBL:' + p2gene_map[k]] for k in p2gene_map})

        LOG.info(
            "Finished fetching ENSP ID mappings, fetched %i proteins",
            len(p2gene_map))
        LOG.info(
            "Fetching protein protein interactions for taxon %s", taxon)

        self._process_protein_links(dataframe, p2gene_map, taxon, limit)
class EnsemblTestCase(SourceTestCase):
    """SourceTestCase specialisation that runs against an Ensembl source."""

    def setUp(self):
        """Create the Ensembl source and switch it to test-only mode."""
        self.source = Ensembl('rdf_graph', True)
        ensembl_source = self.source
        # restrict the run to the source's own 'gene' test ids
        ensembl_source.test_ids = ensembl_source.all_test_ids['gene']
        ensembl_source.settestonly(True)
        self._setDirToSource()

    def tearDown(self):
        """Drop the reference to the source."""
        self.source = None
class EnsemblTestCase(SourceTestCase):
    """SourceTestCase specialisation that runs against an Ensembl source."""

    def setUp(self):
        """Create the Ensembl source and switch it to test-only mode."""
        self.source = Ensembl()
        # the 'gene' test ids come from the test configuration
        gene_test_ids = self._get_conf()['test_ids']['gene']
        self.source.test_ids = gene_test_ids
        self.source.settestonly(True)
        self._setDirToSource()

    def tearDown(self):
        """Drop the reference to the source."""
        self.source = None
def setUp(self):
    """Build the shared fixtures: test utils, two one-row STRING
    data sets, their column names and the Ensembl protein map."""
    self.test_util = TestUtils()

    # Score columns shared by both fixture rows, in the order of
    # 'neighborhood' .. 'combined_score' in self.columns below
    scores = [0, 0, 0, 0, 300, 0, 150, 800]

    # Test set with two proteins from same species
    self.test_set_1 = [
        ['9606.ENSP00000000233', '9606.ENSP00000003084'] + scores]

    # Test set with deprecated protein id
    self.test_set_2 = [
        ['9606.ENSP00000000233', '9606.ENSP00000006101'] + scores]

    self.columns = [
        'protein1',
        'protein2',
        'neighborhood',
        'fusion',
        'cooccurence',
        'coexpression',
        'experimental',
        'database',
        'textmining',
        'combined_score',
    ]

    self.protein_list = Ensembl(
        'rdf_graph', True).fetch_protein_gene_map('9606')
def testFakeDataSet1(self):
    # Push the single row of self.test_set_1 through
    # StringDB._process_protein_links and compare the graph with the
    # expected gene-to-gene triple.
    taxon = 9606

    string_db = StringDB('rdf_graph', True)
    string_db.graph = RDFGraph(True)
    self.assertEqual(len(string_db.graph), 0)

    prot_map = Ensembl('rdf_graph', True).fetch_protein_gene_map(taxon)
    # turn every gene id into an ENSEMBL curie, in place
    for protein, gene in prot_map.items():
        prot_map[protein] = "ENSEMBL:{}".format(gene)

    print("Finished fetching ENSP IDs, fetched {} proteins".format(
        len(prot_map)))

    links = pd.DataFrame(data=self.test_set_1, columns=self.columns)
    string_db._process_protein_links(links, prot_map, taxon)

    expected_triples = """ ENSEMBL:ENSG00000001626 RO:0002434 ENSEMBL:ENSG00000004059 . """
    graphs_match = self.test_util.test_graph_equality(
        expected_triples, string_db.graph)
    self.assertTrue(graphs_match)
def testFakeDataSet1(self):
    # One fake STRING row goes in; the graph built by
    # StringDB._process_protein_links is checked against the expected
    # triple.
    string_db = StringDB('rdf_graph', True)
    string_db.graph = RDFGraph(True)
    self.assertEqual(len(string_db.graph), 0)

    ensembl = Ensembl('rdf_graph', True)
    prot_map = ensembl.fetch_protein_gene_map(9606)
    # prefix each gene id so the values become ENSEMBL curies
    for ensp_id in prot_map:
        gene_id = prot_map[ensp_id]
        prot_map[ensp_id] = "ENSEMBL:{}".format(gene_id)

    message = "Finished fetching ENSP IDs, fetched {} proteins"
    print(message.format(len(prot_map)))

    dataframe = pd.DataFrame(columns=self.columns, data=self.test_set_1)
    string_db._process_protein_links(dataframe, prot_map, 9606)

    triples = """ ENSEMBL:ENSG00000001626 RO:0002434 ENSEMBL:ENSG00000004059 . """
    self.assertTrue(
        self.test_util.test_graph_equality(triples, string_db.graph))
def testFakeDataSet1(self):
    # One fake STRING row goes in; the resulting graph is queried with
    # SPARQL for the subject that interacts with ENSG00000004059.
    taxon = 9606

    string_db = StringDB('rdf_graph', True)
    string_db.graph.bind_all_namespaces()

    prot_map = Ensembl('rdf_graph', True).fetch_protein_gene_map(taxon)
    # turn every gene id into an ENSEMBL curie, in place
    for protein, gene in prot_map.items():
        prot_map[protein] = "ENSEMBL:{}".format(gene)

    print("Finished fetching ENSP IDs, fetched {} proteins".format(
        len(prot_map)))

    links = pd.DataFrame(data=self.test_set_1, columns=self.columns)
    string_db._process_protein_links(links, prot_map, taxon)

    sparql_query = """ SELECT ?prot WHERE { ?prot RO:0002434 ENSEMBL:ENSG00000004059 . } """
    results = list(string_db.graph.query(sparql_query))

    # a single result row holding the interacting gene's node
    gene_node = string_db.graph._getNode("ENSEMBL:ENSG00000001626")
    expected = [(URIRef(gene_node),)]
    self.assertEqual(results, expected)
def main():
    """
    Build protein -> gene id map files for the STRING proteins of
    mouse, fly, worm and zebrafish.

    Command line: --config/-c (required), --out/-o (default "./"),
    --use_cache/-cached (skip the downloads).

    Each taxon gets a tab separated "protein, gene id" file named by
    its 'output_file' config entry; proteins that could not be
    resolved are written to unmapped_ids.tsv in the output directory.
    Proteins known to Ensembl's protein/gene map are always written as
    ENSEMBL curies; the remaining ones are handled per taxon:

    Zebrafish:
        1. Map ENSP to ZFIN Ids using Intermine
        2. Map deprecated ENSP IDs to ensembl genes
           by querying the ensembl database then use
           intermine to resolve to gene IDs
    Mouse: Map deprecated ENSP IDs to ensembl genes
           by querying the ensembl database then use
           intermine to resolve to MGI IDs
    Fly: ENSP IDs appear as xrefs on translation IDs
    Worm: Use UniProt Mapping file provided by String
    """
    parser = argparse.ArgumentParser(usage=__doc__)
    parser.add_argument(
        '--config', '-c', required=True, help='JSON configuration file')
    parser.add_argument(
        '--out', '-o', required=False, help='output directory',
        default="./")
    parser.add_argument(
        '--use_cache', '-cached', action="store_true", required=False,
        help='use cached files', default=False)
    args = parser.parse_args()

    # Hardcoded dir for raw files
    out_path = Path(args.out)
    raw_dir = out_path / "out"
    raw_dir.mkdir(parents=True, exist_ok=True)

    # Hardcoded STRING release
    VERSION = 'v10.5'
    STRING_BASE = (
        "http://string-db.org/download/"
        "protein.links.detailed.{}").format(VERSION)

    def fetch_links(taxon):
        """Return the local path of the taxon's STRING links dump,
        downloading it first unless --use_cache was given."""
        file_name = '{}.protein.links.detailed.{}.txt.gz'.format(
            taxon, VERSION)
        dump_file = raw_dir / file_name
        if not args.use_cache:
            download_file('{}/{}'.format(STRING_BASE, file_name), dump_file)
        return dump_file

    def read_proteins(dump_file):
        """Return the unique protein ids of a gzipped links dump."""
        # str(): gzip.open only accepts Path objects from Python 3.6 on
        with gzip.open(str(dump_file), 'rb') as handle:
            frame = pd.read_csv(handle, sep=r'\s+')
        proteins = pd.unique(frame[['protein1', 'protein2']].values.ravel())
        logger.info("Processing %d proteins", len(proteins))
        return proteins

    def write_if_known(handle, p2gene_map, prot):
        """Write prot with its ENSEMBL curie when p2gene_map knows it;
        return whether a line was written."""
        try:
            ens_gene = p2gene_map[prot]
        except KeyError:
            return False
        handle.write("{}\t{}\n".format(prot, "ENSEMBL:{}".format(ens_gene)))
        return True

    with open(args.config, 'r') as config_file:
        # safe_load: yaml.load() without an explicit Loader can build
        # arbitrary Python objects and is rejected by PyYAML >= 6
        config = yaml.safe_load(config_file)

    with (out_path / "unmapped_ids.tsv").open("w") as unmapped_file:
        # Connect to ensembl
        connection = connect_to_database(
            host=config['database']['host'],
            username=config['database']['username'],
            port=config['database']['port'])
        try:
            cursor = connection.cursor()
            taxa = config['taxa_specific']

            # Process MGI eqs #
            ####################
            mouse = taxa['mouse']
            taxon = mouse['tax_id']
            with (out_path / mouse['output_file']).open('w') as mouse_file:
                dump_file = fetch_links(taxon)
                ensembl = Ensembl("rdf_graph", True)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)

                for protein in read_proteins(dump_file):
                    prot = protein.replace('{}.'.format(taxon), '')
                    if write_if_known(mouse_file, p2gene_map, prot):
                        continue

                    ens_gene = get_deprecated_protein_gene_rel(
                        cursor, prot, mouse['ensembl'], config)
                    intermine_resp = query_mousemine(
                        mouse['intermine'], ens_gene)
                    if intermine_resp.is_successful:
                        mouse_file.write("{}\t{}\n".format(
                            prot, intermine_resp.gene_id))
                    else:
                        unmapped_file.write("{}\t{}\t{}\n".format(
                            prot, ens_gene, taxon))

            # Process Fly eqs #
            ####################
            fly = taxa['fly']
            taxon = fly['tax_id']
            with (out_path / fly['output_file']).open('w') as fly_file:
                dump_file = fetch_links(taxon)
                ensembl = Ensembl("rdf_graph", True)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)

                for protein in read_proteins(dump_file):
                    prot = protein.replace('{}.'.format(taxon), '')
                    if write_if_known(fly_file, p2gene_map, prot):
                        continue

                    ens_gene = get_xref_protein_gene_rel(
                        cursor, prot, fly['ensembl'], config, taxon)
                    if ens_gene is not None:
                        fly_file.write("{}\t{}\n".format(
                            prot, "ENSEMBL:{}".format(ens_gene)))
                    else:
                        unmapped_file.write("{}\t{}\t{}\n".format(
                            prot, '', taxon))

            # Process Worm eqs #
            ####################
            worm = taxa['worm']
            taxon = worm['tax_id']
            uniprot_file = raw_dir / worm['uniprot_file']
            with (out_path / worm['output_file']).open('w') as worm_file:
                dump_file = fetch_links(taxon)
                if not args.use_cache:
                    download_file(worm['uniprot_mappings'], uniprot_file)

                ensembl = Ensembl("rdf_graph", True)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)
                uni2gene_map = ensembl.fetch_uniprot_gene_map(taxon)

                with gzip.open(str(uniprot_file), 'rb') as handle:
                    uniprot_df = pd.read_csv(handle, sep=r'\s+')

                # STRING id -> UniProt accession (text before the '|')
                string_uniprot_map = {}
                for _, row in uniprot_df.iterrows():
                    uniprot_ac = row['uniprot_ac|uniprot_id'].split('|')[0]
                    string_uniprot_map[row['string_id']] = uniprot_ac

                for protein in read_proteins(dump_file):
                    prot = protein.replace('{}.'.format(taxon), '')
                    if write_if_known(worm_file, p2gene_map, prot):
                        continue

                    # second chance: STRING id -> UniProt -> gene
                    try:
                        ens_gene = uni2gene_map[string_uniprot_map[prot]]
                    except KeyError:
                        unmapped_file.write("{}\t{}\t{}\n".format(
                            prot, '', taxon))
                    else:
                        worm_file.write("{}\t{}\n".format(
                            prot, "ENSEMBL:{}".format(ens_gene)))

            # Process ZFIN eqs #
            ####################
            fish = taxa['zebrafish']
            taxon = fish['tax_id']
            with (out_path / fish['output_file']).open('w') as zfin_file:
                dump_file = fetch_links(taxon)
                ensembl = Ensembl("rdf_graph", True)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)

                for protein in read_proteins(dump_file):
                    prot = protein.replace('{}.'.format(taxon), '')
                    if write_if_known(zfin_file, p2gene_map, prot):
                        continue

                    # 1. ask intermine about the protein itself
                    intermine_resp = query_fishmine(fish['intermine'], prot)
                    if intermine_resp.is_successful:
                        zfin_file.write("{}\t{}\n".format(
                            prot, intermine_resp.gene_id))
                        continue

                    # 2. resolve the deprecated protein to a gene and
                    #    ask intermine about that gene
                    ens_gene = get_deprecated_protein_gene_rel(
                        cursor, prot, fish['ensembl'], config)
                    intermine_resp = query_fishmine(
                        fish['intermine'], ens_gene)
                    if intermine_resp.is_successful:
                        zfin_file.write("{}\t{}\n".format(
                            prot, intermine_resp.gene_id))
                        continue

                    # 3. last attempt: look the gene up as a pseudogene
                    intermine_resp = query_fishmine(
                        fish['intermine'], ens_gene, "Pseudogene")
                    if intermine_resp.is_successful:
                        zfin_file.write("{}\t{}\n".format(
                            prot, intermine_resp.gene_id))
                    else:
                        unmapped_file.write("{}\t{}\t{}\n".format(
                            prot, ens_gene, taxon))
        finally:
            # always release the database connection
            connection.close()

    logger.info("ID Map Finished")
def main():
    """
    Build protein -> gene id map files for the STRING proteins of
    mouse, fly, worm and zebrafish.

    Command line: --config/-c (required), --out/-o (default "./"),
    --use_cache/-cached (skip the downloads).

    Each taxon gets a tab separated "protein, gene id" file named by
    its 'output_file' config entry; proteins that could not be
    resolved are written to unmapped_ids.tsv in the output directory.
    Proteins known to Ensembl's protein/gene map are always written as
    ENSEMBL curies; the remaining ones are handled per taxon:

    Zebrafish:
        1. Map ENSP to ZFIN Ids using Intermine
        2. Map deprecated ENSP IDs to ensembl genes
           by querying the ensembl database then use
           intermine to resolve to gene IDs
    Mouse: Map deprecated ENSP IDs to ensembl genes
           by querying the ensembl database then use
           intermine to resolve to MGI IDs
    Fly: ENSP IDs appear as xrefs on translation IDs
    Worm: Use UniProt Mapping file provided by String
    """
    parser = argparse.ArgumentParser(usage=__doc__)
    parser.add_argument(
        '--config', '-c', required=True, help='JSON configuration file')
    parser.add_argument(
        '--out', '-o', required=False, help='output directory',
        default="./")
    parser.add_argument(
        '--use_cache', '-cached', action="store_true", required=False,
        help='use cached files', default=False)
    args = parser.parse_args()

    # Hardcoded dir for raw files
    out_path = Path(args.out)
    raw_dir = out_path / "out"
    raw_dir.mkdir(parents=True, exist_ok=True)

    # Hardcoded STRING release
    VERSION = 'v10.5'
    STRING_BASE = (
        "http://string-db.org/download/"
        "protein.links.detailed.{}").format(VERSION)

    def fetch_links(taxon):
        """Return the local path of the taxon's STRING links dump,
        downloading it first unless --use_cache was given."""
        file_name = '{}.protein.links.detailed.{}.txt.gz'.format(
            taxon, VERSION)
        dump_file = raw_dir / file_name
        if not args.use_cache:
            download_file('{}/{}'.format(STRING_BASE, file_name), dump_file)
        return dump_file

    def read_proteins(dump_file):
        """Return the unique protein ids of a gzipped links dump."""
        # in 3.6 gzip accepts Paths; str() keeps older versions working
        with gzip.open(str(dump_file), 'rb') as handle:
            frame = pd.read_csv(handle, sep=r'\s+')
        proteins = pd.unique(frame[['protein1', 'protein2']].values.ravel())
        logger.info("Processing %d proteins", len(proteins))
        return proteins

    def write_if_known(handle, p2gene_map, prot):
        """Write prot with its ENSEMBL curie when p2gene_map knows it;
        return whether a line was written."""
        try:
            ens_gene = p2gene_map[prot]
        except KeyError:
            return False
        handle.write("{}\t{}\n".format(prot, "ENSEMBL:{}".format(ens_gene)))
        return True

    with open(args.config, 'r') as config_file:
        # safe_load: yaml.load() without an explicit Loader can build
        # arbitrary Python objects and is rejected by PyYAML >= 6
        config = yaml.safe_load(config_file)

    with (out_path / "unmapped_ids.tsv").open("w") as unmapped_file:
        # Connect to ensembl
        connection = connect_to_database(
            host=config['database']['host'],
            username=config['database']['username'],
            port=config['database']['port'])
        try:
            cursor = connection.cursor()
            taxa = config['taxa_specific']

            # Process MGI eqs #
            ####################
            mouse = taxa['mouse']
            taxon = mouse['tax_id']
            with (out_path / mouse['output_file']).open('w') as mouse_file:
                dump_file = fetch_links(taxon)
                ensembl = Ensembl("rdf_graph", True)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)

                for protein in read_proteins(dump_file):
                    prot = protein.replace('{}.'.format(taxon), '')
                    if write_if_known(mouse_file, p2gene_map, prot):
                        continue

                    ens_gene = get_deprecated_protein_gene_rel(
                        cursor, prot, mouse['ensembl'], config)
                    intermine_resp = query_mousemine(
                        mouse['intermine'], ens_gene)
                    if intermine_resp.is_successful:
                        mouse_file.write("{}\t{}\n".format(
                            prot, intermine_resp.gene_id))
                    else:
                        unmapped_file.write("{}\t{}\t{}\n".format(
                            prot, ens_gene, taxon))

            # Process Fly eqs #
            ####################
            fly = taxa['fly']
            taxon = fly['tax_id']
            with (out_path / fly['output_file']).open('w') as fly_file:
                dump_file = fetch_links(taxon)
                ensembl = Ensembl("rdf_graph", True)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)

                for protein in read_proteins(dump_file):
                    prot = protein.replace('{}.'.format(taxon), '')
                    if write_if_known(fly_file, p2gene_map, prot):
                        continue

                    ens_gene = get_xref_protein_gene_rel(
                        cursor, prot, fly['ensembl'], config, taxon)
                    if ens_gene is not None:
                        fly_file.write("{}\t{}\n".format(
                            prot, "ENSEMBL:{}".format(ens_gene)))
                    else:
                        unmapped_file.write("{}\t{}\t{}\n".format(
                            prot, '', taxon))

            # Process Worm eqs #
            ####################
            worm = taxa['worm']
            taxon = worm['tax_id']
            uniprot_file = raw_dir / worm['uniprot_file']
            with (out_path / worm['output_file']).open('w') as worm_file:
                dump_file = fetch_links(taxon)
                if not args.use_cache:
                    download_file(worm['uniprot_mappings'], uniprot_file)

                ensembl = Ensembl("rdf_graph", True)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)
                uni2gene_map = ensembl.fetch_uniprot_gene_map(taxon)

                with gzip.open(str(uniprot_file), 'rb') as handle:
                    uniprot_df = pd.read_csv(handle, sep=r'\s+')

                # STRING id -> UniProt accession (text before the '|')
                string_uniprot_map = {}
                for _, row in uniprot_df.iterrows():
                    uniprot_ac = row['uniprot_ac|uniprot_id'].split('|')[0]
                    string_uniprot_map[row['string_id']] = uniprot_ac

                for protein in read_proteins(dump_file):
                    prot = protein.replace('{}.'.format(taxon), '')
                    if write_if_known(worm_file, p2gene_map, prot):
                        continue

                    # second chance: STRING id -> UniProt -> gene
                    try:
                        ens_gene = uni2gene_map[string_uniprot_map[prot]]
                    except KeyError:
                        unmapped_file.write("{}\t{}\t{}\n".format(
                            prot, '', taxon))
                    else:
                        worm_file.write("{}\t{}\n".format(
                            prot, "ENSEMBL:{}".format(ens_gene)))

            # Process ZFIN eqs #
            ####################
            fish = taxa['zebrafish']
            taxon = fish['tax_id']
            with (out_path / fish['output_file']).open('w') as zfin_file:
                dump_file = fetch_links(taxon)
                ensembl = Ensembl("rdf_graph", True)
                p2gene_map = ensembl.fetch_protein_gene_map(taxon)

                for protein in read_proteins(dump_file):
                    prot = protein.replace('{}.'.format(taxon), '')
                    if write_if_known(zfin_file, p2gene_map, prot):
                        continue

                    # 1. ask intermine about the protein itself
                    intermine_resp = query_fishmine(fish['intermine'], prot)
                    if intermine_resp.is_successful:
                        zfin_file.write("{}\t{}\n".format(
                            prot, intermine_resp.gene_id))
                        continue

                    # 2. resolve the deprecated protein to a gene and
                    #    ask intermine about that gene
                    ens_gene = get_deprecated_protein_gene_rel(
                        cursor, prot, fish['ensembl'], config)
                    intermine_resp = query_fishmine(
                        fish['intermine'], ens_gene)
                    if intermine_resp.is_successful:
                        zfin_file.write("{}\t{}\n".format(
                            prot, intermine_resp.gene_id))
                        continue

                    # 3. last attempt: look the gene up as a pseudogene
                    intermine_resp = query_fishmine(
                        fish['intermine'], ens_gene, "Pseudogene")
                    if intermine_resp.is_successful:
                        zfin_file.write("{}\t{}\n".format(
                            prot, intermine_resp.gene_id))
                    else:
                        unmapped_file.write("{}\t{}\t{}\n".format(
                            prot, ens_gene, taxon))
        finally:
            # always release the database connection
            connection.close()

    logger.info("ID Map Finished")
def setUp(self):
    """Create the Ensembl source and switch it to test-only mode."""
    self.source = Ensembl('rdf_graph', True)
    ensembl_source = self.source
    # restrict the run to the source's own 'gene' test ids
    ensembl_source.test_ids = ensembl_source.all_test_ids['gene']
    ensembl_source.settestonly(True)
    self._setDirToSource()
def setUp(self):
    """Create the Ensembl source and switch it to test-only mode."""
    self.source = Ensembl()
    # the 'gene' test ids come from the test configuration
    gene_test_ids = self._get_conf()['test_ids']['gene']
    self.source.test_ids = gene_test_ids
    self.source.settestonly(True)
    self._setDirToSource()
def setUp(self):
    """Create the Ensembl source and switch it to test-only mode."""
    self.source = Ensembl('rdf_graph', True)
    # the 'gene' test ids come from the test configuration
    conf = self._get_conf()
    self.source.test_ids = conf['test_ids']['gene']
    self.source.settestonly(True)
    self._setDirToSource()