def _get_relations(self, limit):
    """
    Process each of the configured orthoxml files and extract the induced
    orthology and paralogy associations from the xml group nodes.

    The specs for orthoxml can be found here: http://orthoxml.org

    For every pairwise relation yielded by the parser, both proteins are
    added to the graph (via ``self.add_protein_to_graph``) and an
    ``OrthologyAssoc`` carrying phylogenetic evidence is added, e.g.

    Triples:
    <protein1_id> RO:orthologous <protein2_id>
    <assoc_id> :hasSubject <protein1_id>
    <assoc_id> :hasObject <protein2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dc:evidence ECO:phylogenetic_evidence

    Filtering:
    - in test mode, only relations where at least one raw ``protId`` is in
      ``self.test_ids`` are kept;
    - when ``self.tax_ids`` is not None, both proteins must belong to a
      taxon listed in it.

    :param limit: limit the number of induced pairwise relations per file;
        ``None`` means no limit. The limit is not applied in test mode.
    :return: None
    """
    logger.info("getting ortholog and paralog relations")

    graph = self.testgraph if self.testMode else self.graph
    model = Model(graph)

    for file_info in self.files.values():
        f = os.path.join(self.rawdir, file_info['file'])
        matchcounter = 0
        logger.info("Parsing %s", f)
        time_start = time.time()
        xml = lxml.etree.parse(f)
        parser = OrthoXMLParser(xml)
        logger.info(
            "loaded {} into memory. Took {}sec. Starting to extract relations..."
            .format(f, time.time() - time_start))

        # time0 / last_cnt mark the start of the current reporting window
        time0, last_cnt = time.time(), 0
        for cnts, (protein_nr_a, protein_nr_b, rel_type) in enumerate(
                parser.extract_pairwise_relations()):
            protein_a = parser.gene_mapping[protein_nr_a]
            protein_b = parser.gene_mapping[protein_nr_b]
            protein_id_a = protein_a.get('protId')
            protein_id_b = protein_b.get('protId')

            # progress report: looked at every 100 relations, emitted at most
            # once per 30 seconds
            if cnts % 100 == 0 and time.time() - time0 > 30:
                now = time.time()
                logger.info(
                    "processed {0:d} rels in {1:.1f}sec: "
                    "{2:.3f}/sec; overall {3:d} in {4:.1f}sec "
                    "({5:.3f}/sec); cache ratio: {6.hits}/{6.misses}".format(
                        cnts - last_cnt, now - time0,
                        (cnts - last_cnt) / (now - time0),
                        cnts, now - time_start,
                        cnts / (now - time_start),
                        self.add_protein_to_graph.cache_info()))
                time0, last_cnt = now, cnts

            if self.testMode and not (
                    protein_id_a in self.test_ids or
                    protein_id_b in self.test_ids):
                continue

            # counted before the taxon filter, so the limit below applies to
            # relations considered, not only to relations written
            matchcounter += 1

            taxon_a = self.extract_taxon_info(protein_a)
            taxon_b = self.extract_taxon_info(protein_b)

            # check if both proteins belong to taxa that are selected
            if self.tax_ids is not None:
                taxon_num_a = int(
                    re.sub(r'NCBITaxon:', '', taxon_a.rstrip()))
                if taxon_num_a not in self.tax_ids:
                    continue
                taxon_num_b = int(
                    re.sub(r'NCBITaxon:', '', taxon_b.rstrip()))
                if taxon_num_b not in self.tax_ids:
                    continue

            protein_id_a = self.clean_protein_id(protein_id_a)
            protein_id_b = self.clean_protein_id(protein_id_b)

            # add genes to graph if needed;
            # assume labels will be taken care of elsewhere
            self.add_protein_to_graph(protein_id_a, taxon_a, model)
            self.add_protein_to_graph(protein_id_b, taxon_b, model)

            # translate the parser's relation type via the global
            # translation table
            rel = self.globaltt[rel_type]
            evidence_id = self.globaltt['phylogenetic evidence']  # 'ECO:0000080'

            # add the association and relevant nodes to graph
            assoc = OrthologyAssoc(
                graph, self.name, protein_id_a, protein_id_b, rel)
            assoc.add_evidence(evidence_id)
            assoc.add_association_to_graph()

            if not self.testMode and limit is not None \
                    and matchcounter > limit:
                logger.warning(
                    "reached limit of relations to extract. Stopping early..."
                )
                break

        logger.info("finished processing %s", f)

    return
def _get_relations(self, limit):
    """
    Process each of the configured orthoxml files and extract the induced
    orthology and paralogy associations from the xml group nodes.

    The specs for orthoxml can be found here: http://orthoxml.org

    For every pairwise relation yielded by the parser, both proteins are
    added to the graph (via ``self.add_protein_to_graph``) and an
    ``OrthologyAssoc`` carrying phylogenetic evidence is added, e.g.

    Triples:
    <protein1_id> RO:orthologous <protein2_id>
    <assoc_id> :hasSubject <protein1_id>
    <assoc_id> :hasObject <protein2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dc:evidence ECO:phylogenetic_evidence

    Filtering:
    - in test mode, only relations where at least one raw ``protId`` is in
      ``self.test_ids`` are kept;
    - when ``self.tax_ids`` is not None, both proteins must belong to a
      taxon listed in it.

    :param limit: limit the number of induced pairwise relations per file;
        ``None`` means no limit. The limit is not applied in test mode.
    :return: None
    """
    logger.info("getting ortholog and paralog relations")

    g = self.testgraph if self.testMode else self.graph
    model = Model(g)

    for file_info in self.files.values():
        f = os.path.join(self.rawdir, file_info['file'])
        matchcounter = 0
        logger.info("Parsing %s", f)
        time_start = time.time()
        xml = lxml.etree.parse(f)
        parser = OrthoXMLParser(xml)
        logger.info(
            "loaded {} into memory. Took {}sec to load. "
            "Starting to extract relations..."
            .format(f, time.time() - time_start))

        # time0 / last_cnt mark the start of the current reporting window
        time0, last_cnt = time.time(), 0
        for cnts, (protein_nr_a, protein_nr_b, rel_type) in enumerate(
                parser.extract_pairwise_relations()):
            protein_a = parser.gene_mapping[protein_nr_a]
            protein_b = parser.gene_mapping[protein_nr_b]
            protein_id_a = protein_a.get('protId')
            protein_id_b = protein_b.get('protId')

            # progress report: looked at every 100 relations, emitted at most
            # once per 30 seconds
            if cnts % 100 == 0 and time.time() - time0 > 30:
                now = time.time()
                logger.info(
                    "processed {0:d} rels in {1:.1f}sec: {2:.3f}/sec; "
                    "overall {3:d} in "
                    "{4:.1f}sec ({5:.3f}/sec); "
                    "cache ratio: {6.hits}/{6.misses}".format(
                        cnts - last_cnt, now - time0,
                        (cnts - last_cnt) / (now - time0),
                        cnts, now - time_start,
                        cnts / (now - time_start),
                        self.add_protein_to_graph.cache_info()))
                time0, last_cnt = now, cnts

            if self.testMode and not (
                    protein_id_a in self.test_ids or
                    protein_id_b in self.test_ids):
                continue

            # counted before the taxon filter, so the limit below applies to
            # relations considered, not only to relations written
            matchcounter += 1

            taxon_a = self.extract_taxon_info(protein_a)
            taxon_b = self.extract_taxon_info(protein_b)

            # check if both proteins belong to taxa that are selected
            if self.tax_ids is not None:
                taxon_num_a = int(
                    re.sub(r'NCBITaxon:', '', taxon_a.rstrip()))
                if taxon_num_a not in self.tax_ids:
                    continue
                taxon_num_b = int(
                    re.sub(r'NCBITaxon:', '', taxon_b.rstrip()))
                if taxon_num_b not in self.tax_ids:
                    continue

            protein_id_a = self.clean_protein_id(protein_id_a)
            protein_id_b = self.clean_protein_id(protein_id_b)

            # add genes to graph if needed;
            # assume labels will be taken care of elsewhere
            self.add_protein_to_graph(protein_id_a, taxon_a, model)
            self.add_protein_to_graph(protein_id_b, taxon_b, model)

            rel = self._map_orthology_code_to_RO[rel_type]
            evidence_id = 'ECO:0000080'  # phylogenetic evidence

            # add the association and relevant nodes to graph
            assoc = OrthologyAssoc(
                g, self.name, protein_id_a, protein_id_b, rel)
            assoc.add_evidence(evidence_id)
            assoc.add_association_to_graph()

            if not self.testMode \
                    and limit is not None and matchcounter > limit:
                logger.warning(
                    "reached limit of relations to extract. Stopping early...")
                break

        logger.info("finished processing %s", f)

    return
def _get_orthologs(self, src_key, limit):
    """
    Process one pairwise orthology file, creating orthology associations
    based on the orthology code found in each row.

    Relationships are made between genes here.

    There is also a nominal amount of identifier re-formatting:
    MGI:MGI --> MGI
    Ensembl --> ENSEMBL

    We skip any row where we don't know how to map one of the gene
    identifiers (``self._clean_up_gene_id`` returns None); the offending
    identifier is collected and reported in a warning at the end.
    For example, Gene:Huwe1 for RAT is not an identifier,
    so we skip any mappings to this identifier. Often there are two
    entries for the same gene (based on equivalent Uniprot id),
    and so we are not actually losing any information.

    When ``self.tax_ids`` is not None, a row is kept if at least one gene
    of the pair has its taxon in ``self.tax_ids``.

    Genes are also added to a grouping class defined with a PANTHER id.

    Triples:
    <gene1_id> RO:orthologous <gene2_id>
    <assoc_id> :hasSubject <gene1_id>
    <assoc_id> :hasObject <gene2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dcterms:evidence ECO:phylogenetic_evidence

    <panther_id> rdf:type DATA:gene_family
    <panther_id> RO:has_member <gene1_id>
    <panther_id> RO:has_member <gene2_id>

    :param src_key: key into ``self.files``; also used as the name of the
        member to extract from the tar archive
    :param limit: stop once more than this many rows have passed the taxon
        filter; ``None`` means no limit
    :return: None
    """
    LOG.info("reading orthologs")

    if self.test_mode:
        graph = self.testgraph
    else:
        graph = self.graph
    model = Model(graph)
    unprocessed_gene_ids = set()

    src_file = '/'.join((self.rawdir, self.files[src_key]['file']))
    matchcounter = 0
    col = self.files[src_key]['columns']

    # the context manager guarantees the archive is closed, even on error
    with tarfile.open(src_file, 'r:gz') as reader:
        LOG.info("Parsing %s", src_key)

        # column positions are identical for every row; look them up once
        gene_idx = col.index('Gene')
        ortholog_idx = col.index('Ortholog')
        type_idx = col.index('Type of ortholog')
        panther_idx = col.index('Panther Ortholog ID')

        with reader.extractfile(src_key) as csvfile:
            # there are no comments or headers
            for line in csvfile:
                # parse each row. The common-ancestor column is unused
                # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                # MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                # LDO Euarchontoglires PTHR15964
                row = line.decode().split('\t')
                thing1 = row[gene_idx].strip()
                thing2 = row[ortholog_idx].strip()
                orthology_type = row[type_idx].strip()
                panther_id = row[panther_idx].strip()

                (species_a, gene_a, protein_a) = thing1.split('|')
                (species_b, gene_b, protein_b) = thing2.split('|')

                # for testing skip entries without homolog relationships
                # to test ids. Drop the whole 'UniProtKB=' prefix so the
                # bare accession is compared (a fixed [9:] slice would
                # leave the '=' attached).
                if self.test_mode and not (
                        protein_a.replace('UniProtKB=', '') in self.test_ids
                        or protein_b.replace('UniProtKB=', '')
                        in self.test_ids):
                    continue

                # map the species abbreviations to ncbi taxon id numbers
                taxon_a = self.resolve(species_a).split(':')[1].strip()
                taxon_b = self.resolve(species_b).split(':')[1].strip()

                # keep the relationship when gene1 OR gene2 is in the
                # taxid list; it is dropped only when neither is
                if self.tax_ids is not None and (
                        taxon_a not in self.tax_ids) and (
                        taxon_b not in self.tax_ids):
                    continue
                matchcounter += 1
                if limit is not None and matchcounter > limit:
                    break

                # fix the gene identifiers
                gene_a = re.sub(r'=', ':', gene_a)
                gene_b = re.sub(r'=', ':', gene_b)

                clean_gene = self._clean_up_gene_id(gene_a, species_a)
                if clean_gene is None:
                    unprocessed_gene_ids.add(gene_a)
                    continue
                gene_a = clean_gene

                clean_gene = self._clean_up_gene_id(gene_b, species_b)
                if clean_gene is None:
                    unprocessed_gene_ids.add(gene_b)
                    continue
                gene_b = clean_gene

                rel = self.resolve(orthology_type)

                evidence_id = self.globaltt['phylogenetic evidence']

                # add the association and relevant nodes to graph
                assoc = OrthologyAssoc(
                    graph, self.name, gene_a, gene_b, rel)
                assoc.add_evidence(evidence_id)

                # add genes to graph;
                # assume labels will be taken care of elsewhere
                model.addType(gene_a, self.globaltt['gene'])
                model.addType(gene_b, self.globaltt['gene'])

                # might as well add the taxon info for completeness
                graph.addTriple(
                    gene_a, self.globaltt['in taxon'],
                    'NCBITaxon:' + taxon_a
                )
                graph.addTriple(
                    gene_b, self.globaltt['in taxon'],
                    'NCBITaxon:' + taxon_b
                )

                assoc.add_association_to_graph(
                    blv.terms['GeneToGeneHomologyAssociation']
                )

                # note this is incomplete...
                # it won't construct the full family hierarchy,
                # just the top-grouping
                assoc.add_gene_family_to_graph('PANTHER:' + panther_id)

    LOG.info("finished processing %s", src_file)
    LOG.warning(
        "The following gene ids were unable to be processed: %s",
        str(unprocessed_gene_ids))
def _get_orthologs(self, limit):
    """
    Process each of the configured pairwise orthology files,
    creating orthology associations based on the orthology code in
    each row. This currently assumes that each of the orthology files
    is identically formatted (five tab-separated columns).

    Relationships are made between genes here.

    There is also a nominal amount of identifier re-formatting:
    MGI:MGI --> MGI
    Ensembl --> ENSEMBL

    We skip any row where we don't know how to map one of the gene
    identifiers (``self._clean_up_gene_id`` returns None); the offending
    identifiers are collected and reported in a warning.
    For example, Gene:Huwe1 for RAT is not an identifier,
    so we skip any mappings to this identifier. Often there are two
    entries for the same gene (based on equivalent Uniprot id),
    and so we are not actually losing any information.

    When ``self.tax_ids`` is not None, a row is kept if at least one gene
    of the pair has its NCBI taxon number in ``self.tax_ids``.

    Genes are also added to a grouping class defined with a PANTHER id.

    Triples:
    <gene1_id> RO:orthologous <gene2_id>
    <assoc_id> :hasSubject <gene1_id>
    <assoc_id> :hasObject <gene2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dc:evidence ECO:phylogenetic_evidence

    <panther_id> a DATA:gene_family
    <panther_id> RO:has_member <gene1_id>
    <panther_id> RO:has_member <gene2_id>

    :param limit: per file, stop once more than this many rows have passed
        the taxon filter, or (outside test mode) once more than this many
        non-comment lines have been read; ``None`` means no limit
    :return: None
    """
    logger.info("getting orthologs")

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph
    model = Model(g)
    unprocessed_gene_ids = set()

    for file_info in self.files.values():
        f = '/'.join((self.rawdir, file_info['file']))
        matchcounter = 0

        # the context manager guarantees the archive is closed, even on error
        with tarfile.open(f, 'r:gz') as mytar:
            # assume that the first entry is the item
            fname = mytar.getmembers()[0]
            logger.info("Parsing %s", fname.name)
            line_counter = 0
            with mytar.extractfile(fname) as csvfile:
                for raw_line in csvfile:
                    # decode once and reuse for both checks below
                    text = raw_line.decode()

                    # skip comment lines
                    if text.startswith('#'):
                        logger.info("Skipping header line")
                        continue
                    line_counter += 1

                    # a little feedback to the user since there's so many
                    if line_counter % 1000000 == 0:
                        logger.info(
                            "Processed %d lines from %s",
                            line_counter, fname.name)

                    line = text.strip()

                    # parse each row. ancestor_taxon is unused
                    # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                    # MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                    # LDO Euarchontoglires PTHR15964
                    (a, b, orthology_class, _ancestor_taxon,
                     panther_id) = line.split('\t')

                    (species_a, gene_a, protein_a) = a.split('|')
                    (species_b, gene_b, protein_b) = b.split('|')

                    # skip the entries that don't have homolog
                    # relationships with the test ids
                    if self.testMode and not (
                            re.sub(r'UniProtKB=', '', protein_a)
                            in self.test_ids or
                            re.sub(r'UniProtKB=', '', protein_b)
                            in self.test_ids):
                        continue

                    # map the taxon abbreviations to ncbi taxon ids
                    taxon_a = self._map_taxon_abbr_to_id(species_a)
                    taxon_b = self._map_taxon_abbr_to_id(species_b)

                    # keep the relationship when gene1 OR gene2 is in the
                    # taxid list; it is dropped only when neither is
                    if (self.tax_ids is not None and
                            (int(re.sub(
                                r'NCBITaxon:', '', taxon_a.rstrip()))
                             not in self.tax_ids) and
                            (int(re.sub(
                                r'NCBITaxon:', '', taxon_b.rstrip()))
                             not in self.tax_ids)):
                        continue
                    matchcounter += 1
                    if limit is not None and matchcounter > limit:
                        break

                    # fix the gene identifiers
                    gene_a = re.sub(r'=', ':', gene_a)
                    gene_b = re.sub(r'=', ':', gene_b)

                    # both genes are always checked so that every
                    # unmappable identifier in the row gets reported
                    clean_gene = self._clean_up_gene_id(gene_a, species_a)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_a)
                    gene_a = clean_gene
                    clean_gene = self._clean_up_gene_id(gene_b, species_b)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_b)
                    gene_b = clean_gene

                    # a special case here; mostly some rat genes
                    # they use symbols instead of identifiers. will skip
                    if gene_a is None or gene_b is None:
                        continue

                    rel = self._map_orthology_code_to_RO(orthology_class)

                    evidence_id = 'ECO:0000080'  # phylogenetic evidence

                    # add the association and relevant nodes to graph
                    assoc = OrthologyAssoc(g, self.name, gene_a, gene_b, rel)
                    assoc.add_evidence(evidence_id)

                    # add genes to graph;
                    # assume labels will be taken care of elsewhere
                    model.addClassToGraph(gene_a, None)
                    model.addClassToGraph(gene_b, None)

                    # might as well add the taxon info for completeness
                    g.addTriple(
                        gene_a, model.object_properties['in_taxon'], taxon_a)
                    g.addTriple(
                        gene_b, model.object_properties['in_taxon'], taxon_b)

                    assoc.add_association_to_graph()

                    # note this is incomplete...
                    # it won't construct the full family hierarchy,
                    # just the top-grouping
                    assoc.add_gene_family_to_graph(
                        ':'.join(('PANTHER', panther_id)))

                    if not self.testMode \
                            and limit is not None and line_counter > limit:
                        break

        # make report on unprocessed_gene_ids
        logger.info("finished processing %s", f)
        logger.warning(
            "The following gene ids were unable to be processed: %s",
            str(unprocessed_gene_ids))

    return
def _get_orthologs(self, limit):
    """
    Process each of the configured pairwise orthology files,
    creating orthology associations based on the orthology code in
    each row. This currently assumes that each of the orthology files
    is identically formatted (five tab-separated columns).

    Relationships are made between genes here.

    There is also a nominal amount of identifier re-formatting:
    MGI:MGI --> MGI
    Ensembl --> ENSEMBL

    We skip any row where we don't know how to map one of the gene
    identifiers (``self._clean_up_gene_id`` returns None); the offending
    identifiers are collected and reported in a warning.
    For example, Gene:Huwe1 for RAT is not an identifier,
    so we skip any mappings to this identifier. Often there are two
    entries for the same gene (based on equivalent Uniprot id),
    and so we are not actually losing any information.

    When ``self.tax_ids`` is not None, a row is kept if at least one gene
    of the pair has its NCBI taxon number in ``self.tax_ids``.

    Genes are also added to a grouping class defined with a PANTHER id.

    Triples:
    <gene1_id> RO:orthologous <gene2_id>
    <assoc_id> :hasSubject <gene1_id>
    <assoc_id> :hasObject <gene2_id>
    <assoc_id> :hasPredicate <RO:orthologous>
    <assoc_id> dc:evidence ECO:phylogenetic_evidence

    <panther_id> a DATA:gene_family
    <panther_id> RO:has_member <gene1_id>
    <panther_id> RO:has_member <gene2_id>

    :param limit: per file, stop once more than this many rows have passed
        the taxon filter, or (outside test mode) once more than this many
        non-comment lines have been read; ``None`` means no limit
    :return: None
    """
    logger.info("getting orthologs")

    if self.testMode:
        g = self.testgraph
    else:
        g = self.graph

    gu = GraphUtils(curie_map.get())

    unprocessed_gene_ids = set()

    for file_info in self.files.values():
        f = '/'.join((self.rawdir, file_info['file']))
        matchcounter = 0

        # the context manager guarantees the archive is closed, even on error
        with tarfile.open(f, 'r:gz') as mytar:
            # assume that the first entry is the item
            fname = mytar.getmembers()[0]
            logger.info("Parsing %s", fname.name)
            line_counter = 0
            with mytar.extractfile(fname) as csvfile:
                for raw_line in csvfile:
                    # decode once and reuse for both checks below
                    text = raw_line.decode()

                    # skip comment lines
                    if text.startswith('#'):
                        logger.info("Skipping header line")
                        continue
                    line_counter += 1

                    # a little feedback to the user since there's so many
                    if line_counter % 1000000 == 0:
                        logger.info(
                            "Processed %d lines from %s",
                            line_counter, fname.name)

                    line = text.strip()

                    # parse each row. ancestor_taxon is unused
                    # HUMAN|Ensembl=ENSG00000184730|UniProtKB=Q0VD83
                    # MOUSE|MGI=MGI=2176230|UniProtKB=Q8VBT6
                    # LDO Euarchontoglires PTHR15964
                    (a, b, orthology_class, _ancestor_taxon,
                     panther_id) = line.split('\t')

                    (species_a, gene_a, protein_a) = a.split('|')
                    (species_b, gene_b, protein_b) = b.split('|')

                    # skip the entries that don't have homolog
                    # relationships with the test ids
                    if self.testMode and not (
                            re.sub('UniProtKB=', '', protein_a)
                            in self.test_ids or
                            re.sub('UniProtKB=', '', protein_b)
                            in self.test_ids):
                        continue

                    # map the taxon abbreviations to ncbi taxon ids
                    taxon_a = self._map_taxon_abbr_to_id(species_a)
                    taxon_b = self._map_taxon_abbr_to_id(species_b)

                    # keep the relationship when gene1 OR gene2 is in the
                    # taxid list; it is dropped only when neither is
                    if (self.tax_ids is not None and
                            (int(re.sub(
                                'NCBITaxon:', '', taxon_a.rstrip()))
                             not in self.tax_ids) and
                            (int(re.sub(
                                'NCBITaxon:', '', taxon_b.rstrip()))
                             not in self.tax_ids)):
                        continue
                    matchcounter += 1
                    if limit is not None and matchcounter > limit:
                        break

                    # fix the gene identifiers
                    gene_a = re.sub('=', ':', gene_a)
                    gene_b = re.sub('=', ':', gene_b)

                    # both genes are always checked so that every
                    # unmappable identifier in the row gets reported
                    clean_gene = self._clean_up_gene_id(gene_a, species_a)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_a)
                    gene_a = clean_gene
                    clean_gene = self._clean_up_gene_id(gene_b, species_b)
                    if clean_gene is None:
                        unprocessed_gene_ids.add(gene_b)
                    gene_b = clean_gene

                    # a special case here; mostly some rat genes
                    # they use symbols instead of identifiers. will skip
                    if gene_a is None or gene_b is None:
                        continue

                    rel = self._map_orthology_code_to_RO(orthology_class)

                    evidence_id = 'ECO:0000080'  # phylogenetic evidence

                    # add the association and relevant nodes to graph
                    assoc = OrthologyAssoc(self.name, gene_a, gene_b, rel)
                    assoc.add_evidence(evidence_id)

                    # add genes to graph;
                    # assume labels will be taken care of elsewhere
                    gu.addClassToGraph(g, gene_a, None)
                    gu.addClassToGraph(g, gene_b, None)

                    assoc.add_association_to_graph(g)

                    # note this is incomplete...
                    # it won't construct the full family hierarchy,
                    # just the top-grouping
                    assoc.add_gene_family_to_graph(
                        g, ':'.join(('PANTHER', panther_id)))

                    if not self.testMode \
                            and limit is not None and line_counter > limit:
                        break

        logger.info("finished processing %s", f)
        # Logger.warn is a deprecated alias; use Logger.warning
        logger.warning(
            "The following gene ids were unable to be processed: %s",
            str(unprocessed_gene_ids))

    gu.loadProperties(g, OrthologyAssoc.object_properties, gu.OBJPROP)
    gu.loadProperties(g, OrthologyAssoc.annotation_properties, gu.ANNOTPROP)

    return