def get_superclasses(self):
    if not self.superclasses.empty:
        return self.superclasses
    if self.from_backup:
        self.superclasses = open_pickle(SUPER_BACKUP_PATH)
        return self.superclasses
    engine = create_engine(self.db_url)
    data = """
        SELECT ts.tid, ts.superclass_tid,
               t1.label AS term_label, t1.ilx AS term_ilx,
               t2.label AS superclass_label, t2.ilx AS superclass_ilx
        FROM term_superclasses AS ts
        JOIN (
            SELECT *
            FROM terms
            GROUP BY terms.ilx
        ) AS t1 ON t1.id = ts.tid
        JOIN (
            SELECT *
            FROM terms
            GROUP BY terms.ilx
        ) AS t2 ON t2.id = ts.superclass_tid
    """
    self.superclasses = pd.read_sql(data, engine)
    create_pickle(self.superclasses, SUPER_BACKUP_PATH)
    return self.superclasses
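# Illustrative usage of the cache-or-backup pattern shared by the get_* methods
# below (a sketch only; the from_backup constructor keyword is an assumption
# based on the self.from_backup attribute used above):
#
#     sql = IlxSql(db_url=os.environ['SCICRUNCH_DB_URL_PRODUCTION'], from_backup=True)
#     supers = sql.get_superclasses()  # loads the pickle at SUPER_BACKUP_PATH
#     supers = sql.get_superclasses()  # second call returns the cached DataFrame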
def get_annotations(self):
    if not self.annotations.empty:
        return self.annotations
    if self.from_backup:
        self.annotations = open_pickle(ANNOS_BACKUP_PATH)
        return self.annotations
    engine = create_engine(self.db_url)
    data = """
        SELECT ta.tid, ta.annotation_tid AS annotation_type_tid,
               t1.ilx AS term_ilx,
               t2.ilx AS annotation_type_ilx, t2.label AS annotation_type_label,
               ta.value
        FROM term_annotations AS ta
        JOIN (
            SELECT *
            FROM terms
            GROUP BY terms.ilx
        ) AS t1 ON ta.tid = t1.id
        JOIN (
            SELECT *
            FROM terms
            GROUP BY terms.ilx
        ) AS t2 ON ta.annotation_tid = t2.id
    """
    self.annotations = pd.read_sql(data, engine)
    create_pickle(self.annotations, ANNOS_BACKUP_PATH)
    return self.annotations
def get_relationships(self):
    if not self.relationships.empty:
        return self.relationships
    if self.from_backup:
        self.relationships = open_pickle(RELAS_BACKUP_PATH)
        return self.relationships
    engine = create_engine(self.db_url)
    data = """
        SELECT t1.id AS term1_tid, t1.ilx AS term1_ilx, t1.type AS term1_type,
               t2.id AS term2_tid, t2.ilx AS term2_ilx, t2.type AS term2_type,
               t3.id AS relationship_tid, t3.ilx AS relationship_ilx, t3.label AS relationship_label
        FROM term_relationships AS tr
        JOIN (
            SELECT *
            FROM terms
            GROUP BY terms.ilx
        ) AS t1 ON t1.id = tr.term1_id
        JOIN (
            SELECT *
            FROM terms
            GROUP BY terms.ilx
        ) AS t2 ON t2.id = tr.term2_id
        JOIN (
            SELECT *
            FROM terms
            GROUP BY terms.ilx
        ) AS t3 ON t3.id = tr.relationship_tid
    """
    self.relationships = pd.read_sql(data, engine)
    create_pickle(self.relationships, RELAS_BACKUP_PATH)
    return self.relationships
def __create_pickles(self):
    sb.call('mkdir ' + self.output, shell=True)
    for i, f in enumerate(self.files, 1):
        print(i, len(self.files), f)
        name = p(f).stem
        output = p(self.output) / name
        create_pickle(Graph2Pandas(f).df, output)
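# A pathlib-based alternative to the shell `mkdir` call above (illustrative
# sketch only; `self.output` is assumed to be a path-like string and parents
# may be missing):
#
#     from pathlib import Path
#     Path(self.output).mkdir(parents=True, exist_ok=True)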
def get_terms(self):
    ''' GROUP BY t.ilx is a shortcut for keeping only the first row of each group. '''
    if not self.terms.empty:
        return self.terms
    if self.from_backup:
        self.terms = open_pickle(TERMS_BACKUP_PATH)
        return self.terms
    engine = create_engine(self.db_url)
    data = """
        SELECT t.id AS tid, t.ilx, t.label, t.definition, t.type,
               t.comment, t.version, t.uid, t.time
        FROM terms t
        GROUP BY t.ilx
    """
    self.terms = pd.read_sql(data, engine)
    create_pickle(self.terms, TERMS_BACKUP_PATH)
    return self.terms
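# A pandas rendering of what the GROUP BY above achieves (illustrative sketch;
# `raw_terms` is a hypothetical undeduplicated terms DataFrame):
#
#     deduped = raw_terms.drop_duplicates(subset='ilx', keep='first')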
def get_synonyms(self):
    if not self.synonyms.empty:
        return self.synonyms
    if self.from_backup:
        self.synonyms = open_pickle(SYNOS_BACKUP_PATH)
        return self.synonyms
    engine = create_engine(self.db_url)
    data = """
        SELECT ts.tid AS tid, t.ilx, ts.literal, ts.type
        FROM term_synonyms AS ts
        JOIN (
            SELECT *
            FROM terms
            GROUP BY terms.ilx
        ) AS t ON ts.tid = t.id
    """
    self.synonyms = pd.read_sql(data, engine)
    create_pickle(self.synonyms, SYNOS_BACKUP_PATH)
    return self.synonyms
def get_existing_ids(self):
    if not self.existing_ids.empty:
        return self.existing_ids
    if self.from_backup:
        self.existing_ids = open_pickle(EXIDS_BACKUP_PATH)
        return self.existing_ids
    engine = create_engine(self.db_url)
    data = """
        SELECT tei.tid, tei.curie, tei.iri, tei.preferred,
               t.ilx, t.label, t.definition
        FROM (
            SELECT *
            FROM terms
            GROUP BY terms.ilx
        ) AS t
        JOIN term_existing_ids AS tei ON t.id = tei.tid
    """
    self.existing_ids = pd.read_sql(data, engine)
    create_pickle(self.existing_ids, EXIDS_BACKUP_PATH)
    return self.existing_ids
def get_terms_complete(self) -> pd.DataFrame:
    ''' Gets complete entity data, like the term/view page. '''
    if not self.terms_complete.empty:
        return self.terms_complete
    if self.from_backup:
        self.terms_complete = open_pickle(TERMS_COMPLETE_BACKUP_PATH)
        return self.terms_complete
    ilx2synonyms = self.get_ilx2synonyms()
    ilx2existing_ids = self.get_ilx2existing_ids()
    ilx2annotations = self.get_ilx2annotations()
    ilx2superclass = self.get_ilx2superclass()
    ilx_complete = []
    header = ['Index'] + list(self.get_terms().columns)
    for row in self.get_terms().itertuples():
        row = {header[i]: val for i, val in enumerate(row)}
        row['synonyms'] = ilx2synonyms.get(row['ilx'])
        row['existing_ids'] = ilx2existing_ids[row['ilx']]  # if this breaks we have worse problems
        row['annotations'] = ilx2annotations.get(row['ilx'])
        row['superclass'] = ilx2superclass.get(row['ilx'])
        ilx_complete.append(row)
    self.terms_complete = pd.DataFrame(ilx_complete)
    create_pickle(self.terms_complete, TERMS_COMPLETE_BACKUP_PATH)
    return self.terms_complete
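# Illustrative usage (a sketch, not from the original module): build the
# denormalized frame once, then later runs can reuse the pickled backup.
#
#     sql = IlxSql(db_url=os.environ['SCICRUNCH_DB_URL_PRODUCTION'])
#     complete = sql.get_terms_complete()
#     complete[['ilx', 'label', 'synonyms', 'existing_ids', 'superclass']].head()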
for row in terms.itertuples():  # `terms` assumed to be the DataFrame from get_terms()
    if not data.get(row.ilx):
        data[row.ilx] = defaultdict(set)
    data[row.ilx]['label'].add(row.label)
    data[row.ilx]['ilx'].add(row.ilx)
    data[row.ilx]['definition'].add(row.definition)
for row in annos.itertuples():
    data[row.term_ilx]['annotation'].add((row.annotation_label, row.value))
for row in synonyms.itertuples():
    if row.literal:
        data[row.ilx]['synonym'].add(row.literal)
for row in existing_ids.itertuples():
    data[row.ilx]['existing_ids'].add((row.curie, row.iri))
for row in superclasses.itertuples():
    data[row.term_ilx]['superclass_ilx'].add(row.superclass_ilx)
    superclass_exs = data[row.superclass_ilx]['existing_ids']
    for superclass_ex in superclass_exs:
        data[row.term_ilx]['superclass_existing_ids'].add(superclass_ex)
for row in relationships.itertuples():
    data[row.term1_ilx]['relationship'].add(
        (row.term1_ilx, row.relationship_ilx, row.term2_ilx))
    data[row.term2_ilx]['relationship'].add(
        (row.term2_ilx, row.relationship_ilx, row.term1_ilx))
df = pd.DataFrame.from_records([record for record in data.values()])
create_pickle(df, output)
for row in relationships.itertuples():  # `relationships` assumed to come from get_relationships()
    relationship_ilx_uri = '/'.join([ilx_uri_base, row.relationship_ilx])
    graph.bind(prefix, relationship_ilx_uri)
    relationship_ilx_uri = URIRef(relationship_ilx_uri)
    term1_ilx_uri = '/'.join([ilx_uri_base, row.term1_ilx])
    term1_ilx_uri = URIRef(term1_ilx_uri)
    if not in_sanity_check.get(term1_ilx_uri):
        print('relationships', term1_ilx_uri)
    term2_ilx_uri = '/'.join([ilx_uri_base, row.term2_ilx])
    term2_ilx_uri = URIRef(term2_ilx_uri)
    if not in_sanity_check.get(term2_ilx_uri):
        print('relationships', term2_ilx_uri)
    graph.add((term1_ilx_uri, relationship_ilx_uri, term2_ilx_uri))
    graph.add((term2_ilx_uri, relationship_ilx_uri, term1_ilx_uri))
print('=== relationship triples complete ===')
graph.serialize(destination=str(p.home() / 'Dropbox/interlex_backups/InterLex.ttl'),
                format='turtle')
graph.serialize(destination=str(
    p.home() / 'Dropbox/interlex_backups/SciGraph/SciGraph-core/src/test/resources/ontologies/'
), format='turtle')
create_pickle(graph, p.home() / 'Dropbox/interlex_backups/InterLex.graph.pickle')
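# Illustrative sketch (not part of the original script) for loading the
# artifacts written above; assumes create_pickle wraps pickle.dump, and the
# paths mirror the serialize calls.
from pathlib import Path
import pickle

from rdflib import Graph


def load_interlex_ttl(ttl_path: Path) -> Graph:
    # Parse the Turtle dump back into an rdflib Graph.
    g = Graph()
    g.parse(str(ttl_path), format='turtle')
    return g


def load_interlex_graph_pickle(pickle_path: Path) -> Graph:
    # Round-trip the in-memory Graph object written by create_pickle.
    with open(pickle_path, 'rb') as f:
        return pickle.load(f)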
from pathlib import Path as p
import os

from ilxutils.interlex_sql import IlxSql
from ilxutils.tools import create_pickle

sql = IlxSql(db_url=os.environ.get('SCICRUNCH_DB_URL_PRODUCTION'))

terms = sql.get_terms()
create_pickle(terms, p.home() / 'Dropbox/interlex_backups/ilx_db_terms_backup.pickle')
print('=== terms backup complete ===')
del terms

annos = sql.get_annotations()
create_pickle(annos, p.home() / 'Dropbox/interlex_backups/ilx_db_annos_backup.pickle')
print('=== annotations backup complete ===')
del annos

ex = sql.get_existing_ids()
create_pickle(ex, p.home() / 'Dropbox/interlex_backups/ilx_db_ex_backup.pickle')
print('=== existing ids backup complete ===')
del ex

synonyms = sql.get_synonyms()
create_pickle(synonyms, p.home() / 'Dropbox/interlex_backups/ilx_db_synonyms_backup.pickle')
print('=== synonyms backup complete ===')
del synonyms
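# Restore sketch (illustrative, not part of the original script); assumes
# open_pickle is the counterpart to create_pickle in ilxutils.tools, which is
# an assumption based on the get_* methods above.
#
#     from ilxutils.tools import open_pickle
#     terms = open_pickle(p.home() / 'Dropbox/interlex_backups/ilx_db_terms_backup.pickle')
#     print(terms.shape)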