def _get_version() -> str:
    """Get the version of the current data."""
    zip_path = ensure_path(PREFIX, url=URL)
    with zipfile.ZipFile(zip_path) as zip_file:
        for x in zip_file.filelist:
            if x.filename.endswith('.sqlite'):
                return x.filename[len('itisSqlite'):-len('/ITIS.sqlite')]
    raise ValueError('could not find a file with the version in it')
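# A minimal sketch of the slicing logic above, using a hypothetical archive
# member name (the real name depends on which ITIS release gets downloaded):
# the version is whatever sits between the 'itisSqlite' prefix and the
# '/ITIS.sqlite' suffix.
def _example_version_slice() -> None:
    """Demonstrate how the version is sliced out of the archive member name."""
    filename = 'itisSqlite043020/ITIS.sqlite'  # hypothetical example member
    version = filename[len('itisSqlite'):-len('/ITIS.sqlite')]
    assert version == '043020'  # ITIS stamps releases with a date-like code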
def get_path(version: str) -> str:
    """Get the path to the extracted ChEMBL SQLite database."""
    url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/'
        f'chembl_{version}/chembl_{version}_sqlite.tar.gz'
    )
    path = ensure_path(PREFIX, url=url, version=version)
    name = f'chembl_{version}/chembl_{version}_sqlite/chembl_{version}.db'
    directory = get_prefix_directory(PREFIX, version=version)
    op = os.path.join(directory, name)
    if not os.path.exists(op):
        # only extract the tarball on the first call
        with tarfile.open(path, mode='r', encoding='utf-8') as tar_file:
            tar_file.extractall(directory)
    return op
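# A usage sketch, not part of the module's public API. The release number
# '27' below is an arbitrary example, and molecule_dictionary is a standard
# ChEMBL table; both are illustrative assumptions, not requirements.
def _example_chembl_query() -> None:
    """Open the extracted ChEMBL database and run a trivial query."""
    db_path = get_path(version='27')  # hypothetical example release
    with closing(sqlite3.connect(db_path)) as conn:
        with closing(conn.cursor()) as cursor:
            cursor.execute('SELECT COUNT(*) FROM molecule_dictionary')
            print(cursor.fetchone()[0])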
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = get_prefix_directory(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, 'ITIS.sqlite', version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}', 'ITIS.sqlite'),
                        sqlite_path,
                    )
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))
    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent))
            for child, parent in cursor.fetchall()
        )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
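# A hypothetical consumer of iter_terms(), showing the shape of what it
# yields: each Term carries a Reference plus a list of parent References,
# exactly as constructed in the loop above.
def _example_itis_edges() -> list:
    """Collect (child, parent) identifier pairs from the term stream."""
    return [
        (term.reference.identifier, parent.identifier)
        for term in iter_terms()
        for parent in term.parents
    ]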
def iter_terms(version: str, autodownload: bool = False) -> Iterable[Term]:
    """Iterate over UMLS terms."""
    name = f'umls-{version}-mrconso.zip'
    url = f'https://download.nlm.nih.gov/umls/kss/{version}/{name}'
    if autodownload:
        # FIXME needs automated scrapy step where you put in user/password
        path = ensure_path(PREFIX, url=url, version=version)
    else:
        path = RAW_MODULE.get(PREFIX, version, name)
        if not path.exists():
            raise FileNotFoundError(
                f'UMLS needs to be downloaded manually still and moved to {path}. '
                f'See https://www.nlm.nih.gov/research/umls/index.html',
            )

    with zipfile.ZipFile(path) as zip_file:
        with zip_file.open('MRCONSO.RRF', mode='r') as file:
            it = tqdm(file, unit_scale=True, desc='[umls] parsing')
            lines = (line.decode('utf-8').strip().split('|') for line in it)
            for cui, cui_lines in itt.groupby(lines, key=operator.itemgetter(0)):
                df = pd.DataFrame(list(cui_lines), columns=RRF_COLUMNS)
                df = df[df['LAT - Language'] == 'ENG']

                # there should be exactly one preferred English row per CUI
                idx = (
                    (df['ISPREF - is preferred'] == 'Y')
                    & (df['TS - Term Status'] == 'P')
                    & (df['STT - String Type'] == 'PF')
                )
                pref_rows_df = df.loc[idx]
                if len(pref_rows_df.index) != 1:
                    it.write(f'no preferred term for umls:{cui}. got {len(pref_rows_df.index)}')
                    continue

                df['TTY - Term Type in Source'] = df['TTY - Term Type in Source'].map(synonym_abb.__getitem__)

                _r = pref_rows_df.iloc[0]
                sdf = df[['SAB - source name', 'CODE', 'TTY - Term Type in Source', 'STR']]

                synonyms = []
                xrefs = []
                for source, identifier, synonym_type, synonym in sdf.values:
                    norm_source = normalize_prefix(source)
                    if norm_source is None or not identifier:
                        provenance = []
                    else:
                        ref = Reference(prefix=norm_source, identifier=identifier)
                        provenance = [ref]
                        xrefs.append(ref)
                    synonyms.append(Synonym(
                        name=synonym,
                        provenance=provenance,
                        type=SynonymTypeDef.from_text(synonym_type),
                    ))

                xrefs = sorted(set(xrefs), key=lambda reference: (reference.prefix, reference.identifier))

                yield Term(
                    reference=Reference(prefix=PREFIX, identifier=cui, name=_r['STR']),
                    synonyms=synonyms,
                    xrefs=xrefs,
                )
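# The groupby above relies on MRCONSO.RRF rows arriving sorted by CUI, which
# the distributed file satisfies. A toy illustration with hypothetical,
# truncated rows (the real file carries the full set of pipe-delimited
# columns named in RRF_COLUMNS):
def _example_cui_grouping() -> None:
    """Show how contiguous rows sharing a CUI are grouped."""
    rows = [
        ['C0000001', 'ENG', '...'],  # hypothetical rows, CUI column first
        ['C0000001', 'ENG', '...'],
        ['C0000002', 'ENG', '...'],
    ]
    for cui, group in itt.groupby(rows, key=operator.itemgetter(0)):
        print(cui, len(list(group)))  # prints C0000001 2, then C0000002 1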