def upload_artifacts_for_prefix(*, prefix: str, bucket: str):
    """Upload compiled parts for the given prefix to AWS."""
    logger.info('[%s] getting id->name mapping', prefix)
    get_id_name_mapping(prefix)
    id_name_path = prefix_directory_join(prefix, 'cache', 'names.tsv')
    id_name_key = os.path.join(prefix, 'cache', 'names.tsv')
    logger.info('[%s] uploading id->name mapping', prefix)
    upload_file(path=id_name_path, bucket=bucket, key=id_name_key)

    logger.info('[%s] getting id->synonyms mapping', prefix)
    get_id_synonyms_mapping(prefix)
    id_synonyms_path = prefix_directory_join(prefix, 'cache', 'synonyms.tsv')
    id_synonyms_key = os.path.join(prefix, 'cache', 'synonyms.tsv')
    logger.info('[%s] uploading id->synonyms mapping', prefix)
    upload_file(path=id_synonyms_path, bucket=bucket, key=id_synonyms_key)

    logger.info('[%s] getting xrefs', prefix)
    get_xrefs_df(prefix)
    xrefs_path = prefix_directory_join(prefix, 'cache', 'xrefs.tsv')
    xrefs_key = os.path.join(prefix, 'cache', 'xrefs.tsv')
    logger.info('[%s] uploading xrefs', prefix)
    upload_file(path=xrefs_path, bucket=bucket, key=xrefs_key)

    logger.info('[%s] getting relations', prefix)
    get_relations_df(prefix)
    relations_path = prefix_directory_join(prefix, 'cache', 'relations.tsv')
    relations_key = os.path.join(prefix, 'cache', 'relations.tsv')
    logger.info('[%s] uploading relations', prefix)
    upload_file(path=relations_path, bucket=bucket, key=relations_key)

    logger.info('[%s] getting properties', prefix)
    get_properties_df(prefix)
    properties_path = prefix_directory_join(prefix, 'cache', 'properties.tsv')
    properties_key = os.path.join(prefix, 'cache', 'properties.tsv')
    logger.info('[%s] uploading properties', prefix)
    upload_file(path=properties_path, bucket=bucket, key=properties_key)
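# The five blocks above repeat a single get/locate/upload pattern. Below is a
# minimal table-driven sketch of the same routine, assuming the helpers used
# above (get_id_name_mapping, upload_file, etc.) keep their signatures; the
# names ARTIFACTS and upload_artifacts_for_prefix_sketch are hypothetical.
ARTIFACTS = [
    (get_id_name_mapping, 'names.tsv', 'id->name mapping'),
    (get_id_synonyms_mapping, 'synonyms.tsv', 'id->synonyms mapping'),
    (get_xrefs_df, 'xrefs.tsv', 'xrefs'),
    (get_relations_df, 'relations.tsv', 'relations'),
    (get_properties_df, 'properties.tsv', 'properties'),
]


def upload_artifacts_for_prefix_sketch(*, prefix: str, bucket: str):
    """Upload compiled parts for the given prefix to AWS (table-driven sketch)."""
    for getter, filename, label in ARTIFACTS:
        logger.info('[%s] getting %s', prefix, label)
        getter(prefix)  # compiles the artifact into the prefix's cache directory
        path = prefix_directory_join(prefix, 'cache', filename)
        key = os.path.join(prefix, 'cache', filename)
        logger.info('[%s] uploading %s', prefix, label)
        upload_file(path=path, bucket=bucket, key=key)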
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, url=URL)
    version = _get_version()
    sqlite_dir = get_prefix_directory(PREFIX, version=version)
    sqlite_path = prefix_directory_join(PREFIX, 'ITIS.sqlite', version=version)
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path) as zip_file:
            for x in zip_file.filelist:
                if x.filename.endswith('.sqlite'):
                    zip_file.extract(x, sqlite_dir)
                    shutil.move(
                        os.path.join(sqlite_dir, f'itisSqlite{version}', 'ITIS.sqlite'),
                        sqlite_path,
                    )
                    os.rmdir(os.path.join(sqlite_dir, f'itisSqlite{version}'))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent))
            for child, parent in cursor.fetchall()
        )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
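# multidict (from pyobo's utilities) groups the (child, parent) rows produced
# by HIERARCHY_QUERY into a child -> [parents] adjacency list. A minimal
# standard-library equivalent, for illustration only (multidict_sketch is a
# hypothetical name, not part of the module):
from collections import defaultdict
from typing import Iterable, List, Mapping, Tuple


def multidict_sketch(pairs: Iterable[Tuple[str, str]]) -> Mapping[str, List[str]]:
    """Group (key, value) pairs into a key -> list-of-values dict."""
    rv = defaultdict(list)
    for key, value in pairs:
        rv[key].append(value)
    return dict(rv)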
def iter_terms() -> Iterable[Term]:
    """Get ITIS terms."""
    zip_path = ensure_path(PREFIX, URL)
    sqlite_path = prefix_directory_join(PREFIX, 'itisSqlite043020', 'ITIS.sqlite')
    if not os.path.exists(sqlite_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(get_prefix_directory(PREFIX))

    if not os.path.exists(sqlite_path):
        raise FileNotFoundError(f'file missing: {sqlite_path}')

    conn = sqlite3.connect(sqlite_path)

    with closing(conn.cursor()) as cursor:
        cursor.execute(LONGNAMES_QUERY)
        id_to_reference = {
            str(identifier): Reference(prefix=PREFIX, identifier=str(identifier), name=name)
            for identifier, name in cursor.fetchall()
        }

    with closing(conn.cursor()) as cursor:
        cursor.execute(HIERARCHY_QUERY)
        id_to_parents = multidict(
            (str(child), str(parent))
            for child, parent in cursor.fetchall()
        )

    for identifier, reference in id_to_reference.items():
        parents = []
        for parent_identifier in id_to_parents.get(identifier, []):
            if parent_identifier == '0':  # this means it's a plant
                continue
            parents.append(id_to_reference[parent_identifier])
        term = Term(
            reference=reference,
            parents=parents,
        )
        yield term
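# LONGNAMES_QUERY and HIERARCHY_QUERY are referenced above but defined
# elsewhere in the module. Plausible definitions against the ITIS SQLite
# schema, written as assumptions (table and column names are not confirmed
# by this snippet):
LONGNAMES_QUERY = """
SELECT tsn, completename
FROM longnames
"""

HIERARCHY_QUERY = """
SELECT TSN, Parent_TSN
FROM hierarchy
"""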
def _lookup(name):
    if name in taxonomy_remapping:
        return taxonomy_remapping[name]
    return get_name_id_mapping('ncbitaxon')[name]


def get_df() -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    df = ensure_df(PREFIX, URL, skiprows=28, dtype=str)
    df['taxonomy_id'] = df['ORGANISM_OFFICIAL_NAME'].map(_lookup)
    return df


@cached_mapping(
    path=prefix_directory_join(PREFIX, 'cache', 'xrefs', 'ncbigene.tsv'),
    header=['biogrid_id', 'ncbigene_id'],
)
def get_ncbigene_mapping() -> Mapping[str, str]:
    """Get BioGRID to NCBIGENE mapping.

    Is basically equivalent to:

    .. code-block:: python

        from pyobo import get_filtered_xrefs
        biogrid_ncbigene_mapping = get_filtered_xrefs('biogrid', 'ncbigene')
    """
    df = get_df()
    df = df.loc[df['IDENTIFIER_TYPE'] == 'ENTREZ_GENE', ['BIOGRID_ID', 'IDENTIFIER_VALUE']]
    return dict(df.values)
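# Hypothetical usage of the cached mapping above; '12345' is a placeholder
# BioGRID identifier for illustration, not a known entry.
biogrid_to_ncbigene = get_ncbigene_mapping()
ncbigene_id = biogrid_to_ncbigene.get('12345')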
def _lookup(name):
    if name in taxonomy_remapping:
        return taxonomy_remapping[name]
    return get_name_id_mapping('ncbitaxon')[name]


def get_df() -> pd.DataFrame:
    """Get the BioGRID identifiers mapping dataframe."""
    version = bioversions.get_version('biogrid')
    url = f'{BASE_URL}/BIOGRID-{version}/BIOGRID-IDENTIFIERS-{version}.tab.zip'
    df = ensure_df(PREFIX, url=url, skiprows=28, dtype=str, version=version)
    df['taxonomy_id'] = df['ORGANISM_OFFICIAL_NAME'].map(_lookup)
    return df


@cached_mapping(
    path=prefix_directory_join(PREFIX, 'cache', 'xrefs', 'ncbigene.tsv', version=version_getter(PREFIX)),
    header=['biogrid_id', 'ncbigene_id'],
)
def get_ncbigene_mapping() -> Mapping[str, str]:
    """Get BioGRID to NCBIGENE mapping.

    Is basically equivalent to:

    .. code-block:: python

        from pyobo import get_filtered_xrefs
        biogrid_ncbigene_mapping = get_filtered_xrefs('biogrid', 'ncbigene')
    """
    df = get_df()
    df = df.loc[df['IDENTIFIER_TYPE'] == 'ENTREZ_GENE', ['BIOGRID_ID', 'IDENTIFIER_VALUE']]
    return dict(df.values)
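# Sanity check of the versioned URL template above. BASE_URL is defined
# elsewhere in the module; the value below is an assumption based on
# BioGRID's public release archive, and '4.4.200' is an arbitrary example
# version, not necessarily current.
_EXAMPLE_BASE_URL = 'https://downloads.thebiogrid.org/Download/BioGRID/Release-Archive'
_example_version = '4.4.200'
_example_url = f'{_EXAMPLE_BASE_URL}/BIOGRID-{_example_version}/BIOGRID-IDENTIFIERS-{_example_version}.tab.zip'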
def _get_complexportal_df():
    return pd.read_csv(COMPLEXPORTAL_MAPPINGS, sep='\t', header=None, names=['source_id', 'target_id'])


def get_intact_complex_portal_xrefs_df() -> pd.DataFrame:
    """Get IntAct-Complex Portal xrefs."""
    df = _get_complexportal_df()
    df['source_ns'] = 'intact'
    df['target_ns'] = 'complexportal'
    df['source'] = COMPLEXPORTAL_MAPPINGS
    df = df[['source_ns', 'source_id', 'target_ns', 'target_id', 'source']]
    return df


@cached_mapping(
    path=prefix_directory_join('intact', 'cache', 'xrefs', 'complexportal.tsv'),
    header=['intact_id', 'complexportal_id'],
)
def get_complexportal_mapping() -> Mapping[str, str]:
    """Get IntAct to Complex Portal mapping.

    Is basically equivalent to:

    .. code-block:: python

        from pyobo import get_filtered_xrefs
        intact_complexportal_mapping = get_filtered_xrefs('intact', 'complexportal')
    """
    df = _get_complexportal_df()
    return dict(df.values)
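# Because get_complexportal_mapping reduces the two-column frame to a dict,
# the reverse direction is a one-line inversion; a minimal sketch (variable
# names are hypothetical):
intact_to_complexportal = get_complexportal_mapping()
complexportal_to_intact = {v: k for k, v in intact_to_complexportal.items()}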