示例#1
0
def get_organisms_df(url: Optional[str] = None) -> pd.DataFrame:
    """Load the KEGG organism table into a pandas DataFrame.

    Only the first three columns of the tab separated file are read. The
    ``name`` column is then split at ``' ('`` (after dropping every ``')'``)
    into a leading name and an optional parenthesised common name. Both
    parts go through :meth:`str.capitalize`; ``common_name`` is the empty
    string when the name has no parenthesised part.

    :param url: optional location of a KEGG organism TSV file; when falsy,
        the path returned by ``ensure_path`` for ``KEGG_ORGANISM_URL`` is used
    :return: dataframe with the columns ``kegg_id``, ``kegg_code``, ``name``
        and ``common_name``
    :rtype: pandas.DataFrame
    """
    if url:
        source = url
    else:
        source = ensure_path(MODULE_NAME, KEGG_ORGANISM_URL, path='organisms.tsv')

    # The fourth column of the file (the taxonomy hierarchy) is not loaded.
    column_names = ['kegg_id', 'kegg_code', 'name']
    organisms = pd.read_csv(
        source,
        sep='\t',
        header=None,
        names=column_names,
        usecols=[0, 1, 2],
    )

    def _pieces(raw_name):
        # 'Homo sapiens (human)' -> ['Homo sapiens', 'human']
        return raw_name.replace(')', '').split(' (')

    def _common(raw_name):
        pieces = _pieces(raw_name)
        if len(pieces) > 1:
            return pieces[1].capitalize()
        return ''

    def _leading(raw_name):
        return _pieces(raw_name)[0].capitalize()

    # 'common_name' must be derived before 'name' is overwritten below.
    organisms['common_name'] = organisms['name'].map(_common)
    organisms['name'] = organisms['name'].map(_leading)
    return organisms
示例#2
0
def _load_file(module_name: str = MODULE_NAME, url: str = URL) -> str:
    """Ensure the database file behind *url* is available and return its path.

    The work is delegated to ``ensure_path``, called with ``module_name`` as
    its ``prefix`` and ``url`` as its ``url``.

    :param module_name: name of module (database)
    :param url: URL to file from database
    :return: path of saved database file, as returned by ``ensure_path``
    """
    saved_path = ensure_path(prefix=module_name, url=url)
    return saved_path
示例#3
0
def get_entity_pathway_df(url: Optional[str] = None) -> pd.DataFrame:
    """Read a two-column KEGG protein/pathway TSV file into a DataFrame.

    The file is parsed as tab separated text without a header row; the two
    columns are named ``kegg_protein_id`` and ``kegg_pathway_id``.

    :param url: An optional url from a KEGG TSV file; when falsy, the path
        returned by ``ensure_path`` for ``PROTEIN_PATHWAY_HUMAN_URL`` is used
    :return: dataframe of the file
    """
    if url:
        source = url
    else:
        source = ensure_path(
            MODULE_NAME,
            PROTEIN_PATHWAY_HUMAN_URL,
            path='protein_pathway.tsv',
        )

    column_names = ['kegg_protein_id', 'kegg_pathway_id']
    # NOTE(review): a `_remove_path_prefix` mapping over 'kegg_pathway_id' was
    # previously disabled here; identifiers are returned exactly as read.
    return pd.read_csv(source, sep='\t', header=None, names=column_names)
示例#4
0
def get_pathway_df(url: Optional[str] = None) -> pd.DataFrame:
    """Convert a tab separated KEGG pathway file to a pandas DataFrame.

    The file is parsed without a header row; the two columns are named
    ``kegg_pathway_id`` and ``name``.

    :param url: url from KEGG tab separated file; when falsy, the path
        returned by ``ensure_path`` for ``KEGG_HUMAN_PATHWAYS_URL`` is used
    :return: dataframe of the file
    """
    if url:
        source = url
    else:
        source = ensure_path(MODULE_NAME, KEGG_HUMAN_PATHWAYS_URL, path='pathways.tsv')

    column_names = ['kegg_pathway_id', 'name']
    # NOTE(review): a `_remove_path_prefix` mapping over 'kegg_pathway_id' was
    # previously disabled here; identifiers are returned exactly as read.
    return pd.read_csv(source, sep='\t', header=None, names=column_names)
示例#5
0
 def path(self) -> str:  # noqa: D401
     """The path for ``self.url`` as returned by ``ensure_path``."""
     ensured = ensure_path(MODULE_NAME, self.url)
     return ensured
示例#6
0
    'phosphotransfer reaction': 'proteinModification',
    'disulfide bond': 'complexAbundance',
    'self interaction': '',
    'deacetylation reaction': '',
    'lipoprotein cleavage reaction': 'proteinAbundance',
    'gtpase reaction': 'reaction',
    'glycosylation reaction': 'proteinModification(Glyco)',
    'palmitoylation reaction': 'proteinModification',
    'putative self interaction': '',
    'dna cleavage': 'geneAbundance',
    'rna cleavage': 'rnaAbundace',
}

# Module identifier and source archive handed to ``ensure_path``.
MODULE_NAME = 'intact'
URL = 'ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.zip'
# NOTE(review): this call runs at import time; presumably ``ensure_path``
# fetches/caches the archive, so importing this module may touch the
# network or disk -- confirm against the ``ensure_path`` implementation.
path = ensure_path(MODULE_NAME, URL)
# NOTE(review): hard-coded absolute path into one user's Downloads folder;
# it will not exist on other machines.
sample_path = '/Users/sophiakrix/Downloads/intact_sample.txt'

# Column names for the two interactor ID columns. Only the first carries a
# leading '#' -- presumably because it is the first field of the file's
# header line; verify against the actual file.
ID_INTA = '#ID(s) interactor A'
ID_INTB = 'ID(s) interactor B'
# Names for database / bare-identifier columns of interactors A and B
# (presumably derived from the ID columns above -- confirm at the use site).
DATABASE_INT_A = 'database_intA'
DATABASE_INT_B = 'database_intB'
ONLY_ID_INT_A = 'id_intA'
ONLY_ID_INT_B = 'id_intB'
# Database label string for UniProtKB.
UNIPROTKB = 'uniprotkb'
# Alternative-ID column names; the two lists are positionally parallel
# (interactor A first, then B). Presumably the ORIG names are renamed to the
# NEW names -- confirm at the use site.
ORIG_ALT_ID_COLUMN_NAMES = [
    'Alt. ID(s) interactor A', 'Alt. ID(s) interactor B'
]
NEW_ALT_ID_COLUMN_NAMES = ['alternative_intA', 'alternative_intB']