Exemplo n.º 1
0
    def _load_meta_counts(counts_filename: str, meta_filename: str) -> (pd.DataFrame, pd.DataFrame):
        """
        Read the meta table and the counts table from disk.

        Each filename is resolved with ``os.path.realpath`` before it is handed
        to ``utils.read_data_table_from_file``. The counts table is read with
        ``index_column_first=True``; the meta table is read with the reader's
        default arguments.

        :param counts_filename: path of the counts table
        :param meta_filename: path of the meta table
        :return: ``(counts, meta)``, in that order
        :raise ParseMetaException
        """
        resolved_meta = os.path.realpath(meta_filename)
        meta_table = utils.read_data_table_from_file(resolved_meta)

        resolved_counts = os.path.realpath(counts_filename)
        counts_table = utils.read_data_table_from_file(resolved_counts, index_column_first=True)

        return counts_table, meta_table
Exemplo n.º 2
0
def add_hla_genes(
    gene_base_filename: str,
    hla_genes_filename: str,
    result_filename: str = 'gene_hla.csv',
) -> None:
    """
    Stack the HLA gene table under the base gene table and write a CSV.

    :param gene_base_filename: path of the base gene table
    :param hla_genes_filename: path of the HLA gene table whose rows are added
    :param result_filename: path of the CSV to write (written without index)
    """
    # Function-scope import so the block does not rely on a module-level alias.
    import pandas as pd

    gene_base = utils.read_data_table_from_file(gene_base_filename)
    hla_genes = utils.read_data_table_from_file(hla_genes_filename)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and behaves the same on older pandas versions.
    genes_merged = pd.concat([gene_base, hla_genes], ignore_index=True)

    genes_merged.to_csv(result_filename, index=False)
Exemplo n.º 3
0
def remove_genes_in_file(
    gene_base_filename: str,
    remove_genes_filename: str,
    result_filename: str = 'gene_filtered.csv',
) -> None:
    """
    Filter a gene table using a second table and write the result as CSV.

    Both tables are read with ``utils.read_data_table_from_file`` and passed to
    ``remove_genes.remove_genes_in_file``; its result is written without index.

    :param gene_base_filename: path of the gene table to filter
    :param remove_genes_filename: path of the table passed as the removal list
    :param result_filename: path of the CSV to write
    """
    base_table = utils.read_data_table_from_file(gene_base_filename)
    removal_table = utils.read_data_table_from_file(remove_genes_filename)

    remove_genes.remove_genes_in_file(base_table, removal_table).to_csv(
        result_filename, index=False)
    def _load_meta_counts(counts_filename, meta_filename):
        """
        Load the counts table and a normalized meta table.

        Both files are resolved with ``os.path.realpath`` and read with
        ``index_column_first=True``. The returned meta table keeps the index of
        the raw meta file and has a single ``cell_type`` column copied from the
        raw file's first column.

        :param counts_filename: path of the counts table
        :param meta_filename: path of the meta table
        :return: ``(counts, meta)``, in that order
        :raise SystemExit: with status 1 when a file cannot be read
            (the ``ReadFileException`` is logged first)
        """
        try:
            meta_raw = utils.read_data_table_from_file(
                os.path.realpath(meta_filename), index_column_first=True)
            counts = utils.read_data_table_from_file(
                os.path.realpath(counts_filename), index_column_first=True)
        except ReadFileException as e:
            app_logger.error(e)
            # The ``exit`` builtin is injected by the ``site`` module and is not
            # guaranteed to exist; raising SystemExit gives the same exit status.
            raise SystemExit(1)

        meta = pd.DataFrame(index=meta_raw.index)
        meta['cell_type'] = meta_raw.iloc[:, 0]

        # Return the normalized frame; previously it was built and then
        # discarded in favour of the raw table.
        return counts, meta
Exemplo n.º 5
0
def remove_genes_in_file(gene_base_filename: str,
                         remove_genes_filename: str,
                         result_filename: str = 'gene_filtered.csv') -> None:
    """
    Filter a gene table located in ``app.data_dir`` and write the result to
    ``app.output_dir``.

    :param gene_base_filename: name of the gene table, relative to ``app.data_dir``
    :param remove_genes_filename: name of the removal table, relative to
        ``app.data_dir``
    :param result_filename: name of the CSV written under ``app.output_dir``
    """
    base_path = '{}/{}'.format(app.data_dir, gene_base_filename)
    removal_path = '{}/{}'.format(app.data_dir, remove_genes_filename)

    base_table = utils.read_data_table_from_file(base_path)
    removal_table = utils.read_data_table_from_file(removal_path)

    filtered = remove_genes.remove_genes_in_file(base_table, removal_table)

    target_path = '{}/{}'.format(app.output_dir, result_filename)
    filtered.to_csv(target_path, index=False)
Exemplo n.º 6
0
def generate_genes_from_uniprot_ensembl_db(
    uniprot_db_filename: str,
    ensembl_db_filename: str,
    proteins_filename: str,
    result_filename: str = 'gene_uniprot_ensembl_merged.csv',
) -> None:
    """
    Merge the UniProt, Ensembl and protein tables and write the result as CSV.

    The three tables are read in the order UniProt, Ensembl, proteins and are
    handed to ``mergers_genes.merge_genes_from_uniprot_ensembl_db`` in the
    order (ensembl, proteins, uniprot).

    :param uniprot_db_filename: path of the UniProt table
    :param ensembl_db_filename: path of the Ensembl table
    :param proteins_filename: path of the proteins table
    :param result_filename: path of the CSV to write (written without index)
    """
    source_paths = (uniprot_db_filename, ensembl_db_filename, proteins_filename)
    uniprot_table, ensembl_table, protein_table = (
        utils.read_data_table_from_file(path) for path in source_paths)

    merged = mergers_genes.merge_genes_from_uniprot_ensembl_db(
        ensembl_table, protein_table, uniprot_table)

    merged.to_csv(result_filename, index=False)
Exemplo n.º 7
0
def validate_gene_list(gene_filename: str) -> None:
    """
    Read a gene table and print whether ``gene_validators.validate_genes``
    accepts it.

    :param gene_filename: path of the gene table to validate
    """
    gene_table = utils.read_data_table_from_file(gene_filename)

    is_valid = gene_validators.validate_genes(gene_table)
    print('GENE LIST IS VALID' if is_valid else 'GENE LIST IS NOT VALID')
Exemplo n.º 8
0
    def _load_meta_counts(counts_filename: str,
                          meta_filename: str) -> (pd.DataFrame, pd.DataFrame):
        """
        Load the counts table and a normalized meta table.

        Both files are resolved with ``os.path.realpath`` and read with
        ``index_column_first=True``. The returned meta table keeps the index of
        the raw meta file and has a single ``cell_type`` column copied from the
        raw file's first column.

        :param counts_filename: path of the counts table
        :param meta_filename: path of the meta table
        :return: ``(counts, meta)``, in that order
        :raise ParseMetaException: when the meta table cannot be normalized
            (for example, when it has no columns)
        """
        meta_raw = utils.read_data_table_from_file(
            os.path.realpath(meta_filename), index_column_first=True)
        counts = utils.read_data_table_from_file(
            os.path.realpath(counts_filename), index_column_first=True)

        try:
            meta = pd.DataFrame(index=meta_raw.index)
            meta['cell_type'] = meta_raw.iloc[:, 0]

        # A bare ``except:`` would also trap KeyboardInterrupt/SystemExit;
        # catch only real errors and keep the original cause attached.
        except Exception as e:
            raise ParseMetaException from e

        return counts, meta
Exemplo n.º 9
0
def add_hla_genes(gene_base_filename: str,
                  hla_genes_filename: str,
                  data_path: str = '',
                  result_filename: str = 'gene_hla.csv',
                  result_path: str = '') -> None:
    """
    Stack the HLA gene table under the base gene table and write a CSV.

    :param gene_base_filename: name of the base gene table, relative to
        ``data_path``
    :param hla_genes_filename: name of the HLA gene table, relative to
        ``data_path``
    :param data_path: directory holding the input tables; falls back to
        ``app.data_dir`` when empty
    :param result_filename: name of the CSV to write (written without index)
    :param result_path: directory receiving the CSV; falls back to
        ``app.output_dir`` when empty
    """
    # Function-scope import so the block does not rely on a module-level alias.
    import pandas as pd

    if not data_path:
        data_path = app.data_dir

    # Without this fallback an empty result_path produced '/gene_hla.csv',
    # i.e. a write to the filesystem root.
    if not result_path:
        result_path = app.output_dir

    gene_base_filename = '{}/{}'.format(data_path, gene_base_filename)
    hla_genes_filename = '{}/{}'.format(data_path, hla_genes_filename)

    gene_base = utils.read_data_table_from_file(gene_base_filename)
    hla_genes = utils.read_data_table_from_file(hla_genes_filename)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and behaves the same on older pandas versions.
    genes_merged = pd.concat([gene_base, hla_genes], ignore_index=True)

    genes_merged.to_csv('{}/{}'.format(result_path, result_filename),
                        index=False)
Exemplo n.º 10
0
    def assert_open_file(self, base_name, extension, index_column_first,
                         separator):
        """
        Assert that a fixture file read through
        ``utils.read_data_table_from_file`` equals ``example_data.csv``.

        Fixtures are looked up in ``<self.current_dir>/fixtures``.

        :param base_name: fixture file name without extension
        :param extension: fixture file extension (without the dot)
        :param index_column_first: forwarded positionally to the reader
        :param separator: forwarded positionally to the reader
        """
        fixtures_dir = '{}/fixtures'.format(self.current_dir)
        fixture_path = '{}/{}.{}'.format(fixtures_dir, base_name, extension)
        reference_path = '{}/example_data.csv'.format(fixtures_dir)

        actual = utils.read_data_table_from_file(fixture_path,
                                                 index_column_first, separator)
        reference = pd.read_csv(reference_path)

        self.assertTrue(actual.equals(reference))
Exemplo n.º 11
0
def generate_genes_from_uniprot_ensembl_db(
        uniprot_db_filename: str,
        ensembl_db_filename: str,
        proteins_filename: str,
        result_filename: str = 'gene_uniprot_ensembl_merged.csv',
        result_path: str = ''):
    """
    Merge the UniProt, Ensembl and protein tables found in ``app.data_dir``
    and write the merged table as CSV.

    :param uniprot_db_filename: UniProt table name, relative to ``app.data_dir``
    :param ensembl_db_filename: Ensembl table name, relative to ``app.data_dir``
    :param proteins_filename: proteins table name, relative to ``app.data_dir``
    :param result_filename: name of the CSV to write (written without index)
    :param result_path: directory receiving the CSV; falls back to
        ``app.output_dir`` when empty
    """
    uniprot_path = '{}/{}'.format(app.data_dir, uniprot_db_filename)
    ensembl_path = '{}/{}'.format(app.data_dir, ensembl_db_filename)
    proteins_path = '{}/{}'.format(app.data_dir, proteins_filename)

    result_path = result_path or app.output_dir

    uniprot_table = utils.read_data_table_from_file(uniprot_path)
    ensembl_table = utils.read_data_table_from_file(ensembl_path)
    protein_table = utils.read_data_table_from_file(proteins_path)

    # The merger takes its inputs in the order (ensembl, proteins, uniprot).
    merged = mergers_genes.merge_genes_from_uniprot_ensembl_db(
        ensembl_table, protein_table, uniprot_table)

    target_path = '{}/{}'.format(result_path, result_filename)
    merged.to_csv(target_path, index=False)
Exemplo n.º 12
0
def validate_gene_list(gene_filename: str, data_path: str) -> None:
    """
    Read a gene table and print whether ``gene_validators.validate_genes``
    accepts it.

    :param gene_filename: name of the gene table, relative to ``data_path``
    :param data_path: directory holding the table; falls back to
        ``app.output_dir`` when empty
    """
    data_path = data_path or app.output_dir
    gene_path = '{}/{}'.format(data_path, gene_filename)

    gene_table = utils.read_data_table_from_file(gene_path)

    is_valid = gene_validators.validate_genes(gene_table)
    print('GENE LIST IS VALID' if is_valid else 'GENE LIST IS NOT VALID')
Exemplo n.º 13
0
def generate_interactions(imex_raw_filename: str, iuphar_raw_filename: str,
                          database_proteins_filename: str,
                          database_gene_filename: str,
                          database_complex_filename: str,
                          interaction_to_remove_filename: str,
                          interaction_curated_filename: str) -> None:
    """
    Build the interaction table from IMEx and IUPHAR sources and write it to
    ``<output_dir>/interaction.csv``.

    Steps, in order: parse the IMEx file, obtain and parse the IUPHAR data,
    merge both, drop complex interactions, drop the listed interactions, then
    add the curated ones. All input names are relative to ``data_dir``.

    :param imex_raw_filename: raw IMEx table (read with ``na_values='-'``)
    :param iuphar_raw_filename: IUPHAR file name passed to the IUPHAR getter
    :param database_proteins_filename: proteins CSV
    :param database_gene_filename: genes CSV
    :param database_complex_filename: complexes CSV
    :param interaction_to_remove_filename: CSV of interactions to drop
    :param interaction_curated_filename: CSV of curated interactions to add
    """

    def in_data_dir(name):
        # Same '%s/%s' formatting as every input path in this function.
        return '%s/%s' % (data_dir, name)

    imex_raw = utils.read_data_table_from_file(in_data_dir(imex_raw_filename),
                                               na_values='-')
    protein_table = pd.read_csv(in_data_dir(database_proteins_filename))
    gene_table = pd.read_csv(in_data_dir(database_gene_filename))
    complex_table = pd.read_csv(in_data_dir(database_complex_filename))
    removal_table = pd.read_csv(in_data_dir(interaction_to_remove_filename))
    curated_table = pd.read_csv(in_data_dir(interaction_curated_filename))

    print('generating imex file')
    imex_parsed = parse_interactions_imex(imex_raw, protein_table, gene_table)

    print('Getting Iuphar interactions')

    downloads_dir = '{}/downloads'.format(data_dir)
    iuphar_raw = get_iuphar_guidetopharmacology.call(
        iuphar_raw_filename,
        data_dir,
        downloads_dir,
        default_download_response='yes')
    print('generating iuphar file')
    iuphar_parsed = parse_iuphar_guidetopharmacology.call(
        iuphar_raw, gene_table, protein_table)

    print('merging iuphar/imex')
    merged = merge_iuphar_imex_interactions(iuphar_parsed, imex_parsed)

    print('removing complex interactions')
    without_complexes = only_noncomplex_interactions(merged, complex_table)

    print('removing selected interactions')
    cleaned = remove_interactions_in_file(without_complexes, removal_table)

    print('adding curated interaction')
    final_table = add_curated(cleaned, curated_table)

    final_table.to_csv('%s/interaction.csv' % output_dir, index=False)
        def wrapper(namefile='', data_path=''):
            """
            Read an input table and hand it to the collector named by
            ``method_name``.

            :param namefile: input file name; falls back to
                ``'<method_name>_input.csv'`` when empty
            :param data_path: directory holding the file; falls back to
                ``data_dir`` when empty
            """
            app_logger.info('Collecting {}'.format(method_name))

            namefile = namefile or '{}_input.csv'.format(method_name)
            data_path = data_path or data_dir

            input_path = '{}/{}'.format(data_path, namefile)
            data = utils.read_data_table_from_file(input_path)

            # Use a dedicated app when a database file is configured,
            # otherwise the shared application instance.
            if self.database_file:
                collector = create_app(True, self.database_file, True).collect
            else:
                collector = cellphonedb_app.cellphonedb.collect

            getattr(collector, method_name)(data)
    def assert_file_not_empty(self, file, message=''):
        """
        Assert that the table read from ``file`` has at least one entry.

        :param file: path handed to ``utils.read_data_table_from_file``
        :param message: failure message; falls back to
            ``'File <file> is empty'`` when empty
        """
        message = message or 'File {} is empty'.format(file)

        table = utils.read_data_table_from_file(file)
        self.assertFalse(table.empty, message)
Exemplo n.º 16
0
def generate_genes(
    user_gene: Optional[str],
    fetch_uniprot: bool,
    fetch_ensembl: bool,
    result_path: str,
    project_name: str,
) -> None:
    """
    Build the gene table and write it to ``<output_path>/gene_generated.csv``.

    Ensembl and UniProt data are fetched remotely when the matching flag is
    set, otherwise read from the local ``sources`` files under ``data_dir``.
    A failed remote UniProt fetch falls back to the local file. The curated
    HLA table and the optional user gene file are passed to ``gene_generator``
    together with both databases.

    :param user_gene: optional path of a user gene file; its separator is
        chosen from the file extension
    :param fetch_uniprot: fetch UniProt remotely instead of reading the local file
    :param fetch_ensembl: fetch Ensembl remotely instead of reading the local file
    :param result_path: forwarded to ``_set_paths``
    :param project_name: forwarded to ``_set_paths``
    """
    output_path = _set_paths(result_path, project_name)

    # TODO: Add logger
    if not fetch_ensembl:
        local_ensembl = os.path.join(data_dir, 'sources/ensembl.txt')
        ensembl_db = utils.read_data_table_from_file(local_ensembl)
        print('read local ensembl file')
    else:
        print('fetching remote ensembl data ... ', end='')
        biomart_query = (
            '<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE Query><Query virtualSchemaName = "default" '
            'formatter = "CSV" header = "1" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >'
            '<Dataset name = "hsapiens_gene_ensembl" interface = "default" >'
            '<Attribute name = "ensembl_gene_id" />'
            '<Attribute name = "ensembl_transcript_id" />'
            '<Attribute name = "external_gene_name" />'
            '<Attribute name = "hgnc_symbol" />'
            '<Attribute name = "uniprotswissprot" />'
            '</Dataset>'
            '</Query>'
        )
        biomart_url = 'http://www.ensembl.org/biomart/martservice?query={}'.format(
            urllib.parse.quote(biomart_query))
        ensembl_db = pd.read_csv(biomart_url)
        print('done')

    # additional data comes from given file or uniprot remote url
    local_uniprot = os.path.join(data_dir, 'sources/uniprot.tab')
    if not fetch_uniprot:
        uniprot_db = utils.read_data_table_from_file(local_uniprot)
        print('read local uniprot file')
    else:
        try:
            print('fetching remote uniprot file ... ', end='')
            uniprot_url = (
                'https://www.uniprot.org/uniprot/?query=*&format=tab&force=true'
                '&columns=id,entry%20name,reviewed,protein%20names,genes,organism,length'
                '&fil=organism:%22Homo%20sapiens%20(Human)%20[9606]%22%20AND%20reviewed:yes'
                '&compress=yes'
            )

            uniprot_db = pd.read_csv(uniprot_url, sep='\t', compression='gzip')
            print('done')

        except Exception:
            # Best-effort remote fetch: any failure falls back to the local copy.
            print('Error fetching remote UniProt data, fetching local data')
            uniprot_db = pd.read_csv(local_uniprot, sep='\t')
            print('read local uniprot file')

    # Source column name -> name used in the generated table.
    ensembl_renames = {
        'Gene name': 'gene_name',
        'Gene stable ID': 'ensembl',
        'HGNC symbol': 'hgnc_symbol',
        'UniProtKB/Swiss-Prot ID': 'uniprot'
    }
    uniprot_renames = {'Entry': 'uniprot', 'Gene names': 'gene_names'}

    result_columns = ['gene_name', 'uniprot', 'hgnc_symbol', 'ensembl']

    ensembl_db = ensembl_db[list(ensembl_renames)].rename(
        columns=ensembl_renames)
    uniprot_db = uniprot_db[list(uniprot_renames)].rename(
        columns=uniprot_renames)

    hla_genes = utils.read_data_table_from_file(
        os.path.join(data_dir, 'sources/hla_curated.csv'))

    if user_gene:
        extension = os.path.splitext(user_gene)[-1]
        user_gene = pd.read_csv(user_gene, sep=_get_separator(extension))

    cpdb_genes = gene_generator(ensembl_db, uniprot_db, hla_genes, user_gene,
                                result_columns)

    target_path = '{}/{}'.format(output_path, 'gene_generated.csv')
    cpdb_genes[result_columns].to_csv(target_path, index=False)
Exemplo n.º 17
0
def generate_interactions(
    proteins: str,
    genes: str,
    complex: str,
    user_interactions: Optional[str],
    user_interactions_only: bool,
    result_path: str,
    fetch_imex: bool,
    fetch_iuphar: bool,
    project_name: str,
) -> None:
    """
    Build the interaction table and write it to
    ``<output_path>/interaction_input.csv``.

    Unless ``user_interactions_only`` is set, interactions are produced from
    IMEx and IUPHAR data (merged, stripped of complex and excluded
    interactions, extended with the curated ones) and the user interactions,
    if any, are appended. Duplicated ``(partner_a, partner_b)`` pairs keep the
    last occurrence, so user rows win over generated ones.

    :param proteins: path of the proteins table
    :param genes: path of the genes table
    :param complex: path of the complexes table
    :param user_interactions: optional path of a user interaction file; its
        separator is chosen from the file extension
    :param user_interactions_only: use only the user interactions
    :param result_path: forwarded to ``utils.set_paths``
    :param fetch_imex: forwarded to ``get_imex.call``
    :param fetch_iuphar: forwarded to ``get_iuphar.call``
    :param project_name: forwarded to ``utils.set_paths``
    :raise Exception: when ``user_interactions_only`` is set without
        ``user_interactions``
    """
    if user_interactions_only and not user_interactions:
        raise Exception('You need to set --user-interactions parameter')

    output_path = utils.set_paths(result_path, project_name)
    downloads_path = utils.set_paths(
        utils.set_paths(result_path, project_name), 'downloads')

    proteins = utils.read_data_table_from_file(proteins)
    genes = utils.read_data_table_from_file(genes)
    complexes = utils.read_data_table_from_file(complex)

    if not user_interactions_only:
        raw_imex = get_imex.call(genes, downloads_path, fetch_imex)

        interactions_to_remove = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/excluded_interaction.csv'))
        interaction_curated = utils.read_data_table_from_file(
            os.path.join(data_dir, 'sources/interaction_curated.csv'))

    # Stays None when no user file was supplied, so it can be skipped below.
    user_table = None
    if user_interactions:
        separator = _get_separator(os.path.splitext(user_interactions)[-1])
        user_table = pd.read_csv(user_interactions, sep=separator)
        user_table['partner_a'] = user_table['partner_a'].apply(
            lambda x: str(x).strip())
        user_table['partner_b'] = user_table['partner_b'].apply(
            lambda x: str(x).strip())
        user_table['annotation_strategy'] = 'user_curated'

        if 'protein_name_a' not in user_table.columns:
            user_table['protein_name_a'] = ''

        if 'protein_name_b' not in user_table.columns:
            user_table['protein_name_b'] = ''

    result_columns = [
        'partner_a', 'partner_b', 'protein_name_a', 'protein_name_b',
        'annotation_strategy', 'source'
    ]
    if not user_interactions_only:
        print('Parsing IMEX file')
        imex_interactions = parse_interactions_imex(raw_imex, proteins, genes)

        print('Getting iuphar data')
        raw_iuphar = get_iuphar.call(downloads_path, fetch_iuphar)

        print('Generating iuphar interactions')
        iuphar_interactions = parse_iuphar_guidetopharmacology.call(
            raw_iuphar, genes, proteins)

        print('Merging iuphar/imex')
        merged_interactions = merge_iuphar_imex_interactions(
            iuphar_interactions, imex_interactions)

        print('Removing complex interactions')
        no_complex_interactions = only_noncomplex_interactions(
            merged_interactions, complexes)

        print('Removing selected interactions')
        clean_interactions = remove_interactions_in_file(
            no_complex_interactions, interactions_to_remove)

        print('Adding curated interaction')
        interactions_with_curated = add_curated(clean_interactions,
                                                interaction_curated)

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # supported equivalent. User rows go last so keep='last' prefers them.
        frames = [interactions_with_curated]
        if user_table is not None:
            frames.append(user_table)
        combined = pd.concat(frames, ignore_index=True, sort=False)

        result = tools_helper.normalize_interactions(
            combined, 'partner_a',
            'partner_b').drop_duplicates(['partner_a', 'partner_b'],
                                         keep='last')

    else:
        result = tools_helper.normalize_interactions(user_table, 'partner_a', 'partner_b') \
            .drop_duplicates(['partner_a', 'partner_b'], keep='last')

    result[result_columns].sort_values(['partner_a', 'partner_b']).to_csv(
        '{}/interaction_input.csv'.format(output_path), index=False)
Exemplo n.º 18
0
def read_meta_file(path, filename):
    """
    Read a meta file and return it as a one-column ``cell_type`` table.

    The file ``<path>/<filename>`` is read with ``index_column_first=True``.
    The result keeps the raw table's index and holds the raw table's first
    column under the name ``cell_type``.

    :param path: directory holding the meta file
    :param filename: name of the meta file
    :return: DataFrame with a single ``cell_type`` column
    """
    meta_path = '{}/{}'.format(path, filename)
    raw_table = utils.read_data_table_from_file(meta_path,
                                                index_column_first=True)

    cell_types = pd.DataFrame(index=raw_table.index)
    cell_types['cell_type'] = raw_table.iloc[:, 0]
    return cell_types