def get_publications(year=2015):
    """
    Count papers and attention for human reference genes, and add a
    column with the attention restricted to publications of one year.

    Input:
        year    int, publication year used for the recent-attention
                column (default 2015, preserving original behavior)

    Output:
        papers  df indexed by gene_ncbi with papers, attention, and
                attention_<year> columns
    """
    ref_genes = get_ref_genes()
    ref_gene2pubmed = medline.gene2pubmed(
        taxon_id=9606,
        paper_kind='research',
        ref_genes=ref_genes)

    papers = nar_attention.count_papers_and_attention(
        ref_genes,
        ref_gene2pubmed)

    df_m = medline.select_medline_records(
        columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year''',
        taxon_id=9606,
        kind='research')

    # restrict gene-paper links to publications of the requested year
    current_gene2pubmed = ref_gene2pubmed[ref_gene2pubmed['pubmed_id'].isin(
        df_m[df_m['pubdate_year'] == year]['pubmed_id'])]

    current_papers = nar_attention.count_papers_and_attention(
        ref_genes,
        current_gene2pubmed)

    current_papers = current_papers.rename(columns={
        'attention': 'attention_{}'.format(year)
    })

    papers = pd.concat(
        [papers, current_papers[['attention_{}'.format(year)]]], axis=1)

    return papers
def supporting_nih_institutes():
    """
    For each reference gene, compute the fraction of NIH administering
    institutes that funded 2015 papers mentioning the gene.

    Output:
        cl  boolean df: at least 10% of institutes supported the gene
        dd  df with fraction of supporting institutes per gene
        ge  naturally sorted genes occurring in the merged records
    """

    df_prj_core, _, df_papers = nar_funding.get_paper_funding_through_nih()

    ref_genes = get_ref_genes()
    ref_gene2pubmed = medline.gene2pubmed(
        taxon_id=9606,
        paper_kind='research',
        ref_genes=ref_genes)

    is_recent = df_papers['pubdate_year'].isin(range(2015, 2016))

    projects = df_prj_core[
        ['project_num', 'ADMINISTERING_IC']].drop_duplicates()
    recent_papers = df_papers.loc[
        is_recent, ['project_num', 'pubmed_id']].drop_duplicates()

    # link genes to the institutes administering the funding projects
    merged = pd.merge(ref_gene2pubmed, pd.merge(projects, recent_papers))
    merged = merged[merged['gene_ncbi'].isin(ref_genes)]

    # count distinct supporting institutes per gene
    per_gene = merged[['gene_ncbi', 'ADMINISTERING_IC']].drop_duplicates()
    per_gene = per_gene['gene_ncbi'].value_counts()

    dd = per_gene.to_frame(
        'recently_supporting_institutes').rename_axis('gene_ncbi')

    # normalize by the total number of institutes seen in the data
    dd = dd / len(set(merged['ADMINISTERING_IC']))

    cl = dd >= 0.1

    ge = natsorted(merged['gene_ncbi'].unique())

    return cl, dd, ge
# Example 3
def load_shared_gene2pubmed(taxon_id, reference_df, reference_genes):
    """
    Load gene-to-publication links restricted to reference genes and to
    publications that occur in reference_df.

    Input:
        taxon_id        int, NCBI taxonomy identifier
        reference_df    df containing a pubmed_id column
        reference_genes iterable of gene_ncbi identifiers to keep

    Output:
        gene2pubmed     df with gene_ncbi and pubmed_id
    """

    gene2pubmed = medline.gene2pubmed(taxon_id, ['gene_ncbi', 'pubmed_id'])
    in_reference_genes = gene2pubmed['gene_ncbi'].isin(reference_genes)
    gene2pubmed = gene2pubmed.loc[in_reference_genes, :]
    in_reference_papers = gene2pubmed['pubmed_id'].isin(
        reference_df['pubmed_id'])
    gene2pubmed = gene2pubmed.loc[in_reference_papers, :]

    return gene2pubmed
# Example 4
def filter_for_papers_with_reference_genes(taxon_id, df, reference_genes):
    """
    Keep only rows of df whose pubmed_id is linked to at least one of
    the reference genes in gene2pubmed.

    Input:
        taxon_id        int, NCBI taxonomy identifier
        df              df containing a pubmed_id column
        reference_genes iterable of gene_ncbi identifiers

    Output:
        df              filtered copy of the input rows
    """

    gene2pubmed = medline.gene2pubmed(taxon_id, ['gene_ncbi', 'pubmed_id'])
    linked_to_reference = gene2pubmed['gene_ncbi'].isin(reference_genes)
    pubmed_with_reference = gene2pubmed.loc[linked_to_reference, 'pubmed_id']

    keep = df['pubmed_id'].isin(pubmed_with_reference)
    df = df.loc[keep, :]

    return df
def fame_of_homologs():
    """
    For human reference genes, report how strongly their homologs in
    other taxa have been studied (by attention in research papers).

    Output:
        cl  boolean df: homolog attention above 1 (NaN where no value)
        dd  df indexed by human gene_ncbi; one 'studied_<taxon>' column
            per non-human taxon with the maximal homolog attention
        ge  sorted genes of dd's index
    """
    gene2pubmed_research = medline.gene2pubmed(
        taxon_id='all', paper_kind='research')

    # per paper: number of linked genes; inverted below so each paper
    # contributes an attention share of 1/len(genes) to each gene
    value_of_pubmed_id = gene2pubmed_research[
        'pubmed_id'].value_counts().to_frame().reset_index().rename(
        columns={'index': 'pubmed_id', 'pubmed_id': 'value'})

    value_of_pubmed_id['value'] = 1 / value_of_pubmed_id['value']

    # total attention per gene, across all taxa
    gene2pubmed_research = pd.merge(gene2pubmed_research, value_of_pubmed_id)
    extended_attention = gene2pubmed_research[[
        'gene_ncbi', 'value']].groupby('gene_ncbi').agg(sum)

    # map attention onto homologene groups; genes without papers get 0
    hg = relations.homologene()
    hg_attention = pd.merge(hg, extended_attention.reset_index(), how='left')
    hg_attention['value'] = hg_attention['value'].fillna(0)
    hg_max_attention = hg_attention[
        ['homologene_group', 'taxon_ncbi', 'value']].groupby(
        ['homologene_group', 'taxon_ncbi']).agg(max).reset_index()

    # keep non-human taxa only
    hg_max_attention = hg_max_attention[hg_max_attention['taxon_ncbi'] != 9606]

    # attach the human gene of each homologene group
    he = pd.merge(
        hg_max_attention,
        hg[hg['taxon_ncbi'] == 9606][['homologene_group', 'gene_ncbi']]
    )

    he = he[he['gene_ncbi'].isin(get_ref_genes())]

    # max attention per human gene and taxon (over homologene groups)
    he = he[['taxon_ncbi', 'gene_ncbi', 'value']].groupby(
        ['gene_ncbi', 'taxon_ncbi']).agg(max).reset_index()

    dd = he.pivot(index='gene_ncbi', columns='taxon_ncbi', values='value')

    dd.columns = [meta.taxon_name(x) for x in dd.columns]
    dd.columns = ['studied_{}'.format(x) for x in dd.columns]

    cl = dd > 1

    # keep missing values as NaN rather than False in the boolean frame
    f = dd.isnull()
    cl[f] = np.nan

    ge = sorted(dd.index)

    return cl, dd, ge
def count_papers_and_attention(ref_genes, ref_gene2pubmed):
    """
    Count the number of articles per gene in ref_gene2pubmed and compute
    per-gene attention. A paper's attention is split evenly among all
    genes linked to it in the full gene2pubmed, so genes outside of the
    present filtering also dilute attention.

    Input:
        ref_genes       list of reference genes
        ref_gene2pubmed dataframe with gene_ncbi and pubmed_id

    Output:
        papers          df indexed by gene_ncbi with papers and
                        attention columns (0 where absent)
    """

    in_reference = ref_gene2pubmed['gene_ncbi'].isin(ref_genes)
    ref_gene2pubmed = ref_gene2pubmed.loc[in_reference, :]

    # full gene2pubmed across all taxa, restricted to the shared papers
    full_gene2pubmed = medline.gene2pubmed(taxon_id='all')
    shared_papers = full_gene2pubmed['pubmed_id'].isin(
        ref_gene2pubmed['pubmed_id'])
    full_gene2pubmed = full_gene2pubmed.loc[shared_papers, :]

    # attention of a paper: 1 / number of genes linked to that paper
    attention_per_paper = full_gene2pubmed[[
        'pubmed_id', 'gene_ncbi'
    ]].groupby('pubmed_id').agg(lambda genes: 1 / len(genes)).rename(
        columns={
            'gene_ncbi': 'attention'
        }).reset_index()

    master = pd.merge(ref_gene2pubmed, attention_per_paper, how='inner')

    papers = master['gene_ncbi'].value_counts().to_frame('papers')
    papers.index.name = 'gene_ncbi'
    papers.loc[:, 'attention'] = master[
        ['gene_ncbi', 'attention']].groupby('gene_ncbi').agg(sum)

    # align to the full reference gene list; absent genes become 0
    papers = papers.loc[ref_genes, :].fillna(0)

    return papers
# Example 7
def get_extended_funding_info(taxon_id, earliest_year, latest_year):
    """
    Function to standardize queries on budget, creates estimate
    of budget per gene

    Input:
        taxon_id        int, NCBI taxonomy identifier
        earliest_year   int, first year (publication and fiscal) to use
        latest_year     int, last year (publication and fiscal) to use

    Output:
        master          df indexed by gene_ncbi with budget, attention
                        and paper counts, plus full_/all_nih_/non_nih_
                        variants of papers and attention

    """

    # INITIALIZATION ###

    # MedLine
    ref_genes = standardizer.reference_genes(taxon_id, 'rpo')
    gene2pubmed = medline.gene2pubmed(taxon_id,
                                      paper_kind='research',
                                      ref_genes=ref_genes)

    df_m = medline.select_medline_wos_records(columns_sql='''
                medline.pubmed_id,
                medline.pubdate_year,
                medline.amount_of_authors,
                medline.j_name_s''',
                                              years_range='all',
                                              taxon_id=taxon_id,
                                              kind='research',
                                              unambiguous=True)

    df_m = df_m[df_m['amount_of_authors'] > 0]  # exclude consortia paper (-1)
    df_m = df_m[['pubmed_id', 'pubdate_year', 'amount_of_authors', 'j_name_s']]

    # restrict MedLine records to the requested time span
    df_m = df_m[df_m['pubdate_year'] >= earliest_year]
    df_m = df_m[df_m['pubdate_year'] <= latest_year]

    # NOTE: used again near the end to filter the full gene2pubmed
    _pubmed_articles_in_medline_time_span = set(df_m['pubmed_id'])

    # NIH Exporter
    df_prj_core, df_prj_budget, df_nih_papers = get_paper_funding_through_nih()
    df_nih_papers = df_nih_papers.loc[:,
                                      ['project_num',
                                       'pubmed_id']]  # skip publication year

    # restrict projects to fiscal years within the requested span
    df_prj_core = df_prj_core[df_prj_core['FY'] >= earliest_year]
    df_prj_core = df_prj_core[df_prj_core['FY'] <= latest_year]

    df_prj_budget = df_prj_budget.loc[:, ['project_num',
                                          'budget']]  # skip fiscal year
    df_prj_budget = df_prj_budget.groupby('project_num').agg(sum)
    df_prj_budget = df_prj_budget.reset_index()

    # Estimations of costs for non-covered papers ###

    # ratio of all MedLine papers to those also reported by NIH
    papers_in_nih = len(
        set(df_nih_papers['pubmed_id']).intersection(set(df_m['pubmed_id'])))
    papers_in_medline = len(set(df_m['pubmed_id']))
    multiplier_nih2medline = papers_in_medline / papers_in_nih
    print('Multiplier:', multiplier_nih2medline)

    # Synchronization ###

    # PubMed: keep articles present in NIH, MedLine and gene2pubmed
    lis = [
        set(df_nih_papers['pubmed_id']),
        set(df_m['pubmed_id']),
        set(gene2pubmed['pubmed_id'])
    ]
    pubmed_in_all = set.intersection(*lis)
    print('Amount of MedLine articles:', len(pubmed_in_all))

    gene2pubmed = gene2pubmed[gene2pubmed['pubmed_id'].isin(pubmed_in_all)]
    df_m = df_m[df_m['pubmed_id'].isin(pubmed_in_all)]
    df_nih_papers = df_nih_papers[df_nih_papers['pubmed_id'].isin(
        pubmed_in_all)]

    # Projects: keep projects present in core, budget and paper tables
    lis = [
        set(df_prj_core['project_num']),
        set(df_prj_budget['project_num']),
        set(df_nih_papers['project_num'])
    ]
    project_in_all = set.intersection(*lis)

    df_prj_core = df_prj_core[df_prj_core['project_num'].isin(project_in_all)]
    df_prj_budget = df_prj_budget[df_prj_budget['project_num'].isin(
        project_in_all)]
    df_nih_papers = df_nih_papers[df_nih_papers['project_num'].isin(
        project_in_all)]

    # Resources per paper per gene

    # amount of publications per project
    papers_per_project = df_nih_papers['project_num'].value_counts()
    # overall budget per project
    budget_per_project = df_prj_budget.set_index('project_num')['budget']
    # budget per paper for each project
    budget_per_paper_per_project = budget_per_project.div(
        papers_per_project).to_frame(
            'budget_per_paper_per_project').reset_index().rename(
                columns={'index': 'project_num'})

    # budget of a paper: summed over all projects listing that paper
    budget_per_pubmed_id = pd.merge(
        budget_per_paper_per_project,
        df_nih_papers)[['pubmed_id', 'budget_per_paper_per_project'
                        ]].groupby('pubmed_id').agg(sum).reset_index()

    # attention share of a paper: 1 / number of genes on the paper
    attention_per_paper = (1 / gene2pubmed['pubmed_id'].value_counts()
                           ).to_frame('attention_per_gene').reset_index()
    attention_per_paper = attention_per_paper.rename(
        columns={'index': 'pubmed_id'})

    gene2pubmed_plus = pd.merge(gene2pubmed, budget_per_pubmed_id)
    gene2pubmed_plus = pd.merge(gene2pubmed_plus, attention_per_paper)
    gene2pubmed_plus = gene2pubmed_plus.rename(
        columns={
            'budget_per_paper_per_project': 'budget_for_paper',
            'attention_per_gene': 'attention'
        })
    gene2pubmed_plus.loc[:, 'papers'] = 1

    gene2pubmed_plus['budget_for_attention'] = gene2pubmed_plus['attention'] * \
        gene2pubmed_plus['budget_for_paper']

    # aggregate to one record per gene
    master = gene2pubmed_plus[[
        'gene_ncbi', 'budget_for_attention', 'attention', 'papers',
        'budget_for_paper'
    ]].groupby('gene_ncbi').agg(sum)

    master['budget_by_attention'] = master['budget_for_attention'] / \
        master['attention']
    master['budget_by_papers'] = master['budget_for_paper'] / master['papers']

    # fame within the full MedLine time span (not only NIH-linked papers)
    gene2pubmed_full = medline.gene2pubmed(taxon_id,
                                           paper_kind='research',
                                           ref_genes=ref_genes)
    gene2pubmed_full = gene2pubmed_full[gene2pubmed_full['pubmed_id'].isin(
        _pubmed_articles_in_medline_time_span)]

    fame_full = nar_attention.count_papers_and_attention(
        ref_genes, gene2pubmed_full)

    n = fame_full.columns
    fame_full.columns = ['full_' + x for x in fame_full.columns]
    master = pd.merge(master.reset_index(), fame_full.reset_index())

    # fame within all papers linked by NIH (publnk)
    nih_publnk = nih.publnk().drop_duplicates()
    gene2pubmed_all_nih = gene2pubmed_full[gene2pubmed_full['pubmed_id'].isin(
        nih_publnk['pubmed_id'])]

    fame_all_nih = nar_attention.count_papers_and_attention(
        ref_genes, gene2pubmed_all_nih)
    # capture the unprefixed column names before prefixing below
    n = fame_all_nih.columns
    fame_all_nih.columns = ['all_nih_' + x for x in fame_all_nih.columns]
    master = pd.merge(master, fame_all_nih.reset_index())

    # non-NIH fame: full minus NIH-linked
    for x in n:
        master.loc[:, 'non_nih_' +
                   x] = master.loc[:, 'full_' + x] - master.loc[:,
                                                                'all_nih_' + x]

    master = master.set_index('gene_ncbi')

    return master
def get_year_of_discovery(taxon_id, ref_genes):
    """
    Returns earliest years within research papers covered
    in MedLine

    Input:
        taxon_id
        ref_genes

    Output:
        genes_earliest_years    df indexed by gene_ncbi with first_year
                                and first_solo_year (earliest year of a
                                paper that mentions only that gene)

    """

    ref_gene2pubmed = medline.gene2pubmed(
        taxon_id, paper_kind='research', ref_genes=ref_genes)

    df_m = medline.select_medline_records(
        columns_sql='''
                medline.pubmed_id,
                medline.pubdate_year''',
        taxon_id=taxon_id,
        kind='research')

    df_m = df_m[df_m['pubmed_id'].isin(ref_gene2pubmed['pubmed_id'])]
    columns_to_use = ['pubmed_id', 'pubdate_year']
    df_m = df_m.loc[:, columns_to_use].drop_duplicates()

    # number of reference genes linked to each paper
    genes_per_paper = ref_gene2pubmed['pubmed_id'].value_counts(
    ).to_frame('genes')
    df_m = pd.merge(df_m, genes_per_paper, left_on='pubmed_id',
                    right_index=True, how='inner')
    df_m.loc[:, 'taxon_ncbi'] = taxon_id

    # add genes to medline (reuse the already loaded gene2pubmed rather
    # than issuing the identical medline.gene2pubmed query a second time)
    master = pd.merge(
        df_m,
        ref_gene2pubmed,
        left_on=['taxon_ncbi', 'pubmed_id'],
        right_on=['taxon_ncbi', 'pubmed_id'],
        how='inner').drop_duplicates()

    # earliest year overall and earliest year in single-gene papers
    # (the original code computed this twice; the first, column-merge
    # based result was dead code, immediately overwritten)
    is_single_gene_paper = master['genes'] == 1
    genes_earliest_years = pd.merge(
        master.loc[:, ['gene_ncbi', 'pubdate_year']].groupby(
            'gene_ncbi').agg(
            min).rename(columns={'pubdate_year': 'first_year'}),
        master.loc[is_single_gene_paper,
                   ['gene_ncbi', 'pubdate_year']].groupby(
            'gene_ncbi').agg(
            min).rename(columns={'pubdate_year': 'first_solo_year'}),
        left_index=True,
        right_index=True,
        how='outer'
    )

    # align to the full reference gene list
    genes_earliest_years = genes_earliest_years.loc[ref_genes, :]

    return genes_earliest_years
def reference_genes(taxon_id, ref_code):
    """
    Obtains a list of reference genes

    Input:
        taxon_id    int
        ref_code    str;  if it contains
                        l   -> at least one medline paper
                        o   -> official nomenclature require
                        p   -> protein-coding only
                        r   -> at least one medline research paper

    Output:
        ref_genes   sorted list of gene identifiers

    Raises:
        EnvironmentError if no gene info exists for the taxon, or if
        any requested filter leaves no gene.
    """

    df = meta.gene_info(taxon_id)

    if df.shape[0] == 0:
        raise EnvironmentError("""
            Did not find gene info for taxon {}""".format(int(taxon_id)))

    if 'l' in ref_code:  # at least one paper of any kind
        genes_in_medline = medline.gene2pubmed(taxon_id, ['gene_ncbi'])
        f = df.loc[:, 'gene_ncbi'].isin(genes_in_medline['gene_ncbi'])
        df = df.loc[f, :]

        if df.shape[0] == 0:
            raise EnvironmentError("""
                After filtering for genes with at least one paper,
                no gene is left.""")

    if 'o' in ref_code:  # official nomeclature
        f = df.loc[:, 'Nomenclature_status'] == 'O'
        df = df.loc[f, :]

        if df.shape[0] == 0:
            raise EnvironmentError("""
                After filtering for genes with official nomeclature,
                no gene is left.""")

    if 'p' in ref_code:  # protein-coding
        f = df.loc[:, 'type_of_gene'] == 'protein-coding'
        df = df.loc[f, :]

        if df.shape[0] == 0:
            raise EnvironmentError("""
                After filtering for protein-coding, no gene is
                left.""")

    if 'r' in ref_code:  # at least one research paper
        genes_in_medline = medline.gene2pubmed(taxon_id,
                                               ['pubmed_id', 'gene_ncbi'],
                                               paper_kind='research')
        f = df.loc[:, 'gene_ncbi'].isin(genes_in_medline['gene_ncbi'])
        df = df.loc[f, :]

        if df.shape[0] == 0:
            raise EnvironmentError("""
                After filtering for genes with at least one research paper,
                no gene is left.""")

    ref_genes = sorted(df.loc[:, 'gene_ncbi'].values)

    return ref_genes
def pi_transition():
    """
    Compute, per gene, the log2 enrichment of the rate at which authors
    of papers on that gene later become PIs, relative to the mean rate
    across genes (cached 180311 per-paper data; years 2010-2015).

    Output:
        cl  boolean df: log2 enrichment above 1 (i.e. more than 2x)
        dd  df with log2 enrichment of will_be_pi per gene
        ge  index of cl (genes passing the coverage filters)
    """

    p = rinout.get_internal_path(
        (
            '180311_cache_pi_transition_for_genes/'
            '180311_cache_pi_transition_for_genes.csv')

    )

    pool = pd.read_csv(p, low_memory=False)

    pubmed_year_pi = pool[
        ['pubmed_id', 'pubdate_year', 'will_be_pi', 'genes']].copy()

    # skip unspecific papers that link too many genes
    tolerated_genes_per_publication = 10
    pubmed_year_pi = pubmed_year_pi[
        pubmed_year_pi['genes'] <= tolerated_genes_per_publication]

    human_gene2pubmed = medline.gene2pubmed(
        taxon_id=9606,
        paper_kind='research',
        ref_genes=get_ref_genes())[['gene_ncbi', 'pubmed_id']]

    ma = pd.merge(human_gene2pubmed, pubmed_year_pi)

    # restrict to recent publications
    m = ma[ma['pubdate_year'].isin(range(2010, 2016))]

    # require at least 10 publications per gene
    c = m['gene_ncbi'].value_counts()
    m = m[m['gene_ncbi'].isin(c[c >= 10].index)]

    # per-gene rate at which paper authors later become PIs
    dd = m[['gene_ncbi', 'will_be_pi']].groupby('gene_ncbi').agg(np.mean)

    # log2 enrichment over the mean per-gene rate
    a = np.log2(dd / dd['will_be_pi'].mean())

    dd = a.copy()

    cl = dd.copy() > 1

    ge = cl.index

    return cl, dd, ge