def get_publications(year=2015):
    """
    Count papers and attention for the human reference genes, and add a
    column with attention restricted to publications of a single year.

    Input:
        year    int; publication year used for the per-year attention
                column (default 2015, matching previous behavior)

    Output:
        papers  dataframe indexed by gene_ncbi with 'papers',
                'attention', and 'attention_<year>' columns
    """
    ref_genes = get_ref_genes()
    ref_gene2pubmed = medline.gene2pubmed(
        taxon_id=9606, paper_kind='research', ref_genes=ref_genes)
    papers = nar_attention.count_papers_and_attention(
        ref_genes, ref_gene2pubmed)
    # pubmed_id -> publication year for human research papers
    df_m = medline.select_medline_records(
        columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year''',
        taxon_id=9606,
        kind='research')
    # restrict gene-paper links to papers published in the requested year
    current_gene2pubmed = ref_gene2pubmed[ref_gene2pubmed['pubmed_id'].isin(
        df_m[df_m['pubdate_year'] == year]['pubmed_id'])]
    current_papers = nar_attention.count_papers_and_attention(
        ref_genes, current_gene2pubmed)
    year_column = 'attention_{}'.format(year)
    current_papers = current_papers.rename(columns={
        'attention': year_column})
    papers = pd.concat([papers, current_papers[[year_column]]], axis=1)
    return papers
def supporting_nih_institutes():
    """
    Link human reference genes to the NIH institutes that administered
    projects behind their recent (2015) research papers.

    Output:
        cl  boolean dataframe: gene supported by >= 10% of institutes
        dd  dataframe with fraction of institutes recently supporting
            each gene ('recently_supporting_institutes')
        ge  naturally sorted genes that occur in the linked records
    """
    df_prj_core, _, df_papers = nar_funding.get_paper_funding_through_nih()
    ref_genes = get_ref_genes()
    gene2pubmed = medline.gene2pubmed(
        taxon_id=9606, paper_kind='research', ref_genes=ref_genes)

    # papers from the recent window (2015 only)
    is_recent = df_papers['pubdate_year'].isin(range(2015, 2016))
    project_to_institute = df_prj_core[
        ['project_num', 'ADMINISTERING_IC']].drop_duplicates()
    recent_project_papers = df_papers.loc[
        is_recent, ['project_num', 'pubmed_id']].drop_duplicates()

    # gene -> paper -> project -> administering institute
    linked = pd.merge(
        gene2pubmed,
        pd.merge(project_to_institute, recent_project_papers))
    linked = linked[linked['gene_ncbi'].isin(ref_genes)]

    # count distinct institutes per gene, normalize by total institutes
    institutes_per_gene = linked[
        ['gene_ncbi', 'ADMINISTERING_IC']].drop_duplicates()
    counts = institutes_per_gene['gene_ncbi'].value_counts()
    dd = counts.to_frame(
        'recently_supporting_institutes').rename_axis('gene_ncbi')
    total_institutes = len(set(linked['ADMINISTERING_IC']))
    dd = dd / total_institutes

    cl = dd >= 0.1
    ge = natsorted(linked['gene_ncbi'].unique())
    return cl, dd, ge
def load_shared_gene2pubmed(taxon_id, reference_df, reference_genes):
    """
    Load gene-paper links for a taxon, keeping only reference genes and
    only papers that also appear in reference_df.

    Input:
        taxon_id          int; NCBI taxonomy identifier
        reference_df      dataframe containing a 'pubmed_id' column
        reference_genes   iterable of gene_ncbi identifiers to keep

    Output:
        dataframe with gene_ncbi and pubmed_id columns
    """
    links = medline.gene2pubmed(taxon_id, ['gene_ncbi', 'pubmed_id'])
    links = links[links['gene_ncbi'].isin(reference_genes)]
    in_reference = links['pubmed_id'].isin(reference_df['pubmed_id'])
    return links.loc[in_reference, :]
def filter_for_papers_with_reference_genes(taxon_id, df, reference_genes):
    """
    Keep only rows of df whose pubmed_id belongs to at least one paper
    linked to a reference gene of the given taxon.

    Input:
        taxon_id          int; NCBI taxonomy identifier
        df                dataframe containing a 'pubmed_id' column
        reference_genes   iterable of gene_ncbi identifiers

    Output:
        filtered copy of df
    """
    links = medline.gene2pubmed(taxon_id, ['gene_ncbi', 'pubmed_id'])
    links = links[links['gene_ncbi'].isin(reference_genes)]
    has_reference_gene = df['pubmed_id'].isin(links['pubmed_id'])
    return df.loc[has_reference_gene, :]
def fame_of_homologs():
    """
    For each human reference gene, report how strongly its homologs in
    other taxa have been studied (attention accumulated over research
    papers, via homologene groups).

    Output:
        cl  dataframe (genes x taxa): attention > 1 for the best-studied
            homolog; NaN where no homolog attention is available
        dd  dataframe (genes x taxa) with maximal homolog attention,
            columns named 'studied_<taxon name>'
        ge  sorted gene_ncbi index of dd
    """
    # all research gene-paper links across all taxa
    gene2pubmed_research = medline.gene2pubmed(
        taxon_id='all', paper_kind='research')
    # each paper contributes 1/(number of genes it mentions) of "value"
    value_of_pubmed_id = gene2pubmed_research[
        'pubmed_id'].value_counts().to_frame().reset_index().rename(
        columns={'index': 'pubmed_id', 'pubmed_id': 'value'})
    value_of_pubmed_id['value'] = 1 / value_of_pubmed_id['value']
    gene2pubmed_research = pd.merge(gene2pubmed_research, value_of_pubmed_id)
    # total attention per gene (any taxon)
    extended_attention = gene2pubmed_research[[
        'gene_ncbi', 'value']].groupby('gene_ncbi').agg(sum)
    hg = relations.homologene()
    # attach attention to homologene members; members without any paper
    # get 0 attention
    hg_attention = pd.merge(hg, extended_attention.reset_index(), how='left')
    hg_attention['value'] = hg_attention['value'].fillna(0)
    # best-studied member per homologene group and taxon, excluding human
    hg_max_attention = hg_attention[
        ['homologene_group', 'taxon_ncbi', 'value']].groupby(
        ['homologene_group', 'taxon_ncbi']).agg(max).reset_index()
    hg_max_attention = hg_max_attention[hg_max_attention['taxon_ncbi'] != 9606]
    # map non-human group maxima back to the human genes of each group
    he = pd.merge(
        hg_max_attention,
        hg[hg['taxon_ncbi'] == 9606][['homologene_group', 'gene_ncbi']]
    )
    he = he[he['gene_ncbi'].isin(get_ref_genes())]
    # a human gene can belong to several groups: keep the maximum
    he = he[['taxon_ncbi', 'gene_ncbi', 'value']].groupby(
        ['gene_ncbi', 'taxon_ncbi']).agg(max).reset_index()
    dd = he.pivot(index='gene_ncbi', columns='taxon_ncbi', values='value')
    dd.columns = [meta.taxon_name(x) for x in dd.columns]
    dd.columns = ['studied_{}'.format(x) for x in dd.columns]
    cl = dd > 1
    # genes without a homolog in a taxon stay undefined rather than False
    f = dd.isnull()
    cl[f] = np.nan
    ge = sorted(dd.index)
    return cl, dd, ge
def count_papers_and_attention(ref_genes, ref_gene2pubmed):
    """
    Counts the number of articles in ref_gene2pubmed and the fractional
    attention per gene.

    Uses the full gene2pubmed (not only the one of ref_gene2pubmed) to
    compute attention (thus also considering genes that lie outside of
    the present filtering): each paper contributes 1/n attention, where
    n is the total number of genes it mentions in the full mapping.

    Input:
        ref_genes          list of reference genes
        ref_gene2pubmed    dataframe with gene_ncbi and pubmed_id

    Output:
        papers             dataframe indexed by gene_ncbi with 'papers'
                           and 'attention'; genes without papers get 0
    """
    ref_gene2pubmed = ref_gene2pubmed[ref_gene2pubmed['gene_ncbi'].isin(
        ref_genes)]
    full_gene2pubmed = medline.gene2pubmed(taxon_id='all')
    full_gene2pubmed = full_gene2pubmed[full_gene2pubmed['pubmed_id'].isin(
        ref_gene2pubmed['pubmed_id'])]
    # attention of a paper = 1 / (number of genes linked to it);
    # vectorized value_counts replaces the former per-group lambda agg
    h = (1 / full_gene2pubmed['pubmed_id'].value_counts()).to_frame(
        'attention').rename_axis('pubmed_id').reset_index()
    master = pd.merge(ref_gene2pubmed, h, how='inner')
    papers = master['gene_ncbi'].value_counts().to_frame('papers')
    papers.index.name = 'gene_ncbi'
    papers.loc[:, 'attention'] = master[['gene_ncbi', 'attention'
                                         ]].groupby('gene_ncbi').agg(sum)
    # align to the full reference gene list; absent genes become 0
    papers = papers.loc[ref_genes, :]
    papers = papers.fillna(0)
    return papers
def get_extended_funding_info(taxon_id, earliest_year, latest_year):
    """
    Function to standardize queries on budget, creates estimate of
    budget per gene.

    Links MedLine research papers to NIH projects and their budgets,
    distributes each project's budget over its papers and each paper's
    attention over its genes, and aggregates per gene. Also appends
    paper/attention counts for all papers in the time span ('full_*'),
    for NIH-linked papers ('all_nih_*'), and their difference
    ('non_nih_*').

    Input:
        taxon_id        int; NCBI taxonomy identifier
        earliest_year   int; first publication/fiscal year to include
        latest_year     int; last publication/fiscal year to include

    Output:
        master          dataframe indexed by gene_ncbi with budget,
                        attention and paper statistics
    """
    # INITIALIZATION ###

    # MedLine
    ref_genes = standardizer.reference_genes(taxon_id, 'rpo')
    gene2pubmed = medline.gene2pubmed(taxon_id, paper_kind='research',
                                      ref_genes=ref_genes)

    df_m = medline.select_medline_wos_records(
        columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year,
            medline.amount_of_authors,
            medline.j_name_s''',
        years_range='all',
        taxon_id=taxon_id,
        kind='research',
        unambiguous=True)

    df_m = df_m[df_m['amount_of_authors'] > 0]   # exclude consortia paper (-1)
    df_m = df_m[['pubmed_id', 'pubdate_year', 'amount_of_authors', 'j_name_s']]
    df_m = df_m[df_m['pubdate_year'] >= earliest_year]
    df_m = df_m[df_m['pubdate_year'] <= latest_year]

    # remember the time-span paper set before further synchronization;
    # reused below to filter the 'full' fame statistics
    _pubmed_articles_in_medline_time_span = set(df_m['pubmed_id'])

    # NIH Exporter
    df_prj_core, df_prj_budget, df_nih_papers = get_paper_funding_through_nih()
    df_nih_papers = df_nih_papers.loc[:, ['project_num', 'pubmed_id']]   # skip publication year
    df_prj_core = df_prj_core[df_prj_core['FY'] >= earliest_year]
    df_prj_core = df_prj_core[df_prj_core['FY'] <= latest_year]
    df_prj_budget = df_prj_budget.loc[:, ['project_num', 'budget']]   # skip fiscal year
    # total budget per project over all fiscal years in range
    df_prj_budget = df_prj_budget.groupby('project_num').agg(sum)
    df_prj_budget = df_prj_budget.reset_index()

    # Estimations of costs for non-covered papers ###
    # ratio of all MedLine papers to those traceable to NIH funding
    papers_in_nih = len(
        set(df_nih_papers['pubmed_id']).intersection(set(df_m['pubmed_id'])))
    papers_in_medline = len(set(df_m['pubmed_id']))
    multiplier_nih2medline = papers_in_medline / papers_in_nih
    print('Multiplier:', multiplier_nih2medline)

    # Synchronization ###

    # PubMed: keep only papers present in all three sources
    lis = [
        set(df_nih_papers['pubmed_id']),
        set(df_m['pubmed_id']),
        set(gene2pubmed['pubmed_id'])
    ]
    pubmed_in_all = set.intersection(*lis)
    print('Amount of MedLine articles:', len(pubmed_in_all))
    gene2pubmed = gene2pubmed[gene2pubmed['pubmed_id'].isin(pubmed_in_all)]
    df_m = df_m[df_m['pubmed_id'].isin(pubmed_in_all)]
    df_nih_papers = df_nih_papers[df_nih_papers['pubmed_id'].isin(
        pubmed_in_all)]

    # Projects: keep only projects present in all three sources
    lis = [
        set(df_prj_core['project_num']),
        set(df_prj_budget['project_num']),
        set(df_nih_papers['project_num'])
    ]
    project_in_all = set.intersection(*lis)
    df_prj_core = df_prj_core[df_prj_core['project_num'].isin(project_in_all)]
    df_prj_budget = df_prj_budget[df_prj_budget['project_num'].isin(
        project_in_all)]
    df_nih_papers = df_nih_papers[df_nih_papers['project_num'].isin(
        project_in_all)]

    # Resources per paper per gene

    # amount of publications per project
    papers_per_project = df_nih_papers['project_num'].value_counts()

    # overall budget per project
    budget_per_project = df_prj_budget.set_index('project_num')['budget']

    # budget per paper for each project
    budget_per_paper_per_project = budget_per_project.div(
        papers_per_project).to_frame(
        'budget_per_paper_per_project').reset_index().rename(
        columns={'index': 'project_num'})

    # a paper funded by several projects accumulates their shares
    budget_per_pubmed_id = pd.merge(
        budget_per_paper_per_project,
        df_nih_papers)[['pubmed_id', 'budget_per_paper_per_project'
                        ]].groupby('pubmed_id').agg(sum).reset_index()

    # attention: each paper spreads one unit over its linked genes
    attention_per_paper = (1 / gene2pubmed['pubmed_id'].value_counts()
                           ).to_frame('attention_per_gene').reset_index()
    attention_per_paper = attention_per_paper.rename(
        columns={'index': 'pubmed_id'})

    gene2pubmed_plus = pd.merge(gene2pubmed, budget_per_pubmed_id)
    gene2pubmed_plus = pd.merge(gene2pubmed_plus, attention_per_paper)
    gene2pubmed_plus = gene2pubmed_plus.rename(
        columns={
            'budget_per_paper_per_project': 'budget_for_paper',
            'attention_per_gene': 'attention'
        })
    gene2pubmed_plus.loc[:, 'papers'] = 1
    gene2pubmed_plus['budget_for_attention'] = gene2pubmed_plus['attention'] * \
        gene2pubmed_plus['budget_for_paper']

    # aggregate budget/attention/papers per gene
    master = gene2pubmed_plus[[
        'gene_ncbi', 'budget_for_attention', 'attention', 'papers',
        'budget_for_paper'
    ]].groupby('gene_ncbi').agg(sum)
    master['budget_by_attention'] = master['budget_for_attention'] / \
        master['attention']
    master['budget_by_papers'] = master['budget_for_paper'] / master['papers']

    # fame over ALL research papers in the time span (not only NIH-linked)
    gene2pubmed_full = medline.gene2pubmed(taxon_id, paper_kind='research',
                                           ref_genes=ref_genes)
    gene2pubmed_full = gene2pubmed_full[gene2pubmed_full['pubmed_id'].isin(
        _pubmed_articles_in_medline_time_span)]
    fame_full = nar_attention.count_papers_and_attention(
        ref_genes, gene2pubmed_full)
    n = fame_full.columns
    fame_full.columns = ['full_' + x for x in fame_full.columns]
    master = pd.merge(master.reset_index(), fame_full.reset_index())

    # fame over papers linked to NIH (publnk table)
    nih_publnk = nih.publnk().drop_duplicates()
    gene2pubmed_all_nih = gene2pubmed_full[gene2pubmed_full['pubmed_id'].isin(
        nih_publnk['pubmed_id'])]
    fame_all_nih = nar_attention.count_papers_and_attention(
        ref_genes, gene2pubmed_all_nih)
    # n holds the un-prefixed column names ('papers', 'attention')
    n = fame_all_nih.columns
    fame_all_nih.columns = ['all_nih_' + x for x in fame_all_nih.columns]
    master = pd.merge(master, fame_all_nih.reset_index())

    # non-NIH share = full minus NIH-linked, per statistic
    for x in n:
        master.loc[:, 'non_nih_' +
                   x] = master.loc[:, 'full_' + x] - master.loc[:, 'all_nih_' + x]

    master = master.set_index('gene_ncbi')

    return master
def get_year_of_discovery(taxon_id, ref_genes):
    """
    Returns earliest years within research papers covered in MedLine

    Input:
        taxon_id    int; NCBI taxonomy identifier
        ref_genes   list of gene_ncbi identifiers to report on

    Output:
        genes_earliest_years   df indexed by gene_ncbi with first_year
                               (earliest paper mentioning the gene) and
                               first_solo_year (earliest paper that
                               mentions only that gene; NaN if none)
    """
    ref_gene2pubmed = medline.gene2pubmed(
        taxon_id, paper_kind='research', ref_genes=ref_genes)

    df_m = medline.select_medline_records(
        columns_sql='''
            medline.pubmed_id,
            medline.pubdate_year''',
        taxon_id=taxon_id,
        kind='research')
    df_m = df_m[df_m['pubmed_id'].isin(ref_gene2pubmed['pubmed_id'])]
    columns_to_use = ['pubmed_id', 'pubdate_year']
    df_m = df_m.loc[:, columns_to_use].drop_duplicates()

    # number of distinct reference genes linked to each paper
    genes_per_paper = ref_gene2pubmed['pubmed_id'].value_counts(
    ).to_frame('genes')
    df_m = pd.merge(df_m, genes_per_paper, left_on='pubmed_id',
                    right_index=True, how='inner')
    df_m.loc[:, 'taxon_ncbi'] = taxon_id

    # add genes to medline
    master = pd.merge(
        df_m,
        medline.gene2pubmed(
            taxon_id=taxon_id, paper_kind='research', ref_genes=ref_genes),
        left_on=['taxon_ncbi', 'pubmed_id'],
        right_on=['taxon_ncbi', 'pubmed_id'],
        how='inner').drop_duplicates()

    # get initial years: earliest year over all papers, and over
    # single-gene ("solo") papers; outer merge keeps genes without a
    # solo paper. (A redundant duplicate computation of this merge,
    # whose result was immediately discarded, has been removed.)
    f = master['genes'] == 1
    genes_earliest_years = pd.merge(
        master.loc[:, ['gene_ncbi', 'pubdate_year']].groupby(
            'gene_ncbi').agg(
            min).rename(columns={'pubdate_year': 'first_year'}),
        master.loc[f, ['gene_ncbi', 'pubdate_year']].groupby(
            'gene_ncbi').agg(
            min).rename(columns={'pubdate_year': 'first_solo_year'}),
        left_index=True,
        right_index=True,
        how='outer'
    )

    # align to the requested gene list (genes without papers become NaN)
    genes_earliest_years = genes_earliest_years.loc[ref_genes, :]

    return genes_earliest_years
def reference_genes(taxon_id, ref_code):
    """
    Obtains a list of reference genes

    Input:
        taxon_id    int
        ref_code    str; if it contains
                        l -> at least one medline paper
                        o -> official nomenclature required
                        p -> protein-coding only
                        r -> at least one medline research paper

    Output:
        ref_genes   sorted list of gene identifiers

    Raises:
        EnvironmentError if no gene info exists for the taxon or if any
        filter leaves no genes
    """
    df = meta.gene_info(taxon_id)

    if df.shape[0] == 0:
        raise EnvironmentError("""
            Did not find gene info for taxon {}""".format(int(taxon_id)))

    if 'l' in ref_code:   # at least one medline paper
        genes_in_medline = medline.gene2pubmed(taxon_id, ['gene_ncbi'])
        f = df.loc[:, 'gene_ncbi'].isin(genes_in_medline['gene_ncbi'])
        df = df.loc[f, :]
        if df.shape[0] == 0:
            raise EnvironmentError("""
                After filtering for genes with at least one paper,
                no gene is left.""")

    if 'o' in ref_code:   # official nomenclature
        f = df.loc[:, 'Nomenclature_status'] == 'O'
        df = df.loc[f, :]
        if df.shape[0] == 0:
            raise EnvironmentError("""
                After filtering for genes with official nomenclature,
                no gene is left.""")

    if 'p' in ref_code:   # protein-coding
        f = df.loc[:, 'type_of_gene'] == 'protein-coding'
        df = df.loc[f, :]
        if df.shape[0] == 0:
            raise EnvironmentError("""
                After filtering for protein-coding, no gene is left.""")

    if 'r' in ref_code:   # at least one research paper
        genes_in_medline = medline.gene2pubmed(taxon_id,
                                               ['pubmed_id', 'gene_ncbi'],
                                               paper_kind='research')
        f = df.loc[:, 'gene_ncbi'].isin(genes_in_medline['gene_ncbi'])
        df = df.loc[f, :]
        if df.shape[0] == 0:
            raise EnvironmentError("""
                After filtering for genes with at least one research paper,
                no gene is left.""")

    ref_genes = sorted(df.loc[:, 'gene_ncbi'].values)

    return ref_genes
def pi_transition():
    """
    Score human reference genes by cached per-paper PI-transition data.

    Loads the 180311 cache (per paper: pubdate_year, will_be_pi, number
    of genes; presumably will_be_pi marks whether an author later became
    a PI -- TODO confirm against cache generation), restricts to papers
    with at most 10 genes published 2010-2015, and computes per gene the
    mean will_be_pi rate for genes with at least 10 such papers,
    expressed as log2 ratio versus the mean over all genes.

    Output:
        cl  boolean dataframe: log2 ratio > 1 (i.e. more than twice the
            average rate)
        dd  dataframe with the log2 ratio ('will_be_pi')
        ge  index of genes covered by cl/dd
    """
    p = rinout.get_internal_path(
        (
            '180311_cache_pi_transition_for_genes/'
            '180311_cache_pi_transition_for_genes.csv')
    )
    pool = pd.read_csv(p, low_memory=False)
    pubmed_year_pi = pool[
        ['pubmed_id', 'pubdate_year', 'will_be_pi', 'genes']].copy()

    # ignore papers that mention many genes (screens etc.)
    tolerated_genes_per_publication = 10
    pubmed_year_pi = pubmed_year_pi[
        pubmed_year_pi['genes'] <= tolerated_genes_per_publication]

    human_gene2pubmed = medline.gene2pubmed(
        taxon_id=9606,
        paper_kind='research',
        ref_genes=get_ref_genes())[['gene_ncbi', 'pubmed_id']]
    ma = pd.merge(human_gene2pubmed, pubmed_year_pi)

    # recent window; require at least 10 papers per gene for a stable
    # estimate
    m = ma[ma['pubdate_year'].isin(range(2010, 2016))]
    c = m['gene_ncbi'].value_counts()
    m = m[m['gene_ncbi'].isin(c[c >= 10].index)]

    dd = m[['gene_ncbi', 'will_be_pi']].groupby('gene_ncbi').agg(np.mean)
    # log2 enrichment over the across-gene average rate
    a = np.log2(dd / dd['will_be_pi'].mean())
    dd = a.copy()

    cl = dd.copy() > 1
    ge = cl.index
    return cl, dd, ge