def make_base(project_base, sub_name, ref_genes, u_features, df_target):
    """
    Make prediction bases; Note that it will remove rows with NaN
    as a feature, or as prediction target.

    Input:
        project_base    str, name of series of analysis
        sub_name        str, will be used as folder (should be
                            specific to dataset)
        ref_genes       list-like; reference genes that shall be
                            filtered for; genes that are absent from
                            the features are skipped
        u_features      either a dataframe with features or a
                            dictionary of many dataframes
                            (note: always: index is gene_ncbi)
        df_target       dataframe with target to predict as sole
                            column

    Output:
        None; the feature table and the target table (single column
        'prediction_target') are passed to _export under the names
        'features' and 'target'.

    Raises:
        EnvironmentError    if df_target does not have exactly one
                            column, or if the merged table contains a
                            duplicated index entry.

    Neither u_features nor df_target is modified.
    """

    # Reject invalid input before the output location is resolved.
    if df_target.shape[1] != 1:
        raise EnvironmentError(
            'df_target must have exactly one column')

    p = inout.get_internal_path(os.path.join(
        project_base, sub_name, 'input'))

    # Rename on a copy so that the caller's frame keeps its column name.
    df_target = df_target.copy()
    df_target.columns = ['prediction_target']

    if isinstance(u_features, dict):
        df = pd.concat(u_features, join='outer', axis=1)
    else:
        df = u_features

    # pd.MultiIndex is the public name; the pd.core.index path is not
    # available in recent pandas releases.
    if isinstance(df.columns, pd.MultiIndex):
        # Copy first: df may still be the caller's own frame.
        df = df.copy()
        df.columns = df.columns.droplevel(level=0)

    # Keep the order given by ref_genes, but only request labels that
    # exist; recent pandas raises KeyError for missing labels in .loc.
    ref_index = pd.Index(list(ref_genes))
    present = ref_index[ref_index.isin(df.index)]
    df = df.loc[present, :].dropna()

    master = pd.merge(
        df,
        df_target,
        left_index=True,
        right_index=True,
        how='inner'
    ).dropna()

    if master.index.duplicated().any():
        raise EnvironmentError(
            'At least one index is duplicated')

    features = master.drop('prediction_target', axis=1)
    target = master.loc[:, ['prediction_target']]

    _export(features, p, 'features')
    _export(target, p, 'target')
def load_layout(rotation_degrees=0):
    """
    Load the stored gene coordinates and optionally rotate them.

    Input:
        rotation_degrees    angle handed to ret.rotate for every
                            (x, y) pair; 0 (default) returns the
                            coordinates as stored

    Output:
        dataframe indexed by gene_ncbi; its 'x' and 'y' columns are
        replaced by the rotated values when rotation_degrees != 0
    """

    coordinates_path = rinout.get_internal_path(
        os.path.join(
            '171014f_visualize_metrics_on_characteristics_tsne_and_features/',
            'genes_coordinates.csv'))

    layout = pd.read_csv(coordinates_path).set_index('gene_ncbi')

    if rotation_degrees == 0:
        return layout

    # (0, 0) is passed as first argument to ret.rotate -- presumably the
    # pivot of the rotation; confirm against ret.rotate.
    rotated = [
        ret.rotate((0, 0), point, rotation_degrees)
        for point in zip(layout.loc[:, 'x'], layout.loc[:, 'y'])
    ]

    layout.loc[:, 'x'] = [point[0] for point in rotated]
    layout.loc[:, 'y'] = [point[1] for point in rotated]

    return layout
def gtx():
    """
    Summarize, per gene, how often it is flagged as differential in
    the stored gtx table.

    A row counts as differential if its 'p-value' is below 0.0001.
    Only genes within get_ref_genes() are considered.

    Output:
        cl  per-gene dataframe as dd, but with 'gtx_fold' limited to
                the range [-2, 2]
        dd  per-gene dataframe (index gene_ncbi) with
                gtx_fraction    fraction of rows that are differential
                gtx_fold        log2 of gtx_fraction over the fraction
                                    across all rows; -inf if
                                    gtx_fraction is 0
                extreme_gtx     True if the percentile rank of
                                    gtx_fraction is >= 0.99
        ge  natsorted list of the genes in dd
    """

    source = rinout.get_internal_path(
        '170830f_differential_gene_expression_from_gtx/'
        'taxon9606/gtx/df_genes.csv.gz')

    df = pd.read_csv(source)

    # Merge on the shared column(s) of both tables (pd.merge default),
    # then discard the ensembl identifier.
    df = pd.merge(df, _get_gene_ncbi_2_ensembl())
    df = df.drop('gene_ensembl', axis=1)

    df = df[df['gene_ncbi'].isin(get_ref_genes())]
    df.loc[:, 'differential'] = df.loc[:, 'p-value'] < 0.0001

    per_gene = df[['gene_ncbi', 'differential']].groupby(
        'gene_ncbi').agg(np.mean)
    he = per_gene.rename(columns={'differential': 'gtx_fraction'})
    ge = natsorted(he.index)

    # Fraction of differential rows across the whole table
    overall_fraction = np.mean(df['differential'])

    # Genes that are never differential are set to -inf explicitly
    # rather than passing a zero through log2.
    never_differential = he['gtx_fraction'] == 0
    he.loc[~never_differential, 'gtx_fold'] = np.log2(
        he.loc[~never_differential, 'gtx_fraction'] / overall_fraction)
    he.loc[never_differential, 'gtx_fold'] = -np.inf

    dd = he.copy()
    dd.loc[:, 'extreme_gtx'] = dd['gtx_fraction'].rank(pct=True) >= 0.99

    # Same table, with fold values limited to [-2, 2]
    cl = dd.copy()
    cl.loc[cl['gtx_fold'] > 2, 'gtx_fold'] = 2
    cl.loc[cl['gtx_fold'] < -2, 'gtx_fold'] = -2

    return cl, dd, ge
def pi_transition():
    """
    Per-gene enrichment of 'will_be_pi' among publications from 2010
    up to and including 2015.

    Only publications with at most 10 genes are used, and only genes
    that have at least 10 such publications in that period.

    Output:
        cl  boolean dataframe; True where dd is above 1
        dd  dataframe (index gene_ncbi, column will_be_pi) with log2 of
                the per-gene mean of will_be_pi over the mean of those
                per-gene means
        ge  index of cl
    """

    cache_path = rinout.get_internal_path(
        '180311_cache_pi_transition_for_genes/'
        '180311_cache_pi_transition_for_genes.csv')
    pool = pd.read_csv(cache_path, low_memory=False)

    tolerated_genes_per_publication = 10
    publication_columns = ['pubmed_id', 'pubdate_year', 'will_be_pi', 'genes']
    publications = pool[publication_columns].copy()
    publications = publications[
        publications['genes'] <= tolerated_genes_per_publication]

    gene2pubmed = medline.gene2pubmed(
        taxon_id=9606,
        paper_kind='research',
        ref_genes=get_ref_genes())
    gene2pubmed = gene2pubmed[['gene_ncbi', 'pubmed_id']]

    # Merge on the column(s) shared by both tables (pd.merge default)
    linked = pd.merge(gene2pubmed, publications)

    # NOTE: an earlier, disabled variant compared the per-gene fraction
    # for 2010 against twice the yearly average and marked genes above
    # 0.9; the pooled 2010-2015 comparison below is the one in use.
    recent = linked[linked['pubdate_year'].isin(range(2010, 2016))]

    publications_per_gene = recent['gene_ncbi'].value_counts()
    sufficiently_covered = publications_per_gene[
        publications_per_gene >= 10].index
    recent = recent[recent['gene_ncbi'].isin(sufficiently_covered)]

    per_gene = recent[['gene_ncbi', 'will_be_pi']].groupby(
        'gene_ncbi').agg(np.mean)

    # Fold values are deliberately left unclipped (clipping to [-2, 2]
    # was disabled in an earlier revision).
    dd = np.log2(per_gene / per_gene['will_be_pi'].mean())

    cl = dd > 1
    ge = cl.index

    return cl, dd, ge
def load_group_annotation(annotation_code='180318'):
    """
    Load manually selected groups of genes (used to annotate plots).

    Input:
        annotation_code     '180318' (default): read
                                mark_180318.csv and rename its column
                                'group' to 'list_code'
                            'pre_180318': collect all files matching
                                'gene_list*'; genes occurring more
                                than once, or listed as mislabelled,
                                are removed; 'list_code' is the number
                                following 'gene_list' in the file name

    Output:
        df_manual_label     dataframe with, among others, the columns
                                gene_ncbi and list_code; restricted to
                                genes within get_ref_genes()

    Raises:
        ValueError  if annotation_code is not supported, or if no
                    gene list file is found for 'pre_180318'
    """

    if annotation_code == 'pre_180318':
        pattern = rinout.get_internal_path(
            '171014f_visualize_metrics_on_characteristics_tsne'
            '_and_features/gene_list*')

        # glob returns files in arbitrary order; sort for a
        # reproducible row order of the result.
        list_files = sorted(glob.glob(pattern))
        if not list_files:
            raise ValueError(
                'No gene list files found for pattern: {}'.format(pattern))

        agg = []
        for g in list_files:
            df = pd.read_table(g, names=['gene_ncbi'])
            df.loc[:, 'list'] = os.path.basename(g)
            agg.append(df)
        df_manual_label = pd.concat(agg, axis=0)

        mislabelled_genes = [
            192670, 154810, 140710, 170690, 157570, 374860, 133690,
            353140, 387700, 319100, 404550, 283870, 129450, 121260,
            131870, 123720, 132720, 203430, 284110, 219790, 130540,
            158830, 347730, 124540, 163590, 283600, 118460
        ]

        f = df_manual_label['gene_ncbi'].isin(mislabelled_genes)
        df_manual_label = df_manual_label.loc[~f, :]

        # keep=False: a gene present in several lists is dropped
        # entirely instead of being assigned to one of them.
        df_manual_label = df_manual_label.drop_duplicates(
            subset='gene_ncbi', keep=False).copy()

        df_manual_label.loc[:, 'list_code'] = df_manual_label.loc[
            :, 'list'].str.extract(
                'gene_list([0-9]*).*', expand=False).astype(float)

    elif annotation_code == '180318':
        p = rinout.get_internal_path(
            '180317_complete_cluster_maker/mark_180318.csv')
        df_manual_label = pd.read_csv(p).rename(columns={
            'group': 'list_code'
        })

    else:
        raise ValueError(
            'Does not support provided annotation_code: {!r}'.format(
                annotation_code))

    df_manual_label = df_manual_label[df_manual_label['gene_ncbi'].isin(
        get_ref_genes())]

    return df_manual_label