def main_cosmic(options):
    """Generate gene features from the COSMIC mutations and save them.

    Reads the `mutations` table of the sqlite database named in the
    '2020plus' db config, passes the mutations to
    `futils.generate_features`, moves the 'gene' column to the front and
    writes the result as a tab-delimited file.

    Parameters
    ----------
    options : dict
        command line options. `options['output']` is the output path; if
        it is empty the path is built from `_utils.save_dir` and the
        'gene_features' entry of the 'classifier' input config. The whole
        dict is also forwarded to `futils.generate_features`.

    Raises
    ------
    ValueError
        if the generated features have no 'gene' column
    """
    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    conn = sqlite3.connect(db_cfg['db'])
    try:
        # NOTE(review): frame_query looks like the legacy pandas.io.sql
        # helper; newer pandas releases only provide read_sql -- confirm
        # against the pinned pandas version.
        mut_df = psql.frame_query(sql, con=conn)
    finally:
        # release the connection even when the query fails
        conn.close()

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # make the gene name the first column
    cols = all_features.columns.tolist()
    gene_ix = cols.index('gene')
    new_order = ['gene'] + cols[:gene_ix] + cols[gene_ix + 1:]
    all_features = all_features[new_order]

    # save features to text file
    if options['output']:
        out_path = options['output']
    else:
        out_path = _utils.save_dir + in_opts['gene_features']
    all_features.to_csv(out_path, sep='\t', index=False)
def main_cosmic(options):
    """Generate gene features from the COSMIC mutations and save them.

    Reads the `mutations` table of the sqlite database named in the
    '2020plus' db config, passes the mutations to
    `futils.generate_features`, moves the 'gene' column to the front and
    writes the result as a tab-delimited file.

    Parameters
    ----------
    options : dict
        command line options. `options['output']` is the output path; if
        it is empty the path is built from `_utils.save_dir` and the
        'gene_features' entry of the 'classifier' input config. The whole
        dict is also forwarded to `futils.generate_features`.

    Raises
    ------
    ValueError
        if the generated features have no 'gene' column
    """
    # get configs
    in_opts = _utils.get_input_config('classifier')
    db_cfg = _utils.get_db_config('2020plus')

    # get mutations
    sql = ("SELECT Gene, Protein_Change as AminoAcid, "
           "       DNA_Change as Nucleotide, "
           "       Variant_Classification, "
           "       Tumor_Sample, Tumor_Type "
           "FROM mutations")
    conn = sqlite3.connect(db_cfg['db'])
    try:
        # NOTE(review): frame_query looks like the legacy pandas.io.sql
        # helper; newer pandas releases only provide read_sql -- confirm
        # against the pinned pandas version.
        mut_df = psql.frame_query(sql, con=conn)
    finally:
        # release the connection even when the query fails
        conn.close()

    # get features for classification
    all_features = futils.generate_features(mut_df, options)

    # make the gene name the first column
    cols = all_features.columns.tolist()
    gene_ix = cols.index('gene')
    new_order = ['gene'] + cols[:gene_ix] + cols[gene_ix + 1:]
    all_features = all_features[new_order]

    # save features to text file
    if options['output']:
        out_path = options['output']
    else:
        out_path = _utils.save_dir + in_opts['gene_features']
    all_features.to_csv(out_path, sep='\t', index=False)
def main(cli_opts):
    """Train the R random forest on the gene feature table and save it.

    Parameters
    ----------
    cli_opts : dict
        command line options. The keys read here are 'min_count',
        'features', 'other_ratio', 'driver_rate', 'ntrees', 'random_seed',
        'cv' and 'output'.
    """
    cfg_opts = _utils.get_output_config('classifier')  # not used below
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']  # not used below

    # an explicitly given feature file wins over the configured default
    feature_path = cli_opts['features']
    if not feature_path:
        feature_path = _utils.save_dir + in_opts['gene_feature']

    # read in features
    features = pd.read_csv(feature_path, sep='\t', index_col=0)

    logger.info("Training R's Random forest . . .")
    forest = RRandomForest(
        features,
        other_sample_ratio=cli_opts['other_ratio'],
        driver_sample=cli_opts['driver_rate'],
        ntrees=cli_opts['ntrees'],
        seed=cli_opts['random_seed'],
    )

    # train on entire data, using the cross-validation variant on request
    train = forest.train_cv if cli_opts['cv'] else forest.train
    train()
    logger.info('Finished training.')

    logger.info('Saving classifier to . . .')
    save = forest.clf.save_cv if cli_opts['cv'] else forest.clf.save
    save(cli_opts['output'])
    logger.info('Finished saving classifier.')
def main(hypermutator_count, mut_path, db_path, no_cosmic_flag, opts):
    """Concatenates all the mutation data from tab delimited files in
    the cosmic directory. Next, saves the results to a sqlite db.

    Parameters
    ----------
    hypermutator_count : int
        remove samples with too many mutations
    mut_path : str
        Either path to directory containing contents of COSMIC's
        genes.tgz file or decompressed CosmicMutantExport.tsv.
        If empty string, just use path from config file.
    db_path : str
        path to save sqlite database. If string is empty,
        use path from config.
    no_cosmic_flag : bool
        indicates not to use cosmic mutations
    opts : dict
        additional options; the 'only_genome_wide' and
        'use_unknown_status' entries are forwarded to `save_db`

    Raises
    ------
    ValueError
        if COSMIC mutations are requested but the resolved COSMIC path is
        neither an existing directory nor an existing file
    """
    # get input/output configurations
    in_opts = _utils.get_input_config('input')
    cosmic_path = in_opts['cosmic_path']
    out_opts = _utils.get_output_config('gene_tsv')
    out_path = out_opts['gene_tsv']
    db_opts = _utils.get_db_config('2020plus')

    # a user specified db path overrides the configured one
    out_db = db_path if db_path else db_opts['db']

    if no_cosmic_flag:
        # create an empty table if cosmic not wanted
        create_empty_cosmic_mutation_table(out_db)
        return

    # a user specified mutation path overrides the configured one
    cosmic_path = mut_path if mut_path else cosmic_path
    if os.path.isdir(cosmic_path):
        # directory of per-gene files: concatenate them into one file first
        concatenate_genes(out_path, cosmic_path)
        mutation_file, is_genes_tgz = out_path, True
    elif os.path.isfile(cosmic_path):
        mutation_file, is_genes_tgz = cosmic_path, False
    else:
        raise ValueError('Please specify a valid path to COSMIC data')

    # save info into the sqlite3 database
    save_db(hypermutator_count, mutation_file, out_db,
            is_genes_tgz=is_genes_tgz,
            only_genome_wide=opts['only_genome_wide'],
            use_unknown_status=opts['use_unknown_status'])
def sample_boxplot(pred_onco,
                   pred_tsg,
                   pred_driver,
                   save_path_type,
                   save_path_driver,
                   xlabel='',
                   ylabel='',
                   title=''):
    """Create a box plot for distribution of percentage of tumor samples
    containing a non-silent mutation in different categories of genes
    (ie oncogenes, tsgs, and drivers).

    Genes are labelled from the gene index of the table read from the
    'max_gene_pct_sample_out' result file. A gene listed as both oncogene
    and TSG is labelled 'TSG'. Predicted genes that are absent from the
    table are ignored.

    Parameters
    ----------
    pred_onco : list
        list of genes predicted as oncogenes
    pred_tsg : list
        list of genes predicted as tsgs
    pred_driver : list
        list of genes predicted as drivers
    save_path_type : str
        path to save figure for comparing oncogenes, tsgs, and other
    save_path_driver : str
        path to save figure for comparing drivers vs other
    xlabel : str
        x-axis label (default: 'Predicted Type')
    ylabel : str
        y-axis label (default: 'Maximum Pct of Samples for a Tumor Type')
    title : str
        title of figures (default: 'Percentage of Samples with Non-Silent
        Mutation')
    """
    cfg = _utils.get_output_config('sample')
    df = pd.read_csv(_utils.result_dir + cfg['max_gene_pct_sample_out'],
                     sep='\t', index_col=0)

    # sets give constant time membership tests for the per-gene labelling
    onco_genes = set(pred_onco)
    driver_genes = set(pred_driver)

    df['Predicted Type'] = ["oncogene" if g in onco_genes else "other"
                            for g in df.index]
    # boolean mask with .loc replaces the removed .ix indexer and does not
    # fail when a predicted TSG is missing from the table
    is_tsg = df.index.isin(list(pred_tsg))
    df.loc[is_tsg, 'Predicted Type'] = 'TSG'
    df['Predicted Driver'] = ["driver" if g in driver_genes else "other"
                              for g in df.index]

    # set figure labels
    if not xlabel:
        xlabel = 'Predicted Type'
    if not ylabel:
        ylabel = 'Maximum Pct of Samples for a Tumor Type'
    if not title:
        title = 'Percentage of Samples with Non-Silent Mutation'

    pct_columns = ['all mutation sample pct', 'non-silent sample pct']

    # plot with oncogenes, tsgs, and other
    myplt.boxplot(df,
                  by='Predicted Type',
                  column=pct_columns,
                  save_path=save_path_type,
                  xlabel=xlabel,
                  ylabel=ylabel,
                  title=title)

    # plot with drivers vs other
    # NOTE(review): the default xlabel 'Predicted Type' is also used for
    # this driver figure -- confirm whether that is intended.
    myplt.boxplot(df,
                  by='Predicted Driver',
                  column=pct_columns,
                  save_path=save_path_driver,
                  xlabel=xlabel,
                  ylabel=ylabel,
                  title=title)
def retrieve_gene_features(conn, opts, get_entropy=True):
    """Retrieve gene information from the gene_features table.

    See the gene_features module to understand the gene_features database
    table.

    Parameters
    ----------
    conn : mysql/sqlite connection
        connection to db with gene_features table
    opts : dict
        options selecting which columns to retrieve. The boolean entries
        'gene_length', 'mutation_rate', 'replication_time', 'expression',
        'hic', 'betweeness' and 'degree' are read.
    get_entropy : bool
        option to toggle the use of entropy features. Since entropy
        features are read from a file in this function, it may induce an
        unnecessary dependency on previously running commands. To avoid
        this, set get_entropy=False and then compute entropy features
        separately.

    Returns
    -------
    df : pd.DataFrame
        selected gene features indexed by gene. The gene name is also kept
        as a 'gene' column. The missense position entropy columns are
        added when `get_entropy` is True.
    """
    logger.info('Retrieving features of genes . . .')

    # (option name, SQL select expression), in the order columns are selected
    optional_cols = (
        ('gene_length', 'gene_length'),
        ('mutation_rate', 'noncoding_mutation_rate'),
        ('replication_time', 'replication_time'),
        ('expression', 'expression_CCLE as expression'),
        ('hic', 'HiC_compartment'),
        ('betweeness', 'gene_betweeness'),
        ('degree', 'gene_degree'),
    )
    # retrieve more features if specified by command line
    selected_cols = ['gene'] + [sql_col for opt_name, sql_col in optional_cols
                                if opts[opt_name]]

    # get info from gene_features table
    logger.info('Retrieving gene feature information from gene_features table . . . ')
    sql = "SELECT %s FROM gene_features" % ', '.join(selected_cols)
    # NOTE(review): frame_query looks like the legacy pandas.io.sql helper;
    # newer pandas releases only provide read_sql -- confirm against the
    # pinned pandas version.
    df = psql.frame_query(sql, conn)
    df = df.set_index('gene')
    df['gene'] = df.index
    logger.info('Finished retrieving gene features from gene_features table.')

    # fill graph stats with zeros if gene not in Biogrid
    for graph_col in ('gene_betweeness', 'gene_degree'):
        if graph_col in df.columns:
            df[graph_col] = df[graph_col].fillna(0)

    # get position entropy features
    if get_entropy:
        entropy_cfg = _utils.get_output_config('position_entropy')
        # only the missense entropy file is needed; the all-mutation entropy
        # columns are not part of the returned features
        missense_pos_entropy = pd.read_csv(
            _utils.result_dir + entropy_cfg['missense_pos_entropy'],
            sep='\t', index_col=0)
        for entropy_col in ('missense position entropy',
                            'pct of uniform missense entropy'):
            # assignment aligns the file's index with the gene index of df
            df[entropy_col] = missense_pos_entropy[entropy_col]
    return df
def _null_score_pvalues(scores):
    """Map each distinct score to the fraction of scores at least as large.

    Parameters
    ----------
    scores : pd.Series
        scores of one category

    Returns
    -------
    pd.Series
        indexed by distinct score in descending order; each value is the
        cumulative count of scores down to that score divided by the total
        count
    """
    score_cts = scores.value_counts().sort_index(ascending=False)
    return score_cts.cumsum() / float(score_cts.sum())


def _new_random_forest(df, cli_opts):
    """Construct an RRandomForest for `df` from the command line options."""
    return RRandomForest(df,
                         other_sample_ratio=cli_opts['other_ratio'],
                         driver_sample=cli_opts['driver_rate'],
                         ntrees=cli_opts['ntrees'],
                         seed=cli_opts['random_seed'])


def _classify_with_trained_model(cli_opts, cfg_opts, feature_path, null_pvals):
    """Score genes with a previously saved classifier.

    With cli_opts['simulated'] set, the scores are turned into a null
    p-value table written to cli_opts['null_distribution']. Otherwise the
    predictions are saved and a q-q plot is attempted.
    """
    # read in features
    df = pd.read_csv(feature_path, sep='\t', index_col=0)

    logger.info('Running Random forest . . .')

    # initialize R's random forest
    rrclf = _new_random_forest(df, cli_opts)

    # load classifier depending on whether it uses CV
    is_cv = cli_opts['cv']
    if is_cv:
        rrclf.clf.load_cv(cli_opts['trained_classifier'])
    else:
        rrclf.clf.load(cli_opts['trained_classifier'])

    if cli_opts['simulated']:
        # do classification
        result_df = trained_rand_forest_pred(rrclf, df, None, null_pvals, is_cv)

        driver_score_pvals = _null_score_pvalues(result_df['driver score'])
        onco_score_pvals = _null_score_pvalues(result_df['oncogene score'])
        tsg_score_pvals = _null_score_pvalues(result_df['tsg score'])

        # construct null p-value score distribution over every observed score
        score_ix = (set(driver_score_pvals.index)
                    | set(onco_score_pvals.index)
                    | set(tsg_score_pvals.index))
        score_pvals = pd.DataFrame(index=list(score_ix))
        score_pvals['oncogene p-value'] = onco_score_pvals
        score_pvals['tsg p-value'] = tsg_score_pvals
        score_pvals['driver p-value'] = driver_score_pvals
        score_pvals = score_pvals.sort_index(ascending=False)

        score_pvals.to_csv(cli_opts['null_distribution'], sep='\t',
                           index_label='score')
    else:
        # do classification
        pred_results_path = _utils.clf_result_dir + cfg_opts['rrand_forest_pred']
        logger.info('Saving results to {0}'.format(pred_results_path))
        result_df = trained_rand_forest_pred(rrclf, df, pred_results_path,
                                             null_pvals, is_cv)
        result_df.to_csv(pred_results_path, sep='\t')

        # create qq plot (best effort, e.g. plotting may be unavailable)
        try:
            qq_plot_path = _utils.clf_plot_dir + cfg_opts['qq_plot']
            plot_data.create_qqplots(result_df, qq_plot_path)
        except Exception:
            logger.warning('Skipping q-q plot because of an error',
                           exc_info=True)

    logger.info('Finished classification.')


def _save_gene_lists(result_df, cfg_opts, null_pvals):
    """Write predicted and novel oncogene/TSG lists and log their sizes.

    Without a null distribution the majority vote class decides the
    prediction; with one, a q-value of at most 0.1 does. "Novel" genes are
    predicted genes whose 'training list class' differs from the predicted
    label.
    """
    if null_pvals is None:
        is_onco = result_df['majority vote class'] == _utils.onco_label
        is_tsg = result_df['majority vote class'] == _utils.tsg_label
        log_template = ('Majority vote Random forest: {0} ({1} novel) oncogenes, '
                        '{2} ({3} novel) tsg')
    else:
        is_onco = result_df['oncogene q-value'] <= .1
        is_tsg = result_df['tsg q-value'] <= .1
        log_template = ('Random forest significance test: {0} ({1} novel) oncogenes, '
                        '{2} ({3} novel) tsg')

    not_train_onco = result_df['training list class'] != _utils.onco_label
    not_train_tsg = result_df['training list class'] != _utils.tsg_label
    pred_onco = result_df[is_onco].index.to_series()
    novel_onco = result_df[is_onco & not_train_onco].index.to_series()
    pred_tsg = result_df[is_tsg].index.to_series()
    novel_tsg = result_df[is_tsg & not_train_tsg].index.to_series()

    gene_lists = (('rrf_onco', pred_onco),
                  ('rrf_novel_onco', novel_onco),
                  ('rrf_tsg', pred_tsg),
                  ('rrf_novel_tsg', novel_tsg))
    for cfg_key, genes in gene_lists:
        genes.to_csv(_utils.clf_result_dir + cfg_opts[cfg_key],
                     sep='\t', index=False, header=None)

    logger.info(log_template.format(len(pred_onco), len(novel_onco),
                                    len(pred_tsg), len(novel_tsg)))


def _plot_roc_curve(clf_tpr, dummy_tpr, fpr, clf_auc, dummy_auc, save_path):
    """Plot the axis-0 mean TPR of the classifier and the dummy against `fpr`."""
    clf_label = '20/20+ Classifier (AUC = %0.3f)' % clf_auc
    dummy_label = 'dummy (AUC = %0.3f)' % dummy_auc
    roc_df = pd.DataFrame({clf_label: np.mean(clf_tpr, axis=0),
                           dummy_label: np.mean(dummy_tpr, axis=0)},
                          index=fpr)
    line_style = {dummy_label: '--', clf_label: '-'}
    plot_data.receiver_operator_curve(roc_df, save_path, line_style)


def _plot_pr_curve(clf_precision, recall, clf_auc, dummy_label, save_path, title):
    """Plot the axis-0 mean precision of the classifier against `recall`.

    NOTE(review): `dummy_label` only adds a line style entry; no dummy
    column is plotted. Kept so the style dict passed to plot_data is
    unchanged -- confirm whether it is needed.
    """
    clf_label = '20/20+ Classifier (AUC = %0.3f)' % clf_auc
    pr_df = pd.DataFrame({clf_label: np.mean(clf_precision, axis=0)},
                         index=recall)
    line_style = {dummy_label: '--', clf_label: '-'}
    plot_data.precision_recall_curve(pr_df, save_path, line_style, title=title)


def main(cli_opts):
    """Run the random forest classification pipeline.

    If cli_opts['trained_classifier'] is set, a saved classifier is loaded
    and applied (see `_classify_with_trained_model`) and the function
    returns. Otherwise the random forest is evaluated with
    `kfold_validation`, predictions and gene lists are written, and
    diagnostic plots, a dummy classifier comparison and a performance
    table are produced on a best-effort basis: failures there are logged
    as warnings instead of aborting the run.

    Parameters
    ----------
    cli_opts : dict
        command line options. Keys read: 'min_count', 'features',
        'simulated', 'null_distribution', 'trained_classifier', 'cv',
        'other_ratio', 'driver_rate', 'ntrees' and 'random_seed'.
    """
    cfg_opts = _utils.get_output_config('classifier')
    in_opts = _utils.get_input_config('classifier')
    minimum_ct = cli_opts['min_count']

    # get path to features used for classification
    if cli_opts['features']:
        feature_path = cli_opts['features']
    else:
        feature_path = _utils.save_dir + in_opts['gene_feature']

    # read in null distribution p-values
    if not cli_opts['simulated'] and cli_opts['null_distribution']:
        null_pvals = pd.read_csv(cli_opts['null_distribution'], sep='\t',
                                 index_col=0)
    else:
        null_pvals = None

    # use trained classifier if provided
    if cli_opts['trained_classifier']:
        _classify_with_trained_model(cli_opts, cfg_opts, feature_path, null_pvals)
        return

    df = pd.read_csv(feature_path, sep='\t', index_col=0)

    # R's random forest
    logger.info('Running Random forest . . .')
    rrclf = _new_random_forest(df, cli_opts)

    # analyze classification metrics
    rrclf.kfold_validation()
    rrclf_onco_tpr, rrclf_onco_fpr, rrclf_onco_mean_roc_auc = rrclf.get_onco_roc_metrics()
    rrclf_onco_precision, rrclf_onco_recall, rrclf_onco_mean_pr_auc = rrclf.get_onco_pr_metrics()
    rrclf_tsg_tpr, rrclf_tsg_fpr, rrclf_tsg_mean_roc_auc = rrclf.get_tsg_roc_metrics()
    rrclf_tsg_precision, rrclf_tsg_recall, rrclf_tsg_mean_pr_auc = rrclf.get_tsg_pr_metrics()
    rrclf_driver_precision, rrclf_driver_recall, rrclf_driver_mean_pr_auc = rrclf.get_driver_pr_metrics()
    rrclf_driver_tpr, rrclf_driver_fpr, rrclf_driver_mean_roc_auc = rrclf.get_driver_roc_metrics()

    # plot feature importance (best effort, e.g. skip if no matplotlib)
    try:
        mean_df = rrclf.mean_importance
        std_df = rrclf.std_importance
        feat_path = _utils.clf_plot_dir + cfg_opts['r_feature_importance_plot']
        plot_data.feature_importance_barplot(mean_df, std_df, feat_path)
    except Exception:
        logger.warning('Skipping feature importance plot because of an error',
                       exc_info=True)

    # run predictions using R's random forest
    pred_results_path = _utils.clf_result_dir + cfg_opts['rrand_forest_pred']
    result_df = rand_forest_pred(rrclf, df, result_path=pred_results_path,
                                 null_dist=null_pvals)

    # save a list of oncogenes/tsgs in separate files
    _save_gene_lists(result_df, cfg_opts, null_pvals)

    # score plots, dummy classifier baseline and ROC/PR curves; only done
    # if plotting works (e.g. matplotlib is available)
    try:
        # plot r random forest results
        plot_data.prob_scatter(result_df,
                               plot_path=_utils.clf_plot_dir + cfg_opts['rrand_forest_plot'],
                               title='Sub-sampled Random Forest Predictions')
        plot_data.prob_kde(result_df,
                           col_name='oncogene score',
                           save_path=_utils.clf_plot_dir + cfg_opts['onco_kde_rrand_forest'],
                           title='Distribution of Oncogene Scores (sub-sampled random forest)')
        plot_data.prob_kde(result_df,
                           col_name='tsg score',
                           save_path=_utils.clf_plot_dir + cfg_opts['tsg_kde_rrand_forest'],
                           title='Distribution of TSG Scores (sub-sampled random forest)')
        logger.info('Finished running sub-sampled Random Forest')

        # dummy classifier, predict most frequent
        logger.debug('Running Dummy Classifier. . .')
        dclf = DummyClf(df,
                        strategy='most_frequent',
                        min_ct=minimum_ct,
                        weight=False)
        dclf.kfold_validation()
        dclf_onco_tpr, dclf_onco_fpr, dclf_onco_mean_roc_auc = dclf.get_onco_roc_metrics()
        dclf_onco_precision, dclf_onco_recall, dclf_onco_mean_pr_auc = dclf.get_onco_pr_metrics()
        dclf_tsg_tpr, dclf_tsg_fpr, dclf_tsg_mean_roc_auc = dclf.get_tsg_roc_metrics()
        dclf_tsg_precision, dclf_tsg_recall, dclf_tsg_mean_pr_auc = dclf.get_tsg_pr_metrics()
        dclf_driver_tpr, dclf_driver_fpr, dclf_driver_mean_roc_auc = dclf.get_driver_roc_metrics()
        logger.debug('Finished dummy classifier.')

        # plot oncogene, tsg and driver roc figures
        _plot_roc_curve(rrclf_onco_tpr, dclf_onco_tpr, rrclf_onco_fpr,
                        rrclf_onco_mean_roc_auc, dclf_onco_mean_roc_auc,
                        _utils.clf_plot_dir + cfg_opts['roc_plot_oncogene'])
        _plot_roc_curve(rrclf_tsg_tpr, dclf_tsg_tpr, rrclf_tsg_fpr,
                        rrclf_tsg_mean_roc_auc, dclf_tsg_mean_roc_auc,
                        _utils.clf_plot_dir + cfg_opts['roc_plot_tsg'])
        _plot_roc_curve(rrclf_driver_tpr, dclf_driver_tpr, rrclf_driver_fpr,
                        rrclf_driver_mean_roc_auc, dclf_driver_mean_roc_auc,
                        _utils.clf_plot_dir + cfg_opts['roc_plot_driver'])

        # plot oncogene, tsg and driver pr figures
        onco_dummy_str = 'dummy (AUC = %0.3f)' % dclf_onco_mean_pr_auc
        tsg_dummy_str = 'dummy (AUC = %0.3f)' % dclf_tsg_mean_pr_auc
        _plot_pr_curve(rrclf_onco_precision, rrclf_onco_recall,
                       rrclf_onco_mean_pr_auc, onco_dummy_str,
                       _utils.clf_plot_dir + cfg_opts['pr_plot_oncogene'],
                       'Oncogene Precision-Recall Curve')
        _plot_pr_curve(rrclf_tsg_precision, rrclf_tsg_recall,
                       rrclf_tsg_mean_pr_auc, tsg_dummy_str,
                       _utils.clf_plot_dir + cfg_opts['pr_plot_tsg'],
                       'TSG Precision-Recall Curve')
        # NOTE(review): no driver PR metrics are computed for the dummy
        # classifier; the TSG dummy label is reused here exactly as before.
        _plot_pr_curve(rrclf_driver_precision, rrclf_driver_recall,
                       rrclf_driver_mean_pr_auc, tsg_dummy_str,
                       _utils.clf_plot_dir + cfg_opts['pr_plot_driver'],
                       'Driver Precision-Recall Curve')
    except Exception:
        logger.warning('Skipping score/ROC/PR plots because of an error',
                       exc_info=True)

    # save performance metrics of ROC and PR AUC; kept separate from the
    # plots so that a plotting failure does not discard the metrics
    try:
        save_path = _utils.clf_result_dir + cfg_opts['performance']
        logger.info('Saving performance metrics ({0}) . . .'.format(save_path))
        metrics = [['TSG', rrclf_tsg_mean_roc_auc, rrclf_tsg_mean_pr_auc],
                   ['OG', rrclf_onco_mean_roc_auc, rrclf_onco_mean_pr_auc],
                   ['Driver', rrclf_driver_mean_roc_auc, rrclf_driver_mean_pr_auc]]
        perf_df = pd.DataFrame(metrics, columns=['Type', 'ROC AUC', 'PR AUC'])
        perf_df.to_csv(save_path, sep='\t', index=False)
    except Exception:
        logger.warning('Could not save performance metrics', exc_info=True)

    # make qq plot
    try:
        qq_plot_path = _utils.clf_plot_dir + cfg_opts['qq_plot']
        plot_data.create_qqplots(result_df, qq_plot_path)
    except Exception:
        logger.warning('Skipping q-q plot because of an error', exc_info=True)
def retrieve_gene_features(conn, opts, get_entropy=True):
    """Retrieve gene information from the gene_features table.

    See the gene_features module to understand the gene_features database
    table.

    Parameters
    ----------
    conn : mysql/sqlite connection
        connection to db with gene_features table
    opts : dict
        options selecting which columns to retrieve. The boolean entries
        'gene_length', 'mutation_rate', 'replication_time', 'expression',
        'hic', 'betweeness' and 'degree' are read.
    get_entropy : bool
        option to toggle the use of entropy features. Since entropy
        features are read from a file in this function, it may induce an
        unnecessary dependency on previously running commands. To avoid
        this, set get_entropy=False and then compute entropy features
        separately.

    Returns
    -------
    df : pd.DataFrame
        selected gene features indexed by gene. The gene name is also kept
        as a 'gene' column. The missense position entropy columns are
        added when `get_entropy` is True.
    """
    logger.info('Retrieving features of genes . . .')

    # (option name, SQL select expression), in the order columns are selected
    optional_cols = (
        ('gene_length', 'gene_length'),
        ('mutation_rate', 'noncoding_mutation_rate'),
        ('replication_time', 'replication_time'),
        ('expression', 'expression_CCLE as expression'),
        ('hic', 'HiC_compartment'),
        ('betweeness', 'gene_betweeness'),
        ('degree', 'gene_degree'),
    )
    # retrieve more features if specified by command line
    selected_cols = ['gene'] + [sql_col for opt_name, sql_col in optional_cols
                                if opts[opt_name]]

    # get info from gene_features table
    logger.info('Retrieving gene feature information from gene_features table . . . ')
    sql = "SELECT %s FROM gene_features" % ', '.join(selected_cols)
    # NOTE(review): frame_query looks like the legacy pandas.io.sql helper;
    # newer pandas releases only provide read_sql -- confirm against the
    # pinned pandas version.
    df = psql.frame_query(sql, conn)
    df = df.set_index('gene')
    df['gene'] = df.index
    logger.info('Finished retrieving gene features from gene_features table.')

    # fill graph stats with zeros if gene not in Biogrid
    for graph_col in ('gene_betweeness', 'gene_degree'):
        if graph_col in df.columns:
            df[graph_col] = df[graph_col].fillna(0)

    # get position entropy features
    if get_entropy:
        entropy_cfg = _utils.get_output_config('position_entropy')
        # only the missense entropy file is needed; the all-mutation entropy
        # columns are not part of the returned features
        missense_pos_entropy = pd.read_csv(
            _utils.result_dir + entropy_cfg['missense_pos_entropy'],
            sep='\t', index_col=0)
        for entropy_col in ('missense position entropy',
                            'pct of uniform missense entropy'):
            # assignment aligns the file's index with the gene index of df
            df[entropy_col] = missense_pos_entropy[entropy_col]
    return df