def run_mantis_ml_profiler(self):
    """Run every ``assess_*_filtered_output`` step for the current config.

    Prints the run settings held on the instance (``verbose``,
    ``config_file``, ``output_dir``), builds a ``Config`` and a
    ``ProcessFeaturesFilteredByDisease`` from them, and then calls the
    per-source assessment methods in a fixed order:
    HPO, GTEx, Protein Atlas, MSigDB, MGI.
    """
    print('>>> Running mantis-ml config profiling ...')
    print('verbose:', self.verbose)
    print('Config file:', self.config_file)
    print('Output dir:', self.output_dir)

    settings = Config(self.config_file, self.output_dir)
    feature_processor = ProcessFeaturesFilteredByDisease(settings)

    # Each method is looked up right before it is called, so the steps run
    # (and fail) one at a time, in exactly this order.
    assessment_steps = (
        'assess_hpo_filtered_output',           # HPO
        'assess_gtex_filtered_output',          # GTEx
        'assess_proteinatlas_filtered_output',  # Protein Atlas
        'assess_msigdb_filtered_output',        # MSigDB
        'assess_mgi_filtered_output',           # MGI
    )
    for step_name in assessment_steps:
        getattr(self, step_name)(feature_processor, settings)

    print('\n\n<<< mantis-ml config profiling complete.')
def __init__(self,
             config_file,
             output_dir,
             nthreads=4,
             iterations=10,
             custom_known_genes_file=None,
             fast_run_option=False,
             superv_models=None):
    """Load the run config, apply CLI overrides and launch the profiler.

    Args:
        config_file: Path to the config file handed to ``Config`` and to
            the ``mantisml-profiler`` command.
        output_dir: Output directory handed to ``Config`` and to the
            ``mantisml-profiler`` command.
        nthreads: Stored on the config as ``int(nthreads)``.
        iterations: Stored on the config as ``int(iterations)``.
        custom_known_genes_file: Stored on the config unchanged.
        fast_run_option: When truthy, the classifier list is replaced by
            ExtraTrees, RandomForest, SVC and GradientBoosting.
        superv_models: Optional comma-separated classifier aliases
            (``et``, ``rf``, ``svc``, ``gb``, ``xgb``, ``dnn``, ``stack``).
            A valid value overrides the classifier list (including the one
            set by ``fast_run_option``); an invalid value only prints a
            warning and leaves the list as it was.
    """
    import shlex

    from mantis_ml.config_class import Config

    self.config_file = config_file
    self.output_dir = output_dir
    self.cfg = Config(config_file, self.output_dir)

    # Modify default config parameters when provided with respective parameters
    self.cfg.nthreads = int(nthreads)
    self.cfg.iterations = int(iterations)

    if fast_run_option:
        self.cfg.classifiers = [
            'ExtraTreesClassifier', 'RandomForestClassifier', 'SVC',
            'GradientBoostingClassifier'
        ]

    if superv_models:
        models_dict = {
            'et': 'ExtraTreesClassifier',
            'rf': 'RandomForestClassifier',
            'svc': 'SVC',
            'gb': 'GradientBoostingClassifier',
            'xgb': 'XGBoost',
            'dnn': 'DNN',
            'stack': 'Stacking'
        }
        try:
            self.cfg.classifiers = list(
                {models_dict[k] for k in superv_models.split(',')})
        except (KeyError, AttributeError):
            # KeyError: unknown alias; AttributeError: value is not a string.
            # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
            print(
                '[Warning] -m option args are not correct.\n\t Currently going ahead with mantis-ml run using the 6 default classifiers (unless -f has also been specified which will integrate 4 classifiers only).\n'
            )

    self.cfg.custom_known_genes_file = custom_known_genes_file

    print('nthreads:', self.cfg.nthreads)
    print('Stochastic iterations:', self.cfg.iterations)
    print('Classifiers:', self.cfg.classifiers)
    print('Custom known genes:', self.cfg.custom_known_genes_file)

    # Run profiler and store results to output dir.
    # Every path is shell-quoted so that spaces or shell metacharacters in
    # a path cannot break (or inject into) the command line.
    profiler_log = str(self.cfg.out_root) + "/profiler_metadata.out"
    profiler_cmd = ("mantisml-profiler -vc " + shlex.quote(str(config_file)) +
                    " -o " + shlex.quote(str(self.output_dir)) +
                    " > " + shlex.quote(profiler_log))
    exit_status = os.system(profiler_cmd)
    if exit_status != 0:
        # Best-effort step: report the failure but keep the instance usable.
        print('[Warning] mantisml-profiler exited with status', exit_status)
def __init__(self, config_file, output_dir, top_ratio, gene_class):
    """Store the run parameters and set up per-classifier lookup tables.

    Args:
        config_file: Path passed to ``Config``.
        output_dir: Output directory passed to ``Config``.
        top_ratio: Kept as given (the original note lists 0.01 and 0.05
            as example values).
        gene_class: Kept as given (the original note lists 'Novel' or
            'Known').
    """
    self.config_file, self.output_dir = config_file, output_dir
    self.top_ratio = top_ratio    # e.g. 0.01, 0.05
    self.gene_class = gene_class  # Novel or Known

    self.cfg = Config(config_file, self.output_dir)
    self.sorted_classifiers = self.read_sorted_classifers()

    # Hex colour per classifier name.
    self.color_palette = dict(
        Stacking='#684392',
        ExtraTreesClassifier='#35A037',
        SVC='#651124',
        DNN='#000000',
        RandomForestClassifier='#1F7AB9',
        XGBoost='#F07E21',
        GradientBoostingClassifier='#E32321',
    )

    # Short alias per classifier name.
    self.clf_alias = dict(
        ExtraTreesClassifier='ET',
        SVC='SVC',
        DNN='DNN',
        RandomForestClassifier='RF',
        XGBoost='XGB',
        GradientBoostingClassifier='GB',
        Stacking='Stacking',
    )
# NOTE(review): the statements down to the `to_csv` call are the tail of a
# method whose beginning lies before this excerpt -- re-indent them to the
# enclosing method body when restoring the file layout.
# Left-join known_genes_df onto percentile_df on the 'Gene_Name' column.
self.percentile_df = pd.merge(self.percentile_df, self.known_genes_df, left_on='Gene_Name', right_on='Gene_Name', how='left')
print('Merged percentile_df:')
print(self.percentile_df.head())
print(self.percentile_df.shape)

# ====== Store all results (proba, percentile score, known/novel gene flag) ======
# Written as '<clf_id>.mantis-ml_predictions.csv' under cfg.superv_ranked_pred,
# without the index column.
self.percentile_df.to_csv(self.cfg.superv_ranked_pred / (self.clf_id + '.mantis-ml_predictions.csv'), index=False)


# Script entry point: evaluate a single, hard-coded classifier.
if __name__ == '__main__':

    # First command-line argument is the config file path.
    config_file = sys.argv[1]  #'../../config.yaml'
    cfg = Config(config_file)

    clf_id = 'XGBoost'

    clf_eval = ClassifierEvaluator(cfg, clf_id)

    # The steps below are disabled; only process_gene_proba_predictions runs.
    # if clf_id in feature_imp_classifiers:
    #     clf_eval.plot_avg_feature_imp()
    #     clf_eval.plot_feat_imp_distribustion()
    # clf_eval.plot_evaluation_metrics()
    # clf_eval.get_definitive_gene_predictions(pos_ratio_thres=0.99)
    clf_eval.process_gene_proba_predictions(top_hits=50)

    # Calculate correlation between mantis-ml scores when using mean vs median of all probability scores per gene
    #print(clf_eval.gene_proba_means.head())
def _build_overlap_arg_parser():
    """Return the ArgumentParser describing this command's options."""
    parser = ArgumentParser(formatter_class=RawTextHelpFormatter)
    parser.add_argument(
        "-c",
        dest="config_file",
        required=True,
        help="Config file (.yaml) with run parameters [Required]\n\n")
    parser.add_argument(
        "-o",
        dest="output_dir",
        help=
        "Output directory name\n(absolute/relative path e.g. ./CKD, /tmp/Epilepsy-testing, etc.)\nIf it doesn't exist it will automatically be created [Required]\n\n",
        required=True)
    parser.add_argument(
        "-e",
        dest="external_ranked_file",
        required=True,
        help=
        "Input file with external ranked gene list;\neither 1-column or 2-columns (with p-values in the 2nd column) [Required]\n\n"
    )
    parser.add_argument(
        "-t",
        dest="top_ratio",
        required=False,
        default=5,
        help=
        "Top percent ratio of mantis-ml predictions\nto overlap with the external ranked list (default: 5)\n\n"
    )
    parser.add_argument(
        "-m",
        dest="max_overlapping_genes",
        required=False,
        default=50,
        help=
        "Max. number of genes to retain that overlap\nmantis-ml and EXTERNAL_RANKED_FILE predictions (default: 50)\n\n"
    )
    parser.add_argument(
        "-y",
        dest="ylim",
        required=False,
        help="Explicitly define y-axis max. limit (PHRED score value)\n\n")
    parser.add_argument("-x",
                        dest="xlim",
                        required=False,
                        help="Explicitly define x-axis max. limit\n\n")
    parser.add_argument(
        "-f",
        "--full_xaxis",
        action="count",
        required=False,
        help=
        "Plot enrichment signal across the entire x-axis\nand not just for the significant part (or the MAX_OVERLAPPING_GENES)\nof the external ranked list\n\n"
    )
    parser.add_argument("-s",
                        dest="suffix",
                        required=False,
                        default=None,
                        help="Suffix to be used with output files\n\n")
    parser.add_argument(
        "-n",
        "--novel",
        action="count",
        required=False,
        help="Run hypergeometric test against novel mantis-ml predictions only"
    )
    return parser


def _read_classifiers_by_auc(avg_aucs_file):
    """Return the classifier names (first tab-separated field), in file order."""
    classifier_names = []
    with open(avg_aucs_file) as fh:
        for line in fh:
            if not line.strip():
                # A blank or trailing empty line must not abort the run.
                continue
            classifier_names.append(line.rstrip('\n').split('\t', 1)[0])
    return classifier_names


def main():
    """Command-line entry point: overlap mantis-ml predictions with an
    external ranked gene list, then build consensus gene predictions.

    Steps:
      1. Parse the command line (prints help and exits with status 1 when
         called without any argument).
      2. Load ``all_clf.pkl`` from ``cfg.superv_out``; exit if it cannot
         be read.
      3. Read classifier names from ``Avg_AUC_per_classifier.txt`` (the
         file is expected to list them in descending order of avg. AUC).
      4. For each classifier, run ``ExternalRankingOverlap`` against the
         external ranked file.
      5. Run ``Consensus_Gene_Predictions`` for the 'Novel' and 'Known'
         gene classes.

    Raises:
        SystemExit: on missing/invalid arguments, when ``all_clf.pkl``
            cannot be loaded, or when no classifier is listed.
    """
    parser = _build_overlap_arg_parser()

    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()

    config_file = args.config_file
    output_dir = args.output_dir
    external_ranked_file = args.external_ranked_file
    # -t is given in percent; downstream code works with a fraction.
    top_ratio = float(args.top_ratio) / 100
    max_overlapping_genes = int(args.max_overlapping_genes)
    ylim = float(args.ylim) if args.ylim else None
    xlim = float(args.xlim) if args.xlim else None
    # "count" actions yield None when the flag is absent.
    show_full_xaxis = bool(args.full_xaxis)
    suffix = args.suffix
    remove_seed_genes = bool(args.novel)

    print("\nInput arguments:\n")
    print('- config_file:', config_file)
    print('- external_ranked_file:', external_ranked_file)
    print('- top_ratio:', str(100 * top_ratio) + '%')
    print('-max_overlapping_genes (if applicable):', max_overlapping_genes)
    if args.ylim:
        print('-ylim:', ylim)
    if args.xlim:
        print('-xlim:', xlim)
    print('- show_full_xaxis:', show_full_xaxis)
    print('- suffix:', suffix)
    print('- remove_seed_genes:', remove_seed_genes)
    print("\n")
    # ***************************

    cfg = Config(config_file, output_dir)

    # Read aggregated results from classifiers.
    # NOTE(review): pickle.load executes arbitrary code from the file; this
    # is only safe as long as all_clf.pkl is produced by this pipeline.
    try:
        print("Reading all_clf.pkl ...")
        with open(str(cfg.superv_out / 'all_clf.pkl'), 'rb') as pkl_fh:
            all_clf = pickle.load(pkl_fh)
    except Exception as e:
        print(e)
        sys.exit(
            "all_clf.pkl not found. Please run 'process_classifier_results.py' first."
        )

    # Read classifiers in descending order of avg. AUC
    avg_aucs_file = str(cfg.superv_out / 'Avg_AUC_per_classifier.txt')
    sorted_classifiers = _read_classifiers_by_auc(avg_aucs_file)
    print(sorted_classifiers)

    #classifiers = ['XGBoost']
    classifiers = sorted_classifiers[:]
    if not classifiers:
        # Without this guard the seed-gene lookup below fails with a bare
        # IndexError that does not point at the real problem.
        sys.exit("No classifiers listed in " + avg_aucs_file)

    # Read seed genes
    seed_genes = all_clf[classifiers[0]].known_genes.tolist()

    genes_to_remove = []
    if remove_seed_genes:
        genes_to_remove = seed_genes

    pval_cutoff = 1  # Set to 1, to include all

    # ------------------ (Nearly) Static options ------------------
    collapsing_top_ratio = -1  # Set to -1 to use pval_cutoff instead

    for clf_str in classifiers:
        print('\n> Classifier: ' + clf_str)
        print('Overlapping with top ' + str(float(top_ratio) * 100) + '% of ' +
              clf_str + ' predictions ...')

        rank_overlap = ExternalRankingOverlap(
            cfg,
            clf_str,
            seed_genes,
            top_ratio=top_ratio,
            max_overlapping_genes=max_overlapping_genes,
            show_full_xaxis=show_full_xaxis,
            ylim=ylim,
            xlim=xlim,
            suffix=suffix)
        rank_overlap.read_external_ranked_gene_list(external_ranked_file)
        print(rank_overlap.external_ranked_df.head())
        print(rank_overlap.external_ranked_df.shape)

        rank_overlap.calc_stepwise_hypergeometric(
            all_clf,
            pval_cutoff=pval_cutoff,
            collapsing_top_ratio=collapsing_top_ratio,
            genes_to_remove=genes_to_remove)

    # Get consensus of novel (and known) gene predictions
    for gene_class in ['Novel', 'Known']:
        cons_obj = Consensus_Gene_Predictions(config_file, output_dir,
                                              top_ratio, gene_class)
        cons_obj.run()
def __init__(self, config_file):
    """Build the run ``Config`` from *config_file* and echo two settings.

    The loaded config is kept on ``self.cfg``; its ``iterations`` and
    ``nthreads`` values are printed for the user's reference.
    """
    loaded_cfg = Config(config_file)
    self.cfg = loaded_cfg

    print('Stochastic iterations:', loaded_cfg.iterations)
    print('nthreads:', loaded_cfg.nthreads)
# Command line: one positional config file plus an optional -v flag.
parser = argparse.ArgumentParser()
parser.add_argument('config_file')
parser.add_argument(
    '-v',
    '--verbosity',
    action="count",
    help="print verbose output verbosity (run with -v option)")
args = parser.parse_args()

# The "count" action yields None when -v is absent and a positive int otherwise.
verbose = bool(args.verbosity)

print('>>> Running mantis-ml config profiling ...')
print('verbose:', verbose)
print('Config file:', args.config_file)

cfg = Config(args.config_file)
proc_obj = ProcessFeaturesFilteredByDisease(cfg)

# common strings to exclude from profiling
eng_stopwords = get_english_stopwords()
custom_bullet = u'\u2022' * 5
line_spacer = '\n' * 6

# HPO
assess_hpo_filtered_output(proc_obj, cfg, verbose=verbose)

# GTEx
assess_gtex_filtered_output(proc_obj, cfg, verbose=verbose)
left_on='Gene_Name', right_on='Gene_Name')
# NOTE(review): the line above closes a call opened before this excerpt, and
# the statements down to the last `print(...shape)` are the tail of a method
# whose beginning is not visible here -- re-indent them to the enclosing
# method body when restoring the file layout.
print(generic_features_df.shape)

# Left-join mgi_essential_df onto generic_features_df on 'Gene_Name'.
generic_features_df = pd.merge(generic_features_df, mgi_essential_df, how='left', left_on='Gene_Name', right_on='Gene_Name')
print(generic_features_df.shape)

# Impute 'MGI_essential_gene' with zero, for all genes that don't have a '1' value:
# these values are not missing data but rather represent a 'False'/zero feature value.
# NOTE(review): calling fillna(..., inplace=True) on a selected column is
# deprecated chained assignment in recent pandas -- confirm the pandas version
# in use still updates generic_features_df here.
generic_features_df['MGI_essential_gene'].fillna(0, inplace=True)

# Write the table tab-separated to cfg.generic_feature_table, without the index.
generic_features_df.to_csv(self.cfg.generic_feature_table, sep='\t', index=None)
print("Saved to {0}".format(self.cfg.generic_feature_table))

print(generic_features_df.shape)


# Script entry point: build the generic feature table for one config.
if __name__ == '__main__':

    # argv[1] is the config file path, argv[2] the output directory.
    config_file = sys.argv[1]  #'../../../config.yaml'
    out_dir = sys.argv[2]
    cfg = Config(config_file, out_dir)

    proc = ProcessGenericFeatures(cfg)
    proc.run_all()