def run_mantis_ml_profiler(self):
        """Profile the disease-filtered feature sources for the configured run.

        Echoes the profiler settings held on the instance (``verbose``,
        ``config_file``, ``output_dir``), builds a ``Config`` from the config
        file and output directory, wraps it in a
        ``ProcessFeaturesFilteredByDisease`` object and then calls the
        ``assess_<source>_filtered_output`` methods one after another.
        """
        print('>>> Running mantis-ml config profiling ...')
        print('verbose:', self.verbose)
        print('Config file:', self.config_file)
        print('Output dir:', self.output_dir)

        settings = Config(self.config_file, self.output_dir)
        feature_processor = ProcessFeaturesFilteredByDisease(settings)

        # Assessed in this order: HPO, GTEx, Protein Atlas, MSigDB, MGI.
        # Each method is looked up right before it is called.
        for source in ('hpo', 'gtex', 'proteinatlas', 'msigdb', 'mgi'):
            assess = getattr(self, 'assess_' + source + '_filtered_output')
            assess(feature_processor, settings)

        print('\n\n<<< mantis-ml config profiling complete.')
    def __init__(self,
                 config_file,
                 output_dir,
                 nthreads=4,
                 iterations=10,
                 custom_known_genes_file=None,
                 fast_run_option=False,
                 superv_models=None):
        """Load the run configuration, apply overrides and launch the profiler.

        Args:
            config_file: Config file path, passed to ``Config`` and to the
                ``mantisml-profiler`` command.
            output_dir: Output directory, passed to ``Config`` and to the
                ``mantisml-profiler`` command.
            nthreads: Stored on the config as ``int(nthreads)``.
            iterations: Stored on the config as ``int(iterations)``.
            custom_known_genes_file: Stored on the config unchanged.
            fast_run_option: When truthy, restricts the classifiers to
                ExtraTrees, RandomForest, SVC and GradientBoosting.
            superv_models: Optional comma-separated string of classifier
                aliases (``et``, ``rf``, ``svc``, ``gb``, ``xgb``, ``dnn``,
                ``stack``). When valid it overrides the classifier list,
                including the one set by ``fast_run_option``; when invalid a
                warning is printed and the current list is kept.
        """
        import subprocess

        from mantis_ml.config_class import Config
        self.config_file = config_file
        self.output_dir = output_dir

        self.cfg = Config(config_file, self.output_dir)

        # Modify default config parameters when provided with respective parameters
        self.cfg.nthreads = int(nthreads)
        self.cfg.iterations = int(iterations)

        if fast_run_option:
            self.cfg.classifiers = [
                'ExtraTreesClassifier', 'RandomForestClassifier', 'SVC',
                'GradientBoostingClassifier'
            ]

        if superv_models:
            models_dict = {
                'et': 'ExtraTreesClassifier',
                'rf': 'RandomForestClassifier',
                'svc': 'SVC',
                'gb': 'GradientBoostingClassifier',
                'xgb': 'XGBoost',
                'dnn': 'DNN',
                'stack': 'Stacking'
            }

            try:
                # KeyError: unknown alias; AttributeError: value is not a string.
                requested = [
                    models_dict[alias.strip()]
                    for alias in superv_models.split(',')
                ]
            except (KeyError, AttributeError):
                print(
                    '[Warning] -m option args are not correct.\n\t  Currently going ahead with mantis-ml run using the 6 default classifiers (unless -f has also been specified which will integrate 4 classifiers only).\n'
                )
            else:
                # De-duplicate while keeping the order given by the user, so
                # the classifier list is the same on every run.
                self.cfg.classifiers = list(dict.fromkeys(requested))

        self.cfg.custom_known_genes_file = custom_known_genes_file

        print('nthreads:', self.cfg.nthreads)
        print('Stochastic iterations:', self.cfg.iterations)
        print('Classifiers:', self.cfg.classifiers)
        print('Custom known genes:', self.cfg.custom_known_genes_file)

        # Run profiler and store results to output dir.
        # The command is passed as an argument list (no shell), so paths with
        # spaces or shell metacharacters are handed over verbatim.
        profiler_log = str(self.cfg.out_root) + "/profiler_metadata.out"
        profiler_cmd = [
            "mantisml-profiler", "-vc", str(config_file), "-o",
            str(self.output_dir)
        ]
        try:
            with open(profiler_log, "w") as log_fh:
                subprocess.run(profiler_cmd, stdout=log_fh, check=False)
        except OSError as err:
            # Profiling stays best-effort: a missing executable or an
            # unwritable log file must not abort object construction.
            print('[Warning] Could not run mantisml-profiler:', err)
Пример #3
0
	def __init__(self, config_file, output_dir, top_ratio, gene_class):
		"""Store the run parameters, load the config and set up plotting lookups.

		Keeps the four arguments on the instance, builds a ``Config`` from
		``config_file`` and ``output_dir``, stores the result of
		``read_sorted_classifers()`` and defines two per-classifier lookup
		tables: a hex colour palette and short display aliases.
		"""
		self.config_file = config_file
		self.output_dir = output_dir
		# Original note lists 0.01 and 0.05 as values
		self.top_ratio = top_ratio
		# Original note: Novel or Known
		self.gene_class = gene_class

		self.cfg = Config(config_file, self.output_dir)
		self.sorted_classifiers = self.read_sorted_classifers()

		# Hex colour assigned to each classifier name
		self.color_palette = {
			'Stacking': '#684392',
			'ExtraTreesClassifier': '#35A037',
			'SVC': '#651124',
			'DNN': '#000000',
			'RandomForestClassifier': '#1F7AB9',
			'XGBoost': '#F07E21',
			'GradientBoostingClassifier': '#E32321',
		}

		# Short alias for each classifier name
		self.clf_alias = {
			'ExtraTreesClassifier': 'ET',
			'SVC': 'SVC',
			'DNN': 'DNN',
			'RandomForestClassifier': 'RF',
			'XGBoost': 'XGB',
			'GradientBoostingClassifier': 'GB',
			'Stacking': 'Stacking',
		}
Пример #4
0
		self.percentile_df = pd.merge(self.percentile_df, self.known_genes_df, left_on='Gene_Name', right_on='Gene_Name', how='left')
		print('Merged percentile_df:')
		print(self.percentile_df.head())
		print(self.percentile_df.shape)
		# ====== Store all results (proba, percentile score, known/novel gene flag) ======
		self.percentile_df.to_csv(self.cfg.superv_ranked_pred / (self.clf_id + '.mantis-ml_predictions.csv'), index=False)






if __name__ == '__main__':

	# Script entry point: evaluate the predictions of a single classifier.
	# Exit with a usage message instead of an IndexError traceback when the
	# config file argument is missing.
	if len(sys.argv) < 2:
		sys.exit('Usage: ' + sys.argv[0] + ' <config_file>')

	config_file = sys.argv[1] #'../../config.yaml'
	cfg = Config(config_file)

	clf_id = 'XGBoost'
	clf_eval = ClassifierEvaluator(cfg, clf_id)

	# Optional steps, currently disabled:
	# if clf_id in feature_imp_classifiers:
	#	 clf_eval.plot_avg_feature_imp()
	#	 clf_eval.plot_feat_imp_distribustion()

	# clf_eval.plot_evaluation_metrics()
	# clf_eval.get_definitive_gene_predictions(pos_ratio_thres=0.99)
	clf_eval.process_gene_proba_predictions(top_hits=50)


	# Calculate correlation between mantis-ml scores when using mean vs median of all probability scores per gene
	#print(clf_eval.gene_proba_means.head())
def _build_arg_parser():
    """Build the command-line parser for the external-ranking overlap run."""
    parser = ArgumentParser(formatter_class=RawTextHelpFormatter)
    parser.add_argument(
        "-c",
        dest="config_file",
        required=True,
        help="Config file (.yaml) with run parameters [Required]\n\n")
    parser.add_argument(
        "-o",
        dest="output_dir",
        help=
        "Output directory name\n(absolute/relative path e.g. ./CKD, /tmp/Epilepsy-testing, etc.)\nIf it doesn't exist it will automatically be created [Required]\n\n",
        required=True)
    parser.add_argument(
        "-e",
        dest="external_ranked_file",
        required=True,
        help=
        "Input file with external ranked gene list;\neither 1-column or 2-columns (with p-values in the 2nd column) [Required]\n\n"
    )
    parser.add_argument(
        "-t",
        dest="top_ratio",
        required=False,
        default=5,
        help=
        "Top percent ratio of mantis-ml predictions\nto overlap with the external ranked list (default: 5)\n\n"
    )
    parser.add_argument(
        "-m",
        dest="max_overlapping_genes",
        required=False,
        default=50,
        help=
        "Max. number of genes to retain that overlap\nmantis-ml and EXTERNAL_RANKED_FILE predictions (default: 50)\n\n"
    )
    parser.add_argument(
        "-y",
        dest="ylim",
        required=False,
        help="Explicitly define y-axis max. limit (PHRED score value)\n\n")
    parser.add_argument("-x",
                        dest="xlim",
                        required=False,
                        help="Explicitly define x-axis max. limit\n\n")
    parser.add_argument(
        "-f",
        "--full_xaxis",
        action="count",
        required=False,
        help=
        "Plot enrichment signal across the entire x-axis\nand not just for the significant part (or the MAX_OVERLAPPING_GENES)\nof the external ranked list\n\n"
    )
    parser.add_argument("-s",
                        dest="suffix",
                        required=False,
                        default=None,
                        help="Suffix to be used with output files\n\n")
    parser.add_argument(
        "-n",
        "--novel",
        action="count",
        required=False,
        help="Run hypergeometric test against novel mantis-ml predictions only"
    )
    return parser


def _read_sorted_classifiers(avg_aucs_file):
    """Return the first tab-separated column of each non-blank line, in file order."""
    sorted_classifiers = []
    with open(avg_aucs_file) as fh:
        for line in fh:
            # Skip blank lines (e.g. a trailing newline at end of file)
            # rather than failing on them.
            if not line.strip():
                continue
            sorted_classifiers.append(line.split('\t')[0])
    return sorted_classifiers


def main():
    """Overlap mantis-ml predictions with an external ranked gene list.

    Parses the command-line options, loads the pickled classifier results
    (``all_clf.pkl``) and the classifier names listed in
    ``Avg_AUC_per_classifier.txt``, runs the step-wise hypergeometric test
    for every listed classifier and finally builds the consensus
    predictions for the 'Novel' and 'Known' gene classes.

    Exits with status 1 after printing the help text when called without
    any argument, and exits with an error message when ``all_clf.pkl``
    cannot be loaded or no classifier is listed in the AUC file.
    """
    parser = _build_arg_parser()

    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    args = parser.parse_args()

    config_file = args.config_file
    output_dir = args.output_dir
    external_ranked_file = args.external_ranked_file
    # The -t option is given as a percentage; convert it to a fraction.
    top_ratio = float(args.top_ratio) / 100
    max_overlapping_genes = int(args.max_overlapping_genes)
    ylim = float(args.ylim) if args.ylim else None
    xlim = float(args.xlim) if args.xlim else None
    # -f and -n are counting flags: None when absent, >= 1 when given.
    show_full_xaxis = bool(args.full_xaxis)

    suffix = args.suffix
    remove_seed_genes = bool(args.novel)

    print("\nInput arguments:\n")
    print('- config_file:', config_file)
    print('- external_ranked_file:', external_ranked_file)
    print('- top_ratio:', str(100 * top_ratio) + '%')
    print('-max_overlapping_genes (if applicable):', max_overlapping_genes)
    if args.ylim:
        print('-ylim:', ylim)
    if args.xlim:
        print('-xlim:', xlim)
    print('- show_full_xaxis:', show_full_xaxis)

    print('- suffix:', suffix)
    print('- remove_seed_genes:', remove_seed_genes)
    print("\n")
    # ***************************

    cfg = Config(config_file, output_dir)

    # Read aggregated results from classifiers.
    # NOTE(review): pickle.load can execute arbitrary code stored in the
    # file; only load an all_clf.pkl produced by a trusted mantis-ml run.
    try:
        print("Reading all_clf.pkl ...")
        with open(str(cfg.superv_out / 'all_clf.pkl'), 'rb') as pkl_fh:
            all_clf = pickle.load(pkl_fh)
    except Exception as e:
        print(e)
        sys.exit(
            "all_clf.pkl not found. Please run 'process_classifier_results.py' first."
        )

    # Read classifiers in descending order of avg. AUC
    avg_aucs_file = str(cfg.superv_out / 'Avg_AUC_per_classifier.txt')
    sorted_classifiers = _read_sorted_classifiers(avg_aucs_file)

    print(sorted_classifiers)

    #classifiers = ['XGBoost']
    classifiers = sorted_classifiers[:]
    if not classifiers:
        sys.exit("No classifiers found in " + avg_aucs_file)

    # Read seed genes
    seed_genes = all_clf[classifiers[0]].known_genes.tolist()

    genes_to_remove = []
    if remove_seed_genes:
        genes_to_remove = seed_genes

    pval_cutoff = 1  # Set to 1, to include all

    # ------------------ (Nearly) Static options ------------------
    collapsing_top_ratio = -1  # Set to -1 to use pval_cutoff instead

    for clf_str in classifiers:

        print('\n> Classifier: ' + clf_str)
        print('Overlapping with top ' + str(float(top_ratio) * 100) + '% of ' +
              clf_str + ' predictions ...')

        rank_overlap = ExternalRankingOverlap(
            cfg,
            clf_str,
            seed_genes,
            top_ratio=top_ratio,
            max_overlapping_genes=max_overlapping_genes,
            show_full_xaxis=show_full_xaxis,
            ylim=ylim,
            xlim=xlim,
            suffix=suffix)

        rank_overlap.read_external_ranked_gene_list(external_ranked_file)
        print(rank_overlap.external_ranked_df.head())
        print(rank_overlap.external_ranked_df.shape)

        rank_overlap.calc_stepwise_hypergeometric(
            all_clf,
            pval_cutoff=pval_cutoff,
            collapsing_top_ratio=collapsing_top_ratio,
            genes_to_remove=genes_to_remove)

    # Get consensus of novel (and known) gene predictions
    for gene_class in ['Novel', 'Known']:
        cons_obj = Consensus_Gene_Predictions(config_file, output_dir,
                                              top_ratio, gene_class)
        cons_obj.run()
    def __init__(self, config_file):
        """Load the run configuration and echo two of its settings.

        Builds a ``Config`` from ``config_file``, keeps it on ``self.cfg``
        and prints its ``iterations`` and ``nthreads`` values.
        """
        self.cfg = Config(config_file)

        # Attributes are read one at a time, right before each print.
        for label, attr_name in (('Stochastic iterations:', 'iterations'),
                                 ('nthreads:', 'nthreads')):
            print(label, getattr(self.cfg, attr_name))

    parser = argparse.ArgumentParser()
    parser.add_argument('config_file') 
    parser.add_argument('-v', '--verbosity', action="count", help="print verbose output verbosity (run with -v option)")      
    args = parser.parse_args()

    if args.verbosity:
        verbose = True
    else:
        verbose = False
    print('>>> Running mantis-ml config profiling ...')
    print('verbose:', verbose)
    print('Config file:', args.config_file)

    cfg = Config(args.config_file)

    proc_obj = ProcessFeaturesFilteredByDisease(cfg)

    # common strings to exclude from profiling
    eng_stopwords = get_english_stopwords()
    custom_bullet = u'\u2022' * 5
    line_spacer = '\n' * 6

    # HPO
    assess_hpo_filtered_output(proc_obj, cfg, verbose=verbose)


    # GTEx
    assess_gtex_filtered_output(proc_obj, cfg, verbose=verbose)
                                       left_on='Gene_Name',
                                       right_on='Gene_Name')
        print(generic_features_df.shape)
        generic_features_df = pd.merge(generic_features_df,
                                       mgi_essential_df,
                                       how='left',
                                       left_on='Gene_Name',
                                       right_on='Gene_Name')
        print(generic_features_df.shape)

        # Impute 'MGI_essential_gene' with zero, for all genes that don't have a '1' value:
        # these values are not missing data but rather represent a 'False'/zero feature value.
        generic_features_df['MGI_essential_gene'].fillna(0, inplace=True)

        generic_features_df.to_csv(self.cfg.generic_feature_table,
                                   sep='\t',
                                   index=None)
        print("Saved to {0}".format(self.cfg.generic_feature_table))

        print(generic_features_df.shape)


if __name__ == '__main__':

    # Script entry point: build the generic feature table for one run.
    # Both positional arguments are required; exit with a usage message
    # instead of an IndexError traceback when one is missing.
    if len(sys.argv) < 3:
        sys.exit('Usage: ' + sys.argv[0] + ' <config_file> <out_dir>')

    config_file = sys.argv[1]  #'../../../config.yaml'
    out_dir = sys.argv[2]
    cfg = Config(config_file, out_dir)

    proc = ProcessGenericFeatures(cfg)
    proc.run_all()