Example #1
    def generate_features_from_smiles(self):
        self.jlogger.info(
            "Inside generate_features_from_smiles with is_train flag {}".
            format(self.is_train))

        data = self.ml_pipeline.data
        org_data = data.copy()

        # TODO - Can run below two methods in different threads
        padel_df = self.generate_features_using_padel()
        if self.is_train and padel_df is not None:
            self.write_padel_features_to_csv(padel_df)

        # resetting data after being used by padel feature generation
        self.ml_pipeline.data = org_data.copy()
        mordred_df = self.generate_features_using_mordered()
        if self.is_train and mordred_df is not None:
            self.write_mordred_features_to_csv(mordred_df)

        if self.is_train:
            updated_status = app_config.STEP1_STATUS

            job_oth_config_fp = self.ml_pipeline.job_data[
                'job_oth_config_path']
            helper.update_job_status(job_oth_config_fp, updated_status)

            self.ml_pipeline.status = updated_status

        self.jlogger.info("Feature generation completed successfully")
Example #2
def stop_running_job():
    job_id = request.args.get('job_id')

    error = None

    if job_id in running_jobs_details:
        job_status, job_future = running_jobs_details[job_id]

        print("Inside stop_running_job ", job_status, job_future)
        job_run_status = False
        if job_future is not None:
            job_run_status = job_future.running()
        print("Inside stop_running_job with job id {} and job_run_status {}".
              format(job_id, job_run_status))

        if job_run_status:
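            # note: a stdlib concurrent.futures Future cannot be cancelled once
            # it is already running (cancel() returns False); stopping a live
            # job presumably relies on an executor whose futures support this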
            job_future.cancel()
            flash(
                "Request to stop job {} sent successfully. You can resume this job later."
                .format(job_id), "success")

            # regardless of error, success, failure, or any other reason, remove it from the running_jobs_details map
            if job_id in running_jobs_details:
                del running_jobs_details[job_id]

                # print("Updating status to file")
                helper.update_running_job_status(job_id, "Stopped")
        else:
            error = "Job is not running, cannot stop it."
    else:
        error = "Job is not running, cannot stop it."

    if error is not None:
        flash(error, "danger")

    return redirect(url_for("view_all_jobs"))
Example #3
    def start(self):
        # perform some logging
        self.jlogger.info("Starting job with job id {}".format(self.job_id))
        self.jlogger.debug("Job Config: {}".format(self.config))
        self.jlogger.debug("Job Other Data: {}".format(self.job_data))

        try:
            rud.ReadUserData(self)
            fg.FeatureGeneration(self, is_train=True)
            pp.Preprocessing(self, is_train=True)
            fs.FeatureSelection(self, is_train=True)
            fe.FeatureExtraction(self, is_train=True)
            clf.Classification(self)
            cv.CrossValidation(self)
            tsg.TestSetGeneration(self)
            tspp.TestSetPreprocessing(self)
            tsprd.TestSetPrediction(self)
            job_success_status = True
        except Exception:
            job_success_status = False
            helper.update_running_job_status(self.job_id, "Errored")
            self.jlogger.exception(
                "Exception occurred in ML Job {}".format(self.job_id))

        return job_success_status
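
For context, a minimal sketch (not part of the original code) of how such a job might be submitted so that job_done_callback in Example #8 receives the (job_id, success_status) tuple it unpacks; the run_job wrapper and ml_job variable are assumptions:

from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=1)

def run_job(job):
    # hypothetical wrapper: job_done_callback unpacks (job_id, status) from
    # future.result(), while start() alone returns only the success status
    return job.job_id, job.start()

future = executor.submit(run_job, ml_job)
future.add_done_callback(job_done_callback)
# running_jobs_details maps job_id -> (job_status, job_future), matching
# the unpacking in stop_running_job (Example #2)
running_jobs_details[ml_job.job_id] = ("Running", future)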
Example #4
    def read_data(self, fp):
        data = pd.read_csv(fp)

        if self.validate_data(data):
            self.jlogger.info("Read data is in valid format")
            self.ml_pipeline.data = data

            updated_status = app_config.STEP0_STATUS

            job_oth_config_fp = self.ml_pipeline.job_data[
                'job_oth_config_path']
            helper.update_job_status(job_oth_config_fp, updated_status)

            self.ml_pipeline.status = updated_status
            self.jlogger.info("Read data completed successfully")
Example #5
    def search_similar_in_dbs(self):
        self.search_imppat()
        self.search_foodb()
        self.search_chebi()
        self.search_hmdb()
        self.search_pubchem()
        self.search_custom_db()

        updated_status = app_config.STEP6_STATUS

        job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
        helper.update_job_status(job_oth_config_fp, updated_status)

        self.ml_pipeline.status = updated_status

        self.jlogger.info("Test set generation completed successfully")
Example #6
    def combine_calculated_db_parts_sims(self, sim_metric, res_fld_path):
        all_novel_fld_path = os.path.join(res_fld_path, sim_metric, "all_novel")
        shortlisted_fld_path = os.path.join(res_fld_path, sim_metric, "shortlisted")

        fname = None

        df_all_novel_arr = []
        for file in os.listdir(all_novel_fld_path):
            fp = os.path.join(all_novel_fld_path, file)
            fname = file
            fl_df = pd.read_csv(fp)
            df_all_novel_arr.append(fl_df)

        # fname holds the last file iterated above; inferring the threshold
        # from it assumes the folder is non-empty and that all part files share
        # the same threshold suffix in their names
        sim_th = helper.infer_th_from_file_name(fname, sim_metric, ".csv")
        all_novel_df = pd.concat(df_all_novel_arr)

        df_all_shortlisted_arr = []
        for file in os.listdir(shortlisted_fld_path):
            fp = os.path.join(shortlisted_fld_path, file)
            fl_df = pd.read_csv(fp)
            df_all_shortlisted_arr.append(fl_df)

        shortlisted_novel_df = pd.concat(df_all_shortlisted_arr)

        return sim_th, all_novel_df, shortlisted_novel_df
Example #7
    def search_custom_db(self):

        if self.ml_pipeline.config.db_custom_flg:
            self.jlogger.info("Inside search_custom_db")
            db_path_found = False
            all_app_configs = get_app_config()
            if all_app_configs is not None:
                custom_db_path = all_app_configs['user_db_fld_path']
                if custom_db_path is not None and custom_db_path.strip() != "":
                    self.jlogger.info("Found custom DB path {}".format(custom_db_path))
                    db_path_found = True
                    compound_db_fld = custom_db_path

                    for file in os.listdir(compound_db_fld):
                        custom_db_fp = os.path.join(compound_db_fld, file)
                        self.jlogger.info("Custom DB File Name {}".format(file))

                        db_df = pd.read_csv(custom_db_fp, encoding="ISO-8859-1")

                        c_sim_obj = CompoundSimilarity(self.pos_df, db_df, self.jlogger)
                        user_ip_fps, db_fps = c_sim_obj.calculate_fps_of_all_compounds()

                        custom_db_name = helper.change_ext(file, ".csv", "")

                        self.calculate_all_similarities(c_sim_obj, db_fps, custom_db_name)

            if not db_path_found:
                self.jlogger.error(
                    "Custom DB folder path not found, unable to proceed with search on custom database")
Example #8
def job_done_callback(future):
    print("Inside job_done_callback")
    print("job_done_callback Running status ", future.running())

    job_id = None
    try:
        job_id, job_success_status = future.result()
        print("Future result inside job_done_callback ", job_id,
              job_success_status)

        if job_id in running_jobs_details:
            del running_jobs_details[job_id]
    except CancelledError as ce:
        print("CancelledError Exception inside job_done_callback ", ce)
    except Exception as e:
        # job_id is still None if future.result() raised before unpacking
        if job_id is not None:
            helper.update_running_job_status(job_id, "Errored")
        print("Exception inside job_done_callback ", e)
    def apply_on_all_fg(self):
        # Padel
        if self.ml_pipeline.config.fg_padelpy_flg:
            self.fg_fld_name = app_config.FG_PADEL_FLD_NAME
            self.preprocess_test_set()

        if self.ml_pipeline.config.fg_mordered_flg:
            # Mordred
            self.fg_fld_name = app_config.FG_MORDRED_FLD_NAME
            self.preprocess_test_set()

        updated_status = app_config.STEP6_1_STATUS

        job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
        helper.update_job_status(job_oth_config_fp, updated_status)

        self.ml_pipeline.status = updated_status

        self.jlogger.info(
            "Preprocessing of the generated test set completed successfully")
Example #10
    def apply_on_all_fg(self):
        # Padel
        if self.ml_pipeline.config.fg_padelpy_flg:
            self.fg_fld_name = app_config.FG_PADEL_FLD_NAME
            self.initialize_lime_explanation()
            self.apply_classification_models()

        if self.ml_pipeline.config.fg_mordered_flg:
            # Mordred
            self.fg_fld_name = app_config.FG_MORDRED_FLD_NAME
            self.initialize_lime_explanation()
            self.apply_classification_models()

        updated_status = app_config.STEPS_COMPLETED_STATUS

        job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
        helper.update_job_status(job_oth_config_fp, updated_status)

        self.ml_pipeline.status = updated_status

        self.jlogger.info(
            "Prediction on the generated test set completed successfully")
Example #11
def fetch_job_details():
    job_id = request.args.get('job_id')
    # print("Inside fetch_job_details")
    if job_id is not None:
        job_status = helper.get_job_status_detail(job_id, "status")
        job_desc = helper.get_job_status_detail(job_id, "jd_text")

        all_jobs = ListAllJobs()
        job_run_status = all_jobs.get_job_run_status(job_id, job_status)

        job_details = {}
        job_details['job_id'] = job_id
        job_details['job_run_status'] = job_run_status
        job_details['job_last_status'] = app_config.JOB_STATUS_LABELS[
            job_status]
        job_details['job_desc'] = job_desc

        # return jsonify(error=False, job_details=job_details)
        return render_template("job_detail.html", job_details=job_details)
    else:
        flash("A valid job id is needed to view job details", "danger")
        return render_template("log_viewer.html", show_logs=False)
Example #12
def get_job_details(job_id):
    """
    retrieves all job related configuration details
    :param job_id: job id whose details needs to be retrieved
    :return:
    tuple - job_config, job_details
    job_config - dictionary from user uploaded json, job_details - map with other job folder related params
    """
    job_details = {}
    jobs_fld = app_config.ALL_JOBS_FOLDER
    job_id_fld = os.path.join(jobs_fld, job_id)

    job_details['job_fld_path'] = job_id_fld
    job_details['job_config_path'] = os.path.join(
        job_id_fld, app_config.JOB_CONFIG_FLD_NAME)
    job_details['job_data_path'] = os.path.join(job_id_fld,
                                                app_config.JOB_DATA_FLD_NAME)
    job_details['job_results_path'] = os.path.join(
        job_id_fld, app_config.JOB_RESULTS_FLD_NAME)
    job_details['job_log_path'] = os.path.join(*[
        job_id_fld, app_config.JOB_CONFIG_FLD_NAME,
        app_config.JOB_LOGS_FLD_NAME
    ])
    job_details['job_oth_config_path'] = os.path.join(*[
        job_id_fld, app_config.JOB_CONFIG_FLD_NAME,
        app_config.JOB_OTHER_CONFIG_FNAME
    ])

    config_fp = os.path.join(job_id_fld, app_config.JOB_CONFIG_FLD_NAME,
                             app_config.JOB_CONFIG_FNAME)
    status_fp = job_details['job_oth_config_path']

    # TODO consider adding user_config validation here too
    with open(config_fp) as f:
        json_str = f.read()
        job_config = helper.create_job_config_object(json_str)

    with open(status_fp) as f:
        other_configs = json.load(f)
        status = other_configs['status']
        if status is not None:
            status = status.strip()
        if not status:  # normalize empty string to None
            status = None
        job_details['status'] = status

    return job_config, job_details
Example #13
    def fetch_model_save_predictions(self, model_name):
        model_pkl_path = os.path.join(*[
            self.ml_pipeline.job_data['job_data_path'],
            app_config.CLF_FLD_NAME, self.fg_fld_name, model_name, "clf_" +
            model_name + ".pkl"
        ])

        with open(model_pkl_path, 'rb') as f:
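            # note: pickle.load executes code embedded in the file; this is
            # assumed safe because the pipeline itself wrote the model artifact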
            model = pickle.load(f)

        all_test_df, all_test_compounds = self.load_all_test_files()

        if self.lime_exp is not None:
            self.lime_exp.lime_explainer = None

        for padel_fname, test_df in all_test_df.items():
            test_compounds = all_test_compounds[padel_fname]
            novel_compounds_predictions = self.apply_model_for_predictions(
                model, test_df, test_compounds)

            novel_pred_fld_p = os.path.join(*[
                self.ml_pipeline.job_data['job_results_path'],
                self.fg_fld_name, app_config.NOVEL_RESULTS_FLD_NAME, model_name
            ])
            os.makedirs(novel_pred_fld_p, exist_ok=True)

            # TODO Add model name in prediction file
            pred_f_name = "pred_" + model_name + "_" + padel_fname
            novel_pred_fp = os.path.join(novel_pred_fld_p, pred_f_name)

            novel_compounds_predictions.to_csv(novel_pred_fp, index=False)

            if self.ml_pipeline.config.exp_lime_flg:
                lime_exp_f_name = "lime_exp_" + model_name + "_" + helper.change_ext(
                    padel_fname, ".csv", ".pdf")
                lime_exp_pdf_fp = os.path.join(novel_pred_fld_p,
                                               lime_exp_f_name)

                self.lime_exp.exp_preds_using_lime(model, test_compounds,
                                                   padel_fname,
                                                   lime_exp_pdf_fp)
Example #14
    def search_pubchem(self):

        if self.ml_pipeline.config.db_pubchem_flg:
            self.jlogger.info("Inside search_pubchem")
            db_path_found = False
            all_app_configs = get_app_config()
            if all_app_configs is not None:
                pubchem_db_path = all_app_configs['pubchem_db_fld_path']
                if pubchem_db_path is not None and pubchem_db_path.strip() != "":
                    self.jlogger.info("Found PubChem DB path {}".format(pubchem_db_path))
                    db_path_found = True
                    compound_db_fld = pubchem_db_path

                    fld_path = self.ml_pipeline.job_data['job_data_path']
                    fld_path = os.path.join(*[fld_path, DATA_FLD_NAME, TEST_FLD_NAME])

                    res_fld_path = os.path.join(fld_path, "pubchem")

                    os.makedirs(res_fld_path, exist_ok=True)

                    for file in os.listdir(compound_db_fld):
                        pubchem_db_part_fp = os.path.join(compound_db_fld, file)
                        self.jlogger.info("PubChem Part DB File Name {}".format(file))

                        db_df = pd.read_csv(pubchem_db_part_fp, encoding="ISO-8859-1")

                        c_sim_obj = CompoundSimilarity(self.pos_df, db_df, self.jlogger)
                        user_ip_fps, db_fps = c_sim_obj.calculate_fps_of_all_compounds()

                        db_part_name = helper.change_ext(file, ".csv", "")

                        self.calculate_db_part_similarity(c_sim_obj, db_fps, db_part_name, res_fld_path)

                    self.combine_db_parts("pubchem", res_fld_path)

            if not db_path_found:
                self.jlogger.error(
                    "PubChem DB folder path not found, unable to proceed with search on PubChem database")
Example #15
    def generate_features_using_padel(self):

        if self.ml_pipeline.config.fg_padelpy_flg:
            self.jlogger.info("Inside generate_features_using_padel method")

            os_type = helper.get_os_type()

            app_temp_path = Path(APP_ROOT).parent

            if os_type.startswith("windows"):
                java_path = os.path.join(
                    *[app_temp_path, "jre8", "win", "bin", "java.exe"])
            elif os_type.startswith("darwin"):
                java_path = os.path.join(*[
                    app_temp_path, "jre8", "mac", "Contents", "Home", "bin",
                    "java"
                ])
            elif os_type.startswith("linux"):
                java_path = os.path.join(
                    *[app_temp_path, "jre8", "linux", "bin", "java"])
            else:
                java_path = None
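                # note: java_path stays None on unrecognized platforms; it is
                # assumed that the downstream PaDEL call handles that case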

            self.jlogger.info(
                "Inside generate_features_using_padel method, os type is {}".
                format(os_type))

            # # TODO temporary fix, try parallel first if fails, fall back to serial
            # self.jlogger.info("Trying generating padel features parallelly first")
            # df = self.generate_padel_features_parallely(600)  # 10 mins timeout
            # if df is None:  # if error while generating parallely
            #     self.jlogger.info("Trying generating padel features serially now")

            df = self.generate_padel_features_serially(java_path)

            return df
        else:
            return None
Example #16
    def apply_on_all_fg(self):

        if self.ml_pipeline.config.fg_padelpy_flg:
            self.jlogger.info(
                "Started feature selection of preprocessed PaDEL features")
            job_fld_path = self.ml_pipeline.job_data['job_fld_path']
            pp_padel_fld_path = os.path.join(*[
                job_fld_path, app_config.TEMP_TTS_FLD_NAME,
                app_config.FG_PADEL_FLD_NAME
            ])

            padel_xtrain_fp = os.path.join(pp_padel_fld_path,
                                           app_config.TEMP_XTRAIN_FNAME)
            padel_ytrain_fp = os.path.join(pp_padel_fld_path,
                                           app_config.TEMP_YTRAIN_FNAME)
            padel_xtest_fp = os.path.join(pp_padel_fld_path,
                                          app_config.TEMP_XTEST_FNAME)
            padel_ytest_fp = os.path.join(pp_padel_fld_path,
                                          app_config.TEMP_YTEST_FNAME)

            self.ml_pipeline.x_train = pd.read_csv(padel_xtrain_fp)
            self.ml_pipeline.y_train = pd.read_csv(padel_ytrain_fp)
            self.ml_pipeline.y_train = self.ml_pipeline.y_train.values.ravel()

            self.ml_pipeline.x_test = pd.read_csv(padel_xtest_fp)
            self.ml_pipeline.y_test = pd.read_csv(padel_ytest_fp)
            self.ml_pipeline.y_test = self.ml_pipeline.y_test.values.ravel()

            # folder path to save output of preprocessed padel features feature selection data
            fs_padel_fld_path = os.path.join(*[
                self.ml_pipeline.job_data['job_data_path'], DATA_FLD_NAME,
                app_config.FG_PADEL_FLD_NAME
            ])
            self.fg_fs_fld_path = fs_padel_fld_path
            os.makedirs(self.fg_fs_fld_path, exist_ok=True)

            self.perform_feature_selection()

        if self.ml_pipeline.config.fg_mordered_flg:
            self.jlogger.info(
                "Started feature selection of preprocessed mordred features")
            job_fld_path = self.ml_pipeline.job_data['job_fld_path']
            pp_mordred_fld_path = os.path.join(*[
                job_fld_path, app_config.TEMP_TTS_FLD_NAME,
                app_config.FG_MORDRED_FLD_NAME
            ])
            mordred_xtrain_fp = os.path.join(pp_mordred_fld_path,
                                             app_config.TEMP_XTRAIN_FNAME)
            mordred_ytrain_fp = os.path.join(pp_mordred_fld_path,
                                             app_config.TEMP_YTRAIN_FNAME)
            mordred_xtest_fp = os.path.join(pp_mordred_fld_path,
                                            app_config.TEMP_XTEST_FNAME)
            mordred_ytest_fp = os.path.join(pp_mordred_fld_path,
                                            app_config.TEMP_YTEST_FNAME)

            self.ml_pipeline.x_train = pd.read_csv(mordred_xtrain_fp)
            self.ml_pipeline.y_train = pd.read_csv(mordred_ytrain_fp)
            self.ml_pipeline.y_train = self.ml_pipeline.y_train.values.ravel()

            self.ml_pipeline.x_test = pd.read_csv(mordred_xtest_fp)
            self.ml_pipeline.y_test = pd.read_csv(mordred_ytest_fp)
            self.ml_pipeline.y_test = self.ml_pipeline.y_test.values.ravel()

            # folder path to save output of preprocessed mordred features feature selection data
            fs_mordred_fld_path = os.path.join(*[
                self.ml_pipeline.job_data['job_data_path'], DATA_FLD_NAME,
                app_config.FG_MORDRED_FLD_NAME
            ])

            self.fg_fs_fld_path = fs_mordred_fld_path
            os.makedirs(self.fg_fs_fld_path, exist_ok=True)

            self.perform_feature_selection()

        if self.is_train:
            updated_status = app_config.STEP3_STATUS

            job_oth_config_fp = self.ml_pipeline.job_data[
                'job_oth_config_path']
            helper.update_job_status(job_oth_config_fp, updated_status)

            self.ml_pipeline.status = updated_status

            self.jlogger.info("Feature selection completed successfully")
    def apply_on_all_fg(self):

        if self.ml_pipeline.config.fg_padelpy_flg:
            self.jlogger.info("Started pre-processing PaDEL features")
            padel_data_fp = os.path.join(*[
                self.ml_pipeline.job_data['job_data_path'],
                app_config.FG_FLD_NAME, app_config.FG_PADEL_FLD_NAME,
                app_config.FG_PADEL_FNAME
            ])
            pp_padel_fld_path = os.path.join(*[
                self.ml_pipeline.job_data['job_data_path'], DATA_FLD_NAME,
                app_config.FG_PADEL_FLD_NAME
            ])
            data, data_labels = self.read_data(padel_data_fp)
            self.ml_pipeline.data = data
            self.ml_pipeline.data_labels = data_labels

            # folder path to save output of padel features preprocessed data
            self.fg_pp_fld_path = pp_padel_fld_path
            os.makedirs(self.fg_pp_fld_path, exist_ok=True)

            pp_init_data_fpath = os.path.join(
                self.fg_pp_fld_path, DATA_FILE_NAME_PRFX + "init_data.csv")
            pp_init_labels_fpath = os.path.join(
                self.fg_pp_fld_path, DATA_FILE_NAME_PRFX + "init_labels.csv")

            data.to_csv(pp_init_data_fpath, index=False)
            data_labels.to_csv(pp_init_labels_fpath, index=False)

            self.preprocess_data()

        if self.ml_pipeline.config.fg_mordered_flg:
            self.jlogger.info("Started pre-processing mordred features")
            mordred_data_fp = os.path.join(*[
                self.ml_pipeline.job_data['job_data_path'],
                app_config.FG_FLD_NAME, app_config.FG_MORDRED_FLD_NAME,
                app_config.FG_MORDRED_FNAME
            ])

            pp_mordred_fld_path = os.path.join(*[
                self.ml_pipeline.job_data['job_data_path'], DATA_FLD_NAME,
                app_config.FG_MORDRED_FLD_NAME
            ])
            data, data_labels = self.read_data(mordred_data_fp)
            self.ml_pipeline.data = data
            self.ml_pipeline.data_labels = data_labels

            # folder path to save output of mordred features preprocessed data
            self.fg_pp_fld_path = pp_mordred_fld_path
            os.makedirs(self.fg_pp_fld_path, exist_ok=True)

            pp_init_data_fpath = os.path.join(
                self.fg_pp_fld_path, DATA_FILE_NAME_PRFX + "init_data.csv")
            pp_init_labels_fpath = os.path.join(
                self.fg_pp_fld_path, DATA_FILE_NAME_PRFX + "init_labels.csv")

            data.to_csv(pp_init_data_fpath, index=False)
            data_labels.to_csv(pp_init_labels_fpath, index=False)

            self.preprocess_data()

        if self.is_train:
            updated_status = app_config.STEP2_STATUS

            job_oth_config_fp = self.ml_pipeline.job_data[
                'job_oth_config_path']
            helper.update_job_status(job_oth_config_fp, updated_status)

            self.ml_pipeline.status = updated_status

        self.jlogger.info("Pre-processing completed successfully")