def generate_features_from_smiles(self):
    self.jlogger.info(
        "Inside generate_features_from_smiles with is_train flag {}".format(
            self.is_train))

    data = self.ml_pipeline.data
    org_data = data.copy()

    # TODO - the two methods below could run in different threads
    padel_df = self.generate_features_using_padel()
    if self.is_train and padel_df is not None:
        self.write_padel_features_to_csv(padel_df)

    # resetting data after being used by padel feature generation
    self.ml_pipeline.data = org_data.copy()

    mordred_df = self.generate_features_using_mordered()
    if self.is_train and mordred_df is not None:
        self.write_mordred_features_to_csv(mordred_df)

    if self.is_train:
        updated_status = app_config.STEP1_STATUS
        job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
        helper.update_job_status(job_oth_config_fp, updated_status)
        self.ml_pipeline.status = updated_status

    self.jlogger.info("Feature generation completed successfully")
def stop_running_job():
    job_id = request.args.get('job_id')
    error = None

    if job_id in running_jobs_details:
        job_status, job_future = running_jobs_details[job_id]
        print("Inside stop_running_job ", job_status, job_future)

        if job_future is not None:
            job_run_status = job_future.running()
            print("Inside stop_running_job with job id {} and job_run_status {}".format(
                job_id, job_run_status))

            if job_run_status:
                # Note: with concurrent.futures, cancel() cannot interrupt a
                # future that has already started running; this relies on the
                # executor honouring the cancellation request.
                job_future.cancel()
                flash(
                    "Request to stop job {} sent successfully. You can resume this job at some later point in time."
                    .format(job_id), "success")

                # regardless of the outcome, remove it from the running_jobs_details map
                del running_jobs_details[job_id]

                helper.update_running_job_status(job_id, "Stopped")
            else:
                error = "Job is not running, cannot stop it."
        else:
            error = "Job is not running, cannot stop it."

    if error is not None:
        flash(error, "danger")

    return redirect(url_for("view_all_jobs"))
def start(self):
    # perform some logging
    self.jlogger.info("Starting job with job id {}".format(self.job_id))
    self.jlogger.debug("Job Config: {}".format(self.config))
    self.jlogger.debug("Job Other Data: {}".format(self.job_data))

    try:
        rud.ReadUserData(self)
        fg.FeatureGeneration(self, is_train=True)
        pp.Preprocessing(self, is_train=True)
        fs.FeatureSelection(self, is_train=True)
        fe.FeatureExtraction(self, is_train=True)
        clf.Classification(self)
        cv.CrossValidation(self)
        tsg.TestSetGeneration(self)
        tspp.TestSetPreprocessing(self)
        tsprd.TestSetPrediction(self)
        job_success_status = True
    except Exception:
        job_success_status = False
        helper.update_running_job_status(self.job_id, "Errored")
        self.jlogger.exception("Exception occurred in ML Job {}".format(self.job_id))

    return job_success_status
def read_data(self, fp):
    data = pd.read_csv(fp)

    if self.validate_data(data):
        self.jlogger.info("Read data is in valid format")
        self.ml_pipeline.data = data

        updated_status = app_config.STEP0_STATUS
        job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
        helper.update_job_status(job_oth_config_fp, updated_status)
        self.ml_pipeline.status = updated_status

        self.jlogger.info("Read data completed successfully")
def search_similar_in_dbs(self):
    self.search_imppat()
    self.search_foodb()
    self.search_chebi()
    self.search_hmdb()
    self.search_pubchem()
    self.search_custom_db()

    updated_status = app_config.STEP6_STATUS
    job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
    helper.update_job_status(job_oth_config_fp, updated_status)
    self.ml_pipeline.status = updated_status

    self.jlogger.info("Test set generation completed successfully")
def combine_calculated_db_parts_sims(self, sim_metric, res_fld_path):
    all_novel_fld_path = os.path.join(res_fld_path, sim_metric, "all_novel")
    shortlisted_fld_path = os.path.join(res_fld_path, sim_metric, "shortlisted")

    fname = None
    df_all_novel_arr = []
    for file in os.listdir(all_novel_fld_path):
        fp = os.path.join(all_novel_fld_path, file)
        fname = file
        fl_df = pd.read_csv(fp)
        df_all_novel_arr.append(fl_df)

    # assuming all part files encode the same threshold in their names,
    # inferring it from the last file seen is sufficient
    sim_th = helper.infer_th_from_file_name(fname, sim_metric, ".csv")
    all_novel_df = pd.concat(df_all_novel_arr)

    df_all_shortlisted_arr = []
    for file in os.listdir(shortlisted_fld_path):
        fp = os.path.join(shortlisted_fld_path, file)
        fl_df = pd.read_csv(fp)
        df_all_shortlisted_arr.append(fl_df)

    shortlisted_novel_df = pd.concat(df_all_shortlisted_arr)

    return sim_th, all_novel_df, shortlisted_novel_df
def search_custom_db(self):
    if self.ml_pipeline.config.db_custom_flg:
        self.jlogger.info("Inside search_custom_db")

        db_path_found = False
        all_app_configs = get_app_config()
        if all_app_configs is not None:
            custom_db_path = all_app_configs['user_db_fld_path']
            if custom_db_path is not None and custom_db_path.strip() != "":
                self.jlogger.info("Found custom DB path {}".format(custom_db_path))
                db_path_found = True
                compound_db_fld = custom_db_path

                for file in os.listdir(compound_db_fld):
                    custom_db_fp = os.path.join(compound_db_fld, file)
                    self.jlogger.info("Custom DB File Name {}".format(file))

                    db_df = pd.read_csv(custom_db_fp, encoding="ISO-8859-1")
                    c_sim_obj = CompoundSimilarity(self.pos_df, db_df, self.jlogger)
                    user_ip_fps, db_fps = c_sim_obj.calculate_fps_of_all_compounds()

                    custom_db_name = helper.change_ext(file, ".csv", "")
                    self.calculate_all_similarities(c_sim_obj, db_fps, custom_db_name)

        if not db_path_found:
            self.jlogger.error(
                "Custom DB folder path not found, unable to proceed with search on custom database")
def job_done_callback(future):
    print("Inside job_done_callback")
    print("job_done_callback Running status ", future.running())

    job_id = None
    try:
        job_id, job_success_status = future.result()
        print("Future result inside job_done_callback ", job_id, job_success_status)

        if job_id in running_jobs_details:
            del running_jobs_details[job_id]
    except CancelledError as ce:
        print("CancelledError Exception inside job_done_callback ", ce)
    except Exception as e:
        # job_id is only known if future.result() returned before the failure
        if job_id is not None:
            helper.update_running_job_status(job_id, "Errored")
        print("Exception inside job_done_callback ", e)
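# A minimal sketch of how job_done_callback is presumably wired up, assuming
# jobs are submitted through a concurrent.futures executor and that the
# submitted callable returns the (job_id, job_success_status) tuple the
# callback unpacks. The run_job wrapper, the MLJob class, and the "Running"
# label are illustrative assumptions, not part of the original code.
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=1)

def run_job(job_id):
    job = MLJob(job_id)    # hypothetical job class wrapping start()
    success = job.start()  # start() returns job_success_status
    return job_id, success

def submit_job(job_id):
    future = executor.submit(run_job, job_id)
    running_jobs_details[job_id] = ("Running", future)
    future.add_done_callback(job_done_callback)  # concurrent.futures API
    return future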
def apply_on_all_fg(self):
    # Padel
    if self.ml_pipeline.config.fg_padelpy_flg:
        self.fg_fld_name = app_config.FG_PADEL_FLD_NAME
        self.preprocess_test_set()

    # Mordred
    if self.ml_pipeline.config.fg_mordered_flg:
        self.fg_fld_name = app_config.FG_MORDRED_FLD_NAME
        self.preprocess_test_set()

    updated_status = app_config.STEP6_1_STATUS
    job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
    helper.update_job_status(job_oth_config_fp, updated_status)
    self.ml_pipeline.status = updated_status

    self.jlogger.info("Generated test set preprocessing completed successfully")
def apply_on_all_fg(self):
    # Padel
    if self.ml_pipeline.config.fg_padelpy_flg:
        self.fg_fld_name = app_config.FG_PADEL_FLD_NAME
        self.initialize_lime_explanation()
        self.apply_classification_models()

    # Mordred
    if self.ml_pipeline.config.fg_mordered_flg:
        self.fg_fld_name = app_config.FG_MORDRED_FLD_NAME
        self.initialize_lime_explanation()
        self.apply_classification_models()

    updated_status = app_config.STEPS_COMPLETED_STATUS
    job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
    helper.update_job_status(job_oth_config_fp, updated_status)
    self.ml_pipeline.status = updated_status

    self.jlogger.info("Generated test set prediction completed successfully")
def fetch_job_details():
    job_id = request.args.get('job_id')

    if job_id is not None:
        job_status = helper.get_job_status_detail(job_id, "status")
        job_desc = helper.get_job_status_detail(job_id, "jd_text")

        all_jobs = ListAllJobs()
        job_run_status = all_jobs.get_job_run_status(job_id, job_status)

        job_details = {
            'job_id': job_id,
            'job_run_status': job_run_status,
            'job_last_status': app_config.JOB_STATUS_LABELS[job_status],
            'job_desc': job_desc,
        }

        return render_template("job_detail.html", job_details=job_details)
    else:
        flash("A valid job id is needed to view job details", "danger")
        return render_template("log_viewer.html", show_logs=False)
def get_job_details(job_id):
    """
    Retrieves all job related configuration details.

    :param job_id: job id whose details need to be retrieved
    :return: tuple - (job_config, job_details)
             job_config - dictionary from the user uploaded json,
             job_details - map with other job folder related params
    """
    job_details = {}

    jobs_fld = app_config.ALL_JOBS_FOLDER
    job_id_fld = os.path.join(jobs_fld, job_id)

    job_details['job_fld_path'] = job_id_fld
    job_details['job_config_path'] = os.path.join(job_id_fld,
                                                  app_config.JOB_CONFIG_FLD_NAME)
    job_details['job_data_path'] = os.path.join(job_id_fld,
                                                app_config.JOB_DATA_FLD_NAME)
    job_details['job_results_path'] = os.path.join(job_id_fld,
                                                   app_config.JOB_RESULTS_FLD_NAME)
    job_details['job_log_path'] = os.path.join(*[
        job_id_fld, app_config.JOB_CONFIG_FLD_NAME, app_config.JOB_LOGS_FLD_NAME
    ])
    job_details['job_oth_config_path'] = os.path.join(*[
        job_id_fld, app_config.JOB_CONFIG_FLD_NAME, app_config.JOB_OTHER_CONFIG_FNAME
    ])

    config_fp = os.path.join(job_id_fld, app_config.JOB_CONFIG_FLD_NAME,
                             app_config.JOB_CONFIG_FNAME)
    status_fp = job_details['job_oth_config_path']

    # TODO consider adding user_config validation here too
    with open(config_fp) as f:
        json_str = f.read()
        job_config = helper.create_job_config_object(json_str)

    with open(status_fp) as f:
        other_configs = json.load(f)

    # normalize a missing or blank status to None
    status = other_configs['status']
    if status is None or status.strip() == '':
        status = None
    else:
        status = status.strip()

    job_details['status'] = status

    return job_config, job_details
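# For reference, the on-disk job layout implied by get_job_details; the
# app_config constant names stand in for their concrete values, which are not
# visible here:
#
#   <ALL_JOBS_FOLDER>/<job_id>/
#       <JOB_CONFIG_FLD_NAME>/
#           <JOB_CONFIG_FNAME>        # user uploaded job config (json)
#           <JOB_OTHER_CONFIG_FNAME>  # other configs, including 'status'
#           <JOB_LOGS_FLD_NAME>/      # job logs
#       <JOB_DATA_FLD_NAME>/          # job data
#       <JOB_RESULTS_FLD_NAME>/       # job results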
def fetch_model_save_predictions(self, model_name):
    model_pkl_path = os.path.join(*[
        self.ml_pipeline.job_data['job_data_path'], app_config.CLF_FLD_NAME,
        self.fg_fld_name, model_name, "clf_" + model_name + ".pkl"
    ])

    with open(model_pkl_path, 'rb') as f:
        model = pickle.load(f)

    all_test_df, all_test_compounds = self.load_all_test_files()

    if self.lime_exp is not None:
        self.lime_exp.lime_explainer = None

    for padel_fname, test_df in all_test_df.items():
        test_compounds = all_test_compounds[padel_fname]
        novel_compounds_predictions = self.apply_model_for_predictions(
            model, test_df, test_compounds)

        novel_pred_fld_p = os.path.join(*[
            self.ml_pipeline.job_data['job_results_path'], self.fg_fld_name,
            app_config.NOVEL_RESULTS_FLD_NAME, model_name
        ])
        os.makedirs(novel_pred_fld_p, exist_ok=True)

        # TODO Add model name in prediction file
        pred_f_name = "pred_" + model_name + "_" + padel_fname
        novel_pred_fp = os.path.join(novel_pred_fld_p, pred_f_name)
        novel_compounds_predictions.to_csv(novel_pred_fp, index=False)

        if self.ml_pipeline.config.exp_lime_flg:
            lime_exp_f_name = "lime_exp_" + model_name + "_" + helper.change_ext(
                padel_fname, ".csv", ".pdf")
            lime_exp_pdf_fp = os.path.join(novel_pred_fld_p, lime_exp_f_name)
            self.lime_exp.exp_preds_using_lime(model, test_compounds,
                                               padel_fname, lime_exp_pdf_fp)
def search_pubchem(self):
    if self.ml_pipeline.config.db_pubchem_flg:
        self.jlogger.info("Inside search_pubchem")

        db_path_found = False
        all_app_configs = get_app_config()
        if all_app_configs is not None:
            pubchem_db_path = all_app_configs['pubchem_db_fld_path']
            if pubchem_db_path is not None and pubchem_db_path.strip() != "":
                self.jlogger.info("Found PubChem DB path {}".format(pubchem_db_path))
                db_path_found = True
                compound_db_fld = pubchem_db_path

                fld_path = self.ml_pipeline.job_data['job_data_path']
                fld_path = os.path.join(*[fld_path, DATA_FLD_NAME, TEST_FLD_NAME])
                res_fld_path = os.path.join(fld_path, "pubchem")
                os.makedirs(res_fld_path, exist_ok=True)

                for file in os.listdir(compound_db_fld):
                    pubchem_db_part_fp = os.path.join(compound_db_fld, file)
                    self.jlogger.info("PubChem Part DB File Name {}".format(file))

                    db_df = pd.read_csv(pubchem_db_part_fp, encoding="ISO-8859-1")
                    c_sim_obj = CompoundSimilarity(self.pos_df, db_df, self.jlogger)
                    user_ip_fps, db_fps = c_sim_obj.calculate_fps_of_all_compounds()

                    db_part_name = helper.change_ext(file, ".csv", "")
                    self.calculate_db_part_similarity(c_sim_obj, db_fps,
                                                      db_part_name, res_fld_path)

                self.combine_db_parts("pubchem", res_fld_path)

        if not db_path_found:
            self.jlogger.error(
                "PubChem DB folder path not found, unable to proceed with search on PubChem database")
def generate_features_using_padel(self):
    if self.ml_pipeline.config.fg_padelpy_flg:
        self.jlogger.info("Inside generate_features_using_padel method")

        os_type = helper.get_os_type()
        app_temp_path = Path(APP_ROOT).parent

        # pick the bundled JRE matching the host OS
        if os_type.startswith("windows"):
            java_path = os.path.join(*[app_temp_path, "jre8", "win", "bin", "java.exe"])
        elif os_type.startswith("darwin"):
            java_path = os.path.join(*[
                app_temp_path, "jre8", "mac", "Contents", "Home", "bin", "java"
            ])
        elif os_type.startswith("linux"):
            java_path = os.path.join(*[app_temp_path, "jre8", "linux", "bin", "java"])
        else:
            java_path = None

        self.jlogger.info(
            "Inside generate_features_using_padel method, os type is {}".format(os_type))

        # TODO temporary fix: try parallel first and, if that fails, fall back to serial
        # self.jlogger.info("Trying generating padel features parallelly first")
        # df = self.generate_padel_features_parallely(600)  # 10 mins timeout
        # if df is None:  # if error while generating parallely
        #     self.jlogger.info("Trying generating padel features serially now")

        df = self.generate_padel_features_serially(java_path)
        return df
    else:
        return None
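# A minimal sketch of the parallel-first fallback described in the TODO inside
# generate_features_using_padel, assuming generate_padel_features_parallely(timeout)
# exists as hinted by the commented-out code and returns None when the parallel
# run errors out or times out. The method name and the 600-second default are
# taken from those comments, not from running code.
def generate_padel_features_with_fallback(self, java_path, timeout_sec=600):
    self.jlogger.info("Trying generating padel features parallelly first")
    df = self.generate_padel_features_parallely(timeout_sec)
    if df is None:  # parallel generation errored or timed out
        self.jlogger.info("Trying generating padel features serially now")
        df = self.generate_padel_features_serially(java_path)
    return df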
def apply_on_all_fg(self):
    if self.ml_pipeline.config.fg_padelpy_flg:
        self.jlogger.info("Started feature selection of preprocessed PaDEL features")

        job_fld_path = self.ml_pipeline.job_data['job_fld_path']
        pp_padel_fld_path = os.path.join(*[
            job_fld_path, app_config.TEMP_TTS_FLD_NAME, app_config.FG_PADEL_FLD_NAME
        ])

        padel_xtrain_fp = os.path.join(pp_padel_fld_path, app_config.TEMP_XTRAIN_FNAME)
        padel_ytrain_fp = os.path.join(pp_padel_fld_path, app_config.TEMP_YTRAIN_FNAME)
        padel_xtest_fp = os.path.join(pp_padel_fld_path, app_config.TEMP_XTEST_FNAME)
        padel_ytest_fp = os.path.join(pp_padel_fld_path, app_config.TEMP_YTEST_FNAME)

        self.ml_pipeline.x_train = pd.read_csv(padel_xtrain_fp)
        self.ml_pipeline.y_train = pd.read_csv(padel_ytrain_fp).values.ravel()
        self.ml_pipeline.x_test = pd.read_csv(padel_xtest_fp)
        self.ml_pipeline.y_test = pd.read_csv(padel_ytest_fp).values.ravel()

        # folder path for the feature-selection output of the preprocessed PaDEL features
        fs_padel_fld_path = os.path.join(*[
            self.ml_pipeline.job_data['job_data_path'], DATA_FLD_NAME,
            app_config.FG_PADEL_FLD_NAME
        ])
        self.fg_fs_fld_path = fs_padel_fld_path
        os.makedirs(self.fg_fs_fld_path, exist_ok=True)

        self.perform_feature_selection()

    if self.ml_pipeline.config.fg_mordered_flg:
        self.jlogger.info("Started feature selection of preprocessed mordred features")

        job_fld_path = self.ml_pipeline.job_data['job_fld_path']
        pp_mordred_fld_path = os.path.join(*[
            job_fld_path, app_config.TEMP_TTS_FLD_NAME, app_config.FG_MORDRED_FLD_NAME
        ])

        mordred_xtrain_fp = os.path.join(pp_mordred_fld_path, app_config.TEMP_XTRAIN_FNAME)
        mordred_ytrain_fp = os.path.join(pp_mordred_fld_path, app_config.TEMP_YTRAIN_FNAME)
        mordred_xtest_fp = os.path.join(pp_mordred_fld_path, app_config.TEMP_XTEST_FNAME)
        mordred_ytest_fp = os.path.join(pp_mordred_fld_path, app_config.TEMP_YTEST_FNAME)

        self.ml_pipeline.x_train = pd.read_csv(mordred_xtrain_fp)
        self.ml_pipeline.y_train = pd.read_csv(mordred_ytrain_fp).values.ravel()
        self.ml_pipeline.x_test = pd.read_csv(mordred_xtest_fp)
        self.ml_pipeline.y_test = pd.read_csv(mordred_ytest_fp).values.ravel()

        # folder path for the feature-selection output of the preprocessed mordred features
        fs_mordred_fld_path = os.path.join(*[
            self.ml_pipeline.job_data['job_data_path'], DATA_FLD_NAME,
            app_config.FG_MORDRED_FLD_NAME
        ])
        self.fg_fs_fld_path = fs_mordred_fld_path
        os.makedirs(self.fg_fs_fld_path, exist_ok=True)

        self.perform_feature_selection()

    if self.is_train:
        updated_status = app_config.STEP3_STATUS
        job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
        helper.update_job_status(job_oth_config_fp, updated_status)
        self.ml_pipeline.status = updated_status

    self.jlogger.info("Feature selection completed successfully")
def apply_on_all_fg(self):
    if self.ml_pipeline.config.fg_padelpy_flg:
        self.jlogger.info("Started pre-processing PaDEL features")

        padel_data_fp = os.path.join(*[
            self.ml_pipeline.job_data['job_data_path'], app_config.FG_FLD_NAME,
            app_config.FG_PADEL_FLD_NAME, app_config.FG_PADEL_FNAME
        ])
        pp_padel_fld_path = os.path.join(*[
            self.ml_pipeline.job_data['job_data_path'], DATA_FLD_NAME,
            app_config.FG_PADEL_FLD_NAME
        ])

        data, data_labels = self.read_data(padel_data_fp)
        self.ml_pipeline.data = data
        self.ml_pipeline.data_labels = data_labels

        # folder path to save the preprocessed PaDEL features
        self.fg_pp_fld_path = pp_padel_fld_path
        os.makedirs(self.fg_pp_fld_path, exist_ok=True)

        pp_init_data_fpath = os.path.join(self.fg_pp_fld_path,
                                          DATA_FILE_NAME_PRFX + "init_data.csv")
        pp_init_labels_fpath = os.path.join(self.fg_pp_fld_path,
                                            DATA_FILE_NAME_PRFX + "init_labels.csv")
        data.to_csv(pp_init_data_fpath, index=False)
        data_labels.to_csv(pp_init_labels_fpath, index=False)

        self.preprocess_data()

    if self.ml_pipeline.config.fg_mordered_flg:
        self.jlogger.info("Started pre-processing mordred features")

        mordred_data_fp = os.path.join(*[
            self.ml_pipeline.job_data['job_data_path'], app_config.FG_FLD_NAME,
            app_config.FG_MORDRED_FLD_NAME, app_config.FG_MORDRED_FNAME
        ])
        pp_mordred_fld_path = os.path.join(*[
            self.ml_pipeline.job_data['job_data_path'], DATA_FLD_NAME,
            app_config.FG_MORDRED_FLD_NAME
        ])

        data, data_labels = self.read_data(mordred_data_fp)
        self.ml_pipeline.data = data
        self.ml_pipeline.data_labels = data_labels

        # folder path to save the preprocessed mordred features
        self.fg_pp_fld_path = pp_mordred_fld_path
        os.makedirs(self.fg_pp_fld_path, exist_ok=True)

        pp_init_data_fpath = os.path.join(self.fg_pp_fld_path,
                                          DATA_FILE_NAME_PRFX + "init_data.csv")
        pp_init_labels_fpath = os.path.join(self.fg_pp_fld_path,
                                            DATA_FILE_NAME_PRFX + "init_labels.csv")
        data.to_csv(pp_init_data_fpath, index=False)
        data_labels.to_csv(pp_init_labels_fpath, index=False)

        self.preprocess_data()

    if self.is_train:
        updated_status = app_config.STEP2_STATUS
        job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
        helper.update_job_status(job_oth_config_fp, updated_status)
        self.ml_pipeline.status = updated_status

    self.jlogger.info("Pre-processing completed successfully")
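# The PaDEL and Mordred branches above (and in the feature-selection variant of
# apply_on_all_fg) differ only in their config flag and app_config names. A
# possible behavior-equivalent rewrite, a sketch assuming read_data and
# preprocess_data behave exactly as in the original:
def apply_on_all_fg(self):
    feature_gens = [
        (self.ml_pipeline.config.fg_padelpy_flg, "PaDEL",
         app_config.FG_PADEL_FLD_NAME, app_config.FG_PADEL_FNAME),
        (self.ml_pipeline.config.fg_mordered_flg, "mordred",
         app_config.FG_MORDRED_FLD_NAME, app_config.FG_MORDRED_FNAME),
    ]

    for enabled, label, fld_name, fname in feature_gens:
        if not enabled:
            continue
        self.jlogger.info("Started pre-processing {} features".format(label))

        data_fp = os.path.join(*[
            self.ml_pipeline.job_data['job_data_path'],
            app_config.FG_FLD_NAME, fld_name, fname
        ])
        self.fg_pp_fld_path = os.path.join(*[
            self.ml_pipeline.job_data['job_data_path'], DATA_FLD_NAME, fld_name
        ])
        os.makedirs(self.fg_pp_fld_path, exist_ok=True)

        data, data_labels = self.read_data(data_fp)
        self.ml_pipeline.data = data
        self.ml_pipeline.data_labels = data_labels

        data.to_csv(os.path.join(self.fg_pp_fld_path,
                                 DATA_FILE_NAME_PRFX + "init_data.csv"), index=False)
        data_labels.to_csv(os.path.join(self.fg_pp_fld_path,
                                        DATA_FILE_NAME_PRFX + "init_labels.csv"), index=False)

        self.preprocess_data()

    if self.is_train:
        updated_status = app_config.STEP2_STATUS
        job_oth_config_fp = self.ml_pipeline.job_data['job_oth_config_path']
        helper.update_job_status(job_oth_config_fp, updated_status)
        self.ml_pipeline.status = updated_status

    self.jlogger.info("Pre-processing completed successfully")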