def regularProject(self, Xb, results):
    ''' projects a collection of query objects in a regular model,
    for obtaining predictions '''

    Yp = self.estimator.predict(Xb)

    utils.add_result(results, Yp, 'values', 'Prediction',
                     'result', 'objs',
                     'Results of the prediction', 'main')
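# The method above only assumes that self.estimator exposes the
# scikit-learn predict() API. A minimal sketch with a placeholder
# estimator and arrays (X_train, Y_train, Xb are illustrative):
#
#   import numpy as np
#   from sklearn.ensemble import RandomForestRegressor
#
#   X_train = np.random.rand(20, 5)         # 20 objects, 5 descriptors
#   Y_train = np.random.rand(20)
#   Xb = np.random.rand(3, 5)               # 3 query objects
#
#   estimator = RandomForestRegressor(n_estimators=100)
#   estimator.fit(X_train, Y_train)
#   Yp = estimator.predict(Xb)              # one prediction per query object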
def conformalProject(self, Xb, results):
    ''' projects a collection of query objects in a conformal model,
    for obtaining predictions '''

    prediction = self.conformal_pred.predict(
        Xb, significance=self.conformalSignificance)

    if self.quantitative:
        mean1 = np.mean(prediction, axis=1)
        lower_limit = prediction[:, 0]
        upper_limit = prediction[:, 1]
        utils.add_result(results, mean1, 'values', 'Prediction',
                         'result', 'objs',
                         'Results of the prediction', 'main')
        utils.add_result(results, lower_limit, 'lower_limit',
                         'Lower limit', 'confidence', 'objs',
                         'Lower limit of the conformal prediction')
        utils.add_result(results, upper_limit, 'upper_limit',
                         'Upper limit', 'confidence', 'objs',
                         'Upper limit of the conformal prediction')
    else:
        # for the moment, return one boolean vector per class with the
        # conformal class assignment, e.g.:
        #   / c0   / c1   / c2    /
        #   / True / True / False /
        for i in range(len(prediction[0])):
            class_key = 'c' + str(i)
            class_label = 'Class ' + str(i)
            class_list = prediction[:, i].tolist()
            utils.add_result(results, class_list, class_key, class_label,
                             'result', 'objs',
                             'Conformal class assignment', 'main')
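# Shape conventions assumed above (nonconformist-style conformal
# predictors): for quantitative models predict() returns an
# (n_objects, 2) array with the lower and upper interval bounds, so the
# row mean is the interval midpoint; for qualitative models it returns an
# (n_objects, n_classes) boolean array. A minimal sketch with placeholder
# values:
#
#   import numpy as np
#
#   prediction = np.array([[3.1, 5.7],      # interval for object 1
#                          [2.4, 4.8]])     # interval for object 2
#   mid = np.mean(prediction, axis=1)       # array([4.4, 3.6])
#   lower, upper = prediction[:, 0], prediction[:, 1]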
def external_validation(self):
    ''' when experimental values are available for the predicted
    compounds, run external validation '''

    ext_val_results = []

    # Ye are the y values present in the input file
    Ye = np.asarray(self.results["ymatrix"])

    # there are four variants of external validation, depending on whether
    # the method is conformal or non-conformal and the model is
    # qualitative or quantitative
    if not self.parameters["conformal"]:
        # non-conformal
        if not self.parameters["quantitative"]:
            # non-conformal & qualitative
            Yp = np.asarray(self.results["values"])

            if Ye.size == 0:
                raise ValueError("Experimental activity vector is empty")
            if Yp.size == 0:
                raise ValueError("Predicted activity vector is empty")

            # the use of labels is compulsory to inform the confusion
            # matrix that it must return a 2x2 confusion matrix.
            # Otherwise it will fail when a single class is represented
            # (all TP, for example)
            TN, FP, FN, TP = confusion_matrix(Ye, Yp,
                                              labels=[0, 1]).ravel()

            MCC = mcc(Ye, Yp)

            # protect against warnings in special cases (div by zero)
            if (TP + FN) > 0:
                sensitivity = TP / (TP + FN)
            else:
                sensitivity = 0.0

            if (TN + FP) > 0:
                specificity = TN / (TN + FP)
            else:
                specificity = 0.0

            ext_val_results.append(
                ('TP_ex', 'True positives in external-validation',
                 float(TP)))
            ext_val_results.append(
                ('TN_ex', 'True negatives in external-validation',
                 float(TN)))
            ext_val_results.append(
                ('FP_ex', 'False positives in external-validation',
                 float(FP)))
            ext_val_results.append(
                ('FN_ex', 'False negatives in external-validation',
                 float(FN)))
            ext_val_results.append(
                ('Sensitivity_ex', 'Sensitivity in external-validation',
                 float(sensitivity)))
            ext_val_results.append(
                ('Specificity_ex', 'Specificity in external-validation',
                 float(specificity)))
            ext_val_results.append(
                ('MCC_ex',
                 'Matthews Correlation Coefficient in external-validation',
                 float(MCC)))
        else:
            # non-conformal & quantitative
            Yp = np.asarray(self.results["values"])

            if Ye.size == 0:
                raise ValueError("Experimental activity vector is empty")
            if Yp.size == 0:
                raise ValueError("Predicted activity vector is empty")

            Ym = np.mean(Ye)
            nobj = len(Yp)

            SSY0_out = np.sum(np.square(Ym - Ye))
            SSY_out = np.sum(np.square(Ye - Yp))
            scoringP = mean_squared_error(Ye, Yp)
            SDEP = np.sqrt(SSY_out / nobj)
            Q2 = 1.00 - (SSY_out / SSY0_out)

            ext_val_results.append(
                ('scoringP_ex', 'Scoring P', scoringP))
            ext_val_results.append(
                ('Q2_ex',
                 'Determination coefficient in external-validation', Q2))
            ext_val_results.append(
                ('SDEP_ex',
                 'Standard Deviation Error of the Predictions', SDEP))

        utils.add_result(self.results, ext_val_results,
                         'external-validation', 'external validation',
                         'method', 'single',
                         'External validation results')

    else:
        # conformal external validation
        if not self.parameters["quantitative"]:
            # conformal & qualitative
            Yp = np.concatenate(
                (np.asarray(self.results['c0']).reshape(-1, 1),
                 np.asarray(self.results['c1']).reshape(-1, 1)), axis=1)

            if Ye.size == 0:
                raise ValueError("Experimental activity vector is empty")
            if Yp.size == 0:
                raise ValueError("Predicted activity vector is empty")

            c0_correct = 0
            c1_correct = 0
            not_predicted = 0
            c0_incorrect = 0
            c1_incorrect = 0

            # collect the unambiguously predicted objects for the MCC
            Ye1 = []
            Yp1 = []
            for i in range(len(Ye)):
                real = float(Ye[i])
                predicted = Yp[i]
                if predicted[0] != predicted[1]:
                    Ye1.append(real)
                    if predicted[0]:
                        Yp1.append(0)
                    else:
                        Yp1.append(1)

                    if real == 0 and predicted[0]:
                        c0_correct += 1
                    if real == 0 and predicted[1]:
                        c0_incorrect += 1
                    if real == 1 and predicted[1]:
                        c1_correct += 1
                    if real == 1 and predicted[0]:
                        c1_incorrect += 1
                else:
                    not_predicted += 1

            MCC = mcc(Ye1, Yp1)
            TN = c0_correct
            FP = c0_incorrect
            TP = c1_correct
            FN = c1_incorrect
            coverage = float((len(Yp) - not_predicted) / len(Yp))

            # protect against warnings in special cases (div by zero)
            if (TP + FN) > 0:
                sensitivity = TP / (TP + FN)
            else:
                sensitivity = 0.0

            if (TN + FP) > 0:
                specificity = TN / (TN + FP)
            else:
                specificity = 0.0

            ext_val_results.append(
                ('TP', 'True positives in external-validation',
                 float(TP)))
            ext_val_results.append(
                ('TN', 'True negatives in external-validation',
                 float(TN)))
            ext_val_results.append(
                ('FP', 'False positives in external-validation',
                 float(FP)))
            ext_val_results.append(
                ('FN', 'False negatives in external-validation',
                 float(FN)))
            ext_val_results.append(
                ('Coverage', 'Conformal coverage in external-validation',
                 float(coverage)))
            ext_val_results.append(
                ('Sensitivity', 'Sensitivity in external-validation',
                 float(sensitivity)))
            ext_val_results.append(
                ('Specificity', 'Specificity in external-validation',
                 float(specificity)))
            ext_val_results.append(
                ('MCC',
                 'Matthews Correlation Coefficient in external-validation',
                 float(MCC)))

            utils.add_result(self.results, ext_val_results,
                             'external-validation', 'external validation',
                             'method', 'single',
                             'External validation results')
        else:
            # conformal & quantitative
            Yp_lower = np.asarray(self.results['lower_limit'])
            Yp_upper = np.asarray(self.results['upper_limit'])

            # mean width of the conformal prediction intervals
            mean_interval = np.mean(np.abs(Yp_upper - Yp_lower))

            # conformal accuracy: fraction of experimental values falling
            # inside the predicted interval
            inside_interval = (Yp_lower < Ye) & (Yp_upper > Ye)
            accuracy = np.count_nonzero(inside_interval) / len(Ye)

            conformal_accuracy = float("{0:.2f}".format(accuracy))
            conformal_mean_interval = float(
                "{0:.2f}".format(mean_interval))

            ext_val_results.append(
                ('Conformal_mean_interval', 'Conformal mean interval',
                 conformal_mean_interval))
            ext_val_results.append(
                ('Conformal_accuracy', 'Conformal accuracy',
                 conformal_accuracy))

            utils.add_result(self.results, ext_val_results,
                             'external-validation', 'external validation',
                             'method', 'single',
                             'External validation results')
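# Worked example of the qualitative metrics computed above, with
# illustrative values:
#
#   from sklearn.metrics import confusion_matrix
#   from sklearn.metrics import matthews_corrcoef as mcc
#
#   Ye = [0, 0, 1, 1, 1]                    # experimental classes
#   Yp = [0, 1, 1, 1, 0]                    # predicted classes
#   TN, FP, FN, TP = confusion_matrix(Ye, Yp, labels=[0, 1]).ravel()
#   sensitivity = TP / (TP + FN)            # 2/3
#   specificity = TN / (TN + FP)            # 1/2
#   MCC = mcc(Ye, Yp)                       # ~0.17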
def _run_molecule(self):
    ''' version of Run for molecular input '''

    # extract useful information from file
    success_inform = self.extractInformation(self.ifile)
    if 'error' in self.results:
        return

    nobj = self.results['obj_num']
    ncpu = min(nobj, self.parameters['numCPUs'])

    # copy the input file to a temp file which will be cleaned at the end
    temp_path = tempfile.mkdtemp()
    shutil.copy(self.ifile, temp_path)
    lfile = os.path.join(temp_path, os.path.basename(self.ifile))

    # Execute the workflow in 1 or n CPUs
    if ncpu > 1:
        LOG.debug('Entering molecule workflow for {} cpus'.format(ncpu))
        success, results = sdfu.split_SDFile(lfile, ncpu)

        if not success:
            self.results['error'] = 'unable to split input molecule'
            return

        split_files_names = results[0]
        split_files_sizes = results[1]

        pool = mp.Pool(ncpu)
        if self.parameters['mol_batch'] == 'series':
            results = pool.map(self.workflow_series, split_files_names)
        else:
            results = pool.map(self.workflow_objects, split_files_names)

        success, results = self.consolidate(results, split_files_sizes)
    else:
        if self.parameters['mol_batch'] == 'series':
            success, results = self.workflow_series(lfile)
        else:
            success, results = self.workflow_objects(lfile)

    # series processing (1 or n CPUs) can produce success == False if
    # any of the series/pieces contains an error. Abort the processing
    if not success:
        self.results['error'] = results
        return

    # check if any molecule failed to complete the workflow and then
    # amend object annotations in self.results
    success_workflow = results[2]

    if len(success_inform) != len(success_workflow):
        LOG.error('shape mismatch of informed and workflow results:'
                  f' ({len(success_inform)}, {len(success_workflow)})'
                  ' This is because some molecules failed during'
                  ' the standardization or descriptor computations.')
        self.results['error'] = ('number of molecules informed'
                                 ' and processed does not match')
        return

    # Check if any molecule not informed succeeded in completing MD
    # generation. This should never happen, because such molecules do not
    # pass the normalization step
    for i, (inform, workflow) in enumerate(
            zip(success_inform, success_workflow)):
        if workflow and not inform:
            LOG.critical(f'Molecule #{i} is `None` in RDKit'
                         ' but appears to be processed. This means that'
                         ' there is a serious workflow issue and the'
                         ' molecule should be cured or eliminated.')
            self.results['error'] = ('Unknown error processing input file.'
                                     ' Probably the format is wrong or'
                                     ' not supported')
            return

    # check if a molecule informed did not
    # succeed to complete MD generation
    for i, j in zip(success_inform, success_workflow):
        if i and not j:
            self.ammend_objects(success_inform, success_workflow)
            break

    # remove the temp directory with all the temp files inside
    shutil.rmtree(temp_path)

    utils.add_result(self.results, results[0], 'xmatrix', 'X matrix',
                     'method', 'vars', 'Molecular descriptors')
    utils.add_result(self.results, results[1], 'var_nam', 'Var names',
                     'method', 'vars', 'Names of the X variables')
    return
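# The parallel branch above follows the standard pool.map pattern: split
# the input SDFile into ncpu fragments and map a worker over them. A
# minimal sketch with placeholder names (the real fragments come from
# sdfu.split_SDFile and are merged back by self.consolidate):
#
#   import multiprocessing as mp
#
#   def process_fragment(fragment_name):
#       # compute and return partial results for one SDFile fragment
#       return fragment_name.upper()
#
#   if __name__ == '__main__':
#       with mp.Pool(2) as pool:
#           partial = pool.map(process_fragment,
#                              ['part1.sdf', 'part2.sdf'])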
def extractInformation(self, ifile):
    ''' Extracts molecule names, biological annotations and experimental
    values from an SDFile.

    All this information is added to the results using method
    utils.add_result, so it is also inserted into the results manifest.
    '''

    # Initiate a RDKit SDFile iterator to process the molecules one by one
    try:
        suppl = Chem.SDMolSupplier(ifile)
        LOG.debug(f'mol supplier created from {ifile}')
    except Exception as e:
        LOG.debug('Unable to create mol supplier with the exception: '
                  f'{e}')
        self.results['error'] = f'unable to open {ifile}. {e}'
        return

    # Raise error if SDF is empty
    if len(suppl) == 0:
        LOG.critical('ifile {} is empty'.format(ifile))
        raise ValueError('Input SDF is empty')

    # Initiate lists which will contain the extracted values
    obj_nam = []
    obj_bio = []
    obj_exp = []
    obj_sml = []
    success_list = []
    obj_num = 0

    # Iterate over every molecule inside the SDFile
    for mol in suppl:

        # Do not try to process molecules not recognised by RDKit.
        # They will be removed at the pre-normalization step, which is
        # compulsory for every molecule
        if mol is None:
            LOG.error(
                f'(@extractInformation) Unable to process molecule'
                f' #{obj_num+1} in file {ifile}')
            # success_list.append(False)
            continue

        # extract the molecule name, using a sdfileutils algorithm
        name = sdfu.getName(
            mol, count=obj_num, field=self.parameters['SDFile_name'],
            suppl=suppl)

        # extract biological information (activity), which is used as the
        # dependent variable for model training and is provided as a
        # prediction for new compounds
        bio = None
        if self.parameters['SDFile_activity'] is not None:
            bio = utils.get_sdf_value(mol,
                                      self.parameters['SDFile_activity'])

        # extract experimental information, if any.
        # note that experimental information is used only in prediction,
        # as a value which overrides any model-predicted value
        exp = None
        if self.parameters['SDFile_experimental'] is not None:
            exp = utils.get_sdf_value(
                mol, self.parameters['SDFile_experimental'])

        # generate a SMILES
        sml = None
        try:
            sml = Chem.MolToSmiles(mol)
        except Exception as e:
            LOG.error('while converting mol to smiles'
                      f' an exception has occurred: {e}')

        # assign the information extracted from the SDFile to the
        # corresponding lists
        obj_nam.append(name)
        obj_bio.append(bio)
        obj_exp.append(exp)
        obj_sml.append(sml)
        success_list.append(True)
        obj_num += 1

    # Insert the values as lists in 'results' using a utility function
    utils.add_result(self.results, obj_num, 'obj_num', 'Num mol',
                     'method', 'single',
                     'Number of molecules present in the input file')
    utils.add_result(self.results, obj_nam, 'obj_nam', 'Mol name',
                     'label', 'objs',
                     'Name of the molecule, as present in the input file')
    utils.add_result(self.results, obj_sml, 'SMILES', 'SMILES',
                     'smiles', 'objs',
                     'Structure of the molecule in SMILES format')

    if not utils.is_empty(obj_bio):
        utils.add_result(
            self.results, np.array(obj_bio, dtype=np.float64),
            'ymatrix', 'Activity', 'decoration', 'objs',
            'Biological annotation to be predicted by the model')

    if not utils.is_empty(obj_exp):
        utils.add_result(
            self.results, np.array(obj_exp, dtype=np.float64),
            'experim', 'Experim.', 'decoration', 'objs',
            'Experimental annotation present in the input file')

    LOG.debug(f'processed {obj_num} molecules'
              f' from a supplier of {len(suppl)} without issues')
    return success_list
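# Minimal RDKit pattern assumed above: SDMolSupplier yields one Mol (or
# None) per record, and SD fields are exposed as molecule properties.
# File and field names below are placeholders:
#
#   from rdkit import Chem
#
#   suppl = Chem.SDMolSupplier('compounds.sdf')
#   for mol in suppl:
#       if mol is None:                     # unparsable record
#           continue
#       smiles = Chem.MolToSmiles(mol)
#       if mol.HasProp('activity'):         # 'activity' is an example field
#           value = mol.GetProp('activity')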
def _run_ext_data(self):
    ''' version of Run for inter-process input
    (calling another model to obtain input) '''

    # idata is a list of JSON from 1-n sources
    # the data usable for input must be listed in the ['meta']['main'] key

    # use the first JSON to load common info like obj_nam, etc.
    obj_common = ['label', 'decoration']

    # load object identifiers and decorators
    first_results = json.loads(self.idata[0])
    first_manifest = first_results['manifest']

    for item in first_manifest:
        if item['type'] in obj_common:
            item_key = item['key']
            self.results[item_key] = first_results[item_key]
            self.results['manifest'].append(item)

    # extract usable data from every source and add it to combined
    # np.arrays
    combined_md = None
    combined_cf = None
    combined_md_names = []
    combined_cf_names = []

    for ijson in self.idata:

        i_result = json.loads(ijson)
        i_manifest = i_result['manifest']
        i_meta = i_result['meta']

        for item in i_manifest:
            if item['type'] == 'result':
                item_key = item['key']

                if combined_md is None:  # for the first element just copy
                    combined_md = np.array(i_result[item_key],
                                           dtype=np.float64)
                    num_obj = len(i_result[item_key])
                else:  # append laterally
                    if len(i_result[item_key]) != num_obj:
                        self.results['error'] = ('incompatible size of'
                                                 ' results obtained from'
                                                 ' external sources')
                        return
                    combined_md = np.c_[combined_md,
                                        np.array(i_result[item_key],
                                                 dtype=np.float64)]

                combined_md_names.append(
                    item_key + ':' + i_meta['endpoint'] + ':' +
                    str(i_meta['version']))

            if item['type'] == 'confidence':
                item_key = item['key']

                if combined_cf is None:  # for the first element just copy
                    combined_cf = np.array(i_result[item_key],
                                           dtype=np.float64)
                else:  # append laterally
                    combined_cf = np.c_[combined_cf,
                                        np.array(i_result[item_key],
                                                 dtype=np.float64)]

                combined_cf_names.append(
                    item_key + ':' + i_meta['endpoint'] + ':' +
                    str(i_meta['version']))

    utils.add_result(self.results, combined_md, 'xmatrix', 'X matrix',
                     'results', 'objs',
                     'Combined output from external sources')
    utils.add_result(self.results, combined_cf, 'confidence', 'Confidence',
                     'confidence', 'objs',
                     'Combined confidence from external sources')
    utils.add_result(self.results, combined_md_names, 'var_nam',
                     'Var. names', 'method', 'vars',
                     'Variable names from external sources')
    utils.add_result(self.results, combined_cf_names, 'conf_nam',
                     'Conf. names', 'method', 'vars',
                     'Confidence indexes from external sources')
    return
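# np.c_ used above appends each source's result vector as a new column of
# the combined matrix. A minimal sketch with placeholder values:
#
#   import numpy as np
#
#   a = np.array([1.0, 2.0, 3.0])           # values from source 1
#   b = np.array([0.9, 2.1, 2.8])           # values from source 2
#   combined = np.c_[a, b]                  # shape (3, 2): one column per source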
def _run_data(self):
    ''' version of Run for data input (TSV tabular format) '''

    if not os.path.isfile(self.ifile):
        self.results['error'] = '{} not found'.format(self.ifile)
        return

    # Reading TSV by hand
    with open(self.ifile, 'r') as fi:
        var_nam = []
        obj_nam = []
        smiles = []
        xmatrix = None

        for index, line in enumerate(fi):

            # we assume that the first row contains var names
            if index == 0 and self.parameters['TSV_varnames']:
                var_nam = line.strip().split('\t')
                var_nam = var_nam[1:]
            else:
                value_list = line.strip().split('\t')

                if self.parameters['TSV_objnames']:
                    # we assume that the first column contains object
                    # names
                    obj_nam.append(value_list[0])
                    value_list = value_list[1:]

                if 'SMILES' in var_nam:
                    col = var_nam.index('SMILES')
                    smiles.append(value_list[col])
                    del value_list[col]

                value_array = np.array(value_list, dtype=np.float64)
                if xmatrix is None:
                    # for the first data row, just copy the value list to
                    # the xmatrix
                    xmatrix = value_array
                else:
                    xmatrix = np.vstack((xmatrix, value_array))

        obj_num = index + 1
        if self.parameters['TSV_varnames']:
            obj_num -= 1  # the first row contains var names, not an object

    LOG.debug('loaded TSV with shape {} '.format(xmatrix.shape))

    # extract any column named as "TSV_activity" as the ymatrix
    activity_param = self.parameters['TSV_activity']
    LOG.debug('creating ymatrix from column {}'.format(activity_param))
    if activity_param in var_nam:
        col = var_nam.index(activity_param)
        ymatrix = xmatrix[:, col]
        xmatrix = np.delete(xmatrix, col, 1)
        utils.add_result(
            self.results, ymatrix, 'ymatrix', 'Activity', 'decoration',
            'objs', 'Biological annotation to be predicted by the model')

    utils.add_result(self.results, obj_num, 'obj_num', 'Num mol',
                     'method', 'single',
                     'Number of molecules present in the input file')
    utils.add_result(self.results, xmatrix, 'xmatrix', 'X matrix',
                     'method', 'vars', 'Molecular descriptors')

    if self.parameters['TSV_varnames']:
        utils.add_result(self.results, var_nam, 'var_nam', 'Var names',
                         'method', 'vars', 'Names of the X variables')

    if not self.parameters['TSV_objnames']:
        # generate zero-padded sequential object names
        for i in range(obj_num):
            obj_nam.append('obj%.10d' % i)

    utils.add_result(self.results, obj_nam, 'obj_nam', 'Mol name',
                     'label', 'objs',
                     'Name of the molecule, as present in the input file')

    if len(smiles) > 0:
        utils.add_result(self.results, smiles, 'SMILES', 'SMILES',
                         'smiles', 'objs',
                         'Structure of the molecule in SMILES format')
    return
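# Expected input layout for this parser (tab-separated; content is
# illustrative):
#
#   name    SMILES    MW      logP    activity
#   mol1    CCO       46.07   -0.31   0
#   mol2    c1ccccc1  78.11   2.13    1
#
# With TSV_varnames and TSV_objnames set, the first row provides var_nam,
# the first column provides obj_nam, the SMILES column is split off, and
# the column matching TSV_activity (here 'activity') becomes the ymatrix.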
def run_internal(self):
    ''' Builds a model using the internally defined machine learning
    tools.

    All input parameters are extracted from self.parameters.

    The main output is an instance of basemodel saved in the model folder
    as a pickle (model.pkl) and used for prediction.

    The results of building and validation are added to results, but also
    saved to the model folder as a pickle (info.pkl) for being displayed
    in manage tools.
    '''

    # expand with new methods here:
    registered_methods = [('RF', RF),
                          ('SVM', SVM),
                          ('GNB', GNB),
                          ('PLSR', PLSR),
                          ('PLSDA', PLSDA), ]

    # instantiate an appropriate child of base_model
    model = None
    for imethod in registered_methods:
        if imethod[0] == self.parameters['model']:
            model = imethod[1](self.X, self.Y, self.parameters)
            LOG.debug('Recognized learner: '
                      f"{self.parameters['model']}")
            break

    if not model:
        self.results['error'] = 'modeling method not recognised'
        LOG.error(f'Modeling method {self.parameters["model"]}'
                  ' not recognized')
        return

    # build model
    LOG.info('Starting model building')
    success, model_building_results = model.build()
    if not success:
        self.results['error'] = model_building_results
        return

    utils.add_result(self.results, model_building_results,
                     'model_build_info', 'model building information',
                     'method', 'single', 'Information about the model')
    # self.results['model_build'] = results

    # validate model
    LOG.info('Starting model validation')
    success, model_validation_results = model.validate()
    if not success:
        self.results['error'] = model_validation_results
        return

    # model_validation_results is a tuple which contains
    # model_validation_info and (optionally) Y_adj and Y_pred, depending
    # on the model type
    utils.add_result(self.results, model_validation_results[0],
                     'model_valid_info', 'model validation information',
                     'method', 'single',
                     'Information about the model validation')
    if len(model_validation_results) > 1:
        utils.add_result(self.results, model_validation_results[1],
                         'Y_adj', 'Y fitted', 'result', 'objs',
                         'Y values of the training series fitted by the model')
    if len(model_validation_results) > 2:
        utils.add_result(self.results, model_validation_results[2],
                         'Y_pred', 'Y predicted', 'result', 'objs',
                         'Y values of the training series predicted by the model')

    # TODO: compute AD (when applicable)

    LOG.info('Model finished successfully')

    # save model
    model_pkl_path = os.path.join(self.parameters['model_path'],
                                  'model.pkl')
    with open(model_pkl_path, 'wb') as handle:
        pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
    LOG.debug('Model saved as: {}'.format(model_pkl_path))

    return
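# The saved estimator can later be restored for prediction with the
# standard pickle idiom (model_path below stands for
# self.parameters['model_path'] used above):
#
#   import os
#   import pickle
#
#   with open(os.path.join(model_path, 'model.pkl'), 'rb') as handle:
#       model = pickle.load(handle)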