def process_each_frequency_keras(model_dirname, stft, frequency):
    '''
    Run the saved per-frequency model over one frequency slice of `stft`, in place.

    The pipeline stored at `<model_dirname>/k_<frequency>/<MODEL_SAVE_FNAME>` is
    loaded, fed the normalised rows of `stft[:, :, frequency]`, and its
    prediction, rescaled by the same per-row norms, is written back into that
    slice.

    :param model_dirname: directory containing one `k_<frequency>` sub-directory
        per saved model
    :param stft: array indexed as `stft[:, :, frequency]`; only that slice is
        overwritten, and only after the prediction succeeded
    :param frequency: index along the last axis of `stft`; also used to build
        the model directory name
    :return: None (the result is stored in `stft`)
    '''
    # 1. Load the model pipeline saved for this frequency.
    model_save_fpath = os_path_join(
        model_dirname, 'k_' + str(frequency), MODEL_SAVE_FNAME)
    loaded_model_pipeline = joblib_load(model_save_fpath)

    # 2. Build X_test from the requested slice.
    LOGGER.debug('r3.process_each_frequency_keras: stft.shape = {}'.format(
        stft.shape))
    aperture_data = stft[:, :, frequency]

    # 2.1. Normalise each row by its infinity norm (ord=np_inf); this is not
    # the L1 norm.
    aperture_data_norm = np_linalg_norm(aperture_data, ord=np_inf, axis=1)
    # An all-zero row has norm 0; dividing by it would feed NaN to the model.
    # Divide such rows by 1 instead so they stay all-zero.
    safe_norm = aperture_data_norm.copy()
    safe_norm[safe_norm == 0] = 1
    # Out-of-place division: `aperture_data` is a view of `stft`, and `stft`
    # should not be left half-normalised if the prediction below raises.
    X_test = aperture_data / safe_norm[:, np_newaxis]

    # 3. Predict.
    y_hat = loaded_model_pipeline.predict(X_test)

    # 4. Undo the normalisation and store the new data in stft. The true norm
    # is used here, so rows that were all-zero are stored as zero again.
    stft[:, :, frequency] = y_hat * aperture_data_norm[:, np_newaxis]
def __init__(self, stepName, isHomoComplex, savedModelsPath=None, averageLRscores=False):
    '''
    Initialise the configuration and load the saved model for one step.

    The models directory is narrowed to its homo- or hetero-complex
    sub-directory, and every file there whose name ends with stepName is
    loaded in listing order; the last one loaded is kept in self.model.

    :param stepName: str. Must startswith seq_train or struct or mixed (seq_train, mixed_2,
                     structX, seq_train1... are also valid)
    :param isHomoComplex: boolean. Whether the target is a homo-complex (True) or a
                          hetero-complex (False); selects the models sub-directory
    :param savedModelsPath: str. A path to the directory where models have been saved. If None,
                            the path already set by Configuration is used
    :param averageLRscores: True if Ligand and receptor are the same protein and thus, binding
                            site prediction should be averaged
    :raises AssertionError: if no file in the models sub-directory ends with stepName
    '''
    Configuration.__init__(self)
    self.isHomoComplex = isHomoComplex
    self.stepName = stepName
    self.averageLRscores = averageLRscores
    if savedModelsPath is not None:
        self.savedModelsPath = savedModelsPath
    self.model = None
    print(stepName)

    # Models are stored in one sub-directory per kind of complex.
    complexKind = "h**o" if self.isHomoComplex else "hetero"
    self.savedModelsPath = os.path.join(self.savedModelsPath, complexKind)

    candidateNames = [name for name in os.listdir(self.savedModelsPath)
                      if name.endswith(stepName)]
    for name in candidateNames:
        print("Loading model %s %s" % (complexKind, name))
        self.model = joblib_load(os.path.join(self.savedModelsPath, name))

    assert self.model is not None, "Error, there is no valid model in %s for step %s" % (
        self.savedModelsPath, self.stepName)
def load(self, dirname):
    '''
    Restore the state saved under `dirname` and mark the object as fitted.

    Reads, in order:
      * `<dirname>/tree` via load_tree, which provides tree, node_to_class,
        node_to_classes and class_maps;
      * `<dirname>/models/models_fnames.yaml`, a mapping of node id to file
        name, each file being loaded with load_model;
      * `<dirname>/encoders/encoders_fnames.yaml`, a mapping of node id to
        file name, each file being loaded with joblib_load.

    :param dirname: directory containing `tree`, `models/` and `encoders/`
    :return: None; sets self.tree, self.node_to_class, self.node_to_classes,
        self.class_maps, self.models, self.encoders and self._fitted
    '''
    (self.tree, self.node_to_class, self.node_to_classes,
     self.class_maps) = load_tree(join(dirname, 'tree'))

    def load_indexed(subdir, index_fname, loader):
        # Read the YAML index (node id -> file name) found in `subdir` and
        # load every listed file with `loader`.
        base_dirname = join(dirname, subdir)
        with open(join(base_dirname, index_fname), 'r', encoding='utf-8') as file:
            # An empty index file may parse to None; treat it as "no entries".
            index = yaml_load(file) or {}
        return {node_id: loader(join(base_dirname, fname))
                for node_id, fname in index.items()}

    self.models = load_indexed('models', 'models_fnames.yaml', load_model)
    self.encoders = load_indexed('encoders', 'encoders_fnames.yaml', joblib_load)
    self._fitted = True
def loadModel(cls, model, dependencies_filename, joblib=False):
    '''
    Deserialize a stored model together with its JSON dependency description.

    :param model: path of the serialized model file
    :param dependencies_filename: path of the JSON file describing the
        dependencies, or an already-open readable file object holding that
        JSON document (the form the previous implementation required)
    :param joblib: if True the model is read with joblib_load, otherwise with
        `load` applied to the file opened in binary mode
    :return: tuple (loadedmodel, dependencies)
    '''
    if hasattr(dependencies_filename, 'read'):
        # Caller handed over an open file object; it stays theirs to close.
        dependencies = json.load(dependencies_filename)
    else:
        with open(dependencies_filename, 'rb') as dependencies_file:
            dependencies = json.load(dependencies_file)

    if joblib:
        # deserialize by using library joblib
        loadedmodel = joblib_load(model)
    else:
        # deserialize standard Python objects
        # NOTE(review): `load` is presumably pickle.load; unpickling can run
        # arbitrary code, so only use this on trusted model files.
        with open(model, 'rb') as model_file:
            loadedmodel = load(model_file)
    return loadedmodel, dependencies
def getDataForTestFromPrefix(testPrefix, testPath):
    '''
    Load the data file that findFullTestPPIName resolves for testPrefix inside testPath and
    return everything needed to perform predictions and testing, optionally sub-sampling
    the negative pairs.

    Sub-sampling only happens when SAMPLE_TRAIN_EXAMPLES is truthy and the first four
    characters of testPrefix are lower case; at most MAX_SAMPLING_PAIRS negative pairs are kept.

    :param testPrefix: str. The prefix of the filename to be loaded. E.g. "1A2K"
    :param testPath: str. The path where data files are contained
    :return isSeqStruct, (data_d, data_t, labels, ids)
          isSeqStruct: bool. True if the loaded object is a ComplexSeqStructCodified
          data_d: np.array (n,m). A np.array that can be feed to the classifier. Each row
                  represents a pair of amino acids in direct form (first ligand aa second
                  receptor aa). May be None
          data_t: np.array (n,m). A np.array that can be feed to the classifier. Each row
                  represents a pair of amino acids in transpose form (first receptor aa
                  second ligand aa). May be None
          labels: np.array which contains the labels (-1, 1 ) of each row (pair of amino acids)
          ids: pandas.DataFrame whose columns are:
                  chainIdL resIdL resNameL chainIdR resIdR resNameR categ
    '''
    # NOTE(review): the indentation below was reconstructed from a source whose line
    # structure was lost; confirm which statements belong to each `if` against the original.
    ppiComplex = joblib_load( findFullTestPPIName(testPrefix, testPath))
    isSeqStruct = isinstance(ppiComplex, ComplexSeqStructCodified)
    data_d, data_t = ppiComplex.getData()
    labels= ppiComplex.getLabels()
    ids= ppiComplex.getIds()
    if SAMPLE_TRAIN_EXAMPLES and testPrefix[:4].islower():
        gc.collect()
        # Split row indices into positives (label > 0) and everything else.
        condition= labels>0
        posLabelsIdx= np.where(condition)[0]
        negIdexIdx= np.where(~ condition)[0]
        nToSample= min(MAX_SAMPLING_PAIRS, len(negIdexIdx) )
        if nToSample==MAX_SAMPLING_PAIRS:
            print("Random sampling for %s"%(testPrefix))
        # Seed from the prefix (text before "@" and before "#s") to ensure that all complexes
        # with the same prefix are equally sampled. Required for results average.
        # NOTE(review): str hashes are salted per process on Python 3 unless PYTHONHASHSEED
        # is fixed; confirm the seed is actually reproducible in the target runtime.
        random_state = abs(hash(testPrefix.split("@")[0].split("#s")[0]))
        # NOTE(review): `//` binds tighter than `-`, so this is (random_state // 2**32) - 1,
        # which is -1 whenever the hash is below 2**32; confirm a negative seed cannot occur
        # or is acceptable to random.seed / np.random.seed.
        random_state = random_state // 2 ** 32 - 1
        random.seed(random_state)
        np.random.seed(random_state)
        # Keep nToSample negatives (without repetition) plus every positive.
        negIdexIdx= np.random.choice(negIdexIdx, size= nToSample, replace=False)
        selectIdxs= np.concatenate([posLabelsIdx, negIdexIdx])
        # Drop indices that are NaN or whose label is NaN.
        selectIdxs= np.array( [ int(l) for l in selectIdxs if not np.isnan(l) and not np.isnan(labels[l]) ], dtype=np.int32)
        assert len(selectIdxs)>0, "Error, empty selectIdx"+str(testPrefix)+"\n"+str(ids.head())
        data_d= data_d[selectIdxs,:] if data_d is not None else None
        data_t= data_t[selectIdxs,:] if data_t is not None else None
        labels= labels[selectIdxs]
        ids= ids.iloc[selectIdxs, :].reset_index()
        gc.collect()
        # Re-seed both generators with None so later code is not tied to the seed used above.
        random.seed(None)
        np.random.seed(None)
    return isSeqStruct, (data_d, data_t, labels, ids)
def loadPrefixFilesIterator(prefix, filesPath):
    '''
    Lazily load every data file in filesPath whose name, up to its first ".", equals prefix.
    Files are visited in sorted name order and each one is loaded only when requested.

    :param prefix: str. The part of the filename before the first dot. E.g. "1A2K"
    :param filesPath: str. The path where data files are contained
    @yields complex_data: the object stored in each matching file
                          (codifyComplexes.ComplexCodified.ComplexCodified class)
    '''
    matchingNames = (name for name in sorted(os.listdir(filesPath))
                     if name.partition(".")[0] == prefix)
    for name in matchingNames:
        yield joblib_load(os.path.join(filesPath, name))
def loadPrefixFile(prefix, filesPath):
    '''
    Load the single data file in filesPath whose name, up to its first ".", equals prefix.

    The number of matching files is checked before anything is deserialized, so an
    ambiguous or missing prefix fails fast instead of loading every candidate first.

    :param prefix: str. The part of the filename before the first dot. E.g. "1A2K"
    :param filesPath: str. The path where data files are contained
    :return complex_data: the object stored in the matching file
                          (codifyComplexes.ComplexCodified.ComplexCodified class)
    :raises AssertionError: if zero or more than one file matches prefix
    '''
    matchingNames = [fname for fname in sorted(os.listdir(filesPath))
                     if fname.split(".")[0] == prefix]
    if len(matchingNames) != 1:
        # Raised explicitly (same exception type as the former bare assert) so the
        # check also holds under `python -O` and carries a diagnostic message.
        raise AssertionError(
            "Expected exactly one file with prefix %s in %s, found %d: %s" % (
                prefix, filesPath, len(matchingNames), matchingNames))
    return joblib_load(os.path.join(filesPath, matchingNames[0]))
def getDataForTestFromPrefix(testPrefix, testPath):
    '''
    Load the first file (in sorted name order) of testPath whose name starts with testPrefix
    and return all data needed to perform predictions and testing. Returns None when no
    file name starts with testPrefix.

    :param testPrefix: str. The prefix of the filename to be loaded. E.g. "1A2K"
    :param testPath: str. The path where data files are contained
    :return (data_d, data_t, ppiComplex.getLabels(), ppiComplex.getIds())
          data_d: np.array (n,m). A np.array that can be feed to the classifier. Each row
                  represents a pair of amino acids in direct form (first ligand aa second
                  receptor aa)
          data_t: np.array (n,m). A np.array that can be feed to the classifier. Each row
                  represents a pair of amino acids in transpose form (first receptor aa
                  second ligand aa)
          ppiComplex.getLabels(): np.array which contains the labels (-1, 1 ) of each row
                  (pair of amino acids)
          ppiComplex.getIds(): pandas.DataFrame whose columns are:
                  chainIdL resIdL resNameL chainIdR resIdR resNameR categ
    '''
    candidates = [name for name in sorted(os.listdir(testPath))
                  if name.startswith(testPrefix)]
    if not candidates:
        return None
    ppiComplex = joblib_load(os.path.join(testPath, candidates[0]))
    directData, transposeData = ppiComplex.getData()
    return (directData, transposeData, ppiComplex.getLabels(), ppiComplex.getIds())
def deserialize(filename, format=DEFAULT_FORMAT):
    '''
    Read back an object stored in `filename` according to the `format` bit flags.

    JOBLIB_FORMAT takes precedence and delegates to joblib_load. Otherwise the
    file is opened in binary mode (through bz2.BZ2File when BZIP2_FORMAT is
    set) and decoded as pickle (PICKLE_FORMAT) or YAML (YAML_FORMAT), pickle
    being checked first.

    NOTE: pickle and the full YAML loader can execute arbitrary code while
    loading; only deserialize files from a trusted source.

    :param filename: path of the file to read
    :param format: bitwise combination of the *_FORMAT flags
    :return: the deserialized object
    :raises RuntimeError: if the file does not exist, or the library needed
        for the requested format is not available
    :raises ValueError: if `format` selects neither joblib, pickle nor YAML
    '''
    if not os.path.exists(filename):
        raise RuntimeError('File %s does not exist' % filename)

    if format & JOBLIB_FORMAT:
        if not has_joblib:
            raise RuntimeError(
                'Missing library. Format (JOBLIB_FORMAT) not available.')
        return joblib_load(filename)

    open_fn = bz2.BZ2File if format & BZIP2_FORMAT else open
    with open_fn(filename, 'rb') as f:
        if format & PICKLE_FORMAT:
            return pickle.load(f)
        if format & YAML_FORMAT:
            if not has_yaml:
                raise RuntimeError(
                    'Missing library. Format (YAML_FORMAT) not available.')
            # yaml.load() without a Loader is deprecated since PyYAML 5.1 and
            # a TypeError on PyYAML 6. yaml.Loader is what the bare call used
            # before the deprecation, so previously written files keep loading.
            return yaml.load(f, Loader=yaml.Loader)
        raise ValueError('Unknown format value.')
def __init__(self, stepName, savedModelsPath=None):
    '''
    Initialise the configuration and load the saved model for one step.

    Every file of the models directory whose name ends with stepName is loaded in
    listing order; the last one loaded is kept in self.model.

    @param stepName: str. Must startswith seq_train or struct or mixed (seq_train, mixed_2,
                     structX, seq_train1... are also valid)
    @param savedModelsPath: str. A path to the directory where models have been saved. If None,
                            the path already set by Configuration is used
    @raise AssertionError: if no file in the models directory ends with stepName
    '''
    Configuration.__init__(self)
    self.stepName = stepName
    if savedModelsPath is not None:
        self.savedModelsPath = savedModelsPath
    self.model = None
    print(stepName)

    candidateNames = [name for name in os.listdir(self.savedModelsPath)
                      if name.endswith(stepName)]
    for name in candidateNames:
        print("Loading model %s" % (name))
        self.model = joblib_load(os.path.join(self.savedModelsPath, name))

    assert self.model is not None, "Error, there is no valid model in %s for step %s" % (
        self.savedModelsPath, self.stepName)
def trainAndTestOneFold(trainData, testPrefixes, trainSubsetN, testPath, outputPath, verbose=False, ncpu=1):
    '''
    Trains and tests one fold: obtains a classifier (loaded from a temporary cache file or
    trained on trainData), predicts every test complex that has no stored result, loads the
    results that were already stored, evaluates all of them and removes the cache file.

    :param trainData: a numpy array for training with first column labels and the others are features
    :param testPrefixes: str[]. A list that contains prefixes for all complexes to be tested
    :param trainSubsetN: int Tuple. The numerical ids of the training split.
    :param testPath: str. Path to a dir where testing data files are stored
    :param outputPath: str. Path to a dir where predictions will be stored. None if results will not be saved
    :param verbose: boolean. Whether or not print to stdout info
    :param ncpu: int. Number of cpu's to use in parallel
    :return (finalResults, modelo)
          finalResults: the second element of every (testPrefix, resultObj) pair that was
                        predicted or loaded, or [] if there was none
          modelo: the classifier that was loaded or trained, or None if neither happened
    '''
    # NOTE(review): the indentation below was reconstructed from a source whose line
    # structure was lost; confirm the extent of each `if` body against the original.
    testPrefixesNotEvaluated = []
    originalTestPrefixToNewPrefix, __ = getOriginalToActualPrefixs(testPrefixes)
    alreadyComputedPrefixes_and_outnames= []
    # Split the prefixes into those with a stored result file and those still to predict.
    for testPrefix in originalTestPrefixToNewPrefix:
        if outputPath is not None:
            outName = getResultsOutname(outputPath, testPrefix, trainSubsetN)
            # NOTE(review): a stored result is only recognised when verbose is True; with
            # verbose False every prefix is predicted again. Confirm this is intended.
            if verbose and os.path.isfile(outName):
                print("Complex already computed: %s" % (outName))
                alreadyComputedPrefixes_and_outnames.append( (testPrefix, outName) )
            else:
                testPrefixesNotEvaluated.append((testPrefix, outName))
        else:
            testPrefixesNotEvaluated.append((testPrefix, None))
    modelo = None
    from Config import Configuration
    conf = Configuration()
    # Cache file for the classifier, named after the sorted test prefixes and the split id.
    # NOTE(review): hashlib.md5 receives a str here, which only works on Python 2; Python 3
    # requires bytes. Confirm the target interpreter.
    modelFname= os.path.join(conf.tmp, hashlib.md5("".join(sorted(testPrefixes))).hexdigest()+str(trainSubsetN)+"bipspi2.pckl")
    resultsForEvaluation_list=[]
    # A classifier is also obtained when testPrefixes is empty, i.e. when nothing is tested.
    if len(testPrefixesNotEvaluated) > 0 or len(testPrefixes) == 0:
        if verbose:
            print("Testing:", [ x[0] for x in testPrefixesNotEvaluated])
            verboseLevel = 1
        else:
            verboseLevel = 0
        if os.path.exists(modelFname):
            print("Loading classifier")
            modelo= joblib_load(modelFname)
        else:
            print("Training classifier")
            # First column of trainData holds the labels, the remaining columns the features.
            modelo = trainMethod(trainData[:, 1:], trainData[:, 0], verboseLevel=verboseLevel, ncpu=ncpu)
            joblib_save(modelo, modelFname)
        # Drop the local reference to the training data before the prediction jobs start.
        del trainData
        gc.collect()
        if verbose:
            print("Classifier fitted.")
        # Bound the number of parallel jobs by ncpu, free memory per expected complex size,
        # and the number of complexes to predict (never below 1).
        expectedSize= estimateRequiredMemoryPerComplex(testPrefixesNotEvaluated, testPath)
        freeMem= checkFreeMemory()
        nJobs= int(max(1, min(ncpu, freeMem/expectedSize, len(testPrefixesNotEvaluated))))
        print("Free memory for predictOnePrefix: %s GB. Njobs: %s (%s expected size)"%(freeMem, nJobs, expectedSize))
        resultsForEvaluation_list= Parallel(n_jobs=nJobs)(delayed(predictOnePrefix)(originalTestPrefixToNewPrefix[testPrefix], modelo, outName, testPath) for testPrefix, outName in testPrefixesNotEvaluated )
        gc.collect()
    # Load the results that were already stored on disk, with the same job-count bound.
    # NOTE(review): this runs even when alreadyComputedPrefixes_and_outnames is empty;
    # confirm estimateRequiredMemoryPerComplex cannot return 0 in that case.
    expectedSize= estimateRequiredMemoryPerComplex(alreadyComputedPrefixes_and_outnames, testPath)
    freeMem= checkFreeMemory()
    nJobs= int(max(1, min(ncpu, freeMem/expectedSize, len(alreadyComputedPrefixes_and_outnames))))
    resultsForEvaluation_list+= Parallel(n_jobs=nJobs)(delayed(loadExistingResults)( testPrefix, outName,) for testPrefix, outName in alreadyComputedPrefixes_and_outnames )
    if len(resultsForEvaluation_list)>0:
        # Job count for evaluation: free memory divided by the used memory per result held.
        freeMem = checkFreeMemory()
        totMem= getTotalMemory()
        usedMem= totMem-freeMem
        nJobs = int(max(1, min(ncpu, freeMem / (usedMem/(1+len(resultsForEvaluation_list))))))
        print("Free memory for evaluateOneResultObj: %s GB. Njobs: %s" % (freeMem, nJobs))
        Parallel(n_jobs=nJobs)(delayed(evaluateOneResultObj)(testPrefix, resultObj, False) for testPrefix, resultObj in resultsForEvaluation_list)
        # Keep only the result objects (second element of each pair).
        # NOTE(review): indexing the value returned by zip() only works on Python 2, where
        # zip returns a list; confirm the target interpreter.
        finalResults= zip(*resultsForEvaluation_list)[1]
    else:
        finalResults=[]
    del resultsForEvaluation_list
    # The cached classifier file is only meant to live for the duration of this call.
    tryToRemove(modelFname)
    return finalResults, modelo
def load_model_pipeline():
    '''
    Load and return the object stored at "model/model_v1.joblib".

    The path is relative, so it is resolved against the current working directory.
    '''
    model_path = "model/model_v1.joblib"
    pipeline = joblib_load(model_path)
    return pipeline