def get_blob_ids_of_model_generic_files(model_blob_id):
    """
    Collect blob ids of all generic files belonging to the model
    whose blob id is given.

    :param model_blob_id: blob id of the trained model file
    :return: list of generic files blob ids
    """
    oauth = get_oauth()
    fetch_token(oauth)
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()

    if 'ModelInfo' in file_info['metadata'].keys():
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata'].keys():
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')

    model_id = json.loads(file_info['metadata'][info_key])['ModelId']

    # the temporary file named after the model id holds one record per line
    with open('{}/{}'.format(TEMP_FOLDER, model_id), 'r') as file_to_read:
        lines = file_to_read.readlines()

    generic_files_ids = []
    for line in lines:
        generic_file_ids = json.loads(line.replace('\'', '"'))
        if generic_file_ids['parent_id'] != model_id:
            continue
        generic_files_ids.append(generic_file_ids['blob_id'])

    return generic_files_ids
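# Illustrative sketch (not part of the original module): judging from the
# parsing above, each line of the temporary file is a dict-like record written
# with single quotes and containing 'parent_id' and 'blob_id'. With the made-up
# lines below, get_blob_ids_of_model_generic_files would return
# ['blob-1', 'blob-3'] for a model whose ModelId is 'model-42':
#
#     {'parent_id': 'model-42', 'blob_id': 'blob-1'}
#     {'parent_id': 'other-model', 'blob_id': 'blob-2'}
#     {'parent_id': 'model-42', 'blob_id': 'blob-3'}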
def preload_ssp_models():
    """
    Preload all SSP models (models having an SSP target) into memory,
    to speed up predictions later.
    Preloaded SSP models are added to MODELS_IN_MEMORY_CACHE in the
    general_helper.py module.
    """
    oauth = get_oauth()
    # set default GET request headers and parameters
    headers = {'Accept': 'application/json'}
    params = (
        ('$filter', 'Targets eq \'SSP\''),
        ('PageNumber', '1'),
        ('PageSize', '100'),
    )
    # GET all SSP models information from the OSDR web API
    response = requests.get(
        API_MODELS_ENTITIES_URL, headers=headers, params=params, verify=False)

    # preload every SSP model
    for model_data in response.json():
        fetch_token(oauth)

        # get model blob id and bucket from the response
        model_blob_id = model_data['blob']['id']
        model_bucket = model_data['blob']['bucket']

        # get model info
        blob_model_info = get_model_info(oauth, model_blob_id, model_bucket)
        blob_model_info['ModelBlobId'] = model_blob_id
        blob_model_info['ModelBucket'] = model_bucket

        # prepare parameters used for prediction
        prediction_parameters = dict()
        prepare_prediction_files(oauth, prediction_parameters, blob_model_info)

        # add model to cache if it is not already cached
        if model_blob_id not in MODELS_IN_MEMORY_CACHE.keys():
            try:
                MODELS_IN_MEMORY_CACHE[model_blob_id] = cache_model(
                    prediction_parameters['ModelsFolder'])
            except FileNotFoundError:
                LOGGER.error(
                    'Cannot preload SSP model with blob id: {}'.format(
                        model_blob_id))
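# Illustrative sketch (an assumption, not code from this module): the cache
# filled above is keyed by model blob id, so a later prediction path could
# check it before re-downloading and unpacking the model files, along the
# lines of:
#
#     cached_models = MODELS_IN_MEMORY_CACHE.get(model_blob_id)
#     if cached_models is None:
#         # fall back to fetching the model files and caching them
#         prepare_prediction_files(oauth, prediction_parameters, blob_model_info)
#         cached_models = cache_model(prediction_parameters['ModelsFolder'])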
def classic_classification_train_logistic_regression(body):
    """
    Callback function executed when modelling succeeds.

    :param body: MassTransit message
    """
    global CLASSIC_CLASSIFICATION_LOGISTIC_TRAINED_FLAG
    global CLASSIC_CLASSIFICATION_FILES_BLOB_IDS
    global LOGISTIC_REGRESSION_MODEL_BLOB_ID

    oauth = get_oauth()
    fetch_token(oauth)

    list_of_ids = get_blob_ids_of_model_generic_files(body['BlobId'])
    CLASSIC_CLASSIFICATION_FILES_BLOB_IDS.extend(list_of_ids)
    LOGISTIC_REGRESSION_MODEL_BLOB_ID = body['BlobId']
    CLASSIC_CLASSIFICATION_LOGISTIC_TRAINED_FLAG = True

    return None
def setUp(self):
    """
    Set up everything needed before the tests start.
    """
    os.environ['OAUTHLIB_INSECURE_TRANSPORT'] = '1'
    # initialise values for the flask FAR application test client
    # and store them in internal TestCase variables
    self.blob_version_url = '{}/version'.format(
        os.environ['OSDR_BLOB_SERVICE_URL'])
    self.temp_folder = os.environ['OSDR_TEMP_FILES_FOLDER']
    self.test_file_name = 'ML_test.txt'
    self.oauth = get_oauth()
    self.test_file = 'DNN_data_solubility.sdf'
    self.test_file_cif = 'test.cif'
    self.parent_id = 'c1cc0000-5d8b-0015-d72b-08d52f3ea2a9'
    self.user_id = '8d76f88c-fc99-45ca-8951-74d3a5fda263'
    self.source_blob_id = get_blob_id(self)
    self.model_bucket = os.environ['OSDR_ML_MODELER_CLIENT_ID']
def classic_classification_train_naive_bayes(body):
    """
    Callback function executed when modelling succeeds.

    :param body: MassTransit message
    """
    global CLASSIC_CLASSIFICATION_NAIVE_TRAINED_FLAG
    global NAIVE_BAYES_MODEL_BLOB_ID
    global CLASSIC_CLASSIFICATION_MODEL_BUCKET
    global CLASSIC_CLASSIFICATION_FILES_BLOB_IDS

    oauth = get_oauth()
    fetch_token(oauth)

    list_of_ids = get_blob_ids_of_model_generic_files(body['BlobId'])
    NAIVE_BAYES_MODEL_BLOB_ID = body['BlobId']
    CLASSIC_CLASSIFICATION_MODEL_BUCKET = body['Bucket']
    CLASSIC_CLASSIFICATION_FILES_BLOB_IDS.extend(list_of_ids)
    CLASSIC_CLASSIFICATION_NAIVE_TRAINED_FLAG = True

    return None
def callback(body):
    """
    Pika callback function used by the single structure predictor.
    Makes a list of JSON prediction data, one entry per model.

    :param body: RabbitMQ MT message's body
    """
    # start total prediction time counter
    start_prediction_total = time()
    models_data = []
    oauth = get_oauth()

    # try to reformat molecule if "\\n" is used instead of "\n" in mol string
    if '\\n' in body['Structure']:
        body['Structure'] = body['Structure'].replace('\\n', '\n')

    # choose molecules converter function depending on input structure type
    molecules_converter_method = None
    if body['Format'] == 'MOL':
        molecules_converter_method = molecules_from_mol_strings
    elif body['Format'] == 'SMILES':
        molecules_converter_method = molecules_from_smiles

    # read molecules from input mol string or SMILES
    molecules = None
    exception_message = None
    try:
        molecules = molecules_converter_method([body['Structure']])
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        exception_message = 'Get molecule from molstring exception'

    # get models ids, models blob ids and models buckets from input message
    fetch_token(oauth)
    models_ids, models_blob_ids, models_buckets = get_models_from_body_message(
        body)

    # make prediction for all models
    for model_id, model_blob_id, model_bucket in zip(models_ids,
                                                     models_blob_ids,
                                                     models_buckets):
        # start current prediction counter
        start_current_prediction = time()
        start_timer = time()

        # define exception message for current prediction
        if not exception_message:
            exception_message = None

        # initialize prediction parameters
        prediction_parameters = dict()
        prediction_parameters['DatasetFileName'] = 'Single Structure Predict'
        prediction_parameters['Molecules'] = molecules

        # get current model info
        fetch_token(oauth)
        blob_model_info = get_model_info(oauth, model_blob_id, model_bucket)
        blob_model_info['ModelBlobId'] = model_blob_id
        blob_model_info['ModelBucket'] = model_bucket
        blob_model_info['ModelCode'] = algorithm_code_by_name(
            blob_model_info['Method'])

        # add prediction data to json:
        # training parameters, property parameters and dataset parameters
        predicted_data = {
            'id': model_id,
            'trainingParameters': training_parameters(blob_model_info),
            'property': property_parameters(blob_model_info),
            'dataset': dataset(blob_model_info),
            'reportId': str(uuid.uuid1())
        }
        fetch_token(oauth)

        LOGGER.info('INITIAL PREPARING: {} sec'.format(time() - start_timer))

        try:
            # update prediction parameters
            prepare_prediction_parameters(oauth, prediction_parameters,
                                          blob_model_info)
        except:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Prepare prediction parameters exception'

        start_timer = time()
        try:
            # make prediction
            ml_predictor = MLPredictor(prediction_parameters)
            ml_predictor.make_prediction()
            prediction = ml_predictor.prediction
        except:
            # log error traceback
            logging_exception_message(LOGGER)
            if not exception_message:
                exception_message = 'Predictor exception'
        LOGGER.info('PREDICTION TIME: {} sec'.format(time() - start_timer))

        # add error to json if something went wrong
        if exception_message:
            predicted_data['error'] = error(exception_message)
        else:
            predicted_data['result'] = result(prediction)
            predicted_data['applicabilityDomain'] = applicability_domain(
                prediction)

        # stop current prediction counter
        current_prediction_time_seconds = time() - start_current_prediction
        predicted_data['predictionElapsedTime'] = int(
            current_prediction_time_seconds * 1000)

        models_data.append(predicted_data)

    # stop total predictions counter
    total_prediction_time_seconds = time() - start_prediction_total
    body['Data'] = {
        'predictionElapsedTime': int(total_prediction_time_seconds * 1000),
        'models': models_data
    }

    # add prediction consensus data to the outgoing message
    consensus_data = consensus(models_data)
    if consensus_data:
        body['Data'].update({'consensus': consensus_data})

    # send (publish) properties predicted message to OSDR
    prediction_created_event = single_structure_property_predicted(body)
    properties_predicted_publisher = PurePublisher(SINGLE_STRUCTURE_PREDICTED)
    properties_predicted_publisher.publish(prediction_created_event)

    return None
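# Illustrative sketch (structure taken from the code above, values are made
# up): the published message carries one entry per model in
# body['Data']['models'], roughly of the form
#
#     {
#         'predictionElapsedTime': 1234,           # total time, milliseconds
#         'models': [
#             {
#                 'id': '<model id>',
#                 'trainingParameters': {...},
#                 'property': {...},
#                 'dataset': {...},
#                 'reportId': '<uuid>',
#                 'result': {...},                 # or 'error' on failure
#                 'applicabilityDomain': {...},
#                 'predictionElapsedTime': 321     # per model, milliseconds
#             }
#         ],
#         'consensus': {...}                       # only when available
#     }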
def generate_training_report(body):
    """
    Pika callback function used by the training report generator.
    Makes plot files, a general metrics csv file and a report file on success.

    :param body: RabbitMQ MT message's body
    """
    oauth = get_oauth()
    fetch_token(oauth)

    # define variables used by the ml reporter
    model = body['Models'][0]
    model_blob_id = model['blobId']
    file_info = get_file_info_from_blob(oauth, model_blob_id).json()
    if 'ModelInfo' in file_info['metadata'].keys():
        info_key = 'ModelInfo'
    elif 'modelInfo' in file_info['metadata'].keys():
        info_key = 'modelInfo'
    else:
        raise KeyError('No model info')
    model_info = json.loads(file_info['metadata'][info_key])
    body['Bins'] = model_info['Bins']
    model_name = model_info['ModelName']
    model_type = model_info['ModelType']
    base_folder = '{}/general_training_report_{}'.format(
        TEMP_FOLDER, uuid.uuid1())
    make_directory(base_folder)

    LOGGER.info('MODEL INFO: {}'.format(model_info))
    LOGGER.info('MODEL NAME: {}'.format(model_name))
    LOGGER.info('MODEL TYPE: {}'.format(model_type))

    # generate general metrics dict
    all_models_metrics = []
    for model in body['Models']:
        # if something is wrong with the model file from blob storage
        if 'genericFiles' not in model.keys():
            raise TypeError('Empty model\'s generic files blob ids')

        for file_blob_id in model['genericFiles']:
            file_info = get_file_info_from_blob(oauth, file_blob_id).json()
            if 'fileInfo' in file_info['metadata'].keys():
                fileinfo_key = 'fileInfo'
            elif 'FileInfo' in file_info['metadata'].keys():
                fileinfo_key = 'FileInfo'
            else:
                raise KeyError('No fileInfo key in: {}'.format(
                    file_info['metadata'].keys()))
            file_info = json.loads(file_info['metadata'][fileinfo_key])

            if 'fileType' in file_info.keys():
                filetype_key = 'fileType'
            elif 'FileType' in file_info.keys():
                filetype_key = 'FileType'
            else:
                filetype_key = None

            if filetype_key and file_info[filetype_key] == TRAINING_CSV_METRICS:
                csv_blob_id = file_blob_id
                csv_model_metrics = get_file_from_blob(
                    csv_blob_id, oauth).content
                all_models_metrics.append(csv_model_metrics.decode())

            LOGGER.info('CURRENT MODEL INFO: {}'.format(file_info))

    LOGGER.info('ALL MODELS METRICS: {}'.format(all_models_metrics))

    # write general metrics data to a csv file
    csv_files_names = write_csvs_files(all_models_metrics)
    general_csv_dict = merge_csv_files(csv_files_names)
    rows = make_general_csv_rows(general_csv_dict)
    general_csv_file_path = write_rows_to_csv_file(rows, base_folder)
    metrics = html_metrics_from_dict(general_csv_dict)
    fetch_token(oauth)

    # make csv info for blob storage
    general_csv_info = {
        'FileInfo': json.dumps({
            'modelName': model_name,
            'fileType': ALL_MODELS_TRAINING_CSV_METRICS
        }),
        'SkipOsdrProcessing': 'true'
    }
    # make multipart object prepared for POSTing to blob storage,
    # including the csv file and file info
    multipart_general_csv = get_multipart_object(
        body, general_csv_file_path, 'text/csv',
        additional_fields=general_csv_info)
    # POST metrics csv file to blob storage
    post_data_to_blob(oauth, multipart_general_csv)

    # create general images
    body['NumberOfGenericFiles'] = 0
    path_to_radar_plot = None
    try:
        if model_type == CLASSIFIER:
            LOGGER.info('Creating radar_plot')
            nbits = body['Bins']
            path_to_radar_plot = radar_plot(
                general_csv_file_path, base_folder, nbits,
                titlename=model_name)
            # make radar plot multipart encoded object
            multipart_radar_plot = get_multipart_object(
                body, path_to_radar_plot, 'image/png',
                additional_fields={'correlationId': body['CorrelationId']})
            # send http POST request with the radar plot to blob storage
            post_data_to_blob(oauth, multipart_radar_plot)
            body['NumberOfGenericFiles'] += 1
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')

    optimizer_metrics = REDIS_CLIENT.get('optimizer_metrics_{}'.format(
        body['CorrelationId']))
    if optimizer_metrics:
        optimizer_metrics = html_optimal_metrics_from_dict(
            json.loads(optimizer_metrics), model_type)

    # add metrics and images to the pdf report file
    context = {
        'metrics': metrics,
        'radar_plots': [path_to_radar_plot],
        'optimizer': optimizer_metrics
    }
    pdf_path = make_pdf_report(base_folder, context, model_name='general')
    fetch_token(oauth)
    multipart_general_csv = get_multipart_object(
        body, pdf_path, 'application/pdf',
        additional_fields={'correlationId': body['CorrelationId']})
    post_data_to_blob(oauth, multipart_general_csv)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(base_folder, ignore_errors=True)

    report_generated = training_report_generated_message(body)
    model_report_generated_message_publisher = PurePublisher(REPORT_GENERATED)
    model_report_generated_message_publisher.publish(report_generated)

    LOGGER.info('Report generated!')

    return None
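# Illustrative sketch (keys taken from the loop above, values are made up and
# the fileType string is hypothetical): a generic file is treated as a
# per-model metrics csv when its blob metadata contains a fileInfo/FileInfo
# entry, a JSON string, whose fileType matches TRAINING_CSV_METRICS, e.g.:
#
#     metadata = {
#         'fileInfo': '{"modelName": "Naive Bayes", '
#                     '"fileType": "training_csv_metrics"}'
#     }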
def callback(body):
    """
    Pika callback function used by the ml predictor.
    Makes a file with properties predicted by the picked model and
    sends the file to blob storage for OSDR.

    :param body: RabbitMQ MT message's body
    """
    oauth = get_oauth()
    prediction_parameters = dict()
    fetch_token(oauth)

    # update prediction parameters:
    # add dataset as bytes and dataset file name (used for the report message)
    prediction_parameters['Dataset'], prediction_parameters[
        'DatasetFileName'] = get_dataset(oauth, body)
    model_info = get_model_info(oauth, body['ModelBlobId'],
                                body['ModelBucket'])
    fetch_token(oauth)
    model_info['ModelBlobId'] = body['ModelBlobId']
    model_info['ModelBucket'] = body['ModelBucket']
    model_info['ModelCode'] = algorithm_code_by_name(model_info['Method'])

    # update prediction parameters with model parameters, such as density
    # matrix, distance model, MODI, fingerprints etc
    prepare_prediction_parameters(oauth, prediction_parameters, model_info)

    # update prediction parameters: add the list of molecules
    prediction_parameters['Molecules'] = get_molecules_from_sdf_bytes(
        prediction_parameters['Dataset'])

    # define predictor object using prediction parameters
    # and make prediction for all molecules
    ml_predictor = MLPredictor(prediction_parameters)
    ml_predictor.make_prediction()

    # send prediction result to OSDR:
    # write prediction to csv
    prediction_csv_path = ml_predictor.write_prediction_to_csv()
    fetch_token(oauth)
    # prepare multipart object
    multipart_csv = get_multipart_object(body, prediction_csv_path, 'text/csv')
    # POST data to blob storage
    response_csv = post_data_to_blob(oauth, multipart_csv)

    # get prediction blob id and publish message in properties predicted queue
    prediction_created_event = property_predicted(
        body, os.path.basename(prediction_csv_path), response_csv.json()[0])
    properties_predicted_publisher = PurePublisher(PROPERTIES_PREDICTED)
    properties_predicted_publisher.publish(prediction_created_event)

    # remove prediction file from temporary folder
    os.remove(prediction_csv_path)
    # remove temporary models folder
    shutil.rmtree(prediction_parameters['ModelsFolder'], ignore_errors=True)
    # clear memory
    del MODELS_IN_MEMORY_CACHE[model_info['ModelBlobId']]

    return None
def train_model(body):
    """
    Pika callback function used by the ml modeller.
    Makes plot files, metrics files and a model file on success.

    :param body: RabbitMQ MT message's body
    """
    # prepare variables needed for the calculations
    body['CurrentModelId'] = body['CorrelationId']
    oauth = get_oauth()
    if not body['Method']:
        raise ValueError('Empty method in model trainer')
    body['Method'] = body['Method'].lower()
    method_code = body['Method']
    body['Name'] = '_'.join(algorithm_name_by_code(method_code).split())

    # publish training started event to OSDR
    fetch_token(oauth)
    publish_start_training_event(body)

    # validate input data;
    # raises an error on an invalid parameter
    validate_user_input(body)

    # make dataframe using input parameters
    model_type = model_type_by_code(method_code)
    try:
        dataframe = make_dataframe(model_type, body, oauth)
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Make dataframe exception')

    # calculate scaler model
    model_trainer = make_model_trainer(method_code, body, dataframe)
    LOGGER.info('Sending scaler file to {}/{}'.format(BLOB_URL, CLIENT_ID))
    fetch_token(oauth)

    # make applicability domain files and calculate values
    LOGGER.info('Calculate applicability domain')
    model_trainer.make_applicability_domain()

    # generic files for the current training model, except model (*.sav) files
    body['NumberOfGenericFiles'] = 0

    # train the chosen model
    LOGGER.info('Start model training')
    try:
        model_training_data = conduct_training(model_trainer, method_code,
                                               body, oauth)
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Model training exception')

    # POST model's files (images, csvs etc)
    try:
        # POST classifier model files
        if model_type == CLASSIFIER:
            post_classifier(model_trainer, body, oauth,
                            model_training_data['path_to_csv_file'])
        # POST regressor model files
        elif model_type == REGRESSOR:
            post_regressor(model_trainer, body, oauth)
    except:
        # log error traceback
        logging_exception_message(LOGGER)
        raise Exception('Post generic data exception')

    # update template tags with all the data needed for the model's report
    model_trainer.make_report_text()

    # make pdf report for the trained model
    pdf_path = make_pdf_report(model_trainer.sub_folder,
                               model_trainer.template_tags,
                               model_name=body['Name'])
    LOGGER.info('Sending pdf report to {}/{}'.format(BLOB_URL, CLIENT_ID))

    # POST pdf report to blob storage
    fetch_token(oauth)
    model_report_info = {
        'FileInfo': json.dumps({
            'modelName': model_trainer.cv_model.model_name,
            'fileType': MODEL_PDF_REPORT
        }),
        'ParentId': body['CurrentModelId']
    }
    multipart_pdf = get_multipart_object(body, pdf_path, 'application/pdf',
                                         additional_fields=model_report_info)
    post_data_to_blob(oauth, multipart_pdf)
    body['NumberOfGenericFiles'] += 1

    # remove temporary directory
    shutil.rmtree(model_trainer.sub_folder, ignore_errors=True)

    # send model trained message to OSDR via rabbitmq
    model_trained = model_trained_message(body, model_training_data,
                                          model_trainer)
    model_trained_message_publisher = PurePublisher(MODEL_TRAINED)
    model_trained_message_publisher.publish(model_trained)

    LOGGER.info('Finished Calculations!')

    return None
def find_optimal_parameters(body):
    """
    Pika callback function used by the ml optimizer.
    Finds the optimal training fingerprints set for the input dataset,
    using only 1000 (by default) or fewer structures from it.
    Sends the overall optimization result to Redis so it can be used
    in the ml training report.

    :param body: RabbitMQ MT message's body
    :type body: dict
    """
    oauth = get_oauth()

    # check input methods
    if not body['Methods']:
        raise ValueError('Empty Methods')

    # calculate metrics for each fingerprints set
    metrics, target_metric = fingerprints_grid_search(
        oauth, body, BASE_FINGERPRINTS)
    # send all metrics to redis;
    # used later when adding data to the training report
    REDIS_CLIENT.setex(
        'optimizer_metrics_{}'.format(body['CorrelationId']),
        EXPIRATION_TIME, json.dumps(metrics)
    )
    # find the best fingerprints set
    optimal_fingerprints = sorted(
        metrics.values(),
        key=lambda value: value['metrics'][target_metric],
        reverse=True
    )[0]['fptype']

    # set other default 'optimal' parameters for training the model
    body['SubSampleSize'] = 1.0
    body['TestDataSize'] = 0.3
    body['Scaler'] = 'MinMax'
    body['KFold'] = 5
    body['Fingerprints'] = optimal_fingerprints
    body['OptimizationMethod'] = 'default'
    body['NumberOfIterations'] = 100

    # make optimizer metrics csv and post it to blob storage
    formatted_metrics = TMP_TMP(
        metrics, model_type_by_code(body['Methods'][0].lower()))
    csv_path = '{}/ml_optimizer/{}/optimizing.csv'.format(
        TEMP_FOLDER, body['CorrelationId'])
    write_optimized_metrics_to_csv(formatted_metrics, csv_path)
    multipart_model = get_multipart_object(
        body, csv_path, 'application/x-spss-sav',
        additional_fields={'ParentId': body['TargetFolderId']}
    )
    # send optimizer metrics csv file to blob storage
    fetch_token(oauth)
    response = post_data_to_blob(oauth, multipart_model)
    LOGGER.info('Optimizer csv status code: {}'.format(response.status_code))

    # send the best fingerprints set and 'optimal' parameters to model training
    training_optimized = model_training_optimized(body)
    training_optimized_message_publisher = PurePublisher(TRAINING_OPTIMIZED)
    training_optimized_message_publisher.publish(training_optimized)

    # clear the current optimization folder
    shutil.rmtree(
        '{}/ml_optimizer/{}'.format(TEMP_FOLDER, body['CorrelationId']),
        ignore_errors=True
    )
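# Illustrative sketch (shape inferred from the sorted() call above; keys inside
# the fingerprint dicts and all values are made up): each entry of `metrics`
# is expected to expose its fingerprint set under 'fptype' and its scores under
# 'metrics', so the best set is the one with the highest target metric, e.g.:
#
#     metrics = {
#         'set_0': {'fptype': [{'Type': 'ECFP', 'Radius': 2, 'Size': 512}],
#                   'metrics': {'AUC': 0.81}},
#         'set_1': {'fptype': [{'Type': 'MACCS'}],
#                   'metrics': {'AUC': 0.77}},
#     }
#     # with target_metric == 'AUC' the selection above picks the ECFP set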