Example #1
    def test_svm_classifier_manual_test_set(self):
        """
        Test for classic classification (SVM classifier) with a manual test set
        """

        classname = 'Soluble'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               class_name_list=classname)
        manual_test_dataframe = sdf_to_csv(self.manual_test_file_path,
                                           self.fingerprints,
                                           class_name_list=classname)
        classic_classifier = ALGORITHM[TRAINER_CLASS][
            SUPPORT_VECTOR_MACHINE_CLASSIFIER](
                self.sdf_file_path,
                classname,
                dataframe,
                subsample_size=1.0,
                test_set_size=self.test_set_size,
                seed=0,
                fptype=self.fingerprints,
                scale='standard',
                output_path=self.temporary_folder,
                n_split=self.n_split,
                manual_test_set=manual_test_dataframe)
        classic_classifier.train_model(
            CODES[SUPPORT_VECTOR_MACHINE_CLASSIFIER])

        metrics = classic_classifier.metrics[
            SUPPORT_VECTOR_MACHINE_CLASSIFIER]['mean']
        true_metrics = {
            ('train', 'AUC'): 0.99,
            ('train', 'ACC'): 0.99,
            ('train', 'f1-score'): 0.99,
            ('train', 'Cohen_Kappa'): 0.95,
            ('train', 'Matthews_corr'): 0.96,
            ('train', 'Precision'): 0.99,
            ('train', 'Recall'): 0.99,
            ('test', 'AUC'): 0.95,
            ('test', 'ACC'): 0.93,
            ('test', 'f1-score'): 0.96,
            ('test', 'Cohen_Kappa'): 0.64,
            ('test', 'Matthews_corr'): 0.66,
            ('test', 'Precision'): 0.93,
            ('test', 'Recall'): 0.98,
            ('validation', 'AUC'): 0.94,
            ('validation', 'ACC'): 0.93,
            ('validation', 'f1-score'): 0.96,
            ('validation', 'Cohen_Kappa'): 0.59,
            ('validation', 'Matthews_corr'): 0.63,
            ('validation', 'Precision'): 0.93,
            ('validation', 'Recall'): 0.99
        }

        self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
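Note: these tests rely on a custom assertDictAlmostEqual assertion, which is not part of unittest. A minimal sketch of such a helper, assuming it only checks that both dicts share the same keys and that each pair of values agrees within delta (the project's real base class may differ):

import unittest

class DictAlmostEqualMixin(unittest.TestCase):
    # hypothetical helper, not the project's actual implementation
    def assertDictAlmostEqual(self, actual, expected, delta=0.1):
        self.assertEqual(set(actual), set(expected))
        for key, expected_value in expected.items():
            self.assertAlmostEqual(actual[key], expected_value, delta=delta)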
Example #2
    def test_classifier_applicability_domain(self):
        valuename = 'Soluble'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               class_name_list=valuename)
        classic_classifier = ALGORITHM[TRAINER_CLASS][NAIVE_BAYES](
            self.sdf_file_path,
            valuename,
            dataframe,
            subsample_size=1.0,
            test_set_size=self.test_set_size,
            seed=0,
            fptype=self.fingerprints,
            scale='minmax',
            output_path=self.temporary_folder,
            n_split=self.n_split)
        classic_classifier.make_applicability_domain()

        self.assertAlmostEqual(classic_classifier.distance_mean,
                               5.5480266,
                               delta=0.01)
        self.assertAlmostEqual(classic_classifier.distance_std,
                               2.28519,
                               delta=0.01)
        self.assertAlmostEqual(classic_classifier.density_mean,
                               1916.112310059458,
                               delta=0.01)
        self.assertAlmostEqual(classic_classifier.density_std,
                               0.08123426215633249,
                               delta=0.01)
        self.assertAlmostEqual(classic_classifier.modi, 0.79, delta=0.01)
        self.assertEqual(classic_classifier.train_shape, 1295)
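For context, applicability-domain statistics like distance_mean and distance_std are commonly derived from pairwise distances over the training matrix. A rough sketch of that general idea (an assumption about the approach, not this library's implementation):

import numpy as np
from scipy.spatial.distance import pdist

x_train = np.random.rand(100, 32)  # stand-in fingerprint matrix
distances = pdist(x_train)         # condensed pairwise Euclidean distances
distance_mean, distance_std = distances.mean(), distances.std()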
Example #3
    def test_elastic_net_metrics(self):
        """
        Test for classic regression metrics (elastic net)
        """

        valuename = 'logS'
        dataframe = sdf_to_csv(
            self.sdf_file_path, self.fingerprints, value_name_list=valuename)
        classic_regressor = ALGORITHM[TRAINER_CLASS][ELASTIC_NETWORK](
            self.sdf_file_path, valuename, dataframe, scale='minmax', seed=0,
            test_set_size=self.test_set_size, fptype=self.fingerprints,
            output_path=self.temporary_folder, n_split=self.n_split,
            subsample_size=1.0
        )
        classic_regressor.train_model(CODES[ELASTIC_NETWORK])

        metrics = classic_regressor.metrics[ELASTIC_NETWORK]['mean']
        true_metrics = {
            ('train', 'RMSE'): 0.51,
            ('train', 'MAE'): 0.39,
            ('train', 'R2'): 0.93,
            ('test', 'MAE'): 0.55,
            ('test', 'R2'): 0.87,
            ('test', 'RMSE'): 0.74,
            ('validation', 'R2'): 0.86,
            ('validation', 'RMSE'): 0.75,
            ('validation', 'MAE'): 0.58
        }

        self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
Example #4
    def make_dataframe(self, dataframe):
        """
        Method which makes an ndarray dataframe from the molecules dataset

        :return: ndarray dataframe with the dataset molecules
        """

        # make initial dataframe
        if dataframe is None:
            dataframe = sdf_to_csv(self.dataset_file_name,
                                   self.fptype,
                                   molecules=self.molecules)

        rows_to_delete = numpy.where(
            numpy.isnan(dataframe['value']).any(axis=1))
        dataframe = numpy.delete(dataframe, rows_to_delete, axis=0)

        for index in sorted(rows_to_delete[0], reverse=True):
            LOGGER.info(index)
            del self.molecules[index]

        # apply the scaler to the dataframe if it was used during training
        if self.scaler:
            dataframe = self.scaler.transform(dataframe['value'])
        else:
            dataframe = dataframe["value"]
        return dataframe
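A toy demonstration of the NaN-row filtering above, assuming the dataframe is a numpy structured array whose 'value' field holds one feature row per molecule (consistent with the indexing in the method):

import numpy

rows = numpy.zeros(3, dtype=[('value', 'f8', (4,))])
rows['value'][1, 2] = numpy.nan  # poison row 1
bad_rows = numpy.where(numpy.isnan(rows['value']).any(axis=1))[0]
rows = numpy.delete(rows, bad_rows, axis=0)  # rows 0 and 2 remain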
Example #5
    def test_sdf_processor(self):
        molecules = molecules_from_mol_strings([self.molstring])
        dataframe = sdf_to_csv('',
                               self.fingerprints,
                               find_classes=True,
                               find_values=True,
                               molecules=molecules)
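One way to produce a MOL string for such a test is to render it from SMILES with RDKit, assuming molecules_from_mol_strings accepts standard MOL blocks:

from rdkit import Chem

molstring = Chem.MolToMolBlock(Chem.MolFromSmiles('CCO'))  # ethanol MOL block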
Example #6
    def test_random_forest_regressor_test_size_zero(self):
        """
        Test for classic regression (random forest regressor) with test size zero
        """

        valuename = 'logS'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               value_name_list=valuename)
        classic_regressor = ALGORITHM[TRAINER_CLASS][RANDOM_FOREST_REGRESSOR](
            self.sdf_file_path,
            valuename,
            dataframe,
            scale='minmax',
            seed=0,
            test_set_size=self.test_set_size,
            fptype=self.fingerprints,
            output_path=self.temporary_folder,
            n_split=self.n_split,
            subsample_size=1.0)
        classic_regressor.train_model(CODES[RANDOM_FOREST_REGRESSOR])

        metrics = classic_regressor.metrics[RANDOM_FOREST_REGRESSOR]['mean']
        true_metrics = {
            ('train', 'RMSE'): 0.42,
            ('train', 'MAE'): 0.32,
            ('train', 'R2'): 0.95,
            ('validation', 'R2'): 0.90,
            ('validation', 'RMSE'): 0.63,
            ('validation', 'MAE'): 0.48
        }

        self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
Example #7
    def test_regressor_applicability_domain(self):
        valuename = 'logS'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               value_name_list=valuename)
        classic_regressor = ALGORITHM[TRAINER_CLASS][ELASTIC_NETWORK](
            self.sdf_file_path,
            valuename,
            dataframe,
            scale='minmax',
            seed=0,
            test_set_size=self.test_set_size,
            fptype=self.fingerprints,
            output_path=self.temporary_folder,
            n_split=self.n_split,
            subsample_size=1.0)
        classic_regressor.make_applicability_domain()

        self.assertAlmostEqual(classic_regressor.distance_mean,
                               5.5480266,
                               delta=0.01)
        self.assertAlmostEqual(classic_regressor.distance_std,
                               2.28519,
                               delta=0.01)
        self.assertAlmostEqual(classic_regressor.density_mean,
                               1916.112310059458,
                               delta=0.01)
        self.assertAlmostEqual(classic_regressor.density_std,
                               0.08123426215633249,
                               delta=0.01)
        self.assertAlmostEqual(classic_regressor.modi, 0.75, delta=0.01)
        self.assertEqual(classic_regressor.train_shape, 1295)
Example #8
    def test_ada_boost_skopt_hyperparameters(self):
        """
        Test for classic classification skopt hyperparameters (AdaBoost)
        """

        classname = 'Soluble'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               class_name_list=classname)
        ada_boost = ALGORITHM[TRAINER_CLASS][ADA_BOOST_DECISION_TREE](
            self.sdf_file_path,
            classname,
            dataframe,
            subsample_size=1.0,
            test_set_size=self.test_set_size,
            seed=0,
            fptype=self.fingerprints,
            scale='minmax',
            output_path=self.temporary_folder,
            n_split=self.n_split,
            opt_method='gauss')
        ada_boost.model_name = ADA_BOOST_DECISION_TREE
        parameters = ada_boost.make_training_parameters_grid()
        self.assertEqual(ada_boost_classifier_hyperparameters_skopt,
                         parameters)
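The expected ada_boost_classifier_hyperparameters_skopt grid is defined elsewhere in the project. For orientation, a scikit-optimize search space is typically a list of dimension objects; a hypothetical example (names and ranges are assumptions, not the project's actual grid):

from skopt.space import Integer, Real

ada_boost_hyperparameters_example = [
    Integer(50, 500, name='n_estimators'),
    Real(0.01, 1.0, prior='log-uniform', name='learning_rate'),
]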
Example #9
def make_dataframe(model_type, training_parameters, oauth):
    """
    Make a dataframe using the training parameter values.
    Downloads the sdf file from blob storage and converts it to a dataframe.

    :param model_type: type of training model, classifier or regressor
    :param training_parameters: model training parameters
    :param oauth: OAuth2Session object used by the ml service
    :type model_type: str
    :type training_parameters: dict
    :return: prepared dataframe with the needed training target column
    """

    # download the sdf file from blob storage
    # and make a stream from the downloaded object
    stream = make_stream_from_sdf(training_parameters, oauth)
    # unpack the training parameters into local variables
    filename = training_parameters['SourceFileName']
    classname = training_parameters['ClassName']
    fptype = training_parameters['Fingerprints']

    # create the dataframe using the training parameters and the stream
    LOGGER.info('Creating Fingerprints for molecules...')
    if model_type == CLASSIFIER:
        dataframe = sdf_to_csv(filename,
                               fptype,
                               class_name_list=classname,
                               stream=stream)
    elif model_type == REGRESSOR:
        dataframe = sdf_to_csv(filename,
                               fptype,
                               value_name_list=classname,
                               stream=stream)
    else:
        # raise an error for an unknown model type;
        # only the defined (known) model type constants are supported
        LOGGER.error('Unknown model type: {}'.format(model_type))
        raise TypeError('Unknown model type: {}'.format(model_type))

    LOGGER.info('Fingerprints created.')

    return dataframe
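A hypothetical invocation, assuming the CLASSIFIER constant and an authenticated oauth session are in scope; the dict keys follow the lookups inside the function:

training_parameters = {
    'SourceFileName': 'solubility.sdf',   # assumed example values
    'ClassName': 'Soluble',
    'Fingerprints': [{'Type': 'MACCS'}],
}
dataframe = make_dataframe(CLASSIFIER, training_parameters, oauth)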
Example #10
    def test_svm_regressor_manual_test_set(self):
        """
        Test for classic regression (SVM regressor) with a manual test set
        """

        valuename = 'logS'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               value_name_list=valuename)
        manual_test_dataframe = sdf_to_csv(self.manual_test_file_path,
                                           self.fingerprints,
                                           value_name_list=valuename)
        classic_regressor = ALGORITHM[TRAINER_CLASS][
            SUPPORT_VECTOR_MACHINE_REGRESSOR](
                self.sdf_file_path,
                valuename,
                dataframe,
                seed=0,
                test_set_size=self.test_set_size,
                fptype=self.fingerprints,
                output_path=self.temporary_folder,
                n_split=self.n_split,
                manual_test_set=manual_test_dataframe,
                subsample_size=1.0)
        classic_regressor.train_model(CODES[SUPPORT_VECTOR_MACHINE_REGRESSOR])

        metrics = classic_regressor.metrics[SUPPORT_VECTOR_MACHINE_REGRESSOR][
            'mean']
        true_metrics = {
            ('train', 'RMSE'): 0.54,
            ('train', 'MAE'): 0.38,
            ('train', 'R2'): 0.93,
            ('test', 'MAE'): 0.64,
            ('test', 'R2'): 0.78,
            ('test', 'RMSE'): 0.88,
            ('validation', 'R2'): 0.75,
            ('validation', 'RMSE'): 1.0,
            ('validation', 'MAE'): 0.68
        }

        self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
Example #11
    def test_dnn_classification_metrics(self):
        """
        Test for DNN classification metrics
        """

        valuename = 'Soluble'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               class_name_list=valuename)
        dnn_classifier = ALGORITHM[TRAINER_CLASS][DNN_CLASSIFIER](
            self.sdf_file_path,
            valuename,
            dataframe,
            subsample_size=1.0,
            test_set_size=self.test_set_size,
            seed=0,
            fptype=self.fingerprints,
            scale='minmax',
            output_path=self.temporary_folder,
            n_split=self.n_split)
        dnn_classifier.train_model(CODES[DNN_CLASSIFIER])

        metrics = dnn_classifier.metrics[DNN_CLASSIFIER]['mean']
        true_metrics = {
            ('train', 'AUC'): 0.99,
            ('train', 'ACC'): 0.97,
            ('train', 'f1-score'): 0.98,
            ('train', 'Cohen_Kappa'): 0.84,
            ('train', 'Matthews_corr'): 0.85,
            ('train', 'Precision'): 0.96,
            ('train', 'Recall'): 0.99,
            ('test', 'AUC'): 0.96,
            ('test', 'ACC'): 0.94,
            ('test', 'f1-score'): 0.96,
            ('test', 'Cohen_Kappa'): 0.67,
            ('test', 'Matthews_corr'): 0.69,
            ('test', 'Precision'): 0.94,
            ('test', 'Recall'): 0.99,
            ('validation', 'AUC'): 0.89,
            ('validation', 'ACC'): 0.92,
            ('validation', 'f1-score'): 0.95,
            ('validation', 'Cohen_Kappa'): 0.55,
            ('validation', 'Matthews_corr'): 0.57,
            ('validation', 'Precision'): 0.93,
            ('validation', 'Recall'): 0.98
        }

        self.assertDictAlmostEqual(metrics, true_metrics, delta=0.3)
Example #12
    def test_naive_bayes_metrics(self):
        """
        Test for classic classification metrics (naive Bayes)
        """

        valuename = 'Soluble'
        dataframe = sdf_to_csv(
            self.sdf_file_path, self.fingerprints, class_name_list=valuename)
        classic_classifier = ALGORITHM[TRAINER_CLASS][NAIVE_BAYES](
            self.sdf_file_path, valuename, dataframe, subsample_size=1.0,
            test_set_size=self.test_set_size, seed=0, fptype=self.fingerprints,
            scale='robust', output_path=self.temporary_folder,
            n_split=self.n_split
        )
        classic_classifier.train_model(CODES[NAIVE_BAYES])

        metrics = classic_classifier.metrics[NAIVE_BAYES]['mean']
        true_metrics = {
            ('train', 'AUC'): 0.85,
            ('train', 'ACC'): 0.81,
            ('train', 'f1-score'): 0.88,
            ('train', 'Cohen_Kappa'): 0.41,
            ('train', 'Matthews_corr'): 0.47,
            ('train', 'Precision'): 0.97,
            ('train', 'Recall'): 0.8,
            ('test', 'AUC'): 0.89,
            ('test', 'ACC'): 0.81,
            ('test', 'f1-score'): 0.88,
            ('test', 'Cohen_Kappa'): 0.46,
            ('test', 'Matthews_corr'): 0.53,
            ('test', 'Precision'): 0.98,
            ('test', 'Recall'): 0.79,
            ('validation', 'AUC'): 0.83,
            ('validation', 'ACC'): 0.81,
            ('validation', 'f1-score'): 0.88,
            ('validation', 'Cohen_Kappa'): 0.40,
            ('validation', 'Matthews_corr'): 0.45,
            ('validation', 'Precision'): 0.97,
            ('validation', 'Recall'): 0.81
        }

        self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
Example #13
    def test_logistic_regression_test_size_zero(self):
        """
        Test for classic classification (logistic regression) with test size zero
        """

        classname = 'Soluble'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               class_name_list=classname)
        classic_classifier = ALGORITHM[TRAINER_CLASS][LOGISTIC_REGRESSION](
            self.sdf_file_path,
            classname,
            dataframe,
            subsample_size=1.0,
            test_set_size=self.test_set_size,
            seed=0,
            fptype=self.fingerprints,
            scale='minmax',
            output_path=self.temporary_folder,
            n_split=self.n_split)
        classic_classifier.train_model(CODES[LOGISTIC_REGRESSION])

        metrics = classic_classifier.metrics[LOGISTIC_REGRESSION]['mean']
        true_metrics = {
            ('train', 'AUC'): 0.99,
            ('train', 'ACC'): 0.97,
            ('train', 'f1-score'): 0.98,
            ('train', 'Cohen_Kappa'): 0.84,
            ('train', 'Matthews_corr'): 0.84,
            ('train', 'Precision'): 0.97,
            ('train', 'Recall'): 0.99,
            ('validation', 'AUC'): 0.97,
            ('validation', 'ACC'): 0.95,
            ('validation', 'f1-score'): 0.97,
            ('validation', 'Cohen_Kappa'): 0.70,
            ('validation', 'Matthews_corr'): 0.73,
            ('validation', 'Precision'): 0.94,
            ('validation', 'Recall'): 0.99
        }

        self.assertDictAlmostEqual(metrics, true_metrics, delta=0.1)
Example #14
    def test_dnn_regression_metrics(self):
        """
        Test for DNN regression metrics
        """

        valuename = 'logS'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               value_name_list=valuename)
        dnn_regressor = ALGORITHM[TRAINER_CLASS][DNN_REGRESSOR](
            self.sdf_file_path,
            valuename,
            dataframe,
            scale='minmax',
            seed=0,
            test_set_size=self.test_set_size,
            fptype=self.fingerprints,
            output_path=self.temporary_folder,
            n_split=self.n_split,
            subsample_size=1.0)
        dnn_regressor.train_model(CODES[DNN_REGRESSOR])

        metrics = dnn_regressor.metrics[DNN_REGRESSOR]['mean']
        true_metrics = {
            ('train', 'RMSE'): 0.60,
            ('train', 'MAE'): 0.46,
            ('train', 'R2'): 0.91,
            ('test', 'MAE'): 0.62,
            ('test', 'R2'): 0.84,
            ('test', 'RMSE'): 0.82,
            ('validation', 'R2'): 0.81,
            ('validation', 'RMSE'): 0.87,
            ('validation', 'MAE'): 0.67
        }

        self.assertDictAlmostEqual(metrics, true_metrics, delta=0.3)
Example #15
    def test_elastic_network_skopt_hyperparameters(self):
        """
        Test for classic regression skopt hyperparameters (elastic network)
        """

        valuename = 'logS'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               value_name_list=valuename)
        elastic_network_regressor = ALGORITHM[TRAINER_CLASS][ELASTIC_NETWORK](
            self.sdf_file_path,
            valuename,
            dataframe,
            subsample_size=1.0,
            test_set_size=self.test_set_size,
            seed=0,
            fptype=self.fingerprints,
            scale='minmax',
            output_path=self.temporary_folder,
            n_split=self.n_split,
            opt_method='forest')
        elastic_network_regressor.model_name = ELASTIC_NETWORK
        parameters = elastic_network_regressor.make_training_parameters_grid()
        self.assertEqual(elastic_network_hyperparameters_skopt, parameters)
Example #16
layers = [64, 64]

input_drop_out = 0.0
drop_out = 0.0
n_split = 10
optimizer = 'Nadam'
activation = 'selu'
l_rate = 0.01
beta = 0.0001
k_constraint = 4
mc_train_cut_off = 0.65

output_path = 'C:\\PycharmProjects\\ml.services\\Source\\callers and models'

dataframe = sdf_to_csv(filepath,
                       fptype,
                       value_name_list=valuename,
                       cut_off=0.1)

# dataframe = pd.read_csv(filename)
# x = dataframe.values #returns a numpy array
# min_max_scaler = preprocessing.MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# headers = [x for x in range(797)]
# headers.append('Tox')
# dataframe = pd.DataFrame(x_scaled,columns=headers)
# print(dataframe)

classifier = DNNClassifier(ntpath.basename(filepath),
                           classname,
                           dataframe,
                           test_set_size=test_set_size,
Example #17
def process_sdf(body, sdf_as_bytes_array):
    fingerprints = list()
    for fingerprint in body['Fingerprints']:
        new_fingerprint = dict()
        for key, value in fingerprint.items():
            new_fingerprint[key.capitalize()] = value

        fingerprints.append(new_fingerprint)

    body['Fingerprints'] = fingerprints
    molecules = get_molecules_from_sdf_bytes(sdf_as_bytes_array)
    errors_list = [[]]
    try:
        data_frame = sdf_to_csv('',
                                body['Fingerprints'],
                                find_classes=True,
                                find_values=True,
                                molecules=molecules,
                                processing_errors=errors_list)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t make dataframe using this sdf file'
        raise Exception(error_message)

    smiles_list = list()
    inchies = list()
    for molecule_number, molecule in enumerate(molecules):
        smiles = Chem.MolToSmiles(molecule, isomericSmiles=True)
        inchi = get_inchi_key(molecule)
        smiles_list.append(('SMILES', molecule_number, smiles))
        inchies.append(('InChiKey', molecule_number, inchi))

    smiles_numpy_array = numpy.array(smiles_list,
                                     dtype=[('name', 'U17'),
                                            ('molecule_number', 'i4'),
                                            ('value', 'U40')])
    inchi_numpy_array = numpy.array(inchies,
                                    dtype=[('name', 'U17'),
                                           ('molecule_number', 'i4'),
                                           ('value', 'U40')])
    errors_numpy_array = numpy.array(errors_list[0],
                                     dtype=[('name', 'U17'),
                                            ('molecule_number', 'i4'),
                                            ('value', 'U40')])
    data_frame = data_frame.astype([('name', 'U10'), ('molecule_number', 'i4'),
                                    ('value', 'U40')])

    data_frame = numpy.insert(data_frame, 0, inchi_numpy_array, axis=1)
    data_frame = numpy.insert(data_frame, 0, smiles_numpy_array, axis=1)
    data_frame = numpy.insert(data_frame,
                              data_frame.shape[1],
                              errors_numpy_array,
                              axis=1)

    csv_file_path = '{}/{}.csv'.format(TEMP_FOLDER, body['CorrelationId'])

    try:
        numpy_to_csv(data_frame, csv_file_path)
    except Exception:
        # log error traceback
        logging_exception_message(LOGGER)
        error_message = 'Can\'t convert dataframe to csv file'
        raise Exception(error_message)

    body['Structures'] = data_frame.shape[0]
    body['Columns'] = data_frame.shape[1]
    body['Failed'] = 0

    return csv_file_path
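A minimal illustration of the column insertions above: with axis=1, numpy.insert splices a column of records into a 2-D structured array (toy data, same dtype as in the function):

import numpy

dtype = [('name', 'U17'), ('molecule_number', 'i4'), ('value', 'U40')]
frame = numpy.array([[('MACCS', 0, '1')],
                     [('MACCS', 1, '0')]], dtype=dtype)    # shape (2, 1)
smiles = numpy.array([('SMILES', 0, 'CCO'),
                      ('SMILES', 1, 'CCN')], dtype=dtype)  # shape (2,)
frame = numpy.insert(frame, 0, smiles, axis=1)             # now shape (2, 2)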
Example #18
        shuffle=False,
        verbose=0)

    plot_train_history(model_history_tmp, 'compressor_0_1', '')

    # load the best model, based on validation results for this fold
    autoencoder = load_model('checkpoint.h5')

    latent_to_map = Model(input, neck_out)
    latent_to_map.save('smi2lat.h5')

    return latent_to_map, var_thresh, scaler


fptype = [{'Type': 'DESC'}, {'Type': 'SEQ'}]
dataframe = sdf_to_csv(path_to_sdf, fptype=fptype, class_name_list=classname)
dataframe = dataframe.apply(pd.to_numeric, errors='coerce').replace(
    [np.inf, -np.inf], np.nan).dropna(axis=0, how='any').reset_index(drop=True)

x_features = dataframe.iloc[:, :-1]
print(x_features)
lat2coord, var_thresh, scaler = get_mapper(x_features)

data_list = []
for id, row in dataframe.iterrows():
    lis = []
    lis.extend(
        list(
            lat2coord.predict(
                scaler.transform(
                    var_thresh.transform(np.reshape(row[:-1].values,
Example #19
    def test_neighbors_regressor_report(self):
        valuename = 'logS'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               value_name_list=valuename)
        classic_regressor = ALGORITHM[TRAINER_CLASS][NEIGHBORS_REGRESSOR](
            self.sdf_file_path,
            valuename,
            dataframe,
            scale='minmax',
            seed=0,
            test_set_size=self.test_set_size,
            fptype=self.fingerprints,
            output_path=self.temporary_folder,
            n_split=self.n_split,
            subsample_size=0.2)
        classic_regressor.train_model(CODES[NEIGHBORS_REGRESSOR])
        classic_regressor.make_applicability_domain()
        plots = classic_regressor.cv_model.make_plots()
        path_to_csv = classic_regressor.make_perfomance_csv().split('/')[-1]
        path_to_qmrf_report = classic_regressor.make_qmrf_report(
            None, None).split('/')[-1]
        path_to_archive = classic_regressor.cv_model.compress_models()
        path_to_archive = classic_regressor.compress_additional_files(
            path_to_archive).split('/')[-1]
        path_to_distribution_plot = distribution_plot(
            classic_regressor, model_name='Nearest Neighbors').split('/')[-1]

        prepare_plots(plots)
        true_plots = {
            '1': {
                'regression_results_test':
                'K Neighbors Regressor_logS_test_fold_1_regression_plot.png',
                'regression_results_train':
                'K Neighbors Regressor_logS_train_fold_1_regression_plot.png',
                'regression_results_valid':
                'K Neighbors Regressor_logS_validation_fold_1_regression_plot.png',
                'thumbnail_plot_path': None
            },
            '2': {
                'regression_results_test':
                'K Neighbors Regressor_logS_test_fold_2_regression_plot.png',
                'regression_results_train':
                'K Neighbors Regressor_logS_train_fold_2_regression_plot.png',
                'regression_results_valid':
                'K Neighbors Regressor_logS_validation_fold_2_regression_plot.png',
                'thumbnail_plot_path': None
            },
            'mean': {
                'regression_results_test':
                'K Neighbors Regressor_logS_test_fold_mean_regression_plot.png',
                'regression_results_train':
                'K Neighbors Regressor_logS_train_fold_mean_regression_plot.png',
                'regression_results_valid':
                'K Neighbors Regressor_logS_validation_fold_mean_regression_plot.png',
                'thumbnail_plot_path':
                'K Neighbors Regressor_logS_thumbnail_image.jpg'
            }
        }

        self.assertDictEqual(plots, true_plots)
        self.assertEqual(path_to_distribution_plot,
                         'Nearest Neighbors_train_test_distribution.png')
        self.assertEqual(path_to_csv,
                         'K Neighbors Regressor_DNN_data_solubility.csv')
        self.assertEqual(path_to_qmrf_report,
                         'K_Neighbors_Regressor_QMRF_report.pdf')
        self.assertEqual(path_to_archive, 'K_Neighbors_Regressor.zip')
Example #20
    def test_random_forest_classifier_report(self):
        valuename = 'Soluble'
        dataframe = sdf_to_csv(self.sdf_file_path,
                               self.fingerprints,
                               class_name_list=valuename)
        classic_classifier = ALGORITHM[TRAINER_CLASS][
            RANDOM_FOREST_CLASSIFIER](self.sdf_file_path,
                                      valuename,
                                      dataframe,
                                      subsample_size=0.2,
                                      test_set_size=self.test_set_size,
                                      seed=0,
                                      fptype=self.fingerprints,
                                      scale='robust',
                                      output_path=self.temporary_folder,
                                      n_split=self.n_split)
        classic_classifier.train_model(CODES[RANDOM_FOREST_CLASSIFIER])
        classic_classifier.make_applicability_domain()
        plots = classic_classifier.cv_model.make_plots()
        path_to_csv = classic_classifier.make_perfomance_csv()
        path_to_qmrf_report = classic_classifier.make_qmrf_report(
            None, None).split('/')[-1]
        path_to_archive = classic_classifier.cv_model.compress_models()
        path_to_archive = classic_classifier.compress_additional_files(
            path_to_archive).split('/')[-1]
        path_to_radar_plot = radar_plot(
            path_to_csv,
            classic_classifier.sub_folder,
            classic_classifier.bins,
            titlename='Random Forest').split('/')[-1]
        path_to_csv = path_to_csv.split('/')[-1]

        prepare_plots(plots)
        true_plots = {
            '1': {
                'roc_plot_path':
                'Random_Forest_Classifier_fold_1_ROC_plot.png',
                'cm_plot_path':
                'Random_Forest_Classifier_fold_1_confusion.png',
                'thumbnail_plot_path': None
            },
            '2': {
                'roc_plot_path':
                'Random_Forest_Classifier_fold_2_ROC_plot.png',
                'cm_plot_path':
                'Random_Forest_Classifier_fold_2_confusion.png',
                'thumbnail_plot_path': None
            },
            'mean': {
                'roc_plot_path':
                'Random_Forest_Classifier_fold_mean_ROC_plot.png',
                'cm_plot_path':
                'Random_Forest_Classifier_fold_mean_confusion.png',
                'thumbnail_plot_path':
                'Random_Forest_Classifier_thumbnail_image.jpg'
            }
        }

        self.assertDictEqual(plots, true_plots)
        self.assertEqual(path_to_radar_plot, 'radar_plot.png')
        self.assertEqual(path_to_csv,
                         'Random Forest Classifier_DNN_data_solubility.csv')
        self.assertEqual(path_to_qmrf_report,
                         'Random_Forest_Classifier_QMRF_report.pdf')
        self.assertEqual(path_to_archive, 'Random_Forest_Classifier.zip')
Example #21
layers = [64, 64]

input_drop_out = 0.1
drop_out = 0.0
n_split = 10
optimizer = 'Nadam'
activation = 'relu'
l_rate = 0.005
beta = 0.00001
k_constraint = 4
mc_train_cut_off = 0.65

output_path = 'C:\\PycharmProjects\\ml.services\\Source\\callers and models'

dataframe = sdf_to_csv(filepath, fptype, value_name_list=valuename)
dataframe_test = sdf_to_csv(filepath_test, fptype, value_name_list=valuename)

regressor = ALGORITHM[TRAINER_CLASS][DNN_REGRESSOR](
    ntpath.basename(filepath),
    valuename,
    dataframe,
    test_set_size=test_set_size,
    fptype=fptype,
    n_split=n_split,
    output_path=output_path,
    scale="standard",
    manual_test_set=dataframe_test)

dnn = regressor.train_model(CODES[DNN_REGRESSOR])
dnn.make_plots()
Example #22
def fingerprints_grid_search(
        oauth, body, fingerprints, subsample_size=1000
):
    """
    Search for the optimal combination of fingerprints.
    subsample_size molecules are extracted from the initial dataset and used
    to train multiple models with varying combinations of fingerprints.

    :param oauth: OAuth2Session object used by the ml service
    :param body: message body with the training parameters
    :param fingerprints: list of fingerprint combinations
    :param subsample_size: number of molecules used to train each model
    :return: metrics dict per fingerprint combination and the target metric name
    """

    # make folder for current optimization
    optimizer_folder = '{}/ml_optimizer/{}'.format(
        TEMP_FOLDER, body['CorrelationId'])
    make_directory(optimizer_folder)

    # download and save sdf file
    stream = make_stream_from_sdf(body, oauth)
    filename = body['SourceFileName']
    temporary_sdf_filename = '{}/tmp_{}.sdf'.format(optimizer_folder, filename)
    with open(temporary_sdf_filename, 'wb') as temporary_sdf_file:
        temporary_sdf_file.write(stream.getvalue())

    # extract a sample of subsample_size molecules from the source dataset
    prediction_target = body['ClassName']
    mode = model_type_by_code(body['Methods'][0].lower())
    sample_file_name = extract_sample_dataset(
        input_file_name=temporary_sdf_filename, subsample_size=subsample_size,
        prediction_target=prediction_target, mode=mode
    )

    # choose the model code and target metric for the given mode
    if mode == CLASSIFIER:
        model_code = NAIVE_BAYES
        target_metric = 'test__AUC'
    elif mode == REGRESSOR:
        model_code = ELASTIC_NETWORK
        target_metric = 'test__R2'
    else:
        raise ValueError('Unknown mode: {}'.format(mode))

    # loop over all base fingerprint sets to find the best one
    metrics = dict()
    for fingerprint_number, fptype in enumerate(fingerprints):

        # build the dataframe depending on the fingerprint set
        # and the model type (classifier or regressor)
        start_fps_processing = time()
        if mode == CLASSIFIER:
            dataframe = sdf_to_csv(
                sample_file_name, fptype=fptype,
                class_name_list=prediction_target
            )
        elif mode == REGRESSOR:
            dataframe = sdf_to_csv(
                sample_file_name, fptype=fptype,
                value_name_list=prediction_target
            )
        else:
            raise ValueError('Unknown mode: {}'.format(mode))
        fps_processing_time_seconds = time() - start_fps_processing

        # train model
        start_current_training = time()
        classic_classifier = ALGORITHM[TRAINER_CLASS][model_code](
            sample_file_name, prediction_target, dataframe, subsample_size=1.0,
            test_set_size=0.2, seed=0, fptype=fptype, scale='minmax',
            n_split=1, output_path=optimizer_folder
        )
        classic_classifier.train_model(CODES[model_code])
        current_training_time_seconds = time() - start_current_training

        # add the model's formatted metrics and timings to the results dict
        formatted_metrics = format_metrics(
            classic_classifier.metrics[model_code]['mean'])
        metrics.update({
            fingerprint_number: {
                'fptype': fptype,
                'metrics': formatted_metrics,
                'fingerprint_processing_time': fps_processing_time_seconds,
                'prediction_time': current_training_time_seconds
            }
        })

    return metrics, target_metric
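A hedged usage sketch, assuming an authenticated oauth session and a body dict shaped like the one unpacked inside the function; each entry in the list is one fingerprint combination to score:

fingerprint_combinations = [
    [{'Type': 'MACCS'}],
    [{'Type': 'AVALON', 'Size': 512}],
    [{'Type': 'MACCS'}, {'Type': 'FCFC', 'Size': 512, 'Radius': 3}],
]
metrics, target_metric = fingerprints_grid_search(
    oauth, body, fingerprint_combinations, subsample_size=1000)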
Example #23
import os
from os import listdir

import joblib
import pandas as pd
import sklearn
from keras.models import load_model
from rdkit import Chem

print(sklearn.__version__)



suppl = Chem.SDMolSupplier(
    r'C:\PycharmProjects\ml-data-qsar\TEST\LC50\LC50_training.sdf')
molecules = [x for x in suppl if x is not None]

fptype = [{'Type': 'DESC'},
          {'Type': 'MACCS'},
          {'Type': 'FCFC', 'Size': 512, 'Radius': 3},
          {'Type': 'AVALON','Size': 512}]
dataframe = sdf_to_csv('LC50_prediction', fptype=fptype, molecules=molecules)


folder_path = r'C:\PycharmProjects\ml-models\UBC\Half_LIfe_U_2018_03_18__14_24_16_DESC_MACCS_FCFC_512_3_AVALON_512_scaled___'
models_paths = [os.path.join(folder_path, x) for x in listdir(folder_path) if x.split('.')[-1] == 'h5']
transformers = [os.path.join(folder_path, x) for x in listdir(folder_path) if x.split('.')[-1] == 'sav']


predicted_test_y_vectors = []
df_predict_clf = pd.DataFrame()
for transformer in transformers:
    trans = joblib.load(transformer)

for path_to_model in models_paths:
    model_base = load_model(
        path_to_model,