Example #1
def cv(db,
       csv_target,
       csv_descriptors,
       n_splits_,
       n_repeats_,
       num_epochs,
       n_rot_train,
       train_steps_per_epoch_,
       n_rot_test,
       test_steps_per_epoch_,
       ndense_layers,
       nunits,
       nfilters,
       random_state,
       cvout=None,
       fcvgroup=None,
       featimp_out=None,
       y_recalc=False,
       mout=None):
    # Load the dataset
    ai = AIModel(csv_target, db, csv_descriptors)
    available_keys = ai.GetAvailableKeys()
    print("N. instances: %d" % (len(ai.target)))
    predictions = dict()
    valpredictions = dict()
    for key in ai.target.keys():
        predictions[key] = []
        valpredictions[key] = []

    feat_imp = None
    feat_imp_iterations = 20

    if featimp_out is not None:
        # feature importance list for csv descriptors
        if ai.other_descriptors is not None:
            feat_imp = [[] for p in range(ai.nfeatures)]
            # charge voxel descriptor
            feat_imp.append([])
        else:
            print("Feature Importance calculation: DISABLED")

    # Create directory to store all the models
    mout_path = None
    if mout is not None:
        # Utilised to store the out path
        mout_path = Path("%s_%s" % (time.strftime("%Y%m%d%H%M%S"), mout))
        mout_path.mkdir(exist_ok=True, parents=True)
        if ai.other_descriptors is not None:
            # Save the descriptor order
            f = open("%s/odesc_header.csv" % (str(mout_path.absolute())), "w")
            for item in ai.header:
                f.write("%s\n" % (item))
            f.close()
    # Choose between static manual cross validation group or
    # Repeated KFold Cross Validation
    cvmethod = None
    cvgroups = None
    if fcvgroup is not None:
        cvgroups = CVGroupRead(fcvgroup)
        cvmethod = StaticGroupCV(cvgroups)
        # cvmethod = RepeatedStratifiedCV(cvgroups, n_repeats_, 2)
    else:
        cvmethod = RepeatedKFold(available_keys,
                                 n_splits_,
                                 n_repeats_,
                                 random_state,
                                 test_size=0.2)
    cv_ = 0
    for train_keys, val_keys, test_keys in cvmethod:
        print("Train set size: %d Val set size %d Test set size: %d" %
              (len(train_keys), len(val_keys), len(test_keys)))
        # Some memory clean-up
        K.clear_session()
        # print(global_test_intexes)
        model = None
        model_ = GetKerasModel()
        if ai.other_descriptors is None:
            if model_ is None:
                model = build_model(ai.conv3d_chtype, ai.input_shape,
                                    ndense_layers, nunits, nfilters)
            else:
                model = model_(ai.conv3d_chtype, ai.input_shape, ndense_layers,
                               nunits, nfilters)
            # model = model_scirep(ai.conv3d_chtype, ai.input_shape, ndense_layers, nunits, nfilters)
            # model = ResNetModel(ai.input_shape)
            print(model.summary())
        else:
            if model_ is None:
                model = build_2DData_model(ai.conv3d_chtype, ai.input_shape,
                                           ai.nfeatures, ndense_layers, nunits,
                                           nfilters)
            else:
                model = model_(ai.conv3d_chtype, ai.input_shape,
                               ai.nfeatures, ndense_layers, nunits,
                               nfilters)
            """
            for l in model.layers[0].layers:
                print(l.summary())
            """
            print("Total Summary")
            print(model.summary())

        dname = os.path.basename(csv_target).replace(".csv", "")
        log_dir_ = ("./logs/cv%d_%s_%d_#rot%d_#f%d_#dl%d_#u%d_" %
                    (cv_, dname, num_epochs, train_steps_per_epoch_, nfilters,
                     ndense_layers, nunits))
        log_dir_ += time.strftime("%Y%m%d%H%M%S")

        model_outfile = "%s/%d.h5" % (str(mout_path.absolute()), cv_)
        callbacks_ = [
            TensorBoard(log_dir=log_dir_,
                        histogram_freq=0,
                        write_graph=False,
                        write_images=False),
            ModelCheckpoint(model_outfile,
                            monitor='val_loss',
                            verbose=0,
                            save_best_only=True)
        ]

        train_generator = ai.VoxelTrainGenerator(train_keys, n_rot_train)
        x_train_, y_train_ = ai.VoxelTestSetGenerator(train_keys, n_rot_train)
        x_test_, y_test_ = ai.VoxelTestSetGenerator(test_keys, n_rot_test)
        x_val_, y_val_ = ai.VoxelTestSetGenerator(val_keys, n_rot_test)
        val_generator = ai.VoxelTrainGenerator(val_keys, n_rot_test)
        model.fit_generator(
            train_generator,
            epochs=num_epochs,
            steps_per_epoch=train_steps_per_epoch_,
            verbose=1,
            # validation_data=(x_test_, y_test_),
            validation_data=val_generator,
            validation_steps=test_steps_per_epoch_,
            callbacks=callbacks_,
            use_multiprocessing=True)
        """
        if y_recalc is True:
            # Recalculating y takes a lot of time
            x_dataset_, y_dataset_ = ai.VoxelTestSetGenerator(train_keys, n_rotation_test)
            yrecalc = model.predict(x_dataset_)
            # Store the recalculated y
            k = 0
            c = 0
            for i in range(len(yrecalc)):
                recalc[train_keys[k]].extend(list(yrecalc[i]))
                if c == n_rotation_test-1:
                    k += 1
                    c = 0
                else:
                    c += 1
        """
        """
        test_scores = model.evaluate(x_test_, y_test_)
        print("Test Scores: {}".format(test_scores))
        """
        model = GetLoadModelFnc()(model_outfile)
        # Use a separate name for the recalculated predictions to avoid
        # shadowing the y_recalc function argument
        yrecalc_train = model.predict(x_train_)
        ypred_test = model.predict(x_test_)
        ypred_val = model.predict(x_val_)
        # exp_pred_plot(y_test_, ypred_test[:, 0])
        r2 = RSQ(y_train_, yrecalc_train)
        q2 = RSQ(y_test_, ypred_test)
        vr2 = RSQ(y_val_, ypred_val)
        print("Train R2: %.4f Test Q2: %.4f Val R2: %.4f\n" % (r2, q2, vr2))

        # Store the validation prediction results
        k = 0
        c = 0
        for i in range(len(ypred_val)):
            valpredictions[val_keys[k]].append(list(ypred_val[i]))
            if c == n_rot_test - 1:
                k += 1
                c = 0
            else:
                c += 1

        # Store the cross validation result
        k = 0
        c = 0
        for i in range(len(ypred_test)):
            predictions[test_keys[k]].append(list(ypred_test[i]))
            if c == n_rot_test - 1:
                k += 1
                c = 0
            else:
                c += 1
        """
        Compute the feature importance according to the Breiman-Fisher-Rudin-Dominici-Algorithm
        Train a model f with a feature map X and a target vector y. Measure th error L(y, y_pred) = e_original

        Input: trained model f, feature matrix X, target vector y, error measure L(y, y_pred)
        1) Estimate the original model error
        2) For each feature:
          - Generate a feature matrix with the p feature permutated N times to breaks the
            association between Xj and y
          - estimate the error using the permutated X feature matrix
          - calculate the feature importance FI = e_perm/e_original or FI = e_perm - e_original
        3) Sort variables by descending Fi

        The error estimation utilised is the mean squared error calculated with this formula
        mse = (np.square(A - B)).mean(axis=0)
        """
        if feat_imp is not None:
            # e_orig = MSE(list(y_test_), list(ypred_test))
            e_orig = MAE(list(y_test_), list(ypred_test))
            # Calculate the feature importance for the descriptors;
            # permute the test matrix so the error is comparable to e_orig
            for fid_ in range(ai.nfeatures):
                for it in range(feat_imp_iterations):
                    x_test_perm = ai.FeaturePermutation(x_test_, fid=fid_)
                    ypred_perm = model.predict(x_test_perm)
                    # e_perm = MSE(list(y_test_), list(ypred_perm))
                    e_perm = MAE(list(y_test_), list(ypred_perm))
                    feat_imp[fid_].append(e_perm / e_orig)

            # Calculate the feature importance for the voxel information
            for it in range(feat_imp_iterations):
                x_test_perm = ai.FeaturePermutation(x_test_, fid=9999)
                ypred_perm = model.predict(x_test_perm)
                e_perm = MAE(list(y_test_), list(ypred_perm))
                feat_imp[-1].append(e_perm / e_orig)

        if mout_path is not None:
            model.save("%s/%d.h5" % (str(mout_path.absolute()), cv_))
        # Update the cross validation id
        cv_ += 1

    if cvout is not None:
        WriteCrossValidationOutput(cvout, ai.target, predictions, None)

    if feat_imp is not None:
        fo = open("%s" % (featimp_out), "w")
        for i in range(ai.nfeatures):
            """
            fo.write("%s," % (ai.header[i]))
            for j in range(len(feat_imp[i])-1):
                fo.write("%.4f," % (feat_imp[i][j]))
            fo.write("%.4f\n" % (feat_imp[i][-1]))
            """
            a = np.array(feat_imp[i])
            min_a = a.min()
            q1 = np.percentile(a, 25)
            med_a = np.percentile(a, 50)
            q3 = np.percentile(a, 75)
            max_a = a.max()
            fo.write("%s,%.4f,%.4f,%.4f,%.4f,%.4f\n" %
                     (ai.header[i], min_a, q1, med_a, q3, max_a))
        a = np.array(feat_imp[-1])
        min_a = a.min()
        q1 = np.percentile(a, 25)
        med_a = np.percentile(a, 50)
        q3 = np.percentile(a, 75)
        max_a = a.max()
        fo.write("%s,%.4f,%.4f,%.4f,%.4f,%.4f\n" %
                 ("qm_voxel_charge", min_a, q1, med_a, q3, max_a))
        """
        fo.write("%s,\n" % ("qm_voxel_charge"))
        for j in range(len(feat_imp[-1])-1):
            fo.write("%.4f," % (feat_imp[-1][j]))
        fo.write("%.4f\n" % (feat_imp[-1][-1]))
        """
        fo.close()

    ycvp = {}
    for key in predictions.keys():
        if len(predictions[key]) > 0:
            ycvp[key] = np.mean(predictions[key])
    return ycvp
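
A minimal invocation sketch for the cv() function above. All file paths, hyperparameter values, and output names below are hypothetical placeholders; only the signature is taken from the example.

# Hypothetical usage of cv(); every path and value here is a
# placeholder, not part of the original example.
ycvp = cv(db="voxel.db",
          csv_target="target.csv",
          csv_descriptors="descriptors.csv",
          n_splits_=5,
          n_repeats_=10,
          num_epochs=100,
          n_rot_train=10,
          train_steps_per_epoch_=50,
          n_rot_test=2,
          test_steps_per_epoch_=10,
          ndense_layers=2,
          nunits=128,
          nfilters=16,
          random_state=42,
          cvout="cv_predictions.csv",
          mout="models")
# cv() returns the mean cross-validated prediction per instance key
for key in ycvp:
    print(key, ycvp[key])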
Example #2
    def runcv(self,
              batch_size_,
              batch_mode_,
              num_epochs,
              ndense_layers,
              nunits,
              cvout,
              n_splits=5,
              n_repeats=10,
              random_state=None,
              mout=None,
              fimpfile=None):
        print("N. instances: %d" % (len(self.target)))

        mout_path = None
        if mout is not None:
            # Utilised to store the out path
            # mout_path = Path("%s_%s" % (time.strftime("%Y%m%d%H%M%S"), mout))
            mout_path = Path(mout)
        else:
            # Utilised to store the out path
            mout_path = Path("%s_model" % (time.strftime("%Y%m%d%H%M%S")))

        last_model = None
        if mout_path.exists() is True:
            # Find the last model and restart the calculation from it.
            p = Path(mout_path).glob('**/*.h5')
            # get only the file numbers
            mids = [int(x.stem) for x in p if x.is_file()]
            if len(mids) > 0:
                # Restart from here...
                last_model = max(mids)
            else:
                last_model = None
        else:
            mout_path.mkdir()
            # Save the descriptor order
            f = open("%s/odesc_header.csv" % (str(mout_path.absolute())), "w")
            for dname in self.xheader:
                f.write("%s\n" % (dname))
            f.close()

        feat_imp = {}
        if fimpfile is not None:
            for feat_name in self.xheader:
                feat_imp[feat_name] = {'mae': [], 'mse': []}

        cv_ = 0
        predictions = {}
        recalc = {}
        for key in self.target.keys():
            predictions[key] = []
            recalc[key] = []

        valfn = GetValidationFnc()
        if valfn is None:
            valfn = RepeatedKFold(list(self.target.keys()),
                                  n_splits,
                                  n_repeats,
                                  random_state=random_state,
                                  test_size=0.2)
        else:
            print("Using custom validation split function")
            valfn = valfn(list(self.target.keys()))

        for train_keys, val_keys, test_keys in valfn:
            # Some memory clean-up
            K.clear_session()
            train_steps_per_epoch = ceil(len(train_keys) / float(batch_size_))
            train_generator = self.DataGenerator(train_keys, batch_size_,
                                                 batch_mode_)
            # x_train, y_train = self.GenData(train_keys)
            # test_steps_per_epoch = ceil(len(train_keys)/float(batch_size_))
            # test_generator = self.DataGenerator(test_keys, batch_size_)
            x_test, y_test = self.GenData(test_keys)
            x_val, y_val = self.GenData(val_keys)
            print("Train set size: %d Val set size %d Test set size: %d" %
                  (len(train_keys), len(val_keys), len(test_keys)))
            model_output = "%s/%d.h5" % (str(mout_path.absolute()), cv_)

            if last_model is None:
                model = None
                model_ = GetKerasModel()
                if model_ is None:
                    model = example_build_model(self.nfeatures, nunits,
                                                ndense_layers, self.ntargets)
                else:
                    model = model_(self.nfeatures, nunits, ndense_layers)

                print(model.summary())
                dname = cvout.replace(".csv", "")
                b = batch_size_
                log_dir_ = ("./logs/cv%d_%s_#b%d_#e%d_#u%d_#dl%d_" %
                            (cv_, dname, b, num_epochs, nunits, ndense_layers))
                log_dir_ += time.strftime("%Y%m%d%H%M%S")
                callbacks_ = [
                    TensorBoard(log_dir=log_dir_,
                                histogram_freq=0,
                                write_graph=False,
                                write_images=False),
                    ModelCheckpoint(model_output,
                                    monitor='val_loss',
                                    verbose=0,
                                    save_best_only=True)
                ]

                model.fit_generator(
                    train_generator,
                    steps_per_epoch=train_steps_per_epoch,
                    epochs=num_epochs,
                    verbose=self.verbose,
                    validation_data=(x_val, y_val),
                    # validation_data=test_generator,
                    # validation_steps=test_steps_per_epoch,
                    callbacks=callbacks_)
            else:
                # This fold already has a saved checkpoint, so training
                # is skipped; once the last saved fold is reached, normal
                # training resumes.
                if last_model - 1 == cv_:
                    last_model = None

            model_ = GetLoadModelFnc()(model_output)

            y_recalc_train = self.makePrediction(model_, train_keys)
            y_pred_val = self.makePrediction(model_, val_keys)
            y_pred_test = self.makePrediction(model_, test_keys)

            y_recalc = []
            y_true_recalc = []
            for key in train_keys:
                y_recalc.append(y_recalc_train[key])
                y_true_recalc.append(self.target[key])
                recalc[key].append(y_recalc_train[key])

            ypred_val = []
            ytrue_val = []
            for key in val_keys:
                ypred_val.append(y_pred_val[key])
                ytrue_val.append(self.target[key])

            ypred_test = []
            ytrue_test = []
            for key in test_keys:
                ypred_test.append(y_pred_test[key])
                ytrue_test.append(self.target[key])
                # Store the test prediction
                predictions[key].append(y_pred_test[key])

            r2 = RSQ(y_true_recalc, y_recalc)
            q2 = RSQ(ytrue_test, ypred_test)
            vr2 = RSQ(ytrue_val, ypred_val)
            print("Train R2: %.4f Test Q2: %.4f Val R2: %.4f\n" %
                  (r2, q2, vr2))

            # Store the cross validation model
            # if mout_path is not None:
            #    model.save("%s/%d.h5" % (str(mout_path.absolute()), cv_))

            if fimpfile is not None:
                # Use the reloaded best model (model_); "model" is not
                # defined for folds restored from disk.
                fimp = FeatureImportance(model_, x_test, y_test,
                                         self.xheader)
                fires = fimp.Calculate(verbose=1)
                for key in fires.keys():
                    feat_imp[key]['mae'].extend(fires[key]['mae'])
                    feat_imp[key]['mse'].extend(fires[key]['mse'])
            cv_ += 1

        WriteCrossValidationOutput(cvout, self.target, predictions, recalc)

        if fimpfile is not None:
            WriteFeatureImportance(feat_imp, fimpfile)
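
A sketch of how runcv() might be driven. The NNRegressor class name and its constructor arguments are hypothetical placeholders for whatever class defines this method. Note that if mout points at an existing directory containing saved *.h5 fold models, the method resumes from the last one.

# Hypothetical driver for runcv(); NNRegressor and its constructor
# arguments are placeholders.
nn = NNRegressor("descriptors.csv", "target.csv")
nn.runcv(batch_size_=32,
         batch_mode_="random",
         num_epochs=200,
         ndense_layers=4,
         nunits=256,
         cvout="cv_out.csv",
         n_splits=5,
         n_repeats=10,
         random_state=42,
         mout="cv_models",
         fimpfile="feature_importance.csv")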
Example #3
def simplerun(db,
              csv_target,
              csv_descriptors,
              num_epochs,
              n_rot_train,
              train_steps_per_epoch_,
              n_rotation_test,
              test_steps_per_epoch_,
              ndense_layers,
              nunits,
              nfilters,
              random_state,
              outmodel=None,
              fcvgroup=None,
              tid=None):
    # Load the dataset
    ai = AIModel(csv_target, db, csv_descriptors)
    available_keys = ai.GetAvailableKeys()
    train_keys = None
    test_keys = None

    if fcvgroup is not None:
        cvgroups = CVGroupRead(fcvgroup)
        tkey = None
        if tid is not None:
            tkey = int(tid)
            print(tkey)
        else:
            tkey = random.choice(list(cvgroups.keys()))
        print(cvgroups[tkey])
        test_keys = cvgroups[tkey]
        train_keys = []
        for key in cvgroups.keys():
            if key == tkey:
                continue
            else:
                train_keys.extend(cvgroups[key])
    else:
        ttfn = GetTrainTestFnc()
        if ttfn is None:
            ttfn = TrainTestSplit
        else:
            print("Using custom train/test split function")
        train_keys, test_keys = ttfn(available_keys,
                                     test_size=0.20,
                                     random_state=random_state)

    print("Trainin set size: %d Validation set size %d" %
          (len(train_keys), len(test_keys)))

    train_generator = ai.VoxelTrainGenerator(train_keys, n_rot_train)

    print(train_keys)
    print(test_keys)

    model = None
    model_ = GetKerasModel()
    if ai.other_descriptors is None:
        if model_ is None:
            model = build_model(ai.conv3d_chtype, ai.input_shape,
                                ndense_layers, nunits, nfilters)
        else:
            model = model_(ai.conv3d_chtype, ai.input_shape, ndense_layers,
                           nunits, nfilters)
        # model = build_fcn_model(ai.conv3d_chtype, ai.input_shape, ndense_layers, nunits, nfilters)
        # model = model_scirep(ai.conv3d_chtype, ai.input_shape, ndense_layers, nunits, nfilters)
        # model = ResNetModel(ai.input_shape)
        print(model.summary())
    else:
        if model_ is None:
            model = build_2DData_model(ai.conv3d_chtype, ai.input_shape,
                                       ai.nfeatures, ndense_layers, nunits,
                                       nfilters)
        else:
            model = model_(ai.conv3d_chtype, ai.input_shape, ai.nfeatures,
                           ndense_layers, nunits, nfilters)
        """
        for l in model.layers[0].layers:
            print(l.summary())
        """
        print("Total Summary")
        print(model.summary())
    plot_model(model, to_file="model.png", show_shapes=True)

    dname = os.path.basename(csv_target).replace(".csv", "")
    dname += os.path.basename(db)
    log_dir_ = ("./logs/%s_%d_#rot%d_#f%d_#dl%d_#u%d_" %
                (dname, num_epochs, train_steps_per_epoch_, nfilters,
                 ndense_layers, nunits))
    log_dir_ += time.strftime("%Y%m%d%H%M%S")
    callbacks_ = [
        TensorBoard(log_dir=log_dir_,
                    histogram_freq=0,
                    write_graph=False,
                    write_images=False)
    ]
    """
    ,
                  EarlyStopping(monitor='val_loss',
                                min_delta=0,
                                patience=3,
                                verbose=0,
                                mode='auto')
    """

    test_generator = ai.VoxelTrainGenerator(test_keys, n_rotation_test)

    model.fit_generator(
        train_generator,
        epochs=num_epochs,
        steps_per_epoch=train_steps_per_epoch_,
        verbose=1,
        # max_queue_size=2,
        # workers=0,
        # validation_data=(x_test_, y_test_),
        validation_data=test_generator,
        validation_steps=test_steps_per_epoch_,
        # nb_val_samples=x_test.shape[0],
        callbacks=callbacks_,
        use_multiprocessing=True)

    x_test_, y_test_ = ai.VoxelTestSetGenerator(test_keys, n_rotation_test)
    y_pred_ = model.predict(x_test_)
    print("Test R2: %.4f" % (r2_score(y_test_, y_pred_)))

    fo = open("statconf.csv", "w")
    for key in ai.statvoxconf.keys():
        fo.write("%s," % (key))
        for i in range(len(ai.statvoxconf[key])):
            for j in range(len(ai.statvoxconf[key][i])):
                fo.write("%d," % (ai.statvoxconf[key][i][j]))
        fo.write("\n")
    fo.close()
    # score = model.evaluate(x_test_, y_test_, verbose=0)
    # print(score)

    if outmodel is not None:
        model.save(outmodel)
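
A minimal single-split training sketch for simplerun(). Paths and hyperparameters are hypothetical placeholders.

# Hypothetical usage of simplerun(); all values are placeholders.
simplerun(db="voxel.db",
          csv_target="target.csv",
          csv_descriptors="descriptors.csv",
          num_epochs=100,
          n_rot_train=10,
          train_steps_per_epoch_=50,
          n_rotation_test=2,
          test_steps_per_epoch_=10,
          ndense_layers=2,
          nunits=128,
          nfilters=16,
          random_state=42,
          outmodel="final_model.h5")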
Example #4
    def simplerun(self,
                  batch_size_,
                  batch_mode_,
                  num_epochs,
                  ndense_layers,
                  nunits,
                  random_state,
                  model_output=None):
        """
        Run a simple model...
        """
        # train_keys, test_keys = MDCTrainTestSplit(self.target, 0)
        # train_keys, test_keys = DISCTrainTestSplit(self.target)

        ttfn = GetTrainTestFnc()
        if ttfn is None:
            ttfn = TrainTestSplit
        else:
            print("Using custom train/test split function")

        train_keys, test_keys = ttfn(list(self.target.keys()),
                                     test_size=0.20,
                                     random_state=random_state)

        print("Train set size: %d Test set size %d" %
              (len(train_keys), len(test_keys)))

        model = None
        if model_output is not None and Path(model_output).is_file():
            model = GetLoadModelFnc()(model_output)
        else:
            model_ = GetKerasModel()
            if model_ is None:
                model = example_build_model(self.nfeatures, nunits,
                                            ndense_layers, self.ntargets)
            else:
                model = model_(self.nfeatures, nunits, ndense_layers)
        print(model.summary())

        train_steps_per_epoch = ceil(len(train_keys) / float(batch_size_))
        train_generator = self.DataGenerator(train_keys, batch_size_,
                                             batch_mode_)

        #x_train, y_train = self.GenData(train_keys)

        # This is unstable
        # test_steps_per_epoch = ceil(len(train_keys)/float(batch_size_))
        # test_generator = self.DataGenerator(test_keys, batch_size_)
        # This is more stable
        x_test, y_test = self.GenData(test_keys)

        b = batch_size_
        log_dir_ = ("./logs/#b%d_#e%d_#u%d_#dl%d_" %
                    (b, num_epochs, nunits, ndense_layers))
        log_dir_ += time.strftime("%Y%m%d%H%M%S")

        callbacks_ = [
            TensorBoard(log_dir=log_dir_,
                        histogram_freq=0,
                        write_graph=False,
                        write_images=False)
        ]
        """
        model.fit(x_train, y_train,
                  epochs=num_epochs,
                  batch_size=b,
                  verbose=self.verbose,
                  validation_data=(x_test, y_test),
                  callbacks=callbacks_)

        yrecalc_train = model.predict(x_train)

        """

        model.fit_generator(
            train_generator,
            steps_per_epoch=train_steps_per_epoch,
            epochs=num_epochs,
            verbose=1,
            validation_data=(x_test, y_test),
            # validation_data=test_generator,
            # validation_steps=test_steps_per_epoch,
            callbacks=callbacks_)

        y_recalc_train = self.makePrediction(model, train_keys)
        y_pred_test = self.makePrediction(model, test_keys)

        ytrain_recalc = []
        ytrain_true = []
        for key in train_keys:
            ytrain_recalc.append(y_recalc_train[key])
            ytrain_true.append(self.target[key])

        ytest_pred = []
        ytest_true = []
        for key in test_keys:
            ytest_pred.append(y_pred_test[key])
            ytest_true.append(self.target[key])

        print("R2: %.4f Q2: %.4f MSE: %.4f" %
              (RSQ(ytrain_true, ytrain_recalc), RSQ(
                  ytest_pred, ytest_true), MSE(ytest_pred, ytest_true)))

        fo = open("%s_pred.csv" % time.strftime("%Y%m%d%H%M%S"), "w")
        for i in range(len(ytest_true)):
            fo.write("%s" % (test_keys[i]))
            for j in range(len(ytest_true[i])):
                fo.write(",%f,%f" % (ytest_true[i][j], ytest_pred[i][j]))
            fo.write("\n")
        fo.close()

        if model_output is not None:
            model.save(model_output)
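
A sketch of a call to this simplerun() method, again with a hypothetical class name and placeholder values. If model_output already points at an existing .h5 file, the method reloads it instead of building a fresh model.

# Hypothetical driver; NNRegressor is a placeholder class name.
nn = NNRegressor("descriptors.csv", "target.csv")
nn.simplerun(batch_size_=32,
             batch_mode_="random",
             num_epochs=200,
             ndense_layers=4,
             nunits=256,
             random_state=42,
             model_output="model.h5")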
Example #5
    def GridSearch(self,
                   batch_size_,
                   steps_per_epoch_,
                   num_epochs,
                   random_state,
                   gmout="GridSearchResult"):

        train_keys, test_keys = TrainTestSplit(list(self.target.keys()),
                                               test_size=0.20,
                                               random_state=random_state)
        print("Train set size: %d Test set size %d" % (len(train_keys),
                                                       len(test_keys)))

        # train_steps_per_epoch = ceil(len(train_keys)/float(batch_size_))
        # train_generator = self.DataGenerator(train_keys, batch_size_)

        x_train, y_train, rtrain_keys = self.GenData(train_keys)

        # This is unstable
        # test_steps_per_epoch = ceil(len(train_keys)/float(batch_size_))
        # test_generator = self.DataGenerator(test_keys, batch_size_)
        # This is more stable
        x_test, y_test, rtest_keys = self.GenData(test_keys)

        # PARAMETER DEFINITIONS
        # simple architecture
        """
        param = {}
        param["nunits"] = [100, 200, 400]
        param["ndense_layers"] = [2, 4, 6]
        param["dropout"] = ["on", "off"]
        #param["activation"] = ["relu", "leakyrelu"]
        param["activation"] = ["relu"]
        """

        # resnet architecture
        param = {}
        param["nunits"] = [200, 400,  600, 800]
        param["ndense_layers"] = [2, 4, 6, 8]

        all_combo = list(ParameterGrid(param))
        print("Evaluating %d combinations of parameters" % (len(all_combo)))

        already_computed_combo = []
        if Path(gmout).is_file():
            fi = open(gmout, "r")
            for line in fi:
                v = str.split(line.strip(), " ")
                """
                # simple architecture
                units = v[0]
                layers = v[1]
                act = v[2]
                drop = v[3]
                s = ("%s-%s-%s-%s" % (units, layers, act, drop))
                """
                # resnet architecture
                units = v[0]
                layers = v[1]
                s = ("%s-%s" % (units, layers))
                already_computed_combo.append(s)
            fi.close()
        model_ = GetKerasModel()
        for c in all_combo:
            """
            # simple architecture
            s = ("%s-%s-%s-%s" % (c["nunits"],
                                  c["ndense_layers"],
                                  c["activation"],
                                  c["dropout"]))
            """
            # resnet architecture
            s = ("%s-%s" % (c["nunits"], c["ndense_layers"]))
            if s in already_computed_combo:
                print("%s already computed... skip..." % (s))
            else:
                """
                model = build_gridsearch_model(self.nfeatures,
                                              c["ndense_layers"],
                                              c["nunits"],
                                              c["activation"],
                                              c["dropout"])
                """
                if model_ is None:
                    model = example_build_model(self.nfeatures,
                                                c["nunits"],
                                                c["ndense_layers"])
                else:
                    model = model_(self.nfeatures,
                                   c["nunits"],
                                   c["ndense_layers"])

                """
                model = build_dnn_resnet_model(self.nfeatures,
                                               c["nunits"],
                                               c["ndense_layers"])
                """

                print(model.summary())
                b = batch_size_
                """
                model_name = ("#b%d_#e%d_#u%d_#dl%d_act-%s_dp-%s" % (b,
                                                                    num_epochs,
                                                                    c["nunits"],
                                                                    c["ndense_layers"],
                                                                    c["activation"],
                                                                    c["dropout"]))
                """

                model_name = ("#b%d_#e%d_#u%d_#dl%d" % (b,
                                                        num_epochs,
                                                        c["nunits"],
                                                        c["ndense_layers"]))
                log_dir_ = ("./logs/%s" % (model_name))

                log_dir_ += time.strftime("%Y%m%d%H%M%S")

                model_output = "%s.h5" % (model_name)
                callbacks_ = [TensorBoard(log_dir=log_dir_,
                                          histogram_freq=0,
                                          write_graph=False,
                                          write_images=False),
                              ModelCheckpoint(model_output,
                                              monitor='val_loss',
                                              verbose=0,
                                              save_best_only=True)]
                """
                callbacks_ = [TensorBoard(log_dir=log_dir_,
                                          histogram_freq=0,
                                          write_graph=False,
                                          write_images=False),
                              EarlyStopping(monitor='val_loss',
                                            min_delta=0,
                                            patience=50,
                                            verbose=0,
                                            mode='auto')]
                """

                model.fit(x_train, y_train,
                          epochs=num_epochs,
                          batch_size=b,
                          steps_per_epoch=steps_per_epoch_,
                          verbose=self.verbose,
                          validation_data=(x_test, y_test),
                          callbacks=callbacks_)

                bestmodel = load_model(model_output,
                                       custom_objects={"score": score})

                yrecalc_train = bestmodel.predict(x_train)

                """

                model.fit_generator(train_generator,
                                    steps_per_epoch=train_steps_per_epoch,
                                    epochs=num_epochs,
                                    verbose=1,
                                    validation_data=(x_test, y_test),
                                    # validation_data=test_generator,
                                    # validation_steps=test_steps_per_epoch,
                                    callbacks=callbacks_)


                yrecalc_train = []
                y_train = []
                for key in train_keys:
                    a = np.array([self.X_raw[key]])
                    yrecalc_train.extend(model.predict(a))
                    y_train.append(self.target[key])
                """
                ypred_test = bestmodel.predict(x_test)
                r2 = r2_score(y_train, yrecalc_train)
                mse_train = mse(y_train, yrecalc_train)
                mae_train = mae(y_train, yrecalc_train)
                q2 = r2_score(y_test, ypred_test)
                mse_test = mse(y_test, ypred_test)
                mae_test = mae(y_test, ypred_test)
                train_score = LOGMAE(y_train, yrecalc_train)
                test_score = LOGMAE(y_test, ypred_test)
                print("R2: %.4f Train Score: %f Q2: %.4f Test Score: %f" % (r2, train_score, q2, test_score))

                fo = open("%s" % (gmout), "a")
                """
                # simple architecture
                fo.write("%d %d %s %s %f %f %f %f %f %f %f %f\n" % (c["nunits"],
                                                                    c["ndense_layers"],
                                                                    c["activation"],
                                                                    c["dropout"],
                                                                    mse_train,
                                                                    mae_train,
                                                                    r2,
                                                                    train_score,
                                                                    mse_test,
                                                                    mae_test,
                                                                    q2,
                                                                    test_score))
                """
                # resnet architecture
                fo.write("%d %d %f %f %f %f %f %f %f %f\n" % (c["nunits"],
                                                              c["ndense_layers"],
                                                              mse_train,
                                                              mae_train,
                                                              r2,
                                                              train_score,
                                                              mse_test,
                                                              mae_test,
                                                              q2,
                                                              test_score))
                fo.close()
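
A sketch of a GridSearch() call with placeholder values. Because each evaluated parameter combination is appended to gmout as soon as it finishes, an interrupted search can be re-run with the same gmout and will skip the combinations already computed.

# Hypothetical grid search driver; the class name is a placeholder.
nn = NNRegressor("descriptors.csv", "target.csv")
nn.GridSearch(batch_size_=32,
              steps_per_epoch_=50,
              num_epochs=300,
              random_state=42,
              gmout="GridSearchResult")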
Example #6
    def runloo(self, batch_size_, num_epochs, ndense_layers, nunits, cvout):
        print("N. instances: %d" % (len(self.target)))
        predictions = dict()
        for val_key in self.target.keys():
            sub_target = {}
            for key in self.target.keys():
                if val_key == key:
                    continue
                else:
                    sub_target[key] = self.target[key]
                    # train_keys.append(key)
            print("Validating %s" % (val_key))

            # train_keys, test_keys = MDCTrainTestSplit(sub_target, 0)
            train_keys, test_keys = TrainTestSplit(list(sub_target.keys()),
                                                   test_size=0.20)
            x_train, y_train, rtrain_keys = self.GenData(train_keys)
            x_test, y_test, rtest_keys = self.GenData(test_keys)

            model = None
            model_ = GetKerasModel()
            if model_ is None:
                model = example_build_model(self.nfeatures,
                                            nunits,
                                            ndense_layers)
            else:
                model = model_(self.nfeatures,
                               nunits,
                               ndense_layers)

            print(model.summary())
            b = 0
            if batch_size_ is None:
                b = len(x_test)
            else:
                b = batch_size_
            log_dir_ = ("./logs/%s_#b%d_#e%d_#u%d_#dl%d_" % (val_key,
                                                             b,
                                                             num_epochs,
                                                             nunits,
                                                             ndense_layers))
            log_dir_ += time.strftime("%Y%m%d%H%M%S")

            callbacks_ = [TensorBoard(log_dir=log_dir_,
                                      histogram_freq=0,
                                      write_graph=False,
                                      write_images=False)]
            """
            callbacks_ = [TensorBoard(log_dir=log_dir_,
                                      histogram_freq=0,
                                      write_graph=False,
                                      write_images=False),
                          EarlyStopping(monitor='val_loss',
                                        min_delta=0,
                                        patience=3,
                                        verbose=0,
                                        mode='auto')]
            """

            model.fit(x_train, y_train,
                      epochs=num_epochs,
                      batch_size=b,
                      verbose=1,
                      validation_data=(x_test, y_test),
                      callbacks=callbacks_)

            # Predict the left-out instance itself rather than the first
            # element of the internal test split
            x_val, y_val, rval_keys = self.GenData([val_key])
            predictions[val_key] = model.predict(x_val)[0]

        fo = open(cvout, "w")
        for key in predictions.keys():
            fo.write("%s,%.4f,%.4f\n" % (key,
                                         self.target[key],
                                         predictions[key]))
        fo.close()
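
A sketch of a runloo() call with placeholder values. With N instances this trains N separate models, so it is only practical for small datasets.

# Hypothetical leave-one-out driver; the class name is a placeholder.
nn = NNRegressor("descriptors.csv", "target.csv")
nn.runloo(batch_size_=None,  # None -> batch size equals the test set size
          num_epochs=100,
          ndense_layers=4,
          nunits=256,
          cvout="loo_predictions.csv")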
Example #7
    def runcv(self,
              batch_size_,
              num_epochs,
              steps_per_epoch_,
              nfilters,
              nunits,
              random_state,
              cvout,
              n_splits=5,
              n_repeats=10,
              mout=None):
        print("N. instances: %d" % (len(self.target)))

        mout_path = None
        if mout is not None:
            # Utilised to store the out path
            mout_path = Path("%s_%s" % (time.strftime("%Y%m%d%H%M%S"), mout))
            mout_path.mkdir()
            # Save the descriptor order
            """
            f = open("%s/odesc_header.csv" % (str(mout_path.absolute())), "w")
            for name in self.xheader:
                f.write("%s\n" % (name))
            f.close()
            """

        cv_ = 0
        predictions = {}
        recalc = {}
        for key in self.target.keys():
            # N.B.: each molecule can have multiple outputs.
            predictions[key] = []
            recalc[key] = []

        for train_keys, val_keys, test_keys in RepeatedKFold(list(self.target.keys()),
                                                             n_splits,
                                                             n_repeats,
                                                             random_state=random_state,
                                                             test_size=0.2):
            print("Train set size: %d Val set size %d Test set size: %d" % (len(train_keys),
                                                                            len(val_keys),
                                                                            len(test_keys)))
            x_train, y_train, rtrain_keys = self.GenData(train_keys)
            x_val, y_val, rval_keys = self.GenData(val_keys)
            x_test, y_test, rtest_keys = self.GenData(test_keys)

            # Some memory clean-up
            K.clear_session()
            model = None
            model_ = GetKerasModel()
            if self.dx is not None:
                print("Number of descriptors: %d" % (self.n_descs))
                if model_ is None:
                    model = example_build_2DData_model(self.db.input_shape,
                                                       self.n_descs,
                                                       nfilters,
                                                       nunits)
                else:
                    model = model_(self.db.input_shape,
                                   self.n_descs,
                                   nfilters,
                                   nunits)
            else:
                if model_ is None:
                    model = example_build_model(self.db.input_shape,
                                                nfilters,
                                                nunits)
                else:
                    model = model_(self.db.input_shape,
                                   nfilters,
                                   nunits)

            print(model.summary())
            dname = cvout.replace(".csv", "")
            b = 0
            if batch_size_ is None:
                b = len(x_val)
            else:
                b = batch_size_

            name = "cv%d_%s_#b%d_#e%d_#u%d_#f%d_" % (cv_,
                                                     dname,
                                                     b,
                                                     num_epochs,
                                                     nunits,
                                                     nfilters)
            name += time.strftime("%Y%m%d%H%M%S")
            log_dir_ = ("./logs/%s" % (name))

            model_output = None
            if mout_path is not None:
                model_output = "%s/%d.h5" % (str(mout_path.absolute()), cv_)
            if model_output is None:
                callbacks_ = [TensorBoard(log_dir=log_dir_,
                                          histogram_freq=0,
                                          write_graph=False,
                                          write_images=False)]
            else:
                callbacks_ = [TensorBoard(log_dir=log_dir_,
                                          histogram_freq=0,
                                          write_graph=False,
                                          write_images=False),
                              ModelCheckpoint(model_output,
                                              monitor='val_loss',
                                              verbose=0,
                                              save_best_only=True)]

            train_generator = self.DataGenerator(train_keys, batch_size_)
            model.fit_generator(train_generator,
                                steps_per_epoch=steps_per_epoch_,
                                epochs=num_epochs,
                                verbose=1,
                                validation_data=(x_val, y_val),
                                # validation_data=test_generator,
                                # validation_steps=test_steps_per_epoch,
                                callbacks=callbacks_,
                                use_multiprocessing=True)
            """
            model.fit(x_train, y_train,
                      epochs=num_epochs,
                      batch_size=b,
                      steps_per_epoch=steps_per_epoch_,
                      verbose=1,
                      validation_data=(x_val, y_val),
                      callbacks=callbacks_)
            """
            # WARNING Implement cross validation results for multiple outputs
            if model_output is not None:
                bestmodel = load_model(model_output,
                                       custom_objects={"score": score})
            else:
                # No checkpoint was written; evaluate the in-memory model
                bestmodel = model
            yrecalc = bestmodel.predict(x_train)
            for i in range(len(yrecalc)):
                recalc[train_keys[i]].append(list(yrecalc[i]))

            ypred_val = bestmodel.predict(x_val)
            print("Validation R2: %.4f" % (r2_score(y_val, ypred_val)))

            ypred_test = bestmodel.predict(x_test)
            # exp_pred_plot(y_test, ypred_test[:, 0])
            print("Test R2: %.4f" % (r2_score(y_test, ypred_test)))
            for i in range(len(ypred_test)):
                predictions[test_keys[i]].append(list(ypred_test[i]))

            """
            if fimpfile is not None:
                fimp = FeatureImportance(model, x_val, y_val, self.xheader)
                fires = fimp.Calculate(verbose=1)
                for key in fires.keys():
                    feat_imp[key]['mae'].extend(fires[key]['mae'])
                    feat_imp[key]['mse'].extend(fires[key]['mse'])
            """
            cv_ += 1

        WriteCrossValidationOutput(cvout, self.target, predictions, recalc)
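
A sketch of this voxel-variant runcv() with a hypothetical VoxelModel class name and placeholder arguments.

# Hypothetical driver; VoxelModel and its constructor arguments are
# placeholders for the class that defines runcv() above.
vm = VoxelModel("voxel.db", "target.csv", "descriptors.csv")
vm.runcv(batch_size_=16,
         num_epochs=100,
         steps_per_epoch_=50,
         nfilters=16,
         nunits=128,
         random_state=42,
         cvout="cv_out.csv",
         n_splits=5,
         n_repeats=10,
         mout="cv_models")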
Example #8
    def simplerun(self,
                  batch_size_,
                  num_epochs,
                  steps_per_epoch_,
                  nfilters,
                  nunits,
                  random_state,
                  mout=None):
        print("N. instances: %d" % (len(self.target)))
        
        ttfn = GetTrainTestFnc()
        if ttfn is None:
            ttfn = TrainTestSplit
        else:
            print("Using custom train/test split function")
            
        train_keys, test_keys = ttfn(list(self.target.keys()),
                                     test_size=0.20,
                                     random_state=random_state)
        
        print("Train set size: %d Test set size %d" % (len(train_keys),
                                                       len(test_keys)))
        model = None
        model_ = GetKerasModel()

        if self.dx is not None:
            print("Number of descriptors: %d" % (self.n_descs))
            if model_ is None:
                model = example_build_2DData_model(self.db.input_shape,
                                                   self.n_descs,
                                                   nfilters,
                                                   nunits)
            else:
                model = model_(self.db.input_shape,
                               self.n_descs,
                               nfilters,
                               nunits)
        else:
            if model_ is None:
                model = example_build_model(self.db.input_shape,
                                            nfilters,
                                            nunits)
            else:
                model = model_(self.db.input_shape, nfilters, nunits)

        print(model.summary())

        x_train, y_train, rtrain_keys = self.GenData(train_keys)
        x_test, y_test, rtest_keys = self.GenData(test_keys)
        if self.dx is not None:
            print("Branch 1 size:", np.array(x_train[0]).shape)
            print("Branch 2 size:", np.array(x_train[1]).shape)
        else:
            print(x_train.shape)

        print(y_train.shape)
        b = 0
        if batch_size_ is None:
            b = len(x_test)
        else:
            b = batch_size_

        name = "#b%d_#e%d_#u%d_#f%d_" % (b,
                                         num_epochs,
                                         nunits,
                                         nfilters)
        name += time.strftime("%Y%m%d%H%M%S")
        log_dir_ = ("./logs/%s" % (name))

        callbacks_ = [TensorBoard(log_dir=log_dir_,
                                  histogram_freq=0,
                                  write_graph=False,
                                  write_images=False)]
        """
        callbacks_ = [TensorBoard(log_dir=log_dir_,
                                  histogram_freq=0,
                                  write_graph=False,
                                  write_images=False),
                      EarlyStopping(monitor='val_loss',
                                    min_delta=0,
                                    patience=50,
                                    verbose=0,
                                    mode='auto')]
        """
        #train_steps_per_epoch = int(np.ceil(len(train_keys)/float(batch_size_)))
        train_generator = self.DataGenerator(train_keys, batch_size_)
        model.fit_generator(train_generator,
                            steps_per_epoch=steps_per_epoch_,
                            epochs=num_epochs,
                            verbose=1,
                            validation_data=(x_test, y_test),
                            # validation_data=test_generator,
                            # validation_steps=test_steps_per_epoch,
                            callbacks=callbacks_,
                            use_multiprocessing=True)
        """
        model.fit(x_train, y_train,
                  epochs=num_epochs,
                  batch_size=b,
                  verbose=self.verbose,
                  validation_data=(x_test, y_test),
                  callbacks=callbacks_)
        """
        yrecalc = model.predict(x_train)
        ypred_test = model.predict(x_test)
        fo = open("%s_pred.csv" % (name), "w")
        if ypred_test.shape[1] > 1:
            for i in range(len(rtest_keys)):
                fo.write("%s," % (rtest_keys[i]))
                for j in range(len(y_test[i])-1):
                    fo.write("%f,%f," % (y_test[i][j], ypred_test[i][j]))
                fo.write("%f,%f\n" % (y_test[i][-1], ypred_test[i][-1]))
            fo.close()
            # Then calculate R2 and Q2 for each output...
            for j in range(ypred_test.shape[1]):
                # Recalculated (training set) values for output j
                y_train_ = []
                yrecalc_ = []
                for i in range(yrecalc.shape[0]):
                    y_train_.append(y_train[i][j])
                    yrecalc_.append(yrecalc[i][j])
                # Predicted (test set) values for output j
                y_test_ = []
                ypred_test_ = []
                for i in range(ypred_test.shape[0]):
                    y_test_.append(y_test[i][j])
                    ypred_test_.append(ypred_test[i][j])
                print("Output %d R2: %.4f Q2: %.4f" % (j,
                                                       r2_score(y_train_, yrecalc_),
                                                       r2_score(y_test_, ypred_test_)))
        else:
            for i in range(len(rtest_keys)):
                fo.write("%s," % (rtest_keys[i]))
                # Single output: write the true and predicted value once
                fo.write("%f,%f\n" % (y_test[i], ypred_test[i]))
            fo.close()
            print("R2: %.4f Q2: %.4f" % (r2_score(y_train, yrecalc),
                                         r2_score(y_test, ypred_test)))
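
A sketch of this voxel-variant simplerun() with a hypothetical class name and placeholder values; predictions land in a timestamped *_pred.csv file as in the code above.

# Hypothetical driver; VoxelModel is a placeholder class name.
vm = VoxelModel("voxel.db", "target.csv", "descriptors.csv")
vm.simplerun(batch_size_=16,
             num_epochs=100,
             steps_per_epoch_=50,
             nfilters=16,
             nunits=128,
             random_state=42)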