Exemplo n.º 1
0
    def test_icp_regression_tree(self):
        """Fit, calibrate and print ICP regression intervals, plain and normalized.

        The dataset returned by ``load_boston()`` is shuffled and cut into
        three consecutive thirds (training, calibration, test). A decision
        tree based inductive conformal regressor is then run twice -- first
        with the absolute-error nonconformity function alone, then with a
        1-nearest-neighbour normalizer added -- and for each run a table with
        the columns ``min``, ``max``, ``truth`` and ``size`` is printed.
        """
        data = load_boston()

        # Shuffle the sample indices once and slice them into thirds.
        shuffled = np.random.permutation(data.target.size)
        first_cut = int(shuffled.size / 3)
        second_cut = int(2 * shuffled.size / 3)
        train = shuffled[:first_cut]
        calibrate = shuffled[first_cut:second_cut]
        test = shuffled[second_cut:]

        def fit_and_report(nc_function):
            # Train and calibrate on the first two thirds ...
            regressor = IcpRegressor(nc_function)
            regressor.fit(data.data[train, :], data.target[train])
            regressor.calibrate(data.data[calibrate, :],
                                data.target[calibrate])

            # ... then predict intervals for the last third and print them
            # next to the true target and the interval width.
            intervals = regressor.predict(data.data[test, :],
                                          significance=0.1)
            widths = intervals[:, 1] - intervals[:, 0]
            rows = np.column_stack([intervals, data.target[test], widths])
            print(pd.DataFrame(rows, columns=["min", "max", "truth", "size"]))

        # Without normalization
        tree = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
        fit_and_report(RegressorNc(tree, AbsErrorErrFunc()))

        # With normalization
        tree = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
        neighbours = RegressorAdapter(KNeighborsRegressor(n_neighbors=1))
        scaler = RegressorNormalizer(tree, neighbours, AbsErrorErrFunc())
        fit_and_report(RegressorNc(tree, AbsErrorErrFunc(), scaler))
Exemplo n.º 2
0
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, abs_error, abs_error_inv

import pandas

data = load_boston()

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
# One random permutation of the sample indices, cut into three thirds.
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 3)]
calibrate = idx[int(idx.size / 3):int(2 * idx.size / 3)]
test = idx[int(2 * idx.size / 3):]

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
icp = IcpRegressor(RegressorNc(DecisionTreeRegressor, abs_error, abs_error_inv))
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.1)
header = ['min', 'max', 'Truth']
table = np.vstack([prediction.T, data.target[test]]).T
# Pass the labels as column names: stacking them on top of the numeric table
# would coerce every value to a string and turn the labels into data row 0.
df = pandas.DataFrame(table, columns=header)
print(df)
Exemplo n.º 3
0
from nonconformist.base import RegressorAdapter
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, AbsErrorErrFunc, SignErrorErrFunc

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_boston()

# One random permutation of the sample indices, cut into three thirds.
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 3)]
calibrate = idx[int(idx.size / 3):int(2 * idx.size / 3)]
test = idx[int(2 * idx.size / 3):]

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
icp = IcpRegressor(
    RegressorNc(RegressorAdapter(DecisionTreeRegressor()), SignErrorErrFunc()))
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.05)
header = ['min', 'max', 'Truth']
table = np.vstack([prediction.T, data.target[test]]).T
# Pass the labels as column names: stacking them on top of the numeric table
# would coerce every value to a string and turn the labels into data row 0.
df = pd.DataFrame(table, columns=header)
print(df)
Exemplo n.º 4
0
def train_and_test_cp_algo(i, window=96, test_size=120, calibration_size=3480):
    """Fit normalized and plain ICP regressors on one data file and dump intervals.

    The ``NetPosUsd`` column of file number ``i`` is transformed with
    ``mlog_trans`` and turned into sliding windows of ``window`` consecutive
    values, each paired with the value that follows it. The windows are split
    chronologically into training, calibration (``calibration_size`` rows)
    and test (last ``test_size`` rows) parts. Two inductive conformal
    regressors are fitted -- one with a 50-nearest-neighbour normalizer
    ("NCP") and one without ("CP") -- and for every significance level
    5 %, 10 %, ..., 95 % their test intervals are mapped back with
    ``mlog_inverse`` and written to one CSV file per method and level.

    :param i: index of the input file; ``i == 0`` (re)creates the output
        files with a header row, any other value appends to them.
    :param window: number of past values used as features.
    :param test_size: number of trailing samples used for testing; must be
        positive.
    :param calibration_size: number of samples, directly before the test
        part, used for calibration.
    :raises ValueError: if the file is too short to leave any training data.
    """
    # Raw string: the backslash is a literal path separator, not an escape.
    path = r'data\EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv'
    df = pd.read_csv(path).drop(['QdfTime', 'Unnamed: 0'], axis=1).fillna(0)

    # Keep the untransformed test targets and the statistics of the raw
    # series before the column is overwritten with its transformed values.
    y_raw_test = df.NetPosUsd[-test_size:]
    median_ = df.NetPosUsd.median()
    mad_ = mad(df.NetPosUsd.values)
    df.NetPosUsd = mlog_trans(df.NetPosUsd.values)

    data = df.NetPosUsd.values

    # Row k of X holds data[k:k + window]; its target is data[k + window].
    n_windows = data.shape[0] - window
    if n_windows <= test_size + calibration_size:
        raise ValueError(
            'not enough samples in %s for window=%d, calibration_size=%d, '
            'test_size=%d' % (path, window, calibration_size, test_size))
    X = np.array([data[k:k + window] for k in range(n_windows)])
    y = data[window:]

    train_end = n_windows - test_size - calibration_size
    calibrate_end = train_end + calibration_size
    train, ytrain = X[:train_end], y[:train_end]
    calibrate, ycalibrate = X[train_end:calibrate_end], y[train_end:calibrate_end]
    test = X[-test_size:]

    # Normalized conformal predictor.
    underlying_model = RegressorAdapter(BiLSTM({'window': window}))
    normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
    normalizer = RegressorNormalizer(underlying_model, normalizing_model, AbsErrorErrFunc())
    icp_normalized = IcpRegressor(RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer))
    icp_normalized.fit(train, ytrain)
    icp_normalized.calibrate(calibrate, ycalibrate)

    # Plain conformal predictor. It gets its own estimator instance: sharing
    # one object would mean this fit() re-trains the model the normalized
    # predictor has already been calibrated against.
    # NOTE(review): assumes RegressorAdapter keeps a reference to the
    # estimator instead of cloning it -- confirm against nonconformist.
    underlying_model2 = RegressorAdapter(BiLSTM({'window': window}))
    icp_plain = IcpRegressor(RegressorNc(underlying_model2, AbsErrorErrFunc()))
    icp_plain.fit(train, ytrain)
    icp_plain.calibrate(calibrate, ycalibrate)

    first_file = bool(i == 0)

    for a in tqdm(np.linspace(5, 95, 19)):
        level = str(np.round(a).astype(int))

        # -----------------------------------------------------------------------------
        # Predict
        # -----------------------------------------------------------------------------
        frames = {}
        for tag, icp in (('NCP', icp_normalized), ('CP', icp_plain)):
            prediction = icp.predict(test, significance=a / 100)
            lower = mlog_inverse(prediction[:, 0], median_, mad_)
            upper = mlog_inverse(prediction[:, 1], median_, mad_)
            # The interval midpoint is reported in the 'prediction' column.
            midpoint = upper / 2 + lower / 2
            table = np.vstack([lower, upper, y_raw_test, midpoint.T]).T
            frames[tag] = pd.DataFrame(
                table,
                columns=[tag + '_lower', tag + '_upper', 'NetPosUsd', 'prediction'])

        # -----------------------------------------------------------------------------
        # Write: file 0 starts each CSV (with header), later files append rows
        # -----------------------------------------------------------------------------
        for tag in ('CP', 'NCP'):
            frames[tag].to_csv(
                tag + '_' + 'cudaLSTM' + '_' + level + '_' + 'calibrationwindow'
                + str(calibration_size) + '.csv',
                mode='w' if first_file else 'a',
                header=first_file,
                index=False,
                encoding='utf-8')
Exemplo n.º 5
0
def train_and_test_cp_algo(parameters):
    """Run one conformal-prediction configuration over all 29 data files.

    For every file the columns are standardized, the data is split into
    training, calibration and test (last 120 rows) parts, an inductive
    conformal regressor is fitted and calibrated, and the de-standardized
    test intervals are written to a CSV file. File 0 creates the CSV with a
    header; the following files append their rows.

    :param parameters: dict that must contain the keys ``'algorithm'``,
        ``'randomized_calibration'``, ``'alpha_'``, ``'calibration_size'``
        and ``'WhichCP'``; every other entry is handed to the estimator.
        ``'WhichCP' == 'NCP'`` adds a 50-nearest-neighbour normalizer, any
        other value uses the plain absolute-error nonconformity function.
    :raises KeyError: if one of the required keys is missing.
    :raises ValueError: if ``parameters['algorithm']`` is not a known name.
    """
    p = parameters.copy()
    for key in ('algorithm', 'randomized_calibration', 'alpha_',
                'calibration_size', 'WhichCP'):
        p.pop(key)

    algorithm_name = parameters.get('algorithm')
    which_cp = parameters.get('WhichCP')
    alpha = parameters.get('alpha_')
    calibration_size = parameters.get('calibration_size')
    randomized = parameters.get('randomized_calibration') == True  # noqa: E712

    # Factories are lazy so that only the selected estimator class has to be
    # importable, and so that a fresh estimator is built for every file.
    factories = {
        'RandomForest': lambda: RandomForestRegressor(**p),
        'K-NearestNeighbours': lambda: KNeighborsRegressor(**p),
        'LightGBM': lambda: LGBMRegressor(**p),
        'LassoRegression': lambda: Lasso(**p),
        'NeuralNetwork': lambda: NeuralNetworkAlgorithm(p),
        'LSTM': lambda: BiLSTM(**p),
        'GradientBoosting': lambda: GradientBoostingRegressor(**p),
    }
    if algorithm_name not in factories:
        raise ValueError('unknown algorithm: %r' % (algorithm_name,))

    normalized = which_cp == 'NCP'
    prefix = 'NCP' if normalized else 'CP'
    header = [prefix + '_lower', prefix + '_upper', 'NetPosUsd', 'prediction']
    out_path = (which_cp + '_' + algorithm_name + '_'
                + str(np.round(alpha * 100).astype(int)) + '_'
                + 'calibrationwindow' + str(calibration_size) + '.csv')

    for i in tqdm(range(29)):
        algorithm = factories[algorithm_name]()

        # Raw string: the backslash is a literal path separator, not an escape.
        path = r'data\EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv'
        df = pd.read_csv(path).drop(['Unnamed: 0', 'QdfTime'], axis=1).fillna(0)

        # Remember the raw target scale so results can be mapped back later.
        m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()

        # Standardize every column with its own mean and standard deviation.
        df = (df - df.mean(axis=0)) / df.std(axis=0)

        X = df.drop(['NetPosUsd'], axis=1).values
        y = df['NetPosUsd'].values

        if randomized:
            # Calibration rows are drawn at random, without replacement,
            # from everything that precedes the test rows.
            train_test_split = len(df) - 120
            pool_X, pool_y = X[:train_test_split], y[:train_test_split]
            choose = np.random.choice(len(pool_X), calibration_size, replace=False)
            mask = np.ones(len(pool_X), dtype=bool)
            mask[choose] = False

            calibrate, ycalibrate = pool_X[choose, :], pool_y[choose]
            train, ytrain = pool_X[mask, :], pool_y[mask]
            test, ytest = X[train_test_split:, :], y[train_test_split:]
        else:
            # Chronological split: training, then calibration, then test.
            train_test_split = len(df) - 120 - calibration_size
            calibrate_end = train_test_split + calibration_size

            train, ytrain = X[:train_test_split, :], y[:train_test_split]
            calibrate = X[train_test_split:calibrate_end, :]
            ycalibrate = y[train_test_split:calibrate_end]
            test, ytest = X[-120:, :], y[-120:]

        underlying_model = RegressorAdapter(algorithm)
        if normalized:
            normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
            normalizer = RegressorNormalizer(underlying_model, normalizing_model, AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
        else:
            nc = RegressorNc(underlying_model, AbsErrorErrFunc())
        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        # -----------------------------------------------------------------------------
        # Predict
        # -----------------------------------------------------------------------------
        prediction = icp.predict(test, significance=alpha)
        # The interval midpoint is reported in the 'prediction' column.
        midpoint = prediction[:, 1] / 2 + prediction[:, 0] / 2

        # Undo the standardization of the target.
        prediction = prediction * s + m
        ytest = ytest * s + m
        midpoint = midpoint * s + m

        table = np.vstack([prediction.T, ytest, midpoint.T]).T
        results = pd.DataFrame(table, columns=header)

        # File 0 starts the CSV (with header); later files append their rows.
        results.to_csv(out_path,
                       mode='w' if i == 0 else 'a',
                       header=(i == 0),
                       index=False,
                       encoding='utf-8')

        del algorithm
Exemplo n.º 6
0
def cv(df, parameters):
    """Score one hyper-parameter set with three forward-chaining folds.

    The last 120 rows of ``df`` are left out. For each of the cut ratios
    0.5, 0.66 and 0.84 of the remaining length, a normalized ("NCP") and a
    plain ("CP") inductive conformal regressor are trained and calibrated on
    rows before the cut and evaluated with ``qd_objective`` on the block of
    ``int(end / 6)`` rows after it. The mean losses, together with the
    estimator parameters and ``alpha_``, are appended as one row to
    ``<algorithm>_cv.csv`` (the file is created with a header if missing).

    :param df: DataFrame with a ``'QdfTime'`` column, a ``'NetPosUsd'``
        target column and feature columns.
    :param parameters: dict that must contain ``'algorithm'``,
        ``'randomized_calibration'`` and ``'alpha_'``; every other entry is
        handed to the estimator.
    :raises KeyError: if one of the required keys is missing.
    :raises ValueError: if ``parameters['algorithm']`` is not a known name.
    """
    end = len(df) - 120
    ratios = (.5, 0.66, .84)
    cp_losses = np.zeros(len(ratios))
    ncp_losses = np.zeros(len(ratios))

    p = parameters.copy()
    for key in ('algorithm', 'randomized_calibration', 'alpha_'):
        p.pop(key)

    algorithm_name = parameters.get('algorithm')
    alpha = parameters.get('alpha_')
    randomized = parameters.get('randomized_calibration') == True  # noqa: E712

    # Lazy factories: only the selected estimator class has to be importable.
    factories = {
        'RandomForest': lambda: RandomForestRegressor(**p),
        'K-NearestNeighbours': lambda: KNeighborsRegressor(**p),
        'LightGBM': lambda: LGBMRegressor(**p),
        'LassoRegression': lambda: Lasso(**p),
        'NeuralNetwork': lambda: NeuralNetworkAlgorithm(p),
        'LSTM': lambda: BiLSTM(**p),
    }
    if algorithm_name not in factories:
        raise ValueError('unknown algorithm: %r' % (algorithm_name,))
    # NOTE(review): one estimator object is re-fitted for both predictors and
    # for every fold; this assumes fit() retrains from scratch -- confirm for
    # the neural-network based algorithms.
    algorithm = factories[algorithm_name]()

    # Row that is written to the results file. It is a copy, so adding the
    # bookkeeping columns does not touch the dict given to the estimator.
    d = dict(p)
    d['alpha_'] = alpha

    # Remember the raw target scale so results can be mapped back later.
    m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
    df = df.drop(['QdfTime'], axis=1)
    # Standardize every column with its own mean and standard deviation.
    df = (df - df.mean(axis=0)) / df.std(axis=0)

    X = df.drop(['NetPosUsd'], axis=1).values
    y = df['NetPosUsd'].values
    fold = int(end / 6)

    for i, ratio in enumerate(ratios):
        cut = int(end * ratio)
        test, ytest = X[cut:cut + fold, :], y[cut:cut + fold]

        if randomized:
            # Calibration rows are drawn at random, without replacement,
            # from everything before the cut.
            pool_X, pool_y = X[:cut, :], y[:cut]
            choose = np.random.choice(len(pool_X), fold, replace=False)
            mask = np.ones(len(pool_X), dtype=bool)
            mask[choose] = False

            calibrate, ycalibrate = pool_X[choose, :], pool_y[choose]
            train, ytrain = pool_X[mask, :], pool_y[mask]
        else:
            # Chronological split: the block right before the cut calibrates.
            train, ytrain = X[:cut - fold, :], y[:cut - fold]
            calibrate, ycalibrate = X[cut - fold:cut, :], y[cut - fold:cut]

        # -----------------------------------------------------------------------------
        # Normalized conformal predictor: train, calibrate, predict
        # -----------------------------------------------------------------------------
        underlying_model = RegressorAdapter(algorithm)
        normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
        normalizer = RegressorNormalizer(underlying_model, normalizing_model, AbsErrorErrFunc())
        icp = IcpRegressor(RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer))
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=alpha)
        # The interval midpoint is reported in the 'prediction' column.
        midpoint = prediction[:, 1] / 2 + prediction[:, 0] / 2

        # Undo the standardization of the target.
        prediction = prediction * s + m
        ytest = ytest * s + m
        midpoint = midpoint * s + m

        table = np.vstack([prediction.T, ytest, midpoint.T]).T
        dfncp = pd.DataFrame(
            table, columns=['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction'])

        # -----------------------------------------------------------------------------
        # Plain conformal predictor: train, calibrate, predict
        # -----------------------------------------------------------------------------
        underlying_model = RegressorAdapter(algorithm)
        icp = IcpRegressor(RegressorNc(underlying_model, AbsErrorErrFunc()))
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=alpha) * s + m
        dfncp['CP_lower'] = prediction[:, 0]
        dfncp['CP_upper'] = prediction[:, 1]

        cp_losses[i] = qd_objective(dfncp.NetPosUsd, dfncp['CP_lower'], dfncp['CP_upper'], alpha)
        ncp_losses[i] = qd_objective(dfncp.NetPosUsd, dfncp['NCP_lower'], dfncp['NCP_upper'], alpha)

    d['CP_loss'] = np.mean(cp_losses)
    d['NCP_loss'] = np.mean(ncp_losses)

    # Append to an existing results file, otherwise create it with a header.
    out_path = algorithm_name + '_cv.csv'
    already_there = os.path.exists(out_path)
    pd.DataFrame(data=d, index=[0]).to_csv(out_path,
                                           mode='a' if already_there else 'w',
                                           header=not already_there,
                                           index=False,
                                           encoding='utf-8')
Exemplo n.º 7
0
from nonconformist.nc import RegressorNc, abs_error, abs_error_inv


def split_data(data, n_train, n_test):
    """Randomly partition ``data`` into two parts in the ratio ``n_train:n_test``.

    The first part receives ``n_train * len(data) // (n_train + n_test)``
    randomly chosen items, the second part all remaining ones. ``data`` must
    support indexing with an array of integer positions.
    """
    total = len(data)
    first_size = n_train * total // (n_train + n_test)
    order = np.random.permutation(total)
    first_idx, second_idx = order[:first_size], order[first_size:]
    return data[first_idx], data[second_idx]

# Load the "auto-mpg" table and run it through the Impute preprocessor.
data = Orange.data.Table("auto-mpg")
imp = Impute()
data = imp(data)

# For each significance level, average the error rate and the interval width
# of a decision-tree ICP regressor over ten random train/calibrate/test splits.
for sig in np.linspace(0.01, 0.1, 10):
    errs = []
    szs = []
    for rep in range(10):
        # 2:1 split into learning and test data, then 2:1 again into
        # proper training and calibration data.
        train, test = split_data(data, 2, 1)
        train, calib = split_data(train, 2, 1)

        nc = RegressorNc(DecisionTreeRegressor(), abs_error, abs_error_inv)
        icp = IcpRegressor(nc)
        icp.fit(train.X, train.Y)
        icp.calibrate(calib.X, calib.Y)
        pred = icp.predict(test.X, significance=sig)

        n_pred = len(pred)
        # Share of test targets that fall inside their predicted interval.
        hits = sum(low <= y <= high for (low, high), y in zip(pred, test.Y))
        acc = hits / n_pred
        err = 1 - acc
        # Mean interval width.
        sz = sum(high - low for low, high in pred) / n_pred
        errs.append(err)
        szs.append(sz)
    print(sig, np.mean(errs), np.mean(szs))
Exemplo n.º 8
0
def create_conformal_model():
    """
    Description - Create conformal model - Main loop
    """

    #Read data from file
    data = read_data(args.i)

    #Calculate descriptors using RD-kit
    descriptors_df = calculate_descriptors(data['smiles'])

    #Assign indices
    train_i, calibrate_i, test_i = create_indices_test_training_calibration(
        data)  # Create indices for test,training, calibration sets
    test_index_total = [x for x in test_i]
    calibrate_index_total = [x for x in calibrate_i]

    #Create inductive conformal prediction regressor

    if args.m == 'RF':
        icp = IcpRegressor(
            NormalizedRegressorNc(RandomForestRegressor,
                                  KNeighborsRegressor,
                                  abs_error,
                                  abs_error_inv,
                                  model_params={'n_estimators': 100}))

    if args.m == 'SVM':
        #No support vector regressor
        print('error - no SVM-regressor avliable')
        icp = IcpRegressor(
            NormalizedRegressorNc(SVR,
                                  KNeighborsRegressor,
                                  abs_error,
                                  abs_error_inv,
                                  model_params={'n_estimators': 100}))

    #Create DataFrames to store data
    A = pandas.DataFrame(index=range(len(data)))
    B = pandas.DataFrame(index=range(len(data)))
    C = pandas.DataFrame(index=range(len(data)))

    iA = pandas.DataFrame(index=range(len(data)))
    iB = pandas.DataFrame(index=range(len(data)))
    iC = pandas.DataFrame(index=range(len(data)))

    if args.verbose:
        print('Number of models to create: ' + args.num_models)
        print('############## Starting calculations ##############')

    icp_s = []

    for i in range(int(args.num_models)):  #DEBUG 100
        Xtrain, Xtest, Xcalibrate, ytrain, ytest, ycalibrate = create_train_test_calibrate_sets(
            data, descriptors_df, train_i, calibrate_i, test_i)

        #Create nornal model
        icp.fit(Xtrain, ytrain)

        #Calibrate normal model
        icp.calibrate(asanyarray(Xcalibrate), asanyarray(ycalibrate))

        #Predrict test and training sets
        prediction_test = icp.predict(asanyarray(Xtest),
                                      significance=args.significance)  # 0.2
        prediction_calibrate = icp.predict(asanyarray(Xcalibrate),
                                           significance=args.significance)

        #Create DF with data
        blob = pandas.DataFrame(prediction_test, index=test_i)
        iblob = pandas.DataFrame(prediction_calibrate, index=calibrate_i)

        A[i] = blob[0]
        B[i] = blob[1]

        iA[i] = iblob[0]
        iB[i] = iblob[1]

        #Create new indices for next model
        test_index_total = np.unique(
            np.concatenate((test_index_total, test_i), axis=0))
        calibrate_index_total = np.unique(
            np.concatenate((calibrate_index_total, calibrate_i), axis=0))

        train_i, calibrate_i, test_i = randomize_new_indices(
            train_i, calibrate_i, test_i, data, i)

        #temp = sklearn.base.clone(icp)
        icp_s.append(copy.copy(icp))

    ### Save models ###
    save_models(icp_s)

    if args.verbose:
        print(
            '################## Loop finished, model created, test set predicted #################'
        )

    experimental_values = data['Observed'][test_index_total]
    iexperimental_values = data['Observed'][calibrate_index_total]

    C['median_prediction_0'] = A.median(axis=1)
    C['median_prediction_1'] = B.median(axis=1)
    C['median_prediction'] = (C['median_prediction_0'] +
                              C['median_prediction_1']) / 2
    C['median_prediction_size'] = C['median_prediction'] - C[
        'median_prediction_0']

    Y_pred_median_test = C['median_prediction'].dropna()
    median_prediction_size = C['median_prediction_size'].dropna().tolist()

    num_outside_median = 0
    for i in range(len(data)):
        try:
            if C['median_prediction_0'].dropna()[i] < experimental_values[
                    i] < C['median_prediction_1'].dropna()[i]:
                pass
            else:
                num_outside_median += 1
                #print('Outside range')
        except:
            pass  #print('error')

    #Internal prediction
    iC['median_prediction_0'] = iA.median(axis=1)
    iC['median_prediction_1'] = iB.median(axis=1)
    iC['median_prediction'] = (iC['median_prediction_0'] +
                               iC['median_prediction_1']) / 2
    iC['median_prediction_size'] = iC['median_prediction'] - iC[
        'median_prediction_0']

    iY_pred_median_test = iC['median_prediction'].dropna()
    imedian_prediction_size = iC['median_prediction_size'].dropna().tolist()

    inum_outside_median = 0
    for i in range(len(data)):
        try:
            if iC['median_prediction_0'].dropna()[i] < iexperimental_values[
                    i] < iC['median_prediction_1'].dropna()[i]:
                pass
            else:
                inum_outside_median += 1
                #print('Outside range')
        except:
            pass  #print('error')

    if args.verbose:
        print(
            '########################## Prediction statistics external test ##########################'
        )
        print('')

    print('Number of compounds predicted in test set: ' +
          str(C['median_prediction'].notnull().sum()))

    if args.t != 'full_model':
        ex_r2_score = r2_score(experimental_values, Y_pred_median_test)
        print('R^2 (coefficient of determination):  %.3f' % ex_r2_score)

        ex_mean_squared_error = mean_squared_error(experimental_values,
                                                   Y_pred_median_test)
        ex_rmse = sqrt(ex_mean_squared_error)
        print('RMSE:  %.3f' % ex_rmse)

        ex_MAE = mean_absolute_error(experimental_values, Y_pred_median_test)
        print('Mean absolute error:  %.3f' % ex_MAE)

        print('Mean squared error: %.3f' % ex_mean_squared_error)

        #Average prediction range
        print('Mean of median prediction range: %.3f' %
              mean(median_prediction_size))

        percent_num_outside_median = 100 * float(num_outside_median) / float(
            len(experimental_values))
        print('Number of compounds outside of prediction range: ' +
              str(num_outside_median))
        print('% of compounds predicted outside of prediction range: ' +
              str(percent_num_outside_median) + ' %')
        print(' ')

        #####Internal Prediction ########

        print('Number of compounds predicted in training set: ' +
              str(iC['median_prediction'].notnull().sum()))

        iex_r2_score = r2_score(iexperimental_values, iY_pred_median_test)
        print('R^2 (coefficient of determination):  %.3f' % iex_r2_score)

        iex_mean_squared_error = mean_squared_error(iexperimental_values,
                                                    iY_pred_median_test)
        iex_rmse = sqrt(iex_mean_squared_error)
        print('RMSE:  %.3f' % iex_rmse)

        print('Mean squared error: %.3f' % iex_mean_squared_error)

        iex_MAE = mean_absolute_error(iexperimental_values,
                                      iY_pred_median_test)
        print('Mean absolute error:  %.3f' % iex_MAE)

        #Average prediction range
        print('Mean of median prediction range: %.3f' %
              mean(imedian_prediction_size))

        ipercent_num_outside_median = 100 * float(inum_outside_median) / float(
            len(iexperimental_values))
        print('Number of compounds outside of prediction range: ' +
              str(inum_outside_median))
        print('% of compounds predicted outside of prediction range: ' +
              str(ipercent_num_outside_median) + ' %')
        print(' ')

        #### Plot results - plot test set
        if args.plot:
            if args.verbose:
                print(' ################ Plotting testset #################')
            fig, ax = plt.subplots()

            ax.errorbar(experimental_values,
                        Y_pred_median_test,
                        yerr=median_prediction_size,
                        fmt='o',
                        markeredgecolor='black',
                        markersize=6,
                        mew=1,
                        ecolor='black',
                        elinewidth=0.3,
                        capsize=3,
                        capthick=1,
                        errorevery=1)

            #Set the size
            ax.set_ylim([-10, -3])
            ax.set_xlim([-10, -3])

            # Plot title and lables
            #plt.title('Median predictions with prediction ranges for the testset')
            plt.ylabel('Predicted log Kp')
            plt.xlabel('Experimental log Kp')

            # Draw line
            fit = np.polyfit(experimental_values, Y_pred_median_test, 1)

            x = [-10, -3]

            #Regression line
            #ax.plot(experimental_values, fit[0]*asanyarray(experimental_values)+ fit[1], color='black')
            #ax.plot(x, fit[0]*asanyarray(x)+ fit[1], color='black')

            #Creating colored dots for ref 10

            #ref10_experimental = data.loc[data['Ref.'] == 10]['Observed']
            #ref10_predicted = C['median_prediction'][ref10_experimental.index]
            #ax.scatter(ref10_experimental, ref10_predicted,marker = 'o', color ='red', s = 100)

            ax.plot(x, x, color='black')

            plt.show()

    #Print data in CSV-file

    descriptors_df['Median prediction low range'] = C['median_prediction_0']
    descriptors_df['Median prediction high range'] = C['median_prediction_1']
    descriptors_df['Median prediction'] = C['median_prediction']
    descriptors_df['size prediction range'] = C['median_prediction_1'] - C[
        'median_prediction_0']
    write_csv_with_data(data, descriptors_df, args.d)

    #Calculate min, max and mean values for descriptors
    if args.phys:
        print(args.phys)
        print('Min: ')
        print(descriptors_df.min())
        print('Max: ')
        print(descriptors_df.max())
        print('Mean:')
        print(descriptors_df.mean())

    if args.pca:
        print('Starting PCA')
        print(descriptors_df[[
            'logP', 'PSA', 'MolWt', 'RingCount', 'HeavyAtomCount',
            'NumRotatableBonds'
        ]].head(3))
        print(len(descriptors_df[['size prediction range']]))

        #Define typ of PCA
        pca = PCA(n_components=2)

        #Select desctiptors to use in PCA
        df_small = descriptors_df[[
            'logP', 'PSA', 'MolWt', 'RingCount', 'HeavyAtomCount',
            'NumRotatableBonds'
        ]]

        #Convert descritor values to numeric/float
        df_X = df_small.apply(pandas.to_numeric, errors='raise')

        #Scale data
        scaler = preprocessing.RobustScaler()  #Normalizer() # MaxAbsScaler()
        df_X_scaled = scaler.fit_transform(df_X)

        #Calculate PCA
        pca.fit(df_X_scaled)

        X2 = pca.transform(
            df_X_scaled
        )  #descriptors_df[['logP','PSA','MolWt','RingCount','HeavyAtomCount','NumRotatableBonds']])

        #-----------------------------------------------------------
        desc_testset_large = descriptors_df.dropna(
            subset=['size prediction range'])

        desc_testset_small = desc_testset_large[[
            'logP', 'PSA', 'MolWt', 'RingCount', 'HeavyAtomCount',
            'NumRotatableBonds'
        ]]

        desc_testset_num = desc_testset_small.apply(pandas.to_numeric,
                                                    errors='raise')

        desc_testset_scaled = scaler.fit_transform(desc_testset_num)

        X3 = pca.transform(
            desc_testset_scaled
        )  #desc_testset[['logP','PSA','MolWt','RingCount','HeavyAtomCount','NumRotatableBonds']])
        #-----------------------------------------------------------

        #desc_testset = descriptors_df.dropna(subset = ['size prediction range'])
        Yerr_num = desc_testset_large[['size prediction range'
                                       ]].apply(pandas.to_numeric,
                                                errors='coerce')
        #print(pandas.Series(Yerr['size prediction range']))

        yerr = list(pandas.Series(Yerr_num['size prediction range']) / 4)

        plt.errorbar(X3[:, 0],
                     X3[:, 1],
                     yerr=yerr,
                     fmt='o',
                     markeredgecolor='black',
                     markersize=6,
                     mew=1,
                     ecolor='black',
                     elinewidth=0.3,
                     capsize=3,
                     capthick=1,
                     errorevery=1)

        plt.scatter(X2[:, 0], X2[:, 1])
        plt.xlabel('PC1')
        plt.ylabel('PC2')

        plt.title('PCA of descriptors')
        plt.show()
Exemplo n.º 9
0
# -----------------------------------------------------------------------------
# Shuffle the sample indices and cut the permutation into three consecutive
# parts: training, calibration and test.
# -----------------------------------------------------------------------------
idx = np.random.permutation(y.size)
one_third = idx.size // 3
two_thirds = 2 * idx.size // 3
train, calibrate, test = (idx[:one_third],
                          idx[one_third:two_thirds],
                          idx[two_thirds:])

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
# Inductive predictor: fitted on the training part, calibrated on the
# calibration part.
icp = IcpRegressor(
    RegressorNc(DecisionTreeRegressor(), abs_error, abs_error_inv))
icp.fit(X[train, :], y[train])
icp.calibrate(X[calibrate, :], y[calibrate])

# Aggregated predictor built around a second, independent IcpRegressor and a
# CrossSampler; only the training part is handed to it.
acp = AggregatedCp(
    IcpRegressor(
        RegressorNc(DecisionTreeRegressor(), abs_error, abs_error_inv)),
    sampler=CrossSampler())
acp.fit(X[train, :], y[train])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
# For each predictor: print its heading, predict the test part at
# significance 0.1 and show the first five predictions next to the true value.
for heading, model in (('# Inductive', icp), ('\n# Cross', acp)):
    print(heading)
    prediction = model.predict(X[test, :], significance=0.1)
    for pred, actual in zip(prediction[:5], y[test]):
        print(pred, actual)
Exemplo n.º 10
0
# Descriptor columns that are fed to the PCA.
_PCA_DESCRIPTORS = [
    'logP', 'PSA', 'MolWt', 'RingCount', 'HeavyAtomCount',
    'NumRotatableBonds'
]


def _build_icp(model_name):
    """Return an unfitted inductive conformal regressor for ``model_name``.

    ``'RF'`` wraps ``RandomForestRegressor`` and ``'SVM'`` wraps ``SVR``; both
    use ``KNeighborsRegressor`` as the second model of the normalized
    nonconformity function.

    Raises:
        ValueError: if ``model_name`` is neither ``'RF'`` nor ``'SVM'``.
            Previously the predictor was simply left undefined in that case.
    """
    if model_name == 'RF':
        underlying = RandomForestRegressor
    elif model_name == 'SVM':
        print('error - no SVM-regressor avliable')
        # NOTE(review): scikit-learn's SVR takes no ``n_estimators`` argument,
        # so this branch presumably fails if ``model_params`` is forwarded to
        # the SVR constructor. Confirm before relying on it.
        underlying = SVR
    else:
        raise ValueError(
            "Unknown model type %r; expected 'RF' or 'SVM'" % (model_name,))

    return IcpRegressor(
        NormalizedRegressorNc(underlying,
                              KNeighborsRegressor,
                              abs_error,
                              abs_error_inv,
                              model_params={'n_estimators': 100}))


def _median_intervals(lower_runs, upper_runs):
    """Collapse per-model interval bounds into one median interval per row.

    ``lower_runs`` / ``upper_runs`` hold one column per model. The result has
    the columns ``median_prediction_0`` (median lower bound),
    ``median_prediction_1`` (median upper bound), ``median_prediction``
    (midpoint of the two) and ``median_prediction_size`` (midpoint minus lower
    bound, i.e. half the interval width).
    """
    summary = pandas.DataFrame(index=lower_runs.index)
    summary['median_prediction_0'] = lower_runs.median(axis=1)
    summary['median_prediction_1'] = upper_runs.median(axis=1)
    summary['median_prediction'] = (summary['median_prediction_0'] +
                                    summary['median_prediction_1']) / 2
    summary['median_prediction_size'] = (summary['median_prediction'] -
                                         summary['median_prediction_0'])
    return summary


def _count_outside_interval(lower, upper, observed):
    """Count observed values that are not strictly inside ``(lower, upper)``.

    Only rows that have both bounds (non-NaN) and an entry in ``observed`` are
    considered. A NaN observed value counts as outside, because both
    comparisons are False for it.
    """
    lower = lower.dropna()
    upper = upper.dropna()
    shared = lower.index.intersection(upper.index).intersection(
        observed.index)
    obs = observed.loc[shared]
    inside = (lower.loc[shared] < obs) & (obs < upper.loc[shared])
    return int((~inside).sum())


def _plot_predicted_vs_observed(observed, predicted, half_widths):
    """Show predicted vs. experimental values with interval error bars."""
    _, ax = plt.subplots()

    ax.errorbar(observed,
                predicted,
                yerr=half_widths,
                fmt='o',
                markeredgecolor='black',
                markersize=6,
                mew=1,
                ecolor='black',
                elinewidth=0.3,
                capsize=3,
                capthick=1,
                errorevery=1)

    # Fixed axis window used for both axes.
    ax.set_ylim([-10, -3])
    ax.set_xlim([-10, -3])

    plt.ylabel('Predicted log Kp')
    plt.xlabel('Experimental log Kp')

    # Identity line (predicted == experimental) across the axis window.
    diagonal = [-10, -3]
    ax.plot(diagonal, diagonal, color='black')

    plt.show()


def _plot_descriptor_pca(descriptors_df):
    """Plot a two-component PCA of the descriptor columns.

    All rows are drawn as a scatter. Rows that have a value in
    'size prediction range' are drawn again with vertical error bars derived
    from that value.
    """
    print('Starting PCA')
    print(descriptors_df[_PCA_DESCRIPTORS].head(3))
    print(len(descriptors_df[['size prediction range']]))

    # Convert descriptor values to numeric, scale them and fit the PCA on
    # every compound.
    all_numeric = descriptors_df[_PCA_DESCRIPTORS].apply(pandas.to_numeric,
                                                         errors='raise')
    scaler = preprocessing.RobustScaler()
    all_scaled = scaler.fit_transform(all_numeric)

    pca = PCA(n_components=2)
    pca.fit(all_scaled)
    all_projected = pca.transform(all_scaled)

    # Rows that received a prediction interval.
    predicted_rows = descriptors_df.dropna(subset=['size prediction range'])
    predicted_numeric = predicted_rows[_PCA_DESCRIPTORS].apply(
        pandas.to_numeric, errors='raise')

    # Reuse the scaler fitted above. Refitting it on this subset would put
    # these rows on a different scale than the one the PCA was fitted with,
    # so they would no longer coincide with their own scatter points.
    predicted_projected = pca.transform(scaler.transform(predicted_numeric))

    interval_sizes = predicted_rows[['size prediction range']].apply(
        pandas.to_numeric, errors='coerce')
    # Error bar length is a quarter of the interval size.
    # NOTE(review): the reason for the factor 4 is not visible here.
    yerr = list(interval_sizes['size prediction range'] / 4)

    plt.errorbar(predicted_projected[:, 0],
                 predicted_projected[:, 1],
                 yerr=yerr,
                 fmt='o',
                 markeredgecolor='black',
                 markersize=6,
                 mew=1,
                 ecolor='black',
                 elinewidth=0.3,
                 capsize=3,
                 capthick=1,
                 errorevery=1)

    plt.scatter(all_projected[:, 0], all_projected[:, 1])
    plt.xlabel('PC1')
    plt.ylabel('PC2')

    plt.title('PCA of descriptors')
    plt.show()


def create_conformal_model():
    """Create the conformal model (main loop).

    Reads the input data, calculates descriptors and then, ``args.num_models``
    times, fits and calibrates an inductive conformal regressor on a fresh
    train/calibration/test split. The per-model prediction intervals are
    aggregated by their median. The function prints statistics, saves the
    fitted models, writes the predictions to a CSV file and, depending on the
    ``args`` flags, shows plots and descriptor summaries.

    All configuration is read from the module-level ``args``; nothing is
    returned.

    Raises:
        ValueError: if ``args.m`` is neither ``'RF'`` nor ``'SVM'``.
    """
    # Read data from file
    data = read_data(args.i)

    # Calculate descriptors
    descriptors_df = calculate_descriptors(data['smiles'])

    # Create indices for the training, calibration and test sets
    train_i, calibrate_i, test_i = create_indices_test_training_calibration(
        data)
    test_index_total = list(test_i)
    calibrate_index_total = list(calibrate_i)

    # Create inductive conformal prediction regressor
    icp = _build_icp(args.m)

    # One column per model; rows are aligned with the rows of ``data``.
    row_index = range(len(data))
    test_lower = pandas.DataFrame(index=row_index)
    test_upper = pandas.DataFrame(index=row_index)
    cal_lower = pandas.DataFrame(index=row_index)
    cal_upper = pandas.DataFrame(index=row_index)

    if args.verbose:
        print('Number of models to create: ' + str(args.num_models))
        print('############## Starting calculations ##############')

    icp_s = []

    for i in range(int(args.num_models)):
        (Xtrain, Xtest, Xcalibrate, ytrain, _,
         ycalibrate) = create_train_test_calibrate_sets(
             data, descriptors_df, train_i, calibrate_i, test_i)

        # Fit and calibrate the model
        icp.fit(Xtrain, ytrain)
        icp.calibrate(asanyarray(Xcalibrate), asanyarray(ycalibrate))

        # Predict the test and calibration sets
        prediction_test = icp.predict(asanyarray(Xtest),
                                      significance=args.significance)
        prediction_calibrate = icp.predict(asanyarray(Xcalibrate),
                                           significance=args.significance)

        # Keep an independent snapshot of this iteration's predictor. ``icp``
        # is refitted on every iteration, and a shallow copy would share its
        # nested objects with all later iterations.
        icp_s.append(copy.deepcopy(icp))

        # Store lower/upper bounds under the row labels of the current split
        blob = pandas.DataFrame(prediction_test, index=test_i)
        iblob = pandas.DataFrame(prediction_calibrate, index=calibrate_i)

        test_lower[i] = blob[0]
        test_upper[i] = blob[1]
        cal_lower[i] = iblob[0]
        cal_upper[i] = iblob[1]

        # Remember every row that has been part of a test / calibration set
        test_index_total = np.unique(
            np.concatenate((test_index_total, test_i), axis=0))
        calibrate_index_total = np.unique(
            np.concatenate((calibrate_index_total, calibrate_i), axis=0))

        # Create new indices for the next model
        train_i, calibrate_i, test_i = randomize_new_indices(
            train_i, calibrate_i, test_i, data, i)

    # Save models
    save_models(icp_s)

    if args.verbose:
        print('################## Loop finished, model created, test set predicted #################')

    experimental_values = data['Observed'][test_index_total]
    iexperimental_values = data['Observed'][calibrate_index_total]

    # External (test set) predictions
    C = _median_intervals(test_lower, test_upper)
    Y_pred_median_test = C['median_prediction'].dropna()
    median_prediction_size = C['median_prediction_size'].dropna().tolist()
    num_outside_median = _count_outside_interval(C['median_prediction_0'],
                                                 C['median_prediction_1'],
                                                 experimental_values)

    # Internal predictions. These come from the calibration-set predictions,
    # although the message printed below calls them the training set.
    iC = _median_intervals(cal_lower, cal_upper)
    iY_pred_median_test = iC['median_prediction'].dropna()
    imedian_prediction_size = iC['median_prediction_size'].dropna().tolist()
    inum_outside_median = _count_outside_interval(iC['median_prediction_0'],
                                                  iC['median_prediction_1'],
                                                  iexperimental_values)

    if args.verbose:
        print('########################## Prediction statistics external test ##########################')
        print('')

    print('Number of compounds predicted in test set: ' +
          str(C['median_prediction'].notnull().sum()))

    if args.t != 'full_model':
        ex_r2_score = r2_score(experimental_values, Y_pred_median_test)
        print('R^2 (coefficient of determination):  %.3f' % ex_r2_score)

        ex_mean_squared_error = mean_squared_error(experimental_values,
                                                   Y_pred_median_test)
        ex_rmse = sqrt(ex_mean_squared_error)
        print('RMSE:  %.3f' % ex_rmse)

        ex_MAE = mean_absolute_error(experimental_values, Y_pred_median_test)
        print('Mean absolute error:  %.3f' % ex_MAE)

        print('Mean squared error: %.3f' % ex_mean_squared_error)

        # Average prediction range
        print('Mean of median prediction range: %.3f' %
              mean(median_prediction_size))

        percent_num_outside_median = 100 * float(num_outside_median) / float(
            len(experimental_values))
        print('Number of compounds outside of prediction range: ' +
              str(num_outside_median))
        print('% of compounds predicted outside of prediction range: ' +
              str(percent_num_outside_median) + ' %')
        print(' ')

        # Internal prediction
        print('Number of compounds predicted in training set: ' +
              str(iC['median_prediction'].notnull().sum()))

        iex_r2_score = r2_score(iexperimental_values, iY_pred_median_test)
        print('R^2 (coefficient of determination):  %.3f' % iex_r2_score)

        iex_mean_squared_error = mean_squared_error(iexperimental_values,
                                                    iY_pred_median_test)
        iex_rmse = sqrt(iex_mean_squared_error)
        print('RMSE:  %.3f' % iex_rmse)

        print('Mean squared error: %.3f' % iex_mean_squared_error)

        iex_MAE = mean_absolute_error(iexperimental_values,
                                      iY_pred_median_test)
        print('Mean absolute error:  %.3f' % iex_MAE)

        # Average prediction range
        print('Mean of median prediction range: %.3f' %
              mean(imedian_prediction_size))

        ipercent_num_outside_median = 100 * float(inum_outside_median) / float(
            len(iexperimental_values))
        print('Number of compounds outside of prediction range: ' +
              str(inum_outside_median))
        print('% of compounds predicted outside of prediction range: ' +
              str(ipercent_num_outside_median) + ' %')
        print(' ')

        # Plot results - plot test set
        if args.plot:
            if args.verbose:
                print(' ################ Plotting testset #################')
            _plot_predicted_vs_observed(experimental_values,
                                        Y_pred_median_test,
                                        median_prediction_size)

    # Write the predictions to the CSV file
    descriptors_df['Median prediction low range'] = C['median_prediction_0']
    descriptors_df['Median prediction high range'] = C['median_prediction_1']
    descriptors_df['Median prediction'] = C['median_prediction']
    descriptors_df['size prediction range'] = (C['median_prediction_1'] -
                                               C['median_prediction_0'])
    write_csv_with_data(data, descriptors_df, args.d)

    # Calculate min, max and mean values for descriptors
    if args.phys:
        print(args.phys)
        print('Min: ')
        print(descriptors_df.min())
        print('Max: ')
        print(descriptors_df.max())
        print('Mean:')
        print(descriptors_df.mean())

    if args.pca:
        _plot_descriptor_pca(descriptors_df)