Example #1
def main(path):
    # read raw data
    churn_raw_data = pd.read_csv(path)

    # preprocessing data
    X_train, X_test, y_train, y_test = data_preprocess.preprocess(churn_raw_data)
    print(X_train.shape)
Example #2
def run(max_epoch=1, nfolds=10, batch_size=128):
    if os.path.isfile('../Data/data.csv') and os.path.isfile('../Data/max.json'):
        df = pd.read_csv('../Data/data.csv', engine='python', header=None)
        df = df.dropna(axis=0, how='any')
        X, y = df.iloc[:, 0:-1].values, df.iloc[:, -1].values
        with open('../Data/max.json', 'r') as file:
            js = file.read()
            max_dict = json.loads(js)
            max_features = max_dict['feature']
            max_len = max_dict['len']
    else:
        X, y, max_features, max_len = data_preprocess.preprocess()

    labels = ['bad' if x == 0 else 'good' for x in y]

    final_data = []

    for fold in range(nfolds):
        print("fold %u/%u" % (fold + 1, nfolds))
        X_train, X_test, y_train, y_test, _, label_test = train_test_split(X, y, labels,
                                                                           test_size=0.2)

        print('Build model...')
        model = build_model(max_features, max_len)

        print("Train...")
        X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, test_size=0.05)
        best_iter = -1
        best_auc = 0.0
        out_data = {}

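        # manual early stopping: keep the epoch with the best holdout AUC and stop
        # once it has not improved for more than two consecutive epochs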
        for ep in range(max_epoch):
            model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

            t_probs = model.predict_proba(X_holdout)
            t_auc = sklearn.metrics.roc_auc_score(y_holdout, t_probs)

            print('Epoch %d: auc = %f (best=%f)' % (ep, t_auc, best_auc))

            if t_auc > best_auc:
                best_auc = t_auc
                best_iter = ep

                probs = model.predict_proba(X_test)

                out_data = {'y': y_test, 'labels': label_test, 'probs': probs, 'epochs': ep,
                            'confusion_matrix': sklearn.metrics.confusion_matrix(y_test, probs > .5)}

                print(sklearn.metrics.confusion_matrix(y_test, probs > .5))
            else:
                if (ep - best_iter) > 2:
                    break
        
        model.save('../Model/model.h5')
        model.save_weights('../Model/model_weights.h5')
        
        final_data.append(out_data)
    pickle.dump(final_data, open('../Model/results.pkl', 'wb'))

    return final_data
Example #3
def __init__(self):
    print("Initializing Data Loader...")
    # Load dataframe and process it
    self.dataframe = dv.preprocess(
        os.path.join(os.getcwd(), "shuffled-full-set-hashed.csv"))
    print("Load Complete")
    # Create train and test sets
    # self.X_train,self.X_test,self.y_train,self.y_test= dv.create_train_test(self.dataframe)
    print("Created Train(90%) and Test(10%) Stratified splits")
Example #4
    def start(self):
        '''
        Reads and merges the datasets and populates the instance
        attributes holding the training data and the prediction target.
        '''
        cdf = prepare_crime_data()
        self.hdf = prepare_housing_data()
        self.cdf = preprocess(cdf)
        mdf = merge_datasets(self.cdf, self.hdf)
        self.train = mdf.drop(['Mean_Price'], axis=1)
        self.target = mdf.Mean_Price
        return
Example #5
def train():
    # read the data set
    scallop = pd.read_csv(file_dict["data_file"],
                          usecols=["longitude", "latitude", "tot.catch"])
    data = scallop.copy()
    data["tot.catch"] = np.log(scallop["tot.catch"] + 1)
    # split data set
    random.seed(0)
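    # note: this seeds Python's random module only; numpy's RNG is unaffected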
    scdata = preprocess(long_block_num=10, lat_block_num=10)

    scdata.split_block_data(data,
                            plot_block=True,
                            save_file=file_dict["plot1"])

    scdata.generate_posterior_plot_data()

    gx, gy, X_new = scdata.generate_suface_data()

    scmodel = GP_model(dim=2, x=scdata.X_2D_train, y=scdata.Y_2D_train)

    scmodel.train(niter=1000)
    scmodel.plot_trace(save_file=file_dict["plot3"])
    scmodel.print_summary(save_file=file_dict["plot2"])

    mu, sd, _ = scmodel.predict_GP(X_new,
                                   pred_noise=True,
                                   samples=50,
                                   pred_name="surface1")
    plot_gp_2D(gx,
               gy,
               mu,
               sd,
               scdata.X_2D_train,
               scdata.Y_2D_train,
               scdata.X_2D_test,
               scdata.Y_2D_test,
               save_file=file_dict["plot4"])

    scmodel.plot_range(scdata.long_new,
                       scdata.lat_new,
                       save_file=file_dict["plot5"])

    mu, sd, _ = scmodel.predict_GP(scdata.X_2D_test,
                                   pred_noise=True,
                                   samples=500,
                                   pred_name="test")

    # RMSE
    print(RMSE(mu, scdata.Y_2D_test))
Example #6
def load_data(sz):
    """
    Load the dataset, either from source files, or from pre-prepared compressed numpy array
    If the pre-prepared file does not exist - create it
    """
    img_filez = config.datafile("train_data_{}_{}_{}.npz".format(config.category,sz,config.num_images))
    if not os.path.isfile(img_filez):
        print("Creating {} file for faster processing...".format(img_filez))
        print("Genre: {}, # of images = {}".format(config.category,config.num_images))
        final_images_stacked = preprocess(genre_or_style=config.category, min_vals=[sz,sz],n=config.num_images)
        np.savez_compressed(file=img_filez, a=final_images_stacked)
    else:
        print("Load preprocessed image data from {}".format(img_filez))
        final_images_stacked = np.load(file=img_filez)["a"]
    return final_images_stacked
Example #7
def main():
    test_mini_batch_gd = True
    test_numerical_grad = False

    # Data parameters
    train_size = 60000
    test_size = 10000
    val_frac = 0.1
    train_data, val_data, test_data, idx_to_class, class_to_idx = data.preprocess(
        train_size, test_size, val_frac)

    # Model parameters
    layers = [784, 32, 32, 10]
    act_funs = ['ELU', 'tanh', 'softmax']
    loss_fun = 'cross_entropy'
    alpha = 0.5
    mu = 0.4
    T = 80
    xavier = True
    p = 0
    gamma = 0
    network = MLP(layers, act_funs, loss_fun, alpha, mu, T, idx_to_class,
                  xavier, p, gamma)

    if test_mini_batch_gd:
        # Training parameters
        epochs = 50
        batch_size = 128
        shuffle = True
        augment = False
        shift = 2
        stop = 3
        report = True
        log = True

        network.mini_batch_GD(epochs, batch_size, train_data, val_data,
                              test_data, shuffle, augment, shift, stop, report,
                              log)
        network.plot_accuracy()
        network.plot_loss()

    if test_numerical_grad:
        # Numerical grad parameters
        input, output = train_data[0][0], train_data[1][0]
        epsilon = 10**-2

        max_difference_1 = numerical_grad(network, 0, input, output, epsilon)
        max_difference_2 = numerical_grad(network, 1, input, output, epsilon)
Example #8
def main(path):
    # read raw data
    churn_raw_data = pd.read_csv(path)

    # preprocessing data
    X_train, X_test, y_train, y_test = data_preprocess.preprocess(churn_raw_data)

    # xgboost model training
    xgb_fpr, xgb_tpr, xgb_roc_auc = xgboost_churn.xgboost_churn(X_train, y_train, X_test, y_test)

    # lightGBM model training
    gbm_fpr, gbm_tpr, gbm_roc_auc = lightGBM_churn.lightGBM_churn(X_train, y_train, X_test, y_test)

    # randomForest model training
    rf_fpr, rf_tpr, rf_roc_auc = randomForest_churn.random_forest_churn(X_train, y_train, X_test, y_test)

    # DNN model training
    dnn_fpr, dnn_tpr, dnn_roc_auc = dnn_churn.dnn_churn(X_train, y_train, X_test, y_test)

    # plot ROC
    drawRoc.drawRoc([xgb_fpr, xgb_tpr, xgb_roc_auc], [gbm_fpr, gbm_tpr, gbm_roc_auc],
                    [rf_fpr, rf_tpr, rf_roc_auc], [dnn_fpr, dnn_tpr, dnn_roc_auc])
Example #9
# using ML-KNN
from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix
from sklearn.model_selection import GridSearchCV
from data_preprocess import preprocess

features_train, features_test, labels_train, labels_test = preprocess()

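# densify the feature and label matrices before fitting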
features_train = lil_matrix(features_train).toarray()
labels_train = lil_matrix(labels_train).toarray()
features_test = lil_matrix(features_test).toarray()

parameters = {'k': [3, 5, 7]}

grid_clf = GridSearchCV(MLkNN(), parameters, scoring='accuracy', cv=3)
grid_clf.fit(features_train, labels_train)

print("Best parameters:\n", grid_clf.best_params_)

# accuracy
print("Accuracy = ", grid_clf.best_score_)
Example #10
def main():
    ### Loading TRAIN Data
    df_train = load_df("train_v2.csv",10000)

    ### Loading TEST Data
    df_test = load_df('test_v2.csv', 10000)

    ### Fix outliers that do not fall within +/-3 std dev from mean of log transaction value
    
    # create a new dummy column logTransaction which is the log of all totalTransactionRevenue
    df_train['logTransaction']= df_train['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_train.logTransaction.std()
    mean_val = df_train.logTransaction.mean()
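    # clip log values lying more than three standard deviations from the mean to 3*std_dev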
    df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction-mean_val) > 3*std_dev,3*std_dev,df_train.logTransaction)

    # create a new dummy column logTransaction which is the log of all totalTransactionRevenue
    df_test['logTransaction']= df_test['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_test.logTransaction.std()
    mean_val = df_test.logTransaction.mean()
    df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction-mean_val) > 3*std_dev,3*std_dev,df_test.logTransaction)

    ### Extract Labels 
    y = df_train['logTransaction']
    # Get true values from test set
    y_true = df_test['logTransaction']

    ### Removing unnecessary columns 

    # columns that contain no data
    ones = unique_valued_cols(df_train)
    cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])]
    cols_to_remove.extend(['hits','customDimensions'])


    # Drop them from both the sets
    df_train = drop_cols(df_train, list(cols_to_remove))
    df_test = drop_cols(df_test, list(cols_to_remove))

    # Remove transaction related columns
    transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
    df_train = drop_cols(df_train, transaction_cols)
    df_test = drop_cols(df_test, transaction_cols)

    # Remove extra column in training
    #df_train = df_train.drop('trafficSource.campaignCode', axis=1)

    ### Preprocess the data before we start training
    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    ### Create categorical and numeric features dataframe
    df_categorical = df_train.select_dtypes(include=['object'])
    df_categorical_test = df_test.select_dtypes(include=['object'])

    # Numeric
    df_numeric = df_train.select_dtypes(include=['float64', 'int64'])
    df_numeric_test = df_test.select_dtypes(include=['float64', 'int64'])

    # Label encoding on categorical
    df_categorical = label_encoding(df_categorical)
    df_categorical_test = label_encoding(df_categorical_test)

    # Predict on categorical
    lm = LinearRegression()
    categorize_model, RMSE_test, RMSE_train = train_and_predict(lm, df_categorical, df_categorical_test, y, y_true)

    print("In-sample RMSE categorical:", RMSE_train)
    print("Out-sample RMSE categorical:", RMSE_test)
    print("Model parameters: ", categorize_model.get_params())

    joblib.dump(categorize_model, "modl_LR_cat.joblib")

    print('-'*10)


    # Predict on numerical
    df_numeric = df_numeric.fillna(0)
    df_numeric_test = df_numeric_test.fillna(0)

    num_model, RMSE_test, RMSE_train = train_and_predict(lm, df_numeric, df_numeric_test, y, y_true)

    print("In-sample RMSE numeric:", RMSE_train)
    print("Out-sample RMSE numeric:", RMSE_test)
    print("Model parameters: ", num_model.get_params())

    joblib.dump(num_model, "modl_LR_num.joblib")

    print('-'*10)

    # Predict on all features
    df_train = pd.concat([df_numeric,df_categorical],axis=1)
    df_test = pd.concat([df_numeric_test,df_categorical_test],axis=1)
    full_model, RMSE_test, RMSE_train = train_and_predict(lm, df_train, df_test, y, y_true)

    print("In-sample RMSE:", RMSE_train)
    print("Out-sample RMSE:", RMSE_test)
    print("Model parameters: ", full_model.get_params())

    joblib.dump(full_model, "modl_LR_full.joblib")

    print('-'*10)
Example #11
    def run(self,
            df,
            time_col,
            predictor_sets,
            label,
            start,
            end,
            test_window_months,
            outcome_lag_days,
            output_dir,
            output_filename,
            grid_size='test',
            thresholds=[],
            ks=list(np.arange(0, 1, 0.1)),
            save_output=True,
            debug=False):
        '''
        Run the pipeline using temporal cross validation.

        Inputs:

        predictor_sets: list of lists of predictors

        Output:

        Writes per-model results to output_dir/output_filename and stores
        them in self.models.
        '''
        if debug:
            print('START')
            print('GRID SIZE = {}'.format(grid_size))

        # load data
        self.load_clean_data(df)
        # set outcome
        self.label = label
        # set predictors
        self.add_predictor_sets(predictor_sets, reset=True)
        # set parametergrid for classifiers
        self.grid_size = grid_size
        self.set_paramgrid(grid_size)
        # get cutoff times for train test split
        self.get_train_test_times(start, end, test_window_months,
                                  outcome_lag_days)
        # initialize run number and results
        N = 0
        model_results = {}
        # initialize output file
        if not os.path.exists(output_dir):
            os.mkdir(output_dir)
        output_path = os.path.join(output_dir, output_filename)
        headers = [
            'model_id', 'N_split', 'i', 'label', 'model_type', 'roc_auc', 'k',
            'precision', 'recall', 'accuracy', 'params', 'predictors'
        ]
        pd.DataFrame(columns=headers).to_csv(output_path, index=False)

        if debug:
            print("set up done. output: {}".format(output_path))

        # 1) loop over temporal sets
        for train_start, train_end, test_start, test_end in self.train_test_times:
            model_results[N] = {}
            i = 0

            if debug:
                print('## TRAIN: {} - {}, TEST:{} - {} ##'.format(
                    str(train_start), str(train_end), str(test_start),
                    str(test_end)))

            # 2) loop over predictor combos
            for predictor_cols in self.predictor_combos:

                if debug:
                    print('### Predictors: {}'.format(predictor_cols))

                # train test split
                X_train, y_train, X_test, y_test = self.temporal_split(
                    time_col, train_start, train_end, test_start, test_end,
                    predictor_cols)
                if debug:
                    print('...train test split done')

                # pre-process training and test sets
                preprocess.preprocess(X_train, y_train)
                preprocess.preprocess(X_test, y_test)

                if debug:
                    print('...pre-processing done')

                # generate features
                train_to_concat = [X_train]
                test_to_concat = [X_test]
                for p in predictor_cols:
                    dummies_train, dummies_test = self.generate_feature(
                        p, X_train, X_test)
                    train_to_concat.append(dummies_train)
                    test_to_concat.append(dummies_test)
                X_train = pd.concat(train_to_concat, axis=1)
                X_test = pd.concat(test_to_concat, axis=1)
                X_train.drop(columns=predictor_cols, inplace=True)
                X_test.drop(columns=predictor_cols, inplace=True)

                if debug:
                    print('...feature generation done')

                # 3) loop over classifier types
                for model_type, clf in self.clfs.items():
                    if debug:
                        print('#### {}-{}: {}'.format(N, i, model_type))
                    # 4) loop over parameter combinations
                    for params in ParameterGrid(self.paramgrid[model_type]):
                        if debug:
                            print('{}'.format(params))
                        m = self.build_model(clf, X_train, y_train, X_test,
                                             y_test, params, N, i, model_type,
                                             predictor_cols, label, output_dir,
                                             output_filename, thresholds, ks,
                                             save_output)
                        model_results[N][i] = m

                        if debug:
                            print('---model results saved---')

                        i += 1
            N += 1

        # store model results
        self.models = model_results

        if debug:
            print('FINISH')
Example #12
ones = unique_valued_cols(df_test)
cols_to_remove = [x for x in ones if set(df_test[x].unique()) == set(['not available in demo dataset'])]
cols_to_remove.extend(['hits', 'customDimensions', 'device.isMobile'])

# Drop them
df_test = drop_cols(df_test, list(cols_to_remove))

# Remove transaction related columns
transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
df_test = drop_cols(df_test, transaction_cols)

### Preprocess the data before we start training
# print(df_test.iloc[0])
df_test = df_test.fillna(0)
df_test = label_encoding(df_test)
df_test = preprocess(df_test)

### Get Predictions
pred = mdl.predict(df_test)

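# signed prediction errors for every row, and absolute errors for rows with a non-zero true value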
vals = []
acc = []

for idx, each in enumerate(list(pred)):
    vals.append((idx, (each-y_true[idx])))

for idx, each in enumerate(list(pred)):
    if float(y_true[idx]) != 0.0:
        acc.append((idx, abs(each-y_true[idx])))

Example #13
        # Output layer

        logits = layers.fully_connected(avg_cell_out,
                                        OUT_DIM,
                                        activation_fn=None,
                                        scope='output')

        return logits


# Preprocessing
raw_files = os.listdir(DIR_RAW)
label_file = os.listdir(DIR_LABELS)[0]
dp.preprocess(raw_data_files=raw_files,
              raw_label_file=label_file,
              processed_file_name=PROCESSED_FILE,
              dir_raw=DIR_RAW,
              dir_label=DIR_LABELS,
              stride=BATCH_SIZE)

# Training phase setup
global_step = tf.Variable(0, trainable=False)

csvFileList = [DIR_PROCESSED + 'processed_data.csv']
batch_feature, batch_label = input_pipeline(csvFileList, BATCH_SIZE)
logits = deepSense(batch_feature, True, name='deepSense')
predict = logits[1]  # Regression task, 1-dim tensor
batchLoss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                    labels=batch_label)
loss = tf.reduce_mean(batchLoss)

# Evaluation phase setup
Example #14
    def cross_val(self):
        # so that we get different metrics used in this custom version
        # preprocess the data
        prep = preprocess(self.fasta_file, self.readout_file)

        # if want mono-nucleotide sequences
        dict = prep.one_hot_encode()
        # if want dinucleotide sequences
        #dict = prep.dinucleotide_encode()

        np.set_printoptions(threshold=sys.maxsize)

        # seed to reproduce results
        seed = random.randint(1, 1000)

        fw_fasta = dict["forward"]
        rc_fasta = dict["reverse"]
        readout = dict["readout"]

        #if self.activation_type == 'linear':
        #    readout = np.log2(readout)
        #    readout = np.ndarray.tolist(readout)

        forward_shuffle, readout_shuffle = shuffle(fw_fasta,
                                                   readout,
                                                   random_state=seed)
        reverse_shuffle, readout_shuffle = shuffle(rc_fasta,
                                                   readout,
                                                   random_state=seed)
        readout_shuffle = np.array(readout_shuffle)

        # initialize metrics to save values
        metrics = []

        # save the information of 10 folds auc scores
        train_auc_scores = []
        test_auc_scores = []

        # Provides train/test indices to split data in train/test sets.
        kFold = StratifiedKFold(n_splits=10)
        ln = np.zeros(len(readout_shuffle))
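        # note: splitting on a dummy all-zero label vector, so the folds are not
        # actually stratified by the readout values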
        for train, test in kFold.split(ln, ln):
            # build a fresh model for this fold
            model = self.create_model()

            fwd_train = forward_shuffle[train]
            fwd_test = forward_shuffle[test]
            rc_train = reverse_shuffle[train]
            rc_test = reverse_shuffle[test]
            y_train = readout_shuffle[train]
            y_test = readout_shuffle[test]

            # Early stopping
            callback = EarlyStopping(monitor='loss',
                                     min_delta=0.001,
                                     patience=3,
                                     verbose=0,
                                     mode='auto',
                                     baseline=None,
                                     restore_best_weights=False)
            history = model.fit({
                'forward': fwd_train,
                'reverse': rc_train
            },
                                y_train,
                                epochs=self.epochs,
                                batch_size=self.batch_size,
                                validation_split=0.0,
                                callbacks=[callback])

            # Alternative: train again without early stopping
            # model.fit({'forward': fwd_train, 'reverse': rc_train},
            #           y_train,
            #           epochs=self.epochs,
            #           batch_size=self.batch_size,
            #           validation_split=0.0)

            pred_train = model.predict({
                'forward': fwd_train,
                'reverse': rc_train
            })
            vals = []

            for i in range(len(pred_train)):
                if pred_train[i] < 0.5:
                    val = 0
                    vals.append(val)
                if pred_train[i] >= 0.5:
                    val = 1
                    vals.append(val)

            print(y_train[0:10])
            print(vals[0:10])

            true_pred = 0
            false_pred = 0
            for ind in range(len(pred_train)):
                if y_train[ind] == vals[ind]:
                    true_pred += 1
                else:
                    false_pred += 1
            print('Total number of train-set predictions is: ' +
                  str(len(y_train)))
            print('Number of correct train-set predictions is: ' +
                  str(true_pred))
            print('Number of incorrect train-set predictions is: ' +
                  str(false_pred))

            auc_score = sklearn.metrics.roc_auc_score(y_train, pred_train)
            print('train-set auc score is: ' + str(auc_score))
            print('train-set seed number is: ' + str(seed))
            train_auc_scores.append(auc_score)

            ##########################################################

            pred = model.predict({'forward': fwd_test, 'reverse': rc_test})

            vals = []
            for i in range(len(pred)):
                if pred[i] < 0.5:
                    val = 0
                    vals.append(val)
                if pred[i] >= 0.5:
                    val = 1
                    vals.append(val)

            true_pred = 0
            false_pred = 0
            for ind in range(len(y_test)):
                if y_test[ind] == vals[ind]:
                    true_pred += 1
                else:
                    false_pred += 1
            print('Total number of test-set predictions is: ' +
                  str(len(y_test)))
            print('Number of correct test-set predictions is: ' +
                  str(true_pred))
            print('Number of incorrect test-set predictions is: ' +
                  str(false_pred))

            auc_score = sklearn.metrics.roc_auc_score(y_test, pred)
            print('test-set auc score is: ' + str(auc_score))
            print('test-set seed number is: ' + str(seed))
            test_auc_scores.append(auc_score)

        print('seed number = %d' % seed)
        print(train_auc_scores)
        print('Mean train auc_scores of 10-fold cv is ' +
              str(np.mean(train_auc_scores)))
        print(test_auc_scores)
        print('Mean test auc_scores of 10-fold cv is ' +
              str(np.mean(test_auc_scores)))
Example #15
    def eval(self):
        prep = preprocess(self.fasta_file, self.readout_file)

        # if want mono-nucleotide sequences
        dict = prep.one_hot_encode()

        # if want dinucleotide sequences
        # dict = prep.dinucleotide_encode()

        # print maximum length without truncation
        np.set_printoptions(threshold=sys.maxsize)

        fw_fasta = dict["forward"]
        rc_fasta = dict["reverse"]
        readout = dict["readout"]

        seed = self.seed  #random.randint(1,1000)

        x1_train, x1_test, y1_train, y1_test = train_test_split(
            fw_fasta, readout, test_size=0.1, random_state=seed)
        # split for reverse complemenet sequences
        x2_train, x2_test, y2_train, y2_test = train_test_split(
            rc_fasta, readout, test_size=0.1, random_state=seed)
        #assert x1_test == x2_test
        #assert y1_test == y2_test

        model = self.create_model()

        # change from list to numpy array
        y1_train = np.asarray(y1_train)
        y1_test = np.asarray(y1_test)
        y2_train = np.asarray(y2_train)
        y2_test = np.asarray(y2_test)

        # Copy the original target values for later uses
        y1_train_orig = y1_train.copy()
        y1_test_orig = y1_test.copy()

        # if we want to merge two training dataset
        # comb = np.concatenate((y1_train, y2_train))

        ## Change it to categorical values
        y1_train = keras.utils.to_categorical(y1_train, 2)
        y1_test = keras.utils.to_categorical(y1_test, 2)

        checkpoint = ModelCheckpoint(
            'model-{epoch:03d}-{accuracy:03f}-{val_accuracy:03f}.h5',
            verbose=1,
            monitor='val_loss',
            save_best_only=True,
            mode='auto')

        # train the data
        model.fit({
            'forward': x1_train,
            'reverse': x2_train
        },
                  y1_train,
                  epochs=self.epochs,
                  batch_size=self.batch_size,
                  validation_split=0.1,
                  callbacks=[checkpoint])
        ## Save the entire model as a SavedModel.
        ##model.save('my_model')
        # Save weights only: later used in self.filter_importance()
        #model.save_weights('./my_checkpoint')

        # save each convolution learned filters as txt file
        """
        motif_weight = model.get_weights()
        motif_weight = np.asarray(motif_weight[0])
        for i in range(int(self.filters)):
            x = motif_weight[:,:,i]
            berd = np.divide(np.exp(100*x), np.transpose(np.expand_dims(np.sum(np.exp(100*x), axis = 1), axis = 0), [1,0]))
            np.savetxt(os.path.join('./motif_files', 'filter_num_%d'%i+'.txt'), berd)
        """
        pred_train = model.predict({'forward': x1_train, 'reverse': x2_train})

        # See which label has the highest confidence value
        predictions_train = np.argmax(pred_train, axis=1)

        print(y1_train_orig[0:10])
        print(predictions_train[0:10])

        true_pred = 0
        false_pred = 0
        for count, value in enumerate(predictions_train):
            if y1_train_orig[count] == predictions_train[count]:
                true_pred += 1
            else:
                false_pred += 1
        print('Total number of train-set predictions is: ' +
              str(len(y1_train_orig)))
        print('Number of correct train-set predictions is: ' + str(true_pred))
        print('Number of incorrect train-set predictions is: ' +
              str(false_pred))

        # Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
        # Returns AUC
        auc_score = sklearn.metrics.roc_auc_score(y1_train_orig,
                                                  predictions_train)
        print('train-set auc score is: ' + str(auc_score))
        print('train-set seed number is: ' + str(seed))

        ##########################################################
        # Apply on test data
        pred_test = model.predict({'forward': x1_test, 'reverse': x2_test})
        # See which label has the highest confidence value
        predictions_test = np.argmax(pred_test, axis=1)

        true_pred = 0
        false_pred = 0
        for count, value in enumerate(predictions_test):
            if y1_test_orig[count] == predictions_test[count]:
                true_pred += 1
            else:
                false_pred += 1
        print('Total number of test-set predictions is: ' +
              str(len(y1_test_orig)))
        print('Number of correct test-set predictions is: ' + str(true_pred))
        print('Number of incorrect test-set predictions is: ' +
              str(false_pred))

        auc_score = sklearn.metrics.roc_auc_score(y1_test_orig,
                                                  predictions_test)
        print('test-set auc score is: ' + str(auc_score))
        print('test-set seed number is: ' + str(seed))
Example #16
tr_dat = sys.argv[1]
required_files = [
    "./data/preprocessed_train_pasges(" + tr_dat + ").npy",
    "./data/preprocessed_train_queries(" + tr_dat + ").npy",
    "./data/preprocessed_test_queries(" + tr_dat + ").npy",
    "./data/preprocessed_test_pasges(" + tr_dat + ").npy",
    "./data/embed_word_matrix(" + tr_dat + ").npy",
    "./data/labels(" + tr_dat + ").npy",
    "./data/embed_char_matrix(" + tr_dat + ").npy",
]
if not all(os.path.isfile(f) for f in required_files):
    preprocess(tr_dat)

queries = np.loadtxt("./data/preprocessed_train_queries(" + tr_dat + ").npy")
pasges = np.loadtxt("./data/preprocessed_train_pasges(" + tr_dat + ").npy")
test_queries = np.loadtxt("./data/preprocessed_test_queries(" + tr_dat +
                          ").npy")
test_pasges = np.loadtxt("./data/preprocessed_test_pasges(" + tr_dat + ").npy")
labels = np.loadtxt("./data/labels(" + tr_dat + ").npy")
embed_word_matrix = np.loadtxt("./data/embed_word_matrix(" + tr_dat + ").npy")
embed_char_matrix = np.loadtxt("./data/embed_char_matrix(" + tr_dat + ").npy")
df = pd.read_csv("./data/eval1_unlabelled.tsv", sep="\t",
                 header=None)  # read dummy .tsv file into memory
test_data = df.values

# ======  End of Loading Data  =======
Example #17
def main():
    ### Loading TRAIN Data
    df_train = load_df("train_v2.csv")

    ### Loading TEST Data
    df_test = load_df('test_v2.csv')

    ### Fix outliers that do not fall within +/-3 std dev from mean of log transaction value
    
    # create a new dummy column logTransaction which is the log of all totalTransactionRevenue
    df_train['logTransaction']= df_train['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_train.logTransaction.std()
    mean_val = df_train.logTransaction.mean()
    df_train['logTransaction'] = np.where(np.abs(df_train.logTransaction-mean_val) > 3*std_dev,3*std_dev,df_train.logTransaction)

    # create a new dummy column logTransaction which is the log of all totalTransactionRevenue
    df_test['logTransaction']= df_test['totals.totalTransactionRevenue'].fillna(0).astype(float).apply(lambda x: np.log1p(x))
    std_dev = df_test.logTransaction.std()
    mean_val = df_test.logTransaction.mean()
    df_test['logTransaction'] = np.where(np.abs(df_test.logTransaction-mean_val) > 3*std_dev,3*std_dev,df_test.logTransaction)

    ### Extract Labels 
    y = df_train['logTransaction']
    # Get true values from test set
    y_true = df_test['logTransaction']

    ### Removing unnecessary columns 

    # columns that contain no data
    ones = unique_valued_cols(df_train)
    cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])]
    cols_to_remove.extend(['hits', 'customDimensions'])

    # Drop them
    df_train = drop_cols(df_train, list(cols_to_remove))
    df_test = drop_cols(df_test, list(cols_to_remove))

    # Remove transaction related columns
    transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
    df_train = drop_cols(df_train, transaction_cols)
    df_test = drop_cols(df_test, transaction_cols)

    # Remove extra column in training
    df_train = df_train.drop('trafficSource.campaignCode', axis=1)

    ### Preprocess the data before we start training
    df_train = preprocess(df_train)
    df_test = preprocess(df_test)

    ### Create categorical and numeric features dataframe
    df_categorical = df_train.select_dtypes(include=['object'])
    df_categorical_test = df_test.select_dtypes(include=['object'])

    # Numeric
    df_numeric = df_train.select_dtypes(include=['float64', 'int64'])
    df_numeric_test = df_test.select_dtypes(include=['float64', 'int64'])

    # Label encoding
    df_categorical = label_encoding(df_categorical)
    df_categorical_test = label_encoding(df_categorical_test)

    ### Training and Predictions

    ################### Categorical ###############
    reg_tree = tree.DecisionTreeRegressor()
    model_cat, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_categorical, df_categorical_test, y, y_true)

    for idx, each in enumerate(df_categorical.columns):
        print(idx, each)

    for idx, each in enumerate(model_cat.feature_importances_):
        print(idx, each*1e5)

    print('-'*10)
    print('\n')
    print("-- Scores for Categorical --")
    print("RMSE on test: ", RMSE_test)
    print("RMSE on train: ", RMSE_train)
    print('\n\n')

    print('-- Getting list of columns for categorical model --\n')
    for each in df_categorical.columns:
        print(each)
    for imp in model_cat.feature_importances_:
        print(imp)

    # Save Categorical model
    joblib.dump(model_cat, "modl_DT_cat.joblib")

    ###################### Numerical #####################
    df_numeric = df_numeric.fillna(0)
    df_numeric_test = df_numeric_test.fillna(0)

    model_num, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_numeric, df_numeric_test, y, y_true)

    for idx, each in enumerate(df_numeric.columns):
        print(idx, each)

    for idx, each in enumerate(model_num.feature_importances_):
        print(idx, each*1e5)

    print('-'*10)
    print('\n')
    print("-- Scores for Numerical --")
    print("RMSE on test: ", RMSE_test)
    print("RMSE on train: ", RMSE_train)
    print('\n\n')

    print('-- Getting list of columns for Numerical Model --\n')
    for each in df_numeric.columns:
        print(each)
    for imp in model_num.feature_importances_:
        print(imp)

    # Save Numerical model
    joblib.dump(model_num, "modl_DT_num.joblib")

    ###################### Full #####################
    df_train = pd.concat([df_numeric,df_categorical],axis=1)
    df_test = pd.concat([df_numeric_test,df_categorical_test],axis=1)

    model_full, RMSE_test, RMSE_train = train_and_predict(reg_tree, df_train, df_test, y, y_true)

    for idx, each in enumerate(df_train.columns):
        print(idx, each)

    for idx, each in enumerate(model_full.feature_importances_):
        print(idx, each*1e5)

    print('-'*10)
    print('\n')
    print("-- Scores for Full --")
    print("RMSE on test: ", RMSE_test)
    print("RMSE on train: ", RMSE_train)
    print('\n\n')

    print('-- Getting list of columns for Full model --\n')
    for each in df_train.columns:
        print(each)
    for imp in model_full.feature_importances_:
        print(imp)

    # Save full model
    joblib.dump(model_full, "modl_DT_full.joblib")
Example #18
    train_summary_writer = tf.summary.FileWriter(log_dir + '/train',
                                                 sess.graph)
    valid_summary_writer = tf.summary.FileWriter(log_dir + '/valid',
                                                 sess.graph)
    # -------------------------------------------------------------

    for epoch in range(1, 1 + 30):
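        # reshuffle the training set at the start of each epoch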
        pi = np.random.permutation(len(train_images))
        train_data, train_labels = train_images[pi], train_cords[pi]
        t0 = tm.time()
        for i in range(batch_count):
            start = i * cg.batch_size
            end = (i + 1) * cg.batch_size
            t1 = tm.time()
            input, label = preprocess(train_data[start:end],
                                      train_labels[start:end], cg.crop_size,
                                      8)  #8*n
            train_res = sess.run([train_step, loss, merged_summary_op],
                                 feed_dict={
                                     x: input,
                                     y: label,
                                     is_training: True
                                 })
            if i % 200 == 0 or i < 3:
                train_summary_writer.add_summary(train_res[2],
                                                 epoch * all_count + i)
                if epoch <= 5:
                    print('Epoch: %d--Iter: %d--Train_loss: %.3f' %
                          (epoch, i, train_res[1]))
            # -------------------------------------------------------------
            if epoch > 5:
Example #19
    def create_model(self):
        # different metric functions
        def coeff_determination(y_true, y_pred):
            SS_res = K.sum(K.square(y_true - y_pred))
            SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
            return (1 - SS_res / (SS_tot + K.epsilon()))

        def auroc(y_true, y_pred):
            return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)

        # building model
        prep = preprocess(self.fasta_file, self.readout_file)
        # if want mono-nucleotide sequences
        dict = prep.one_hot_encode()
        # if want dinucleotide sequences
        #dict = prep.dinucleotide_encode()

        readout = dict["readout"]
        fw_fasta = dict["forward"]
        rc_fasta = dict["reverse"]

        dim_num = fw_fasta.shape

        # To build this model with the functional API,
        # you would start by creating an input node:
        forward = keras.Input(shape=(dim_num[1], dim_num[2]), name='forward')
        reverse = keras.Input(shape=(dim_num[1], dim_num[2]), name='reverse')

        #first_layer = Conv1D(filters=self.filters, kernel_size=self.kernel_size, data_format='channels_last', input_shape=(dim_num[1],dim_num[2]), use_bias = False)
        ## with trainable = False
        #first_layer = Conv1D(filters=self.filters, kernel_size=self.kernel_size, kernel_initializer = my_init, data_format='channels_last', input_shape=(dim_num[1],dim_num[2]), use_bias = False, trainable=False)
        first_layer = ConvolutionLayer(filters=self.filters,
                                       kernel_size=self.kernel_size,
                                       data_format='channels_last',
                                       use_bias=True,
                                       alpha=self.alpha)

        fw = first_layer(forward)
        bw = first_layer(reverse)

        concat = concatenate([fw, bw], axis=1)

        pool_size_input = concat.shape[1]

        # ReLU output consumed by the 'custom' and 'custom_sum' pooling branches below
        concat_relu = ReLU()(concat)
        #concat = Dense(1, activation= 'sigmoid')(concat)

        if self.pool_type == 'Max':
            pool_layer = MaxPooling1D(pool_size=pool_size_input)(concat)
        elif self.pool_type == 'Ave':
            pool_layer = AveragePooling1D(pool_size=pool_size_input)(concat)
        elif self.pool_type == 'custom':

            def out_shape(input_shape):
                shape = list(input_shape)
                print(input_shape)
                shape[0] = 10
                return tuple(shape)

            #model.add(Lambda(top_k, arguments={'k': 10}))
            def top_k(inputs, k):
                # tf.nn.top_k Finds values and indices of the k largest entries for the last dimension
                print(inputs.shape)
                inputs2 = tf.transpose(inputs, [0, 2, 1])
                new_vals = tf.nn.top_k(inputs2, k=k, sorted=True).values
                # transform back to (None, 10, 512)
                return tf.transpose(new_vals, [0, 2, 1])

            pool_layer = Lambda(top_k, arguments={'k': 2})(concat_relu)
            pool_layer = AveragePooling1D(pool_size=2)(pool_layer)
        elif self.pool_type == 'custom_sum':
            ## apply relu function before custom_sum functions
            def summed_up(inputs):
                #nonzero_vals = tf.keras.backend.relu(inputs)
                new_vals = tf.math.reduce_sum(inputs, axis=1, keepdims=True)
                return new_vals

            pool_layer = Lambda(summed_up)(concat_relu)
        else:
            raise NameError('Set the pooling layer name correctly')

        flat = Flatten()(pool_layer)

        after_flat = Dense(32)(flat)

        # Binary classification with 2 output neurons
        if self.regularizer == 'L_1':
            #outputs = Dense(1, kernel_initializer='normal', kernel_regularizer=regularizers.l1(0.001), activation= self.activation_type)(flat)
            ## trainable = False with learned bias

            #outputs = Dense(1, kernel_initializer='normal', kernel_regularizer=regularizers.l1(0.001), activation= self.activation_type)(after_flat)
            outputs = Dense(2,
                            kernel_initializer='normal',
                            kernel_regularizer=regularizers.l1(0.001),
                            activation='sigmoid')(after_flat)
        elif self.regularizer == 'L_2':
            #outputs = Dense(1, kernel_initializer='normal', kernel_regularizer=regularizers.l1(0.001), activation= self.activation_type)(flat)
            ## trainable = False with learned bias
            outputs = Dense(2,
                            kernel_initializer='normal',
                            kernel_regularizer=regularizers.l2(0.001),
                            activation=self.activation_type)(after_flat)
        else:
            raise NameError('Set the regularizer name correctly')

        #weight_forwardin_0=model.layers[0].get_weights()[0]
        #print(weight_forwardin_0)
        model = keras.Model(inputs=[forward, reverse], outputs=outputs)

        #print model summary
        model.summary()

        #model.compile(loss='mean_squared_error', optimizer='adam', metrics = ['accuracy'])
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        #model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', auroc])

        return model
Example #20
parser.add_argument('--train_data',
                    type=str,
                    default='data/corpus.txt',
                    help='train data source')
parser.add_argument('--alg', type=str, default='sklearn', help='sklearn/self')
parser.add_argument('--topic', type=int, default=10, help='topic num')
parser.add_argument('--iter', type=int, default=100, help='training iter')
parser.add_argument('--n_top_words',
                    type=int,
                    default=10,
                    help='topic word num')

args = parser.parse_args()
if not os.path.exists(args.train_data):
    preprocess()

corpus = load_data(args.train_data)

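# bag-of-words features: keep terms that appear in at least 5 documents and in
# at most 50% of them, capped at 2000 features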
tf_vectorizer = CountVectorizer(max_df=0.50, min_df=5, max_features=2000)
tf = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()

if args.alg == 'sklearn':
    model = LatentDirichletAllocation(n_components=args.topic,
                                      max_iter=args.iter,
                                      learning_method='batch',
                                      n_jobs=-1)
    print("Begin training.")

    model.fit(tf)
Example #21
y = df_train['logTransaction']

# Remove columns that contain no data
ones = unique_valued_cols(df_train)
cols_to_remove = [x for x in ones if set(df_train[x].unique()) == set(['not available in demo dataset'])]
df_train = df_train.drop(cols_to_remove, axis=1)

# Remove transaction related columns
transaction_cols = ['totals.totalTransactionRevenue', 'totals.transactionRevenue', 'totals.transactions', 'fullVisitorId', 'logTransaction']
df_train = drop_cols(df_train, transaction_cols)

# Remove extra column in training
df_train = df_train.drop('trafficSource.campaignCode', axis=1)

### Preprocess the data before we start training
df_train = preprocess(df_train)

# Get the categorical variables
df_categorical = df_train.select_dtypes(include=['object'])

# add logTransaction (dependent variable) column
df_categorical['logTransaction'] = y

# delete train set as we don't need it anymore
del df_train

#Initialize ChiSquare Class
cT = ChiSquare(df_categorical)

# Test independence of categorical variables on logTransaction - that we are predicting.
# print which variables are important and which ones are not important
Example #22
    def filter_importance(self):
        prep = preprocess(self.fasta_file, self.readout_file)

        # if want mono-nucleotide sequences
        dict = prep.one_hot_encode()

        # if want dinucleotide sequences
        # dict = prep.dinucleotide_encode()

        # print maximum length without truncation
        np.set_printoptions(threshold=sys.maxsize)

        fw_fasta = dict["forward"]
        rc_fasta = dict["reverse"]
        readout = dict["readout"]

        seed = self.seed  #random.randint(1,1000)

        x1_train, x1_test, y1_train, y1_test = train_test_split(
            fw_fasta, readout, test_size=0.1, random_state=seed)
        # split for reverse complemenet sequences
        x2_train, x2_test, y2_train, y2_test = train_test_split(
            rc_fasta, readout, test_size=0.1, random_state=seed)
        #assert x1_test == x2_test
        #assert y1_test == y2_test

        model = self.create_model()

        # change from list to numpy array
        y1_train = np.asarray(y1_train)
        y1_test = np.asarray(y1_test)
        y2_train = np.asarray(y2_train)
        y2_test = np.asarray(y2_test)

        # Copy the original target values for later uses
        y1_train_orig = y1_train.copy()
        y1_test_orig = y1_test.copy()

        # if we want to merge two training dataset
        # comb = np.concatenate((y1_train, y2_train))

        ## Change it to categorical values
        y1_train = keras.utils.to_categorical(y1_train, 2)
        y1_test = keras.utils.to_categorical(y1_test, 2)

        # Restore the weights
        #weight_dir = './data/E13RACtrlF1_E13RAMutF1_DMR_toppos2000/checkpoint/my_checkpoint'
        weight_dir = '/Users/minjunpark/Documents/MuSeAM/classification/saved_weights/my_checkpoint'

        model.load_weights(weight_dir)

        #######*******************************
        pred_train = model.predict({'forward': x1_train, 'reverse': x2_train})

        # See which label has the highest confidence value
        predictions_train = np.argmax(pred_train, axis=1)

        print(y1_train_orig[0:10])
        print(predictions_train[0:10])

        true_pred = 0
        false_pred = 0
        for count, value in enumerate(predictions_train):
            if y1_train_orig[count] == predictions_train[count]:
                true_pred += 1
            else:
                false_pred += 1
        print('Total number of train-set predictions is: ' +
              str(len(y1_train_orig)))
        print('Number of correct train-set predictions is: ' + str(true_pred))
        print('Number of incorrect train-set predictions is: ' +
              str(false_pred))

        # Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC) from prediction scores.
        # Returns AUC
        auc_score = sklearn.metrics.roc_auc_score(y1_train_orig,
                                                  predictions_train)
        print('train-set auc score is: ' + str(auc_score))
        print('train-set seed number is: ' + str(seed))

        ##########################################################
        # Apply on test data
        pred_test = model.predict({'forward': x1_test, 'reverse': x2_test})
        # See which label has the highest confidence value
        predictions_test = np.argmax(pred_test, axis=1)

        true_pred = 0
        false_pred = 0
        for count, value in enumerate(predictions_test):
            if y1_test_orig[count] == predictions_test[count]:
                true_pred += 1
            else:
                false_pred += 1
        print('Total number of test-set predictions is: ' +
              str(len(y1_test_orig)))
        print('Number of correct test-set predictions is: ' + str(true_pred))
        print('Number of incorrect test-set predictions is: ' +
              str(false_pred))

        auc_score = sklearn.metrics.roc_auc_score(y1_test_orig,
                                                  predictions_test)
        print('test-set auc score is: ' + str(auc_score))
        print('test-set seed number is: ' + str(seed))
        sys.exit()
        #######*******************************
        """
        model.load_weights(weight_dir)
        weights = model.get_weights()

        # Apply on test data
        pred_test = model.predict({'forward': x1_test, 'reverse': x2_test})
        # Sum the absolute difference between y1_test and pred_test
        vals = np.sum(np.absolute(np.subtract(y1_test, pred_test)), axis=1)
        baseline = np.average(vals)
        """
        distances = []
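        # ablation: zero out one convolutional filter at a time, reload the trained
        # weights, and record how far the test-set predictions move on average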
        for i in range(self.filters):
            model.load_weights(weight_dir)
            weights = model.get_weights()

            zeros = np.zeros((12, 4))
            weights[0][:, :, i] = zeros
            model.set_weights(weights)

            ##########################################################
            # Apply on test data
            pred_test = model.predict({'forward': x1_test, 'reverse': x2_test})
            # See which label has the highest confidence value
            vals = np.sum(np.absolute(np.subtract(y1_test, pred_test)), axis=1)
            ave_distance = np.average(vals)
            distances.append(ave_distance)
            print(i)
        print(distances)
        np.savetxt('distances.txt', distances)
Example #23
        def objective(params):
            prep = preprocess(self.fasta_file, self.readout_file)

            # if want mono-nucleotide sequences
            dict = prep.one_hot_encode()

            # if want dinucleotide sequences
            # dict = prep.dinucleotide_encode()

            # print maximum length without truncation
            np.set_printoptions(threshold=sys.maxsize)

            fw_fasta = dict["forward"]
            rc_fasta = dict["reverse"]
            readout = dict["readout"]

            # seed = random.randint(1,1000)
            seed = self.seed

            x1_train, x1_test, y1_train, y1_test = train_test_split(
                fw_fasta, readout, test_size=0.2, random_state=seed)
            # split for reverse complemenet sequences
            x2_train, x2_test, y2_train, y2_test = train_test_split(
                rc_fasta, readout, test_size=0.2, random_state=seed)
            #assert x1_test == x2_test
            #assert y1_test == y2_test

            self.filters = params["filters"]
            self.kernel_size = params["kernel_size"]
            self.epochs = params["epochs"]
            self.batch_size = params["batch_size"]
            self.alpha = params["alpha"]

            model = self.create_model()

            # change from list to numpy array
            y1_train = np.asarray(y1_train)
            y1_test = np.asarray(y1_test)
            y2_train = np.asarray(y2_train)
            y2_test = np.asarray(y2_test)

            y1_train_orig = y1_train.copy()
            y1_test_orig = y1_test.copy()

            # if we want to merge two training dataset
            # comb = np.concatenate((y1_train, y2_train))

            ## Change it to categorical values
            y1_train = keras.utils.to_categorical(y1_train, 2)
            y1_test = keras.utils.to_categorical(y1_test, 2)
            # if we want to merge two training dataset
            # comb = np.concatenate((y1_train, y2_train))

            # train the data
            model.fit({
                'forward': x1_train,
                'reverse': x2_train
            },
                      y1_train,
                      epochs=self.epochs,
                      batch_size=self.batch_size,
                      validation_split=0.1)

            test_pred = model.predict({'forward': x1_test, 'reverse': x2_test})
            test_pred = np.argmax(test_pred, axis=1)

            auc_score = sklearn.metrics.roc_auc_score(y1_test_orig, test_pred)
            return -auc_score
Example #24
def main(model, init_train, start_epoch, cycle, epochs, batch_size,
         save_intervals):

    model = model.upper()

    if model == 'DCGAN_1':
        my_model = DCGAN(name='DCGAN_1')
    elif model == 'DCGAN_2':
        my_model = DCGAN(name='DCGAN_2')
    elif model == 'DCGAN_3':
        my_model = DCGAN(name='DCGAN_3')
    elif model == 'VAE_1':
        my_model = VAE(name='VAE_1')
    elif model == 'VAE_2':
        my_model = VAE(name='VAE_2')
    elif model == 'VAE_3':
        my_model = VAE(name='VAE_3')
    elif model == 'VAE_4':
        my_model = VAE(name='VAE_4')
    else:
        print(
            'The selected model {} is not in the list [DCGAN_1, DCGAN_2, DCGAN_3, VAE_1, VAE_2, VAE_3, VAE_4]'
            .format(model))
        return

    print("Python main programm for generating images using {}".format(model))

    ## preprocess the scraped images if init_train and save them as a compressed .npz file; otherwise load the saved file
    if init_train:
        print("Start initial process of building the {} model.".format(model))
        print("Do Preprocessing by loading scraped images...")
        ### manually merged into merged_japanese, so take that subdirectory as datapath source:
        if False:
            ## select genre = "yakusha-e"
            image_resized_1 = preprocess(genre_or_style="yakusha-e",
                                         min_vals=[128, 128])
            ## select style = "Japanese Art"
            image_resized_2 = preprocess(genre_or_style="Japanese Art",
                                         min_vals=[128, 128])
            final_images_stacked = np.vstack(
                (image_resized_1, image_resized_2))
            del image_resized_1, image_resized_2
            gc.collect()
        else:
            final_images_stacked = preprocess(genre_or_style="merged_japanese",
                                              min_vals=[128, 128])

        ## save the train data so that later training cycles can load this file instead of calling preprocess() again
        try:
            print(
                "Save preprocessed image data on ../data/train_data.npz in order to retrieve in upcoming training cycles."
            )
            np.savez_compressed(file="../data/train_data.npz",
                                a=final_images_stacked)
        except:
            print(
                "Could not save train data on machine for upcoming training cycles."
            )

    else:
        try:
            print("Load preprocessed image data from earlier training cycles.")
            final_images_stacked = np.load(file="../data/train_data.npz")["a"]
        except:
            ### manually merged into merged_japanese, so take that subdirectory as datapath source:
            if False:
                ## select genre = "yakusha-e"
                image_resized_1 = preprocess(genre_or_style="yakusha-e",
                                             min_vals=[128, 128])
                ## select style = "Japanese Art"
                image_resized_2 = preprocess(genre_or_style="Japanese Art",
                                             min_vals=[128, 128])
                final_images_stacked = np.vstack(
                    (image_resized_1, image_resized_2))
                del image_resized_1, image_resized_2
                gc.collect()
            else:
                final_images_stacked = preprocess(
                    genre_or_style="merged_japanese", min_vals=[128, 128])

    if init_train:
        print("Start initial training of the {} model:".format(model))
        print("There are {} images provided for training".format(
            len(final_images_stacked)))
        my_model.train(data=final_images_stacked,
                       epochs=epochs,
                       batch_size=batch_size,
                       save_intervals=save_intervals,
                       init_train=init_train,
                       start_epoch=start_epoch,
                       cycle=cycle)
    else:
        if model in ['DCGAN_1', 'DCGAN_2', 'DCGAN_3']:
            print(
                "Using last epoch {} of generator and discriminator for the stacked {}  model:"
                .format(start_epoch, model))
            generator_weights = "../model/{}/epoch_{}_generator.h5".format(
                model, start_epoch)
            discrimininator_weights = "../model/{}/epoch_{}_discriminator.h5".format(
                model, start_epoch)
            #load generator weights
            my_model.generator.load_weights(filepath=generator_weights)
            #load discriminator weights
            my_model.discriminator.load_weights(
                filepath=discrimininator_weights)
            #train the dcgan with last epoch weights
            print(
                "Training the {} model based on last epoch weights {}.".format(
                    model, start_epoch))
        elif model in ['VAE_1', 'VAE_2', 'VAE_3', 'VAE_4']:
            print(
                "Using last epoch {} of encoder and decoder for the stacked {} model:"
                .format(start_epoch, model))
            encoder_weights = "../model/{}/epoch_{}_encoder.h5".format(
                model, start_epoch)
            decoder_weights = "../model/{}/epoch_{}_decoder.h5".format(
                model, start_epoch)
            vae_weights = "../model/{}/epoch_{}_vae.h5".format(
                model, start_epoch)
            #load encoder weights
            my_model.encoder.load_weights(filepath=encoder_weights)
            #load decoder weights
            my_model.decoder.load_weights(filepath=decoder_weights)
            #load VAE weights
            my_model.vae.load_weights(filepath=vae_weights)
            #train the VAE with last epoch weights
            print(
                "Training the {} model based on last epoch weights {}.".format(
                    model, start_epoch))
        else:
            print('Selected model {} is not available'.format(model))

        my_model.train(data=final_images_stacked,
                       epochs=epochs,
                       batch_size=batch_size,
                       save_intervals=save_intervals,
                       init_train=init_train,
                       start_epoch=start_epoch,
                       cycle=cycle)
Example #25
            testing_labels.append(testing_data[files[i]]['label'])
            testing_content.append(testing_data[files[i]]['content'])
            
    print("Finished loading and splitting data for training and testing.\n")

    print("Preprocessing data for training.\n")
    #removing stop words and lemmatizing
    preprocessed_content_training = []
    #Bigrams and trigrams
    bigrams_content_training = []
    trigrams_content_training = []
    for i,t in enumerate(training_content):
        words = []
        for word in training_content[i].split(' '):
            words.append(word)
        preprocessed_content_training.append(dp.preprocess(training_content[i]))
    print("Finished preprocessing data for training.\n")

    #preprocess for bigrams and trigrams
    print("Getting bigram for training.\n")
    bigrams_content_training, bigram_mod = dp.get_bigrams(preprocessed_content_training)
    print("Finished getting bigram for training.\n")
    print("Getting trigram for training.\n")
    trigrams_content_training, trigram_mod = dp.get_trigrams(bigrams_content_training)
    print("Finished getting trigram for training.\n")
        
    print("Getting bow corpus and dictionary for training.\n")
    bow_corpus_training, dictionary = dp.get_dictionary_corpus(preprocessed_content_training)
    print("Finished getting bow corpus and dictionary for training.\n")
    
#    vectorizer = CountVectorizer()