Example #1
if __name__ == "__main__":
	model_dir = os.path.join(scene_test_list[0], 'model')
	result_dir = os.path.join(scene_test_list[0], 'result', 'test_out')
	errorlog_dir = os.path.join(scene_test_list[0], 'errorlog')
	summarylog_dir = os.path.join(scene_test_list[0], 'summarylog')

	os.makedirs(model_dir, exist_ok=True)
	os.makedirs(result_dir, exist_ok=True)
	os.makedirs(errorlog_dir, exist_ok=True)
	os.makedirs(summarylog_dir, exist_ok=True)

	train_data = dataLoader(data_dir=data_dir, subset='train',
							patch_width=patch_width,
							patch_height=patch_height,
							image_start_idx=train_start_idx,
							img_per_scene=train_per_scene,
							patch_per_img=patch_per_img,
							scene_list=scene_train_list)
	valid_data = dataLoader(data_dir=data_dir, subset='valid',
							patch_width=patch_width,
							patch_height=patch_height,
							image_start_idx=valid_start_idx,
							img_per_scene=valid_per_scene,
							patch_per_img=patch_per_img,
							scene_list=scene_valid_list)

	# Train
	train_dataset = tf.data.TFRecordDataset([train_data.dataset_name])
	# Parse the record into tensors.
	train_dataset = train_dataset.map(_parse_function)
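	# The training snippet stops after mapping the parser. A minimal sketch of how
	# such a TF1 pipeline is typically batched and iterated, mirroring the test
	# pipeline shown later on this page; shuffle_buffer_size and train_batch_size
	# are assumed configuration values that are not part of the original snippet.
	train_dataset = train_dataset.shuffle(buffer_size=shuffle_buffer_size)
	train_dataset = train_dataset.batch(train_batch_size)
	train_iterator = train_dataset.make_initializable_iterator()
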
"""
Predictions are made here.
@AugustSemrau
"""

from data_loader import dataLoader
from models import Models
from data_loader import csv_saver
import xgboost as xgb

if __name__ == '__main__':

    # Import test data
    testData = dataLoader(test=True, ageNAN='median')

    # Instantiate the models helper
    models = Models(agenan='median')

    # Logistic Regression model predictions
    logisticModel = models.build_model_LR()
    logistic_predictions = logisticModel.predict(testData)
    # Saving predictions
    csv_saver(predictions=logistic_predictions, type="logistic")

    # Naive Bayes model predictions
    naiveBayes_model = models.build_model_NB()
    naiveBayes_predictions = naiveBayes_model.predict(testData)
    # Saving predictions
    csv_saver(predictions=naiveBayes_predictions, type="naiveBayes")

    # Stochastic Gradient Descent model predictions
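    # The example is cut off at this point. By analogy with the LR and NB blocks
    # above, the SGD block would presumably look like the following sketch; the
    # method name build_model_SGD and the "sgd" label are assumptions, not taken
    # from the original.
    sgdModel = models.build_model_SGD()
    sgd_predictions = sgdModel.predict(testData)
    csv_saver(predictions=sgd_predictions, type="sgd")
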
Example #3
import argparse
import os

from model.anchornet import AnchorNet

# read num_anchor from command line
parser = argparse.ArgumentParser()
parser.add_argument('--anchor', help='Number of anchors to predict')
parser.add_argument('--rectify_img',
                    help='Whether to rectify images',
                    default=0)
args = parser.parse_args()
num_anchor = int(args.anchor)
print('Number of anchors to predict: {}'.format(num_anchor))
rectify_img = int(args.rectify_img) > 0
print('Rectify image: {}'.format(rectify_img))

# load data
data_loader = dataLoader()
imgs = data_loader.loadTestingImg()
flows = data_loader.loadTestingFlow()
total_count = len(imgs)

# load model
if num_anchor > 0:
    depthnet = DepthNet(data_loader.getImgShape())
    depthnet.model.load_weights(
        os.path.join(os.getcwd(), "checkpoints/model_depth.hdf5"))
    anchornet = AnchorNet(data_loader.getImgShape(), num_anchor)
    anchornet.model.load_weights(
        os.path.join(os.getcwd(),
                     'checkpoints/model_anchor{}.hdf5'.format(num_anchor)))

# path to save results

Example #4
if __name__ == "__main__":
    model_dir = os.path.join(scene_test_list[0], 'model')
    result_dir = os.path.join(scene_test_list[0], 'result', 'test_out')
    errorlog_dir = os.path.join(scene_test_list[0], 'errorlog')
    summarylog_dir = os.path.join(scene_test_list[0], 'summarylog')

    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(result_dir, exist_ok=True)
    os.makedirs(errorlog_dir, exist_ok=True)
    os.makedirs(summarylog_dir, exist_ok=True)

    test_data = dataLoader(data_dir=data_dir,
                           subset='test',
                           image_start_idx=0,
                           img_per_scene=test_per_scene,
                           scene_list=scene_test_list)

    # Test
    test_dataset = tf.data.TFRecordDataset([test_data.dataset_name])
    test_dataset = test_dataset.map(_parse_function_testdata)
    test_dataset = test_dataset.batch(test_batch_size)

    handle_large = tf.placeholder(tf.string, shape=[])
    iterator_structure_large = tf.data.Iterator.from_string_handle(
        handle_large, test_dataset.output_types, test_dataset.output_shapes)
    next_element_large = iterator_structure_large.get_next()
    test_iterator = test_dataset.make_initializable_iterator()

    # Model
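
    # The example is cut off at the model definition. A minimal sketch (assumed,
    # not from the original) of how the string-handle iterator above is typically
    # driven inside a TF1 session:
    with tf.Session() as sess:
        test_handle = sess.run(test_iterator.string_handle())
        sess.run(test_iterator.initializer)
        while True:
            try:
                batch = sess.run(next_element_large,
                                 feed_dict={handle_large: test_handle})
                # ... run the model on `batch` here ...
            except tf.errors.OutOfRangeError:
                break
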
Example #5
"""
This script is a low-effort tuning of the K-Nearest-Neighbors model.
@AugustSemrau
"""

from data_loader import dataLoader
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd



if __name__ == '__main__':

    # Get data
    X, y = dataLoader(test=False, optimize_set=False)
    y = y.values.ravel()
    # First, make a list of k values to try
    neighbors = list(range(1, 25))
    # Second, make an empty list for the cross-validation scores
    cval_scores = []

    # Now do 10-fold cross-validation
    for K in neighbors:
        model_KNN = KNeighborsClassifier(n_neighbors=K)
        scores = cross_val_score(model_KNN, X, y, cv=10, scoring='accuracy')
        cval_scores.append(scores.mean())

    # Plot these scores to see how different numbers of neighbors affect performance
    def plot_acc(knn_scores):
        pd.DataFrame({"K": neighbors, "Accuracy": knn_scores}) \
            .set_index("K").plot.bar(rot=0)
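        # The example is cut off here. A sketch of how the plot would presumably be
        # finished and shown; the axis labels are assumptions, not from the original.
        plt.xlabel("Number of neighbors K")
        plt.ylabel("10-fold cross-validation accuracy")
        plt.show()

    plot_acc(cval_scores)
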
Example #6
    #formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
    logging.basicConfig(filename='logfile.log', level=logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] [%(name)s] [%(levelname)s] %(message)s')
    logger = get_logger('ML_struct')
    parser = argparse.ArgumentParser(description='Character-level convolutional neural network for text classification')
    parser.add_argument('--config', '--c', type=str, metavar='yaml FILE',
                        help='where to load YAML configuration')
    args = parser.parse_args()
    print(args)
    print(args.config)
    yaml_file = args.config
    with open(yaml_file) as f:
        cfg = yaml.safe_load(f)
    print(cfg)
    
    train_dataset, val_dataset = Datasets()  # How should this be handled for a different path?
    train_loader, val_loader = dataLoader(train_dataset, val_dataset)
    cnn = CNNClassifier()
    device = torch.device("cuda")
    print(device)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        #cnn.to(device)
        cnn = nn.DataParallel(cnn)
        cnn.to(device)
    else:
        cnn.to(device)
        #mytensor = my_tensor.to(device)

    # loss
    criterion = nn.CrossEntropyLoss()
    # backpropagation method
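
    # The snippet ends at the "backpropagation method" comment. A minimal sketch of
    # the optimizer and training step that would presumably follow; Adam, the 1e-3
    # learning rate, and the (inputs, labels) batch structure are assumptions, not
    # taken from the original.
    optimizer = torch.optim.Adam(cnn.parameters(), lr=1e-3)
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = cnn(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
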
Example #7
def build_optimized_RF(X, y, d0, d1, d2, d3):
    model = RandomForestClassifier(n_estimators=d0,
                                   max_depth=d1,
                                   max_features=d2,
                                   criterion=d3,
                                   oob_score=True,
                                   random_state=0)
    model.fit(X, y)
    return model


if __name__ == '__main__':

    # Defining data
    X_train, X_val, y_train, y_val = dataLoader(test=False,
                                                optimize_set=True,
                                                ageNAN="median")

    # Building baseline RF model
    baselineRF = build_baseline_RF(X=X_train, y=y_train)
    print('Default RF model out-of-bag score', baselineRF.oob_score_)
    print('Default RF model test accuracy', baselineRF.score(X_val, y_val))

    # Performing Bayesian optimization to optimize RF model parameters
    # Acquisition functions can be MPI, EI or LCB
    opt = GPyOpt.methods.BayesianOptimization(f=objective_function,
                                              domain=domain_RF,
                                              acquisition_type='MPI')
    opt.acquisition.exploration_weight = 0.5
    opt.run_optimization(max_iter=30)
    domain_best = opt.X[np.argmin(opt.Y)]
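
    # A sketch (not part of the original) of how the optimum found above could be
    # fed back into build_optimized_RF. GPyOpt returns continuous values, so the
    # integer dimensions are cast back, and mapping the criterion dimension onto
    # 'gini'/'entropy' is an assumption about how domain_RF encodes it.
    best_RF = build_optimized_RF(X=X_train, y=y_train,
                                 d0=int(domain_best[0]),
                                 d1=int(domain_best[1]),
                                 d2=int(domain_best[2]),
                                 d3=['gini', 'entropy'][int(domain_best[3])])
    print('Optimized RF model out-of-bag score', best_RF.oob_score_)
    print('Optimized RF model test accuracy', best_RF.score(X_val, y_val))
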
Example #8
def main(_):
  # Accuracy curves; presumably defined as module-level lists in the original file.
  train_acc, test_acc, x_inter = [], [], []

  # Import data
  #mnist = input_data.read_data_sets("./input_data/", one_hot=True)
  mnist = data_loader.dataLoader("./data/train.pkl", "./data/test.pkl")

  # Create the model
  x = tf.placeholder(tf.float32, [None, 1568])

  # Define loss and optimizer
  y_ = tf.placeholder(tf.float32, [None, 19])

  # Build the graph for the deep net
  y_conv, keep_prob = deepnn(x)

  with tf.name_scope('loss'):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
                                                            logits=y_conv)
  cross_entropy = tf.reduce_mean(cross_entropy)

  with tf.name_scope('adam_optimizer'):
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

  with tf.name_scope('accuracy'):
    correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
    correct_prediction = tf.cast(correct_prediction, tf.float32)
  accuracy = tf.reduce_mean(correct_prediction)

  graph_location = tempfile.mkdtemp()
  print('Saving graph to: %s' % graph_location)
  train_writer = tf.summary.FileWriter(graph_location)
  train_writer.add_graph(tf.get_default_graph())


  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(1001):
      train_image,train_label = mnist.nextBatch(50)


      if i % 500 == 0:
        train_accuracy = accuracy.eval(feed_dict={
            x: train_image, y_: train_label, keep_prob: 1.0})
        print('step %d, training accuracy %g' % (i, train_accuracy))
        train_acc.append(train_accuracy)
      train_step.run(feed_dict={x: train_image, y_: train_label, keep_prob: 0.5})
      
      if i % 500 == 0:
        x_inter.append(i)
        test_accuracy = accuracy.eval(feed_dict={
            x: mnist.test_image, y_: mnist.test_label, keep_prob: 1.0})
        print('test accuracy %g' % test_accuracy)
        test_acc.append(test_accuracy)
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(x_inter,train_acc, 'r', x_inter, test_acc, 'b')

    label = ["train", "test"]
    plt.title("accuracy of train and test")
    plt.xlabel("iterations")
    plt.ylabel("accuracy")
    plt.legend(label, loc=1, ncol=2)
    # save the plot to a file
    plt.savefig("1.jpg")
Example #9
"""
This script is experimenting with the XGBoost model.
@AugustSemrau
"""

from data_loader import dataLoader
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

if __name__ == '__main__':

    # Get data
    X_train, X_val, y_train, y_val = dataLoader(test=False, optimize_set=True)
    y_train, y_val = y_train.values.ravel(), y_val.values.ravel()

    # Define baseline XGB classifier model with default parameters
    default_xgb = XGBClassifier()
    default_xgb.fit(X_train, y_train)
    default_predictions = default_xgb.predict(X_val)
    print('Default XGB model test accuracy',
          accuracy_score(y_val, default_predictions))

    # Using early_stopping_rounds to determine best n_estimators number
    tuned_xgb = XGBClassifier(n_estimators=10, learning_rate=0.5)
    tuned_xgb.fit(X_train,
                  y_train,
                  early_stopping_rounds=5,
                  eval_set=[(X_val, y_val)],
                  verbose=False)
    tuned_params = tuned_xgb.get_params()
    print('')
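    # The example is truncated here. With early stopping, the best boosting round is
    # typically read back from the fitted model afterwards; the exact attribute
    # depends on the xgboost version (best_iteration is assumed here).
    print('Best iteration found by early stopping:', tuned_xgb.best_iteration)
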
Example #10
    def __init__(self, agenan="median"):
        self.X, self.y = dataLoader(test=False,
                                    optimize_set=False,
                                    ageNAN=agenan)
        self.y = self.y.values.ravel()
Example #11
    def process_CNN_word_vector(self):
        data_loader = dataLoader()
        # Not removing stopwords seems to give better performance for the CNN
        vec_max_features = 6000
        vectorizer = TfidfVectorizer(stop_words=None,
                                     ngram_range=(1, 4),
                                     max_df=0.7,
                                     max_features=vec_max_features)
        train_docs = data_loader.get_train_docs()
        dev_docs = data_loader.get_dev_docs()
        test_docs = data_loader.get_test_docs()
        X_train = vectorizer.fit_transform(train_docs).toarray()
        X_dev = vectorizer.transform(dev_docs).toarray()
        X_test = vectorizer.transform(test_docs).toarray()
        # print('X_train shape: ', X_train.shape)
        # X_train_vec_mean = data_loader.get_X_train_glove_vec_mean()
        # X_dev_vec_mean = data_loader.get_X_dev_glove_vec_mean()
        # X_test_vec_mean = data_loader.get_X_test_glove_vec_mean()
        # print('X_train_vec_mean shape: ', X_train_vec_mean.shape)
        # X_train = np.concatenate((X_train, X_train_vec_mean), axis=1)
        # X_dev = np.concatenate((X_dev, X_dev_vec_mean), axis=1)
        # X_test = np.concatenate((X_test, X_test_vec_mean), axis=1)

        y_train = data_loader.get_y_train()
        zip_train_X = list(zip(X_train, y_train))
        random.shuffle(zip_train_X)
        X_train, y_train = zip(*zip_train_X)
        X_train, y_train = np.array(list(X_train)), list(y_train)
        y_train = keras.utils.to_categorical(y_train, num_classes=2)
        y_dev = data_loader.get_y_dev()
        y_dev_categorical = keras.utils.to_categorical(y_dev, num_classes=2)
        print('X_train shape:', X_train.shape)

        # _activations = ['tanh', 'relu', 'selu']
        # _optimizers = ['sgd', 'adam']
        # _batch_size = [16, 32, 64]
        # params = dict(var_activation=_activations,
        #               var_optimizer=_optimizers,
        #               batch_size=_batch_size)

        # tokenizer = Tokenizer()
        # tokenizer.fit_on_texts(X_train)
        # maxlen = 1000
        # sequences_train = tokenizer.texts_to_sequences(X_train)
        # sequences_train = pad_sequences(sequences_train, maxlen=maxlen)
        #
        # vocab_size = len(tokenizer.word_index) + 1
        # embedding_size = vec_max_features
        #
        # input_tfidf = Input(shape=(vec_max_features,))
        # input_text = Input(shape=(maxlen,))
        #
        # embedding = Embedding(vocab_size, embedding_size, input_length=maxlen)(input_text)
        # mean_embedding = keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=1))(embedding)
        # concatenated = concatenate([input_tfidf, mean_embedding])
        #
        # dense1 = Dense(256, activation='relu')(concatenated)
        # dense2 = Dense(32, activation='relu')(dense1)
        # dense3 = Dense(8, activation='sigmoid')(dense2)
        #
        # model = Model(inputs=[input_tfidf, input_text], outputs=dense3)
        # model.summary()
        # model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        num_classes = 2
        verbose, epochs, batch_size = 0, 30, 16
        print('Building model...')
        model = Sequential()
        model.add(Dropout(0.25))
        model.add(Dense(256, activation='relu',
                        activity_regularizer=l1(0.001)))
        model.add(Dense(64, activation='relu', activity_regularizer=l1(0.01)))
        model.add(BatchNormalization())
        model.add(Dense(num_classes, activation='softmax'))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        print("Fitting model... ")
        model.fit(X_train,
                  y_train,
                  epochs=epochs,
                  batch_size=batch_size,
                  validation_data=(X_dev, y_dev_categorical),
                  shuffle=True)
        y_dev_pred = model.predict_classes(X_dev)
        y_test_pred = model.predict_classes(X_test)
        print('Precision on development set: ',
              precision_score(y_dev, y_dev_pred))
        print('F1-Score on development set: ', f1_score(y_dev, y_dev_pred))
        print('Recall on development set: ', recall_score(y_dev, y_dev_pred))
        print('Negative prediction proportion on development set: ',
              sum([1 for y in y_dev_pred if y == 0]) / len(y_dev_pred))
        print('Negative prediction proportion on test set: ',
              sum([1 for y in y_test_pred if y == 0]) / len(y_test_pred))
        return y_test_pred
    def process_cnn_embedding(self):
        data_loader = dataLoader()
        train_docs = data_loader.get_train_docs()
        y_train = data_loader.get_y_train()

        zip_train_X = list(zip(train_docs, y_train))
        random.shuffle(zip_train_X)
        train_docs, y_train = zip(*zip_train_X)
        train_docs, y_train = list(train_docs), list(y_train)

        dev_docs = data_loader.get_dev_docs()
        test_docs = data_loader.get_test_docs()
        self.tokenizer.fit_on_texts(train_docs + dev_docs + test_docs)
        encoded_docs = self.tokenizer.texts_to_sequences(train_docs)
        # pad documents to a max length of 10000 words
        max_length = 10000
        X_train = pad_sequences(encoded_docs,
                                maxlen=max_length,
                                padding='post')
        y_train = keras.utils.to_categorical(y_train, num_classes=2)

        # vocab_size = len(self.tokenizer.word_index) + 1
        glove_vec_dim = 300
        word_index = self.tokenizer.word_index
        embedding_matrix = self.create_cnn_embedding_matrix(
            word_index, self.vec_dict, max_length)

        emb_mean, emb_std = -0.005838499, 0.48782197
        all_embs = np.stack(self.vec_dict.values())
        embed_size = all_embs.shape[1]
        nb_words = min(max_length, len(word_index))
        embedding_matrix2 = np.random.normal(emb_mean, emb_std,
                                             (nb_words, embed_size))

        print('Embedding matrix shape: ', embedding_matrix.shape)

        print('Building model...')
        # input1 = Input(shape=(embedding_matrix.shape[0],))
        # embedding1 = Embedding(embedding_matrix.shape[0], glove_vec_dim, weights=[embedding_matrix],
        #               input_length=embedding_matrix.shape[0])(input1)
        # conv1 = Conv1D(filters=16, kernel_size=3, activation='relu', activity_regularizer=l1(0.001))(embedding1)
        # drop1 = Dropout(0.2)(conv1)
        # conv1 = MaxPooling1D(pool_size=2)(drop1)
        #
        # input2 = Input(shape=(embedding_matrix.shape[0],))
        # embedding2 = Embedding(embedding_matrix.shape[0], glove_vec_dim,
        #                        input_length=embedding_matrix.shape[0], trainable=True)(input2)
        # conv2 = Conv1D(filters=16, kernel_size=3, activation='relu',)(embedding2)
        # drop2 = Dropout(0.2)(conv2)
        # conv2 = MaxPooling1D(pool_size=2)(drop2)

        # input2 = Input(shape=(embedding_matrix.shape[0],))
        # embedding2 = Embedding(embedding_matrix.shape[0], glove_vec_dim, trainable=True)(input2)
        # conv2 = Conv1D(filters=32, kernel_size=4, activation='relu', activity_regularizer=l1(0.001))(embedding2)
        # drop1 = Dropout(0.2)(conv2)
        # conv2 = MaxPooling1D(pool_size=2)(drop1)

        # cnn = concatenate([conv1, conv2], axis=-1)
        # flat = Flatten()(cnn)
        # # normal = BatchNormalization()(flat)
        # # x = Dense(128, activation="relu")(flat)
        # x = Dense(2, activation="softmax")(flat)
        # model = Model(inputs=[input1, input2], outputs=x)
        # model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # # plot_model(model, show_shapes=True, to_file='multichannel.png')

        model = Sequential()
        model.add(
            Embedding(embedding_matrix.shape[0],
                      glove_vec_dim,
                      weights=[embedding_matrix],
                      input_length=embedding_matrix.shape[0]))
        model.add(Dropout(0.25))
        model.add(
            Conv1D(filters=4,
                   kernel_size=3,
                   activation='relu',
                   activity_regularizer=l1(0.01)))
        # model.add(SpatialDropout1D(0.2))
        model.add(MaxPooling1D(pool_size=4))
        # model.add(Conv1D(filters=128, kernel_size=4, activation='relu', activity_regularizer=l1(0.001)))
        # model.add(MaxPooling1D(pool_size=4))
        # model.add(Conv1D(filters=128, kernel_size=4, activation='relu', activity_regularizer=l1(0.001)))
        # model.add(MaxPooling1D(pool_size=4))
        # model.add(LSTM(70))
        # model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(2, activation='softmax'))
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        print(model.summary())
        encoded_dev_docs = self.tokenizer.texts_to_sequences(dev_docs)
        X_dev = pad_sequences(encoded_dev_docs,
                              maxlen=max_length,
                              padding='post')
        y_dev = data_loader.get_y_dev()
        y_dev_categorical = keras.utils.to_categorical(y_dev, num_classes=2)
        print("Fitting model... ")
        model.fit(X_train,
                  y_train,
                  epochs=3,
                  batch_size=32,
                  validation_data=(X_dev, y_dev_categorical),
                  shuffle=True)
        # evaluate the model
        y_dev_pred = model.predict(X_dev)
        y_dev_pred = np.argmax(y_dev_pred, axis=1)
        # loss, accuracy = model.evaluate(X_train, y_train, verbose=0)
        # print('Accuracy on training set: ', accuracy)
        print('Accuracy on development set: ',
              accuracy_score(y_dev, y_dev_pred))
        print('Precision on development set: ',
              precision_score(y_dev, y_dev_pred))
        print('F1-Score on development set: ', f1_score(y_dev, y_dev_pred))
        print('Recall on development set: ', recall_score(y_dev, y_dev_pred))
        print('Zero percentage on development set: ',
              sum([1 for y in y_dev_pred if y == 0]) / len(y_dev_pred))
        encoded_test_docs = self.tokenizer.texts_to_sequences(test_docs)
        X_test = pad_sequences(encoded_test_docs,
                               maxlen=max_length,
                               padding='post')
        y_test_pred = model.predict(X_test)
        y_test_pred = np.argmax(y_test_pred, axis=1)
        # y_test_pred = model.predict_classes(X_test)
        print('Zero percentage on test set: ',
              sum([1 for y in y_test_pred if y == 0]) / len(y_test_pred))
        return y_test_pred