Example #1
def train_one_epoch(model, param_dict, input_names, output_name, X_shuffled,
                    Y_shuffled):
    batch_size = int(param_dict['batch_size'])
    l = len(X_shuffled)
    train_loss = 0
    train_acc = 0
    for i in range(0, l, batch_size):
        batch_end = min(i + batch_size, l)
        Xs = X_shuffled[i:batch_end]
        Ys = Y_shuffled[i:batch_end]
        batchsize = len(Xs)

        batch = gd.get_batch(Xs, param_dict)
        batch = da.augment_data(batch, param_dict, "train")

        # If the model is single-stream there is one input name, otherwise there are two.
        fit_input = make_model_input_dict(input_names, batch)

        # Train on single batch
        history = model.fit(fit_input, {output_name: Ys}, batch_size=batchsize)

        train_loss += float(history.history['loss'][0]) * (batchsize / l)
        train_acc += float(history.history['acc'][0]) * (batchsize / l)

    return model, train_loss, train_acc
Example #2
def validate_one_epoch(model, param_dict, input_names, output_name, X_val,
                       Y_val):
    X_val_data = gd.get_batch(X_val, param_dict)
    X_val_augmented = da.augment_data(X_val_data, param_dict, "val")

    # Evaluate on validation data
    print("Evaluating On Validation Data...")
    fit_input = make_model_input_dict(input_names, X_val_augmented)
    val_loss, val_acc = model.evaluate(fit_input, {output_name: Y_val})
    return val_loss, val_acc
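
As a usage note, here is a minimal sketch of how these two helpers might be driven from an outer epoch loop. The shuffle_in_unison helper, the 'epochs' key in param_dict, and the X_train/Y_train/X_val/Y_val names are assumptions for illustration, not part of the original example:

import numpy as np

def shuffle_in_unison(X, Y):
    # Hypothetical helper: shuffle features and labels with one permutation.
    perm = np.random.permutation(len(X))
    return [X[j] for j in perm], [Y[j] for j in perm]

for epoch in range(int(param_dict['epochs'])):  # assumed config key
    X_shuffled, Y_shuffled = shuffle_in_unison(X_train, Y_train)
    model, train_loss, train_acc = train_one_epoch(
        model, param_dict, input_names, output_name, X_shuffled, Y_shuffled)
    val_loss, val_acc = validate_one_epoch(
        model, param_dict, input_names, output_name, X_val, Y_val)
    print("epoch %d: loss %.4f acc %.4f val_loss %.4f val_acc %.4f"
          % (epoch, train_loss, train_acc, val_loss, val_acc))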
Example #3
 def test_augment_data(self):
   original_data = [
       np.random.rand(128, 3).tolist(),
       np.random.rand(66, 2).tolist(),
       np.random.rand(9, 1).tolist()
   ]
   original_label = ["data", "augmentation", "test"]
   augmented_data, augmented_label = augment_data(original_data,
                                                  original_label)
   self.assertEqual(25 * len(original_data), len(augmented_data))
   self.assertIsInstance(augmented_data, list)
   self.assertEqual(25 * len(original_label), len(augmented_label))
   self.assertIsInstance(augmented_label, list)
   for i in range(len(original_label)):
     self.assertEqual(augmented_label[25 * i], original_label[i])
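
The test above only pins down a contract: every (sample, label) pair expands into 25 entries, and index 25 * i in the augmented lists holds the original i-th label (and, by convention, the original sample). Below is a minimal sketch of an augment_data that would satisfy this contract; the noise-jitter transform and the copies/noise_std parameters are assumptions, not the implementation the test actually exercises:

import numpy as np

def augment_data(data, labels, copies=25, noise_std=0.01):
    # Hypothetical sketch: keep the original sample first, then append
    # (copies - 1) noise-jittered variants, all sharing the same label.
    augmented_data, augmented_labels = [], []
    for sample, label in zip(data, labels):
        arr = np.asarray(sample, dtype=float)
        augmented_data.append(arr.tolist())  # original lands at index 25 * i
        augmented_labels.append(label)
        for _ in range(copies - 1):
            jittered = arr + np.random.normal(0.0, noise_std, arr.shape)
            augmented_data.append(jittered.tolist())
            augmented_labels.append(label)
    return augmented_data, augmented_labels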
Example #4
 def get_data_file(self, data_path, data_type):
     """Get train, valid and test data from files."""
     data = []
     label = []
     with open(data_path, "r") as f:
         lines = f.readlines()
         for idx, line in enumerate(lines):  # pylint: disable=unused-variable
             dic = json.loads(line)
             data.append(dic[DATA_NAME])
             label.append(dic[LABEL_NAME])
     if data_type == "train":
         data, label = augment_data(data, label)
     length = len(label)
     print(data_type + "_data_length:" + str(length))
     return data, label, length
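
get_data_file expects a JSON-Lines file, one record per line, keyed by the DATA_NAME and LABEL_NAME constants defined elsewhere in the module. A small sketch of writing a compatible file, assuming those constants resolve to "data" and "label":

import json

DATA_NAME, LABEL_NAME = "data", "label"  # assumed values of the constants

samples = [([0.1, 0.2, 0.3], "walk"),
           ([0.4, 0.5, 0.6], "run")]
with open("train_data.jsonl", "w") as f:
    for features, label in samples:
        f.write(json.dumps({DATA_NAME: features, LABEL_NAME: label}) + "\n")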
Example #5
from pydash import flatten
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

from data.utils import load_data
from preprocessing import preprocess_data
from visualization import plot_learning_curves, get_errors_input
from metrics import custom_map_at_k
from feature_selection import get_features_extractor
from data_augmentation import augment_data

print('Augmenting training data set')
augment_data('train.csv', 'train_augmented.csv')

print('Loading training and testing set')
train_data = load_data('train_augmented.csv')
test_data = load_data('test.csv')

print('Preprocessing')
X_train, Y_train = preprocess_data(train_data)
X_test, Y_test = preprocess_data(test_data)

model_name = 'lr'

# print('Loading model')
# model = joblib.load('./models/' + model_name + '_classifier.pkl')
print('Fitting model')
model = Pipeline([
Example #6
def main(dataset_dir):
    augment_both = True  # to augment the RGB and target (edge_map) image at the same time
    augment_data(base_dir=dataset_dir,
                 augment_both=augment_both,
                 use_all_type=True)
Example #7
    def train(self, \
              xdata, ydata, zdata, x_lengths, y_lengths, \
              xdevdata, ydevdata, zdevdata, xdev_lengths, ydev_lengths, \
              xxdata, yydata, zzdata, xx_lengths, yy_lengths, \
              MAXITER):

        merged_sum = tf.summary.merge_all()

        # writer = tf.train.SummaryWriter("./logs/%s" % "modeldir", self.sess.graph_def)

        tf.initialize_all_variables().run()

        start_time = time.time()

        best_val_loss = 1e100
        best_val_acc = 0.0

        for ITER in range(MAXITER):
            total_acc = 0.0
            print('**************EPOCH****************\n', str(ITER))
            epoch_start_time = time.time()
            total_loss = 0
            # xdata, ydata, zdata, x_lengths, y_lengths = joint_shuffle(xdata, ydata, zdata, x_lengths, y_lengths)
            for i in xrange(0, len(xdata), self.batch_size):
                x, y, z, xlen, ylen = xdata[i:i + self.batch_size], \
                                ydata[i:i + self.batch_size], \
                                zdata[i:i + self.batch_size], \
                                x_lengths[i:i + self.batch_size], \
                                y_lengths[i:i + self.batch_size]

                x, y, z, xlen, ylen = augment_data(x, y, z, xlen, ylen)

                feed_dict = {self.x: x,
                             self.y: y,
                             self.target: z,
                             self.x_length: xlen,
                             self.y_length: ylen,
                             self.is_training: 1,
                             self.dropout_keep_prob: 1}

                att, _, loss, acc, summ = self.sess.run(
                    [self.att, self.optim, self.loss, self.acc, merged_sum],
                    feed_dict=feed_dict)

                total_loss += loss
                total_acc += acc

            print("Epoch Time: ", time.time() - epoch_start_time)

            total_loss = total_loss / float(len(xdata))
            total_acc = total_acc / float(len(xdata) / self.batch_size)
            print("Loss", total_loss, "Accuracy On Training", total_acc)

            total_val_loss, total_val_acc = self.validate(
                xdevdata, ydevdata, zdevdata, xdev_lengths, ydev_lengths, ITER)

            if (best_val_loss >= total_val_loss
                    or best_val_acc <= total_val_acc):
                if (best_val_loss >= total_val_loss):
                    best_val_loss = total_val_loss
                if (best_val_acc <= total_val_acc):
                    best_val_acc = total_val_acc
                self.test(xxdata, yydata, zzdata, xx_lengths, yy_lengths, ITER)

        elapsed_time = time.time() - start_time

        print("Total Time", elapsed_time)
Example #8
def main():
    """
		Main del archivo, este archivo se encarga de preprocesar las noticias, "limpiar la data" en terminos generales. Hace las operaciones que estén definidas en el método
		transform_tring.

		Primero ejecuta la funcion transform string, luego calcula las salidas de los ejemplos, luego elimina las palabras menos frecuentes y por último guarda estos últimos datos.

	"""
    # dfr = pd.read_csv("newsDatabaseComplete14.csv", header=0, index_col=0)
    # dfr = pd.read_csv("newsDatabaseComplete14_filtered.csv", header=0, index_col=0)
    dfr = pd.read_csv("newsDatabaseComplete14_filtered_mixed.csv",
                      header=0,
                      index_col=0)
    # dfr = pd.read_csv("newsDatabaseComplete14_filtered_augmented.csv", header=0, index_col=0)

    words_in_glove = read_embedd_vectors(0)  # change for different embedding

    supported_langs = ['en']
    classes = [-1.0, 0.0, 1.0]

    # eliminate non-classes examples
    dfr.dropna(subset=['classes'], inplace=True)
    dfr.index = np.arange(dfr.shape[0])

    dftr, dfte = split_uniformly(dfr, 0.8, classes)

    # augment_data
    import data_augmentation
    dftr, n_perms = data_augmentation.augment_data(dftr)

    # implementation asking people for classes
    for i in range(dftr.shape[0]):
        lang = detect(dftr['content'][i])
        if (lang in supported_langs):
            tmp = get_raw_data(dftr['title'][i], dftr['content'][i])
            dftr.loc[i, 'content'] = transform_string(tmp, words_in_glove,
                                                      lang, dftr['source'][i])
        else:
            print('language: %s not supported. Notice id: %d' % (lang, i))
            dftr.loc[i, 'content'] = ''

    for i in range(dfte.shape[0]):
        lang = detect(dfte['content'][i])
        if (lang in supported_langs):
            tmp = get_raw_data(dfte['title'][i], dfte['content'][i])
            dfte.loc[i, 'content'] = transform_string(tmp, words_in_glove,
                                                      lang, dfte['source'][i])
        else:
            print('language: %s not supported. Notice id: %d' % (lang, i))
            dfte.loc[i, 'content'] = ''

    # train
    word_to_frecuency = get_word_to_frecuency(dftr['content'])

    # dfr = eliminate_less_frequent_words(dfr, 5, word_to_frecuency)
    dftr = eliminate_less_frequent_words(dftr, 5 * n_perms, word_to_frecuency)

    # eliminate empty strings from dataframe
    dftr['content'].replace('', np.nan, inplace=True)
    dftr.dropna(subset=['content'], inplace=True)
    dftr.index = np.arange(dftr.shape[0])

    # remap class labels (-1, 0, 1) -> (0, 1, 2) to avoid a formatting problem with PyTorch
    dftr['classes'].replace(1, 2, inplace=True)
    dftr['classes'].replace(0, 1, inplace=True)
    dftr['classes'].replace(-1, 0, inplace=True)

    # dfr.to_csv('data14Deps.csv')
    dftr.to_csv('data14Glove_train.csv')  # change for different embedding

    # test
    word_to_frecuency = get_word_to_frecuency(dfte['content'])

    dfte = eliminate_less_frequent_words(dfte, 5, word_to_frecuency)

    # eliminate empty strings from dataframe
    dfte['content'].replace('', np.nan, inplace=True)
    dfte.dropna(subset=['content'], inplace=True)
    dfte.index = np.arange(dfte.shape[0])

    # remap class labels (-1, 0, 1) -> (0, 1, 2) to avoid a formatting problem with PyTorch
    dfte['classes'].replace(1, 2, inplace=True)
    dfte['classes'].replace(0, 1, inplace=True)
    dfte['classes'].replace(-1, 0, inplace=True)

    dfte.to_csv('data14Glove_test.csv')  # change for different embedding

    vals = dftr.classes.value_counts()
    sns.barplot(x=[0, 1, 2], y=[vals[0], vals[1], vals[2]])
    plt.show()
Example #9
weights_pre = model.get_weights()
model = tu.load_weights(model, param_dict, stream, False, "test")
weights_after = model.get_weights()
weight_names = [weight.name for layer in model.layers for weight in layer.weights]
tu.check_weights(weights_pre, weights_after, weight_names)


# If the frame at second 1 is used (time of shot in B3SD dataset), then
#  we only get that particular frame for each video.

if sec1_frame:
    print("1sec_frame set, so choosing that frame for each video")
    result_file.write("1sec_frame set, so choosing that frame for each video\n")
    X = gd.get_batch(X, param_dict)
    augmented_X = da.augment_data(X, param_dict, "test")


    fit_input = tu.make_model_input_dict(input_names, augmented_X)
    predictions_categorical = model.predict(fit_input, verbose=1)
    
    print(predictions_categorical.shape)
    predictions = np.asarray([np.argmax(pred) for pred in predictions_categorical])
    print(predictions.shape)

    acc, correct_predictions = tpu.calculate_accuracy(predictions, Y, len(predictions))

# If sec1_frame is false, then we select x frames from each video for a 
#   more averaged guess for each video.
else:
    print("Augmenting each input "+str(repeats)+" times")
Example #10
            print('Selecting and assigning validation set...')
            print('-' * 30)
            train_indices,val_indices = train_test_split(np.arange(trainingFeatures.shape[0]),\
                test_size=pm.validation_fraction)
            valFeatures = trainingFeatures[val_indices]
            trainingFeatures = trainingFeatures[train_indices]
            valLabels = trainingLabels[val_indices]
            trainingLabels = trainingLabels[train_indices]

            # Augmenting the training data and adding this to the training data set
            if pm.data_augm:
                print('-' * 30)
                print('Augmenting data...')
                print('-' * 30)
                augm_trainingFeatures, augm_trainingLabels = \
                    augment_data(trainingFeatures, trainingLabels, pm.nb_augm_samples, pm.augm_transformations)
                trainingFeatures = np.concatenate(
                    (trainingFeatures, augm_trainingFeatures), axis=0)
                trainingLabels = np.concatenate(
                    (trainingLabels, augm_trainingLabels), axis=0)

            # Run the main function
            train_and_predict()

            # Calculating and saving run time
            end_time = datetime.now()
            total_time = time_diff_format(start_time, end_time)
            pm.time_list.append(total_time)

    # Save the data to an xlsx-file and an image.
    write_save_data()
Example #11
initial_learning_rate = 0.001
cumulative_loss = 0.0
BATCH_SIZE = 20
TRAIN_SIZE = 60000
NUM_EPOCHS = 100
EPOCH_SIZE = TRAIN_SIZE // BATCH_SIZE  # number of batches per epoch
num_iterations = int(NUM_EPOCHS * EPOCH_SIZE)

# Run training loop
with sess.as_default():
    for i in range(1, num_iterations + 1):
        current_learning_rate = initial_learning_rate * (1.0 - i /
                                                         (num_iterations + 5))
        batch = mnist_data.train.next_batch(BATCH_SIZE)
        train, train_labels = da.augment_data(batch[0],
                                              batch[1],
                                              use_random_zoom=False,
                                              use_random_shift=False)

        _, loss_val = sess.run(
            [train_step, loss],
            feed_dict={
                img: train,
                labels: train_labels,
                is_train: True,
                lr: current_learning_rate
            })
        cumulative_loss = cumulative_loss + loss_val
        if i % EPOCH_SIZE == 0:
            print(str(cumulative_loss / EPOCH_SIZE))
            cumulative_loss = 0.0
        if i % (10 * EPOCH_SIZE) == 0:
Example #12
    # Initialize all global variables, which includes the weights and biases.
    sess.run(tf.global_variables_initializer())

    # training start
    for epoch in range(0, NUM_EPOCHS):
        total_cost = 0

        for i in range(0, int(NUM_EXAMPLES / BATCH_SIZE)):
            # Get one batch of features and one batch of labels of size BATCH_SIZE.
            batch_x = get_batch(dataset_train_features, i, BATCH_SIZE)
            batch_y = get_batch(dataset_train_labels, i, BATCH_SIZE)

            batch_x, batch_y = augment_data(
                batch_x, batch_y, augmentation_factor=1)  # augment the data

            _, batch_cost = sess.run(
                [training, loss], feed_dict={
                    x: batch_x,
                    y: batch_y
                })  # train on the given batch size of features and labels
            total_cost += batch_cost
            if i % 25 == 0:
                print(i)

        print("Epoch:", epoch, "\tCost:", total_cost)

        # predict validation accuracy after every epoch
        sum_accuracy_validation = 0.0
        sum_i = 0

weights_pre = model_part1.get_weights()
model_part1 = tu.load_weights(model_part1, param_dict_part1, stream_part1, True, "test")
weights_after = model_part1.get_weights()
weight_names = [weight.name for layer in model_part1.layers for weight in layer.weights]
tu.check_weights(weights_pre, weights_after, weight_names)


# If the frame at second 1 is used (time of shot in B3SD dataset), then
#  we only get that particular frame for each video.

print("1sec_frame set, so choosing that frame for each video")
result_file.write("1sec_frame set, so choosing that frame for each video\n")
X2_data = gd.get_batch(X2, param_dict_part1)
augmented_X2 = da.augment_data(X2_data, param_dict_part1, "test")


fit_input2 = tu.make_model_input_dict(input_part1, augmented_X2)
predictions_categorical2 = model_part1.predict(fit_input2, verbose=1)
    
print(predictions_categorical2.shape)
predictions2 = np.asarray([np.argmax(pred) for pred in predictions_categorical2])
print(predictions2.shape)
acc2, correct_predictions2 = tpu.calculate_accuracy(predictions2, Y2, len(predictions2))


correct=0
ones = 0
actual_shots=0
actual_no_shots=0