Exemplo n.º 1
# Run one pass over the (already shuffled) training samples, fitting the model
# on one mini-batch at a time, and return the model together with the
# batch-size-weighted loss and accuracy accumulated over the pass.
#
# NOTE(review): the signature is cut off after `X_shuffled,` in this excerpt.
# The body also reads `Y_shuffled`, which is presumably one of the missing
# parameters -- confirm against the full source.
def train_one_epoch(model, param_dict, input_names, output_name, X_shuffled,
    batch_size = int(param_dict['batch_size'])
    l = len(X_shuffled)
    # Running totals; every batch adds its metric scaled by batchsize / l.
    train_loss = 0
    train_acc = 0
    for i in range(0, l, batch_size):
        # Clamp the slice end so the last batch may be shorter than batch_size.
        batch_end = min(i + batch_size, l)
        Xs = X_shuffled[i:batch_end]
        Ys = Y_shuffled[i:batch_end]
        batchsize = len(Xs)

        # Load the batch for these samples, then apply "train"-mode augmentation.
        batch = gd.get_batch(Xs, param_dict)
        batch = da.augment_data(batch, param_dict, "train")

        # If single stream model, we have 1 input_name, otherwise 2
        fit_input = make_model_input_dict(input_names, batch)

        # Train on single batch
        history = model.fit(fit_input, {output_name: Ys}, batch_size=batchsize)

        # Only the first recorded value of each metric is used.
        # NOTE(review): `batchsize / l` must be true division for the weights
        # to be non-zero; under Python 2 integer division it would be 0 for
        # every batch shorter than the data set -- confirm interpreter version.
        train_loss += float(history.history['loss'][0]) * (batchsize / l)
        train_acc += float(history.history['acc'][0]) * (batchsize / l)

    return model, train_loss, train_acc
Exemplo n.º 2
# Evaluate the model once on the whole validation set and return
# (val_loss, val_acc) as produced by `model.evaluate`.
#
# NOTE(review): the signature is cut off after `X_val,` in this excerpt; the
# body also reads `Y_val`, presumably one of the missing parameters -- confirm.
def validate_one_epoch(model, param_dict, input_names, output_name, X_val,
    # The full validation set is loaded and processed in "val" mode in one go
    # (no mini-batching is done here).
    X_val_data = gd.get_batch(X_val, param_dict)
    X_val_augmented = da.augment_data(X_val_data, param_dict, "val")

    # Evaluate on validation data
    print("Evaluating On Validation Data...")
    fit_input = make_model_input_dict(input_names, X_val_augmented)
    # NOTE(review): unpacking into two values assumes the model reports exactly
    # one metric besides the loss -- confirm against the model's compile call.
    val_loss, val_acc = model.evaluate(fit_input, {output_name: Y_val})
    return val_loss, val_acc
Exemplo n.º 3
 # Unit test for `augment_data`: checks that both returned values are lists
 # 25 times as long as the inputs and that every 25th augmented label equals
 # the corresponding original label.
 #
 # NOTE(review): the list literal below is missing its closing bracket and the
 # `augment_data(` call is cut off in this excerpt.
 def test_augment_data(self):
   # Three random samples with different lengths and widths, as nested lists.
   original_data = [
       np.random.rand(128, 3).tolist(),
       np.random.rand(66, 2).tolist(),
       np.random.rand(9, 1).tolist()
   original_label = ["data", "augmentation", "test"]
   augmented_data, augmented_label = augment_data(original_data,
   # The test expects exactly 25 augmented items per original item.
   self.assertEqual(25 * len(original_data), len(augmented_data))
   self.assertIsInstance(augmented_data, list)
   self.assertEqual(25 * len(original_label), len(augmented_label))
   self.assertIsInstance(augmented_label, list)
   # The first label of each group of 25 must be the matching original label.
   for i in range(len(original_label)):
     self.assertEqual(augmented_label[25 * i], original_label[i])
Exemplo n.º 4
 def get_data_file(self, data_path, data_type):
     """Read a JSON-lines file and return ``(data, label, length)``.

     Each line of ``data_path`` is decoded with ``json.loads`` (a malformed
     line raises).  The code visible here does not store the decoded record,
     so ``data`` and ``label`` are still empty when the optional augmentation
     step runs.  When ``data_type`` is ``"train"`` both lists are passed
     through ``augment_data``.  The label count is printed and returned as
     the third element.
     """
     samples, targets = [], []
     with open(data_path, "r") as handle:
         for raw_line in handle:
             # Decoding validates the line; the record itself is not kept.
             json.loads(raw_line)
     if data_type == "train":
         samples, targets = augment_data(samples, targets)
     count = len(targets)
     summary = data_type + "_data_length:" + str(count)
     print(summary)
     return samples, targets, count
Exemplo n.º 5
from pydash import flatten
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.externals import joblib
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

from data.utils import load_data
from preprocessing import preprocess_data
from visualization import plot_learning_curves, get_errors_input
from metrics import custom_map_at_k
from feature_selection import get_features_extractor
from data_augmentation import augment_data

# Script body: augment the training CSV, load and preprocess the train/test
# sets, then start building a scikit-learn Pipeline.
# NOTE(review): `augment_data` is called with an input and an output path and
# its return value is ignored; presumably it writes 'train_augmented.csv',
# which is loaded right below -- confirm in data_augmentation.
print('Augmenting training data set')
augment_data('train.csv', 'train_augmented.csv')

print('Loading training and testing set')
train_data = load_data('train_augmented.csv')
test_data = load_data('test.csv')

# Both sets go through the same preprocessing and yield (features, targets).
X_train, Y_train = preprocess_data(train_data)
X_test, Y_test = preprocess_data(test_data)

# Short model tag; the disabled code below uses it to build the pickle path.
model_name = 'lr'

# Disabled alternative: load a previously saved model instead of fitting one.
# print('Loading model')
# model = joblib.load('./models/' + model_name + '_classifier.pkl')
print('Fitting model')
# NOTE(review): the Pipeline definition is cut off here in this excerpt.
model = Pipeline([
Exemplo n.º 6
def main(dataset_dir):
    """Set the `augment_both` flag; nothing else is visible in this excerpt.

    `dataset_dir` is accepted but not used by the visible lines, and the
    function returns None as shown.
    NOTE(review): the body appears to be truncated -- confirm against the
    full source.
    """
    augment_both = True  # to augment the RGB and target (edge_map) image at the same time
Exemplo n.º 7
    # Training loop: for each of MAXITER epochs, run the optimizer over the
    # training data in slices of self.batch_size, print the epoch's loss and
    # accuracy, validate on the dev data, and run the test pass whenever the
    # validation loss or accuracy is at least as good as the best seen so far.
    #
    # NOTE(review): the parameter list is cut off after `yy_lengths, \` and the
    # `self.sess.run(` call further down is missing its tail in this excerpt.
    # NOTE(review): `xrange` below exists only in Python 2.
    def train(self, \
              xdata, ydata, zdata, x_lengths, y_lengths, \
              xdevdata, ydevdata, zdevdata, xdev_lengths, ydev_lengths, \
              xxdata, yydata, zzdata, xx_lengths, yy_lengths, \

        # One fetchable op that bundles all registered TensorFlow summaries.
        merged_sum = tf.summary.merge_all()

        # writer = tf.train.SummaryWriter("./logs/%s" % "modeldir", self.sess.graph_def)


        start_time = time.time()

        # Best validation metrics so far; 1e100 stands in for "no loss seen yet".
        best_val_loss = 1e100
        best_val_acc = 0.0

        for ITER in range(MAXITER):
            total_acc = 0.0
            print('**************EPOCH****************\n', str(ITER))
            epoch_start_time = time.time()
            total_loss = 0
            # Per-epoch shuffling is currently disabled:
            # xdata, ydata, zdata, x_lengths, y_lengths = joint_shuffle(xdata, ydata, zdata, x_lengths, y_lengths)
            # Slice all five parallel sequences with the same window so they stay aligned.
            for i in xrange(0, len(xdata), self.batch_size):
                x, y, z, xlen, ylen = xdata[i:i + self.batch_size], \
                                ydata[i:i + self.batch_size], \
                                zdata[i:i + self.batch_size], \
                                x_lengths[i:i + self.batch_size], \
                                y_lengths[i:i + self.batch_size]

                # Augmentation is applied per batch, inside the loop.
                x, y, z, xlen, ylen = augment_data(x, y, z, xlen, ylen)

                # NOTE(review): dropout_keep_prob is fed as 1 although
                # is_training is 1, so no units would be dropped -- confirm
                # this is intended.
                feed_dict = {self.x: x, \
                             self.y: y, \
                             self.target: z, \
                             self.x_length:xlen, \
                             self.y_length:ylen, \
                self.is_training:1, \
        self.dropout_keep_prob:1 }

                # Fetching self.optim is what applies the update; att and summ
                # are fetched but not used in the visible lines.
                att, _, loss, acc, summ = self.sess.run(
                    [self.att, self.optim, self.loss, self.acc, merged_sum],

                total_loss += loss
                total_acc += acc

            print("Epoch Time: ", time.time() - epoch_start_time)

            # NOTE(review): the two totals are normalised differently -- loss
            # by the number of examples, accuracy by len(xdata) / batch_size
            # (the batch count, possibly fractional or integer-divided).
            # Confirm the loss is a per-batch sum rather than a per-batch mean.
            total_loss = total_loss / float(len(xdata))
            total_acc = total_acc / float(len(xdata) / self.batch_size)
            print("Loss", total_loss, "Accuracy On Training", total_acc)

            total_val_loss, total_val_acc = self.validate(
                xdevdata, ydevdata, zdevdata, xdev_lengths, ydev_lengths, ITER)

            # Ties count as improvements (>= / <=); either metric improving is
            # enough to update its best value and trigger a test run.
            if (best_val_loss >= total_val_loss
                    or best_val_acc <= total_val_acc):
                if (best_val_loss >= total_val_loss):
                    best_val_loss = total_val_loss
                if (best_val_acc <= total_val_acc):
                    best_val_acc = total_val_acc
                self.test(xxdata, yydata, zzdata, xx_lengths, yy_lengths, ITER)

        elapsed_time = time.time() - start_time

        print("Total Time", elapsed_time)
def main():
    # NOTE(review): the two text lines below are the original Spanish docstring
    # whose quote delimiters are missing in this excerpt. English translation:
    #   "Main of this file; it preprocesses the news, i.e. 'cleans the data'
    #    in general terms. It performs the operations defined in the method.
    #    It first runs the transform string function, then computes the
    #    outputs of the examples, then removes the least frequent words and
    #    finally saves the resulting data."
		Main del archivo, este archivo se encarga de preprocesar las noticias, "limpiar la data" en terminos generales. Hace las operaciones que estén definidas en el método

		Primero ejecuta la funcion transform string, luego calcula las salidas de los ejemplos, luego elimina las palabras menos frecuentes y por último guarda estos últimos datos.

    # Alternative input files are kept as commented-out lines.
    # NOTE(review): the active read_csv call is cut off after its first argument.
    # dfr = pd.read_csv("newsDatabaseComplete14.csv", header=0, index_col=0)
    # dfr = pd.read_csv("newsDatabaseComplete14_filtered.csv", header=0, index_col=0)
    dfr = pd.read_csv("newsDatabaseComplete14_filtered_mixed.csv",
    # dfr = pd.read_csv("newsDatabaseComplete14_filtered_augmented.csv", header=0, index_col=0)

    # NOTE(review): the meaning of the argument 0 is not visible here;
    # presumably it selects which embedding to load -- confirm.
    words_in_glove = read_embedd_vectors(
        0)  ############# change for different embedding

    supported_langs = ['en']
    classes = [-1.0, 0.0, 1.0]

    # eliminate non-classes examples
    dfr.dropna(subset=['classes'], inplace=True)
    # Re-number rows 0..n-1 so the positional loops below can index by i.
    dfr.index = np.arange(dfr.shape[0])

    # NOTE(review): 0.8 is presumably the training fraction -- confirm in split_uniformly.
    dftr, dfte = split_uniformly(dfr, 0.8, classes)

    # Augment the training split only; n_perms later scales the frequency threshold.
    import data_augmentation
    dftr, n_perms = data_augmentation.augment_data(dftr)

    # implementation asking people for classes
    # NOTE(review): in both loops below an `else:` seems to be missing before
    # the "not supported" print; as shown, the content would be blanked for
    # every row, supported or not -- confirm against the full source.
    for i in range(dftr.shape[0]):
        lang = detect(dftr['content'][i])
        if (lang in supported_langs):
            tmp = get_raw_data(dftr['title'][i], dftr['content'][i])
            dftr.loc[i, 'content'] = transform_string(tmp, words_in_glove,
                                                      lang, dftr['source'][i])
            print('language: %s not supported. Notice id: %d' % (lang, i))
            dftr.loc[i, 'content'] = ''

    for i in range(dfte.shape[0]):
        lang = detect(dfte['content'][i])
        if (lang in supported_langs):
            tmp = get_raw_data(dfte['title'][i], dfte['content'][i])
            dfte.loc[i, 'content'] = transform_string(tmp, words_in_glove,
                                                      lang, dfte['source'][i])
            print('language: %s not supported. Notice id: %d' % (lang, i))
            dfte.loc[i, 'content'] = ''

    # train
    word_to_frecuency = get_word_to_frecuency(dftr['content'])

    # The threshold is 5 * n_perms here versus plain 5 for the test split below.
    # dfr = eliminate_less_frequent_words(dfr, 5, word_to_frecuency)
    dftr = eliminate_less_frequent_words(dftr, 5 * n_perms, word_to_frecuency)

    # eliminate empty strings from dataframe
    dftr['content'].replace('', np.nan, inplace=True)
    dftr.dropna(subset=['content'], inplace=True)
    dftr.index = np.arange(dftr.shape[0])

    # formatting problem with pytorch
    # Shift labels -1/0/1 to 0/1/2. The order matters: remapping the largest
    # value first keeps an already-shifted label from being shifted again.
    dftr['classes'].replace(1, 2, inplace=True)
    dftr['classes'].replace(0, 1, inplace=True)
    dftr['classes'].replace(-1, 0, inplace=True)

    # NOTE(review): the call that the next string argument belongs to is
    # missing in this excerpt (presumably a to_csv on dftr) -- confirm.
    # dfr.to_csv('data14Deps.csv')
        'data14Glove_train.csv')  ########### change for different embedding

    # test
    word_to_frecuency = get_word_to_frecuency(dfte['content'])

    dfte = eliminate_less_frequent_words(dfte, 5, word_to_frecuency)

    # eliminate empty strings from dataframe
    dfte['content'].replace('', np.nan, inplace=True)
    dfte.dropna(subset=['content'], inplace=True)
    dfte.index = np.arange(dfte.shape[0])

    # formatting problem with pytorch
    # Same -1/0/1 -> 0/1/2 shift as for the training split, in the same order.
    dfte['classes'].replace(1, 2, inplace=True)
    dfte['classes'].replace(0, 1, inplace=True)
    dfte['classes'].replace(-1, 0, inplace=True)

    # NOTE(review): as above, the opening of this call is missing in the excerpt.
        'data14Glove_test.csv')  ########### change for different embedding

    # Bar plot of how many training rows fall into each remapped class 0, 1, 2.
    vals = dftr.classes.value_counts()
    sns.barplot(x=[0, 1, 2], y=[vals[0], vals[1], vals[2]])
Exemplo n.º 9
# Snapshot the weights before and after loading so check_weights can compare
# the two sets layer by layer (weight_names labels each entry).
weights_pre = model.get_weights()
model = tu.load_weights(model, param_dict, stream, False, "test")
weights_after = model.get_weights()
weight_names = [weight.name for layer in model.layers for weight in layer.weights]
tu.check_weights(weights_pre, weights_after, weight_names)

# If the frame at second 1 is used (time of shot in B3SD dataset), then
#  we only get that particular frame for each video.

if sec1_frame:
    # The message goes both to stdout and to the results file.
    print("1sec_frame set, so choosing that frame for each video")
    result_file.write("1sec_frame set, so choosing that frame for each video\n")
    X = gd.get_batch(X, param_dict)
    augmented_X = da.augment_data(X, param_dict, "test")

    fit_input = tu.make_model_input_dict(input_names, augmented_X)
    predictions_categorical = model.predict(fit_input, verbose=1)
    # Turn each row of class scores into the index of its highest score.
    predictions = np.asarray([np.argmax(pred) for pred in predictions_categorical])

    acc, correct_predictions = tpu.calculate_accuracy(predictions, Y, len(predictions))

# If sec1_frame is false, then we select x frames from each video for a
#   more averaged guess for each video.
# NOTE(review): the `else:` that should introduce the next line appears to be
# missing in this excerpt; as shown it still belongs to the `if` body above.
    print("Augmenting each input "+str(repeats)+" times")
Exemplo n.º 10
            # Split off a validation subset by index, optionally append
            # augmented samples to the training set, then record the run time.
            print('Selecting and assigning validation set...')
            print('-' * 30)
            # NOTE(review): the train_test_split call is cut off after its
            # first argument in this excerpt.
            train_indices,val_indices = train_test_split(np.arange(trainingFeatures.shape[0]),\
            valFeatures = trainingFeatures[val_indices]
            trainingFeatures = trainingFeatures[train_indices]
            valLabels = trainingLabels[val_indices]
            trainingLabels = trainingLabels[train_indices]

            # Augmenting the training data and adding this to the training data set
            # (done after the split, so validation samples are not augmented)
            if pm.data_augm:
                print('-' * 30)
                print('Augmenting data...')
                print('-' * 30)
                augm_trainingFeatures, augm_trainingLabels = \
                    augment_data(trainingFeatures, trainingLabels, pm.nb_augm_samples, pm.augm_transformations)
                # Append the augmented samples after the originals along axis 0.
                trainingFeatures = np.concatenate(
                    (trainingFeatures, augm_trainingFeatures), axis=0)
                trainingLabels = np.concatenate(
                    (trainingLabels, augm_trainingLabels), axis=0)

            # Run the main function
            # NOTE(review): the call announced above is not present in this excerpt.

            # Calculating and saving run time
            # (start_time is set outside the visible lines)
            end_time = datetime.now()
            total_time = time_diff_format(start_time, end_time)

    # Save the data to an xlsx-file and an image.
Exemplo n.º 11
# Training configuration. NUM_EPOCHS, EPOCH_SIZE and BATCH_SIZE are defined
# outside this excerpt; TRAIN_SIZE is not used in the visible lines.
initial_learning_rate = 0.001
cumulative_loss = 0.0
TRAIN_SIZE = 60000
num_iterations = int(NUM_EPOCHS * EPOCH_SIZE)

# Run training loop
with sess.as_default():
    for i in range(1, num_iterations + 1):
        # Linear decay of the learning rate; the +5 in the denominator keeps
        # the factor above zero even on the last iteration.
        # NOTE(review): relies on true division of i by (num_iterations + 5).
        current_learning_rate = initial_learning_rate * (1.0 - i /
                                                         (num_iterations + 5))
        batch = mnist_data.train.next_batch(BATCH_SIZE)
        # NOTE(review): this call is cut off after its first argument in this excerpt.
        train, train_labels = da.augment_data(batch[0],

        # NOTE(review): the `feed_dict={` opener and the closing brackets of
        # this sess.run call are missing in this excerpt.
        _, loss_val = sess.run(
            [train_step, loss],
                img: train,
                labels: train_labels,
                is_train: True,
                lr: current_learning_rate
        cumulative_loss = cumulative_loss + loss_val
        # Every EPOCH_SIZE iterations, print the mean loss and reset the sum.
        if i % EPOCH_SIZE == 0:
            print(str(cumulative_loss / EPOCH_SIZE))
            cumulative_loss = 0.0
        # NOTE(review): the body of this every-10-epochs branch is missing in this excerpt.
        if i % (10 * EPOCH_SIZE) == 0:
Exemplo n.º 12
    # NOTE(review): this fragment starts in the middle of a statement; the
    # call that the closing parentheses below belong to is not visible.
    ))  # initialize all global variables, which includes weights and biases

    # training start
    for epoch in range(0, NUM_EPOCHS):
        # Sum of the per-batch costs for this epoch.
        total_cost = 0

        # int() truncates, so a trailing partial batch is never visited.
        for i in range(0, int(NUM_EXAMPLES / BATCH_SIZE)):
            batch_x = get_batch(
                dataset_train_features, i,
                BATCH_SIZE)  # get batch of features of size BATCH_SIZE
            batch_y = get_batch(
                dataset_train_labels, i,
                BATCH_SIZE)  # get batch of labels of size BATCH_SIZE

            batch_x, batch_y = augment_data(
                batch_x, batch_y, augmentation_factor=1)  # augment the data

            _, batch_cost = sess.run(
                [training, loss], feed_dict={
                    x: batch_x,
                    y: batch_y
                })  # train on the given batch size of features and labels
            total_cost += batch_cost
            # NOTE(review): the body of this every-25-batches branch is missing in this excerpt.
            if i % 25 == 0:

        print("Epoch:", epoch, "\tCost:", total_cost)

        # predict validation accuracy after every epoch
        # (only the accumulators are visible; the loop using them is cut off)
        sum_accuracy_validation = 0.0
        sum_i = 0

# Snapshot the part-1 model's weights before and after loading so that
# check_weights can compare the two sets (weight_names labels each entry).
weights_pre = model_part1.get_weights()
model_part1 = tu.load_weights(model_part1, param_dict_part1, stream_part1, True, "test")
weights_after = model_part1.get_weights()
weight_names = [weight.name for layer in model_part1.layers for weight in layer.weights]
tu.check_weights(weights_pre, weights_after, weight_names)

# If the frame at second 1 is used (time of shot in B3SD dataset), then
#  we only get that particular frame for each video.
# NOTE(review): unlike the comment above suggests, no condition guards the
# following lines in this excerpt; they run unconditionally as shown.

print("1sec_frame set, so choosing that frame for each video")
result_file.write("1sec_frame set, so choosing that frame for each video\n")
X2_data = gd.get_batch(X2, param_dict_part1)
augmented_X2 = da.augment_data(X2_data, param_dict_part1, "test")

fit_input2 = tu.make_model_input_dict(input_part1, augmented_X2)
predictions_categorical2 = model_part1.predict(fit_input2, verbose=1)
# Turn each row of class scores into the index of its highest score.
predictions2 = np.asarray([np.argmax(pred) for pred in predictions_categorical2])
acc2, correct_predictions2 = tpu.calculate_accuracy(predictions2, Y2, len(predictions2))

# Counter initialised here; its use lies beyond the visible excerpt.
ones = 0