Example #1
 def get_error(self, data_set: DataSet):
     squared_sum = 0
     # Sum the squared differences between the network output and the expected class value.
     for example_array, expected_class in data_set.get_data():
         output = self.run(example_array)
         squared_sum += (output - expected_class)**2
     return math.sqrt(squared_sum) / len(data_set.get_data())
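Note that the value returned above is sqrt(squared_sum) / N rather than the conventional RMSE sqrt(squared_sum / N). For comparison, a minimal RMSE sketch under the same assumed DataSet / self.run interface:

 def get_rmse(self, data_set: DataSet):
     # Root-mean-square error: square root of the mean squared difference.
     squared_sum = 0
     for example_array, expected_class in data_set.get_data():
         output = self.run(example_array)
         squared_sum += (output - expected_class)**2
     return math.sqrt(squared_sum / len(data_set.get_data()))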
Example #2
def mds_and_plot(model):

    data = DataSet()
    x, y, data_list = data.get_test_frames('train')

    custom_model = Model(inputs=model.input,
                         outputs=model.get_layer('dense_1').output)
    y_pred = custom_model.predict(x)
    mds = MDS()
    mds.fit(y_pred)
    a = mds.embedding_

    # One marker style per class index.
    mark = ['or', 'ob', 'og', 'oy', 'ok', '+r', 'sr', 'dr', '<r', 'pr']
    j = 0
    for item in y:
        # Find the position of the 1 in the one-hot label vector.
        index = 0
        for i in item:
            if i == 1:
                break
            index = index + 1

        # Plot this sample's 2-D MDS coordinates with the marker for its class.
        plt.plot([a[j:j + 1, 0]], [a[j:j + 1, 1]], mark[index], markersize=5)
        print(index)
        j += 1
    plt.show()
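Assuming MDS here is scikit-learn's sklearn.manifold.MDS, the same embedding can be obtained in a single call, since fit_transform returns exactly what fit() stores in embedding_:

from sklearn.manifold import MDS

mds = MDS(n_components=2)
a = mds.fit_transform(y_pred)  # equivalent to mds.fit(y_pred); a = mds.embedding_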
Example #3
 def get_accuracy(self, data_set: DataSet):
     correct = 0
     # Sum the number of correctly classified examples.
     for example_array, expected_class in data_set.get_data():
         output = self.run(example_array)
         if output == expected_class:
             correct += 1
     # Divide the number of correct examples by the total number of examples.
     return correct / len(data_set.get_data())
Example #4
def main(input_train, input_test, output_train, output_test):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    raw_data = DataSet(input_train, input_test)

    df_train = raw_data.get_train_set()
    df_test = raw_data.get_test_set()

    TitanicPreProcessing(df_train, output_train)
    TitanicPreProcessing(df_test, output_test)
Example #5
def test_rnn(src, model):
    data = DataSet(src)
    x, y, data_list = data.get_test_frames('train')
    # time.clock() was removed in Python 3.8; perf_counter() times the prediction.
    s = time.perf_counter()
    y_pred = model.predict(x)
    e = time.perf_counter()
    print(e - s)
    y_pred[y_pred < 0.7] = 0
    y_pred[y_pred >= 0.7] = 1

    print(metrics.precision_score(y, y_pred, average='micro', zero_division=0))
    print(metrics.precision_score(y, y_pred, average='macro', zero_division=0))
    print(metrics.recall_score(y, y_pred, average='micro', zero_division=0))
    print(metrics.recall_score(y, y_pred, average='macro', zero_division=0))
    print(metrics.f1_score(y, y_pred, average='weighted', zero_division=0))
Example #6
def main(input_data, output_model):
    """ Runs modeling scripts using processed data (../raw) to
        create model. Model is saved as pickle (saved in ../models).
    """
    logger = logging.getLogger(__name__)
    logger.info('training model')

    data = DataSet(train_dir=input_data)
    train = data.get_train_set()
    X_train = data.get_features(train)
    y = data.get_label(train)

    # Select the classifier and parameter grid at index 4 of the configured lists.
    clf = models[4]
    param_grid = params[4]

    model = Model.tune(clf, X_train, y, param_grid)
    model.save(output_model + model.name)
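Model.tune is project code that is not shown here; purely as an assumption about its behaviour, a minimal sketch of what such a tuning step typically looks like with scikit-learn's GridSearchCV:

from sklearn.model_selection import GridSearchCV

# Hypothetical stand-in for Model.tune: exhaustive grid search with cross-validation.
search = GridSearchCV(clf, param_grid, cv=5)
search.fit(X_train, y)
best_model = search.best_estimator_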
Example #7
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    raw_data = DataSet(train_dir=input_filepath + '/train.csv',
                       test_dir=input_filepath + '/test.csv')
    cleaning = DataWrangling(train_dir=output_filepath + '/train_clean.csv',
                             test_dir=output_filepath + '/test_clean.csv')

    df_train = raw_data.get_train_set()
    df_test = raw_data.get_test_set()
    df_train_clean = cleaning.apply_preprocessing(df_train, target='Survived')
    df_test_clean = cleaning.apply_preprocessing(df_test, target='Survived')
    cleaning.processed_train_data(df_train_clean)
    cleaning.processed_test_data(df_test_clean)
Example #8
def extract_features():
    # Get the dataset.
    data = DataSet()

    # Get the feature-extraction model.
    model = Extractor(SAVED_CNN_EXTRACTOR_MODEL)

    if not os.path.exists(PROCESSED_SEQUENCES_DATA_DIR):
        os.makedirs(PROCESSED_SEQUENCES_DATA_DIR)

    # Loop through data.
    folders = ['train', 'test']
    #     folders = ['train']
    for folder in folders:
        print(f'Extracting features from {folder} videos...')
        video_filenames = list(data.data[folder].keys())
        #         video_filenames=['171']
        pbar = tqdm(total=len(video_filenames))
        for video_filename in video_filenames:

            # Get the path to the sequence for this video.
            path = os.path.join(PROCESSED_SEQUENCES_DATA_DIR, video_filename +
                                '-features')  # numpy will auto-append .npy

            # Check if we already have it.
            if os.path.isfile(path + '.npy'):
                pbar.update(1)
                continue

            # Get the frames for this video.
            frames = data.get_frames_paths(folder, video_filename)

            # Now loop through and extract features to build the sequence.
            sequence = []
            for image in frames:
                features = model.extract(image)
                sequence.append(features)

            # Save the sequence.
            np.save(path, sequence)

            pbar.update(1)

        pbar.close()
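Because np.save appends the .npy extension, the saved sequences can be read back directly with np.load. A short sketch using a hypothetical video name:

import os
import numpy as np

# 'some_video' is a placeholder; any video processed above has a matching file.
path = os.path.join(PROCESSED_SEQUENCES_DATA_DIR, 'some_video-features.npy')
sequence = np.load(path)  # one feature vector per frame
print(sequence.shape)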
Example #9
def get_generators():
    dataset = DataSet()

    params = {'batch_size': 256, 'shuffle': True}

    # Generators
    train_generator = SequenceDataGenerator(dataset, "train", **params)
    valid_generator = SequenceDataGenerator(dataset, "test", **params)

    return train_generator, valid_generator
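A minimal usage sketch, assuming SequenceDataGenerator implements keras.utils.Sequence and that 'model' is a hypothetical compiled tf.keras model, so the generators can be passed straight to fit:

train_generator, valid_generator = get_generators()

model.fit(train_generator,
          validation_data=valid_generator,
          epochs=10)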
Example #10
def main(input_train, input_test, input_model, output_prediction):
    """ Runs modeling scripts using model pickle (../models) to predict
        outcomes. Outcomes file is saved as .csv (saved in ../models).
    """
    logger = logging.getLogger(__name__)
    logger.info('predicting outcomes')

    data = DataSet(train_dir=input_train, test_dir=input_test)
    test = data.get_test_set()
    X_test = data.get_features(test)

    model = Model.load(input_model + 'XGBClassifier')
    y_pred = model.predict(X_test)

    output = pd.DataFrame({
        'PassengerId': test['PassengerId'],
        'Survived': y_pred
    })
    output.to_csv(output_prediction + 'submission_{}.csv'.format(model.name),
                  index=False)
Example #11
def train(batch_size, nb_epoch, data_type, seq_len, categories, feature_len, saved_model=None):
    checkpointer = ModelCheckpoint(
        # filepath=os.path.join(settings.OUTPUT_CHECKPOINT_FOLDER, model + '.{epoch:03d}-{val_loss:.3f}.hdf5'),
        filepath=os.path.join(util.OUTPUT_CHECKPOINT_FOLDER, 'v1.hdf5'),
        verbose=1,
        save_best_only=True)

    # Helper: TensorBoard
    tb = TensorBoard(log_dir=util.OUTPUT_LOG)

    # Helper: Stop when we stop learning.
    # early_stopper = EarlyStopping(patience=5)

    # Helper: Save results.
    timestamp = time.time()
    csv_logger = CSVLogger(os.path.join(util.OUTPUT_LOG, 'training-' + str(timestamp) + '.log'))

    data = DataSet(util.SCRIPT_EXTRACT_SEQ_SPLIT_PATH)

    # Compute steps per epoch from the total number of samples.
    steps_per_epoch = data.len_data() / batch_size

    generator = data.frame_generator(batch_size, 'train')
    val_generator = data.frame_generator(batch_size, 'valid')

    # Get the model.
    model = MLModel(len(categories), data_type, seq_len, saved_model,
                    feature_len)
    rm = model.create_pre_train_model()
    # rm = em.create_model()

    rm.model.fit_generator(
        generator=generator,
        steps_per_epoch=steps_per_epoch,
        epochs=nb_epoch,
        verbose=1,
        callbacks=[tb, csv_logger, checkpointer],
        validation_data=val_generator,
        validation_steps=200 / batch_size)
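fit_generator is deprecated in recent TensorFlow/Keras releases; under TF 2.x the same training call can go through fit, a sketch assuming the setup above:

rm.model.fit(
    generator,
    steps_per_epoch=int(steps_per_epoch),
    epochs=nb_epoch,
    verbose=1,
    callbacks=[tb, csv_logger, checkpointer],
    validation_data=val_generator,
    validation_steps=max(1, 200 // batch_size))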
Example #12
    def _extract_private_test_video_filenames():
        all_video_filenames = set(os.listdir(PROCESSED_FRAMES_DATA_DIR))

        dataset = DataSet()
        train_test_video_filenames = set(dataset.data['train'].keys()) | set(
            dataset.data['test'].keys())

        private_test_video_filenames = all_video_filenames - train_test_video_filenames

        private_test_video_filenames = [
            video_filename for video_filename in private_test_video_filenames
            if not video_filename[0] == '.'
        ]
        return private_test_video_filenames
Example #13
 def test_get_num_frames(self):
     video_filename = "79-30-960x720"
     expected = len(DataSet.get_targets('test', video_filename))
     actual = VideoHelper._extract_num_frames(video_filename)
     self.assertEqual(expected, actual)
"""
Create histograms of different subsets of Aff-Wild2 dataset.
By choosing 'balancing_mode' we can visualize whole dataset or downsampled subset.
By choosing 'train_test' we can visualize train or test samples of the dataset.
"""
from pylab import *

from src.data import DataSet

dataset = DataSet()
# valences,arousals = dataset.get_val_ar(balancing_mode='balanced',max_mode='mean')
valences, arousals = dataset.get_val_ar(balancing_mode='all',
                                        train_test='test')
# valences,arousals = dataset.get_val_ar(mode='neg_ar_pos_val')
print(len(valences))
print(len(arousals))
res_hist = hist2d(valences, arousals, bins=40, cmap=cm.jet)
density = res_hist[0] / len(valences)
s = np.sum(density)
colorbar().ax.tick_params(axis='y', direction='out')
# savefig("/Users/denisrangulov/Google Drive/EmotionRecognition/figures/b_mean_train_frames.png", bbox_inches='tight')
# savefig("/Users/denisrangulov/Google Drive/EmotionRecognition/figures/train_neg_400_train_frames.png", bbox_inches='tight')
# savefig("/Users/denisrangulov/Google Drive/EmotionRecognition/figures/train_neg_ar_pos_val_400_train_frames.png", bbox_inches='tight')
savefig(
    "/Users/denisrangulov/Google Drive/EmotionRecognition/figures/all_test_frames.png",
    bbox_inches='tight')
Example #15
    def get_main_params(train_test):
        dataset = DataSet()
        list_IDs, targets = dataset.get_partition(train_test, balanced=True)

        return list_IDs, targets, train_test
Example #16
vbar = tqdm(total=len(private_test_video_filenames))
for video_filename in private_test_video_filenames:
    prediction_path = os.path.join(PREDICTIONS, video_filename + '.txt')

    # Check if we already have it.
    if os.path.isfile(prediction_path):
        vbar.update(1)
        continue

    num_frames = video_helper.get_num_frames(video_filename)
    predictions = np.full((num_frames, 2), -5, dtype=np.float32)
    sequence = []
    fbar = tqdm(total=num_frames)
    for frame_idx in range(num_frames):
        frame_path = DataSet.get_frame_path(video_filename, frame_idx)
        if os.path.isfile(frame_path):
            feature_vector = cnn_extractor_model.extract(frame_path)
            sequence.append(feature_vector)
        elif len(sequence) > 0:
            # Uncomment to predict first less than 'RNN_WINDOW_SIZE' frames using CNN
            # num_cnn_predictions = min(len(sequence), RNN_WINDOW_SIZE)
            # x = np.asarray(sequence[:num_cnn_predictions])
            # prediction = cnn_extractor_model.predict(x)
            # predictions[frame_idx - len(sequence):frame_idx - len(sequence) + len(prediction)] = prediction
            if len(sequence) > RNN_WINDOW_SIZE:
                x = prepare_sequence_for_rnn(sequence)
                prediction = rnn_model.predict(x)
                predictions[frame_idx - len(prediction):frame_idx] = prediction
            sequence = []
        if frame_idx == num_frames - 1 and len(sequence) > 0:
Example #17
import os

import numpy as np
import pandas as pd
import tensorflow as tf

from src import metrics
from src.config import PREDICTIONS
from src.data import DataSet

video_filename = "79-30-960x720"

path = os.path.join(PREDICTIONS, video_filename + '.txt')
pred_df = pd.read_csv(path, sep=",")
pred_df[pred_df['valence'] == -5] = np.nan
# pred_df = pred_df.interpolate(method='linear', axis=0).fillna(-5)
# pred_df = pred_df.interpolate(method='linear', axis=0).fillna(0)
pred_df = pred_df.interpolate(method='linear', axis=0).ffill().bfill()
pred = pred_df[['valence', 'arousal']].values

true = DataSet.get_targets('test', video_filename)

r = len(true) if len(pred) > len(true) else len(pred)
pred = tf.convert_to_tensor(pred[:r], np.float32)
true = tf.convert_to_tensor(true[:r], np.float32)
ccc_v = metrics.ccc_v(true, pred)
ccc_a = metrics.ccc_a(true, pred)

print(f'ccc_v: {ccc_v}, ccc_a: {ccc_a}')
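ccc_v and ccc_a come from the project's src.metrics module, which is not shown here; for reference, a minimal NumPy sketch of the concordance correlation coefficient (CCC) on which metrics of this kind are typically based:

import numpy as np

def ccc(y_true, y_pred):
    # Concordance correlation coefficient between two 1-D arrays.
    mean_true, mean_pred = np.mean(y_true), np.mean(y_pred)
    var_true, var_pred = np.var(y_true), np.var(y_pred)
    cov = np.mean((y_true - mean_true) * (y_pred - mean_pred))
    return 2 * cov / (var_true + var_pred + (mean_true - mean_pred) ** 2)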