Example #1
# The original fragment shows only __init__; the wrapper class name below is
# hypothetical, and MOSI is assumed to come from the CMU Multimodal Data SDK.
class MOSIDataLoader:

    def __init__(self, dataset=None):
        # fall back to the MOSI dataset when none is supplied
        if dataset is None:
            self.dataset = MOSI()
        else:
            self.dataset = dataset
        self.train_ids = self.dataset.train()
        self.valid_ids = self.dataset.valid()
        self.test_ids = self.dataset.test()
        self.sentiments = self.dataset.sentiments()

import os
import sys
import multiprocessing

import numpy as np
import tensorflow as tf
from joblib import Parallel, delayed
from mmdata import MOSI, Dataset  # assumed import path (CMU Multimodal Data SDK)

seed = 0  # hypothetical value; the original snippet assumes `seed` is defined elsewhere
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(seed)
tf.set_random_seed(seed)  # TensorFlow 1.x API; use tf.random.set_seed in TF 2.x

num_cores = multiprocessing.cpu_count()
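
The snippet imports joblib's Parallel and delayed but never shows the call that uses them; a typical pattern with num_cores looks like the sketch below (the worker function is hypothetical).

# Hypothetical worker: the listing does not show what is actually parallelized.
def run_fold(fold_id):
    return fold_id

results = Parallel(n_jobs=num_cores)(delayed(run_fold)(i) for i in range(num_cores))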
mode = sys.argv[1]  # which modalities to use: "all", "AV", "AT", "VT", "A", "V", ...
task = sys.argv[2]  # which prediction task to run

# Download the data if not present
mosei = MOSI()
embeddings = mosei.embeddings()
if mode in ("all", "AV", "VT", "V"):
    facet = mosei.facet()  # visual features (FACET)
if mode in ("all", "AT", "AV", "A"):
    covarep = mosei.covarep()  # acoustic features (COVAREP)
sentiments = mosei.sentiments()
emotions = mosei.emotions()
train_ids = mosei.train()
valid_ids = mosei.valid()
test_ids = mosei.test()

# Merge different features and do word level feature alignment (align according to timestamps of embeddings)
if mode in ("all", "AV"):
    bimodal = Dataset.merge(embeddings, facet)
    trimodal = Dataset.merge(bimodal, covarep)

# Stray fragment in the original listing: the return line belongs to a custom
# Pearson correlation metric (Keras backend). Only the return statement appears
# in the source; the surrounding definitions are reconstructed assumptions.
from keras import backend as K

def pearson_cc(y_true, y_pred):
    fsp = y_pred - K.mean(y_pred)  # mean-centred predictions
    fst = y_true - K.mean(y_true)  # mean-centred targets
    devP = K.std(y_pred)           # std of predictions
    devT = K.std(y_true)           # std of targets
    return K.sum(K.mean(fsp * fst, axis=0) / (devP * devT))

# meta parameters
maxlen = 15       # each utterance will be truncated/padded to 15 words
batch_size = 128
nb_epoch = 1000   # maximum number of epochs to train the model

# stop training once validation loss has not decreased for `patience` epochs,
# to prevent over-fitting
from keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

from keras.optimizers import Adamax
opt_func = Adamax(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08)  # optimization function
loss_func = 'mae'  # loss function (mean absolute error)
metr = 'mae'       # evaluation metric
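
These hyperparameters would typically be wired into a Keras model along the lines below; the single-LSTM architecture and the 300-dimensional feature size are hypothetical sketch values, not the example's actual model.

from keras.models import Sequential
from keras.layers import LSTM, Dense

# Hypothetical sketch: an LSTM regressor over padded word-embedding sequences.
model = Sequential()
model.add(LSTM(64, input_shape=(maxlen, 300)))  # 300-dim features is an assumption
model.add(Dense(1, activation='linear'))
model.compile(optimizer=opt_func, loss=loss_func, metrics=[metr])
# model.fit(x_train, y_train, batch_size=batch_size, epochs=nb_epoch,
#           validation_data=(x_valid, y_valid), callbacks=[early_stopping])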

# Download the data if not present
mosi = MOSI()
embeddings = mosi.embeddings() # features
sentiments = mosi.sentiments() # Valence labels
train_ids = mosi.train()
valid_ids = mosi.valid()
test_ids = mosi.test()

# Some data preprocessing
x_train = []
y_train = []
x_valid = []
y_valid = []
x_test = []
y_test = []

print("Preparing train and test data...")
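
The listing breaks off after this print; a minimal sketch of the preparation loop it leads into, assuming a pad helper like the one shown before get_data in Example #4 below:

# Hypothetical continuation: pad each segment's word embeddings to `maxlen`
# steps and pair them with that segment's sentiment label.
for ids, x_split, y_split in [(train_ids, x_train, y_train),
                              (valid_ids, x_valid, y_valid),
                              (test_ids, x_test, y_test)]:
    for vid in ids:
        for sid in embeddings['embeddings'][vid].keys():
            if embeddings['embeddings'][vid][sid]:
                x_split.append(pad(embeddings['embeddings'][vid][sid], maxlen))
                y_split.append(sentiments[vid][sid])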
Example #4
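get_data below relies on a pad helper that the listing does not include. A minimal sketch, assuming each time step in a feature sequence is a (start_time, end_time, feature_vector) tuple as produced by the SDK's loaders:

import numpy as np
from mmdata import MOSI  # assumed import path (CMU Multimodal Data SDK)

def pad(data, max_len):
    """Pad (with leading zeros) or truncate a feature sequence to max_len steps."""
    # keep only the feature vector from each (start, end, vector) tuple
    data = np.array([feature[2] for feature in data])
    n_rows, dim = data.shape
    if max_len >= n_rows:
        padding = np.zeros((max_len - n_rows, dim))
        return np.concatenate((padding, data))
    return data[-max_len:]
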
def get_data(max_len_audio=20, max_len_text=15, max_len_visual=20):
    mosi = MOSI()
    embeddings = mosi.embeddings()
    facet = mosi.facet()
    covarep = mosi.covarep()
    sentiments = mosi.sentiments()  # sentiment labels, real-valued; binarized below for this tutorial
    train_ids = mosi.train()
    valid_ids = mosi.valid()
    test_ids = mosi.test()
    # sort through all the (video ID, segment ID) pairs, keeping only those
    # for which all three modalities (text, visual, audio) are present
    def modality_complete_ids(vids):
        return [(vid, sid)
                for vid in vids
                for sid in embeddings['embeddings'][vid].keys()
                if embeddings['embeddings'][vid][sid]
                and facet['facet'][vid][sid]
                and covarep['covarep'][vid][sid]]

    train_set_ids = modality_complete_ids(train_ids)
    valid_set_ids = modality_complete_ids(valid_ids)
    test_set_ids = modality_complete_ids(test_ids)

    # partition the training, validation and test sets; all sequences will be
    # padded/truncated to a fixed number of steps, so the data will have shape
    # (dataset_size, max_len, feature_dim)
    # audio features (COVAREP), padded/truncated to max_len_audio steps
    max_len = max_len_audio
    train_set_audio = np.stack([pad(covarep['covarep'][vid][sid], max_len)
                                for (vid, sid) in train_set_ids
                                if covarep['covarep'][vid][sid]], axis=0)
    valid_set_audio = np.stack([pad(covarep['covarep'][vid][sid], max_len)
                                for (vid, sid) in valid_set_ids
                                if covarep['covarep'][vid][sid]], axis=0)
    test_set_audio = np.stack([pad(covarep['covarep'][vid][sid], max_len)
                               for (vid, sid) in test_set_ids
                               if covarep['covarep'][vid][sid]], axis=0)

    # visual features (FACET), padded/truncated to max_len_visual steps
    max_len = max_len_visual
    train_set_visual = np.stack([pad(facet['facet'][vid][sid], max_len)
                                 for (vid, sid) in train_set_ids], axis=0)
    valid_set_visual = np.stack([pad(facet['facet'][vid][sid], max_len)
                                 for (vid, sid) in valid_set_ids], axis=0)
    test_set_visual = np.stack([pad(facet['facet'][vid][sid], max_len)
                                for (vid, sid) in test_set_ids], axis=0)

    # text features (word embeddings), padded/truncated to max_len_text steps
    max_len = max_len_text
    train_set_text = np.stack([pad(embeddings['embeddings'][vid][sid], max_len)
                               for (vid, sid) in train_set_ids], axis=0)
    valid_set_text = np.stack([pad(embeddings['embeddings'][vid][sid], max_len)
                               for (vid, sid) in valid_set_ids], axis=0)
    test_set_text = np.stack([pad(embeddings['embeddings'][vid][sid], max_len)
                              for (vid, sid) in test_set_ids], axis=0)
    # binarize the real-valued sentiment scores for the binary classification task
    y_train = np.array([sentiments[vid][sid] for (vid, sid) in train_set_ids]) > 0
    y_valid = np.array([sentiments[vid][sid] for (vid, sid) in valid_set_ids]) > 0
    y_test = np.array([sentiments[vid][sid] for (vid, sid) in test_set_ids]) > 0

    # train_set_audio = train_set_audio[:,:,1:35]
    # valid_set_audio = valid_set_audio[:,:,1:35]
    # test_set_audio = test_set_audio[:,:,1:35]

    # normalize visual features by the per-dimension absolute max over the
    # training set; if the maximum is 0 we don't normalize that dimension
    visual_max = np.max(np.max(np.abs(train_set_visual), axis=0), axis=0)
    visual_max[visual_max == 0] = 1
    train_set_visual = train_set_visual / visual_max
    valid_set_visual = valid_set_visual / visual_max
    test_set_visual = test_set_visual / visual_max

    # x != x is True only for NaN, so this zeroes out NaN entries
    train_set_visual[train_set_visual != train_set_visual] = 0
    valid_set_visual[valid_set_visual != valid_set_visual] = 0
    test_set_visual[test_set_visual != test_set_visual] = 0

    # apply the same normalization and NaN clean-up to the audio features
    audio_max = np.max(np.max(np.abs(train_set_audio), axis=0), axis=0)
    audio_max[audio_max == 0] = 1
    train_set_audio = train_set_audio / audio_max
    valid_set_audio = valid_set_audio / audio_max
    test_set_audio = test_set_audio / audio_max

    train_set_audio[train_set_audio != train_set_audio] = 0
    valid_set_audio[valid_set_audio != valid_set_audio] = 0
    test_set_audio[test_set_audio != test_set_audio] = 0

    return (train_set_audio, valid_set_audio, test_set_audio,
            train_set_text, valid_set_text, test_set_text,
            train_set_visual, valid_set_visual, test_set_visual,
            y_train, y_valid, y_test)
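
A quick usage sketch; the variable names on the left are illustrative, and the unpacking order matches the return statement above.

(train_audio, valid_audio, test_audio,
 train_text, valid_text, test_text,
 train_visual, valid_visual, test_visual,
 y_train, y_valid, y_test) = get_data()

print(train_audio.shape)   # (n_train, 20, audio_feature_dim)
print(train_text.shape)    # (n_train, 15, text_feature_dim)
print(train_visual.shape)  # (n_train, 20, visual_feature_dim)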