import multiprocessing
import sys

import numpy as np
# Assumes the legacy CMU MultimodalDataSDK package ("mmdata"); adjust the import
# path if your installation exposes these classes elsewhere.
from mmdata import MOSI, MOSEI, Dataset

num_cores = multiprocessing.cpu_count()
mode = sys.argv[1]
task = sys.argv[2]

# Download the data if not present
mosei = MOSEI()
embeddings = mosei.embeddings()
if mode in ("all", "AV", "VT", "V"):
    facet = mosei.facet()
if mode in ("all", "AT", "AV", "A"):
    covarep = mosei.covarep()
sentiments = mosei.sentiments()
emotions = mosei.emotions()
train_ids = mosei.train()
valid_ids = mosei.valid()
test_ids = mosei.test()
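
# The SDK returns nested dicts keyed as features[modality][video_id][segment_id],
# where each segment is a list of (start_time, end_time, feature_vector) tuples.
# A quick, hypothetical sanity check of that layout:
# vid = train_ids[0]
# sid = list(embeddings['embeddings'][vid].keys())[0]
# print(embeddings['embeddings'][vid][sid][0][2].shape)  # feature dim of the first word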

# Merge different features and do word level feature alignment (align according to timestamps of embeddings)
if mode == "all" or mode == "AV":
    bimodal = Dataset.merge(embeddings, facet)
    trimodal = Dataset.merge(bimodal, covarep)
    dataset = trimodal.align('embeddings')
if mode == "AT":
    bimodal = Dataset.merge(embeddings, covarep)
    dataset = bimodal.align('embeddings')
if mode == "VT":
    bimodal = Dataset.merge(embeddings, facet)
    dataset = bimodal.align('embeddings')
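
# The aligned dataset keeps the same nested features[modality][vid][sid] layout,
# with every modality resampled to the word timestamps of `embeddings`; a
# hypothetical peek at one aligned segment:
# vid = train_ids[0]
# sid = list(dataset['embeddings'][vid].keys())[0]
# print(len(dataset['embeddings'][vid][sid]))  # number of word-level steps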

# Example 2
class UnimodalData:
    def __init__(self, dataset=None):
        if dataset is None:
            self.dataset = MOSI()
        else:
            self.dataset = dataset
        self.train_ids = self.dataset.train()
        self.valid_ids = self.dataset.valid()
        self.test_ids = self.dataset.test()
        self.sentiments = self.dataset.sentiments()

    def get_data(self, data, max_len):

        x_train = []
        y_train = []
        x_test = []
        y_test = []
        x_val = []
        y_val = []
        # note that even a Dataset with one feature requires explicit indexing of features
        for vid, vdata in data.items():
            for sid, sdata in vdata.items():
                if not sdata:  # skip empty segments
                    continue
                example = []
                for i, time_step in enumerate(sdata):
                    if i == max_len:  # truncate each sequence to max_len steps
                        break
                    # each time step is (start, end, features); the timestamps are not used
                    example.append(time_step[2])
                # zero-pad shorter examples up to max_len steps
                for i in range(max_len - len(sdata)):
                    example.append(np.zeros(sdata[0][2].shape))
                example = np.asarray(example)
                label = 1 if self.sentiments[vid][sid] >= 0 else 0  # binarize the labels
                # split the examples according to the standard train/valid/test folds
                if vid in self.train_ids:
                    x_train.append(example)
                    y_train.append(label)
                elif vid in self.valid_ids:
                    x_val.append(example)
                    y_val.append(label)
                elif vid in self.test_ids:
                    x_test.append(example)
                    y_test.append(label)
        # Prepare the final inputs as numpy arrays
        x_train = np.asarray(x_train)
        x_val = np.asarray(x_val)
        x_test = np.asarray(x_test)
        y_train = np.asarray(y_train)
        y_val = np.asarray(y_val)
        y_test = np.asarray(y_test)

        return x_train, x_val, x_test, y_train, y_val, y_test

    def get_text(self, max_len=20):

        embeddings = self.dataset.embeddings()
        return self.get_data(embeddings["embeddings"], max_len)
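
    # Example (hypothetical): the x arrays come back as (n_examples, max_len, feature_dim)
    # and the y arrays as (n_examples,):
    # x_train, x_val, x_test, y_train, y_val, y_test = UnimodalData().get_text(max_len=15)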

    def get_words(self):

        words = self.dataset.words()
        x_train = []
        y_train = []
        x_test = []
        y_test = []
        x_val = []
        y_val = []
        # note that even a Dataset with one feature requires explicit indexing of features
        for vid, vdata in words["words"].items():
            for sid, sdata in vdata.items():
                if not sdata:  # skip empty segments
                    continue
                example = []
                for i, time_step in enumerate(sdata):
                    example.append(time_step[2])
                example = np.asarray(example)  # sequences keep their natural length (no padding here)
                label = 1 if self.sentiments[vid][sid] >= 0 else 0  # binarize the labels
                # split the examples according to the standard train/valid/test folds
                if vid in self.train_ids:
                    x_train.append(example)
                    y_train.append(label)
                elif vid in self.valid_ids:
                    x_val.append(example)
                    y_val.append(label)
                elif vid in self.test_ids:
                    x_test.append(example)
                    y_test.append(label)
        # Prepare the final inputs as numpy arrays
        x_train = np.asarray(x_train)
        x_val = np.asarray(x_val)
        x_test = np.asarray(x_test)
        y_train = np.asarray(y_train)
        y_val = np.asarray(y_val)
        y_test = np.asarray(y_test)

        return x_train, x_val, x_test, y_train, y_val, y_test

    def get_audio(self, max_len=20):

        covarep = self.dataset.covarep()
        (train_set_audio, valid_set_audio, test_set_audio,
         y_train, y_val, y_test) = self.get_data(covarep["covarep"], max_len)

        # normalize each feature dimension by its max absolute value on the training set
        audio_max = np.max(np.max(np.abs(train_set_audio), axis=0), axis=0)
        audio_max[audio_max == 0] = 1  # if the maximum is 0 we don't normalize this dimension
        train_set_audio = train_set_audio / audio_max
        valid_set_audio = valid_set_audio / audio_max
        test_set_audio = test_set_audio / audio_max

        # NaN != NaN, so this replaces any NaN entries in the features with 0
        train_set_audio[train_set_audio != train_set_audio] = 0
        valid_set_audio[valid_set_audio != valid_set_audio] = 0
        test_set_audio[test_set_audio != test_set_audio] = 0

        return train_set_audio, valid_set_audio, test_set_audio, y_train, y_val, y_test

    def get_video(self, max_len=20):

        facet = self.dataset.facet()
        (train_set_visual, valid_set_visual, test_set_visual,
         y_train, y_val, y_test) = self.get_data(facet["facet"], max_len)

        # normalize each feature dimension by its max absolute value on the training set
        visual_max = np.max(np.max(np.abs(train_set_visual), axis=0), axis=0)
        visual_max[visual_max == 0] = 1  # if the maximum is 0 we don't normalize this dimension
        train_set_visual = train_set_visual / visual_max
        valid_set_visual = valid_set_visual / visual_max
        test_set_visual = test_set_visual / visual_max
        # NaN != NaN, so this replaces any NaN entries in the features with 0
        train_set_visual[train_set_visual != train_set_visual] = 0
        valid_set_visual[valid_set_visual != valid_set_visual] = 0
        test_set_visual[test_set_visual != test_set_visual] = 0

        return train_set_visual, valid_set_visual, test_set_visual, y_train, y_val, y_test
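
# Example usage of UnimodalData (hypothetical; assumes the MOSI data has already
# been downloaded):
# loader = UnimodalData()
# x_train, x_val, x_test, y_train, y_val, y_test = loader.get_text(max_len=15)
# a_train, a_val, a_test, _, _, _ = loader.get_audio(max_len=20)
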
# Example 3
# Assumes the classic Keras package layout used by this tutorial.
from keras.callbacks import EarlyStopping
from keras.optimizers import Adamax

maxlen = 15  # each utterance will be truncated/padded to 15 words
batch_size = 128
nb_epoch = 1000  # number of total epochs to train the model
# if the validation loss isn't decreasing for a number of epochs, stop training to prevent over-fitting
early_stopping = EarlyStopping(monitor='val_loss', patience=5)

opt_func = Adamax(lr=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-08)  # optimization function
loss_func = 'mae'  # loss function
metr = 'mae'  # evaluation metric
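
# A minimal sketch of the kind of model these settings could drive (hypothetical;
# the 300-d input and layer sizes are illustrative, not from the original tutorial):
# model = Sequential()
# model.add(Masking(mask_value=0., input_shape=(maxlen, 300)))
# model.add(LSTM(64))
# model.add(Dense(1))
# model.compile(optimizer=opt_func, loss=loss_func, metrics=[metr])
# model.fit(x_train, y_train, batch_size=batch_size, epochs=nb_epoch,
#           validation_data=(x_valid, y_valid), callbacks=[early_stopping])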

# Download the data if not present
mosi = MOSI()
embeddings = mosi.embeddings() # features
sentiments = mosi.sentiments() # Valence labels
train_ids = mosi.train()
valid_ids = mosi.valid()
test_ids = mosi.test()

# Some data preprocessing
x_train = []
y_train = []
x_valid = []
y_valid = []
x_test = []
y_test = []

print("Preparing train and test data...")
# note that even a Dataset with one feature requires explicit indexing of features
for vid, vdata in embeddings['embeddings'].items():
    for sid, sdata in vdata.items():
        if not sdata:
            continue
# Example 4
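# The pad() helper used by get_data below is not defined in this excerpt. A minimal
# sketch consistent with how it is called here (assumptions: each time step is a
# (start, end, feature_vector) tuple; shorter sequences are zero-padded at the
# front; longer ones keep their last max_len steps):
def pad(data, max_len):
    feats = np.array([step[2] for step in data])  # drop the timestamps
    n_rows, dim = feats.shape
    if max_len >= n_rows:
        return np.concatenate((np.zeros((max_len - n_rows, dim)), feats))
    return feats[-max_len:]
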
def get_data(max_len_audio=20, max_len_text=15, max_len_visual=20):
    mosi = MOSI()
    embeddings = mosi.embeddings()
    facet = mosi.facet()
    covarep = mosi.covarep()
    # sentiment labels are real-valued; for this tutorial we'll binarize them
    sentiments = mosi.sentiments()
    train_ids = mosi.train()
    valid_ids = mosi.valid()
    test_ids = mosi.test()
    # sort through all the (video ID, segment ID) pairs, keeping only those for
    # which all three modalities are present
    def collect_ids(video_ids):
        pairs = []
        for vid in video_ids:
            for sid in embeddings['embeddings'][vid].keys():
                if (embeddings['embeddings'][vid][sid]
                        and facet['facet'][vid][sid]
                        and covarep['covarep'][vid][sid]):
                    pairs.append((vid, sid))
        return pairs

    train_set_ids = collect_ids(train_ids)
    valid_set_ids = collect_ids(valid_ids)
    test_set_ids = collect_ids(test_ids)

    # partition the training, valid and test sets; every sequence is padded/truncated
    # to max_len steps, so each array has shape (dataset_size, max_len, feature_dim)
    def stack_padded(feature, ids, max_len):
        return np.stack([pad(feature[vid][sid], max_len) for (vid, sid) in ids],
                        axis=0)

    train_set_audio = stack_padded(covarep['covarep'], train_set_ids, max_len_audio)
    valid_set_audio = stack_padded(covarep['covarep'], valid_set_ids, max_len_audio)
    test_set_audio = stack_padded(covarep['covarep'], test_set_ids, max_len_audio)

    train_set_visual = stack_padded(facet['facet'], train_set_ids, max_len_visual)
    valid_set_visual = stack_padded(facet['facet'], valid_set_ids, max_len_visual)
    test_set_visual = stack_padded(facet['facet'], test_set_ids, max_len_visual)

    train_set_text = stack_padded(embeddings['embeddings'], train_set_ids, max_len_text)
    valid_set_text = stack_padded(embeddings['embeddings'], valid_set_ids, max_len_text)
    test_set_text = stack_padded(embeddings['embeddings'], test_set_ids, max_len_text)
    # binarize the sentiment scores for the binary classification task
    y_train = np.array([sentiments[vid][sid] for (vid, sid) in train_set_ids]) > 0
    y_valid = np.array([sentiments[vid][sid] for (vid, sid) in valid_set_ids]) > 0
    y_test = np.array([sentiments[vid][sid] for (vid, sid) in test_set_ids]) > 0

    # train_set_audio = train_set_audio[:,:,1:35]
    # valid_set_audio = valid_set_audio[:,:,1:35]
    # test_set_audio = test_set_audio[:,:,1:35]

    # normalize each visual feature dimension by its max absolute value on the training set
    visual_max = np.max(np.max(np.abs(train_set_visual), axis=0), axis=0)
    visual_max[visual_max == 0] = 1  # if the maximum is 0 we don't normalize this dimension
    train_set_visual = train_set_visual / visual_max
    valid_set_visual = valid_set_visual / visual_max
    test_set_visual = test_set_visual / visual_max

    # NaN != NaN, so this replaces any NaN entries in the features with 0
    train_set_visual[train_set_visual != train_set_visual] = 0
    valid_set_visual[valid_set_visual != valid_set_visual] = 0
    test_set_visual[test_set_visual != test_set_visual] = 0

    # same normalization and NaN cleanup for the audio features
    audio_max = np.max(np.max(np.abs(train_set_audio), axis=0), axis=0)
    audio_max[audio_max == 0] = 1
    train_set_audio = train_set_audio / audio_max
    valid_set_audio = valid_set_audio / audio_max
    test_set_audio = test_set_audio / audio_max

    train_set_audio[train_set_audio != train_set_audio] = 0
    valid_set_audio[valid_set_audio != valid_set_audio] = 0
    test_set_audio[test_set_audio != test_set_audio] = 0

    return (train_set_audio, valid_set_audio, test_set_audio,
            train_set_text, valid_set_text, test_set_text,
            train_set_visual, valid_set_visual, test_set_visual,
            y_train, y_valid, y_test)
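
# Example usage (hypothetical): unpack all three modalities plus the binary labels
# (train_audio, valid_audio, test_audio,
#  train_text, valid_text, test_text,
#  train_visual, valid_visual, test_visual,
#  y_train, y_valid, y_test) = get_data()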