Example #1
def main():
    trial_meme_dataset = MemeDataset(
        csv_file=os.path.join(os.getcwd(), 'data/data1.csv'),
        image_dir=os.path.join(
            os.path.expanduser('~'),
            'Downloads/semeval-2020_trialdata/Meme_images/'))

    train_meme_dataset = MemeDataset(
        csv_file=os.path.join(os.getcwd(), 'data/data_7000_new.csv'),
        image_dir=os.path.join(
            os.path.expanduser('~'),
            'Downloads/memotion_analysis_training_data/data_7000/'))

    fig = plt.figure()

    for i in range(len(trial_meme_dataset)):
        sample = trial_meme_dataset[i]
        print(i, np.array(sample['image']).shape, sample['image_name'])
        print(sample['humour_onehot'], sample['humour_int'])
        print(sample['offensive_onehot'], sample['offensive_int'])
        ax = plt.subplot(1, 4, i + 1)
        plt.tight_layout()
        ax.set_title('Sample #{}'.format(i))
        ax.axis('off')
        plt.imshow(sample['image'])

        if i == 3:
            plt.show()
            break
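
These examples all assume a MemeDataset class that is not shown on this page: a map-style torch Dataset built from a CSV of annotations, returning dict samples with keys such as 'image', 'image_name', 'humour_onehot', 'humour_int', 'offensive_onehot' and 'offensive_int'. A minimal sketch of what such a class could look like (the CSV column names and label handling are assumptions, not the original implementation):

import os

import pandas as pd
from PIL import Image
from torch.utils.data import Dataset


class MemeDataset(Dataset):
    """Minimal sketch of the dataset used in these examples (assumed layout)."""

    def __init__(self, csv_file, image_dir, transform=None):
        self.annotations = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        row = self.annotations.iloc[idx]
        image = Image.open(
            os.path.join(self.image_dir, row['image_name'])).convert('RGB')
        sample = {'image': image, 'image_name': row['image_name']}
        # The real class also attaches the annotation fields read above,
        # e.g. 'humour_onehot', 'humour_int', 'offensive_onehot',
        # 'offensive_int', 'ocr_extracted_text' and 'corrected_text'.
        if self.transform:
            sample = self.transform(sample)
        return sample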
Example #2
def readData(datalabel, batch_size):
    data_transform = transforms.Compose([
        ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    if datalabel == 'trial':
        dataset = MemeDataset(
            csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
            image_dir=os.path.join(
                os.getcwd(), '../data/semeval-2020_trialdata/Meme_images/'),
            transform=data_transform)
    else:
        dataset = MemeDataset(
            csv_file=os.path.join(os.getcwd(), '../data/data_7000_new.csv'),
            image_dir=os.path.join(
                os.getcwd(),
                '../data/memotion_analysis_training_data/data_7000/'),
            transform=data_transform)

    dataloader = DataLoader(dataset=dataset,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=0)

    batches = list(dataloader)
    return batches, len(batches)
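
A hypothetical call to readData() above; materializing the DataLoader into a list loads every batch up front, trading memory for random access to batches (shapes in the comment assume the 256x256 resize and a batch size of 4):

batches, num_batches = readData(datalabel='trial', batch_size=4)
print(num_batches)
print(batches[0]['image'].shape)  # e.g. torch.Size([4, 3, 256, 256])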
Example #3
 def val_data_loader(val_file):
     val_dataset = MemeDataset(filepath=val_file,
                               text_only=True,
                               text_padding=tokenizer_func)
     return data.DataLoader(val_dataset,
                            batch_size=config['batch_size'],
                            num_workers=config['num_workers'],
                            collate_fn=val_dataset.get_collate_fn())
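
This and the following loader examples pass a tokenizer_func as text_padding; it is constructed elsewhere (see Example #15) as a partial over a Hugging Face BertTokenizer, roughly like this (128 is a stand-in for config['max_txt_len']):

from functools import partial

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer_func = partial(tokenizer,
                         max_length=128,  # stand-in for config['max_txt_len']
                         padding='max_length',
                         truncation=True,
                         return_tensors='pt',
                         return_length=True)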
Example #4
 def test_data_loader(test_file):
     test_dataset = MemeDataset(filepath=test_file,
                                text_only=True,
                                text_padding=tokenizer_func,
                                return_ids=True)
     return data.DataLoader(test_dataset,
                            batch_size=config['batch_size'],
                            num_workers=config['num_workers'],
                            collate_fn=test_dataset.get_collate_fn())
Example #5
 def val_data_loader(val_file):
     val_dataset = MemeDataset(
         filepath=val_file,
         feature_dir=config['feature_path'],
         preload_images=False,
         debug=True,
         text_padding=tokenizer_func,
         confidence_threshold=config['object_conf_thresh'])
     return data.DataLoader(val_dataset,
                            batch_size=config['batch_size'],
                            num_workers=config['num_workers'],
                            collate_fn=val_dataset.get_collate_fn())
Example #6
 def train_data_loader(train_file):
     if config['debug']:
         train_file = os.path.join(config["data_path"], "dev_seen.jsonl")
     train_dataset = MemeDataset(filepath=train_file,
                                 text_only=True,
                                 text_padding=tokenizer_func)
     return data.DataLoader(
         train_dataset,
         batch_size=config['batch_size'],
         num_workers=config['num_workers'],
         collate_fn=train_dataset.get_collate_fn(),
         pin_memory=True,
         # shuffle is mutually exclusive with sampler; the sampler already shuffles.
         sampler=ConfounderSampler(
             train_dataset, repeat_factor=config["confounder_repeat"]))
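
ConfounderSampler is not defined on this page. Judging by its name and the repeat_factor argument it presumably over-samples "confounder" examples; the stand-in below only satisfies the torch Sampler interface (one shuffled pass per epoch) so the DataLoader call above is runnable, and is not the original implementation:

import torch
from torch.utils import data


class ConfounderSampler(data.Sampler):
    """Stand-in sketch: yields a shuffled pass over the dataset."""

    def __init__(self, dataset, repeat_factor=1):
        self.dataset = dataset
        self.repeat_factor = repeat_factor  # unused in this sketch

    def __iter__(self):
        return iter(torch.randperm(len(self.dataset)).tolist())

    def __len__(self):
        return len(self.dataset)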
Example #7
def main():
    data_transform = transforms.Compose([
        ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    # Apply each of the above transforms to each sample.
    fig = plt.figure()
    trial_meme_dataset = MemeDataset(
        csv_file=os.path.join(os.getcwd(), 'data/data1.csv'),
        image_dir=os.path.join(
            os.path.expanduser('~'),
            'Downloads/semeval-2020_trialdata/Meme_images/'))
    for i in range(len(trial_meme_dataset)):
        sample = trial_meme_dataset[i]
        print(i, np.array(sample['image']).shape)
        print('np.array(sample[\'image\'])[128,128,0]: {}'.format(
            np.array(sample['image'])[128, 128, 0]))
        transformed_sample = data_transform(sample)
        print(i, np.array(transformed_sample['image']).shape)
        # print(transformed_sample['image'].numpy().max(axis=1))
        print('transformed_sample[\'image\'].numpy()[0,128,128]: {}'.format(
            transformed_sample['image'].numpy()[0, 128, 128]))
        ax = plt.subplot(1, 4, i + 1)
        plt.tight_layout()
        ax.set_title('Sample #{}'.format(i))
        ax.axis('off')
        plt.imshow(transformed_sample['image'].numpy().transpose((1, 2, 0)))

        if i == 3:
            plt.show()
            break
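
ResizeSample, ToTensorSample and NormalizeSample are sample-level wrappers: unlike the stock torchvision transforms they take the whole sample dict and only modify its 'image' entry, which is what lets data_transform(sample) be applied directly as shown above. A rough sketch of how such wrappers could look (an assumption, not the original code):

from torchvision import transforms


class ResizeSample:
    def __init__(self, size):
        self.resize = transforms.Resize(size)

    def __call__(self, sample):
        sample = dict(sample)
        sample['image'] = self.resize(sample['image'])
        return sample


class ToTensorSample:
    def __call__(self, sample):
        sample = dict(sample)
        sample['image'] = transforms.ToTensor()(sample['image'])
        return sample


class NormalizeSample:
    def __init__(self, mean, std):
        self.normalize = transforms.Normalize(mean=mean, std=std)

    def __call__(self, sample):
        sample = dict(sample)
        sample['image'] = self.normalize(sample['image'])
        return sample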
Example #8
 def train_data_loader(train_file):
     train_dataset = MemeDataset(
         filepath=train_file,
         feature_dir=config['feature_path'],
         preload_images=False,
         debug=True,
         text_padding=tokenizer_func,
         confidence_threshold=config['object_conf_thresh'])
     return data.DataLoader(
         train_dataset,
         batch_size=config['batch_size'],
         num_workers=config['num_workers'],
         collate_fn=train_dataset.get_collate_fn(),
         pin_memory=True,
         # shuffle is mutually exclusive with sampler; the sampler already shuffles.
         sampler=ConfounderSampler(
             train_dataset, repeat_factor=config["confounder_repeat"]))
Example #9
 def readData(self, datalabel):
     data_transform = transforms.Compose([
         ResizeSample(size=(256, 256)),
         ToTensorSample(),
         NormalizeSample(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
     ])

     if datalabel == 'trial':
         dataset = MemeDataset(
             csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
             image_dir=os.path.join(
                 os.getcwd(), '../data/semeval-2020_trialdata/Meme_images/'),
             transform=data_transform)
     else:
         dataset = MemeDataset(
             csv_file=os.path.join(os.getcwd(), '../data/data_7000_new.csv'),
             image_dir=os.path.join(
                 os.getcwd(),
                 '../data/memotion_analysis_training_data/data_7000/'),
             transform=data_transform)
     return dataset
Example #10
def main():
    data_transform = transforms.Compose([
        ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    trial_meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
        image_dir=os.path.join(os.getcwd(),
                               '../data/semeval-2020_trialdata/Meme_images/'),
        transform=data_transform)

    train_meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), '../data/data_7000_new.csv'),
        image_dir=os.path.join(
            os.getcwd(), '../data/memotion_analysis_training_data/data_7000/'),
        transform=data_transform)

    evaluate_classification(
        meme_dataset_transformed=trial_meme_dataset_transformed)
    evaluate_classification(
        meme_dataset_transformed=train_meme_dataset_transformed)
Example #11
def main():
    trial_meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
        image_dir=os.path.join(os.getcwd(),
                               '../data/semeval-2020_trialdata/Meme_images/'),
        transform=transforms.Compose([
            ResizeSample(size=(256, 256)),
            ToTensorSample(),
            NormalizeSample(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
        ]))

    for i in range(len(trial_meme_dataset_transformed)):
        sample = trial_meme_dataset_transformed[i]
        print(i, sample['image'].size())
        if i == 3:
            break

    dataloader = DataLoader(dataset=trial_meme_dataset_transformed,
                            batch_size=4,
                            shuffle=True,
                            num_workers=4)

    for i_batch, sample_batched in enumerate(dataloader):
        print(i_batch, sample_batched['image'].size(),
              sample_batched['image'].numpy().shape)
        print('sample_batched[\'image_name\']:\n{}'.format(
            sample_batched['image_name']))
        print('sample_batched[\'humour_onehot\']:\n{}'.format(
            sample_batched['humour_onehot']))
        print('sample_batched[\'humour_int\']:\n{}'.format(
            sample_batched['humour_int']))
        print('sample_batched[\'offensive_onehot\']:\n{}'.format(
            sample_batched['offensive_onehot']))
        print('sample_batched[\'offensive_int\']:\n{}'.format(
            sample_batched['offensive_int']))
        print('sample_batched[\'ocr_extracted_text\']:\n{}'.format(
            sample_batched['ocr_extracted_text']))
        print('sample_batched[\'corrected_text\']:\n{}\n'.format(
            sample_batched['corrected_text']))

        # observe 4th batch and stop.
        if i_batch == 3:
            plt.figure()
            show_batch(sample_batched)
            plt.axis('off')
            plt.ioff()
            plt.show()
            break
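
show_batch() is another helper not shown on this page; a minimal version, assuming it simply tiles the batch of image tensors into a grid for matplotlib (the images are still normalized, so the displayed colors will look washed out):

import matplotlib.pyplot as plt
from torchvision.utils import make_grid


def show_batch(sample_batched):
    """Sketch: arrange a batch of CHW image tensors into one grid image."""
    grid = make_grid(sample_batched['image'])
    plt.imshow(grid.numpy().transpose((1, 2, 0)))
    plt.title('Batch of {} memes'.format(sample_batched['image'].size(0)))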
Example #12
def get_transformed_dataset(textEmb_path, data_path, img_path):
    '''
    Load the precomputed text embeddings (used as the text features) and build
    the transformed meme dataset.
    '''

    imgname_textEmbs = MyDataLoader.read_text_embeddings_Idx(textEmb_path)

    data_transform = transforms.Compose([
        ResizeSample(size=(256, 256)),
        ToTensorSample(),
        NormalizeSample(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), data_path),
        image_dir=os.path.join(os.getcwd(), img_path),
        transform=data_transform)

    return imgname_textEmbs, meme_dataset_transformed
Example #13
def get_dataloaders(data_path, img_path, batch_size, split_seq):
    # split_seq: e.g. [0.8, 0.2] means 80% of the data for training and the
    # remaining 20% for validation (the separate test split is commented out below).
    data_transform = transforms.Compose([
        ResizeSample(size=(299, 299)),
        # ResizeSample(size=(256,256)),
        ToTensorSample(),
        NormalizeSample((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), data_path),
        image_dir=os.path.join(os.getcwd(), img_path),
        transform=data_transform)

    # Split the dataset
    train_len = int(len(meme_dataset_transformed) * split_seq[0])
    # val_len = int(len(meme_dataset_transformed) * split_seq[1])
    test_len = len(meme_dataset_transformed) - train_len

    # meme_train, meme_val, meme_test = random_split(meme_dataset_transformed, [train_len, val_len, test_len])
    meme_train, meme_val = random_split(meme_dataset_transformed,
                                        [train_len, test_len])

    # The dataloader for training, validation and testing dataset
    train_dataloader = DataLoader(dataset=meme_train,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=4)
    val_dataloader = DataLoader(dataset=meme_val,
                                batch_size=batch_size,
                                shuffle=True,
                                num_workers=4)
    # test_dataloader = DataLoader(dataset=meme_test, batch_size=4,
    #     shuffle=True, num_workers=4)

    # dataloaders_dict = {'train': train_dataloader, 'val': val_dataloader, 'test': test_dataloader}
    dataloaders_dict = {'train': train_dataloader, 'val': val_dataloader}

    return dataloaders_dict
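
A hypothetical call to get_dataloaders() above (the CSV and image paths are placeholders):

dataloaders = get_dataloaders(data_path='data/data_7000_new.csv',
                              img_path='data/images/',
                              batch_size=4,
                              split_seq=[0.8, 0.2])
for phase in ['train', 'val']:
    for batch in dataloaders[phase]:
        print(phase, batch['image'].shape)  # [4, 3, 299, 299] for full batches
        break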
Example #14
print("Initializing Datasets and Dataloaders...")
# trial_meme_dataset_transformed = MemeDataset(
# csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
# image_dir=os.path.join(os.getcwd(),
#     '../data/semeval-2020_trialdata/Meme_images/'),
# transform=transforms.Compose(
#     [ResizeSample(size=(224, 224)),
#     ToTensorSample(),
#     NormalizeSample(mean=[0.485, 0.456, 0.406],
#     std=[0.229, 0.224, 0.225])]))

trial_meme_dataset_transformed = MemeDataset(
    csv_file=os.path.join(os.getcwd(), '../data/data_7000_new.csv'),
    image_dir=os.path.join(
        os.getcwd(), '../data/memotion_analysis_training_data/data_7000/'),
    transform=transforms.Compose([
        ResizeSample(size=(224, 224)),
        ToTensorSample(),
        NormalizeSample(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]))

trial_meme_train, trial_meme_val, _ = random_split(
    dataset=trial_meme_dataset_transformed, lengths=[5988, 1000, 3])

# Create training and validation dataloaders
sample_weights_train = make_weights_for_balanced_classes(trial_meme_train,
                                                         num_classes=3)
weighted_sampler_train = WeightedRandomSampler(sample_weights_train,
                                               len(sample_weights_train))
train_dataloader = DataLoader(dataset=trial_meme_train,
                              batch_size=4,
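
make_weights_for_balanced_classes() is referenced here (and in the commented-out block of Example #17) but not defined on this page. A common recipe weights each sample by the inverse frequency of its class so that WeightedRandomSampler draws classes roughly uniformly; the sketch below assumes each sample exposes an integer label under 'overall_sentiment_ternary_int' (the label key is an assumption):

import torch


def make_weights_for_balanced_classes(dataset, num_classes,
                                      label_key='overall_sentiment_ternary_int'):
    """Sketch: per-sample weights proportional to the inverse class frequency."""
    # Note: this iterates the whole dataset once to read the labels.
    labels = [int(sample[label_key]) for sample in dataset]
    counts = torch.bincount(torch.tensor(labels), minlength=num_classes).float()
    class_weights = len(labels) / (num_classes * counts.clamp(min=1))
    return [class_weights[label].item() for label in labels]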
Example #15
    set_seed(config['seed'])

    # Tokenize
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    tokenizer_func = partial(tokenizer,
                             max_length=config['max_txt_len'],
                             padding='max_length',
                             truncation=True,
                             return_tensors='pt',
                             return_length=True)

    # Prepare the datasets and dataloaders for training and evaluation
    train_dataset = MemeDataset(
        filepath=os.path.join(config['data_path'], config['train_filename']),
        feature_dir=config['feature_path'],
        text_padding=tokenizer_func,
        filter_text=config["filter_text"],
        upsample_multiplier=config["upsample_multiplier"])
    val_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                    'dev_seen.jsonl'),
                              feature_dir=config['feature_path'],
                              text_padding=tokenizer_func,
                              filter_text=config["filter_text"])
    test_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                     'test_seen.jsonl'),
                               feature_dir=config['feature_path'],
                               text_padding=tokenizer_func,
                               filter_text=config["filter_text"])

    config['train_loader'] = data.DataLoader(
        train_dataset,
Example #16
    set_seed(config['seed'])

    # Tokenize
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    tokenizer_func = partial(tokenizer,
                             max_length=config['max_txt_len'],
                             padding='max_length',
                             truncation=True,
                             return_tensors='pt',
                             return_length=True)

    # Prepare the datasets and dataloaders for training and evaluation
    train_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                      'train.jsonl'),
                                feature_dir=config['feature_path'],
                                debug=True,
                                text_padding=tokenizer_func)
    val_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                    'dev_seen.jsonl'),
                              feature_dir=config['feature_path'],
                              debug=True,
                              text_padding=tokenizer_func)
    test_dataset = MemeDataset(filepath=os.path.join(config['data_path'],
                                                     'test_seen.jsonl'),
                               feature_dir=config['feature_path'],
                               return_ids=True,
                               debug=True,
                               text_padding=tokenizer_func)

    config['train_loader'] = data.DataLoader(
Example #17
def main():
    # Create training and validation datasets
    print("Initializing Datasets and Dataloaders...")
    trial_meme_dataset_transformed = MemeDataset(
        csv_file=os.path.join(os.getcwd(), '../data/data1.csv'),
        image_dir=os.path.join(os.getcwd(),
                               '../data/semeval-2020_trialdata/Meme_images/'),
        transform=transforms.Compose([
            ResizeSample(size=(299, 299)),  # For Inception
            # ResizeSample(size=(224, 224)),  # For other pretrained models
            ToTensorSample(),
            NormalizeSample(mean=[0.485, 0.456, 0.406],
                            std=[0.229, 0.224, 0.225])
        ]))

    trial_meme_train, trial_meme_val = random_split(
        dataset=trial_meme_dataset_transformed, lengths=[800, 200])

    # Create training and validation dataloaders
    # Balanced class============================================================
    # sample_weights_train = make_weights_for_balanced_classes(
    #     trial_meme_train, num_classes=3)
    # weighted_sampler_train = WeightedRandomSampler(
    #     sample_weights_train, len(sample_weights_train))
    # train_dataloader = DataLoader(dataset=trial_meme_train, batch_size=4,
    #     sampler=weighted_sampler_train, num_workers=4)

    # sample_weights_val = make_weights_for_balanced_classes(
    #     trial_meme_val, num_classes=3)
    # weighted_sampler_val = WeightedRandomSampler(
    #     sample_weights_val, len(sample_weights_val))
    # val_dataloader = DataLoader(dataset=trial_meme_val, batch_size=4,
    #     sampler=weighted_sampler_val, num_workers=4)
    # ==========================================================================

    # Imbalanced class==========================================================
    train_dataloader = DataLoader(dataset=trial_meme_train,
                                  batch_size=4,
                                  shuffle=True,
                                  num_workers=4)
    val_dataloader = DataLoader(dataset=trial_meme_val,
                                batch_size=4,
                                shuffle=True,
                                num_workers=4)
    # ==========================================================================

    dataloaders_dict = {'train': train_dataloader, 'val': val_dataloader}

    # Detect if we have a GPU available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    deepsent_config = {
        'num_classes': 3,  # negative, positive, neutral
        'batch_size': 4,
        'vocab_size': 400000,
        'embedding_dim': 300
    }
    deepsent = DeepSentimentModel(**deepsent_config)
    # deepsent = DeepSentimentVanillaModel(**deepsent_config)
    # deepsent = ShallownetGloveModel(**deepsent_config)
    # Send the model to GPU
    deepsent = deepsent.to(device)

    # Gather the parameters to be optimized/updated in this run. If we are
    #  finetuning we will be updating all parameters. However, if we are
    #  using the feature-extraction method, we will only update the parameters
    #  that we have just initialized, i.e. the parameters whose requires_grad
    #  is True.
    feature_extract = True
    params_to_update = deepsent.parameters()
    print("Params to learn:")
    if feature_extract:
        params_to_update = []
        for name, param in deepsent.named_parameters():
            if param.requires_grad:
                params_to_update.append(param)
                print("\t", name)
    else:
        for name, param in deepsent.named_parameters():
            if param.requires_grad:
                print("\t", name)

    # Observe that all parameters are being optimized
    optimizer_ft = optim.SGD(params_to_update, lr=0.001, momentum=0.9)
    # Setup the loss fxn
    criterion = nn.CrossEntropyLoss()

    # Train and evaluate
    deepsent, hist = train_model(model=deepsent,
                                 dataloaders=dataloaders_dict,
                                 criterion=criterion,
                                 optimizer=optimizer_ft,
                                 num_epochs=10,
                                 is_inception=True,
                                 target_label='overall_sentiment_ternary_int')
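
feature_extract = True in Example #17 only controls which parameters are handed to the optimizer; the actual freezing of the pretrained backbone is assumed to happen inside the model classes. The usual pattern, borrowed from the torchvision fine-tuning tutorial this code follows, is a helper like the one below (applied to a hypothetical pretrained submodule, not necessarily the author's exact code):

def set_parameter_requires_grad(model, feature_extracting):
    """Freeze every parameter when doing feature extraction, so only layers
    created afterwards (e.g. a new classifier head) remain trainable."""
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False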