Example #1
        dataset.threshold_data(13, tokenizer=tokenizer)  # drop overly long turns (length threshold)
        dataset.trim_words(3, tokenizer=tokenizer)  # drop rare words (minimum frequency cutoff)
    vocab_dict = dataset.create_vocab_dict(tokenizer)

    # load embeddings from file or set to None (to be randomly initialized)
    if options.embeddings is not None:
        new_emb_file = './cache/new_embs.txt'
        old_emb_file = options.embeddings
        freq_words_file = './cache/freq_words.txt'
        emb_dim = options.emb_dim
        create_emb_file(new_emb_file,
                        old_emb_file,
                        freq_words_file,
                        vocab_dict,
                        most_freq=10000)
        word2idx, idx2word, embeddings = EmbeddingsLoader(
            new_emb_file, emb_dim, extra_tokens=HRED_SPECIAL_TOKENS).load()
    else:
        word2idx, idx2word = word2idx_from_dataset(
            vocab_dict, most_freq=10000, extra_tokens=HRED_SPECIAL_TOKENS)
        embeddings = None
        emb_dim = options.emb_dim

    vocab_size = len(word2idx)
    print("Vocabulary size: {}".format(vocab_size))

    # --- set dataset transforms ---
    tokenizer = DialogSpacyTokenizer(lower=True,
                                     prepend_sos=True,
                                     append_eos=True,
                                     specials=HRED_SPECIAL_TOKENS)
    to_token_ids = ToTokenIds(word2idx, specials=HRED_SPECIAL_TOKENS)
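    # What usually comes next is omitted from this snippet; a minimal sketch,
    # assuming the ToTensor/Compose helpers and apply_transforms seen in the
    # other examples on this page (the dataset may use .map chaining instead):
    to_tensor = ToTensor(device='cpu')
    transforms = Compose([tokenizer, to_token_ids, to_tensor])
    dataset.apply_transforms(transforms)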
Example #2
    trainer = Seq2SeqTrainer(
        model,
        optimizer,
        checkpoint_dir=None,  # '../checkpoints',
        metrics=metrics,
        non_blocking=True,
        retain_graph=True,
        patience=5,
        device=device,
        loss_fn=criterion)
    return trainer


import os
if __name__ == '__main__':
    loader = EmbeddingsLoader('../cache/glove.6B.50d.txt', 50)
    word2idx, _, embeddings = loader.load()

    tokenizer = SpacyTokenizer()
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device='cpu')

    transforms = Compose([tokenizer, to_token_ids, to_tensor])
    dataset = MovieCorpusDataset('../data/', transforms=transforms, train=True)
    #dataset = dataset.map(tokenizer).map(to_token_ids).map(to_tensor)

    if KFOLD:
        cv_scores = []
        import gc
        for train_loader, val_loader in kfold_split(dataset, 32, 128):
            trainer = trainer_factory(embeddings, device=DEVICE)
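            # The rest of the loop body is truncated here; a hedged guess at the
            # usual per-fold pattern (trainer.fit and the cleanup below are
            # assumptions, not confirmed by this snippet):
            trainer.fit(train_loader, val_loader, epochs=10)
            # cv_scores.append(...) would record the fold's validation metric
            del trainer
            gc.collect()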
def load_embeddings(emb_file, emb_dim):
    loader = EmbeddingsLoader(emb_file,
                              emb_dim,
                              extra_tokens=HRED_SPECIAL_TOKENS)
    word2idx, idx2word, embeddings = loader.load()
    return word2idx, idx2word, embeddings
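# Hypothetical usage of the helper above (not part of the original snippet);
# the embeddings path and dimension are placeholders:
#   word2idx, idx2word, embeddings = load_embeddings('./cache/new_embs.txt', 300)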
if __name__ == '__main__':

    ####### Parameters ########
    batch_train = 8
    batch_val = 8
    max_sent_length = 500  # max number of sentences (turns) per transcript, after padding
    max_word_length = 122  # max length of each sentence (turn), after padding
    num_classes = 2
    batch_size = 8
    hidden_size = 300
    epochs = 40
    lexicons = False
    lex_size = 99

    loader = EmbeddingsLoader('/data/embeddings/glove.840B.300d.txt', 300)
    word2idx, idx2word, embeddings = loader.load()
    embeddings = torch.tensor(embeddings)

    with open("avec.pkl", "rb") as handle:
        _file = pickle.load(handle)

    tokenizer = SpacyTokenizer()
    replace_unknowns = ReplaceUnknownToken()
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device=DEVICE)

    train = AVECDataset(_file,
                        max_word_length,
                        transforms=Compose([
                            tokenizer, replace_unknowns, to_token_ids,
Example #5
    def __getitem__(self, idx):
        datum = self.dataset[idx]
        text, target = datum['text'], datum['sentiment']
        target = self.label_encoder.transform([target])[0]
        for t in self.transforms:
            text = t(text)
        return text, target


DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

collate_fn = SequenceClassificationCollator(device='cpu')

if __name__ == '__main__':
    loader = EmbeddingsLoader('../cache/glove.840B.300d.txt', 300)
    word2idx, _, embeddings = loader.load()

    tokenizer = SpacyTokenizer()
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device='cpu')

    def create_dataloader(d):
        d = (DatasetWrapper(d).map(tokenizer).map(to_token_ids).map(to_tensor))
        return DataLoader(d,
                          batch_size=32,
                          num_workers=1,
                          pin_memory=True,
                          shuffle=True,
                          collate_fn=collate_fn)
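    # What typically follows is an assumption, not part of the original snippet:
    # build loaders for pre-split train/dev datasets with the helper above, e.g.
    #   train_loader = create_dataloader(train_set)
    #   dev_loader = create_dataloader(dev_set)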

dataset = MovieCorpusDatasetv2('./data/', transforms=None)
# Preprocess dataset
MIN_COUNT = 3
MAX_LENGTH = 10
dataset.normalize_data()
dataset.threshold_data(MAX_LENGTH, tokenizer=SpacyTokenizer())
dataset.trim_words(MIN_COUNT, tokenizer=SpacyTokenizer())

# Load embeddings
emb_file = './cache/glove.6B.300d.txt'
new_emb_file = './cache/new_embs.txt'

create_emb_file(new_emb_file, emb_file, dataset.word2count)
loader = EmbeddingsLoader(new_emb_file, 300, extra_tokens=SPECIAL_TOKENS)
word2idx, idx2word, embeddings = loader.load()


# look up the sos, eos and pad token indices
pad_index = word2idx[SPECIAL_TOKENS.PAD.value]
bos_index = word2idx[SPECIAL_TOKENS.BOS.value]
eos_index = word2idx[SPECIAL_TOKENS.EOS.value]

# apply transforms to dataset
tokenizer = SpacyTokenizer(append_eos=True, specials=SPECIAL_TOKENS)
to_token_ids = ToTokenIds(word2idx)
to_tensor = ToTensor(device='cpu')
transforms = Compose([tokenizer, to_token_ids, to_tensor])
dataset.apply_transforms(transforms)
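
# One possible next step (an assumption, not shown in the original): batch the
# transformed dataset with plain PyTorch padding. This sketch assumes each item
# is a single 1-D LongTensor of token ids; a dialog dataset that yields
# (source, target) pairs would need to pad each side separately.
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def pad_collate(batch):
    # pad every sequence in the batch up to the longest one, using pad_index
    return pad_sequence(batch, batch_first=True, padding_value=pad_index)

loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)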