def create_dataset(path_to_dataset,batch_size,split_ratio=0.7,min_vocab_freq=10,max_vocab_size=4000):
	text_field = Field(tokenize="spacy",tokenizer_language="en",batch_first=True,init_token="<sos>",eos_token="<eos>",lower=True)

	def transform(caption):
		caption = caption.strip().lower().split()
		return caption

	dataset = CocoCaptions(annFile=os.path.join(path_to_dataset,"captions_train2014.json"),text_field=text_field,transform=transform)
	train,val = dataset.split(split_ratio=split_ratio)
	test = CocoCaptions(annFile=os.path.join(path_to_dataset,"captions_val2014.json"),text_field=text_field,transform=transform)

	print("Dataset loaded")
	print("Train set size:",len(train))

	text_field.build_vocab(dataset.text,min_freq=min_vocab_freq,max_size=max_vocab_size)
	SOS_TOKEN = text_field.vocab.stoi['<sos>']
	EOS_TOKEN = text_field.vocab.stoi['<eos>']
	UNK_TOKEN = text_field.vocab.stoi['<unk>']
	PAD_TOKEN = text_field.vocab.stoi['<pad>']

	print("Vocabuly build")

	print("Vocabuly statistics")

	print("\nMost common words in the vocabulary:\n",text_field.vocab.freqs.most_common(10))
	print("Size of the vocabulary:",len(text_field.vocab))
	print("Max sequence lenght",dataset.max_seq_len)

	train_iter,val_iter = BucketIterator.splits((train,val),repeat=False,batch_size=batch_size)
	test_iter = BucketIterator(test,batch_size=batch_size,repeat=False,train=False)
	vocab_dict = text_field.vocab.stoi
	return {"data_iters":(train_iter,val_iter,test_iter),"fields":text_field,
	"word_to_num_vocab":vocab_dict,"num_to_word_vocab":{y:x for x,y in vocab_dict.items()},
	"num_classes":len(text_field.vocab),"tokens":(SOS_TOKEN,EOS_TOKEN,UNK_TOKEN,PAD_TOKEN),"max_seq_len":dataset.max_seq_len}
Example #2
def load_dataset(file_name):
    """Loads contents from a file in the *data* directory into a
    torchtext.data.TabularDataset instance.
    """
    file_path = join(DATA_DIR, file_name)
    text_field = Field(pad_token=None, tokenize=_tokenize_str)

    dataset = TabularDataset(
        path=file_path,
        format='csv',
        fields=[('text', text_field)])

    text_field.build_vocab(dataset)
    return dataset
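A brief usage sketch, assuming a CSV file with a single text column exists in the data directory (the file name here is hypothetical):

dataset = load_dataset("quotes.csv")
text_field = dataset.fields["text"]
print("Vocabulary size:", len(text_field.vocab))
print("Most common tokens:", text_field.vocab.freqs.most_common(5))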
Example #3
def load_dataset(batch_size):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)
    train_iter, val_iter, test_iter = BucketIterator.splits(
            (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
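A short usage sketch for this load_dataset; because both fields are built with include_lengths=True, each batch attribute is a (tensor, lengths) pair:

train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=32)
print("German vocab size:", len(DE.vocab))
print("English vocab size:", len(EN.vocab))

for batch in train_iter:
    src, src_lengths = batch.src  # German side
    trg, trg_lengths = batch.trg  # English side
    break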
#EN = Field(tokenize=tokenize_en,batch_first=True,init_token="<SOS>",eos_token="<EOS>")
#DE = Field(tokenize=tokenize_de,batch_first=True,init_token="<SOS>",eos_token="<EOS>")
EN = Field(tokenize="spacy",tokenizer_language="en",batch_first=True,init_token="<SOS>",eos_token="<EOS>")
DE = Field(tokenize="spacy",tokenizer_language="de",batch_first=True,init_token="<SOS>",eos_token="<EOS>")

# multi30k dataloader
train,val,test = datasets.Multi30k.splits(exts=(".en",".de"),fields=(EN,DE),root=data_path)

# wmt14 dataloader (preferred over datasets.WMT14.splits, which is slow)
#train,val,test = datasets.TranslationDataset.splits(exts=(".en",".de"),fields=[("src",EN),("trg",DE)],path=os.path.join(data_path,"wmt14"),
#	train="train.tok.clean.bpe.32000",validation="newstest2013.tok.bpe.32000",test="newstest2014.tok.bpe.32000")

print("Dataset loaded")

EN.build_vocab(train.src,min_freq=3)
DE.build_vocab(train.trg,max_size=50000)

print("Vocabularies build")

train_iter,val_iter = BucketIterator.splits((train, val),batch_size=3)
test_iter = BucketIterator(test,batch_size=3)

print("Start iterating through data")

for i,batch in enumerate(train_iter):
	print(batch.src) # the source language
	print(batch.trg) # the target language
	break

for i,batch in enumerate(val_iter):
	print(batch.src) # the source language
	print(batch.trg) # the target language
	break
Example #5
class WordToPhonemeModel:
    '''Contains pytorch model for converting words to phonemes.'''

    UNK_TOKEN = '<unk>'
    PAD_TOKEN = '<pad>'
    SOS_TOKEN = '<sos>'
    EOS_TOKEN = '<eos>'

    def __init__(self, model_dir=None, device=None, **load_kwargs):
        self._logger = logging.getLogger(__class__.__name__)
        self.device = device
        self.best_test_loss = float('inf')

        if model_dir is not None:
            self.load_model(model_dir, **load_kwargs)

    def word2phonemes(self, word: str, lower: bool = True):
        if lower:
            word = word.lower()

        tokenized = WordToPhonemeModel.tokenize_word(word)
        tokenized = [WordToPhonemeModel.SOS_TOKEN
                     ] + tokenized + [WordToPhonemeModel.EOS_TOKEN]
        numericalized = [self.src_field.vocab.stoi[t] for t in tokenized]

        src = torch.LongTensor(numericalized).unsqueeze(1).to(self.device)
        self.model.eval()
        output = self.model(src, None, teacher_forcing_ratio=0)[1:]

        predicted = torch.argmax(output.squeeze(1), 1)
        tokenized = [
            self.trg_field.vocab.itos[int(i)] for i in predicted
            if i != self.eos_idx
        ]

        return tokenized

    # -------------------------------------------------------------------------

    def load_dataset(self, csv_path: str, lower=True) -> None:
        '''Loads a CSV dataset whose rows are WORD,PH ON EM ES (a word followed by its space-separated phonemes)'''
        self.src_field = Field(tokenize=WordToPhonemeModel.tokenize_word,
                               init_token=WordToPhonemeModel.SOS_TOKEN,
                               eos_token=WordToPhonemeModel.EOS_TOKEN,
                               lower=lower)

        self.trg_field = Field(tokenize=WordToPhonemeModel.tokenize_phonemes,
                               init_token=WordToPhonemeModel.SOS_TOKEN,
                               eos_token=WordToPhonemeModel.EOS_TOKEN,
                               lower=lower)

        self._logger.debug(f'Loading dataset from {csv_path}')
        self.dataset = TabularDataset(path=csv_path,
                                      format='csv',
                                      fields=[('src', self.src_field),
                                              ('trg', self.trg_field)])

        self.train_data, self.test_data = self.dataset.split()
        self._logger.debug(f'Training examples: {len(self.train_data)}')
        self._logger.debug(f'Testing examples: {len(self.test_data)}')

        self._logger.debug(
            f'Building vocabulary from {len(self.train_data)} example(s)')
        self.src_field.build_vocab(self.train_data, min_freq=1)
        self.trg_field.build_vocab(self.train_data, min_freq=1)

        self.model = self._make_model()
        self._logger.debug(self.model)

    # -------------------------------------------------------------------------

    def save_vocabulary(self, model_dir):
        # Source
        with open(os.path.join(model_dir, 'src-vocab.txt'),
                  'w') as src_vocab_file:
            for symbol in self.src_field.vocab.itos:
                print(symbol, file=src_vocab_file)

        with open(os.path.join(model_dir, 'src-freqs.txt'),
                  'w') as src_freq_file:
            for symbol, count in self.src_field.vocab.freqs.items():
                print(symbol, count, file=src_freq_file)

        # Target
        with open(os.path.join(model_dir, 'trg-vocab.txt'),
                  'w') as trg_vocab_file:
            for symbol in self.trg_field.vocab.itos:
                print(symbol, file=trg_vocab_file)

        with open(os.path.join(model_dir, 'trg-freqs.txt'),
                  'w') as trg_freq_file:
            for symbol, count in self.trg_field.vocab.freqs.items():
                print(symbol, count, file=trg_freq_file)

        self._logger.debug(f'Saved vocabulary to {model_dir}')

    # -------------------------------------------------------------------------

    def train(self,
              epochs,
              save_path,
              load_previous=True,
              clip=10,
              batch_size=128):
        save_dir = os.path.split(save_path)[0]
        os.makedirs(save_dir, exist_ok=True)

        if load_previous and os.path.exists(save_path):
            self._logger.debug(f'Loading model state from {save_path}')
            self.model.load_state_dict(torch.load(save_path))

        train_iterator, test_iterator = BucketIterator.splits(
            (self.train_data, self.test_data),
            batch_size=batch_size,
            device=self.device,
            sort_key=lambda x: len(x.src))

        optimizer = optim.Adam(self.model.parameters())
        trg_pad_idx = self.trg_field.vocab.stoi[WordToPhonemeModel.PAD_TOKEN]
        criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)

        # Training loop
        self._logger.debug(f'Beginning training for {epochs} epoch(s)')
        for epoch in range(epochs):
            train_loss = self._train_iter(train_iterator, optimizer, criterion,
                                          clip)
            test_loss = self._evaluate_iter(test_iterator, criterion)

            if test_loss < self.best_test_loss:
                # Save model if better
                self.best_test_loss = test_loss
                torch.save(self.model.state_dict(), save_path)

            self._logger.debug(
                f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |'
            )

        # Save model
        torch.save(self.model.state_dict(), save_path)
        self._logger.info(save_path)

        return self.best_test_loss

    # -------------------------------------------------------------------------

    def _train_iter(self, iterator, optimizer, criterion, clip) -> float:
        self.model.train()
        epoch_loss = 0
        for i, batch in enumerate(iterator):
            src = batch.src
            trg = batch.trg

            optimizer.zero_grad()

            output = self.model(src, trg)

            #trg = [sent len, batch size]
            #output = [sent len, batch size, output dim]

            #reshape to:
            #trg = [(sent len - 1) * batch size]
            #output = [(sent len - 1) * batch size, output dim]

            loss = criterion(output[1:].view(-1, output.shape[2]),
                             trg[1:].view(-1))
            loss.backward()

            torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            optimizer.step()

            epoch_loss += loss.item()

        return epoch_loss / len(iterator)

    def _evaluate_iter(self, iterator, criterion):
        self.model.eval()
        epoch_loss = 0

        with torch.no_grad():
            for i, batch in enumerate(iterator):
                src = batch.src
                trg = batch.trg

                output = self.model(src, trg, 0)  #turn off teacher forcing

                loss = criterion(output[1:].view(-1, output.shape[2]),
                                 trg[1:].view(-1))
                epoch_loss += loss.item()

        return epoch_loss / len(iterator)

    # -------------------------------------------------------------------------

    def load_model(self, model_dir, lower=True, no_state=False):
        self._logger.debug(f'Loading vocabulary files from {model_dir}')
        self.src_field = Field(tokenize=WordToPhonemeModel.tokenize_word,
                               init_token=WordToPhonemeModel.SOS_TOKEN,
                               eos_token=WordToPhonemeModel.EOS_TOKEN,
                               lower=lower)

        self.src_field.vocab = WordToPhonemeModel.load_vocab(
            os.path.join(model_dir, 'src-freqs.txt'))

        self.trg_field = Field(tokenize=WordToPhonemeModel.tokenize_phonemes,
                               init_token=WordToPhonemeModel.SOS_TOKEN,
                               eos_token=WordToPhonemeModel.EOS_TOKEN,
                               lower=lower)

        self.trg_field.vocab = WordToPhonemeModel.load_vocab(
            os.path.join(model_dir, 'trg-freqs.txt'))

        self.model = self._make_model()
        self._logger.debug(self.model)

        if not no_state:
            state_path = os.path.join(model_dir, 'g2p-model.pt')
            if os.path.exists(state_path):
                self._logger.debug(f'Loading model state from {state_path}')
                self.model.load_state_dict(torch.load(state_path))
            else:
                self._logger.warning(
                    f'Missing model state file at {state_path}!')

    # -------------------------------------------------------------------------

    def _make_model(self):
        input_dim = len(self.src_field.vocab)
        output_dim = len(self.trg_field.vocab)
        enc_emb_dim = 256
        dec_emb_dim = 256
        hid_dim = 512
        enc_dropout = 0.5
        dec_dropout = 0.5

        self.sos_idx = self.trg_field.vocab.stoi[WordToPhonemeModel.SOS_TOKEN]
        self.eos_idx = self.trg_field.vocab.stoi[WordToPhonemeModel.EOS_TOKEN]
        self.pad_idx = self.src_field.vocab.stoi[WordToPhonemeModel.PAD_TOKEN]

        enc = Encoder(input_dim, enc_emb_dim, hid_dim, enc_dropout)
        dec = Decoder(output_dim, dec_emb_dim, hid_dim, dec_dropout)

        return Seq2Seq(enc, dec, self.pad_idx, self.sos_idx, self.eos_idx,
                       self.device)

    # -------------------------------------------------------------------------

    @classmethod
    def tokenize_word(cls, word):
        return list(word)

    @classmethod
    def tokenize_phonemes(cls, text):
        return re.split(r'\s+', text)

    @classmethod
    def load_vocab(cls, freqs_path):
        counter = Counter()
        with open(freqs_path, 'r') as freqs_file:
            for line in freqs_file:
                name, freq = re.split(r'\s+', line.strip(), maxsplit=1)
                counter[name] = int(freq)

        return Vocab(counter,
                     specials=[
                         WordToPhonemeModel.UNK_TOKEN,
                         WordToPhonemeModel.PAD_TOKEN,
                         WordToPhonemeModel.SOS_TOKEN,
                         WordToPhonemeModel.EOS_TOKEN
                     ])
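A usage sketch for WordToPhonemeModel under stated assumptions: the CSV path, output directory, and device are placeholders, and Encoder, Decoder and Seq2Seq must be importable as the class above expects.

g2p = WordToPhonemeModel(device=torch.device("cpu"))
g2p.load_dataset("g2p/lexicon.csv")                # hypothetical WORD,PH ON EM ES file
g2p.train(epochs=5, save_path="g2p/g2p-model.pt")  # keeps the best state dict
g2p.save_vocabulary("g2p")
print(g2p.word2phonemes("hello"))                  # list of predicted phoneme strings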
Example #6
        super().__init__(examples, fields)


# In[9]:


train=MyDataset(trainset,text_field=TEXT,label_field=LABEL,test=False)
valid=MyDataset(validset,text_field=TEXT,label_field=LABEL,test=False)


# In[10]:


from torchtext.vocab import Vectors
vectors=Vectors(name='./sgns.sogou.word') # use pre-trained word vectors (300-dimensional)
TEXT.build_vocab(train, vectors=vectors) # build the vocabulary
LABEL.build_vocab(train)


# In[11]:


import torch
from torchtext.data import BucketIterator
batchsize=64
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter = data.BucketIterator(dataset=train, batch_size=batchsize, 
        shuffle=True, sort_key=lambda x: len(x.text),
        device=DEVICE,sort_within_batch=False, repeat=False)
valid_iter = data.BucketIterator(dataset=valid, batch_size=batchsize, 
        shuffle=True, sort_key=lambda x: len(x.text),
Example #7

german = Field(tokenize=tokenizer_ger,
               lower=True,
               init_token="<sos>",
               eos_token="<eos>")
english = Field(tokenize=tokenizer_eng,
                lower=True,
                init_token="<sos>",
                eos_token="<eos>")

train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                    fields=(german, english),
                                                    root='data')

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
Example #8
        fields_dataset = [("query_title", TEXT), ("query_description", TEXT),
                          ("doc_text", TEXT), ("label", LABEL)]
        train_data = Dataset(torch_examples, fields_dataset)
        save_examples(train_data, "../traindata.json")
        exit(0)
    else:
        TEXT = Field(tokenize=tokenize_en,
                     batch_first=True,
                     include_lengths=True)
        LABEL = LabelField(dtype=torch.float, batch_first=True)
        fields_dataset = [("query_title", TEXT), ("query_description", TEXT),
                          ("doc_text", TEXT), ("label", LABEL)]
        train_data = Dataset(
            load_examples("../traindata.json", fields_dataset), fields_dataset)
    print("build_vocabulary...")
    TEXT.build_vocab(train_data, min_freq=1, vectors="glove.6B.300d")
    LABEL.build_vocab(train_data)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("build_iterator...")
    train_iterator, valid_iterator = BucketIterator.splits(
        (train_data, train_data),
        batch_size=64,
        sort_key=lambda x: len(x.doc_text),
        sort_within_batch=False,
        device=device)

    size_of_vocab = len(TEXT.vocab)
    embedding_dim = 300
    num_hidden_nodes = 128
    num_layers = 2
    num_output_nodes = 1
        src_field = Field(tokenize=english_tokenizer,
                          init_token='<sos>',
                          eos_token='<eos>',
                          lower=True)
        #initialize the field for trg language
        trg_field = Field(tokenize=hindi_tokenizer,
                          init_token='<sos>',
                          eos_token='<eos>',
                          lower=True)
        train_data, valid_data, test_data = load_datasets(
            model_config['global']['dataset_path'],
            model_config['global']['dataset_file_names'],
            model_config['global']['translate_pair'],
            model_config['global']['lang_extensions'], [src_field, trg_field])
        #initialize the vocabulary
        src_field.build_vocab(train_data, min_freq=1)
        trg_field.build_vocab(train_data, min_freq=1)
        #display dataset stats
        print_dataset_statistics(train_data, valid_data, test_data,
                                 model_config['global']['lang_extensions'],
                                 [src_field, trg_field])
        train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
            (train_data, valid_data, test_data),
            batch_size=model_config['global']['batch_size'],
            device=device)

        cache_file_name = "%s-%s-%s-epoch-%s.pt" % (
            model_config['global']['name'],
            model_config['global']['lang_extensions'][0],
            model_config['global']['lang_extensions'][1],
            model_config['global']['epochs'])
Example #10
    lower=True,
    fix_length=MAX_LEN,
    postprocessing=filter_low_freq_words)
LABEL = Field(sequential=False, use_vocab=False)

# 2. Build the tabular dataset
ds_train, ds_test = TabularDataset.splits(path='./data/',
                                          train='train.tsv',
                                          test='test.tsv',
                                          format='tsv',
                                          fields=[('label', LABEL),
                                                  ('text', TEXT)],
                                          skip_header=False)

# 3. Build the vocabulary
TEXT.build_vocab(ds_train)

# 4. Build the data pipeline iterators
train_iter, test_iter = Iterator.splits((ds_train, ds_test),
                                        sort_within_batch=True,
                                        sort_key=lambda x: len(x.text),
                                        batch_sizes=(BATCH_SIZE, BATCH_SIZE),
                                        device='cuda:4')


# Wrap the data pipeline so it yields (features, label) pairs, similar to torch.utils.data.DataLoader
class DataLoader:
    def __init__(self, data_iter):
        self.data_iter = data_iter
        self.length = len(data_iter)
Example #11
    def iters(cls,
              lower=True,
              example_mode='sentence',
              use_wiki=False,
              n_wiki_sentences=5,
              replace_title_mentions='',
              batch_size=128,
              device=-1,
              root='.data',
              vectors='glove.6B.300d',
              unigrams=True,
              bigrams=False,
              trigrams=False,
              combined_ngrams=True,
              combined_max_vocab_size=None,
              unigram_max_vocab_size=None,
              bigram_max_vocab_size=None,
              trigram_max_vocab_size=None,
              **kwargs):
        QNUM = LongField()
        SENT = LongField()
        PAGE = Field(sequential=False, tokenize=str_split)
        if combined_ngrams:
            tokenizer = create_qb_tokenizer(unigrams=unigrams,
                                            bigrams=bigrams,
                                            trigrams=trigrams)
            TEXT = QBTextField(batch_first=True,
                               tokenize=tokenizer,
                               include_lengths=True,
                               lower=lower)
            train, val, dev = cls.splits(
                qnum_field=QNUM,
                sent_field=SENT,
                text_field=TEXT,
                page_field=PAGE,
                root=root,
                example_mode=example_mode,
                use_wiki=use_wiki,
                n_wiki_sentences=n_wiki_sentences,
                replace_title_mentions=replace_title_mentions,
                **kwargs)
            TEXT.build_vocab(train,
                             vectors=vectors,
                             max_size=combined_max_vocab_size)
            PAGE.build_vocab(train)
        else:
            if unigrams:
                unigram_tokenizer = create_qb_tokenizer(unigrams=True,
                                                        bigrams=False,
                                                        trigrams=False)
                UNIGRAM_TEXT = QBTextField(batch_first=True,
                                           tokenize=unigram_tokenizer,
                                           include_lengths=True,
                                           lower=lower)
            else:
                UNIGRAM_TEXT = None

            if bigrams:
                bigram_tokenizer = create_qb_tokenizer(unigrams=False,
                                                       bigrams=True,
                                                       trigrams=False)
                BIGRAM_TEXT = QBTextField(batch_first=True,
                                          tokenize=bigram_tokenizer,
                                          include_lengths=True,
                                          lower=lower)
            else:
                BIGRAM_TEXT = None

            if trigrams:
                trigram_tokenizer = create_qb_tokenizer(unigrams=False,
                                                        bigrams=False,
                                                        trigrams=True)
                TRIGRAM_TEXT = QBTextField(batch_first=True,
                                           tokenize=trigram_tokenizer,
                                           include_lengths=True,
                                           lower=lower)
            else:
                TRIGRAM_TEXT = None

            train, val, dev = cls.splits(
                qnum_field=QNUM,
                sent_field=SENT,
                page_field=PAGE,
                unigram_field=UNIGRAM_TEXT,
                bigram_field=BIGRAM_TEXT,
                trigram_field=TRIGRAM_TEXT,
                root=root,
                example_mode=example_mode,
                use_wiki=use_wiki,
                n_wiki_sentences=n_wiki_sentences,
                replace_title_mentions=replace_title_mentions,
                **kwargs)
            if UNIGRAM_TEXT is not None:
                UNIGRAM_TEXT.build_vocab(train,
                                         vectors=vectors,
                                         max_size=unigram_max_vocab_size)
            if BIGRAM_TEXT is not None:
                BIGRAM_TEXT.build_vocab(train, max_size=bigram_max_vocab_size)
            if TRIGRAM_TEXT is not None:
                TRIGRAM_TEXT.build_vocab(train,
                                         max_size=trigram_max_vocab_size)
            PAGE.build_vocab(train)

        return BucketIterator.splits((train, val, dev),
                                     batch_size=batch_size,
                                     device=-1,
                                     repeat=False)
Example #12
for src_line, trg_line in zip(src_file, trg_file):
    src_line, trg_line = src_line.strip(), trg_line.strip()
    if src_line != '' and trg_line != '':
        # TODO: note how the fields argument is used below
        temp = data.Example.fromlist([src_line, trg_line], fields)
        examples.append(temp)

print(vars(examples[0]))
# Build a Dataset from the collected examples
allData = tDataset(examples, fields)
trainData, validData, testData = allData.split(
    split_ratio=[0.8, 0.1, 0.1])  # then split the data into three parts

# Build the vocabularies from the training data (trainData) only
SRC.build_vocab(trainData)
TRG.build_vocab(trainData)

BATCH_SIZE = 12
device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (trainData, validData, testData),
    batch_size=BATCH_SIZE,
    sort=False,
    device=device)

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
Example #13
def test_single_gpu_batch_parse():
    trainer = Trainer(gpus=1)
    trainer.accelerator_backend = GPUAccelerator(trainer)

    # non-transferrable types
    primitive_objects = [
        None, {}, [], 1.0, "x", [None, 2], {
            "x": (1, 2),
            "y": None
        }
    ]
    for batch in primitive_objects:
        data = trainer.accelerator_backend.batch_to_device(
            batch, torch.device('cuda:0'))
        assert data == batch

    # batch is just a tensor
    batch = torch.rand(2, 3)
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch.device.index == 0 and batch.type() == 'torch.cuda.FloatTensor'

    # tensor list
    batch = [torch.rand(2, 3), torch.rand(2, 3)]
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0].device.index == 0 and batch[0].type(
    ) == 'torch.cuda.FloatTensor'
    assert batch[1].device.index == 0 and batch[1].type(
    ) == 'torch.cuda.FloatTensor'

    # tensor list of lists
    batch = [[torch.rand(2, 3), torch.rand(2, 3)]]
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type(
    ) == 'torch.cuda.FloatTensor'
    assert batch[0][1].device.index == 0 and batch[0][1].type(
    ) == 'torch.cuda.FloatTensor'

    # tensor dict
    batch = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}]
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0]['a'].device.index == 0 and batch[0]['a'].type(
    ) == 'torch.cuda.FloatTensor'
    assert batch[0]['b'].device.index == 0 and batch[0]['b'].type(
    ) == 'torch.cuda.FloatTensor'

    # tuple of tensor list and list of tensor dict
    batch = ([torch.rand(2, 3) for _ in range(2)], [{
        'a': torch.rand(2, 3),
        'b': torch.rand(2, 3)
    } for _ in range(2)])
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0][0].device.index == 0 and batch[0][0].type(
    ) == 'torch.cuda.FloatTensor'

    assert batch[1][0]['a'].device.index == 0
    assert batch[1][0]['a'].type() == 'torch.cuda.FloatTensor'

    assert batch[1][0]['b'].device.index == 0
    assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor'

    # namedtuple of tensor
    BatchType = namedtuple('BatchType', ['a', 'b'])
    batch = [
        BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2)
    ]
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))
    assert batch[0].a.device.index == 0
    assert batch[0].a.type() == 'torch.cuda.FloatTensor'

    # non-Tensor that has `.to()` defined
    class CustomBatchType:
        def __init__(self):
            self.a = torch.rand(2, 2)

        def to(self, *args, **kwargs):
            self.a = self.a.to(*args, **kwargs)
            return self

    batch = trainer.accelerator_backend.batch_to_device(
        CustomBatchType(), torch.device('cuda:0'))
    assert batch.a.type() == 'torch.cuda.FloatTensor'

    # torchtext.data.Batch
    samples = [{
        'text': 'PyTorch Lightning is awesome!',
        'label': 0
    }, {
        'text': 'Please make it work with torchtext',
        'label': 1
    }]

    text_field = Field()
    label_field = LabelField()
    fields = {'text': ('text', text_field), 'label': ('label', label_field)}

    examples = [Example.fromdict(sample, fields) for sample in samples]
    dataset = Dataset(examples=examples, fields=fields.values())

    # Batch runs field.process(), which numericalizes tokens, so the vocabulary must be built first
    text_field.build_vocab(dataset)
    label_field.build_vocab(dataset)

    batch = Batch(data=examples, dataset=dataset)
    batch = trainer.accelerator_backend.batch_to_device(
        batch, torch.device('cuda:0'))

    assert batch.text.type() == 'torch.cuda.LongTensor'
    assert batch.label.type() == 'torch.cuda.LongTensor'
Example #14
            eos_token='<eos>',
            lower=True)

train_data, valid_data, test_data = TranslationDataset.splits(
    path='IITB_small',
    validation='dev',
    exts=('.en', '.hi'),
    fields=(SRC, TRG))

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

vars(train_data.examples[0])

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2, specials=['<pad>', '<sop>', '<eop>'])

print(f"Unique tokens in source (en) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (hi) vocabulary: {len(TRG.vocab)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

device

BATCH_SIZE = 2

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)
"""# EnCoder Parameters"""
                                  'val')
    from_txt_to_dataframe_and_csv('toy-revert', 'src-test.txt', 'tgt-test.txt',
                                  'test')

    data_fields = [('src', TEXT), ('trg', TRG_TEXT)]
    # load the dataset in csv format
    train_data, valid_data, test_data = TabularDataset.splits(
        path='toy-revert',
        train='train.csv',
        validation='val.csv',
        test='test.csv',
        format='csv',
        fields=data_fields,
        skip_header=True)

    TEXT.build_vocab(train_data)
    TRG_TEXT.build_vocab(train_data)
    SRC, TRG = TEXT, TRG_TEXT

    BATCH_SIZE = 128

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=BATCH_SIZE,
        sort_within_batch=True,
        sort_key=lambda x: len(x.src),
        device=device)

    #######################################
Example #16
class SequenceDataLoader(CommonDataLoader):

    def __init__(self, data_config):
        super(SequenceDataLoader, self).__init__(data_config)
        self.__build_field()
        self._load_data()

        pass

    def __build_field(self):
        self.TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenizer, include_lengths=True)
        self.TAG = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True)
        self._fields = [
            ('text', self.TEXT), ('tag', self.TAG)
        ]
        self._fields_test = [('text', self.TEXT)]
        pass

    @timeit
    def _load_data(self):
        self.train_data = REDataset(path=self._config.data.chip_relation.train_path, fields=self._fields)
        self.valid_data = REDataset(path=self._config.data.chip_relation.valid_path, fields=self._fields)
        self.test_data = REDataset(path=self._config.data.chip_relation.test_path, fields=self._fields_test)
        self.__build_vocab(self.train_data, self.valid_data, self.test_data)
        self.__build_iterator(self.train_data, self.valid_data, self.test_data)
        pass

    def __build_vocab(self, *dataset):
        """
        :param dataset: train_data, valid_data, test_data
        :return: text_vocab, tag_vocab
        """
        self.TEXT.build_vocab(*dataset)
        self.TAG.build_vocab(*dataset[:-1])
        self.word_vocab = self.TEXT.vocab
        self.tag_vocab = self.TAG.vocab
        pass

    def __build_iterator(self, *dataset):
        self._train_iter = BucketIterator(
            dataset[0], batch_size=self._config.data.train_batch_size, shuffle=True,
            sort_key=lambda x: len(x.text), sort_within_batch=True, device=self._config.device)

        self._valid_iter = BucketIterator(
            dataset[1], batch_size=self._config.data.train_batch_size, shuffle=False,
            sort_key=lambda x: len(x.text), sort_within_batch=True, device=self._config.device)

        self._test_iter = BucketIterator(
            dataset[2], batch_size=self._config.data.train_batch_size, shuffle=False,
            sort_key=lambda x: len(x.text), sort_within_batch=True, device=self._config.device)

    def load_train(self):
        return self._train_iter

    def load_test(self):
        return self._test_iter

    def load_valid(self):
        return self._valid_iter
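A brief usage sketch for SequenceDataLoader (data_config stands in for the project's configuration object); since TEXT is built with include_lengths=True, batch.text is a (tokens, lengths) pair:

loader = SequenceDataLoader(data_config)
for batch in loader.load_train():
    (tokens, lengths), tags = batch.text, batch.tag
    break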
print('load data')
configfile = open('../config.yaml')
config=AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))
trainSet = TIMIT(config.data.data_root, mode='train')
devSet = TIMIT(config.data.data_root, mode='test')

TEXT = Field(lower=True, include_lengths=True, batch_first=True, unk_token=None)

print('build vocab')
sents = ['iy', 'ix', 'eh', 'ae', 'ax', 'uw', 'uh',
         'ao', 'ey', 'ay', 'oy', 'aw', 'ow', 'er',
         'l', 'r', 'w', 'y', 'm', 'n', 'ng', 'v',
         'f', 'dh', 'th', 'z', 's', 'zh', 'jh', 'ch',
         'b', 'p', 'd', 'dx', 't', 'g', 'k', 'hh', 'h#']
sents = [[i] for i in sents]
TEXT.build_vocab(sents, specials=['<blank>'])
assert config.data.vocabSize == len(TEXT.vocab)
assert config.data.pad_idx == TEXT.vocab.stoi['<pad>']
assert config.data.blank_idx == TEXT.vocab.stoi['<blank>']


def my_collate(batch):
    '''
    inputs: [N,L]
    targets: [N,L]
    '''
    txt_seqs, seqs_len = TEXT.process([item[1] for item in batch]) 
    inputs = txt_seqs[:,:-1]
    targets = txt_seqs[:,1:]
    return {'inputs':inputs, 'targets':targets}
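A sketch of plugging my_collate into a standard PyTorch DataLoader, assuming each TIMIT item is an (audio, phoneme_sequence) pair as the item[1] indexing above implies:

from torch.utils.data import DataLoader

train_loader = DataLoader(trainSet, batch_size=32, shuffle=True, collate_fn=my_collate)
dev_loader = DataLoader(devSet, batch_size=32, shuffle=False, collate_fn=my_collate)

batch = next(iter(train_loader))
print(batch['inputs'].shape, batch['targets'].shape)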
Example #18
class DataLoader():
    """
	This is the dataloader class that takes in a path and return a generator that could be iterated through

	init:
		path: path of the data to read in (assumes CSV format)
		config: a Config object that contains the parameters to be used
		shuffle: whether to shuffle the data or not (true by default)

	"""
    def __init__(self, config, split, type_="train", lang="en"):

        assert config.extension in ["json"]  # only JSON is supported for now

        self.config = config
        self.extension = self.config.extension

        self.max_length = self.config.max_length
        self.max_tweets = self.config.max_tweets

        self.lang = lang
        if self.lang == "zh":
            print("Doing RD for chinese")
            nlp = nlp_chinese

        # <------------ Running some defined functions ----------->

        if type_ == "train":
            # self.data_folder_path = self.config.data_folder + "_{}/".format(split)
            self.data_folder_path = self.config.data_folder
            self.train_file_path = self.config.train_file_path
            self.test_1_file_path = self.config.test_1_file_path
            self.test_2_file_path = self.config.test_2_file_path
            self.run_pipeline()

    def get_data(self, type_, return_id=False):

        assert type_ in ["train", "train_test", "test_1", "test_2", "test"]

        max_batch_size = (self.config.batch_size
                          if type_ == "train" else self.config.batch_size_test)
        data = {
            "train": self.train_batch,
            "train_test": self.train_test_batch,
            "test": self.test_batch,
            "test_1": self.test_1_batch,
            "test_2": self.test_2_batch,
        }[type_]

        for batch in data:

            id_ = getattr(batch, self.config.keys_order["post_id"])

            X = getattr(batch, self.config.keys_order["content"])
            y = getattr(batch, self.config.keys_order["label"])
            structure = getattr(batch, self.config.keys_order["structure"])
            time_delay = getattr(batch, self.config.keys_order["time_delay"])

            # <-------- Getting the sizes --------->
            batch_size, num_articles, num_words, = X.shape

            # <-------- Getting the word_pos tensor --------->
            word_pos = np.repeat(np.expand_dims(np.repeat(np.expand_dims(
                np.arange(num_words), axis=0),
                                                          num_articles,
                                                          axis=0),
                                                axis=0),
                                 batch_size,
                                 axis=0)
            word_pos = torch.from_numpy(word_pos)

            # <-------- Getting the attention_mask vector (for words) --------->
            # The mask has 1 for real tokens and 0 for padding / unknown tokens. Only real tokens + last pad are attended to
            # <pad> has an index of 1

            attention_mask_word = torch.where(
                (X == 1), torch.zeros(1),
                torch.ones(1)).type(torch.FloatTensor)
            check = torch.sum(torch.where((X == 1), torch.ones(1),
                                          torch.zeros(1)),
                              dim=-1)

            # <-------- Getting the attention_mask vector (for posts) --------->
            attention_mask_post = torch.where(
                (check == self.config.max_length), torch.zeros(1),
                torch.ones(1)).type(torch.FloatTensor)

            if batch_size >= len(self.config.gpu_idx):

                if return_id:

                    yield id_, X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post

                else:

                    yield X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post

    @staticmethod
    def clean_text(text):
        """
		This function cleans the text in the following ways:
		1. Replace websites with URL
		1. Replace 's with <space>'s (eg, her's --> her 's)

		"""

        text = re.sub(
            r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+",
            "URL", text)  # Replace urls with special token
        text = text.replace("\'s", "")
        text = text.replace("\'", "")
        text = text.replace("n\'t", " n\'t")
        text = text.replace("@", "")
        text = text.replace("#", "")
        text = text.replace("_", " ")
        text = text.replace("-", " ")
        text = text.replace("&amp;", "")
        text = text.replace("&gt;", "")
        text = text.replace("\"", "")
        text = text.replace(".", "")
        text = text.replace(",", "")
        text = text.replace("(", "")
        text = text.replace(")", "")

        text = ' '.join(text.split())

        return text.strip()

    @staticmethod
    def clean_tokenized_text(text_lst):
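        # Collapses runs of consecutive duplicate tokens; for a hypothetical
        # input, ["go", "go", "go", "home"] becomes ["go", "home"].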

        if len(text_lst) <= 1:
            return text_lst

        idx = 0
        cleaned_token_lst = []

        while idx < len(text_lst) - 1:

            current_token = text_lst[idx]
            next_token = text_lst[idx + 1]

            if current_token != next_token:
                cleaned_token_lst.append(current_token)
                idx += 1

            else:

                last_idx = max([
                    i + idx for i, val in enumerate(text_lst[idx:])
                    if val == current_token
                ]) + 1
                cleaned_token_lst.append(current_token)
                idx = last_idx

        if cleaned_token_lst[-1] != text_lst[-1]:
            cleaned_token_lst.append(text_lst[-1])

        return cleaned_token_lst

    @staticmethod
    def tokenize_structure(structure_lst):

        return structure_lst

    @staticmethod
    def tokenize_text(text):

        text = DataLoader.clean_text(text)
        token_lst = [token.text.lower() for token in nlp(text)]
        token_lst = DataLoader.clean_tokenized_text(token_lst)

        return token_lst

    # Step 1: Define the data fields
    def define_fields(self):

        self.id_field = Field(sequential=False,
                              tokenize=lambda x: x,
                              use_vocab=True)

        self.tweet_field = Field(sequential=True,
                                 tokenize=DataLoader.tokenize_text,
                                 include_lengths=False,
                                 lower=True,
                                 fix_length=self.max_length,
                                 use_vocab=True)

        self.timestamp_field = Field(sequential=False,
                                     include_lengths=False,
                                     use_vocab=False)

        self.structure_field = Field(
            sequential=True,
            tokenize=lambda x: DataLoader.tokenize_structure(x),
            include_lengths=False,
            fix_length=self.config.max_tweets,
            pad_token=self.config.num_structure_index,
            use_vocab=False)

        self.label_field = Field(sequential=False, use_vocab=False)

        self.tweet_lst_field = NestedField(self.tweet_field,
                                           fix_length=self.config.max_tweets)

        self.timestamp_lst_field = NestedField(
            self.timestamp_field,
            pad_token=str(self.config.size),
            fix_length=self.config.max_tweets)

        self.structure_lst_field = NestedField(
            self.structure_field, fix_length=self.config.max_tweets)

        data_fields = {}

        for key, val in self.config.keys_order.items():

            if key == "post_id":
                data_fields[val] = (val, self.id_field)
            if key == "content":
                data_fields[val] = (val, self.tweet_lst_field)
            elif key == "label":
                data_fields[val] = (val, self.label_field)
            elif key == "time_delay":
                data_fields[val] = (val, self.timestamp_lst_field)
            elif key == "structure":
                data_fields[val] = (val, self.structure_lst_field)

        self.data_fields = data_fields

    # Step 2: Reading the data
    def read_data(self, path):

        data = TabularDataset(path=path,
                              format=self.extension,
                              fields=self.data_fields)

        return data

    # Step 3: Building the vectors
    def build_vectors(self):

        # specify the path to the localy saved vectors (Glove in this case)
        vec = vocab.Vectors(name=self.config.glove_file,
                            cache=self.config.glove_directory)

        self.id_field.build_vocab(
            getattr(self.train, self.config.keys_order["post_id"]),
            getattr(self.test_1, self.config.keys_order["post_id"]),
            getattr(self.test_2, self.config.keys_order["post_id"]))

        # Build the vocabulary (for tweets) using the train and test dataset
        self.tweet_field.build_vocab(
            getattr(self.train, self.config.keys_order["content"]),
            getattr(self.test_1, self.config.keys_order["content"]),
            getattr(self.test_2, self.config.keys_order["content"]),
            max_size=self.config.max_vocab,
            vectors=vec)

    # Step 4: Loading the data in batches
    def load_batches(self, dataset, batch_size):

        data = BucketIterator.splits(
            datasets=(dataset, ),  # specify data
            batch_sizes=(batch_size, ),  # batch size
            sort_key=lambda x: len(
                getattr(x, self.config.keys_order["content"])
            ),  # on what attribute the text should be sorted
            sort_within_batch=True,
            repeat=False)
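        # BucketIterator.splits returns one iterator per dataset passed in,
        # so unpack the single iterator for this dataset below.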

        return data[0]

    def load_vocab_vectors(self, vocab):

        self.tweet_field.vocab = vocab

    def run_pipeline(self):
        """
		Pipeline to run all the necessary steps in sequence

		Note: DO NOT CHANGE THE SEQUENCE OF EXECUTION
		"""

        # Step 1 : Define the fields
        self.define_fields()

        # Step 2: Read data
        self.train = self.read_data(
            os.path.join(self.data_folder_path, self.train_file_path))
        self.test_1 = self.read_data(
            os.path.join(self.data_folder_path, self.test_1_file_path))
        self.test_2 = self.read_data(
            os.path.join(self.data_folder_path, self.test_2_file_path))

        # Step 3: Building the vectors
        self.build_vectors()

        # Step 4: Batching the data
        self.train_batch = self.load_batches(self.train,
                                             self.config.batch_size)
        self.train_test_batch = self.load_batches(self.train,
                                                  self.config.batch_size_test)
        self.test_1_batch = self.load_batches(self.test_1,
                                              self.config.batch_size_test)
        self.test_2_batch = self.load_batches(self.test_2,
                                              self.config.batch_size_test)
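A usage sketch for this DataLoader; config stands in for the project's Config object and the split value is a placeholder. The loop unpacks the seven-tuple that get_data yields when return_id is False:

dl = DataLoader(config, split=0, type_="train")
for X, y, word_pos, time_delay, structure, mask_word, mask_post in dl.get_data("train"):
    print(X.shape)  # (batch_size, num_articles, num_words)
    break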
Example #19
    def run(self):
        print("Running on", self.a.device)
        self.set_device(self.a.device)

        np.random.seed(self.a.seed)
        torch.manual_seed(self.a.seed)
        torch.backends.cudnn.benchmark = True

        ####################    loading event extraction dataset   ####################
        if self.a.train_ee:
            log('loading event extraction corpus from %s' % self.a.train_ee)

        # both for grounding and ee
        WordsField = Field(lower=True, include_lengths=True, batch_first=True)
        PosTagsField = Field(lower=True, batch_first=True)
        EntityLabelsField = MultiTokenField(lower=False, batch_first=True)
        AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False)
        # only for ee
        LabelField = Field(lower=False, batch_first=True, pad_token='0', unk_token=None)
        EventsField = EventField(lower=False, batch_first=True)

        if self.a.amr:
            colcc = 'amr-colcc'
        else:
            colcc = 'stanford-colcc'
        print(colcc)

        train_ee_set = ACE2005Dataset(path=self.a.train_ee,
                                   fields={"words": ("WORDS", WordsField),
                                           "pos-tags": ("POSTAGS", PosTagsField),
                                           "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                           colcc: ("ADJM", AdjMatrixField),
                                           "golden-event-mentions": ("LABEL", LabelField),
                                           "all-events": ("EVENT", EventsField),
                                           "all-entities": ("ENTITIES", EntitiesField)},
                                   amr=self.a.amr, keep_events=1)

        dev_ee_set = ACE2005Dataset(path=self.a.dev_ee,
                                 fields={"words": ("WORDS", WordsField),
                                         "pos-tags": ("POSTAGS", PosTagsField),
                                         "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                         colcc: ("ADJM", AdjMatrixField),
                                         "golden-event-mentions": ("LABEL", LabelField),
                                         "all-events": ("EVENT", EventsField),
                                         "all-entities": ("ENTITIES", EntitiesField)},
                                 amr=self.a.amr, keep_events=0)

        test_ee_set = ACE2005Dataset(path=self.a.test_ee,
                                  fields={"words": ("WORDS", WordsField),
                                          "pos-tags": ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                          colcc: ("ADJM", AdjMatrixField),
                                          "golden-event-mentions": ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          "all-entities": ("ENTITIES", EntitiesField)},
                                  amr=self.a.amr, keep_events=0)

        if self.a.webd:
            pretrained_embedding = Vectors(self.a.webd, ".", unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
            LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL, vectors=pretrained_embedding)
            EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT, vectors=pretrained_embedding)
        else:
            LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL)
            EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT)

        # add role mask
        self.a.role_mask = event_role_mask(self.a.train_ee, self.a.dev_ee, LabelField.vocab.stoi,
                                           EventsField.vocab.stoi, self.device)

        ####################    loading SR dataset   ####################
        # both for grounding and sr
        if self.a.train_sr:
            log('loading corpus from %s' % self.a.train_sr)

        transform = transforms.Compose([
            transforms.Resize(256),
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(224),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225))])

        vocab_noun = Vocab(os.path.join(self.a.vocab, 'vocab_situation_noun.pkl'), load=True)
        vocab_role = Vocab(os.path.join(self.a.vocab, 'vocab_situation_role.pkl'), load=True)
        vocab_verb = Vocab(os.path.join(self.a.vocab, 'vocab_situation_verb.pkl'), load=True)

        # train_sr_loader = imsitu_loader(self.a.image_dir, self.vocab_noun, self.vocab_role, self.vocab_verb, self.a.imsitu_ontology_file,
        #                             self.a.train_sr, self.a.verb_mapping_file, self.a.role_mapping_file,
        #                             self.a.object_class_map_file, self.a.object_detection_pkl_file,
        #                             self.a.object_detection_threshold,
        #                             transform, self.a.batch, shuffle=self.a.shuffle, num_workers=1)  #self.a.shuffle
        # dev_sr_loader = imsitu_loader(self.a.image_dir, self.vocab_noun, self.vocab_role, self.vocab_verb, self.a.imsitu_ontology_file,
        #                             self.a.dev_sr, self.a.verb_mapping_file, self.a.role_mapping_file,
        #                             self.a.object_class_map_file, self.a.object_detection_pkl_file,
        #                             self.a.object_detection_threshold,
        #                             transform, self.a.batch, shuffle=self.a.shuffle, num_workers=1)
        # test_sr_loader = imsitu_loader(self.a.image_dir, self.vocab_noun, self.vocab_role, self.vocab_verb, self.a.imsitu_ontology_file,
        #                             self.a.test_sr, self.a.verb_mapping_file, self.a.role_mapping_file,
        #                             self.a.object_class_map_file, self.a.object_detection_pkl_file,
        #                             self.a.object_detection_threshold,
        #                             transform, self.a.batch, shuffle=self.a.shuffle, num_workers=1)
        train_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                     LabelField.vocab.stoi, EventsField.vocab.stoi,
                                     self.a.imsitu_ontology_file,
                                     self.a.train_sr, self.a.verb_mapping_file,
                                     self.a.object_class_map_file, self.a.object_detection_pkl_file,
                                     self.a.object_detection_threshold,
                                     transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                     load_object=self.a.add_object, filter_place=self.a.filter_place)
        dev_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                   LabelField.vocab.stoi, EventsField.vocab.stoi,
                                   self.a.imsitu_ontology_file,
                                   self.a.dev_sr, self.a.verb_mapping_file,
                                   self.a.object_class_map_file, self.a.object_detection_pkl_file,
                                   self.a.object_detection_threshold,
                                   transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                   load_object=self.a.add_object, filter_place=self.a.filter_place)
        test_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb,
                                    LabelField.vocab.stoi, EventsField.vocab.stoi,
                                    self.a.imsitu_ontology_file,
                                    self.a.test_sr, self.a.verb_mapping_file,
                                    self.a.object_class_map_file, self.a.object_detection_pkl_file,
                                    self.a.object_detection_threshold,
                                    transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs,
                                    load_object=self.a.add_object, filter_place=self.a.filter_place)


        ####################    loading grounding dataset   ####################
        if self.a.train_grounding:
            log('loading grounding corpus from %s' % self.a.train_grounding)

        # only for grounding
        IMAGEIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True)
        # IMAGEField = SparseField(sequential=False, use_vocab=False, batch_first=True)

        train_grounding_set = GroundingDataset(path=self.a.train_grounding,
                                               img_dir=self.a.img_dir_grounding,
                                               fields={"id": ("IMAGEID", IMAGEIDField),
                                                       "sentence_id": ("SENTID", SENTIDField),
                                                       "words": ("WORDS", WordsField),
                                                       "pos-tags": ("POSTAGS", PosTagsField),
                                                       "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                                       colcc: ("ADJM", AdjMatrixField),
                                                       "all-entities": ("ENTITIES", EntitiesField),
                                                       # "image": ("IMAGE", IMAGEField),
                                                       },
                                               transform=transform,
                                               amr=self.a.amr,
                                               load_object=self.a.add_object,
                                               object_ontology_file=self.a.object_class_map_file,
                                               object_detection_pkl_file=self.a.object_detection_pkl_file_g,
                                               object_detection_threshold=self.a.object_detection_threshold,
                                               )

        dev_grounding_set = GroundingDataset(path=self.a.dev_grounding,
                                             img_dir=self.a.img_dir_grounding,
                                             fields={"id": ("IMAGEID", IMAGEIDField),
                                                     "sentence_id": ("SENTID", SENTIDField),
                                                     "words": ("WORDS", WordsField),
                                                     "pos-tags": ("POSTAGS", PosTagsField),
                                                     "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                                     colcc: ("ADJM", AdjMatrixField),
                                                     "all-entities": ("ENTITIES", EntitiesField),
                                                     # "image": ("IMAGE", IMAGEField),
                                                     },
                                             transform=transform,
                                             amr=self.a.amr,
                                             load_object=self.a.add_object,
                                             object_ontology_file=self.a.object_class_map_file,
                                             object_detection_pkl_file=self.a.object_detection_pkl_file_g,
                                             object_detection_threshold=self.a.object_detection_threshold,
                                             )

        test_grounding_set = GroundingDataset(path=self.a.test_grounding,
                                              img_dir=self.a.img_dir_grounding,
                                              fields={"id": ("IMAGEID", IMAGEIDField),
                                                      "sentence_id": ("SENTID", SENTIDField),
                                                      "words": ("WORDS", WordsField),
                                                      "pos-tags": ("POSTAGS", PosTagsField),
                                                      "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                                      colcc: ("ADJM", AdjMatrixField),
                                                      "all-entities": ("ENTITIES", EntitiesField),
                                                      # "image": ("IMAGE", IMAGEField),
                                                      },
                                              transform=transform,
                                              amr=self.a.amr,
                                              load_object=self.a.add_object,
                                              object_ontology_file=self.a.object_class_map_file,
                                              object_detection_pkl_file=self.a.object_detection_pkl_file_g,
                                              object_detection_threshold=self.a.object_detection_threshold,
                                              )

        ####################    build vocabulary   ####################

        if self.a.webd:
            pretrained_embedding = Vectors(self.a.webd, ".", unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15))
            WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, train_grounding_set.WORDS, dev_grounding_set.WORDS, vectors=pretrained_embedding)
        else:
            WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, train_grounding_set.WORDS, dev_grounding_set.WORDS)
        PosTagsField.build_vocab(train_ee_set.POSTAGS, dev_ee_set.POSTAGS, train_grounding_set.POSTAGS, dev_grounding_set.POSTAGS)
        EntityLabelsField.build_vocab(train_ee_set.ENTITYLABELS, dev_ee_set.ENTITYLABELS,  train_grounding_set.ENTITYLABELS, dev_grounding_set.ENTITYLABELS)

        consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME]
        # print("O label is", consts.O_LABEL)
        consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME]
        # print("O label for AE is", consts.ROLE_O_LABEL)

        dev_ee_set1 = ACE2005Dataset(path=self.a.dev_ee,
                                  fields={"words": ("WORDS", WordsField),
                                          "pos-tags": ("POSTAGS", PosTagsField),
                                          "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                          colcc: ("ADJM", AdjMatrixField),
                                          "golden-event-mentions": ("LABEL", LabelField),
                                          "all-events": ("EVENT", EventsField),
                                          "all-entities": ("ENTITIES", EntitiesField)},
                                  amr=self.a.amr, keep_events=1, only_keep=True)

        test_ee_set1 = ACE2005Dataset(path=self.a.test_ee,
                                   fields={"words": ("WORDS", WordsField),
                                           "pos-tags": ("POSTAGS", PosTagsField),
                                           "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField),
                                           colcc: ("ADJM", AdjMatrixField),
                                           "golden-event-mentions": ("LABEL", LabelField),
                                           "all-events": ("EVENT", EventsField),
                                           "all-entities": ("ENTITIES", EntitiesField)},
                                   amr=self.a.amr, keep_events=1, only_keep=True)
        print("train set length", len(train_ee_set))

        print("dev set length", len(dev_ee_set))
        print("dev set 1/1 length", len(dev_ee_set1))

        print("test set length", len(test_ee_set))
        print("test set 1/1 length", len(test_ee_set1))

        # sr model initialization
        if not self.a.sr_hps_path:
            self.a.sr_hps = eval(self.a.sr_hps)
        embeddingMatrix_noun = torch.FloatTensor(np.load(self.a.wnebd)).to(self.device)
        embeddingMatrix_verb = torch.FloatTensor(np.load(self.a.wvebd)).to(self.device)
        embeddingMatrix_role = torch.FloatTensor(np.load(self.a.wrebd)).to(self.device)
        if "wvemb_size" not in self.a.sr_hps:
            self.a.sr_hps["wvemb_size"] = len(vocab_verb.id2word)
        if "wremb_size" not in self.a.sr_hps:
            self.a.sr_hps["wremb_size"] = len(vocab_role.id2word)
        if "wnemb_size" not in self.a.sr_hps:
            self.a.sr_hps["wnemb_size"] = len(vocab_noun.id2word)

        self.a.ee_label_weight = torch.ones([len(LabelField.vocab.itos)]) * 5
        self.a.ee_label_weight[consts.O_LABEL] = 1.0
        self.a.ee_arg_weight = torch.ones([len(EventsField.vocab.itos)]) * 5
        self.a.ee_hps = eval(self.a.ee_hps)
        if "wemb_size" not in self.a.ee_hps:
            self.a.ee_hps["wemb_size"] = len(WordsField.vocab.itos)
        if "pemb_size" not in self.a.ee_hps:
            self.a.ee_hps["pemb_size"] = len(PosTagsField.vocab.itos)
        if "psemb_size" not in self.a.ee_hps:
            # self.a.ee_hps["psemb_size"] = max([train_grounding_set.longest(), dev_grounding_set.longest(), test_grounding_set.longest()]) + 2
            self.a.ee_hps["psemb_size"] = max([train_ee_set.longest(), dev_ee_set.longest(), test_ee_set.longest(), train_grounding_set.longest(), dev_grounding_set.longest(), test_grounding_set.longest()]) + 2
        if "eemb_size" not in self.a.ee_hps:
            self.a.ee_hps["eemb_size"] = len(EntityLabelsField.vocab.itos)
        if "oc" not in self.a.ee_hps:
            self.a.ee_hps["oc"] = len(LabelField.vocab.itos)
        if "ae_oc" not in self.a.ee_hps:
            self.a.ee_hps["ae_oc"] = len(EventsField.vocab.itos)
        if "oc" not in self.a.sr_hps:
            self.a.sr_hps["oc"] = len(LabelField.vocab.itos)
        if "ae_oc" not in self.a.sr_hps:
            self.a.sr_hps["ae_oc"] = len(EventsField.vocab.itos)

        ee_tester = EDTester(LabelField.vocab.itos, EventsField.vocab.itos, self.a.ignore_time_test)
        sr_tester = SRTester()
        g_tester = GroundingTester()
        j_tester = JointTester(self.a.ignore_place_sr_test, self.a.ignore_time_test)

        ace_classifier = ACEClassifier(2 * self.a.ee_hps["lstm_dim"], self.a.ee_hps["oc"], self.a.ee_hps["ae_oc"], self.device)

        if self.a.finetune_ee:
            log('init ee model from ' + self.a.finetune_ee)
            ee_model = load_ee_model(self.a.ee_hps, self.a.finetune_ee, WordsField.vocab.vectors, self.device, ace_classifier)
            log('ee model loaded, there are %i sets of params' % len(ee_model.parameters_requires_grads()))
        else:
            ee_model = load_ee_model(self.a.ee_hps, None, WordsField.vocab.vectors, self.device, ace_classifier)
            log('ee model created from scratch, there are %i sets of params' % len(ee_model.parameters_requires_grads()))

        if self.a.finetune_sr:
            log('init sr model from ' + self.a.finetune_sr)
            sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb, embeddingMatrix_role, self.a.finetune_sr, self.device, ace_classifier, add_object=self.a.add_object, load_partial=True)
            log('sr model loaded, there are %i sets of params' % len(sr_model.parameters_requires_grads()))
        else:
            sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb, embeddingMatrix_role, None, self.device, ace_classifier, add_object=self.a.add_object, load_partial=True)
            log('sr model created from scratch, there are %i sets of params' % len(sr_model.parameters_requires_grads()))

        model = GroundingModel(ee_model, sr_model, self.get_device())
        # ee_model = torch.nn.DataParallel(ee_model)
        # sr_model = torch.nn.DataParallel(sr_model)
        # model = torch.nn.DataParallel(model)

        if self.a.optimizer == "adadelta":
            optimizer_constructor = partial(torch.optim.Adadelta, params=model.parameters_requires_grads(),
                                            weight_decay=self.a.l2decay)
        elif self.a.optimizer == "adam":
            optimizer_constructor = partial(torch.optim.Adam, params=model.parameters_requires_grads(),
                                            weight_decay=self.a.l2decay)
        else:
            optimizer_constructor = partial(torch.optim.SGD, params=model.parameters_requires_grads(),
                                            weight_decay=self.a.l2decay,
                                            momentum=0.9)

        log('optimizer in use: %s' % str(self.a.optimizer))

        if not os.path.exists(self.a.out):
            os.mkdir(self.a.out)
        with open(os.path.join(self.a.out, "word.vec"), "wb") as f:
            pickle.dump(WordsField.vocab, f)
        with open(os.path.join(self.a.out, "pos.vec"), "wb") as f:
            pickle.dump(PosTagsField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "entity.vec"), "wb") as f:
            pickle.dump(EntityLabelsField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "label.vec"), "wb") as f:
            pickle.dump(LabelField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "role.vec"), "wb") as f:
            pickle.dump(EventsField.vocab.stoi, f)
        with open(os.path.join(self.a.out, "ee_hyps.json"), "w") as f:
            json.dump(self.a.ee_hps, f)
        with open(os.path.join(self.a.out, "sr_hyps.json"), "w") as f:
            json.dump(self.a.sr_hps, f)

        log('init complete\n')

        # ee mappings
        self.a.ee_word_i2s = WordsField.vocab.itos
        self.a.ee_label_i2s = LabelField.vocab.itos
        self.a.ee_role_i2s = EventsField.vocab.itos
        # sr mappings
        self.a.sr_word_i2s = vocab_noun.id2word
        self.a.sr_label_i2s = vocab_verb.id2word  # LabelField.vocab.itos
        self.a.sr_role_i2s = vocab_role.id2word
        writer = SummaryWriter(os.path.join(self.a.out, "exp"))
        self.a.writer = writer

        joint_train(
            model_ee=ee_model,
            model_sr=sr_model,
            model_g=model,
            train_set_g=train_grounding_set,
            dev_set_g=dev_grounding_set,
            test_set_g=test_grounding_set,
            train_set_ee=train_ee_set,
            dev_set_ee=dev_ee_set,
            test_set_ee=test_ee_set,
            train_set_sr=train_sr_set,
            dev_set_sr=dev_sr_set,
            test_set_sr=test_sr_set,
            optimizer_constructor=optimizer_constructor,
            epochs=self.a.epochs,
            ee_tester=ee_tester,
            sr_tester=sr_tester,
            g_tester=g_tester,
            j_tester=j_tester,
            parser=self.a,
            other_testsets={
                "dev ee 1/1": dev_ee_set1,
                "test ee 1/1": test_ee_set1,
            },
            transform=transform,
            vocab_objlabel=vocab_noun.word2id
        )
        log('Done!')
Exemplo n.º 20
0
    # Reconstructed label field: the start of this Field call is missing in the
    # source, so sequential=False and use_vocab=False are assumptions; only
    # batch_first and dtype survive from the original.
    label_field = Field(sequential=False,
                        use_vocab=False,
                        batch_first=True,
                        dtype=torch.float)
    text_field = Field(tokenize='spacy',
                       batch_first=True,
                       include_lengths=True,
                       lower=True)
    fields = [('Class', label_field), ('Text', text_field)]

    train, valid = TabularDataset.splits(path="./",
                                         train='processed_train.csv',
                                         validation='processed_val.csv',
                                         format='CSV',
                                         fields=fields,
                                         skip_header=True)

    text_field.build_vocab(train, min_freq=2, vectors='glove.840B.300d')

    # In[10]:
    with open("text_field", "wb") as f:
        dill.dump(text_field, f)

    batch_size = 48
    train_iter = BucketIterator(train,
                                batch_size=batch_size,
                                sort_key=lambda x: len(x.Text),
                                device=device,
                                sort=True,
                                sort_within_batch=True,
                                shuffle=True)
    valid_iter = Iterator(valid,
                          sort=False,
Exemplo n.º 21
0
                   eos_token="<eos>")

    # associate the text in the 'Question' column with the Q_TEXT field,
    # and 'Answer' with A_TEXT field
    data_fields = [('Question', Q_TEXT), ('Answer', A_TEXT)]

    # train, val = TabularDataset.splits(path=PATH, train='train.csv', validation='val.csv', format='csv',
    #                                    fields=data_fields, skip_header=True)
    tab_dataset = TabularDataset(path=f'{args.path}/all.csv',
                                 format='csv',
                                 fields=data_fields,
                                 skip_header=True)
    train, val, test = tab_dataset.split(split_ratio=[0.5, 0.2, 0.3],
                                         random_state=random.getstate())

    Q_TEXT.build_vocab(train)
    A_TEXT.build_vocab(train)
    print('Question Tokenize')
    print(list(Q_TEXT.vocab.stoi.items()))
    print('Answer Tokenize')
    print(list(A_TEXT.vocab.stoi.items()))
    # print(list(A_TEXT.vocab.itos))

    INPUT_DIM = len(Q_TEXT.vocab)
    OUTPUT_DIM = len(A_TEXT.vocab)

    # BATCH_SIZE = 512
    # ENC_EMB_DIM = 256  # 256
    # DEC_EMB_DIM = 256  # 256
    # HID_DIM = 512  # 512
    # N_LAYERS = 2
def init(model_config, device='cpu'):
    logging.critical("[CRITICAL] %s device is selected" % device)
    logging.info(
        '[INFO] Using directory %s for the translation pair with filename %s' %
        (os.path.abspath(model_config['global']['dataset_path']),
         model_config['global']['translate_pair']))
    #initialize the field for src language
    src_field = Field(tokenize=english_tokenizer,
                      init_token='<sos>',
                      eos_token='<eos>',
                      lower=True)
    #initialize the field for trg language
    trg_field = Field(tokenize=hindi_tokenizer,
                      init_token='<sos>',
                      eos_token='<eos>',
                      lower=True)
    train_data, valid_data, test_data = load_datasets(
        model_config['global']['dataset_path'],
        model_config['global']['dataset_file_names'],
        model_config['global']['translate_pair'],
        model_config['global']['lang_extensions'], [src_field, trg_field])
    #initialize the vocabulary
    src_field.build_vocab(train_data, min_freq=1)
    trg_field.build_vocab(train_data, min_freq=1)
    #display dataset stats
    print_dataset_statistics(train_data, valid_data, test_data,
                             model_config['global']['lang_extensions'],
                             [src_field, trg_field])
    model = create_seq2seq_model(model_config, len(src_field.vocab),
                                 len(trg_field.vocab), device)
    optimizer = optim.Adam(model.parameters())
    #defining the loss function
    loss_function = nn.CrossEntropyLoss(
        ignore_index=trg_field.vocab.stoi[trg_field.pad_token])

    logging.info(model.apply(init_weights))
    logging.info('[INFO] Model has %s trainable parameters' %
                 (count_parameters(model)))
    logging.info('[INFO] About to start the primary training loop')
    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size=model_config['global']['batch_size'],
        device=device)
    cache_file_name = "%s-%s-%s-epoch-%s.pt" % (
        model_config['global']['name'],
        model_config['global']['lang_extensions'][0],
        model_config['global']['lang_extensions'][1],
        model_config['global']['epochs'])
    cache_file_path = os.path.join(model_config['global']['cache_path'],
                                   cache_file_name)
    stats = execute_training_loop(
        model,
        train_iterator,
        valid_iterator,
        loss_function,
        optimizer,
        model_config['global']['clip_value'],
        src_field,
        trg_field,
        epochs=model_config['global']['epochs'],
        model_cache_path=os.path.abspath(cache_file_path))

    stats_file_name = "%s-%s-%s-epoch-%s-stats.pickle" % (
        model_config['global']['name'],
        model_config['global']['lang_extensions'][0],
        model_config['global']['lang_extensions'][1],
        model_config['global']['epochs'])
    store_object(
        stats,
        os.path.join(model_config['global']['cache_path'], stats_file_name))

    logging.info("[INFO] loading the model %s" % (cache_file_name))
    model.load_state_dict(torch.load(os.path.abspath(cache_file_path)))
    test_loss, test_bleu = evaluate_model(model, test_iterator, loss_function,
                                          src_field, trg_field)
    logging.info(
        f'[INFO] | Test Loss: {test_loss:.3f} Test Bleu: {test_bleu:.3f} | Test PPL: {math.exp(test_loss):7.3f} |'
    )
Exemplo n.º 23
0
def load_dataset(batch_size):
    spacy_de = spacy.load(
        'de')  # run this in your env or virtual env first: python -m spacy download de
    spacy_en = spacy.load(
        'en')  # run this in your env or virtual env first: python -m spacy download en
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    # Create the dataset according to the Field objects.
    # A Field defines the special tokens and the tokenizer, and can build a vocab.
    # If you don't set init_token and eos_token, those tokens will not appear in the
    # batches you get from train_iter.
    # Because init_token and eos_token are set here, every example becomes
    # init_token + sentence + eos_token when train, val and test are created from
    # TranslationDataset.splits (see the usage sketch after this function).
    DE = Field(tokenize=tokenize_de,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')
    EN = Field(tokenize=tokenize_en,
               include_lengths=True,
               init_token='<sos>',
               eos_token='<eos>')

    #you can find: len(val.examples)=1014; len(test.examples)=1000; len(train.examples)=29000 in Multi30k.splits...
    #train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))

    # I downloaded the data and read it directly:
    # if your file names differ from the defaults, change the parameters train='train', validation='val', test='test'.
    # The exts parameter gives the data file extensions,
    # so each data file is resolved as path + (train/validation/test) + ext.
    train, val, test = TranslationDataset.splits(path='./data/',
                                                 exts=('.de', '.en'),
                                                 fields=(DE, EN))

    # Build the vocabulary.
    # You can look up a word with DE.vocab.itos[0]; entries are ordered by frequency.
    # You can look up the index of a word with DE.vocab.stoi['word'].
    # '<pad>' is added to the vocab automatically even if you never use it; it is
    # usually only needed once the iterators are created. The same holds for the
    # unknown token '<unk>'. If you want init_token='<sos>' and eos_token='<eos>',
    # you must pass them as arguments when creating the Field object.
    DE.build_vocab(
        train.src, min_freq=2
    )  # you can just use DE.build_vocab(train, min_freq=2), but not: DE.build_vocab(train.trg, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000
                   )  # you can just use EN.build_vocab(train, max_size=10000)

    # Create batches and pad every sentence in a batch to the same length.
    # If repeat=True, the loop 'for b, batch in enumerate(train_iter):' never terminates.
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
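
# Usage sketch (an addition, not part of the original example): it assumes the
# tokenized translation files exist under ./data/ as above, and it just illustrates
# the vocab lookups described in the comments and that every batch carries the
# <sos>/<eos> indices because those tokens were set on the Fields.
if __name__ == "__main__":
    train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=32)
    print(DE.vocab.itos[:10])  # most frequent German tokens come first
    print(EN.vocab.stoi['<sos>'], EN.vocab.stoi['<eos>'], EN.vocab.stoi['<pad>'])
    for b, batch in enumerate(train_iter):
        src, len_src = batch.src  # include_lengths=True -> (padded tensor, lengths)
        trg, len_trg = batch.trg
        print(src.shape, trg.shape)  # [max_seq_len, batch_size]; batch_first is not set
        break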


# import re
# import spacy
# import torch
# from torchtext.data import Field, BucketIterator
# from torchtext.datasets import Multi30k, TranslationDataset

# spacy_de = spacy.load('de')#run it on your env or virtrual env:#python -m spacy download de
# spacy_en = spacy.load('en')#run it on your env or virtrual env:#python -m spacy download en
# url = re.compile('(<url>.*</url>)')

# def tokenize_de(text):
#     return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

# def tokenize_en(text):
#     return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

# DE = Field(tokenize=tokenize_de, include_lengths=True,
#             init_token='<sos>', eos_token='<eos>')
# EN = Field(tokenize=tokenize_en, include_lengths=True,
#             init_token='<sos>', eos_token='<eos>')

# #you can find: len(val.examples)=1014; len(test.examples)=1000; len(train.examples)=29000 in Multi30k.splits...
# #train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))

# #I downloaded the data and read it directly:
# #if your file names differ from the defaults, change the parameters train='train', validation='val', test='test'
# #The exts parameter gives the data file extensions,
# #so each data file is resolved as path + (train/validation/test) + ext.
# train, val, test = TranslationDataset.splits(path='./data2/',exts=('.de', '.en'), fields=(DE, EN))

# #build vocabulary
# #You can look up a word with DE.vocab.itos[0]; entries are ordered by frequency.
# #You can look up the index of a word with DE.vocab.stoi['word name']

# DE.build_vocab(train, min_freq=2)
# EN.build_vocab(train, max_size=10000)

# for i in range(5):
#     print(DE.vocab.itos[i])

# train_iter, val_iter, test_iter = BucketIterator.splits(
#             (train, val, test), batch_size=2, repeat=False, sort=True, sort_within_batch=False)
# DE.vocab.stoi
# for i in range(5):
#     print(DE.vocab.itos[i])

# for i in range(len(EN.vocab)):
#     print(EN.vocab.itos[i])

# for e in range(3):
#     for b, batch in enumerate(train_iter):
#             src, len_src = batch.src
#             trg, len_trg = batch.trg
#             tensorToCsv2D(src,name='src',path='/home/yj/Documents/Python/Github/seq2seq/data2/gan.txt')
#             tensorToCsv2D(len_src,name='len_src',path='/home/yj/Documents/Python/Github/seq2seq/data2/gan.txt')
#             tensorToCsv2D(trg,name='trg',path='/home/yj/Documents/Python/Github/seq2seq/data2/gan.txt')
#             tensorToCsv2D(len_trg,name='len_trg',path='/home/yj/Documents/Python/Github/seq2seq/data2/gan.txt')

# import numpy
# def tensorToCsv2D(tensor,name='defualt',path=None,token=','):

#     def get_variable_name(variable):
#         callers_local_vars = inspect.currentframe().f_back.f_locals.items()
#         return [var_name for var_name, var_val in callers_local_vars if var_val is variable]

#     name = ''.join(get_variable_name(tensor))

#     assert(path is not None)

#     z = tensor.numpy().tolist()
#     if len(numpy.shape(z)) == 2:
#         with open(path,'a') as f:
#             f.write(name)
#             f.write('\r')
#             for i in range(numpy.shape(z)[0]):
#                 for j in range(numpy.shape(z)[1]):
#                     f.write(str(z[i][j]))
#                     f.write(token)
#                 f.write('\r')
#     elif len(numpy.shape(z)) == 1:
#         with open(path,'a') as f:
#             f.write(name)
#             f.write('\r')
#             for i in range(numpy.shape(z)[0]):
#                 f.write(str(z[i]))
#                 f.write(token)
#             f.write('\r')

# tensorToCsv2D(src,name='src',path='/home/yj/Documents/Python/Github/seq2seq/data/gan.txt')
# tensorToCsv2D(len_src,name='len_src',path='/home/yj/Documents/Python/Github/seq2seq/data/gan.txt')
# tensorToCsv2D(trg,name='trg',path='/home/yj/Documents/Python/Github/seq2seq/data/gan.txt')
# tensorToCsv2D(len_trg,name='len_trg',path='/home/yj/Documents/Python/Github/seq2seq/data/gan.txt')

# with open('/home/yj/Documents/Python/Github/seq2seq/data/gan.txt','w') as f:
#     f.write(str(src))
#     f.write(str(len_src))
#     f.write(str(trg))
#     f.write(str(len_trg))
# f
# z = src.numpy().tolist()
# z[0][0]
# len(numpy.shape(z))
# numpy.shape(z)[0]
Exemplo n.º 24
0
def tokenize(text):
    # The function header was cut off in the source; the name `tokenize` is taken
    # from the tokenize=tokenize arguments in the Field definitions below.
    return nltk.tokenize.word_tokenize(text)


TEXT = Field(tokenize=tokenize,
             init_token='<sos>',
             eos_token='<eos>',
             include_lengths=True,
             lower=True)

LABEL = Field(tokenize=tokenize, lower=True)

train_data, valid_data, test_data = SNLI.splits(TEXT, LABEL)
# train_data, valid_data, test_data = MultiNLI.splits(TEXT, LABEL)

TEXT.build_vocab(train_data,
                 min_freq=2,
                 specials=[u'<esos>', u'<nsos>', u'<csos>'],
                 vectors='glove.42B.300d')
LABEL.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 32

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.premise),
    device=device)
print "Preparing data completed !"
Exemplo n.º 25
0
def main(args):
    print('start ..!')
    BATCH_SIZE = args.batch_size
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    TEXT = Field(
        sequential=True,  # text: sequential data
        tokenize=str.split,
        batch_first=True,
        fix_length=56,  # padding size: max length of data text
        lower=True)
    LABEL = LabelField(sequential=False, dtype=torch.float)

    w2v = KeyedVectors.load_word2vec_format(
        './model/GoogleNews-vectors-negative300.bin.gz', binary=True)

    data_dir = args.data_dir

    train_paths, val_paths = build_data(data_dir)

    N_EPOCHS = args.epochs
    EMBEDDING_DIM = args.embedding
    N_FILTERS = args.n_filters
    FILTER_SIZES = args.filter_sizes
    OUTPUT_DIM = 1
    DROPOUT = args.dropout
    test_acc_lists = []

    for kfold in range(10):
        # make datasets
        train_path = train_paths[kfold]
        val_path = val_paths[kfold]
        train_data = TabularDataset(path=train_path,
                                    skip_header=True,
                                    format='csv',
                                    fields=[('label', LABEL), ('text', TEXT)])
        test_data = TabularDataset(path=val_path,
                                   skip_header=True,
                                   format='csv',
                                   fields=[('label', LABEL), ('text', TEXT)])

        TEXT.build_vocab(train_data)
        LABEL.build_vocab(train_data)

        # for pretrained embedding vectors
        w2v_vectors = []
        for token, idx in TEXT.vocab.stoi.items():
            # pad token (index 1 in a default torchtext vocab) -> zero vector
            if idx == 1:
                w2v_vectors.append(torch.zeros(EMBEDDING_DIM))
            # if the word is in the word2vec vocab -> use the pretrained vector
            elif token in w2v.wv.vocab.keys():
                w2v_vectors.append(torch.FloatTensor(w2v[token]))
            # OOV -> randomly initialized from a uniform distribution
            else:
                w2v_vectors.append(
                    torch.distributions.Uniform(-0.25, +0.25).sample(
                        (EMBEDDING_DIM, )))

        TEXT.vocab.set_vectors(TEXT.vocab.stoi, w2v_vectors, EMBEDDING_DIM)
        pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors)

        # make iterators
        train_iterator, test_iterator = BucketIterator.splits(
            (train_data, test_data),
            batch_size=BATCH_SIZE,
            device=device,
            sort=False,
            shuffle=True)

        # define a model
        INPUT_DIM = len(TEXT.vocab)

        model = CNN1d(pretrained_embeddings, INPUT_DIM, EMBEDDING_DIM,
                      N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)
        optimizer = optim.Adadelta(model.parameters(), rho=0.95)
        criterion = nn.BCEWithLogitsLoss()

        model = model.to(device)
        criterion = criterion.to(device)

        # train
        best_test_acc = -float('inf')
        model_name = './model/model' + str(kfold) + '.pt'
        print('kfold', kfold)
        for epoch in range(N_EPOCHS):

            start_time = time.time()

            train_loss, train_acc = train(model, train_iterator, optimizer,
                                          criterion)
            test_loss, test_acc = evaluate(model, test_iterator, criterion)

            end_time = time.time()

            epoch_mins, epoch_secs = epoch_time(start_time, end_time)

            if test_acc > best_test_acc:
                best_test_acc = test_acc
                torch.save(model.state_dict(), model_name)

            # print(f'\tEpoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
            # print(f'\t\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
            # print(f'\t\tTest. Loss: {test_loss:.3f} |  Val. Acc: {test_acc * 100:.2f}%')
            logging.info(
                f'\tEpoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s'
            )
            logging.info(
                f'\t\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%'
            )
            logging.info(
                f'\t\tTest. Loss: {test_loss:.3f} |  Val. Acc: {test_acc * 100:.2f}%'
            )

        model.load_state_dict(torch.load(model_name))

        test_loss, test_acc = evaluate(model, test_iterator, criterion)
        test_acc_lists.append(test_acc)
        logging.info(f'============== last test accuracy: {test_acc}')
        # print(f'============== last test accuracy: {test_acc}')
        print()
    return test_acc_lists
Exemplo n.º 26
0
# Reconstructed source field: only the closing batch_first=True) of the original
# definition survives, so the remaining arguments (including the assumed
# tokenizer_en) simply mirror the tgt field below.
src = Field(sequential=True,
            use_vocab=True,
            pad_token=PAD,
            tokenize=tokenizer_en,
            lower=True,
            init_token=BOS,
            eos_token=EOS,
            batch_first=True)

tgt = Field(sequential=True,
            use_vocab=True,
            pad_token=PAD,
            tokenize=tokenizer_de,
            lower=True,
            init_token=BOS,
            eos_token=EOS,
            batch_first=True)

prefix_f = './escape.en-de.tok.5k'

parallel_dataset = TranslationDataset(path=prefix_f, exts=('.en', '.de'), fields=[('src', src), ('tgt', tgt)])

src.build_vocab(parallel_dataset, min_freq=5, max_size=15000)
tgt.build_vocab(parallel_dataset, min_freq=5, max_size=15000)

train, valid = parallel_dataset.split(split_ratio=0.97)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 20

train_iterator, valid_iterator = BucketIterator.splits((train, valid), batch_size=BATCH_SIZE,
                                                    sort_key=lambda x: interleave_keys(len(x.src), len(x.tgt)),
                                                    device=device)



class Encoder(nn.Module):
Exemplo n.º 27
0
#Getting the path to the Data folder.
pwd = os.getcwd()
pwd = pwd.replace('Utils','Data')

TEXT = Field(sequential=True, tokenize=lambda x: x.split(), lower=True) #spacy's performance is really good but it takes some time to execute.
LABEL = Field(sequential=False, use_vocab=False) #set use_vocab = False when the data is already numerical.

datafields = [("id", None),("conversation",TEXT), ("category", LABEL)]

#If skip_header is set to False, then the headers also get processed!
trn = TabularDataset(path=pwd+"/train_custom.csv", format='csv', skip_header=True, fields=datafields)
tst = TabularDataset(path=pwd+'/test_custom.csv', format='csv', skip_header=True, fields=datafields)

#Creating the vocabulary using GloVe embeddings.
TEXT.build_vocab(trn, vectors="glove.42B.300d")

train_iter = BucketIterator(
    dataset=trn,  # the dataset we want the iterator to draw data from
    batch_size=64,
    device=device,
    sort_key=lambda x: len(x.conversation),  # the function the BucketIterator uses to group the data
    sort_within_batch=False,
    repeat=False,  # repeat=False because we want to wrap this Iterator layer
    shuffle=False,  # experiment with this to see if you get improved performance
    train=True  # whether the dataset is a training set or not
)

test_iter = Iterator(tst, batch_size=64, device=device, sort=False, sort_within_batch=False, repeat=False, shuffle=False)
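
# Sanity-check sketch (an addition, not from the original source): pull one batch
# to inspect shapes. TEXT was built without batch_first, so batch.conversation is
# [seq_len, batch_size]; batch.category holds raw numeric labels (use_vocab=False).
for batch in train_iter:
    conversations = batch.conversation
    labels = batch.category
    print(conversations.shape, labels.shape)
    break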

class BatchWrapper:
Exemplo n.º 28
0
                                                    fields = (SRC, TRG))

import torch
from torchtext import data

#data.Dataset.splits()

# In[ ]:


print(vars(train_data.examples[0]))


# In[ ]:

SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)


# In[ ]:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# In[ ]:

BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
Exemplo n.º 29
0
        toknized.extend(["<pad>"]*(5-len(toknized)))
    return toknized

TEXT = Field(tokenize=tagger.morphs, lower=True, include_lengths=False,
             batch_first=True, preprocessing=pad_under_five)
LABEL = Field(sequential=False, use_vocab=True, unk_token=None)

train_data, test_data = TabularDataset.splits(path=DATA_PATH + '/nsmc/',
                                              train='ratings_train.txt',
                                              test='ratings_test.txt',
                                              format='tsv',
                                              skip_header=True,
                                              fields=[('id', None), ('text', TEXT), ('label', LABEL)],
                                              filter_pred=lambda x: len(x.text) > 1)
# only keep examples whose token-level sentence length is greater than 1

TEXT.build_vocab(train_data,min_freq=2)
LABEL.build_vocab(train_data)

# print (TEXT.vocab)
# print (len(TEXT.vocab),len(LABEL.vocab))

# print (TEXT.vocab.itos[:5])
# print (LABEL.vocab.itos)

train_loader, test_loader = BucketIterator.splits((train_data, test_data),
                                                  sort_key=lambda x: len(x.text),
                                                  sort_within_batch=True,
                                                  repeat=False, shuffle=True,
                                                  batch_size=32, device=DEVICE)

# pull a single batch as a quick sanity check of the iterators
for batch in train_loader:
    break
Exemplo n.º 30
0
def getData_old_method(USE_BPE):

    if USE_BPE == False:
        german = Field(tokenize=tokenize_ger,
                       lower=True,
                       init_token="<sos>",
                       eos_token="<eos>",
                       pad_token="<pad>",
                       unk_token="<unk>")

        english = Field(tokenize=tokenize_eng,
                        lower=True,
                        init_token="<sos>",
                        eos_token="<eos>",
                        pad_token="<pad>",
                        unk_token="<unk>")

        # print("===============================before ")
        train_data, valid_data, test_data = Multi30k.splits(
            exts=(".de", ".en"),
            fields=(german, english),
            # root='.data',
            train='train',
            validation='val',
            test='test2016',
            path='../../../data/multi30k')

        # train_data, valid_data, test_data = Multi30k.splits(
        #     exts=(".src", ".tgt"), fields=(german, english),
        #     # root='.data',
        #     train='train',
        #     validation='valid',
        #     test='test',
        #     path = '/data/chaudhryz/uwstudent1/GDATA'
        # )

        german.build_vocab(train_data, max_size=10000, min_freq=2)
        english.build_vocab(train_data, max_size=10000, min_freq=2)

        german.vocab.init_token = "<sos>"
        german.vocab.eos_token = "<eos>"

        english.vocab.init_token = "<sos>"
        english.vocab.eos_token = "<eos>"
        # print("Train")
        # for i in range(10):
        #     #print(train_data[i].src, train_data[i].trg)
        #     printSent(train_data[i].src)
        #     printSent(train_data[i].trg)

        # print("Test")
        # for i in range(10):
        #     #print(train_data[i].src, train_data[i].trg)
        #     printSent(test_data[i].src)
        #     printSent(test_data[i].trg)
        # exit()

        print("train_data ", len(train_data.examples))
        print("valid_data ", len(valid_data.examples))
        print("test_data ", len(test_data.examples))

        return german.vocab, english.vocab, train_data, valid_data, test_data

    else:
        print("Not Implemented")
        exit()
Exemplo n.º 31
0
device

eng=spacy.load('en')
ger=spacy.load('de_core_news_sm')

def Tokenize_eng(text):
  return [a.text for a in eng.tokenizer(text)]
def Tokenize_german(text):
  return [b.text for b in ger.tokenizer(text)]

german=Field(tokenize=Tokenize_german,lower=True,init_token='<sos>',eos_token='<eos>')
english=Field(tokenize=Tokenize_eng,lower=True,init_token='<sos>',eos_token='<eos>')

Train,Val,Test=Multi30k.splits(exts=('.de','.en'),fields=(german,english))

german.build_vocab(Train,max_size=10000,min_freq=2)
english.build_vocab(Train,max_size=10000,min_freq=2)

##building encoder
class Encode(Module):
  def __init__(self,inp_size,emd_size,hidden_size):
    super(Encode,self).__init__()
    self.inp_size=inp_size
    self.emd_size=emd_size
    self.hidden_size=hidden_size
    self.embed=Embedding(self.inp_size,self.emd_size)
    self.lstm=LSTM(self.emd_size,self.hidden_size,num_layers=2,dropout=0.3)
  def forward(self,x):
    x=self.embed(x)
    x,(h,c)=self.lstm(x)
    return h,c
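
# Quick shape check (illustrative only; it assumes torch is imported above, and the
# embedding/hidden sizes are placeholders): the encoder returns the final LSTM
# states with shape [num_layers, batch_size, hidden_size].
encoder = Encode(inp_size=len(german.vocab), emd_size=256, hidden_size=512)
dummy_batch = torch.randint(0, len(german.vocab), (20, 8))  # [seq_len, batch_size]
h, c = encoder(dummy_batch)
print(h.shape, c.shape)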
Exemplo n.º 32
0
# Reconstructed text field: the start of this Field call is missing in the
# source; tokenize and pad_token are assumptions that follow the surviving
# tokenizer-based unk_token argument.
TEXT = Field(use_vocab=True,
             tokenize=tokenizer.tokenize,
             pad_token=tokenizer.pad_token,
             unk_token=tokenizer.unk_token,
             pad_first=False,
             batch_first=True)

LABEL = Field(use_vocab=False, sequential=False)

datafields = [('text', TEXT), ('label', LABEL)]

trn, cv = TabularDataset.splits(path='.',
                                train='train.csv',
                                validation='cv.csv',
                                format='csv',
                                skip_header=True,
                                fields=datafields)

TEXT.build_vocab(trn, cv)
stoi = dict(tokenizer.vocab)
itos = list(stoi.keys())
TEXT.vocab.stoi = stoi
TEXT.vocab.itos = itos

train_iter, val_iter = BucketIterator.splits((trn, cv),
                                             batch_sizes=(64, 64),
                                             device=device,
                                             sort_key=lambda x: len(x.text),
                                             sort_within_batch=False,
                                             repeat=False)

vocab_sz = len(tokenizer.vocab)
print(vocab_sz)
hidden_sz = 50
Exemplo n.º 33
0
)

fields = {"Lithuanian": ("src", lithuanian), "English": ("trg", english)}

# Convert into Tabular Dataset
train_data, valid_data, test_data = TabularDataset.splits(
    path="",
    train="train.json",
    validation="valid.json",
    test="test.json",
    format="json",
    fields=fields,
)

# Create separate Vocab
english.build_vocab(train_data, max_size=10000, min_freq=2)
lithuanian.build_vocab(train_data, max_size=10000, min_freq=2)


# Prebuild transformer class from pytorch
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
Exemplo n.º 34
0
class Data(object):

    WORDS_NAME = "words"
    LAB_NAME = "lab"
    CHAR_NAME = "char"

    def __init__(
        self,
        train_path: str,
        unlabeled_path: str,
        semi_supervised: bool,
        dev_path: str = None,
        test_path: str = None,
        batch_size: int = 32,
        device: object = None,
        logger: typing.Optional[logging.Logger] = None,
    ) -> None:
        if logger is None:
            logger = logging.getLogger(__name__)
            logger.setLevel(logging.INFO)
            handler = logging.StreamHandler()
            handler.setFormatter(
                logging.Formatter('%(levelname)s - %(name)s - %(message)s'))
            logger.addHandler(handler)

        self.train_path = train_path
        self.dev_path = dev_path
        self.test_path = test_path
        self.unlabeled_path = unlabeled_path
        self.batch_size = batch_size
        self.semi_supervised = semi_supervised
        self.device = device
        self.logger = logger

    def initialize(self):
        ## initialize fields and create dataset ##
        self._init_fields()
        self._read_sentences()
        self.train = self._make_bucket_iterator(self._make_dataset(False),
                                                batch_size=self.batch_size,
                                                device=self.device)
        self.dev = self._make_bucket_iterator(self._make_dataset(False,
                                                                 which="dev"),
                                              batch_size=self.batch_size,
                                              device=self.device)
        self.test = self._make_bucket_iterator(self._make_dataset(
            False, which="test"),
                                               batch_size=self.batch_size,
                                               device=self.device)
        # self.unlabeled_train = self._make_bucket_iterator(self._make_dataset(True),
        #                                                   batch_size=self.batch_size, device=self.device)
        self.unlabeled_data = self._make_dataset(True)
        self._build_vocabularies()

    def _read_sentences(self):
        self.train_sentences = []
        with open(self.train_path) as f:
            for line in f:
                self.train_sentences.append(line.replace("\n", ""))
        self.logger.info('{} train sentences successfully read'.format(
            len(self.train_sentences)))

        self.dev_sentences = []
        with open(self.dev_path) as f:
            for line in f:
                self.dev_sentences.append(line.replace("\n", ""))
        self.logger.info('{} dev sentences successfully read'.format(
            len(self.dev_sentences)))

        self.unlabeled_sentences = []
        temp = []
        with open(self.unlabeled_path) as f:
            for line in f:
                sen_len = len(line.split())
                if sen_len > 0 and sen_len <= 20:
                    temp.append(line.replace("\n", ""))
        #self.unlabeled_sentences = random.sample(temp, 101420)
        self.unlabeled_sentences = temp
        self.logger.info('{} unlabeled sentences successfully read'.format(
            len(self.unlabeled_sentences)))

        self.test_sentences = []
        with open(self.test_path) as f:
            for line in f:
                self.test_sentences.append(line.replace("\n", ""))
        self.logger.info('{} test sentences successfully read'.format(
            len(self.test_sentences)))

    def _init_fields(self):
        self.words = Field(batch_first=True,
                           init_token='<s>',
                           eos_token='</s>')
        self.lab = Field(batch_first=True, unk_token=None, pad_token=None)
        # self.char = NestedField(Field(batch_first=True, tokenize=list, unk_token='<cunk>')
        #                         , init_token='<s>', eos_token='</s>')
        self.char = NestedField(Field(batch_first=True,
                                      tokenize=list,
                                      unk_token='<cunk>',
                                      init_token='<w>',
                                      eos_token='</w>'),
                                init_token='<s>',
                                eos_token='</s>')

        self.labeled_fields = [(self.WORDS_NAME, self.words),
                               (self.CHAR_NAME, self.char),
                               (self.LAB_NAME, self.lab)]
        self.unlabeled_fields = [(self.WORDS_NAME, self.words),
                                 (self.CHAR_NAME, self.char)]
        self.logger.info('fields initialized successfully')

    def _make_dataset(self, unlabeled, which=None) -> Dataset:
        if not unlabeled:
            sentences = self.train_sentences
            if which == "dev":
                sentences = self.dev_sentences
            elif which == "test":
                sentences = self.test_sentences
            examples = [self._make_example(s) for s in sentences]
            return Dataset(examples, self.labeled_fields)
        else:
            sentences = self.unlabeled_sentences
            examples = [self._make_example_unlabeled(s) for s in sentences]
            return Dataset(examples, self.unlabeled_fields)

    def _make_example(self, sent) -> Example:
        cols = sent.split("\t")
        words = [word for word in cols[0].split()]
        tags = [tag for tag in cols[1].split()]
        return Example.fromlist([words, words, tags], self.labeled_fields)

    def _make_example_unlabeled(self, sent) -> Example:
        words = [word for word in sent.split()]
        return Example.fromlist([words, words], self.unlabeled_fields)

    def _make_bucket_iterator(self, data, batch_size=32, device=None):
        # return BucketIterator(
        #     dataset=data, batch_size=batch_size,
        #     sort=False, sort_within_batch=True,
        #     sort_key=lambda x: len(x.words),
        #     device=device, repeat=False)
        return GroupedBucketIterator(data,
                                     batch_size,
                                     lambda ex: len(ex.words),
                                     device=device)

    def _build_vocabularies(self):
        self.words.build_vocab(self.train.dataset)
        self.lab.build_vocab(self.train.dataset)
        self.char.build_vocab(self.train.dataset)

        self.num_words = len(self.words.vocab)
        self.num_tags = len(self.lab.vocab)
        self.num_char = len(self.char.vocab)

        self.logger.info(
            'Found %d words, %d chars, and %d tags for both the labeled and unlabeled dataset',
            self.num_words, self.num_char, self.num_tags)

    def _get_unlabeled_sentences(self):
        while True:
            for us in self.unlabeled_sentences:
                yield us

    def _get_unlabeled_examples(self):
        #while True:
        lines = []
        for words in self._get_unlabeled_sentences():
            lines.append(words)
            if len(lines) >= 10142:
                yield [self._make_example_unlabeled(line) for line in lines]
                lines = []

    def _endless_unlabeled(self):
        #while True:
        for ex in self._get_unlabeled_examples():
            unlabeled_iterator = self._make_bucket_iterator(
                Dataset(ex, self.unlabeled_fields),
                batch_size=self.batch_size,
                device=self.device)
            yield unlabeled_iterator
            del unlabeled_iterator
            torch.cuda.empty_cache()

    def _endless_minibatch(self, data):
        while True:
            for i, batch in enumerate(data):
                yield batch

    def get_alternating_minibatch(self):
        # self._create_dataset()
        while True:
            for iter in self._endless_unlabeled():
                for mb in iter:
                    yield next(self._endless_minibatch(self.train)), "labeled"
                    if self.semi_supervised:
                        yield mb, "unlabeled"

    def get_input_sizes(self):
        return self.num_words, self.num_char, self.num_tags

    def get_pad_token_id(self):
        return self.char.vocab.stoi[self.char.pad_token]

    def get_unk_token_id(self):
        return self.char.vocab.stoi[self.char.unk_token]

    def get_train_sentences_length(self):
        return len(self.train_sentences)
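
# Hypothetical usage sketch (the file paths are placeholders, not from the original
# source): build the iterators and draw a few alternating labeled/unlabeled
# minibatches for semi-supervised training.
if __name__ == "__main__":
    data = Data(train_path="train.tsv",
                unlabeled_path="unlabeled.txt",
                semi_supervised=True,
                dev_path="dev.tsv",
                test_path="test.tsv",
                batch_size=32,
                device=torch.device("cpu"))
    data.initialize()
    minibatches = data.get_alternating_minibatch()
    for _ in range(4):
        batch, kind = next(minibatches)
        print(kind, batch.words.shape)  # words is batch_first -> [batch_size, seq_len]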
Exemplo n.º 35
0
def build_dataset_and_vocab(sentences: List[str]):
    """
    Define source and target fields, iterate over the list of sentences to
    create list of Examples, and return:
        - training and validation dataset (split 90-10%)
        - source and target fields with Vocab object
    """
    # Minimum and maximum length for sentences to be included in the dataset
    min_length, max_length = 4, 10

    # Define source and target fields
    bos_word = '<s>'
    eos_word = '</s>'
    pad_word = '<pad>'
    src_field = Field(tokenize=tokenize_en, pad_token=pad_word, lower=True)
    tgt_field = Field(tokenize=tokenize_en,
                      init_token=bos_word,
                      eos_token=eos_word,
                      pad_token=pad_word,
                      lower=True)

    # Create list of Examples from the list of sentences
    examples = []
    sent_count = 0
    for sentence in sentences:
        sentence_split = sentence.split(' ')
        sentence_length = len(sentence_split)

        if sentence_length <= min_length or sentence_length >= max_length:
            continue
        sent_count += 1

        # If the sentence length is at most 8 (min_length + 4)
        if sentence_length <= min_length + 4:
            # Source length is 3
            src_length = min_length - 1
        else:
            # Source length is 5
            src_length = min_length + 1

        for i in range(0, sentence_length - src_length, src_length):
            src = ' '.join(sentence_split[i:i + src_length])
            tgt = ' '.join(sentence_split[i + src_length:])

            example = Example.fromlist(data=[src, tgt],
                                       fields=[('src', src_field),
                                               ('tgt', tgt_field)])
            examples.append(example)

    print(
        f'Total {sent_count} sentences processed into {len(examples)} examples.'
    )
    train_dataset, valid_dataset = Dataset(examples=examples,
                                           fields=[
                                               ('src', src_field),
                                               ('tgt', tgt_field)
                                           ]).split(split_ratio=[0.9, 0.1])

    # Set the minimum frequency needed to include a token in the vocabulary
    min_freq = 2
    src_field.build_vocab(train_dataset, min_freq=min_freq)
    tgt_field.build_vocab(train_dataset, min_freq=min_freq)

    return train_dataset, valid_dataset, src_field, tgt_field
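

# Usage sketch (an addition; the sentence list and batch size are placeholders, and
# BucketIterator is assumed to be imported from torchtext.data): build the datasets,
# then wrap them in iterators for training.
if __name__ == "__main__":
    sample_sentences = ["the quick brown fox jumps over the lazy dog"] * 100
    train_dataset, valid_dataset, src_field, tgt_field = build_dataset_and_vocab(sample_sentences)
    train_iter, valid_iter = BucketIterator.splits(
        (train_dataset, valid_dataset),
        batch_size=32,
        sort_key=lambda x: len(x.src),
        sort_within_batch=True)
    pad_idx = tgt_field.vocab.stoi[tgt_field.pad_token]
    print(len(src_field.vocab), len(tgt_field.vocab), pad_idx)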