Example 1
    def __init__(self, max_len, batch_size, max_epochs, device, pretrained):
        text_field = data.Field(lower=True,
                                batch_first=True,
                                fix_length=max_len,
                                init_token='<go>',
                                eos_token='<eos>',
                                unk_token='<unk>',
                                pad_token='<pad>')
        label_field = data.Field(fix_length=max_len - 1, batch_first=True)

        # make splits for data
        unsup_train, unsup_val, unsup_test = NLIGen.splits(text_field)
        train, val, test = datasets.UDPOS.splits(
            (('text', text_field), ('label', label_field)))

        # build the vocabulary
        text_field.build_vocab(
            unsup_train)  # , vectors="fasttext.simple.300d")
        label_field.build_vocab(train)

        # make iterators for the splits; each BucketIterator.splits() call below is
        # kept only for one of its three returned iterators, so batch size and
        # shuffling can be configured independently per split
        self.train_iter, _, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=batch_size,
            device=device,
            shuffle=True,
            sort=False)
        _, self.unsup_val_iter, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=int(batch_size / 10),
            device=device,
            shuffle=True,
            sort=False)
        self.sup_iter, _, _ = data.BucketIterator.splits((train, val, test),
                                                         batch_size=batch_size,
                                                         device=device,
                                                         shuffle=False,
                                                         sort=False)
        _, self.val_iter, self.test_iter = data.BucketIterator.splits(
            (train, unsup_val, unsup_test),
            batch_size=int(batch_size),
            device=device,
            shuffle=False,
            sort=False)

        self.vocab = text_field.vocab
        self.tags = label_field.vocab
        self.text_field = text_field
        self.label_field = label_field
        self.device = device
        self.batch_size = batch_size
        self.n_epochs = 0
        self.max_epochs = max_epochs
        if pretrained:
            ftxt = FastText()
            self.wvs = ftxt.get_vecs_by_tokens(self.vocab.itos)
        else:
            self.wvs = None
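
The constructor above wires up unsupervised (NLIGen) and supervised (UDPOS) iterators plus a shared vocabulary. A minimal usage sketch follows; the class name `LMData` is an assumption, since the excerpt only shows `__init__`:

    # Hedged sketch: `LMData` is a placeholder name for the class defined above.
    import torch

    data_handler = LMData(
        max_len=32,       # sentences padded/truncated to 32 tokens
        batch_size=64,
        max_epochs=10,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        pretrained=True)  # also loads FastText vectors for the built vocabulary

    for batch in data_handler.train_iter:
        tokens = batch.text   # LongTensor of shape [batch_size, max_len]
        break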
Example 2
    def __init__(self,
                 max_len,
                 batch_size,
                 max_epochs,
                 device,
                 unsup_proportion,
                 sup_proportion,
                 dev_index=1,
                 pretrained=False):
        text_field = data.Field(
            lower=True,
            batch_first=True,
            fix_length=max_len,
            pad_token='<pad>',
            init_token='<go>',
            is_target=True
        )  # init_token='<go>', eos_token='<eos>', unk_token='<unk>', pad_token='<unk>')
        label_field = data.Field(fix_length=max_len - 1,
                                 batch_first=True,
                                 unk_token=None)

        print('Current working directory:', os.getcwd())
        yelp_data = load_dataset('csv',
                                 data_files={
                                     'train':
                                     os.path.join('.data', 'yelp',
                                                  'train.csv'),
                                     'test':
                                     os.path.join('.data', 'yelp', 'test.csv')
                                 },
                                 column_names=['label', 'text'],
                                 version='0.0.2')
        #download_mode=FORCE_REDOWNLOAD)

        start = time()
        train_data, test_data = yelp_data['train'], yelp_data['test']

        def expand_labels(datum):
            datum['label'] = [str(datum['label'])] * (max_len - 1)
            return datum

        lens = [len(sample['text'].split(' ')) for sample in train_data]

        train_data, test_data = train_data.map(expand_labels), test_data.map(
            expand_labels)
        fields1 = {'text': text_field, 'label': label_field}
        fields2 = {
            'text': ('text', text_field),
            'label': ('label', label_field)
        }
        fields3 = {'text': text_field}
        fields4 = {'text': ('text', text_field)}

        len_train = int(len(train_data) / 3)
        dev_start, dev_end = int(len_train/5*(dev_index-1)), \
                             int(len_train/5*(dev_index))
        train_start1, train_start2, train_end1, train_end2 = 0, dev_end, int(dev_start*sup_proportion),\
                                                             int(dev_end+(len_train-dev_end)*sup_proportion)
        unsup_start, unsup_end = len_train, int(len_train + len_train * 2 *
                                                unsup_proportion)
        # Since the datasets are originally sorted with the label as key, we shuffle them
        # before reducing the supervised or the unsupervised data to the first few
        # examples. We use a fixed seed to keep the same data for all experiments.
        np.random.seed(42)
        train_examples = [Example.fromdict(ex, fields2) for ex in train_data]
        unsup_examples = [Example.fromdict(ex, fields4) for ex in train_data]
        np.random.shuffle(train_examples)
        np.random.shuffle(unsup_examples)
        train = Dataset(
            train_examples[train_start1:train_end1] +
            train_examples[train_start2:train_end2], fields1)
        val = Dataset(train_examples[dev_start:dev_end], fields1)
        test = Dataset([Example.fromdict(ex, fields2) for ex in test_data],
                       fields1)
        unsup_train = Dataset(unsup_examples[unsup_start:unsup_end], fields3)

        vocab_dataset = Dataset(train_examples, fields1)
        unsup_test, unsup_val = test, test

        print('data loading took', time() - start)

        # build the vocabulary
        text_field.build_vocab(
            vocab_dataset,
            max_size=VOCAB_LIMIT)  # , vectors="fasttext.simple.300d")
        label_field.build_vocab(train)
        # make iterator for splits
        self.train_iter, _, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=batch_size,
            device=device,
            shuffle=True,
            sort=False)
        _, self.unsup_val_iter, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=int(batch_size),
            device=device,
            shuffle=False,
            sort=False)
        self.sup_iter, _, _ = data.BucketIterator.splits((train, val, test),
                                                         batch_size=batch_size,
                                                         device=device,
                                                         shuffle=True,
                                                         sort=False)
        _, self.val_iter, self.test_iter = data.BucketIterator.splits(
            (train, val, test),
            batch_size=int(batch_size),
            device=device,
            shuffle=False,
            sort=False)

        self.vocab = text_field.vocab
        self.tags = label_field.vocab
        self.text_field = text_field
        self.label_field = label_field
        self.device = device
        self.batch_size = batch_size
        self.n_epochs = 0
        self.max_epochs = max_epochs
        if pretrained:
            ftxt = FastText()
            self.wvs = ftxt.get_vecs_by_tokens(self.vocab.itos)
        else:
            self.wvs = None
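
The slicing above carves a dev fold, a reduced supervised set, and a reduced unsupervised pool out of the shuffled Yelp examples. A small worked sketch of that index arithmetic, with illustrative values only (the real sizes depend on the CSV files):

    # Illustrative numbers: 90,000 usable training examples, dev fold 1,
    # half of the supervised data, a quarter of the unsupervised pool.
    len_train = 90000                                   # int(len(train_data) / 3)
    dev_index, sup_proportion, unsup_proportion = 1, 0.5, 0.25

    dev_start, dev_end = int(len_train / 5 * (dev_index - 1)), int(len_train / 5 * dev_index)
    # -> dev fold: examples [0, 18000)

    train_end1 = int(dev_start * sup_proportion)                        # 0
    train_end2 = int(dev_end + (len_train - dev_end) * sup_proportion)  # 54000
    # -> supervised set: examples [0, 0) + [18000, 54000)

    unsup_start, unsup_end = len_train, int(len_train + len_train * 2 * unsup_proportion)
    # -> unsupervised pool: examples [90000, 135000)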
Example 3
    def __init__(self,
                 max_len,
                 batch_size,
                 max_epochs,
                 device,
                 unsup_proportion,
                 sup_proportion,
                 dev_index=1,
                 pretrained=False):
        text_field = data.Field(
            lower=True,
            batch_first=True,
            fix_length=max_len,
            pad_token='<pad>',
            init_token='<go>',
            is_target=True
        )  #init_token='<go>', eos_token='<eos>', unk_token='<unk>', pad_token='<unk>')
        label_field = data.Field(fix_length=max_len - 1, batch_first=True)

        # make splits for data
        #unsup_train, unsup_val, unsup_test = MyPennTreebank.splits(text_field)
        #unsup_train, unsup_val, unsup_test = datasets.PennTreebank.splits(text_field)
        #unsup_train, unsup_val, unsup_test = datasets.WikiText2.splits(text_field)
        unsup_train, unsup_val, unsup_test = datasets.UDPOS.splits(
            (('text', text_field), ('label', label_field)))
        #unsup_train, unsup_val, unsup_test = YahooLM.splits(text_field)
        train, val, test = datasets.UDPOS.splits(
            (('text', text_field), ('label', label_field)))

        # build the vocabulary
        text_field.build_vocab(
            unsup_train,
            max_size=VOCAB_LIMIT)  # , vectors="fasttext.simple.300d")
        label_field.build_vocab(train)
        # self.train_iter, _,  _ = data.BPTTIterator.splits((unsup_train, unsup_val, unsup_test),
        #                                                                     batch_size=batch_size, bptt_len=max_len,
        #                                                                     device=device, repeat=False, shuffle=False,
        #                                                                     sort=False)
        # _, self.unsup_val_iter,  _ = data.BPTTIterator.splits((unsup_train, unsup_val, unsup_test),
        #                                                                     batch_size=int(batch_size/10), bptt_len=max_len,
        #                                                                     device=device, repeat=False, shuffle=False,
        #                                                                     sort=False)
        # Remaking splits according to supervision proportions
        exlist = [ex for ex in train + val]
        train = Dataset(exlist, {'text': text_field, 'label': label_field})
        dev_start, dev_end = int(len(train) / 5 * (dev_index - 1)), \
                             int(len(train) / 5 * (dev_index))
        train_start1, train_start2, train_end1, train_end2 = 0, dev_end, int(dev_start * sup_proportion), \
                                                             int(dev_end + (len(train) - dev_end) * sup_proportion)
        unsup_start, unsup_end = 0, int(len(unsup_train) * unsup_proportion)
        val = Dataset(train[dev_start:dev_end], {
            'text': text_field,
            'label': label_field
        })
        train = Dataset(
            train[train_start1:train_end1] + train[train_start2:train_end2], {
                'text': text_field,
                'label': label_field
            })
        unsup_train = Dataset(unsup_train[unsup_start:unsup_end],
                              {'text': text_field})

        # make iterator for splits

        self.train_iter, _, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=batch_size,
            device=device,
            shuffle=True,
            sort=False)
        _, self.unsup_val_iter, _ = data.BucketIterator.splits(
            (unsup_train, unsup_val, unsup_test),
            batch_size=int(batch_size / 10),
            device=device,
            shuffle=False,
            sort=False)
        self.sup_iter, _, _ = data.BucketIterator.splits((train, val, test),
                                                         batch_size=batch_size,
                                                         device=device,
                                                         shuffle=False,
                                                         sort=False)
        _, self.val_iter, self.test_iter = data.BucketIterator.splits(
            (train, val, test),
            batch_size=int(batch_size),
            device=device,
            shuffle=False,
            sort=False)

        self.vocab = text_field.vocab
        self.tags = label_field.vocab
        self.text_field = text_field
        self.label_field = label_field
        self.device = device
        self.batch_size = batch_size
        self.n_epochs = 0
        self.max_epochs = max_epochs
        if pretrained:
            ftxt = FastText()
            self.wvs = ftxt.get_vecs_by_tokens(self.vocab.itos)
        else:
            self.wvs = None
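
Because both fields here are built on UDPOS, each supervised batch carries aligned token and tag tensors. A minimal consumption sketch (the tagger and loss are hypothetical placeholders, not part of the excerpt):

    # Hedged sketch: `tagger` and `criterion` are hypothetical; only the batch
    # attributes (`text`, `label`) come from the fields defined above.
    for batch in self.sup_iter:
        tokens = batch.text    # [batch_size, max_len]
        tags = batch.label     # [batch_size, max_len - 1]
        # logits = tagger(tokens)
        # loss = criterion(logits.view(-1, len(self.tags.itos)), tags.view(-1))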
Example 4
    # Reconstructed header (the excerpt starts mid-call); it mirrors the
    # val/test iterators built just below.
    train_iter = BucketIterator(train_data,
                                BATCH_SIZE,
                                shuffle=True,
                                sort_key=lambda x: len(x.text),
                                sort_within_batch=True)
    val_iter = BucketIterator(val_data,
                              BATCH_SIZE,
                              sort_key=lambda x: len(x.text),
                              sort_within_batch=True)
    test_iter = BucketIterator(test_data,
                               BATCH_SIZE,
                               sort_key=lambda x: len(x.text),
                               sort_within_batch=True)

    if not args.evaluate_only:

        ff = FastText("en")
        embeddings = ff.get_vecs_by_tokens(SRC.vocab.itos)

        model = TransformerSummarizer(ATTENTION_HEADS, N_LAYERS, N_LAYERS, DIM_FEEDFORWARD, \
                                        SEQ_LEN, VOCAB_SIZE, PAD_IDX, src_list, embeddings=embeddings).to(device)

        num_batches = math.ceil(len(train_data) / BATCH_SIZE)
        val_batches = math.ceil(len(val_data) / BATCH_SIZE)

        parameters = filter(lambda p: p.requires_grad, model.parameters())
        optimizer = optim.Adam(parameters)
        criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

        print("Training Started")

        for epoch in range(N_EPOCHS):
            start_time = time.time()