Example No. 1
# Assumed imports for this snippet (not shown in the source):
import logging

from torchnlp.datasets import smt_dataset

logger = logging.getLogger(__name__)
def get_data(config):
    train, dev, test = smt_dataset(
        directory="../data/",
        train=True,
        dev=True,
        test=True,
        fine_grained=True,
    )

    def filter_neutrals(data, labels):
        logger.info("Filtering neutral labels for binary task")

        new_data, new_labels = [], []

        for d, l in zip(data, labels):
            # l positive or very positive

            if "positive" in l:
                new_data.append(d)
                new_labels.append("positive")
            # l negative or very negative
            elif "negative" in l:
                new_data.append(d)
                new_labels.append("negative")
            else:
                continue

        return new_data, new_labels

    raw_train = [d["text"] for d in train]
    labels_train = [d["label"] for d in train]

    raw_dev = [d["text"] for d in dev]
    labels_dev = [d["label"] for d in dev]

    raw_test = [d["text"] for d in dev]
    labels_test = [d["label"] for d in dev]

    num_labels = 5

    if config.binary:
        raw_train, labels_train = filter_neutrals(raw_train, labels_train)
        raw_dev, labels_dev = filter_neutrals(raw_dev, labels_dev)
        raw_test, labels_test = filter_neutrals(raw_test, labels_test)
        num_labels = 2

    return (
        raw_train,
        labels_train,
        raw_dev,
        labels_dev,
        raw_test,
        labels_test,
        num_labels,
    )
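For reference, a minimal sketch of calling get_data; the argparse Namespace
config with a boolean binary attribute is an assumption, since the snippet
only shows that config.binary is read:

from argparse import Namespace

config = Namespace(binary=True)  # hypothetical config object

(raw_train, labels_train,
 raw_dev, labels_dev,
 raw_test, labels_test,
 num_labels) = get_data(config)

assert num_labels == 2  # neutral rows are filtered out for the binary task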
Example No. 2
# Assumed imports for this test (not shown in the source); `directory` and
# `urlretrieve_side_effect` are presumably defined at module level in the
# original test suite.
import os
import shutil

from torchnlp.datasets import smt_dataset
def test_smt_dataset_row(mock_urlretrieve):
    mock_urlretrieve.side_effect = urlretrieve_side_effect

    # Check that rows are parsed correctly
    train, dev, test = smt_dataset(directory=directory,
                                   test=True,
                                   dev=True,
                                   train=True)
    assert len(train) > 0
    assert len(dev) > 0
    assert len(test) > 0
    assert train[5] == {
        'text':
        "Whether or not you 're enlightened by any of Derrida 's lectures on `` the other '' "
        +
        "and `` the self , '' Derrida is an undeniably fascinating and playful fellow .",
        'label':
        'positive'
    }
    train = smt_dataset(directory=directory, train=True, subtrees=True)
    assert train[3] == {'text': 'Rock', 'label': 'neutral'}

    train = smt_dataset(directory=directory,
                        train=True,
                        subtrees=True,
                        fine_grained=True)
    assert train[4] == {
        'text':
        "is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a"
        +
        " splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven"
        + " Segal .",
        'label':
        'very positive'
    }

    # Clean up
    shutil.rmtree(os.path.join(directory, 'trees'))
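For context, a side effect like urlretrieve_side_effect typically copies a
locally stored fixture into place instead of downloading; a hypothetical
sketch (the fixture path and helper body are assumptions, not from the
source):

def urlretrieve_side_effect(url, filename=None, **kwargs):
    # Copy a pre-downloaded fixture instead of hitting the network.
    fixture = os.path.join('tests/_test_data', os.path.basename(filename))
    shutil.copy(fixture, filename)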
Example No. 3
    def __init__(self,
                 mode='train',
                 subtrees=False,
                 embedder=None,
                 tokenizer=None,
                 granularity=2,
                 threshold=3):

        # Five-way labels when granularity == 5, otherwise binary.
        fine_grained = granularity == 5

        if tokenizer:
            self.tokenizer = Tokenizer(tokenizer)

        self.subtrees = subtrees

        if mode == 'train':
            self.data = list(
                smt_dataset('sst/',
                            train=True,
                            fine_grained=fine_grained,
                            subtrees=self.subtrees))

        if mode == 'val':
            self.data = list(
                smt_dataset('sst/',
                            train=False,
                            dev=True,
                            fine_grained=fine_grained,
                            subtrees=self.subtrees))

        if mode == 'test':
            self.data = list(
                smt_dataset('sst/',
                            train=False,
                            test=True,
                            fine_grained=fine_grained,
                            subtrees=self.subtrees))

        if fine_grained:
            label_to_id = {
                'very negative': 0,
                'negative': 1,
                'neutral': 2,
                'positive': 3,
                'very positive': 4,
            }
        else:
            # Binary task: collapse the extremes; neutral (2) is dropped below.
            label_to_id = {
                'very negative': 0,
                'negative': 0,
                'neutral': 2,
                'positive': 1,
                'very positive': 1,
            }

        if not self.subtrees:
            for i in self.data:
                i['label'] = label_to_id[i['label']]

                if tokenizer:
                    i['text'] = self.tokenizer.tokenize(i['text'])

        else:
            # Keep only subtrees with at least `threshold` tokens.
            data_list = []
            for i in self.data:
                if len(i['text'].split()) >= threshold:
                    label = label_to_id[i['label']]

                    if tokenizer:
                        text = self.tokenizer.tokenize(i['text'])
                    else:
                        text = i['text']

                    data_list.append({'text': text, 'label': label})

            self.data = data_list

        # For the binary task, drop neutral examples (mapped to 2 above).
        if not fine_grained:
            self.data = [d for d in self.data if d['label'] != 2]
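A hypothetical usage sketch, assuming the __init__ above belongs to a
torch.utils.data.Dataset subclass named SSTDataset (the class name is not
shown in the source):

train_set = SSTDataset(mode='train', granularity=2, subtrees=False)
print(len(train_set.data))  # neutral rows removed for the binary task
print(train_set.data[0])    # {'text': ..., 'label': 0 or 1}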
Example No. 4
from slp.plbind.module import RnnPLModule
from slp.util.log import configure_logging
from slp.data.collators import SequenceClassificationCollator
from slp.modules.classifier import Classifier
from slp.modules.rnn import WordRNN
from slp.plbind.trainer import make_trainer, watch_model
from slp.plbind.helpers import FromLogits
# Assumed imports for names used below (exact paths not shown in the source):
from slp.plbind.dm import PLDataModuleFromCorpus
from torchnlp.datasets import smt_dataset

collate_fn = SequenceClassificationCollator(device="cpu")

if __name__ == "__main__":
    EXPERIMENT_NAME = "smt-words-sentiment-classification"

    configure_logging(f"logs/{EXPERIMENT_NAME}")

    train, dev = smt_dataset(directory="../data/", train=True, dev=True)

    raw_train = [d["text"] for d in train]
    labels_train = [d["label"] for d in train]

    raw_dev = [d["text"] for d in dev]
    labels_dev = [d["label"] for d in dev]

    ldm = PLDataModuleFromCorpus(
        raw_train,
        labels_train,
        val=raw_dev,
        val_labels=labels_dev,
        batch_size=8,
        batch_size_eval=32,
        collate_fn=collate_fn,
    )  # remaining keyword arguments are cut off in the source
Example No. 5
    to_token_ids = ToTokenIds(word2idx)
    to_tensor = ToTensor(device='cpu')

    def create_dataloader(d):
        d = (DatasetWrapper(d).map(tokenizer).map(to_token_ids).map(to_tensor))
        return DataLoader(
            d,
            batch_size=8,
            num_workers=1,
            pin_memory=True,
            shuffle=True,
            collate_fn=collate_fn)

    train_loader, dev_loader = map(
        create_dataloader,
        smt_dataset(directory='../data/', train=True, dev=True))

    model = Classifier(
        WordRNN(256, embeddings, bidirectional=True, merge_bi='cat',
                packed_sequence=True, attention=False, device=DEVICE),
        512, 3)
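    # Note: hidden size 256 with merge_bi='cat' yields 512-dim encodings,
    # matching the 512 passed to Classifier; 3 is the number of classes.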

    optimizer = Adam([p for p in model.parameters() if p.requires_grad],
                     lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    metrics = {
        'accuracy': Accuracy(),
        'loss': Loss(criterion)
    }
    trainer = SequentialTrainer(model, optimizer,
                                checkpoint_dir='../checkpoints',
                                # remaining arguments are cut off in the
                                # source; metrics/loss_fn are assumptions
                                metrics=metrics,
                                loss_fn=criterion)


# The fragment below is a separate helper that maps string labels to a
# numeric tensor. Its opening lines are missing from the source; the imports,
# header, and first two index lists are reconstructed by symmetry with the
# lines that survive, assuming torch and PyTorch-NLP's smt_dataset.
import numpy as np
import torch

from torchnlp.datasets import smt_dataset


def create_SMT_labels(dataset, size):
    labels = dataset['label']  # torchnlp Dataset supports column access
    labels_tensor = torch.zeros(size, dtype=torch.long)

    very_pos_indices = [
        i for i, x in enumerate(labels) if x == "very positive"
    ]
    pos_indices = [i for i, x in enumerate(labels) if x == "positive"]
    neut_indices = [i for i, x in enumerate(labels) if x == "neutral"]
    neg_indices = [i for i, x in enumerate(labels) if x == "negative"]
    very_neg_indices = [
        i for i, x in enumerate(labels) if x == "very negative"
    ]

    labels_tensor[very_pos_indices] = 0
    labels_tensor[pos_indices] = 1
    labels_tensor[neut_indices] = 2
    labels_tensor[neg_indices] = 3
    labels_tensor[very_neg_indices] = 4

    return labels_tensor


train = smt_dataset(train=True, fine_grained=True)
valid = smt_dataset(dev=True, fine_grained=True)
test = smt_dataset(test=True, fine_grained=True)

train_labels = create_SMT_labels(train, len(train))
train_text = np.array(train['text'])
valid_labels = create_SMT_labels(valid, len(valid))
valid_text = np.array(valid['text'])
test_labels = create_SMT_labels(test, len(test))
test_text = np.array(test['text'])

np.save('sst_train_text', train_text)
np.save('sst_train_labels', train_labels)
np.save('sst_valid_text', valid_text)
np.save('sst_valid_labels', valid_labels)
np.save('sst_test_text', test_text)
np.save('sst_test_labels', test_labels)
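Since np.save appends the .npy extension, the saved arrays can be loaded
back with np.load:

train_text = np.load('sst_train_text.npy')
train_labels = np.load('sst_train_labels.npy')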