예제 #1
0
def predictions(task, model, config, data, label_transformer=None,
                batch_size=128, preprocessor=None, name=None):
    """
    Run a model over unlabeled data and collect what ``predict`` returns.

    Args:
        task (): task identifier, forwarded to ``get_pipeline`` and
            ``predict``. Available tasks:
                - "clf": multiclass classification
                - "bclf": binary classification
                - "mclf": multilabel classification
                - "reg": regression
        model (): the model to run; it is moved to ``DEVICE`` first
        config (): model configuration. ``config["op_mode"]`` selects the
            input granularity ("word" or "char"); in word mode the whole
            config is also handed to ``load_embeddings``
        data (): the input samples (must support ``len``)
        label_transformer (): forwarded unchanged to the dataset class
        batch_size (): batch size given to the ``DataLoader``
        preprocessor (): word mode only; falls back to
            ``twitter_preprocess()`` when omitted
        name (): forwarded to the dataset class as ``name``

    Returns:
        tuple: ``(pred, posteriors, attentions, data)``. The first three
        come from ``predict``; ``data`` is ``loader.dataset.data``.

    Raises:
        ValueError: if ``config["op_mode"]`` is neither "word" nor "char".
    """
    mode = config["op_mode"]

    # the vocabulary is only needed by the word-level dataset
    vocab = None
    if mode == "word":
        vocab, _, _ = load_embeddings(config)

    # the Dataset classes expect labels, so feed them placeholder targets
    placeholder_y = [0] * len(data)

    if mode == "word":
        if preprocessor is None:
            preprocessor = twitter_preprocess()
        samples = WordDataset(data, placeholder_y, vocab,
                              name=name,
                              preprocess=preprocessor,
                              label_transformer=label_transformer)
    elif mode == "char":
        print("Building char-level datasets...")
        samples = CharDataset(data, placeholder_y, name=name,
                              label_transformer=label_transformer)
    else:
        raise ValueError("Invalid op_mode")

    loader = DataLoader(samples, batch_size)

    model.to(DEVICE)

    pipeline = get_pipeline(task=task, eval=True)
    outputs = predict(model, pipeline, loader, task, "eval")

    # the average loss and the placeholder targets carry no information here
    _, (_, pred), posteriors, attentions = outputs

    return pred, posteriors, attentions, loader.dataset.data
예제 #2
0
def train_ei_reg(emotion, model, evaluation, preprocessor=None):
    """
    1. Task EI-reg: Detecting Emotion Intensity (regression)

    Given:

        - a tweet
        - an emotion E (anger, fear, joy, or sadness)

    Task: determine the intensity of E that best represents the mental state
    of the tweeter - a real-valued score between 0 and 1:

        - a score of 1: highest amount of E can be inferred
        - a score of 0: lowest amount of E can be inferred

    For each language: 4 training sets and 4 test sets: one for each emotion E.

    (Note that the absolute scores have no inherent meaning --
    they are used only as a means to convey that the instances
    with higher scores correspond to a greater degree of E
    than instances with lower scores.)

    The function parses the "train", "dev" and "gold" splits, preprocesses
    them, fits ``model`` on the training split and evaluates it on the other
    two.

    :param emotion: emotions = ["anger", "fear", "joy", "sadness"]
    :param model: object exposing ``fit(X, y)`` and ``predict(X)``
    :param evaluation: callable invoked as ``evaluation(predicted, gold)``
    :param preprocessor: callable invoked as ``preprocessor(name, X)``;
        defaults to ``twitter_preprocess()``
    :return: ``(res_dev, res_test)`` - the evaluation of the "dev" split and
        of the "gold" split
    """
    if preprocessor is None:
        preprocessor = twitter_preprocess()

    task_config = TASK1_EIREG

    texts, targets = {}, {}
    for split in ("train", "dev", "gold"):
        split_x, split_y = parse(task='EI-reg', emotion=emotion,
                                 dataset=split)
        texts[split], targets[split] = split_x, split_y

    # keep only scores (the second field of every parsed label)
    targets = {split: [entry[1] for entry in labels]
               for split, labels in targets.items()}

    prefix = task_config["name"] + "_" + emotion

    # the "gold" split is preprocessed under the "test" suffix
    for split, suffix in (("train", "train"), ("dev", "dev"),
                          ("gold", "test")):
        texts[split] = preprocessor("{}_{}".format(prefix, suffix),
                                    texts[split])

    model.fit(texts["train"], targets["train"])

    dev_result = evaluation(model.predict(texts["dev"]), targets["dev"])
    gold_result = evaluation(model.predict(texts["gold"]), targets["gold"])

    return dev_result, gold_result
예제 #3
0
def load_datasets(datasets,
                  train_batch_size,
                  eval_batch_size,
                  token_type,
                  preprocessor=None,
                  params=None,
                  word2idx=None,
                  label_transformer=None):
    """
    Wrap every split of ``datasets`` in a ``DataLoader``.

    Args:
        datasets (dict): maps a split name to an indexable whose item 0 is
            passed to the dataset class as data and item 1 as labels
        train_batch_size (int): batch size of the "train" split
        eval_batch_size (int): batch size of every other split
        token_type (str): "word" or "char"
        preprocessor (): word mode only; defaults to
            ``twitter_preprocess()``
        params (): a string, or a list of strings joined with "_", used as
            the prefix of each dataset's ``name``
        word2idx (): word mode only; required there
        label_transformer (): forwarded unchanged to the dataset class

    Returns:
        dict: split name -> ``DataLoader``

    Raises:
        ValueError: if ``token_type`` is unknown, or if it is "word" and
            ``word2idx`` is None.
    """
    if params is None:
        prefix = None
    elif isinstance(params, list):
        prefix = "_".join(params)
    else:
        prefix = params

    loaders = {}

    if token_type == "word":
        if word2idx is None:
            raise ValueError

        if preprocessor is None:
            preprocessor = twitter_preprocess()

        print("Building word-level datasets...")

        def make_dataset(split, pair):
            return WordDataset(pair[0],
                               pair[1],
                               word2idx,
                               name="{}_{}".format(prefix, split),
                               preprocess=preprocessor,
                               label_transformer=label_transformer)

    elif token_type == "char":
        print("Building char-level datasets...")

        def make_dataset(split, pair):
            return CharDataset(pair[0],
                               pair[1],
                               name="{}_{}".format(prefix, split),
                               label_transformer=label_transformer)

    else:
        raise ValueError("Invalid token_type.")

    for split, pair in datasets.items():
        dataset = make_dataset(split, pair)

        batch_size = eval_batch_size
        if split == "train":
            batch_size = train_batch_size

        # NOTE(review): every split - not only "train" - is shuffled and
        # drops its last partial batch; confirm this is intended for the
        # evaluation splits.
        loaders[split] = DataLoader(dataset,
                                    batch_size,
                                    shuffle=True,
                                    drop_last=True)

    return loaders
예제 #4
0
def train_ei_oc(emotion, model, evaluation, preprocessor=None):
    """
    2. Task EI-oc: Detecting Emotion Intensity (ordinal classification)

    Given:

        - a tweet
        - an emotion E (anger, fear, joy, or sadness)

    Task: classify the tweet into one of four ordinal classes of intensity of E
    that best represents the mental state of the tweeter:

        0: no E can be inferred
        1: low amount of E can be inferred
        2: moderate amount of E can be inferred
        3: high amount of E can be inferred

    For each language: 4 training sets and 4 test sets: one for each emotion E.

    The function parses the "train", "dev" and "gold" splits, preprocesses
    them, fits ``model`` on the training split and evaluates it on the other
    two.

    :param emotion: emotions = ["anger", "fear", "joy", "sadness"]
    :param model: object exposing ``fit(X, y)`` and ``predict(X)``
    :param evaluation: callable invoked as ``evaluation(predicted, gold)``
    :param preprocessor: callable invoked as ``preprocessor(name, X)``;
        defaults to ``twitter_preprocess()``
    :return: ``(res_dev, res_test)`` - the evaluation of the "dev" split and
        of the "gold" split
    """
    if preprocessor is None:
        preprocessor = twitter_preprocess()

    task_config = TASK1_EIOC

    def load_split(split):
        return parse(task='EI-oc', emotion=emotion, dataset=split)

    train_x, train_y = load_split("train")
    dev_x, dev_y = load_split("dev")
    gold_x, gold_y = load_split("gold")

    # keep only scores (the second field of every parsed label)
    train_y, dev_y, gold_y = (
        [label[1] for label in labels]
        for labels in (train_y, dev_y, gold_y))

    run_name = task_config["name"] + "_" + emotion

    def clean(suffix, texts):
        return preprocessor("{}_{}".format(run_name, suffix), texts)

    train_x = clean("train", train_x)
    dev_x = clean("dev", dev_x)
    # the "gold" split is preprocessed under the "test" suffix
    gold_x = clean("test", gold_x)

    model.fit(train_x, train_y)

    dev_scores = evaluation(model.predict(dev_x), dev_y)
    gold_scores = evaluation(model.predict(gold_x), gold_y)

    return dev_scores, gold_scores
예제 #5
0
import os

from config import BASE_PATH
from modules.sklearn.models import eval_clf, nbow_model, bow_model
from utils.dataloaders import load_wassa
from utils.load_embeddings import load_word_vectors
from utils.nlp import twitter_preprocess

#############################################################
# Load Data
#############################################################
# One preprocessor instance is shared by both splits.
preprocessor = twitter_preprocess()

X_train, X_test, y_train, y_test = load_wassa()

# Debugging aid: uncomment to run on the first 1000 samples of each split.
# X_train = X_train[:1000]
# y_train = y_train[:1000]
# X_test = X_test[:1000]
# y_test = y_test[:1000]

# The first argument names the split being preprocessed
# (presumably a cache key - TODO confirm against twitter_preprocess).
X_train = preprocessor("wassa_train", X_train)
X_test = preprocessor("wassa_test", X_test)

#############################################################
# Bag-of-Words
#############################################################
# Fit a bag-of-words classifier on the training split, predict the test
# split and evaluate those predictions against the test labels.
# NOTE(review): max_features presumably caps the vocabulary size - confirm
# against bow_model.
bow_clf = bow_model("clf", max_features=30000)
bow_clf.fit(X_train, y_train)
y_pred = bow_clf.predict(X_test)
bow_results = eval_clf(y_pred, y_test)
예제 #6
0
def load_datasets(datasets, train_batch_size, eval_batch_size, token_type,
                  preprocessor=None,
                  params=None, word2idx=None, label_transformer=None):
    """
    Wrap every split of ``datasets`` in a ``DataLoader`` and carve a
    "valid" loader out of the training split.

    Only the "train" loader is shuffled and drops its last partial batch;
    every other loader (including "valid") keeps its order and all of its
    instances. This holds for both token types.

    Args:
        datasets (dict): maps a split name to an indexable whose item 0 is
            passed to the dataset class as data and item 1 as labels
        train_batch_size (int): batch size of the "train" split
        eval_batch_size (int): batch size of every other split
        token_type (str): "word" or "char"
        preprocessor (): word mode only; defaults to
            ``twitter_preprocess()``
        params (): a string, or a list of strings joined with "_", used as
            the prefix of each dataset's ``name``
        word2idx (): word mode only; required there
        label_transformer (): forwarded unchanged to the dataset class

    Returns:
        dict: split name -> ``DataLoader``. When ``datasets`` has a "train"
        split, a "valid" loader sampled from it is added as well. If
        ``datasets`` also contains its own "valid" split, whichever of the
        two is processed last is the one kept.

    Raises:
        ValueError: if ``token_type`` is unknown, or if it is "word" and
            ``word2idx`` is None.
    """

    def sample_validation(dataset, n=100):
        """Sample n instances for validation from training."""
        n_ = len(dataset)
        dataset.data = numpy.array(dataset.data)
        dataset.labels = numpy.array(dataset.labels)
        # copy first, then give the two datasets disjoint index ranges of
        # one shared random permutation
        dataset_valid = deepcopy(dataset)
        indices = numpy.random.permutation(len(dataset))
        dataset_valid.data = dataset_valid.data[indices[:n]]
        dataset_valid.labels = dataset_valid.labels[indices[:n]]
        dataset.data = dataset.data[indices[n:]]
        dataset.labels = dataset.labels[indices[n:]]
        assert len(dataset_valid.data) == len(dataset_valid.labels) == n
        assert len(dataset.data) == len(dataset.labels) == n_ - n
        return dataset, dataset_valid

    def add_loaders(split, dataset):
        """Register the loader(s) of one split; shared by both token types."""
        is_train = split == "train"
        if is_train:  # Sample validation instances
            dataset, dataset_valid = sample_validation(dataset)
            loaders['valid'] = DataLoader(dataset_valid, eval_batch_size,
                                          shuffle=False, drop_last=False)
        batch_size = train_batch_size if is_train else eval_batch_size
        # evaluation splits must stay complete and in order
        loaders[split] = DataLoader(dataset, batch_size,
                                    shuffle=is_train,
                                    drop_last=is_train)

    if params is not None:
        name = "_".join(params) if isinstance(params, list) else params
    else:
        name = None

    loaders = {}
    if token_type == "word":
        if word2idx is None:
            raise ValueError("word2idx is required when token_type is 'word'.")

        if preprocessor is None:
            preprocessor = twitter_preprocess()

        print("Building word-level datasets...")
        for k, v in datasets.items():
            _name = "{}_{}".format(name, k)
            dataset = WordDataset(v[0], v[1], word2idx, name=_name,
                                  preprocess=preprocessor,
                                  label_transformer=label_transformer)
            add_loaders(k, dataset)

    elif token_type == "char":
        print("Building char-level datasets...")
        for k, v in datasets.items():
            _name = "{}_{}".format(name, k)
            dataset = CharDataset(v[0], v[1], name=_name,
                                  label_transformer=label_transformer)
            add_loaders(k, dataset)

    else:
        raise ValueError("Invalid token_type.")

    return loaders
예제 #7
0
    ("word2vec_500_6_concatened.txt", 510),
]
# Load every embedding file listed in emb_files (defined above) from
# <BASE_PATH>/embeddings, keyed by the file name up to its first ".".
# Each entry of emb_files is a (file name, d) pair; d is handed to
# load_word_vectors (presumably the vector dimensionality - TODO confirm).
embeddings = {}
for e, d in emb_files:
    file = os.path.join(BASE_PATH, "embeddings", e)
    word2idx, idx2word, weights = load_word_vectors(file, d)
    embeddings[e.split(".")[0]] = (weights, word2idx)

# Baseline models: one bag-of-words model per task type, plus one
# "nbow_<embedding name>" model per loaded embedding for each task type.
bow_clf = bow_model("clf")
bow_reg = bow_model("reg")
nbow_clf = {"nbow_{}".format(name): nbow_model("clf", e, w2i)
            for name, (e, w2i) in embeddings.items()}
nbow_reg = {"nbow_{}".format(name): nbow_model("reg", e, w2i)
            for name, (e, w2i) in embeddings.items()}

# One preprocessor instance is shared by all runs below.
preprocessor = twitter_preprocess()

# ###########################################################################
# # 1. Task EI-reg: Detecting Emotion Intensity (regression)
# ###########################################################################

# results[task][model name] -> {"dev": ..., "gold": ...}
results = defaultdict(dict)
print()

for emotion in ["joy", "sadness", "fear", "anger"]:
    task = "EI-reg:{}".format(emotion)

    # the same bow_reg instance is passed for every emotion
    dev, gold = train_ei_reg(emotion=emotion, model=bow_reg,
                             evaluation=eval_reg, preprocessor=preprocessor)
    results[task]["bow"] = {"dev": dev, "gold": gold}
예제 #8
0
def predictions(task,
                model,
                config,
                data,
                label_transformer=None,
                batch_size=128,
                preprocessor=None,
                name=None):
    """
    Feed unlabeled data through a model and return the outputs of
    ``predict``.

    Args:
        task (): task identifier, forwarded to ``get_pipeline`` and
            ``predict``. Available tasks:
                - "clf": multiclass classification
                - "bclf": binary classification
                - "mclf": multilabel classification
                - "reg": regression
        model (): the model to run; it is moved to ``DEVICE`` first
        config (): model configuration. ``config["op_mode"]`` must be
            "word" or "char"; in word mode the config is also passed to
            ``load_embeddings``
        data (): the input samples (must support ``len``)
        label_transformer (): forwarded unchanged to the dataset class
        batch_size (): batch size given to the ``DataLoader``
        preprocessor (): word mode only; ``twitter_preprocess()`` is used
            when it is None
        name (): forwarded to the dataset class as ``name``

    Returns:
        tuple: ``(pred, posteriors, attentions, data)``. The first three
        come from ``predict``; ``data`` is ``loader.dataset.data``.

    Raises:
        ValueError: if ``config["op_mode"]`` is neither "word" nor "char".
    """
    op_mode = config["op_mode"]
    in_word_mode = op_mode == "word"

    if in_word_mode:
        token_index, _idx2word, _weights = load_embeddings(config)
    else:
        token_index = None

    # placeholder targets, so that the Dataset classes can be used as they are
    fake_targets = [0] * len(data)

    if in_word_mode:
        if preprocessor is None:
            preprocessor = twitter_preprocess()
        dataset = WordDataset(data,
                              fake_targets,
                              token_index,
                              name=name,
                              preprocess=preprocessor,
                              label_transformer=label_transformer)
    elif op_mode == "char":
        print("Building char-level datasets...")
        dataset = CharDataset(data,
                              fake_targets,
                              name=name,
                              label_transformer=label_transformer)
    else:
        raise ValueError("Invalid op_mode")

    loader = DataLoader(dataset, batch_size)

    model.to(DEVICE)

    pipeline = get_pipeline(task=task, eval=True)
    result = predict(model, pipeline, loader, task, "eval")

    # neither the average loss nor the echoed placeholder targets are needed
    _avg_loss, (_targets, pred), posteriors, attentions = result

    return pred, posteriors, attentions, loader.dataset.data