def predictions(task, model, config, data, label_transformer=None,
                batch_size=128, preprocessor=None, name=None):
    """
    Run a trained model on raw (unlabeled) data and return its predictions.

    Args:
        task (str): available tasks
            - "clf": multiclass classification
            - "bclf": binary classification
            - "mclf": multilabel classification
            - "reg": regression
        model (torch.nn.Module): the trained model
        config (dict): the model configuration; must define "op_mode"
        data (list): the raw input texts
        label_transformer: optional transformer applied to the labels
        batch_size (int): batch size used for inference
        preprocessor (callable): optional text preprocessor; defaults to
            twitter_preprocess() for word-level models
        name (str): name used for caching the preprocessed dataset

    Returns:
        the predictions, the posteriors, the attention scores and the
        preprocessed input data
    """
    word2idx = None
    if config["op_mode"] == "word":
        word2idx, idx2word, embeddings = load_embeddings(config)

    # dummy scores in order to utilize the Dataset classes as they are
    dummy_y = [0] * len(data)

    if config["op_mode"] == "word":
        if preprocessor is None:
            preprocessor = twitter_preprocess()
        dataset = WordDataset(data, dummy_y, word2idx, name=name,
                              preprocess=preprocessor,
                              label_transformer=label_transformer)
        loader = DataLoader(dataset, batch_size)
    elif config["op_mode"] == "char":
        print("Building char-level datasets...")
        dataset = CharDataset(data, dummy_y, name=name,
                              label_transformer=label_transformer)
        loader = DataLoader(dataset, batch_size)
    else:
        raise ValueError("Invalid op_mode")

    model.to(DEVICE)
    pipeline = get_pipeline(task=task, eval=True)
    avg_loss, (dummy_y, pred), posteriors, attentions = predict(model, pipeline,
                                                                loader, task,
                                                                "eval")

    return pred, posteriors, attentions, loader.dataset.data
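# Usage sketch for predictions() above: run an already-trained model over a
# handful of raw tweets and inspect the predictions and attention weights.
# The checkpoint path and the torch.load() persistence scheme are assumptions
# for illustration; only the "op_mode" key is required by the function itself,
# plus whatever load_embeddings() expects for word-level models.
import torch

model = torch.load("path/to/trained_model.pt", map_location="cpu")  # hypothetical checkpoint
config = {"op_mode": "word"}

tweets = ["I am over the moon today!", "this traffic is driving me crazy"]
pred, posteriors, attentions, processed = predictions(task="clf",
                                                      model=model,
                                                      config=config,
                                                      data=tweets,
                                                      batch_size=64,
                                                      name="demo_inference")
print(pred)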
def train_ei_reg(emotion, model, evaluation, preprocessor=None):
    """
    1. Task EI-reg: Detecting Emotion Intensity (regression)

    Given:
        - a tweet
        - an emotion E (anger, fear, joy, or sadness)

    Task: determine the intensity of E that best represents the mental state
    of the tweeter: a real-valued score between 0 and 1:
        - a score of 1: highest amount of E can be inferred
        - a score of 0: lowest amount of E can be inferred

    For each language: 4 training sets and 4 test sets: one for each
    emotion E.

    (Note that the absolute scores have no inherent meaning -- they are used
    only as a means to convey that the instances with higher scores correspond
    to a greater degree of E than instances with lower scores.)

    :param emotion: one of ["anger", "fear", "joy", "sadness"]
    :param model: an sklearn-style estimator exposing fit() and predict()
    :param evaluation: callable computing the evaluation metrics,
        invoked as evaluation(y_pred, y_true)
    :param preprocessor: optional text preprocessor; defaults to
        twitter_preprocess()
    :return: the evaluation results on the dev and the gold (test) sets
    """
    if preprocessor is None:
        preprocessor = twitter_preprocess()

    model_config = TASK1_EIREG

    X_train, y_train = parse(task='EI-reg', emotion=emotion, dataset="train")
    X_dev, y_dev = parse(task='EI-reg', emotion=emotion, dataset="dev")
    X_test, y_test = parse(task='EI-reg', emotion=emotion, dataset="gold")

    # keep only the intensity scores
    y_train = [y[1] for y in y_train]
    y_dev = [y[1] for y in y_dev]
    y_test = [y[1] for y in y_test]

    name = model_config["name"] + "_" + emotion

    X_train = preprocessor("{}_{}".format(name, "train"), X_train)
    X_dev = preprocessor("{}_{}".format(name, "dev"), X_dev)
    X_test = preprocessor("{}_{}".format(name, "test"), X_test)

    model.fit(X_train, y_train)

    res_dev = evaluation(model.predict(X_dev), y_dev)
    res_test = evaluation(model.predict(X_test), y_test)

    return res_dev, res_test
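# Usage sketch for train_ei_reg() above: a single bag-of-words regressor on
# the "joy" subset. bow_model() and eval_reg are assumed to live in
# modules.sklearn.models, as in the experiment scripts of this project.
from modules.sklearn.models import bow_model, eval_reg

joy_dev, joy_gold = train_ei_reg(emotion="joy",
                                 model=bow_model("reg", max_features=30000),
                                 evaluation=eval_reg,
                                 preprocessor=None)
print("EI-reg:joy dev:", joy_dev)
print("EI-reg:joy gold:", joy_gold)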
def load_datasets(datasets, train_batch_size, eval_batch_size, token_type,
                  preprocessor=None, params=None, word2idx=None,
                  label_transformer=None):
    if params is not None:
        name = "_".join(params) if isinstance(params, list) else params
    else:
        name = None

    loaders = {}
    if token_type == "word":
        if word2idx is None:
            raise ValueError("word2idx is required for word-level datasets")
        if preprocessor is None:
            preprocessor = twitter_preprocess()

        print("Building word-level datasets...")
        for k, v in datasets.items():
            _name = "{}_{}".format(name, k)
            dataset = WordDataset(v[0], v[1], word2idx, name=_name,
                                  preprocess=preprocessor,
                                  label_transformer=label_transformer)
            batch_size = train_batch_size if k == "train" else eval_batch_size
            loaders[k] = DataLoader(dataset, batch_size, shuffle=True,
                                    drop_last=True)
    elif token_type == "char":
        print("Building char-level datasets...")
        for k, v in datasets.items():
            _name = "{}_{}".format(name, k)
            dataset = CharDataset(v[0], v[1], name=_name,
                                  label_transformer=label_transformer)
            batch_size = train_batch_size if k == "train" else eval_batch_size
            loaders[k] = DataLoader(dataset, batch_size, shuffle=True,
                                    drop_last=True)
    else:
        raise ValueError("Invalid token_type.")

    return loaders
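# Usage sketch for load_datasets() above: build word-level loaders for a
# train/dev split, assuming os, BASE_PATH and load_word_vectors are imported
# as in the other scripts of this project. The embeddings file name and its
# dimensionality are placeholders; X_*/y_* are lists of raw tweets and labels.
word2idx, idx2word, weights = load_word_vectors(
    os.path.join(BASE_PATH, "embeddings", "some_embeddings.txt"), 300)  # hypothetical file

loaders = load_datasets(datasets={"train": (X_train, y_train),
                                  "dev": (X_dev, y_dev)},
                        train_batch_size=64,
                        eval_batch_size=128,
                        token_type="word",
                        params=["EI-reg", "joy"],
                        word2idx=word2idx)
train_loader, dev_loader = loaders["train"], loaders["dev"]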
def train_ei_oc(emotion, model, evaluation, preprocessor=None):
    """
    2. Task EI-oc: Detecting Emotion Intensity (ordinal classification)

    Given:
        - a tweet
        - an emotion E (anger, fear, joy, or sadness)

    Task: classify the tweet into one of four ordinal classes of intensity
    of E that best represents the mental state of the tweeter:
        0: no E can be inferred
        1: low amount of E can be inferred
        2: moderate amount of E can be inferred
        3: high amount of E can be inferred

    For each language: 4 training sets and 4 test sets: one for each
    emotion E.

    :param emotion: one of ["anger", "fear", "joy", "sadness"]
    :param model: an sklearn-style estimator exposing fit() and predict()
    :param evaluation: callable computing the evaluation metrics,
        invoked as evaluation(y_pred, y_true)
    :param preprocessor: optional text preprocessor; defaults to
        twitter_preprocess()
    :return: the evaluation results on the dev and the gold (test) sets
    """
    if preprocessor is None:
        preprocessor = twitter_preprocess()

    model_config = TASK1_EIOC

    X_train, y_train = parse(task='EI-oc', emotion=emotion, dataset="train")
    X_dev, y_dev = parse(task='EI-oc', emotion=emotion, dataset="dev")
    X_test, y_test = parse(task='EI-oc', emotion=emotion, dataset="gold")

    # keep only the intensity classes
    y_train = [y[1] for y in y_train]
    y_dev = [y[1] for y in y_dev]
    y_test = [y[1] for y in y_test]

    name = model_config["name"] + "_" + emotion

    X_train = preprocessor("{}_{}".format(name, "train"), X_train)
    X_dev = preprocessor("{}_{}".format(name, "dev"), X_dev)
    X_test = preprocessor("{}_{}".format(name, "test"), X_test)

    model.fit(X_train, y_train)

    res_dev = evaluation(model.predict(X_dev), y_dev)
    res_test = evaluation(model.predict(X_test), y_test)

    return res_dev, res_test
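# Usage sketch for train_ei_oc() above: drive the ordinal-classification task
# with a plain bag-of-words classifier and a classification metric, treating
# the four ordinal classes as unordered categories for simplicity. bow_model()
# and eval_clf are assumed to come from modules.sklearn.models.
from modules.sklearn.models import bow_model, eval_clf

oc_results = {}
for emotion in ["anger", "fear", "joy", "sadness"]:
    dev, gold = train_ei_oc(emotion=emotion,
                            model=bow_model("clf", max_features=30000),
                            evaluation=eval_clf,
                            preprocessor=None)
    oc_results[emotion] = {"dev": dev, "gold": gold}
print(oc_results)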
import os

from config import BASE_PATH
from modules.sklearn.models import eval_clf, nbow_model, bow_model
from utils.dataloaders import load_wassa
from utils.load_embeddings import load_word_vectors
from utils.nlp import twitter_preprocess

#############################################################
# Load Data
#############################################################
preprocessor = twitter_preprocess()

X_train, X_test, y_train, y_test = load_wassa()

# Uncomment to subsample the data for quick experiments
# X_train = X_train[:1000]
# y_train = y_train[:1000]
# X_test = X_test[:1000]
# y_test = y_test[:1000]

X_train = preprocessor("wassa_train", X_train)
X_test = preprocessor("wassa_test", X_test)

#############################################################
# Bag-of-Words
#############################################################
bow_clf = bow_model("clf", max_features=30000)
bow_clf.fit(X_train, y_train)
y_pred = bow_clf.predict(X_test)
bow_results = eval_clf(y_pred, y_test)
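#############################################################
# Neural Bag-of-Words (averaged word embeddings)
#############################################################
# A sketch of the N-BOW counterpart to the BOW baseline above, using the
# nbow_model and load_word_vectors imports already present in this script.
# The embeddings file name and its dimensionality are placeholders for
# whatever lives under BASE_PATH/embeddings/.
emb_file = os.path.join(BASE_PATH, "embeddings", "some_embeddings.txt")  # hypothetical file
word2idx, idx2word, weights = load_word_vectors(emb_file, 300)

nbow_clf = nbow_model("clf", weights, word2idx)
nbow_clf.fit(X_train, y_train)
nbow_results = eval_clf(nbow_clf.predict(X_test), y_test)

print("BOW:", bow_results)
print("N-BOW:", nbow_results)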
def load_datasets(datasets, train_batch_size, eval_batch_size, token_type,
                  preprocessor=None, params=None, word2idx=None,
                  label_transformer=None):
    def sample_validation(dataset, n=100):
        """Sample n instances for validation from training."""
        n_ = len(dataset)
        dataset.data = numpy.array(dataset.data)
        dataset.labels = numpy.array(dataset.labels)

        dataset_valid = deepcopy(dataset)

        indices = numpy.random.permutation(len(dataset))
        dataset_valid.data = dataset_valid.data[indices[:n]]
        dataset_valid.labels = dataset_valid.labels[indices[:n]]
        dataset.data = dataset.data[indices[n:]]
        dataset.labels = dataset.labels[indices[n:]]

        assert len(dataset_valid.data) == len(dataset_valid.labels) == n
        assert len(dataset.data) == len(dataset.labels) == n_ - n

        return dataset, dataset_valid

    if params is not None:
        name = "_".join(params) if isinstance(params, list) else params
    else:
        name = None

    loaders = {}
    if token_type == "word":
        if word2idx is None:
            raise ValueError("word2idx is required for word-level datasets")
        if preprocessor is None:
            preprocessor = twitter_preprocess()

        print("Building word-level datasets...")
        for k, v in datasets.items():
            _name = "{}_{}".format(name, k)
            dataset = WordDataset(v[0], v[1], word2idx, name=_name,
                                  preprocess=preprocessor,
                                  label_transformer=label_transformer)
            if k == "train":
                # Sample validation instances
                dataset, dataset_valid = sample_validation(dataset)
                loaders["valid"] = DataLoader(dataset_valid, eval_batch_size,
                                              shuffle=False, drop_last=False)
            batch_size = train_batch_size if k == "train" else eval_batch_size
            loaders[k] = DataLoader(dataset, batch_size,
                                    shuffle=(k == "train"),
                                    drop_last=(k == "train"))
    elif token_type == "char":
        print("Building char-level datasets...")
        for k, v in datasets.items():
            _name = "{}_{}".format(name, k)
            dataset = CharDataset(v[0], v[1], name=_name,
                                  label_transformer=label_transformer)
            if k == "train":
                # Sample validation instances
                dataset, dataset_valid = sample_validation(dataset)
                loaders["valid"] = DataLoader(dataset_valid, eval_batch_size,
                                              shuffle=False, drop_last=False)
            batch_size = train_batch_size if k == "train" else eval_batch_size
            loaders[k] = DataLoader(dataset, batch_size,
                                    shuffle=(k == "train"),
                                    drop_last=(k == "train"))
    else:
        raise ValueError("Invalid token_type.")

    return loaders
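# Usage sketch for this variant of load_datasets(): the call signature is
# unchanged, but the returned dict now also carries a "valid" loader with 100
# instances held out from the training set, which can drive early stopping or
# model selection. The split names and batch sizes below are illustrative.
loaders = load_datasets(datasets={"train": (X_train, y_train),
                                  "gold": (X_test, y_test)},
                        train_batch_size=32,
                        eval_batch_size=128,
                        token_type="char",
                        params="wassa")
for split, loader in loaders.items():  # train, valid, gold
    print(split, len(loader.dataset))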
("word2vec_500_6_concatened.txt", 510), ] embeddings = {} for e, d in emb_files: file = os.path.join(BASE_PATH, "embeddings", e) word2idx, idx2word, weights = load_word_vectors(file, d) embeddings[e.split(".")[0]] = (weights, word2idx) bow_clf = bow_model("clf") bow_reg = bow_model("reg") nbow_clf = {"nbow_{}".format(name): nbow_model("clf", e, w2i) for name, (e, w2i) in embeddings.items()} nbow_reg = {"nbow_{}".format(name): nbow_model("reg", e, w2i) for name, (e, w2i) in embeddings.items()} preprocessor = twitter_preprocess() # ########################################################################### # # 1. Task EI-reg: Detecting Emotion Intensity (regression) # ########################################################################### results = defaultdict(dict) print() for emotion in ["joy", "sadness", "fear", "anger"]: task = "EI-reg:{}".format(emotion) dev, gold = train_ei_reg(emotion=emotion, model=bow_reg, evaluation=eval_reg, preprocessor=preprocessor) results[task]["bow"] = {"dev": dev, "gold": gold}