def load_embeddings(model_conf):
    word_vectors = os.path.join(BASE_PATH, "embeddings",
                                "{}.txt".format(model_conf["embeddings_file"]))
    word_vectors_size = model_conf["embed_dim"]

    # load word embeddings
    print("loading word embeddings...")
    return load_word_vectors(word_vectors, word_vectors_size)
def load_embeddings(model_conf, absolute_path=False,
                    embedding_size_auto_detect=None):
    if not absolute_path:
        word_vectors = os.path.join(
            BASE_PATH, "embeddings",
            "{}.txt".format(model_conf["embeddings_file"]))
    else:
        # absolute path
        word_vectors = model_conf["embeddings_file"]

    if embedding_size_auto_detect is not None:
        word_vectors_size = detect_embedding_dim(word_vectors)
    else:
        word_vectors_size = model_conf["embed_dim"]

    # load word embeddings
    print("loading word embeddings...")
    return load_word_vectors(word_vectors, word_vectors_size)
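# Note: detect_embedding_dim is not defined in this snippet. A minimal sketch of
# what such a helper might look like, assuming a plain-text word2vec/GloVe-style
# file where each line is "<token> <v1> <v2> ..." (a hypothetical implementation,
# not the original one):
def detect_embedding_dim(path):
    """Infer the embedding dimensionality from the first line of the file."""
    with open(path, "r", encoding="utf-8") as f:
        fields = f.readline().rstrip().split(" ")
    # some word2vec text files start with a "<vocab_size> <dim>" header line
    if len(fields) == 2:
        return int(fields[1])
    # otherwise every field after the token is a vector component
    return len(fields) - 1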
def read_embeddings(self, file, dim):
    """
    Create an embeddings matrix, in which each row corresponds to the word
    vector from the pretrained word embeddings.
    If a word is missing from the provided pretrained word vectors,
    then sample a new embedding from the gaussian of the pretrained embeddings.
    Args:
        file: path to the pretrained word embeddings file
        dim: dimensionality of the word embeddings
    Returns:
        filtered_embeddings: the embeddings matrix for this vocabulary
        mask: binary vector marking the tokens that had to be sampled
        missing: list of token ids missing from the pretrained embeddings
    """
    word2idx, idx2word, embeddings = load_word_vectors(file, dim)

    mu = embeddings.mean(axis=0)
    sigma = embeddings.std(axis=0)

    filtered_embeddings = numpy.zeros((len(self), embeddings.shape[1]))
    mask = numpy.zeros(len(self))
    missing = []

    for token_id, token in tqdm(self.id2tok.items(),
                                desc="Reading embeddings...",
                                total=len(self.id2tok.items())):
        if token not in word2idx or token == "<unk>":
            # todo: smart sampling per dim distribution
            # sample = numpy.random.uniform(low=-0.5, high=0.5,
            #                               size=embeddings.shape[1])
            sample = numpy.random.normal(mu, sigma / 4)
            filtered_embeddings[token_id] = sample
            mask[token_id] = 1
            missing.append(token_id)
        else:
            filtered_embeddings[token_id] = embeddings[word2idx[token]]

    print(f"Missing tokens from the pretrained embeddings: {len(missing)}")

    return filtered_embeddings, mask, missing
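# A hedged usage sketch (not part of the original module): the matrix returned by
# read_embeddings can initialize a PyTorch embedding layer with frozen pretrained
# vectors. Assumes torch is installed; `vocab` is a hypothetical object exposing
# read_embeddings, and the file path is illustrative.
import torch
from torch import nn

filtered_embeddings, mask, missing = vocab.read_embeddings(
    "embeddings/word2vec_300_6_20_neg.txt", 300)
embedding_layer = nn.Embedding.from_pretrained(
    torch.tensor(filtered_embeddings, dtype=torch.float),
    freeze=True)  # set freeze=False to fine-tune the embeddings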
EMB_TRAINABLE = False
BATCH_SIZE = 128
EPOCHS = 40
DATASET = "MR"  # options: "MR", "Semeval2017A"

# if your computer has a CUDA compatible gpu use it, otherwise use the cpu
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

########################################################
# Define PyTorch datasets and dataloaders
########################################################

# load word embeddings
print("loading word embeddings...")
word2idx, idx2word, embeddings = load_word_vectors(EMBEDDINGS, EMB_DIM)

# load the raw data
if DATASET == "Semeval2017A":
    X_train, y_train, X_test, y_test = load_Semeval2017A()
elif DATASET == "MR":
    X_train, y_train, X_test, y_test = load_MR()
else:
    raise ValueError("Invalid dataset")

# ------------ #
# EX1          #
# ------------ #
# Convert data labels from strings to integers
# create a new label encoder
le = LabelEncoder()
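# One possible completion of EX1 (a sketch, not the original solution): fit the
# encoder on the training labels and map both splits to integer ids.
le.fit(y_train)
y_train = le.transform(y_train)  # e.g. "positive"/"negative" -> 1/0
y_test = le.transform(y_test)
n_classes = len(le.classes_)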
sentiment_yamls = os.path.join(YAML_PATH, 'sentiment')
sst_fine_grained_yamls = os.path.join(YAML_PATH, 'sst_fine_grained')
scv1_yamls = os.path.join(YAML_PATH, 'scv1')
scv2_yamls = os.path.join(YAML_PATH, 'scv2_gen')
psych_yamls = os.path.join(YAML_PATH, 'psychexp')

one_exp = False
yaml = "gating.yaml"
yamls_path = irony_yamls

#########################
# Load embeddings
#########################
if yamls_path in (irony_yamls, sentiment_yamls):
    word2idx, idx2word, weights = load_word_vectors(
        os.path.join(EMB_DIR, "word2vec_300_6_20_neg.txt"), "300")
else:
    word2idx, idx2word, weights = load_word_vectors_from_fasttext(
        os.path.join(EMB_DIR, "wiki.en.vec"), "300")

#########################
# Run experiments
#########################
if one_exp:
    loss, acc, f1, precision, recall, f1_test, acc_test = clf_features_runner(
        os.path.join(sentiment_yamls, "{}".format(yaml)),
        word2idx, idx2word, weights, cluster=True)
    experiments = {'loss': loss, 'acc': acc, 'f1': f1,
                   'precision': precision, 'recall': recall,
                   'f1_test': f1_test, 'acc_test': acc_test}
    now = datetime.datetime.now().strftime("%y-%m-%d_%H:%M:%S")
psych_yamls = os.path.join(YAML_PATH, 'psychexp')
yaml = os.path.join(psych_yamls, "baseline.yaml")

opts, config = train_options(yaml)
device = opts.device

X_train, y_train, X_test, y_test = load_dataset(config["data"]["dataset"],
                                                test=True)

# load word embeddings
if config["data"]["embeddings"] == "wiki.en.vec":
    word2idx, idx2word, weights = load_word_vectors_from_fasttext(
        os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])
else:
    word2idx, idx2word, weights = load_word_vectors(
        os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])

checkpoint_name = "Psych_exp_baseline"
state = load_checkpoint(checkpoint_name)

# features, feat_length = load_features(config["data"]["features"])

test_set = ClfDataset(X_test, y_test, word2idx, name="psych_test")

test_lengths = [len(x) for x in test_set.data]
test_sampler = SortedSampler(test_lengths)

test_loader = DataLoader(test_set, sampler=test_sampler,
                         batch_size=config["batch_size"],
                         num_workers=opts.cores,
# Bag-of-Words #
#############################################################
bow_clf = bow_model("clf", max_features=30000)
bow_clf.fit(X_train, y_train)
y_pred = bow_clf.predict(X_test)
bow_results = eval_clf(y_pred, y_test)

print("\n" + "#" * 40)
print("Bag-of-Words")
print("#" * 40)
for k, v in bow_results.items():
    print("{}:{:.4f}".format(k, v))

#############################################################
# Neural Bag-of-Words
#############################################################
file = os.path.join(BASE_PATH, "embeddings", "word2vec_300_6_20_neg.txt")
word2idx, idx2word, weights = load_word_vectors(file, 300)

nbow_clf = nbow_model("clf", weights, word2idx)
nbow_clf.fit(X_train, y_train)
y_pred = nbow_clf.predict(X_test)
nbow_results = eval_clf(y_pred, y_test)

print("\n" + "#" * 40)
print("Neural Bag-of-Words")
print("#" * 40)
for k, v in nbow_results.items():
    print("{}:{:.4f}".format(k, v))
from utils.load_embeddings import load_word_vectors load_word_vectors("GoogleNews-vectors-negative300.txt", 300)
def submission(dataset, models=[], lm=[], gold=[]):
    X = load_test_wassa(dataset)

    with open("label_encoder.pkl", "rb") as f:
        label_encoder = pickle.load(f)

    # load embeddings
    file = os.path.join(BASE_PATH, "embeddings", "ntua_twitter_300.txt")
    word2idx, idx2word, weights = load_word_vectors(file, 300)

    dummy_y = [[0] * 6] * len(X)
    dummy_y = torch.tensor(dummy_y)

    posteriors_list = []
    predicted_list = []

    for i in range(0, len(models)):
        checkpoint_name = models[i]

        if lm[i]:
            model, optimizer, word2idx, idx2word, loss, acc, f1 = \
                load_checkpoint_pre_lm(checkpoint_name)
        else:
            model, optimizer, vocab, loss, acc, f1 = \
                load_checkpoint_with_f1(checkpoint_name)

        #####################################################################
        # Define Dataloaders
        #####################################################################
        preprocessor = twitter_preprocessor()

        # for new experiments remember to empty _cache!
        test_set = WordDataset(X, dummy_y, word2idx, name="wassa_test_submit",
                               preprocess=preprocessor)
        sampler = SequentialSampler(test_set)
        test_loader = DataLoader(test_set, batch_size=32, sampler=sampler)

        #####################################################################
        # Load Trained Model
        #####################################################################
        model.eval()
        model.to(config.DEVICE)
        print(model)

        #####################################################################
        # Evaluate Trained Model on test set & Calculate predictions
        #####################################################################
        labels, predicted, posteriors = test_clf(model=model,
                                                 data_source=test_loader,
                                                 device=config.DEVICE)
        # pprint(labels)
        pprint(predicted)
        predicted_list.append(predicted)
        posteriors_list.append(posteriors)

    # pred, accuracy, f1 = ensemble_voting(predicted_list, gold, dataset)
    pred, accuracy, f1 = ensemble_posteriors(posteriors_list, gold, dataset)

    #####################################################################
    # Create submission file with the predictions
    #####################################################################
    write_predictions(pred, label_encoder)

    return
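# ensemble_posteriors is not shown here. A minimal sketch of what posterior
# averaging typically looks like (an assumption, not the original implementation,
# with a hypothetical name): average each model's class posteriors and take the
# per-example argmax, then score against gold labels.
import numpy
from sklearn.metrics import accuracy_score, f1_score

def average_posteriors(posteriors_list, gold):
    avg = numpy.mean(numpy.array(posteriors_list), axis=0)  # (n_samples, n_classes)
    pred = avg.argmax(axis=1)
    return pred, accuracy_score(gold, pred), f1_score(gold, pred, average="macro")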
bioc.dump(collection, open(output, 'w'), pretty_print=True)


if __name__ == "__main__":
    # LOAD RAW DATA & WORD VECTORS
    EVAL_DATASET = '../../dataset/PMtask_TestSet.xml'
    MODE = "eval"
    WV_PATH = '../../embeddings/PubMed-w2v.txt'
    WV_DIMS = 200
    MAX_SENT_LENGTH = 45
    MAX_SENTS = 23

    print("loading word embeddings...")
    word2idx, idx2word, embeddings = load_word_vectors(WV_PATH, WV_DIMS, True)

    docs, labels, ids = load_data(EVAL_DATASET, MODE)

    # convert strings to lists of tokens
    print("Tokenizing...")
    docs = [[text_to_word_sequence(sent) for sent in sent_tokenize(doc)]
            for doc in docs]

    # convert words to word indexes
    print("Vectorizing...")
    docs = [vectorize_doc(doc, word2idx, MAX_SENTS, MAX_SENT_LENGTH)
            for doc in docs]
    docs = numpy.array(docs)

    # LOAD SAVED MODEL
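# vectorize_doc is defined elsewhere in the project. A hedged sketch of the
# padding/truncation logic implied by its arguments (an assumption with a
# hypothetical name, not the original helper):
import numpy

def vectorize_doc_sketch(doc, word2idx, max_sents, max_sent_length, unk="<unk>"):
    """Map a tokenized document to a fixed (max_sents, max_sent_length) index matrix."""
    matrix = numpy.zeros((max_sents, max_sent_length), dtype="int32")
    for i, sent in enumerate(doc[:max_sents]):
        for j, token in enumerate(sent[:max_sent_length]):
            matrix[i, j] = word2idx.get(token, word2idx.get(unk, 0))
    return matrix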
from utils.load_embeddings import load_word_vectors load_word_vectors( r"D:\ruin\data\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.txt", 300)
from model.task1.baseline_models import train_ei_reg, train_ei_oc, train_v_reg, \
    train_v_oc, train_e_c
from modules.sklearn.models import nbow_model, bow_model, eval_reg, eval_mclf
from utils.load_embeddings import load_word_vectors
from utils.nlp import twitter_preprocess

emb_files = [
    ("word2vec_300_6_20_neg.txt", 300),
    ("word2vec_300_6_concatened.txt", 310),
    ("word2vec_500_6_20_neg.txt", 500),
    ("word2vec_500_6_concatened.txt", 510),
]

embeddings = {}
for e, d in emb_files:
    file = os.path.join(BASE_PATH, "embeddings", e)
    word2idx, idx2word, weights = load_word_vectors(file, d)
    embeddings[e.split(".")[0]] = (weights, word2idx)

bow_clf = bow_model("clf")
bow_reg = bow_model("reg")

nbow_clf = {"nbow_{}".format(name): nbow_model("clf", e, w2i)
            for name, (e, w2i) in embeddings.items()}
nbow_reg = {"nbow_{}".format(name): nbow_model("reg", e, w2i)
            for name, (e, w2i) in embeddings.items()}

preprocessor = twitter_preprocess()

# ###########################################################################
# # 1. Task EI-reg: Detecting Emotion Intensity (regression)
# ###########################################################################
extral = False
discr = False
d = 0.6
unfreeze = True
freeze = {"embed": True, "hidden": True}
unfreeze_epoque = {"embed": 6, "hidden": 4}  # at which epoch the fine-tuning starts

name = "wassa_2M_ep2_GU_lr_weight_decay"

file = os.path.join(BASE_PATH, "embeddings", "ntua_twitter_300.txt")
_, _, weights = load_word_vectors(file, 300)

# load dataset
config = WASSA_WITH_PRETR_LM
config_lm = ConfLangModel

# Attention size needs to be equal to RNN size for Transfer Learning
if config['encoder_size'] != config_lm['rnn_size']:
    config['encoder_size'] = config_lm['rnn_size']
    print("Classifier RNN size needs to be equal to LM RNN size!")

X_train, X_test, y_train, y_test = load_wassa()

# 3 - convert labels from strings to integers
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(y_train)
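# A hedged sketch of how the freeze/unfreeze_epoque schedule above might be applied
# inside a training loop (an assumption, not the original trainer; it presumes the
# model exposes `embedding` and `encoder` submodules):
def maybe_unfreeze(model, epoch):
    if unfreeze and freeze["embed"] and epoch >= unfreeze_epoque["embed"]:
        for p in model.embedding.parameters():
            p.requires_grad = True  # start fine-tuning the embedding layer
        freeze["embed"] = False
    if unfreeze and freeze["hidden"] and epoch >= unfreeze_epoque["hidden"]:
        for p in model.encoder.parameters():
            p.requires_grad = True  # start fine-tuning the hidden (RNN) layers
        freeze["hidden"] = False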