print("\n > running in FINAL mode!\n") #training, testing = loader.load_final() data_folder = [ "../dataset/brexit/BrexitOpposite.txt", "../dataset/brexit/BrexitNeutral.txt", "../dataset/brexit/BrexitSupport.txt" ] training, testing = loader.load_stance_brexit_5cross( wholeFile, cross_num) else: training, validation, testing = loader.load_train_val_test() if SEMEVAL_GOLD: print("\n > running in Post-Mortem mode!\n") gold_data = SemEvalDataLoader().get_gold(task=TASK) gX = [obs[1] for obs in gold_data] gy = [obs[0] for obs in gold_data] gold = prepare_dataset(gX, gy, loader.pipeline, loader.y_one_hot) validation = testing testing = gold FINAL = False ############################################################################ # NN MODEL # ------------ # Uncomment one of the following model definitions, in order to define a model ############################################################################ print("Building NN Model...") # nn_model = build_attention_RNN(embeddings, classes=3, max_length=max_length,
def __init__(self, word_indices, text_lengths, subtask="A", silver=False,
             **kwargs):
    """Load SemEval data and build the text -> embedding-indices pipeline.

    Args:
        word_indices: mapping from token to embedding-matrix row, consumed
            by both the pipeline and downstream models.
        text_lengths: max sequence length(s) used for padding/truncation
            by ``EmbeddingsExtractor``.
        subtask: SemEval subtask identifier; for subtasks other than "A"
            extra tokens are added by the extractor.
        silver: when True, additionally load the silver (weakly-labeled)
            dataset into ``self.silver_X`` / ``self.silver_y``.
        **kwargs:
            filter_classes: optional iterable of labels; keep only
                observations whose label is in this set.
            y_one_hot: stored flag for one-hot encoding of labels
                (default True); read by consumers of this loader.
    """
    self.word_indices = word_indices

    filter_classes = kwargs.get("filter_classes", None)
    self.y_one_hot = kwargs.get("y_one_hot", True)

    # Preprocess (ekphrasis) then map tokens to embedding indices.
    self.pipeline = Pipeline([
        ('preprocess', CustomPreProcessor(
            TextPreProcessor(
                # BUGFIX: 'url' appeared twice in this list; the duplicate
                # was redundant (the second occurrence could never match
                # after the first) and has been removed.
                backoff=[
                    'url', 'email', 'percent', 'money', 'phone', 'user',
                    'time', 'date', 'number'
                ],
                include_tags={
                    "hashtag", "allcaps", "elongated", "repeated",
                    'emphasis', 'censored'
                },
                fix_html=True,
                segmenter="twitter",
                corrector="twitter",
                unpack_hashtags=True,
                unpack_contractions=True,
                spell_correct_elong=False,
                tokenizer=SocialTokenizer(lowercase=True).tokenize,
                dicts=[emoticons]))),
        ('ext', EmbeddingsExtractor(word_indices=word_indices,
                                    max_lengths=text_lengths,
                                    add_tokens=(False, True)
                                    if subtask != "A" else True,
                                    unk_policy="random"))
    ])

    # loading data
    print("Loading data...")
    dataset = SemEvalDataLoader(verbose=False).get_data(task=subtask,
                                                        years=None,
                                                        datasets=None,
                                                        only_semeval=True)
    # Deterministic shuffle so splits are reproducible across runs.
    random.Random(42).shuffle(dataset)

    if filter_classes:
        dataset = [d for d in dataset if d[0] in filter_classes]

    # Each observation is (label, text, ...): column 1 is the text,
    # column 0 the label.
    self.X = [obs[1] for obs in dataset]
    self.y = [obs[0] for obs in dataset]
    print("total observations:", len(self.y))

    print("-------------------\ntraining set stats\n-------------------")
    print_dataset_statistics(self.y)
    print("-------------------")

    if silver:
        # Optional weakly-labeled (silver) data, kept separate from X/y.
        print("Loading silver data...")
        dataset = SemEvalDataLoader().get_silver()
        self.silver_X = [obs[1] for obs in dataset]
        self.silver_y = [obs[0] for obs in dataset]
        print("total observations:", len(self.silver_y))
from utilities.data_loader import get_embeddings
from utilities.sklearn import eval_clf, nbow_model

numpy.random.seed(1337)  # fixed seed for reproducibility


def tok(text):
    """Identity tokenizer: pass the text through unchanged."""
    return text


# Word-embedding configuration.
WV_CORPUS = "datastories.twitter"
WV_DIM = 300

embeddings, word_indices = get_embeddings(corpus=WV_CORPUS, dim=WV_DIM)


def _texts_and_labels(rows):
    """Split (label, text, ...) rows into parallel (texts, labels) lists."""
    texts = [row[1] for row in rows]
    labels = [row[0] for row in rows]
    return texts, labels


# Training data: SemEval subtask A, all years, official datasets only.
train_set = SemEvalDataLoader(verbose=False).get_data(task="A",
                                                      years=None,
                                                      datasets=None,
                                                      only_semeval=True)
X, y = _texts_and_labels(train_set)

# Held-out gold test data for the same subtask.
test_data = SemEvalDataLoader(verbose=False).get_gold(task="A")
X_test, y_test = _texts_and_labels(test_data)

# Neural bag-of-words features feeding a LinearSVC classifier.
print("-----------------------------")
print("LinearSVC")
nbow = nbow_model("clf", embeddings, word_indices)
nbow.fit(X, y)
results = eval_clf(nbow.predict(X_test), y_test)
for res, val in results.items():
    print("{}: {:.3f}".format(res, val))