Example #1
        print("\n > running in FINAL mode!\n")
        #training, testing = loader.load_final()
        data_folder = [
            "../dataset/brexit/BrexitOpposite.txt",
            "../dataset/brexit/BrexitNeutral.txt",
            "../dataset/brexit/BrexitSupport.txt"
        ]
        training, testing = loader.load_stance_brexit_5cross(
            wholeFile, cross_num)

    else:
        training, validation, testing = loader.load_train_val_test()

    if SEMEVAL_GOLD:
        print("\n > running in Post-Mortem mode!\n")
        gold_data = SemEvalDataLoader().get_gold(task=TASK)
        gX = [obs[1] for obs in gold_data]
        gy = [obs[0] for obs in gold_data]
        gold = prepare_dataset(gX, gy, loader.pipeline, loader.y_one_hot)

        # post-mortem evaluation: the old test split becomes the validation set
        # and the SemEval gold data becomes the test set
        validation = testing
        testing = gold
        FINAL = False

    ############################################################################
    # NN MODEL
    # ------------
    # Uncomment one of the following model definitions to define a model.
    ############################################################################
    print("Building NN Model...")
    # nn_model = build_attention_RNN(embeddings, classes=3, max_length=max_length,
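The listing is cut off at the model definition. As an illustrative sketch only (not the repository's build_attention_RNN; the layer sizes and compile settings below are assumptions), a comparable Keras model over the pretrained embeddings could look like this:

# Hypothetical stand-in model: frozen pretrained embeddings, a single LSTM
# layer and a 3-way softmax, compiled for categorical cross-entropy.
from keras.layers import Dense, Embedding, LSTM
from keras.models import Sequential


def build_simple_rnn(embeddings, classes, max_length):
    model = Sequential()
    model.add(Embedding(input_dim=embeddings.shape[0],
                        output_dim=embeddings.shape[1],
                        weights=[embeddings],
                        input_length=max_length,
                        trainable=False))
    model.add(LSTM(150))
    model.add(Dense(classes, activation="softmax"))
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    return model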
Example #2
    def __init__(self,
                 word_indices,
                 text_lengths,
                 subtask="A",
                 silver=False,
                 **kwargs):

        self.word_indices = word_indices

        filter_classes = kwargs.get("filter_classes", None)
        self.y_one_hot = kwargs.get("y_one_hot", True)

        # text pipeline: ekphrasis preprocessing/tokenization, then mapping
        # each token to its index in the embeddings matrix (padded/truncated
        # to text_lengths)
        self.pipeline = Pipeline([
            ('preprocess',
             CustomPreProcessor(
                 TextPreProcessor(
                     backoff=[
                         'url', 'email', 'percent', 'money', 'phone', 'user',
                         'time', 'date', 'number'
                     ],
                     include_tags={
                         "hashtag", "allcaps", "elongated", "repeated",
                         'emphasis', 'censored'
                     },
                     fix_html=True,
                     segmenter="twitter",
                     corrector="twitter",
                     unpack_hashtags=True,
                     unpack_contractions=True,
                     spell_correct_elong=False,
                     tokenizer=SocialTokenizer(lowercase=True).tokenize,
                     dicts=[emoticons]))),
            ('ext',
             EmbeddingsExtractor(word_indices=word_indices,
                                 max_lengths=text_lengths,
                                 add_tokens=(False,
                                             True) if subtask != "A" else True,
                                 unk_policy="random"))
        ])

        # loading data
        print("Loading data...")
        dataset = SemEvalDataLoader(verbose=False).get_data(task=subtask,
                                                            years=None,
                                                            datasets=None,
                                                            only_semeval=True)
        # shuffle with a fixed seed so the data order is reproducible
        random.Random(42).shuffle(dataset)

        if filter_classes:
            dataset = [d for d in dataset if d[0] in filter_classes]

        self.X = [obs[1] for obs in dataset]
        self.y = [obs[0] for obs in dataset]
        print("total observations:", len(self.y))

        print("-------------------\ntraining set stats\n-------------------")
        print_dataset_statistics(self.y)
        print("-------------------")

        if silver:
            print("Loading silver data...")
            dataset = SemEvalDataLoader().get_silver()
            self.silver_X = [obs[1] for obs in dataset]
            self.silver_y = [obs[0] for obs in dataset]
            print("total observations:", len(self.silver_y))
Example #3
import numpy

from dataset.data_loader import SemEvalDataLoader  # adjust to the project's module path
from utilities.data_loader import get_embeddings
from utilities.sklearn import eval_clf, nbow_model

numpy.random.seed(1337)  # for reproducibility


def tok(text):
    """Identity tokenizer: the text is passed through unchanged."""
    return text


WV_CORPUS = "datastories.twitter"
WV_DIM = 300
embeddings, word_indices = get_embeddings(corpus=WV_CORPUS, dim=WV_DIM)

train_set = SemEvalDataLoader(verbose=False).get_data(task="A",
                                                      years=None,
                                                      datasets=None,
                                                      only_semeval=True)
X = [obs[1] for obs in train_set]
y = [obs[0] for obs in train_set]

test_data = SemEvalDataLoader(verbose=False).get_gold(task="A")
X_test = [obs[1] for obs in test_data]
y_test = [obs[0] for obs in test_data]

print("-----------------------------")
print("LinearSVC")
# train the neural bag-of-words model and evaluate its predictions
# against the gold labels
nbow = nbow_model("clf", embeddings, word_indices)
nbow.fit(X, y)
results = eval_clf(nbow.predict(X_test), y_test)
for res, val in results.items():
    print("{}: {:.3f}".format(res, val))