Exemplo n.º 1
0
def _print_op_list(ops, dmap):
    for d, o in ops:
        print "#############################################"
        print " ".join(DataTools.mark_unknown(d.tokens, word_embeddings.vocabulary.word2index))
        print "ORIG:\t", _ops2str(d.opinions)
        print o
        print ""
        print "    :\t", _ops2str(dmap[d.id].opinions)
Exemplo n.º 2
0
    def build(self):
        """Create one label vocabulary per opinion field.

        Walks over all opinions of ``self.sentences``, gathers the distinct
        ``polarity``, ``category``, ``entity`` and ``attribute`` values and
        stores a ``DataTools.Vocabulary`` initialised from each set as
        ``self.polarity_table``, ``self.category_table``,
        ``self.entity_table`` and ``self.attribute_table``.
        """
        label_fields = ("polarity", "category", "entity", "attribute")
        distinct_values = dict((field, set()) for field in label_fields)

        for sentence in self.sentences:
            for opinion in sentence.opinions:
                for field in label_fields:
                    distinct_values[field].add(getattr(opinion, field))

        for field in label_fields:
            table = DataTools.Vocabulary()
            # Attach the table first, then fill it.
            setattr(self, field + "_table", table)
            table.init_from_vocab(distinct_values[field])
Exemplo n.º 3
0
def _print_list(l):
    for d, td, cd in l:
        d.opinions = sorted(d.opinions, key=lambda o: o.start)
        td.opinions = sorted(td.opinions, key=lambda o: o.start)
        cd.opinions = sorted(cd.opinions, key=lambda o: o.start)

        print "#############################################"
        print " ".join(DataTools.mark_unknown(d.tokens, word_embeddings.vocabulary.word2index))
        print ""
        print "ORIG:\t", _ops2str(d.opinions)
        print ""
        print "TOKN:\t", _ops2str(td.opinions)
        print ""
        print "CHAR:\t", _ops2str(cd.opinions)
Exemplo n.º 4
0
from nlputils import DataTools

import data

# Export trimmed copies of the Amazon-review word embeddings, one per
# vocabulary size.  The full embedding is loaded afresh for every size
# (presumably because trim_embeddings modifies the object in place).
source_matrix_path = "/vol/scstaff/sjebbara/data/embeddings/amazon_review_corpus_en_100D_advanced_W.npy"
source_vocab_path = "/vol/scstaff/sjebbara/data/embeddings/amazon_review_corpus_en_100D_advanced_vocab.txt"

for top_k in (10000, 20000, 50000):
    word_embeddings = DataTools.Embedding()
    word_embeddings.load(source_matrix_path, source_vocab_path)
    word_embeddings.trim_embeddings(vocab_trim=["<UNK>"], top_k=top_k)

    # NOTE(review): the unknown index is registered before the special
    # tokens below are added at indices 0-2; if `add` shifts existing
    # entries, this index would be stale -- confirm against DataTools.
    unk_index = word_embeddings.vocabulary.get_index("<UNK>")
    word_embeddings.vocabulary.set_unknown(unk_index)

    # Special tokens: padding at 0, sentence start at 1, sentence end at 2,
    # each with an all-zero vector.
    word_embeddings.add("<pad>", 0, vector_init="zeros")
    word_embeddings.vocabulary.set_padding(0)
    word_embeddings.add(data.SENTENCE_START_TOKEN, 1, vector_init="zeros")
    word_embeddings.add(data.SENTENCE_END_TOKEN, 2, vector_init="zeros")

    output_name = "amazon_review_corpus_en_100D_advanced_top-{}".format(top_k)
    word_embeddings.save("../res/embeddings/", output_name)
Exemplo n.º 5
0
def evaluate_aspects(model,
                     documents,
                     word_vocabulary,
                     pos_vocabulary,
                     char_vocabulary,
                     conf,
                     verbose=True):
    print "Evaluate model ..."
    tagging_scheme = AnnotationTools.get_tagging_scheme(conf.tagging_scheme)
    documents = filter(lambda d: not d.out_of_scope, documents)

    batch_generator = DatasetTools.BatchGenerator(
        documents,
        100,
        get_vectorizer(word_vocabulary, pos_vocabulary, char_vocabulary,
                       tagging_scheme),
        raw_data_name="document")

    results = LearningTools.ExperimentSnapshotResults()
    errors = 0
    for i, batches in enumerate(batch_generator):
        actual_batch_size = len(batches.text_input)
        i_text = i * conf.batch_size + actual_batch_size
        print("Pretrain Batch %d; Text %d:" % (i + 1, i_text))

        predicted_aspect_batch = model.predict_on_batch(batches)
        batches["predicted_aspect_output"] = predicted_aspect_batch

        for instance in DatasetTools.BatchIterator([batches]):
            d = instance["document"]
            true_aspects = instance["aspect_output"]
            predicted_aspect_probas = instance["predicted_aspect_output"]

            if verbose:
                print u"#### Sentence: [{}]: '{}'".format(d.id, d.text)
            tokens = DataTools.mark_unknown(d.tokens,
                                            word_vocabulary.word2index)

            true_aspects = true_aspects[-len(tokens):, :]  # remove padding
            predicted_aspect_probas = predicted_aspect_probas[
                -len(tokens):, :]  # remove padding

            true_aspect_spans_orig = set([(o.token_start, o.token_end)
                                          for o in d.opinions])
            true_aspect_spans = set(
                tagging_scheme.encoding2spans(true_aspects))
            if true_aspect_spans != true_aspect_spans_orig:
                print "ERROR: {} vs. {}".format(true_aspect_spans_orig,
                                                true_aspect_spans)
                errors += 1
            predicted_aspect_spans = set(
                tagging_scheme.encoding2spans(predicted_aspect_probas))

            tokens_proba = [
                u"{} ({:.2f},{:.2f},{:.2f})".format(t, pb, pi, po)
                for t, (pb, pi, po) in zip(tokens, predicted_aspect_probas)
            ]

            if verbose:
                print "TRUE:   ", tagging_scheme.visualize_tags(
                    tokens_proba,
                    tagging_scheme.spans2tags(len(tokens), true_aspect_spans),
                    spacer=" ")
                print "PRED:   ", tagging_scheme.visualize_tags(
                    tokens_proba,
                    tagging_scheme.spans2tags(len(tokens),
                                              predicted_aspect_spans),
                    spacer=" ")

            data_sample = DataTools.DataSample()
            data_sample.document = d
            data_sample.true_aspect_spans = true_aspect_spans
            data_sample.predicted_aspect_spans = predicted_aspect_spans
            data_sample.predicted_aspect_probas = predicted_aspect_probas
            results.add(data_sample)

    def extract_aspects(min_confidence=0.75):
        all_true_aspects = set()
        all_predicted_aspects = set()
        for ds in results.data_samples:
            for a in ds.true_aspect_spans:
                all_true_aspects.add((ds.document.id, ) + a)

            for a in ds.predicted_aspect_spans:
                probas = numpy.max(ds.predicted_aspect_probas[a[0]:a[1]],
                                   axis=1)
                if numpy.mean(probas) > min_confidence:
                    all_predicted_aspects.add((ds.document.id, ) + a)

        return all_true_aspects, all_predicted_aspects

    def score(beta=1, min_confidence=0.):
        all_true_aspects, all_predicted_aspects = results.extract_aspects(
            min_confidence)

        return EvaluationTools.f1(beta=beta,
                                  targets=all_true_aspects,
                                  predictions=all_predicted_aspects)

    results.extract_aspects = extract_aspects
    results.score = score

    f1, p, r = results.score(min_confidence=0)
    print "F1: {:.3f}".format(f1)
    print "P:  {:.3f}".format(p)
    print "R:  {:.3f}".format(r)

    print "#Errors:", errors
    return results
Exemplo n.º 6
0
def main(conf, plot_scores=True):
    """Train and evaluate the aspect tagger on every train/validation split.

    Creates a fresh experiment directory, reads the SemEval-2016 restaurant
    data according to ``conf.data_split`` (``"original"``, ``"custom"`` or
    ``"cv"``), loads the pre-trimmed word embeddings for
    ``conf.top_k_vocab``, derives a character vocabulary from the word
    vocabulary and then, for each split, trains for ``conf.n_epochs`` epochs.
    After every epoch the model is evaluated on train and validation data,
    predictions are written as XML, the validation F1 is appended to
    ``scores.txt`` and improved weights are checkpointed.

    Args:
        conf: experiment configuration.  Besides the fields read here it is
            updated in place with ``experiment_id``, ``word_input_size``,
            ``word_embedding_size``, ``char_input_size`` and
            ``pos_input_size``, and is unpacked as keyword arguments into the
            model factory.
        plot_scores: when true, scores are fed to a
            ``LearningTools.ScorePlot`` and the F1 scores are saved under
            ``../results/``.

    Raises:
        ValueError: if ``conf.data_split`` is not one of the supported modes.
    """
    conf.experiment_id = ("AspectBasedSentiment_Configuration_" +
                          LearningTools.get_timestamp())
    print(conf)
    base_dirpath = os.path.join(EXPERIMENTS_OUTPUT_DIR,
                                "AspectBasedSentiment_" + conf.timestamp,
                                conf.experiment_id)
    os.makedirs(base_dirpath)
    print("read dataset...")

    # Read documents and split in train/val portions
    if conf.data_split == "original":
        train_dataset = data.read_semeval2016_restaurant_train(
            conf.scope, conf.text_preprocessing, conf.tokenization_style,
            conf.sentence_filter, conf.opinion_filter)
        blind_test_documents = data.read_semeval2016_restaurant_blind_test(
            conf.scope, conf.text_preprocessing,
            conf.tokenization_style).sentences
        train_documents, val_documents = DataTools.custom_split(
            train_dataset.sentences, 0.8, seed=7)
        train_test_splits = [(train_documents, val_documents)]
    elif conf.data_split == "custom":
        dataset = data.read_semeval2016_restaurant_train(
            conf.scope, conf.text_preprocessing, conf.tokenization_style,
            conf.sentence_filter, conf.opinion_filter)
        train_documents, test_documents = DataTools.custom_split(
            dataset.sentences, 0.8, seed=7)
        train_test_splits = [(train_documents, test_documents)]
    elif conf.data_split == "cv":
        train_dataset = data.read_semeval2016_restaurant_train(
            conf.scope, conf.text_preprocessing, conf.tokenization_style,
            conf.sentence_filter, conf.opinion_filter)
        train_test_splits = DataTools.cross_validation_split(
            train_dataset.sentences, conf.n_cross_validation, seed=7)
    else:
        # Fail with a clear message instead of a NameError further down.
        raise ValueError(
            "unknown conf.data_split: {!r} (expected 'original', 'custom' "
            "or 'cv')".format(conf.data_split))

    # read word embeddings
    word_embeddings = DataTools.Embedding()
    word_embeddings.load(
        "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-{}_W.npy".
        format(conf.top_k_vocab),
        "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-{}_vocab.txt"
        .format(conf.top_k_vocab))
    word_embeddings.vocabulary.set_padding(
        word_embeddings.vocabulary.get_index("<pad>"))
    word_embeddings.vocabulary.set_unknown(
        word_embeddings.vocabulary.get_index("<UNK>"))

    conf.word_input_size = len(word_embeddings.vocabulary)
    conf.word_embedding_size = word_embeddings.W.shape[1]

    # read character vocabulary (map from word to index and back)
    char_vocabulary = DataTools.Vocabulary()
    char_vocab = Counter(c for w in word_embeddings.vocabulary.vocab
                         for c in unidecode(w) if c != " ")
    print(char_vocab.most_common())
    char_vocabulary.init_from_vocab(char_vocab)
    char_vocabulary.add_padding("<0>", 0)
    char_vocabulary.add_unknown("<?>", 1)
    char_vocabulary.save(os.path.join(base_dirpath, "char_vocabulary.txt"))

    conf.char_input_size = len(char_vocabulary)

    # The POS input size is always recorded; the vocabulary itself is only
    # passed on when POS features are enabled.
    pos_vocabulary = LexicalTools.pos_vocabulary
    conf.pos_input_size = len(pos_vocabulary)
    if not conf.use_pos:
        pos_vocabulary = None

    # setup plotting
    if plot_scores:
        score_plot = LearningTools.ScorePlot(
            "Aspect Extraction",
            n_cross_validation=len(train_test_splits),
            n_epochs=conf.n_epochs)

    # Defined up front so the final report also works without any split.
    best_epoch = 0
    best_score = 0

    # iterate over cross validation splits and train model
    for n, (train_documents, val_documents) in enumerate(train_test_splits):
        cv_dirpath = os.path.join(base_dirpath, "cv-{}".format(n + 1))
        os.makedirs(cv_dirpath)
        conf.save(os.path.join(cv_dirpath, "configuration.conf"))

        # Best validation F1 of this split and the 1-based epoch it was
        # reached in (matching the "weights@<epoch>.h5" file names).
        best_epoch = 0
        best_score = 0

        model_name = "{}_{}_n-docs={}_batch-size={}_epochs={}_s-size={}_c-size={}_topK={}".format(
            conf.model, conf.dataset, conf.max_documents, conf.batch_size,
            conf.n_epochs, conf.sequence_embedding_size,
            conf.char_embedding_size, conf.top_k_vocab)
        print("Model:", model_name)
        print(conf)

        # instantiate model using the defined configuration
        model_fn = models.__dict__[conf.model]
        modelz = model_fn(word_embedding_weights=[word_embeddings.W], **conf)

        # modelz[0] is the model for tagging sentences, modelz[1] for obtaining a char-level vector for a word
        model = modelz[0]

        model.summary()

        models_dirpath = os.path.join(cv_dirpath, "models")
        os.makedirs(models_dirpath)

        # (validation F1, 0-based epoch index) of the best checkpoint so far.
        best_model = (0, 0)
        for e in range(conf.n_epochs):
            process.train_aspects(model,
                                  train_documents,
                                  word_embeddings.vocabulary,
                                  pos_vocabulary,
                                  char_vocabulary,
                                  conf,
                                  e,
                                  n_epochs=conf.n_epochs)
            print("\n\nEvaluate on TRAIN")
            train_results = process.evaluate_aspects(
                model,
                train_documents,
                word_embeddings.vocabulary,
                pos_vocabulary,
                char_vocabulary,
                conf,
                verbose=False)
            print("\n\nEvaluate on VAL")
            val_results = process.evaluate_aspects(model, val_documents,
                                                   word_embeddings.vocabulary,
                                                   pos_vocabulary,
                                                   char_vocabulary, conf)

            # Only the "original" split has a separate blind test set; the
            # other modes write predictions for the validation documents.
            if conf.data_split == "original":
                predict_documents = blind_test_documents
            else:
                predict_documents = val_documents

            process.predict_and_write(
                os.path.join(cv_dirpath,
                             "epoch={}_predicted_aspects.xml".format(e + 1)),
                model, predict_documents, word_embeddings.vocabulary,
                pos_vocabulary, char_vocabulary, conf)

            f1_train, p_train, r_train = train_results.score(min_confidence=0)
            f1, p, r = val_results.score(min_confidence=0)

            if plot_scores:
                score_plot.add(n, e, f1_train, "F1-Train")
                score_plot.add(n, e, f1, "F1")
                score_plot.add(n, e, p, "P")
                score_plot.add(n, e, r, "R")

                score_plot.print_scores("F1")

            # NOTE(review): epochs with index 0 and 1 are never checkpointed
            # -- presumably a deliberate warm-up; confirm.
            if e > 1:
                if f1 > best_score:
                    model.save_weights(
                        os.path.join(models_dirpath,
                                     "weights@{}.h5".format(e + 1)))
                    best_score = f1
                    best_epoch = e + 1
                    best_model = (f1, e)

            # io.open in text mode needs unicode on Python 2, hence the
            # unicode literal.
            with io.open(os.path.join(cv_dirpath, "scores.txt"), "a") as f:
                f.write(u"{:.6f}\n".format(f1))

        print("best model:", best_model)

        ############ Save Model Weights ############
        model.save_weights(os.path.join(models_dirpath, "final_weights.h5"))

    if plot_scores:
        numpy.save("../results/scores_{}.npy".format(conf.model),
                   score_plot.scores["F1"])
    # Reports the values of the last processed split.
    print("Best Epoch {} with score {}".format(best_epoch, best_score))
Exemplo n.º 7
0
def _ops2str(opinions):
    """Format opinions as ``(start-end): 'tokens'`` entries joined by ``|``.

    The tokens of every opinion are passed through
    ``DataTools.mark_unknown`` with the module-level ``word_embeddings``
    vocabulary and joined with single spaces.
    """
    rendered = []
    for opinion in opinions:
        start, end = opinion.start, opinion.end
        marked_tokens = DataTools.mark_unknown(
            opinion.tokens, word_embeddings.vocabulary.word2index)
        rendered.append(
            "({}-{}): '{}'".format(start, end, " ".join(marked_tokens)))
    return "|".join(rendered)
Exemplo n.º 8
0
    os.path.join(experiment_base_dirpath, cv_dirname, "configuration.conf"))

print(conf)

# Look up the model factory by the name stored in the configuration and
# build the models without pretrained word-embedding weights; the second
# element of the result is used as the character-level model.
model_fn = models.__dict__[conf.model]
modelz = model_fn(word_embedding_weights=None, **conf)
char_model = modelz[1]

# Load the trained weights from <experiment>/<cv dir>/models/best_model.h5.
# NOTE(review): by_name=True presumably restricts loading to layers whose
# names match those in the file (Keras behaviour) -- confirm.
# NOTE(review): `weights` holds whatever load_weights returns, which may
# well be None -- confirm before relying on it.
weights = char_model.load_weights(os.path.join(experiment_base_dirpath,
                                               cv_dirname,
                                               "models/best_model.h5"),
                                  by_name=True)

# Load resources: character vocabulary and pretrained word embeddings.
# The padding ("<0>") and unknown ("<?>") entries are re-registered after
# loading the vocabulary file.
char_vocabulary = DataTools.Vocabulary()
char_vocabulary.load(
    os.path.join(experiment_base_dirpath, "char_vocabulary.txt"))
char_vocabulary.set_padding(char_vocabulary.get_index("<0>"))
char_vocabulary.set_unknown(char_vocabulary.get_index("<?>"))
print(char_vocabulary)

# NOTE(review): the embedding files are hard-coded to the top-100000
# vocabulary -- confirm this matches the vocabulary size the loaded model
# was trained with.
word_embeddings = DataTools.Embedding()
word_embeddings.load(
    "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-100000_W.npy",
    "../res/embeddings/amazon_review_corpus_en_100D_advanced_top-100000_vocab.txt"
)
# Re-register the padding and unknown entries on the loaded vocabulary.
word_embeddings.vocabulary.set_padding(
    word_embeddings.vocabulary.get_index("<pad>"))
word_embeddings.vocabulary.set_unknown(
    word_embeddings.vocabulary.get_index("<UNK>"))