Example No. 1
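The snippets below are shown without their imports. As a rough guide, they assume a header along these lines (module paths as in recent Flair releases; a few snippets additionally use older, since-removed API such as Token.add_tag, SpanLabel and Sentence._convert_span_labels):

from typing import List

import flair
from flair.data import Corpus, Label, Relation, Sentence, Span
from flair.datasets import ColumnCorpus
from flair.embeddings import TransformerWordEmbeddings
from flair.models import RelationExtractor, SequenceTagger
from flair.trainers import ModelTrainer
from flair.training_utils import Result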
def test_relation_tags():
    # set 3 labels for 2 relations (the second relation is labeled twice, with different label types)
    sentence = Sentence(
        "Humboldt Universität zu Berlin is located in Berlin .")

    # create three relation labels: two of type "rel" and one of type "syntactic"
    Relation(sentence[0:4], sentence[7:8]).add_label("rel", "located in")
    Relation(sentence[0:2], sentence[3:4]).add_label("rel", "university of")
    Relation(sentence[0:2], sentence[3:4]).add_label("syntactic", "apposition")

    # there should be two relation labels
    labels: List[Label] = sentence.get_labels("rel")
    assert 2 == len(labels)
    assert "located in" == labels[0].value
    assert "university of" == labels[1].value

    # there should be one syntactic label
    labels: List[Label] = sentence.get_labels("syntactic")
    assert 1 == len(labels)

    # there should be two relations: the first with one label, the second with two
    relations: List[Relation] = sentence.get_relations("rel")
    assert 2 == len(relations)
    assert 1 == len(relations[0].labels)
    assert 2 == len(relations[1].labels)
Example No. 2
def test_token_tags():
    # example sentence
    sentence = Sentence("I love Berlin")

    # set 4 labels for 3 tokens ('love' is tagged twice)
    sentence[1].add_label("pos", "verb")
    sentence[1].add_label("sentiment", "positive")
    sentence[2].add_label("pos", "proper noun")
    sentence[0].add_label("pos", "pronoun")

    # check if there are three POS labels with correct text and values
    labels: List[Label] = sentence.get_labels("pos")
    assert 3 == len(labels)
    assert "I" == labels[0].data_point.text
    assert "pronoun" == labels[0].value
    assert "love" == labels[1].data_point.text
    assert "verb" == labels[1].value
    assert "Berlin" == labels[2].data_point.text
    assert "proper noun" == labels[2].value

    # check if there is one SENTIMENT label with the correct text and value
    labels: List[Label] = sentence.get_labels("sentiment")
    assert 1 == len(labels)
    assert "love" == labels[0].data_point.text
    assert "positive" == labels[0].value

    # check if all tokens are correctly labeled
    assert 3 == len(sentence)
    assert "I" == sentence[0].text
    assert "love" == sentence[1].text
    assert "Berlin" == sentence[2].text
    assert 1 == len(sentence[0].get_labels("pos"))
    assert 1 == len(sentence[1].get_labels("pos"))
    assert 2 == len(sentence[1].labels)
    assert 1 == len(sentence[2].get_labels("pos"))

    assert "verb" == sentence[1].get_label("pos").value
    assert "positive" == sentence[1].get_label("sentiment").value

    # remove the pos label from the last word
    sentence[2].remove_labels("pos")
    # there should be 2 POS labels left
    labels: List[Label] = sentence.get_labels("pos")
    assert 2 == len(labels)
    assert 1 == len(sentence[0].get_labels("pos"))
    assert 1 == len(sentence[1].get_labels("pos"))
    assert 2 == len(sentence[1].labels)
    assert 0 == len(sentence[2].get_labels("pos"))

    # now remove all pos tags
    sentence.remove_labels("pos")
    print(sentence[0].get_labels("pos"))
    assert 0 == len(sentence.get_labels("pos"))
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 1 == len(sentence.labels)

    assert 0 == len(sentence[0].get_labels("pos"))
    assert 0 == len(sentence[1].get_labels("pos"))
    assert 0 == len(sentence[2].get_labels("pos"))
Example No. 3
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = ColumnCorpus(
        data_folder=tasks_base_path / "conllu",
        train_file="train.conllup",
        dev_file="train.conllup",
        test_file="train.conllup",
        column_format={
            1: "text",
            2: "pos",
            3: "ner"
        },
    )

    relation_label_dict = corpus.make_label_dictionary(label_type="relation")

    embeddings = TransformerWordEmbeddings()

    model: RelationExtractor = RelationExtractor(
        embeddings=embeddings,
        label_dictionary=relation_label_dict,
        label_type="relation",
        entity_label_type="ner",
        train_on_gold_pairs_only=True,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(model, corpus)

    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=3,
        shuffle=False,
    )

    del trainer, model, relation_label_dict, corpus

    loaded_model: RelationExtractor = RelationExtractor.load(
        results_base_path / "final-model.pt")
    loaded_model.train_on_gold_pairs_only = False

    sentence = Sentence(
        ["Apple", "was", "founded", "by", "Steve", "Jobs", "."])
    for token, tag in zip(sentence.tokens,
                          ["B-ORG", "O", "O", "O", "B-PER", "I-PER", "O"]):
        token.set_label("ner", tag)

    loaded_model.predict(sentence)

    assert "founded_by" == sentence.get_labels("relation")[0].value

    # loaded_model.predict([sentence, sentence_empty])
    # loaded_model.predict([sentence_empty])

    del loaded_model
Example No. 4
def test_sequence_tagger_transformer_finetune(results_base_path,
                                              tasks_base_path):
    flair.set_seed(123)

    # load dataset
    corpus: Corpus = ColumnCorpus(
        data_folder=tasks_base_path / "trivial" / "trivial_bioes",
        column_format={
            0: "text",
            1: "ner"
        },
    )
    tag_dictionary = corpus.make_label_dictionary("ner")

    # tagger without CRF
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=TransformerWordEmbeddings("distilbert-base-uncased",
                                             fine_tune=True),
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # train
    trainer = ModelTrainer(tagger, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-4,
    )

    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path /
                                                       "final-model.pt")

    sentence = Sentence("this is New York")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # check if the loaded model can predict
    entities = [span.span.text for span in sentence.get_labels("ner")]
    assert "New York" in entities

    # check if the loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="ner")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example No. 5
def test_mixed_labels():
    # example sentence
    sentence = Sentence("I love New York")

    # has sentiment value
    sentence.add_label("sentiment", "positive")

    # has 4 part of speech tags
    sentence[1].add_label("pos", "verb")
    sentence[2].add_label("pos", "proper noun")
    sentence[3].add_label("pos", "proper noun")
    sentence[0].add_label("pos", "pronoun")

    # has 1 NER tag
    sentence[2:4].add_label("ner", "City")

    # should be in total 6 labels
    assert 6 == len(sentence.labels)
    assert 4 == len(sentence.get_labels("pos"))
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 1 == len(sentence.get_labels("ner"))
Example No. 6
def test_span_tags():

    # set 3 labels for 2 spans (HU is tagged twice)
    sentence = Sentence(
        "Humboldt Universität zu Berlin is located in Berlin .")
    sentence[0:4].add_label("ner", "Organization")
    sentence[0:4].add_label("ner", "University")
    sentence[7:8].add_label("ner", "City")

    # check if there are three labels with correct text and values
    labels: List[Label] = sentence.get_labels("ner")
    assert 3 == len(labels)
    assert "Humboldt Universität zu Berlin" == labels[0].data_point.text
    assert "Organization" == labels[0].value
    assert "Humboldt Universität zu Berlin" == labels[1].data_point.text
    assert "University" == labels[1].value
    assert "Berlin" == labels[2].data_point.text
    assert "City" == labels[2].value

    # check if there are two spans with correct text and values
    spans: List[Span] = sentence.get_spans("ner")
    assert 2 == len(spans)
    assert "Humboldt Universität zu Berlin" == spans[0].text
    assert 2 == len(spans[0].get_labels("ner"))
    assert "Berlin" == spans[1].text
    assert "City" == spans[1].get_label("ner").value

    # now delete the NER tags of "Humboldt Universität zu Berlin"
    sentence[0:4].remove_labels("ner")
    # should be only one NER label left
    labels: List[Label] = sentence.get_labels("ner")
    assert 1 == len(labels)
    assert "Berlin" == labels[0].data_point.text
    assert "City" == labels[0].value
    # and only one NER span
    spans: List[Span] = sentence.get_spans("ner")
    assert 1 == len(spans)
    assert "Berlin" == spans[0].text
    assert "City" == spans[0].get_label("ner").value
Example No. 7
def split_to_spans(s: Sentence):
    """Split a tagged sentence into (text, tag) tuples that cover the original text.

    Untagged stretches are returned with tag ``None``.
    """
    orig = s.to_original_text()
    last_idx = 0
    spans = []
    tagged_ents = s.get_labels("ner")
    for ent in tagged_ents:
        if last_idx != ent.span.start_pos:
            # untagged text between the previous entity and this one
            spans.append((orig[last_idx:ent.span.start_pos], None))
        spans.append((ent.span.text, ent.value))
        assert ent.span.end_pos is not None
        last_idx = ent.span.end_pos
    if last_idx < len(orig):
        # append any trailing untagged text
        spans.append((orig[last_idx:], None))
    return spans
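A minimal usage sketch, assuming a pretrained tagger and a Flair version whose NER labels expose a .span attribute (as the function above expects); the model name and sentence are illustrative:

tagger = SequenceTagger.load("ner")  # pretrained 4-class CoNLL-03 NER model
s = Sentence("George Washington went to Washington .")
tagger.predict(s)
# alternating tagged/untagged chunks, e.g.
# [('George Washington', 'PER'), (' went to ', None), ('Washington', 'LOC'), (' .', None)]
print(split_to_spans(s))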
Example No. 8
    def predict(query: str):

        if query is None or len(query) == 0:
            return {'success': False, 'message': 'query is required'}

        try:
            ContactTypeModel.load_model()
            sentence = Sentence(query)
            ContactTypeModel._model.predict(sentence)
            return {'success': True, 'data': [label.to_dict() for label in sentence.get_labels()]}
        except RuntimeError as e:
            logging.error(e, exc_info=True)
            return {'success': False, 'message': "Runtime Error: {0}".format(e)}
        except Exception as e:
            logging.error(e, exc_info=True)
            return {'success': False, 'message': 'exception occurred'}
Example No. 9
def calculate_sentiment_for_given_restaurant_year(db_name, classifier,
                                                  restaurant, year):
    # score each review of the given restaurant and year as +1 (positive) or -1 (negative)
    reviews = retrieve_reviews_from_db(db_name, restaurant, year)
    sentiment_scores = []
    for review in reviews:
        if review == '':
            continue
        sentence = Sentence(review)
        classifier.predict(sentence)
        sentiment = sentence.get_labels()[0].value
        sentiment_score = -1 if sentiment == 'NEGATIVE' else 1
        sentiment_scores.append(sentiment_score)
    return sentiment_scores
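A caller might reduce the per-review scores to a single yearly polarity. A minimal sketch, with a hypothetical database path and restaurant name, assuming classifier is a loaded Flair sentiment model such as TextClassifier.load("en-sentiment"):

scores = calculate_sentiment_for_given_restaurant_year(
    "reviews.db", classifier, "Some Restaurant", 2019)  # hypothetical arguments
overall = sum(scores) / len(scores) if scores else 0.0  # average polarity in [-1, 1]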
Example No. 10
def test_sentence_labels():
    # example sentence
    sentence = Sentence("I love Berlin")
    sentence.add_label("sentiment", "positive")
    sentence.add_label("topic", "travelling")

    assert 2 == len(sentence.labels)
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 1 == len(sentence.get_labels("topic"))

    # add another topic label
    sentence.add_label("topic", "travelling")
    assert 3 == len(sentence.labels)
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 2 == len(sentence.get_labels("topic"))

    sentence.remove_labels("topic")
    assert 1 == len(sentence.labels)
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 0 == len(sentence.get_labels("topic"))
Example No. 11
def test_spans():
    # bioes tags
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "B-ORG")
    sentence[1].add_tag("ner", "E-ORG")
    sentence[5].add_tag("ner", "S-LOC")
    sentence._convert_span_labels("ner")

    spans: List[SpanLabel] = sentence.get_labels("ner")

    assert 2 == len(spans)
    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Berlin" == spans[1].span.text
    assert "LOC" == spans[1].value

    # bio tags
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "B-ORG")
    sentence[1].add_tag("ner", "I-ORG")
    sentence[5].add_tag("ner", "B-LOC")
    sentence._convert_span_labels("ner")

    spans: List[SpanLabel] = sentence.get_labels("ner")

    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Berlin" == spans[1].span.text
    assert "LOC" == spans[1].value

    # broken tags
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "I-ORG")
    sentence[1].add_tag("ner", "E-ORG")
    sentence[5].add_tag("ner", "I-LOC")
    sentence._convert_span_labels("ner")

    spans: List[SpanLabel] = sentence.get_labels("ner")

    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Berlin" == spans[1].span.text
    assert "LOC" == spans[1].value

    # all tags
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "I-ORG")
    sentence[1].add_tag("ner", "E-ORG")
    sentence[2].add_tag("ner", "aux")
    sentence[3].add_tag("ner", "verb")
    sentence[4].add_tag("ner", "preposition")
    sentence[5].add_tag("ner", "I-LOC")
    sentence._convert_span_labels("ner")

    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert 5 == len(spans)
    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Berlin" == spans[4].span.text
    assert "LOC" == spans[4].value

    # all weird tags
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "I-ORG")
    sentence[1].add_tag("ner", "S-LOC")
    sentence[2].add_tag("ner", "aux")
    sentence[3].add_tag("ner", "B-relation")
    sentence[4].add_tag("ner", "E-preposition")
    sentence[5].add_tag("ner", "S-LOC")
    sentence._convert_span_labels("ner")

    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert 5 == len(spans)
    assert "Zalando" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Research" == spans[1].span.text
    assert "LOC" == spans[1].value
    assert "located in" == spans[3].span.text
    assert "relation" == spans[3].value

    sentence = Sentence(
        "A woman was charged on Friday with terrorist offences after three "
        "Irish Republican Army mortar bombs were found in a Belfast house , "
        "police said . "
    )
    sentence[11].add_tag("ner", "S-MISC")
    sentence[12].add_tag("ner", "B-MISC")
    sentence[13].add_tag("ner", "E-MISC")
    sentence._convert_span_labels("ner")

    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert 2 == len(spans)
    assert "Irish" == spans[0].span.text
    assert "Republican Army" == spans[1].span.text

    sentence = Sentence("Zalando Research is located in Berlin .")

    # tags with confidence
    sentence[0].add_tag("ner", "B-ORG", 1.0)
    sentence[1].add_tag("ner", "E-ORG", 0.9)
    sentence[5].add_tag("ner", "S-LOC", 0.5)
    sentence._convert_span_labels("ner")

    spans: List[SpanLabel] = sentence.get_labels("ner")

    assert 2 == len(spans)
    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert 0.95 == spans[0].score  # span score here is the mean of the token confidences: (1.0 + 0.9) / 2

    assert "Berlin" == spans[1].span.text
    assert "LOC" == spans[1].value
    assert 0.5 == spans[1].score