def test_relation_tags():
    """Relation labels: typed labels on (possibly overlapping) relations."""
    # tag two relations; the second relation gets two labels of
    # different types ("rel" and "syntactic")
    sentence = Sentence("Humboldt Universität zu Berlin is located in Berlin .")
    Relation(sentence[0:4], sentence[7:8]).add_label("rel", "located in")
    Relation(sentence[0:2], sentence[3:4]).add_label("rel", "university of")
    Relation(sentence[0:2], sentence[3:4]).add_label("syntactic", "apposition")

    # two "rel" labels, returned in creation order
    rel_labels = sentence.get_labels("rel")
    assert len(rel_labels) == 2
    assert rel_labels[0].value == "located in"
    assert rel_labels[1].value == "university of"

    # exactly one "syntactic" label
    assert len(sentence.get_labels("syntactic")) == 1

    # two distinct relations: one carries one label, the other two
    relations = sentence.get_relations("rel")
    assert len(relations) == 2
    assert len(relations[0].labels) == 1
    assert len(relations[1].labels) == 2
def test_token_tags():
    """Token-level labels: add, query by type, and remove per-token tags."""
    # example sentence
    sentence = Sentence("I love Berlin")

    # set 4 labels on 3 tokens ('love' is tagged twice: pos + sentiment)
    sentence[1].add_label("pos", "verb")
    sentence[1].add_label("sentiment", "positive")
    sentence[2].add_label("pos", "proper noun")
    sentence[0].add_label("pos", "pronoun")

    # three POS labels, returned in token order, with correct text/values
    labels: List[Label] = sentence.get_labels("pos")
    assert 3 == len(labels)
    assert "I" == labels[0].data_point.text
    assert "pronoun" == labels[0].value
    assert "love" == labels[1].data_point.text
    assert "verb" == labels[1].value
    assert "Berlin" == labels[2].data_point.text
    assert "proper noun" == labels[2].value

    # exactly one SENTIMENT label with correct text and value
    labels = sentence.get_labels("sentiment")
    assert 1 == len(labels)
    assert "love" == labels[0].data_point.text
    assert "positive" == labels[0].value

    # check that all tokens are correctly labeled
    assert 3 == len(sentence)
    assert "I" == sentence[0].text
    assert "love" == sentence[1].text
    assert "Berlin" == sentence[2].text
    assert 1 == len(sentence[0].get_labels("pos"))
    assert 1 == len(sentence[1].get_labels("pos"))
    assert 2 == len(sentence[1].labels)
    assert 1 == len(sentence[2].get_labels("pos"))
    assert "verb" == sentence[1].get_label("pos").value
    assert "positive" == sentence[1].get_label("sentiment").value

    # remove the pos label from the last word
    sentence[2].remove_labels("pos")

    # there should be 2 POS labels left
    labels = sentence.get_labels("pos")
    assert 2 == len(labels)
    assert 1 == len(sentence[0].get_labels("pos"))
    assert 1 == len(sentence[1].get_labels("pos"))
    assert 2 == len(sentence[1].labels)
    assert 0 == len(sentence[2].get_labels("pos"))

    # removing at sentence level drops all remaining pos tags
    # (removed a leftover debug print() here)
    sentence.remove_labels("pos")
    assert 0 == len(sentence.get_labels("pos"))
    assert 1 == len(sentence.get_labels("sentiment"))
    assert 1 == len(sentence.labels)
    assert 0 == len(sentence[0].get_labels("pos"))
    assert 0 == len(sentence[1].get_labels("pos"))
    assert 0 == len(sentence[2].get_labels("pos"))
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    """Train a RelationExtractor on a tiny CoNLL-U Plus corpus, reload it
    from disk, and predict a relation on a hand-built sentence."""
    # train/dev/test all reuse the same tiny file: this exercises the
    # pipeline end-to-end rather than measuring generalization
    corpus = ColumnCorpus(
        data_folder=tasks_base_path / "conllu",
        train_file="train.conllup",
        dev_file="train.conllup",
        test_file="train.conllup",
        column_format={
            1: "text",
            2: "pos",
            3: "ner"
        },
    )
    relation_label_dict = corpus.make_label_dictionary(label_type="relation")

    embeddings = TransformerWordEmbeddings()

    model: RelationExtractor = RelationExtractor(
        embeddings=embeddings,
        label_dictionary=relation_label_dict,
        label_type="relation",
        entity_label_type="ner",
        train_on_gold_pairs_only=True,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(model, corpus)

    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=3,
        shuffle=False,
    )

    # free the training objects before loading the saved model
    del trainer, model, relation_label_dict, corpus

    loaded_model: RelationExtractor = RelationExtractor.load(
        results_base_path / "final-model.pt")
    # at prediction time, consider all candidate pairs, not only gold ones
    loaded_model.train_on_gold_pairs_only = False

    sentence = Sentence(
        ["Apple", "was", "founded", "by", "Steve", "Jobs", "."])
    # gold NER tags are the required input for relation extraction
    for token, tag in zip(sentence.tokens, ["B-ORG", "O", "O", "O", "B-PER", "I-PER", "O"]):
        token.set_label("ner", tag)

    loaded_model.predict(sentence)

    assert "founded_by" == sentence.get_labels("relation")[0].value

    # loaded_model.predict([sentence, sentence_empty])
    # loaded_model.predict([sentence_empty])

    del loaded_model
def test_sequence_tagger_transformer_finetune(results_base_path, tasks_base_path): flair.set_seed(123) # load dataset corpus: Corpus = ColumnCorpus( data_folder=tasks_base_path / "trivial" / "trivial_bioes", column_format={ 0: "text", 1: "ner" }, ) tag_dictionary = corpus.make_label_dictionary("ner") # tagger without CRF tagger: SequenceTagger = SequenceTagger( hidden_size=64, embeddings=TransformerWordEmbeddings("distilbert-base-uncased", fine_tune=True), tag_dictionary=tag_dictionary, tag_type="ner", use_crf=False, use_rnn=False, reproject_embeddings=False, ) # train trainer = ModelTrainer(tagger, corpus) trainer.fine_tune( results_base_path, mini_batch_size=2, max_epochs=10, shuffle=True, learning_rate=0.5e-4, ) loaded_model: SequenceTagger = SequenceTagger.load(results_base_path / "final-model.pt") sentence = Sentence("this is New York") sentence_empty = Sentence(" ") loaded_model.predict(sentence) loaded_model.predict([sentence, sentence_empty]) loaded_model.predict([sentence_empty]) # check if loaded model can predict entities = [span.span.text for span in sentence.get_labels("ner")] assert "New York" in entities # check if loaded model successfully fit the training data result: Result = loaded_model.evaluate(corpus.test, gold_label_type="ner") assert result.classification_report["micro avg"]["f1-score"] == 1.0 del loaded_model
def test_mixed_labels():
    """Sentence-, token-, and span-level labels coexist on one sentence."""
    sentence = Sentence("I love New York")

    # one sentence-level sentiment label
    sentence.add_label("sentiment", "positive")

    # four token-level part-of-speech labels
    for token_idx, pos_value in ((1, "verb"), (2, "proper noun"), (3, "proper noun"), (0, "pronoun")):
        sentence[token_idx].add_label("pos", pos_value)

    # one span-level NER label
    sentence[2:4].add_label("ner", "City")

    # six labels in total, grouped correctly by type
    assert len(sentence.labels) == 6
    assert len(sentence.get_labels("pos")) == 4
    assert len(sentence.get_labels("sentiment")) == 1
    assert len(sentence.get_labels("ner")) == 1
def test_span_tags():
    """Span labels: multiple labels per span, lookup, and removal."""
    # tag two spans; "Humboldt Universität zu Berlin" is tagged twice
    sentence = Sentence("Humboldt Universität zu Berlin is located in Berlin .")
    sentence[0:4].add_label("ner", "Organization")
    sentence[0:4].add_label("ner", "University")
    sentence[7:8].add_label("ner", "City")

    # three NER labels with the expected texts and values
    ner_labels = sentence.get_labels("ner")
    assert len(ner_labels) == 3
    expected = [
        ("Humboldt Universität zu Berlin", "Organization"),
        ("Humboldt Universität zu Berlin", "University"),
        ("Berlin", "City"),
    ]
    for label, (span_text, label_value) in zip(ner_labels, expected):
        assert label.data_point.text == span_text
        assert label.value == label_value

    # two spans; the first carries both of its labels
    ner_spans = sentence.get_spans("ner")
    assert len(ner_spans) == 2
    assert ner_spans[0].text == "Humboldt Universität zu Berlin"
    assert len(ner_spans[0].get_labels("ner")) == 2
    assert ner_spans[1].text == "Berlin"
    assert ner_spans[1].get_label("ner").value == "City"

    # deleting the NER tags of "Humboldt Universität zu Berlin" ...
    sentence[0:4].remove_labels("ner")

    # ... leaves exactly one NER label ...
    ner_labels = sentence.get_labels("ner")
    assert len(ner_labels) == 1
    assert ner_labels[0].data_point.text == "Berlin"
    assert ner_labels[0].value == "City"

    # ... and exactly one NER span
    ner_spans = sentence.get_spans("ner")
    assert len(ner_spans) == 1
    assert ner_spans[0].text == "Berlin"
    assert ner_spans[0].get_label("ner").value == "City"
def split_to_spans(s: "Sentence"):
    """Split the original text of *s* into ``(text, tag)`` tuples.

    Returns a list of ``(text, value)`` pairs covering the whole original
    text in order: NER-tagged entity spans carry their label value, and
    any untagged text between/around them carries ``None``.
    """
    orig = s.to_original_text()
    last_idx = 0
    spans = []
    tagged_ents = s.get_labels("ner")
    for ent in tagged_ents:
        # emit untagged text between the previous entity and this one
        if last_idx != ent.span.start_pos:
            spans.append((orig[last_idx:ent.span.start_pos], None))
        spans.append((ent.span.text, ent.value))
        assert ent.span.end_pos is not None
        last_idx = ent.span.end_pos
    # BUGFIX: was `last_idx < len(orig) - 1`, which silently dropped a
    # trailing untagged segment of exactly one character (e.g. final ".")
    if last_idx < len(orig):
        spans.append((orig[last_idx:], None))
    return spans
def predict(query: str):
    """Run the contact-type model on *query* and return a result dict.

    Returns ``{'success': True, 'data': [...]}`` on success, otherwise a
    ``{'success': False, 'message': ...}`` payload describing the failure.
    """
    # reject missing/empty queries before touching the model
    if query is None or len(query) == 0:
        return {'success': False, 'message': 'query is required'}
    try:
        ContactTypeModel.load_model()
        tagged = Sentence(query)
        ContactTypeModel._model.predict(tagged)
        payload = [label.to_dict() for label in tagged.get_labels()]
        return {'success': True, 'data': payload}
    except RuntimeError as err:
        logging.error(err, exc_info=True)
        return {'success': False, 'message': "Runtime Error: {0}".format(err)}
    except Exception as err:
        # top-level boundary: log full traceback, return a generic message
        logging.error(err, exc_info=True)
        return {'success': False, 'message': 'exception occurred'}
# NOTE(review): "calcualte" is a typo, but the name is kept since callers use it.
def calcualte_sentiment_for_given_restaurant_year(db_name, classifier, restaurant, year):
    """Score each stored review of *restaurant* in *year* as +1/-1.

    Fetches the reviews from the database, runs *classifier* on each
    non-empty review, and maps a 'NEGATIVE' prediction to -1 and any
    other prediction to +1. Returns the list of scores.
    """
    reviews = retrieve_reviews_from_db(db_name, restaurant, year)
    sentiment_scores = []
    for review in reviews:
        # skip empty review texts
        if review == '':
            continue
        tagged = Sentence(review)
        classifier.predict(tagged)
        predicted = tagged.get_labels()[0].value
        sentiment_scores.append(-1 if predicted == 'NEGATIVE' else 1)
    return sentiment_scores
def test_sentence_labels():
    """Sentence-level labels: add, count by type, and remove by type."""
    sentence = Sentence("I love Berlin")

    sentence.add_label("sentiment", "positive")
    sentence.add_label("topic", "travelling")
    assert len(sentence.labels) == 2
    assert len(sentence.get_labels("sentiment")) == 1
    assert len(sentence.get_labels("topic")) == 1

    # a duplicate topic value is kept as a separate label
    sentence.add_label("topic", "travelling")
    assert len(sentence.labels) == 3
    assert len(sentence.get_labels("sentiment")) == 1
    assert len(sentence.get_labels("topic")) == 2

    # removing by type drops every "topic" label at once
    sentence.remove_labels("topic")
    assert len(sentence.labels) == 1
    assert len(sentence.get_labels("sentiment")) == 1
    assert len(sentence.get_labels("topic")) == 0
def test_spans():
    """Exercise span conversion from token-level BIOES/BIO "ner" tags,
    including malformed tag sequences and tag confidence scores."""
    # bioes tags
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "B-ORG")
    sentence[1].add_tag("ner", "E-ORG")
    sentence[5].add_tag("ner", "S-LOC")
    sentence._convert_span_labels("ner")
    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert 2 == len(spans)
    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Berlin" == spans[1].span.text
    assert "LOC" == spans[1].value

    # bio tags
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "B-ORG")
    sentence[1].add_tag("ner", "I-ORG")
    sentence[5].add_tag("ner", "B-LOC")
    sentence._convert_span_labels("ner")
    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Berlin" == spans[1].span.text
    assert "LOC" == spans[1].value

    # broken tags: sequences that start mid-entity still yield the spans
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "I-ORG")
    sentence[1].add_tag("ner", "E-ORG")
    sentence[5].add_tag("ner", "I-LOC")
    sentence._convert_span_labels("ner")
    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Berlin" == spans[1].span.text
    assert "LOC" == spans[1].value

    # all tags: every token tagged, some with plain (non-BIOES) values
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "I-ORG")
    sentence[1].add_tag("ner", "E-ORG")
    sentence[2].add_tag("ner", "aux")
    sentence[3].add_tag("ner", "verb")
    sentence[4].add_tag("ner", "preposition")
    sentence[5].add_tag("ner", "I-LOC")
    sentence._convert_span_labels("ner")
    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert 5 == len(spans)
    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Berlin" == spans[4].span.text
    assert "LOC" == spans[4].value

    # all weird tags: mixed/contradictory prefixes on adjacent tokens
    sentence = Sentence("Zalando Research is located in Berlin .")
    sentence[0].add_tag("ner", "I-ORG")
    sentence[1].add_tag("ner", "S-LOC")
    sentence[2].add_tag("ner", "aux")
    sentence[3].add_tag("ner", "B-relation")
    sentence[4].add_tag("ner", "E-preposition")
    sentence[5].add_tag("ner", "S-LOC")
    sentence._convert_span_labels("ner")
    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert 5 == len(spans)
    assert "Zalando" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert "Research" == spans[1].span.text
    assert "LOC" == spans[1].value
    assert "located in" == spans[3].span.text
    assert "relation" == spans[3].value

    # adjacent entities of the same type stay separate (S- then B-/E-)
    sentence = Sentence(
        "A woman was charged on Friday with terrorist offences after three "
        "Irish Republican Army mortar bombs were found in a Belfast house , "
        "police said . "
    )
    sentence[11].add_tag("ner", "S-MISC")
    sentence[12].add_tag("ner", "B-MISC")
    sentence[13].add_tag("ner", "E-MISC")
    sentence._convert_span_labels("ner")
    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert 2 == len(spans)
    assert "Irish" == spans[0].span.text
    assert "Republican Army" == spans[1].span.text

    sentence = Sentence("Zalando Research is located in Berlin .")

    # tags with confidence: the span score averages its token scores
    sentence[0].add_tag("ner", "B-ORG", 1.0)
    sentence[1].add_tag("ner", "E-ORG", 0.9)
    sentence[5].add_tag("ner", "S-LOC", 0.5)
    sentence._convert_span_labels("ner")
    spans: List[SpanLabel] = sentence.get_labels("ner")
    assert 2 == len(spans)
    assert "Zalando Research" == spans[0].span.text
    assert "ORG" == spans[0].value
    assert 0.95 == spans[0].score
    assert "Berlin" == spans[1].span.text
    assert "LOC" == spans[1].value
    assert 0.5 == spans[1].score