Example no. 1
    def extract_ner_text(self, text):
        # replace separators with ' va ' before stripping the remaining punctuation
        text = re.sub(r'[,\.\()]', ' va ', text)
        text = text.translate(str.maketrans('', '', string.punctuation))

        sentence = Sentence(str(text))
        self.model.predict(sentence)

        ner = []
        for span in sentence.get_spans('ner'):
            ner.append(span.text)
        ner = list(dict.fromkeys(ner))  # deduplicate, keeping first occurrences

        filtered = list(ner)  # copy so the original list is left untouched
        for word in self.stopwords:
            if word in text:
                filtered.append(word.strip())

        # blank out any entry that is a substring of a longer entry
        filtered.sort(key=len)
        for i in range(len(filtered) - 1):
            for j in range(i + 1, len(filtered)):
                if filtered[i] in filtered[j]:
                    filtered[i] = ''
        filtered = [item for item in filtered if item != '']

        # pair each surviving entry with its known entity type, or 'Ner' as a fallback
        clean_filtered = []
        for item in filtered:
            try:
                entity_type = self.entity[self.name.index(item + ' ')]
            except ValueError:
                entity_type = 'Ner'
            clean_filtered.append([item, entity_type])
        return clean_filtered
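A minimal usage sketch for the method above, assuming a hypothetical host class that holds the Flair tagger plus the `stopwords`, `name`, and `entity` attributes the method relies on (the class name and sample data are illustrative, not from the original code):

import re
import string

from flair.data import Sentence
from flair.models import SequenceTagger

class NerExtractor:
    """Hypothetical host class for extract_ner_text above."""
    def __init__(self, names, entities, stopwords):
        self.model = SequenceTagger.load('ner')   # pretrained Flair NER model
        self.name = names        # known surface forms, each stored with a trailing space
        self.entity = entities   # entity type aligned index-by-index with self.name
        self.stopwords = stopwords

    # extract_ner_text would be defined here, exactly as above

extractor = NerExtractor(names=['Hanoi '], entities=['LOC'], stopwords=[])
# extractor.extract_ner_text('I flew to Hanoi.')  ->  [['Hanoi', 'LOC']]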
Example no. 2
def extract_entities():
    if not request.json or 'message' not in request.json:
        abort(400)
    query = request.json['message']
    sentence = Sentence(query, use_tokenizer=True)
    tagger.predict(sentence)

    entities = []
    tags = []
    scores = []
    start_positions = []
    end_positions = []

    for en in sentence.get_spans('ner'):
        entities.append([str(token.text) for token in en.tokens])
        tags.append(en.tag)
        scores.append(str(round(en.score, 2)))
        start_positions.append(int(en.start_pos))
        end_positions.append(int(en.end_pos))

    response = {
        'entities': entities,
        'tags': tags,
        'scores': scores,
        'start_positions': start_positions,
        'end_positions': end_positions
    }
    return jsonify(response), 200
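To exercise the endpoint above, a request like the following should work; the route path /extract_entities and the port are assumptions, since the route decorator is not shown in the excerpt:

import requests

resp = requests.post(
    "http://localhost:5000/extract_entities",  # hypothetical route and port
    json={"message": "George Washington went to Washington."},
)
print(resp.json())
# e.g. {'entities': [['George', 'Washington'], ['Washington']],
#       'tags': ['PER', 'LOC'], 'scores': ['1.0', '0.99'], ...}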
Example no. 3
    def train(corpus):
        """
        Train a Flair NER model.
        :param corpus: Corpus object with train/dev/test splits
        :return: None
        """
        print(corpus)

        # 2. what tag do we want to predict?
        tag_type = "ner"

        # 3. make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary.idx2item)

        # 4. initialize embeddings
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings("glove"),
            FlairEmbeddings("news-forward"),
            FlairEmbeddings("news-backward"),
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

        # 5. initialize sequence tagger

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True,
        )

        # 6. initialize trainer

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        checkpoint = "resources/taggers/presidio-ner/checkpoint.pt"
        # trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)
        trainer.train(
            "resources/taggers/presidio-ner",
            learning_rate=0.1,
            mini_batch_size=32,
            max_epochs=150,
            checkpoint=True,
        )

        sentence = Sentence("I am from Jerusalem")
        # run NER over sentence
        tagger.predict(sentence)

        print(sentence)
        print("The following NER tags are found:")

        # iterate over entities and print
        for entity in sentence.get_spans("ner"):
            print(entity)
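The corpus argument is a standard Flair Corpus; a sketch of how one might build it from CoNLL-style column files (the folder and file names are placeholders):

from flair.data import Corpus
from flair.datasets import ColumnCorpus

# column 0 holds the token, column 1 the NER tag (BIO/BIOES scheme)
columns = {0: "text", 1: "ner"}
corpus: Corpus = ColumnCorpus(
    "resources/data",        # placeholder data folder
    columns,
    train_file="train.txt",
    dev_file="dev.txt",
    test_file="test.txt",
)
train(corpus)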
Example no. 4
def get_top_class(tagger, row):
    try:
        sentence = Sentence(row)
        tagger.predict(sentence)
        for entity in sentence.get_spans("ner"):
            # ner_entities is assumed to be a module-level defaultdict(list)
            ner_entities[entity.tag].append(entity.text)
    except Exception:
        return "problem"
Example no. 5
def get_phrases(sentence):
    sentence = Sentence(sentence)
    ner_tagger.predict(sentence)
    entities = []
    for item in sentence.get_spans('ner'):
        entities.append(item.text.split())
    relations = []
    return [relations, entities]
Example no. 6
def flair_pos(language, text):
    sentence = Sentence(text)
    if language == "french":
        tagger = SequenceTagger.load("pos-multi")
        tagger.predict(sentence)
        annotated = sentence.to_tagged_string()
        print(annotated)
        # walk the tagged string and record, per POS tag, the order in which it occurs
        temp = dict()
        count = 1
        while True:
            index = annotated.find("<")
            end_index = annotated.find(">")
            if index == -1 or end_index == -1:
                break
            pos = annotated[index:end_index + 1]
            annotated = annotated[end_index + 1:]
            temp.setdefault(pos, []).append(count)
            count += 1
    else:
        tagger = SequenceTagger.load('pos')
        tagger.predict(sentence)
        print(sentence)
        print('The following POS tags are found:')
        # map each POS tag to the indices of the tokens that carry it
        temp = dict()
        for word in sentence.get_spans('pos'):
            token_index = word.tokens[0].idx  # use the token index instead of parsing str(word.tokens)
            temp.setdefault(word.tag, []).append(token_index)
    print(temp)

    return temp
Example no. 7
def model_ner_PRG_flair(paragraph, type_question):
    sentence = Sentence(paragraph)
    tagger.predict(sentence)
    list_predictions_data = []
    for entity in sentence.get_spans('ner'):
        if entity.tag in interesting_entities(type_question) and normalize_answer(entity.text):
            list_predictions_data.append(entity.text)
    return list_predictions_data
Example no. 8
def evaluate(tagger: SequenceTagger, content: str) -> Dict[Any, Any]:
    sentence = Sentence(content)
    tagger.predict(sentence)

    entities = [
        asdict(e) for e in map(NerEntity.from_span, sentence.get_spans('ner'))
    ]

    return {"entities": entities}
Example no. 9
def add_ner_predictions():
    for sent in prep_sentences():
        sent["hf"] = hf_tagger(sent["sentence"])

        # careful: the Flair tagger annotates the Sentence object in place
        sentence = Sentence(sent["sentence"])
        flair_tagger.predict(sentence)
        sent["flair"] = sentence.get_spans()

        yield sent
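hf_tagger, flair_tagger, and prep_sentences are defined elsewhere; a plausible setup, with the Hugging Face side built on transformers.pipeline (the sentence source below is a placeholder):

from flair.models import SequenceTagger
from transformers import pipeline

hf_tagger = pipeline("ner", aggregation_strategy="simple")  # default HF NER model
flair_tagger = SequenceTagger.load("ner")

def prep_sentences():
    # placeholder source of sentences to compare
    yield {"sentence": "Barack Obama visited Berlin."}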
Example no. 10
def test_sequence_tagger_transformer_finetune(results_base_path,
                                              tasks_base_path):
    flair.set_seed(123)

    # load dataset
    corpus: Corpus = ColumnCorpus(
        data_folder=tasks_base_path / "trivial" / "trivial_bioes",
        column_format={
            0: "text",
            1: "ner"
        },
    )
    tag_dictionary = corpus.make_label_dictionary("ner")

    # tagger without CRF
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=TransformerWordEmbeddings("distilbert-base-uncased",
                                             fine_tune=True),
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # train
    trainer = ModelTrainer(tagger, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-4,
    )

    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path /
                                                       "final-model.pt")

    sentence = Sentence("this is New York")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # check if loaded model can predict
    entities = [span.text for span in sentence.get_spans("ner")]
    assert "New York" in entities

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="ner")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example no. 11
    def get_entities(self, the_question, model):
        the_sentenced_question = Sentence(the_question)
        model.predict(the_sentenced_question)
        spans = [
            span for span in the_sentenced_question.get_spans('ner')
            if span.tag in ('PER', 'MISC', 'LOC')
        ]
        entities = [
            " ".join(tok.text for tok in span.tokens) for span in spans
        ]
        return entities
Example no. 12
    def predict(self, text):
        sentence = Sentence(text)
        # loading the model on every call is slow; consider caching the tagger
        tagger = SequenceTagger.load('ner')
        tagger.predict(sentence)
        predictions = []
        for entity in sentence.get_spans('ner'):
            tokens = [token.text for token in entity.tokens]
            predictions.append((' '.join(tokens), entity.tag))
        return predictions
Example no. 13
    def analyse(self, text: str, entities: List[str]) -> List[Entity]:
        self.validate_entities(entities)

        sentence = Sentence(text)
        self.model.predict(sentence)

        span_labels = []
        for entity in sentence.get_spans("ner"):
            if entity.tag in entities:
                span_labels.append(Entity(entity.tag, entity.start_pos, entity.end_pos))

        return span_labels
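The Entity type returned here is project-specific; judging by how it is constructed, something like this dataclass would fit (an assumption, not the actual definition):

from dataclasses import dataclass

@dataclass
class Entity:
    entity_type: str
    start_pos: int
    end_pos: int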
Example no. 14
    def get_named_entity_types(self, token):
        sentence = Sentence(token)
        self.tagger.predict(sentence)
        entities = sentence.get_spans('ner')
        label_mapping = {
            'LOC': 'LOCATION',
            'ORG': 'ORGANIZATION',
            'PER': 'PERSON'
        }
        return [
            label_mapping.get(entity.tag, entity.tag) for entity in entities
        ]
Example no. 15
def test_sequence_tagger_with_crf(results_base_path, tasks_base_path):
    flair.set_seed(123)

    # load dataset
    corpus: Corpus = ColumnCorpus(data_folder=tasks_base_path / "trivial" /
                                  "trivial_bioes",
                                  column_format={
                                      0: "text",
                                      1: "ner"
                                  })
    tag_dictionary = corpus.make_label_dictionary("ner")

    # tagger with CRF
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=turian_embeddings,
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=True,
    )

    # train
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)
    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=False,
    )

    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path /
                                                       "final-model.pt")

    sentence = Sentence("this is New York")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # check if loaded model can predict
    entities = [span.text for span in sentence.get_spans('ner')]
    assert "New York" in entities

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type='ner')
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    # clean up results directory
    shutil.rmtree(results_base_path)
    del loaded_model
Example no. 16
def test_span_tags():

    # set 3 labels for 2 spans (HU is tagged twice)
    sentence = Sentence(
        "Humboldt Universität zu Berlin is located in Berlin .")
    sentence[0:4].add_label("ner", "Organization")
    sentence[0:4].add_label("ner", "University")
    sentence[7:8].add_label("ner", "City")

    # check if there are three labels with correct text and values
    labels: List[Label] = sentence.get_labels("ner")
    assert 3 == len(labels)
    assert "Humboldt Universität zu Berlin" == labels[0].data_point.text
    assert "Organization" == labels[0].value
    assert "Humboldt Universität zu Berlin" == labels[1].data_point.text
    assert "University" == labels[1].value
    assert "Berlin" == labels[2].data_point.text
    assert "City" == labels[2].value

    # check if there are two spans with correct text and values
    spans: List[Span] = sentence.get_spans("ner")
    assert 2 == len(spans)
    assert "Humboldt Universität zu Berlin" == spans[0].text
    assert 2 == len(spans[0].get_labels("ner"))
    assert "Berlin" == spans[1].text
    assert "City" == spans[1].get_label("ner").value

    # now delete the NER tags of "Humboldt Universität zu Berlin"
    sentence[0:4].remove_labels("ner")
    # should be only one NER label left
    labels: List[Label] = sentence.get_labels("ner")
    assert 1 == len(labels)
    assert "Berlin" == labels[0].data_point.text
    assert "City" == labels[0].value
    # and only one NER span
    spans: List[Span] = sentence.get_spans("ner")
    assert 1 == len(spans)
    assert "Berlin" == spans[0].text
    assert "City" == spans[0].get_label("ner").value
Example no. 17
def get_reason_for_appearance(organisation: Span, sentence: Sentence):
    """ Extract the reason for the appearance of an 'ORG' NER tag in a sentence. """
    # Find ORG placement in sentence.
    org_end = organisation.end_pos
    frame_tags = sentence.get_spans("frame")
    # Extract frame and POS tags after the organisation occurrence.
    pos_tags = list(
        filter(lambda span: "VBD" in span.tag, sentence.get_spans("pos")))
    frame_tags_after_org = list(
        filter(lambda span: span.start_pos > org_end, frame_tags))
    pos_tags_after_org = list(
        filter(lambda span: span.start_pos > org_end, pos_tags))
    # If neither frame nor POS tags follow the ORG, give up;
    # otherwise prefer frame tags and fall back to POS tags.
    if not frame_tags_after_org and not pos_tags_after_org:
        return None

    first_after_org = (frame_tags_after_org[0]
                       if frame_tags_after_org else pos_tags_after_org[0])
    original = sentence.to_original_text()
    # Extract the reason following the ORG occurrence.
    reason = original[first_after_org.start_pos:]
    return reason
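For the function above to find 'frame' and 'pos' spans, the sentence must have been tagged with separate frame and POS taggers beforehand; a minimal driver sketch (the model names are the standard pretrained Flair ones, the example sentence is illustrative):

from flair.data import Sentence
from flair.models import SequenceTagger

# one tagger per annotation layer
ner_tagger = SequenceTagger.load("ner")
pos_tagger = SequenceTagger.load("pos")
frame_tagger = SequenceTagger.load("frame")

sentence = Sentence("Acme Corp announced record profits yesterday.")
for tagger in (ner_tagger, pos_tagger, frame_tagger):
    tagger.predict(sentence)

for org in sentence.get_spans("ner"):
    if org.tag == "ORG":
        print(get_reason_for_appearance(org, sentence))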
Example no. 18
def tagIt(title, noOfTags):
    # make a sentence
    sentence = Sentence(title)
    # load the NER tagger ('ner-fast' is a lighter option for CPU-only machines)
    tagger = SequenceTagger.load('ner')

    # run NER over sentence
    tagger.predict(sentence)

    # collect the entity texts into the caller-supplied list
    for entity in sentence.get_spans('ner'):
        noOfTags.append(entity.text)
Example no. 19
def split_to_spans(s: Sentence):
    orig = s.to_original_text()
    last_idx = 0
    spans = []
    tagged_ents = s.get_spans('ner')
    for ent in tagged_ents:
        if last_idx != ent.start_pos:
            spans.append((orig[last_idx:ent.start_pos], None))
        spans.append((orig[ent.start_pos:ent.end_pos], ent.tag))
        last_idx = ent.end_pos
    # keep any trailing untagged text ('len(orig) - 1' here would drop a final one-character span)
    if last_idx < len(orig):
        spans.append((orig[last_idx:], None))
    return spans
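A quick illustration of the output format, which interleaves untagged text with (text, tag) pairs:

from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("ner")
s = Sentence("George Washington went to Washington.")
tagger.predict(s)

print(split_to_spans(s))
# e.g. [('George Washington', 'PER'), (' went to ', None), ('Washington', 'LOC'), ('.', None)]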
Example no. 20
def extract_named_entity(Idea_Text):
    # Pre-process the input text: drop non-ASCII characters
    text_input = Idea_Text.encode("ascii", errors="ignore").decode()
    sentence = Sentence(text_input)

    # run the (pre-loaded) NER tagger over the sentence
    tagger.predict(sentence)

    # collect entities as {'Name': ..., 'Type': ...} records,
    # using the span API instead of parsing str(entity)
    list_ner_entity = []
    for entity in sentence.get_spans('ner'):
        list_ner_entity.append({
            'Name': entity.text,
            'Type': entity.tag,
        })

    return {'Entities': list_ner_entity}
Example no. 21
    def train(corpus):
        print(corpus)

        # 2. what tag do we want to predict?
        tag_type = 'ner'

        # 3. make the tag dictionary from the corpus
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary.idx2item)

        # 4. initialize embeddings
        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            BertEmbeddings()
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

        # 5. initialize sequence tagger

        tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                                embeddings=embeddings,
                                                tag_dictionary=tag_dictionary,
                                                tag_type=tag_type,
                                                use_crf=True)

        # 6. initialize trainer

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        # to resume from a saved checkpoint instead, uncomment the two lines below
        # checkpoint = 'resources/taggers/presidio-ner/checkpoint.pt'
        # trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)
        trainer.train('resources/taggers/presidio-ner',
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=150,
                      checkpoint=True)

        sentence = Sentence('I am from Jerusalem')
        # run NER over sentence
        tagger.predict(sentence)

        print(sentence)
        print('The following NER tags are found:')

        # iterate over entities and print
        for entity in sentence.get_spans('ner'):
            print(entity)
Example no. 22
    def extract(self,
                sentence: str) -> Dict[str, Dict[str, Union[str, Tuple]]]:
        doc = Sentence(sentence)
        self.nlp.predict(doc)
        # build (tag, info) pairs, sorted by tag so groupby sees contiguous groups
        d = sorted([(e.tag, {
            "text": e.text,
            "span": (e.tokens[0].start_pos, len(e.tokens))
        }) for e in doc.get_spans('ner') if e.tag in self.valid_entity_types],
                   key=lambda t: t[0])
        # group the pairs by tag: {tag: [info, ...]}
        d = {
            k: list(map(lambda t: t[1], g))
            for k, g in groupby(d, key=lambda t: t[0])
        }
        return d
Example no. 23
    def __call__(self, doc):
        sent = Sentence(doc.text)
        self.tagger.predict(sent)
        for match in sent.get_spans('ner'):
            _match = match.to_dict()
            span = doc.char_span(_match.get('start_pos'),
                                 _match.get('end_pos'),
                                 label=_match.get('labels')[0].value)
            # skip if the span could not be aligned or overlaps an existing entity
            try:
                doc.ents = list(doc.ents) + [span]
            except (TypeError, ValueError):
                pass
        return doc
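A sketch of wiring the component above into a spaCy v2-style pipeline (the class name is hypothetical, and spaCy v3 registers components through factories instead):

import spacy
from flair.models import SequenceTagger

class FlairNerComponent:
    """Hypothetical host class for the __call__ above."""
    name = "flair_ner"

    def __init__(self):
        self.tagger = SequenceTagger.load("ner")

    # __call__ as defined above goes here

nlp = spacy.load("en_core_web_sm", disable=["ner"])
nlp.add_pipe(FlairNerComponent(), last=True)  # spaCy v2 API
doc = nlp("Angela Merkel met Emmanuel Macron.")
print([(ent.text, ent.label_) for ent in doc.ents])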
Example no. 24
    def get_entities(self, text: str) -> List[str]:
        """
        Get the list of named entities for the given text.
        Override this method to plug in a different NER model.
        :param text: text used for NER extraction
        :return: list of entity strings found in the text
        """

        sentence = Sentence(text)
        self.tagger.predict(sentence)
        entities = []
        for entity in sentence.get_spans('ner'):
            entities.append(entity.text)
        return entities
Example no. 25
def predict_ner(sent):
    sentence = Sentence(sent)
    ner_model.predict(sentence)

    print(sentence.to_tagged_string())

    tags = {}
    for entity in sentence.get_spans('ner'):
        tags[entity.text] = entity.tag

    print(tags)
    output = print_ner_tags(tags, sent)

    return output
Example no. 26
    def get_line_parts_flair(self, line: 'Line'):
        """Split on commas first, since Flair tends to ignore them as boundaries."""
        line_parts = defaultdict(lambda: None)
        for part in self.split_line(line):
            part = Sentence(part)
            self.flair_tagger.predict(part)
            for entity in part.get_spans('ner'):
                # keep only the first entity found for each tag
                if not line_parts[entity.tag]:
                    line_parts[entity.tag] = entity.text
                print(f"{entity.text}, {entity.tag}| ", end="")
        print()
        return line_parts
Example no. 27
def get_important_words(corpus: Corpus,
                        preprocess_pipeline: Optional[List] = None) -> Tuple[Dict, Corpus]:
    important_words = {}
    tagger = SequenceTagger.load('de-ner')
    preprocessor = Preprocessor(pipeline=preprocess_pipeline)
    corpus_processed = preprocessor.process(corpus=corpus)

    for doc in tqdm(corpus_processed.documents):
        sentence = Sentence(doc.text)
        tagger.predict(sentence)

        important_words[doc.id_] = [entity.text for entity in sentence.get_spans('ner')]

    return important_words, corpus_processed
Example no. 28
def correct_who_to_whom(text):

    doc = nlp(text)

    tokenized_text = []

    for token in doc:

        tokenized_text.append(token.text_with_ws)

        # it is very difficult for a named entity recognizer to recognize 'Who'
        # in isolation - the motivating text was the repeated exclamation
        # 'Who! Who!' in a Grinch fan fiction.
        if token.text.lower() in ['grinch', 'whoville', 'scooby', 'horton']:
            return

        if token.text.lower() == 'who':
            if token.dep_ in ['dobj', 'iobj', 'pobj']:

                # check for the hard-coded exceptions
                if not check_for_exceptions(doc, token):
                    should_be_whom = True

                    sentence = Sentence(text, use_tokenizer=SegtokTokenizer())
                    tagger.predict(sentence)

                    # make sure it is not part of a named entity
                    for entity in sentence.get_spans('ner'):
                        if token.idx >= entity.start_pos and token.idx <= entity.end_pos:
                            should_be_whom = False

                    if should_be_whom:

                        # detokenizes the corrected excerpt (e.g. removes the added space
                        # between the last word in a sentence and its punctuation, rejoins
                        # don and 't to form don't, etc., only if such joins were
                        # present in the original text)
                        tokenized_text[token.i] = whom_string(
                            token.text_with_ws, True)

    # prints the text with corrections made (corrections surrounded by asterisks)
    corrected_text = ''.join(tokenized_text)
    print(corrected_text)
Example no. 29
def get_source_frames(sentence_tokens, frame_tagger):
    all_frames = []
    if frame_tagger:
        sentence_obj = Sentence(" ".join(sentence_tokens))
        frame_tagger.predict(sentence_obj)
        for frame in sentence_obj.get_spans('frame'):
            if frame.tag != "_":
                indices, tokens = zip(*[(tok.idx - 1, tok.text)
                                        for tok in frame.tokens])
                all_frames.append({
                    "predicate_sense": frame.tag,
                    "predicate_word": tokens,
                    "predicate_ix": indices
                })
    return all_frames
Example no. 30
def get_flair_ner(text):
    # make a sentence
    sentence = Sentence(text)

    # load the multilingual NER tagger
    flair_tagger = SequenceTagger.load('multi-ner-fast')

    # run NER over the sentence
    flair_tagger.predict(sentence)

    # return the tagged entity spans
    return sentence.get_spans('ner')
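Example call; since SequenceTagger.load is expensive, callers looping over many texts may want to hoist the model load out of the function:

for span in get_flair_ner("Angela Merkel visited Paris."):
    print(span.text, span.tag, round(span.score, 2))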