Example No. 1
    def _split_long_sentences(self, sentences):
        """Split long sentences.

        Args:
            sentences (list): list of flair's Sentences

        Returns:
            list:
        """
        extended = sentences.copy()
        tokenizer = self.model.embeddings.tokenizer
        offset = 0
        for i, sentence in enumerate(sentences):
            len_bpe = len(tokenizer.tokenize(sentence.to_tokenized_string()))
            if len_bpe > self.max_length:
                extended.pop(i + offset)
                num_pieces = len_bpe // self.max_length + 1
                for piece in array_split(sentence, num_pieces):
                    char_offset = piece[0].start_pos
                    sentence_piece = Sentence()
                    for token in piece:
                        token.start_pos -= char_offset
                        token.end_pos -= char_offset
                        sentence_piece.add_token(token)
                    piece[-1].whitespace_after = False
                    extended.insert(i + offset, sentence_piece)
                    offset += 1
                # the original sentence was popped, so decrease the offset by one
                offset -= 1
        logger.debug(f'Lengths before split: {[len(x) for x in sentences]}')
        logger.debug(f'Lengths after split: {[len(x) for x in extended]}')
        return extended
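The chunking step relies on numpy's array_split, which cuts a sequence into num_pieces roughly equal slices (the longer slices come first). A quick, self-contained illustration on a plain list:

from numpy import array_split

pieces = array_split(list(range(10)), 3)
print([p.tolist() for p in pieces])  # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]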
Example No. 2
def predict_sentence_entities(tagger, sent, all_entities, phase=1):

    #print("Processing {}".format(sent['words'][0]))
    global elapsed_times
    newsent = Sentence()
    for i in range(0, len(sent['normwords'])):
        tok = sent['normwords'][i]
        token = Token(tok, i, None, start_position=int(sent['starts'][i]))
        newsent.add_token(token)

    seqtagger = tagger['model']
    model_id = tagger['model_id']

    start = time.time()
    seqtagger.predict(newsent)
    end = time.time()
    words = len(sent['normwords'])
    elapsed = np.round((end - start) * 1000, 0)
    if elapsed < 0:
        elapsed = 0
    obj = elapsed_times[model_id]
    if obj['freqs'].get(words) is not None:
        obj['freqs'][words] += 1
        obj['times'][words] += elapsed
    else:
        obj['freqs'][words] = 1
        obj['times'][words] = elapsed
    ner_spans = newsent.get_spans("ner")
    make_entities(sent, all_entities, ner_spans, False, model_id)
    return
Example No. 3
def benchmark_flair_mdl():
    tagger = load_flair_ner_model()

    start = time.time()

    flair_sentences = []
    for i, sentence in enumerate(sentences_tokens):
        flair_sentence = Sentence()

        for token_txt in sentence:
            flair_sentence.add_token(Token(token_txt))
        flair_sentences.append(flair_sentence)

    tagger.predict(flair_sentences, verbose=True)
    predictions = [[tok.tags['ner'].value for tok in fs]
                   for fs in flair_sentences]
    print('Flair:')
    print("Made predictions on {} sentences and {} tokens in {}s".format(
        num_sentences, num_tokens,
        time.time() - start))

    assert len(predictions) == num_sentences

    print(
        classification_report(sentences_entities,
                              remove_miscs(predictions),
                              digits=4))
Example No. 4
    def pad_sequence(self,
                     sentences,
                     labelVoc,
                     word_maxlen=30,
                     sent_maxlen=35):
        """
            This function is used to pad the word into the same length, the word length is set to 30.
            Moreover, it also pad each sentence into the same length, the length is set to 35.

        """
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        x = []
        x_flair = []
        y = []
        for sentence in sentences:
            w_id = []
            y_id = []
            st = Sentence()
            for idx, word_label in enumerate(sentence):
                try:
                    w_id.append(tokenizer.vocab[word_label[0].lower()])
                except KeyError:
                    # out-of-vocabulary words fall back to the [MASK] id
                    w_id.append(tokenizer.vocab['[MASK]'])
                st.add_token(word_label[0])
                y_id.append(labelVoc[word_label[1]])
            x.append(w_id)
            x_flair.append(st)
            y.append(y_id)

        y = self.pad_sequences(y, sent_maxlen)
        x = self.pad_sequences(x, sent_maxlen)

        y = np.asarray(y)

        return [x, x_flair, y]
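The lookup above falls back to the [MASK] id for out-of-vocabulary words. A minimal sketch of the same lookup, assuming the Hugging Face transformers BertTokenizer (the original may use an older BERT package); the slow tokenizer exposes its vocabulary as a plain dict:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
for word in ('Hello', 'zzzunseenzzz'):
    # unknown words fall back to the [MASK] id, mirroring the try/except above
    w_id = tokenizer.vocab.get(word.lower(), tokenizer.vocab['[MASK]'])
    print(word, w_id)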
Example No. 5
    def _convert_to_flair(self, data, labels=None):
        """ Convert data and labels into a list of flair.data.Sentence objects.

            Parameters
            ----------
            data : list(list(str))
                list of list of tokens, each inner list represents a list of
                    tokens or words in sentence, and each outer list represents
                    a sentence.
            labels : list(list(str)), can be None
                list of list of NER tags corresponding to tokens in data.

            Returns
            -------
            sentences : list(flair.data.Sentence)
        """
        sentences = []
        if labels is None:
            labels = data
            use_dummy_labels = True
        else:
            use_dummy_labels = False
        for tokens, tags in zip(data, labels):
            sentence = Sentence()
            for token, tag in zip(tokens, tags):
                t = Token(token)
                if not use_dummy_labels:
                    t.add_tag("ner", tag)
                sentence.add_token(t)
            sentences.append(sentence)
        return sentences
Example No. 6
    def _embed(self, x):
        vocab_idx = self.vocab_idx
        embeddings = []
        for sequence in x:
            padding_length = sequence.size(0)
            flair_sentence = Sentence()
            for index in sequence:
                index = index.item()
                if index == self.pad_index:
                    break  # skip padding
                padding_length = padding_length - 1
                token = vocab_idx.get(index, '[UNK]')
                flair_sentence.add_token(token)
            self.embeddings.embed(flair_sentence)
            sentence_embedding = torch.stack(
                [token.embedding for token in flair_sentence.tokens])
            if padding_length:
                sentence_embedding = torch.cat(
                    (sentence_embedding,
                     torch.zeros(padding_length,
                                 sentence_embedding.size(-1),
                                 device=sentence_embedding.device)))
            embeddings.append(sentence_embedding)

        return torch.stack(embeddings)
Example No. 7
def process_conll_doc(input_file_name, output_file_name):

    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name,
              "r") as input_file, open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0

        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                    spos = 0
            else:
                lsplit = line.split("\t")
                #print(lsplit)
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0:
                        if c < len(lsplit):
                            token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)

        for d in docs:

            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": []
            }
            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()
            #print(info)
            for i in info:
                entity_ran = range(i[0], i[0] + i[1])
                #print(i[2] + " " + str(entity_ran))
                for t in d.tokens:
                    #print(t.text + " " + str(t.start_pos))
                    if t.start_position in entity_ran:
                        #print("found tag")
                        t.add_tag("pnme", i[2])

            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
Example No. 8
def test_sentence_to_tagged_string():
    token1 = Token('I', 0)
    token2 = Token('love', 1, 0)
    token3 = Token('Berlin', 2, 1)
    token3.add_tag('ner', 'LOC')
    sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)
    assert ('I love Berlin <LOC>' == sentence.to_tagged_string())
Example No. 9
 def predict(self, sentence):
     flair_sentence = Sentence()
     for word in sentence:
         flair_sentence.add_token(word)
     self.model.predict(flair_sentence, label_name="predicted")
     predictions = []
     for token in flair_sentence:
         predictions.append((token.get_tag("upos").value,
                             token.get_tag("predicted").value))
     return predictions
Example No. 10
 def predict(self, text: Generator[list[str], None, None]) -> list[list[str]]:
     preds = list()
     flair_sents = list()
     for words in text:
         s = Sentence()
         for word in words:
             s.add_token(Token(word))
         flair_sents.append(s)
     self.model.predict(flair_sents)
     return [[tok.tags["ner"].value for tok in s] for s in flair_sents]
Example No. 11
def test_get_head():
    token1 = Token('I', 0)
    token2 = Token('love', 1, 0)
    token3 = Token('Berlin', 2, 1)
    sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)
    assert (token2 == token3.get_head())
    assert (token1 == token2.get_head())
    assert (None == token1.get_head())
Example No. 12
def sent_to_flair(sent):
    """
    Convert a tokenized sentence (list of words) to a Flair sentence object
    """
    sentence = Sentence()

    for w in sent:
        token = Token(w)
        sentence.add_token(token)

    # infer whitespace once, after all tokens have been added
    sentence.infer_space_after()

    return sentence
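A possible call of the helper above, assuming an older flair release in which an empty Sentence() can be filled token by token (as all of these examples do):

words = ['I', 'love', 'Berlin', '.']
s = sent_to_flair(words)
print(s.to_tokenized_string())  # I love Berlin .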
Example No. 13
	def embed_sentence(self, sentence):
		"""This function embed each sentence with BERT embedder

		Args:
			sentence (str): raw sentence

		Returns:
			np.array: embedded matrix
		"""		
		flair_sentence = Sentence(sentence)
		# pad to a fixed length so every sentence yields a MAX_LEN x dim matrix
		while len(flair_sentence) < self.MAX_LEN:
			flair_sentence.add_token(Token("__PAD__"))
		self.embedder.embed(flair_sentence)
		return np.stack([t.embedding.cpu().numpy() for t in flair_sentence])
Example No. 14
def get_tags(line, tagger):
    # build a flair Sentence from the pre-tokenized line
    sentence = Sentence()
    for token in line:
        sentence.add_token(Token(token))
    tagger.predict(sentence)
    # split to get tags
    tagged_line = sentence.to_tagged_string().split()
    tags = []
    # tags are every other token in sentence
    for i in range(1, len(tagged_line), 2):
        tags.append(tagged_line[i][1:-1])
    return tags
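Note that the every-other-token slicing only holds when every token actually received a tag (e.g. POS tagging, where to_tagged_string() alternates word and <tag>). A plain-Python illustration of the parsing step on a hypothetical tagged string:

tagged_line = 'I <PRON> love <VERB> Berlin <PROPN>'.split()
tags = [tok[1:-1] for tok in tagged_line[1::2]]  # strip the angle brackets
print(tags)  # ['PRON', 'VERB', 'PROPN']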
Example No. 15
def test_sentence_add_token():
    token1 = Token('Munich')
    token2 = Token('and')
    token3 = Token('Berlin')
    token4 = Token('are')
    token5 = Token('nice')
    sentence = Sentence()
    sentence.add_token(token1)
    sentence.add_token(token2)
    sentence.add_token(token3)
    sentence.add_token(token4)
    sentence.add_token(token5)
    sentence.add_token('cities')
    sentence.add_token(Token('.'))
    assert ('Munich and Berlin are nice cities .' ==
            sentence.to_tokenized_string())
Example No. 16
def standoff_to_flair_sents(
        docs: List[Document],
        tokenizer: Tokenizer,
        verbose=False) -> Tuple[List[Sentence], List[ParsedDoc]]:
    sents, parsed_docs = standoff_to_sents(docs=docs,
                                           tokenizer=tokenizer,
                                           verbose=verbose)

    flair_sents = []
    for sent in sents:
        flair_sent = Sentence()
        for token in sent:
            tok = Token(token.text)
            tok.add_tag(tag_type='ner', tag_value=token.label)
            flair_sent.add_token(tok)
        flair_sents.append(flair_sent)

    return flair_sents, parsed_docs
Example No. 17
    def get_flair_predictions(self,
                              model_type,
                              input_conllu,
                              with_score=False):
        if model_type == "onto":
            model = self.flair_onto
        elif model_type == "ner":
            model = self.flair_ner
        else:
            model = self.flair_gum

        sentences = []
        conll_sents = input_conllu.strip().split("\n\n")

        for sent in conll_sents:
            token_list = [l.split("\t") for l in sent.split("\n") if "\t" in l]
            token_list = [
                t[1] for t in token_list if "." not in t[0] and "-" not in t[0]
            ]
            sentence = Sentence()
            for token in token_list:
                sentence.add_token(token)
            sentences.append(sentence)

        output = []
        scores = []

        preds = model.predict(sentences, all_tag_prob=with_score)
        if preds is None:  # newer flair versions return None from predict() and modify the Sentence list in place
            preds = sentences

        for sentence in preds:
            for token in sentence:
                if str(flair.__version__).startswith("0.4"):
                    output.append(token.tags['pos'].value)
                else:
                    output.append(token.labels[0].value)
                if with_score:
                    scores.append(token.labels[0].score)
        if with_score:
            return (output, scores)
        else:
            return [output]
Example No. 18
    def _get_rnn_output(self,
                        tokens: List[List[str]],
                        mask: Tensor = None) -> Tensor:

        sentences = []
        for token_list in tokens:
            sentence = Sentence()
            for t in token_list:
                # replace non-breaking spaces before handing tokens to flair
                sentence.add_token(Token(t.replace('\xa0', ' ')))
            sentences.append(sentence)

        self.embeddings.embed(sentences)

        lengths = [len(sentence.tokens) for sentence in sentences]
        longest_token_sequence_in_batch = max(lengths)

        pre_allocated_zero_tensor = torch.zeros(
            self.embeddings.embedding_length * longest_token_sequence_in_batch,
            dtype=torch.float,
            device=flair.device)

        all_embs = list()
        for sentence in sentences:
            all_embs += [
                emb for token in sentence.tokens
                for emb in token.get_each_embedding()
            ]
            nb_padding_tokens = longest_token_sequence_in_batch - len(sentence)

            if nb_padding_tokens > 0:
                t = pre_allocated_zero_tensor[:self.embeddings.
                                              embedding_length *
                                              nb_padding_tokens]
                all_embs.append(t)

            for token in sentence.tokens:
                token.clear_embeddings()

        # [batch, length, word_dim]
        input = torch.cat(all_embs) \
            .view((len(sentences), longest_token_sequence_in_batch, self.embeddings.embedding_length))

        if self.device != flair.device:
            if self.device != torch.device('cpu'):
                input = input.cuda(self.device)
            else:
                input = input.cpu()

        # output from rnn [batch, length, hidden_size]
        output, hn = self.rnn(input, mask)

        # apply dropout for the output of rnn
        # [batch, length, hidden_size] --> [batch, hidden_size, length] --> [batch, length, hidden_size]
        output = self.dropout_out(output.transpose(1, 2)).transpose(1, 2)

        return output
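A minimal sketch, with made-up sizes, of the concatenate-then-view padding trick that _get_rnn_output uses to build the [batch, length, dim] tensor:

import torch

emb_dim, max_len = 4, 3                               # hypothetical sizes
zeros = torch.zeros(emb_dim * max_len)                # pre-allocated zero buffer
sent_a = [torch.randn(emb_dim) for _ in range(3)]     # full-length sentence
sent_b = [torch.randn(emb_dim) for _ in range(1)]     # needs 2 padding tokens
all_embs = sent_a + sent_b + [zeros[:emb_dim * 2]]    # reuse a slice of the zero buffer
batch = torch.cat(all_embs).view(2, max_len, emb_dim)
print(batch.shape)  # torch.Size([2, 3, 4])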
Example No. 19
def get_flair_predictions(sentences):
    predictions = []
    
    flair_sentences = []
    for sentence in sentences:
        flair_sentence = Sentence()
        for token in sentence:
            flair_sentence.add_token(Token(token))
        flair_sentences.append(flair_sentence)
    flair.predict(flair_sentences)
    
    for s in flair_sentences:
        predicted_categories = []
        for t in s:
            predicted_categories.append(t.tags['ner'].value)
        predictions.append(predicted_categories)
    
    return predictions

#flair_preds = get_flair_predictions(sentences)
Example No. 20
def test_sentence_infer_tokenization():
    sentence = Sentence()
    sentence.add_token(Token('xyz'))
    sentence.add_token(Token('"'))
    sentence.add_token(Token('abc'))
    sentence.add_token(Token('"'))
    sentence.infer_space_after()
    assert ('xyz " abc "' == sentence.to_tokenized_string())
    assert ('xyz "abc"' == sentence.to_plain_string())
    sentence = Sentence('xyz " abc "')
    sentence.infer_space_after()
    assert ('xyz " abc "' == sentence.to_tokenized_string())
    assert ('xyz "abc"' == sentence.to_plain_string())
Example No. 21
def benchmark_flair_mdl():
    tagger = load_flair_ner_model()

    start = time.time()

    flair_sentences = []
    for i, sentence in enumerate(sentences_tokens):
        flair_sentence = Sentence()

        for token_txt in sentence:
            flair_sentence.add_token(Token(token_txt))
        flair_sentences.append(flair_sentence)

    tagger.predict(flair_sentences, verbose=True)
    predictions = [[tok.get_tag('ner').value for tok in fs]
                   for fs in flair_sentences]
    print('Flair:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example No. 22
    def get_sentences(text, lang, use_ontonotes, fast, use_embeddings,
                      char_embeddings, bpe_size, expressions, pos,
                      sentiment) -> List[Sentence]:
        """Process text using Flair and return the output from Flair"""

        if lang not in ('en', 'multi', 'de', 'nl', 'fr'):
            raise TypeError(
                f'{lang} is not supported! Try multi. See https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_2_TAGGING.md'
            )

        # tokenize sentences
        sentences = []
        for s in segment(text):
            sentence = Sentence()
            sentences.append(sentence)
            for t in s:
                sentence.add_token(
                    Token(t.value,
                          start_position=t.offset,
                          whitespace_after=t.space_after))

        # run models
        for model in get_models(lang=lang,
                                use_ontonotes=use_ontonotes,
                                fast=fast,
                                expressions=expressions,
                                pos=pos,
                                sentiment=sentiment):
            model.predict(sentences)

        # load embedding models
        if use_embeddings or char_embeddings or bpe_size > 0:
            get_embeddings([e.strip() for e in use_embeddings.split(',')],
                           char_embeddings, lang, bpe_size).embed(sentences)

        return sentences
Example No. 23
def predictSentence():
    res = request.get_json()
    sentence = Sentence()
    for i in json.loads(res):
        sentence.add_token(Token(i))

    #print(sentence)

    tagger.predict(sentence)

    #for entity in sentence.get_spans('ner'):
    #    print(entity)
    #    print(entity.text)
    #    print(entity.tag)
    #    print("--------------")
    #print(sentence.to_dict(tag_type='ner'))

    tags = []
    for token in sentence.tokens:
        #print(token.text, token.get_tag('ner').value)
        tags.append(token.get_tag('ner').value)

    res = json.dumps(tags)
    return res
Example No. 24
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):

    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name,
              "r") as input_file, open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0

        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                    spos = 0
            else:
                lsplit = line.split("\t")
                #print(lsplit)
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0:
                        if c < len(lsplit):
                            token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)

        for d in docs:
            nertagger.predict(d)

            centity = []
            newsent = []
            for token in d:
                #print(token)
                nertag = token.get_tag("ner").value
                #print(token.text + " " + nertag)
                if nertag[0:2] in ['B-', 'S-']:
                    if len(centity) != 0:
                        newsent.append("<entity>" + " ".join(centity) +
                                       "</entity>")
                        centity = []
                    centity.append(token.text)
                if nertag[0:2] in ['E-', 'I-']:
                    centity.append(token.text)
                if nertag == "O":
                    if len(centity) != 0:
                        newsent.append("<entity>" + " ".join(centity) +
                                       "</entity>")
                        centity = []
                    newsent.append(token.text)
            sent_for_ag = " ".join(newsent)
            agres = ag.disambiguate(sent_for_ag)

            for entity in d.get_spans('ner'):
                for r in agres:
                    if r["namedEntity"] == entity.text:
                        for t in entity.tokens:
                            t.add_tag("pnme", r["disambiguatedURL"])
                        break

            if with_disambiguation:
                searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        #print("calling with " + nerspan.text)
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        #print(r)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
Example No. 25
    def train(self, intent_fst) -> None:
        from flair.data import Sentence, Token
        from flair.models import SequenceTagger, TextClassifier
        from flair.embeddings import (
            FlairEmbeddings,
            StackedEmbeddings,
            DocumentRNNEmbeddings,
        )
        from flair.data import TaggedCorpus
        from flair.trainers import ModelTrainer

        # Directory to look for downloaded embeddings
        cache_dir = self.profile.read_path(
            self.profile.get("intent.flair.cache_dir", "flair/cache")
        )

        os.makedirs(cache_dir, exist_ok=True)

        # Directory to store generated models
        data_dir = self.profile.write_path(
            self.profile.get("intent.flair.data_dir", "flair/data")
        )

        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)

        self.embeddings = self.profile.get("intent.flair.embeddings", [])
        assert len(self.embeddings) > 0, "No word embeddings"

        # Create directories to write training data to
        class_data_dir = os.path.join(data_dir, "classification")
        ner_data_dir = os.path.join(data_dir, "ner")
        os.makedirs(class_data_dir, exist_ok=True)
        os.makedirs(ner_data_dir, exist_ok=True)

        # Convert FST to training data
        class_data_path = os.path.join(class_data_dir, "train.txt")
        ner_data_path = os.path.join(ner_data_dir, "train.txt")

        # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
        sentences_by_intent: Dict[str, Any] = {}

        # Get sentences for training
        do_sampling = self.profile.get("intent.flair.do_sampling", True)
        start_time = time.time()

        if do_sampling:
            # Sample from each intent FST
            num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
            intent_map_path = self.profile.read_path(
                self.profile.get("training.intent.intent_map", "intent_map.json")
            )

            with open(intent_map_path, "r") as intent_map_file:
                intent_map = json.load(intent_map_file)

            # Gather FSTs for all known intents
            fsts_dir = self.profile.write_dir(
                self.profile.get("speech_to_text.fsts_dir")
            )

            intent_fst_paths = {
                intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
                for intent_id in intent_map.keys()
            }

            # Generate samples
            self._logger.debug(
                f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
            )

            sentences_by_intent = sample_sentences_by_intent(
                intent_fst_paths, num_samples
            )
        else:
            # Exhaustively generate all sentences
            self._logger.debug(
                "Generating all possible sentences (may take a long time)"
            )
            sentences_by_intent = make_sentences_by_intent(intent_fst)

        sentence_time = time.time() - start_time
        self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

        # Get least common multiple in order to balance sentences by intent
        lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

        # Generate examples
        class_sentences = []
        ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
        for intent_name, intent_sents in sentences_by_intent.items():
            num_repeats = max(1, lcm_sentences // len(intent_sents))
            for intent_sent in intent_sents:
                # Only train an intent classifier if there's more than one intent
                if len(sentences_by_intent) > 1:
                    # Add balanced copies
                    for i in range(num_repeats):
                        class_sent = Sentence(labels=[intent_name])
                        for word in intent_sent["tokens"]:
                            class_sent.add_token(Token(word))

                        class_sentences.append(class_sent)

                if len(intent_sent["entities"]) == 0:
                    continue  # no entities, no sequence tagger

                # Named entity recognition (NER) example
                token_idx = 0
                entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
                entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
                entity = None

                word_tags = []
                for word in intent_sent["tokens"]:
                    # Determine tag label
                    tag = "O" if not entity else f"I-{entity}"
                    if token_idx in entity_start:
                        entity = entity_start[token_idx]["entity"]
                        tag = f"B-{entity}"

                    word_tags.append((word, tag))

                    # advance the character offset past this word and the following space
                    token_idx += len(word) + 1

                    if (token_idx - 1) in entity_end:
                        entity = None

                # Add balanced copies
                for i in range(num_repeats):
                    ner_sent = Sentence()
                    for word, tag in word_tags:
                        token = Token(word)
                        token.add_tag("ner", tag)
                        ner_sent.add_token(token)

                    ner_sentences[intent_name].append(ner_sent)

        # Start training
        max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

        # Load word embeddings
        self._logger.debug(f"Loading word embeddings from {cache_dir}")
        word_embeddings = [
            FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
            for e in self.embeddings
        ]

        if len(class_sentences) > 0:
            self._logger.debug("Training intent classifier")

            # Random 80/10/10 split
            class_train, class_dev, class_test = self._split_data(class_sentences)
            class_corpus = TaggedCorpus(class_train, class_dev, class_test)

            # Intent classification
            doc_embeddings = DocumentRNNEmbeddings(
                word_embeddings,
                hidden_size=512,
                reproject_words=True,
                reproject_words_dimension=256,
            )

            classifier = TextClassifier(
                doc_embeddings,
                label_dictionary=class_corpus.make_label_dictionary(),
                multi_label=False,
            )

            self._logger.debug(
                f"Intent classifier has {len(class_sentences)} example(s)"
            )
            trainer = ModelTrainer(classifier, class_corpus)
            trainer.train(class_data_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping intent classifier training")

        if len(ner_sentences) > 0:
            self._logger.debug(f"Training {len(ner_sentences)} NER sequence tagger(s)")

            # Named entity recognition
            stacked_embeddings = StackedEmbeddings(word_embeddings)

            for intent_name, intent_ner_sents in ner_sentences.items():
                ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
                ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

                tagger = SequenceTagger(
                    hidden_size=256,
                    embeddings=stacked_embeddings,
                    tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                    tag_type="ner",
                    use_crf=True,
                )

                ner_intent_dir = os.path.join(ner_data_dir, intent_name)
                os.makedirs(ner_intent_dir, exist_ok=True)

                self._logger.debug(
                    f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
                )
                trainer = ModelTrainer(tagger, ner_corpus)
                trainer.train(ner_intent_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping NER sequence tagger training")
Example No. 26
def form_sentence(tokens):
    s = Sentence()
    for w in tokens:
        s.add_token(Token(w))
    return s
Example No. 27
def process_conll_doc(input_file_name, output_file_name, ner_model,
                      with_disambiguation, sim_level_disambig):

    nertagger = SequenceTagger.load(ner_model)
    columns = {
        0: 'text',
        1: 'nero',
        2: 'nme',
        3: 'wiki',
    }
    with open(input_file_name,
              "r") as input_file, open(output_file_name, "w+") as output_file:
        doc = None
        docs = []
        spos = 0

        for line in input_file:
            if "DOCSTART" in line:
                if doc is None:
                    doc = Sentence()
                else:
                    docs.append(doc)
                    doc = Sentence()
                    spos = 0
            else:
                lsplit = line.split("\t")
                #print(lsplit)
                token = Token(lsplit[0].strip())
                for c in columns:
                    if c != 0:
                        if c < len(lsplit):
                            token.add_tag(columns[c], lsplit[c].strip())
                token.start_pos = spos
                token.end_pos = spos + len(token.text)
                spos = token.end_pos + 1
                doc.add_token(token)

        for d in docs:
            nertagger.predict(d)

            spans = []
            for nerspan in d.get_spans('ner'):
                start = nerspan.start_pos
                length = nerspan.end_pos - nerspan.start_pos
                spans.append({"start": start, "length": length})

            myjson = {
                "text": unidecode.unidecode(d.to_tokenized_string()),
                "spans": spans
            }

            res = requests.post(NEURAL_EL_SERVER, json=myjson)
            info = res.json()

            for nerspan in d.get_spans('ner'):
                for i in info:
                    if i[0] == nerspan.start_pos:
                        for t in nerspan.tokens:
                            t.add_tag("pnme", i[2])
                        break

            if with_disambiguation:
                searcher = load_disambiguation()
                for nerspan in d.get_spans('ner'):
                    if "pnme" not in nerspan.tokens[0].tags:
                        #print("calling with " + nerspan.text)
                        r = searcher.search(nerspan.text.lower(),
                                            sim_level_disambig)
                        #print(r)
                        if len(r) > 0:
                            d_tag = unidecode.unidecode(
                                (string.capwords(r[0]) +
                                 "_(disambiguation)").replace(" ", "_"))
                            for t2 in nerspan.tokens:
                                t2.add_tag("pnme", d_tag)

            for t in d:
                output_file.write(
                    t.text + "\t" + t.get_tag("nero").value + "\t" +
                    t.get_tag("nme").value + "\t" +
                    unidecode.unidecode(t.get_tag("wiki").value) + "\t" +
                    t.get_tag("pnme").value + "\n")
Example No. 28
    def test_check_input(self):
        """
        Test for check_input function
        """
        phone_sigs = [
            'cell', 'Cell', 'phone', 'Phone', 'Phone/fax', 'phone/fax',
            'Phone/Fax'
        ]
        fax_sigs = ['Fax', 'fax']

        # Check for email address
        sentence = Sentence()
        token = Token('hello')
        tag = 'S-email_id'
        token.add_tag('ner', tag)
        sentence.add_token(token)
        app.check_input(sentence)
        return_val = sentence[0].get_tag('ner').value
        self.assertNotEqual(return_val, tag)

        token = Token('*****@*****.**')
        sentence.add_token(token)
        app.check_input(sentence)
        return_val = sentence[1].get_tag('ner').value
        self.assertEqual(return_val, tag)

        token = Token('*****@*****.**')
        sentence.add_token(token)
        app.check_input(sentence)
        return_val = sentence[2].get_tag('ner').value
        self.assertNotEqual(return_val, tag)

        # Check for phone number
        for sig in phone_sigs:
            sentence = Sentence()
            token = Token(sig)
            tag = 'S-phone'
            token.add_tag('ner', tag)
            sentence.add_token(token)
            token = Token('123-456-7890')
            sentence.add_token(token)
            app.check_input(sentence)
            return_val = sentence[0].get_tag('ner').value
            self.assertNotEqual(return_val, tag)
            return_val = sentence[1].get_tag('ner').value
            self.assertEqual(return_val, tag)

        # Check for fax number
        for sig in fax_sigs:
            sentence = Sentence()
            token = Token(sig)
            tag = 'S-fax'
            token.add_tag('ner', tag)
            sentence.add_token(token)
            token = Token('123-456-7890')
            sentence.add_token(token)
            app.check_input(sentence)
            return_val = sentence[0].get_tag('ner').value
            self.assertNotEqual(return_val, tag)
            return_val = sentence[1].get_tag('ner').value
            self.assertEqual(return_val, tag)

        # Check for zipcode
        num = ''
        for i in range(10):
            num += str(i)
            sentence = Sentence()
            token = Token(num)
            tag = 'S-zipcode'
            sentence.add_token(token)
            app.check_input(sentence)
            return_val = sentence[0].get_tag('ner').value
            if len(num) == 5:
                self.assertEqual(return_val, tag)
            else:
                self.assertNotEqual(return_val, tag)
Example No. 29
    def __call__(self, doc):

        # TODO: use a sentencizer or not?
        # TODO: process all sentences in one batch on GPU
        for doc_sentence in doc.sents:
            #filtered_doc_sentence = [token for token in doc_sentence if not token.is_punct and not token.is_space]
            filtered_doc_sentence = doc_sentence

            json_data = []

            # if any tokens remain in the sentence
            if filtered_doc_sentence:
                sentence = Sentence()
                for token in filtered_doc_sentence:
                    sentence.add_token(Token(token.text))
                    json_data.append(token.text)

                json_obj = json.dumps(json_data)

                r = requests.post(self.req_address, json=json_obj)

                tags_res = r.json()

                spans = []
                tags = []
                for doc_token, tag in zip(filtered_doc_sentence, tags_res):
                    start = doc_token.i
                    end = start + 1

                    #tag = tagged_token.get_tag('ner')

                    if tag != 'O':
                        _, label = tag.split('-')
                        span = Span(doc,
                                    start,
                                    end,
                                    label=self.nlp.vocab.strings[label])
                        spans.append(span)
                        tags.append(tag)
                        # doc.ents = list(doc.ents) + [span]
                doc.ents = list(doc.ents) + self.merge_iob_spans(
                    doc, spans, tags)
                """
                tagged_sentences = self.tagger.predict(sentence)

                spans = []
                tags = []
                for doc_token, tagged_token in zip(filtered_doc_sentence, tagged_sentences[0]):
                    start = doc_token.i
                    end = start + 1

                    tag = tagged_token.get_tag('ner')

                    if tag != 'O':
                        _, label = tag.split('-')
                        span = Span(doc, start, end, label=self.nlp.vocab.strings[label])
                        spans.append(span)
                        tags.append(tag)
                        #doc.ents = list(doc.ents) + [span]
                        
                doc.ents = list(doc.ents) + self.merge_iob_spans(doc, spans, tags)
                """

        return doc
Example No. 30
    def create_sentlist_from_file_batchmax(self,
                                           data,
                                           maxlen=64,
                                           compare_column="cat"):
        """
        takes a pandas dataframe with columns 'tok' and 'sentstart' and creates a list of flair Sentence objects with tags.
        Each flair Sentence object may contain several real sentences, but at most maxlen tokens.
        The Sentence object stops at a sentence boundary, so it is often shorter than maxlen.
        Sentences longer than maxlen are split!
        If a line with token value "EOF" is encountered, a shorter flair Sentence object is returned,
        so no file boundaries are crossed
        :param data_path:
        :return:
        """
        sent_list = []
        toklist = []
        catlist = []
        # the len_last_token is needed to add proper start/end pos for each sentence token
        len_last_token = 0
        # track the sentence that is currently being processed
        curr_sentence_tok = []
        curr_sentence_cat = []
        for index, row in data.iterrows():
            tok = str(row["tok"])
            if compare_column != "NaN":
                cat = str(row[compare_column])
            else:
                cat = "-"

            # if the current token is "EOF" this marks the end of sample file
            # chunks may not cross file boundaries, therefore end the sentence here in any case
            if tok == "EOF":
                # do not add this token to any list
                # merge toklist and curr_sentence_tok list to get all current tokens
                # and create a flair sentence
                toklist.extend(curr_sentence_tok)
                catlist.extend(curr_sentence_cat)
                self.logger.debug(
                    "create chunk at EOF with (len: {}): {}".format(
                        len(toklist), toklist))
                self.logger.debug("catlist with (len: {}): {}".format(
                    len(catlist), catlist))
                sent = Sentence()
                for i, tok in enumerate(toklist):
                    flair_tok = Token(str(tok), start_position=len_last_token)
                    len_last_token += len(tok) + 1
                    flair_tok.add_tag("cat", catlist[i])
                    sent.add_token(flair_tok)
                if len(sent.tokens) > 0:
                    sent_list.append(sent)
                len_last_token = 0
                toklist = []
                catlist = []
                # reset the curr sent lists as well
                curr_sentence_tok = []
                curr_sentence_cat = []

            else:
                # if we are at the start of a new sentence, add the contents of curr_sentence_tok
                # and curr_sentence_cat to the main lists and start a new curr_sentence
                if row["sentstart"] == "yes":
                    toklist.extend(curr_sentence_tok)
                    catlist.extend(curr_sentence_cat)
                    curr_sentence_tok = [tok]
                    curr_sentence_cat = [cat]
                else:
                    curr_sentence_tok.append(tok)
                    curr_sentence_cat.append(cat)

                # if the combined length of toklist and curr_sentence_tok is > maxlen now,
                # create a flair sentence with the tokens in toklist and reset it
                # the remaining tokens in curr_sentence_tok are saved for the next chunk
                if len(toklist) + len(curr_sentence_tok) > maxlen:
                    # if toklist is empty at this point, we have a sentence > maxlen
                    # and must split it. The last token currently in curr_sentence will
                    # be preserved for later so that the chunk is not too long
                    if len(toklist) == 0:
                        toklist.extend(curr_sentence_tok[0:-1])
                        catlist.extend(curr_sentence_cat[0:-1])
                        curr_sentence_tok = [curr_sentence_tok[-1]]
                        curr_sentence_cat = [curr_sentence_cat[-1]]
                        self.logger.debug(
                            "Sentence is split (len: {}): {}".format(
                                len(toklist), toklist))

                    self.logger.debug("create chunk with (len: {}): {}".format(
                        len(toklist), toklist))
                    self.logger.debug("catlist with (len: {}): {}".format(
                        len(catlist), catlist))
                    sent = Sentence()
                    for i, tok in enumerate(toklist):
                        flair_tok = Token(str(tok),
                                          start_position=len_last_token)
                        len_last_token += len(tok) + 1
                        flair_tok.add_tag("cat", str(catlist[i]))
                        sent.add_token(flair_tok)
                    if len(sent.tokens) > 0:
                        sent_list.append(sent)
                    len_last_token = 0
                    toklist = []
                    catlist = []

        self.logger.debug("toklist: {}, curr_sent_tok: {}".format(
            len(toklist), len(curr_sentence_tok)))
        # if the loop is complete, empty the buffers and add them to the list
        if len(curr_sentence_tok) > 0:
            toklist.extend(curr_sentence_tok)
            catlist.extend(curr_sentence_cat)
            sent = Sentence()
            for i, tok in enumerate(toklist):
                flair_tok = Token(str(tok), start_position=len_last_token)
                len_last_token += len(tok) + 1
                flair_tok.add_tag("cat", str(catlist[i]))
                sent.add_token(flair_tok)
            if len(sent.tokens) > 0:
                sent_list.append(sent)
            len_last_token = 0

        return sent_list
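For reference, a hypothetical DataFrame in the layout this method expects (one token per row; 'sentstart' marks sentence-initial tokens, the compare column holds the tag, and a literal "EOF" token marks a file boundary):

import pandas as pd

data = pd.DataFrame({
    'tok':       ['John', 'lives', 'here', '.', 'EOF'],
    'sentstart': ['yes', 'no', 'no', 'no', 'no'],
    'cat':       ['B-PER', 'O', 'O', 'O', '-'],
})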