Example #1
    def _get_tars_formatted_sentence(self, label, sentence):

        label = self._clean(label)

        original_text = sentence.to_tokenized_string()

        label_text_pair = (f"{label} {self.separator} {original_text}"
                           if self.prefix else
                           f"{original_text} {self.separator} {label}")

        sentence_labels = [
            self._clean(label.value)
            for label in sentence.get_labels(self.get_current_label_type())
        ]

        tars_label = self.LABEL_MATCH if label in sentence_labels else self.LABEL_NO_MATCH

        tars_sentence = Sentence(label_text_pair,
                                 use_tokenizer=False).add_label(
                                     self.static_label_type, tars_label)

        return tars_sentence
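
The method above boils down to a few core Flair calls: building a Sentence without re-tokenizing and attaching a label of a given type. A minimal standalone sketch of that label-text pair construction, using only the public Sentence API (the separator string and label type below are illustrative placeholders, not values from the original class):

from flair.data import Sentence

# Illustrative placeholders for the class attributes used in the method above.
separator = "[SEP]"
static_label_type = "tars_label"

label = "sports"
original_text = "The match ended in a draw ."

# Build the "<label> [SEP] <text>" pair and mark it as a positive (matching) example.
pair = Sentence(f"{label} {separator} {original_text}", use_tokenizer=False)
pair.add_label(static_label_type, "YES")

print(pair)
print(pair.get_labels(static_label_type))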
Example #2
    def transform(self, X, y=None, **kwargs):
        """
        Transforms the data according to what happened in the fit method.
        :param X: features - Dataframe
        :param y: target vector - Series
        :param kwargs: free parameters - dictionary
        :return: X: the transformed data - Dataframe
        """

        X = X['text']

        dataset_hash = hash(str(X) + str(self.embedder.__dict__))
        if dataset_hash in self.dataset_cache:
            return self.dataset_cache[dataset_hash]
        else:
            embeddings = []

            for first in trange(0, len(X), self.batch_size):
                subset = X[first:first + self.batch_size]
                sentences = []
                for element in subset:
                    sentence = Sentence(element)
                    # sentence.tokens = sentence.tokens[:200]
                    sentences.append(sentence)

                self.embedder.embed(sentences)
                for sentence in sentences:
                    key = sentence.to_original_text()
                    if key in self.vector_cache.keys():
                        vector = self.vector_cache[key]
                    else:
                        vector = sentence.get_embedding().cpu().detach().numpy(
                        )
                        self.vector_cache[key] = vector
                    embeddings.append(vector)

            embedding_dataset = numpy.vstack(embeddings)
            self.dataset_cache[dataset_hash] = embedding_dataset
            return embedding_dataset
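
Stripped of the two caching layers, the transform above embeds each text and stacks the vectors into one matrix. A minimal sketch of that core loop, assuming a pooled GloVe document embedder rather than whatever self.embedder holds in the original pipeline:

import numpy
from flair.data import Sentence
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

# Any Flair document embedder would do; pooled GloVe keeps the sketch small.
embedder = DocumentPoolEmbeddings([WordEmbeddings("glove")])

texts = ["flair makes embeddings easy", "each text becomes one vector"]

embeddings = []
for text in texts:
    sentence = Sentence(text)
    embedder.embed(sentence)
    embeddings.append(sentence.get_embedding().cpu().detach().numpy())

embedding_dataset = numpy.vstack(embeddings)
print(embedding_dataset.shape)  # (2, embedding dimension)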
Example #3
    def _get_pos(self, before_ctx, word, after_ctx):
        context_words = before_ctx + [word] + after_ctx
        context_key = " ".join(context_words)
        if context_key in self._pos_tag_cache:
            word_list, pos_list = self._pos_tag_cache[context_key]
        else:
            if self.tagger_type == "nltk":
                word_list, pos_list = zip(
                    *nltk.pos_tag(context_words, tagset=self.tagset))

            if self.tagger_type == "flair":
                context_key_sentence = Sentence(context_key)
                self._flair_pos_tagger.predict(context_key_sentence)
                word_list, pos_list = zip_flair_result(context_key_sentence)

            self._pos_tag_cache[context_key] = (word_list, pos_list)

        # idx of `word` in `context_words`
        idx = len(before_ctx)
        assert word_list[
            idx] == word, "POS list not matched with original word list."
        return pos_list[idx]
Example #4
def add_Chunktag_and_tokenize(text, e1, e2, tagger):
    """
    Inputs: raw text, the whitespace-token indices of the two entities (e1, e2) and a Flair chunk tagger
    Outputs: the chunk-tagged token list and the adjusted indices of the two entity tokens
    """
    punctuation = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'
    tokens = text.split(' ')
    key_token1 = tokens[e1]
    key_token2 = tokens[e2]
    if tokens[e1][-1] != "'" or tokens[e1][-2] != 's':
        key_token1 = tokens[e1].split("'")[0].translate(str.maketrans('', '', punctuation))
    if tokens[e2][-1] != "'" or tokens[e2][-2] != 's':
        key_token2 = tokens[e2].split("'")[0].translate(str.maketrans('', '', punctuation))
    Sentence_object = Sentence(text, use_tokenizer=True)
    tagger.predict(Sentence_object)
    new_text = Sentence_object.to_tagged_string()
    new_tokens = new_text.split(' ')
    offset1 = new_tokens[e1:].index(key_token1)
    offset2 = new_tokens[e2:].index(key_token2)
    new_e1 = e1 + offset1
    new_e2 = e2 + offset2
    return new_tokens, new_e1, new_e2
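
A possible way to call this helper, assuming an older Flair release in which to_tagged_string() interleaves chunk tags with the tokens (the whole function depends on that format); the pretrained 'chunk' model is downloaded on first use:

from flair.models import SequenceTagger

tagger = SequenceTagger.load('chunk')  # pretrained English chunking model

text = "The quick brown fox jumps over the lazy dog"
# Entity tokens at whitespace positions 3 ("fox") and 8 ("dog").
new_tokens, new_e1, new_e2 = add_Chunktag_and_tokenize(text, 3, 8, tagger)
print(new_tokens)
print(new_tokens[new_e1], new_tokens[new_e2])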
Example #5
    def __getitem__(self, index: int = 0) -> Sentence:
        if self.in_memory:
            return self.sentences[index]
        else:
            row = self.raw_data[index]

            text = " ".join(
                [row[text_column] for text_column in self.text_columns])

            if self.max_chars_per_doc > 0:
                text = text[:self.max_chars_per_doc]

            sentence = Sentence(text, use_tokenizer=self.tokenizer)
            for column in self.column_name_map:
                if self.column_name_map[column].startswith(
                        "label") and row[column]:
                    sentence.add_label(self.label_type, row[column])

            if 0 < self.max_tokens_per_doc < len(sentence):
                sentence.tokens = sentence.tokens[:self.max_tokens_per_doc]

            return sentence
Example #6
def test_span_tags():

    # set 3 labels for 2 spans (HU is tagged twice)
    sentence = Sentence(
        "Humboldt Universität zu Berlin is located in Berlin .")
    sentence[0:4].add_label("ner", "Organization")
    sentence[0:4].add_label("ner", "University")
    sentence[7:8].add_label("ner", "City")

    # check if there are three labels with correct text and values
    labels: List[Label] = sentence.get_labels("ner")
    assert 3 == len(labels)
    assert "Humboldt Universität zu Berlin" == labels[0].data_point.text
    assert "Organization" == labels[0].value
    assert "Humboldt Universität zu Berlin" == labels[1].data_point.text
    assert "University" == labels[1].value
    assert "Berlin" == labels[2].data_point.text
    assert "City" == labels[2].value

    # check if there are two spans with correct text and values
    spans: List[Span] = sentence.get_spans("ner")
    assert 2 == len(spans)
    assert "Humboldt Universität zu Berlin" == spans[0].text
    assert 2 == len(spans[0].get_labels("ner"))
    assert "Berlin" == spans[1].text
    assert "City" == spans[1].get_label("ner").value

    # now delete the NER tags of "Humboldt-Universität zu Berlin"
    sentence[0:4].remove_labels("ner")
    # should be only one NER label left
    labels: List[Label] = sentence.get_labels("ner")
    assert 1 == len(labels)
    assert "Berlin" == labels[0].data_point.text
    assert "City" == labels[0].value
    # and only one NER span
    spans: List[Span] = sentence.get_spans("ner")
    assert 1 == len(spans)
    assert "Berlin" == spans[0].text
    assert "City" == spans[0].get_label("ner").value
Example #7
def test_create_sentence_using_scispacy_tokenizer():
    sentence: Sentence = Sentence(
        "Spinal and bulbar muscular atrophy (SBMA) is an inherited motor neuron",
        use_tokenizer=SciSpacyTokenizer(),
    )

    assert 13 == len(sentence.tokens)
    assert "Spinal" == sentence.tokens[0].text
    assert "and" == sentence.tokens[1].text
    assert "bulbar" == sentence.tokens[2].text
    assert "muscular" == sentence.tokens[3].text
    assert "atrophy" == sentence.tokens[4].text
    assert "(" == sentence.tokens[5].text
    assert "SBMA" == sentence.tokens[6].text
    assert ")" == sentence.tokens[7].text
    assert "is" == sentence.tokens[8].text
    assert "an" == sentence.tokens[9].text
    assert "inherited" == sentence.tokens[10].text
    assert "motor" == sentence.tokens[11].text
    assert "neuron" == sentence.tokens[12].text

    assert 0 == sentence.tokens[0].start_pos
    assert 7 == sentence.tokens[1].start_pos
    assert 11 == sentence.tokens[2].start_pos
    assert 18 == sentence.tokens[3].start_pos
    assert 27 == sentence.tokens[4].start_pos
    assert 35 == sentence.tokens[5].start_pos
    assert 36 == sentence.tokens[6].start_pos
    assert 40 == sentence.tokens[7].start_pos
    assert 42 == sentence.tokens[8].start_pos
    assert 45 == sentence.tokens[9].start_pos
    assert 48 == sentence.tokens[10].start_pos
    assert 58 == sentence.tokens[11].start_pos
    assert 64 == sentence.tokens[12].start_pos

    assert sentence.tokens[4].whitespace_after
    assert not sentence.tokens[5].whitespace_after
    assert not sentence.tokens[6].whitespace_after
    assert sentence.tokens[7].whitespace_after
Example #8
def test_train_language_model(results_base_path, resources_path):
    # get default dictionary
    dictionary: Dictionary = Dictionary.load("chars")

    # init forward LM with 128 hidden states and 1 layer
    language_model: LanguageModel = LanguageModel(dictionary,
                                                  is_forward_lm=True,
                                                  hidden_size=128,
                                                  nlayers=1)

    # get the example corpus and process at character level in forward direction
    corpus: TextCorpus = TextCorpus(
        resources_path / "corpora/lorem_ipsum",
        dictionary,
        language_model.is_forward_lm,
        character_level=True,
    )

    # train the language model
    trainer: LanguageModelTrainer = LanguageModelTrainer(language_model,
                                                         corpus,
                                                         test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)

    # use the character LM as embeddings to embed the example sentence 'I love Berlin'
    char_lm_embeddings: TokenEmbeddings = FlairEmbeddings(
        str(results_base_path / "best-lm.pt"))
    sentence = Sentence("I love Berlin")
    char_lm_embeddings.embed(sentence)

    text, likelihood = language_model.generate_text(number_of_characters=100)
    assert text is not None
    assert len(text) >= 100

    # clean up results directory
    shutil.rmtree(results_base_path, ignore_errors=True)
Example #9
def benchmark_flair_mdl():
    tagger = load_flair_ner_model()

    start = time.time()

    flair_sentences = []
    for i, sentence in enumerate(sentences_tokens):
        flair_sentence = Sentence()

        for token_txt in sentence:
            flair_sentence.add_token(Token(token_txt))
        flair_sentences.append(flair_sentence)

    tagger.predict(flair_sentences, verbose=True)
    predictions = [[tok.tags['ner'].value for tok in fs]
                   for fs in flair_sentences]
    print('Flair:')
    print_speed_performance(start, num_sentences, num_tokens)

    assert len(predictions) == num_sentences

    print(f1_report(sentences_entities, remove_miscs(predictions), bio=True))
Example #10
def build_flair_sentences(d: Dict) -> List[Sentence]:
    def prefix_to_BIOES(label, start, end, current_index):
        if end - start > 0:
            if current_index == start:
                prefix = 'B'
            elif current_index == end:
                prefix = 'E'
            else:
                prefix = 'I'
        else:
            prefix = 'S'

        return prefix + '-' + label

    def tag_it(token: Token, index, ner_spans):
        labels = [(start, end, label) for start, end, label in ner_spans
                  if index >= start and index <= end]

        if len(labels) > 0:
            for start, end, label in labels:
                token.add_tag(TAG_TYPE,
                              prefix_to_BIOES(label, start, end, index))
        else:
            token.add_tag(TAG_TYPE, 'O')

    offset = 0
    sentences = []
    for tokens, ner_spans in zip(d['sentences'], d['ner']):
        sentence: Sentence = Sentence()
        for tok in tokens:
            sentence.add_token(Token(tok))
        for k, token in enumerate(sentence):
            tag_it(token, k + offset, ner_spans)
        offset += len(tokens)
        sentences.append(sentence)

    return sentences
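
The BIOES prefixing in the nested helper is easy to sanity-check in isolation; the sketch below lifts the same logic out so it can be run on a multi-token span and a single-token span:

def prefix_to_BIOES(label, start, end, current_index):
    # Same logic as the nested helper above, lifted out so it can be tested directly.
    if end - start > 0:
        if current_index == start:
            prefix = 'B'
        elif current_index == end:
            prefix = 'E'
        else:
            prefix = 'I'
    else:
        prefix = 'S'
    return prefix + '-' + label

# A span covering token indices 3..5 labelled ORG, plus a single-token span.
print([prefix_to_BIOES('ORG', 3, 5, i) for i in (3, 4, 5)])  # ['B-ORG', 'I-ORG', 'E-ORG']
print(prefix_to_BIOES('PER', 7, 7, 7))                       # 'S-PER'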
Example #11
def input_fn(request_body, content_type=JSON_CONTENT_TYPE):
    """
    This method is called by SageMaker for every inference request. It handles the INPUT DATA sent to the model/endpoint.

    This method needs to deserialze the invoke request body into an object we can perform prediction on.
    
    It currently natively supports serialized form "application/json" format only. 
   
    However, this can be easily extended as show below (i.e. text/csv)
    
    Returns:
      - formatted data used for prediction. The return value of this function is passed to predict_fn
    """

    # If the request is submitted/serialized as application/json
    if content_type.lower() == JSON_CONTENT_TYPE:
        inference_text = json.loads(request_body)

    # If the request is submitted/serialized as text/csv
    elif content_type.lower() == CSV_CONTENT_TYPE:
        inference_text = request_body

    else:
        raise ValueError(
            f"Format {content_type} is not supported. Please use one of {[JSON_CONTENT_TYPE, CSV_CONTENT_TYPE]}"
        )

    # Turning the request_body into a FLAIR sentence
    try:
        input_object = Sentence(inference_text)

    except Exception:
        logging.exception(
            "Converting inference text to FLAIR sentence failed.")
        raise

    logging.info("Input deserialization successfully completed.")

    return input_object
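
A hedged usage sketch of this handler outside SageMaker; JSON_CONTENT_TYPE and CSV_CONTENT_TYPE are module-level constants not shown above, so the values here are assumptions based on their names:

import json

# Assumed values for the constants referenced by input_fn.
JSON_CONTENT_TYPE = "application/json"
CSV_CONTENT_TYPE = "text/csv"

body = json.dumps("George Washington went to Washington.")
sentence = input_fn(body, content_type=JSON_CONTENT_TYPE)
print(sentence)  # a flair.data.Sentence, ready to be passed to predict_fn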
Example #12
def prepare_output(text: str,
                   tagger: SequenceTagger,
                   word_tokenizer=None,
                   output_type: str = "pseudonymized"):
    stats_dict = {}
    with sw.timer("root"):
        if not word_tokenizer:
            tokenizer = MOSES_TOKENIZER
        else:
            tokenizer = word_tokenizer

        # text = [t.strip() for t in text.split("\n") if t.strip()]
        text_sentences = [
            Sentence(t.strip(), use_tokenizer=tokenizer)
            for t in text.split("\n") if t.strip()
        ]
        with sw.timer('model_annotation'):
            tagger.predict(
                sentences=text_sentences,
                mini_batch_size=32,
                embedding_storage_mode="none",
                # use_tokenizer=tokenizer,
                verbose=True)

        if output_type == "conll":
            api_output, tags_stats = create_conll_output(
                sentences_tagged=text_sentences)
        elif output_type == "tagged":
            api_output, tags_stats = create_tagged_text(
                sentences_tagged=text_sentences)
        elif output_type == "pseudonymized":
            api_output, tags_stats = create_pseudonymized_text(
                sentences_tagged=text_sentences)

        # deal with stats
        stats_dict["nb_analyzed_sentences"] = len(text)
        stats_dict.update(tags_stats)
        return api_output, stats_dict
Example #13
    def add_entity_markers(self, sentence, span_1, span_2):

        text = ""

        entity_one_is_first = None
        offset = 0
        for token in sentence:
            if token == span_2[0]:
                if entity_one_is_first is None: entity_one_is_first = False
                offset += 1
                text += " <e2>"
                span_2_startid = offset
            if token == span_1[0]:
                offset += 1
                text += " <e1>"
                if entity_one_is_first is None: entity_one_is_first = True
                span_1_startid = offset

            text += " " + token.text

            if token == span_1[-1]:
                offset += 1
                text += " </e1>"
                span_1_stopid = offset
            if token == span_2[-1]:
                offset += 1
                text += " </e2>"
                span_2_stopid = offset

            offset += 1

        expanded_sentence = Sentence(text, use_tokenizer=False)

        expanded_span_1 = Span([expanded_sentence[span_1_startid - 1]])
        expanded_span_2 = Span([expanded_sentence[span_2_startid - 1]])

        return expanded_sentence, (expanded_span_1, expanded_span_2) \
            if entity_one_is_first else (expanded_span_2, expanded_span_1)
Example #14
def test_html_rendering():
    text = (
        "Boris Johnson has been elected new Conservative leader in a ballot of party members and will become the "
        "next UK prime minister. &"
    )
    sent = Sentence()
    sent.get_spans = MagicMock()
    sent.get_spans.return_value = [
        mock_ner_span(text, "PER", 0, 13),
        mock_ner_span(text, "MISC", 35, 47),
        mock_ner_span(text, "LOC", 109, 111),
    ]
    sent.to_original_text = MagicMock()
    sent.to_original_text.return_value = text
    colors = {
        "PER": "#F7FF53",
        "ORG": "#E8902E",
        "LOC": "yellow",
        "MISC": "#4647EB",
        "O": "#ddd",
    }
    actual = render_ner_html([sent], colors=colors)

    expected_res = HTML_PAGE.format(
        text=PARAGRAPH.format(
            sentence=TAGGED_ENTITY.format(
                color="#F7FF53", entity="Boris Johnson", label="PER"
            )
            + " has been elected new "
            + TAGGED_ENTITY.format(color="#4647EB", entity="Conservative", label="MISC")
            + " leader in a ballot of party members and will become the next "
            + TAGGED_ENTITY.format(color="yellow", entity="UK", label="LOC")
            + " prime minister. &amp;"
        ),
        title="Flair",
    )

    assert expected_res == actual
Example #15
    def read_text_classification_file(path_to_file: Union[str, Path], max_tokens_per_doc=-1, use_tokenizer=True) -> \
            Iterable[Sentence]:
        """
        Reads a data file for text classification. The file should contain one document/text per line.
        The line should have the following format:
        __label__<class_name> <text>
        If you have a multi class task, you can have as many labels as you want at the beginning of the line, e.g.,
        __label__<class_name_1> __label__<class_name_2> <text>
        :param path_to_file: the path to the data file
        :param max_tokens_per_doc: Take only documents that contain number of tokens less or equal to this value. If
        set to -1 all documents are taken.
        :return: list of sentences
        """
        label_prefix = '__label__'

        with open(str(path_to_file), encoding='utf-8') as f:
            for line in f:
                words = line.split()

                labels = []
                l_len = 0

                for i in range(len(words)):
                    if words[i].startswith(label_prefix):
                        l_len += len(words[i]) + 1
                        label = words[i].replace(label_prefix, "")
                        labels.append(label)
                    else:
                        break

                text = line[l_len:].strip()

                if text and labels:
                    sentence = Sentence(text, labels=labels, use_tokenizer=use_tokenizer)
                    if len(sentence) > max_tokens_per_doc and max_tokens_per_doc > 0:
                        sentence.tokens = sentence.tokens[:max_tokens_per_doc]
                    if len(sentence.tokens) > 0:
                        yield sentence
Example #16
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence("used in training. training is cool.",
                              use_tokenizer=segtok_tokenizer)

    corpus: Corpus = Corpus([train_sentence], [], [])

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert 7 == len(vocab)

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()
Example #17
def chunk_text(file):
    with open(file, 'r', encoding='utf-8', errors='ignore') as f:
        data = json.load(f)

    tokens_list = []
    tags_list = []

    tagger = SequenceTagger.load('chunk')
    for inst in data:
        reviews = inst['reviews']

        for review in reviews:
            tokens = ["<s>"] + review.split() + ["</s>"]
            tokens_list.append(tokens)

            sentence = Sentence(review)
            tagger.predict(sentence)
            tags = [token.annotation_layers['np'][0].value for token in sentence]

            tags_list.append(tags)

    return tokens_list, tags_list
Example #18
    def predict(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
        mini_batch_size: int = 32,
        **kwargs,
    ) -> List[Sentence]:
        """Predict method for running inference using the pre-trained token tagger model

        * **text** - String, list of strings, sentences, or list of sentences to run inference on
        * **mini_batch_size** - Mini batch size
        * **&ast;&ast;kwargs**(Optional) - Optional arguments for the Flair tagger
        """

        if isinstance(text, (Sentence, str)):
            text = [text]
        if isinstance(text[0], str):
            text = [Sentence(s) for s in text]
        self.tagger.predict(
            sentences=text,
            mini_batch_size=mini_batch_size,
            **kwargs,
        )
        return text
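
The same normalization can be exercised against a plain SequenceTagger; a standalone sketch, assuming the pretrained 'ner' model (downloaded on first use) stands in for self.tagger:

from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("ner")  # pretrained English NER model

def predict_any(text, mini_batch_size=32):
    # Same input normalization as the method above: accept str, Sentence, or lists of either.
    if isinstance(text, (Sentence, str)):
        text = [text]
    if isinstance(text[0], str):
        text = [Sentence(s) for s in text]
    tagger.predict(sentences=text, mini_batch_size=mini_batch_size)
    return text

for sentence in predict_any(["George Washington went to Washington .", "Berlin is in Germany ."]):
    print(sentence.get_spans("ner"))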
Example #19
    def ner_analysis(ner_library, ner_model, text):
        ner_analysis_data = None
        try:
            # Preprocess text
            text_filtered: str = remove_non_alphabetical_symbols(text=text)

            if ner_library == "spacy":
                ner_data = ner_model(text_filtered)
                if len(ner_data.ents) > 0:
                    for ent in ner_data.ents:
                        if ent.label_ == 'PER' or ent.label_ == 'PERSON':
                            ner_analysis_data = str(ent)
            elif ner_library == "flair":
                sentence = Sentence(text_filtered)
                ner_model.predict(sentence)
                for ent in sentence.get_spans("ner"):
                    if ent.tag == 'PER' and ent.score >= 0.80:
                        ner_analysis_data = ent.text
            else:
                gv.logger.warning("Unvalid NER library.")
        except Exception as e:
            gv.logger.error(e)
        return ner_analysis_data
Example #20
    def split(self, text: str) -> List[Sentence]:
        plain_sentences: List[str] = split_multi(text)
        sentence_offset = 0

        sentences: List[Sentence] = []
        for sentence in plain_sentences:
            try:
                sentence_offset = text.index(sentence, sentence_offset)
            except ValueError as error:
                raise AssertionError(
                    f"Can't find the sentence offset for sentence {repr(sentence)} "
                    f"starting from position {repr(sentence_offset)}"
                ) from error
            sentences.append(
                Sentence(
                    text=sentence,
                    use_tokenizer=self._tokenizer,
                    start_position=sentence_offset,
                ))

            sentence_offset += len(sentence)

        return sentences
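
This split() method mirrors what Flair's bundled segtok-based splitter does; a minimal sketch using the library class directly (it lives in flair.tokenization in older releases and flair.splitter in newer ones, so the import may need adjusting):

from flair.tokenization import SegtokSentenceSplitter  # flair.splitter in newer versions

splitter = SegtokSentenceSplitter()

text = "Berlin is a city. It is the capital of Germany."
for sentence in splitter.split(text):
    print(sentence)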
Example #21
def pos_tag(sentence, backend='nltk'):
    global flair_pos

    if backend == 'nltk':
        return nltk.pos_tag(sentence.split(' '))

    elif backend == 'flair':
        if flair_pos is None:
            flair_pos = SequenceTagger.load('pos')
        sentence_info = Sentence(sentence)
        flair_pos.predict(sentence_info)
        tagged = sentence_info.to_tagged_string().split(' ')
        assert(len(tagged) % 2 == 0)
        parsed = []
        for i in range(len(tagged) // 2):
            idx = i * 2
            tag = tagged[idx + 1]
            assert(tag.startswith('<') and tag.endswith('>'))
            parsed.append((tagged[idx], tag))
        return parsed

    else:
        raise ValueError('Invalid backend: {}'.format(backend))
Example #22
def parseReceipt():
    response_list = list()

    if not request.json or 'receipt' not in request.json:
        abort(400)
    receipt = request.json['receipt']

    # create a sentence per non-empty line
    lines = receipt.split('\n')
    for index, line in enumerate(lines):
        if line.strip():
            sentence = Sentence(line)

            # predict tags and print
            model.predict(sentence)

            # add prediction to response
            response_list.append(
                {f"LINE_{index}": sentence.to_dict(tag_type='ner')})

    print(response_list)
    response = response_list
    return jsonify(response), 200
Example #23
def run_ocr(image):
    """
    Runs OCR on given image and returns output with no
    newline, comma, or bar character. If OCR returns no
    output, then return that image is unreadable.
    """
    # Define config parameters
    configs = ('-l eng --oem 1 --psm 3')
    # Run tesseract OCR on image
    output_text = pytesseract.image_to_string(image, config=configs)
    text = ''
    for char in output_text:
        if char in ('\n', ',', '|'):
            text += ' '
        else:
            text += char
    readable = True
    text_boxes = pytesseract.image_to_data(image, output_type=Output.DICT, config=configs)
    try:
        text = Sentence(text)
    except Exception as ocr_error:
        readable = False
    return text, readable, text_boxes
Example #24
def test_text_classifier_single_label(tasks_base_path):
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB, tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    # clean up results directory
    shutil.rmtree('./results')
Example #25
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence("used in training. training is cool.")

    corpus: Corpus = Corpus(FlairDatapointDataset([train_sentence]),
                            sample_missing_splits=False)

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert 7 == len(vocab)

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert 3 == len(vocab)
    assert "<unk>" in vocab.get_items()
    assert "training" in vocab.get_items()
    assert "." in vocab.get_items()
Example #26
def test_train_language_model(results_base_path, resources_path):
    dictionary = Dictionary.load('chars')
    language_model = LanguageModel(dictionary,
                                   is_forward_lm=True,
                                   hidden_size=128,
                                   nlayers=1)
    corpus = TextCorpus(resources_path / 'corpora/lorem_ipsum',
                        dictionary,
                        language_model.is_forward_lm,
                        character_level=True)
    trainer = LanguageModelTrainer(language_model, corpus, test_mode=True)
    trainer.train(results_base_path,
                  sequence_length=10,
                  mini_batch_size=10,
                  max_epochs=2)
    char_lm_embeddings = FlairEmbeddings(
        str(results_base_path / 'best-lm.pt'))
    sentence = Sentence('I love Berlin')
    char_lm_embeddings.embed(sentence)
    (text, likelihood) = language_model.generate_text(number_of_characters=100)
    assert (text is not None)
    assert (len(text) >= 100)
    shutil.rmtree(results_base_path, ignore_errors=True)
Example #27
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence('used in training. training is cool.',
                              use_tokenizer='segtok')

    corpus: TaggedCorpus = TaggedCorpus([train_sentence], [], [])

    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)

    assert (3 == len(vocab))
    assert ('<unk>' in vocab.get_items())
    assert ('training' in vocab.get_items())
    assert ('.' in vocab.get_items())

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)

    assert (7 == len(vocab))

    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)

    assert (3 == len(vocab))
    assert ('<unk>' in vocab.get_items())
    assert ('training' in vocab.get_items())
    assert ('.' in vocab.get_items())
Example #28
async def sentimentAnalysis(request):

    response_data = {
        "sentence": None,
        "request_id": None,
        "NEGATIVE": 0,
        "POSITIVE": 0
    }
    try:
        input_body_json = request.json
        sentence = Sentence(input_body_json['sentence'])
        classifier.predict(sentence)
        logger.info(f'Sentiment: {sentence.labels}')
        label = sentence.labels[0]
        labscore = label.score * 100
        response_data[label.value] = labscore
        response_data[flip_value[label.value]] = 100 - labscore
        response_data['sentence'] = request.json['sentence']
        response_data['request_id'] = request.json['request_id']
        return json(body=response_data, status=200)
    except Exception as e:
        logger.error(f"{e}")
        return json(body={"detail": f"{e}"}, status=404)
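
The handler above assumes a classifier loaded elsewhere in the module; a minimal sketch of that piece, assuming Flair's pretrained 'en-sentiment' model (downloaded on first use):

from flair.data import Sentence
from flair.models import TextClassifier

classifier = TextClassifier.load('en-sentiment')

sentence = Sentence("I really enjoyed this movie.")
classifier.predict(sentence)

label = sentence.labels[0]
print(label.value, label.score * 100)  # POSITIVE or NEGATIVE with a confidence in [0, 100]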
Example #29
    def _process_sentences(self, sentences):
        sentences_emb = []
        for sentence in sentences:
            sentence = " ".join(sentence.split())
            sent = sentence
            if len(sent.strip()) == 0:
                sent = 'empty'
            try:
                sent = Sentence(sent)
                self.embedding.embed(sent)
                sentence_emb = [
                    np.array(t.embedding).astype(np.float16) for t in sent
                ]
                sentences_emb.append(np.array(sentence_emb).astype(np.float16))
            except IndexError:
                print('IndexError')
                print(sentence)
                sentence_emb = [
                    np.array(t.embedding).astype(np.float16) for t in sent
                ]
                sentences_emb.append(np.array(sentence_emb).astype(np.float16))
        sentences_emb_short = sentences_emb
        return sentences_emb_short
Example #30
def test_text_classifier_mulit_label():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentMeanEmbeddings = DocumentMeanEmbeddings(
        [glove_embedding])

    model = TextClassifier(document_embeddings, label_dict, True)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert (l.name is not None)
            assert (0.0 <= l.confidence <= 1.0)
            assert (type(l.confidence) is float)

    # clean up results directory
    shutil.rmtree('./results')