Example #1
def test_sentiment_in_sentence() -> None:
    stanza.download("en", processors="tokenize, sentiment")
    nlp = stanza.Pipeline(lang="en", processors="tokenize", use_gpu=False)
    # without sentiment
    for document in stanza_batch.batch([EXAMPLE_ONE], nlp):
        for sentence in document.sentences:
            with pytest.raises(AttributeError):
                sentence.sentiment
    # with sentiment
    nlp = stanza.Pipeline(lang="en",
                          processors="tokenize, sentiment",
                          use_gpu=False)
    for document in stanza_batch.batch([EXAMPLE_ONE], nlp):
        for sentence in document.sentences:
            assert isinstance(sentence.sentiment, int)
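For readers new to the library, the pattern these examples exercise is iterating stanza_batch.batch over a list of raw strings. A minimal sketch, assuming the stanza and stanza-batch packages are installed and the English models have been downloaded:

import stanza
import stanza_batch

# Minimal usage sketch: batch-process a list of raw texts and print each
# sentence together with its predicted sentiment class.
nlp = stanza.Pipeline(lang="en", processors="tokenize,sentiment", use_gpu=False)
texts = ["Hello how are you", "This is a second document. It has two sentences."]
for document in stanza_batch.batch(texts, nlp, batch_size=32):
    for sentence in document.sentences:
        print(" ".join(token.text for token in sentence.tokens), sentence.sentiment)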
Example #2
    def load_document(self, input, language):
        if isinstance(input, str):
            # Plain-text input: split on newlines and tokenize each
            # non-empty paragraph with the stanza pipeline.
            paragraphs = input.split('\n')
            tokenized_paragraphs = [
                self.nlp(p) for p in paragraphs if p.strip()
            ]
        elif isinstance(input, list):
            # Pre-split input: each item is a paragraph given as a list of
            # sentence strings. Flatten the sentences, process them in
            # batches through stanza, then regroup them by paragraph.
            paragraphs = [' '.join(para) for para in input]
            sentence_list, sentence_paragraph = flat_article(input)
            sentence_nlp = [
                doc.sentences[0]
                for doc in batch(sentence_list, self.nlp, batch_size=32)
            ]
            article_rebuilt = unflat_article(sentence_nlp, sentence_paragraph)
            tokenized_paragraphs = [
                SimpleNamespace(sentences=para) for para in article_rebuilt
            ]
        else:
            raise NotImplementedError()

        # tokenizing, lemmatization, ...

        return {
            'paragraphs': paragraphs,
            'tokenized_paragraphs': tokenized_paragraphs,
            'language': language
        }
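flat_article and unflat_article are project-specific helpers that are not part of this excerpt. The sketch below is a hypothetical reconstruction of what load_document appears to assume: flattening a list of paragraphs (each a list of sentence strings) into one sentence list plus parallel paragraph indices, and regrouping the processed sentences afterwards.

# Hypothetical sketch of the helpers assumed by load_document above; the real
# implementations live elsewhere in the project.
def flat_article(article):
    # article: list of paragraphs, each paragraph a list of sentence strings.
    sentence_list, sentence_paragraph = [], []
    for paragraph_index, paragraph in enumerate(article):
        for sentence in paragraph:
            sentence_list.append(sentence)
            sentence_paragraph.append(paragraph_index)
    return sentence_list, sentence_paragraph


def unflat_article(sentences, sentence_paragraph):
    # Regroup the processed sentences into their original paragraphs.
    rebuilt = [[] for _ in range(max(sentence_paragraph) + 1)] if sentence_paragraph else []
    for sentence, paragraph_index in zip(sentences, sentence_paragraph):
        rebuilt[paragraph_index].append(sentence)
    return rebuilt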
Example #3
def freq_dict(data_texts: list, tagging_method, nlp):
    vectors = []
    if tagging_method == 'SPACY':
        for text in data_texts:
            vector_new_tags = prep_text_new_tags(text)
            doc = nlp(" ".join(vector_new_tags[0]))

            # Count the part-of-speech tags spaCy assigns to each token.
            tag_counts = Counter(token.pos_ for token in doc)
            vector = vector_freq(tag_counts, vector_new_tags)
            vectors.append(vector)
    elif tagging_method == 'STANZA':
        list_stza = doc_withing_double_space(data_texts)
        # https://pypi.org/project/stanza-batch/
        for doc in batch(list_stza, nlp, batch_size=100):
            vector_new_tags = prep_text_new_tags(doc.text)
            if isinstance(vector_new_tags[0], list):
                text = " ".join(vector_new_tags[0])
            else:
                text = vector_new_tags[0]
            sentence = join_sentences(text, nlp)
            # Count the universal POS tags stanza assigns to each word.
            tag_counts = Counter(word.upos for word in sentence)
            vector = vector_freq(tag_counts, vector_new_tags)
            vectors.append(vector)
    else:
        raise ValueError(
            "You must enter a valid tagging method; see TaggingMethod.")
    return vectors
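prep_text_new_tags, vector_freq, doc_withing_double_space and join_sentences are project-specific helpers that are not shown here. Stripped of them, the STANZA branch reduces to counting universal POS tags per batched document, roughly as in this sketch (assuming nlp is a stanza.Pipeline with the tokenize and pos processors):

from collections import Counter
from stanza_batch import batch

# Sketch: count universal POS tags for each text processed through stanza-batch.
def pos_frequencies(texts, nlp):
    counts = []
    for doc in batch(texts, nlp, batch_size=100):
        counts.append(Counter(word.upos
                              for sentence in doc.sentences
                              for word in sentence.words))
    return counts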
Example #4
def test_ents_attribute_in_doc_and_sentence(include_ner: bool) -> None:
    entity_document = ["Two entities Alice and Bob"]
    processes = "tokenize"
    if include_ner:
        processes = "tokenize, ner"
    stanza.download("en", processors=processes)
    nlp = stanza.Pipeline(lang="en", processors=processes, use_gpu=False)
    for document in stanza_batch.batch(entity_document, nlp):
        if include_ner:
            assert document.ents
        else:
            assert not document.ents
        for sentence in document.sentences:
            if include_ner:
                assert sentence.ents
            else:
                assert not sentence.ents
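The include_ner argument is presumably supplied by a pytest parametrization that is not part of the excerpt; a hypothetical decorator consistent with the signature:

import pytest

# Hypothetical parametrization assumed by the test above.
@pytest.mark.parametrize("include_ner", [True, False])
def test_ents_attribute_in_doc_and_sentence(include_ner: bool) -> None:
    ...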
Example #5
    nlp = stanza.Pipeline(lang="en",
                          processors="tokenize,pos,sentiment",
                          use_gpu=True)

    book_data: List[str] = []
    test_data_dir = Path(__file__, "..", "tests", "data").resolve()
    with Path(test_data_dir,
              "jane_austin_emma_data.txt").open("r") as emma_file:
        book_data = [line for line in emma_file]
    assert len(book_data) == 490

    t = time()
    gpu_memory_used: List[float] = []
    processed_book_data: List[Document] = []
    for document in stanza_batch.batch(book_data,
                                       nlp,
                                       clear_cache=args.clear_cache):
        processed_book_data.append(document)
        if args.save_fp:
            # assuming the first GPU is the one being used.
            gpu_memory_used.append(GPUtil.getGPUs()[0].memoryUsed)
    print(f'Time taken: {time() - t}')

    if args.save_fp:
        number_documents_processed = range(len(processed_book_data))
        plt.plot(number_documents_processed, gpu_memory_used)
        plt.xlabel('Number of documents processed')
        plt.ylabel('GPU Memory used (MB)')
        plt.grid(True)
        plt.savefig(str(args.save_fp))
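The snippet refers to args.clear_cache and args.save_fp without showing the argument parser. A hypothetical parser consistent with those attributes (the flag names are assumptions):

import argparse
from pathlib import Path

# Hypothetical command-line interface for the benchmark script above.
parser = argparse.ArgumentParser(description="Benchmark stanza_batch GPU memory usage.")
parser.add_argument("--clear-cache", dest="clear_cache", action="store_true",
                    help="Clear the GPU cache after each stanza batch.")
parser.add_argument("--save-fp", dest="save_fp", type=Path, default=None,
                    help="Where to save the GPU memory usage plot.")
args = parser.parse_args()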
Example #6
def test_batch(clear_cache: bool, torch_no_grad: bool) -> None:
    stanza.download("en", processors="tokenize")
    nlp = stanza.Pipeline(lang="en",
                          processors="tokenize, sentiment",
                          use_gpu=False)
    # One sample
    count = 0
    for document in stanza_batch.batch([EXAMPLE_ONE],
                                       nlp,
                                       clear_cache=clear_cache,
                                       torch_no_grad=torch_no_grad):
        count += 1
        # This process removes the \n on either side of the string
        assert document.text == "Hello how are you"
        assert document.sentences[0].tokens[0].start_char == 0
        assert document.sentences[-1].tokens[-1].end_char == 17
        document_text = document.text
        for sentence in document.sentences:
            for token in sentence.tokens:
                assert (document_text[token.start_char:token.end_char] ==
                        token.text)
            assert isinstance(sentence.sentiment, int)
    assert count == 1
    # One sample where the sample is split into three due to `\n\n` in the
    # middle of the string.
    count = 0
    for document in stanza_batch.batch(
        [EXAMPLE_FOUR],
            nlp,
            clear_cache=clear_cache,
            torch_no_grad=torch_no_grad,
    ):
        count += 1
        # This process removes the `\n \n\n` and adds `\n\n` in its place.
        assert (
            document.text ==
            "Hello how are you. Great Thanks\n\nSomething else\n\nAnother test"
        )
        assert document.sentences[0].tokens[0].start_char == 0
        assert document.sentences[-1].tokens[-1].end_char == 61
        document_text = document.text
        for sentence in document.sentences:
            for token in sentence.tokens:
                assert (document_text[token.start_char:token.end_char] ==
                        token.text)
            assert isinstance(sentence.sentiment, int)
    assert count == 1
    # Multiple samples
    text_dict = {
        0: EXAMPLE_ONE.strip(),
        1: EXAMPLE_THREE.strip(),
        2: "Hello how are you. Great Thanks\n\nSomething else\n\nAnother test",
        3: EXAMPLE_ONE.strip(),
    }
    documents = [EXAMPLE_ONE, EXAMPLE_THREE, EXAMPLE_FOUR, EXAMPLE_ONE]
    count = 0
    for index, document in enumerate(
            stanza_batch.batch(
                documents,
                nlp,
                batch_size=2,
                clear_cache=clear_cache,
                torch_no_grad=torch_no_grad,
            )):
        count += 1
        document_text = document.text
        assert document_text == text_dict[index]
        for sentence in document.sentences:
            for token in sentence.tokens:
                assert (document_text[token.start_char:token.end_char] ==
                        token.text)
            assert isinstance(sentence.sentiment, int)
    assert count == len(documents)
    # One text across 3 batches
    long_text = "\nHi\n\nNice to meet you\n   \n \nIt is a nice day\n\nBut it could be warmer\n    \nBye!\n\n \n\n"
    count = 0
    for index, document in enumerate(
            stanza_batch.batch(
                [long_text],
                nlp,
                batch_size=2,
                clear_cache=clear_cache,
                torch_no_grad=torch_no_grad,
            )):
        count += 1
        document_text = document.text
        assert (
            document_text ==
            "Hi\n\nNice to meet you\n\nIt is a nice day\n\nBut it could be warmer\n\nBye!"
        )
        for sentence in document.sentences:
            for token in sentence.tokens:
                assert (document_text[token.start_char:token.end_char] ==
                        token.text)
            assert isinstance(sentence.sentiment, int)
    assert count == 1

    # Real-world style test across a number of samples from the Jane Austen
    # book Emma.
    book_data: List[str] = []
    test_data_dir = Path(__file__, "..", "data").resolve()
    with Path(test_data_dir,
              "jane_austin_emma_data.txt").open("r") as emma_file:
        book_data = [line for line in emma_file]
    assert len(book_data) == 490

    processed_book_data: List[Document] = list(
        stanza_batch.batch(book_data,
                           nlp,
                           clear_cache=clear_cache,
                           torch_no_grad=torch_no_grad))
    assert len(book_data) == len(processed_book_data)
    for true_data, processed_data in zip(book_data, processed_book_data):
        processed_text = processed_data.text
        assert true_data.strip() == processed_text

        for sentence in processed_data.sentences:
            for token in sentence.tokens:
                assert (processed_text[token.start_char:token.end_char] ==
                        token.text)
            assert isinstance(sentence.sentiment, int)
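As in Example #4, the clear_cache and torch_no_grad arguments presumably arrive through pytest parametrization; a hypothetical set-up consistent with the signature:

import pytest

# Hypothetical parametrization assumed by test_batch above.
@pytest.mark.parametrize("torch_no_grad", [True, False])
@pytest.mark.parametrize("clear_cache", [True, False])
def test_batch(clear_cache: bool, torch_no_grad: bool) -> None:
    ...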
Example #7
def test_misc_in_change_offsets() -> None:
    """
    Tests that we check whether "misc" exists as a key in each token within the
    change_offsets functions. The change_offsets functions are within
    the combine_stanza_documents and _batch_to_documents functions.
    """
    # For the _batch_to_documents function
    stanza.download("cs", processors="tokenize,mwt")
    nlp = stanza.Pipeline(lang="cs", processors="tokenize,mwt", use_gpu=False)
    sent = "Požádal, aby mu vyhověli."
    correct_tokens = [
        [{
            "id": 1,
            "text": "Požádal",
            "misc": "start_char=0|end_char=7"
        }],
        [{
            "id": 2,
            "text": ",",
            "misc": "start_char=7|end_char=8"
        }],
        [
            {
                "id": (3, 4),
                "text": "aby",
                "misc": "start_char=9|end_char=12"
            },
            {
                "id": 3,
                "text": "aby"
            },
            {
                "id": 4,
                "text": "by"
            },
        ],
        [{
            "id": 5,
            "text": "mu",
            "misc": "start_char=13|end_char=15"
        }],
        [{
            "id": 6,
            "text": "vyhověli",
            "misc": "start_char=16|end_char=24"
        }],
        [{
            "id": 7,
            "text": ".",
            "misc": "start_char=24|end_char=25"
        }],
    ]
    docs = list(stanza_batch.batch([sent], nlp))
    assert len(docs) == 1
    for index, token in enumerate(docs[0].iter_tokens()):
        correct_token = correct_tokens[index]
        temp_token = token.to_dict()
        assert len(temp_token) == len(correct_token)
        assert temp_token == correct_token
    # For the combine_stanza_documents function
    doc = stanza_batch.combine_stanza_documents(docs)
    for index, token in enumerate(doc.iter_tokens()):
        correct_token = correct_tokens[index]
        temp_token = token.to_dict()
        assert len(temp_token) == len(correct_token)
        assert temp_token == correct_token
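The misc strings above encode character offsets as key=value pairs separated by "|". A small sketch of a hypothetical helper (not part of stanza-batch) that parses such a field back into integer offsets:

# Hypothetical helper: pull the start/end character offsets out of a token's
# "misc" string, e.g. "start_char=0|end_char=7".
def char_offsets(misc: str):
    fields = dict(item.split("=", 1) for item in misc.split("|"))
    return int(fields["start_char"]), int(fields["end_char"])

assert char_offsets("start_char=9|end_char=12") == (9, 12)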