Example #1
def test_pretokenized_multidoc():
    nlp = stanza.Pipeline(processors='tokenize', dir=TEST_MODELS_DIR, lang='en',
                          tokenize_pretokenized=True)
    # Pretokenized raw text: tokens separated by spaces, sentences by newlines.
    doc = nlp(EN_DOC_PRETOKENIZED)
    assert EN_DOC_PRETOKENIZED_GOLD_TOKENS == '\n\n'.join(sent.tokens_string() for sent in doc.sentences)
    assert all(doc.text[token._start_char: token._end_char] == token.text
               for sent in doc.sentences for token in sent.tokens)
    # The same pipeline also accepts Documents built from lists of token lists.
    doc = nlp([stanza.Document([], text=EN_DOC_PRETOKENIZED_LIST)])[0]
    assert EN_DOC_PRETOKENIZED_LIST_GOLD_TOKENS == '\n\n'.join(sent.tokens_string() for sent in doc.sentences)
    assert all(doc.text[token._start_char: token._end_char] == token.text
               for sent in doc.sentences for token in sent.tokens)
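With tokenize_pretokenized=True, the tokenizer trusts the segmentation you supply instead of predicting its own. A minimal standalone sketch of the two input forms this test exercises (TEST_MODELS_DIR and the EN_DOC_* constants above come from stanza's test suite; the sketch assumes the default English models are already downloaded):

import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize', tokenize_pretokenized=True)

# Form 1: tokens separated by spaces, sentences separated by newlines.
doc = nlp('This is a test .\nAnother sentence here .')

# Form 2: a list of sentences, each a list of token strings.
doc = nlp([['This', 'is', 'a', 'test', '.'], ['Another', 'sentence', 'here', '.']])

for sent in doc.sentences:
    print([token.text for token in sent.tokens])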
Example #2
    def _tag(self, text: Union[str, List[str]]) -> List[TaggedDocument]:
        """Tag text. Accepts a single string or a list of strings and
        always returns a list of tagged documents."""

        # Wrap a bare string: iterating it directly would yield characters.
        if isinstance(text, str):
            text = [text]

        documents: List[stanza.Document] = [stanza.Document([], text=d) for d in text]

        # Passing a list of Documents runs the pipeline over them as a batch.
        tagged_documents: List[stanza.Document] = self.nlp(documents)

        if isinstance(tagged_documents, stanza.Document):
            tagged_documents = [tagged_documents]

        return [self._to_dict(d) for d in tagged_documents]
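The isinstance check above is defensive; recent stanza releases return a list of Documents when given a list. A minimal sketch of that batching pattern (TaggedDocument and _to_dict belong to the original codebase and are not reproduced here):

import stanza

# Batch several raw texts through one pipeline call.
# Assumes English models are downloaded (stanza.download('en')).
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos')

texts = ['Stanza tags text.', 'It also handles batches.']
docs = [stanza.Document([], text=t) for t in texts]

tagged = nlp(docs)  # a list of processed Documents, one per input
for doc in tagged:
    for word in doc.sentences[0].words:
        print(word.text, word.upos)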
Example #3
def test_depparse_with_pretagged_doc():
    nlp = stanza.Pipeline(processors='depparse', dir=TEST_MODELS_DIR, lang='en',
                          depparse_pretagged=True)

    # Build a Document from pretagged CoNLL-U, then run only the parser on it.
    doc = stanza.Document(CoNLL.conll2dict(input_str=EN_DOC_CONLLU_PRETAGGED))
    processed_doc = nlp(doc)

    assert EN_DOC_DEPENDENCY_PARSES_GOLD == '\n\n'.join(
        sent.dependencies_string() for sent in processed_doc.sentences)
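depparse_pretagged=True lets the parser consume a document that already carries lemmas and POS tags, skipping the tokenize/pos/lemma processors. A self-contained sketch under that assumption (the CoNLL-U snippet is illustrative, with heads left at 0 for the parser to fill in; the real EN_DOC_CONLLU_PRETAGGED constant lives in stanza's test suite):

import stanza
from stanza.utils.conll import CoNLL

# Illustrative pretagged CoNLL-U: lemma/UPOS/XPOS/feats filled in, heads at 0.
pretagged = (
    "1\tStanza\tStanza\tPROPN\tNNP\tNumber=Sing\t0\t_\t_\t_\n"
    "2\tparses\tparse\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t0\t_\t_\t_\n"
    "3\tsentences\tsentence\tNOUN\tNNS\tNumber=Plur\t0\t_\t_\t_\n"
    "4\t.\t.\tPUNCT\t.\t_\t0\t_\t_\t_\n"
)

doc = stanza.Document(CoNLL.conll2dict(input_str=pretagged))
nlp = stanza.Pipeline(lang='en', processors='depparse', depparse_pretagged=True)
parsed = nlp(doc)
print(parsed.sentences[0].dependencies_string())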
Example #4
def extract_features(writer, language, corpus, sentence_list):
    """Compute dependency-length features for each sentence and write one CSV
    row per baseline: observed, optimal, and ten random linearizations."""

    sent_id = 0  # running counter used for row ids and warnings

    for sentence in sentence_list:

        data = {}
        root = get_root(sentence)

        # First sanity check: is there a verbal root?
        if root is None:
            continue

        sentence_all = remove_punct_particles(sentence)
        sentence_open = remove_closed_class(sentence)

        # Convert back to stanza Documents for later tree creation (lazy)
        try:
            document_all = stanza.Document(CoNLL.convert_conll([sentence_all]))
            document_open = stanza.Document(
                CoNLL.convert_conll([sentence_open]))
        except Exception:
            print("WARNING: Could not parse sentence {0}".format(sent_id))
            continue

        try:
            dependency_tree_all = tree(document_all.sentences[0].dependencies)
            dependency_tree_open = tree(
                document_open.sentences[0].dependencies)
        except Exception:
            print("WARNING: Could not create tree for sentence {0}".format(sent_id))
            continue

        # Second sanity check: can we make a tree?
        if len(dependency_tree_all) == 0 or len(dependency_tree_open) == 0:
            print(root)
            # tok[1] is the CoNLL FORM column, tok[7] the DEPREL column.
            text = []
            for tok in sentence:
                text.append(tok[1])
                text.append(tok[7])
            print(text)
            print("WARNING: Dependencies empty! (sentence {0})".format(sent_id))
            sent_id += 1
            continue

        # Third sanity check: does it meet order_info requirements?
        root = get_root(sentence_all)  # Retrieve new verb index
        order_info = determine_order_from_constituents(root, sentence_all)
        if order_info is None:
            continue

        data.update({
            "language": language,
            "corpus": corpus,
            "id": "{0}_{1}".format(corpus, id),
            "original_length": len(sentence)
        })
        data.update(order_info)
        data.update(head_final(sentence_all, sentence_open))

        # Copy the shared fields: plain assignment would alias the same dict,
        # so every baseline row would end up with the last update applied.
        observed_data = dict(data)
        observed_data.update({"baseline": "observed"})
        observed_data.update(get_dep_length(sentence_all, sentence_open))

        optimal_data = dict(data)
        optimal_data.update({"baseline": "optimal"})
        optimal_data.update(
            get_optimal_dep_length(dependency_tree_all, dependency_tree_open))

        writer.writerow(observed_data)
        writer.writerow(optimal_data)
        #print(observed_data)

        for _ in range(10):
            random_data = dict(data)
            random_data.update({"baseline": "random"})
            random_data.update(
                get_random_dep_lengths(dependency_tree_all,
                                       dependency_tree_open))

            writer.writerow(random_data)
            #print(random_data)

        sent_id += 1
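For context, sentence_list is indexed like raw CoNLL rows above (tok[1] for the form, tok[7] for the deprel), which matches the list-of-token-rows format that older stanza releases expose through CoNLL.load_conll. A hypothetical driver under those assumptions (the file names and the CSV fieldnames are placeholders; the real fieldnames depend on what order_info and the *_dep_length helpers emit):

import csv
from stanza.utils.conll import CoNLL

with open('corpus.conllu', encoding='utf-8') as f:
    # Each sentence is a list of 10-field CoNLL-U token rows.
    sentence_list = CoNLL.load_conll(f)

with open('features.csv', 'w', newline='', encoding='utf-8') as out:
    writer = csv.DictWriter(out, fieldnames=[
        'language', 'corpus', 'id', 'original_length', 'baseline',
        # ... plus every key produced by order_info and the length helpers
    ])
    writer.writeheader()
    extract_features(writer, 'en', 'my_corpus', sentence_list)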