Exemplo n.º 1
0
 def test_pluralize(self):
     """Assert the accuracy of the German pluralization algorithm.

     Every row of the ``celex-wordforms-de.csv`` corpus is unpacked as
     ``(tag, singular, plural)``. Only rows whose tag is ``"n"`` are scored:
     ``de.pluralize(singular)`` must equal ``plural`` in more than 69% of them.
     """
     from pattern.db import Datasheet
     correct, total = 0, 0
     for tag, sg, pl in Datasheet.load(os.path.join(PATH, "corpora", "celex-wordforms-de.csv")):
         if tag == "n":
             if de.pluralize(sg) == pl:
                 correct += 1
             total += 1
     # Fail with a regular assertion instead of a ZeroDivisionError when the
     # corpus yielded no "n" rows at all.
     self.assertTrue(total > 0)
     self.assertTrue(float(correct) / total > 0.69)
     # Call form of print: valid on both Python 2 and Python 3.
     print("pattern.de.pluralize()")
Exemplo n.º 2
0
 def test_pluralize(self):
     """Check that ``de.pluralize`` is right for more than 69% of the scored rows.

     Rows of ``wordforms-de-celex.csv`` are unpacked as ``(tag, sg, pl)``;
     only those tagged ``"n"`` take part in the score.
     """
     from pattern.db import Datasheet
     corpus_path = os.path.join(PATH, "corpora", "wordforms-de-celex.csv")
     hits = 0
     scored = 0
     for tag, sg, pl in Datasheet.load(corpus_path):
         if tag != "n":
             continue  # Only "n" rows are evaluated
         if pl == de.pluralize(sg):
             hits += 1
         scored += 1
     accuracy = float(hits) / scored
     self.assertTrue(accuracy > 0.69)
     print("pattern.de.pluralize()")
Exemplo n.º 3
0
def findPlural(word):
    """
    Return the plural of ``word`` after dropping its non-ASCII characters.

    The word is handed to ``pluralize`` as given; it is not singularized
    first.

    :param word: The word to convert into plural.
    :return: The pluralized word, as returned by ``pluralize``.
    """
    # Drop every character that cannot be represented in ASCII.
    ascii_word = word.encode('ascii', 'ignore')

    # On Python 3 ``encode`` returns ``bytes``; turn it back into text so
    # ``pluralize`` receives a string. On Python 2 the result already is
    # ``str`` and is left untouched.
    if not isinstance(ascii_word, str):
        ascii_word = ascii_word.decode('ascii')

    return pluralize(ascii_word)
Exemplo n.º 4
0
def _remove_if_present(tree, subtree):
    """Remove ``subtree`` from ``tree``, ignoring the ``ValueError`` raised when it is absent."""
    try:
        tree.remove(subtree)
    except ValueError:
        pass  # Not (or no longer) in this tree - nothing to remove


def _drop_nested_noun_phrases(noun_phrases):
    """Return ``noun_phrases`` without the entries nested inside another entry.

    A phrase counts as nested when it compares equal to a sub-tree (selected
    by ``phrase_filter``) of some entry other than itself. The input list is
    not modified; collecting first and filtering afterwards avoids deleting
    from a list while it is being iterated.
    """
    nested = []
    for phrase in noun_phrases:
        for sub_phrase in phrase.subtrees(phrase_filter):
            if sub_phrase == phrase:
                continue  # Don't treat a phrase as nested in itself
            nested.append(sub_phrase)
    return [phrase for phrase in noun_phrases if phrase not in nested]


def create_graph(nlp: StanfordCoreNLP, sentence, debug=False):
    """
    Create a graph from a given sentence with a given StanfordCoreNLP instance.

    The sentence is parsed into a tree, its noun phrases become nodes, the
    leaves left over after removing those phrases become edge links, and
    consecutive nodes are joined pairwise by the edge link at the same
    position.

    :param nlp: StanfordCoreNLP instance; its ``parse`` and ``ner`` methods are used
    :param sentence: Sentence to create a graph from
    :param debug: Debug mode; when true, intermediate results are printed
    :return: ``Graph`` built from the created nodes and edges
    """

    if debug:
        print(
            "-------------------------------------------------------------------------"
        )
        pprint(sentence)

    # -------------------------------------------------------------------------
    # Create NLTK parse tree of entire sentence
    parsed_output = nlp.parse(sentence)
    parse_tree = nltk.tree.Tree.fromstring(parsed_output)

    if debug:
        print("------------")
        print("Entire tree:")
        parse_tree.pretty_print()

    # -------------------------------------------------------------------------
    # Get all noun phrases - those are possible Node candidates
    noun_phrases = list(parse_tree.subtrees(phrase_filter))

    # There are no noun phrases, this sentence is a little too simple:
    # fall back to the plain nouns.
    if not noun_phrases:
        noun_phrases = list(parse_tree.subtrees(noun_filter))

    if debug:
        print("------------")
        print("All noun phrase trees:")
        for noun_phrase in noun_phrases:
            noun_phrase.pretty_print()

    # -------------------------------------------------------------------------
    # Cleanups: Remove those that are nested in another noun phrase, we're
    # interested in the bigger one
    noun_phrases = _drop_nested_noun_phrases(noun_phrases)

    if debug:
        print("------------")
        print("All noun phrase trees after cleanup:")
        for noun_phrase in noun_phrases:
            noun_phrase.pretty_print()

    # -------------------------------------------------------------------------
    # Remove those sub trees from the parse tree, left over can be considered
    # links between those Nodes
    removable_tree = parse_tree[0]
    if removable_tree.label() != 'S':
        # First child is not the 'S' node; descend one more level.
        removable_tree = removable_tree[0]

    for noun_phrase in noun_phrases:
        _remove_if_present(removable_tree, noun_phrase)

    # There was only one noun phrase, so the other one is likely just a noun
    if len(noun_phrases) == 1:
        nouns = list(parse_tree.subtrees(noun_filter))
        # Gotcha, that single noun is the other node candidate
        if len(nouns) == 1:
            noun_phrases.append(nouns[0])
            _remove_if_present(removable_tree, nouns[0])

    # Remove end of sentence (i.e. $.), so it doesn't interfere with the rest
    eos_subtrees = list(parse_tree.subtrees(eos_filter))
    if eos_subtrees:
        _remove_if_present(removable_tree, eos_subtrees[0])

    if debug:
        print("------------")
        print("Rest of the tree after removing all noun-phrases:")
        parse_tree.pretty_print()

    # -------------------------------------------------------------------------
    # Create nodes and edge links

    nodes = []
    edge_links = []
    edges = []

    # Try to figure out what kind of NE the possible nodes are
    ner_tagging = nlp.ner(sentence)

    if debug:
        print("------------")
        print("NER tagging for entire sentence:")
        pprint(ner_tagging)

    # Create nodes
    for noun_phrase in noun_phrases:
        # Create the word itself
        leaves = noun_phrase.leaves()
        node_word = " ".join(leaves)
        node_ne_tag = "O"
        node_numerus = SG

        # Try to find the corresponding NER: each entry is indexed as
        # [0] = word, [1] = tag. The last non-'O' tag of a word in this
        # phrase wins.
        for current_tag in ner_tagging:
            if current_tag[1] != 'O' and current_tag[0] in leaves:
                node_ne_tag = current_tag[1]

        # Grab the first noun of the phrase
        nouns = list(noun_phrase.subtrees(noun_filter))
        if not nouns:
            # No nouns in this phrase, no nodes created.
            continue
        noun = " ".join(nouns[0].leaves())

        # Try to determine the gender
        node_gender = gender(noun)

        # Try to determine the numerus: a noun that pluralizes to itself and
        # carries no NE tag is taken to be plural already.
        if pluralize(noun) == noun and node_ne_tag == 'O':
            node_numerus = PL

        nodes.append(Node(node_word, node_ne_tag, node_gender, node_numerus))

    if debug:
        print("------------")
        print("Nodes created:")
        for node in nodes:
            pprint(str(node))

    # Create edge links from whatever leaves are left in the tree
    for link_word in parse_tree.leaves():
        link_tense = None

        try:
            link_tenses = tenses(link_word)
            if link_tenses:
                # Take first, this one is most likely
                link_tense = link_tenses[0]
        except ValueError:
            pass  # Something inside pattern.de went wrong, swallow

        edge_links.append(EdgeLinkVerb(link_word, link_tense))

    if debug:
        print("------------")
        print("Edge links created:")
        for edge_link in edge_links:
            pprint(str(edge_link))

    # -------------------------------------------------------------------------
    # Stick nodes and edge links together: they form the graph

    # The edges are, within the sentence, between two nodes in the pattern of
    # "Node - Edge - Node". Therefore, the first node, together with the
    # second node and the first link forms one graph. ``zip`` stops as soon
    # as either the node pairs or the edge links run out.
    for first_node, edge_link, second_node in zip(nodes, edge_links, nodes[1:]):
        edge = Edge(first_node, edge_link, second_node)

        first_node.attach_edge(edge)
        second_node.attach_edge(edge)

        edges.append(edge)

    if debug:
        # Header printed once, like the other debug sections.
        print("------------")
        print("Edges created:")
        for edge in edges:
            pprint(str(edge))

    # -------------------------------------------------------------------------
    # Create graph collection
    return Graph(nodes, edges)