Пример #1
0
def getSecondLvNPsOfParseTree(parse_tree, nps, display_tree=False):
    """Collect the leaves of every ``NP`` node of height 3 below ``parse_tree``.

    For each such node its leaves are printed, every subtree whose label is in
    ``penni_tags`` is printed as ``word<TAB>label<TAB>flag`` (flag is ``B-NP``
    for the first printed line and ``I-NP`` afterwards), and the leaf list is
    appended to ``nps``.

    Args:
        parse_tree: tree (or iterable of children) to walk.
        nps: list that receives one list of leaves per matching NP; mutated
            in place.
        display_tree: when true, pretty-print ``parse_tree`` first.
    """
    if display_tree:
        Tree.pretty_print(parse_tree)

    for child in parse_tree:
        if not isinstance(child, Tree):
            # Reached a terminal; nothing to collect.
            continue

        if child.label() != 'NP' or child.height() != 3:
            # Any other node (including NPs of a different height): descend.
            getSecondLvNPsOfParseTree(child, nps, False)
            continue

        phrase = child
        print('\nNP: ' + ' '.join(phrase.leaves()))
        chunk_flag = "B-NP"
        # subtrees() yields the NP itself first, then its descendants.
        for node in phrase.subtrees():
            getSecondLvNPsOfParseTree(node, nps, False)
            if node.label() in penni_tags:
                print(node.leaves()[0] + '\t' + node.label() + '\t' + chunk_flag)
                chunk_flag = "I-NP"
        nps.append(phrase.leaves())
Пример #2
0
def parse_using_stanfordparser(tokenized_sent,
                               display_tree=False,
                               printNP=False,
                               printLeave=False):
    """Parse a tagged sentence and optionally inspect its noun phrases.

    Args:
        tokenized_sent: input handed unchanged to
            ``stanford_parser.tagged_parse``.
        display_tree: when true, draw every returned parse tree.
        printNP: when true, visit every ``NP`` subtree of height <= 6.
        printLeave: only consulted when ``printNP`` is true. If true, the NP's
            words are printed (a single-word NP only when it contains an
            upper-case letter, ``_`` or ``-``); otherwise the NP's
            (word, tag) pairs are passed to ``regexp_ner_m2``.

    Returns:
        None. The result of ``regexp_ner_m2`` is not used, as in the original.
    """
    single_word_pattern = re.compile(r'[A-Z_-]+', re.X)

    for item in stanford_parser.tagged_parse(tokenized_sent):
        if display_tree:
            Tree.draw(item)
        if not printNP:
            continue

        noun_phrases = list(
            item.subtrees(
                filter=lambda x: x.label() == 'NP' and x.height() <= 6))
        for n in noun_phrases:
            if printLeave:
                candidate = n.leaves()
                s = ' '.join(candidate)
                if len(candidate) != 1 or single_word_pattern.search(s):
                    # Parenthesised single-argument form prints the same text
                    # on Python 2 and Python 3.
                    print(s)
            else:
                # pos() pairs each leaf with the label of its own preterminal.
                # The previous label-filtering approach shifted the tags as
                # soon as the NP contained any other phrase node (PP, ADJP...).
                # NOTE(review): .encode('gbk') is kept from the original; on
                # Python 3 it yields bytes tags -- confirm this is wanted.
                tagged = [(word, tag.encode('gbk')) for word, tag in n.pos()]
                regexp_ner_m2(regexp_grammar, tagged)
Пример #3
0
    def parse_coord(t: Tree, is_coord: bool):
        """Flatten ``t`` into ``(word, is_coord)`` pairs in leaf order.

        A word's flag is true when ``is_coord`` was passed as true or any
        node visited on the way down has ``"COORD"`` in its label. Single-leaf
        subtrees whose POS tag is ``-NONE-`` are dropped.
        """
        words = t.leaves()
        if len(words) == 1:
            if t.pos()[0][1] == '-NONE-':
                return []
            return [(words[0], is_coord)]

        pairs = []
        for child in t:
            child_flag = is_coord or "COORD" in child.label()
            pairs.extend(parse_coord(child, child_flag))
        return pairs
Пример #4
0
 def recover(label, children):
     """Rebuild the node(s) for a possibly collapsed ``label``.

     * A label ending in ``'|<>'`` is stripped of that suffix; if the rest is
       in ``pos_label`` a single node with one ``CHAR`` child per leaf is
       returned, otherwise ``children`` is returned unchanged.
     * Any other label is split on ``'+'`` into a chain of unary nodes, the
       last part being the innermost node (also exploded into ``CHAR``
       children when it is in ``pos_label``).

     Returns:
         A list of trees, or ``children`` itself in the pass-through case.
     """
     def char_node(tag):
         # Node labelled `tag` whose children are CHAR nodes, one per leaf.
         leaves = Tree(tag, children).leaves()
         return Tree(tag, [Tree("CHAR", [leaf]) for leaf in leaves])

     if label.endswith('|<>'):
         stripped = label[:-3]
         if stripped not in pos_label:
             return children
         return [char_node(stripped)]

     parts = label.split('+')
     innermost = parts[-1]
     if innermost in pos_label:
         node = char_node(innermost)
     else:
         node = Tree(innermost, children)
     # Wrap from the inside out: remaining parts, right to left.
     for wrapper in parts[-2::-1]:
         node = Tree(wrapper, [node])
     return [node]
Пример #5
0
def _add_entity(t, tpl, entity_type):
    """
    Does the work of adding the entity-type node

    The leaves ``range(tpl[0], tpl[-1])`` of ``t`` are looked up, and their
    parent nodes are grouped under a new ``'E-' + entity_type`` node that is
    spliced into the shared grandparent. ``t`` is modified in place.

    Nothing is done when the span is empty, or when the grandparent of the
    last token differs from the grandparent of the token at ``tpl[0]``.

    Args:
        t: tree to augment (needs ``leaf_treeposition`` and tuple indexing).
        tpl: sequence whose first and last items bound the leaf span; the
            last item is exclusive.
        entity_type: suffix of the new node label.

    Returns:
        None.
    """
    first_parent_position = t.leaf_treeposition(tpl[0])[:-1]
    first_grandparent_position = first_parent_position[:-1]

    parent_positions = []
    parents = []
    for i in range(tpl[0], tpl[-1]):
        parent_position = t.leaf_treeposition(i)[:-1]
        parent_positions.append(parent_position)
        parents.append(t[parent_position])

    if not parent_positions:
        # Empty span: there is nothing to wrap.
        return

    grandparent_position = parent_positions[-1][:-1]
    grandparent = t[grandparent_position]

    if grandparent_position != first_grandparent_position:
        # augment the nodes ONLY if the mention sits under one grandparent
        # i.e., if 'Barack Hussein Obama' is one NP, replace it with (NP (E-PER (NNP Barack)(NNP Hussein)(NNP Obama)))
        # but if we have "National Rifle" in one NP and "Association" in another NP, we don't bother adding E-ORG at all
        # (hopefully that doesn't exclude too many trees)
        return

    new_tree = Tree('E-' + entity_type, parents)

    if len(parent_positions) == 1:
        # one-word node
        grandparent[parent_positions[0][-1]] = new_tree
        return

    if parent_positions[-1][-1] != len(grandparent.leaves()) - 1:
        # The last member of the tuple is NOT the rightmost child.
        # Giving up on slices: collect all of the grandparent's children and
        # put the new node where the first wrapped word used to be.
        new_leaves = new_tree.leaves()
        new_kids = []
        for kid in grandparent:
            if kid[0] not in new_leaves:
                new_kids.append(kid)
            elif kid[0] == new_leaves[0]:
                new_kids.append(new_tree)
            # Remaining wrapped words are dropped: they live in new_tree now.
        # NLTK 3 replaced the ``node`` attribute with ``label()``; support both.
        if hasattr(grandparent, 'label'):
            grandparent_label = grandparent.label()
        else:
            grandparent_label = grandparent.node
        new_grandparent = Tree(grandparent_label, new_kids)
        ggparent = t[grandparent_position[:-1]]
        ggparent[grandparent_position[-1]] = new_grandparent
    else:
        # It is the rightmost child: replace the tail slice with the new node.
        grandparent[parent_positions[0][-1]:len(grandparent.leaves())] = [
            new_tree
        ]
Пример #6
0
def find_star_step(tree: Tree, template: Tree) -> list:
    """Search ``tree`` top-down for the first place ``template`` yields a match.

    ``find_step`` is tried on ``tree`` itself; if it returns nothing, each
    child is searched in order and the first non-empty result wins.

    Args:
        tree: a tree node or a plain-string leaf.
        template: a template node, or the string ``"@"`` meaning
            "capture everything here".

    Returns:
        The captured leaves, or ``[]`` when nothing matched.
    """
    tree_is_leaf = isinstance(tree, str)
    if isinstance(template, str) and template == "@":
        # A leaf has no .leaves(); capture the leaf itself, exactly as
        # find_step does for the same template.
        return [tree] if tree_is_leaf else tree.leaves()
    if tree_is_leaf:
        return []

    res = find_step(tree, template)
    if res:
        return res
    for node in tree:
        res_ = find_star_step(node, template)
        if res_:
            return res_
    return []
Пример #7
0
def getFirstLvNPsOfParseTree(parse_tree, nps, display_tree=False):
    """Collect the leaves of the outermost ``NP`` nodes below ``parse_tree``.

    The walk descends through non-NP nodes only, so NPs nested inside an
    already collected NP are not visited. For each collected NP its leaves
    are printed, every subtree whose label is in ``penni_tags`` is printed as
    ``word<TAB>label<TAB>flag`` (``B-NP`` for the first printed line,
    ``I-NP`` afterwards), and the leaf list is appended to ``nps``.

    Args:
        parse_tree: tree (or iterable of children) to walk.
        nps: list that receives one list of leaves per NP; mutated in place.
        display_tree: when true, pretty-print ``parse_tree`` first.
    """
    if display_tree:
        Tree.pretty_print(parse_tree)
        # Tree.leaf_treeposition(tree, i) maps a leaf index to a tree position,
        # and tree[(0, 0, 1,)] fetches the subtree at a position.

    for child in parse_tree:
        if not isinstance(child, Tree):
            # Reached a terminal.
            continue

        if child.label() != 'NP':
            getFirstLvNPsOfParseTree(child, nps, False)
            continue

        phrase = child
        print('\nNP: ' + ' '.join(phrase.leaves()))
        chunk_flag = "B-NP"
        # Walk the whole NP (itself included) looking for tagged nodes.
        for node in phrase.subtrees():
            if node.label() in penni_tags:
                print(node.leaves()[0] + '\t' + node.label() + '\t' + chunk_flag)
                chunk_flag = "I-NP"
        nps.append(phrase.leaves())
Пример #8
0
def _add_entity(t, tpl, entity_type):
    """
    Does the work of adding the entity-type node

    The leaves ``range(tpl[0], tpl[-1])`` of ``t`` are looked up, and their
    parent nodes are grouped under a new ``'E-' + entity_type`` node that is
    spliced into the shared grandparent. ``t`` is modified in place.

    Nothing is done when the span is empty, or when the grandparent of the
    last token differs from the grandparent of the token at ``tpl[0]``.

    Args:
        t: tree to augment (needs ``leaf_treeposition`` and tuple indexing).
        tpl: sequence whose first and last items bound the leaf span; the
            last item is exclusive.
        entity_type: suffix of the new node label.

    Returns:
        None.
    """
    first_parent_position = t.leaf_treeposition(tpl[0])[:-1]
    first_grandparent_position = first_parent_position[:-1]

    parent_positions = []
    parents = []
    for i in range(tpl[0], tpl[-1]):
        parent_position = t.leaf_treeposition(i)[:-1]
        parent_positions.append(parent_position)
        parents.append(t[parent_position])

    if not parent_positions:
        # Empty span: there is nothing to wrap.
        return

    grandparent_position = parent_positions[-1][:-1]
    grandparent = t[grandparent_position]

    if grandparent_position != first_grandparent_position:
        # augment the nodes ONLY if the mention sits under one grandparent
        # i.e., if 'Barack Hussein Obama' is one NP, replace it with (NP (E-PER (NNP Barack)(NNP Hussein)(NNP Obama)))
        # but if we have "National Rifle" in one NP and "Association" in another NP, we don't bother adding E-ORG at all
        # (hopefully that doesn't exclude too many trees)
        return

    new_tree = Tree('E-' + entity_type, parents)

    if len(parent_positions) == 1:
        # one-word node
        grandparent[parent_positions[0][-1]] = new_tree
        return

    if parent_positions[-1][-1] != len(grandparent.leaves()) - 1:
        # The last member of the tuple is NOT the rightmost child.
        # Giving up on slices: collect all of the grandparent's children and
        # put the new node where the first wrapped word used to be.
        new_leaves = new_tree.leaves()
        new_kids = []
        for kid in grandparent:
            if kid[0] not in new_leaves:
                new_kids.append(kid)
            elif kid[0] == new_leaves[0]:
                new_kids.append(new_tree)
            # Remaining wrapped words are dropped: they live in new_tree now.
        # NLTK 3 replaced the ``node`` attribute with ``label()``; support both.
        if hasattr(grandparent, 'label'):
            grandparent_label = grandparent.label()
        else:
            grandparent_label = grandparent.node
        new_grandparent = Tree(grandparent_label, new_kids)
        ggparent = t[grandparent_position[:-1]]
        ggparent[grandparent_position[-1]] = new_grandparent
    else:
        # It is the rightmost child: replace the tail slice with the new node.
        grandparent[parent_positions[0][-1]:len(grandparent.leaves())] = [
            new_tree
        ]
Пример #9
0
def get_constituencies_from_tree(tree: Tree, tags: List[str]):
    """
    Walk a constituency parse (an nltk.tree.Tree) depth-first and gather the
    text of every node whose label is listed in ``tags``.

    Returns:
        spans: list of strings, one per matching node in pre-order (a node
        comes before its descendants). Each string is that node's leaves
        joined with single spaces.
    """
    spans = [' '.join(tree.leaves())] if tree.label() in tags else []
    for child in tree:
        # Plain-string children are leaves and cannot carry a label.
        if isinstance(child, Tree):
            spans.extend(get_constituencies_from_tree(child, tags))
    return spans
Пример #10
0
def regexp_ner_m2(grammar_re, tagged_sentence):
    """Chunk a tagged sentence with a regexp grammar and return its ``NE`` spans.

    Args:
        grammar_re: grammar handed to ``nltk.RegexpParser``.
        tagged_sentence: sequence of ``(word, tag)`` pairs to parse.

    Returns:
        A list of strings, one per ``NE`` subtree of height <= 5, each being
        the subtree's words joined with single spaces. Empty when no such
        subtree exists.
    """
    cp = nltk.RegexpParser(grammar_re)
    result_tree = cp.parse(tagged_sentence)
    entities = result_tree.subtrees(
        filter=lambda x: x.label() == 'NE' and x.height() <= 5)
    # The old guard `nps is not []` compared identity with a fresh list and
    # was therefore always true; iterating an empty result is already a no-op.
    # Leaves of the chunk tree are (word, tag) pairs; keep only the word.
    return [' '.join(leaf[0] for leaf in ne.leaves()) for ne in entities]
Пример #11
0
def find_step(tree: Tree, template: Tree) -> list:
    """Match ``template`` against ``tree`` at this node and return the captures.

    Template vocabulary, as handled below:
      * ``"@"``   -- capture ``tree`` (the leaf itself, or all of its leaves);
      * ``"*"``   -- a bare star string captures nothing;
      * a node labelled ``"*"`` -- each of its children is searched for
        anywhere below ``tree`` via ``find_star_step``;
      * any other node -- labels must be equal, then every template child is
        matched against the nodes ``get_node_by_label`` returns for it.

    Returns:
        List of captured leaves; ``[]`` both for "no match" and for a match
        that captured nothing.
    """
    tree_is_leaf = type(tree) is str
    template_is_leaf = type(template) is str

    if template == "@":
        return [tree] if tree_is_leaf else tree.leaves()
    if template == '*':
        return []

    if not template_is_leaf and template.label() == '*':
        star_hits = []
        for sub_template in template:
            star_hits.extend(find_star_step(tree, sub_template))
        return star_hits

    # A literal leaf on either side never captures anything, equal or not.
    if tree_is_leaf or template_is_leaf:
        return []
    if tree.label() != template.label():
        return []

    hits = []
    for sub_template in template:
        for candidate in get_node_by_label(tree, sub_template):
            hits.extend(find_step(candidate, sub_template))
    return hits
Пример #12
0
def extractSyntacticPhrases(srcsent, tgtsent, alignments, bractree=None,
                            maxPhraseLen=7):
    """Yield the phrase pairs of a sentence pair that respect its alignments.

    Every source span of at most ``maxPhraseLen`` tokens is projected onto
    the target side through ``alignments``. A span is kept when no target
    word inside the projected window is aligned to a source word outside the
    span. The target window is then also grown over neighbouring unaligned
    target words, one phrase pair being yielded per variant.

    Args:
        srcsent: list of source tokens.
        tgtsent: list of target tokens.
        alignments: iterable of ``(source_index, target_index)`` pairs.
        bractree: optional bracketed parse of the source sentence. It is
            parsed and handed to ``generatePhraseSpans``, but the syntactic
            filter itself is currently switched off (see below).
        maxPhraseLen: maximum length, in tokens, of either phrase.

    Yields:
        dict with keys ``'srcphrase'`` and ``'tgtphrase'`` (space-joined
        tokens) and ``'alignments'`` (tuple of ``'i-j'`` strings whose
        indices are relative to the start of the respective phrase).
    """
    srctokenslen, tgttokenslen = len(srcsent), len(tgtsent)

    alignedTo, alignedFrom = defaultdict(list), defaultdict(list)
    for srcidx, tgtidx in alignments:
        alignedTo[tgtidx].append(srcidx)
        alignedFrom[srcidx].append(tgtidx)
    unalignedTgtIndices = set(range(tgttokenslen)) - set(alignedTo)

    if bractree:
        # NLTK 3 parses bracketed strings with Tree.fromstring; NLTK 2 did it
        # in the constructor. Use whichever this installation offers.
        parse_bracketed = getattr(Tree, 'fromstring', Tree)
        tree = parse_bracketed(bractree.strip())
        # NOTE: a token-count mismatch between tree and sentence is tolerated.
        syntacticPhraseIndices = generatePhraseSpans(tree)

    # Candidate source spans (inclusive ends), already limited to
    # maxPhraseLen tokens instead of generating every pair and filtering.
    srcPhraseIndices = (
        (srcidx_start, srcidx_end)
        for srcidx_start in range(srctokenslen)
        for srcidx_end in range(srcidx_start,
                                min(srcidx_start + maxPhraseLen, srctokenslen)))

    if bractree and False:
        # Deliberately disabled in the original: keep only syntactic phrases.
        srcPhraseIndices = filter(lambda X: X in syntacticPhraseIndices,
                                  srcPhraseIndices)

    for (srcidx_start, srcidx_end) in srcPhraseIndices:
        tgtPhraseIndices = set(
            tgtidx
            for idx in range(srcidx_start, srcidx_end + 1)
            for tgtidx in alignedFrom[idx])
        if not tgtPhraseIndices:
            # Unaligned source span: consider the whole target sentence.
            tgtidx_start, tgtidx_end = 0, tgttokenslen - 1
        else:
            tgtidx_start = min(tgtPhraseIndices)
            tgtidx_end = max(tgtPhraseIndices)

        # Words inside the target window must not align outside the span.
        alignedSrcIndices = set(
            srcidx
            for idx in range(tgtidx_start, tgtidx_end + 1)
            for srcidx in alignedTo[idx])
        if not alignedSrcIndices.issubset(range(srcidx_start, srcidx_end + 1)):
            continue

        # Grow the window left/right across unaligned target words.
        for tgtidx_min in range(tgtidx_start, -1, -1):
            for tgtidx_max in range(tgtidx_end, tgttokenslen):
                if tgtidx_max + 1 - tgtidx_min > maxPhraseLen:
                    # The window only gets longer from here on.
                    break
                links = sorted(
                    (srcidx, tgtidx)
                    for tgtidx in range(tgtidx_min, tgtidx_max + 1)
                    for srcidx in alignedTo[tgtidx])
                # Offsets are relative to the emitted phrases; the target
                # phrase starts at tgtidx_min, not at tgtidx_start.
                phrase_alignments = tuple(
                    '%d-%d' % (srcidx - srcidx_start, tgtidx - tgtidx_min)
                    for srcidx, tgtidx in links)
                yield {
                    'srcphrase': ' '.join(srcsent[srcidx_start:srcidx_end + 1]),
                    'tgtphrase': ' '.join(tgtsent[tgtidx_min:tgtidx_max + 1]),
                    'alignments': phrase_alignments,
                }
                if tgtidx_max + 1 not in unalignedTgtIndices:
                    break
            if tgtidx_min - 1 not in unalignedTgtIndices:
                break
Пример #13
0
from nltk.tree import Tree

# Build a small tree bottom-up: first the VP, then the sentence that holds it.
vp = Tree('VP', [Tree('V', ['saw']), Tree('NP', ['him'])])

s = Tree('S', [Tree('NP', ['I']), vp])

print(s)

# Second example: two determiner phrases and a verb phrase.
dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])])
dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])])
# NOTE: this rebinds `vp`; the tree `s` above still holds the earlier VP object.
vp = Tree('vp', [Tree('v', ['chased']), dp2])

# draw() displays each tree graphically.
dp1.draw()

dp2.draw()

vp.draw()

# Combine the pieces into one sentence tree.
tree = Tree('s', [dp1, vp])

print(tree)
tree.draw()

# The bare expressions below compute values without using them; they only
# show output when run interactively (REPL / notebook).
len(tree)

print(tree.leaves())

tree.label()

dp1.label()
Пример #14
0
def coref_replace(event_dict, key):
    """
    Function to replace pronouns with the referenced noun phrase. Iterates
    over each sentence in a news story and pulls coreference information
    from the applicable sentence, even if it is from another sentence. Also
    keeps track of any changes in indexes made by replacing pronouns, i.e.,
    the reference is longer than the pronoun so the tree index changes for
    future references. Filters coreferences on various dimensions to ensure
    only "good" coreferences are replaced. The default behavior is to do
    no replacement rather than a bad replacement. The function does not
    return a value, instead the event_dict is updated with the new parse tree
    containing the coref information.

    Parameters
    ----------

    event_dict: Dictionary.
                    Dictionary of sentence information, such as produced by
                    utilities.parse_sents().

    key: String.
            ID of the event or news story being processed.

    Notes
    -----
    Each coref is a (pronoun, reference) pair of sequences. As used here,
    item 0 is the mention text, item 1 the sentence id, and item 3 a leaf
    index; item 4 is presumably the end index of the mention (only the
    difference ``pronoun[4] - pronoun[3]`` is read).

    """
    #TODO: This could use some major refactoring.
    if 'coref_info' in event_dict[key]['sent_info'].keys():
        sent_info = event_dict[key]['sent_info']['sents']
        coref_info = event_dict[key]['sent_info']['coref_info']
        for sent in coref_info:
            for coref in coref_info[sent]['corefs']:
                pronoun = coref[0]
                ref = coref[1]
                # Skip pairs whose texts overlap in either direction, and
                # "pronouns" that span more than one token.
                if any([word in ref[0] for word in pronoun[0].split()]):
                    continue
                if any([word in pronoun[0] for word in ref[0].split()]):
                    continue
                if pronoun[4] - pronoun[3] > 1:
                    continue

                try:
                    #Getting the stuff for pronouns
                    if 'coref_tree' in sent_info[pronoun[1]].keys():
                        pronoun_sent = copy.deepcopy(sent_info[pronoun[1]]
                                                     ['coref_tree'])
                    else:
                        pronoun_sent = copy.deepcopy(sent_info[pronoun[1]]
                                                     ['parse_tree'])
                        # NOTE(review): Tree(<string>) is the NLTK 2 way of
                        # parsing a bracketed string -- confirm NLTK version.
                        pronoun_sent = Tree(pronoun_sent)
                    pro_shift = coref_info[pronoun[1]]['shift']
                    #Getting stuff for the reference
                    if 'coref_tree' in sent_info[ref[1]].keys():
                        coref_sent = sent_info[ref[1]]['coref_tree']
                    else:
                        coref_sent = Tree(sent_info[ref[1]]['parse_tree'])
                    ref_shift = coref_info[ref[1]]['shift']

                    #Actually replacing the pronoun
                    try:
                        pronoun_pos = pronoun_sent.leaf_treeposition(pronoun[3]
                                                                     + pro_shift)
                        #Hunting for the right pronoun: try one leaf to the
                        #left, then one to the right, adjusting the shift.
                        if pronoun_sent[pronoun_pos] != pronoun[0]:
                            if pronoun_sent[pronoun_sent.leaf_treeposition(pronoun[3] + (pro_shift - 1))] == pronoun[0]:
                                pronoun_pos = pronoun_sent.leaf_treeposition(pronoun[3] + (pro_shift - 1))
                                coref_info[pronoun[1]]['shift'] -= 1
                            elif pronoun_sent[pronoun_sent.leaf_treeposition(pronoun[3] + (pro_shift + 1))] == pronoun[0]:
                                pronoun_pos = pronoun_sent.leaf_treeposition(pronoun[3] + (pro_shift + 1))
                                coref_info[pronoun[1]]['shift'] += 1
                            else:
                                # Pronoun not found: stop processing the
                                # remaining corefs of this sentence.
                                break

                        #Hunting for the right coref: the node two levels
                        #above the reference's leaf.
                        original_coref_index = coref_sent.leaf_treeposition(ref[3])[:-2]
                        if ' '.join(coref_sent[original_coref_index].leaves()) == ref[0]:
                            coref_pos = coref_sent.leaf_treeposition(ref[3])[:-2]
                        elif ref[0] in ' '.join(coref_sent[original_coref_index].leaves()):
                            coref_pos = coref_sent.leaf_treeposition(ref[3])[:-2]
                        else:
                            coref_pos = coref_sent.leaf_treeposition(ref[3] + ref_shift)[:-2]

                        # NOTE(review): this check has no effect; the
                        # replacement below happens regardless.
                        if ref[0] not in ' '.join(coref_sent[coref_pos].leaves()):
                            pass

                        #Found everything, now replace
                        coref_tree = Tree('COREF', [coref_sent[coref_pos]])
                        pronoun_sent[pronoun_pos[:-1]] = coref_tree
                    except IndexError:
                        #TODO: Should this use the original sentence rather
                        #than possibly bad coreferences?
                        # Parenthesised print: valid on Python 2 and 3.
                        print("""Key {}, sentence {} has a problem with the corefencing. Breaking and moving on.\n""".format(key, sent))
                        break

                    #Recording the shift length for the pronoun replacement
                    if len(coref_tree.leaves()) > 2:
                        coref_info[pronoun[1]]['shift'] += coref_tree.height()

                    coref_info[pronoun[1]]['errors'].append(False)

                    if not any(coref_info[pronoun[1]]['errors']):
                        if pronoun_sent != sent_info[sent]['parse_tree']:
                            sent_info[sent]['coref_tree'] = pronoun_sent
                except RuntimeError as e:
                    # `except X as e` works on Python 2.6+ and Python 3; the
                    # old `except X, e` form is a SyntaxError on Python 3.
                    print('There was an error. {}'.format(e))
                    coref_info[pronoun[1]]['errors'].append(True)
Пример #15
0
        def traverse(node):
            """Recursively convert a DOM-style ``node`` into a ``Tree``.

            The node is accessed through ``childNodes``, ``tagName`` and
            ``getAttribute``. Returns ``None`` for a node without children.
            A node whose first child is a ``w`` element becomes a one-leaf
            tree holding a ``(word, tag)`` tuple. Any other node becomes a
            tree of its converted children, after which clitics and verb
            parts are optionally merged depending on ``self._join_clitics``
            and ``self._join_verb_parts``.
            """
            def extract_tags(W):
                """Return the tag attributes of ``W`` as a list.

                The first item is the ``lc`` attribute (``None`` when empty),
                followed by every other listed attribute that is non-empty.
                """
                pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
                # The clitic attribute is only kept for these known values.
                if W.getAttribute('clitic') in {
                        'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'
                }:
                    pos.append(W.getAttribute('clitic'))
                if W.getAttribute('ne_sort'):
                    pos.append(W.getAttribute('ne_sort'))
                if W.getAttribute('n_type'):
                    pos.append(W.getAttribute('n_type'))
                if W.getAttribute('ya_type'):
                    pos.append(W.getAttribute('ya_type'))
                if W.getAttribute('ke_type'):
                    pos.append(W.getAttribute('ke_type'))
                if W.getAttribute('type'):
                    pos.append(W.getAttribute('type'))
                if W.getAttribute('kind'):
                    pos.append(W.getAttribute('kind'))
                return pos

            def clitic_join(tree, clitic):
                """Append the clitic's word to the right-most leaf below ``tree``.

                Descends through last children until the last child is not a
                ``Tree``; that leaf is replaced by the concatenated word with
                the clitic's tag, and the node holding it is relabelled
                ``CLITICS``. ``clitic`` may be modified in place.
                """
                if type(tree[-1]) == Tree:
                    return clitic_join(tree[-1], clitic)
                else:
                    # A clitic word starting with 'ا' first gets the character
                    # in the literal below prepended (it looks like a
                    # zero-width non-joiner -- TODO confirm).
                    if (clitic[0][0][0] == 'ا'):
                        clitic[0] = ('‌' + clitic[0][0], clitic[0][1])
                    tree[-1] = (tree[-1][0] + clitic[0][0], clitic[0][1])
                    tree.set_label('CLITICS')
                    return

            # Nodes without children produce no tree (implicit None).
            if not len(node.childNodes):
                return
            first = node.childNodes[0]
            # Word-level node: build a single (word, mapped tag) leaf.
            if first.tagName == 'w':
                pos = extract_tags(first)
                return Tree(node.tagName, [(first.childNodes[0].data.replace(
                    'می ', 'می‌'), self._pos_map(pos))])
            # For 'S' nodes the first two child nodes are skipped.
            childs = node.childNodes[
                2:] if node.tagName == 'S' else node.childNodes
            # Drop children that have no child nodes of their own.
            # NOTE(review): this removes from `childs` while iterating over
            # it, which can skip the element following each removal.
            for child in childs:
                if not len(child.childNodes):
                    childs.remove(child)
            tree = Tree(node.tagName, map(traverse, childs))
            # Clitic joining: applies when the second child is a CLITIC node
            # whose tag is neither 'P' nor 'V'. The tree is rebuilt from the
            # first child's children and the clitic is merged into its
            # right-most leaf.
            if self._join_clitics and len(tree) > 1 and type(
                    tree[1]) == Tree and tree[1].label(
                    ) == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
                clitic = tree[-1]
                tree = Tree(tree.label(), [subtree for subtree in tree[0]])
                clitic_join(tree, clitic)
            # Verb joining (1): an AUX first child whose word is listed in
            # before_verbs is prefixed to the second child's word and removed.
            if self._join_verb_parts and len(tree) > 1 and type(
                    tree[1]) == Tree and type(
                        tree[0]) == Tree and tree[0].label() == 'AUX' and tree[
                            0][0][0] in self._tokenizer.before_verbs:
                tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                tree.remove(tree[0])
            # Verb joining (2): when the last leaf's word is in after_verbs
            # and the one before it is in verbe, the earlier word is prefixed
            # to tree[1]'s first leaf and its own node is removed.
            if self._join_verb_parts and len(
                    tree.leaves()) > 1 and tree.leaves(
                    )[-1][0] in self._tokenizer.after_verbs and tree.leaves(
                    )[-2][0] in self._tokenizer.verbe:
                tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                path = tree.leaf_treeposition(len(tree.leaves()) - 2)
                removingtree = tree
                # Walk down until only the last two path components remain.
                while len(path) > 2:
                    removingtree = removingtree[path[0]]
                    path = path[1:]
                removingtree.remove(
                    Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
            # NOTE(review): the block below repeats the previous one verbatim;
            # whether a second pass is intended cannot be told from here.
            if self._join_verb_parts and len(
                    tree.leaves()) > 1 and tree.leaves(
                    )[-1][0] in self._tokenizer.after_verbs and tree.leaves(
                    )[-2][0] in self._tokenizer.verbe:
                tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0],
                              tree[1][0][1])
                path = tree.leaf_treeposition(len(tree.leaves()) - 2)
                removingtree = tree
                while len(path) > 2:
                    removingtree = removingtree[path[0]]
                    path = path[1:]
                removingtree.remove(
                    Tree(tree.pos()[-2][1], [tree.pos()[-2][0]]))
            return tree
Пример #16
0
 def extract_string_from_tree(tree: Tree) -> str:
     """Join the first element of every leaf of ``tree`` with single spaces."""
     words = [leaf[0] for leaf in tree.leaves()]
     return ' '.join(words)
Пример #17
0
def convert_sentence(input_string: str):
    """Reorder an English sentence and render it with ``create_video``.

    The sentence is parsed by a CoreNLP server on ``localhost:9000``. Its
    constituents are regrouped into a flat ``ROOT`` tree: first noun phrases
    (plus NP/PRP nodes found inside VP/PRP nodes), then every remaining
    single-leaf node. The resulting words are lemmatized, English stop words
    are dropped, and the remaining text is passed to ``create_video``.

    Args:
        input_string: the sentence; it is split on whitespace.

    Returns:
        ``None`` for empty or whitespace-only input, otherwise whatever
        ``create_video`` returns. A single-word input is passed to
        ``create_video`` unchanged.
    """
    java_path = '/usr/bin/java'
    os.environ['CLASSPATH'] = java_path

    words = input_string.split()
    # The original guard compared the token list with the integer 1 and so
    # never fired; with nothing to parse the only sensible answer is None.
    if not words:
        return None
    if len(words) == 1:
        return create_video(input_string)

    parser = CoreNLPParser(url='http://localhost:9000')
    parsetree = list(parser.parse(words))[0]
    # Re-read as a ParentedTree so nodes know their parent and position.
    parenttree = ParentedTree.fromstring(str(parsetree))

    print("\n\n")

    islTree = Tree('ROOT', [])
    taken = set()  # tree positions already moved into islTree

    def claim(node):
        # Move `node` into islTree unless it or its parent is already there.
        position = node.treeposition()
        if position in taken or node.parent().treeposition() in taken:
            return
        taken.add(position)
        islTree.append(node)

    for sub in parenttree.subtrees():
        if sub.label() == "NP":
            claim(sub)
        if sub.label() in ("VP", "PRP"):
            for sub2 in sub.subtrees():
                if sub2.label() in ("NP", "PRP"):
                    claim(sub2)

    # Everything else: single-leaf nodes not yet taken. One pre-order pass is
    # enough because `taken` only grows, so a node rejected once can never
    # qualify later; the original re-ran this pass once per subtree.
    for sub in parenttree.subtrees():
        if len(sub.leaves()) == 1:
            claim(sub)

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(w) for w in islTree.leaves()]

    # Each kept word is followed by a space (trailing space included), as
    # in the original.
    islSentence = "".join(w + " " for w in lemmas if w not in stop_words)

    return create_video(islSentence)
Пример #18
0
		def traverse(node):
			"""Recursively convert a DOM-style ``node`` into a ``Tree``.

			The node is accessed through ``childNodes``, ``tagName`` and
			``getAttribute``. Returns ``None`` for a node without children.
			A node whose first child is a ``w`` element becomes a one-leaf
			tree holding a ``(word, tag)`` tuple. Any other node becomes a
			tree of its converted children, after which clitics and verb
			parts are optionally merged depending on ``self._join_clitics``
			and ``self._join_verb_parts``.
			"""
			def extract_tags(W):
				"""Return the tag attributes of ``W`` as a list.

				The first item is the ``lc`` attribute (``None`` when empty),
				followed by every other listed attribute that is non-empty.
				"""
				pos = [W.getAttribute('lc') if W.getAttribute('lc') else None]
				# The clitic attribute is only kept for these known values.
				if W.getAttribute('clitic') in {'ezafe', 'pronominal', 'verb', 'prep', 'adv', 'det'}:
					pos.append(W.getAttribute('clitic'))
				if W.getAttribute('ne_sort'):
					pos.append(W.getAttribute('ne_sort'))
				if W.getAttribute('n_type'):
					pos.append(W.getAttribute('n_type'))
				if W.getAttribute('ya_type'):
					pos.append(W.getAttribute('ya_type'))
				if W.getAttribute('ke_type'):
					pos.append(W.getAttribute('ke_type'))
				if W.getAttribute('type'):
					pos.append(W.getAttribute('type'))
				if W.getAttribute('kind'):
					pos.append(W.getAttribute('kind'))
				return pos

			def clitic_join(tree, clitic):
				"""Append the clitic's word to the right-most leaf below ``tree``.

				Descends through last children until the last child is not a
				``Tree``; that leaf is replaced by the concatenated word with
				the clitic's tag, and the node holding it is relabelled
				``CLITICS``. ``clitic`` may be modified in place.
				"""
				if type(tree[-1]) == Tree:
					return clitic_join(tree[-1], clitic)
				else:
					# A clitic word starting with 'ا' first gets the character
					# in the literal below prepended (it looks like a
					# zero-width non-joiner -- TODO confirm).
					if(clitic[0][0][0] == 'ا'):
						clitic[0] = ('‌' + clitic[0][0], clitic[0][1])
					tree[-1]=(tree[-1][0] + clitic[0][0], clitic[0][1])
					tree.set_label('CLITICS')
					return

			# Nodes without children produce no tree (implicit None).
			if not len(node.childNodes):
				return
			first = node.childNodes[0]
			# Word-level node: build a single (word, mapped tag) leaf.
			if first.tagName == 'w':
				pos=extract_tags(first)
				return Tree(node.tagName, [(first.childNodes[0].data.replace('می ', 'می‌'), self._pos_map(pos))])
			# For 'S' nodes the first two child nodes are skipped.
			childs = node.childNodes[2:] if node.tagName == 'S' else node.childNodes
			# Drop children that have no child nodes of their own.
			# NOTE(review): this removes from `childs` while iterating over
			# it, which can skip the element following each removal.
			for child in childs:
				if not len(child.childNodes):
					childs.remove(child)
			tree = Tree(node.tagName, map(traverse, childs))
			# Clitic joining: applies when the second child is a CLITIC node
			# whose tag is neither 'P' nor 'V'. The tree is rebuilt from the
			# first child's children and the clitic is merged into its
			# right-most leaf.
			if self._join_clitics and len(tree) > 1 and type(tree[1]) == Tree and tree[1].label() == 'CLITIC' and tree[1][0][1] not in {'P', 'V'}:
				clitic=tree[-1]
				tree = Tree(tree.label(), [subtree for subtree in tree[0]])
				clitic_join(tree, clitic)
			# Verb joining (1): an AUX first child whose word is listed in
			# before_verbs is prefixed to the second child's word and removed.
			if self._join_verb_parts and len(tree) > 1 and type(tree[1]) == Tree and type(tree[0]) == Tree and tree[0].label() == 'AUX' and tree[0][0][0] in self._tokenizer.before_verbs:
				tree[1][0] = (tree[0][0][0] + ' ' + tree[1][0][0], tree[1][0][1])
				tree.remove(tree[0])
			# Verb joining (2): when the last leaf's word is in after_verbs
			# and the one before it is in verbe, the earlier word is prefixed
			# to tree[1]'s first leaf and its own node is removed.
			if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
				tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
				path = tree.leaf_treeposition(len(tree.leaves())-2)
				removingtree = tree
				# Walk down until only the last two path components remain.
				while len(path) > 2 :
					removingtree = removingtree[path[0]]
					path = path[1:]
				removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
			# NOTE(review): the block below repeats the previous one verbatim;
			# whether a second pass is intended cannot be told from here.
			if self._join_verb_parts and len(tree.leaves()) > 1 and tree.leaves()[-1][0] in self._tokenizer.after_verbs and tree.leaves()[-2][0] in self._tokenizer.verbe :
				tree[1][0] = (tree[0].leaves()[-1][0] + ' ' + tree[1][0][0], tree[1][0][1])
				path = tree.leaf_treeposition(len(tree.leaves())-2)
				removingtree = tree
				while len(path) > 2 :
					removingtree = removingtree[path[0]]
					path = path[1:]
				removingtree.remove(Tree(tree.pos()[-2][1],[tree.pos()[-2][0]]))
			return tree