Exemplo n.º 1
0
def get_tree_data_new(raw_data, character_to_index, word_to_index,
                      pos_to_index, lexicon_list):
    """Get tree structured data from CoNLL 2012 gold annotations.

    Walks every parse in raw_data["gold"], builds one Node tree per
    sentence via construct_node(), and gathers corpus statistics.

    Returns:
        (tree_pyramid_list, ner_list, word_count, pos_count, ne_count,
         pos_ne_count, lexicon_hits) where tree_pyramid_list contains
        (root_node, additional_node_list) pairs.
    """
    # NOTE: the docstring used to follow this log() call, which made it a
    # dead string expression instead of a real docstring (PEP 257).
    log("get_tree_data()...")
    tree_pyramid_list = []
    ner_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]  # boxed int so construct_node() can mutate it

    for document in raw_data["gold"]:
        for part in raw_data["gold"][document]:
            # Re-key NER annotations: sentence index -> {(start, end): label}
            ner_raw_data = defaultdict(dict)
            for k, v in raw_data["gold"][document][part]["ner"].items():
                ner_raw_data[k[0]][(k[1], k[2])] = v

            for index, parse in enumerate(
                    raw_data["gold"][document][part]["parses"]):
                text_raw_data = raw_data["gold"][document][part]["text"][index]
                word_count += len(text_raw_data)

                # Skip sentences that could not be parsed.
                if parse.subtrees[0].label == "NOPARSE":
                    continue
                head_raw_data = raw_data["gold"][document][part]["heads"][index]

                root_node = Node()
                span_to_node = {}
                nodes = construct_node(root_node, parse, ner_raw_data[index],
                                       head_raw_data, text_raw_data,
                                       character_to_index, word_to_index,
                                       pos_to_index, lexicon_list, pos_count,
                                       ne_count, pos_ne_count, lexicon_hits,
                                       span_to_node, False)
                root_node.nodes = nodes
                root_node.text_raw_data = text_raw_data  # YOLO

                # Dense (pyramid) nodes are currently disabled:
                # additional_node_list = create_dense_nodes(
                #     ner_raw_data[index], text_raw_data,
                #     pos_to_index, lexicon_list,
                #     pos_count, ne_count, pos_ne_count, lexicon_hits,
                #     span_to_node)
                additional_node_list = []
                tree_pyramid_list.append((root_node, additional_node_list))
                ner_list.append(ner_raw_data[index])

    log(" %d sentences\n" % len(tree_pyramid_list))
    return (tree_pyramid_list, ner_list, word_count, pos_count, ne_count,
            pos_ne_count, lexicon_hits[0])
Exemplo n.º 2
0
def get_tree_data(raw_data, character_to_index, word_to_index, pos_to_index, lexicon_list):
    """Get tree structured data from CoNLL 2012.

    Iterates the auto parses (NER labels come from the gold annotations),
    builds one Node tree per sentence and gathers corpus statistics.

    Returns:
        (tree_pyramid_list, ner_list, word_count, pos_count, ne_count,
         pos_ne_count, lexicon_hits)
    """
    log("get_tree_data()...")
    tree_pyramid_list = []
    ner_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]  # boxed int so construct_node() can mutate it

    for document in raw_data["auto"]:
        for part in raw_data["auto"][document]:
            # Re-key gold NER: sentence index -> {(start, end): label}.
            # FIX: .iteritems() is Python 2 only; .items() works on Python 3.
            ner_raw_data = defaultdict(lambda: {})
            for k, v in raw_data["gold"][document][part]["ner"].items():
                ner_raw_data[k[0]][(k[1], k[2])] = v

            for index, parse in enumerate(raw_data["auto"][document][part]["parses"]):
                text_raw_data = raw_data["auto"][document][part]["text"][index]
                word_count += len(text_raw_data)

                # Skip sentences that could not be parsed.
                if parse.subtrees[0].label == "NOPARSE":
                    continue
                head_raw_data = raw_data["auto"][document][part]["heads"][index]

                root_node = Node()
                span_to_node = {}
                nodes = construct_node(
                    root_node, parse, ner_raw_data[index], head_raw_data, text_raw_data,
                    character_to_index, word_to_index, pos_to_index, lexicon_list,
                    pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node, False)
                root_node.nodes = nodes
                root_node.text_raw_data = text_raw_data  # YOLO

                # Dense (pyramid) nodes are currently disabled:
                # additional_node_list = create_dense_nodes(
                #     ner_raw_data[index], text_raw_data,
                #     pos_to_index, lexicon_list,
                #     pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
                additional_node_list = []
                tree_pyramid_list.append((root_node, additional_node_list))
                ner_list.append(ner_raw_data[index])

    log(" %d sentences\n" % len(tree_pyramid_list))
    return (tree_pyramid_list, ner_list, word_count, pos_count, ne_count, pos_ne_count,
            lexicon_hits[0])
Exemplo n.º 3
0
def construct_node(node, tree, ner_raw_data, head_raw_data, text_raw_data,
                    character_to_index, word_to_index, pos_to_index,
                    pos_count, ne_count, pos_ne_count):
    """Recursively fill *node* from parse subtree *tree*.

    Binarizes n-ary productions in place (mutates tree.subtrees) and
    tallies POS/NE statistics into the supplied count dicts.

    Returns:
        int: number of Node objects in this subtree (including *node*).
    """
    pos = tree.label
    word = tree.word  # may be falsy for non-terminal nodes (handled below)
    span = tree.span
    # Prefer a head cached on the tree (set during binarization below);
    # otherwise look it up by (span, pos).
    head = tree.head if hasattr(tree, "head") else head_raw_data[(span, pos)][1]
    ne = ner_raw_data[span] if span in ner_raw_data else "NONE"

    # Process pos info
    node.pos = pos
    node.pos_index = pos_to_index[pos]
    pos_count[pos] += 1

    # Process word info (character indices only exist when word is truthy)
    node.word_split = [character_to_index[character] for character in word] if word else []
    node.word_index = word_to_index[word] if word else -1

    # Process head info
    node.head_split = [character_to_index[character] for character in head]
    node.head_index = word_to_index[head]

    # Process ne info: count each span's label once (skip unary parents
    # with the same span). NOTE(review): "NONE" is tallied here as well,
    # unlike other construct_node variants in this file — confirm intended.
    node.ne = ne
    if not node.parent or node.parent.span!=span:
        ne_count[ne] += 1
    if ne != "NONE":
        pos_ne_count[pos] += 1

    # Process span info
    node.span = span

    # Binarize children: peel off the last child when its head differs
    # from the parent's head (otherwise the first child) and wrap the
    # remaining children in a new subtree with the same label and head.
    if len(tree.subtrees) > 2:
        side_child_pos = tree.subtrees[-1].label
        side_child_span = tree.subtrees[-1].span
        side_child_head = head_raw_data[(side_child_span, side_child_pos)][1]
        if side_child_head != head:
            sub_subtrees = tree.subtrees[:-1]
        else:
            sub_subtrees = tree.subtrees[1:]
        new_span = (sub_subtrees[0].span[0], sub_subtrees[-1].span[1])
        new_tree = PSTree(label=pos, span=new_span, subtrees=sub_subtrees)
        new_tree.head = head
        if side_child_head != head:
            tree.subtrees = [new_tree, tree.subtrees[-1]]
        else:
            tree.subtrees = [tree.subtrees[0], new_tree]

    # Process children recursively; after binarization at most two remain.
    nodes = 1
    for subtree in tree.subtrees:
        child = Node()
        node.add_child(child)
        child_nodes = construct_node(child, subtree, ner_raw_data, head_raw_data, text_raw_data,
            character_to_index, word_to_index, pos_to_index,
            pos_count, ne_count, pos_ne_count)
        nodes += child_nodes
    return nodes
Exemplo n.º 4
0
def get_tree_data(raw_data, character_to_index, word_to_index, pos_to_index):
    """Get tree structured data from CoNLL 2012.

    Iterates the auto parses (NER labels come from the gold annotations),
    builds one Node tree per sentence and gathers corpus statistics.

    Returns:
        (tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count)
    """
    log("get_tree_data()...")
    tree_list = []
    ner_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)

    for document in raw_data["auto"]:
        for part in raw_data["auto"][document]:
            # Re-key gold NER: sentence index -> {(start, end): label}.
            # FIX: .iteritems() is Python 2 only; .items() works on Python 3.
            ner_raw_data = defaultdict(lambda: {})
            for k, v in raw_data["gold"][document][part]["ner"].items():
                ner_raw_data[k[0]][(k[1], k[2])] = v

            for index, parse in enumerate(
                    raw_data["auto"][document][part]["parses"]):
                text_raw_data = raw_data["auto"][document][part]["text"][index]
                word_count += len(text_raw_data)

                # Skip sentences that could not be parsed.
                if parse.subtrees[0].label == "NOPARSE":
                    continue
                head_raw_data = raw_data["auto"][document][part]["heads"][index]

                root_node = Node()
                nodes = construct_node(root_node, parse, ner_raw_data[index],
                                       head_raw_data, text_raw_data,
                                       character_to_index, word_to_index,
                                       pos_to_index, pos_count, ne_count,
                                       pos_ne_count)
                root_node.nodes = nodes

                tree_list.append(root_node)
                ner_list.append(ner_raw_data[index])

    log(" %d sentences\n" % len(tree_list))
    return tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count
Exemplo n.º 5
0
def get_tree_data(sentence_list, parse_list, ner_list, character_to_index,
                  word_to_index, pos_to_index, index_to_lexicon):
    """Get tree structured data from CoNLL-2003.

    Builds one Node tree per parse, plus dense nodes for short spans not
    covered by the parse, and gathers corpus statistics.

    Returns:
        (tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count,
         lexicon_hits)
    """
    # NOTE: the docstring used to follow this log() call, which made it a
    # dead string expression instead of a real docstring (PEP 257).
    log("get_tree_data()...")
    tree_pyramid_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]  # boxed int so helpers can mutate it in place

    for index, parse in enumerate(parse_list):
        text_raw_data = sentence_list[index]
        word_count += len(text_raw_data)
        span_to_node = {}
        head_raw_data = head_finder.collins_find_heads(parse)

        root_node = Node()
        nodes = construct_node(root_node, parse, ner_list[index],
                               head_raw_data, text_raw_data,
                               character_to_index, word_to_index, pos_to_index,
                               index_to_lexicon, pos_count, ne_count,
                               pos_ne_count, lexicon_hits, span_to_node)
        root_node.nodes = nodes
        root_node.tokens = len(text_raw_data)

        # Add pyramid nodes for short spans missing from the parse.
        additional_node_list = create_dense_nodes(ner_list[index],
                                                  text_raw_data, pos_to_index,
                                                  index_to_lexicon, pos_count,
                                                  ne_count, pos_ne_count,
                                                  lexicon_hits, span_to_node)

        tree_pyramid_list.append((root_node, additional_node_list))

    log(" %d sentences\n" % len(tree_pyramid_list))
    return (tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count,
            lexicon_hits[0])
Exemplo n.º 6
0
def get_tree_data(raw_data, character_to_index, word_to_index, pos_to_index):
    """Get tree structured data from CoNLL 2012.

    Iterates the auto parses (NER labels come from the gold annotations),
    builds one Node tree per sentence and gathers corpus statistics.

    Returns:
        (tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count)
    """
    log("get_tree_data()...")
    tree_list = []
    ner_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)

    for document in raw_data["auto"]:
        for part in raw_data["auto"][document]:
            # Re-key gold NER: sentence index -> {(start, end): label}.
            # FIX: .iteritems() is Python 2 only; .items() works on Python 3.
            ner_raw_data = defaultdict(lambda: {})
            for k, v in raw_data["gold"][document][part]["ner"].items():
                ner_raw_data[k[0]][(k[1], k[2])] = v

            for index, parse in enumerate(raw_data["auto"][document][part]["parses"]):
                text_raw_data = raw_data["auto"][document][part]["text"][index]
                word_count += len(text_raw_data)

                # Skip sentences that could not be parsed.
                if parse.subtrees[0].label == "NOPARSE":
                    continue
                head_raw_data = raw_data["auto"][document][part]["heads"][index]

                root_node = Node()
                nodes = construct_node(
                    root_node, parse, ner_raw_data[index], head_raw_data, text_raw_data,
                    character_to_index, word_to_index, pos_to_index,
                    pos_count, ne_count, pos_ne_count)
                root_node.nodes = nodes

                tree_list.append(root_node)
                ner_list.append(ner_raw_data[index])

    log(" %d sentences\n" % len(tree_list))
    return tree_list, ner_list, word_count, pos_count, ne_count, pos_ne_count
Exemplo n.º 7
0
def get_tree_data(sentence_list, parse_list, ner_list,
        character_to_index, word_to_index, pos_to_index, index_to_lexicon):
    """Get tree structured data from CoNLL-2003.

    Builds one Node tree per parse, plus dense nodes for short spans not
    covered by the parse, and gathers corpus statistics.

    Returns:
        (tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count,
         lexicon_hits)
    """
    # NOTE: the docstring used to follow this log() call, which made it a
    # dead string expression instead of a real docstring (PEP 257).
    log("get_tree_data()...")
    tree_pyramid_list = []
    word_count = 0
    pos_count = defaultdict(lambda: 0)
    ne_count = defaultdict(lambda: 0)
    pos_ne_count = defaultdict(lambda: 0)
    lexicon_hits = [0]  # boxed int so helpers can mutate it in place

    for index, parse in enumerate(parse_list):
        text_raw_data = sentence_list[index]
        word_count += len(text_raw_data)
        span_to_node = {}
        head_raw_data = head_finder.collins_find_heads(parse)

        root_node = Node()
        nodes = construct_node(
            root_node, parse, ner_list[index], head_raw_data, text_raw_data,
            character_to_index, word_to_index, pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)
        root_node.nodes = nodes
        root_node.tokens = len(text_raw_data)

        # Add pyramid nodes for short spans missing from the parse.
        additional_node_list = create_dense_nodes(
            ner_list[index], text_raw_data,
            pos_to_index, index_to_lexicon,
            pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node)

        tree_pyramid_list.append((root_node, additional_node_list))

    log(" %d sentences\n" % len(tree_pyramid_list))
    return tree_pyramid_list, word_count, pos_count, ne_count, pos_ne_count, lexicon_hits[0]
Exemplo n.º 8
0
def dependency_to_constituency(word_list, pos_list, head_list, relation_list,
                               index):
    """Convert the dependency subtree rooted at *index* into a binary
    constituency tree of Node objects and return its root.

    Each dependent is attached one at a time under a fresh internal node
    labeled with the dependency relation; the head word propagates upward.
    """
    # Start with the leaf for the head token itself.
    current = Node()
    current.word = word_list[index]
    current.pos = pos_list[index]
    current.span = (index, index + 1)
    current.head = current.word

    # Fold each dependent into the growing binary tree.
    for dependent in head_list[index]:
        subtree = dependency_to_constituency(
            word_list, pos_list, head_list, relation_list, dependent)
        wrapper = Node()
        wrapper.word = None
        wrapper.pos = relation_list[dependent]
        # Order children left-to-right by surface position.
        if dependent < index:
            left, right = subtree, current
        else:
            left, right = current, subtree
        wrapper.span = (left.span[0], right.span[1])
        wrapper.add_child(left)
        wrapper.add_child(right)
        wrapper.head = current.head
        current = wrapper

    return current
Exemplo n.º 9
0
def create_dense_nodes(ner_raw_data, text_raw_data, pos_to_index,
                       index_to_lexicon, pos_count, ne_count, pos_ne_count,
                       lexicon_hits, span_to_node):
    """Create Node objects for 2- and 3-token spans that no parse
    constituent covers, and return them as a list.

    Updates pos_count/ne_count/pos_ne_count, lexicon_hits[0] and
    span_to_node in place.
    """
    dense_nodes = []
    max_dense_span = 3
    # Unigrams are always covered by the parse, so begin with bigrams.
    for length in range(2, max_dense_span + 1):
        for start in range(len(text_raw_data) - length + 1):
            span = (start, start + length)
            if span in span_to_node:
                continue
            label = "NONE"
            ne = ner_raw_data.get(span, "NONE")
            constituent = " ".join(text_raw_data[start:start + length]).lower()

            # span, child
            # TODO: sibling
            node = Node()
            dense_nodes.append(node)
            node.span = span
            span_to_node[span] = node
            # Children are the two overlapping (length-1)-token spans.
            node.child_list = [span_to_node[(start, start + length - 1)],
                               span_to_node[(start + 1, start + length)]]

            # word, head, pos: dense nodes carry no surface word or head.
            node.pos_index = pos_to_index[label]
            pos_count[label] += 1
            node.word_split = []
            node.word_index = -1
            node.head_split = []
            node.head_index = -1

            # ne
            node.ne = ne
            if ne != "NONE":
                ne_count[ne] += 1
                pos_ne_count[label] += 1

            # lexicon: flag every lexicon containing this constituent;
            # lexicon_hits counts spans with at least one match.
            node.lexicon_hit = [0] * len(index_to_lexicon)
            matched = 0
            for lexicon_index, lexicon in index_to_lexicon.items():
                if constituent in lexicon:
                    node.lexicon_hit[lexicon_index] = 1
                    matched = 1
            lexicon_hits[0] += matched

    return dense_nodes
Exemplo n.º 10
0
def construct_node(node, tree, ner_raw_data, head_raw_data, text_raw_data,
                   character_to_index, word_to_index, pos_to_index,
                   index_to_lexicon, pos_count, ne_count, pos_ne_count,
                   lexicon_hits, span_to_node):
    """Recursively fill *node* from parse subtree *tree*.

    Binarizes n-ary productions in place (mutates tree.subtrees), records
    every span in span_to_node, updates the POS/NE count dicts, and adds
    lexicon match counts to lexicon_hits[0].

    Returns:
        int: number of Node objects in this subtree (including *node*).
    """
    pos = tree.label
    word = tree.word  # may be falsy for non-terminal nodes (handled below)
    span = tree.span
    # Prefer a head cached on the tree (set during binarization below);
    # otherwise look it up by (span, pos).
    head = tree.head if hasattr(tree, "head") else head_raw_data[(span,
                                                                  pos)][1]
    ne = ner_raw_data[span] if span in ner_raw_data else "NONE"
    # Lower-cased surface string of the span, used for lexicon matching.
    constituent = " ".join(text_raw_data[span[0]:span[1]]).lower()

    # Process pos info
    node.pos_index = pos_to_index[pos]
    pos_count[pos] += 1

    # Process word info (character indices only exist when word is truthy)
    node.word_split = [character_to_index[character]
                       for character in word] if word else []
    node.word_index = word_to_index[word] if word else -1

    # Process head info
    node.head_split = [character_to_index[character] for character in head]
    #if head == "-LSB-": print(text_raw_data
    node.head_index = word_to_index[head]

    # Process ne info: count each span's NE label once (skip unary
    # parents that repeat the same span).
    node.ne = ne
    if ne != "NONE":
        if not node.parent or node.parent.span != span:
            ne_count[ne] += 1
        pos_ne_count[pos] += 1

    # Process span info
    node.span = span
    span_to_node[span] = node

    # Process lexicon info: flag every lexicon containing this
    # constituent; lexicon_hits counts spans with at least one match.
    node.lexicon_hit = [0] * len(index_to_lexicon)
    hits = 0
    for index, lexicon in index_to_lexicon.items():
        if constituent in lexicon:
            node.lexicon_hit[index] = 1
            hits = 1
    lexicon_hits[0] += hits

    # Binarize children: peel off the last child when its head differs
    # from the parent's head (otherwise the first child) and wrap the
    # remaining children in a new subtree with the same label and head.
    if len(tree.subtrees) > 2:
        side_child_pos = tree.subtrees[-1].label
        side_child_span = tree.subtrees[-1].span
        side_child_head = head_raw_data[(side_child_span, side_child_pos)][1]
        if side_child_head != head:
            sub_subtrees = tree.subtrees[:-1]
        else:
            sub_subtrees = tree.subtrees[1:]
        new_span = (sub_subtrees[0].span[0], sub_subtrees[-1].span[1])
        new_tree = pstree.PSTree(label=pos,
                                 span=new_span,
                                 subtrees=sub_subtrees)
        new_tree.head = head
        if side_child_head != head:
            tree.subtrees = [new_tree, tree.subtrees[-1]]
        else:
            tree.subtrees = [tree.subtrees[0], new_tree]

    # Process children recursively; after binarization at most two remain.
    nodes = 1
    for subtree in tree.subtrees:
        child = Node()
        node.add_child(child)
        child_nodes = construct_node(child, subtree, ner_raw_data,
                                     head_raw_data, text_raw_data,
                                     character_to_index, word_to_index,
                                     pos_to_index, index_to_lexicon, pos_count,
                                     ne_count, pos_ne_count, lexicon_hits,
                                     span_to_node)
        nodes += child_nodes
    return nodes
Exemplo n.º 11
0
def construct_node(node, tree, ner_raw_data, head_raw_data, text_raw_data,
                   character_to_index, word_to_index, pos_to_index,
                   lexicon_list, pos_count, ne_count, pos_ne_count,
                   lexicon_hits, span_to_node, under_ne):
    """Recursively fill *node* from parse subtree *tree*.

    Binarizes n-ary productions in place (mutates tree.subtrees), records
    every span in span_to_node, updates the POS/NE count dicts, increments
    matched entries in each lexicon, and adds lexicon match counts to
    lexicon_hits[0]. *under_ne* marks nodes dominated by a named-entity
    span and is propagated to all descendants once an NE is seen.

    Returns:
        int: number of Node objects in this subtree (including *node*).
    """
    pos = tree.label
    word = tree.word  # may be falsy for non-terminal nodes (handled below)
    span = tree.span
    # Prefer a head cached on the tree (set during binarization below);
    # otherwise look it up by (span, pos).
    head = tree.head if hasattr(tree, "head") else head_raw_data[(span,
                                                                  pos)][1]
    ne = ner_raw_data[span] if span in ner_raw_data else "NONE"
    # Lower-cased surface string of the span, used for lexicon matching.
    constituent = " ".join(text_raw_data[span[0]:span[1]]).lower()

    # Process pos info
    node.pos_index = pos_to_index[pos]
    pos_count[pos] += 1
    node.pos = pos  #YOLO

    # Process word info (character indices only exist when word is truthy)
    node.word_split = [character_to_index[character]
                       for character in word] if word else []
    node.word_index = word_to_index[word] if word else -1
    node.word = word if word else ""  # YOLO

    # Process head info
    node.head_split = [character_to_index[character] for character in head]
    node.head_index = word_to_index[head]
    node.head = head  # YOLO

    # Process ne info: count each span's NE label once (skip unary
    # parents that repeat the same span); flag descendants via under_ne.
    node.under_ne = under_ne
    node.ne = ne
    if ne != "NONE":
        under_ne = True
        if not node.parent or node.parent.span != span:
            ne_count[ne] += 1
        pos_ne_count[pos] += 1
        """
        if hasattr(tree, "head"):
            print " ".join(text_raw_data)
            print " ".join(text_raw_data[span[0]:span[1]])
            print ne
            print node.parent.head
            raw_input()
        """
    # Process span info
    node.span = span
    node.span_length = span[1] - span[0]
    span_to_node[span] = node

    # Process lexicon info: flag every lexicon containing this
    # constituent and bump that lexicon's own entry count;
    # lexicon_hits counts spans with at least one match.
    node.lexicon_hit = [0] * len(lexicon_list)
    hits = 0
    for index, lexicon in enumerate(lexicon_list):
        if constituent in lexicon:
            lexicon[constituent] += 1
            node.lexicon_hit[index] = 1
            hits = 1
    lexicon_hits[0] += hits

    # Binarize children: peel off the last child when its head differs
    # from the parent's head (otherwise the first child) and wrap the
    # remaining children in a new subtree with the same label and head.
    if len(tree.subtrees) > 2:
        side_child_pos = tree.subtrees[-1].label
        side_child_span = tree.subtrees[-1].span
        side_child_head = head_raw_data[(side_child_span, side_child_pos)][1]
        if side_child_head != head:
            sub_subtrees = tree.subtrees[:-1]
        else:
            sub_subtrees = tree.subtrees[1:]
        new_span = (sub_subtrees[0].span[0], sub_subtrees[-1].span[1])
        new_tree = PSTree(label=pos, span=new_span, subtrees=sub_subtrees)
        new_tree.head = head
        if side_child_head != head:
            tree.subtrees = [new_tree, tree.subtrees[-1]]
        else:
            tree.subtrees = [tree.subtrees[0], new_tree]

    # Process children recursively; after binarization at most two remain.
    nodes = 1
    for subtree in tree.subtrees:
        child = Node()
        node.add_child(child)
        child_nodes = construct_node(child, subtree, ner_raw_data,
                                     head_raw_data, text_raw_data,
                                     character_to_index, word_to_index,
                                     pos_to_index, lexicon_list, pos_count,
                                     ne_count, pos_ne_count, lexicon_hits,
                                     span_to_node, under_ne)
        nodes += child_nodes
    return nodes
Exemplo n.º 12
0
def dependency_to_constituency(word_list, pos_list, head_list, relation_list, index):
    """Convert the dependency subtree rooted at *index* into a binary
    constituency tree of Node objects and return its root.

    Dependents are folded in one by one under internal nodes labeled
    with the dependency relation; the head word propagates upward.
    """
    # Leaf node for the head token itself.
    current = Node()
    current.word = word_list[index]
    current.pos = pos_list[index]
    current.span = (index, index + 1)
    current.head = current.word

    for dependent in head_list[index]:
        subtree = dependency_to_constituency(
            word_list, pos_list, head_list, relation_list, dependent)
        wrapper = Node()
        wrapper.word = None
        wrapper.pos = relation_list[dependent]
        # Keep children ordered left-to-right by surface position.
        left, right = (subtree, current) if dependent < index else (current, subtree)
        wrapper.span = (left.span[0], right.span[1])
        wrapper.add_child(left)
        wrapper.add_child(right)
        wrapper.head = current.head
        current = wrapper

    return current
Exemplo n.º 13
0
def create_dense_nodes(ner_raw_data, text_raw_data, pos_to_index, lexicon_list,
        pos_count, ne_count, pos_ne_count, lexicon_hits, span_to_node):
    """Create Node objects for 2- and 3-token spans that no parse
    constituent covers, and return them as a list.

    Updates pos_count/ne_count/pos_ne_count, the matched lexicon entries,
    lexicon_hits[0] and span_to_node in place.
    """
    dense_nodes = []
    max_dense_span = 3
    # Unigrams are always covered by the parse, so begin with bigrams.
    for length in range(2, max_dense_span + 1):
        for start in range(len(text_raw_data) - length + 1):
            span = (start, start + length)
            if span in span_to_node:
                continue
            label = "NONE"
            ne = ner_raw_data.get(span, "NONE")
            constituent = " ".join(text_raw_data[start:start + length]).lower()

            # span, child
            # TODO: sibling
            node = Node(family=1)
            dense_nodes.append(node)
            node.span = span
            node.span_length = length
            span_to_node[span] = node
            # Children are the two overlapping (length-1)-token spans.
            node.child_list = [span_to_node[(start, start + length - 1)],
                               span_to_node[(start + 1, start + length)]]

            # word, head, pos: dense nodes carry no surface word or head.
            node.pos_index = pos_to_index[label]
            pos_count[label] += 1
            node.word_split = []
            node.word_index = -1
            node.head_split = []
            node.head_index = -1

            # ne
            node.ne = ne
            if ne != "NONE":
                ne_count[ne] += 1
                pos_ne_count[label] += 1

            # lexicon: flag matching lexica, bump each matched entry's
            # count; lexicon_hits counts spans with at least one match.
            node.lexicon_hit = [0] * len(lexicon_list)
            matched = 0
            for lexicon_index, lexicon in enumerate(lexicon_list):
                if constituent in lexicon:
                    lexicon[constituent] += 1
                    node.lexicon_hit[lexicon_index] = 1
                    matched = 1
            lexicon_hits[0] += matched

    return dense_nodes