Example #1
def get_sentence_and_indexes(parsed_sentence):

  sentence_tree = Tree.fromstring(parsed_sentence)
  if sentence_tree.label() == bitpar_top:  # remove designated TOP-symbol
    sentence_tree = sentence_tree[0]

  rlist = [0] * len(sentence_tree.leaves())
  slist = [""] * len(sentence_tree.leaves())
  get_sentence_and_indexes_rec_helper(sentence_tree, rlist, slist)
  reordered_sentence = " ".join(slist)

  return reordered_sentence, rlist
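The TOP-stripping step can be tried in isolation; a minimal sketch, assuming bitpar_top holds the string "TOP" (it is defined elsewhere in the original project):

from nltk import Tree

bitpar_top = "TOP"  # assumed value; defined elsewhere in the original project

t = Tree.fromstring("(TOP (S (NP I) (VP run)))")
if t.label() == bitpar_top:
    t = t[0]
print(t.label(), t.leaves())  # S ['I', 'run']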
Example #2
def is_same_phrase(antecedent: nltk.Tree, anaphor: nltk.Tree):
    # Leaves are (word, tag) pairs; determiners and adjectives are ignored.
    partial_antecedent = []
    for word, entity in antecedent.leaves():
        if not (entity == 'DT' or entity == 'JJ'):
            partial_antecedent.append(word)

    partial_anaphor = []
    for word, entity in anaphor.leaves():
        if not (entity == 'DT' or entity == 'JJ'):
            partial_anaphor.append(word)

    if len(partial_antecedent) == len(partial_anaphor):
        for word1, word2 in zip(partial_anaphor, partial_antecedent):
            if word1 != word2:
                return False
        return True
    else:
        return False
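A minimal usage sketch, assuming the NP leaves are (word, POS) pairs as the loops above expect:

import nltk

antecedent = nltk.Tree('NP', [('the', 'DT'), ('red', 'JJ'), ('car', 'NN')])
anaphor = nltk.Tree('NP', [('car', 'NN')])
print(is_same_phrase(antecedent, anaphor))  # True: 'DT' and 'JJ' leaves are ignored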
Example #3
def extract_itg(alignments_file_name, parses_file_name, inv_extension):
    """Extract a inversion transduction grammar (ITG)
    from the given files.
    
    Keyword arguments:
    alignments_file_name -- name of file containing alignments
        between sentences in l1_file_name and l2_file_name
    parses_file_name -- name of file containing parse trees
        of the sentences in l1_file_name
    inv_extension -- extension denoting whether a node is inverted
        
    Returns a Counter of binary ITG rules and unary rules. Each ITG rule is 
    represented as the tuple (lhs, rhs), where rhs is a tuple of nodes."""
    binary_itg = Counter()
    unary_itg = Counter()
    num_lines = number_of_lines(parses_file_name)
    alignments_file = open(alignments_file_name)
    parses_file = open(parses_file_name)
    
    for i, l1_parse in enumerate(parses_file):
        if i % max(num_lines // 100, 1) == 0:
            sys.stdout.write('\r%d%%' % (i * 100 // num_lines,))
            sys.stdout.flush()

        try: # TODO remove try/catch
            reordered_indexes = str_to_reordered_indexes(next(alignments_file))
            # remove outer brackets from Berkeley parse
            l1_parse = l1_parse.strip()
            l1_parse = l1_parse[1:-1]
            l1_parse = l1_parse.strip()
            parse_tree = Tree.fromstring(l1_parse)
            parse_forest = generate_forest(parse_tree, 
                reordered_indexes, inv_extension)
        except Exception:
            error_log = open('error.log', 'a')
            error_log.write('%s -- in extract_itg/3\n' % time.asctime())
            error_log.write('line: %s\n' % i)
            error_log.write('%s\n' % l1_parse.strip())
            error_log.write('%s\n' % reordered_indexes)
            error_log.write('\n')
            error_log.close()
            print('Error in extract_itg/3. See error.log')
            raise

        binary_rules, unary_rules = extract_rules(parse_forest, 
                                                  parse_tree.leaves())
        for rule in binary_rules:
            binary_itg[rule] += 1

        for rule in unary_rules:
            unary_itg[rule] += 1

    alignments_file.close()
    parses_file.close()
    return binary_itg, unary_itg
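The bracket-stripping step above can be tried on its own; this toy line mimics Berkeley parser output, which wraps each parse in an extra pair of parentheses:

from nltk import Tree

line = '( (S (NP (DT The) (NN cat)) (VP (VBZ sleeps))) )'
inner = line.strip()[1:-1].strip()
tree = Tree.fromstring(inner)
print(tree.leaves())  # ['The', 'cat', 'sleeps']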
Example #4
def is_alias(antecedent_np: nltk.Tree, antecedent_dict, anaphor_np: nltk.Tree, anaphor_dict):
    if antecedent_dict['class'] != anaphor_dict['class']:
        return False

    if antecedent_dict['class'] == 'PERSON':
        # Same last name (the last leaf's word) and same gender.
        return (antecedent_np.leaves()[-1][0] == anaphor_np.leaves()[-1][0]
                and antecedent_dict['gender'] == anaphor_dict['gender'])

    if antecedent_dict['class'] == 'ORGANIZATION':
        # Treat the single-leaf NP as the acronym and the other NP as the
        # full name; leaves are (word, tag) pairs.
        if 1 == len(antecedent_np.leaves()) < len(anaphor_np.leaves()):
            acronym, words = antecedent_np.leaves()[0][0], anaphor_np.leaves()
        else:
            acronym, words = anaphor_np.leaves()[0][0], antecedent_np.leaves()

        if len(acronym) != len(words):
            return False

        # Each acronym letter must match the initial of the corresponding word.
        for i in range(len(acronym)):
            if acronym[i] != words[i][0][0]:
                return False

        return True

    return False
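A usage sketch for the ORGANIZATION branch, assuming (word, tag) leaves; the dictionaries here are hypothetical stand-ins for the mention attributes used in the original project:

import nltk

acronym_np = nltk.Tree('NP', [('IBM', 'NNP')])
full_np = nltk.Tree('NP', [('International', 'NNP'),
                           ('Business', 'NNP'),
                           ('Machines', 'NNP')])
print(is_alias(acronym_np, {'class': 'ORGANIZATION'},
               full_np, {'class': 'ORGANIZATION'}))  # True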
Example #5
def tree_get_ner(t: nltk.Tree) -> list:
    ner = []
    label = t.label()

    if label != 'S':
        ner = [(' '.join(l for (l, _) in t.leaves()), label)]

    for branch in t:
        if isinstance(branch, nltk.Tree):
            ner += tree_get_ner(branch)

    return ner
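A runnable sketch using NLTK's stock NE chunker (requires the punkt, averaged_perceptron_tagger, maxent_ne_chunker and words data packages); ne_chunk produces exactly the kind of 'S'-rooted tree with (word, tag) leaves that the function expects:

import nltk

tokens = nltk.word_tokenize("Barack Obama visited Paris.")
chunked = nltk.ne_chunk(nltk.pos_tag(tokens))
print(tree_get_ner(chunked))  # e.g. [('Barack Obama', 'PERSON'), ('Paris', 'GPE')]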
Example #6
    def _build_hierplane_tree(self, tree: Tree, index: int,
                              is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
        rendering trees using the `Hierplane library <https://allenai.github.io/hierplane/>`_.

        Parameters
        ----------
        tree : ``Tree``, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """
        children = []
        for child in tree:
            if isinstance(child, Tree):
                # If the child is a tree, it has children,
                # as NLTK leaves are just strings.
                children.append(
                    self._build_hierplane_tree(child, index, is_root=False))
            else:
                # We're at a leaf, so add the length of
                # the word to the character index.
                index += len(child)

        label = tree.label()
        span = " ".join(tree.leaves())
        hierplane_node = {
            "word": span,
            "nodeType": label,
            "attributes": [label],
            "link": label
        }
        if children:
            hierplane_node["children"] = children
        # TODO(Mark): Figure out how to extend span highlighting to the leaves.
        if is_root:
            hierplane_node = {
                "linkNameToLabel": LINK_TO_LABEL,
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "text": span,
                "root": hierplane_node
            }
        return hierplane_node
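For a toy parse the method returns JSON shaped roughly as below (illustrative only; LINK_TO_LABEL and NODE_TYPE_TO_STYLE are style constants defined elsewhere in the module):

# For Tree.fromstring("(S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))"), is_root=True:
{
    "linkNameToLabel": LINK_TO_LABEL,
    "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
    "text": "the cat sleeps",
    "root": {
        "word": "the cat sleeps",
        "nodeType": "S", "attributes": ["S"], "link": "S",
        "children": [
            {"word": "the cat", "nodeType": "NP", "attributes": ["NP"], "link": "NP",
             "children": [...]},   # the DT and NN leaf nodes
            {"word": "sleeps", "nodeType": "VP", "attributes": ["VP"], "link": "VP",
             "children": [...]},   # the VBZ leaf node
        ],
    },
}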
Example #8
    @classmethod
    def from_tree(
            cls,
            tree: nltk.Tree,
            label_vocab: dict,
            tag_vocab: Optional[dict] = None) -> "CompressedParserOutput":
        num_words = len(tree.leaves())
        starts = np.empty(2 * num_words, dtype=int)
        ends = np.empty(2 * num_words, dtype=int)
        labels = np.empty(2 * num_words, dtype=int)

        def helper(tree, start, write_idx):
            nonlocal starts, ends, labels
            label = []
            while len(tree) == 1 and not isinstance(tree[0], str):
                if tree.label() != "TOP":
                    label.append(tree.label())
                tree = tree[0]

            if len(tree) == 1 and isinstance(tree[0], str):
                starts[write_idx] = start
                ends[write_idx] = start + 1
                labels[write_idx] = label_vocab["::".join(label)]
                return start + 1, write_idx + 1

            label.append(tree.label())
            starts[write_idx] = start
            labels[write_idx] = label_vocab["::".join(label)]

            end = start
            new_write_idx = write_idx + 1
            for child in tree:
                end, new_write_idx = helper(child, end, new_write_idx)

            ends[write_idx] = end
            return end, new_write_idx

        _, num_constituents = helper(tree, 0, 0)
        starts = starts[:num_constituents]
        ends = ends[:num_constituents]
        labels = labels[:num_constituents]

        if tag_vocab is None:
            tags = None
        else:
            tags = np.array([tag_vocab[tag] for _, tag in tree.pos()],
                            dtype=int)

        return cls(starts=starts, ends=ends, labels=labels, tags=tags)
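A usage sketch, assuming the enclosing CompressedParserOutput class from the original source and a toy label vocabulary (in practice these vocabularies come from the trained parser):

import nltk

tree = nltk.Tree.fromstring("(TOP (S (NP (DT the) (NN cat)) (VP (VBZ sleeps))))")
label_vocab = {"": 0, "S": 1, "NP": 2, "VP": 3}  # "" labels bare leaf spans
out = CompressedParserOutput.from_tree(tree, label_vocab)
print(out.starts, out.ends, out.labels)
# [0 0 0 1 2] [3 2 1 2 3] [1 2 0 0 3]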
Example #9
def translate_nltk_tree(tree: nltk.Tree, tree_def: TreeDefinition,
                        label_map: T.Dict[str, int],
                        normalizer: T.Callable[[str], str],
                        ignore_leaves=False):
    if tree.height() > 2:
        return Tree(node_type_id="NODE",
                    children=[translate_nltk_tree(child, tree_def, label_map,
                                                  normalizer, ignore_leaves)
                              for child in tree],
                    value=tree_def.id_map["NODE"].value_type(abstract_value=tree.label()))
    else:
        normalized = normalizer(tree.leaves()[0])
        return Tree(node_type_id="PRE_LEAF",
                    children=[
                        Tree(
                            node_type_id="LEAF",
                            children=[],
                            value=tree_def.id_map["LEAF"].value_type(
                                abstract_value=label_map.get(normalized, 0))  # 0 is OOV
                        )
                    ] if not ignore_leaves else [],
                    value=tree_def.id_map["PRE_LEAF"].value_type(abstract_value=tree.label()))
Example #10
def convert_psd_sent_2_segmentation_2(parsed_corpus):

  SegCorpus = []

  for top_k_psd_of_the_sent in parsed_corpus:
    segmentation = []

    best_score, best_parse_tree_str = top_k_psd_of_the_sent[0]
    tree = Tree.fromstring(best_parse_tree_str)

    # tree = ROOT, tree[0] = S, and the subtrees of S are the POS tags;
    # alternative traversals are possible. Note that this is highly
    # dependent on the format of the parser output!
    for subtree in tree.subtrees(lambda t: t.height() == tree.height() - 2):
      segmentation.append(''.join(subtree.leaves()))

    SegCorpus.append(segmentation)

    if ''.join(segmentation) != ''.join(tree.leaves()):
      print('Error! Characters in the segmentation != characters in the tree '
            '(as leaves). Double-check the format and/or the code!')
      break

  return SegCorpus
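A toy run, assuming the corpus format the function expects: for every sentence a list of (score, parse-string) pairs with the best parse first, and single characters as leaves:

parsed_corpus = [
    [(-12.3, "(ROOT (S (NP (NN 北) (NN 京)) (VP (VV 欢) (VV 迎) (PN 你))))")],
]
print(convert_psd_sent_2_segmentation_2(parsed_corpus))  # [['北京', '欢迎你']]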
Example #11
def tree_get_pos(t: nltk.Tree) -> list:
    # With (word, tag) leaves this returns the POS-tagged tokens in order.
    return list(t.leaves())
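A usage sketch with (word, tag) leaves:

import nltk

t = nltk.Tree('S', [nltk.Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sleeps', 'VBZ')])
print(tree_get_pos(t))  # [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]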
Example #12
f = codecs.open(path_annotation, 'r', 'utf-8')
lines = f.readlines()
f.close()

Production = []
word_pos2tree_str = {}  # (word, tag) -> tree string; defined earlier in the original script

count = 0
total_nth = max(int(len(lines) / 10), 1)
for line in lines:
  if count % total_nth == 0:
    print(count // total_nth * 10, '% finished')
  count += 1

  tree = Tree.fromstring(line.strip())
  tag, subscript = decompose_tag(tree.label())  # decompose_tag: project-local helper
  word = ''.join(tree.leaves())

  word_pos2tree_str[(word, tag)] = line.strip()


print('done!')

#
# gen single-char annotation from the corpus
#

print('\n\ngenerating rules for single-char words from corpus')

#---> one needs to run 2a_gen_tag_set_for_word_type.py to generate word2newtag.pickle before using this
path_word2newtag='../working_data/word2newtag.pickle'
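decompose_tag above is a project-local helper that is not shown on this page; a purely hypothetical sketch of what it might do, assuming subscripted tags such as 'NN-1':

def decompose_tag(tag):
    # Hypothetical: split 'NN-1' into ('NN', '1'); the real helper lives
    # elsewhere in the original project and may behave differently.
    base, _, subscript = tag.partition('-')
    return base, subscript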
Example #13
def is_proper_name(noun_phrase: nltk.Tree):
    for word, typ in noun_phrase.leaves():
        # The empty list is a placeholder for POS tags exempt from the
        # title-case requirement; as written, every word must be title-cased.
        if (not word.istitle()) and (typ not in []):
            return False
    return True
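A quick check, assuming (word, tag) leaves:

import nltk

np1 = nltk.Tree('NP', [('John', 'NNP'), ('Smith', 'NNP')])
np2 = nltk.Tree('NP', [('the', 'DT'), ('president', 'NN')])
print(is_proper_name(np1), is_proper_name(np2))  # True False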
Example #14
    def _build_hierplane_tree(self, tree: Tree, index: int,
                              start_token_index: int,
                              is_root: bool) -> JsonDict:
        """
        Recursively builds a JSON dictionary from an NLTK ``Tree`` suitable for
        rendering trees using the `Hierplane library <https://allenai.github.io/hierplane/>`_.

        Parameters
        ----------
        tree : ``Tree``, required.
            The tree to convert into Hierplane JSON.
        index : int, required.
            The character index into the tree, used for creating spans.
        start_token_index : int, required.
            The token index of the left-most leaf of this tree, used for
            storing span indices in the tree nodes.
        is_root : bool
            An indicator which allows us to add the outer Hierplane JSON which
            is required for rendering.

        Returns
        -------
        A JSON dictionary render-able by Hierplane for the given tree.
        """
        children = []
        prev_children_num_tokens = 0
        for child in tree:
            if isinstance(child, Tree):
                # If the child is a tree, it has children,
                # as NLTK leaves are just strings.
                children.append(
                    self._build_hierplane_tree(child,
                                               index,
                                               start_token_index +
                                               prev_children_num_tokens,
                                               is_root=False))
                # The next child's starting index is offset by the sum of
                # the token counts of all children to its left.
                num_tokens_child = len(child.leaves())
                prev_children_num_tokens += num_tokens_child
            else:
                # We're at a leaf, so add the length of
                # the word to the character index.
                index += len(child)

        label = tree.label()
        span = " ".join(tree.leaves())
        # Span indexing works because children are traversed left-to-right:
        # the ConstituencyParser model builds its NLTK trees in that order.
        num_tokens = len(span.split(" "))
        hierplane_node = {
            "word": span,
            "start": start_token_index,
            "end": start_token_index + num_tokens,
            "nodeType": label,
            "attributes": [label],
            "link": label,
        }
        if children:
            hierplane_node["children"] = children
        # TODO(Mark): Figure out how to extend span highlighting to the leaves.
        if is_root:
            hierplane_node = {
                "linkNameToLabel": LINK_TO_LABEL,
                "nodeTypeToStyle": NODE_TYPE_TO_STYLE,
                "text": span,
                "root": hierplane_node,
            }
        return hierplane_node
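Compared with Example #6, each node now also carries token-level span indices; an illustrative NP node for the same toy sentence:

# For "(S (NP (DT the) (NN cat)) (VP (VBZ sleeps)))", the NP child spans tokens [0, 2):
{
    "word": "the cat",
    "start": 0,
    "end": 2,
    "nodeType": "NP",
    "attributes": ["NP"],
    "link": "NP",
    # plus a "children" list holding the DT and NN leaf nodes
}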