Пример #1
0
 def get_oracle(self,
                configuration: Configuration,
                tree: DependencyTree) -> str:
     """
     Return the transition a static oracle picks for ``configuration``.

     ``tree`` is consulted for the reference heads and labels. The result is
     ``"L(<label>)"`` (left arc), ``"R(<label>)"`` (right arc) or ``"S"``
     (shift).
     """
     second = configuration.get_stack(1)
     top = configuration.get_stack(0)

     # Left arc: the second stack item is headed by the top item.
     # Index 0 is excluded here (presumably the ROOT node -- confirm).
     if second > 0 and tree.get_head(second) == top:
         return "L(" + tree.get_label(second) + ")"

     # A right arc needs a valid second item that is the head of the top item.
     if second < 0 or tree.get_head(top) != second:
         return "S"

     # Hold the right arc back while the top item still has children in
     # ``tree`` that the configuration has not attached to it yet.
     if configuration.has_other_child(top, tree):
         return "S"

     return "R(" + tree.get_label(top) + ")"
Пример #2
0
def read_conll_data(
        data_file_path: str) -> Tuple[List[Sentence], List[DependencyTree]]:
    """
    Reads Sentences and Trees from a CONLL formatted data file.

    Every line with at least 10 tab-separated columns contributes one token
    to the current sentence. Any shorter line (typically a blank line) ends
    the current sentence. A sentence still open at end of file is kept too,
    so a missing trailing blank line does not drop the last sentence.

    Parameters
    ----------
    data_file_path : ``str``
        Path to data to be read. The file is decoded as UTF-8.

    Returns
    -------
    ``Tuple[List[Sentence], List[DependencyTree]]``
        The sentences (each a list of ``Token``) and, at the same positions,
        the trees built from the head / dependency-type columns.

    Raises
    ------
    Exception
        If no sentence could be read from the file.
    """
    sentences: List[Sentence] = []
    trees: List[DependencyTree] = []

    sentence_tokens = []
    tree = DependencyTree()

    # Explicit encoding so the result does not depend on the machine's locale.
    with open(data_file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            array = line.strip().split('\t')
            # Example token row:
            # ['8', 'The', '_', 'DT', 'DT', '_', '9', 'det', '_', '_']
            if len(array) < 10:
                # Sentence boundary: store what has been collected, if anything.
                if sentence_tokens:
                    trees.append(tree)
                    sentences.append(sentence_tokens)
                    tree = DependencyTree()
                    sentence_tokens = []
                continue

            word = array[1]
            pos = array[4]
            head = int(array[6])
            dep_type = array[7]
            token = Token(word=word, pos=pos, head=head, dep_type=dep_type)
            sentence_tokens.append(token)
            tree.add(head, dep_type)

    # Flush the last sentence when the file does not end with a blank line.
    if sentence_tokens:
        trees.append(tree)
        sentences.append(sentence_tokens)

    if not sentences:
        raise Exception(f"No sentences read from {data_file_path}. "
                        f"Make sure you have not replaced tabs with spaces "
                        f"in conll formatted file by mistake.")

    return sentences, trees
Пример #3
0
 def __init__(self, sentence):
     """
     Set up the state for ``sentence``: an empty stack, an empty buffer and
     a freshly constructed ``DependencyTree``.
     """
     self.sentence = sentence
     self.tree = DependencyTree()
     self.stack, self.buffer = [], []
Пример #4
0
class Configuration:
    """
    Parser state for one sentence: a stack, a buffer and a partial tree.

    The stack and buffer hold token indices. Index 0 stands for the ROOT
    node and index ``k >= 1`` refers to ``sentence[k - 1]`` (see
    ``get_word`` / ``get_pos``).
    """

    def __init__(self, sentence):
        """
        Create the state for ``sentence`` with an empty stack, an empty
        buffer and a new ``DependencyTree``.
        """
        self.stack = []
        self.buffer = []
        self.tree = DependencyTree()
        self.sentence = sentence

    def shift(self):
        """
        Move the first buffer element onto the top of the stack.

        Returns False, leaving the state untouched, when the buffer has no
        first element; True otherwise.
        """
        k = self.get_buffer(0)
        if k == constants.NONEXIST:
            return False
        self.buffer.pop(0)
        self.stack.append(k)
        return True

    def remove_second_top_stack(self):
        """
        Remove the element just below the top of the stack.

        Returns False when the stack holds fewer than two elements.
        """
        if self.get_stack_size() < 2:
            return False
        self.stack.pop(-2)
        return True

    def remove_top_stack(self):
        """
        Remove the top element of the stack.

        Returns False when the stack holds one element or fewer, so the
        last remaining element is never removed.
        """
        if self.get_stack_size() <= 1:
            return False
        self.stack.pop()
        return True

    def get_stack_size(self):
        """Return the number of elements on the stack."""
        return len(self.stack)

    def get_buffer_size(self):
        """Return the number of elements in the buffer."""
        return len(self.buffer)

    def get_sentence_size(self):
        """Return the number of tokens in the sentence."""
        return len(self.sentence)

    def getSentenceSize(self):
        """Same as ``get_sentence_size``; kept for existing callers."""
        return self.get_sentence_size()

    def get_head(self, k):
        """Return the head currently recorded for token ``k`` in the tree."""
        return self.tree.get_head(k)

    def get_label(self, k):
        """Return the label currently recorded for token ``k`` in the tree."""
        return self.tree.get_label(k)

    def get_stack(self, k):
        """
        Get the token index of the kth word on the stack, counted from the
        top (``k = 0`` is the top element).
        If stack doesn't have an element at this index, return constants.NONEXIST
        """
        n_stack = self.get_stack_size()
        if 0 <= k < n_stack:
            return self.stack[n_stack - 1 - k]
        return constants.NONEXIST

    def get_buffer(self, k):
        """
        Get the token index of the kth word on the buffer, counted from the
        front (``k = 0`` is the first element).
        If buffer doesn't have an element at this index, return constants.NONEXIST
        """
        if 0 <= k < self.get_buffer_size():
            return self.buffer[k]
        return constants.NONEXIST

    def get_word(self, k):
        """
        Get the word at index k.

        Index 0 yields constants.ROOT, an index outside the sentence yields
        constants.NULL.
        """
        if k == 0:
            return constants.ROOT

        # Token indices are 1-based; the sentence list is 0-based.
        i = k - 1
        if i < 0 or i >= len(self.sentence):
            return constants.NULL
        return self.sentence[i].word

    def get_pos(self, k):
        """
        Get the pos at index k.

        Index 0 yields constants.ROOT, an index outside the sentence yields
        constants.NULL.
        """
        if k == 0:
            return constants.ROOT

        # Token indices are 1-based; the sentence list is 0-based.
        i = k - 1
        if i < 0 or i >= len(self.sentence):
            return constants.NULL
        return self.sentence[i].pos

    def add_arc(self, h, t, l):
        """
        Add an arc with the label l from the head node h to the dependent node t.
        """
        self.tree.set(t, h, l)

    def get_left_child(self, k, cnt):
        """
        Get cnt-th leftmost child of k.
        (i.e., if cnt = 1, the leftmost child of k will be returned,
               if cnt = 2, the 2nd leftmost child of k will be returned.)

        Only indices smaller than k are considered. Returns
        constants.NONEXIST when k is out of range or has fewer than cnt
        such children.
        """
        if k < 0 or k > self.tree.n:
            return constants.NONEXIST

        c = 0
        for i in range(1, k):
            if self.tree.get_head(i) == k:
                c += 1
                if c == cnt:
                    return i
        return constants.NONEXIST

    def get_right_child(self, k, cnt):
        """
        Get cnt-th rightmost child of k.
        (i.e., if cnt = 1, the rightmost child of k will be returned,
               if cnt = 2, the 2nd rightmost child of k will be returned.)

        Only indices greater than k are considered. Returns
        constants.NONEXIST when k is out of range or has fewer than cnt
        such children.
        """
        if k < 0 or k > self.tree.n:
            return constants.NONEXIST

        c = 0
        for i in range(self.tree.n, k, -1):
            if self.tree.get_head(i) == k:
                c += 1
                if c == cnt:
                    return i
        return constants.NONEXIST

    def has_other_child(self, k, goldTree):
        """
        Return True if some token has head k in ``goldTree`` but does not
        (yet) have head k in this configuration's tree.
        """
        return any(
            goldTree.get_head(i) == k and self.tree.get_head(i) != k
            for i in range(1, self.tree.n + 1)
        )

    def get_str(self):
        """
        Returns a string that concatenates all elements on the stack and buffer, and head / label

        The layout is ``[S]<stack>[B]<buffer>[H]<head>(<label>),...`` with
        comma-separated entries. Every value is converted with ``str`` so
        that integer token indices and heads can be rendered.
        """
        stack_part = ",".join(str(item) for item in self.stack)
        buffer_part = ",".join(str(item) for item in self.buffer)
        arcs_part = ",".join(
            f"{self.get_head(i)}({self.get_label(i)})"
            for i in range(1, self.tree.n + 1)
        )
        return f"[S]{stack_part}[B]{buffer_part}[H]{arcs_part}"