Example #1
    def __init__(self, corpus_train):

        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_tags, self.PCFG.freq_tokens)

        self.tag_to_id = {tag: i for (i, tag) in enumerate(self.PCFG.list_all_tags)}

        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]

        # self.grammar_dicts[X][Y][Z] stores P(rule X->YZ)
        self.grammar_dicts = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left hand tag of the grammar rule
            idx_root_tag = self.tag_to_id[root_tag]
            self.grammar_dicts[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items():  # split is the right hand term, and proba the probability of the rule
                idx_left_tag = self.tag_to_id[split[0]]
                idx_right_tag = self.tag_to_id[split[1]]
                if idx_left_tag in dico.keys():
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_dicts[idx_root_tag] = dico
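A minimal, self-contained sketch of the nested-dict layout this fragment builds (the toy grammar, tag names and probabilities below are invented for illustration): tags are mapped to integer ids and P(X -> Y Z) is stored as grammar_dicts[X][Y][Z].

# Toy illustration of the nested-dict grammar index; all values are made up.
grammar = {"S": {("NP", "VP"): 1.0},
           "VP": {("V", "NP"): 0.7, ("V", "PP"): 0.3}}
tags = ["S", "NP", "VP", "V", "PP"]
tag_to_id = {tag: i for i, tag in enumerate(tags)}

grammar_dicts = {}
for root_tag, rules in grammar.items():
    dico = {}
    for (left, right), proba in rules.items():
        # setdefault is a more compact alternative to the if/else used above
        dico.setdefault(tag_to_id[left], {})[tag_to_id[right]] = proba
    grammar_dicts[tag_to_id[root_tag]] = dico

print(grammar_dicts[tag_to_id["VP"]][tag_to_id["V"]][tag_to_id["NP"]])  # 0.7, i.e. P(VP -> V NP)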
Example #2
    def __init__(self, corpus):

        # PCFG and OOV class
        self.pcfg = PCFG(corpus)
        self.oov = OOV(self.pcfg.lexicon, self.pcfg.list_all_tags,
                       self.pcfg.tokens)

        # Initialize CYK probability matrix
        self.proba_matrix = None
        self.cyk_matrix = None
Example #3
 def __init__(self, fname):
     self.count1 = {}
     self.count2 = {}
     self.count3 = {}
     self.rules = {}
     self.lexicon = {}
     self.nt = []
     self.oov = OOV('polyglot-fr.pkl')
     with open(fname, 'r') as f:
         self.training_corpus = f.readlines()
Example #4
    def __init__(self, corpus_train):

        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_symbols,
                       self.PCFG.freq_tokens)

        #note : if the id of a symbol is above self.PCFG.nb_tags,
        #it's an artificial symbol introduced with Chomsky normalization
        self.symbol_to_id = {
            symbol: i
            for (i, symbol) in enumerate(self.PCFG.list_all_symbols)
        }

        #instead of storing tags, we store grammar rules with their corresponding indices in grammar_ids,
        #with an additional hierarchical level to speed up lookups:
        #in other words, self.grammar_ids[X][Y][Z] stores P(rule X->YZ),
        #where self.grammar_ids, self.grammar_ids[X], and self.grammar_ids[X][Y] are all dictionaries
        self.grammar_ids = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left hand symbol of the grammar rule
            idx_root_tag = self.symbol_to_id[root_tag]
            self.grammar_ids[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items(
            ):  #split is the right hand term, and proba the probability of the rule
                idx_left_tag = self.symbol_to_id[split[0]]
                idx_right_tag = self.symbol_to_id[split[1]]
                if idx_left_tag in dico.keys():
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_ids[idx_root_tag] = dico

        #for a given word, what are its tags and the corresponding probabilities P(tag -> word)?
        #this is what self.lexicon_inverted stores
        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]
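A small sketch of the lexicon inversion performed just above, assuming a toy lexicon of the same shape (lexicon[tag][word] = P(tag -> word); the words and probabilities are invented):

# Invert a toy lexicon from tag -> word -> proba into word -> tag -> proba.
lexicon = {"DET": {"le": 0.6, "la": 0.4},
           "NC": {"chat": 0.5, "la": 0.01}}

lexicon_inverted = {}
for tag, words in lexicon.items():
    for word, proba in words.items():
        lexicon_inverted.setdefault(word, {})[tag] = proba

print(lexicon_inverted["la"])  # {'DET': 0.4, 'NC': 0.01}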
Example #5
 def __init__(self):
     self.oov = OOV()
     self.train = []
     self.test = []
     self.poses = set()
     # ex// Un: 56
     self.tokens = defaultdict(int)
     # ex// (A, B): 41
     self.count_grammar = defaultdict(int)
     # ex// (A, a): 11
     self.count_lexicon = defaultdict(int)
     # ex// Un: [N, NP]
     self.token_to_pos = defaultdict(set)
     # ex// (B, C) : [A1, A2, A3]
     self.right_to_pos = defaultdict(set)
     # ex// A: 22
     self.preterminals_pos = defaultdict(int)
     # ex// (N, Un): 0.23
     self.prob_pos_to_token = defaultdict(int)
     # ex// (A, (B, C)): 56
     self.count_left_to_right = defaultdict(int)
     # ex// (A, (B, C)): 0.11
     self.prob_left_to_right = defaultdict(int)
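For context, a hedged sketch of how count dictionaries like these are typically turned into rule probabilities (relative-frequency / maximum-likelihood estimates); the rule counts below are invented:

from collections import defaultdict

# Invented rule counts: (lhs, rhs) -> count
count_grammar = {("S", ("NP", "VP")): 90,
                 ("VP", ("V", "NP")): 60,
                 ("VP", ("V", "PP")): 40}

# Total count per left-hand side
count_left_to_right = defaultdict(int)
for (lhs, _), count in count_grammar.items():
    count_left_to_right[lhs] += count

# P(lhs -> rhs) = count(lhs -> rhs) / count(lhs -> anything)
prob_left_to_right = {(lhs, rhs): count / count_left_to_right[lhs]
                      for (lhs, rhs), count in count_grammar.items()}

print(prob_left_to_right[("VP", ("V", "NP"))])  # 0.6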
Example #6
class CYK:
    def __init__(self, corpus):

        # PCFG and OOV class
        self.pcfg = PCFG(corpus)
        self.oov = OOV(self.pcfg.lexicon, self.pcfg.list_all_tags,
                       self.pcfg.tokens)

        # Initialize CYK probability matrix
        self.proba_matrix = None
        self.cyk_matrix = None

    # Apply the CYK algorithm
    def CYK_algorithm(self, sentence):

        # Initialize
        n = len(sentence)
        r = self.pcfg.nb_all_tags
        P = np.zeros((n, n, r))
        cyk_matrix = np.zeros((n, n, r, 3))

        # First level P[0,:,:]
        for idx_word, word in enumerate(sentence):

            # Get closest word in the lexicon
            word = self.oov.closest_word(word)

            if word is None:
                for idx_tag, tag in enumerate(self.pcfg.list_all_tags):
                    if tag in self.pcfg.terminal_tags:
                        P[0, idx_word, idx_tag] = self.pcfg.terminal_tags[tag]

            else:
                for idx_tag, tag in enumerate(self.pcfg.list_all_tags):
                    if tag in self.pcfg.inv_lexicon[word]:
                        P[0, idx_word,
                          idx_tag] = self.pcfg.inv_lexicon[word][tag]

        # Other levels
        for l in range(1, n):

            for s in range(n - l):

                for tag in self.pcfg.grammar:
                    idx_tag = self.pcfg.dic_all_tags[tag]

                    for p in range(l):

                        for rule in self.pcfg.grammar[tag]:
                            left_tag = rule.split(' ')[0]
                            right_tag = rule.split(' ')[1]
                            b = self.pcfg.dic_all_tags[left_tag]
                            c = self.pcfg.dic_all_tags[right_tag]

                            prob_splitting = self.pcfg.grammar[tag][rule] * P[
                                p, s, b] * P[l - p - 1, s + p + 1, c]

                            if prob_splitting > P[l, s, idx_tag]:
                                P[l, s, idx_tag] = prob_splitting
                                cyk_matrix[l, s, idx_tag] = [p, b, c]

        self.proba_matrix = P
        self.cyk_matrix = cyk_matrix.astype(int)

    # Remove new tags and de-telescope tags
    def clean_tags(self, tree):
        # remove new tags of type X|X1X2X3... (coming from the BIN rule)
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))

            if len(children) == 0:
                pass

            elif len(children) == 1 and len(list(tree.successors(
                    children[0]))) == 0:
                pass

            else:
                parent = list(tree.predecessors(node))
                if len(parent) == 0:
                    pass
                else:
                    tag = tree.nodes[node]["name"]

                    if (self.pcfg.dic_all_tags[tag] >=
                            self.pcfg.nb_tags) and ("|" in tag):

                        for child in tree.successors(node):
                            tree.add_edge(parent[0], child)
                        tree.remove_node(node)

        # Decomposing A&B -> w into A -> B -> w
        max_id_node = np.max(tree.nodes())
        nodes = deepcopy(tree.nodes)
        for node in nodes:

            children = list(tree.successors(node))

            if len(children) == 0 or len(list(tree.predecessors(node))) == 0:
                pass

            elif len(children) == 1 and len(list(tree.successors(
                    children[0]))) == 0:
                tag = tree.nodes[node]["name"]

                if (self.pcfg.dic_all_tags[tag] >= self.pcfg.nb_tags) and (
                        "&" in tag):  # new tag from unit rule
                    word = children[0]

                    idx_cut = None

                    for (idx, c) in enumerate(tag):
                        if c == "&":
                            idx_cut = idx

                    tree.nodes[node]["name"] = tag[:idx_cut]

                    idx_pre_terminal_node = max_id_node + 1
                    tree.add_node(idx_pre_terminal_node,
                                  name=tag[idx_cut + 1:])
                    max_id_node += 1
                    tree.remove_edge(node, word)
                    tree.add_edge(node, idx_pre_terminal_node)
                    tree.add_edge(idx_pre_terminal_node, word)

    # Parse part of a sentence
    def parse_substring(self, s, l, idx_tag, sentence):

        if l == 0:
            return sentence[s]

        else:
            cut = self.cyk_matrix[l, s, idx_tag, 0]
            idx_left_tag = self.cyk_matrix[l, s, idx_tag, 1]
            idx_right_tag = self.cyk_matrix[l, s, idx_tag, 2]

            left_tag = self.pcfg.list_all_tags[idx_left_tag]
            right_tag = self.pcfg.list_all_tags[idx_right_tag]

            return [[
                left_tag,
                self.parse_substring(s, cut, idx_left_tag, sentence)
            ],
                    [
                        right_tag,
                        self.parse_substring(s + cut + 1, l - cut - 1,
                                             idx_right_tag, sentence)
                    ]]

    # Returns the parsed sentence
    def parse(self, sentence):

        sentence = sentence.split(' ')
        length_sentence = len(sentence)

        if length_sentence > 1:
            self.CYK_algorithm(sentence)
            idx_root_tag = self.pcfg.dic_all_tags['SENT']
            if self.proba_matrix[length_sentence -
                                 1][0][idx_root_tag] == 0:  # no valid parsing
                return None
            parsing_list = self.parse_substring(0, length_sentence - 1,
                                                idx_root_tag, sentence)

        else:
            word = sentence[0]
            word_lexicon = self.oov.closest_word(word)

            if word_lexicon is None:
                tag = max(self.pcfg.terminal_tags,
                          key=self.pcfg.terminal_tags.get)

            else:
                tag = max(self.pcfg.inv_lexicon[word_lexicon],
                          key=self.pcfg.inv_lexicon[word_lexicon].get)

            parsing_list = '(' + tag + ' ' + word + ')'

        # converting the parsing stored as a string into a tree
        tree = tagged_sent_to_tree(
            "( (SENT " + list_to_parsed_sentence(parsing_list) + "))",
            remove_after_hyphen=False)

        self.clean_tags(tree)

        return tree_to_sentence(tree)
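To make the dynamic program in CYK_algorithm concrete, here is a minimal, self-contained version of the same recurrence on a toy CNF grammar (grammar, lexicon and sentence are invented; the real class additionally handles OOV words and tag/id bookkeeping):

import numpy as np

# Toy CNF grammar P(X -> Y Z) and toy lexicon P(X -> word)
binary = {("S", "NP", "VP"): 1.0, ("VP", "V", "NP"): 1.0}
lexicon = {("NP", "elle"): 0.5, ("V", "voit"): 1.0, ("NP", "Paris"): 0.5}
tags = ["S", "NP", "VP", "V"]
tid = {t: i for i, t in enumerate(tags)}

sentence = ["elle", "voit", "Paris"]
n, r = len(sentence), len(tags)
P = np.zeros((n, n, r))   # P[l, s, a]: best probability of tag a spanning l+1 words from position s
back = {}                 # back[(l, s, a)] = (cut, b, c), the split reaching P[l, s, a]

for s, word in enumerate(sentence):            # spans of length 1
    for (tag, w), proba in lexicon.items():
        if w == word:
            P[0, s, tid[tag]] = proba

for l in range(1, n):                          # longer spans
    for s in range(n - l):
        for (a, b, c), p_rule in binary.items():
            for cut in range(l):
                p = p_rule * P[cut, s, tid[b]] * P[l - cut - 1, s + cut + 1, tid[c]]
                if p > P[l, s, tid[a]]:
                    P[l, s, tid[a]] = p
                    back[(l, s, tid[a])] = (cut, tid[b], tid[c])

print(P[n - 1, 0, tid["S"]])  # 0.25, probability of the best parse rooted in S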
Example #7
class PCFG(object):
    def __init__(self, fname):
        self.count1 = {}
        self.count2 = {}
        self.count3 = {}
        self.rules = {}
        self.lexicon = {}
        self.nt = []
        self.oov = OOV('polyglot-fr.pkl')
        with open(fname, 'r') as f:
            self.training_corpus = f.readlines()

    def parse_corpus(self):
        '''
        Parse the sentences of a corpus to compute a probabilistic grammar in
        CNF
        '''
        for i, sentence in enumerate(self.training_corpus):
            root = self._parse_tree(sentence)
            self._update_rules(root)
        self._compute_probabilities()
        self._convert_to_cnf()

    def _parse_tree(self, sentence):
        '''
        Parse the tree in string format and convert it into a data structure
        format
        '''
        list_words = re.split(r'(\(|\))', sentence)
        level = -1
        root = Node('ROOT')
        curr_node = root
        for i in range(1, len(list_words)-2, 2): # Ignore first and last parenthesis
            word = list_words[i] + list_words[i+1]
            if i == 1:
                continue
            if word[0] == '(':
                split = word[1:].split(' ', 1)
                non_terminal = split[0].split('-', 1)[0] # Ignore hyphen
                if split[-1] == '':
                    new_node = Node(non_terminal)
                else:
                    terminal = split[1]
                    new_node = Node(non_terminal, anchor=terminal)
                curr_node.add_child(new_node)
                new_node.add_parent(curr_node)
                curr_node = new_node
            else:
                curr_node = curr_node.parents[-1]
        return root

    def _update_rules(self, root):
        '''
        Perform a BFS from the derived tree to count the rules
        '''
        queue = deque()
        queue.append(root)
        marked = [root]
        while queue:
            node = queue.popleft()
            rule = node.data + ' ->'
            for child in node.children:
                if child not in marked:
                    rule += ' ' + child.data
                    queue.append(child)
                    marked.append(child)
            if node.is_terminal:
                rule += ' \'' + node.anchor + '\''
                # Keep track of the count of ... -> anchor
                if node.anchor in self.count3:
                    self.count3[node.anchor] += 1
                else:
                    self.count3[node.anchor] = 1
            # Keep track of the count of alpha -> beta
            if rule in self.count1:
                self.count1[rule] += 1
            else:
                self.count1[rule] = 1
            # Keep track of the count of alpha -> ...
            if node.data in self.count2:
                self.count2[node.data] += 1
            else:
                self.count2[node.data] = 1

    def _compute_probabilities(self):
        '''
        Compute the probability for each rule parsed from the count computed
        from the derived tree of the corpus
        '''
        for rule in self.count1:
            split = rule.split('->')
            rhs = split[-1].strip()
            lhs = split[0].strip()
            if rhs[0] == "\'" and rhs[-1] == "\'": # if anchor
                rhs = rhs[1:-1] # remove the ''
                pair = (lhs, self.count1[rule]/self.count2[lhs])
                if rhs in self.lexicon:
                    self.lexicon[rhs].append(pair)
                else:
                    self.lexicon[rhs] = [pair]
                if lhs not in self.nt:
                    self.nt.append(lhs)
            else:
                self.rules[rule] = self.count1[rule]/self.count2[lhs]

    def _add_to_dict(self, key, value, dictionary):
        '''
        Add a pair (key, value) to dictionary
        '''
        if key in dictionary:
            if value not in dictionary[key]:
                dictionary[key].append(value)
        else:
            dictionary[key] = [value]

    def _add_to_nt_list(self, value):
        '''
        Add a value to a list
        '''
        if value not in self.nt:
            self.nt.append(value)

    def _convert_to_cnf(self):
        '''
        Transform N-ary rules (N > 2) into binary rules
        '''
        binary_rules = {}
        unary_rules = {}
        for rule in self.rules:
            proba = self.rules[rule]
            split = rule.split('->')
            lhs = split[0].strip()
            rhs = split[-1].strip()
            symbols = rhs.split(' ')
            if len(symbols) > 2: # if more than two symbols in rule
                new_symbol = lhs + '|' + "+".join(symbols[1:])
                new_rhs = symbols[0] + ' ' + new_symbol
                self._add_to_dict(lhs, (new_rhs, proba), binary_rules)
                self._add_to_nt_list(lhs)
                for i in range(1, len(symbols)-2):
                    new_lhs = new_symbol
                    new_symbol = lhs + '|' + "+".join(symbols[i+1:])
                    new_rhs = symbols[i] + ' ' + new_symbol

                    self._add_to_dict(new_lhs, (new_rhs, 1), binary_rules)
                    self._add_to_nt_list(new_lhs)

                new_rhs = symbols[-2] + ' ' + symbols[-1]
                self._add_to_dict(new_symbol, (new_rhs, 1), binary_rules)
                self._add_to_nt_list(new_symbol)

            elif len(symbols) == 2:
                self._add_to_dict(lhs, (rhs, self.rules[rule]), binary_rules)
            else:
                self._add_to_dict(lhs, (rhs, self.rules[rule]), unary_rules)
            self._add_to_nt_list(lhs)

        self.unary_rules = unary_rules
        self.binary_rules = binary_rules
        self.oov.vocabulary = list(self.lexicon.keys())

    def cky(self, original_sequence, substitute_sequence):
        '''
        Implement the Probabilistic CKY algorithm
        '''
        n = len(original_sequence)
        best = [[{} for i in range(n+1)] for j in range(n+1)]
        back = [[{} for i in range(n+1)] for j in range(n+1)]

        # Init
        for i in range(n+1):
            for j in range(n+1):
                for X in self.nt:
                    best[i][j][X] = 0

        # Handle terminal lexicon
        for i in range(1, n+1):
            substitute_word = substitute_sequence[i-1]
            original_word = original_sequence[i-1]
            for X, p in self.lexicon[substitute_word]:
                if p > best[i-1][i][X]:
                    best[i-1][i][X] = p
                    back[i-1][i][X] = original_word

            # Handle unary rules
            self._handle_unary(back, best, i-1, i)

        for l in range(2, n+1):
            for i in range(n-l+1):
                j = i + l
                for k in range(i+1, j):
                    # Handle binary rules
                    for X in self.binary_rules:
                        for rhs, p in self.binary_rules[X]:
                            Y, Z = rhs.split(' ')
                            p_prime = p * best[i][k][Y] * best[k][j][Z]
                            if p_prime > best[i][j][X]:
                                best[i][j][X] = p_prime
                                back[i][j][X] = (k, Y, Z)

                # Handle unary rules
                self._handle_unary(back, best, i, j)

        return back, best

    def _handle_unary(self, back, best, i, j):
        '''
        Auxiliary function that handles unary rules in the probabilistic CKY
        algorithm
        '''
        again = True
        while again:
            again = False
            for X in self.unary_rules:
                for rhs, p in self.unary_rules[X]:
                    Y = rhs.split(' ')[0]
                    p_prime = p * best[i][j][Y]
                    if p_prime > best[i][j][X]:
                        best[i][j][X] = p_prime
                        back[i][j][X] = Y
                        again = True


    def build_tree(self, i, j, non_terminal, back):
        '''
        Generate the tree from the backpointers computed from P-CKY algorithm
        '''
        node = back[i][j][non_terminal]
        tree = Node(non_terminal)
        if type(node) is tuple: # If binary
            k, left_non_terminal, right_non_terminal = node
            left_node = self.build_tree(i, k, left_non_terminal, back)
            right_node = self.build_tree(k, j, right_non_terminal, back)
            tree.add_child(left_node)
            tree.add_child(right_node)
            left_node.add_parent(tree)
            right_node.add_parent(tree)
            return tree
        else: # If unary
            if node in self.nt: # If not anchor
                left_node = self.build_tree(i, j, node, back)
                tree.add_child(left_node)
                left_node.add_parent(tree)
                return tree
            else: # If anchor
                return Node(non_terminal, node)

    def generate_tree(self, sequence):
        '''
        Generate a parsed tree corresponding to the most likely derivation
        for the sequence given as input
        '''
        original_sequence = sequence.split()
        substitute_sequence = self.oov.process(original_sequence)
        back, best = self.cky(original_sequence, substitute_sequence)
        if "ROOT" not in back[0][len(original_sequence)]: # Not parsable
            return None
        tree = self.build_tree(0, len(original_sequence), 'ROOT', back)
        tree.un_cnf()
        return tree

    def compute_accuracy(self, gt_tree, predicted_tree):
        '''
        Compute the percentage of tokens for which the parser chooses
        the correct part-of-speech
        '''
        map_token_pos_gt = {}
        gt_tree.extract_leaves(map_token_pos_gt)
        map_token_pos_pred = {}
        predicted_tree.extract_leaves(map_token_pos_pred)
        acc = 0
        for token in map_token_pos_gt.keys():
            gt_pos = map_token_pos_gt[token]
            pred_pos = map_token_pos_pred[token]
            if gt_pos == pred_pos:
                acc += 1
        return acc/len(map_token_pos_gt.keys())

    def evaluate(self, input_file):
        '''
        Compute the average accuracy by comparing the trees predicted by CKY
        with the ground-truth trees of the last 10% of the corpus
        '''
        mean_accuracy = 0
        with open(input_file, 'r') as f:
            testing_sentences = f.readlines()
        for idx, test_sentence in enumerate(tqdm(testing_sentences)):
            gt_tree = self._parse_tree(test_sentence)
            raw_tokens = gt_tree.compute_raw_tokens()
            predicted_tree = self.generate_tree(raw_tokens)
            if predicted_tree is not None:
                mean_accuracy += self.compute_accuracy(gt_tree, predicted_tree)
            else: # sentence not parsable
                print("Sentence number %d: \"%s\" not parsable" \
                       %(idx, raw_tokens))
                mean_accuracy += 0
        mean_accuracy = mean_accuracy / len(testing_sentences)
        print("Final average accuracy:", mean_accuracy)

    def predict(self, input_file, output_file):
        '''
        Predict parse trees from input file with raw tokens and write them in
        an output file
        '''
        with open(input_file, 'r') as f:
            testing_sentences = f.readlines()
        pred_file = open(output_file, "w")
        for idx, test_sentence in enumerate(tqdm(testing_sentences)):
            predicted_tree = self.generate_tree(test_sentence)
            if predicted_tree is not None:
                pred_file.write(predicted_tree.to_string() + "\n")
            else: # sentence not parsable
                print("Sentence number %d: \"%s\" not parsable" \
                       %(idx, test_sentence))
                pred_file.write("( )\n")
        pred_file.close()
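As a side note, the rewriting performed by _convert_to_cnf on rules with more than two right-hand symbols can be sketched as a stand-alone function; the artificial symbols follow the same 'lhs|sym+sym' naming used above, but the example rule is invented:

def binarize(lhs, symbols, proba):
    # Split an n-ary rule lhs -> s1 s2 ... sn (n > 2) into binary rules,
    # introducing artificial symbols named lhs|si+...+sn; the original
    # probability is kept on the first rule only, the others get probability 1.
    rules = []
    current_lhs, current_proba = lhs, proba
    while len(symbols) > 2:
        new_symbol = lhs + '|' + '+'.join(symbols[1:])
        rules.append((current_lhs, (symbols[0], new_symbol), current_proba))
        current_lhs, current_proba = new_symbol, 1
        symbols = symbols[1:]
    rules.append((current_lhs, tuple(symbols), current_proba))
    return rules

for rule in binarize('S', ['NP', 'V', 'NP', 'PP'], 0.2):
    print(rule)
# ('S', ('NP', 'S|V+NP+PP'), 0.2)
# ('S|V+NP+PP', ('V', 'S|NP+PP'), 1)
# ('S|NP+PP', ('NP', 'PP'), 1)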
Example #8
class PCFG:
    def __init__(self):
        self.oov = OOV()
        self.train = []
        self.test = []
        self.poses = set()
        # ex// Un: 56
        self.tokens = defaultdict(int)
        # ex// (A, B): 41
        self.count_grammar = defaultdict(int)
        # ex// (A, a): 11
        self.count_lexicon = defaultdict(int)
        # ex// Un: [N, NP]
        self.token_to_pos = defaultdict(set)
        # ex// (B, C) : [A1, A2, A3]
        self.right_to_pos = defaultdict(set)
        # ex// A: 22
        self.preterminals_pos = defaultdict(int)
        # ex// (N, Un): 0.23
        self.prob_pos_to_token = defaultdict(int)
        # ex// (A, (B, C)): 56
        self.count_left_to_right = defaultdict(int)
        # ex// (A, (B, C)): 0.11
        self.prob_left_to_right = defaultdict(int)

    def from_path(self, path):
        """load and create dataset from a treebank in path"""
        with open(join(dirname(realpath(__file__)), path), 'r') as f:
            dataset = f.read().splitlines()
        np.random.shuffle(dataset)
        sep1, sep2 = int(len(dataset) * 0.8), int(len(dataset) * 0.9)
        self.train, self.test = dataset[:sep2], dataset[sep2:]

    def count_occurences(self):
        """count occurences for the different grammar rules, and compute probabilities"""
        for line in self.train:
            new_tree = Tree()
            new_tree.fit(line)
            for pos, _dict in new_tree.count_rules.items():
                self.poses.add(pos)
                for left, count in _dict.items():
                    self.count_grammar[(pos, left)] += count
                    self.right_to_pos[left].add(pos)
            for pos, _dict in new_tree.count_lexicon.items():
                for token, count in _dict.items():
                    self.count_lexicon[(pos, token[0])] += count
                    self.tokens[token[0]] += 1
                    self.token_to_pos[token[0]].add(pos)
        # compute proba for A --> token
        for (pos, token), count in self.count_lexicon.items():
            self.preterminals_pos[pos] += count
        for (pos, token), count in self.count_lexicon.items():
            self.prob_pos_to_token[(
                pos, token)] = count / self.preterminals_pos[pos]
        # compute proba for A --> BC
        for (pos, _), count in self.count_grammar.items():
            self.count_left_to_right[pos] += count
        for (pos, right_side), count in self.count_grammar.items():
            self.prob_left_to_right[(
                pos, right_side)] = count / self.count_left_to_right[pos]

    def fit(self):
        """Compute grammar probabilities"""
        self.count_occurences()
        self.proba_grammar = {
            **self.prob_pos_to_token,
            **self.prob_left_to_right
        }
        self.non_terminal = set([x[0] for x in self.proba_grammar.keys()])
        self.pos_2_ind = {pos: i for i, pos in enumerate(self.non_terminal)}
        self.ind_2_pos = {v: k for k, v in self.pos_2_ind.items()}

    def pcky(self, tokens):
        """Probabilistic CYK algorithm"""
        since = time()

        # normalize input: OOV module
        words = self.normalize_tokens(tokens)

        N = len(words)
        V = len(self.non_terminal)

        table = np.zeros((N + 1, N + 1, V))
        back = np.zeros((N + 1, N + 1, V), dtype=tuple)
        for j in range(1, N + 1):
            for A in self.token_to_pos[words[j - 1]]:
                table[j - 1, j,
                      self.pos_2_ind[A]] = self.proba_grammar[(A,
                                                               words[j - 1])]

            for i in range(j - 2, -1, -1):
                for k in range(i + 1, j):
                    ind_B = np.nonzero(table[i, k, :] > 0)[0]
                    B_list = [self.ind_2_pos[x] for x in ind_B]
                    ind_C = np.nonzero(table[k, j, :] > 0)[0]
                    C_list = [self.ind_2_pos[x] for x in ind_C]
                    prod = product(B_list, C_list)
                    for BC in prod:
                        for A in self.right_to_pos[BC]:
                            indA, indB, indC = self.pos_2_ind[
                                A], self.pos_2_ind[BC[0]], self.pos_2_ind[
                                    BC[1]]
                            value = (self.proba_grammar[(A, BC)]) * (
                                table[i, k, indB]) * (table[k, j, indC])
                            if (table[i, j, indA]) < value:
                                table[i, j, indA] = value
                                back[i, j, indA] = (k, *BC)

        print("Took {}s".format(int(time() - since)))
        if not back[0, N, self.pos_2_ind["SENT"]]:
            return None
        tree = self.build_tree(tokens, back, 0, N, "SENT")
        return " ".join(self.debinarize(tree.split()))

    def build_tree(self, words, back, i, j, pos):
        """Transform the output of CYK to the form of a parsed sentence with parentheses"""
        n = j - i
        if n == 1:
            return " ( " + pos + " " + words[i] + " ) "
        else:
            k, B, C = back[i, j, self.pos_2_ind[pos]]
            return "( " + pos + " " + self.build_tree(
                words, back, i, k, B) + " " + self.build_tree(
                    words, back, k, j, C) + ") "

    def debinarize(self, s):
        """Reverse Chomsky binarisation"""
        for i, x in enumerate(s):
            if "$" in x and s[i - 1] == "(":
                c = 1
                for j, y in enumerate(s[i + 1:]):
                    if y == '(':
                        c += 1
                    elif y == ")":
                        c -= 1
                    if c == 0:
                        return self.debinarize(s[:i - 1] + s[i + 1:i + 1 + j] +
                                               s[i + 1 + j + 1:])
        return s

    def predict(self, line):
        """predict the parser of line from training dataset"""
        new = self.line_to_tokens(line)
        return self.pcky(new)

    def line_to_tokens(self, line):
        """transform a line from dataset to a list of tokens"""
        tokenized = line.replace("(", " ( ").replace(")", " ) ").split()[1:-1]
        remove = False
        new = []
        for i, x in enumerate(tokenized):
            if tokenized[i] == "(" and tokenized[i + 1] != "(":
                remove = True
            elif tokenized[i] == "(":
                new.append(x)
            else:
                if not remove:
                    new.append(x)
                else:
                    remove = False
        new = list(filter(lambda x: x not in [')', '('], new))
        return new

    def prepare_line_for_prediction(self, line):
        """Tokenize a line from dataset"""
        tokenized = line.replace("(", " ( ").replace(")", " ) ").split()[1:-1]
        new = []
        for i, x in enumerate(tokenized):
            if "-" in x and tokenized[i - 1] == "(":
                new.append(x.split("-")[0])
            else:
                new.append(x)
        return " ".join(new)

    def normalize_word(self, word):
        """OOV module, 1st: compute levenshtein_distance, if not, return closest word using cosinus similarity"""
        if word in self.tokens.keys():
            return word
        lv_distances = defaultdict(list)
        for token in self.tokens.keys():
            distance = levenshtein_distance(word, token)
            for i in range(1, 3):
                if distance == i:
                    lv_distances[i].append(token)
                    break
        for i in range(1, 3):
            if lv_distances[i]:
                return lv_distances[i][0]

        return self.oov.closest_to_tokens(word, self.tokens.keys())

    def normalize_tokens(self, tokens):
        """apply self.normalize_word to a list of tokens"""
        return [self.normalize_word(token) for token in tokens]
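normalize_word above relies on a levenshtein_distance helper that is not shown; for reference, a standard dynamic-programming edit distance (which that helper presumably resembles) looks like this:

def levenshtein_distance(a, b):
    # Classic two-row DP: insertions, deletions and substitutions all cost 1.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            cost = 0 if ca == cb else 1
            curr.append(min(prev[j] + 1,          # deletion
                            curr[j - 1] + 1,      # insertion
                            prev[j - 1] + cost))  # substitution
        prev = curr
    return prev[-1]

print(levenshtein_distance("maison", "raison"))  # 1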
Example #9
class CYK_Parser():

    # My parser, based on the probabilistic CYK algorithm

    def __init__(self, corpus_train):

        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_symbols,
                       self.PCFG.freq_tokens)

        #note : if the id of a symbol is above self.PCFG.nb_tags,
        #it's an artificial symbol introduced with Chomsky normalization
        self.symbol_to_id = {
            symbol: i
            for (i, symbol) in enumerate(self.PCFG.list_all_symbols)
        }

        #instead of storing tags, we store grammar rules with their corresponding indices in grammar_ids,
        #with an additional hierarchical level to speed up lookups:
        #in other words, self.grammar_ids[X][Y][Z] stores P(rule X->YZ),
        #where self.grammar_ids, self.grammar_ids[X], and self.grammar_ids[X][Y] are all dictionaries
        self.grammar_ids = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left hand symbol of the grammar rule
            idx_root_tag = self.symbol_to_id[root_tag]
            self.grammar_ids[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items(
            ):  #split is the right hand term, and proba the probability of the rule
                idx_left_tag = self.symbol_to_id[split[0]]
                idx_right_tag = self.symbol_to_id[split[1]]
                if idx_left_tag in dico.keys():
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_ids[idx_root_tag] = dico

        #for a given word, what are its tags and the corresponding probabilities P(tag -> word)?
        #this is what self.lexicon_inverted stores
        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]

    def compute_CYK_tables(self, sentence, viz_oov=False):
        # compute CYK tables :
        # - look for the probabilities of the most likely trees parsing the substrings of sentence, for increasing substring length (from 1 to the length of the sentence)
        # - store each time the position of the cut and the rule (right-hand term) that yield the most likely parsing tree for a given root tag

        nb_words = len(sentence)

        max_proba_derivation = np.zeros(
            (nb_words, nb_words, self.PCFG.nb_all_symbols))
        # max_proba_derivation[s,l,a] is the maximum probability of
        # a parsing where symbol a derives substring x_s...x_(s+l)

        split_reaching_max = np.zeros(
            (nb_words, nb_words, self.PCFG.nb_all_symbols, 3))
        # split_reaching_max[s,l,a,0] stores index cut
        # split_reaching_max[s,l,a,1] stores symbol b
        # split_reaching_max[s,l,a,2] stores symbol c
        # such that

        # (i) b derives x_s...x_(s+cut), c derives x_(s+cut+1)...x_(s+l),
        # and a rewrites as bc (a->bc is in the grammar)

        # (ii) the split <cut,b,c> defined by (i) is the one reaching
        # the maximum probability for a to derive x_s...x_(s+l)
        # (i.e. reaching max_proba_derivation[s,l,a])

        # probabilities of tags for unary strings (words)
        for (position_word, word) in enumerate(sentence):

            token_to_tag = word

            if not (word in self.OOV.words_lexicon):
                if viz_oov: print(word + " is an OOV")
                token_to_tag = self.OOV.closest_in_corpus(word,
                                                          viz_closest=viz_oov)
                if viz_oov:
                    if token_to_tag is None:
                        print("No closest token found")
                        print("")
                    else:
                        print("Closest token found : " + token_to_tag)
                        print("")

            if token_to_tag is None:
                for (tag, counts) in self.PCFG.freq_terminal_tags.items():
                    if tag in self.symbol_to_id:  # skip tags appearing in the lexicon but not in the grammar rules
                        id_tag = self.symbol_to_id[tag]
                        max_proba_derivation[position_word, 0, id_tag] = counts
            else:
                for (tag,
                     proba) in self.lexicon_inverted[token_to_tag].items():
                    if tag in self.symbol_to_id:  #skip tags appearing in the lexicon but not in the grammar rules
                        id_tag = self.symbol_to_id[tag]
                        max_proba_derivation[position_word, 0, id_tag] = proba

        for l in range(1, nb_words):
            # we will consider symbols deriving strings of length l+1...

            for s in range(nb_words - l):
                # ... and starting at index s of the sentence

                for idx_root_tag in self.grammar_ids:
                    # ... root_tag is the symbol deriving the considered string (rule left-hand term)

                    for cut in range(0, l):
                        # ... such a symbol can be rewritten as two symbols A B,
                        # with A deriving the substring up to index cut (included) and B deriving the substring from index cut+1

                        for idx_left_tag in self.grammar_ids[
                                idx_root_tag]:  #left symbol A

                            proba_left_derivation = max_proba_derivation[
                                s, cut, idx_left_tag]

                            if proba_left_derivation > max_proba_derivation[
                                    s, l, idx_root_tag]:

                                for (idx_right_tag,
                                     proba_split) in self.grammar_ids[
                                         idx_root_tag][idx_left_tag].items(
                                         ):  #right symbol B

                                    proba_right_derivation = max_proba_derivation[
                                        s + cut + 1, l - cut - 1,
                                        idx_right_tag]

                                    proba_decomposition = proba_split * proba_left_derivation * proba_right_derivation

                                    if proba_decomposition > max_proba_derivation[
                                            s, l, idx_root_tag]:
                                        # we found a new decomposition <cut,split[0],split[1]>
                                        # reaching a higher probability for root_tag to derive substring x_s...x_(s+l)

                                        max_proba_derivation[
                                            s, l,
                                            idx_root_tag] = proba_decomposition
                                        split_reaching_max[s, l, idx_root_tag,
                                                           0] = cut
                                        split_reaching_max[s, l, idx_root_tag,
                                                           1] = idx_left_tag
                                        split_reaching_max[s, l, idx_root_tag,
                                                           2] = idx_right_tag

        self.max_proba_derivation = max_proba_derivation
        self.split_reaching_max = split_reaching_max.astype(int)

    def parse_substring(self, s, l, idx_root_tag, sentence):
        # parse substring beginning at index s of sentence, of length l+1, and tagged as idx_root_tag

        if l == 0:
            return sentence[s]

        else:  # split that reaches max_proba_derivation[s,l,idx_root_tag]
            cut = self.split_reaching_max[s, l, idx_root_tag, 0]
            idx_left_tag = self.split_reaching_max[s, l, idx_root_tag, 1]
            idx_right_tag = self.split_reaching_max[s, l, idx_root_tag, 2]

            left_tag = self.PCFG.list_all_symbols[idx_left_tag]
            right_tag = self.PCFG.list_all_symbols[idx_right_tag]

            return [[
                left_tag,
                self.parse_substring(s, cut, idx_left_tag, sentence)
            ],
                    [
                        right_tag,
                        self.parse_substring(s + cut + 1, l - cut - 1,
                                             idx_right_tag, sentence)
                    ]]

    def remove_artificial_symbols(self, T):
        #remove artificial symbols from T, the tree structure encoding the parsing of the sentence

        #debinarize : remove artificial symbols of type X|X1X2X3.. (coming from BIN rule)
        #attach the children of an artificial symbol to its own parent
        nodes = deepcopy(T.nodes)
        for node in nodes:
            children = list(T.successors(node))
            if len(children) == 0: pass
            elif len(children) == 1 and len(list(T.successors(
                    children[0]))) == 0:
                pass
            else:
                father = list(T.predecessors(node))
                if len(father) == 0: pass
                else:
                    symbol = T.nodes[node]["name"]
                    if (self.symbol_to_id[symbol] >= self.PCFG.nb_tags) and (
                            "|" in symbol):  # artificial symbol from BIN rule
                        for child in T.successors(node):
                            T.add_edge(father[0], child)
                        T.remove_node(node)

        #add pre_terminal symbols : remove artificial symbols of type A&B (coming from UNIT rule)
        #decomposing A&B into two symbols A and B (A parent of B, B parent of the word)
        max_id_node = np.max(T.nodes())
        nodes = deepcopy(T.nodes)
        for node in nodes:
            children = list(T.successors(node))
            if len(children) == 0 or len(list(T.predecessors(node))) == 0: pass
            elif len(children) == 1 and len(list(T.successors(
                    children[0]))) == 0:
                symbol = T.nodes[node]["name"]

                if (self.symbol_to_id[symbol] >= self.PCFG.nb_tags) and (
                        "&" in symbol):  # artificial symbol from UNIT rule
                    word = children[0]

                    idx_cut = None
                    for (idx, c) in enumerate(symbol):
                        if c == "&":
                            idx_cut = idx

                    T.nodes[node]["name"] = symbol[:idx_cut]

                    idx_pre_terminal_node = max_id_node + 1
                    T.add_node(idx_pre_terminal_node,
                               name=symbol[idx_cut + 1:])
                    max_id_node += 1

                    T.remove_edge(node, word)
                    T.add_edge(node, idx_pre_terminal_node)
                    T.add_edge(idx_pre_terminal_node, word)

    def reformat_parsing(self, parsing):
        # converting parsing stored as nested lists into the required format (with nested brackets)

        if type(parsing) == str:
            return parsing

        else:
            string = ""
            for el in parsing:
                root_tag = el[0]
                parsing_substring = el[1]
                string = string + "(" + root_tag + " " + self.reformat_parsing(
                    parsing_substring) + ")" + " "
            string = string[:-1]
            return string

    def parse(self, sentence, remove_artificial_symbols=True, viz_oov=False):
        # parse sentence
        # remove_artificial_symbols : if False, keep Chomsky artificial symbols
        # viz_oov : if True, plot management of oov words
        sentence = sentence.split()

        nb_words = len(sentence)

        if nb_words > 1:
            self.compute_CYK_tables(sentence, viz_oov=viz_oov)
            idx_root_tag = self.symbol_to_id["SENT"]
            if self.max_proba_derivation[0][
                    nb_words - 1][idx_root_tag] == 0:  #no valid parsing
                return None
            parsing_list = self.parse_substring(0, nb_words - 1, idx_root_tag,
                                                sentence)

        else:
            word = sentence[0]
            token_to_tag = self.OOV.closest_in_corpus(word,
                                                      viz_closest=viz_oov)
            if token_to_tag is None:
                tag = max(self.PCFG.freq_terminal_tags,
                          key=self.PCFG.freq_terminal_tags.get)
            else:
                tag = max(self.lexicon_inverted[token_to_tag],
                          key=self.lexicon_inverted[token_to_tag].get)
            parsing_list = "(" + tag + " " + word + ")"

        if remove_artificial_symbols:
            #converting the parsing stored as a string into a tree
            T = postagged_sent_to_tree(
                "( (SENT " + self.reformat_parsing(parsing_list) + "))",
                remove_after_hyphen=False)
            #nx.draw(T, labels=nx.get_node_attributes(T, "name"), arrows=False, pos=graphviz_layout(T, prog='dot'))
            self.remove_artificial_symbols(T)
            return tree_to_postagged_sent(T)  #return parsing as a string

        else:
            return "( (SENT " + self.reformat_parsing(
                parsing_list) + "))"  #return parsing as a string
Example #10
class CYK:
    """Class for applying the CYK algorithm"""

    def __init__(self, corpus_train):

        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_tags, self.PCFG.freq_tokens)

        self.tag_to_id = {tag: i for (i, tag) in enumerate(self.PCFG.list_all_tags)}

        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]

        # self.grammar_dicts[X][Y][Z] stores P(rule X->YZ)
        self.grammar_dicts = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left hand tag of the grammar rule
            idx_root_tag = self.tag_to_id[root_tag]
            self.grammar_dicts[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items():  # split is the right hand term, and proba the probability of the rule
                idx_left_tag = self.tag_to_id[split[0]]
                idx_right_tag = self.tag_to_id[split[1]]
                if idx_left_tag in dico.keys():
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_dicts[idx_root_tag] = dico

    def list_to_sentence(self, parsing):
        """Go from list to string representation"""

        if type(parsing) == str:
            return parsing

        else:
            string = ""
            for p in parsing:
                root_tag = p[0]
                parsing_substring = p[1]
                string = string + "(" + root_tag + " " + self.list_to_sentence(parsing_substring) + ")" + " "
            string = string[:-1]  # Remove the extra space
            return string

    def parse_substring(self, s, l, idx_root_tag, sentence):
        """Parse part of a sentence into a list"""

        if l == 0:
            return sentence[s]

        else:  # split that reaches prob_matrix[s, l, idx_root_tag]
            cut = self.cyk_matrix[s, l, idx_root_tag, 0]
            idx_left_tag = self.cyk_matrix[s, l, idx_root_tag, 1]
            idx_right_tag = self.cyk_matrix[s, l, idx_root_tag, 2]

            left_tag = self.PCFG.list_all_tags[idx_left_tag]
            right_tag = self.PCFG.list_all_tags[idx_right_tag]

            return [[left_tag, self.parse_substring(s, cut, idx_left_tag, sentence)],
                    [right_tag, self.parse_substring(s + cut + 1, l - cut - 1, idx_right_tag, sentence)]]

    def clean_tags(self, tree):
        """Remove artificial tags and de-telescope tags"""

        # remove artificial tag of type X|X1X2X3.. (coming from BIN rule)
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                pass
            else:
                father = list(tree.predecessors(node))
                if len(father) == 0:
                    pass
                else:
                    tag = tree.nodes[node]["name"]
                    if (self.tag_to_id[tag] >= self.PCFG.nb_tags) and (
                            "|" in tag):  # artificial tag from BIN rule
                        for child in tree.successors(node):
                            tree.add_edge(father[0], child)
                        tree.remove_node(node)

        # decomposing (A&B w) into (A (B w))
        max_id_node = np.max(tree.nodes())
        nodes = deepcopy(tree.nodes)
        for node in nodes:
            children = list(tree.successors(node))
            if len(children) == 0 or len(list(tree.predecessors(node))) == 0:
                pass
            elif len(children) == 1 and len(list(tree.successors(children[0]))) == 0:
                tag = tree.nodes[node]["name"]

                if (self.tag_to_id[tag] >= self.PCFG.nb_tags) and (
                        "&" in tag):  # artificial tag from UNIT rule
                    word = children[0]

                    idx_cut = None
                    for (idx, c) in enumerate(tag):
                        if c == "&":
                            idx_cut = idx

                    tree.nodes[node]["name"] = tag[:idx_cut]

                    idx_pre_terminal_node = max_id_node + 1
                    tree.add_node(idx_pre_terminal_node, name=tag[idx_cut + 1:])
                    max_id_node += 1

                    tree.remove_edge(node, word)
                    tree.add_edge(node, idx_pre_terminal_node)
                    tree.add_edge(idx_pre_terminal_node, word)

    def compute_CYK(self, sentence, viz_oov=False):
        """Apply the CYK algorithm (heavily influenced by https://en.wikipedia.org/wiki/CYK_algorithm)"""

        n = len(sentence)
        prob_matrix = np.zeros((n, n, self.PCFG.nb_all_tags))
        cyk_matrix = np.zeros((n, n, self.PCFG.nb_all_tags, 3))

        # probabilities of tags for unary rule
        for (position_word, word) in enumerate(sentence):

            token_to_tag = word

            if not (word in self.OOV.words_lexicon):
                token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)

            if token_to_tag is None:
                for (tag, counts) in self.PCFG.freq_terminal_tags.items():
                    if tag in self.tag_to_id:
                        id_tag = self.tag_to_id[tag]
                        prob_matrix[position_word, 0, id_tag] = counts
            else:
                for (tag, proba) in self.lexicon_inverted[token_to_tag].items():
                    if tag in self.tag_to_id:
                        id_tag = self.tag_to_id[tag]
                        prob_matrix[position_word, 0, id_tag] = proba

        for l in range(1, n):
            for s in range(n - l):
                for idx_root_tag in self.grammar_dicts:
                    for cut in range(0, l):
                        for idx_left_tag in self.grammar_dicts[idx_root_tag]:
                            proba_left_derivation = prob_matrix[s, cut, idx_left_tag]
                            if proba_left_derivation > prob_matrix[s, l, idx_root_tag]:  # skip useless iterations

                                for (idx_right_tag, proba_split) in self.grammar_dicts[idx_root_tag][
                                    idx_left_tag].items():

                                    proba_right_derivation = prob_matrix[s + cut + 1, l - cut - 1, idx_right_tag]
                                    proba_decomposition = proba_split * proba_left_derivation * proba_right_derivation

                                    if proba_decomposition > prob_matrix[s, l, idx_root_tag]:
                                        prob_matrix[s, l, idx_root_tag] = proba_decomposition
                                        cyk_matrix[s, l, idx_root_tag] = [cut, idx_left_tag, idx_right_tag]

        self.prob_matrix = prob_matrix
        self.cyk_matrix = cyk_matrix.astype(int)

    def parse(self, sentence, viz_oov=False):
        """Returns a parsed and tagged sentence from a natural sentence"""
        sentence = sentence.split()

        nb_words = len(sentence)

        if nb_words > 1:
            self.compute_CYK(sentence, viz_oov=viz_oov)
            idx_root_tag = self.tag_to_id["SENT"]
            if self.prob_matrix[0][nb_words - 1][idx_root_tag] == 0:  # no valid parsing
                return None
            parsing_list = self.parse_substring(0, nb_words - 1, idx_root_tag, sentence)

        else:
            word = sentence[0]
            token_to_tag = self.OOV.closest_in_corpus(word, viz_closest=viz_oov)
            if token_to_tag is None:
                tag = max(self.PCFG.freq_terminal_tags, key=self.PCFG.freq_terminal_tags.get)
            else:
                tag = max(self.lexicon_inverted[token_to_tag], key=self.lexicon_inverted[token_to_tag].get)
            parsing_list = "(" + tag + " " + word + ")"

        # converting the parsing stored as a string into a tree
        tree = tagged_sent_to_tree("( (SENT " + self.list_to_sentence(parsing_list) + "))",
                                   remove_after_hyphen=False)
        self.clean_tags(tree)
        return tree_to_sentence(tree)
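For reference, the de-telescoping branch of clean_tags can be reproduced on a tiny tree; the calls above suggest the tree is a networkx DiGraph with a "name" attribute per node, so that is assumed here, and the node ids and tag names are invented (only the "A&B" case is shown):

import networkx as nx

# Tiny tree: SENT -> NP&DET -> "le"  (NP&DET plays the role of an artificial tag from the UNIT rule)
tree = nx.DiGraph()
tree.add_node(0, name="SENT")
tree.add_node(1, name="NP&DET")
tree.add_node(2, name="le")
tree.add_edge(0, 1)
tree.add_edge(1, 2)

node, word = 1, 2
tag = tree.nodes[node]["name"]
if "&" in tag:
    idx_cut = tag.rindex("&")                        # split on the last '&'
    tree.nodes[node]["name"] = tag[:idx_cut]         # node 1 becomes NP
    new_node = max(tree.nodes()) + 1
    tree.add_node(new_node, name=tag[idx_cut + 1:])  # new pre-terminal node named DET
    tree.remove_edge(node, word)
    tree.add_edge(node, new_node)
    tree.add_edge(new_node, word)

print([(n, tree.nodes[n]["name"]) for n in tree.nodes])
# [(0, 'SENT'), (1, 'NP'), (2, 'le'), (3, 'DET')]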
Example #11
    trees = []  # trees collected from the training file
    with open(trainfilename, 'r') as f:
        for line in f:
            trees.append(nltk.Tree.fromstring(line))

    # preprocess the trees: ignore functional labels and binarize to CNF
    for tree in trees:
        # ignore_func_labels(tree)
        tree.chomsky_normal_form(horzMarkov=2)
        # tree.chomsky_normal_form()

    # learn PCFG
    lexicon, grammar, vocabulary, symbols = PCFG(trees)
    # print(grammar)

    # for OOV
    oovwords = OOV(embedfilename, vocabulary)

    # parse new sentences using CYK based on learned PCFG
    # parser = CYKSolver(lexicon, grammar, vocabulary, symbols, oovwords)

    # i = 0
    for line in sys.stdin:
        # print('start parse')
        # print(line)
        # start = time.time()
        # if line == '\n': continue
        # cyksolver = CYK(line.split(), lexicon, grammar, vocabulary, symbols, embedfilename)
        # i += 1
        # if i < 20: continue
        # if i > 3: break
        # parsedtree = parser.compute(line.split())
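If nltk is available, the preprocessing step above can be checked on a toy tree; chomsky_normal_form(horzMarkov=2) binarizes the tree in place (the bracketing below is invented, and the exact artificial labels depend on nltk's naming scheme):

import nltk

tree = nltk.Tree.fromstring("(SENT (NP (DET le) (ADJ petit) (NC chat)) (VP (V dort)))")
tree.chomsky_normal_form(horzMarkov=2)
print(tree)
# the ternary NP is now binarized using artificial labels of the form NP|<...>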