Example #1
    def __init__(self, parse_file):
        counter = Counts()
        for l in open(parse_file):
            tree = json.loads(l)
            counter.count(tree)

        # N holds the non-terminal counts: N[symbol] = count
        self.N = counter.nonterm
        # binary_R holds the binary-rule counts: binary_R[symbol, y1, y2] = count
        self.binary_R = counter.binary
        # unary_R holds the unary-rule counts: unary_R[symbol, word] = count
        self.unary_R = counter.unary
        # V is the vocabulary; it contains 245 words
        self.V = counter.vocabulary  # 245 words
        # pi is the CKY chart: pi[i, i, x] = 0.03 means the word at index i
        # is assigned the non-terminal x with probability 0.03
        self.pi = {}
        # bp stands for 'back pointer': it records the best rule and best
        # split for each chart cell, e.g. bp[i, i, x] = [(x, word), i]
        self.bp = {}
        # binary_table groups the binary rules derived from one symbol:
        # binary_table['S'] = [('NP', 'VP'), ('NP', 'VP+VERB'), ...]
        self.binary_table = defaultdict(set)
        self.initialize_binary_table()
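
initialize_binary_table() is referenced above but not shown in this excerpt. A minimal sketch of what it presumably does, based on the binary_table comment (an assumption, not the author's code):

    def initialize_binary_table(self):
        # group each observed binary rule's right-hand side under its parent symbol
        for (symbol, y1, y2) in self.binary_R:
            self.binary_table[symbol].add((y1, y2))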
Example #2
File: HW5.py Project: sulfamide/cs4705
 def __init__(self):
     self.binary_q = {}
     self.unary_q = {}
     self.counter = Counts()
     self.pi = []
     self.bp = []
     self.parser = {}
     self.use_vert = False
Example #3
File: HW6.py Project: sulfamide/cs4705
 def __init__(self):
     self.binary_q = {}
     self.unary_q = {}
     self.counter = Counts()
     self.counter_n = Counts()
     self.pi = []
     self.bp = []
     self.parser = {}
     self.use_vert = False
     # 0.95 is the current best value found for beta so far
     self.beta = 0
     self.vm = r"\^\<[A-Z]+[+A-Z]*\>"  # raw string so the regex escapes survive
     self.vocab = {}
class PCFG:
    def __init__(self, rare_count_threshold=5):
        self.rare_count_threshold = rare_count_threshold
        self.cfg_counter = Counts()
        self.word_freq = defaultdict(int)

    # The returned tree is in Penn Treebank format, not in Chomsky Normal Form
    def create_parse_tree(self, elem_root_subtree):
        if 'null' in elem_root_subtree.attrib:
            return None
        subtree_array = [elem_root_subtree.attrib['cat']]

        if elem_root_subtree.tag == 'cons':
            for child in elem_root_subtree:
                child_subtree_array = self.create_parse_tree(child)
                if child_subtree_array is not None:
                    subtree_array.append(child_subtree_array)
        elif elem_root_subtree.tag == 'tok':
            subtree_array.append(elem_root_subtree.text)
        else:
            # TBD: print error message
            pass

        return subtree_array
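    # Illustrative output on a hypothetical fragment (not from the corpus):
    # the returned tree is a nested list in Penn Treebank style, e.g.
    #   ["S", ["NP", ["DET", "There"]], ["VP", ["VERB", "is"]]]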

    # right-factored
    # TBD: Provide option for left-factored too
    # TBD: Handle null as mentioned in section 6.3 Null Elements in GENIA corpus manual
    def convert_PTB_to_CNF(self, tree_array):
        if len(tree_array) > 3:
            # convert multi-child tree into tree with two children
            subtree_array = [tree_array[0]]
            for i in range(2, len(tree_array)):
                subtree_array.append(tree_array[i])
            tree_array[2] = subtree_array
            # tree_array[2] = [tree_array[0], tree_array[2], tree_array[3:]]
            del tree_array[3:]
            self.convert_PTB_to_CNF(tree_array)
        elif len(tree_array) == 3:
            # root of both the children should be non-terminals
            assert (type(tree_array[1]) is list), "expected list for left child: {0}".format(tree_array[1])
            assert (type(tree_array[2]) is list), "expected list for right child: {0}".format(tree_array[2])
            # convert the left child into CNF if it is not already in that form
            self.convert_PTB_to_CNF(tree_array[1])
            # convert the right child into CNF if it is not already in that form
            self.convert_PTB_to_CNF(tree_array[2])
        elif (len(tree_array) == 2) and (type(tree_array[1]) is list):
            # Form: X->Y where X,Y are non-terminals
            tree_array[0] = tree_array[0] + '+' + tree_array[1][0]
            if type(tree_array[1][1]) is list:
                subtree_array = tree_array[1][1:]
                del tree_array[1:]
                for subtree in subtree_array:
                    tree_array.append(subtree)
                self.convert_PTB_to_CNF(tree_array)
            else:
                # e.g. [NP [DET There]]
                tree_array[1] = tree_array[1][1]
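
    # Worked example (hypothetical input, not from the corpus):
    #   ["NP", ["DET", "the"], ["ADJ", "small"], ["NOUN", "cat"]]
    # becomes, after convert_PTB_to_CNF,
    #   ["NP", ["DET", "the"], ["NP", ["ADJ", "small"], ["NOUN", "cat"]]]
    # while a unary chain like ["NP", ["DET", "There"]] collapses to
    #   ["NP+DET", "There"]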

    # Extract parse tree from the xml files. These parse trees are in Penn TreeBank format.
    # Randomly assign xml files into training and validation set.
    # Write the parse trees after converting them into Chomsky Normal Form
    def create_train_and_test_parse_tree(self, treebank_folder, train_key_file, test_file, test_key_file, n_train_file):
        file_list = glob.glob(treebank_folder+'/'+'*.xml')
        # randomly select n_train_file for training and rest for testing
        train_index_list = random.sample(range(len(file_list)), n_train_file)

        # write the train file
        count_parse_tree = 0
        with open(train_key_file, 'w') as fd:
            for file_i in train_index_list:
                train_file = file_list[file_i]
                try:
                    count_parse_tree_in_xml = self.write_parse_tree(fd, train_file)
                    count_parse_tree += count_parse_tree_in_xml
                    print('train file: {0} :: parse tree count till now: {1}'.format(train_file, count_parse_tree))
                except Exception:
                    err = sys.exc_info()[0]
                    print('Error in {0}: {1}'.format(train_file, err))

        # test files are actually more of validation set
        # write the test parse tree file
        failure_parse_file_list = []
        with open(test_key_file, 'w') as fd:
            # TBD: speed this up by turning train_index_list into a set for O(1) membership tests
            for file_i in range(len(file_list)):
                if file_i not in train_index_list:
                    test_xml_file = file_list[file_i]
                    try:
                        self.write_parse_tree(fd, test_xml_file)
                    except Exception:
                        err = sys.exc_info()
                        print('Error in {0}: {1}'.format(test_xml_file, err))
                        parts_filename = os.path.split(test_xml_file)
                        failure_parse_file_list.append(parts_filename[1])

        # Now write the test sentences
        with open(test_file, 'w') as fd:
            # TBD: speed this up by turning train_index_list into a set for O(1) membership tests
            for file_i in range(len(file_list)):
                if file_i not in train_index_list:
                    test_xml_file = file_list[file_i]
                    parts_filename = os.path.split(test_xml_file)
                    if parts_filename[1] in failure_parse_file_list:
                        print('ignoring sentence extraction from {0}'.format(test_xml_file))
                        continue
                    try:
                        self.write_sentences(fd, test_xml_file)
                    except Exception:
                        err = sys.exc_info()[0]
                        print('Error in extracting sentence from {0}: {1}'.format(test_xml_file, err))

    # Create parse trees from xml file and write in the train/test file (fd)
    def write_parse_tree(self, fd, xml_filename):
        tree = ET.parse(xml_filename)
        root = tree.getroot()

        count_parse_tree_in_xml = 0
        # reading the sentences only from the section: AbstractText
        for abstractText in root.iter('AbstractText'):
            # iterate over each of the sentences
            for sentence in abstractText.iter('sentence'):
                # TBD: Following should have a single iteration
                #       Need to check if the root is an actual root tag e.g. 'S'
                for sentence_root in sentence:
                    tree_ptb_array = self.create_parse_tree(sentence_root)
                    tree_cnf_array = deepcopy(tree_ptb_array)
                    self.convert_PTB_to_CNF(tree_cnf_array)
                    # convert string into json (this converts single quotes to double quotes)
                    # required due to failure in json load of tree in count_cfg_freq.py
                    tree_cnf_json = json.dumps(tree_cnf_array)
                    fd.write('{0}\n'.format(tree_cnf_json))
                    count_parse_tree_in_xml += 1

        return count_parse_tree_in_xml

    # Extract sentences from xml file and write in fd
    @staticmethod
    def write_sentences(fd, xml_filename):
        tree = ET.parse(xml_filename)
        root = tree.getroot()

        for abstractText in root.iter('AbstractText'):
            for sentence in abstractText.iter('sentence'):
                token_array = []
                for token in sentence.iter('tok'):
                    token_array.append(token.text)
                fd.write('{0}\n'.format(' '.join(token_array)))

    def create_train_with_rare(self, orig_train_key_file, mod_train_key_file):
        # First check if self.word_freq is already populated or not
        if len(self.cfg_counter.unary) == 0:
            self.compute_cfg_frequency_in_train_file(orig_train_key_file)
        if len(self.word_freq) == 0:
            self.compute_word_frequency_in_cfg()

        # Now iterate through each parse tree and replace the rare word with rare symbol
        # Write the changed parse trees into new train file
        count_parse_tree = 0
        with open(mod_train_key_file, 'w') as wfd:
            with open(orig_train_key_file, 'r') as rfd:
                for line in rfd:
                    count_parse_tree += 1
                    tree = json.loads(line)
                    try:
                        self.replace_infrequent_words_in_parse_tree(tree)
                        tree_json = json.dumps(tree)
                        wfd.write('{0}\n'.format(tree_json))
                    except Exception:
                        print('Error: create_train_with_rare(): parse tree # {0} :: line: {1}\n'.format(count_parse_tree, line))

    def compute_cfg_frequency_in_train_file(self, train_file):
        self.cfg_counter = Counts()
        with open(train_file, 'r') as fd:
            for line in fd:
                try:
                    cfg_tree = json.loads(line)
                    self.cfg_counter.count(cfg_tree)
                except Exception:
                    print('Error: compute_cfg_frequency_in_train_file(): line: {0}'.format(line))

    def compute_word_frequency_in_cfg(self):
        self.word_freq = defaultdict(int)

        # Terminal word can be assigned to multiple non-Terminals
        for (sym, word), count in self.cfg_counter.unary.iteritems():
            self.word_freq[word] += count

    def write_word_frequency_in_cfg(self, word_freq_file):
        word_freq_sorted = sorted(self.word_freq.items(), key=lambda x: x[1], reverse=True)
        with open(word_freq_file, 'w') as fd:
            for word_freq in word_freq_sorted:
                fd.write('{0} {1}\n'.format(word_freq[1], word_freq[0]))

    def is_rare_word(self, word):
        return self.word_freq[word.lower()] < self.rare_count_threshold

    def replace_infrequent_words_in_parse_tree(self, tree):
        if isinstance(tree, basestring):
            return
        if len(tree) == 3:
            # binary rule
            self.replace_infrequent_words_in_parse_tree(tree[1])
            self.replace_infrequent_words_in_parse_tree(tree[2])
        elif len(tree) == 2:
            # unary rule
            word = tree[1]
            assert (type(tree[1]) is not list), "expected string: {0}".format(tree[1])
            if self.is_rare_word(word):
                tree[1] = '_RARE_'

    # x -> y z
    # x,y,z are non-terminals
    def compute_binary_parameter(self, x, y, z):
        key = (x, y, z)
        if key not in self.cfg_counter.binary:
            return 0.0
        else:
            return self.cfg_counter.binary[key]*1.0/self.cfg_counter.nonterm[x]

    # x -> w
    # x: non terminal
    # w: terminal
    def compute_unary_parameter(self, x, w):
        key = (x, w)
        if key not in self.cfg_counter.unary:
            return 0.0
        else:
            return self.cfg_counter.unary[key]*1.0/self.cfg_counter.nonterm[x]
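
    # Worked example with hypothetical counts: if count(S -> NP VP) = 30 and
    # count(S) = 40, then q(S -> NP VP) = 30/40 = 0.75; likewise, if
    # count(NOUN -> 'cat') = 5 and count(NOUN) = 50, q(NOUN -> 'cat') = 0.1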

    def compute_parse_tree_using_cky_algorithm(self, sentence):
        tokens = sentence.strip().split(' ')
        n = len(tokens)

        # replace the rare token with rare symbol
        for i in range(n):
            if self.is_rare_word(tokens[i]):
                tokens[i] = '_RARE_'.lower()
            else:
                tokens[i] = tokens[i].lower()

        pi = {}  # store the matrix in dict form
        bp = {}

        # initialize the dict
        for i in range(n):
            pi[i] = {}
            bp[i] = {}

            for j in range(i, n):
                pi[i][j] = {}
                bp[i][j] = {}

                if i == j:
                    # initialize pi for x->w for each of the word
                    for x in self.cfg_counter.nonterm:
                        pi[i][i][x] = self.compute_unary_parameter(x, tokens[i])
                        bp[i][i][x] = None
                else:
                    for x in self.cfg_counter.nonterm:
                        pi[i][j][x] = 0.0
                        bp[i][j][x] = None

        # Now for x -> y z where x,y,z are non-terminals
        for l in range(1, n):
            for i in range(n-l):
                j = i + l
                # Here we only consider the (x,y,z) tuple which we have seen in training data
                for (x, y, z) in self.cfg_counter.binary.keys():
                    q_x_to_yz = self.compute_binary_parameter(x, y, z)

                    if q_x_to_yz <= pi[i][j][x]:
                        # current x -> y z  can't give better probability than already computed max prob with
                        # non-terminal x spanning words i..j inclusive
                        continue

                    max_arg_s, max_val_s_i_j = self.compute_best_split(pi, i, j, y, z)

                    # now check if the current value of pi[i][j][x] is better than the value computed earlier
                    val = q_x_to_yz * max_val_s_i_j
                    if pi[i][j][x] <= val:
                        pi[i][j][x] = val
                        bp[i][j][x] = [max_arg_s, y, z]

                """
                # In the following we try all the (x,y,z) combinations
                # This is a slower process
                for x in self.cfg_counter.nonterm:
                    max_pi_i_j_x = 0.0
                    # [s,y,z]
                    arg_max_pi = [None, None, None]

                    for y in self.cfg_counter.nonterm:
                        for z in self.cfg_counter.nonterm:
                            q_x_to_yz = self.compute_binary_parameter(x, y, z)

                            max_val_s_i_j = 0.0
                            max_arg_s = None
                            for s in range(i, j):
                                val = pi[i][s][y] * pi[s+1][j][z]
                                if max_val_s_i_j < val:
                                    max_val_s_i_j = val
                                    max_arg_s = s

                            val = q_x_to_yz * max_val_s_i_j
                            if max_pi_i_j_x < val:
                                max_pi_i_j_x = val
                                arg_max_pi = [max_arg_s, y, z]

                    pi[i][j][x] = max_pi_i_j_x
                    bp[i][j][x] = arg_max_pi
                """

        assert (pi[0][n-1]['S'] > 0), "pi[0][{0}]['S'] is zero".format(n-1)
        # re-split the sentence into its original tokens; at the start of this
        # function the rare tokens were replaced with the rare symbol
        tokens = sentence.strip().split(' ')
        best_parse_tree = self.create_parse_tree_from_backpointers(tokens, bp, 0, n-1, 'S')
        best_prob = pi[0][n-1]['S']

        return best_parse_tree, best_prob

    # For the given x -> y z for the words spanning i,..,j inclusive, compute the best split
    # Select s which maximizes pi(i,s,y)*pi(s+1,j,z)
    @staticmethod
    def compute_best_split(pi, i, j, y, z):
        max_val_s_i_j = 0.0
        max_arg_s = None
        for s in range(i, j):
            val = pi[i][s][y] * pi[s + 1][j][z]
            if max_val_s_i_j <= val:
                max_val_s_i_j = val
                max_arg_s = s

        return max_arg_s, max_val_s_i_j

    def create_parse_tree_from_backpointers(self, tokens, bp, i, j, x):
        if i == j:
            return [x, tokens[i]]

        assert (bp[i][j][x] is not None), "bp[{0}][{1}][{2}] is None".format(i, j, x)
        split_index, y, z = bp[i][j][x]

        parse_tree_left_child = self.create_parse_tree_from_backpointers(tokens, bp, i, split_index, y)
        parse_tree_right_child = self.create_parse_tree_from_backpointers(tokens, bp, split_index+1, j, z)

        parse_tree = [x, parse_tree_left_child, parse_tree_right_child]

        return parse_tree

    def compute_parse_tree_for_test_sentences(self, test_sentence_file, test_key_file):
        with open(test_key_file, 'w') as wfd:
            with open(test_sentence_file, 'r') as rfd:
                sent_i = 0
                for sentence in rfd:
                    sent_i += 1
                    if sent_i % 20 == 19:
                        print('sent_i: {0}'.format(sent_i))

                    try:
                        tree, prob = self.compute_parse_tree_using_cky_algorithm(sentence)
                        # convert into json
                        tree_json = json.dumps(tree)
                        wfd.write('{0}\n'.format(tree_json))
                        print('prob of sent_i #{0}: {1}'.format(sent_i, prob))
                    except Exception:
                        err = sys.exc_info()[0]
                        print('Error: compute_parse_tree_for_test_sentences(): '
                              'sent_i: {0} :: sentence: {1} :: error: {2}\n'.format(sent_i, sentence, err))
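
A minimal end-to-end sketch of how this class appears to be driven; the file names and the training-set size are illustrative, not taken from the source:

pcfg = PCFG(rare_count_threshold=5)
pcfg.create_train_and_test_parse_tree('genia_xml', 'train.key', 'test.sent',
                                      'test.key', n_train_file=400)
pcfg.create_train_with_rare('train.key', 'train_rare.key')
# recount on the rare-substituted trees before parsing
pcfg.compute_cfg_frequency_in_train_file('train_rare.key')
pcfg.compute_parse_tree_for_test_sentences('test.sent', 'test_parsed.key')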
class PCFG:
    def __init__(self, original_train_file, modified_train_file, count_threshold=5):
        self.count_threshold = count_threshold
        self.word_freq = defaultdict(int)
        self.cfg_counter = Counts()
        self.bp_table = {}

    def compute_cfg_frequency_in_train_file(self, train_file):
        self.cfg_counter = Counts()
        with open(train_file, 'r') as fd:
            for line in fd:
                cfg_tree = json.loads(line)
                self.cfg_counter.count(cfg_tree)

    def compute_word_frequency_in_cfg(self):
        self.word_freq = defaultdict(int)
        # case sensitive
        # https://class.coursera.org/nlangp-001/forum/thread?thread_id=631

        # Terminal word can be assigned to multiple non-Terminals
        # https://class.coursera.org/nlangp-001/forum/thread?thread_id=620#post-2613
        for (sym, word), count in self.cfg_counter.unary.iteritems():
            self.word_freq[word] += count

    def is_rare_word(self, word):
        return self.word_freq[word] < self.count_threshold

    def replace_infrequent_words_in_parse_tree(self, tree):
        if isinstance(tree, basestring):
            return
        if len(tree) == 3:
            # binary rule
            self.replace_infrequent_words_in_parse_tree(tree[1])
            self.replace_infrequent_words_in_parse_tree(tree[2])
        elif len(tree) == 2:
            # unary rule
            word = tree[1]
            if self.is_rare_word(word):
                tree[1] = '_RARE_'

    def compute_binary_parameter(self, symbol, y1, y2):
        return self.cfg_counter.binary[(symbol, y1, y2)]*1.0/self.cfg_counter.nonterm[symbol]

    def compute_unary_parameter(self, symbol, word):
        return self.cfg_counter.unary[(symbol, word)]*1.0/self.cfg_counter.nonterm[symbol]

    def CKY_algorithm(self, sentence):
        sentence_tokens = re.split(r'[ ]+', sentence.rstrip())
        n_tokens = len(sentence_tokens)

        max_prob_table = defaultdict(float)  # pi table
        self.bp_table = {}

        # now build the dynamic programming table bottom-up
        for symbol in self.cfg_counter.nonterm.iterkeys():
            for i in range(0, n_tokens):
                word = sentence_tokens[i]
                if self.is_rare_word(word):
                    word = '_RARE_'
                key = (symbol, word)
                if key in self.cfg_counter.unary:
                    max_prob_table[(i, i, symbol)] = self.compute_unary_parameter(symbol, word)

        for step in range(1, n_tokens):
            for i in range(0, n_tokens-step):
                j = i + step

                for (sym, y1, y2) in self.cfg_counter.binary.iterkeys():
                    binary_param = self.compute_binary_parameter(sym, y1, y2)
                    max_prob_mult = 0
                    max_prob_s = None
                    for s in range(i, j):
                        prob_mult = max_prob_table[(i, s, y1)]*max_prob_table[(s+1, j, y2)]
                        if max_prob_mult < prob_mult:
                            max_prob_mult = prob_mult
                            max_prob_s = s

                    prob_with_current_binary_rule_over_i_j = binary_param*max_prob_mult

                    if max_prob_table[(i, j, sym)] < prob_with_current_binary_rule_over_i_j:
                        max_prob_table[(i, j, sym)] = prob_with_current_binary_rule_over_i_j
                        self.bp_table[(i, j, sym)] = (max_prob_s, y1, y2)

        parse_tree = self.create_parse_tree(0, n_tokens-1, 'SBARQ', sentence_tokens)
        return parse_tree

    def create_parse_tree(self, i, j, sym, sentence_tokens):
        # [sym, func(i,s,y1), func(s+1,j,y2)]
        if i == j:
            parse_sub_tree = [sym, sentence_tokens[i]]
        else:
            split_tuple = self.bp_table[(i, j, sym)]
            s = split_tuple[0]
            y1 = split_tuple[1]
            y2 = split_tuple[2]

            parse_sub_tree = [sym, self.create_parse_tree(i, s, y1, sentence_tokens),
                              self.create_parse_tree(s+1, j, y2, sentence_tokens)]

        return parse_sub_tree
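
A sketch of how this variant might be driven end to end; the file names and the sample question are illustrative, and the SBARQ start symbol matches the one hard-coded in CKY_algorithm:

pcfg = PCFG('parse_train.dat', 'parse_train_rare.dat')
pcfg.compute_cfg_frequency_in_train_file('parse_train.dat')
pcfg.compute_word_frequency_in_cfg()
# ... rare-word substitution and a recount on the modified file would go here ...
pcfg.compute_cfg_frequency_in_train_file('parse_train_rare.dat')
tree = pcfg.CKY_algorithm('What states border Texas ?')
print json.dumps(tree)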
Example #10
File: HW4.py Project: sulfamide/cs4705
import json
from count_cfg_freq import Counts

counter = Counts()
vocab = {}


def loadData(corpus_file):
    l = corpus_file.readline()
    while l:
        t = json.loads(l)
        counter.count(t)
        wordList = [y for x, y in extract_tag(t)]
        for word in wordList:
            if word not in vocab:
                vocab[word] = 0
            vocab[word] += 1
        l = corpus_file.readline()


def extract_tag(t):
    if len(t) == 3:
        return extract_tag(t[1]) + extract_tag(t[2])
    if len(t) == 2:
        return [(t[0], t[1])]


def tagRareWord(corpus_file, new_corpus_file):
    l = corpus_file.readline()
    while (l):
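
The listing is truncated here. A hedged completion consistent with the vocab counts built in loadData() might look like this; the 5-count threshold and the replace_rare helper are assumptions:

def tagRareWord(corpus_file, new_corpus_file):
    l = corpus_file.readline()
    while l:
        t = json.loads(l)
        replace_rare(t)  # hypothetical helper, sketched below
        new_corpus_file.write(json.dumps(t) + '\n')
        l = corpus_file.readline()


def replace_rare(t):
    # hypothetical helper: tag terminals seen fewer than 5 times as _RARE_
    if len(t) == 3:
        replace_rare(t[1])
        replace_rare(t[2])
    elif len(t) == 2 and vocab.get(t[1], 0) < 5:
        t[1] = '_RARE_'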
Example #11
File: HW5.py Project: sulfamide/cs4705
class CKY:
    def __init__(self):
        self.binary_q = {}
        self.unary_q = {}
        self.counter = Counts()
        self.pi = []
        self.bp = []
        self.parser = {}
        self.use_vert = False

    def loadData(self, corpus_file):
        l = corpus_file.readline()
        while l:
            t = json.loads(l)
            self.counter.count(t)
            l = corpus_file.readline()

    def compute(self):
        # multiply by 1.0 so Python 2 does not truncate with integer division
        for (sym, word), count in self.counter.unary.iteritems():
            self.unary_q[(sym, word)] = count * 1.0 / self.counter.nonterm[sym]
        for (sym, y1, y2), count in self.counter.binary.iteritems():
            self.binary_q[(sym, y1, y2)] = count * 1.0 / self.counter.nonterm[sym]

    def buildNTDict(self):
        self.binary_rule = {}
        for x, y1, y2 in self.binary_q.keys():
            if x not in self.binary_rule:
                self.binary_rule[x] = []
            self.binary_rule[x].append((y1, y2))

    def cky_init(self, wordList):
        self.pi = []
        self.bp = []
        sent_len = len(wordList)
        for i in range(0, sent_len):
            self.pi.append([])
            self.bp.append([])
            for j in range(0, sent_len):
                self.pi[i].append({})
                self.bp[i].append({})
            for (sym, word), q in self.unary_q.iteritems():
                if wordList[i] == word:
                    self.pi[i][i][sym] = q
                    self.bp[i][i][sym] = word

    def clean_sent(self, sentence):
        wordList = sentence.split()
        # use a set of known words so membership tests are O(1)
        vocabSet = set(v for s, v in self.counter.unary.keys())
        for i, word in enumerate(wordList):
            if word not in vocabSet:
                wordList[i] = '_RARE_'
        return wordList

    def cky_algorithm(self, sentence):
        self.inputWordList = sentence.split()
        wordList = self.clean_sent(sentence)
        sent_len = len(wordList)
        self.cky_init(wordList)

        for l in range(1, sent_len):
            for i in range(0, sent_len - l):
                j = i + l
                for sym in self.binary_rule.keys():
                    for s in range(i, j):
                        derivations = self.binary_rule[sym]
                        for y1, y2 in derivations:
                            # skip splits where either child is absent from the chart
                            if y1 not in self.pi[i][s] or y2 not in self.pi[s + 1][j]:
                                continue
                            temp = (self.binary_q[(sym, y1, y2)]
                                    * self.pi[i][s][y1]
                                    * self.pi[s + 1][j][y2])
                            # keep the highest-scoring derivation and its backpointer
                            if temp > self.pi[i][j].get(sym, 0):
                                self.pi[i][j][sym] = temp
                                self.bp[i][j][sym] = (s, y1, y2)

        # use 'S' as the root if it spans the whole sentence; otherwise fall
        # back to the highest-scoring symbol in the top cell
        if 'S' in self.pi[0][sent_len - 1]:
            root_score = 'S', self.pi[0][sent_len - 1]['S']
        else:
            root_score = max(self.pi[0][sent_len - 1].iteritems(),
                             key=lambda x: x[1])
        self.parse_res = self.buildTreeHelper(0, sent_len - 1, root_score[0])

    def buildTreeHelper(self, start, end, root):
        if start == end:
            return '["' + root + '", "' + self.inputWordList[start] + '"]'
        else:
            s, y1, y2 = self.bp[start][end][root]
            r1 = self.buildTreeHelper(start, s, y1)
            r2 = self.buildTreeHelper(s + 1, end, y2)
            return '["' + root + '", ' + r1 + ', ' + r2 + ']'

    def dev(self, dev_file, output_file):
        output = open(output_file, 'w')
        for l in open(dev_file):
            print l.strip()
            self.parse_res = ""
            self.cky_algorithm(l.strip())
            if self.use_vert:
                # strip vertical-markovization annotations such as ^[NP]
                self.parse_res = re.sub(r"\^\[[A-Z]+\]", "", self.parse_res)
            output.write(self.parse_res)
            output.write('\n')
        output.close()
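
A sketch of how this CKY class is presumably driven (the file names are illustrative):

cky = CKY()
cky.loadData(open('parse_train.dat'))
cky.compute()
cky.buildNTDict()
cky.dev('parse_dev.dat', 'parse_dev.out')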
Example #12
 def load_params(self, input):
     self.counter = Counts() 
     self.counter.load(open(input))
Example #13
class CKY(object):
    
    def __init__(self):
        self.counter = None
    
    def load_params(self, input):
        self.counter = Counts() 
        self.counter.load(open(input))
        
    def cky(self, test_file):
        for line in test_file:
            self.parse(line)
            
    def constract_result(self, tp, words, i, j, N):
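        # note: spans are 1-indexed in this variant, hence words[i-1] below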
        tree = []
        tree.append(N)
        if i == j:
            tree.append(words[i-1])
            return tree
        else:
            (s, Y, Z) = tp[(i, j, N)]
            tree.append(self.constract_result(tp, words, i, s, Y))
            tree.append(self.constract_result(tp, words, s+1, j, Z))
            return tree

    def parse(self, line):
        t = {}
        tp = {}
        # initialization: fill the diagonal with unary probabilities
        words = line.strip().split(" ")
        n = len(words)
        i = 1
        for word in words:
            if self.counter.unary_pob_reorg.has_key(word):
                for N, freq in self.counter.unary_pob_reorg[word].iteritems():
                    t[(i, i, N)] = freq
            else:
                # unseen words fall back to the rare label's distribution
                # (rare_label is defined elsewhere in this file)
                for N, freq in self.counter.unary_pob_reorg[rare_label].iteritems():
                    t[(i, i, N)] = freq
            i += 1

        # dynamic programming over increasing span lengths (spans are 1-indexed)
        for l in range(1, n):
            for i in range(1, n - l + 1):
                j = i + l
                for N in self.counter.nonterm.iterkeys():
                    if self.counter.binary_pob_reorg.has_key(N):
                        max_p = 0
                        max_s = i
                        max_Y = None
                        max_Z = None
                        for (Y, Z), freq in self.counter.binary_pob_reorg[N].iteritems():
                            for s in range(i, j):
                                if t.has_key((i, s, Y)) and t.has_key((s+1, j, Z)):
                                    p = freq * t[(i, s, Y)] * t[(s+1, j, Z)]
                                    if p > max_p:
                                        max_p = p
                                        max_s = s
                                        max_Y = Y
                                        max_Z = Z
                        if max_p > 0:
                            t[i, j, N] = max_p
                            tp[i, j, N] = (max_s, max_Y, max_Z)

        # follow the backpointers to build the result tree
        tree = self.constract_result(tp, words, 1, n, "SBARQ")
        print json.dumps(tree)
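
An illustrative driver for this class; the counts file name is hypothetical, and load() and the *_pob_reorg tables are extensions to the course's Counts class that are not shown in this excerpt:

cky = CKY()
cky.load_params(open('cfg.counts'))
cky.cky(open('parse_dev.dat'))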
Example #14
    def __init__(self):
        Counts.__init__(self)

        self.wc = defaultdict(int)
        self.rare_tag = '_RARE_'
        self.rare_threshold = 5
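
The excerpt ends with this constructor. A hedged sketch of how such a Counts subclass might use the fields it declares; the count override and the is_rare helper are assumptions, and this assumes Counts.count recurses through self.count:

    def count(self, tree):  # hypothetical override
        Counts.count(self, tree)
        # tally terminal words so rare ones can be replaced later
        if not isinstance(tree, basestring) and len(tree) == 2:
            self.wc[tree[1]] += 1

    def is_rare(self, word):  # hypothetical helper
        return self.wc[word] < self.rare_threshold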