from copy import deepcopy

from nltk import tree, treetransforms


def get_productions(treebank):
    """
    Returns the CNF rules of the grammar derived from the input treebank.
    Rules are stored in a dictionary in LHS:[RHS, RHS, RHS...] format,
    and the RHS list deliberately keeps duplicates so that rule
    probabilities can be estimated later. Also returns the start symbol
    of the grammar.

    :param treebank: list where each entry is a str holding one bracketed
        tree from the input treebank file
    """
    
    # grab the root of the first tree as the grammar's start symbol:
    # split treebank[0] on whitespace, take the first token, drop the leading "("
    start = treebank[0].split()[0][1:]

    rules = {}
    for sentence in treebank:
        # converting the tree to CNF using NLTK magic 
        t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)
        collapsed_tree = deepcopy(t)
        treetransforms.collapse_unary(collapsed_tree)
        cnf_tree = deepcopy(collapsed_tree)
        treetransforms.chomsky_normal_form(cnf_tree)

        # using more NLTK magic to grab the production rules from each tree 
        for raw_rule in cnf_tree.productions():
            rule = str(raw_rule)
            production = rule.split(" -> ")
            if production[0] not in rules:
                rules[production[0]] = [production[1]]
            else:
                rules[production[0]].append(production[1])
        # now rules contains LHS:[RHS, RHS, RHS...] pairs
    return rules, start
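
The docstring above keeps duplicate RHS entries precisely so that rule probabilities can be estimated afterwards. A minimal sketch of that follow-up step (the one-tree treebank literal is purely illustrative):

from collections import Counter

def estimate_probabilities(rules):
    """Turn LHS:[RHS, RHS, ...] lists (with duplicates) into
    LHS:{RHS: P(LHS -> RHS)} relative-frequency estimates."""
    probs = {}
    for lhs, rhs_list in rules.items():
        counts = Counter(rhs_list)
        probs[lhs] = {rhs: n / len(rhs_list) for rhs, n in counts.items()}
    return probs

treebank = ["(S (NP (DT the) (NN dog)) (VP (VBD barked)))"]
rules, start = get_productions(treebank)
print(start)                          # S
print(estimate_probabilities(rules))  # e.g. {'S': {'NP VP': 1.0}, ...}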
Example #2
    # assumes module-level imports:
    # from nltk import Nonterminal, induce_pcfg, treetransforms
    @classmethod
    def get_grammar(cls, train_trees, starting_symb='SENT'):
        """
        Returns the grammar computed from the training set.

        Inputs:
        -------

        train_trees (list): List of trees used for training
        starting_symb (str): The root symbol
        """
        productions = []

        # Chomsky Normal Form
        for tree in train_trees:
            
            # Remove unary rules
            treetransforms.collapse_unary(tree)

            # Transform to CNF
            treetransforms.chomsky_normal_form(tree, horzMarkov=2)

            # Collect the productions and store them
            productions += tree.productions()

        # Define the root symbol
        SENT = Nonterminal(starting_symb)

        # Compute the grammar using PCFG
        grammar = induce_pcfg(SENT, productions)

        # NB: CFG.chomsky_normal_form() returns a new grammar rather than
        # mutating in place, and the trees were already binarized above,
        # so the induced grammar is returned as-is

        return grammar
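
A hedged usage sketch: the induced PCFG can drive NLTK's ViterbiParser directly (the Corpus class name is hypothetical, and the tokens must all be covered by the grammar's lexical rules):

from nltk.parse import ViterbiParser

grammar = Corpus.get_grammar(train_trees, starting_symb='SENT')
parser = ViterbiParser(grammar)
for parse in parser.parse(['le', 'chat', 'dort']):  # raises ValueError on unknown tokens
    print(parse)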
Example #3
def preprocess(y):
    # in-place NLTK transforms (assumes: from nltk import treetransforms)
    treetransforms.collapse_unary(y, collapsePOS=True)
    treetransforms.chomsky_normal_form(y, horzMarkov=2, vertMarkov=1)
    traverse_tree(y)  # project-specific helper, defined elsewhere
    # we got the modified tree, so we need to recalculate the scores
    change_labels(y)  # project-specific helper, defined elsewhere
    y._label = 4  # force the root label
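
For context, a small self-contained sketch of what the two in-place transforms above do to a toy tree (the '+' labels come from collapse_unary, the '|<...>' and '^<...>' labels from the markovized binarization):

from nltk import Tree, treetransforms

toy = Tree.fromstring(
    "(S (NP (NN dog)) (VP (VBD saw) (NP (DT the) (NN cat)) (ADVP (RB today))))")
treetransforms.collapse_unary(toy, collapsePOS=True)   # (NP (NN dog)) -> (NP+NN dog)
treetransforms.chomsky_normal_form(toy, horzMarkov=2, vertMarkov=1)
print(toy)  # the ternary VP is now binarized with artificial VP|<...> nodes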
Example #4
import pickle

from nltk import grammar
from nltk.corpus import treebank
from nltk.corpus.reader import CategorizedBracketParseCorpusReader
from nltk.corpus.util import LazyCorpusLoader
from nltk.treetransforms import collapse_unary, chomsky_normal_form


def trained_pcfg():
  try:
    with open("pcfgcache.pkl",'rb') as input:
      print("Loading the PCFG...")
      gram = pickle.load(input)
    print("Loaded!")
    return gram
  except FileNotFoundError:
    print("Training the PCFG...")
    ptb = LazyCorpusLoader(  # Penn Treebank v3: WSJ
        'ptb', CategorizedBracketParseCorpusReader, r'wsj/\d\d/wsj_\d\d\d\d.mrg',
        cat_file='allcats.txt', tagset='wsj')
    productions = []
    tb = treebank  # fallback: the treebank sample bundled with NLTK
    # For the full treebank, place all the wsj/XX/*.mrg files under
    # nltk_data/corpora/ptb
    useFullTreeBank = True
    n = 0                   # check progress of training
    if useFullTreeBank:
      tb = ptb
    for t in tb.parsed_sents(): 
      if n % 200 == 0:
        print(n)
      collapse_unary(t, True)  # second arg: collapsePOS
      chomsky_normal_form(t)
      n = n + 1
      for p in t.productions():
        productions.append(p)
    gram = grammar.induce_pcfg(grammar.Nonterminal('S'), productions)
    print("Trained!")
    print("Writing the PCFG...")
    with open("pcfgcache.pkl",'wb') as output:
      pickle.dump(gram, output, -1)
    print("Write successful!")
    return gram
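
A quick sketch of inspecting the cached grammar once trained_pcfg() returns (real NLTK calls; the exact productions depend on which treebank was used):

from nltk.grammar import Nonterminal

gram = trained_pcfg()
print(gram.start())  # S
# a few S-productions with their relative-frequency probabilities
for prod in gram.productions(lhs=Nonterminal('S'))[:5]:
  print(prod)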
Example #5
    def chomsky_normal_form(self):
        chomsky_parsed_sentences = []
        for parsed_sentence in self.parsed_sentences:
            try:
                tree = deepcopy(parsed_sentence)
                treetransforms.collapse_unary(tree)
                cnfTree = deepcopy(tree)
                treetransforms.chomsky_normal_form(cnfTree)
                chomsky_parsed_sentences.append(cnfTree)
            except Exception:
                # silently skip trees that fail to transform
                pass
        self.parsed_sentences = chomsky_parsed_sentences
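
Both transforms mutate their argument in place and return None, which is why the method deepcopies before transforming. A tiny self-contained illustration:

from copy import deepcopy
from nltk import Tree, treetransforms

t = Tree.fromstring("(S (NP (NP (NN dogs))) (VP (VBD barked)))")
backup = deepcopy(t)
treetransforms.collapse_unary(t)  # mutates t in place, returns None
print(t)       # (S (NP+NP (NN dogs)) (VP (VBD barked)))
print(backup)  # original shape preserved thanks to deepcopy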
Example #6
def demo():
    """
    A demonstration showing how each tree transform can be used.
    """

    from copy import deepcopy

    from nltk import tree, treetransforms
    from nltk.draw.tree import draw_trees

    # original tree from WSJ bracketed text
    sentence = """(TOP
  (S
    (S
      (VP
        (VBN Turned)
        (ADVP (RB loose))
        (PP
          (IN in)
          (NP
            (NP (NNP Shane) (NNP Longman) (POS 's))
            (NN trading)
            (NN room)))))
    (, ,)
    (NP (DT the) (NN yuppie) (NNS dealers))
    (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))
    (. .)))"""
    t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True)

    # collapse subtrees with only one child
    collapsedTree = deepcopy(t)
    treetransforms.collapse_unary(collapsedTree)

    # convert the tree to CNF
    cnfTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(cnfTree)

    # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two
    parentTree = deepcopy(collapsedTree)
    treetransforms.chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1)

    # convert the tree back to its original form (used to make CYK results comparable)
    original = deepcopy(parentTree)
    treetransforms.un_chomsky_normal_form(original)

    # convert tree back to bracketed text
    sentence2 = original.pprint()
    print(sentence)
    print(sentence2)
    print("Sentences the same? ", sentence == sentence2)

    draw_trees(t, collapsedTree, cnfTree, parentTree, original)
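
For readers new to the label conventions, two lines that could be appended inside demo() to surface the artificial node labels ('|<...>' marks horizontal markovization, '^<...>' parent annotation):

    print([st.label() for st in cnfTree.subtrees() if '|' in st.label()])
    print([st.label() for st in parentTree.subtrees() if '^' in st.label()])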
Example #7
    @classmethod
    def get_trees(cls, path_to_dataset, train_split=0.8):
        """
        Returns the train, test and eval sets as lists of trees.

        Inputs:
        -------

        path_to_dataset (str): The path to the corpus to be split
        train_split (float): Proportion of the data used for training
        """
        sentences = []
        print('Collecting training, test and evaluation trees')
        with open(path_to_dataset) as f:

            for sentence in f:
                # Removes functional labels
                sent = re.sub(r'-\w+\ ', " ", sentence)
                sentences.append(sent.rstrip())

        # Split Train / (Test + Eval)
        train_sent, test_sent = train_test_split(sentences, train_size=train_split, test_size=1 - train_split, shuffle=False)

        # Split Test / Eval
        eval_sent, test_sent = train_test_split(test_sent, train_size=0.5, shuffle=False)

        print(f'The total number of sentences {len(sentences)}')
        print(f'Number of train sentences {len(train_sent)} -- {round(100 * len(train_sent)  / len(sentences), 1)} %')
        print(f'Number of test sentences {len(test_sent)} -- {round(100 * len(test_sent)  / len(sentences), 1) } %')
        print(f'Number of evaluation sentences {len(eval_sent)} -- {round(100 * len(eval_sent)  / len(sentences), 1)} %')


        train_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in train_sent]
        test_trees  = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in test_sent]
        eval_trees = [Tree.fromstring(sent, remove_empty_top_bracketing=True) for sent in eval_sent]

        # Remove unary chains and transform the test/eval trees to CNF,
        # in place (iterating the two lists separately; the original zip
        # would skip the last tree whenever the 50/50 split leaves the
        # lists off by one)
        for t in test_trees + eval_trees:
            treetransforms.collapse_unary(t)
            treetransforms.chomsky_normal_form(t, horzMarkov=2)


        return train_trees, eval_trees, test_trees
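
A hedged end-to-end sketch tying this method to get_grammar from Example #2 (the Corpus class name and the corpus path are hypothetical; the two snippets appear to come from the same project):

train_trees, eval_trees, test_trees = Corpus.get_trees('corpus.txt', train_split=0.8)
grammar = Corpus.get_grammar(train_trees, starting_symb='SENT')
print(grammar.start())            # SENT
print(len(grammar.productions()))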
Example #8
import json
import time
from copy import deepcopy

import numpy as np
from nltk import treetransforms
from nltk.treetransforms import chomsky_normal_form


def transform_sst_to_acd_trees(sents, tag='train', filter_len=20):
    # model, agg, batch_from_str_list, comp_to_tree and tree_to_str are
    # project-specific names assumed to be defined elsewhere
    # base parameters
    sweep_dim = 1  # how large chunks of text should be considered (1 for words)
    method = 'cd'  # build_up, break_down, cd
    percentile_include = 99.5  # keep this very high so we don't add too many words at once
    num_iters = 25  # maximum number of iterations (rarely reached)

    new_data = []
    cnt = time.time()

    for s_i, sent in enumerate(sents):
        # prepare inputs
        if s_i % 10 == 0:
            print("phase {} time {} processed {}".format(
                tag,
                time.time() - cnt, s_i))

        if len(sent) > filter_len:
            continue
        # print("{} time {} - 0".format(s_i, time.time() - cnt))
        # cnt = time.time()

        sent = [w.lower() for w in sent]

        batch = batch_from_str_list(sent)
        scores_all = model(batch).data.numpy()[0]  # predict
        label_pred = np.argmax(scores_all)  # get predicted class

        # agglomerate
        lists = agg.agglomerate(
            model,
            batch,
            percentile_include,
            method,
            sweep_dim,  # only works for sweep_dim = 1
            sent,
            label_pred,
            num_iters=num_iters
        )  # see agg_1d.agglomerate to understand what this dictionary contains
        # print("{} time {} - 1.5".format(s_i, time.time() - cnt))
        lists = agg.collapse_tree(lists)  # don't show redundant joins

        # print("{} time {} - 2".format(s_i, time.time() - cnt))
        # cnt = time.time()
        # gather tree
        children = comp_to_tree(lists, sent)

        # print("{} time {} - 3".format(s_i, time.time() - cnt))
        # cnt = time.time()
        # uniary combine the tree, then binarize it
        tree = deepcopy(children[0])
        treetransforms.collapse_unary(tree)

        chomsky_normal_form(tree, factor='left')

        new_data.append(tree_to_str(tree))

        if s_i % 100 == 0:
            json.dump(new_data, open('tmp.json', 'w'))

    return new_data
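
The factor='left' argument above requests left-branching binarization; a small sketch contrasting it with NLTK's default right factoring:

from copy import deepcopy
from nltk import Tree, treetransforms

t = Tree.fromstring("(S (A a) (B b) (C c) (D d))")
left, right = deepcopy(t), deepcopy(t)
treetransforms.chomsky_normal_form(left, factor='left')
treetransforms.chomsky_normal_form(right, factor='right')  # the default
print(left)   # artificial S|<...> nodes accumulate on the left spine
print(right)  # ... and on the right spine here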