Example #1
def multi_f(sentence):
    grammar_file = sys.argv[1]
    pcfg = PCFG()
    pcfg.load_model(grammar_file)
    parser = Parser(pcfg)
    tree = parser.parse(sentence)
    print(dumps(tree))
Example #2
def sqrt_PCFG(G: PCFG):
    """
    Input: a PCFG G
    Output: a PCFG that is the sqrt of G
    """
    WCFG_rules = {}
    for S in G.rules:
        WCFG_rules[S] = {
            F: (G.rules[S][F][0], G.rules[S][F][1]**(0.5))
            for F in G.rules[S]
        }

    # Yeah, I know... not exactly a PCFG (probabilities do not sum to 1), but it fits the bill
    WCFG = PCFG(start=G.start, rules=WCFG_rules)
    partition_function = compute_partition_function(WCFG)

    PCFG_rules = {}
    for S in WCFG.rules:
        new_rules_S = {}
        for F in WCFG.rules[S]:
            args_F = WCFG.rules[S][F][0]
            w = WCFG.rules[S][F][1]
            multiplier = prod(partition_function[arg] for arg in args_F)
            new_rules_S[F] = (args_F,
                              w * multiplier * 1 / partition_function[S])
        PCFG_rules[S] = new_rules_S
    return PCFG(G.start, PCFG_rules)
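The helper compute_partition_function is not shown in this listing. A minimal fixed-point sketch under the same rule layout (rules[S][F] = (arguments, weight)) could look like the following; the function body, iteration count, and initialisation are assumptions rather than code from the project:

from math import prod

def compute_partition_function(wcfg, iterations=100):
    # Z[S] approximates the total weight of all derivations rooted in S,
    # assuming every argument of a rule is itself a non-terminal of wcfg.
    # Starting from 0 and iterating converges towards the least fixed point.
    Z = {S: 0.0 for S in wcfg.rules}
    for _ in range(iterations):
        Z = {
            S: sum(
                w * prod(Z[arg] for arg in args)
                for (args, w) in wcfg.rules[S].values()
            )
            for S in wcfg.rules
        }
    return Z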
Example #3
File: main.py Project: Rigeru/NLP
def pcfg(sentence):
    pcfg = PCFG()
    pcfg.readCFGRules(FilePath.ROOT + "rules.txt")
    #pcfg.showRules()
    pcfg.parse(sentence)
    pcfg.showTrees()
    pass
Example #4
def a_star(G: PCFG):
    """
    A generator that enumerates all programs using A*.
    Assumes that the PCFG only generates programs of bounded depth.
    """

    frontier = []
    initial_non_terminals = deque()
    initial_non_terminals.append(G.start)
    heappush(
        frontier,
        (
            -G.max_probability[G.start].probability[(G.__hash__(), G.start)],
            (None, initial_non_terminals, 1),
        ),
    )
    # The frontier is a heap of pairs (-max_probability, (partial_program, non_terminals, probability))
    # describing a partial program:
    # max_probability is the probability of the most likely program completing the partial program,
    # partial_program is the list of primitives and variables describing the leftmost derivation,
    # non_terminals is the queue of non-terminals appearing from left to right, and
    # probability is the probability of the partial program

    while len(frontier) != 0:
        max_probability, (partial_program, non_terminals, probability) = heappop(
            frontier
        )
        if len(non_terminals) == 0:
            yield partial_program
        else:
            S = non_terminals.pop()
            for P in G.rules[S]:
                args_P, w = G.rules[S][P]
                new_partial_program = (P, partial_program)
                new_non_terminals = non_terminals.copy()
                new_probability = probability * w
                new_max_probability = new_probability
                for arg in args_P:
                    new_non_terminals.append(arg)
                    new_max_probability *= G.max_probability[arg].probability[
                        (G.__hash__(), arg)
                    ]
                heappush(
                    frontier,
                    (
                        -new_max_probability,
                        (new_partial_program, new_non_terminals, new_probability),
                    ),
                )
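Since a_star is a generator that yields complete programs best-first, a caller normally consumes only a prefix of it. A hypothetical usage, assuming a PCFG instance G constructed as in the surrounding project:

from itertools import islice

# Enumerate the ten highest-priority complete programs.
for program in islice(a_star(G), 10):
    print(program)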
Example #5
    def __init__(self, corpus_train):

        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_tags, self.PCFG.freq_tokens)

        self.tag_to_id = {tag: i for (i, tag) in enumerate(self.PCFG.list_all_tags)}

        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]

        # self.grammar_dicts[X][Y][Z] stores P(rule X->YZ)
        self.grammar_dicts = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left hand tag of the grammar rule
            idx_root_tag = self.tag_to_id[root_tag]
            self.grammar_dicts[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items():  # split is the right hand term, and proba the probability of the rule
                idx_left_tag = self.tag_to_id[split[0]]
                idx_right_tag = self.tag_to_id[split[1]]
                if idx_left_tag in dico.keys():
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_dicts[idx_root_tag] = dico
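The point of this nested layout is that the CKY combination step only visits right-hand sides that actually occur in the grammar. A rough, self-contained sketch of that step follows; the function and the cell format are illustrative assumptions, not code from the project:

def combine_cells(grammar_dicts, left_cell, right_cell):
    # left_cell and right_cell map tag ids to their best inside probabilities
    # for the two sub-spans; the result does the same for every parent tag X
    # such that some rule X -> Y Z matches the two cells.
    best = {}
    for idx_root, rhs in grammar_dicts.items():
        for idx_left, rights in rhs.items():
            left_score = left_cell.get(idx_left)
            if left_score is None:
                continue
            for idx_right, proba in rights.items():
                right_score = right_cell.get(idx_right)
                if right_score is None:
                    continue
                candidate = left_score * right_score * proba
                if candidate > best.get(idx_root, 0.0):
                    best[idx_root] = candidate
    return best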
Example #6
    def __init__(self, corpus):

        # PCFG and OOV class
        self.pcfg = PCFG(corpus)
        self.oov = OOV(self.pcfg.lexicon, self.pcfg.list_all_tags,
                       self.pcfg.tokens)

        # Initialize CYK probability matrix
        self.proba_matrix = None
        self.cyk_matrix = None
Example #7
def run(args):

    data = loader.load_treebanks(TREEBANK_PATH)
    train_data, dev_data, test_data = loader.train_test_split(
        data, 0.8, 0.1, 0.1)
    words, embeddings = loader.load_word_embeddings(EMBEDDING_PATH)

    pcfg = PCFG(train_data)
    pcfg.train(train_data)
    pcfg.set_oov(OOV, words, embeddings)

    if args.generate_output:
        output = pcfg.generate_output(test_data)

    if args.evaluation:
        accs, nb_no_parse = pcfg.predict(test_data[:2])

    if args.parse:
        corpus = []
        with open(args.txt_path, 'r') as f:
            corpus = f.read().split('\n')
        pcfg.parse_from_txt(corpus)
Example #8
def build_model():
    pcfg = PCFG()
    if exists(MODEL):
        pcfg.load_model(MODEL)
    
    else:
        print "Building the Grammar Model"
        start = time()
        
        if not exists(TEMP_DIR):
            makedirs(TEMP_DIR)
        
        # Normalise the treebanks
        if not exists(QUESTIONBANK_NORM):
            normalize_questionbank(QUESTIONBANK_DATA, QUESTIONBANK_PENN_DATA)
            gen_norm(QUESTIONBANK_NORM, [QUESTIONBANK_PENN_DATA])
        
        if not exists(PENNTREEBANK_NORM):
            gen_norm(PENNTREEBANK_NORM, glob(PENNTREEBANK_GLOB))
        
        # Keep a part of the treebanks for testing
        i = 0
        with open(MODEL_TREEBANK, 'w') as model, open(TEST_DAT, 'w') as dat, open(TEST_KEY, 'w') as key:
            for treebank in [QUESTIONBANK_NORM, PENNTREEBANK_NORM]:
                for tree in open(treebank):
                    i += 1
                    if (i % 100) == 0:
                        sentence, n = get_sentence(loads(tree))
                        if n > 7 and n < 20:
                            dat.write(sentence+'\n')
                            key.write(tree)
                        else:
                            i -= 1
                    
                    model.write(tree)
        
        # Learn PCFG
        pcfg.learn_from_treebanks([MODEL_TREEBANK])
        pcfg.save_model(MODEL)
        print "Time: (%.2f)s\n" % (time() - start)
    
    return pcfg
Example #9
    def __init__(self, corpus_train):

        self.PCFG = PCFG(corpus_train)
        self.OOV = OOV(self.PCFG.lexicon, self.PCFG.list_all_symbols,
                       self.PCFG.freq_tokens)

        #note : if the id of a symbol is above self.PCFG.nb_tags,
        #it's an artificial symbol introduced with Chomsky normalization
        self.symbol_to_id = {
            symbol: i
            for (i, symbol) in enumerate(self.PCFG.list_all_symbols)
        }

        #instead of storing tags, we store grammar rules with their corresponding indices in grammar_ids:
        #rules are stored with an additional hierarchical level to speed up lookups
        #in other words, self.grammar_ids[X][Y][Z] stores P(rule X->YZ)
        #where self.grammar_ids, self.grammar_ids[X], and self.grammar_ids[X][Y] are all dictionaries
        self.grammar_ids = {}
        for (root_tag, rules) in self.PCFG.grammar.items():
            # root_tag is the left hand symbol of the grammar rule
            idx_root_tag = self.symbol_to_id[root_tag]
            self.grammar_ids[idx_root_tag] = {}
            dico = {}
            for (split, proba) in rules.items():  #split is the right hand term, and proba the probability of the rule
                idx_left_tag = self.symbol_to_id[split[0]]
                idx_right_tag = self.symbol_to_id[split[1]]
                if idx_left_tag in dico.keys():
                    dico[idx_left_tag][idx_right_tag] = proba
                else:
                    dico[idx_left_tag] = {idx_right_tag: proba}
            self.grammar_ids[idx_root_tag] = dico

        #for a given word, which tags can emit it, and with what probabilities P(tag -> word)?
        #this is what self.lexicon_inverted stores
        self.lexicon_inverted = {word: {} for word in self.OOV.words_lexicon}
        for tag in self.PCFG.lexicon:
            for word in self.PCFG.lexicon[tag]:
                self.lexicon_inverted[word][tag] = self.PCFG.lexicon[tag][word]
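lexicon_inverted turns the terminal step of CKY into one dictionary lookup per token. A small illustrative helper (not part of the project, and ignoring the OOV fallback):

def leaf_cell(word, lexicon_inverted, symbol_to_id):
    # Initialise a CKY leaf cell with P(tag -> word) for every tag that can
    # emit this word; an unknown word simply yields an empty cell here.
    return {
        symbol_to_id[tag]: proba
        for tag, proba in lexicon_inverted.get(word, {}).items()
    }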
Example #10
def run(args):

    has_effect = False

    if args:
        try:

            train_corpus, val_corpus, test_corpus = data.get_train_val_test()
            words, embeddings = data.get_polyglot_words_embeddings()

            parser = PCFG()
            parser.learn_probabilities_and_rules(train_corpus)
            parser.set_oov_module(OovModule, words, embeddings)

            if args.inference:

                get_gold(parser, test_corpus, filename='evaluation_data.gold')
                get_predictions(parser,
                                test_corpus,
                                filename='evaluation_data.parser_output')

            if args.evaluation:
                evaluation('evaluation_data.gold',
                           'evaluation_data.parser_output')

            if args.parse:
                parser.parse_from_txt(args.txt_path)

        except Exception as e:
            logger.exception(e)
            logger.error("Uhoh, the script halted with an error.")
    else:
        if not has_effect:
            logger.error(
                "Script halted without any effect. To run code, use command:\npython3 main.py <args>"
            )
Example #11
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# -------------------------------------------#
# main.py    	                             #
# author: sean lee                           #
# qq: 929325776							     #
# email: [email protected]                    #
#--------------------------------------------#

from pcfg import PCFG 

parser = PCFG()
parser.fit('./corpus/toy/train.txt')
parser.parse("the man saw the dog")
'''
print(parser.N_dict)
print(parser.NR_dict)
print(parser.TR_dict)
'''
Example #12
        for index, token in enumerate(tokens):
            resulting_chart[index, index] = token

        for row in range(1, L):
            for col in range(0, row):
                resulting_chart[row][col] = ' '

        for row in range(0, L):
            for col in range(row+2, L+1):
                if not resulting_chart[row, col]:
                    resulting_chart[row, col] = ' '
        f.write(str(resulting_chart))
        f.write('\n')
    f.close()
    return (chart, best_probability[0])

if __name__ == "__main__":
    try:
        os.remove('workfile.txt')
    except OSError:
        pass
    sentences_to_parse = 10
    pcfg = PCFG.load(PCFG_SOURCE)
    with open(TOKEN_SOURCE, "r") as source:
        for index, line in enumerate(source):
            if index == sentences_to_parse:
                break
            tokens = line.split()
            (_, best_probability) = CKY_chart(tokens, pcfg)
            print("{:.4f}: {}".format(best_probability.bw, " ".join(tokens)))
Example #13

class Timeout(object):
    def __init__(self, seconds=1, error_message="Timeout"):
        self.seconds = seconds
        self.error_message = error_message

    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)

    def __enter__(self):
        signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)

    def __exit__(self, type, value, traceback):
        signal.alarm(0)


if __name__ == "__main__":
    pcfg = PCFG.load(PCFG_SOURCE)
    print "PCFG loaded."
    with open(TREE_SOURCE, "r") as source:
        for tree in Tree.from_stream(source):
            tokens = [leaf.decode("ASCII") for leaf in tree.leaves()]
            try:
                with Timeout(TIMEOUT):
                    (_, bw_prob) = CYK_chart(pcfg, tokens)
                    print bw_prob
            except TimeoutError:
                pass
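Because Timeout is built on SIGALRM, it only works on Unix, in the main thread, and with whole-second resolution. A quick hypothetical check of the behaviour (written for Python 3, where TimeoutError is a builtin):

import time

try:
    with Timeout(seconds=1):
        time.sleep(2)
except TimeoutError:
    print("timed out as expected")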
Example #14
def train(train_data_filename, train_rare_filename, pcfg_model_filename,
          rare_words_rule):
    print 'train PCFG model'
    pcfg = PCFG()
    for l in open(train_data_filename):
        t = json.loads(l)
        pcfg.count(t)
    pcfg.count_word()

    print 'process rare word'
    process_rare_words(open(train_data_filename), open(train_rare_filename,
                                                       'w'), pcfg.rare_words,
                       rare_words_rule)

    print 'train PCFG model again'
    new_pcfg = PCFG()
    for l in open(train_rare_filename):
        t = json.loads(l)
        new_pcfg.count(t)
    new_pcfg.cal_rule_params()

    new_pcfg.write(open(pcfg_model_filename, 'w'))
    return new_pcfg
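Neither process_rare_words nor the rare_words_rule callback appears in this listing. In the course-assignment setups these examples come from, the rule typically just collapses infrequent tokens onto a single pseudo-word, roughly:

def rare_words_rule(word):
    # Map every rare token to one pseudo-terminal so the learned PCFG keeps
    # emission mass available for words unseen in training.
    return '_RARE_'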
Example #15
File: p1.py Project: zsxh/Coursera_NLP_MC
def main():
    train_data_filename = 'parse_train.dat'
    train_rare_filename = 'p1.train.rare.dat'
    pcfg_model_filename = 'parser_train.counts.out'

    pcfg = PCFG()
    for l in open(train_data_filename):
        t = json.loads(l)
        pcfg.count(t)
    pcfg.count_word()

    process_rare_words(open(train_data_filename), open(train_rare_filename,
                                                       'w'), pcfg.rare_words,
                       rare_words_rule_p1)

    new_pcfg = PCFG()
    for l in open(train_rare_filename):
        t = json.loads(l)
        new_pcfg.count(t)
    new_pcfg.cal_rule_params()

    new_pcfg.write(open(pcfg_model_filename, 'w'))
Example #16
import math
import pprint
import torch
import torch.nn as nn
import torch.nn.functional as F
from pcfg import PCFG


def p(s):
    print(s)
    return s


gen_a = lambda terminals_per_category: PCFG.fromstring(
    " A    -> " + " | ".join([
        "\"a" + str(i) + "\" [" + str(1 / terminals_per_category) + "]"
        for i in range(terminals_per_category)
    ]))

generate_test_grammar = lambda p1, terminals_per_category: PCFG.fromstring(
    " S  -> NP-s VP-s [" + str((1 - p1) / 2) + "] | NP-s S VP-s [" + str(
        p1 / 2) + "] | NP-p VP-p [" + str(
            (1 - p1) / 2) + "] | NP-p S VP-p [" + str(p1 / 2) + "]\n" +
    " NP-s -> N-s     [" + str(1) + "]\n" + " NP-p -> N-p     [" + str(1) +
    "]\n" + " VP-s -> V-s     [" + str(1) + "]\n" + " VP-p -> V-p     [" + str(
        1) + "]\n" + " N-s  -> " + " | ".join([
            "\"n" + str(i) + "-s\" [" + str(1 / terminals_per_category) + "]"
            for i in range(terminals_per_category)
        ]) + "\n" + " N-p  -> " + " | ".join([
            "\"n" + str(i) + "-p\" [" + str(1 / terminals_per_category) + "]"
            for i in range(terminals_per_category)
Example #17
        sys.exit(1)

    # load the train file to trees
    trees = []
    f = open(trainfilename, 'r')
    for line in f:
        trees.append(nltk.Tree.fromstring(line))

    # preprocess the tree forms: ignore functional labels and binarize to CNF
    for tree in trees:
        # ignore_func_labels(tree)
        tree.chomsky_normal_form(horzMarkov=2)
        # tree.chomsky_normal_form()

    # learn PCFG
    lexicon, grammar, vocabulary, symbols = PCFG(trees)
    # print(grammar)

    # for OOV
    oovwords = OOV(embedfilename, vocabulary)

    # parse new sentences using CYK based on learned PCFG
    # parser = CYKSolver(lexicon, grammar, vocabulary, symbols, oovwords)

    # i = 0
    for line in sys.stdin:
        # print('start parse')
        # print(line)
        # start = time.time()
        # if line == '\n': continue
        # cyksolver = CYK(line.split(), lexicon, grammar, vocabulary, symbols, embedfilename)
Example #18
from typing import TextIO

from pcfg import PCFG
import os

BASEPATH: str = os.path.dirname(__file__)

f: TextIO
with open(os.path.join(BASEPATH, "subject_adjectives.txt")) as f:
    subject_adjectives: PCFG = PCFG.fromstring(f.read())

n: int = int(input("How many sentences do you want generated? "))
sentence: str
for sentence in subject_adjectives.generate(n):
    print()
    print(sentence.capitalize())
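For readers without subject_adjectives.txt at hand, the same fromstring/generate API also works with an inline grammar; the toy grammar below is made up purely for illustration:

from pcfg import PCFG

toy_grammar = PCFG.fromstring("""
S -> NP VP [1.0]
NP -> "the" N [1.0]
N -> "dog" [0.5] | "cat" [0.5]
VP -> "sleeps" [0.7] | "barks" [0.3]
""")

for sentence in toy_grammar.generate(3):
    print(sentence)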
Example #19
def main():
    train_data_filename = 'parse_train.dat'
    train_rare_filename = 'p1.train.rare.dat'
    pcfg_model_filename = 'parser_train.counts.out'

    pcfg = PCFG()
    for l in open(train_data_filename):
        t = json.loads(l)
        pcfg.count(t)
    pcfg.count_word()

    process_rare_words(open(train_data_filename),
        open(train_rare_filename, 'w'),
        pcfg.rare_words,
        rare_words_rule_p1)

    new_pcfg = PCFG()
    for l in open(train_rare_filename):
        t = json.loads(l)
        new_pcfg.count(t)
    new_pcfg.cal_rule_params()

    new_pcfg.write(open(pcfg_model_filename, 'w'))
Example #20
    def __init__(self, grammar_path, expand_binaries=False):
        self.grammar = PCFG.from_file(grammar_path, expand_binaries)
Example #21
def multi_f(sentence):
    grammar_file = sys.argv[1]
    pcfg = PCFG()
    pcfg.load_model(grammar_file)
    parser = Parser(pcfg)
    return parser.parse(sentence)
Example #22
from pcfg import PCFG
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--corpus", help="training treebank corpus", type=str)
parser.add_argument("--sentences", help="raw token sentences", type=str)
parser.add_argument("--outfile", help="name of the output file", type=str)
args = parser.parse_args()

grammar = PCFG(args.corpus)
grammar.parse_corpus()
grammar.predict(args.sentences, args.outfile)
Example #23
def train(train_data_filename, train_rare_filename, pcfg_model_filename, rare_words_rule):
    print 'train PCFG model'
    pcfg = PCFG()
    for l in open(train_data_filename):
        t = json.loads(l)
        pcfg.count(t)
    pcfg.count_word()

    print 'process rare word'
    process_rare_words(open(train_data_filename),
        open(train_rare_filename, 'w'),
        pcfg.rare_words,
        rare_words_rule)

    print 'train PCFG model again'
    new_pcfg = PCFG()
    for l in open(train_rare_filename):
        t = json.loads(l)
        new_pcfg.count(t)
    new_pcfg.cal_rule_params()

    new_pcfg.write(open(pcfg_model_filename, 'w'))
    return new_pcfg
Example #24
def multi_f(sentence):
    grammar_file = sys.argv[1]
    pcfg = PCFG()
    pcfg.load_model(grammar_file)
    parser = Parser(pcfg)
    return parser.parse(sentence)

if __name__ == "__main__":

    if len(sys.argv) != 2:
        print("usage: python3 parser.py GRAMMAR")
        exit()

    start = time()
    grammar_file = sys.argv[1]
    print("Loading grammar from " + grammar_file + " ...", file=stderr)    
    pcfg = PCFG()
    pcfg.load_model(grammar_file)
    parser = Parser(pcfg)

    print("Parsing sentences ...", file=stderr)

    with Pool(processes = os.cpu_count()) as pool:
        trees = pool.map(multi_f, stdin.readlines())

    for t in trees:
        print(dumps(t))

    print("Time: (%.2f)s\n" % (time() - start), file=stderr)
Example #25
        )
        (VP
            (VB gave)
            (NP
                (DT the)
                (NN lecture)
            )
        )
    )"""
    
    # uncomment to use the above simple trees for debugging:
    #    trees = [TRANSFORM(Tree.from_string(t)) for t in (t0, t1)]
    #    grammar = PCFG.from_trees(trees)

    # let's get some input to build a grammar:
    grammar = PCFG.from_trees(list(TRANSFORM(t) for t in Tree.from_stream(GzipFile('bigger_treebank_2.txt.gz'))))
    print "Read {} rules in grammar.".format(len(grammar))
    trees = list(TRANSFORM(t) for t in Tree.from_stream(open('end_of_wsj.txt')))
    print "Read {} trees.".format(len(trees))
    
    # now try and parse our trees:
    results = []
    
    for idx, tree in enumerate(trees):
        tokens = [(t,) for t in tree.terminals()]        
        # print 'Sentence {}\tTokens: "{}"'.format(idx, ' '.join(tree.terminals()))
        chart = Chart(grammar, tokens)
        chart.pretty_print()
        has_parse = chart.extract_parse()
        if not has_parse:
            print 'Sentence {}\tTokens: "{}" has no parse!'.format(idx, ' '.join(tree.terminals()))            
Example #26
if args.fnc == "generate":
    print "### GENERATING WORDS ###"
    print "model:", args.model, args.n 
    print "language:", args.lang
    if args.model == "nphone":
        lm = NgramModel(args.n, corpus, 1)
    elif args.model == "nsyll":
        if args.lex.startswith("celexes/syll"):
            lm = NsyllModel(args.n, corpus, 1)
        else:
            print "Use syll__ file for this model"
            sys.exit()
    elif args.model == "pcfg":
        if args.lex.startswith("celexes/pcfg"):
            print call(["./pcfg/io","-d","1","-g", args.grammar, args.lex],stdout = open('grammars/gram_pcfg.wlt', 'w'))
            lm = PCFG('grammars/gram_pcfg.wlt')
            corpus = [re.sub(" ","",x) for x in corpus]
        else:
            print "Use pcfg__ file for this model"
            sys.exit()
    lm.create_model(corpus, args.smoothing) 
    o = "Lexicons/lex_" + args.lex.split("/")[-1][:-4] + "_cv" +  str(args.cv) + "_iter" + str(args.iter) + "_m" + args.model + "_n" + str(args.n) + "_smoothing" + str(args.smoothing) + ".txt"
    lexfile = write_lex_file(o, corpus, args.cv, args.iter, lm, args.homo)
    print "null lexicons wrote on", lexfile
    print "### WRITING RESULTS ###"
    write_all(lexfile, args.graph, args.lang)


else: 
    o = "evaluation/eval_" + args.lex.split("/")[-1][:-4] + "_cv" +  str(args.cv) + "_iter" + str(args.iter) + "_m" + args.model  + "_n" + str(args.n) + "_smoothing" +  str(args.smoothing)+ ".txt"
    out = open(o, 'w')
Example #27
# stdlib
import argparse
# project
from pcfg import PCFG

if __name__ == "__main__":
    print("Welcome to my parser!")
    print("Please wait while loading the model ...")
    pcfg = PCFG()
    pcfg.from_path('sequoia-corpus+fct.mrg_strict.txt')
    pcfg.fit()
    print("Model loaded!")
    while True:
        print("Please enter phrase to parse!")
        phrase = str(input('>>> '))
        tokenized = phrase.split()
        parsed = pcfg.pcky(tokenized)
        if not parsed:
            print("Sorry, we couldn't parse your line :(")
        else:
            print(parsed)
        print(">>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<\n")