Example #1
def convert_hybrid(grammar):
    '''
    Convert rules in the form of [A -> 'b' C] where the rhs has both non-terminals and terminals
    into rules in the form of [A -> B C] & [B -> 'b'] with a dummy non-terminal B
    '''
    rules = grammar.productions()
    new_rules = []
    for rule in rules:
        lhs = rule.lhs()
        rhs = rule.rhs()
        # check for hybrid rules
        if rule.is_lexical() and len(rhs) > 1:
            new_rhs = []
            for item in rule.rhs():
                if is_terminal(item):
                    new_sym = Nonterminal(item)
                    new_rhs.append(new_sym)
                    # add new lexical rule with dummy lhs nonterminal
                    new_rules.append(Production(new_sym, (item, )))
                else:
                    new_rhs.append(item)
            # add converted mixed rule with only non-terminals on rhs
            new_rules.append(Production(lhs, tuple(new_rhs)))
        else:
            new_rules.append(rule)

    new_grammar = CFG(grammar.start(), new_rules)

    return new_grammar
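
A minimal usage sketch (added for illustration; it assumes convert_hybrid above lives in a module that already imports CFG, Nonterminal, Production and is_terminal from nltk.grammar):

from nltk import CFG

hybrid_grammar = CFG.fromstring("""
    S -> 'the' N
    N -> 'dog'
""")
for p in convert_hybrid(hybrid_grammar).productions():
    print(p)
# The hybrid rule S -> 'the' N becomes S -> the N plus the dummy lexical rule
# the -> 'the'; N -> 'dog' is already purely lexical and is left unchanged.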
Example #2
def exploreCFG(cfg, length_limit):
    """
    Generate strings with the CFG,
    without ever allowing an intermediate expression to exceed the length_limit.
    
    Note that not all strings with length <= length_limit that can be generated will be generated.
    For example, with S -> 1S | e and length_limit = 2, we won't be able to generate "11"
    because in the sequence S -> 1S -> 11S -> 11, 11S is too long and we would stop exploring there.
    """
    cfg = CFG(cfg.start(), [splitProdRhs(prod) for prod in cfg.productions()])

    finished = set()  # Expressions with no nonterminals left
    visited = set()  # Expressions with nonterminals that have already been explored
    to_explore = [(cfg.start(), )]

    while to_explore:
        expr = to_explore.pop()
        if expr in visited or len(expr) > length_limit:
            continue

        for i in range(len(expr)):
            if isinstance(expr[i], Nonterminal):
                break
        else:
            finished.add(expr)
            continue
        visited.add(expr)

        for prod in cfg.productions(lhs=expr[i]):
            to_explore.append(expr[:i] + prod.rhs() + expr[i + 1:])

    return finished
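
A small usage sketch (added here, not from the original project). exploreCFG depends on a helper splitProdRhs that is not shown above; for a grammar whose terminals are already single tokens, the sketch stubs it as the identity function, which is only an assumption about what the helper does:

from nltk import CFG

def splitProdRhs(prod):
    # stand-in for the missing helper; assumes no further splitting is needed
    return prod

g = CFG.fromstring("""
    S -> 'a' S
    S -> 'a'
""")
print(sorted(exploreCFG(g, 3)))
# [('a',), ('a', 'a'), ('a', 'a', 'a')]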
Example #3
def remove_unary_rules(grammar):
    """Remove unary nonterminal productions A -> B"""
    result = []
    unary = []
    fake_rules = []
    removed_rules = []
    for rule in grammar.productions():
        if len(rule) == 1 and rule.is_nonlexical():
            unary.append(rule)
        else:
            result.append(rule)

    while unary:
        rule = unary.pop(0)
        removed_rules.append(rule)
        for item in grammar.productions(lhs=rule.rhs()[0]):
            new_rule = Production(rule.lhs(), item.rhs())
            if len(new_rule) != 1 or new_rule.is_lexical():
                result.append(new_rule)
                fake_rules.append(new_rule)
            else:
                unary.append(new_rule)

    n_grammar = CFG(grammar.start(), result)
    return n_grammar, grammar
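
A minimal usage sketch (an addition; it assumes remove_unary_rules above and the usual nltk imports, CFG and Production):

from nltk import CFG

g = CFG.fromstring("""
    S -> NP
    NP -> 'dogs'
""")
new_grammar, original = remove_unary_rules(g)
for p in new_grammar.productions():
    print(p)
# NP -> 'dogs'
# S -> 'dogs'   (the unary rule S -> NP has been folded into a lexical rule)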
Example #4
File: tiger.py Project: ooz/Confopy
    def cfg(self, include_edgelabels=True):
        sents = self.parsed_sents(include_edgelabels)
        tiger_prods = set(prod for sent in sents
                          for prod in sent.productions())
        cfg = CFG(Nonterminal(TigerCorpusReader.GRAMMAR_START),
                  list(tiger_prods))
        return cfg
Example #5
def generate_grammar_and_parsers(parsed_sents):
    # From the parsed sentences, extract each parse tree's CFG productions;
    # then collect them into a set to remove repetitions
    tbank_productions_with_repet = [
        production for parsed_sent in parsed_sents
        for production in parsed_sent.productions()
    ]
    tbank_productions = set(
        tbank_productions_with_repet)  # exclude repetitions
    print("Num. of unique productions read:", len(tbank_productions))

    # Build a CFG from the productions
    print("\nBuinding a CFG...")
    cfg_grammar = CFG(Nonterminal('S'), tbank_productions)  # a CFG
    print(cfg_grammar, end="\n\n")

    # CFG - An Earley parser
    cfg_earley_parser = EarleyChartParser(cfg_grammar, trace=3)
    # Build a PCFG from the productions

    print("Building a PCFG...")
    pcfg_grammar = induce_pcfg(
        Nonterminal('S'),
        tbank_productions_with_repet)  # a PCFG, here repetitions are needed!
    print(pcfg_grammar, end="\n\n")

    # Allocate a bottom-up chart parser for PCFG; see: http://www.nltk.org/_modules/nltk/parse/pchart.html
    pcfg_pchart_parser = InsideChartParser(pcfg_grammar)

    return cfg_earley_parser, pcfg_pchart_parser  # return both parsers
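
A short usage sketch (added for context; it assumes the NLTK treebank sample has been installed via nltk.download('treebank') and that the imports used by the function are in scope). Only a small slice of the corpus is passed in, since the grammar induced from the full sample is large:

from nltk.corpus import treebank

earley_parser, pchart_parser = generate_grammar_and_parsers(treebank.parsed_sents()[:30])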
Example #6
def demo2():
    from nltk import Nonterminal, Production, CFG
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s)
                                           for s in nonterminals.split()]
    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),

        Production(PP, ['up', 'over', NP]),

        # Lexical Productions
        Production(NP, ['I']),   Production(Det, ['the']),
        Production(Det, ['a']),  Production(N, ['man']),
        Production(V, ['saw']),  Production(P, ['in']),
        Production(P, ['with']), Production(N, ['park']),
        Production(N, ['dog']),  Production(N, ['statue']),
        Production(Det, ['my']),
        )
    grammar = CFG(S, productions)

    text = 'I saw a man in the park'.split()
    d = CFGDemo(grammar, text)
    d.mainloop()
Example #7
File: cfg.py Project: Geolem/nltk
def demo2():
    from nltk import Nonterminal, Production, CFG

    nonterminals = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [Nonterminal(s) for s in nonterminals.split()]
    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        Production(PP, []),
        Production(PP, ["up", "over", NP]),
        # Lexical Productions
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )
    grammar = CFG(S, productions)

    text = "I saw a man in the park".split()
    d = CFGDemo(grammar, text)
    d.mainloop()
Example #8
def create_taskgrammar(grammar, task, encoders):
    logger.info('Creating specific grammar for task %s' % task)
    productions = grammar.productions(Nonterminal(task))
    start_token = Nonterminal('S')
    new_productions = []

    for start_production in productions:
        first_token = start_production.rhs()[0]
        if is_nonterminal(first_token) and first_token.symbol().endswith('_TASK'):
            for new_start_production in grammar.productions(first_token):
                new_productions.append(Production(start_token, new_start_production.rhs()))
        else:
            new_productions.append(Production(start_token, start_production.rhs()))

    for production in grammar.productions():
        for new_production in new_productions:
            if production.lhs() in new_production.rhs() and production not in new_productions:
                if production.lhs().symbol() == 'ENCODERS':  # Use encoders only for types of features in the dataset
                    if len(encoders) > 0:
                        new_productions.append(Production(production.lhs(), [Nonterminal(e) for e in encoders]))
                    else:
                        new_productions.append(Production(production.lhs(), ['E']))
                else:
                    new_productions.append(production)

    task_grammar = CFG(start_token, new_productions)

    with open(TASK_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(x) for x in task_grammar.productions()]))

    return task_grammar
Example #9
def binarize(grammar):
    """Binarize grammar by introducing new nonterminals"""
    result = []

    for rule in grammar.productions():
        if len(rule.rhs()) > 2:
            # this rule needs to be broken down
            left_side = rule.lhs()
            symbol_names = [
                tsym.symbol() if not isinstance(tsym, str) else '@' + tsym
                for tsym in rule.rhs()
            ]
            for k in range(1, len(rule.rhs()) - 1):
                new_rhs_name = rule.lhs().symbol() + '|<' + '-'.join(
                    symbol_names[k:]) + '>'
                new_sym = Nonterminal(new_rhs_name)
                new_production = Production(left_side,
                                            (rule.rhs()[k - 1], new_sym))
                left_side = new_sym
                result.append(new_production)
            last_prd = Production(left_side, rule.rhs()[-2:])
            result.append(last_prd)
        else:
            result.append(rule)

    n_grammar = CFG(grammar.start(), result)
    return n_grammar
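
A small added example of the binarisation (it assumes binarize above plus CFG, Nonterminal and Production imported from nltk):

from nltk import CFG

g = CFG.fromstring("""
    S -> A B C
    A -> 'a'
    B -> 'b'
    C -> 'c'
""")
for p in binarize(g).productions():
    print(p)
# S -> A B C is split into S -> A S|<B-C> and S|<B-C> -> B C;
# the lexical rules are passed through unchanged.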
Example #10
def convert_unit(grammar):
    '''
    Convert unitary rules in the form of [A -> B] where the rhs has one non-terminal
    by eliminating intermediate unitary rules and promoting the final lexical rule, e.g. [B -> 'b'] => [A -> 'b'],
    or by stopping at an intermediate rule with only non-terminals on the rhs, e.g. [B -> C D] => [A -> C D]
    '''

    rules = grammar.productions()
    new_rules = []
    unit_rules = []
    for rule in rules:
        # check for unit rules
        if rule.is_nonlexical() and len(rule) == 1:
            unit_rules.append(rule)
        else:
            new_rules.append(rule)

    # follow each unit rule until it reaches a lexical rule or a non-unit expansion
    while unit_rules:
        rule = unit_rules.pop(0)
        lhs = rule.lhs()
        rhs = rule.rhs()
        # find rules that can derive the rhs to something else
        for cascade_rule in grammar.productions(lhs=rhs[0]):
            temp_rule = Production(lhs, cascade_rule.rhs())
            if cascade_rule.is_lexical() or len(cascade_rule) > 1:
                new_rules.append(temp_rule)
            else:
                unit_rules.append(temp_rule)

    new_grammar = CFG(grammar.start(), new_rules)

    return new_grammar
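
A minimal usage sketch (an addition; it assumes convert_unit above plus CFG and Production imported from nltk):

from nltk import CFG

g = CFG.fromstring("""
    S -> NP
    NP -> Det N
    NP -> 'dogs'
    Det -> 'the'
    N -> 'dog'
""")
for p in convert_unit(g).productions():
    print(p)
# The unit rule S -> NP is replaced by S -> Det N and S -> 'dogs';
# every other rule is kept as it is.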
Example #11
    def buildFromTreebank(self):
        """ Build a Context-Free-Grammar based on UPenn treebank """
        tbank_productions = set()
        for sent in treebank.parsed_sents():
            for production in sent.productions():
                if production.is_lexical():
                    new_rhs = [str(production._lhs)]
                    production = Production(production._lhs, new_rhs)
                tbank_productions.add(production)

        tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))

        return tbank_grammar
Example #12
    def __init__(self, parent, cfg=None, set_cfg_callback=None):
        self._parent = parent
        if cfg is not None: self._cfg = cfg
        else: self._cfg = CFG(Nonterminal('S'), [])
        self._set_cfg_callback = set_cfg_callback

        self._highlight_matching_nonterminals = 1

        # Create the top-level window.
        self._top = Toplevel(parent)
        self._init_bindings()

        self._init_startframe()
        self._startframe.pack(side='top', fill='x', expand=0)
        self._init_prodframe()
        self._prodframe.pack(side='top', fill='both', expand=1)
        self._init_buttons()
        self._buttonframe.pack(side='bottom', fill='x', expand=0)

        self._textwidget.focus()
Example #13
def remove_mixing(grammar):
  result = []
  for rule in grammar.productions():
    if len(rule.rhs()) == 2 and (isinstance(rule.rhs()[0], str) or isinstance(rule.rhs()[1], str)):
      new_rhs = []
      for k in range(2):
        if isinstance(rule.rhs()[k], str):
          new_sym = Nonterminal('$'+rule.rhs()[k])
          new_production = Production(new_sym, (rule.rhs()[k],))
          result.append(new_production)
          new_rhs.append(new_sym)
        else:
          new_rhs.append(rule.rhs()[k])
      new_production = Production(rule.lhs(), new_rhs)
      result.append(new_production)
    else:
      result.append(rule)

  n_grammar = CFG(grammar.start(), result)
  return n_grammar
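
A minimal usage sketch (added here; it assumes remove_mixing above plus CFG, Nonterminal and Production imported from nltk):

from nltk import CFG

g = CFG.fromstring("""
    S -> 'the' N
    N -> 'dog'
""")
for p in remove_mixing(g).productions():
    print(p)
# The mixed rule S -> 'the' N becomes S -> $the N plus the new lexical rule
# $the -> 'the'; N -> 'dog' is untouched.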
Example #14
def app():
    """
    Create a shift reduce parser app, using a simple grammar and
    text.
    """

    from nltk.grammar import Nonterminal, Production, CFG

    nonterminals = "S VP NP PP P N Name V Det"
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [Nonterminal(s) for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),
        # Lexical Productions
        Production(NP, ["I"]),
        Production(Det, ["the"]),
        Production(Det, ["a"]),
        Production(N, ["man"]),
        Production(V, ["saw"]),
        Production(P, ["in"]),
        Production(P, ["with"]),
        Production(N, ["park"]),
        Production(N, ["dog"]),
        Production(N, ["statue"]),
        Production(Det, ["my"]),
    )

    grammar = CFG(S, productions)

    # tokenize the sentence
    sent = "my dog saw a man in the park with a statue".split()

    ShiftReduceApp(grammar, sent).mainloop()
Example #15
def app():
    """
    Create a shift reduce parser app, using a simple grammar and
    text.
    """

    from nltk.grammar import Nonterminal, Production, CFG
    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V,
     Det) = [Nonterminal(s) for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        Production(S, [NP, VP]),
        Production(NP, [Det, N]),
        Production(NP, [NP, PP]),
        Production(VP, [VP, PP]),
        Production(VP, [V, NP, PP]),
        Production(VP, [V, NP]),
        Production(PP, [P, NP]),

        # Lexical Productions
        Production(NP, ['I']),
        Production(Det, ['the']),
        Production(Det, ['a']),
        Production(N, ['man']),
        Production(V, ['saw']),
        Production(P, ['in']),
        Production(P, ['with']),
        Production(N, ['park']),
        Production(N, ['dog']),
        Production(N, ['statue']),
        Production(Det, ['my']),
    )

    grammar = CFG(S, productions)

    # tokenize the sentence
    sent = 'my dog saw a man in the park with a statue'.split()

    ShiftReduceApp(grammar, sent).mainloop()
Example #16
def convert_long(grammar):
    '''
    Convert non-binary rules in the form of [A -> B C D], where the rhs has more than 2 non-terminals
    into binarised rules in the form of [A -> B_C D] & [B_C -> B C] with a dummy non-terminal B_C
    '''

    rules = grammar.productions()
    new_rules = []
    long_rules = []
    for rule in rules:
        if len(rule.rhs()) > 2:
            long_rules.append(rule)
        else:
            new_rules.append(rule)

    while long_rules:
        rule = long_rules.pop(0)
        lhs = rule.lhs()
        rhs = rule.rhs()
        new_rhs = []
        for i in range(0, len(rhs) - 1, 2):
            new_sym = Nonterminal(f"{rhs[i].symbol()}_{rhs[i + 1].symbol()}")
            new_rules.append(Production(new_sym, (rhs[i], rhs[i + 1])))
            new_rhs.append(new_sym)
        # case: odd number of non-terminals on rhs
        if len(rhs) % 2 == 1:
            new_rhs.append(rhs[-1])

        new_rule = Production(lhs, tuple(new_rhs))
        # continue binarisation if rhs still has more than 2 non-terminals
        if len(new_rhs) > 2:
            long_rules.append(new_rule)
        else:
            new_rules.append(new_rule)

    new_grammar = CFG(grammar.start(), new_rules)

    return new_grammar
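
A minimal usage sketch (an addition; it assumes convert_long above plus CFG, Nonterminal and Production imported from nltk):

from nltk import CFG

g = CFG.fromstring("""
    S -> A B C
    A -> 'a'
    B -> 'b'
    C -> 'c'
""")
for p in convert_long(g).productions():
    print(p)
# S -> A B C is rewritten as S -> A_B C plus the new rule A_B -> A B;
# the lexical rules are unchanged.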
Example #17
def create_completegrammar(primitives):
    base_grammar = load_grammar(BASE_GRAMMAR_PATH)
    new_productions = []

    for production in base_grammar.productions():
        primitive_type = production.lhs().symbol()
        if primitive_type in primitives:
            new_rhs_list = []
            for token in production.rhs():
                if isinstance(token, str) and token.startswith('primitive_'):
                    new_rhs_list.append(primitives[primitive_type])
                else:
                    new_rhs_list.append([token])
            for new_rhs in itertools.product(*new_rhs_list):
                new_productions.append(Production(production.lhs(), new_rhs))
        else:
            new_productions.append(production)

    complete_grammar = CFG(Nonterminal('S'), new_productions)

    with open(COMPLETE_GRAMMAR_PATH, 'w') as fout:
        fout.write('\n'.join([str(x) for x in complete_grammar.productions()]))

    return complete_grammar
Example #18
def extract_simple_cfg(n):
    rules = extract_simple_productions(n)
    rules = list(set(rules))
    return CFG(Nonterminal("S"), sort_rules(rules))
Example #19
    def guess(self, verbose=None):
        """
        Makes a guess based on the next observation.
        Updates self._curr_guess.

        :rtype: CFG
        :returns: The next guess
        """
        if verbose is not None:
            self._verbose = verbose

        sentence = Sentence(next(self._text))
        self._num_steps += 1
        self._log("String {}: {}".format(self._num_steps, sentence))

        if sentence in self._data:
            self._log("String already seen")
            return self._curr_guess

        # Info from previous guess
        num_contexts = len(self._contexts)
        num_subs = len(self._substrings)
        if self._curr_guess is not None:
            num_nts = len(set(p.lhs()
                              for p in self._curr_guess.productions())) - 1
        else:
            num_nts = 0

        total_timer = Timer()
        total_timer.start()

        # Update data and terminals
        words = sentence.get_words()
        self._data.add(sentence)
        self._terminals.update(set(words))

        # Update contexts
        self._log("Updating contexts...")
        inds = range(0, len(words) + 1)
        contexts = [
            Context(words[:i], words[j:]) for i in inds for j in inds[i:]
        ]
        self._contexts.update(ContextSet(contexts))
        self._log(
            "{} new contexts added".format(len(self._contexts) - num_contexts))

        # Update substrings
        self._log("Updating substrings...")

        is_new_sentence = True
        if self._curr_guess_parser is not None:
            try:
                parses = self._curr_guess_parser.parse(words)
                is_new_sentence = len(list(parses)) == 0
            except:
                is_new_sentence = True

        if is_new_sentence:
            subs = [Sentence(words[i:j]) for i in inds for j in inds[i:]]
            self._substrings.update(SentenceSet(subs))
            self._log("{} new substrings added".format(
                len(self._substrings) - num_subs))
        else:
            self._log("Sentence already generated by current guess")

        # Construct the nonterminals
        self._log("Constructing nonterminals...")

        kernels = set()
        for i in range(1, self._k + 1):
            subsets = [
                SentenceSet(j) for j in combinations(self._substrings, i)
            ]
            kernels.update(subsets)

        for kernel in kernels:
            if kernel not in self._nonterminals:
                nt_name = self._new_name()
                contexts = self._oracle.restr_right_triangle(
                    kernel, self._contexts)
                nt = Nonterminal(nt_name)
                self._nonterminals[kernel] = nt
                self._nt_contexts[nt] = contexts

        # Get a set of nonterminals with unique contexts
        self._log("Removing equivalent nonterminals...")
        context_nts = {con: nt for nt, con in self._nt_contexts.items()}
        self._log(
            "{} nonterminals removed".format(len(kernels) - len(context_nts)))
        self._log("{} new nonterminals constructed".format(
            len(context_nts) - num_nts))

        # Construct the rules
        self._log("Constructing rules...")
        self._productions = set()
        timer = Timer()

        # Lexical rules
        timer.start()
        for t in self._terminals:
            t_kernel = SentenceSet([Sentence([t])])
            t_nt = self._nonterminals[t_kernel]
            t_contexts = self._nt_contexts[t_nt]

            for contexts, nt in context_nts.items():
                rule = Production(nt, [t])
                if rule in self._productions:
                    continue
                if rule in self._eliminated_rules:
                    continue

                if contexts.issubset(t_contexts):
                    self._productions.add(rule)
                else:
                    self._eliminated_rules.add(rule)

        timer.stop()
        num_lex = len(self._productions)
        self._log("{} lexical rules ({:.2f} secs)".format(
            num_lex, timer.elapsed()))

        # Binary rules
        timer.reset()
        timer.start()
        for kernel_l in self._nonterminals:
            for kernel_r in self._nonterminals:
                kernel_rhs = kernel_l + kernel_r
                sents_rhs = list(kernel_rhs.intersection(self._substrings))

                inds = range(len(sents_rhs) // self._k + 1)
                kers_rhs = [
                    sents_rhs[self._k * i:self._k * (i + 1)] for i in inds
                ]
                kers_rhs = [SentenceSet(k) for k in kers_rhs if len(k) > 0]

                nts_rhs = [self._nonterminals[k] for k in kers_rhs]
                contexts_nts_rhs = [self._nt_contexts[nt] for nt in nts_rhs]
                if len(contexts_nts_rhs) > 0:
                    contexts_rhs = contexts_nts_rhs[0].intersection(
                        *contexts_nts_rhs)
                else:
                    contexts_rhs = self._contexts

                # Membership queries
                new_strs_rhs = kernel_rhs.difference(SentenceSet(sents_rhs))
                new_contexts_rhs = self._oracle.restr_right_triangle(
                    new_strs_rhs, contexts_rhs)
                contexts_rhs.intersection_update(new_contexts_rhs)

                # Building the rules
                for contexts, nt in context_nts.items():
                    nt_l = context_nts[self._nt_contexts[
                        self._nonterminals[kernel_l]]]
                    nt_r = context_nts[self._nt_contexts[
                        self._nonterminals[kernel_r]]]
                    rule = Production(nt, [nt_l, nt_r])
                    if rule in self._productions:
                        continue
                    if rule in self._eliminated_rules:
                        continue

                    if contexts.issubset(contexts_rhs):
                        self._productions.add(rule)
                    else:
                        self._eliminated_rules.add(rule)

        timer.stop()
        num_bin = len(self._productions) - num_lex
        self._log("{} binary rules ({:.2f} secs)".format(
            num_bin, timer.elapsed()))

        # Start rules
        timer.reset()
        timer.start()
        for contexts, nt in context_nts.items():
            rule = Production(self._start_symbol, [nt])
            if rule in self._productions:
                continue
            if rule in self._eliminated_rules:
                continue
            if Context([], []) in contexts:
                self._productions.add(rule)
            else:
                self._eliminated_rules.add(rule)

        timer.stop()
        num_start = len(self._productions) - num_lex - num_bin
        self._log("{} start rules ({:.2f} secs)".format(
            num_start, timer.elapsed()))

        # Construct the grammar
        self._curr_guess = CFG(self._start_symbol, self._productions)
        self._curr_guess_parser = ChartParser(self._curr_guess)

        total_timer.stop()
        elapsed = total_timer.elapsed()
        num_rules = len(self._curr_guess.productions())
        self._log("Constructed grammar with {} rules ({:.2f} secs)".format(
            num_rules, elapsed))

        return self._curr_guess
Example #20
File: cfg.py Project: Geolem/nltk
    def _apply(self, *e):
        productions = self._parse_productions()
        start = Nonterminal(self._start.get())
        cfg = CFG(start, productions)
        if self._set_cfg_callback is not None:
            self._set_cfg_callback(cfg)
Example #21
from nltk import tree
from nltk.grammar import CFG, Nonterminal

# productions collected from the original trees and from their CNF-transformed versions
prod = []
prod_cnf = []
s = ''

print('Building trees from parsed sentences')
with open('parsed_sentences.txt') as f:
    sentences = list(f) + ['']
    for line in sentences:
        line = line.strip()
        if len(line) > 0:
            if line[0] != '#':
                s += line
        elif len(s) > 0:
            t = tree.Tree.fromstring(s)
            prod += t.productions()
            t.chomsky_normal_form()
            t.collapse_unary(collapsePOS=True)
            prod_cnf += t.productions()
            s = ''

prod = set(prod)
prod_cnf = set(prod_cnf)

print('Writing CFG to file with %d productions' % len(prod))
grammar = CFG(Nonterminal('ROOT'), prod)
with open('grammar.cfg', 'w') as f:
    f.write('\n'.join([str(p) for p in grammar.productions()]))

print('Writing CFG (CNF) to file with %d productions' % len(prod_cnf))
grammar_cnf = CFG(Nonterminal('ROOT'), prod_cnf)
with open('grammar_cnf.cfg', 'w') as f:
    f.write('\n'.join([str(p) for p in grammar_cnf.productions()]))
Example #22
def is_rhs_terminal(prod):
    # assumed signature for the truncated helper: True when the rhs is a single terminal string
    rhs = prod.rhs()
    return len(rhs) == 1 and isinstance(rhs[0], str)


parser = CoreNLPParser(url="http://localhost:9000")

sentences = brown.sents()

# FILTER SHORT AND LONG SENTENCES
filter_sentences = []
for sentence in tqdm(sentences):
    nb_words = number_of_words(sentence)
    if nb_words >= 5 and nb_words <= 10:
        filter_sentences.append(sentence)

# PARSE SENTENCES
productions = []
for sentence in tqdm(filter_sentences):
    parse_tree = next(iter(parser.parse(sentence)))
    productions += parse_tree.productions()

unique_productions = list(set(productions))

# REMOVE TERMINAL SYMBOLS
productions_wo_term = []
for prod in unique_productions:
    if not is_rhs_terminal(prod):
        productions_wo_term.append(prod)

grammar = CFG(start=Nonterminal("ROOT"), productions=productions_wo_term)
pickle.dump(grammar, open("brown_grammar.pickle", "wb"))
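
A short follow-up sketch (added here): the pickled grammar can be reloaded later and wrapped in one of NLTK's chart parsers. Note that this grammar keeps only nonterminal-level productions, because the terminal rules were filtered out above:

import pickle

from nltk.parse.chart import BottomUpChartParser

with open("brown_grammar.pickle", "rb") as f:
    brown_grammar = pickle.load(f)

brown_parser = BottomUpChartParser(brown_grammar)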
Example #23
for sent in sentences:
    for p in parser.parse(sent):
        p.draw()

from nltk.corpus import treebank
print(treebank.parsed_sents()[0])
print(treebank.parsed_sents()[1])

from nltk.grammar import CFG, Nonterminal

prods = list({
    production
    for sent in treebank.parsed_sents() for production in sent.productions()
})
t_grammar = CFG(Nonterminal('S'), prods)

sents = [
    'Mr. Vinken is chairman .'.split(), 'Stocks rose .'.split(),
    'Alan introduced a plan .'.split()
]

t_parser = BottomUpChartParser(t_grammar)

parses = 0
for s in sents[:1]:
    for p in t_parser.parse(s):
        if parses < 5:
            print(p)
        parses += 1
Example #24
def parse(text):
    """Parse some text."""
    '''
    # extract new words and numbers
    words = set([match.group(0) for match in re.finditer(r"[a-zA-Z]+", text)])
    numbers = set([match.group(0) for match in re.finditer(r"\d+", text)])        
    '''

    numbers = set([match.group(0) for match in re.finditer(r"\d+", text)])
    coordinates = set(
        [match.group(0) for match in re.finditer(r"\(\d+,\d+\)", text)])
    relations = [
        "segitiga", "kotak", "titik", "garis", "poligon", "negara", "kota",
        "provinsi"
    ]
    fields = ["nama", "ibukota", "geom", "id", "id_ibukota"]

    class Relation:
        def __init__(self, name, attrs, geom):
            self.name = name
            self.attrs = attrs
            self.geom = geom

    # segitiga: id, nama, geom
    # kotak: id, nama, geom
    # titik: id, nama, geom
    # garis: id, nama, geom
    # poligon: id, nama, geom
    # negara: id, nama, id_ibukota, geom
    # provinsi: id, nama, id_ibukota, geom
    # kota: id, nama, geom

    # Make a local copy of productions
    lproductions = list(productions)

    # Add productions for the extracted numbers, relations, values, fields and coordinates
    lproductions.extend(
        [literal_production("NUMBER", number) for number in numbers])
    lproductions.extend(
        [literal_production("RELATION", relation) for relation in relations])
    lproductions.extend(
        [literal_production("VALUE", value) for value in values])
    lproductions.extend(
        [literal_production("FIELD", field) for field in fields])
    lproductions.extend(
        [literal_production("COOR", coor) for coor in coordinates])

    key = "VALUE"
    lhs = Nonterminal(key)
    lproductions.extend([Production(lhs, ["bengawan", "solo"])])

    # Make a local copy of the grammar with extra productions
    lgrammar = CFG(grammar.start(), lproductions)

    # Load grammar into a parser
    parser = nltk.RecursiveDescentParser(lgrammar)

    tokens = text.split()

    return parser.parse(tokens)