Example #1
def _slow_day19a(s):
    import nltk
    rules, received = s.split('\n\n')
    # Put rule "0: ..." first so it becomes the grammar's start symbol.
    rules = sorted(rules.splitlines(), key=lambda x: not x.startswith('0: '))
    grammar = nltk.CFG.fromstring(
        line.replace(':', ' ->', 1) for line in rules)
    parser = nltk.ChartParser(grammar)
    result = 0
    for line in received.splitlines():
        try:
            next(parser.parse(list(line)))  # succeeds iff the line matches rule 0
            result += 1
        except StopIteration:
            pass
    return result
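A minimal usage sketch with a hypothetical two-rule puzzle input, showing the expected format (numbered rules, a blank line, then one message per line):

puzzle = '0: 1 2\n1: "a"\n2: "b"\n\nab\nba'
print(_slow_day19a(puzzle))  # 1 -- only "ab" derives from rule 0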
Example #2
def defgrammar():
    Grammar = nltk.CFG.fromstring("""S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)

    sent = "I shot an elephant".split()
    parser = nltk.ChartParser(Grammar)
    trees = parser.parse(sent)
    for tree in trees:
        print(tree)
Example #3
def sentence_parse_example():
    groucho_grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
  """)
    sent = ["I", "shot", "an", "elephant", "in", "my", "pajamas"]
    parser = nltk.ChartParser(groucho_grammar)
    trees = parser.parse(sent)
    for tree in trees:
        print(tree)
Example #4
def define_grammar_parse_result():
    Grammar = nltk.CFG.fromstring(""" 
    S -> NP VP 
    PP -> P NP 
    NP -> Det N | Det N PP | 'I' 
    VP -> V NP | VP PP 
    Det -> 'an' | 'my' 
    N -> 'elephant' | 'pajamas' 
    V -> 'shot' 
    P -> 'in' 
    """)
    sent = "I shot an elephant".split()
    parser = nltk.ChartParser(Grammar)
    trees = parser.parse(sent)
    for tree in trees:
        print(tree)
Example #5
def another_test():
    grammar = nltk.CFG.fromstring("""
S -> NP VP
NP -> 'DT' 'NN'
VP -> 'VB' | 'VBP'
VP -> 'VB' 'NN'
""")

    # Make your POS sentence into a list of tokens.
    sentence = "DT NN VB NN".split(" ")

    # Load the grammar into the ChartParser.
    cp = nltk.ChartParser(grammar)

    # Generate and print the parses from the grammar given the sentence tokens.
    for tree in cp.parse(sentence):
        print(tree)
Example #6
def encode(smiles):
    assert isinstance(smiles, list)
    GCFG = zinc_grammar.GCFG
    tokenize = get_zinc_tokenizer(GCFG)
    tokens = map(tokenize, smiles)
    parser = nltk.ChartParser(GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    productions = GCFG.productions()
    prod_map = {}
    for ix, prod in enumerate(productions):
        prod_map[prod] = ix
    indices = [
        np.array([prod_map[prod] for prod in entry], dtype=int)
        for entry in productions_seq
    ]
    return indices
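A hedged usage sketch, assuming the Grammar-VAE zinc_grammar module and get_zinc_tokenizer are importable and that the SMILES grammar covers a simple molecule such as 'CC' (ethane):

indices = encode(['CC'])
print(indices[0])  # array of production-rule indices used to parse 'CC'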
Example #7
 def __init__(self, weights_file, latent_rep_size=56):
     """ Load the (trained) zinc encoder/decoder, grammar model. """
     self._grammar = zinc_grammar
     self._model = models.model_zinc
     self.MAX_LEN = self._model.MAX_LEN
     self._productions = self._grammar.GCFG.productions()
     self._prod_map = {}
     for ix, prod in enumerate(self._productions):
         self._prod_map[prod] = ix
     self._parser = nltk.ChartParser(self._grammar.GCFG)
     self._tokenize = get_zinc_tokenizer(self._grammar.GCFG)
     self._n_chars = len(self._productions)
     self._lhs_map = {}
     for ix, lhs in enumerate(self._grammar.lhs_list):
         self._lhs_map[lhs] = ix
     self.vae = self._model.MoleculeVAE()
     self.vae.load(self._productions, weights_file, max_length=self.MAX_LEN, latent_rep_size=latent_rep_size)
Example #8
def parse_tree(data):
    grammar = nltk.CFG.fromstring("""
        S -> NP 'NN'
        S -> 'DT' NP
        NP -> 'JJ' 'NN'
        NP -> 'NN' 'NN'
        NP -> 'DT' 'NN'
        NP -> 'JJ' NP
    """)

    cp = nltk.ChartParser(grammar)

    for d in data:
        text = d["text"]
        # The grammar's terminals are POS tags, so parse the tag
        # sequence rather than the (word, tag) pairs.
        tags = [tag for _, tag in nltk.pos_tag(nltk.word_tokenize(text))]
        for tree in cp.parse(tags):
            print(tree)
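A usage sketch with hypothetical input, assuming NLTK's tagger data is installed and tags "the red car" as DT JJ NN, which the grammar derives via S -> 'DT' NP and NP -> 'JJ' 'NN':

parse_tree([{"text": "the red car"}])  # prints one tree per parse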
Example #9
def process2(s):
    tokens = nltk.word_tokenize(s)

    grammar = nltk.CFG.fromstring("""
        S -> NP VP
        PP -> P NP
        NP -> Det N | Det N PP | 'I'
        VP -> V NP | VP PP
        Det -> 'an' | 'my'
        N -> 'elephant' | 'pajamas'
        V -> 'shot'
        P -> 'in'
    """)
    parser = nltk.ChartParser(grammar)
    # The grammar's terminals are words, so parse the word tokens
    # rather than the (word, tag) pairs from nltk.pos_tag.
    trees = parser.parse(tokens)
    return trees
Example #10
def to_one_hot(smiles):
    """ Encode a list of smiles strings to one-hot vectors """
    assert isinstance(smiles, list)
    tokens = list(map(tokenize, smiles))
    parser = nltk.ChartParser(G.GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [
        np.array([prod_map[prod] for prod in entry], dtype=int)
        for entry in productions_seq
    ]
    one_hot = np.zeros((len(indices), MAX_LEN, NCHARS), dtype=np.float32)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        one_hot[i][np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
Example #11
 def __init__(self, weights_file, latent_rep_size=2):
     """ Load the (trained) equation encoder/decoder, grammar model. """
     self._grammar = the_grammar
     self._model = molecules.model_gr
     self.MAX_LEN = 15 # TODO: read from elsewhere
     self._productions = self._grammar.GCFG.productions()
     self._prod_map = {}
     for ix, prod in enumerate(self._productions):
         self._prod_map[prod] = ix
     self._parser = nltk.ChartParser(self._grammar.GCFG)
     self._tokenize = tokenize
     self._n_chars = len(self._productions)
     self._lhs_map = {}
     for ix, lhs in enumerate(self._grammar.lhs_list):
         self._lhs_map[lhs] = ix
     self.vae = self._model.MoleculeVAE()
     self.vae.load(self._productions, weights_file, max_length=self.MAX_LEN, latent_rep_size=latent_rep_size)
Example #12
    def __init__(self, vae: EquationVaeTorch):
        """ Load the (trained) equation encoder/decoder, grammar model. """
        self._grammar = eq_grammar
        self._model = expr_model_pt
        self.MAX_LEN = 15
        self._productions = self._grammar.GCFG.productions()
        self._prod_map = {}
        for ix, prod in enumerate(self._productions):
            self._prod_map[prod] = ix
        self._parser = nltk.ChartParser(self._grammar.GCFG)
        self._tokenize = tokenize
        self._n_chars = len(self._productions)
        self._lhs_map = {}
        for ix, lhs in enumerate(self._grammar.lhs_list):
            self._lhs_map[lhs] = ix

        self.vae: EquationVaeTorch = vae
Example #13
    def __init__(self, boundaryEPs, operationalPEs, availableDomains):

        kernelGrammar = """
			S 			 -> "IN" OPBLOCK

			OPBLOCK 	 -> TBRANCH | NTBRANCH | TPBLOCK OPBLOCK | TPBLOCK EN
			ROPBLOCK	 -> INTBRANCH | TPBLOCK ROPBLOCK | TPBLOCK
			TPBLOCK  	 -> PORDER | MASKPELEM
			
			PORDER 		 -> "[" MASKPELEM NPELEM "]" POEXCEPTION | "[" MASKPELEM NPELEM "]"
			POEXCEPTION  -> "(" PELEM PELEM ")" POEXCEPTION | "(" PELEM PELEM ")" | "(" PELEM PELEM "*" ")" POEXCEPTION | "(" PELEM PELEM "*" ")"
			
			TBRANCH 	 -> TPBLOCK "{" OPBLOCK NEXTTBRANCH "}" 
			NEXTTBRANCH  -> "/" OPBLOCK NEXTTBRANCH | "/" OPBLOCK

			NTBRANCH	 -> TPBLOCK "{" ROPBLOCK NEXTNTBRANCH "}" OPBLOCK
			INTBRANCH	 -> TPBLOCK "{" ROPBLOCK NEXTNTBRANCH "}" ROPBLOCK
			NEXTNTBRANCH -> "/" ROPBLOCK NEXTNTBRANCH | "/" ROPBLOCK

			NPELEM 		 -> MASKPELEM NPELEM | MASKPELEM
			MASKPELEM	 -> PELEM | PELEM "<" DOMAIN ">"
		"""

        grammarPELEM = 'PELEM -> ' + ' | '.join(
            f'"{PE}"' for PE in operationalPEs) + '\n'

        grammarEP = 'EN -> ' + ' | '.join(
            f'"{EN}"' for EN in boundaryEPs) + '\n'

        grammarDomain = 'DOMAIN ->'
        if availableDomains:
            grammarDomain += ' ' + ' | '.join(
                f'"{domain}"' for domain in availableDomains)

        self.__boundaryEPs = boundaryEPs
        self.__operationalPEs = operationalPEs
        self.__mainParser = nltk.ChartParser(
            nltk.CFG.fromstring(kernelGrammar + grammarPELEM + grammarEP +
                                grammarDomain))
        self.__status = 0
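For illustration, a standalone sketch (with hypothetical element names) of what one dynamically built rule line looks like:

operationalPEs = ["firewall", "dpi"]  # hypothetical processing elements
print('PELEM -> ' + ' | '.join(f'"{PE}"' for PE in operationalPEs))
# PELEM -> "firewall" | "dpi"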
Example #14
    def date_parse(self, dates: Set[str]):
        # support formats:
        #   on 2020-12-12, on 2020/12/12
        #   2020-12-12, 2020/12/12
        #   December 12, 2020
        #   December 12
        #   December 2020
        #   on December 12th
        #   December 12th 2020
        #   December 12th, 2020
        #   December 12th
        #   the twelfth of December
        #   the 12th of December
        #   in 2020
        #   in December 2020
        DateParseCFG = nltk.CFG.fromstring("""
            DATE -> IN YEAR SEP MONTH_NUM SEP DAY | YEAR SEP MONTH_NUM SEP DAY | MONTH_STR DAY SEP YEAR | MONTH_STR DAY | MONTH_STR YEAR | IN MONTH_STR NN_NUM | MONTH_STR NN_NUM YEAR | MONTH_STR NN_NUM SEP YEAR | MONTH_STR NN_NUM | DT NN_STR IN MONTH_STR | DT NN_NUM IN MONTH_STR | IN YEAR | IN MONTH_STR YEAR
            SEP -> "/" | "-" | ","
            YEAR -> DIGIT DIGIT DIGIT DIGIT
            MONTH_NUM -> DIGIT | DIGIT DIGIT
            DAY -> DIGIT | DIGIT DIGIT
            DT -> "the"
            IN -> "of" | "in" | "on"
            NN_STR -> "first" | "second" | "third" | "fourth" | "fifth" | "sixth" | "seventh" | "eighth" | "ninth" | "tenth" | "eleventh" | "twelfth" | "thirteenth" | "fourteenth" | "fifteenth" | "sixteenth" | "seventeenth" | "eighteenth" | "nineteenth" | "twentieth" | "twenty-first" | "twenth-second" | "twenty-third" | "twenty-fourth" | "twenty-fifth" | "twenty-sixth" | "twenty-seventh" | "twenty-eighth" | "twenth-ninth" | "thirtieth" | "thirty-first"
            MONTH_STR -> "January" | "February" | "March" | "April" | "May" | "June" | "July" | "August" | "September" | "October" | "November" | "December"
            NN_NUM -> "1st" | "2nd" | "3rd" | "4th" | "5th" | "6th" | "7th" | "8th" | "9th" | "10th" | "11th" | "12th" | "13th" | "14th" | "15th" | "16th" | "17th" | "18th" | "19th" | "20th" | "21st" | "22nd" | "23rd" | "24th" | "25th" | "26th" | "27th" | "28th" | "29th" | "30th" | "31st"
            DIGIT -> "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
            """)
        date_parser = nltk.ChartParser(DateParseCFG)

        for date in dates:
            if '/' in date or '-' in date:
                # yyyy/mm/dd or yyyy-mm-dd: every character is a token
                tokens = list(date)
            else:
                tokens = []
                for t in date.split():
                    if t.isnumeric():
                        tokens.extend(t)  # split numbers into digit tokens
                    else:
                        tokens.append(t)

            for tree in date_parser.parse(tokens):
                print(tree)
                tree.draw()
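A small sketch of the tokenization step alone, since it decides which DATE rule can match: numeric runs are split into single digits (so YEAR can match DIGIT DIGIT DIGIT DIGIT) while words and ordinals stay whole:

date = "December 12th 2020"
tokens = []
for t in date.split():
    if t.isnumeric():
        tokens.extend(t)  # '2020' -> '2', '0', '2', '0'
    else:
        tokens.append(t)
print(tokens)  # ['December', '12th', '2', '0', '2', '0']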
Example #15
def ambiguity():

    groucho_grammar = nltk.CFG.fromstring("""
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | 'I'
    VP -> V NP | VP PP
    Det -> 'an' | 'my'
    N -> 'elephant' | 'pajamas'
    V -> 'shot'
    P -> 'in'
    """)

    sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
    parser = nltk.ChartParser(groucho_grammar)
    trees = parser.parse(sent)
    for tree in trees:
        print(tree)
Example #16
 def __init__(self, grammar, device, hparams=stgs.VAE_HPARAMS):
     """
     Load trained encoder/decoder and grammar model
     :param grammar: A nas_grammar.Grammar object
     :param hparams: dict, hyperparameters for the VAE and the grammar model
     """
     self._grammar = grammar
     self.device = device
     self.hp = hparams
     self.max_len = self.hp['max_len']
     self._productions = self._grammar.GCFG.productions()
     self._prod_map = make_prod_map(grammar.GCFG)
     self._parser = nltk.ChartParser(grammar.GCFG)
     self._tokenize = make_tokenizer(grammar.GCFG)
     self._n_chars = len(self._productions)
     self._lhs_map = grammar.lhs_map
     self.vae = NA_VAE(self.hp)
     self.vae.eval()
Example #17
def to_one_hot(smiles):
    """ Encode a list of smiles strings to one-hot vectors """
    assert isinstance(smiles, list)
    prod_map = {}
    for ix, prod in enumerate(zinc_grammar.GCFG.productions()):
        prod_map[prod] = ix
    tokenize = molecule_vae.get_zinc_tokenizer(zinc_grammar.GCFG)
    tokens = list(map(tokenize, smiles))
    parser = nltk.ChartParser(zinc_grammar.GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int) for entry in productions_seq]
    one_hot = np.zeros((len(indices), MAX_LEN, NCHARS), dtype=np.int8)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        one_hot[i][np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
Example #18
    def get_grammar(self, tagged_sents):

        res = []

        grammar = nltk.data.load('file:grammar.cfg')
        parser = nltk.ChartParser(grammar)
        for tagged_sent in tagged_sents:
            parsed = False
            for tree in parser.parse(tagged_sent.split()):
                res.append(f"[color=00ff00]{tree}[/color]")
                parsed = True
            if not parsed:
                res.append(
                    "[color=ff0000]Sentence is grammatically wrong according to your grammar[/color]"
                )

        return res
Example #19
    def load(self, filepath):
        with open(filepath) as f:
            cfg_string = f.read()

        # parse from nltk
        cfg_grammar = nltk.CFG.fromstring(cfg_string)
        # self.cfg_parser = cfg_parser = nltk.RecursiveDescentParser(cfg_grammar)
        self.cfg_parser = cfg_parser = nltk.ChartParser(cfg_grammar)

        # our info for rule matching
        self.head_to_rules = head_to_rules = {}
        self.valid_tokens = valid_tokens = set()
        rule_ranges = {}
        total_num_rules = 0
        first_head = None
        for line in cfg_string.split('\n'):
            if len(line.strip()) > 0:
                head, rules = line.split('->')
                head = Nonterminal(head.strip())  # remove space
                rules = [_.strip()
                         for _ in rules.split('|')]  # split and remove space
                rules = [
                    tuple([
                        Nonterminal(_) if not _.startswith("'") else _[1:-1]
                        for _ in rule.split()
                    ]) for rule in rules
                ]
                head_to_rules[head] = rules

                for rule in rules:
                    for t in rule:
                        if isinstance(t, str):
                            valid_tokens.add(t)

                if first_head is None:
                    first_head = head

                rule_ranges[head] = (total_num_rules,
                                     total_num_rules + len(rules))
                total_num_rules += len(rules)

        self.first_head = first_head

        self.rule_ranges = rule_ranges
        self.total_num_rules = total_num_rules
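A small runnable sketch of the per-line rule extraction performed above, on a hypothetical grammar line:

from nltk.grammar import Nonterminal

line = "S -> NP VP | 'stop'"
head, rules = line.split('->')
head = Nonterminal(head.strip())
rules = [tuple(Nonterminal(s) if not s.startswith("'") else s[1:-1]
               for s in r.split())
         for r in rules.split('|')]
print(head, rules)  # S [(NP, VP), ('stop',)]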
Example #20
def to_one_hot(smiles):
    """ Encode a list of smiles strings to one-hot vectors """
    token = tokenize(smiles)
    parser = nltk.ChartParser(zinc_grammar.GCFG)
    parse_tree = next(parser.parse(token))
    draw_trees(parse_tree)
    print(type(parse_tree))
    exit(0)  # debugging early exit left in by the author; the code below is unreachable
    productions_seq = parse_tree.productions()
    print(smiles)
    for i in productions_seq:
        print(i)
    exit(0)
    indices = [prod_map[prod] for prod in productions_seq]
    one_hot = np.zeros(shape=(MAX_LEN, NRULES), dtype=np.float32)
    num_productions = len(indices)
    one_hot[np.arange(num_productions), indices] = 1.
    one_hot[np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
Example #21
def to_one_hot(strs):
    """ Encode a list of strings to one-hot vectors """
    prod_map = {}
    for ix, prod in enumerate(toy_grammar.GCFG.productions()):
        prod_map[prod] = ix
    tokens = map(str.split, strs)
    parser = nltk.ChartParser(toy_grammar.GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [
        np.array([prod_map[prod] for prod in entry], dtype=int)
        for entry in productions_seq
    ]
    one_hot = np.zeros((len(indices), MAX_LEN, NCHARS), dtype=np.float32)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        one_hot[i][np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
Example #22
def make_one_hot(cfg: nltk.CFG, tokenizer, prod_map, sents, max_len=25, n_chars=34):
    """
    Encodes a list of sentences (strings) into a one-hot vector representing the production rules used to generate it.
    """
    if not isinstance(sents, list):
        sents = [sents]
    tokens = list(map(tokenizer, sents))  # tokenize sentences
    parse_trees = [next(nltk.ChartParser(cfg).parse(t)) for t in tokens]  # build parse tree for each sentence
    prod_seq = [tree.productions() for tree in parse_trees]  # list productions used in each parse tree
    indices = []  # list of vectors identifying the production rules used in each sentence
    for entry in prod_seq:
        indices.append(np.array([prod_map[prod] for prod in entry], dtype=int))
    one_hot = np.zeros((len(indices), max_len, n_chars), dtype=np.float32)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        one_hot[i][np.arange(num_productions, max_len), -1] = 1.  # fill last column of
        # unused production slots with 1, which corresponds to the rule "Nothing -> None".
    return torch.tensor(one_hot)
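A toy usage sketch; the one-production grammar below is invented for illustration (numpy and torch must be imported, as the function above already requires):

import nltk

cfg = nltk.CFG.fromstring("S -> 'a' S | 'a'")
prod_map = {p: i for i, p in enumerate(cfg.productions())}
oh = make_one_hot(cfg, str.split, prod_map, "a a a", max_len=5, n_chars=3)
print(oh.shape)  # torch.Size([1, 5, 3])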
Example #23
def one():
    # Grammar
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | VP PP
    PP -> P NP
    NP -> Det N | Det N PP | "Путешественник"
    V -> "шел"
    Det -> "несколько" | "небольшими"
    N -> "недель" | "остановками"
    P -> "с"
    """)
    text = "Путешественник шел несколько недель с небольшими остановками"
    words = nltk.word_tokenize(text)
    # Parse trees
    parser = nltk.ChartParser(grammar)
    # Output
    print("\t\t" + text)
    for t in parser.parse(words):
        print(t)
Example #24
def three():
    # Grammar
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    VP -> V NP | VP PP
    PP -> P NP
    NP -> Det N | Det N PP | "Он"
    V -> "бежал"
    Det -> "воспрянув" | "мокрому"
    N -> "асфальту" | "духом"
    P -> "по"
    """)
    text = "Он бежал воспрянув духом по мокрому асфальту"
    words = nltk.word_tokenize(text)
    # Parse trees
    parser = nltk.ChartParser(grammar)
    # Output
    print("\t\t" + text)
    for t in parser.parse(words):
        print(t)
Example #25
def draw_1(s):
    m = s
    l = fool.cut(s)[0]
    print(l)
    p = product_grammar(m)
    grammar = CFG.fromstring("""
	S -> NP L NP|NP vshi NP y|NP L P NP|NP L P NP F|NP vshi R|T vshi R
	NP -> nr nr| nr ude n| nr n|NP ude NP|NP NP|z ude n|a ude n|v ude n|nr|n|b ude|ns ude|ns|ns ude NP|m n|m q n|A\
    |d m|m|NP c NP|NP p NP
	VP -> v NP|v VP
	L ->vshi d vshi
	P ->p|vi p
	F ->f
	T ->t
	R ->r|r NP|r ude NP
	A ->a|d a|m q|d a ude
	""" + p)
    cp = nltk.ChartParser(grammar)
    trees = cp.parse(l)
    for tree in trees:
        print(tree)
Example #26
def main():
    f = open('cfg_sentences.txt').readlines()
    sentences = ' '.join(f).replace('\n', '')

    text = nltk.word_tokenize(sentences)
    tagged_text = nltk.pos_tag(text)

    # Generate grammar from ruleset
    cfg_rules = generate_cfg_rules()
    grammar = CFG.fromstring(cfg_rules)

    # Display sentence trees if our grammar can parse the sentence
    chart_parser = nltk.ChartParser(grammar)
    print()
    print('Sentences from our input set that can be generated by our grammar: ')
    print('--------------------------------------------')
    for line in f:
        # Cleanup input sentences
        line = line.replace('\n', '').lower()
        line = line.replace('.', '')
        sent = line.split()
        for tree in chart_parser.parse(sent):
            print(tree)

    translation_dict = generate_english_to_spanish()
    translated_sentences = []
    for line in f:
        translated_sentences.append(translate_sentence(line, translation_dict))

    print()
    print('Translated sentences: ')
    print('----------------------')
    for item in translated_sentences:
        print(item)

    bleu_score = calculate_bleu_score(translated_sentences)
    print()
    print('BLEU Score')
    print('----------')
    print("System BLEU Score:", bleu_score)
Example #27
def perform_scg(sentence):
    grammar_string = (" SIGMA -> DELTA\n"
                      " DELTA -> S P C|S P C A|S P A|S P\n"
                      " S -> h |h m\n"
                      " C -> h m|h\n"
                      " P -> aux l| l \n"
                      " A -> Pre C \n"
                      " h ->" + noun_string + " \n"
                      " l ->" + verb_output + " \n"
                      " m -> 'náà' \n"
                      " aux -> 'n'\n"
                      " Pre -> 'ní'\n")

    grammar = CFG.fromstring(grammar_string)
    parser = nltk.ChartParser(grammar)
    try:
        lower_sentence = sentence.lower()
        ans = parser.parse(lower_sentence.split())
        output = " ".join(str(x) for x in list(ans))
    except ValueError as e:
        output = "Error : " + str(e)
    return output
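A hedged usage sketch; noun_string and verb_output are module globals not shown in the snippet, so the Yoruba terminals below are purely illustrative:

noun_string = " 'ọmọ' | 'ilé' "  # hypothetical nouns
verb_output = " 'rí' "           # hypothetical verb
print(perform_scg("ọmọ náà rí ilé"))  # derives DELTA -> S P C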
Example #28
def to_one_hot(transactions):
    """ Encode a list of transaction strings to one-hot vectors """
    assert isinstance(transactions, list)
    prod_map = {}
    for ix, prod in enumerate(trans_grammar.GCFG.productions()):
        prod_map[prod] = ix
    tokens = [transaction.split() for transaction in transactions]
    parser = nltk.ChartParser(trans_grammar.GCFG)
    parse_trees = [next(parser.parse(t)) for t in tokens]
    productions_seq = [tree.productions() for tree in parse_trees]
    indices = [np.array([prod_map[prod] for prod in entry], dtype=int) for entry in productions_seq]
    one_hot = np.zeros((len(indices), MAX_LEN, NCHARS), dtype=np.float32)
    for i in range(len(indices)):
        num_productions = len(indices[i])
        one_hot[i][np.arange(num_productions), indices[i]] = 1.
        one_hot[i][np.arange(num_productions, MAX_LEN), -1] = 1.
    return one_hot
Example #29
def perform_xbar(sentence):
    grammar_string = (" IP -> Spec IBAR \n"
                      " Spec -> NP \n"
                      " IBAR -> I VP\n"
                      " NP -> NBAR  \n"
                      " NBAR -> N DP| N  \n"
                      " VP -> VBAR \n"
                      " VBAR -> V| V NP \n"
                      " DP -> DBAR \n"
                      " DBAR -> D \n"
                      " N -> " + noun_string + " \n"
                      " V -> " + verb_output + " \n"
                      " D -> 'náà' \n")
    grammar = CFG.fromstring(grammar_string)
    parser = nltk.ChartParser(grammar)
    try:
        lower_sentence = sentence.lower()
        ans = parser.parse(lower_sentence.split())
        output = " ".join(str(x) for x in list(ans))
    except ValueError as e:
        output = "Error : " + str(e)
    return output
Example #30
def validate(text):
    grammar = nltk.CFG.fromstring(grammar_str)
    parser = nltk.ChartParser(grammar)
    trees = parser.parse(list(text))
    valid = False
    answer = math_form = None
    for tree in trees:
        # the fifth child of the root carries the arithmetic expression
        addition = tree[4].leaves()
        operation_string = ''.join(addition)
        math_form = operation_string.replace(PLUS, '+') \
            .replace(MUL, '*') \
            .replace(DIV, '/') \
            .replace(MIN, '-') \
            .replace(OPENB, '(') \
            .replace(CLOSEB, ')')
        answer = eval(math_form)
        valid = True
        break
    return valid, math_form, answer