def test_parser_code_production(self):
		s = "S -> A B C\n"
		s +="A -> a A | &\n"
		s +="B -> b B | A C d\n"
		s +="C -> c C | &"
		g = Grammar.text_to_grammar(s)
		r = RecursiveDescentParser(g)

		c = '''\
	A()
	B()
	C()'''
		self.assertEqual(c.strip(), r._parser_code_production(Production('S', 'A B C'), 'S').strip())

		c = '''\
	if current_symbol == 'a':
		next_lexic_symbol()
	else:
		raise Exception('A','a',current_symbol)
	A()'''
		self.assertEqual(c.strip(), r._parser_code_production(Production('A', 'a A'), 'A').strip())

		c = '''\
	A()
	C()
	if current_symbol == 'd':
		next_lexic_symbol()
	else:
		raise Exception('B','d',current_symbol)'''
		self.assertEqual(c.strip(), r._parser_code_production(Production('B', 'A C d'), 'B').strip())

	def test_parser_code_nonterminal(self):
		s = "S -> A B C\n"
		s +="A -> a A | &\n"
		s +="B -> b B | A C d\n"
		s +="C -> c C | D\n"
		s +="D -> &"
		g = Grammar.text_to_grammar(s)
		r = RecursiveDescentParser(g)

		c = '''\
def S():
	global current_symbol
	if current_symbol in ['a', 'b', 'c', 'd']:
		A()
		B()
		C()
\t
	else:
		raise Exception('S',['a', 'b', 'c', 'd'],current_symbol)'''
		self.assertEqual(c.strip(), r._parser_code_nonterminal('S').strip())

		c = '''\
def A():
	global current_symbol
	if current_symbol in ['a']:
		if current_symbol == 'a':
			next_lexic_symbol()
		else:
			raise Exception('A','a',current_symbol)
		A()'''
		self.assertEqual(c.strip(), r._parser_code_nonterminal('A').strip())
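For orientation, here is a minimal end-to-end sketch of the API these tests exercise. It assumes only the calls shown above (Grammar.text_to_grammar, RecursiveDescentParser, _parser_code_nonterminal); '&' denotes the empty (epsilon) production.

# A minimal sketch, assuming only the API exercised by the tests above;
# '&' denotes the empty (epsilon) production.
s = "S -> A B C\n"
s += "A -> a A | &\n"
s += "B -> b B | A C d\n"
s += "C -> c C | &"

g = Grammar.text_to_grammar(s)   # build a Grammar from its textual form
r = RecursiveDescentParser(g)

# Emits one function per nonterminal; each function dispatches on the
# current lookahead symbol against the symbols that can begin its alternatives.
print(r._parser_code_nonterminal('S'))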
Example #3
	def btn_parser_clicked(self):
		if self.verify_grammar_ll1():
			g = Grammar.text_to_grammar(self.ui.text_grammar.toPlainText())
			r = RecursiveDescentParser(g)
			self._current_parser = r
			self.ui.text_parser.setText(r.parser_code(self.log).strip().replace('\t','    '))
			# Portuguese UI strings: "Recursive descent parser generation" / "The parser was generated!"
			QMessageBox.information(self, 'Geração do parser descendente recursivo', 'O parser foi gerado!')
Example #4
    def __init__(self, grammar):
        self.string = ""
        self.words = []
        self.engine = GrammarEngine(grammar)
        self.RDP = RecursiveDescentParser(self.engine, False)

        self.nonterminals = self.engine.get_nonterminals(grammar)
        self.terminals = self.engine.get_terminals(grammar)
Example #5
def component2():
    grammar_engine = GrammarEngine("component2.txt")
    grammar = grammar_engine.grammar
    parser = RecursiveDescentParser(grammar, False)
    result = parser.parse("Joe said Buster ghosted Schiller", "S")

    print(result)
Example #6
    def __init__(self, grammar, verbose=False):
        self.parser = RecursiveDescentParser(grammar=grammar, verbose=verbose)
        self.grammar = grammar
        self.partial_parses = list()
Example #7
class IslandParser:
    def __init__(self, grammar, verbose=False):
        self.parser = RecursiveDescentParser(grammar=grammar, verbose=verbose)
        self.grammar = grammar
        self.partial_parses = list()

    def parse(self, string):
        '''
        Basic idea:
        1. Take out the unparsable parts (tokens not stored as terminals in the grammar).
        2. Create a list of substrings that consist of terminals only.
        3. Call parse_substring() to parse each substring.
        '''
        # split the input string into tokens
        fragments = string.split()

        # collect every terminal symbol that appears on the right-hand side of the grammar
        symbol_set = []
        for symbol in self.grammar.grammar.values():
            for rule in symbol.rules:
                for token in rule.body:
                    if isinstance(token, str):
                        symbol_set.append(token)

        length = len(fragments)
        index = 0
        subset = []
        valid_set = []
        # check each token and see if it appears in the grammar
        while index < length:
            # if the current token is valid, append it to a subset so that adjacent valid tokens stay together
            if fragments[index] in symbol_set:
                subset.append(fragments[index])
            # if the current token is not valid, flush any non-empty subset into valid_set
            elif len(subset) > 0:
                valid_set.append(subset)
                subset = []
            # move on to the next token
            index += 1
        # flush a trailing run of valid tokens that reaches the end of the string
        if len(subset) > 0:
            valid_set.append(subset)

        # join each token subset in valid_set back into a space-separated string
        substring_set = []
        for item in valid_set:
            substring_set.append(" ".join(item))

        # parse each string in substring_set
        for substring in substring_set:
            self.parse_substring(substring)

        # return the list of partial parses sorted in descending order of length
        self.partial_parses.sort(key=lambda x: len(x), reverse=True)
        return self.partial_parses

    def parse_substring(self, sub_string):
        '''
        Basic idea (see the sliding-window sketch after this class):
        1. Fragment the string into tokens.
        2. Attempt to parse every token window, longest first.
        3. On a successful parse, check whether it is contained in an existing
           partial parse; keep it only if it is not.
        '''

        # split the input string into tokens
        fragments = sub_string.split()
        # number of tokens in the substring (not its character length)
        length = len(fragments)

        # consider token windows from longest to shortest
        while length >= 1:
            start = 0
            while start <= len(fragments) - length:
                # get the substring
                substring = " ".join(fragments[start:start + length])
                # parse the substring
                for symbol in self.grammar.grammar.keys():  # try out all symbols
                    result = self.parser.parse(string=substring,
                                               start_symbol_name=symbol)
                    parse_already_exists = False
                    # a non-None result means the parse succeeded
                    if result is not None:
                        # check if the result is a subset of some other partial parse
                        for parse in self.partial_parses:
                            if result in parse:
                                parse_already_exists = True
                        # if it's not a subset of any of the existing parses, add it to the list
                        if not parse_already_exists:
                            self.partial_parses.append(result)
                # move the window
                start += 1
            # decrease the length of the expected substring
            length -= 1
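The heart of parse_substring is a longest-first sliding window over the token list. A standalone sketch of just that enumeration (windows_longest_first is a hypothetical helper, not part of the class) makes the traversal order explicit:

def windows_longest_first(tokens):
    # Yield every contiguous token window, longest first and left to right,
    # mirroring the two nested while-loops in parse_substring above.
    for length in range(len(tokens), 0, -1):
        for start in range(len(tokens) - length + 1):
            yield " ".join(tokens[start:start + length])

# windows_longest_first(["a", "b", "c"]) yields:
#   "a b c", "a b", "b c", "a", "b", "c"

Since every window is tried against every grammar symbol, roughly n(n+1)/2 windows are parsed for n tokens, which is why the subset check in parse_substring matters for keeping partial_parses small.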
Example #8
import re  # used by parse(); GrammarEngine and RecursiveDescentParser are assumed imported elsewhere


class IslandParser:
    def __init__(self, grammar):
        self.string = ""
        self.words = []
        self.engine = GrammarEngine(grammar)
        self.RDP = RecursiveDescentParser(self.engine, False)

        self.nonterminals = self.engine.get_nonterminals(grammar)
        self.terminals = self.engine.get_terminals(grammar)

    def parse(self, string):
        '''
        Returns a tuple of island parses, each of which is a list of partial
        parses of the object's words, plus a flag indicating success.
        (A usage sketch follows this class.)

        Starting at the number of tokens, n, in the string and successively
        decreasing by 1, looks at all the n-length substrings of the string
        and attempts to parse each one. Each successful parse is added to a
        partial parse, and the search continues.
        '''
        # tokenize, keeping punctuation marks as separate tokens
        self.words = re.findall(r"[\w']+|[.,!?;]", string)

        # False until the largest parsable span has been found
        biggest_parse = False
        # the partial parses: a list of lists, where each inner list is one partial parse
        partial_parses = []
        # the tokens that make up each partial parse (a list of lists)
        partial_parses_tokens = []
        # the indices of the tokens that make up each partial parse (a list of lists)
        partial_parses_indices = []

        symbols = self.nonterminals
        # consider substrings from longest (all of the words) down to length 2
        for i in range(len(self.words), 1, -1):
            token_lists = self.substringsFilterNotInGrammar(self.words, i)
            biggest_parse_this_level = False
            for token_and_indices in token_lists:
                token = token_and_indices[0]
                indices = token_and_indices[1]
                for symbol in symbols:
                    parse = self.RDP.parse(token, symbol)
                    if parse is not None:
                        if not biggest_parse:
                            temp = [parse]
                            partial_parses.append(temp)
                            # partial_parses_tokens.append(token.split(" "))
                            partial_parses_tokens.append(
                                re.findall(r"[\w']+|[.,!?;]", token))
                            partial_parses_indices.append(indices)
                            biggest_parse_this_level = True
                        else:
                            # little_tokens = token.split(" ")
                            little_tokens = re.findall(r"[\w']+|[.,!?;]",
                                                       token)
                            parse_num = 0
                            for par in partial_parses_tokens:
                                new_tokens = True
                                for partial in par:
                                    for index in indices:
                                        if index in partial_parses_indices[parse_num]:
                                            new_tokens = False
                                if new_tokens:
                                    # new_tokens_list = token.split(" ")
                                    new_tokens_list = re.findall(
                                        r"[\w']+|[.,!?;]", token)
                                    temp_parses = partial_parses[
                                        parse_num].copy()
                                    temp_parses.append(parse)
                                    temp_parses_tokens = partial_parses_tokens[
                                        parse_num].copy()
                                    temp_parses_indices = partial_parses_indices[
                                        parse_num].copy()
                                    for x in new_tokens_list:
                                        temp_parses_tokens.append(x)
                                    for index in indices:
                                        temp_parses_indices.append(index)
                                    partial_parses.append(temp_parses)
                                    partial_parses_tokens.append(
                                        temp_parses_tokens)
                                    partial_parses_indices.append(
                                        temp_parses_indices)
                                parse_num = parse_num + 1

                # check whether some partial parse now covers all of the words
                all_tokens_parses = []
                x = 0
                all_parsed = False
                for tokensList in partial_parses_tokens:
                    if len(tokensList) == len(self.words):
                        all_tokens_parses.append(partial_parses[x])
                        all_parsed = True
                    x += 1
                if all_parsed:
                    # keep only the complete parses built from the fewest islands
                    final_parses = []
                    minParses = min(len(parse) for parse in all_tokens_parses)
                    for parse in all_tokens_parses:
                        if len(parse) == minParses:
                            final_parses.append(parse)
                    # tie-break on parse-tree size, measured by counting "(" in the parse string
                    if len(final_parses) > 1:
                        largest_length = -float('inf')
                        largest_parse = []
                        for final_parse in final_parses:
                            if final_parse[0].count("(") > largest_length:
                                largest_length = final_parse[0].count("(")
                        for final_parse in final_parses:
                            if final_parse[0].count("(") == largest_length:
                                largest_parse.append(final_parse)
                        return tuple(largest_parse), True
                    return final_parses, True

                if biggest_parse_this_level:
                    biggest_parse = True

        if not partial_parses:
            # no partial parse exists for the given string and grammar
            return (), False
        # choose which partial parses to return:
        # first, only consider parses covering the most tokens
        maxTokens = max(len(parse) for parse in partial_parses_tokens)
        parse_num = 0
        pre_final_parses = []
        for parse in partial_parses_tokens:
            if len(parse) == maxTokens:
                pre_final_parses.append(partial_parses[parse_num])
            parse_num += 1

        # of those, only keep parses built from the minimum number of islands
        final_parses = []
        minParses = min(len(parse) for parse in pre_final_parses)
        for parse in pre_final_parses:
            if len(parse) == minParses:
                final_parses.append(parse)

        if len(final_parses) > 1:
            largest_length = -float('inf')
            largest_parse = []
            for final_parse in final_parses:
                if final_parse[0].count("(") > largest_length:
                    largest_length = final_parse[0].count("(")
            for final_parse in final_parses:
                if final_parse[0].count("(") == largest_length:
                    largest_parse.append(final_parse)
            return tuple(largest_parse), True
        return tuple(final_parses), True

    # only returns substrings of length substring_length whose words all appear in the grammar's terminals
    # returns a list of tuples, each holding a token phrase and the indices of its tokens: ("phrase", [indices])
    def substringsFilterNotInGrammar(self, words, substring_length):
        # flatten multi-word terminals; tokens containing "<" (nonterminal markers) are skipped
        terminals = []
        for terminal in self.terminals:
            for split_terminal in terminal.split(" "):
                if "<" not in split_terminal:
                    terminals.append(split_terminal)
        tokens_and_indices = []
        i = 0
        while i + substring_length <= len(words):
            temp = words[i:i + substring_length]
            add = True
            for word in temp:
                if word not in terminals:
                    add = False
            if add:
                string = " ".join(temp)
                string = re.sub(r' ([^A-Za-z0-9])', r'\1', string)
                indices = list(range(i, i + substring_length))
                tokens_and_indices.append((string, indices))
            i += 1
        return tokens_and_indices
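A hedged usage sketch for this variant; the grammar file name and the sentence are illustrative assumptions, with GrammarEngine expected to load the grammar from the given file as in Example #5:

# Hypothetical usage; "sentences.txt" is an assumed grammar file.
island_parser = IslandParser("sentences.txt")

# parse() returns (parses, found): the selected island parses and a flag
# indicating whether any partial parse was found at all.
parses, found = island_parser.parse("Joe said Buster ghosted Schiller")
if found:
    for partial in parses:
        print(partial)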