def chartParser():
    """Chart-based syntactic parsing demo.

    Builds a small context-free grammar, parses one sample sentence with a
    traced chart parser, prints the number of edges in the resulting chart,
    and then prints and draws every parse tree that was found.
    """
    from nltk.grammar import CFG
    from nltk.parse.chart import ChartParser, BU_LC_STRATEGY

    # BNF-style grammar: the start symbol is S, the terminals are the quoted words.
    toy_grammar = CFG.fromstring("""
    S -> T1 T4
    T1 -> NNP VBZ
    T2 -> DT NN
    T3 ->IN NNP
    T4 -> T3 | T2 T3
    NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
    VBZ -> 'is'
    IN -> 'in' | 'of'
    DT -> 'the'
    NN -> 'capital'
    """)

    # trace=True makes the parser print its analysis steps.
    # NOTE(review): BU_LC_STRATEGY is reportedly the default strategy, so
    # passing it explicitly is probably optional -- confirm before removing.
    parser = ChartParser(toy_grammar, BU_LC_STRATEGY, trace=True)

    words = 'Bangalore is the capital of Karnataka'.split()

    # Parse the token list and keep the populated chart object.
    filled_chart = parser.chart_parse(words)

    # Collect every parse tree rooted at the grammar's start symbol.
    trees = list(filled_chart.parses(toy_grammar.start()))

    # Report how many edges the chart holds in total.
    print('Total Edges:', len(filled_chart.edges()))

    for parse_tree in trees:
        print(parse_tree)
        parse_tree.draw()
def __init__(self, **parser_args):
    """
    Constructor. Initializes a MbmaParser.

    Delegates to ``ChartParser.__init__``, passing an empty list in the
    grammar position and ``BU_RHR_STRATEGY`` as the strategy.

    Args:
        - parser_args: extra keyword arguments forwarded unchanged to
          ``ChartParser.__init__``.
          NOTE(review): the previous documentation said a keyword
          ``grammar`` of type :class:`ContextFreeGrammar` is needed; an
          empty list is already passed positionally here, so confirm this
          against the ``ChartParser`` signature actually in use.
    """
    placeholder_grammar = []
    ChartParser.__init__(
        self,
        placeholder_grammar,
        BU_RHR_STRATEGY,
        **parser_args
    )
def generate_context_free_grammar_novel_text(
    self, number_of_words_in_sentence=0, number_of_sentences_per_record=0, number_of_records=0
):
    """
    Generate novel text from the context-free grammar held in ``self._corpus``.

    ``self._corpus`` may be an NLTK grammar object (``CFG``, ``FeatureGrammar``
    or ``PCFG``) or grammar source text, which is compiled with
    ``CFG.fromstring``. Each sentence has its punctuation stripped, gets a
    randomly chosen terminator (". ", "! " or "? ") appended, and is
    capitalized. Sentences are concatenated into records.

    Any exception raised while building the grammar or generating text is
    logged through ``self.logger`` and not re-raised; whatever records were
    completed before the failure are still returned.

    @param number_of_words_in_sentence: Forwarded to ``generate_text`` as its
        ``n`` argument.
    @type number_of_words_in_sentence: int

    @param number_of_sentences_per_record: Number of sentences concatenated
        into each record.
    @type number_of_sentences_per_record: int

    @param number_of_records: Total number of records to generate.
    @type number_of_records: int

    @return: The generated records joined by newlines; an empty string when
        nothing was generated.
    @rtype: str
    """
    words = []
    punct_selector = [". ", "! ", "? "]
    # Translation table that deletes every ASCII punctuation character.
    punctuation_stop_symbols = dict((ord(char), None) for char in string.punctuation)

    try:
        corpus = self._corpus
        # A ready-made grammar object is used directly. The earlier detour
        # through a throw-away parser only handed the same grammar back, and
        # in the CFG case it stored the bound ``grammar`` method instead of
        # calling it, so generation always failed for CFG input.
        if isinstance(corpus, (CFG, FeatureGrammar, PCFG)):
            grammar = corpus
        else:
            grammar = CFG.fromstring(corpus)

        for _ in range(number_of_records):
            novel_sentence = []
            for _ in range(number_of_sentences_per_record):
                # NOTE(review): assumes generate_text yields strings -- confirm.
                sentence = " ".join(generate_text(grammar, depth=2, n=number_of_words_in_sentence))
                sentence = sentence.translate(punctuation_stop_symbols) + random.choice(punct_selector)
                novel_sentence.append(sentence.capitalize())
            words.append("".join(novel_sentence))
    except Exception as error:
        # Best-effort generation: report the failure and fall through to
        # return whatever was produced so far.
        self.logger.error(
            "TextGenerator.generate_context_free_grammar_novel_text: Error occurred - {0}".format(str(error))
        )

    return "\n".join(words)
def __init__(self, grammar, strategy=BU_LC_FEATURE_STRATEGY, trace_chart_width=20, chart_class=FeatureChart, **parser_args):
    """Initialise the parser by delegating to ``ChartParser.__init__``.

    :param grammar: passed positionally to ``ChartParser.__init__``.
    :param strategy: forwarded as the ``strategy`` keyword; defaults to
        ``BU_LC_FEATURE_STRATEGY``.
    :param trace_chart_width: forwarded as the ``trace_chart_width``
        keyword; defaults to 20.
    :param chart_class: forwarded as the ``chart_class`` keyword; defaults
        to ``FeatureChart``.
    :param parser_args: any further keyword arguments, forwarded unchanged.
    """
    # Fold the named options and the pass-through keywords into one mapping
    # so the base initialiser is called with a single keyword expansion.
    base_options = dict(parser_args)
    base_options.update(
        strategy=strategy,
        trace_chart_width=trace_chart_width,
        chart_class=chart_class,
    )
    ChartParser.__init__(self, grammar, **base_options)
def generate_context_free_grammar_novel_text(
        self, corpus, number_of_words_in_sentence,
        number_of_sentences_per_record, number_of_records):
    '''
    Generate novel text from a context-free grammar.

    ``corpus`` may be an NLTK grammar object (``CFG``, ``FeatureGrammar`` or
    ``PCFG``) or grammar source text, which is compiled with
    ``CFG.fromstring``. Each sentence has its punctuation stripped, gets a
    randomly chosen terminator ('. ', '! ' or '? ') appended, and is
    capitalized. Sentences are concatenated into records.

    Any exception raised while building the grammar or generating text is
    logged with ``logging.error`` and not re-raised; whatever records were
    completed before the failure are still returned.

    Params:
    -------
    - corpus: A grammar object or grammar source text (see above).
    - number_of_words_in_sentence (int): Forwarded to ``generate_text`` as
      its ``n`` argument.
    - number_of_sentences_per_record (int): Number of sentences
      concatenated into each record.
    - number_of_records (int): Total number of records to generate.

    Returns: str -- the generated records joined by newlines; an empty
    string when nothing was generated.
    '''
    words = []
    punct_selector = ['. ', '! ', '? ']
    # Translation table that deletes every ASCII punctuation character.
    punctuation_stop_symbols = dict(
        (ord(char), None) for char in string.punctuation)

    try:
        # A ready-made grammar object is used directly. The earlier detour
        # through a throw-away parser only handed the same grammar back, and
        # in the CFG case it stored the bound ``grammar`` method instead of
        # calling it, so generation always failed for CFG input.
        if isinstance(corpus, (CFG, FeatureGrammar, PCFG)):
            grammar = corpus
        else:
            grammar = CFG.fromstring(corpus)

        for _ in range(number_of_records):
            novel_sentence = []
            for _ in range(number_of_sentences_per_record):
                # NOTE(review): assumes generate_text yields strings -- confirm.
                sentence = ' '.join(
                    generate_text(
                        grammar, depth=2, n=number_of_words_in_sentence))
                sentence = sentence.translate(
                    punctuation_stop_symbols) + random.choice(punct_selector)
                novel_sentence.append(sentence.capitalize())
            words.append(''.join(novel_sentence))
    except Exception as error:
        # Best-effort generation: report the failure and fall through to
        # return whatever was produced so far.
        logging.error('TextGenerator: Error occurred - {0}'.format(
            str(error)))

    return '\n'.join(words)
from nltk.grammar import CFG
from nltk.parse.chart import ChartParser, BU_LC_STRATEGY

# Toy grammar: S is the start symbol and the quoted words are the terminals.
GRAMMAR_TEXT = """
S -> T1 T4
T1 -> NNP VBZ
T2 -> DT NN
T3 -> IN NNP
T4 -> T3 | T2 T3
NNP -> 'Tajmahal' | 'Agra' | 'Bangalore' | 'Karnataka'
VBZ -> 'is'
IN -> 'in' | 'of'
DT -> 'the'
NN -> 'capital'
"""
grammar = CFG.fromstring(GRAMMAR_TEXT)

# Chart parser using the BU_LC_STRATEGY; trace=True prints the parsing steps.
cp = ChartParser(grammar, BU_LC_STRATEGY, trace=True)

# Whitespace-tokenize the sample sentence and fill the chart.
sentence = "Bangalore is the capital of Karnataka"
tokens = sentence.split()
chart = cp.chart_parse(tokens)

# Gather every parse tree rooted at the grammar's start symbol.
parses = list(chart.parses(grammar.start()))

edge_count = len(chart.edges())
print("Total Edges :", edge_count)

# Print each tree as text, then draw it.
for tree in parses:
    print(tree)
    tree.draw()