def __init__(self, pcfg=None):
    """Initialize the parser with a grammar model and a tokenizer.

    Falls back to build_model() when no PCFG is supplied, and binds
    self.parse to the NLTK-backed variant when NLTK can be imported.
    """
    self.pcfg = build_model() if pcfg is None else pcfg
    self.tokenizer = PennTreebankTokenizer()
    # Prefer NLTK tree output when the library is available.
    self.parse = self.nltk_parse if nltk_is_available else self.raw_parse
class Parser(object):
    """CKY parser over a probabilistic context-free grammar (PCFG).

    Sentences are tokenized with Penn Treebank conventions, rare words
    are normalized through the grammar model, and the CKY chart parse is
    converted back out of Chomsky normal form before being returned.
    """

    def __init__(self, pcfg=None):
        """Set up the grammar model and tokenizer.

        Uses build_model() when no PCFG is given; self.parse points at
        the NLTK-producing variant when NLTK is importable.
        """
        self.pcfg = build_model() if pcfg is None else pcfg
        self.tokenizer = PennTreebankTokenizer()
        # Emit NLTK trees when possible, raw trees otherwise.
        self.parse = self.nltk_parse if nltk_is_available else self.raw_parse

    def norm_parse(self, sentence):
        """Tokenize and normalize *sentence*, then run the CKY parse."""
        tokens = self.tokenizer.tokenize(sentence)
        # A leading capitalized word is lowercased before lookup.
        if is_cap_word(tokens[0]):
            tokens[0] = tokens[0].lower()
        # Tuples are already (normalized, original) pairs per Treebank
        # conventions; plain strings get rare-word normalization.
        normalized = [
            tok if isinstance(tok, tuple) else (self.pcfg.norm_word(tok), tok)
            for tok in tokens
        ]
        return CKY(self.pcfg, normalized)

    def raw_parse(self, sentence):
        """Parse *sentence* and undo the Chomsky-normal-form transform."""
        tree = self.norm_parse(sentence)
        un_chomsky_normal_form(tree)
        return tree

    def nltk_parse(self, sentence):
        """Parse *sentence* and return the result as an NLTK tree."""
        return nltk_tree(self.raw_parse(sentence))
def __init__(self, pcfg=None):
    """Store the grammar model and create the Treebank tokenizer.

    When *pcfg* is None a default model is built via build_model().
    """
    if pcfg is None:
        pcfg = build_model()
    self.pcfg = pcfg
    self.tokenizer = PennTreebankTokenizer()
    # Choose the parse entry point based on NLTK availability.
    if nltk_is_available:
        self.parse = self.nltk_parse
    else:
        self.parse = self.raw_parse
def tokenize_for_parsing(string):
    """Tokenize *string* using Penn Treebank tokenization conventions."""
    return PennTreebankTokenizer().tokenize(string)