예제 #1
0
    def __init__(self, pcfg=None):
        """Initialize parser state, building a default PCFG model when none is given.

        Args:
            pcfg: optional pre-built PCFG; falls back to ``build_model()``.
        """
        self.pcfg = build_model() if pcfg is None else pcfg
        self.tokenizer = PennTreebankTokenizer()
        # Route parse() through NLTK tree conversion only when nltk is importable.
        self.parse = self.nltk_parse if nltk_is_available else self.raw_parse
예제 #2
0
class Parser(object):
    """Sentence parser backed by a PCFG, with optional NLTK tree output."""

    def __init__(self, pcfg=None):
        """Set up the model, tokenizer, and the public ``parse`` entry point.

        Args:
            pcfg: optional pre-built PCFG; falls back to ``build_model()``.
        """
        if pcfg is None:
            pcfg = build_model()

        self.pcfg = pcfg
        self.tokenizer = PennTreebankTokenizer()

        # Expose NLTK trees when nltk is installed, raw trees otherwise.
        if nltk_is_available:
            self.parse = self.nltk_parse
        else:
            self.parse = self.raw_parse

    def norm_parse(self, sentence):
        """Tokenize *sentence*, normalize rare words, and run CKY parsing."""
        words = self.tokenizer.tokenize(sentence)
        # Lowercase a leading capitalized word. Guard against an empty token
        # list (empty / whitespace-only sentence) to avoid an IndexError.
        if words and is_cap_word(words[0]):
            words[0] = words[0].lower()

        norm_words = []
        for word in words:
            if isinstance(word, tuple):
                # This is already a word normalized to the Treebank conventions
                norm_words.append(word)
            else:
                # Rare-word normalization: pair the normalized form with the
                # original surface form.
                norm_words.append((self.pcfg.norm_word(word), word))
        return CKY(self.pcfg, norm_words)

    def raw_parse(self, sentence):
        """Parse *sentence* and undo the Chomsky normal form on the result."""
        tree = self.norm_parse(sentence)
        un_chomsky_normal_form(tree)
        return tree

    def nltk_parse(self, sentence):
        """Parse *sentence* and wrap the resulting tree as an nltk tree."""
        return nltk_tree(self.raw_parse(sentence))
예제 #3
0
 def __init__(self, pcfg=None):
     """Initialize the parser with a PCFG and a Penn Treebank tokenizer.

     Args:
         pcfg: optional pre-built PCFG; a default model is built when omitted.
     """
     self.pcfg = pcfg if pcfg is not None else build_model()
     self.tokenizer = PennTreebankTokenizer()
     # Prefer NLTK tree output when the nltk dependency is available.
     if nltk_is_available:
         self.parse = self.nltk_parse
     else:
         self.parse = self.raw_parse
예제 #4
0
 def tokenize_for_parsing(string):
     """Tokenize *string* with Penn Treebank conventions for the parser."""
     return PennTreebankTokenizer().tokenize(string)