# Persist the trained tagger model ("m") next to this script file.
# final=True tells the perceptron to average/finalize its weights before saving.
f = os.path.join(os.path.dirname(__file__), "en-model.slp")
m.save(f, final=True)

# Each parser in Pattern (pattern.en, pattern.es, pattern.it, ...)
# assumes that a lexicon of known words and their most frequent tag is available,
# along with some rules for morphology (suffixes, e.g., -ly = adverb)
# and context (surrounding words) for unknown words.
# If a language model is also available, it overrides these (simpler) rules.
# For English, this can raise accuracy from about 94% up to about 97%,
# and makes the parses about 3x faster.

print("loading model...")
# Reload the saved model and attach it to the lexicon so the parser uses it.
f = os.path.join(os.path.dirname(__file__), "en-model.slp")
lexicon.model = Model.load(lexicon, f)

# To test the accuracy of the language model,
# we can compare a tagged corpus to the predicted tags.
# This corpus must be different from the one used for training.
# Typically, sections 22, 23 and 24 of the WSJ are used.
# Note that the WSJ contains standardized English.
# The accuracy will be lower when tested on, for example, informal tweets.
# A different classifier could be trained for informal language use.

print("testing...")
# i = correctly tagged words, n = total words (counters updated further below).
i, n = 0, 0
# Evaluate on the held-out tail of the corpus; each s1 is a list of
# (word, tag) pairs. s2 is the untagged sentence re-joined as plain text.
# NOTE(review): the loop body continues past this chunk.
for s1 in data[-5000:]:
    s2 = " ".join(w for w, tag in s1)
# Persist the trained tagger model ("m") next to this script file.
# final=True tells the perceptron to average/finalize its weights before saving.
f = os.path.join(os.path.dirname(__file__), "en-model.slp")
m.save(f, final=True)

# Each parser in Pattern (pattern.en, pattern.es, pattern.it, ...)
# assumes that a lexicon of known words and their most frequent tag is available,
# along with some rules for morphology (suffixes, e.g., -ly = adverb)
# and context (surrounding words) for unknown words.
# If a language model is also available, it overrides these (simpler) rules.
# For English, this can raise accuracy from about 94% up to about 97%,
# and makes the parses about 3x faster.

print("loading model...")
# Reload the saved model and attach it to the lexicon so the parser uses it.
# BUG FIX: the arguments were swapped (Model.load(f, lexicon)); Pattern's API
# and the sibling copies of this script take (lexicon, path).
f = os.path.join(os.path.dirname(__file__), "en-model.slp")
lexicon.model = Model.load(lexicon, f)

# To test the accuracy of the language model,
# we can compare a tagged corpus to the predicted tags.
# This corpus must be different from the one used for training.
# Typically, sections 22, 23 and 24 of the WSJ are used.
# Note that the WSJ contains standardized English.
# The accuracy will be lower when tested on, for example, informal tweets.
# A different classifier could be trained for informal language use.

print("testing...")
# i = correctly tagged words, n = total words (counters updated further below).
i, n = 0, 0
# Evaluate on the held-out tail of the corpus; each s1 is a list of
# (word, tag) pairs. s2 is the untagged sentence re-joined as plain text.
# NOTE(review): the loop body continues past this chunk.
for s1 in data[-5000:]:
    s2 = " ".join(w for w, tag in s1)
# NOTE(review): `next` shadows the builtin of the same name; kept as-is in case
# later (unseen) code in this file references it — consider renaming.
next = None

# Persist the trained tagger model ("m") in the current working directory.
# final=True tells the perceptron to average/finalize its weights before saving.
m.save("en-model.slp", final=True)

# Each parser in Pattern (pattern.en, pattern.es, pattern.it, ...)
# assumes that a lexicon of known words and their most frequent tag is available,
# along with some rules for morphology (suffixes, e.g., -ly = adverb)
# and context (surrounding words) for unknown words.
# If a language model is also available, it overrides these (simpler) rules.
# For English, this can raise accuracy from about 94% up to about 97%,
# and makes the parses about 3x faster.

# Python 2 `print` statements replaced with the call form, which behaves
# identically for a single string argument and also runs under Python 3,
# matching the style of the other copies of this script.
print("loading model...")
# Reload the saved model and attach it to the lexicon so the parser uses it.
lexicon.model = Model.load(lexicon, "en-model.slp")

# To test the accuracy of the language model,
# we can compare a tagged corpus to the predicted tags.
# This corpus must be different from the one used for training.
# Typically, sections 22, 23 and 24 of the WSJ are used.
# Note that the WSJ contains standardized English.
# The accuracy will be lower when tested on, for example, informal tweets.
# A different classifier could be trained for informal language use.

print("testing...")
# i = correctly tagged words, n = total words (counters updated further below).
i, n = 0, 0
# Evaluate on the held-out tail of the corpus; each s1 is a list of
# (word, tag) pairs. s2 is the untagged sentence re-joined as plain text.
# NOTE(review): the loop body continues past this chunk.
for s1 in data[-5000:]:
    s2 = " ".join(w for w, tag in s1)