def parse(self, sentence, rerank='auto', sentence_id=None):
    """Parse some text or tokens and return an NBestList with the
    results. sentence can be a string or a sequence. If it is a
    string, it will be tokenized. If rerank is True, we will rerank
    the n-best list, if False the reranker will not be used. rerank
    can also be set to 'auto' which will only rerank if a reranker
    model is loaded. If there are no parses or an error occurs, this
    will return an empty NBestList."""
    rerank = self.check_models_loaded_or_error(rerank)
    sentence = Sentence(sentence)
    # max_sentence_length is actually 1 longer than the maximum
    # allowed sentence length
    longest_allowed = parser.max_sentence_length - 1
    if len(sentence) >= longest_allowed:
        raise ValueError("Sentence is too long (%s tokens, must be "
                         "under %s)" % (len(sentence), longest_allowed))
    try:
        parses = parser.parse(sentence.sentrep)
    except RuntimeError:
        # parser failure: fall through with no parses so callers
        # always get an NBestList back
        parses = []
    nbest_list = NBestList(sentence, parses, sentence_id)
    if rerank:
        nbest_list.rerank(self)
    return nbest_list
def parse_tagged(self, tokens, possible_tags, rerank='auto'):
    """Parse some pre-tagged, pre-tokenized text.

    tokens must be a sequence of strings. possible_tags is map from
    token indices to possible POS tags (strings). Tokens without an
    entry in possible_tags will be unconstrained by POS. POS tags
    must be in the terms.txt file in the parsing model or else you
    will get a ValueError. If rerank is True, we will rerank the
    n-best list, if False the reranker will not be used. rerank can
    also be set to 'auto' which will only rerank if a reranker model
    is loaded.

    Raises ValueError if tokens is a string or if any requested POS
    tag is unknown to the parsing model."""
    rerank = self._check_loaded_models(rerank)
    if isinstance(tokens, basestring):
        raise ValueError("tokens must be a sequence, not a string.")
    ext_pos = parser.ExtPos()
    for index in range(len(tokens)):
        tags = possible_tags.get(index, [])
        if isinstance(tags, basestring):
            tags = [tags]
        # Use a list comprehension rather than map() so that tags is a
        # real list: it is iterated once by VectorString() and possibly
        # again in the error path below, and map() returns a one-shot
        # iterator under Python 3 which would already be exhausted.
        tags = [str(tag) for tag in tags]
        valid_tags = ext_pos.addTagConstraints(parser.VectorString(tags))
        if not valid_tags:
            # at least one of the tags is bad -- find out which ones
            # and throw a ValueError
            self._find_bad_tag_and_raise_error(tags)
    sentence = Sentence(tokens)
    parses = parser.parse(sentence.sentrep, ext_pos,
                          self._parser_thread_slot)
    nbest_list = NBestList(sentence, parses)
    if rerank:
        nbest_list.rerank(self)
    return nbest_list
def parse(self, sentence, rerank=True, max_sentence_length=399):
    """Parse some text or tokens and return an NBestList with the
    results. sentence can be a string or a sequence. If it is a
    string, it will be tokenized. If rerank is True, we will rerank
    the n-best list."""
    self.check_loaded_models(rerank)
    sentence = Sentence(sentence, max_sentence_length)
    # treat a parser failure exactly like "no parses found"
    try:
        all_parses = parser.parse(sentence.sentrep,
                                  self._parser_thread_slot)
    except RuntimeError:
        all_parses = []
    results = NBestList(sentence, all_parses)
    if rerank:
        results.rerank(self)
    return results
def parse(self, sentence, rerank='auto', max_sentence_length=399):
    """Parse some text or tokens and return an NBestList with the
    results. sentence can be a string or a sequence. If it is a
    string, it will be tokenized. If rerank is True, we will rerank
    the n-best list, if False the reranker will not be used. rerank
    can also be set to 'auto' which will only rerank if a reranker
    model is loaded."""
    # resolve 'auto' into a concrete True/False decision
    rerank = self._check_loaded_models(rerank)
    sentence = Sentence(sentence, max_sentence_length)
    try:
        found_parses = parser.parse(sentence.sentrep,
                                    self._parser_thread_slot)
    except RuntimeError:
        # parser failure: return an empty n-best list instead
        found_parses = []
    results = NBestList(sentence, found_parses)
    if rerank:
        results.rerank(self)
    return results
def parse_tagged(self, tokens, possible_tags, rerank=True):
    """Parse some pre-tagged, pre-tokenized text. tokens is a
    sequence of strings. possible_tags is map from token indices to
    possible POS tags. Tokens without an entry in possible_tags will
    be unconstrained by POS. If rerank is True, we will rerank the
    n-best list."""
    self.check_loaded_models(rerank)
    ext_pos = parser.ExtPos()
    # register a (possibly empty) tag constraint for every token
    for index, _token in enumerate(tokens):
        constraint = possible_tags.get(index, [])
        if isinstance(constraint, basestring):
            # a single tag was given; wrap it in a list
            constraint = [constraint]
        ext_pos.addTagConstraints(parser.VectorString(constraint))
    sentence = Sentence(tokens)
    all_parses = parser.parse(sentence.sentrep, ext_pos,
                              self._parser_thread_slot)
    results = NBestList(sentence, all_parses)
    if rerank:
        results.rerank(self)
    return results
def parse_constrained(self, tokens, constraints, possible_tags=None,
                      rerank='auto', sentence_id=None):
    """Parse pre-tokenized text with part of speech and/or phrasal
    constraints. Constraints is a dictionary of {(start, end): [terms]}
    which represents the constraint that all spans between [start,end)
    must be one of the terms in that list. start and end are integers
    and terms can be a single string or a list of strings. This also
    allows you to incorporate external POS tags as in parse_tagged().
    While you can specify a constraint or an external POS tag for a
    word, the semantics are slightly different. Setting a tag with
    possible_tags will allow you to force a word to be a POS tag that
    the parser's tagger would not ordinarily use for a tag. Setting a
    constraint with constraints would only limit the set of allowable
    tags. Additionally, setting constraints doesn't change the
    probability of the final tree whereas setting possible_tags changes
    the probabilities of words given tags and may change the overall
    probability.

    The rerank flag is the same as in parse().

    Returns an NBestList (empty if no parse could be found even after
    relaxing the constraints). Raises ValueError if tokens is a string
    or if a constraint span is empty or inverted."""
    rerank = self.check_models_loaded_or_error(rerank)
    if isinstance(tokens, basestring):
        raise ValueError("tokens must be a sequence, not a string.")
    if constraints:
        # convert the {(start, end): terms} dict into the parser's
        # native LabeledSpans constraint object
        span_constraints = parser.LabeledSpans()
        for (start, end), terms in constraints.items():
            if end <= start:
                raise ValueError("End must be at least start + 1:"
                                 "(%r, %r) -> %r" % (start, end, terms))
            # since Tree.label currently returns a DeprecatedGetter,
            # we take some extra steps to get these back to strings
            # to avoid type errors
            if isinstance(terms, (basestring, DeprecatedGetter)):
                terms = [str(terms)]
            for term in terms:
                span_constraints.addConstraint(int(start), int(end),
                                               str(term))
    else:
        span_constraints = None
    possible_tags = possible_tags or {}
    ext_pos = self._possible_tags_to_ext_pos(tokens, possible_tags)
    sentence = Sentence(tokens)
    try:
        parses = parser.parse(sentence.sentrep, ext_pos,
                              span_constraints)
        # a constrained parse can come back empty; deliberately raise
        # so the except clause below retries with relaxed constraints
        if constraints and not parses:
            raise RuntimeError("Reparsing with relaxed constraints")
    except RuntimeError:
        # reached on a hard parser failure OR the deliberate raise above
        if span_constraints:
            # we should relax them and retry
            span_constraints.minSizeForParsing = 2
            try:
                parses = parser.parse(sentence.sentrep, ext_pos,
                                      span_constraints)
            except RuntimeError:
                # still unparseable even with relaxed constraints
                parses = []
        else:
            parses = []
    nbest_list = NBestList(sentence, parses, sentence_id)
    if rerank:
        nbest_list.rerank(self)
    return nbest_list