Exemplo n.º 1
1
    def parse_tagged(self, tokens, possible_tags, rerank='auto'):
        """Parse some pre-tagged, pre-tokenized text.

        tokens must be a sequence of strings; passing a bare string
        raises a ValueError. possible_tags is a map from token indices
        to possible POS tags, where each value is either a single tag
        string or a sequence of tags. Tokens without an entry in
        possible_tags will be unconstrained by POS. POS tags must be
        in the terms.txt file in the parsing model or else you will get
        a ValueError.

        If rerank is True, we will rerank the n-best list, if False the
        reranker will not be used. rerank can also be set to 'auto'
        which will only rerank if a reranker model is loaded.

        Returns the NBestList built from the resulting parses."""
        rerank = self._check_loaded_models(rerank)
        if isinstance(tokens, basestring):
            raise ValueError("tokens must be a sequence, not a string.")

        ext_pos = parser.ExtPos()
        for index in range(len(tokens)):
            tags = possible_tags.get(index, [])
            if isinstance(tags, basestring):
                # a lone tag is shorthand for a one-element list
                tags = [tags]
            # Materialize a real list rather than using map(): the tags
            # are read twice below (once to build the VectorString and
            # once more when reporting bad tags), and a lazy map object
            # would already be exhausted on the second read.
            tags = [str(tag) for tag in tags]
            valid_tags = ext_pos.addTagConstraints(parser.VectorString(tags))
            if not valid_tags:
                # at least one of the tags is bad -- find out which ones
                # and throw a ValueError
                self._find_bad_tag_and_raise_error(tags)

        sentence = Sentence(tokens)
        parses = parser.parse(sentence.sentrep, ext_pos,
                              self._parser_thread_slot)
        nbest_list = NBestList(sentence, parses)
        if rerank:
            nbest_list.rerank(self)
        return nbest_list
Exemplo n.º 2
0
    def parse(self, sentence, rerank='auto', sentence_id=None):
        """Parse text or tokens and return the results as an NBestList.

        sentence can be a string or a sequence; if it is a string, it
        will be tokenized. rerank controls the reranker: True reranks
        the n-best list, False skips the reranker, and 'auto' reranks
        only if a reranker model is loaded. sentence_id is handed to
        the NBestList unchanged.

        Raises a ValueError when the sentence is too long. If the
        parser raises a RuntimeError, the returned NBestList is built
        from an empty list of parses."""
        use_reranker = self.check_models_loaded_or_error(rerank)
        wrapped = Sentence(sentence)

        # max_sentence_length is actually 1 longer than the maximum
        # allowed sentence length
        length_limit = parser.max_sentence_length - 1
        token_count = len(wrapped)
        if token_count >= length_limit:
            raise ValueError("Sentence is too long (%s tokens, must be "
                             "under %s)" % (token_count, length_limit))

        try:
            candidates = parser.parse(wrapped.sentrep)
        except RuntimeError:
            # a parser failure is reported as "no parses"
            candidates = []

        results = NBestList(wrapped, candidates, sentence_id)
        if use_reranker:
            results.rerank(self)
        return results
Exemplo n.º 3
0
    def parse(self, sentence, rerank='auto', sentence_id=None):
        """Run the parser over some text or tokens.

        sentence is either a string (which will be tokenized) or a
        sequence. With rerank=True the n-best list is reranked, with
        rerank=False the reranker is not used, and with rerank='auto'
        reranking only happens if a reranker model is loaded.
        sentence_id is forwarded to the resulting NBestList.

        A ValueError is raised for sentences that are too long. When
        the parser raises a RuntimeError, an NBestList with no parses
        is returned instead."""
        should_rerank = self.check_models_loaded_or_error(rerank)

        sent = Sentence(sentence)
        num_tokens = len(sent)
        # max_sentence_length is actually 1 longer than the maximum
        # allowed sentence length
        max_tokens = parser.max_sentence_length - 1
        if num_tokens >= max_tokens:
            message = ("Sentence is too long (%s tokens, must be "
                       "under %s)" % (num_tokens, max_tokens))
            raise ValueError(message)

        try:
            found = parser.parse(sent.sentrep)
        except RuntimeError:
            # treat a failed parse the same as finding nothing
            found = []

        nbest = NBestList(sent, found, sentence_id)
        if not should_rerank:
            return nbest
        nbest.rerank(self)
        return nbest
Exemplo n.º 4
0
    def parse(self, sentence, rerank=True, max_sentence_length=399):
        """Parse text or tokens and return the results as an NBestList.

        sentence can be a string or a sequence; if it is a string, it
        will be tokenized. max_sentence_length is passed along to the
        Sentence constructor. If rerank is True, the n-best list is
        reranked before it is returned. If the parser raises a
        RuntimeError, the NBestList is built from an empty list of
        parses."""
        self.check_loaded_models(rerank)

        wrapped = Sentence(sentence, max_sentence_length)
        try:
            candidates = parser.parse(wrapped.sentrep,
                                      self._parser_thread_slot)
        except RuntimeError:
            # a parser failure is reported as "no parses"
            candidates = []

        results = NBestList(wrapped, candidates)
        if not rerank:
            return results
        results.rerank(self)
        return results
Exemplo n.º 5
0
    def parse(self, sentence, rerank=True, max_sentence_length=399):
        """Run the parser over some text or tokens.

        sentence is either a string (which will be tokenized) or a
        sequence. It is wrapped in a Sentence together with
        max_sentence_length. When rerank is True the resulting n-best
        list is reranked. A RuntimeError from the parser results in an
        NBestList with no parses."""
        self.check_loaded_models(rerank)
        sent = Sentence(sentence, max_sentence_length)

        found = []
        try:
            found = parser.parse(sent.sentrep, self._parser_thread_slot)
        except RuntimeError:
            # keep the empty default when the parser fails
            pass

        nbest = NBestList(sent, found)
        if rerank:
            nbest.rerank(self)
        return nbest
Exemplo n.º 6
0
    def parse(self, sentence, rerank='auto', max_sentence_length=399):
        """Parse text or tokens and return the results as an NBestList.

        sentence can be a string or a sequence; if it is a string, it
        will be tokenized. max_sentence_length is passed along to the
        Sentence constructor. rerank controls the reranker: True
        reranks the n-best list, False skips the reranker, and 'auto'
        reranks only if a reranker model is loaded. If the parser
        raises a RuntimeError, the NBestList is built from an empty
        list of parses."""
        use_reranker = self._check_loaded_models(rerank)

        wrapped = Sentence(sentence, max_sentence_length)
        try:
            candidates = parser.parse(wrapped.sentrep,
                                      self._parser_thread_slot)
        except RuntimeError:
            # a parser failure is reported as "no parses"
            candidates = []

        results = NBestList(wrapped, candidates)
        if use_reranker:
            results.rerank(self)
        return results
Exemplo n.º 7
0
    def parse_tagged(self, tokens, possible_tags, rerank=True):
        """Parse pre-tagged, pre-tokenized text into an NBestList.

        tokens is a sequence of strings. possible_tags maps token
        indices to the POS tags allowed for that token (a single tag
        string or a sequence of tags). Tokens with no entry in
        possible_tags are left unconstrained by POS. If rerank is
        True, the n-best list is reranked before it is returned."""
        self.check_loaded_models(rerank)

        tag_constraints = parser.ExtPos()
        for position in range(len(tokens)):
            allowed = possible_tags.get(position, [])
            # a lone tag is shorthand for a one-element list
            allowed = [allowed] if isinstance(allowed, basestring) \
                else allowed
            tag_constraints.addTagConstraints(parser.VectorString(allowed))

        wrapped = Sentence(tokens)
        candidates = parser.parse(wrapped.sentrep, tag_constraints,
                                  self._parser_thread_slot)
        results = NBestList(wrapped, candidates)
        if not rerank:
            return results
        results.rerank(self)
        return results
Exemplo n.º 8
0
    def parse_tagged(self, tokens, possible_tags, rerank=True):
        """Run the parser over text that is already tokenized and tagged.

        tokens is a sequence of strings and possible_tags is a map
        from token indices to possible POS tags; a value can be one
        tag string or several tags. Any token missing from
        possible_tags will be unconstrained by POS. When rerank is
        True the resulting n-best list is reranked."""
        self.check_loaded_models(rerank)

        pos_constraints = parser.ExtPos()
        for token_index in range(len(tokens)):
            candidate_tags = possible_tags.get(token_index, [])
            if isinstance(candidate_tags, basestring):
                # wrap a single tag so it is handled like a list of tags
                candidate_tags = [candidate_tags]
            tag_vector = parser.VectorString(candidate_tags)
            pos_constraints.addTagConstraints(tag_vector)

        sent = Sentence(tokens)
        found = parser.parse(sent.sentrep, pos_constraints,
                             self._parser_thread_slot)
        nbest = NBestList(sent, found)
        if rerank:
            nbest.rerank(self)
        return nbest
Exemplo n.º 9
0
    def parse_constrained(self, tokens, constraints, possible_tags=None,
                          rerank='auto', sentence_id=None):
        """Parse pre-tokenized text under part of speech and/or phrasal
        constraints.

        constraints is a dictionary of

            {(start, end): [terms]}

        meaning that every span covering [start,end) must be one of
        the terms in that list. start and end are integers (end must
        be greater than start, otherwise a ValueError is raised) and
        terms can be a single string or a list of strings.

        possible_tags lets you supply external POS tags as in
        parse_tagged(). A constraint and an external POS tag on a word
        do not mean quite the same thing. A tag given via possible_tags
        can force a word to take a POS tag that the parser's tagger
        would not ordinarily use. A constraint given via constraints
        only limits the set of allowable tags. In addition, constraints
        leave the probability of the final tree unchanged, whereas
        possible_tags changes the probabilities of words given tags and
        may change the overall probability.

        If the constrained parse raises a RuntimeError or yields no
        parses, the constraints are relaxed and the parse is tried
        once more; if that also fails, the result has no parses.

        tokens must be a sequence, not a string (ValueError otherwise).
        The rerank flag is the same as in parse()."""
        use_reranker = self.check_models_loaded_or_error(rerank)
        if isinstance(tokens, basestring):
            raise ValueError("tokens must be a sequence, not a string.")

        span_constraints = None
        if constraints:
            span_constraints = parser.LabeledSpans()
            for span, terms in constraints.items():
                start, end = span
                if end <= start:
                    raise ValueError("End must be at least start + 1:"
                                     "(%r, %r) -> %r" % (start, end, terms))
                # since Tree.label currently returns a DeprecatedGetter,
                # we take some extra steps to get these back to strings
                # to avoid type errors
                if isinstance(terms, (basestring, DeprecatedGetter)):
                    terms = [str(terms)]
                first, last = int(start), int(end)
                for term in terms:
                    span_constraints.addConstraint(first, last, str(term))

        ext_pos = self._possible_tags_to_ext_pos(tokens, possible_tags or {})
        wrapped = Sentence(tokens)

        # First attempt with the constraints as given; note whether it
        # failed outright or came back empty while constraints were set.
        retry_relaxed = False
        try:
            candidates = parser.parse(wrapped.sentrep, ext_pos,
                                      span_constraints)
            if constraints and not candidates:
                retry_relaxed = True
        except RuntimeError:
            retry_relaxed = True

        if retry_relaxed:
            candidates = []
            if span_constraints:
                # we should relax them and retry
                span_constraints.minSizeForParsing = 2
                try:
                    candidates = parser.parse(wrapped.sentrep, ext_pos,
                                              span_constraints)
                except RuntimeError:
                    # keep the empty result when the retry fails as well
                    pass

        results = NBestList(wrapped, candidates, sentence_id)
        if use_reranker:
            results.rerank(self)
        return results
Exemplo n.º 10
0
    def parse_constrained(self,
                          tokens,
                          constraints,
                          possible_tags=None,
                          rerank='auto',
                          sentence_id=None):
        """Parse pre-tokenized text, honoring POS and/or phrasal
        constraints.

        constraints is a dictionary shaped like

            {(start, end): [terms]}

        which says that all spans between [start,end) must be one of
        the listed terms. start and end are integers with end greater
        than start (a ValueError is raised otherwise); terms may be a
        single string or a list of strings.

        External POS tags can be supplied through possible_tags, as in
        parse_tagged(). The two mechanisms differ slightly: setting a
        tag with possible_tags can force a word to be a POS tag that
        the parser's tagger would not ordinarily use, while a
        constraint only limits the set of allowable tags. Constraints
        also do not change the probability of the final tree, whereas
        possible_tags changes the probabilities of words given tags
        and may change the overall probability.

        When the first parse raises a RuntimeError, or returns nothing
        although constraints were given, the constraints are relaxed
        and parsing is attempted a second time. A second failure gives
        a result with no parses.

        tokens must be a sequence, not a string (ValueError otherwise).
        The rerank flag is the same as in parse()."""
        should_rerank = self.check_models_loaded_or_error(rerank)
        if isinstance(tokens, basestring):
            raise ValueError("tokens must be a sequence, not a string.")

        labeled_spans = None
        if constraints:
            labeled_spans = parser.LabeledSpans()
            for (start, end), labels in constraints.items():
                if end <= start:
                    raise ValueError("End must be at least start + 1:"
                                     "(%r, %r) -> %r" % (start, end, labels))
                # since Tree.label currently returns a DeprecatedGetter,
                # we take some extra steps to get these back to strings
                # to avoid type errors
                single_label = isinstance(labels,
                                          (basestring, DeprecatedGetter))
                if single_label:
                    labels = [str(labels)]
                for label in labels:
                    labeled_spans.addConstraint(int(start), int(end),
                                                str(label))

        possible_tags = possible_tags or {}
        ext_pos = self._possible_tags_to_ext_pos(tokens, possible_tags)
        sent = Sentence(tokens)

        # Try the constraints exactly as given; remember whether that
        # attempt errored or produced nothing despite constraints.
        first_try_failed = False
        try:
            found = parser.parse(sent.sentrep, ext_pos, labeled_spans)
            if constraints and not found:
                first_try_failed = True
        except RuntimeError:
            first_try_failed = True

        if first_try_failed:
            found = []
            if labeled_spans:
                # we should relax them and retry
                labeled_spans.minSizeForParsing = 2
                try:
                    found = parser.parse(sent.sentrep, ext_pos,
                                         labeled_spans)
                except RuntimeError:
                    # the relaxed attempt failed too: report no parses
                    pass

        nbest = NBestList(sent, found, sentence_id)
        if not should_rerank:
            return nbest
        nbest.rerank(self)
        return nbest