Python Token.stopped примеры, whoosh.analysis.acore.Token.stopped Python примеры использования

Пример #1

0

Показать файл

Файл: ngrams.py Проект: ChaseByInfinity/creme

    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs
    ):
        assert isinstance(value, text_type), "%r is not unicode" % value

        inlen = len(value)
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query":
            size = min(self.max, inlen)
            for start in xrange(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
        else:
            for start in xrange(0, inlen - self.min + 1):
                for size in xrange(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                        continue
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t
                pos += 1

Пример #2

0

Показать файл

Файл: ngrams.py Проект: DR08/whoosh

    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value

        inlen = len(value)
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query" and self.reduce_for_query:
            size = min(self.max, inlen)
            for start in xrange(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
        else:
            for start in xrange(0, inlen - self.min + 1):
                for size in xrange(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                        continue
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t
                pos += 1

Пример #3

0

Показать файл

Файл: ast_parser.py Проект: matiaslindgren/spaghetti-search

 def __call__(self,
              source_string,
              positions=False,
              chars=False,
              keeporiginal=False,
              start_pos=0,
              removestops=False,
              start_char=0,
              tokenize=True,
              mode='',
              **kwargs):
     t = Token(positions, chars, removestops=removestops, mode=mode)
     all_subtrees = dump(ast.parse(source_string),
                         **self.string_dump_options)
     full_tree = next(all_subtrees)
     for pos, subtree in enumerate(
             itertools.chain((full_tree, ), all_subtrees)):
         t.text = subtree
         t.boost = 1.0
         if keeporiginal:
             t.original = t.text
         t.stopped = False
         if positions:
             t.pos = start_pos + pos
         if chars:
             t.startchar = start_char + full_tree.index(subtree)
             t.endchar = start_char + t.startchar + len(subtree)
         yield t

Пример #4

0

Показать файл

Файл: TIMEindex.py Проект: fbkarsdorp/takeover

    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # The default: expression matches are used as tokens
        for i, match in enumerate(value.split('\n')):
            fields = match.strip().split('\t')
            word, lemma, pos, ne = fields if len(fields) is 4 else ["", "", "", ""]
            t.text = match.strip().split('\t')[0]
            t.lemma = lemma
            t.part_of_speech = pos
            t.named_entity = ne
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + i
            if chars:
                t.startchar = start_char + match.start()
                t.endchar = start_char + match.end()
            yield t

Пример #5

0

Показать файл

Файл: search.py Проект: kirberich/potato-assignment

 def __call__(self,
              value,
              positions=False,
              chars=False,
              keeporiginal=False,
              removestops=True,
              start_pos=0,
              start_char=0,
              mode='',
              **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions,
               chars,
               removestops=removestops,
               mode=mode,
               **kwargs)
     for pos, match in enumerate(value.split(self.separator)):
         t.text = match
         t.boost = 1.0
         if keeporiginal:
             t.original = t.text
         t.stopped = False
         if positions:
             t.pos = start_pos + pos
         if chars:
             t.startchar = start_char + match.start()
             t.endchar = start_char + match.end()
         yield t

Пример #6

0

Показать файл

    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        # The default: expression matches are used as tokens
        for i, match in enumerate(value.split('\n')):
            fields = match.strip().split('\t')
            word, lemma, pos, ne = fields if len(fields) is 4 else [
                "", "", "", ""
            ]
            t.text = match.strip().split('\t')[0]
            t.lemma = lemma
            t.part_of_speech = pos
            t.named_entity = ne
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + i
            if chars:
                t.startchar = start_char + match.start()
                t.endchar = start_char + match.end()
            yield t

Пример #7

0

Показать файл

Файл: tokenizers.py Проект: sainjusajan/django-oscar

    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(self.expression.finditer(value)):
                t.text = match.group(0)
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + match.start()
                    t.endchar = start_char + match.end()
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t

                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t

Пример #8

0

Показать файл

Файл: tokenizers.py Проект: MNI-NIL/NIL-MNI.github.io

    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        tokenize=True,
        mode="",
        **kwargs
    ):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(self.expression.finditer(value)):
                t.text = match.group(0)
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + match.start()
                    t.endchar = start_char + match.end()
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t

                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t

Python Token.stopped примеры использования