Example #1
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value

        # Segment the text with SnowNLP (thanks, isnowfy!)
        s = SnowNLP(value)
        tokenlist = s.words

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            search_from = start_char
            for pos, text in enumerate(tokenlist):
                # Find the starting character of this token, searching
                # forward from the end of the previous token so repeated
                # tokens get their own offsets rather than the offset of
                # the first occurrence.
                start_char_t = value.find(text, search_from)
                search_from = start_char_t + len(text)
                t.text = text
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char_t
                    t.endchar = start_char_t + len(text)
                yield t
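A minimal usage sketch. The wrapper class name (SnowNLPTokenizer) and its construction are assumptions; all the sketch presumes is that the __call__ above lives on a Whoosh Tokenizer subclass:

# Hypothetical wrapper class around the __call__ above.
tokenizer = SnowNLPTokenizer()
for t in tokenizer(u"我爱自然语言处理", positions=True, chars=True):
    print(t.text, t.pos, t.startchar, t.endchar)

Note that each iteration yields the same mutable Token object, so consumers should read or copy its attributes before advancing the generator.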
Example #2
    def __call__(self, value, positions=False, start_pos=0, **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        token = Token(positions, **kwargs)
        pos = start_pos
        # Yield cumulative prefixes of the value: for a path like
        # "a/b/c" this produces "a", "a/b", "a/b/c".
        for match in self.expr.finditer(value):
            token.text = value[:match.end()]
            if positions:
                token.pos = pos
                pos += 1
            yield token
Example #3
    def __call__(self, value, positions=False, start_pos=0, **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        token = Token(positions, **kwargs)
        pos = start_pos
        # Same prefix-yielding pattern as Example #2.
        for match in self.expr.finditer(value):
            token.text = value[:match.end()]
            if positions:
                token.pos = pos
                pos += 1
            yield token
Example #4
    def __call__(self,
                 source_string,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 start_pos=0,
                 removestops=False,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        t = Token(positions, chars, removestops=removestops, mode=mode)
        all_subtrees = dump(ast.parse(source_string),
                            **self.string_dump_options)
        full_tree = next(all_subtrees)
        for pos, subtree in enumerate(
                itertools.chain((full_tree, ), all_subtrees)):
            t.text = subtree
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char + full_tree.index(subtree)
                # endchar is startchar plus the subtree length; adding
                # start_char again here would double-count the offset.
                t.endchar = t.startchar + len(subtree)
            yield t
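A hedged usage sketch for this AST tokenizer. The class name AstTokenizer is hypothetical, and the sketch assumes its constructor fills in self.string_dump_options for the dump() helper shown above:

# Hypothetical wrapper; assumes dump() yields one string per AST subtree.
source = "def add(a, b):\n    return a + b\n"
tokenizer = AstTokenizer()
for t in tokenizer(source, positions=True):
    print(t.pos, t.text)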
Example #5
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        # str.split() returns plain strings, which carry no offset
        # information, so track character positions manually.
        current = start_char
        for pos, text in enumerate(value.split(self.separator)):
            t.text = text
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + pos
            if chars:
                t.startchar = current
                t.endchar = current + len(text)
            current += len(text) + len(self.separator)
            yield t
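A usage sketch, assuming the surrounding class (hypothetically SeparatorTokenizer) stores the split string as self.separator:

# Hypothetical wrapper; assume it was constructed with separator=";".
tokenizer = SeparatorTokenizer(";")
for t in tokenizer(u"red;green;blue", positions=True, chars=True):
    print(t.text, t.pos, t.startchar, t.endchar)
# -> red 0 (0, 3) / green 1 (4, 9) / blue 2 (10, 14)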
Example #6
    def __call__(self, value, positions=False, start_pos=0, **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        token = Token(positions, **kwargs)
        pos = start_pos
        syns = return_synonyms(value)
        syns.append(value)
        # All synonyms are emitted at the same position, so they are
        # interchangeable at query time.
        for syn in syns:
            token.text = syn
            if positions:
                token.pos = pos
            yield token
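A sketch of query-time synonym expansion with this tokenizer (the class name is hypothetical, and return_synonyms is whatever lookup the class is built around):

tokenizer = SynonymTokenizer()  # hypothetical wrapper
for t in tokenizer(u"car", positions=True):
    print(t.text, t.pos)  # every synonym shares position start_pos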
Example #7
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        # The whole input is emitted as a single token.
        t.text = value
        t.boost = 1.0
        if keeporiginal:
            t.original = value
        if positions:
            t.pos = start_pos + 1
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
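This is essentially the body of Whoosh's IDTokenizer, which passes the entire field value through as a single token. Against the real class:

from whoosh.analysis import IDTokenizer

print([t.text for t in IDTokenizer()(u"/alpha/bravo")])
# -> ['/alpha/bravo']  (the whole value, untokenized)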
Example #8
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value

        tokenlist = []
        tokenStream = self.lanalyzer.tokenStream("contents", StringReader(value))
        tokenStream.reset()

        if len(value) > 0:
            while tokenStream.incrementToken():
                tokenlist.append(
                    tokenStream.getAttribute(CharTermAttribute.class_).toString())

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            for pos, text in enumerate(tokenlist):
                t.text = text
                if positions:
                    # Lucene does not report the original offsets here, so
                    # real position tracking is unsupported; fall back to
                    # the enumeration index.
                    t.pos = pos
                if chars:
                    # Placeholder values; real character offsets are not
                    # available from this token stream.
                    t.startchar = pos
                    t.endchar = pos
                yield t
Example #9
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value

        # Segment the text with SnowNLP (thanks, isnowfy!)
        s = SnowNLP(value)
        tokenlist = s.words

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            search_from = start_char
            for pos, text in enumerate(tokenlist):
                # Search forward from the end of the previous token so
                # repeated tokens get their own offsets rather than the
                # offset of the first occurrence.
                start_char_t = value.find(text, search_from)
                search_from = start_char_t + len(text)
                t.text = text
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char_t
                    t.endchar = start_char_t + len(text)
                yield t
Example #10
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value

        s = nielsenstemmer.stem(value, transliteration=False)
        tokenlist = s.split()

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            for pos, text in enumerate(tokenlist):
                t.text = text
                if positions:
                    # The stemmer output does not preserve offsets, so real
                    # position tracking is unsupported; fall back to the
                    # enumeration index.
                    t.pos = pos
                if chars:
                    # Placeholder values; real character offsets are not
                    # available.
                    t.startchar = pos
                    t.endchar = pos
                yield t
Example #11
    def __call__(self, value, positions=False, chars=False,
                 keeporiginal=False, removestops=True,
                 start_pos=0, start_char=0, mode='', **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        t.text = value
        t.boost = 1.0
        if keeporiginal:
            t.original = value
        if positions:
            t.pos = start_pos + 1
        if chars:
            t.startchar = start_char
            t.endchar = start_char + len(value)
        yield t
Example #12
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        tokenize=True,
        mode="",
        **kwargs
    ):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%r is not unicode" % value

        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            text = u("")
            charmap = self.charmap
            pos = start_pos
            startchar = currentchar = start_char
            for char in value:
                tchar = charmap[ord(char)]
                if tchar:
                    text += tchar
                else:
                    if currentchar > startchar:
                        t.text = text
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = t.text
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            t.startchar = startchar
                            t.endchar = currentchar
                        yield t
                    startchar = currentchar + 1
                    text = u("")

                currentchar += 1

            if currentchar > startchar:
                t.text = value[startchar:currentchar]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = startchar
                    t.endchar = currentchar
                yield t
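This matches the body of Whoosh's CharsetTokenizer: charmap maps each code point to replacement text, and a falsy entry marks a token break. A minimal sketch with a toy 256-entry map that lowercases letters and splits on everything else (the wrapper construction is an assumption):

# Toy charmap: letters map to their lowercase form, everything else to
# u"" (falsy), which ends the current token.
charmap = [u""] * 256
for c in range(256):
    if chr(c).isalpha():
        charmap[c] = chr(c).lower()

tokenizer = CharsetTokenizer(charmap)  # assumes __init__ stores self.charmap
print([t.text for t in tokenizer(u"Hello World")])
# -> ['hello', 'World']  (note: the final token is sliced from the raw
# value rather than the mapped text, a quirk of the code above)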
Example #13
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        # Each line is a tab-separated record: word, lemma, POS tag,
        # named-entity tag. Track line offsets manually, since the
        # strings returned by str.split() carry no offset information.
        prevend = 0
        for i, line in enumerate(value.split('\n')):
            fields = line.strip().split('\t')
            # Use == for value comparison; `is` only checks identity.
            word, lemma, pos, ne = fields if len(fields) == 4 else [
                "", "", "", ""
            ]
            t.text = word
            t.lemma = lemma
            t.part_of_speech = pos
            t.named_entity = ne
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + i
            if chars:
                t.startchar = start_char + prevend
                t.endchar = start_char + prevend + len(line)
            prevend += len(line) + 1
            yield t
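A usage sketch with CoNLL-style input, one tab-separated record per line (the wrapper class name is hypothetical):

data = u"dogs\tdog\tNOUN\tO\nbark\tbark\tVERB\tO"
tokenizer = ConllTokenizer()  # hypothetical wrapper for the __call__ above
for t in tokenizer(data, positions=True):
    print(t.pos, t.text, t.lemma, t.part_of_speech, t.named_entity)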
Example #14
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        mode="",
        **kwargs
    ):
        assert isinstance(value, text_type), "%r is not unicode" % value

        inlen = len(value)
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query":
            size = min(self.max, inlen)
            for start in xrange(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
        else:
            for start in xrange(0, inlen - self.min + 1):
                for size in xrange(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                        continue
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t
                pos += 1
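This mirrors Whoosh's NgramTokenizer, whose constructor sets self.min and self.max; assuming that class, the emission order interleaves sizes at each start offset:

from whoosh.analysis import NgramTokenizer

print([t.text for t in NgramTokenizer(minsize=2, maxsize=3)(u"abcd")])
# -> ['ab', 'abc', 'bc', 'bcd', 'cd']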
Example #15
    def __call__(self, value, positions=False, chars=False, keeporiginal=False,
                 removestops=True, start_pos=0, start_char=0, tokenize=True,
                 mode='', **kwargs):

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode,
                  **kwargs)
        # Same tab-separated record format as Example #13; track line
        # offsets manually since split() strings carry no offsets.
        prevend = 0
        for i, line in enumerate(value.split('\n')):
            fields = line.strip().split('\t')
            word, lemma, pos, ne = fields if len(fields) == 4 else ["", "", "", ""]
            t.text = word
            t.lemma = lemma
            t.part_of_speech = pos
            t.named_entity = ne
            t.boost = 1.0
            if keeporiginal:
                t.original = t.text
            t.stopped = False
            if positions:
                t.pos = start_pos + i
            if chars:
                t.startchar = start_char + prevend
                t.endchar = start_char + prevend + len(line)
            prevend += len(line) + 1
            yield t
Example #16
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%r is not unicode" % value

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            text = u("")
            charmap = self.charmap
            pos = start_pos
            startchar = currentchar = start_char
            for char in value:
                tchar = charmap[ord(char)]
                if tchar:
                    text += tchar
                else:
                    if currentchar > startchar:
                        t.text = text
                        t.boost = 1.0
                        if keeporiginal:
                            t.original = t.text
                        if positions:
                            t.pos = pos
                            pos += 1
                        if chars:
                            t.startchar = startchar
                            t.endchar = currentchar
                        yield t
                    startchar = currentchar + 1
                    text = u("")

                currentchar += 1

            if currentchar > startchar:
                t.text = value[startchar:currentchar]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = startchar
                    t.endchar = currentchar
                yield t
Example #17
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(self.expression.finditer(value)):
                t.text = match.group(0)
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + match.start()
                    t.endchar = start_char + match.end()
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t

                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t
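This is the body of Whoosh's RegexTokenizer; the gaps flag flips it between "matches are tokens" and "text between matches are tokens". Both calls below produce the same tokens:

from whoosh.analysis import RegexTokenizer

print([t.text for t in RegexTokenizer(r"\w+")(u"hi there")])
# -> ['hi', 'there']  (matches are the tokens)
print([t.text for t in RegexTokenizer(r"\s+", gaps=True)(u"hi there")])
# -> ['hi', 'there']  (tokens are the text between matches)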
Example #18
    def __call__(self, value, **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        token = Token(**kwargs)
        for match in self.expr.finditer(value):
            token.text = value[:match.end()]
            yield token
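With an expression like r"[^/]+" (as in Whoosh's PathTokenizer, which this resembles), the cumulative value[:match.end()] slice emits every ancestor path:

tokenizer = PathTokenizer()  # hypothetical instantiation; assumes self.expr matches r"[^/]+"
print([t.text for t in tokenizer(u"usr/local/bin")])
# -> ['usr', 'usr/local', 'usr/local/bin']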
Example #19
    def __call__(self, value, **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value
        token = Token(**kwargs)
        for match in self.expr.finditer(value):
            token.text = value[:match.end()]
            yield token
Example #20
    def __call__(
        self,
        value,
        positions=False,
        chars=False,
        keeporiginal=False,
        removestops=True,
        start_pos=0,
        start_char=0,
        tokenize=True,
        mode="",
        **kwargs
    ):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead of (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """

        assert isinstance(value, text_type), "%s is not unicode" % repr(value)

        t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        elif not self.gaps:
            # The default: expression matches are used as tokens
            for pos, match in enumerate(self.expression.finditer(value)):
                t.text = match.group(0)
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = start_pos + pos
                if chars:
                    t.startchar = start_char + match.start()
                    t.endchar = start_char + match.end()
                yield t
        else:
            # When gaps=True, iterate through the matches and
            # yield the text between them.
            prevend = 0
            pos = start_pos
            for match in self.expression.finditer(value):
                start = prevend
                end = match.start()
                text = value[start:end]
                if text:
                    t.text = text
                    t.boost = 1.0
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                        pos += 1
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t

                prevend = match.end()

            # If the last "gap" was before the end of the text,
            # yield the last bit of text as a final token.
            if prevend < len(value):
                t.text = value[prevend:]
                t.boost = 1.0
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = prevend
                    t.endchar = len(value)
                yield t
Example #21
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 tokenize=True,
                 mode='',
                 **kwargs):
        """
        :param value: The unicode string to tokenize.
        :param positions: Whether to record token positions in the token.
        :param chars: Whether to record character offsets in the token.
        :param start_pos: The position number of the first token. For example,
            if you set start_pos=2, the tokens will be numbered 2,3,4,...
            instead of 0,1,2,...
        :param start_char: The offset of the first character of the first
            token. For example, if you set start_char=2, the text "aaa bbb"
            will have chars (2,5),(6,9) instead (0,3),(4,7).
        :param tokenize: if True, the text should be tokenized.
        """
        assert isinstance(value, text_type), "%r is not unicode" % value

        tokenlist = []
        tokenStream = self.lanalyzer.tokenStream("contents",
                                                 StringReader(value))
        tokenStream.reset()

        if len(value) > 0:
            while tokenStream.incrementToken():
                tokenlist.append(
                    tokenStream.getAttribute(
                        CharTermAttribute.class_).toString())

        t = Token(positions,
                  chars,
                  removestops=removestops,
                  mode=mode,
                  **kwargs)
        if not tokenize:
            t.original = t.text = value
            t.boost = 1.0
            if positions:
                t.pos = start_pos
            if chars:
                t.startchar = start_char
                t.endchar = start_char + len(value)
            yield t
        else:
            for pos, text in enumerate(tokenlist):
                t.text = text
                if positions:
                    # Lucene does not report the original offsets here, so
                    # real position tracking is unsupported; fall back to
                    # the enumeration index.
                    t.pos = pos
                if chars:
                    # Placeholder values; real character offsets are not
                    # available from this token stream.
                    t.startchar = pos
                    t.endchar = pos
                yield t
Example #22
File: ngrams.py  Project: DR08/whoosh
    def __call__(self,
                 value,
                 positions=False,
                 chars=False,
                 keeporiginal=False,
                 removestops=True,
                 start_pos=0,
                 start_char=0,
                 mode='',
                 **kwargs):
        assert isinstance(value, text_type), "%r is not unicode" % value

        inlen = len(value)
        t = Token(positions, chars, removestops=removestops, mode=mode)
        pos = start_pos

        if mode == "query" and self.reduce_for_query:
            size = min(self.max, inlen)
            for start in xrange(0, inlen - size + 1):
                end = start + size
                if end > inlen:
                    continue
                t.text = value[start:end]
                if keeporiginal:
                    t.original = t.text
                t.stopped = False
                if positions:
                    t.pos = pos
                if chars:
                    t.startchar = start_char + start
                    t.endchar = start_char + end
                yield t
                pos += 1
        else:
            for start in xrange(0, inlen - self.min + 1):
                for size in xrange(self.min, self.max + 1):
                    end = start + size
                    if end > inlen:
                        continue
                    t.text = value[start:end]
                    if keeporiginal:
                        t.original = t.text
                    t.stopped = False
                    if positions:
                        t.pos = pos
                    if chars:
                        t.startchar = start_char + start
                        t.endchar = start_char + end

                    yield t
                pos += 1
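With reduce_for_query enabled, query mode emits only the largest ngram size instead of every size, which keeps query terms from multiplying. A hedged sketch (the constructor signature, including the reduce_for_query flag, is assumed from this example rather than a documented API):

tk = NgramTokenizer(minsize=2, maxsize=3, reduce_for_query=True)  # hypothetical signature
print([t.text for t in tk(u"abcd", mode="query")])
# -> ['abc', 'bcd']  (only size-3 grams)
print([t.text for t in tk(u"abcd")])
# -> ['ab', 'abc', 'bc', 'bcd', 'cd']  (all sizes)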