def _index_cjk(xdoc, value, prefix, termpos): """ Returns the next word and its position in the data. The analysis is done with the automaton: 0 -> 1 [letter or number or cjk] 0 -> 0 [stop word] 1 -> 0 [stop word] 1 -> 2 [letter or number or cjk] 2 -> 2 [letter or number or cjk] 2 -> 0 [stop word] """ state = 0 lexeme = previous_cjk = u'' for c in value: if is_punctuation(c): # Stop word if previous_cjk and state == 1: # CJK not yielded yet xdoc.add_posting(prefix + previous_cjk, termpos) termpos += 1 # reset state lexeme = u'' previous_cjk = u'' state = 0 else: c = c.lower() if previous_cjk: xdoc.add_posting(prefix + (u'%s%s' % (previous_cjk, c)), termpos) termpos += 1 state = 2 else: state = 1 previous_cjk = c # Last word if previous_cjk and state == 1: xdoc.add_posting(prefix + previous_cjk, termpos) return termpos + 1
def _index_cjk(xdoc, value, prefix, termpos): """ Returns the next word and its position in the data. The analysis is done with the automaton: 0 -> 1 [letter or number or cjk] 0 -> 0 [stop word] 1 -> 0 [stop word] 1 -> 2 [letter or number or cjk] 2 -> 2 [letter or number or cjk] 2 -> 0 [stop word] """ state = 0 previous_cjk = u'' for c in value: if is_punctuation(c): # Stop word if previous_cjk and state == 1: # CJK not yielded yet xdoc.add_posting(prefix + previous_cjk, termpos) termpos += 1 # reset state previous_cjk = u'' state = 0 else: c = c.lower() if previous_cjk: xdoc.add_posting(prefix + (u'%s%s' % (previous_cjk, c)), termpos) termpos += 1 state = 2 else: state = 1 previous_cjk = c # Last word if previous_cjk and state == 1: xdoc.add_posting(prefix + previous_cjk, termpos) return termpos + 1