def tokenizePrint():
    while True:
        analyzer = StandardAnalyzer()
        ts = analyzer.tokenStream("myfield", [input("Input the string to tokenize: ")])
        # OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
        ts.reset()  # Resets this stream to the beginning. (Required)
        while ts.incrementToken():
            # Use AttributeSource.reflectAsString(boolean)
            # for token stream debugging.
            print("token: " + ts.getTerm().text)
            print("pos: ", ts.getPosition())
            # print("token start offset: " + offsetAtt.startOffset())
            # print("token end offset: " + offsetAtt.endOffset())
        ts.end()    # Perform end-of-stream operations, e.g. set the final offset.
        ts.close()  # Release resources associated with this stream.
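# The commented-out offset lines above mirror the Java attribute API. Below
# is a minimal sketch of the equivalent attribute-based loop, assuming the
# standard PyLucene (JCC) bindings and that lucene.initVM() has already been
# called at startup; the convenience wrapper used elsewhere in this file
# (getTerm / getPosition) may expose a different API. The function name
# tokenizePrintOffsets is hypothetical.
def tokenizePrintOffsets(text: str):
    from java.io import StringReader
    from org.apache.lucene.analysis.tokenattributes import (
        CharTermAttribute, OffsetAttribute)
    analyzer = StandardAnalyzer()
    ts = analyzer.tokenStream("myfield", StringReader(text))
    termAtt = ts.addAttribute(CharTermAttribute.class_)    # term text
    offsetAtt = ts.addAttribute(OffsetAttribute.class_)    # character offsets
    ts.reset()
    while ts.incrementToken():
        print("token: " + termAtt.toString())
        print("start offset:", offsetAtt.startOffset(),
              " end offset:", offsetAtt.endOffset())
    ts.end()
    ts.close()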
class Query(object):
    def __init__(self, string: str):
        self.q = StandardAnalyzer()
        self.s = string

    def tokenVector(self):
        t = self.q.tokenStream("userfield", [self.s])
        t.reset()
        tokens = []
        while t.incrementToken():
            tokens.append(t.getTerm().text)
        t.end()
        t.close()
        return set(tokens)  # de-duplicated bag of terms
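# Example usage (StandardAnalyzer lower-cases tokens, and duplicates
# collapse because tokenVector returns a set):
#
#     Query("Hello hello WORLD").tokenVector()
#     # -> {'hello', 'world'}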
class PhraseQuery(Query):
    def __init__(self, string: str):
        Query.__init__(self, string)  # sets self.q and self.s

    def tokenVector(self):
        # Unlike Query.tokenVector, token order matters for a phrase,
        # so return a list instead of a set.
        t = self.q.tokenStream("userfield", [self.s])
        t.reset()
        tokens = []
        while t.incrementToken():
            tokens.append(t.getTerm().text)
        t.end()
        t.close()
        return tokens

    def booleanQuery(self):
        # Postfix (RPN) form: the n tokens followed by n-1 "AND"
        # operators, e.g. ['a', 'b', 'c', 'AND', 'AND'].
        tokens = self.tokenVector()
        tokens.extend(["AND"] * (len(tokens) - 1))
        return tokens
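# Example usage (tokens first, then n-1 "AND" operators, i.e. the postfix
# form produced by booleanQuery above):
#
#     PhraseQuery("Quick Brown Fox").booleanQuery()
#     # -> ['quick', 'brown', 'fox', 'AND', 'AND']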