def _tokenize(string):
    """Tokenize *string* and return the list of normalized words.

    Words whose normalization raises :exc:`StopWord` are skipped.
    """
    normalized = []
    for token in tokenize(string):
        try:
            normalized.append(normalize(token))
        except StopWord:
            pass
    return normalized
def get_words(self):
    """Yield the words to be indexed (a word is an unicode string).

    Walks over the object's indexable attributes, skipping unset ones,
    and tokenizes each attribute value.
    """
    for attr in self.__indexable_attributes:
        value = getattr(self, attr, None)
        if value is not None:
            for word in tokenize(value):
                yield word
def execute(self, querystr, cursor=None):
    """Execute a full text query and return a list of 2-uple
    (rating, uid).
    """
    if isinstance(querystr, str):
        querystr = unicode(querystr, self.encoding)
    words = normalize_words(tokenize(querystr))
    # use the given cursor when provided, else open one on our connection
    cu = cursor or self._cnx.cursor()
    query = ('SELECT 1, uid FROM appears '
             'WHERE MATCH (words) AGAINST (%(words)s IN BOOLEAN MODE)')
    cu.execute(query, {'words': ' '.join(words)})
    return cu.fetchall()
def restriction_sql(self, tablename, querystr, jointo=None, not_=False):
    """Return an SQL restriction (WHERE-clause fragment) matching rows of
    *tablename* against the full text query *querystr*.

    :param tablename: name (or alias) of the "appears" table in the query
    :param querystr: the full text query, as str or unicode
    :param jointo: if given, additionally constrain *tablename*.uid to
      this column/value
    :param not_: if true, negate the restriction
    """
    if isinstance(querystr, str):
        querystr = unicode(querystr, self.encoding)
    words = normalize_words(tokenize(querystr))
    # escape single quotes: the words are embedded below inside an SQL
    # string literal, an unescaped quote would break (or inject into)
    # the generated query
    searched = ' '.join(words).replace("'", "''")
    sql = "MATCH (%s.words) AGAINST ('%s' IN BOOLEAN MODE)" % (
        tablename, searched)
    if not_:
        sql = 'NOT (%s)' % sql
    if jointo is None:
        return sql
    return "%s AND %s.uid=%s" % (sql, tablename, jointo)
def execute(self, querystr, cursor=None):
    """Execute a full text query and return a list of 2-uple
    (rating, uid).
    """
    if isinstance(querystr, str):
        querystr = unicode(querystr, self.encoding)
    words = normalize_words(tokenize(querystr))
    # use the given cursor when provided, else open one on our connection
    cu = cursor or self._cnx.cursor()
    params = {'config': self.config, 'words': '&'.join(words)}
    cu.execute('SELECT 1, uid FROM appears '
               "WHERE words @@ to_tsquery(%(config)s, %(words)s)",
               params)
    return cu.fetchall()
def restriction_sql(self, tablename, querystr, jointo=None, not_=False):
    """Return an SQL restriction (WHERE-clause fragment) matching rows of
    *tablename* against the full text query *querystr*.

    :param tablename: name (or alias) of the "appears" table in the query
    :param querystr: the full text query, as str or unicode
    :param jointo: if given, additionally constrain *tablename*.uid to
      this column/value
    :param not_: if true, negate the restriction
    """
    if isinstance(querystr, str):
        querystr = unicode(querystr, self.encoding)
    words = normalize_words(tokenize(querystr))
    # XXX replace '%' since it makes tsearch fail, dunno why yet, should
    # be properly fixed
    searched = '&'.join(words).replace('%', '')
    # escape single quotes: the search string is embedded below inside an
    # SQL string literal, an unescaped quote would break (or inject into)
    # the generated query
    searched = searched.replace("'", "''")
    sql = "%s.words @@ to_tsquery('%s', '%s')" % (
        tablename, self.config, searched)
    if not_:
        sql = 'NOT (%s)' % sql
    if jointo is None:
        return sql
    return "%s AND %s.uid=%s" % (sql, tablename, jointo)
def restriction_sql(self, tablename, querystr, jointo=None, not_=False):
    """Return an SQL restriction (WHERE-clause fragment) matching rows of
    *tablename* against the full text query *querystr*, using the
    word/word_id lookup table.

    :param tablename: name (or alias) of the appearance table in the query
    :param querystr: the full text query, as str or unicode
    :param jointo: if given, additionally constrain *tablename*.uid to
      this column/value
    :param not_: if true, negate the restriction
    """
    if isinstance(querystr, str):
        querystr = unicode(querystr, self.encoding)
    words = []
    for word in tokenize(querystr):
        try:
            # escape single quotes: each word is embedded below inside an
            # SQL string literal, an unescaped quote would break (or
            # inject into) the generated query
            words.append("'%s'" % normalize(word).replace("'", "''"))
        except StopWord:
            continue
    sql = '%s.word_id IN (SELECT word_id FROM word WHERE word in (%s))' % (
        tablename, ', '.join(words))
    if not_:
        sql = 'NOT (%s)' % sql
    if jointo is None:
        return sql
    return '%s AND %s.uid=%s' % (sql, tablename, jointo)
def get_words(self):
    """Return the tokens of a fixed accented unicode sample string."""
    sample = u'gïnco-jpl blâ blîp blôp blàp'
    return tokenize(sample)
def _get_words(self, buffer):
    """Extract words from a plain text file object.

    Each line read from *buffer* is decoded using ``self.encoding``,
    then tokenized; words are yielded one at a time.
    """
    # iterate the file object directly instead of calling the deprecated
    # xreadlines() method (same lazy line iteration, removed in python 3)
    for line in buffer:
        for word in tokenize(unicode(line, self.encoding)):
            yield word