def tokenize(self, value):
    """
    Split the incoming value into tokens and process each token,
    optionally stemming or running metaphone.

    :returns: A ``dict`` mapping token to score. The score is based on the
        relative frequency of the word in the document.
    """
    words = self.split_phrase(decode(value).lower())
    if self._stopwords:
        words = [w for w in words if w not in self._stopwords]
    if self._min_word_length:
        words = [w for w in words if len(w) >= self._min_word_length]

    fraction = 1. / (len(words) + 1)  # Prevent division by zero.

    # Apply optional transformations.
    if self._use_stemmer:
        words = self.stem(words)
    if self._use_metaphone:
        words = self.metaphone(words)

    scores = {}
    for word in words:
        scores.setdefault(word, 0)
        scores[word] += fraction
    return scores
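
# A standalone sketch of the scoring scheme implemented by tokenize() above:
# every token contributes 1 / (len(words) + 1), so repeated tokens accumulate
# a proportionally larger score. (The word list here is made up.)
words = ['quick', 'brown', 'fox', 'fox']
fraction = 1. / (len(words) + 1)  # 0.2
scores = {}
for word in words:
    scores.setdefault(word, 0)
    scores[word] += fraction
print(scores)  # {'quick': 0.2, 'brown': 0.2, 'fox': 0.4}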
def pending(self, start='-', stop='+', count=1000, consumer=None):
    start = normalize_id(start)
    stop = normalize_id(stop)
    resp = self.database.xpending_range(self.key, self.group, start, stop,
                                        count, consumer)
    return [(id_to_datetime(msg['message_id']), decode(msg['consumer']),
             msg['time_since_delivered'], msg['times_delivered'])
            for msg in resp]
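
# Hedged usage sketch for pending(): the consumer-group setup below (the
# db.consumer_group() call, the group and stream names) is an assumption about
# the surrounding library's API, but the tuple layout matches the return value
# above: (delivery datetime, consumer name, ms since delivery, delivery count).
cg = db.consumer_group('worker-group', ['events'])
cg.create()

for ts, consumer, idle_ms, deliveries in cg.events.pending(count=10):
    print(ts, consumer, idle_ms, deliveries)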
def tokenize_title(self, phrase, stopwords=True):
    if isinstance(phrase, bytes):
        phrase = decode(phrase)
    phrase = re.sub(r'[^a-z0-9_\-\s]', '', phrase.lower())
    if stopwords:
        return [w for w in phrase.split() if w not in self._stopwords]
    else:
        return phrase.split()
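
# Standalone illustration of the normalization performed by tokenize_title(),
# using a hypothetical stopword set: lowercase, strip characters outside
# [a-z0-9_-\s], split on whitespace, then drop stopwords.
import re

phrase = 'The Quick, Brown Fox!'
cleaned = re.sub(r'[^a-z0-9_\-\s]', '', phrase.lower())
stopwords = {'the', 'a', 'an'}  # assumed stopword list
print([w for w in cleaned.split() if w not in stopwords])  # ['quick', 'brown', 'fox']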
def list_data(self):
    """
    Return all the data stored in the autocomplete index. If the data was
    stored as serialized JSON, then it will be de-serialized before being
    returned.

    :rtype: list
    """
    fn = (lambda v: json.loads(decode(v))) if self._use_json else decode
    # map() is lazy on Python 3, so build a list to match the documented
    # return type.
    return [fn(value) for value in self._data.values()]
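
# Standalone sketch of the two decode paths in list_data(): stored values come
# back from Redis as bytes; with JSON storage enabled they are parsed,
# otherwise they are simply decoded to str. (The raw values are made up.)
import json

raw_values = [b'{"title": "Redis"}', b'plain text value']
print(json.loads(raw_values[0].decode('utf-8')))  # {'title': 'Redis'}
print(raw_values[1].decode('utf-8'))              # 'plain text value'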
def python_value(self, value):
    if self._pickled:
        return pickle.loads(value)
    elif self._as_json and PY3:
        return json.loads(decode(value))
    elif self._as_json:
        return json.loads(value)
    elif self._coerce:
        return self._coerce(value)
    return value
def db_value(self, value):
    if self._pickled:
        return pickle.dumps(value)
    elif PY3 and self._as_json:
        return json.dumps(decode(value))
    elif self._as_json:
        return json.dumps(value)
    elif self._coerce:
        return self._coerce(value)
    return value
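
# Self-contained round trip mirroring the db_value()/python_value() pair
# above: pickle operates directly on bytes, while JSON is stored as text and
# (on Python 3) must be decoded from bytes before json.loads(). The .encode()
# call stands in for Redis, which hands values back as bytes.
import json
import pickle

obj = {'name': 'huey', 'lives': 9}

raw = pickle.dumps(obj)                  # what the pickled path would store
assert pickle.loads(raw) == obj          # ...and load back

stored = json.dumps(obj).encode('utf-8')
assert json.loads(stored.decode('utf-8')) == obj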
def test_slicing(self):
    self.lst.extend(['i1', 'i2', 'i3', 'i4'])
    self.assertEqual(self.lst[:1], [b'i1'])
    self.assertEqual(self.lst[:2], [b'i1', b'i2'])
    self.assertEqual(self.lst[:-1], [b'i1', b'i2', b'i3'])
    self.assertEqual(self.lst[1:2], [b'i2'])
    self.assertEqual(self.lst[1:], [b'i2', b'i3', b'i4'])

    l = db.List('l1')
    l.extend(range(10))

    # LTRIM, preserve the 1st to last (removes the 0th element).
    del l[1:-1]
    self.assertEqual([int(decode(i)) for i in l],
                     [1, 2, 3, 4, 5, 6, 7, 8, 9])

    # Trim the list so that it contains only the values within the
    # specified range.
    del l[:3]
    self.assertEqual([int(decode(i)) for i in l], [1, 2, 3])
def _load_objects(self, obj_id_zset, limit, chunk_size=1000):
    """
    Yield the stored data for the object ids in ``obj_id_zset``, fetching
    ids in chunks of ``chunk_size`` and stopping once ``limit`` results
    have been yielded.
    """
    ct = i = 0
    while True:
        id_chunk = obj_id_zset[i:i + chunk_size]
        if not id_chunk:
            return

        i += chunk_size
        for raw_data in self._data[id_chunk]:
            if not raw_data:
                continue
            if self._use_json:
                yield json.loads(decode(raw_data))
            else:
                yield raw_data
            ct += 1
            if limit and ct == limit:
                return
def query(self, s=None, p=None, o=None):
    """
    Return all triples that satisfy the given expression. You may specify
    all or none of the fields (s, p, and o). For instance, if I wanted to
    query for all the people who live in Kansas, I might write:

    .. code-block:: python

        for triple in graph.query(p='lives', o='Kansas'):
            print(triple['s'], 'lives in Kansas!')
    """
    start, end = self.keys_for_query(s, p, o)
    if end is None:
        # Fully-specified query: yield the triple only if it exists.
        # (Returning ends the generator; raising StopIteration here would
        # be an error under PEP 479.)
        if start in self._z:
            yield {'s': s, 'p': p, 'o': o}
    else:
        for key in self._z.range_by_lex('[' + start, '[' + end):
            keys, p1, p2, p3 = decode(key).split('::')
            yield dict(zip(keys, (p1, p2, p3)))
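
# Hedged usage sketch for query(): assumes the graph object exposes a
# store(subject, predicate, object) method as elsewhere in this module; the
# triples themselves are made up.
graph.store('charlie', 'lives', 'Kansas')
graph.store('huey', 'lives', 'Topeka')

for triple in graph.query(p='lives', o='Kansas'):
    print(triple['s'], 'lives in Kansas!')  # -> charlie lives in Kansas!

# A fully-specified query simply confirms whether the triple exists.
print(list(graph.query(s='charlie', p='lives', o='Kansas')))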
def split_phrase(self, phrase):
    """Split the document or search query into tokens."""
    return self._symbols_re.sub(' ', decode(phrase)).split()
def python_value(self, value):
    return uuid.UUID(decode(value))
def python_value(self, value):
    return uuid.UUID(decode(value)) if value else None
def __init__(self, stream, message_id, data):
    self.stream = decode(stream)
    self.message_id = decode(message_id)
    self.data = decode_dict(data)
    self.timestamp, self.sequence = id_to_datetime(message_id)
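
# Small sketch of how a raw stream entry maps onto the attributes set above.
# The id and fields are made up; assumes the Message class and its helpers
# (decode, decode_dict, id_to_datetime) are importable as in the surrounding
# module. Raw values mimic what redis-py returns for XREAD.
raw_id = b'1631234567890-0'
raw_data = {b'action': b'click', b'user': b'huey'}

msg = Message(b'events', raw_id, raw_data)
print(msg.stream)       # 'events'
print(msg.message_id)   # '1631234567890-0'
print(msg.data)         # {'action': 'click', 'user': 'huey'}
print(msg.timestamp)    # datetime parsed from the id's millisecond prefix
print(msg.sequence)     # sequence portion of the id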
def python_value(self, value):
    return json.loads(decode(value))
def python_value(self, value):
    return decode(value) == '1'
def test_read_api(self):
    sa = db.Stream('a')
    sb = db.Stream('b')
    sc = db.Stream('c')
    streams = [sa, sb, sc]
    docids = []
    for i in range(20):
        stream = streams[i % 3]
        docids.append(stream.add({'k': 'v%s' % i}, id=i + 1))

    def assertData(ret, idxs, is_multi=False):
        if is_multi:
            ret = dict(ret)
            accum = {}
            for idx in idxs:
                sname = encode('abc'[idx % 3])
                accum.setdefault(sname, [])
                accum[sname].append((
                    docids[idx],
                    {b'k': encode('v%s' % idx)}))
        else:
            accum = []
            for idx in idxs:
                accum.append((docids[idx], {b'k': encode('v%s' % idx)}))
        self.assertEqual(ret, accum)

    assertData(sa.read(), [0, 3, 6, 9, 12, 15, 18])
    assertData(sc.read(), [2, 5, 8, 11, 14, 17])

    # We can specify a maximum number of records via "count".
    assertData(sa.read(3), [0, 3, 6])
    assertData(sb.read(2), [1, 4])
    assertData(sc.read(4), [2, 5, 8, 11])

    # We get the same values we read earlier.
    assertData(sa.read(2), [0, 3])

    # We can pass a minimum ID and will get newer data -- even if the ID
    # does not exist in the stream. We can also pass an exact ID and,
    # unlike the range function, it is not inclusive.
    assertData(sa.read(2, last_id=docids[3]), [6, 9])
    assertData(sa.read(2, last_id=docids[4]), [6, 9])

    # If the last ID exceeds the highest ID (indicating no data), an empty
    # list is returned. This is the same whether or not "count" is
    # specified.
    self.assertEqual(sa.read(last_id=docids[18]), [])
    self.assertEqual(sa.read(2, last_id=docids[18]), [])

    # The count is a maximum, so up to 2 items are returned -- but since
    # only one item in the stream exceeds the given ID, we only get one
    # result.
    assertData(sa.read(2, last_id=docids[17]), [18])

    # If a timeout is set and any stream can return a value, then that
    # value is returned immediately.
    assertData(sa.read(2, block=1, last_id=docids[17]), [18])
    assertData(sb.read(2, block=1, last_id=docids[18]), [19])

    # If no items are available and we timed out, an empty list is
    # returned.
    self.assertEqual(sc.read(block=1, last_id=docids[19]), [])
    self.assertEqual(sc.read(2, block=1, last_id=docids[19]), [])

    # When multiple keys are given, up to "count" items per stream
    # are returned.
    normalized = _normalize_stream_keys(['a', 'b', 'c'])
    res = db.xread(normalized, count=2)
    assertData(res, [0, 1, 2, 3, 4, 5], True)

    # Specify max-ids for each stream. The max value in "c" is 17, so
    # nothing will be returned for "c".
    uids = [decode(docid) for docid in docids]
    res = db.xread({'a': uids[15], 'b': uids[16], 'c': uids[17]}, count=3)
    assertData(res, [18, 19], True)

    # Now we limit ourselves to being able to pull only a single item from
    # stream "c".
    res = db.xread({'a': uids[18], 'b': uids[19], 'c': uids[16]})
    assertData(res, [17], True)

    # An empty list is returned when no results are present and the timeout
    # is None, or when we reach the timeout.
    res = db.xread({'a': uids[18], 'b': uids[19], 'c': uids[17]})
    self.assertEqual(res, [])

    res = db.xread({'a': uids[18], 'b': uids[19], 'c': uids[17]},
                   count=1, block=1)
    self.assertEqual(res, [])