Example #1
def test_can_retrieve_data_from_search(self):
    db = HammingDb(self._path, code_size=8)
    t1 = 'Mary had a little lamb'
    t2 = 'Mary had a little dog'
    t3 = 'Permanent Midnight'
    t4 = 'Mary sad a little cog'
    db.append(self.extract_code_from_text(t1), t1)
    db.append(self.extract_code_from_text(t2), t2)
    db.append(self.extract_code_from_text(t3), t3)
    db.append(self.extract_code_from_text(t4), t4)
    results = list(db.search(self.extract_code_from_text(t1), 3))
    data = results[0]
    self.assertEqual(t1, data)
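The helper extract_code_from_text used throughout these tests is defined elsewhere in the test class and is not shown here. A minimal hypothetical sketch of what such a helper could look like, assuming a simple bag-of-words hashing scheme so that similar sentences land at a small Hamming distance (the real helper may differ), is:

import hashlib

import numpy as np


def extract_code_from_text(text, n_chunks=1):
    # code_size=8 means HammingDb expects 8-byte (64-bit) codes;
    # n_chunks=2 would produce the 16-byte (128-bit) codes used in
    # the code_size=16 examples below
    n_bits = 64 * n_chunks
    bits = np.zeros(n_bits, dtype=np.uint8)
    for word in text.lower().split():
        # each word deterministically sets one bit, so sentences that
        # share most of their words also share most of their set bits
        h = int(hashlib.md5(word.encode('utf-8')).hexdigest(), 16)
        bits[h % n_bits] = 1
    return np.packbits(bits).tobytes()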
Example #2
def test_search_raises_in_write_only_mode(self):
    db = HammingDb(self._path, code_size=16, writeonly=True)
    t1 = 'Mary had a little lamb'
    t2 = 'Mary had a little dog'
    t3 = 'Permanent Midnight'
    t4 = 'Mary sad a little cog'
    extract_code = lambda x: self.extract_code_from_text(x, n_chunks=2)
    db.append(extract_code(t1), t1)
    db.append(extract_code(t2), t2)
    db.append(extract_code(t3), t3)
    db.append(extract_code(t4), t4)
    self.assertRaises(RuntimeError,
                      lambda: list(db.search(extract_code(t1), 3)))
Example #3
def test_can_search_over_text_documents(self):
    db = HammingDb(self._path, code_size=8)
    t1 = 'Mary had a little lamb'
    t2 = 'Mary had a little dog'
    t3 = 'Permanent Midnight'
    t4 = 'Mary sad a little cog'
    db.append(self.extract_code_from_text(t1), t1)
    db.append(self.extract_code_from_text(t2), t2)
    db.append(self.extract_code_from_text(t3), t3)
    db.append(self.extract_code_from_text(t4), t4)
    results = list(db.search(self.extract_code_from_text(t1), 3))
    self.assertEqual(3, len(results))
    self.assertEqual(t1, results[0])
    self.assertEqual(t2, results[1])
    self.assertEqual(t4, results[2])
Example #4
def test_can_search_with_128_bits(self):
    db = HammingDb(self._path, code_size=16)
    t1 = 'Mary had a little lamb'
    t2 = 'Mary had a little dog'
    t3 = 'Permanent Midnight'
    t4 = 'Mary sad a little cog'
    extract_code = lambda x: self.extract_code_from_text(x, n_chunks=2)
    db.append(extract_code(t1), t1)
    db.append(extract_code(t2), t2)
    db.append(extract_code(t3), t3)
    db.append(extract_code(t4), t4)
    results = list(db.search(extract_code(t1), 3))
    self.assertEqual(3, len(results))
    self.assertEqual(t1, results[0])
    self.assertEqual(t2, results[1])
    self.assertEqual(t4, results[2])
Example #5
def test_can_search_over_data_added_from_another_instance(self):
    db = HammingDb(self._path, code_size=8)
    db2 = HammingDb(self._path, code_size=8)
    t1 = 'Mary had a little lamb'
    t2 = 'Mary had a little dog'
    t3 = 'Permanent Midnight'
    t4 = 'Mary sad a little cog'
    db.append(self.extract_code_from_text(t1), t1)
    db.append(self.extract_code_from_text(t2), t2)
    db.append(self.extract_code_from_text(t3), t3)
    db.append(self.extract_code_from_text(t4), t4)
    results = list(db2.search(self.extract_code_from_text(t1), 3))
    self.assertEqual(3, len(results))
    s = set(results)
    self.assertTrue(t1 in s)
    self.assertTrue(t2 in s)
    self.assertTrue(t4 in s)
Example #6
class HammingIndex(object):
    def __init__(self,
                 document,
                 feature,
                 version=None,
                 path='',
                 db_size_bytes=1000000000,
                 listen=False,
                 writeonly=False,
                 **extra_data):

        super(HammingIndex, self).__init__()
        self.document = document
        self.feature = feature
        self.db_size_bytes = db_size_bytes
        self.path = path
        self.extra_data = extra_data
        self.writeonly = writeonly

        version = version or self.feature.version

        self.hamming_db_path = os.path.join(
            self.path, 'index.{self.feature.key}.{version}'.format(**locals()))

        try:
            self.event_log = document.event_log
        except AttributeError:
            self.event_log = None

        try:
            self.hamming_db = HammingDb(self.hamming_db_path,
                                        code_size=None,
                                        writeonly=self.writeonly)
        except ValueError:
            self.hamming_db = None

        self.encoder = TimeSliceEncoder()
        self.decoder = TimeSliceDecoder()
        self.thread = None

        if listen:
            self.listen()

    def close(self):
        try:
            self.stop()
        except:
            pass

        try:
            self.hamming_db.close()
        except:
            pass

    def __del__(self):
        self.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __len__(self):
        if self.hamming_db is None:
            return 0
        return len(self.hamming_db)

    def stop(self):
        self.event_log.unsubscribe()

    def listen(self):
        self.thread = threading.Thread(target=self._listen)
        self.thread.daemon = True
        self.thread.start()

    def _init_hamming_db(self, code=None):
        if self.hamming_db is not None:
            return
        code_size = len(code) if code else None
        self.hamming_db = HammingDb(self.hamming_db_path,
                                    code_size=code_size,
                                    writeonly=self.writeonly)

    def _synchronously_process_events(self):
        self._listen(raise_when_empty=True)

    def add_all(self):
        for doc in self.document:
            self.add(doc._id)

    def _collect_extra_data(self, _id, ts):
        if not self.extra_data:
            return None

        doc = self.document(_id)
        return dict(((key, func(doc, ts))
                     for key, func in self.extra_data.iteritems()))

    def add(self, _id, timestamp=''):
        # load the feature from the feature database
        feature = self.feature(_id=_id, persistence=self.document)

        try:
            arr = ConstantRateTimeSeries(feature)
        except ValueError:
            arr = feature

        # extract codes and timeslices from the feature
        for ts, data in arr.iter_slices():
            code = self.encode_query(data)
            encoded_ts = dict(_id=_id, **self.encoder.dict(ts))
            extra_data = self._collect_extra_data(_id, ts)
            if extra_data:
                encoded_ts['extra_data'] = extra_data
            self._init_hamming_db(code)
            self.hamming_db.append(code, json.dumps(encoded_ts))
            self.hamming_db.set_metadata('timestamp', bytes(timestamp))

    def _listen(self, raise_when_empty=False):

        if self.hamming_db is not None:
            last_timestamp = self.hamming_db.get_metadata('timestamp') or ''
        else:
            last_timestamp = ''

        if not self.event_log:
            raise ValueError(
                '{self.document} must have an event log configured'.format(
                    **locals()))

        subscription = self.event_log.subscribe(
            last_id=last_timestamp, raise_when_empty=raise_when_empty)

        for timestamp, data in subscription:

            # parse the data from the event stream
            data = json.loads(data)
            _id, name, version = data['_id'], data['name'], data['version']

            # ensure that it's about the feature we're subscribed to
            if name != self.feature.key or version != self.feature.version:
                continue

            self.add(_id, timestamp)

    def _parse_result(self, result):
        d = json.loads(result)
        ts = TimeSlice(**self.decoder.kwargs(d))

        if not self.extra_data:
            return d['_id'], ts

        return d['_id'], ts, d['extra_data']

    def decode_query(self, binary_query):
        packed = np.fromstring(binary_query, dtype=np.uint8)
        return np.unpackbits(packed)

    def encode_query(self, feature):
        if isinstance(feature, str):
            return feature
        elif feature.dtype == np.uint64:
            return feature.tostring()
        elif feature.dtype == np.uint8 or feature.dtype == np.bool:
            return np.packbits(feature).tostring()
        else:
            raise ValueError(
                'feature must be a raw bit string, an already packed uint64 '
                'array, or an "unpacked" uint8 or bool array')

    def random_search(self, n_results, multithreaded=False, sort=False):
        self._init_hamming_db()
        code, raw_results = self.hamming_db.random_search(n_results,
                                                          multithreaded,
                                                          sort=sort)
        parsed_results = (self._parse_result(r) for r in raw_results)
        return SearchResults(code, parsed_results)

    def search(self, feature, n_results, multithreaded=False, sort=False):
        self._init_hamming_db()
        code = self.encode_query(feature)
        raw_results = self.hamming_db.search(code,
                                             n_results,
                                             multithreaded,
                                             sort=sort)
        parsed_results = (self._parse_result(r) for r in raw_results)
        return SearchResults(code, parsed_results)
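The core of encode_query and decode_query above is the round trip between an "unpacked" bit array and the fixed-width byte string that HammingDb stores. A standalone sketch of that round trip, using only numpy (the variable names are illustrative, and np.frombuffer/tobytes are used here in place of the older fromstring/tostring calls that appear in the class), might look like this:

import numpy as np

bits = (np.random.rand(64) > 0.5).astype(np.uint8)  # an "unpacked" 64-bit code
code = np.packbits(bits).tobytes()                   # 8-byte string, as stored by HammingDb
assert len(code) == 8

recovered = np.unpackbits(np.frombuffer(code, dtype=np.uint8))
assert np.array_equal(recovered, bits)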
Example #7
def test_cannot_search_for_wrong_code_size(self):
    db = HammingDb(self._path, code_size=8)
    self.assertRaises(ValueError, lambda: list(db.search('a' * 7, 10)))