예제 #1
0
    def search_multiple_records(self, rec_arr, pre_filter=None):
        """Search the index for several records with a single msearch call.

        One query is built per record. Every key of the record other than
        'path', 'signature' and 'metadata' is treated as a word and becomes a
        'term' clause on the field 'image.default.<word>'. Hits from all
        queries are pooled, filtered by distance to the signature of the
        record that produced them, sorted and de-duplicated.

        The records in ``rec_arr`` are only read, never modified.

        Args:
            rec_arr (iterable of dict): records to search for; each must have
                a 'signature' entry
            pre_filter (Optional): if not None, stored as the 'filter' clause
                of every bool query (default None)

        Returns:
            list of dict: one dict per matching document, made of the
                document's '_source' fields plus '_score' and 'dist', sorted
                by ascending 'dist'. Only the closest entry per 'id' is kept,
                and entries without an 'id' field are dropped.

        Raises:
            KeyError: if a record has no 'signature' entry
        """
        # Keys that describe the record itself rather than a searchable word.
        non_word_keys = ('path', 'signature', 'metadata')

        request = []
        signatures = []
        for rec in rec_arr:
            # Read rather than pop, so the caller's record stays intact.
            signatures.append(rec['signature'])

            # build the 'should' list
            should = [{'term': {'image.default.' + word: value}}
                      for word, value in rec.items()
                      if word not in non_word_keys]
            body = {
                'query': {
                    'bool': {'should': should}
                },
                '_source': {'excludes': ['simple_word_*']},
                'size': self.size
            }

            if pre_filter is not None:
                body['query']['bool']['filter'] = pre_filter

            # Each search contributes a header entry followed by its body.
            head = {'index': self.index, 'type': self.doc_type}
            request.extend([head, body])

        if not request:
            # Nothing to search for: skip the round trip entirely.
            return []

        responses = self.es.msearch(body=request)['responses']

        matches = []
        for response, signature in zip(responses, signatures):
            try:
                hits = response['hits']['hits']
            except KeyError:
                # A response without a hits section contributes nothing.
                continue

            sigs = np.array([hit['_source']['image']['default']['signature']
                             for hit in hits])
            if sigs.size == 0:
                continue

            # Assumes normalized_distance returns one distance per row of
            # sigs, in the same order as hits -- TODO confirm.
            dists = normalized_distance(sigs, np.array(signature))

            matches.extend(
                {**hit['_source'], '_score': hit['_score'], 'dist': dist}
                for hit, dist in zip(hits, dists)
                if dist < self.distance_cutoff
            )

        # Sorting first means the entry kept for each id is its closest one.
        seen_ids = set()
        unique = []
        for item in sorted(matches, key=itemgetter('dist')):
            if 'id' in item and item['id'] not in seen_ids:
                unique.append(item)
                seen_ids.add(item['id'])

        return unique
예제 #2
0
    def search_single_record(self, rec, pre_filter=None):
        """Search the index for documents close to a single record.

        Every key of ``rec`` other than 'path', 'signature' and 'metadata' is
        treated as a word and becomes a 'term' clause of a bool/should query.
        The hits are then filtered by the distance between their stored
        signature and the signature of ``rec``.

        ``rec`` is only read, never modified.

        Args:
            rec (dict): record to search for; must have a 'signature' entry
            pre_filter (Optional): if not None, stored as the 'filter' clause
                of the bool query (default None)

        Returns:
            list of dict: one dict per hit whose distance is below
                ``self.distance_cutoff``, with the keys 'id', 'score',
                'metadata', 'path' and 'dist'. Always a list (empty when
                nothing matches), so callers can take its length, index it
                or extend it.

        Raises:
            KeyError: if ``rec`` has no 'signature' entry
        """
        signature = rec['signature']

        # Keys that describe the record itself rather than a searchable word.
        non_word_keys = ('path', 'signature', 'metadata')

        # build the 'should' list
        should = [{'term': {word: value}}
                  for word, value in rec.items()
                  if word not in non_word_keys]
        body = {
            'query': {
                'bool': {
                    'should': should
                }
            },
            '_source': {
                'excludes': ['simple_word_*']
            }
        }

        if pre_filter is not None:
            body['query']['bool']['filter'] = pre_filter

        res = self.es.search(index=self.index,
                             doc_type=self.doc_type,
                             body=body,
                             size=self.size,
                             timeout=self.timeout)['hits']['hits']

        sigs = np.array([hit['_source']['signature'] for hit in res])

        if sigs.size == 0:
            return []

        # Assumes normalized_distance returns one distance per row of sigs,
        # in the same order as res -- TODO confirm.
        dists = normalized_distance(sigs, np.array(signature))

        # Build a real list: a lazy filter object would not match the type of
        # the empty-result branch above.
        return [
            {
                'id': hit['_id'],
                'score': hit['_score'],
                'metadata': hit['_source'].get('metadata'),
                # Prefer 'url' and fall back to 'path' when it is absent.
                'path': hit['_source'].get('url', hit['_source'].get('path')),
                'dist': dist,
            }
            for hit, dist in zip(res, dists)
            if dist < self.distance_cutoff
        ]
예제 #3
0
def get_next_match(result_q,
                   word,
                   collection,
                   signature,
                   cutoff=0.5,
                   max_in_cursor=100):
    """Queue the records matching one word that lie within a distance cutoff.

    Runs a ``find`` for ``word`` on the collection and walks the resulting
    cursor to exhaustion. Each record whose normalized distance to
    ``signature`` is below ``cutoff`` is added to a running dict of matches,
    and that dict is put on ``result_q`` after every addition. The string
    'STOP' is always the last item put on the queue.

    This function lives outside the SignatureCollection class, which breaks
    encapsulation; that is done for compatibility with multiprocessing.

    Args:
        result_q (multiprocessing.Queue): queue that receives the results
        word (dict): {word_name: word_value} dict used as the query
        collection (collection): a pymongo collection
        signature (numpy.ndarray): signature array to match against
        cutoff (Optional[float]): normalized distance limit (default 0.5)
        max_in_cursor (Optional[int]): if the cursor holds more than this
            many records the word is not discriminatory, so the cursor is
            ignored and only 'STOP' is queued (default 100)

    """
    cursor = collection.find(
        word, projection=['_id', 'signature', 'path', 'metadata'])

    # A word shared by too many records barely narrows the search: skip it.
    if cursor.count() > max_in_cursor:
        result_q.put('STOP')
        return

    found = {}
    exhausted = False
    while not exhausted:
        try:
            record = cursor.next()
            query = np.reshape(signature, (1, signature.size))
            distance = normalized_distance(
                query, np.array(record['signature']))[0]
            if not distance < cutoff:
                continue
            found[record['_id']] = {
                'dist': distance,
                'path': record.get('path'),
                'id': record.get('_id'),
                'metadata': record.get('metadata')
            }
            # The whole running dict is queued, not just the newest entry.
            result_q.put(found)
        except StopIteration:
            # The cursor has no more records.
            exhausted = True
    result_q.put('STOP')
예제 #4
0
    def search_single_record(self, rec, pre_filter=None):
        """Search the index for documents close to a single record.

        Every key of ``rec`` other than 'path', 'signature' and 'metadata' is
        treated as a word and becomes a 'term' clause on the field
        'image.default.<word>' of a bool/should query. The hits are then
        filtered by the distance between their stored signature and the
        signature of ``rec``.

        ``rec`` is only read, never modified.

        Args:
            rec (dict): record to search for; must have a 'signature' entry
            pre_filter (Optional): if not None, stored as the 'filter' clause
                of the bool query (default None)

        Returns:
            list of dict: one dict per hit whose distance is below
                ``self.distance_cutoff``, made of the hit's '_source' fields
                plus '_score' and 'dist'. Always a list (empty when nothing
                matches), so callers can take its length, index it or extend
                it.

        Raises:
            KeyError: if ``rec`` has no 'signature' entry
        """
        signature = rec['signature']

        # Keys that describe the record itself rather than a searchable word.
        non_word_keys = ('path', 'signature', 'metadata')

        # build the 'should' list
        should = [{'term': {'image.default.' + word: value}}
                  for word, value in rec.items()
                  if word not in non_word_keys]
        body = {
            'query': {
                'bool': {'should': should}
            },
            '_source': {'excludes': ['simple_word_*']}
        }

        if pre_filter is not None:
            body['query']['bool']['filter'] = pre_filter

        try:
            res = self.es.search(index=self.index,
                                 doc_type=self.doc_type,
                                 body=body,
                                 size=self.size)['hits']['hits']
        except KeyError:
            # A response without a hits section is treated as no results.
            res = []

        sigs = np.array([hit['_source']['image']['default']['signature']
                         for hit in res])

        if sigs.size == 0:
            return []

        # Assumes normalized_distance returns one distance per row of sigs,
        # in the same order as res -- TODO confirm.
        dists = normalized_distance(sigs, np.array(signature))

        # Build a real list: a lazy filter object would not match the type of
        # the empty-result branch above.
        return [
            {**hit['_source'], '_score': hit['_score'], 'dist': dist}
            for hit, dist in zip(res, dists)
            if dist < self.distance_cutoff
        ]