def refresh_retriever():
    global W
    if W.queue_name == settings.GLOBAL_RETRIEVER:
        # The global retriever queue refreshes every registered retriever.
        for dr in Retrievers._selector_to_dr.values():
            logging.info("Starting index refresh on queue {} for retriever {}".format(W.queue_name, dr.pk))
            start_ts = time.time()
            Retrievers.refresh_index(dr)
            delta = time.time() - start_ts
            # Record refresh duration and metadata in Redis so workers can be monitored.
            redis_client.hset("retriever_state",
                              "{},{},{}".format(W.pk, W.queue_name, dr.pk),
                              json.dumps({"delta": delta, 'worker_id': W.pk, 'retriever_id': dr.pk,
                                          'queue_name': W.queue_name, 'ts': time.time()}))
            logging.info("Finished index refresh on queue {} for retriever {}".format(W.queue_name, dr.pk))
    elif 'retriever_' in W.queue_name:
        # Per-retriever queues are named "retriever_<pk>"; refresh only that retriever.
        pk = int(W.queue_name.split('_')[-1])
        logging.info("Starting index refresh on queue {} for retriever {}".format(W.queue_name, pk))
        start_ts = time.time()
        _, dr = Retrievers.get_retriever(args={'retriever_selector': {'pk': pk}})
        Retrievers.refresh_index(dr)
        delta = time.time() - start_ts
        redis_client.hset("retriever_state",
                          "{},{},{}".format(W.pk, W.queue_name, dr.pk),
                          json.dumps({"delta": delta, 'worker_id': W.pk, 'retriever_id': dr.pk,
                                      'queue_name': W.queue_name, 'ts': time.time()}))
        logging.info("Finished index refresh on queue {} for retriever {}".format(W.queue_name, pk))
    else:
        raise ValueError("{} is not valid for retriever".format(W.queue_name))
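# A minimal sketch (not part of the original module) of how the "retriever_state"
# hash written above might be read back for monitoring. It assumes the same
# redis_client and the JSON payload emitted by refresh_retriever(); the name
# inspect_retriever_state is hypothetical.
def inspect_retriever_state():
    state = {}
    for field, payload in redis_client.hgetall("retriever_state").items():
        # Fields are "worker_pk,queue_name,retriever_pk"; values are JSON blobs
        # with delta, worker_id, retriever_id, queue_name and ts.
        state[field] = json.loads(payload)
    return state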
def handle_perform_indexing(start):
    json_args = start.arguments
    target = json_args.get('target', 'frames')
    # Resolve the indexer either by name or by primary key.
    if 'index' in json_args:
        index_name = json_args['index']
        visual_index, di = indexing.Indexers.get_index_by_name(index_name)
    else:
        visual_index, di = indexing.Indexers.get_index_by_pk(json_args['indexer_pk'])
    sync = True
    if target == 'query':
        local_path = task_shared.download_and_get_query_path(start)
        vector = visual_index.apply(local_path)
        # TODO: figure out a better way to store numpy arrays.
        s = io.BytesIO()
        np.save(s, vector)
        redis_client.set("query_vector_{}".format(start.pk), s.getvalue())
        sync = False
    elif target == 'query_regions':
        queryset, target = task_shared.build_queryset(args=start.arguments)
        region_paths = task_shared.download_and_get_query_region_path(start, queryset)
        for i, dr in enumerate(queryset):
            local_path = region_paths[i]
            vector = visual_index.apply(local_path)
            s = io.BytesIO()
            np.save(s, vector)
            # Store one vector per region, keyed by region pk, in a per-event hash.
            redis_client.hset("query_region_vectors_{}".format(start.pk), dr.pk, s.getvalue())
        sync = False
    elif target == 'regions':
        # For regions, simply download and ensure the files exist before indexing.
        queryset, target = task_shared.build_queryset(args=start.arguments, video_id=start.video_id)
        task_shared.ensure_files(queryset, target)
        indexing.Indexers.index_queryset(di, visual_index, start, target, queryset)
    elif target == 'frames':
        queryset, target = task_shared.build_queryset(args=start.arguments, video_id=start.video_id)
        if visual_index.cloud_fs_support and settings.ENABLE_CLOUDFS:
            # If cloud FS is enabled and the index supports cloud file systems
            # natively (e.g. TensorFlow), pass cloud paths directly.
            indexing.Indexers.index_queryset(di, visual_index, start, target, queryset, cloud_paths=True)
        else:
            # Otherwise download and ensure that the files exist.
            task_shared.ensure_files(queryset, target)
            indexing.Indexers.index_queryset(di, visual_index, start, target, queryset)
    return sync
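# Hypothetical counterparts (a sketch, not in the original source) for reading back
# the vectors handle_perform_indexing() stores in Redis. They assume the
# "query_vector_{pk}" / "query_region_vectors_{pk}" key layout above and that each
# vector was serialized with np.save into a BytesIO buffer.
def load_query_vector(event_pk):
    raw = redis_client.get("query_vector_{}".format(event_pk))
    if raw is None:
        return None
    # np.load understands the .npy bytes produced by np.save.
    return np.load(io.BytesIO(raw))


def load_query_region_vectors(event_pk):
    return {region_pk: np.load(io.BytesIO(raw))
            for region_pk, raw in
            redis_client.hgetall("query_region_vectors_{}".format(event_pk)).items()}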