class ExtractionChecker(ConfReader):
  """ExtractionChecker class
  """

  def __init__(self, global_conf, prefix=DEFAULT_EXTR_CHECK_PREFIX, pid=None):
    """ExtractionChecker constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    :param pid: process id
    :type pid: int
    """
    self.list_extr_prefix = []
    self.pid = pid
    self.dict_sha1_infos = dict()

    super(ExtractionChecker, self).__init__(global_conf, prefix)

    self.last_push = time.time()
    self.nb_imgs_check = 0
    self.nb_imgs_unproc = 0
    self.nb_imgs_unproc_lastprint = 0

    self.featurizer_type = self.get_required_param("featurizer_type")
    self.detector_type = self.get_required_param("detector_type")
    self.input_type = self.get_required_param("input_type")

    # Max delay
    self.max_delay = int(self.get_param("max_delay", default=3600))

    self.list_extr_prefix = [self.featurizer_type, "feat", self.detector_type, self.input_type]
    self.extr_prefix = "_".join(self.list_extr_prefix)
    self.batch_check_column = None
    self.check_columns = []
    # changed to: get column family from indexer in set_check_columns
    # Need to be built from extraction type and detection input + "_processed"
    #self.extr_family_column = self.get_param("extr_family_column", default="ext")
    # self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix
    # self.extr_check_column = self.extr_prefix_base_column_name + "_processed"
    # # Need to be built from extraction type and extraction input + "_batchid"
    # self.batch_check_column = self.extr_prefix_base_column_name + "_updateid"
    # self.check_columns = [self.extr_check_column, self.batch_check_column]

    self.set_pp()

    # Initialize indexer
    self.indexer = HBaseIndexerMinimal(self.global_conf,
                                       prefix=self.get_required_param("indexer_prefix"))
    self.indexer.pp = "CheckerHBase"
    print(self.get_required_param("indexer_prefix"), self.indexer.get_dictcf_sha1_table())
    self.set_check_columns()
    print(self.check_columns)

    # Initialize ingester
    try:
      self.ingester = GenericKafkaProcessor(self.global_conf,
                                            prefix=self.get_required_param("check_ingester_prefix"))
    except Exception as inst:
      # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay)
      # time.sleep(self.max_delay)
      # raise(inst)
      print("[{}: ERROR] Could not start ingester: {}".format(self.pp, inst))
      raise inst

    # This will not be set for HBase processing, but checker would keep dying here...
    self.updates_out_topic = None
    try:
      self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic")
    except Exception as inst:
      #print("Could not initialize 'updates_out_topic' ({}). Will write only to HBase.".format(inst))
      print("{}. Will write only to HBase.".format(inst))

    self.ingester.pp = "ec"
    if self.pid:
      self.ingester.pp += str(self.pid)

  def set_check_columns(self):
    """Set columns to be checked in indexer
    """
    # changed to: get column family from indexer
    extr_prefix_base_column_name = self.indexer.extrcf + ":" + self.extr_prefix
    extr_check_column = extr_prefix_base_column_name + "_processed"
    # Need to be built from extraction type and extraction input + "_batchid"
    self.batch_check_column = extr_prefix_base_column_name + "_updateid"
    self.check_columns = [extr_check_column, self.batch_check_column]
    #print(self.check_columns)

  def set_pp(self, pp=""):
    """Set pretty name
    """
    self.pp = "ExtractionChecker"
    self.pp += "-".join(self.list_extr_prefix)
    if self.pid:
      self.pp += "." + str(self.pid)

  def store_img_infos(self, msg):
    """Store information about the images of ``msg`` in ``self.dict_sha1_infos``

    :param msg: message (parsed JSON value of a Kafka record)
    :type msg: dict
    """
    # the raw Kafka message is a ConsumerRecord, a collections.namedtuple, see:
    # https://github.com/dpkp/kafka-python/blob/master/kafka/consumer/fetcher.py#L30
    strk = str(msg['sha1'])
    self.dict_sha1_infos[strk] = dict()
    for key in msg:
      # dumps json of 'img_info'
      # We actually need that only for DIG...
      if key == "img_info":
        self.dict_sha1_infos[strk][key] = json.dumps(msg[key])
      else:
        # discard 'img_buffer' (if it exists?...), and 'sha1'
        # if k != "img_buffer" and k != "sha1":
        #   self.dict_sha1_infos[strk][k] = msg[k]
        # discard 'sha1'
        if key != "sha1":
          self.dict_sha1_infos[strk][key] = msg[key]

  def cleanup_dict_infos(self, list_del_sha1s):
    """Remove images ``list_del_sha1s`` from ``self.dict_sha1_infos``

    :param list_del_sha1s: list of images sha1 to remove
    :type list_del_sha1s: list
    """
    for sha1 in list_del_sha1s:
      try:
        del self.dict_sha1_infos[str(sha1)]
      except:
        # could happen when cleaning up duplicates or image processed by another process
        pass

  def get_dict_push(self, list_get_sha1s, daemon=False):
    """Get dictionary to be pushed to HBase for images in ``list_get_sha1s``

    :param list_get_sha1s: list of images
    :type list_get_sha1s: list
    :param daemon: whether the checker is running in daemon mode
    :type daemon: bool
    :return: (dict_push, update_id)
    :rtype: tuple
    """
    # TODO: is this needed for every get_dict_push call?
    self.set_check_columns()
    # TODO: also pass current update_id, and move the creation of update id out of this method
    # this method should actually be used to 'claim' an image as soon as we can.
    dict_push = dict()
    # append processid to 'update_id' for safe use with multiple consumers, even after restart
    # /!\ beware, it should not contain underscores
    tmp_update_id, _ = self.indexer.get_next_update_id(today=None, extr_type=self.extr_prefix)
    update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str(time.time())
    for sha1 in list_get_sha1s:
      dict_push[str(sha1)] = dict()
      try:
        tmp_dict = self.dict_sha1_infos[str(sha1)]
      except:
        # This would mean the image has been marked as part of another batch by another process,
        # and thus deleted in a previous 'get_unprocessed_rows' call
        # This is also only relevant if we run in daemon mode...
        # TODO: for transition we won't really have any info to push except the update_id...
        if daemon:
          del dict_push[str(sha1)]
          continue
      # build column names properly i.e. appending 'info:'
      for key in tmp_dict:
        # changed to: use column_family from indexer
        # But the use of 'key' here also means we rely on the input to define column name...
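        # Illustrative example (hypothetical values): an input field 'img_url' would be pushed
        # as column '<imginfocf>:img_url', on top of the '<extrcf>:<extr_prefix>_updateid'
        # column set just below.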
        #dict_push[str(sha1)]['info:' + key] = tmp_dict[key]
        dict_push[str(sha1)][self.indexer.imginfocf + ':' + key] = tmp_dict[key]
      dict_push[str(sha1)][self.batch_check_column] = update_id
    return dict_push, update_id

  def get_unprocessed_rows(self, list_check_sha1s):
    """Get the subset of the list of sha1s ``list_check_sha1s`` that have not been processed yet

    :param list_check_sha1s: list of images sha1 to check
    :type list_check_sha1s: list
    :return: set of unprocessed images
    :rtype: set
    """
    # TODO: also pass current update_id and only delete if != from current update...
    unprocessed_rows = set(list_check_sha1s)

    if list_check_sha1s:
      # Check if the selected sha1 rows in HBase table 'sha1infos' have those check_column
      # This call will only return rows that DO have those check_column
      fam = self.indexer.get_dictcf_sha1_table()
      try:
        sha1s_rows = self.indexer.get_columns_from_sha1_rows(list_check_sha1s, self.check_columns,
                                                             families=fam)
      except Exception as inst:
        print("[{}.get_unprocessed_rows: log] fam: {}".format(self.pp, fam))
        raise inst
      #families=self.tablesha1_col_families)
      if sha1s_rows:
        # TODO: only delete if really previously processed, i.e. if != from current update...
        found_sha1_rows = set([str(row[0]) for row in sha1s_rows])
        # Clean up 'dict_sha1_infos' deleting found_sha1_rows
        self.cleanup_dict_infos(found_sha1_rows)
        set_list_check_sha1s = set(list_check_sha1s)
        # TODO: but we should not re-add them, so we should discard them from unprocessed_rows
        unprocessed_rows = set_list_check_sha1s - found_sha1_rows

    return unprocessed_rows

  def run(self, daemon=False):
    """Run extraction checker

    :param daemon: whether we are running in daemon mode
    :type daemon: bool
    :raises Exception: if check fails
    """
    i = 0
    try:
      list_sha1s_to_process = []
      # TODO: create update_id here

      while True:
        list_check_sha1s = []

        try:
          # Accumulate images infos
          for msg_json in self.ingester.consumer:
            msg = json.loads(msg_json.value)
            # i += 1
            # print((i, len(list_check_sha1s), msg))

            # msg could now contain keys 'sha1' or 'list_sha1s'
            # should we check that we can't have both or other keys?...
            if 'sha1' in msg:
              list_check_sha1s.append(str(msg['sha1']))
              # Store other fields to be able to push them too
              self.store_img_infos(msg)
            elif 'list_sha1s' in msg:
              for sha1 in msg['list_sha1s']:
                list_check_sha1s.append(str(sha1))
                # We won't have any additional infos no?
                # But we should still build a dict for each sample for consistency...
                tmp_dict = dict()
                tmp_dict['sha1'] = str(sha1)
                # will basically push an empty dict to self.dict_sha1_infos, so self.get_dict_push
                # works properly later on...
                self.store_img_infos(tmp_dict)
            else:
              print('Unknown keys in msg: {}'.format(msg.keys()))

            if len(list_check_sha1s) >= self.indexer.batch_update_size:
              break
        except Exception as inst:
          # trying to use 'consumer_timeout_ms' to raise timeout and get last samples
          msg = "[{}: warning] At {}, caught {} {} in consumer loop"
          now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
          print(msg.format(self.pp, now_str, type(inst), inst))
          sys.stdout.flush()

        if not list_check_sha1s:
          # TODO: should we fallback to scanning Hbase table here?
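          # Nothing new was consumed in this round: poll the ingester again.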
          continue

        # Check which images have not been processed (or pushed in an update) yet
        unprocessed_rows = self.get_unprocessed_rows(list_check_sha1s)
        self.nb_imgs_check += len(list_check_sha1s)
        push_delay = (time.time() - self.last_push) > self.max_delay / 60
        if push_delay and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc:
          msg = "[{}: log] Found {}/{} unprocessed images"
          print(msg.format(self.pp, self.nb_imgs_unproc, self.nb_imgs_check))
          self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc

        # TODO: we should mark those images as being 'owned' by the update we are constructing
        # (only important if we are running multiple threads i.e. daemon is True)
        # otherwise another update running at the same time could also claim it (in another ad)
        # could be handled when adding data to the searcher but duplicates in extraction process...

        # Push sha1s to be processed
        for sha1 in unprocessed_rows:
          list_sha1s_to_process.append(sha1)

        # Remove potential duplicates
        list_sha1s_to_process = list(set(list_sha1s_to_process))

        if list_sha1s_to_process:
          # Push them to HBase by batch of 'batch_update_size'
          push_delay = (time.time() - self.last_push) > self.max_delay
          full_batch = len(list_sha1s_to_process) >= self.indexer.batch_update_size
          if full_batch or (push_delay and list_sha1s_to_process):
            # Trim here to push exactly a batch of 'batch_update_size'
            list_push = list_sha1s_to_process[:min(self.indexer.batch_update_size,
                                                   len(list_sha1s_to_process))]
            # TODO: this should be done before,
            #  to 'claim' the images as soon as we plan to process them for this update

            # Gather corresponding sha1 infos
            dict_push, update_id = self.get_dict_push(list_push, daemon=daemon)
            if dict_push:
              self.nb_imgs_unproc += len(dict_push.keys())
              msg = "[{}: at {}] Pushing update {} of {} images."
              now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
              print(msg.format(self.pp, now_str, update_id, len(dict_push.keys())))
              sys.stdout.flush()

              # Push images
              fam = self.indexer.get_dictcf_sha1_table()
              if self.verbose > 4:
                msg = "[{}] Pushing images for update {} with fam {}"
                print(msg.format(self.pp, update_id, fam))
              sha1s_table = self.indexer.table_sha1infos_name
              self.indexer.push_dict_rows(dict_push, sha1s_table, families=fam)

              # Build HBase updates dict
              dict_updates_db = dict()
              now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
              list_sha1s_col = self.indexer.get_col_listsha1s()
              dict_updates_db[update_id] = {list_sha1s_col: ','.join(dict_push.keys()),
                                            self.indexer.get_col_upcreate(): now_str}
              # Push it
              fam = self.indexer.get_dictcf_update_table()
              if self.verbose > 4:
                msg = "[{}] Pushing update {} info with fam {}"
                print(msg.format(self.pp, update_id, fam))
              self.indexer.push_dict_rows(dict_updates_db, self.indexer.table_updateinfos_name,
                                          families=fam)

              # Build Kafka updates dict
              if self.updates_out_topic is not None:
                dict_updates_kafka = dict()
                dict_updates_kafka[update_id] = ','.join(dict_push.keys())
                # Push it
                self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka))

              # Gather any remaining sha1s and clean up infos
              if len(list_sha1s_to_process) > self.indexer.batch_update_size:
                list_sha1s_to_process = list_sha1s_to_process[self.indexer.batch_update_size:]
              else:
                list_sha1s_to_process = []
              # if duplicates wrt list_push, remove them. Can this still happen?
              list_sha1s_to_process = [sh1 for sh1 in list_sha1s_to_process if sh1 not in list_push]
              self.cleanup_dict_infos(list_push)
            else:
              msg = "[{}: at {}] Nothing to push for update {}"
              print(msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), update_id))
              sys.stdout.flush()

            self.last_push = time.time()
            # TODO: we should create a new update_id here,
            #  and let it claim the potential remaining images in 'list_sha1s_to_process'

          # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ?

    except Exception as inst:
      exc_type, exc_obj, exc_tb = sys.exc_info()
      fulltb = traceback.format_tb(exc_tb)
      raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))

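# Minimal configuration sketch for the Kafka-only checker above (illustrative only: actual key
# names depend on DEFAULT_EXTR_CHECK_PREFIX and on the 'indexer_prefix' and
# 'check_ingester_prefix' values used in the deployment; all values below are hypothetical
# placeholders):
#
#   conf = {
#     "<check_prefix>featurizer_type": "<featurizer>",
#     "<check_prefix>detector_type": "<detector>",
#     "<check_prefix>input_type": "<input>",
#     "<check_prefix>indexer_prefix": "<hbase_prefix>",
#     "<check_prefix>check_ingester_prefix": "<kafka_prefix>",
#     "<check_prefix>max_delay": 3600,
#     # plus the HBase parameters under "<hbase_prefix>" and Kafka parameters under "<kafka_prefix>"
#   }
#   ExtractionChecker(conf).run(daemon=False)
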
class ExtractionChecker(ConfReader):
  """ExtractionChecker class
  """

  def __init__(self, global_conf, prefix=DEFAULT_EXTR_CHECK_PREFIX, pid=None):
    """ExtractionChecker constructor

    :param global_conf: configuration file or dictionary
    :type global_conf: str, dict
    :param prefix: prefix in configuration
    :type prefix: str
    :param pid: process id
    :type pid: int
    """
    self.list_extr_prefix = []
    self.pid = pid
    self.dict_sha1_infos = dict()

    super(ExtractionChecker, self).__init__(global_conf, prefix)

    self.last_push = time.time()
    self.nb_imgs_check = 0
    self.nb_imgs_unproc = 0
    self.nb_imgs_unproc_lastprint = 0

    self.featurizer_type = self.get_required_param("featurizer_type")
    self.detector_type = self.get_required_param("detector_type")
    self.input_type = self.get_required_param("input_type")

    # Max delay
    self.max_delay = int(self.get_param("max_delay", default=DEFAULT_MAX_DELAY))
    self.min_len_check = int(self.get_param("min_len_check", default=DEFAULT_MIN_LENGTH_CHECK))

    self.list_extr_prefix = [self.featurizer_type, "feat", self.detector_type, self.input_type]
    self.extr_prefix = "_".join(self.list_extr_prefix)
    self.batch_check_column = None
    self.check_columns = []
    # changed to: get column family from indexer in set_check_columns
    # Need to be built from extraction type and detection input + "_processed"
    #self.extr_family_column = self.get_param("extr_family_column", default="ext")
    # self.extr_prefix_base_column_name = self.extr_family_column + ":" + self.extr_prefix
    # self.extr_check_column = self.extr_prefix_base_column_name + "_processed"
    # # Need to be built from extraction type and extraction input + "_batchid"
    # self.batch_check_column = self.extr_prefix_base_column_name + "_updateid"
    # self.check_columns = [self.extr_check_column, self.batch_check_column]

    self.set_pp()

    # Initialize indexer
    self.indexer = HBaseIndexerMinimal(self.global_conf,
                                       prefix=self.get_required_param("indexer_prefix"))
    self.indexer.pp = "CheckerHBase"
    print(self.get_required_param("indexer_prefix"), self.indexer.get_dictcf_sha1_table())
    self.set_check_columns()
    print(self.check_columns)

    # Initialize ingester, that could now be Kafka or Kinesis. Should we have a default?
    ingester_type = self.get_required_param("image_ingestion_type")
    try:
      if ingester_type == "kafka":
        self.ingester = KafkaIngester(self.global_conf,
                                      prefix=self.get_required_param("check_ingester_prefix"))
      elif ingester_type == "kinesis":
        self.ingester = KinesisIngester(self.global_conf,
                                        prefix=self.get_required_param("check_ingester_prefix"))
      else:
        raise ValueError("Unknown 'ingester_type': {}".format(ingester_type))
    except Exception as inst:
      # print "Could not initialize checker, sleeping for {}s.".format(self.max_delay)
      # time.sleep(self.max_delay)
      # raise(inst)
      print("[{}: ERROR] Could not start ingester: {}".format(self.pp, inst))
      raise inst

    # Initialize producer
    # TODO: also check for 'update_ingestion_type' as producer_type?
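    # 'update_ingestion_type' selects where new update ids are announced: 'kafka' or 'kinesis'
    # creates a pusher below, while 'hbase' only writes update rows to the updates table
    # (illustrative summary of the branches that follow).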
    producer_type = self.get_param("update_ingestion_type", DEFAULT_UPDATE_INGESTION_TYPE)
    # TODO: create a producer if 'update_ingestion_type' is Kinesis or Kafka
    # if producer_type != "hbase":
    #   self.updates_out_topic = self.ingester.get_required_param("producer_updates_out_topic")
    if producer_type == "kafka":
      self.pusher = KafkaPusher(self.global_conf,
                                prefix=self.get_required_param("check_ingester_prefix"))
    elif producer_type == "kinesis":
      self.pusher = KinesisPusher(self.global_conf,
                                  prefix=self.get_required_param("check_ingester_prefix"))
    elif producer_type == "hbase":
      self.pusher = None
      print("[{}: log] Will write updates only to HBase.".format(self.pp))
    else:
      raise ValueError("Unknown 'producer_type': {}".format(producer_type))

    #self.ingester.pp = self.get_param("pp", "ImageIngester")
    # Only if daemon mode, as we may have multiple ingesters
    # But for Kinesis the `shard_infos_filename` may not be re-used...
    #if self.pid:
    #  self.ingester.pp += str(self.pid)

  def set_check_columns(self):
    """Set columns to be checked in indexer
    """
    # changed to: get column family from indexer
    # TODO: get the suffixes as global variables maybe from common.defaults
    extr_prefix_base_column_name = self.indexer.extrcf + ":" + self.extr_prefix
    extr_check_column = extr_prefix_base_column_name + "_processed"
    # Need to be built from extraction type and extraction input + "_batchid"
    self.batch_check_column = extr_prefix_base_column_name + "_updateid"
    self.check_columns = [extr_check_column, self.batch_check_column]
    #print(self.check_columns)

  def set_pp(self, pp=""):
    """Set pretty name
    """
    self.pp = "ExtractionChecker"
    self.pp += "-".join(self.list_extr_prefix)
    if self.pid:
      self.pp += "." + str(self.pid)

  def store_img_infos(self, msg):
    """Store information about the images of ``msg`` in ``self.dict_sha1_infos``

    :param msg: message
    :type msg: dict
    """
    strk = str(msg['sha1']).upper()
    self.dict_sha1_infos[strk] = dict()
    for key in msg:
      # dumps json of 'img_info'
      # We actually need that only for DIG...
      if key == "img_info":
        self.dict_sha1_infos[strk][key] = json.dumps(msg[key])
      else:
        # discard 'img_buffer' (if it exists?...), and 'sha1'
        # if k != "img_buffer" and k != "sha1":
        #   self.dict_sha1_infos[strk][k] = msg[k]
        # discard 'sha1'
        if key != "sha1":
          self.dict_sha1_infos[strk][key] = msg[key]

  def cleanup_dict_infos(self, list_del_sha1s):
    """Remove images ``list_del_sha1s`` from ``self.dict_sha1_infos``

    :param list_del_sha1s: list of images sha1 to remove
    :type list_del_sha1s: list
    """
    for sha1 in list_del_sha1s:
      try:
        del self.dict_sha1_infos[str(sha1)]
      except:
        # could happen when cleaning up duplicates or image processed by another process
        pass

  def get_dict_push(self, list_get_sha1s, daemon=False):
    """Get dictionary to be pushed to HBase for images in ``list_get_sha1s``

    :param list_get_sha1s: list of images
    :type list_get_sha1s: list
    :param daemon: whether the checker is running in daemon mode
    :type daemon: bool
    :return: (dict_push, update_id)
    :rtype: tuple
    """
    # TODO: is this needed for every get_dict_push call?
    self.set_check_columns()
    # TODO: also pass current update_id, and move the creation of update id out of this method
    # this method should actually be used to 'claim' an image as soon as we can.
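    # Note on the update id built below (illustrative, hypothetical values): it concatenates
    # the indexer-provided next update id, the ingester pretty-print name and a timestamp,
    # e.g. '<next_update_id>-<ingester_pp>-1514764800.0'; hyphens are used since underscores
    # are reserved (see warning below).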
    dict_push = dict()
    # append processid to 'update_id' for safe use with multiple consumers, even after restart
    # /!\ beware, it should not contain underscores
    tmp_update_id, _ = self.indexer.get_next_update_id(today=None, extr_type=self.extr_prefix)
    update_id = tmp_update_id + '-' + self.ingester.pp + '-' + str(time.time())
    for sha1 in list_get_sha1s:
      dict_push[str(sha1)] = dict()
      try:
        tmp_dict = self.dict_sha1_infos[str(sha1)]
      except:
        # This would mean the image has been marked as part of another batch by another process,
        # and thus deleted in a previous 'get_unprocessed_rows' call
        # This is also only relevant if we run in daemon mode...
        # TODO: for transition we won't really have any info to push except the update_id...
        if daemon:
          del dict_push[str(sha1)]
          continue
      # build column names properly i.e. appending 'info:'
      for key in tmp_dict:
        # changed to: use column_family from indexer
        # But the use of 'key' here also means we rely on the input to define column name...
        #dict_push[str(sha1)]['info:' + key] = tmp_dict[key]
        dict_push[str(sha1)][self.indexer.imginfocf + ':' + key] = tmp_dict[key]
      dict_push[str(sha1)][self.batch_check_column] = update_id
    return dict_push, update_id

  def get_unprocessed_rows(self, list_check_sha1s):
    """Get the subset of the list of sha1s ``list_check_sha1s`` that have not been processed yet

    :param list_check_sha1s: list of images sha1 to check
    :type list_check_sha1s: list
    :return: set of unprocessed images
    :rtype: set
    """
    # TODO: also pass current update_id and only delete if != from current update...
    unprocessed_rows = set(list_check_sha1s)

    if list_check_sha1s:
      # Check if the selected sha1 rows in HBase table 'sha1infos' have those check_column
      # This call will only return rows that DO have those check_column
      fam = self.indexer.get_dictcf_sha1_table()
      try:
        sha1s_rows = self.indexer.get_columns_from_sha1_rows(list_check_sha1s, self.check_columns,
                                                             families=fam)
      except Exception as inst:
        print("[{}.get_unprocessed_rows: log] fam: {}".format(self.pp, fam))
        raise inst
      #families=self.tablesha1_col_families)
      if sha1s_rows:
        # TODO: only delete if really previously processed, i.e. if != from current update...
        found_sha1_rows = set([str(row[0]) for row in sha1s_rows])
        # Clean up 'dict_sha1_infos' deleting found_sha1_rows
        self.cleanup_dict_infos(found_sha1_rows)
        set_list_check_sha1s = set(list_check_sha1s)
        # TODO: but we should not re-add them, so we should discard them from unprocessed_rows
        unprocessed_rows = set_list_check_sha1s - found_sha1_rows

    return unprocessed_rows

  def run(self, daemon=False):
    """Run extraction checker

    :param daemon: whether we are running in daemon mode
    :type daemon: bool
    :raises Exception: if check fails
    """
    # import inspect
    # if not inspect.isgeneratorfunction(self.ingester.get_msg_json()):
    #   msg = "[{}: Warning] Ingester {} function `get_msg_json` is not a generator"
    #   print(msg.format(self.pp, type(self.ingester)))
    try:
      list_sha1s_to_process = []
      list_check_sha1s = []
      # TODO: create update_id here
      if self.verbose > 1:
        msg = "[{}: log] Start run main loop"
        print(msg.format(self.pp))

      while True:
        try:
          # Accumulate images infos
          #while len(list_check_sha1s) < self.indexer.batch_update_size:
          #while len(list_check_sha1s) < self.min_len_check:
          for msg in self.ingester.get_msg_json():
            try:
              # Fix if input was JSON dumped twice?
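              # If the ingester yields a plain string here (e.g. a payload that was JSON-encoded
              # twice upstream, a hypothetical producer behaviour), decode it once more to get
              # the expected dict.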
              if not isinstance(msg, dict):
                msg = json.loads(msg)

              # msg could now contain keys 'sha1' or 'list_sha1s'
              if 'sha1' in msg:
                list_check_sha1s.append(str(msg['sha1']).upper())
                # Store other fields to be able to push them too
                self.store_img_infos(msg)
              elif 'list_sha1s' in msg:
                for sha1 in msg['list_sha1s']:
                  list_check_sha1s.append(str(sha1).upper())
                  # We won't have any additional infos no?
                  # But we should still build a dict for each sample for consistency...
                  tmp_dict = dict()
                  tmp_dict['sha1'] = str(sha1).upper()
                  # will basically push a dict with just the sha1 to self.dict_sha1_infos,
                  # so self.get_dict_push works properly later on...
                  self.store_img_infos(tmp_dict)
              else:
                raise ValueError('Unknown keys in msg: {}'.format(msg.keys()))

              # This is dangerous, as it assumes the self.ingester.get_msg_json() generator
              # would restart from the next point... Is this the case for Kafka?
              prev_len = len(list_check_sha1s)
              list_check_sha1s = list(set(list_check_sha1s))
              if len(list_check_sha1s) < prev_len:
                msg = "[{}: log] Removed {} duplicates from `list_check_sha1s`"
                print(msg.format(self.pp, prev_len - len(list_check_sha1s)))
              if len(list_check_sha1s) >= self.indexer.batch_update_size:
                break
            except Exception as inst:
              pr_msg = "[{}: ERROR] Could not process message: {}. {}"
              print(pr_msg.format(self.pp, msg, inst))

        except Exception as inst:
          pr_msg = "[{}: at {} ERROR] Caught {} {} in consumer loop"
          now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
          print(pr_msg.format(self.pp, now_str, type(inst), inst))
          if msg is not None:
            print(msg)
          sys.stdout.flush()

        if self.verbose > 3:
          msg = "[{}: log] Gathered {} images to check so far"
          msg = msg.format(self.pp, len(list_check_sha1s))
          msg2 = ""
          if len(list_check_sha1s) > 0:
            msg2 = " (first: {}, last: {})"
            msg2 = msg2.format(list_check_sha1s[0], list_check_sha1s[-1])
          print(msg + msg2)

        # To be able to push one (non-empty) update every max_delay
        #if not list_check_sha1s and (time.time() - self.last_push) < self.max_delay:
        if (len(list_check_sha1s) < self.indexer.batch_update_size
                and (time.time() - self.last_push) < self.max_delay):
          time.sleep(1)
          continue

        self.nb_imgs_check += len(list_check_sha1s)
        push_delay = (time.time() - self.last_push) > max(int(self.max_delay / 60), 10)
        if push_delay and self.nb_imgs_unproc_lastprint != self.nb_imgs_unproc:
          msg = "[{}: log] Pushed {}/{} unprocessed images so far"
          print(msg.format(self.pp, self.nb_imgs_unproc, self.nb_imgs_check))
          self.nb_imgs_unproc_lastprint = self.nb_imgs_unproc

        if list_check_sha1s:
          # Check which images have not been processed (or pushed in an update) yet
          # This seems slow
          start_check = time.time()
          unprocessed_rows = self.get_unprocessed_rows(list_check_sha1s)
          msg = "[{}: log] Found {}/{} unprocessed images in {:.2f}s"
          print(msg.format(self.pp, len(unprocessed_rows), len(list_check_sha1s),
                           time.time() - start_check))
          if len(unprocessed_rows) != len(list_check_sha1s) and self.verbose > 5:
            already_processed = list(set(list_check_sha1s) - set(unprocessed_rows))
            msg = "[{}: log] Images ".format(self.pp)
            for ap in already_processed:
              msg += "{} ".format(ap)
            msg += "were already processed."
            print(msg)
          #unprocessed_rows = self.get_unprocessed_rows(list_check_sha1s)

          # TODO: we should mark those images as being 'owned' by the update we are constructing
          # (only important if we are running multiple threads i.e. daemon is True)
          # otherwise another update running at the same time could also claim it (in another ad)
          # could be handled when adding data to the searcher but duplicates in extraction process...
          # Push sha1s to be processed
          for sha1 in unprocessed_rows:
            list_sha1s_to_process.append(sha1)

          # Remove potential duplicates
          list_sha1s_to_process = list(set(list_sha1s_to_process))
          list_check_sha1s = []

        if list_sha1s_to_process:
          # Push them to HBase by batch of 'batch_update_size'
          push_delay = (time.time() - self.last_push) > self.max_delay
          full_batch = len(list_sha1s_to_process) >= self.indexer.batch_update_size
          if full_batch or (push_delay and list_sha1s_to_process):
            # Trim here to push exactly a batch of 'batch_update_size'
            list_push = list_sha1s_to_process[:min(self.indexer.batch_update_size,
                                                   len(list_sha1s_to_process))]
            # TODO: this should be done before,
            #  to 'claim' the images as soon as we plan to process them for this update

            # Gather corresponding sha1 infos
            dict_push, update_id = self.get_dict_push(list_push, daemon=daemon)
            if dict_push:
              self.nb_imgs_unproc += len(dict_push.keys())
              msg = "[{}: at {}] Pushing update {} of {} images."
              now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
              print(msg.format(self.pp, now_str, update_id, len(dict_push.keys())))
              sys.stdout.flush()

              # Push images
              fam = self.indexer.get_dictcf_sha1_table()
              if self.verbose > 5:
                msg = "[{}] Pushing images for update {} with fam {}"
                print(msg.format(self.pp, update_id, fam))
              sha1s_table = self.indexer.table_sha1infos_name
              self.indexer.push_dict_rows(dict_push, sha1s_table, families=fam)

              # Build HBase updates dict
              dict_updates_db = dict()
              now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
              list_sha1s_col = self.indexer.get_col_listsha1s()
              dict_updates_db[update_id] = {list_sha1s_col: ','.join(dict_push.keys()),
                                            self.indexer.get_col_upcreate(): now_str}
              # Push it
              fam = self.indexer.get_dictcf_update_table()
              if self.verbose > 5:
                msg = "[{}] Pushing update {} info with fam {}"
                print(msg.format(self.pp, update_id, fam))
              self.indexer.push_dict_rows(dict_updates_db, self.indexer.table_updateinfos_name,
                                          families=fam)

              # Build pusher updates dict
              if self.pusher is not None:
                dict_updates_kafka = dict()
                dict_updates_kafka[update_id] = ','.join(dict_push.keys())
                # Push it
                #self.ingester.producer.send(self.updates_out_topic, json.dumps(dict_updates_kafka))
                #self.pusher.send(self.updates_out_topic, dict_updates_kafka)
                self.pusher.send(dict_updates_kafka)

              # Gather any remaining sha1s and clean up infos
              if len(list_sha1s_to_process) > self.indexer.batch_update_size:
                list_sha1s_to_process = list_sha1s_to_process[self.indexer.batch_update_size:]
              else:
                list_sha1s_to_process = []
              # if duplicates wrt list_push, remove them. Can this still happen?
              list_sha1s_to_process = [sh1 for sh1 in list_sha1s_to_process if sh1 not in list_push]
              self.cleanup_dict_infos(list_push)
            else:
              msg = "[{}: at {}] Nothing to push for update {}"
              print(msg.format(self.pp, datetime.now().strftime('%Y-%m-%d:%H.%M.%S'), update_id))
              sys.stdout.flush()

            self.last_push = time.time()
            # TODO: we should create a new update_id here,
            #  and let it claim the potential remaining images in 'list_sha1s_to_process'
            # sanity check that len(list_sha1s_to_process) == len(self.dict_sha1_infos) ?
          else:
            if self.verbose > 3:
              msg = "[{}: at {}] Gathered {} images so far..."
              now_str = datetime.now().strftime('%Y-%m-%d:%H.%M.%S')
              print(msg.format(self.pp, now_str, len(list_sha1s_to_process)))

    except Exception as inst:
      exc_type, exc_obj, exc_tb = sys.exc_info()
      fulltb = traceback.format_tb(exc_tb)
      raise type(inst)(" {} ({})".format(inst, ''.join(fulltb)))
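
# Minimal usage sketch, assuming this module's imports (HBaseIndexerMinimal, the Kafka/Kinesis
# ingesters and pushers, and the DEFAULT_* constants) are available and that the configuration
# file provides the parameters read above ('featurizer_type', 'detector_type', 'input_type',
# 'indexer_prefix', 'check_ingester_prefix', 'image_ingestion_type', 'update_ingestion_type')
# under the expected prefixes. The configuration path and pid are hypothetical.
if __name__ == "__main__":
  checker = ExtractionChecker("conf/conf_extraction_checker.json", pid=1)
  checker.run(daemon=False)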