def load_state(self, allow_cached=True):
    if not allow_cached or self._cached_state is None:
        # First, get a list of all files
        all_datafile_keys = self._get_datafile_object_keys()
        items = []
        # Then for each datafile, append to items
        for datafile in all_datafile_keys:
            txt_content = read_s3_text(
                self.s3_client, self.s3_bucket_name, datafile
            )
            signal_type = self.get_signal_type_from_object_key(datafile)
            indicator_type = self.indicator_type_str_from_signal_type(
                signal_type
            )
            if txt_content is None:
                logger.warning(
                    "No TE state for %d. First run?", self.privacy_group
                )
            elif indicator_type is None:
                logger.warning(
                    "Could not identify indicator type for signal with type: %s. Will not process.",
                    signal_type.get_name(),
                )
            else:
                csv.field_size_limit(65535)  # dodge field size problems
                for row in csv.reader(txt_content):
                    items.append(
                        HMASerialization(
                            row[0],
                            indicator_type,
                            row[1],
                            SimpleDescriptorRollup.from_row(row[2:]),
                        )
                    )
        logger.info("%d rows loaded for %d", len(items), self.privacy_group)
        # Do all in one assignment just in case of threads
        self._cached_state = {item.key: item for item in items}
    return self._cached_state
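
# --- Hedged sketch, not this codebase's actual helper. ---
# read_s3_text is called above but not shown here. This is a minimal
# compatible sketch, assuming a boto3 S3 client and only the contract
# implied by the call sites: return a line iterable that csv.reader can
# consume, or None when the object does not exist (the "First run?" case).
# The name read_s3_text_sketch and the StringIO return type are
# assumptions for illustration.
import io
import typing


def read_s3_text_sketch(
    s3_client, bucket_name: str, key: str
) -> typing.Optional[io.StringIO]:
    """Fetch an S3 object's body as text, or None if the key is missing."""
    try:
        response = s3_client.get_object(Bucket=bucket_name, Key=key)
    except s3_client.exceptions.NoSuchKey:
        # No state file yet; caller logs and treats this as a first run.
        return None
    # StringIO iterates line-by-line, which is exactly what csv.reader wants.
    return io.StringIO(response["Body"].read().decode("utf-8"))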
def _apply_updates_impl(
    self,
    delta: tu.ThreatUpdatesDelta,
    post_apply_fn=lambda x: None,
) -> None:
    state: t.Dict = {}
    updated: t.Dict = {}
    if delta.start > 0:
        state = self.load_state()
    for update in delta:
        item = HMASerialization.from_threat_updates_json(
            self.app_id, update.raw_json
        )
        if update.should_delete:
            state.pop(item.key, None)
        else:
            state[item.key] = item
            updated[item.key] = item
    self._store_state(state.values())
    self._cached_state = state
    post_apply_fn(updated)
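
# --- Hedged illustration of the merge semantics above, using stand-in
# types (FakeItem / FakeUpdate are hypothetical, not hmalib classes). ---
# A delta with start > 0 is incremental, so the current state is loaded
# and patched in place; a delta with start == 0 is a full refresh from an
# empty dict. Deletes remove keys, everything else upserts.
from dataclasses import dataclass


@dataclass
class FakeItem:
    key: str


@dataclass
class FakeUpdate:
    item: FakeItem
    should_delete: bool


def merge_sketch(state: dict, updates: list) -> dict:
    """Apply tombstones and upserts the same way the loop above does."""
    for update in updates:
        if update.should_delete:
            state.pop(update.item.key, None)
        else:
            state[update.item.key] = update.item
    return state


_state = {"old": FakeItem("old")}
_updates = [
    FakeUpdate(FakeItem("old"), should_delete=True),   # tombstone
    FakeUpdate(FakeItem("new"), should_delete=False),  # upsert
]
assert set(merge_sketch(_state, _updates)) == {"new"}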
def load_state(self, allow_cached=True):
    if not allow_cached or self._cached_state is None:
        txt_content = read_s3_text(self.s3_bucket, self.data_s3_key)
        items = []
        if txt_content is None:
            logger.warning("No TE state for %d. First run?", self.privacy_group)
        else:
            # Violate your warranty with module state!
            csv.field_size_limit(65535)  # dodge field size problems
            for row in csv.reader(txt_content):
                items.append(
                    HMASerialization(
                        row[0],
                        "HASH_PDQ",
                        row[1],
                        SimpleDescriptorRollup.from_row(row[2:]),
                    )
                )
        logger.info("%d rows loaded for %d", len(items), self.privacy_group)
        # Do all in one assignment just in case of threads
        self._cached_state = {item.key: item for item in items}
    return self._cached_state
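
# --- Note on csv.field_size_limit, flagged by the warranty comment above. ---
# field_size_limit mutates module-global state shared by every csv reader
# in the process, which is what the "Violate your warranty" comment is
# warning about. A standard-library-only demonstration: the call returns
# the previous limit, so a caller who cares can restore it afterwards.
import csv
import io

old_limit = csv.field_size_limit(65535)  # set new limit, get the old one back
try:
    rows = list(csv.reader(io.StringIO("a,b\n")))
finally:
    csv.field_size_limit(old_limit)  # restore, so the change doesn't leak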