def prepare_consolidation(self, fetched_df: pd.DataFrame, max_rows_per_file: int, **args):
    if fetched_df is None or len(fetched_df) == 0:
        kl.info('empty dataframe, nothing to save.')
        return
    kl.info(f'saving consolidated data for {len(fetched_df)} rows...')
    kprof = kp.KProfiler()
    kprof.log_mem('memory usage')
    # slice by table
    tables = set(fetched_df['table'].unique())
    for table in tables:
        table_slice = fetched_df.loc[fetched_df['table'] == table]
        kl.debug(f'preparing data for table {table} ({len(table_slice)} items)...')
        # slice by time
        for (time_slice_df, time_slice_id, time_slice_ref) in self.time_slicing(table_slice):
            # slice into files and prepare dataframe
            for (prepared_file_df, current_file, n_files) in self.rows_slicing(time_slice_df, max_rows_per_file):
                self.save_consolidation(prepared_file_df, table,
                                        time_slice_id=time_slice_id,
                                        time_slice_ref=time_slice_ref,
                                        current_file=current_file,
                                        n_files=n_files, **args)
                kprof.log_mem('memory usage before gc')
                del prepared_file_df
                gc.collect()
                kprof.log_mem('memory usage after gc')
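# `rows_slicing` is referenced above but not shown here. A minimal sketch of what
# such a generator might look like, assuming it only needs to yield
# (chunk_df, current_file, n_files) tuples of at most max_rows_per_file rows
# (the name `rows_slicing_sketch` is hypothetical):
import math

import pandas as pd


def rows_slicing_sketch(df: pd.DataFrame, max_rows_per_file: int):
    """Yield (chunk, current_file, n_files), splitting df into row-bounded chunks."""
    n_files = max(1, math.ceil(len(df) / max_rows_per_file))
    for i in range(n_files):
        chunk = df.iloc[i * max_rows_per_file:(i + 1) * max_rows_per_file]
        yield chunk, i + 1, n_files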
def purge(self, workers: Optional[List[str]], results: bool = False):
    kl.debug(f'purge {self.name}: start.')
    if workers is None and not results:
        kl.info('purge: no action defined - nothing to do!')
        return
    qs = self.queue_sizes()
    state = self.fetcher_state(qs)
    if state == 'idle':
        kl.info(f'purge: fetcher state {state}, nothing to do')
        return
    items = []
    if workers is not None:
        items.extend(workers)
    if results:
        items.append('results')
    for w in items:
        if w == 'results':
            queue = self.results_queue
        else:
            queue = self.worker_queue(w)
        n = qs[queue]
        if n > 0:
            kl.info(f"queue for '{w}' ({queue}) has {n} messages: purging")
            ksqs.purge_queue(queue)
        else:
            kl.info(f"queue for '{w}' ({queue}) has {n} messages: nothing to do")
    kl.debug(f'purge {self.name}: finish.')
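# Example usage (hypothetical fetcher instance and strategy names): purge two
# worker queues plus the results queue in one call. Queues are only purged when
# they still hold messages and the fetcher is not idle.
# fetcher.purge(workers=['default', 'fallback'], results=True)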
def work(self):
    self.check_worker_state()
    if self.state != 'working':
        kl.warn(f'Nothing to do: worker in state {self.state}')
        return
    threads = []
    for i in range(self.n_threads):
        t = threading.Thread(target=self.fetcher_thread_loop, args=(i,))
        t.start()
        threads.append(t)
    # wait for queue to be processed
    for t in threads:
        t.join()
    kl.info(f'worker for {self.extractor} finished: fetcher state {self.state}')
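# `fetcher_thread_loop` is referenced above but not shown. A minimal sketch of
# the shape such a loop might take, assuming each thread pops items until its
# queues are drained; `fetch_single` and `push_results` are hypothetical names,
# while `create_thread_context` and `pop_best_work_queue_item` appear below:
def fetcher_thread_loop_sketch(self, thread_index: int):
    context = self.create_thread_context()  # per-thread SQS client
    while True:
        item = self.pop_best_work_queue_item(self.extractor, context)
        if item is None:  # all priority queues drained
            break
        result = self.fetch_single(item)  # hypothetical per-item fetch
        self.push_results([result])       # hypothetical results-queue write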
def rebalance(self, from_strategy: str, to_strategy: str, items_cnt: int):
    rebalance_cnt = min(items_cnt, 100_000)
    kl.debug(f'rebalancing {rebalance_cnt} of {items_cnt} items'
             f' from {from_strategy} to {to_strategy}')
    from_queue = self.worker_queue(from_strategy)
    to_queue = self.worker_queue(to_strategy)
    if from_queue is None or to_queue is None:
        kl.error('rebalance not possible: invalid strategy')
    else:
        items = self.fetch_items(from_queue, max_items=rebalance_cnt)
        rebalanced_items = [i.reset_strategy(to_strategy, reset_errors=True) for i in items]
        self.populate_worker_queue(rebalanced_items, to_strategy)
        handles = [i.handle for i in items]
        ksqs.remove_messages(queue_name=from_queue, receipt_handles=handles)
    kl.info('rebalance: finished')
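# Example usage (hypothetical strategy names): move up to 5,000 queued items
# from the 'browser' strategy to the cheaper 'api' strategy, clearing their
# error counters so they are retried from scratch.
# fetcher.rebalance(from_strategy='browser', to_strategy='api', items_cnt=5_000)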
def kickoff(self, table: str, max_keys: Optional[int] = None,
            add_keys: Optional[List[str]] = None,
            method: Optional[str] = None, scope: Optional[str] = None,
            priority: Optional[int] = None, if_empty: bool = False,
            wait_empty: bool = False, empty_priority: Optional[int] = None,
            extractors: Optional[List[str]] = None, **args) -> bool:
    _extractors = extractors if extractors is not None else self.extractors
    # test whether it is ready to kickoff
    if if_empty:
        kickoff_ready, state = self.kickoff_ready(empty_priority)
        if not kickoff_ready:
            kl.info(f'cannot kickoff {self.name} table {table}: current state is {state}.')
            return False
    elif wait_empty:
        wait_time_seconds = 60
        while True:
            kickoff_ready, state = self.kickoff_ready(empty_priority)
            if kickoff_ready:
                break
            kl.info(f'waiting {wait_time_seconds}s for kickoff {self.name} table {table}:'
                    f' current state is {state}.')
            time.sleep(wait_time_seconds)
    # keys and initial strategies
    items = self.keys_to_fetch(table=table, max_keys=max_keys, add_keys=add_keys,
                               method=method, scope=scope, **args)
    if items is None or len(items) == 0:
        kl.info(f'cannot kickoff {self.name} table {table}: nothing to fetch.')
        return False
    # set priority, cohort, creation time
    if priority is not None:
        items = [x.set_priority(priority) for x in items]
    # set initial extractor
    if len(self.extractors) > 0:
        items = self.set_initial_extractor(items)
    for extractor in _extractors:
        extractor_items = [x for x in items if x.extractor == extractor]
        kl.debug(f'populating extractor {extractor} with {len(extractor_items)} items.')
        self.populate_worker_queue(extractor_items, extractor=extractor, priority=priority)
    kl.debug(f'kickoff completed for {self.name} table {table}.')
    return True
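# Example usage (hypothetical table name and argument values): kick off a fetch
# for the 'products' table, waiting until the relevant queues are empty before
# populating them, with all items pinned to priority 1.
# fetcher.kickoff(table='products', max_keys=100_000, priority=1,
#                 wait_empty=True, empty_priority=1)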
class SqsFetcher:
    def __init__(self, name: str, results_queue: str, worker_queues: List[str],
                 strategies: Optional[List[str]] = None, staging: bool = False):
        assert len(worker_queues) > 0
        self.name = name
        self.strategies = strategies if strategies is not None else ['default']
        self.is_multi_strategy = len(self.strategies) > 1
        self.staging = staging
        queue_prefix = ''
        if staging:
            queue_prefix = 'staging-'
        self.worker_queues = [queue_prefix + q for q in worker_queues]
        self.results_queue = queue_prefix + results_queue

    #
    # general
    #

    def queue_sizes(self, wait_seconds: Optional[int] = None,
                    sqs_client=None) -> Dict[str, int]:
        """Returns the approximate message count for all queues.
        Retries if any count is zero."""
        # FIXME wait_seconds should be 60
        if wait_seconds is None:
            wait_seconds = 2
        i = 0
        qs_results = 0
        qs_workers = 0
        retries = 2
        kl.trace(f'getting queue sizes (wait up to {wait_seconds * (retries + 1)} s)')
        qs = {}
        queues = [self.results_queue] + self.worker_queues
        while i < retries and (qs_results == 0 or qs_workers == 0):
            for q in queues:
                attr = ksqs.queue_attributes(q, sqs_client=sqs_client)
                available = int(attr['ApproximateNumberOfMessages'])
                in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
                delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
                qs[q] = available + in_flight + delayed
            qs_results = qs[self.results_queue]
            qs_workers = sum([qs[queue_name] for queue_name in self.worker_queues])
            i += 1
            # sleep and retry if any queue has 0 elements
            if i < retries and (qs_results == 0 or qs_workers == 0):
                time.sleep(wait_seconds)
        qs_str = ', '.join([f'{q}: {qs[q]}' for q in qs])
        # kl.trace('queue sizes: ' + qs_str)
        return qs

    def fetcher_state(self, qs: Optional[Dict[str, int]] = None, sqs_client=None,
                      wait_seconds: Optional[int] = None) -> str:
        """Returns the current fetcher state: working, consolidation or idle."""
        if qs is None:
            qs = self.queue_sizes(sqs_client=sqs_client, wait_seconds=wait_seconds)
        qs_results = qs[self.results_queue]
        qs_workers = sum([qs[queue_name] for queue_name in self.worker_queues])
        if qs_results + qs_workers == 0:
            return 'idle'
        elif qs_workers == 0:
            return 'consolidation'
        else:
            return 'working'

    def worker_queue(self, strategy: str = None) -> Optional[str]:
        if strategy is None:
            return self.worker_queues[0]
        elif strategy in self.strategies:
            return self.worker_queues[self.strategies.index(strategy)]
        else:
            kl.error(f'invalid worker strategy: {strategy}')
            return None

    #
    # queue access
    #

    @classmethod
    def fetch_items(cls, queue_name: str, max_items=1,
                    sqs_client=None) -> List[FetcherItem]:
        """Returns a list of FetcherItems, each carrying its message receipt handle."""
        items = ksqs.receive_messages(queue_name=queue_name, max_messages=max_items,
                                      sqs_client=sqs_client)
        if items is None:
            return []
        return [FetcherItem.from_string(items[handle], handle=handle) for handle in items]

    #
    # kickoff
    #

    @abstractmethod
    def keys_to_fetch(self, max_fetch: Optional[int] = None,
                      force_fetch: Optional[List[str]] = None) -> List[str]:
        return []

    def set_initial_strategy(self, df: pd.DataFrame,
                             strategies: Optional[List[str]] = None):
        valid_strategies = strategies
        if strategies is None:
            valid_strategies = self.strategies
        df['strategy'] = valid_strategies[0]

    @classmethod
    def build_items_list(cls, keys_strategies: Dict[str, Optional[str]],
                         ref_ts: datetime.datetime) -> List[FetcherItem]:
        return [FetcherItem(key=k, start_ts=ref_ts) for k in keys_strategies]

    def populate_worker_queue(self, items: List[FetcherItem], strategy: str):
        contents = [i.to_string() for i in items]
        ksqs.send_messages(self.worker_queue(strategy=strategy), contents)

    def kickoff(self, max_fetch: Optional[int] = None,
                force_fetch: Optional[List[str]] = None,
                strategies: Optional[List[str]] = None, force=False) -> bool:
        if strategies is None:
            strategies = self.strategies
        # test fetch state
        state = self.fetcher_state()
        if state != 'idle' and not force:
            kl.info(f'cannot kickoff {self.name}: current state is {state}.')
            return False
        # keys and initial strategies
        df = pd.DataFrame(self.keys_to_fetch(max_fetch=max_fetch,
                                             force_fetch=force_fetch),
                          columns=['key'])
        if len(df) == 0:
            kl.info(f'cannot kickoff {self.name}: 0 ids to fetch.')
            return False
        self.set_initial_strategy(df, strategies=strategies)
        ref_ts = datetime.datetime.now()
        ref_ts_str = ref_ts.strftime('%Y%m%d_%H%M%S')
        kl.debug(f'populating {self.name} with {len(df)} elements, ref {ref_ts_str}.')

        def row_to_item(row):
            return FetcherItem(key=row['key'], start_ts=ref_ts, strategy=row['strategy'])

        df['item'] = df.apply(row_to_item, axis=1)
        for strategy in strategies:
            df_strategy = df[df['strategy'] == strategy]
            kl.debug(f'putting {len(df_strategy)} in {strategy} queue...')
            self.populate_worker_queue(df_strategy['item'].tolist(), strategy)
        kl.debug(f'kickoff completed for {self.name}, ref {ref_ts_str}.')
        return True

    #
    # consolidator
    #

    @abstractmethod
    def save_results(self, df: pd.DataFrame, strategy: str, ref_ts: datetime.datetime,
                     current_file: int, n_files: int, output_folder: str,
                     local_only: bool):
        pass

    @abstractmethod
    def save_consolidated(self, fetched_df: pd.DataFrame, **args):
        pass

    def data_to_df(self, fetched_data: list) -> pd.DataFrame:
        return pd.DataFrame(fetched_data)

    def consolidate(self, max_queue_items: int = 120_000, **args):
        kl.debug(f'consolidate {self.name}: start.')
        qs = self.queue_sizes()
        qs_results = qs[self.results_queue]
        state = self.fetcher_state(qs)
        if qs_results == 0:
            kl.info(f'consolidate {self.name}: nothing to consolidate. State: {state}')
            return
        # get all messages from the results queue
        # TODO improvement: an iterative algorithm that fetches fewer elements each time
        remaining = qs_results
        while remaining > 0:
            messages_to_fetch = min(remaining, 120_000, max_queue_items)
            kl.debug(f'reading {messages_to_fetch} messages from results queue...')
            items = []
            while len(items) < messages_to_fetch:
                new_items = self.fetch_items(self.results_queue,
                                             max_items=messages_to_fetch - len(items))
                kl.debug(f'read {len(new_items)} new messages.')
                items.extend(new_items)
                if len(new_items) == 0:
                    break
            if len(items) == 0:
                break
            remaining -= len(items)
            fetched_data = [i.content for i in items]
            fetched_df = self.data_to_df(fetched_data)
            self.save_consolidated(fetched_df, **args)
            del fetched_df
            handles = [i.handle for i in items]
            ksqs.remove_messages(queue_name=self.results_queue, receipt_handles=handles)
            del fetched_data
        kl.debug(f'consolidate {self.name}: finish.')
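# SqsFetcher leaves keys_to_fetch, save_results and save_consolidated abstract.
# A minimal concrete subclass sketch, assuming keys come from a static list and
# consolidated data is written to local parquet files (all names below are
# hypothetical):
class DemoSqsFetcher(SqsFetcher):
    def keys_to_fetch(self, max_fetch=None, force_fetch=None):
        keys = force_fetch if force_fetch else ['key-1', 'key-2', 'key-3']
        return keys[:max_fetch] if max_fetch else keys

    def save_results(self, df, strategy, ref_ts, current_file, n_files,
                     output_folder, local_only):
        df.to_parquet(f'{output_folder}/{strategy}_{current_file}_of_{n_files}.parquet')

    def save_consolidated(self, fetched_df, **args):
        fetched_df.to_parquet('consolidated.parquet')


# fetcher = DemoSqsFetcher(name='demo', results_queue='demo-results',
#                          worker_queues=['demo-work'])
# fetcher.kickoff(max_fetch=2)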
class KarnakSqsFetcher(KarnakFetcher):
    def __init__(self, name: str, tables: List[str], environment: str,
                 extractors: Optional[List[str]] = None,
                 max_priority: Optional[int] = None,
                 empty_work_queue_recheck_seconds: int = 300):
        super().__init__(name, tables, environment, extractors, max_priority)
        self.empty_queue_control = {}
        self.default_sqs_client = ksqs.get_client()
        self.empty_work_queue_recheck_seconds = empty_work_queue_recheck_seconds

    #
    # queues
    #

    @abstractmethod
    def results_queue_name(self) -> str:
        """Returns the name of the results queue."""
        pass

    @abstractmethod
    def worker_queue_name(self, extractor: str, priority: Optional[int]) -> str:
        """Returns the name of the worker queue."""
        pass

    def worker_queue_names(self, extractor=None) -> List[str]:
        priorities = self.priorities()
        _extractors = [extractor] if extractor is not None else self.extractors
        return [self.worker_queue_name(ext, p) for ext in _extractors for p in priorities]

    def fetcher_state(self, queue_sizes: Optional[Dict[str, int]] = None) \
            -> Tuple[str, Optional[int]]:
        if queue_sizes is None:
            queue_sizes = self.queue_sizes()
        qs_results = queue_sizes[self.results_queue_name()]
        qs_workers = sum([queue_sizes[qn] for qn in self.worker_queue_names()])
        working_priority = None
        if self.max_priority is not None and qs_workers > 0:
            for p in range(1, self.max_priority + 1):
                q_names = [self.worker_queue_name(ext, p) for ext in self.extractors]
                cnt = sum([queue_sizes[qn] for qn in q_names])
                if cnt > 0:
                    working_priority = p
                    break
        if qs_results + qs_workers == 0:
            return 'idle', working_priority
        elif qs_workers == 0:
            return 'consolidating', working_priority
        else:
            return 'working', working_priority

    def queue_sizes(self, sqs_client=None) -> Dict[str, int]:
        """Returns the approximate message count for all queues."""
        kl.trace('getting queue sizes')
        _sqs_client = sqs_client if sqs_client is not None else self.default_sqs_client
        qs = {}
        queue_names = self.worker_queue_names() + [self.results_queue_name()]
        for q in queue_names:
            attr = ksqs.queue_attributes(q, sqs_client=_sqs_client)
            available = int(attr['ApproximateNumberOfMessages'])
            in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
            delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
            qs[q] = available + in_flight + delayed
        return qs

    #
    # kickoff
    #

    def populate_worker_queue(self, items: List[FetcherQueueItem], extractor: str,
                              priority: Optional[int]):
        worker_queue_name = self.worker_queue_name(extractor=extractor, priority=priority)
        kl.trace(f'putting {len(items)} messages in queue {worker_queue_name}')
        contents = [i.to_string() for i in items]
        ksqs.send_messages(worker_queue_name, contents)

    #
    # worker
    #

    def create_thread_context(self) -> KarnakSqsFetcherThreadContext:
        ctx = KarnakSqsFetcherThreadContext()
        return ctx

    @synchronized
    def set_empty_queue(self, queue_name: str):
        self.empty_queue_control[queue_name] = datetime.datetime.now(tz=pytz.utc)

    @synchronized
    def is_empty_queue(self, queue_name: str) -> bool:
        eqc = self.empty_queue_control.get(queue_name)
        if eqc is None:
            return False
        now = datetime.datetime.now(tz=pytz.utc)
        if now - eqc >= datetime.timedelta(seconds=self.empty_work_queue_recheck_seconds):
            del self.empty_queue_control[queue_name]
            return False
        return True

    def pop_work_queue_item(self, extractor: str, priority: Optional[int],
                            context: KarnakSqsFetcherThreadContext,
                            wait: bool) -> Optional[FetcherQueueItem]:
        queue_name = self.worker_queue_name(extractor, priority=priority)
        sqs_client = context.sqs_client
        wait_seconds = 20 if wait else 0
        items = ksqs.receive_messages(queue_name=queue_name, max_messages=1,
                                      wait_seconds=wait_seconds, sqs_client=sqs_client)
        if items is None or len(items) == 0:
            self.set_empty_queue(queue_name)
            return None
        assert len(items) == 1
        handle = list(items.keys())[0]
        content_str = items[handle]
        return FetcherQueueItem.from_string(content_str, handle=handle)

    def pop_best_work_queue_item(self, extractor: str,
                                 context: KarnakSqsFetcherThreadContext) \
            -> Optional[FetcherQueueItem]:
        priorities = self.priorities()
        for retry in [0, 1]:  # two rounds of attempts
            for p in priorities:
                queue_name = self.worker_queue_name(extractor, priority=p)
                # only check the empty-queue cache in the first round,
                # and only wait for messages in the retry round.
                if retry or not self.is_empty_queue(queue_name):
                    wait = retry > 0
                    item = self.pop_work_queue_item(extractor, p, context, wait=wait)
                    if item is not None:
                        return item
        return None

    #
    # consolidator
    #

    def pop_result_items(self, max_items) -> List[FetcherResult]:
        items = ksqs.receive_messages(queue_name=self.results_queue_name(),
                                      max_messages=max_items, wait_seconds=20)
        if items is None:
            return []
        return [FetcherResult.from_string(items[handle], handle=handle) for handle in items]

    def consolidate(self, max_queue_items_per_file: int = 120_000,
                    max_rows_per_file: int = 2_000_000, **args):
        kl.info(f'consolidate {self.name}: start.')
        kl.debug(f'max_queue_items_per_file: {max_queue_items_per_file}, '
                 f'max_rows_per_file: {max_rows_per_file}')
        qs = self.queue_sizes()
        qs_results = qs[self.results_queue_name()]
        state = self.fetcher_state(qs)
        if qs_results == 0:
            kl.info(f'consolidate {self.name}: nothing to consolidate. State: {state}')
            return
        # get all messages from the results queue
        remaining = qs_results
        while remaining > 0:
            messages_to_fetch = min(remaining, 120_000, max_queue_items_per_file)
            kl.debug(f'reading {messages_to_fetch} messages from results queue...')
            results: List[FetcherResult] = []
            while len(results) < messages_to_fetch:
                next_to_fetch = messages_to_fetch - len(results)
                new_results = self.pop_result_items(max_items=next_to_fetch)
                kl.debug(f'read {len(new_results)} new messages.')
                results.extend(new_results)
                if len(new_results) == 0:
                    break
            if len(results) == 0:
                break
            remaining -= len(results)
            # fetched_df = self.data_to_df(results)
            fetched_df = self.results_df(results)
            self.prepare_consolidation(fetched_df, max_rows_per_file=max_rows_per_file, **args)
            del fetched_df
            gc.collect()
            handles = [i.handle for i in results]
            ksqs.remove_messages(queue_name=self.results_queue_name(),
                                 receipt_handles=handles)
            del results
            gc.collect()
        kl.debug(f'consolidate {self.name}: finish.')
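# KarnakSqsFetcher only needs concrete queue names from its subclasses. A
# minimal sketch, assuming one queue per (extractor, priority) pair, that the
# base class keeps `environment` as an attribute (not shown above), and that
# the naming scheme below is hypothetical:
class DemoKarnakSqsFetcher(KarnakSqsFetcher):
    def results_queue_name(self) -> str:
        return f'{self.name}-{self.environment}-results'

    def worker_queue_name(self, extractor: str, priority: Optional[int]) -> str:
        suffix = f'-p{priority}' if priority is not None else ''
        return f'{self.name}-{self.environment}-{extractor}{suffix}'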