Example #1
0
    def prepare_consolidation(self, fetched_df: pd.DataFrame, max_rows_per_file: int, **args):
        """Slice fetched results by table, time window and row count, then save each slice.

        Args:
            fetched_df: consolidated results; must contain a 'table' column.
            max_rows_per_file: upper bound of rows written to a single file.
            **args: forwarded to save_consolidation().
        """
        if fetched_df is None or len(fetched_df) == 0:
            kl.info('empty dataframe, nothing to save.')
            return  # BUG FIX: previously fell through and crashed on len(None) below

        kl.info(f'saving consolidated data for {len(fetched_df)} rows...')
        kprof = kp.KProfiler()
        kprof.log_mem('memory usage')

        # slice by table
        tables = set(fetched_df['table'].unique())
        for table in tables:
            table_slice = fetched_df.loc[fetched_df['table'] == table]
            kl.debug(f'preparing data for table {table} ({len(table_slice)} items)...')

            # slice by time
            for (time_slice_df, time_slice_id, time_slice_ref) in self.time_slicing(table_slice):

                # slice into files and prepare dataframe
                for (prepared_file_df, current_file, n_files) in self.rows_slicing(time_slice_df, max_rows_per_file):

                    self.save_consolidation(prepared_file_df, table, time_slice_id=time_slice_id,
                                            time_slice_ref=time_slice_ref,
                                            current_file=current_file, n_files=n_files, **args)
                    # free each file's dataframe eagerly to keep peak memory down
                    kprof.log_mem('memory usage before gc')
                    del prepared_file_df
                    gc.collect()
                    kprof.log_mem('memory usage after gc')
Example #2
0
    def purge(self, workers: List[str], results: bool = False):
        """Purge messages from selected worker queues and/or the results queue.

        Args:
            workers: strategies whose worker queues should be purged; None for none.
            results: also purge the results queue when True.
        """
        kl.debug(f'purge {self.name}: start.')

        if workers is None and not results:
            kl.info('purge: no action defined - nothing to do!')
            return

        qs = self.queue_sizes()
        state = self.fetcher_state(qs)
        if state == 'idle':
            # all queues already empty: nothing to purge
            kl.info(f'purge: fetcher state {state}, nothing to do')
            return

        items = []
        if workers is not None:
            items.extend(workers)
        if results:
            items.append('results')

        for w in items:
            if w == 'results':
                queue = self.results_queue
            else:
                queue = self.worker_queue(w)
            if queue is None:
                # BUG FIX: invalid strategy yields None from worker_queue();
                # skip it instead of raising KeyError on qs[None]
                continue
            n = qs[queue]
            if n > 0:
                kl.info(f"queue for '{w}' ({queue}) has {n} messages: purging")
                ksqs.purge_queue(queue)
            else:
                kl.info(
                    f"queue for '{w}' ({queue}) has {n} messages: nothing to do"
                )

        kl.debug(f'purge {self.name}: finish.')
Example #3
0
    def work(self):
        """Run one fetcher loop per configured thread and wait for all to finish."""
        self.check_worker_state()
        if self.state != 'working':
            kl.warn(f'Nothing to do: worker in state {self.state}')
            return

        # one fetcher loop per configured thread
        workers = [
            threading.Thread(target=self.fetcher_thread_loop, args=(idx,))
            for idx in range(self.n_threads)
        ]
        for thread in workers:
            thread.start()

        # block until every thread has drained its share of the queue
        for thread in workers:
            thread.join()

        kl.info(f'worker for {self.extractor} finished: fetcher state {self.state}')
Example #4
0
 def rebalance(self, from_strategy: str, to_strategy: str, items_cnt: int):
     """Move up to items_cnt items (capped at 100k) from one strategy queue to another."""
     batch_size = min(items_cnt, 100_000)
     kl.debug(
         f'rebalancing {items_cnt} items from {from_strategy} to {to_strategy}'
     )
     source_queue = self.worker_queue(from_strategy)
     target_queue = self.worker_queue(to_strategy)
     if source_queue is None or target_queue is None:
         kl.error('rebalance not possible: invalid strategy')
         return
     # pull items, re-tag them with the target strategy, push, then delete originals
     fetched = self.fetch_items(source_queue, max_items=batch_size)
     retagged = [
         item.reset_strategy(to_strategy, reset_errors=True) for item in fetched
     ]
     self.populate_worker_queue(retagged, to_strategy)
     ksqs.remove_messages(queue_name=source_queue,
                          receipt_handles=[item.handle for item in fetched])
     kl.info('rebalance: finished')
Example #5
0
    def kickoff(self,
                max_fetch: Optional[int] = None,
                force_fetch: Optional[List[str]] = None,
                strategies: Optional[List[str]] = None,
                force=False) -> bool:
        """Populate the worker queues with keys to fetch.

        Args:
            max_fetch: optional cap on the number of keys.
            force_fetch: keys to fetch regardless of normal selection.
            strategies: strategies to kick off; defaults to all configured.
            force: kick off even if the fetcher is not idle.

        Returns:
            True if work was enqueued, False otherwise.
        """
        if strategies is None:
            strategies = self.strategies

        # refuse to start while a previous run is still in progress
        state = self.fetcher_state()
        if state != 'idle' and not force:
            kl.info(f'cannot kickoff {self.name}: current state is {state}.')
            return False

        # keys and initial strategies
        df = pd.DataFrame(self.keys_to_fetch(max_fetch=max_fetch,
                                             force_fetch=force_fetch),
                          columns=['key'])  # list, not set: deterministic column spec
        if len(df) == 0:
            kl.info(f'cannot kickoff {self.name}: 0 ids to fetch.')
            return False
        self.set_initial_strategy(df, strategies=strategies)

        ref_ts = datetime.datetime.now()
        ref_ts_str = ref_ts.strftime('%Y%m%d_%H%M%S')
        kl.debug(
            f'populating {self.name} with {len(df)} elements, ref {ref_ts_str}.'
        )

        def row_to_item(row):
            return FetcherItem(key=row['key'],
                               start_ts=ref_ts,
                               strategy=row['strategy'])

        df['item'] = df.apply(row_to_item, axis=1)

        for strategy in strategies:
            df_strategy = df[df['strategy'] == strategy]
            kl.debug(f'putting {len(df_strategy)} in {strategy} queue...')
            self.populate_worker_queue(df_strategy['item'].tolist(), strategy)

        kl.debug(f'kickoff completed for {self.name}, ref {ref_ts_str}.')
        # BUG FIX: declared -> bool but previously returned None on success
        return True
Example #6
0
    def kickoff(self, table: str,
                max_keys: Optional[int] = None,
                add_keys: Optional[List[str]] = None,
                method: Optional[str] = None,
                scope: Optional[str] = None,
                priority: Optional[int] = None,
                if_empty: bool = False,
                wait_empty: bool = False,
                empty_priority: Optional[int] = None,
                extractors: List[str] = None,
                **args) -> bool:
        """Populate extractor worker queues with items to fetch for a table.

        Args:
            table: table whose keys will be fetched.
            max_keys: optional cap on the number of keys.
            add_keys: extra keys to include.
            method: key-selection method, forwarded to keys_to_fetch().
            scope: key-selection scope, forwarded to keys_to_fetch().
            priority: priority assigned to the new items.
            if_empty: only kick off if the fetcher is ready.
            wait_empty: block (polling every 60 s) until the fetcher is ready.
            empty_priority: priority used for the readiness check.
            extractors: extractors to populate; defaults to all configured.
            **args: forwarded to keys_to_fetch().

        Returns:
            True if work was enqueued, False otherwise.
        """
        _extractors = extractors if extractors is not None else self.extractors

        # test if it's ready to kickoff
        if if_empty:
            kickoff_ready, state = self.kickoff_ready(empty_priority)
            if not kickoff_ready:
                kl.info(f'cannot kickoff {self.name} table {table}: current state is {state}.')
                return False
        elif wait_empty:
            wait_time_seconds = 60
            while True:
                kickoff_ready, state = self.kickoff_ready(empty_priority)
                if kickoff_ready:
                    break
                kl.info(f'waiting {wait_time_seconds}s for kickoff {self.name} table {table}:'
                        f' current state is {state}.')
                time.sleep(wait_time_seconds)

        # keys and initial strategies
        items = self.keys_to_fetch(table=table, max_keys=max_keys, add_keys=add_keys,
                                   method=method, scope=scope, **args)

        if items is None or len(items) == 0:
            kl.info(f'cannot kickoff {self.name} table {table}: nothing to fetch.')
            return False

        # set priority, cohort, creation time
        if priority is not None:
            items = [x.set_priority(priority) for x in items]

        # set initial extractor
        # NOTE(review): this checks self.extractors (all configured) rather than
        # _extractors (the subset being kicked off) — presumably intentional; confirm.
        if len(self.extractors) > 0:
            items = self.set_initial_extractor(items)

        for extractor in _extractors:
            extractor_items = [x for x in items if x.extractor == extractor]
            kl.debug(f'populating extractor {extractor} with {len(extractor_items)} items.')
            self.populate_worker_queue(extractor_items, extractor=extractor, priority=priority)

        kl.debug(f'kickoff completed for {self.name} table {table}.')
        # BUG FIX: declared -> bool but previously returned None on success
        return True
Example #7
0
class SqsFetcher:
    """Fetcher built on SQS: one worker queue per strategy plus a results
    queue that is drained by the consolidator."""

    def __init__(self,
                 name: str,
                 results_queue: str,
                 worker_queues: List[str],
                 strategies: List[str] = None,
                 staging: bool = False):
        """
        Args:
            name: fetcher name, used in log messages.
            results_queue: base name of the results queue.
            worker_queues: base names of the worker queues, one per strategy.
            strategies: strategy names; defaults to ['default'].
            staging: when True, prefix all queue names with 'staging-'.
        """
        assert len(worker_queues) > 0
        self.name = name
        self.strategies = strategies if strategies is not None else ['default']
        # BUG FIX: measure self.strategies — the raw argument may be None
        self.is_multi_strategy = len(self.strategies) > 1
        self.staging = staging

        queue_prefix = 'staging-' if staging else ''
        self.worker_queues = [queue_prefix + q for q in worker_queues]
        self.results_queue = queue_prefix + results_queue

    #
    # general
    #

    def queue_sizes(self,
                    wait_seconds: Optional[int] = None,
                    sqs_client=None) -> Dict[str, int]:
        """Returns approximate message count for all queues. Retries if any is zeroed."""
        # FIXME wait_seconds should be 60
        if wait_seconds is None:
            wait_seconds = 2

        i = 0
        qs_results = 0
        qs_workers = 0
        retries = 2
        kl.trace(
            f'getting queue sizes (wait up to {wait_seconds * (retries + 1)} s)'
        )
        qs = {}
        queues = [self.results_queue] + self.worker_queues
        while i < retries and (qs_results == 0 or qs_workers == 0):
            for q in queues:
                attr = ksqs.queue_attributes(q, sqs_client=sqs_client)
                # size = visible + in-flight + delayed messages
                available = int(attr['ApproximateNumberOfMessages'])
                in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
                delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
                qs[q] = available + in_flight + delayed
            qs_results = qs[self.results_queue]
            qs_workers = sum(qs[queue_name] for queue_name in self.worker_queues)
            i += 1
            # SQS counts are approximate: sleep and retry if any side reads 0
            if i < retries and (qs_results == 0 or qs_workers == 0):
                time.sleep(wait_seconds)
        return qs

    def fetcher_state(self,
                      qs: Optional[Dict[str, int]] = None,
                      sqs_client=None,
                      wait_seconds: Optional[int] = None) -> str:
        """Returns current fetcher state: 'working', 'consolidation' or 'idle'."""
        if qs is None:
            qs = self.queue_sizes(sqs_client=sqs_client,
                                  wait_seconds=wait_seconds)
        qs_results = qs[self.results_queue]
        qs_workers = sum(qs[queue_name] for queue_name in self.worker_queues)

        if qs_results + qs_workers == 0:
            return 'idle'
        elif qs_workers == 0:
            # results pending but no worker backlog
            return 'consolidation'
        else:
            return 'working'

    def worker_queue(self, strategy: str = None) -> Optional[str]:
        """Returns the queue name for a strategy (first queue when None);
        logs an error and returns None for an unknown strategy."""
        if strategy is None:
            return self.worker_queues[0]
        elif strategy in self.strategies:
            return self.worker_queues[self.strategies.index(strategy)]
        else:
            kl.error(f'invalid worker strategy: {strategy}')
            return None

    #
    # queue access
    #

    @classmethod
    def fetch_items(cls,
                    queue_name: str,
                    max_items=1,
                    sqs_client=None) -> List[FetcherItem]:
        """Receives up to max_items messages and parses them into FetcherItems."""
        items = ksqs.receive_messages(queue_name=queue_name,
                                      max_messages=max_items,
                                      sqs_client=sqs_client)
        if items is None:
            # BUG FIX: the old comprehension iterated None before its guard ran
            return []
        return [FetcherItem.from_string(body, handle=handle)
                for handle, body in items.items()]

    #
    # kickoff
    #

    @abstractmethod
    def keys_to_fetch(self,
                      max_fetch: Optional[int] = None,
                      force_fetch: Optional[List[str]] = None) -> List[str]:
        """Returns the keys to enqueue for fetching (subclass responsibility)."""
        return []

    def set_initial_strategy(self,
                             df: pd.DataFrame,
                             strategies: Optional[List[str]] = None):
        """Assigns the first strategy (given or configured) to every row of df, in place."""
        valid_strategies = strategies if strategies is not None else self.strategies
        df['strategy'] = valid_strategies[0]

    @classmethod
    def build_items_list(cls, keys_strategies: Dict[str, Optional[str]],
                         ref_ts: datetime.datetime) -> List[FetcherItem]:
        """Builds one FetcherItem per key; the mapping's strategy values are not used here."""
        return [FetcherItem(key=k, start_ts=ref_ts) for k in keys_strategies]

    def populate_worker_queue(self, items: List[FetcherItem], strategy: str):
        """Serializes items and sends them to the strategy's worker queue."""
        contents = [i.to_string() for i in items]
        ksqs.send_messages(self.worker_queue(strategy=strategy), contents)

    def kickoff(self,
                max_fetch: Optional[int] = None,
                force_fetch: Optional[List[str]] = None,
                strategies: Optional[List[str]] = None,
                force=False) -> bool:
        """Populate the worker queues with keys to fetch.

        Args:
            max_fetch: optional cap on the number of keys.
            force_fetch: keys to fetch regardless of normal selection.
            strategies: strategies to kick off; defaults to all configured.
            force: kick off even if the fetcher is not idle.

        Returns:
            True if work was enqueued, False otherwise.
        """
        if strategies is None:
            strategies = self.strategies

        # refuse to start while a previous run is still in progress
        state = self.fetcher_state()
        if state != 'idle' and not force:
            kl.info(f'cannot kickoff {self.name}: current state is {state}.')
            return False

        # keys and initial strategies
        df = pd.DataFrame(self.keys_to_fetch(max_fetch=max_fetch,
                                             force_fetch=force_fetch),
                          columns=['key'])  # list, not set: deterministic column spec
        if len(df) == 0:
            kl.info(f'cannot kickoff {self.name}: 0 ids to fetch.')
            return False
        self.set_initial_strategy(df, strategies=strategies)

        ref_ts = datetime.datetime.now()
        ref_ts_str = ref_ts.strftime('%Y%m%d_%H%M%S')
        kl.debug(
            f'populating {self.name} with {len(df)} elements, ref {ref_ts_str}.'
        )

        def row_to_item(row):
            return FetcherItem(key=row['key'],
                               start_ts=ref_ts,
                               strategy=row['strategy'])

        df['item'] = df.apply(row_to_item, axis=1)

        for strategy in strategies:
            df_strategy = df[df['strategy'] == strategy]
            kl.debug(f'putting {len(df_strategy)} in {strategy} queue...')
            self.populate_worker_queue(df_strategy['item'].tolist(), strategy)

        kl.debug(f'kickoff completed for {self.name}, ref {ref_ts_str}.')
        # BUG FIX: declared -> bool but previously returned None on success
        return True

    #
    # consolidator
    #

    @abstractmethod
    def save_results(self, df: pd.DataFrame, strategy: str,
                     ref_ts: datetime.datetime, current_file: int,
                     n_files: int, output_folder: str, local_only: bool):
        """Persists one file of results (subclass responsibility)."""
        pass

    @abstractmethod
    def save_consolidated(self, fetched_df: pd.DataFrame, **args):
        """Persists one consolidated batch (subclass responsibility)."""
        pass

    def data_to_df(self, fetched_data: list) -> pd.DataFrame:
        """Converts raw fetched message contents into a DataFrame."""
        return pd.DataFrame(fetched_data)

    def consolidate(self, max_queue_items: int = 120_000, **args):
        """Drains the results queue in batches, saving each batch via
        save_consolidated() before deleting its messages.

        Args:
            max_queue_items: maximum messages consolidated per batch.
            **args: forwarded to save_consolidated().
        """
        kl.debug(f'consolidate {self.name}: start.')

        qs = self.queue_sizes()
        qs_results = qs[self.results_queue]
        state = self.fetcher_state(qs)
        if qs_results == 0:
            kl.info(
                f'consolidate {self.name}: nothing to consolidate. State: {state}'
            )
            return

        # get all messages from result queue
        # TODO: improvement: interactive algorithm that gets less elements each time
        remaining = qs_results
        while remaining > 0:
            messages_to_fetch = min(remaining, 120_000, max_queue_items)
            kl.debug(
                f'reading {messages_to_fetch} messages from results queue...')
            items = []
            # queue counts are approximate: keep polling until the batch is
            # full or the queue stops returning messages
            while len(items) < messages_to_fetch:
                new_items = self.fetch_items(self.results_queue,
                                             max_items=messages_to_fetch -
                                             len(items))
                kl.debug(f'read {len(new_items)} new messages.')
                items.extend(new_items)
                if len(new_items) == 0:
                    break
            if len(items) == 0:
                break
            remaining -= len(items)
            fetched_data = [i.content for i in items]
            fetched_df = self.data_to_df(fetched_data)

            self.save_consolidated(fetched_df, **args)
            del fetched_df

            # only delete the messages after a successful save
            handles = [i.handle for i in items]
            ksqs.remove_messages(queue_name=self.results_queue,
                                 receipt_handles=handles)
            del fetched_data

        kl.debug(f'consolidate {self.name}: finish.')
Example #8
0
class KarnakSqsFetcher(KarnakFetcher):
    """KarnakFetcher backed by SQS, with one worker queue per
    (extractor, priority) pair plus a results queue."""

    def __init__(self, name: str,
                 tables: List[str],
                 environment: str,
                 extractors: Optional[List[str]] = None,
                 max_priority: Optional[int] = None,
                 empty_work_queue_recheck_seconds: int = 300):
        """
        Args:
            name: fetcher name.
            tables: tables handled by this fetcher.
            environment: deployment environment name.
            extractors: extractor names, forwarded to KarnakFetcher.
            max_priority: highest priority level, if priorities are used.
            empty_work_queue_recheck_seconds: how long a queue found empty is
                skipped before being polled again.
        """
        super().__init__(name, tables, environment, extractors, max_priority)
        # queue name -> UTC timestamp when it was last found empty
        self.empty_queue_control = {}
        self.default_sqs_client = ksqs.get_client()
        self.empty_work_queue_recheck_seconds = empty_work_queue_recheck_seconds

    #
    # queues
    #

    @abstractmethod
    def results_queue_name(self) -> str:
        """Returns the name of the results queue."""
        pass

    @abstractmethod
    def worker_queue_name(self, extractor: str, priority: Optional[int]) -> str:
        """Returns the name of the worker queue."""
        pass

    def worker_queue_names(self, extractor=None) -> List[str]:
        """Returns all worker queue names, optionally restricted to one extractor."""
        priorities = self.priorities()
        _extractors = [extractor] if extractor is not None else self.extractors
        return [self.worker_queue_name(ext, p) for ext in _extractors for p in priorities]

    def fetcher_state(self, queue_sizes: Optional[Dict[str, int]] = None) -> (str, int):
        """Returns (state, working_priority): state is 'idle', 'consolidating'
        or 'working'; working_priority is the highest-priority level with
        pending work (None when priorities are unused or no work is queued)."""
        if queue_sizes is None:
            queue_sizes = self.queue_sizes()
        qs_results = queue_sizes[self.results_queue_name()]
        qs_workers = sum(queue_sizes[qn] for qn in self.worker_queue_names())
        working_priority = None
        if self.max_priority is not None and qs_workers > 0:
            # find the first (highest) priority level with pending work
            for p in range(1, self.max_priority + 1):
                q_names = [self.worker_queue_name(ext, p) for ext in self.extractors]
                cnt = sum(queue_sizes[qn] for qn in q_names)
                if cnt > 0:
                    working_priority = p
                    break

        if qs_results + qs_workers == 0:
            return 'idle', working_priority
        elif qs_workers == 0:
            return 'consolidating', working_priority
        else:
            return 'working', working_priority

    def queue_sizes(self, sqs_client=None) -> Dict[str, int]:
        """Returns approximate message count for all queues."""
        kl.trace('getting queue sizes')
        _sqs_client = sqs_client if sqs_client is not None else self.default_sqs_client
        qs = {}
        queue_names = self.worker_queue_names() + [self.results_queue_name()]
        for q in queue_names:
            attr = ksqs.queue_attributes(q, sqs_client=_sqs_client)
            # size = visible + in-flight + delayed messages
            available = int(attr['ApproximateNumberOfMessages'])
            in_flight = int(attr['ApproximateNumberOfMessagesNotVisible'])
            delayed = int(attr['ApproximateNumberOfMessagesDelayed'])
            qs[q] = available + in_flight + delayed
        return qs

    #
    # kickoff
    #

    def populate_worker_queue(self, items: List[FetcherQueueItem], extractor: str, priority: Optional[int]):
        """Serializes items and sends them to the matching worker queue."""
        worker_queue_name = self.worker_queue_name(extractor=extractor, priority=priority)
        kl.trace(f'putting {len(items)} messages in queue {worker_queue_name}')
        contents = [i.to_string() for i in items]
        ksqs.send_messages(worker_queue_name, contents)

    #
    # worker
    #

    def create_thread_context(self) -> KarnakSqsFetcherThreadContext:
        """Creates a per-thread context (holds its own SQS client)."""
        return KarnakSqsFetcherThreadContext()

    @synchronized
    def set_empty_queue(self, queue_name: str):
        """Marks a queue as empty so workers skip it for the recheck interval."""
        self.empty_queue_control[queue_name] = datetime.datetime.now(tz=pytz.utc)

    @synchronized
    def is_empty_queue(self, queue_name: str) -> bool:
        """True while the queue was recently found empty; the mark expires
        after empty_work_queue_recheck_seconds."""
        eqc = self.empty_queue_control.get(queue_name)
        if eqc is None:
            return False
        now = datetime.datetime.now(tz=pytz.utc)
        if now - eqc >= datetime.timedelta(seconds=self.empty_work_queue_recheck_seconds):
            # mark expired: poll the queue again
            del self.empty_queue_control[queue_name]
            return False
        return True

    def pop_work_queue_item(self, extractor: str, priority: Optional[int],
                            context: KarnakSqsFetcherThreadContext, wait: bool) \
            -> Optional[FetcherQueueItem]:
        """Pops a single item from one worker queue; marks the queue empty and
        returns None when nothing was received."""
        queue_name = self.worker_queue_name(extractor, priority=priority)
        sqs_client = context.sqs_client
        wait_seconds = 20 if wait else 0  # long-poll only when asked to wait
        items = ksqs.receive_messages(queue_name=queue_name, max_messages=1, wait_seconds=wait_seconds,
                                      sqs_client=sqs_client)
        if items is None or len(items) == 0:
            self.set_empty_queue(queue_name)
            return None
        assert len(items) == 1
        handle = next(iter(items))
        return FetcherQueueItem.from_string(items[handle], handle=handle)

    def pop_best_work_queue_item(self, extractor: str,
                                 context: KarnakSqsFetcherThreadContext) -> Optional[FetcherQueueItem]:
        """Pops the highest-priority available item for an extractor.

        Round one skips queues recently seen empty and does not wait;
        round two polls every queue with long polling."""
        priorities = self.priorities()
        for retry in [0, 1]:  # two rounds of attempts
            for p in priorities:
                queue_name = self.worker_queue_name(extractor, priority=p)
                if retry or not self.is_empty_queue(queue_name):  # only checks empty in first round.
                    wait = retry > 0  # only wait in retry round.
                    item = self.pop_work_queue_item(extractor, p, context, wait=wait)
                    if item is not None:
                        return item
        return None  # nothing available on any queue

    #
    # consolidator
    #

    def pop_result_items(self, max_items) -> List[FetcherResult]:
        """Receives up to max_items results from the results queue."""
        items = ksqs.receive_messages(queue_name=self.results_queue_name(),
                                      max_messages=max_items, wait_seconds=20)
        if items is None:
            # BUG FIX: the old comprehension iterated None before its guard ran
            return []
        return [FetcherResult.from_string(body, handle=handle)
                for handle, body in items.items()]

    def consolidate(self, max_queue_items_per_file: int = 120_000, max_rows_per_file: int = 2_000_000, **args):
        """Drains the results queue in batches and consolidates each batch.

        Args:
            max_queue_items_per_file: maximum messages consolidated per batch.
            max_rows_per_file: maximum rows written to a single output file.
            **args: forwarded to prepare_consolidation().
        """
        kl.info(f'consolidate {self.name}: start.')
        kl.debug(f'max_queue_items_per_file: {max_queue_items_per_file}, max_rows_per_file: {max_rows_per_file} ')

        qs = self.queue_sizes()
        qs_results = qs[self.results_queue_name()]
        state = self.fetcher_state(qs)
        if qs_results == 0:
            kl.info(f'consolidate {self.name}: nothing to consolidate. State: {state}')
            return

        # get all messages from result queue
        remaining = qs_results
        while remaining > 0:
            messages_to_fetch = min(remaining, 120_000, max_queue_items_per_file)
            kl.debug(f'reading {messages_to_fetch} messages from results queue...')
            results: List[FetcherResult] = []
            # queue counts are approximate: keep polling until the batch is
            # full or the queue stops returning messages
            while len(results) < messages_to_fetch:
                next_to_fetch = messages_to_fetch - len(results)
                new_results = self.pop_result_items(max_items=next_to_fetch)
                kl.debug(f'read {len(new_results)} new messages.')
                results.extend(new_results)
                if len(new_results) == 0:
                    break
            if len(results) == 0:
                break
            remaining -= len(results)
            fetched_df = self.results_df(results)

            self.prepare_consolidation(fetched_df, max_rows_per_file=max_rows_per_file, **args)
            # free the batch eagerly to keep peak memory down
            del fetched_df
            gc.collect()

            # only delete the messages after a successful consolidation
            handles = [i.handle for i in results]
            ksqs.remove_messages(queue_name=self.results_queue_name(), receipt_handles=handles)

            del results
            gc.collect()

        kl.debug(f'consolidate {self.name}: finish.')