Example #1
    def test_populate_from_scope_record(self):

        scope_id = gen_string_id()
        sweep_id = gen_string_id()

        console_token = 'console token'
        platform_token = 'platform token'

        scope_record = AssetScope()
        scope_record.scope = scope_id
        scope_record.scope_api_token = console_token
        scope_record.set_cache(platform_tokens={platform_token})

        PlatformTokenManager.populate_from_scope_entity(scope_record, sweep_id)

        # now let's make sure we see those tokens:

        # Scope-centered jobs must result in scope-centered key for token storage
        job_scope = JobScope(sweep_id=sweep_id,
                             entity_type=Entity.Scope,
                             entity_id=scope_id)
        assert console_token == PlatformTokenManager.from_job_scope(
            job_scope).get_best_token()

        job_scope = JobScope(
            sweep_id=sweep_id,
            # uses .namespace default value as 2nd value in redis key. no need to set here.
        )
        assert platform_token == PlatformTokenManager.from_job_scope(
            job_scope).get_best_token()
Example #2
def _report_failure(job_scope: JobScope, start_time: float, exc: Exception,
                    **kwargs: Any):
    """Report task stats when task fails."""
    end_time = time.time()
    job_scope.running_time = math.ceil(end_time - start_time)
    job_scope.datapoint_count = kwargs.get('partial_datapoint_count')

    ErrorInspector.inspect(exc, job_scope.ad_account_id,
                           {'job_scope': job_scope})

    if isinstance(exc, FacebookRequestError):
        failure_status, failure_bucket = FacebookApiErrorInspector(
            exc).get_status_and_bucket()
    else:
        failure_status, failure_bucket = ExternalPlatformJobStatus.GenericError, FailureBucket.Other

    # No entity type means we don't know what table to target
    if failure_bucket == FailureBucket.InaccessibleObject and job_scope.entity_type is not None:
        set_inaccessible_entity_task.delay(job_scope)

    report_job_status_task.delay(failure_status, job_scope)
    PlatformTokenManager.from_job_scope(
        job_scope).report_usage_per_failure_bucket(job_scope.token,
                                                   failure_bucket)
    SweepStatusTracker(job_scope.sweep_id).report_status(failure_bucket)
    _send_measurement_task_runtime(job_scope, failure_bucket)
Example #3
def collect_page(job_scope: JobScope, _job_context: JobContext):
    """
    Collect a single Facebook page
    """
    if job_scope.report_variant != Entity.Page:
        raise ValueError(
            f"Report level {job_scope.report_variant} specified is not: {Entity.Page}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    # We don't use it for getting a token. Something else that calls us does.
    # However, we use it to report usages of the token we got.
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        page_inst = page.Page(fbid=job_scope.entity_id, api=fb_ctx.api)
        page_fetched = page_inst.api_get(fields=get_default_fields(Page))
        report_job_status_task.delay(ExternalPlatformJobStatus.DataFetched,
                                     job_scope)
        token_manager.report_usage(token, 2)

        record_id_data = job_scope.to_dict()
        record_id_data.update(entity_type=Entity.Page,
                              entity_id=job_scope.entity_id,
                              report_variant=None)
        entity_data = page_fetched.export_all_data()
        entity_data = add_vendor_data(
            entity_data, id=generate_universal_id(**record_id_data))
        store = NormalStore(job_scope)
        store.store(entity_data)
Example #4
def collect_organic_insights_task(job_scope: JobScope, _: JobContext):
    logger.info(f'{job_scope} started')

    if not job_scope.tokens:
        good_token = PlatformTokenManager.from_job_scope(
            job_scope).get_best_token()
        if good_token is not None:
            job_scope.tokens = [good_token]

    data_iter = InsightsOrganic.iter_collect_insights(job_scope)

    cnt = 0
    try:
        for _ in data_iter:
            cnt += 1
            if cnt % 100 == 0:
                logger.info(f'{job_scope} processed {cnt} data points so far')
    except Exception as e:
        # re-raising causes loss of the original stack trace, so record it via ErrorInspector first
        ErrorInspector.inspect(e, job_scope.ad_account_id,
                               {'job_scope': job_scope})
        raise CollectionError(e, cnt)

    logger.info(f'{job_scope} completed a total of {cnt} data points')
    return cnt
Example #5
def collect_pages_from_business(job_scope: JobScope,
                                _job_context: JobContext) -> int:
    """
    Collect all Facebook pages that are active
    """
    if job_scope.report_variant != Entity.Page:
        raise ValueError(
            f"Report level {job_scope.report_variant} specified is not: {Entity.Page}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    # We don't use it for getting a token. Something else that calls us does.
    # However, we use it to report usages of the token we got.
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        fb_req = FacebookRequest(node_id="me",
                                 method="GET",
                                 endpoint="/businesses",
                                 api=fb_ctx.api,
                                 api_type='EDGE',
                                 target_class=Business)
        businesses = fb_req.execute()

    report_job_status_task.delay(ExternalPlatformJobStatus.DataFetched,
                                 job_scope)
    token_manager.report_usage(token)

    entity_type = Entity.Page

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    cnt = 0
    for biz in businesses:
        client_pages = list(
            biz.get_client_pages(fields=get_default_fields(Page)))
        owned_pages = list(
            biz.get_owned_pages(fields=get_default_fields(Page)))
        pages_list = client_pages + owned_pages

        for page_inst in pages_list:

            entity_data = page_inst.export_all_data()
            record_id_base_data.update(entity_id=entity_data.get('id'))
            entity_data = add_vendor_data(
                entity_data, id=generate_universal_id(**record_id_base_data))

            store = NormalStore(job_scope)
            store.store(entity_data)
            cnt += 1

    report_job_status_task.delay(ExternalPlatformJobStatus.Done, job_scope)
    return cnt
Example #6
def collect_adaccount(job_scope: JobScope) -> Dict[str, Any]:
    """
    Collects ad account data for an AA-specific JobScope definition.
    :param JobScope job_scope: The JobScope as we get it from the task itself
    """
    if job_scope.report_variant != Entity.AdAccount:
        raise ValueError(
            f"Report level {job_scope.report_variant} specified is not: {Entity.AdAccount}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    assert (
        job_scope.ad_account_id == job_scope.entity_id
    ), 'This is an ad account entity job, ad_account_id should be equal to entity_id'

    # Used to report token usage by this job
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        ad_account = fb_ctx.to_fb_model(job_scope.ad_account_id,
                                        Entity.AdAccount)

        fields = get_default_fields(ad_account.__class__)

        ad_account_with_selected_fields = ad_account.api_get(
            fields=fields)  # Read just the fields we need
        ad_account_data_dict = ad_account_with_selected_fields.export_all_data(
        )  # Export the object to a dict

        token_manager.report_usage(token)

        job_scope_base = {
            # Duplicate the job_scope data to avoid mutating it
            **job_scope.to_dict(),
            'entity_type': Entity.AdAccount,
            'report_variant': None,
        }

        augmented_ad_account_data = add_vendor_data(
            # Augment the data returned from the remote API with our vendor data
            ad_account_data_dict,
            id=generate_universal_id(**job_scope_base),
        )
        feedback_entity_task.delay(ad_account_data_dict,
                                   job_scope.report_variant)
        store = NormalStore(job_scope)
        store.store(augmented_ad_account_data)

        # TODO: feedback account? this probably wouldn't make sense at the moment
        # because ad accounts are discovered from console and their lifecycle is controlled from there.

        return ad_account_data_dict
Example #7
def collect_adaccount_task(job_scope: JobScope, _: JobContext):
    logger.info(f'{job_scope} started')

    if not job_scope.tokens:
        good_token = PlatformTokenManager.from_job_scope(
            job_scope).get_best_token()
        if good_token is not None:
            job_scope.tokens = [good_token]

    collect_adaccount(job_scope)
Example #8
def collect_page_task(job_scope: JobScope, job_context: JobContext) -> int:
    logger.info(f'{job_scope} started')

    if not job_scope.tokens:
        good_token = PlatformTokenManager.from_job_scope(
            job_scope).get_best_token()
        if good_token is not None:
            job_scope.tokens = [good_token]

    collect_page(job_scope, job_context)
    return 1  # we collect 1 page at a time
Example #9
def iter_collect_entities_per_page(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page
    """
    token, entity_type, root_fb_entity = _extract_token_entity_type_parent_entity(
        job_scope, [Entity.PagePost, Entity.PageVideo], Entity.Page,
        'ad_account_id')

    entities = iter_native_entities_per_page(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    token_manager = PlatformTokenManager.from_job_scope(job_scope)
    with ChunkDumpStore(
            job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store, ChunkDumpStore(
                job_scope,
                chunk_size=DEFAULT_CHUNK_SIZE,
                bucket_type=ColdStoreBucketType.RAW_BUCKET,
                custom_namespace=NAMESPACE_RAW,
            ) as raw_store:
        cnt = 0
        for entity in entities:
            entity_data = entity.export_all_data()

            entity_data = add_vendor_data(entity_data,
                                          id=generate_universal_id(
                                              entity_id=entity_data.get('id'),
                                              **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id

            if entity_type == Entity.PagePost:
                # store raw version of response (just to remain consistent)
                raw_store(entity_data)
                entity_data = _augment_page_post(entity_data)

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            # Signal to the system the new entity
            feedback_entity_task.delay(entity_data, entity_type)

            yield entity_data
            cnt += 1

            if cnt % 1000 == 0:
                # default paging size for entities per parent
                # is typically around 200, so every 1000 results
                # means about 5 hits to FB
                token_manager.report_usage(token, 5)

    token_manager.report_usage(token)
Example #10
    def test_from_job_scope(self):

        key_gen = '{asset_scope}-{sweep_id}-sorted-token-queue'.format

        sweep_id = gen_string_id()
        entity_id = gen_string_id()
        scope_id = gen_string_id()

        # Scope-centered jobs must result in scope-centered key for token storage
        job_scope = JobScope(sweep_id=sweep_id,
                             entity_type=Entity.Scope,
                             entity_id=scope_id)
        token_manager = PlatformTokenManager.from_job_scope(job_scope)
        assert token_manager.queue_key == key_gen(asset_scope=scope_id,
                                                  sweep_id=sweep_id)

        # non-Scope-centered jobs must result in 'fb'-centered key for token storage
        job_scope = JobScope(sweep_id=sweep_id)
        token_manager = PlatformTokenManager.from_job_scope(job_scope)
        assert token_manager.queue_key == key_gen(
            asset_scope=JobScope.namespace, sweep_id=sweep_id)
Example #11
    def test_adding_token(self):
        token_manager = PlatformTokenManager(self.asset_scope, self.sweep_id)

        token = '123'
        token_manager.add(token)
        assert token_manager.get_token_count() == 1
        assert token_manager.get_best_token() == token
Example #12
    def test_token_priority(self):
        token_manager = PlatformTokenManager(self.asset_scope, self.sweep_id)

        first_token = 'first'
        second_token = 'second'

        token_manager.add(first_token, second_token)

        assert token_manager.get_token_count() == 2

        # Used the first token
        token_manager.report_usage(first_token)

        assert token_manager.get_best_token() == second_token
Example #13
    def iter_collect_insights(cls, job_scope: JobScope):
        """
        Central, *GENERIC* implementation of insights fetcher task

        The goal of this method is to be the entry point for
        metrics fetching Celery tasks. This method is expected to parse
        the JobScope object, figure out what needs to be done
        based on data in the JobScope object and convert that data into
        proper parameters for calling FB

        :param job_scope: The JobScope as we get it from the task itself
        """
        if not job_scope.tokens:
            raise ValueError(
                f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
            )

        token = job_scope.token
        token_manager = PlatformTokenManager.from_job_scope(job_scope)
        report_entity_kind = InsightsOrganic._detect_report_api_kind(job_scope)
        page_token_manager = PageTokenManager.from_job_scope(job_scope)

        if report_entity_kind == ReportEntityApiKind.Video:
            with PlatformApiContext(job_scope.token) as fb_ctx:
                report_root_fb_entity = fb_ctx.to_fb_model(
                    job_scope.entity_id, job_scope.report_variant)

            data_iter = cls.iter_video_insights(report_root_fb_entity)

        elif report_entity_kind in {
                ReportEntityApiKind.Page, ReportEntityApiKind.Post
        }:
            with PlatformApiContext(
                    page_token_manager.get_best_token(
                        job_scope.ad_account_id)) as fb_ctx:
                report_root_fb_entity = fb_ctx.to_fb_model(
                    job_scope.entity_id, job_scope.report_variant)

            data_iter = cls.iter_page_entities_lifetime_insights(
                report_root_fb_entity, report_entity_kind)
        else:
            raise ValueError(
                f'Unsupported report entity kind "{report_entity_kind}" to collect organic insights'
            )

        for datum in cls._iter_collect_organic_insights(data_iter, job_scope):
            yield datum
        # right now, we support fetching insights for only one entity at a time,
        # so a single usage report at the end is enough
        token_manager.report_usage(token)
Example #14
def collect_pages_from_business_task(job_scope: JobScope,
                                     job_context: JobContext) -> int:
    """
    This task should import pages from FB using the Business API. At the moment, it is not used anywhere.
    """
    logger.info(f'{job_scope} started')

    if not job_scope.tokens:
        good_token = PlatformTokenManager.from_job_scope(
            job_scope).get_best_token()
        if good_token is not None:
            job_scope.tokens = [good_token]

    return collect_pages_from_business(job_scope, job_context)
Example #15
    def iter_collect_insights(cls, job_scope: JobScope, _):
        """
        Central, *GENERIC* implementation of insights fetcher task

        The goal of this method is to be the entry point for
        metrics fetching Celery tasks. This method is expected to parse
        the JobScope object, figure out what needs to be done
        based on data in the JobScope object and convert that data into
        proper parameters for calling FB

        :param job_scope: The JobScope as we get it from the task itself
        :param _: A job context we use for entity checksums
        """
        if not job_scope.tokens:
            raise ValueError(
                f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
            )

        token = job_scope.token
        # We don't use it for getting a token. Something else that calls us does.
        # However, we use it to report usages of the token we got.
        token_manager = PlatformTokenManager.from_job_scope(job_scope)

        scope_parsed = JobScopeParsed(job_scope, ReportEntityApiKind.Ad)
        data_iter = cls.iter_ads_insights(scope_parsed.report_root_fb_entity,
                                          scope_parsed.report_params)

        with scope_parsed.datum_handler as store:
            for cnt, datum in enumerate(data_iter):
                # this computes values for and adds _oprm data object
                # to each datum that passes through us.
                scope_parsed.augment_with_vendor_data(datum)
                datum_with_transformed_fields = FieldTransformation.transform(
                    datum, Insights._ACTIONS_FIELDS_TO_TRANSFORM)

                store(datum_with_transformed_fields)
                yield datum_with_transformed_fields

                if cnt % 1000 == 0:
                    # default paging size for entities per parent
                    # is typically around 25. So, each 1000 results
                    # means about 40 hits to FB
                    token_manager.report_usage(token, 40)

        token_manager.report_usage(token)
Example #16
def iter_collect_entities_per_adaccount(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for an ad account
    """
    token, entity_type, root_fb_entity = _extract_token_entity_type_parent_entity(
        job_scope, Entity.AA_SCOPED, Entity.AdAccount, 'ad_account_id')

    entities = iter_native_entities_per_adaccount(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    token_manager = PlatformTokenManager.from_job_scope(job_scope)
    with ChunkDumpStore(job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store:
        for cnt, entity in enumerate(entities):
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(
                entity_data,
                id=generate_universal_id(
                    # FIXME: file a bug against the facebook ads SDK (get_ad_videos doesn't return ad videos but AbstractCrudObject),
                    # FIXME: so entity.Field.id cannot be accessed (only a problem for ad videos)
                    entity_id=entity_data.get('id'),
                    **record_id_base_data,
                ),
            )

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            # Signal to the system the new entity
            feedback_entity_task.delay(entity_data, entity_type)

            yield entity_data

            if cnt % 1000 == 0:
                # default paging size for entities per parent
                # is typically around 200, so every 1000 results
                # means about 5 hits to FB
                token_manager.report_usage(token, 5)

    # Report on the effective task status
    token_manager.report_usage(token)
Example #17
def collect_entities_from_iterator(
        job_scope: JobScope, entity_iterator: Generator[object, None,
                                                        None]) -> int:
    logger.info(f'{job_scope} started')

    if not job_scope.tokens:
        good_token = PlatformTokenManager.from_job_scope(
            job_scope).get_best_token()
        if good_token is not None:
            job_scope.tokens = [good_token]

    cnt = 0
    try:
        for cnt, datum in enumerate(entity_iterator):
            if cnt % 100 == 0:
                logger.info(f'{job_scope} processed {cnt} data points so far')
    except Exception as e:
        raise CollectionError(e, cnt)

    logger.info(f'{job_scope} completed a total of {cnt} data points')
    return cnt
Example #18
def collect_insights_task(job_scope: JobScope, job_context: JobContext):
    logger.info(f'{job_scope} started')

    if not job_scope.tokens:
        good_token = PlatformTokenManager.from_job_scope(
            job_scope).get_best_token()
        if good_token is not None:
            job_scope.tokens = [good_token]

    data_iter = Insights.iter_collect_insights(job_scope, job_context)

    cnt = 0
    try:
        for cnt, datum in enumerate(data_iter):
            if cnt % 100 == 0:
                logger.info(f'{job_scope} processed {cnt} data points so far')
    except Exception as e:
        raise CollectionError(e, cnt)

    logger.info(f'{job_scope} completed a total of {cnt} data points')
    return cnt
Example #19
    def test_empty_state(self):
        token_manager = PlatformTokenManager(self.asset_scope, self.sweep_id)

        assert token_manager.get_token_count() == 0
Example #20
def init_tokens(sweep_id):
    """
    This is an accidental child of two parents: indecision and imperfection.

    *We need tokens in workers.*

    (A) The "clean" way to pass tokens to workers would be
    in the args - the Celery task args.

    (B) To get there, the cleanest way to have tokens available for insertion
    into Celery task call args is to have that data available in the Oozer: for
    each Job ID it pulls from the queue, it would also, somehow, get the other
    parts of the JobScope data, like tokens - but those are NOT on the Job ID it pulls.

    (C) So, to have this additional JobScope data available to the Oozer's looper
    on a per-Job-ID basis, it would be "clean" to pre-apportion it (store it
    somewhere temporarily, one copy per Job ID, written by the Sweep Builder).

    (D) In that case - where the Builder just pre-assigns tokens and other scope data
    to every single job and persists it along each Job ID - our generator pipeline
    cleanly delivers this stuff from the Reality Inferrer, through the Expectation
    Builder, through the Prioritizer to the Persister - all on top of the *Claim
    family of objects - all in process - all fine... Right?

    Well...

    The effective duplication of JobScope data inside the Reality > Expectation > Prioritizer
    chain mentioned in part (D) is super cheap duplication - we send exactly the same 2-3 tokens
    with each Reality / Expectation / Prioritization Claim object again and again, but we never
    hold ALL of the Claims for the sweep in memory. It's like watching a movie where parts of
    a shot repeat because they belong to the same stage / scene, but we hold only one frame
    of the movie at a time. We don't feel the weight of the data (token) duplication at stage (D).

    However, at stage (C) - the Sweep Builder *Persister* - we are serializing the entire movie,
    all frames, all shots, into Redis. Naturally, it feels icky to write the same exact 3 tokens
    several million times to different keys in Redis.

    Then you take this to the Oozer level, where all of these million records with the same values
    need to be read, serialized into Celery call signatures and written again to the Celery broker
    DB (Redis again). Luckily there we *ooze* the tasks out, so the duplication is only several
    hundred thousand records at a time.

    All that writing + reading counted in millions, plus the pressure on Redis memory, starts
    to be annoying.

    And... then we arrive at the realization that the "clean" way of apportioning tokens ahead
    of time is actually not that clean, because we actually need to delay apportioning the tokens
    until the very end, when the actual worker wakes up from the queue and asks "Which token is
    still alive?"

    What we realized is that apportioning tokens to each job in the Builder is completely
    inefficient, and that apportioning tokens to each Celery task in the Oozer looper is largely
    inefficient (because it shoves thousands of tasks into the Celery queue and by then it is too
    late to react to throttling). What we realized is that the Celery task itself is best placed
    to pick a token at the time it gets its turn to run.

    So, if workers pick from a weighted / scored collection of platform tokens - great! Clever!
    But where do we form that collection?...

    This is the awkward piece of code where we front-run the entire Sweep Builder run and
    manufacture (dump to Redis) collections of tokens (grouped by various scopes). It lives here
    because we did not find a better place for it yet.

    TODO: Find a better place for it,
          as this off-on-the-side band-aid looper code feels redundant since we have a loop already.
    """
    for scope_record in iter_scopes():
        PlatformTokenManager.populate_from_scope_entity(scope_record, sweep_id)
        PageTokenManager.populate_from_scope_entity(scope_record, sweep_id)
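A minimal sketch of the consuming side this pre-population exists for - a hypothetical worker task that, at run time, picks the best surviving token out of the collections init_tokens dumped to Redis. This mirrors the *_task examples above; some_collection_task and do_work are placeholders, not part of the real code:

def some_collection_task(job_scope: JobScope, _job_context: JobContext):
    # Tokens were deliberately not apportioned to the job ahead of time (see the
    # docstring above), so pick the best-scored, still-alive token only when the
    # task actually gets its turn to run.
    if not job_scope.tokens:
        good_token = PlatformTokenManager.from_job_scope(job_scope).get_best_token()
        if good_token is not None:
            job_scope.tokens = [good_token]

    do_work(job_scope)  # placeholder for the actual collection logic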
Example #21
    def test_token_priority_with_failure_buckets(self):
        token_manager = PlatformTokenManager(self.asset_scope, self.sweep_id)

        first_token = 'first'
        second_token = 'second'
        third_token = 'third'

        token_manager.add(first_token, second_token, third_token)

        assert token_manager.get_token_count() == 3

        # Used the first token
        token_manager.report_usage_per_failure_bucket(
            first_token, FailureBucket.Throttling)  # most penalized
        token_manager.report_usage_per_failure_bucket(
            third_token, FailureBucket.TooLarge)  # somewhat heavily penalized
        token_manager.report_usage_per_failure_bucket(
            second_token, 'blah')  # gets default 1 use

        # best token is one with least penalty / use
        assert token_manager.get_best_token() == second_token
        token_manager.remove(second_token)

        # least worst of remaining ones
        assert token_manager.get_best_token() == third_token
        token_manager.remove(third_token)

        # well... it's only one left
        assert token_manager.get_best_token() == first_token
        token_manager.remove(first_token)

        # allow None to be returned
        assert token_manager.get_best_token() is None
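Taken together with _report_failure above, these tests sketch the intended consumption pattern: a worker asks for the least-penalized token, uses it, and reports back per failure bucket so that throttled or otherwise penalized tokens sink in the queue. A hypothetical worker-side loop, assuming the manager was already populated for the sweep (run_with_best_token, run_job and the removal policy are illustrative, not part of the real code):

def run_with_best_token(job_scope: JobScope):
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    while token_manager.get_token_count():
        token = token_manager.get_best_token()
        if token is None:
            break
        try:
            run_job(job_scope, token)  # placeholder for the actual platform call
            token_manager.report_usage(token)  # plain usage, smallest penalty
            return
        except Exception:
            # Penalize the token roughly the way _report_failure does (it picks the
            # bucket via FacebookApiErrorInspector; here we just use the generic one),
            # so the next get_best_token() call prefers less-penalized tokens.
            token_manager.report_usage_per_failure_bucket(token, FailureBucket.Other)
            token_manager.remove(token)  # illustrative policy: drop it and try the next one

    raise RuntimeError('No usable platform tokens left for this sweep')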