def _iter_collect_organic_insights(
        cls, data_iter: Generator[Dict[str, Any], None, None],
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Stores the raw organic insights payload as-is, then transposes it into a
    single flat record keyed by metric name, stores that record and yields it.
    """
    raw_store = batch_store.NormalStore(
        job_scope,
        bucket_type=ColdStoreBucketType.RAW_BUCKET,
        custom_namespace=NAMESPACE_RAW)
    orig_store = batch_store.NormalStore(
        job_scope, bucket_type=ColdStoreBucketType.ORIGINAL_BUCKET)

    common_vendor_data = {
        'ad_account_id': job_scope.ad_account_id,
        'entity_type': job_scope.report_variant,
        'report_type': job_scope.report_type,
        ORGANIC_DATA_ENTITY_ID_MAP[job_scope.report_variant]: job_scope.entity_id,
    }

    data = list(data_iter)

    # store the raw response verbatim first
    raw_record = {
        'payload': data,
        'page_id': job_scope.ad_account_id,
        ORGANIC_DATA_ENTITY_ID_MAP[job_scope.report_variant]: job_scope.entity_id,
    }
    vendor_data_raw = report_type_vendor_data_raw_extractor_map[
        job_scope.report_type](raw_record, **common_vendor_data)
    raw_record = add_vendor_data(raw_record, **vendor_data_raw)
    raw_store.store(raw_record)

    if len(data):
        # then, transpose it to correct form
        final_record = {
            'page_id': job_scope.ad_account_id,
            ORGANIC_DATA_ENTITY_ID_MAP[job_scope.report_variant]: job_scope.entity_id,
        }
        for param_datum in data:
            final_record[param_datum['name']] = param_datum['values'][0]['value']

        vendor_data = report_type_vendor_data_extractor_map[
            job_scope.report_type](raw_record, **common_vendor_data)
        final_record = add_vendor_data(final_record, **vendor_data)

        orig_store.store(final_record)
        yield final_record
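# Illustrative sketch only (not part of the collector above): the transposition
# in _iter_collect_organic_insights assumes each element of the payload is a
# metric object carrying a 'name' and a 'values' list, as page insights come
# back from the Graph API. The metric names, key names and numbers below are
# made up for illustration.
def _transpose_example():
    data = [
        {'name': 'post_impressions', 'values': [{'value': 1200}]},
        {'name': 'post_engaged_users', 'values': [{'value': 87}]},
    ]
    final_record = {'page_id': '123', 'post_id': '123_456'}
    for param_datum in data:
        final_record[param_datum['name']] = param_datum['values'][0]['value']
    # -> {'page_id': '123', 'post_id': '123_456',
    #     'post_impressions': 1200, 'post_engaged_users': 87}
    return final_record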
def iter_collect_entities_per_page_post(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page post
    """
    entity_type = job_scope.report_variant

    page_token_manager = PageTokenManager.from_job_scope(job_scope)
    with PlatformApiContext(
            page_token_manager.get_best_token(job_scope.ad_account_id)) as fb_ctx:
        root_fb_entity = fb_ctx.to_fb_model(job_scope.entity_id, Entity.PagePost)

    entities = iter_native_entities_per_page_post(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)
    del record_id_base_data['entity_id']

    with ChunkDumpStore(job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store:
        for entity in entities:
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(
                entity_data,
                id=generate_universal_id(
                    entity_id=entity_data.get('id'), **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id
            entity_data['page_post_id'] = job_scope.entity_id

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            yield entity_data
def collect_page(job_scope: JobScope, _job_context: JobContext):
    """
    Collect a single Facebook page
    """
    if job_scope.report_variant != Entity.Page:
        raise ValueError(
            f"Report level {job_scope.report_variant} specified is not: {Entity.Page}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    # We don't use it for getting a token. Something else that calls us does.
    # However, we use it to report usages of the token we got.
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        page_inst = page.Page(fbid=job_scope.entity_id, api=fb_ctx.api)
        page_fetched = page_inst.api_get(fields=get_default_fields(Page))

    report_job_status_task.delay(ExternalPlatformJobStatus.DataFetched, job_scope)
    token_manager.report_usage(token, 2)

    record_id_data = job_scope.to_dict()
    record_id_data.update(entity_type=Entity.Page,
                          entity_id=job_scope.entity_id,
                          report_variant=None)
    entity_data = page_fetched.export_all_data()
    entity_data = add_vendor_data(
        entity_data, id=generate_universal_id(**record_id_data))

    store = NormalStore(job_scope)
    store.store(entity_data)
def collect_pages_from_business(job_scope: JobScope,
                                _job_context: JobContext) -> int:
    """
    Collect all Facebook pages that are active
    """
    if job_scope.report_variant != Entity.Page:
        raise ValueError(
            f"Report level {job_scope.report_variant} specified is not: {Entity.Page}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    # We don't use it for getting a token. Something else that calls us does.
    # However, we use it to report usages of the token we got.
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        fb_req = FacebookRequest(node_id="me",
                                 method="GET",
                                 endpoint="/businesses",
                                 api=fb_ctx.api,
                                 api_type='EDGE',
                                 target_class=Business)
        businesses = fb_req.execute()

    report_job_status_task.delay(ExternalPlatformJobStatus.DataFetched, job_scope)
    token_manager.report_usage(token)

    entity_type = Entity.Page

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    cnt = 0
    for biz in businesses:
        client_pages = list(biz.get_client_pages(fields=get_default_fields(Page)))
        owned_pages = list(biz.get_owned_pages(fields=get_default_fields(Page)))
        pages_list = client_pages + owned_pages

        for page_inst in pages_list:
            entity_data = page_inst.export_all_data()
            record_id_base_data.update(entity_id=entity_data.get('id'))
            entity_data = add_vendor_data(
                entity_data, id=generate_universal_id(**record_id_base_data))

            store = NormalStore(job_scope)
            store.store(entity_data)
            cnt += 1

    report_job_status_task.delay(ExternalPlatformJobStatus.Done, job_scope)
    return cnt
def collect_adaccount(job_scope: JobScope) -> Dict[str, Any]:
    """
    Collects ad account data for an AdAccount-specific JobScope definition.

    :param JobScope job_scope: The JobScope as we get it from the task itself
    """
    if job_scope.report_variant != Entity.AdAccount:
        raise ValueError(
            f"Report level {job_scope.report_variant} specified is not: {Entity.AdAccount}"
        )

    token = job_scope.token
    if not token:
        raise ValueError(
            f"Job {job_scope.job_id} cannot proceed. No platform tokens provided."
        )

    assert (
        job_scope.ad_account_id == job_scope.entity_id
    ), 'This is an ad account entity job, account_id should be equal to entity_id'

    # Used to report token usage by this job
    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with PlatformApiContext(token) as fb_ctx:
        ad_account = fb_ctx.to_fb_model(job_scope.ad_account_id, Entity.AdAccount)

    fields = get_default_fields(ad_account.__class__)

    # Read just the fields we need
    ad_account_with_selected_fields = ad_account.api_get(fields=fields)
    # Export the object to a dict
    ad_account_data_dict = ad_account_with_selected_fields.export_all_data()

    token_manager.report_usage(token)

    job_scope_base = {
        # Duplicate the job_scope data to avoid mutating it
        **job_scope.to_dict(),
        'entity_type': Entity.AdAccount,
        'report_variant': None,
    }

    augmented_ad_account_data = add_vendor_data(
        # Augment the data returned from the remote API with our vendor data
        ad_account_data_dict,
        id=generate_universal_id(**job_scope_base),
    )
    feedback_entity_task.delay(ad_account_data_dict, job_scope.report_variant)

    store = NormalStore(job_scope)
    store.store(augmented_ad_account_data)

    # TODO: feedback account? this probably wouldn't make sense at the moment
    # because ad accounts are discovered from console and their lifecycle is
    # controlled from there.

    return ad_account_data_dict
def test_add_new_vendor_block(self):
    data = {'a': 1}
    data_should_be = {'a': 1, '__oprm': {'id': 5}}

    data_actual = add_vendor_data(data, id=5)

    assert data_actual is data, 'add_vendor_data must return the very same instance it was given, not a repackaged copy'
    assert data_actual == data_should_be
def test_update_existing_vendor_block(self):
    data = {'a': 1, '__oprm': {'id': 5}}
    data_should_be = {'a': 1, '__oprm': {'id': 5, 'extra_attr': 7}}

    data_actual = add_vendor_data(data, extra_attr=7)

    assert data_actual is data, 'add_vendor_data must return the very same instance it was given, not a repackaged copy'
    assert data_actual == data_should_be
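# For reference, a minimal sketch of an implementation consistent with the two
# tests above: mutate the passed-in dict in place, nesting vendor attributes
# under the '__oprm' key, and return the same instance. This is inferred from
# the tests only, not necessarily how add_vendor_data is actually written.
def add_vendor_data_sketch(data, **vendor_attrs):
    data.setdefault('__oprm', {}).update(vendor_attrs)
    return data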
def iter_collect_entities_per_page(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page
    """
    token, entity_type, root_fb_entity = _extract_token_entity_type_parent_entity(
        job_scope, [Entity.PagePost, Entity.PageVideo], Entity.Page, 'ad_account_id')

    entities = iter_native_entities_per_page(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with ChunkDumpStore(
            job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store, ChunkDumpStore(
                job_scope,
                chunk_size=DEFAULT_CHUNK_SIZE,
                bucket_type=ColdStoreBucketType.RAW_BUCKET,
                custom_namespace=NAMESPACE_RAW,
            ) as raw_store:
        cnt = 0
        for entity in entities:
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(
                entity_data,
                id=generate_universal_id(
                    entity_id=entity_data.get('id'), **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id

            if entity_type == Entity.PagePost:
                # store raw version of response (just to remain consistent)
                raw_store(entity_data)
                entity_data = _augment_page_post(entity_data)

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            # Signal to the system the new entity
            feedback_entity_task.delay(entity_data, entity_type)

            yield entity_data

            cnt += 1
            if cnt % 1000 == 0:
                # default paging size for entities per parent is typically
                # around 200. So, each 1000 results means about 5 hits to FB
                token_manager.report_usage(token, 5)

    token_manager.report_usage(token)
def iter_collect_entities_per_page_graph(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for a page using graph API
    """
    page_token_manager = PageTokenManager.from_job_scope(job_scope)
    with PlatformApiContext(
            page_token_manager.get_best_token(job_scope.ad_account_id)) as fb_ctx:
        page_root_fb_entity = fb_ctx.to_fb_model(job_scope.ad_account_id, Entity.Page)

    entity_type = job_scope.report_variant
    # page size reduced to avoid error:
    # "Please reduce the amount of data you're asking for, then retry your request"
    entities = iter_native_entities_per_page_graph(page_root_fb_entity,
                                                   entity_type,
                                                   page_size=30)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    with ChunkDumpStore(
            job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store, ChunkDumpStore(
                job_scope,
                chunk_size=DEFAULT_CHUNK_SIZE,
                bucket_type=ColdStoreBucketType.RAW_BUCKET,
                custom_namespace=NAMESPACE_RAW,
            ) as raw_store:
        for entity in entities:
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(
                entity_data,
                id=generate_universal_id(
                    entity_id=entity_data.get('id'), **record_id_base_data))
            entity_data['page_id'] = job_scope.ad_account_id

            if entity_type == Entity.PagePostPromotable:
                # store raw version of response (just to remain consistent)
                raw_store(entity_data)
                entity_data = _augment_page_post(entity_data)

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            # Signal to the system the new entity
            feedback_entity_task.delay(entity_data, entity_type)

            yield entity_data
def iter_collect_entities_per_adaccount(
        job_scope: JobScope) -> Generator[Dict[str, Any], None, None]:
    """
    Collects an arbitrary entity for an ad account
    """
    token, entity_type, root_fb_entity = _extract_token_entity_type_parent_entity(
        job_scope, Entity.AA_SCOPED, Entity.AdAccount, 'ad_account_id')

    entities = iter_native_entities_per_adaccount(root_fb_entity, entity_type)

    record_id_base_data = job_scope.to_dict()
    record_id_base_data.update(entity_type=entity_type, report_variant=None)

    token_manager = PlatformTokenManager.from_job_scope(job_scope)

    with ChunkDumpStore(job_scope, chunk_size=DEFAULT_CHUNK_SIZE) as store:
        for cnt, entity in enumerate(entities):
            entity_data = entity.export_all_data()
            entity_data = add_vendor_data(
                entity_data,
                id=generate_universal_id(
                    # FIXME: add a bug to facebook ads (get_ad_videos doesn't return ad videos but AbstractCrudObject)
                    # FIXME: so it is unable to access entity.Field.id then (only a problem for ad videos)
                    entity_id=entity_data.get('id'),
                    **record_id_base_data,
                ),
            )

            # Store the individual datum, use job context for the cold
            # storage thing to divine whatever it needs from the job context
            store(entity_data)

            # Signal to the system the new entity
            feedback_entity_task.delay(entity_data, entity_type)

            yield entity_data

            if cnt % 1000 == 0:
                # default paging size for entities per parent is typically
                # around 200. So, each 1000 results means about 5 hits to FB
                token_manager.report_usage(token, 5)

    # Report on the effective task status
    token_manager.report_usage(token)
def __init__(self, job_scope: JobScope, report_entity_api_kind: str):
    if job_scope.report_type not in ReportType.ALL_METRICS:
        raise ValueError(
            f"Report type {job_scope.report_type} specified is not one of "
            f"supported values: {ReportType.ALL_METRICS}")
    # cool. we are in the right place...

    self.report_params = {
        'fields': DEFAULT_REPORT_FIELDS,
        'action_attribution_windows': [
            # https://developers.facebook.com/docs/marketing-api/reference/adgroup/insights/
            # https://developers.facebook.com/docs/marketing-api/insights#sample
            # 'actions' and 'action_values' can contain values per different measurement window.
            # In case of 'actions', the default 'value' is always 1d_view PLUS 28d_click and cannot be removed.
            # In case of 'action_values', the default 'value' is some weighted sum of
            # 1d_view AND 28d_click $ values, which may be smaller than raw 1d_view PLUS 28d_click $ values.
            # Many customers interpret their conversions / actions in different attribution windows.
            # The more windows we ask the data for, the less reliably it returns reports.
            # Be super conservative about asking for more / all.
            AdsInsights.ActionAttributionWindows.value_1d_view,
            AdsInsights.ActionAttributionWindows.value_7d_view,
            AdsInsights.ActionAttributionWindows.value_28d_view,
            AdsInsights.ActionAttributionWindows.value_1d_click,
            AdsInsights.ActionAttributionWindows.value_7d_click,
            AdsInsights.ActionAttributionWindows.value_28d_click,
        ],
    }

    # Next is (a) vs (b) - abstraction level determination
    is_per_parent_report = not job_scope.entity_id and job_scope.report_variant in Entity.ALL
    if is_per_parent_report:
        entity_id = job_scope.ad_account_id
        entity_type = Entity.AdAccount
        entity_type_reporting = job_scope.report_variant
        if report_entity_api_kind == ReportEntityApiKind.Ad:
            self.report_params.update(
                level=ENUM_LEVEL_MAP[job_scope.report_variant])
    else:  # direct, per-entity report
        entity_id = job_scope.entity_id
        entity_type = job_scope.entity_type
        entity_type_reporting = job_scope.report_variant
        if report_entity_api_kind == ReportEntityApiKind.Ad:
            self.report_params.update(
                level=ENUM_LEVEL_MAP[entity_type_reporting])

    # Now, (c), (d), (e), (f), (g) choices.
    # We already checked above that this is one of the metrics report types,
    # so we know it will be either lifetime or day-with-breakdown type.
    # TODO: add fields listings appropriate for each type
    if job_scope.report_type == ReportType.lifetime:
        self.report_params.update(
            date_preset=AdsInsights.DatePreset.lifetime)
    elif job_scope.report_type in REPORT_TYPE_FB_BREAKDOWN_ENUM:
        # some day-with-breakdown type
        self.report_params.update(
            time_increment=1,  # group by calendar day (in AA tz)
            time_range={
                'since': _convert_and_validate_date_format(job_scope.range_start),
                # No value for job_scope.range_end means 1-day report for range_start day
                'until': _convert_and_validate_date_format(
                    job_scope.range_end or job_scope.range_start),
            },
            breakdowns=REPORT_TYPE_FB_BREAKDOWN_ENUM[job_scope.report_type],
        )
    else:
        raise ValueError(
            f"Report type {job_scope.report_type} does not have a mapped Platform-side breakdown value."
        )

    # Indicates that a datum returned in a per-parent report is by itself
    # naturally mapped to some single normative job, meaning each element can
    # be stored separately, but only under a normative ID computed on the fly
    # from the datum. This must be accompanied by a transform fn that derives
    # a normative ID from the data.

    # Special case: when the report type is per-specific-single-entity-ID AND
    # one of the per-day-with-breakdown types, we can spool the entire
    # per-Entity-ID-per-day bundle (e.g. 24 records for an hourly breakdown)
    # in memory before saving it.
    # This results in a single write to the cold store under a
    # single normative ID.
    is_whole_report_bundle_write = (
        # must be one of those per-day reports
        job_scope.report_type in ReportType.ALL_DAY_BREAKDOWNS and
        # except for DMA-based data, as these can be very long,
        # - 10s of thousands of records per day
        job_scope.report_type not in [
            ReportType.day_dma, ReportType.day_region, ReportType.day_country
        ] and
        # and the report is per single entity ID
        job_scope.entity_id and not job_scope.report_variant and
        # and report is for a single calendar day.
        # ReportType.ALL_DAY_BREAKDOWNS means there must be a non-Null
        # value in time_range, but we check anyway
        self.report_params['time_range']['since'] and
        self.report_params['time_range']['since'] ==
        self.report_params['time_range']['until'])

    # A more complex variant of whole_report_bundle_write is one where, while
    # we cannot spool the entire report into memory to write it as one bundle,
    # we cannot really write each individual result out either, as there will
    # be a shit-load of them and we have to write in some sort of batching
    # mode, but cannot cleanly group the bundles into per-normative-ID bundles,
    # and instead will write under the effective ID, but with a suffix
    # indicating the monotonically-increasing chunk number.

    # Disabled but kept for reference to compare to the shorter version
    # immediately below. These represent a good range of choices for cold
    # store handlers. When / if there is value to it, steal from this
    # commented-out code.
    # if is_naturally_normative_child:
    #     self.datum_handler = batch_store.NaturallyNormativeChildStore(job_scope)
    # elif is_whole_report_bundle_write:
    #     self.datum_handler = batch_store.MemorySpoolStore(job_scope)
    # elif is_chunk_write:
    #     self.datum_handler = batch_store.ChunkDumpStore(job_scope)
    # else:
    #     self.datum_handler = batch_store.NormalStore(job_scope)

    # Let's be more aggressive about doing bundled writes to cold store
    # and (temporarily) get away from "normative" and single-datum writes.
    # There are two ways we can get closer to bundled writes:
    # - spool the entire report in memory and flush out at the end, when we know we can tolerate that
    # - spool large chunks of the report in memory and flush them periodically if we fear large report sizes.
    if is_whole_report_bundle_write:
        self.datum_handler = batch_store.MemorySpoolStore(job_scope)
    else:
        self.datum_handler = batch_store.ChunkDumpStore(job_scope, chunk_size=200)

    with PlatformApiContext(job_scope.token) as fb_ctx:
        self.report_root_fb_entity = fb_ctx.to_fb_model(entity_id, entity_type)

    # here we configure code that will augment each datum with record ID
    vendor_data_extractor = report_type_vendor_data_extractor_map[
        job_scope.report_type]
    if job_scope.report_type == ReportType.day_hour:
        # the hour report type's ID extractor function needs an extra leading arg - timezone
        vendor_data_extractor = functools.partial(
            vendor_data_extractor, job_scope.ad_account_timezone_name)

    aux_data = {
        'ad_account_id': job_scope.ad_account_id,
        'entity_type': entity_type_reporting,
        'report_type': job_scope.report_type,
    }
    self.augment_with_vendor_data = lambda data: add_vendor_data(
        data, **vendor_data_extractor(data, **aux_data))
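# Illustrative only: for a hypothetical day_hour job scoped to one specific ad
# and a single calendar day (entity_id set, no report_variant,
# range_start == range_end), the __init__ above would end up with report params
# roughly like the dict below and would pick MemorySpoolStore, since the
# one-day hourly bundle (~24 records) is safe to hold in memory. The breakdown
# value and dates shown are assumptions for illustration, not captured output.
_example_day_hour_report_params = {
    'fields': DEFAULT_REPORT_FIELDS,
    'action_attribution_windows': [...],  # the six windows listed in __init__
    'time_increment': 1,
    'time_range': {'since': '2023-05-01', 'until': '2023-05-01'},
    'breakdowns': ['hourly_stats_aggregated_by_advertiser_time_zone'],
}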