    except Exception as e:
        completion_status = f'Uncaught exception: {e}'
        logging.error(completion_status, exc_info=True)
    finally:
        end_time = datetime.datetime.now()
        num_ads_added = search_runner.num_ads_added_to_db()
        num_impressions_added = search_runner.num_impressions_added_to_db()
        if not min_expected_ads_or_impressions_met(
                num_ads_added, min_expected_new_ads, num_impressions_added,
                min_expected_new_impressions):
            # Log to the error channel because the expected number of new ads
            # or impressions was not met.
            slack_url_for_completion_msg = slack_url_error_channel
        logging.info(search_runner.get_formatted_graph_error_counts())
        send_completion_slack_notification(
            slack_url_for_completion_msg, country_code_uppercase,
            completion_status, start_time, end_time, num_ads_added,
            num_impressions_added, min_expected_new_ads,
            min_expected_new_impressions,
            search_runner.get_formatted_graph_error_counts())


if __name__ == '__main__':
    # Check argv before using it; the original read sys.argv[1] first.
    if len(sys.argv) < 2:
        sys.exit(f"Usage: python3 {sys.argv[0]} generic_fb_collector.cfg")
    config = config_utils.get_config(sys.argv[1])
    country_code = config['SEARCH']['COUNTRY_CODE'].lower()
    config_utils.configure_logger(f"{country_code}_fb_api_collection.log")
    main(config)
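# The min_expected_ads_or_impressions_met() helper called in the finally block
# above is defined elsewhere in this script. The sketch below is a hypothetical
# reconstruction of its assumed behavior (both counts must reach their
# configured minimums), included for illustration only.
def min_expected_ads_or_impressions_met(num_ads_added, min_expected_new_ads,
                                        num_impressions_added,
                                        min_expected_new_impressions):
    """Returns True only if both new-ad and new-impression counts meet their minimums."""
    return (num_ads_added >= min_expected_new_ads
            and num_impressions_added >= min_expected_new_impressions)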
        | 'Fetch CrowdTangle results' >> fetch_crowdtangle.FetchCrowdTangle())

    processed_results = (
        results
        | 'Transform CrowdTangle for SQL' >> beam.ParDo(
            process_crowdtangle_posts.ProcessCrowdTanglePosts())
        | 'Batch CrowdTangle results transformed for SQL' >>
        beam.transforms.util.BatchElements(min_batch_size=10,
                                           max_batch_size=500))

    if known_args.dry_run:
        def print_row(row):
            print(row)
            return row

        processed_results | beam.Map(print_row)
    else:
        (processed_results
         | 'Write processed results to Database' >> beam.ParDo(
             write_crowdtangle_results_to_database
             .WriteCrowdTangleResultsToDatabase(database_connection_params)))


if __name__ == '__main__':
    config_utils.configure_logger('run_fetch_crowdtangle.log')
    run()
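# known_args used above is presumably produced by Beam's usual argparse idiom,
# in which script-specific flags are split from the pipeline's own options.
# A minimal sketch of that idiom (the --dry_run flag name is inferred from
# known_args.dry_run; the parse_args() name is hypothetical):
import argparse

def parse_args(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('--dry_run', action='store_true',
                        help='Print transformed rows instead of writing them to the database.')
    # Returns (known_args, remaining args to pass to PipelineOptions).
    return parser.parse_known_args(argv)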
    ad_creative_video_bucket_client = make_gcs_bucket_client(
        AD_CREATIVE_VIDEOS_BUCKET, GCS_CREDENTIALS_FILE)
    archive_screenshots_bucket_client = make_gcs_bucket_client(
        ARCHIVE_SCREENSHOTS_BUCKET, GCS_CREDENTIALS_FILE)
    image_retriever = FacebookAdCreativeRetriever(
        database_connection_params, creative_retriever_factory,
        browser_context_factory, ad_creative_images_bucket_client,
        ad_creative_video_bucket_client, archive_screenshots_bucket_client,
        commit_to_db_every_n_processed, slack_url, slack_user_id_to_include,
        max_video_download_size=max_video_download_size)
    try:
        image_retriever.retreive_and_store_ad_creatives()
    except KeyboardInterrupt:
        # Don't send a Slack alert about Ctrl-C; just re-raise.
        raise
    except BaseException as error:
        slack_msg = (
            ':rotating_light: :rotating_light: :rotating_light: '
            'fb_ad_creative_retriever.py raised |%r| on host %s.'
            ':rotating_light: :rotating_light: :rotating_light:' % (
                error, socket.getfqdn()))
        send_slack_message(slack_url, slack_msg,
                           slack_user_id_to_include=slack_user_id_to_include)
        raise


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <config file>' % sys.argv[0])
    config_utils.configure_logger("fb_ad_creative_retriever.log")
    main(sys.argv[1:])
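# send_slack_message() used above is imported from elsewhere in this project.
# Below is a minimal sketch of a compatible sender, assuming slack_url is a
# standard Slack incoming-webhook URL; the <@user_id> mention syntax follows
# Slack's documented message formatting. Illustrative only, not the project's
# actual implementation.
import requests

def send_slack_message(slack_url, message, slack_user_id_to_include=None):
    if slack_user_id_to_include:
        message = '<@%s> %s' % (slack_user_id_to_include, message)
    # Incoming webhooks accept a JSON payload with a "text" field.
    requests.post(slack_url, json={'text': message}, timeout=10)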
    template['mappings'] = mappings
    headers = {'content-type': 'application/json'}
    # The original interpolated "% {es_cluster_name}" (a set literal), which
    # raises TypeError; named %-interpolation needs a mapping.
    req = requests.put(
        "https://%(es_cluster_name)s/_template/nyu_ad_creatives" % {
            'es_cluster_name': es_cluster_name},
        headers=headers, data=json.dumps(template))
    # Only report success when the request actually succeeded.
    if req.ok:
        logging.info("Successfully created ad creatives template. Status code: %s",
                     req.status_code)
    else:
        logging.warning("Encountered an error when creating template: %s",
                        req.content)


def main(argv):
    config = config_utils.get_config(argv[0])
    es_cluster_name = config['ELASTIC_SEARCH']['CLUSTER_NAME']
    logging.info("Creating ad screener elasticsearch pages template.")
    create_pages_template(es_cluster_name)
    logging.info("Creating ad screener elasticsearch ad creatives template.")
    create_ads_template(es_cluster_name)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <config file>' % sys.argv[0])
    config_utils.configure_logger("initalize_es.log")
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    main(sys.argv[1:])
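# For illustration, a minimal example of the kind of payload the PUT above
# sends to Elasticsearch's legacy /_template endpoint. The real `template`
# dict is built earlier in this file; the field names below are placeholders,
# not the project's actual mappings.
example_template = {
    'index_patterns': ['nyu_ad_creatives*'],
    'mappings': {
        'properties': {
            'ad_creative_body': {'type': 'text'},
        },
    },
}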
"""Module to make batches of unfetched archive IDs and store them in the DB.""" import sys import db_functions import config_utils DEFAULT_BATCH_SIZE = 1000 DEFAULT_MIN_AD_CREATION_DATE = '2019-01-01' def main(config): country_code = config.get('SEARCH', 'COUNTRY_CODE', fallback=None) min_ad_creation_date = config.get('SEARCH', 'MIN_AD_CREATION_DATE', fallback=DEFAULT_MIN_AD_CREATION_DATE) with config_utils.get_database_connection_from_config( config) as database_connection: database_interface = db_functions.DBInterface(database_connection) database_interface.make_snapshot_fetch_batches( batch_size=DEFAULT_BATCH_SIZE, country_code=country_code, min_ad_creation_date=min_ad_creation_date) if __name__ == '__main__': config_utils.configure_logger('archive_id_batcher.log') config = config_utils.get_config(sys.argv[1]) main(config)
            total_records_inserted += len(es_records)
            logging.debug("Inserted %s ad creatives records.", total_records_inserted)
            rows = cursor.fetchmany(AD_CREATIVES_TABLE_FETCH_BATCH_SIZE)
    logging.info("Copied %s ad creatives records in %d seconds.",
                 total_records_inserted, int(time.time() - start_time))


def main(argv):
    config = config_utils.get_config(argv[0])
    db_connection_params = config_utils.get_database_connection_params_from_config(
        config)
    es_cluster_name = config['ELASTIC_SEARCH']['CLUSTER_NAME']
    pages_index_name = config['ELASTIC_SEARCH']['PAGES_INDEX_NAME']
    ad_creatives_index_name = config['ELASTIC_SEARCH']['AD_CREATIVES_INDEX_NAME']
    move_pages_to_es(db_connection_params, es_cluster_name, pages_index_name)
    move_ads_to_es(db_connection_params, es_cluster_name,
                   ad_creatives_index_name)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <config file>' % sys.argv[0])
    config_utils.configure_logger("populate_es.log")
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    main(sys.argv[1:])
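# The truncated loop above follows the standard DB-API cursor-batching pattern:
# fetchmany() until the cursor is exhausted, inserting each batch as it
# arrives. A self-contained sketch of that pattern (function and parameter
# names here are illustrative, not from this file):
def copy_in_batches(cursor, batch_size, insert_batch):
    total_records_inserted = 0
    rows = cursor.fetchmany(batch_size)
    while rows:
        insert_batch(rows)
        total_records_inserted += len(rows)
        rows = cursor.fetchmany(batch_size)
    return total_records_inserted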