except Exception as e:
        completion_status = f'Uncaught exception: {e}'
        logging.error(completion_status, exc_info=True)
    finally:
        end_time = datetime.datetime.now()
        num_ads_added = search_runner.num_ads_added_to_db()
        num_impressions_added = search_runner.num_impressions_added_to_db()
        if not min_expected_ads_or_impressions_met(
                num_ads_added, min_expected_new_ads, num_impressions_added,
                min_expected_new_impressions):
            # Route the completion message to the error channel because the
            # expected number of new ads or impressions was not met.
            slack_url_for_completion_msg = slack_url_error_channel
        logging.info(search_runner.get_formatted_graph_error_counts())
        send_completion_slack_notification(
            slack_url_for_completion_msg, country_code_uppercase,
            completion_status, start_time, end_time, num_ads_added,
            num_impressions_added, min_expected_new_ads,
            min_expected_new_impressions,
            search_runner.get_formatted_graph_error_counts())
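

# A minimal sketch of the min_expected_ads_or_impressions_met helper used in
# the finally block above. Its real implementation is not part of this
# excerpt, so the semantics here (both thresholds must be reached) are an
# assumption.
def min_expected_ads_or_impressions_met(num_ads_added, min_expected_new_ads,
                                        num_impressions_added,
                                        min_expected_new_impressions):
    # Treat the run as healthy only if both counts met their minimums.
    return (num_ads_added >= min_expected_new_ads
            and num_impressions_added >= min_expected_new_impressions)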


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit(f"Usage: python3 {sys.argv[0]} generic_fb_collector.cfg")
    config = config_utils.get_config(sys.argv[1])
    country_code = config['SEARCH']['COUNTRY_CODE'].lower()

    config_utils.configure_logger(f"{country_code}_fb_api_collection.log")
    main(config)
                           | 'Fetch CrowdTangle results' >>
                           fetch_crowdtangle.FetchCrowdTangle())

        processed_results = (
            results
            | 'Transform CrowdTangle for SQL' >> beam.ParDo(
                process_crowdtangle_posts.ProcessCrowdTanglePosts())
            | 'Batch CrowdTangle results transformed for SQL' >>
            beam.transforms.util.BatchElements(min_batch_size=10,
                                               max_batch_size=500))

        if known_args.dry_run:

            def print_row(row):
                print(row)
                return row

            processed_results | beam.Map(print_row)

        else:
            (processed_results
             | 'Write processed results to Database' >> beam.ParDo(
                 write_crowdtangle_results_to_database.
                 WriteCrowdTangleResultsToDatabase(database_connection_params))
             )


if __name__ == '__main__':
    config_utils.configure_logger('run_fetch_crowdtangle.log')
    run()
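

# For context: known_args.dry_run in the pipeline above implies an argparse
# flag along these lines. The real argument parsing is not shown in this
# excerpt, so the parser below is an assumed shape, not the project's code.
import argparse


def parse_pipeline_args(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dry_run', action='store_true',
        help='Print processed rows instead of writing them to the database.')
    # parse_known_args leaves unrecognized flags for the Beam pipeline options.
    return parser.parse_known_args(argv)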
    ad_creative_video_bucket_client = make_gcs_bucket_client(AD_CREATIVE_VIDEOS_BUCKET,
                                                             GCS_CREDENTIALS_FILE)
    archive_screenshots_bucket_client = make_gcs_bucket_client(ARCHIVE_SCREENSHOTS_BUCKET,
                                                               GCS_CREDENTIALS_FILE)
    image_retriever = FacebookAdCreativeRetriever(
        database_connection_params, creative_retriever_factory, browser_context_factory,
        ad_creative_images_bucket_client, ad_creative_video_bucket_client,
        archive_screenshots_bucket_client, commit_to_db_every_n_processed, slack_url,
        slack_user_id_to_include, max_video_download_size=max_video_download_size)
    try:
        image_retriever.retreive_and_store_ad_creatives()
    except KeyboardInterrupt:
        # Don't send a Slack alert when the run is interrupted with Ctrl-C.
        raise
    except BaseException as error:
        slack_msg = (
            ':rotating_light: :rotating_light: :rotating_light: '
            'fb_ad_creative_retriever.py raised |%r| on host %s.'
            ':rotating_light: :rotating_light: :rotating_light:' % (
                error, socket.getfqdn()))
        send_slack_message(slack_url, slack_msg,
                           slack_user_id_to_include=slack_user_id_to_include)
        raise
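

# A minimal sketch of the send_slack_message helper called above, assuming a
# standard Slack incoming-webhook POST; the project's real helper (and its
# handling of slack_user_id_to_include) may differ.
import requests


def send_slack_message(slack_url, message, slack_user_id_to_include=None):
    if slack_user_id_to_include:
        # Prepend an @-mention so the named user is pinged in the channel.
        message = '<@%s> %s' % (slack_user_id_to_include, message)
    requests.post(slack_url, json={'text': message}, timeout=10)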


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <config file>' % sys.argv[0])
    config_utils.configure_logger("fb_ad_creative_retriever.log")
    main(sys.argv[1:])
    template['mappings'] = mappings
    headers = {'content-type': 'application/json'}
    req = requests.put(
        "https://%(es_cluster_name)s/_template/nyu_ad_creatives" %
        {'es_cluster_name': es_cluster_name},
        headers=headers,
        data=json.dumps(template))
    logging.info("Successfully created ad creatives template. Status code: %s",
                 req.status_code)
    if not req.ok:
        logging.warning("Encountered an error when creating template: %s",
                        req.content)
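

# create_pages_template is called from main() below, but its body is not part
# of this excerpt. A sketch by analogy with the ad creatives template above
# might look like the following; the index pattern, template name, and mapping
# are placeholders, not the project's real schema.
def create_pages_template(es_cluster_name):
    template = {
        'index_patterns': ['pages*'],  # Assumed index pattern.
        # Placeholder mapping; the real page schema is not shown here.
        'mappings': {'properties': {'page_name': {'type': 'text'}}},
    }
    headers = {'content-type': 'application/json'}
    req = requests.put(
        "https://%(es_cluster_name)s/_template/nyu_pages" %
        {'es_cluster_name': es_cluster_name},
        headers=headers,
        data=json.dumps(template))
    if not req.ok:
        logging.warning("Encountered an error when creating pages template: %s",
                        req.content)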


def main(argv):
    config = config_utils.get_config(argv[0])
    es_cluster_name = config['ELASTIC_SEARCH']['CLUSTER_NAME']
    logging.info("Creating ad screener elasticsearch pages template.")
    create_pages_template(es_cluster_name)
    logging.info("Creating ad screener elasticsearch ad creatives template.")
    create_ads_template(es_cluster_name)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <config file>' % sys.argv[0])
    config_utils.configure_logger("initalize_es.log")
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    main(sys.argv[1:])
"""Module to make batches of unfetched archive IDs and store them in the DB."""
import sys

import db_functions
import config_utils

DEFAULT_BATCH_SIZE = 1000
DEFAULT_MIN_AD_CREATION_DATE = '2019-01-01'


def main(config):
    country_code = config.get('SEARCH', 'COUNTRY_CODE', fallback=None)
    min_ad_creation_date = config.get('SEARCH',
                                      'MIN_AD_CREATION_DATE',
                                      fallback=DEFAULT_MIN_AD_CREATION_DATE)
    with config_utils.get_database_connection_from_config(
            config) as database_connection:
        database_interface = db_functions.DBInterface(database_connection)
        database_interface.make_snapshot_fetch_batches(
            batch_size=DEFAULT_BATCH_SIZE,
            country_code=country_code,
            min_ad_creation_date=min_ad_creation_date)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <config file>' % sys.argv[0])
    config_utils.configure_logger('archive_id_batcher.log')
    config = config_utils.get_config(sys.argv[1])
    main(config)
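
# A sample configuration for this module, assuming the standard configparser
# format read by config_utils.get_config. The section and option names match
# the reads in main() above; the values are illustrative, and any database
# connection options read by get_database_connection_from_config are omitted.
#
#   [SEARCH]
#   COUNTRY_CODE = US
#   MIN_AD_CREATION_DATE = 2019-01-01
#
# Example invocation (script name inferred from the log file name above):
#   python3 archive_id_batcher.py archive_id_batcher.cfg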
            total_records_inserted += len(es_records)
            logging.debug("Inserted %s ad creatives records.",
                          total_records_inserted)

            rows = cursor.fetchmany(AD_CREATIVES_TABLE_FETCH_BATCH_SIZE)

    logging.info("Copied %s ad creatives records in %d seconds.",
                 total_records_inserted, int(time.time() - start_time))
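

# How the es_records batches are inserted is outside this excerpt. A common
# pattern for bulk-loading rows into Elasticsearch, sketched under the
# assumption that the official elasticsearch-py client is in use (the project
# may do this differently):
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk


def bulk_insert_records(es_cluster_name, index_name, es_records):
    client = Elasticsearch('https://%s' % es_cluster_name)
    # Each bulk action targets one index and carries one document.
    actions = ({'_index': index_name, '_source': record}
               for record in es_records)
    bulk(client, actions)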


def main(argv):
    config = config_utils.get_config(argv[0])
    db_connection_params = config_utils.get_database_connection_params_from_config(
        config)
    es_cluster_name = config['ELASTIC_SEARCH']['CLUSTER_NAME']
    pages_index_name = config['ELASTIC_SEARCH']['PAGES_INDEX_NAME']
    ad_creatives_index_name = config['ELASTIC_SEARCH'][
        'AD_CREATIVES_INDEX_NAME']
    move_pages_to_es(db_connection_params, es_cluster_name, pages_index_name)
    move_ads_to_es(db_connection_params, es_cluster_name,
                   ad_creatives_index_name)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.exit('Usage: %s <config file>' % sys.argv[0])
    config_utils.configure_logger("populate_es.log")
    logging.getLogger("requests").setLevel(logging.WARNING)
    logging.getLogger("urllib3").setLevel(logging.WARNING)
    main(sys.argv[1:])