Exemplo n.º 1
0
def main(user, google_cloud_credentials_file_path, pipeline_configuration_file_path, raw_data_dir):
    """Fetch raw data from every source listed in the pipeline configuration.

    Dispatches each configured raw data source to the appropriate fetch_* function.

    :param user: Identifier of the user running this program, recorded in TracedData metadata
                 by the downstream fetchers.
    :param google_cloud_credentials_file_path: Path to a Google Cloud service account credentials
                                               file, used to download credential blobs from cloud storage.
    :param pipeline_configuration_file_path: Path to the pipeline configuration file to load.
    :param raw_data_dir: Directory to save the fetched raw data to.
    :raises AssertionError: If a raw data source has an unrecognised type.
    """
    # Read the settings from the configuration file
    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(f)

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url
    ))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials,
        "avf-phone-uuid-"
    )
    log.info("Initialised the Firestore UUID table")

    log.info(f"Fetching data from {len(pipeline_configuration.raw_data_sources)} sources...")
    for i, raw_data_source in enumerate(pipeline_configuration.raw_data_sources):
        log.info(f"Fetching from source {i + 1}/{len(pipeline_configuration.raw_data_sources)}...")
        if isinstance(raw_data_source, RapidProSource):
            fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                                 raw_data_source)
        elif isinstance(raw_data_source, GCloudBucketSource):
            fetch_from_gcloud_bucket(google_cloud_credentials_file_path, raw_data_dir, raw_data_source)
        elif isinstance(raw_data_source, ShaqadoonCSVSource):
            fetch_from_shaqadoon_csv(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                                     raw_data_source)
        else:
            # Raise explicitly rather than `assert False`, which is silently stripped under `python -O`.
            # AssertionError is kept so any existing callers that catch it still work.
            raise AssertionError(f"Unknown raw_data_source type {type(raw_data_source)}")
def fetch_from_recovery_csv(user, google_cloud_credentials_file_path,
                            raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    """Fetch recovered message CSVs from cloud storage and export them as TracedData.

    Each blob is a CSV with 'Sender', 'Message' and 'ReceivedOn' columns, where 'Sender'
    is expected to already be in avf-phone-uuid de-identified form. Blobs whose output
    file already exists in `raw_data_dir` are skipped.

    :param user: Identifier of the user running this program, recorded in TracedData Metadata.
    :param google_cloud_credentials_file_path: Path to a Google Cloud service account credentials
                                               file, used to download the CSV blobs.
    :param raw_data_dir: Directory to write the exported .jsonl files to.
    :param phone_number_uuid_table: Unused here; 'Sender' values must already be de-identified
                                    (e.g. with de_identify_csv.py). Kept for interface consistency
                                    with the other fetch_* functions.
    :param recovery_csv_source: Configuration listing the GS URLs of the activation and survey CSVs.
    :raises AssertionError: If a 'Sender' value is not in the de-identified format.
    """
    log.info("Fetching data from a Recovery CSV...")

    # The timezone object is loop-invariant, so build it once rather than per row.
    mogadishu_tz = pytz.timezone("Africa/Mogadishu")

    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        flow_name = blob_url.split('/')[-1].split('.')[
            0]  # Takes the name between the last '/' and the '.csv' ending
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(
                f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download"
            )
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for row in raw_data:
            # 'ReceivedOn' may or may not include seconds; pick the strptime format by length.
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date,
                                                    "%d/%m/%Y %H:%M:%S")
            localized_date = mogadishu_tz.localize(parsed_raw_date)

            # Raise explicitly rather than using `assert`, which is stripped under `python -O`.
            # AssertionError is kept so the failure mode matches the previous assert-based check.
            # (Also fixes the previously unterminated quote around the blob URL in the message.)
            if not row["Sender"].startswith("avf-phone-uuid-"):
                raise AssertionError(
                    f"The 'Sender' column for '{blob_url}' contains an item that has not been de-identified "
                    f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."
                )

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)  # Deterministic id derived from the row's contents.
            }

            traced_runs.append(
                TracedData(
                    d,
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(
            f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, f)
        log.info("Exported TracedData")
def main(user, google_cloud_credentials_file_path,
         pipeline_configuration_file_path, raw_data_dir):
    """Fetch raw data from every source listed in the pipeline configuration.

    Dispatches each configured raw data source to the appropriate fetch_* function.

    :param user: Identifier of the user running this program, recorded in TracedData metadata
                 by the downstream fetchers.
    :param google_cloud_credentials_file_path: Path to a Google Cloud service account credentials
                                               file, used to download credential blobs from cloud storage.
    :param pipeline_configuration_file_path: Path to the pipeline configuration file to load.
    :param raw_data_dir: Directory to save the fetched raw data to.
    :raises AssertionError: If a raw data source has an unrecognised type.
    """
    # Read the settings from the configuration file
    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(
            f)
    Logger.set_project_name(pipeline_configuration.pipeline_name)
    log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path,
            pipeline_configuration.uuid_table.firebase_credentials_file_url))

    uuid_table = FirestoreUuidTable(
        pipeline_configuration.uuid_table.table_name,
        firestore_uuid_table_credentials,
        pipeline_configuration.uuid_table.uuid_prefix)
    log.info("Initialised the Firestore UUID table")

    log.info(
        f"Fetching data from {len(pipeline_configuration.raw_data_sources)} sources..."
    )
    for i, raw_data_source in enumerate(
            pipeline_configuration.raw_data_sources):
        log.info(
            f"Fetching from source {i + 1}/{len(pipeline_configuration.raw_data_sources)}..."
        )
        if isinstance(raw_data_source, RapidProSource):
            fetch_from_rapid_pro(user, google_cloud_credentials_file_path,
                                 raw_data_dir, uuid_table, raw_data_source)
        elif isinstance(raw_data_source, GCloudBucketSource):
            fetch_from_gcloud_bucket(google_cloud_credentials_file_path,
                                     raw_data_dir, raw_data_source)
        elif isinstance(raw_data_source, RecoveryCSVSource):
            fetch_from_recovery_csv(user, google_cloud_credentials_file_path,
                                    raw_data_dir, uuid_table, raw_data_source)
        elif isinstance(raw_data_source, FacebookSource):
            fetch_from_facebook(user, google_cloud_credentials_file_path,
                                raw_data_dir, uuid_table, raw_data_source)
        else:
            # Raise explicitly rather than `assert False`, which is silently stripped under `python -O`.
            # AssertionError is kept so any existing callers that catch it still work.
            raise AssertionError(f"Unknown raw_data_source type {type(raw_data_source)}")
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path,
                         raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    """Fetch flow runs and contacts from a Rapid Pro workspace and export them as TracedData.

    Raw runs and contacts are cached on disk under `raw_data_dir` (as *_raw.json files), so a
    subsequent invocation only fetches items modified since the previous export. Raw server
    responses are also appended to *_log.jsonl files — presumably for audit/recovery; confirm
    against the RapidProClient implementation.

    :param user: Identifier of the user running this program, recorded in TracedData metadata
                 (via convert_runs_to_traced_data).
    :param google_cloud_credentials_file_path: Path to a Google Cloud service account credentials
                                               file, used to download the Rapid Pro access token.
    :param raw_data_dir: Directory to read cached raw data from and write all output files to.
    :param phone_number_uuid_table: Firestore uuid table, passed to convert_runs_to_traced_data
                                    (presumably to de-identify phone numbers — confirm there).
    :param rapid_pro_source: Configuration describing the Rapid Pro domain, token location,
                             flow names and test contact uuids.
    """
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [
                Contact.deserialize(contact_json)
                for contact_json in json.load(raw_contacts_file)
            ]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(
            f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server"
        )
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(
                raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [
                        Run.deserialize(run_json)
                        for run_json in json.load(raw_runs_file)
                    ]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id,
                    raw_runs,
                    raw_export_log_file=raw_runs_log_file,
                    ignore_archives=True)
            except FileNotFoundError:
                log.info(
                    f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'"
                )
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(
                    flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        # NOTE(review): this refreshes `raw_contacts` once per flow, so later flows see newer
        # contacts than earlier ones — presumably intentional, to keep contacts as fresh as possible.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table,
            rapid_pro_source.test_contact_uuids)

        # Re-write this flow's raw-runs cache so the next invocation can fetch incrementally.
        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(
            f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    # Persist the final contacts cache (after the per-flow refreshes above).
    log.info(
        f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'..."
    )
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts],
                  raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
Exemplo n.º 5
0
    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    pipeline_configuration_file_path = args.pipeline_configuration_file_path
    traced_data_paths = args.traced_data_paths
    csv_output_file_path = args.csv_output_file_path

    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(
            f)
    Logger.set_project_name(pipeline_configuration.pipeline_name)
    log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, pipeline_configuration.
            phone_number_uuid_table.firebase_credentials_file_url))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials, "avf-phone-uuid-")
    log.info("Initialised the Firestore UUID table")

    uuids = set()
    skipped_nr = 0
    for path in traced_data_paths:
        # Load the traced data
        log.info(f"Loading previous traced data from file '{path}'...")
        with open(path) as f:
            data = TracedDataJsonIO.import_jsonl_to_traced_data_iterable(f)
        log.info(f"Loaded {len(data)} traced data objects")
Exemplo n.º 6
0
    memory_profile_file_path = args.memory_profile_file_path
    data_archive_file_path = args.data_archive_file_path

    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(
            f)
    Logger.set_project_name(pipeline_configuration.pipeline_name)
    log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

    # Upload to Google Drive, if requested.
    if pipeline_configuration.drive_upload is not None:
        log.info(f"Downloading Google Drive service account credentials...")
        credentials_info = json.loads(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, pipeline_configuration.
                drive_upload.drive_credentials_file_url))
        drive_client_wrapper.init_client_from_info(credentials_info)

        log.info("Uploading CSVs to Google Drive...")

        production_csv_drive_dir = os.path.dirname(
            pipeline_configuration.drive_upload.production_upload_path)
        production_csv_drive_file_name = os.path.basename(
            pipeline_configuration.drive_upload.production_upload_path)
        drive_client_wrapper.update_or_create(
            production_csv_input_path,
            production_csv_drive_dir,
            target_file_name=production_csv_drive_file_name,
            target_folder_is_shared_with_me=True)
Exemplo n.º 7
0
                             "credentials bucket")
    parser.add_argument("rapid_pro_domain", help="URL of the Rapid Pro server to download data from")
    parser.add_argument("rapid_pro_token_file_url", metavar="rapid-pro-token-file-url",
                        help="GS URLs of a text file containing the authorisation token for the Rapid Pro server")
    parser.add_argument("output_file_path", metavar="output-file-path",
                        help="Output CSV file to write the phone numbers to")

    args = parser.parse_args()

    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    rapid_pro_domain = args.rapid_pro_domain
    rapid_pro_token_file_url = args.rapid_pro_token_file_url
    output_file_path = args.output_file_path

    log.info("Downloading the Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_domain, rapid_pro_token)

    all_messages = rapid_pro.get_raw_messages()
    inbound_messages = [msg for msg in all_messages if msg.direction == "in"]

    inbound_phone_numbers = set()
    for msg in inbound_messages:
        if msg.urn.startswith("tel:"):
            phone_number = msg.urn.split(":")[1]
            inbound_phone_numbers.add(phone_number)
        else:
            log.warning(f"Skipped non-telephone URN type {msg.urn.split(':')[0]}")

    log.warning(f"Exporting {len(inbound_phone_numbers)} inbound phone numbers to {output_file_path}...")
        "credentials bucket")
    parser.add_argument(
        "firestore_credentials_url",
        metavar="firestore-credentials-url",
        help=
        "GS URL to the credentials file to use to access the Firestore instance containing "
        "the operations statistics")

    args = parser.parse_args()

    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    firestore_credentials_url = args.firestore_credentials_url

    log.info("Initialising the Firestore client...")
    firestore_credentials = json.loads(
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, firestore_credentials_url))
    firestore_wrapper = FirestoreWrapper(firestore_credentials)

    log.info("Loading the active project details...")
    active_projects = firestore_wrapper.get_active_projects()
    log.info(f"Loaded the details for {len(active_projects)} active projects")

    for project in active_projects:
        if project.flow_definitions_upload_url_prefix is None:
            log.info(
                f"Not archiving flow definitions for project {project.project_name} because its "
                f"'flow_definitions_upload_url_prefix' is unspecified.")
            continue

        log.info(
            f"Archiving the latest flow definitions for project {project.project_name}..."
Exemplo n.º 9
0
        metavar="target-credentials-url",
        help=
        "GS URL to the organisation access token file for authenticating to the target instance"
    )

    args = parser.parse_args()

    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    source_domain = args.source_domain
    source_credentials_url = args.source_credentials_url
    target_domain = args.target_domain
    target_credentials_url = args.target_credentials_url

    # Initialise the source/target instances
    log.info("Downloading the source instance access token...")
    source_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, source_credentials_url).strip()
    source_instance = RapidProClient(source_domain, source_token)

    log.info("Downloading the target instance access token...")
    target_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, target_credentials_url).strip()
    target_instance = RapidProClient(target_domain, target_token)

    # For each contact field in the source instance, create a matching contact field in the target instance if it
    # does not already exist
    log.info("Copying contact fields...")
    source_fields = source_instance.get_fields()
    target_field_keys = {f.key for f in target_instance.get_fields()}
    for field in source_fields:
        if field.key not in target_field_keys:
            target_instance.create_field(field.label)
    )

    args = parser.parse_args()

    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    force_update = args.force

    instance_1_domain = args.instance_1_domain
    instance_1_credentials_url = args.instance_1_credentials_url
    instance_2_domain = args.instance_2_domain
    instance_2_credentials_url = args.instance_2_credentials_url

    # Initialise the two instances
    log.info("Downloading the access token for instance 1...")
    instance_1_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        instance_1_credentials_url).strip()
    instance_1 = RapidProClient(instance_1_domain, instance_1_token)

    log.info("Downloading the target instance access token...")
    instance_2_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        instance_2_credentials_url).strip()
    instance_2 = RapidProClient(instance_2_domain, instance_2_token)

    # Synchronise the contact fields
    log.info("Synchronising contact fields...")
    instance_1_fields = instance_1.get_fields()
    instance_2_fields = instance_2.get_fields()
    for field in instance_1_fields:
        if field.key not in {f.key for f in instance_2_fields}:
Exemplo n.º 11
0
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    """Fetch flow runs and contacts from a Rapid Pro workspace and export them as TracedData.

    Raw runs and contacts are cached on disk under `raw_data_dir` (as *_raw.json files), so a
    subsequent invocation only fetches items modified since the previous export. Raw server
    responses are also appended to *_log.jsonl files. For activation flows, each traced run is
    additionally labelled with the name of the Rapid Pro source it came from.

    :param user: Identifier of the user running this program, recorded in TracedData metadata.
    :param google_cloud_credentials_file_path: Path to a Google Cloud service account credentials
                                               file, used to download the Rapid Pro access token.
    :param raw_data_dir: Directory to read cached raw data from and write all output files to.
    :param phone_number_uuid_table: Firestore uuid table, passed to convert_runs_to_traced_data
                                    (presumably to de-identify phone numbers — confirm there).
    :param rapid_pro_source: Configuration describing the Rapid Pro domain, token location,
                             source name, flow names and test contact uuids.
    """
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        # NOTE(review): this refreshes `raw_contacts` once per flow, so later flows see newer
        # contacts than earlier ones — presumably intentional, to keep contacts as fresh as possible.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(raw_contacts,
                                                                              raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE, CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        # Re-write this flow's raw-runs cache so the next invocation can fetch incrementally.
        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    # Persist the final contacts cache (after the per-flow refreshes above).
    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
def fetch_from_facebook(user, google_cloud_credentials_file_path, raw_data_dir,
                        facebook_uuid_table, facebook_source):
    """Fetch comments on the configured Facebook posts and export them as TracedData.

    For each dataset in `facebook_source`, downloads all comments on each of the dataset's
    posts (appending raw responses to per-post *_comments_log.jsonl files), attaches the
    parent post to each comment for downstream context, and writes both the raw comments
    and the converted TracedData to `raw_data_dir`.

    :param user: Identifier of the user running this program, passed through to the
                 TracedData conversion.
    :param google_cloud_credentials_file_path: Path to a Google Cloud service account
                                               credentials file, used to download the
                                               Facebook access token.
    :param raw_data_dir: Directory to write the raw and traced comment files to.
    :param facebook_uuid_table: Firestore uuid table, passed to
                                convert_facebook_comments_to_traced_data (presumably to
                                de-identify commenter ids — confirm there).
    :param facebook_source: Configuration describing the token location and the datasets
                            (name + post ids) to fetch.
    """
    log.info("Fetching data from Facebook...")
    log.info("Downloading Facebook access token...")
    facebook_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        facebook_source.token_file_url).strip()

    facebook = FacebookClient(facebook_token)

    for dataset in facebook_source.datasets:
        log.info(f"Exporting comments for dataset {dataset.name}...")
        raw_comments_output_path = f"{raw_data_dir}/{dataset.name}_raw.json"
        traced_comments_output_path = f"{raw_data_dir}/{dataset.name}.jsonl"

        # Download all the comments on all the posts in this dataset, logging the raw data returned by Facebook.
        raw_comments = []
        for post_id in dataset.post_ids:
            comments_log_path = f"{raw_data_dir}/{post_id}_comments_log.jsonl"
            with open(comments_log_path, "a") as raw_comments_log_file:
                post_comments = facebook.get_all_comments_on_post(
                    post_id,
                    raw_export_log_file=raw_comments_log_file,
                    fields=[
                        "from{id}", "parent", "attachments", "created_time",
                        "message"
                    ])

            # Download the post and add it as context to all the comments. Adding a reference to the post under
            # which a comment was made enables downstream features such as post-type labelling and comment context
            # in Coda, as well as allowing us to track how many comments were made on each post.
            post = facebook.get_post(post_id, fields=["attachments"])
            for comment in post_comments:
                comment["post"] = post

            raw_comments.extend(post_comments)

        # Facebook only returns a parent if the comment is a reply to another comment.
        # If there is no parent, set one to the empty-dict.
        for comment in raw_comments:
            if "parent" not in comment:
                comment["parent"] = {}

        # Convert the comments to TracedData.
        traced_comments = facebook.convert_facebook_comments_to_traced_data(
            user, dataset.name, raw_comments, facebook_uuid_table)

        # Export to disk.
        log.info(
            f"Saving {len(raw_comments)} raw comments to {raw_comments_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(raw_comments_output_path)
        with open(raw_comments_output_path, "w") as raw_comments_output_file:
            json.dump(raw_comments, raw_comments_output_file)
        log.info(f"Saved {len(raw_comments)} raw comments")

        log.info(
            f"Saving {len(traced_comments)} traced comments to {traced_comments_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_comments_output_path)
        with open(traced_comments_output_path,
                  "w") as traced_comments_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_comments, traced_comments_output_file)
        log.info(f"Saved {len(traced_comments)} traced comments")
Exemplo n.º 13
0
                        help="Path to a directory to save the raw data to")

    args = parser.parse_args()

    user = args.user
    pipeline_configuration_file_path = args.pipeline_configuration_file_path
    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    raw_data_dir = args.raw_data_dir

    # Read the settings from the configuration file
    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(f)

    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, pipeline_configuration.rapid_pro_token_file_url).strip()

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        pipeline_configuration.phone_number_uuid_table.firebase_credentials_file_url
    ))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials,
        "avf-phone-uuid-"
    )
    log.info("Initialised the Firestore UUID table")

    rapid_pro = RapidProClient(pipeline_configuration.rapid_pro_domain, rapid_pro_token)
    args = parser.parse_args()

    gzip_export_file_path = args.gzip_export_file_path
    gcs_upload_path = args.gcs_upload_path
    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    firebase_credentials_file_url = args.firebase_credentials_file_url
    table_names = args.table_names

    if gzip_export_file_path is None and gcs_upload_path is None:
        log.error(f"No output locations specified. Please provide at least one of --gzip-export-file-path or "
                  f"--gcs-upload-path")
        exit(1)

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        firebase_credentials_file_url
    ))

    id_tables = FirestoreUuidInfrastructure.init_from_credentials(firestore_uuid_table_credentials)
    if len(table_names) == 0:
        table_names = id_tables.list_table_names()
    log.info(f"Found {len(table_names)} uuid tables to export")

    export = dict()  # of table_name -> {mappings: dict of data -> uuid}
    for i, table_name in enumerate(table_names):
        log.info(f"Fetching mappings from table {i + 1}/{len(table_names)}: {table_name}...")
        mappings = id_tables.get_table(table_name, None).get_all_mappings()
        export[table_name] = {
            "mappings": mappings
        }
        log.info(f"Fetched {len(mappings)} mappings")
    args = parser.parse_args()

    user = args.user
    pipeline_configuration_file_path = args.pipeline_configuration_file_path
    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    raw_data_dir = args.raw_data_dir

    # Read the settings from the configuration file
    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(
            f)

    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path,
        pipeline_configuration.rapid_pro_token_file_url).strip()

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, pipeline_configuration.
            phone_number_uuid_table.firebase_credentials_file_url))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials, "avf-phone-uuid-")
    log.info("Initialised the Firestore UUID table")

    rapid_pro = RapidProClient(pipeline_configuration.rapid_pro_domain,
                               rapid_pro_token)