    def set_matrix_keys(user, td, show_keys, coded_shows_prefix,
                        radio_q_prefix):
        matrix_d = dict()

        special = None
        if td.get("{}_NC".format(coded_shows_prefix)) == "1":
            special = "0"
        if td.get("{}_stop".format(coded_shows_prefix)) == "1":
            special = "stop"

        for output_key in td:
            if output_key.startswith(coded_shows_prefix):
                code_key = output_key.replace(coded_shows_prefix,
                                              radio_q_prefix)

                if code_key.endswith("_NC") or code_key.endswith("_stop"):
                    continue

                show_keys.add(code_key)
                if special is not None:
                    matrix_d[code_key] = special
                else:
                    matrix_d[code_key] = td[output_key]

        td.append_data(
            matrix_d, Metadata(user, Metadata.get_call_location(),
                               time.time()))
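A minimal usage sketch for the function above, assuming it is in scope as a plain function and that TracedData/Metadata follow the core_data_modules layout; the prefixes and field names here are hypothetical.

import time
from core_data_modules.traced_data import Metadata, TracedData

td = TracedData(
    {"coded_show_q1": "1", "coded_show_q2": "0",
     "coded_show_NC": "0", "coded_show_stop": "0"},
    Metadata("test_user", "sketch", time.time()))
show_keys = set()

set_matrix_keys("test_user", td, show_keys, "coded_show", "radio_q")

# show_keys is now {"radio_q_q1", "radio_q_q2"}, and td carries those keys with
# values copied from the corresponding "coded_show_*" keys.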
Example #2
    def test_hide_keys(self):
        td = self.generate_test_data()

        self.assertEqual(td["id"], "0")
        self.assertEqual(td["phone"], "+441632000001")
        self.assertEqual(td["gender"], "man")

        with self.assertRaisesRegex(KeyError, "age"):
            td.hide_keys({"age"}, Metadata("test_user", "hide_keys", time.time()))

        td.hide_keys({"gender", "phone"}, Metadata("test_user", "hide_keys", time.time()))

        self.assertTrue("id" in td)
        self.assertFalse("phone" in td)
        self.assertFalse("gender" in td)

        self.assertRaises(KeyError, lambda: td["gender"])
        self.assertRaises(KeyError, lambda: td["phone"])

        self.assertEqual(td.get("gender"), None)
        self.assertEqual(td.get("id"), "0")

        self.assertSetEqual(set(td.keys()), {"id"})
        self.assertDictEqual(dict(td.items()), {"id": "0"})
        self.assertSetEqual(set(td.values()), {"0"})
        self.assertEqual(len(td), 1)

        with self.assertRaisesRegex(KeyError, "gender"):
            td.hide_keys({"gender"}, Metadata("test_user", "hide_keys", time.time()))

        td.append_data({"gender": "female"}, Metadata("test_user", "add_gender", time.time()))
        self.assertTrue("gender" in td)
        self.assertEqual(td["gender"], "female")
def label_somalia_operator(user, traced_runs, phone_number_uuid_table):
    # Set the operator codes for each message.
    uuids = {td["avf_phone_id"] for td in traced_runs}
    uuid_to_phone_lut = phone_number_uuid_table.uuid_to_data_batch(uuids)
    for td in traced_runs:
        operator_raw = uuid_to_phone_lut[td["avf_phone_id"]][:5]  # The country code 252 plus the next two digits

        operator_code = PhoneCleaner.clean_operator(operator_raw)
        if operator_code == Codes.NOT_CODED:
            operator_label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.SOMALIA_OPERATOR,
                CodeSchemes.SOMALIA_OPERATOR.get_code_with_control_code(Codes.NOT_CODED),
                Metadata.get_call_location()
            )
        else:
            operator_label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.SOMALIA_OPERATOR,
                CodeSchemes.SOMALIA_OPERATOR.get_code_with_match_value(operator_code),
                Metadata.get_call_location()
            )

        td.append_data({
            "operator_raw": operator_raw,
            "operator_coded": operator_label.to_dict()
        }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
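A hedged usage sketch; the stub uuid table, phone number, and uuid are invented, and the function's other dependencies (CodeSchemes, PhoneCleaner, CleaningUtils, TimeUtils) are assumed to be importable as in the original module.

import time
from core_data_modules.traced_data import Metadata, TracedData

class StubPhoneNumberUuidTable:
    # Minimal stand-in for the real table: maps uuids back to phone numbers.
    def uuid_to_data_batch(self, uuids):
        return {uuid: "252611234567" for uuid in uuids}

traced_runs = [TracedData({"avf_phone_id": "avf-phone-uuid-0001"},
                          Metadata("test_user", "sketch", time.time()))]
label_somalia_operator("test_user", traced_runs, StubPhoneNumberUuidTable())

# Each run now has "operator_raw" ("25261") and an "operator_coded" label dict.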
Example #4
    def compute_message_ids(cls, user, data, message_key,
                            message_id_key_to_write):
        """
        Appends a message id to each object in the provided iterable of TracedData.

        Message ids are set by computing the SHA of the value at each `message_key`, so are guaranteed to be stable.

        If the `message_key` is not found in a TracedData object in the iterable, no message id is assigned.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to set the message ids of.
        :type data: iterable of TracedData
        :param message_key: Key in TracedData objects of the raw text to generate message ids from.
        :type message_key: str
        :param message_id_key_to_write: Key in TracedData objects to write the message id to.
        :type message_id_key_to_write: str
        """
        for td in data:
            if message_key in td:
                td.append_data(
                    {message_id_key_to_write: SHAUtils.sha_string(td[message_key])},
                    Metadata(user, Metadata.get_call_location(), time.time()))
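A usage sketch, assuming this is the TracedDataCodaV2IO classmethod referenced elsewhere in these examples and the usual core_data_modules import paths:

import time
from core_data_modules.traced_data import Metadata, TracedData
from core_data_modules.traced_data.io import TracedDataCodaV2IO

data = [
    TracedData({"message": "hello"}, Metadata("test_user", "sketch", time.time())),
    TracedData({"other": "x"}, Metadata("test_user", "sketch", time.time()))
]

TracedDataCodaV2IO.compute_message_ids("test_user", data, "message", "message_id")

# data[0] now has a stable, SHA-derived "message_id"; data[1] is untouched
# because it has no "message" key.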
Example #5
    def determine_consent_withdrawn(cls,
                                    user,
                                    data,
                                    coding_plans,
                                    withdrawn_key="consent_withdrawn"):
        """
        Determines whether consent has been withdrawn, by searching for Codes.STOP in the given list of keys.

        TracedData objects where a stop code is found will have the key-value pair <withdrawn_key>: Codes.TRUE
        appended. Objects where a stop code is not found are not modified.

        Note that this does not actually set any other keys to Codes.STOP. Use Consent.set_stopped for this purpose.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to determine consent for.
        :type data: iterable of TracedData
        :param coding_plans: Coding plans whose keys will be searched for stop codes.
        :type coding_plans: iterable of CodingPlan
        :param withdrawn_key: Name of key to use for the consent withdrawn field.
        :type withdrawn_key: str
        """
        for td in data:
            if cls.td_has_stop_code(td, coding_plans):
                td.append_data({withdrawn_key: Codes.TRUE},
                               Metadata(user, Metadata.get_call_location(),
                                        time.time()))
    def remap_key_names(cls, user, data, pipeline_configuration):
        """
        Remaps Rapid Pro key names to their corresponding pipeline key names, as specified by the pipeline
        configuration's rapid_pro_key_remappings.
        
        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to remap the key names of.
        :type data: iterable of TracedData
        :param pipeline_configuration: Pipeline configuration.
        :type pipeline_configuration: PipelineConfiguration
        """
        for td in data:
            remapped = dict()
               
            for remapping in pipeline_configuration.rapid_pro_key_remappings:
                if remapping.is_activation_message:
                    continue

                old_key = remapping.rapid_pro_key
                new_key = remapping.pipeline_key
                
                if td.get(old_key) is not None and new_key not in td:
                    remapped[new_key] = td[old_key]

            td.append_data(remapped, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
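A sketch using throwaway namespace objects in place of a real PipelineConfiguration; only the attribute names used above (rapid_pro_key_remappings, rapid_pro_key, pipeline_key, is_activation_message) are assumed, and the owning class name is hypothetical.

import time
from types import SimpleNamespace
from core_data_modules.traced_data import Metadata, TracedData

remapping = SimpleNamespace(rapid_pro_key="Rqa_S01E01 - Text",
                            pipeline_key="rqa_s01e01_raw",
                            is_activation_message=False)
config = SimpleNamespace(rapid_pro_key_remappings=[remapping])

data = [TracedData({"Rqa_S01E01 - Text": "hello"},
                   Metadata("test_user", "sketch", time.time()))]
PipelineStages.remap_key_names("test_user", data, config)  # hypothetical owning class

# data[0]["rqa_s01e01_raw"] == "hello"; the original key is kept as well.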
Example #7
    def tag_beneficiary_participants(user, data, pipeline_configuration, raw_data_dir):
        """
        Tags the uids that belong to our partners' beneficiaries.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to tag beneficiary status to.
        :type data: iterable of TracedData
        :param pipeline_configuration: Pipeline configuration.
        :type pipeline_configuration: PipelineConfiguration
        :param raw_data_dir: Directory containing de-identified beneficiary contacts CSVs, with
                             beneficiary data stored in `avf-phone-uuid` and `location` columns.
        :type raw_data_dir: str
        """
        beneficiary_uids = set()  # Contains avf-phone ids of partner's beneficiaries.

        # Read beneficiary file CSVs data
        for beneficiary_file_url in pipeline_configuration.beneficiary_file_urls:
            with open(f'{raw_data_dir}/{beneficiary_file_url.split("/")[-1]}', "r", encoding='utf-8-sig') as f:
                beneficiary_data = list(csv.DictReader(f))
                for row in beneficiary_data:
                    beneficiary_uids.add(row['avf-phone-uuid'])

        # Tag each participant as a beneficiary (true) if they appear in the beneficiary contacts,
        # false otherwise. Example - "beneficiary": true
        for td in data:
            beneficiary_data = dict()  # Beneficiary status for this participant's uid
            beneficiary_data["beneficiary"] = td["uid"] in beneficiary_uids

            td.append_data(beneficiary_data, Metadata(user, Metadata.get_call_location(), time.time()))
Example #8
    def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
        # Label missing data
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
                if td.get(plan.raw_field, "") == "":
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.coded_field] = na_label.to_dict()
            td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

        # Auto-code remaining data
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            if plan.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(user, data, plan.raw_field, plan.coded_field,
                                                                    plan.cleaner, plan.code_scheme)

        # Output survey answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)
            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, 'w') as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field, {plan.coded_field: plan.code_scheme}, f
                )
        print("Coda demogs files successfully exported")

        return data 
Example #9
    def set_stopped(user,
                    data,
                    withdrawn_key="consent_withdrawn",
                    additional_keys=None):
        """
        For each TracedData object in an iterable whose 'withdrawn_key' is Codes.TRUE, sets every other key to
        Codes.STOP. If there is no withdrawn_key or the value is not Codes.TRUE, that TracedData object is not modified.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to set to stopped if consent has been withdrawn.
        :type data: iterable of TracedData
        :param withdrawn_key: Key in each TracedData object which indicates whether consent has been withdrawn.
        :type withdrawn_key: str
        :param additional_keys: Additional keys to set to 'STOP' (e.g. keys not already in some TracedData objects)
        :type additional_keys: list of str | None
        """
        if additional_keys is None:
            additional_keys = []

        for td in data:
            if td.get(withdrawn_key) == Codes.TRUE:
                stop_dict = {
                    key: Codes.STOP
                    for key in list(td.keys()) + additional_keys
                    if key != withdrawn_key
                }
                td.append_data(
                    stop_dict,
                    Metadata(user, Metadata.get_call_location(), time.time()))
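A usage sketch; per the determine_consent_withdrawn docstring above, this method lives on a Consent class, whose import path is assumed here.

import time
from core_data_modules.cleaners import Codes
from core_data_modules.traced_data import Metadata, TracedData

td = TracedData({"consent_withdrawn": Codes.TRUE, "gender": "f", "age": "20"},
                Metadata("test_user", "sketch", time.time()))

Consent.set_stopped("test_user", [td], additional_keys=["district"])

# Every key except "consent_withdrawn" is now Codes.STOP, including the
# previously absent "district" key.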
Example #10
    def set_channel_keys(cls, user, data, time_key):
        for td in data:
            timestamp = isoparse(td[time_key])

            channel_dict = dict()

            # Set channel ranges
            time_range_matches = 0
            matching_ranges = []
            for key, ranges in cls.CHANNEL_RANGES.items():
                if cls.timestamp_is_in_ranges(timestamp, ranges, matching_ranges):
                    time_range_matches += 1
                    channel_dict[key] = Codes.TRUE
                else:
                    channel_dict[key] = Codes.FALSE

            # Set time as NON_LOGICAL if it doesn't fall in range of the **sms ad/radio promo/radio_show**
            if time_range_matches == 0:
                # Assert in range of project
                assert PipelineConfiguration.PROJECT_START_DATE <= timestamp < PipelineConfiguration.PROJECT_END_DATE, \
                    f"Timestamp {td[time_key]} out of range of project"
                channel_dict[cls.NON_LOGICAL_KEY] = Codes.TRUE
            else:
                assert time_range_matches == 1, f"Time '{td[time_key]}' matches multiple time ranges {matching_ranges}"
                channel_dict[cls.NON_LOGICAL_KEY] = Codes.FALSE

            # Set show ranges
            for key, ranges in cls.SHOW_RANGES.items():
                if cls.timestamp_is_in_ranges(timestamp, ranges, matching_ranges):
                    channel_dict[key] = Codes.TRUE
                else:
                    channel_dict[key] = Codes.FALSE

            td.append_data(channel_dict, Metadata(user, Metadata.get_call_location(), time.time()))
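timestamp_is_in_ranges is not shown in this example. A plausible implementation, consistent with how it is called above but purely an assumption, might look like this (as a method on the same class, with isoparse imported at module level as above):

    @staticmethod
    def timestamp_is_in_ranges(timestamp, ranges, matching_ranges):
        # Assumes each range is an (inclusive-start, exclusive-end) pair of
        # ISO 8601 strings, and that matches are recorded so callers can
        # report which ranges overlapped.
        in_any_range = False
        for start, end in ranges:
            if isoparse(start) <= timestamp < isoparse(end):
                matching_ranges.append((start, end))
                in_any_range = True
        return in_any_range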
    def test_fold_groups(self):
        data = [TracedData({"x": c}, Metadata("test_user", Metadata.get_call_location(), i))
                for i, c in enumerate(["a", "b", "c", "d", "e"])]

        groups = [
            [data[0]],
            [data[1], data[2], data[3]],
            [data[4]]
        ]

        def fold_fn(td_1, td_2):
            td_1 = td_1.copy()
            td_2 = td_2.copy()

            folded_dict = {"x": "{}{}".format(td_1["x"], td_2["x"])}

            td_1.append_data(folded_dict, Metadata("test_user", Metadata.get_call_location(), 10))
            td_2.append_data(folded_dict, Metadata("test_user", Metadata.get_call_location(), 11))

            folded = td_1
            td_1.append_traced_data("folded_with", td_2, Metadata("test_user", Metadata.get_call_location(), 12))

            return folded

        folded_data = FoldTracedData.fold_groups(groups, fold_fn)

        self.assertDictEqual(dict(folded_data[0].items()), {"x": "a"})
        self.assertDictEqual(dict(folded_data[1].items()), {"x": "bcd"})
        self.assertDictEqual(dict(folded_data[2].items()), {"x": "e"})
Example #12
    def set_show_ids(cls, user, data, pipeline_configuration):
        """
        Sets a show pipeline key for each message, using the presence of Rapid Pro value keys to determine which
        show each message belongs to.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to set the show ids of.
        :type data: iterable of TracedData
        :param pipeline_configuration: Pipeline configuration.
        :type pipeline_configuration: PipelineConfiguration
        """
        for td in data:
            show_dict = dict()

            for remapping in pipeline_configuration.rapid_pro_key_remappings:
                if not remapping.is_activation_message:
                    continue

                if td.get(remapping.rapid_pro_key) is not None:
                    assert "rqa_message" not in show_dict
                    show_dict["rqa_message"] = td[remapping.rapid_pro_key]
                    show_dict["show_pipeline_key"] = remapping.pipeline_key

            td.append_data(
                show_dict,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))
Example #13
    def test_update_iterable(self):
        data_dicts = [
            {"id": "A", "message": "hello"},
            {"id": "B", "message": "hello"},
            {"id": "A", "message": "hi"}
        ]
        data = [
            TracedData(d, Metadata("test_user", "data_generator", time.time()))
            for d in data_dicts
        ]

        updates_dicts = [
            {"id": "A", "gender": "male"},
            {"id": "B", "gender": "female", "age": 20}
        ]
        updates = [
            TracedData(d, Metadata("test_user", "data_generator", time.time()))
            for d in updates_dicts
        ]

        TracedData.update_iterable("test_user", "id", data, updates, "demographics")

        expected_dicts = [
            {"id": "A", "message": "hello", "gender": "male"},
            {"id": "B", "message": "hello", "gender": "female", "age": 20},
            {"id": "A", "message": "hi", "gender": "male"}
        ]

        for td, expected_dict in zip(data, expected_dicts):
            self.assertDictEqual(dict(td.items()), expected_dict)
Example #14
    def set_show_ids(cls, user, data, show_id_map):
        """
        Sets a show_id for each message, using the presence of Rapid Pro value keys to determine which show each message
        belongs to.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to set the show ids of.
        :type data: iterable of TracedData
        :param show_id_map: Dictionary of Rapid Pro value key to show id.
        :type show_id_map: dict of str -> int
        """
        for td in data:
            show_dict = dict()

            for message_key, show_id in show_id_map.items():
                if message_key in td:
                    assert "rqa_message" not in show_dict
                    show_dict["rqa_message"] = td[message_key]
                    show_dict["show_id"] = show_id

            td.append_data(
                show_dict,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))
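A usage sketch; the owning class name and the Rapid Pro value keys are hypothetical.

import time
from core_data_modules.traced_data import Metadata, TracedData

show_id_map = {"rqa_s01e01_run": 1, "rqa_s01e02_run": 2}

data = [TracedData({"rqa_s01e02_run": "my message"},
                   Metadata("test_user", "sketch", time.time()))]
MessageUtils.set_show_ids("test_user", data, show_id_map)  # hypothetical owning class

# data[0] now has "rqa_message": "my message" and "show_id": 2.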
Example #15
    def _impute_coding_error_codes(user, data):
        for td in data:
            coding_error_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                if f"{plan.raw_field}_WS_correct_dataset" in td:
                    if td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"] == \
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                        for cc in plan.coding_configurations:
                            if cc.coding_mode == CodingModes.SINGLE:
                                coding_error_dict[cc.coded_field] = \
                                    CleaningUtils.make_label_from_cleaner_code(
                                        cc.code_scheme,
                                        cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                        Metadata.get_call_location()
                                    ).to_dict()
                            else:
                                assert cc.coding_mode == CodingModes.MULTIPLE
                                coding_error_dict[cc.coded_field] = [
                                    CleaningUtils.make_label_from_cleaner_code(
                                        cc.code_scheme,
                                        cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                        Metadata.get_call_location()
                                    ).to_dict()
                                ]

            td.append_data(
                coding_error_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))
Example #16
    def set_channel_keys(cls, user, data, time_key):
        for td in data:
            timestamp = isoparse(td[time_key])

            channel_dict = dict()

            # Set channel ranges
            time_range_matches = 0
            for key, ranges in cls.CHANNEL_RANGES.items():
                if cls.timestamp_is_in_ranges(timestamp, ranges):
                    time_range_matches += 1
                    channel_dict[key] = Codes.TRUE
                else:
                    channel_dict[key] = Codes.FALSE

            # Set time as NON_LOGICAL if it doesn't fall in range of the **sms ad/radio promo/radio_show**
            if time_range_matches == 0:
                # Assert in range of project
                assert isoparse("2018-12-02T00:00:00+03:00") <= timestamp < isoparse("2018-12-31T00:00:00+03:00"), \
                    f"Timestamp {td[time_key]} out of range of project"
                channel_dict[cls.NON_LOGICAL_KEY] = Codes.TRUE
            else:
                assert time_range_matches == 1, f"Time '{td[time_key]}' matches multiple time ranges"
                channel_dict[cls.NON_LOGICAL_KEY] = Codes.FALSE

            # Set show ranges
            for key, ranges in cls.SHOW_RANGES.items():
                if cls.timestamp_is_in_ranges(timestamp, ranges):
                    channel_dict[key] = Codes.TRUE
                else:
                    channel_dict[key] = Codes.FALSE

            td.append_data(
                channel_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))
Example #17
def convert_facebook_comments_to_traced_data(user, dataset_name, raw_comments, facebook_uuid_table):
    log.info(f"Converting {len(raw_comments)} Facebook comments to TracedData...")

    facebook_uuids = {comment["from"]["id"] for comment in raw_comments}
    facebook_to_uuid_lut = facebook_uuid_table.data_to_uuid_batch(facebook_uuids)

    traced_comments = []
    # Use a placeholder avf facebook id for now, to make the individuals file work until we know if we'll be able
    # to see Facebook user ids or not.
    for comment in raw_comments:
        comment["created_time"] = isoparse(comment["created_time"]).isoformat()
        validators.validate_utc_iso_string(comment["created_time"])

        comment_dict = {
            "avf_facebook_id": facebook_to_uuid_lut[comment["from"]["id"]]
        }
        for k, v in comment.items():
            comment_dict[f"{dataset_name}.{k}"] = v

        traced_comments.append(
            TracedData(comment_dict,
                       Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())))

    log.info(f"Converted {len(traced_comments)} Facebook comments to TracedData")

    return traced_comments
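A hedged usage sketch with a stub uuid table; the comment fields shown are only the ones the function reads, plus a hypothetical "message" field, and the module-level names the function relies on (isoparse, validators, TimeUtils, log) are assumed present.

class StubFacebookUuidTable:
    # Minimal stand-in that assigns a deterministic uuid per Facebook user id.
    def data_to_uuid_batch(self, facebook_ids):
        return {id_: f"avf-facebook-uuid-{id_}" for id_ in facebook_ids}

raw_comments = [{
    "from": {"id": "12345"},
    "created_time": "2020-01-01T10:00:00+0000",
    "message": "hello"
}]

traced_comments = convert_facebook_comments_to_traced_data(
    "test_user", "facebook_s01e01", raw_comments, StubFacebookUuidTable())

# traced_comments[0] has "avf_facebook_id" plus each comment field prefixed
# with the dataset name, e.g. "facebook_s01e01.message".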
Example #18
    def _remap_radio_show_by_time_range(cls,
                                        user,
                                        data,
                                        time_key,
                                        show_pipeline_key_to_remap_to,
                                        range_start=None,
                                        range_end=None,
                                        time_to_adjust_to=None):
        """
        Remaps radio show messages received in the given time range to another radio show.

        Optionally adjusts the datetime of re-mapped messages to a constant.

        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to set the show ids of.
        :type data: iterable of TracedData
        :param time_key: Key in each TracedData of an ISO 8601-formatted datetime string to read the message sent on
                         time from.
        :type time_key: str
        :param show_pipeline_key_to_remap_to: Pipeline key to assign to messages received within the given time range.
        :type show_pipeline_key_to_remap_to: str
        :param range_start: Start datetime for the time range to remap radio show messages from, inclusive.
                            If None, defaults to the beginning of time.
        :type range_start: datetime | None
        :param range_end: End datetime for the time range to remap radio show messages from, exclusive.
                          If None, defaults to the end of time.
        :type range_end: datetime | None
        :param time_to_adjust_to: Datetime to assign to the `time_key` field of re-mapped shows.
                                  If None, re-mapped shows will not have timestamps re-adjusted.
        :type time_to_adjust_to: datetime | None
        """
        if range_start is None:
            range_start = pytz.utc.localize(datetime.min)
        if range_end is None:
            range_end = pytz.utc.localize(datetime.max)

        log.info(
            f"Remapping messages in time range {range_start.isoformat()} to {range_end.isoformat()} "
            f"to show {show_pipeline_key_to_remap_to}...")

        remapped_count = 0
        for td in data:
            if time_key in td and range_start <= isoparse(td[time_key]) < range_end:
                remapped_count += 1

                remapped = {"show_pipeline_key": show_pipeline_key_to_remap_to}
                if time_to_adjust_to is not None:
                    remapped[time_key] = time_to_adjust_to.isoformat()

                td.append_data(
                    remapped,
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string()))

        log.info(
            f"Remapped {remapped_count} messages to show {show_pipeline_key_to_remap_to}"
        )
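A call sketch with invented dates and show key; the owning class name is hypothetical, and `data` is assumed to be an iterable of TracedData with a "sent_on" field.

from dateutil.parser import isoparse

RadioShowRemapper._remap_radio_show_by_time_range(  # hypothetical owning class
    "test_user", data, "sent_on", "rqa_s01e02_raw",
    range_start=isoparse("2019-01-08T00:00:00+03:00"),
    range_end=isoparse("2019-01-15T00:00:00+03:00"),
    time_to_adjust_to=isoparse("2019-01-08T19:00:00+03:00"))

# Messages whose "sent_on" falls in [range_start, range_end) are re-labelled
# with show_pipeline_key "rqa_s01e02_raw" and pinned to the constant time.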
def fetch_from_recovery_csv(user, google_cloud_credentials_file_path,
                            raw_data_dir, phone_number_uuid_table,
                            recovery_csv_source):
    log.info("Fetching data from a Recovery CSV...")
    for blob_url in recovery_csv_source.activation_flow_urls + recovery_csv_source.survey_flow_urls:
        # Take the name between the last '/' and the '.csv' ending
        flow_name = blob_url.split('/')[-1].split('.')[0]
        traced_runs_output_path = f"{raw_data_dir}/{flow_name}.jsonl"
        if os.path.exists(traced_runs_output_path):
            log.info(
                f"File '{traced_runs_output_path}' for blob '{blob_url}' already exists; skipping download"
            )
            continue

        log.info(f"Downloading recovered data from '{blob_url}'...")
        raw_csv_string = StringIO(
            google_cloud_utils.download_blob_to_string(
                google_cloud_credentials_file_path, blob_url))
        raw_data = list(csv.DictReader(raw_csv_string))
        log.info(f"Downloaded {len(raw_data)} recovered messages")

        log.info("Converting the recovered messages to TracedData...")
        traced_runs = []
        for i, row in enumerate(raw_data):
            raw_date = row["ReceivedOn"]
            if len(raw_date) == len("dd/mm/YYYY HH:MM"):
                parsed_raw_date = datetime.strptime(raw_date, "%d/%m/%Y %H:%M")
            else:
                parsed_raw_date = datetime.strptime(raw_date,
                                                    "%d/%m/%Y %H:%M:%S")
            localized_date = pytz.timezone("Africa/Mogadishu").localize(
                parsed_raw_date)

            assert row["Sender"].startswith("avf-phone-uuid-"), \
                f"The 'Sender' column for '{blob_url} contains an item that has not been de-identified " \
                f"into Africa's Voices Foundation's de-identification format. This may be done with de_identify_csv.py."

            d = {
                "avf_phone_id": row["Sender"],
                "message": row["Message"],
                "received_on": localized_date.isoformat(),
                "run_id": SHAUtils.sha_dict(row)
            }

            traced_runs.append(
                TracedData(
                    d,
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string())))
        log.info("Converted the recovered messages to TracedData")

        log.info(
            f"Exporting {len(traced_runs)} TracedData items to {traced_runs_output_path}..."
        )
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as f:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(
                traced_runs, f)
        log.info("Exported TracedData")
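For reference, the two "ReceivedOn" shapes that the length check above distinguishes, shown on made-up sample values:

from datetime import datetime

datetime.strptime("02/12/2018 09:30", "%d/%m/%Y %H:%M")        # dd/mm/YYYY HH:MM
datetime.strptime("02/12/2018 09:30:45", "%d/%m/%Y %H:%M:%S")  # dd/mm/YYYY HH:MM:SS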
Example #20
    def apply(cls, user, code_books, td):
        code_book_data = dict()
        for coded_key, code_book in code_books.items():
            code_book_data[coded_key] = cls.apply_code_book(code_book, td[coded_key])
        td.append_data(
            code_book_data,
            Metadata(user, Metadata.get_call_location(), time.time()))
    def tag_listening_groups_participants(cls, user, data, pipeline_configuration, raw_data_dir):
        """
        Tags the uids that participated in repeat listening groups and/or weekly listening
        group sessions.
        :param user: Identifier of the user running this program, for TracedData Metadata.
        :type user: str
        :param data: TracedData objects to tag listening group participation to.
        :type data: iterable of TracedData
        :param pipeline_configuration: Pipeline configuration.
        :type pipeline_configuration: PipelineConfiguration
        :param raw_data_dir: Directory containing de-identified listening group contacts CSVs, with
                             listening group data stored in `Name` and `avf-phone-uuid` columns.
        :type raw_data_dir: str
        """
        repeat_listening_group_participants = []  # Contains uids of participants who will attend
                                                  # all listening group sessions.
        listening_group_participants = dict()     # Contains sets of weekly listening group participants;
                                                  # the participants change each week.

        # Read the repeat listening group participants CSV and add their uids to the
        # repeat_listening_group_participants list
        if os.path.exists(f'{raw_data_dir}/repeat_listening_group.csv'):
            with open(f'{raw_data_dir}/repeat_listening_group.csv', "r", encoding='utf-8-sig') as f:
                repeat_listening_group_data = list(csv.DictReader(f))
                for row in repeat_listening_group_data:
                    repeat_listening_group_participants.append(row['avf-phone-uuid'])
                log.info(f'Loaded {len(repeat_listening_group_participants)} repeat listening group participants')
        else:
            log.warning(f'Skipping loading {raw_data_dir}/repeat_listening_group.csv, file not found!')

        # Read weekly listening group participants CSVs and add their uids to the respective radio-show
        # listening_group_participants lists
        listening_group_csvs = []
        for listening_group_csv_url in pipeline_configuration.listening_group_csv_urls:
            listening_group_csvs.append(listening_group_csv_url.split("/")[-1])
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            listening_group_participants[plan.dataset_name] = set()
            if plan.listening_group_filename in listening_group_csvs:
                with open(f'{raw_data_dir}/{plan.listening_group_filename}', "r",
                          encoding='utf-8-sig') as f:
                    plan_listening_group_data = list(csv.DictReader(f))
                    for row in plan_listening_group_data:
                        listening_group_participants[plan.dataset_name].add(row['avf-phone-uuid'])
                    log.info(f'Loaded {len(listening_group_participants[f"{plan.dataset_name}"])} '
                             f'{plan.dataset_name} listening group participants')
            else:
                log.warning(f'Skipping loading {plan.listening_group_filename}, file not found!')

        # 1. Tag each participant as a repeat listening group participant (true) if they appear in the
        #    repeat listening group contacts, false otherwise.
        #    Example - "repeat_listening_group_participant": true
        # 2. Tag each participant (true/false) for each radio show whose listening group they attended.
        #    Example - "kakuma_s01e01_listening_group_participant": false
        for td in data:
            listening_group_participation = dict()  # Repeat and weekly listening group participation data for this uid
            listening_group_participation["repeat_listening_group_participant"] = td["uid"] in repeat_listening_group_participants
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                listening_group_participation[f'{plan.dataset_name}_listening_group_participant'] = \
                    td['uid'] in listening_group_participants[plan.dataset_name]

            td.append_data(listening_group_participation, Metadata(user, Metadata.get_call_location(), time.time()))
Example #22
    def test_append_traced_data(self):
        # Note that this only tests failing appends. Successful appends are tested by the other methods in this suite.
        message_td = self.generate_message_td()

        demog_1_td = self.generate_demog_1_td()
        demog_1_td.append_data({"message": "should-fail"}, Metadata("test_user", "conflicting_message", time.time()))

        self.assertRaises(AssertionError,
                          lambda: message_td.append_traced_data(
                              "demog_1", demog_1_td, Metadata("test_user", "demog_1_append", time.time())))
Example #23
    def generate_test_data():
        test_data = list(generate_traced_data_iterable())

        test_data[1].append_data({"Gender": "f", "Gender_Coded": "Female"},
                                 Metadata("test_user", "gender_coder", 10))

        test_data[2].append_traced_data("Age_Data",
                                        TracedData({"age": 4}, Metadata("test_user", "age_generator", 11)),
                                        Metadata("test_user", "age_merger", 12))

        return test_data
Example #24
    def generate_test_data(cls):
        """Returns a new TracedData object with example id, phone, and gender fields"""
        message_td = cls.generate_message_td()

        demog_1_td = cls.generate_demog_1_td()
        demog_2_td = cls.generate_demog_2_td()

        message_td.append_traced_data("demog_1", demog_1_td, Metadata("test_user", "demog_1_append", time.time()))
        message_td.append_traced_data("demog_2", demog_2_td, Metadata("test_user", "demog_2_append", time.time()))

        return message_td
    def set_matrix_keys(user, data, all_matrix_keys, plan, code_ids, coded_key, matrix_prefix=""):
        for td in data:
            matrix_d = dict()

            for label in td.get(coded_key, []):
                matrix_d[f"{matrix_prefix}{code_ids[plan.code_scheme['Name']][label['CodeID']]}"] = Codes.MATRIX_1

            for key in all_matrix_keys:
                if key not in matrix_d:
                    matrix_d[key] = Codes.MATRIX_0

            td.append_data(matrix_d, Metadata(user, Metadata.get_call_location(), time.time()))
    def set_matrix_keys(user, data, all_matrix_keys, scheme, coded_key, matrix_prefix=""):
        for td in data:
            matrix_d = dict()

            for label in td.get(coded_key, []):
                matrix_d[f"{matrix_prefix}{scheme.get_code_with_id(label['CodeID']).string_value}"] = Codes.MATRIX_1

            for key in all_matrix_keys:
                if key not in matrix_d:
                    matrix_d[key] = Codes.MATRIX_0

            td.append_data(matrix_d, Metadata(user, Metadata.get_call_location(), time.time()))
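A sketch with a stub scheme standing in for a real CodeScheme; only get_code_with_id and the returned code's string_value are assumed, and the code ids and keys are invented.

import time
from types import SimpleNamespace
from core_data_modules.cleaners import Codes
from core_data_modules.traced_data import Metadata, TracedData

class StubScheme:
    # Stand-in for a CodeScheme: maps a code id straight to its string value.
    def get_code_with_id(self, code_id):
        return SimpleNamespace(string_value=code_id.replace("code-", ""))

data = [TracedData({"gender_coded": [{"CodeID": "code-female"}]},
                   Metadata("test_user", "sketch", time.time()))]
set_matrix_keys("test_user", data, {"female", "male"}, StubScheme(), "gender_coded")

# data[0] now has "female": Codes.MATRIX_1 and "male": Codes.MATRIX_0.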
    def coalesce_traced_runs_by_key(user, traced_runs, coalesce_key):
        coalesced_runs = dict()

        for run in traced_runs:
            if run[coalesce_key] not in coalesced_runs:
                coalesced_runs[run[coalesce_key]] = run
            else:
                coalesced_runs[run[coalesce_key]].append_data(
                    dict(run.items()),
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string()))

        return list(coalesced_runs.values())
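A usage sketch, assuming the function is in scope and the usual core_data_modules import paths:

import time
from core_data_modules.traced_data import Metadata, TracedData

runs = [
    TracedData({"avf_phone_id": "uuid-1", "q1": "yes"},
               Metadata("test_user", "sketch", time.time())),
    TracedData({"avf_phone_id": "uuid-1", "q2": "no"},
               Metadata("test_user", "sketch", time.time()))
]
coalesced = coalesce_traced_runs_by_key("test_user", runs, "avf_phone_id")

# len(coalesced) == 1, and the surviving run carries both "q1" and "q2".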
    def test_fold_traced_data(self):
        td_1_dict = {
            "equal_1": 4, "equal_2": "xyz",
            "concat": "abc",
            "matrix_1": Codes.MATRIX_0, "matrix_2": Codes.MATRIX_0,
            "bool_1": Codes.FALSE, "bool_2": Codes.TRUE,
            "yes_no_1": Codes.YES, "yes_no_2": Codes.YES,
            "other_1": "other 1", "other_2": "other 2"
        }

        td_2_dict = {
            "equal_1": 4, "equal_2": "xyz",
            "concat": "def",
            "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
            "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
            "yes_no_1": Codes.YES, "yes_no_2": Codes.NO,
            "other_1": "other"
        }

        td_1 = TracedData(td_1_dict, Metadata("test_user", Metadata.get_call_location(), 0))
        td_2 = TracedData(td_2_dict, Metadata("test_user", Metadata.get_call_location(), 1))

        fold_strategies = {
            "equal_1": FoldStrategies.assert_equal,
            "equal_2": FoldStrategies.assert_equal,
            "concat": FoldStrategies.concatenate,
            "bool_1": FoldStrategies.boolean_or,
            "bool_2": FoldStrategies.boolean_or,
            "matrix_1": FoldStrategies.matrix,
            "matrix_2": FoldStrategies.matrix,
            "yes_no_1": FoldStrategies.yes_no_amb,
            "yes_no_2": FoldStrategies.yes_no_amb
        }
        folded_td = FoldTracedData.fold_traced_data("test_user", td_1, td_2, fold_strategies)

        # Test input tds unchanged
        self.assertDictEqual(dict(td_1.items()), td_1_dict)
        self.assertDictEqual(dict(td_2.items()), td_2_dict)
        
        # Test folded td has expected values
        self.assertDictEqual(
            dict(folded_td.items()),
            {
                "equal_1": 4, "equal_2": "xyz",
                "concat": "abc;def",
                "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
                "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
                "yes_no_1": Codes.YES, "yes_no_2": Codes.AMBIVALENT
            }
        )
Example #29
    def import_csv_to_traced_data_iterable(user, f):
        """
        Loads a CSV into new TracedData objects.

        :param user: Identifier of user running this program.
        :type user: str
        :param f: File to import from.
        :type f: file-like
        :return: TracedData objects imported from the provided file.
        :rtype: generator of TracedData
        """
        for row in csv.DictReader(f):
            yield TracedData(
                dict(row),
                Metadata(user, Metadata.get_call_location(), time.time()))
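A usage sketch; calling via TracedDataCSVIO is an assumption about where this static method lives.

from io import StringIO
from core_data_modules.traced_data.io import TracedDataCSVIO

csv_file = StringIO("id,message\n0,hello\n1,hi")
for td in TracedDataCSVIO.import_csv_to_traced_data_iterable("test_user", csv_file):
    print(dict(td.items()))  # {'id': '0', 'message': 'hello'}, then {'id': '1', 'message': 'hi'}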
    def apply_cleaner_to_text(cls, cleaner, text, scheme, set_checked=False):
        """
        Applies a cleaning function to a text, and returns a label if the cleaned value wasn't NC.

        :param cleaner: Cleaning function to apply.
        :type cleaner: function of str -> str
        :param text: Text to apply the cleaner to.
        :type text: str
        :param scheme: Scheme containing codes which the string returned from the `cleaner` can be matched against.
        :type scheme: core_data_modules.data_models.CodeScheme
        :param set_checked: Whether to set the `checked` property of the applied Label.
        :type set_checked: bool
        :return: Label for the code matching the cleaned value, or None if the cleaner returned Codes.NOT_CODED.
        :rtype: core_data_modules.data_models.Label | None
        """
        clean_value = cleaner(text)

        # Don't label data which the cleaners couldn't code
        if clean_value == Codes.NOT_CODED:
            return None

        # Construct a label for the clean_value returned by the cleaner
        code = scheme.get_code_with_match_value(clean_value)
        origin_id = Metadata.get_function_location(cleaner)
        return cls.make_label_from_cleaner_code(scheme, code, origin_id, set_checked=set_checked)
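A usage sketch with a toy cleaner; CleaningUtils as the owning class matches its use elsewhere in these examples, while gender_scheme is an assumed, pre-loaded CodeScheme that has a "female" match value.

from core_data_modules.cleaners import Codes

def toy_gender_cleaner(text):
    # Hypothetical cleaner: recognises one token, otherwise gives up.
    return "female" if text.strip().lower() in {"f", "female"} else Codes.NOT_CODED

label = CleaningUtils.apply_cleaner_to_text(toy_gender_cleaner, "F", gender_scheme)
# label is a Label for the scheme's "female" code; cleaning unrecognised text
# would return None instead.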