def test_fold_list_of_labels(self):
        """Check the result of FoldStrategies.list_of_labels for pairs of label lists.

        The cases cover empty lists, NA-only lists, NA mixed with other labels, normal labels, the NR and NC
        control labels, and labels folded under a scheme other than the one they were created for.
        """
        def make_label(code_id, timestamp):
            # Every label in this test is created for "scheme-1" with the same origin arguments.
            return Label("scheme-1", code_id, timestamp, Origin("x", "test", "automatic")).to_dict()

        def assert_folds_to(labels_1, labels_2, expected):
            # Folds under scheme_1 and compares against the expected list of label dicts.
            self.assertEqual(FoldStrategies.list_of_labels(scheme_1, labels_1, labels_2), expected)

        def assert_rejected(scheme, labels_1, labels_2):
            # The fold is expected to raise an AssertionError for these inputs.
            self.assertRaises(AssertionError, FoldStrategies.list_of_labels, scheme, labels_1, labels_2)

        # Scheme 1 holds three control codes (NA, NR, NC) and two normal codes; scheme 2 is created with no codes.
        scheme_1 = CodeScheme("scheme-1", "Scheme 1", "1", [
            Code("code-NA", "Control", "NA", -10, "NA", True, control_code=Codes.TRUE_MISSING),
            Code("code-NR", "Control", "NR", -20, "NR", True, control_code=Codes.NOT_REVIEWED),
            Code("code-NC", "Control", "NC", -30, "NC", True, control_code=Codes.NOT_CODED),
            Code("code-normal-1", "Normal", "Normal 1", 1, "normal_1", True),
            Code("code-normal-2", "Normal", "Normal 2", 2, "normal_2", True),
        ])
        scheme_2 = CodeScheme("scheme-2", "Scheme 2", "2", [])

        na_label = make_label("code-NA", "2019-10-01T12:20:14Z")
        nr_label = make_label("code-NR", "2019-10-01T12:25:18Z")
        nc_label = make_label("code-NC", "2019-10-01T12:30:00Z")
        na_label_2 = make_label("code-NA", "2019-10-01T13:00:00Z")
        normal_1_label = make_label("code-normal-1", "2019-10-01T12:20:14Z")
        normal_1_label_2 = make_label("code-normal-1", "2019-10-03T00:00:00Z")
        normal_2_label = make_label("code-normal-2", "2019-10-01T15:00:00Z")

        # Empty lists must be rejected
        assert_rejected(scheme_1, [], [])
        assert_rejected(scheme_1, [na_label], [])

        # Lists containing only NA labels must fold to a single NA label
        assert_folds_to([na_label], [na_label], [na_label])
        assert_folds_to([na_label], [na_label_2], [na_label])

        # A list containing an NA label and another label (including another NA label) must be rejected
        assert_rejected(scheme_1, [na_label, na_label], [na_label])
        assert_rejected(scheme_1, [na_label, normal_1_label], [na_label])

        # Folding a normal label with an NA label
        assert_folds_to([na_label], [normal_1_label], [normal_1_label])

        # Folding various combinations of only normal labels
        assert_folds_to([normal_1_label], [normal_1_label], [normal_1_label])
        assert_folds_to([normal_1_label, normal_2_label], [normal_1_label], [normal_1_label, normal_2_label])
        assert_folds_to([normal_1_label, normal_2_label], [normal_1_label_2], [normal_1_label, normal_2_label])

        # Folding normal labels with a control code that isn't NA or NC
        assert_folds_to([normal_1_label, normal_2_label], [nr_label], [normal_1_label, normal_2_label, nr_label])

        # Folding labels under a different code scheme must be rejected...
        assert_rejected(scheme_2, [normal_1_label], [na_label])
        # ...whereas the same inputs must not raise under the correct code scheme
        FoldStrategies.list_of_labels(scheme_1, [normal_1_label], [na_label])

        # Folding normal codes with NC codes
        assert_folds_to([nc_label], [nc_label], [nc_label])
        assert_folds_to([na_label], [nc_label], [nc_label])
        assert_folds_to([normal_1_label], [nc_label], [normal_1_label])
        assert_folds_to([normal_1_label], [normal_2_label, nc_label], [normal_1_label, normal_2_label])
Exemplo n.º 2
0
def _open_scheme(filename):
    """Read `code_schemes/<filename>` as JSON and build a CodeScheme from it.

    :param filename: Name of the file to read, relative to the `code_schemes/` directory.
    :return: The result of passing the parsed JSON to `CodeScheme.from_firebase_map`.
    """
    scheme_path = f"code_schemes/{filename}"
    with open(scheme_path, "r") as scheme_file:
        return CodeScheme.from_firebase_map(json.load(scheme_file))
Exemplo n.º 3
0
    google_cloud_credentials_file_path = args.google_cloud_credentials_file_path
    pipeline_configuration_file_path = args.pipeline_configuration_file_path
    code_scheme_file_path = args.code_scheme_file_path
    traced_data_paths = args.traced_data_paths
    csv_output_file_path = args.csv_output_file_path

    log.info("Loading Pipeline Configuration File...")
    with open(pipeline_configuration_file_path) as f:
        pipeline_configuration = PipelineConfiguration.from_configuration_file(
            f)
    Logger.set_project_name(pipeline_configuration.pipeline_name)
    log.debug(f"Pipeline name is {pipeline_configuration.pipeline_name}")

    log.info(f"Loading code scheme from {code_scheme_file_path}...")
    with open(code_scheme_file_path) as f:
        code_scheme = CodeScheme.from_firebase_map(json.load(f))

    log.info("Downloading Firestore UUID Table credentials...")
    firestore_uuid_table_credentials = json.loads(
        google_cloud_utils.download_blob_to_string(
            google_cloud_credentials_file_path, pipeline_configuration.
            phone_number_uuid_table.firebase_credentials_file_url))

    phone_number_uuid_table = FirestoreUuidTable(
        pipeline_configuration.phone_number_uuid_table.table_name,
        firestore_uuid_table_credentials, "avf-phone-uuid-")
    log.info("Initialised the Firestore UUID table")

    uuids = set()
    county_counts = {county: 0 for county in TARGET_COUNTIES}
    for path in traced_data_paths:
Exemplo n.º 4
0
    def test_export_import_one_single_coded_scheme(self):
        """Round-trip gender messages through the Coda 2 export/import functions for one single-coded scheme.

        The test:
          1. auto-codes the raw messages with the English gender cleaner and checks the exported file against
             the expected resource file;
          2. imports without a Coda file (twice) and checks the resulting code ids;
          3. imports from the test resource file and checks the resulting code ids;
          4. checks that exporting two messages which share an id but have different labels raises an
             AssertionError with the expected message.
        """
        file_path = path.join(self.test_dir, "coda_2_test.json")

        # Build raw input data
        message_dicts = [
            {"gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00"},
            {"gender_raw": "", "gender_sent_on": "2018-11-01T07:17:04+03:00"},
            {"gender_raw": "hiya", "gender_sent_on": "2018-11-01T07:19:04+05:00"},
            {},
            {"gender_raw": "boy", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
            {"gender_raw": "man", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
        ]
        messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                    for i, d in enumerate(message_dicts)]

        # Add message ids
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

        # Load gender scheme
        with open("tests/traced_data/resources/coda_2_gender_scheme.json") as f:
            gender_scheme = CodeScheme.from_firebase_map(json.load(f))

        def set_true_missing_codes(traced_data_iterable):
            # Labels every message whose raw text is missing or empty with the scheme's TRUE_MISSING code.
            # The label is the same for every message, so it is built once; each message still receives its
            # own dict via to_dict().
            na_label = CleaningUtils.make_label_from_cleaner_code(
                gender_scheme,
                gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            for td in traced_data_iterable:
                if td.get("gender_raw", "") == "":
                    td.append_data({"gender_coded": na_label.to_dict()},
                                   Metadata("test_user", Metadata.get_call_location(), time.time()))

        # Apply the English gender cleaner, with the time and function location patched to fixed values
        with mock.patch("core_data_modules.util.TimeUtils.utc_now_as_iso_string") as time_mock, \
                mock.patch("core_data_modules.traced_data.Metadata.get_function_location") as location_mock:
            time_mock.return_value = "2018-11-02T15:00:07+00:00"
            location_mock.return_value = "english.DemographicCleaner.clean_gender"

            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                "test_user", messages, "gender_raw", "gender_coded",
                english.DemographicCleaner.clean_gender, gender_scheme
            )

        # Export to a Coda 2 messages file
        with open(file_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)

        self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_one_scheme.json"))

        # Test importing with no file available
        imported_messages = [td.copy() for td in messages]
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})
        # Deliberately testing the read can be done twice
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

        na_id = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
        nr_id = gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        set_true_missing_codes(imported_messages)

        imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]

        self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

        # Test importing from the test file
        imported_messages = [td.copy() for td in messages]
        with open("tests/traced_data/resources/coda_2_import_test_one_scheme.json", "r") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme}, f)

        set_true_missing_codes(imported_messages)

        imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]

        expected_code_ids = [
            gender_scheme.get_code_with_match_value("female").code_id,  # Manually approved auto-code
            gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # Empty raw message
            gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually assigned code which isn't checked
            gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # No raw message
            gender_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id,  # Manually Not Coded
            gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually un-coded
        ]
        self.assertListEqual(imported_code_ids, expected_code_ids)

        # Add an element with the same raw text as an existing message but a conflicting label
        messages.append(TracedData({
            "gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00",
            "gender_coded": CleaningUtils.make_label_from_cleaner_code(
                gender_scheme, gender_scheme.get_code_with_match_value("male"), "make_location_label",
                date_time_utc="2018-11-03T13:40:50Z").to_dict()
        }, Metadata("test_user", Metadata.get_call_location(), time.time())))
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

        # Exporting the conflicting data must fail with a specific message.
        # assertEqual is used rather than a bare `assert` so the message check still runs under `python -O`.
        with open(file_path, "w") as f:
            try:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
            except AssertionError as e:
                self.assertEqual(
                    str(e),
                    "Messages with the same id "
                    "(cf2e5bff1ef03dcd20d1a0b18ef7d89fc80a3554434165753672f6f40fde1d25) have different "
                    "labels for coded_key 'gender_coded'")
            else:
                self.fail("Exporting data with conflicting labels did not fail")
Exemplo n.º 5
0
    def test_export_import_one_multi_coded_scheme(self):
        """Round-trip messages through the Coda 2 export and multi-coded import functions for one scheme.

        The test:
          1. exports un-coded messages and checks the file against the expected resource file;
          2. imports without a Coda file (twice) and checks that every message ends up with exactly one label;
          3. imports from the test resource file, checking that re-using the already-read file pointer raises
             an AssertionError, and then checks the imported code ids (ignoring order within each message).
        """
        file_path = path.join(self.test_dir, "coda_2_test.json")

        # Build raw input data
        message_dicts = [
            {"msg_raw": "food", "msg_sent_on": "2018-11-01T07:13:04+03:00"},
            {"msg_raw": "", "msg_sent_on": "2018-11-01T07:17:04+03:00"},
            {"msg_raw": "food + water", "msg_sent_on": "2018-11-01T07:19:04+05:00"},
            {},
            {"msg_raw": "water", "msg_sent_on": "2018-11-02T19:00:29+03:00"},
            {"msg_raw": "abcd", "msg_sent_on": "2018-11-02T20:30:45+03:00"}
        ]
        messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                    for i, d in enumerate(message_dicts)]

        # Add message ids
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "msg_raw", "msg_coda_id")

        # Load the message scheme
        with open("tests/traced_data/resources/coda_2_msg_scheme.json") as f:
            msg_scheme = CodeScheme.from_firebase_map(json.load(f))

        def set_true_missing_codes(traced_data_iterable):
            # Labels every message whose raw text is missing or empty with a single TRUE_MISSING label.
            # The label is the same for every message, so it is built once; each message still receives its
            # own dict via to_dict().
            na_label = CleaningUtils.make_label_from_cleaner_code(
                msg_scheme,
                msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            for td in traced_data_iterable:
                if td.get("msg_raw", "") == "":
                    td.append_data({"msg_coded": [na_label.to_dict()]},
                                   Metadata("test_user", Metadata.get_call_location(), time.time()))

        # Export to a Coda 2 messages file
        with open(file_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "msg_raw", "msg_sent_on", "msg_coda_id", {"msg_coded": msg_scheme}, f)

        self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_multi_coded.json"))

        # Test importing with no file available
        imported_messages = [td.copy() for td in messages]
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})
        # Deliberately testing the read can be done twice
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

        na_id = msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
        nr_id = msg_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        set_true_missing_codes(imported_messages)

        for td in imported_messages:
            self.assertEqual(len(td["msg_coded"]), 1)
        imported_code_ids = [td["msg_coded"][0]["CodeID"] for td in imported_messages]
        self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

        # Test importing from the test file
        imported_messages = [td.copy() for td in messages]
        with open("tests/traced_data/resources/coda_2_import_test_multi_coded.json", "r") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)

            # Test that reading the same file-pointer twice without moving it back to the start of the file fails.
            # self.fail() raises an AssertionError itself, so it lives in the `else` branch where the `except`
            # clause below cannot swallow it.
            try:
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                    "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)
            except AssertionError as e:
                self.assertEqual(str(e), "File-pointer not at byte 0. "
                                         "Should you have used e.g. `f.seek(0)` before calling this method?")
            else:
                self.fail("Re-using the same file pointer didn't raise an assertion error")

        set_true_missing_codes(imported_messages)

        imported_code_ids = [[code["CodeID"] for code in td["msg_coded"]] for td in imported_messages]

        expected_code_ids = [
            [msg_scheme.get_code_with_match_value("food").code_id],
            [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
            [msg_scheme.get_code_with_match_value("food").code_id, msg_scheme.get_code_with_match_value("water").code_id],
            [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
            [msg_scheme.get_code_with_match_value("water").code_id],
            [msg_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id]
        ]

        # zip() stops at the shorter input, so make sure no message is silently left unchecked
        self.assertEqual(len(imported_code_ids), len(expected_code_ids))
        # Labels within a message are compared as sets because their order is not checked by this test
        for x, y in zip(imported_code_ids, expected_code_ids):
            self.assertEqual(len(x), len(y))
            self.assertSetEqual(set(x), set(y))
Exemplo n.º 6
0
    def test_export_two_single_coded_schemes(self):
        """Export messages labelled under a district scheme and a zone scheme, and compare with the expected file."""
        def load_scheme(scheme_path):
            # Parses the JSON file at scheme_path and builds a CodeScheme from it.
            with open(scheme_path) as scheme_file:
                return CodeScheme.from_firebase_map(json.load(scheme_file))

        def make_location_label(scheme, value):
            # Control-code values are looked up by control code; anything else is looked up by match value.
            if value not in {Codes.TRUE_MISSING, Codes.SKIPPED, Codes.NOT_CODED}:
                code = scheme.get_code_with_match_value(value)
            else:
                code = scheme.get_code_with_control_code(value)

            label = CleaningUtils.make_label_from_cleaner_code(
                scheme, code, "make_location_label", date_time_utc="2018-11-02T13:40:50Z")
            return label.to_dict()

        file_path = path.join(self.test_dir, "coda_2_test.json")

        # Load schemes
        district_scheme = load_scheme("tests/traced_data/resources/coda_2_district_scheme.json")
        zone_scheme = load_scheme("tests/traced_data/resources/coda_2_zone_scheme.json")

        # Build raw input data
        message_dicts = [
            # Normal, coded data
            {
                "location_raw": "mog",
                "location_sent_on": "2018-11-01T07:13:04+03:00",
                "district": make_location_label(district_scheme, "mogadishu"),
                "zone": make_location_label(zone_scheme, "scz"),
            },

            # Data coded under one scheme only
            {
                "location_raw": "kismayo",
                "location_sent_on": "2018-11-01T07:17:04+03:00",
                "district": make_location_label(district_scheme, "kismayo"),
            },

            # Data coded as missing under both schemes
            {
                "location_raw": "",
                "location_sent_on": "2018-11-01T07:19:04+05:00",
                "district": make_location_label(district_scheme, Codes.TRUE_MISSING),
                "zone": make_location_label(zone_scheme, Codes.TRUE_MISSING),
            },

            # No data
            {},

            # Data coded as NC under both schemes
            {
                "location_raw": "kismyo",
                "location_sent_on": "2018-11-01T07:19:30+03:00",
                "district": make_location_label(district_scheme, Codes.NOT_CODED),
                "zone": make_location_label(zone_scheme, Codes.NOT_CODED),
            },

            # Data coded as NC under one scheme only
            {
                "location_raw": "kismay",
                "location_sent_on": "2018-11-01T07:19:30+03:00",
                "district": make_location_label(district_scheme, "kismayo"),
                "zone": make_location_label(zone_scheme, Codes.NOT_CODED),
            },
        ]
        messages = []
        for index, message_dict in enumerate(message_dicts):
            messages.append(TracedData(message_dict, Metadata("test_user", Metadata.get_call_location(), index)))

        # Add message ids
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "location_raw", "location_coda_id")

        # An OrderedDict fixes the key order (district, then zone), which makes the test easier to write in
        # Py2 and Py3.
        scheme_key_map = collections.OrderedDict([
            ("district", district_scheme),
            ("zone", zone_scheme),
        ])

        # Export to a Coda 2 messages file
        with open(file_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "location_raw", "location_sent_on", "location_coda_id", scheme_key_map, f)

        expected_file_path = "tests/traced_data/resources/coda_2_export_expected_multiple_schemes.json"
        self.assertTrue(filecmp.cmp(file_path, expected_file_path))