def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Auto-code remaining data
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(user, data, plan.raw_field, plan.coded_field,
                                                                plan.cleaner, plan.code_scheme)

    # Output survey answers to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f
            )
    print("Coda demogs files successfully exported")

    return data
def auto_code_surveys(cls, user, data, coda_output_dir):
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, cc.coded_field, cc.cleaner, cc.code_scheme)

    # Output single-scheme answers to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)

    # Note: no need to handle location in any special way on this project because it is not being auto-coded.

    return data
def _build_message_to_s01e02_dict(cls, user, data, coda_input_dir):
    # Duplicate the input list because reading the file requires appending data to the TracedData,
    # and we don't actually want to modify the input at this stage of the pipeline.
    data = [td.copy() for td in data]

    # Apply the week 3 codes from Coda.
    message_id_key = "radio_show_3_message_id"
    coded_ws_key = "radio_show_3_ws"
    TracedDataCodaV2IO.compute_message_ids(user, data, cls.WEEK_3_VALUE_KEY, message_id_key)
    coda_input_path = path.join(coda_input_dir, "s01e03.json")
    with open(coda_input_path) as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            user, data, message_id_key, {coded_ws_key: CodeSchemes.WS_CORRECT_DATASET}, f)

    # Parse the loaded codes into a look-up table of raw message string -> is-ws boolean.
    message_to_ws_dict = dict()
    for td in data:
        label = td.get(coded_ws_key)
        if label is not None:
            message_to_ws_dict[td.get(cls.WEEK_3_VALUE_KEY)] = \
                label["CodeID"] == CodeSchemes.WS_CORRECT_DATASET.get_code_with_match_value("s01e02").code_id

    return message_to_ws_dict
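# A hypothetical sketch of how the look-up table above might be consumed: move week-3
# messages which coders flagged as really belonging to s01e02 into the week-2 field.
# WEEK_2_VALUE_KEY and this helper are assumptions for illustration; they do not appear
# in the snippet above.
@classmethod
def _move_ws_messages_to_s01e02(cls, user, data, coda_input_dir):
    message_to_s01e02 = cls._build_message_to_s01e02_dict(user, data, coda_input_dir)
    for td in data:
        raw_message = td.get(cls.WEEK_3_VALUE_KEY)
        if raw_message is not None and message_to_s01e02.get(raw_message, False):
            # Copy the misfiled message into the week-2 field (assumed key name).
            td.append_data({cls.WEEK_2_VALUE_KEY: raw_message},
                           Metadata(user, Metadata.get_call_location(), time.time()))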
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(
        data, cls.SENT_ON_KEY, PipelineConfiguration.PROJECT_START_DATE, PipelineConfiguration.PROJECT_END_DATE)

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Output RQA and follow-up survey messages to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output RQA and follow-up messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        # Take every run which contains a raw response to this plan's question.
        rqa_and_follow_up_messages = []
        for td in data:
            if plan.raw_field in td:
                rqa_and_follow_up_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_and_follow_up_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def export_coda(cls, user, data, coda_output_dir):
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)
def export_coda(cls, user, data, coda_output_dir):
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        for td in data:
            if plan.raw_field in td:
                td.append_data({plan.id_field: plan.message_id_fn(td)},
                               Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)
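# Unlike the variant above it, this export_coda derives each message id from a per-plan
# message_id_fn rather than TracedDataCodaV2IO.compute_message_ids. A minimal sketch of
# what such a function might look like; the SHA-256-of-raw-text choice is an assumption,
# not something shown in these snippets:
import hashlib

def sha256_message_id_fn(raw_field):
    def message_id_fn(td):
        # Deterministic, so re-running the pipeline produces stable Coda message ids.
        return hashlib.sha256(td[raw_field].encode("utf-8")).hexdigest()
    return message_id_fn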
def auto_code_surveys(cls, user, data, icr_output_dir, coda_output_dir):
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme)

    # Output single-scheme answers to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        survey_messages = []
        for td in data:
            if plan.raw_field in td:
                survey_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            survey_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    """
    Merges manually coded radio show files into the cleaned dataset.
    """
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        test_pipeline_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, test_pipeline_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, test_pipeline_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Merge manually coded survey files into the cleaned dataset
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        f = None
        try:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    return data
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE, cls.PROJECT_END_DATE)

    # Tag messages which are noise as being noise
    for td in data:
        is_noise = True
        for rqa_key in cls.RQA_KEYS:
            if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
                is_noise = False
        td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            # This test works because the only codes which have been applied at this point are TRUE_MISSING.
            # If any other coding is done above, this test will need to change.
            if plan.coded_field not in td:
                rqa_messages.append(td)
            else:
                assert len(td[plan.coded_field]) == 1
                assert td[plan.coded_field][0]["CodeID"] == \
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded data into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)
        for cc in plan.coding_configurations:
            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")

                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
                else:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

    # Label data for which there is no response as TRUE_MISSING.
    # Label data for which the response is the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td:
                for cc in plan.coding_configurations:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = \
                        na_label if cc.coding_mode == CodingModes.SINGLE else [na_label]
            elif td[plan.raw_field] == "":
                for cc in plan.coding_configurations:
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = \
                        nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Mark data that is noise as Codes.NOT_CODED
    for td in data:
        if td.get("noise", False):
            nc_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.coded_field not in td:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()).to_dict()
                        nc_dict[cc.coded_field] = \
                            nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
            td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Run code imputation functions
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.code_imputation_function is not None:
            plan.code_imputation_function(user, data, plan.coding_configurations)

    cls._impute_coding_error_codes(user, data)

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)
        print(coda_input_path)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # At this point, the TracedData objects still contain messages for at most one week each.
    # Label the weeks for which there is no response as TRUE_MISSING.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only
    # labelled if there is an additional reason given. Importing those two schemes separately above caused the
    # labels in each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was*
    # reviewed. This block updates the reasons scheme in cases where only a binary label was set, by assigning
    # the label 'NC' if the binary label was set to a normal code, otherwise the same control code as the binary.
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        if plan.binary_code_scheme is not None:
            for td in rqa_messages:
                binary_label = td[plan.binary_coded_field]
                binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                binary_label_present = binary_label["CodeID"] != \
                    plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id
                reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0]["CodeID"] != \
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                if binary_label_present and not reasons_label_present:
                    if binary_code.code_type == "Control":
                        control_code = binary_code.control_code
                        reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                        reasons_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, reasons_code, Metadata.get_call_location(),
                            origin_name="Pipeline Code Synchronisation")
                        td.append_data(
                            {plan.coded_field: [reasons_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                        )
                    else:
                        assert binary_code.code_type == "Normal"

                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation"
                        )
                        td.append_data(
                            {plan.coded_field: [nc_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                        )

    # Not everyone will have answered all of the demographic flows.
    # Label demographic questions which had no responses as TRUE_MISSING.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    return data
def test_export_import_one_single_coded_scheme(self):
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Build raw input data
    message_dicts = [
        {"gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00"},
        {"gender_raw": "", "gender_sent_on": "2018-11-01T07:17:04+03:00"},
        {"gender_raw": "hiya", "gender_sent_on": "2018-11-01T07:19:04+05:00"},
        {},
        {"gender_raw": "boy", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
        {"gender_raw": "man", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
    ]
    messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                for i, d in enumerate(message_dicts)]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

    # Load gender scheme
    with open("tests/traced_data/resources/coda_2_gender_scheme.json") as f:
        gender_scheme = CodeScheme.from_firebase_map(json.load(f))

    # Apply the English gender cleaner
    with mock.patch("core_data_modules.util.TimeUtils.utc_now_as_iso_string") as time_mock, \
            mock.patch("core_data_modules.traced_data.Metadata.get_function_location") as location_mock:
        time_mock.return_value = "2018-11-02T15:00:07+00:00"
        location_mock.return_value = "english.DemographicCleaner.clean_gender"

        CleaningUtils.apply_cleaner_to_traced_data_iterable(
            "test_user", messages, "gender_raw", "gender_coded",
            english.DemographicCleaner.clean_gender, gender_scheme
        )

    # Export to a Coda 2 messages file
    with open(file_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_one_scheme.json"))

    # Test importing with no file available
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
        "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

    # Deliberately testing the read can be done twice
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
        "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

    na_id = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
    nr_id = gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("gender_raw", "") == "":
            td.append_data({"gender_coded": na_label.to_dict()},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]
    self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

    # Test importing from the test file
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    with open("tests/traced_data/resources/coda_2_import_test_one_scheme.json", "r") as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme}, f)

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("gender_raw", "") == "":
            td.append_data({"gender_coded": na_label.to_dict()},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]
    expected_code_ids = [
        gender_scheme.get_code_with_match_value("female").code_id,             # Manually approved auto-code
        gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # Empty raw message
        gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually assigned code which isn't checked
        gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # No raw message
        gender_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id,     # Manually Not Coded
        gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually un-coded
    ]
    self.assertListEqual(imported_code_ids, expected_code_ids)

    # Add an element with the same raw text but a conflicting label
    messages.append(TracedData({
        "gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00",
        "gender_coded": CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_match_value("male"),
            "make_location_label", date_time_utc="2018-11-03T13:40:50Z").to_dict()
    }, Metadata("test_user", Metadata.get_call_location(), time.time())))

    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

    with open(file_path, "w") as f:
        try:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
        except AssertionError as e:
            assert str(e) == "Messages with the same id " \
                             "(cf2e5bff1ef03dcd20d1a0b18ef7d89fc80a3554434165753672f6f40fde1d25) have different " \
                             "labels for coded_key 'gender_coded'"
            return
        self.fail("Exporting data with conflicting labels did not fail")
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Mark data that is noise as Codes.NOT_CODED
    for td in data:
        if td["noise"]:
            nc_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.coded_field not in td:
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location()
                    )
                    nc_dict[plan.coded_field] = [nc_label.to_dict()]

                    if plan.binary_code_scheme is not None:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme,
                            plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()
                        )
                        nc_dict[plan.binary_coded_field] = nc_label.to_dict()
            td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Merge manually coded survey files into the cleaned dataset
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        f = None
        try:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Set district/region/state/zone codes from the coded district field.
    for td in data:
        # Up to 1 location code should have been assigned in Coda. Search for that code, ensuring that only 1
        # has been assigned or, if multiple have been assigned, that they are non-conflicting control codes.
        location_code = None
        for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
            coda_code = plan.code_scheme.get_code_with_id(td[plan.coded_field]["CodeID"])
            if location_code is not None:
                if not (coda_code.code_id == location_code.code_id or
                        coda_code.control_code == Codes.NOT_REVIEWED):
                    location_code = Code(None, "Control", None, None, None, None,
                                         control_code=Codes.NOT_INTERNALLY_CONSISTENT)
            elif coda_code.control_code != Codes.NOT_REVIEWED:
                location_code = coda_code

        # If no code was found, then this location is still not reviewed.
        # Synthesise a NOT_REVIEWED code accordingly.
        if location_code is None:
            location_code = Code(None, "Control", None, None, None, None, control_code=Codes.NOT_REVIEWED)

        # If a control code was found, set all other location keys to that control code,
        # otherwise convert the provided location to the other locations in the hierarchy.
        if location_code.code_type == "Control":
            for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
                td.append_data({
                    plan.coded_field: CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(location_code.control_code),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), time.time()))
        else:
            location = location_code.match_values[0]
            td.append_data({
                "mogadishu_sub_district_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    cls.make_location_code(CodeSchemes.MOGADISHU_SUB_DISTRICT,
                                           SomaliaLocations.mogadishu_sub_district_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "district_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.DISTRICT,
                    cls.make_location_code(CodeSchemes.DISTRICT,
                                           SomaliaLocations.district_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "region_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.REGION,
                    cls.make_location_code(CodeSchemes.REGION,
                                           SomaliaLocations.region_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "state_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.STATE,
                    cls.make_location_code(CodeSchemes.STATE,
                                           SomaliaLocations.state_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "zone_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.ZONE,
                    cls.make_location_code(CodeSchemes.ZONE,
                                           SomaliaLocations.zone_for_location_code(location)),
                    Metadata.get_call_location()).to_dict()
            }, Metadata(user, Metadata.get_call_location(), time.time()))

    return data
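# make_location_code is called above but not shown in these snippets. A minimal sketch,
# assuming the SomaliaLocations helpers return either a control code constant (e.g.
# Codes.NOT_CODED) or a match value that exists in the given scheme, mirroring the
# make_location_label helper in the tests further down:
@classmethod
def make_location_code(cls, scheme, clean_value):
    if clean_value in {Codes.NOT_CODED, Codes.TRUE_MISSING, Codes.SKIPPED}:
        return scheme.get_code_with_control_code(clean_value)
    return scheme.get_code_with_match_value(clean_value)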
def auto_code_show_messages(cls, user, data, pipeline_configuration, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF.
    if pipeline_configuration.filter_test_messages:
        data = MessageFilters.filter_test_messages(data)
    else:
        log.debug("Not filtering out test messages (because the pipeline configuration json key "
                  "'FilterTestMessages' was set to false)")

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(
        data, [plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS])

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(
        data, cls.SENT_ON_KEY, pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

    # Skipping auto-assigning noise, as an experiment on this project.
    # If it turns out we need this, uncomment this block.
    # for td in data:
    #     is_noise = True
    #     for rqa_key in cls.RQA_KEYS:
    #         if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
    #             is_noise = False
    #     td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

    # TODO: Label each message with channel keys
    # Channels.set_channel_keys(user, data, cls.SENT_ON_KEY,
    #                           pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Compute the number of RQA messages that were the empty string
    log.debug("Counting the number of empty string messages for each raw radio show field...")
    raw_rqa_fields = []
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        if plan.raw_field not in raw_rqa_fields:
            raw_rqa_fields.append(plan.raw_field)
    cls.log_empty_string_stats(data, raw_rqa_fields)

    # Compute the number of survey messages that were the empty string
    log.debug("Counting the number of empty string messages for each survey field...")
    raw_survey_fields = []
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.raw_field not in raw_survey_fields:
            raw_survey_fields.append(plan.raw_field)
    survey_data = dict()
    for td in data:
        survey_data[td["uid"]] = td
    cls.log_empty_string_stats(survey_data.values(), raw_survey_fields)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            if plan.raw_field in td:
                rqa_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
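# log_empty_string_stats is called above but not defined in these snippets. A minimal
# sketch, assuming it only logs one count per raw field:
@classmethod
def log_empty_string_stats(cls, data, raw_fields):
    for raw_field in raw_fields:
        # Count the objects whose raw value is present but the empty string.
        empty_string_count = sum(1 for td in data if td.get(raw_field) == "")
        log.debug(f"{raw_field}: {empty_string_count} empty string message(s)")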
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Label the RQAs for which there is no response yet as TRUE_MISSING,
    # and those where the response was the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
            elif td[plan.raw_field] == "":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [nc_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = nc_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only
    # labelled if there is an additional reason given. Importing those two schemes separately above caused the
    # labels in each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was*
    # reviewed. This block updates the reasons scheme in cases where only a binary label was set, by assigning
    # the label 'NC' if the binary label was set to a normal code, otherwise the same control code as the binary.
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        if plan.binary_code_scheme is not None:
            for td in rqa_messages:
                binary_label = td[plan.binary_coded_field]
                binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                binary_label_present = binary_label["CodeID"] != \
                    plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id
                reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0]["CodeID"] != \
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                if binary_label_present and not reasons_label_present:
                    if binary_code.code_type == "Control":
                        control_code = binary_code.control_code
                        reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                        reasons_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, reasons_code, Metadata.get_call_location(),
                            origin_name="Pipeline Code Synchronisation")
                        td.append_data(
                            {plan.coded_field: [reasons_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
                    else:
                        assert binary_code.code_type == "Normal"

                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")
                        td.append_data(
                            {plan.coded_field: [nc_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Merge manually coded demog and follow-up survey files into the cleaned dataset.
    # The recursion depth currently exceeds the default limit while importing these files.
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(10000)
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        f = None
        try:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Not everyone will have answered all of the demographic and follow-up survey flows.
    # Label demographic and follow-up survey questions which had no responses as TRUE_MISSING.
    # Label data which is just the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = na_label.to_dict()
            elif td[plan.raw_field] == "":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = nc_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Set county/constituency codes from the coded constituency field.
    cls._impute_location_codes(user, data)

    # Set coding error codes using the coding error field.
    cls._impute_coding_error_codes(user, data)

    return data
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.TEST_KEYS)

    # Filter out runs sent outside the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE, cls.PROJECT_END_DATE)

    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Output messages for Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f
            )
    print("Coda message files successfully exported")

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        test_pipeline_messages = []
        for td in data:
            if plan.coded_field not in td:
                test_pipeline_messages.append(td)
            else:
                assert len(td[plan.coded_field]) == 1
                assert td[plan.coded_field][0]["CodeID"] == \
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

        icr_messages = ICRTools.generate_sample_for_icr(
            test_pipeline_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field]
            )
    print("ICR files successfully exported")

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded data into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        coda_input_path = path.join(coda_input_dir, plan.coda_filename)
        for cc in plan.coding_configurations:
            if not cc.requires_manual_verification:
                continue

            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")

                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
                else:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        if PipelineConfiguration.WS_CORRECT_DATASET_SCHEME is not None:
            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, plan.id_field,
                    {f"{plan.raw_field}_correct_dataset": PipelineConfiguration.WS_CORRECT_DATASET_SCHEME}, f)
            finally:
                if f is not None:
                    f.close()

    # Label data for which there is no response as TRUE_MISSING.
    # Label data for which the response is the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                if raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = \
                        na_label if cc.coding_mode == CodingModes.SINGLE else [na_label]

            for cc in plan.coding_configurations:
                raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                if td.get(raw_field) == "":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = \
                        nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Mark data that is noise as Codes.NOT_CODED
    for td in data:
        if td.get("noise", False):
            nc_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.coded_field not in td:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()).to_dict()
                        nc_dict[cc.coded_field] = \
                            nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
            td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Run the cleaners that don't require manual verification again, this time setting "checked" to True
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None and not cc.requires_manual_verification:
                raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, raw_field, cc.coded_field, cc.cleaner, cc.code_scheme, set_checked=True)

    # Run code imputation functions
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.code_imputation_function is not None:
            plan.code_imputation_function(user, data, plan.coding_configurations)

    cls._impute_coding_error_codes(user, data)

    return data
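# _impute_coding_error_codes is called by several apply_manual_codes variants but is not
# shown in these snippets. A speculative sketch only: it assumes the trigger is a
# CODING_ERROR label in the "_correct_dataset" field (as set by move_wrong_scheme_messages
# below) and that every coding configuration for that plan is then overwritten:
@classmethod
def _impute_coding_error_codes(cls, user, data):
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            ws_label = td.get(f"{plan.raw_field}_correct_dataset")
            if ws_label is None:
                continue

            ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(ws_label["CodeID"])
            if ws_code.control_code != Codes.CODING_ERROR:
                continue

            # Overwrite each of this plan's coded fields with a CODING_ERROR label.
            coding_error_dict = dict()
            for cc in plan.coding_configurations:
                ce_label = CleaningUtils.make_label_from_cleaner_code(
                    cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                    Metadata.get_call_location()).to_dict()
                coding_error_dict[cc.coded_field] = \
                    ce_label if cc.coding_mode == CodingModes.SINGLE else [ce_label]
            td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))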
def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Auto-code remaining data
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme)

    # For any locations where the cleaners assigned a code to a sub-district, set the district code to NC
    # (this is because only one column should have a value set in Coda)
    for td in data:
        if "mogadishu_sub_district_coded" in td:
            mogadishu_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
            if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(mogadishu_code_id).code_type == "Normal":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                )
                td.append_data({"district_coded": nc_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(), time.time()))

    # Set operator from phone number
    for td in data:
        operator_clean = PhoneCleaner.clean_operator(phone_uuid_table.get_phone(td["uid"]))
        if operator_clean == Codes.NOT_CODED:
            label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.OPERATOR, CodeSchemes.OPERATOR.get_code_with_control_code(Codes.NOT_CODED),
                Metadata.get_call_location())
        else:
            label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.OPERATOR, CodeSchemes.OPERATOR.get_code_with_match_value(operator_clean),
                Metadata.get_call_location())
        td.append_data({"operator_coded": label.to_dict()},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Output single-scheme answers to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.raw_field == "mogadishu_sub_district_raw":
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f)

    # Output the location scheme to Coda for manual verification + coding
    output_path = path.join(coda_output_dir, "location.json")
    TracedDataCodaV2IO.compute_message_ids(user, data, "mogadishu_sub_district_raw",
                                           "mogadishu_sub_district_raw_id")
    with open(output_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            data, "mogadishu_sub_district_raw", "mogadishu_sub_district_time", "mogadishu_sub_district_raw_id",
            {
                "mogadishu_sub_district_coded": CodeSchemes.MOGADISHU_SUB_DISTRICT,
                "district_coded": CodeSchemes.DISTRICT,
                "region_coded": CodeSchemes.REGION,
                "state_coded": CodeSchemes.STATE,
                "zone_coded": CodeSchemes.ZONE
            }, f)

    return data
def test_export_two_single_coded_schemes(self):
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Load schemes
    with open("tests/traced_data/resources/coda_2_district_scheme.json") as f:
        district_scheme = CodeScheme.from_firebase_map(json.load(f))
    with open("tests/traced_data/resources/coda_2_zone_scheme.json") as f:
        zone_scheme = CodeScheme.from_firebase_map(json.load(f))

    def make_location_label(scheme, value):
        if value in {Codes.TRUE_MISSING, Codes.SKIPPED, Codes.NOT_CODED}:
            code = scheme.get_code_with_control_code(value)
        else:
            code = scheme.get_code_with_match_value(value)
        return CleaningUtils.make_label_from_cleaner_code(
            scheme, code, "make_location_label", date_time_utc="2018-11-02T13:40:50Z").to_dict()

    # Build raw input data
    message_dicts = [
        # Normal, coded data
        {"location_raw": "mog", "location_sent_on": "2018-11-01T07:13:04+03:00",
         "district": make_location_label(district_scheme, "mogadishu"),
         "zone": make_location_label(zone_scheme, "scz")},
        # Data coded under one scheme only
        {"location_raw": "kismayo", "location_sent_on": "2018-11-01T07:17:04+03:00",
         "district": make_location_label(district_scheme, "kismayo")},
        # Data coded as missing under both schemes
        {"location_raw": "", "location_sent_on": "2018-11-01T07:19:04+05:00",
         "district": make_location_label(district_scheme, Codes.TRUE_MISSING),
         "zone": make_location_label(zone_scheme, Codes.TRUE_MISSING)},
        # No data
        {},
        # Data coded as NC under both schemes
        {"location_raw": "kismyo", "location_sent_on": "2018-11-01T07:19:30+03:00",
         "district": make_location_label(district_scheme, Codes.NOT_CODED),
         "zone": make_location_label(zone_scheme, Codes.NOT_CODED)},
        # Data coded as NC under one scheme only
        {"location_raw": "kismay", "location_sent_on": "2018-11-01T07:19:30+03:00",
         "district": make_location_label(district_scheme, "kismayo"),
         "zone": make_location_label(zone_scheme, Codes.NOT_CODED)},
    ]

    messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                for i, d in enumerate(message_dicts)]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "location_raw", "location_coda_id")

    # Export to a Coda 2 messages file
    with open(file_path, "w") as f:
        scheme_key_map = collections.OrderedDict()  # Using OrderedDict to make tests easier to write in Py2 and Py3.
        scheme_key_map["district"] = district_scheme
        scheme_key_map["zone"] = zone_scheme
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "location_raw", "location_sent_on", "location_coda_id", scheme_key_map, f)

    self.assertTrue(
        filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_multiple_schemes.json"))
def auto_code_surveys(cls, user, data, pipeline_configuration, coda_output_dir):
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, cc.coded_field, cc.cleaner, cc.code_scheme)

    # Remove survey data sent after the project finished
    log.info("Hiding survey messages sent after the end of the project. These will not be exported in "
             "production/analysis files")
    out_of_range_count = 0
    for td in data:
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            # TODO: Come up with a better solution here e.g. separate DEMOG/SURVEY lists
            if plan.raw_field in ["have_voice_raw", "suggestions_raw"]:
                continue
            if plan.time_field in td and isoparse(td[plan.time_field]) > pipeline_configuration.project_end_date:
                out_of_range_count += 1
                td.hide_keys({plan.raw_field, plan.time_field},
                             Metadata(user, Metadata.get_call_location(), time.time()))
    log.info(f"Hid {out_of_range_count} survey messages sent after the end of the project")

    # For any locations where the cleaners assigned a code to a sub-district, set the district code to NC
    # (this is because only one column should have a value set in Coda)
    for td in data:
        if "mogadishu_sub_district_coded" in td:
            mogadishu_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
            if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(mogadishu_code_id).code_type == "Normal":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                )
                td.append_data({"district_coded": nc_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(), time.time()))

    # Output survey responses to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)

    return data
def test_export_import_one_multi_coded_scheme(self):
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Build raw input data
    message_dicts = [
        {"msg_raw": "food", "msg_sent_on": "2018-11-01T07:13:04+03:00"},
        {"msg_raw": "", "msg_sent_on": "2018-11-01T07:17:04+03:00"},
        {"msg_raw": "food + water", "msg_sent_on": "2018-11-01T07:19:04+05:00"},
        {},
        {"msg_raw": "water", "msg_sent_on": "2018-11-02T19:00:29+03:00"},
        {"msg_raw": "abcd", "msg_sent_on": "2018-11-02T20:30:45+03:00"}
    ]
    messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                for i, d in enumerate(message_dicts)]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "msg_raw", "msg_coda_id")

    # Load the message code scheme
    with open("tests/traced_data/resources/coda_2_msg_scheme.json") as f:
        msg_scheme = CodeScheme.from_firebase_map(json.load(f))

    # Export to a Coda 2 messages file
    with open(file_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "msg_raw", "msg_sent_on", "msg_coda_id", {"msg_coded": msg_scheme}, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_multi_coded.json"))

    # Test importing with no file available
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

    # Deliberately testing the read can be done twice
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

    na_id = msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
    nr_id = msg_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            msg_scheme, msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("msg_raw", "") == "":
            td.append_data({"msg_coded": [na_label.to_dict()]},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    for td in imported_messages:
        self.assertEqual(len(td["msg_coded"]), 1)
    imported_code_ids = [td["msg_coded"][0]["CodeID"] for td in imported_messages]
    self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

    # Test importing from the test file
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    with open("tests/traced_data/resources/coda_2_import_test_multi_coded.json", "r") as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)

        # Test that reading the same file-pointer twice without moving it back to the start of the file fails
        try:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)
            self.fail("Re-using the same file pointer didn't raise an assertion error")
        except AssertionError as e:
            self.assertEqual(str(e),
                             "File-pointer not at byte 0. "
                             "Should you have used e.g. `f.seek(0)` before calling this method?")

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            msg_scheme, msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("msg_raw", "") == "":
            td.append_data({"msg_coded": [na_label.to_dict()]},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = []
    for td in imported_messages:
        imported_code_ids.append([code["CodeID"] for code in td["msg_coded"]])

    expected_code_ids = [
        [msg_scheme.get_code_with_match_value("food").code_id],
        [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
        [msg_scheme.get_code_with_match_value("food").code_id, msg_scheme.get_code_with_match_value("water").code_id],
        [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
        [msg_scheme.get_code_with_match_value("water").code_id],
        [msg_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id]
    ]
    for x, y in zip(imported_code_ids, expected_code_ids):
        self.assertEqual(len(x), len(y))
        self.assertSetEqual(set(x), set(y))
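# Illustrative sketch (not part of the library): the re-read behaviour exercised at the end of
# this test hinges on the importer refusing to parse from a file handle that has already been
# advanced. A hypothetical guard with the same failure mode might look like this; the real
# TracedDataCodaV2IO implementation may differ.
import json


def read_coda_messages(f):
    # Refuse to parse unless the caller has rewound the handle, so that a second import from
    # the same file fails loudly instead of silently reading zero messages from EOF.
    assert f.tell() == 0, "File-pointer not at byte 0. " \
                          "Should you have used e.g. `f.seek(0)` before calling this method?"
    return json.load(f)


# Callers that need to import twice from the same handle must rewind in between:
#     with open("messages.json") as f:
#         first = read_coda_messages(f)
#         f.seek(0)
#         second = read_coda_messages(f)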
def move_wrong_scheme_messages(user, data, coda_input_dir):
    log.info("Importing manually coded Coda files to '_WS' fields...")
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, f"{plan.id_field}_WS")

        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, f"{plan.id_field}_WS",
                {f"{plan.raw_field}_WS_correct_dataset": PipelineConfiguration.WS_CORRECT_DATASET_SCHEME}, f)

        for cc in plan.coding_configurations:
            with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, f"{plan.id_field}_WS", {f"{cc.coded_field}_WS": cc.code_scheme}, f)
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, f"{plan.id_field}_WS", {f"{cc.coded_field}_WS": cc.code_scheme}, f)

    log.info("Checking for WS Coding Errors...")
    # Check for coding errors
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            rqa_codes = []
            for cc in plan.coding_configurations:
                if cc.coding_mode == CodingModes.SINGLE:
                    if f"{cc.coded_field}_WS" in td:
                        label = td[f"{cc.coded_field}_WS"]
                        rqa_codes.append(cc.code_scheme.get_code_with_code_id(label["CodeID"]))
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    for label in td.get(f"{cc.coded_field}_WS", []):
                        rqa_codes.append(cc.code_scheme.get_code_with_code_id(label["CodeID"]))

            has_ws_code_in_code_scheme = False
            for code in rqa_codes:
                if code.control_code == Codes.WRONG_SCHEME:
                    has_ws_code_in_code_scheme = True

            has_ws_code_in_ws_scheme = False
            if f"{plan.raw_field}_WS_correct_dataset" in td:
                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                has_ws_code_in_ws_scheme = ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED

            if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                log.warning(f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}")
                coding_error_dict = {
                    f"{plan.raw_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME,
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_control_code(
                                Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()
                }
                td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Construct a map from WS normal code id to the raw field that code indicates a requested move to.
    ws_code_to_raw_field_map = dict()
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.ws_code is not None:
            ws_code_to_raw_field_map[plan.ws_code.code_id] = plan.raw_field

    # Group the TracedData by uid.
    data_grouped_by_uid = dict()
    for td in data:
        uid = td["uid"]
        if uid not in data_grouped_by_uid:
            data_grouped_by_uid[uid] = []
        data_grouped_by_uid[uid].append(td)

    # Perform the WS correction for each uid.
    log.info("Performing WS correction...")
    corrected_data = []  # List of TracedData with the WS data moved.
    # 'WS - Correct Dataset' codes with no matching code id in any coding plan for this project,
    # with a count of the occurrences.
    unknown_target_code_counts = dict()
    for group in data_grouped_by_uid.values():
        # Find all the survey data being moved.
        # (Note: we only need to check one td in this group because all the demographics are the same)
        td = group[0]
        survey_moves = dict()  # of source_field -> target_field
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td or plan.coda_filename is None:
                continue

            ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
            if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                if ws_code.code_id in ws_code_to_raw_field_map:
                    survey_moves[plan.raw_field] = ws_code_to_raw_field_map[ws_code.code_id]
                else:
                    if (ws_code.code_id, ws_code.display_text) not in unknown_target_code_counts:
                        unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] = 0
                    unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] += 1
                    survey_moves[plan.raw_field] = None

        # Find all the RQA data being moved.
        rqa_moves = dict()  # of (index in group, source_field) -> target_field
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td or plan.coda_filename is None:
                    continue

                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                    if ws_code.code_id in ws_code_to_raw_field_map:
                        rqa_moves[(i, plan.raw_field)] = ws_code_to_raw_field_map[ws_code.code_id]
                    else:
                        if (ws_code.code_id, ws_code.display_text) not in unknown_target_code_counts:
                            unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] = 0
                        unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] += 1
                        rqa_moves[(i, plan.raw_field)] = None

        # Build a dictionary of the survey fields that haven't been moved, and cleared fields for those which have.
        survey_updates = dict()  # of raw_field -> updated value
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.coda_filename is None:
                continue

            if plan.raw_field in survey_moves.keys():
                # Data is moving
                survey_updates[plan.raw_field] = []
            elif plan.raw_field in td:
                # Data is not moving
                survey_updates[plan.raw_field] = [
                    _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)]

        # Build a list of the RQA fields that haven't been moved.
        rqa_updates = []  # of (raw_field, _WSUpdate)
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.coda_filename is None:
                    continue

                if plan.raw_field in td:
                    if (i, plan.raw_field) in rqa_moves.keys():
                        # Data is moving
                        pass
                    else:
                        # Data is not moving
                        rqa_updates.append(
                            (plan.raw_field,
                             _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)))

        # Add data moving from survey fields to the relevant survey_/rqa_updates
        raw_survey_fields = {plan.raw_field for plan in PipelineConfiguration.SURVEY_CODING_PLANS}
        raw_rqa_fields = {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in survey_moves:
                continue

            target_field = survey_moves[plan.raw_field]
            if target_field is None:
                continue

            update = _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)
            if target_field in raw_survey_fields:
                survey_updates[target_field] = survey_updates.get(target_field, []) + [update]
            else:
                assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                rqa_updates.append((target_field, update))

        # Add data moving from RQA fields to the relevant survey_/rqa_updates
        for (i, source_field), target_field in rqa_moves.items():
            if target_field is None:
                continue

            for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field == source_field:
                    _td = group[i]
                    # Note: the source TracedData must be the group member the message came from (_td),
                    # not group[0], so that the corrected message is re-attached to the right run.
                    update = _WSUpdate(_td[plan.raw_field], _td[plan.time_field], plan.raw_field, _td)
                    if target_field in raw_survey_fields:
                        survey_updates[target_field] = survey_updates.get(target_field, []) + [update]
                    else:
                        assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                        rqa_updates.append((target_field, update))

        # Re-format the survey updates to a form suitable for use by the rest of the pipeline
        flattened_survey_updates = {}
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field in survey_updates:
                plan_updates = survey_updates[plan.raw_field]

                if len(plan_updates) > 0:
                    flattened_survey_updates[plan.raw_field] = "; ".join([u.message for u in plan_updates])
                    flattened_survey_updates[plan.time_field] = sorted([u.timestamp for u in plan_updates])[0]
                    flattened_survey_updates[f"{plan.raw_field}_source"] = "; ".join(
                        [u.source_field for u in plan_updates])
                else:
                    flattened_survey_updates[plan.raw_field] = None
                    flattened_survey_updates[plan.time_field] = None
                    flattened_survey_updates[f"{plan.raw_field}_source"] = None

        # For each RQA message, create a copy of its source td, append the updated survey data, and add this to
        # the list of TracedData to be returned
        raw_field_to_rqa_plan_map = {plan.raw_field: plan for plan in PipelineConfiguration.RQA_CODING_PLANS}
        for target_field, update in rqa_updates:
            corrected_td = update.source_td.copy()

            # Hide the survey keys currently in the TracedData which have had data moved away.
            corrected_td.hide_keys(
                {k for k, v in flattened_survey_updates.items() if v is None}.intersection(corrected_td.keys()),
                Metadata(user, Metadata.get_call_location(), time.time()))

            # Update with the corrected survey data
            corrected_td.append_data(
                {k: v for k, v in flattened_survey_updates.items() if v is not None},
                Metadata(user, Metadata.get_call_location(), time.time()))

            # Hide all the RQA fields (they will be added back, in turn, in the next step).
            corrected_td.hide_keys(
                {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(corrected_td.keys()),
                Metadata(user, Metadata.get_call_location(), time.time()))
            corrected_td.hide_keys(
                {plan.time_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(corrected_td.keys()),
                Metadata(user, Metadata.get_call_location(), time.time()))

            target_coding_plan = raw_field_to_rqa_plan_map[target_field]
            rqa_dict = {
                target_field: update.message,
                target_coding_plan.time_field: update.timestamp,
                f"{target_field}_source": update.source_field
            }
            corrected_td.append_data(rqa_dict, Metadata(user, Metadata.get_call_location(), time.time()))

            corrected_data.append(corrected_td)

    if len(unknown_target_code_counts) > 0:
        log.warning("Found the following 'WS - Correct Dataset' CodeIDs with no matching coding plan:")
        for (code_id, display_text), count in unknown_target_code_counts.items():
            log.warning(f"  '{code_id}' (DisplayText '{display_text}') ({count} occurrences)")

    return corrected_data
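# Illustrative sketch (not part of the pipeline): move_wrong_scheme_messages above constructs
# _WSUpdate objects and later reads .message, .timestamp, .source_field and .source_td from them.
# A plausible reconstruction of that helper, inferred purely from those usages, is below; the
# project's actual definition may differ.
from dataclasses import dataclass


@dataclass
class ExampleWSUpdate:
    message: str        # the raw text being kept or moved
    timestamp: str      # when the message was sent (ISO 8601 string)
    source_field: str   # the raw field the message originally arrived under
    source_td: object   # the TracedData the message came from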
def move_wrong_scheme_messages(user, data, coda_input_dir):
    log.info("Importing manually coded Coda files to '_WS' coded fields...")
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.DEMOGS_CODING_PLANS + \
            PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field + "_WS")

        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field + "_WS",
                {f"{plan.coded_field}_WS_correct_dataset": CodeSchemes.WS_CORRECT_DATASET}, f)

    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, data, plan.id_field + "_WS", {f"{plan.coded_field}_WS": plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, plan.id_field + "_WS",
                    {f"{plan.binary_coded_field}_WS": plan.binary_code_scheme}, f)

    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field + "_WS", {f"{plan.coded_field}_WS": plan.code_scheme}, f)

    log.info("Checking for WS Coding Errors...")
    # Check for coding errors in the RQA datasets.
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_codes = []
            for label in td.get(f"{plan.coded_field}_WS", []):
                rqa_codes.append(plan.code_scheme.get_code_with_id(label["CodeID"]))
            if plan.binary_code_scheme is not None and f"{plan.binary_coded_field}_WS" in td:
                label = td[f"{plan.binary_coded_field}_WS"]
                rqa_codes.append(plan.binary_code_scheme.get_code_with_id(label["CodeID"]))

            has_ws_code_in_code_scheme = False
            for code in rqa_codes:
                if code.control_code == Codes.WRONG_SCHEME:
                    has_ws_code_in_code_scheme = True

            has_ws_code_in_ws_scheme = False
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                has_ws_code_in_ws_scheme = CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                    td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"]).code_type == "Normal"

            if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                log.debug(f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}")
                coding_error_dict = {
                    f"{plan.coded_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            CodeSchemes.WS_CORRECT_DATASET,
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()
                }
                td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Check for coding errors in the demogs and follow-up surveys datasets, except location,
    # as this is handled differently below.
    for td in data:
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field == "location_raw":
                continue

            has_ws_code_in_code_scheme = False
            if f"{plan.coded_field}_WS" in td:
                has_ws_code_in_code_scheme = plan.code_scheme.get_code_with_id(
                    td[f"{plan.coded_field}_WS"]["CodeID"]).control_code == Codes.WRONG_SCHEME

            has_ws_code_in_ws_scheme = False
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                has_ws_code_in_ws_scheme = CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                    td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"]).code_type == "Normal"

            if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                log.debug(f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}")
                coding_error_dict = {
                    f"{plan.coded_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            CodeSchemes.WS_CORRECT_DATASET,
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()
                }
                td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Check for coding errors in the locations dataset.
    for td in data:
        location_codes = []
        for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
            if f"{plan.coded_field}_WS" in td:
                label = td[f"{plan.coded_field}_WS"]
                location_codes.append(plan.code_scheme.get_code_with_id(label["CodeID"]))

        has_ws_code_in_code_scheme = False
        for code in location_codes:
            if code.control_code == Codes.WRONG_SCHEME:
                has_ws_code_in_code_scheme = True

        has_ws_code_in_ws_scheme = False
        for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                if CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                        td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"]).code_type == "Normal":
                    has_ws_code_in_ws_scheme = True

        if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
            log.debug(f"Coding Error: location_raw: {td['location_raw']}")
            coding_error_dict = dict()
            for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
                coding_error_dict[f"{plan.coded_field}_WS_correct_dataset"] = \
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.WS_CORRECT_DATASET,
                        CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR),
                        Metadata.get_call_location()
                    ).to_dict()
            td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Construct a map from WS normal code id to the raw field that code indicates a requested move to.
    ws_code_to_raw_field_map = dict()
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.DEMOGS_CODING_PLANS + \
            PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        if plan.ws_code is not None:
            ws_code_to_raw_field_map[plan.ws_code.code_id] = plan.raw_field

    # Group the TracedData by uid.
    data_grouped_by_uid = dict()
    for td in data:
        uid = td["uid"]
        if uid not in data_grouped_by_uid:
            data_grouped_by_uid[uid] = []
        data_grouped_by_uid[uid].append(td)

    # Perform the WS correction for each uid.
    log.info("Performing WS correction...")
    corrected_data = []  # List of TracedData with the WS data moved.
    for group in data_grouped_by_uid.values():
        log.debug(f"\n\nStarting re-map for {group[0]['uid']}...")
        for i, td in enumerate(group):
            log.debug(f"--------------td[{i}]--------------")
            for _plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.DEMOGS_CODING_PLANS + \
                    PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                log.debug(f"{_plan.raw_field}: {td.get(_plan.raw_field)}")
                log.debug(f"{_plan.time_field}: {td.get(_plan.time_field)}")

        # Find all the demogs and follow-up survey data being moved.
        # (Note: we only need to check one td in this group because all the demographics and follow-up
        # surveys are the same)
        td = group[0]
        demogs_and_follow_up_survey_moves = dict()  # of source_field -> target_field
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field not in td:
                continue

            ws_code = CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"])
            if ws_code.code_type == "Normal":
                log.debug(f"Detected redirect from {plan.raw_field} -> "
                          f"{ws_code_to_raw_field_map.get(ws_code.code_id, ws_code.code_id)} "
                          f"for message {td[plan.raw_field]}")
                demogs_and_follow_up_survey_moves[plan.raw_field] = ws_code_to_raw_field_map[ws_code.code_id]

        # Find all the RQA data being moved.
        rqa_moves = dict()  # of (index in group, source_field) -> target_field
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td:
                    continue

                ws_code = CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                    td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"])
                if ws_code.code_type == "Normal":
                    log.debug(f"Detected redirect from ({i}, {plan.raw_field}) -> "
                              f"{ws_code_to_raw_field_map.get(ws_code.code_id, ws_code.code_id)} "
                              f"for message {td[plan.raw_field]}")
                    rqa_moves[(i, plan.raw_field)] = ws_code_to_raw_field_map[ws_code.code_id]

        # Build a dictionary of the demogs and follow-up survey fields that haven't been moved,
        # and clear fields for those which have.
        demogs_and_follow_up_survey_updates = dict()  # of raw_field -> updated value
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field in demogs_and_follow_up_survey_moves.keys():
                # Data is moving
                demogs_and_follow_up_survey_updates[plan.raw_field] = []
            elif plan.raw_field in td:
                # Data is not moving
                demogs_and_follow_up_survey_updates[plan.raw_field] = [
                    _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field)]

        # Build a list of the RQA fields that haven't been moved.
        rqa_updates = []  # of (raw_field, _WSUpdate)
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field in td:
                    if (i, plan.raw_field) in rqa_moves.keys():
                        # Data is moving
                        pass
                    else:
                        # Data is not moving
                        rqa_updates.append(
                            (plan.raw_field, _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field)))

        raw_demog_and_follow_up_survey_fields = \
            {plan.raw_field for plan in
             PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS}
        raw_rqa_fields = {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}

        # Add data moving from demogs and follow-up survey fields to the relevant demog/follow_up/rqa_updates
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS + \
                PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field not in demogs_and_follow_up_survey_moves:
                continue

            target_field = demogs_and_follow_up_survey_moves[plan.raw_field]
            update = _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field)
            if target_field in raw_demog_and_follow_up_survey_fields:
                demogs_and_follow_up_survey_updates[target_field] = \
                    demogs_and_follow_up_survey_updates.get(target_field, []) + [update]
            else:
                assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                rqa_updates.append((target_field, update))

        # Add data moving from RQA fields to the relevant demog/follow_up/rqa_updates
        for (i, source_field), target_field in rqa_moves.items():
            for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS + \
                    PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                if plan.raw_field == source_field:
                    _td = group[i]
                    update = _WSUpdate(_td[plan.raw_field], _td[plan.time_field], plan.raw_field)
                    if target_field in raw_demog_and_follow_up_survey_fields:
                        demogs_and_follow_up_survey_updates[target_field] = \
                            demogs_and_follow_up_survey_updates.get(target_field, []) + [update]
                    else:
                        assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                        rqa_updates.append((target_field, update))

        # Re-format the demogs and follow-up survey updates to a form suitable for use by the rest of the pipeline
        flattened_demog_and_follow_up_survey_updates = {}
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field in demogs_and_follow_up_survey_updates:
                plan_updates = demogs_and_follow_up_survey_updates[plan.raw_field]

                if len(plan_updates) > 0:
                    flattened_demog_and_follow_up_survey_updates[plan.raw_field] = \
                        "; ".join([u.message for u in plan_updates])
                    flattened_demog_and_follow_up_survey_updates[plan.time_field] = \
                        sorted([u.sent_on for u in plan_updates])[0]
                    flattened_demog_and_follow_up_survey_updates[f"{plan.raw_field}_source"] = \
                        "; ".join([u.source for u in plan_updates])
                else:
                    flattened_demog_and_follow_up_survey_updates[plan.raw_field] = None
                    flattened_demog_and_follow_up_survey_updates[plan.time_field] = None
                    flattened_demog_and_follow_up_survey_updates[f"{plan.raw_field}_source"] = None

        # Hide the demogs and follow-up survey keys currently in the TracedData which have had data moved away.
        td.hide_keys({k for k, v in flattened_demog_and_follow_up_survey_updates.items() if v is None}
                     .intersection(td.keys()),
                     Metadata(user, Metadata.get_call_location(), time.time()))

        # Update with the corrected demogs + follow-up data
        td.append_data({k: v for k, v in flattened_demog_and_follow_up_survey_updates.items() if v is not None},
                       Metadata(user, Metadata.get_call_location(), time.time()))

        # Hide all the RQA fields (they will be added back, in turn, in the next step).
        td.hide_keys({plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(td.keys()),
                     Metadata(user, Metadata.get_call_location(), time.time()))

        # For each RQA message, create a copy of this td, append the RQA message, and add this to the
        # list of TracedData.
        for target_field, update in rqa_updates:
            rqa_dict = {
                target_field: update.message,
                "sent_on": update.sent_on,
                f"{target_field}_source": update.source
            }

            corrected_td = td.copy()
            corrected_td.append_data(rqa_dict, Metadata(user, Metadata.get_call_location(), time.time()))
            corrected_data.append(corrected_td)

            log.debug("----------Created TracedData--------------")
            for _plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.DEMOGS_CODING_PLANS + \
                    PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                log.debug(f"{_plan.raw_field}: {corrected_td.get(_plan.raw_field)}")
                log.debug(f"{_plan.time_field}: {corrected_td.get(_plan.time_field)}")

    return corrected_data
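# Illustrative sketch (not part of the pipeline): to make the fan-out at the end of
# move_wrong_scheme_messages concrete, after the demog/follow-up updates are applied to the
# group's base TracedData, one corrected record is emitted per remaining RQA message. Plain
# dicts stand in for TracedData here and all field names and values are made up.
group_base = {"uid": "example-uid-1", "gender_raw": "female"}
example_rqa_updates = [
    ("rqa_s01e01_raw", "water is the main problem"),
    ("rqa_s01e02_raw", "no clinics nearby"),
]

corrected = []
for target_field, message in example_rqa_updates:
    record = dict(group_base)       # copy the shared (corrected) survey data
    record[target_field] = message  # attach exactly one RQA message
    corrected.append(record)

# 'corrected' now holds one record per RQA message, each carrying the participant's
# (possibly WS-corrected) survey answers alongside a single radio-show response.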