def auto_code_surveys(cls, user, data, coda_output_dir):
    """
    Auto-codes survey responses and exports every survey coding plan to Coda for manual
    verification and coding.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to auto-code, modified in place.
    :param coda_output_dir: Directory to write the Coda files to (created if it doesn't exist).
    :return: The same `data` iterable, with auto-coded labels appended.
    """
    # Auto-code surveys: run each configured cleaner over its raw field.
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, cc.coded_field, cc.cleaner, cc.code_scheme)

    # Output single-scheme answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                # One Coda file per plan may carry several schemes (one per coding configuration).
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)

    # Note: no need to handle location in any special way on this project because it is not being auto-coded
    return data
def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    """
    Labels missing demographic answers, auto-codes the rest, and exports each demographic
    coding plan to Coda for manual verification and coding.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to auto-code, modified in place.
    :param phone_uuid_table: Accepted for interface compatibility — not referenced in this body.
    :param coda_output_dir: Directory to write the Coda files to (created if it doesn't exist).
    :return: The same `data` iterable, with labels appended.
    """
    # Label missing data: any empty/absent raw answer gets a TRUE_MISSING label.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Auto-code remaining data with each plan's cleaner, where one is configured.
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme)

    # Output survey answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, 'w') as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f
            )
    print("Coda demogs files successfully exported")

    return data
def _impute_coding_error_codes(user, data):
    """
    Imputes CODING_ERROR labels onto every coding configuration of a plan whose
    WS-correct-dataset field was manually labelled as a coding error.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to impute coding-error codes on, modified in place.
    """
    for td in data:
        coding_error_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if f"{plan.raw_field}_WS_correct_dataset" in td:
                # Only impute when the WS-correction label is the CODING_ERROR control code.
                if td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"] == \
                        CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                    for cc in plan.coding_configurations:
                        if cc.coding_mode == CodingModes.SINGLE:
                            # Single-coded fields hold one label dict.
                            coding_error_dict[cc.coded_field] = \
                                CleaningUtils.make_label_from_cleaner_code(
                                    cc.code_scheme,
                                    cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                    Metadata.get_call_location()
                                ).to_dict()
                        else:
                            assert cc.coding_mode == CodingModes.MULTIPLE
                            # Multi-coded fields hold a list of label dicts.
                            coding_error_dict[cc.coded_field] = [
                                CleaningUtils.make_label_from_cleaner_code(
                                    cc.code_scheme,
                                    cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                    Metadata.get_call_location()
                                ).to_dict()
                            ]
        td.append_data(
            coding_error_dict,
            Metadata(user, Metadata.get_call_location(), time.time()))
def run_cleaners(cls, user, data):
    """
    Runs every configured auto-cleaner over the given traced data.

    Walks all RQA and survey coding plans; for each coding configuration that
    declares a cleaner, applies it to label the plan's raw field into the
    configuration's coded field.

    :param user: Identifier of the user running this pipeline stage.
    :param data: TracedData objects to clean, modified in place.
    """
    all_plans = PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS
    for coding_plan in all_plans:
        for config in coding_plan.coding_configurations:
            if config.cleaner is None:
                # This configuration is coded manually only; nothing to auto-apply.
                continue
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, coding_plan.raw_field, config.coded_field,
                config.cleaner, config.code_scheme
            )
def label_somalia_operator(user, traced_runs, phone_number_uuid_table):
    """
    Sets the Somalia mobile-operator code for each traced run.

    Looks up each run's phone number from its de-identified uuid, derives the
    operator from the number's prefix, and appends both the raw prefix and the
    operator label to the run.

    :param user: Identifier of the user running this pipeline stage.
    :param traced_runs: TracedData runs to label, modified in place.
    :param phone_number_uuid_table: Table mapping "avf_phone_id" uuids back to phone numbers.
    """
    run_uuids = {run["avf_phone_id"] for run in traced_runs}
    uuid_to_phone_lut = phone_number_uuid_table.uuid_to_data_batch(run_uuids)

    operator_scheme = CodeSchemes.SOMALIA_OPERATOR
    for run in traced_runs:
        # First 5 characters: the country code 252 and the next two digits,
        # which is enough to identify the operator.
        operator_raw = uuid_to_phone_lut[run["avf_phone_id"]][:5]
        operator_code = PhoneCleaner.clean_operator(operator_raw)

        if operator_code == Codes.NOT_CODED:
            code = operator_scheme.get_code_with_control_code(Codes.NOT_CODED)
        else:
            code = operator_scheme.get_code_with_match_value(operator_code)
        operator_label = CleaningUtils.make_label_from_cleaner_code(
            operator_scheme, code, Metadata.get_call_location()
        )

        run.append_data(
            {
                "operator_raw": operator_raw,
                "operator_coded": operator_label.to_dict()
            },
            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
        )
def impute_kenya_location_codes(user, data, location_configurations):
    """
    Imputes the full Kenya location hierarchy (county, constituency) from the single
    location code that was manually assigned in Coda.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to impute location codes on, modified in place.
    :param location_configurations: Coding configurations for the location fields.
        NOTE(review): appears to assume these are ordered lowest-to-highest in the location
        hierarchy — confirm against the callers.
    """
    for td in data:
        # Up to 1 location code should have been assigned in Coda. Search for that code,
        # ensuring that only 1 has been assigned or, if multiple have been assigned, that they are non-conflicting
        # control codes
        location_code = None

        for cc in location_configurations:
            coda_code = cc.code_scheme.get_code_with_id(td[cc.coded_field]["CodeID"])
            if location_code is not None:
                # A code was already found earlier in the hierarchy; any differing,
                # reviewed code in another scheme is a conflict -> CODING_ERROR.
                if not (coda_code.code_id == location_code.code_id
                        or coda_code.control_code == Codes.NOT_REVIEWED):
                    location_code = CodeSchemes.CONSTITUENCY.get_code_with_control_code(
                        Codes.CODING_ERROR)
            elif coda_code.control_code != Codes.NOT_REVIEWED:
                location_code = coda_code

        # If no code was found, then this location is still not reviewed.
        # Synthesise a NOT_REVIEWED code accordingly.
        if location_code is None:
            location_code = CodeSchemes.CONSTITUENCY.get_code_with_control_code(
                Codes.NOT_REVIEWED)

        # If a control code was found, set all other location keys to that control code,
        # otherwise convert the provided location to the other locations in the hierarchy.
        if location_code.code_type == "Control":
            for cc in location_configurations:
                td.append_data(
                    {
                        cc.coded_field: CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(location_code.control_code),
                            Metadata.get_call_location()).to_dict()
                    },
                    Metadata(user, Metadata.get_call_location(), time.time()))
        else:
            location = location_code.match_values[0]
            td.append_data(
                {
                    "county_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.COUNTY,
                        make_location_code(
                            CodeSchemes.COUNTY,
                            KenyaLocations.county_for_location_code(location)),
                        Metadata.get_call_location()).to_dict(),
                    "constituency_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.CONSTITUENCY,
                        make_location_code(
                            CodeSchemes.CONSTITUENCY,
                            KenyaLocations.constituency_for_location_code(location)),
                        Metadata.get_call_location()).to_dict()
                },
                Metadata(user, Metadata.get_call_location(), time.time()))
def fold_source(x, y):
    """
    Folds two source labels into one.

    Identical codes fold to the first label unchanged; differing codes collapse
    to the "both" code of the SOURCE scheme.

    :param x: First source label dict (must contain "CodeID").
    :param y: Second source label dict (must contain "CodeID").
    :return: The folded label as a dict.
    """
    if x["CodeID"] == y["CodeID"]:
        return x

    both_code = CodeSchemes.SOURCE.get_code_with_match_value("both")
    folded = CleaningUtils.make_label_from_cleaner_code(
        CodeSchemes.SOURCE, both_code, Metadata.get_call_location())
    return folded.to_dict()
def make_location_label(scheme, value):
    """
    Builds a location label dict for `value` under the given code scheme.

    Missing/skipped/not-coded values are looked up as control codes; anything
    else is looked up as a match value. The label carries a fixed origin name
    and timestamp.

    :param scheme: Code scheme to look the value up in.
    :param value: Cleaned location value or control value.
    :return: The constructed label as a dict.
    """
    control_values = {Codes.TRUE_MISSING, Codes.SKIPPED, Codes.NOT_CODED}
    code = (scheme.get_code_with_control_code(value)
            if value in control_values
            else scheme.get_code_with_match_value(value))

    label = CleaningUtils.make_label_from_cleaner_code(
        scheme, code, "make_location_label", date_time_utc="2018-11-02T13:40:50Z")
    return label.to_dict()
def auto_code_surveys(cls, user, data, icr_output_dir, coda_output_dir):
    """
    Auto-codes survey responses, exports them to Coda for manual verification,
    and samples messages for inter-coder reliability (ICR) checks.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to auto-code, modified in place.
    :param icr_output_dir: Directory to write the ICR CSV samples to (created if it doesn't exist).
    :param coda_output_dir: Directory to write the Coda files to (created if it doesn't exist).
    :return: The same `data` iterable, with auto-coded labels appended.
    """
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme)

    # Output single-scheme answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        # NOTE(review): local is named 'rqa_messages' but this loop iterates SURVEY plans —
        # presumably copy-pasted from an RQA version; confirm the intended plan list.
        rqa_messages = []
        for td in data:
            if plan.raw_field in td:
                rqa_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def _impute_coding_error_codes(user, data):
    """
    Imputes CODING_ERROR labels onto fields whose WS-correct-dataset label was
    manually set to a coding error.

    RQA plans are multi-coded (list of labels) and may carry an additional binary
    scheme; demographic and follow-up plans are single-coded.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to impute coding-error codes on, modified in place.
    """
    for td in data:
        coding_error_dict = dict()

        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                # Only impute when the WS-correction label is the CODING_ERROR control code.
                if td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"] == \
                        CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                    # RQA fields are multi-coded, so the label goes in a list.
                    coding_error_dict[plan.coded_field] = [
                        CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme,
                            plan.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location()).to_dict()
                    ]
                    if plan.binary_code_scheme is not None:
                        # The binary companion field is single-coded.
                        coding_error_dict[plan.binary_coded_field] = \
                            CleaningUtils.make_label_from_cleaner_code(
                                plan.binary_code_scheme,
                                plan.binary_code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                Metadata.get_call_location()
                            ).to_dict()

        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                if td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"] == \
                        CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                    # Demographic/follow-up fields are single-coded.
                    coding_error_dict[plan.coded_field] = \
                        CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme,
                            plan.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()

        td.append_data(
            coding_error_dict,
            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
def impute_age_category(user, data, age_configurations):
    """
    Imputes an age-category label for each TracedData item from its age label.

    Normal age codes are bucketed into fixed ranges; meta and control codes are
    passed through to the matching meta/control code of the category scheme.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to impute age categories on, modified in place.
    :param age_configurations: [age coding configuration, age-category coding configuration], in that order.
    """
    # TODO: By accepting a list of age_configurations but then requiring that list to contain code schemes in a
    #       certain order, it looks like we're providing more flexibility than we actually do. We should change this
    #       to explicitly accept age and age_category configurations, which requires refactoring all of the
    #       code imputation functions.
    age_cc = age_configurations[0]
    age_category_cc = age_configurations[1]

    # Inclusive (min_age, max_age) -> category match value.
    age_categories = {
        (10, 14): "10 to 14",
        (15, 17): "15 to 17",
        (18, 35): "18 to 35",
        (36, 54): "36 to 54",
        (55, 99): "55 to 99"
    }

    for td in data:
        age_label = td[age_cc.coded_field]
        age_code = age_cc.code_scheme.get_code_with_code_id(age_label["CodeID"])

        if age_code.code_type == CodeTypes.NORMAL:
            # TODO: If these age categories are standard across projects, move this to Core as a new cleaner.
            age_category = None
            for (min_age, max_age), category in age_categories.items():
                if min_age <= age_code.numeric_value <= max_age:
                    age_category = category
                    break  # Ranges are disjoint, so the first match is the only match.
            # NOTE(review): normal-coded ages outside 10-99 have no category and fail here —
            # presumably upstream validation restricts the range; confirm.
            assert age_category is not None
            age_category_code = age_category_cc.code_scheme.get_code_with_match_value(age_category)
        elif age_code.code_type == CodeTypes.META:
            age_category_code = age_category_cc.code_scheme.get_code_with_meta_code(age_code.meta_code)
        else:
            assert age_code.code_type == CodeTypes.CONTROL
            age_category_code = age_category_cc.code_scheme.get_code_with_control_code(age_code.control_code)

        age_category_label = CleaningUtils.make_label_from_cleaner_code(
            age_category_cc.code_scheme, age_category_code, Metadata.get_call_location())

        td.append_data(
            {age_category_cc.coded_field: age_category_label.to_dict()},
            Metadata(user, Metadata.get_call_location(), time.time()))
def auto_code_surveys(cls, user, data, pipeline_configuration, coda_output_dir):
    """
    Auto-codes survey responses, hides responses sent after the project end date,
    normalises Mogadishu sub-district/district coding, and exports each plan to Coda
    for manual verification and coding.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to auto-code, modified in place.
    :param pipeline_configuration: Pipeline configuration; `project_end_date` is read here.
    :param coda_output_dir: Directory to write the Coda files to (created if it doesn't exist).
    :return: The same `data` iterable, with labels appended.
    """
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, cc.coded_field, cc.cleaner, cc.code_scheme)

    # Remove survey data sent after the project finished
    log.info("Hiding survey messages sent after the end of the project. These will not be exported in "
             "production/analysis files")
    out_of_range_count = 0
    for td in data:
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            # TODO: Come up with a better solution here e.g. separate DEMOG/SURVEY lists
            if plan.raw_field in ["have_voice_raw", "suggestions_raw"]:
                continue
            if plan.time_field in td and isoparse(td[plan.time_field]) > pipeline_configuration.project_end_date:
                out_of_range_count += 1
                # Hide both the raw answer and its timestamp so neither is exported downstream.
                td.hide_keys({plan.raw_field, plan.time_field},
                             Metadata(user, Metadata.get_call_location(), time.time()))
    log.info(f"Hid {out_of_range_count} survey messages sent after the end of the project")

    # For any locations where the cleaners assigned a code to a sub district, set the district code to NC
    # (this is because only one column should have a value set in Coda)
    for td in data:
        if "mogadishu_sub_district_coded" in td:
            mogadishu_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
            if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(mogadishu_code_id).code_type == "Normal":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                )
                td.append_data({"district_coded": nc_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(), time.time()))

    # Output survey responses to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    """
    Merges manually coded Coda files back into the dataset, marks noise messages as
    NOT_CODED, and imputes the Somalia location hierarchy from the single location
    code assigned in Coda.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to apply the manual codes to, modified in place.
    :param coda_input_dir: Directory containing the manually coded Coda files.
    :return: The same `data` iterable, with manual codes applied.
    """
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]

        coda_input_path = path.join(coda_input_dir, plan.coda_filename)
        f = None
        try:
            # The importers accept f=None (file absent); labels then come out NOT_REVIEWED.
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    # Same Coda file is read again for the binary scheme, so rewind it.
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Mark data that is noise as Codes.NOT_CODED
    for td in data:
        if td["noise"]:
            nc_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.coded_field not in td:
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location()
                    )
                    nc_dict[plan.coded_field] = [nc_label.to_dict()]
                    if plan.binary_code_scheme is not None:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme,
                            plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()
                        )
                        nc_dict[plan.binary_coded_field] = nc_label.to_dict()
            td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Merge manually coded survey files into the cleaned dataset
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        f = None
        try:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Set district/region/state/zone codes from the coded district field.
    for td in data:
        # Up to 1 location code should have been assigned in Coda. Search for that code,
        # ensuring that only 1 has been assigned or, if multiple have been assigned, that they are non-conflicting
        # control codes
        location_code = None
        for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
            coda_code = plan.code_scheme.get_code_with_id(td[plan.coded_field]["CodeID"])
            if location_code is not None:
                # A second, different, reviewed code in another location scheme is inconsistent.
                if not (coda_code.code_id == location_code.code_id
                        or coda_code.control_code == Codes.NOT_REVIEWED):
                    location_code = Code(None, "Control", None, None, None, None,
                                         control_code=Codes.NOT_INTERNALLY_CONSISTENT)
            elif coda_code.control_code != Codes.NOT_REVIEWED:
                location_code = coda_code

        # If no code was found, then this location is still not reviewed.
        # Synthesise a NOT_REVIEWED code accordingly.
        if location_code is None:
            location_code = Code(None, "Control", None, None, None, None,
                                 control_code=Codes.NOT_REVIEWED)

        # If a control code was found, set all other location keys to that control code,
        # otherwise convert the provided location to the other locations in the hierarchy.
        if location_code.code_type == "Control":
            for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
                td.append_data({
                    plan.coded_field: CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(location_code.control_code),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), time.time()))
        else:
            location = location_code.match_values[0]
            td.append_data({
                "mogadishu_sub_district_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    cls.make_location_code(
                        CodeSchemes.MOGADISHU_SUB_DISTRICT,
                        SomaliaLocations.mogadishu_sub_district_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "district_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.DISTRICT,
                    cls.make_location_code(
                        CodeSchemes.DISTRICT,
                        SomaliaLocations.district_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "region_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.REGION,
                    cls.make_location_code(
                        CodeSchemes.REGION,
                        SomaliaLocations.region_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "state_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.STATE,
                    cls.make_location_code(
                        CodeSchemes.STATE,
                        SomaliaLocations.state_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "zone_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.ZONE,
                    cls.make_location_code(
                        CodeSchemes.ZONE,
                        SomaliaLocations.zone_for_location_code(location)),
                    Metadata.get_call_location()).to_dict()
            }, Metadata(user, Metadata.get_call_location(), time.time()))

    return data
def test_export_import_one_single_coded_scheme(self):
    """
    Round-trips a single-coded scheme through Coda 2 export and import:
    exports auto-coded gender data, re-imports it with and without a Coda file,
    and checks that exporting conflicting labels for the same message id fails.
    """
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Build raw input data
    message_dicts = [
        {"gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00"},
        {"gender_raw": "", "gender_sent_on": "2018-11-01T07:17:04+03:00"},
        {"gender_raw": "hiya", "gender_sent_on": "2018-11-01T07:19:04+05:00"},
        {},
        {"gender_raw": "boy", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
        {"gender_raw": "man", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
    ]
    messages = [
        TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
        for i, d in enumerate(message_dicts)
    ]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

    # Load gender scheme
    with open("tests/traced_data/resources/coda_2_gender_scheme.json") as f:
        gender_scheme = CodeScheme.from_firebase_map(json.load(f))

    # Apply the English gender cleaner.
    # Time and call location are mocked so the export matches the expected fixture exactly.
    with mock.patch("core_data_modules.util.TimeUtils.utc_now_as_iso_string") as time_mock, \
            mock.patch("core_data_modules.traced_data.Metadata.get_function_location") as location_mock:
        time_mock.return_value = "2018-11-02T15:00:07+00:00"
        location_mock.return_value = "english.DemographicCleaner.clean_gender"

        CleaningUtils.apply_cleaner_to_traced_data_iterable(
            "test_user", messages, "gender_raw", "gender_coded",
            english.DemographicCleaner.clean_gender, gender_scheme
        )

    # Export to a Coda 2 messages file
    with open(file_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "gender_raw", "gender_sent_on", "gender_coda_id",
            {"gender_coded": gender_scheme}, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_one_scheme.json"))

    # Test importing with no file available
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
        "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

    # Deliberately testing the read can be done twice
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
        "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

    na_id = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
    nr_id = gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2",
            date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("gender_raw", "") == "":
            td.append_data({"gender_coded": na_label.to_dict()},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]
    self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

    # Test importing from the test file
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    with open("tests/traced_data/resources/coda_2_import_test_one_scheme.json", "r") as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme}, f)

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2",
            date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("gender_raw", "") == "":
            td.append_data({"gender_coded": na_label.to_dict()},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]
    expected_code_ids = [
        gender_scheme.get_code_with_match_value("female").code_id,  # Manually approved auto-code
        gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # Empty raw message
        gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually assigned code which isn't checked
        gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # No raw message
        gender_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id,  # Manually Not Coded
        gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually un-coded
    ]
    self.assertListEqual(imported_code_ids, expected_code_ids)

    # Add an element with the same raw text but a conflicting label
    messages.append(TracedData({
        "gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00",
        "gender_coded": CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_match_value("male"),
            "make_location_label", date_time_utc="2018-11-03T13:40:50Z"
        ).to_dict()
    }, Metadata("test_user", Metadata.get_call_location(), time.time())))

    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

    # Exporting two messages with the same id but different labels must raise.
    with open(file_path, "w") as f:
        try:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "gender_raw", "gender_sent_on", "gender_coda_id",
                {"gender_coded": gender_scheme}, f)
        except AssertionError as e:
            assert str(e) == "Messages with the same id " \
                             "(cf2e5bff1ef03dcd20d1a0b18ef7d89fc80a3554434165753672f6f40fde1d25) have different " \
                             "labels for coded_key 'gender_coded'"
            return
        self.fail("Exporting data with conflicting labels did not fail")
def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    """
    Labels missing survey answers, auto-codes the rest, normalises Mogadishu
    sub-district/district coding, sets the operator from the phone number, and
    exports each plan (plus the combined location schemes) to Coda.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to auto-code, modified in place.
    :param phone_uuid_table: Table mapping the "uid" field back to phone numbers.
    :param coda_output_dir: Directory to write the Coda files to (created if it doesn't exist).
    :return: The same `data` iterable, with labels appended.
    """
    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Auto-code remaining data
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme)

    # For any locations where the cleaners assigned a code to a sub district, set the district code to NC
    # (this is because only one column should have a value set in Coda)
    for td in data:
        if "mogadishu_sub_district_coded" in td:
            mogadishu_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
            if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(mogadishu_code_id).code_type == "Normal":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                )
                td.append_data({"district_coded": nc_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(), time.time()))

    # Set operator from phone number
    for td in data:
        operator_clean = PhoneCleaner.clean_operator(phone_uuid_table.get_phone(td["uid"]))
        if operator_clean == Codes.NOT_CODED:
            label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.OPERATOR,
                CodeSchemes.OPERATOR.get_code_with_control_code(Codes.NOT_CODED),
                Metadata.get_call_location())
        else:
            label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.OPERATOR,
                CodeSchemes.OPERATOR.get_code_with_match_value(operator_clean),
                Metadata.get_call_location())
        td.append_data({"operator_coded": label.to_dict()},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Output single-scheme answers to coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        # The sub-district is exported separately below as part of the multi-scheme location file.
        if plan.raw_field == "mogadishu_sub_district_raw":
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f)

    # Output location scheme to coda for manual verification + coding
    output_path = path.join(coda_output_dir, "location.json")
    TracedDataCodaV2IO.compute_message_ids(
        user, data, "mogadishu_sub_district_raw", "mogadishu_sub_district_raw_id")
    with open(output_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            data, "mogadishu_sub_district_raw", "mogadishu_sub_district_time",
            "mogadishu_sub_district_raw_id",
            {
                "mogadishu_sub_district_coded": CodeSchemes.MOGADISHU_SUB_DISTRICT,
                "district_coded": CodeSchemes.DISTRICT,
                "region_coded": CodeSchemes.REGION,
                "state_coded": CodeSchemes.STATE,
                "zone_coded": CodeSchemes.ZONE
            }, f)

    return data
traced_runs = rapid_pro.convert_runs_to_traced_data( user, raw_runs, raw_contacts, phone_number_uuid_table, pipeline_configuration.rapid_pro_test_contact_uuids) # Set the operator codes for each message. if flow in pipeline_configuration.activation_flow_names: uuids = {td["avf_phone_id"] for td in traced_runs} uuid_to_phone_lut = phone_number_uuid_table.uuid_to_data_batch( uuids) for td in traced_runs: operator_code = PhoneCleaner.clean_operator( uuid_to_phone_lut[td["avf_phone_id"]]) if operator_code == Codes.NOT_CODED: operator_label = CleaningUtils.make_label_from_cleaner_code( CodeSchemes.SOMALIA_OPERATOR, CodeSchemes.SOMALIA_OPERATOR. get_code_with_control_code(Codes.NOT_CODED), Metadata.get_call_location()) else: operator_label = CleaningUtils.make_label_from_cleaner_code( CodeSchemes.SOMALIA_OPERATOR, CodeSchemes.SOMALIA_OPERATOR.get_code_with_match_value( operator_code), Metadata.get_call_location()) td.append_data({"operator_coded": operator_label.to_dict()}, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())) log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...") with open(raw_runs_path, "w") as raw_runs_file: json.dump([run.serialize() for run in raw_runs], raw_runs_file) log.info(f"Saved {len(raw_runs)} raw runs")
def impute_yes_no_reasons_codes(user, data, coding_configurations):
    """
    Synchronises labels between a binary (yes/no) scheme and its reasons scheme.

    :param user: Identifier of the user running this pipeline stage (recorded in TracedData history).
    :param data: TracedData objects to synchronise, modified in place.
    :param coding_configurations: [binary coding configuration, reasons coding configuration], in that order.
    """
    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
    # if there is an additional reason given. Importing those two schemes separately above caused the labels in
    # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
    # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
    # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
    binary_configuration = coding_configurations[0]
    reasons_configuration = coding_configurations[1]

    # TODO: Switch to using CodingModes.SINGLE/MULTIPLE once configuration is being set in configuration json
    #       rather than in pipeline_configuration.py
    assert binary_configuration.coding_mode == "SINGLE"
    assert reasons_configuration.coding_mode == "MULTIPLE"

    for td in data:
        binary_label = td[binary_configuration.coded_field]
        binary_code = binary_configuration.code_scheme.get_code_with_id(binary_label["CodeID"])

        # "Present" means the label is anything other than NOT_REVIEWED.
        binary_label_present = \
            binary_label["CodeID"] != \
            binary_configuration.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        # Reasons are multi-coded: present if there is more than one label, or the
        # single label is not NOT_REVIEWED.
        reasons_label_present = \
            len(td[reasons_configuration.coded_field]) > 1 or \
            td[reasons_configuration.coded_field][0]["CodeID"] != \
            reasons_configuration.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        if binary_label_present and not reasons_label_present:
            if binary_code.code_type == "Control":
                # Mirror the binary control code onto the reasons scheme.
                control_code = binary_code.control_code
                reasons_code = reasons_configuration.code_scheme.get_code_with_control_code(control_code)

                reasons_label = CleaningUtils.make_label_from_cleaner_code(
                    reasons_configuration.code_scheme, reasons_code,
                    Metadata.get_call_location(),
                    origin_name="Pipeline Code Synchronisation")

                td.append_data(
                    {reasons_configuration.coded_field: [reasons_label.to_dict()]},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
            else:
                # Binary was answered with a normal code but no reason given -> reasons is NC.
                assert binary_code.code_type == "Normal"

                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    reasons_configuration.code_scheme,
                    reasons_configuration.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                    origin_name="Pipeline Code Synchronisation")

                td.append_data(
                    {reasons_configuration.coded_field: [nc_label.to_dict()]},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
def apply_manual_codes(cls, user, data, coda_input_dir):
    """
    Merges manually-assigned labels from Coda 2 files back into the dataset, then fills in
    missing/noise labels and re-runs auto-cleaners and imputation functions.

    :param user: Identifier of the user running this program.
    :type user: str
    :param data: TracedData objects to apply the manual codes to.
    :type data: iterable of TracedData
    :param coda_input_dir: Directory containing the Coda 2 files to import manual labels from.
    :type coda_input_dir: str
    :return: The coded data.
    :rtype: iterable of TracedData
    """
    # Merge manually coded data into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        # Plans with no Coda file are auto-coded only; nothing to import for them.
        if plan.coda_filename is None:
            continue
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        for cc in plan.coding_configurations:
            if not cc.requires_manual_verification:
                continue
            f = None
            try:
                # A missing Coda file is tolerated: the importers accept f=None and label
                # everything NOT_REVIEWED in that case.
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
                else:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        # Additionally import the "wrong scheme" correction labels, if this project uses them.
        if PipelineConfiguration.WS_CORRECT_DATASET_SCHEME is not None:
            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, plan.id_field,
                    {f"{plan.raw_field}_correct_dataset": PipelineConfiguration.WS_CORRECT_DATASET_SCHEME}, f)
            finally:
                if f is not None:
                    f.close()

    # Label data for which there is no response as TRUE_MISSING.
    # Label data for which the response is the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                # A coding configuration may override the plan's raw field.
                raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                if raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme,
                        cc.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()).to_dict()
                    # Multi-coded fields hold a list of labels; single-coded fields hold one label.
                    missing_dict[cc.coded_field] = na_label if cc.coding_mode == CodingModes.SINGLE else [na_label]
            for cc in plan.coding_configurations:
                raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                if td.get(raw_field) == "":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme,
                        cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Mark data that is noise as Codes.NOT_CODED
    for td in data:
        if td.get("noise", False):
            nc_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                for cc in plan.coding_configurations:
                    # Only label fields that were not already labelled above.
                    if cc.coded_field not in td:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()).to_dict()
                        nc_dict[cc.coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
            td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Run the cleaners that don't require manual verification again, this time setting "checked" to True
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None and not cc.requires_manual_verification:
                raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, raw_field, cc.coded_field, cc.cleaner, cc.code_scheme, set_checked=True)

    # Run code imputation functions
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.code_imputation_function is not None:
            plan.code_imputation_function(user, data, plan.coding_configurations)

    cls._impute_coding_error_codes(user, data)

    return data
def test_export_import_one_multi_coded_scheme(self):
    """
    Round-trip test for a single multi-coded scheme: exports raw messages to a Coda 2 file,
    compares that file against a golden resource, then tests importing both with no Coda file
    available and from a golden import file, checking the resulting label code ids.
    """
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Build raw input data. Includes an empty message, a duplicate-content case ("food + water"),
    # a message with no fields at all, and an uncodable message ("abcd").
    message_dicts = [
        {"msg_raw": "food", "msg_sent_on": "2018-11-01T07:13:04+03:00"},
        {"msg_raw": "", "msg_sent_on": "2018-11-01T07:17:04+03:00"},
        {"msg_raw": "food + water", "msg_sent_on": "2018-11-01T07:19:04+05:00"},
        {},
        {"msg_raw": "water", "msg_sent_on": "2018-11-02T19:00:29+03:00"},
        {"msg_raw": "abcd", "msg_sent_on": "2018-11-02T20:30:45+03:00"}
    ]
    messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                for i, d in enumerate(message_dicts)]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "msg_raw", "msg_coda_id")

    # Load the message scheme resource
    with open("tests/traced_data/resources/coda_2_msg_scheme.json") as f:
        msg_scheme = CodeScheme.from_firebase_map(json.load(f))

    # Export to a Coda 2 messages file and compare byte-for-byte against the expected resource.
    with open(file_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "msg_raw", "msg_sent_on", "msg_coda_id", {"msg_coded": msg_scheme}, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_multi_coded.json"))

    # Test importing with no file available
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

    # Deliberately testing the read can be done twice
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

    na_id = msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
    nr_id = msg_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

    # Set TRUE_MISSING codes for the empty/absent messages
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            msg_scheme, msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2",
            date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("msg_raw", "") == "":
            td.append_data({"msg_coded": [na_label.to_dict()]},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    # With no Coda file, everything should carry exactly one label: NR, or NA where missing.
    for td in imported_messages:
        self.assertEqual(len(td["msg_coded"]), 1)
    imported_code_ids = [td["msg_coded"][0]["CodeID"] for td in imported_messages]
    self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

    # Test importing from the test file
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    with open("tests/traced_data/resources/coda_2_import_test_multi_coded.json", "r") as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)

        # Test that reading the same file-pointer twice without moving it back to the start of the file fails
        try:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)
            self.fail("Re-using the same file pointer didn't raise an assertion error")
        except AssertionError as e:
            self.assertEqual(str(e),
                             "File-pointer not at byte 0. "
                             "Should you have used e.g. `f.seek(0)` before calling this method?")

    # Set TRUE_MISSING codes for the empty/absent messages
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            msg_scheme, msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2",
            date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("msg_raw", "") == "":
            td.append_data({"msg_coded": [na_label.to_dict()]},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = []
    for td in imported_messages:
        imported_code_ids.append([code["CodeID"] for code in td["msg_coded"]])

    # Expected labels after import; order within a message's label list is not significant,
    # so compare as sets (after checking lengths match).
    expected_code_ids = [
        [msg_scheme.get_code_with_match_value("food").code_id],
        [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
        [msg_scheme.get_code_with_match_value("food").code_id,
         msg_scheme.get_code_with_match_value("water").code_id],
        [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
        [msg_scheme.get_code_with_match_value("water").code_id],
        [msg_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id]
    ]

    for x, y in zip(imported_code_ids, expected_code_ids):
        self.assertEqual(len(x), len(y))
        self.assertSetEqual(set(x), set(y))
def impute_somalia_location_codes(user, data, location_configurations):
    """
    Imputes the full Somalia location hierarchy from the single location code assigned in Coda,
    and imputes the zone from the operator when no location was supplied at all.

    :param user: Identifier of the user running this program.
    :type user: str
    :param data: TracedData objects to impute location codes on.
    :type data: iterable of TracedData
    :param location_configurations: Coding configurations of the location fields to read/write.
    :type location_configurations: iterable of CodingConfiguration
    """
    for td in data:
        # Up to 1 location code should have been assigned in Coda. Search for that code,
        # ensuring that only 1 has been assigned or, if multiple have been assigned, that they are non-conflicting
        # control codes
        location_code = None
        for cc in location_configurations:
            coda_code = cc.code_scheme.get_code_with_code_id(td[cc.coded_field]["CodeID"])
            if location_code is not None:
                # A second, different, non-NOT_REVIEWED code conflicts with the first -> CODING_ERROR.
                if not (coda_code.code_id == location_code.code_id or coda_code.control_code == Codes.NOT_REVIEWED):
                    location_code = CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(
                        Codes.CODING_ERROR)
            elif coda_code.control_code != Codes.NOT_REVIEWED:
                location_code = coda_code

        # If no code was found, then this location is still not reviewed.
        # Synthesise a NOT_REVIEWED code accordingly.
        if location_code is None:
            location_code = CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(Codes.NOT_REVIEWED)

        # If a control code was found, set all other location keys to that control code,
        # otherwise convert the provided location to the other locations in the hierarchy.
        if location_code.code_type == CodeTypes.CONTROL:
            for cc in location_configurations:
                td.append_data({
                    cc.coded_field: CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme,
                        cc.code_scheme.get_code_with_control_code(location_code.control_code),
                        Metadata.get_call_location()).to_dict()
                }, Metadata(user, Metadata.get_call_location(), time.time()))
        elif location_code.code_type == CodeTypes.META:
            for cc in location_configurations:
                td.append_data({
                    cc.coded_field: CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme,
                        cc.code_scheme.get_code_with_meta_code(location_code.meta_code),
                        Metadata.get_call_location()).to_dict()
                }, Metadata(user, Metadata.get_call_location(), time.time()))
        else:
            # A normal location code was assigned; expand it into every level of the hierarchy.
            assert location_code.code_type == CodeTypes.NORMAL
            location = location_code.match_values[0]
            td.append_data({
                "mogadishu_sub_district_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    make_location_code(CodeSchemes.MOGADISHU_SUB_DISTRICT,
                                       SomaliaLocations.mogadishu_sub_district_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "district_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.SOMALIA_DISTRICT,
                    make_location_code(CodeSchemes.SOMALIA_DISTRICT,
                                       SomaliaLocations.district_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "region_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.SOMALIA_REGION,
                    make_location_code(CodeSchemes.SOMALIA_REGION,
                                       SomaliaLocations.region_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "state_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.SOMALIA_STATE,
                    make_location_code(CodeSchemes.SOMALIA_STATE,
                                       SomaliaLocations.state_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "zone_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.SOMALIA_ZONE,
                    make_location_code(CodeSchemes.SOMALIA_ZONE,
                                       SomaliaLocations.zone_for_location_code(location)),
                    Metadata.get_call_location()).to_dict()
            }, Metadata(user, Metadata.get_call_location(), time.time()))

        # Impute zone from operator when no location was supplied at all.
        if "location_raw" not in td:
            operator_str = CodeSchemes.SOMALIA_OPERATOR.get_code_with_code_id(
                td["operator_coded"]["CodeID"]).string_value
            zone_str = SomaliaLocations.zone_for_operator_code(operator_str)
            # Fix: zone_str is already a zone (from zone_for_operator_code), so use it directly.
            # Previously it was passed through SomaliaLocations.state_for_location_code, which maps a
            # *location* to a *state* — the wrong level of the hierarchy for the SOMALIA_ZONE scheme.
            td.append_data({
                "zone_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.SOMALIA_ZONE,
                    make_location_code(CodeSchemes.SOMALIA_ZONE, zone_str),
                    Metadata.get_call_location()).to_dict()
            }, Metadata(user, Metadata.get_call_location(), time.time()))
def apply_manual_codes(cls, user, data, coda_input_dir):
    """
    Merges manually coded Coda files into the dataset, labels missing responses, and synchronises
    binary/reasons scheme labels.

    :param user: Identifier of the user running this program.
    :type user: str
    :param data: TracedData objects to apply the manual codes to.
    :type data: iterable of TracedData
    :param coda_input_dir: Directory containing the Coda 2 files to import manual labels from.
    :type coda_input_dir: str
    :return: The coded data.
    :rtype: iterable of TracedData
    """
    # Merge manually coded radio show files into the cleaned dataset
    # NOTE(review): this iterates SURVEY_CODING_PLANS but names the filtered objects
    # 'rqa_messages' and imports them multi-coded — looks copied from an RQA code path; confirm
    # SURVEY_CODING_PLANS is the intended plan list here.
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)
        print(coda_input_path)

        f = None
        try:
            # A missing Coda file is tolerated: the importers accept f=None and label
            # everything NOT_REVIEWED in that case.
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                # Re-read the same file for the binary scheme import.
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field, {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # At this point, the TracedData objects still contain messages for at most one week each.
    # Label the weeks for which there is no response as TRUE_MISSING.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                # Multi-coded field -> list of labels.
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    # Single-coded binary field -> single label.
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
    # if there is an additional reason given. Importing those two schemes separately above caused the labels in
    # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
    # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
    # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        if plan.binary_code_scheme is not None:
            for td in rqa_messages:
                binary_label = td[plan.binary_coded_field]
                binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                # "Present" means anything other than NOT_REVIEWED.
                binary_label_present = binary_label["CodeID"] != \
                    plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0]["CodeID"] != \
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                if binary_label_present and not reasons_label_present:
                    if binary_code.code_type == "Control":
                        # Mirror the binary control code into the reasons scheme.
                        control_code = binary_code.control_code
                        reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                        reasons_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, reasons_code,
                            Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")

                        td.append_data(
                            {plan.coded_field: [reasons_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                        )
                    else:
                        # Binary was a normal code but no reason was given -> reasons is NOT_CODED.
                        assert binary_code.code_type == "Normal"
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme,
                            plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location(),
                            origin_name="Pipeline Code Synchronisation"
                        )
                        td.append_data(
                            {plan.coded_field: [nc_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                        )

    # Not everyone will have answered all of the demographic flows.
    # Label demographic questions which had no responses as TRUE_MISSING.
    # NOTE(review): this assigns a single label dict (not a list) to plan.coded_field, which was
    # imported multi-coded above and is labelled with a *list* in the earlier missing-data block —
    # confirm whether this block should also wrap the label in a list (or is reachable at all,
    # given the earlier block already handles absent raw fields).
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    return data
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    """
    Filters and auto-codes the radio-show messages, then exports them for manual coding in Coda
    and for inter-coder reliability (ICR) checking.

    :param user: Identifier of the user running this program.
    :type user: str
    :param data: TracedData objects of the show messages to auto-code.
    :type data: iterable of TracedData
    :param icr_output_dir: Directory to write the ICR CSV files to.
    :type icr_output_dir: str
    :param coda_output_dir: Directory to write the Coda 2 message files to.
    :type coda_output_dir: str
    :return: The filtered, auto-coded data.
    :rtype: iterable of TracedData
    """
    # Filter out test messages sent by AVF.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.TEST_KEYS)

    # Filter out runs sent outside the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE, cls.PROJECT_END_DATE)

    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                # Multi-coded field -> list of labels.
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    # Single-coded binary field -> single label.
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Output messages for Coda (typo "messagges" fixed)
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f
            )
    print("Coda message files successfully exported")

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        # Fix: dropped the unused 'na_messages = []' local that was declared but never appended to or read.
        test_pipeline_messages = []
        for td in data:
            # Only un-coded messages are ICR candidates; anything already coded must be exactly
            # the single TRUE_MISSING label set above.
            if plan.coded_field not in td:
                test_pipeline_messages.append(td)
            else:
                assert len(td[plan.coded_field]) == 1
                assert td[plan.coded_field][0]["CodeID"] == \
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

        # Sample deterministically (fixed seed) so repeated runs export the same ICR set.
        icr_messages = ICRTools.generate_sample_for_icr(
            test_pipeline_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field]
            )
    print("ICR files successfully exported")

    return data
def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    """
    Incrementally fetches contacts and flow runs from a Rapid Pro workspace, converts the runs to
    TracedData, and saves raw and traced exports under raw_data_dir.

    :param user: Identifier of the user running this program.
    :type user: str
    :param google_cloud_credentials_file_path: Path to the Google Cloud service account credentials file
                                               used to download the Rapid Pro access token.
    :type google_cloud_credentials_file_path: str
    :param raw_data_dir: Directory to read previous exports from and write new exports to.
    :type raw_data_dir: str
    :param phone_number_uuid_table: Table re-identifying phone numbers with UUIDs, passed through to
                                    the runs -> TracedData conversion.
    :param rapid_pro_source: Configuration of the Rapid Pro workspace to fetch from (domain, token url,
                             flow names, file names, test contacts, source name).
    """
    log.info("Fetching data from Rapid Pro...")

    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()
    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        # First run for this workspace: no previous export, so do a full fetch.
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                # Incremental update: only fetch runs modified since the previous export.
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server "
                         f"for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        # (Refreshed once per flow so later flows see contacts modified during this export.)
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(
                raw_contacts, raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE,
                        CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    # Persist the final contacts set once all flows have been exported.
    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")
def import_coda_2_to_traced_data_iterable(cls, user, data, message_id_key, scheme_key_map, f=None):
    """
    Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file.

    Data which has not been checked in the Coda file is coded as NOT_REVIEWED (irrespective of
    whether there was an automatic code there before).

    TODO: Data which has been assigned a code under one scheme but none of the others needs to coded as NC not NR
    TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

    :param user: Identifier of user running this program.
    :type user: str
    :param data: TracedData objects to be coded using the Coda file.
    :type data: iterable of TracedData
    :param message_id_key: Key in TracedData objects of the message ids.
    :type message_id_key: str
    :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                           (Scheme in the Coda messages file to retrieve the labels from)
    :type scheme_key_map: dict of str -> Scheme
    :param f: Coda data file to import codes from, or None. If None, everything is coded NOT_REVIEWED.
    :type f: file-like | None
    """
    if f is None:
        f = cls._make_empty_file()

    # Build a lookup table of MessageID -> SchemeID -> Labels
    coda_dataset = cls._dataset_lut_from_messages_file(f, scheme_key_map.values())

    # Filter out TracedData objects that do not contain a message id key
    data = [td for td in data if message_id_key in td]

    # Apply the labels from Coda to each TracedData item in data
    for td in data:
        for key_of_coded, scheme in scheme_key_map.items():
            # Get labels for this (message id, scheme id) from the look-up table
            labels = coda_dataset.get(td[message_id_key], dict()).get(scheme.scheme_id, [])
            if labels is not None:
                # Append each label that was assigned to this message for this scheme to the TracedData.
                # Labels are appended oldest-first (reversed) so the most recent label ends up last/current.
                for label in reversed(labels):
                    td.append_data({key_of_coded: label.to_dict()},
                                   Metadata(user, Metadata.get_call_location(),
                                            TimeUtils.utc_now_as_iso_string()))

            # If this td still has no label after importing from the Coda file, or the label is a non-missing label
            # that hasn't been checked in the Coda UI, set a code for NOT_REVIEWED
            # NOTE(review): the metadata timestamp here is time.time() (a float) while the appends above
            # use an ISO string — confirm whether this mixed timestamp representation is intentional.
            if key_of_coded not in td or not td[key_of_coded]["Checked"]:
                nr_label = CleaningUtils.make_label_from_cleaner_code(
                    scheme, scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                    Metadata.get_call_location())
                td.append_data({key_of_coded: nr_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(), time.time()))
def import_coda_2_to_traced_data_iterable_multi_coded(cls, user, data, message_id_key, scheme_key_map, f=None):
    """
    Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file,
    for multi-coded (multiple labels per message) schemes.

    Data which has not been checked in the Coda file is coded as NOT_REVIEWED (irrespective of
    whether there was an automatic code there before).

    Only the 'primary' schemes should be passed in. Schemes that have been duplicated using the
    duplicate_scheme tool in CodaV2/data_tools will be detected as being associated with the primary
    scheme automatically.

    TODO: Data which has been assigned a code under one scheme but none of the others needs to coded as NC not NR
    TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

    :param user: Identifier of user running this program.
    :type user: str
    :param data: TracedData objects to be coded using the Coda file.
    :type data: iterable of TracedData
    :param message_id_key: Key in TracedData objects of the message ids.
    :type message_id_key: str
    :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                           (Scheme in the Coda messages file to retrieve the labels from)
    :type scheme_key_map: dict of str -> iterable of Scheme
    :param f: Coda data file to import codes from, or None. If None, assigns NOT_REVIEWED codes to everything.
    :type f: file-like | None
    """
    if f is None:
        f = cls._make_empty_file()

    # Build a lookup table of MessageID -> SchemeID -> Labels
    coda_dataset = cls._dataset_lut_from_messages_file(f, scheme_key_map.values())

    # Filter out TracedData objects that do not contain a message id key
    data = [td for td in data if message_id_key in td]

    # Apply the labels from Coda to each TracedData item in data
    for td in data:
        for coded_key, scheme in scheme_key_map.items():
            # Get labels for this (message id, scheme id) from the look-up table
            labels = coda_dataset.get(td[message_id_key], dict()).get(scheme.scheme_id, [])

            # Get the currently assigned list of labels for this multi-coded scheme,
            # and construct a look-up table of scheme id -> label
            td_labels = td.get(coded_key, [])
            td_labels_lut = {label["SchemeID"]: Label.from_dict(label) for label in td_labels}

            for label in reversed(labels):
                # Update the relevant label in this traced data's list of labels with the new label,
                # and append the whole new list to the traced data.
                td_labels_lut[label.scheme_id] = label

                td_labels = list(td_labels_lut.values())
                td.append_data(
                    {coded_key: [label.to_dict() for label in td_labels]},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

            # Delete any labels that are SPECIAL-MANUALLY_UNCODED
            # (a sentinel code Coda uses when a coder explicitly removes a label).
            for scheme_id, label in list(td_labels_lut.items()):
                if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                    del td_labels_lut[scheme_id]
                    td_labels = list(td_labels_lut.values())
                    td.append_data({coded_key: [label.to_dict() for label in td_labels]},
                                   Metadata(user, Metadata.get_call_location(), time.time()))

            # If no manual labels have been set and are checked, set a code for NOT_REVIEWED
            checked_codes_count = 0
            labels = td.get(coded_key)
            if labels is not None:
                for label in labels:
                    if label["Checked"]:
                        checked_codes_count += 1
            if checked_codes_count == 0:
                nr_label = CleaningUtils.make_label_from_cleaner_code(
                    scheme, scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                    Metadata.get_call_location())
                td.append_data({coded_key: [nr_label.to_dict()]},
                               Metadata(user, Metadata.get_call_location(), time.time()))

            # Normalise the scheme ids of all the imported labels
            # (duplicated schemes share the primary scheme id as a prefix).
            labels = [Label.from_dict(d) for d in td[coded_key]]
            for label in labels:
                assert label.scheme_id.startswith(scheme.scheme_id)
                label.scheme_id = scheme.scheme_id

            # De-duplicate the imported labels by selecting the first label with each code id.
            # This is required in cases where the same label was applied to this message under different columns
            # of the same code scheme, and is possible now that we have normalised the scheme ids.
            unique_labels_by_code_id = []
            seen_code_ids = set()
            for label in labels:
                if label.code_id not in seen_code_ids:
                    unique_labels_by_code_id.append(label)
                    seen_code_ids.add(label.code_id)
            td.append_data(
                {coded_key: [label.to_dict() for label in unique_labels_by_code_id]},
                Metadata(user, Metadata.get_call_location(), time.time()))
def apply_manual_codes(cls, user, data, coda_input_dir):
    """
    Merges manually coded Coda files into the dataset, labels missing/empty responses and noise,
    and runs the configured code-imputation functions.

    :param user: Identifier of the user running this program.
    :type user: str
    :param data: TracedData objects to apply the manual codes to.
    :type data: iterable of TracedData
    :param coda_input_dir: Directory containing the Coda 2 files to import manual labels from.
    :type coda_input_dir: str
    :return: The coded data.
    :rtype: iterable of TracedData
    """
    # Merge manually coded data into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        for cc in plan.coding_configurations:
            f = None
            try:
                # A missing Coda file is tolerated: the importers accept f=None and label
                # everything NOT_REVIEWED in that case.
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
                else:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

    # Label data for which there is no response as TRUE_MISSING.
    # Label data for which the response is the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td:
                for cc in plan.coding_configurations:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme,
                        cc.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()).to_dict()
                    # Multi-coded fields hold a list of labels; single-coded fields hold one label.
                    missing_dict[cc.coded_field] = na_label if cc.coding_mode == CodingModes.SINGLE else [na_label]
            elif td[plan.raw_field] == "":
                for cc in plan.coding_configurations:
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme,
                        cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Mark data that is noise as Codes.NOT_CODED
    for td in data:
        if td.get("noise", False):
            nc_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                for cc in plan.coding_configurations:
                    # Only label fields that were not already labelled above.
                    if cc.coded_field not in td:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()).to_dict()
                        nc_dict[cc.coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
            td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Run code imputation functions
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.code_imputation_function is not None:
            plan.code_imputation_function(user, data, plan.coding_configurations)

    cls._impute_coding_error_codes(user, data)

    return data
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    """
    Filters, noise-tags, and missing-labels the radio-show messages, then exports
    non-noise messages to Coda (for manual coding) and a sample to CSV (for ICR).

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :param data: TracedData objects to auto-code.
    :param icr_output_dir: Directory to write the ICR CSV sample files to.
    :param coda_output_dir: Directory to write the Coda V2 export files to.
    :return: The filtered `data` (test/empty/out-of-range messages removed).
    """
    # Filter out test messages sent by AVF.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE, cls.PROJECT_END_DATE)

    # Tag messages which are noise as being noise.
    # A run is noise only if every RQA answer it contains looks like noise.
    for td in data:
        is_noise = True
        for rqa_key in cls.RQA_KEYS:
            if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
                is_noise = False
        td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                # Reasons scheme is multi-coded (list of labels) ...
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                # ... while the binary scheme, where present, is single-coded (one label dict).
                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            # Empty scheme map: messages are exported uncoded, for manual coding in Coda.
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            # This test works because the only codes which have been applied at this point are TRUE_MISSING.
            # If any other coding is done above, this test will need to change.
            if plan.coded_field not in td:
                rqa_messages.append(td)
            else:
                assert len(td[plan.coded_field]) == 1
                assert td[plan.coded_field][0]["CodeID"] == \
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

        # Fixed seed so the ICR sample is reproducible across pipeline runs.
        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    """
    Merges manually coded radio-show, demographic, and follow-up Coda files into the
    dataset, labels missing/empty responses, synchronises binary/reasons schemes, and
    imputes location and coding-error codes.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :param data: TracedData objects to apply the manual codes to.
    :param coda_input_dir: Directory containing the manually coded Coda V2 files to import.
    :return: The same `data` iterable, with coded fields appended to each TracedData.
    """
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            # Reasons scheme is multi-coded; the import also handles f=None (file absent).
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                # Re-read the same file for the single-coded binary scheme.
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Label the RQA for which there is no response yet as TRUE MISSING, and empty-string
    # responses as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
            elif td[plan.raw_field] == "":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [nc_label.to_dict()]

                # BUGFIX: this was previously a separate
                # `elif plan.binary_code_scheme is not None and td[plan.raw_field] == "":`
                # branch, which was unreachable because the `elif td[plan.raw_field] == "":`
                # branch above always matched first, so the binary scheme was never labelled
                # NOT_CODED for empty responses. It also stored the label as a list, whereas
                # the binary field is single-coded everywhere else (the TRUE_MISSING
                # assignment above, and the sync block below which reads
                # td[plan.binary_coded_field]["CodeID"]).
                if plan.binary_code_scheme is not None:
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = nc_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
    # if there is an additional reason given. Importing those two schemes separately above caused the labels in
    # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
    # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
    # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        if plan.binary_code_scheme is not None:
            for td in rqa_messages:
                binary_label = td[plan.binary_coded_field]
                binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                binary_label_present = binary_label["CodeID"] != \
                    plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id
                reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0]["CodeID"] != \
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                if binary_label_present and not reasons_label_present:
                    if binary_code.code_type == "Control":
                        # Mirror the binary control code into the reasons scheme.
                        control_code = binary_code.control_code
                        reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                        reasons_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, reasons_code, Metadata.get_call_location(),
                            origin_name="Pipeline Code Synchronisation")
                        td.append_data(
                            {plan.coded_field: [reasons_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
                    else:
                        # Binary was a normal code but no reason was given: reasons scheme gets NC.
                        assert binary_code.code_type == "Normal"
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme,
                            plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location(),
                            origin_name="Pipeline Code Synchronisation")
                        td.append_data(
                            {plan.coded_field: [nc_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Merge manually coded demog and follow-up survey files into the cleaned dataset
    # Recursion depth currently exceeding
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(10000)
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        f = None
        try:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Not everyone will have answered all of the demographic and follow-up survey flows.
    # Label demographic and follow-up survey questions which had no responses as TRUE_MISSING.
    # Label data which is just the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = na_label.to_dict()
            elif td[plan.raw_field] == "":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme,
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = nc_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Set county/constituency from the coded constituency field.
    cls._impute_location_codes(user, data)

    # Set coding error codes using the coding error field
    cls._impute_coding_error_codes(user, data)

    return data
def move_wrong_scheme_messages(user, data, coda_input_dir):
    """
    Moves data labelled 'Wrong Scheme' in Coda to the dataset it should have been in,
    checks for WS coding errors, and returns a new, corrected list of TracedData.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :param data: TracedData objects to WS-correct.
    :param coda_input_dir: Directory containing the manually coded Coda V2 files to import.
    :return: New list of TracedData with the WS data moved to its correct fields.
    """
    log.info("Importing manually coded Coda files to '_WS' fields...")
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, f"{plan.id_field}_WS")

        # Import the 'WS - Correct Dataset' scheme labels into a '_WS_correct_dataset' field.
        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, f"{plan.id_field}_WS",
                {f"{plan.raw_field}_WS_correct_dataset": PipelineConfiguration.WS_CORRECT_DATASET_SCHEME}, f)

        # Import each coding configuration's labels into shadow '_WS' fields.
        for cc in plan.coding_configurations:
            with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field + "_WS",
                        {f"{cc.coded_field}_WS": cc.code_scheme}, f)
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, f"{plan.id_field}_WS",
                        {f"{cc.coded_field}_WS": cc.code_scheme}, f)

    log.info("Checking for WS Coding Errors...")
    # Check for coding errors: a message labelled WS in its own scheme must also have a
    # target dataset in the WS scheme, and vice-versa; flag any mismatch as CODING_ERROR.
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            rqa_codes = []
            for cc in plan.coding_configurations:
                if cc.coding_mode == CodingModes.SINGLE:
                    if f"{cc.coded_field}_WS" in td:
                        label = td[f"{cc.coded_field}_WS"]
                        rqa_codes.append(cc.code_scheme.get_code_with_code_id(label["CodeID"]))
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    for label in td.get(f"{cc.coded_field}_WS", []):
                        rqa_codes.append(cc.code_scheme.get_code_with_code_id(label["CodeID"]))

            has_ws_code_in_code_scheme = False
            for code in rqa_codes:
                if code.control_code == Codes.WRONG_SCHEME:
                    has_ws_code_in_code_scheme = True

            has_ws_code_in_ws_scheme = False
            if f"{plan.raw_field}_WS_correct_dataset" in td:
                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                has_ws_code_in_ws_scheme = ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED

            if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                log.warning(f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}")
                coding_error_dict = {
                    f"{plan.raw_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME,
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_control_code(
                                Codes.CODING_ERROR),
                            Metadata.get_call_location(),
                        ).to_dict()
                }
                td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Construct a map from WS normal code id to the raw field that code indicates a requested move to.
    ws_code_to_raw_field_map = dict()
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.ws_code is not None:
            ws_code_to_raw_field_map[plan.ws_code.code_id] = plan.raw_field

    # Group the TracedData by uid.
    data_grouped_by_uid = dict()
    for td in data:
        uid = td["uid"]
        if uid not in data_grouped_by_uid:
            data_grouped_by_uid[uid] = []
        data_grouped_by_uid[uid].append(td)

    # Perform the WS correction for each uid.
    log.info("Performing WS correction...")
    corrected_data = []  # List of TracedData with the WS data moved.
    # 'WS - Correct Dataset' codes with no matching code id in any coding plan
    # for this project, with a count of the occurrences.
    unknown_target_code_counts = dict()
    for group in data_grouped_by_uid.values():
        # Find all the surveys data being moved.
        # (Note: we only need to check one td in this group because all the demographics are the same)
        td = group[0]
        survey_moves = dict()  # of source_field -> target_field
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td or plan.coda_filename is None:
                continue

            ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
            if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                if ws_code.code_id in ws_code_to_raw_field_map:
                    survey_moves[plan.raw_field] = ws_code_to_raw_field_map[ws_code.code_id]
                else:
                    if (ws_code.code_id, ws_code.display_text) not in unknown_target_code_counts:
                        unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] = 0
                    unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] += 1
                    survey_moves[plan.raw_field] = None

        # Find all the RQA data being moved.
        rqa_moves = dict()  # of (index in group, source_field) -> target_field
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td or plan.coda_filename is None:
                    continue

                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                    if ws_code.code_id in ws_code_to_raw_field_map:
                        rqa_moves[(i, plan.raw_field)] = ws_code_to_raw_field_map[ws_code.code_id]
                    else:
                        if (ws_code.code_id, ws_code.display_text) not in unknown_target_code_counts:
                            unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] = 0
                        unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] += 1
                        rqa_moves[(i, plan.raw_field)] = None

        # Build a dictionary of the survey fields that haven't been moved, and cleared fields for those which have.
        survey_updates = dict()  # of raw_field -> updated value
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.coda_filename is None:
                continue

            if plan.raw_field in survey_moves.keys():
                # Data is moving
                survey_updates[plan.raw_field] = []
            elif plan.raw_field in td:
                # Data is not moving
                survey_updates[plan.raw_field] = [
                    _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)]

        # Build a list of the rqa fields that haven't been moved.
        rqa_updates = []  # of (raw_field, _WSUpdate)
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.coda_filename is None:
                    continue

                if plan.raw_field in td:
                    if (i, plan.raw_field) in rqa_moves.keys():
                        # Data is moving
                        pass
                    else:
                        # Data is not moving
                        rqa_updates.append(
                            (plan.raw_field,
                             _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)))

        # Add data moving from survey fields to the relevant survey_/rqa_updates
        raw_survey_fields = {plan.raw_field for plan in PipelineConfiguration.SURVEY_CODING_PLANS}
        raw_rqa_fields = {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in survey_moves:
                continue
            target_field = survey_moves[plan.raw_field]
            if target_field is None:
                continue

            update = _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)
            if target_field in raw_survey_fields:
                survey_updates[target_field] = survey_updates.get(target_field, []) + [update]
            else:
                assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                rqa_updates.append((target_field, update))

        # Add data moving from RQA fields to the relevant survey_/rqa_updates
        for (i, source_field), target_field in rqa_moves.items():
            if target_field is None:
                continue

            for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field == source_field:
                    _td = group[i]
                    # BUGFIX: the update's source TracedData was previously `td` — the stale
                    # loop variable left over from the preceding `enumerate(group)` loop
                    # (i.e. the group's last element) — instead of `_td = group[i]`, the
                    # message actually being moved. Since `update.source_td.copy()` below is
                    # the base for the corrected TracedData, the moved RQA message could be
                    # grafted onto the wrong source record.
                    update = _WSUpdate(_td[plan.raw_field], _td[plan.time_field], plan.raw_field, _td)
                    if target_field in raw_survey_fields:
                        survey_updates[target_field] = survey_updates.get(target_field, []) + [update]
                    else:
                        assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                        rqa_updates.append((target_field, update))

        # Re-format the survey updates to a form suitable for use by the rest of the pipeline
        flattened_survey_updates = {}
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field in survey_updates:
                plan_updates = survey_updates[plan.raw_field]

                if len(plan_updates) > 0:
                    flattened_survey_updates[plan.raw_field] = "; ".join([u.message for u in plan_updates])
                    flattened_survey_updates[plan.time_field] = sorted([u.timestamp for u in plan_updates])[0]
                    flattened_survey_updates[f"{plan.raw_field}_source"] = "; ".join(
                        [u.source_field for u in plan_updates])
                else:
                    flattened_survey_updates[plan.raw_field] = None
                    flattened_survey_updates[plan.time_field] = None
                    flattened_survey_updates[f"{plan.raw_field}_source"] = None

        # For each RQA message, create a copy of its source td, append the updated TracedData, and add this to
        # the list of TracedData to be returned
        raw_field_to_rqa_plan_map = {plan.raw_field: plan for plan in PipelineConfiguration.RQA_CODING_PLANS}
        for target_field, update in rqa_updates:
            corrected_td = update.source_td.copy()

            # Hide the survey keys currently in the TracedData which have had data moved away.
            corrected_td.hide_keys(
                {k for k, v in flattened_survey_updates.items() if v is None}.intersection(corrected_td.keys()),
                Metadata(user, Metadata.get_call_location(), time.time()))

            # Update with the corrected survey data
            corrected_td.append_data(
                {k: v for k, v in flattened_survey_updates.items() if v is not None},
                Metadata(user, Metadata.get_call_location(), time.time()))

            # Hide all the RQA fields (they will be added back, in turn, in the next step).
            corrected_td.hide_keys(
                {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(
                    corrected_td.keys()),
                Metadata(user, Metadata.get_call_location(), time.time()))
            corrected_td.hide_keys(
                {plan.time_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(
                    corrected_td.keys()),
                Metadata(user, Metadata.get_call_location(), time.time()))

            target_coding_plan = raw_field_to_rqa_plan_map[target_field]
            rqa_dict = {
                target_field: update.message,
                target_coding_plan.time_field: update.timestamp,
                f"{target_field}_source": update.source_field
            }
            corrected_td.append_data(rqa_dict, Metadata(user, Metadata.get_call_location(), time.time()))

            corrected_data.append(corrected_td)

    if len(unknown_target_code_counts) > 0:
        log.warning("Found the following 'WS - Correct Dataset' CodeIDs with no matching coding plan:")
        for (code_id, display_text), count in unknown_target_code_counts.items():
            log.warning(f" '{code_id}' (DisplayText '{display_text}') ({count} occurrences)")

    return corrected_data
def _impute_coding_error_codes(user, data):
    """
    Flags inconsistencies between each coding plan's applied codes and its
    '_correct_dataset' (WS) field, labelling both sides CODING_ERROR on mismatch.

    :param user: Identifier of the user running this program, for TracedData Metadata.
    :param data: TracedData objects to check for coding errors.
    """
    for td in data:
        error_labels = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            # Collect every code currently applied under this plan's coding configurations.
            applied_codes = []
            for cc in plan.coding_configurations:
                if cc.coding_mode == CodingModes.SINGLE:
                    if cc.coded_field in td:
                        applied_codes.append(
                            cc.code_scheme.get_code_with_code_id(td[cc.coded_field]["CodeID"]))
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    applied_codes.extend(
                        cc.code_scheme.get_code_with_code_id(label["CodeID"])
                        for label in td.get(cc.coded_field, []))

            # Was this message coded 'wrong scheme' in its own scheme(s)?
            labelled_ws_in_code_scheme = any(
                code.control_code == Codes.WRONG_SCHEME for code in applied_codes)

            # Was a move target assigned in the 'WS - Correct Dataset' scheme?
            correct_dataset_key = f"{plan.raw_field}_correct_dataset"
            labelled_ws_in_ws_scheme = False
            if correct_dataset_key in td:
                ws_code = CodeSchemes.WS_CORRECT_DATASET.get_code_with_code_id(
                    td[correct_dataset_key]["CodeID"])
                labelled_ws_in_ws_scheme = \
                    ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED

            # A mismatch in either direction is a coding error: overwrite both sides.
            if labelled_ws_in_code_scheme != labelled_ws_in_ws_scheme:
                log.warning(f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}")

                error_labels[correct_dataset_key] = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.WS_CORRECT_DATASET,
                    CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR),
                    Metadata.get_call_location(),
                ).to_dict()

                for cc in plan.coding_configurations:
                    if cc.coding_mode == CodingModes.SINGLE:
                        error_labels[cc.coded_field] = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        error_labels[cc.coded_field] = [
                            CleaningUtils.make_label_from_cleaner_code(
                                cc.code_scheme,
                                cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                Metadata.get_call_location()
                            ).to_dict()
                        ]

        td.append_data(error_labels, Metadata(user, Metadata.get_call_location(), time.time()))