def test_fold_groups(self):
    """Checks that FoldTracedData.fold_groups folds each group down to a single object using fold_fn."""
    # One TracedData per letter, timestamped with its position in the list.
    data = []
    for timestamp, letter in enumerate(["a", "b", "c", "d", "e"]):
        metadata = Metadata("test_user", Metadata.get_call_location(), timestamp)
        data.append(TracedData({"x": letter}, metadata))

    # A single-item group, a three-item group, and another single-item group.
    groups = [data[0:1], data[1:4], data[4:5]]

    def fold_fn(td_1, td_2):
        # Work on copies so the objects passed in are never modified.
        left = td_1.copy()
        right = td_2.copy()

        merged = {"x": "{}{}".format(left["x"], right["x"])}

        left.append_data(merged, Metadata("test_user", Metadata.get_call_location(), 10))
        right.append_data(merged, Metadata("test_user", Metadata.get_call_location(), 11))
        left.append_traced_data("folded_with", right, Metadata("test_user", Metadata.get_call_location(), 12))

        return left

    folded_data = FoldTracedData.fold_groups(groups, fold_fn)

    expected_dicts = [{"x": "a"}, {"x": "bcd"}, {"x": "e"}]
    for index, expected in enumerate(expected_dicts):
        self.assertDictEqual(dict(folded_data[index].items()), expected)
def generate(cls, user, data, pipeline_configuration, raw_data_dir, csv_by_message_output_path,
             csv_by_individual_output_path):
    """
    Exports `data` to two CSVs via `cls.export_to_csv`: one from the unfolded data and one from the data
    folded by "uid".

    :param user: Identifier of the user running this program; passed through to the consent, folding and
                 export helpers.
    :param data: Objects to export. Each is copied with `.copy()` before folding and indexed by "uid".
    :param pipeline_configuration: Configuration whose `pipeline_name` selects whether listening group keys
                                   are exported.
    :param raw_data_dir: Passed through unchanged to `cls.export_to_csv`.
    :param csv_by_message_output_path: Output path for the export of the unfolded data.
    :param csv_by_individual_output_path: Output path for the export of the folded data.
    :return: Tuple of (data, folded_data).
    """
    # Serializer is currently overflowing
    # TODO: Investigate/address the cause of this.
    # sys.setrecursionlimit(15000)

    kakuma_pipelines = (
        "kakuma_s01_pipeline", "kakuma_s02_pipeline", "kakuma_s03_pipeline", "kakuma_all_seasons_pipeline"
    )
    dadaab_pipelines = (
        "dadaab_s01_pipeline", "dadaab_s02_pipeline", "dadaab_s03_pipeline", "dadaab_all_seasons_pipeline"
    )
    coding_plans = PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS

    # Set consent withdrawn based on presence of data coded as "stop"
    consent_withdrawn_key = "consent_withdrawn"
    ConsentUtils.determine_consent_withdrawn(user, data, coding_plans, consent_withdrawn_key)

    # Keys to export, and the strategy to apply to each field when folding.
    export_keys = ["uid", consent_withdrawn_key]
    fold_strategies = OrderedDict([
        ("uid", FoldStrategies.assert_equal),
        (consent_withdrawn_key, FoldStrategies.boolean_or),
    ])

    # Export listening group bool keys in analysis files headers only when running kakuma_pipeline because
    # dadaab does not have listening groups.
    pipeline_name = pipeline_configuration.pipeline_name
    if pipeline_name in kakuma_pipelines:
        export_keys.append("repeat_listening_group_participant")
        export_keys.extend(
            f'{plan.dataset_name}_listening_group_participant' for plan in PipelineConfiguration.RQA_CODING_PLANS
        )
    else:
        assert pipeline_name in dadaab_pipelines, \
            "PipelineName must be either a 'seasonal pipeline' or 'all seasons pipeline'"

    for plan in coding_plans:
        for cc in plan.coding_configurations:
            # Configurations without an analysis file key are not exported or folded.
            if cc.analysis_file_key is None:
                continue

            if cc.coding_mode == CodingModes.SINGLE:
                export_keys.append(cc.analysis_file_key)
            else:
                assert cc.coding_mode == CodingModes.MULTIPLE
                # One export column per code in the scheme.
                export_keys.extend(
                    f"{cc.analysis_file_key}{code.string_value}" for code in cc.code_scheme.codes
                )

            fold_strategies[cc.coded_field] = cc.fold_strategy

        export_keys.append(plan.raw_field)
        fold_strategies[plan.raw_field] = plan.raw_field_fold_strategy

    # Fold copies of the data, grouped by "uid".
    to_be_folded = [td.copy() for td in data]
    folded_data = FoldTracedData.fold_iterable_of_traced_data(
        user, to_be_folded, lambda td: td["uid"], fold_strategies
    )

    exports = (
        (data, csv_by_message_output_path),
        (folded_data, csv_by_individual_output_path),
    )
    for export_data, output_path in exports:
        cls.export_to_csv(user, export_data, pipeline_configuration, raw_data_dir, output_path,
                          export_keys, consent_withdrawn_key)

    return data, folded_data
def generate(cls, user, data, csv_by_message_output_path, csv_by_individual_output_path):
    """
    Exports `data` to two CSVs via `cls.export_to_csv`: a MESSAGES_FILE export of the unfolded data and an
    INDIVIDUALS_FILE export of the data folded by "uid".

    Raises the interpreter's recursion limit to 15000 as a side effect.

    :param user: Identifier of the user running this program; passed through to the consent and folding
                 helpers.
    :param data: Objects to export. Each is copied with `.copy()` before folding and indexed by "uid".
    :param csv_by_message_output_path: Output path for the MESSAGES_FILE export.
    :param csv_by_individual_output_path: Output path for the INDIVIDUALS_FILE export.
    :return: Tuple of (data, folded_data).
    """
    # Serializer is currently overflowing
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(15000)

    coding_plans = PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS

    # Set consent withdrawn based on presence of data coded as "stop"
    consent_withdrawn_key = "consent_withdrawn"
    ConsentUtils.determine_consent_withdrawn(user, data, coding_plans, consent_withdrawn_key)

    # Keys to export, and the strategy to apply to each field when folding.
    export_keys = ["uid", consent_withdrawn_key]
    fold_strategies = OrderedDict([
        ("uid", FoldStrategies.assert_equal),
        (consent_withdrawn_key, FoldStrategies.boolean_or),
    ])

    for plan in coding_plans:
        for cc in plan.coding_configurations:
            # Configurations without an analysis file key are neither exported nor folded.
            if cc.analysis_file_key is None:
                continue

            if cc.coding_mode == CodingModes.SINGLE:
                export_keys.append(cc.analysis_file_key)
            else:
                assert cc.coding_mode == CodingModes.MULTIPLE
                # One export column per code in the scheme.
                export_keys.extend(
                    f"{cc.analysis_file_key}_{code.string_value}" for code in cc.code_scheme.codes
                )

            if cc.include_in_individuals_file:
                fold_strategies[cc.coded_field] = cc.fold_strategy

        export_keys.append(plan.raw_field)
        fold_strategies[plan.raw_field] = plan.raw_field_fold_strategy

    # Fold copies of the data, grouped by "uid".
    to_be_folded = [td.copy() for td in data]
    folded_data = FoldTracedData.fold_iterable_of_traced_data(
        user, to_be_folded, lambda td: td["uid"], fold_strategies
    )

    # Apply the consent-withdrawn handling to both datasets before anything is written.
    for dataset in (data, folded_data):
        ConsentUtils.set_stopped(user, dataset, consent_withdrawn_key)

    cls.export_to_csv(MESSAGES_FILE, data, csv_by_message_output_path, export_keys, consent_withdrawn_key)
    cls.export_to_csv(INDIVIDUALS_FILE, folded_data, csv_by_individual_output_path, export_keys,
                      consent_withdrawn_key)

    return data, folded_data
def test_fold_traced_data(self):
    """Folds two TracedData objects with one strategy per key and checks the inputs and the folded result."""
    first_dict = {
        "equal_1": 4, "equal_2": "xyz",
        "concat": "abc",
        "matrix_1": Codes.MATRIX_0, "matrix_2": Codes.MATRIX_0,
        "bool_1": Codes.FALSE, "bool_2": Codes.TRUE,
        "yes_no_1": Codes.YES, "yes_no_2": Codes.YES,
        "other_1": "other 1", "other_2": "other 2"
    }
    second_dict = {
        "equal_1": 4, "equal_2": "xyz",
        "concat": "def",
        "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
        "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
        "yes_no_1": Codes.YES, "yes_no_2": Codes.NO,
        "other_1": "other",
    }

    first_td = TracedData(first_dict, Metadata("test_user", Metadata.get_call_location(), 0))
    second_td = TracedData(second_dict, Metadata("test_user", Metadata.get_call_location(), 1))

    # The "other_*" keys are deliberately given no strategy.
    fold_strategies = {}
    for key in ("equal_1", "equal_2"):
        fold_strategies[key] = FoldStrategies.assert_equal
    fold_strategies["concat"] = FoldStrategies.concatenate
    for key in ("bool_1", "bool_2"):
        fold_strategies[key] = FoldStrategies.boolean_or
    for key in ("matrix_1", "matrix_2"):
        fold_strategies[key] = FoldStrategies.matrix
    for key in ("yes_no_1", "yes_no_2"):
        fold_strategies[key] = FoldStrategies.yes_no_amb

    folded_td = FoldTracedData.fold_traced_data("test_user", first_td, second_td, fold_strategies)

    # Test input tds unchanged
    for td, original_dict in ((first_td, first_dict), (second_td, second_dict)):
        self.assertDictEqual(dict(td.items()), original_dict)

    # Test folded td has expected values
    expected_folded = {
        "equal_1": 4, "equal_2": "xyz",
        "concat": "abc;def",
        "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
        "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
        "yes_no_1": Codes.YES, "yes_no_2": Codes.AMBIVALENT
    }
    self.assertDictEqual(dict(folded_td.items()), expected_folded)
def test_group_by(self):
    """Checks that FoldTracedData.group_by groups objects by "id", keeping input order within each group."""
    rows = [("a", "4"), ("b", "5"), ("a", "6"), ("a", "7"), ("c", "8"), ("b", "9")]
    flat_dicts = [{"id": row_id, "x": x} for row_id, x in rows]
    flat_data = self.make_traced_data(flat_dicts)

    groups = list(FoldTracedData.group_by(flat_data, lambda td: td["id"]))
    self.assertEqual(len(groups), 3)

    # Order the groups by id so they can be checked positionally.
    groups = sorted(groups, key=lambda group: group[0]["id"])

    expected_xs = [["4", "6", "7"], ["5", "9"], ["8"]]
    for index, xs in enumerate(expected_xs):
        self.assertListEqual([td["x"] for td in groups[index]], xs)
def generate(user, data, csv_by_message_output_path, csv_by_individual_output_path):
    """
    Generates two analysis CSVs from `data`: one with a row per object in `data` and one with a row per "uid".

    Every object in `data` is updated in place (via `append_data`) with a consent-withdrawn flag, the string
    values of its survey and binary RQA codes, and the matrix keys set by `AnalysisKeys.set_matrix_keys`.
    Copies of the updated objects are then folded by "uid", and both datasets are written to CSV.

    Raises the interpreter's recursion limit to 15000 as a side effect.

    :param user: Identifier of the user running this program, recorded in the Metadata of every update.
    :param data: Objects to export. Iterated several times, so this must be a re-iterable collection
                 (e.g. a list) rather than a one-shot generator.
    :param csv_by_message_output_path: Path to write the unfolded CSV to.
    :param csv_by_individual_output_path: Path to write the folded CSV to.
    :return: `data`, after the updates described above. The folded data is not returned.
    """
    # Serializer is currently overflowing
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(15000)

    # Start with consent not withdrawn for every object; it is set to TRUE further down where stop codes are found.
    consent_withdrawn_key = "consent_withdrawn"
    for td in data:
        td.append_data({consent_withdrawn_key: Codes.FALSE},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Collect the raw/coded survey keys, without duplicates. These are exported, and are passed to the fold as
    # `equal_keys`.
    survey_keys = []
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.analysis_file_key is not None and plan.analysis_file_key not in survey_keys:
            survey_keys.append(plan.analysis_file_key)
        if plan.raw_field not in survey_keys:
            survey_keys.append(plan.raw_field)

    # Convert survey codes to their string values
    for td in data:
        td.append_data(
            {
                plan.analysis_file_key:
                    plan.code_scheme.get_code_with_id(td[plan.coded_field]["CodeID"]).string_value
                for plan in PipelineConfiguration.SURVEY_CODING_PLANS
                if plan.analysis_file_key is not None
            },
            Metadata(user, Metadata.get_call_location(), time.time())
        )

    # Convert RQA binary codes to their string values
    for td in data:
        td.append_data(
            {
                plan.binary_analysis_file_key:
                    plan.binary_code_scheme.get_code_with_id(td[plan.binary_coded_field]["CodeID"]).string_value
                for plan in PipelineConfiguration.RQA_CODING_PLANS
                if plan.binary_code_scheme is not None
            },
            Metadata(user, Metadata.get_call_location(), time.time())
        )

    # Translate the RQA reason codes to matrix values
    matrix_keys = []
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        show_matrix_keys = [f"{plan.analysis_file_key}{code.string_value}" for code in plan.code_scheme.codes]

        AnalysisKeys.set_matrix_keys(user, data, show_matrix_keys, plan.code_scheme, plan.coded_field,
                                     plan.analysis_file_key)

        matrix_keys.extend(show_matrix_keys)

    binary_keys = [
        plan.binary_analysis_file_key
        for plan in PipelineConfiguration.RQA_CODING_PLANS
        if plan.binary_analysis_file_key is not None
    ]

    equal_keys = ["uid"]
    equal_keys.extend(survey_keys)

    concat_keys = [plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS]

    bool_keys = [
        consent_withdrawn_key,

        # "sms_ad",
        # "radio_promo",
        # "radio_show",
        # "non_logical_time",
        # "radio_participation_s02e01",
        # "radio_participation_s02e02",
        # "radio_participation_s02e03",
        # "radio_participation_s02e04",
        # "radio_participation_s02e05",
        # "radio_participation_s02e06",
    ]

    # Column order of the exported CSVs
    export_keys = ["uid"]
    export_keys.extend(bool_keys)
    export_keys.extend(matrix_keys)
    export_keys.extend(binary_keys)
    export_keys.extend(concat_keys)
    export_keys.extend(survey_keys)

    # Set consent withdrawn based on presence of data coded as "stop"
    ConsentUtils.determine_consent_withdrawn(
        user, data, PipelineConfiguration.SURVEY_CODING_PLANS, consent_withdrawn_key)

    # Set consent withdrawn based on stop codes from radio question answers
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if td[f"{plan.analysis_file_key}{Codes.STOP}"] == Codes.MATRIX_1:
                td.append_data({consent_withdrawn_key: Codes.TRUE},
                               Metadata(user, Metadata.get_call_location(), time.time()))

            if plan.binary_code_scheme is not None:
                stop_code_id = plan.binary_code_scheme.get_code_with_control_code(Codes.STOP).code_id
                if td[plan.binary_coded_field]["CodeID"] == stop_code_id:
                    td.append_data({consent_withdrawn_key: Codes.TRUE},
                                   Metadata(user, Metadata.get_call_location(), time.time()))

    # Fold data to have one respondent per row.
    # Fold copies rather than the originals so that the updates made to the folded objects below cannot reach
    # the objects in `data` (the copies were previously built but `data` itself was passed to the fold).
    to_be_folded = [td.copy() for td in data]

    folded_data = FoldTracedData.fold_iterable_of_traced_data(
        user, to_be_folded, fold_id_fn=lambda td: td["uid"],
        equal_keys=equal_keys, concat_keys=concat_keys, matrix_keys=matrix_keys, bool_keys=bool_keys,
        binary_keys=binary_keys
    )

    # Fix-up _NA and _NC keys, which are currently being set incorrectly by
    # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
    # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
    for td in folded_data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            # A non-empty raw field means there was a response, so the TRUE_MISSING key is cleared.
            if td.get(plan.raw_field, "") != "":
                td.append_data(
                    {f"{plan.analysis_file_key}{Codes.TRUE_MISSING}": Codes.MATRIX_0},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                )

            # Set the NOT_CODED key if none of this plan's other matrix keys is set.
            contains_non_nc_key = any(
                key.startswith(plan.analysis_file_key) and not key.endswith(Codes.NOT_CODED)
                and td.get(key) == Codes.MATRIX_1
                for key in matrix_keys
            )
            if not contains_non_nc_key:
                td.append_data(
                    {f"{plan.analysis_file_key}{Codes.NOT_CODED}": Codes.MATRIX_1},
                    Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                )

    # Process consent
    ConsentUtils.set_stopped(user, data, consent_withdrawn_key, additional_keys=export_keys)
    ConsentUtils.set_stopped(user, folded_data, consent_withdrawn_key, additional_keys=export_keys)

    # Output to CSV with one message per row
    with open(csv_by_message_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=export_keys)

    # Output to CSV with one respondent ("uid") per row
    with open(csv_by_individual_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(folded_data, f, headers=export_keys)

    return data
def generate(user, data, csv_by_message_output_path, csv_by_individual_output_path):
    """
    Generates two analysis CSVs from `data`: one with a row per object in `data` and one with a row per "uid".

    Every object in `data` is updated in place (via `append_data`) with a consent-withdrawn flag and with the
    string/matrix analysis values derived from its coded fields. Copies of the updated objects are then
    folded by "uid", and both datasets are written to CSV.

    Raises the interpreter's recursion limit to 15000 as a side effect.

    :param user: Identifier of the user running this program, recorded in the Metadata of every update.
    :param data: Objects to export. Iterated several times, so this must be a re-iterable collection
                 (e.g. a list) rather than a one-shot generator.
    :param csv_by_message_output_path: Path to write the unfolded CSV to.
    :param csv_by_individual_output_path: Path to write the folded CSV to.
    :return: Tuple of (data, folded_data).
    :raises AssertionError: If a coding configuration or raw field has a folding mode that is not handled here.
    """
    # Serializer is currently overflowing
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(15000)

    coding_plans = PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS

    # Start with consent not withdrawn for every object; determine_consent_withdrawn updates this further down.
    consent_withdrawn_key = "consent_withdrawn"
    for td in data:
        td.append_data({consent_withdrawn_key: Codes.FALSE},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Set the list of keys to be exported and how they are to be handled when folding
    export_keys = ["uid", consent_withdrawn_key]
    bool_keys = [
        consent_withdrawn_key

        # "sms_ad",
        # "radio_promo",
        # "radio_show",
        # "non_logical_time",
        # "radio_participation_s02e01",
        # "radio_participation_s02e02",
        # "radio_participation_s02e03",
        # "radio_participation_s02e04",
        # "radio_participation_s02e05",
        # "radio_participation_s02e06",
    ]
    equal_keys = ["uid"]
    concat_keys = []
    matrix_keys = []
    binary_keys = []
    for plan in coding_plans:
        for cc in plan.coding_configurations:
            # Configurations without an analysis file key are not exported.
            if cc.analysis_file_key is None:
                continue

            if cc.coding_mode == CodingModes.SINGLE:
                export_keys.append(cc.analysis_file_key)

                if cc.folding_mode == FoldingModes.ASSERT_EQUAL:
                    equal_keys.append(cc.analysis_file_key)
                elif cc.folding_mode == FoldingModes.YES_NO_AMB:
                    binary_keys.append(cc.analysis_file_key)
                else:
                    # Report the mode of the configuration being checked. This message previously read
                    # `plan.folding_mode`, which is not the value tested above.
                    assert False, f"Incompatible folding_mode {cc.folding_mode}"
            else:
                assert cc.folding_mode == FoldingModes.MATRIX

                # One exported (and matrix-folded) column per code in the scheme.
                for code in cc.code_scheme.codes:
                    export_keys.append(f"{cc.analysis_file_key}{code.string_value}")
                    matrix_keys.append(f"{cc.analysis_file_key}{code.string_value}")

        export_keys.append(plan.raw_field)
        if plan.raw_field_folding_mode == FoldingModes.CONCATENATE:
            concat_keys.append(plan.raw_field)
        elif plan.raw_field_folding_mode == FoldingModes.ASSERT_EQUAL:
            equal_keys.append(plan.raw_field)
        else:
            assert False, f"Incompatible raw_field_folding_mode {plan.raw_field_folding_mode}"

    # Convert codes to their string/matrix values
    for td in data:
        analysis_dict = dict()
        for plan in coding_plans:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    analysis_dict[cc.analysis_file_key] = \
                        cc.code_scheme.get_code_with_id(td[cc.coded_field]["CodeID"]).string_value
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE

                    show_matrix_keys = [
                        f"{cc.analysis_file_key}{code.string_value}" for code in cc.code_scheme.codes
                    ]

                    # Mark the key of every code this object was labelled with...
                    for label in td.get(cc.coded_field, []):
                        code_string_value = cc.code_scheme.get_code_with_id(label['CodeID']).string_value
                        analysis_dict[f"{cc.analysis_file_key}{code_string_value}"] = Codes.MATRIX_1

                    # ...and explicitly zero the keys of all the scheme's other codes.
                    for key in show_matrix_keys:
                        if key not in analysis_dict:
                            analysis_dict[key] = Codes.MATRIX_0
        td.append_data(analysis_dict,
                       Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Set consent withdrawn based on presence of data coded as "stop"
    ConsentUtils.determine_consent_withdrawn(user, data, coding_plans, consent_withdrawn_key)

    # Fold data to have one respondent per row.
    # Fold copies rather than the originals so that the updates made to the folded objects below cannot reach
    # the objects in `data` (the copies were previously built but `data` itself was passed to the fold).
    to_be_folded = [td.copy() for td in data]

    folded_data = FoldTracedData.fold_iterable_of_traced_data(
        user, to_be_folded, fold_id_fn=lambda td: td["uid"],
        equal_keys=equal_keys, concat_keys=concat_keys, matrix_keys=matrix_keys, bool_keys=bool_keys,
        binary_keys=binary_keys
    )

    # Fix-up _NA and _NC keys, which are currently being set incorrectly by
    # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
    # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
    for td in folded_data:
        for plan in coding_plans:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.MULTIPLE:
                    # A non-empty raw field means there was a response, so the TRUE_MISSING key is cleared.
                    if td.get(plan.raw_field, "") != "":
                        td.append_data(
                            {f"{cc.analysis_file_key}{Codes.TRUE_MISSING}": Codes.MATRIX_0},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                        )

                    # Set the NOT_CODED key if none of this configuration's other matrix keys is set.
                    contains_non_nc_key = any(
                        key.startswith(cc.analysis_file_key) and not key.endswith(Codes.NOT_CODED)
                        and td.get(key) == Codes.MATRIX_1
                        for key in matrix_keys
                    )
                    if not contains_non_nc_key:
                        td.append_data(
                            {f"{cc.analysis_file_key}{Codes.NOT_CODED}": Codes.MATRIX_1},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                        )

    # Process consent
    ConsentUtils.set_stopped(user, data, consent_withdrawn_key, additional_keys=export_keys)
    ConsentUtils.set_stopped(user, folded_data, consent_withdrawn_key, additional_keys=export_keys)

    # Output to CSV with one message per row
    with open(csv_by_message_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=export_keys)

    # Output to CSV with one respondent ("uid") per row
    with open(csv_by_individual_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(folded_data, f, headers=export_keys)

    return data, folded_data
AnalysisKeys.set_matrix_keys(user, data, show_matrix_keys, plan, code_ids, plan.coded_field, plan.analysis_file_key) matrix_keys.extend(show_matrix_keys) matrix_keys.sort() equal_keys = ["avf_phone_id"] equal_keys.extend(survey_keys) print("Folding") folded = FoldTracedData.fold_iterable_of_traced_data( user, data, lambda td: (td["avf_phone_id"]), equal_keys=equal_keys, column_keys=column_keys, matrix_keys=matrix_keys) print("Post fold fix-up") # Determine which column keys were set by FoldTracedData.fold_iterable_of_traced_data folded_column_keys = set() for key in column_keys: for td in folded: i = 1 while "{} {}".format(key, i) in td: folded_column_keys.add("{} {}".format(key, i)) i += 1