def test_fold_groups(self):
        data = [TracedData({"x": c}, Metadata("test_user", Metadata.get_call_location(), i))
                for i, c in enumerate(["a", "b", "c", "d", "e"])]

        groups = [
            [data[0]],
            [data[1], data[2], data[3]],
            [data[4]]
        ]

        def fold_fn(td_1, td_2):
            td_1 = td_1.copy()
            td_2 = td_2.copy()

            folded_dict = {"x": "{}{}".format(td_1["x"], td_2["x"])}

            td_1.append_data(folded_dict, Metadata("test_user", Metadata.get_call_location(), 10))
            td_2.append_data(folded_dict, Metadata("test_user", Metadata.get_call_location(), 11))

            folded = td_1
            td_1.append_traced_data("folded_with", td_2, Metadata("test_user", Metadata.get_call_location(), 12))

            return folded

        folded_data = FoldTracedData.fold_groups(groups, fold_fn)

        self.assertDictEqual(dict(folded_data[0].items()), {"x": "a"})
        self.assertDictEqual(dict(folded_data[1].items()), {"x": "bcd"})
        self.assertDictEqual(dict(folded_data[2].items()), {"x": "e"})
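The test above only exercises the reduction behaviour of fold_groups: each group collapses to a single item by applying the fold function left to right. A minimal sketch of that behaviour, using plain strings instead of TracedData and a hypothetical fold_groups_sketch helper (not the library implementation):

from functools import reduce

def fold_groups_sketch(groups, fold_fn):
    # Hypothetical sketch: collapse each group to one element by repeatedly
    # applying fold_fn, left to right ("b", "c", "d" -> "bcd").
    return [reduce(fold_fn, group) for group in groups]

groups = [["a"], ["b", "c", "d"], ["e"]]
assert fold_groups_sketch(groups, lambda x, y: x + y) == ["a", "bcd", "e"]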
    def generate(cls, user, data, pipeline_configuration, raw_data_dir, csv_by_message_output_path, csv_by_individual_output_path):
        # Serializer is currently overflowing
        # TODO: Investigate/address the cause of this.
        # sys.setrecursionlimit(15000)

        # Set consent withdrawn based on presence of data coded as "stop"
        consent_withdrawn_key = "consent_withdrawn"
        ConsentUtils.determine_consent_withdrawn(
            user, data, PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS,
            consent_withdrawn_key
        )

        # Set the list of keys to be exported and how they are to be handled when folding
        fold_strategies = OrderedDict()
        fold_strategies["uid"] = FoldStrategies.assert_equal
        fold_strategies[consent_withdrawn_key] = FoldStrategies.boolean_or

        export_keys = ["uid", consent_withdrawn_key]

        # Export listening group participation keys in the analysis file headers only for the kakuma
        # pipelines, because the dadaab project does not have listening groups.
        if pipeline_configuration.pipeline_name in ["kakuma_s01_pipeline", "kakuma_s02_pipeline", "kakuma_s03_pipeline", "kakuma_all_seasons_pipeline"]:
            export_keys.append("repeat_listening_group_participant")
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                export_keys.append(f"{plan.dataset_name}_listening_group_participant")
        else:
            assert pipeline_configuration.pipeline_name in ["dadaab_s01_pipeline", "dadaab_s02_pipeline", "dadaab_s03_pipeline", "dadaab_all_seasons_pipeline"],\
                "PipelineName must be either a 'seasonal pipeline' or 'all seasons pipeline'"

        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    export_keys.append(cc.analysis_file_key)
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    for code in cc.code_scheme.codes:
                        export_keys.append(f"{cc.analysis_file_key}{code.string_value}")

                fold_strategies[cc.coded_field] = cc.fold_strategy

            export_keys.append(plan.raw_field)
            fold_strategies[plan.raw_field] = plan.raw_field_fold_strategy

        # Fold data to have one respondent per row
        to_be_folded = []
        for td in data:
            to_be_folded.append(td.copy())

        folded_data = FoldTracedData.fold_iterable_of_traced_data(
            user, to_be_folded, lambda td: td["uid"], fold_strategies
        )

        cls.export_to_csv(user, data, pipeline_configuration, raw_data_dir, csv_by_message_output_path, export_keys, consent_withdrawn_key)
        cls.export_to_csv(user, folded_data, pipeline_configuration, raw_data_dir, csv_by_individual_output_path, export_keys, consent_withdrawn_key)

        return data, folded_data
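For reference, the folding step above merges all rows that share a "uid" and combines each configured column with its registered strategy; columns without a strategy are not carried into the folded row. A rough, dict-based sketch of that merge, with fold_rows_by_uid as a hypothetical stand-in for FoldTracedData.fold_iterable_of_traced_data:

from collections import defaultdict

def fold_rows_by_uid(rows, strategies):
    # Hypothetical illustration, not the library function: group rows by "uid",
    # then merge each group column-by-column using the registered strategies.
    grouped = defaultdict(list)
    for row in rows:
        grouped[row["uid"]].append(row)

    folded = []
    for group in grouped.values():
        merged = group[0]
        for row in group[1:]:
            merged = {key: strategy(merged.get(key), row.get(key))
                      for key, strategy in strategies.items()}
        folded.append(merged)
    return folded

rows = [{"uid": "u1", "raw": "a"}, {"uid": "u1", "raw": "b"}, {"uid": "u2", "raw": "c"}]
strategies = {"uid": lambda x, y: x, "raw": lambda x, y: f"{x};{y}"}
assert fold_rows_by_uid(rows, strategies) == [{"uid": "u1", "raw": "a;b"}, {"uid": "u2", "raw": "c"}]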
Example #3
    def generate(cls, user, data, csv_by_message_output_path,
                 csv_by_individual_output_path):
        # Serializer is currently overflowing
        # TODO: Investigate/address the cause of this.
        sys.setrecursionlimit(15000)

        # Set consent withdrawn based on presence of data coded as "stop"
        consent_withdrawn_key = "consent_withdrawn"
        ConsentUtils.determine_consent_withdrawn(
            user, data, PipelineConfiguration.RQA_CODING_PLANS +
            PipelineConfiguration.SURVEY_CODING_PLANS, consent_withdrawn_key)

        # Set the list of keys to be exported and how they are to be handled when folding
        fold_strategies = OrderedDict()
        fold_strategies["uid"] = FoldStrategies.assert_equal
        fold_strategies[consent_withdrawn_key] = FoldStrategies.boolean_or

        export_keys = ["uid", consent_withdrawn_key]

        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    export_keys.append(cc.analysis_file_key)
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    for code in cc.code_scheme.codes:
                        export_keys.append(
                            f"{cc.analysis_file_key}_{code.string_value}")

                if cc.include_in_individuals_file:
                    fold_strategies[cc.coded_field] = cc.fold_strategy

            export_keys.append(plan.raw_field)
            fold_strategies[plan.raw_field] = plan.raw_field_fold_strategy

        # Fold data to have one respondent per row
        to_be_folded = []
        for td in data:
            to_be_folded.append(td.copy())

        folded_data = FoldTracedData.fold_iterable_of_traced_data(
            user, to_be_folded, lambda td: td["uid"], fold_strategies)

        ConsentUtils.set_stopped(user, data, consent_withdrawn_key)
        ConsentUtils.set_stopped(user, folded_data, consent_withdrawn_key)

        cls.export_to_csv(MESSAGES_FILE, data, csv_by_message_output_path,
                          export_keys, consent_withdrawn_key)
        cls.export_to_csv(INDIVIDUALS_FILE, folded_data,
                          csv_by_individual_output_path, export_keys,
                          consent_withdrawn_key)

        return data, folded_data
    def test_fold_traced_data(self):
        td_1_dict = {
            "equal_1": 4, "equal_2": "xyz",
            "concat": "abc",
            "matrix_1": Codes.MATRIX_0, "matrix_2": Codes.MATRIX_0,
            "bool_1": Codes.FALSE, "bool_2": Codes.TRUE,
            "yes_no_1": Codes.YES, "yes_no_2": Codes.YES,
            "other_1": "other 1", "other_2": "other 2"
        }

        td_2_dict = {
            "equal_1": 4, "equal_2": "xyz",
            "concat": "def",
            "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
            "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
            "yes_no_1": Codes.YES, "yes_no_2": Codes.NO,
            "other_1": "other"
        }

        td_1 = TracedData(td_1_dict, Metadata("test_user", Metadata.get_call_location(), 0))
        td_2 = TracedData(td_2_dict, Metadata("test_user", Metadata.get_call_location(), 1))

        fold_strategies = {
            "equal_1": FoldStrategies.assert_equal,
            "equal_2": FoldStrategies.assert_equal,
            "concat": FoldStrategies.concatenate,
            "bool_1": FoldStrategies.boolean_or,
            "bool_2": FoldStrategies.boolean_or,
            "matrix_1": FoldStrategies.matrix,
            "matrix_2": FoldStrategies.matrix,
            "yes_no_1": FoldStrategies.yes_no_amb,
            "yes_no_2": FoldStrategies.yes_no_amb
        }
        folded_td = FoldTracedData.fold_traced_data("test_user", td_1, td_2, fold_strategies)

        # Test input tds unchanged
        self.assertDictEqual(dict(td_1.items()), td_1_dict)
        self.assertDictEqual(dict(td_2.items()), td_2_dict)
        
        # Test folded td has expected values
        self.assertDictEqual(
            dict(folded_td.items()),
            {
                "equal_1": 4, "equal_2": "xyz",
                "concat": "abc;def",
                "matrix_1": Codes.MATRIX_1, "matrix_2": Codes.MATRIX_0,
                "bool_1": Codes.TRUE, "bool_2": Codes.TRUE,
                "yes_no_1": Codes.YES, "yes_no_2": Codes.AMBIVALENT
            }
        )
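The expected dict pins down what each strategy does: assert_equal requires both rows to agree, concatenate joins text with ";", boolean_or and matrix keep the "true"/"1" value if either row has it, yes_no_amb keeps a shared answer and otherwise folds to an ambivalent code, and keys with no registered strategy ("other_1", "other_2") are dropped. A rough sketch of those strategies, with plain string literals standing in for the Codes constants (assumed values, not taken from the library):

def assert_equal(a, b):
    # Both rows must agree; keep the shared value.
    assert a == b, f"values not equal: {a!r} != {b!r}"
    return a

def concatenate(a, b):
    # Join free-text answers with a ';' separator ("abc" + "def" -> "abc;def").
    return f"{a};{b}"

def boolean_or(a, b):
    # "true" wins if either row is "true".
    return "true" if "true" in (a, b) else "false"

def matrix(a, b):
    # A matrix cell is set ("1") if it is set in either row.
    return "1" if "1" in (a, b) else "0"

def yes_no_amb(a, b):
    # Matching answers are kept; disagreement folds to "ambivalent".
    return a if a == b else "ambivalent"

assert boolean_or("false", "true") == "true"
assert yes_no_amb("yes", "no") == "ambivalent"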
    def test_group_by(self):
        flat_dicts = [
            {"id": "a", "x": "4"},
            {"id": "b", "x": "5"},
            {"id": "a", "x": "6"},
            {"id": "a", "x": "7"},
            {"id": "c", "x": "8"},
            {"id": "b", "x": "9"}
        ]

        flat_data = self.make_traced_data(flat_dicts)

        grouped = list(FoldTracedData.group_by(flat_data, lambda td: td["id"]))

        self.assertEqual(len(grouped), 3)

        grouped.sort(key=lambda l: l[0]["id"])
        self.assertListEqual([td["x"] for td in grouped[0]], ["4", "6", "7"])
        self.assertListEqual([td["x"] for td in grouped[1]], ["5", "9"])
        self.assertListEqual([td["x"] for td in grouped[2]], ["8"])
Example #6
    def generate(user, data, csv_by_message_output_path,
                 csv_by_individual_output_path):
        # Serializer is currently overflowing
        # TODO: Investigate/address the cause of this.
        sys.setrecursionlimit(15000)

        consent_withdrawn_key = "consent_withdrawn"
        for td in data:
            td.append_data({consent_withdrawn_key: Codes.FALSE},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Set the list of raw/coded survey keys to be exported
        survey_keys = []
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.analysis_file_key is not None and plan.analysis_file_key not in survey_keys:
                survey_keys.append(plan.analysis_file_key)
            if plan.raw_field not in survey_keys:
                survey_keys.append(plan.raw_field)

        # Convert survey codes to their string values
        for td in data:
            td.append_data(
                {
                    plan.analysis_file_key: plan.code_scheme.get_code_with_id(
                        td[plan.coded_field]["CodeID"]).string_value
                    for plan in PipelineConfiguration.SURVEY_CODING_PLANS
                    if plan.analysis_file_key is not None
                }, Metadata(user, Metadata.get_call_location(), time.time()))

        # Convert RQA binary codes to their string values
        for td in data:
            td.append_data(
                {
                    plan.binary_analysis_file_key:
                    plan.binary_code_scheme.get_code_with_id(
                        td[plan.binary_coded_field]["CodeID"]).string_value
                    for plan in PipelineConfiguration.RQA_CODING_PLANS
                    if plan.binary_code_scheme is not None
                }, Metadata(user, Metadata.get_call_location(), time.time()))

        # Translate the RQA reason codes to matrix values
        matrix_keys = []

        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            show_matrix_keys = list()
            for code in plan.code_scheme.codes:
                show_matrix_keys.append(
                    f"{plan.analysis_file_key}{code.string_value}")

            AnalysisKeys.set_matrix_keys(user, data, show_matrix_keys,
                                         plan.code_scheme, plan.coded_field,
                                         plan.analysis_file_key)

            matrix_keys.extend(show_matrix_keys)

        binary_keys = [
            plan.binary_analysis_file_key
            for plan in PipelineConfiguration.RQA_CODING_PLANS
            if plan.binary_analysis_file_key is not None
        ]

        equal_keys = ["uid"]
        equal_keys.extend(survey_keys)
        concat_keys = [
            plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
        ]
        bool_keys = [
            consent_withdrawn_key,

            # "sms_ad",
            # "radio_promo",
            # "radio_show",
            # "non_logical_time",
            # "radio_participation_s02e01",
            # "radio_participation_s02e02",
            # "radio_participation_s02e03",
            # "radio_participation_s02e04",
            # "radio_participation_s02e05",
            # "radio_participation_s02e06",
        ]

        # Export to CSV
        export_keys = ["uid"]
        export_keys.extend(bool_keys)
        export_keys.extend(matrix_keys)
        export_keys.extend(binary_keys)
        export_keys.extend(concat_keys)
        export_keys.extend(survey_keys)

        # Set consent withdrawn based on presence of data coded as "stop"
        ConsentUtils.determine_consent_withdrawn(
            user, data, PipelineConfiguration.SURVEY_CODING_PLANS,
            consent_withdrawn_key)

        # Set consent withdrawn based on stop codes from radio question answers
        for td in data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if td[f"{plan.analysis_file_key}{Codes.STOP}"] == Codes.MATRIX_1:
                    td.append_data({consent_withdrawn_key: Codes.TRUE},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

                if plan.binary_code_scheme is not None:
                    if td[plan.binary_coded_field]["CodeID"] == \
                            plan.binary_code_scheme.get_code_with_control_code(Codes.STOP).code_id:
                        td.append_data({consent_withdrawn_key: Codes.TRUE},
                                       Metadata(user,
                                                Metadata.get_call_location(),
                                                time.time()))

        # Fold data to have one respondent per row
        to_be_folded = []
        for td in data:
            to_be_folded.append(td.copy())

        folded_data = FoldTracedData.fold_iterable_of_traced_data(
            user,
            to_be_folded,  # fold the copies so the per-message data is left unmodified
            fold_id_fn=lambda td: td["uid"],
            equal_keys=equal_keys,
            concat_keys=concat_keys,
            matrix_keys=matrix_keys,
            bool_keys=bool_keys,
            binary_keys=binary_keys)

        # Fix-up _NA and _NC keys, which are currently being set incorrectly by
        # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
        # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
        for td in folded_data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if td.get(plan.raw_field, "") != "":
                    td.append_data(
                        {
                            f"{plan.analysis_file_key}{Codes.TRUE_MISSING}":
                            Codes.MATRIX_0
                        },
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

                contains_non_nc_key = False
                for key in matrix_keys:
                    if key.startswith(plan.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                            and td.get(key) == Codes.MATRIX_1:
                        contains_non_nc_key = True
                if not contains_non_nc_key:
                    td.append_data(
                        {
                            f"{plan.analysis_file_key}{Codes.NOT_CODED}":
                            Codes.MATRIX_1
                        },
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

        # Process consent
        ConsentUtils.set_stopped(user,
                                 data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)
        ConsentUtils.set_stopped(user,
                                 folded_data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)

        # Output to CSV with one message per row
        with open(csv_by_message_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                data, f, headers=export_keys)

        with open(csv_by_individual_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                folded_data, f, headers=export_keys)

        return data
Example #7
    def generate(user, data, csv_by_message_output_path,
                 csv_by_individual_output_path):
        # Serializer is currently overflowing
        # TODO: Investigate/address the cause of this.
        sys.setrecursionlimit(15000)

        consent_withdrawn_key = "consent_withdrawn"
        for td in data:
            td.append_data({consent_withdrawn_key: Codes.FALSE},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Set the list of keys to be exported and how they are to be handled when folding
        export_keys = ["uid", consent_withdrawn_key]
        bool_keys = [
            consent_withdrawn_key

            # "sms_ad",
            # "radio_promo",
            # "radio_show",
            # "non_logical_time",
            # "radio_participation_s02e01",
            # "radio_participation_s02e02",
            # "radio_participation_s02e03",
            # "radio_participation_s02e04",
            # "radio_participation_s02e05",
            # "radio_participation_s02e06",
        ]
        equal_keys = ["uid"]
        concat_keys = []
        matrix_keys = []
        binary_keys = []
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    export_keys.append(cc.analysis_file_key)

                    if cc.folding_mode == FoldingModes.ASSERT_EQUAL:
                        equal_keys.append(cc.analysis_file_key)
                    elif cc.folding_mode == FoldingModes.YES_NO_AMB:
                        binary_keys.append(cc.analysis_file_key)
                    else:
                        assert False, f"Incompatible folding_mode {plan.folding_mode}"
                else:
                    assert cc.folding_mode == FoldingModes.MATRIX
                    for code in cc.code_scheme.codes:
                        export_keys.append(
                            f"{cc.analysis_file_key}{code.string_value}")
                        matrix_keys.append(
                            f"{cc.analysis_file_key}{code.string_value}")

            export_keys.append(plan.raw_field)
            if plan.raw_field_folding_mode == FoldingModes.CONCATENATE:
                concat_keys.append(plan.raw_field)
            elif plan.raw_field_folding_mode == FoldingModes.ASSERT_EQUAL:
                equal_keys.append(plan.raw_field)
            else:
                assert False, f"Incompatible raw_field_folding_mode {plan.raw_field_folding_mode}"

        # Convert codes to their string/matrix values
        for td in data:
            analysis_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.analysis_file_key is None:
                        continue

                    if cc.coding_mode == CodingModes.SINGLE:
                        analysis_dict[cc.analysis_file_key] = \
                            cc.code_scheme.get_code_with_id(td[cc.coded_field]["CodeID"]).string_value
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        show_matrix_keys = []
                        for code in cc.code_scheme.codes:
                            show_matrix_keys.append(
                                f"{cc.analysis_file_key}{code.string_value}")

                        for label in td.get(cc.coded_field, []):
                            code_string_value = cc.code_scheme.get_code_with_id(
                                label['CodeID']).string_value
                            analysis_dict[
                                f"{cc.analysis_file_key}{code_string_value}"] = Codes.MATRIX_1

                        for key in show_matrix_keys:
                            if key not in analysis_dict:
                                analysis_dict[key] = Codes.MATRIX_0
            td.append_data(
                analysis_dict,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))

        # Set consent withdrawn based on presence of data coded as "stop"
        ConsentUtils.determine_consent_withdrawn(
            user, data, PipelineConfiguration.RQA_CODING_PLANS +
            PipelineConfiguration.SURVEY_CODING_PLANS, consent_withdrawn_key)

        # Fold data to have one respondent per row
        to_be_folded = []
        for td in data:
            to_be_folded.append(td.copy())

        folded_data = FoldTracedData.fold_iterable_of_traced_data(
            user,
            to_be_folded,  # fold the copies so the per-message data is left unmodified
            fold_id_fn=lambda td: td["uid"],
            equal_keys=equal_keys,
            concat_keys=concat_keys,
            matrix_keys=matrix_keys,
            bool_keys=bool_keys,
            binary_keys=binary_keys)

        # Fix-up _NA and _NC keys, which are currently being set incorrectly by
        # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
        # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
        for td in folded_data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.analysis_file_key is None:
                        continue

                    if cc.coding_mode == CodingModes.MULTIPLE:
                        if td.get(plan.raw_field, "") != "":
                            td.append_data(
                                {
                                    f"{cc.analysis_file_key}{Codes.TRUE_MISSING}":
                                    Codes.MATRIX_0
                                },
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))

                        contains_non_nc_key = False
                        for key in matrix_keys:
                            if key.startswith(cc.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                                    and td.get(key) == Codes.MATRIX_1:
                                contains_non_nc_key = True
                        if not contains_non_nc_key:
                            td.append_data(
                                {
                                    f"{cc.analysis_file_key}{Codes.NOT_CODED}":
                                    Codes.MATRIX_1
                                },
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))

        # Process consent
        ConsentUtils.set_stopped(user,
                                 data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)
        ConsentUtils.set_stopped(user,
                                 folded_data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)

        # Output to CSV with one message per row
        with open(csv_by_message_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                data, f, headers=export_keys)

        with open(csv_by_individual_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                folded_data, f, headers=export_keys)

        return data, folded_data
        AnalysisKeys.set_matrix_keys(user, data, show_matrix_keys, plan,
                                     code_ids, plan.coded_field,
                                     plan.analysis_file_key)

        matrix_keys.extend(show_matrix_keys)

    matrix_keys.sort()

    equal_keys = ["avf_phone_id"]
    equal_keys.extend(survey_keys)

    print("Folding")
    folded = FoldTracedData.fold_iterable_of_traced_data(
        user,
        data,
        lambda td: (td["avf_phone_id"]),
        equal_keys=equal_keys,
        column_keys=column_keys,
        matrix_keys=matrix_keys)

    print("Post fold fix-up")

    # Determine which column keys were set by FoldTracedData.fold_iterable_of_traced_data
    folded_column_keys = set()
    for key in column_keys:
        for td in folded:
            i = 1
            while "{} {}".format(key, i) in td:
                folded_column_keys.add("{} {}".format(key, i))
                i += 1