Пример #1
0
    def auto_code_show_messages(cls, user, data, icr_output_dir,
                                coda_output_dir):
        """Filter show messages, then export them for Coda and for ICR.

        :param user: Identifier forwarded to the channel-key and message-id
                     helpers.
        :param data: Messages to process; iterated several times below.
        :param icr_output_dir: Directory that receives one ICR CSV per plan.
        :param coda_output_dir: Directory that receives one Coda file per plan.
        :return: The filtered messages.
        """
        # Test messages sent by AVF are kept only when running in dev mode.
        if not PipelineConfiguration.DEV_MODE:
            data = MessageFilters.filter_test_messages(data)

        # Drop runs with no response to any week's question, then runs sent
        # outside the project start and end dates.
        data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)
        data = MessageFilters.filter_time_range(
            data,
            cls.SENT_ON_KEY,
            PipelineConfiguration.PROJECT_START_DATE,
            PipelineConfiguration.PROJECT_END_DATE,
        )

        # Label each message with channel keys
        Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

        # Export the RQA and follow-up survey messages to Coda, one file per plan.
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for coding_plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(
                user, data, coding_plan.raw_field, coding_plan.id_field)

            coda_path = path.join(coda_output_dir, coding_plan.coda_filename)
            with open(coda_path, "w") as coda_file:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, coding_plan.raw_field, cls.SENT_ON_KEY,
                    coding_plan.id_field, {}, coda_file)

        # Export a sample of the RQA and follow-up messages for ICR, one file per plan.
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for coding_plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            # Selecting on the raw field works because the only codes which have been
            # applied at this point are TRUE_MISSING. If any other coding is done above,
            # this selection will need to change.
            answered = [td for td in data if coding_plan.raw_field in td]

            sample = ICRTools.generate_sample_for_icr(
                answered, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

            sample_path = path.join(icr_output_dir, coding_plan.icr_filename)
            with open(sample_path, "w") as icr_file:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    sample, icr_file,
                    headers=[coding_plan.run_id_field, coding_plan.raw_field])

        return data
    def generate(data, production_csv_output_path):
        """Write the non-noise messages to the production CSV.

        The exported columns are "uid" followed by the raw field of every RQA
        and survey coding plan, each listed once in first-seen order.

        :param data: Messages to export.
        :param production_csv_output_path: Path of the CSV file to write.
        :return: The unfiltered `data` that was passed in.
        """
        production_keys = ["uid"]
        plan_groups = (PipelineConfiguration.RQA_CODING_PLANS,
                       PipelineConfiguration.SURVEY_CODING_PLANS)
        for coding_plans in plan_groups:
            for coding_plan in coding_plans:
                if coding_plan.raw_field in production_keys:
                    continue
                production_keys.append(coding_plan.raw_field)

        not_noise = MessageFilters.filter_noise(data, "noise", lambda x: x)
        with open(production_csv_output_path, "w") as csv_file:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                not_noise, csv_file, headers=production_keys)

        return data
Пример #3
0
    def generate(data, production_csv_output_path):
        """Write every message to the production CSV, without noise filtering.

        The exported columns are "uid" followed by the raw field of every
        test-show and survey coding plan, each listed once in first-seen order.

        :param data: Messages to export.
        :param production_csv_output_path: Path of the CSV file to write.
        :return: The `data` that was passed in.
        """
        production_keys = ["uid"]
        plan_groups = (PipelineConfiguration.TEST_SHOWS_CODING_PLANS,
                       PipelineConfiguration.SURVEY_CODING_PLANS)
        for coding_plans in plan_groups:
            for coding_plan in coding_plans:
                if coding_plan.raw_field in production_keys:
                    continue
                production_keys.append(coding_plan.raw_field)

        # Not performing message filtering at this stage for this test-pipeline.
        with open(production_csv_output_path, "w") as csv_file:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                data, csv_file, headers=production_keys)

        return data
 def generate(data, production_csv_output_path):
     """Write every message to the production CSV.

     The exported columns are "uid" followed by the raw field of every RQA,
     follow-up and demographics coding plan, each listed once in first-seen
     order.

     :param data: Messages to export.
     :param production_csv_output_path: Path of the CSV file to write.
     :return: The `data` that was passed in.
     """
     production_keys = ["uid"]
     plan_groups = (PipelineConfiguration.RQA_CODING_PLANS,
                    PipelineConfiguration.FOLLOW_UP_CODING_PLANS,
                    PipelineConfiguration.DEMOGS_CODING_PLANS)
     for coding_plans in plan_groups:
         for coding_plan in coding_plans:
             if coding_plan.raw_field in production_keys:
                 continue
             production_keys.append(coding_plan.raw_field)

     with open(production_csv_output_path, "w") as csv_file:
         TracedDataCSVIO.export_traced_data_iterable_to_csv(
             data, csv_file, headers=production_keys)

     return data
Пример #5
0
    def test_export_traced_data_iterable_to_csv(self):
        """Check the CSV exporter's type guard and its output for normal data."""
        file_path = path.join(self.test_dir, "csv_test.csv")

        # Test exporting wrong data type: a single item instead of an iterable of items.
        # assertRaises is used rather than try/self.fail()/except AssertionError, because
        # self.fail() itself raises AssertionError and would be swallowed by that handler,
        # reporting a confusing message mismatch instead of "not raised".
        data = list(generate_traced_data_iterable())
        with open(file_path, "w") as f:
            with self.assertRaises(
                    AssertionError,
                    msg="Exporting the wrong data type did not raise an assertion error") as raised:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(data[0], f)
        self.assertEqual(str(raised.exception), _td_type_error_string)

        # Test exporting normal data, including requesting an unknown header.
        data = generate_traced_data_iterable()
        with open(file_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=["URN", "Gender", "Non-Existent"])

        self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/csv_export_expected.csv"))
Пример #6
0
    def export_to_csv(user, data, csv_path, export_keys,
                      consent_withdrawn_key):
        """Append analysis values to each item, apply consent, and write a CSV.

        :param user: Identifier recorded in the Metadata of appended values.
        :param data: Items to convert and export; updated in place.
        :param csv_path: Path of the CSV file to write.
        :param export_keys: Column headers to export; also passed to
                            ConsentUtils.set_stopped as `additional_keys`.
        :param consent_withdrawn_key: Key passed to ConsentUtils.set_stopped.
        """
        # Convert codes to their string/matrix values
        for td in data:
            analysis_values = {}
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    # Configurations without an analysis-file key are not exported.
                    if cc.analysis_file_key is None:
                        continue

                    if cc.coding_mode == CodingModes.SINGLE:
                        # A single label becomes the string value of its code.
                        single_code = cc.code_scheme.get_code_with_code_id(
                            td[cc.coded_field]["CodeID"])
                        analysis_values[cc.analysis_file_key] = single_code.string_value
                        continue

                    assert cc.coding_mode == CodingModes.MULTIPLE
                    # One matrix column per code in the scheme.
                    scheme_matrix_keys = [
                        f"{cc.analysis_file_key}{code.string_value}"
                        for code in cc.code_scheme.codes
                    ]

                    # Columns for the codes applied to this item are set to MATRIX_1 ...
                    for label in td[cc.coded_field]:
                        applied_value = cc.code_scheme.get_code_with_code_id(
                            label["CodeID"]).string_value
                        analysis_values[f"{cc.analysis_file_key}{applied_value}"] = Codes.MATRIX_1

                    # ... and every remaining column of the scheme to MATRIX_0.
                    for matrix_key in scheme_matrix_keys:
                        analysis_values.setdefault(matrix_key, Codes.MATRIX_0)

            td.append_data(
                analysis_values,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))

        # Hide data from participants who opted out
        ConsentUtils.set_stopped(
            user, data, consent_withdrawn_key, additional_keys=export_keys)

        with open(csv_path, "w") as csv_file:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                data, csv_file, headers=export_keys)
Пример #7
0
    def export_icr(cls, data, icr_output_dir):
        """Write one ICR sample CSV per RQA coding plan.

        :param data: Messages to sample from; iterated once per plan.
        :param icr_output_dir: Directory that receives the ICR CSV files.
        """
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for coding_plan in PipelineConfiguration.RQA_CODING_PLANS:
            # Only messages that contain this plan's raw field are candidates.
            candidates = [td for td in data if coding_plan.raw_field in td]

            # A fresh Random seeded with ICR_SEED is created for every plan.
            sample = ICRTools.generate_sample_for_icr(
                candidates, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

            sample_path = path.join(icr_output_dir, coding_plan.icr_filename)
            with open(sample_path, "w") as icr_file:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    sample, icr_file,
                    headers=[coding_plan.run_id_field, coding_plan.raw_field])
Пример #8
0
    def test_import_csv_to_traced_data_iterable(self):
        """Check that the CSV fixture imports to the same items as the generator yields."""
        file_path = "tests/traced_data/resources/csv_import_data.csv"

        with open(file_path, "r") as csv_file:
            expected_items = list(generate_traced_data_iterable())
            imported_items = list(
                TracedDataCSVIO.import_csv_to_traced_data_iterable("test_user", csv_file))

        # Both lists are fully materialised, so the file is no longer needed.
        self.assertEqual(len(expected_items), len(imported_items))

        # Compare item by item, ignoring key order.
        for expected_td, imported_td in zip(expected_items, imported_items):
            self.assertSetEqual(set(expected_td.items()), set(imported_td.items()))
    def auto_code_surveys(cls, user, data, icr_output_dir, coda_output_dir):
        """Auto-code survey answers, then export them for Coda and for ICR.

        :param user: Identifier forwarded to the cleaning and message-id helpers.
        :param data: Survey items to process; iterated several times below.
        :param icr_output_dir: Directory that receives one ICR CSV per plan.
        :param coda_output_dir: Directory that receives one Coda file per plan.
        :return: The `data` that was passed in.
        """
        # Auto-code every survey plan that defines a cleaner.
        for survey_plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if survey_plan.cleaner is None:
                continue
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, survey_plan.raw_field, survey_plan.coded_field,
                survey_plan.cleaner, survey_plan.code_scheme)

        # Output single-scheme answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for survey_plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(
                user, data, survey_plan.raw_field, survey_plan.id_field)

            coda_path = path.join(coda_output_dir, survey_plan.coda_filename)
            with open(coda_path, "w") as coda_file:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, survey_plan.raw_field, survey_plan.time_field,
                    survey_plan.id_field,
                    {survey_plan.coded_field: survey_plan.code_scheme}, coda_file)

        # Output a sample of the answers to each plan for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for survey_plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            # Only items that contain this plan's raw field are candidates.
            candidates = [td for td in data if survey_plan.raw_field in td]

            sample = ICRTools.generate_sample_for_icr(
                candidates, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

            sample_path = path.join(icr_output_dir, survey_plan.icr_filename)
            with open(sample_path, "w") as icr_file:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    sample, icr_file,
                    headers=[survey_plan.run_id_field, survey_plan.raw_field])

        return data
Пример #10
0
    def generate(user, data, csv_by_message_output_path,
                 csv_by_individual_output_path):
        """Build the analysis CSVs: one row per message and one row per respondent.

        Appends the analysis values (survey strings, RQA binary strings, RQA
        matrix columns and the consent flag) to every item of `data`, folds
        copies of those items by "uid", applies consent to both datasets and
        writes each to its own CSV.

        :param user: Identifier recorded in Metadata and passed to the helpers.
        :param data: Items to analyse; iterated many times and updated in place.
        :param csv_by_message_output_path: Path of the per-message CSV to write.
        :param csv_by_individual_output_path: Path of the per-respondent (folded)
                                              CSV to write.
        :return: The per-message `data`.
        """
        # Serializer is currently overflowing
        # TODO: Investigate/address the cause of this.
        sys.setrecursionlimit(15000)

        # Start every item as "consent not withdrawn"; overwritten further down
        # where stop codes are found.
        consent_withdrawn_key = "consent_withdrawn"
        for td in data:
            td.append_data({consent_withdrawn_key: Codes.FALSE},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Collect the survey analysis-file keys and raw fields to export,
        # each listed once in first-seen order.
        survey_keys = []
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.analysis_file_key is not None and plan.analysis_file_key not in survey_keys:
                survey_keys.append(plan.analysis_file_key)
            if plan.raw_field not in survey_keys:
                survey_keys.append(plan.raw_field)

        # Convert survey codes to their string values
        for td in data:
            td.append_data(
                {
                    plan.analysis_file_key: plan.code_scheme.get_code_with_id(
                        td[plan.coded_field]["CodeID"]).string_value
                    for plan in PipelineConfiguration.SURVEY_CODING_PLANS
                    if plan.analysis_file_key is not None
                }, Metadata(user, Metadata.get_call_location(), time.time()))

        # Convert RQA binary codes to their string values
        for td in data:
            td.append_data(
                {
                    plan.binary_analysis_file_key:
                    plan.binary_code_scheme.get_code_with_id(
                        td[plan.binary_coded_field]["CodeID"]).string_value
                    for plan in PipelineConfiguration.RQA_CODING_PLANS
                    if plan.binary_code_scheme is not None
                }, Metadata(user, Metadata.get_call_location(), time.time()))

        # Translate the RQA reason codes to matrix values
        matrix_keys = []

        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            # One matrix column per code in this plan's scheme.
            show_matrix_keys = [
                f"{plan.analysis_file_key}{code.string_value}"
                for code in plan.code_scheme.codes
            ]

            AnalysisKeys.set_matrix_keys(user, data, show_matrix_keys,
                                         plan.code_scheme, plan.coded_field,
                                         plan.analysis_file_key)

            matrix_keys.extend(show_matrix_keys)

        binary_keys = [
            plan.binary_analysis_file_key
            for plan in PipelineConfiguration.RQA_CODING_PLANS
            if plan.binary_analysis_file_key is not None
        ]

        # Key groups handed to the folding step below.
        equal_keys = ["uid"]
        equal_keys.extend(survey_keys)
        concat_keys = [
            plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS
        ]
        bool_keys = [
            consent_withdrawn_key,

            # "sms_ad",
            # "radio_promo",
            # "radio_show",
            # "non_logical_time",
            # "radio_participation_s02e01",
            # "radio_participation_s02e02",
            # "radio_participation_s02e03",
            # "radio_participation_s02e04",
            # "radio_participation_s02e05",
            # "radio_participation_s02e06",
        ]

        # Export to CSV
        export_keys = ["uid"]
        export_keys.extend(bool_keys)
        export_keys.extend(matrix_keys)
        export_keys.extend(binary_keys)
        export_keys.extend(concat_keys)
        export_keys.extend(survey_keys)

        # Set consent withdrawn based on presence of data coded as "stop"
        ConsentUtils.determine_consent_withdrawn(
            user, data, PipelineConfiguration.SURVEY_CODING_PLANS,
            consent_withdrawn_key)

        # Set consent withdrawn based on stop codes from radio question answers
        for td in data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if td[f"{plan.analysis_file_key}{Codes.STOP}"] == Codes.MATRIX_1:
                    td.append_data({consent_withdrawn_key: Codes.TRUE},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

                if plan.binary_code_scheme is not None:
                    if td[plan.binary_coded_field]["CodeID"] == \
                            plan.binary_code_scheme.get_code_with_control_code(Codes.STOP).code_id:
                        td.append_data({consent_withdrawn_key: Codes.TRUE},
                                       Metadata(user,
                                                Metadata.get_call_location(),
                                                time.time()))

        # Fold data to have one respondent per row.
        # The fold is run on copies so that the per-message items in `data`,
        # which are exported separately below, are not touched by folding.
        # (Previously the copies were built but the originals were folded.)
        to_be_folded = [td.copy() for td in data]

        folded_data = FoldTracedData.fold_iterable_of_traced_data(
            user,
            to_be_folded,
            fold_id_fn=lambda td: td["uid"],
            equal_keys=equal_keys,
            concat_keys=concat_keys,
            matrix_keys=matrix_keys,
            bool_keys=bool_keys,
            binary_keys=binary_keys)

        # Fix-up _NA and _NC keys, which are currently being set incorrectly by
        # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
        # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
        for td in folded_data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                # A respondent with any raw text for this show is not "true missing".
                if td.get(plan.raw_field, "") != "":
                    td.append_data(
                        {
                            f"{plan.analysis_file_key}{Codes.TRUE_MISSING}":
                            Codes.MATRIX_0
                        },
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

                # If no column of this show other than NOT_CODED is set, mark NOT_CODED.
                contains_non_nc_key = False
                for key in matrix_keys:
                    if key.startswith(plan.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                            and td.get(key) == Codes.MATRIX_1:
                        contains_non_nc_key = True
                if not contains_non_nc_key:
                    td.append_data(
                        {
                            f"{plan.analysis_file_key}{Codes.NOT_CODED}":
                            Codes.MATRIX_1
                        },
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

        # Process consent
        ConsentUtils.set_stopped(user,
                                 data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)
        ConsentUtils.set_stopped(user,
                                 folded_data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)

        # Output to CSV with one message per row
        with open(csv_by_message_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                data, f, headers=export_keys)

        # Output to CSV with one respondent per row
        with open(csv_by_individual_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                folded_data, f, headers=export_keys)

        return data
        help="CSV file containing demographics of CAPYEI students. ")
    parser.add_argument("json_output_path",
                        metavar="json-output-path",
                        help="Path to serialized TracedData JSON file")

    args = parser.parse_args()
    user = args.user
    phone_uuid_path = args.phone_uuid_table_path
    demog_dataset_path = args.demog_dataset_path
    json_output_path = args.json_output_path

    with open(phone_uuid_path, "r") as f:
        phone_uuids = PhoneNumberUuidTable.load(f)

    with open(demog_dataset_path, "r") as f:
        traced_demog = TracedDataCSVIO.import_csv_to_traced_data_iterable(
            user, f)
        traced_demog = list(traced_demog)
        for td in traced_demog:
            uuid_dict = {
                "avf_phone_id": phone_uuids.add_phone(td["final_phone"])
            }
            td.append_data(
                uuid_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

    # Write the UUIDs out to a file
    with open(phone_uuid_path, "w") as f:
        phone_uuids.dump(f)

    # Output TracedData to JSON.
    IOUtils.ensure_dirs_exist(json_output_path)
    def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
        """Filter test-show messages, label missing answers, and export for Coda and ICR.

        :param user: Identifier recorded in Metadata and passed to the helpers.
        :param data: Messages to process; iterated several times below.
        :param icr_output_dir: Directory that receives one ICR CSV per plan.
        :param coda_output_dir: Directory that receives one Coda file per plan.
        :return: The filtered messages.
        """
        # Filter out test messages sent by AVF.
        if not PipelineConfiguration.DEV_MODE:
            data = MessageFilters.filter_test_messages(data)

        # Filter for runs which don't contain a response to any week's question
        data = MessageFilters.filter_empty_messages(data, cls.TEST_KEYS)

        # Filter out runs sent outside the project start and end dates
        data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE, cls.PROJECT_END_DATE)

        # Label missing data
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
                if plan.raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    # The coded field holds a list of labels.
                    missing_dict[plan.coded_field] = [na_label.to_dict()]

                    # The binary field is labelled TRUE_MISSING only when the answer is
                    # missing too. Previously this check sat outside the `raw_field not in td`
                    # test, so every message, answered or not, was labelled as missing.
                    if plan.binary_code_scheme is not None:
                        na_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme, plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                            Metadata.get_call_location()
                        )
                        # The binary coded field holds a single label, not a list.
                        missing_dict[plan.binary_coded_field] = na_label.to_dict()

            td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

        # Label each message with channel keys
        Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

        # Output messages for Coda
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

            output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f
                )
        print("Coda message files successfully exported")

        # Output messages for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
            test_pipeline_messages = []
            for td in data:
                if plan.coded_field not in td:
                    # No label yet, so this message is a candidate for the ICR sample.
                    test_pipeline_messages.append(td)
                else:
                    # The only label applied so far must be the single TRUE_MISSING label
                    # set in the "Label missing data" step above.
                    assert len(td[plan.coded_field]) == 1
                    assert td[plan.coded_field][0]["CodeID"] == \
                            plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

            icr_messages = ICRTools.generate_sample_for_icr(
                test_pipeline_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

            icr_output_path = path.join(icr_output_dir, plan.icr_filename)
            with open(icr_output_path, "w") as f:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    icr_messages, f, headers=[plan.run_id_field, plan.raw_field]
                )
        print("ICR files successfully exported")

        return data
Пример #13
0
    def generate(user, data, csv_by_message_output_path,
                 csv_by_individual_output_path):
        """Build the analysis CSVs: one row per message and one row per respondent.

        Derives the export and folding key lists from the coding configurations
        of the RQA and survey plans, appends the analysis values to every item
        of `data`, folds copies of those items by "uid", applies consent to both
        datasets and writes each to its own CSV.

        :param user: Identifier recorded in Metadata and passed to the helpers.
        :param data: Items to analyse; iterated many times and updated in place.
        :param csv_by_message_output_path: Path of the per-message CSV to write.
        :param csv_by_individual_output_path: Path of the per-respondent (folded)
                                              CSV to write.
        :return: Tuple of (per-message data, folded per-respondent data).
        """
        # Serializer is currently overflowing
        # TODO: Investigate/address the cause of this.
        sys.setrecursionlimit(15000)

        # Start every item as "consent not withdrawn".
        consent_withdrawn_key = "consent_withdrawn"
        for td in data:
            td.append_data({consent_withdrawn_key: Codes.FALSE},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Set the list of keys to be exported and how they are to be handled when folding
        export_keys = ["uid", consent_withdrawn_key]
        bool_keys = [
            consent_withdrawn_key

            # "sms_ad",
            # "radio_promo",
            # "radio_show",
            # "non_logical_time",
            # "radio_participation_s02e01",
            # "radio_participation_s02e02",
            # "radio_participation_s02e03",
            # "radio_participation_s02e04",
            # "radio_participation_s02e05",
            # "radio_participation_s02e06",
        ]
        equal_keys = ["uid"]
        concat_keys = []
        matrix_keys = []
        binary_keys = []
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                # Configurations without an analysis-file key are not exported.
                if cc.analysis_file_key is None:
                    continue

                if cc.coding_mode == CodingModes.SINGLE:
                    export_keys.append(cc.analysis_file_key)

                    if cc.folding_mode == FoldingModes.ASSERT_EQUAL:
                        equal_keys.append(cc.analysis_file_key)
                    elif cc.folding_mode == FoldingModes.YES_NO_AMB:
                        binary_keys.append(cc.analysis_file_key)
                    else:
                        # Report the folding mode of the configuration being checked
                        # (the message previously read `plan.folding_mode`).
                        assert False, f"Incompatible folding_mode {cc.folding_mode}"
                else:
                    assert cc.folding_mode == FoldingModes.MATRIX
                    # One exported matrix column per code in the scheme.
                    for code in cc.code_scheme.codes:
                        export_keys.append(
                            f"{cc.analysis_file_key}{code.string_value}")
                        matrix_keys.append(
                            f"{cc.analysis_file_key}{code.string_value}")

            export_keys.append(plan.raw_field)
            if plan.raw_field_folding_mode == FoldingModes.CONCATENATE:
                concat_keys.append(plan.raw_field)
            elif plan.raw_field_folding_mode == FoldingModes.ASSERT_EQUAL:
                equal_keys.append(plan.raw_field)
            else:
                assert False, f"Incompatible raw_field_folding_mode {plan.raw_field_folding_mode}"

        # Convert codes to their string/matrix values
        for td in data:
            analysis_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.analysis_file_key is None:
                        continue

                    if cc.coding_mode == CodingModes.SINGLE:
                        analysis_dict[cc.analysis_file_key] = \
                            cc.code_scheme.get_code_with_id(td[cc.coded_field]["CodeID"]).string_value
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        show_matrix_keys = []
                        for code in cc.code_scheme.codes:
                            show_matrix_keys.append(
                                f"{cc.analysis_file_key}{code.string_value}")

                        # Columns for the codes applied to this item are set to MATRIX_1 ...
                        for label in td.get(cc.coded_field, []):
                            code_string_value = cc.code_scheme.get_code_with_id(
                                label['CodeID']).string_value
                            analysis_dict[
                                f"{cc.analysis_file_key}{code_string_value}"] = Codes.MATRIX_1

                        # ... and every remaining column of the scheme to MATRIX_0.
                        for key in show_matrix_keys:
                            if key not in analysis_dict:
                                analysis_dict[key] = Codes.MATRIX_0
            td.append_data(
                analysis_dict,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))

        # Set consent withdrawn based on presence of data coded as "stop"
        ConsentUtils.determine_consent_withdrawn(
            user, data, PipelineConfiguration.RQA_CODING_PLANS +
            PipelineConfiguration.SURVEY_CODING_PLANS, consent_withdrawn_key)

        # Fold data to have one respondent per row.
        # The fold is run on copies so that the per-message items in `data`,
        # which are exported and returned separately, are not touched by folding.
        # (Previously the copies were built but the originals were folded.)
        to_be_folded = [td.copy() for td in data]

        folded_data = FoldTracedData.fold_iterable_of_traced_data(
            user,
            to_be_folded,
            fold_id_fn=lambda td: td["uid"],
            equal_keys=equal_keys,
            concat_keys=concat_keys,
            matrix_keys=matrix_keys,
            bool_keys=bool_keys,
            binary_keys=binary_keys)

        # Fix-up _NA and _NC keys, which are currently being set incorrectly by
        # FoldTracedData.fold_iterable_of_traced_data when there are multiple radio shows
        # TODO: Update FoldTracedData to handle NA and NC correctly under multiple radio shows
        for td in folded_data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.analysis_file_key is None:
                        continue

                    if cc.coding_mode == CodingModes.MULTIPLE:
                        # A respondent with any raw text for this plan is not "true missing".
                        if td.get(plan.raw_field, "") != "":
                            td.append_data(
                                {
                                    f"{cc.analysis_file_key}{Codes.TRUE_MISSING}":
                                    Codes.MATRIX_0
                                },
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))

                        # If no column other than NOT_CODED is set, mark NOT_CODED.
                        contains_non_nc_key = False
                        for key in matrix_keys:
                            if key.startswith(cc.analysis_file_key) and not key.endswith(Codes.NOT_CODED) \
                                    and td.get(key) == Codes.MATRIX_1:
                                contains_non_nc_key = True
                        if not contains_non_nc_key:
                            td.append_data(
                                {
                                    f"{cc.analysis_file_key}{Codes.NOT_CODED}":
                                    Codes.MATRIX_1
                                },
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))

        # Process consent
        ConsentUtils.set_stopped(user,
                                 data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)
        ConsentUtils.set_stopped(user,
                                 folded_data,
                                 consent_withdrawn_key,
                                 additional_keys=export_keys)

        # Output to CSV with one message per row
        with open(csv_by_message_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                data, f, headers=export_keys)

        # Output to CSV with one respondent per row
        with open(csv_by_individual_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                folded_data, f, headers=export_keys)

        return data, folded_data
                TracedDataCodaIO.import_coda_to_traced_data_iterable(
                    user, data, key_of_raw, key_of_clean, f, True))
    else:
        assert coding_mode == "coding-csv", "coding_mode was not one of 'coda' or 'coding-csv'"

        # Merge manually coded CSV files into the cleaned dataset
        with open(path.join(coded_input_path, "{}.csv".format(key_of_raw)),
                  "r") as f:
            data = list(
                TracedDataCodingCSVIO.
                import_coding_csv_to_traced_data_iterable(
                    user, data, key_of_raw, key_of_clean, key_of_raw,
                    key_of_clean, f, True))

    # Write coded data back out to disk
    # The directory name is compared by value: `is not ""` tests object identity,
    # which is not a reliable string comparison and raises a SyntaxWarning on
    # Python 3.8+.
    if os.path.dirname(json_output_path) != "" and not os.path.exists(
            os.path.dirname(json_output_path)):
        os.makedirs(os.path.dirname(json_output_path))
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(data,
                                                             f,
                                                             pretty_print=True)

    # Export coded data to CSV for analysis
    if os.path.dirname(csv_output_path) != "" and not os.path.exists(
            os.path.dirname(csv_output_path)):
        os.makedirs(os.path.dirname(csv_output_path))
    with open(csv_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            data, f, headers=["avf_phone_id", key_of_raw, key_of_clean])
Пример #15
0
        os.makedirs(os.path.dirname(json_output_path))
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(data,
                                                             f,
                                                             pretty_print=True)

    # Output to a more human-friendly CSV.
    # The directory name is compared by value (!=): `is not ""` tests object identity, which only works
    # by accident of string interning and raises a SyntaxWarning on Python 3.8+.
    csv_output_dir = os.path.dirname(csv_output_path)
    if csv_output_dir != "":
        # exist_ok avoids a crash if the directory appears between the check and the creation.
        os.makedirs(csv_output_dir, exist_ok=True)
    with open(csv_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            data,
            f,
            headers=[
                "avf_phone_id",
                "{} (Run ID) - {}".format(variable_name, flow_name),
                "{} (Time) - {}".format(variable_name, flow_name),
                "{} (Text) - {}".format(variable_name, flow_name)
            ])

    # Output messages to Coda
    IOUtils.ensure_dirs_exist_for_file(coda_output_path)
    with open(coda_output_path, "w") as f:
        TracedDataCodaIO.export_traced_data_iterable_to_coda(
            data, "{} (Text) - {}".format(variable_name, flow_name), f)

    # Get 200 non-noise messages and output to CSVs for ICR.
    print("Noise items:")
    show_message_key = "{} (Text) - {}".format(variable_name, flow_name)
    not_noise = []
    for td in folded:
        d = dict()
        for key in folded_column_keys:
            if key not in td:
                d[key] = Codes.TRUE_MISSING

        td.append_data(
            d, Metadata(user, Metadata.get_call_location(), time.time()))

    # Export to CSV
    export_keys = ["avf_phone_id", "Group"]
    export_keys.extend(list(folded_column_keys))
    export_keys.extend(survey_keys)
    export_keys.extend(matrix_keys)
    export_keys.sort()

    print("Writing 1/2")
    with open(csv_by_individual_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(folded,
                                                           f,
                                                           headers=export_keys)

    print("Writing 2/2")
    # Hack an unused output field to write traced data to (for debug)
    # FIXME
    with open(csv_by_message_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(data,
                                                             f,
                                                             pretty_print=True)
        # TracedDataCSVIO.export_traced_data_iterable_to_csv(data, f, headers=export_keys)
    print("  Stopped Respondents:")
    stopped_ids = set()
    for td in all_messages:
        stop_d = dict()
        for output_key in output_keys:
            if td[output_key] == "stop":
                stopped_ids.add(td["phone_uuid"])
                for k in output_keys:
                    stop_d[k] = "stop"
                stop_d["consent_clean"] = CodeBooks.yes_no[Codes.NO]
        if "consent_clean" not in stop_d:
            stop_d["consent_clean"] = CodeBooks.yes_no[Codes.YES]
        td.append_data(
            stop_d, Metadata(user, Metadata.get_call_location(), time.time()))
    print("  " + str(len(stopped_ids)))

    output_keys.insert(2, "consent_clean")

    # Output analysis TracedData to JSON
    IOUtils.ensure_dirs_exist_for_file(json_output_path)
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(all_messages,
                                                             f,
                                                             pretty_print=True)

    # Output analysis file as CSV
    IOUtils.ensure_dirs_exist_for_file(csv_output_path)
    with open(csv_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            all_messages, f, output_keys)
Пример #18
0
    def auto_code_show_messages(cls, user, data, pipeline_configuration,
                                icr_output_dir, coda_output_dir):
        """
        Filters the show messages, logs empty-string statistics, then writes one Coda file and one ICR sample
        CSV per RQA coding plan.

        :param user: Passed to TracedDataCodaV2IO.compute_message_ids when assigning message ids.
        :param data: Messages to process; each item is indexed by key (e.g. td["uid"]) and tested with `in`.
        :param pipeline_configuration: Provides `filter_test_messages`, `project_start_date` and
                                       `project_end_date`.
        :param icr_output_dir: Directory to write the ICR CSVs to (one `plan.icr_filename` per plan).
        :param coda_output_dir: Directory to write the Coda files to (one `plan.coda_filename` per plan).
        :return: The messages remaining after the test-message, empty-message and time-range filters.
        """
        rqa_plans = PipelineConfiguration.RQA_CODING_PLANS

        # Drop the test messages sent by AVF, unless the configuration switches this off.
        if not pipeline_configuration.filter_test_messages:
            log.debug(
                "Not filtering out test messages (because the pipeline configuration json key "
                "'FilterTestMessages' was set to false)")
        else:
            data = MessageFilters.filter_test_messages(data)

        # Drop runs which don't contain a response to any week's question
        rqa_raw_fields_for_filter = [plan.raw_field for plan in rqa_plans]
        data = MessageFilters.filter_empty_messages(data,
                                                    rqa_raw_fields_for_filter)

        # Drop runs sent outwith the project start and end dates
        project_start = pipeline_configuration.project_start_date
        project_end = pipeline_configuration.project_end_date
        data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY,
                                                project_start, project_end)

        # Skipping auto-assigning noise, as an experiment on this project.
        # If it turns out we need this, uncomment this block.
        # for td in data:
        #     is_noise = True
        #     for rqa_key in cls.RQA_KEYS:
        #         if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
        #             is_noise = False
        #     td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

        # TODO: Label each message with channel keys
        # Channels.set_channel_keys(user, data, cls.SENT_ON_KEY,
        #                           pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

        # Keep only the messages which aren't noise; these feed both the Coda export and the ICR sample.
        # NOTE(review): with the tagging block above commented out, nothing in this method writes
        # cls.NOISE_KEY - confirm MessageFilters.filter_noise copes with the key being absent.
        not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY,
                                                lambda x: x)

        # Report how many RQA messages were the empty string
        log.debug(
            "Counting the number of empty string messages for each raw radio show field..."
        )
        # Several plans may share a raw field, so collect each field once, in first-seen order.
        raw_rqa_fields = []
        for field in (plan.raw_field for plan in rqa_plans):
            if field in raw_rqa_fields:
                continue
            raw_rqa_fields.append(field)
        cls.log_empty_string_stats(data, raw_rqa_fields)

        # Report how many survey messages were the empty string
        log.debug(
            "Counting the number of empty string messages for each survey field..."
        )
        raw_survey_fields = []
        for field in (plan.raw_field
                      for plan in PipelineConfiguration.SURVEY_CODING_PLANS):
            if field in raw_survey_fields:
                continue
            raw_survey_fields.append(field)
        # Keyed by uid, so only the last message seen for each uid is counted.
        latest_by_uid = {td["uid"]: td for td in data}
        cls.log_empty_string_stats(latest_by_uid.values(), raw_survey_fields)

        # Write the non-noise messages to Coda, one file per plan
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in rqa_plans:
            TracedDataCodaV2IO.compute_message_ids(user, not_noise,
                                                   plan.raw_field,
                                                   plan.id_field)

            coda_file_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_file_path, "w") as coda_file:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field,
                    {}, coda_file)

        # Write a sample of each plan's messages for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for plan in rqa_plans:
            # Only messages which actually contain this plan's raw field are candidates for the sample.
            candidates = [td for td in not_noise if plan.raw_field in td]

            # A freshly seeded generator per plan keeps the sample reproducible between runs.
            sample = ICRTools.generate_sample_for_icr(
                candidates, cls.ICR_MESSAGES_COUNT,
                random.Random(cls.ICR_SEED))

            icr_file_path = path.join(icr_output_dir, plan.icr_filename)
            with open(icr_file_path, "w") as icr_file:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    sample,
                    icr_file,
                    headers=[plan.run_id_field, plan.raw_field])

        return data
Пример #19
0
    # Read the command-line options. Each one is taken from index 0 of its parsed value
    # (presumably they are declared with nargs=1 - confirm against the parser definition).
    args = parser.parse_args()
    user = args.user[0]
    input_path = args.input[0]
    json_output_path = args.json_output[0]
    csv_output_path = args.csv_output[0]

    # Load data from JSON file
    with open(input_path, "r") as f:
        data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Filter out messages which are only 1 character long (or empty)
    data = [td for td in data if len(td["Message"]) > 1]

    # Write json output
    # The directory name is compared by value (!=): `is not ""` tests object identity, which only works
    # by accident of string interning and raises a SyntaxWarning on Python 3.8+.
    json_output_dir = os.path.dirname(json_output_path)
    if json_output_dir != "":
        # exist_ok avoids a crash if the directory appears between the check and the creation.
        os.makedirs(json_output_dir, exist_ok=True)
    with open(json_output_path, "w") as f:
        TracedDataJsonIO.export_traced_data_iterable_to_json(data,
                                                             f,
                                                             pretty_print=True)

    # Write CSV output
    csv_output_dir = os.path.dirname(csv_output_path)
    if csv_output_dir != "":
        os.makedirs(csv_output_dir, exist_ok=True)
    with open(csv_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            data, f, ["avf_phone_id", "avf_message_id", "Date", "Message"])
Пример #20
0
    def auto_code_show_messages(cls, user, data, icr_output_dir,
                                coda_output_dir):
        """
        Filters the show messages, tags noise, labels missing RQA responses, sets channel keys, then writes
        one Coda file and one ICR sample CSV per RQA coding plan.

        :param user: Recorded in the Metadata of every append_data call and passed on to
                     Channels.set_channel_keys and TracedDataCodaV2IO.compute_message_ids.
        :param data: Messages to process; each item is indexed by key, tested with `in`, and updated
                     via `append_data`.
        :param icr_output_dir: Directory to write the ICR CSVs to (one `plan.icr_filename` per plan).
        :param coda_output_dir: Directory to write the Coda files to (one `plan.coda_filename` per plan).
        :return: The messages remaining after filtering, including those tagged as noise.
        """
        rqa_plans = PipelineConfiguration.RQA_CODING_PLANS

        # Drop the test messages sent by AVF (kept when running in dev mode).
        if not PipelineConfiguration.DEV_MODE:
            data = MessageFilters.filter_test_messages(data)

        # Drop runs which don't contain a response to any week's question
        data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

        # Drop runs sent outwith the project start and end dates
        data = MessageFilters.filter_time_range(
            data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE,
            cls.PROJECT_END_DATE)

        # Tag each message as noise unless at least one of its RQA responses is judged not to be noise.
        for td in data:
            # Every RQA field present is evaluated; there is deliberately no short-circuiting.
            real_response_flags = [
                not somali.DemographicCleaner.is_noise(td[rqa_key],
                                                       min_length=10)
                for rqa_key in cls.RQA_KEYS
                if rqa_key in td
            ]
            td.append_data({cls.NOISE_KEY: not any(real_response_flags)},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Label missing data
        for td in data:
            missing_labels = dict()
            for plan in rqa_plans:
                if plan.raw_field in td:
                    # This plan's question was answered, so there is nothing to mark as missing.
                    continue

                scheme = plan.code_scheme
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    scheme,
                    scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                # The coded field holds a list of labels...
                missing_labels[plan.coded_field] = [na_label.to_dict()]

                binary_scheme = plan.binary_code_scheme
                if binary_scheme is None:
                    continue

                binary_na_label = CleaningUtils.make_label_from_cleaner_code(
                    binary_scheme,
                    binary_scheme.get_code_with_control_code(
                        Codes.TRUE_MISSING), Metadata.get_call_location())
                # ...whereas the binary coded field holds a single label.
                missing_labels[
                    plan.binary_coded_field] = binary_na_label.to_dict()

            # Appended even when empty, so every message gets an entry from this step.
            td.append_data(
                missing_labels,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Label each message with channel keys
        Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

        # Keep only the messages which aren't noise; these feed both the Coda export and the ICR sample.
        not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY,
                                                lambda x: x)

        # Write the non-noise messages to Coda, one file per plan
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in rqa_plans:
            TracedDataCodaV2IO.compute_message_ids(user, not_noise,
                                                   plan.raw_field,
                                                   plan.id_field)

            coda_file_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_file_path, "w") as coda_file:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field,
                    {}, coda_file)

        # Write a sample of each plan's messages for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for plan in rqa_plans:
            candidates = []
            for td in not_noise:
                # This test works because the only codes which have been applied at this point are TRUE_MISSING.
                # If any other coding is done above, this test will need to change.
                if plan.coded_field in td:
                    # Already labelled, so it must carry exactly the one TRUE_MISSING label set above.
                    assert len(td[plan.coded_field]) == 1
                    assert td[plan.coded_field][0]["CodeID"] == \
                        plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
                    continue
                candidates.append(td)

            # A freshly seeded generator per plan keeps the sample reproducible between runs.
            sample = ICRTools.generate_sample_for_icr(
                candidates, cls.ICR_MESSAGES_COUNT,
                random.Random(cls.ICR_SEED))

            icr_file_path = path.join(icr_output_dir, plan.icr_filename)
            with open(icr_file_path, "w") as icr_file:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    sample,
                    icr_file,
                    headers=[plan.run_id_field, plan.raw_field])

        return data
Пример #21
0
        if START_TIME <= utc_time <= END_TIME:
            inside_time_window.append(td)
        else:
            print("Dropping: {}".format(utc_time))

    print("{}:{} Dropped as outside time/Total".format(
        len(show_messages) - len(inside_time_window), len(show_messages)))
    show_messages = inside_time_window

    # Output messages to a CSV file
    IOUtils.ensure_dirs_exist_for_file(csv_output_path)
    run_id_key = "{} (Run ID) - {}".format(variable_name, flow_name)
    raw_text_key = "{} (Text) - {}".format(variable_name, flow_name)
    with open(csv_output_path, "w") as f:
        TracedDataCSVIO.export_traced_data_iterable_to_csv(
            show_messages,
            f,
            headers=["avf_phone_id", run_id_key, raw_text_key])

    # Output messages to Coda
    IOUtils.ensure_dirs_exist_for_file(coda_output_path)
    if os.path.exists(prev_coda_path):
        # TODO: Modifying this line once the coding frame has been developed to include lots of Nones feels a bit
        # TODO: cumbersome. We could instead modify export_traced_data_iterable_to_coda to support a prev_f argument.
        # TODO: Modify by adding code scheme keys once they are ready
        scheme_keys = {
            "Relevance": None,
            "Code 1": None,
            "Code 2": None,
            "Code 3": None,
            "Code 4": None
        }
    TOTAL_FGD_CONTACTS = 100

    # Load phone uuid table
    with open(phone_uuid_table_path, "r") as f:
        phone_uuids = PhoneNumberUuidTable.load(f)

    # Load FGD/CC survey responses
    with open(fgd_cc_input_path, "r") as f:
        fgd_cc_data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Load the previous export
    prev_exports = []
    if prev_exports_path is not None:
        with open(prev_exports_path, "r") as f:
            prev_exports = list(
                TracedDataCSVIO.import_csv_to_traced_data_iterable(user, f))

    # Load coded demog surveys
    with open(demog_surveys_input_path, "r") as f:
        surveys = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

    # Filter out people who haven't answered the fgd_cc consent question
    fgd_cc_consent_key = "Response_1 (Category) - wt_fgd_cc"
    fgd_cc_data = [td for td in fgd_cc_data if fgd_cc_consent_key in td]

    # Filter out people that we have exported in the past
    prev_contacts = {td["Phone Number"] for td in prev_exports}
    fgd_cc_data = [
        td for td in fgd_cc_data if "+{}".format(
            phone_uuids.get_phone(td["avf_phone_id"])) not in prev_contacts
    ]