def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Auto-code remaining data
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(user, data, plan.raw_field, plan.coded_field,
                                                                plan.cleaner, plan.code_scheme)

    # Output survey answers to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f
            )
    print("Coda demogs files successfully exported")

    return data
def auto_code_surveys(cls, user, data, coda_output_dir):
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, cc.coded_field, cc.cleaner, cc.code_scheme)

    # Output single-scheme answers to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)

    # Note: no need to handle location in any special way on this project because it is not being auto-coded.

    return data
def _build_message_to_s01e02_dict(cls, user, data, coda_input_dir):
    # Duplicate the input list because reading the file requires appending data to the TracedData,
    # and we don't actually want to modify the input at this stage of the pipeline.
    data = [td.copy() for td in data]

    # Apply the week 3 codes from Coda.
    message_id_key = "radio_show_3_message_id"
    coded_ws_key = "radio_show_3_ws"
    TracedDataCodaV2IO.compute_message_ids(user, data, cls.WEEK_3_VALUE_KEY, message_id_key)
    coda_input_path = path.join(coda_input_dir, "s01e03.json")
    with open(coda_input_path) as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            user, data, message_id_key, {coded_ws_key: CodeSchemes.WS_CORRECT_DATASET}, f)

    # Parse the loaded codes into a look-up table of raw message string -> is-ws boolean.
    message_to_ws_dict = dict()
    for td in data:
        label = td.get(coded_ws_key)
        if label is not None:
            message_to_ws_dict[td.get(cls.WEEK_3_VALUE_KEY)] = \
                label["CodeID"] == CodeSchemes.WS_CORRECT_DATASET.get_code_with_match_value("s01e02").code_id

    return message_to_ws_dict
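# A hypothetical sketch of how the look-up table above might be consumed: move week-3
# messages which coders flagged as really belonging to s01e02 into the week-2 field.
# WEEK_2_VALUE_KEY and this helper are assumptions for illustration; they do not appear
# in the snippet above.
@classmethod
def _move_ws_messages_to_s01e02(cls, user, data, coda_input_dir):
    message_to_s01e02 = cls._build_message_to_s01e02_dict(user, data, coda_input_dir)
    for td in data:
        raw_message = td.get(cls.WEEK_3_VALUE_KEY)
        if raw_message is not None and message_to_s01e02.get(raw_message, False):
            # Copy the misfiled message into the week-2 field (assumed key name).
            td.append_data({cls.WEEK_2_VALUE_KEY: raw_message},
                           Metadata(user, Metadata.get_call_location(), time.time()))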
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(
        data, cls.SENT_ON_KEY, PipelineConfiguration.PROJECT_START_DATE, PipelineConfiguration.PROJECT_END_DATE)

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Output RQA and follow-up survey messages to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output RQA and follow-up messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        # Take every run which contains a raw response to this plan's question.
        rqa_and_follow_up_messages = []
        for td in data:
            if plan.raw_field in td:
                rqa_and_follow_up_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_and_follow_up_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def export_coda(cls, user, data, coda_output_dir):
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)
def export_coda(cls, user, data, coda_output_dir):
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        for td in data:
            if plan.raw_field in td:
                td.append_data({plan.id_field: plan.message_id_fn(td)},
                               Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)
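# Unlike the variant above it, this export_coda derives each message id from a per-plan
# message_id_fn rather than TracedDataCodaV2IO.compute_message_ids. A minimal sketch of
# what such a function might look like; the SHA-256-of-raw-text choice is an assumption,
# not something shown in these snippets:
import hashlib

def sha256_message_id_fn(raw_field):
    def message_id_fn(td):
        # Deterministic, so re-running the pipeline produces stable Coda message ids.
        return hashlib.sha256(td[raw_field].encode("utf-8")).hexdigest()
    return message_id_fn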
def auto_code_surveys(cls, user, data, icr_output_dir, coda_output_dir):
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme)

    # Output single-scheme answers to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        survey_messages = []
        for td in data:
            if plan.raw_field in td:
                survey_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            survey_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    """
    Merges manually coded radio show files into the cleaned dataset.
    """
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        test_pipeline_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, test_pipeline_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, test_pipeline_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Merge manually coded survey files into the cleaned dataset
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        f = None
        try:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    return data
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE, cls.PROJECT_END_DATE)

    # Tag messages which are noise as being noise
    for td in data:
        is_noise = True
        for rqa_key in cls.RQA_KEYS:
            if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
                is_noise = False
        td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            # This test works because the only codes which have been applied at this point are TRUE_MISSING.
            # If any other coding is done above, this test will need to change.
            if plan.coded_field not in td:
                rqa_messages.append(td)
            else:
                assert len(td[plan.coded_field]) == 1
                assert td[plan.coded_field][0]["CodeID"] == \
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded data into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)
        for cc in plan.coding_configurations:
            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")

                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
                else:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

    # Label data for which there is no response as TRUE_MISSING.
    # Label data for which the response is the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td:
                for cc in plan.coding_configurations:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = \
                        na_label if cc.coding_mode == CodingModes.SINGLE else [na_label]
            elif td[plan.raw_field] == "":
                for cc in plan.coding_configurations:
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = \
                        nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Mark data that is noise as Codes.NOT_CODED
    for td in data:
        if td.get("noise", False):
            nc_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.coded_field not in td:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()).to_dict()
                        nc_dict[cc.coded_field] = \
                            nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
            td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Run code imputation functions
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.code_imputation_function is not None:
            plan.code_imputation_function(user, data, plan.coding_configurations)

    cls._impute_coding_error_codes(user, data)

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)
        print(coda_input_path)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # At this point, the TracedData objects still contain messages for at most one week each.
    # Label the weeks for which there is no response as TRUE_MISSING.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only
    # labelled if there is an additional reason given. Importing those two schemes separately above caused the
    # labels in each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was*
    # reviewed. This block updates the reasons scheme in cases where only a binary label was set, by assigning
    # the label 'NC' if the binary label was set to a normal code, otherwise the same control code as the binary.
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        if plan.binary_code_scheme is not None:
            for td in rqa_messages:
                binary_label = td[plan.binary_coded_field]
                binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                binary_label_present = binary_label["CodeID"] != \
                    plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id
                reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0]["CodeID"] != \
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                if binary_label_present and not reasons_label_present:
                    if binary_code.code_type == "Control":
                        control_code = binary_code.control_code
                        reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                        reasons_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, reasons_code, Metadata.get_call_location(),
                            origin_name="Pipeline Code Synchronisation")
                        td.append_data(
                            {plan.coded_field: [reasons_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                        )
                    else:
                        assert binary_code.code_type == "Normal"

                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation"
                        )
                        td.append_data(
                            {plan.coded_field: [nc_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                        )

    # Not everyone will have answered all of the demographic flows.
    # Label demographic questions which had no responses as TRUE_MISSING.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    return data
def test_export_import_one_single_coded_scheme(self):
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Build raw input data
    message_dicts = [
        {"gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00"},
        {"gender_raw": "", "gender_sent_on": "2018-11-01T07:17:04+03:00"},
        {"gender_raw": "hiya", "gender_sent_on": "2018-11-01T07:19:04+05:00"},
        {},
        {"gender_raw": "boy", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
        {"gender_raw": "man", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
    ]
    messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                for i, d in enumerate(message_dicts)]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

    # Load gender scheme
    with open("tests/traced_data/resources/coda_2_gender_scheme.json") as f:
        gender_scheme = CodeScheme.from_firebase_map(json.load(f))

    # Apply the English gender cleaner
    with mock.patch("core_data_modules.util.TimeUtils.utc_now_as_iso_string") as time_mock, \
            mock.patch("core_data_modules.traced_data.Metadata.get_function_location") as location_mock:
        time_mock.return_value = "2018-11-02T15:00:07+00:00"
        location_mock.return_value = "english.DemographicCleaner.clean_gender"

        CleaningUtils.apply_cleaner_to_traced_data_iterable(
            "test_user", messages, "gender_raw", "gender_coded",
            english.DemographicCleaner.clean_gender, gender_scheme
        )

    # Export to a Coda 2 messages file
    with open(file_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_one_scheme.json"))

    # Test importing with no file available
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
        "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

    # Deliberately testing the read can be done twice
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
        "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

    na_id = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
    nr_id = gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("gender_raw", "") == "":
            td.append_data({"gender_coded": na_label.to_dict()},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]
    self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

    # Test importing from the test file
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    with open("tests/traced_data/resources/coda_2_import_test_one_scheme.json", "r") as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme}, f)

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("gender_raw", "") == "":
            td.append_data({"gender_coded": na_label.to_dict()},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]
    expected_code_ids = [
        gender_scheme.get_code_with_match_value("female").code_id,             # Manually approved auto-code
        gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # Empty raw message
        gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually assigned code which isn't checked
        gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # No raw message
        gender_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id,     # Manually Not Coded
        gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually un-coded
    ]
    self.assertListEqual(imported_code_ids, expected_code_ids)

    # Add an element with the same raw text but a conflicting label
    messages.append(TracedData({
        "gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00",
        "gender_coded": CleaningUtils.make_label_from_cleaner_code(
            gender_scheme, gender_scheme.get_code_with_match_value("male"),
            "make_location_label", date_time_utc="2018-11-03T13:40:50Z").to_dict()
    }, Metadata("test_user", Metadata.get_call_location(), time.time())))

    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

    with open(file_path, "w") as f:
        try:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
        except AssertionError as e:
            assert str(e) == "Messages with the same id " \
                             "(cf2e5bff1ef03dcd20d1a0b18ef7d89fc80a3554434165753672f6f40fde1d25) have different " \
                             "labels for coded_key 'gender_coded'"
            return
        self.fail("Exporting data with conflicting labels did not fail")
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Mark data that is noise as Codes.NOT_CODED
    for td in data:
        if td["noise"]:
            nc_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.coded_field not in td:
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location()
                    )
                    nc_dict[plan.coded_field] = [nc_label.to_dict()]

                    if plan.binary_code_scheme is not None:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme,
                            plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()
                        )
                        nc_dict[plan.binary_coded_field] = nc_label.to_dict()
            td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Merge manually coded survey files into the cleaned dataset
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        f = None
        try:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Set district/region/state/zone codes from the coded district field.
    for td in data:
        # Up to 1 location code should have been assigned in Coda. Search for that code, ensuring that only 1
        # has been assigned or, if multiple have been assigned, that they are non-conflicting control codes.
        location_code = None
        for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
            coda_code = plan.code_scheme.get_code_with_id(td[plan.coded_field]["CodeID"])
            if location_code is not None:
                if not (coda_code.code_id == location_code.code_id or
                        coda_code.control_code == Codes.NOT_REVIEWED):
                    location_code = Code(None, "Control", None, None, None, None,
                                         control_code=Codes.NOT_INTERNALLY_CONSISTENT)
            elif coda_code.control_code != Codes.NOT_REVIEWED:
                location_code = coda_code

        # If no code was found, then this location is still not reviewed.
        # Synthesise a NOT_REVIEWED code accordingly.
        if location_code is None:
            location_code = Code(None, "Control", None, None, None, None, control_code=Codes.NOT_REVIEWED)

        # If a control code was found, set all other location keys to that control code,
        # otherwise convert the provided location to the other locations in the hierarchy.
        if location_code.code_type == "Control":
            for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
                td.append_data({
                    plan.coded_field: CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(location_code.control_code),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), time.time()))
        else:
            location = location_code.match_values[0]
            td.append_data({
                "mogadishu_sub_district_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    cls.make_location_code(CodeSchemes.MOGADISHU_SUB_DISTRICT,
                                           SomaliaLocations.mogadishu_sub_district_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "district_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.DISTRICT,
                    cls.make_location_code(CodeSchemes.DISTRICT,
                                           SomaliaLocations.district_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "region_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.REGION,
                    cls.make_location_code(CodeSchemes.REGION,
                                           SomaliaLocations.region_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "state_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.STATE,
                    cls.make_location_code(CodeSchemes.STATE,
                                           SomaliaLocations.state_for_location_code(location)),
                    Metadata.get_call_location()).to_dict(),
                "zone_coded": CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.ZONE,
                    cls.make_location_code(CodeSchemes.ZONE,
                                           SomaliaLocations.zone_for_location_code(location)),
                    Metadata.get_call_location()).to_dict()
            }, Metadata(user, Metadata.get_call_location(), time.time()))

    return data
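# make_location_code is called above but not shown in these snippets. A minimal sketch,
# assuming the SomaliaLocations helpers return either a control code constant (e.g.
# Codes.NOT_CODED) or a match value that exists in the given scheme, mirroring the
# make_location_label helper in the tests further down:
@classmethod
def make_location_code(cls, scheme, clean_value):
    if clean_value in {Codes.NOT_CODED, Codes.TRUE_MISSING, Codes.SKIPPED}:
        return scheme.get_code_with_control_code(clean_value)
    return scheme.get_code_with_match_value(clean_value)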
def auto_code_show_messages(cls, user, data, pipeline_configuration, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF.
    if pipeline_configuration.filter_test_messages:
        data = MessageFilters.filter_test_messages(data)
    else:
        log.debug("Not filtering out test messages (because the pipeline configuration json key "
                  "'FilterTestMessages' was set to false)")

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(
        data, [plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS])

    # Filter out runs sent outwith the project start and end dates
    data = MessageFilters.filter_time_range(
        data, cls.SENT_ON_KEY, pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

    # Skipping auto-assigning noise, as an experiment on this project.
    # If it turns out we need this, uncomment this block.
    # for td in data:
    #     is_noise = True
    #     for rqa_key in cls.RQA_KEYS:
    #         if rqa_key in td and not somali.DemographicCleaner.is_noise(td[rqa_key], min_length=10):
    #             is_noise = False
    #     td.append_data({cls.NOISE_KEY: is_noise}, Metadata(user, Metadata.get_call_location(), time.time()))

    # TODO: Label each message with channel keys
    # Channels.set_channel_keys(user, data, cls.SENT_ON_KEY,
    #                           pipeline_configuration.project_start_date, pipeline_configuration.project_end_date)

    # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
    not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY, lambda x: x)

    # Compute the number of RQA messages that were the empty string
    log.debug("Counting the number of empty string messages for each raw radio show field...")
    raw_rqa_fields = []
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        if plan.raw_field not in raw_rqa_fields:
            raw_rqa_fields.append(plan.raw_field)
    cls.log_empty_string_stats(data, raw_rqa_fields)

    # Compute the number of survey messages that were the empty string
    log.debug("Counting the number of empty string messages for each survey field...")
    raw_survey_fields = []
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.raw_field not in raw_survey_fields:
            raw_survey_fields.append(plan.raw_field)
    survey_data = dict()
    for td in data:
        survey_data[td["uid"]] = td
    cls.log_empty_string_stats(survey_data.values(), raw_survey_fields)

    # Output messages which aren't noise to Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, not_noise, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f)

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = []
        for td in not_noise:
            if plan.raw_field in td:
                rqa_messages.append(td)

        icr_messages = ICRTools.generate_sample_for_icr(
            rqa_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field])

    return data
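# log_empty_string_stats is called above but not defined in these snippets. A minimal
# sketch, assuming it only logs one count per raw field:
@classmethod
def log_empty_string_stats(cls, data, raw_fields):
    for raw_field in raw_fields:
        # Count the objects whose raw value is present but the empty string.
        empty_string_count = sum(1 for td in data if td.get(raw_field) == "")
        log.debug(f"{raw_field}: {empty_string_count} empty string message(s)")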
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded radio show files into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        coda_input_path = path.join(coda_input_dir, plan.coda_filename)

        f = None
        try:
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                if f is not None:
                    f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, rqa_messages, plan.id_field,
                    {plan.binary_coded_field: plan.binary_code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Label the RQAs for which there is no response yet as TRUE_MISSING,
    # and those where the response was the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
            elif td[plan.raw_field] == "":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = [nc_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = nc_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only
    # labelled if there is an additional reason given. Importing those two schemes separately above caused the
    # labels in each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was*
    # reviewed. This block updates the reasons scheme in cases where only a binary label was set, by assigning
    # the label 'NC' if the binary label was set to a normal code, otherwise the same control code as the binary.
    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        rqa_messages = [td for td in data if plan.raw_field in td]
        if plan.binary_code_scheme is not None:
            for td in rqa_messages:
                binary_label = td[plan.binary_coded_field]
                binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                binary_label_present = binary_label["CodeID"] != \
                    plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id
                reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0]["CodeID"] != \
                    plan.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

                if binary_label_present and not reasons_label_present:
                    if binary_code.code_type == "Control":
                        control_code = binary_code.control_code
                        reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                        reasons_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, reasons_code, Metadata.get_call_location(),
                            origin_name="Pipeline Code Synchronisation")
                        td.append_data(
                            {plan.coded_field: [reasons_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))
                    else:
                        assert binary_code.code_type == "Normal"

                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")
                        td.append_data(
                            {plan.coded_field: [nc_label.to_dict()]},
                            Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

    # Merge manually coded demog and follow-up survey files into the cleaned dataset.
    # The recursion depth currently exceeds the default limit while importing these files.
    # TODO: Investigate/address the cause of this.
    sys.setrecursionlimit(10000)
    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        f = None
        try:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            if path.exists(coda_input_path):
                f = open(coda_input_path, "r")
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
        finally:
            if f is not None:
                f.close()

    # Not everyone will have answered all of the demographic and follow-up survey flows.
    # Label demographic and follow-up survey questions which had no responses as TRUE_MISSING.
    # Label data which is just the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = na_label.to_dict()
            elif td[plan.raw_field] == "":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = nc_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Set county/constituency codes from the coded constituency field.
    cls._impute_location_codes(user, data)

    # Set coding error codes using the coding error field.
    cls._impute_coding_error_codes(user, data)

    return data
def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
    # Filter out test messages sent by AVF.
    if not PipelineConfiguration.DEV_MODE:
        data = MessageFilters.filter_test_messages(data)

    # Filter for runs which don't contain a response to any week's question
    data = MessageFilters.filter_empty_messages(data, cls.TEST_KEYS)

    # Filter out runs sent outside the project start and end dates
    data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE, cls.PROJECT_END_DATE)

    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
            if plan.raw_field not in td:
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location()
                )
                missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Label each message with channel keys
    Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

    # Output messages for Coda
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f
            )
    print("Coda message files successfully exported")

    # Output messages for ICR
    IOUtils.ensure_dirs_exist(icr_output_dir)
    for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
        test_pipeline_messages = []
        for td in data:
            if plan.coded_field not in td:
                test_pipeline_messages.append(td)
            else:
                assert len(td[plan.coded_field]) == 1
                assert td[plan.coded_field][0]["CodeID"] == \
                    plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

        icr_messages = ICRTools.generate_sample_for_icr(
            test_pipeline_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))

        icr_output_path = path.join(icr_output_dir, plan.icr_filename)
        with open(icr_output_path, "w") as f:
            TracedDataCSVIO.export_traced_data_iterable_to_csv(
                icr_messages, f, headers=[plan.run_id_field, plan.raw_field]
            )
    print("ICR files successfully exported")

    return data
def apply_manual_codes(cls, user, data, coda_input_dir):
    # Merge manually coded data into the cleaned dataset
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        coda_input_path = path.join(coda_input_dir, plan.coda_filename)
        for cc in plan.coding_configurations:
            if not cc.requires_manual_verification:
                continue

            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")

                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
                else:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, plan.id_field, {cc.coded_field: cc.code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        if PipelineConfiguration.WS_CORRECT_DATASET_SCHEME is not None:
            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, plan.id_field,
                    {f"{plan.raw_field}_correct_dataset": PipelineConfiguration.WS_CORRECT_DATASET_SCHEME}, f)
            finally:
                if f is not None:
                    f.close()

    # Label data for which there is no response as TRUE_MISSING.
    # Label data for which the response is the empty string as NOT_CODED.
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                if raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = \
                        na_label if cc.coding_mode == CodingModes.SINGLE else [na_label]

            for cc in plan.coding_configurations:
                raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                if td.get(raw_field) == "":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location()).to_dict()
                    missing_dict[cc.coded_field] = \
                        nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Mark data that is noise as Codes.NOT_CODED
    for td in data:
        if td.get("noise", False):
            nc_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                for cc in plan.coding_configurations:
                    if cc.coded_field not in td:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()).to_dict()
                        nc_dict[cc.coded_field] = \
                            nc_label if cc.coding_mode == CodingModes.SINGLE else [nc_label]
            td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Run the cleaners that don't require manual verification again, this time setting "checked" to True
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None and not cc.requires_manual_verification:
                raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, raw_field, cc.coded_field, cc.cleaner, cc.code_scheme, set_checked=True)

    # Run code imputation functions
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.code_imputation_function is not None:
            plan.code_imputation_function(user, data, plan.coding_configurations)

    cls._impute_coding_error_codes(user, data)

    return data
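# _impute_coding_error_codes is called by several apply_manual_codes variants but is not
# shown in these snippets. A speculative sketch only: it assumes the trigger is a
# CODING_ERROR label in the "_correct_dataset" field (as set by move_wrong_scheme_messages
# below) and that every coding configuration for that plan is then overwritten:
@classmethod
def _impute_coding_error_codes(cls, user, data):
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            ws_label = td.get(f"{plan.raw_field}_correct_dataset")
            if ws_label is None:
                continue

            ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(ws_label["CodeID"])
            if ws_code.control_code != Codes.CODING_ERROR:
                continue

            # Overwrite each of this plan's coded fields with a CODING_ERROR label.
            coding_error_dict = dict()
            for cc in plan.coding_configurations:
                ce_label = CleaningUtils.make_label_from_cleaner_code(
                    cc.code_scheme, cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                    Metadata.get_call_location()).to_dict()
                coding_error_dict[cc.coded_field] = \
                    ce_label if cc.coding_mode == CodingModes.SINGLE else [ce_label]
            td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))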
def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
    # Label missing data
    for td in data:
        missing_dict = dict()
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if td.get(plan.raw_field, "") == "":
                na_label = CleaningUtils.make_label_from_cleaner_code(
                    plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                    Metadata.get_call_location())
                missing_dict[plan.coded_field] = na_label.to_dict()
        td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Auto-code remaining data
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.cleaner is not None:
            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                user, data, plan.raw_field, plan.coded_field, plan.cleaner, plan.code_scheme)

    # For any locations where the cleaners assigned a code to a sub-district, set the district code to NC
    # (this is because only one column should have a value set in Coda)
    for td in data:
        if "mogadishu_sub_district_coded" in td:
            mogadishu_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
            if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(mogadishu_code_id).code_type == "Normal":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                )
                td.append_data({"district_coded": nc_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(), time.time()))

    # Set operator from phone number
    for td in data:
        operator_clean = PhoneCleaner.clean_operator(phone_uuid_table.get_phone(td["uid"]))
        if operator_clean == Codes.NOT_CODED:
            label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.OPERATOR, CodeSchemes.OPERATOR.get_code_with_control_code(Codes.NOT_CODED),
                Metadata.get_call_location())
        else:
            label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.OPERATOR, CodeSchemes.OPERATOR.get_code_with_match_value(operator_clean),
                Metadata.get_call_location())
        td.append_data({"operator_coded": label.to_dict()},
                       Metadata(user, Metadata.get_call_location(), time.time()))

    # Output single-scheme answers to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.raw_field == "mogadishu_sub_district_raw":
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {plan.coded_field: plan.code_scheme}, f)

    # Output the location scheme to Coda for manual verification + coding
    output_path = path.join(coda_output_dir, "location.json")
    TracedDataCodaV2IO.compute_message_ids(user, data, "mogadishu_sub_district_raw",
                                           "mogadishu_sub_district_raw_id")
    with open(output_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            data, "mogadishu_sub_district_raw", "mogadishu_sub_district_time", "mogadishu_sub_district_raw_id",
            {
                "mogadishu_sub_district_coded": CodeSchemes.MOGADISHU_SUB_DISTRICT,
                "district_coded": CodeSchemes.DISTRICT,
                "region_coded": CodeSchemes.REGION,
                "state_coded": CodeSchemes.STATE,
                "zone_coded": CodeSchemes.ZONE
            }, f)

    return data
def test_export_two_single_coded_schemes(self):
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Load schemes
    with open("tests/traced_data/resources/coda_2_district_scheme.json") as f:
        district_scheme = CodeScheme.from_firebase_map(json.load(f))
    with open("tests/traced_data/resources/coda_2_zone_scheme.json") as f:
        zone_scheme = CodeScheme.from_firebase_map(json.load(f))

    def make_location_label(scheme, value):
        if value in {Codes.TRUE_MISSING, Codes.SKIPPED, Codes.NOT_CODED}:
            code = scheme.get_code_with_control_code(value)
        else:
            code = scheme.get_code_with_match_value(value)
        return CleaningUtils.make_label_from_cleaner_code(
            scheme, code, "make_location_label", date_time_utc="2018-11-02T13:40:50Z").to_dict()

    # Build raw input data
    message_dicts = [
        # Normal, coded data
        {"location_raw": "mog", "location_sent_on": "2018-11-01T07:13:04+03:00",
         "district": make_location_label(district_scheme, "mogadishu"),
         "zone": make_location_label(zone_scheme, "scz")},
        # Data coded under one scheme only
        {"location_raw": "kismayo", "location_sent_on": "2018-11-01T07:17:04+03:00",
         "district": make_location_label(district_scheme, "kismayo")},
        # Data coded as missing under both schemes
        {"location_raw": "", "location_sent_on": "2018-11-01T07:19:04+05:00",
         "district": make_location_label(district_scheme, Codes.TRUE_MISSING),
         "zone": make_location_label(zone_scheme, Codes.TRUE_MISSING)},
        # No data
        {},
        # Data coded as NC under both schemes
        {"location_raw": "kismyo", "location_sent_on": "2018-11-01T07:19:30+03:00",
         "district": make_location_label(district_scheme, Codes.NOT_CODED),
         "zone": make_location_label(zone_scheme, Codes.NOT_CODED)},
        # Data coded as NC under one scheme only
        {"location_raw": "kismay", "location_sent_on": "2018-11-01T07:19:30+03:00",
         "district": make_location_label(district_scheme, "kismayo"),
         "zone": make_location_label(zone_scheme, Codes.NOT_CODED)},
    ]

    messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                for i, d in enumerate(message_dicts)]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "location_raw", "location_coda_id")

    # Export to a Coda 2 messages file
    with open(file_path, "w") as f:
        scheme_key_map = collections.OrderedDict()  # Using OrderedDict to make tests easier to write in Py2 and Py3.
        scheme_key_map["district"] = district_scheme
        scheme_key_map["zone"] = zone_scheme
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "location_raw", "location_sent_on", "location_coda_id", scheme_key_map, f)

    self.assertTrue(
        filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_multiple_schemes.json"))
def auto_code_surveys(cls, user, data, pipeline_configuration, coda_output_dir):
    # Auto-code surveys
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        for cc in plan.coding_configurations:
            if cc.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, cc.coded_field, cc.cleaner, cc.code_scheme)

    # Remove survey data sent after the project finished
    log.info("Hiding survey messages sent after the end of the project. These will not be exported in "
             "production/analysis files")
    out_of_range_count = 0
    for td in data:
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            # TODO: Come up with a better solution here e.g. separate DEMOG/SURVEY lists
            if plan.raw_field in ["have_voice_raw", "suggestions_raw"]:
                continue
            if plan.time_field in td and isoparse(td[plan.time_field]) > pipeline_configuration.project_end_date:
                out_of_range_count += 1
                td.hide_keys({plan.raw_field, plan.time_field},
                             Metadata(user, Metadata.get_call_location(), time.time()))
    log.info(f"Hid {out_of_range_count} survey messages sent after the end of the project")

    # For any locations where the cleaners assigned a code to a sub-district, set the district code to NC
    # (this is because only one column should have a value set in Coda)
    for td in data:
        if "mogadishu_sub_district_coded" in td:
            mogadishu_code_id = td["mogadishu_sub_district_coded"]["CodeID"]
            if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(mogadishu_code_id).code_type == "Normal":
                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                )
                td.append_data({"district_coded": nc_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(), time.time()))

    # Output survey responses to Coda for manual verification + coding
    IOUtils.ensure_dirs_exist(coda_output_dir)
    for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

        coda_output_path = path.join(coda_output_dir, plan.coda_filename)
        with open(coda_output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, plan.raw_field, plan.time_field, plan.id_field,
                {cc.coded_field: cc.code_scheme for cc in plan.coding_configurations}, f)

    return data
def test_export_import_one_multi_coded_scheme(self):
    file_path = path.join(self.test_dir, "coda_2_test.json")

    # Build raw input data
    message_dicts = [
        {"msg_raw": "food", "msg_sent_on": "2018-11-01T07:13:04+03:00"},
        {"msg_raw": "", "msg_sent_on": "2018-11-01T07:17:04+03:00"},
        {"msg_raw": "food + water", "msg_sent_on": "2018-11-01T07:19:04+05:00"},
        {},
        {"msg_raw": "water", "msg_sent_on": "2018-11-02T19:00:29+03:00"},
        {"msg_raw": "abcd", "msg_sent_on": "2018-11-02T20:30:45+03:00"}
    ]
    messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                for i, d in enumerate(message_dicts)]

    # Add message ids
    TracedDataCodaV2IO.compute_message_ids("test_user", messages, "msg_raw", "msg_coda_id")

    # Load the message code scheme
    with open("tests/traced_data/resources/coda_2_msg_scheme.json") as f:
        msg_scheme = CodeScheme.from_firebase_map(json.load(f))

    # Export to a Coda 2 messages file
    with open(file_path, "w") as f:
        TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
            messages, "msg_raw", "msg_sent_on", "msg_coda_id", {"msg_coded": msg_scheme}, f)
    self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_multi_coded.json"))

    # Test importing with no file available
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

    # Deliberately testing the read can be done twice
    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
        "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

    na_id = msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
    nr_id = msg_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            msg_scheme, msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("msg_raw", "") == "":
            td.append_data({"msg_coded": [na_label.to_dict()]},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    for td in imported_messages:
        self.assertEqual(len(td["msg_coded"]), 1)
    imported_code_ids = [td["msg_coded"][0]["CodeID"] for td in imported_messages]
    self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

    # Test importing from the test file
    imported_messages = []
    for td in messages:
        imported_messages.append(td.copy())
    with open("tests/traced_data/resources/coda_2_import_test_multi_coded.json", "r") as f:
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)

        # Test that reading the same file-pointer twice without moving it back to the start of the file fails
        try:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)
            self.fail("Re-using the same file pointer didn't raise an assertion error")
        except AssertionError as e:
            self.assertEqual(str(e),
                             "File-pointer not at byte 0. "
                             "Should you have used e.g. `f.seek(0)` before calling this method?")

    # Set TRUE_MISSING codes
    for td in imported_messages:
        na_label = CleaningUtils.make_label_from_cleaner_code(
            msg_scheme, msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
            "test_export_traced_data_iterable_to_coda_2", date_time_utc="2018-11-02T10:00:00+00:00"
        )
        if td.get("msg_raw", "") == "":
            td.append_data({"msg_coded": [na_label.to_dict()]},
                           Metadata("test_user", Metadata.get_call_location(), time.time()))

    imported_code_ids = []
    for td in imported_messages:
        imported_code_ids.append([code["CodeID"] for code in td["msg_coded"]])

    expected_code_ids = [
        [msg_scheme.get_code_with_match_value("food").code_id],
        [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
        [msg_scheme.get_code_with_match_value("food").code_id, msg_scheme.get_code_with_match_value("water").code_id],
        [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
        [msg_scheme.get_code_with_match_value("water").code_id],
        [msg_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id]
    ]
    for x, y in zip(imported_code_ids, expected_code_ids):
        self.assertEqual(len(x), len(y))
        self.assertSetEqual(set(x), set(y))
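# Illustrative sketch (not part of the library): the re-read behaviour exercised at the end of
# this test hinges on the importer refusing to parse from a file handle that has already been
# advanced. A hypothetical guard with the same failure mode might look like this; the real
# TracedDataCodaV2IO implementation may differ.
import json


def read_coda_messages(f):
    # Refuse to parse unless the caller has rewound the handle, so that a second import from
    # the same file fails loudly instead of silently reading zero messages from EOF.
    assert f.tell() == 0, "File-pointer not at byte 0. " \
                          "Should you have used e.g. `f.seek(0)` before calling this method?"
    return json.load(f)


# Callers that need to import twice from the same handle must rewind in between:
#     with open("messages.json") as f:
#         first = read_coda_messages(f)
#         f.seek(0)
#         second = read_coda_messages(f)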
def move_wrong_scheme_messages(user, data, coda_input_dir):
    log.info("Importing manually coded Coda files to '_WS' fields...")
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.coda_filename is None:
            continue

        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, f"{plan.id_field}_WS")

        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, f"{plan.id_field}_WS",
                {f"{plan.raw_field}_WS_correct_dataset": PipelineConfiguration.WS_CORRECT_DATASET_SCHEME}, f)

        for cc in plan.coding_configurations:
            with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
                if cc.coding_mode == CodingModes.SINGLE:
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, f"{plan.id_field}_WS", {f"{cc.coded_field}_WS": cc.code_scheme}, f)
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                        user, data, f"{plan.id_field}_WS", {f"{cc.coded_field}_WS": cc.code_scheme}, f)

    log.info("Checking for WS Coding Errors...")
    # Check for coding errors
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            rqa_codes = []
            for cc in plan.coding_configurations:
                if cc.coding_mode == CodingModes.SINGLE:
                    if f"{cc.coded_field}_WS" in td:
                        label = td[f"{cc.coded_field}_WS"]
                        rqa_codes.append(cc.code_scheme.get_code_with_code_id(label["CodeID"]))
                else:
                    assert cc.coding_mode == CodingModes.MULTIPLE
                    for label in td.get(f"{cc.coded_field}_WS", []):
                        rqa_codes.append(cc.code_scheme.get_code_with_code_id(label["CodeID"]))

            has_ws_code_in_code_scheme = False
            for code in rqa_codes:
                if code.control_code == Codes.WRONG_SCHEME:
                    has_ws_code_in_code_scheme = True

            has_ws_code_in_ws_scheme = False
            if f"{plan.raw_field}_WS_correct_dataset" in td:
                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                has_ws_code_in_ws_scheme = ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED

            if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                log.warning(f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}")
                coding_error_dict = {
                    f"{plan.raw_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME,
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_control_code(
                                Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()
                }
                td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Construct a map from WS normal code id to the raw field that code indicates a requested move to.
    ws_code_to_raw_field_map = dict()
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
        if plan.ws_code is not None:
            ws_code_to_raw_field_map[plan.ws_code.code_id] = plan.raw_field

    # Group the TracedData by uid.
    data_grouped_by_uid = dict()
    for td in data:
        uid = td["uid"]
        if uid not in data_grouped_by_uid:
            data_grouped_by_uid[uid] = []
        data_grouped_by_uid[uid].append(td)

    # Perform the WS correction for each uid.
    log.info("Performing WS correction...")
    corrected_data = []  # List of TracedData with the WS data moved.
    # 'WS - Correct Dataset' codes with no matching code id in any coding plan for this project,
    # with a count of the occurrences.
    unknown_target_code_counts = dict()
    for group in data_grouped_by_uid.values():
        # Find all the survey data being moved.
        # (Note: we only need to check one td in this group because all the demographics are the same)
        td = group[0]
        survey_moves = dict()  # of source_field -> target_field
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field not in td or plan.coda_filename is None:
                continue

            ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
            if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                if ws_code.code_id in ws_code_to_raw_field_map:
                    survey_moves[plan.raw_field] = ws_code_to_raw_field_map[ws_code.code_id]
                else:
                    if (ws_code.code_id, ws_code.display_text) not in unknown_target_code_counts:
                        unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] = 0
                    unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] += 1
                    survey_moves[plan.raw_field] = None

        # Find all the RQA data being moved.
        rqa_moves = dict()  # of (index in group, source_field) -> target_field
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td or plan.coda_filename is None:
                    continue

                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                    if ws_code.code_id in ws_code_to_raw_field_map:
                        rqa_moves[(i, plan.raw_field)] = ws_code_to_raw_field_map[ws_code.code_id]
                    else:
                        if (ws_code.code_id, ws_code.display_text) not in unknown_target_code_counts:
                            unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] = 0
                        unknown_target_code_counts[(ws_code.code_id, ws_code.display_text)] += 1
                        rqa_moves[(i, plan.raw_field)] = None

        # Build a dictionary of the survey fields that haven't been moved, and cleared fields for those which have.
        survey_updates = dict()  # of raw_field -> updated value
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.coda_filename is None:
                continue

            if plan.raw_field in survey_moves.keys():
                # Data is moving
                survey_updates[plan.raw_field] = []
            elif plan.raw_field in td:
                # Data is not moving
                survey_updates[plan.raw_field] = [
                    _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)]

        # Build a list of the RQA fields that haven't been moved.
        rqa_updates = []  # of (raw_field, _WSUpdate)
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.coda_filename is None:
                    continue

                if plan.raw_field in td:
                    if (i, plan.raw_field) in rqa_moves.keys():
                        # Data is moving
                        pass
                    else:
                        # Data is not moving
                        rqa_updates.append(
                            (plan.raw_field,
                             _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)))

        # Add data moving from survey fields to the relevant survey_/rqa_updates
        raw_survey_fields = {plan.raw_field for plan in PipelineConfiguration.SURVEY_CODING_PLANS}
        raw_rqa_fields = {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
            if plan.raw_field not in survey_moves:
                continue

            target_field = survey_moves[plan.raw_field]
            if target_field is None:
                continue

            update = _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field, td)
            if target_field in raw_survey_fields:
                survey_updates[target_field] = survey_updates.get(target_field, []) + [update]
            else:
                assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                rqa_updates.append((target_field, update))

        # Add data moving from RQA fields to the relevant survey_/rqa_updates
        for (i, source_field), target_field in rqa_moves.items():
            if target_field is None:
                continue

            for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field == source_field:
                    _td = group[i]
                    # Note: the source TracedData must be the group member the message came from (_td),
                    # not group[0], so that the corrected message is re-attached to the right run.
                    update = _WSUpdate(_td[plan.raw_field], _td[plan.time_field], plan.raw_field, _td)
                    if target_field in raw_survey_fields:
                        survey_updates[target_field] = survey_updates.get(target_field, []) + [update]
                    else:
                        assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                        rqa_updates.append((target_field, update))

        # Re-format the survey updates to a form suitable for use by the rest of the pipeline
        flattened_survey_updates = {}
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field in survey_updates:
                plan_updates = survey_updates[plan.raw_field]

                if len(plan_updates) > 0:
                    flattened_survey_updates[plan.raw_field] = "; ".join([u.message for u in plan_updates])
                    flattened_survey_updates[plan.time_field] = sorted([u.timestamp for u in plan_updates])[0]
                    flattened_survey_updates[f"{plan.raw_field}_source"] = "; ".join(
                        [u.source_field for u in plan_updates])
                else:
                    flattened_survey_updates[plan.raw_field] = None
                    flattened_survey_updates[plan.time_field] = None
                    flattened_survey_updates[f"{plan.raw_field}_source"] = None

        # For each RQA message, create a copy of its source td, append the updated survey data, and add this to
        # the list of TracedData to be returned
        raw_field_to_rqa_plan_map = {plan.raw_field: plan for plan in PipelineConfiguration.RQA_CODING_PLANS}
        for target_field, update in rqa_updates:
            corrected_td = update.source_td.copy()

            # Hide the survey keys currently in the TracedData which have had data moved away.
            corrected_td.hide_keys(
                {k for k, v in flattened_survey_updates.items() if v is None}.intersection(corrected_td.keys()),
                Metadata(user, Metadata.get_call_location(), time.time()))

            # Update with the corrected survey data
            corrected_td.append_data(
                {k: v for k, v in flattened_survey_updates.items() if v is not None},
                Metadata(user, Metadata.get_call_location(), time.time()))

            # Hide all the RQA fields (they will be added back, in turn, in the next step).
            corrected_td.hide_keys(
                {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(corrected_td.keys()),
                Metadata(user, Metadata.get_call_location(), time.time()))
            corrected_td.hide_keys(
                {plan.time_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(corrected_td.keys()),
                Metadata(user, Metadata.get_call_location(), time.time()))

            target_coding_plan = raw_field_to_rqa_plan_map[target_field]
            rqa_dict = {
                target_field: update.message,
                target_coding_plan.time_field: update.timestamp,
                f"{target_field}_source": update.source_field
            }
            corrected_td.append_data(rqa_dict, Metadata(user, Metadata.get_call_location(), time.time()))

            corrected_data.append(corrected_td)

    if len(unknown_target_code_counts) > 0:
        log.warning("Found the following 'WS - Correct Dataset' CodeIDs with no matching coding plan:")
        for (code_id, display_text), count in unknown_target_code_counts.items():
            log.warning(f"  '{code_id}' (DisplayText '{display_text}') ({count} occurrences)")

    return corrected_data
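# Illustrative sketch (not part of the pipeline): move_wrong_scheme_messages above constructs
# _WSUpdate objects and later reads .message, .timestamp, .source_field and .source_td from them.
# A plausible reconstruction of that helper, inferred purely from those usages, is below; the
# project's actual definition may differ.
from dataclasses import dataclass


@dataclass
class ExampleWSUpdate:
    message: str        # the raw text being kept or moved
    timestamp: str      # when the message was sent (ISO 8601 string)
    source_field: str   # the raw field the message originally arrived under
    source_td: object   # the TracedData the message came from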
def move_wrong_scheme_messages(user, data, coda_input_dir):
    log.info("Importing manually coded Coda files to '_WS' coded fields...")
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.DEMOGS_CODING_PLANS + \
            PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field + "_WS")

        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field + "_WS",
                {f"{plan.coded_field}_WS_correct_dataset": CodeSchemes.WS_CORRECT_DATASET}, f)

    for plan in PipelineConfiguration.RQA_CODING_PLANS:
        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                user, data, plan.id_field + "_WS", {f"{plan.coded_field}_WS": plan.code_scheme}, f)

            if plan.binary_code_scheme is not None:
                f.seek(0)
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, plan.id_field + "_WS",
                    {f"{plan.binary_coded_field}_WS": plan.binary_code_scheme}, f)

    for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                user, data, plan.id_field + "_WS", {f"{plan.coded_field}_WS": plan.code_scheme}, f)

    log.info("Checking for WS Coding Errors...")
    # Check for coding errors in the RQA datasets.
    for td in data:
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_codes = []
            for label in td.get(f"{plan.coded_field}_WS", []):
                rqa_codes.append(plan.code_scheme.get_code_with_id(label["CodeID"]))
            if plan.binary_code_scheme is not None and f"{plan.binary_coded_field}_WS" in td:
                label = td[f"{plan.binary_coded_field}_WS"]
                rqa_codes.append(plan.binary_code_scheme.get_code_with_id(label["CodeID"]))

            has_ws_code_in_code_scheme = False
            for code in rqa_codes:
                if code.control_code == Codes.WRONG_SCHEME:
                    has_ws_code_in_code_scheme = True

            has_ws_code_in_ws_scheme = False
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                has_ws_code_in_ws_scheme = CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                    td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"]).code_type == "Normal"

            if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                log.debug(f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}")
                coding_error_dict = {
                    f"{plan.coded_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            CodeSchemes.WS_CORRECT_DATASET,
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()
                }
                td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Check for coding errors in the demogs and follow-up surveys datasets, except location,
    # as this is handled differently below.
    for td in data:
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field == "location_raw":
                continue

            has_ws_code_in_code_scheme = False
            if f"{plan.coded_field}_WS" in td:
                has_ws_code_in_code_scheme = plan.code_scheme.get_code_with_id(
                    td[f"{plan.coded_field}_WS"]["CodeID"]).control_code == Codes.WRONG_SCHEME

            has_ws_code_in_ws_scheme = False
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                has_ws_code_in_ws_scheme = CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                    td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"]).code_type == "Normal"

            if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                log.debug(f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}")
                coding_error_dict = {
                    f"{plan.coded_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            CodeSchemes.WS_CORRECT_DATASET,
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location()
                        ).to_dict()
                }
                td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Check for coding errors in the locations dataset.
    for td in data:
        location_codes = []
        for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
            if f"{plan.coded_field}_WS" in td:
                label = td[f"{plan.coded_field}_WS"]
                location_codes.append(plan.code_scheme.get_code_with_id(label["CodeID"]))

        has_ws_code_in_code_scheme = False
        for code in location_codes:
            if code.control_code == Codes.WRONG_SCHEME:
                has_ws_code_in_code_scheme = True

        has_ws_code_in_ws_scheme = False
        for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
            if f"{plan.coded_field}_WS_correct_dataset" in td:
                if CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                        td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"]).code_type == "Normal":
                    has_ws_code_in_ws_scheme = True

        if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
            log.debug(f"Coding Error: location_raw: {td['location_raw']}")
            coding_error_dict = dict()
            for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
                coding_error_dict[f"{plan.coded_field}_WS_correct_dataset"] = \
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.WS_CORRECT_DATASET,
                        CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR),
                        Metadata.get_call_location()
                    ).to_dict()
            td.append_data(coding_error_dict, Metadata(user, Metadata.get_call_location(), time.time()))

    # Construct a map from WS normal code id to the raw field that code indicates a requested move to.
    ws_code_to_raw_field_map = dict()
    for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.DEMOGS_CODING_PLANS + \
            PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
        if plan.ws_code is not None:
            ws_code_to_raw_field_map[plan.ws_code.code_id] = plan.raw_field

    # Group the TracedData by uid.
    data_grouped_by_uid = dict()
    for td in data:
        uid = td["uid"]
        if uid not in data_grouped_by_uid:
            data_grouped_by_uid[uid] = []
        data_grouped_by_uid[uid].append(td)

    # Perform the WS correction for each uid.
    log.info("Performing WS correction...")
    corrected_data = []  # List of TracedData with the WS data moved.
    for group in data_grouped_by_uid.values():
        log.debug(f"\n\nStarting re-map for {group[0]['uid']}...")
        for i, td in enumerate(group):
            log.debug(f"--------------td[{i}]--------------")
            for _plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.DEMOGS_CODING_PLANS + \
                    PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                log.debug(f"{_plan.raw_field}: {td.get(_plan.raw_field)}")
                log.debug(f"{_plan.time_field}: {td.get(_plan.time_field)}")

        # Find all the demogs and follow-up survey data being moved.
        # (Note: we only need to check one td in this group because all the demographics and follow-up
        # surveys are the same)
        td = group[0]
        demogs_and_follow_up_survey_moves = dict()  # of source_field -> target_field
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field not in td:
                continue

            ws_code = CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"])
            if ws_code.code_type == "Normal":
                log.debug(f"Detected redirect from {plan.raw_field} -> "
                          f"{ws_code_to_raw_field_map.get(ws_code.code_id, ws_code.code_id)} "
                          f"for message {td[plan.raw_field]}")
                demogs_and_follow_up_survey_moves[plan.raw_field] = ws_code_to_raw_field_map[ws_code.code_id]

        # Find all the RQA data being moved.
        rqa_moves = dict()  # of (index in group, source_field) -> target_field
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td:
                    continue

                ws_code = CodeSchemes.WS_CORRECT_DATASET.get_code_with_id(
                    td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"])
                if ws_code.code_type == "Normal":
                    log.debug(f"Detected redirect from ({i}, {plan.raw_field}) -> "
                              f"{ws_code_to_raw_field_map.get(ws_code.code_id, ws_code.code_id)} "
                              f"for message {td[plan.raw_field]}")
                    rqa_moves[(i, plan.raw_field)] = ws_code_to_raw_field_map[ws_code.code_id]

        # Build a dictionary of the demogs and follow-up survey fields that haven't been moved,
        # and clear fields for those which have.
        demogs_and_follow_up_survey_updates = dict()  # of raw_field -> updated value
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field in demogs_and_follow_up_survey_moves.keys():
                # Data is moving
                demogs_and_follow_up_survey_updates[plan.raw_field] = []
            elif plan.raw_field in td:
                # Data is not moving
                demogs_and_follow_up_survey_updates[plan.raw_field] = [
                    _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field)]

        # Build a list of the RQA fields that haven't been moved.
        rqa_updates = []  # of (raw_field, _WSUpdate)
        for i, td in enumerate(group):
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field in td:
                    if (i, plan.raw_field) in rqa_moves.keys():
                        # Data is moving
                        pass
                    else:
                        # Data is not moving
                        rqa_updates.append(
                            (plan.raw_field, _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field)))

        raw_demog_and_follow_up_survey_fields = \
            {plan.raw_field for plan in
             PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS}
        raw_rqa_fields = {plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}

        # Add data moving from demogs and follow-up survey fields to the relevant demog/follow_up/rqa_updates
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS + \
                PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field not in demogs_and_follow_up_survey_moves:
                continue

            target_field = demogs_and_follow_up_survey_moves[plan.raw_field]
            update = _WSUpdate(td[plan.raw_field], td[plan.time_field], plan.raw_field)
            if target_field in raw_demog_and_follow_up_survey_fields:
                demogs_and_follow_up_survey_updates[target_field] = \
                    demogs_and_follow_up_survey_updates.get(target_field, []) + [update]
            else:
                assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                rqa_updates.append((target_field, update))

        # Add data moving from RQA fields to the relevant demog/follow_up/rqa_updates
        for (i, source_field), target_field in rqa_moves.items():
            for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS + \
                    PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                if plan.raw_field == source_field:
                    _td = group[i]
                    update = _WSUpdate(_td[plan.raw_field], _td[plan.time_field], plan.raw_field)
                    if target_field in raw_demog_and_follow_up_survey_fields:
                        demogs_and_follow_up_survey_updates[target_field] = \
                            demogs_and_follow_up_survey_updates.get(target_field, []) + [update]
                    else:
                        assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                        rqa_updates.append((target_field, update))

        # Re-format the demogs and follow-up survey updates to a form suitable for use by the rest of the pipeline
        flattened_demog_and_follow_up_survey_updates = {}
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            if plan.raw_field in demogs_and_follow_up_survey_updates:
                plan_updates = demogs_and_follow_up_survey_updates[plan.raw_field]

                if len(plan_updates) > 0:
                    flattened_demog_and_follow_up_survey_updates[plan.raw_field] = \
                        "; ".join([u.message for u in plan_updates])
                    flattened_demog_and_follow_up_survey_updates[plan.time_field] = \
                        sorted([u.sent_on for u in plan_updates])[0]
                    flattened_demog_and_follow_up_survey_updates[f"{plan.raw_field}_source"] = \
                        "; ".join([u.source for u in plan_updates])
                else:
                    flattened_demog_and_follow_up_survey_updates[plan.raw_field] = None
                    flattened_demog_and_follow_up_survey_updates[plan.time_field] = None
                    flattened_demog_and_follow_up_survey_updates[f"{plan.raw_field}_source"] = None

        # Hide the demogs and follow-up survey keys currently in the TracedData which have had data moved away.
        td.hide_keys({k for k, v in flattened_demog_and_follow_up_survey_updates.items() if v is None}
                     .intersection(td.keys()),
                     Metadata(user, Metadata.get_call_location(), time.time()))

        # Update with the corrected demogs + follow-up data
        td.append_data({k: v for k, v in flattened_demog_and_follow_up_survey_updates.items() if v is not None},
                       Metadata(user, Metadata.get_call_location(), time.time()))

        # Hide all the RQA fields (they will be added back, in turn, in the next step).
        td.hide_keys({plan.raw_field for plan in PipelineConfiguration.RQA_CODING_PLANS}.intersection(td.keys()),
                     Metadata(user, Metadata.get_call_location(), time.time()))

        # For each RQA message, create a copy of this td, append the RQA message, and add this to the
        # list of TracedData.
        for target_field, update in rqa_updates:
            rqa_dict = {
                target_field: update.message,
                "sent_on": update.sent_on,
                f"{target_field}_source": update.source
            }

            corrected_td = td.copy()
            corrected_td.append_data(rqa_dict, Metadata(user, Metadata.get_call_location(), time.time()))
            corrected_data.append(corrected_td)

            log.debug("----------Created TracedData--------------")
            for _plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.DEMOGS_CODING_PLANS + \
                    PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                log.debug(f"{_plan.raw_field}: {corrected_td.get(_plan.raw_field)}")
                log.debug(f"{_plan.time_field}: {corrected_td.get(_plan.time_field)}")

    return corrected_data
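# Illustrative sketch (not part of the pipeline): to make the fan-out at the end of
# move_wrong_scheme_messages concrete, after the demog/follow-up updates are applied to the
# group's base TracedData, one corrected record is emitted per remaining RQA message. Plain
# dicts stand in for TracedData here and all field names and values are made up.
group_base = {"uid": "example-uid-1", "gender_raw": "female"}
example_rqa_updates = [
    ("rqa_s01e01_raw", "water is the main problem"),
    ("rqa_s01e02_raw", "no clinics nearby"),
]

corrected = []
for target_field, message in example_rqa_updates:
    record = dict(group_base)       # copy the shared (corrected) survey data
    record[target_field] = message  # attach exactly one RQA message
    corrected.append(record)

# 'corrected' now holds one record per RQA message, each carrying the participant's
# (possibly WS-corrected) survey answers alongside a single radio-show response.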