Exemplos de CleaningUtils em Python, exemplos de core_data_modules.cleaners.cleaning_utils.CleaningUtils em Python

Exemplo n.º 1

0

Exibir arquivo

    def auto_code_surveys(cls, user, data, coda_output_dir):
        # Auto-code surveys
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.cleaner is not None:
                    CleaningUtils.apply_cleaner_to_traced_data_iterable(
                        user, data, plan.raw_field, cc.coded_field, cc.cleaner,
                        cc.code_scheme)

        # Output single-scheme answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   plan.id_field)

            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field, {
                        cc.coded_field: cc.code_scheme
                        for cc in plan.coding_configurations
                    }, f)

        # Note: no need to handle location in any special way on this project because it is not being auto-coded

        return data

Exemplo n.º 2

0

Exibir arquivo

    def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
        # Label missing data
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
                if td.get(plan.raw_field, "") == "":
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.coded_field] = na_label.to_dict()
            td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

        # Auto-code remaining data
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            if plan.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(user, data, plan.raw_field, plan.coded_field,
                                                                    plan.cleaner, plan.code_scheme)

        # Output survey answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)
            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, 'w') as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field, {plan.coded_field: plan.code_scheme}, f
                )
        print("Coda demogs files successfully exported")

        return data

Exemplo n.º 3

0

Exibir arquivo

    def _impute_coding_error_codes(user, data):
        for td in data:
            coding_error_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                if f"{plan.raw_field}_WS_correct_dataset" in td:
                    if td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"] == \
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                        for cc in plan.coding_configurations:
                            if cc.coding_mode == CodingModes.SINGLE:
                                coding_error_dict[cc.coded_field] = \
                                    CleaningUtils.make_label_from_cleaner_code(
                                        cc.code_scheme,
                                        cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                        Metadata.get_call_location()
                                    ).to_dict()
                            else:
                                assert cc.coding_mode == CodingModes.MULTIPLE
                                coding_error_dict[cc.coded_field] = [
                                    CleaningUtils.make_label_from_cleaner_code(
                                        cc.code_scheme,
                                        cc.code_scheme.
                                        get_code_with_control_code(
                                            Codes.CODING_ERROR),
                                        Metadata.get_call_location()).to_dict(
                                        )
                                ]

            td.append_data(
                coding_error_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

Exemplo n.º 4

0

Exibir arquivo

Arquivo: auto_code.py Projeto: AfricasVoices/Project-UNICEF-COVID19-SOM

 def run_cleaners(cls, user, data):
     for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
         for cc in plan.coding_configurations:
             if cc.cleaner is not None:
                 CleaningUtils.apply_cleaner_to_traced_data_iterable(
                     user, data, plan.raw_field, cc.coded_field, cc.cleaner,
                     cc.code_scheme)

Exemplo n.º 5

0

Exibir arquivo

Arquivo: fetch_raw_data.py Projeto: AfricasVoices/Project-OCHA

def label_somalia_operator(user, traced_runs, phone_number_uuid_table):
    # Set the operator codes for each message.
    uuids = {td["avf_phone_id"] for td in traced_runs}
    uuid_to_phone_lut = phone_number_uuid_table.uuid_to_data_batch(uuids)
    for td in traced_runs:
        operator_raw = uuid_to_phone_lut[td["avf_phone_id"]][:5]  # Returns the country code 252 and the next two digits

        operator_code = PhoneCleaner.clean_operator(operator_raw)
        if operator_code == Codes.NOT_CODED:
            operator_label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.SOMALIA_OPERATOR,
                CodeSchemes.SOMALIA_OPERATOR.get_code_with_control_code(Codes.NOT_CODED),
                Metadata.get_call_location()
            )
        else:
            operator_label = CleaningUtils.make_label_from_cleaner_code(
                CodeSchemes.SOMALIA_OPERATOR,
                CodeSchemes.SOMALIA_OPERATOR.get_code_with_match_value(operator_code),
                Metadata.get_call_location()
            )

        td.append_data({
            "operator_raw": operator_raw,
            "operator_coded": operator_label.to_dict()
        }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

Exemplo n.º 6

0

Exibir arquivo

Arquivo: code_imputation_functions.py Projeto: AfricasVoices/Project-AHADI

def impute_kenya_location_codes(user, data, location_configurations):
    for td in data:
        # Up to 1 location code should have been assigned in Coda. Search for that code,
        # ensuring that only 1 has been assigned or, if multiple have been assigned, that they are non-conflicting
        # control codes
        location_code = None

        for cc in location_configurations:
            coda_code = cc.code_scheme.get_code_with_id(
                td[cc.coded_field]["CodeID"])
            if location_code is not None:
                if not (coda_code.code_id == location_code.code_id
                        or coda_code.control_code == Codes.NOT_REVIEWED):
                    location_code = CodeSchemes.CONSTITUENCY.get_code_with_control_code(
                        Codes.CODING_ERROR)
            elif coda_code.control_code != Codes.NOT_REVIEWED:
                location_code = coda_code

        # If no code was found, then this location is still not reviewed.
        # Synthesise a NOT_REVIEWED code accordingly.
        if location_code is None:
            location_code = CodeSchemes.CONSTITUENCY.get_code_with_control_code(
                Codes.NOT_REVIEWED)

        # If a control code was found, set all other location keys to that control code,
        # otherwise convert the provided location to the other locations in the hierarchy.
        if location_code.code_type == "Control":
            for cc in location_configurations:
                td.append_data(
                    {
                        cc.coded_field:
                        CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(
                                location_code.control_code),
                            Metadata.get_call_location()).to_dict()
                    }, Metadata(user, Metadata.get_call_location(),
                                time.time()))
        else:
            location = location_code.match_values[0]
            td.append_data(
                {
                    "county_coded":
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.COUNTY,
                        make_location_code(
                            CodeSchemes.COUNTY,
                            KenyaLocations.county_for_location_code(location)),
                        Metadata.get_call_location()).to_dict(),
                    "constituency_coded":
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.CONSTITUENCY,
                        make_location_code(
                            CodeSchemes.CONSTITUENCY,
                            KenyaLocations.constituency_for_location_code(
                                location)),
                        Metadata.get_call_location()).to_dict()
                }, Metadata(user, Metadata.get_call_location(), time.time()))

Exemplo n.º 7

0

Exibir arquivo

Arquivo: coding_plans.py Projeto: AfricasVoices/Project-COVID19-KE-Urban

def fold_source(x, y):
    if x["CodeID"] == y["CodeID"]:
        return x
    else:
        return CleaningUtils.make_label_from_cleaner_code(
            CodeSchemes.SOURCE,
            CodeSchemes.SOURCE.get_code_with_match_value("both"),
            Metadata.get_call_location()).to_dict()

Exemplo n.º 8

0

Exibir arquivo

Arquivo: test_io.py Projeto: AfricasVoices/CoreDataModules

        def make_location_label(scheme, value):
            if value in {Codes.TRUE_MISSING, Codes.SKIPPED, Codes.NOT_CODED}:
                code = scheme.get_code_with_control_code(value)
            else:
                code = scheme.get_code_with_match_value(value)

            return CleaningUtils.make_label_from_cleaner_code(scheme, code, "make_location_label",
                                                              date_time_utc="2018-11-02T13:40:50Z").to_dict()

Exemplo n.º 9

0

Exibir arquivo

Arquivo: auto_code_surveys.py Projeto: AfricasVoices/Project-CAPYEI

    def auto_code_surveys(cls, user, data, icr_output_dir, coda_output_dir):
        # Auto-code surveys
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, plan.coded_field, plan.cleaner,
                    plan.code_scheme)

        # Output single-scheme answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:

            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   plan.id_field)

            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field,
                    {plan.coded_field: plan.code_scheme}, f)

        # Output messages for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            rqa_messages = []
            for td in data:
                if plan.raw_field in td:
                    rqa_messages.append(td)

            icr_messages = ICRTools.generate_sample_for_icr(
                rqa_messages, cls.ICR_MESSAGES_COUNT,
                random.Random(cls.ICR_SEED))

            icr_output_path = path.join(icr_output_dir, plan.icr_filename)
            with open(icr_output_path, "w") as f:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    icr_messages,
                    f,
                    headers=[plan.run_id_field, plan.raw_field])

        return data

Exemplo n.º 10

0

Exibir arquivo

Arquivo: apply_manual_codes.py Projeto: AfricasVoices/Project-ICRAF

    def _impute_coding_error_codes(user, data):
        for td in data:
            coding_error_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if f"{plan.coded_field}_WS_correct_dataset" in td:
                    if td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"] == \
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                        coding_error_dict[plan.coded_field] = [
                            CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme,
                                plan.code_scheme.get_code_with_control_code(
                                    Codes.CODING_ERROR),
                                Metadata.get_call_location()).to_dict()
                        ]
                        if plan.binary_code_scheme is not None:
                            coding_error_dict[plan.binary_coded_field] = \
                                CleaningUtils.make_label_from_cleaner_code(
                                    plan.binary_code_scheme,
                                    plan.binary_code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                    Metadata.get_call_location()
                                ).to_dict()

            for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                if f"{plan.coded_field}_WS_correct_dataset" in td:
                    if td[f"{plan.coded_field}_WS_correct_dataset"]["CodeID"] == \
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR).code_id:
                        coding_error_dict[plan.coded_field] = \
                            CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme,
                                plan.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                Metadata.get_call_location()
                            ).to_dict()

            td.append_data(
                coding_error_dict,
                Metadata(user, Metadata.get_call_location(),
                         TimeUtils.utc_now_as_iso_string()))

Exemplo n.º 11

0

Exibir arquivo

def impute_age_category(user, data, age_configurations):
    # TODO: By accepting a list of age_configurations but then requiring that list to contain code schemes in a
    #       certain order, it looks like we're providing more flexibility than we actually do. We should change this
    #       to explicitly accept age and age_category configurations, which requires refactoring all of the
    #       code imputation functions.
    age_cc = age_configurations[0]
    age_category_cc = age_configurations[1]

    age_categories = {
        (10, 14): "10 to 14",
        (15, 17): "15 to 17",
        (18, 35): "18 to 35",
        (36, 54): "36 to 54",
        (55, 99): "55 to 99"
    }

    for td in data:
        age_label = td[age_cc.coded_field]
        age_code = age_cc.code_scheme.get_code_with_code_id(
            age_label["CodeID"])

        if age_code.code_type == CodeTypes.NORMAL:
            # TODO: If these age categories are standard across projects, move this to Core as a new cleaner.
            age_category = None
            for age_range, category in age_categories.items():
                if age_range[0] <= age_code.numeric_value <= age_range[1]:
                    age_category = category
            assert age_category is not None

            age_category_code = age_category_cc.code_scheme.get_code_with_match_value(
                age_category)
        elif age_code.code_type == CodeTypes.META:
            age_category_code = age_category_cc.code_scheme.get_code_with_meta_code(
                age_code.meta_code)
        else:
            assert age_code.code_type == CodeTypes.CONTROL
            age_category_code = age_category_cc.code_scheme.get_code_with_control_code(
                age_code.control_code)

        age_category_label = CleaningUtils.make_label_from_cleaner_code(
            age_category_cc.code_scheme, age_category_code,
            Metadata.get_call_location())

        td.append_data(
            {age_category_cc.coded_field: age_category_label.to_dict()},
            Metadata(user, Metadata.get_call_location(), time.time()))

Exemplo n.º 12

0

Exibir arquivo

Arquivo: auto_code_surveys.py Projeto: AfricasVoices/Project-UNDP-RCO

    def auto_code_surveys(cls, user, data, pipeline_configuration,
                          coda_output_dir):
        # Auto-code surveys
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.cleaner is not None:
                    CleaningUtils.apply_cleaner_to_traced_data_iterable(
                        user, data, plan.raw_field, cc.coded_field, cc.cleaner,
                        cc.code_scheme)

        # Remove survey data sent after the project finished
        log.info(
            "Hiding survey messages sent after the end of the project. These will not be exported in "
            "production/analysis files")
        out_of_range_count = 0
        for td in data:
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                # TODO: Come up with a better solution here e.g. separate DEMOG/SURVEY lists
                if plan.raw_field in ["have_voice_raw", "suggestions_raw"]:
                    continue

                if plan.time_field in td and isoparse(
                        td[plan.time_field]
                ) > pipeline_configuration.project_end_date:
                    out_of_range_count += 1
                    td.hide_keys({plan.raw_field, plan.time_field},
                                 Metadata(user, Metadata.get_call_location(),
                                          time.time()))
        log.info(
            f"Hid {out_of_range_count} survey messages sent after the end of the project"
        )

        # For any locations where the cleaners assigned a code to a sub district, set the district code to NC
        # (this is because only one column should have a value set in Coda)
        for td in data:
            if "mogadishu_sub_district_coded" in td:
                mogadishu_code_id = td["mogadishu_sub_district_coded"][
                    "CodeID"]
                if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(
                        mogadishu_code_id).code_type == "Normal":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.MOGADISHU_SUB_DISTRICT,
                        CodeSchemes.MOGADISHU_SUB_DISTRICT.
                        get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location(),
                    )
                    td.append_data({"district_coded": nc_label.to_dict()},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

        # Output survey responses to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   plan.id_field)

            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field, {
                        cc.coded_field: cc.code_scheme
                        for cc in plan.coding_configurations
                    }, f)

        return data

Exemplo n.º 13

0

Exibir arquivo

    def apply_manual_codes(cls, user, data, coda_input_dir):
        # Merge manually coded radio show files into the cleaned dataset
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_messages = [td for td in data if plan.raw_field in td]
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)

            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                    user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

                if plan.binary_code_scheme is not None:
                    if f is not None:
                        f.seek(0)
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, rqa_messages, plan.id_field, {plan.binary_coded_field: plan.binary_code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        # Mark data that is noise as Codes.NOT_CODED
        for td in data:
            if td["noise"]:
                nc_dict = dict()
                for plan in PipelineConfiguration.RQA_CODING_PLANS:
                    if plan.coded_field not in td:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                            Metadata.get_call_location()
                        )
                        nc_dict[plan.coded_field] = [nc_label.to_dict()]

                        if plan.binary_code_scheme is not None:
                            nc_label = CleaningUtils.make_label_from_cleaner_code(
                                plan.binary_code_scheme, plan.binary_code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                                Metadata.get_call_location()
                            )
                            nc_dict[plan.binary_coded_field] = nc_label.to_dict()

                td.append_data(nc_dict, Metadata(user, Metadata.get_call_location(), time.time()))

        # Merge manually coded survey files into the cleaned dataset
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            f = None
            try:
                coda_input_path = path.join(coda_input_dir, plan.coda_filename)
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, plan.id_field, {plan.coded_field: plan.code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        # Set district/region/state/zone codes from the coded district field.
        for td in data:
            # Up to 1 location code should have been assigned in Coda. Search for that code,
            # ensuring that only 1 has been assigned or, if multiple have been assigned, that they are non-conflicting
            # control codes
            location_code = None

            for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
                coda_code = plan.code_scheme.get_code_with_id(td[plan.coded_field]["CodeID"])
                if location_code is not None:
                    if not (coda_code.code_id == location_code.code_id or coda_code.control_code == Codes.NOT_REVIEWED):
                        location_code = Code(None, "Control", None, None, None, None, control_code=Codes.NOT_INTERNALLY_CONSISTENT)
                elif coda_code.control_code != Codes.NOT_REVIEWED:
                    location_code = coda_code

            # If no code was found, then this location is still not reviewed.
            # Synthesise a NOT_REVIEWED code accordingly.
            if location_code is None:
                location_code = Code(None, "Control", None, None, None, None, control_code=Codes.NOT_REVIEWED)

            # If a control code was found, set all other location keys to that control code,
            # otherwise convert the provided location to the other locations in the hierarchy.
            if location_code.code_type == "Control":
                for plan in PipelineConfiguration.LOCATION_CODING_PLANS:
                    td.append_data({
                        plan.coded_field: CleaningUtils.make_label_from_cleaner_code(
                            plan.code_scheme,
                            plan.code_scheme.get_code_with_control_code(location_code.control_code),
                            Metadata.get_call_location()
                        ).to_dict()
                    }, Metadata(user, Metadata.get_call_location(), time.time()))
            else:
                location = location_code.match_values[0]

                td.append_data({
                    "mogadishu_sub_district_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.MOGADISHU_SUB_DISTRICT,
                        cls.make_location_code(CodeSchemes.MOGADISHU_SUB_DISTRICT,
                                               SomaliaLocations.mogadishu_sub_district_for_location_code(location)),
                        Metadata.get_call_location()).to_dict(),
                    "district_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.DISTRICT,
                        cls.make_location_code(CodeSchemes.DISTRICT,
                                               SomaliaLocations.district_for_location_code(location)),
                        Metadata.get_call_location()).to_dict(),
                    "region_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.REGION,
                        cls.make_location_code(CodeSchemes.REGION,
                                               SomaliaLocations.region_for_location_code(location)),
                        Metadata.get_call_location()).to_dict(),
                    "state_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.STATE,
                        cls.make_location_code(CodeSchemes.STATE,
                                               SomaliaLocations.state_for_location_code(location)),
                        Metadata.get_call_location()).to_dict(),
                    "zone_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.ZONE,
                        cls.make_location_code(CodeSchemes.ZONE,
                                               SomaliaLocations.zone_for_location_code(location)),
                        Metadata.get_call_location()).to_dict()
                }, Metadata(user, Metadata.get_call_location(), time.time()))

        return data

Exemplo n.º 14

0

Exibir arquivo

Arquivo: test_io.py Projeto: AfricasVoices/CoreDataModules

    def test_export_import_one_single_coded_scheme(self):
        file_path = path.join(self.test_dir, "coda_2_test.json")

        # Build raw input data
        message_dicts = [
            {"gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00"},
            {"gender_raw": "", "gender_sent_on": "2018-11-01T07:17:04+03:00"},
            {"gender_raw": "hiya", "gender_sent_on": "2018-11-01T07:19:04+05:00"},
            {},
            {"gender_raw": "boy", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
            {"gender_raw": "man", "gender_sent_on": "2018-11-02T19:00:29+03:00"},
        ]
        messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                    for i, d in enumerate(message_dicts)]

        # Add message ids
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

        # Load gender scheme
        with open("tests/traced_data/resources/coda_2_gender_scheme.json") as f:
            gender_scheme = CodeScheme.from_firebase_map(json.load(f))

        # Apply the English gender cleaner
        with mock.patch("core_data_modules.util.TimeUtils.utc_now_as_iso_string") as time_mock, \
                mock.patch("core_data_modules.traced_data.Metadata.get_function_location") as location_mock:
            time_mock.return_value = "2018-11-02T15:00:07+00:00"
            location_mock.return_value = "english.DemographicCleaner.clean_gender"

            CleaningUtils.apply_cleaner_to_traced_data_iterable(
                "test_user", messages, "gender_raw", "gender_coded",
                english.DemographicCleaner.clean_gender, gender_scheme
            )

        # Export to a Coda 2 messages file
        with open(file_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)

        self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_one_scheme.json"))

        # Test importing with no file available
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})
        # Deliberately testing the read can be done twice
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
            "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme})

        na_id = gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
        nr_id = gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                gender_scheme,
                gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("gender_raw", "") == "":
                td.append_data({"gender_coded": na_label.to_dict()},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]

        self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

        # Test importing from the test file
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        with open("tests/traced_data/resources/coda_2_import_test_one_scheme.json", "r") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                "test_user", imported_messages, "gender_coda_id", {"gender_coded": gender_scheme}, f)

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                gender_scheme,
                gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("gender_raw", "") == "":
                td.append_data({"gender_coded": na_label.to_dict()},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        imported_code_ids = [td["gender_coded"]["CodeID"] for td in imported_messages]

        expected_code_ids = [
            gender_scheme.get_code_with_match_value("female").code_id,  # Manually approved auto-code
            gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # Empty raw message
            gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually assigned code which isn't checked
            gender_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id,  # No raw message
            gender_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id,  # Manually Not Coded
            gender_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id,  # Manually un-coded
        ]
        self.assertListEqual(imported_code_ids, expected_code_ids)

        # Add an element with the same raw text but a conflicting
        messages.append(TracedData({
            "gender_raw": "woman", "gender_sent_on": "2018-11-01T07:13:04+03:00",
            "gender_coded": CleaningUtils.make_label_from_cleaner_code(
                gender_scheme, gender_scheme.get_code_with_match_value("male"), "make_location_label",
                date_time_utc="2018-11-03T13:40:50Z").to_dict()
        }, Metadata("test_user", Metadata.get_call_location(), time.time())))
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "gender_raw", "gender_coda_id")

        with open(file_path, "w") as f:
            try:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    messages, "gender_raw", "gender_sent_on", "gender_coda_id", {"gender_coded": gender_scheme}, f)
            except AssertionError as e:
                assert str(e) == "Messages with the same id " \
                                 "(cf2e5bff1ef03dcd20d1a0b18ef7d89fc80a3554434165753672f6f40fde1d25) have different " \
                                 "labels for coded_key 'gender_coded'"
                return
            self.fail("Exporting data with conflicting labels did not fail")

Exemplo n.º 15

0

Exibir arquivo

    def auto_code_surveys(cls, user, data, phone_uuid_table, coda_output_dir):
        # Label missing data
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if td.get(plan.raw_field, "") == "":
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.TRUE_MISSING), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = na_label.to_dict()
            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Auto-code remaining data
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.cleaner is not None:
                CleaningUtils.apply_cleaner_to_traced_data_iterable(
                    user, data, plan.raw_field, plan.coded_field, plan.cleaner,
                    plan.code_scheme)

        # For any locations where the cleaners assigned a code to a sub district, set the district code to NC
        # (this is because only one column should have a value set in Coda)
        for td in data:
            if "mogadishu_sub_district_coded" in td:
                mogadishu_code_id = td["mogadishu_sub_district_coded"][
                    "CodeID"]
                if CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_id(
                        mogadishu_code_id).code_type == "Normal":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.MOGADISHU_SUB_DISTRICT,
                        CodeSchemes.MOGADISHU_SUB_DISTRICT.
                        get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location(),
                    )
                    td.append_data({"district_coded": nc_label.to_dict()},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

        # Set operator from phone number
        for td in data:
            operator_clean = PhoneCleaner.clean_operator(
                phone_uuid_table.get_phone(td["uid"]))
            if operator_clean == Codes.NOT_CODED:
                label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.OPERATOR,
                    CodeSchemes.OPERATOR.get_code_with_control_code(
                        Codes.NOT_CODED), Metadata.get_call_location())
            else:
                label = CleaningUtils.make_label_from_cleaner_code(
                    CodeSchemes.OPERATOR,
                    CodeSchemes.OPERATOR.get_code_with_match_value(
                        operator_clean), Metadata.get_call_location())
            td.append_data({"operator_coded": label.to_dict()},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Output single-scheme answers to coda for manual verification + coding
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.raw_field == "mogadishu_sub_district_raw":
                continue

            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   plan.id_field)

            coda_output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(coda_output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, plan.time_field, plan.id_field,
                    {plan.coded_field: plan.code_scheme}, f)

        # Output location scheme to coda for manual verification + coding
        output_path = path.join(coda_output_dir, "location.json")
        TracedDataCodaV2IO.compute_message_ids(
            user, data, "mogadishu_sub_district_raw",
            "mogadishu_sub_district_raw_id")
        with open(output_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                data, "mogadishu_sub_district_raw",
                "mogadishu_sub_district_time", "mogadishu_sub_district_raw_id",
                {
                    "mogadishu_sub_district_coded":
                    CodeSchemes.MOGADISHU_SUB_DISTRICT,
                    "district_coded": CodeSchemes.DISTRICT,
                    "region_coded": CodeSchemes.REGION,
                    "state_coded": CodeSchemes.STATE,
                    "zone_coded": CodeSchemes.ZONE
                }, f)

        return data

Exemplo n.º 16

0

Exibir arquivo

Arquivo: fetch_raw_data.py Projeto: AfricasVoices/Project-UNDP-RCO

        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table,
            pipeline_configuration.rapid_pro_test_contact_uuids)

        # Set the operator codes for each message.
        if flow in pipeline_configuration.activation_flow_names:
            uuids = {td["avf_phone_id"] for td in traced_runs}
            uuid_to_phone_lut = phone_number_uuid_table.uuid_to_data_batch(
                uuids)
            for td in traced_runs:
                operator_code = PhoneCleaner.clean_operator(
                    uuid_to_phone_lut[td["avf_phone_id"]])
                if operator_code == Codes.NOT_CODED:
                    operator_label = CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOMALIA_OPERATOR,
                        CodeSchemes.SOMALIA_OPERATOR.
                        get_code_with_control_code(Codes.NOT_CODED),
                        Metadata.get_call_location())
                else:
                    operator_label = CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOMALIA_OPERATOR,
                        CodeSchemes.SOMALIA_OPERATOR.get_code_with_match_value(
                            operator_code), Metadata.get_call_location())
                td.append_data({"operator_coded": operator_label.to_dict()},
                               Metadata(user, Metadata.get_call_location(),
                                        TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

Exemplo n.º 17

0

Exibir arquivo

Arquivo: code_imputation_functions.py Projeto: AfricasVoices/Project-UNDP-RCO

def impute_yes_no_reasons_codes(user, data, coding_configurations):
    # Synchronise the control codes between the binary and reasons schemes:
    # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
    # if there is an additional reason given. Importing those two schemes separately above caused the labels in
    # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
    # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
    # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
    binary_configuration = coding_configurations[0]
    reasons_configuration = coding_configurations[1]

    # TODO: Switch to using CodingModes.SINGLE/MULTIPLE once configuration is being set in configuration json
    #       rather than in pipeline_configuration.py
    assert binary_configuration.coding_mode == "SINGLE"
    assert reasons_configuration.coding_mode == "MULTIPLE"

    for td in data:
        binary_label = td[binary_configuration.coded_field]
        binary_code = binary_configuration.code_scheme.get_code_with_id(
            binary_label["CodeID"])

        binary_label_present = \
            binary_label["CodeID"] != binary_configuration.code_scheme.get_code_with_control_code(
                Codes.NOT_REVIEWED).code_id

        reasons_label_present = \
            len(td[reasons_configuration.coded_field]) > 1 or \
            td[reasons_configuration.coded_field][0][
                "CodeID"] != reasons_configuration.code_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        if binary_label_present and not reasons_label_present:
            if binary_code.code_type == "Control":
                control_code = binary_code.control_code
                reasons_code = reasons_configuration.code_scheme.get_code_with_control_code(
                    control_code)

                reasons_label = CleaningUtils.make_label_from_cleaner_code(
                    reasons_configuration.code_scheme,
                    reasons_code,
                    Metadata.get_call_location(),
                    origin_name="Pipeline Code Synchronisation")

                td.append_data(
                    {
                        reasons_configuration.coded_field:
                        [reasons_label.to_dict()]
                    },
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string()))
            else:
                assert binary_code.code_type == "Normal"

                nc_label = CleaningUtils.make_label_from_cleaner_code(
                    reasons_configuration.code_scheme,
                    reasons_configuration.code_scheme.
                    get_code_with_control_code(Codes.NOT_CODED),
                    Metadata.get_call_location(),
                    origin_name="Pipeline Code Synchronisation")
                td.append_data(
                    {reasons_configuration.coded_field: [nc_label.to_dict()]},
                    Metadata(user, Metadata.get_call_location(),
                             TimeUtils.utc_now_as_iso_string()))

Exemplo n.º 18

0

Exibir arquivo

    def apply_manual_codes(cls, user, data, coda_input_dir):
        # Merge manually coded data into the cleaned dataset
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.coda_filename is None:
                continue

            coda_input_path = path.join(coda_input_dir, plan.coda_filename)

            for cc in plan.coding_configurations:
                if not cc.requires_manual_verification:
                    continue

                f = None
                try:
                    if path.exists(coda_input_path):
                        f = open(coda_input_path, "r")

                    if cc.coding_mode == CodingModes.SINGLE:
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                            user, data, plan.id_field,
                            {cc.coded_field: cc.code_scheme}, f)
                    else:
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                            user, data, plan.id_field,
                            {cc.coded_field: cc.code_scheme}, f)
                finally:
                    if f is not None:
                        f.close()

            if PipelineConfiguration.WS_CORRECT_DATASET_SCHEME is not None:
                f = None
                try:
                    if path.exists(coda_input_path):
                        f = open(coda_input_path, "r")

                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, data, plan.id_field, {
                            f"{plan.raw_field}_correct_dataset":
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME
                        }, f)
                finally:
                    if f is not None:
                        f.close()

        # Label data for which there is no response as TRUE_MISSING.
        # Label data for which the response is the empty string as NOT_CODED.
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                for cc in plan.coding_configurations:
                    raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                    if raw_field not in td:
                        na_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(
                                Codes.TRUE_MISSING),
                            Metadata.get_call_location()).to_dict()
                        missing_dict[
                            cc.
                            coded_field] = na_label if cc.coding_mode == CodingModes.SINGLE else [
                                na_label
                            ]
                for cc in plan.coding_configurations:
                    raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                    if td.get(raw_field) == "":
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(
                                Codes.NOT_CODED),
                            Metadata.get_call_location()).to_dict()
                        missing_dict[
                            cc.
                            coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [
                                nc_label
                            ]
            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Mark data that is noise as Codes.NOT_CODED
        for td in data:
            if td.get("noise", False):
                nc_dict = dict()
                for plan in PipelineConfiguration.RQA_CODING_PLANS:
                    for cc in plan.coding_configurations:
                        if cc.coded_field not in td:
                            nc_label = CleaningUtils.make_label_from_cleaner_code(
                                cc.code_scheme,
                                cc.code_scheme.get_code_with_control_code(
                                    Codes.NOT_CODED),
                                Metadata.get_call_location()).to_dict()
                            nc_dict[
                                cc.
                                coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [
                                    nc_label
                                ]
                td.append_data(
                    nc_dict,
                    Metadata(user, Metadata.get_call_location(), time.time()))

        # Run the cleaners that don't require manual verification again, this time setting "checked" to True
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            for cc in plan.coding_configurations:
                if cc.cleaner is not None and not cc.requires_manual_verification:
                    raw_field = cc.raw_field if cc.raw_field is not None else plan.raw_field
                    CleaningUtils.apply_cleaner_to_traced_data_iterable(
                        user,
                        data,
                        raw_field,
                        cc.coded_field,
                        cc.cleaner,
                        cc.code_scheme,
                        set_checked=True)

        # Run code imputation functions
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.code_imputation_function is not None:
                plan.code_imputation_function(user, data,
                                              plan.coding_configurations)

        cls._impute_coding_error_codes(user, data)

        return data

Exemplo n.º 19

0

Exibir arquivo

Arquivo: test_io.py Projeto: AfricasVoices/CoreDataModules

    def test_export_import_one_multi_coded_scheme(self):
        file_path = path.join(self.test_dir, "coda_2_test.json")

        # Build raw input data
        message_dicts = [
            {"msg_raw": "food", "msg_sent_on": "2018-11-01T07:13:04+03:00"},
            {"msg_raw": "", "msg_sent_on": "2018-11-01T07:17:04+03:00"},
            {"msg_raw": "food + water", "msg_sent_on": "2018-11-01T07:19:04+05:00"},
            {},
            {"msg_raw": "water", "msg_sent_on": "2018-11-02T19:00:29+03:00"},
            {"msg_raw": "abcd", "msg_sent_on": "2018-11-02T20:30:45+03:00"}
        ]
        messages = [TracedData(d, Metadata("test_user", Metadata.get_call_location(), i))
                    for i, d in enumerate(message_dicts)]

        # Add message ids
        TracedDataCodaV2IO.compute_message_ids("test_user", messages, "msg_raw", "msg_coda_id")

        # Load gender scheme
        with open("tests/traced_data/resources/coda_2_msg_scheme.json") as f:
            msg_scheme = CodeScheme.from_firebase_map(json.load(f))

        # Export to a Coda 2 messages file
        with open(file_path, "w") as f:
            TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                messages, "msg_raw", "msg_sent_on", "msg_coda_id", {"msg_coded": msg_scheme}, f)

        self.assertTrue(filecmp.cmp(file_path, "tests/traced_data/resources/coda_2_export_expected_multi_coded.json"))

        # Test importing with no file available
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})
        # Deliberately testing the read can be done twice
        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
            "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme})

        na_id = msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id
        nr_id = msg_scheme.get_code_with_control_code(Codes.NOT_REVIEWED).code_id

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                msg_scheme,
                msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("msg_raw", "") == "":
                td.append_data({"msg_coded": [na_label.to_dict()]},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        for td in imported_messages:
            self.assertEqual(len(td["msg_coded"]), 1)
        imported_code_ids = [td["msg_coded"][0]["CodeID"] for td in imported_messages]
        self.assertListEqual(imported_code_ids, [nr_id, na_id, nr_id, na_id, nr_id, nr_id])

        # Test importing from the test file
        imported_messages = []
        for td in messages:
            imported_messages.append(td.copy())
        with open("tests/traced_data/resources/coda_2_import_test_multi_coded.json", "r") as f:
            TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)

            # Test that reading the same file-pointer twice without moving it back to the start of the file fails
            try:
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                    "test_user", imported_messages, "msg_coda_id", {"msg_coded": msg_scheme}, f)
                self.fail("Re-using the same file pointer didn't raise an assertion error")
            except AssertionError as e:
                self.assertEqual(str(e), "File-pointer not at byte 0. "
                                         "Should you have used e.g. `f.seek(0)` before calling this method?")

        # Set TRUE_MISSING codes
        for td in imported_messages:
            na_label = CleaningUtils.make_label_from_cleaner_code(
                msg_scheme,
                msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                "test_export_traced_data_iterable_to_coda_2",
                date_time_utc="2018-11-02T10:00:00+00:00"
            )
            if td.get("msg_raw", "") == "":
                td.append_data({"msg_coded": [na_label.to_dict()]},
                               Metadata("test_user", Metadata.get_call_location(), time.time()))

        imported_code_ids = []
        for td in imported_messages:
            imported_code_ids.append([code["CodeID"] for code in td["msg_coded"]])

        expected_code_ids = [
            [msg_scheme.get_code_with_match_value("food").code_id],
            [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
            [msg_scheme.get_code_with_match_value("food").code_id, msg_scheme.get_code_with_match_value("water").code_id],
            [msg_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id],
            [msg_scheme.get_code_with_match_value("water").code_id],
            [msg_scheme.get_code_with_control_code(Codes.NOT_CODED).code_id]
        ]

        for x, y in zip(imported_code_ids, expected_code_ids):
            self.assertEqual(len(x), len(y))
            self.assertSetEqual(set(x), set(y))

Exemplo n.º 20

0

Exibir arquivo

def impute_somalia_location_codes(user, data, location_configurations):
    for td in data:
        # Up to 1 location code should have been assigned in Coda. Search for that code,
        # ensuring that only 1 has been assigned or, if multiple have been assigned, that they are non-conflicting
        # control codes
        location_code = None

        for cc in location_configurations:
            coda_code = cc.code_scheme.get_code_with_code_id(
                td[cc.coded_field]["CodeID"])
            if location_code is not None:
                if not (coda_code.code_id == location_code.code_id
                        or coda_code.control_code == Codes.NOT_REVIEWED):
                    location_code = CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(
                        Codes.CODING_ERROR)
            elif coda_code.control_code != Codes.NOT_REVIEWED:
                location_code = coda_code

        # If no code was found, then this location is still not reviewed.
        # Synthesise a NOT_REVIEWED code accordingly.
        if location_code is None:
            location_code = CodeSchemes.MOGADISHU_SUB_DISTRICT.get_code_with_control_code(
                Codes.NOT_REVIEWED)

        # If a control code was found, set all other location keys to that control code,
        # otherwise convert the provided location to the other locations in the hierarchy.
        if location_code.code_type == CodeTypes.CONTROL:
            for cc in location_configurations:
                td.append_data(
                    {
                        cc.coded_field:
                        CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(
                                location_code.control_code),
                            Metadata.get_call_location()).to_dict()
                    }, Metadata(user, Metadata.get_call_location(),
                                time.time()))
        elif location_code.code_type == CodeTypes.META:
            for cc in location_configurations:
                td.append_data(
                    {
                        cc.coded_field:
                        CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_meta_code(
                                location_code.meta_code),
                            Metadata.get_call_location()).to_dict()
                    }, Metadata(user, Metadata.get_call_location(),
                                time.time()))
        else:
            assert location_code.code_type == CodeTypes.NORMAL
            location = location_code.match_values[0]
            td.append_data(
                {
                    "mogadishu_sub_district_coded":
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.MOGADISHU_SUB_DISTRICT,
                        make_location_code(
                            CodeSchemes.MOGADISHU_SUB_DISTRICT,
                            SomaliaLocations.
                            mogadishu_sub_district_for_location_code(location)
                        ), Metadata.get_call_location()).to_dict(),
                    "district_coded":
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOMALIA_DISTRICT,
                        make_location_code(
                            CodeSchemes.SOMALIA_DISTRICT,
                            SomaliaLocations.district_for_location_code(
                                location)),
                        Metadata.get_call_location()).to_dict(),
                    "region_coded":
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOMALIA_REGION,
                        make_location_code(
                            CodeSchemes.SOMALIA_REGION,
                            SomaliaLocations.region_for_location_code(location)
                        ), Metadata.get_call_location()).to_dict(),
                    "state_coded":
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOMALIA_STATE,
                        make_location_code(
                            CodeSchemes.SOMALIA_STATE,
                            SomaliaLocations.state_for_location_code(location)
                        ), Metadata.get_call_location()).to_dict(),
                    "zone_coded":
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOMALIA_ZONE,
                        make_location_code(
                            CodeSchemes.SOMALIA_ZONE,
                            SomaliaLocations.zone_for_location_code(location)),
                        Metadata.get_call_location()).to_dict()
                }, Metadata(user, Metadata.get_call_location(), time.time()))

        # Impute zone from operator
        if "location_raw" not in td:
            operator_str = CodeSchemes.SOMALIA_OPERATOR.get_code_with_code_id(
                td["operator_coded"]["CodeID"]).string_value
            zone_str = SomaliaLocations.zone_for_operator_code(operator_str)

            td.append_data(
                {
                    "zone_coded":
                    CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOMALIA_ZONE,
                        make_location_code(
                            CodeSchemes.SOMALIA_ZONE,
                            SomaliaLocations.state_for_location_code(zone_str)
                        ), Metadata.get_call_location()).to_dict()
                }, Metadata(user, Metadata.get_call_location(), time.time()))

Exemplo n.º 21

0

Exibir arquivo

    def apply_manual_codes(cls, user, data, coda_input_dir):
        # Merge manually coded radio show files into the cleaned dataset
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            rqa_messages = [td for td in data if plan.raw_field in td]
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)
            print(coda_input_path)

            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                    user, rqa_messages, plan.id_field, {plan.coded_field: plan.code_scheme}, f)

                if plan.binary_code_scheme is not None:
                    if f is not None:
                        f.seek(0)
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, rqa_messages, plan.id_field, {plan.binary_coded_field: plan.binary_code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        # At this point, the TracedData objects still contain messages for at most one week each.
        # Label the weeks for which there is no response as TRUE_MISSING.
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if plan.raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.coded_field] = [na_label.to_dict()]

                    if plan.binary_code_scheme is not None:
                        na_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme, plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                            Metadata.get_call_location()
                        )
                        missing_dict[plan.binary_coded_field] = na_label.to_dict()

            td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

        # Synchronise the control codes between the binary and reasons schemes:
        # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
        # if there is an additional reason given. Importing those two schemes separately above caused the labels in
        # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
        # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
        # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
        for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
            rqa_messages = [td for td in data if plan.raw_field in td]
            if plan.binary_code_scheme is not None:
                for td in rqa_messages:
                    binary_label = td[plan.binary_coded_field]
                    binary_code = plan.binary_code_scheme.get_code_with_id(binary_label["CodeID"])

                    binary_label_present = binary_label["CodeID"] != \
                                           plan.binary_code_scheme.get_code_with_control_code(
                                               Codes.NOT_REVIEWED).code_id

                    reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0][
                        "CodeID"] != \
                                            plan.code_scheme.get_code_with_control_code(
                                                Codes.NOT_REVIEWED).code_id

                    if binary_label_present and not reasons_label_present:
                        if binary_code.code_type == "Control":
                            control_code = binary_code.control_code
                            reasons_code = plan.code_scheme.get_code_with_control_code(control_code)

                            reasons_label = CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme, reasons_code,
                                Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation")

                            td.append_data(
                                {plan.coded_field: [reasons_label.to_dict()]},
                                Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                            )
                        else:
                            assert binary_code.code_type == "Normal"

                            nc_label = CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.NOT_CODED),
                                Metadata.get_call_location(), origin_name="Pipeline Code Synchronisation"
                            )
                            td.append_data(
                                {plan.coded_field: [nc_label.to_dict()]},
                                Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string())
                            )

        # Not everyone will have answered all of the demographic flows.
        # Label demographic questions which had no responses as TRUE_MISSING.
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if td.get(plan.raw_field, "") == "":
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.coded_field] = na_label.to_dict()
            td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))

        return data

Exemplo n.º 22

0

Exibir arquivo

Arquivo: auto_code_show_messages.py Projeto: AfricasVoices/Test-REDDS

    def auto_code_show_messages(cls, user, data, icr_output_dir, coda_output_dir):
        # Filter out test messages sent by AVF.
        if not PipelineConfiguration.DEV_MODE:
            data = MessageFilters.filter_test_messages(data)

        # Filter for runs which don't contain a response to any week's question 
        data = MessageFilters.filter_empty_messages(data, cls.TEST_KEYS)

        # Filter out runs sent outside the project start and end dates
        data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY, cls.PROJECT_START_DATE, cls.PROJECT_END_DATE)
        
        # Label missing data
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
                if plan.raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme, plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.coded_field] = [na_label.to_dict()]

                if plan.binary_code_scheme is not None:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme, plan.binary_code_scheme.get_code_with_control_code(Codes.TRUE_MISSING),
                        Metadata.get_call_location()
                    )
                    missing_dict[plan.binary_coded_field] = na_label.to_dict()
            
            td.append_data(missing_dict, Metadata(user, Metadata.get_call_location(), time.time()))
    
        # Label each message with channel keys
        Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

        # Output messagges for Coda
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field, plan.id_field)

            output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    data, plan.raw_field, cls.SENT_ON_KEY, plan.id_field, {}, f
                )
        print("Coda message files successfully exported")

        # Output messages for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for plan in PipelineConfiguration.TEST_SHOWS_CODING_PLANS:
            test_pipeline_messages = []
            na_messages = []
            for td in data:
                if plan.coded_field not in td:
                    test_pipeline_messages.append(td)
                    
                else:
                    assert len(td[plan.coded_field]) == 1
                    assert td[plan.coded_field][0]["CodeID"] == \
                            plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

            
            icr_messages = ICRTools.generate_sample_for_icr(
                test_pipeline_messages, cls.ICR_MESSAGES_COUNT, random.Random(cls.ICR_SEED))
            
            icr_output_path = path.join(icr_output_dir, plan.icr_filename)
            with open(icr_output_path, "w") as f:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    icr_messages, f, headers=[plan.run_id_field, plan.raw_field]
                )
        print("ICR files successfully exported")

        return data

Exemplo n.º 23

0

Exibir arquivo

def fetch_from_rapid_pro(user, google_cloud_credentials_file_path, raw_data_dir, phone_number_uuid_table,
                         rapid_pro_source):
    log.info("Fetching data from Rapid Pro...")
    log.info("Downloading Rapid Pro access token...")
    rapid_pro_token = google_cloud_utils.download_blob_to_string(
        google_cloud_credentials_file_path, rapid_pro_source.token_file_url).strip()

    rapid_pro = RapidProClient(rapid_pro_source.domain, rapid_pro_token)

    # Load the previous export of contacts if it exists, otherwise fetch all contacts from Rapid Pro.
    raw_contacts_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_raw.json"
    contacts_log_path = f"{raw_data_dir}/{rapid_pro_source.contacts_file_name}_log.jsonl"
    try:
        log.info(f"Loading raw contacts from file '{raw_contacts_path}'...")
        with open(raw_contacts_path) as raw_contacts_file:
            raw_contacts = [Contact.deserialize(contact_json) for contact_json in json.load(raw_contacts_file)]
        log.info(f"Loaded {len(raw_contacts)} contacts")
    except FileNotFoundError:
        log.info(f"File '{raw_contacts_path}' not found, will fetch all contacts from the Rapid Pro server")
        with open(contacts_log_path, "a") as contacts_log_file:
            raw_contacts = rapid_pro.get_raw_contacts(raw_export_log_file=contacts_log_file)

    # Download all the runs for each of the radio shows
    for flow in rapid_pro_source.activation_flow_names + rapid_pro_source.survey_flow_names:
        runs_log_path = f"{raw_data_dir}/{flow}_log.jsonl"
        raw_runs_path = f"{raw_data_dir}/{flow}_raw.json"
        traced_runs_output_path = f"{raw_data_dir}/{flow}.jsonl"
        log.info(f"Exporting flow '{flow}' to '{traced_runs_output_path}'...")

        flow_id = rapid_pro.get_flow_id(flow)

        # Load the previous export of runs for this flow, and update them with the newest runs.
        # If there is no previous export for this flow, fetch all the runs from Rapid Pro.
        with open(runs_log_path, "a") as raw_runs_log_file:
            try:
                log.info(f"Loading raw runs from file '{raw_runs_path}'...")
                with open(raw_runs_path) as raw_runs_file:
                    raw_runs = [Run.deserialize(run_json) for run_json in json.load(raw_runs_file)]
                log.info(f"Loaded {len(raw_runs)} runs")
                raw_runs = rapid_pro.update_raw_runs_with_latest_modified(
                    flow_id, raw_runs, raw_export_log_file=raw_runs_log_file, ignore_archives=True)
            except FileNotFoundError:
                log.info(f"File '{raw_runs_path}' not found, will fetch all runs from the Rapid Pro server for flow '{flow}'")
                raw_runs = rapid_pro.get_raw_runs_for_flow_id(flow_id, raw_export_log_file=raw_runs_log_file)

        # Fetch the latest contacts from Rapid Pro.
        with open(contacts_log_path, "a") as raw_contacts_log_file:
            raw_contacts = rapid_pro.update_raw_contacts_with_latest_modified(raw_contacts,
                                                                              raw_export_log_file=raw_contacts_log_file)

        # Convert the runs to TracedData.
        traced_runs = rapid_pro.convert_runs_to_traced_data(
            user, raw_runs, raw_contacts, phone_number_uuid_table, rapid_pro_source.test_contact_uuids)

        if flow in rapid_pro_source.activation_flow_names:
            # Append the Rapid Pro source name to each run.
            # Only do this for activation flows because this is the only place where this is interesting.
            # Also, demogs may come from either instance, which causes problems downstream.
            for td in traced_runs:
                td.append_data({
                    "source_raw": rapid_pro_source.source_name,
                    "source_coded": CleaningUtils.make_label_from_cleaner_code(
                        CodeSchemes.SOURCE, CodeSchemes.SOURCE.get_code_with_match_value(rapid_pro_source.source_name),
                        Metadata.get_call_location()
                    ).to_dict()
                }, Metadata(user, Metadata.get_call_location(), TimeUtils.utc_now_as_iso_string()))

        log.info(f"Saving {len(raw_runs)} raw runs to {raw_runs_path}...")
        with open(raw_runs_path, "w") as raw_runs_file:
            json.dump([run.serialize() for run in raw_runs], raw_runs_file)
        log.info(f"Saved {len(raw_runs)} raw runs")

        log.info(f"Saving {len(traced_runs)} traced runs to {traced_runs_output_path}...")
        IOUtils.ensure_dirs_exist_for_file(traced_runs_output_path)
        with open(traced_runs_output_path, "w") as traced_runs_output_file:
            TracedDataJsonIO.export_traced_data_iterable_to_jsonl(traced_runs, traced_runs_output_file)
        log.info(f"Saved {len(traced_runs)} traced runs")

    log.info(f"Saving {len(raw_contacts)} raw contacts to file '{raw_contacts_path}'...")
    with open(raw_contacts_path, "w") as raw_contacts_file:
        json.dump([contact.serialize() for contact in raw_contacts], raw_contacts_file)
    log.info(f"Saved {len(raw_contacts)} contacts")

Exemplo n.º 24

0

Exibir arquivo

Arquivo: io.py Projeto: AfricasVoices/CoreDataModules

    def import_coda_2_to_traced_data_iterable(cls,
                                              user,
                                              data,
                                              message_id_key,
                                              scheme_key_map,
                                              f=None):
        """
        Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file.

        Data which is has not been checked in the Coda file is coded using the provided nr_label
        (irrespective of whether there was an automatic code there before).

        TODO: Data which has been assigned a code under one scheme but none of the others needs to coded as NC not NR
        TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

        :param user: Identifier of user running this program.
        :type user: str
        :param data: TracedData objects to be coded using the Coda file.
        :type data: iterable of TracedData
        :param message_id_key: Key in TracedData objects of the message ids.
        :type message_id_key: str
        :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                               (Scheme in the Coda messages file to retrieve the labels from)
        :type scheme_key_map: dict of str -> Scheme
        :param f: Coda data file to import codes from, or None.
        :type f: file-like | None
        """
        if f is None:
            f = cls._make_empty_file()

        # Build a lookup table of MessageID -> SchemeID -> Labels
        coda_dataset = cls._dataset_lut_from_messages_file(
            f, scheme_key_map.values())

        # Filter out TracedData objects that do not contain a message id key
        data = [td for td in data if message_id_key in td]

        # Apply the labels from Coda to each TracedData item in data
        for td in data:
            for key_of_coded, scheme in scheme_key_map.items():
                # Get labels for this (message id, scheme id) from the look-up table
                labels = coda_dataset.get(td[message_id_key],
                                          dict()).get(scheme.scheme_id, [])
                if labels is not None:
                    # Append each label that was assigned to this message for this scheme to the TracedData.
                    for label in reversed(labels):
                        td.append_data({key_of_coded: label.to_dict()},
                                       Metadata(
                                           user, Metadata.get_call_location(),
                                           TimeUtils.utc_now_as_iso_string()))

                # If this td still has no label after importing from the Coda file, or the label is a non-missing label
                # that hasn't been checked in the Coda UI, set a code for NOT_REVIEWED
                if key_of_coded not in td or not td[key_of_coded]["Checked"]:
                    nr_label = CleaningUtils.make_label_from_cleaner_code(
                        scheme,
                        scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                        Metadata.get_call_location())
                    td.append_data({key_of_coded: nr_label.to_dict()},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

Exemplo n.º 25

0

Exibir arquivo

Arquivo: io.py Projeto: AfricasVoices/CoreDataModules

    def import_coda_2_to_traced_data_iterable_multi_coded(
            cls, user, data, message_id_key, scheme_key_map, f=None):
        """
        Codes keys in an iterable of TracedData objects by using the codes from a Coda 2 messages JSON file.

        Data which is has not been checked in the Coda file is coded using the provided nr_label
        (irrespective of whether there was an automatic code there before).
        
        Only the 'primary' schemes should be passed in. Schemes that have been duplicated using the duplicate_scheme
        tool in CodaV2/data_tools will be detected as being associated with the primary scheme automatically.

        TODO: Data which has been assigned a code under one scheme but none of the others needs to coded as NC not NR
        TODO: Or, do this in Coda so as to remove ambiguity from the perspective of the RAs?

        :param user: Identifier of user running this program.
        :type user: str
        :param data: TracedData objects to be coded using the Coda file.
        :type data: iterable of TracedData
        :param message_id_key: Key in TracedData objects of the message ids.
        :type message_id_key: str
        :param scheme_key_map: Dictionary of (key in TracedData objects to assign labels to) ->
                            (Scheme in the Coda messages file to retrieve the labels from)
        :type scheme_key_map: dict of str -> iterable of Scheme
        :param f: Coda data file to import codes from, or None. If None, assigns NOT_REVIEWED codes to everything.
        :type f: file-like | None
        """
        if f is None:
            f = cls._make_empty_file()

        # Build a lookup table of MessageID -> SchemeID -> Labels
        coda_dataset = cls._dataset_lut_from_messages_file(
            f, scheme_key_map.values())

        # Filter out TracedData objects that do not contain a message id key
        data = [td for td in data if message_id_key in td]

        # Apply the labels from Coda to each TracedData item in data
        for td in data:
            for coded_key, scheme in scheme_key_map.items():
                # Get labels for this (message id, scheme id) from the look-up table
                labels = coda_dataset.get(td[message_id_key],
                                          dict()).get(scheme.scheme_id, [])

                # Get the currently assigned list of labels for this multi-coded scheme,
                # and construct a look-up table of scheme id -> label
                td_labels = td.get(coded_key, [])
                td_labels_lut = {
                    label["SchemeID"]: Label.from_dict(label)
                    for label in td_labels
                }

                for label in reversed(labels):
                    # Update the relevant label in this traced data's list of labels with the new label,
                    # and append the whole new list to the traced data.
                    td_labels_lut[label.scheme_id] = label

                    td_labels = list(td_labels_lut.values())
                    td.append_data(
                        {coded_key: [label.to_dict() for label in td_labels]},
                        Metadata(user, Metadata.get_call_location(),
                                 TimeUtils.utc_now_as_iso_string()))

                # Delete any labels that are SPECIAL-MANUALLY_UNCODED
                for scheme_id, label in list(td_labels_lut.items()):
                    if label.code_id == "SPECIAL-MANUALLY_UNCODED":
                        del td_labels_lut[scheme_id]
                        td_labels = list(td_labels_lut.values())
                        td.append_data(
                            {
                                coded_key:
                                [label.to_dict() for label in td_labels]
                            },
                            Metadata(user, Metadata.get_call_location(),
                                     time.time()))

                # If no manual labels have been set and are checked, set a code for NOT_REVIEWED
                checked_codes_count = 0
                labels = td.get(coded_key)
                if labels is not None:
                    for label in labels:
                        if label["Checked"]:
                            checked_codes_count += 1

                if checked_codes_count == 0:
                    nr_label = CleaningUtils.make_label_from_cleaner_code(
                        scheme,
                        scheme.get_code_with_control_code(Codes.NOT_REVIEWED),
                        Metadata.get_call_location())

                    td.append_data({coded_key: [nr_label.to_dict()]},
                                   Metadata(user, Metadata.get_call_location(),
                                            time.time()))

                # Normalise the scheme ids of all the imported labels
                labels = [Label.from_dict(d) for d in td[coded_key]]
                for label in labels:
                    assert label.scheme_id.startswith(scheme.scheme_id)
                    label.scheme_id = scheme.scheme_id

                # De-duplicate the imported labels by selecting the first label with each code id.
                # This is required in cases where the same label was applied to this message under different columns
                # of the same code scheme, and is possible now that we have normalised the scheme ids.
                unique_labels_by_code_id = []
                seen_code_ids = set()
                for label in labels:
                    if label.code_id not in seen_code_ids:
                        unique_labels_by_code_id.append(label)
                        seen_code_ids.add(label.code_id)

                td.append_data(
                    {
                        coded_key: [
                            label.to_dict()
                            for label in unique_labels_by_code_id
                        ]
                    }, Metadata(user, Metadata.get_call_location(),
                                time.time()))

Exemplo n.º 26

0

Exibir arquivo

    def apply_manual_codes(cls, user, data, coda_input_dir):
        # Merge manually coded data into the cleaned dataset
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)

            for cc in plan.coding_configurations:
                f = None
                try:
                    if path.exists(coda_input_path):
                        f = open(coda_input_path, "r")

                    if cc.coding_mode == CodingModes.SINGLE:
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                            user, data, plan.id_field,
                            {cc.coded_field: cc.code_scheme}, f)
                    else:
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                            user, data, plan.id_field,
                            {cc.coded_field: cc.code_scheme}, f)
                finally:
                    if f is not None:
                        f.close()

        # Label data for which there is no response as TRUE_MISSING.
        # Label data for which the response is the empty string as NOT_CODED.
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                if plan.raw_field not in td:
                    for cc in plan.coding_configurations:
                        na_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(
                                Codes.TRUE_MISSING),
                            Metadata.get_call_location()).to_dict()
                        missing_dict[
                            cc.
                            coded_field] = na_label if cc.coding_mode == CodingModes.SINGLE else [
                                na_label
                            ]
                elif td[plan.raw_field] == "":
                    for cc in plan.coding_configurations:
                        nc_label = CleaningUtils.make_label_from_cleaner_code(
                            cc.code_scheme,
                            cc.code_scheme.get_code_with_control_code(
                                Codes.NOT_CODED),
                            Metadata.get_call_location()).to_dict()
                        missing_dict[
                            cc.
                            coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [
                                nc_label
                            ]
            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Mark data that is noise as Codes.NOT_CODED
        for td in data:
            if td.get("noise", False):
                nc_dict = dict()
                for plan in PipelineConfiguration.RQA_CODING_PLANS:
                    for cc in plan.coding_configurations:
                        if cc.coded_field not in td:
                            nc_label = CleaningUtils.make_label_from_cleaner_code(
                                cc.code_scheme,
                                cc.code_scheme.get_code_with_control_code(
                                    Codes.NOT_CODED),
                                Metadata.get_call_location()).to_dict()
                            nc_dict[
                                cc.
                                coded_field] = nc_label if cc.coding_mode == CodingModes.SINGLE else [
                                    nc_label
                                ]
                td.append_data(
                    nc_dict,
                    Metadata(user, Metadata.get_call_location(), time.time()))

        # Run code imputation functions
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.code_imputation_function is not None:
                plan.code_imputation_function(user, data,
                                              plan.coding_configurations)

        cls._impute_coding_error_codes(user, data)

        return data

Exemplo n.º 27

0

Exibir arquivo

    def auto_code_show_messages(cls, user, data, icr_output_dir,
                                coda_output_dir):
        # Filter out test messages sent by AVF.
        if not PipelineConfiguration.DEV_MODE:
            data = MessageFilters.filter_test_messages(data)

        # Filter for runs which don't contain a response to any week's question
        data = MessageFilters.filter_empty_messages(data, cls.RQA_KEYS)

        # Filter out runs sent outwith the project start and end dates
        data = MessageFilters.filter_time_range(data, cls.SENT_ON_KEY,
                                                cls.PROJECT_START_DATE,
                                                cls.PROJECT_END_DATE)

        # Tag messages which are noise as being noise
        for td in data:
            is_noise = True
            for rqa_key in cls.RQA_KEYS:
                if rqa_key in td and not somali.DemographicCleaner.is_noise(
                        td[rqa_key], min_length=10):
                    is_noise = False
            td.append_data({cls.NOISE_KEY: is_noise},
                           Metadata(user, Metadata.get_call_location(),
                                    time.time()))

        # Label missing data
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.TRUE_MISSING), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = [na_label.to_dict()]

                    if plan.binary_code_scheme is not None:
                        na_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme,
                            plan.binary_code_scheme.get_code_with_control_code(
                                Codes.TRUE_MISSING),
                            Metadata.get_call_location())
                        missing_dict[
                            plan.binary_coded_field] = na_label.to_dict()

            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Label each message with channel keys
        Channels.set_channel_keys(user, data, cls.SENT_ON_KEY)

        # Filter for messages which aren't noise (in order to export to Coda and export for ICR)
        not_noise = MessageFilters.filter_noise(data, cls.NOISE_KEY,
                                                lambda x: x)

        # Output messages which aren't noise to Coda
        IOUtils.ensure_dirs_exist(coda_output_dir)
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            TracedDataCodaV2IO.compute_message_ids(user, not_noise,
                                                   plan.raw_field,
                                                   plan.id_field)

            output_path = path.join(coda_output_dir, plan.coda_filename)
            with open(output_path, "w") as f:
                TracedDataCodaV2IO.export_traced_data_iterable_to_coda_2(
                    not_noise, plan.raw_field, cls.SENT_ON_KEY, plan.id_field,
                    {}, f)

        # Output messages for ICR
        IOUtils.ensure_dirs_exist(icr_output_dir)
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_messages = []
            for td in not_noise:
                # This test works because the only codes which have been applied at this point are TRUE_MISSING.
                # If any other coding is done above, this test will need to change.
                if plan.coded_field not in td:
                    rqa_messages.append(td)
                else:
                    assert len(td[plan.coded_field]) == 1
                    assert td[plan.coded_field][0]["CodeID"] == \
                        plan.code_scheme.get_code_with_control_code(Codes.TRUE_MISSING).code_id

            icr_messages = ICRTools.generate_sample_for_icr(
                rqa_messages, cls.ICR_MESSAGES_COUNT,
                random.Random(cls.ICR_SEED))

            icr_output_path = path.join(icr_output_dir, plan.icr_filename)
            with open(icr_output_path, "w") as f:
                TracedDataCSVIO.export_traced_data_iterable_to_csv(
                    icr_messages,
                    f,
                    headers=[plan.run_id_field, plan.raw_field])

        return data

Exemplo n.º 28

0

Exibir arquivo

Arquivo: apply_manual_codes.py Projeto: AfricasVoices/Project-ICRAF

    def apply_manual_codes(cls, user, data, coda_input_dir):
        # Merge manually coded radio show files into the cleaned dataset
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_messages = [td for td in data if plan.raw_field in td]
            coda_input_path = path.join(coda_input_dir, plan.coda_filename)

            f = None
            try:
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                    user, rqa_messages, plan.id_field,
                    {plan.coded_field: plan.code_scheme}, f)

                if plan.binary_code_scheme is not None:
                    if f is not None:
                        f.seek(0)
                    TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                        user, rqa_messages, plan.id_field,
                        {plan.binary_coded_field: plan.binary_code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        # Label the RQA for which there is no response yet as TRUE MISSING
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.TRUE_MISSING), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = [na_label.to_dict()]

                    if plan.binary_code_scheme is not None:
                        na_label = CleaningUtils.make_label_from_cleaner_code(
                            plan.binary_code_scheme,
                            plan.binary_code_scheme.get_code_with_control_code(
                                Codes.TRUE_MISSING),
                            Metadata.get_call_location())
                        missing_dict[
                            plan.binary_coded_field] = na_label.to_dict()
                elif td[plan.raw_field] == "":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.NOT_CODED), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = [nc_label.to_dict()]
                elif plan.binary_code_scheme is not None and td[
                        plan.raw_field] == "":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.binary_code_scheme,
                        plan.binary_code_scheme.get_code_with_control_code(
                            Codes.NOT_CODED), Metadata.get_call_location())
                    missing_dict[plan.binary_coded_field] = [
                        nc_label.to_dict()
                    ]
            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Synchronise the control codes between the binary and reasons schemes:
        # Some RQA datasets have a binary scheme, which is always labelled, and a reasons scheme, which is only labelled
        # if there is an additional reason given. Importing those two schemes separately above caused the labels in
        # each scheme to go out of sync with each other, e.g. reasons can be NR when the binary *was* reviewed.
        # This block updates the reasons scheme in cases where only a binary label was set, by assigning the
        # label 'NC' if the binary label was set to a normal code, otherwise to be the same control code as the binary.
        for plan in PipelineConfiguration.RQA_CODING_PLANS:
            rqa_messages = [td for td in data if plan.raw_field in td]
            if plan.binary_code_scheme is not None:
                for td in rqa_messages:
                    binary_label = td[plan.binary_coded_field]
                    binary_code = plan.binary_code_scheme.get_code_with_id(
                        binary_label["CodeID"])

                    binary_label_present = binary_label["CodeID"] != \
                                            plan.binary_code_scheme.get_code_with_control_code(
                                                Codes.NOT_REVIEWED).code_id

                    reasons_label_present = len(td[plan.coded_field]) > 1 or td[plan.coded_field][0][
                        "CodeID"] != \
                                            plan.code_scheme.get_code_with_control_code(
                                                Codes.NOT_REVIEWED).code_id

                    if binary_label_present and not reasons_label_present:
                        if binary_code.code_type == "Control":
                            control_code = binary_code.control_code
                            reasons_code = plan.code_scheme.get_code_with_control_code(
                                control_code)

                            reasons_label = CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme,
                                reasons_code,
                                Metadata.get_call_location(),
                                origin_name="Pipeline Code Synchronisation")

                            td.append_data(
                                {plan.coded_field: [reasons_label.to_dict()]},
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))
                        else:
                            assert binary_code.code_type == "Normal"

                            nc_label = CleaningUtils.make_label_from_cleaner_code(
                                plan.code_scheme,
                                plan.code_scheme.get_code_with_control_code(
                                    Codes.NOT_CODED),
                                Metadata.get_call_location(),
                                origin_name="Pipeline Code Synchronisation")
                            td.append_data(
                                {plan.coded_field: [nc_label.to_dict()]},
                                Metadata(user, Metadata.get_call_location(),
                                         TimeUtils.utc_now_as_iso_string()))

        # Merge manually coded demog and follow-up survey files into the cleaned dataset
        # Recursion depth currently exceeding
        # TODO: Investigate/address the cause of this.
        sys.setrecursionlimit(10000)
        for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
            f = None
            try:
                coda_input_path = path.join(coda_input_dir, plan.coda_filename)
                if path.exists(coda_input_path):
                    f = open(coda_input_path, "r")
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, plan.id_field,
                    {plan.coded_field: plan.code_scheme}, f)
            finally:
                if f is not None:
                    f.close()

        # Not everyone will have answered all of the demographic and follow-up survey flows flows.
        # Label demographic and follow-up survey questions which had no responses as TRUE_MISSING.
        # Label data which is just the empty string as NOT_CODED.
        for td in data:
            missing_dict = dict()
            for plan in PipelineConfiguration.DEMOGS_CODING_PLANS + PipelineConfiguration.FOLLOW_UP_CODING_PLANS:
                if plan.raw_field not in td:
                    na_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.TRUE_MISSING), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = na_label.to_dict()
                elif td[plan.raw_field] == "":
                    nc_label = CleaningUtils.make_label_from_cleaner_code(
                        plan.code_scheme,
                        plan.code_scheme.get_code_with_control_code(
                            Codes.NOT_CODED), Metadata.get_call_location())
                    missing_dict[plan.coded_field] = nc_label.to_dict()
            td.append_data(
                missing_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))

        # Set county/constituency/from the coded constituency field.
        cls._impute_location_codes(user, data)

        # Set coding error codes using the coding error field
        cls._impute_coding_error_codes(user, data)

        return data

Exemplo n.º 29

0

Exibir arquivo

Arquivo: ws_correction.py Projeto: AfricasVoices/Project-COVID19-KE-Urban

    def move_wrong_scheme_messages(user, data, coda_input_dir):
        log.info("Importing manually coded Coda files to '_WS' fields...")
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.coda_filename is None:
                continue

            TracedDataCodaV2IO.compute_message_ids(user, data, plan.raw_field,
                                                   f"{plan.id_field}_WS")
            with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
                TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                    user, data, f"{plan.id_field}_WS", {
                        f"{plan.raw_field}_WS_correct_dataset":
                        PipelineConfiguration.WS_CORRECT_DATASET_SCHEME
                    }, f)

            for cc in plan.coding_configurations:
                with open(f"{coda_input_dir}/{plan.coda_filename}") as f:
                    if cc.coding_mode == CodingModes.SINGLE:
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable(
                            user, data, plan.id_field + "_WS",
                            {f"{cc.coded_field}_WS": cc.code_scheme}, f)
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        TracedDataCodaV2IO.import_coda_2_to_traced_data_iterable_multi_coded(
                            user, data, f"{plan.id_field}_WS",
                            {f"{cc.coded_field}_WS": cc.code_scheme}, f)

        log.info("Checking for WS Coding Errors...")
        # Check for coding errors
        for td in data:
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                rqa_codes = []
                for cc in plan.coding_configurations:
                    if cc.coding_mode == CodingModes.SINGLE:
                        if f"{cc.coded_field}_WS" in td:
                            label = td[f"{cc.coded_field}_WS"]
                            rqa_codes.append(
                                cc.code_scheme.get_code_with_code_id(
                                    label["CodeID"]))
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        for label in td.get(f"{cc.coded_field}_WS", []):
                            rqa_codes.append(
                                cc.code_scheme.get_code_with_code_id(
                                    label["CodeID"]))

                has_ws_code_in_code_scheme = False
                for code in rqa_codes:
                    if code.control_code == Codes.WRONG_SCHEME:
                        has_ws_code_in_code_scheme = True

                has_ws_code_in_ws_scheme = False
                if f"{plan.raw_field}_WS_correct_dataset" in td:
                    ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                        td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                    has_ws_code_in_ws_scheme = ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED

                if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                    log.warning(
                        f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}"
                    )
                    coding_error_dict = {
                        f"{plan.raw_field}_WS_correct_dataset":
                        CleaningUtils.make_label_from_cleaner_code(
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME,
                            PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.
                            get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location(),
                        ).to_dict()
                    }
                    td.append_data(
                        coding_error_dict,
                        Metadata(user, Metadata.get_call_location(),
                                 time.time()))

        # Construct a map from WS normal code id to the raw field that code indicates a requested move to.
        ws_code_to_raw_field_map = dict()
        for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
            if plan.ws_code is not None:
                ws_code_to_raw_field_map[plan.ws_code.code_id] = plan.raw_field

        # Group the TracedData by uid.
        data_grouped_by_uid = dict()
        for td in data:
            uid = td["uid"]
            if uid not in data_grouped_by_uid:
                data_grouped_by_uid[uid] = []
            data_grouped_by_uid[uid].append(td)

        # Perform the WS correction for each uid.
        log.info("Performing WS correction...")
        corrected_data = []  # List of TracedData with the WS data moved.
        unknown_target_code_counts = dict(
        )  # 'WS - Correct Dataset' codes with no matching code id in any coding plan
        # for this project, with a count of the occurrences
        for group in data_grouped_by_uid.values():
            # Find all the surveys data being moved.
            # (Note: we only need to check one td in this group because all the demographics are the same)
            td = group[0]
            survey_moves = dict()  # of source_field -> target_field
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if plan.raw_field not in td or plan.coda_filename is None:
                    continue
                ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                    td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                    if ws_code.code_id in ws_code_to_raw_field_map:
                        survey_moves[
                            plan.raw_field] = ws_code_to_raw_field_map[
                                ws_code.code_id]
                    else:
                        if (ws_code.code_id, ws_code.display_text
                            ) not in unknown_target_code_counts:
                            unknown_target_code_counts[(
                                ws_code.code_id, ws_code.display_text)] = 0
                        unknown_target_code_counts[(ws_code.code_id,
                                                    ws_code.display_text)] += 1
                        survey_moves[plan.raw_field] = None

            # Find all the RQA data being moved.
            rqa_moves = dict(
            )  # of (index in group, source_field) -> target_field
            for i, td in enumerate(group):
                for plan in PipelineConfiguration.RQA_CODING_PLANS:
                    if plan.raw_field not in td or plan.coda_filename is None:
                        continue
                    ws_code = PipelineConfiguration.WS_CORRECT_DATASET_SCHEME.get_code_with_code_id(
                        td[f"{plan.raw_field}_WS_correct_dataset"]["CodeID"])
                    if ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED:
                        if ws_code.code_id in ws_code_to_raw_field_map:
                            rqa_moves[(
                                i, plan.raw_field
                            )] = ws_code_to_raw_field_map[ws_code.code_id]
                        else:
                            if (ws_code.code_id, ws_code.display_text
                                ) not in unknown_target_code_counts:
                                unknown_target_code_counts[(
                                    ws_code.code_id, ws_code.display_text)] = 0
                            unknown_target_code_counts[(
                                ws_code.code_id, ws_code.display_text)] += 1
                            rqa_moves[(i, plan.raw_field)] = None

            # Build a dictionary of the survey fields that haven't been moved, and cleared fields for those which have.
            survey_updates = dict()  # of raw_field -> updated value
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if plan.coda_filename is None:
                    continue

                if plan.raw_field in survey_moves.keys():
                    # Data is moving
                    survey_updates[plan.raw_field] = []
                elif plan.raw_field in td:
                    # Data is not moving
                    survey_updates[plan.raw_field] = [
                        _WSUpdate(td[plan.raw_field], td[plan.time_field],
                                  plan.raw_field, td)
                    ]

            # Build a list of the rqa fields that haven't been moved.
            rqa_updates = []  # of (raw_field, _WSUpdate)
            for i, td in enumerate(group):
                for plan in PipelineConfiguration.RQA_CODING_PLANS:
                    if plan.coda_filename is None:
                        continue

                    if plan.raw_field in td:
                        if (i, plan.raw_field) in rqa_moves.keys():
                            # Data is moving
                            pass
                        else:
                            # Data is not moving
                            rqa_updates.append(
                                (plan.raw_field,
                                 _WSUpdate(td[plan.raw_field],
                                           td[plan.time_field], plan.raw_field,
                                           td)))

            # Add data moving from survey fields to the relevant survey_/rqa_updates
            raw_survey_fields = {
                plan.raw_field
                for plan in PipelineConfiguration.SURVEY_CODING_PLANS
            }
            raw_rqa_fields = {
                plan.raw_field
                for plan in PipelineConfiguration.RQA_CODING_PLANS
            }
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
                if plan.raw_field not in survey_moves:
                    continue

                target_field = survey_moves[plan.raw_field]
                if target_field is None:
                    continue

                update = _WSUpdate(td[plan.raw_field], td[plan.time_field],
                                   plan.raw_field, td)
                if target_field in raw_survey_fields:
                    survey_updates[target_field] = survey_updates.get(
                        target_field, []) + [update]
                else:
                    assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                    rqa_updates.append((target_field, update))

            # Add data moving from RQA fields to the relevant survey_/rqa_updates
            for (i, source_field), target_field in rqa_moves.items():
                if target_field is None:
                    continue

                for plan in PipelineConfiguration.SURVEY_CODING_PLANS + PipelineConfiguration.RQA_CODING_PLANS:
                    if plan.raw_field == source_field:
                        _td = group[i]
                        update = _WSUpdate(_td[plan.raw_field],
                                           _td[plan.time_field],
                                           plan.raw_field, td)
                        if target_field in raw_survey_fields:
                            survey_updates[target_field] = survey_updates.get(
                                target_field, []) + [update]
                        else:
                            assert target_field in raw_rqa_fields, f"Raw field '{target_field}' not in any coding plan"
                            rqa_updates.append((target_field, update))

            # Re-format the survey updates to a form suitable for use by the rest of the pipeline
            flattened_survey_updates = {}
            for plan in PipelineConfiguration.SURVEY_CODING_PLANS:
                if plan.raw_field in survey_updates:
                    plan_updates = survey_updates[plan.raw_field]

                    if len(plan_updates) > 0:
                        flattened_survey_updates[plan.raw_field] = "; ".join(
                            [u.message for u in plan_updates])
                        flattened_survey_updates[plan.time_field] = sorted(
                            [u.timestamp for u in plan_updates])[0]
                        flattened_survey_updates[
                            f"{plan.raw_field}_source"] = "; ".join(
                                [u.source_field for u in plan_updates])
                    else:
                        flattened_survey_updates[plan.raw_field] = None
                        flattened_survey_updates[plan.time_field] = None
                        flattened_survey_updates[
                            f"{plan.raw_field}_source"] = None

            # For each RQA message, create a copy of its source td, append the updated TracedData, and add this to
            # the list of TracedData to be returned
            raw_field_to_rqa_plan_map = {
                plan.raw_field: plan
                for plan in PipelineConfiguration.RQA_CODING_PLANS
            }
            for target_field, update in rqa_updates:
                corrected_td = update.source_td.copy()

                # Hide the survey keys currently in the TracedData which have had data moved away.
                corrected_td.hide_keys({
                    k
                    for k, v in flattened_survey_updates.items() if v is None
                }.intersection(corrected_td.keys()),
                                       Metadata(user,
                                                Metadata.get_call_location(),
                                                time.time()))

                # Update with the corrected survey data
                corrected_td.append_data(
                    {
                        k: v
                        for k, v in flattened_survey_updates.items()
                        if v is not None
                    }, Metadata(user, Metadata.get_call_location(),
                                time.time()))

                # Hide all the RQA fields (they will be added back, in turn, in the next step).
                corrected_td.hide_keys({
                    plan.raw_field
                    for plan in PipelineConfiguration.RQA_CODING_PLANS
                }.intersection(corrected_td.keys()),
                                       Metadata(user,
                                                Metadata.get_call_location(),
                                                time.time()))
                corrected_td.hide_keys({
                    plan.time_field
                    for plan in PipelineConfiguration.RQA_CODING_PLANS
                }.intersection(corrected_td.keys()),
                                       Metadata(user,
                                                Metadata.get_call_location(),
                                                time.time()))

                target_coding_plan = raw_field_to_rqa_plan_map[target_field]

                rqa_dict = {
                    target_field: update.message,
                    target_coding_plan.time_field: update.timestamp,
                    f"{target_field}_source": update.source_field
                }

                corrected_td.append_data(
                    rqa_dict,
                    Metadata(user, Metadata.get_call_location(), time.time()))
                corrected_data.append(corrected_td)

        if len(unknown_target_code_counts) > 0:
            log.warning(
                "Found the following 'WS - Correct Dataset' CodeIDs with no matching coding plan:"
            )
            for (code_id,
                 display_text), count in unknown_target_code_counts.items():
                log.warning(
                    f"  '{code_id}' (DisplayText '{display_text}') ({count} occurrences)"
                )

        return corrected_data

Exemplo n.º 30

0

Exibir arquivo

Arquivo: apply_manual_codes.py Projeto: AfricasVoices/Project-COVID19-SOM

    def _impute_coding_error_codes(user, data):
        for td in data:
            coding_error_dict = dict()
            for plan in PipelineConfiguration.RQA_CODING_PLANS + PipelineConfiguration.SURVEY_CODING_PLANS:
                rqa_codes = []
                for cc in plan.coding_configurations:
                    if cc.coding_mode == CodingModes.SINGLE:
                        if cc.coded_field in td:
                            label = td[cc.coded_field]
                            rqa_codes.append(
                                cc.code_scheme.get_code_with_code_id(
                                    label["CodeID"]))
                    else:
                        assert cc.coding_mode == CodingModes.MULTIPLE
                        for label in td.get(cc.coded_field, []):
                            rqa_codes.append(
                                cc.code_scheme.get_code_with_code_id(
                                    label["CodeID"]))

                has_ws_code_in_code_scheme = False
                for code in rqa_codes:
                    if code.control_code == Codes.WRONG_SCHEME:
                        has_ws_code_in_code_scheme = True

                has_ws_code_in_ws_scheme = False
                if f"{plan.raw_field}_correct_dataset" in td:
                    ws_code = CodeSchemes.WS_CORRECT_DATASET.get_code_with_code_id(
                        td[f"{plan.raw_field}_correct_dataset"]["CodeID"])
                    has_ws_code_in_ws_scheme = ws_code.code_type == "Normal" or ws_code.control_code == Codes.NOT_CODED

                if has_ws_code_in_code_scheme != has_ws_code_in_ws_scheme:
                    log.warning(
                        f"Coding Error: {plan.raw_field}: {td[plan.raw_field]}"
                    )
                    coding_error_dict[f"{plan.raw_field}_correct_dataset"] = \
                        CleaningUtils.make_label_from_cleaner_code(
                            CodeSchemes.WS_CORRECT_DATASET,
                            CodeSchemes.WS_CORRECT_DATASET.get_code_with_control_code(Codes.CODING_ERROR),
                            Metadata.get_call_location(),
                        ).to_dict()

                    for cc in plan.coding_configurations:
                        if cc.coding_mode == CodingModes.SINGLE:
                            coding_error_dict[cc.coded_field] = \
                                CleaningUtils.make_label_from_cleaner_code(
                                    cc.code_scheme,
                                    cc.code_scheme.get_code_with_control_code(Codes.CODING_ERROR),
                                    Metadata.get_call_location()
                                ).to_dict()
                        else:
                            assert cc.coding_mode == CodingModes.MULTIPLE
                            coding_error_dict[cc.coded_field] = [
                                CleaningUtils.make_label_from_cleaner_code(
                                    cc.code_scheme,
                                    cc.code_scheme.get_code_with_control_code(
                                        Codes.CODING_ERROR),
                                    Metadata.get_call_location()).to_dict()
                            ]

            td.append_data(
                coding_error_dict,
                Metadata(user, Metadata.get_call_location(), time.time()))