args = parser.parse_args() user = args.user[0] input_path = args.input[0] coding_mode = args.coding_mode[0] coded_input_directory = args.coding_input[0] json_output_path = args.json_output[0] # Load data from JSON file with open(input_path, "r") as f: data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) if coding_mode == "coda": # Merge manually coded Coda files into the cleaned dataset. with open(path.join(coded_input_directory, "gender.csv"), "r") as f: data = list( TracedDataCodaIO.import_coda_to_traced_data_iterable( user, data, "GENDER_R", "GENDER_R_clean", f, True)) with open(path.join(coded_input_directory, "age.csv"), "r") as f: data = list( TracedDataCodaIO.import_coda_to_traced_data_iterable( user, data, "AGE_R", "AGE_R_clean", f, True)) if len(data) > 0 and "LOCATION_R" in data[0]: with open(path.join(coded_input_directory, "location.csv"), "r") as f: data = list( TracedDataCodaIO.import_coda_to_traced_data_iterable( user, data, "LOCATION_R", "LOCATION_R_clean", f, True)) with open(path.join(coded_input_directory, "nationality.csv"), "r") as f:
# Output messages to Coda IOUtils.ensure_dirs_exist_for_file(coda_output_path) if os.path.exists(prev_coda_path): # TODO: Modifying this line once the coding frame has been developed to include lots of Nones feels a bit # TODO: cumbersome. We could instead modify export_traced_data_iterable_to_coda to support a prev_f argument. # TODO: Modify by adding code scheme keys once they are ready scheme_keys = { "Relevance": None, "Code 1": None, "Code 2": None, "Code 3": None, "Code 4": None } with open(coda_output_path, "w") as f, open(prev_coda_path, "r") as prev_f: TracedDataCodaIO.export_traced_data_iterable_to_coda_with_scheme( show_messages, show_message_key, scheme_keys, f, prev_f=prev_f) else: with open(coda_output_path, "w") as f: TracedDataCodaIO.export_traced_data_iterable_to_coda( show_messages, show_message_key, f) # Randomly select some messages to export for ICR random.seed(0) random.shuffle(show_messages) icr_messages = show_messages[:ICR_MESSAGES_COUNT] # Output ICR data to a CSV file run_id_key = "{} (Run ID) - {}".format(variable_name, flow_name) raw_text_key = "{} (Text) - {}".format(variable_name, flow_name) IOUtils.ensure_dirs_exist_for_file(icr_output_path) with open(icr_output_path, "w") as f:
json_output_path = args.json_output_path[0] csv_output_path = args.csv_output_path[0] key_of_clean = "{}_clean".format(key_of_raw) # Load data from JSON file with open(json_input_path, "r") as f: data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Merge coded data into the loaded data file if coding_mode == "coda": # Merge manually coded Coda files into the cleaned dataset with open(path.join(coded_input_path, "{}.csv".format(key_of_raw)), "r") as f: data = list( TracedDataCodaIO.import_coda_to_traced_data_iterable( user, data, key_of_raw, key_of_clean, f, True)) else: assert coding_mode == "coding-csv", "coding_mode was not one of 'coda' or 'coding-csv'" # Merge manually coded CSV files into the cleaned dataset with open(path.join(coded_input_path, "{}.csv".format(key_of_raw)), "r") as f: data = list( TracedDataCodingCSVIO. import_coding_csv_to_traced_data_iterable( user, data, key_of_raw, key_of_clean, key_of_raw, key_of_clean, f, True)) # Write coded data back out to disk if os.path.dirname(json_output_path) is not "" and not os.path.exists( os.path.dirname(json_output_path)):
# Parse command-line arguments (declared without nargs=1 here, so no [0]).
json_input_path = args.json_input_path
coding_mode = args.coding_mode
coded_input_path = args.coded_input_path
json_output_path = args.json_output_path

# Load data from JSON file
with open(json_input_path, "r") as f:
    data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

# Merge coded data into the loaded data file
if coding_mode == "coda":
    # Merge manually coded Coda files into the cleaned dataset
    # FIXME: Set the <example-arguments> to import a particular column e.g. "age", "age_clean", "Age"
    with open(path.join(coded_input_path, "<input-file>.csv"), "r") as f:
        data = list(TracedDataCodaIO.import_coda_to_traced_data_iterable(
            user, data, "<key-of-raw>", "<key-of-coded>", f, True))
    # FIXME: Re-use the above code sample to export other columns which need importing.
else:
    assert coding_mode == "coding-csv", "coding_mode was not one of 'coda' or 'coding-csv'"
    # Merge manually coded CSV files into the cleaned dataset
    # FIXME: Set the <example-arguments> to import a particular column e.g. "age", "age_clean", "Age"
    with open(path.join(coded_input_path, "<input-file>.csv"), "r") as f:
        data = list(TracedDataCodingCSVIO.import_coding_csv_to_traced_data_iterable(
            user, data,
            "<key_of_raw_in_data>", "<key_of_coded_in_data>",
            "<key_of_raw_in_f>", "<key_of_coded_in_f>",
            f, True))
os.path.dirname(json_output_path)): os.makedirs(os.path.dirname(json_output_path)) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True) # Output for manual verification + coding if coding_mode == "coda": # Write Coda output if not os.path.exists(coded_output_path): os.makedirs(coded_output_path) # FIXME: Set the <example-arguments> to export a particular column e.g. "age", "age_clean", "Age" with open(path.join(coded_output_path, "<output-file>.csv"), "w") as f: TracedDataCodaIO.export_traced_data_iterable_to_coda_with_scheme( data, "<key-of-raw>", "<key-of-coded>", "<name-in-Coda>", f) # FIXME: Re-use the above code sample to export other columns which need verifying/coding. else: assert coding_mode == "coding-csv", "coding_mode was not one of 'coda' or 'coding-csv'" # Write Coding CSV output if not os.path.exists(coded_output_path): os.makedirs(coded_output_path) # FIXME: Set the <example-arguments> to export a particular column e.g. "age", "age_clean", "Age" with open(path.join(coded_output_path, "<output-file>.csv"), "w") as f: TracedDataCodingCSVIO.export_traced_data_iterable_to_coding_csv_with_scheme( data, "<key-of-raw>", "<key-of-coded>", f) # FIXME: Re-use the above code sample to export other columns which need verifying/coding.
help="Path to write merged dataset to, as JSON", nargs=1) args = parser.parse_args() user = args.user[0] json_input_path = args.input[0] coda_input_path = args.coda_input[0] output_path = args.output[0] col_raw = args.raw_column[0] col_coded = args.coded_column[0] # Load data from a JSON file with open(json_input_path, "r") as f: data = TracedDataJsonIO.import_json_to_traced_data_iterable(f) # Merge Coda input into the data imported from the JSON with open(coda_input_path, "r") as f: data = list( TracedDataCodaIO.import_coda_to_traced_data_iterable( user, data, col_raw, col_coded, f)) # Write merged output if os.path.dirname(output_path) is not "" and not os.path.exists( os.path.dirname(output_path)): os.makedirs(os.path.dirname(output_path)) with open(output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True)
# Merge each manually coded Coda file into the surveys dataset.
for plan_item in merge_plans:
    coda_file_path = path.join(coded_input_path, plan_item.coda_filename)

    if not path.exists(coda_file_path):
        # Coda file missing: still write every output column (as None) so
        # downstream steps see a consistent set of keys, then skip the import.
        print("Warning: Coda file '{}' not found".format(plan_item.coda_filename))
        for td in surveys:
            for td_col in plan_item.scheme_keys.values():
                td.append_data(
                    {td_col: None},
                    Metadata(user, Metadata.get_call_location(), time.time()))
        continue

    with open(coda_file_path, "r") as f:
        TracedDataCodaIO.import_coda_to_traced_data_iterable(
            user, surveys, plan_item.key_of_raw, plan_item.scheme_keys, f, True)

# Items coded under household sickness are not explicitly coded under People. For these cases, set the
# people column to 'NC'
sickness_people_key = "Household_Sickness (Text) - wt_practice_coded_people"
household_sickness_key = "Household_Sickness (Text) - wt_practice_coded"
for td in surveys:
    if td.get(household_sickness_key) is not None and td.get(sickness_people_key) is None:
        # Fixed: the third Metadata argument was previously `user`, but every
        # sibling call passes a creation timestamp (time.time()) here.
        td.append_data({sickness_people_key: "NC"},
                       Metadata(user, Metadata.get_call_location(), time.time()))

# Import Trustworthy Advisors using matrix imports
coda_file_path = path.join(coded_input_path, "Trustworthy_Advisors_coded.csv")
# Write json output.
# Fixed: was `is not ""`, an identity comparison against a literal (emits a
# SyntaxWarning on CPython >= 3.8 and is semantically wrong); `!=` is correct.
if os.path.dirname(json_output_path) != "" and not os.path.exists(
        os.path.dirname(json_output_path)):
    os.makedirs(os.path.dirname(json_output_path))

with open(json_output_path, "w") as f:
    TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True)

if coding_mode == "coda":
    # Write Coda output
    if not os.path.exists(coded_output_directory):
        os.makedirs(coded_output_directory)

    with open(path.join(coded_output_directory, "gender.csv"), "w") as f:
        TracedDataCodaIO.export_traced_data_iterable_to_coda_with_scheme(
            data, "GENDER_R", "GENDER_R_clean", "Gender", f)

    with open(path.join(coded_output_directory, "age.csv"), "w") as f:
        TracedDataCodaIO.export_traced_data_iterable_to_coda_with_scheme(
            data, "AGE_R", "AGE_R_clean", "Age", f)

    # ATA uses LOCATION/NATIONALITY whereas BIBLIA uses LOCATION 1/LOCATION 2
    if len(data) > 0 and "LOCATION_R" in data[0]:
        with open(path.join(coded_output_directory, "location.csv"), "w") as f:
            TracedDataCodaIO.export_traced_data_iterable_to_coda(
                data, "LOCATION_R", f)

        with open(path.join(coded_output_directory, "nationality.csv"), "w") as f:
            TracedDataCodaIO.export_traced_data_iterable_to_coda(
                data, "NATIONALITY_R", f)
elif show_number == 5: key_of_raw = "S06E05_Water_Quality (Text) - wt_s06e05_activation" key_of_coded_prefix = "{}_coded_".format(key_of_raw) coda_yes_no_scheme = "Relevance" coda_reason_schemes = {"reason 2", "reason 3"} else: assert False, "Unrecognised show_number '{}'. Must be a number from 1-5 inclusive.".format( show_number) # Merge yes/no responses from the manually coded Coda files into the cleaned dataset if coda_yes_no_scheme is not None: with open(coded_input_path, "r") as f: TracedDataCodaIO.import_coda_to_traced_data_iterable( user, show_messages, key_of_raw, {coda_yes_no_scheme: "{}yes_no".format(key_of_coded_prefix)}, f, overwrite_existing_codes=True) # Merge matrix data from the manually coded Coda files into the cleaned dataset with open(coded_input_path, "r") as f: TracedDataCodaIO.import_coda_to_traced_data_iterable_as_matrix( user, show_messages, key_of_raw, coda_reason_schemes, f, key_of_coded_prefix=key_of_coded_prefix) # Write coded data back out to disk
os.makedirs(os.path.dirname(csv_output_path)) with open(csv_output_path, "w") as f: TracedDataCSVIO.export_traced_data_iterable_to_csv( data, f, headers=[ "avf_phone_id", "{} (Run ID) - {}".format(variable_name, flow_name), "{} (Time) - {}".format(variable_name, flow_name), "{} (Text) - {}".format(variable_name, flow_name) ]) # Output messages to Coda IOUtils.ensure_dirs_exist_for_file(coda_output_path) with open(coda_output_path, "w") as f: TracedDataCodaIO.export_traced_data_iterable_to_coda( data, "{} (Text) - {}".format(variable_name, flow_name), f) # Get 200 non-noise messages and output to CSVs for ICR. print("Noise items:") show_message_key = "{} (Text) - {}".format(variable_name, flow_name) not_noise = [] for td in data: if somali.DemographicCleaner.is_noise(td[show_message_key]): print(td[show_message_key]) else: not_noise.append(td) # Take 200 items pseudo-randomly for ICR random.seed(0) random.shuffle(not_noise) icr_messages = not_noise[:200]
# Write json output.
# Fixed: was `is not ""`, an identity comparison against a literal (emits a
# SyntaxWarning on CPython >= 3.8 and is semantically wrong); `!=` is correct.
if os.path.dirname(json_output_path) != "" and not os.path.exists(
        os.path.dirname(json_output_path)):
    os.makedirs(os.path.dirname(json_output_path))

with open(json_output_path, "w") as f:
    TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True)

# Output for manual verification + coding
if coding_mode == "coda":
    # Write Coda output
    if not os.path.exists(coded_output_path):
        os.makedirs(coded_output_path)

    with open(path.join(coded_output_path, "{}.csv".format(key_of_raw)), "w") as f:
        TracedDataCodaIO.export_traced_data_iterable_to_coda(
            data, key_of_raw, f)
else:
    assert coding_mode == "coding-csv", "coding_mode was not one of 'coda' or 'coding-csv'"

    # Write Coding CSV output
    if not os.path.exists(coded_output_path):
        os.makedirs(coded_output_path)

    with open(path.join(coded_output_path, "{}.csv".format(key_of_raw)), "w") as f:
        TracedDataCodingCSVIO.export_traced_data_iterable_to_coding_csv(
            data, key_of_raw, f)
time.time())) # Write json output IOUtils.ensure_dirs_exist_for_file(json_output_path) with open(json_output_path, "w") as f: TracedDataJsonIO.export_traced_data_iterable_to_json(all_survey_data, f, pretty_print=True) # Output for manual verification + coding IOUtils.ensure_dirs_exist(coded_output_path) # TODO: Tidy up the usage of keys here once the format of the keys has been updated. for key in cleaning_plan.keys(): coded_output_file_path = path.join(coded_output_path, "{}.csv".format(key.split(" ")[0])) prev_coded_output_file_path = path.join( prev_coded_path, "{}_coded.csv".format(key.split(" ")[0])) if os.path.exists(prev_coded_output_file_path): with open(coded_output_file_path, "w") as f, open(prev_coded_output_file_path, "r") as prev_f: TracedDataCodaIO.export_traced_data_iterable_to_coda_with_scheme( all_survey_data, key, {key.split(" ")[0]: "{}_clean".format(key)}, f, prev_f) else: with open(coded_output_file_path, "w") as f: TracedDataCodaIO.export_traced_data_iterable_to_coda_with_scheme( all_survey_data, key, {key.split(" ")[0]: "{}_clean".format(key)}, f)
# Appending _clean follows AVF practice in Dreams
gender_col_clean = "{}_clean".format(gender_col)

# Load data from JSON file
with open(input_path, "r") as f:
    data = TracedDataJsonIO.import_json_to_traced_data_iterable(f)

# Clean data: derive the cleaned gender value for every item.
for td in data:
    cleaned = DemographicCleaner.clean_gender(td[gender_col])
    td.append_data({gender_col_clean: cleaned},
                   Metadata(user, Metadata.get_call_location(), time.time()))

# Write json output.
# Fixed: was `is not ""`, an identity comparison against a literal (emits a
# SyntaxWarning on CPython >= 3.8 and is semantically wrong); `!=` is correct.
if os.path.dirname(json_output_path) != "" and not os.path.exists(
        os.path.dirname(json_output_path)):
    os.makedirs(os.path.dirname(json_output_path))

with open(json_output_path, "w") as f:
    TracedDataJsonIO.export_traced_data_iterable_to_json(data, f, pretty_print=True)

# Write Coda output (same `is not ""` -> `!=` fix as above).
if os.path.dirname(coda_output_path) != "" and not os.path.exists(
        os.path.dirname(coda_output_path)):
    os.makedirs(os.path.dirname(coda_output_path))

with open(coda_output_path, "w") as f:
    TracedDataCodaIO.export_traced_data_iterable_to_coda(
        data, gender_col, f, exclude_coded_with_key=gender_col_clean)