def test_redistribution_weights(tmpdir, tariff_data, short_form, hce):
    """Verify the shape and structure of the redistribution weights for one country.

    For every ``(age, sex)`` key returned by ``_get_undetermined_matrix`` the
    test checks that the sex code and age value are among the expected
    values for the prep's age group, that the weights cover exactly the
    reporting causes, and that the weights sum to one.
    """
    options = {
        'hce': hce,
        'free_text': True,
        'hiv': True,
        'malaria': True,
        'chinese': False,
    }
    prep = TariffPrep(tariff_data, working_dir_path=tmpdir.strpath,
                      short_form=short_form, options=options, country='USA')
    undetermined_weights = prep._get_undetermined_matrix()

    # Reporting causes: every reduced cause number mapped to its cause name.
    cause_list = {
        prep.data_module.CAUSES[cause]
        for cause in prep.data_module.CAUSE_REDUCTION.values()
    }

    for key, weights in undetermined_weights.items():
        age, sex = key
        assert sex in [1, 2, 3]

        if prep.AGE_GROUP == 'adult':
            # list(...) is required on Python 3, where a range object cannot
            # be concatenated with a list; on Python 2 it is a harmless copy.
            assert age in list(range(10, 81, 5)) + [99]
        elif prep.AGE_GROUP == 'child':
            assert age in [0, 1, 5, 10, 99]
        elif prep.AGE_GROUP == 'neonate':
            assert age in [0, 7, 99]

        # The weights must name exactly the reporting causes, no more, no less.
        assert not cause_list.symmetric_difference(weights.keys())
        assert np.allclose(sum(weights.values()), 1)
def test_redistribution_weights_no_country(tmpdir, tariff_data):
    """Without a country the undetermined matrix is expected to be empty."""
    options = {
        'hce': True,
        'free_text': True,
        'hiv': True,
        'malaria': True,
        'chinese': False,
    }
    tariff_prep = TariffPrep(
        tariff_data,
        working_dir_path=tmpdir.strpath,
        short_form=True,
        options=options,
        country=None,
    )

    matrix = tariff_prep._get_undetermined_matrix()

    assert matrix == {}
def test_redistribution_weights_for_countries(tmpdir, tariff_data, country):
    """Check the undetermined matrix for a country from the countries list.

    The matrix must be a dict and must contain the default key ``(99, 3)``
    used for invalid age-sex observations.
    """
    options = {
        'hce': True,
        'free_text': True,
        'hiv': True,
        'malaria': True,
        'chinese': False,
    }
    tariff_prep = TariffPrep(
        tariff_data,
        working_dir_path=tmpdir.strpath,
        short_form=True,
        options=options,
        country=country,
    )

    matrix = tariff_prep._get_undetermined_matrix()

    assert isinstance(matrix, dict)
    default_key = (99, 3)
    assert default_key in matrix
def prep(tmpdir):
    """Build a short-form adult ``TariffPrep`` for 'USA' working in ``tmpdir``."""
    options = {
        'hce': True,
        'free_text': True,
        'hiv': True,
        'malaria': True,
        'chinese': False,
    }
    tariff_prep = TariffPrep(
        adult_tariff_data,
        working_dir_path=tmpdir.strpath,
        short_form=True,
        options=options,
        country='USA',
    )
    return tariff_prep
def test_training_likelihood_ranges(tariff_data):
    """Check the likelihood thresholds produced from the training data.

    The likelihoods must be keyed by the causes in ``prep.cause_list`` (in
    the same order), hold the same number of entries for every cause, be
    sorted in ascending order, and end at the number of uniform training
    observations.
    """
    options = {
        'hce': True,
        'free_text': True,
        'hiv': True,
        'malaria': True,
        'chinese': False,
    }
    prep = TariffPrep(tariff_data, '/', True, options, 'USA')

    drop_headers = {'xs_name'}
    drop_headers.update(prep.data_module.SHORT_FORM_DROP_LIST)
    tariffs = get_tariff_matrix(prep.tariffs_filename, drop_headers,
                                prep.data_module.SPURIOUS_ASSOCIATIONS)
    prep.cause_list = sorted(tariffs.keys())

    validated = prep.read_input_file(prep.validated_filename)[1]
    train = prep.process_training_data(validated, tariffs,
                                       prep.data_module.FREQUENCIES,
                                       prep.data_module.CUTOFF_POS,
                                       [.25, .5, .75])
    uniform_train = train[0]
    likelihoods = train[4]

    # list(...) keeps the order-sensitive comparison working on Python 3,
    # where a dict view never compares equal to a list.
    assert list(likelihoods.keys()) == prep.cause_list

    # Every cause must carry the same number of likelihood entries.
    assert len(set(map(len, likelihoods.values()))) == 1

    for likelihood in likelihoods.values():
        assert likelihood[-1] == len(uniform_train)
        assert sorted(likelihood) == likelihood
def test_redistribution_weights_sum_to_one(tmpdir, tariff_data):
    """Check the undetermined matrix file sums to one per age/sex/country.

    The file is grouped by ``age``, ``sex`` and ``iso3``; every summed column
    must be close to one across all groups.
    """
    options = {
        'hce': True,
        'free_text': True,
        'hiv': True,
        'malaria': True,
        'chinese': False,
    }
    tariff_prep = TariffPrep(
        tariff_data,
        working_dir_path=tmpdir.strpath,
        short_form=True,
        options=options,
        country='USA',
    )

    matrix = pd.read_csv(tariff_prep.undetermined_matrix_filename)
    group_totals = matrix.groupby(['age', 'sex', 'iso3']).sum()

    # apply() hands each summed column to np.allclose(column, 1).
    column_is_one = group_totals.apply(np.allclose, args=(1, ))
    assert column_is_one.all()
def test_redistribution_causes_match_reporting_causes(tmpdir, tariff_data):
    """The causes in the undetermined matrix file must equal the reporting causes.

    Reporting causes are the ``CAUSES`` names of every value in the data
    module's ``CAUSE_REDUCTION`` mapping; file causes are read from the
    ``gs_text34`` column.
    """
    options = {
        'hce': True,
        'free_text': True,
        'hiv': True,
        'malaria': True,
        'chinese': False,
    }
    tariff_prep = TariffPrep(
        tariff_data,
        working_dir_path=tmpdir.strpath,
        short_form=True,
        options=options,
        country=None,
    )

    with open(tariff_prep.undetermined_matrix_filename) as matrix_file:
        undetermined_causes = set(
            row['gs_text34'] for row in csv.DictReader(matrix_file)
        )

    tariff_causes = set(
        tariff_prep.data_module.CAUSES[reduced]
        for reduced in tariff_prep.data_module.CAUSE_REDUCTION.values()
    )

    assert undetermined_causes == tariff_causes
def prep():
    """Build a short-form ``TariffPrep`` on the sample tariff data for 'USA'."""
    options = {
        'hce': True,
        'free_text': True,
        'hiv': True,
        'malaria': True,
        'chinese': False,
    }
    tariff_prep = TariffPrep(sample_tariff_data, '/', True, options, 'USA')
    return tariff_prep
def run(self):
    """Run the whole analysis pipeline for ``self.input_file_path``.

    Steps, in order:

    1. Write cleaned headers into the intermediate directory. If the input
       holds no data (``format_headers`` raises ``StopIteration``) the run
       is completed as FAIL and the method returns early.
    2. Log the analysis parameters and detect the questionnaire form
       (WHO 2016, PHMRC shortened or PHMRC full), setting
       ``self.short_form`` accordingly.
    3. Build every prep/grapher step and register the abortable ones in
       ``self._abort_list``.
    4. Run the steps; the pre-symptom, rules, symptom and tariff steps of an
       age group run only when the common prep reports data for that group.

    The outcome is reported through ``self._complete`` with DONE, ABORT
    (on ``AbortException``) or FAIL (on any other exception, after printing
    the traceback). Nothing is returned.
    """
    status_logger.info('Preparing variable headers.')
    status_notifier.update({'progress': (0, 15), 'sub_progress': None})

    # Compute the shared paths once and reuse them below.
    intermediate_dir = intermediate_dir_path(self.output_dir_path)
    figures_dir = os.path.join(self.output_dir_path, 'figures')
    clean_headers_path = os.path.join(intermediate_dir, CLEAN_HEADERS_FILENAME)

    self.make_dir(intermediate_dir)

    try:
        self.format_headers(self.input_file_path, clean_headers_path)
    except StopIteration:
        # File doesn't contain data
        message = 'Source file "{}" does not contain data.'.format(self.input_file_path)
        self._complete(CompletionStatus.FAIL, message)
        warning_logger.warning(message)
        return

    report_logger.info('Analysis parameters:')
    report_logger.info('- Input file: {}'.format(self.input_file_path))
    report_logger.info('- Output folder: {}'.format(self.output_dir_path))
    report_logger.info('- Country: {}'.format(self.country))
    report_logger.info('- HIV Region: {}'.format(self.options.get('hiv', True)))
    report_logger.info('- Malaria Region: {}'.format(self.options.get('malaria', True)))
    report_logger.info('')

    who_questionnaire = self.who_questionaire_test(clean_headers_path)

    if who_questionnaire:
        # The WHO questionnaire is always processed as a short form.
        self.short_form = True
        form_name = 'WHO 2016 Questionnaire'
    else:
        self.short_form = self.short_form_test(clean_headers_path)
        warning_logger.debug('Detected {} form'.format(
            'short' if self.short_form else 'standard'))
        if self.short_form:
            form_name = 'PHMRC Shortened Questionnaire'
        else:
            form_name = 'PHMRC Full Questionnaire'
    report_logger.info('Detected {}'.format(form_name))

    who_prep = WHOPrep(self.output_dir_path)
    common_prep = CommonPrep(self.output_dir_path, self.short_form)

    # One row per age group, in the order adult, child, neonate:
    # (pre-symptom data, age group constant, rules, symptom data, tariff data)
    age_group_modules = (
        (adult_pre_symptom_data, common_data.ADULT, ADULT_RULES,
         adult_symptom_data, adult_tariff_data),
        (child_pre_symptom_data, common_data.CHILD, CHILD_RULES,
         child_symptom_data, child_tariff_data),
        (neonate_pre_symptom_data, common_data.NEONATE, NEONATE_RULES,
         neonate_symptom_data, neonate_tariff_data),
    )

    # Each pipeline lists its steps in run order: pre-symptom, logic rules,
    # symptom, then the tariff step that creates the output files.
    pipelines = []
    for pre_symptom_data, age_group, rules, symptom_data, tariff_data in age_group_modules:
        pipelines.append([
            PreSymptomPrep(pre_symptom_data, self.output_dir_path, self.short_form),
            RulesPrep(self.output_dir_path, self.short_form, age_group, rules),
            SymptomPrep(symptom_data, self.output_dir_path, self.short_form),
            TariffPrep(tariff_data, self.output_dir_path, self.short_form,
                       self.options, self.country),
        ])

    legacy = self.options.get('legacy_format', False)
    output = OutputPrep(self.output_dir_path, reorganize=not legacy,
                        keep_orig=legacy, short_form=self.short_form,
                        free_text=self.options.get('free_text', True),
                        hce=self.options.get('hce', True))
    cause_grapher = CauseGrapher(self.output_dir_path)
    csmf_grapher = CSMFGrapher(self.output_dir_path)

    # Register the abortable steps with a single extend(), in the same
    # order as before. `output` is not registered, matching prior behavior.
    abortable = [who_prep, common_prep]
    for steps in pipelines:
        abortable.extend(steps)
    abortable.extend([cause_grapher, csmf_grapher])
    self._abort_list.extend(abortable)

    try:
        if who_questionnaire:
            who_prep.run()

        # makes adult-prepped.csv, child-prepped.csv, neonate-prepped.csv
        adult_data, child_data, neonate_data = common_prep.run()
        data_flags = (adult_data, child_data, neonate_data)

        # Run an age group's pipeline only when the common prep found data
        # for that group.
        for has_data, steps in zip(data_flags, pipelines):
            if has_data:
                for step in steps:
                    step.run()

        if self.options.get('figures') and any(data_flags):
            self.make_dir(figures_dir)
            # generate all cause graphs
            cause_grapher.run()
            # generate all csmf graphs
            csmf_grapher.run()

        output.run()
    except AbortException:
        self._complete(CompletionStatus.ABORT)
    except Exception:
        # Top-level boundary: report the failure instead of propagating.
        traceback.print_exc()
        self._complete(CompletionStatus.FAIL)
    else:
        self._complete(CompletionStatus.DONE)