示例#1
0
    def _completion_handler(self, status, message=''):
        """Completion callback invoked when the worker thread finishes.

        Restores the UI to its idle state, reports the final status to
        the user, and wakes any thread waiting on the completion
        condition.

        :type status: int
        :param status: a workerthread.CompletionStatus value
            (ABORT / DONE / FAIL).
        :param message: unused here; kept for callback signature
            compatibility.
        :return: None
        """
        self.running = False

        if not self._want_quit:
            style = ''
            status_message = ''
            if status == workerthread.CompletionStatus.ABORT:
                status_message = 'Computation successfully aborted. '
            elif status == workerthread.CompletionStatus.DONE:
                # Show the user the results on success (and on failure
                # below, where partial output may still be useful).
                open_folder(self.output_folder_path)
                status_message = 'Processing complete. '
            elif status == workerthread.CompletionStatus.FAIL:
                open_folder(self.output_folder_path)
                status_message = 'Processing failed. '
                style = 'error'
                # NOTE(review): `style` is assigned but not used below —
                # confirm whether status_logger was meant to consume it.

            # int(not status) fills both progress bars only when status
            # is falsy; assumes CompletionStatus.DONE == 0 — TODO confirm.
            status_notifier.update({
                'progress': (int(not status), 1),
                'sub_progress': (int(not status), 1)
            })
            self.action_button.Enable(True)
            self.action_button.SetLabel('Start')
            self.enable_ui(True)
            status_logger.info(status_message)

            self.StatusBar.SetStatusText('')

        # Wake any thread blocked on completion; despite the `_lock` name
        # this is presumably a threading.Condition (notifyAll) — verify.
        with self._completion_lock:
            self._completion_lock.notifyAll()
示例#2
0
    def run(self):
        """Apply the age group's logic rules to every input row.

        Reads the prepped input file, tags each row with the cause id of
        the first rule whose logic matches, writes the tagged matrix to
        the output file, and returns it.

        Returns:
            list: the processed data matrix.
        """
        super(RulesPrep, self).run()

        status_logger.info('{} :: Processing rules data'.format(
            self.AGE_GROUP.capitalize()))
        status_notifier.update({'progress': 1})

        headers, matrix = DataPrep.read_input_file(self.input_file_path())
        headers.extend(ADDITIONAL_DATA.keys())

        status_notifier.update({'sub_progress': (0, len(matrix))})

        for row_num, record in enumerate(matrix):
            self.check_abort()

            status_notifier.update({'sub_progress': (row_num, )})

            self.expand_row(record, ADDITIONAL_DATA)

            # First rule that fires wins; remaining rules are skipped.
            # A rule that raises is logged and treated as not firing.
            for rule in self.rules:
                try:
                    if rule.logic_rule(record) is True:
                        record[RULES_CAUSE_NUM_KEY] = rule.CAUSE_ID
                        break
                except Exception as e:
                    warning_logger.warning(
                        'SID: {} rule `{}` failed complete: {}'.format(
                            record['sid'], rule, e.message))

        status_notifier.update({'sub_progress': None})

        DataPrep.write_output_file(headers, matrix, self.output_file_path())

        return matrix
    def _read_graph_data(self):
        """Collect CSMF values to graph, keyed by module and module-sex.

        Reads each module's CSMF csv from the input directory, plus the
        sex-stratified variants, into a mapping of
        {module_key: {cause: fraction}}.  Missing or unreadable files are
        silently skipped.

        :return: defaultdict of unsorted graph data indexed by module key.
        """
        super(CSMFGrapher, self)._read_graph_data()
        # build ordered dict for values to be graphed. indexed by module
        graph_data_unsorted = defaultdict(dict)

        status_notifier.update({'sub_progress': (0, len(MODULE_LABELS))})

        for cnt, module_key in enumerate(MODULE_LABELS):
            status_notifier.update({'sub_progress': (cnt, )})

            try:
                with open(
                        os.path.join(
                            self.input_dir_path,
                            INPUT_FILENAME_TEMPLATE.format(module_key)),
                        'rb') as f:
                    reader = csv.DictReader(f)

                    for row in reader:
                        self.check_abort()

                        cause_key = row['cause'].rstrip()
                        cause_fraction = row['CSMF']

                        graph_data_unsorted[module_key][cause_key] = float(
                            cause_fraction)

            except IOError:
                # The file isn't there, there was no data or an error, so just skip it
                # NOTE(review): this `continue` also skips the per-sex
                # files below for this module — confirm that is intended.
                continue

            # Also read the sex-stratified CSMF files, keyed
            # '<module>-<sex>'; absent files are skipped per sex.
            for sex in ('male', 'female'):
                try:
                    key = '-'.join([module_key, sex])
                    filename = os.path.join(self.input_dir_path,
                                            '{:s}-csmf.csv'.format(key))
                    with open(filename, 'rb') as f:
                        for row in csv.DictReader(f):
                            self.check_abort()

                            cause_key = row['cause'].rstrip()
                            cause_fraction = row['CSMF']

                            graph_data_unsorted[key][cause_key] = float(
                                cause_fraction)
                except IOError:
                    continue

        return graph_data_unsorted
示例#4
0
    def generate_cause_rankings(self, scored, uniform_scores):
        """Determine rank for each cause.

        Each scored VA is ranked, per cause, against the score
        distribution from the uniformly-resampled validation data. Ties
        receive the average of the tied positions. A score above every
        training observation ranks 0.5; a score below every training
        observation ranks len(training) + 0.5.

        The VAs in `scored` are modified in place; nothing is returned.

        Args:
            scored (list): list of ScoredVAs from user data
            uniform_scores (dict of lists): sorted distribution of scores
                by cause from uniform training data
        """
        status_notifier.update({'sub_progress': (0, len(scored))})

        for position, va in enumerate(scored):
            status_notifier.update({'sub_progress': (position, )})

            for cause in self.cause_list:
                self.check_abort()

                dist = uniform_scores[cause]
                score = va.scores[cause]

                # bisect gives the count of training scores strictly
                # below (lo) and at-or-below (hi) this VA's score.
                lo = bisect_left(dist, score)
                hi = bisect_right(dist, score)
                avg_rank = len(dist) - lo + (lo - hi) / 2.

                va.ranks[cause] = avg_rank + .5

        status_notifier.update({'sub_progress': None})
示例#5
0
    def score_symptom_data(self, symptom_data, tariffs):
        """Score symptom data using a tariffs matrix.

        Args:
            symptom_data (list of dict): symptom data from a csv.DictReader
            tariffs (dict of lists): processed tariffs by cause

        Returns:
            list: List of Scored VAs.
        """
        scored_vas = []

        status_notifier.update({'sub_progress': (0, len(symptom_data))})

        for row_num, row in enumerate(symptom_data):
            self.check_abort()

            status_notifier.update({'sub_progress': (row_num, )})

            va = self.score_row(row, tariffs)

            # Carry along censoring and logic-rule results from the
            # earlier pipeline steps (absent values become defaults).
            va.censored = map(safe_int, row.get('restricted', '').split())
            va.rules = safe_int(row.get(RULES_CAUSE_NUM_KEY))

            scored_vas.append(va)

        status_notifier.update({'sub_progress': None})

        return scored_vas
示例#6
0
    def _read_graph_data(self):
        """Tally deaths by cause, sex, and age group for graphing.

        Reads each module's prediction csv from the input directory and
        counts rows into a nested mapping {cause34: {sex: {age: count}}},
        plus an aggregate 'All' key. Rows whose age or sex cannot be
        interpreted are logged and skipped; missing module files are
        skipped entirely.

        Returns:
            defaultdict: nested counts keyed cause -> sex -> age group.
        """
        graph_data = defaultdict(get_default_dict)
        status_notifier.update({'sub_progress': (0, len(MODULE_LABELS))})
        for cnt, module_key in enumerate(MODULE_LABELS):
            status_notifier.update({'sub_progress': (cnt, )})

            try:
                with open(
                        os.path.join(
                            self.input_dir_path,
                            INPUT_FILENAME_TEMPLATE.format(module_key)),
                        'rb') as f:
                    reader = csv.DictReader(f)

                    for row in reader:
                        self.check_abort()

                        try:
                            age_key = get_age_key(float(row['age']))
                            if age_key not in AGE_DATA.values():
                                raise ValueError('Unknown age group.')
                            sex_key = int(row['sex'])
                            if sex_key not in [1, 2]:
                                raise ValueError(
                                    'Cannot yet plot when sex is not M/F')
                        except ValueError as e:
                            # Age or sex is invalid. Log warning and skip this item.
                            # Fixed: the format string previously had only one
                            # placeholder, so the error detail was silently
                            # dropped from the log message.
                            warning_logger.warning(
                                'Cause Grapher :: SID {} value for age or sex is invalid: {}'
                                .format(row['sid'], e.message))
                            continue

                        graph_data[row['cause34']][sex_key][age_key] += 1
                        graph_data['All'][sex_key][age_key] += 1

            except IOError:
                # The file isn't there, there was no data or an error, so just skip it.
                continue

        return graph_data
示例#7
0
    def run(self):
        """Map a WHO 2016 questionnaire export onto the PHMRC layout.

        Builds the output header set from the data module's mapping
        tables, applies every per-row conversion in order, and writes the
        converted matrix to the output file.
        """
        super(WHOPrep, self).run()

        status_logger.info('Mapping WHO Questionnaire')
        status_notifier.update({'progress': 1})

        # Assemble the full output header set from the mapping tables.
        module = self.data_module
        headers = set(module.ADDITIONAL_HEADERS)
        for simple in (module.YES_NO_QUESTIONS,
                       module.RENAME_QUESTIONS,
                       module.REVERSE_ONE_HOT_MULTISELECT,
                       module.ONE_HOT_FROM_MULTISELECT,
                       module.UNIT_IF_AMOUNT):
            headers.update(simple)
        headers.update(h for h, _ in module.RECODE_QUESTIONS)
        headers.update(h for h, _ in module.RECODE_MULTISELECT)
        for unit_col, value_col, _ in module.DURATION_CONVERSIONS:
            headers.update((unit_col, value_col))

        _, matrix = DataPrep.read_input_file(self.input_file_path())

        status_notifier.update({'sub_progress': (0, len(matrix))})

        # Per-row conversions, applied in this exact order.
        transformations = (
            self.determine_consent,
            self.calculate_age,
            self.recode_yes_no_questions,
            self.recode_categoricals,
            self.rename_questions,
            self.reverse_one_hot_multiselect,
            self.recode_multiselects,
            self.encode_one_hot_from_multiselect,
            self.map_units_from_values,
            self.convert_durations,
            self.map_adult_chest_pain_duration,
            self.map_child_illness_duration,
            self.map_neonate_first_cry,
            self.map_child_unconsciousness_start,
            self.map_neonate_delivery_type,
            self.map_child_birth_size,
            self.map_redundant_child_age_data,
        )

        for row_num, row in enumerate(matrix):
            self.check_abort()

            status_notifier.update({'sub_progress': (row_num,)})

            for transform in transformations:
                transform(row)

        status_notifier.update({'sub_progress': None})

        DataPrep.write_output_file(sorted(headers), matrix,
                                   self.output_file_path(None))
示例#8
0
    def _make_graphs(self, graph_data):
        """Render one cause-of-death graph per entry in `graph_data`."""
        # Make cause of death graphs.
        status_notifier.update({'sub_progress': (0, len(graph_data))})

        for position, (cause_key, data) in enumerate(graph_data.items()):
            self.check_abort()

            status_notifier.update({'sub_progress': (position, )})

            make_graph(data, cause_key, self.output_dir_path)

        status_notifier.update({'sub_progress': None})
    def _make_graphs(self, graph_data_unsorted):
        """Render one CSMF graph per module, causes ordered by fraction."""
        super(CSMFGrapher, self)._make_graphs(graph_data_unsorted)
        # Make csmf graphs.
        status_notifier.update({'sub_progress': (0, len(graph_data_unsorted))})

        for position, (module_key, data) in enumerate(
                graph_data_unsorted.items()):
            self.check_abort()

            status_notifier.update({'sub_progress': (position, )})

            # Plot causes from largest to smallest cause fraction.
            ordered = OrderedDict(
                sorted(data.iteritems(), key=lambda item: item[1],
                       reverse=True))
            make_graph(ordered, module_key, self.output_dir_path)

        status_notifier.update({'sub_progress': None})
示例#10
0
 def _update_status(self):
     """Report the start of cause-graph generation to log and UI."""
     status_logger.info('Making cause graphs')
     status_notifier.update({'progress': 1})
 def _update_status(self):
     """Report the start of CSMF-graph generation to log and UI."""
     super(CSMFGrapher, self)._update_status()
     status_logger.info('Making CSMF graphs')
     status_notifier.update({'progress': 1})
示例#12
0
    def process_training_data(self, train, tariffs, frequencies, cutoff_pos,
                              thresholds):
        """Process the training data.

        The validated data is expanded so that the cause distribution across
        all the observation is uniformly distributed across the causes. The
        sampling frequencies are determined elsewhere and stored in the data
        module.

        Cause-specific cutoffs are calculated as the rank value at the given
        cutoff percentile of the subset of observations whose gold standards
        is the given cause. While the data are sorted also store the
        distribution of scores by cause. This is used to rank the user data.

        Args:
            train (list): List of validated ScoredVAs.
            tariffs (dict of lists): processed tariffs by cause.
            frequencies (dict): Map of validated sid to frequency.
            cutoff_pos (float): Percentile cutoff from 0 to 1.
            thresholds (list): percentiles (0 to 1) at which likelihood
                boundary rank values are recorded.

        Returns:
            tuple:
                list: validated VAs with uniform cause distribution.
                dict: ordered scores by cause for all VAs in the training data
                dict: mapping of ranks of VAs in the training data for which
                    the gold standard is the given cause
                dict: Cutoff score for each cause.
                dict: likelihood boundary rank values by cause.
        """
        uniform_train = []

        status_notifier.update({'sub_progress': (0, 1)})

        for index, row in enumerate(train):
            self.check_abort()

            # Assume half the processing time is scoring/expanding
            # Fill half the status bar based on the number of rows.
            # 2.0 forces float division so the fraction is not truncated
            # to zero under classic (Python 2) integer division.
            status_notifier.update(
                {'sub_progress': ((index / 2.0) / len(train), )})

            va = self.score_row(row, tariffs)
            va.cause = row.get('va46')
            uniform_train.extend([va] * frequencies[va.sid])

        scores = {}
        ranks = {}
        cutoffs = {}
        likelihoods = {}

        n_causes = len(self.cause_list)
        n_uniform = len(uniform_train)
        overall_cutoff = n_uniform * self.data_module.CUTOFF_POS

        for index, cause in enumerate(self.cause_list):
            self.check_abort()

            # Assume half the processing time is sorting/ranking
            # Start at 50% and updated in even increments for each cause
            status_notifier.update(
                {'sub_progress': (.5 + (index / 2.0) / n_causes, )})

            # Get the uniform training data sorted by (reversed) score and
            # sid. Sorting by sid ensures the ranks are stable between row
            # which have the score but different gold standard causes.
            def sorter(va):
                return -va.scores[cause], va.sid

            uniform_sorted = sorted(uniform_train, key=sorter)

            # Store the scores from the distribution sorted from low to high
            scores[cause] = [va.scores[cause] for va in uniform_sorted][::-1]

            # Determine the rank within the complete uniform training data
            # of the subset of VAs whose gold standard cause is the cause
            # by which the VAs are ranked.
            # BUG FIX: this list previously rebound the local name `ranks`,
            # clobbering the dict returned to the caller; the per-cause
            # list now has its own name.
            cause_ranks = [(i + 1) for i, va in enumerate(uniform_sorted)
                           if int(va.cause) == cause]
            n_ranks = len(cause_ranks)
            ranks[cause] = cause_ranks

            # Find the index of the item at cutoff position.
            cutoffs[cause] = cause_ranks[int(n_ranks * cutoff_pos)]

            # Find the rank value at each threshold value
            like = [0]
            like.extend([cause_ranks[int(n_ranks * thre)]
                         for thre in thresholds])
            like.append(min([cutoffs[cause], overall_cutoff]))
            like.append(n_uniform)
            likelihoods[cause] = like

        status_notifier.update({'sub_progress': None})

        return uniform_train, scores, ranks, cutoffs, likelihoods
示例#13
0
    def run(self):
        """Run the full tariff analysis for this age group.

        Builds the tariff matrix, trains rank distributions from the
        validated data, scores and ranks the user data, predicts causes,
        and writes prediction, CSMF, and intermediate output files.

        Returns:
            list: scored and ranked user data VAs.
        """
        super(TariffPrep, self).run()

        status_logger.info('{:s} :: Processing tariffs'.format(
            self.AGE_GROUP.capitalize()))
        status_notifier.update({'progress': 1})

        # Headers are being dropped only from tariff matrix now because of the
        # way we are iterating over the pruned tariff data. It is unnecessary
        # to drop headers from other matrices.
        drop_headers = {TARIFF_CAUSE_NUM_KEY}
        if not self.hce:
            drop_headers.update(self.data_module.HCE_DROP_LIST)
        if not self.free_text:
            drop_headers.update(self.data_module.FREE_TEXT)
        if self.short_form:
            drop_headers.update(self.data_module.SHORT_FORM_DROP_LIST)

        tariffs = get_tariff_matrix(self.tariffs_filename, drop_headers,
                                    self.data_module.SPURIOUS_ASSOCIATIONS)

        self.cause_list = sorted(tariffs.keys())

        validated = self.read_input_file(self.validated_filename)[1]

        status_logger.info('{:s} :: Processing validation data.'.format(
            self.AGE_GROUP.capitalize()))
        # Train against the validated data; the thresholds are the
        # quartile boundaries used for likelihood labels.
        train = self.process_training_data(validated, tariffs,
                                           self.data_module.FREQUENCIES,
                                           self.data_module.CUTOFF_POS,
                                           [.25, .5, .75])
        (uniform_train, uniform_scores, uniform_ranks, cutoffs,
         likelihoods) = train

        self.write_cutoffs(cutoffs)

        # Score the user data and rank it against the training scores.
        status_logger.info('{:s} :: Generating VA cause list.'.format(
            self.AGE_GROUP.capitalize()))
        user_data = self.read_input_file(self.input_file_path())[1]
        user_data = self.score_symptom_data(user_data, tariffs)

        status_logger.info('{:s} :: Generating cause rankings.'.format(
            self.AGE_GROUP.capitalize()))
        self.generate_cause_rankings(user_data, uniform_scores)

        self.write_intermediate_file(user_data, 'external-ranks', 'ranks')

        # A VA scoring below everything in training ranks just past the
        # end of the training data.
        lowest_rank = len(uniform_train) + 0.5

        self.mask_ranks(user_data, len(uniform_train), cutoffs,
                        self.data_module.CAUSE_CONDITIONS, lowest_rank,
                        self.data_module.UNIFORM_LIST_POS,
                        self.data_module.MIN_CAUSE_SCORE)

        self.predict(user_data, lowest_rank, self.data_module.CAUSE_REDUCTION,
                     self.data_module.CAUSES, self.data_module.CAUSES46)

        self.determine_likelihood(user_data, likelihoods,
                                  self.data_module.CAUSE_REDUCTION)

        undetermined_weights = self._get_undetermined_matrix()
        csmf, csmf_by_sex = self.calculate_csmf(user_data,
                                                undetermined_weights)

        self.write_predictions(user_data)

        # Likelihood labels, translated when a non-English language is
        # configured (translation file lives under data/<language>.json).
        likelihood_names = [
            'Very Likely', 'Likely', 'Somewhat Likely', 'Possible'
        ]
        if self.language != 'english':
            path = os.path.join(config.basedir, 'data',
                                '{}.json'.format(self.language))
            with open(path, 'rb') as f:
                translation = json.load(f)
            likelihood_names = [
                translation['likelihoods'].get(likelihood)
                for likelihood in likelihood_names
            ]
        else:
            translation = None
        colors = ['#3CB371', '#47d147', '#8ae600', '#e6e600']
        mp = self.write_multiple_predictions_xlsx(user_data, tariffs,
                                                  likelihood_names, colors,
                                                  translation)
        self.write_multiple_predictions_csv(mp)

        # Overall CSMF plus one CSMF file per sex.
        self.write_csmf(self.AGE_GROUP, csmf)
        sex_name = {1: 'male', 2: 'female'}
        for sex, csmf_data in csmf_by_sex.items():
            key = '-'.join([self.AGE_GROUP, sex_name[sex]])
            self.write_csmf(key, csmf_data)

        self.write_intermediate_file(user_data, 'tariff-scores', 'scores')

        self.write_intermediate_file(user_data, 'tariff-ranks', 'ranks')

        return user_data
示例#14
0
    def run(self):
        """Run the entire analysis pipeline.

        Cleans headers, detects the questionnaire form, then runs the
        prep/rules/symptom/tariff steps per age group, optionally the
        graphers, and finally output reorganization.  Reports completion
        status (DONE / ABORT / FAIL) via self._complete().
        """
        status_logger.info('Preparing variable headers.')
        status_notifier.update({'progress': (0, 15), 'sub_progress': None})

        intermediate_dir = intermediate_dir_path(self.output_dir_path)
        figures_dir = os.path.join(self.output_dir_path, 'figures')

        self.make_dir(intermediate_dir_path(self.output_dir_path))

        try:
            self.format_headers(self.input_file_path, os.path.join(intermediate_dir, CLEAN_HEADERS_FILENAME))
        except StopIteration:
            # File doesn't contain data
            message = 'Source file "{}" does not contain data.'.format(self.input_file_path)
            self._complete(CompletionStatus.FAIL, message)
            warning_logger.warning(message)
            return

        # Echo the analysis parameters into the user-facing report.
        report_logger.info('Analysis parameters:')
        report_logger.info('- Input file: {}'.format(self.input_file_path))
        report_logger.info('- Output folder: {}'.format(self.output_dir_path))
        report_logger.info('- Country: {}'.format(self.country))
        report_logger.info('- HIV Region: {}'.format(self.options.get('hiv', True)))
        report_logger.info('- Malaria Region: {}'.format(self.options.get('malaria', True)))
        report_logger.info('')

        # Detect which questionnaire produced the input file.
        file_path = os.path.join(intermediate_dir, CLEAN_HEADERS_FILENAME)
        who_questionnaire = self.who_questionaire_test(file_path)

        if who_questionnaire:
            # WHO 2016 data is mapped onto the short-form layout.
            self.short_form = True
            form_name = 'WHO 2016 Questionnaire'

        else:
            self.short_form = self.short_form_test(file_path)
            warning_logger.debug('Detected {} form'.format(
                'short' if self.short_form else 'standard'))
            if self.short_form:
                form_name = 'PHMRC Shortened Questionnaire'
            else:
                form_name = 'PHMRC Full Questionnaire'
        report_logger.info('Detected {}'.format(form_name))

        # Construct every pipeline stage up front so they can all be
        # registered for abort handling before any of them runs.
        who_prep = WHOPrep(self.output_dir_path)
        common_prep = CommonPrep(self.output_dir_path, self.short_form)
        adult_pre_symptom = PreSymptomPrep(adult_pre_symptom_data, self.output_dir_path, self.short_form)
        adult_rules = RulesPrep(self.output_dir_path, self.short_form, common_data.ADULT, ADULT_RULES)
        adult_symptom = SymptomPrep(adult_symptom_data, self.output_dir_path, self.short_form)
        adult_results = TariffPrep(adult_tariff_data, self.output_dir_path, self.short_form, self.options, self.country)
        child_pre_symptom = PreSymptomPrep(child_pre_symptom_data, self.output_dir_path, self.short_form)
        child_rules = RulesPrep(self.output_dir_path, self.short_form, common_data.CHILD, CHILD_RULES)
        child_symptom = SymptomPrep(child_symptom_data, self.output_dir_path, self.short_form)
        child_results = TariffPrep(child_tariff_data, self.output_dir_path, self.short_form, self.options, self.country)
        neonate_pre_symptom = PreSymptomPrep(neonate_pre_symptom_data, self.output_dir_path, self.short_form)
        neonate_rules = RulesPrep(self.output_dir_path, self.short_form, common_data.NEONATE, NEONATE_RULES)
        neonate_symptom = SymptomPrep(neonate_symptom_data, self.output_dir_path, self.short_form)
        neonate_results = TariffPrep(neonate_tariff_data, self.output_dir_path, self.short_form, self.options, self.country)
        legacy = self.options.get('legacy_format', False)
        output = OutputPrep(self.output_dir_path, reorganize=not legacy,
                            keep_orig=legacy, short_form=self.short_form,
                            free_text=self.options.get('free_text', True),
                            hce=self.options.get('hce', True))
        cause_grapher = CauseGrapher(self.output_dir_path)
        csmf_grapher = CSMFGrapher(self.output_dir_path)

        self._abort_list.extend([
            who_prep,
            common_prep,
            adult_pre_symptom,
            adult_rules,
            adult_symptom,
            adult_results,
            child_pre_symptom,
            child_rules,
            child_symptom,
            child_results,
            neonate_pre_symptom,
            neonate_rules,
            neonate_symptom,
            neonate_results,
            cause_grapher,
            csmf_grapher,
        ])

        try:
            if who_questionnaire:
                who_prep.run()

            # makes adult-prepped.csv, child-prepped.csv, neonate-prepped.csv
            adult_data, child_data, neonate_data = common_prep.run()

            if adult_data:
                # makes adult-presymptom.csv
                adult_pre_symptom.run()
                # makes adult-logic-rules.csv
                adult_rules.run()
                # makes adult-symptom.csv
                adult_symptom.run()
                # creates adult output files
                adult_results.run()

            if child_data:
                # makes child-presymptom.csv
                child_pre_symptom.run()
                # makes child-logic-rules.csv
                child_rules.run()
                # makes child-symptom.csv
                child_symptom.run()
                # creates child output files
                child_results.run()

            if neonate_data:
                # makes neonate-presymptom.csv
                neonate_pre_symptom.run()
                # makes neonate-logic-rules.csv
                neonate_rules.run()
                # makes neonate-symptom.csv
                neonate_symptom.run()
                # creates neonate output files
                neonate_results.run()

            if self.options.get('figures') and (adult_data or child_data or neonate_data):
                self.make_dir(figures_dir)
                # generate all cause graphs
                cause_grapher.run()
                # generate all csmf graphs
                csmf_grapher.run()

            output.run()

        except AbortException:
            self._complete(CompletionStatus.ABORT)
        except Exception:
            # Unexpected failure: dump the traceback for diagnosis but
            # still report FAIL so the UI recovers.
            traceback.print_exc()
            self._complete(CompletionStatus.FAIL)
        else:
            self._complete(CompletionStatus.DONE)
示例#15
0
    def run(self):
        """Convert pre-symptom data into the symptom matrix for tariffs.

        Expands generated variables, renames and prunes headers, then
        applies the per-row recode/dichotomization steps in order and
        writes the symptom file.

        Returns:
            list: the processed data matrix.
        """
        super(SymptomPrep, self).run()

        status_logger.info('{} :: Processing symptom data'.format(
            self.AGE_GROUP.capitalize()))
        status_notifier.update({'progress': 1})

        headers, matrix = DataPrep.read_input_file(self.input_file_path())

        status_notifier.update({'sub_progress': (0, len(matrix))})

        additional_data = {}
        additional_data.update(self.data_module.GENERATED_VARS_DATA)
        additional_headers, additional_values = additional_headers_and_values(
            headers, additional_data.items())

        headers.extend(additional_headers)
        self.rename_headers(headers, self.data_module.VAR_CONVERSION_MAP)

        # Keep only headers matching the module's keep pattern, minus the
        # explicit drop list.
        keep_list = [
            header for header in headers
            if re.match(self.data_module.KEEP_PATTERN, header)
        ]
        drop_list = self.data_module.DROP_LIST

        # Sort: 'sid' first, then headers whose second character is not a
        # digit, then the rest alphabetically (assumes headers have at
        # least two characters — TODO confirm).
        headers = sorted([
            header for header in headers
            if header in keep_list and header not in drop_list
        ],
                         key=lambda t: (t != 'sid', t[1].isdigit(), t))

        for index, row in enumerate(matrix):
            self.check_abort()

            status_notifier.update({'sub_progress': (index, )})

            self.expand_row(row,
                            dict(zip(additional_headers, additional_values)))
            self.rename_vars(row, self.data_module.VAR_CONVERSION_MAP)

            self.copy_variables(row, self.data_module.COPY_VARS)

            # Compute age quartiles.
            self.process_progressive_value_data(
                row, self.data_module.AGE_QUARTILE_BINARY_VARS.items())

            self.process_cutoff_data(
                row, self.data_module.DURATION_CUTOFF_DATA.items())

            self.process_injury_data(row, self.data_module.INJURY_VARS.items())

            # Dichotomize!
            self.process_binary_vars(
                row, self.data_module.BINARY_CONVERSION_MAP.items())

            # Ensure all binary variables actually ARE 0 or 1:
            self.post_process_binary_variables(row,
                                               self.data_module.BINARY_VARS)

            self.censor_causes(row, self.data_module.CENSORED_MAP)

            self.require_symptoms(row, self.data_module.REQUIRED_MAP)

        status_notifier.update({'sub_progress': None})

        DataPrep.write_output_file(headers, matrix, self.output_file_path())

        return matrix
    def run(self):
        """Perform initial processing step for preparing input data.

        Checks sids and consent, expands additional headers, normalizes
        ages, multiselects, counts, rash/weight data and free text, then
        splits rows into per-age-group matrices via save_row().

        Returns:
            tuple(bool): Tuple of bool values if VAs are present for Adult, Child, and Neonate.
        """
        super(CommonPrep, self).run()

        status_logger.info('Initial data prep')
        status_notifier.update({'progress': 1})

        headers, matrix = DataPrep.read_input_file(self.input_file_path())

        status_notifier.update({'sub_progress': (0, len(matrix))})

        # Extend the headers with additional headers and read the remaining data into the matrix
        additional_data = {k: '' for k in ADDITIONAL_HEADERS}
        if self.short_form:
            additional_data.update(SHORT_FORM_ADDITIONAL_HEADERS_DATA)
        additional_headers, additional_values = additional_headers_and_values(
            headers, additional_data.items())

        headers.extend(additional_headers)
        if 'child_1_8a' not in headers:
            headers.append('child_1_8a')

        for index, row in enumerate(matrix):
            self.check_abort()

            status_notifier.update({'sub_progress': (index, )})

            self.check_sids(row, index)

            # Rows without consent are logged and dropped from all
            # downstream processing.
            if not self.check_consent(row, CONSENT_HEADER, index):
                warning_logger.info('SID: {} Refused consent.'.format(
                    row['sid']))
                continue

            self.expand_row(row,
                            dict(zip(additional_headers, additional_values)))

            self.correct_missing_age(row)

            # Age variables must be ints before age-based routing; if any
            # are missing entirely the run cannot proceed.
            try:
                self.convert_cell_to_int(row, AGE_VARS.values())
            except KeyError as e:
                warning_logger.error('Missing age variable: {}'.format(
                    e.message))
                missing_vars = [
                    var for var in AGE_VARS.values() if var not in headers
                ]
                status_logger.info('Cannot process data without: {}'.format(
                    ', '.join(missing_vars)))
                # NOTE(review): other notifier calls pass a dict; confirm
                # the notifier accepts the bare 'abort' string here.
                status_notifier.update('abort')
                continue

            for header, mapping in BINARY_CONVERSION_MAP.items():
                self.process_multiselect_vars(row, header, mapping)

            for header in COUNT_DATA_HEADERS:
                self.process_count_data(row, header)

            self.convert_rash_data(row, RASH_DATA)

            self.convert_weight_data(row, WEIGHT_CONVERSION_DATA)

            self.convert_free_text(row, FREE_TEXT_VARS, WORD_SUBS)

            # Route the row into its age-group matrix.
            self.save_row(row, index)

        status_notifier.update({'sub_progress': None})

        self.write_data(headers, self._matrix_data)

        return bool(self._matrix_data[ADULT]), bool(
            self._matrix_data[CHILD]), bool(self._matrix_data[NEONATE])
    def run(self):
        """Convert common-prepped data into the pre-symptom matrix.

        Expands duration/generated/word-count variables, renames and
        prunes headers, then applies the per-row validation, recoding and
        free-text steps in order and writes the pre-symptom file.

        Returns:
            list: the processed data matrix.
        """
        super(PreSymptomPrep, self).run()

        status_logger.info('{} :: Processing pre-symptom data'.format(
            self.AGE_GROUP.capitalize()))
        status_notifier.update({'progress': 1})

        # Create a list of duration variables, dropping specified variables if using the short form.
        duration_vars = self.data_module.DURATION_VARS[:]
        if self.short_form:
            for var in self.data_module.DURATION_VARS_SHORT_FORM_DROP_LIST:
                duration_vars.remove(var)

        headers, matrix = DataPrep.read_input_file(self.input_file_path())

        status_notifier.update({'sub_progress': (0, len(matrix))})

        # Identify new headers and data to be included.
        additional_data = {k: '' for k in self.data_module.DURATION_VARS}
        duration_day_vars = getattr(self.data_module, 'DURATION_DAYS_VARS', [])
        additional_data.update({k: '' for k in duration_day_vars})
        additional_data.update(
            {k: 0
             for k in self.data_module.GENERATED_VARS_DATA})
        additional_data.update(
            {k: 0
             for k in sorted(self.data_module.WORDS_TO_VARS.values())})
        additional_headers, additional_values = additional_headers_and_values(
            headers, additional_data.items())

        headers.extend(additional_headers)
        self.rename_headers(headers, self.data_module.VAR_CONVERSION_MAP)

        # Make a list of headers to keep and to drop.
        keep_list = [
            header for header in headers
            if re.match(self.data_module.KEEP_PATTERN, header)
        ]
        drop_list = (['{}a'.format(header) for header in duration_vars] +
                     ['{}b'.format(header) for header in duration_vars])

        # Prune headers and sort by 'sid', then anything that doesn't contain a digit at pos 1, then general vars.
        headers = sorted(
            [
                header for header in headers
                if header in keep_list and header not in drop_list
            ],
            key=lambda t:
            (t != 'sid', t[1].isdigit(), not t.startswith('g'), t))

        for index, row in enumerate(matrix):
            self.check_abort()

            status_notifier.update({'sub_progress': (index, )})
            self.expand_row(row,
                            dict(zip(additional_headers, additional_values)))
            self.rename_vars(row, self.data_module.VAR_CONVERSION_MAP)

            self.verify_answers_for_row(row, RANGE_LIST)

            self.fix_agedays(row)

            self.calculate_age_at_death_value(row)

            self.recode_answers(row, self.data_module.RECODE_MAP)

            self.process_binary_vars(
                row, self.data_module.BINARY_CONVERSION_MAP.items())

            self.calculate_duration_vars(
                row, duration_vars,
                self.data_module.DURATION_VARS_SPECIAL_CASE)

            self.validate_days_vars(row, duration_day_vars)

            self.validate_weight_vars(row, self.data_module.WEIGHT_VARS)

            self.validate_date_vars(row, self.data_module.DATE_VARS)

            self.process_age_vars(row)

            self.convert_free_text_vars(row, self.data_module.FREE_TEXT_VARS,
                                        self.data_module.WORDS_TO_VARS)

            # Short-form answers imply canned free-text words; convert
            # them only when the corresponding flag is set on the row.
            if self.short_form:
                word_list = [
                    v for k, v in
                    self.data_module.SHORT_FORM_FREE_TEXT_CONVERSION.items()
                    if value_or_default(row.get(k)) == 1
                ]
                if word_list:
                    self.convert_free_text_words(
                        row, word_list, self.data_module.WORDS_TO_VARS)

            self.fix_rash_length(row)

            self.fix_rash_location(row)

            self.process_weight_sd_vars(
                row, getattr(self.data_module, 'EXAM_DATE_VARS', {}),
                getattr(self.data_module, 'WEIGHT_SD_DATA', {}))

            self.fill_missing_data(row, self.default_fill)

        status_notifier.update({'sub_progress': None})

        DataPrep.write_output_file(headers, matrix, self.output_file_path())

        return matrix