Example #1
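Runs a stacked (multi-layer) cross-validation experiment: each layer trains one binary classifier per code, and every layer after the first also receives the previous layer's per-code predictions as extra input features. Python 2 code; get_data, get_vector_space, create_classifier and the other self.* hooks are assumed to be defined on the enclosing class, and the module is assumed to import logging, numpy as np and collections.defaultdict alongside the project's Settings, WordTokenizer, NumberStrategy, MatrixHelper and Metrics helpers.
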
    def RunStacked(self, results_file, cv_folds = 10, min_word_count = 5,
                   stem = True, lemmatize = False, remove_stop_words = True, layers = 2):

        #SETTINGS
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        print "Results filename: " + results_file
        settings = Settings.Settings()

        results_dir = settings.results_directory + self.sub_dir() + "\\"

        fName = results_dir + results_file

        #TOKENIZE
        data = self.get_data(settings)
        tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=min_word_count, stem=stem,
                                                lemmatize=lemmatize, remove_stop_words=remove_stop_words,
                                                spelling_correct=True, number_fn=NumberStrategy.collapse_num)

        empty_ixs = set([i for i, doc in enumerate(tokenized_docs) if len(doc) < StackedExperimentRunner.__MIN_DOC_LENGTH__])
        tokenized_docs = [t for i, t in enumerate(tokenized_docs) if i not in empty_ixs]

        #TRAINING DATA
        #TODO Make this one call from docs -> td
        (distance_matrix, id2word) = self.get_vector_space(tokenized_docs)
        xs = self.get_training_data(distance_matrix, id2word)

        matrix_mapper = self.matrix_value_mapper()
        if matrix_mapper:
            xs = MatrixHelper.map_matrix(matrix_mapper, xs)

        all_results = self.get_params() + "\n"
        print all_results,

        MIN_CODE_COUNT = 3

        codes = set(self.get_codes(data.sm_codes))
        label_mapper = self.label_mapper()

        # Stop logging now
        logging.disable(logging.INFO)

        xs = ensure_np_array(xs)
        edges = cross_validation_edges(len(xs), cv_folds)
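        # Assumption: edges holds one contiguous (left, right) boundary pair per
        # fold, so fold i validates on xs[l:r] and trains on the remainder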

        ys_by_code = {}
        positive_count_by_code = {}
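        # Pre-compute the labels for every code, dropping codes with too few
        # positive examples to train on (iterate over a copy so removal is safe)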
        for code in codes.copy():
            ys = self.get_ys(code, data, empty_ixs, label_mapper, xs)
            ys_by_code[code] = ys

            positive_count = len([item for item in ys if item == 1])
            positive_count_by_code[code] = positive_count

            if positive_count < MIN_CODE_COUNT:
                codes.remove(code)

        dct_td_predictions_by_fold = {}
        dct_vd_predictions_by_fold = {}
        dct_actual_by_fold = {}
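        # Per-fold prediction caches; the next layer reads these back and
        # appends them to its inputs as stacked features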

        for layer in range(layers):

            print("Layer: {0}".format(layer))
            vd_metrics_for_layer, td_metrics_for_layer = [], []

            vd_metrics_by_code = defaultdict(list)
            td_metrics_by_code = defaultdict(list)

            for fold in range(cv_folds):

                l, r = edges[fold]

                # Note: these are numpy arrays and cannot be treated as plain lists
                td_x = np.concatenate((xs[:l], xs[r:]))
                vd_x = xs[l:r]

                if layer > 0:
                    # Stacking: from the second layer onwards, append the previous
                    # layer's per-code predictions as additional input features
                    lst_td_preds = self.__extract_predictions__(codes, dct_td_predictions_by_fold[fold], td_x)
                    td_x = np.concatenate((td_x, np.array(lst_td_preds)), 1)

                    lst_vd_preds = self.__extract_predictions__(codes, dct_vd_predictions_by_fold[fold], vd_x)
                    vd_x = np.concatenate((vd_x, np.array(lst_vd_preds)), 1)

                dct_td_predictions_per_code = {}
                dct_vd_predictions_per_code = {}
                dct_actual_per_code = {}

                dct_td_predictions_by_fold[fold] = dct_td_predictions_per_code
                dct_vd_predictions_by_fold[fold] = dct_vd_predictions_per_code
                dct_actual_by_fold[fold] = dct_actual_per_code

                class_value = self.get_class_value()

                for code in codes:

                    total_codes = positive_count_by_code[code]

                    ys = ys_by_code[code]
                    td_y = np.concatenate((ys[:l], ys[r:]))
                    vd_y = ys[l:r]

                    # Degenerate case: only one class present in the training fold,
                    # so skip training and predict that class everywhere
                    if min(td_y) == max(td_y):
                        val = td_y[0]
                        td_predictions = np.array([val for y in td_y])
                        vd_predictions = np.array([val for y in vd_y])
                    else:
                        create_classifier_func = self.create_classifier(code)
                        classify_func = self.classify()

                        classifier = create_classifier_func(td_x, td_y)
                        td_predictions = classify_func(classifier, td_x)
                        vd_predictions = classify_func(classifier, vd_x)

                    dct_td_predictions_per_code[code] = td_predictions
                    dct_vd_predictions_per_code[code] = vd_predictions
                    dct_actual_per_code[code] = td_y

                    td_r, td_p, td_f1, td_a = Metrics.rpf1a(td_y, td_predictions, class_value=class_value)
                    vd_r, vd_p, vd_f1, vd_a = Metrics.rpf1a(vd_y, vd_predictions, class_value=class_value)

                    vd_metric, td_metric = self.rpfa(vd_r, vd_p, vd_f1, vd_a, total_codes), \
                                           self.rpfa(td_r, td_p, td_f1, td_a, total_codes)

                    vd_metrics_for_layer.append(vd_metric)
                    td_metrics_for_layer.append(td_metric)

                    vd_metrics_by_code[code].append(vd_metric)
                    td_metrics_by_code[code].append(td_metric)

                pass # End for code in codes

            pass # End for fold in folds

            for code in sorted(codes):
                positive_count = positive_count_by_code[code]
                vd_metric, td_metric = self.mean_rpfa(vd_metrics_by_code[code]), self.mean_rpfa(td_metrics_by_code[code])

                results = "Code: {0} Count: {1} VD[ {2} ]\tTD[ {3} ]\n".format(code.ljust(7), str(positive_count).rjust(4),
                                                                               vd_metric.to_str(), td_metric.to_str())
                print results,

            mean_vd_metrics, mean_td_metrics = self.mean_rpfa(vd_metrics_for_layer), self.mean_rpfa(td_metrics_for_layer)
            wt_mean_vd_metrics, wt_mean_td_metrics = self.weighted_mean_rpfa(vd_metrics_for_layer), self.weighted_mean_rpfa(
                td_metrics_for_layer)

            aggregate_results = "\n"
            aggregate_results += "VALIDATION DATA -\n"
            aggregate_results += "\tMEAN\n\t\t {0}\n".format(mean_vd_metrics.to_str(True))
            aggregate_results += "\tWEIGHTED MEAN\n\t\t {0}\n".format(wt_mean_vd_metrics.to_str(True))

            aggregate_results += "\n"
            aggregate_results += "TRAINING DATA -\n"
            aggregate_results += "\tMEAN\n\t\t {0}\n".format(mean_td_metrics.to_str(True))
            aggregate_results += "\tWEIGHTED MEAN\n\t\t {0}\n".format(wt_mean_td_metrics.to_str(True))

            print aggregate_results
            all_results += aggregate_results
            pass # End for layer in layers

        """ Dump results to file in case of crash """

        #DUMP TO FILE
        """
        print "Writing results to: " + fName
        handle = open(fName, mode="w+")
        handle.write(all_results)
        handle.close()
        """
        #return (mean_vd_metrics, wt_mean_vd_metrics)
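
The fold slicing above (training on xs[:l] plus xs[r:], validating on xs[l:r]) implies that cross_validation_edges returns contiguous (left, right) boundaries, one pair per fold. That helper is not shown here, so the following is only a minimal sketch of the assumed behaviour:

    def cross_validation_edges(n, cv_folds):
        # Split range(n) into cv_folds contiguous (left, right) slices;
        # the last fold absorbs the remainder when n % cv_folds != 0
        fold_size = n // cv_folds
        edges = []
        for fold in range(cv_folds):
            l = fold * fold_size
            r = n if fold == cv_folds - 1 else (fold + 1) * fold_size
            edges.append((l, r))
        return edges
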
Example #2
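A flat, single-layer variant of the same experiment: one classifier per code, scored with generic k-fold cross validation, which also records the indices of true/false positives and negatives for each code. Same Python 2 conventions and assumed base-class helpers as above.
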
    def Run(self, results_file_name, cv_folds = 10, min_word_count = 5,
            stem = True, lemmatize = False, remove_stop_words = True,
            one_code = None, spelling_correct = True, one_fold = False):

        self.min_word_count = min_word_count

        #SETTINGS
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

        results_dir = self.__get_results_folder__()
        self.__ensure_dir__(results_dir)

        print "Results filename: " + results_file_name
        results_file_path = results_dir + results_file_name
        vd_hits_and_misses_fname = results_file_path.replace(".txt", "_VD_hits_misses.txt")
        td_hits_and_misses_fname = results_file_path.replace(".txt", "_TD_hits_misses.txt")

        #TOKENIZE
        data = self.get_data(ExperimentBase.__settings__)
        tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=min_word_count, stem=stem,
                                                lemmatize=lemmatize, remove_stop_words=remove_stop_words,
                                                spelling_correct=spelling_correct, number_fn=NumberStrategy.collapse_num)

        empty_ixs = set([i for i, doc in enumerate(tokenized_docs) if len(doc) < ExperimentBase.MIN_DOC_LENGTH])
        tokenized_docs = [t for i, t in enumerate(tokenized_docs) if i not in empty_ixs]
       
        #TRAINING DATA
        #TODO Make this one call from docs -> td
        (distance_matrix, id2word) = self.get_vector_space(tokenized_docs)
        xs = self.get_training_data(distance_matrix, id2word)
        
        matrix_mapper = self.matrix_value_mapper()
        if matrix_mapper:
            xs = MatrixHelper.map_matrix(matrix_mapper, xs)
        
        all_results = self.get_params() + "\n"
        print all_results,
        
        MIN_CODE_COUNT = 1
        
        vd_metrics, td_metrics = [], []
        label_mapper = self.label_mapper()

        # Stop logging now
        logging.disable(logging.INFO)
        
        # So we can test on one code only
        codes_to_process = self.__get_codes_to_process__(data, one_code)

        # Store the indices into the inputs that detail
        # the true and false positives and negatives
        vd_hits_misses_by_code = dict()
        td_hits_misses_by_code = dict()

        for code in codes_to_process:

            ys = self.__get_labels_for_code__(code, data, empty_ixs, label_mapper, xs)
                  
            total_codes = len([item for item in ys if item == 1])
            if total_codes <= MIN_CODE_COUNT:
                continue

            # Yes, that is a lot I know
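            # (recall, precision, F1 and accuracy on the validation and training
            #  data, plus true/false positive/negative example indices for each)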
            vd_r, vd_p, vd_f1, vd_a, \
            td_r, td_p, td_f1, td_a, \
            vd_tp_ix, vd_fp_ix, vd_fn_ix, vd_tn_ix, \
            td_tp_ix, td_fp_ix, td_fn_ix, td_tn_ix \
                = cross_validation_score_generic(
                    xs, ys,
                    self.create_classifier(code),
                    self.classify(),
                    cv_folds,
                    class_value = self.get_class_value(),
                    one_fold = one_fold)

            vd_metric, td_metric = rpfa(vd_r, vd_p, vd_f1, vd_a, total_codes), \
                                   rpfa(td_r, td_p, td_f1, td_a, total_codes)
            vd_metrics.append(vd_metric)
            td_metrics.append(td_metric)

            vd_hits_misses_by_code[code] = (vd_tp_ix, vd_fp_ix, vd_fn_ix, vd_tn_ix)
            td_hits_misses_by_code[code] = (td_tp_ix, td_fp_ix, td_fn_ix, td_tn_ix)

            results = "Code: {0} Count: {1} VD[ {2} ]\tTD[ {3} ]\n".format(code.ljust(7), str(total_codes).rjust(4), vd_metric.to_str(), td_metric.to_str())
            print results,
            all_results += results

            """ Dump results to file in case of crash """
            self.__dump_results_to_file__(all_results, results_file_path)
            dump_hits_and_misses(vd_hits_misses_by_code, xs, vd_hits_and_misses_fname)
            dump_hits_and_misses(td_hits_misses_by_code, xs, td_hits_and_misses_fname)

        """ Compute mean metrics """
        """ MEAN """
        mean_vd_metrics,    mean_td_metrics     = mean_rpfa(vd_metrics),           mean_rpfa(td_metrics)
        """ WEIGHTED MEAN """
        wt_mean_vd_metrics, wt_mean_td_metrics  = weighted_mean_rpfa(vd_metrics),  weighted_mean_rpfa(td_metrics)

        str_aggregate_results = self.__build_aggregate_results_string__(mean_td_metrics, mean_vd_metrics,
                                                                        wt_mean_td_metrics, wt_mean_vd_metrics)
        print str_aggregate_results
        all_results += str_aggregate_results
            
        #DUMP TO FILE
        print "Writing results to: " + results_file_path
        print "TD Hits and Misses: " + td_hits_and_misses_fname
        print "VD Hits and Misses: " + vd_hits_and_misses_fname

        self.__dump_results_to_file__(all_results, results_file_path)
        return (mean_vd_metrics, wt_mean_vd_metrics)
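
A minimal usage sketch. ExperimentSubclass is a placeholder for whichever concrete subclass supplies get_data, create_classifier and the other hooks; the keyword values are illustrative only:

    experiment = ExperimentSubclass()
    mean_vd, wt_mean_vd = experiment.Run("results.txt",
                                         cv_folds=10,
                                         min_word_count=5,
                                         one_code=None,   # or a single code string to test just that code
                                         one_fold=False)  # True scores only the first fold, for a quick check
    print "Weighted mean VD metrics: " + wt_mean_vd.to_str()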