Example #1

import importlib
import logging
import os
import shutil
import sys

import yaml
from sklearn.decomposition import PCA

# Project-local helpers used below (Datasets, constants, print_score,
# _get_algorithm_class, plot_data, plot_metric, plot_scatter, plot_histogram,
# plot_PCA_variance, plot_training_results, print_results, generate_pdf,
# dump_results) are assumed to be importable from this package.

def run_algorithms(algorithms, datasets, metrics, output, conf):
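    """Train every algorithm on every dataset and report the results.

    `conf` is the parsed configuration dict, `output` selects the report
    format ("pdf" or "dump_text"), and a non-empty `metrics` list overrides
    each algorithm's configured `allowed_metrics`.
    """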
    dts = Datasets()
    shall_plot = conf.get("plot_data")
    if shall_plot:
        plot_dir = conf.get("plot_dir", "../plots")

        tmp_plot_dir = "../plots_1"
        if os.path.exists(tmp_plot_dir):
            shutil.rmtree(tmp_plot_dir)

        os.mkdir(tmp_plot_dir)

        orig_data_dir = os.path.join(tmp_plot_dir, "original")
        os.mkdir(orig_data_dir)
        for dataset in datasets:
            plot_data(os.path.join(orig_data_dir, "%s-orig.png" % dataset), "%s-orig" % dataset, dataset)

    if output == 'dump_text' and not os.path.exists("../dumps"):
        os.mkdir("../dumps")

    for algorithm in algorithms:

        if shall_plot:
            algo_dir = os.path.join(tmp_plot_dir, algorithm)
            os.mkdir(algo_dir)

        algo_conf = conf["algorithms"].get(algorithm)

        if not algo_conf:
            logging.error("Algorithm %s not found in conf file" % algorithm)
            sys.exit(1)

        algo_conf['name'] = algorithm
        learn_class = _get_algorithm_class(algorithm)
        learn = learn_class(**algo_conf)
        learn._set_cross_validation(conf.get("cv_method", None), conf.get("cv_metric", None), conf.get("cv_params", None))
        results = []
        for dataset in datasets:
            if dataset not in conf["datasets"]:
                logging.error("Dataset %s not found" % dataset)
                sys.exit(0)

            cv_dir = None
            if shall_plot:
                dataset_dir = os.path.join(algo_dir, dataset)
                os.mkdir(dataset_dir)

                if algo_conf.get("cross_validate", True):
                    cv_dir = os.path.join(dataset_dir, "cv")
                    os.mkdir(cv_dir)

            training_sizes = conf.get("training_size", [0.40])
            scores = []
            for training_size in training_sizes:
                data = dts.load_dataset(dataset, training_size)

                learn.set_dataset(dataset, training_size*100, cv_dir)
                if learn.check_type(data["type"]):
                    eval_metrics = []
                    if metrics:
                        eval_metrics.extend(metrics)
                    else:
                        eval_metrics.extend(algo_conf["allowed_metrics"])

                    learn.train(data["x_train"], data["y_train"])
                    result_tups = learn.evaluate(data["x_test"], data["y_test"], eval_metrics)

                    print_results(training_size, algorithm, dataset, result_tups)
                    results.append((algorithm, dataset, training_size, result_tups))

                    if shall_plot:
                        decision_plot_path = os.path.join(dataset_dir, "decision-%s_%s_size_%d.png" % (dataset, algorithm, training_size * 100))
                        learn.plot_results(decision_plot_path, dataset, training_size, data['x_train'], data['x_test'], data['y_train'], data['y_test'])

                        for metric, y_test, score in result_tups:
                            metric_plot_path = os.path.join(dataset_dir, "metric-%s-%s_%s_size_%d.png" % (metric, dataset, algorithm, training_size * 100))
                            plot_metric(metric_plot_path, data['type'], y_test, data['y_test'], dataset, algorithm, training_size * 100)
                    scores.append(result_tups[0][2])
            if shall_plot:
                train_plot_path = os.path.join(dataset_dir, "train_vs_acc-%s_%s.png" % (algorithm, dataset))
                plot_training_results(train_plot_path, [train_size * 100 for train_size in training_sizes], scores)

        if output == "pdf":
            generate_pdf(results)
        elif output == "dump_text":
            dump_results(algorithm, results)
    if conf.get("plot_data", False):
        shutil.rmtree(plot_dir)
        shutil.move(tmp_plot_dir, plot_dir)
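

# A minimal invocation sketch, assuming a YAML conf shaped like the keys read
# above; the algorithm, dataset, metric, and file names here are hypothetical:
#
#   with open("conf.yaml") as f:
#       conf = yaml.safe_load(f)
#   run_algorithms(["decision_tree"], ["iris"], ["accuracy"], "dump_text", conf)
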
class ClassifierLib:
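    """Drive the full pipeline: load the configuration, optionally analyze
    each dataset, then train, evaluate, and report every configured
    algorithm on every requested dataset.
    """
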
    def __init__(self, analyze, conf, data_conf, output, algorithms, datasets, metrics):
        self.conf = self._load_conf(conf)
        self.shall_analyze = analyze
        self.data_conf = self._load_conf(data_conf)
        self.data_class = Datasets()
        self.output = output
        self.output_dir = os.path.abspath(self.conf.get("output_dir", "./output"))
        self.shall_plot = self.conf.get("plot_data")
        self.algorithms = algorithms
        self.datasets = datasets
        self.metrics = metrics


    def _get_algorithm_class(self, algorithm_name):
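        """Import the module named `algorithm_name` and return its algorithm
        class, whose name is the CamelCase form of the module name.
        """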
        try:
            module = importlib.import_module(algorithm_name)
        except ImportError:
            # import_module raises rather than returning None, so catch the
            # failure explicitly instead of truth-testing the module.
            logging.error("Module %s not found" % algorithm_name)
            sys.exit(1)

        # snake_case module name -> CamelCase class name,
        # e.g. "decision_tree" -> "DecisionTree".
        class_name = algorithm_name.replace("_", " ").title().replace(" ", "")
        logging.info("Algorithm %s loaded from module %s" % (class_name, algorithm_name))
        return getattr(module, class_name)

    def _load_conf(self, conf_path):
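        """Parse the YAML file at `conf_path` and return the resulting dict."""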
        with open(os.path.abspath(conf_path)) as conf_file:
            return yaml.safe_load(conf_file)
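
    # The keys read throughout this class suggest a conf layout roughly like
    # the following (values are illustrative assumptions, not prescriptive):
    #
    #   output_dir: ./output
    #   plot_data: true
    #   evaluate: true
    #   print_cv_score: false
    #   cv_method: ...
    #   cv_metric: ...
    #   cv_params: {}
    #   training_sizes: [0.4, 0.6, 0.8]
    #   algorithms:
    #     decision_tree:
    #       cross_validate: true
    #       allowed_metrics: [accuracy]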


    def run_algorithm(self, algorithm, data, data_conf, training_size):
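        """Train `algorithm` on one train/test split and return raw
        predictions, or a list of (metric, y_test, score) tuples when
        `evaluate` is set in the conf. Returns None if the algorithm does
        not support this dataset's type.
        """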
        algo_conf = self.conf['algorithms'][algorithm]
        learn_class = self._get_algorithm_class(algorithm)
        learn = learn_class(**algo_conf)

        if not learn.check_type(getattr(constants, data_conf["type"])):
            return

        dataset = data_conf['name']

        learn.set_dataset(dataset, training_size)

        if algo_conf.get("cross_validate", False):
            learn._set_cross_validation(self.conf.get("cv_method", None), self.conf.get("cv_metric", None), self.conf.get("cv_params", None))
            learn.cross_validation(data['x_train'], data['y_train'], self.conf.get('print_cv_score', False))

        learn.train(data["x_train"], data["y_train"])
        result = learn.predict(data['x_test'])

        if self.conf.get('evaluate', False):
            eval_metrics = []
            if self.metrics:
                eval_metrics.extend(self.metrics)
            else:
                eval_metrics.extend(algo_conf["allowed_metrics"])

            result = learn.evaluate(result, data["y_test"], eval_metrics)

        return result

    def run(self):
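        """Entry point: rotate any previous output directory aside to
        `<output_dir>_1`, then run every configured algorithm over every
        requested dataset.
        """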
        if os.path.exists(self.output_dir):

            if os.path.exists("%s%s" % (self.output_dir, "_1")):
                shutil.rmtree("%s%s" % (self.output_dir, "_1"))

            shutil.move(self.output_dir, "%s%s" % (self.output_dir, "_1"))

        os.mkdir(self.output_dir)

        for dataset in self.datasets:
            if dataset not in self.data_conf:
                logging.error("Dataset %s not found" % dataset)
                sys.exit(0)

            dataset_dir = os.path.join(self.output_dir, dataset)
            os.mkdir(dataset_dir)

            if self.shall_analyze:
                self.analyze(dataset, dataset_dir)

            for algorithm in self.algorithms:

                algo_dir = os.path.join(dataset_dir, algorithm)
                os.mkdir(algo_dir)


                results = []
                for training_size in self.conf.get('training_sizes', [.4]):

                    data_conf = self.data_conf[dataset]

                    data = self.data_class.load_dataset(dataset, training_size)

                    result = self.run_algorithm(algorithm, data, data_conf, training_size)

                    if result is None:
                        # run_algorithm returns None when the algorithm does
                        # not support this dataset's type; skip this size.
                        continue

                    results.append(result)

                    if self.conf.get('evaluate', False):
                        if self.output == "print":
                            self.print_results(training_size, algorithm, dataset, result)

                        if self.shall_plot:
                            for metric, y_test, score in result:
                                metric_plot_path = os.path.join(algo_dir, "metric-%s-%s_%s_size_%d.png" % (metric, dataset, algorithm, training_size * 100))
                                plot_metric(metric_plot_path, data['type'], y_test, data['y_test'], dataset, algorithm, training_size * 100)
                    else:
                        # Without evaluation `result` holds raw predictions;
                        # append them to this algorithm's CSV.
                        with open(os.path.join(algo_dir, "result.csv"), 'a+') as result_file:
                            result_file.write(",".join(str(r) for r in result) + "\n")


    def analyze(self, dataset, dataset_dir):
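        """Print a per-class breakdown of the full dataset and, when plotting
        is enabled, write scatter, histogram, and PCA explained-variance plots.
        """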
        data = self.data_class.load_dataset(dataset, train_size=100)

        X, Y = data['x_train'], data['y_train']
        print_score.print_breakdown(X, Y)

        if self.shall_plot:
            plot_scatter(X, Y, "%s-orig" % dataset, filename=os.path.join(dataset_dir, "%s-orig.png"  % dataset))
            plot_histogram(X, Y, "%s-hist" % dataset, filename=os.path.join(dataset_dir, "%s-hist.png" % dataset))

            pca = PCA()
            pca.fit(X)
            plot_PCA_variance(pca.explained_variance_ratio_ * 100, "%s-pca-#feature-vs-variance" % dataset, filename=os.path.join(dataset_dir, "%s-pca-variance-ratio.png" % dataset))


    def print_results(self, training_size, algorithm, dataset, metric_tuples):
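        """Dispatch each (metric, y_test, score) tuple to the matching
        print_<metric> helper in print_score.
        """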
        #print "\nFor Algorithm::\t%s" % algorithm
        #print "For Dataset::\t%s\n" % dataset
            for met_tup in metric_tuples:
                func = getattr(print_score, "print_%s" % met_tup[0])
                func(training_size, algorithm, dataset, met_tup[2])
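

# A minimal driver sketch; the conf/data file names, algorithm, dataset, and
# metric below are hypothetical placeholders:
#
#   lib = ClassifierLib(analyze=True, conf="conf.yaml", data_conf="data.yaml",
#                       output="print", algorithms=["decision_tree"],
#                       datasets=["iris"], metrics=["accuracy"])
#   lib.run()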