Exemplo n.º 1
0
 def _sanity_check(self, all_tasks):
     """Instantiate and fit every configured classifier on a tiny dummy set.

     Each entry of *all_tasks* is a single-item dict mapping a scikit-learn
     classifier name to its constructor parameters.  A classifier that can be
     constructed and fitted becomes a PredictJob; failures are logged and
     counted.  Returns the list of valid PredictJob objects.
     """
     total_clf = 0
     failed_clf = 0
     # t10k split is only used as throwaway fit data for the smoke test
     Xt, Yt = mnist_reader.load_mnist(path=DATA_DIR, kind='t10k')
     Xt = preprocessing.StandardScaler().fit_transform(Xt.astype(float))
     # shuffled labels are discarded below; only the shuffled samples matter
     Xs, _ = shuffle(Xt, Yt)
     num_dummy = 10
     Xs = Xs[:num_dummy]
     # one sample per class so fit() sees every label exactly once
     Ys = list(range(num_dummy))
     valid_jobs = []
     for v in all_tasks:
         clf_name = list(v.keys())[0]
         clf_par = list(v.values())[0]
         total_clf += 1
         try:
             # classifier classes are expected to be imported at module level
             globals()[clf_name](**clf_par).fit(Xs, Ys)
             valid_jobs.append(
                 PredictJob(clf_name, clf_par, self.num_repeat))
         except Exception as e:
             failed_clf += 1
             LOGGER.error(
                 'Can not create classifier "%s" with parameter "%s". Reason: %s'
                 % (clf_name, clf_par, e))
     LOGGER.info('%d classifiers to test, %d fail to create!' %
                 (total_clf, failed_clf))
     return valid_jobs
Exemplo n.º 2
0
    def run(self) -> None:
        """Worker loop: pull PredictJob items forever and benchmark each one.

        Blocks on the pending queue, runs every job up to ``num_repeat``
        times, and emits the aggregate accuracy both as a JSON record and
        as a human-readable log line.
        """
        while True:
            job = self.pending_q.get()  # type: PredictJob

            LOGGER.info(
                'job received! repeat: %d classifier: "%s" parameter: "%s"' %
                (job.num_repeat, job.clf_name, job.clf_par))

            # unknown classifier name: report and wait for the next job
            if job.clf_name not in globals():
                LOGGER.error(
                    'Can not found "%s" in scikit-learn, missing import?' %
                    job.clf_name)
                continue

            try:
                scores = []
                job.start_time = now_int()
                for rep in range(job.num_repeat):
                    cur = self.get_accuracy(job.clf_name, job.clf_par, rep)
                    scores.append(cur)
                    # after two identical scores the model is insensitive to
                    # data shuffling, so further repeats add nothing
                    if len(scores) == 2 and abs(scores[0] - cur) < 1e-3:
                        LOGGER.info(
                            '%s is invariant to training data shuffling, will stop repeating!'
                            % job.clf_name)
                        break
                job.done_time = now_int()

                score_arr = np.array(scores)
                test_info = {
                    'name': job.clf_name,
                    'parameter': job.clf_par,
                    'score': scores,
                    'start_time': job.start_time,
                    'done_time': job.done_time,
                    'num_repeat': len(scores),
                    'mean_accuracy': score_arr.mean(),
                    # two standard deviations, i.e. a ~95% interval
                    'std_accuracy': score_arr.std() * 2,
                    'time_per_repeat': int(
                        (job.done_time - job.start_time) / len(scores)),
                }

                JSON_LOGGER.info(json.dumps(test_info, sort_keys=True))

                LOGGER.info(
                    'done! acc: %0.3f (+/- %0.3f) repeated: %d classifier: "%s" '
                    'parameter: "%s" ' %
                    (score_arr.mean(), score_arr.std() * 2,
                     len(scores), job.clf_name, job.clf_par))
            except Exception as e:
                LOGGER.error('%s with %s failed! reason: %s' %
                             (job.clf_name, job.clf_par, e))
Exemplo n.º 3
0
    def run(self) -> None:
        """Worker loop: consume PredictJob items from the queue forever,
        benchmark each classifier, and log the aggregated accuracy
        (as a JSON record and a human-readable line)."""
        while True:
            # blocks until a job is available
            cur_job = self.pending_q.get()  # type: PredictJob

            LOGGER.info('job received! repeat: %d classifier: "%s" parameter: "%s"' % (cur_job.num_repeat,
                                                                                       cur_job.clf_name,
                                                                                       cur_job.clf_par))
            # classifier classes are expected to be imported at module level
            if cur_job.clf_name in globals():
                try:
                    acc = []
                    cur_job.start_time = now_int()
                    for j in range(cur_job.num_repeat):
                        cur_score = self.get_accuracy(cur_job.clf_name, cur_job.clf_par, j)
                        acc.append(cur_score)
                        # two near-identical scores in a row: model is not
                        # affected by data shuffling, skip remaining repeats
                        if len(acc) == 2 and abs(acc[0] - cur_score) < 1e-3:
                            LOGGER.info('%s is invariant to training data shuffling, will stop repeating!' %
                                        cur_job.clf_name)
                            break
                    cur_job.done_time = now_int()
                    test_info = {
                        'name': cur_job.clf_name,
                        'parameter': cur_job.clf_par,
                        'score': acc,
                        'start_time': cur_job.start_time,
                        'done_time': cur_job.done_time,
                        'num_repeat': len(acc),
                        'mean_accuracy': np.array(acc).mean(),
                        # two standard deviations, i.e. a ~95% interval
                        'std_accuracy': np.array(acc).std() * 2,
                        'time_per_repeat': int((cur_job.done_time - cur_job.start_time) / len(acc))
                    }

                    # machine-readable record for downstream result collection
                    JSON_LOGGER.info(json.dumps(test_info, sort_keys=True))

                    LOGGER.info('done! acc: %0.3f (+/- %0.3f) repeated: %d classifier: "%s" '
                                'parameter: "%s" ' % (np.array(acc).mean(),
                                                      np.array(acc).std() * 2,
                                                      len(acc),
                                                      cur_job.clf_name,
                                                      cur_job.clf_par))
                except Exception as e:
                    # keep the worker alive: log the failure and move on
                    LOGGER.error('%s with %s failed! reason: %s' % (cur_job.clf_name, cur_job.clf_par, e))
            else:
                LOGGER.error('Can not found "%s" in scikit-learn, missing import?' % cur_job.clf_name)
Exemplo n.º 4
0
def upload_result_s3():
    """Run the S3 sync script as a child process, mirroring its output to the log.

    Waits up to SYNC_TIMEOUT seconds; on timeout the child is killed,
    reaped, and the timeout is logged.
    """
    LOGGER.info("Syncing data to S3...")
    # line-buffered append so the child's output lands in the shared log file
    with open(LOG_PATH, 'a', 1) as logfile:
        # argument list + shell=False: no shell word-splitting, so paths with
        # spaces or metacharacters cannot break or inject into the command
        proc = subprocess.Popen(["bash", SYNC_SCRIPT_PATH, RESULT_PATH],
                                stdin=subprocess.PIPE,
                                stdout=logfile,
                                stderr=logfile,
                                cwd=ROOT_DIR,
                                env=os.environ)

        # we have to wait until the training data is downloaded
        try:
            outs, errs = proc.communicate(timeout=SYNC_TIMEOUT)
            # stdout/stderr are redirected to logfile, so these are normally
            # None; kept for safety if the redirection is ever removed
            if outs:
                LOGGER.info(outs)
            if errs:
                LOGGER.error(errs)
        except subprocess.TimeoutExpired:
            proc.kill()
            # reap the killed child and drain its pipes (per subprocess docs,
            # kill() alone leaves a zombie process)
            proc.communicate()
            LOGGER.error('S3 sync timed out after %s seconds!' % SYNC_TIMEOUT)
Exemplo n.º 5
0
 def _sanity_check(self, all_tasks):
     """Smoke-test each configured classifier on a tiny dummy sample.

     Every entry of *all_tasks* is a one-item dict {classifier_name:
     parameters}.  Configurations that construct and fit successfully are
     turned into PredictJob objects; the rest are logged and counted.
     Returns the list of runnable PredictJob objects.
     """
     num_total = 0
     num_failed = 0
     # throwaway fit data taken from the t10k split
     data, labels = mnist_reader.load_mnist(path=DATA_DIR, kind='t10k')
     data = preprocessing.StandardScaler().fit_transform(data)
     Xs, Ys = shuffle(data, labels)
     num_dummy = 10
     Xs = Xs[:num_dummy]
     # replace shuffled labels with one sample per class
     Ys = list(range(10))
     valid_jobs = []
     for task in all_tasks:
         # first (and only) key/value pair of the task dict
         clf_name, clf_par = next(iter(task.items()))
         num_total += 1
         try:
             globals()[clf_name](**clf_par).fit(Xs, Ys)
             valid_jobs.append(PredictJob(clf_name, clf_par, self.num_repeat))
         except Exception as e:
             num_failed += 1
             LOGGER.error('Can not create classifier "%s" with parameter "%s". Reason: %s' % (clf_name, clf_par, e))
     LOGGER.info('%d classifiers to test, %d fail to create!' % (num_total, num_failed))
     return valid_jobs
Exemplo n.º 6
0
 def _sanity_check(self, all_tasks):
     """Smoke-test each (processor, classifier, topic-model) pipeline config.

     Every entry of *all_tasks* is a 3-tuple of one-item dicts:
     (processor, classifier, topic model), each mapping a component name to
     its constructor parameters.  Pipelines that build and fit on a tiny
     20newsgroups sample become PredictJob objects; failures are logged.
     Returns the list of runnable PredictJob objects.
     """
     num_total = 0
     num_failed = 0
     corpus = fetch_20newsgroups(subset='train',
                                 shuffle=True,
                                 random_state=2019,
                                 remove=('headers', 'footers', 'quotes'))
     Xs, Ys = shuffle(corpus.data, corpus.target)
     num_dummy = 10
     Xs = Xs[:num_dummy]
     Ys = Ys[:num_dummy]
     valid_jobs = []
     for task in all_tasks:
         # each component spec is a one-item dict: take its key/value pair
         processor_name, processor_par = next(iter(task[0].items()))
         clf_name, clf_par = next(iter(task[1].items()))
         topic_name, topic_par = next(iter(task[2].items()))
         num_total += 1
         try:
             pipeline = make_pipeline(
                 globals()[processor_name](**processor_par),
                 DenseTransformer(),
                 globals()[topic_name](**topic_par),
                 globals()[clf_name](**clf_par))
             pipeline.fit(Xs, Ys)
             valid_jobs.append(
                 PredictJob(processor_name, processor_par, clf_name,
                            clf_par, topic_name, topic_par,
                            self.num_repeat))
         except Exception as e:
             num_failed += 1
             LOGGER.error(
                 'Can not create classifier "%s" with parameter "%s". Reason: %s'
                 % (clf_name, clf_par, e))
     LOGGER.info('%d classifiers to test, %d fail to create!' %
                 (num_total, num_failed))
     return valid_jobs