def _analyze_predictors_on_holdout(self):
    """Analyze every previously dumped predictor against the holdout set.

    For each algorithm in ``SupervisedClassifier.SUPPORTED_ALGORITHMS`` the
    model dump located by ``self._build_model_dump_path(algorithm)`` is
    loaded into ``self._predictor`` and then analyzed through
    ``SupervisedLearningPipeline._analyze_predictor_holdoutset``.

    Algorithms without a loadable dump (no file on disk, or a 'bifurcated'
    algorithm, which joblib cannot reload yet) are skipped with a warning so
    that a predictor left over from a previous iteration is never analyzed
    under another algorithm's name.
    """
    # Resolve the data directory relative to this pipeline's source file.
    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
        self, pipeline_file_name)

    # Only the plain SupervisedClassifier algorithms are analyzed; the
    # 'bifurcated-<algorithm>' variants are currently disabled.
    algorithms_to_test = list(SupervisedClassifier.SUPPORTED_ALGORITHMS)
    log.debug('algorithms_to_test: %s', algorithms_to_test)

    for algorithm in algorithms_to_test:
        log.info('analyzing %s...', algorithm)

        predictor_path = self._build_model_dump_path(algorithm)
        # TODO(sbala): Fix joblib.load so that it works for bifurcated
        # supervised classifiers.
        if 'bifurcated' in algorithm or not os.path.exists(predictor_path):
            log.warning(
                'No loadable model dump for %s at %s; '
                'skipping holdout analysis.', algorithm, predictor_path)
            continue

        log.debug('Loading model from disk...')
        # NOTE: joblib.load unpickles the file, so the dump path must be
        # trusted.
        self._predictor = joblib.load(predictor_path)

        # The report directory is not created here.
        # NOTE(review): _train_and_analyze_predictors writes its reports under
        # data_dir/<self.drug>/<algorithm>, while this path has no drug
        # component -- confirm which layout is intended.
        report_dir = '/'.join([data_dir, algorithm])
        pipeline_prefix = '%s-normality-prediction-%s' % (self._var, algorithm)
        SupervisedLearningPipeline._analyze_predictor_holdoutset(
            self, report_dir, pipeline_prefix)
def _build_raw_matrix_path(self):
    """Return the path of the raw matrix file for this pipeline.

    The file name combines ``self._var`` (runs of whitespace replaced by a
    single hyphen) with ``self._num_rows``; the directory is whatever
    ``SupervisedLearningPipeline._fetch_data_dir_path`` returns for this
    source file.
    """
    # Matrix file name: slugified variable plus the episode count.
    var_slug = '-'.join(self._var.split())
    file_name = '%s-change-matrix-%d-episodes-raw.tab' % (
        var_slug, self._num_rows)

    # Use the parent class logic for _fetch_data_dir_path. Per the original
    # author's note, this keeps the raw matrix in the lab test's directory
    # rather than a per-change-definition subdirectory, so it can be reused
    # by pipelines for multiple different change definitions.
    this_file = inspect.getfile(inspect.currentframe())
    base_dir = SupervisedLearningPipeline._fetch_data_dir_path(self, this_file)

    return '/'.join([base_dir, file_name])
def _train_and_analyze_predictors(self):
    """Train or load each supported classifier, analyze it, write reports.

    For every algorithm in ``SupervisedClassifier.SUPPORTED_ALGORITHMS``:

    * the per-algorithm report directory is created if it is missing;
    * a model dump found on disk is loaded instead of retraining (never for
      'bifurcated' algorithms); otherwise the predictor is trained through
      ``SupervisedLearningPipeline._train_predictor``;
    * on ``INSUFFICIENT_SAMPLES`` an error report holding the train/test
      label counts is written into the report directory;
    * on ``TRAINED`` the predictor is analyzed, its report is collected and
      the predictor is dumped to disk.

    Finally the collected per-algorithm reports are concatenated into one
    meta report in the data directory. No meta report is written when no
    algorithm reached ``TRAINED``.
    """
    # Function-scope import: DataFrame.append, used here previously, was
    # removed in pandas 2.0; concat is the supported replacement.
    from pandas import concat

    log.info('Training and analyzing predictors...')
    problem = SupervisedLearningPipeline.CLASSIFICATION
    fm_io = FeatureMatrixIO()

    # Header line shared by the error reports and the meta report.
    # NOTE(review): the pipeline name and row count are hard-coded in this
    # string -- confirm they match the enclosing pipeline.
    header = ['LabCulturePredictionPipeline("%s", 1000000)' % self._var]

    # Build paths for output.
    pipeline_file_name = inspect.getfile(inspect.currentframe())
    data_dir = SupervisedLearningPipeline._fetch_data_dir_path(
        self, pipeline_file_name)

    # Only the plain SupervisedClassifier algorithms are tested; the
    # 'bifurcated-<algorithm>' variants are currently disabled.
    algorithms_to_test = list(SupervisedClassifier.SUPPORTED_ALGORITHMS)
    log.debug('algorithms_to_test: %s', algorithms_to_test)

    # Reports of the successfully trained algorithms, in training order.
    algorithm_reports = []

    for algorithm in algorithms_to_test:
        log.info('Training and analyzing %s...', algorithm)

        # If report_dir does not exist, make it.
        report_dir = '/'.join([data_dir, self.drug, algorithm])
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)

        # Define hyperparams.
        hyperparams = {
            'algorithm': algorithm,
            'hyperparam_strategy': SupervisedClassifier.EXHAUSTIVE_SEARCH,
            'max_iter': 1024,
            'random_state': self._random_state,
        }
        is_bifurcated = 'bifurcated' in algorithm
        if is_bifurcated:
            # Bifurcator: <var>.pre == 0
            hyperparams['bifurcator'] = '%s.pre' % self._var
            hyperparams['bifurcation_strategy'] = \
                BifurcatedSupervisedClassifier.EQUAL
            hyperparams['bifurcation_value'] = 0
            hyperparams['bifurcated'] = True

        # Load the classifier from disk when possible, otherwise train it.
        predictor_path = self._build_model_dump_path(algorithm)
        if os.path.exists(predictor_path) and not is_bifurcated:
            log.debug('Loading model from disk...')
            # TODO(sbala): Fix joblib.load so that it works for bifurcated
            # supervised classifiers.
            # NOTE: joblib.load unpickles the file, so the dump path must be
            # trusted.
            self._predictor = joblib.load(predictor_path)
            self._features = self._X_train.columns
            status = SupervisedClassifier.TRAINED
        else:
            status = SupervisedLearningPipeline._train_predictor(
                self, problem, [0, 1], hyperparams)

        if status == SupervisedClassifier.INSUFFICIENT_SAMPLES:
            # Failed to train: write an error report for this algorithm.
            # NOTE(review): an older comment said to "just return" here, but
            # the loop has always continued and written one error report per
            # algorithm; that behavior is kept.
            y_train_counts = self._y_train[
                self._y_train.columns[0]].value_counts()
            y_test_counts = self._y_test[
                self._y_test.columns[0]].value_counts()
            error_columns = [
                'bug_panel', 'algorithm', 'error',
                'y_train.value_counts()', 'y_test.value_counts()'
            ]
            error_report = DataFrame(
                {
                    'bug_panel': [self._var],
                    'algorithm': [algorithm],
                    'error': [status],
                    'y_train.value_counts()': [y_train_counts.to_dict()],
                    'y_test.value_counts()': [y_test_counts.to_dict()]
                },
                columns=error_columns)
            error_report_path = '/'.join([
                report_dir,
                '%s-normality-prediction-report.tab' % self._var
            ])
            fm_io.write_data_frame_to_file(
                error_report, error_report_path, header)

        elif status == SupervisedClassifier.TRAINED:
            # Successfully trained (or loaded): analyze and collect report.
            pipeline_prefix = '%s-normality-prediction-%s' % (
                self._var, algorithm)
            SupervisedLearningPipeline._analyze_predictor(
                self, report_dir, pipeline_prefix)

            algorithm_report = fm_io.read_file_to_data_frame('/'.join(
                [report_dir, '%s-report.tab' % pipeline_prefix]))
            log.debug('algorithm_report: %s', algorithm_report)
            algorithm_reports.append(algorithm_report)

            # Write predictor to disk.
            predictor = SupervisedLearningPipeline.predictor(self)
            joblib.dump(predictor, predictor_path)

    # After building per-algorithm reports, write the meta report. If there
    # were insufficient samples to build any of the algorithms, no report
    # was collected and nothing is written.
    if algorithm_reports:
        # A single concat keeps each report's own index, exactly like the
        # chained DataFrame.append calls it replaces.
        meta_report = concat(algorithm_reports)
        meta_report_path = '/'.join(
            [data_dir, '%s-normality-prediction-report.tab' % self._var])
        fm_io.write_data_frame_to_file(meta_report, meta_report_path, header)