def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers are chosen, pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions') is run
    across all combinations of (targets, pipelines, classifiers).
    """
    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    ts = time.get_millis()

    targets = [
        #'Dog_1',
        #'Dog_2',
        #'Dog_3',
        #'Dog_4',
        #'Patient_1',
        #'Patient_2',
        #'Patient_3',
        #'Patient_4',
        #'Patient_5',
        #'Patient_6',
        #'Patient_7',
        'Patient_8'
    ]

    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[Stats()]),
        # Pipeline(gen_ictal=False, pipeline=[DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), MFCC()]),
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # winning submission
        # Pipeline(gen_ictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # higher score than winning submission
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]

    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=50, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf50mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),
        (RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss1Bfrs0'),
    ]

    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return np.any(np.array([isinstance(classifier, clazz) for clazz in clazzes]) == True)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,seizure,early']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=pipeline.gen_ictal, cv_ratio=cv_ratio)
                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                S_scores = []
                E_scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=pipeline.gen_ictal, cv_ratio=cv_ratio)
                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)
                    print '%.3f' % score, 'S=%.4f' % data.S_auc, 'E=%.4f' % data.E_auc
                    S_scores.append(data.S_auc)
                    E_scores.append(data.E_auc)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary
                if len(S_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, S_scores)
                    print 'S', summary
                if len(E_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, E_scores)
                    print 'E', summary

        print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
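
# ---------------------------------------------------------------------------
# For reference: run_seizure_detection() above reads exactly three keys from
# SETTINGS.json ('competition-data-dir', 'data-cache-dir', 'submission-dir');
# some of the variants below additionally read 'figure-dir'. A minimal sketch
# that writes a valid settings file -- the directory names are placeholders
# for illustration, not the authors' actual layout:
def write_example_settings(path='SETTINGS.json'):
    """Example only: emit a minimal SETTINGS.json with placeholder paths."""
    import json
    example_settings = {
        'competition-data-dir': 'data',      # raw competition .mat clips
        'data-cache-dir': 'data-cache',      # cache for transformed features
        'submission-dir': 'submissions',     # where submission CSVs are written
        'figure-dir': 'figures',             # PdfPages output (later variants)
    }
    with open(path, 'w') as f:
        json.dump(example_settings, f, indent=2)
# ---------------------------------------------------------------------------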
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers are chosen, pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions',
    'predict_all') is run across all combinations of (targets, pipelines, classifiers).
    """
    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    figure_dir = str(settings['figure-dir'])
    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    ts = time.get_millis()

    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Dog_5',
        'Patient_1',
        'Patient_2',
    ]

    pipelines = [
        # NOTE: you can enable multiple pipelines to run them all and compare results
        Pipeline(gen_preictal=True, pipeline=[FFTWithTimeFreqCorrelation(50, 2500, 400, 18, 'usf')]),  # winning submission
    ]

    classifiers = [
        # NOTE: you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, max_features=0.5, bootstrap=False, n_jobs=-1, random_state=0), 'rf300mss1mf05Bfrs0'),
        # (ExtraTreesClassifier(n_estimators=3000, min_samples_split=1, max_features=0.15, bootstrap=False, n_jobs=-1, random_state=0), 'ET3000mss1mf015Bfrs0'),
        # (GradientBoostingClassifier(n_estimators=3000, min_samples_split=1, max_features=0.15, learning_rate=0.02, subsample=0.5, random_state=0), 'GBRT3000mms1mf015Lr002Ss05rs0'),
        (SVC(C=1e6, kernel='rbf', gamma=0.01, coef0=0.0, shrinking=True, probability=True,
             tol=1e-5, cache_size=2000, class_weight='auto', max_iter=-1, random_state=0),
         'svcce6rbfg001co0stte-5cwautors0'),
    ]

    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return np.any(np.array([isinstance(classifier, clazz) for clazz in clazzes]) == True)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print('Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name))
                guesses = ['clip,preictal']
                classifier_filenames = []
                plot2file = PdfPages(os.path.join(figure_dir, ('figure%d-_%s_%s_.pdf' % (ts, classifier_name, pipeline.get_name()))))
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio, plot2file=plot2file)
                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print('\n'.join(guesses), file=f)
                    print('wrote', filename)
                else:
                    print('Trained classifiers ready in %s' % cache_dir)
                    for filename in classifier_filenames:
                        print(os.path.join(cache_dir, filename + '.pickle'))
                plot2file.close()

    def predict_all(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print('Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name))
                lines = ['clip,preictal']
                subjectID = 0
                X_train = y_train = X_test = None
                test_size = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_preictal=pipeline.gen_preictal, cv_ratio=cv_ratio)
                    data = GetCrossSubjectDataTask(task_core).run()
                    test_size.append(np.shape(data.X_test)[0])
                    if subjectID > 0:
                        X_train = np.concatenate((X_train, data.X_train), axis=0)
                        y_train = np.concatenate((y_train, data.y_train), axis=0)
                        X_test = np.concatenate((X_test, data.X_test), axis=0)
                    else:
                        X_train = data.X_train
                        y_train = data.y_train
                        X_test = data.X_test
                    subjectID += 1

                # Training
                task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                     target=[], pipeline=pipeline,
                                     classifier_name=classifier_name, classifier=classifier,
                                     normalize=should_normalize(classifier),
                                     gen_preictal=pipeline.gen_preictal, cv_ratio=cv_ratio)
                y_train = np.ceil(0.1 * y_train)
                y_train = y_train.astype('int_')  # astype() returns a new array; keep the result
                if should_normalize(classifier):
                    X_train, temp = normalize_data(X_train, X_train)

                print("Training ...")
                print('Dim', np.shape(X_train), np.shape(y_train))
                start = time.get_seconds()
                classifier.fit(X_train, y_train)
                elapsedSecs = time.get_seconds() - start
                print("t=%ds" % int(elapsedSecs))

                y_estimate = classifier.predict_proba(X_train)
                lr = LogisticRegression(random_state=0)
                lr.fit(y_estimate, y_train)
                predictions_proba = classifier.predict_proba(X_test)
                predictions_calibrated = lr.predict_proba(predictions_proba)

                # output
                m = 0
                totalSample = 12
                startIdx = 0
                for target in targets:
                    for i in range(test_size[m] // totalSample):  # integer division under Python 3
                        j = i + 1
                        if j < 10:
                            nstr = '000%d' % j
                        elif j < 100:
                            nstr = '00%d' % j
                        elif j < 1000:
                            nstr = '0%d' % j
                        else:
                            nstr = '%d' % j
                        preictalOverAllSample = 0
                        for k in range(totalSample):
                            p = predictions_calibrated[i * totalSample + k + startIdx]
                            preictal = translate_prediction(p)
                            preictalOverAllSample += preictal / totalSample
                        newline = '%s_test_segment_%s.mat,%.15f' % (target, nstr, preictalOverAllSample)
                        lines.append(newline)
                        print(newline)
                    startIdx = startIdx + test_size[m]
                    m += 1

                filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                filename = os.path.join(submission_dir, filename)
                with open(filename, 'w') as f:
                    print('\n'.join(lines), file=f)
                print('wrote', filename)

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print('Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name))
                scores = []
                for target in targets:
                    print('Processing %s (classifier %s)' % (target, classifier_name))
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_preictal=pipeline.gen_preictal, cv_ratio=cv_ratio)
                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)
                    print('%.3f' % score)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print(summary)

        print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    elif build_target == 'predict_all':
        predict_all(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
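
# ---------------------------------------------------------------------------
# The output loop in predict_all() above folds the 12 per-segment calibrated
# probabilities belonging to one test clip into a single per-clip score by
# averaging. A minimal numpy sketch of that reduction (segment count and the
# flat ordering of segments are assumptions carried over from the code above):
import numpy as np

def clip_probabilities(segment_probs, segments_per_clip=12):
    """Average per-segment preictal probabilities into per-clip scores."""
    probs = np.asarray(segment_probs, dtype=float)
    # one row per clip, one column per segment within that clip
    return probs.reshape(-1, segments_per_clip).mean(axis=1)

# e.g. 24 consecutive segment scores -> 2 clip scores
print(clip_probabilities(np.linspace(0.0, 1.0, 24)))
# ---------------------------------------------------------------------------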
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers are chosen, pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions') is run
    across all combinations of (targets, pipelines, classifiers).
    """
    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    ts = time.get_millis()

    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Patient_1',
        'Patient_2',
        'Patient_3',
        'Patient_4',
        'Patient_5',
        'Patient_6',
        'Patient_7',
        'Patient_8'
    ]

    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[Stats()]),
        # Pipeline(gen_ictal=False, pipeline=[DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), MFCC()]),
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # winning submission
        # Pipeline(gen_ictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # higher score than winning submission
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]

    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=50, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf50mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),
        # NOTE(mike): The original submission classifier was min_samples_split=1, but I had to change it to 2 after upgrading scikit.
        # I'm not even sure min_samples_split=1 makes sense in hindsight, how can you split on 1 sample? Anyway, to get the repo functional
        # again with newer libraries it's now 2.
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss1Bfrs0'),
        (RandomForestClassifier(n_estimators=3000, min_samples_split=2, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss2Bfrs0'),
    ]

    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return np.any(np.array([isinstance(classifier, clazz) for clazz in clazzes]) == True)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,seizure,early']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=pipeline.gen_ictal, cv_ratio=cv_ratio)
                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                S_scores = []
                E_scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=pipeline.gen_ictal, cv_ratio=cv_ratio)
                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)
                    print '%.3f' % score, 'S=%.4f' % data.S_auc, 'E=%.4f' % data.E_auc
                    S_scores.append(data.S_auc)
                    E_scores.append(data.E_auc)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary
                if len(S_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, S_scores)
                    print 'S', summary
                if len(E_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, E_scores)
                    print 'E', summary

        print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
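
# ---------------------------------------------------------------------------
# The NOTE(mike) above tracks a scikit-learn API change: modern releases
# require min_samples_split to be an int >= 2 (or a float fraction in
# (0.0, 1.0]), so the original min_samples_split=1 no longer constructs.
# A sketch of the updated classifier under a current scikit-learn, with the
# other parameters unchanged from the code above:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=3000, min_samples_split=2,
                             bootstrap=False, n_jobs=4, random_state=0)
# ---------------------------------------------------------------------------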
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers are chosen, pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions') is run
    across all combinations of (targets, pipelines, classifiers).
    """
    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    ts = time.get_millis()

    targets = [
        #'Dog_1',
        #'Dog_2',
        #'Dog_3',
        #'Dog_4',
        'Dog_5',
        #'Patient_1',
        #'Patient_2'
    ]

    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        Pipeline(gen_preictal=False, pipeline=[Resample(400), MaximalCrossCorrelation()]),
        #Pipeline(gen_preictal=False, pipeline=[CorrelationWithVariance(with_eigen=False)]),
        #Pipeline(gen_preictal=True, pipeline=[CorrelationWithVariance(with_eigen=True)]),
        #Pipeline(gen_preictal=True, pipeline=[CorrelationWithVariance(with_eigen=False)]),
        #Pipeline(gen_preictal=True, pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[MFCC()]),
        #Pipeline(gen_preictal=False, pipeline=[CorrelationWithVariance()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[Stats()]),
        #Pipeline(gen_preictal=False, pipeline=[DaubWaveletStats(4)]),
        #Pipeline(gen_preictal=False, pipeline=[Resample(400), DaubWaveletStats(4)]),
        #Pipeline(gen_preictal=False, pipeline=[Resample(400), MFCC()]),
        #Pipeline(gen_preictal=False, pipeline=[FFTWithTimeFreqCorrelation(65, 100, 400, 'us')]),
        #Pipeline(gen_preictal=False, pipeline=[FFTWithTimeFreqCorrelation(30, 45, 400, 'us')]),
        #Pipeline(gen_preictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        #Pipeline(gen_preictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        #Pipeline(gen_preictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # winning submission
        #Pipeline(gen_preictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # higher score than winning submission
        #Pipeline(gen_preictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        #Pipeline(gen_preictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        #Pipeline(gen_preictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        #Pipeline(gen_preictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]

    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        (RandomForestClassifier(n_estimators=50, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf50mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=1000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf1000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=2000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf2000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=4000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf4000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=5000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf5000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=6000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf6000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=7000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf7000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=8000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf8000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=10000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf10000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=9000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf9000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=11000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf11000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=12000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf12000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=13000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf13000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=14000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf14000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=15000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf15000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=16000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf16000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=17000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf17000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=18000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf18000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=19000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf19000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=20000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf20000mss1Bfrs0'),
        #(LogisticRegression(), 'logistic_regression'),
        #(LinearSVC(C=0.1), 'linearsvc_c0.1'),
        #(LinearSVC(C=1), 'linearsvc_c1'),
    ]

    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return np.any(np.array([isinstance(classifier, clazz) for clazz in clazzes]) == True)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,preictal']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_preictal=pipeline.gen_preictal, cv_ratio=cv_ratio)
                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_preictal=pipeline.gen_preictal, cv_ratio=cv_ratio)
                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)
                    print '%.3f' % score

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary

        print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
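
# ---------------------------------------------------------------------------
# Aside: should_normalize() above round-trips through a numpy array just to
# test class membership. A behavior-equivalent, plainer sketch (assumes
# LogisticRegression is imported at module level, as in the code above):
def should_normalize_simple(classifier, clazzes=(LogisticRegression,)):
    return any(isinstance(classifier, clazz) for clazz in clazzes)
# ---------------------------------------------------------------------------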
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers are chosen, pipelines are
    chosen, and the chosen build_target ('cv', 'cv_full', 'train_model', 'make_predictions',
    'make_predictions_with_calib') is run across all combinations of
    (targets, pipelines, classifiers).
    """
    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    ts = time.get_millis()

    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Dog_5',
        'Patient_1_downsample',
        'Patient_2_downsample',
    ]

    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        # Pipeline(pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        # Pipeline(pipeline=[RFFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        Pipeline(pipeline=[TimeAliasing(), FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(pipeline=[TimeAliasing(), FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Magnitude(), Log10()]),
        # Pipeline(pipeline=[Stats()]),
        # Pipeline(pipeline=[DaubWaveletStats(4)]),
        # Pipeline(pipeline=[Resample(400), DaubWaveletStats(4)]),
        # Pipeline(pipeline=[Resample(400), MFCC()]),
        # Pipeline(pipeline=[TimeAliasing(), FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(pipeline=[TimeAliasing(), FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),  # winning submission
        # Pipeline(pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # higher score than winning submission
        # Pipeline(pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(pipeline=[TimeAliasing(), TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[TimeAliasing(), TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[TimeAliasing(), FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True, with_fft=True)]),
        # Pipeline(pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        # Pipeline(pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]

    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        (RandomForestClassifier(n_estimators=3, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss1Bfrs0'),
        # (GaussianNB(), 'gbn'),
        # (BernoulliRBM(n_components=100), 'dbn'),
        # (SVC(probability=True), 'svc100'),
        # (LDA(), 'lda'),
    ]

    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return np.any(np.array([isinstance(classifier, clazz) for clazz in clazzes]) == True)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,preictal']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=False, cv_ratio=cv_ratio)
                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        print "training"
                        task.run()
                        print "train_finished"
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def train_model_with_calib(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,preictal']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=False, cv_ratio=cv_ratio)
                    if make_predictions:
                        predictions = MakePredictionswithCalibTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierwithCalibTask(task_core)
                        print "training"
                        task.run()
                        print "train_finished"
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation_full():
        summaries = []
        print "ok"
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                S_scores = []
                E_scores = []
                y_cv = []
                pred = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=False, cv_ratio=cv_ratio)
                    data = CrossValidationScoreFullTask(task_core).run()
                    # pool held-out labels and predictions across all targets
                    y_cv = np.concatenate((y_cv, data.y_cv), axis=-1)
                    pred = np.concatenate((pred, data.pred), axis=-1)

                print y_cv
                print pred
                fpr, tpr, thresholds = metrics.roc_curve(y_cv, pred, pos_label=1)
                print 'AUC'
                print metrics.auc(fpr, tpr)

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                S_scores = []
                E_scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=False, cv_ratio=cv_ratio)
                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)
                    print '%.3f' % score, 'S=%.4f' % data.S_auc
                    S_scores.append(data.S_auc)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary
                if len(S_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, S_scores)
                    print 'S', summary

        print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    elif build_target == 'make_predictions_with_calib':
        train_model_with_calib(make_predictions=True)
    elif build_target == 'cv_full':
        do_cross_validation_full()
    else:
        raise Exception("unknown build target %s" % build_target)

    send_message('your program finished running on mercury')
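
# ---------------------------------------------------------------------------
# do_cross_validation_full() above pools the held-out labels and predictions
# from every target and scores them with a single ROC AUC, rather than
# averaging per-target AUCs. A minimal self-contained sketch of that pooled
# scoring (synthetic labels/scores; standard scikit-learn calls):
import numpy as np
from sklearn import metrics

y_cv = np.array([0, 0, 1, 1, 0, 1])                # pooled CV labels
pred = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])   # pooled CV scores
fpr, tpr, thresholds = metrics.roc_curve(y_cv, pred, pos_label=1)
print('AUC %.4f' % metrics.auc(fpr, tpr))
# ---------------------------------------------------------------------------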
def run_seizure_detection(build_target, targets=None):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers are chosen, pipelines are
    chosen, and the chosen build_target ('train_data', 'test_data', 'cv', 'train_model',
    'make_predictions') is run across all combinations of (targets, pipelines, classifiers).
    """
    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    import seizure.transforms
    seizure.transforms.cache_dir = cache_dir
    submission_dir = str(settings['submission-dir'])
    seizure.tasks.task_predict = str(settings.get('task')) == 'predict'
    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    ts = time.get_millis()

    if not targets:
        if seizure.tasks.task_predict:
            # add a leader-board weight to each target. I am using the number of test examples as the weight,
            # assuming all test examples are weighted equally on the leader-board
            targets = [
                ('Dog_1', 502),
                ('Dog_2', 1000),
                ('Dog_3', 907),
                ('Dog_4', 990),
                ('Dog_5', 191),
                ('Patient_1', 195),
                ('Patient_2', 150),
            ]
        else:
            targets = [
                'Dog_1',
                'Dog_2',
                'Dog_3',
                'Dog_4',
                'Dog_5',
                'Patient_1',
                'Patient_2',
                'Patient_3',
                'Patient_4',
                'Patient_5',
                'Patient_6',
                'Patient_7',
                'Patient_8'
            ]

    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        #Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[Stats()]),
        # Pipeline(gen_ictal=False, pipeline=[DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), MFCC()]),
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        #Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # winning detection submission
        # Pipeline(gen_ictal=False, pipeline=[WindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        #Pipeline(gen_ictal=False, pipeline=[StdWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        #Pipeline(gen_ictal=False, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        # Pipeline(gen_ictal=True, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        # Pipeline(gen_ictal=2, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        # Pipeline(gen_ictal=4, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        #Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindow1FFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        # Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands1('usf2', 60, p=2)]),
        # Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands1('141022-PCA-model', 60, p=2)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands1('141022-ICA-model-1', 60, p=2)]),
        # Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands1('ica', 60, p=2, timecorr=True)]),
        #Pipeline(gen_ictal=-8.5, pipeline=[MedianWindowBands1('usf', 60, p=2)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands('usf', 10, p=2, window='hammingP2')]),
        #Pipeline(gen_ictal=-8, pipeline=[AllBands('usf', 60)]),
        Pipeline(gen_ictal=-8, pipeline=[AllTimeCorrelation('usf', 60)]),
        #Pipeline(gen_ictal=-8, pipeline=[MaxDiff(60)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowBandsTimeCorrelation('usf', 60)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowBandsCorrelation('usf', 60)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowTimeCorrelation('usf', 60)]),
        #Pipeline(gen_ictal=False, pipeline=[MedianWindow1FFTWithTimeFreqCorrelation(1, 49, 400, 'usf', 600)]),
        #Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCov2(1, 48, 400, 'usf', 600)]),
        #Pipeline(gen_ictal=8, pipeline=[CleanMedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, window='hammingP2')]),
        #Pipeline(gen_ictal=8, pipeline=[CleanCorMedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, window='hammingP2')]),
        #Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, subsample=2)]),
        # Pipeline(gen_ictal=16, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        #Pipeline(gen_ictal=16, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 96, 400, 'usf', 600, window='hamming')]),
        #Pipeline(gen_ictal=16, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, window='hamming2')]),
        #Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, window='hamming0')]),
        # Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, window='square0')]),
        # Pipeline(gen_ictal=2, pipeline=[Variance(nwindows=600)]),
        # UnionPipeline(gen_ictal=2, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600), Variance(nwindows=600)]),
        #Pipeline(gen_ictal=True, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, nunits=4)]),
        #Pipeline(gen_ictal=True, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 50, 400, 'usf', 600)]),
        #Pipeline(gen_ictal=False, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, [0.5, 0.9])]),
        # Pipeline(gen_ictal=False, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, [0.1, 0.9])]),
        # Pipeline(gen_ictal=False, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600, [0.05, 0.5, 0.95])]),
        # Pipeline(gen_ictal=False, pipeline=[BoxWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf', 600)]),
        # Pipeline(gen_ictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # higher score than winning submission
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=True, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]

    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=50, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf50mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10, bootstrap=True, n_jobs=-1, random_state=0), 'rf3000mss1md10Bt'),
        # (RandomForestClassifier(n_estimators=1000, min_samples_split=1, max_depth=10, bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md10Bf'),
        (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10, bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md10Bf'),
        # (RandomForestClassifier(n_estimators=10000, min_samples_split=1, max_depth=10, bootstrap=False, n_jobs=-1, random_state=0), 'rf10000mss1md10Bf'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=3, bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md3Bf'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=30, bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md30Bf'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10, max_features='log2', bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md10BfmfL2'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10, max_features=200, bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md10Bfmf200'),
        # (GradientBoostingClassifier(n_estimators=500, min_samples_split=1), 'gbc500mss1'),
        # (GradientBoostingClassifier(n_estimators=1000, min_samples_split=1, random_state=0), 'gbc1000mss1'),
        # (GradientBoostingClassifier(n_estimators=1000, min_samples_split=1, random_state=0, learning_rate=0.03), 'gbc1000mss1lr03'),
        # (GradientBoostingClassifier(n_estimators=1000, min_samples_split=1, random_state=0, learning_rate=0.01), 'gbc1000mss1lr01'),
        # (GradientBoostingClassifier(n_estimators=1000, min_samples_split=1, random_state=0, learning_rate=0.01, max_depth=1000), 'gbc1000mss1lr01md1000'),
    ]

    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return np.any(np.array([isinstance(classifier, clazz) for clazz in clazzes]) == True)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                if seizure.tasks.task_predict:
                    guesses = ['clip,preictal']
                else:
                    guesses = ['clip,seizure,early']
                classifier_filenames = []
                for target in targets:
                    if isinstance(target, tuple):
                        target, leaderboard_weight = target
                    else:
                        leaderboard_weight = 1
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=pipeline.gen_ictal, cv_ratio=cv_ratio)
                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                S_scores = []
                E_scores = []
                leaderboard_weights = []
                for target in targets:
                    if isinstance(target, tuple):
                        target, leaderboard_weight = target
                    else:
                        leaderboard_weight = 1
                    leaderboard_weights.append(leaderboard_weight)
                    print 'Processing %s (classifier %s)' % (target, classifier_name)
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=pipeline.gen_ictal, cv_ratio=cv_ratio)
                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)
                    print '%.3f' % score, 'S=%.4f' % data.S_auc, 'E=%.4f' % data.E_auc
                    S_scores.append(data.S_auc)
                    E_scores.append(data.E_auc)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    weighted_average = np.average(scores, weights=leaderboard_weights)
                    summary = get_score_summary(name, scores, weighted_average)
                    summaries.append((summary, weighted_average))
                    print summary
                if len(S_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, S_scores, np.mean(S_scores))
                    print 'S', summary
                if len(E_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, E_scores, np.mean(E_scores))
                    print 'E', summary

        print_results(summaries)

    def do_train_data():
        for pipeline in pipelines:
            print 'Using pipeline %s' % (pipeline.get_name())
            for target in targets:
                if isinstance(target, tuple):
                    target, leaderboard_weight = target
                task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                     target=target, pipeline=pipeline,
                                     classifier_name=None, classifier=None, normalize=None,
                                     gen_ictal=pipeline.gen_ictal, cv_ratio=None)
                # call the load-data tasks for positive and negative examples (ignore the merge of the two)
                TrainingDataTask(task_core).run()

    def do_test_data():
        for pipeline in pipelines:
            print 'Using pipeline %s' % (pipeline.get_name())
            for target in targets:
                if isinstance(target, tuple):
                    target, leaderboard_weight = target
                task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                     target=target, pipeline=pipeline,
                                     classifier_name=None, classifier=None, normalize=None,
                                     gen_ictal=pipeline.gen_ictal, cv_ratio=None)
                LoadTestDataTask(task_core).run()

    if build_target == 'train_data':
        do_train_data()
    elif build_target == 'test_data':
        do_test_data()
    elif build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
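
# ---------------------------------------------------------------------------
# The leaderboard weighting in do_cross_validation() above is a plain weighted
# mean of per-target CV scores, using each target's test-example count (the
# tuples in the predict-mode targets list) as its weight. A minimal numpy
# sketch with made-up scores:
import numpy as np

scores = [0.81, 0.74, 0.90]        # per-target CV AUCs (illustrative only)
weights = [502, 1000, 907]         # test-example counts from the targets list
print('weighted CV score %.4f' % np.average(scores, weights=weights))
# ---------------------------------------------------------------------------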
def run_seizure_detection(build_target):
    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    figure_dir = str(settings['figure-dir'])
    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    splitsize = 30000
    ts = time.get_millis()
    bin_size = 50

    targets = [
        '1',
        '2',
        '3',
    ]

    pipelines = [
        # This is better than winning submission
        Pipeline(gen_preictal=True,
                 pipeline=[GetFeature(50, 2500, 400, bin_size, 'usf',
                                      onlyfd_dfa=False, with_dfa=False, with_dy=False,
                                      with_six=True, with_equal_freq=True,
                                      with_mc=False, with_time_corr=True,
                                      smooth=True, smooth_Hz=160, power_edge=50,
                                      with_square=True, with_log=False, with_sqrt=False,
                                      splitsize=splitsize, calibrate=False)]),
        #Pipeline(gen_preictal=True, pipeline=[only_FD_DFA(onlyfd_dfa=True)]),
    ]

    classifiers = [
        'GB',
        #'LSVC',
        # 'ET'
    ]

    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return np.any(np.array([isinstance(classifier, clazz) for clazz in clazzes]) == True)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for classifier in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier)
                guesses = ['File,Class']
                classifier_filenames = []
                #plot2file = PdfPages(os.path.join(figure_dir, ('figure%d-_%s_%s_.pdf' % (ts, classifier, pipeline.get_name()))))
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio, bin_size=bin_size)
                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        # task = TrainClassifierTask(task_core)
                        # task.run()
                        # classifier_filenames.append(task.filename())
                        print 'not implemented'

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    if build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
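
# ---------------------------------------------------------------------------
# A minimal driver sketch shared by all of the variants above. The CLI shape
# is an assumption for illustration, not taken from the original repos:
if __name__ == '__main__':
    import sys
    run_seizure_detection(sys.argv[1])  # e.g. 'cv', 'train_model', 'make_predictions'
# ---------------------------------------------------------------------------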
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers are chosen, pipelines
    are chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions',
    'predict_all') is run across all combinations of (targets, pipelines, classifiers).
    """
    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    figure_dir = str(settings['figure-dir'])
    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    ts = time.get_millis()

    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Dog_5',
        'Patient_1',
        'Patient_2',
    ]
    pipelines = [
        # NOTE: you can enable multiple pipelines to run them all and compare results
        Pipeline(gen_preictal=True, pipeline=[FFTWithTimeFreqCorrelation(50, 2500, 400, 18, 'usf')]),  # winning submission
    ]
    classifiers = [
        # NOTE: you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, max_features=0.5, bootstrap=False, n_jobs=-1, random_state=0), 'rf300mss1mf05Bfrs0'),
        # (ExtraTreesClassifier(n_estimators=3000, min_samples_split=1, max_features=0.15, bootstrap=False, n_jobs=-1, random_state=0), 'ET3000mss1mf015Bfrs0'),
        # (GradientBoostingClassifier(n_estimators=3000, min_samples_split=1, max_features=0.15, learning_rate=0.02, subsample=0.5, random_state=0), 'GBRT3000mms1mf015Lr002Ss05rs0'),
        (SVC(C=1e6, kernel='rbf', gamma=0.01, coef0=0.0, shrinking=True, probability=True,
             tol=1e-5, cache_size=2000, class_weight='auto', max_iter=-1, random_state=0),
         'svcce6rbfg001co0stte-5cwautors0'),
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return np.any(np.array([isinstance(classifier, clazz) for clazz in clazzes]) == True)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,preictal']
                classifier_filenames = []
                plot2file = PdfPages(os.path.join(figure_dir, ('figure%d-_%s_%s_.pdf' % (ts, classifier_name, pipeline.get_name()))))
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio, plot2file=plot2file)
                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')
                plot2file.close()

    def predict_all(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                lines = ['clip,preictal']
                subjectID = 0
                # Keep the accumulators distinct: chaining them to one shared
                # list object would make all four names alias the same list.
                X_train = y_train = X_test = None
                test_size = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio)
                    data = GetCrossSubjectDataTask(task_core).run()
                    test_size.append(np.shape(data.X_test)[0])
                    if subjectID > 0:
                        X_train = np.concatenate((X_train, data.X_train), axis=0)
                        y_train = np.concatenate((y_train, data.y_train), axis=0)
                        X_test = np.concatenate((X_test, data.X_test), axis=0)
                    else:
                        X_train = data.X_train
                        y_train = data.y_train
                        X_test = data.X_test
                    subjectID += 1

                # Training
                task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                     target=[], pipeline=pipeline,
                                     classifier_name=classifier_name, classifier=classifier,
                                     normalize=should_normalize(classifier),
                                     gen_preictal=pipeline.gen_preictal,
                                     cv_ratio=cv_ratio)
                # Collapse the multi-valued labels to binary {0, 1}
                y_train = np.ceil(0.1 * y_train)
                y_train = y_train.astype('int_')  # astype returns a copy, so rebind it
                if should_normalize(classifier):
                    X_train, temp = normalize_data(X_train, X_train)

                print "Training ..."
                print 'Dim', np.shape(X_train), np.shape(y_train)
                start = time.get_seconds()
                classifier.fit(X_train, y_train)
                elapsedSecs = time.get_seconds() - start
                print "t=%ds" % int(elapsedSecs)

                # Calibrate the classifier's probabilities with a logistic regression
                # fitted on the training-set predictions
                y_estimate = classifier.predict_proba(X_train)
                lr = LogisticRegression(random_state=0)
                lr.fit(y_estimate, y_train)
                predictions_proba = classifier.predict_proba(X_test)
                predictions_calibrated = lr.predict_proba(predictions_proba)

                # Output: average the calibrated predictions over the 12 samples
                # that make up each test segment
                m = 0
                totalSample = 12
                startIdx = 0
                for target in targets:
                    for i in range(test_size[m] / totalSample):
                        j = i + 1
                        nstr = '%04d' % j  # zero-pad the segment index to four digits
                        preictalOverAllSample = 0
                        for k in range(totalSample):
                            p = predictions_calibrated[i * totalSample + k + startIdx]
                            preictal = translate_prediction(p)
                            preictalOverAllSample += preictal / totalSample
                        newline = '%s_test_segment_%s.mat,%.15f' % (target, nstr, preictalOverAllSample)
                        lines.append(newline)
                        print newline
                    startIdx = startIdx + test_size[m]
                    m += 1

                filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                filename = os.path.join(submission_dir, filename)
                with open(filename, 'w') as f:
                    print >> f, '\n'.join(lines)
                print 'wrote', filename

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio)
                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)
                    print '%.3f' % score
                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary
        print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    elif build_target == 'predict_all':
        predict_all(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
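# ---------------------------------------------------------------------------
# NOTE: predict_all stacks a LogisticRegression on the base classifier's
# training-set probabilities (a Platt-scaling-style calibration) before
# averaging the 12 per-window predictions of each test segment. A
# self-contained sketch of the two-stage idea on synthetic data; the data is
# random and purely illustrative. Calibrating on the training set, as done
# above, risks overfitting -- a held-out split would be the safer choice.
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_tr = rng.randn(200, 5)
y_tr = (X_tr[:, 0] > 0).astype(int)  # synthetic binary labels
X_te = rng.randn(50, 5)

base = SVC(probability=True, random_state=0)
base.fit(X_tr, y_tr)

lr = LogisticRegression(random_state=0)
lr.fit(base.predict_proba(X_tr), y_tr)  # stage 2: calibrate on probabilities

calibrated = lr.predict_proba(base.predict_proba(X_te))
print calibrated[:3]  # calibrated [P(class 0), P(class 1)] rows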
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers are chosen, pipelines
    are chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions') is
    run across all combinations of (targets, pipelines, classifiers).
    """
    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    ts = time.get_millis()

    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Patient_1',
        'Patient_2',
        'Patient_3',
        'Patient_4',
        'Patient_5',
        'Patient_6',
        'Patient_7',
        'Patient_8',
    ]
    pipelines = [
        Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),
    ]
    classifiers = [
        (RandomForestClassifier(n_estimators=3000, min_samples_split=2, bootstrap=False,
                                n_jobs=4, random_state=0), 'rf3000'),
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return np.any(np.array([isinstance(classifier, clazz) for clazz in clazzes]) == True)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,seizure,early']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=pipeline.gen_ictal,
                                         cv_ratio=cv_ratio)
                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier),
                                         gen_ictal=pipeline.gen_ictal,
                                         cv_ratio=cv_ratio)
                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)
                    print target, 'Seizure_AUC=', data.S_auc, 'Early_AUC=', data.E_auc

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
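# ---------------------------------------------------------------------------
# NOTE: all of the variants above share the same entry-point signature, so a
# thin driver is enough to pick a build target from the command line. This is
# a minimal sketch; the module name `seizure_detection` is an assumption about
# the repo layout, not a name taken from the original code.
import sys
from seizure_detection import run_seizure_detection

if __name__ == '__main__':
    build_target = sys.argv[1] if len(sys.argv) > 1 else 'cv'
    run_seizure_detection(build_target)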