Example #1
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers and pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions') is run
    across all combinations of (targets, pipelines, classifiers).
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)

    ts = time.get_millis()

    targets = [
        #'Dog_1',
        #'Dog_2',
        #'Dog_3',
        #'Dog_4',
        #'Patient_1',
        #'Patient_2',
        #'Patient_3',
        #'Patient_4',
        #'Patient_5',
        #'Patient_6',
        #'Patient_7',
        'Patient_8'
    ]
    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[Stats()]),
        # Pipeline(gen_ictal=False, pipeline=[DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), MFCC()]),
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]),  # winning submission
        # Pipeline(gen_ictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]), # higher score than winning submission
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]
    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=50, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf50mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),
        (RandomForestClassifier(n_estimators=3000,
                                min_samples_split=1,
                                bootstrap=False,
                                n_jobs=4,
                                random_state=0), 'rf3000mss1Bfrs0'),
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (
                    pipeline.get_name(), classifier_name)
                guesses = ['clip,seizure,early']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(
                        cached_data_loader=cached_data_loader,
                        data_dir=data_dir,
                        target=target,
                        pipeline=pipeline,
                        classifier_name=classifier_name,
                        classifier=classifier,
                        normalize=should_normalize(classifier),
                        gen_ictal=pipeline.gen_ictal,
                        cv_ratio=cv_ratio)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name,
                                                           pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (
                    pipeline.get_name(), classifier_name)
                scores = []
                S_scores = []
                E_scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target,
                                                             classifier_name)

                    task_core = TaskCore(
                        cached_data_loader=cached_data_loader,
                        data_dir=data_dir,
                        target=target,
                        pipeline=pipeline,
                        classifier_name=classifier_name,
                        classifier=classifier,
                        normalize=should_normalize(classifier),
                        gen_ictal=pipeline.gen_ictal,
                        cv_ratio=cv_ratio)

                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score

                    scores.append(score)

                    print '%.3f' % score, 'S=%.4f' % data.S_auc, 'E=%.4f' % data.E_auc
                    S_scores.append(data.S_auc)
                    E_scores.append(data.E_auc)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary
                if len(S_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, S_scores)
                    print 'S', summary
                if len(E_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, E_scores)
                    print 'E', summary

            print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
Example #2
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers and pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions',
    'predict_all') is run across all combinations of (targets, pipelines, classifiers).
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    figure_dir = str(settings['figure-dir'])

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)

    ts = time.get_millis()

    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Dog_5',
        'Patient_1',
        'Patient_2',
    ]
    pipelines = [
        # NOTE: you can enable multiple pipelines to run them all and compare results
        Pipeline(gen_preictal=True,
                 pipeline=[
                     FFTWithTimeFreqCorrelation(50, 2500, 400, 18, 'usf')
                 ]),  # winning submission
    ]
    classifiers = [
        # NOTE: you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, max_features=0.5, bootstrap=False, n_jobs=-1, random_state=0), 'rf300mss1mf05Bfrs0'),
        # (ExtraTreesClassifier(n_estimators=3000, min_samples_split=1, max_features=0.15, bootstrap=False, n_jobs=-1, random_state=0), 'ET3000mss1mf015Bfrs0'),
        # (GradientBoostingClassifier(n_estimators=3000, min_samples_split=1, max_features=0.15, learning_rate=0.02, subsample=0.5, random_state=0), 'GBRT3000mms1mf015Lr002Ss05rs0'),
        (SVC(C=1e6,
             kernel='rbf',
             gamma=0.01,
             coef0=0.0,
             shrinking=True,
             probability=True,
             tol=1e-5,
             cache_size=2000,
             class_weight='auto',
             max_iter=-1,
             random_state=0), 'svcce6rbfg001co0stte-5cwautors0'),
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print('Using pipeline %s with classifier %s' %
                      (pipeline.get_name(), classifier_name))
                guesses = ['clip,preictal']
                classifier_filenames = []
                plot2file = PdfPages(
                    os.path.join(figure_dir,
                                 ('figure%d-_%s_%s_.pdf' %
                                  (ts, classifier_name, pipeline.get_name()))))
                for target in targets:
                    task_core = TaskCore(
                        cached_data_loader=cached_data_loader,
                        data_dir=data_dir,
                        target=target,
                        pipeline=pipeline,
                        classifier_name=classifier_name,
                        classifier=classifier,
                        normalize=should_normalize(classifier),
                        gen_preictal=pipeline.gen_preictal,
                        cv_ratio=cv_ratio,
                        plot2file=plot2file)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name,
                                                           pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print('\n'.join(guesses), file=f)
                    print('wrote', filename)
                else:
                    print('Trained classifiers ready in %s' % cache_dir)
                    for filename in classifier_filenames:
                        print(os.path.join(cache_dir, filename + '.pickle'))

                plot2file.close()

    def predict_all(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print('Using pipeline %s with classifier %s' %
                      (pipeline.get_name(), classifier_name))
                lines = ['clip,preictal']
                subjectID = 0
                X_train = y_train = X_test = None
                test_size = []
                for target in targets:
                    task_core = TaskCore(
                        cached_data_loader=cached_data_loader,
                        data_dir=data_dir,
                        target=target,
                        pipeline=pipeline,
                        classifier_name=classifier_name,
                        classifier=classifier,
                        normalize=should_normalize(classifier),
                        gen_preictal=pipeline.gen_preictal,
                        cv_ratio=cv_ratio)

                    data = GetCrossSubjectDataTask(task_core).run()
                    test_size.append(np.shape(data.X_test)[0])
                    if subjectID > 0:
                        X_train = np.concatenate((X_train, data.X_train),
                                                 axis=0)
                        y_train = np.concatenate((y_train, data.y_train),
                                                 axis=0)
                        X_test = np.concatenate((X_test, data.X_test), axis=0)
                    else:
                        X_train = data.X_train
                        y_train = data.y_train
                        X_test = data.X_test
                    subjectID += 1

                #Training
                task_core = TaskCore(cached_data_loader=cached_data_loader,
                                     data_dir=data_dir,
                                     target=[],
                                     pipeline=pipeline,
                                     classifier_name=classifier_name,
                                     classifier=classifier,
                                     normalize=should_normalize(classifier),
                                     gen_preictal=pipeline.gen_preictal,
                                     cv_ratio=cv_ratio)
                y_train = np.ceil(0.1 * y_train).astype('int_')
                if should_normalize(classifier):
                    X_train, temp = normalize_data(X_train, X_train)

                print("Training ...")
                print('Dim', np.shape(X_train), np.shape(y_train))
                start = time.get_seconds()
                classifier.fit(X_train, y_train)
                elapsedSecs = time.get_seconds() - start
                print("t=%ds" % int(elapsedSecs))

                y_estimate = classifier.predict_proba(X_train)
                lr = LogisticRegression(random_state=0)
                lr.fit(y_estimate, y_train)
                predictions_proba = classifier.predict_proba(X_test)
                predictions_calibrated = lr.predict_proba(predictions_proba)

                #output
                m = 0
                totalSample = 12
                startIdx = 0
                for target in targets:
                    for i in range(test_size[m] // totalSample):
                        nstr = '%04d' % (i + 1)  # zero-padded segment number

                        preictalOverAllSample = 0
                        for k in range(totalSample):
                            p = predictions_calibrated[i * totalSample + k +
                                                       startIdx]
                            preictal = translate_prediction(p)
                            preictalOverAllSample += preictal / totalSample

                        newline = '%s_test_segment_%s.mat,%.15f' % (
                            target, nstr, preictalOverAllSample)
                        lines.append(newline)

                    print(newline)
                    startIdx = startIdx + test_size[m]
                    m += 1

                filename = 'submission%d-%s_%s.csv' % (ts, classifier_name,
                                                       pipeline.get_name())
                filename = os.path.join(submission_dir, filename)
                with open(filename, 'w') as f:
                    print('\n'.join(lines), file=f)
                print('wrote', filename)

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print('Using pipeline %s with classifier %s' %
                      (pipeline.get_name(), classifier_name))
                scores = []
                for target in targets:
                    print('Processing %s (classifier %s)' %
                          (target, classifier_name))

                    task_core = TaskCore(
                        cached_data_loader=cached_data_loader,
                        data_dir=data_dir,
                        target=target,
                        pipeline=pipeline,
                        classifier_name=classifier_name,
                        classifier=classifier,
                        normalize=should_normalize(classifier),
                        gen_preictal=pipeline.gen_preictal,
                        cv_ratio=cv_ratio)

                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score

                    scores.append(score)

                    print('%.3f' % score)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print(summary)

            print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    elif build_target == 'predict_all':
        predict_all(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
Example #3
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers and pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions') is run
    across all combinations of (targets, pipelines, classifiers).
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)

    ts = time.get_millis()

    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Patient_1',
        'Patient_2',
        'Patient_3',
        'Patient_4',
        'Patient_5',
        'Patient_6',
        'Patient_7',
        'Patient_8'
    ]
    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[Stats()]),
        # Pipeline(gen_ictal=False, pipeline=[DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), MFCC()]),
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]), # winning submission
        # Pipeline(gen_ictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]), # higher score than winning submission
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]
    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=50, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf50mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),

        # NOTE(mike): The original submission classifier used min_samples_split=1, but I had to change it to 2 after upgrading scikit.
        # I'm not even sure min_samples_split=1 makes sense in hindsight: how can you split on one sample? Anyway, to get the repo
        # functional again with newer libraries it's now 2.
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss1Bfrs0'),
        (RandomForestClassifier(n_estimators=3000, min_samples_split=2, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss2Bfrs0'),
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,seizure,early']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_ictal=pipeline.gen_ictal,
                                         cv_ratio=cv_ratio)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                S_scores = []
                E_scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)

                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_ictal=pipeline.gen_ictal,
                                         cv_ratio=cv_ratio)

                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score

                    scores.append(score)

                    print '%.3f' % score, 'S=%.4f' % data.S_auc, 'E=%.4f' % data.E_auc
                    S_scores.append(data.S_auc)
                    E_scores.append(data.E_auc)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary
                if len(S_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, S_scores)
                    print 'S', summary
                if len(E_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, E_scores)
                    print 'E', summary

            print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
Example #4
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers and pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions') is run
    across all combinations of (targets, pipelines, classifiers).
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)

    ts = time.get_millis()

    targets = [
        #'Dog_1',
        #'Dog_2',
        #'Dog_3',
        #'Dog_4',
        'Dog_5',
        #'Patient_1',
        #'Patient_2'
    ]
    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        Pipeline(gen_preictal=False, pipeline=[Resample(400), MaximalCrossCorrelation()]),
        #Pipeline(gen_preictal=False, pipeline=[CorrelationWithVariance(with_eigen=False)]),
        #Pipeline(gen_preictal=True, pipeline=[CorrelationWithVariance(with_eigen=True)]),
        #Pipeline(gen_preictal=True, pipeline=[CorrelationWithVariance(with_eigen=False)]),
        #Pipeline(gen_preictal=True,  pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[MFCC()]),
        #Pipeline(gen_preictal=False, pipeline=[CorrelationWithVariance()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT()]),
        #Pipeline(gen_preictal=False, pipeline=[FFT(), Magnitude(), Log10()]),
        #Pipeline(gen_preictal=False, pipeline=[Stats()]),
        #Pipeline(gen_preictal=False, pipeline=[DaubWaveletStats(4)]),
        #Pipeline(gen_preictal=False, pipeline=[Resample(400), DaubWaveletStats(4)]),
        #Pipeline(gen_preictal=False, pipeline=[Resample(400), MFCC()]),
        #Pipeline(gen_preictal=False, pipeline=[FFTWithTimeFreqCorrelation(65, 100, 400, 'us')]),
        #Pipeline(gen_preictal=False, pipeline=[FFTWithTimeFreqCorrelation(30, 45, 400, 'us')]),
        #Pipeline(gen_preictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        #Pipeline(gen_preictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        #Pipeline(gen_preictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]), # winning submission
        #Pipeline(gen_preictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]), # higher score than winning submission
        #Pipeline(gen_preictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        #Pipeline(gen_preictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        #Pipeline(gen_preictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        #Pipeline(gen_preictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        #Pipeline(gen_preictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]
    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        (RandomForestClassifier(n_estimators=50, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf50mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=1000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf1000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=2000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf2000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=4000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf4000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=5000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf5000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=6000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf6000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=7000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf7000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=8000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf8000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=10000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf10000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=9000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf9000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=11000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf11000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=12000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf12000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=13000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf13000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=14000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf14000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=15000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf15000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=16000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf16000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=17000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf17000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=18000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf18000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=19000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf19000mss1Bfrs0'),
        #(RandomForestClassifier(n_estimators=20000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf20000mss1Bfrs0'),
        #(LogisticRegression(), 'logistic_regression'),
        #(LinearSVC(C=0.1), 'linearsvc_c0.1'),
        #(LinearSVC(C=1), 'linearsvc_c1'),
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,preictal']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)

                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio)

                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score

                    scores.append(score)

                    print '%.3f' % score

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary

            print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
Example #5
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers and pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions',
    'make_predictions_with_calib', 'cv_full') is run across all combinations of
    (targets, pipelines, classifiers).
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)

    ts = time.get_millis()

    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Dog_5',
        'Patient_1_downsample',
        'Patient_2_downsample',
    ]
    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        # Pipeline(pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        # Pipeline(pipeline=[RFFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        Pipeline(pipeline=[TimeAliasing(), FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(pipeline=[TimeAliasing(),FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        # Pipeline(pipeline=[FFT(), Magnitude(), Log10()]),
        # Pipeline(pipeline=[Stats()]),
        # Pipeline(pipeline=[DaubWaveletStats(4)]),
        # Pipeline(pipeline=[Resample(400), DaubWaveletStats(4)]),
        # Pipeline(pipeline=[Resample(400), MFCC()]),
        # Pipeline(pipeline=[TimeAliasing(),FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(pipeline=[TimeAliasing(),FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]), # winning submission
        # Pipeline(pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]), # higher score than winning submission
        # Pipeline(pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(pipeline=[TimeAliasing(),TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[TimeAliasing(),TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[TimeAliasing(),FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True,with_fft = True)]),
        # Pipeline(pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        # Pipeline(pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]
    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        (RandomForestClassifier(n_estimators=3, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss1Bfrs0'),
        # (GaussianNB(),'gbn'),
        # (BernoulliRBM(n_components=100),'dbn'),
        # (SVC(probability = True),'svc100'),
        # (LDA(),'lda'),
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,preictal']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_ictal=False,
                                         cv_ratio=cv_ratio)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        print "training"
                        task.run()
                        print "train_finished"
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def train_model_with_calib(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,preictal']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_ictal=False,
                                         cv_ratio=cv_ratio)

                    if make_predictions:
                        predictions = MakePredictionswithCalibTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierwithCalibTask(task_core)
                        print "training"
                        task.run()
                        print "train_finished"
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation_full():
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                y_cv = []
                pred = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_ictal=False,
                                         cv_ratio=cv_ratio)

                    data = CrossValidationScoreFullTask(task_core).run()
                    y_cv = np.concatenate((y_cv, data.y_cv), axis=-1)
                    pred = np.concatenate((pred, data.pred), axis=-1)
                print y_cv
                print pred
                fpr, tpr, thresholds = metrics.roc_curve(y_cv, pred, pos_label=1)
                print 'AUC'
                print metrics.auc(fpr, tpr)

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                S_scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)

                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_ictal=False,
                                         cv_ratio=cv_ratio)

                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)

                    print '%.3f' % score, 'S=%.4f' % data.S_auc
                    S_scores.append(data.S_auc)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary
                if len(S_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, S_scores)
                    print 'S', summary

            print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    elif build_target == 'make_predictions_with_calib':
        train_model_with_calib(make_predictions=True)
    elif build_target == 'cv_full':
        do_cross_validation_full()
    else:
        raise Exception("unknown build target %s" % build_target)

    send_message('your program finished running on mercury')
Example #6
def run_seizure_detection(build_target, targets=None):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from the settings file are configured, classifiers and pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions') is run
    across all combinations of (targets, pipelines, classifiers).
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    import seizure.transforms
    import seizure.tasks
    seizure.transforms.cache_dir = cache_dir
    submission_dir = str(settings['submission-dir'])
    seizure.tasks.task_predict = str(settings.get('task')) == 'predict'

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)

    ts = time.get_millis()

    if not targets:
        if seizure.tasks.task_predict:
            # add a leaderboard weight to each target. I am using the number of test examples
            # as the weight, assuming all test examples are weighted equally on the leaderboard
            targets = [
                ('Dog_1',502),
                ('Dog_2',1000),
                ('Dog_3',907),
                ('Dog_4',990),
                ('Dog_5',191),
                ('Patient_1',195),
                ('Patient_2',150),
            ]
        else:
            targets = [
                'Dog_1',
                'Dog_2',
                'Dog_3',
                'Dog_4',
                'Dog_5',
                'Patient_1',
                'Patient_2',
                'Patient_3',
                'Patient_4',
                'Patient_5',
                'Patient_6',
                'Patient_7',
                'Patient_8'
            ]

    pipelines = [
        # NOTE(mike): you can enable multiple pipelines to run them all and compare results
        #Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 48), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 64), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 96), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 128), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Slice(1, 160), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[FFT(), Magnitude(), Log10()]),
        # Pipeline(gen_ictal=False, pipeline=[Stats()]),
        # Pipeline(gen_ictal=False, pipeline=[DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), DaubWaveletStats(4)]),
        # Pipeline(gen_ictal=False, pipeline=[Resample(400), MFCC()]),
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'us')]),
        #Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]), # winning detection submission
        # Pipeline(gen_ictal=False, pipeline=[WindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        #Pipeline(gen_ictal=False, pipeline=[StdWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        #Pipeline(gen_ictal=False, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        # Pipeline(gen_ictal=True, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        # Pipeline(gen_ictal=2, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        # Pipeline(gen_ictal=4, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        #Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindow1FFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        # Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands1('usf2', 60, p=2)]),
        # Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands1('141022-PCA-model', 60, p=2)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands1('141022-ICA-model-1', 60, p=2)]),
        # Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands1('ica', 60, p=2, timecorr=True)]),
        #Pipeline(gen_ictal=-8.5, pipeline=[MedianWindowBands1('usf', 60, p=2)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowBands('usf', 10, p=2, window='hammingP2')]),
        #Pipeline(gen_ictal=-8, pipeline=[AllBands('usf', 60)]),
        Pipeline(gen_ictal=-8, pipeline=[AllTimeCorrelation('usf', 60)]),
        #Pipeline(gen_ictal=-8, pipeline=[MaxDiff(60)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowBandsTimeCorrelation('usf', 60)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowBandsCorrelation('usf', 60)]),
        #Pipeline(gen_ictal=-8, pipeline=[MedianWindowTimeCorrelation('usf', 60)]),
        #Pipeline(gen_ictal=False, pipeline=[MedianWindow1FFTWithTimeFreqCorrelation(1, 49, 400, 'usf',600)]),
        #Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCov2(1, 48, 400, 'usf',600)]),
        #Pipeline(gen_ictal=8, pipeline=[CleanMedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600, window='hammingP2')]),
        #Pipeline(gen_ictal=8, pipeline=[CleanCorMedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600, window='hammingP2')]),
        #Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600,subsample=2)]),
        # Pipeline(gen_ictal=16, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        #Pipeline(gen_ictal=16, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 96, 400, 'usf',600, window='hamming')]),
        #Pipeline(gen_ictal=16, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600, window='hamming2')]),
        #Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600, window='hamming0')]),
        # Pipeline(gen_ictal=8, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600, window='square0')]),
        # Pipeline(gen_ictal=2, pipeline=[Variance(nwindows=600)]),
        # UnionPipeline(gen_ictal=2, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600),Variance(nwindows=600)]),
        #Pipeline(gen_ictal=True, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600, nunits=4)]),
        #Pipeline(gen_ictal=True, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 50, 400, 'usf',600)]),
        #Pipeline(gen_ictal=False, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600,[0.5,0.9])]),
        # Pipeline(gen_ictal=False, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600,[0.1,0.9])]),
        # Pipeline(gen_ictal=False, pipeline=[MedianWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600,[0.05,0.5,0.95])]),
        # Pipeline(gen_ictal=False, pipeline=[BoxWindowFFTWithTimeFreqCorrelation(1, 48, 400, 'usf',600)]),
        # Pipeline(gen_ictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')]), # higher score than winning submission
        # Pipeline(gen_ictal=False, pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=True,  pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'none')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeCorrelation(400, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'usf', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=True, with_eigen=False)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'us', with_corr=False, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[FreqCorrelation(1, 48, 'none', with_corr=True, with_eigen=True)]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'us')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'usf')]),
        # Pipeline(gen_ictal=False, pipeline=[TimeFreqCorrelation(1, 48, 400, 'none')]),
    ]
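    # Illustration (an assumption, matching the NOTE above): with two pipelines
    # enabled, e.g.
    #   pipelines = [Pipeline(gen_ictal=-8, pipeline=[AllTimeCorrelation('usf', 60)]),
    #                Pipeline(gen_ictal=-8, pipeline=[MaxDiff(60)])]
    # a 'cv' run scores every (pipeline, classifier, target) combination and
    # print_results(summaries) then reports one summary per pipeline/classifier pair.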
    classifiers = [
        # NOTE(mike): you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=50, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf50mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=150, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf150mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf300mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, bootstrap=False, n_jobs=4, random_state=0), 'rf3000mss1Bfrs0'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10, bootstrap=True, n_jobs=-1, random_state=0), 'rf3000mss1md10Bt'),
        # (RandomForestClassifier(n_estimators=1000, min_samples_split=1, max_depth=10, bootstrap=False, n_jobs=-1, random_state=0), 'rf1000mss1md10Bf'),
        (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10, bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md10Bf'),
        # (RandomForestClassifier(n_estimators=10000, min_samples_split=1, max_depth=10, bootstrap=False, n_jobs=-1, random_state=0), 'rf10000mss1md10Bf'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=3, bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md3Bf'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=30, bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md30Bf'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10, max_features='log2', bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md10BfmfL2'),
        # (RandomForestClassifier(n_estimators=3000, min_samples_split=1, max_depth=10, max_features=200, bootstrap=False, n_jobs=-1, random_state=0), 'rf3000mss1md10Bfmf200'),
        # (GradientBoostingClassifier(n_estimators=500,min_samples_split=1,),'gbc500mss1'),
        # (GradientBoostingClassifier(n_estimators=1000,min_samples_split=1, random_state=0),'gbc1000mss1'),
        # (GradientBoostingClassifier(n_estimators=1000,min_samples_split=1, random_state=0, learning_rate=0.03),'gbc1000mss1lr03'),
        # (GradientBoostingClassifier(n_estimators=1000,min_samples_split=1, random_state=0, learning_rate=0.01),'gbc1000mss1lr01'),
        # (GradientBoostingClassifier(n_estimators=1000,min_samples_split=1, random_state=0, learning_rate=0.01, max_depth=1000),'gbc1000mss1lr01md1000'),
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        # normalize features only for the classifier types listed here
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                if seizure.tasks.task_predict:
                    guesses = ['clip,preictal']
                else:
                    guesses = ['clip,seizure,early']
                classifier_filenames = []
                for target in targets:
                    if isinstance(target, tuple):
                        target, leaderboard_weight = target
                    else:
                        leaderboard_weight = 1
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_ictal=pipeline.gen_ictal,
                                         cv_ratio=cv_ratio)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                S_scores = []
                E_scores = []
                leaderboard_weights = []
                for target in targets:
                    if isinstance(target, tuple):
                        target, leaderboard_weight = target
                    else:
                        leaderboard_weight = 1
                    leaderboard_weights.append(leaderboard_weight)
                    print 'Processing %s (classifier %s)' % (target, classifier_name)

                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_ictal=pipeline.gen_ictal,
                                         cv_ratio=cv_ratio)

                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score

                    scores.append(score)

                    print '%.3f' % score, 'S=%.4f' % data.S_auc, 'E=%.4f' % data.E_auc
                    S_scores.append(data.S_auc)
                    E_scores.append(data.E_auc)

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    weighted_average = np.average(scores, weights=leaderboard_weights)
                    summary = get_score_summary(name, scores, weighted_average)
                    summaries.append((summary, weighted_average))
                    print summary
                if len(S_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, S_scores, np.mean(S_scores))
                    print 'S', summary
                if len(E_scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, E_scores, np.mean(E_scores))
                    print 'E', summary

            print_results(summaries)

    def do_train_data():
        for pipeline in pipelines:
            print 'Using pipeline %s' % (pipeline.get_name())
            for target in targets:
                if isinstance(target, tuple):
                    target, leaderboard_weight = target

                task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                     target=target, pipeline=pipeline,
                                     classifier_name=None, classifier=None,
                                     normalize=None, gen_ictal=pipeline.gen_ictal,
                                     cv_ratio=None)
                # call the load data tasks for positive and negative examples (ignore the merge of the two.)
                TrainingDataTask(task_core).run()

    def do_test_data():
        for pipeline in pipelines:
            print 'Using pipeline %s' % (pipeline.get_name())
            for target in targets:
                if isinstance(target, tuple):
                    target, leaderboard_weight = target

                task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                     target=target, pipeline=pipeline,
                                     classifier_name=None, classifier=None,
                                     normalize=None, gen_ictal=pipeline.gen_ictal,
                                     cv_ratio=None)

                LoadTestDataTask(task_core).run()

    if build_target == 'train_data':
        do_train_data()
    elif build_target == 'test_data':
        do_test_data()
    elif build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)

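# A hypothetical driver (an assumption, not part of the example above): the
# competition repos ship tiny entry scripts that do little more than pick a
# build target and call the function, along these lines.
import sys

if __name__ == '__main__':
    # e.g. `python main.py cv` or `python main.py make_predictions`
    run_seizure_detection(sys.argv[1] if len(sys.argv) > 1 else 'cv')
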
def run_seizure_detection(build_target):
    """
    The main entry point: directories are configured from the settings file and
    the chosen build_target ('train_model', 'make_predictions') is run across
    all combinations of (targets, pipelines, classifiers).
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    figure_dir = str(settings['figure-dir'])

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    splitsize = 30000
    ts = time.get_millis()
    bin_size = 50
    targets = [
        '1',
        '2',
        '3',
    ]
    pipelines = [
        # This is better than the winning submission
        Pipeline(gen_preictal=True,
                 pipeline=[
                     GetFeature(50,
                                2500,
                                400,
                                bin_size,
                                'usf',
                                onlyfd_dfa=False,
                                with_dfa=False,
                                with_dy=False,
                                with_six=True,
                                with_equal_freq=True,
                                with_mc=False,
                                with_time_corr=True,
                                smooth=True,
                                smooth_Hz=160,
                                power_edge=50,
                                with_square=True,
                                with_log=False,
                                with_sqrt=False,
                                splitsize=splitsize,
                                calibrate=False)
                 ]),
        #Pipeline(gen_preictal=True, pipeline=[only_FD_DFA(onlyfd_dfa=True)]),
    ]
    classifiers = [
        'GB',
        #'LSVC',
        # 'ET'
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        # normalize features only for the classifier types listed here; the
        # classifiers in this example are name strings, so this is always False
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for classifier in classifiers:
                print 'Using pipeline %s with classifier %s' % (
                    pipeline.get_name(), classifier)
                guesses = ['File,Class']
                classifier_filenames = []
                #plot2file = PdfPages(os.path.join(figure_dir, ('figure%d-_%s_%s_.pdf' % (ts, classifier, pipeline.get_name()))))
                for target in targets:
                    task_core = TaskCore(
                        cached_data_loader=cached_data_loader,
                        data_dir=data_dir,
                        target=target,
                        pipeline=pipeline,
                        classifier=classifier,
                        normalize=should_normalize(classifier),
                        gen_preictal=pipeline.gen_preictal,
                        cv_ratio=cv_ratio,
                        bin_size=bin_size)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        # task = TrainClassifierTask(task_core)
                        # task.run()
                        # classifier_filenames.append(task.filename())
                        print 'not implemented'

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier,
                                                           pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    if build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)

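# A hypothetical sketch (an assumption, not from the repo above) of how string
# classifier names such as 'GB' could be resolved into estimators downstream;
# the real mapping lives in the repo's task code.
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import LinearSVC

def make_classifier(name, random_state=0):
    # the three names mirror the options in the classifiers list above
    factories = {
        'GB': lambda: GradientBoostingClassifier(random_state=random_state),
        'ET': lambda: ExtraTreesClassifier(random_state=random_state),
        'LSVC': lambda: LinearSVC(random_state=random_state),
    }
    return factories[name]()
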
def run_seizure_detection(build_target):
    """
    The main entry point: directories are configured from the settings file and
    the chosen build_target ('train_model', 'make_predictions') is run across
    all combinations of (targets, pipelines, classifiers).
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    figure_dir = str(settings['figure-dir'])

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)
    splitsize = 30000
    ts = time.get_millis()
    bin_size = 50
    targets = [
        '1',
        '2',
        '3',
    ]
    pipelines = [
        # This is better than the winning submission
        Pipeline(gen_preictal=True,
                 pipeline=[GetFeature(50, 2500, 400, bin_size, 'usf', onlyfd_dfa=False,
                                      with_dfa=False, with_dy=False, with_six=True,
                                      with_equal_freq=True, with_mc=False, with_time_corr=True,
                                      smooth=True, smooth_Hz=160, power_edge=50,
                                      with_square=True, with_log=False, with_sqrt=False,
                                      splitsize=splitsize, calibrate=False)]),
        #Pipeline(gen_preictal=True, pipeline=[only_FD_DFA(onlyfd_dfa=True)]),
    ]
    classifiers = [
        'GB',
        #'LSVC',
        # 'ET'
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        # normalize features only for the classifier types listed here; the
        # classifiers in this example are name strings, so this is always False
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for classifier in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier)
                guesses = ['File,Class']
                classifier_filenames = []
                #plot2file = PdfPages(os.path.join(figure_dir, ('figure%d-_%s_%s_.pdf' % (ts, classifier, pipeline.get_name()))))
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier=classifier,
                                         normalize=should_normalize(classifier), gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio, bin_size=bin_size)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        # task = TrainClassifierTask(task_core)
                        # task.run()
                        # classifier_filenames.append(task.filename())
                        print 'not implemented'

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')


    if build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)

def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from settings file are configured, classifiers are chosen, pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions', 'predict_all') is run across
    all combinations of (targets, pipelines, classifiers)
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])
    figure_dir = str(settings['figure-dir'])

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)

    ts = time.get_millis()
    
    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Dog_5',
        'Patient_1',
        'Patient_2',
    ]
    pipelines = [
        # NOTE: you can enable multiple pipelines to run them all and compare results
        Pipeline(gen_preictal=True,  pipeline=[FFTWithTimeFreqCorrelation(50, 2500, 400, 18, 'usf')]), # winning submission
    ]
    classifiers = [
        # NOTE: you can enable multiple classifiers to run them all and compare results
        # (RandomForestClassifier(n_estimators=300, min_samples_split=1, max_features=0.5, bootstrap=False, n_jobs=-1, random_state=0), 'rf300mss1mf05Bfrs0'),
        # (ExtraTreesClassifier(n_estimators=3000, min_samples_split=1, max_features=0.15, bootstrap=False, n_jobs=-1, random_state=0), 'ET3000mss1mf015Bfrs0'),
        # (GradientBoostingClassifier(n_estimators=3000, min_samples_split=1, max_features=0.15, learning_rate=0.02, subsample=0.5, random_state=0), 'GBRT3000mms1mf015Lr002Ss05rs0'),
        # the tag encodes the hyperparameters: C=1e6, rbf kernel, gamma=0.01, coef0=0, shrinking, tol=1e-5, class_weight='auto', random_state=0
        (SVC(C=1e6, kernel='rbf', gamma=0.01, coef0=0.0, shrinking=True, probability=True, tol=1e-5, cache_size=2000, class_weight='auto', max_iter=-1, random_state=0), 'svcce6rbfg001co0stte-5cwautors0'),
    ]
    cv_ratio = 0.5

    def should_normalize(classifier):
        # normalize features only for the classifier types listed here
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                guesses = ['clip,preictal']
                classifier_filenames = []
                # collect the diagnostic figures for this (classifier, pipeline) run in one PDF
                plot2file = PdfPages(os.path.join(figure_dir, ('figure%d-_%s_%s_.pdf' % (ts, classifier_name, pipeline.get_name()))))
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio, plot2file = plot2file)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')
                        
                plot2file.close()

    def predict_all(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                lines = ['clip,preictal']
                subjectID = 0
                # NOTE: chaining `a = b = []` would alias one shared list; keep test_size separate
                X_train = y_train = X_test = None
                test_size = []
                for target in targets:
                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio)
                    
                    data = GetCrossSubjectDataTask(task_core).run()
#                     a = np.shape(data.X_test)[0]
                    test_size.append(np.shape(data.X_test)[0])
                    if subjectID > 0:
                        X_train = np.concatenate((X_train, data.X_train), axis=0)
                        y_train = np.concatenate((y_train, data.y_train), axis=0)
                        X_test = np.concatenate((X_test, data.X_test), axis=0)
                    else:
                        X_train = data.X_train
                        y_train = data.y_train
                        X_test = data.X_test
                    subjectID += 1
                    
                #Training
                task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                     target=[], pipeline=pipeline,
                                     classifier_name=classifier_name, classifier=classifier,
                                     normalize=should_normalize(classifier), gen_preictal=pipeline.gen_preictal,
                                     cv_ratio=cv_ratio)
                y_train = np.ceil(0.1 * y_train).astype('int_')  # astype returns a copy, so keep the result
                if should_normalize(classifier):
                    X_train, temp = normalize_data(X_train, X_train)
                    
                print "Training ..."
                print 'Dim', np.shape(X_train), np.shape(y_train)
                start = time.get_seconds()
                classifier.fit(X_train, y_train)
                elapsedSecs = time.get_seconds() - start
                print "t=%ds" % int(elapsedSecs)
                
                # Platt-style calibration: fit a logistic regression on the
                # classifier's training-set probabilities, then map the test-set
                # probabilities through it (fitting the calibrator on the same
                # data the classifier was trained on risks overconfident probabilities)
                y_estimate = classifier.predict_proba(X_train)
                lr = LogisticRegression(random_state=0)
                lr.fit(y_estimate, y_train)
                predictions_proba = classifier.predict_proba(X_test)
                predictions_calibrated = lr.predict_proba(predictions_proba)
                
                #output
                m = 0
                totalSample = 12
                startIdx = 0
                for target in targets:
                    for i in range(test_size[m]/totalSample):
                        nstr = '%04d' % (i + 1)  # zero-pad the segment index to four digits
                        
                        # average the calibrated prediction over the totalSample
                        # windows that make up one test segment
                        preictalOverAllSample = 0
                        for k in range(totalSample):
                            p = predictions_calibrated[i*totalSample+k+startIdx]
                            preictal = translate_prediction(p)
                            preictalOverAllSample += preictal/totalSample
                         
                        newline = '%s_test_segment_%s.mat,%.15f' % (target, nstr, preictalOverAllSample)
                        lines.append(newline)
                        
                    print newline
                    startIdx = startIdx + test_size[m]
                    m += 1
                
                filename = 'submission%d-%s_%s.csv' % (ts, classifier_name, pipeline.get_name())
                filename = os.path.join(submission_dir, filename)
                with open(filename, 'w') as f:
                    print >> f, '\n'.join(lines)
                print 'wrote', filename

    def do_cross_validation():
        summaries = []
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (pipeline.get_name(), classifier_name)
                scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target, classifier_name)

                    task_core = TaskCore(cached_data_loader=cached_data_loader, data_dir=data_dir,
                                         target=target, pipeline=pipeline,
#                                          target=target, pipeline=pipeline,
                                         classifier_name=classifier_name, classifier=classifier,
                                         normalize=should_normalize(classifier), gen_preictal=pipeline.gen_preictal,
                                         cv_ratio=cv_ratio)

                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score

                    scores.append(score)

                    print '%.3f' % score

                if len(scores) > 0:
                    name = pipeline.get_name() + '_' + classifier_name
                    summary = get_score_summary(name, scores)
                    summaries.append((summary, np.mean(scores)))
                    print summary

            print_results(summaries)

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    elif build_target == 'predict_all':
        predict_all(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)
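
# A self-contained sketch (an assumption, not code from the repo above) of the
# calibration trick predict_all uses: fit the base model, fit a logistic
# regression on its training-set probabilities (Platt-style scaling), then pass
# the test-set probabilities through the calibrator.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

rng = np.random.RandomState(0)
X_train, y_train = rng.randn(200, 5), rng.randint(0, 2, 200)  # toy stand-in data
X_test = rng.randn(50, 5)

base = SVC(probability=True, random_state=0).fit(X_train, y_train)
calibrator = LogisticRegression(random_state=0)
calibrator.fit(base.predict_proba(X_train), y_train)
calibrated = calibrator.predict_proba(base.predict_proba(X_test))[:, 1]
print(calibrated[:5])
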
Example #10
0
def run_seizure_detection(build_target):
    """
    The main entry point for running seizure-detection cross-validation and predictions.
    Directories from settings file are configured, classifiers are chosen, pipelines are
    chosen, and the chosen build_target ('cv', 'train_model', 'make_predictions') is run across
    all combinations of (targets, pipelines, classifiers)
    """

    with open('SETTINGS.json') as f:
        settings = json.load(f)

    data_dir = str(settings['competition-data-dir'])
    cache_dir = str(settings['data-cache-dir'])
    submission_dir = str(settings['submission-dir'])

    makedirs(submission_dir)

    cached_data_loader = CachedDataLoader(cache_dir)

    ts = time.get_millis()

    targets = [
        'Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Patient_1', 'Patient_2',
        'Patient_3', 'Patient_4', 'Patient_5', 'Patient_6', 'Patient_7',
        'Patient_8'
    ]

    pipelines = [
        Pipeline(gen_ictal=False,
                 pipeline=[FFTWithTimeFreqCorrelation(1, 48, 400, 'usf')])
    ]
    classifiers = [(RandomForestClassifier(n_estimators=3000,
                                           min_samples_split=2,
                                           bootstrap=False,
                                           n_jobs=4,
                                           random_state=0), 'rf3000')]
    cv_ratio = 0.5

    def should_normalize(classifier):
        # normalize features only for the classifier types listed here
        clazzes = [LogisticRegression]
        return any(isinstance(classifier, clazz) for clazz in clazzes)

    def train_full_model(make_predictions):
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (
                    pipeline.get_name(), classifier_name)
                guesses = ['clip,seizure,early']
                classifier_filenames = []
                for target in targets:
                    task_core = TaskCore(
                        cached_data_loader=cached_data_loader,
                        data_dir=data_dir,
                        target=target,
                        pipeline=pipeline,
                        classifier_name=classifier_name,
                        classifier=classifier,
                        normalize=should_normalize(classifier),
                        gen_ictal=pipeline.gen_ictal,
                        cv_ratio=cv_ratio)

                    if make_predictions:
                        predictions = MakePredictionsTask(task_core).run()
                        guesses.append(predictions.data)
                    else:
                        task = TrainClassifierTask(task_core)
                        task.run()
                        classifier_filenames.append(task.filename())

                if make_predictions:
                    filename = 'submission%d-%s_%s.csv' % (ts, classifier_name,
                                                           pipeline.get_name())
                    filename = os.path.join(submission_dir, filename)
                    with open(filename, 'w') as f:
                        print >> f, '\n'.join(guesses)
                    print 'wrote', filename
                else:
                    print 'Trained classifiers ready in %s' % cache_dir
                    for filename in classifier_filenames:
                        print os.path.join(cache_dir, filename + '.pickle')

    def do_cross_validation():
        for pipeline in pipelines:
            for (classifier, classifier_name) in classifiers:
                print 'Using pipeline %s with classifier %s' % (
                    pipeline.get_name(), classifier_name)
                scores = []
                for target in targets:
                    print 'Processing %s (classifier %s)' % (target,
                                                             classifier_name)

                    task_core = TaskCore(
                        cached_data_loader=cached_data_loader,
                        data_dir=data_dir,
                        target=target,
                        pipeline=pipeline,
                        classifier_name=classifier_name,
                        classifier=classifier,
                        normalize=should_normalize(classifier),
                        gen_ictal=pipeline.gen_ictal,
                        cv_ratio=cv_ratio)

                    data = CrossValidationScoreTask(task_core).run()
                    score = data.score
                    scores.append(score)

                    print target, 'Seizure_AUC=', data.S_auc, 'Early_AUC=', data.E_auc

    if build_target == 'cv':
        do_cross_validation()
    elif build_target == 'train_model':
        train_full_model(make_predictions=False)
    elif build_target == 'make_predictions':
        train_full_model(make_predictions=True)
    else:
        raise Exception("unknown build target %s" % build_target)