Example no. 1
def get_submission_targets_and_masks(settings, targets, classifier, classifier_name, pipeline_groups, random_pipelines, random_ratio=0.525, ngen=10, limit=2, random_limit=2):
    assert random_limit % limit == 0
    random_multiplier = random_limit // limit  # integer division, so lists can be repeated below
    quiet = True

    random_pipeline = FeatureConcatPipeline(*random_pipelines)

    all_pipelines = []
    all_pipelines.extend(random_pipelines)
    for pg, ratio in pipeline_groups:
        all_pipelines.extend(pg)
    full_pipeline = FeatureConcatPipeline(*all_pipelines)
    run_prepare_data(settings, [(target, full_pipeline, []) for target in targets], test=True)

    def get_pipeline_and_feature_masks(target, pipelines, classifier, classifier_name, ratio, ngen):
        print(target, 'fetching GA pipelines', [p.get_name() for p in pipelines])
        pipeline = FeatureConcatPipeline(*pipelines)
        score, best_N = process_target(settings, target, pipeline, classifier, classifier_name, ratio=ratio, ngen=ngen, quiet=quiet)
        return pipeline, best_N

    targets_and_pipelines = []
    for target in targets:
        # NOTE(mike): All this stuff is a bit nasty. It gets the random-masks and the genetic-masks
        # for different pipelines, and then pulls out the mask for each individual pipeline. A single
        # FeatureConcatPipeline is then created to represent all the features, and the masks for each
        # member of the FCP are merged together to form the single feature mask across the whole FCP.
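        # Toy illustration (hypothetical names and lengths): if the FCP held
        # pipelines 'corr' and 'pfd', a merged dict might look like
        # {'corr': [1, 0, 1], 'pfd': [0, 1]}, and the final mask would be the
        # concatenation [1, 0, 1, 0, 1] in full_pipeline.get_pipelines() order.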

        random_masks = generate_feature_masks(settings, target, random_pipeline, random_limit, random_ratio, random_state=0, quiet=quiet)
        # a list of (pipeline, masks) pairs, where masks is the GA's best-N list
        ga_groups = [get_pipeline_and_feature_masks(target, p, classifier, classifier_name, ratio, ngen) for p, ratio in pipeline_groups]
        ga_groups = [(p, masks[0:limit]) for p, masks in ga_groups]

        print(target, 'extracting GA per-pipeline masks...')
        # contains a list of mask dictionaries
        ga_dicts = [extract_masks_for_pipeline_and_masks(settings, target, pipeline, masks) for pipeline, masks in ga_groups]
        # repeat the GA dicts so their count matches the random masks
        ga_dicts = [mask_dicts * random_multiplier for mask_dicts in ga_dicts]

        r_dicts = extract_masks_for_pipeline_and_masks(settings, target, random_pipeline, random_masks)
        # each element of zip_group is a list of dictionaries mapping pipeline
        # names to masks, e.g. [r_dicts, ga_dicts0, ga_dicts1, ...]
        zip_group = [r_dicts] + ga_dicts

        print(target, 'merging all masks...')
        feature_mask_dicts = [merge_dicts(*x) for x in zip(*zip_group)]

        feature_masks = []
        for feature_mask_dict in feature_mask_dicts:
            mask = []
            for p in full_pipeline.get_pipelines():
                mask.extend(feature_mask_dict[p.get_name()])
            feature_masks.append(mask)

        targets_and_pipelines.append((target, full_pipeline, feature_masks))
    return targets_and_pipelines
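merge_dicts is not shown in any of these examples; the merge step above only needs a left-to-right dict union, since each dict in a zip group covers a disjoint set of pipeline names. A minimal sketch of the assumed semantics:

def merge_dicts(*dicts):
    # Union several dicts; the caller above passes dicts with disjoint keys.
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged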
Example no. 2
# Nested helper excerpted from get_submission_targets_and_masks (Example no. 1);
# `settings` and `quiet` come from the enclosing scope.
def get_pipeline_and_feature_masks(target, pipelines, classifier,
                                   classifier_name, ratio, ngen):
    print(target, 'fetching GA pipelines',
          [p.get_name() for p in pipelines])
    pipeline = FeatureConcatPipeline(*pipelines)
    score, best_N = process_target(settings,
                                   target,
                                   pipeline,
                                   classifier,
                                   classifier_name,
                                   ratio=ratio,
                                   ngen=ngen,
                                   quiet=quiet)
    return pipeline, best_N
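A hypothetical call, reusing names from Example no. 1 and assuming `settings`, `quiet`, and a classifier pair are already in scope:

pipeline, masks = get_pipeline_and_feature_masks(
    'Dog_1',
    [Pipeline(InputSource(), Preprocess(), Windower(75), PFD())],
    classifier, classifier_name, ratio=0.55, ngen=10)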
Example no. 3
def main():
    settings = load_settings()

    targets = [
        'Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2'
    ]

    # The genetic algorithm will be run individually on each pipeline group
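    # Each entry pairs a list of pipelines with the `ratio` handed to
    # process_target for that group.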
    pipeline_groups = [
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
        ], 0.55),
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
        ], 0.55),
        ([
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy(
                    [0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5,
                     24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([2, 3.5, 6])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([3.5, 6, 15])),
            Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
        ], 0.55),
    ]

    make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission'
    run_ga = not make_submission

    # This classifier is used in the genetic algorithm
    ga_classifier, ga_classifier_name = make_svm(gamma=0.0079, C=2.7)

    if run_ga:
        quiet = False
        summaries = []
        for ngen in [10]:
            for pipelines, ratio in pipeline_groups:
                out = []
                for target in targets:
                    print('Running target', target)
                    run_prepare_data_for_cross_validation(settings, [target],
                                                          pipelines,
                                                          quiet=True)
                    pipeline = FeatureConcatPipeline(*pipelines)
                    score, best_N = process_target(settings,
                                                   target,
                                                   pipeline,
                                                   ga_classifier,
                                                   ga_classifier_name,
                                                   ratio=ratio,
                                                   ngen=ngen,
                                                   quiet=quiet)
                    print(target, score,
                          [np.sum(mask) for mask in best_N[0:10]])
                    out.append((target, score, pipeline, best_N))

                # summarize this pipeline group across all targets
                # (out is reset for each group, so this belongs inside the loop)
                scores = np.array([score for _, score, _, _ in out])
                summary = get_score_summary(
                    '%s ngen=%d' % (ga_classifier_name, ngen), scores,
                    np.mean(scores), targets)
                summaries.append((summary, np.mean(scores)))
                print(summary)

        print_results(summaries)

    if make_submission:
        random_pipelines = [
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     Correlation('none')),
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     FreqCorrelation(1, None, 'none')),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
        ]

        # These classifiers are used to make the final predictions
        final_classifiers = [
            # make_svm(gamma=0.0079, C=2.7),
            make_svm(gamma=0.0068, C=2.0),
            # make_svm(gamma=0.003, C=150.0),
            # make_lr(C=0.04),
            # make_simple_lr(),
        ]
        targets_and_pipelines = get_submission_targets_and_masks(
            settings, targets, ga_classifier, ga_classifier_name,
            pipeline_groups, random_pipelines)
        for classifier, classifier_name in final_classifiers:
            run_make_submission(settings, targets_and_pipelines, classifier,
                                classifier_name)
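make_svm, make_lr, and make_simple_lr are project helpers not shown here; the call sites only require that they return a (classifier, classifier_name) pair. A plausible scikit-learn sketch of make_svm, offered as an assumption rather than the project's actual implementation:

from sklearn.svm import SVC

def make_svm(gamma, C):
    # Probability estimates are assumed, since the submissions are ranked
    # predictions; the real helper may configure this differently.
    classifier = SVC(gamma=gamma, C=C, probability=True)
    return classifier, 'svc-gamma%s-C%s' % (gamma, C)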
Example no. 4
def get_submission_targets_and_masks(settings,
                                     targets,
                                     classifier,
                                     classifier_name,
                                     pipeline_groups,
                                     random_pipelines,
                                     random_ratio=0.525,
                                     ngen=10,
                                     limit=2,
                                     random_limit=2):
    assert random_limit % limit == 0
    random_multiplier = random_limit // limit  # integer division, so lists can be repeated below
    quiet = True

    random_pipeline = FeatureConcatPipeline(*random_pipelines)

    all_pipelines = []
    all_pipelines.extend(random_pipelines)
    for pg, ratio in pipeline_groups:
        all_pipelines.extend(pg)
    full_pipeline = FeatureConcatPipeline(*all_pipelines)
    run_prepare_data(settings,
                     [(target, full_pipeline, []) for target in targets],
                     test=True)

    def get_pipeline_and_feature_masks(target, pipelines, classifier,
                                       classifier_name, ratio, ngen):
        print(target, 'fetching GA pipelines',
              [p.get_name() for p in pipelines])
        pipeline = FeatureConcatPipeline(*pipelines)
        score, best_N = process_target(settings,
                                       target,
                                       pipeline,
                                       classifier,
                                       classifier_name,
                                       ratio=ratio,
                                       ngen=ngen,
                                       quiet=quiet)
        return pipeline, best_N

    targets_and_pipelines = []
    for target in targets:
        # NOTE(mike): All this stuff is a bit nasty. It gets the random-masks and the genetic-masks
        # for different pipelines, and then pulls out the mask for each individual pipeline. A single
        # FeatureConcatPipeline is then created to represent all the features, and the masks for each
        # member of the FCP are merged together to form the single feature mask across the whole FCP.

        random_masks = generate_feature_masks(settings,
                                              target,
                                              random_pipeline,
                                              random_limit,
                                              random_ratio,
                                              random_state=0,
                                              quiet=quiet)
        # a list of (pipeline, masks) pairs, where masks is the GA's best-N list
        ga_groups = [
            get_pipeline_and_feature_masks(target, p, classifier,
                                           classifier_name, ratio, ngen)
            for p, ratio in pipeline_groups
        ]
        ga_groups = [(p, masks[0:limit]) for p, masks in ga_groups]

        print(target, 'extracting GA per-pipeline masks...')
        # contains a list of mask dictionaries
        ga_dicts = [
            extract_masks_for_pipeline_and_masks(settings, target, pipeline,
                                                 masks)
            for pipeline, masks in ga_groups
        ]
        # repeat the GA dicts so their count matches the random masks
        ga_dicts = [mask_dicts * random_multiplier for mask_dicts in ga_dicts]

        r_dicts = extract_masks_for_pipeline_and_masks(settings, target,
                                                       random_pipeline,
                                                       random_masks)
        # each element of zip_group is a list of dictionaries mapping pipeline
        # names to masks, e.g. [r_dicts, ga_dicts0, ga_dicts1, ...]
        zip_group = [r_dicts] + ga_dicts

        print(target, 'merging all masks...')
        feature_mask_dicts = [merge_dicts(*x) for x in zip(*zip_group)]

        feature_masks = []
        for feature_mask_dict in feature_mask_dicts:
            mask = []
            for p in full_pipeline.get_pipelines():
                mask.extend(feature_mask_dict[p.get_name()])
            feature_masks.append(mask)

        targets_and_pipelines.append((target, full_pipeline, feature_masks))
    return targets_and_pipelines
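generate_feature_masks is likewise not shown; from its call sites it returns a list of `random_limit` random 0/1 masks over the pipeline's feature columns, each keeping roughly `random_ratio` of them, reproducible via `random_state`. A toy sketch of those assumed semantics:

import numpy as np

def random_masks_sketch(num_features, num_masks, ratio, random_state=0):
    # num_masks independent 0/1 masks, each keeping ~ratio of the features.
    rng = np.random.RandomState(random_state)
    return [(rng.rand(num_features) < ratio).astype(int).tolist()
            for _ in range(num_masks)]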
Example no. 5
def main():
    settings = load_settings()

    pipelines = [
        FeatureConcatPipeline(
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     Correlation('none')),
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     FreqCorrelation(1, None, 'none')),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy(
                    [0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5,
                     24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([2, 3.5, 6])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([3.5, 6, 15])),
            Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
            Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
            Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
        ),
    ]

    targets = [
        'Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2'
    ]

    classifiers = [
        make_svm(gamma=0.0079, C=2.7),
        make_svm(gamma=0.0068, C=2.0),
        make_svm(gamma=0.003, C=150.0),
        make_lr(C=0.04),
        make_simple_lr(),
    ]

    make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission'
    do_cv = not make_submission

    if do_cv:
        mask_range = [3]
        split_ratios = [0.4, 0.525, 0.6]
        run_prepare_data_for_cross_validation(settings, targets, pipelines)
        run_cross_validation(settings, targets, pipelines, mask_range,
                             split_ratios, classifiers)

    if make_submission:
        num_masks = 10
        split_ratio = 0.525
        classifiers = [
            # make_svm(gamma=0.0079, C=2.7),
            make_svm(gamma=0.0068, C=2.0),
            # make_svm(gamma=0.003, C=150.0),
            # make_lr(C=0.04),
            # make_simple_lr(),
        ]

        targets_and_pipelines = []
        pipeline = pipelines[0]
        for classifier, classifier_name in classifiers:
            for i, target in enumerate(targets):
                run_prepare_data(settings, [target], [pipeline], test=True)
                feature_masks = generate_feature_masks(settings,
                                                       target,
                                                       pipeline,
                                                       num_masks,
                                                       split_ratio,
                                                       random_state=0,
                                                       quiet=True)
                targets_and_pipelines.append((target, pipeline, feature_masks,
                                              classifier, classifier_name))

        run_make_submission(settings, targets_and_pipelines, split_ratio)
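Note that run_make_submission is called with a different shape here than in Example no. 3: each targets_and_pipelines entry already carries its classifier and classifier_name, and the split_ratio is passed separately. The snippets apparently come from different revisions of the same project, so the two call signatures are not interchangeable.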
Example no. 6
def main():

    settings = load_settings()

    targets = [
        'Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2'
    ]

    pipelines = [
        FeatureConcatPipeline(
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     Correlation('none')),
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     FreqCorrelation(1, None, 'none')),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy(
                    [0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5,
                     24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([2, 3.5, 6])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([3.5, 6, 15])),
            Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
            Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
            Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
        ),
    ]

    classifiers = [
        make_svm(gamma=0.0079, C=2.7),
        make_svm(gamma=0.0068, C=2.0),
        make_svm(gamma=0.003, C=150.0),
        make_lr(C=0.04),
        make_simple_lr(),
    ]

    # Currently identical to `pipelines` above; presumably kept separate so
    # the submission feature set can diverge from the CV set.
    submission_pipelines = [
        FeatureConcatPipeline(
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     Correlation('none')),
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     FreqCorrelation(1, None, 'none')),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy(
                    [0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5,
                     24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([2, 3.5, 6])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([3.5, 6, 15])),
            Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
            Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
            Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
        ),
    ]

    submission_classifiers = [
        make_simple_lr(),
    ]

    if len(sys.argv) >= 2 and sys.argv[1] == 'submission':
        run_make_submission(settings, targets, submission_classifiers,
                            submission_pipelines)
    else:
        run_cross_validation(settings, targets, classifiers, pipelines)
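All of these main() variants dispatch on the same convention: run the script bare for cross-validation, or pass `submission` as the first argument to build a submission. With a hypothetical script name:

    python seizure_detection.py              # cross-validation
    python seizure_detection.py submission   # build a submission file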