def run_cross_validation(settings, targets, classifiers, pipelines):
    """Cross-validate every (pipeline, classifier, target) combination.

    Prints per-combination scores, a summary table via print_results, and
    finally the best-scoring (pipeline, classifier) found for each target.

    settings: global run settings; settings.N_jobs sizes the worker pool.
    targets: list of target names (strings).
    classifiers: list of (classifier, classifier_name) pairs.
    pipelines: list of pipeline objects exposing get_name().
    """
    print 'Cross-validation task'
    print 'Targets', ', '.join(targets)
    print 'Pipelines:\n ', '\n '.join([p.get_name() for p in pipelines])
    print 'Classifiers', ', '.join([c[1] for c in classifiers])
    run_prepare_data_for_cross_validation(settings, targets, pipelines)

    # run on pool first, then show results after
    # (this pass computes scores on the pool with return_data=False; the same
    # calls are repeated synchronously below to collect the results)
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                progress_str = 'P=%d/%d C=%d/%d T=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets))
                cross_validation_score(settings, target, pipeline, classifier,
                                       classifier_name,
                                       strategy=cross_validation_strategy,
                                       pool=pool,
                                       progress_str=progress_str,
                                       return_data=False, quiet=True)
    pool.close()
    pool.join()

    summaries = []
    # best: target -> [score, pipeline, classifier, classifier_name]
    best = {}
    for p_num, pipeline in enumerate(pipelines):
        for c_num, (classifier, classifier_name) in enumerate(classifiers):
            mean_scores = []
            median_scores = []
            datas = []  # collected per-target data objects (not used below)
            for target in targets:
                print 'Running %s pipeline %s classifier %s' % (target, pipeline.get_name(), classifier_name)
                data = cross_validation_score(settings, target, pipeline,
                                              classifier, classifier_name,
                                              strategy=cross_validation_strategy,
                                              quiet=True)
                datas.append(data)
                # only print both lines when mean and median actually differ
                if data.mean_score != data.median_score:
                    print '%.3f (mean)' % data.mean_score, data.mean_scores
                    print '%.3f (median)' % data.median_score, data.median_scores
                else:
                    print '%.3f' % data.mean_score
                mean_scores.append(data.mean_score)
                median_scores.append(data.median_score)
                # track the best max(mean, median) score seen per target
                best_score = best.get(target, [0, None, None, None])[0]
                cur_score = max(data.mean_score, data.median_score)
                if cur_score > best_score:
                    best[target] = [cur_score, pipeline, classifier, classifier_name]
            name = 'p=%d c=%d %s mean %s' % (p_num, c_num, classifier_name, pipeline.get_name())
            summary = get_score_summary(name, mean_scores)
            summaries.append((summary, np.mean(mean_scores)))
            print summary
            name = 'p=%d c=%d %s median %s' % (p_num, c_num, classifier_name, pipeline.get_name())
            summary = get_score_summary(name, median_scores)
            summaries.append((summary, np.mean(median_scores)))
            print summary
    print_results(summaries)

    print '\nbest'
    for target in targets:
        pipeline = best[target][1]
        classifier_name = best[target][3]
        # NOTE(review): get_names() — every other report in this file calls
        # pipeline.get_name(); confirm get_names() exists on the pipeline
        # class, otherwise this line raises AttributeError.
        print target, best[target][0], classifier_name, pipeline.get_names()
def run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers):
    """Cross-validate pipelines/classifiers with and without random feature masks.

    mask_range: counts of masks to ensemble over (np.max(mask_range) masks are
        generated; prefixes of that list are averaged).
    split_ratios: one mask set is generated per ratio (passed to
        generate_feature_masks).
    Phase 1 submits all score computations to a multiprocessing pool; phase 2
    re-reads them synchronously, averages masked predictions, and prints
    per-configuration summaries.
    """
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                # unmasked score, fire-and-forget on the pool
                pool.apply_async(cross_validation_score,
                                 [settings, target, pipeline, classifier, classifier_name],
                                 {'quiet': True})
                for split_num, split_ratio in enumerate(split_ratios):
                    # random_state=0 so phase 2 can regenerate identical masks
                    masks = generate_feature_masks(settings, target, pipeline,
                                                   np.max(mask_range), split_ratio,
                                                   random_state=0, quiet=True)
                    for mask_num, mask in enumerate(masks):
                        progress_str = 'P=%d/%d C=%d/%d T=%d/%d S=%d/%d M=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets), split_num+1, len(split_ratios), mask_num+1, len(masks))
                        cross_validation_score(settings, target, pipeline,
                                               classifier, classifier_name,
                                               feature_mask=mask, quiet=True,
                                               return_data=False, pool=pool,
                                               progress_str=progress_str)
    pool.close()
    pool.join()
    print 'Finished cross validation mp'

    summaries = []
    for p_num, pipeline in enumerate(pipelines):
        for classifier, classifier_name in classifiers:
            scores_full = []
            # scores_masked[split_index][mask_index] -> per-target score list
            scores_masked = [[[] for y in mask_range] for x in split_ratios]
            for i, target in enumerate(targets):
                run_prepare_data_for_cross_validation(settings, [target], [pipeline], quiet=True)
                # unmasked (full-feature) score for this target
                data = cross_validation_score(settings, target, pipeline,
                                              classifier, classifier_name,
                                              pool=None, quiet=True)
                scores_full.append(data.mean_score)
                for split_index, split_ratio in enumerate(split_ratios):
                    # same random_state as phase 1 -> identical masks
                    masks = generate_feature_masks(settings, target, pipeline,
                                                   np.max(mask_range), split_ratio,
                                                   random_state=0, quiet=True)
                    for mask_index, num_masks in enumerate(mask_range):
                        predictions = []
                        y_cvs = None
                        for mask in masks[0:num_masks]:
                            data = cross_validation_score(settings, target, pipeline,
                                                          classifier, classifier_name,
                                                          feature_mask=mask,
                                                          pool=None, quiet=True)
                            predictions.append(data.mean_predictions)
                            # every masked run must score the same CV folds
                            if y_cvs is None:
                                y_cvs = data.y_cvs
                            else:
                                for y_cv_1, y_cv_2 in zip(y_cvs, data.y_cvs):
                                    assert np.alltrue(y_cv_1 == y_cv_2)
                        # ensemble: average predictions over the first
                        # num_masks masks, then score each fold by ROC AUC
                        predictions = np.mean(predictions, axis=0)
                        scores = [roc_auc_score(y_cv, p) for p, y_cv in zip(predictions, y_cvs)]
                        score = np.mean(scores)
                        scores_masked[split_index][mask_index].append(score)
            summary = get_score_summary('%s p=%d full' % (classifier_name, p_num), scores_full, np.mean(scores_full), targets)
            summaries.append((summary, np.mean(scores_full)))
            for split_index, split_ratio in enumerate(split_ratios):
                for mask_index, num_masks in enumerate(mask_range):
                    scores = scores_masked[split_index][mask_index]
                    summary = get_score_summary('%s p=%d split_ratio=%s masks=%d' % (classifier_name, p_num, split_ratio, num_masks), scores, np.mean(scores), targets)
                    summaries.append((summary, np.mean(scores)))
                    print summary
    print_results(summaries)
def run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers):
    """Cross-validate pipelines/classifiers with and without random feature masks.

    Variant of the function above using the two-argument get_score_summary
    call and function-call print syntax.

    mask_range: counts of masks to ensemble over (np.max(mask_range) masks are
        generated; prefixes of that list are averaged).
    split_ratios: one mask set is generated per ratio (passed to
        generate_feature_masks).
    Phase 1 submits all score computations to a multiprocessing pool; phase 2
    re-reads them synchronously, averages masked predictions, and prints
    per-configuration summaries.
    """
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                # unmasked score, fire-and-forget on the pool
                pool.apply_async(cross_validation_score,
                                 [settings, target, pipeline, classifier, classifier_name],
                                 {'quiet': True})
                for split_num, split_ratio in enumerate(split_ratios):
                    # random_state=0 so phase 2 can regenerate identical masks
                    masks = generate_feature_masks(settings, target, pipeline,
                                                   np.max(mask_range), split_ratio,
                                                   random_state=0, quiet=True)
                    for mask_num, mask in enumerate(masks):
                        progress_str = 'P=%d/%d C=%d/%d T=%d/%d S=%d/%d M=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets), split_num+1, len(split_ratios), mask_num+1, len(masks))
                        cross_validation_score(settings, target, pipeline,
                                               classifier, classifier_name,
                                               feature_mask=mask, quiet=True,
                                               return_data=False, pool=pool,
                                               progress_str=progress_str)
    pool.close()
    pool.join()
    print('Finished cross validation mp')

    summaries = []
    for p_num, pipeline in enumerate(pipelines):
        for classifier, classifier_name in classifiers:
            scores_full = []
            # scores_masked[split_index][mask_index] -> per-target score list
            scores_masked = [[[] for y in mask_range] for x in split_ratios]
            for i, target in enumerate(targets):
                run_prepare_data_for_cross_validation(settings, [target], [pipeline], quiet=True)
                # unmasked (full-feature) score for this target
                data = cross_validation_score(settings, target, pipeline,
                                              classifier, classifier_name,
                                              pool=None, quiet=True)
                scores_full.append(data.mean_score)
                for split_index, split_ratio in enumerate(split_ratios):
                    # same random_state as phase 1 -> identical masks
                    masks = generate_feature_masks(settings, target, pipeline,
                                                   np.max(mask_range), split_ratio,
                                                   random_state=0, quiet=True)
                    for mask_index, num_masks in enumerate(mask_range):
                        predictions = []
                        y_cvs = None
                        for mask in masks[0:num_masks]:
                            data = cross_validation_score(settings, target, pipeline,
                                                          classifier, classifier_name,
                                                          feature_mask=mask,
                                                          pool=None, quiet=True)
                            predictions.append(data.mean_predictions)
                            # every masked run must score the same CV folds
                            if y_cvs is None:
                                y_cvs = data.y_cvs
                            else:
                                for y_cv_1, y_cv_2 in zip(y_cvs, data.y_cvs):
                                    assert np.alltrue(y_cv_1 == y_cv_2)
                        # ensemble: average predictions over the first
                        # num_masks masks, then score each fold by ROC AUC
                        predictions = np.mean(predictions, axis=0)
                        scores = [roc_auc_score(y_cv, p) for p, y_cv in zip(predictions, y_cvs)]
                        score = np.mean(scores)
                        scores_masked[split_index][mask_index].append(score)
            summary = get_score_summary('%s p=%d full' % (classifier_name, p_num), scores_full)
            summaries.append((summary, np.mean(scores_full)))
            for split_index, split_ratio in enumerate(split_ratios):
                for mask_index, num_masks in enumerate(mask_range):
                    scores = scores_masked[split_index][mask_index]
                    summary = get_score_summary('%s p=%d split_ratio=%s masks=%d' % (classifier_name, p_num, split_ratio, num_masks), scores)
                    summaries.append((summary, np.mean(scores)))
                    print(summary)
    print_results(summaries)
def main():
    """Entry point: run GA feature selection, or build a submission.

    With 'submission' as the first CLI argument the final submission is
    produced; otherwise the genetic-algorithm search is run on each pipeline
    group and a score summary is printed.
    """
    settings = load_settings()
    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Dog_5',
        'Patient_1',
        'Patient_2'
    ]
    # The genetic algorithm will be run individually on each pipeline group.
    # Each entry is ([pipelines...], ratio) where ratio is passed to
    # process_target as the `ratio` keyword.
    pipeline_groups = [
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
        ], 0.55),
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
        ], 0.55),
        ([
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy(
                    [0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([2, 3.5, 6])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([3.5, 6, 15])),
            Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
        ], 0.55),
    ]
    make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission'
    run_ga = not make_submission
    # This classifier is used in the genetic algorithm
    ga_classifier, ga_classifier_name = make_svm(gamma=0.0079, C=2.7)
    if run_ga:
        quiet = False
        summaries = []
        for ngen in [10]:  # generations per GA run
            for pipelines, ratio in pipeline_groups:
                out = []
                for target in targets:
                    print 'Running target', target
                    run_prepare_data_for_cross_validation(settings, [target], pipelines, quiet=True)
                    # the group's pipelines are concatenated into one feature set
                    pipeline = FeatureConcatPipeline(*pipelines)
                    score, best_N = process_target(settings, target, pipeline,
                                                   ga_classifier, ga_classifier_name,
                                                   ratio=ratio, ngen=ngen,
                                                   quiet=quiet)
                    # report feature counts of the top-10 masks found by the GA
                    print target, score, [
                        np.sum(mask) for mask in best_N[0:10]
                    ]
                    out.append((target, score, pipeline, best_N))
                scores = np.array([score for _, score, _, _ in out])
                summary = get_score_summary(
                    '%s ngen=%d' % (ga_classifier_name, ngen), scores,
                    np.mean(scores), targets)
                summaries.append((summary, np.mean(scores)))
                print summary
        print_results(summaries)
    if make_submission:
        random_pipelines = [
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     Correlation('none')),
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     FreqCorrelation(1, None, 'none')),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
        ]
        # These classifiers are used to make the final predictions
        final_classifiers = [
            # make_svm(gamma=0.0079, C=2.7),
            make_svm(gamma=0.0068, C=2.0),
            # make_svm(gamma=0.003, C=150.0),
            # make_lr(C=0.04),
            # make_simple_lr(),
        ]
        targets_and_pipelines = get_submission_targets_and_masks(
            settings, targets, ga_classifier, ga_classifier_name,
            pipeline_groups, random_pipelines)
        for classifier, classifier_name in final_classifiers:
            run_make_submission(settings, targets_and_pipelines, classifier,
                                classifier_name)
def main():
    """Entry point: run GA feature selection, or build a submission.

    Variant of the function above using the two-argument get_score_summary
    call. With 'submission' as the first CLI argument the final submission is
    produced; otherwise the genetic-algorithm search is run on each pipeline
    group and a score summary is printed.
    """
    settings = load_settings()
    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Dog_5',
        'Patient_1',
        'Patient_2'
    ]
    # The genetic algorithm will be run individually on each pipeline group.
    # Each entry is ([pipelines...], ratio) where ratio is passed to
    # process_target as the `ratio` keyword.
    pipeline_groups = [
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
        ], 0.55),
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
        ], 0.55),
        ([
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                     PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5,
                                         12, 15.5, 19.5, 24])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                     PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                     PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                     PIBSpectralEntropy([0.25, 2, 3.5])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                     PIBSpectralEntropy([6, 15, 24])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                     PIBSpectralEntropy([2, 3.5, 6])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                     PIBSpectralEntropy([3.5, 6, 15])),
            Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
        ], 0.55),
    ]
    make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission'
    run_ga = not make_submission
    # This classifier is used in the genetic algorithm
    ga_classifier, ga_classifier_name = make_svm(gamma=0.0079, C=2.7)
    if run_ga:
        quiet = False
        summaries = []
        for ngen in [10]:  # generations per GA run
            for pipelines, ratio in pipeline_groups:
                out = []
                for target in targets:
                    print 'Running target', target
                    run_prepare_data_for_cross_validation(settings, [target], pipelines, quiet=True)
                    # the group's pipelines are concatenated into one feature set
                    pipeline = FeatureConcatPipeline(*pipelines)
                    score, best_N = process_target(settings, target, pipeline,
                                                   ga_classifier, ga_classifier_name,
                                                   ratio=ratio, ngen=ngen,
                                                   quiet=quiet)
                    # report feature counts of the top-10 masks found by the GA
                    print target, score, [np.sum(mask) for mask in best_N[0:10]]
                    out.append((target, score, pipeline, best_N))
                scores = np.array([score for _, score, _, _ in out])
                summary = get_score_summary('%s ngen=%d' % (ga_classifier_name, ngen), scores)
                summaries.append((summary, np.mean(scores)))
                print summary
        print_results(summaries)
    if make_submission:
        random_pipelines = [
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     Correlation('none')),
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     FreqCorrelation(1, None, 'none')),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                     FreqBinning(winning_bins, 'mean'), Log10(),
                     FlattenChannels()),
        ]
        # These classifiers are used to make the final predictions
        final_classifiers = [
            # make_svm(gamma=0.0079, C=2.7),
            make_svm(gamma=0.0068, C=2.0),
            # make_svm(gamma=0.003, C=150.0),
            # make_lr(C=0.04),
            # make_simple_lr(),
        ]
        targets_and_pipelines = get_submission_targets_and_masks(settings, targets, ga_classifier, ga_classifier_name, pipeline_groups, random_pipelines)
        for classifier, classifier_name in final_classifiers:
            run_make_submission(settings, targets_and_pipelines, classifier,
                                classifier_name)
def run_cross_validation(settings, targets, classifiers, pipelines):
    """Cross-validate every (pipeline, classifier, target) combination.

    Duplicate variant of the first function in this file. Prints
    per-combination scores, a summary table via print_results, and finally the
    best-scoring (pipeline, classifier) found for each target.

    settings: global run settings; settings.N_jobs sizes the worker pool.
    targets: list of target names (strings).
    classifiers: list of (classifier, classifier_name) pairs.
    pipelines: list of pipeline objects exposing get_name().
    """
    print 'Cross-validation task'
    print 'Targets', ', '.join(targets)
    print 'Pipelines:\n ', '\n '.join([p.get_name() for p in pipelines])
    print 'Classifiers', ', '.join([c[1] for c in classifiers])
    run_prepare_data_for_cross_validation(settings, targets, pipelines)

    # run on pool first, then show results after
    # (this pass computes scores on the pool with return_data=False; the same
    # calls are repeated synchronously below to collect the results)
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                progress_str = 'P=%d/%d C=%d/%d T=%d/%d' % (
                    i + 1, len(pipelines), j + 1, len(classifiers), k + 1,
                    len(targets))
                cross_validation_score(settings, target, pipeline, classifier,
                                       classifier_name,
                                       strategy=cross_validation_strategy,
                                       pool=pool,
                                       progress_str=progress_str,
                                       return_data=False, quiet=True)
    pool.close()
    pool.join()

    summaries = []
    # best: target -> [score, pipeline, classifier, classifier_name]
    best = {}
    for p_num, pipeline in enumerate(pipelines):
        for c_num, (classifier, classifier_name) in enumerate(classifiers):
            mean_scores = []
            median_scores = []
            datas = []  # collected per-target data objects (not used below)
            for target in targets:
                print 'Running %s pipeline %s classifier %s' % (
                    target, pipeline.get_name(), classifier_name)
                data = cross_validation_score(
                    settings, target, pipeline, classifier, classifier_name,
                    strategy=cross_validation_strategy, quiet=True)
                datas.append(data)
                # only print both lines when mean and median actually differ
                if data.mean_score != data.median_score:
                    print '%.3f (mean)' % data.mean_score, data.mean_scores
                    print '%.3f (median)' % data.median_score, data.median_scores
                else:
                    print '%.3f' % data.mean_score
                mean_scores.append(data.mean_score)
                median_scores.append(data.median_score)
                # track the best max(mean, median) score seen per target
                best_score = best.get(target, [0, None, None, None])[0]
                cur_score = max(data.mean_score, data.median_score)
                if cur_score > best_score:
                    best[target] = [
                        cur_score, pipeline, classifier, classifier_name
                    ]
            name = 'p=%d c=%d %s mean %s' % (p_num, c_num, classifier_name,
                                             pipeline.get_name())
            summary = get_score_summary(name, mean_scores)
            summaries.append((summary, np.mean(mean_scores)))
            print summary
            name = 'p=%d c=%d %s median %s' % (p_num, c_num, classifier_name,
                                               pipeline.get_name())
            summary = get_score_summary(name, median_scores)
            summaries.append((summary, np.mean(median_scores)))
            print summary
    print_results(summaries)

    print '\nbest'
    for target in targets:
        pipeline = best[target][1]
        classifier_name = best[target][3]
        # NOTE(review): get_names() — every other report in this file calls
        # pipeline.get_name(); confirm get_names() exists on the pipeline
        # class, otherwise this line raises AttributeError.
        print target, best[target][0], classifier_name, pipeline.get_names()