import platform
import multiprocessing as mp

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import LeaveOneOut, StratifiedShuffleSplit
from xgboost import XGBClassifier

# Project-internal names used below and assumed to be provided by this package:
# qc (utilities: Timer, save_obj, make_dirs, sort_by_value), logger, rLDA,
# balance_samples, get_predict_proba, features (feature2chz), SKLEARN_OLD
# and MY_PATH.


def trainer(features, cfg, psd_params, epochs, cls_params):
    """Train per-subject classifiers using computed features."""
    # A single PSD parameter dict with 'fmin'/'fmax' applies to all subjects;
    # otherwise one dict per subject is expected.
    psd_subjects = {}
    if 'fmin' in psd_params and 'fmax' in psd_params:
        for subject in features:
            psd_subjects[subject] = psd_params
    else:
        for subject in features:
            psd_subjects[subject] = psd_params[subject]

    # Train two classifiers per subject, one for each feature set (X1, X2)
    cls1 = {}
    cls2 = {}
    for subject in features:
        print('Training %s' % subject)
        X1 = features[subject]['X1']
        X2 = features[subject]['X2']
        Y = features[subject]['Y']
        gbp = cls_params[subject]
        for cls_dict, X in ((cls1, X1), (cls2, X2)):
            cls_dict[subject] = GradientBoostingClassifier(
                n_estimators=gbp['trees'],
                learning_rate=gbp['learning_rate'],
                max_depth=gbp['max_depth'],
                max_features=gbp['max_features'],
                subsample=gbp['subsample'],
                random_state=gbp['random_state'])
            cls_dict[subject].fit(X, Y)
            cls_dict[subject].n_jobs = 1  # set to 1 for testing

    clsfile = '%s/classifiers.pkl' % MY_PATH
    qc.save_obj(clsfile, dict(cls1=cls1, cls2=cls2, cfg=cfg,
                              psd_params=psd_subjects, epochs=epochs,
                              cls_params=cls_params))
    print('Classifiers exported to %s' % clsfile)
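# A minimal, hypothetical usage sketch for trainer(). The feature-dict layout
# (X1/X2/Y per subject) and the cls_params keys are inferred from the accesses
# above; the arrays and parameter values are synthetic, and the final save
# still requires qc and MY_PATH from this module.
def _trainer_demo():
    rng = np.random.RandomState(0)
    features = {'subject01': dict(
        X1=rng.randn(40, 8),        # 40 samples x 8 features, feature set 1
        X2=rng.randn(40, 8),        # 40 samples x 8 features, feature set 2
        Y=rng.randint(0, 2, 40))}   # binary labels
    cls_params = {'subject01': dict(trees=50, learning_rate=0.1, max_depth=3,
                                    max_features='sqrt', subsample=1.0,
                                    random_state=0)}
    psd_params = dict(fmin=1, fmax=40)  # shared PSD band for all subjects
    trainer(features, cfg=None, psd_params=psd_params, epochs=None,
            cls_params=cls_params)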
def train_decoder(cfg, featdata, feat_file=None):
    """Train the final decoder using all data."""
    # Init a classifier
    selected_classifier = cfg.CLASSIFIER['selected']
    if selected_classifier == 'GB':
        cls = GradientBoostingClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER[selected_classifier]['learning_rate'],
            n_estimators=cfg.CLASSIFIER[selected_classifier]['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER[selected_classifier]['depth'],
            random_state=cfg.CLASSIFIER[selected_classifier]['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False,
            presort='auto')
    elif selected_classifier == 'XGB':
        # Pass only parameters understood by xgboost's sklearn wrapper
        cls = XGBClassifier(
            learning_rate=cfg.CLASSIFIER[selected_classifier]['learning_rate'],
            n_estimators=cfg.CLASSIFIER[selected_classifier]['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER[selected_classifier]['depth'],
            random_state=cfg.CLASSIFIER[selected_classifier]['seed'])
    elif selected_classifier == 'RF':
        cls = RandomForestClassifier(
            n_estimators=cfg.CLASSIFIER[selected_classifier]['trees'],
            max_features='auto',
            max_depth=cfg.CLASSIFIER[selected_classifier]['depth'],
            n_jobs=cfg.N_JOBS,
            random_state=cfg.CLASSIFIER[selected_classifier]['seed'],
            oob_score=False,
            class_weight='balanced_subsample')
    elif selected_classifier == 'LDA':
        cls = LDA()
    elif selected_classifier == 'rLDA':
        cls = rLDA(cfg.CLASSIFIER[selected_classifier]['r_coeff'])
    else:
        logger.error('Unknown classifier %s' % selected_classifier)
        raise ValueError

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']
    if cfg.FEATURES['PSD']['wlen'] is None:
        cfg.FEATURES['PSD']['wlen'] = wlen
    w_frames = featdata['w_frames']
    ch_names = featdata['ch_names']
    X_data_merged = np.concatenate(X_data)
    Y_data_merged = np.concatenate(Y_data)
    if cfg.CV['BALANCE_SAMPLES']:
        X_data_merged, Y_data_merged = balance_samples(
            X_data_merged, Y_data_merged, cfg.CV['BALANCE_SAMPLES'], verbose=True)

    # Start training the decoder
    logger.info_green('Training the decoder')
    timer = qc.Timer()
    cls.n_jobs = cfg.N_JOBS
    cls.fit(X_data_merged, Y_data_merged)
    logger.info('Trained %d samples x %d dimension in %.1f sec' %
                (X_data_merged.shape[0], X_data_merged.shape[1], timer.sec()))
    cls.n_jobs = 1  # always set n_jobs=1 for testing

    # Export the decoder
    classes = {c: cfg.tdef.by_value[c] for c in np.unique(Y_data)}
    if cfg.FEATURES['selected'] == 'PSD':
        data = dict(cls=cls, ch_names=ch_names, psde=featdata['psde'],
                    sfreq=featdata['sfreq'], picks=featdata['picks'],
                    classes=classes, epochs=cfg.EPOCH, w_frames=w_frames,
                    w_seconds=cfg.FEATURES['PSD']['wlen'],
                    wstep=cfg.FEATURES['PSD']['wstep'],
                    spatial=cfg.SP_FILTER,
                    spatial_ch=featdata['picks'],
                    spectral=cfg.TP_FILTER[cfg.TP_FILTER['selected']],
                    spectral_ch=featdata['picks'],
                    notch=cfg.NOTCH_FILTER[cfg.NOTCH_FILTER['selected']],
                    notch_ch=featdata['picks'],
                    multiplier=cfg.MULTIPLIER,
                    ref_ch=cfg.REREFERENCE[cfg.REREFERENCE['selected']],
                    decim=cfg.FEATURES['PSD']['decim'])
    else:
        logger.error('Feature type %s is not supported.' % cfg.FEATURES['selected'])
        raise NotImplementedError
    clsfile = '%s/classifier/classifier-%s.pkl' % (cfg.DATA_PATH, platform.architecture()[0])
    qc.make_dirs('%s/classifier' % cfg.DATA_PATH)
    qc.save_obj(clsfile, data)
    logger.info('Decoder saved to %s' % clsfile)

    # Reverse-lookup frequency from FFT: frequency resolution is the inverse
    # of the window length in seconds
    fq = 0
    if type(cfg.FEATURES['PSD']['wlen']) == list:
        fq_res = 1.0 / cfg.FEATURES['PSD']['wlen'][0]
    else:
        fq_res = 1.0 / cfg.FEATURES['PSD']['wlen']
    fqlist = []
    while fq <= cfg.FEATURES['PSD']['fmax']:
        if fq >= cfg.FEATURES['PSD']['fmin']:
            fqlist.append(fq)
        fq += fq_res

    # Show top distinctive features
    if cfg.FEATURES['selected'] == 'PSD':
        logger.info_green('Good features ordered by importance')
        if selected_classifier in ['RF', 'GB', 'XGB']:
            keys, values = qc.sort_by_value(list(cls.feature_importances_), rev=True)
        elif selected_classifier in ['LDA', 'rLDA']:
            keys, values = qc.sort_by_value(cls.w, rev=True)
        keys = np.array(keys)
        values = np.array(values)

        if cfg.EXPORT_GOOD_FEATURES:
            if feat_file is None:
                gfout = open('%s/classifier/good_features.txt' % cfg.DATA_PATH, 'w')
            else:
                gfout = open(feat_file, 'w')

        if type(wlen) is not list:
            ch_names = [ch_names[c] for c in featdata['picks']]
        else:
            # Multiple windows: prefix each picked channel with its window
            # index, reading from the original list before replacing ch_names
            ch_names_new = []
            for w in range(len(wlen)):
                for c in featdata['picks']:
                    ch_names_new.append('w%d-%s' % (w, ch_names[c]))
            ch_names = ch_names_new

        chlist, hzlist = features.feature2chz(keys, fqlist, ch_names=ch_names)
        valnorm = values[:cfg.FEAT_TOPN].copy()
        valsum = np.sum(valnorm)
        if valsum == 0:
            valsum = 1
        valnorm = valnorm / valsum * 100.0

        # show top-N features
        for i, (ch, hz) in enumerate(zip(chlist, hzlist)):
            if i >= cfg.FEAT_TOPN:
                break
            txt = '%-3s %5.1f Hz normalized importance %-6s raw importance %-6s feature %-5d' %\
                  (ch, hz, '%.2f%%' % valnorm[i], '%.2f%%' % (values[i] * 100.0), keys[i])
            logger.info(txt)

        if cfg.EXPORT_GOOD_FEATURES:
            gfout.write('Importance(%) Channel Frequency Index\n')
            for i, (ch, hz) in enumerate(zip(chlist, hzlist)):
                gfout.write('%.3f\t%s\t%s\t%d\n' % (values[i] * 100.0, ch, hz, keys[i]))
            gfout.close()
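# A hypothetical sketch of the nested config layout consumed by
# train_decoder() and balance_tpr(). The key names mirror the accesses above;
# the values are illustrative only, not the project's defaults. A simple
# namespace stands in for the actual config module.
def _example_cfg():
    from types import SimpleNamespace
    cfg = SimpleNamespace()
    cfg.CLASSIFIER = {
        'selected': 'GB',
        'GB': {'trees': 1000, 'learning_rate': 0.01, 'depth': 3, 'seed': 666},
        'RF': {'trees': 1000, 'depth': 5, 'seed': 666},
        'rLDA': {'r_coeff': 0.3},
    }
    cfg.FEATURES = {
        'selected': 'PSD',
        'PSD': {'fmin': 1, 'fmax': 40, 'wlen': 0.5, 'wstep': 16, 'decim': 1},
    }
    cfg.CV = {'BALANCE_SAMPLES': False}
    cfg.CV_PERFORM = {
        'selected': 'StratifiedShuffleSplit',
        'StratifiedShuffleSplit': {'folds': 8, 'test_ratio': 0.25,
                                   'random_seed': 0},
    }
    cfg.N_JOBS = 1
    return cfg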
def balance_tpr(cfg, featdata):
    """Find the threshold of class index 0 that yields an equal number of
    true positive samples for each class. Currently only available for
    binary classes.

    Params
    ======
    cfg: config module
    featdata: feature data computed using compute_features()
    """
    n_jobs = cfg.N_JOBS
    if n_jobs is None:
        n_jobs = mp.cpu_count()
    if n_jobs > 1:
        logger.info('balance_tpr(): Using %d cores' % n_jobs)
        pool = mp.Pool(n_jobs)
        results = []

    # Init a classifier
    selected_classifier = cfg.CLASSIFIER['selected']
    if selected_classifier == 'GB':
        cls = GradientBoostingClassifier(
            loss='deviance',
            learning_rate=cfg.CLASSIFIER['GB']['learning_rate'],
            n_estimators=cfg.CLASSIFIER['GB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['GB']['depth'],
            random_state=cfg.CLASSIFIER['GB']['seed'],
            max_features='sqrt',
            verbose=0,
            warm_start=False,
            presort='auto')
    elif selected_classifier == 'XGB':
        # Pass only parameters understood by xgboost's sklearn wrapper
        cls = XGBClassifier(
            learning_rate=cfg.CLASSIFIER['XGB']['learning_rate'],
            n_estimators=cfg.CLASSIFIER['XGB']['trees'],
            subsample=1.0,
            max_depth=cfg.CLASSIFIER['XGB']['depth'],
            random_state=cfg.CLASSIFIER['XGB']['seed'])
    elif selected_classifier == 'RF':
        cls = RandomForestClassifier(
            n_estimators=cfg.CLASSIFIER['RF']['trees'],
            max_features='auto',
            max_depth=cfg.CLASSIFIER['RF']['depth'],
            n_jobs=cfg.N_JOBS,
            random_state=cfg.CLASSIFIER['RF']['seed'],
            oob_score=False,
            class_weight='balanced_subsample')
    elif selected_classifier == 'LDA':
        cls = LDA()
    elif selected_classifier == 'rLDA':
        cls = rLDA(cfg.CLASSIFIER['rLDA']['r_coeff'])
    else:
        logger.error('Unknown classifier type %s' % selected_classifier)
        raise ValueError

    # Setup features
    X_data = featdata['X_data']
    Y_data = featdata['Y_data']
    wlen = featdata['wlen']
    if cfg.FEATURES['PSD']['wlen'] is None:
        cfg.FEATURES['PSD']['wlen'] = wlen

    # Choose CV type
    ntrials, nsamples, fsize = X_data.shape
    selected_CV = cfg.CV_PERFORM['selected']
    if selected_CV == 'LeaveOneOut':
        logger.info_green('\n%d-fold leave-one-out cross-validation' % ntrials)
        if SKLEARN_OLD:
            cv = LeaveOneOut(len(Y_data))
        else:
            cv = LeaveOneOut()
    elif selected_CV == 'StratifiedShuffleSplit':
        logger.info_green(
            '\n%d-fold stratified cross-validation with test set ratio %.2f' %
            (cfg.CV_PERFORM[selected_CV]['folds'], cfg.CV_PERFORM[selected_CV]['test_ratio']))
        if SKLEARN_OLD:
            cv = StratifiedShuffleSplit(
                Y_data[:, 0], cfg.CV_PERFORM[selected_CV]['folds'],
                test_size=cfg.CV_PERFORM[selected_CV]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_CV]['random_seed'])
        else:
            cv = StratifiedShuffleSplit(
                n_splits=cfg.CV_PERFORM[selected_CV]['folds'],
                test_size=cfg.CV_PERFORM[selected_CV]['test_ratio'],
                random_state=cfg.CV_PERFORM[selected_CV]['random_seed'])
    else:
        logger.error('%s is not supported yet. Sorry.' % selected_CV)
        raise NotImplementedError
    logger.info('%d trials, %d samples per trial, %d feature dimension' %
                (ntrials, nsamples, fsize))

    # For the classifier itself, a single core is usually faster
    cls.n_jobs = 1

    # Cross-validation loop: collect predicted probabilities on held-out folds
    Y_preds = []
    if SKLEARN_OLD:
        splits = cv
    else:
        splits = cv.split(X_data, Y_data[:, 0])
    for cnum, (train, test) in enumerate(splits):
        X_train = np.concatenate(X_data[train])
        X_test = np.concatenate(X_data[test])
        Y_train = np.concatenate(Y_data[train])
        Y_test = np.concatenate(Y_data[test])
        if n_jobs > 1:
            results.append(pool.apply_async(
                get_predict_proba,
                [cls, X_train, Y_train, X_test, Y_test, cnum + 1]))
        else:
            Y_preds.append(get_predict_proba(
                cls, X_train, Y_train, X_test, Y_test, cnum + 1))

    # Aggregate predictions
    if n_jobs > 1:
        pool.close()
        pool.join()
        for r in results:
            Y_preds.append(r.get())
    Y_preds = np.concatenate(Y_preds, axis=0)

    # Find the threshold for class index 0: the median of the predicted
    # probabilities, so that half of the samples fall on each side
    Y_preds = sorted(Y_preds)
    mid_idx = int(len(Y_preds) / 2)
    if len(Y_preds) == 1:
        return 0.5  # should not reach here under normal conditions
    elif len(Y_preds) % 2 == 0:
        thres = Y_preds[mid_idx - 1] + (Y_preds[mid_idx] - Y_preds[mid_idx - 1]) / 2
    else:
        thres = Y_preds[mid_idx]
    return thres
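# A tiny numeric sketch of the median-threshold rule at the end of
# balance_tpr(): with a hypothetical set of class-0 probabilities, the
# returned threshold is the midpoint of the two central sorted values
# (even count) or the central value itself (odd count).
def _balance_tpr_threshold_demo():
    Y_preds = sorted([0.2, 0.9, 0.4, 0.6])  # hypothetical probabilities
    mid_idx = int(len(Y_preds) / 2)
    if len(Y_preds) % 2 == 0:
        thres = Y_preds[mid_idx - 1] + (Y_preds[mid_idx] - Y_preds[mid_idx - 1]) / 2
    else:
        thres = Y_preds[mid_idx]
    assert abs(thres - 0.5) < 1e-12  # midpoint of 0.4 and 0.6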