def calc_mv_parallel_classifier(args):
    """Score growing feature subsets for one entry of a memory-mapped data set.

    Parameters
    ----------
    args : tuple
        ``((filename, classifier, scorer, comp_dims, fis, feature_names,
        method), reg)``. ``filename``/``comp_dims`` locate the object memmap
        whose entry ``reg`` unpacks to ``(X, y)``; ``fis[reg]`` holds the
        feature importances used to rank the columns of ``X``.

    Returns
    -------
    list
        One ``[score, n_features, reg, feature_name]`` row per evaluated
        subset size.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'sequential'`` nor ``'combinatorial'``.
        (Previously this surfaced as an unbound-local error at ``return``.)
    """
    (filename, classifier, scorer, comp_dims, fis, feature_names,
     method), reg = args

    # Fail fast, before touching the memmap, on an unsupported method.
    if method not in ('sequential', 'combinatorial'):
        raise ValueError(
            "method must be 'sequential' or 'combinatorial', got %r"
            % (method,))

    X, y = np.memmap(filename, dtype='object', mode='r',
                     shape=comp_dims)[reg]
    fis = fis[reg]
    n_topics = X.shape[1]

    # Column order by decreasing absolute importance; computed once instead
    # of on every loop iteration.
    ranked = np.abs(fis).argsort()[::-1]

    results = []
    if method == 'sequential':
        # NOTE(review): the loop stops at n_topics - 1 features, so the full
        # feature set is never scored -- confirm this is intended.
        for i in range(1, n_topics):
            X_1 = X[:, ranked[0:i]]
            feature = feature_names[ranked[i - 1]]
            output = classify.classify(
                X_1, y, classifier=classifier, cross_val='4-Fold',
                scoring=scorer, output='summary')
            results.append([output['score'], i, reg, feature])
    else:
        # Greedy forward selection seeded with the top-ranked feature.
        ix = [ranked[0]]
        # list(...) so .remove() also works where range() is lazy.
        remaining = list(range(0, n_topics))
        remaining.remove(ix[0])
        for i in range(1, n_topics + 1):
            if i == 1:
                X_1 = X[:, ix]
                feature = feature_names[ix[0]]
                output = classify.classify(
                    X_1, y, classifier=classifier, cross_val='4-Fold',
                    scoring=scorer, output='summary')
                results.append([output['score'], i, reg, feature])
            else:
                test_results = []
                features = []
                for new_feat in remaining:
                    try_comb = ix + [new_feat]
                    X_1 = X[:, try_comb]
                    feature = feature_names[new_feat]
                    output = classify.classify(
                        X_1, y, classifier=classifier, cross_val='4-Fold',
                        scoring=scorer, output='summary')
                    test_results.append([output['score'], i, reg, feature])
                    features.append(new_feat)
                test_results = pd.DataFrame(test_results)
                # .iloc replaces the removed DataFrame.ix indexer.
                scores = test_results.iloc[:, 0]
                winner = scores == scores.max()
                # On ties the first best candidate wins.
                results.append(list(test_results[winner].values[0]))
                best_feat = features[np.where(winner)[0][0]]
                remaining.remove(best_feat)
                ix += [best_feat]
    return results
def best_subsets_parallel(args):
    """Score a single candidate feature subset.

    ``args`` is ``((X, y, classifier, scorer, feature_names), comb)`` where
    ``comb`` lists the column indices of ``X`` to keep.

    Returns ``(score, selected_feature_names, comb)``.
    """
    shared, comb = args
    X, y, classifier, scorer, feature_names = shared

    subset = X[:, comb]
    all_names = np.array(feature_names)
    features = list(all_names[list(comb)])

    summary = classify.classify(
        subset, y,
        classifier=classifier,
        cross_val='4-Fold',
        scoring=scorer,
        output='summary')
    return (summary['score'], features, comb)
def best_subsets_parallel(args):
    """Evaluate one combination of feature columns.

    args: ``((X, y, classifier, scorer, feature_names), comb)``; ``comb`` is
    the sequence of column indices selected from ``X``.

    Returns a tuple ``(score, names of the selected features, comb)``.
    """
    settings, comb = args
    X, y, classifier, scorer, feature_names = settings

    X_subset = X[:, comb]
    chosen_names = list(np.array(feature_names)[list(comb)])

    options = dict(classifier=classifier, cross_val='4-Fold',
                   scoring=scorer, output='summary')
    result = classify.classify(X_subset, y, **options)
    return (result['score'], chosen_names, comb)
def classify_parallel(args):
    """Classify one entry of a memory-mapped square grid of data sets.

    args: ``((classifier, param_grid, scoring, filename, feat_select, length,
    class_weight), index)``. The memmap at ``filename`` is opened read-only
    with shape ``(length, length)`` and its entry ``index`` unpacks to
    ``(X, y)``.

    Returns the dictionary produced by ``classify.classify`` with the
    originating ``index`` stored under ``'index'``.
    """
    settings, index = args
    (classifier, param_grid, scoring, filename,
     feat_select, length, class_weight) = settings

    grid = np.memmap(filename, dtype='object', mode='r',
                     shape=(length, length))
    X, y = grid[index]

    result = classify.classify(
        X, y,
        classifier=classifier,
        cross_val='4-Fold',
        class_weight=class_weight,
        scoring=scoring,
        param_grid=param_grid,
        feat_select=feat_select)

    # TODO: also return a vector tracking the selected features so their
    # stability can be assessed.
    result['index'] = index
    return result
def classify_parallel(classifier, scoring, region_data, importance_function):
    """Classify one region and attach a formatted importance vector.

    classifier: sklearn classifier
    scoring: sklearn scoring function
    region_data: ``(X, y)`` pair for the region
    importance_function: callable applied to the fitted classifier
        (``output['clf'].clf``) to pull out its importance vector

    Returns the dictionary produced by ``classify`` with the extracted
    vector stored under ``'importance'``.
    """
    features, labels = region_data
    summary = classify(
        features, labels,
        classifier=classifier,
        cross_val='4-Fold',
        scoring=scoring)

    fitted = summary['clf'].clf
    summary['importance'] = importance_function(fitted)
    return summary
def classify_parallel(classifier, scoring, region_data, importance_function):
    """Run 4-fold classification on one region's data.

    classifier: sklearn classifier
    scoring: sklearn scoring function
    region_data: tuple ``(X, y)`` holding the data for one region
    importance_function: function that formats the importance vector, given
        the fitted classifier found at ``output['clf'].clf``

    Returns the ``classify`` output dictionary, extended with an
    ``'importance'`` entry.
    """
    X, y = region_data
    options = dict(classifier=classifier, cross_val='4-Fold',
                   scoring=scoring)
    result = classify(X, y, **options)
    result['importance'] = importance_function(result['clf'].clf)
    return result
def classify(self, features=None, scoring='accuracy', X_threshold=None, feat_select=None, processes=1, class_weight='auto', dummy=None):
    """Classify every entry of ``self.comparisons`` and store the results.

    features, X_threshold: forwarded to ``self.load_data`` when no data has
        been loaded yet (``self.c_data is None``).
    scoring, feat_select, class_weight: forwarded to ``classify_parallel``
        and, when ``dummy`` is given, to the dummy classification.
    processes: a ``multiprocessing.Pool`` of this size is used when > 1;
        otherwise the work runs serially.
    dummy: ``DummyClassifier`` strategy; when not None a dummy score is
        computed per comparison and subtracted in ``self.final_score``.

    Fills ``self.class_score``, ``self.predictions``,
    ``self.feature_importances`` (when the fitted classifier exposes
    ``coef_`` or ``feature_importances_``), optionally ``self.fit_clfs``,
    ``self.features_selected``, ``self.dummy_score`` and
    ``self.dummy_predictions``, and finally ``self.final_score``.
    """
    if self.c_data is None:
        self.load_data(features, X_threshold)
        self.initalize_containers(features, feat_select, dummy)

    print "Classifying..."
    pb = tools.ProgressBar(len(list(self.comparisons)), start=True)

    if processes > 1:
        from multiprocessing import Pool
        pool = Pool(processes=processes)
    else:
        # The itertools module stands in for a pool: only ``.imap`` is
        # called on it below.
        pool = itertools

    try:
        filename = self.c_data.filename

        # Each work item pairs the shared settings tuple with one comparison.
        for output in pool.imap(
            classify_parallel, itertools.izip(
                itertools.repeat(
                    (self.classifier, scoring, filename, feat_select, self.comp_dims, class_weight)),
                self.comparisons)):

            index = output['index']
            self.class_score[index] = output['score']

            # Fitted classifiers are only retained when memsave is off.
            if self.memsave is False:
                self.fit_clfs[index] = output['clf']

            # Prefer linear coefficients, fall back to feature_importances_,
            # and store nothing if the classifier exposes neither.
            try:
                self.feature_importances[index] = output['clf'].clf.coef_[0]
            except AttributeError:
                try:
                    self.feature_importances[index] = output['clf'].clf.feature_importances_
                except AttributeError:
                    pass

            if feat_select:
                self.features_selected[index] = output['features_selected']

            self.predictions[index] = output['predictions']

            if dummy is not None:
                from sklearn.dummy import DummyClassifier
                X, y = self.c_data[index]
                # Baseline run on the same data with a DummyClassifier.
                output = classify.classify(
                    X, y, classifier=DummyClassifier(strategy=dummy), cross_val='4-Fold', class_weight=class_weight, scoring=scoring, feat_select=feat_select)
                self.dummy_score[index] = output['score']
                self.dummy_predictions[index] = output['predictions']

            pb.next()
    finally:
        # Only a real Pool needs shutting down.
        if processes > 1:
            pool.close()
            pool.join()

    if dummy is None:
        self.final_score = self.class_score
    else:
        self.final_score = self.class_score - self.dummy_score
def calc_mv_parallel_classifier(args):
    """Score feature subsets for one entry of a memory-mapped data set.

    Parameters
    ----------
    args : tuple
        ``((filename, classifier, scorer, comp_dims, fis, feature_names,
        method), reg)``. ``filename``/``comp_dims`` locate the object memmap
        whose entry ``reg`` unpacks to ``(X, y)``; ``fis[reg]`` holds the
        feature importances used to rank the columns of ``X``.

        ``method`` selects the search strategy:

        * ``'sequential'``: add features in ranked order, 1 .. n - 1.
        * ``'best_subsets'``: exhaustively score every combination of each
          size 1 .. n - 1 and keep the best per size.
        * ``'combinatorial'``: greedy forward selection seeded with the
          top-ranked feature.
        * anything else: add features in ranked order, 1 .. n.

    Returns
    -------
    list
        One ``[score, size, reg, feature(s)]`` row per subset size.
    """
    (filename, classifier, scorer, comp_dims, fis, feature_names,
     method), reg = args
    X, y = np.memmap(filename, dtype='object', mode='r',
                     shape=comp_dims)[reg]
    fis = fis[reg]
    n_topics = X.shape[1]

    # Column order by decreasing absolute importance; computed once instead
    # of on every loop iteration.
    ranked = np.abs(fis).argsort()[::-1]

    def score(columns):
        """Return the 4-fold summary score using only ``columns`` of X."""
        output = classify.classify(
            X[:, columns], y, classifier=classifier, cross_val='4-Fold',
            scoring=scorer, output='summary')
        return output['score']

    def first_best(rows):
        """Return (position, row as list) of the first top-scoring row."""
        frame = pd.DataFrame(rows)
        # .iloc replaces the removed DataFrame.ix indexer.
        scores = frame.iloc[:, 0]
        winner = scores == scores.max()
        return np.where(winner)[0][0], list(frame[winner].values[0])

    results = []
    if method == 'sequential':
        # NOTE(review): stops at n_topics - 1 features, unlike the default
        # branch below -- confirm this is intended.
        for i in range(1, n_topics):
            results.append(
                [score(ranked[0:i]), i, reg, feature_names[ranked[i - 1]]])

    elif method == 'best_subsets':
        total_features = X.shape[1]
        for n_comb in range(1, n_topics):
            combinations = itertools.combinations(
                range(0, total_features), n_comb)
            # Diagnostic output kept from the original implementation.
            print(combinations)
            test_results = []
            for comb in combinations:
                features = list(np.array(feature_names)[list(comb)])
                test_results.append([score(comb), n_comb, reg, features])
            results.append(first_best(test_results)[1])

    elif method == 'combinatorial':
        ix = [ranked[0]]
        # list(...) so .remove() also works where range() is lazy.
        remaining = list(range(0, n_topics))
        remaining.remove(ix[0])
        for i in range(1, n_topics + 1):
            if i == 1:
                results.append([score(ix), i, reg, feature_names[ix[0]]])
            else:
                test_results = []
                features = []
                for new_feat in remaining:
                    test_results.append(
                        [score(ix + [new_feat]), i, reg,
                         feature_names[new_feat]])
                    features.append(new_feat)
                position, best_row = first_best(test_results)
                results.append(best_row)
                remaining.remove(features[position])
                ix += [features[position]]

    else:
        # Default ranking pass. It used to run unconditionally and was then
        # discarded by every recognised method, so it is now only executed
        # when its result is actually returned.
        for i in range(1, n_topics + 1):
            results.append(
                [score(ranked[0:i]), i, reg, feature_names[ranked[i - 1]]])

    return results
def bootstrap_mv_full_parallel(args):
    """Run one bootstrap iteration of greedy feature selection over all rows
    of ``y_high``.

    Parameters
    ----------
    args : tuple
        ``((X, y_high, y_low, classifier, scorer, method), boot_n)``.
        ``X`` is a DataFrame whose columns name the features; ``y_high`` and
        ``y_low`` are 2-D arrays whose columns line up with the rows of
        ``X`` (presumably one row per region -- confirm with the caller).
        ``boot_n`` labels this bootstrap iteration in the output rows.

    Returns
    -------
    list or None
        For ``method == 'combinatorial'``, one
        ``[score, step, feature_name, row_index, boot_n]`` row per selection
        step and per row of ``y_high``; an empty list for any other method.
        ``None`` if anything fails (a warning is emitted).
    """
    import warnings

    results = None
    try:
        (X, y_high, y_low, classifier, scorer, method), boot_n = args

        # Re-seed so parallel workers do not share a random stream.
        np.random.seed()
        ran_index = np.random.choice(X.shape[0], X.shape[0])

        from neurosynth.analysis.classify import regularize

        # Bootstrap sample X & y
        X = X.iloc[ran_index, :]
        y_high = pd.DataFrame(y_high[:, ran_index])
        y_low = pd.DataFrame(y_low[:, ran_index])

        feature_names = X.columns.tolist()
        n_topics = len(feature_names)

        X = regularize(X, method='scale')

        results = []
        for reg_i, reg_y_high in y_high.iterrows():
            # Drop samples flagged in y_low but not in y_high.
            reg_ix = ((y_low.iloc[reg_i, :] == True) &
                      (reg_y_high == False)) == False
            reg_y = reg_y_high[reg_ix].astype('int')
            reg_X = X[reg_ix.values, :]

            if method == 'combinatorial':
                ix = []  # Feature order index
                # list(...) so .remove() also works where range() is lazy.
                remaining = list(range(0, n_topics))

                for i in range(0, n_topics):
                    test_results = []
                    for new_feat in remaining:
                        try_comb = ix + [new_feat]
                        X_1 = reg_X[:, try_comb]
                        feature = feature_names[new_feat]
                        output = classify.classify(
                            X_1, reg_y.values, classifier=classifier,
                            cross_val='4-Fold', scoring=scorer,
                            output='summary')
                        test_results.append(
                            [output['score'], i, feature, reg_i, boot_n,
                             new_feat])

                    test_results = pd.DataFrame(test_results)
                    # .iloc replaces the removed DataFrame.ix indexer.
                    scores = test_results.iloc[:, 0]
                    # Keep the first best row as a one-row frame. The old
                    # ``winner.iloc[0]`` produced a Series on ties, which
                    # broke the row extraction and aborted the iteration.
                    winner = test_results[scores == scores.max()].iloc[[0]]

                    results.append(list(winner.values[0])[0:5])
                    best_feat = int(winner.iloc[0, 5])
                    remaining.remove(best_feat)
                    ix.append(best_feat)
    except Exception as err:
        # Best-effort worker: report the failure and signal it with None,
        # but let KeyboardInterrupt/SystemExit propagate.
        warnings.warn('something went wrong: {0!r}'.format(err))
        results = None
    return results
def bootstrap_mv_parallel(args):
    """Run one bootstrap iteration of a feature-selection analysis.

    Parameters
    ----------
    args : tuple
        ``((X, y, classifier, scorer, fis, feature_names, method, reg),
        boot_n)``. Rows of ``X`` and entries of ``y`` are resampled with
        replacement before the analysis. ``method`` is one of
        ``'sequential'`` (add features in order of ``abs(fis)``),
        ``'combinatorial'`` (greedy forward selection) or ``'shannons'``.

    Returns
    -------
    list or None
        ``'sequential'``: rows ``[score, n_features, reg, feature_name]``.
        ``'combinatorial'``: rows
        ``[score, step, feature_name, reg, boot_n]``.
        ``'shannons'``: ``[shannons(odds_ratios), reg, boot_n]``.
        ``None`` if anything fails, including an unknown ``method`` (a
        warning is emitted).
    """
    import warnings

    results = None
    try:
        (X, y, classifier, scorer, fis, feature_names, method,
         reg), boot_n = args

        n_topics = X.shape[1]

        # Re-seed so parallel workers do not share a random stream.
        np.random.seed()
        ran_index = np.random.choice(X.shape[0], X.shape[0])

        # Bootstrap sample X & y
        X = X[ran_index, :]
        y = y[ran_index]

        if method == 'sequential':
            results = []
            # Ranked once instead of on every loop iteration.
            ix = np.abs(fis).argsort()[::-1]
            # NOTE(review): stops at n_topics - 1 features -- confirm.
            for i in range(1, n_topics):
                X_1 = X[:, ix[0:i]]
                feature = feature_names[ix[i - 1]]
                output = classify.classify(
                    X_1, y, classifier=classifier, cross_val='4-Fold',
                    scoring=scorer, output='summary')
                results.append([output['score'], i, reg, feature])

        elif method == 'combinatorial':
            results = []
            ix = []  # Feature order index
            # list(...) so .remove() also works where range() is lazy.
            remaining = list(range(0, n_topics))

            for i in range(0, n_topics):
                test_results = []
                for new_feat in remaining:
                    try_comb = ix + [new_feat]
                    X_1 = X[:, try_comb]
                    feature = feature_names[new_feat]
                    output = classify.classify(
                        X_1, y, classifier=classifier, cross_val='4-Fold',
                        scoring=scorer, output='summary')
                    test_results.append(
                        [output['score'], i, feature, reg, boot_n, new_feat])

                test_results = pd.DataFrame(test_results)
                # .iloc replaces the removed DataFrame.ix indexer.
                scores = test_results.iloc[:, 0]
                # Keep the first best row as a one-row frame. The old
                # ``winner.iloc[0]`` produced a Series on ties, which broke
                # the row extraction and aborted the iteration.
                winner = test_results[scores == scores.max()].iloc[[0]]

                results.append(list(winner.values[0])[0:5])
                best_feat = int(winner.iloc[0, 5])
                remaining.remove(best_feat)
                ix.append(best_feat)

        elif method == 'shannons':
            from base.statistics import shannons
            clf = classify.classify(
                X, y, classifier=classifier, cross_val='4-Fold',
                scoring=scorer, output='clf')
            odds_ratios = np.log(clf.clf.theta_[1] / clf.clf.theta_[0])
            # Shift so the smallest value is slightly above zero.
            odds_ratios -= (odds_ratios.min() - 0.000001)
            results = [shannons(odds_ratios), reg, boot_n]

        else:
            raise ValueError('unknown method: {0!r}'.format(method))
    except Exception as err:
        # Best-effort worker: report the failure and signal it with None,
        # but let KeyboardInterrupt/SystemExit propagate.
        warnings.warn('something went wrong: {0!r}'.format(err))
        results = None
    return results
def classify(self, features=None, scoring='accuracy', dummy=True, X_threshold=None):
    """Classify every ordered pair of masks in ``self.masklist``.

    features: feature names to use; when falsy, all names reported by
        ``self.dataset.get_feature_names()`` are used.
    scoring: scoring argument forwarded to ``classify.classify``.
    dummy: when true, ``self.final_score`` is the classifier score minus the
        dummy score; otherwise it is the classifier score alone.
    X_threshold: when not None, X is passed through ``binarize`` with this
        threshold before being cached.

    Fills ``self.c_data``, ``self.fit_clfs``, ``self.class_score``,
    ``self.dummy_score``, ``self.feature_importances`` (and
    ``self.feature_ranking`` for RFE classifiers), then sets
    ``self.final_score`` and ``self.status = 1``.
    """
    iters = list(itertools.permutations(self.masklist, 2))
    prog = 0.0
    total = len(iters)

    self.update_progress(0)

    if features:
        self.feature_names = features
    else:
        self.feature_names = self.dataset.get_feature_names()

    # Make feature importance grid w/ masked diagonals
    self.feature_importances = np.ma.masked_array(
        np.zeros((self.mask_num, self.mask_num, len(self.feature_names))))

    i, j, k = np.meshgrid(
        *map(np.arange, self.feature_importances.shape), indexing='ij')
    self.feature_importances.mask = (i == j)

    for pairs in iters:
        index = (pairs[0][1], pairs[1][1])  # Tuple numeric index of pairs
        names = [pairs[0][0], pairs[1][0]]  # Actual paths to masks

        if self.c_data[index] is None:
            X, y = classify.get_studies_by_regions(
                self.dataset, names, threshold=self.thresh,
                features=features, regularization='scale')

            if X_threshold is not None:
                X = binarize(X, X_threshold)

            self.c_data[index] = (X, y)

        # Always read the pair's data back from the cache. Previously a
        # cache hit left X and y undefined, or still pointing at the data of
        # the previous pair.
        X, y = self.c_data[index]

        if isinstance(self.classifier, RFE):
            self.classifier.fit(X, y)
            self.fit_clfs[index] = self.classifier
            self.class_score[index] = self.classifier.score(X, y)
            self.feature_importances[index] = \
                self.classifier.estimator_.coef_[0]
            self.feature_ranking[index] = self.classifier.ranking_
        else:
            output = classify.classify(
                X, y, classifier=self.classifier, output='summary_clf',
                cross_val='4-Fold', class_weight='auto', scoring=scoring,
                param_grid=self.param_grid)

            self.class_score[index] = output['score']
            fitted = output['clf'].fit(X, y)
            self.fit_clfs[index] = fitted

            # With a parameter grid the fitted model of interest lives on
            # ``best_estimator_``. The old fallback looked for
            # ``feature_importances_`` on the search wrapper itself, so tree
            # importances were silently dropped.
            if self.param_grid:
                model = getattr(fitted, 'best_estimator_', fitted)
            else:
                model = fitted

            # Prefer linear coefficients, fall back to feature_importances_,
            # and store nothing if the model exposes neither.
            try:
                self.feature_importances[index] = model.coef_[0]
            except AttributeError:
                try:
                    self.feature_importances[index] = \
                        model.feature_importances_
                except AttributeError:
                    pass

        self.dummy_score[index] = classify.classify_regions(
            self.dataset, names, method='Dummy',
            threshold=self.thresh)['score']

        prog = prog + 1
        self.update_progress(int(prog / total * 100))

    # Scores that are exactly 0 are masked out.
    self.class_score = np.ma.masked_array(
        self.class_score, self.class_score == 0)
    self.dummy_score = np.ma.masked_array(
        self.dummy_score, self.dummy_score == 0)

    if dummy:
        self.final_score = self.class_score - self.dummy_score
    else:
        self.final_score = self.class_score

    self.status = 1
def bootstrap_mv_full_parallel(args):
    """Bootstrap-resample the data once and greedily order features for
    every row of ``y_high``.

    Parameters
    ----------
    args : tuple
        ``((X, y_high, y_low, classifier, scorer, method), boot_n)``.
        ``X`` is a DataFrame with one column per feature; ``y_high`` and
        ``y_low`` are 2-D arrays indexed ``[row, sample]`` (rows presumably
        correspond to regions -- confirm with the caller). ``boot_n``
        identifies the bootstrap iteration in the output.

    Returns
    -------
    list or None
        With ``method == 'combinatorial'``: rows
        ``[score, step, feature_name, row_index, boot_n]``, one per
        selection step for each row of ``y_high``. Any other method yields
        an empty list. ``None`` signals a failure, which is also reported
        through ``warnings.warn``.
    """
    import warnings

    results = None
    try:
        (X, y_high, y_low, classifier, scorer, method), boot_n = args

        # Fresh seed per call so parallel workers draw different samples.
        np.random.seed()
        n_samples = X.shape[0]
        ran_index = np.random.choice(n_samples, n_samples)

        from neurosynth.analysis.classify import regularize

        # Bootstrap sample X & y
        X = X.iloc[ran_index, :]
        y_high = pd.DataFrame(y_high[:, ran_index])
        y_low = pd.DataFrame(y_low[:, ran_index])

        feature_names = X.columns.tolist()
        n_topics = len(feature_names)

        X = regularize(X, method='scale')

        results = []
        for reg_i, reg_y_high in y_high.iterrows():
            # Exclude samples that are set in y_low but not in y_high.
            reg_ix = ((y_low.iloc[reg_i, :] == True) &
                      (reg_y_high == False)) == False
            reg_y = reg_y_high[reg_ix].astype('int')
            reg_X = X[reg_ix.values, :]

            if method != 'combinatorial':
                continue

            ix = []  # Feature order index
            # list(...) so .remove() also works where range() is lazy.
            remaining = list(range(0, n_topics))

            for i in range(0, n_topics):
                test_results = []
                for new_feat in remaining:
                    X_1 = reg_X[:, ix + [new_feat]]
                    output = classify.classify(
                        X_1, reg_y.values, classifier=classifier,
                        cross_val='4-Fold', scoring=scorer,
                        output='summary')
                    test_results.append(
                        [output['score'], i, feature_names[new_feat],
                         reg_i, boot_n, new_feat])

                test_results = pd.DataFrame(test_results)
                # .iloc replaces the removed DataFrame.ix indexer.
                scores = test_results.iloc[:, 0]
                # Select the first top-scoring row as a one-row frame; the
                # previous ``winner.iloc[0]`` collapsed ties into a Series
                # and made the whole iteration fail.
                winner = test_results[scores == scores.max()].iloc[[0]]

                results.append(list(winner.values[0])[0:5])
                best_feat = int(winner.iloc[0, 5])
                remaining.remove(best_feat)
                ix.append(best_feat)
    except Exception as err:
        # Best-effort worker: report and return None, without swallowing
        # KeyboardInterrupt/SystemExit as the old ``finally: return`` did.
        warnings.warn('something went wrong: {0!r}'.format(err))
        results = None
    return results
def bootstrap_mv_parallel(args):
    """Resample ``(X, y)`` with replacement once and run the requested
    feature analysis on the resample.

    Parameters
    ----------
    args : tuple
        ``((X, y, classifier, scorer, fis, feature_names, method, reg),
        boot_n)``. ``method`` is ``'sequential'`` (features added in order
        of ``abs(fis)``), ``'combinatorial'`` (greedy forward selection) or
        ``'shannons'``. ``reg`` and ``boot_n`` are copied into the output
        rows as labels.

    Returns
    -------
    list or None
        ``'sequential'``: rows ``[score, n_features, reg, feature_name]``.
        ``'combinatorial'``: rows
        ``[score, step, feature_name, reg, boot_n]``.
        ``'shannons'``: ``[shannons(odds_ratios), reg, boot_n]``.
        ``None`` on any failure, including an unknown ``method``; the
        failure is reported through ``warnings.warn``.
    """
    import warnings

    results = None
    try:
        (X, y, classifier, scorer, fis, feature_names, method,
         reg), boot_n = args

        n_topics = X.shape[1]

        # Fresh seed per call so parallel workers draw different samples.
        np.random.seed()
        n_samples = X.shape[0]
        ran_index = np.random.choice(n_samples, n_samples)

        # Bootstrap sample X & y
        X = X[ran_index, :]
        y = y[ran_index]

        def summary_score(columns):
            """Return the 4-fold summary score using ``columns`` of X."""
            output = classify.classify(
                X[:, columns], y, classifier=classifier,
                cross_val='4-Fold', scoring=scorer, output='summary')
            return output['score']

        if method == 'sequential':
            # Ranking computed once rather than on every iteration.
            order = np.abs(fis).argsort()[::-1]
            # NOTE(review): stops at n_topics - 1 features -- confirm.
            results = [
                [summary_score(order[0:i]), i, reg,
                 feature_names[order[i - 1]]]
                for i in range(1, n_topics)]

        elif method == 'combinatorial':
            results = []
            ix = []  # Feature order index
            # list(...) so .remove() also works where range() is lazy.
            remaining = list(range(0, n_topics))

            for i in range(0, n_topics):
                test_results = pd.DataFrame([
                    [summary_score(ix + [new_feat]), i,
                     feature_names[new_feat], reg, boot_n, new_feat]
                    for new_feat in remaining])

                # .iloc replaces the removed DataFrame.ix indexer.
                scores = test_results.iloc[:, 0]
                # Select the first top-scoring row as a one-row frame; the
                # previous ``winner.iloc[0]`` collapsed ties into a Series
                # and made the whole iteration fail.
                winner = test_results[scores == scores.max()].iloc[[0]]

                results.append(list(winner.values[0])[0:5])
                best_feat = int(winner.iloc[0, 5])
                remaining.remove(best_feat)
                ix.append(best_feat)

        elif method == 'shannons':
            from base.statistics import shannons
            clf = classify.classify(
                X, y, classifier=classifier, cross_val='4-Fold',
                scoring=scorer, output='clf')
            odds_ratios = np.log(clf.clf.theta_[1] / clf.clf.theta_[0])
            # Shift so the smallest value is slightly above zero.
            odds_ratios -= (odds_ratios.min() - 0.000001)
            results = [shannons(odds_ratios), reg, boot_n]

        else:
            raise ValueError('unknown method: {0!r}'.format(method))
    except Exception as err:
        # Best-effort worker: report and return None, without swallowing
        # KeyboardInterrupt/SystemExit as the old ``finally: return`` did.
        warnings.warn('something went wrong: {0!r}'.format(err))
        results = None
    return results
def classify(self, features=None, scoring='accuracy', X_threshold=None, feat_select=None, processes=1, class_weight='auto', dummy=None):
    """Classify every entry of ``self.mask_pairs`` and store the results.

    features, X_threshold: forwarded to ``self.load_data``.
    scoring, feat_select, class_weight: forwarded to ``classify_parallel``
        and, when ``dummy`` is given, to the dummy classification.
    processes: a ``Pool`` of this size is used when > 1; otherwise the work
        runs serially.
    dummy: ``DummyClassifier`` strategy; when not None a dummy score is
        computed per pair and subtracted in ``self.final_score``.

    Fills ``self.class_score``, ``self.feature_importances`` (when the
    stored classifier exposes ``coef_`` or ``feature_importances_``),
    optionally ``self.fit_clfs``, ``self.features_selected`` and
    ``self.dummy_score``, and finally ``self.final_score``.
    """
    self.load_data(features, X_threshold)
    self.initalize_containers(features, feat_select, dummy)

    print("Classifying...")
    pb = tools.ProgressBar(len(list(self.mask_pairs)), start=True)

    if processes > 1:
        pool = Pool(processes=processes)
    else:
        # The itertools module stands in for a pool: only ``.imap`` is
        # called on it below.
        pool = itertools

    try:
        filename = self.c_data.filename

        # Each work item pairs the shared settings tuple with one mask pair.
        shared = (self.classifier, self.param_grid, scoring, filename,
                  feat_select, self.mask_num, class_weight)
        jobs = itertools.izip(itertools.repeat(shared), self.mask_pairs)

        for output in pool.imap(classify_parallel, jobs):
            index = output['index']
            self.class_score[index] = output['score']

            # Fitted classifiers are only retained when memsave is off.
            if self.memsave is False:
                self.fit_clfs[index] = output['clf']

            # Prefer linear coefficients, fall back to feature_importances_,
            # and store nothing if neither is available.
            if self.param_grid:  # Just get the FIs if you used a grid
                try:
                    self.feature_importances[index] = \
                        self.fit_clfs[index].best_estimator_.coef_[0]
                except AttributeError:
                    try:
                        # Fixed: the attribute is ``best_estimator_``. The
                        # missing trailing underscore made this lookup fail
                        # every time, so these importances were never kept.
                        self.feature_importances[index] = \
                            self.fit_clfs[index].best_estimator_.feature_importances_
                    except AttributeError:
                        pass
            else:
                try:
                    self.feature_importances[index] = \
                        self.fit_clfs[index].clf.coef_[0]
                except AttributeError:
                    try:
                        self.feature_importances[index] = \
                            self.fit_clfs[index].clf.feature_importances_
                    except AttributeError:
                        pass

            if feat_select:
                self.features_selected[index] = output['features_selected']

            if dummy is not None:
                X, y = self.c_data[index]
                # Baseline run on the same data with a DummyClassifier.
                output = classify.classify(
                    X, y, classifier=DummyClassifier(strategy=dummy),
                    cross_val='4-Fold', class_weight=class_weight,
                    scoring=scoring, feat_select=feat_select)
                self.dummy_score[index] = output['score']

            pb.next()
    finally:
        # Only a real Pool needs shutting down.
        if processes > 1:
            pool.close()
            pool.join()

    if dummy is None:
        self.final_score = self.class_score
    else:
        self.final_score = self.class_score - self.dummy_score
def classify(self, features=None, scoring='accuracy', dummy=True, X_threshold=None):
    """Run pairwise classification over all ordered pairs of
    ``self.masklist``.

    features: feature names to use; when falsy the names come from
        ``self.dataset.get_feature_names()``.
    scoring: scoring argument forwarded to ``classify.classify``.
    dummy: when true, ``self.final_score`` is the classifier score minus the
        dummy score; otherwise it is the classifier score alone.
    X_threshold: when not None, X is passed through ``binarize`` with this
        threshold before being cached.

    Populates ``self.c_data``, ``self.fit_clfs``, ``self.class_score``,
    ``self.dummy_score``, ``self.feature_importances`` (plus
    ``self.feature_ranking`` for RFE classifiers), then sets
    ``self.final_score`` and ``self.status = 1``.
    """
    iters = list(itertools.permutations(self.masklist, 2))
    prog = 0.0
    total = len(iters)

    self.update_progress(0)

    if features:
        self.feature_names = features
    else:
        self.feature_names = self.dataset.get_feature_names()

    # Make feature importance grid w/ masked diagonals
    grid_shape = (self.mask_num, self.mask_num, len(self.feature_names))
    self.feature_importances = np.ma.masked_array(np.zeros(grid_shape))

    i, j, k = np.meshgrid(
        *map(np.arange, self.feature_importances.shape), indexing='ij')
    self.feature_importances.mask = (i == j)

    for pairs in iters:
        index = (pairs[0][1], pairs[1][1])  # Tuple numeric index of pairs
        names = [pairs[0][0], pairs[1][0]]  # Actual paths to masks

        if self.c_data[index] is None:
            X, y = classify.get_studies_by_regions(
                self.dataset, names, threshold=self.thresh,
                features=features, regularization='scale')

            if X_threshold is not None:
                X = binarize(X, X_threshold)

            self.c_data[index] = (X, y)

        # Fetch the data from the cache on every iteration: before this, a
        # cache hit reused the previous pair's X and y (or failed with an
        # unbound name on the first pair).
        X, y = self.c_data[index]

        if isinstance(self.classifier, RFE):
            self.classifier.fit(X, y)
            self.fit_clfs[index] = self.classifier
            self.class_score[index] = self.classifier.score(X, y)
            self.feature_importances[index] = \
                self.classifier.estimator_.coef_[0]
            self.feature_ranking[index] = self.classifier.ranking_
        else:
            output = classify.classify(
                X, y, classifier=self.classifier, output='summary_clf',
                cross_val='4-Fold', class_weight='auto', scoring=scoring,
                param_grid=self.param_grid)

            self.class_score[index] = output['score']
            fitted = output['clf'].fit(X, y)
            self.fit_clfs[index] = fitted

            # After a grid search the model to inspect is
            # ``best_estimator_``; the previous fallback read
            # ``feature_importances_`` from the search wrapper and so never
            # found it.
            model = fitted
            if self.param_grid:
                model = getattr(fitted, 'best_estimator_', fitted)

            # Prefer linear coefficients, fall back to feature_importances_,
            # and store nothing if the model exposes neither.
            try:
                self.feature_importances[index] = model.coef_[0]
            except AttributeError:
                try:
                    self.feature_importances[index] = \
                        model.feature_importances_
                except AttributeError:
                    pass

        self.dummy_score[index] = classify.classify_regions(
            self.dataset, names, method='Dummy',
            threshold=self.thresh)['score']

        prog = prog + 1
        self.update_progress(int(prog / total * 100))

    # Scores that are exactly 0 are masked out.
    self.class_score = np.ma.masked_array(
        self.class_score, self.class_score == 0)
    self.dummy_score = np.ma.masked_array(
        self.dummy_score, self.dummy_score == 0)

    if dummy:
        self.final_score = self.class_score - self.dummy_score
    else:
        self.final_score = self.class_score

    self.status = 1