def select_data(self, target_data, size):
    """Return a copy of ``target_data`` restricted to ``size`` sampled rows.

    A single ``Bootstrap`` split with ``train_size = n - 1`` supplies the
    candidate indices; the first ``size`` of them are passed to
    ``target_data.make_copy_from_selection``.

    :param target_data: object supporting ``len()`` and
        ``make_copy_from_selection(indices)``.
    :param size: number of indices to keep; must not exceed
        ``len(target_data) - 1``.
    :return: whatever ``make_copy_from_selection`` returns for the chosen
        indices.
    :raises AssertionError: if ``size`` is larger than ``len(target_data) - 1``.
    """
    n = len(target_data)
    # Only n - 1 training indices are generated below, so no more can be kept.
    assert size <= n - 1
    bs = Bootstrap(n, 1, train_size=n - 1)
    # ``next(iter(...))`` works on Python 2.6+ and Python 3, whereas the
    # previous ``bs.__iter__().next()`` spelling exists only on Python 2.
    train_index, _ = next(iter(bs))
    inds = list(train_index)[:size]
    return target_data.make_copy_from_selection(inds)
def select_data(self, target_data, size):
    """Pick ``size`` bootstrap-sampled entries of ``target_data`` and copy them.

    One ``Bootstrap`` iteration over ``len(target_data)`` items is drawn with
    ``train_size = n - 1``; the leading ``size`` training indices select the
    rows handed to ``target_data.make_copy_from_selection``.

    :param target_data: container exposing ``__len__`` and
        ``make_copy_from_selection(indices)``.
    :param size: how many indices to keep (at most ``len(target_data) - 1``).
    :return: the result of ``make_copy_from_selection`` for those indices.
    :raises AssertionError: when ``size`` exceeds ``len(target_data) - 1``.
    """
    n = len(target_data)
    # The split below yields n - 1 training indices, which bounds ``size``.
    assert size <= n - 1
    bs = Bootstrap(n, 1, train_size=n - 1)
    # Use the builtin ``next`` rather than the Python-2-only ``.next()``
    # method so the code runs unchanged on Python 3 as well.
    train_index, _ = next(iter(bs))
    inds = list(train_index)[:size]
    return target_data.make_copy_from_selection(inds)
def run(sc):
    """Train a bagged decision-tree ensemble through Spark and score it.

    A synthetic classification problem is generated and split into train and
    test parts.  One ``DecisionTreeClassifier`` is fitted per ``Bootstrap``
    sub-sample of the training part, every fitted model predicts the test
    part, the predictions are summed into a vote matrix and the class with
    the most votes wins.

    :param sc: Spark context used to distribute the sub-samples (only
        ``sc.parallelize`` is called on it).
    :return: ``accuracy_score`` of the majority vote on the test part.
    """

    def zero_matrix(n, m):
        """Return an ``n`` x ``m`` integer matrix of zeros."""
        return np.zeros((n, m), dtype=int)

    def vote_increment(y_est):
        """Return a test point x class matrix with 1s marking the prediction."""
        # Predictions are used directly as column indices, so class labels
        # are assumed to be 0 .. n_ys - 1 -- TODO confirm for other datasets.
        increment = zero_matrix(y_est.size, n_ys)
        increment[np.arange(y_est.size), y_est] = 1
        return increment

    X, y = make_classification()
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    n_test = X_test.shape[0]
    n_ys = np.unique(y_train).size

    model = DecisionTreeClassifier()

    def predict_from_sample(split):
        """Fit on one bootstrap sub-sample and predict the test points."""
        # Each element is a (train_index, test_index) pair; only the first
        # half is used.  Plain indexing replaces the Python-2-only
        # ``lambda (index, _): ...`` tuple-parameter syntax.
        train_index = split[0]
        fitted = model.fit(X_train[train_index], y_train[train_index])
        return fitted.predict(X_test)

    # Partition the training data into random sub-samples.  The indices are
    # drawn over the training part only: drawing them over the full data set
    # (as before) let held-out test rows leak into the fitted models.
    samples = sc.parallelize(Bootstrap(y_train.size))

    # Train a model for each sub-sample and apply it to the test data.
    vote_tally = (samples.map(predict_from_sample)
                  .map(vote_increment)
                  .fold(zero_matrix(n_test, n_ys), np.add))

    # Take the learner majority vote.
    y_estimate_vote = np.argmax(vote_tally, axis=1)
    return accuracy_score(y_test, y_estimate_vote)
def meta_extractor(targetfile, metafile, yfolds=5, zfolds=10):
    """Compute nested cross-validation splits and save them to ``metafile``.

    ``targetfile`` is loaded with ``np.load`` and must provide the arrays
    ``ymap`` and ``Z``.  ``yfolds`` stratified folds are computed over
    ``ymap``; for each outer training fold, ``zfolds`` bootstrap sub-splits of
    the matching rows of ``Z`` are drawn, keeping only sub-splits in which
    every column of ``Z`` has a strictly positive sum in both the sub-train
    and the sub-test part.

    The result is written with ``np.savez`` under the keys ``ycv``,
    ``train``, ``test``, ``zcv``, ``subtrain`` and ``subtest``.

    :param targetfile: path (or file object) accepted by ``np.load``.
    :param metafile: destination accepted by ``np.savez``.
    :param yfolds: number of outer stratified folds.
    :param zfolds: number of bootstrap sub-splits per outer fold.
    """
    # setup CV sets
    resultdict = {}

    # load data
    print("--loading data")
    target = np.load(targetfile)
    y = target['ymap']
    Z = target['Z']

    # compute cv
    print("--computing cv sets")
    TRAIN, TEST = [], []
    # NOTE(review): the third positional argument is presumably the legacy
    # ``indices=False`` flag (boolean masks instead of index arrays) --
    # confirm against the installed scikit-learn version.
    skf = StratifiedKFold(y, yfolds, False)
    for train, test in skf:
        TRAIN.append(np.array(train, dtype=bool))
        TEST.append(np.array(test, dtype=bool))
    train = np.vstack(TRAIN)
    test = np.vstack(TEST)
    resultdict['ycv'] = yfolds
    resultdict["train"] = train
    resultdict["test"] = test

    # compute Z cv
    # subtrain[ix, jx] = indexes corresponding to trainset ix, subtrain set jx
    # subtest[ix, jx] = indexes corresponding to trainset ix, subtest set jx
    print("--computing Z cv sets")
    Nt = int(len(y) * ((yfolds - 1) / float(yfolds)))
    Kt = int(Nt * ((zfolds - 1) / float(zfolds)))
    Ke = Nt - Kt
    # Bootstrap yields integer index arrays, so the containers must be
    # integer typed: with the former ``dtype=bool`` every non-zero index
    # collapsed to ``True`` and the stored splits could not be recovered.
    SUBTRAIN = np.zeros((yfolds, zfolds, Kt), dtype=int)
    SUBTEST = np.zeros((yfolds, zfolds, Ke), dtype=int)
    for idx, trainset in enumerate(train):  # each training set
        Zt = Z[trainset]
        # NOTE(review): Kt and Ke derive from the nominal fold size above;
        # if len(Zt) differs from it the requested sizes may not fit.
        Nt = len(Zt)
        skf = Bootstrap(Nt, 10000, Kt, Ke)
        for fold in range(zfolds):
            # Scan candidate sub-splits and accept the first one in which
            # every column of Z is represented on both sides.  If none of
            # the candidates qualifies, the slot keeps its all-zero content.
            for ztrain, ztest in skf:
                ntrain = Zt[ztrain].sum(axis=0)
                ntest = Zt[ztest].sum(axis=0)
                if np.all(ntrain > 0) and np.all(ntest > 0):
                    # accept this
                    SUBTRAIN[idx, fold] = ztrain
                    SUBTEST[idx, fold] = ztest
                    break
    resultdict['zcv'] = zfolds
    resultdict["subtrain"] = SUBTRAIN
    resultdict["subtest"] = SUBTEST

    # save results
    print("--save final result")
    np.savez(metafile, **resultdict)
def bootstrap_seqs(seq, n_iter, subsize, random_state=0):
    """Draw ``n_iter`` bootstrap sub-sequences from ``seq``.

    seq = the sequence to be selected from
    n_iter = number of sub sequences
    subsize = length of sub sequences (passed to ``Bootstrap`` as
              ``train_size``)
    random_state = seed forwarded to ``Bootstrap``

    return list with one numpy array per sub sequence
    """
    splitter = Bootstrap(len(seq),
                         n_iter=n_iter,
                         train_size=subsize,
                         random_state=random_state)
    # Only the first ("train") half of each split is used.
    picked = [train_part for train_part, _unused in splitter]
    values = np.asarray(seq)
    subsequences = []
    for positions in picked:
        subsequences.append(values[positions])
    return subsequences
def run_bootstrap_model(df_cv, model, best_features, X_fut, ref_column):
    """Refit ``model`` on 1000 bootstrap resamples and predict ``X_fut`` each time.

    :param df_cv: DataFrame holding both the feature columns and the
        reference column.
    :param model: estimator whose ``fit(X, y)`` returns an object with
        ``predict``.
    :param best_features: column selection used as model input.
    :param X_fut: data passed to ``predict`` after every refit.
    :param ref_column: column of ``df_cv`` used as the fit target.
    :return: DataFrame with one column per bootstrap iteration (labelled
        ``0 .. 999``) and one row per predicted value.  Assumes ``predict``
        returns a 1-D result -- TODO confirm for the estimators in use.
    """
    n_rows = len(df_cv)
    bs = Bootstrap(n_rows,
                   n_iter=1000,
                   train_size=int(n_rows * 3 / 4),
                   test_size=int(n_rows * 1 / 4))
    predictions = []
    for train_index, _ in bs:
        # Bootstrap yields row positions, so select positionally.  The
        # former ``.ix`` accessor falls back to label lookup on integer
        # indexes and has been removed from current pandas releases.
        resampled = df_cv.iloc[train_index]
        mdl = model.fit(resampled[best_features], resampled[ref_column])
        predictions.append(mdl.predict(X_fut))
    # Build the frame once (iterations become columns) instead of inserting
    # 1000 columns one at a time.
    df_pred = pd.DataFrame(predictions).T
    return df_pred
sparse=True) validation_features = dio.join_features("%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns, extra_valid_features, sparse=True) print features.shape print validation_features.shape salaries = dio.get_salaries(type_n, log=True) if not submission: valid_salaries = dio.get_salaries(type_v, log=True) print salaries.shape bs = Bootstrap(len(salaries), random_state=45, train_size=0.6) train_index, test_index = next(iter(bs)) param = """Normal count vector with max 200. New submission which is repeatable. and nicer Bag of Words: %s\n Encoded cols: %s\n Logged vectorizer = TfidfVectorizer( sublinear_tf=True, max_df=0.5, stop_words='english'
import numpy as np
from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, svm, pipeline
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier

# Script: fit an RBF-feature-map + SGD pipeline on bootstrap sub-samples in
# parallel through Spark and print the mean test-set score.
if __name__ == '__main__':
    conf = SparkConf()
    conf.setMaster("spark://172.18.109.87:7077")
    # conf.setMaster("local")
    conf.setAppName("spark_svm")
    conf.set("spark.executor.memory", "12g")
    sc = SparkContext(conf=conf)

    X, y = make_classification(n_samples=10000, n_features=30, n_classes=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    # Draw the sub-samples from the training part only; indexing the full
    # data set (as before) let held-out test rows leak into the fits.
    # The splitter is kept in a variable so its length can be reused below
    # instead of constructing a second Bootstrap just to call len() on it.
    bootstrap = Bootstrap(y_train.size)
    samples = sc.parallelize(bootstrap)

    feature_map_fourier = RBFSampler(gamma=.2, random_state=1)
    fourier_approx_svm = pipeline.Pipeline([("feature_map", feature_map_fourier),
                                            ("svm", SGDClassifier())])
    fourier_approx_svm.set_params(feature_map__n_components=700)

    def fit_and_score(split):
        """Fit the pipeline on one bootstrap sub-sample, score on the test set."""
        # ``split`` is a (train_index, test_index) pair; plain indexing
        # replaces the Python-2-only ``lambda (index, _): ...`` syntax.
        train_index = split[0]
        fitted = fourier_approx_svm.fit(X_train[train_index],
                                        y_train[train_index])
        return fitted.score(X_test, y_test)

    # Sum the per-sample scores (reducer arguments no longer shadow ``y``).
    results = samples.map(fit_and_score).reduce(lambda a, b: a + b)

    final_results = results / len(bootstrap)
    print(final_results)
# For every target column, score two classifiers on a held-out 10% split:
# a majority vote over 19 bootstrap sub-sample fits, and a single fit on the
# whole training part.  Both results are appended to the same lists.
precisions = []
recalls = []
model_start = time.time()
for targetColumn in topFeatures:
    parsedData = getXY(complete_matrix, int(targetColumn))
    X_train, X_test, y_train, y_test = train_test_split(parsedData[0], parsedData[1], train_size=0.9)
    n_test = X_test.shape[0]

    model = BernoulliNB()
    # Alternative estimators tried here:
    #model = MultinomialNB()
    #model = KNeighborsClassifier()
    #model = linear_model.SGDClassifier()
    #model = linear_model.LogisticRegressionCV()
    #model = svm.LinearSVC()

    # Ensemble: one fit per bootstrap sub-sample, spread over 8 partitions.
    # Each element is a (train_index, test_index) pair, of which only the
    # first half is used; plain indexing replaces the Python-2-only
    # ``lambda (index, _): ...`` tuple-parameter syntax.
    samples = sc.parallelize(Bootstrap(X_train.shape[0], n_iter=19, train_size=0.5), 8)
    vote_result = samples.map(lambda split: model.fit(X_train[split[0]], y_train[split[0]]).predict(X_test)).map(vote_increment).fold(zero_matrix(n_test, n_ys), numpy.add)
    y_estimate_vote = numpy.argmax(vote_result, axis=1)
    precisions.append(precision_score(y_test, y_estimate_vote))
    recalls.append(recall_score(y_test, y_estimate_vote))

    # Baseline: a single "sample" made of every training row, pushed through
    # the same voting pipeline.
    samples = sc.parallelize([numpy.arange(X_train.shape[0])])
    vote_result = samples.map(lambda index: model.fit(X_train[index], y_train[index]).predict(X_test)).map(vote_increment).fold(zero_matrix(n_test, n_ys), numpy.add)
    y_estimate_vote = numpy.argmax(vote_result, axis=1)
    precisions.append(precision_score(y_test, y_estimate_vote))
    recalls.append(recall_score(y_test, y_estimate_vote))

end = time.time()
# Elapsed minutes.  The whole quotient is parenthesised so the line means
# the same on Python 2 and 3 (``print (a - b) / 60`` would be parsed as
# ``print(...) / 60`` on Python 3 and raise TypeError).
print((end - model_start) / 60)
# NOTE(review): these two results are not stored or printed; they are only
# visible when the code is run interactively.
numpy.mean(precisions)
numpy.mean(recalls)
from sklearn.cross_validation import train_test_split, Bootstrap
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier


def zero_matrix(n, m):
    """Return an ``n`` x ``m`` integer matrix of zeros."""
    return np.zeros((n, m), dtype=int)


def vote_increment(y_est):
    """Return a test point x class matrix with 1s marking the estimator prediction."""
    # Predictions are used directly as column indices, so class labels are
    # assumed to be 0 .. n_ys - 1 -- TODO confirm for other datasets.
    increment = zero_matrix(y_est.size, n_ys)
    increment[np.arange(y_est.size), y_est] = 1
    return increment


X, y = make_classification()
X_train, X_test, y_train, y_test = train_test_split(X, y)

n_test = X_test.shape[0]
n_ys = np.unique(y_train).size

model = BernoulliNB()


def predict_from_sample(split):
    """Fit ``model`` on one bootstrap sub-sample and predict the test points."""
    # ``split`` is a (train_index, test_index) pair; plain indexing replaces
    # the Python-2-only ``lambda (index, _): ...`` tuple-parameter syntax.
    train_index = split[0]
    return model.fit(X_train[train_index], y_train[train_index]).predict(X_test)


# Partition the training data into random sub-samples.  Indices are drawn
# over the training part only; drawing them over the full data set (as
# before) let held-out test rows leak into the fitted models.
samples = sc.parallelize(Bootstrap(y_train.size))

# Train a model for each sub-sample and apply it to the test data.
vote_tally = (samples.map(predict_from_sample)
              .map(vote_increment)
              .fold(zero_matrix(n_test, n_ys), np.add))

# Take the learner majority vote.
y_estimate_vote = np.argmax(vote_tally, axis=1)
print(accuracy_score(y_test, y_estimate_vote))
makePred = False #set to true if you want to make predictions getConM = True #set to true to get the confusion matrix using our optimal parameters with open("./DataSetWithDictionarys/trainingSetX.txt", "rb") as trainingFileX: trainingX = pickle.load(trainingFileX) with open("./DataSetWithDictionarys/trainingSetY.txt", "rb") as trainingFileY: trainingY = pickle.load(trainingFileY) with open("./DataSetWithDictionarys/validationSet.txt", "rb") as validationFile: validationSet = pickle.load(validationFile) bs = Bootstrap(trainingX.shape[0], n_iter=NUMIT, random_state=0) kBest = SelectKBest(chi2, k=NUMFT) if makePred: for nb in NUMNB: for s in sigma: acc = np.array([]) acpcl = np.array([0.0, 0.0, 0.0, 0.0]) pre = np.array([0.0, 0.0, 0.0, 0.0]) rec = np.array([0.0, 0.0, 0.0, 0.0]) f1score = np.array([0.0, 0.0, 0.0, 0.0]) for train_index, test_index in bs: [accuraccy, accpcl, precision, recall, f1] = (bootstrap(train_index, test_index, nb, trainingX, trainingY, kBest, s)) acc = np.append(acc, accuraccy)