def initialize(self, X, k, random_seed, method='naive'):
    """Pick ``k`` initial centroids from the samples in ``X``.

    Parameters
    ----------
    X : array-like
        Samples, indexable by integer position along the first axis.
    k : int
        Number of centroids to return.
    random_seed : int, numpy.random.RandomState or None
        Seed (or generator) driving every random draw made here.
    method : {'naive', 'kmeans++'}
        ``'naive'`` draws ``k`` distinct samples uniformly at random.
        ``'kmeans++'`` follows
        https://en.wikipedia.org/wiki/K-means%2B%2B.

    Returns
    -------
    The ``k`` chosen samples, stacked along the first axis.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'naive':
        # Randomly pick k data points to be the centroids of the k clusters
        return resample(X, n_samples=k, random_state=random_seed, replace=False)
    if method != 'kmeans++':
        raise ValueError("Unknown initialization method: %r" % (method,))

    # One generator for the whole procedure so that the same seed always
    # reproduces the same centroids.
    if isinstance(random_seed, np.random.RandomState):
        rng = random_seed
    else:
        rng = np.random.RandomState(random_seed)

    # Step 1: Choose one center uniformly at random from among the data points
    centroids = resample(X, n_samples=1, random_state=rng, replace=False)

    N = len(X)
    # One flat float row per sample: the norm of a difference equals the
    # Euclidean norm of the flattened difference.
    points = np.asarray(X, dtype=float).reshape(N, -1)
    first_center = np.asarray(centroids[0], dtype=float).ravel()

    # Step 2: D(x)^2, the squared distance from each point to the nearest
    # center chosen so far. It is updated incrementally below, so each round
    # only measures distances to the newest center.
    nearest_sq = np.sum((points - first_center) ** 2, axis=1)

    for _ in range(1, k):
        # Step 3: Choose a new center with probability proportional to
        # D(x)^2. Already selected points have probability 0.
        total_sq = nearest_sq.sum()
        if total_sq > 0:
            probabilities = nearest_sq / total_sq
        else:
            # Every point coincides with a chosen center; fall back to a
            # uniform draw rather than dividing by zero.
            probabilities = np.full(N, 1.0 / N)
        new_centroid_index = rng.choice(N, p=probabilities)
        centroids = np.append(centroids, [X[new_centroid_index]], axis=0)
        nearest_sq = np.minimum(
            nearest_sq,
            np.sum((points - points[new_centroid_index]) ** 2, axis=1),
        )
    return centroids
def run_scikit_digits(epochs=0, layers=0, neuron_count=0):
    """
    Train a network on Scikit-Learn's Handwritten Digits dataset, pickle it
    to ``my_net.pickle`` and return its guesses for the test slice.

    The resampled data is sliced positionally: the first 1250 rows are used
    for training and the following 260 rows for testing.

    NOTE(review): ``utils.resample`` is called with its default arguments,
    which presumably draws WITH replacement, so the same digit may appear in
    both the training and the testing slice -- confirm this is intended.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop for the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count
        output layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not
        count the bias term

    Returns
    -------
    Whatever ``network.run_unseen`` returns for the testing slice.
    """
    temp_digits = datasets.load_digits()
    # The same random_state is used for both calls so that rows of `digits`
    # and entries of `temp_answers` stay aligned.
    digits = utils.resample(temp_digits.data, random_state=3)
    temp_answers = utils.resample(temp_digits.target, random_state=3)

    num_of_training_vectors = 1250
    num_of_testing_vectors = 260
    test_end = num_of_training_vectors + num_of_testing_vectors

    answers = temp_answers[:num_of_training_vectors]
    training_set = digits[:num_of_training_vectors]
    testing_set = digits[num_of_training_vectors:test_end]

    network = Network(layers, neuron_count, training_set[0])
    network.train(training_set, answers, epochs)

    # Context manager guarantees the file is closed even if dumping fails.
    with open("my_net.pickle", "wb") as f:
        dill.dump(network, f)

    return network.run_unseen(testing_set)
def test_resample():
    """Check ``resample`` on empty input, invalid arguments and oversampling."""
    # Border case not worth mentioning in doctests
    assert resample() is None

    # Check that invalid arguments yield ValueError
    invalid_calls = [
        (([0], [0, 1]), {}),
        (([0, 1], [0, 1]), {'replace': False, 'n_samples': 3}),
        (([0, 1], [0, 1]), {'meaning_of_life': 42}),
    ]
    for positional, keywords in invalid_calls:
        assert_raises(ValueError, resample, *positional, **keywords)

    # Issue:6581, n_samples can be more when replace is True (default).
    oversampled = resample([1, 2], n_samples=5)
    assert_equal(len(oversampled), 5)
def resample_training_dataset(self, labels, feature_array, sizes = (5000,500)):
    """
    Resample every class so that it contributes the requested number of rows.

    Inputs:
        - labels: 1-D or (n, 1) array of class labels
        - feature_array: array with one row per entry of ``labels``
        - sizes: tuple giving, for each class in ascending label order
          (0, 1, etc), the number of training chunks you want.
          i.e for 500 seizures, 5000 baseline, sizes = (5000, 500),
          as 0 is baseline, 1 is Seizure

    A class with fewer rows than requested is duplicated whole as many times
    as fits and then topped up with rows drawn without replacement; a class
    with more rows than requested is sub-sampled without replacement; a class
    that already has the requested size is passed through untouched.

    Returns:
        (resampled_labels, resampled_features), classes stacked in ascending
        label order; labels are returned as an (n, 1) column.

    WARNING: Up-sampling target class prevents random forest oob from being
    accurate.
    """
    if len(labels.shape) == 1:
        labels = labels[:, None]

    resampled_labels = []
    resampled_features = []
    for class_pos, label in enumerate(np.unique(labels.astype('int'))):
        class_inds = np.where(labels == label)[0]
        class_labels = labels[class_inds]
        class_features = feature_array[class_inds, :]
        n_available = class_features.shape[0]
        n_wanted = sizes[class_pos]

        if n_available < n_wanted:
            # Oversample: whole copies first, then a partial top-up.
            n_copies = int(n_wanted // n_available)
            class_features_duplicated = np.concatenate([class_features] * n_copies, axis=0)
            class_labels_duplicated = np.concatenate([class_labels] * n_copies, axis=0)
            n_extra_needed = n_wanted - class_labels_duplicated.shape[0]
            # The shared random_state keeps the two draws row-aligned.
            extra_features = resample(class_features, n_samples=n_extra_needed,
                                      random_state=7, replace=False)
            extra_labels = resample(class_labels, n_samples=n_extra_needed,
                                    random_state=7, replace=False)
            boot_array = np.vstack([class_features_duplicated, extra_features])
            boot_labels = np.vstack([class_labels_duplicated, extra_labels])
        elif n_available > n_wanted:
            # Undersample.
            boot_array = resample(class_features, n_samples=n_wanted,
                                  random_state=7, replace=False)
            boot_labels = resample(class_labels, n_samples=n_wanted,
                                   random_state=7, replace=False)
        else:
            logging.debug('label '+str(label)+ ' had exact n as sample, doing nothing!')
            boot_array = class_features
            boot_labels = class_labels

        resampled_features.append(boot_array)
        resampled_labels.append(boot_labels)

    # stack both up...
    resampled_labels = np.vstack(resampled_labels)
    resampled_features = np.vstack(resampled_features)
    logging.debug('Original label counts: '+str(pd.Series(labels[:,0]).value_counts()))
    logging.debug('Resampled label counts: '+str(pd.Series(resampled_labels[:,0]).value_counts()))
    return resampled_labels, resampled_features
def run_mnist(epochs, layers, neuron_count):
    """
    Train a network on ``train.csv`` and write one guess per line of
    ``test.csv`` into ``digits.txt``.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop for the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count
        output layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not
        count the bias term
    """
    def _load_int_rows(path):
        # Parse a CSV file into rows of ints, skipping its first (header) row.
        with open(path, 'r') as handle:
            raw_rows = list(csv.reader(handle))
        return [[int(cell) for cell in row] for row in raw_rows[1:]]

    train = _load_int_rows('train.csv')
    test_set = _load_int_rows('test.csv')

    # First column is the answer, the remaining columns are the inputs.
    ans_train = [row[0] for row in train]
    train_set = [row[1:] for row in train]
    # The first parsed example is discarded from both lists.
    del ans_train[0]
    del train_set[0]

    # Identical random_state keeps inputs and answers aligned.
    train_set = utils.resample(train_set, random_state=2)
    ans_train = utils.resample(ans_train, random_state=2)

    network = Network(layers, neuron_count, train_set[0])
    network.train(train_set, ans_train, epochs)

    guess_list = network.run_unseen(test_set)
    with open('digits.txt', 'w') as d:
        d.writelines(str(elem) + '\n' for elem in guess_list)
def test_resample_stratified():
    """Make sure resample can stratify."""
    rng = np.random.RandomState(0)
    n_samples = 100
    p = .9
    X = rng.normal(size=(n_samples, 1))
    y = rng.binomial(1, p, size=n_samples)

    shared = dict(n_samples=10, random_state=0)

    # Without stratification this particular draw contains only ones.
    unstratified = resample(X, y, stratify=None, **shared)
    assert np.all(unstratified[1] == 1)

    # With stratification the draw must contain nine 1s and one 0.
    stratified = resample(X, y, stratify=y, **shared)
    y_stratified = stratified[1]
    assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9
def eval_prox_random(self, n_sample_node=5, sample_nodes=None):
    """Compare coordinate-based proximity with ground proximity on a node sample.

    Parameters
    ----------
    n_sample_node : int
        Number of nodes to draw with ``resample`` when ``sample_nodes`` is
        empty or not given.
    sample_nodes : sequence, optional
        Either node names (strings) or integer positions into ``cs.nodes()``.
        The type of the first element decides which; any other element type
        leaves the test set empty.

    Returns
    -------
    dict
        ``"nae"``: Series of ``abs(c - g) / g`` for every pair of flattened
        coordinate proximity ``c`` and ground proximity ``g``.
        ``"nae_plot"``: Series of evenly spaced values in [0, 1] indexed by
        ``nae.order()``.
    """
    cs = self.cs
    measurements = {}
    nodes = cs.nodes()

    test_nodes = []
    if sample_nodes is not None and len(sample_nodes):
        first = sample_nodes[0]
        if isinstance(first, str):
            test_nodes = sample_nodes
        elif isinstance(first, (int, np.integer)) and not isinstance(first, bool):
            # NumPy integers are accepted too, so index arrays work.
            test_nodes = [nodes[i] for i in sample_nodes]
    else:
        test_nodes = resample(nodes, n_samples=n_sample_node)

    # nae of coordinate-based proximity vs ground-proximity
    coor_test = self.coor_all[test_nodes]
    ground_prox = (
        cs.proximity_to(sources=test_nodes, dests=cs.nodes()).as_matrix().transpose()
    )  # shape: test_nodes x all_nodes
    coor_prox = np.dot(coor_test.as_matrix().transpose(), self.coor_all.as_matrix())
    nae = pd.Series.combine(
        pd.Series(coor_prox.flatten()),
        pd.Series(ground_prox.flatten()),
        lambda c, g: abs(c - g) / g,
    )
    # NOTE(review): `.order()` / `.as_matrix()` look like legacy pandas
    # spellings -- confirm the pandas version this module targets.
    nae_plot = pd.Series(np.linspace(0.0, 1.0, num=len(nae)), index=nae.order())
    measurements["nae"] = nae
    measurements["nae_plot"] = nae_plot
    return measurements
def bootstrap_auc(df, col, pred_col, n_bootstrap=1000):
    """
    Calculate the bootstrapped AUC for a given col trying to predict a
    pred_col.

    Rows whose ``col`` value is NaN are ignored. The caller's DataFrame is
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        column to retrieve the values from
    pred_col : str
        the column we're trying to predict
    n_bootstrap : int
        the number of bootstrap samples

    Returns
    -------
    numpy.ndarray : AUC for each sampling, length ``n_bootstrap``.
        Iterations for which ``is_single_class`` reports a single-class
        sample are skipped and keep their initial value of 0.
    """
    scores = np.zeros(n_bootstrap)

    old_len = len(df)
    # Work on a filtered copy instead of dropping rows from the caller's frame.
    df = df.dropna(subset=[col])
    new_len = len(df)
    if new_len < old_len:
        logger.info("Dropping NaN values in %s to go from %d to %d rows",
                    col, old_len, new_len)

    values = df[col]
    preds = df[pred_col].astype(int)
    for i in range(n_bootstrap):
        sampled_counts, sampled_pred = resample(values, preds)
        if is_single_class(sampled_pred, col=pred_col):
            continue
        scores[i] = roc_auc_score(sampled_pred, sampled_counts)
    return scores
def boot_estimates(model, X, y, nboot):
    '''
    Evaluate coefficient estimates for nboot bootstrap samples.

    ``model`` is refitted once per resampled ``(X, y)`` pair; ``model.fit``
    must return the fitted estimator exposing ``intercept_`` and ``coef_``.

    Returns an array with one row per bootstrap sample: the intercept
    followed by the flattened coefficients.
    '''
    coefs = []
    for _ in range(nboot):
        iX, iy = resample(X, y)
        # Fit once and read both attributes from the same fit.
        fitted = model.fit(iX, iy)
        coefs.append(np.hstack([fitted.intercept_, fitted.coef_.ravel()]))
    return np.vstack(coefs)
def balanced_resample(data, labels):
    """Resample so every label gets as many rows as the most common label.

    For each distinct label the matching rows are repeated until at least
    ``num_required`` (the count of the modal label) are available, then that
    many rows are drawn with replacement. Returns ``(data, labels)`` cast
    back to the input dtypes, grouped by label.
    """
    most_common, num_required = mstats.mode(labels)
    n_columns = data.shape[1]

    data_resampled = []
    labels_resampled = []
    for current_label in np.unique(labels):
        mask = labels == current_label

        # Grow a pool of this label's rows until it is large enough.
        pool_data = np.empty((0, n_columns))
        pool_labels = np.array([])
        while len(pool_data) < num_required:
            pool_data = np.vstack([pool_data, data[mask]])
            pool_labels = np.hstack([pool_labels, labels[mask]])

        drawn_data, drawn_labels = utils.resample(
            pool_data, pool_labels, n_samples=int(num_required), replace=True
        )
        data_resampled.append(drawn_data)
        labels_resampled.append(drawn_labels)

    stacked_data = np.vstack(data_resampled).astype(data.dtype)
    stacked_labels = np.hstack(labels_resampled).astype(labels.dtype)
    return stacked_data, stacked_labels
def run_method_usage(methods,cases):
    """Plot the bootstrapped occurrence percentage of every method.

    Saves the bar chart to ``figure_path + 'method_occurrence.pdf'`` and
    returns ``(order, mean_percents[order])`` where ``order`` lists method
    positions from most to least used.
    """
    methods = [m[0] for m in methods]

    def _occurrence_percent(sample):
        # Column-wise share of rows, as a percentage.
        return 100*np.sum(sample,axis=0)/len(sample)

    # Bootstrap the percentage error bars:
    percents = np.array([_occurrence_percent(resample(cases)) for _ in range(10000)])
    mean_percents = np.mean(percents,axis=0)
    std_percents = np.std(percents,axis=0)*1.96

    # Descending order of mean usage.
    inds = np.argsort(mean_percents).tolist()[::-1]
    avg_usage = np.mean(mean_percents)

    n_methods = len(methods)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x = np.arange(n_methods)
    ax.plot(x, [avg_usage]*n_methods, '-', color='0.25', lw=1, alpha=0.2)
    ax.bar(x, mean_percents[inds], 0.6, color=paired[0], linewidth=0,
           yerr=std_percents[inds], ecolor=paired[1])
    ax.set_ylabel('Occurrence %',fontsize=30)
    ax.set_xlabel('Method',fontsize=30)
    ax.set_xticks(np.arange(n_methods))
    ax.set_xticklabels(np.array(methods)[inds],fontsize=8)
    fig.autofmt_xdate()
    fix_axes()
    plt.tight_layout()
    fig.savefig(figure_path+'method_occurrence.pdf', bbox_inches=0)
    fig.show()
    return inds, mean_percents[inds]
def fit(self, dataSet):
    """Fit each estimator in ``self.forest`` on its own resampled copy of ``dataSet``.

    Every row is split into its first element (the target) and the remaining
    elements (the training inputs).
    """
    for estimator in self.forest:
        targets = []
        inputs = []
        for row in resample(dataSet):
            targets.append(row[0])
            inputs.append(row[1:])
        estimator.fit(inputs, targets)
def downsample(y, sizes=(30000, 3000)):
    """Draw, for each class, a sample of the positions at which it occurs.

    Parameters
    ----------
    y : array-like supporting element-wise ``==``
        Class label of every sample; class ``i`` is the label equal to ``i``.
    sizes : sequence of int
        ``sizes[i]`` is the number of positions to draw (with replacement)
        for class ``i``.

    Returns
    -------
    tuple
        One entry per class, holding the drawn positions into ``y``.
    """
    res = []
    for class_i, sz in enumerate(sizes):
        # Collect the positions of the matches, not the boolean flags.
        indices = [pos for pos, is_match in enumerate(y == class_i) if is_match]
        res.append(resample(indices, replace=True, n_samples=sz))
    return tuple(res)
def Reduce_scikit_kmeans(img, number_of_colors):
    """Quantise a 3-channel image to ``number_of_colors`` colours with KMeans.

    The model is fitted on a seeded sample of at most 1000 pixels and then
    used to label every pixel. Returns ``(cluster_centers, labels)``; the
    centres are in the 0-1 range because the image is divided by 255 first.
    """
    t0 = time()
    from sklearn.cluster import KMeans

    scaled = np.array(img, dtype=np.float64) / 255
    w, h, d = tuple(scaled.shape)
    assert d == 3
    # One row per pixel, one column per channel.
    image_array = scaled.reshape(w * h, d)
    LOGGER.info("shape=%s", image_array.shape)

    from sklearn.utils import resample
    sample_size = min(image_array.shape[0], 1000)
    image_array_sample = resample(
        image_array, replace=True, n_samples=sample_size, random_state=1
    )

    model = KMeans(
        n_clusters=number_of_colors, random_state=1, precompute_distances=True
    )
    kmeans = model.fit(image_array_sample)
    labels = kmeans.predict(image_array)
    LOGGER.info("ms=%s", ms(t0))
    return kmeans.cluster_centers_, labels
def fit(self, X, Y):
    """Train on the examples ``X`` with targets ``Y``.

    Seeds the set ``self.S`` with up to 20 examples, then makes up to five
    passes of ``process`` / ``reprocess`` over the data and finally keeps
    calling ``reprocess`` until its return value drops below ``self.tau``.

    NOTE(review): the nesting of the ``break`` below was reconstructed from
    flattened source; it is placed at pass level because ``min_delta`` is
    reset once per pass -- confirm against the original file.
    """
    num_examples = len(X)
    data_indices = np.arange(num_examples)
    self.data = X
    Y = np.array(Y, dtype=float)
    # Seeded draw of the initial examples, without replacement.
    sample = resample(data_indices, replace=False, n_samples=min(20, num_examples), random_state=0)
    for i in sample:
        y = Y[i]
        self.S.add(i)
        self.y[i] = y
        self.alpha[i] = 0.0
        self.g[i] = y
    for i in xrange(5):
        # Smallest value returned by reprocess() during this pass.
        min_delta = 999999999
        # The inner loop reuses the name `i`; the outer counter is not read.
        for i in data_indices:
            self.process(i, Y[i])
            delta = self.reprocess()
            min_delta = min(min_delta, delta)
        if min_delta < self.tau:
            break
        # Visit the examples in a new order on the next pass.
        data_indices = shuffle(data_indices)
    # Finishing step: reprocess until the returned delta is below tau.
    while True:
        delta = self.reprocess()
        if delta < self.tau:
            break
def test_mnist(self):
    """Train LASVM to separate digit 0 from the rest and expect > 95% accuracy."""
    def _zero_vs_rest(targets):
        # +1 for the digit 0, -1 for every other digit.
        return [1 if target == 0 else -1 for target in targets]

    mnist = fetch_mldata('MNIST original')

    X, Y = resample(mnist.data, mnist.target, replace=False,
                    n_samples=1000, random_state=0)
    X = X.astype(float)
    Y = _zero_vs_rest(Y)

    svm = LASVM(C=10, tau=0.001)
    svm.fit(X, Y)

    X_test, Y_test = resample(mnist.data, mnist.target, replace=False,
                              n_samples=300, random_state=2)
    X_test = X_test.astype(float)
    Y_test = _zero_vs_rest(Y_test)

    Y_predict = svm.predict(X_test)
    n_correct = np.sum(Y_predict == Y_test)
    percent_correct = n_correct / 300.0
    self.assertGreater(percent_correct, 0.95)
def show_bootstrap_statistics(clf, X, y, features):
    """Print bootstrap statistics of the classifier's normalized coefficients.

    ``clf`` is refitted ``BOOTSTRAP_ITERATIONS`` times on resampled data; for
    every feature a 95% percentile interval, the mean/std ratio and a value
    derived from the t distribution are printed.
    """
    # One list of bootstrap values per feature.
    coefs = [[] for _ in range(len(features))]
    for _ in range(BOOTSTRAP_ITERATIONS):
        X_sample, y_sample = resample(X, y)
        clf.fit(X_sample, y_sample)
        for position, coefficient in enumerate(get_normalized_coefs(clf)):
            coefs[position].append(coefficient)

    # The 'Building' row is reported with the 'POI' values.
    poi_index = features.index('POI')
    building_index = features.index('Building')
    coefs[building_index] = coefs[poi_index]

    row_format = '{:<20}{:<20}{:<10}{:<10}'
    print()
    print('***** Bootstrap statistics *****')
    print(row_format.format('Feature', '95% interval', 't-value', 'Pr(>|t|)'))
    print()
    for feature_name, samples in zip(features, coefs):
        values = np.array(samples)
        lo = np.percentile(values, 2.5)
        hi = np.percentile(values, 97.5)
        interval = '({:.3f}, {:.3f})'.format(lo, hi)
        tv = np.mean(values) / np.std(values)
        pr = (1.0 - t.cdf(x=abs(tv), df=len(values))) * 0.5
        print(row_format.format(feature_name, interval,
                                '{:.3f}'.format(tv), '{:.3f}'.format(pr)))
def test_resample_stratify_2dy():
    """Make sure y can be 2d when stratifying."""
    rng = np.random.RandomState(0)
    n_samples = 100
    features = rng.normal(size=(n_samples, 1))
    targets = rng.randint(0, 2, size=(n_samples, 2))

    resampled = resample(features, targets, n_samples=50,
                         random_state=rng, stratify=targets)
    resampled_targets = resampled[1]
    assert resampled_targets.ndim == 2
def make_pred_prob_plot_data(model, df, column):
    """Bootstrap the mean predicted probability while sweeping one column.

    ``column`` is set to each value of a linspace between its minimum and
    maximum; for each value the second column of ``model.predict_proba`` is
    resampled 1000 times and the mean of every resample is kept.

    Returns ``(grid, means)`` where ``means`` has one row per bootstrap
    iteration and one column per grid value.
    """
    frame = df.copy()
    grid = np.linspace(df[column].min(), df[column].max())

    means_per_value = []
    for value in grid:
        frame[column] = value
        pred_probs = model.predict_proba(frame)[:, 1]
        boot_means = []
        for _ in xrange(1000):
            boot_means.append(resample(pred_probs).mean())
        means_per_value.append(boot_means)
    return grid, np.array(means_per_value).T
def bootstrap_auc(y_c,y_pred,N=100): """Bootstrap the AUC score.""" scores=[] for i in xrange(N): res_y=resample(np.column_stack([y_c,y_pred])) scores.append(roc_auc_score(res_y[:,0],res_y[:,1])) print 'Score is :', '%.4f' % np.mean(scores), print '+-','%.4f' % np.std(scores)
def _balance(self, class0_k, class1_k):
    """Sub-sample both classes by their respective coefficients.

    The collection must be sorted so that items labelled 1 come before items
    labelled 0. Returns ``(collection, labels)`` with the class-1 part first.
    """
    import numpy as np
    from sklearn.utils import resample

    n_positive = sum(1 for flag in self.labels if flag)
    n_negative = len(self.labels) - n_positive

    positives = self.collection[:n_positive]
    negatives = self.collection[n_positive:]

    keep_negative = int(n_negative*class0_k)
    keep_positive = int(n_positive*class1_k)

    negatives = resample(negatives, replace=False,
                         n_samples=keep_negative, random_state=1)
    positives = resample(positives, replace=False,
                         n_samples=keep_positive, random_state=1)

    col = np.concatenate([positives, negatives])
    labels = np.concatenate((np.ones(keep_positive), np.zeros(keep_negative)))
    return col, labels
def resample_split(X, y, state):
    """Split ``X`` / ``y`` into a resampled training part and the left-out rest.

    Parameters
    ----------
    X, y : indexable by a list of positions and by a single position
    state : passed to ``resample`` as ``random_state``

    Returns
    -------
    X_train, y_train : ``X`` and ``y`` indexed by the resampled positions
    X_test, y_test : lists of the rows whose position was never drawn
    test_index : list of those positions, in ascending order
    """
    n_rows = len(X)

    # Train index
    train_index = resample(range(0, n_rows), random_state=state)
    X_train = X[train_index]
    y_train = y[train_index]

    # Test are the rest; a set makes each membership test O(1).
    drawn = set(train_index)
    test_index = [i for i in range(n_rows) if i not in drawn]
    X_test = [X[i] for i in test_index]
    y_test = [y[i] for i in test_index]
    return X_train, y_train, X_test, y_test, test_index
def test_resample_stratify_sparse_error():
    """A sparse ``stratify`` argument must be rejected with a TypeError."""
    rng = np.random.RandomState(0)
    n_samples = 100
    features = rng.normal(size=(n_samples, 2))
    targets = rng.randint(0, 2, size=n_samples)
    sparse_stratify = sp.csr_matrix(targets)

    options = dict(n_samples=50, random_state=rng, stratify=sparse_stratify)
    with pytest.raises(TypeError, match='A sparse matrix was passed'):
        resample(features, targets, **options)
def main(): parser = argparse.ArgumentParser() parser.add_argument('prediction', type=str) parser.add_argument('--test_listfile', type=str, default='../data/length-of-stay/test/listfile.csv') parser.add_argument('--n_iters', type=int, default=1000) parser.add_argument('--save_file', type=str, default='los_results.json') args = parser.parse_args() pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32, 'y_true': np.float32}) test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32, 'y_true': np.float32}) df = test_df.merge(pred_df, left_on=['stay', 'period_length'], right_on=['stay', 'period_length'], how='left', suffixes=['_l', '_r']) assert (df['prediction'].isnull().sum() == 0) assert (df['y_true_l'].equals(df['y_true_r'])) metrics = [('Kappa', 'kappa'), ('MAD', 'mad'), ('MSE', 'mse'), ('MAPE', 'mape')] data = np.zeros((df.shape[0], 2)) data[:, 0] = np.array(df['prediction']) data[:, 1] = np.array(df['y_true_l']) results = dict() results['n_iters'] = args.n_iters ret = print_metrics_regression(data[:, 1], data[:, 0], verbose=0) for (m, k) in metrics: results[m] = dict() results[m]['value'] = ret[k] results[m]['runs'] = [] for i in range(args.n_iters): cur_data = sk_utils.resample(data, n_samples=len(data)) ret = print_metrics_regression(cur_data[:, 1], cur_data[:, 0], verbose=0) for (m, k) in metrics: results[m]['runs'].append(ret[k]) for (m, k) in metrics: runs = results[m]['runs'] results[m]['mean'] = np.mean(runs) results[m]['median'] = np.median(runs) results[m]['std'] = np.std(runs) results[m]['2.5% percentile'] = np.percentile(runs, 2.5) results[m]['97.5% percentile'] = np.percentile(runs, 97.5) del results[m]['runs'] print "Saving the results in {} ...".format(args.save_file) with open(args.save_file, 'w') as f: json.dump(results, f) print results
def run_scikit_digits(epochs, layers, neuron_count):
    """
    Train a network on Scikit-Learn's Handwritten Digits dataset and report
    its results on a testing and a validation slice.

    The resampled data is sliced positionally: the first 1250 rows are used
    for training, the next 260 for testing and the remainder for validation.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop for the whole dataset
    layers : int
        Number of layers (not counting the input layer, but does count
        output layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), does not
        count the bias term
    """
    temp_digits = datasets.load_digits()
    # Same random_state for both calls keeps rows and answers aligned.
    digits = utils.resample(temp_digits.data, random_state=3)
    temp_answers = utils.resample(temp_digits.target, random_state=3)

    num_of_training_vectors = 1250
    test_stop = num_of_training_vectors + 260

    answers = temp_answers[:num_of_training_vectors]
    answers_to_test = temp_answers[num_of_training_vectors:test_stop]
    validation_answers = temp_answers[test_stop:]

    training_set = digits[:num_of_training_vectors]
    testing_set = digits[num_of_training_vectors:test_stop]
    validation_set = digits[test_stop:]

    network = Network(layers, neuron_count, training_set[0])
    network.train(training_set, answers, epochs)

    for unseen, expected in ((testing_set, answers_to_test),
                             (validation_set, validation_answers)):
        guesses = network.run_unseen(unseen)
        network.report_results(guesses, expected)
def test_resample_stratified_replace():
    """Make sure stratified resampling supports the replace parameter."""
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=n_samples)

    def _draw(replace, n_drawn):
        # Stratified draw sharing the test's generator; only X is returned.
        drawn_X, _ = resample(X, y, replace=replace, n_samples=n_drawn,
                              random_state=rng, stratify=y)
        return drawn_X

    X_replace = _draw(True, 50)
    X_no_replace = _draw(False, 50)
    assert np.unique(X_replace).shape[0] < 50
    assert np.unique(X_no_replace).shape[0] == 50

    # make sure n_samples can be greater than X.shape[0] if we sample with
    # replacement
    X_replace = _draw(True, 1000)
    assert X_replace.shape[0] == 1000
    assert np.unique(X_replace).shape[0] == 100
def bootstrap(arr,n_boots):
    '''
    Draw ``n_boots`` resamples of ``arr``.

    variables:
        arr = the data that we are random sampling from
        n_boots = the number of bootstraps we want to make
    returns:
        list with one resampled copy of ``arr`` per bootstrap
    '''
    samples = []
    for _ in xrange(n_boots):
        samples.append(resample(arr))
    return samples
def _contin_value_plot():
    """Plot bootstrapped mean predictions while sweeping a continuous column.

    Reads its inputs (``df``, ``dfc``, ``column``, ``model``, ``ax1``,
    ``cmap``, ``num_linspace``, ``classification``, ``class_pred``, ``freq``,
    ``xlim``) from the enclosing scope. Draws the mean prediction line, a
    10th-90th percentile band and, when ``freq`` is set, a histogram of the
    column on a twin axis.
    """
    # We need to set up an array of x_i values to search over
    if num_linspace:
        # Create an interval over +-1 std of the mean of the x column,
        # clipped to the observed range
        mean, std = df[column].mean(), df[column].std()
        lower = np.max([mean - (std), df[column].min()])
        upper = np.min([mean + (std), df[column].max()])
        x_i = np.linspace(lower, upper, num=num_linspace)
    else:
        # If num_linspace=None, make x_i the unique values
        x_i = np.unique(df[column])

    # The probability column of the requested class does not change between
    # iterations, so look it up once.
    class_ind = list(model.classes_).index(class_pred) if classification else None

    # For each value in our search space, set the entire column in question
    # to that value and predict; bootstrap the mean of those predictions.
    preds = []
    for val in x_i:
        print(val)
        dfc[column] = val
        if classification:
            pred = model.predict_proba(dfc)[:, class_ind]
        else:
            pred = model.predict(dfc)
        preds.append([boot_sample.mean() for boot_sample in
                      (resample(pred) for _ in xrange(1000))])

    probs = np.array(preds)
    prob_means = probs.mean(axis=1)
    lower_bounds = np.percentile(probs, q=10, axis=1)
    upper_bounds = np.percentile(probs, q=90, axis=1)

    # Create the fill to indicate the confidence bounds
    ax1.fill_between(x_i, lower_bounds, upper_bounds, facecolor=cmap[0], alpha=0.25)
    # Plot the predictions
    ax1.plot(x_i, prob_means, c=cmap[1], linewidth=2)
    ax1.tick_params(axis="x", labelsize=14)
    ax1.tick_params(axis="y", labelsize=13)

    if freq:
        ax2 = ax1.twinx()
        if num_linspace:
            ax2.hist(
                df.loc[(df[column] >= mean - std) & (df[column] <= mean + std), column].values,
                facecolor=cmap[1],
                alpha=0.4,
            )
        else:
            ax2.hist(df[column].values, facecolor=cmap[1], alpha=0.4)
        ax2.set_ylabel("Frequency")

    # Set xlims to mirror the min and max datapoint. Identity check: `==`
    # would compare element-wise if xlim were an array.
    if xlim is None:
        ax1.set_xlim([x_i.min(), x_i.max()])
    else:
        ax1.set_xlim(xlim)
def _discrete_value_plot(scatter_num):
    """Draw one box per discrete value of the column, from bootstrapped means.

    Reads ``df``, ``dfc``, ``column``, ``model``, ``classification``,
    ``ax1``, ``cmap`` and ``scatter`` from the enclosing scope. When
    ``scatter`` is set, jittered points are overlaid; ``scatter_num`` is
    their total count and defaults to 200 per label when falsy.
    """
    # Create an array of the unique discrete bins
    labels = np.unique(dfc[column])

    # One column vector of 1000 bootstrapped means per label
    preds = []
    for label in labels:
        # Set all of that column to that particular label
        dfc[column] = label
        # Make predictions using inputed model
        if classification:
            pred = model.predict_proba(dfc)[:, 1]
        else:
            pred = model.predict(dfc)
        boot_means = []
        for _ in xrange(1000):
            boot_means.append(resample(pred).mean())
        preds.append(np.array(boot_means).reshape(-1, 1))

    # Create the boxplots for each label and alter colors
    bp = plt.boxplot(preds, sym="", whis=[5, 95], labels=labels)
    for part in ("boxes", "whiskers", "caps"):
        plt.setp(bp[part], color=cmap[0])

    # Fill the boxes with color
    for idx in xrange(len(labels)):
        outline = bp["boxes"][idx].get_xydata()
        ax1.add_patch(Polygon(outline, facecolor=cmap[0], alpha=0.7))

    # Set the xtick labels
    xtickNames = plt.setp(ax1, xticklabels=labels)
    plt.setp(xtickNames, rotation=-45, fontsize=14)

    # Superimpose jittered scatter plot if scatter is set to True
    if scatter:
        # If the number of points to plot was not set with scatter_num
        # Set the number to 200 * the number of discrete bins
        if not scatter_num:
            scatter_num = 200 * len(labels)
        # Make the number of points per bin proportional to that label's
        # representation in the original dataset
        num_per_label = [int((df[column] == label).mean() * scatter_num)
                         for label in labels]
        for idx, num in enumerate(num_per_label):
            y_data = np.random.choice(preds[idx].flatten(), size=num)
            whisker_x = bp["whiskers"][idx * 2].get_xdata()[0]
            x_data = [whisker_x] * num
            jittered_x = _rand_jitter(x_data, bp["boxes"][idx])
            ax1.scatter(jittered_x, y_data, c=cmap[1], alpha=0.6)
def balanced_index(targets):
    """Return sorted positions forming a class-balanced subset of ``targets``.

    ``targets`` must contain only 0 and 1. All positions of the smaller
    class are kept and an equally sized, seeded sample (without replacement)
    is drawn from the other class. On a tie class 1 is kept whole.
    """
    positions = {0: [], 1: []}
    for position, cls in enumerate(targets):
        positions[cls].append(position)

    if len(positions[0]) < len(positions[1]):
        minority, majority = 0, 1
    else:
        minority, majority = 1, 0

    kept_majority = resample(positions[majority],
                             n_samples=len(positions[minority]),
                             replace=False, random_state=5)
    index_ = np.concatenate((positions[minority], kept_majority))
    index_.sort()
    return index_
from sklearn.utils import resample from sklearn.preprocessing import MinMaxScaler, StandardScaler from sklearn import svm test_day = ['2020-01-19', '2020-02-01', '2020-02-02', '2020-02-08'] # test_day = ['2020-02-09', '2020-02-15', '2020-02-16', '2020-02-22'] training_x, training_y = DataGenerator.get_data(test_day, is_training=True) # Up-sample training_df = pd.concat([training_x, training_y], axis=1) minor_df = training_df[training_df.win == 1] major_df = training_df[training_df.win == 0] minor_df_upsample = resample(minor_df, replace=True, n_samples=len(major_df), random_state=1) new_training_df = pd.concat([major_df, minor_df_upsample], axis=0) training_y = new_training_df['win'] training_x = new_training_df.drop(['win'], axis=1) scaler = MinMaxScaler() training_x = scaler.fit_transform(training_x, training_y) model = svm.SVC(C=1, kernel='linear', random_state=0) feature_selection = SFS(model, forward=False, cv=10,
# Split `covtype` by row label: labels 0..15119 form the training part and
# labels from 15120 onwards the test part (`.loc` slices include both ends).
train = covtype.loc[0:15119, :]
test = covtype.loc[15120:, :]
# In[5]:
# Features - target
# Columns labelled 0..53 are the features, column 54 is the target.
X_train = train.loc[:, 0:53]
y_train = train.loc[:, 54]
X_test = test.loc[:, 0:53]
y_test = test.loc[:, 54]
# In[3]:
# Train set sampling (2000 samples)
# Seeded draw of 2000 training rows (resample's default arguments otherwise).
train_sample = resample(train, n_samples=2000, random_state=0)
X_train_sample = train_sample.loc[:, 0:53]
y_train_sample = train_sample.loc[:, 54]
# In[6]:
# Scale features data in [0,1] range
# NOTE(review): `fit_transform` is called on each array separately, so --
# assuming `minmax_scaler` is a scikit-learn MinMaxScaler -- the scaler is
# refitted every time and the test set is scaled with its own min/max rather
# than the training set's. Confirm this is intended.
X_train_sample_minmax = minmax_scaler.fit_transform(X_train_sample)
X_train_minmax = minmax_scaler.fit_transform(X_train)
X_test_minmax = minmax_scaler.fit_transform(X_test)
# In[7]:
def resampleProcedure(data):
    """Return ``resample(data)`` called with its default arguments."""
    return resample(data)
def roc_calculate(Ytrue, Yscore, bootnum=1000, metric=None, val=None):
    """Calculates required metrics for the roc plot function (fpr, tpr, and tpr_ci).

    Parameters
    ----------
    Ytrue : array-like, shape = [n_samples]
        Binary label for samples (0s and 1s)

    Yscore : array-like, shape = [n_samples]
        Predicted y score for samples

    bootnum : int
        Number of bootstrap resamples used for the confidence intervals.

    metric, val : optional
        Forwarded to ``get_spec_sens_cuttoff`` to pick an operating point.
        When ``metric`` is None no statistics are computed.

    Returns
    ----------------------------------
    fpr : array-like
        False positive rates, with a leading 0 prepended.

    tpr : array-like
        True positive rates, with a leading 0 prepended.

    tpr_ci : array-like, shape = [2, len(tpr)]
        True positive rates 95% confidence intervals: row 0 is the lower
        bound, row 1 the upper bound.

    stats, bootci_stats : dict
        Only returned when ``metric`` is not None: the statistics at the
        selected operating point and, per statistic, its bootstrapped
        ``[lowci, uppci]``.
    """
    # Get fpr, tpr
    fpr, tpr, threshold = metrics.roc_curve(Ytrue, Yscore, pos_label=1, drop_intermediate=False)

    # fpr, tpr with drop_intermediates for fpr = 0 (useful for plot... since
    # we plot specificity on x-axis, we don't need intermediates when fpr=0)
    tpr0 = tpr[fpr == 0][-1]
    tpr = np.concatenate([[tpr0], tpr[fpr > 0]])
    fpr = np.concatenate([[0], fpr[fpr > 0]])

    # if metric is provided, calculate stats
    if metric is not None:
        specificity, sensitivity, threshold = get_spec_sens_cuttoff(Ytrue, Yscore, metric, val)
        stats = get_stats(Ytrue, Yscore, specificity)
        stats["val_specificity"] = specificity
        stats["val_sensitivity"] = sensitivity
        stats["val_cutoffscore"] = threshold

    # bootstrap using vertical averaging
    tpr_boot = []
    boot_stats = []
    for _ in range(bootnum):
        # Resample and get tpr, fpr
        Ytrue_res, Yscore_res = resample(Ytrue, Yscore)
        fpr_res, tpr_res, threshold_res = metrics.roc_curve(
            Ytrue_res, Yscore_res, pos_label=1, drop_intermediate=False)

        # Drop intermediates when fpr=0
        tpr0_res = tpr_res[fpr_res == 0][-1]
        tpr_res = np.concatenate([[tpr0_res], tpr_res[fpr_res > 0]])
        fpr_res = np.concatenate([[0], fpr_res[fpr_res > 0]])

        # Vertical averaging... use closest fpr_res to fpr, and append the
        # corresponding tpr
        idx = [np.abs(fpr_value - fpr_res).argmin() for fpr_value in fpr]
        tpr_boot.append(tpr_res[idx])

        # if metric is provided, calculate stats
        if metric is not None:
            boot_stats.append(get_stats(Ytrue_res, Yscore_res, specificity))

    # Get CI for bootstat
    if metric is not None:
        bootci_stats = {}
        for key in boot_stats[0].keys():
            stats_i = np.array([boot[key] for boot in boot_stats])
            stats_i = stats_i[~np.isnan(stats_i)]  # Remove nans
            try:
                lowci = np.percentile(stats_i, 2.5)
                uppci = np.percentile(stats_i, 97.5)
            except IndexError:
                # Nothing left after removing nans.
                lowci = np.nan
                uppci = np.nan
            bootci_stats[key] = [lowci, uppci]

    # Get CI for tpr
    tpr_lowci = np.percentile(tpr_boot, 2.5, axis=0)
    tpr_uppci = np.percentile(tpr_boot, 97.5, axis=0)

    # Add the starting 0
    tpr = np.insert(tpr, 0, 0)
    fpr = np.insert(fpr, 0, 0)
    tpr_lowci = np.insert(tpr_lowci, 0, 0)
    tpr_uppci = np.insert(tpr_uppci, 0, 0)

    # Concatenate tpr_ci
    tpr_ci = np.array([tpr_lowci, tpr_uppci])

    if metric is None:
        return fpr, tpr, tpr_ci
    else:
        return fpr, tpr, tpr_ci, stats, bootci_stats
def __init__(self, config):
    """
    The constructor of the DataGenerator class. It loads the training
    labels and the image filenames, splits them into a training and a
    validation set, optionally appends already-augmented images to the
    training set and optionally bootstraps the training set.

    The data directory is read from the DATA_PATH environment variable;
    the process exits with status 1 when it is not set.

    Parameters
    ----------
    config: object
        configuration with the necessary information for the dataloader.
        ``batch_size`` is required; ``augment``, ``val_split``,
        ``random_state`` and ``bootstrap_size`` are optional attributes.
    """
    cwd = os.getenv("DATA_PATH")
    if cwd is None:
        print("Set your DATA_PATH env first")
        sys.exit(1)
    self.config = config
    # Default to "no augmentation" when the config does not define the flag
    if not hasattr(self.config, 'augment'):
        self.config.augment = False

    # Read csv file
    tmp = pd.read_csv(os.path.abspath(os.path.join(cwd, 'train.csv')),
                      delimiter=',', engine='python')
    # A vector of images id.
    image_ids = tmp["Id"]
    # Folder with all images (incl. augmented ones if they exist)
    data_train_folder = os.path.join(cwd, 'train')
    print(data_train_folder)
    self.n = len(image_ids)
    # For each id sublist of the 4 filenames [batch_size, 4]
    self.filenames = np.asarray([[
        os.path.join(cwd, 'train', image_id + '_' + c + '.png')
        for c in ['red', 'green', 'yellow', 'blue']
    ] for image_id in image_ids])
    # Labels
    self.labels = tmp["Target"].values
    # To one-hot representation of labels
    # e.g. before ['22 0' '12 23 0']
    # after split [['22', '0'], ['12', '23', '0']]
    # after binarize it is one hot representation
    binarizer = MultiLabelBinarizer(classes=np.arange(28))
    self.labels = [[int(c) for c in l.split(' ')] for l in self.labels]
    self.labels = binarizer.fit_transform(self.labels)

    # Build a validation set. Resolve val_split up front so that an
    # AttributeError raised inside train_test_split is not mistaken for a
    # missing config entry.
    if hasattr(self.config, 'val_split'):
        val_split = self.config.val_split
    else:
        print('WARN: val_split not set - using 0.1')
        val_split = 0.1
    self.train_filenames, self.val_filenames,\
        self.train_labels, self.val_labels = train_test_split(
            self.filenames, self.labels,
            test_size=val_split, random_state=42)

    print("Shape of training data: {}".format(self.train_filenames.shape))
    print("Shape of training labels: {}".format(self.train_labels.shape))

    # Augment training data if specified in config file (and if possible)
    if self.config.augment:
        print("Getting augmented dataset...")
        filter_list = ['yellow', 'red', 'blue', 'green']
        aug_train_list = []
        aug_train_labels = []
        for i in range(0, self.train_filenames.shape[0]):
            # Image id = file name up to the first underscore.
            # os.path.basename instead of splitting on '/', because the
            # paths are built with os.path.join (platform separator).
            filename = os.path.basename(
                self.train_filenames[i][0]).rsplit('_')[0]
            print("Augmenting {}".format(filename))
            counter = 1
            # Augmented files are numbered _rot1, _rot2, ...; stop at the
            # first number for which no file exists.
            while True:
                test_f = os.path.join(
                    data_train_folder,
                    filename + '_rot{}'.format(counter) + '_' +
                    filter_list[0] + '.png')
                if not os.path.isfile(test_f):
                    break
                temp_rot = [
                    os.path.join(
                        data_train_folder,
                        filename + '_rot{}'.format(counter) + '_' + f + '.png')
                    for f in filter_list
                ]
                temp_rev = [
                    os.path.join(
                        data_train_folder,
                        filename + '_rev{}'.format(counter) + '_' + f + '.png')
                    for f in filter_list
                ]
                flag = True
                if SKIP_CHECK is False:
                    try:
                        for fname in temp_rev + temp_rot:
                            with open(fname, 'rb') as f:
                                # Check header of file
                                flag = flag and (f.read(4) == b'\x89PNG')
                    except IOError as e:
                        print(e)
                        flag = False
                if flag is True:
                    aug_train_list.append(temp_rot)
                    aug_train_labels.append(self.train_labels[i])
                    aug_train_list.append(temp_rev)
                    aug_train_labels.append(self.train_labels[i])
                else:
                    print("corrupted images found")
                    print(temp_rot)
                    print(temp_rev)
                counter += 1
        if aug_train_list:
            # Append list of all aug filenames to training set
            self.train_filenames = np.vstack(
                (self.train_filenames, np.asarray(aug_train_list)))
            self.train_labels = np.vstack(
                (self.train_labels, np.asarray(aug_train_labels)))
            # Append list of all aug filenames to 'all' set
            self.filenames = np.vstack(
                (self.filenames, np.asarray(aug_train_list)))
            self.labels = np.vstack(
                (self.labels, np.asarray(aug_train_labels)))
        else:
            # aug_train_list is empty (no aug data available)
            print('No augmented data found. Please augment first')

    # New label frequency
    print("New label distribution: {}".format(
        self.train_labels.sum(axis=0)))

    self.n_train = len(self.train_labels)
    self.n_val = len(self.val_labels)
    self.n = len(self.labels)

    random_state = getattr(config, 'random_state', 42)
    np.random.seed(random_state)

    # Optional bootstrap: draw bootstrap_size * n_train training samples
    # with replacement.
    if hasattr(config, 'bootstrap_size'):
        n_samples = int(config.bootstrap_size * self.n_train)
        new_indices = resample(np.arange(self.n_train),
                               n_samples=n_samples,
                               random_state=random_state)
        self.train_filenames = self.train_filenames[new_indices]
        self.train_labels = self.train_labels[new_indices]
        self.n_train = len(self.train_labels)

    print('Size of training set is {}'.format(self.n_train))
    print('Size of validation set is {}'.format(self.n_val))

    # Compute class weights (n_train / per-class positive count).
    # NOTE(review): a class without any positive training sample yields
    # an infinite weight here - confirm this is acceptable downstream.
    self.class_weights = (self.n_train) * np.reshape(
        1 / np.sum(self.train_labels, axis=0), (1, -1))
    # Number batches per epoch
    self.train_batches_per_epoch = int(
        (self.n_train - 1) / self.config.batch_size) + 1
    self.val_batches_per_epoch = int(
        (self.n_val - 1) / self.config.batch_size) + 1
    self.all_batches_per_epoch = int(
        (self.n - 1) / self.config.batch_size) + 1
# Move every rare attack category out of `df` and collect it in `df2`.
tmp = pd.DataFrame()
df2 = pd.DataFrame()
few = [
    'spy.', 'perl.', 'phf.', 'multihop.', 'ftp_write.', 'loadmodule.',
    'rootkit.', 'imap.', 'warezmaster.', 'land.', 'buffer_overflow.',
    'guess_passwd.', 'pod.',
]
for fff in few:
    tmp = df.loc[df['41'] == fff]
    df2 = pd.concat([df2, tmp])
    # Drop by index label, exactly the rows that were just copied
    df.drop(tmp.index, inplace=True)

########### SMOTE Smaller categories ###############
print("\t> Synthetically generating new smaller categories...")
# C point - size of the 'normal.' sample that is paired with each small
# category before SMOTE is applied
td = resample(df.loc[df['41'] == 'normal.'],
              replace=False, n_samples=450, random_state=1)

# Column labels of df2 become strings ('0' ... '41')
df2.rename(columns={i: str(i) for i in range(42)}, inplace=True)

few = [
    'multihop.', 'ftp_write.', 'loadmodule.', 'rootkit.', 'imap.',
    'warezmaster.', 'land.', 'buffer_overflow.', 'guess_passwd.', 'pod.',
]
# The list holds the positions of the categorical feature columns
smotenc = SMOTENC([1, 2, 3, 6, 11, 20, 21], random_state=1)
for smaller in few:
    tt = df2.loc[df2['41'] == smaller]
    df_tmp = pd.concat([tt, td])
    # Features = all but the last column, target = last column as (n, 1)
    X_tmp = df_tmp.iloc[:, :-1]
    Y_tmp = np.array(df_tmp.iloc[:, -1])
    Y_tmp = Y_tmp.reshape(len(Y_tmp), 1)
# 70/30 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)
print(len(X_train))
print(len(X_test))

#df=df.iloc[row_selected,:]
# Rebuild one training frame holding the features and the TARGET column
X_train = pd.DataFrame(X_train, columns=X.columns)
y_train = pd.DataFrame(y_train, columns=['TARGET'])
train = pd.concat((X_train, y_train), axis=1)

# Separate majority (TARGET == 0) and minority (TARGET == 1) classes
is_minority = train['TARGET'] == 1
df_majority = train[train['TARGET'] == 0]
df_minority = train[is_minority]
print(len(df_majority))
print(len(df_minority))

# Downsample majority class: sample without replacement down to the size
# of the minority class, with a fixed seed for reproducible results
df_majority_underampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=123,
)

# Combine minority class with downsampled majority class
train = pd.concat([df_majority_underampled, df_minority])
y_train_new = train['TARGET']
X_train_new = train.drop(columns=['TARGET'])

# Display new class counts
#train['TARGET'].value_counts()
unique, counts = np.unique(y_train_new, return_counts=True)
y_pos = np.arange(len(unique))
plt.bar(y_pos, counts, align='center', alpha=0.5)
test_size=0.2, random_state=42)  # tail of a call whose beginning is outside this chunk

# # PROCESSING TRAIN DATA(UPSAMPLING)

# In[15]:

# Class balance of the training frame before resampling
X_train.y.value_counts()

# In[16]:

# Upsampling of the dataset: the rows with y == 1 are sampled with
# replacement and then stacked below the untouched y == -1 rows.
from sklearn.utils import resample
major = X_train[X_train['y'] == -1]
minor = X_train[X_train['y'] == 1]
# NOTE(review): 1516 is presumably the number of y == -1 rows shown by the
# value_counts() above - confirm, or derive it from len(major).
upsampled = resample(minor, replace=True, n_samples=1516, random_state=123)
newupsampled = pd.concat([major, upsampled])

# In[17]:

# Class balance after upsampling
newupsampled.y.value_counts()

# In[18]:

# Split the balanced frame back into target and features
ynew_train = newupsampled['y']
X = newupsampled.drop('y', axis=1)

# In[19]:

X.shape
def _kde_density(x, y, max_samples, seed):
    """Return the normalised 2-D Gaussian-KDE density at every (x, y) point.

    The KDE is fitted on at most ``max_samples`` points drawn without
    replacement (seeded by ``seed``) and evaluated on all points.
    """
    xS, yS = resample(x, y, replace=False,
                      n_samples=np.amin([max_samples, len(x)]),
                      random_state=seed)
    k = gaussian_kde(np.vstack([xS, yS]))
    cell_dens = k(np.vstack([x.flatten(), y.flatten()]))
    return cell_dens / np.sum(cell_dens)


def _save_low_density_figure(dens, threshold, save_path):
    """Save a histogram of log-densities and a heatmap of low-density rows.

    ``dens`` holds one row per cell and one column per repeat; a row is
    shown in the heatmap when any of its repeats is below ``threshold``.
    """
    # A boolean row mask keeps the selection 2-D even if only one row matches
    low = dens[np.any(dens < threshold, axis=1)]
    fig, axs = plt.subplots(1, 2, figsize=(16, 9))
    xr = np.log(dens.flatten())
    xr = np.delete(xr, np.argwhere(np.isinf(xr)))
    axs[0].hist(xr, bins=100)
    axs[0].set_title("Histogram of cell probabilities (log scale)")
    axs[0].set_yscale("log")
    if low.size:
        im = axs[1].imshow(np.log(low), aspect="auto")
        plt.colorbar(im)
    axs[1].set_title("Heatmap with low probability cells (log scale)")
    plt.savefig(save_path, format="png", dpi=150)
    # Close the figure(s) so repeated calls do not accumulate open figures
    plt.close("all")


def outliers_removal(df, output_dir, log, detect_based_on_structure_features):
    """Flag outlier cells based on kernel density estimates.

    Cells are flagged in three steps:

    1. cells without a "Structure volume" value,
    2. cells whose median density over ``Rounds`` KDE repeats is below
       ``cell_dens_th_CN`` for any pair of cell/nuclear metrics,
    3. (only if ``detect_based_on_structure_features``) cells whose median
       density is below ``cell_dens_th_S`` for any selected metric vs
       structure volume, evaluated per structure.

    Diagnostic plots are written to ``output_dir``.

    Parameters
    ----------
    df : passed to ``initial_parsing`` to build the cell table
    output_dir : directory the plots are saved in
    log : logger; only ``log.info`` is used
    detect_based_on_structure_features : bool
        Run step 3. May be switched off for small datasets that do not
        have enough cells per structure.

    Returns
    -------
    cells_ao : DataFrame indexed by CellId with the columns
        "structure_name" and "Outlier" ("No" or the reason for removal).
    """
    # Load dataset
    cells = initial_parsing(df=df)

    # %% Threshold for determing outliers
    cell_dens_th_CN = 1e-20  # for cell-nucleus metrics across all cells
    cell_dens_th_S = 1e-10  # for structure volume metrics

    # Remove outliers
    # %% Remove cells that lack a Structure Volume value
    cells_ao = cells[["CellId", "structure_name"]].copy()
    cells_ao["Outlier"] = "No"
    CellIds_remove = cells.loc[cells["Structure volume"].isnull(), "CellId"].values
    cells_ao.loc[cells_ao["CellId"].isin(CellIds_remove),
                 "Outlier"] = "yes_missing_structure_volume"
    cells = cells.drop(cells[cells["CellId"].isin(CellIds_remove)].index)
    # reset_index returns a new frame; the result used to be discarded
    cells = cells.reset_index(drop=True)
    log.info(
        f"Removing {len(CellIds_remove)} cells that lack a Structure Volume measurement value"
    )
    log.info(f"Shape of remaining dataframe: {cells.shape}")

    # %% Feature set for cell and nuclear features
    cellnuc_metrics = [
        "Cell surface area",
        "Cell volume",
        "Cell height",
        "Nuclear surface area",
        "Nuclear volume",
        "Nucleus height",
        "Cytoplasmic volume",
    ]
    cellnuc_abbs = [
        "Cell area",
        "Cell vol",
        "Cell height",
        "Nuc area",
        "Nuc vol",
        "Nuc height",
        "Cyto vol",
    ]

    # %% All metrics including height: every (f1, f2) with f2 > f1, in
    # row-major order. (np.int was removed from NumPy, so the index pairs
    # are built directly as integers.)
    L = len(cellnuc_metrics)
    pairs = np.column_stack(np.triu_indices(L, k=1)).astype(int)

    # %% The typical six scatter plots
    xvec = [1, 1, 6, 1, 4, 6]
    yvec = [4, 6, 4, 0, 3, 3]
    pairs2 = np.stack((xvec, yvec)).T

    # %% Parameters
    N = 10000  # max number of points the KDE is fitted on
    fac = 1000  # scaling applied to the metrics before the KDE
    Rounds = 5  # number of KDE repeats (different resampling seeds)

    # %% For all pairs compute densities
    remove_cells = cells["CellId"].to_frame().copy()
    for xy_pair in pairs:
        metricX = cellnuc_metrics[xy_pair[0]]
        metricY = cellnuc_metrics[xy_pair[1]]
        log.info(f"{metricX} vs {metricY}")

        # data
        x = cells[metricX].to_numpy() / fac
        y = cells[metricY].to_numpy() / fac

        # density estimate, repeat because of probabilistic nature of
        # density estimate used here
        for r in np.arange(Rounds):
            log.info(f"Round {r + 1} of {Rounds}")
            # One density per cell, in row order of `cells`
            remove_cells[f"{metricX} vs {metricY}_{r}"] = _kde_density(
                x, y, N, int(r))

    # %% Summarize across repeats
    remove_cells_summary = cells["CellId"].to_frame().copy()
    for xy_pair in pairs:
        metricX = cellnuc_metrics[xy_pair[0]]
        metricY = cellnuc_metrics[xy_pair[1]]
        log.info(f"{metricX} vs {metricY}")
        filter_col = [
            col for col in remove_cells
            if col.startswith(f"{metricX} vs {metricY}")
        ]
        x = remove_cells[filter_col].to_numpy()
        _save_low_density_figure(
            x, cell_dens_th_CN,
            f"{output_dir}/{metricX}_vs_{metricY}_cellswithlowprobs.png")
        remove_cells_summary[f"{metricX} vs {metricY}"] = np.median(x, axis=1)

    # %% Identify cells to be removed (row positions in `cells`)
    CellIds_remove_dict = {}
    CellIds_remove = np.empty(0, dtype=int)
    for xy_pair in pairs:
        metricX = cellnuc_metrics[xy_pair[0]]
        metricY = cellnuc_metrics[xy_pair[1]]
        CellIds_remove_dict[f"{metricX} vs {metricY}"] = np.argwhere(
            remove_cells_summary[f"{metricX} vs {metricY}"].to_numpy()
            < cell_dens_th_CN)
        CellIds_remove = np.union1d(
            CellIds_remove, CellIds_remove_dict[f"{metricX} vs {metricY}"])
        log.info(len(CellIds_remove))

    # %% Plot and remove outliers
    plotname = "CellNucleus"
    for pair_set, tag in ((pairs2, "6"), (pairs, "21")):
        for suffix, width, marked in (
            ("org_fine", 0.5, None),
            ("org_thick", 2, None),
            ("outliers", 2, CellIds_remove_dict),
        ):
            oplot(
                cellnuc_metrics,
                cellnuc_abbs,
                pair_set,
                cells,
                True,
                output_dir,
                f"{plotname}_{tag}_{suffix}",
                width,
                [] if marked is None else marked,
            )
    log.info(cells.shape)
    # Translate row positions into CellIds. No .squeeze(): with exactly one
    # outlier it would return a scalar that has no .to_numpy().
    CellIds_remove = cells.loc[cells.index[CellIds_remove], "CellId"].to_numpy()
    cells_ao.loc[cells_ao["CellId"].isin(CellIds_remove),
                 "Outlier"] = "yes_abnormal_cell_or_nuclear_metric"
    cells = cells.drop(cells.index[cells["CellId"].isin(CellIds_remove)])
    log.info(
        f"Removing {len(CellIds_remove)} cells due to abnormal cell or nuclear metric"
    )
    log.info(cells.shape)
    for pair_set, tag in ((pairs2, "6"), (pairs, "21")):
        for suffix, width in (("clean_thick", 2), ("clean_fine", 0.5)):
            oplot(
                cellnuc_metrics,
                cellnuc_abbs,
                pair_set,
                cells,
                True,
                output_dir,
                f"{plotname}_{tag}_{suffix}",
                width,
                [],
            )

    # %% Feature sets for structures
    selected_metrics = [
        "Cell volume",
        "Cell surface area",
        "Nuclear volume",
        "Nuclear surface area",
    ]
    selected_metrics_abb = ["Cell Vol", "Cell Area", "Nuc Vol", "Nuc Area"]
    selected_structures = [
        "LMNB1",
        "ST6GAL1",
        "TOMM20",
        "SEC61B",
        "ATP2A2",
        "LAMP1",
        "RAB5A",
        "SLC25A17",
        "TUBA1B",
        "TJP1",
        "NUP153",
        "FBL",
        "NPM1",
        "SON",
    ]
    structure_metric = "Structure volume"
    # The structure plots are split over two figures
    structure_parts = ((1, selected_structures[0:7]),
                       (2, selected_structures[7:14]))

    # %% Parameters
    N = 1000
    fac = 1000
    Rounds = 5

    if detect_based_on_structure_features:
        # We may want to skip this part when running the test dataset
        # or any small dataset that does not have enough cells per
        # structure.

        # %% For all pairs compute densities
        remove_cells = cells["CellId"].to_frame().copy()
        for metric in selected_metrics:
            for ys, struct in enumerate(selected_structures):
                # data (selected as Series, so a single matching cell
                # still yields a 1-D array)
                in_struct = cells["structure_name"] == struct
                x = cells.loc[in_struct, metric].to_numpy() / fac
                y = cells.loc[in_struct, structure_metric].to_numpy() / fac

                # density estimate, repeat because of probabilistic nature
                # of density estimate used here
                for r in np.arange(Rounds):
                    if ys == 0:
                        # Create the column once per metric and repeat
                        remove_cells[
                            f"{metric} vs {structure_metric}_{r}"] = np.nan
                    remove_cells.loc[
                        in_struct,
                        f"{metric} vs {structure_metric}_{r}",
                    ] = _kde_density(x, y, N, int(r))

        # remove_cells = pd.read_csv(data_root_extra / 'structures.csv')

        # %% Summarize across repeats
        remove_cells_summary = cells["CellId"].to_frame().copy()
        for metric in selected_metrics:
            log.info(metric)
            filter_col = [
                col for col in remove_cells
                if col.startswith(f"{metric} vs {structure_metric}")
            ]
            x = remove_cells[filter_col].to_numpy()
            _save_low_density_figure(
                x, cell_dens_th_S,
                f"{output_dir}/{metric}_vs_{structure_metric}_cellswithlowprobs.png")
            remove_cells_summary[f"{metric} vs {structure_metric}"] = np.median(
                x, axis=1)

        # %% Identify cells to be removed (row positions in `cells`)
        CellIds_remove_dict = {}
        CellIds_remove = np.empty(0, dtype=int)
        for metric in selected_metrics:
            log.info(metric)
            CellIds_remove_dict[f"{metric} vs {structure_metric}"] = np.argwhere(
                remove_cells_summary[f"{metric} vs {structure_metric}"].to_numpy()
                < cell_dens_th_S)
            CellIds_remove = np.union1d(
                CellIds_remove,
                CellIds_remove_dict[f"{metric} vs {structure_metric}"])
            log.info(len(CellIds_remove))

        # %% Plot and remove outliers
        plotname = "Structures"
        for suffix, width, marked in (
            ("org_fine", 0.5, None),
            ("org_thick", 2, None),
            ("outliers", 2, CellIds_remove_dict),
        ):
            for part, structures in structure_parts:
                splot(
                    selected_metrics,
                    selected_metrics_abb,
                    structures,
                    structure_metric,
                    cells,
                    True,
                    output_dir,
                    f"{plotname}_{part}_{suffix}",
                    width,
                    [] if marked is None else marked,
                )
        log.info(cells.shape)
        CellIds_remove = cells.loc[cells.index[CellIds_remove],
                                   "CellId"].to_numpy()
        cells_ao.loc[cells_ao["CellId"].isin(CellIds_remove),
                     "Outlier"] = "yes_abnormal_structure_volume_metrics"
        cells = cells.drop(cells.index[cells["CellId"].isin(CellIds_remove)])
        log.info(
            f"Removing {len(CellIds_remove)} cells due to structure volume metrics"
        )
        log.info(cells.shape)
        for suffix, width in (("clean_fine", 0.5), ("clean_thick", 2)):
            for part, structures in structure_parts:
                splot(
                    selected_metrics,
                    selected_metrics_abb,
                    structures,
                    structure_metric,
                    cells,
                    True,
                    output_dir,
                    f"{plotname}_{part}_{suffix}",
                    width,
                    [],
                )

    # %% Final diagnostic plot: reload all cells and mark everything that
    # was flagged in cells_ao
    cells = initial_parsing(df=df)
    CellIds_remove_dict = {}
    for xy_pair in pairs:
        metricX = cellnuc_metrics[xy_pair[0]]
        metricY = cellnuc_metrics[xy_pair[1]]
        CellIds_remove_dict[f"{metricX} vs {metricY}"] = np.argwhere(
            (cells_ao["Outlier"] == "yes_abnormal_cell_or_nuclear_metric"
             ).to_numpy())
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs2,
        cells,
        True,
        output_dir,
        "Check_cellnucleus",
        2,
        CellIds_remove_dict,
    )

    CellIds_remove_dict = {}
    for metric in selected_metrics:
        CellIds_remove_dict[f"{metric} vs {structure_metric}"] = np.argwhere(
            ((cells_ao["Outlier"] == "yes_abnormal_structure_volume_metrics")
             | (cells_ao["Outlier"] == "yes_abnormal_cell_or_nuclear_metric")
             ).to_numpy())
    for part, structures in structure_parts:
        splot(
            selected_metrics,
            selected_metrics_abb,
            structures,
            structure_metric,
            cells,
            True,
            output_dir,
            f"Check_structures_{part}",
            2,
            CellIds_remove_dict,
        )

    cells_ao = cells_ao.set_index("CellId", drop=True)

    return cells_ao
# Number of examples and mini-batch bookkeeping
m = X_.shape[0]
batch_size = 11
steps_per_epoch = m // batch_size

graph = topological_sort(feed_dict)
trainables = [W1, b1, W2, b2]

print("Total number of examples = {}".format(m))

# Step 4: repeat steps 1-3 for every epoch
for i in range(epochs):
    loss = 0
    for j in range(steps_per_epoch):
        # Step 1: randomly sample a batch of examples and
        # reset the values of the X and y inputs
        X_batch, y_batch = resample(X_, y_, n_samples=batch_size)
        X.value, y.value = X_batch, y_batch

        # Step 2: forward and backward pass through the graph
        forward_and_backward(graph)

        # Step 3: gradient step on the trainable nodes
        sgd_update(trainables)

        # The last node of the sorted graph holds the batch loss
        loss += graph[-1].value

    # Average loss over the batches of this epoch
    print("Epoch: {}, Loss: {:.3f}".format(i + 1, loss / steps_per_epoch))
print(f'% insufficient data for H{horizon} T{change}') continue # not enough data if verbose: print( f'%%% Selecting features for H{horizon} T{change} with {n} data points' ) for replica in range(replicas): # if technically splits could be made parts = [] goal = n // present for label in [0, 1, 2]: # resample to balance the classes if label in counters: matches = data[data.label == label] lm = len(matches) if lm >= MINIMUM: # disregard the nearly-absent class instead of gross oversampling part = resample(matches, replace=lm < goal, n_samples=goal) parts.append(part) training, testing = train_test_split(pd.concat(parts), test_size=0.3) expected = [l for l in testing[labels]] # a simple list trainData = training[features].to_numpy() start = time() preproc = FRFS() # very slow, perform on a subset sample = resample(training, replace=False, n_samples=ss) selected = preproc.process(sample[features].to_numpy(), \ np.reshape(sample[labels].to_numpy(), (ss, 1))) fstimes.append(1000 * (time() - start)) # ms for pos in range(len(selected)): # update the usage counters if selected[pos]: i = features[pos]
# Small demo of sklearn's resample: draw 7 rows (with replacement) from a
# 5-row feature matrix and its labels, keeping X and y aligned.
row = np.array([0, 3, 1, 0])
X = np.array([[1., 0.], [2., 1.], [0., 0.], [3., 5.], [0, 0]])
y = np.array([0, 1, 2, 3, 4])
X, y = resample(X, y, n_samples=7)
# Single-argument print(...) produces the same output on Python 2 and 3,
# whereas the former `print X` statement is a SyntaxError on Python 3.
print(X)
print(y)
'threshold': pd.Series(threshold, index=i) })  # tail of a function whose beginning is outside this chunk
    # Row of `roc` whose `tf` value is closest to 0
    # NOTE(review): DataFrame.ix was removed in pandas 1.0 - this line needs
    # .iloc/.loc on current pandas; confirm which pandas version is targeted.
    roc_t = roc.ix[(roc.tf - 0).abs().argsort()[:1]]
    return list(roc_t['threshold'])


# Load dataset
print('===> loading dataset')
data = pd.read_csv('~/repos/dataset/HR.csv')
# The target column 'left' is renamed to 'class'
dataset = data.rename(columns={'left': 'class'})

# Unbalanced dataset
# Upsampling: class 1 rows drawn with replacement
# NOTE(review): minority_upsampled is not used below - the final dataset is
# built from the original `minority` rows; confirm this is intended.
minority = dataset[dataset['class'] == 1]
minority_upsampled = resample(minority, replace=True, n_samples=11428, random_state=123)
# Downsampling: class 0 rows drawn without replacement
majority = dataset[dataset['class'] == 0]
majority_downsampled = resample(majority, replace=False, n_samples=3571, random_state=123)
# Original minority rows + downsampled majority rows
dataset = pd.concat([minority, majority_downsampled])

# Transform features: one-hot encode the two categorical columns
dataset = pd.get_dummies(dataset, columns=['sales', 'salary'])

# Selection features
'''
data=lung_cancer[lung_cancer['YEAR_DX'].between(2004,2010)].drop(columns=drop_cols+['survival_classes'])
target=lung_cancer[lung_cancer['YEAR_DX'].between(2004,2010)]['survival_classes']
data=pd.get_dummies(data,prefix=catg_cols,columns=catg_cols,drop_first=False)
class_weights = dict(enumerate(class_weight.compute_class_weight('balanced', pd.np.unique(target),target)))
'''
# Cases diagnosed 2004-2010, without the columns listed in drop_cols
data = lung_cancer[lung_cancer['YEAR_DX'].between(2004, 2010)]
data = data.drop(columns=drop_cols)

# One frame per survival class
low_survival = data[data['survival_classes'] == '<=6months']
mid_survival = data[data['survival_classes'] == '0.5-2yrs']
high_survival = data[data['survival_classes'] == '>2yrs']

# Resample the mid and high classes (with replacement) to the size of the
# low class so that all three classes have the same number of rows
mid_survival = resample(
    mid_survival,
    replace=True,
    n_samples=len(low_survival),
    random_state=21,
)
high_survival = resample(
    high_survival,
    replace=True,
    n_samples=len(low_survival),
    random_state=21,
)
data_upsampled = pd.concat(
    [low_survival, mid_survival, high_survival], axis=0)

# Split into target and one-hot encoded features
target = data_upsampled['survival_classes']
data = data_upsampled.drop(columns=['survival_classes'])
data = pd.get_dummies(data, prefix=catg_cols, columns=catg_cols,
                      drop_first=False)
def Bootstrap(x1, x2, y, N_boot=500, method='ols', degrees=5, random_state=42):
    """
    Computes bias^2, variance and the mean squared error using bootstrap
    resampling method for the provided data and the method.

    The data is split once into train (80%) and test (20%); every bootstrap
    iteration resamples the train part, refits the model and predicts on
    the same test part.

    Arguments:
        x1: 1D numpy array, covariate
        x2: 1D numpy array, covariate
        y: 1D numpy array, response
        N_boot: integer type, the number of bootstrap samples
        method: string type, accepts 'ols', 'ridge' or 'lasso' as arguments
        degrees: integer type, polynomial degree for generating the design
            matrix
        random_state: integer, ensures the same split when using the
            train_test_split functionality

    Returns:
        Bias_vec, Var_vec, MSE_vec, betaVariance_vec: one-element lists
        holding bias^2, variance, MSE and the per-coefficient variance of
        beta for the predicted model.

    Raises:
        ValueError: if method is not one of 'ols', 'ridge' or 'lasso'.
    """
    # Fail early with a clear message; an unknown method used to surface
    # as an unbound `beta` inside the loop.
    if method not in ('ols', 'ridge', 'lasso'):
        raise ValueError(
            "method must be 'ols', 'ridge' or 'lasso', got {!r}".format(method))
    # ols takes no penalty, ridge and lasso use a fixed lambda
    fit_kwargs = {} if method == 'ols' else {'lambda_': 0.05}

    ##split x1, x2 and y arrays as a train and test data and generate design matrix
    x1_train, x1_test, x2_train, x2_test, y_train, y_test = train_test_split(
        x1, x2, y, test_size=0.2, random_state=random_state)
    # The unscaled test design matrix does not depend on the bootstrap draw
    X_test_raw = designMatrix(x1_test, x2_test, degrees)
    y_pred_test = np.zeros((y_test.shape[0], N_boot))
    betaMatrix = np.zeros((X_test_raw.shape[1], N_boot))

    ##resample and fit the corresponding method on the train data
    for i in range(N_boot):
        x1_, x2_, y_ = resample(x1_train, x2_train, y_train)
        X_train = designMatrix(x1_, x2_, degrees)
        # Scale with the statistics of this bootstrap sample, then restore
        # the intercept column that scaling zeroes out
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_train[:, 0] = 1
        X_test = scaler.transform(X_test_raw)
        X_test[:, 0] = 1

        manual_regression = linregOwn(method=method)
        beta = manual_regression.fit(X_train, y_, **fit_kwargs)

        ##predict on the same test data
        y_pred_test[:, i] = np.dot(X_test, beta)
        betaMatrix[:, i] = beta

    # Column vector so that it broadcasts against (n_test, N_boot)
    y_test = y_test.reshape(len(y_test), 1)

    MSE = np.mean(np.mean((y_test - y_pred_test)**2, axis=1, keepdims=True))
    bias = np.mean((y_test - np.mean(y_pred_test, axis=1, keepdims=True))**2)
    variance = np.mean(np.var(y_pred_test, axis=1, keepdims=True))
    betaVariance = np.var(betaMatrix, axis=1)
    print("-------------------------------------------------------------")
    print("Degree: %d" % degrees)
    print('MSE:', np.round(MSE, 3))
    print('Bias^2:', np.round(bias, 3))
    print('Var:', np.round(variance, 3))
    print('{} >= {} + {} = {}'.format(MSE, bias, variance, bias + variance))
    print("-------------------------------------------------------------")

    Bias_vec = [bias]
    Var_vec = [variance]
    MSE_vec = [MSE]
    betaVariance_vec = [betaVariance]
    return Bias_vec, Var_vec, MSE_vec, betaVariance_vec
np.exp(0.11018577)

predictions = logreg.predict(X_test_sc)

# Making the Confusion Matrix and Accuracy_score¶
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, predictions)
cm

# Same matrix with readable row and column labels
cm = pd.DataFrame(
    cm,
    columns=['Predicted Negative', 'Predicted Positive'],
    index=['Actual Negative', 'Actual Positive'],
)
cm

accuracy_score(y_test, predictions)

# Class balance of the raw and of the reduced frame
df['class'].value_counts()
df_v1['class'].value_counts()
df_v1.shape
df_v1['class'].value_counts()

# Split by class, resample the 'ckd' rows with replacement and stack the
# untouched 'notckd' rows below them
is_ckd = df_v1['class'] == 'ckd'
df_v1_maj = df_v1[is_ckd]
df_v1_min = df_v1[df_v1['class'] == 'notckd']
df_upsample = resample(
    df_v1_maj,
    replace=True,
    n_samples=4850,
    random_state=42,
)
df_upsample = pd.concat([df_upsample, df_v1_min])
df_upsample['class'].value_counts()

# Refit the pipeline on the resampled data
y = df_upsample['class']
X = df_upsample[v1_features]
X_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_poly, y, random_state=42)
# The scaler is fitted on the training part only
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)
logreg.fit(X_train_sc, y_train)

# Earlier score was 0.96666666666
logreg.score(X_train_sc, y_train)

# Earlier score was 0.96
logreg.score(X_test_sc, y_test)

predictions = logreg.predict(X_test_sc)
len(df_no_missing)


# **29,932** samples is a relatively large number for a **Support Vector Machine**, so let's downsample. To make sure we get **1,000** of each category, we start by splitting the data into two **dataframes**, one for people that did not default and one for people that did.

# In[ ]:


defaulted = df_no_missing['DEFAULT'] == 1
df_no_default = df_no_missing[df_no_missing['DEFAULT'] == 0]
df_default = df_no_missing[defaulted]


# Now downsample the dataset that did not default...

# In[ ]:


df_no_default_downsampled = resample(
    df_no_default,
    replace=False,
    n_samples=1000,
    random_state=42,
)
len(df_no_default_downsampled)


# Now downsample the dataset that defaulted...

# In[ ]:


df_default_downsampled = resample(
    df_default,
    replace=False,
    n_samples=1000,
    random_state=42,
)
len(df_default_downsampled)


# Now let's merge the two downsampled datasets into a single **dataframe** and print out the total number of samples to make sure everything is hunky dory.
""" A note on bootstrapping estimates: To generate the bootstrap estimates we modify the sess_estimates.py script only slightly to include an additional loop in the run_estimates(.) function, then gather these additional results. """ from sklearn.utils import resample #[...] for bootstrapi in range(num_bootstraps): X_index = range(X.shape[0]) resamp = resample(X_index, random_state=9889) ycurr = y[resamp] Xcurr = X[resamp] ycurr.index = range(len(ycurr)) skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=1234) #[...]
sample_variance = np.var(samples)
print("sample mean = {} and sample variance = {}".format(
    sample_mean, sample_variance))

# part (c)
from sklearn.utils import resample

N_resample = 1000  # number of resample
std = []
mean = []

# Draw N_resample bootstrap samples (same size as the data, with
# replacement) and record the mean and std of each one
for i in range(N_resample):
    re_samples = resample(samples, n_samples=len(samples), replace=True)
    mean.append(np.mean(re_samples))
    std.append(np.std(re_samples))

# sort mean and std
mean = np.sort(mean)
std = np.sort(std)

# compute 95% confidence interval from the bootstrap distribution of the
# mean, taking the nearest observed value at each percentile
a = 95
per_1_mean = np.percentile(mean, (100-a)/2, interpolation='nearest')
per_2_mean = np.percentile(mean, (100+a)/2, interpolation='nearest')

print('The ', str((100-a)/2), '% percentile mean is: ', str(per_1_mean))
print('The ', str((100+a)/2), '% percentile mean is: ', str(per_2_mean))
def create_upsampled_test_data(dataset, labels, num_of_classes):
    """Upsample every class to the size of the largest class.

    Labels are 1-based (label ``k`` belongs to class index ``k - 1``). The
    rows of each smaller class are resampled up to the size of the largest
    class, all classes are stacked and shuffled, and the result is written
    to ``training_set.csv`` (features) and ``training_labels.csv`` (labels).

    Returns the upsampled feature array and the label matrix produced by
    ``create_label_matrix`` for the upsampled labels.
    """
    n_rows = len(dataset)

    # Number of samples per class
    class_counts = [0] * num_of_classes
    for lab in labels:
        class_counts[lab - 1] += 1

    # Features with the label appended as the last column
    label_column = numpy.reshape(labels, (n_rows, 1))
    labelled = numpy.concatenate((dataset, label_column), axis=1)
    row_width = len(labelled[0])

    target_count = max(class_counts)
    largest_class = numpy.argmax(class_counts)

    # upsample
    for cls in range(num_of_classes):
        # Collect the rows of this class
        members = numpy.zeros((class_counts[cls], row_width))
        filled = 0
        for sample, lab in zip(labelled, labels):
            if lab - 1 == cls:
                members[filled] = sample
                filled += 1

        # The largest class is kept as is, every other class is resampled
        if cls == largest_class:
            block = copy.deepcopy(members)
        else:
            block = resample(copy.deepcopy(members), n_samples=target_count)

        if cls == 0:
            resampled_arr = copy.deepcopy(block)
        else:
            resampled_arr = numpy.concatenate((resampled_arr, block), axis=0)

    resampled_arr = shuffle(resampled_arr)

    # Split the label column off again
    last_col = len(resampled_arr[0]) - 1
    n_out = len(resampled_arr)
    labels_upsampled = numpy.array(resampled_arr[:, [last_col]])
    resampled_arr = numpy.delete(resampled_arr, [last_col], axis=1)

    # One comma-separated line of features per sample
    with open("training_set.csv", 'w') as file:
        for sample in resampled_arr:
            file.write(",".join(str(value) for value in sample))
            file.write("\n")

    # One label per line
    with open("training_labels.csv", 'w') as file:
        for row in range(n_out):
            file.write(str(labels_upsampled[row][0]))
            file.write("\n")

    labels_upsampled = numpy.reshape(labels_upsampled, len(labels_upsampled))
    labels_matrix_upsampled = create_label_matrix(labels_upsampled)

    return resampled_arr, labels_matrix_upsampled
def _half_max_temperature(C_v_values, T_list, max_index, step, mid_val, C_v_unit):
    """
    Scan away from the heat capacity peak until C_v drops to the half-maximum
    value, then linearly interpolate the temperature of the crossing.

    :param C_v_values: heat capacity values (with units) for one bootstrap trial
    :param T_list: temperatures (with units) matching C_v_values
    :param max_index: index of the heat capacity peak
    :type max_index: int
    :param step: -1 to scan towards lower temperatures, +1 towards higher ones
    :type step: int
    :param mid_val: heat capacity at half-maximum, as a float in C_v_unit
    :type mid_val: float
    :param C_v_unit: unit used to strip the units of C_v_values

    :returns: interpolated temperature (with units), or None if the scanned side never reaches mid_val
    """
    index = max_index + step
    while 0 <= index < len(T_list):
        curr_val = C_v_values[index].value_in_unit(C_v_unit)
        if curr_val <= mid_val:
            # The crossing lies between T[index] and its neighbor on the peak side
            neighbor = index - step
            prev_val = C_v_values[neighbor].value_in_unit(C_v_unit)
            return T_list[index] + (mid_val - curr_val) * (
                T_list[neighbor] - T_list[index]) / (prev_val - curr_val)
        index += step
    return None


def _bootstrap_confidence_interval(values, unit, conf_percent):
    """
    Compute the (lower, upper) uncertainty of bootstrap values about their mean.

    :param values: bootstrap results, with the trials along axis 0
    :type values: np.array(float)
    :param unit: unit attached to both bounds
    :param conf_percent: 'sigma' for +/- one standard deviation, otherwise a confidence level in percent
    :type conf_percent: str or float

    :returns: tuple (lower, upper) of offsets from the mean, with units
    """
    if conf_percent == 'sigma':
        # Use analytical standard deviation instead of percentile method:
        spread = np.std(values, axis=0)
        return (-spread * unit, spread * unit)

    p_lo = (100 - conf_percent) / 2
    p_hi = 100 - p_lo
    deviations = values - np.mean(values, axis=0)
    # Linear interpolation is np.percentile's default, so the deprecated
    # 'interpolation' keyword is not passed.
    return (np.percentile(deviations, p_lo, axis=0) * unit,
            np.percentile(deviations, p_hi, axis=0) * unit)


def bootstrap_heat_capacity(frame_begin=0, sample_spacing=1, frame_end=-1, plot_file='heat_capacity_boot.pdf', output_data="output/output.nc", num_intermediate_states=0, frac_dT=0.05, conf_percent='sigma', n_trial_boot=200):
    """
    Calculate and plot the heat capacity curve, with uncertainty determined using bootstrapping.
    Uncorrelated datasets are selected using a random starting frame, repeated n_trial_boot times.
    Uncertainty in melting point and full-width half maximum of the C_v curve are also returned.

    :param frame_begin: index of first frame defining the range of samples to use as a production period (default=0)
    :type frame_begin: int

    :param sample_spacing: spacing of uncorrelated data points, for example determined from pymbar timeseries subsampleCorrelatedData (default=1)
    :type sample_spacing: int

    :param frame_end: index of last frame to include in heat capacity calculation; -1 includes every frame up to the end (default=-1)
    :type frame_end: int

    :param plot_file: path of the heat capacity plot, or None to skip plotting (default='heat_capacity_boot.pdf')
    :type plot_file: str

    :param output_data: Path to the output data for a NetCDF-formatted file containing replica exchange simulation data (default = "output/output.nc")
    :type output_data: str

    :param num_intermediate_states: The number of states to insert between existing states in 'temperature_list' (default=0)
    :type num_intermediate_states: int

    :param frac_dT: The fraction difference between temperatures points used to calculate finite difference derivatives (default=0.05)
    :type frac_dT: float

    :param conf_percent: 'sigma' to report +/- one standard deviation, or a confidence level in percent for percentile intervals (default='sigma')
    :type conf_percent: str or float

    :param n_trial_boot: number of trials to run for generating bootstrapping uncertainties
    :type n_trial_boot: int

    :returns:
       - T_list ( List( float * unit.simtk.temperature ) ) - The temperature list corresponding to the heat capacity values in 'C_v'
       - C_v_values ( List( float * kJ/mol/K ) ) - The heat capacity mean values for all (including inserted intermediates) states
       - C_v_uncertainty ( Tuple ( np.array(float) * kJ/mol/K ) ) - confidence interval for all C_v_values computed from bootstrapping
       - Tm_value ( float * unit.simtk.temperature ) - Melting point mean value computed from bootstrapping
       - Tm_uncertainty ( Tuple ( float * unit.simtk.temperature ) ) - confidence interval for melting point computed from bootstrapping
       - Cv_height_value ( float * kJ/mol/K ) - C_v peak height (relative to the lowest C_v value) mean value computed from bootstrapping
       - Cv_height_uncertainty ( Tuple ( float * kJ/mol/K ) ) - confidence interval for the C_v peak height computed from bootstrapping
       - FWHM_value ( float * unit.simtk.temperature ) - C_v full width half maximum mean value computed from bootstrapping
       - FWHM_uncertainty ( Tuple ( float * unit.simtk.temperature ) ) - confidence interval for C_v full width half maximum computed from bootstrapping
    """

    # extract reduced energies and the state indices from the .nc
    reporter = MultiStateReporter(output_data, open_mode="r")
    analyzer = ReplicaExchangeAnalyzer(reporter)
    replica_energies_all, _, _, _ = analyzer.read_energies()

    # frame_end is the last frame to include, so the slice stop is one past it.
    # The default of -1 means "through the final frame".
    frame_stop = None if frame_end == -1 else frame_end + 1

    # Store data for each sampling trial:
    C_v_values_boot = {}
    C_v_uncertainty_boot = {}

    Tm_boot = np.zeros(n_trial_boot)
    Cv_height = np.zeros(n_trial_boot)
    FWHM = np.zeros(n_trial_boot)

    for i_boot in range(n_trial_boot):
        # Select production frames to analyze
        # Here we can potentially change the reference frame for each bootstrap trial.
        ref_shift = np.random.randint(sample_spacing)
        # ***We should check if these energies arrays will be the same size for
        # different reference frames
        replica_energies = replica_energies_all[
            :, :, (frame_begin + ref_shift):frame_stop:sample_spacing]

        # Draw frame indices with replacement; n_samples matches the size of
        # the sliced replica energy dataset
        n_frames = replica_energies.shape[2]
        sample_indices = resample(
            np.arange(n_frames), replace=True, n_samples=n_frames)

        # replica_energies is [n_states x n_states x n_frame]
        # Select the sampled frames in a single indexing operation:
        replica_energies_resample = replica_energies[:, :, sample_indices]

        # Run heat capacity expectation calculation:
        C_v_values_boot[i_boot], C_v_uncertainty_boot[i_boot], T_list = get_heat_capacity(
            output_data=output_data,
            num_intermediate_states=num_intermediate_states,
            frac_dT=frac_dT,
            plot_file=None,
            bootstrap_energies=replica_energies_resample,
        )

        if i_boot == 0:
            # Get units:
            C_v_unit = C_v_values_boot[0][0].unit
            T_unit = T_list[0].unit

        C_v_boot = C_v_values_boot[i_boot]

        # Compute the melting point:
        max_index = np.argmax(C_v_boot)
        Tm_boot[i_boot] = T_list[max_index].value_in_unit(T_unit)

        # Compute the peak height, relative to lowest C_v value in the temp range:
        Cv_height[i_boot] = (
            np.max(C_v_boot) - np.min(C_v_boot)).value_in_unit(C_v_unit)

        # Compute the FWHM:
        # C_v value at half-maximum:
        mid_val = np.min(C_v_boot).value_in_unit(C_v_unit) + Cv_height[i_boot] / 2

        #***Note: this assumes that there is only a single heat capacity peak, with
        # monotonic behavior on each side of the peak.
        T_half_lo = _half_max_temperature(
            C_v_boot, T_list, max_index, -1, mid_val, C_v_unit)
        T_half_hi = _half_max_temperature(
            C_v_boot, T_list, max_index, +1, mid_val, C_v_unit)

        if T_half_lo is not None and T_half_hi is not None:
            FWHM[i_boot] = (T_half_hi - T_half_lo).value_in_unit(T_unit)
        elif T_half_lo is not None:
            # Only the lower half was found: mirror it about the peak
            FWHM[i_boot] = 2 * (Tm_boot[i_boot] - T_half_lo.value_in_unit(T_unit))
        elif T_half_hi is not None:
            # Only the upper half was found: mirror it about the peak
            FWHM[i_boot] = 2 * (T_half_hi.value_in_unit(T_unit) - Tm_boot[i_boot])
        # If neither half-maximum point is found, FWHM[i_boot] keeps its
        # initial value of 0 and still enters the statistics below.

    # Compute uncertainty at all temps in T_list over the n_trial_boot trials performed:

    # Convert dicts to array
    arr_C_v_values_boot = np.zeros((n_trial_boot, len(T_list)))
    for i_boot in range(n_trial_boot):
        arr_C_v_values_boot[i_boot, :] = C_v_values_boot[i_boot].value_in_unit(C_v_unit)

    # Compute mean values:
    C_v_values = np.mean(arr_C_v_values_boot, axis=0) * C_v_unit
    Cv_height_value = np.mean(Cv_height) * C_v_unit
    Tm_value = np.mean(Tm_boot) * T_unit
    FWHM_value = np.mean(FWHM) * T_unit

    # Compute confidence intervals:
    C_v_uncertainty = _bootstrap_confidence_interval(
        arr_C_v_values_boot, C_v_unit, conf_percent)
    Cv_height_uncertainty = _bootstrap_confidence_interval(
        Cv_height, C_v_unit, conf_percent)
    Tm_uncertainty = _bootstrap_confidence_interval(
        Tm_boot, T_unit, conf_percent)
    FWHM_uncertainty = _bootstrap_confidence_interval(
        FWHM, T_unit, conf_percent)

    # Plot and return the heat capacity (with units)
    if plot_file is not None:
        plot_heat_capacity(C_v_values, C_v_uncertainty, T_list, file_name=plot_file)

    return T_list, C_v_values, C_v_uncertainty, Tm_value, Tm_uncertainty, Cv_height_value, Cv_height_uncertainty, FWHM_value, FWHM_uncertainty
def _collect_templates(subject_ids, subject_id_to_patient_info, condition,
                       desc):
    """Build one (template, label) pair per subject for a single condition.

    @param subject_ids are the subjects to build examples for, in order.
    @param subject_id_to_patient_info maps a subject id to its patient record.
    @param condition is the condition code used to derive the label.
    @param desc is the condition description placed in the template.
    @return parallel lists of templates and boolean labels.
    """
    templates, labels = [], []
    for subject_id in subject_ids:
        patient_info = subject_id_to_patient_info[subject_id]
        templates.append(
            generate_name_condition_template(patient_info.FIRST_NAME,
                                             patient_info.LAST_NAME,
                                             patient_info.GENDER, desc))
        labels.append(condition in patient_info.CONDITIONS)
    return templates, labels


def train_and_evaluate(
    model: BertForSequenceClassification,
    tokenizer: BertTokenizer,
    condition_type: str,
    sampling_bin: int,
    n: int,
    metrics_output_path: str,
):
    """Train and evaluate the model on N conditions.

    One classifier is trained per sampled condition; the AUC and precision@10
    of every classifier are summarised in `results.txt`.

    @param model is the model to encode CLS tokens with.
    @param tokenizer is a BERT tokenizer.
    @param condition_type are we using the icd/medcat extracted conditions?
    @param sampling_bin is which frequency bin to sample conditions from.
    @param n is the number of conditions to sample from the bin.
    @param metrics_output_path is the directory `results.txt` is written to.
    @return nothing; the metrics are only written to disk.
    """
    ### Get Relevant Data
    subject_id_to_patient_info = get_subject_id_to_patient_info(
        condition_type=condition_type)
    condition_code_to_count = get_condition_code_to_count(
        condition_type=condition_type)
    condition_code_to_description = get_condition_code_to_descriptions(
        condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(condition_code_to_count,
                                                min_count=0,
                                                max_count=500000)
    binned_conditions = get_frequency_bins(condition_code_to_count,
                                           condition_type)

    subject_ids = sorted(subject_id_to_patient_info.keys())
    train_subject_ids, test_subject_ids = train_test_split(subject_ids,
                                                           train_size=0.5,
                                                           random_state=2021,
                                                           shuffle=True)

    ### Keep only conditions with at least one positive example in both the
    ### train and the test half; otherwise a classifier cannot be trained or
    ### roc_auc_score cannot be computed.
    train_set_conditions = get_non_zero_count_conditions(
        set_to_use, train_subject_ids, subject_id_to_patient_info)
    test_set_conditions = get_non_zero_count_conditions(
        set_to_use, test_subject_ids, subject_id_to_patient_info)
    binned_conditions = [
        sorted(set(bin_) & train_set_conditions & test_set_conditions)
        for bin_ in binned_conditions
    ]

    ### Sample conditions from the selected bin
    np.random.seed(2021)
    sampled_conditions = np.random.choice(binned_conditions[sampling_bin],
                                          size=n,
                                          replace=False)

    ## Train a Classifier for Each Condition
    auc_score_list, precision_at_10_list = [], []
    for condition in tqdm(sampled_conditions):
        desc = condition_code_to_description[condition]

        train_templates, train_labels = _collect_templates(
            train_subject_ids, subject_id_to_patient_info, condition, desc)

        ## Upsample the positive examples to the number of negatives
        negative_indices, positive_indices = [], []
        for position, label in enumerate(train_labels):
            if label == 0:
                negative_indices.append(position)
            if label == 1:
                positive_indices.append(position)
        positive_indices = resample(positive_indices,
                                    replace=True,
                                    n_samples=len(negative_indices),
                                    random_state=2021)
        total_indices = negative_indices + positive_indices

        ### Divide Train Set into Train and Validation Set
        # Nothing here guarantees that the validation part contains a
        # positive label, nor that a lone positive stays in the train part.
        training_indices, validation_indices = train_test_split(
            total_indices, train_size=0.85, random_state=2021, shuffle=True)

        validation_templates = [train_templates[i] for i in validation_indices]
        validation_labels = [train_labels[i] for i in validation_indices]

        np.random.seed(2021)
        np.random.shuffle(training_indices)
        train_templates = [train_templates[i] for i in training_indices]
        train_labels = [train_labels[i] for i in training_indices]

        ### Train the BERT Model
        train_dataset = get_as_dataset(tokenizer, train_templates,
                                       train_labels)
        validation_dataset = get_as_dataset(tokenizer, validation_templates,
                                            validation_labels)
        clf = train_model(model, train_dataset, validation_dataset)

        ### Get Test Templates and Predictions
        test_templates, test_labels = _collect_templates(
            test_subject_ids, subject_id_to_patient_info, condition, desc)
        test_dataset = get_as_dataset(tokenizer, test_templates, test_labels)
        # Column 1 of the prediction array is used as the score.
        test_predictions = clf.predict(test_dataset).predictions[:, 1]

        ### Calculate Metrics
        auc_score_list.append(roc_auc_score(test_labels, test_predictions))
        precision_at_10_list.append(
            precision_at_k(test_labels, test_predictions, k=10))

    from experiments.MLM.common import mean_std_as_string

    with open(f"{metrics_output_path}/results.txt", "w") as results_file:
        results_file.write(mean_std_as_string("Model AUC", auc_score_list))
        results_file.write(
            mean_std_as_string("Model P@K", precision_at_10_list))
inplace=True)  # closes a call whose beginning is outside this excerpt

# Checking for missing values
print(df.isnull().values.any())

# Checking for imbalances in cases for each class
print(df.team_placement.value_counts())

# Resampling imbalanced data
# Rows with team_placement == -10 are treated as the majority class and all
# other rows as the minority class.
# NOTE(review): the mapping further down renames -1 and 1 only, not -10 --
# confirm which value actually encodes the majority class.
df_majority = df[df.team_placement == -10]
df_minority = df[df.team_placement != -10]

# Downsample majority class
df_majority_downsampled = resample(
    df_majority,
    replace=False,  # sample without replacement
    n_samples=135167,  # to match minority class
    # NOTE(review): hard-coded; presumably equal to len(df_minority) -- verify
    random_state=42)  # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Randomly marking 70% rows for training
# Each row is flagged independently with probability 0.70, so the training
# share is only approximately 70%; the draw is not seeded here.
df_downsampled['is_train'] = np.random.uniform(0, 1, len(df_downsampled)) <= .70

# Setting team_placement as categorical
change = {"team_placement": {-1: "!top 10", 1: "top 10"}}
df_downsampled.replace(change, inplace=True)
df_downsampled["team_placement"] = df_downsampled["team_placement"].astype(
    'category')
# Hold out 20% of the data for testing; the split is not seeded.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

# Re-attach the target column so whole rows can be resampled together.
upsample_input = pd.concat([X_train, y_train], axis=1)

# One frame per Takeover_Quality level: 1 = worst, 2 = bad, 3 = good.
quality = upsample_input.Takeover_Quality
Worst_qual, Bad_qual, Good_qual = (
    upsample_input[quality == level] for level in (1, 2, 3))

#-----------------------------------------------------
# Resample the "worst" and "bad" rows with replacement up to the size of the
# "good" class, each with the same fixed seed for reproducible results.
target_size = len(Good_qual)
Worst_qual_upsampled, Bad_qual_upsampled = (
    resample(frame, replace=True, n_samples=target_size, random_state=27)
    for frame in (Worst_qual, Bad_qual))

# Combine the untouched "good" rows with the two resampled classes. The name
# `downsampled` is historical; the frame holds the rebalanced training data.
downsampled = pd.concat(
    [Bad_qual_upsampled, Good_qual, Worst_qual_upsampled])

# check new class counts
print(downsampled.Takeover_Quality.value_counts())  #63079
x_treino, x_teste, y_treino, y_teste = train_test_split(x, y, test_size=0.3, random_state=378) # concatenate our training data back together X = pd.concat([x_treino, y_treino], axis=1) # separate minority and majority classes not_ordem = X[X['LocalMax'] == 0].copy() ordem = X[X['LocalMax'] == 1].copy() # upsample minority ordem_upsampled = resample( ordem, replace=True, # sample with replacement n_samples=len(not_ordem), # match number in majority class random_state=378) # reproducible results # combine majority and upsampled minority upsampled = pd.concat([not_ordem, ordem_upsampled]) x_treino = upsampled[[c for c in df_cluster_dia.columns if c not in cols_rem]] y_treino = upsampled['LocalMax'] display(y_treino.value_counts()) #xgb.fit(x_treino, y_treino, eval_set = [(x_treino, y_treino), (x_teste, y_teste)], eval_metric=f1_score) param = { 'max_depth': 10, 'eta': 2,
stratify=y_train_temp_less)  # training split = 80%, validation split = 10%
# (the line above closes a call whose beginning is outside this excerpt)

# Take minority data samples from dataframe to array
neutral_array = df_neutral.to_numpy()

# Shuffle the data samples of minority class
# Rows are shuffled in place; the shuffle is not seeded here.
np.random.shuffle(neutral_array)

# Split minority class Neutral in 80:10:10 ratio.
# The fixed bounds give 869 / 109 / 109 rows.
# NOTE(review): assumes df_neutral has at least 1087 rows -- confirm.
train_neutral = neutral_array[0:869, :]
val_neutral = neutral_array[869:978, :]
test_neutral = neutral_array[978:1087, :]

# Resample Neutral data to match majority class samples.
# Each split is resampled with replacement to a fixed size (1017 / 127 / 127).
# NOTE(review): presumably the majority-class row counts of each split --
# verify against the majority split above.
train_neutral_resampled = resample(train_neutral, n_samples=1017, replace=True, random_state=0)
val_neutral_resampled = resample(val_neutral, n_samples=127, replace=True, random_state=0)
test_neutral_resampled = resample(test_neutral, n_samples=127, replace=True, random_state=0)

# Separate features and target labels for Neutral data.
# Columns 0..61 are taken as features and column 62 as the target.
X_train_neutral = train_neutral_resampled[:, 0:62]
X_val_neutral = val_neutral_resampled[:, 0:62]
X_test_neutral = test_neutral_resampled[:, 0:62]
y_train_neutral = train_neutral_resampled[:, 62]
# Final plotting calls of a routine whose beginning is outside this excerpt;
# `heading` is supplied by that unseen context.
plt.ylabel('True Positive Rate')
plt.title(heading)
plt.legend(loc="lower right")
plt.show()

#Main
#Read the Data
youTubeTrendingData = pd.read_csv("TrendingVideos.csv", encoding="UTF-8", index_col='video_id')
youTubeNonTrendingData = pd.read_csv("NonTrendingVideos.csv", encoding="UTF-8", index_col='V_id')

# Draw, without replacement, as many trending rows as there are non-trending
# rows; the draw is not seeded.
# NOTE(review): assumes the trending file has at least as many rows as the
# non-trending one -- confirm.
youTubeTrendingData = resample(
    youTubeTrendingData, replace=False, n_samples=len(youTubeNonTrendingData)
)  #Resampling of Data for balancing Class Labels

#Pre-processing the Data
youTubeData, youTubeTrendingData, youTubeNonTrendingData = preProcessTheData(
    youTubeTrendingData, youTubeNonTrendingData)

#Processing the combined Trending and Non Trending Data
youTubeData = processTheData(youTubeData, youTubeTrendingData)

#Drop unused features
youTubeDataForFeatureSelection = youTubeData.drop([
    'category_id', 'description', 'obtained_date', 'publish_time', 'thumbnail_link'
], axis=1)

#Divide the Data into features and class labels
# NOTE(review): the `.ix` indexer was removed in pandas 1.0; positional
# selection is done with `.iloc` in current pandas -- confirm the pandas
# version this runs on. The column tuple continues past this excerpt.
X = youTubeDataForFeatureSelection.ix[:, (0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
swa, ], class_weight=weight)
    # (the line above closes a call that starts outside this excerpt; the
    # enclosing function is invoked below as trainModel)
    # Turn the per-class outputs into class indices and score them.
    ypred = model.predict(X_test)
    ypred = np.argmax(ypred, axis=1)
    test_acc = balanced_accuracy_score(y_test, ypred)
    return model, test_acc


# Train 20 models, each on its own bootstrap sample of the data, and keep
# every model together with its score.
scores, members = list(), list()
used = list()
for co in range(20):  #20 splits
    # select indexes
    ix = [i for i in range(len(X))]
    train_ix = resample(ix, replace=True, n_samples=1000)  #generate a new set with 1000 samples
    # Rows that were never drawn into the bootstrap sample form the test set.
    # NOTE(review): `x not in train_ix` scans a list for every index; a set of
    # train_ix would make this linear.
    test_ix = [x for x in ix if x not in train_ix]
    print('Model {} of {}'.format(co + 1, 20))
    print('Unique training data: {}, testing data: {}'.format(
        X.shape[0] - len(test_ix), len(test_ix)))
    # select data
    X_train, y_train = X[train_ix], y[train_ix]
    X_test, y_test = X[test_ix], y[test_ix]
    # train each model
    model, test_acc = trainModel(X_train, y_train, X_test, y_test)
    print('Test accuracy: {:3.3f}'.format(test_acc))
    scores.append(test_acc)
    members.append(model)  #this list will hold all trained models
    # Accumulate every training index drawn so far (duplicates included).
    used += train_ix
def split_train_test(pos_data, neg_data, NEG_SIZE_TRAIN=NEG_SIZE_TRAIN):
    """
    Split the positive and negative examples into training and test sets.

    20% of the positive rows are held out for testing. The negative rows are
    shuffled, the first ``m_neg_test`` of them go to the test set, and the
    rest are arranged for training according to the module-level
    ``SPLIT_DATA_BY`` mode:

    - 'cut': truncated to a whole multiple of ``m_neg_train`` rows
    - 'reshape': as 'cut', then stacked into
      ``(m_neg_train, n_features, o_neg_train)`` when there are two or more
      batches
    - 'resample': ``O_NEG_RESAMPLED`` bootstrap samples of ``m_neg_train``
      rows each, stacked along the last axis
    - 'cut1': only the first ``m_neg_train`` rows

    Parameters
    ----------
    pos_data : array
        Positive examples, one row per example.
    neg_data : array-like
        Negative examples, one row per example.
    NEG_SIZE_TRAIN : int
        Number of negative training rows per positive training row.
        NOTE(review): must be an integer, like ``NEG_SIZE_TEST``, because
        the products are used as slice bounds -- confirm against the
        module constants.

    Returns
    -------
    tuple
        ``(pos_data_train, pos_data_test, neg_data_train, neg_data_test)``
    """
    import numbers

    m_total_pos = pos_data.shape[0]
    m_total_neg = neg_data.shape[0]

    pos_data_train, pos_data_test = cross_validation.train_test_split(
        pos_data, test_size=0.20, random_state=random_state)
    del pos_data  # don't have access to it anymore, agh!

    m_neg_train = pos_data_train.shape[0] * NEG_SIZE_TRAIN
    m_neg_test = pos_data_test.shape[0] * NEG_SIZE_TEST
    assert neg_data.shape[0] >= (m_neg_train + m_neg_test)

    # Split the negative data into training and test sets
    neg_data = np.array(neg_data)
    neg_data = utils.shuffle(neg_data, random_state=random_state)
    neg_data_train = neg_data[m_neg_test:]
    neg_data_test = neg_data[:m_neg_test]
    del neg_data  # don't have access to it anymore, agh!

    # Number of complete m_neg_train-sized batches available. Floor division
    # keeps this an integer on Python 3, where `/` would yield a float that
    # cannot be used as a slice bound, range() argument or array dimension.
    o_neg_train = neg_data_train.shape[0] // m_neg_train
    assert neg_data_train.shape[0] >= o_neg_train * m_neg_train

    # Cut the negative training examples to be an exact multiple of the positive training examples
    if SPLIT_DATA_BY == 'cut' or SPLIT_DATA_BY == 'reshape':
        neg_data_train = neg_data_train[:o_neg_train * m_neg_train]

    # Split negative examples (for the training set) into o_neg_train-sized batches
    if SPLIT_DATA_BY == 'reshape' and o_neg_train >= 2:
        neg_data_train_new = np.empty(
            (m_neg_train, neg_data_train.shape[1], o_neg_train), dtype=float)
        for o_idx in range(o_neg_train):
            neg_data_train_new[:, :, o_idx] = neg_data_train[
                m_neg_train * o_idx:m_neg_train * (o_idx + 1), :]
        neg_data_train = neg_data_train_new
        assert neg_data_train.shape[0] == m_neg_train

    # Sample negative data to generate different training examples
    if SPLIT_DATA_BY == 'resample':
        o_neg_train = O_NEG_RESAMPLED
        neg_data_train_new = np.empty(
            (m_neg_train, neg_data_train.shape[1], o_neg_train), dtype=float)
        for o_idx in range(o_neg_train):
            # Reusing one integer seed would make every resampled dataset
            # identical, so the seed is offset per dataset. The first dataset
            # keeps the original seed. Other kinds of random_state (None or
            # a generator object) are passed through unchanged.
            if isinstance(random_state, numbers.Integral):
                seed = random_state + o_idx
            else:
                seed = random_state
            neg_data_train_new[:, :, o_idx] = utils.resample(
                neg_data_train,
                replace=True,
                n_samples=m_neg_train,
                random_state=seed)
        neg_data_train = neg_data_train_new
        assert neg_data_train.shape[0] == m_neg_train
        print("Number of negative training datasets: %i" % o_neg_train)

    if SPLIT_DATA_BY == 'cut1':
        neg_data_train = neg_data_train[:m_neg_train]

    print("m_pos_train: %i, m_neg_train: %i, m_pos_total: %i, m_neg_total: %i" % \
        (pos_data_train.shape[0], neg_data_train.shape[0], m_total_pos, m_total_neg))
    print(pos_data_train.shape, pos_data_test.shape, neg_data_train.shape,
          neg_data_test.shape)

    return pos_data_train, pos_data_test, neg_data_train, neg_data_test
# The label is passed as a separate argument: the former "Test Set:" % ...
# had no placeholder, so the column names were never printed.
print("Test Set:", test.columns, test.shape, len(test))

# Noise to strip from a tweet: @mentions, any character that is not
# alphanumeric/space/tab, URLs, a leading "rt", and leftover "http" prefixes.
# The alternatives are joined without surrounding spaces; without re.VERBOSE
# a space in the pattern is a literal character, which kept most of the
# alternatives from ever matching.
_TWEET_NOISE = re.compile(
    r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?")


def clean_text(df, text_field):
    """Lower-case and de-noise the text column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text; it is modified in place.
    text_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(
        lambda elem: _TWEET_NOISE.sub("", elem))
    return df


# clean_text modifies its argument, so these names alias `test` and `train`.
test_clean = clean_text(test, "tweet")
train_clean = clean_text(train, "tweet")

# label == 0 is used as the majority class and label == 1 as the minority.
train_majority = train_clean[train_clean.label == 0]
train_minority = train_clean[train_clean.label == 1]

# Upsampling : We repeatedly takes samples with replacement from minority class until
# the class is the same size as the majority
train_minority_upsampled = resample(train_minority,
                                    replace=True,
                                    n_samples=len(train_majority),
                                    random_state=123)
train_upsampled = pd.concat([train_minority_upsampled, train_majority])
train_upsampled['label'].value_counts()

# Downsampling: shrink the majority class to the size of the minority class.
# NOTE(review): replace=True lets the downsampled majority contain duplicate
# rows; replace=False would draw distinct rows -- confirm the intent.
train_majority_downsampled = resample(train_majority,
                                      replace=True,
                                      n_samples=len(train_minority),
                                      random_state=123)
train_downsampled = pd.concat([train_majority_downsampled, train_minority])
train_downsampled['label'].value_counts()

# The model data is split from the upsampled frame only.
X_train, X_test, y_train, y_test = train_test_split(train_upsampled['tweet'],
                                                    train_upsampled['label'],
                                                    random_state=0)