def get_sms_feats(df):
    """Build per-phone SMS features from a message-level frame.

    ``df`` must provide the columns ``phone_no_m``, ``opposite_no_m``,
    ``calltype_id`` and ``request_datetime``.  The frame is modified in place:
    ``request_datetime`` is converted to datetimes and the helper columns
    ``hour`` and ``day`` are added.

    Returns a frame with one row per distinct ``phone_no_m`` holding
    ``sms_count``, ``sms_nunique``, ``sms_rate``, ``calltype_2``,
    ``calltype_rate`` and, for both the hour and the day of month, the mode,
    the count of the mode and the number of distinct values.
    """
    import numpy as np

    def _mode_value(values):
        # SciPy < 1.11 returns length-1 arrays for 1-D input, newer releases
        # return scalars; ravel() makes the extraction work with both.
        return np.ravel(stats.mode(values)[0])[0]

    def _mode_count(values):
        return np.ravel(stats.mode(values)[1])[0]

    df['request_datetime'] = pd.to_datetime(df['request_datetime'])
    df["hour"] = df['request_datetime'].dt.hour
    df["day"] = df['request_datetime'].dt.day

    phone_no_m = df[["phone_no_m"]].copy()
    phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')

    # Number of messages and number of distinct counterparts per phone.
    tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(
        sms_count="count", sms_nunique="nunique")
    tmp["sms_rate"] = tmp["sms_count"] / tmp["sms_nunique"]
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    # Share of messages with calltype_id == 2 (the original comment calls this
    # the SMS downlink ratio).
    calltype2 = df[df["calltype_id"] == 2].copy()
    calltype2 = calltype2.groupby("phone_no_m")["calltype_id"].agg(
        calltype_2="count")
    phone_no_m = phone_no_m.merge(calltype2, on="phone_no_m", how="left")
    phone_no_m["calltype_rate"] = (
        phone_no_m["calltype_2"] / phone_no_m["sms_count"])

    # Time-of-message features.
    tmp = df.groupby("phone_no_m")["hour"].agg(
        hour_mode=_mode_value,
        hour_mode_count=_mode_count,
        hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")

    tmp = df.groupby("phone_no_m")["day"].agg(
        day_mode=_mode_value,
        day_mode_count=_mode_count,
        day_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    return phone_no_m
def actionUpdate(self):
    """Plot the expression typed into the line edit and redraw the canvas.

    The text of ``self.ui.lineEdit`` is evaluated as a Python expression in
    which the sample points are available as ``x``; the result is plotted on
    ``self.mpl.axes``.
    """
    x = np.arange(-np.pi, np.pi, np.pi / 10)
    # SECURITY: eval() runs arbitrary code typed by the user.  It is kept
    # because the widget plots free-form expressions, but it must never be fed
    # text from an untrusted source.
    y = eval(str(self.ui.lineEdit.text()))
    self.mpl.axes.plot(x, y, '--rx', linewidth=2)
    self.mpl.axes.set_title('Sine Function')
    self.mpl.draw()
    # The call form prints the same text on Python 2 and also runs on Python 3.
    print(stats.mode([1, 2, 3, 3, 4, 5]))
def actionUpdate(self):
    """Plot the expression typed into the line edit, if there is one.

    An empty line edit leaves the plot untouched.  Otherwise the text is
    evaluated as a Python expression in which the sample points are available
    as ``x`` and the result is plotted on ``self.mpl.axes``.
    """
    x = np.arange(-np.pi, np.pi, np.pi / 10)
    expression = str(self.ui.lineEdit.text())
    if expression == "":
        return
    # SECURITY: eval() runs arbitrary code typed by the user.  It is kept
    # because the widget plots free-form expressions, but it must never be fed
    # text from an untrusted source.
    y = eval(expression)
    self.mpl.axes.plot(x, y, '--rx', linewidth=2)
    self.mpl.axes.set_title('Sine Function')
    self.mpl.draw()
    # The call form prints the same text on Python 2 and also runs on Python 3.
    print(stats.mode([1, 2, 3, 3, 4, 5]))
def stat_mode(df, cate_fea, num_fea):
    """Return the mode and mode count of each numeric column per group.

    Args:
        df: input frame.
        cate_fea: column name (or list of names) to group by.
        num_fea: iterable of column names to summarise.

    Returns:
        A frame with the grouping column(s) followed by ``<col>_mode`` and
        ``<col>_mode_count`` for every column in ``num_fea``.
    """
    import numpy as np

    def _mode_value(values):
        # SciPy < 1.11 returns length-1 arrays, newer releases return scalars.
        return np.ravel(stats.mode(values)[0])[0]

    def _mode_count(values):
        return np.ravel(stats.mode(values)[1])[0]

    # Named aggregation replaces the dict-renaming form of ``agg``, which
    # pandas removed in 1.0 ("nested renamer is not supported").
    aim_ = pd.concat([
        df.groupby(cate_fea)[num_].agg(**{
            num_ + '_mode': _mode_value,
            num_ + '_mode_count': _mode_count,
        }) for num_ in num_fea
    ], axis=1)
    aim_.reset_index(inplace=True)
    return aim_
def get_data(column, np_values, alpha):
    """Collect descriptive statistics for one column.

    Args:
        column: label reported under "Column".
        np_values: array of values; must offer ``min``/``max``/``mean``.
        alpha: probability passed to ``bayes_mvs``; also used in the
            confidence-interval labels.

    Returns:
        A list with one ``present(label, value)`` result per metric.
    """
    mvs = bayes_mvs(np_values, alpha)
    lowest = np_values.min()
    highest = np_values.max()
    # ravel() copes with both the array-valued (SciPy < 1.11) and the
    # scalar-valued (newer SciPy) result of stats.mode on 1-D input.
    mode_value = np.ravel(stats.mode(np_values)[0])[0]

    # report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", lowest),
        present("Max", highest),
        # The mid-range is the midpoint of the extremes; the previous
        # (max - min) / 2 reported half of the range instead.
        present("Mid-Range", (highest + lowest) / 2.0),
        present("Range", highest - lowest),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode", mode_value),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        # NOTE(review): the label says "Minhinge" but the value comes from
        # midhinge(); left unchanged in case consumers match on the label.
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value", normaltest(np_values)[1]),
    ]
    return output
def arrayStatistics(numpy_array, missing_value=N.nan):
    """Summarise the valid entries of an array.

    Entries equal to ``missing_value`` are ignored when it is finite; for
    float arrays, and whenever ``missing_value`` is not finite, non-finite
    entries are ignored as well.

    Returns:
        A dict with the keys ``min``, ``max``, ``mean``, ``stddev``,
        ``median``, ``mode`` and ``missing``.  ``mode`` is always a
        ``(values, counts)`` pair of 1-D arrays, and ``missing`` is the number
        of ignored entries.  With no valid entry every statistic is
        ``missing_value`` (``stddev`` is 0.0).
    """
    if N.isfinite(missing_value):
        valid_values = numpy_array[N.where(numpy_array != missing_value)]
        if numpy_array.dtype.kind == 'f':
            valid_values = valid_values[N.where(N.isfinite(valid_values))]
    else:
        valid_values = numpy_array[N.where(N.isfinite(numpy_array))]

    if len(valid_values) > 0:
        mode_result = scipy_stats.mode(valid_values)
        # SciPy >= 1.11 returns scalars for 1-D input.  Re-wrap them as 1-D
        # arrays so the entry has the same shape as in the empty branch and
        # ``stats['mode'][0][0]`` keeps working.
        mode_result = type(mode_result)(N.atleast_1d(mode_result[0]),
                                        N.atleast_1d(mode_result[1]))
        statistics = {
            'min': N.min(valid_values),
            'max': N.max(valid_values),
            'mean': N.mean(valid_values),
            'stddev': N.std(valid_values),
            'median': N.median(valid_values),
            'mode': mode_result,
            'missing': len(numpy_array) - len(valid_values),
        }
    else:
        statistics = {
            'min': missing_value,
            'max': missing_value,
            'mean': missing_value,
            'stddev': 0.0,
            'median': missing_value,
            'mode': (N.array([missing_value, ]), N.array([len(numpy_array), ])),
            'missing': len(numpy_array),
        }
    return statistics
def knn_classifier(X_train, y_train, X_validation, X_test, k):
    """Predict labels for validation and test data with a k-NN majority vote.

    Input:
        X_train      - num_train x num_features matrix with training features
        y_train      - num_train labels (a vector or a num_train x 1 column)
        X_validation - num_validation x num_features matrix
        X_test       - num_test x num_features matrix
        k            - number of neighbours taken into account
    Output:
        y_pred_validation - num_validation x 1 vector of predicted labels
        y_pred_test       - num_test x 1 vector of predicted labels
    """
    X_test_val = np.vstack((X_validation, X_test))
    # Standardized Euclidean distance of every validation/test point to every
    # training point.
    D = cdist(X_test_val, X_train, metric='seuclidean')
    # Indices of the k closest training points for each row.
    sort_ix_k = np.argsort(D, axis=1)[:, :k]
    # Flatten the labels so a column vector does not add a third axis.
    neighbour_labels = np.ravel(y_train)[sort_ix_k]
    # The prediction is the mode of the k nearest labels.  reshape() pins the
    # documented (n, 1) shape, which SciPy < 1.11 produced but newer releases
    # (returning shape (n,)) do not.
    predicted_labels = np.asarray(mode(neighbour_labels, axis=1)[0]).reshape(-1, 1)
    y_pred_validation = predicted_labels[:len(X_validation)]
    y_pred_test = predicted_labels[len(X_validation):]
    return y_pred_validation, y_pred_test
def get_diff(set_val, time_slots, num_type, conf_lev):
    """Summarise the largest sample-to-sample jumps inside each time slot.

    Args:
        set_val: pair ``(timestamps, values)`` of arrays; the timestamps are
            compared with the output of ``dtime_to_unix``.
        time_slots: slot start times; all slots are taken to be as long as the
            gap between the first two entries.
        num_type: ``FLOAT_TYPE`` (mean of the jumps) or ``INT_TYPE`` (mode).
        conf_lev: fraction of the smallest jumps discarded before summarising.

    Returns:
        1-D array with one value per slot; ``np.inf`` where a slot holds fewer
        than two samples or no jump survives the cut.

    Raises:
        NameError: if ``num_type`` is neither ``INT_TYPE`` nor ``FLOAT_TYPE``.
    """
    time_slots_utc = dtime_to_unix(time_slots)
    # total_seconds() keeps whole days; ``.seconds`` silently drops them, which
    # made every slot of one day or longer too short.
    slot_seconds = (time_slots[1] - time_slots[0]).total_seconds()
    diff_mean = []
    for utc_t_s in time_slots_utc:
        utc_t_e = utc_t_s + slot_seconds
        idx = np.nonzero((set_val[0] >= utc_t_s) & (set_val[0] < utc_t_e))[0]
        diff_val = np.inf
        if len(idx) >= 2:
            temp_val = np.abs(np.diff(set_val[1][idx]))
            cut = int(np.floor(len(temp_val) * conf_lev))
            upper_val = np.sort(temp_val)[cut:]
            if len(upper_val) > 0:
                if num_type == FLOAT_TYPE:
                    diff_val = np.mean(upper_val)
                elif num_type == INT_TYPE:
                    # ravel() handles array- and scalar-valued mode results.
                    diff_val = int(np.ravel(stats.mode(upper_val)[0])[0])
                else:
                    log.error('Sample type must either INT or FLOAT type')
                    raise NameError('Sample type must either INT or FLOAT type')
        diff_mean.append(diff_val)
    return np.array(diff_mean)
def get_data(column, np_values, alpha):
    """Collect descriptive statistics for one column.

    Args:
        column: label reported under "Column".
        np_values: array of values; must offer ``min``/``max``/``mean``.
        alpha: probability passed to ``bayes_mvs``; also used in the
            confidence-interval labels.

    Returns:
        A list with one ``present(label, value)`` result per metric.
    """
    mvs = bayes_mvs(np_values, alpha)
    smallest = np_values.min()
    largest = np_values.max()
    # ravel() copes with both the array-valued (SciPy < 1.11) and the
    # scalar-valued (newer SciPy) result of stats.mode on 1-D input.
    most_common = np.ravel(stats.mode(np_values)[0])[0]

    # report these metrics
    output = [
        present("Column", column),
        present("Length", len(np_values)),
        present("Unique", len(np.unique(np_values))),
        present("Min", smallest),
        present("Max", largest),
        # The mid-range is the midpoint of the extremes; the previous
        # (max - min) / 2 reported half of the range instead.
        present("Mid-Range", (largest + smallest) / 2.0),
        present("Range", largest - smallest),
        present("Mean", np_values.mean()),
        present("Mean-%s-CI" % alpha, tupleToString(mvs[0][1])),
        present("Variance", mvs[1][0]),
        present("Var-%s-CI" % alpha, tupleToString(mvs[1][1])),
        present("StdDev", mvs[2][0]),
        present("Std-%s-CI" % alpha, tupleToString(mvs[2][1])),
        present("Mode", most_common),
        present("Q1", stats.scoreatpercentile(np_values, 25)),
        present("Q2", stats.scoreatpercentile(np_values, 50)),
        present("Q3", stats.scoreatpercentile(np_values, 75)),
        present("Trimean", trimean(np_values)),
        # NOTE(review): the label says "Minhinge" but the value comes from
        # midhinge(); left unchanged in case consumers match on the label.
        present("Minhinge", midhinge(np_values)),
        present("Skewness", stats.skew(np_values)),
        present("Kurtosis", stats.kurtosis(np_values)),
        present("StdErr", sem(np_values)),
        present("Normal-P-value", normaltest(np_values)[1]),
    ]
    return output
def predict(clf2, test_set):
    """Predict per-row labels, vote per user and write the result files.

    Args:
        clf2: fitted model offering ``predict(array)``.
        test_set: frame with a ``user_id`` column; every other column is fed
            to the model as a feature.

    Side effects:
        Writes ``result/uid_<timestamp>.csv`` (row predictions ``y_hat`` and
        the per-user majority ``label``) and appends the ids of users whose
        label is 1 to ``result/submission_<timestamp>.csv``, one per line.
        Progress and the selected users are printed.
    """
    import os

    import numpy as np

    uid = pd.DataFrame()
    uid["user_id"] = test_set["user_id"]
    features = test_set.drop(labels=["user_id"], axis=1)

    print("begin to make predictions")
    res = clf2.predict(features.values)
    # Attach the predictions positionally.  A bare pd.Series(res) carries a
    # fresh 0..n-1 index and would be mis-aligned (NaN) whenever test_set has
    # any other index.
    uid["y_hat"] = pd.Series(res, index=uid.index)
    # Majority vote per user; ravel() handles both the array-valued
    # (SciPy < 1.11) and scalar-valued mode result.
    uid["label"] = uid.groupby(by=["user_id"])["y_hat"].transform(
        lambda x: np.ravel(stats.mode(x)[0])[0])

    # One timestamp for both files so they can be matched up afterwards.
    str_time = str(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M"))
    os.makedirs("result", exist_ok=True)

    uid_file = "result/uid_" + str_time + ".csv"
    uid.to_csv(uid_file, header=True, index=False)

    active_users = (uid.loc[uid["label"] == 1]).user_id.unique().tolist()
    print(len(active_users))
    print(active_users)

    submission_file = "result/submission_" + str_time + ".csv"
    with open(submission_file, "a", newline="") as f:
        writer = csv.writer(f)
        for i in active_users:
            writer.writerow([i])
def fit_predict(self, X, y, **fit_params):
    """Store the training set and predict a label for each training sample.

    Each sample is predicted from its ``self.k`` nearest *other* samples under
    the Minkowski distance of order ``self.p``.  ``self.weights`` selects the
    vote: ``'uniform'`` (plain majority), ``'inverse'`` (1 / distance) or
    ``'Gaussian'`` (exp(-distance**2 / 2)); the weighted variants delegate to
    ``self.weighted_predict``.

    Raises:
        ValueError: if ``self.weights`` is none of the supported values.
    """
    self.X_ = X
    self.y_ = y
    n_samples = len(X)
    # To stay vectorised, the training set is rolled against itself: after
    # j + 1 rolls, row i of X_roll is X[(i + j + 1) % n_samples].
    X_roll = X
    distances = np.zeros((n_samples, n_samples - 1))
    for i in range(n_samples - 1):
        X_roll = np.roll(X_roll, -1, axis=0)
        # abs() is required for odd p (e.g. p=1), where signed differences
        # would otherwise cancel out or go negative.  1.0 / p avoids integer
        # division on Python 2.
        distances[:, i] = np.power(
            np.sum(np.power(np.abs(X - X_roll), self.p), axis=1),
            1.0 / self.p)
    min_k_ind = np.argpartition(distances, self.k, axis=1)[:, :self.k]
    # distances[i, j] is the distance between X[i] and
    # X[(i + j + 1) % n_samples], so the column index is shifted back to a
    # sample index.
    neighbors_ind = (min_k_ind +
                     np.arange(1, n_samples + 1).reshape(-1, 1)) % n_samples
    neighbors_labels = y[neighbors_ind]

    # Unweighted vote.
    if self.weights == 'uniform':
        return stats.mode(neighbors_labels, axis=1).mode.flatten()

    # Weighted vote.  Fancy indexing replaces np.vstack(<generator>), which
    # recent NumPy releases reject.
    neighbors_distances = distances[np.arange(n_samples)[:, np.newaxis],
                                    min_k_ind]
    if self.weights == 'inverse':
        weights_ = 1 / neighbors_distances
    elif self.weights == 'Gaussian':
        weights_ = np.exp(-np.square(neighbors_distances) / 2)
    else:
        raise ValueError("unsupported weights: %r" % (self.weights,))
    return self.weighted_predict(neighbors_labels, weights_)
def extractfromfits(self, filename, loc, size, sky=0.):
    """
    Extracts a subregion from a fits file into ``self.array`` and converts it
    according to the astro and math convention: x = horizontal, y = vertical,
    pixel (0,0) is at the bottom left.

    ``loc`` is the (x, y) centre and ``size`` the side length of the region.
    NaN pixels are replaced by ``sky`` and ``sky`` is subtracted; with
    ``sky=None`` the mode of the extracted pixels is subtracted instead.
    ``self.setzscale()`` is called at the end.

    Raises RuntimeError if the file does not hold exactly one HDU or if
    ``loc`` lies outside the image.
    """
    x, y = loc
    radius = int(size / 2)
    r = size - radius * 2  # remainder: 1 for an odd integer size, 0 for even

    hdulist = pyfits.open(filename)  # open a FITS file
    try:
        if len(hdulist) != 1:
            raise RuntimeError("extractfromfits : len(hdulist) > 1 not allowed")
        fulldata = hdulist[0].data  # assumes the first extension is an image

        if x < 0 or y < 0 or x >= fulldata.shape[1] or y >= fulldata.shape[0]:
            raise RuntimeError("extractfromfits : bad extraction parameters")

        if (x + radius + r >= fulldata.shape[1] or
                y + radius + r >= fulldata.shape[0] or
                x - radius < 0 or y - radius < 0):
            # TODO: set outside pixels to NaN
            print("outside")

        # The transposition makes the pixel array coordinates (x, y) equal to
        # those in the ds9 display etc.  astype() copies the pixels into a
        # float64 array, so they no longer depend on the (soon closed) file
        # and the in-place arithmetic below also works for integer images.
        # This is the "switch to 8 byte" the original attempted on an array
        # it then immediately overwrote.
        self.array = fulldata[y - radius:y + radius + r,
                              x - radius:x + radius + r].transpose().astype(np.float64)
    finally:
        # Close the file on the error paths as well.
        hdulist.close()

    if sky is None:
        # TODO: check...
        # ravel() handles array- and scalar-valued mode results.
        self.array -= np.ravel(stats.mode(self.array.ravel())[0])[0]
    else:
        self.array[np.isnan(self.array)] = sky
        self.array -= sky

    self.setzscale()
def create_dataset(X, y, time_steps=1, step=1):
    """Cut a frame into sliding windows labelled with each window's mode.

    Args:
        X: pandas object with the features, sliced positionally.
        y: pandas object with one label per row of ``X``.
        time_steps: number of rows per window.
        step: offset between the starts of consecutive windows.

    Returns:
        ``(Xs, ys)``: the stacked windows and an ``(n_windows, 1)`` array with
        the most frequent label of every window.  Window starts run over
        ``range(0, len(X) - time_steps, step)``.
    """
    Xs, ys = [], []
    for start in range(0, len(X) - time_steps, step):
        stop = start + time_steps
        Xs.append(X.iloc[start:stop].values)
        # ravel() handles both the array-valued (SciPy < 1.11) and the
        # scalar-valued (newer SciPy) mode result.
        ys.append(np.ravel(stats.mode(y.iloc[start:stop])[0])[0])
    return np.array(Xs), np.array(ys).reshape(-1, 1)
def unsupervised_habitat_class_modes(self):
    """Map every habitat to the most frequent of its unsupervised classes.

    Reads ``self.habitats`` and ``self.unsupervised_habitat_class_dict``.
    A habitat without any class value maps to ``None``.
    """
    import numpy as np

    hcm = {}
    for hab in self.habitats:
        classes = np.asarray(self.unsupervised_habitat_class_dict[hab])
        if classes.size == 0:
            hcm[hab] = None
            continue
        # ravel() turns the scalar returned by SciPy >= 1.11 into the
        # length-1 array older releases returned, so len() keeps working.
        md = np.ravel(mode(classes)[0])
        hcm[hab] = md[0] if len(md) == 1 else None
    return hcm
def unsupervised_habitat_class_modes(self):
    """Return ``{habitat: modal unsupervised class}`` for ``self.habitats``.

    The classes are read from ``self.unsupervised_habitat_class_dict``; a
    habitat whose class list is empty gets ``None``.
    """
    import numpy as np

    def _single_mode(values):
        values = np.asarray(values)
        if values.size == 0:
            return None
        # SciPy >= 1.11 returns a scalar for 1-D input; ravel() restores the
        # length-1 array the len() check below was written for.
        md = np.ravel(mode(values)[0])
        if len(md) == 1:
            return md[0]
        return None

    return {hab: _single_mode(self.unsupervised_habitat_class_dict[hab])
            for hab in self.habitats}
def calculate_weighted_loss(data):
    """Loss of predicting the most frequent label for every row of ``data``.

    The last column of ``data`` holds the labels.  Their mode is used as a
    constant prediction and scored with ``ch.custom_weighted_loss``.  Returns
    0 when ``data`` has no rows.
    """
    if data.shape[0] == 0:
        return 0
    label_column = np.array(data[:, -1].tolist())
    # ravel() handles both the array-valued (SciPy < 1.11) and the
    # scalar-valued (newer SciPy) mode result.
    y_prediction_mode = np.ravel(stats.mode(label_column, axis=0)[0])[0]
    y_prediction = [y_prediction_mode] * label_column.shape[0]
    return ch.custom_weighted_loss(label_column, np.array(y_prediction))
def Aggregate(self, sourceRaster, outFName, method=None, numSourcePerTarget=10):
    """Aggregate ``sourceRaster`` into a coarser GeoTIFF, block by block.

    Args:
        sourceRaster: raster offering ``height``, ``width`` and
            ``getBlock(x, y, cols, rows)``.
        outFName: output name; its extension is replaced by ``.tif``.
        method: ``"Mean"`` (default), ``"Max"``, ``"Min"`` or ``"STD"`` are
            handled by ``self.rebin``; anything else takes the mode of each
            ``numSourcePerTarget`` x ``numSourcePerTarget`` cell group.
        numSourcePerTarget: source pixels per target pixel along each axis.

    The output raster copies its parameters from ``self.templateRaster``.
    """
    tmpOutput = os.path.splitext(outFName)[0] + ".tif"
    tmpOutDataset = SpatialUtilities.SAHMRaster(tmpOutput)
    tmpOutDataset.pullParamsFromRaster(self.templateRaster.source)
    tmpOutDataset.createNewRaster()

    rows = int(sourceRaster.height)
    cols = int(sourceRaster.width)

    if method is None:
        method = "Mean"

    # Process the source in square blocks of roughly 2048 source pixels,
    # rounded to a whole number of target pixels.
    bSize = 2048
    bSize = int(round(bSize / numSourcePerTarget) * numSourcePerTarget)
    if bSize == 0:
        bSize = int(numSourcePerTarget)

    for i in range(0, rows, bSize):
        numRows = bSize if i + bSize < rows else rows - i
        for j in range(0, cols, bSize):
            numCols = bSize if j + bSize < cols else cols - j
            data = sourceRaster.getBlock(j, i, numCols, numRows)

            if method in ["Mean", "Max", "Min", "STD"]:
                # Floor division keeps the target shape integral on Python 3,
                # matching the ``//`` already used in the mode branch below.
                ans = self.rebin(data,
                                 (numRows // numSourcePerTarget,
                                  numCols // numSourcePerTarget),
                                 method)
            else:
                X, Y = data.shape
                x = X // numSourcePerTarget
                y = Y // numSourcePerTarget
                # Regroup so that each row holds the source pixels of one
                # target pixel, then take the mode of every row.
                ndMask = data.reshape(
                    (x, numSourcePerTarget, y, numSourcePerTarget))
                ndMask = ndMask.transpose([0, 2, 1, 3])
                ndMask = ndMask.reshape(
                    (x * y, numSourcePerTarget * numSourcePerTarget))
                ans = np.array(stats.mode(ndMask, 1)[0]).reshape(x, y)

            tmpOutDataset.putBlock(ans, int(j / numSourcePerTarget),
                                   int(i / numSourcePerTarget))

    tmpOutDataset.calcStats()
    tmpOutDataset.close()
def HMM(X, Y, human_bounds):
    """Score human event boundaries against randomly placed boundaries.

    Timepoints are assigned to events using ``human_bounds``.  From the
    timepoint-by-timepoint correlation matrix of ``Y``, the difference between
    the mean within-event and the mean across-event correlation is computed
    (via ``fisher_mean``), looking only at pairs of timepoints closer than the
    longest event.  The same statistic is then computed for 1000 random
    boundary placements.

    Parameters
    ----------
    X: 2D ndarray; only ``X.shape[1]`` is used, as the number of timepoints
    Y: 2D ndarray whose columns are the timepoints to correlate
    human_bounds: timepoint indices at which a new event starts

    Returns
    -------
    within_across: array of length 1001; entry 0 belongs to the human
        boundaries, entries 1..1000 to the permutations

    Note: the global NumPy random state is re-seeded on every iteration.
    """
    nPerm = 1000
    within_across = np.zeros(nPerm + 1)
    K = len(human_bounds) + 1
    nTR = X.shape[1]

    # create events vector out of human bounds
    full_human_bounds = np.append(np.append([0], human_bounds), [nTR])
    events = np.zeros((nTR))
    for l in range(len(full_human_bounds) - 1):
        events[full_human_bounds[l]:full_human_bounds[l + 1]] = l

    # Length of the longest event, i.e. the count of the most frequent label.
    # Computed with np.unique because stats.mode(...)[1][0] fails on
    # SciPy >= 1.11, where the counts of 1-D input come back as a scalar.
    max_event_length = int(np.unique(events, return_counts=True)[1].max())

    # compute timepoint by timepoint correlation matrix
    cc = np.corrcoef(Y.T)  # Should be a time by time correlation matrix

    # Create a mask to only look at values up to max_event_length
    local_mask = np.zeros(cc.shape, dtype=bool)
    for k in range(1, max_event_length):
        local_mask[np.diag(np.ones(cc.shape[0] - k, dtype=bool), k)] = True

    for p in range(nPerm + 1):
        same_event = events[:, np.newaxis] == events
        within = fisher_mean(cc[same_event * local_mask])
        across = fisher_mean(cc[(~same_event) * local_mask])
        within_across[p] = within - across

        # Draw K - 1 random boundaries for the next iteration.
        np.random.seed(p)
        # ``np.int`` was removed in NumPy 1.24; the builtin is equivalent.
        events = np.zeros(nTR, dtype=int)
        events[np.random.choice(nTR, K - 1, replace=False)] = 1
        events = np.cumsum(events)

    return within_across
def HMM(X, Y, human_bounds):
    """Fit an event-segmentation model to ``X`` and score it on ``Y``.

    A brainiak ``EventSegment`` model with ``len(human_bounds) + 1`` events is
    fitted to ``X.T``.  From the timepoint-by-timepoint correlation matrix of
    ``Y``, the difference between the mean within-event and the mean
    across-event correlation is computed (via ``fisher_mean``), looking only
    at pairs of timepoints closer than the longest event.  The same statistic
    is then computed for 1000 permutations of the event lengths.

    Parameters
    ----------
    X: 2D ndarray the model is fitted to; ``X.shape[1]`` timepoints
    Y: 2D ndarray whose columns are the timepoints to correlate
    human_bounds: only its length is used, to set the number of events

    Returns
    -------
    within_across: array of length 1001; entry 0 belongs to the fitted
        segmentation, entries 1..1000 to the permutations
    bounds: indices after which the fitted event label changes

    Note: the global NumPy random state is re-seeded on every iteration.
    """
    nPerm = 1000
    within_across = np.zeros(nPerm + 1)
    K = len(human_bounds) + 1
    nTR = X.shape[1]

    ev = brainiak.eventseg.event.EventSegment(K, split_merge=True,
                                              split_merge_proposals=3)
    ev.fit(X.T)
    events = np.argmax(ev.segments_[0], axis=1)
    bounds = np.where(np.diff(events))[0]
    _, event_lengths = np.unique(events, return_counts=True)
    # Longest event.  Taken from the counts already at hand, because
    # stats.mode(...)[1][0] fails on SciPy >= 1.11 (scalar counts).
    max_event_length = int(event_lengths.max())

    # compute timepoint by timepoint correlation matrix
    cc = np.corrcoef(Y.T)  # Should be a time by time correlation matrix

    # Create a mask to only look at values up to max_event_length
    local_mask = np.zeros(cc.shape, dtype=bool)
    for k in range(1, max_event_length):
        local_mask[np.diag(np.ones(cc.shape[0] - k, dtype=bool), k)] = True

    for p in range(nPerm + 1):
        same_event = events[:, np.newaxis] == events
        within = fisher_mean(cc[same_event * local_mask])
        across = fisher_mean(cc[(~same_event) * local_mask])
        within_across[p] = within - across

        # Shuffle the event lengths to build the next segmentation.
        np.random.seed(p)
        perm_lengths = np.random.permutation(event_lengths)
        # ``np.int`` was removed in NumPy 1.24; the builtin is equivalent.
        events = np.zeros(nTR, dtype=int)
        events[np.cumsum(perm_lengths[:-1])] = 1
        events = np.cumsum(events)

    return within_across, bounds
def test_kpi_level_model(predict_result_file, final_result_file):
    """Assign an alert level to every predicted event and write the result.

    Rows of ``predict_result_file`` whose ``predict_event`` is ``'1'`` are
    scored, per ``alertgroup``, by the models returned from
    ``kpi_level_model.test_KNN_model(cluster_data_dir)``.  The rounded
    predictions of the first five models are combined per row: when both
    their mode and their maximum are at most 2, the prediction of the model
    mapped to the row's alert group is used, otherwise the maximum.

    The combined table is written to ``final_result_file`` as CSV;
    intermediate values are printed.
    """
    df = pd.read_csv(predict_result_file, sep=',', dtype=str)
    df = df[df['predict_event'] == '1']
    mapping_dict = {'Biz': 0, 'Mon': 1, 'Ora': 2, 'Trd': 3, 'Other': 4}
    knn_model_list = kpi_level_model.test_KNN_model(cluster_data_dir)
    all_df = pd.DataFrame(columns=[
        'alertgroup', 'classifier', 'hostname', 'predict_event', 'predict_level'
    ])
    column_list = [
        'cpu_max', 'cpu_min', 'mem_max', 'mem_min',
        'cpu_max_1', 'cpu_min_1', 'mem_max_1', 'mem_min_1',
        'cpu_max_2', 'cpu_min_2', 'mem_max_2', 'mem_min_2'
    ]
    for alertgroup, group in df.groupby('alertgroup'):
        data = group[column_list]
        kpi_predict_result = [model.predict(data) for model in knn_model_list]
        print(kpi_predict_result)

        predict_results = np.zeros(len(group))
        df_res = pd.DataFrame(columns=['predict_level'])
        for idx in range(len(group)):
            sample_predict_vec = np.array([
                np.round(kpi_predict_result[m][idx]) for m in range(5)
            ])
            # Mode of the five model predictions.  ravel() handles both the
            # array-valued (SciPy < 1.11) and scalar-valued mode result.
            mode_prediction_res = np.ravel(stats.mode(sample_predict_vec)[0])[0]
            print(mode_prediction_res)
            # Maximum of the five model predictions.
            max_prediction_res = sample_predict_vec[np.argmax(sample_predict_vec)]
            print(max_prediction_res)
            # Prediction of the model that belongs to this row's alert group
            # (never larger than the maximum).
            group_prediction_res = sample_predict_vec[mapping_dict[alertgroup]]
            print(group_prediction_res)

            if mode_prediction_res <= 2 and max_prediction_res <= 2:
                predict_results[idx] = group_prediction_res
            else:
                predict_results[idx] = max_prediction_res
            df_res.loc[idx] = int(predict_results[idx])

        new_df = group[[
            'alertgroup', 'classifier', 'hostname', 'predict_event'
        ]].reset_index(drop=True).join(df_res, how='outer')
        all_df = pd.concat([all_df, new_df])

    print(all_df)
    all_df.to_csv(final_result_file, sep=',', index=False)
def average_predictions(models, test_features, np_value="int", method="mode"):
    """Combine the predictions of several models row by row.

    Each model's ``predict(test_features)`` output becomes one column.  With
    ``method="mode"`` the most frequent value of each row is returned, with
    ``method="mean"`` the row average; both are cast to ``np_value``.  Any
    other method raises ``Exception``.  The first five rows are printed
    before and after combining.
    """
    per_model = [model.predict(test_features) for model in models]
    predictions = np.column_stack(per_model)
    print(f"Predictions of first 5 rows: {predictions[:5]}")

    if method == "mode":
        combined = stats.mode(predictions, axis=1)[0]
    elif method == "mean":
        combined = np.average(predictions, axis=1)
    else:
        raise Exception("Method undefined")
    averaged_predictions = combined.astype(np_value)

    print(f"Averaged predictions of first 5 rows: {averaged_predictions[:5]}")
    return np.hstack(averaged_predictions)
def predict(self, examples):
    """Predict one label per example by majority vote of its neighbours.

    Each example's ``X`` is flattened to a row of ``WIDTH * HEIGHT`` values,
    ``self.lsh.kneighbors`` returns the neighbour indices, and the most
    frequent entry of ``self.Y`` among those neighbours is the prediction.

    Returns:
        A list with one label per example.
    """
    import numpy as np

    X = vstack([reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples])
    dist, ind = self.lsh.kneighbors(X)
    # Look up all neighbour labels at once.  This replaces the Python-2-only
    # xrange loops, which also wrote the labels back into the integer index
    # array and thereby forced them to the index dtype.
    neighbour_labels = np.asarray(self.Y)[ind]
    vals, counts = mode(neighbour_labels, axis=1)
    return reshape(vals, (1, len(examples))).tolist()[0]
def info_univariate(data, features_name):
    """Print max, min, mean, mode and median of every feature, then plot.

    Args:
        data: 2-D array-like with one row per sample, one column per feature.
        features_name: feature labels, in column order.

    The transposed data is passed to ``plot_boxnotch_univariateanalysis``.
    """
    df_np = np.array(data)
    # The original transposed an undefined name ``d``; the array built from
    # ``data`` is what is meant.
    df_transposed = np.transpose(df_np)
    for f, values in enumerate(df_transposed):
        ds = sorted(values)
        moda = stats.mode(ds)
        print(
            'Feature: {}:\nMAX: --> {}\nMIN: --> {}\nAVG: --> {}\nMODE: --> V:{} --> {}\nMed --> {}\n'
            .format(features_name[f], np.max(values), np.min(values),
                    round(np.mean(values), 1), moda[0], moda[1],
                    np.median(ds)))
    plot_boxnotch_univariateanalysis(df_transposed, features_name)
    return
def ensemble_voting(predictions, gold, dataset):
    """Majority-vote several prediction vectors and score the outcome.

    ``predictions`` is stacked along a new first axis and the mode over that
    axis is returned as integers.  Unless ``dataset`` equals ``"test"``,
    accuracy and macro F1 against ``gold`` are computed with ``acc`` and
    ``f1_macro`` and printed; for ``"test"`` both scores are 0.

    Returns:
        ``(modals, accuracy, f1)``.
    """
    votes = numpy.stack(predictions, axis=0)
    majority = stats.mode(votes, axis=0)[0]
    modals = majority.squeeze().astype(int)

    if dataset == "test":
        return modals, 0, 0

    accuracy = acc(gold, modals)
    f1 = f1_macro(gold, modals)
    print("acc: ", accuracy)
    print("f1: ", f1)
    return modals, accuracy, f1
def density_categorical_accuracy(labels, predicted_labels, classes):
    """Chance-corrected purity of a clustering.

    Every cluster is scored by the share of its most frequent true label; the
    scores are averaged with the cluster sizes as weights and rescaled so that
    the chance level ``1 / classes`` maps to 0 and a perfect result to 1.

    Args:
        labels: true label of each sample.
        predicted_labels: non-negative integer cluster id of each sample.
        classes: number of distinct true classes.

    Returns:
        The corrected accuracy, or 0 when there are no samples.
    """
    assert (len(labels) == len(predicted_labels))
    if len(labels) == 0:
        return 0
    n_cluster = np.max(predicted_labels) + 1
    clusters = [[] for _ in range(n_cluster)]
    for label, predicted_label in zip(labels, predicted_labels):
        clusters[predicted_label].append(label)
    # Unused cluster ids carry zero weight, so they are dropped instead of
    # crashing the per-cluster computation.
    clusters = [members for members in clusters if members]
    # The largest label count is the count of the mode.  np.unique avoids
    # stats.mode(...)[1][0], which fails on SciPy >= 1.11 (scalar counts).
    purities = [
        np.unique(members, return_counts=True)[1].max() / float(len(members))
        for members in clusters
    ]
    catacc = np.average(purities, weights=[len(members) for members in clusters])
    corrected_catacc = (catacc - 1.0 / classes) / (1.0 - 1.0 / classes)
    return corrected_catacc
def get_qda_oof_prediction(x_train, y_train, x_test):
    """Out-of-fold QDA predictions for the training set, voted ones for test.

    For every split produced by the module-level ``skf`` a fresh
    ``QuadraticDiscriminantAnalysis`` is fitted; it predicts the held-out
    training rows and the whole test set, and the fold's F1 score is printed.
    The test prediction is the mode over the folds.

    Relies on the module-level ``ntrain``, ``ntest``, ``NFOLDS`` and ``skf``.

    Returns:
        ``(oof_train, oof_test)``, both reshaped to column vectors.
    """
    train_preds = np.zeros((ntrain,))
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold, (fit_idx, holdout_idx) in enumerate(skf.split(x_train, y_train)):
        qda = QuadraticDiscriminantAnalysis()
        qda.fit(x_train[fit_idx], y_train[fit_idx])

        train_preds[holdout_idx] = qda.predict(x_train[holdout_idx])
        fold_test_preds[fold, :] = qda.predict(x_test)

        fold_score = f1_score(y_train[holdout_idx], train_preds[holdout_idx])
        print("Test score {} ".format(fold_score))

    voted_test = stats.mode(fold_test_preds, axis=0)[0]
    return train_preds.reshape(-1, 1), voted_test.reshape(-1, 1)
def predict(self, X):
    """Predict a label for every row of ``X`` from the stored training set.

    The ``self.k`` nearest training samples (``self.X_`` / ``self.y_``) under
    the Minkowski distance of order ``self.p`` vote on the label.
    ``self.weights`` selects the vote: ``'uniform'`` (plain majority),
    ``'inverse'`` (1 / distance) or ``'Gaussian'`` (exp(-distance**2 / 2));
    the weighted variants delegate to ``self.weighted_predict``.

    Raises:
        ValueError: if ``self.weights`` is none of the supported values.
    """
    distances = np.zeros((len(X), len(self.X_)))
    for i in range(len(X)):
        # abs() is required for odd p (e.g. p=1), where signed differences
        # would otherwise cancel out or go negative.  1.0 / p avoids integer
        # division on Python 2.
        distances[i, :] = np.power(
            np.sum(np.power(np.abs(X[i] - self.X_), self.p), axis=1),
            1.0 / self.p)
    neighbors_ind = np.argpartition(distances, self.k, axis=1)[:, :self.k]
    neighbors_labels = self.y_[neighbors_ind]

    if self.weights == 'uniform':
        # Vote across each row's neighbours (axis=1).  The default axis=0
        # took the mode across samples and returned k values instead of one
        # label per row.
        return stats.mode(neighbors_labels, axis=1).mode.flatten()

    # Fancy indexing replaces np.vstack(<generator>), which recent NumPy
    # releases reject.
    neighbors_distances = distances[np.arange(len(X))[:, np.newaxis],
                                    neighbors_ind]
    if self.weights == 'inverse':
        weights_ = 1 / neighbors_distances
    elif self.weights == 'Gaussian':
        weights_ = np.exp(-np.square(neighbors_distances) / 2)
    else:
        raise ValueError("unsupported weights: %r" % (self.weights,))
    return self.weighted_predict(neighbors_labels, weights_)
def get_feature(data_dict_samples, num_type):
    """Reduce every sample list to one number and return them as a column.

    An empty sample yields ``np.inf``.  Otherwise ``INT_TYPE`` samples are
    summarised by their mode (as ``int``) and ``FLOAT_TYPE`` samples by their
    mean; any other ``num_type`` raises ``NameError``.

    Returns:
        Array of shape ``(len(data_dict_samples), 1)``.
    """
    def _summarise(sample):
        # No sample available: mark the feature as infinite.
        if len(sample) == 0:
            return np.inf
        if num_type == INT_TYPE:
            return int(stats.mode(sample)[0])
        if num_type == FLOAT_TYPE:
            return np.mean(sample)
        raise NameError('Sample type must either INT or FLOAT type')

    summaries = [_summarise(sample) for sample in data_dict_samples]
    return np.array(summaries)[:, np.newaxis]
def fit(self, trainExamples):
    """Learn the most frequent value of every non-image field.

    Each example is a mapping.  Every key except ``"Image"`` collects its
    non-empty values, converted to float and rounded to the nearest 0.5.
    ``self.expectedValues`` then maps each key to the mode of its collected
    values, or to NaN when the key never had a usable value.

    Returns:
        ``self``.
    """
    import numpy as np

    collected = {}
    for x in trainExamples:
        for (key, value) in x.items():
            if key == "Image":
                continue
            values = collected.setdefault(key, [])
            if len(value) > 0:
                values.append(round(float(value) / 0.5, 0) * 0.5)

    self.expectedValues = {}
    for key, values in collected.items():
        if not values:
            # A key without usable values previously crashed the mode lookup.
            self.expectedValues[key] = float("nan")
            continue
        # ravel() handles both the array-valued (SciPy < 1.11) and the
        # scalar-valued (newer SciPy) mode result.
        self.expectedValues[key] = np.ravel(mode(values)[0])[0]
    return self
def get_create_feature(row):
    """Summarise the ``day`` values of one user's rows as a feature Series.

    Args:
        row: frame with the columns ``user_id``, ``day`` and ``max_day``
            (presumably the rows of a single user -- TODO confirm with the
            caller).

    Returns:
        A Series with ``user_id`` (taken from the first row),
        ``create_sub_register`` (max of ``max_day`` minus max of ``day``) and
        ``create_mode`` (most frequent day).  When there are at least two
        rows it also holds the skewness, kurtosis and last value of the
        day-to-day differences.
    """
    # An explicit dtype avoids the pandas warning about dtype-less empty Series.
    feature = pd.Series(dtype=object)
    feature['user_id'] = list(row['user_id'])[0]
    days = np.asarray(row['day'])
    diff_day = np.diff(days)
    if len(diff_day) != 0:
        feature['create_day_diff_ske'] = stats.skew(diff_day)
        feature['create_day_diff_kur'] = stats.kurtosis(diff_day)
        feature['create_day_last'] = diff_day[-1]
    feature['create_sub_register'] = np.subtract(np.max(row['max_day']),
                                                 np.max(days))
    # ravel() handles both the array-valued (SciPy < 1.11) and the
    # scalar-valued (newer SciPy) mode result.
    feature['create_mode'] = np.ravel(stats.mode(days)[0])[0]
    return feature
def get_sgd_oof_prediction(SEED, x_train, y_train, x_test):
    """Out-of-fold SGD predictions for the training set, voted ones for test.

    For every split produced by the module-level ``skf`` the features are
    standardised with a scaler fitted on the fold's training part, an
    ``SGDClassifier`` seeded with ``SEED`` is fitted, and it predicts the
    held-out training rows and the whole test set.  The fold's F1 score is
    printed.  The test prediction is the mode over the folds.

    Relies on the module-level ``ntrain``, ``ntest``, ``NFOLDS`` and ``skf``.

    Returns:
        ``(oof_train, oof_test)``, both reshaped to column vectors.
    """
    train_preds = np.zeros((ntrain,))
    fold_test_preds = np.empty((NFOLDS, ntest))

    for fold, (fit_idx, holdout_idx) in enumerate(skf.split(x_train, y_train)):
        classifier = SGDClassifier(max_iter=100, random_state=SEED,
                                   loss="squared_hinge", alpha=0.009,
                                   penalty='l1')
        scaler = StandardScaler()
        fit_features = scaler.fit_transform(x_train[fit_idx])
        holdout_features = scaler.transform(x_train[holdout_idx])
        test_features = scaler.transform(x_test)

        classifier.fit(fit_features, y_train[fit_idx])
        train_preds[holdout_idx] = classifier.predict(holdout_features)
        fold_test_preds[fold, :] = classifier.predict(test_features)

        fold_score = f1_score(y_train[holdout_idx], train_preds[holdout_idx])
        print("Test score {} ".format(fold_score))

    voted_test = stats.mode(fold_test_preds, axis=0)[0]
    return train_preds.reshape(-1, 1), voted_test.reshape(-1, 1)
def get_log_oof_prediction(SEED, x_train, y_train, x_test):
    """Out-of-fold logistic-regression predictions, voted ones for test.

    For every split produced by the module-level ``skf`` the features are
    standardised with a scaler fitted on the fold's training part, an
    L1-penalised ``LogisticRegression`` seeded with ``SEED`` is fitted, and it
    predicts the held-out training rows and the whole test set.  The fold's
    F1 score is printed.  The test prediction is the mode over the folds.

    Relies on the module-level ``ntrain``, ``ntest``, ``NFOLDS`` and ``skf``.

    Returns:
        ``(oof_train, oof_test)``, both reshaped to column vectors.
    """
    oof_train = np.zeros((ntrain,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    for i, (train_ind, test_ind) in enumerate(skf.split(x_train, y_train)):
        # The L1 penalty needs a solver that supports it.  'liblinear' was
        # the default when this was written; the current default ('lbfgs')
        # rejects penalty='l1', so the solver is now named explicitly.
        model = LogisticRegression(random_state=SEED, C=0.8252042855888113,
                                   penalty='l1', solver='liblinear', verbose=2)
        y_tr = y_train[train_ind]
        scaler = StandardScaler()
        x_tr = scaler.fit_transform(x_train[train_ind])
        x_ts = scaler.transform(x_train[test_ind])
        x_test_s = scaler.transform(x_test)
        model.fit(x_tr, y_tr)
        oof_train[test_ind] = model.predict(x_ts)
        oof_test_skf[i, :] = model.predict(x_test_s)
        print("Test score {} ".format(
            f1_score(y_train[test_ind], oof_train[test_ind])))
    oof_test = stats.mode(oof_test_skf, axis=0)[0]
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
def process(img, head, gray_old):
    """Resize a frame, derive its blurred grayscale and pick the output image.

    Behaviour is driven by module-level settings: ``shot_frame`` (save the raw
    frame to ``folder_depth``), ``resize_shape``, ``mix_image``, ``gray_mode``,
    ``gray_diff_th`` and ``change_th``.

    Returns:
        ``(image, gray)`` where ``gray`` is the blurred grayscale frame and
        ``image`` is

        * with ``mix_image`` and ``gray_mode``: the absolute difference to
          ``gray_old`` with every pixel not above the threshold set to 0;
        * with ``mix_image`` only: ``None`` when more than ``change_th``
          pixels changed, else ``gray``;
        * with ``gray_mode`` only: ``gray``;
        * otherwise the resized colour frame.
    """
    if shot_frame:
        cv2.imwrite(folder_depth + head + f"{loop:05d}.jpg", img)
    img = cv2.resize(img, resize_shape, interpolation=cv2.INTER_LINEAR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    gray = cv2.medianBlur(gray, 3)

    if mix_image:
        diff = cv2.absdiff(gray, gray_old)
        # The most common difference serves as the noise floor.  ravel()
        # handles both the array-valued (SciPy < 1.11) and scalar-valued mode
        # result; int() keeps the threshold sum out of the image's narrow
        # integer dtype.
        th = int(np.ravel(stats.mode(diff, axis=None)[0])[0])
        diff_valid = diff > th + gray_diff_th
        if gray_mode:
            diff[np.logical_not(diff_valid)] = 0
            return diff, gray
        if np.sum(diff_valid) > change_th:
            return None, gray
        return gray, gray
    elif gray_mode:
        return gray, gray
    return img, gray
def split_data_into_steps(data, N_TIME_STEPS, N_FEATURES, step, RANDOM_SEED):
    """Cut a sensor recording into windows and split them into train/test.

    Args:
        data: frame with the columns ``x``, ``y``, ``z`` and ``activity``.
        N_TIME_STEPS: number of rows per window.
        N_FEATURES: size of the last axis of the reshaped windows.
        step: offset between the starts of consecutive windows.
        RANDOM_SEED: seed forwarded to ``train_test_split``.

    Every window is labelled with its most frequent ``activity``; the labels
    are one-hot encoded.  The test share comes from ``get_validation_split()``.

    Returns:
        ``(x_train, x_test, y_train, y_test, activities)`` where
        ``activities`` is the set of labels that occurred.
    """
    # ``pd.np`` was removed in pandas 2.0, so NumPy is imported directly.
    import numpy as np

    segments = []
    labels = []
    activities = set()
    for i in range(0, len(data) - N_TIME_STEPS, step):
        xs = data['x'].values[i: i + N_TIME_STEPS]
        ys = data['y'].values[i: i + N_TIME_STEPS]
        zs = data['z'].values[i: i + N_TIME_STEPS]
        # Series.mode() also handles string labels, which scipy.stats.mode no
        # longer accepts; like SciPy it returns the smallest value on ties.
        label = data['activity'].iloc[i: i + N_TIME_STEPS].mode().iloc[0]
        segments.append([xs, ys, zs])
        labels.append(label)
        activities.add(label)

    # NOTE(review): ``segments`` is (windows, 3, N_TIME_STEPS); reshaping it
    # to (-1, N_TIME_STEPS, N_FEATURES) does not transpose the axes.  Left
    # as-is because changing it alters the data fed to existing models.
    reshaped_segments = np.asarray(segments, dtype=np.float32).reshape(
        -1, N_TIME_STEPS, N_FEATURES)
    labels = np.asarray(pd.get_dummies(labels), dtype=np.float32)

    validation_split = get_validation_split()
    x_train, x_test, y_train, y_test = train_test_split(
        reshaped_segments, labels, test_size=validation_split,
        random_state=RANDOM_SEED)
    return x_train, x_test, y_train, y_test, activities
def test_voting(model, ipca, tensor, labels, times, interval_len, cnt):
    """Accuracy of majority-voted predictions over ``cnt`` sampled batches.

    Each batch comes from ``prepate_data_for_voting`` inside one TensorFlow
    session.  Its last two axes are flattened, the rows are normalised with
    ``ipca.mean_my`` / ``ipca.var_my``, transformed by ``ipca`` and classified
    by ``model``.  The predictions are regrouped into rows of ``width`` values
    and each row's mode is compared with the argmax of the batch labels.

    Returns:
        Share of correctly voted rows over all batches.
    """
    correct = 0.
    seen = 0.
    with tf.Session() as sess:
        for _ in range(cnt):
            batch, targets = prepate_data_for_voting(
                sess, ipca, tensor, labels, times, interval_len)
            _, width, rows_per_item, feats_per_row = batch.shape

            flat = np.reshape(batch, (-1, rows_per_item * feats_per_row))
            flat = (flat - ipca.mean_my) / ipca.var_my
            reduced = ipca.transform(flat)

            votes = np.reshape(model.predict(reduced), (-1, width))
            voted = np.reshape(stats.mode(votes, axis=1)[0], (-1))

            truth = np.argmax(targets, axis=1)
            correct += np.sum(truth == voted, axis=0)
            seen += truth.shape[0]
    return correct / seen
def get_feature(data_dict_samples, num_type):
    """Build a column vector holding one summary value per sample list.

    Empty samples become ``np.inf``.  Non-empty samples are summarised by
    their mode (cast to ``int``) for ``INT_TYPE`` and by their mean for
    ``FLOAT_TYPE``; any other ``num_type`` raises ``NameError``.

    Returns:
        Array of shape ``(len(data_dict_samples), 1)``.
    """
    column = []
    for sample in data_dict_samples:
        # Set infinity if no sample is available.
        if len(sample) == 0:
            column.append(np.inf)
            continue
        if num_type == INT_TYPE:
            value = int(stats.mode(sample)[0])
        elif num_type == FLOAT_TYPE:
            value = np.mean(sample)
        else:
            raise NameError('Sample type must either INT or FLOAT type')
        column.append(value)
    return np.array(column)[:, np.newaxis]
def train(self, input_data, target_data):
    """Remember the most frequent target value and the target dtype.

    ``input_data`` is not used.  For 1-D ``target_data`` the stored
    ``self._most_frequent_value`` is a scalar; for 2-D targets it is a 1-D
    array with the mode of every column.  ``self._target_type`` is the dtype
    of ``target_data``.
    """
    import numpy as np

    # SciPy >= 1.11 drops the reduced axis, so the old tuple-unpacking of a
    # length-1 leading axis fails there.  Flattening gives the same values on
    # every SciPy version.
    modes = np.asarray(mode(target_data, axis=0)[0]).reshape(-1)
    if np.ndim(target_data) > 1:
        self._most_frequent_value = modes
    else:
        (self._most_frequent_value, ) = modes
    self._target_type = target_data.dtype
# A special column.
# This would be obsolete with:
# from matplotlib.dates import MONDAY, MonthLocator, WeekdayLocator, DateFormatter
time = getTime(DateTimeUT)


# Just an empty class, used as an attribute container.
class Dummy(object):
    pass


# Let's get some statistics of the seeing (median FWHM) values.
tdfst = Dummy()
tdfStat = []
tdfst.name, tdfst.median, tdfst.max, tdfst.min, tdfst.mean, tdfst.stdev = \
    "Seeing", median(medianFWHM), max(medianFWHM), min(medianFWHM), mean(medianFWHM), std(medianFWHM)
tdfst.mode = mode(medianFWHM)
tdfStat.append(tdfst)

if opts.verbose:
    # The print calls below produce the same output on Python 2 and also run
    # on Python 3, unlike the print statements they replace.
    print("")
    print(("%5s" + "%11s" * 6) % ("Name", "Median", "Max", "Min", "Mean", "Stdev", "Mode"))
    frmt = "%5s" + "%11.2f" * 6
    print(frmt % (tdfst.name, tdfst.median, tdfst.max, tdfst.min, tdfst.mean, tdfst.stdev, tdfst.mode[0]))
    print("")

# Calculates some 2D correlations (Spearman rank) against the seeing.
WDCorr = spearmanr(medianFWHM, WindDirectionDeg)
Humidity = spearmanr(medianFWHM, HumidityPercent)
Pressure = spearmanr(medianFWHM, PressureHPA)
def open_behavioural(path, subj, **kwargs):
    """Read selected columns of a behavioural spreadsheet into a record array.

    Args:
        path: directory of the workbook.
        subj: workbook file name inside ``path``.
        **kwargs: ``behavioural_data`` -- comma-separated column headers to
            load (matched case-insensitively against the first row of the
            first sheet); ``dropped_trials`` -- comma-separated trial numbers
            to remove.

    Returns:
        A structured array with one field per requested column plus
        ``'TrialNo.'`` (1-based row number).  A column whose most common cell
        type is text is lower-cased and stored as a string; any other column
        is converted to int, with ``'NULL'`` and empty cells becoming 0.
    """
    from collections import Counter

    import xlrd

    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3

    dropped_trials = []
    behavioural_data = []
    if 'dropped_trials' in kwargs:
        dropped_trials = np.int_(kwargs['dropped_trials'].split(','))
    if 'behavioural_data' in kwargs:
        behavioural_data = kwargs['behavioural_data'].split(',')

    fn = os.path.join(path, subj)
    book = xlrd.open_workbook(fn)  # Open workbook
    sh = book.sheet_by_index(0)  # Choose sheet

    labels = [text_type(l).lower() for l in sh.row_values(0)]
    # ``np.str`` was removed in NumPy 1.24; ``np.str_`` is the supported name.
    l_array = np.array(labels, dtype=np.str_)

    data_tot = []
    dtype = []
    for field in behavioural_data:
        index = np.nonzero(l_array == field.lower())[0][0]
        data = sh.col_values(int(index))[1:]
        print(field)
        # Most common cell class of the column.  Counter replaces
        # scipy.stats.mode, which no longer accepts non-numeric input.
        type_ = Counter(x.__class__ for x in data).most_common(1)[0][0]
        # The original second test compared the builtin ``type`` with ``str``
        # and was therefore always False.
        if type_ == text_type or type_ == str:
            data = [x.lower() for x in data]
            t = (field, np.str_, 45)
        else:
            data = [(int(x) if (x != 'NULL') and (x != '') else 0) for x in data]
            t = (field, np.int_, 1)
        dtype.append(t)
        data_tot.append(data)

    data_tot.append(range(1, len(sh.col_values(0)[1:]) + 1))
    dtype.append(('TrialNo.', np.int_, 1))

    # list() is needed on Python 3, where zip() returns an iterator.
    behavioural = np.array(list(zip(*data_tot)), dtype=dtype)

    if len(dropped_trials) > 0:
        mask = 0
        for trial in dropped_trials:
            mask = mask + np.int_(behavioural['TrialNo.'] == trial)
        behavioural = behavioural[~np.bool_(mask)]
    return behavioural
Southwest, 4.79, 2.71 Wales, 5.27, 3.53 Scotland, 6.08, 4.51 Northern Ireland, 4.02, 4.56''' data = data.splitlines() data = [i.split(', ') for i in data] column_names = data[0] data_rows = data[1::] df = pd.DataFrame(data_rows, columns=column_names) df['Alcohol'] = df['Alcohol'].astype(float) df['Tobacco'] = df['Tobacco'].astype(float) print "The mean for the Alcohol dataset is", df['Alcohol'].mean() print "The median for the Alcohol dataset is", df['Alcohol'].median() # NOT SURE HOW TO MODIFY THE SCRIPT TO ONLY RETURN THE FIRST ARRAY # ALSO NOT SURE HOW TO IMPROVE UPON THE VALUE RETURNED SO THAT IF THE FREQUENCY ARRAY RETURNED IS 1, "there is no mode" is the response returned. print "The mode for the Alcohol dataset is", stats.mode(df['Alcohol']) print "The range for the Alcohol dataset is", max(df['Alcohol']) - min(df['Alcohol']) print "The standard deviation for the Alcohol dataset is", df['Alcohol'].std() print "The variance for the Alcohol dataset is", df['Alcohol'].var() print "The mean for the Tobacco dataset is", df['Tobacco'].mean() print "The median for the Tobacco dataset is", df['Tobacco'].median() print "The mode for the Tobacco dataset is", stats.mode(df['Tobacco']) print "The range for the Tobacco dataset is", max(df['Tobacco']) - min(df['Tobacco']) print "The standard deviation for the Tobacco dataset is", df['Tobacco'].std() print "The variance for the Tobacco dataset is", df['Tobacco'].var()