def detect_ground(profile):
    """Automatic detection of ground (end of snowpack).

    If the force signal never reaches ``profile.overload``, the last
    recorded distance is returned. Otherwise a noise threshold of
    ``mean + 5 * std`` is computed from the force samples that precede the
    first sample lying within 20 distance units of the force maximum, and
    the ground is located by stepping back from the maximum, 10 samples at
    a time, until the force is no longer above that threshold.

    The backward scan is clamped at the first sample: if the force never
    drops to the threshold, the distance of the first sample is returned
    instead of letting the index go negative.

    :param snowmicropyn.Profile profile: The profile to detect ground in.
    :return: Distance where ground was detected.
    :rtype: float
    """
    force = profile.samples.force
    distance = profile.samples.distance

    # Default: no overload reached, so ground is the end of the measurement.
    ground = distance.iloc[-1]

    if force.max() >= profile.overload:
        i_ol = force.argmax()

        # First sample whose distance is within 20 units of the overload
        # position; everything before it is used as the baseline signal.
        i_threshhold = np.where(
            distance.values >= distance.values[i_ol] - 20)[0][0]
        f_mean = np.mean(force.iloc[0:i_threshhold])
        f_std = np.std(force.iloc[0:i_threshhold])
        threshhold = f_mean + 5 * f_std

        # Walk backwards in strides of 10 samples. The lower bound matters:
        # a negative position would make ``iloc`` wrap around to the end of
        # the series and finally raise IndexError.
        while i_ol > 0 and force.iloc[i_ol] > threshhold:
            i_ol = max(i_ol - 10, 0)

        ground = distance.iloc[i_ol]

    log.info('Detected ground at {:.3f} mm in profile {}'.format(
        ground, profile))
    return ground
def find_missing_seat(seats_data: List[Tuple[int, int]]) -> Tuple[int, int]:
    """
    Locate the free seat whose two adjacent seats are both occupied.

    :param seats_data: (row, column) coordinates of every occupied seat
    :return: (row, column) of the first free seat, in row-major order, whose
        neighbours reported by ``get_adjacent_seats`` both exist and are
        taken; ``None`` is returned implicitly when no such seat is found
    """
    # Occupancy grid: 1 marks a taken seat, 0 a free one.
    taken = np.array(seats_data)
    occupancy = np.zeros((PLANE_ROW_NUMBER, PLANE_COLUMN_NUMBER), dtype=int)
    occupancy[taken[:, 0], taken[:, 1]] = 1

    # Visit the free seats and stop at the first one with two taken neighbours.
    free_rows, free_cols = np.where(occupancy == 0)
    for candidate in zip(free_rows, free_cols):
        previous_seat, next_seat = get_adjacent_seats(*candidate)
        if previous_seat is None or next_seat is None:
            continue
        previous_taken = occupancy[previous_seat[0]][previous_seat[1]]
        if previous_taken and occupancy[next_seat[0]][next_seat[1]]:
            return candidate
def predict(self, X, Y=None):
    """Fetch data for ``X``, tune and fit a random forest, and report the result.

    The data returned by ``self._data_fetcher`` is turned into features, split
    80/20 (stratified, ``random_state=0``), and a ``RandomForestClassifier``
    is tuned with a 3-fold grid search. A fresh classifier is then fitted with
    the best parameters found.

    :param X: Value handed to ``self._data_fetcher`` to select the data.
    :param Y: Not read by this method.
    :return: dict with keys ``"balanced_accuracy"`` (score on the held-out
        split), ``"important_features"`` (importances and feature column
        index) and ``"prediction"`` (``"BUY"`` or ``"SELL"``).
    """
    print(f'Get data for {X}')
    data = self._data_fetcher(X, last=True)
    print(f'Data:\n {data}')

    # Build the feature matrix and the 'buysell' target.
    feature_frame = create_features(data)
    features, target = create_X_Y(feature_frame, 'buysell')
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=0, stratify=target)

    # Hyper-parameter search over tree depth and forest size.
    search_space = {
        'max_depth': [5, 10, 20, 40],
        'max_features': ['sqrt'],
        'min_samples_leaf': [2],
        'min_samples_split': [6],
        'n_estimators': [50, 100, 200]
    }
    grid_search = GridSearchCV(estimator=RandomForestClassifier(),
                               param_grid=search_space,
                               cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # The best parameter set has exactly the keys of the search space, so it
    # can be unpacked straight into a new classifier.
    clf = RandomForestClassifier(**grid_search.best_params_)
    clf.fit(X_train, y_train)

    held_out_pred = clf.predict(X_test)
    accuracy = balanced_accuracy_score(y_test, held_out_pred)
    importances = {"data": clf.feature_importances_, "index": features.columns}

    # NOTE(review): the reported call is the last element of the predictions
    # on the (shuffled) training split - confirm this is the intended row.
    train_pred = clf.predict(X_train)
    calls = np.where(train_pred == 1, "BUY", "SELL")

    return {
        "balanced_accuracy": accuracy,
        "important_features": importances,
        "prediction": calls.flatten()[-1],
    }
def f1(y_hat, y_true, THRESHOLD=0.5):
    '''
    Macro-averaged F1 score.

    y_hat holds raw scores (per the original author: not passed through a
    sigmoid); it is binarised with ``y_hat > THRESHOLD``. Counts are summed
    along axis 0, an F1 value is computed per remaining position, and the
    mean of those values is returned.
    '''
    eps = 1e-7  # keeps every denominator away from zero
    predicted = np.int8(y_hat > THRESHOLD)

    true_pos = np.sum(predicted * y_true, axis=0)
    false_pos = np.sum(predicted * (1 - y_true), axis=0)
    false_neg = np.sum((1 - predicted) * y_true, axis=0)

    precision = true_pos / (true_pos + false_pos + eps)
    recall = true_pos / (true_pos + false_neg + eps)

    scores = 2 * precision * recall / (precision + recall + eps)
    # Any NaN score is counted as 0.
    scores = np.where(np.isnan(scores), np.zeros_like(scores), scores)
    return np.mean(scores)
def dice(img1, img2, labels=None, nargout=1):
    '''
    Dice [1] volume overlap metric

    By default no measure is returned for the background layer (label = 0).

    [1] Dice, Lee R. "Measures of the amount of ecologic association between
    species." Ecology 26.3 (1945): 297-302.

    Parameters
    ----------
    img1 : nd array. The first volume (e.g. predicted volume)
    img2 : nd array. The second volume (e.g. "true" volume)
    labels : optional vector of labels on which to compute Dice.
        If not provided, Dice is computed on every non-zero label found in
        either volume
    nargout : optional control of output arguments. if 1, output Dice
        measure(s); otherwise output the tuple (Dice, labels)

    Output
    ------
    if nargout == 1 : dice : vector of dice measures, one per label
    otherwise : (dice, labels) : where labels is the vector of labels on
        which dice was computed
    '''
    if labels is None:
        # np.unique flattens its input, so this is a sorted 1-D label vector
        found = np.unique(np.concatenate((img1, img2)))
        labels = found[found != 0]  # remove background

    smallest = np.finfo(float).eps  # machine epsilon; guards the division

    def _overlap(lab):
        in_first = img1 == lab
        in_second = img2 == lab
        top = 2 * np.sum(np.logical_and(in_first, in_second))
        bottom = np.maximum(np.sum(in_first) + np.sum(in_second), smallest)
        return top / bottom

    dicem = np.array([_overlap(lab) for lab in labels], dtype=float)

    return dicem if nargout == 1 else (dicem, labels)
def init_groups(self, c, label_size):
    """Build groups of client indices and append them to ``self.groups``.

    Each group is filled by repeatedly choosing the class with the smallest
    entry in ``taken_count`` and pulling a not-yet-visited client that holds
    that class from the class's queue.

    :param c: Fraction of ``len(self.clients)``; the group size is
        ``ceil(c * len(self.clients))``.
    :param label_size: Number of classes tracked per client.
    """
    gp_size = math.ceil(c * len(self.clients))
    done = False
    size = len(self.clients)
    # wrk_cls[w][k] is True when client w has a non-zero frequency for class k.
    wrk_cls = [[False for i in range(label_size)] for j in range(size)]
    # NOTE(review): the number of queues is fixed at 10 rather than
    # label_size - confirm label_size never exceeds 10.
    cls_q = [Queue(maxsize=size) for _ in range(10)]
    for i, cls_list in enumerate(self.class_distributions):
        # Each entry of cls_list unpacks into two values; only the second
        # (freq) is used.
        wrk_cls[i] = [True if freq != 0 else False for _, freq in cls_list]
    # Walk the clients in reverse; ``size - worker - 1`` maps the reversed
    # position back to the original client index.
    for worker, class_list in enumerate(reversed(wrk_cls)):
        for cls, exist in enumerate(class_list):
            if exist:
                cls_q[cls].put(size - worker - 1)
    taken_count = [0 for _ in range(label_size)]
    print('generating balanced groups for training...')
    while not done:
        # ``visited`` is reset per group, so a client appears at most once
        # in a single group.
        visited = [False for _ in range(size)]
        g = []
        for _ in range(gp_size):
            # Index of the first class with the minimal count.
            cls = np.where(taken_count == np.amin(taken_count))[0][0]
            # NOTE(review): the upper bound is inclusive, so
            # cls == len(taken_count) would pass this check - confirm.
            assert 0 <= cls <= len(taken_count)
            done_q = False
            count = 0
            while not done_q:
                # NOTE(review): if Queue is queue.Queue, get() blocks while
                # the queue is empty (no client holds this class) - confirm.
                wrkr = cls_q[cls].get()
                if not visited[wrkr] and wrk_cls[wrkr][cls]:
                    g.append(wrkr)
                    # NOTE(review): taken_count is a Python list, so ``+=``
                    # extends it with the items of the right-hand side
                    # instead of adding counts element-wise - confirm intent.
                    taken_count += self.class_distributions[wrkr][1]
                    visited[wrkr] = True
                    done_q = True
                # Always return the client to the back of the queue.
                cls_q[cls].put(wrkr)
                count += 1
                # Give up on this slot after ``size`` attempts.
                if count == size:
                    done_q = True
        self.groups.append(g)
        # Stop once self.groups holds more than 10 groups.
        if len(self.groups) > 10:
            done = True
tfidf = TfidfVectorizer() tfidf.fit(result['Concultatory']) X = tfidf.transform(result['Concultatory']) result['Concultatory'][1] #print([X[1, tfidf.vocabulary_['διοίκησης']]]) #print([X[1, tfidf.vocabulary_['βαθμό']]]) #print([X[1, tfidf.vocabulary_['αποσπάσεως']]]) #Sentiment Classification #Θετικές 1,2 , Αρνητικές 3,4 result.dropna(inplace=True) #result[result['Score'] != 1] result['Positivity'] = np.where(result['Status'] == 1, 1, -1) cols = ['Status'] result.drop(cols, axis=1, inplace=True) result.head() result.groupby('Positivity').size() fig = plt.figure(figsize=(8,6)) result.groupby('Positivity').Concultatory.count().plot.bar(ylim=0) plt.show() X = result.Concultatory y = result.Positivity X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 0) print("Το σύνολο Εκπαίδευσης έχει συνολικά {0} Γνωμοδοτήσεις με {1:.2f}% σε εκκρεμότητα, {2:.2f}% αποδεκτές".format(len(X_train),
tfidf = TfidfVectorizer() tfidf.fit(result['Subject']) X = tfidf.transform(result['Subject']) result['Subject'][1] #print([X[1, tfidf.vocabulary_['δημόσιας']]]) #print([X[1, tfidf.vocabulary_['κατάταξη']]]) #print([X[1, tfidf.vocabulary_['βαθμό']]]) #Sentiment Classification result.dropna(inplace=True) result[result['Score'] != 3] result['Positivity'] = np.where(result['Score'] >= 2, 1, 0) cols = ['Score'] result.drop(cols, axis=1, inplace=True) result.head() result.groupby('Positivity').size() X = result.Subject y = result.Positivity X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) print( "Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive" .format(len(X_train), (len(X_train[y_train == 0]) / (len(X_train) * 1.)) * 100, (len(X_train[y_train == 1]) / (len(X_train) * 1.)) * 100)) print(
def threshold(data):
    """Snap the values of ``data`` to the levels 0, 1, 2 and 3, in place.

    Values below 0.5 become 0; values in [0.5, 1.5), [1.5, 2.5) and
    [2.5, 3.5) become 1, 2 and 3 respectively. Values of 3.5 and above are
    left untouched. The same array object is returned.
    """
    data[np.nonzero(data < 0.5)] = 0
    for level in (1, 2, 3):
        lower, upper = level - 0.5, level + 0.5
        in_bin = (lower <= data) & (data < upper)
        data[np.nonzero(in_bin)] = level
    return data
def add_ranking_flags(self, df: pd.DataFrame) -> pd.DataFrame:
    """Add 0/1 flag columns marking membership of the index in ranking lists.

    Three columns are written onto ``df`` in place: ``is_oscar_winner``,
    ``is_top250`` and ``is_worst100``. Each is 1 where the row's index value
    is contained in the corresponding module-level collection, else 0.

    :param df: Frame whose index is matched against the collections.
    :return: The same frame, with the three flag columns added.
    """
    self.logger.info("Adding potential target flags")
    flag_sources = (
        ('is_oscar_winner', oscar_winners),
        ('is_top250', top_250_engl),
        ('is_worst100', worst_100),
    )
    for column, members in flag_sources:
        df[column] = np.where(df.index.isin(members), 1, 0)
    return df
# Fit the TF-IDF transformer on the training features and time the step.
# The sparse result is converted to a dense array.
t0 = cpu_time()
train_features_tfidf = tfidf.fit_transform(train_features).toarray()
# The raw training features are not used again in this section.
del train_features
print("Computing the TFIDF took {} sec of the CPU's time.".format(cpu_time() - t0))
# Apply the already-fitted transformer to the test data (no re-fitting).
print('Transforming the test data using the trained TFIDF...')
test = tfidf.transform(test).toarray()
print('Finished transforming the test data using the trained TFIDF.')

# encode labels as integers 0-8 (from "Class_1", "Class_2", etc)
classification_encoder = preprocessing.LabelEncoder()
print('Transforming labels from text (class names) into an integer ENUM...')
t0 = cpu_time()
train_targets_encoded = classification_encoder.fit_transform(train_targets)
print("Transforming labels took {} sec of the CPU's time.".format(cpu_time() - t0))
# Sanity check: for every i, sample_label_set[i] is 'Class_<i+1>' and the
# first training target encoded as i carries that same name.
assert(all(sample_label_set[i] == 'Class_{}'.format(i+1) == train_targets[np.where(train_targets_encoded == i)[0][0]] for i in range(len(sample_label_set))))

# train a random forest classifier
rfc = RandomForestClassifier(n_jobs=-1, n_estimators=300)
print('Training a random forest on the training set...')
t0 = cpu_time()
# `train_features_tfidf` = 60k x 93 matrix of term frequencies normalized (divided by) document frequencies
# `train_targets` = array(['Class_1', 'Class_1', 'Class_1', ..., 'Class_9', 'Class_9', 'Class_9'], dtype=object)
# Note: the forest is fitted on the original string labels, not on
# `train_targets_encoded`.
rfc.fit(train_features_tfidf, train_targets)
print("Random Forest took {} sec of the CPU's time.".format(cpu_time() - t0))

# predict on training set
print('Rerunning the predictor to predict the the labels for the {} training set records...'.format(len(train_features_tfidf)))
# Restart the timer for the step that follows this section.
t0 = cpu_time()