def process_one_cell(df_train, df_test, grid_id, th):
    """
    Classification inside one grid cell.
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int)
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values.astype(int)

    # Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=conf['neighbours'], weights='distance',
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
def process_one_cell(df_cell_train, df_cell_test): #Working on df_train place_counts = df_cell_train.place_id.value_counts() mask = (place_counts[df_cell_train.place_id.values] >= 5).values df_cell_train = df_cell_train.loc[mask] #Working on df_test row_ids = df_cell_test.index #Feature engineering on x and y df_cell_train.loc[:,'x'] *= 462.0 df_cell_train.loc[:,'y'] *= 975.0 df_cell_test.loc[:,'x'] *= 462.0 df_cell_test.loc[:,'y'] *= 975.0 #Preparing data le = LabelEncoder() y = le.fit_transform(df_cell_train.place_id.values) X = df_cell_train.drop(['place_id'], axis=1).values #Applying the classifier, ct = 5.3 #5.1282 clf = KNeighborsClassifier(n_neighbors=np.floor(np.sqrt(y.size)/5.2).astype(int), weights=calculate_distance,metric='manhattan',n_jobs=2) clf.fit(X, y) y_pred = clf.predict_proba(df_cell_test.values) ##1 #pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:n_topx]) return pred_labels, row_ids
class labelOnehotEnc():
    def __init__(self):
        self.le = LabelEncoder()
        self.oe = OneHotEncoder(sparse=False)

    def label_fit(self, x):
        feature = self.le.fit_transform(x)
        self.oe = OneHotEncoder(sparse=False)
        return self.oe.fit_transform(feature.reshape(-1, 1))

    def onehot_inverse(self, x):
        self.indices = []
        for t in range(len(x)):
            ind = np.argmax(x[t])
            self.indices.append(ind)
        return self.le.inverse_transform(self.indices)

    def inverse_label(self, x):
        return self.le.inverse_transform(x)
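# Hypothetical round-trip sketch for the labelOnehotEnc helper above: encode string
# labels to one-hot rows, then recover the original labels from the argmax positions.
# The example labels are made up; it assumes numpy and the sklearn preprocessing
# imports used by the class, and a scikit-learn version where OneHotEncoder(sparse=False)
# is still a valid argument (as in the class itself).
enc = labelOnehotEnc()
onehot = enc.label_fit(['cat', 'dog', 'cat', 'bird'])   # shape (4, 3), one column per class
print(onehot)
print(enc.onehot_inverse(onehot))                        # -> ['cat' 'dog' 'cat' 'bird']
print(enc.inverse_label([0, 1, 2]))                      # -> ['bird' 'cat' 'dog']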
def test_hard_vote():
    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")
    # list() so the zipped (train, test) pairs can be indexed (zip is lazy in Python 3)
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    #train_probs = probs[0]
    test_probs = probs[1]
    print(len(test_probs))
    preds = [x.idxmax(1) for x in test_probs]
    pred = np.zeros(len(preds[0]), dtype=np.int8)
    print(len(pred))
    for i in range(len(preds[0])):
        votes = [p[i] for p in preds]
        print(votes)
        pred[i] = max(set(votes), key=votes.count)
        print(pred[i])
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
def test_same_inverse_transform(self):
    Y, Y_rdd = self.make_dense_randint_rdd(low=0, high=10, shape=(1000,))
    local = LabelEncoder().fit(Y)
    dist = SparkLabelEncoder().fit(Y_rdd)
    assert_array_equal(local.inverse_transform(Y),
                       dist.inverse_transform(Y_rdd).toarray())
class Classifier(BaseEstimator): def __init__(self): self.label_encoder = LabelEncoder() self.scaler = StandardScaler() self.clf = None def fit(self, X, y): X = self.scaler.fit_transform(X.astype(np.float32)) y = self.label_encoder.fit_transform(y).astype(np.int32) dtrain = xgb.DMatrix( X, label=y.astype(np.float32)) param = {'objective':'multi:softprob', 'eval_metric':'mlogloss'} param['nthread'] = 4 param['num_class'] = 9 param['colsample_bytree'] = 0.55 param['subsample'] = 0.85 param['gamma'] = 0.95 param['min_child_weight'] = 3.0 param['eta'] = 0.05 param['max_depth'] = 12 num_round = 400 # to be faster ?? #num_round = 820 self.clf = xgb.train(param, dtrain, num_round) def predict(self, X): X = self.scaler.transform(X.astype(np.float32)) dtest = xgb.DMatrix(X) label_index_array = np.argmax(self.clf.predict(dtest), axis=1) return self.label_encoder.inverse_transform(label_index_array) def predict_proba(self, X): X = self.scaler.transform(X.astype(np.float32)) dtest = xgb.DMatrix(X) return self.clf.predict(dtest)
def process_one_cell(df_cell_train, df_cell_test): #Working on df_train place_counts = df_cell_train.place_id.value_counts() mask = (place_counts[df_cell_train.place_id.values] >= 8).values df_cell_train = df_cell_train.loc[mask] #Working on df_test row_ids = df_cell_test.index #Feature engineering on x and y df_cell_train.loc[:,'x'] *= 500.0 df_cell_train.loc[:,'y'] *= 1000.0 df_cell_test.loc[:,'x'] *= 500.0 df_cell_test.loc[:,'y'] *= 1000.0 #Preparing data le = LabelEncoder() y = le.fit_transform(df_cell_train.place_id.values) X = df_cell_train.drop(['place_id'], axis=1).values X_test = df_cell_test.values #Applying the classifier clf = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance, metric='manhattan') clf.fit(X, y) y_pred = clf.predict_proba(X_test) pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) return pred_labels, row_ids
def test_vote_soft():
    X, y, test_X, test_Y = get_test_data()
    print("bag of words")
    bow = BagOfWordsClassifier()
    bow_probs = bow.get_proba(X, y, test_X, prefix="t")
    print("direct attribute")
    da = DirectAttributeClassifier()
    da_probs = da.get_proba(X, y, test_X, prefix="t")
    # list() so the zipped (train, test) pairs can be indexed (zip is lazy in Python 3)
    probs = list(zip(*[item for p in [bow_probs, da_probs] for item in p]))
    train_probs = probs[0]
    test_probs = probs[1]
    print(len(train_probs))
    for prob in train_probs:
        print(prob.shape)
        print(type(prob))
    from functools import reduce  # reduce is not a builtin in Python 3
    #train_attr = reduce(lambda a, b: a + b, train_probs)
    test_attr = reduce(lambda a, b: a + b, test_probs)
    pred = test_attr.idxmax(1)
    from sklearn.preprocessing import LabelEncoder
    le = LabelEncoder()
    le.fit(y)
    pred = le.inverse_transform(pred)
    print(metrics.accuracy_score(test_Y, pred))
def process_1_grid(df_train, df_test, grid, threshold): # Creating data with the particular grid id. df_train_1_grid = df_train.loc[df_train.grid_num == grid] df_test_1_grid = df_test.loc[df_test.grid_num == grid] place_counts = df_train_1_grid.place_id.value_counts() mask = (place_counts[df_train_1_grid.place_id.values] >= threshold).values df_train_1_grid = df_train_1_grid.loc[mask] # Label Encoding le = LabelEncoder() labels = le.fit_transform(df_train_1_grid.place_id.values) # Computing train and test feature data for grid grid. X = df_train_1_grid.drop(['place_id','grid_num'], axis=1).values.astype(int) X_test = df_test_1_grid.drop(['grid_num'], axis=1).values.astype(int) row_id = df_test_1_grid.index # KNN Classifier clf = KNeighborsClassifier(n_neighbors=20, weights= 'distance', metric='manhattan') #clf = GaussianNB() # Training of the classifier #clf = XGBClassifier(max_depth=10, learning_rate=0.5, n_estimators=25,objective='multi:softprob', subsample=0.5, colsample_bytree=0.5, seed=0) clf.fit(X,labels) # Predicting probabilities for each of the label for test data. prob_y = clf.predict_proba(X_test) # Transforming back to labels from One hot Encoding pred_labels = le.inverse_transform(np.argsort(prob_y, axis=1)[:,::-1][:,:3]) return pred_labels, row_id
def one_partition_NDCG(x ,labels ,model ,i ,factor): le = LabelEncoder() y = le.fit_transform(labels) piv_train = x.shape[0] trans_x = [] trans_y = [] test_x = [] test_y = [] if i == 0: trans_x = x[(i+1)*factor:] trans_y = y[(i+1)*factor:] test_x = x[:(i+1)*factor] test_y = y[:(i+1)*factor] elif i+1 == piv_train/factor: trans_x = x[:i*factor] trans_y = y[:i*factor] test_x = x[i*factor:] test_y = y[i*factor:] else: trans_x = np.concatenate((x[:i*factor],x[(i+1)*factor:])) trans_y = np.concatenate((y[:i*factor],y[(i+1)*factor:])) test_x = x[i*factor:(i+1)*factor] test_y = y[i*factor:(i+1)*factor] model.fit(trans_x,trans_y) y_pred = model.predict_proba(test_x) ids = [] cts = [] for j in range(factor): cts += [le.inverse_transform(np.argsort(y_pred[j])[::-1])[:5].tolist()] preds = pd.DataFrame(cts) truth = pd.Series(labels[i*factor:(i+1)*factor]) #truth = pd.Series(le.inverse_transform(test_y).tolist()) return mean_NDCG(preds, truth)
def process_one_cell(df_train, df_test, grid_id, th): """ Classification inside one grid cell. """ #Working on df_train df_cell_train = df_train.loc[df_train.grid_cell == grid_id] place_counts = df_cell_train.place_id.value_counts() mask = (place_counts[df_cell_train.place_id.values] >= th).values df_cell_train = df_cell_train.loc[mask] #Working on df_test df_cell_test = df_test.loc[df_test.grid_cell == grid_id] row_ids = df_cell_test.index #Preparing data le = LabelEncoder() y = le.fit_transform(df_cell_train.place_id.values) X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values.astype(int) X_test = df_cell_test.drop(['grid_cell'], axis = 1).values.astype(int) #Applying the classifier X_ = xgb.DMatrix(X, label=y) X_t = xgb.DMatrix(X_test) boost = xgb.train({'eta': 0.1, 'objective': 'multi:softprob', 'num_class': len(le.classes_), 'alpha': 0.1, 'lambda': 0.1, 'booster': 'gbtree'}, X_, num_boost_round = 75) y_pred = boost.predict(X_t) pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3]) return pred_labels, row_ids
def process_one_cell_df(train_cell, test_cell, g): """ Return: ------ pred_labels: numpy ndarray Array with the prediction of the top 3 labels for each sample row_ids: IDs of the samples in the submission dataframe """ train = np.frombuffer(shared_train).reshape(train_x, train_y) test = np.frombuffer(shared_test).reshape(test_x, test_y) if (train_cell[0] >= train_cell[1]) | (test_cell[0] >= test_cell[1]): return None, None row_ids = test[test_cell[0]:test_cell[1], 0].astype(int) le = LabelEncoder() y = le.fit_transform(train[train_cell[0]:train_cell[1], 0]) X = train[train_cell[0]:train_cell[1], 1:] clf = create_classifier(g.clf, y.size) clf.fit(X, y) X_test = test[test_cell[0]:test_cell[1], 1:] y_prob = clf.predict_proba(X_test) pred_y = np.argsort(y_prob, axis=1)[:,::-1][:,:g.top] pred_labels = le.inverse_transform(pred_y).astype(np.int64) labs = pd.DataFrame(pred_labels, index=row_ids) labs.index.name = "row_id" probs = pd.DataFrame(y_prob[np.arange(len(y_prob)).reshape(-1,1), pred_y], index=row_ids) probs.index.name = "row_id" return labs, probs
class Classifier(BaseEstimator): def __init__(self): self.label_encoder = LabelEncoder() self.scaler = StandardScaler() self.clf = None self.param = {'eval_metric':'mlogloss'} self.param['num_class'] = 9 self.param['subsample'] = 0.795 self.param['gamma'] = 0.9 self.num_round = 170 self.obj = 'multi:softprob' def fit(self, X, y): X = self.scaler.fit_transform(X.astype(np.float32)) y = self.label_encoder.fit_transform(y).astype(np.int32) dtrain = xgb.DMatrix( X, label=y.astype(np.float32)) self.param['objective'] = self.obj self.clf = xgb.train(self.param, dtrain, self.num_round) def predict(self, X): X = self.scaler.transform(X.astype(np.float32)) dtest = xgb.DMatrix(X) label_index_array = np.argmax(self.clf.predict(dtest), axis=1) return self.label_encoder.inverse_transform(label_index_array) def predict_proba(self, X): X = self.scaler.transform(X.astype(np.float32)) dtest = xgb.DMatrix(X) return self.clf.predict(dtest)
def process_cell(self, df_cell_train, df_cell_test, window):
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    row_ids = df_cell_test.index

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', ], axis=1).values.astype(int)
    X_test = df_cell_test.values.astype(int)

    # Applying the classifier
    clf1 = KNeighborsClassifier(n_neighbors=50, weights='distance', metric='manhattan')
    clf2 = RandomForestClassifier(n_estimators=50, n_jobs=-1)
    eclf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)], voting='soft')
    eclf.fit(X, y)
    y_pred = eclf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
class LogisticRegression:
    """
    Logistic regression.

    Minimize regularized log-loss:
        L(x, y|w) = - sum_i log p(y_i|x_i, w) + l2 ||w||^2
        p(y|x, w) = exp(w[y].x) / (sum_y' exp(w[y'].x))

    Parameters
    ----------
    l2: float, default=0
        L2 regularization strength
    """

    def __init__(self, l2=0):
        self.l2 = l2
        self.loss = LogisticLoss()

    def fit(self, X, y):
        self.label_encoder_ = LabelEncoder()
        y = self.label_encoder_.fit_transform(y).astype(numpy.int32)
        self.n_classes = len(numpy.unique(y))
        self.coef_ = numpy.zeros((X.shape[1] + 1) * (self.n_classes - 1),
                                 dtype=numpy.float64)
        dataset = IntegerDataset(X, y)
        self.loss.fit(dataset, self.coef_, self.l2)
        return self

    def predict(self, X):
        # integer division: coef_ packs (n_features + 1) weights per non-reference class
        n_features = self.coef_.size // (self.n_classes - 1) - 1
        assert X.shape[1] == n_features
        return self.label_encoder_.inverse_transform(
            self.loss.predict(n_features, self.n_classes, self.coef_, X))

    def predict_proba(self, X):
        n_features = self.coef_.size // (self.n_classes - 1) - 1
        assert X.shape[1] == n_features
        return self.loss.predict_proba(n_features, self.n_classes, self.coef_, X)
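# Illustrative only: the class-probability formula from the docstring above,
#     p(y|x, w) = exp(w[y].x) / sum_y' exp(w[y'].x),
# written in plain numpy for a single sample. The weight layout here (one full
# weight vector per class) is an assumption made for the sketch and does not
# mirror the packed coef_ layout used by LogisticLoss.
import numpy as np

def softmax_proba(W, x):
    # W: (n_classes, n_features) weight matrix, x: (n_features,) sample
    scores = W.dot(x)
    scores -= scores.max()            # subtract max for numerical stability
    exp_scores = np.exp(scores)
    return exp_scores / exp_scores.sum()

W = np.array([[1.0, -0.5], [0.2, 0.3], [-1.0, 0.8]])
x = np.array([0.6, 1.4])
print(softmax_proba(W, x))            # three probabilities summing to 1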
class PipelineNet(NeuralNet): # By default Lasagne is super finicky with inputs and outputs. So I just handle most of the pre and postprocessing for you. def fit(self,X, y,**params): self.label_encoder = LabelEncoder() self.one_hot = OneHotEncoder() y = list(map(lambda x:[x],self.label_encoder.fit_transform(y))) y = np.array(self.one_hot.fit_transform(y).toarray(),dtype=np.float32) X = np.array(X,dtype=np.float32) self.output_num_units=len(y[0]) self.input_shape=(None,X.shape[1]) self.output_nonlinearity=lasagne.nonlinearities.softmax return NeuralNet.fit(self,X,y,**params) def predict(self, X): X = np.array(X,dtype=np.float32) preds = NeuralNet.predict(self,X) preds = np.argmax(preds,axis=1) preds = self.label_encoder.inverse_transform(preds) return preds def score(self, X, y): return sklearn.metrics.accuracy_score(self.predict(X),y)
def process_grid_cell(train, test, grid_id, threshold, model, grid_variable):
    """
    Creates model and generates predictions for row_ids in a particular grid cell.
    """
    start = time.time()

    # Filter data onto single grid cell
    train_cell = train[train[grid_variable] == grid_id]
    test_cell = test[test[grid_variable] == grid_id]
    test_ids = test_cell.index

    # Remove place ids from train data with frequency below threshold
    place_counts = train_cell.place_id.value_counts()
    mask = place_counts[train_cell.place_id.values] >= threshold
    train_cell = train_cell.loc[mask.values]

    # Encode place id as labels
    le = LabelEncoder()
    y_train = le.fit_transform(train_cell.place_id.values)
    X_train = train_cell.drop(['place_id', grid_variable], axis=1).values
    # Build X_test once (the original computed it twice, discarding the astype(int) version)
    X_test = test_cell.drop([grid_variable], axis=1).values

    # Build training classifier and predict
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])

    end = time.time()
    time_elapsed = (end - start)

    # Return data
    return pred_labels, test_ids, time_elapsed
def fit(self, df_X, df_y):
    if not df_y.shape[0] == df_X.shape[0]:
        raise ValueError("number of regions is not equal")
    if df_y.shape[1] != 1:
        raise ValueError("y needs to have 1 label column")

    le = LabelEncoder()
    y = le.fit_transform(df_y.iloc[:, 0].values)

    clf = RandomForestClassifier(n_estimators=100)
    if len(le.classes_) > 2:
        # Multiclass
        orc = OneVsRestClassifier(clf)
        orc.fit(df_X.values, y)
        importances = np.array([c.feature_importances_ for c in orc.estimators_]).T
    else:
        # Only two classes
        clf.fit(df_X.values, y)
        importances = np.array([
            clf.feature_importances_,
            clf.feature_importances_
        ]).T

    for i, c in enumerate(le.classes_):
        # y holds encoded labels, so compare against the encoded index i
        # (the original compared against the raw class label c, which only
        # works when the labels already are 0..n_classes-1)
        diff = df_X.loc[y == i].quantile(q=0.75) - df_X.loc[y != i].quantile(q=0.75)
        sign = (diff >= 0) * 2 - 1
        importances[:, i] *= sign

    # create output DataFrame
    self.act_ = pd.DataFrame(importances,
                             columns=le.inverse_transform(range(len(le.classes_))),
                             index=df_X.columns)
def test_label_encoder():
    """Test LabelEncoder's transform and inverse_transform methods"""
    le = LabelEncoder()
    le.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
    assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]),
                       [1, 2, 3, 3, 4, 0, 0])
    assert_array_equal(le.inverse_transform([1, 2, 3, 3, 4, 0, 0]),
                       [0, 1, 4, 4, 5, -1, -1])
    assert_raises(ValueError, le.transform, [0, 6])
class EnsembleClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):

    def __init__(self, clfs, voting='hard', weights=None):
        self.clfs = clfs
        self.named_clfs = {key: value for key, value in _name_estimators(clfs)}
        self.voting = voting
        self.weights = weights

    def fit(self, X, y):
        self.le_ = LabelEncoder()
        self.le_.fit(y)
        self.classes_ = self.le_.classes_
        self.clfs_ = []
        for clf in self.clfs:
            fitted_clf = clone(clf).fit(X, self.le_.transform(y))
            self.clfs_.append(fitted_clf)
        return self

    def predict(self, X):
        if self.voting == 'soft':
            maj = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'hard' voting
            predictions = self._predict(X)
            maj = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1, arr=predictions)
        maj = self.le_.inverse_transform(maj)
        return maj

    def predict_proba(self, X):
        avg = np.average(self._predict_probas(X), axis=0, weights=self.weights)
        return avg

    def transform(self, X):
        if self.voting == 'soft':
            return self._predict_probas(X)
        else:
            return self._predict(X)

    def get_params(self, deep=True):
        """Return estimator parameter names for GridSearch support"""
        if not deep:
            return super(EnsembleClassifier, self).get_params(deep=False)
        else:
            out = self.named_clfs.copy()
            for name, step in six.iteritems(self.named_clfs):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out

    def _predict(self, X):
        """Collect results from clf.predict calls."""
        return np.asarray([clf.predict(X) for clf in self.clfs_]).T

    def _predict_probas(self, X):
        """Collect results from clf.predict_proba calls."""
        return np.asarray([clf.predict_proba(X) for clf in self.clfs_])
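# Hedged usage sketch for the EnsembleClassifier above on a toy sklearn dataset.
# It assumes the class (and its _name_estimators / clone / six dependencies) is
# importable; the member estimators and weights below are illustrative choices,
# not values taken from the source.
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True)
eclf = EnsembleClassifier(
    clfs=[DecisionTreeClassifier(max_depth=3),
          KNeighborsClassifier(n_neighbors=5),
          GaussianNB()],
    voting='soft',
    weights=[2, 1, 1])
eclf.fit(X, y)
print(eclf.predict(X[:5]))         # class labels, mapped back via le_.inverse_transform
print(eclf.predict_proba(X[:5]))   # weighted average of the members' probabilities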
def test_label_encoder_string_labels():
    """Test LabelEncoder's transform and inverse_transform methods with
    non-numeric labels"""
    le = LabelEncoder()
    le.fit(["paris", "paris", "tokyo", "amsterdam"])
    assert_array_equal(le.classes_, ["amsterdam", "paris", "tokyo"])
    assert_array_equal(le.transform(["tokyo", "tokyo", "paris"]), [2, 2, 1])
    assert_array_equal(le.inverse_transform([2, 2, 1]), ["tokyo", "tokyo", "paris"])
    assert_raises(ValueError, le.transform, ["london"])
def process_one_cell(df_train, df_test, grid_id, th):
    """
    Does all the processing inside a single grid cell:
    Computes the training and test sets inside the cell.
    Fits a classifier to the training data and predicts on the test data.
    Selects the top 3 predictions.

    Parameters:
    ----------
    df_train: pandas DataFrame
              Training set
    df_test: pandas DataFrame
             Test set
    grid_id: int
             The id of the grid to be analyzed
    th: int
        Threshold for place_id. Only samples with place_id with at least th
        occurrences are kept in the training set.

    Return:
    ------
    pred_labels: numpy ndarray
                 Array with the prediction of the top 3 labels for each sample
    row_ids: IDs of the samples in the submission dataframe
    """
    # Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = place_counts[df_cell_train.place_id.values] >= th
    df_cell_train = df_cell_train.loc[mask.values]

    # Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index

    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1).values
    X_test = df_cell_test.drop(['grid_cell'], axis=1).values

    if X_test.shape[0] > 0:
        # Training Classifier
        if X.shape[0] == 0:
            print("empty training set - grid_id:" + str(grid_id))
        ##clf = SGDClassifier(loss='modified_huber', n_iter=1, random_state=0, n_jobs=-1)
        #clf = RandomForestClassifier(n_estimators=200)
        #clf = KNeighborsClassifier(n_neighbors=25, weights='distance',
        #                           metric='manhattan')
        clf = tree.DecisionTreeClassifier()
        clf.fit(X, y)
        y_pred = clf.predict_proba(X_test)
        pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
        return pred_labels, row_ids
    else:
        print("X_test.shape == 0 ... ")
        return [], row_ids
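# Minimal, self-contained sketch of the top-3 selection idiom used above:
# np.argsort along axis=1, reversed, truncated to the first 3 columns, then
# mapped back to place ids with LabelEncoder.inverse_transform. The probabilities
# and place ids are made up for illustration; raveling before inverse_transform
# keeps the sketch compatible with sklearn versions that expect a 1-D input.
import numpy as np
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit([101, 202, 303, 404])                       # four hypothetical place_ids
y_pred = np.array([[0.05, 0.5, 0.3, 0.15],         # predict_proba output, one row per sample
                   [0.7, 0.1, 0.15, 0.05]])
top3_idx = np.argsort(y_pred, axis=1)[:, ::-1][:, :3]
print(le.inverse_transform(top3_idx.ravel()).reshape(top3_idx.shape))
# [[202 303 404]
#  [101 303 202]]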
def process_one_cell(df_train, df_test, x_min, x_max, y_min, y_max):
    x_border_augment = 0.025
    y_border_augment = 0.0125

    # Working on df_train
    df_cell_train = df_train[(df_train['x'] >= x_min - x_border_augment) &
                             (df_train['x'] < x_max + x_border_augment) &
                             (df_train['y'] >= y_min - y_border_augment) &
                             (df_train['y'] < y_max + y_border_augment)]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    # Working on df_test
    # to be deleted: df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    df_cell_test = df_test[(df_test['x'] >= x_min) & (df_test['x'] < x_max) &
                           (df_test['y'] >= y_min) & (df_test['y'] < y_max)]
    row_ids = df_cell_test.index

    if len(df_cell_train) == 0 or len(df_cell_test) == 0:
        return None, None

    # Feature engineering on x and y
    df_cell_train.loc[:, 'x'] *= fw[0]
    df_cell_train.loc[:, 'y'] *= fw[1]
    df_cell_test.loc[:, 'x'] *= fw[0]
    df_cell_test.loc[:, 'y'] *= fw[1]

    # Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id'], axis=1).values.astype(float)

    if 'place_id' in df_cell_test.columns:
        cols = df_cell_test.columns
        cols = cols.drop('place_id')
        X_test = df_cell_test[cols].values.astype(float)
    else:
        X_test = df_cell_test.values.astype(float)

    # Applying the classifier
    # clf = KNeighborsClassifier(n_neighbors=26, weights='distance',
    #                            metric='manhattan')
    clf1 = BaggingClassifier(KNeighborsClassifier(n_neighbors=26, weights='distance',
                                                  metric='manhattan'),
                             n_jobs=-1, n_estimators=50)
    clf2 = RandomForestClassifier(n_estimators=100, n_jobs=-1)
    # voting must be 'soft' for the predict_proba call below; scikit-learn's
    # VotingClassifier raises when predict_proba is used with voting='hard'
    eclf = VotingClassifier(estimators=[('knn', clf1), ('rf', clf2)], voting='soft')
    eclf.fit(X, y)
    y_pred = eclf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:, ::-1][:, :3])
    return pred_labels, row_ids
class XGBClassifier(XGBModel, XGBClassifier): __doc__ = """ Implementation of the scikit-learn API for XGBoost classification """ + "\n".join(XGBModel.__doc__.split('\n')[2:]) def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic", nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, base_score=0.5, seed=0, missing=None): super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective, nthread, gamma, min_child_weight, max_delta_step, subsample, colsample_bytree, base_score, seed, missing) def fit(self, X, y, sample_weight=None): self.classes_ = list(np.unique(y)) self.n_classes_ = len(self.classes_) if self.n_classes_ > 2: # Switch to using a multiclass objective in the underlying XGB instance self.objective = "multi:softprob" xgb_options = self.get_xgb_params() xgb_options['num_class'] = self.n_classes_ else: xgb_options = self.get_xgb_params() self._le = LabelEncoder().fit(y) training_labels = self._le.transform(y) if sample_weight is not None: trainDmatrix = DMatrix(X, label=training_labels, weight=sample_weight, missing=self.missing) else: trainDmatrix = DMatrix(X, label=training_labels, missing=self.missing) self._Booster = train(xgb_options, trainDmatrix, self.n_estimators) return self def predict(self, X): testDmatrix = DMatrix(X, missing=self.missing) class_probs = self.booster().predict(testDmatrix) if len(class_probs.shape) > 1: column_indexes = np.argmax(class_probs, axis=1) else: column_indexes = np.repeat(0, X.shape[0]) column_indexes[class_probs > 0.5] = 1 return self._le.inverse_transform(column_indexes) def predict_proba(self, X): testDmatrix = DMatrix(X, missing=self.missing) class_probs = self.booster().predict(testDmatrix) if self.objective == "multi:softprob": return class_probs else: classone_probs = class_probs classzero_probs = 1.0 - classone_probs return np.vstack((classzero_probs, classone_probs)).transpose()
def fit_predict_proba_2clf(X, y, test): #return test; le = LabelEncoder() y = le.fit_transform(y) clf1 = KNeighborsClassifier(n_neighbors=20, weights=lambda x: x ** -2, metric='manhattan',n_jobs=-1) #clf1 = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1, # min_samples_split=4, random_state=0, criterion='entropy') clf2 = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1, min_samples_split=4, random_state=0, criterion='gini') preds_level1 = pd.DataFrame() row_ids = test.index.values clf1.fit(X, y) y_pred_1 = clf1.predict_proba(test) y_pred_1 = dict(zip(le.inverse_transform(clf1.classes_), zip(*y_pred_1))) y_pred_1 = pd.DataFrame.from_dict(y_pred_1) y_pred_1['row_id'] = row_ids y_pred_1 = y_pred_1.set_index('row_id') y_pred_1.index.name = 'row_id'; clf2.fit(X, y) y_pred_2 = clf2.predict_proba(test) y_pred_2 = dict(zip(le.inverse_transform(clf2.classes_), zip(*y_pred_2))) y_pred_2 = pd.DataFrame.from_dict(y_pred_2) y_pred_2['row_id'] = row_ids y_pred_2 = y_pred_2.set_index('row_id') y_pred_2.index.name = 'row_id'; all_columns = y_pred_1.columns y_pred_1.rename(columns = lambda x: str(x)+'_1', inplace=True) y_pred_2.rename(columns = lambda x: str(x)+'_2', inplace=True) preds_level1 = pd.concat([y_pred_1, y_pred_2], axis=1) #print preds_level1.shape return preds_level1
class BaseClassifier(BaseEstimator): def predict_proba(self, X): if len(self.classes_) != 2: raise NotImplementedError("predict_(log_)proba only supported" " for binary classification") if self.loss == "log": df = self.decision_function(X).ravel() prob = 1.0 / (1.0 + np.exp(-df)) elif self.loss == "modified_huber": df = self.decision_function(X).ravel() prob = np.minimum(1, np.maximum(-1, df)) prob += 1 prob /= 2 else: raise NotImplementedError("predict_(log_)proba only supported when" " loss='log' or loss='modified_huber' " "(%s given)" % self.loss) out = np.zeros((X.shape[0], 2), dtype=np.float64) out[:, 1] = prob out[:, 0] = 1 - prob return out def _set_label_transformers(self, y, reencode=False, neg_label=-1): if reencode: self.label_encoder_ = LabelEncoder() y = self.label_encoder_.fit_transform(y).astype(np.int32) else: y = y.astype(np.int32) self.label_binarizer_ = LabelBinarizer(neg_label=neg_label, pos_label=1) self.label_binarizer_.fit(y) self.classes_ = self.label_binarizer_.classes_.astype(np.int32) n_classes = len(self.label_binarizer_.classes_) n_vectors = 1 if n_classes <= 2 else n_classes return y, n_classes, n_vectors def decision_function(self, X): pred = safe_sparse_dot(X, self.coef_.T) if hasattr(self, "intercept_"): pred += self.intercept_ return pred def predict(self, X): pred = self.decision_function(X) out = self.label_binarizer_.inverse_transform(pred) if hasattr(self, "label_encoder_"): out = self.label_encoder_.inverse_transform(out) return out
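# Tiny numeric sketch of the two probability mappings used in BaseClassifier.predict_proba
# above: the logistic link for loss='log' and the clipped linear link for
# loss='modified_huber'. The decision-function values are illustrative only.
import numpy as np

df = np.array([-3.0, -0.5, 0.0, 0.5, 3.0])          # decision_function outputs

prob_log = 1.0 / (1.0 + np.exp(-df))                 # loss='log'
prob_huber = (np.clip(df, -1.0, 1.0) + 1.0) / 2.0    # loss='modified_huber'

print(prob_log)    # approx [0.047 0.378 0.5 0.622 0.953]
print(prob_huber)  # [0.   0.25 0.5  0.75 1.  ]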
class Dataset: def __init__(self, frame_size=40, hop_size=3): self.frame_size = frame_size self.hop_size = hop_size path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt") self.text = open(path).read().lower() print('corpus length:', len(self.text)) chars = sorted(list(set(self.text))) self.class_count = len(chars) print('total chars:', self.class_count) self.le = LabelEncoder().fit(chars) self.text_ohe = self.text_to_ohe(self.text) def split_to_frames(values, frame_size, hop_size): """ Split to overlapping frames. """ return np.stack(values[i:i + frame_size] for i in range(0, len(values) - frame_size + 1, hop_size)) def split_features_targets(frames): """ Split each frame to features (all but last element) and targets (last element). """ frame_size = frames.shape[1] X = frames[:, :frame_size - 1] y = frames[:, -1] return X, y # cut the text in semi-redundant sequences of frame_size characters self.X, self.y = split_features_targets(split_to_frames( self.text_ohe, frame_size + 1, hop_size)) print('X.shape:', self.X.shape, 'y.shape:', self.y.shape) def ohe_to_text(self, text_ohe): return self.le_to_text(text_ohe.argmax(axis=1)) def text_to_ohe(self, text): return self.le_to_ohe(self.text_to_le(list(text))) def le_to_text(self, text_le): return ''.join(self.le.inverse_transform(text_le)) def text_to_le(self, text): return self.le.transform(text) def le_to_ohe(self, text_le): return to_categorical(text_le, nb_classes=self.class_count)
class EnsembleClassifier(BaseEstimator, ClassifierMixin, TransformerMixin): def __init__(self,clfs,voting='hard',weights=None): self.clfs = clfs if voting in ('hard','soft'): self.voting = voting if weights != None and len(clfs) == len(weights): self.weights = weights else: self.weights = None self.le = LabelEncoder() def fit(self,X,y): for clf in self.clfs: clf.fit(X,y) self.le.fit(y) return self def predict(self,X): if 'soft' == self.voting: average = self.predict_proba(X) majority = self.le.inverse_transform(np.argmax(average,axis=1)) else: self.classes = self.predict_classes(X) self.classes = np.asarray([self.classes[:,c] for c in range(self.classes.shape[1])]) if self.weights: self.classes = np.concatenate([np.tile(self.classes[:,c,None],w) for w,c in zip(self.weights,range(self.classes.shape[1]))],axis=1) majority = np.apply_along_axis(lambda x:np.argmax(np.bincount(x)),axis=1,arr=self.classes) return majority def transform(self, X): if self.weights: return self.predict_proba(X) else: return self.predict_classes(X) def predict_proba(self,X): self.probability = np.asarray([clf.predict_proba(X) for clf in self.clfs]) return np.average(self.probability,axis=0,weights=self.weights) def predict_classes(self,X): return np.asarray([clf.predict(X) for clf in self.clfs])
def process_one_cell(df_cell_train, df_cell_test): #Working on df_train place_counts = df_cell_train.place_id.value_counts() mask = (place_counts[df_cell_train.place_id.values] >= 8).values df_cell_train = df_cell_train.loc[mask] #Working on df_test row_ids = df_cell_test.index #Feature engineering on x and y df_cell_train.loc[:,'x'] *= 500.0 df_cell_train.loc[:,'y'] *= 1000.0 df_cell_test.loc[:,'x'] *= 500.0 df_cell_test.loc[:,'y'] *= 1000.0 #print "Preparing data" le = LabelEncoder() y = le.fit_transform(df_cell_train.place_id.values) X = df_cell_train.drop(['place_id'], axis=1).values X_test = df_cell_test.values #Applying the classifier clf_knn = KNeighborsClassifier(n_neighbors=36, weights=calculate_distance, metric='manhattan') clf_knn.fit(X, y) y_pred_knn = clf_knn.predict_proba(X_test) params = {'n_estimators': 100, 'subsample': 0.95, 'learning_rate': 0.15, 'colsample_bytree': 0.7, 'min_child_weight': 6.0} clf_xgb = xgb.XGBClassifier(**params) clf_xgb.fit(X, y) y_pred_xgb = clf_xgb.predict_proba(X_test) paras_rf = {'min_samples_split': 7.0, 'max_depth': 12.0, 'random_state':1234} clf_rf = RandomForestClassifier(**paras_rf) clf_rf.fit(X, y) y_pred_rf = clf_rf.predict_proba(X_test) #6 4 1 #3 1 0 ytotal = 1 * y_pred_xgb + 0.2 * y_pred_knn + 0.07 * y_pred_rf #ytotal = y_pred_knn pred_labels = le.inverse_transform(np.argsort(ytotal, axis=1)[:,::-1][:,:5]) return pred_labels, row_ids
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin): def __init__(self, classifiers, vote='classlabel', weights=None): self.classifiers = classifiers self.named_classifiers = {key: value for key, value in _name_estimators(classifiers)} self.vote = vote self.weights = weights def fit(self, x, y): self.lablenc_ = LabelEncoder() self.lablenc_.fit(y) self.classes_ = self.lablenc_.classes_ self.classifiers_ = [] for clf in self.classifiers: fitted_clf = clone(clf).fit(x, self.lablenc_.transform(y)) self.classifiers_.append(fitted_clf) return self def predict(self, x): if self.vote == 'probability': maj_vote = np.argmax(self.predict_proba(x), axis=1) else: predictions = np.asarray([clf.predict(x) for clf in self.classifiers_]).T maj_vote = np.apply_along_axis(lambda x: np.argmax(np.bincount(x, weights=self.weights)), axis=1, arr=predictions) maj_vote = self.lablenc_.inverse_transform(maj_vote) return maj_vote def predict_proba(self, x): probas = np.asarray([clf.predict_proba(x) for clf in self.classifiers_]) avg_proba = np.average(probas, axis=0, weights=self.weights) return avg_proba def get_params(self, deep=True): if not deep: return super(MajorityVoteClassifier, self).get_params(deep=False) else: out = self.named_classifiers.copy() for name, step in six.iteritems(self.named_classifiers): for key, value in six.iteritems(step.get_params(deep=True)): out['%s__%s' % (name, key)] = value return out
class AutoConverter(): def __init__(self, target, strategy='auto', coltype_converters={}, column_converters={}, use_column_converter_only=True, n_jobs=1): """Big wrapping class for convertors Args: target (str): target column name strategy (str): {'auto'} coltype_converters (dict): dict of customized Transformers column_converters (dict): dict of customized column transformers use_column_converter_only (bool): Use only column converter or not n_jobs (int): n_jobs parameter for FeatureUnion column_converters will be applied to columns on a priority basis. If use_column_converter == True (default value), pre-defined transformers in TransformerCatalog will NOT be applied. Therefore, giving an empty list to a column can be used to "ignore" the column for feature extraction. In the following example, only TfIdfVectorizer with default parameters will be applied to "Name" column and no transformer will be applied to "Age" column. column_converters={"Name": [(TfIdfVectorizer, {})], "Age": []} """ self.target = target self.strategy = strategy self.feature_names = [] self.X = None self.y = None self.hasdata = False self.target_le = LabelEncoder() self.subtables_ = None self.converter_catalog = None self.set_converter(coltype_converters) self.column_converters = column_converters self.use_column_converter_only = use_column_converter_only self.n_jobs = n_jobs def set_converter(self, coltype_converters): """Insert customized transformers into self.converter_catalog.""" # TODO(Yoshi): Technically, dict.update overwrite existing entry # We might want to "append" instead. To be discussed. self.converter_catalog = ( DefaultTransformerCatalog.transformer_dict.copy()) self.converter_catalog.update(coltype_converters) def fit(self, df, subtables=None, y=None, custom_types={}): """Fits the data to the custom_converters Args: df (pd.DataFrame): main dataframe table subtables (dictionary): dictionary of subtables with keys for linking them to main table. Default: None. subtables = {tabname(str) : { "table": (pd.Dataframe), "link_key": (str) main table column name, "group_key": (str) this table column name, "custom_aggregator": (dict) col_type:aggregator_class}} Example: {"school_table": {"table": school_df, "link_key": "school_id", "group_key": "id", "custom_aggregator": {"text": CustomTextAggregator()} } } custom_types (dictionary): dictionary of col_types that overrides col_type_dicts made by auto_converter orcibly Returns: self """ assert self.target in df # filtering None df.dropna(subset=[self.target], inplace=True) # filterung NaN df = df[df[self.target].notnull()] self.target_le.fit(df[self.target].as_matrix()) X_df = df.drop(self.target, axis=1) # 1. typing columns self.colname_type_dict = type_columns(X_df) if isinstance(custom_types, dict): self.colname_type_dict.update(custom_types) # 2. Pre-imputing missing values for textual column for colname in X_df.columns: if (self.colname_type_dict[colname] == 'text' or self.colname_type_dict[colname] == 'categorical' or self.colname_type_dict[colname] == 'text_ja'): X_df.loc[:, colname] = X_df[colname].fillna("NaN").astype(str) # 3. 
create feature union transformer_list = [] for colname in X_df.columns: if colname in self.column_converters: for transformer_cls, kwargs in self.column_converters[colname]: transformer_list.append( (u"{}.{}".format(colname, transformer_cls.__name__), transformer_cls(colname, **kwargs))) if self.use_column_converter_only: # Since transformer(s) are defined by users, # skip automatic assignment of transformers for this column continue assert colname in self.colname_type_dict coltype = self.colname_type_dict[colname] if coltype == 'ignore': continue if coltype == 'date': # we don't want to pass np array to date transformer, # instead we pass pandas df # TODO(Yoshi): This is hard-coded?? d = DateTransformer(colname=colname) transformer_list.append((u"{}.{}".format(colname, 'date'), d)) continue t_dict = self.converter_catalog[coltype] for transformer in t_dict: transformer_cls = transformer[0] kwargs = transformer[1] transformer_list.append( (u"{}.{}".format(colname, transformer_cls.__name__), transformer_cls(colname, **kwargs))) # 4. fit feature union if transformer_list: # if there's something to transform self.fu = HeterogeneousFeatureUnion(transformer_list, n_jobs=self.n_jobs) self.fu.fit(X_df) feature_names = list( map(lambda x: 'main..' + text_type(x), self.fu.get_feature_names())) else: # emppty main table (only target and ignore types) # we assume there exist information in subtables then if not subtables: raise ValueError("There's nothing to transform") self.fu = None feature_names = [] # defining Aggregator structure and fitting the tables in if subtables: self.subtables_ = subtables for key in sorted(list(subtables.keys())): subtable_dict = subtables[key] if subtable_dict['link_key'] not in X_df.columns: raise KeyError("Link key " + subtable_dict['link_key'] + " does not exist in the main table") aggr = AutoAggregator(group_key=subtable_dict['group_key'], custom_aggregators=subtables.get( "custom_aggregator", {})) self.subtables_[key]['aggr'] = aggr aggr.fit(subtable_dict['table']) self.colname_type_dict[key] = aggr.colname_type_dict.copy() # gathering feature names from subtables append_list = list( map(lambda x: text_type(key) + '..' + text_type(x), aggr.feature_names)) feature_names.extend(append_list) self.feature_names = feature_names return self def transform(self, df, subtables=None, prediction=False): """Transforms data to feature matrix Args: df (pandas.DataFrame): data to transform subtables (dictionary): dictionary of subtables with keys for linking them to main table. Default: None. subtables = {tabname(str) : { "table": (pd.Dataframe), "link_key": (str) main table column name, "group_key": (str) this table column name }} Example: {"school_table": {"table": shool_pd}, "link_key": "school_id", "group_key": "id" } } prediction (bool): Returns only X if True Returns: X (numpy.ndarray): feature matrix y (array-like of shape [n_samples]): target vector """ if not prediction: # filtering None df.dropna(subset=[self.target], inplace=True) # filterung NaN df = df[df[self.target].notnull()] # TODO(Yoshi): Should display Warning message when transform # is called with prediction=False if self.hasdata is True if self.hasdata: print("[WARNING] This instance already has been fitted.") assert self.target in df y_unique = df[self.target].unique() if len(y_unique) == 1 and np.isnan(y_unique[0]): # this just leaves y equal to a np.nan vector of the same size # TODO(Yoshi): This should raise exception. 
# Will revise here after specifying exceptions y = df[self.target] else: y = self.target_le.transform(df[self.target].as_matrix()) X_df = df.drop(self.target, axis=1) else: # Prediction if self.subtables_ is not None: assert subtables is not None if self.target in df: X_df = df.drop(self.target, axis=1) else: X_df = df # TODO(later): Pre-imputing. This part could be redundant for colname in X_df.columns: if self.colname_type_dict[colname] in [ 'categorical', 'text', 'text_ja' ]: X_df.loc[:, colname] = X_df[colname].fillna("NaN").astype(str) if self.fu: X = self.fu.transform(X_df) else: # Creating the empty matrix of the same size to use it later during # data aggregation, since we can't use feature union in absence of # features X = np.empty([X_df.shape[0], 0]) # Ad-hoc way to convert sparse matrix into numpy.array and replace NaN # values with 0.0 if type(X) == sp.sparse.csr.csr_matrix: X = X.toarray() X[np.isnan(X)] = 0.0 # transforming subtables and concating them with main table feature # matrix if subtables: # TODO(Kate): make sure that subtables passed and subtables stored # have the same structure. Any ideas? X_gather = pd.DataFrame(X) for key in sorted(list(subtables.keys())): subtable = subtables[key] aggr = subtable['aggr'] link_key = subtable['link_key'] X_sub = aggr.transform(subtable['table']) # combine X_gather with subtable['link_key'] if link_key in X_gather.columns.tolist(): raise KeyError('column already exists in a dataframe' + link_key) X_gather[link_key] = df[link_key] # X_sub is already a pd.DataFrame with group_key included # as index X_gather = X_gather.merge(X_sub, how='left', left_on=link_key, right_index=True) # make sure we don't leave anything(index) behind ;) del X_gather[link_key] # do something with get_feature_names X = X_gather.as_matrix() # TODO(Yoshi): Post pre-processing such as missing value imputation # TODO(Yoshi): Tentative naive replacement of NaN values X = np.nan_to_num(X) if not prediction: self.X = X self.y = y self.hasdata = True return [X, y] else: return X def fit_transform(self, df, subtables=None, y=None): """Fit + Transform Args: df (pandas.DataFrame): main df subtables (dict): dictionary of subtables Returns: X (numpy.ndarray): feature matrix y (array-like of shape [n_samples]): target vector """ return self.fit(df, subtables).transform(df, subtables) def index2label(self, predictions): """Transforms predictions from numerical format back to labels Args: predictions (np.array): array of label numbers Returns: labels (np.array): array of label values """ return self.target_le.inverse_transform(predictions) def get_feature_names(self, colname=None): """Returns feature names Args: colname (str or tuple): column name if colname is a tuple (subtable name, colname) if None returns all feature names (default: None) Returns: feature_names (list) """ if colname is None: if len(self.feature_names) == 0: # TODO(Yoshi): Do we want to use a "trained" flag instead? 
print("[WARNING]:", "AutoConverter instance has extracted no feature.", "Probably, it has not been fit to data yet.") return self.feature_names # Use tuple (or list) to handle subtable feature names if type(colname) in [tuple, list]: # TODO(Yoshi): replace with Exception assert len(colname) == 2 colname_ = "..".join(colname) else: # colname is in main table colname_ = "main..{}".format(colname) colname_idx_list = list( filter(lambda x: colname_ in x[1], enumerate(self.feature_names))) colname_list = list(map(lambda x: x[1], colname_idx_list)) return colname_list def save(self, filepath, overwrite=False): """Save AutoConverter object as pickle file Args: filepath (str): Output pickle filepath overwrite (bool): Overwrites a file with the same name if true Returns: success_flag (bool) """ if not overwrite and os.path.exists(filepath): # TODO(Yoshi): Warning handling print("File already exists. Skip.") return False with open(filepath, "wb") as fout: pickle.dump(self, fout) return True @classmethod def load(cls, filepath): """Load AutoConverter object from pickle file Args: filepath (str): Input pickle filepath Returns: AutoLearn object """ with open(filepath, "rb") as fin: obj = pickle.load(fin) assert obj.__class__.__name__ == 'AutoConverter' return obj
from sklearn.model_selection import train_test_split

my_features = my_combined['Cluster'].values.reshape(my_combined.shape[0], 1)
print(np.unique(my_features))

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

rgrps = [0, 1, 2]
le = LabelEncoder()
le.fit(rgrps)
ohe = OneHotEncoder(sparse=False)
le_data = le.transform(my_features).reshape(my_combined.shape[0], 1)
ohecluster = ohe.fit_transform(le_data)
print(ohecluster)

enc = [0, 0, 1]
print(le.inverse_transform(np.argmax(enc).reshape(1, 1)))

labels = my_combined['Purchased'].values.reshape(my_combined.shape[0], 1)
print(labels)

# Evaluate the model by splitting into train and test sets
# Notice the stratify keyword argument.
# Roughly 40% of our data are lost contracts and 60% are won contracts.
# We want our random testing and training data sets to have close to this same ratio.
# Otherwise, we might be training or testing based on a biased sample.
x_train, x_test, y_train, y_test = train_test_split(ohecluster, labels, test_size=0.4,
                                                    stratify=labels, random_state=23)
lr_model = model.fit(x_train, y_train)
def main(): file = "../../Resources/data/AudioFile/livefile.wav" sns.set() # Use seaborn's default style to make attractive graphs # Plot nice figures using Python's "standard" matplotlib library snd = parselmouth.Sound(file) plt.figure(figsize=(15, 5)) plt.plot(snd.xs(), snd.values.T) plt.xlim([snd.xmin, snd.xmax]) plt.xlabel("time [s]") plt.ylabel("amplitude") #plt.show() or plt.savefig("Resources/images/sound.png") plt.savefig("../../Resources/images/sound.png") def draw_spectrogram(spectrogram, dynamic_range=70): X, Y = spectrogram.x_grid(), spectrogram.y_grid() sg_db = 10 * np.log10(spectrogram.values) plt.pcolormesh(X, Y, sg_db, vmin=sg_db.max() - dynamic_range, cmap='afmhot') plt.ylim([spectrogram.ymin, spectrogram.ymax]) plt.xlabel("time [s]") plt.ylabel("frequency [Hz]") def draw_intensity(intensity): plt.plot(intensity.xs(), intensity.values.T, linewidth=3, color='w') plt.plot(intensity.xs(), intensity.values.T, linewidth=1) plt.grid(False) plt.ylim(0) plt.ylabel("intensity [dB]") intensity = snd.to_intensity() spectrogram = snd.to_spectrogram() plt.figure() draw_spectrogram(spectrogram) plt.twinx() draw_intensity(intensity) plt.xlim([snd.xmin, snd.xmax]) plt.savefig("../../Resources/images/spectrogram.png") def draw_pitch(pitch): # Extract selected pitch contour, and # replace unvoiced samples by NaN to not plot pitch_values = pitch.selected_array['frequency'] pitch_values[pitch_values == 0] = np.nan plt.plot(pitch.xs(), pitch_values, 'o', markersize=5, color='w') plt.plot(pitch.xs(), pitch_values, 'o', markersize=2) plt.grid(False) plt.ylim(0, pitch.ceiling) plt.ylabel("fundamental frequency [Hz]") pitch = snd.to_pitch() # If desired, pre-emphasize the sound fragment before calculating the spectrogram pre_emphasized_snd = snd.copy() pre_emphasized_snd.pre_emphasize() spectrogram = pre_emphasized_snd.to_spectrogram(window_length=0.03, maximum_frequency=8000) plt.figure() draw_spectrogram(spectrogram) plt.twinx() draw_pitch(pitch) plt.xlim([snd.xmin, snd.xmax]) plt.savefig("../../Resources/images/spectrogram_0.03.png") #livedf= pd.DataFrame(columns=['feature']) X, sample_rate = librosa.load(file, res_type='kaiser_fast', duration=2.5, sr=22050 * 2, offset=0.5) sample_rate = np.array(sample_rate) mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0) featurelive = mfccs livedf2 = featurelive livedf2 = pd.DataFrame(data=livedf2) livedf2 = livedf2.stack().to_frame().T livedf2 json_file = open('model.json', 'r') loaded_model_json = json_file.read() json_file.close() loaded_model = model_from_json(loaded_model_json) # load weights into new model loaded_model.load_weights("saved_models/Emotion_Voice_Detection_Model.h5") twodim = np.expand_dims(livedf2, axis=2) livepreds = loaded_model.predict(twodim, batch_size=32, verbose=1) livepreds1 = livepreds.argmax(axis=1) liveabc = livepreds1.astype(int).flatten() print(liveabc) lb = LabelEncoder() y_train = load('y_train.npy', allow_pickle=True) y_test = load('y_test.npy', allow_pickle=True) y_train = np_utils.to_categorical(lb.fit_transform(y_train)) y_test = np_utils.to_categorical(lb.fit_transform(y_test)) livepredictions = str(lb.inverse_transform((liveabc))[0]) gender_emotion = livepredictions.split('_') gender = gender_emotion[0].capitalize() emotion = gender_emotion[1].capitalize() return gender, emotion
class BoWS(BaseEstimator, TransformerMixin): def __init__(self, min_df=2, stop_words='english', alpha=0.1): self.min_df = min_df self.stop_words = stop_words self._cv = StemmedTfidfVectorizer(min_df=self.min_df, stop_words=self.stop_words) self._le = LabelEncoder() self.alpha = alpha self._fitted_ = False self.models_ = [] def __del__(self): del self.models_[:] def fit(self, X_texts, y=None): if y is None: raise TypeError("y can't be None") a = list(zip(X_texts, y)) shuffle(a) X_texts, scores = list(zip(*a)) X_texts = list(X_texts) y = list(scores) X_TF = self._cv.fit_transform(X_texts).tocsr() X = self._build_binary_cooccur_matrix_(X_TF) y = self._normalize_y_(y) self._build_auxiliar_features(X, y) self._build_class_models_() del self.Ntc_ del self.Nt_ del self.Pt_ self._fitted_ = True return self def transform(self, X_texts): if not self._fitted_: raise TypeError("The model did'nt fit yet!") X_TF = self._cv.transform(X_texts).tocsr() X = self._build_binary_cooccur_matrix_(X_TF) X_classes = {} for c in range(self.C_): X_classes[self._le.inverse_transform( [c])[0]] = transform_class_repr( X, self.models_[c].copy()).multiply(X_TF) return X_classes def _build_binary_cooccur_matrix_(self, X_TF): X = sp.csr_matrix((np.ones(len(X_TF.data)), X_TF.nonzero()), shape=X_TF.shape) del X_TF return X def _normalize_y_(self, y): return self._le.fit_transform(y) def _build_auxiliar_features(self, X, y): # número de documentos self.N_ = X.shape[0] # tamanho do vocabulário self.V_ = X.shape[1] # Número de classes self.C_ = max(y) + 1 # Número de cada co-ocrrência por classe self.Ntc_ = [ sp.lil_matrix((self.V_, self.V_)) for _ in range(max(y) + 1) ] for i, doc_matrix in tqdm(enumerate(generate_lines(X)), total=self.N_, desc='Building class representations'): self.Ntc_[y[i]] = (self.Ntc_[y[i]] + doc_matrix) ### Remove diagonal principal for i in range(len(self.Ntc_)): self.Ntc_[i].setdiag(0) self.Ntc_[i].eliminate_zeros() # frequencia de cada co-ocorrência por classe self.Nt_ = np.sum(self.Ntc_) # priori de cada termo P(t) self.Pt_ = self.Nt_ / self.N_ self.Pt_.eliminate_zeros() def _build_class_models_(self): self.models_ = [] for i in tqdm(range(self.C_), total=self.C_, desc='Building Models'): # Probabilidade P(t,c) data = np.array(self.Ntc_[i][self.Nt_.nonzero()] / self.Nt_[self.Nt_.nonzero()])[0] Ptc = sp.csr_matrix((data, self.Nt_.nonzero()), shape=self.Nt_.shape) # Jenilek-Mercer smoothing norm_Ptc = (1. - self.alpha) * Ptc + self.alpha * self.Pt_ # P*sqrt(n) data1 = np.multiply(norm_Ptc[norm_Ptc.nonzero()], np.sqrt(self.Ntc_[i][norm_Ptc.nonzero()])) # 2*sqrt( p(1-p) ) data2 = 2. * np.sqrt( np.multiply(norm_Ptc[norm_Ptc.nonzero()], 1. - norm_Ptc[norm_Ptc.nonzero()])) CI_dominance_smooth = sp.csr_matrix( (np.array(data1 / data2)[0], norm_Ptc.nonzero()), shape=norm_Ptc.shape) del data del data1 del data2 max_prob = (1. - self.alpha ) * Ptc.data.max() + self.alpha * self.Pt_.data.max() max_size_ic = (max_prob * np.sqrt(self.Ntc_[i].data.max())) / ( 2. * np.sqrt(max_prob * (1. - max_prob))) CI_dominance_smoooth_norm = CI_dominance_smooth / max_size_ic CI_dominance_smoooth_norm.eliminate_zeros() self.models_.append(CI_dominance_smoooth_norm) del CI_dominance_smooth del norm_Ptc del Ptc
for cv_train_index, cv_test_index in kf:
    xg_train = xgboost.DMatrix(
        train.values[cv_train_index, :],
        label=train_labels.iloc[cv_train_index].values.flatten())
    xg_test = xgboost.DMatrix(
        train.values[cv_test_index, :],
        label=train_labels.iloc[cv_test_index].values.flatten())

    xgclassifier = xgboost.train(
        params, xg_train,
        num_boost_round=params['num_round'],
        evals=[(xg_train, 'train'), (xg_test, 'test')],
        early_stopping_rounds=50)
    all_best_rounds.append(xgclassifier.best_iteration)

best_boost_round = int(np.mean(all_best_rounds))
print('The best n_rounds is %d' % best_boost_round)

# build final model
xg_train = xgboost.DMatrix(train, label=train_labels.values.flatten())
xg_test = xgboost.DMatrix(test)

final_round = int(best_boost_round * 1.2)
xgclassifier = xgboost.train(params, xg_train, final_round, evals=[(xg_train, 'train')])
xgclassifier.save_model(best_model_path)

# prediction
print('writing to file')
preds = xgclassifier.predict(xg_test).astype(int)
preds = label_encoder.inverse_transform(preds)

# DataFrame.from_csv was removed from pandas; read_csv with index_col=0 is the equivalent
submission_file = pd.read_csv(submission_format_path, index_col=0)
submission_file['status_group'] = preds
submission_file.to_csv(prediction_path)
def get_compiled_model():
    # (function header restored; the original fragment started inside this function,
    # whose name is taken from the get_compiled_model() call below)
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, input_shape=(len(features),), activation='relu',
                              name='fc1'),   # layer 1
        tf.keras.layers.Dense(8, activation='relu', name='fc2'),   # layer 2
        tf.keras.layers.Dense(num_classes, activation='softmax', name='output')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

model = get_compiled_model()
model.fit(X_train, y_train, batch_size=100, epochs=200)

results = model.evaluate(X_test, y_test)
print('Final test set loss: {:4f}'.format(results[0]))
print('Final test set accuracy: {:4f}'.format(results[1]))

pitches = le.inverse_transform(data["pitch_type"].unique())
data["pitch_type"].unique()

some_data = data.sample(n=1)
# Sequential.predict_classes was removed in recent TensorFlow; take the argmax instead
ynew = np.argmax(model.predict(some_data[features]), axis=1)
print(some_data)
print(le.inverse_transform(ynew))
class AirQuality: dataset = "" x = "" y = "" x_train = "" x_test = "" y_train = "" y_test = "" RandomForestModel = "" XgbModel = "" SvmModel = "" DecisionTreeModel = "" le_X_city = "" le_X_date = "" le_Y = "" def readCsv(self, file_name): #Importing the datset self.dataset = pd.read_csv(file_name) self.dataset.dropna(axis=0, subset=[ "Air_quality", "Xylene", "AQI", "Toluene", "Benzene", "O3", "SO2", "CO", "NH3", "NOx", "NO2", "PM10", "PM2.5", "NO" ], how='all', inplace=True) self.dataset.dropna(subset=["Air_quality"], inplace=True) # print("asasd") self.x = self.dataset.iloc[:, :-1].values self.y = self.dataset.iloc[:, 15].values #Filling the missing values imputer = SimpleImputer(missing_values=np.nan, strategy='median') imputer = imputer.fit(self.x[:, 2:15]) self.x[:, 2:15] = imputer.transform(self.x[:, 2:15]) #Encoding the attributes self.le_X_city = LabelEncoder() self.le_X_date = LabelEncoder() self.le_Y = LabelEncoder() self.y = self.le_Y.fit_transform(self.y) self.x[:, 0] = self.le_X_city.fit_transform(self.x[:, 0]) self.x[:, 1] = self.le_X_date.fit_transform(self.x[:, 1]) #Splitting the dataset self.x_train, self.x_test, self.y_train, self.y_test = train_test_split( self.x, self.y, test_size=0.3, random_state=0) ax = sns.countplot(self.y_train) #plt.bar(['Good','Moderate','Poor','Satisfactory','Severe','Very Poor'], height=, kwargs) #plt.hist(self.y_train, color='green') #plt.show() print('Classes and number of values in trainset', Counter(self.y_train)) from imblearn.over_sampling import SMOTE oversample = SMOTE() self.x_train, self.y_train = oversample.fit_resample( self.x_train, self.y_train) print('Classes and number of values in trainset after SMOTE:', Counter(self.y_train)) self.med = np.median(self.x_train, axis=0) sns.countplot(self.y) #plt.hist(self.y_train, color='green') #plt.show() def trainRF(self): self.RandomForestModel = RandomForestClassifier(n_estimators=100, random_state=0) self.RandomForestModel.fit(self.x_train, self.y_train) def trainXGB(self): self.XgbModel = XGBClassifier(random_state=0) self.XgbModel.fit(self.x_train, self.y_train) def trainSVM(self): self.SvmModel = SVC(kernel="rbf", random_state=0) self.SvmModel.fit(self.x_train, self.y_train) def trainDT(self): self.DecisionTreeModel = DecisionTreeClassifier(random_state=0) self.DecisionTreeModel.fit(self.x_train, self.y_train) def RandomForest(self): #RandomForstClassifier Model self.y_pred = self.RandomForestModel.predict(self.x_test) cm = confusion_matrix(self.y_test, self.y_pred) print(cm) a = accuracy_score(self.y_test, self.y_pred) precision = precision_score(self.y_test, self.y_pred, average='micro') recall = recall_score(self.y_test, self.y_pred, average='micro') f1 = f1_score(self.y_test, self.y_pred, average='micro') return cm, a * 100, precision * 100, recall * 100, f1 * 100 def XGB(self): #XGBCLassifier Model self.y_pred = self.XgbModel.predict(self.x_test) cm = confusion_matrix(self.y_test, self.y_pred) print(cm) a = accuracy_score(self.y_test, self.y_pred) precision = precision_score(self.y_test, self.y_pred, average='micro') recall = recall_score(self.y_test, self.y_pred, average='micro') f1 = f1_score(self.y_test, self.y_pred, average='micro') return cm, a * 100, precision * 100, recall * 100, f1 * 100 def SVC(self): #SVC Model self.y_pred = self.SvmModel.predict(self.x_test) cm = confusion_matrix(self.y_test, self.y_pred) print(cm) a = accuracy_score(self.y_test, self.y_pred) precision = precision_score(self.y_test, self.y_pred, average='weighted') recall = recall_score(self.y_test, 
self.y_pred, average='weighted') f1 = f1_score(self.y_test, self.y_pred, average='weighted') return cm, a * 100, precision * 100, recall * 100, f1 * 100 def DecisionTree(self): #DecisionTreeClassifier Model self.y_pred = self.DecisionTreeModel.predict(self.x_test) cm = confusion_matrix(self.y_test, self.y_pred) print(cm) a = accuracy_score(self.y_test, self.y_pred) precision = precision_score(self.y_test, self.y_pred, average='micro') recall = recall_score(self.y_test, self.y_pred, average='micro') f1 = f1_score(self.y_test, self.y_pred, average='micro') return cm, a * 100, precision * 100, recall * 100, f1 * 100 def predict(self, City, Date, PM25, PM10, NO, NO2, NOx, NH3, CO, SO2, O3, Benzene, Toluene, Xylene, AQI): res = [] city = self.le_X_city.fit_transform([City]) date = self.le_X_date.fit_transform([Date]) if (not PM25): PM25val = self.med[2] else: PM25val = float(PM25) if (not PM10): PM10val = self.med[3] else: PM10val = float(PM10) if (not NO): NOval = self.med[4] else: NOval = float(NO) if (not NO2): NO2val = self.med[5] else: NO2val = float(NO2) if (not NOx): NOxval = self.med[6] else: NOxval = float(NOx) if (not NH3): NH3val = self.med[7] else: NH3val = float(NH3) if (not CO): COval = self.med[8] else: COval = float(CO) if (not SO2): SO2val = self.med[9] else: SO2val = float(SO2) if (not O3): O3val = self.med[10] else: O3val = float(O3) if (not Benzene): Benzeneval = self.med[11] else: Benzeneval = float(Benzene) if (not Toluene): Tolueneval = self.med[12] else: Tolueneval = float(Toluene) if (not Xylene): Xyleneval = self.med[13] else: Xyleneval = float(Xylene) if (not AQI): AQIval = self.med[14] else: AQIval = float(AQI) ls = [ city[0], date[0], PM25val, PM10val, NOval, NO2val, NOxval, NH3val, COval, SO2val, O3val, Benzeneval, Tolueneval, Xyleneval, AQIval ] lst = [] lst.append(ls) temp = self.le_Y.inverse_transform(self.RandomForestModel.predict(lst)) temp = temp.tolist() res.append(temp[0]) temp = self.le_Y.inverse_transform(self.SvmModel.predict(lst)) temp = temp.tolist() res.append(temp[0]) temp = self.le_Y.inverse_transform(self.DecisionTreeModel.predict(lst)) temp = temp.tolist() res.append(temp[0]) # print(lst) ll = np.array(lst).reshape(1, -1) # print(ll) temp = self.le_Y.inverse_transform(self.XgbModel.predict(ll)) temp = temp.tolist() res.append(temp[0]) return res
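In predict() above, the intent is to reuse the encoders fitted in readCsv(); calling fit_transform on a single value collapses the mapping to 0, so transform() is the call that preserves it. A minimal sketch of that reuse, where the city value and the aq instance are hypothetical:

# aq is a hypothetical fitted AirQuality instance
city = "Delhi"  # assumed to be one of the cities present in the training data
if city in aq.le_X_city.classes_:
    city_code = aq.le_X_city.transform([city])[0]
else:
    city_code = -1  # explicit handling for a city the encoder has never seen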
xg_train = xgboost.DMatrix(train, label=train_labels.values.flatten()) xg_test = xgboost.DMatrix(test) watchlist = [(xg_train, 'train')] xgclassifier = xgboost.train(params, xg_train, num_round, watchlist) predicted_results = xgclassifier.predict(xg_test) mc_pred.append(predicted_results) meta_solvers_test.append( (np.mean(np.array(mc_pred), axis=0) + 0.5).astype(int)) """ Write opt solution """ print('writing to file') mc_train_pred = label_encoder.inverse_transform( mc_train_pred.astype(int)) print(meta_solvers_test[-1]) meta_solvers_test[-1] = label_encoder.inverse_transform( meta_solvers_test[-1]) pd.DataFrame(mc_train_pred).to_csv('results/train_xgboost_d6.csv') submission_file['status_group'] = meta_solvers_test[-1] submission_file.to_csv("results/test_xgboost_d6.csv") # saving best score for printing if mc_acc_mean[-1] < best_score: print('new best log loss') best_score = mc_acc_mean[-1] best_params = params best_train_prediction = mc_train_pred if params['mc_test']: best_prediction = meta_solvers_test[-1]
pred=model.predict(X_test) # In[12]: '''evaluate predictions''' accuracy = accuracy_score(y_test, pred) print("Accuracy: %.2f%%" % (accuracy * 100.0)) # In[13]: '''select other data attributes after prediction''' recommendation=pd.DataFrame({'user':X_test['user'],'bookName':le2.inverse_transform(X_test['bookName']),'impression': pred }) # In[14]: '''convert prediction column to real responses''' recommendation['impression'].replace(1, 'dislike', inplace=True) recommendation['impression'].replace(2, 'like', inplace=True) recommendation['impression'].replace(3, 'view', inplace=True) recommendation['impression'].replace(4, 'interact', inplace=True) recommendation['impression'].replace(5, 'add to cart', inplace=True) recommendation['impression'].replace(6, 'checkout', inplace=True) # In[15]:
in_encoder = Normalizer(norm='l2') encodings = in_encoder.transform(encodings) # label encode targets out_encoder = LabelEncoder() out_encoder.fit(names) names = out_encoder.transform(names) # Create and train the SVC classifier clf = svm.SVC(gamma='scale', probability=True) #clf = svm.SVC(kernel='linear', probability=True) clf.fit(encodings, names) # Load the test image with unknown faces into a numpy array test_image = face_recognition.load_image_file('test/test.jpg') # Find all the faces in the test image using the default HOG-based model face_locations = face_recognition.face_locations(test_image) no = len(face_locations) print("Number of faces detected: ", no) # Predict all the faces in the test image using the trained classifier print("Found:") for i in range(no): test_image_enc = face_recognition.face_encodings(test_image)[i] test_image_enc = in_encoder.transform([test_image_enc]) name = clf.predict(test_image_enc) prob = clf.predict_proba(test_image_enc) print(prob) acc = prob[0, name[0]] name = out_encoder.inverse_transform(name) print(*name, acc)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder import numpy as np import pandas as pd items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서'] # Create a LabelEncoder object, then run label encoding with fit() and transform() encoder = LabelEncoder() encoder.fit(items) labels = encoder.transform(items) # vector values labels = labels.reshape(-1, 1) print('Encoded values: ', labels) # Encoding print('Encoding classes: ', encoder.classes_) # Decoding print('Decoded original values', encoder.inverse_transform([4, 5, 2, 0, 1, 1, 3, 3])) oh_encoder = OneHotEncoder() oh_encoder.fit(labels) oh_labels = oh_encoder.transform(labels) print('One-Hot encoding Data') print(oh_labels.toarray()) print('One-Hot encoding Shape') print(oh_labels.shape) df = pd.DataFrame({'item':items}) # One-Hot Encoder API -> get_dummies() pd.get_dummies(df)
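For comparison, scikit-learn 0.20+ can one-hot encode string categories directly, so the intermediate LabelEncoder step is optional; a minimal sketch with the same items list (sparse_output needs scikit-learn 1.2+, older versions use sparse=False):

from sklearn.preprocessing import OneHotEncoder
import numpy as np

items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# OneHotEncoder accepts the raw string categories; no integer encoding needed first
oh = OneHotEncoder(sparse_output=False)
onehot = oh.fit_transform(np.array(items).reshape(-1, 1))
print(oh.categories_)
print(onehot)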
CNN_Model.add(SeqSelfAttention(attention_width=8, attention_activation='sigmoid', name='Attention',)) CNN_Model.add(SpatialDropout1D(0.3)) CNN_Model.add(layers.Conv1D(512, 3, activation='relu')) CNN_Model.add(layers.GlobalMaxPooling1D()) CNN_Model.add(layers.Dense(3, activation='softmax')) CNN_Model.compile(loss='categorical_crossentropy',optimizer='adam', metrics=['accuracy']) CNN_Model.summary() #============================================================================== # Evaluate model and print results #============================================================================== CNN_History=CNN_Model.fit(x_train, y_train, epochs = 5, batch_size = 256,verbose=1, validation_data=(x_val,y_val), shuffle=True) plot_history(CNN_History) full_multiclass_report(CNN_Model, x_val, y_val, encoder.inverse_transform(np.arange(3)))
# mapping ordinal features size_mapping = {'XL': 3, 'L': 2, 'M': 1} df['size'] = df['size'].map(size_mapping) print(df) class_mapping = {label: idx for idx, label in enumerate(np.unique(df['classLabel']))} print(class_mapping) df['classLabel'] = df['classLabel'].map(class_mapping) print(df) inv_class_mapping = {v: k for k, v in class_mapping.items()} df['classLabel'] = df['classLabel'].map(inv_class_mapping) print(df) class_encoder = LabelEncoder() y = class_encoder.fit_transform(df['classLabel'].values) print(y) print(class_encoder.inverse_transform(y)) x = df [['color', 'size', 'price']].values class_encoder = LabelEncoder() x[:, 0] = class_encoder.fit_transform(x[:, 0]) print(x) # one-hot encoding one_encoder = OneHotEncoder(categorical_features=[0]) print(one_encoder.fit_transform(x).toarray()) # this one-hot is more readable print(pd.get_dummies(df[['price', 'color', 'size']]))
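The categorical_features argument used above was removed in scikit-learn 0.22; a minimal sketch of the same one-hot step with ColumnTransformer, assuming the x array built above (color already label-encoded in column 0):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# one-hot encode only column 0 (color) and pass size and price through unchanged
ct = ColumnTransformer([('color_ohe', OneHotEncoder(), [0])], remainder='passthrough')
print(ct.fit_transform(x))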
def main(): columns = [ 'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r', 'culture', 'emotion' ] df = pd.read_csv("../old_data/videos_relabelled.csv") #extracting total set for training and testing sets # training and testing on only NA and Persian culture df = df[(df['culture'] == 'North America') | (df['culture'] == 'Philippines')] # training and testing on only NA and Philippines culture #df = df[(df['culture'] == 'North America') | (df['culture'] == 'Philippines')] #df = df[df['culture'] == 'Persian'] #training and testing on only persian culture #df = df[df['culture'] == 'Philippines'] #training and testing on only philipines culture #df = df[df['culture'] == 'North America'] #training and testing on only NA culture #df['culture_code'] = df['culture'].astype('category').cat.codes ############# testing model by selecting specific videos to test so components of video are not in training set ############### validation_array = [] test_array = [] vf_score = [] tf_score = [] kfold = KFold(5, True, 1) videos = df['filename'].unique() print(videos) ## Roya: Add label encoder to convert labels to integer le = LabelEncoder() #this part is for extracting what to test on #this is testing set for NA culture #test_df = df[(df['filename'] == 'contempt_38') | (df['filename'] == 'contempt_39') | (df['filename'] == 'anger_26') | (df['filename'] == 'anger_27') | (df['filename'] == 'disgust_20') | (df['filename'] == 'disgust_21') ] #this is testing set for Persian culture # test_df = df[(df['filename'] == '40') | (df['filename'] == '42') | (df['filename'] == '77') | (df['filename'] == '36') | (df['filename'] == '38') | (df['filename'] == '41') ] #this is testing set for Filipino culture #test_df = df[(df['filename'] == 'contempt_25_p') | (df['filename'] == 'contempt_18_p') | (df['filename'] == 'anger_17_p') | (df['filename'] == 'anger_6_p') | (df['filename'] == 'disgust_7_p') | (df['filename'] == 'disgust_8_p') ] #this is for testing on all of Filipino culture #test_df = df[df['culture'] == 'Philippines'] #this is for testing on all of Persian culture ## Roya: add a test dataframe for displaying results test_df = df[df['culture'] == 'Philippines'] test_videos = test_df['filename'].unique() df = df[~df['filename'].isin(list(test_videos))] videos = np.array(list(set(videos) - set(test_videos))) splits = kfold.split(videos) test_df_copy = test_df.drop([ 'frame', 'face_id', 'culture', 'filename', 'emotion', 'confidence', 'success' ], axis=1) for (i, (train, test)) in enumerate(splits): print('%d-th split: train: %d, test: %d' % (i + 1, len(videos[train]), len(videos[test]))) train_df = df[df['filename'].isin(videos[train])] # test_df = df[df['filename'].isin(videos[test])] y = train_df['emotion'].values X = train_df.drop(columns=[ 'success', 'confidence', 'face_id', 'frame', 'emotion', 'culture', 'filename', 'talking', 'gender' ]).values ## Change labels to int using a label encoder Y = le.fit_transform(y) X_train, X_valid, y_train, y_valid = train_test_split(X, Y) #print(X_train) print('LABEL ENCODER CLASSES: ', le.classes_) clf, score, fscore = create_svm(X_train, X_valid, y_train, y_valid) validation_array.append(score) vf_score.append(fscore) #cv_scores = cross_validate(clf, X, y, cv = 10) #print(cv_scores) # print(test_df[['frame','filename','culture','emotion']].head()) int_test = test_df.drop(columns=[ 'success', 'confidence', 'face_id', 'frame', 'emotion', 'culture', 'filename', 'talking', 
'gender' ]).values # print(len(int_test)) ## Roya: change string labels to integer values int_predict = le.fit_transform(test_df['emotion'].values) # print(len(int_predict)) # predictions = clf.predict(int_test) predictions = clf.predict(int_test) #integers predicted ## Roya: change integer labels to string values test_df['predicted'] = le.inverse_transform(predictions) ## Roya: calculate confusion matrix cf_matrix = confusion_matrix(test_df['emotion'].values, test_df['predicted'].values) print('CONFUSION MATRIX:\n', cf_matrix) df_cm = pd.DataFrame(cf_matrix, index=le.inverse_transform([0, 1, 2]), columns=le.inverse_transform([0, 1, 2])) df_cm = df_cm.div(df_cm.sum(axis=1), axis=0) ## Plot Confusion matrix plt.figure(figsize=(9, 6)) sn.heatmap(df_cm, annot=True, fmt='.0%') plt.ylabel('True label') plt.xlabel('Predicted label') plt.show() # print("predictions: ", predictions[0:10]) # print("int_predict: ", int_predict[0:10]) print(accuracy_score(int_predict, predictions)) fscore = f1_score(le.fit_transform(int_predict), predictions, average='macro') test_df.drop(columns=['predicted'], inplace=True) print('\n') test_array.append(accuracy_score(int_predict, predictions)) tf_score.append(fscore) print("Average accuracy for all Folds on valid dataset: " + str(np.mean(validation_array))) print("Average accuracy for all Folds on test dataset: " + str(np.mean(test_array))) print("Average f-score for all Folds on valid dataset: " + str(np.mean(vf_score))) print("Average f-score for all Folds on test dataset: " + str(np.mean(tf_score)))
class BuildingAdapterInterface(Inferencer): def __init__(self, target_building, target_srcids, source_buildings, pgid=pgid, config={}, load_from_file=1 ): super(BuildingAdapterInterface, self).__init__( target_building=target_building, source_buildings=source_buildings, target_srcids=target_srcids, pgid=pgid, ) #gather the source/target data and name features, labels #TODO: handle multiple source buildings self.stop_predict_flag = False if 'source_time_ranges' in config: self.source_time_ranges = config['source_time_ranges'] assert len(self.source_time_ranges) == len(source_buildings) else: self.source_time_ranges = [(None, None)]\ * len(source_buildings) if 'target_time_range' in config: self.target_time_range = config['target_time_range'] else: self.target_time_range = (None, None) if 'threshold' in config: self.threshold = config['threshold'] else: self.threshold = 0.5 source_building = source_buildings[0] if not load_from_file: #data features source_ids, train_fd = get_data_features(source_building, self.source_time_ranges[0][0], self.source_time_ranges[0][1], pgid=self.pgid, ) target_ids, test_fd = get_data_features(target_building, self.target_time_range[0], self.target_time_range[1], pgid=self.pgid, ) #name features, labels source_res = get_namefeatures_labels(source_building, pgid=self.pgid) train_label = [source_res[srcid][1] for srcid in source_ids] self.target_res = get_namefeatures_labels(target_building, pgid=self.pgid) test_fn = np.asarray( [self.target_res[tgtid][0] for tgtid in target_ids] ) test_label = [self.target_res[tgtid][1] for tgtid in target_ids] #find the label intersection intersect = list( set(test_label) & set(train_label) ) print ('intersected tagsets:', intersect) #preserve the intersection, get ids for indexing data feature matrices if intersect: train_filtered = [[i,j] for i,j in enumerate(train_label) if j in intersect] train_id, train_label = [list(x) for x in zip(*train_filtered)] test_filtered = [[i,j,k] for i,(j,k) in enumerate(zip(test_label,target_ids)) if j in intersect] self.test_id, test_label, self.test_srcids = [list(x) for x in zip(*test_filtered)] else: raise ValueError('no common labels!') self.train_fd = train_fd[train_id, :] self.test_fd = test_fd[self.test_id, :] self.test_fn = test_fn[self.test_id, :] print ('%d training examples left'%len(self.train_fd)) print ('%d testing examples left'%len(self.test_fd)) self.le = LE() self.le.fit(intersect) self.train_label = self.le.transform(train_label) self.test_label = self.le.transform(test_label) res = [self.train_fd, self.test_fd, self.train_label, self.test_label, self.test_fn, self.test_srcids, self.target_res, self.le] with open('./%s-%s.pkl'%(source_building,target_building), 'wb') as wf: pk.dump(res, wf) else: print ('loading from prestored file') with open('./%s-%s.pkl'%(source_building,target_building), 'rb') as rf: res = pk.load(rf) self.train_fd, self.test_fd, self.train_label, self.test_label, self.test_fn, self.test_srcids, self.target_res, self.le = \ res[0], res[1], res[2], res[3], res[4], res[5], res[6], res[7] print ( '# of classes:', len(set(self.train_label)) ) print ( 'data features for %s with dim:'%source_building, self.train_fd.shape) print ( 'data features for %s with dim:'%target_building, self.test_fd.shape) self.learner = transfer_learning( self.train_fd, self.test_fd, self.train_label, self.test_label, self.test_fn, threshold = self.threshold ) self.run_auto() def predict(self, target_srcids, verbose=False): ''' return: tagset, srcid, and confidence of each labeled 
example ''' if self.stop_predict_flag: self.pred_g = self.new_graph(empty=True) self.prior_confidences = {} return self.pred_g preds, labeled_set, confidence = self.learner.predict() srcids = [self.test_srcids[i] for i in labeled_set] tagsets = list(self.le.inverse_transform(preds)) names = [self.target_res[i][-1] for i in srcids] if verbose: for i,j,k,l in zip(srcids, names, tagsets, confidence): print ('srcid %s with name %s got label %s with s %.4f'%(i,j,k,l)) self.stop_predict_flag = True self.pred_g = self.new_graph(empty=True) acc_with_high_conf = 0 cnt_with_high_conf = 0 for srcid, tagset, prob in zip(srcids, tagsets, confidence): self._add_pred_point_result(self.pred_g, srcid, tagset, prob) #return srcids, tagsets, confidence return self.pred_g def run_auto(self): self.learner.run_auto() def select_informative_samples(self, sample_num): super(BuildingAdapterInterface, self)\ .select_informative_samples(sample_num) return []
test_file = "./data/test.csv" train_df = pd.read_csv(train_file) test_df = pd.read_csv(test_file) # This tells us which columns have null values train_df.isnull().any(axis=0) # Only 2 null Embarked values... let's drop it train_df['Embarked'].isnull().sum() train_df = train_df[train_df['Embarked'].notnull()] le = LabelEncoder() le.fit(['A', 'B', 'C', 'D']) train_df['Pclass'] = le.inverse_transform(train_df['Pclass']) # Fill in missing values for age using linear interpolation train_df['Age'] = train_df['Age'].interpolate() predictor_columns = [ 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Age' ] #predictor_columns = ['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Age'] label_column = 'Survived' X = pd.get_dummies(train_df[predictor_columns]) y = train_df[label_column] X_train, X_test, y_train, y_test = train_test_split(X, y)
#<<< #building sparse matrix on test data X_test = CV_test.fit_transform(test_attr_list).toarray() # #>>>Predicting the Y values in the test data, model building RFC = RandomForestClassifier(n_estimators=300, criterion='entropy', random_state=0) RFC.fit(X_train, Y_train) Y_test = RFC.predict(X_test) #<<< #Finding Accuracies kfold = KFold(n_splits=10, random_state=0) test_accuracies = cross_val_score(estimator=RFC, X=X_test, y=Y_test, cv=kfold) print("Test Accuracies = ", test_accuracies) print("Test Data Accuracies SD = ", test_accuracies.std()) print("Test Data Accuracies mean = ", test_accuracies.mean()) # #>>> Converting numeric to labels Y_test = labelencoder.inverse_transform(Y_test) # #>>> adding the predicted values to the test data as label column Y_test = np.reshape(Y_test, (len(Y_test), 1)) test_data1 = np.append(arr=test_data, values=Y_test, axis=1) test_data1 = pd.DataFrame(test_data1, columns=["id", "additionalAttributes", "labels"]) #<<< #>>>Writing testing set to submissions file with open('submissions.csv', 'w') as outfile: test_data1.to_csv(outfile, index=False) #<<<
class AdaBoostClassifier(Component): """Text classifier using the sklearn framework""" name = "AdaBoost_Classifier" provides = ["classifylabel", "classifylabel_ranking"] requires = ["sentence_embedding"] def __init__(self, config=None, clf=None, le=None): # type: (sklearn.model_selection.GridSearchCV, sklearn.preprocessing.LabelEncoder) -> None """Construct a new classifier using the sklearn framework.""" from sklearn.preprocessing import LabelEncoder if le is not None: self.le = le else: self.le = LabelEncoder() self.clf = clf @classmethod def required_packages(cls): # type: () -> List[Text] return ["numpy", "sklearn"] def transform_labels_str2num(self, labels): # type: (List[Text]) -> np.ndarray """Transforms a list of strings into numeric label representation. :param labels: List of labels to convert to numeric representation""" return self.le.fit_transform(labels) def transform_labels_num2str(self, y): # type: (np.ndarray) -> np.ndarray """Transforms a list of strings into numeric label representation. :param y: List of labels to convert to numeric representation""" return self.le.inverse_transform(y) def train(self, training_data, config, **kwargs): # type: (TrainingData, RasaNLUConfig, **Any) -> None """Train the classifier on a data set. :param num_threads: number of threads used during training time""" from sklearn.model_selection import GridSearchCV from sklearn.ensemble import AdaBoostClassifier labels = [e.get("label") for e in training_data.classify_examples] if len(set(labels)) < 2: logger.warning( "Can not train an classifier. Need at least 2 different classes. " + "Skipping training of classifier.") else: y = self.transform_labels_str2num(labels) # TODO fix it, in future sentence will replaced by "features" X = np.stack([ example.get("sentence_embedding") for example in training_data.classify_examples ]) self.clf = AdaBoostClassifier() # sklearn_config = config.get("classifier_sklearn") # C = sklearn_config.get("C", [1, 2, 5, 10, 20, 100]) # kernel = sklearn_config.get("kernel", "linear") # # dirty str fix because sklearn is expecting str not instance of basestr... 
# tuned_parameters = [{"C": C, "kernel": [str(kernel)]}] # cv_splits = max(2, min(MAX_CV_FOLDS, np.min(np.bincount(y)) // 5)) # aim for 5 examples in each fold # # self.clf = GridSearchCV(SVC(C=1, probability=True, class_weight='balanced'), # param_grid=tuned_parameters, n_jobs=config["num_threads"], # cv=cv_splits, scoring='f1_weighted', verbose=1) self.clf.fit(X, y) def process(self, message, **kwargs): # type: (Message, **Any) -> None """Returns the most likely label and its probability for the input text.""" if not self.clf: # component is either not trained or didn't receive enough training data label = None label_ranking = [] else: X = message.get("sentence_embedding").reshape(1, -1) label_ids, probabilities = self.predict(X) labels = self.transform_labels_num2str(label_ids) # `predict` returns a matrix as it is supposed # to work for multiple examples as well, hence we need to flatten labels, probabilities = labels.flatten(), probabilities.flatten() if labels.size > 0 and probabilities.size > 0: ranking = list( zip(list(labels), list(probabilities)))[:CLASSIFY_RANKING_LENGTH] label = {"name": labels[0], "confidence": probabilities[0]} label_ranking = [{ "name": label_name, "confidence": score } for label_name, score in ranking] else: label = {"name": None, "confidence": 0.0} label_ranking = [] message.set("classifylabel", label, add_to_output=True) message.set("classifylabel_ranking", label_ranking, add_to_output=True) def predict_prob(self, X): # type: (np.ndarray) -> np.ndarray """Given a bow vector of an input text, predict the classify label. Returns probabilities for all labels. :param X: bow of input text :return: vector of probabilities containing one entry for each label""" return self.clf.predict_proba(X) def predict(self, X): # type: (np.ndarray) -> Tuple[np.ndarray, np.ndarray] """Given a bow vector of an input text, predict most probable label. Returns only the most likely label. :param X: bow of input text :return: tuple of first, the most probable label and second, its probability""" pred_result = self.predict_prob(X) # sort the probabilities retrieving the indices of the elements in sorted order sorted_indices = np.fliplr(np.argsort(pred_result, axis=1)) return sorted_indices, pred_result[:, sorted_indices] @classmethod def load(cls, model_dir=None, model_metadata=None, cached_component=None, **kwargs): # type: (Text, Metadata, Optional[Component], **Any) -> SklearnClassifier import cloudpickle if model_dir and model_metadata.get("classifier_sklearn"): classifier_file = os.path.join( model_dir, model_metadata.get("classifier_sklearn")) with io.open(classifier_file, 'rb') as f: # pragma: no test return cloudpickle.load(f, encoding="latin-1") else: return SklearnClassifier() def persist(self, model_dir): # type: (Text) -> Dict[Text, Any] """Persist this model into the passed directory. Returns the metadata necessary to load the model again.""" import cloudpickle classifier_file = os.path.join(model_dir, "label_classifier.pkl") with io.open(classifier_file, 'wb') as f: cloudpickle.dump(self, f) return {"classifier_sklearn": "label_classifier.pkl"}
"pm/text-ml-classification/scripts/result_non_split.pkl") trainDf = pd.read_pickle( "/home/zhen.di/pm/text-ml-classification/scripts/trainDf.pkl") # split data into training data and testing data X_train, X_test, y_train, y_test = train_test_split(trainDf, result.Class, test_size=0.2, random_state=5, stratify=result.Class) # encode labels le = LabelEncoder() y_train = le.fit_transform(y_train) # int64 y_test = le.transform(y_test) encoded_test_y = np_utils.to_categorical((le.inverse_transform(y_test))) def plot_confusion_matrix(clf, test_y, predict_y): """ Give confusion matrix based on testing data """ C = confusion_matrix(test_y, predict_y) labels = le.classes_ fig = plt.figure(figsize=(10, 8)) sns.heatmap(C, annot=True, cmap="Blues", fmt=".0f", xticklabels=labels, yticklabels=labels)
X_train = embedded[train_idx] # 50 test examples of 10 identities (5 examples each) X_test = embedded[test_idx] y_train = y[train_idx] y_test = y[test_idx] knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean') svc = LinearSVC() knn.fit(X_train, y_train) svc.fit(X_train, y_train) acc_knn = accuracy_score(y_test, knn.predict(X_test)) acc_svc = accuracy_score(y_test, svc.predict(X_test)) print('KNN accuracy = {}, SVM accuracy = {}'.format(acc_knn,acc_svc)) import warnings # Suppress LabelEncoder warning warnings.filterwarnings('ignore') example_idx = 15 example_image = load_image(metadata[test_idx][example_idx].image_path()) example_prediction = svc.predict([embedded[test_idx][example_idx]]) example_identity = encoder.inverse_transform(example_prediction)[0] plt.imshow(example_image) plt.title('Recognized as {}'.format(example_identity)) plt.show()
class MLPAgent(): def __init__(self, hidden_layer_sizes=(100,), activation="relu", solver="lbfgs", # faster and better than adam for small data http://scikit-learn.org/stable/modules/neural_networks_supervised.html#tips-on-practical-use max_iter=200, verbose=True, early_stopping=False, ngram_range=(1, 1), max_features=None): self.vect = TfidfVectorizer( tokenizer=word_tokenize, ngram_range=ngram_range, max_features=max_features) self.mlp = MLPClassifier( hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, max_iter=max_iter, #early_stopping=early_stopping, verbose=verbose) self.enc = LabelEncoder() def __str__(self): return "{}-layer MLP-Agent {}-grams".format(self.mlp.hidden_layer_sizes, self.vect.ngram_range) #"{:d}-NN Agent {}".format(self.get_params()) def get_params(self): """Get parameters< for this estimator. ------- params : plain flushed dict of parameters """ # dict = {} # for object_name, values in self.__dict__: # dict[object_name] return self.__dict__ def fit(self, X, y): """ X : question y : utterances >>> x = 3 >>> x == 3 True """ #X,y = X[:5000], y[:5000] print("classifier.fit: X.shape", X.shape,"y.shape", y.shape) # learns vectorizer on utterances and answers self.vect.fit(np.append(X, y)) # transforms the inputs to vectorized form questions_vec = self.vect.transform(X) #answers_vec = self.vect.transform(y) #TODO: replace answer values with labels answers_vec = self.enc.fit_transform(y) self.mlp.fit(questions_vec,answers_vec) print("Fitting MLP") print('n_samples', X.shape[0]) print('vocabulary size', len(self.vect.vocabulary_)) print('targets', answers_vec.shape) def predict(self, question): vector_question = self.vect.transform([question]) p = self.mlp.predict_proba(vector_question)[0] # only one question # [p_0, p_1, p_2] labels = p.nonzero()[0] ind = np.argsort(p[labels])[::-1] #print("p ",p," labels ",labels, " ind ",ind) # use cluster_ind -> medoid mapping # inverse transform medoid return (self.enc.inverse_transform(labels[ind]), p[labels[ind]])
paramCheck = '(' + str(MAX_DEPTH) + ',' + str(ETA) + ',' + str( num_round) + ',' + str(SUB_SAMPLE) + ',' + str(COL_SAMPLE) + ')' timeStr = str(time.strftime('%Y-%m-%d %H%M%S', time.localtime())) bst.save_model(MODEL_FOLDER + 'model' + timeStr + str(cvndcg) + paramCheck + str(param['seed']) + '.model') bst.dump_model(MODEL_FOLDER + timeStr + "dump.raw.txt") fscore = bst.get_fscore() sorted_fscore = sorted(fscore.items(), key=operator.itemgetter(1), reverse=True) print fscore print sorted_fscore ypred = bst.predict(dtest) # print ypred[:5] # Taking the 5 classes with highest probabilities ids = [] # list of ids cts = [] # list of countries for i in range(len(ypred)): idx = idSave[i] ids += [idx] * 5 cts += le.inverse_transform(np.argsort(ypred[i])[::-1])[:5].tolist() # Generate submission sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country']) stTimeStr = '(' + str(startTime)[11:19] + 'start)' timeSpend = (dt.now() - startTime) print timeSpend, stTimeStr sub.to_csv(OUT_FOLDER + 'sub' + timeStr + paramCheck + '.csv', index=False)
class flask_serving_classifier(Component): """Intent classifier using the sklearn framework""" name = "flask_serving_classifier" provides = ["intent", "intent_ranking"] requires = ["text_features"] def __init__(self, component_config=None, # type: Dict[Text, Any] clf=None, # type: sklearn.model_selection.GridSearchCV le=None # type: sklearn.preprocessing.LabelEncoder ): # type: (...) -> None """Construct a new intent classifier using the sklearn framework.""" from sklearn.preprocessing import LabelEncoder super(flask_serving_classifier, self).__init__(component_config) if le is not None: self.le = le else: self.le = LabelEncoder() self.clf = clf _sklearn_numpy_warning_fix() @classmethod def required_packages(cls): # type: () -> List[Text] return ["sklearn"] def transform_labels_str2num(self, labels): # type: (List[Text]) -> np.ndarray """Transforms a list of strings into numeric label representation. :param labels: List of labels to convert to numeric representation""" return self.le.fit_transform(labels) def transform_labels_num2str(self, y): # type: (np.ndarray) -> np.ndarray """Transforms a list of strings into numeric label representation. :param y: List of labels to convert to numeric representation""" return self.le.inverse_transform(y) def train(self, training_data, cfg, **kwargs): # type: (TrainingData, RasaNLUModelConfig, **Any) -> None """Train the intent classifier on a data set.""" logger.warn("ED CLASSIFIER TRAIN") num_threads = kwargs.get("num_threads", 1) labels = [e.get("intent") for e in training_data.intent_examples] if len(set(labels)) < 2: logger.warn("Can not train an intent classifier. " "Need at least 2 different classes. " "Skipping training of intent classifier.") else: y = self.transform_labels_str2num(labels).tolist() # X = np.stack([example.get("text_features") # for example in training_data.intent_examples]) # attrs = vars(training_data.intent_examples[0]) # print(', '.join("%s: %s" % item for item in attrs.items())) # print('ED TRAIN DATA:', training_data.intent_examples[0]) X = [i.text for i in training_data.intent_examples] categories = [i for i in set(y)] model_name = 'datetime' host = '172.17.0.5' port = 9000 url = f'http://{host}:{port}/train' data = {'text': X, 'labels': y, 'unique_labels': categories} print('ED DATA', data) tr = requests.put(url, json=data) ###train print(tr.json()) self.clf = model_name # self.clf = self._create_classifier(num_threads, y) # self.clf.fit(X, y) def process(self, message, **kwargs): # type: (Message, **Any) -> None """Return the most likely intent and its probability for a message.""" logger.warn("ED CLASSIFIER PROCESS MESSAGE:") if not self.clf: # component is either not trained or didn't # receive enough training data intent = None intent_ranking = [] else: print('ED message', message) # # attrs = vars(message) # print(', '.join("%s: %s" % item for item in attrs.items())) # X = message.get("text_features").reshape(1, -1) X = message.text # X = message.get('text) # X = message.data.text intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) # `predict` returns a matrix as it is supposed # to work for multiple examples as well, hence we need to flatten probabilities = probabilities.flatten() if intents.size > 0 and probabilities.size > 0: ranking = list(zip(list(intents), list(probabilities)))[:INTENT_RANKING_LENGTH] intent = {"name": intents[0], "confidence": probabilities[0]} intent_ranking = [{"name": intent_name, "confidence": score} for intent_name, score in ranking] 
else: intent = {"name": None, "confidence": 0.0} intent_ranking = [] message.set("intent", intent, add_to_output=True) message.set("intent_ranking", intent_ranking, add_to_output=True) def predict_prob(self, X): # type: (np.ndarray) -> np.ndarray """Given a bow vector of an input text, predict the intent label. Return probabilities for all labels. :param X: bow of input text :return: vector of probabilities containing one entry for each label""" data = {'text': X,'labels':[], 'unique_labels':[]} host = '172.17.0.5' port = 9000 url = f'http://{host}:{port}/predict' pred = requests.post(url, json=data) out = np.array(pred.json()['prediction']) return out def predict(self, X): # type: (np.ndarray) -> Tuple[np.ndarray, np.ndarray] """Given a bow vector of an input text, predict most probable label. Return only the most likely label. :param X: bow of input text :return: tuple of first, the most probable label and second, its probability.""" pred_result = self.predict_prob(X) # sort the probabilities retrieving the indices of # the elements in sorted order sorted_indices = np.fliplr(np.argsort(pred_result, axis=1)) return sorted_indices, pred_result[:, sorted_indices] @classmethod def load(cls, model_dir=None, # type: Optional[Text] model_metadata=None, # type: Optional[Metadata] cached_component=None, # type: Optional[Component] **kwargs # type: **Any ): # type: (...) -> SklearnIntentClassifier meta = model_metadata.for_component(cls.name) file_name = meta.get("classifier_file", SKLEARN_MODEL_FILE_NAME) classifier_file = os.path.join(model_dir, file_name) if os.path.exists(classifier_file): return utils.pycloud_unpickle(classifier_file) else: return cls(meta) def persist(self, model_dir): # type: (Text) -> Optional[Dict[Text, Any]] """Persist this model into the passed directory.""" classifier_file = os.path.join(model_dir, SKLEARN_MODEL_FILE_NAME) utils.pycloud_pickle(classifier_file, self) return {"classifier_file": SKLEARN_MODEL_FILE_NAME}
import cv2 import pickle import argparse import numpy as np from sklearn.preprocessing import LabelEncoder ap = argparse.ArgumentParser() ap.add_argument("-s", "--saved", required=True, help="Path of saved model") ap.add_argument("-f", "--flower", required=True, help="Path of image") ap.add_argument("-m", "--mask", required=True, help="Path of mask") args = vars(ap.parse_args()) model = pickle.load(open(args["saved"], 'rb')) flower = cv2.imread(args["flower"]) mask = cv2.imread(args["mask"]) gray_mask = cv2.cvtColor(mask, cv2.COLOR_BGR2GRAY) label_encoder = LabelEncoder() label_encoder.classes_ = np.load('classes.npy') hist = cv2.calcHist([flower], [0, 1, 2], gray_mask, [8, 8, 8], [0, 256, 0, 256, 0, 256]) cv2.normalize(hist, hist) flower_class = label_encoder.inverse_transform(model.predict([hist.flatten()]))[0] print(flower_class) cv2.imshow("flower", flower) cv2.waitKey(0)
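The classes.npy file loaded above is the counterpart of a one-line save at training time; a minimal sketch, assuming a label_encoder that was fitted on the training labels:

import numpy as np

# persisting only classes_ is enough; assigning it back (as done above) restores the encoder
np.save('classes.npy', label_encoder.classes_)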
for i in idxs: # load the testing image, clone it, and resize it image = cv2.imread(testingPaths[i]) output = image.copy() output = cv2.resize(output, (128, 128)) # pre-process the image in the same manner we did earlier image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) image = cv2.resize(image, (200, 200)) image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1] # quantify the image and make predictions based on the extracted # features using the last trained Random Forest features = quantify_image(image) preds = model.predict([features]) label = le.inverse_transform(preds)[0] # draw the colored class label on the output image and add it to # the set of output images color = (0, 255, 0) if label == "healthy" else (0, 0, 255) cv2.putText(output, label, (3, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) images.append(output) # create a montage using 128x128 "tiles" with 5 rows and 5 columns montage = build_montages(images, (128, 128), (5, 5))[0] # show the output montage cv2.imshow("Output", montage) cv2.waitKey(0)
class RecommenderDeepNN: ''' Recommender for Yelp dataset using the deepFM model. Parameters ---------- category: 'restaurants', Keep only businesses of a certain category - Options: 'restaurants', 'automotive', 'shopping' min_review: 5, Keep only business with more review_count than this value min_category: 50, Keep only categories that apply to more than this amount of businesses weight: False, Whether or not to use weights for the attribute matrix in the DeepFM scaler: 'minmax', Scaler for dense features optimizer: "adam", Optimizer for the DeepFM loss: 'mse', Loss function for the DeepFM batch_size: 256, epochs: 10, train_size: 0.8, deepfm__dnn_hidden_units: (128, 128), deepfm__l2_reg_linear: 1e-05, deepfm__l2_reg_embedding: 1e-05, deepfm__l2_reg_dnn: 0, deepfm__seed: 1024, deepfm__dnn_dropout: 0, deepfm__dnn_activation: 'relu' Example ------- deepnn = RecommenderDeepNN(deepfm__seed=2048) deepnn.load_data(config.JSON_BUSINESS, config.CSV_RATINGS) deepnn.fit() deepnn.topN(260, n=5) deepnn = RecommenderDeepNN(scaler='standard', train_size=0.99) deepnn.fit(config.JSON_BUSINESS, config.CSV_RATINGS) ''' def __init__(self, **kwargs): ''' Parameters ---------- path_business: Path to the business.json file that contains 'attributes' and 'catogories' as dictionaries for all businesses path_ratings: Path to the ratings.csv file that contains 'user_id', 'business_id' and 'stars'. The review text is not needed here. ''' self.path_business = "" self.path_ratings = "" self.features_sparse = features_sparse self.features_dense = features_dense self.params = params_deepnn self.params_deepfm = {} self.business = None self.data = None self.attr2index = {} self.raw_to_iid = {} self.iid_to_raw = {} self.raw_to_uid = {} self.uid_to_raw = {} # Label encoders self.lbe_user = None self.lbe_item = None self.model = None self.features_linear = [] self.features_dnn = [] self.model_input = {} self.update_params(**kwargs) def load_data(self, path_business, path_ratings): ''' Load data and transform it to usable format. ''' print("Loading data ...") self.path_business = path_business self.path_ratings = path_ratings df = pd.read_json(self.path_business, lines=True, encoding='utf-8') df_ratings = pd.read_csv(self.path_ratings) df_ratings.rename({'stars':'rating'}, axis=1, inplace=True) to_keep = config.Keywords_Categories[self.params['category']] keeprows = utils.filter_business_with_categories(df, to_keep) df = df[keeprows] # Map user_id and business_id encodings to integers self.uid_to_raw = dict(df_ratings['user_id'].drop_duplicates().reset_index()['user_id']) self.raw_to_uid = {k:v for v, k in self.uid_to_raw.items()} self.iid_to_raw = dict(df['business_id']) self.raw_to_iid = {k:v for v, k in self.iid_to_raw.items()} self.business = df[['business_id', 'name', 'stars', 'review_count', 'categories']] df = df[df['review_count'] > self.params['min_review']] df = df_ratings.join(df[['business_id', 'stars', 'review_count', 'categories']].set_index('business_id'), on='business_id', how='right') # Has to be "right"... 
otherwise there will be NaNs # Also, use df.set_index() because df is smaller in size df['user_id'] = df['user_id'].map(self.raw_to_uid) df['business_id'] = df['business_id'].map(self.raw_to_iid) self.lbe_user = LabelEncoder() self.lbe_item = LabelEncoder() df['user_id'] = self.lbe_user.fit_transform(df['user_id']) df['business_id'] = self.lbe_item.fit_transform(df['business_id']) # x = lbe_user.inverse_transform(df_ratings['user_id']) # y = lbe_item.inverse_transform(df_ratings['business_id']) if(self.params['scaler'] == 'minmax'): scaler = MinMaxScaler(feature_range=(0,1)) elif(self.params['scaler'] == 'standard'): scaler = StandardScaler() df[self.features_dense] = scaler.fit_transform(df[self.features_dense]) lbe = LabelEncoder() for var in self.features_sparse: if(var not in ['business_id', 'user_id']): df[var] = lbe.fit_transform(df[var]) self.data = df del df, df_ratings def _compile_business_categories(self, df_business): ''' Find all the categories that apply to the businesses in the DataFrame df_business ''' categories = Counter() for line in df_business['categories']: if(isinstance(line, str)): categories.update(re.split(', ', line)) categories = pd.DataFrame.from_dict(categories, orient='index', columns=['count']) return categories def _build_category_dict(self, drop_categories=[]): attrs = self._compile_business_categories(self.data) attrs = attrs[attrs['count'] > self.params['min_category']].sort_values(by='count', ascending=False) for cat in drop_categories: attrs.drop(cat, inplace=True) attrs.index.to_list() self.attr2index = {k:v+1 for v, k in enumerate(attrs.index.to_list())} del attrs def _category_vectorizer(self, x): ''' Label encode categories of any business x into a list of indices. The mapping is given by the dictionary attr2index{catogory:index}. ''' if(isinstance(x, str)): spt = re.split(', ', x) return list(map(lambda x: self.attr2index[x] if x in self.attr2index else 0, spt)) else: return [] def _get_category_matrix(self, df): attrs_matrix = [self._category_vectorizer(x) for x in df['categories'].values] attrs_max_len = max(np.array(list(map(len, attrs_matrix)))) attrs_matrix = pad_sequences(attrs_matrix, maxlen=attrs_max_len, padding='post',) print("Matrix takes {:5.2f} MB".format(attrs_matrix.nbytes/1024./1024.)) return attrs_matrix, attrs_max_len def _build_model(self): to_drop = config.Keywords_Categories[self.params['category']] self._build_category_dict(drop_categories=to_drop) attrs_matrix, attrs_max_len = self._get_category_matrix(self.data) vars_fixlen = [SparseFeat(var, self.data[var].nunique(), embedding_dim=4) for var in self.features_sparse] vars_fixlen += [DenseFeat(var, 1,) for var in self.features_dense] vars_varlen = [VarLenSparseFeat(SparseFeat('categories', vocabulary_size=len(self.attr2index) + 1, embedding_dim=4), maxlen=attrs_max_len, combiner='mean', weight_name='attrs_weight' if self.params['weight'] else None)] self.features_linear = vars_fixlen + vars_varlen self.features_dnn = vars_fixlen + vars_varlen self.model = DeepFM(self.features_linear, self.features_dnn, task='regression', **self.params_deepfm) return attrs_matrix, attrs_max_len def get_feature_names(self): return get_feature_names(self.features_linear + self.features_dnn) def _set_params_deepfm(self): for k, v in self.params.items(): spt = k.split('__') if(len(spt) > 1): self.params_deepfm[spt[1]] = v def update_params(self, recompile=True, **kwargs): ''' Update parameters for the recommender and re-compile the DeepFM model unless recompile is set to False. 
Example ------- deepnn.update_params(epochs=20, deepfm__l2_reg_linear=2e-4) ''' for (k, v) in kwargs.items(): if(k in self.params): self.params[k] = v else: raise ValueError('{0} is not a valid parameter for RecommenderDeepNN.'.format(k)) self._set_params_deepfm() if(recompile == True and self.model is not None): self.model = DeepFM(self.features_linear, self.features_dnn, task='regression', **self.params_deepfm) def fit(self, path_business=None, path_ratings=None): if(self.data is None): self.load_data(path_business, path_ratings) model_input = self._get_model_input(self.data) self.model.compile(self.params['optimizer'], self.params['loss'], metrics=[self.params['loss']],) self.model.fit(model_input, self.data['rating'].values, batch_size=self.params['batch_size'], epochs=self.params['epochs'], validation_split=1-self.params['train_size'], verbose=2) def _get_model_input(self, df): if(self.model is None): attrs_matrix, attrs_max_len = self._build_model() else: attrs_matrix, attrs_max_len = self._get_category_matrix(df) features = self.get_feature_names() model_input = {name: df[name] for name in features} model_input['categories'] = attrs_matrix if(self.params['weight']): model_input['attrs_weight'] = np.random.randn(df.shape[0], attrs_max_len, 1) return model_input def predictAllItemsForUser(self, uid): ''' Returns predicted ratings of all businesses for any user (uid) ''' df = self.data.drop_duplicates('business_id').drop('user_id', axis=1) df['user_id'] = uid model_input = self._get_model_input(df) pred = self.model.predict(model_input, batch_size=self.params['batch_size']) return pd.DataFrame(pred,index=df['business_id'],columns=['pred']) def topN(self, uid, n=5): inner_uid = self.lbe_user.transform([uid])[0] pred = self.predictAllItemsForUser(inner_uid) topn = pred.nlargest(n, columns='pred') top_n_iid = self.lbe_item.inverse_transform(topn.index) predictions = topn['pred'].to_list() n_reviews = self.data['user_id'].value_counts()[inner_uid] print() print("UserID: {0}, Rated: {1}".format(uid, n_reviews)) print("--------------------------------") topN_business = self.business.loc[top_n_iid] for i, (_, business) in enumerate(topN_business.iterrows()): print(business['name']) print(business['categories']) print("Pred: %4.2f Avg: %3.1f out of %d reviews\n" % \ (predictions[i], business['stars'], business['review_count']))
#load values from dataset: X_test = df.values[:, 3:] Y_test = df.values[:, 2:3].ravel() #DETERMINE RULESIZE BY THE NUMBER OF UNIQUE Y's rulesize = len(np.unique(Y_test)) #integer encode with sklearn label_encoder = LabelEncoder() integer_encoded_Y = label_encoder.fit_transform(Y_test) #one hot encode with keras: onehot_Y_test = to_categorical(integer_encoded_Y) # #reverse encoding... target_names = np.unique( label_encoder.inverse_transform(argmax(onehot_Y_test, axis=1))) #convert to strings just in case... target_names = [str(i) for i in target_names] #def top 5, 10 accuracy: def top_5_categorical_accuracy(y_true, y_pred): return top_k_categorical_accuracy(y_true, y_pred, k=5) def top_10_categorical_accuracy(y_true, y_pred): return top_k_categorical_accuracy(y_true, y_pred, k=10) #save model #load from json print "Loading your model..." model_json = open( os.path.join(
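The two helper metrics above are meant to be passed to Keras at compile time; a minimal sketch of that wiring (the optimizer and loss shown here are assumptions, not taken from the snippet):

# hypothetical compile call showing how the custom top-k metrics are attached
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy', top_5_categorical_accuracy, top_10_categorical_accuracy])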
#df = df.query("rating_type == @target") # filter rating type df = df.query("rating != 'Geen van allen'") with pd.option_context('mode.chained_assignment', None): # suppress stupid warning df.loc[:, 'rating'] = df.loc[:, 'rating'].replace(DU2EN) # translate df_emo = df.query("rating_type == 'emotion'") df_emo.loc[:, 'rating'] = le.transform(df_emo['rating']) df_emo = convert_doubles_to_single_labels(df_emo['rating'], soft=False, keepdims=False) df_emo = pd.DataFrame(df_emo.values.argmax(axis=1), columns=['rating'], index=df_emo.index) df_emo['rating'] = le.inverse_transform(df_emo['rating']) df_emo = df_emo.rename({'rating': 'emotion'}, axis=1) df_tmp = df.query("rating_type == 'valence'") df_val = pd.DataFrame() for idx in df_emo.index: tmp = df_tmp.loc[idx, 'rating'] if isinstance(tmp, pd.Series): val = df_tmp.loc[idx, 'rating'].astype(float).values.mean() else: val = tmp df_val.loc[idx, 'valence'] = val df_tmp = df.query("rating_type == 'arousal'") df_aro = pd.DataFrame()
class DefaultPreprocessor(AbstractPreprocessor): def __init__(self, config: ModelConfig, cache_home=None, use_cache=False): super().__init__(config) self.reset() self.X_types = None self.y_type = None self.cache_dir = self._prepare_cache_dir(cache_home) self.use_cache = use_cache def reset(self): self.metainfo = None self.categorical_columns = None self.var_len_categorical_columns = None self.continuous_columns = None self.y_lable_encoder = None self.X_transformers = collections.OrderedDict() def prepare_X(self, X): if not isinstance(X, pd.DataFrame): X = pd.DataFrame(X) if len(set(X.columns)) != len(list(X.columns)): cols = [ item for item, count in collections.Counter(X.columns).items() if count > 1 ] raise ValueError(f'Columns with duplicate names in X: {cols}') if X.columns.dtype != 'object': X.columns = ['x_' + str(c) for c in X.columns] logger.warn(f"Column index of X has been converted: {X.columns}") return X def fit_transform(self, X, y, copy_data=True): sign = self.get_X_y_signature(X, y) if self.use_cache: logger.info('Try to load (X, y) from cache') X_t, y_t = self.get_transformed_X_y_from_cache(sign) if X_t is not None and y_t is not None: if self.load_transformers_from_cache(): return X_t, y_t else: logger.info('Load failed') start = time.time() self.reset() if X is None: raise ValueError(f'X cannot be none.') if y is None: raise ValueError(f'y cannot be none.') if len(X.shape) != 2: raise ValueError(f'X must be a 2D datasets.') # if len(y.shape) != 1: # raise ValueError(f'y must be a 1D datasets.') if X.shape[0] != y.shape[0]: raise ValueError( f"The number of samples of X and y must be the same. X.shape:{X.shape}, y.shape{y.shape}" ) y_df = pd.DataFrame(y) if y_df.isnull().sum().sum() > 0: raise ValueError("Missing values in y.") if copy: X = copy.deepcopy(X) y = copy.deepcopy(y) y = self.fit_transform_y(y) X = self.prepare_X(X) X = self.__prepare_features(X) if self.config.auto_imputation: X = self._imputation(X) if self.config.auto_encode_label: X = self._categorical_encoding(X) if self.config.auto_discrete: X = self._discretization(X) if self.config.apply_gbm_features and y is not None: X = self._apply_gbm_features(X, y) var_len_categorical_columns = self.config.var_len_categorical_columns if var_len_categorical_columns is not None and len( var_len_categorical_columns) > 0: X = self._var_len_encoder(X, var_len_categorical_columns) self.X_transformers['last'] = PassThroughEstimator() cat_cols = self.get_categorical_columns() cont_cols = self.get_continuous_columns() if len(cat_cols) > 0: X[cat_cols] = X[cat_cols].astype('category') if len(cont_cols) > 0: X[cont_cols] = X[cont_cols].astype('float') logger.info(f'fit_transform taken {time.time() - start}s') if self.use_cache: logger.info('Put (X, y) into cache') self.save_transformed_X_y_to_cache(sign, X, y) self.save_transformers_to_cache() return X, y def fit_transform_y(self, y): if self.config.task == consts.TASK_AUTO: self.task_, self.labels_ = deeptable.infer_task_type(y) else: self.task_ = self.config.task if self.task_ in [consts.TASK_BINARY, consts.TASK_MULTICLASS]: self.y_lable_encoder = LabelEncoder() y = self.y_lable_encoder.fit_transform(y) self.labels_ = self.y_lable_encoder.classes_ elif self.task_ == consts.TASK_MULTILABEL: self.labels_ = list(range(y.shape[-1])) else: self.labels_ = [] return y def transform(self, X, y, copy_data=True): sign = self.get_X_y_signature(X, y) if self.use_cache: logger.info('Try to load (X, y) from cache') X_t, y_t = self.get_transformed_X_y_from_cache(sign) if X_t is not None 
and y_t is not None: return X_t, y_t else: logger.info('Load failed') X_t = self.transform_X(X, copy_data) y_t = self.transform_y(y, copy_data) cat_cols = self.get_categorical_columns() cont_cols = self.get_continuous_columns() if len(cat_cols) > 0: X_t[cat_cols] = X_t[cat_cols].astype('category') if len(cont_cols) > 0: X_t[cont_cols] = X_t[cont_cols].astype('float') if self.use_cache: logger.info('Put (X, y) into cache') self.save_transformed_X_y_to_cache(sign, X_t, y_t) return X_t, y_t def transform_y(self, y, copy_data=True): logger.info("Transform [y]...") start = time.time() if copy_data: y = copy.deepcopy(y) if self.y_lable_encoder is not None: y = self.y_lable_encoder.transform(y) logger.info(f'transform_y taken {time.time() - start}s') y = np.array(y) return y def transform_X(self, X, copy_data=True): start = time.time() logger.info("Transform [X]...") if copy_data: X = copy.deepcopy(X) X = self.prepare_X(X) steps = [step for step in self.X_transformers.values()] pipeline = make_pipeline(*steps) X_t = pipeline.transform(X) logger.info(f'transform_X taken {time.time() - start}s') return X_t def inverse_transform_y(self, y_indicator): if self.y_lable_encoder is not None: return self.y_lable_encoder.inverse_transform(y_indicator) else: return y_indicator def __prepare_features(self, X): start = time.time() logger.info(f'Preparing features...') num_vars = [] convert2cat_vars = [] cat_vars = [] excluded_vars = [] if self.config.cat_exponent >= 1: raise ValueError( f'"cat_expoent" must be less than 1, not {self.config.cat_exponent} .' ) var_len_categorical_columns = self.config.var_len_categorical_columns var_len_column_names = [] if var_len_categorical_columns is not None and len( var_len_categorical_columns) > 0: # check items for v in var_len_categorical_columns: if not isinstance(v, (tuple, list)) or len(v) != 3: raise ValueError( "Var len column config should be a tuple 3.") else: var_len_column_names.append(v[0]) var_len_col_sep_dict = { v[0]: v[1] for v in var_len_categorical_columns } var_len_col_pooling_strategy_dict = { v[0]: v[2] for v in var_len_categorical_columns } else: var_len_col_sep_dict = {} var_len_col_pooling_strategy_dict = {} unique_upper_limit = round(X.shape[0]**self.config.cat_exponent) for c in X.columns: nunique = X[c].nunique() dtype = str(X[c].dtype) if nunique <= 1 and self.config.auto_discard_unique: continue if c in self.config.exclude_columns: excluded_vars.append((c, dtype, nunique)) continue # handle var len feature if c in var_len_column_names: self.__append_var_len_categorical_col( c, nunique, var_len_col_sep_dict[c], var_len_col_pooling_strategy_dict[c]) continue if self.config.categorical_columns is not None and isinstance( self.config.categorical_columns, list): if c in self.config.categorical_columns: cat_vars.append((c, dtype, nunique)) else: if np.issubdtype(dtype, np.number): num_vars.append((c, dtype, nunique)) else: print( f'Column [{c}] has been discarded. It is not numeric and not in [config.categorical_columns].' 
) else: if dtype == 'object' or dtype == 'category' or dtype == 'bool': cat_vars.append((c, dtype, nunique)) elif self.config.auto_categorize and nunique < unique_upper_limit: convert2cat_vars.append((c, dtype, nunique)) else: num_vars.append((c, dtype, nunique)) if len(convert2cat_vars) > 0: ce = CategorizeEncoder([c for c, d, n in convert2cat_vars], self.config.cat_remain_numeric) X = ce.fit_transform(X) self.X_transformers['categorize'] = ce if self.config.cat_remain_numeric: cat_vars = cat_vars + ce.new_columns num_vars = num_vars + convert2cat_vars else: cat_vars = cat_vars + convert2cat_vars logger.debug( f'{len(cat_vars)} categorical variables and {len(num_vars)} continuous variables found. ' f'{len(convert2cat_vars)} of them are from continuous to categorical.' ) self.__append_categorical_cols([(c[0], c[2] + 2) for c in cat_vars]) self.__append_continuous_cols([c[0] for c in num_vars], consts.INPUT_PREFIX_NUM + 'all') print(f'Preparing features taken {time.time() - start}s') return X def _imputation(self, X): start = time.time() logger.info('Data imputation...') continuous_vars = self.get_continuous_columns() categorical_vars = self.get_categorical_columns() var_len_categorical_vars = self.get_var_len_categorical_columns() transformers = [ ('categorical', SimpleImputer(missing_values=np.nan, strategy='constant'), categorical_vars), ('continuous', SimpleImputer(missing_values=np.nan, strategy='mean'), continuous_vars), ] if len(var_len_categorical_vars) > 0: transformers.append( ('var_len_categorical', SimpleImputer(missing_values=np.nan, strategy='constant'), var_len_categorical_vars), ) ct = ColumnTransformer(transformers) dfwrapper = DataFrameWrapper( ct, categorical_vars + continuous_vars + var_len_categorical_vars) X = dfwrapper.fit_transform(X) self.X_transformers['imputation'] = dfwrapper print(f'Imputation taken {time.time() - start}s') return X def _categorical_encoding(self, X): start = time.time() logger.info('Categorical encoding...') vars = self.get_categorical_columns() mle = MultiLabelEncoder(vars) X = mle.fit_transform(X) self.X_transformers['label_encoder'] = mle print(f'Categorical encoding taken {time.time() - start}s') return X def _discretization(self, X): start = time.time() logger.info('Data discretization...') vars = self.get_continuous_columns() mkbd = MultiKBinsDiscretizer(vars) X = mkbd.fit_transform(X) self.__append_categorical_cols([ (new_name, bins + 1) for name, new_name, bins in mkbd.new_columns ]) self.X_transformers['discreter'] = mkbd print(f'Discretization taken {time.time() - start}s') return X def _var_len_encoder(self, X, var_len_categorical_columns): start = time.time() logger.info('Encoder var length feature...') transformer = MultiVarLenFeatureEncoder(var_len_categorical_columns) X = transformer.fit_transform(X) # update var_len_categorical_columns for c in self.var_len_categorical_columns: _encoder: VarLenFeatureEncoder = transformer._encoders[c.name] c.max_elements_length = _encoder.max_element_length self.X_transformers['var_len_encoder'] = transformer print(f'Encoder taken {time.time() - start}s') return X def _apply_gbm_features(self, X, y): start = time.time() logger.info('Extracting GBM features...') cont_vars = self.get_continuous_columns() cat_vars = self.get_categorical_columns() gbmencoder = LgbmLeavesEncoder(cat_vars, cont_vars, self.task_, **self.config.gbm_params) X = gbmencoder.fit_transform(X, y) self.X_transformers['gbm_features'] = gbmencoder if self.config.gbm_feature_type == consts.GBM_FEATURE_TYPE_EMB: 
            self.__append_categorical_cols(
                [(name, X[name].max() + 1) for name in gbmencoder.new_columns])
        else:
            self.__append_continuous_cols(
                [name for name in gbmencoder.new_columns],
                consts.INPUT_PREFIX_NUM + 'gbm_leaves')
        print(f'Extracting GBM features took {time.time() - start}s')
        return X

    def __append_var_len_categorical_col(self, name, voc_size, sep, pooling_strategy):
        logger.debug(f'Var-length categorical variable {name} appended.')

        if self.config.fixed_embedding_dim:
            embedding_output_dim = self.config.embeddings_output_dim \
                if self.config.embeddings_output_dim > 0 else consts.EMBEDDING_OUT_DIM_DEFAULT
        else:
            embedding_output_dim = 0

        if self.var_len_categorical_columns is None:
            self.var_len_categorical_columns = []

        vc = VarLenCategoricalColumn(
            name,
            voc_size,
            embedding_output_dim if embedding_output_dim > 0 else min(4 * int(pow(voc_size, 0.25)), 20),
            sep=sep,
            pooling_strategy=pooling_strategy)
        self.var_len_categorical_columns.append(vc)

    def __append_categorical_cols(self, cols):
        logger.debug(f'{len(cols)} categorical variables appended.')

        if self.config.fixed_embedding_dim:
            embedding_output_dim = self.config.embeddings_output_dim \
                if self.config.embeddings_output_dim > 0 else consts.EMBEDDING_OUT_DIM_DEFAULT
        else:
            embedding_output_dim = 0

        if self.categorical_columns is None:
            self.categorical_columns = []

        if cols is not None and len(cols) > 0:
            self.categorical_columns = self.categorical_columns + \
                [CategoricalColumn(name,
                                   voc_size,
                                   embedding_output_dim if embedding_output_dim > 0
                                   else min(4 * int(pow(voc_size, 0.25)), 20))
                 for name, voc_size in cols]

    def __append_continuous_cols(self, cols, input_name):
        if self.continuous_columns is None:
            self.continuous_columns = []
        if cols is not None and len(cols) > 0:
            self.continuous_columns = self.continuous_columns + [
                ContinuousColumn(name=input_name, column_names=[c for c in cols])
            ]

    def get_categorical_columns(self):
        return [c.name for c in self.categorical_columns]

    def get_var_len_categorical_columns(self):
        if self.var_len_categorical_columns is not None:
            return [c.name for c in self.var_len_categorical_columns]
        else:
            return []

    def get_continuous_columns(self):
        cont_vars = []
        for c in self.continuous_columns:
            cont_vars = cont_vars + c.column_names
        return cont_vars

    def _prepare_cache_dir(self, cache_home, clear_cache=False):
        if cache_home is None:
            cache_home = 'cache'
        if cache_home[-1] == '/':
            cache_home = cache_home[:-1]

        cache_home = os.path.expanduser(f'{cache_home}')
        if not os.path.exists(cache_home):
            os.makedirs(cache_home)
        else:
            if clear_cache:
                shutil.rmtree(cache_home)
                os.makedirs(cache_home)

        cache_dir = f'{cache_home}/{self.signature}'
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        return cache_dir

    def get_transformed_X_y_from_cache(self, sign):
        file_x_y = f'{self.cache_dir}/X_y_{sign}.h5'
        X_t, y_t = None, None
        if os.path.exists(file_x_y):
            global h5
            try:
                h5 = pd.HDFStore(file_x_y)
                df = h5['data']
                y_t = df.pop('saved__y__')
                X_t = df
            except Exception as e:
                logger.error(e)
                h5.close()
                os.remove(file_x_y)
        return X_t, y_t

    def save_transformed_X_y_to_cache(self, sign, X, y):
        filepath = f'{self.cache_dir}/X_y_{sign}.h5'
        try:
            # x_t = X.copy(deep=True)
            X.insert(0, 'saved__y__', y)
            X.to_hdf(filepath, key='data', mode='w', format='t')
            return True
        except Exception as e:
            logger.error(e)
            if os.path.exists(filepath):
                os.remove(filepath)
        return False

    def load_transformers_from_cache(self):
        transformer_path = f'{self.cache_dir}/transformers.pkl'
        if os.path.exists(transformer_path):
            try:
                with open(transformer_path, 'rb') as input:
                    preprocessor = pickle.load(input)
                    self.__dict__.update(preprocessor.__dict__)
                    return True
            except Exception as e:
                logger.error(e)
                os.remove(transformer_path)
        return False

    def save_transformers_to_cache(self):
        transformer_path = f'{self.cache_dir}/transformers.pkl'
        with open(transformer_path, 'wb') as output:
            pickle.dump(self, output, protocol=2)

    def clear_cache(self):
        shutil.rmtree(self.cache_dir)
        os.makedirs(self.cache_dir)
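# A minimal, self-contained sketch of the HDF5 caching convention used by
# save_transformed_X_y_to_cache / get_transformed_X_y_from_cache above: the
# transformed frame is written with the target prepended as a 'saved__y__'
# column, and reading it back pops that column off to recover (X, y). The
# file path and sample data are illustrative assumptions only (needs pandas
# plus the pytables backend), not part of the original class.
import os
import numpy as np
import pandas as pd

cache_file = '/tmp/X_y_demo.h5'  # hypothetical cache location
X_demo = pd.DataFrame({'f0': [0.1, 0.2, 0.3], 'f1': [1, 2, 3]})
y_demo = np.array([0, 1, 0])

# save: insert y as the first column, then write a single 'data' table
X_saved = X_demo.copy(deep=True)
X_saved.insert(0, 'saved__y__', y_demo)
X_saved.to_hdf(cache_file, key='data', mode='w', format='t')

# load: read the table back and split y off again
df = pd.read_hdf(cache_file, key='data')
y_loaded = df.pop('saved__y__').values
X_loaded = df
assert np.array_equal(y_loaded, y_demo)
os.remove(cache_file)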
print("The full size of the synthetic data set is", synthetic_df.shape) print("\n Displaying the first five rows of the *real* data set:\n") print(real_df.head(5)) print("The full size of the real data set is", synthetic_df.shape) print("\n The features are described as the following: \n") print(features_description.to_csv(index=False)) ### preprocess the data for machine learning # y: readmitted # substitute "NO" with 2, "<30" with 0, ">30" with 1 # real_df.readmitted = real_df.readmitted.replace(["NO", "<30", ">30"], [2, 0, 1]) real_df.readmitted.value_counts() le_real = LabelEncoder() real_y = le_real.fit_transform(real_df.readmitted) # numpy.ndarray le_real.inverse_transform(real_y) # maybe for visualisation synthetic_df.readmitted.value_counts() le_synthetic = LabelEncoder() synthetic_y = le_synthetic.fit_transform(synthetic_df.readmitted) # numpy.ndarray le_synthetic.inverse_transform(synthetic_y) # maybe for visualisation # x real_df = real_df.drop(labels="readmitted", axis=1) # axis 1 means columns real_x = pd.get_dummies(real_df) # one-hot encode, non-categorical variables will be left unchanged real_x_columns = real_x.columns # maybe for visualisation synthetic_df = synthetic_df.drop(labels="readmitted", axis=1) # axis 1 means columns synthetic_x = pd.get_dummies(synthetic_df) # non-categorical variables will be left unchanged synthetic_x_columns = synthetic_x.columns # maybe for visualisation