def train(save_path: str = 'models/model.h5') -> None:
    """Train intent classification for Dawn based on features extracted from BERT.

    Loads a frozen pre-trained BERT encoder from checkpoint, attaches a small
    dense classification head, and trains it on data from ``load_data``,
    checkpointing the best model (by validation loss) to ``save_path``.

    Args:
        save_path (str): path to save the model.
    """
    # Fix: the annotation was ``NoReturn``, which means "never returns"
    # (e.g. always raises); this function returns normally, so ``None``.
    CONFIG_PATH = 'models/LargeBert/bert_config.json'
    CHECKPOINT_PATH = 'models/LargeBert/bert_model.ckpt'
    DICT_PATH = 'models/LargeBert/vocab.txt'

    # BERT is used purely as a frozen feature extractor; the last 4 encoder
    # layers are exposed as output features.
    model = load_trained_model_from_checkpoint(
        CONFIG_PATH,
        CHECKPOINT_PATH,
        training=False,
        trainable=False,
        output_layer_num=4,
    )

    # Pool encoder outputs over the sequence axis, then a small dense head.
    pool_layer = MaskedGlobalMaxPool1D(name='Pooling')(
        model.get_layer(name='Encoder-Output').output)
    out = Dense(32, activation='relu', name='Pre-Output')(pool_layer)
    output = Dense(units=N_CLASS, activation='softmax', name='Final-Output')(out)
    model = Model(inputs=[
        model.get_layer(name='Input-Token').input,
        model.get_layer(name='Input-Segment').input
    ], outputs=output)
    model.summary(line_length=120)

    opt = Adam(lr=0.0005, beta_1=0.9, beta_2=0.999, decay=0.01)
    model.compile(opt, loss='categorical_crossentropy', metrics=['acc'])
    # Keep only the best model (lowest validation loss) on disk.
    checkpoint = ModelCheckpoint(save_path, verbose=1, monitor='val_loss',
                                 save_best_only=True, mode='auto')
    x_tokens, x_segments, y_in = load_data(dict_path=DICT_PATH)
    model.fit([x_tokens, x_segments], y_in,
              epochs=300, batch_size=32,
              callbacks=[checkpoint], validation_split=0.3, shuffle=True)
def main():
    """Cross-validated logistic-regression baseline.

    Positional CLI arguments: seed, n_folds. Prints the per-fold ROC-AUC
    and the average over all folds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("seed", type=int)
    parser.add_argument("n_folds", type=int)
    args = parser.parse_args()

    seed = args.seed
    n_folds = args.n_folds
    seed_everything(seed)

    X, y = load_data("data/bank-additional-full.csv")
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    total_score = 0
    for fold_idx, (train_idx, valid_idx) in enumerate(splitter.split(X, y)):
        print("Fold:", fold_idx, flush=True)
        X_train, y_train = X.iloc[train_idx, :], y[train_idx]
        X_valid, y_valid = X.iloc[valid_idx, :], y[valid_idx]

        # Hyper-parameters come from a prior tuning run; kept verbatim.
        clf = LogisticRegression(tol=0.014562448890118148,
                                 C=9.256722875165577,
                                 fit_intercept=True,
                                 class_weight="balanced",
                                 solver="newton-cg",
                                 max_iter=120,
                                 warm_start=True,
                                 random_state=seed)
        clf.fit(X_train, y_train)

        proba = clf.predict_proba(X_valid).astype(float)[:, 1]
        fold_score = roc_auc_score(list(y_valid), proba)
        total_score += fold_score
        print("logistic regression score:", fold_score, flush=True)

    print("average score:", total_score / n_folds, flush=True)
[unlabeled_pool, unlabeled_remained[:to_fill]]) unlabeled_remained = unlabeled_remained[to_fill:] def predict(self, X): X1, X2 = X[:, self.features1], X[:, self.features2] proba1 = self.model1.predict_proba(X1) proba2 = self.model2.predict_proba(X2) ensemble: np.ndarray = proba1 + proba2 return ensemble.argmax(axis=1) def score(self, X, y): return accuracy_score(y, self.predict(X)) if __name__ == '__main__': unlabeled_clinical_X, Ctr_X, Ctr_Y, Cval_X, Cval_Y, Ct_X, Ct_Y, unlabeled_genomic_X, Gtr_X, Gtr_Y, Gval_X, Gval_Y, Gt_X, Gt_Y = load_data( True) num_unlabeled_samples = len(unlabeled_genomic_X) num_features = Gtr_X.shape[1] unlabeled_y = np.ones(num_unlabeled_samples) * -1 Gtr_X = np.concatenate([Gtr_X, unlabeled_genomic_X]) Gtr_Y = np.concatenate([Gtr_Y, unlabeled_y]) features = set(range(0, num_features)) logger = init_logger(name='genomic_feature.log') best_score, best_features = 0, None for size in range(1, int(num_features / 2) + 1): for features1 in set(itertools.combinations(features, size)): features1 = set(features1) features2 = features - features1 features1 = np.array(list(features1), dtype=np.int) features2 = np.array(list(features2), dtype=np.int)
def test_answer():
    """Smoke test: data loads without error and ``func`` maps 4 to 5."""
    features, labels = load_data()
    assert func(4) == 5
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
from src.preprocess import load_data
from src.utility import seed_everything

warnings.filterwarnings("ignore")

# Module-level experiment configuration: a fixed seed and fold count shared
# by the hyper-parameter objective below.
SEED = 123
N_FOLDS = 5
seed_everything(SEED)

X, y = load_data("data/bank-additional-full.csv")
folds = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)


def objective(args):
    # Hyper-parameter search objective: cross-validated logistic regression
    # built from the candidate parameters in ``args``.
    # NOTE(review): the visible body ends right after computing y_pred —
    # the scoring/return portion of this function appears truncated in this
    # view; confirm against the full file.
    avg_score = 0
    for train_idx, valid_idx in folds.split(X, y):
        X_train, y_train = X.iloc[train_idx, :], y[train_idx]
        X_valid, y_valid = X.iloc[valid_idx, :], y[valid_idx]
        model = LogisticRegression(n_jobs=-1, random_state=SEED, **args)
        model.fit(X_train, y_train)
        # Probability of the positive class for ROC-AUC scoring.
        y_pred = model.predict_proba(X_valid).astype(float)[:, 1]
# Initialize layers activations = [X] for i in range(self.n_layers_ - 1): activations.append(np.empty((X.shape[0], layer_units[i + 1]))) # forward propagate self._forward_pass(activations, self.coefs_, self.intercepts_) y_pred = activations[-1] if self.n_outputs_ == 1: y_pred = y_pred.ravel() return self.label_binarizer.inverse_transform(y_pred) def validate_input(self, X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'], multi_output=True) if y.ndim == 2 and y.shape[1] == 1: y = column_or_1d(y, warn=True) classes = unique_labels(y) self.label_binarizer = LabelBinarizer() self.label_binarizer.fit(classes) y = self.label_binarizer.transform(y) return X, y if __name__ == '__main__': Ctr_X, Ctr_Y, Cval_X, Cval_Y, Ct_X, Ct_Y, Gtr_X, Gtr_Y, Gval_X, Gval_Y, Gt_X, Gt_Y = load_data() goamlp_ctr = GOAMultilayerPerceptron(N=100, x_val=Cval_X, y_val=Cval_Y, hidden_layer_sizes=[70], max_iter=5000, random_state=1) classify(goamlp_ctr, Ctr_X, Ctr_Y, Cval_X, Cval_Y, "GOAMLPClassifier", "clinical") goamlp_gtr = GOAMultilayerPerceptron(N=10000, x_val=Gval_X, y_val=Gval_Y, hidden_layer_sizes=[36], max_iter=50, random_state=1) classify(goamlp_gtr, Gtr_X, Gtr_Y, Gval_X, Gval_Y, "GOAMLPClassifier", "genetic")
def main():
    """Cross-validated XGBoost baseline with optional importance plot.

    Positional CLI arguments: seed, n_folds. With -o/--output_features,
    shows a bar chart of feature importances normalized to percentages.
    Prints the per-fold ROC-AUC and the average over all folds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("seed", type=int)
    parser.add_argument("n_folds", type=int)
    parser.add_argument('-o', '--output_features', action="store_true")
    args = parser.parse_args()

    seed = args.seed
    n_folds = args.n_folds
    seed_everything(seed)

    X, y = load_data("data/bank-additional-full.csv")
    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    total_score = 0
    # Running per-feature importance, accumulated (summed) across folds.
    importance = pd.DataFrame()
    importance["Feature"] = X.columns
    importance["Value"] = 0

    for fold_idx, (train_idx, valid_idx) in enumerate(splitter.split(X, y)):
        print("Fold:", fold_idx, flush=True)
        X_train, y_train = X.iloc[train_idx, :], y[train_idx]
        X_valid, y_valid = X.iloc[valid_idx, :], y[valid_idx]

        # Hyper-parameters come from a prior tuning run; kept verbatim.
        regressor = xgb.XGBRegressor(n_estimators=486,
                                     max_depth=23,
                                     learning_rate=0.014315933846251667,
                                     booster="gbtree",
                                     tree_method="exact",
                                     gamma=0.7581225878358416,
                                     subsample=0.9340339327920703,
                                     colsample_bytree=0.6940772015224637,
                                     colsample_bylevel=0.559247335020885,
                                     colsample_bynode=0.7962006061767392,
                                     reg_alpha=0.6394227535273009,
                                     reg_lambda=0.19510772446939947,
                                     scale_pos_weight=0.8349805523658489,
                                     objective="reg:squarederror",
                                     random_state=seed)
        regressor.fit(X_train, y_train)

        preds = regressor.predict(X_valid).astype(float)
        fold_score = roc_auc_score(list(y_valid), preds)
        total_score += fold_score
        print("xgboost score:", fold_score, flush=True)

        fold_importance = pd.DataFrame(zip(X.columns, regressor.feature_importances_),
                                       columns=["Feature", "Value"])
        importance = pd.concat(
            (importance, fold_importance)).groupby("Feature", as_index=False).sum()

    print("average score:", total_score / n_folds, flush=True)

    if args.output_features:
        # Normalize accumulated importances to percentages before plotting.
        importance["Value"] *= 100 / importance["Value"].sum()
        fig = plt.figure(figsize=(20, 20))
        fig.patch.set_facecolor("white")
        sns.set(style="whitegrid")
        sns.barplot(x="Value", y="Feature",
                    data=importance.sort_values(by="Value", ascending=False))
        plt.title("Feature importance (%)")
        plt.tight_layout()
        plt.show()