def acc(net: NeuralNet, ds: Dataset, y: torch.Tensor) -> float:
    """Score the predictions of ``net`` on ``ds`` against the targets ``y``.

    The network's ``predict`` output for ``ds`` is handed, together with
    ``y``, to ``acc_as_metric``; whatever that helper returns is returned
    unchanged.

    Args:
        net: Network whose ``predict`` method is called on ``ds``.
        ds: Dataset passed to ``net.predict``.
        y: Reference targets forwarded as the second argument of
            ``acc_as_metric``.

    Returns:
        The value produced by ``acc_as_metric`` (annotated as ``float``).
    """
    return acc_as_metric(net.predict(ds), y)
def run():
    """Run the full training and evaluation pipeline from the command line.

    Steps, in order:

    1. Parse the command-line arguments and, when ``--gpu`` is given, export
       it as ``CUDA_VISIBLE_DEVICES``.
    2. Load the dataset and fit the text preprocessing pipeline on the
       training documents.
    3. Optionally train word2vec embeddings, then load the embedding tensor.
    4. Optionally train AttentionWalk, then read the node embeddings from
       ``args.embedding_path`` and embed the train/test labels.
    5. Fit the regression network that maps documents to label embeddings.
    6. Fit a multi-label kNN that maps label embeddings back to labels.
    7. Log evaluation metrics for predictions made from the true label
       embeddings and report the evaluation of predictions made from the
       documents.

    Returns:
        None. Results are emitted through ``logging`` and
        ``report_evaluation``.
    """
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
    # Report the effective value, which may also come from the environment.
    gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
    logging.info("GPU has been set to %s", gpunum)
    logging.info("Model used for the regression network: %s",
                 cmd_args.model_name)

    # 1. Dataset retrieval
    # --------------------
    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)
    # The trailing space after "pipeline" matters: adjacent literals are
    # concatenated verbatim.
    logging.info("Going to create vocabulary and fit a preprocessing pipeline "
                 "using %s samples. Settings will be listed below",
                 len(dataset.X_train))

    # 2. Preprocessing
    # -----------------
    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------
    # Train word2vec embeddings if train_word2vec option is selected
    if cmd_args.train_word2vec:
        utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk:
        train_attention_walk(args)

    # The first CSV column is dropped; the remaining columns are used as the
    # embedding matrix.
    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values

    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------
    # CUDA_VISIBLE_DEVICES masks the GPUs and renumbers the visible ones from
    # 0 inside this process, so the physical id must not be reused as the
    # torch device ordinal ('cuda:3' is invalid when only GPU 3 is visible,
    # and an unset variable would yield 'cuda:None'). Plain 'cuda' selects
    # the current visible device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,

        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,

        device=device,
        train_split=None,
    )

    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train Multi-label KNN algorithm
    # ----------------------------------
    tab_printer(constants.MLKNN)

    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------
    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation.all_metrics(
        preds_w_labels.toarray(),
        dataset.y_test,
        yhat_raw=preds_w_labels_raw.toarray(),
    )
    logging.info("%s", eval_metrics_w_labels)

    # Log evaluation result with documents
    report_evaluation(preds.toarray(),
                      dataset.y_test,
                      yhat_raw=preds_raw.toarray())
#%% Fit the model io = net.fit(WD) # Save model net.save_params(f_params='models/baselineNN.pkl') #%% Or load it from disk net.initialize() net.load_params(f_params="models/baselineNN.pkl") #%% Predict on train # Out yhat = net.predict(WD) # Classes yhatc = yhat.argmax(axis=1) # True labels ytrue = WD.y (ytrue == yhatc).sum() / yhatc.size # Classification report from sklearn import metrics print( metrics.classification_report(ytrue, yhatc, target_names=list(category_map.values()))) metrics.confusion_matrix(ytrue, yhatc) #%% Predict on test