def get_cado_predictions(data_path='../../datasets/cado/train.csv',
                         test_path='../../datasets/cado/test.csv',
                         text_index=6,
                         label_start_index=7,
                         num_labels=12,
                         max_len=700):
    """Fit ML-kNN on the training file and predict labels for the test file.

    Rows are loaded with ``du.load_data``. The text of each row is taken from
    column ``text_index`` and the training labels from the ``num_labels``
    columns starting at ``label_start_index``.

    The tokenizer is built from the training and test texts together, which
    is what the previous implementation did as well; the texts are then
    converted to integer sequences and padded/truncated to ``max_len``.

    Compared with the previous version, the embedding matrix that was built
    but never used and the parsing of test labels that were never read have
    been removed; the returned values are unaffected.

    Args:
        data_path: Path of the training file.
        test_path: Path of the test file.
        text_index: Column index of the text field in a row.
        label_start_index: Column index of the first label.
        num_labels: Number of consecutive label columns.
        max_len: Length every token sequence is padded/truncated to.

    Returns:
        Tuple ``(y_pred, y_score)``: the results of ``MLkNN.predict`` and
        ``MLkNN.predict_proba`` on the test texts, each converted with
        ``.toarray()``.
    """
    train_rows = du.load_data(data_path)
    test_rows = du.load_data(test_path)

    label_stop_index = label_start_index + num_labels
    train_texts = [row[text_index] for row in train_rows]
    test_texts = [row[text_index] for row in test_rows]
    Y_train = np.array(
        [row[label_start_index:label_stop_index] for row in train_rows],
        dtype='int')

    # Train and test texts are tokenized and padded together so both splits
    # share one vocabulary and one sequence length; they are split again below.
    all_texts = train_texts + test_texts
    tokenizer = tokenize_data(all_texts)
    sequences = tokenizer.texts_to_sequences(all_texts)
    X = pad_sequences(sequences, maxlen=max_len, padding="post",
                      truncating="post", value=0)

    n_train = len(train_texts)
    X_train = X[:n_train, :]
    x_test = X[n_train:, :]

    classifier = MLkNN()
    classifier.fit(X_train, Y_train)

    y_pred = classifier.predict(x_test).toarray()
    y_score = classifier.predict_proba(x_test).toarray()
    return y_pred, y_score
def RecommendByMLKNN(train_data, train_data_y, test_data, test_data_y, recommendNum=5):
    """Produce label recommendations with the ML-kNN algorithm.

    An ``MLkNN`` model is fitted on the training data, using the number of
    label columns (``train_data_y.shape[1]``) as ``k``. The predicted
    probabilities for ``test_data`` are handed to
    ``DataProcessUtils.getListFromProbable`` together with the label ids
    ``1..n_labels`` and ``recommendNum``.

    Args:
        train_data: Training feature matrix.
        train_data_y: Training label matrix; its second dimension is the
            number of labels.
        test_data: Feature matrix to predict on.
        test_data_y: Ground truth for ``test_data``; returned unchanged.
        recommendNum: Forwarded to ``getListFromProbable`` (presumably the
            number of labels to recommend per sample - confirm in
            DataProcessUtils).

    Returns:
        A two-element list ``[recommendList, answerList]``.
    """
    label_count = train_data_y.shape[1]

    model = MLkNN(k=label_count)
    model.fit(train_data, train_data_y)

    # Convert the prediction result into a plain data array.
    dense_scores = model.predict_proba(test_data).todense()
    probabilities = numpy.asarray(dense_scores)

    candidate_labels = range(1, label_count + 1)
    recommendList = DataProcessUtils.getListFromProbable(
        probabilities, candidate_labels, recommendNum)
    answerList = test_data_y

    # Debug output, in the same order as before.
    for shown in (probabilities, test_data_y, recommendList, answerList):
        print(shown)

    return [recommendList, answerList]
def run():
    """Run the full pipeline: data, embeddings, regressor, ML-kNN, evaluation.

    Steps, in order:
      1. Parse command-line arguments and load the dataset.
      2. Fit the text preprocessing pipeline and transform train/test docs.
      3. Optionally train word2vec, then load the word-embedding tensor.
      4. Optionally train AttentionWalk, then read the node embeddings from
         ``args.embedding_path`` and build label embeddings for train/test.
      5. Train a neural regressor from documents to label embeddings.
      6. Fit ML-kNN from label embeddings to label sets.
      7. Evaluate ML-kNN both on regressor outputs and on the test label
         embeddings themselves.

    Side effects: may set ``CUDA_VISIBLE_DEVICES``, writes log records, and
    calls the project's training/evaluation helpers.
    """
    parser = get_arg_parser()
    cmd_args = parser.parse_args()

    if cmd_args.gpu is not None:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(cmd_args.gpu)
        gpunum = os.getenv('CUDA_VISIBLE_DEVICES')
        logging.info("GPU has been set to {}".format(gpunum))

    logging.info("Model used for the regression network: {}"
                 .format(cmd_args.model_name))

    # 1. Dataset retrieval
    # --------------------
    tab_printer(constants.Dataset)
    dataset = Dataset(nrows=constants.Dataset.nrows,
                      augment_labels=constants.Dataset.augment_labels,
                      top_n=constants.Dataset.top_n)

    # A space was missing between the two concatenated literals
    # ("pipelineusing"); fixed here.
    logging.info("Going to create vocabulary and fit a preprocessing pipeline "
                 "using {} samples. Settings will be listed below"
                 .format(len(dataset.X_train)))

    # 2. Preprocessing
    # -----------------
    tab_printer(constants.NLP)
    preprocessor = Preprocessing(dataset.X_train)

    # Preprocess documents
    X_train = preprocessor.transform_documents(dataset.X_train)
    X_test = preprocessor.transform_documents(dataset.X_test)

    # 3. Word embeddings with word2vec
    # --------------------------------
    # Train word2vec embeddings if train_word2vec option is selected
    if cmd_args.train_word2vec:
        utils.embeddings.main()
    weights = get_embedding_tensor(preprocessor)

    # 4. Node embeddings with AttentionWalk
    # -------------------------------------
    args = _generate_deepwalk_parameters(dataset.y_train_graph)
    if cmd_args.train_attentionwalk:
        train_attention_walk(args)

    # The first column of the embedding file is dropped.
    graph_embeddings = pd.read_csv(args.embedding_path).iloc[:, 1:].values

    # Get document representations using node embeddings
    y_embedded = _get_label_embeddings(dataset.y_train, graph_embeddings)
    y_test_embedded = _get_label_embeddings(dataset.y_test, graph_embeddings)

    # 5. Regressor Training
    # ---------------------
    # Once CUDA_VISIBLE_DEVICES is set, the visible GPUs are renumbered
    # starting at 0, so the variable's value is not a valid device ordinal in
    # general (e.g. "3" or "0,1"), and when it is unset the old code produced
    # the invalid string 'cuda:None'. Plain 'cuda' selects the current
    # (first visible) device.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    regressor_nn = NeuralNet(
        get_network_class(cmd_args.model_name),
        max_epochs=constants.NeuralNetworkTraining.epochs,
        lr=constants.NeuralNetworkTraining.learning_rate,
        batch_size=constants.NeuralNetworkTraining.batch_size,
        optimizer=torch.optim.Adam,
        criterion=torch.nn.MSELoss,
        module__output_dim=args.dimensions,
        module__embedding=weights,
        module__embedding_dim=constants.NLP.embedding_size,
        device=device,
        train_split=None,
    )

    # Train the regressor neural network
    regressor_nn.fit(X_train, y_embedded.astype(np.float32))

    # 6. Train Multi-label KNN algorithm
    # ----------------------------------
    tab_printer(constants.MLKNN)

    # Train multi-label KNN to turn label embeddings into label predictions
    classifier = MLkNN(k=constants.MLKNN.k, s=constants.MLKNN.s)
    classifier.fit(y_embedded, dataset.y_train)

    # 7. Evaluation
    # -------------
    # Label prediction with documents
    y_test_pred = regressor_nn.predict(X_test)
    preds = classifier.predict(y_test_pred)
    preds_raw = classifier.predict_proba(y_test_pred)

    # Label prediction with label embeddings
    preds_w_labels = classifier.predict(y_test_embedded)
    preds_w_labels_raw = classifier.predict_proba(y_test_embedded)

    # Log evaluation result with label embeddings
    eval_metrics_w_labels = evaluation.all_metrics(
        preds_w_labels.toarray(),
        dataset.y_test,
        yhat_raw=preds_w_labels_raw.toarray())
    logging.info(str(eval_metrics_w_labels))

    # Log evaluation result with documents
    report_evaluation(preds.toarray(),
                      dataset.y_test,
                      yhat_raw=preds_raw.toarray())
def mlknn_train_pred(k_list, df_train_x, df_train_y, df_test_x, df_test_y,
                     target_cols, NFOLDS=5):
    """Cross-validated Multilabel KNN with a per-fold search over ``k``.

    The features are z-score normalized (scaler fitted on the train data and
    applied to the test data). The train data is split into ``NFOLDS``
    multilabel-stratified folds. In every fold each ``k`` in ``k_list`` is
    fitted and scored with log loss on the validation part; the ``k`` with
    the lowest loss provides the out-of-fold predictions for that fold and
    the test predictions, which are averaged over all folds.

    Args:
        k_list: A list of "K" nearest neighbours to perform gridsearch on.
        df_train_x: train data with only phenotypic/morphological features -
            pandas dataframe.
        df_train_y: train data with only the MOA (Mechanism of actions)
            target labels - pandas dataframe.
        df_test_x: test data with only phenotypic/morphological features -
            pandas dataframe.
        df_test_y: test data with only the MOA (Mechanism of actions) target
            labels - pandas dataframe. Only its shape is used here.
        target_cols: A list of MOA (Mechanism of actions) target labels.
        NFOLDS: Number of cross-validation folds.

    Returns:
        oof_preds: Train out-of-fold predictions - pandas dataframe.
        test_preds: Test predictions - pandas dataframe.

    Raises:
        ValueError: If no ``k`` in ``k_list`` yields a validation loss below
            infinity (for example when ``k_list`` is empty).
    """
    sc = StandardScaler()
    df_train_x_scaled = pd.DataFrame(sc.fit_transform(df_train_x),
                                     columns=df_train_x.columns)
    df_test_x_scaled = pd.DataFrame(sc.transform(df_test_x),
                                    columns=df_test_x.columns)
    test_values = df_test_x_scaled.values

    oof_preds = pd.DataFrame(np.zeros(shape=df_train_y.shape),
                             columns=target_cols)
    test_preds = pd.DataFrame(np.zeros(shape=df_test_y.shape),
                              columns=target_cols)

    skf = MultilabelStratifiedKFold(n_splits=NFOLDS, shuffle=True,
                                    random_state=133)
    print('Execution time | Fold number | logloss | Best K |')
    for fn, (trn_idx, val_idx) in enumerate(
            skf.split(df_train_x_scaled, df_train_y)):
        start_time = time()

        # The fold indices are positional, so use iloc for features and
        # labels alike.
        X_train = df_train_x_scaled.iloc[trn_idx, :].values
        X_val = df_train_x_scaled.iloc[val_idx, :].values
        y_train = df_train_y.iloc[trn_idx, :].values
        y_val = df_train_y.iloc[val_idx, :].values

        best_k = 0
        best_loss = np.inf
        best_classifier = None
        for k_item in k_list:
            classifier = MLkNN(k=k_item)
            classifier.fit(X_train, y_train)
            val_preds = classifier.predict_proba(X_val).toarray()
            loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
            if loss < best_loss:
                best_loss = loss
                best_k = k_item
                # Keep the fitted model so the best k is not refitted on the
                # same data after the search.
                best_classifier = classifier
                oof_preds.iloc[val_idx, :] = val_preds

        if best_classifier is None:
            raise ValueError(
                'no value in k_list produced a finite validation loss')

        preds = best_classifier.predict_proba(test_values)
        test_preds += preds.toarray() / NFOLDS

        # Report the loss that belongs to best_k; the previous code printed
        # the loss of whichever k happened to be tried last.
        print('{}\t\t{}\t\t{:.5f}\t\t{}'.format(
            str(datetime.timedelta(seconds=time() - start_time))[:7], fn,
            best_loss, best_k))

    return oof_preds, test_preds
# Write the predictions into the submission frame and save it.
k = pd.DataFrame(predicts.todense())
ss[TARGET_COLS] = k
ss.to_csv(r"C:\Users\Sheeja Ayoob\Desktop\hacklive_NLP_sub7.csv", index=False)

# --------------------------------------------------------------------------------------------
# optimal threshold


def get_best_thresholds(true, preds):
    """Pick, per label column, the probability threshold with the best F1.

    For every column of ``preds`` the candidate thresholds 0.00, 0.01, ...,
    0.99 are tried; a prediction counts as positive when it is strictly
    greater than the threshold. On ties the lowest threshold wins.

    Args:
        true: 2-D array of ground-truth labels, indexed as ``true[:, idx]``.
        preds: 2-D array of predicted probabilities with the same column
            layout; its number of columns determines how many thresholds
            are returned (previously hard-coded to 25).

    Returns:
        List with one threshold per column of ``preds``.
    """
    thresholds = [i / 100 for i in range(100)]
    best_thresholds = []
    for idx in range(preds.shape[1]):
        f1_scores = [f1_score(true[:, idx], (preds[:, idx] > thresh) * 1)
                     for thresh in thresholds]
        best_thresh = thresholds[np.argmax(f1_scores)]
        best_thresholds.append(best_thresh)
    return best_thresholds


# Tune the thresholds on the validation split, then binarize it in place.
val_preds = mlknn_classifier.predict_proba(X_val_tfidf)
val_preds = val_preds.toarray()
best_thresholds = get_best_thresholds(y_val, val_preds)
for i, thresh in enumerate(best_thresholds):
    val_preds[:, i] = (val_preds[:, i] > thresh) * 1

# Bare expression: the value is only displayed in an interactive session.
f1_score(y_val, val_preds, average='micro')

# Apply the tuned thresholds to the test predictions.
# NOTE(review): unlike val_preds, preds_test is not converted with .toarray()
# and stays in whatever matrix type predict_proba returns - confirm that the
# column-wise assignment below and any later use expect that type.
preds_test = mlknn_classifier.predict_proba(X_test1_tfidf)
for i, thresh in enumerate(best_thresholds):
    preds_test[:, i] = (preds_test[:, i] > thresh) * 1