def main():
    data_reader = DataReader()
    df = data_reader.get_all_data()

    # random split of data
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    # set up train data
    train_tokens, train_y_raw = tokenize(train_x_raw,
                                         train_y_raw,
                                         save_missing_feature_as_string=False,
                                         remove_empty=True)
    train_x, train_y, feature_names = tokens_to_bagofwords(train_tokens, train_y_raw)

    # train model
    model = _get_nn_model_bag_of_words_simple_v2(train_x,
                                                 train_y,
                                                 data_reader.get_region_labels()['Code'],
                                                 epochs=50,
                                                 batch_size=64)

    # set up test data
    test_tokens, test_y_raw = tokenize(test_x_raw,
                                       test_y_raw,
                                       save_missing_feature_as_string=False,
                                       remove_empty=True)
    test_x, test_y, _ = tokens_to_bagofwords(test_tokens, test_y_raw,
                                             feature_names=feature_names)

    # evaluate model
    evaluate_model_nn(model, test_x, test_y, plot_roc=False)

    # ABOVE IS BASIC SUPERVISED LEARNING TO GENERATE MODEL
    #################################################
    # BELOW IS SEMI-SUPERVISED SELF-TRAINING TO FURTHER TRAIN MODEL

    # read unlabelled data and format it to be the same as labelled data
    unlabelled_df = data_reader.get_east_dir()
    unlabelled_df = normalize_east_dir_df(unlabelled_df)

    # set up unlabelled data as semi-supervised data
    # (the unlabelled data has no targets, so `_` is passed as a placeholder)
    tokens, _ = tokenize(unlabelled_df,
                         _,
                         save_missing_feature_as_string=False,
                         remove_empty=True)
    semi_x_base, _, _ = tokens_to_bagofwords(tokens, _, feature_names=feature_names)

    # confidence threshold to train on
    train_threshold = 0.8
    semi_train_amount = 30

    # SELF-TRAIN MANY TIMES
    for i in range(semi_train_amount):
        # get predictions on unlabelled data
        pred = model.model.predict(semi_x_base)

        # convert probabilities to one-hot encoded output
        semi_y = np.zeros_like(pred)
        semi_y[np.arange(len(pred)), pred.argmax(1)] = 1

        # filter semi_x and semi_y to only include predictions above train_threshold
        semi_y = semi_y[pred.max(axis=1) > train_threshold]
        semi_x = semi_x_base[pred.max(axis=1) > train_threshold]

        # train on semi-supervised data
        model.model.fit(semi_x, semi_y, batch_size=64, epochs=100)

        # retrain on original train data
        model.model.fit(train_x, model.encoder.transform(train_y), batch_size=32, epochs=10)

        # evaluate model
        evaluate_model_nn(model, test_x, test_y, plot_roc=False)

        # remove semi data used in this iteration from future iterations
        semi_x_base = semi_x_base[~(pred.max(axis=1) > train_threshold)]
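# A minimal standalone sketch (toy numbers, not part of the pipeline above) of the
# pseudo-labelling step used in the self-training loop: softmax outputs are turned
# into one-hot labels, and only predictions whose confidence exceeds the threshold
# are kept for the next round of fitting.
import numpy as np

pred = np.array([[0.10, 0.70, 0.20],   # max prob 0.70 -> below threshold, dropped
                 [0.05, 0.05, 0.90]])  # max prob 0.90 -> above threshold, kept
train_threshold = 0.8
semi_y = np.zeros_like(pred)
semi_y[np.arange(len(pred)), pred.argmax(1)] = 1    # one-hot of the argmax class
mask = pred.max(axis=1) > train_threshold           # confidence filter
print(semi_y[mask])   # [[0. 0. 1.]] -- only the confident pseudo-label is retained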
def eval():
    # load data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    # get bag of words
    train_x, train_y, test_x, test_y = bag_of_words_full_no_empty(
        train_x_raw, train_y_raw, test_x_raw, test_y_raw)

    # train logistic regression, random forest and naive bayes on bag of words,
    # and report accuracy, precision, recall and f1 score
    log_reg_bow = _get_multiclass_logistic_regression_model_bag_of_words_full(
        train_x, train_y)
    eval_model(log_reg_bow, test_x, test_y)
    evaluate_model(log_reg_bow, test_x, test_y, plot_roc=False)

    rand_for_bow = _get_random_forest_model_bag_of_words_full(train_x, train_y)
    eval_model(rand_for_bow, test_x, test_y)
    evaluate_model(rand_for_bow, test_x, test_y, plot_roc=False)

    nb_bow = _get_naive_bayes_model_bag_of_words_full(train_x, train_y)
    eval_model(nb_bow, test_x.A, test_y)
    evaluate_model(nb_bow, test_x.A, test_y, plot_roc=False)

    # get tfidf
    train_x, train_y, test_x, test_y = tfidf_no_empty(
        train_x_raw, train_y_raw, test_x_raw, test_y_raw)

    # train logistic regression, random forest and naive bayes on tfidf,
    # and report accuracy, precision, recall and f1 score
    log_reg_bow = _get_multiclass_logistic_regression_model_tfidf(train_x, train_y)
    eval_model(log_reg_bow, test_x, test_y)
    evaluate_model(log_reg_bow, test_x, test_y, plot_roc=False)

    rand_for_bow = _get_random_forest_model_tfidf(train_x, train_y)
    print("random forest, tfidf")
    eval_model(rand_for_bow, test_x, test_y)
    evaluate_model(rand_for_bow, test_x, test_y, plot_roc=False)

    nb_bow = _get_naive_bayes_model_tfidf(train_x.A, train_y)
    print("naive bayes, tfidf")
    # use the dense matrix (.A) to match the naive bayes training input
    eval_model(nb_bow, test_x.A, test_y)
    evaluate_model(nb_bow, test_x.A, test_y, plot_roc=False)

    # get doc2vec
    train_x, train_y, test_x, test_y = doc2vec_simple(
        train_x_raw, train_y_raw, test_x_raw, test_y_raw)

    # train logistic regression, random forest and naive bayes on doc2vec,
    # and report accuracy, precision, recall and f1 score
    log_reg_bow = _get_multiclass_logistic_regression_model_doc2vec_simple(
        train_x, train_y)
    print("logistic regression, doc2vec")
    eval_model(log_reg_bow, test_x, test_y)
    evaluate_model(log_reg_bow, test_x, test_y, plot_roc=False)

    rand_for_bow = _get_random_forest_model_doc2vec_simple(train_x, train_y)
    print("random forest, doc2vec")
    eval_model(rand_for_bow, test_x, test_y)
    evaluate_model(rand_for_bow, test_x, test_y, plot_roc=False)

    nb_bow = _get_naive_bayes_model_doc2vec_simple(train_x, train_y)
    print("naive bayes, doc2vec")
    eval_model(nb_bow, test_x, test_y)
    evaluate_model(nb_bow, test_x, test_y, plot_roc=False)
def eval_ae():
    from Models.logistic_regression import MultiClassLogisticRegression
    from Models.random_forest import RandomForest
    from Models.naive_bayes import NaiveBayes
    from Models.svm import SVM

    # load data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(df)
    train_x, train_y, val_x, val_y, test_x, test_y = bag_of_words_full_no_empty_val_no_num_no_short_no_repeat(
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw)

    # train an autoencoder with a 4096-dimensional bottleneck
    encoder = get_encoder(train_x, test_x, 4096)

    # use the autoencoder to encode the train, validate and test sets
    encoded_train = encoder.predict(train_x)
    encoded_test = encoder.predict(test_x)
    encoded_val = encoder.predict(val_x)

    # train the neural network model and calculate the precision, recall, f1 score, and accuracy
    print('neural net ae')
    model = _get_nn_model_bag_of_words_simple_scratch(
        encoded_train, train_y, encoded_val, val_y,
        data_reader.get_region_labels()['Code'], epochs=100, batch_size=256)
    eval_nn(model, encoded_test, test_y)
    evaluate_model_nn(model, encoded_test, test_y)

    # train the logistic regression model and calculate the precision, recall, f1 score, and accuracy
    print('logistic regression ae')
    model = MultiClassLogisticRegression()
    model.train(encoded_train, train_y)
    # wrap the trained model in a throwaway object exposing a .model attribute for eval_model
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)

    # train the random forest model and calculate the precision, recall, f1 score, and accuracy
    print('random forest ae')
    model = RandomForest()
    model.train(encoded_train, train_y)
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)

    # train the naive bayes model and calculate the precision, recall, f1 score, and accuracy
    print('naive bayes ae')
    model = NaiveBayes()
    model.train(encoded_train, train_y)
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)
def main():
    data_reader = DataReader()
    df = data_reader.get_all_data()
    y = df['ON WG IDENTIFIER'].values
    df.drop(['src_file', 'ON WG IDENTIFIER'], axis=1, inplace=True)
    tokens, y = tokenize_columns(df,
                                 y,
                                 save_missing_feature_as_string=False,
                                 remove_repeats=True,
                                 remove_num=True)
    x, y, feature_names = tokens_to_bagofwords(tokens, y)

    # tfidf representation of the joined tokens
    corpus = list(map(' '.join, tokens[:]))
    vectorizer = TfidfVectorizer()
    mat = vectorizer.fit_transform(corpus)

    # def __init__(self, num_clusters, feature_names, train_x, train_y):
    b = Birch_(10, feature_names, mat, y)
    print('d b score: ' + str(b.get_db_idx_score()))
    print('sil score: ' + str(b.get_sil_score()))

    h = Hierarchial(10, feature_names, mat, y)
    print('d b score: ' + str(h.get_db_idx_score()))
    print('sil score: ' + str(h.get_sil_score()))
import re

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from data_reader import DataReader
from data_manipulator import *

# clean for BioASQ
bioclean = lambda t: re.sub(
    '[.,?;*!%^&_+():-\[\]{}]', '',
    t.replace('"', '').replace('/', '').replace('\\', '').replace("'", '').
    strip().lower()).split()

tokens = bioclean('This is a sentence w/o you!')
print(tokens)

data_reader = DataReader()
df = data_reader.get_all_data()
df = df[[
    'RIS PROCEDURE DESCRIPTION', 'PACS STUDY DESCRIPTION', 'ON WG IDENTIFIER'
]]

# drop missing rows
df = df.dropna()
df['text'] = df[['RIS PROCEDURE DESCRIPTION',
                 'PACS STUDY DESCRIPTION']].apply(lambda x: ' '.join(x), axis=1)
df = df.drop(['RIS PROCEDURE DESCRIPTION', 'PACS STUDY DESCRIPTION'], axis=1)
df = df.rename(columns={'ON WG IDENTIFIER': 'target'}).values
targets = df[:, 0]
words = df[:, 1]

vectorizer = CountVectorizer(tokenizer=bioclean)
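# A minimal usage sketch (not in the original script) showing how the
# bioclean-backed vectorizer above could be applied to the cleaned free-text
# column; `words` is the array of combined description strings built above.
term_doc = vectorizer.fit_transform(words)    # documents x vocabulary counts
print(term_doc.shape)
print(vectorizer.get_feature_names()[:20])    # a few of the learned tokens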
def original_model():
    if not os.path.isfile('demo_nn.h5'):
        data_reader = DataReader()
        df = data_reader.get_all_data()
        top_x_predictions = 10

        # Split data
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(df)

        # get bag of words
        train_x, train_y, val_x, val_y, test_x, test_y, feature_names = get_bag_of_words(
            train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw)

        # get all labels
        labels = data_reader.get_region_labels()['Code']

        # train neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.train(val_x, val_y)

        # save neural net
        model.model.save('demo_nn.h5')
    else:
        # load neural net; the placeholder shape/labels are replaced by the loaded
        # model (note that feature_names and top_x_predictions are only set on the
        # training path above)
        model = MultiClassNNScratch(1, np.array([]), epochs=150, batch_size=1024)
        model.set_train_data(1, 1)
        model.model = load_model(
            'demo_nn.h5', custom_objects={'top_3_accuracy': top_3_accuracy})

    # from IPython import embed
    # embed()

    regex_string = r'[a-zA-Z0-9]+'
    while True:
        stdin = input("Enter all information:")
        if stdin == 'quit':
            break
        try:
            # a number changes how many predictions are returned
            top_x_predictions = int(stdin)
            print("Will return top " + str(top_x_predictions) + " predictions")
        except ValueError:
            # anything else is treated as a query and classified
            tokenizer = RegexpTokenizer(regex_string)
            tokens = tokenizer.tokenize(stdin.lower())
            vectorizer = CountVectorizer(tokenizer=lambda x: x,
                                         lowercase=False,
                                         strip_accents=False,
                                         vocabulary=feature_names)
            model_input = vectorizer.fit_transform([tokens])
            pred = model.model.predict(model_input)

            # Top X predictions
            rows = []
            for i in range(top_x_predictions):
                one_hot_pred = np.zeros_like(pred)
                one_hot_pred[np.arange(len(pred)),
                             (np.argpartition(pred[0], -top_x_predictions)
                              [-top_x_predictions:][i])] = 1
                id = model.encoder.inverse_transform(one_hot_pred)[0][0]
                row = data_reader.regional_df[data_reader.regional_df['Code'] == id]
                row['Prediction Confidence'] = (pred[0][(np.argpartition(
                    pred[0], -top_x_predictions)[-top_x_predictions:][i])]) * 100
                rows.append(row)
            rows = (pd.concat(rows)).sort_values('Prediction Confidence',
                                                 ascending=False)
            print(rows)
def main():
    config = tf.ConfigProto(device_count={'GPU': 0})
    # config.gpu_options.per_process_gpu_memory_fraction = 0.64
    set_session(tf.Session(config=config))

    data_reader = DataReader()
    df = data_reader.get_all_data()

    # Split data
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(df)

    # get bag of words
    train_x, train_y, val_x, val_y, test_x, test_y, feature_names = get_bag_of_words(
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw)

    # get all labels
    labels = data_reader.get_region_labels()['Code']

    if not os.path.isfile('demo_nn.h5'):
        # train neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.train(val_x, val_y)

        # save neural net
        model.model.save('demo_nn.h5')
    else:
        # load neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.model = load_model(
            'demo_nn.h5', custom_objects={'top_3_accuracy': top_3_accuracy})

    # from IPython import embed
    # embed()

    regex_string = r'[a-zA-Z0-9]+'
    while True:
        stdin = input("Enter all information:")
        if stdin == 'quit':
            break
        tokenizer = RegexpTokenizer(regex_string)
        tokens = tokenizer.tokenize(stdin.lower())
        vectorizer = CountVectorizer(tokenizer=lambda x: x,
                                     lowercase=False,
                                     strip_accents=False,
                                     vocabulary=feature_names)
        model_input = vectorizer.fit_transform([tokens])
        pred = model.model.predict(model_input)
        one_hot_pred = np.zeros_like(pred)
        one_hot_pred[np.arange(len(pred)), pred.argmax(1)] = 1
        id = model.encoder.inverse_transform(one_hot_pred)[0][0]
        row = data_reader.regional_df[data_reader.regional_df['Code'] == id]
        print(row)
def siamese_fewshot(train, snn_seen_score, snn_unseen_score, knn_score,
                    nn_score, limit, unseen_num_class):
    # get the data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    tokens, train_y_raw = tokenize_columns(train_x_raw,
                                           train_y_raw,
                                           save_missing_feature_as_string=False,
                                           remove_empty=True,
                                           remove_num=True,
                                           remove_repeats=True,
                                           remove_short=True)
    train_x, train_y, feature_names = tokens_to_bagofwords(
        tokens, train_y_raw, vectorizer_class=CountVectorizer)

    tokens, test_y_raw = tokenize_columns(test_x_raw,
                                          test_y_raw,
                                          save_missing_feature_as_string=False,
                                          remove_empty=True,
                                          remove_num=True,
                                          remove_repeats=True,
                                          remove_short=True)
    test_x, test_y, _ = tokens_to_bagofwords(tokens,
                                             test_y_raw,
                                             vectorizer_class=CountVectorizer,
                                             feature_names=feature_names)

    # encode the labels into smaller integers rather than large integers
    le = preprocessing.LabelEncoder()
    le.fit(np.concatenate((test_y.values, train_y.values)))
    train_y = le.transform(train_y.values)
    test_y = le.transform(test_y.values)

    # combine train and test
    x = np.concatenate((train_x.todense(), test_x.todense()))
    y = np.concatenate((train_y, test_y))

    # delete unwanted variables
    del train_x, test_x, train_y, test_y

    # create pairwise dataset
    train_paired, test_paired, \
        train_paired_target, test_paired_target, \
        labels_dict_train, labels_dict_test, \
        uc_support_set, uc_test_samples, uc_test_labels = create_pairwise_dataset(
            x, y, limit=limit, unseen_num_class=unseen_num_class)

    # set the data as separate numpy arrays to be passed into the model
    pair1_train = []
    pair2_train = []
    for i in range(len(train_paired)):
        pair1_train.append(np.array(train_paired[i][0]))
        pair2_train.append(np.array(train_paired[i][1]))
    pair1_train = np.array(pair1_train)
    pair2_train = np.array(pair2_train)
    train_paired_target = np.array(train_paired_target)

    # shuffle in unison
    pair1_train, pair2_train, train_paired_target = unison_shuffled_copies(
        pair1_train, pair2_train, train_paired_target)

    pair1_test = []
    pair2_test = []
    for i in range(len(test_paired)):
        pair1_test.append(np.array(test_paired[i][0]))
        pair2_test.append(np.array(test_paired[i][1]))
    pair1_test = np.array(pair1_test)
    pair2_test = np.array(pair2_test)
    test_paired_target = np.array(test_paired_target)

    # shuffle in unison
    pair1_test, pair2_test, test_paired_target = unison_shuffled_copies(
        pair1_test, pair2_test, test_paired_target)

    feature_size = pair1_train.shape[-1]
    pair1_train = pair1_train.reshape(-1, feature_size)
    pair2_train = pair2_train.reshape(-1, feature_size)
    pair1_test = pair1_test.reshape(-1, feature_size)
    pair2_test = pair2_test.reshape(-1, feature_size)
    input_shape = pair1_train.shape[1]

    # train the model from scratch
    if train:
        # siamese neural network structure
        siamese_net = SiameseNN(input_shape)
        siamese_net.train([pair1_train, pair2_train], train_paired_target,
                          [pair1_test, pair2_test], test_paired_target)
        siamese_net.save('siamese-' + str(limit))
    # otherwise load the pre-trained model
    else:
        siamese_net = SiameseNN(input_shape)
        siamese_net.load('siamese-' + str(limit))

    #========================================================================#
    #====================== Prepare data for testing =======================#
    #========================================================================#
    # support set for SNN testing
    support_set = {}
    counter = 0
    for labelA in sorted(labels_dict_train):
        if counter > limit:
            break
        else:
            support_set[labelA] = labels_dict_train[labelA]
            counter += 1

    # training data for kNN and neural network
    x_train = []
    y_train = []
    counter = 0
    for label in sorted(labels_dict_train):
        if counter > limit:
            break
        samples = labels_dict_train[label]
        for s in samples:
            x_train.append(s)
            y_train.append(label)
        counter += 1
    x_train = np.array(x_train)
    x_train = x_train.reshape(-1, x_train.shape[-1])
    y_train = np.array(y_train).reshape(-1, 1)

    # testing data for kNN and neural network
    test_samples = []
    test_labels = []
    counter = 0
    for labelA in sorted(labels_dict_test):
        if counter > limit:
            break
        else:
            samples = labels_dict_test[labelA]
            idxs = np.random.permutation(len(samples))
            for k in idxs:
                test_samples.append(samples[k])
                test_labels.append(labelA)
            counter += 1
    test_samples = np.array(test_samples)

    #========================================================================#
    #================ Evaluation for SNN on seen classes ===================#
    #========================================================================#
    if snn_seen_score:
        print('=================================================')
        print('Running test on seen classes with Siamese NN ....')
        # run testing in a non-parametric way
        score = siamese_net.score_non_parametric(test_samples, test_labels,
                                                 support_set)
        print('Siamese NN seen class accuracy with {} classes: {}%'.format(
            limit, round(score, 1)))

    #========================================================================#
    #=============== Evaluation for SNN on unseen classes ==================#
    #========================================================================#
    if snn_unseen_score:
        print('=================================================')
        print('Running test on unseen classes with Siamese NN ....')
        # run testing in a non-parametric way
        score = siamese_net.score_non_parametric(uc_test_samples,
                                                 uc_test_labels,
                                                 uc_support_set)
        print('Siamese NN {} unseen class accuracy: {}%'.format(
            unseen_num_class, round(score, 1)))

    #========================================================================#
    #=================== Evaluation with baseline kNN ======================#
    #========================================================================#
    if knn_score:
        print('=================================================')
        print('Running test on kNN algorithm ....')
        test_samples = test_samples.reshape(-1, test_samples.shape[-1])
        test_labels = np.array(test_labels).reshape(-1, 1)
        knn = KNN(n_neighbors=10)
        knn.train(x_train, y_train)
        score = knn.score(test_samples, test_labels) * 100
        print('kNN accuracy with {} classes: {}%'.format(limit, round(score, 1)))

    #========================================================================#
    #=================== Evaluation with Neural Network ====================#
    #========================================================================#
    if nn_score:
        print('=================================================')
        print('Running train and test on neural network ....')
        inputs = Input(shape=(feature_size, ), name="input")
        x = Dense(7750, activation="relu", name="dense1",
                  input_dim=feature_size)(inputs)
        output = Dense(limit, activation="softmax", name="output")(x)
        nn = Model(inputs, output)
        nn.compile(optimizer="adam",
                   loss='categorical_crossentropy',
                   metrics=['categorical_accuracy'])
        callbacks = [
            EarlyStopping(monitor='val_loss',
                          patience=3,
                          mode='min',
                          restore_best_weights=True)
        ]
        y_train_onehot = np_utils.to_categorical(y_train)
        test_samples = test_samples.reshape(-1, test_samples.shape[-1])
        test_labels = np.array(test_labels).reshape(-1, 1)
        test_labels_onehot = np_utils.to_categorical(test_labels)
        nn.fit(x_train,
               y_train_onehot,
               batch_size=256,
               epochs=100,
               verbose=0,
               callbacks=callbacks,
               validation_data=(test_samples, test_labels_onehot))
        # evaluate() returns (loss, accuracy); report the accuracy as a percentage
        score, acc = nn.evaluate(test_samples, test_labels_onehot)
        print('Neural network accuracy with {} classes: {}%'.format(
            limit, round(acc * 100, 1)))
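# unison_shuffled_copies is called above but not defined in this file; a minimal
# sketch, assuming it applies one shared random permutation to all of its array
# arguments (the conventional implementation of such a helper):
def unison_shuffled_copies(*arrays):
    # every array must have the same length along the first axis
    assert all(len(a) == len(arrays[0]) for a in arrays)
    p = np.random.permutation(len(arrays[0]))
    return tuple(a[p] for a in arrays)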
def eval_pub_med():
    from gensim.models.keyedvectors import KeyedVectors

    # Need to download the model from
    # http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
    # Load the pubmed model
    model = KeyedVectors.load_word2vec_format('wikipedia-pubmed-and-PMC-w2v.bin',
                                              binary=True)

    # Load data into train/validate/test sets
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(df)

    # For each tokenized document, run the model on each word and take the average
    # of the word vectors. If no words are vectorized by pubmed, append a zero vector.
    def embed_average(tokenized_docs):
        avg = []
        for item in tokenized_docs:
            words = []
            for word in item:
                if word in model.wv.vocab:
                    vec = model.get_vector(word)
                    words.append(vec)
            average = np.average(np.array(words), axis=0)
            # np.average returns a scalar nan when no words were embedded
            if type(average) == np.float64:
                print('****')
                print(average)
                avg.append(np.zeros(200))
            else:
                avg.append(list(average))
        return np.array(avg)

    # embed the train set
    tokens_train, train_y_raw = tokenize(train_x_raw,
                                         train_y_raw,
                                         save_missing_feature_as_string=False,
                                         remove_empty=True)
    pub_med_train = embed_average(tokens_train)

    # run the same for the validation set
    tokens_val, val_y_raw = tokenize(val_x_raw,
                                     val_y_raw,
                                     save_missing_feature_as_string=False,
                                     remove_empty=True)
    pub_med_val = embed_average(tokens_val)

    # run the same for the test set
    tokens_test, test_y_raw = tokenize(test_x_raw,
                                       test_y_raw,
                                       save_missing_feature_as_string=False,
                                       remove_empty=True)
    pub_med_test = embed_average(tokens_test)

    # train the neural network model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, nn")
    nn_model = _get_nn_model_bag_of_words_simple_scratch(
        pub_med_train, train_y_raw, pub_med_val, val_y_raw,
        data_reader.get_region_labels()['Code'], epochs=100, batch_size=256)
    eval_nn(nn_model, pub_med_test, test_y_raw)
    evaluate_model_nn(nn_model, pub_med_test, test_y_raw, plot_roc=False)

    # train the logistic regression model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, logistic regression")
    from Models.logistic_regression import MultiClassLogisticRegression
    log_reg = MultiClassLogisticRegression()
    log_reg.train(pub_med_train, train_y_raw)
    eval_model(log_reg, pub_med_test, test_y_raw)
    evaluate_model(log_reg, pub_med_test, test_y_raw, plot_roc=False)

    # train the random forest model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, random forest")
    from Models.random_forest import RandomForest
    rand_for = RandomForest()
    rand_for.train(pub_med_train, train_y_raw)
    eval_model(rand_for, pub_med_test, test_y_raw)
    evaluate_model(rand_for, pub_med_test, test_y_raw, plot_roc=False)

    # train the naive bayes model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, naive bayes")
    from Models.naive_bayes import NaiveBayes
    nb = NaiveBayes()
    nb.train(pub_med_train, train_y_raw)
    eval_model(nb, pub_med_test, test_y_raw)
    evaluate_model(nb, pub_med_test, test_y_raw, plot_roc=False)
def top_keywords_kmeans():
    # get data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)
    train_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    test_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)

    # identify ON WG IDENTIFIERs that occur infrequently
    min_samples = 5
    train_y_list = train_y_raw['ON WG IDENTIFIER'].values.tolist()
    unique_ids = list(set(train_y_list))
    small_clusters = list()
    for i in unique_ids:
        if train_y_list.count(i) < min_samples:
            small_clusters.append(i)
    train_x_raw = train_x_raw[~train_y_raw['ON WG IDENTIFIER'].isin(small_clusters)]
    train_y_raw = train_y_raw[~train_y_raw['ON WG IDENTIFIER'].isin(small_clusters)]
    num_clusters = len(unique_ids) - len(small_clusters)

    # append the ON WG IDENTIFIERs to the original documents
    train_y_raw = pd.concat([train_x_raw, train_y_raw], axis=1)
    test_y_raw = pd.concat([test_x_raw, test_y_raw], axis=1)

    # tokenize and subsample
    tokens_train, train_y_raw = tokenize_columns(
        train_x_raw,
        train_y_raw,
        regex_string=r'[a-zA-Z0-9]+',
        save_missing_feature_as_string=False,
        remove_short=True,
        remove_num=True,
        remove_empty=True)
    tokens_test, test_y_raw = tokenize_columns(
        test_x_raw,
        test_y_raw,
        regex_string=r'[a-zA-Z0-9]+',
        save_missing_feature_as_string=False,
        remove_short=True,
        remove_num=True,
        remove_empty=True)

    # get TF-IDF representation of the data
    feature_names = list()
    train_x = list()
    train_y = list()
    test_x = list()
    test_y = list()
    train_x, train_y, feature_names = tokens_to_bagofwords(
        tokens_train, train_y_raw, TfidfVectorizer)
    test_x, test_y, _ = tokens_to_bagofwords(tokens_test,
                                             test_y_raw,
                                             TfidfVectorizer,
                                             feature_names=feature_names)
    train_x = train_x.toarray()
    test_x = test_x.toarray()

    # run kmeans
    kmeans = Kmeans(num_clusters, feature_names, train_x, train_y, "tfidf")
    kmeans.eval()
    labels = kmeans.get_labels()

    # get the top 10 keywords for each cluster
    n_terms = 10

    # group by cluster and get the mean occurrence of each word
    df = pd.DataFrame(train_x).groupby(labels).mean()

    # iterate through each cluster and print the most frequently occurring words
    for i, r in df.iterrows():
        print('Cluster {}: '.format(i) +
              ','.join([str(feature_names[t]) for t in np.argsort(r)[-n_terms:]]))
def per_site_accuracy_increase():
    # load data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    all_tokens, _ = tokenize(df,
                             df,
                             save_missing_feature_as_string=False,
                             remove_empty=True)
    _, _, vocab = tokens_to_bagofwords(all_tokens, all_tokens)

    lst = []
    from random import shuffle

    # split data on source hospital and save to separate dataframes in a list
    for i in df['src_file'].unique():
        lst.append(df[df['src_file'] == i])

    from Models.neural_net import MultiClassNNScratch

    # save an empty neural network so we can quickly reset the network
    model = MultiClassNNScratch(
        (0, len(vocab)),
        np.array(data_reader.get_region_labels()['Code']),
        epochs=100,
        batch_size=256)
    model.model.save_weights('empty_model.h5')

    # run the evaluation 30 times
    for run in range(30):
        # shuffle the order of the sites
        shuffle(lst)

        # iterate over the size of the train set: train the model on sites 1..i and
        # test on sites i+1..len(lst). Print results to a file so we can easily
        # visualize them later; each of the 30 runs gets its own file.
        i = 1
        file = open("output_dir/" + randomword(7) + '.txt', "w")
        while i < len(lst):
            model.model.load_weights('empty_model.h5')
            train_set = lst[:i]
            test_set = lst[i:]

            test_x_raw, test_y_raw = get_x_y_split(pd.concat(test_set))
            test_tokens, test_y_raw = tokenize(
                test_x_raw,
                test_y_raw,
                save_missing_feature_as_string=False,
                remove_empty=True)
            test_x, test_y, _ = tokens_to_bagofwords(test_tokens,
                                                     test_y_raw,
                                                     feature_names=vocab)

            item = pd.concat(train_set)
            train_x_raw, train_y_raw, val_x_raw, val_y_raw = get_train_test_split(item)
            train_tokens, train_y_raw = tokenize(
                train_x_raw,
                train_y_raw,
                save_missing_feature_as_string=False,
                remove_empty=True)
            train_x, train_y, _ = tokens_to_bagofwords(train_tokens,
                                                       train_y_raw,
                                                       feature_names=vocab)

            val_tokens, val_y_raw = tokenize(
                val_x_raw,
                val_y_raw,
                save_missing_feature_as_string=False,
                remove_empty=True)
            val_x, val_y, _ = tokens_to_bagofwords(val_tokens,
                                                   val_y_raw,
                                                   feature_names=vocab)

            model.set_train_data(train_x, train_y)
            model.train(val_x, val_y)
            accuracy = evaluate_model_nn(model, test_x, test_y, plot_roc=False)
            file.write("%d, %d, %4.2f, %d\n" %
                       (len(train_set), len(test_set), accuracy, len(item)))
            i += 1
        file.close()
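# randomword is used above to name the per-run output files but is not defined in
# this file; a minimal sketch, assuming it simply returns a random lowercase
# string of the requested length:
import random
import string


def randomword(length):
    # e.g. randomword(7) -> 'qkzmwrt'
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))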
def main():
    # parse arguments
    parser = argparse.ArgumentParser(description='Run unsupervised methods',
                                     add_help=True)
    parser.add_argument("-m",
                        "--model",
                        action="store",
                        required=True,
                        dest="MODELS",
                        nargs='+',
                        choices=[
                            'all', 'kmeans', 'lda', 'dbscan', 'birch',
                            'hierarchical', 'gmm', 'meanshift', 'spectral',
                            'affinity'
                        ],
                        help="Run model")
    parser.add_argument(
        "-r",
        "--rep",
        action="store",
        required=False,
        dest="REP",
        choices=['bow', 'tfidf', 'doc2vec', 'pca'],
        help="Use bag of words (bow), tfidf, doc2vec, or PCA representations")
    parser.add_argument("--use-autoencoder",
                        action="store_true",
                        dest="USE_AUTOENCODER",
                        help="Use autoencoders to reduce representations")
    parser.add_argument("--use-doc2vec",
                        action="store_true",
                        dest="USE_DOC2VEC",
                        help="Use doc2vec representations")
    parser.add_argument(
        "--queries",
        action="store",
        dest="queries",
        nargs='+',
        help="Return closest neighbours for query words to test search querying capabilities")
    parser.add_argument(
        "--print-keywords",
        action="store_true",
        dest="PRINT_KEYWORDS",
        help="Use if you want to print keywords in each cluster")
    parser.add_argument("--find-optimal-k",
                        action="store_true",
                        dest="FIND_OPTIMAL_K",
                        help="Find optimal K")
    parser.add_argument("-s",
                        "--sample-size",
                        action="store",
                        required=False,
                        dest="SIZE",
                        help="Use smaller set")
    parser.add_argument("-d",
                        "--downsample-frac",
                        action="store",
                        required=False,
                        default=1.0,
                        dest="DOWNSAMPLE_FRAC",
                        type=float,
                        help="downsample fraction (0-1]")
    parser.add_argument(
        "--min-cluster-size",
        action="store",
        required=False,
        default=5,
        type=int,
        dest="MIN_CLUSTER_SIZE",
        help="Filter out any ON WG IDENTIFIER classes with fewer than MIN_CLUSTER_SIZE samples")
    parser.add_argument(
        "-n",
        "--num-clusters",
        action="store",
        required=False,
        default=1500,
        type=int,
        dest="NUM_CLUSTERS",
        help="Number of clusters for algorithms that require it")
    args = parser.parse_args()
    #print(args.MODELS)

    assert (not args.DOWNSAMPLE_FRAC
            or (args.DOWNSAMPLE_FRAC > 0.0 and args.DOWNSAMPLE_FRAC <= 1.0))

    # get data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    if args.SIZE:
        subset_df = df.sample(n=int(args.SIZE))
        train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(subset_df)
    elif args.DOWNSAMPLE_FRAC:
        subset_df = df.sample(frac=float(args.DOWNSAMPLE_FRAC))
        train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(subset_df)
    else:
        train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)
    #train_x_raw = pd.concat([train_x_raw, test_x_raw], axis=0)
    #train_y_raw = pd.concat([train_y_raw, test_y_raw], axis=0)
    train_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    test_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    print(train_x_raw.shape)

    # identify ON WG IDENTIFIERs that occur infrequently
    #print("MIN_CLUSTER_SIZE: " + str(args.MIN_CLUSTER_SIZE))
    min_samples = args.MIN_CLUSTER_SIZE
    train_y_list = train_y_raw['ON WG IDENTIFIER'].values.tolist()
    unique_ids = list(set(train_y_list))
    small_clusters = list()
    for i in unique_ids:
        if train_y_list.count(i) < min_samples:
            small_clusters.append(i)
    train_x_raw = train_x_raw[~train_y_raw['ON WG IDENTIFIER'].isin(small_clusters)]
    train_y_raw = train_y_raw[~train_y_raw['ON WG IDENTIFIER'].isin(small_clusters)]
    #print(train_y_raw['ON WG IDENTIFIER'])
    #print(len(unique_ids))
    num_clusters = len(unique_ids) - len(small_clusters)
    #print("NUM_CLUSTERS: " + str(num_clusters))
    print(train_x_raw.shape)

    # append the ON WG IDENTIFIERs to the original documents
    train_y_raw = pd.concat([train_x_raw, train_y_raw], axis=1)
    test_y_raw = pd.concat([test_x_raw, test_y_raw], axis=1)

    # tokenize and subsample
    tokens_train, train_y_raw = tokenize_columns(
        train_x_raw,
        train_y_raw,
        regex_string=r'[a-zA-Z0-9]+',
        save_missing_feature_as_string=False,
        remove_short=True,
        remove_num=True,
        remove_empty=True)
    tokens_test, test_y_raw = tokenize_columns(
        test_x_raw,
        test_y_raw,
        regex_string=r'[a-zA-Z0-9]+',
        save_missing_feature_as_string=False,
        remove_short=True,
        remove_num=True,
        remove_empty=True)
    #print("done tokenizing columns")
    print(train_x_raw.shape)

    # get representation of data
    feature_names = list()
    train_x = list()
    train_y = list()
    test_x = list()
    test_y = list()
    #print(test_x_raw.shape)
    if args.REP == "bow" or args.USE_AUTOENCODER:
        train_x, train_y, feature_names = tokens_to_bagofwords(
            tokens_train, train_y_raw, CountVectorizer)
        test_x, test_y, _ = tokens_to_bagofwords(tokens_test,
                                                 test_y_raw,
                                                 CountVectorizer,
                                                 feature_names=feature_names)
        train_x = train_x.toarray()
        test_x = test_x.toarray()
        #print("done converting to bag of words representation")
    elif args.REP == "tfidf":
        train_x, train_y, feature_names = tokens_to_bagofwords(
            tokens_train, train_y_raw, TfidfVectorizer)
        test_x, test_y, _ = tokens_to_bagofwords(tokens_test,
                                                 test_y_raw,
                                                 TfidfVectorizer,
                                                 feature_names=feature_names)
        print(train_x.shape)
        train_x = train_x.toarray()
        test_x = test_x.toarray()
        #print("done converting to tfidf representation")
    elif args.REP == "doc2vec":
        train_x, train_y, _ = tokens_to_doc2vec(tokens_train, train_y_raw)
        test_x, test_y, _ = tokens_to_doc2vec(tokens_test, test_y_raw)
        #print("done converting to doc2vec representation")
    elif args.REP == "pca":
        train_x, train_y, feature_names = tokens_to_bagofwords(
            tokens_train, train_y_raw, CountVectorizer)
        test_x, test_y, _ = tokens_to_bagofwords(tokens_test,
                                                 test_y_raw,
                                                 CountVectorizer,
                                                 feature_names=feature_names)
        # choose the number of components that explains 90% of the variance
        pca = PCA()
        pca.fit(train_x.toarray())
        var = np.cumsum(pca.explained_variance_ratio_)
        n_comp = np.argmax(var > .9) + 1

        # fit pca on the train set and project both sets onto it
        pca = PCA(n_components=n_comp)
        train_x = pca.fit_transform(train_x.toarray())
        test_x = pca.transform(test_x.toarray())

    VOCAB_SIZE = train_x.shape[1]

    if args.USE_AUTOENCODER:
        #print(int(len(data_reader.get_region_labels()['Code'])))
        # use an autoencoder with representation size REP_SIZE
        REP_SIZE = 100
        encoder = get_encoder(train_x, test_x, REP_SIZE)
        train_x = encoder.predict(train_x)
        test_x = encoder.predict(test_x)
        #print("done converting to autoencoder representation")

    # run models
    print("TRAIN_X SHAPE = " + str(train_x.shape) + ", VOCAB_SIZE = " +
          str(VOCAB_SIZE) + ", NUM_CLUSTERS = " + str(num_clusters) +
          ", MIN_CLUSTER_SIZE = " + str(args.MIN_CLUSTER_SIZE))

    if "kmeans" in args.MODELS or "all" in args.MODELS:
        kmeans = Kmeans(num_clusters, feature_names, train_x, train_y, args.REP)
        kmeans.eval()
        labels = kmeans.get_labels()

        # print results
        print("kmeans, " + args.REP + ", " + str(args.DOWNSAMPLE_FRAC) + ", " +
              str(kmeans.get_sil_score()) + ", " + str(kmeans.get_db_idx_score()))
        if args.FIND_OPTIMAL_K:
            find_optimal_clusters(2000, feature_names, train_x, train_y, args.REP)
        #plot_cluster_size_frequency(train_x, labels, num_clusters)

        # example queries
        print("getting nearest: ")
        if args.queries:
            for q in args.queries:
                kmeans.get_nearest_neighbours(str(q))

        if args.PRINT_KEYWORDS:
            # get top keywords for clusters
            print("getting top 10 keywords for each cluster: ")
            get_top_keywords(train_x, labels, feature_names, 10)
        '''
        # plot 50 random clusters with t-SNE
        plt.figure(figsize=(10, 7))
        fig, ax = plt.subplots()
        print("number of unique labels: " + str(len(np.unique(labels))))
        num_clusters_to_plot = 50
        tsne = TSNE(n_components=2, verbose=1)
        random_clusters = random.sample(range(1, num_clusters), num_clusters_to_plot)
        reduced_data = tsne.fit_transform(train_x.todense())
        cmap = plt.cm.get_cmap('rainbow', num_clusters_to_plot)
        for i in range(num_clusters_to_plot):
            l = random_clusters[i]
            print("cluster " + str(l))
            indices = np.where(labels == l)
            col = cmap(i)
            cluster_reduced_data = reduced_data[indices[0]]
            print(cluster_reduced_data.shape)
            plt.scatter(cluster_reduced_data[:, 0], cluster_reduced_data[:, 1], color=col)
        plt.savefig('kmeans_' + args.REP + '_' + str(num_clusters_to_plot) + '.tsne.png')
        '''

    if "lda" in args.MODELS or "all" in args.MODELS:
        # run lda
        lda = Lda(train_x_raw, train_y_raw, 1500, passes=15)
        lda.train()
        print("finished running lda")

    if "dbscan" in args.MODELS or "all" in args.MODELS:
        # run dbscan
        dbs = DBscan(num_clusters, feature_names, train_x, train_y, args.REP)
        dbs.eval()
        print("dbscan, " + args.REP + ", " + str(dbs.get_sil_score()) + ", " +
              str(dbs.get_db_idx_score()))

    if "birch" in args.MODELS or "all" in args.MODELS:
        b = Birch_(num_clusters, feature_names, train_x, train_y, args.REP)
        print("birch, " + args.REP + ", " + str(b.get_sil_score()) + ", " +
              str(b.get_db_idx_score()))

    if "hierarchical" in args.MODELS or "all" in args.MODELS:
        h = Hierarchical(num_clusters, feature_names, train_x, train_y, args.REP)
        print("hierarchical, " + args.REP + ", " + str(h.get_sil_score()) +
              ", " + str(h.get_db_idx_score()))
        labels = h.get_labels()

        # get top keywords for clusters
        if args.PRINT_KEYWORDS:
            print("getting top 10 keywords for each cluster: ")
            get_top_keywords(train_x, labels, feature_names, 10)
        if args.FIND_OPTIMAL_K:
            find_optimal_clusters(2000, feature_names, train_x, train_y, args.REP)
        if args.queries:
            h.get_nearest_neighbours(args.queries)

    if "gmm" in args.MODELS or "all" in args.MODELS:
        gmm = GMM(num_clusters, feature_names, train_x, train_y, args.REP)
        print("GMM, " + args.REP + ", " + str(gmm.get_sil_score()) + ", " +
              str(gmm.get_db_idx_score()))

    if "meanshift" in args.MODELS or "all" in args.MODELS:
        ms = Meanshift(feature_names, train_x, train_y, args.REP)
        print("meanshift, " + args.REP + ", " + str(ms.get_sil_score()) + ", " +
              str(ms.get_db_idx_score()))

    if "spectral" in args.MODELS or "all" in args.MODELS:
        sp = Spectral(num_clusters, feature_names, train_x, train_y, args.REP)
        print("spectral, " + args.REP + ", " + str(sp.get_sil_score()) + ", " +
              str(sp.get_db_idx_score()))

    if "affinity" in args.MODELS or "all" in args.MODELS:
        af = Affinity(num_clusters, feature_names, train_x, train_y, args.REP)
        print("affinity, " + args.REP + ", " + str(af.get_sil_score()) + ", " +
              str(af.get_db_idx_score()))
import numpy as np
from data_reader import DataReader
import warnings

warnings.filterwarnings("ignore")

# get all the labelled data
reader = DataReader()
data = reader.get_all_data()

print('\n================== Before removing missing values ==================\n')
print('No. of samples: {}'.format(len(data)))
print('No. of classes: {}'.format(data['ON WG IDENTIFIER'].nunique()))
counts = data.groupby(['ON WG IDENTIFIER']).size().to_frame(name='counts') \
    .sort_values(['counts']).values
print('Max no. of samples for a class: {}'.format(counts[-1][-1]))
print('Min no. of samples for a class: {}'.format(counts[0][0]))
print('Avg no. of samples for a class: {}'.format(round(np.mean(counts), 2)))
print('\n===================================================================\n')

print('================== After removing missing values ====================\n')

# drop rows with missing values
dataNoNan = data.dropna()
print('No. of samples: {}'.format(len(dataNoNan)))
print('No. of classes: {}'.format(dataNoNan['ON WG IDENTIFIER'].nunique()))
counts = dataNoNan.groupby(['ON WG IDENTIFIER']).size().to_frame(name='counts') \
    .sort_values(['counts']).values
print('Max no. of samples for a class: {}'.format(counts[-1][-1]))
print('Min no. of samples for a class: {}'.format(counts[0][0]))
print('Avg no. of samples for a class: {}'.format(round(np.mean(counts), 2)))