def main():
    """Train a bag-of-words neural net, then improve it by self-training.

    Stage 1: supervised training on the labelled train split, evaluated on
    the held-out test split.
    Stage 2: iterative semi-supervised self-training — predict on unlabelled
    data, keep only high-confidence predictions as pseudo-labels, fine-tune,
    re-train on the original labelled data, and repeat.
    """
    data_reader = DataReader()
    df = data_reader.get_all_data()

    # Random split of data.
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    # Set up train data.
    train_tokens, train_y_raw = tokenize(train_x_raw, train_y_raw,
                                         save_missing_feature_as_string=False,
                                         remove_empty=True)
    train_x, train_y, feature_names = tokens_to_bagofwords(train_tokens,
                                                           train_y_raw)

    # Train the base supervised model.
    model = _get_nn_model_bag_of_words_simple_v2(
        train_x, train_y, data_reader.get_region_labels()['Code'],
        epochs=50, batch_size=64)

    # Set up test data, reusing the training vocabulary.
    test_tokens, test_y_raw = tokenize(test_x_raw, test_y_raw,
                                       save_missing_feature_as_string=False,
                                       remove_empty=True)
    test_x, test_y, _ = tokens_to_bagofwords(test_tokens, test_y_raw,
                                             feature_names=feature_names)

    # Evaluate the purely supervised model.
    evaluate_model_nn(model, test_x, test_y, plot_roc=False)

    # ABOVE IS BASIC SUPERVISED LEARNING TO GENERATE MODEL
    #################################################
    # BELOW IS SEMI-SUPERVISED SELF-TRAINING TO FURTHER TRAIN MODEL

    # Read unlabelled data and format it to be the same as labelled data.
    unlabelled_df = data_reader.get_east_dir()
    unlabelled_df = normalize_east_dir_df(unlabelled_df)

    # Set up unlabelled data as semi-supervised input.
    # BUG FIX: the original passed the name `_`, which is not defined at this
    # point, as the label argument -> NameError. There are no labels here, so
    # pass the dataframe itself as a placeholder, mirroring the
    # tokenize(df, df, ...) usage elsewhere in this file (the returned labels
    # are discarded either way).
    tokens, _ = tokenize(unlabelled_df, unlabelled_df,
                         save_missing_feature_as_string=False,
                         remove_empty=True)
    semi_x_base, _, _ = tokens_to_bagofwords(tokens, unlabelled_df,
                                             feature_names=feature_names)

    # Minimum prediction confidence required to use a row as a pseudo-label.
    train_threshold = 0.8
    # Number of self-training iterations.
    semi_train_amount = 30

    # SELF TRAIN MANY TIMES
    for _ in range(semi_train_amount):
        # Get predictions on the remaining unlabelled data.
        pred = model.model.predict(semi_x_base)

        # Convert probabilities to one-hot encoded pseudo-labels.
        semi_y = np.zeros_like(pred)
        semi_y[np.arange(len(pred)), pred.argmax(1)] = 1

        # Keep only rows whose prediction exceeds the confidence threshold.
        # The mask is computed once and reused (the original recomputed
        # pred.max(axis=1) > train_threshold three times).
        confident = pred.max(axis=1) > train_threshold
        semi_y = semi_y[confident]
        semi_x = semi_x_base[confident]

        # Train on the confident pseudo-labelled data.
        model.model.fit(semi_x, semi_y, batch_size=64, epochs=100)

        # Re-train on the original labelled train data to counter drift.
        model.model.fit(train_x, model.encoder.transform(train_y),
                        batch_size=32, epochs=10)

        # Evaluate after this self-training round.
        evaluate_model_nn(model, test_x, test_y, plot_roc=False)

        # Remove the pseudo-labelled rows used this iteration from future
        # iterations.
        semi_x_base = semi_x_base[~confident]
def eval_ae():
    """Evaluate several classifiers on auto-encoder-compressed bag-of-words.

    Trains a 4096-unit auto encoder on the bag-of-words features, encodes
    the train/validate/test splits with it, then trains and reports metrics
    for a neural net, logistic regression, random forest, and naive bayes.
    """
    from types import SimpleNamespace
    from Models.logistic_regression import MultiClassLogisticRegression
    from Models.random_forest import RandomForest
    from Models.naive_bayes import NaiveBayes
    from Models.svm import SVM

    # Load data.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = \
        get_train_validate_test_split(df)
    train_x, train_y, val_x, val_y, test_x, test_y = \
        bag_of_words_full_no_empty_val_no_num_no_short_no_repeat(
            train_x_raw, train_y_raw, val_x_raw, val_y_raw,
            test_x_raw, test_y_raw)

    # Train an auto encoder of size 4096, then use it to encode the train,
    # validate and test sets.
    encoder = get_encoder(train_x, test_x, 4096)
    encoded_train = encoder.predict(train_x)
    encoded_test = encoder.predict(test_x)
    encoded_val = encoder.predict(val_x)

    def _eval_classical(clf):
        """Train `clf` on the encoded features and report both metric sets.

        `eval_model` expects an object exposing a `.model` attribute;
        SimpleNamespace replaces the original `lambda: None` attribute-bag
        hack (same duck-typed shape, clearer intent).
        """
        clf.train(encoded_train, train_y)
        eval_model(SimpleNamespace(model=clf), encoded_test, test_y)
        evaluate_model(clf, encoded_test, test_y)

    # Train the neural network model and calculate the precision, recall,
    # f1 score, and accuracy.
    print('neural net ae')
    model = _get_nn_model_bag_of_words_simple_scratch(
        encoded_train, train_y, encoded_val, val_y,
        data_reader.get_region_labels()['Code'], epochs=100, batch_size=256)
    eval_nn(model, encoded_test, test_y)
    evaluate_model_nn(model, encoded_test, test_y)

    # Same metrics for the three classical models (previously three
    # copy-pasted near-identical stanzas).
    print('logistic regression ae')
    _eval_classical(MultiClassLogisticRegression())

    print('random forest ae')
    _eval_classical(RandomForest())

    print('naive bayes ae')
    _eval_classical(NaiveBayes())
def original_model():
    """Interactive demo: train or load `demo_nn.h5` and print top-K predictions.

    Reads free text from stdin. Entering an integer changes how many
    predictions are shown; entering 'quit' exits.
    """
    # BUG FIX: the original only loaded data (and thus defined `data_reader`
    # and `feature_names`) in the train branch, so running with an existing
    # 'demo_nn.h5' crashed with NameError in the prediction loop. Load data
    # unconditionally, as the other entry point in this file already does.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    top_x_predictions = 10

    # Split data.
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = \
        get_train_validate_test_split(df)
    # Get bag of words; `feature_names` is the vocabulary the prediction
    # loop vectorizes user input against.
    train_x, train_y, val_x, val_y, test_x, test_y, feature_names = \
        get_bag_of_words(train_x_raw, train_y_raw, val_x_raw, val_y_raw,
                         test_x_raw, test_y_raw)
    # Get all labels.
    labels = data_reader.get_region_labels()['Code']

    model = MultiClassNNScratch(train_x.shape, np.array(labels),
                                epochs=150, batch_size=1024)
    model.set_train_data(train_x, train_y)
    if not os.path.isfile('demo_nn.h5'):
        # Train neural net and save it for later runs.
        model.train(val_x, val_y)
        model.model.save('demo_nn.h5')
    else:
        # Load previously trained neural net.
        model.model = load_model(
            'demo_nn.h5', custom_objects={'top_3_accuracy': top_3_accuracy})

    regex_string = r'[a-zA-Z0-9]+'
    while True:
        stdin = input("Enter all information:")
        if stdin == 'quit':
            break
        try:
            top_x_predictions = int(stdin)
            print("Will return top " + str(top_x_predictions) + " predictions")
        except ValueError:
            # Not an integer: treat the input as free text to classify.
            tokenizer = RegexpTokenizer(regex_string)
            tokens = tokenizer.tokenize(stdin.lower())
            vectorizer = CountVectorizer(tokenizer=lambda x: x,
                                         lowercase=False,
                                         strip_accents=False,
                                         vocabulary=feature_names)
            model_input = vectorizer.fit_transform([tokens])
            pred = model.model.predict(model_input)

            # Indices of the top-K classes, hoisted out of the loop (the
            # original recomputed the argpartition for every row built).
            top_idx = np.argpartition(
                pred[0], -top_x_predictions)[-top_x_predictions:]

            rows = []
            for i in range(top_x_predictions):
                one_hot_pred = np.zeros_like(pred)
                one_hot_pred[np.arange(len(pred)), top_idx[i]] = 1
                code_id = model.encoder.inverse_transform(one_hot_pred)[0][0]
                # .copy() so adding the confidence column mutates our own
                # frame, not a view of regional_df (pandas SettingWithCopy).
                row = data_reader.regional_df[
                    data_reader.regional_df['Code'] == code_id].copy()
                row['Prediction Confidence'] = pred[0][top_idx[i]] * 100
                rows.append(row)
            rows = pd.concat(rows).sort_values('Prediction Confidence',
                                               ascending=False)
            print(rows)
def main(): config = tf.ConfigProto(device_count={'GPU': 0}) # config.gpu_options.per_process_gpu_memory_fraction = 0.64 set_session(tf.Session(config=config)) data_reader = DataReader() df = data_reader.get_all_data() # Split data train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split( df) # get bag of words train_x, train_y, val_x, val_y, test_x, test_y, feature_names = get_bag_of_words( train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw) # get all labels labels = data_reader.get_region_labels()['Code'] if not os.path.isfile('demo_nn.h5'): # train neural net model = MultiClassNNScratch(train_x.shape, np.array(labels), epochs=150, batch_size=1024) model.set_train_data(train_x, train_y) model.train(val_x, val_y) # save neural net model.model.save('demo_nn.h5') else: # load neural net model = MultiClassNNScratch(train_x.shape, np.array(labels), epochs=150, batch_size=1024) model.set_train_data(train_x, train_y) model.model = load_model( 'demo_nn.h5', custom_objects={'top_3_accuracy': top_3_accuracy}) # from IPython import embed # embed() regex_string = r'[a-zA-Z0-9]+' while True: stdin = input("Enter all information:") if stdin == 'quit': break tokenizer = RegexpTokenizer(regex_string) tokens = tokenizer.tokenize(stdin.lower()) vectorizer = CountVectorizer(tokenizer=lambda x: x, lowercase=False, strip_accents=False, vocabulary=feature_names) model_input = vectorizer.fit_transform([tokens]) pred = model.model.predict(model_input) one_hot_pred = np.zeros_like(pred) one_hot_pred[np.arange(len(pred)), pred.argmax(1)] = 1 id = model.encoder.inverse_transform(one_hot_pred)[0][0] row = data_reader.regional_df[data_reader.regional_df['Code'] == id] print(row)
def eval_pub_med():
    """Evaluate classifiers on averaged PubMed word2vec embeddings.

    Each record is represented as the mean of the PubMed vectors of its
    in-vocabulary tokens; records with no vectorizable tokens get a zero
    vector. Requires 'wikipedia-pubmed-and-PMC-w2v.bin', downloadable from
    http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
    """
    from gensim.models.keyedvectors import KeyedVectors

    # Load the pretrained pubmed embedding model.
    model = KeyedVectors.load_word2vec_format(
        'wikipedia-pubmed-and-PMC-w2v.bin', binary=True)

    def _avg_embeddings(token_lists):
        """Average the pubmed vector of each in-vocabulary word per record.

        Replaces three copy-pasted loops. Empty records are detected up
        front; the original instead averaged an empty array and recognised
        the resulting NaN via `type(average) == np.float64`, which also
        emitted a RuntimeWarning.
        """
        out = []
        for item in token_lists:
            vecs = [model.get_vector(w) for w in item if w in model.wv.vocab]
            if vecs:
                out.append(list(np.average(np.array(vecs), axis=0)))
            else:
                # Pubmed vectors are 200-dimensional.
                out.append(np.zeros(200))
        return np.array(out)

    # Load data into train/validate/test sets.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = \
        get_train_validate_test_split(df)

    # Embed the train set.
    tokens_train, train_y_raw = tokenize(train_x_raw, train_y_raw,
                                         save_missing_feature_as_string=False,
                                         remove_empty=True)
    pub_med_train = _avg_embeddings(tokens_train)

    # Embed the validation set.
    tokens_val, val_y_raw = tokenize(val_x_raw, val_y_raw,
                                     save_missing_feature_as_string=False,
                                     remove_empty=True)
    pub_med_val = _avg_embeddings(tokens_val)

    # Embed the test set.
    tokens_test, test_y_raw = tokenize(test_x_raw, test_y_raw,
                                       save_missing_feature_as_string=False,
                                       remove_empty=True)
    pub_med_test = _avg_embeddings(tokens_test)

    # Train the neural network model and calculate the precision, recall,
    # f1 score, and accuracy.
    print("pubmed, nn")
    nn_model = _get_nn_model_bag_of_words_simple_scratch(
        pub_med_train, train_y_raw, pub_med_val, val_y_raw,
        data_reader.get_region_labels()['Code'], epochs=100, batch_size=256)
    eval_nn(nn_model, pub_med_test, test_y_raw)
    evaluate_model_nn(nn_model, pub_med_test, test_y_raw, plot_roc=False)

    # Same metrics for logistic regression.
    print("pubmed, logistic regression")
    from Models.logistic_regression import MultiClassLogisticRegression
    log_reg = MultiClassLogisticRegression()
    log_reg.train(pub_med_train, train_y_raw)
    eval_model(log_reg, pub_med_test, test_y_raw)
    evaluate_model(log_reg, pub_med_test, test_y_raw, plot_roc=False)

    # Same metrics for random forest.
    print("pubmed, random forest")
    from Models.random_forest import RandomForest
    rand_for = RandomForest()
    rand_for.train(pub_med_train, train_y_raw)
    eval_model(rand_for, pub_med_test, test_y_raw)
    evaluate_model(rand_for, pub_med_test, test_y_raw, plot_roc=False)

    # Same metrics for naive bayes.
    print("pubmed, naivebayes")
    from Models.naive_bayes import NaiveBayes
    nb = NaiveBayes()
    nb.train(pub_med_train, train_y_raw)
    eval_model(nb, pub_med_test, test_y_raw)
    evaluate_model(nb, pub_med_test, test_y_raw, plot_roc=False)
def per_site_accuracy_increase():
    """Measure how test accuracy grows as more source hospitals are trained on.

    Repeats 30 times: shuffle the per-hospital dataframes, then for each split
    point i train on hospitals [0, i) and test on hospitals [i, end). Each run
    writes one CSV-ish result line per split point to a randomly named file
    under output_dir/.
    """
    # Load data and build the shared vocabulary from the full dataset.
    data_reader = DataReader()
    df = data_reader.get_all_data()
    all_tokens, _ = tokenize(df, df, save_missing_feature_as_string=False,
                             remove_empty=True)
    _, _, vocab = tokens_to_bagofwords(all_tokens, all_tokens)

    from random import shuffle
    # Split data on source hospital into separate dataframes.
    site_frames = [df[df['src_file'] == src] for src in df['src_file'].unique()]

    from Models.neural_net import MultiClassNNScratch
    # Save an empty network's weights so each split point can quickly reset
    # the model instead of rebuilding it.
    model = MultiClassNNScratch(
        (0, len(vocab)), np.array(data_reader.get_region_labels()['Code']),
        epochs=100, batch_size=256)
    model.model.save_weights('empty_model.h5')

    # Run the evaluation 30 times; each run gets its own output file.
    for _ in range(30):
        # Shuffle the site order for this run.
        shuffle(site_frames)
        # Context manager guarantees the file is closed even on error
        # (the original leaked the handle if any iteration raised).
        with open("output_dir/" + randomword(7) + '.txt', "w") as out_file:
            # Train on sites [0, i), test on sites [i, end).
            for i in range(1, len(site_frames)):
                model.model.load_weights('empty_model.h5')
                train_set = site_frames[:i]
                test_set = site_frames[i:]

                # Vectorize the test sites against the shared vocabulary.
                test_x_raw, test_y_raw = get_x_y_split(pd.concat(test_set))
                test_tokens, test_y_raw = tokenize(
                    test_x_raw, test_y_raw,
                    save_missing_feature_as_string=False, remove_empty=True)
                test_x, test_y, _ = tokens_to_bagofwords(
                    test_tokens, test_y_raw, feature_names=vocab)

                # Split the train sites into train/validation and vectorize.
                item = pd.concat(train_set)
                train_x_raw, train_y_raw, val_x_raw, val_y_raw = \
                    get_train_test_split(item)
                train_tokens, train_y_raw = tokenize(
                    train_x_raw, train_y_raw,
                    save_missing_feature_as_string=False, remove_empty=True)
                train_x, train_y, _ = tokens_to_bagofwords(
                    train_tokens, train_y_raw, feature_names=vocab)
                val_tokens, val_y_raw = tokenize(
                    val_x_raw, val_y_raw,
                    save_missing_feature_as_string=False, remove_empty=True)
                val_x, val_y, _ = tokens_to_bagofwords(
                    val_tokens, val_y_raw, feature_names=vocab)

                model.set_train_data(train_x, train_y)
                model.train(val_x, val_y)
                accuracy = evaluate_model_nn(model, test_x, test_y,
                                             plot_roc=False)
                # One line per split point: train-site count, test-site count,
                # accuracy, number of training rows. BUG FIX: the original
                # format string had no trailing newline, so every result in a
                # run was concatenated onto a single unreadable line.
                out_file.write("%d, %d, %4.2f, %d\n" %
                               (len(train_set), len(test_set), accuracy,
                                len(item)))