def main():
    data_reader = DataReader()
    df = data_reader.get_all_data()

    # random split of data
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    # set up train data
    train_tokens, train_y_raw = tokenize(train_x_raw, train_y_raw, save_missing_feature_as_string=False,
                                       remove_empty=True)
    train_x, train_y, feature_names = tokens_to_bagofwords(train_tokens, train_y_raw)

    # train model
    model = _get_nn_model_bag_of_words_simple_v2(train_x, train_y, data_reader.get_region_labels()['Code'],
                                                 epochs=50, batch_size=64)

    # set up test data
    test_tokens, test_y_raw = tokenize(test_x_raw, test_y_raw, save_missing_feature_as_string=False, remove_empty=True)
    test_x, test_y, _ = tokens_to_bagofwords(test_tokens, test_y_raw, feature_names=feature_names)

    # evaluate model
    evaluate_model_nn(model, test_x, test_y, plot_roc=False)

    # ABOVE IS BASIC SUPERVISED LEARNING TO GENERATE MODEL
    #################################################
    # BELOW IS SEMI-SUPERVISED SELF-TRAINING TO FURTHER TRAIN MODEL

    # read unlabelled data and format it to be the same as labelled data
    unlabelled_df = data_reader.get_east_dir()
    unlabelled_df = normalize_east_dir_df(unlabelled_df)

    # set up unlabelled data as semi-supervised data; there are no labels here, so
    # pass dummy y values through tokenize/tokens_to_bagofwords (same trick used in a later example)
    tokens, _ = tokenize(unlabelled_df, unlabelled_df, save_missing_feature_as_string=False, remove_empty=True)
    semi_x_base, _, _ = tokens_to_bagofwords(tokens, tokens, feature_names=feature_names)

    # Confidence threshold to train on
    train_threshold = 0.8
    semi_train_amount = 30

    # SELF TRAIN MANY TIMES
    for i in range(semi_train_amount):
        # get predictions on unlabelled data
        pred = model.model.predict(semi_x_base)
        # convert probabilities to one-hot encoded output
        semi_y = np.zeros_like(pred)
        semi_y[np.arange(len(pred)), pred.argmax(1)] = 1
        # filter semi_x and semi_y to only include predictions above train_threshold
        semi_y = semi_y[pred.max(axis=1) > train_threshold]
        semi_x = semi_x_base[pred.max(axis=1) > train_threshold]

        # train on semi supervised data
        model.model.fit(semi_x, semi_y, batch_size=64, epochs=100)
        # retrain on original train data
        model.model.fit(train_x, model.encoder.transform(train_y), batch_size=32, epochs=10)

        # evaluate model
        evaluate_model_nn(model, test_x, test_y, plot_roc=False)

        # remove semi data used in this iteration from future iterations
        semi_x_base = semi_x_base[~(pred.max(axis=1) > train_threshold)]
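
# Toy illustration (not part of the pipeline above) of the confidence-threshold
# pseudo-labelling done in each self-training iteration: softmax outputs are turned
# into one-hot labels and only rows whose top probability clears the threshold are
# kept. The names below (toy_pred, threshold, ...) are hypothetical.
import numpy as np

toy_pred = np.array([[0.10, 0.70, 0.20],    # max 0.70 -> below threshold, dropped
                     [0.05, 0.05, 0.90]])   # max 0.90 -> kept as a pseudo-label
threshold = 0.8
one_hot = np.zeros_like(toy_pred)
one_hot[np.arange(len(toy_pred)), toy_pred.argmax(1)] = 1
keep = toy_pred.max(axis=1) > threshold
pseudo_labels = one_hot[keep]               # [[0., 0., 1.]]
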
# Example #2
def eval():
    # load data
    data_reader = DataReader()
    df = data_reader.get_all_data()

    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    # get bag of words
    train_x, train_y, test_x, test_y = bag_of_words_full_no_empty(
        train_x_raw, train_y_raw, test_x_raw, test_y_raw)

    # train logistic regression, random forest, and naive bayes on bag of words, and report accuracy, precision, recall, and F1 score
    log_reg_bow = _get_multiclass_logistic_regression_model_bag_of_words_full(
        train_x, train_y)
    eval_model(log_reg_bow, test_x, test_y)
    evaluate_model(log_reg_bow, test_x, test_y, plot_roc=False)
    rand_for_bow = _get_random_forest_model_bag_of_words_full(train_x, train_y)
    eval_model(rand_for_bow, test_x, test_y)
    evaluate_model(rand_for_bow, test_x, test_y, plot_roc=False)
    nb_bow = _get_naive_bayes_model_bag_of_words_full(train_x, train_y)
    eval_model(nb_bow, test_x.A, test_y)
    evaluate_model(nb_bow, test_x.A, test_y, plot_roc=False)

    # get tfidf
    train_x, train_y, test_x, test_y = tfidf_no_empty(train_x_raw, train_y_raw,
                                                      test_x_raw, test_y_raw)

    # train logistic regression, random forest, and naive bayes on tfidf, and report accuracy, precision, recall, and F1 score
    log_reg_bow = _get_multiclass_logistic_regression_model_tfidf(
        train_x, train_y)
    eval_model(log_reg_bow, test_x, test_y)
    evaluate_model(log_reg_bow, test_x, test_y, plot_roc=False)
    rand_for_bow = _get_random_forest_model_tfidf(train_x, train_y)
    print("random forest, tfidf")
    eval_model(rand_for_bow, test_x, test_y)
    evaluate_model(rand_for_bow, test_x, test_y, plot_roc=False)
    nb_bow = _get_naive_bayes_model_tfidf(train_x.A, train_y)
    print("naive bayes, tfidf")
    eval_model(nb_bow, test_x.A, test_y)
    evaluate_model(nb_bow, test_x.A, test_y, plot_roc=False)

    # get doc2vec
    train_x, train_y, test_x, test_y = doc2vec_simple(train_x_raw, train_y_raw,
                                                      test_x_raw, test_y_raw)

    # train logistic regression, random forest, and naive bayes on doc2vec, and report accuracy, precision, recall, and F1 score
    log_reg_bow = _get_multiclass_logistic_regression_model_doc2vec_simple(
        train_x, train_y)
    print("bow, doc2vec")
    eval_model(log_reg_bow, test_x, test_y)
    evaluate_model(log_reg_bow, test_x, test_y, plot_roc=False)
    rand_for_bow = _get_random_forest_model_doc2vec_simple(train_x, train_y)
    print("random forest, doc2evc")
    eval_model(rand_for_bow, test_x, test_y)
    evaluate_model(rand_for_bow, test_x, test_y, plot_roc=False)
    nb_bow = _get_naive_bayes_model_doc2vec_simple(train_x, train_y)
    print("naive bayes, doc2vec")
    eval_model(nb_bow, test_x, test_y)
    evaluate_model(nb_bow, test_x, test_y, plot_roc=False)
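
# Note on the ".A" used with the naive bayes models above: it is scipy.sparse
# shorthand for ".toarray()", densifying the matrix because that model expects
# dense input. Minimal check (illustrative only):
import numpy as np
from scipy.sparse import csr_matrix

sparse_example = csr_matrix(np.eye(3))
assert (sparse_example.A == sparse_example.toarray()).all()
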
# Example #3
def eval_ae():
    from Models.logistic_regression import MultiClassLogisticRegression
    from Models.random_forest import RandomForest
    from Models.naive_bayes import NaiveBayes
    from Models.svm import SVM
    # load data
    data_reader = DataReader()
    df = data_reader.get_all_data()

    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)
    train_x, train_y, val_x, val_y, test_x, test_y = bag_of_words_full_no_empty_val_no_num_no_short_no_repeat(
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw)
    # Train an auto encoder of size 4096
    encoder = get_encoder(train_x, test_x, 4096)
    # use auto encoder to encode the train, validate and test sets
    encoded_train = encoder.predict(train_x)
    encoded_test = encoder.predict(test_x)
    encoded_val = encoder.predict(val_x)

    # train the neural network model and calculate the precision, recall, f1 score, and accuracy
    print('neural net ae')
    model = _get_nn_model_bag_of_words_simple_scratch(
        encoded_train,
        train_y,
        encoded_val,
        val_y,
        data_reader.get_region_labels()['Code'],
        epochs=100,
        batch_size=256)
    eval_nn(model, encoded_test, test_y)
    evaluate_model_nn(model, encoded_test, test_y)
    # train the logistic regression model and calculate the precision, recall, f1 score, and accuracy
    print('logistic regression ae')
    model = MultiClassLogisticRegression()
    model.train(encoded_train, train_y)
    # lightweight stand-in object with a .model attribute, which eval_model expects
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)

    # train the random forest model and calculate the precision, recall, f1 score, and accuracy
    print('random forest ae')
    model = RandomForest()
    model.train(encoded_train, train_y)
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)

    # train the naive bayes model and calculate the precision, recall, f1 score, and accuracy
    print('naive bayes ae')
    model = NaiveBayes()
    model.train(encoded_train, train_y)
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)
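
# Hedged sketch of what a get_encoder(train_x, test_x, rep_size) helper could look
# like (an assumption -- the project's own implementation may differ): train a
# single-hidden-layer autoencoder to reconstruct the bag-of-words input and return
# only the encoder half for dimensionality reduction.
def _get_encoder_sketch(train_x, test_x, rep_size, epochs=20, batch_size=256):
    from keras.layers import Input, Dense
    from keras.models import Model

    input_dim = train_x.shape[1]
    inputs = Input(shape=(input_dim,))
    encoded = Dense(rep_size, activation='relu')(inputs)
    decoded = Dense(input_dim, activation='sigmoid')(encoded)

    autoencoder = Model(inputs, decoded)
    autoencoder.compile(optimizer='adam', loss='binary_crossentropy')
    # reconstruct the input itself; the held-out set is only used for monitoring
    autoencoder.fit(train_x, train_x, epochs=epochs, batch_size=batch_size,
                    validation_data=(test_x, test_x), verbose=0)
    return Model(inputs, encoded)  # encoder half only
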
def main():
    data_reader = DataReader()
    df = data_reader.get_all_data()
    y = df['ON WG IDENTIFIER'].values
    df.drop(['src_file', 'ON WG IDENTIFIER'], axis=1, inplace=True)
    tokens, y = tokenize_columns(df, y, save_missing_feature_as_string=False, remove_repeats=True,
                                remove_num=True)
    x, y, feature_names = tokens_to_bagofwords(tokens, y)

    # tfidf
    corpus = list(map(' '.join, tokens[:]))
    vectorizer = TfidfVectorizer()
    mat = vectorizer.fit_transform(corpus)

    #    def __init__(self, num_clusters, feature_names, train_x, train_y):
    b = Birch_(10, feature_names, mat, y)
    print('d b score: ' + str(b.get_db_idx_score()))
    print('sil score: ' + str(b.get_sil_score()))

    h = Hierarchical(10, feature_names, mat, y)
    print('d b score: ' + str(h.get_db_idx_score()))
    print('sil score: ' + str(h.get_sil_score()))
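
# Hedged toy example of the two clustering metrics reported above (an assumption
# about what the get_sil_score / get_db_idx_score accessors wrap, using sklearn's
# standard implementations): silhouette is higher-better in [-1, 1], while the
# Davies-Bouldin index is lower-better and >= 0. The toy data is hypothetical.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

toy_x = np.random.rand(60, 4)
toy_labels = KMeans(n_clusters=3, n_init=10).fit_predict(toy_x)
print('d b score: ' + str(davies_bouldin_score(toy_x, toy_labels)))  # lower is better
print('sil score: ' + str(silhouette_score(toy_x, toy_labels)))      # higher is better
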
# Example #5
from data_reader import DataReader
from data_manipulator import *
import re  # needed by the bioclean regex below
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# clean for BioASQ
bioclean = lambda t: re.sub(
    '[.,?;*!%^&_+():-\[\]{}]', '',
    t.replace('"', '').replace('/', '').replace('\\', '').replace("'", '').
    strip().lower()).split()

tokens = bioclean('This is a sentence w/o you!')
print(tokens)

data_reader = DataReader()
df = data_reader.get_all_data()
df = df[[
    'RIS PROCEDURE DESCRIPTION', 'PACS STUDY DESCRIPTION', 'ON WG IDENTIFIER'
]]

# drop missing rows
df = df.dropna()
df['text'] = df[['RIS PROCEDURE DESCRIPTION',
                 'PACS STUDY DESCRIPTION']].apply(lambda x: ' '.join(x),
                                                  axis=1)
df = df.drop(['RIS PROCEDURE DESCRIPTION', 'PACS STUDY DESCRIPTION'], axis=1)
df = df.rename(columns={'ON WG IDENTIFIER': 'target'}).values

targets = df[:, 0]
words = df[:, 1]
vectorizer = CountVectorizer(tokenizer=bioclean)
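
# Illustrative follow-through (an assumption about how this vectorizer would
# typically be used next): fit it on the joined free-text column and inspect the
# resulting bag-of-words matrix and vocabulary size.
bow = vectorizer.fit_transform(words)
print(bow.shape, len(vectorizer.vocabulary_))
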
# Example #6
def original_model():
    # data, feature names, labels, and the prediction count are needed in both
    # branches below, so set them up before checking for a saved model
    data_reader = DataReader()
    df = data_reader.get_all_data()

    top_x_predictions = 10

    # Split data
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)

    # get bag of words
    train_x, train_y, val_x, val_y, test_x, test_y, feature_names = get_bag_of_words(
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw,
        test_y_raw)
    # get all labels
    labels = data_reader.get_region_labels()['Code']

    if not os.path.isfile('demo_nn.h5'):
        # train neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.train(val_x, val_y)

        # save neural net
        model.model.save('demo_nn.h5')
    else:
        # load neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.model = load_model(
            'demo_nn.h5', custom_objects={'top_3_accuracy': top_3_accuracy})

    # from IPython import embed
    # embed()

    regex_string = r'[a-zA-Z0-9]+'
    while True:
        stdin = input("Enter all information:")
        if stdin == 'quit':
            break
        try:
            top_x_predictions = int(stdin)
            print("Will return top " + str(top_x_predictions) + " predictions")
        except ValueError:
            tokenizer = RegexpTokenizer(regex_string)
            tokens = tokenizer.tokenize(stdin.lower())

            vectorizer = CountVectorizer(tokenizer=lambda x: x,
                                         lowercase=False,
                                         strip_accents=False,
                                         vocabulary=feature_names)

            model_input = vectorizer.fit_transform([tokens])
            pred = model.model.predict(model_input)

            # Top X Predictions
            rows = []
            for i in range(top_x_predictions):
                one_hot_pred = np.zeros_like(pred)
                one_hot_pred[np.arange(len(pred)),
                             (np.argpartition(pred[0], -top_x_predictions
                                              )[-top_x_predictions:][i])] = 1
                id = model.encoder.inverse_transform(one_hot_pred)[0][0]
                row = data_reader.regional_df[data_reader.regional_df['Code']
                                              == id]
                row['Prediction Confidence'] = (pred[0][(np.argpartition(
                    pred[0],
                    -top_x_predictions)[-top_x_predictions:][i])]) * 100
                rows.append(row)

            rows = (pd.concat(rows)).sort_values('Prediction Confidence',
                                                 ascending=False)
            print(rows)
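
# Illustrative alternative to the argpartition-based top-k selection above, using a
# toy probability vector and a single argsort (names here are hypothetical):
import numpy as np

toy_probs = np.array([0.05, 0.60, 0.10, 0.25])
k = 2
top_k = np.argsort(toy_probs)[-k:][::-1]      # class indices, highest probability first
print(top_k, toy_probs[top_k] * 100)          # [1 3] [60. 25.]
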
def main():
    config = tf.ConfigProto(device_count={'GPU': 0})
    #    config.gpu_options.per_process_gpu_memory_fraction = 0.64
    set_session(tf.Session(config=config))

    data_reader = DataReader()
    df = data_reader.get_all_data()

    # Split data
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)

    # get bag of words
    train_x, train_y, val_x, val_y, test_x, test_y, feature_names = get_bag_of_words(
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw)
    # get all labels
    labels = data_reader.get_region_labels()['Code']

    if not os.path.isfile('demo_nn.h5'):
        # train neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.train(val_x, val_y)

        # save neural net
        model.model.save('demo_nn.h5')
    else:
        # load neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.model = load_model(
            'demo_nn.h5', custom_objects={'top_3_accuracy': top_3_accuracy})

    # from IPython import embed
    # embed()

    regex_string = r'[a-zA-Z0-9]+'
    while True:
        stdin = input("Enter all information:")
        if stdin == 'quit':
            break

        tokenizer = RegexpTokenizer(regex_string)
        tokens = tokenizer.tokenize(stdin.lower())

        vectorizer = CountVectorizer(tokenizer=lambda x: x,
                                     lowercase=False,
                                     strip_accents=False,
                                     vocabulary=feature_names)

        model_input = vectorizer.fit_transform([tokens])
        pred = model.model.predict(model_input)

        one_hot_pred = np.zeros_like(pred)
        one_hot_pred[np.arange(len(pred)), pred.argmax(1)] = 1

        id = model.encoder.inverse_transform(one_hot_pred)[0][0]
        row = data_reader.regional_df[data_reader.regional_df['Code'] == id]

        print(row)
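
# Illustrative check of the fixed-vocabulary trick used in the demo loop above:
# with vocabulary pinned to the training feature names, vectorizing new tokens
# cannot add new columns, so the vector lines up with what the model was trained
# on. The toy vocabulary below is hypothetical.
from sklearn.feature_extraction.text import CountVectorizer

toy_vocab = ['chest', 'ct', 'xray']
v = CountVectorizer(tokenizer=lambda x: x, lowercase=False, vocabulary=toy_vocab)
print(v.fit_transform([['ct', 'chest', 'mri']]).toarray())  # [[1 1 0]] -- 'mri' is ignored
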
def siamese_fewshot(train, snn_seen_score, snn_unseen_score, knn_score,
                    nn_score, limit, unseen_num_class):

    # get the data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    tokens, train_y_raw = tokenize_columns(
        train_x_raw,
        train_y_raw,
        save_missing_feature_as_string=False,
        remove_empty=True,
        remove_num=True,
        remove_repeats=True,
        remove_short=True)
    train_x, train_y, feature_names = tokens_to_bagofwords(
        tokens, train_y_raw, vectorizer_class=CountVectorizer)

    tokens, test_y_raw = tokenize_columns(test_x_raw,
                                          test_y_raw,
                                          save_missing_feature_as_string=False,
                                          remove_empty=True,
                                          remove_num=True,
                                          remove_repeats=True,
                                          remove_short=True)
    test_x, test_y, _ = tokens_to_bagofwords(tokens,
                                             test_y_raw,
                                             vectorizer_class=CountVectorizer,
                                             feature_names=feature_names)

    # encode the labels as small consecutive integers instead of the original large identifiers
    le = preprocessing.LabelEncoder()
    le.fit(np.concatenate((test_y.values, train_y.values)))
    train_y = le.transform(train_y.values)
    test_y = le.transform(test_y.values)

    # combine train and test
    x = np.concatenate((train_x.todense(), test_x.todense()))
    y = np.concatenate((train_y, test_y))

    # delete unwanted variables
    del train_x, test_x, train_y, test_y

    # create pairwise dataset
    train_paired, test_paired, \
    train_paired_target, test_paired_target, \
    labels_dict_train, labels_dict_test, \
    uc_support_set, uc_test_samples, uc_test_labels = create_pairwise_dataset(x, y, limit=limit, unseen_num_class=unseen_num_class)

    # set the data as separate numpy array to be passed into model
    pair1_train = []
    pair2_train = []
    for i in range(len(train_paired)):
        pair1_train.append(np.array(train_paired[i][0]))
        pair2_train.append(np.array(train_paired[i][1]))

    pair1_train = np.array(pair1_train)
    pair2_train = np.array(pair2_train)
    train_paired_target = np.array(train_paired_target)

    # shuffle in unison
    pair1_train, pair2_train, train_paired_target = unison_shuffled_copies(
        pair1_train, pair2_train, train_paired_target)

    pair1_test = []
    pair2_test = []
    for i in range(len(test_paired)):
        pair1_test.append(np.array(test_paired[i][0]))
        pair2_test.append(np.array(test_paired[i][1]))

    pair1_test = np.array(pair1_test)
    pair2_test = np.array(pair2_test)
    test_paired_target = np.array(test_paired_target)

    # shuffle in unison
    pair1_test, pair2_test, test_paired_target = unison_shuffled_copies(
        pair1_test, pair2_test, test_paired_target)

    feature_size = pair1_train.shape[-1]
    pair1_train = pair1_train.reshape(-1, feature_size)
    pair2_train = pair2_train.reshape(-1, feature_size)
    pair1_test = pair1_test.reshape(-1, feature_size)
    pair2_test = pair2_test.reshape(-1, feature_size)

    input_shape = pair1_train.shape[1]

    # if train the model from scratch
    if train:

        # siamese neural network structure
        siamese_net = SiameseNN(input_shape)

        siamese_net.train([pair1_train, pair2_train], train_paired_target,
                          [pair1_test, pair2_test], test_paired_target)

        siamese_net.save('siamese-' + str(limit))

    # if load the pre-trained model
    else:

        siamese_net = SiameseNN(input_shape)
        siamese_net.load('siamese-' + str(limit))

    #========================================================================#
    #======================= Prepare data for testing ========================#
    #========================================================================#

    # support set for SNN testing
    support_set = {}
    counter = 0
    for labelA in sorted(labels_dict_train):
        if counter > limit:
            break
        else:
            support_set[labelA] = labels_dict_train[labelA]
            counter += 1

    # training data for kNN and neural network
    x_train = []
    y_train = []
    counter = 0
    for label in sorted(labels_dict_train):
        if counter > limit:
            break
        samples = labels_dict_train[label]
        for s in samples:
            x_train.append(s)
            y_train.append(label)
        counter += 1
    x_train = np.array(x_train)
    x_train = x_train.reshape(-1, x_train.shape[-1])
    y_train = np.array(y_train).reshape(-1, 1)

    # testing data for kNN and neural network
    test_samples = []
    test_labels = []
    counter = 0
    for labelA in sorted(labels_dict_test):
        if counter > limit:
            break
        else:
            samples = labels_dict_test[labelA]
            idxs = np.random.permutation(len(samples))
            for k in idxs:
                test_samples.append(samples[k])
                test_labels.append(labelA)
            counter += 1
    test_samples = np.array(test_samples)

    #========================================================================#
    #================ Evaluation for SNN on seen classes ====================#
    #========================================================================#

    if snn_seen_score:

        print('=================================================')
        print('Running test on seen classes with Siamese NN ....')

        # run testing in a non-parametric way
        score = siamese_net.score_non_parametric(test_samples, test_labels,
                                                 support_set)

        print('Siamese NN seen class accuracy with {} classes: {}%'.format(
            limit, round(score, 1)))

    #========================================================================#
    #============== Evaluation for SNN on unseen classes ====================#
    #========================================================================#

    if snn_unseen_score:

        print('=================================================')
        print('Running test on unseen classes with Siamese NN ....')

        # run testing in a non-parametric way
        score = siamese_net.score_non_parametric(uc_test_samples,
                                                 uc_test_labels,
                                                 uc_support_set)

        print('Siamese NN {} unseen class accuracy: {}%'.format(
            unseen_num_class, round(score, 1)))

    #========================================================================#
    #=================== Evaluation with baseline KNN =======================#
    #========================================================================#

    if knn_score:

        print('=================================================')
        print('Running test on kNN algorithm ....')

        test_samples = test_samples.reshape(-1, test_samples.shape[-1])
        test_labels = np.array(test_labels).reshape(-1, 1)
        knn = KNN(n_neighbors=10)
        knn.train(x_train, y_train)
        score = knn.score(test_samples, test_labels) * 100

        print('kNN accuracy with {} classes: {}%'.format(
            limit, round(score, 1)))

    #========================================================================#
    #=================== Evaluation with Neural Network =====================#
    #========================================================================#

    if nn_score:

        print('=================================================')
        print('Running train and test on neural network ....')

        inputs = Input(shape=(feature_size, ), name="input")

        x = Dense(7750,
                  activation="relu",
                  name="dense1",
                  input_dim=feature_size)(inputs)
        output = Dense(limit, activation="softmax", name="output")(x)
        nn = Model(inputs, output)
        nn.compile(optimizer="adam",
                   loss='categorical_crossentropy',
                   metrics=['categorical_accuracy'])

        callbacks = [
            EarlyStopping(monitor='val_loss',
                          patience=3,
                          mode='min',
                          restore_best_weights=True)
        ]

        y_train_onehot = np_utils.to_categorical(y_train)

        test_samples = test_samples.reshape(-1, test_samples.shape[-1])
        test_labels = np.array(test_labels).reshape(-1, 1)
        test_labels_onehot = np_utils.to_categorical(test_labels)
        nn.fit(x_train,
               y_train_onehot,
               batch_size=256,
               epochs=100,
               verbose=0,
               callbacks=callbacks,
               validation_data=(test_samples, test_labels_onehot))

        score, acc = nn.evaluate(test_samples, test_labels_onehot)

        print('Neural network accuracy with {} classes: {}%'.format(
            limit, round(acc * 100, 1)))
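
# Hedged sketch of the unison_shuffled_copies helper used above (an assumption --
# the project's own version may differ): shuffle several equal-length arrays with
# one shared permutation so pairs and their targets stay aligned.
def unison_shuffled_copies_sketch(*arrays):
    import numpy as np
    assert all(len(a) == len(arrays[0]) for a in arrays)
    perm = np.random.permutation(len(arrays[0]))
    return tuple(a[perm] for a in arrays)
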
# Example #9
def eval_pub_med():
    from gensim.models.keyedvectors import KeyedVectors
    # Need to download file from http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
    # Load the pubmed model
    model = KeyedVectors.load_word2vec_format(
        'wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
    # Load data into train/validate/test sets
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)
    tokens_train, train_y_raw = tokenize(train_x_raw,
                                         train_y_raw,
                                         save_missing_feature_as_string=False,
                                         remove_empty=True)
    # for each tokenized vector in the train set, run the model on each word and take the average.
    # If no words are vectorized by pubmed, append a zero vector
    avg = []
    for item in tokens_train:
        words = []
        for word in item:
            if word in model.wv.vocab:
                vec = model.get_vector(word)
                words.append(vec)
        average = np.average(np.array(words), axis=0)
        if type(average) == np.float64:
            print('****')
            print(average)
            avg.append(np.zeros(200))
        else:
            avg.append(list(average))
    pub_med_train = np.array(avg)

    # run the same for the validation set
    tokens_val, val_y_raw = tokenize(val_x_raw,
                                     val_y_raw,
                                     save_missing_feature_as_string=False,
                                     remove_empty=True)
    avg = []
    for item in tokens_val:
        words = []
        for word in item:
            if word in model.wv.vocab:
                vec = model.get_vector(word)
                words.append(vec)
        average = np.average(np.array(words), axis=0)
        if type(average) == np.float64:
            print('****')
            print(average)
            avg.append(np.zeros(200))
        else:
            avg.append(list(average))
    pub_med_val = np.array(avg)

    # run the same for the test set
    tokens_test, test_y_raw = tokenize(test_x_raw,
                                       test_y_raw,
                                       save_missing_feature_as_string=False,
                                       remove_empty=True)
    avg = []
    for item in tokens_test:
        words = []
        for word in item:
            if word in model.wv.vocab:
                vec = model.get_vector(word)
                words.append(vec)
        average = np.average(np.array(words), axis=0)
        if type(average) == np.float64:
            print('****')
            print(average)
            avg.append(np.zeros(200))
        else:
            avg.append(list(average))
    pub_med_test = np.array(avg)

    # train the neural network model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, nn")
    nn_model = _get_nn_model_bag_of_words_simple_scratch(
        pub_med_train,
        train_y_raw,
        pub_med_val,
        val_y_raw,
        data_reader.get_region_labels()['Code'],
        epochs=100,
        batch_size=256)
    eval_nn(nn_model, pub_med_test, test_y_raw)
    evaluate_model_nn(nn_model, pub_med_test, test_y_raw, plot_roc=False)
    # train the logistic regression model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, logistic regression")
    from Models.logistic_regression import MultiClassLogisticRegression
    log_reg = MultiClassLogisticRegression()
    log_reg.train(pub_med_train, train_y_raw)
    eval_model(log_reg, pub_med_test, test_y_raw)
    evaluate_model(log_reg, pub_med_test, test_y_raw, plot_roc=False)
    # train the random forest model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, random forest")
    from Models.random_forest import RandomForest
    rand_for = RandomForest()
    rand_for.train(pub_med_train, train_y_raw)
    eval_model(rand_for, pub_med_test, test_y_raw)
    evaluate_model(rand_for, pub_med_test, test_y_raw, plot_roc=False)
    # train the naive bayes model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, naivebayes")
    from Models.naive_bayes import NaiveBayes
    nb = NaiveBayes()
    nb.train(pub_med_train, train_y_raw)
    eval_model(nb, pub_med_test, test_y_raw)
    evaluate_model(nb, pub_med_test, test_y_raw, plot_roc=False)
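
# Hedged refactor sketch (optional): the three identical averaging loops above
# could be factored into one helper. The 200-dimensional zero-vector fallback
# mirrors the original code, and the gensim calls are the same ones used above.
def average_pubmed_vectors_sketch(token_lists, w2v_model, dim=200):
    import numpy as np
    averaged = []
    for item in token_lists:
        vecs = [w2v_model.get_vector(w) for w in item if w in w2v_model.wv.vocab]
        averaged.append(np.mean(vecs, axis=0) if vecs else np.zeros(dim))
    return np.array(averaged)
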
# Example #10
def top_keywords_kmeans():
    # get data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)
    train_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    test_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)

    # identify ON WG IDENTIFIERS that occur infrequently
    min_samples = 5
    train_y_list = train_y_raw['ON WG IDENTIFIER'].values.tolist()
    unique_ids = list(set(train_y_list))
    small_clusters = list()
    for i in unique_ids:
        if train_y_list.count(i) < min_samples:
            small_clusters.append(i)
    train_x_raw = train_x_raw[~train_y_raw['ON WG IDENTIFIER'].
                              isin(small_clusters)]
    train_y_raw = train_y_raw[~train_y_raw['ON WG IDENTIFIER'].
                              isin(small_clusters)]
    num_clusters = len(unique_ids) - len(small_clusters)

    # append the ON WG IDENTIFIERS to the original documents
    train_y_raw = pd.concat([train_x_raw, train_y_raw], axis=1)
    test_y_raw = pd.concat([test_x_raw, test_y_raw], axis=1)

    # tokenize and subsample
    tokens_train, train_y_raw = tokenize_columns(
        train_x_raw,
        train_y_raw,
        regex_string=r'[a-zA-Z0-9]+',
        save_missing_feature_as_string=False,
        remove_short=True,
        remove_num=True,
        remove_empty=True)
    tokens_test, test_y_raw = tokenize_columns(
        test_x_raw,
        test_y_raw,
        regex_string=r'[a-zA-Z0-9]+',
        save_missing_feature_as_string=False,
        remove_short=True,
        remove_num=True,
        remove_empty=True)

    # get TF-IDF representation of data
    feature_names = list()
    train_x = list()
    train_y = list()
    test_x = list()
    test_y = list()
    train_x, train_y, feature_names = tokens_to_bagofwords(
        tokens_train, train_y_raw, TfidfVectorizer)
    test_x, test_y, _ = tokens_to_bagofwords(tokens_test,
                                             test_y_raw,
                                             TfidfVectorizer,
                                             feature_names=feature_names)
    train_x = train_x.toarray()
    test_x = test_x.toarray()

    # run kmeans
    kmeans = Kmeans(num_clusters, feature_names, train_x, train_y, "tfidf")
    kmeans.eval()
    labels = kmeans.get_labels()

    # get top 10 keywords for each cluster
    n_terms = 10
    # group by cluster and get the mean occurrence of each word
    df = pd.DataFrame(train_x).groupby(labels).mean()
    # iterate through each cluster and get the most frequently occurring words
    for i, r in df.iterrows():
        print(
            'Cluster {}: '.format(i) +
            ','.join([str(feature_names[t])
                      for t in np.argsort(r)[-n_terms:]]))
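
# Hedged sketch: the per-cluster keyword listing above is presumably what the
# get_top_keywords helper called in a later example does; written out here as a
# reusable function under that assumption.
def get_top_keywords_sketch(data, labels, feature_names, n_terms):
    import numpy as np
    import pandas as pd
    grouped = pd.DataFrame(data).groupby(labels).mean()
    for i, r in grouped.iterrows():
        print('Cluster {}: '.format(i) +
              ','.join(str(feature_names[t]) for t in np.argsort(r)[-n_terms:]))
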
# Example #11
def per_site_accuracy_increase():
    # load data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    all_tokens, _ = tokenize(df,
                             df,
                             save_missing_feature_as_string=False,
                             remove_empty=True)
    _, _, vocab = tokens_to_bagofwords(all_tokens, all_tokens)

    lst = []
    from random import shuffle

    # split data on source hospital and save to separate dataframes in a list
    for i in df['src_file'].unique():
        lst.append(df[df['src_file'] == i])

    from Models.neural_net import MultiClassNNScratch

    # save an empty neural network so we can quickly reset the network
    model = MultiClassNNScratch(
        (0, len(vocab)),
        np.array(data_reader.get_region_labels()['Code']),
        epochs=100,
        batch_size=256)
    model.model.save_weights('empty_model.h5')

    # run evaluation some n times
    for i in range(30):
        # shuffle the order
        shuffle(lst)
        # iterate i from 1 to len(lst)-1: train the model on the first i sites and test on the remaining sites.
        # Print results to a file so we can easily visualize them later;
        # each of the 30 runs gets its own file.
        i = 1
        file = open("output_dir/" + randomword(7) + '.txt', "w")
        while i < len(lst):
            model.model.load_weights('empty_model.h5')
            train_set = lst[:i]
            test_set = lst[i:]

            test_x_raw, test_y_raw = get_x_y_split(pd.concat(test_set))
            test_tokens, test_y_raw = tokenize(
                test_x_raw,
                test_y_raw,
                save_missing_feature_as_string=False,
                remove_empty=True)
            test_x, test_y, _ = tokens_to_bagofwords(test_tokens,
                                                     test_y_raw,
                                                     feature_names=vocab)

            item = pd.concat(train_set)
            train_x_raw, train_y_raw, val_x_raw, val_y_raw = get_train_test_split(
                item)
            train_tokens, train_y_raw = tokenize(
                train_x_raw,
                train_y_raw,
                save_missing_feature_as_string=False,
                remove_empty=True)
            train_x, train_y, _ = tokens_to_bagofwords(train_tokens,
                                                       train_y_raw,
                                                       feature_names=vocab)

            val_tokens, val_y_raw = tokenize(
                val_x_raw,
                val_y_raw,
                save_missing_feature_as_string=False,
                remove_empty=True)
            val_x, val_y, _ = tokens_to_bagofwords(val_tokens,
                                                   val_y_raw,
                                                   feature_names=vocab)

            model.set_train_data(train_x, train_y)
            model.train(val_x, val_y)

            accuracy = evaluate_model_nn(model, test_x, test_y, plot_roc=False)
            file.write("%d, %d, %4.2f, %d" %
                       (len(train_set), len(test_set), accuracy, len(item)))

            i += 1
        file.close()
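
# Hedged sketch of the randomword helper used above for per-run output filenames
# (an assumption -- the project's version may differ).
def randomword_sketch(length):
    import random
    import string
    return ''.join(random.choice(string.ascii_lowercase) for _ in range(length))
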
# Example #12
def main():
    # parse arguments
    parser = argparse.ArgumentParser(description='Run unsupervised methods',
                                     add_help=True)
    parser.add_argument("-m",
                        "--model",
                        action="store",
                        required=True,
                        dest="MODELS",
                        nargs='+',
                        choices=[
                            'all', 'kmeans', 'lda', 'dbscan', 'birch',
                            'hierarchical', 'gmm', 'meanshift', 'spectral',
                            'affinity'
                        ],
                        help="Run model")
    parser.add_argument(
        "-r",
        "--rep",
        action="store",
        required=False,
        dest="REP",
        choices=['bow', 'tfidf', 'doc2vec', 'pca'],
        help=
        "Use bag of words representation (BOW), tfidf, doc2vec representation, or PCA"
    )
    parser.add_argument("--use-autoencoder",
                        action="store_true",
                        dest="USE_AUTOENCODER",
                        help="Use autoencoders to reduce representations")
    parser.add_argument("--use-doc2vec",
                        action="store_true",
                        dest="USE_DOC2VEC",
                        help="Use doc2vec representations")
    parser.add_argument(
        "--queries",
        action="store",
        dest="queries",
        nargs='+',
        help=
        "Return closest neighbours for query words to test search querying capabilities"
    )
    parser.add_argument(
        "--print-keywords",
        action="store_true",
        dest="PRINT_KEYWORDS",
        help="Use if you want to print keywords in each cluster")
    parser.add_argument("--find-optimal-k",
                        action="store_true",
                        dest="FIND_OPTIMAL_K",
                        help="Find optimal K")
    parser.add_argument("-s",
                        "--sample-size",
                        action="store",
                        required=False,
                        dest="SIZE",
                        help="Use smaller set")
    parser.add_argument("-d",
                        "--downsample-frac",
                        action="store",
                        required=False,
                        default=1.0,
                        dest="DOWNSAMPLE_FRAC",
                        type=float,
                        help="downsample fraction (0-1]")
    parser.add_argument(
        "--min-cluster-size",
        action="store",
        required=False,
        default=5,
        type=int,
        dest="MIN_CLUSTER_SIZE",
        help=
        "Filter out any ON WG IDENTIFIER classes with fewer than MIN_CLUSTER_SIZE samples"
    )
    parser.add_argument(
        "-n",
        "--num-clusters",
        action="store",
        required=False,
        default=1500,
        dest="NUM_CLUSTERS",
        help="Number of clusters for algorithms that require it")
    args = parser.parse_args()
    #print(args.MODELS)

    # the downsample fraction must lie in (0, 1]
    assert 0.0 < args.DOWNSAMPLE_FRAC <= 1.0

    # get data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    if args.SIZE:
        subset_df = df.sample(n=int(args.SIZE))
        train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(
            subset_df)
    elif args.DOWNSAMPLE_FRAC:
        subset_df = df.sample(frac=float(args.DOWNSAMPLE_FRAC))
        train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(
            subset_df)
    else:
        train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(
            df)
    #train_x_raw = pd.concat([train_x_raw, test_x_raw], axis=0)
    #train_y_raw = pd.concat([train_y_raw, test_y_raw], axis=0)
    train_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    test_x_raw.drop(['RIS PROCEDURE CODE'], axis=1, inplace=True)
    print(train_x_raw.shape)

    # identify ON WG IDENTIFIERS that occur infrequently
    #print("MIN_CLUSTER_SIZE: " + str(args.MIN_CLUSTER_SIZE))
    min_samples = args.MIN_CLUSTER_SIZE
    train_y_list = train_y_raw['ON WG IDENTIFIER'].values.tolist()
    unique_ids = list(set(train_y_list))
    small_clusters = list()
    for i in unique_ids:
        if train_y_list.count(i) < min_samples:
            small_clusters.append(i)
    train_x_raw = train_x_raw[~train_y_raw['ON WG IDENTIFIER'].
                              isin(small_clusters)]
    train_y_raw = train_y_raw[~train_y_raw['ON WG IDENTIFIER'].
                              isin(small_clusters)]
    #print(train_y_raw['ON WG IDENTIFIER'])
    #print(len(unique_ids))
    num_clusters = len(unique_ids) - len(small_clusters)
    #print("NUM_CLUSTERS: " + str(num_clusters))
    print(train_x_raw.shape)

    # append the ON WG IDENTIFIERS to the original documents
    train_y_raw = pd.concat([train_x_raw, train_y_raw], axis=1)
    test_y_raw = pd.concat([test_x_raw, test_y_raw], axis=1)

    # tokenize and subsample
    tokens_train, train_y_raw = tokenize_columns(
        train_x_raw,
        train_y_raw,
        regex_string=r'[a-zA-Z0-9]+',
        save_missing_feature_as_string=False,
        remove_short=True,
        remove_num=True,
        remove_empty=True)
    tokens_test, test_y_raw = tokenize_columns(
        test_x_raw,
        test_y_raw,
        regex_string=r'[a-zA-Z0-9]+',
        save_missing_feature_as_string=False,
        remove_short=True,
        remove_num=True,
        remove_empty=True)
    #print("done tokenizing columns")
    print(train_x_raw.shape)
    # get representation of data
    feature_names = list()
    train_x = list()
    train_y = list()
    test_x = list()
    test_y = list()
    #print(test_x_raw.shape)
    if args.REP == "bow" or args.USE_AUTOENCODER:
        train_x, train_y, feature_names = tokens_to_bagofwords(
            tokens_train, train_y_raw, CountVectorizer)
        test_x, test_y, _ = tokens_to_bagofwords(tokens_test,
                                                 test_y_raw,
                                                 CountVectorizer,
                                                 feature_names=feature_names)
        train_x = train_x.toarray()
        test_x = test_x.toarray()
        #print("done converting to bag of words representation")
    elif args.REP == "tfidf":
        train_x, train_y, feature_names = tokens_to_bagofwords(
            tokens_train, train_y_raw, TfidfVectorizer)
        test_x, test_y, _ = tokens_to_bagofwords(tokens_test,
                                                 test_y_raw,
                                                 TfidfVectorizer,
                                                 feature_names=feature_names)
        print(train_x.shape)
        train_x = train_x.toarray()
        test_x = test_x.toarray()
        #print("done converting to tfidf representation")
    elif args.REP == "doc2vec":
        train_x, train_y, _ = tokens_to_doc2vec(tokens_train, train_y_raw)
        test_x, test_y, _ = tokens_to_doc2vec(tokens_test, test_y_raw)
        #print("done converting to doc2vec representation")
    elif args.REP == "pca":
        train_x, train_y, feature_names = tokens_to_bagofwords(
            tokens_train, train_y_raw, CountVectorizer)
        test_x, test_y, _ = tokens_to_bagofwords(tokens_test,
                                                 test_y_raw,
                                                 CountVectorizer,
                                                 feature_names=feature_names)

        # get the number of components needed to explain 90% of the variance
        pca = PCA()
        pca.fit(train_x.toarray())
        var = np.cumsum(pca.explained_variance_ratio_)
        n_comp = np.argmax(var > .9) + 1
        # fit pca on the train set and apply the same projection to the test set
        pca = PCA(n_components=n_comp)
        train_x = pca.fit_transform(train_x.toarray())
        test_x = pca.transform(test_x.toarray())

    VOCAB_SIZE = train_x.shape[1]
    if args.USE_AUTOENCODER:
        #print(int(len(data_reader.get_region_labels()['Code'])))
        # use an autoencoder to compress the representation to REP_SIZE dimensions
        REP_SIZE = 100
        encoder = get_encoder(train_x, test_x, REP_SIZE)
        train_x = encoder.predict(train_x)
        test_x = encoder.predict(test_x)
        #print("done converting to autoencoder representation")

    # run models
    print("TRAIN_X SHAPE = " + str(train_x.shape) + ", VOCAB_SIZE = " +
          str(VOCAB_SIZE) + ", NUM_CLUSTERS = " + str(num_clusters) +
          ", MIN_CLUSTER_SIZE = " + str(args.MIN_CLUSTER_SIZE))
    if "kmeans" in args.MODELS or "all" in args.MODELS:
        kmeans = Kmeans(num_clusters, feature_names, train_x, train_y,
                        args.REP)
        kmeans.eval()
        labels = kmeans.get_labels()

        # print results
        print("kmeans, " + args.REP + ", " + str(args.DOWNSAMPLE_FRAC) + ", " +
              str(kmeans.get_sil_score()) + ", " +
              str(kmeans.get_db_idx_score()))
        if args.FIND_OPTIMAL_K:
            find_optimal_clusters(2000, feature_names, train_x, train_y,
                                  args.REP)
        #plot_cluster_size_frequency(train_x, labels, num_clusters)
        # example queries
        print("getting nearest: ")
        if args.queries:
            for q in args.queries:
                kmeans.get_nearest_neighbours(str(q))

        if args.PRINT_KEYWORDS:
            # get top keywords for clusters
            print("getting top 10 keywords for each cluster: ")
            get_top_keywords(train_x, labels, feature_names, 10)
        '''
                                                                  
        # plot 500 random clusters
        plt.figure(figsize=(10, 7)) 
        fig, ax = plt.subplots()
        print("number of unique labels: " + str(len(np.unique(labels))))
        num_clusters_to_plot = 50
        tsne = TSNE(n_components=2, verbose=1)
        random_clusters = random.sample(range(1, num_clusters), num_clusters_to_plot)
        reduced_data = tsne.fit_transform(train_x.todense())
        cmap = plt.cm.get_cmap('rainbow',num_clusters_to_plot)

        for i in range(num_clusters_to_plot):
            l = random_clusters[i]
            print("cluster " + str(l))
            indices = np.where(labels == l)
            col = cmap(i)
            cluster_reduced_data = reduced_data[indices[0]]
            print(cluster_reduced_data.shape)
            plt.scatter(cluster_reduced_data[:,0], cluster_reduced_data[:,1], color=col)
        plt.savefig('kmeans_' +  args.REP + '_' + str(num_clusters_to_plot) + '.tsne.png')  '''
    if "lda" in args.MODELS or "all" in args.MODELS:
        # run lda
        lda = Lda(train_x_raw, train_y_raw, 1500, passes=15)
        lda.train()
        print("finished running lda")
    if "dbscan" in args.MODELS or "all" in args.MODELS:
        # run dbscan
        dbs = DBscan(num_clusters, feature_names, train_x, train_y, args.REP)
        dbs.eval()
        print("dbscan, " + args.REP + ", " + str(dbs.get_sil_score()) + ", " +
              str(dbs.get_db_idx_score()))
    if "birch" in args.MODELS or "all" in args.MODELS:
        b = Birch_(num_clusters, feature_names, train_x, train_y, args.REP)
        print("birch, " + args.REP + ", " + str(b.get_sil_score()) + ", " +
              str(b.get_db_idx_score()))
    if "hierarchical" in args.MODELS or "all" in args.MODELS:
        h = Hierarchical(num_clusters, feature_names, train_x, train_y,
                         args.REP)
        print("hierarchical, " + args.REP + ", " + str(h.get_sil_score()) +
              ", " + str(h.get_db_idx_score()))
        labels = h.get_labels()
        # get top keywords for clusters
        if args.PRINT_KEYWORDS:
            print("getting top 10 keywords for each cluster: ")
            get_top_keywords(train_x, labels, feature_names, 10)
        if args.FIND_OPTIMAL_K:
            find_optimal_clusters(2000, feature_names, train_x, train_y,
                                  args.REP)
        if args.queries:
            h.get_nearest_neighbours(args.queries)
    if "gmm" in args.MODELS or "all" in args.MODELS:
        gmm = GMM(num_clusters, feature_names, train_x, train_y, args.REP)
        print("GMM, " + args.REP + ", " + str(gmm.get_sil_score()) + ", " +
              str(gmm.get_db_idx_score()))
    if "meanshift" in args.MODELS or "all" in args.MODELS:
        ms = Meanshift(feature_names, train_x, train_y, args.REP)
        print("meanshift, " + args.REP + ", " + str(ms.get_sil_score()) +
              ", " + str(ms.get_db_idx_score()))
    if "spectral" in args.MODELS or "all" in args.MODELS:
        sp = Spectral(num_clusters, feature_names, train_x, train_y, args.REP)
        print("spectral, " + args.REP + ", " + str(sp.get_sil_score()) + ", " +
              str(sp.get_db_idx_score()))
    if "affinity" in args.MODELS or "all" in args.MODELS:
        af = Affinity(num_clusters, feature_names, train_x, train_y, args.REP)
        print("affinity, " + args.REP + ", " + str(af.get_sil_score()) + ", " +
              str(af.get_db_idx_score()))
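
# Example invocation (illustrative; the script name is hypothetical, the flags are
# the ones defined by the parser above):
#   python run_unsupervised.py -m kmeans birch -r tfidf -d 0.25 --print-keywords
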
# Example #13
import numpy as np
from data_reader import DataReader
import warnings

warnings.filterwarnings("ignore")

# get all the labelled data
reader = DataReader()
data = reader.get_all_data()

print(
    '\n================== Before removing missing values ==================\n')
print('No. of samples: {}'.format(len(data)))
print('No. of classes: {}'.format(data['ON WG IDENTIFIER'].nunique()))
counts = data.groupby(['ON WG IDENTIFIER']).size().to_frame(name='counts') \
                        .sort_values(['counts']).values
print('Max no. of samples for a class: {}'.format(counts[-1][-1]))
print('Min no. of samples for a class: {}'.format(counts[0][0]))
print('Avg no. of samples for a class: {}'.format(round(np.mean(counts), 2)))
print(
    '\n===================================================================\n')

print(
    '================== After removing missing values ====================\n')
# drop rows with missing values
dataNoNan = data.dropna()
print('No. of samples: {}'.format(len(dataNoNan)))
print('No. of classes: {}'.format(dataNoNan['ON WG IDENTIFIER'].nunique()))

counts = dataNoNan.groupby(['ON WG IDENTIFIER']).size().to_frame(name='counts') \
                        .sort_values(['counts']).values