def self_training_main():
    data_reader = DataReader()
    df = data_reader.get_all_data()

    # random split of data
    train_x_raw, train_y_raw, test_x_raw, test_y_raw = get_train_test_split(df)

    # set up train data
    train_tokens, train_y_raw = tokenize(train_x_raw, train_y_raw, save_missing_feature_as_string=False,
                                       remove_empty=True)
    train_x, train_y, feature_names = tokens_to_bagofwords(train_tokens, train_y_raw)

    # train model
    model = _get_nn_model_bag_of_words_simple_v2(train_x, train_y, data_reader.get_region_labels()['Code'],
                                                 epochs=50, batch_size=64)

    # set up test data
    test_tokens, test_y_raw = tokenize(test_x_raw, test_y_raw, save_missing_feature_as_string=False, remove_empty=True)
    test_x, test_y, _ = tokens_to_bagofwords(test_tokens, test_y_raw, feature_names=feature_names)

    # evaluate model
    evaluate_model_nn(model, test_x, test_y, plot_roc=False)

    # ABOVE IS BASIC SUPERVISED LEARNING TO GENERATE MODEL
    #################################################
    # BELOW IS SEMI-SUPERVISED SELF-TRAINING TO FURTHER TRAIN MODEL

    # read unlabelled data and format it to be the same as labelled data
    unlabelled_df = data_reader.get_east_dir()
    unlabelled_df = normalize_east_dir_df(unlabelled_df)

    # set up unlabelled data as semi-supervised data
    # the data is unlabelled, so the frame itself is passed as a placeholder label argument
    # and the returned labels are discarded
    tokens, _ = tokenize(unlabelled_df, unlabelled_df, save_missing_feature_as_string=False, remove_empty=True)
    semi_x_base, _, _ = tokens_to_bagofwords(tokens, tokens, feature_names=feature_names)

    # Confidence threshold to train on
    train_threshold = 0.8
    semi_train_amount = 30

    # SELF TRAIN MANY TIMES
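    # each pass: predict on the remaining unlabelled data, keep only predictions above
    # train_threshold as pseudo-labels, fit on them, re-fit on the original labelled
    # training data, and drop the pseudo-labelled rows from the unlabelled pool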
    for i in range(semi_train_amount):
        # get predictions on unlabelled data
        pred = model.model.predict(semi_x_base)
        # convert probabilities to a one-hot encoded output
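        # e.g. a predicted row of [0.1, 0.7, 0.2] becomes the pseudo-label [0, 1, 0]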
        semi_y = np.zeros_like(pred)
        semi_y[np.arange(len(pred)), pred.argmax(1)] = 1
        # filter semi_x and semi_y to only include predictions above train_threshold
        semi_y = semi_y[pred.max(axis=1) > train_threshold]
        semi_x = semi_x_base[pred.max(axis=1) > train_threshold]

        # train on semi supervised data
        model.model.fit(semi_x, semi_y, batch_size=64, epochs=100)
        # retrain on original train data
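        # (train_y is passed through the model's label encoder so it is one-hot encoded,
        # matching the pseudo-label format used above)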
        model.model.fit(train_x, model.encoder.transform(train_y), batch_size=32, epochs=10)

        # evaluate model
        evaluate_model_nn(model, test_x, test_y, plot_roc=False)

        # remove semi data used in this iteration from future iterations
        semi_x_base = semi_x_base[~(pred.max(axis=1) > train_threshold)]


def eval_ae():
    from Models.logistic_regression import MultiClassLogisticRegression
    from Models.random_forest import RandomForest
    from Models.naive_bayes import NaiveBayes
    from Models.svm import SVM
    # load data
    data_reader = DataReader()
    df = data_reader.get_all_data()

    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)
    train_x, train_y, val_x, val_y, test_x, test_y = bag_of_words_full_no_empty_val_no_num_no_short_no_repeat(
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw)
    # Train an auto encoder of size 4096
    encoder = get_encoder(train_x, test_x, 4096)
    # use auto encoder to encode the train, validate and test sets
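    # (each row is compressed to a 4096-dimensional dense representation that replaces the
    # raw bag of words as input to the models below)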
    encoded_train = encoder.predict(train_x)
    encoded_test = encoder.predict(test_x)
    encoded_val = encoder.predict(val_x)

    # train the neural network model and calculate the precision, recall, f1 score, and accuracy
    print('neural net ae')
    model = _get_nn_model_bag_of_words_simple_scratch(
        encoded_train,
        train_y,
        encoded_val,
        val_y,
        data_reader.get_region_labels()['Code'],
        epochs=100,
        batch_size=256)
    eval_nn(model, encoded_test, test_y)
    evaluate_model_nn(model, encoded_test, test_y)
    # train the logistic regression model and calculate the precision, recall, f1 score, and accuracy
    print('logistic regression ae')
    model = MultiClassLogisticRegression()
    model.train(encoded_train, train_y)
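    # eval_model is handed a small stand-in object that exposes the trained model via a
    # .model attribute, mirroring the interface of the neural-net wrapper above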
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)

    # train the random forest model and calculate the precision, recall, f1 score, and accuracy
    print('random forest ae')
    model = RandomForest()
    model.train(encoded_train, train_y)
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)

    # train the naive bayes model and calculate the precision, recall, f1 score, and accuracy
    print('naive bayes ae')
    model = NaiveBayes()
    model.train(encoded_train, train_y)
    model_obj = lambda: None
    model_obj.model = model
    eval_model(model_obj, encoded_test, test_y)
    evaluate_model(model, encoded_test, test_y)


def original_model():
    # load the data and build the bag-of-words features up front so that feature_names,
    # data_reader and top_x_predictions are available whether the network is trained or loaded
    data_reader = DataReader()
    df = data_reader.get_all_data()

    top_x_predictions = 10

    # Split data
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)

    # get bag of words
    train_x, train_y, val_x, val_y, test_x, test_y, feature_names = get_bag_of_words(
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw,
        test_y_raw)
    # get all labels
    labels = data_reader.get_region_labels()['Code']

    if not os.path.isfile('demo_nn.h5'):
        # train neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.train(val_x, val_y)

        # save neural net
        model.model.save('demo_nn.h5')
    else:
        # load neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.model = load_model(
            'demo_nn.h5', custom_objects={'top_3_accuracy': top_3_accuracy})

    regex_string = r'[a-zA-Z0-9]+'
    while True:
        stdin = input("Enter all information:")
        if stdin == 'quit':
            break
        try:
            top_x_predictions = int(stdin)
            print("Will return top " + str(top_x_predictions) + " predictions")
        except ValueError:
            tokenizer = RegexpTokenizer(regex_string)
            tokens = tokenizer.tokenize(stdin.lower())

            vectorizer = CountVectorizer(tokenizer=lambda x: x,
                                         lowercase=False,
                                         strip_accents=False,
                                         vocabulary=feature_names)

            model_input = vectorizer.fit_transform([tokens])
            pred = model.model.predict(model_input)

            # Top X Predictions
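            # np.argpartition(pred[0], -top_x_predictions)[-top_x_predictions:] gives the indices
            # of the top_x_predictions most probable classes (in no particular order); each one is
            # one-hot encoded so the label encoder can map it back to a region code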
            rows = []
            for i in range(top_x_predictions):
                one_hot_pred = np.zeros_like(pred)
                one_hot_pred[np.arange(len(pred)),
                             (np.argpartition(pred[0], -top_x_predictions
                                              )[-top_x_predictions:][i])] = 1
                id = model.encoder.inverse_transform(one_hot_pred)[0][0]
                row = data_reader.regional_df[data_reader.regional_df['Code']
                                              == id].copy()
                row['Prediction Confidence'] = (pred[0][(np.argpartition(
                    pred[0],
                    -top_x_predictions)[-top_x_predictions:][i])]) * 100
                rows.append(row)

            rows = (pd.concat(rows)).sort_values('Prediction Confidence',
                                                 ascending=False)
            print(rows)


def main():
    config = tf.ConfigProto(device_count={'GPU': 0})
    #    config.gpu_options.per_process_gpu_memory_fraction = 0.64
    set_session(tf.Session(config=config))

    data_reader = DataReader()
    df = data_reader.get_all_data()

    # Split data
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)

    # get bag of words
    train_x, train_y, val_x, val_y, test_x, test_y, feature_names = get_bag_of_words(
        train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw)
    # get all labels
    labels = data_reader.get_region_labels()['Code']

    if not os.path.isfile('demo_nn.h5'):
        # train neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.train(val_x, val_y)

        # save neural net
        model.model.save('demo_nn.h5')
    else:
        # load neural net
        model = MultiClassNNScratch(train_x.shape,
                                    np.array(labels),
                                    epochs=150,
                                    batch_size=1024)
        model.set_train_data(train_x, train_y)
        model.model = load_model(
            'demo_nn.h5', custom_objects={'top_3_accuracy': top_3_accuracy})

    regex_string = r'[a-zA-Z0-9]+'
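    # incoming free text is lowercased and split into alphanumeric tokens before being
    # vectorized against the training vocabulary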
    while True:
        stdin = input("Enter all information:")
        if stdin == 'quit':
            break

        tokenizer = RegexpTokenizer(regex_string)
        tokens = tokenizer.tokenize(stdin.lower())

        vectorizer = CountVectorizer(tokenizer=lambda x: x,
                                     lowercase=False,
                                     strip_accents=False,
                                     vocabulary=feature_names)
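        # with a fixed vocabulary the vectorizer produces a bag-of-words vector whose columns
        # line up with the features the model was trained on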

        model_input = vectorizer.fit_transform([tokens])
        pred = model.model.predict(model_input)

        # one-hot encode the most probable class so the label encoder can map it back to a code
        one_hot_pred = np.zeros_like(pred)
        one_hot_pred[np.arange(len(pred)), pred.argmax(1)] = 1

        id = model.encoder.inverse_transform(one_hot_pred)[0][0]
        row = data_reader.regional_df[data_reader.regional_df['Code'] == id]

        print(row)


def eval_pub_med():
    from gensim.models.keyedvectors import KeyedVectors
    # Need to download file from http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
    # Load the pubmed model
    model = KeyedVectors.load_word2vec_format(
        'wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
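    # the pretrained PubMed/Wikipedia word vectors are 200-dimensional, which is why empty
    # documents fall back to np.zeros(200) below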
    # Load data into train/validate/test sets
    data_reader = DataReader()
    df = data_reader.get_all_data()
    train_x_raw, train_y_raw, val_x_raw, val_y_raw, test_x_raw, test_y_raw = get_train_validate_test_split(
        df)
    tokens_train, train_y_raw = tokenize(train_x_raw,
                                         train_y_raw,
                                         save_missing_feature_as_string=False,
                                         remove_empty=True)
    # for each tokenized record in the train set, look up the vector for each word and take the average.
    # If none of the words are in the PubMed vocabulary, append a zero vector instead
    avg = []
    for item in tokens_train:
        words = []
        for word in item:
            if word in model.wv.vocab:
                vec = model.get_vector(word)
                words.append(vec)
        average = np.average(np.array(words), axis=0)
        if type(average) == np.float64:
            print('****')
            print(average)
            avg.append(np.zeros(200))
        else:
            avg.append(list(average))
    pub_med_train = np.array(avg)

    # run the same for the validation set
    tokens_val, val_y_raw = tokenize(val_x_raw,
                                     val_y_raw,
                                     save_missing_feature_as_string=False,
                                     remove_empty=True)
    avg = []
    for item in tokens_val:
        words = []
        for word in item:
            if word in model.wv.vocab:
                vec = model.get_vector(word)
                words.append(vec)
        average = np.average(np.array(words), axis=0)
        if type(average) == np.float64:
            print('****')
            print(average)
            avg.append(np.zeros(200))
        else:
            avg.append(list(average))
    pub_med_val = np.array(avg)

    # run the same for the test set
    tokens_test, test_y_raw = tokenize(test_x_raw,
                                       test_y_raw,
                                       save_missing_feature_as_string=False,
                                       remove_empty=True)
    avg = []
    for item in tokens_test:
        words = []
        for word in item:
            if word in model.wv.vocab:
                vec = model.get_vector(word)
                words.append(vec)
        average = np.average(np.array(words), axis=0)
        if type(average) == np.float64:
            print('****')
            print(average)
            avg.append(np.zeros(200))
        else:
            avg.append(list(average))
    pub_med_test = np.array(avg)

    # train the neural network model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, nn")
    nn_model = _get_nn_model_bag_of_words_simple_scratch(
        pub_med_train,
        train_y_raw,
        pub_med_val,
        val_y_raw,
        data_reader.get_region_labels()['Code'],
        epochs=100,
        batch_size=256)
    eval_nn(nn_model, pub_med_test, test_y_raw)
    evaluate_model_nn(nn_model, pub_med_test, test_y_raw, plot_roc=False)
    # train the logistic regression model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, logistic regression")
    from Models.logistic_regression import MultiClassLogisticRegression
    log_reg = MultiClassLogisticRegression()
    log_reg.train(pub_med_train, train_y_raw)
    eval_model(log_reg, pub_med_test, test_y_raw)
    evaluate_model(log_reg, pub_med_test, test_y_raw, plot_roc=False)
    # train the random forest model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, random forest")
    from Models.random_forest import RandomForest
    rand_for = RandomForest()
    rand_for.train(pub_med_train, train_y_raw)
    eval_model(rand_for, pub_med_test, test_y_raw)
    evaluate_model(rand_for, pub_med_test, test_y_raw, plot_roc=False)
    # train the naive bayes model and calculate the precision, recall, f1 score, and accuracy
    print("pubmed, naivebayes")
    from Models.naive_bayes import NaiveBayes
    nb = NaiveBayes()
    nb.train(pub_med_train, train_y_raw)
    eval_model(nb, pub_med_test, test_y_raw)
    evaluate_model(nb, pub_med_test, test_y_raw, plot_roc=False)


def per_site_accuracy_increase():
    # load data
    data_reader = DataReader()
    df = data_reader.get_all_data()
    all_tokens, _ = tokenize(df,
                             df,
                             save_missing_feature_as_string=False,
                             remove_empty=True)
    _, _, vocab = tokens_to_bagofwords(all_tokens, all_tokens)
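    # the vocabulary is built once from the full dataset so every per-site split below is
    # vectorized into the same feature space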

    lst = []
    from random import shuffle

    # split data on source hospital and save to separate dataframes in a list
    for i in df['src_file'].unique():
        lst.append(df[df['src_file'] == i])

    from Models.neural_net import MultiClassNNScratch

    # save an empty neural network so we can quickly reset the network
    model = MultiClassNNScratch(
        (0, len(vocab)),
        np.array(data_reader.get_region_labels()['Code']),
        epochs=100,
        batch_size=256)
    model.model.save_weights('empty_model.h5')

    # run evaluation some n times
    for _ in range(30):
        # shuffle the order
        shuffle(lst)
        # let i be the number of training sites, from 1 to len(lst)-1: train on the first i sites
        # and test on the remaining ones. Results are written to a file so they can easily be
        # visualized later; each of the 30 runs gets its own file.
        i = 1
        file = open("output_dir/" + randomword(7) + '.txt', "w")
        while i < len(lst):
            model.model.load_weights('empty_model.h5')
            train_set = lst[:i]
            test_set = lst[i:]

            test_x_raw, test_y_raw = get_x_y_split(pd.concat(test_set))
            test_tokens, test_y_raw = tokenize(
                test_x_raw,
                test_y_raw,
                save_missing_feature_as_string=False,
                remove_empty=True)
            test_x, test_y, _ = tokens_to_bagofwords(test_tokens,
                                                     test_y_raw,
                                                     feature_names=vocab)

            item = pd.concat(train_set)
            train_x_raw, train_y_raw, val_x_raw, val_y_raw = get_train_test_split(
                item)
            train_tokens, train_y_raw = tokenize(
                train_x_raw,
                train_y_raw,
                save_missing_feature_as_string=False,
                remove_empty=True)
            train_x, train_y, _ = tokens_to_bagofwords(train_tokens,
                                                       train_y_raw,
                                                       feature_names=vocab)

            val_tokens, val_y_raw = tokenize(
                val_x_raw,
                val_y_raw,
                save_missing_feature_as_string=False,
                remove_empty=True)
            val_x, val_y, _ = tokens_to_bagofwords(val_tokens,
                                                   val_y_raw,
                                                   feature_names=vocab)

            model.set_train_data(train_x, train_y)
            model.train(val_x, val_y)

            accuracy = evaluate_model_nn(model, test_x, test_y, plot_roc=False)
            file.write("%d, %d, %4.2f, %d" %
                       (len(train_set), len(test_set), accuracy, len(item)))

            i += 1
        file.close()