def train_doc2vec(args):
    """Train a Doc2Vec model on the train and test documents and save it.

    Both files are read with ``data_processor.read_data``; only the first
    element of each returned pair is used, the second is ignored.

    Args:
        args: object exposing ``train_path`` and ``test_path`` attributes.
    """
    X, _ = data_processor.read_data(args.train_path)
    X_t, _ = data_processor.read_data(args.test_path)

    corpus = np.concatenate((X, X_t), axis=0)
    # Tags must be a list: gensim iterates over ``tags``, so a bare string
    # such as '12' would be treated as the two separate tags '1' and '2'.
    documents = [
        TaggedDocument(words, [str(idx)]) for idx, words in enumerate(corpus)
    ]
    model = Doc2Vec(documents, vector_size=300, window=8, min_count=0, workers=4)
    model.save('model/doc2vec')
예제 #2
0
def _load_binary_split(trainingset_path, mode, label):
    """Read one split, binarize its targets and print its class balance."""
    features, targets = read_data(trainingset_path, mode)
    # A target strictly below -1 becomes the positive class (1), else 0.
    targets = targets.apply(lambda value: 1 if value < -1 else 0)
    print('{} set balance: {}/{}'.format(label, np.count_nonzero(targets),
                                         len(targets)))
    return features, targets


def _as_model_arrays(features, targets):
    """Reshape features to (-1, n_columns, 1) and targets to (-1, 1)."""
    x = np.reshape(features.values, (-1, features.shape[1], 1))
    y = np.reshape(targets.values, (-1, 1))
    return x, y


def input_fn(trainingset_path, use_weights):
    """Load the train/eval/infer splits as arrays, with optional class weights.

    Args:
        trainingset_path: location passed to ``read_data`` for every split.
        use_weights: when truthy, compute 'balanced' class weights.

    Returns:
        Tuple ``(x_train, y_train, x_eval, y_eval, x_test, y_test, weights)``.
        The ``x_*`` arrays have a trailing axis of size 1, the ``y_*`` arrays
        are column vectors, and ``weights`` is ``None`` when ``use_weights``
        is falsy.
    """
    x_train, y_train = _load_binary_split(trainingset_path, 'train', 'Train')
    x_eval, y_eval = _load_binary_split(trainingset_path, 'eval', 'Eval')
    x_test, y_test = _load_binary_split(trainingset_path, 'infer', 'Infer')

    # Compute class weights to optimize binary cross-entropy.
    # They are derived from the training labels: the loss is optimized on the
    # training split, and using the infer split would leak its distribution.
    # Keyword arguments are required by recent scikit-learn releases and are
    # accepted by older ones as well.
    if use_weights:
        weights = compute_class_weight('balanced',
                                       classes=np.unique(y_train),
                                       y=y_train)
    else:
        weights = None

    x_train, y_train = _as_model_arrays(x_train, y_train)
    x_eval, y_eval = _as_model_arrays(x_eval, y_eval)
    x_test, y_test = _as_model_arrays(x_test, y_test)
    return x_train, y_train, x_eval, y_eval, x_test, y_test, weights
예제 #3
0
def train_and_test():
    """Fit a KNN model on the splice training data and score it on the test set.

    Features come from ``Features.amino_acid_count`` and predictions from
    ``KNN.predict_codon_cosine`` with the ``no_weight`` scheme.

    Prints a 3x3 confusion matrix (row = predicted class, column = true
    class) followed by the fraction of correct predictions.
    """
    training_data = dp.read_data('dataset/splice-Xtrain.dat',
                                 'dataset/splice-Ytrain.dat')
    test_data = dp.read_data('dataset/test40.txt', 'dataset/ytest40.txt')
    feature = Features()
    dp.remove_ambiguous_entry_plus(training_data)
    training_set = feature.amino_acid_count(training_data)
    test_set = feature.amino_acid_count(test_data)

    k_nearest_neighbors = KNN(training_set, 26)

    class_ids = (0, 1, 2)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0

    for feature_vector, correct_class in test_set:
        prediction = k_nearest_neighbors.predict_codon_cosine(
            feature_vector, k_nearest_neighbors.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Only pairs made of the three known classes are tallied; any other
        # value still counts towards the total but not the matrix.
        if prediction in class_ids and correct_class in class_ids:
            confusion_matrix[int(prediction), int(correct_class)] += 1

    # Single-argument print() behaves identically on Python 2 and Python 3.
    print(confusion_matrix)
    # Accuracy is undefined for an empty test set; report NaN instead of
    # raising ZeroDivisionError.
    print(correct / total if total else float('nan'))
예제 #4
0
def input_fn(mode, hparams, trainingset_path):
    """Build a pandas-backed estimator input function for one data split.

    Args:
        mode: split selector handed to ``read_data``.
        hparams: accepted for interface compatibility; not used here.
        trainingset_path: location handed to ``read_data``.

    Returns:
        The callable produced by ``pandas_input_fn``, configured with
        batches of 128, 10 epochs, shuffling enabled and 2 reader threads.
    """
    features, labels = read_data(trainingset_path, mode)
    make_input_fn = tf.compat.v1.estimator.inputs.pandas_input_fn
    input_options = {
        'y': labels,
        'batch_size': 128,
        'num_epochs': 10,
        'shuffle': True,
        'num_threads': 2,
    }
    return make_input_fn(features, **input_options)
예제 #5
0
def train_and_test():
    """Evaluate a KNN classifier on the splice test set.

    The training data has ``dp.remove_ambiguous_entry_plus`` applied before
    ``Features.amino_acid_count`` extracts features; the test data is used
    as read. Each test vector is classified with
    ``KNN.predict_codon_cosine`` using the ``no_weight`` scheme.

    Prints a 3x3 confusion matrix indexed as [predicted, true] and then the
    fraction of correct predictions.
    """
    training_data = dp.read_data("dataset/splice-Xtrain.dat", "dataset/splice-Ytrain.dat")
    test_data = dp.read_data("dataset/test40.txt", "dataset/ytest40.txt")
    feature = Features()
    dp.remove_ambiguous_entry_plus(training_data)
    training_set = feature.amino_acid_count(training_data)
    test_set = feature.amino_acid_count(test_data)

    k_nearest_neighbors = KNN(training_set, 26)

    known_classes = (0, 1, 2)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0

    for feature_vector, correct_class in test_set:
        prediction = k_nearest_neighbors.predict_codon_cosine(feature_vector, k_nearest_neighbors.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Values outside the three known classes are left out of the matrix
        # but are still included in the total.
        if prediction in known_classes and correct_class in known_classes:
            confusion_matrix[int(prediction), int(correct_class)] += 1

    # print() with one argument works the same on Python 2 and Python 3.
    print(confusion_matrix)
    # With no test samples the accuracy is undefined: print NaN rather than
    # failing with ZeroDivisionError.
    print(correct / total if total else float("nan"))
예제 #6
0
def train_and_test():
    """Fit a NaiveBayes model on the splice training data and score the test set.

    Features come from ``Features.simple``; the model is built as
    ``NaiveBayes(training_set, 4, False)``.

    Prints a 3x3 confusion matrix (row = predicted class, column = true
    class) followed by the fraction of correct predictions.
    """
    training_data = dp.read_data('dataset/splice-Xtrain.dat',
                                 'dataset/splice-Ytrain.dat')
    test_data = dp.read_data('dataset/test40.txt', 'dataset/ytest40.txt')
    feature = Features()
    training_set = feature.simple(training_data)
    test_set = feature.simple(test_data)

    # NOTE: dp.remove_ambiguous_entry(training_set) was disabled in the
    # original and remains disabled here.
    naive_bayes = NaiveBayes(training_set, 4, False)

    class_ids = (0, 1, 2)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in test_set:
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Only pairs made of the three known classes are tallied; any other
        # value still counts towards the total but not the matrix.
        if prediction in class_ids and correct_class in class_ids:
            confusion_matrix[int(prediction), int(correct_class)] += 1

    # Single-argument print() behaves identically on Python 2 and Python 3.
    print(confusion_matrix)
    # Accuracy is undefined for an empty test set; report NaN instead of
    # raising ZeroDivisionError.
    print(correct / total if total else float('nan'))
예제 #7
0
def train_and_test():
    """Evaluate a NaiveBayes classifier on the splice test set.

    Training and test data are both converted with ``Features.simple``; the
    classifier is created as ``NaiveBayes(training_set, 4, False)`` and
    queried through ``predict``.

    Prints a 3x3 confusion matrix indexed as [predicted, true] and then the
    fraction of correct predictions.
    """
    training_data = dp.read_data("dataset/splice-Xtrain.dat", "dataset/splice-Ytrain.dat")
    test_data = dp.read_data("dataset/test40.txt", "dataset/ytest40.txt")
    feature = Features()
    training_set = feature.simple(training_data)
    test_set = feature.simple(test_data)

    # NOTE: the dp.remove_ambiguous_entry(training_set) call stays disabled,
    # exactly as in the original.
    naive_bayes = NaiveBayes(training_set, 4, False)

    known_classes = (0, 1, 2)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in test_set:
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Values outside the three known classes are left out of the matrix
        # but are still included in the total.
        if prediction in known_classes and correct_class in known_classes:
            confusion_matrix[int(prediction), int(correct_class)] += 1

    # print() with one argument works the same on Python 2 and Python 3.
    print(confusion_matrix)
    # With no test samples the accuracy is undefined: print NaN rather than
    # failing with ZeroDivisionError.
    print(correct / total if total else float("nan"))
예제 #8
0
def train_and_val():
    """Fit a KNN model on the training indices and score the validation indices.

    Features come from ``Features.simple``. The train/validation index lists
    are read with ``dp.read_training_val_set``, and
    ``dp.remove_ambiguous_entry`` is applied to both subsets. Predictions
    use ``KNN.predict_diff_bases`` with the ``no_weight`` scheme.

    Prints a 3x3 confusion matrix (row = predicted class, column = true
    class) followed by the fraction of correct predictions.
    """
    training_data = dp.read_data('dataset/splice-Xtrain.dat',
                                 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set(
        'dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.simple(training_data)

    training_set = [features_labels_pair[index]
                    for index in training_set_indices]
    dp.remove_ambiguous_entry(training_set)
    k_nn = KNN(training_set, 19)

    validation_set = [features_labels_pair[index]
                      for index in validation_set_indices]
    dp.remove_ambiguous_entry(validation_set)

    class_ids = (0, 1, 2)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in validation_set:
        prediction = k_nn.predict_diff_bases(feature_vector, k_nn.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Only pairs made of the three known classes are tallied; any other
        # value still counts towards the total but not the matrix.
        if prediction in class_ids and correct_class in class_ids:
            confusion_matrix[int(prediction), int(correct_class)] += 1

    # Single-argument print() behaves identically on Python 2 and Python 3.
    print(confusion_matrix)
    # Accuracy is undefined for an empty validation set; report NaN instead
    # of raising ZeroDivisionError.
    print(correct / total if total else float('nan'))
def train_and_val():
    """Fit a NaiveBayes model on the training indices and score the validation indices.

    Features come from ``Features.simple``; the model is built as
    ``NaiveBayes(training_set, 4, False)``. Only the validation subset has
    ``dp.remove_ambiguous_entry`` applied.

    Prints a 3x3 confusion matrix (row = predicted class, column = true
    class) followed by the fraction of correct predictions.
    """
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set('dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.simple(training_data)

    training_set = [features_labels_pair[index] for index in training_set_indices]
    # NOTE: dp.remove_ambiguous_entry(training_set) was disabled in the
    # original and remains disabled here.
    naive_bayes = NaiveBayes(training_set, 4, False)

    validation_set = [features_labels_pair[index] for index in validation_set_indices]
    dp.remove_ambiguous_entry(validation_set)

    class_ids = (0, 1, 2)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in validation_set:
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Only pairs made of the three known classes are tallied; any other
        # value still counts towards the total but not the matrix.
        if prediction in class_ids and correct_class in class_ids:
            confusion_matrix[int(prediction), int(correct_class)] += 1

    # Single-argument print() behaves identically on Python 2 and Python 3.
    print(confusion_matrix)
    # Accuracy is undefined for an empty validation set; report NaN instead
    # of raising ZeroDivisionError.
    print(correct / total if total else float('nan'))
def train_and_val():
    """Fit a KNN model on the training indices and score the validation indices.

    Features come from ``Features.amino_acid_count``. Both subsets have
    ``dp.remove_ambiguous_entry_plus`` applied, and predictions use
    ``KNN.predict_codon_cosine`` with the ``no_weight`` scheme.

    Prints a 3x3 confusion matrix (row = predicted class, column = true
    class) followed by the fraction of correct predictions.
    """
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set('dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.amino_acid_count(training_data)

    training_set = [features_labels_pair[index] for index in training_set_indices]
    dp.remove_ambiguous_entry_plus(training_set)
    k_nn = KNN(training_set, 23)

    validation_set = [features_labels_pair[index] for index in validation_set_indices]
    dp.remove_ambiguous_entry_plus(validation_set)

    class_ids = (0, 1, 2)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in validation_set:
        prediction = k_nn.predict_codon_cosine(feature_vector, k_nn.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Only pairs made of the three known classes are tallied; any other
        # value still counts towards the total but not the matrix.
        if prediction in class_ids and correct_class in class_ids:
            confusion_matrix[int(prediction), int(correct_class)] += 1

    # Single-argument print() behaves identically on Python 2 and Python 3.
    print(confusion_matrix)
    # Accuracy is undefined for an empty validation set; report NaN instead
    # of raising ZeroDivisionError.
    print(correct / total if total else float('nan'))
def generate_train_val():
    """Generate the train/validation split for the splice training data.

    Reads the training data only to obtain its length, then hands that
    length and the two output paths to ``dp.generate_training_val_set``.
    """
    sample_count = len(
        dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat'))
    split_paths = ('dataset/train.txt', 'dataset/val.txt')
    dp.generate_training_val_set(sample_count, *split_paths)
def train_word2vec(args):
    """Train a Word2Vec model on the train and test documents and save it.

    Both files are read with ``data_processor.read_data``; only the first
    element of each returned pair is used, the second is ignored.

    Args:
        args: object exposing ``train_path`` and ``test_path`` attributes.
    """
    # The two reads were fused onto a single line before, which does not
    # parse; they are separate statements.
    X, _ = data_processor.read_data(args.train_path)
    X_t, _ = data_processor.read_data(args.test_path)

    documents = np.concatenate((X, X_t), axis=0)
    # NOTE(review): gensim 4.x renamed ``size`` to ``vector_size``; the
    # keyword is kept as written -- confirm against the installed version.
    model = Word2Vec(documents, size=300, window=8, min_count=0, workers=4)
    model.save('model/word2vec')
예제 #13
0
                        "Number of checkpoints to store (default: 5)")

# Device-placement flags; both are forwarded to tf.ConfigProto below.
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
# NOTE(review): _parse_flags() and __flags are underscore-prefixed members of
# the flags object -- presumably tied to a specific TensorFlow release;
# confirm before upgrading.
FLAGS._parse_flags()
# Dump every flag as NAME=value, sorted by flag name.
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

print("Loading data...")
s1, s2, score = data_processor.read_data(FLAGS.data_file)
# Wrap each score in its own one-element list so every sample is one row.
score = np.asarray([[s] for s in score])
sample_num = len(score)
# Index of the first dev sample: the leading train_sample_percentage
# fraction of the data is used for training, the remainder for dev.
train_end = int(sample_num * FLAGS.train_sample_percentage)

# TODO: Cross validation
# Sequential (unshuffled) split, applied identically to s1, s2 and score.
s1_train, s1_dev = s1[:train_end], s1[train_end:]
s2_train, s2_dev = s2[:train_end], s2[train_end:]
score_train, score_dev = score[:train_end], score[train_end:]
print("Train/Dev split: {:d}/{:d}".format(len(score_train), len(score_dev)))

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
예제 #14
0
def generate_train_val():
    """Produce the train/validation split files for the splice data set.

    The training data is loaded solely to count its entries; that count is
    passed to ``dp.generate_training_val_set`` together with the two
    destination paths.
    """
    features_path = 'dataset/splice-Xtrain.dat'
    labels_path = 'dataset/splice-Ytrain.dat'
    entry_count = len(dp.read_data(features_path, labels_path))

    train_path = 'dataset/train.txt'
    val_path = 'dataset/val.txt'
    dp.generate_training_val_set(entry_count, train_path, val_path)
예제 #15
0
def main():
    """Load the data set from ``constants.path_to_data`` and run part two on it."""
    # NOTE: the part_one_codes(data) step is disabled; only part two runs.
    loaded = data_processor.read_data(constants.path_to_data)
    part_two_codes(loaded)