def train_doc2vec(args):
    # Train a Doc2Vec model over the combined train and test documents.
    X, Y = data_processor.read_data(args.train_path)
    X_t, Y_t = data_processor.read_data(args.test_path)
    # TaggedDocument expects a list of tags; a bare string would be split into
    # one tag per character.
    documents = [TaggedDocument(x, [str(idx)])
                 for idx, x in enumerate(np.concatenate((X, X_t), axis=0))]
    model = Doc2Vec(documents, vector_size=300, window=8, min_count=0, workers=4)
    model.save('model/doc2vec')

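# A minimal usage sketch (an addition, not part of the original code): reload
# the saved Doc2Vec model and embed an unseen, pre-tokenized document. Assumes
# gensim >= 4, where document vectors live under `model.dv`.
def embed_document(tokens):
    from gensim.models.doc2vec import Doc2Vec
    model = Doc2Vec.load('model/doc2vec')
    vector = model.infer_vector(tokens)                    # 300-dim vector for the new text
    neighbors = model.dv.most_similar([vector], topn=5)    # closest training documents
    return vector, neighbors
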
def input_fn(trainingset_path, use_weights):
    # Load the three splits and binarize labels (positive class: raw value < -1).
    x_train, y_train = read_data(trainingset_path, 'train')
    y_train = y_train.apply(lambda x: 1 if x < -1 else 0)
    print('Train set balance: {}/{}'.format(np.count_nonzero(y_train), len(y_train)))
    x_eval, y_eval = read_data(trainingset_path, 'eval')
    y_eval = y_eval.apply(lambda x: 1 if x < -1 else 0)
    print('Eval set balance: {}/{}'.format(np.count_nonzero(y_eval), len(y_eval)))
    x_test, y_test = read_data(trainingset_path, 'infer')
    y_test = y_test.apply(lambda x: 1 if x < -1 else 0)
    print('Infer set balance: {}/{}'.format(np.count_nonzero(y_test), len(y_test)))
    # Compute class weights to balance the binary cross-entropy loss. Weights
    # are derived from the training labels so the test split stays untouched.
    if use_weights:
        weights = compute_class_weight(class_weight='balanced',
                                       classes=np.unique(y_train), y=y_train)
    else:
        weights = None
    # Reshape features to (samples, timesteps, 1) and labels to column vectors.
    x_train = np.reshape(x_train.values, (-1, x_train.shape[1], 1))
    y_train = np.reshape(y_train.values, (-1, 1))
    x_eval = np.reshape(x_eval.values, (-1, x_eval.shape[1], 1))
    y_eval = np.reshape(y_eval.values, (-1, 1))
    x_test = np.reshape(x_test.values, (-1, x_test.shape[1], 1))
    y_test = np.reshape(y_test.values, (-1, 1))
    return x_train, y_train, x_eval, y_eval, x_test, y_test, weights

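# A usage sketch (assumed wiring, not from the original code): feed the splits
# into a Keras model. `build_model` and the path are hypothetical; Keras wants
# class weights as a {class_index: weight} dict rather than an array.
x_train, y_train, x_eval, y_eval, x_test, y_test, weights = input_fn('data/', True)
class_weight = {i: w for i, w in enumerate(weights)} if weights is not None else None
model = build_model(input_shape=x_train.shape[1:])
model.fit(x_train, y_train, validation_data=(x_eval, y_eval),
          epochs=10, batch_size=128, class_weight=class_weight)
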
def train_and_test():
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    test_data = dp.read_data('dataset/test40.txt', 'dataset/ytest40.txt')
    feature = Features()
    dp.remove_ambiguous_entry_plus(training_data)
    training_set = feature.amino_acid_count(training_data)
    test_set = feature.amino_acid_count(test_data)
    k_nearest_neighbors = KNN(training_set, 26)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in test_set:
        prediction = k_nearest_neighbors.predict_codon_cosine(
            feature_vector, k_nearest_neighbors.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        # Rows index the predicted class, columns the true class.
        confusion_matrix[prediction, correct_class] += 1
    print(confusion_matrix)
    print(correct / total)

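# A follow-up sketch (an addition, not in the original code): derive per-class
# precision and recall from the prediction-by-truth confusion matrix that
# train_and_test builds.
def precision_recall(confusion_matrix):
    # Row sums are totals per predicted class, column sums per true class.
    predicted_totals = confusion_matrix.sum(axis=1)
    true_totals = confusion_matrix.sum(axis=0)
    diag = np.diag(confusion_matrix)
    precision = np.divide(diag, predicted_totals,
                          out=np.zeros_like(diag), where=predicted_totals > 0)
    recall = np.divide(diag, true_totals,
                       out=np.zeros_like(diag), where=true_totals > 0)
    return precision, recall
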
def input_fn(mode, hparams, trainingset_path):
    data_x, data_y = read_data(trainingset_path, mode)
    return tf.compat.v1.estimator.inputs.pandas_input_fn(
        data_x,
        y=data_y,
        batch_size=128,
        num_epochs=10,
        shuffle=True,
        num_threads=2,
    )

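# A usage sketch (assumed, not from the original code): the returned callable
# plugs directly into the tf.estimator train/evaluate API. `build_estimator`
# and the paths are hypothetical.
estimator = build_estimator(hparams)
estimator.train(input_fn=input_fn('train', hparams, 'data/train.csv'))
metrics = estimator.evaluate(input_fn=input_fn('eval', hparams, 'data/eval.csv'))
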
def train_and_test():
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    test_data = dp.read_data('dataset/test40.txt', 'dataset/ytest40.txt')
    feature = Features()
    training_set = feature.simple(training_data)
    test_set = feature.simple(test_data)
    # dp.remove_ambiguous_entry(training_set)
    naive_bayes = NaiveBayes(training_set, 4, False)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in test_set:
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        confusion_matrix[prediction, correct_class] += 1
    print(confusion_matrix)
    print(correct / total)

def train_and_val():
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set(
        'dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.simple(training_data)
    training_set = [features_labels_pair[index] for index in training_set_indices]
    dp.remove_ambiguous_entry(training_set)
    k_nn = KNN(training_set, 19)
    validation_set = [features_labels_pair[index] for index in validation_set_indices]
    dp.remove_ambiguous_entry(validation_set)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in validation_set:
        prediction = k_nn.predict_diff_bases(feature_vector, k_nn.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        confusion_matrix[prediction, correct_class] += 1
    print(confusion_matrix)
    print(correct / total)

def train_and_val():
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set(
        'dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.simple(training_data)
    training_set = [features_labels_pair[index] for index in training_set_indices]
    # dp.remove_ambiguous_entry(training_set)
    naive_bayes = NaiveBayes(training_set, 4, False)
    validation_set = [features_labels_pair[index] for index in validation_set_indices]
    dp.remove_ambiguous_entry(validation_set)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in validation_set:
        prediction = naive_bayes.predict(feature_vector)
        total += 1
        if prediction == correct_class:
            correct += 1
        confusion_matrix[prediction, correct_class] += 1
    print(confusion_matrix)
    print(correct / total)

def train_and_val():
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    training_set_indices, validation_set_indices = dp.read_training_val_set(
        'dataset/train.txt', 'dataset/val.txt')
    feature = Features()
    features_labels_pair = feature.amino_acid_count(training_data)
    training_set = [features_labels_pair[index] for index in training_set_indices]
    dp.remove_ambiguous_entry_plus(training_set)
    k_nn = KNN(training_set, 23)
    validation_set = [features_labels_pair[index] for index in validation_set_indices]
    dp.remove_ambiguous_entry_plus(validation_set)
    confusion_matrix = np.zeros([3, 3])
    correct = 0.0
    total = 0.0
    for feature_vector, correct_class in validation_set:
        prediction = k_nn.predict_codon_cosine(feature_vector, k_nn.no_weight)
        total += 1
        if prediction == correct_class:
            correct += 1
        confusion_matrix[prediction, correct_class] += 1
    print(confusion_matrix)
    print(correct / total)

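# A consolidation sketch (an addition, not from the original code): the
# evaluation loop repeated across the train_and_test/train_and_val variants
# above, factored into one helper that takes any predict(feature_vector)
# callable.
def evaluate(predict, labeled_set, n_classes=3):
    confusion_matrix = np.zeros([n_classes, n_classes])
    correct = 0.0
    for feature_vector, correct_class in labeled_set:
        prediction = predict(feature_vector)
        if prediction == correct_class:
            correct += 1
        confusion_matrix[prediction, correct_class] += 1
    print(confusion_matrix)
    print(correct / len(labeled_set))
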
def generate_train_val():
    training_data = dp.read_data('dataset/splice-Xtrain.dat', 'dataset/splice-Ytrain.dat')
    dp.generate_training_val_set(len(training_data), 'dataset/train.txt', 'dataset/val.txt')

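# A minimal sketch (hypothetical; dp.generate_training_val_set is not shown in
# this code) of what the index-splitting helper could look like: shuffle all
# sample indices and persist an 80/20 train/validation split.
def generate_training_val_set(n_samples, train_path, val_path, val_fraction=0.2):
    indices = np.random.permutation(n_samples)
    cut = int(n_samples * (1 - val_fraction))
    np.savetxt(train_path, indices[:cut], fmt='%d')
    np.savetxt(val_path, indices[cut:], fmt='%d')
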
def train_word2vec(args):
    # Train a Word2Vec model over the combined train and test corpora.
    X, Y = data_processor.read_data(args.train_path)
    X_t, Y_t = data_processor.read_data(args.test_path)
    documents = np.concatenate((X, X_t), axis=0)
    # gensim >= 4 renamed the `size` argument to `vector_size`, matching the
    # Doc2Vec call above.
    model = Word2Vec(documents, vector_size=300, window=8, min_count=0, workers=4)
    model.save('model/word2vec')

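# A usage sketch (an addition, not from the original code): reload the trained
# embeddings and query neighbors; 'token' stands in for any word in the corpus.
model = Word2Vec.load('model/word2vec')
vector = model.wv['token']                           # 300-dim embedding for one word
neighbors = model.wv.most_similar('token', topn=5)   # nearest words by cosine similarity
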
"Number of checkpoints to store (default: 5)") tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement") tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices") FLAGS = tf.flags.FLAGS FLAGS._parse_flags() print("\nParameters:") for attr, value in sorted(FLAGS.__flags.items()): print("{}={}".format(attr.upper(), value)) print("") print("Loading data...") s1, s2, score = data_processor.read_data(FLAGS.data_file) score = np.asarray([[s] for s in score]) sample_num = len(score) train_end = int(sample_num * FLAGS.train_sample_percentage) # TODO: Cross validation s1_train, s1_dev = s1[:train_end], s1[train_end:] s2_train, s2_dev = s2[:train_end], s2[train_end:] score_train, score_dev = score[:train_end], score[train_end:] print("Train/Dev split: {:d}/{:d}".format(len(score_train), len(score_dev))) with tf.Graph().as_default(): session_conf = tf.ConfigProto( allow_soft_placement=FLAGS.allow_soft_placement, log_device_placement=FLAGS.log_device_placement) sess = tf.Session(config=session_conf)
def main():
    data = data_processor.read_data(constants.path_to_data)
    # part_one_codes(data)
    part_two_codes(data)