def KFOLD_KNN_parameter_test(X, y, n_cross_val=5, n_neighbors=5):
    """Sweep k from 2 to 20 and compare uniform vs. distance-weighted KNN.

    Returns the list of neighbor counts tried and the mean cross-validation
    score for each weighting scheme. The ``n_neighbors`` argument is kept for
    compatibility but is overridden by the sweep below.
    """
    weights1 = 'uniform'
    weights2 = 'distance'
    results_1 = []
    results_2 = []
    i = []
    for n_neighbors in range(2, 21):
        print('number of neighbors:', n_neighbors)
        # build two classifiers, one per weighting scheme
        clf1 = sklearn.neighbors.KNeighborsClassifier(n_neighbors,
                                                      weights=weights1)
        clf2 = sklearn.neighbors.KNeighborsClassifier(n_neighbors,
                                                      weights=weights2)
        scores1 = util.cross_validation(X, y, clf1, cv=n_cross_val)
        scores2 = util.cross_validation(X, y, clf2, cv=n_cross_val)
        i.append(n_neighbors)
        results_1.append(scores1.mean())
        results_2.append(scores2.mean())
    return i, results_1, results_2
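
# Example usage (a minimal sketch; `X`, `y`, and the matplotlib plot are
# illustrative and not part of the function above):
#
#   ks, acc_uniform, acc_distance = KFOLD_KNN_parameter_test(X, y, n_cross_val=5)
#   import matplotlib.pyplot as plt
#   plt.plot(ks, acc_uniform, label='uniform weights')
#   plt.plot(ks, acc_distance, label='distance weights')
#   plt.xlabel('number of neighbors')
#   plt.ylabel('mean cross-validation score')
#   plt.legend()
#   plt.show()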
def main(_):
    opts = Options(save_path=FLAGS.save_path,
                   train_biom=FLAGS.train_biom,
                   test_biom=FLAGS.test_biom,
                   train_metadata=FLAGS.train_metadata,
                   test_metadata=FLAGS.test_metadata,
                   formula=FLAGS.formula,
                   tree=FLAGS.tree,
                   learning_rate=FLAGS.learning_rate,
                   clipping_size=FLAGS.clipping_size,
                   beta_mean=FLAGS.beta_mean,
                   beta_scale=FLAGS.beta_scale,
                   gamma_mean=FLAGS.gamma_mean,
                   gamma_scale=FLAGS.gamma_scale,
                   epochs_to_train=FLAGS.epochs_to_train,
                   num_neg_samples=FLAGS.num_neg_samples,
                   batch_size=FLAGS.batch_size,
                   min_sample_count=FLAGS.min_sample_count,
                   min_feature_count=FLAGS.min_feature_count,
                   statistics_interval=FLAGS.statistics_interval,
                   summary_interval=FLAGS.summary_interval,
                   checkpoint_interval=FLAGS.checkpoint_interval)

    # preprocessing: drop low-coverage samples / rare features and align the
    # biom table with the metadata
    train_table, train_metadata = opts.train_table, opts.train_metadata
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]

    sample_filter = lambda val, id_, md: (
        (id_ in train_metadata.index) and np.sum(val) > opts.min_sample_count)
    read_filter = lambda val, id_, md: np.sum(val) > opts.min_feature_count
    metadata_filter = lambda val, id_, md: id_ in train_metadata.index

    train_table = train_table.filter(metadata_filter, axis='sample')
    train_table = train_table.filter(sample_filter, axis='sample')
    train_table = train_table.filter(read_filter, axis='observation')
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]

    sort_f = lambda xs: [xs[train_metadata.index.get_loc(x)] for x in xs]
    train_table = train_table.sort(sort_f=sort_f, axis='sample')
    train_metadata = dmatrix(opts.formula, train_metadata,
                             return_type='dataframe')

    # build the balance basis from the tree
    tree = opts.tree
    train_table, tree = match_tips(train_table, tree)
    basis, _ = sparse_balance_basis(tree)
    basis = basis.T

    # hold out data preprocessing
    test_table, test_metadata = opts.test_table, opts.test_metadata
    metadata_filter = lambda val, id_, md: id_ in test_metadata.index
    obs_lookup = set(train_table.ids(axis='observation'))
    feat_filter = lambda val, id_, md: id_ in obs_lookup

    test_table = test_table.filter(metadata_filter, axis='sample')
    test_table = test_table.filter(feat_filter, axis='observation')
    test_metadata = test_metadata.loc[test_table.ids(axis='sample')]

    sort_f = lambda xs: [xs[test_metadata.index.get_loc(x)] for x in xs]
    test_table = test_table.sort(sort_f=sort_f, axis='sample')
    test_metadata = dmatrix(opts.formula, test_metadata,
                            return_type='dataframe')
    test_table, tree = match_tips(test_table, tree)

    p = train_metadata.shape[1]   # number of covariates
    G_data = train_metadata.values
    y_data = train_table.matrix_data.tocoo().T
    y_test = np.array(test_table.matrix_data.todense()).T
    N, D = y_data.shape

    save_path = opts.save_path
    learning_rate = opts.learning_rate
    batch_size = opts.batch_size
    gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale
    beta_mean, beta_scale = opts.beta_mean, opts.beta_scale
    num_neg = opts.num_neg_samples
    clipping_size = opts.clipping_size

    epoch = y_data.nnz // batch_size
    num_iter = int(opts.epochs_to_train * epoch)
    holdout_size = test_metadata.shape[0]
    checkpoint_interval = opts.checkpoint_interval

    # Model code
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.device("/cpu:0"):
            # Placeholder variables to accept input data
            Gpos_ph = tf.placeholder(tf.float32, [batch_size, p], name='G_pos')
            Gneg_ph = tf.placeholder(tf.float32, [num_neg, p], name='G_neg')
            G_holdout = tf.placeholder(tf.float32, [holdout_size, p],
                                       name='G_holdout')
            Y_holdout = tf.placeholder(tf.float32, [holdout_size, D],
                                       name='Y_holdout')
            Y_ph = tf.placeholder(tf.float32, [batch_size], name='Y_ph')
            pos_row = tf.placeholder(tf.int32, shape=[batch_size],
                                     name='pos_row')
            pos_col = tf.placeholder(tf.int32, shape=[batch_size],
                                     name='pos_col')
            neg_row = tf.placeholder(tf.int32, shape=[num_neg], name='neg_row')
            neg_col = tf.placeholder(tf.int32, shape=[num_neg], name='neg_col')
            neg_data = tf.zeros(shape=[num_neg], name='neg_data',
                                dtype=tf.float32)

            total_zero = tf.constant(
                y_data.shape[0] * y_data.shape[1] - y_data.nnz,
                dtype=tf.float32)
            total_nonzero = tf.constant(y_data.nnz, dtype=tf.float32)

            # Define PointMass Variables first
            qgamma = tf.Variable(tf.random_normal([1, D - 1]), name='qgamma')
            qbeta = tf.Variable(tf.random_normal([p, D - 1]), name='qB')
            theta = tf.Variable(tf.random_normal([N, 1]), name='theta')

            # Distributions
            # species bias
            gamma = Normal(loc=tf.zeros([1, D - 1]) + gamma_mean,
                           scale=tf.ones([1, D - 1]) * gamma_scale,
                           name='gamma')
            # regression coefficients distribution
            beta = Normal(loc=tf.zeros([p, D - 1]) + beta_mean,
                          scale=tf.ones([p, D - 1]) * beta_scale,
                          name='B')

            Bprime = tf.concat([qgamma, qbeta], axis=0)

            # Add bias terms for samples
            Gpos = tf.concat([tf.ones([batch_size, 1]), Gpos_ph], axis=1)
            Gneg = tf.concat([tf.ones([num_neg, 1]), Gneg_ph], axis=1)

            # Convert basis to SparseTensor
            psi = tf.SparseTensor(
                indices=np.mat([basis.row, basis.col]).transpose(),
                values=basis.data,
                dense_shape=basis.shape)

            V = tf.transpose(
                tf.sparse_tensor_dense_matmul(psi, tf.transpose(Bprime)))

            # sparse matrix multiplication for positive samples
            pos_prime = tf.reduce_sum(
                tf.multiply(Gpos, tf.transpose(tf.gather(V, pos_col, axis=1))),
                axis=1)
            pos_phi = tf.reshape(tf.gather(theta, pos_row),
                                 shape=[batch_size]) + pos_prime
            Y = Poisson(log_rate=pos_phi, name='Y')

            # sparse matrix multiplication for negative samples
            neg_prime = tf.reduce_sum(
                tf.multiply(Gneg, tf.transpose(tf.gather(V, neg_col, axis=1))),
                axis=1)
            neg_phi = tf.reshape(tf.gather(theta, neg_row),
                                 shape=[num_neg]) + neg_prime
            neg_poisson = Poisson(log_rate=neg_phi, name='neg_counts')

            # MAP objective: priors plus re-weighted positive / negative
            # log-likelihoods
            loss = -(
                tf.reduce_sum(gamma.log_prob(qgamma)) +
                tf.reduce_sum(beta.log_prob(qbeta)) +
                tf.reduce_sum(Y.log_prob(Y_ph)) *
                (total_nonzero / batch_size) +
                tf.reduce_sum(neg_poisson.log_prob(neg_data)) *
                (total_zero / num_neg)
            )

            optimizer = tf.train.AdamOptimizer(learning_rate,
                                               beta1=0.9, beta2=0.9)
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients, clipping_size)
            train = optimizer.apply_gradients(zip(gradients, variables))

            with tf.name_scope('accuracy'):
                holdout_count = tf.reduce_sum(Y_holdout, axis=1)
                spred = tf.nn.softmax(
                    tf.transpose(
                        tf.sparse_tensor_dense_matmul(
                            psi,
                            tf.transpose(
                                tf.matmul(G_holdout, qbeta) + qgamma))))
                pred = tf.reshape(holdout_count, [-1, 1]) * spred
                mse = tf.reduce_mean(tf.squeeze(tf.abs(pred - Y_holdout)))
                tf.summary.scalar('mean_absolute_error', mse)

            tf.summary.scalar('loss', loss)
            tf.summary.histogram('qbeta', qbeta)
            tf.summary.histogram('qgamma', qgamma)
            tf.summary.histogram('theta', theta)
            merged = tf.summary.merge_all()

            tf.global_variables_initializer().run()

            writer = tf.summary.FileWriter(save_path, session.graph)

            losses = np.array([0.] * num_iter)
            idx = np.arange(train_metadata.shape[0])
            log_handle = open(os.path.join(save_path, 'run.log'), 'w')
            gen = get_batch(batch_size, N, D,
                            y_data.data, y_data.row, y_data.col,
                            num_neg=num_neg)
            last_checkpoint_time = 0
            start_time = time.time()
            saver = tf.train.Saver()

            for i in range(num_iter):
                batch_idx = np.random.choice(idx, size=batch_size)
                batch = next(gen)
                (positive_row, positive_col, positive_data,
                 negative_row, negative_col, negative_data) = batch
                feed_dict = {
                    Y_ph: positive_data,
                    Y_holdout: y_test.astype(np.float32),
                    G_holdout: test_metadata.values.astype(np.float32),
                    Gpos_ph: G_data[positive_row, :],
                    Gneg_ph: G_data[negative_row, :],
                    pos_row: positive_row,
                    pos_col: positive_col,
                    neg_row: negative_row,
                    neg_col: negative_col
                }
                if i % 1000 == 0:
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, train_loss, grads = session.run(
                        [train, merged, loss, gradients],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, 'step%d' % i)
                    writer.add_summary(summary, i)
                elif i % 5000 == 0:
                    _, err, summary, train_loss, grads = session.run(
                        [train, mse, merged, loss, gradients],
                        feed_dict=feed_dict)
                    writer.add_summary(summary, i)
                else:
                    _, summary, train_loss, grads = session.run(
                        [train, merged, loss, gradients],
                        feed_dict=feed_dict)
                    writer.add_summary(summary, i)

                now = time.time()
                if now - last_checkpoint_time > checkpoint_interval:
                    saver.save(session,
                               os.path.join(opts.save_path, "model.ckpt"),
                               global_step=i)
                    last_checkpoint_time = now

                losses[i] = train_loss

            elapsed_time = time.time() - start_time
            print('Elapsed Time: %f seconds' % elapsed_time)

            # Cross validation on the held out set
            pred_beta = qbeta.eval()
            pred_gamma = qgamma.eval()
            mse, mrc = cross_validation(
                test_metadata.values,
                pred_beta @ basis.T, pred_gamma @ basis.T, y_test)
            print("MSE: %f, MRC: %f" % (mse, mrc))
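
# The training loop above pulls minibatches from a `get_batch` generator that
# is defined elsewhere in the project. The sketch below only illustrates the
# interface assumed by the loop (positive examples drawn from the observed
# non-zero entries, negative examples drawn uniformly from the full N x D
# index space); the helper name `_get_batch_sketch` is hypothetical and the
# real implementation may sample differently.
def _get_batch_sketch(batch_size, N, D, data, row, col, num_neg=10):
    """Yield (pos_row, pos_col, pos_data, neg_row, neg_col, neg_data) forever."""
    nnz = len(data)
    while True:
        # positive examples: observed (sample, feature, count) triples
        pos_idx = np.random.choice(nnz, size=batch_size)
        # negative examples: random cells, treated as zero counts
        neg_row = np.random.randint(0, N, size=num_neg)
        neg_col = np.random.randint(0, D, size=num_neg)
        yield (row[pos_idx], col[pos_idx], data[pos_idx],
               neg_row, neg_col, np.zeros(num_neg))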
def main(_):
    options = Options(save_path=FLAGS.save_path,
                      train_biom=FLAGS.train_biom,
                      test_biom=FLAGS.test_biom,
                      train_metadata=FLAGS.train_metadata,
                      test_metadata=FLAGS.test_metadata,
                      formula=FLAGS.formula,
                      learning_rate=FLAGS.learning_rate,
                      clipping_size=FLAGS.clipping_size,
                      beta_mean=FLAGS.beta_mean,
                      beta_scale=FLAGS.beta_scale,
                      gamma_mean=FLAGS.gamma_mean,
                      gamma_scale=FLAGS.gamma_scale,
                      epochs_to_train=FLAGS.epochs_to_train,
                      num_neg_samples=FLAGS.num_neg_samples,
                      batch_size=FLAGS.batch_size,
                      block_size=FLAGS.block_size,
                      min_sample_count=FLAGS.min_sample_count,
                      min_feature_count=FLAGS.min_feature_count,
                      statistics_interval=FLAGS.statistics_interval,
                      summary_interval=FLAGS.summary_interval,
                      checkpoint_interval=FLAGS.checkpoint_interval)

    # preprocessing (i.e. filter and align the biom tables and metadata)
    (train_table, test_biom,
     train_metadata, test_metadata) = preprocess(
         options.formula,
         options.train_table, options.train_metadata,
         options.test_table, options.test_metadata,
         options.min_sample_count, options.min_feature_count)

    samp_ids = train_table.ids(axis='sample')
    obs_ids = train_table.ids(axis='observation')
    md_ids = np.array(train_metadata.columns)

    biom_train = train_table.matrix_data.tocoo().T
    biom_test = test_biom.matrix_data.tocoo().T

    # Model code
    with tf.Graph().as_default(), tf.Session() as session:
        model = PoissonRegression(options, session)
        model.initialize()
        gen = model.retrieve(train_table, train_metadata)
        y_feed, G_feed = next(gen)

        y_data = tf.sparse_placeholder(dtype=tf.int32,
                                       shape=(model.M, model.D),
                                       name='y_data_ph')
        G_data = tf.placeholder(tf.float32,
                                shape=(model.M, model.p),
                                name='G_data_ph')

        # setup cross validation data
        G_test = tf.constant(test_metadata.values, dtype=tf.float32)
        y_test = tf.SparseTensorValue(
            indices=np.array([biom_test.row, biom_test.col]).T,
            values=biom_test.data,
            dense_shape=biom_test.shape)

        positive_batch, random_batch = model.sample(y_data)
        log_loss = model.loss(G_data, y_data, positive_batch, random_batch)
        train_step, grads, variables = model.optimize(log_loss)
        mean_err = model.evaluate(G_test, y_test)
        tf.global_variables_initializer().run()

        # summary information
        tf.summary.histogram('qbeta', model.qbeta)
        tf.summary.histogram('qgamma', model.qgamma)
        tf.summary.scalar('mean_absolute_error', mean_err)
        for i, g in enumerate(grads):
            tf.summary.histogram('gradient/%s' % variables[i], g)
        merged = tf.summary.merge_all()

        last_checkpoint_time = 0
        last_summary_time = 0
        last_statistics_time = 0

        # initialize with a small minibatch
        train_, loss, err, beta, gamma = session.run(
            [train_step, log_loss, mean_err, model.qbeta, model.qgamma],
            feed_dict={y_data: y_feed, G_data: G_feed})

        epoch = model.num_nonzero // options.batch_size
        num_iter = int(options.epochs_to_train * epoch)
        saver = tf.train.Saver()

        writer = tf.summary.FileWriter(options.save_path, session.graph)
        start_time = time.time()
        k = 0
        for i in tqdm(range(1, num_iter)):
            now = time.time()
            # grab the next block
            if i % options.block_size == 0:
                y_feed, G_feed = next(gen)
                train_, loss, err, beta, gamma = session.run(
                    [train_step, log_loss, mean_err,
                     model.qbeta, model.qgamma],
                    feed_dict={y_data: y_feed, G_data: G_feed})
                k = k % options.block_size
            # check for summary
            elif now - last_summary_time > options.summary_interval:
                run_metadata = tf.RunMetadata()
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                _, summary = session.run(
                    [train_step, merged],
                    options=run_options,
                    run_metadata=run_metadata,
                    feed_dict={y_data: y_feed, G_data: G_feed})
                writer.add_summary(summary, i)
                writer.add_run_metadata(run_metadata, 'step%d' % i)
                last_summary_time = now
            # check for checkpoint
            elif now - last_checkpoint_time > options.checkpoint_interval:
                saver.save(session,
                           os.path.join(options.save_path, "model.ckpt"),
                           global_step=i)
                last_checkpoint_time = now
            else:
                train_, loss, beta, gamma = session.run(
                    [train_step, log_loss, model.qbeta, model.qgamma],
                    feed_dict={y_data: y_feed, G_data: G_feed})

        elapsed_time = time.time() - start_time
        print('Elapsed Time: %f seconds' % elapsed_time)

        # save all parameters to the save path
        train_, loss, beta, gamma, theta = session.run(
            [train_step, log_loss,
             model.qbeta, model.qgamma, model.theta],
            feed_dict={y_data: y_feed, G_data: G_feed})

        pd.DataFrame(
            beta, index=md_ids, columns=obs_ids,
        ).to_csv(os.path.join(options.save_path, 'beta.csv'))

        pd.DataFrame(
            gamma, index=['intercept'], columns=obs_ids,
        ).to_csv(os.path.join(options.save_path, 'gamma.csv'))

        pd.DataFrame(
            theta, index=samp_ids, columns=['theta'],
        ).to_csv(os.path.join(options.save_path, 'theta.csv'))

        # Run a final round of cross validation
        y_test = np.array(model.y_test.todense())
        G_test = model.G_test
        pred_beta = model.qbeta.eval()
        pred_gamma = model.qgamma.eval()

        mse, mrc = cross_validation(G_test, pred_beta, pred_gamma, y_test)
        print("MSE: %f, MRC: %f" % (mse, mrc))
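
# The coefficient tables written by the run above can be reloaded for
# downstream analysis. A minimal sketch, assuming the CSV layout produced by
# the `to_csv` calls above and that `save_path` points at the same directory:
#
#   beta = pd.read_csv(os.path.join(save_path, 'beta.csv'), index_col=0)
#   gamma = pd.read_csv(os.path.join(save_path, 'gamma.csv'), index_col=0)
#   theta = pd.read_csv(os.path.join(save_path, 'theta.csv'), index_col=0)
#   # beta: covariates x observations, gamma: 1 x observations,
#   # theta: samples x 1 (per-sample intercepts)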
def main(_):
    opts = Options(save_path=FLAGS.save_path,
                   train_biom=FLAGS.train_biom,
                   test_biom=FLAGS.test_biom,
                   train_metadata=FLAGS.train_metadata,
                   test_metadata=FLAGS.test_metadata,
                   formula=FLAGS.formula,
                   learning_rate=FLAGS.learning_rate,
                   clipping_size=FLAGS.clipping_size,
                   beta_mean=FLAGS.beta_mean,
                   beta_scale=FLAGS.beta_scale,
                   gamma_mean=FLAGS.gamma_mean,
                   gamma_scale=FLAGS.gamma_scale,
                   epochs_to_train=FLAGS.epochs_to_train,
                   num_neg_samples=FLAGS.num_neg_samples,
                   batch_size=FLAGS.batch_size,
                   min_sample_count=FLAGS.min_sample_count,
                   min_feature_count=FLAGS.min_feature_count,
                   statistics_interval=FLAGS.statistics_interval,
                   summary_interval=FLAGS.summary_interval,
                   checkpoint_interval=FLAGS.checkpoint_interval)

    # preprocessing: drop low-coverage samples / rare features and align the
    # biom table with the metadata
    train_table, train_metadata = opts.train_table, opts.train_metadata
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]

    sample_filter = lambda val, id_, md: (
        (id_ in train_metadata.index) and np.sum(val) > opts.min_sample_count)
    read_filter = lambda val, id_, md: np.sum(val) > opts.min_feature_count
    metadata_filter = lambda val, id_, md: id_ in train_metadata.index

    train_table = train_table.filter(metadata_filter, axis='sample')
    train_table = train_table.filter(sample_filter, axis='sample')
    train_table = train_table.filter(read_filter, axis='observation')
    train_metadata = train_metadata.loc[train_table.ids(axis='sample')]

    sort_f = lambda xs: [xs[train_metadata.index.get_loc(x)] for x in xs]
    train_table = train_table.sort(sort_f=sort_f, axis='sample')
    train_metadata = dmatrix(opts.formula, train_metadata,
                             return_type='dataframe')

    # hold out data preprocessing
    test_table, test_metadata = opts.test_table, opts.test_metadata
    metadata_filter = lambda val, id_, md: id_ in test_metadata.index
    obs_lookup = set(train_table.ids(axis='observation'))
    feat_filter = lambda val, id_, md: id_ in obs_lookup

    test_table = test_table.filter(metadata_filter, axis='sample')
    test_table = test_table.filter(feat_filter, axis='observation')
    test_metadata = test_metadata.loc[test_table.ids(axis='sample')]

    sort_f = lambda xs: [xs[test_metadata.index.get_loc(x)] for x in xs]
    test_table = test_table.sort(sort_f=sort_f, axis='sample')
    test_metadata = dmatrix(opts.formula, test_metadata,
                            return_type='dataframe')

    p = train_metadata.shape[1]   # number of covariates
    G_data = train_metadata.values
    y_data = np.array(train_table.matrix_data.todense()).T
    y_test = np.array(test_table.matrix_data.todense()).T
    N, D = y_data.shape

    save_path = opts.save_path
    learning_rate = opts.learning_rate
    batch_size = opts.batch_size
    gamma_mean, gamma_scale = opts.gamma_mean, opts.gamma_scale
    beta_mean, beta_scale = opts.beta_mean, opts.beta_scale

    num_iter = (N // batch_size) * opts.epochs_to_train
    holdout_size = test_metadata.shape[0]
    checkpoint_interval = opts.checkpoint_interval

    # Model code
    with tf.Graph().as_default(), tf.Session() as session:
        with tf.device("/cpu:0"):
            # Placeholder variables to accept input data
            G_ph = tf.placeholder(tf.float32, [batch_size, p], name='G_ph')
            Y_ph = tf.placeholder(tf.float32, [batch_size, D], name='Y_ph')
            G_holdout = tf.placeholder(tf.float32, [holdout_size, p],
                                       name='G_holdout')
            Y_holdout = tf.placeholder(tf.float32, [holdout_size, D],
                                       name='Y_holdout')
            total_count = tf.placeholder(tf.float32, [batch_size],
                                         name='total_count')

            # Define PointMass Variables first
            qgamma = tf.Variable(tf.random_normal([1, D]), name='qgamma')
            qbeta = tf.Variable(tf.random_normal([p, D]), name='qB')

            # Distributions
            # species bias
            gamma = Normal(loc=tf.zeros([1, D]) + gamma_mean,
                           scale=tf.ones([1, D]) * gamma_scale,
                           name='gamma')
            # regression coefficients distribution
            beta = Normal(loc=tf.zeros([p, D]) + beta_mean,
                          scale=tf.ones([p, D]) * beta_scale,
                          name='B')

            Bprime = tf.concat([qgamma, qbeta], axis=0)

            # add bias terms for samples
            Gprime = tf.concat([tf.ones([batch_size, 1]), G_ph], axis=1)

            eta = tf.matmul(Gprime, Bprime)
            phi = tf.nn.log_softmax(eta)
            Y = Multinomial(total_count=total_count, logits=phi, name='Y')

            # MAP objective: priors plus re-weighted multinomial likelihood
            loss = -(tf.reduce_mean(gamma.log_prob(qgamma)) +
                     tf.reduce_mean(beta.log_prob(qbeta)) +
                     tf.reduce_mean(Y.log_prob(Y_ph)) * (N / batch_size))
            loss = tf.Print(loss, [loss])

            optimizer = tf.train.AdamOptimizer(learning_rate)
            gradients, variables = zip(*optimizer.compute_gradients(loss))
            gradients, _ = tf.clip_by_global_norm(gradients,
                                                  opts.clipping_size)
            train = optimizer.apply_gradients(zip(gradients, variables))

            with tf.name_scope('accuracy'):
                holdout_count = tf.reduce_sum(Y_holdout, axis=1)
                pred = tf.reshape(holdout_count, [-1, 1]) * tf.nn.softmax(
                    tf.matmul(G_holdout, qbeta) + qgamma)
                mse = tf.reduce_mean(tf.squeeze(tf.abs(pred - Y_holdout)))
                tf.summary.scalar('mean_absolute_error', mse)

            tf.summary.scalar('loss', loss)
            tf.summary.histogram('qbeta', qbeta)
            tf.summary.histogram('qgamma', qgamma)
            merged = tf.summary.merge_all()

            tf.global_variables_initializer().run()

            writer = tf.summary.FileWriter(save_path, session.graph)

            losses = np.array([0.] * num_iter)
            idx = np.arange(train_metadata.shape[0])
            log_handle = open(os.path.join(save_path, 'run.log'), 'w')
            last_checkpoint_time = 0
            start_time = time.time()
            saver = tf.train.Saver()

            for i in range(num_iter):
                batch_idx = np.random.choice(idx, size=batch_size)
                feed_dict = {
                    Y_ph: y_data[batch_idx].astype(np.float32),
                    G_ph: train_metadata.values[batch_idx].astype(np.float32),
                    Y_holdout: y_test.astype(np.float32),
                    G_holdout: test_metadata.values.astype(np.float32),
                    total_count: y_data[batch_idx].sum(axis=1).astype(np.float32)
                }
                if i % 1000 == 0:
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, summary, train_loss, grads = session.run(
                        [train, merged, loss, gradients],
                        feed_dict=feed_dict,
                        options=run_options,
                        run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, 'step%d' % i)
                    writer.add_summary(summary, i)
                elif i % 5000 == 0:
                    _, err, summary, train_loss, grads = session.run(
                        [train, mse, merged, loss, gradients],
                        feed_dict=feed_dict)
                    writer.add_summary(summary, i)
                else:
                    _, summary, train_loss, grads = session.run(
                        [train, merged, loss, gradients],
                        feed_dict=feed_dict)
                    writer.add_summary(summary, i)

                now = time.time()
                if now - last_checkpoint_time > checkpoint_interval:
                    saver.save(session,
                               os.path.join(opts.save_path, "model.ckpt"),
                               global_step=i)
                    last_checkpoint_time = now

                losses[i] = train_loss

            elapsed_time = time.time() - start_time
            print('Elapsed Time: %f seconds' % elapsed_time)

            # Cross validation on the held out set
            pred_beta = qbeta.eval()
            pred_gamma = qgamma.eval()
            mse, mrc = cross_validation(test_metadata.values,
                                        pred_beta, pred_gamma, y_test)
            print("MSE: %f, MRC: %f" % (mse, mrc))
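
# `cross_validation` is imported from elsewhere; the sketch below only
# illustrates the kind of metrics its return values appear to correspond to
# (a mean absolute error on predicted counts, as in the `accuracy` scope
# above, plus a mean rank correlation across held-out samples). The helper
# name `_cross_validation_sketch` and the use of scipy.stats.spearmanr are
# assumptions, not the project's actual implementation.
def _cross_validation_sketch(G_test, beta, gamma, y_test):
    from scipy.stats import spearmanr
    # predicted proportions from the fitted regression, rescaled to counts
    logits = G_test @ beta + gamma
    probs = np.exp(logits - logits.max(axis=1, keepdims=True))
    probs = probs / probs.sum(axis=1, keepdims=True)
    pred = y_test.sum(axis=1, keepdims=True) * probs
    mae = np.mean(np.abs(pred - y_test))
    mrc = np.mean([spearmanr(pred[i], y_test[i]).correlation
                   for i in range(y_test.shape[0])])
    return mae, mrc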