"""
You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
[email protected]
"""
from decision_tree import DecisionTree
from fast_decision_tree import FastDecisionTree
from sklearn.datasets import fetch_mldata
from data_utils import integral_to_indicator, split_train_test
import numpy as np

print "Loading data..."
mnist = fetch_mldata('MNIST original', data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    mnist.data, mnist.target)
train_target = integral_to_indicator(train_target)
test_target = integral_to_indicator(test_target)
print "Done!"

np.seterr(all='ignore')
print "Training decision tree..."
# Uncomment the following two lines (and comment out the FastDecisionTree
# lines below them) if you want the slower, exact decision tree instead.
# dt = DecisionTree(6, 10)
# root = dt.train(train_data, train_target)
fast_dt = FastDecisionTree(10, 10, feat_subset=0.3)
root = fast_dt.train(train_data, train_target)
print "Done training!"

print "Testing..."
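# ---------------------------------------------------------------------------
# Editor's note (not part of the original script): integral_to_indicator is
# imported from data_utils but not shown in this file. A minimal sketch of
# what the usage above implies it does -- map integer class labels 0..K-1 to
# one-hot indicator rows -- is given below; the actual data_utils
# implementation may differ in details.
def integral_to_indicator_sketch(y):
    y = np.asarray(y, dtype=int).ravel()
    indicator = np.zeros((y.shape[0], y.max() + 1))
    indicator[np.arange(y.shape[0]), y] = 1.0
    return indicator
# Example: integral_to_indicator_sketch([0, 2, 1]) ->
#   [[1, 0, 0], [0, 0, 1], [0, 1, 0]]
# ---------------------------------------------------------------------------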
"""
You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
[email protected]
"""
import numpy as np
from data_utils import split_train_test, RMSE
from linear_regression import LinearRegression
from sklearn import preprocessing
from sklearn.datasets import fetch_california_housing

print "Loading data..."
housing = fetch_california_housing(data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    housing.data, housing.target)

# Normalize the data
train_data = preprocessing.scale(train_data)
test_data = preprocessing.scale(test_data)

# Append bias feature
train_data = np.hstack(
    (train_data, np.ones((train_data.shape[0], 1), dtype=train_data.dtype)))
test_data = np.hstack(
    (test_data, np.ones((test_data.shape[0], 1), dtype=test_data.dtype)))

train_target = train_target[:, None]
test_target = test_target[:, None]

lin_reg = LinearRegression()
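# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original file): the script stops right
# after constructing the model. For reference, the fit it sets up is ordinary
# least squares on the bias-augmented features; solved directly with numpy,
# the held-out error would be computed like this:
w = np.linalg.lstsq(train_data, train_target)[0]
yhat = test_data.dot(w)
print "Reference test RMSE: {0}".format(
    np.sqrt(np.mean((yhat - test_target) ** 2)))
# The repository's own LinearRegression class presumably performs the
# equivalent fit and reports error via data_utils.RMSE.
# ---------------------------------------------------------------------------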
"""
You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
[email protected]
"""
from random_forest import RandomForest
from sklearn.datasets import fetch_mldata
from data_utils import integral_to_indicator, split_train_test
import numpy as np

print "Loading data..."
mnist = fetch_mldata('MNIST original', data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    mnist.data, mnist.target)
train_target = integral_to_indicator(train_target)
test_target_integral = integral_to_indicator(test_target)
print "Done!"

np.seterr(all='ignore')
print "Training random forest..."
rf = RandomForest(21, 10, 10, boot_percent=0.3, feat_percent=0.1, debug=False)
rf.train(train_data, train_target)
print "Done training!"

print "Testing..."
yhat = rf.test(test_data, test_target_integral)
err = np.sum(yhat != test_target[:, None]).astype(float) / test_target.shape[0]
print "Error rate: {0}".format(err)
def main(_):
    data, vocab = data_utils.load_data(FLAGS.review_path)
    unk_vocab = data_utils.replace_UNK(vocab, FLAGS.min_count)
    word_idx_map = data_utils.get_word_idx_map(unk_vocab)

    all_text, all_label = zip(*data)
    labels = np.array(all_label)
    indexed_docs = data_utils.docs2mat(all_text, FLAGS.max_doc_len,
                                       FLAGS.max_sent_len, word_idx_map)
    folds = data_utils.split_train_test(indexed_docs, labels)

    # now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    # checkpoint_dir = "{}/run-{}".format(FLAGS.record_dir, now)
    with tf.Graph().as_default():
        # Set log and checkpoint dir for saving and restoring model.
        config = tf.ConfigProto(log_device_placement=True,
                                allow_soft_placement=True)
        sess = tf.Session(config=config)
        with sess.as_default():
            # Create essential model and operators.
            han = han_model.HanModel(FLAGS.max_doc_len, FLAGS.max_sent_len,
                                     len(word_idx_map), FLAGS.embedding_size,
                                     FLAGS.learning_rate, FLAGS.gru_size,
                                     FLAGS.context_size, FLAGS.gru_size,
                                     FLAGS.context_size, labels.shape[1])
            embedding = han.embedding_from_scratch()
            logits = han.inference(embedding)
            global_step = tf.Variable(0, name="global_step", trainable=False)
            loss_op = han.loss(logits)
            train_op = han.training(loss_op, global_step)
            eval_op = han.evaluate(logits)

            # Merge all summaries.
            merged_summaries = tf.summary.merge_all()

            # Create summary writers for training and testing respectively.
            train_writer = tf.summary.FileWriter(FLAGS.record_dir + "/train",
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(FLAGS.record_dir + "/test")

            # Now feed the data.
            init = tf.global_variables_initializer()
            sess.run(init)
            saver = tf.train.Saver()

            for train_data, test_data in folds:
                test_X, test_y = zip(*test_data)
                num_batches = int(np.ceil(len(train_data) / FLAGS.batch_size))
                best_eval_accuracy = 0.0
                last_improvement_epoch = 0
                for epoch in range(FLAGS.num_epochs):
                    shuffled_indices = np.random.permutation(len(train_data))
                    shuffled_data = train_data[shuffled_indices]
                    avg_loss = 0.0
                    for i in range(num_batches):
                        beg = i * FLAGS.batch_size
                        end = min((i + 1) * FLAGS.batch_size, len(train_data))
                        cur_batch = shuffled_data[beg:end]
                        X_batch, y_batch = zip(*cur_batch)
                        train_feed_dict = {
                            han.input_X: np.array(X_batch),
                            han.input_y: np.array(y_batch)
                        }
                        summary, _, loss, step = sess.run(
                            [merged_summaries, train_op, loss_op, global_step],
                            feed_dict=train_feed_dict)
                        avg_loss += loss / num_batches
                        train_writer.add_summary(summary, step)

                    # Rough training-accuracy proxy derived from the average
                    # loss; it is only used for progress reporting.
                    train_acc = 1 - avg_loss

                    if (epoch % FLAGS.eval_step == 0
                            or epoch == FLAGS.num_epochs - 1):
                        val_feed_dict = {
                            han.input_X: np.array(test_X),
                            han.input_y: np.array(test_y)
                        }
                        summary, eval_acc, step = sess.run(
                            [merged_summaries, eval_op, global_step],
                            feed_dict=val_feed_dict)
                        print("Epoch:%d, training accuracy:%f,"
                              " validation accuracy:%f" %
                              (epoch, train_acc, eval_acc))
                        test_writer.add_summary(summary, step)
                        if eval_acc > best_eval_accuracy:
                            best_eval_accuracy = eval_acc
                            last_improvement_epoch = epoch
                            saver.save(sess, FLAGS.record_dir,
                                       global_step=global_step)
                        # Early stopping: quit when validation accuracy has
                        # not improved for early_stop_interval epochs. (The
                        # original condition had the subtraction reversed and
                        # could never trigger.)
                        if (FLAGS.early_stop_interval > 0
                                and epoch - last_improvement_epoch >
                                FLAGS.early_stop_interval):
                            break
                # Only the first cross-validation fold is used; remove this
                # break to train on every fold.
                break
def train(n_epochs=10):
    data_file = '../data/train-stanford-raw.conll'
    # If vocab_file is given (i.e. for pretrained word vectors), use x2i and
    # i2x from this file. If not given, create a new vocab file in ../data.
    vocab_file = None
    log_folder = '../logs'
    model_folder = '../models'
    model_name = 'wsj_3'
    model_file = os.path.join(model_folder, model_name + '_{}.model')

    log_file = open(os.path.join(log_folder, model_name + '.csv'), 'w', 1)
    print('epoch,train_loss,val_loss,arc_acc,lab_acc', file=log_file)

    batch_size = 64
    prints_per_epoch = 10
    n_epochs *= prints_per_epoch

    # Load data
    print('loading data...')
    data, x2i, i2x = make_dataset(data_file)
    if not vocab_file:
        with open('../data/vocab_{}.pkl'.format(model_name), 'wb') as f:
            pickle.dump((x2i, i2x), f)

    # Make train and val batch loaders
    train_data, val_data = split_train_test(data)
    print('# train sentences', len(train_data))
    print('# val sentences', len(val_data))
    train_loader = batch_loader(train_data, batch_size)
    val_loader = batch_loader(val_data, batch_size, shuffle=False)

    print('creating model...')
    # Make model
    model = BiAffineParser(word_vocab_size=len(x2i['word']),
                           word_emb_dim=100,
                           pos_vocab_size=len(x2i['tag']),
                           pos_emb_dim=28,
                           emb_dropout=0.33,
                           lstm_hidden=512,
                           lstm_depth=3,
                           lstm_dropout=.33,
                           arc_hidden=256,
                           arc_depth=1,
                           arc_dropout=.33,
                           arc_activation='ReLU',
                           lab_hidden=128,
                           lab_depth=1,
                           lab_dropout=.33,
                           lab_activation='ReLU',
                           n_labels=len(x2i['label']))
    print(model)
    model.cuda()

    base_params, arc_params, lab_params = model.get_param_groups()
    opt = Adam([
        {'params': base_params, 'lr': 2e-3},
        {'params': arc_params, 'lr': 2e-3},
        {'params': lab_params, 'lr': 1e-4},
    ], betas=[.9, .9])
    sched = ReduceLROnPlateau(opt, threshold=1e-3, patience=8, factor=.4,
                              verbose=True)

    n_train_batches = int(len(train_data) / batch_size)
    n_val_batches = int(len(val_data) / batch_size)
    batches_per_epoch = int(n_train_batches / prints_per_epoch)

    for epoch in range(n_epochs):
        t0 = time.time()

        # Training
        train_loss = 0
        model.train()
        for i in range(batches_per_epoch):
            opt.zero_grad()

            # Load batch
            words, tags, arcs, lengths = next(train_loader)
            words = words.cuda()
            tags = tags.cuda()

            # Forward
            S_arc, S_lab = model(words, tags, lengths=lengths)

            # Calculate loss
            arc_loss = get_arc_loss(S_arc, arcs)
            lab_loss = get_label_loss(S_lab, arcs)
            loss = arc_loss + .025 * lab_loss
            train_loss += arc_loss.data[0] + lab_loss.data[0]

            # Backward
            loss.backward()
            opt.step()

        train_loss /= batches_per_epoch

        # Evaluation
        val_loss = 0
        arc_acc = 0
        lab_acc = 0
        model.eval()
        for i in range(n_val_batches):
            words, tags, arcs, lengths = next(val_loader)
            words = words.cuda()
            tags = tags.cuda()

            S_arc, S_lab = model(words, tags, lengths=lengths)

            arc_loss = get_arc_loss(S_arc, arcs)
            lab_loss = get_label_loss(S_lab, arcs)
            loss = arc_loss + lab_loss
            val_loss += arc_loss.data[0] + lab_loss.data[0]

            arc_acc += get_arc_accuracy(S_arc, arcs)
            lab_acc += get_label_accuracy(S_lab, arcs)

        val_loss /= n_val_batches
        arc_acc /= n_val_batches
        lab_acc /= n_val_batches

        epoch_time = time.time() - t0
        print(
            'epoch {:.1f}\t train_loss {:.3f}\t val_loss {:.3f}\t arc_acc {:.3f}\t lab_acc {:.3f}\t time {:.1f} sec'
            .format(epoch / prints_per_epoch, train_loss, val_loss, arc_acc,
                    lab_acc, epoch_time),
            end="\r")
        print('{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}'.format(
            epoch / prints_per_epoch, train_loss, val_loss, arc_acc, lab_acc),
            file=log_file)

        sched.step(val_loss)

    print('Done!')
    torch.save(model, model_file.format(val_loss))
    log_file.close()
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# from keras.optimizers import Adam
# from keras.layers import Lambda, Conv2D, MaxPooling2D, Dropout, Dense, Flatten
# from utils import INPUT_SHAPE, batch_generator
from data_utils import load_image_data, split_train_test
from img_utils import process_image

if __name__ == '__main__':
    data, labels = load_image_data()
    X_train, y_train, X_test, y_test = split_train_test(data, labels, 0.6)

    pipe_line = Pipeline([('standard_scaler', StandardScaler()),
                          ('model', RandomForestClassifier())])
    # fit() rather than fit_transform(): the final step is a classifier, so
    # there is nothing to transform; we only need the fitted pipeline.
    pipe_line.fit(X_train.astype(float), y_train)

    y_test_pred = pipe_line.predict(X_test.astype(float))
    print(confusion_matrix(y_test, y_test_pred))

    joblib.dump(pipe_line, 'car_detection.pkl')

# def build_model():
#     model = Sequential()
#     model.add(Lambda(lambda x: x/127.5-1.0, input_shape=INPUT_SHAPE))
#     model.add(Conv2D(24, 5, 5, activation='elu', subsample=(2, 2)))
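# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original file): the pipeline persisted
# above with joblib.dump can be reloaded and reused for prediction; the
# helper name below is hypothetical.
def load_saved_detector(path='car_detection.pkl'):
    """Reload the persisted scaler + random-forest pipeline."""
    return joblib.load(path)
# Usage: load_saved_detector().predict(new_features.astype(float)), where
# new_features is built with the same preprocessing as the training data
# (e.g. img_utils.process_image).
# ---------------------------------------------------------------------------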
# Note: the top of this file (including the loading of `data`) is missing;
# the imports below are reconstructed from the calls used in the code that
# remains.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

import utils

print(data.info())
print(data["strength"].value_counts())
print(data.describe())

# Sequentially apply a list of transforms and a final estimator
classifier_model = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char')),
    ('logisticRegression', LogisticRegression(multi_class='multinomial',
                                              solver='sag')),
])

train_set, test_set = utils.split_train_test(data, 0.2)
print("test set size:", len(test_set))
print("train set size:", len(train_set))

# Features which are passwords
features_train = train_set.values[:, 1].astype('str')
features_test = test_set.values[:, 1].astype('str')

# Labels which are strength of password (target of classification)
labels_train = train_set.values[:, -1].astype('int')
labels_test = test_set.values[:, -1].astype('int')

# Fit the model
classifier_model.fit(features_train, labels_train)

# predictions
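# ---------------------------------------------------------------------------
# Editor's sketch (not part of the original file): the script ends at the
# "predictions" comment. A minimal continuation using standard scikit-learn
# calls would predict on the held-out passwords and report accuracy.
from sklearn.metrics import accuracy_score

predictions = classifier_model.predict(features_test)
print("Test accuracy:", accuracy_score(labels_test, predictions))
# ---------------------------------------------------------------------------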