Example #1
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
[email protected]
"""

from decision_tree import DecisionTree
from fast_decision_tree import FastDecisionTree
from sklearn.datasets import fetch_mldata
from data_utils import integral_to_indicator, split_train_test
import numpy as np

print "Loading data..."
mnist = fetch_mldata('MNIST original', data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    mnist.data, mnist.target)
train_target = integral_to_indicator(train_target)
test_target = integral_to_indicator(test_target)
print "Done!"

np.seterr(all='ignore')
print "Training decision tree..."

# Uncomment the following two lines (and comment out the two FastDecisionTree
# lines below) to use the slower DecisionTree instead of the faster variant.
# dt = DecisionTree(6, 10)
# root = dt.train(train_data, train_target)
fast_dt = FastDecisionTree(10, 10, feat_subset=0.3)
root = fast_dt.train(train_data, train_target)
print "Done training!"
print "Testing..."
Example #2
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
[email protected]
"""

import numpy as np
from data_utils import split_train_test, RMSE
from linear_regression import LinearRegression
from sklearn import preprocessing
from sklearn.datasets import fetch_california_housing

print "Loading data..."
housing = fetch_california_housing(data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    housing.data, housing.target)

# Normalize the data
train_data = preprocessing.scale(train_data)
test_data = preprocessing.scale(test_data)

# Append bias feature
train_data = np.hstack(
    (train_data, np.ones((train_data.shape[0], 1), dtype=train_data.dtype)))
test_data = np.hstack(
    (test_data, np.ones((test_data.shape[0], 1), dtype=test_data.dtype)))

train_target = train_target[:, None]
test_target = test_target[:, None]

lin_reg = LinearRegression()
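The appended column of ones lets the learned weight vector absorb the intercept term. As an illustration of what the fit amounts to, here is a closed-form least-squares solution on the prepared arrays using NumPy rather than the repo's LinearRegression class (whose actual API is not shown in this excerpt):

# Illustration only: ordinary least squares via NumPy, not the repo's
# LinearRegression class. The bias column makes the last weight the intercept.
w, _, _, _ = np.linalg.lstsq(train_data, train_target, rcond=None)
predictions = test_data.dot(w)
print("Test RMSE: {0}".format(np.sqrt(np.mean((predictions - test_target) ** 2))))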
Example #3
Copyright Brian Dolhansky 2014
[email protected]
"""

import numpy as np
from data_utils import split_train_test, RMSE
from linear_regression import LinearRegression
from sklearn import preprocessing
from sklearn.datasets import fetch_california_housing


print "Loading data..."
housing = fetch_california_housing(data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(
    housing.data, housing.target
)

# Normalize the data
train_data = preprocessing.scale(train_data)
test_data = preprocessing.scale(test_data)

# Append bias feature
train_data = np.hstack((train_data, np.ones((train_data.shape[0], 1),
                                            dtype=train_data.dtype)))
test_data = np.hstack((test_data, np.ones((test_data.shape[0], 1),
                                          dtype=test_data.dtype)))

train_target = train_target[:, None]
test_target = test_target[:, None]
Example #4
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

Copyright Brian Dolhansky 2014
[email protected]
"""

from random_forest import RandomForest
from sklearn.datasets import fetch_mldata
from data_utils import integral_to_indicator, split_train_test
import numpy as np

print "Loading data..."
mnist = fetch_mldata('MNIST original', data_home='/home/bdol/data')
train_data, test_data, train_target, test_target = split_train_test(mnist.data,
                                                                    mnist.target)
train_target = integral_to_indicator(train_target)
test_target_integral = integral_to_indicator(test_target)
print "Done!"

np.seterr(all='ignore')

print "Training random forest..."
rf = RandomForest(21, 10, 10, boot_percent=0.3, feat_percent=0.1, debug=False)
rf.train(train_data, train_target)
print "Done training!"

print "Testing..."
yhat = rf.test(test_data, test_target_integral)
err = (np.sum(yhat != test_target[:, None]).astype(float))/test_target.shape[0]
print "Error rate: {0}".format(err)
Example #5
File: train.py  Project: yanJiang0216/han
def main(_):
    data, vocab = data_utils.load_data(FLAGS.review_path)
    unk_vocab = data_utils.replace_UNK(vocab, FLAGS.min_count)
    word_idx_map = data_utils.get_word_idx_map(unk_vocab)
    all_text, all_label = zip(*data)
    labels = np.array(all_label)
    indexed_docs = data_utils.docs2mat(all_text, FLAGS.max_doc_len,
                                       FLAGS.max_sent_len, word_idx_map)
    folds = data_utils.split_train_test(indexed_docs, labels)
    # now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
    # checkpoint_dir = "{}/run-{}".format(FLAGS.record_dir, now)
    with tf.Graph().as_default():
        # Set log and checkpoint dir for saving and restoring model.
        config = tf.ConfigProto(log_device_placement=True,
                                allow_soft_placement=True)
        sess = tf.Session(config=config)
        with sess.as_default():
            # Create essential model and operators.
            han = han_model.HanModel(FLAGS.max_doc_len, FLAGS.max_sent_len,
                                     len(word_idx_map), FLAGS.embedding_size,
                                     FLAGS.learning_rate, FLAGS.gru_size,
                                     FLAGS.context_size, FLAGS.gru_size,
                                     FLAGS.context_size, labels.shape[1])
            embedding = han.embedding_from_scratch()
            logits = han.inference(embedding)
            global_step = tf.Variable(0, name="global_step", trainable=False)
            loss_op = han.loss(logits)
            train_op = han.training(loss_op, global_step)
            eval_op = han.evaluate(logits)
            # Merge all summaries.
            merged_summaries = tf.summary.merge_all()
            # Create summary writers for training and testing respectively.
            train_writer = tf.summary.FileWriter(FLAGS.record_dir + "/train",
                                                 sess.graph)
            test_writer = tf.summary.FileWriter(FLAGS.record_dir + "/test")
            # Now feed the data.
            init = tf.global_variables_initializer()
            sess.run(init)
            saver = tf.train.Saver()
            for train_data, test_data in folds:
                test_X, test_y = zip(*test_data)
                num_batches = int(np.ceil(len(train_data) / FLAGS.batch_size))
                best_eval_accuracy = 0.0
                last_improvement_epoch = 0
                for epoch in range(FLAGS.num_epochs):
                    shuffled_indices = np.random.permutation(len(train_data))
                    shuffled_data = train_data[shuffled_indices]
                    avg_loss = 0.0
                    for i in range(num_batches):
                        beg = i * FLAGS.batch_size
                        end = min((i + 1) * FLAGS.batch_size, len(train_data))
                        cur_batch = shuffled_data[beg:end]
                        X_batch, y_batch = zip(*cur_batch)
                        train_feed_dict = {
                            han.input_X: np.array(X_batch),
                            han.input_y: np.array(y_batch)
                        }
                        summary, _, loss, step = sess.run(
                            [merged_summaries, train_op, loss_op, global_step],
                            feed_dict=train_feed_dict)
                        avg_loss += loss / num_batches
                        train_writer.add_summary(summary, step)
                    train_acc = 1 - avg_loss
                    if (epoch % FLAGS.eval_step == 0
                            or epoch == FLAGS.num_epochs - 1):
                        val_feed_dict = {
                            han.input_X: np.array(test_X),
                            han.input_y: np.array(test_y)
                        }
                        summary, eval_acc, step = sess.run(
                            [merged_summaries, eval_op, global_step],
                            feed_dict=val_feed_dict)
                        print("Epoch:%d, training accuracy:%f,"
                              " validation accuracy:%f" %
                              (epoch, train_acc, eval_acc))
                        test_writer.add_summary(summary, step)
                        if eval_acc > best_eval_accuracy:
                            best_eval_accuracy = eval_acc
                            last_improvement_epoch = epoch
                            saver.save(sess,
                                       FLAGS.record_dir,
                                       global_step=global_step)
                    if (FLAGS.early_stop_interval > 0
                            and epoch - last_improvement_epoch >
                            FLAGS.early_stop_interval):
                        break
                break
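Unlike the earlier examples, this project's data_utils.split_train_test is iterated as (train_data, test_data) pairs (and the loop breaks after the first fold), so it apparently yields cross-validation folds over (document, label) pairs. A rough sketch of that presumed behaviour using scikit-learn's KFold, purely as an illustration rather than the project's implementation:

import numpy as np
from sklearn.model_selection import KFold

def split_train_test_sketch(indexed_docs, labels, n_splits=5):
    # Pack each (document, label) pair into a 1-D object array so a fold can be
    # indexed with train_data[shuffled_indices] and unpacked with zip(*batch),
    # as the training loop above expects.
    pairs = np.empty(len(labels), dtype=object)
    for i, pair in enumerate(zip(indexed_docs, labels)):
        pairs[i] = pair
    for train_idx, test_idx in KFold(n_splits=n_splits, shuffle=True).split(pairs):
        yield pairs[train_idx], pairs[test_idx]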
Example #6
def train(n_epochs=10):
    data_file = '../data/train-stanford-raw.conll'
    # if vocab_file is given (ie for pretrained wordvectors), use x2i and i2x from this file.
    # If not given, create new vocab file in ../data
    vocab_file = None

    log_folder = '../logs'
    model_folder = '../models'
    model_name = 'wsj_3'

    model_file = os.path.join(model_folder, model_name + '_{}.model')
    log_file = open(os.path.join(log_folder, model_name + '.csv'), 'w', 1)
    print('epoch,train_loss,val_loss,arc_acc,lab_acc', file=log_file)

    batch_size = 64
    prints_per_epoch = 10
    n_epochs *= prints_per_epoch

    # load data
    print('loading data...')
    data, x2i, i2x = make_dataset(data_file)

    if not vocab_file:
        with open('../data/vocab_{}.pkl'.format(model_name), 'wb') as f:
            pickle.dump((x2i, i2x), f)

    # make train and val batch loaders
    train_data, val_data = split_train_test(data)
    print('# train sentences', len(train_data))
    print('# val sentences', len(val_data))
    train_loader = batch_loader(train_data, batch_size)
    val_loader = batch_loader(val_data, batch_size, shuffle=False)

    print('creating model...')
    # make model
    model = BiAffineParser(word_vocab_size=len(x2i['word']),
                           word_emb_dim=100,
                           pos_vocab_size=len(x2i['tag']),
                           pos_emb_dim=28,
                           emb_dropout=0.33,
                           lstm_hidden=512,
                           lstm_depth=3,
                           lstm_dropout=.33,
                           arc_hidden=256,
                           arc_depth=1,
                           arc_dropout=.33,
                           arc_activation='ReLU',
                           lab_hidden=128,
                           lab_depth=1,
                           lab_dropout=.33,
                           lab_activation='ReLU',
                           n_labels=len(x2i['label']))
    print(model)
    model.cuda()
    base_params, arc_params, lab_params = model.get_param_groups()

    opt = Adam([
        {
            'params': base_params,
            'lr': 2e-3
        },
        {
            'params': arc_params,
            'lr': 2e-3
        },
        {
            'params': lab_params,
            'lr': 1e-4
        },
    ],
               betas=[.9, .9])
    sched = ReduceLROnPlateau(opt,
                              threshold=1e-3,
                              patience=8,
                              factor=.4,
                              verbose=True)

    n_train_batches = int(len(train_data) / batch_size)
    n_val_batches = int(len(val_data) / batch_size)
    batches_per_epoch = int(n_train_batches / prints_per_epoch)

    for epoch in range(n_epochs):
        t0 = time.time()

        # Training
        train_loss = 0
        model.train()
        for i in range(batches_per_epoch):
            opt.zero_grad()

            # Load batch
            words, tags, arcs, lengths = next(train_loader)
            words = words.cuda()
            tags = tags.cuda()

            # Forward
            S_arc, S_lab = model(words, tags, lengths=lengths)

            # Calculate loss
            arc_loss = get_arc_loss(S_arc, arcs)
            lab_loss = get_label_loss(S_lab, arcs)
            loss = arc_loss + .025 * lab_loss
            train_loss += arc_loss.data[0] + lab_loss.data[0]

            # Backward
            loss.backward()
            opt.step()

        train_loss /= batches_per_epoch

        # Evaluation
        val_loss = 0
        arc_acc = 0
        lab_acc = 0
        model.eval()
        for i in range(n_val_batches):
            words, tags, arcs, lengths = next(val_loader)
            words = words.cuda()
            tags = tags.cuda()

            S_arc, S_lab = model(words, tags, lengths=lengths)

            arc_loss = get_arc_loss(S_arc, arcs)
            lab_loss = get_label_loss(S_lab, arcs)
            loss = arc_loss + lab_loss

            val_loss += arc_loss.data[0] + lab_loss.data[0]
            arc_acc += get_arc_accuracy(S_arc, arcs)
            lab_acc += get_label_accuracy(S_lab, arcs)

        val_loss /= n_val_batches
        arc_acc /= n_val_batches
        lab_acc /= n_val_batches
        epoch_time = time.time() - t0

        print(
            'epoch {:.1f}\t train_loss {:.3f}\t val_loss {:.3f}\t arc_acc {:.3f}\t lab_acc {:.3f}\t time {:.1f} sec'
            .format(epoch / prints_per_epoch, train_loss, val_loss, arc_acc,
                    lab_acc, epoch_time),
            end="\r")

        print('{:.3f},{:.3f},{:.3f},{:.3f},{:.3f}'.format(
            epoch / prints_per_epoch, train_loss, val_loss, arc_acc, lab_acc),
              file=log_file)

        sched.step(val_loss)

    print('Done!')
    torch.save(model, model_file.format(val_loss))
    log_file.close()
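This snippet reads scalars out of the loss tensors with the pre-0.4 PyTorch idiom loss.data[0]; on PyTorch 0.4 and later that indexing raises an error and loss.item() is the replacement. A tiny self-contained illustration of the equivalent accumulation on current PyTorch:

import torch

arc_loss = torch.tensor(0.42)  # stand-in for get_arc_loss(S_arc, arcs)
lab_loss = torch.tensor(0.07)  # stand-in for get_label_loss(S_lab, arcs)
train_loss = arc_loss.item() + lab_loss.item()  # replaces arc_loss.data[0] + lab_loss.data[0]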
Example #7
from PIL import Image
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
#from keras.optimizers import Adam
#from keras.layers import Lambda, Conv2D, MaxPooling2D, Dropout, Dense, Flatten
#from utils import INPUT_SHAPE, batch_generator

from data_utils import load_image_data, split_train_test
from img_utils import process_image

if __name__ == '__main__':
    data, labels = load_image_data()
    X_train, y_train, X_test, y_test = split_train_test(data, labels, 0.6)

    pipe_line = Pipeline([('standard_scaler', StandardScaler()),
                          ('model', RandomForestClassifier())])

    pipe_line.fit(X_train.astype(float), y_train)

    y_test_pred = pipe_line.predict(X_test.astype(float))
    print(confusion_matrix(y_test, y_test_pred))

    joblib.dump(pipe_line, 'car_detection.pkl')

#def build_model():
#    model = Sequential()
#    model.add(Lambda(lambda x: x/127.5-1.0, input_shape=INPUT_SHAPE))
#    model.add(Conv2D(24, 5, 5, activation='elu', subsample=(2, 2)))
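A short usage note on the joblib.dump call above, assuming the pickled pipeline is later reloaded for inference (the loading code is not part of the shown snippet):

# Hypothetical inference-time usage of the persisted pipeline. Note that
# sklearn.externals.joblib was removed in newer scikit-learn releases; the
# standalone joblib package is the drop-in replacement there.
from sklearn.externals import joblib

clf = joblib.load('car_detection.pkl')
predictions = clf.predict(X_test.astype(float))  # any feature matrix shaped like the training data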
Example #8
print(data.info())
print(data["strength"].value_counts())
print(data.describe())



# Sequentially apply a list of transforms and a final estimator
classifier_model = Pipeline([
    ('tfidf', TfidfVectorizer(analyzer='char')),
    ('logisticRegression', LogisticRegression(multi_class='multinomial', solver='sag')),
])



train_set, test_set = utils.split_train_test(data, 0.2)
print("test set size:", len(test_set))
print("train set size:", len(train_set))

# Features which are passwords
features_train = train_set.values[:, 1].astype('str')
features_test = test_set.values[:, 1].astype('str')

# Labels which are strength of password (target of classification)
labels_train = train_set.values[:, -1].astype('int')
labels_test = test_set.values[:, -1].astype('int')

# Fit the Model
classifier_model.fit(features_train, labels_train)

#predictions
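The snippet ends at the "#predictions" comment. A minimal sketch of how the prediction and evaluation step presumably continues, using standard scikit-learn calls (not the original file's code):

# Hypothetical continuation: classify the held-out passwords and report metrics.
from sklearn.metrics import accuracy_score, classification_report

predictions = classifier_model.predict(features_test)
print("Accuracy:", accuracy_score(labels_test, predictions))
print(classification_report(labels_test, predictions))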