Example #1
def train_data():
    test_file = "./fixtures/net_20180312_201803114_100k_preprocessed.test"
    train_file = "./fixtures/net_20180312_201803114_100k_preprocessed.train"
    fmap_file = "./fixtures/net_20180312_201803114_100k.feature_map"

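    # Derive the feature count by counting lines in the feature-map file; the
    # -1 start presumably discounts a header line (an assumption about the file).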
    num_features = -1
    for _ in open(fmap_file):
        num_features += 1

    conf = {
        "feature_cols": "PageID OrderID".split(),
        "target_col": "IsClick",
        "num_fields": 2,
        "num_features": num_features,
        "dim": 10,
        "use_unary": True,
        "num_iter": 5,
        "opt_cls": optim.Adam,
        "opt_kwargs": {
            "lr": 1e-3
        },
        "batch_size": 64
    }

    test = read_dataset(test_file, conf["feature_cols"], conf["target_col"])
    train_iter = BatchIter(train_file,
                           conf["feature_cols"],
                           conf["target_col"],
                           batch_size=conf["batch_size"])
    return Data(train_iter, test, conf)
Example #2
def run(dataset_name, output_file_name):
    dataset = data.read_dataset('datasets/' + dataset_name + '.csv', **utils.reader_parameters[dataset_name])
    random.shuffle(dataset)
    params = parameters[dataset_name]
    network = params['NETWORKS'][0]

    with open('J_vs_N_' + str(network) + dataset_name + '.csv', 'w') as file:
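        # Hold out the first fold for validation, flatten the remaining folds,
        # and train on them in chunks of 10 examples, logging cost J against
        # the number of training examples seen (n).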
        folds = fold(10, dataset)
        valids = folds[0]
        train = [i for s in folds[1:] for i in s]
        trains = [train[i:i + 10] for i in range(0, len(train), 10)]
        n = 0
        net = NeuralNetwork(network, ALPHA=params['ALPHA'], LAMBDA=0, K=0, BETA=0.8, STOP=1)
        for i in range(50):
            for train in trains:
                n += len(train)
                net.train(train)
                J = 0
                for valid in valids:
                    J += net.verifyPerformance(valid)
                file.write(str(n) + ',' + str(J) +'\n')

    for network in params['NETWORKS']:
        with open('f1_vs_lambda_' + str(network) + dataset_name + '.csv', 'w') as file:
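            # Sweep the regularization strength (LAMBDA is scaled by 1/1000) and
            # record the mean and standard deviation of F1 over 10-fold CV.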
            for LAMBDA in [0, 1, 8, 64, 512]:
                net = NeuralNetwork(network, ALPHA=params['ALPHA'], LAMBDA=LAMBDA/1000.0, K=params['K'], BETA=0.8, STOP=params['STOP'])
                cv = CrossValidator(k=10, classifier=net)
                cv.run(dataset)
                try:
                    file.write(str(LAMBDA/1000.0) + ',' + str(mean(cv.f1s(1))) + ',' + str(stdev(cv.f1s(1))) +'\n')
                except:
                    file.write(str(LAMBDA/1000.0) + ',0,0\n')
                file.flush()
Example #3
def evaluate():
  """Eval SUN3D for a number of steps."""
  with tf.Graph().as_default() as g:
    # Get images and labels for SUN3D.
    images, depths, transforms = data.read_dataset(eval_data=True)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    result, transform = model.inference(images, True)
    result = lss.inverse(result)
    depths = lss.inverse(depths)
    depths = tf.slice(depths, (0, 0, 0, 0),
                      (BATCH_SIZE, IMAGE_SIZE_H, IMAGE_SIZE_W, 1))

    # Restore the moving average version of the learned variables for eval.
    variable_averages = tf.train.ExponentialMovingAverage(
        model.MOVING_AVERAGE_DECAY)
    variables_to_restore = variable_averages.variables_to_restore()
    saver = tf.train.Saver(variables_to_restore)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
    
    config = tf.ConfigProto(gpu_options=gpu_options)
    while True:
      print('Start depth output')
      eval_once(result, depths, config, saver)
      if EVAL_RUN_ONCE:
        print('end of depth output')
        break
      time.sleep(EVAL_INTERVAL_SECS)
Example #4
def main():
    ## create an es client
    es_conn = Elasticsearch(config.ES_HOSTS, timeout=30)
    ## create the index if it doesn't exist
    create_index(es_conn = es_conn, index_name = config.INDEX_NAME)
    dataset = read_dataset(config.DOCS_PATH)
    counter_read, counter_idx_failed = 0, 0 ## counters

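    # Stream documents into Elasticsearch one at a time; StopIteration from the
    # generator marks the end of the corpus.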
    while True:
        try:
            doc = next(dataset)
            res = es_conn.index(
                index = config.INDEX_NAME,
                doc_type = config.DOC_TYPE,
                id = doc.filename,
                body = doc._asdict())
            counter_read += 1

            if counter_read % config.PRINT_EVERY == 0:
                print('indexed {} documents'.format(counter_read))

            if not res['created']:
                print('indexing `{}` failed'.format(doc.path))
                counter_idx_failed += 1

        except StopIteration:
            print('finished reading docs from `{}`'.format(config.DOCS_PATH))
            break

    print('indexed {} docs to index `{}`, failed to index {} docs'.format(
        counter_read,
        config.INDEX_NAME,
        counter_idx_failed
    ))
Example #5
def train():
    tr, va, te = read_dataset('../mnist.pkl.gz')
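    # LabelBinarizer one-hot encodes the integer digit labels 0-9 into the
    # 10-way targets expected by the softmax output.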
    binarizer = LabelBinarizer().fit(range(10))

    x = tf.placeholder(tf.float32, [None, 784])
    y = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)
    preds = model.inference(x, keep_prob)
    loss, total_loss = model.loss(preds, y)
    acc = model.evaluation(preds, y)
    # learning rate: 0.1
    train_op = model.training(total_loss, 0.1)

    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)
    for i in xrange(10000):
        batch_xs, batch_ys = tr.next_batch(50)
        if i % 100 == 0:
            train_acc = acc.eval(feed_dict={
                x:batch_xs, y:binarizer.transform(batch_ys),
                keep_prob: 1.0}, session=sess)
            print "step: {0}, training accuracy {1}".format(i, train_acc)
            validation_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, va, sess)
            print("Validation accuracy : {0}".format(validation_accuracy))
        train_op.run(feed_dict={
            x:batch_xs, y:binarizer.transform(batch_ys), keep_prob: 0.5},
                     session=sess)

    test_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, te, sess)
    print("Test accuracy : ", test_accuracy)
Example #6
def load_data():
    #data.create_datasets()
    #data.generate_data('val.pkl', 5000)
    print("loading data...")
    train_data, test_data, val_data, eval_data = data.read_dataset()
    print("data loaded")
    return train_data, test_data, val_data, eval_data
Example #7
def evaluate_on_dev(model, filename, batch_size=32):
    """Prints predictions and metrics by model on development dataset."""

    lemmas, tags, inflected_forms = read_dataset(filename)
    predictions = generate_predictions(model, lemmas, tags, batch_size)

    for prediction in predictions:
        print(prediction)

    print()
    print("Accuracy: {}, Average Distance: {}".format(accuracy(predictions, inflected_forms), average_distance(predictions, inflected_forms)))
Example #8
def evaluate_on_dev(model, filename, batch_size=32):
    """Prints predictions and metrics by model on development dataset."""

    lemmas, tags, inflected_forms = read_dataset(filename)
    predictions = generate_predictions(model, lemmas, tags, batch_size)

    result_text = ''
    for lemma, tag, inflected_form, prediction in zip(lemmas, tags,
                                                      inflected_forms,
                                                      predictions):
        if inflected_form != prediction:
            result_text += '{}\t{}\t{}\t{}\n'.format(lemma, tag,
                                                     inflected_form,
                                                     prediction)

    return result_text, accuracy(predictions,
                                 inflected_forms), average_distance(
                                     predictions, inflected_forms)
Example #9
def main(argv):
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    image = tf.placeholder(tf.float32, shape=[FLAGS.batch_size, IMAGE_SIZE, IMAGE_SIZE, 3], name="input_image")
    annotation = tf.placeholder(tf.int32, shape=[FLAGS.batch_size, IMAGE_SIZE, IMAGE_SIZE, 1], name="annotation")

    pred_annotation, logits = inference(image)
    tf.summary.image("input_image", image)
    tf.summary.image("ground_truth", tf.cast(annotation, tf.uint8))
    tf.summary.image("pred_annotation", tf.cast(pred_annotation, tf.uint8))

    loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=tf.squeeze(annotation,squeeze_dims=[3])))
    tf.summary.scalar("loss", loss)

    trainable_var = tf.trainable_variables()
    train_op = train(FLAGS.learning_rate, loss, trainable_var)
    summary_op = tf.summary.merge_all()

    train_records, valid_records = data.read_dataset(FLAGS.data_dir)
    print("Train records:", len(train_records))
    print("Valid records:", len(valid_records))
    train_reader = BatchReader(train_records, {'resize': True, 'resize_size': IMAGE_SIZE})

    sess = tf.Session()
    saver = tf.train.Saver(max_to_keep=10)
    summary_writer = tf.summary.FileWriter(os.path.join(FLAGS.model_dir, "logs"), sess.graph)

    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Model restored from", ckpt.model_checkpoint_path)
    else:
        print("initialize new model")

    for itr in xrange(MAX_ITERATION):
        train_images, train_annotations = train_reader.next_batch(FLAGS.batch_size)
        feed_dict = {image: train_images, annotation: train_annotations, keep_prob: 0.8}
        train_loss, _, pred_result, summary_str = sess.run([loss, train_op, pred_annotation, summary_op], feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, itr)
        print("Time: %d, Step: %d, Train loss: %g" % (time.time(), itr, train_loss))
        if itr % 10 == 0 and itr > 0:
            saver.save(sess, FLAGS.model_dir + "model.ckpt", itr)
            print(pred_result[0])
Example #10
def train(config, exp_name, data_path):
    # Read train and dev data, set dev mode = True
    results_dir = os.path.join(data_path, exp_name)
    if os.path.exists(results_dir):
        print("{} already exists, no need to train.\n".format(results_dir))
        return
    os.makedirs(results_dir)
    json.dump(config,
              open(os.path.join(results_dir, 'config.json'), 'w'),
              sort_keys=True,
              separators=(',\n', ': '))
    is_typed = config.get('is_typed', False)
    print("Typed Regularizer {}".format(is_typed))
    data_set = data.read_dataset(data_path, dev_mode=True, is_typed=is_typed)
    is_dev = config['is_dev']
    print("\n***{} MODE***\n".format('DEV' if is_dev else 'TEST'))
    print("Number of training data points {}".format(len(data_set['train'])))
    print("Number of dev data points {}".format(len(data_set['test'])))

    # Set up functions and params
    neg_sampler = data.NegativeSampler(data_set['train'])
    model = models.get_model(config, neg_sampler)
    evaluater = RankEvaluater(model, neg_sampler)
    updater = algorithms.Adam()
    typed_data = data_set['typed'] if is_typed else None
    minimizer = optimizer.GradientDescent(data_set['train'],
                                          data_set['test'],
                                          updater,
                                          model,
                                          evaluater,
                                          results_dir,
                                          'single',
                                          config,
                                          is_typed=is_typed,
                                          typed_data=typed_data)
    print('Training {}...\n'.format(config['model']))
    start = time.time()
    minimizer.minimize()
    end = time.time()
    hours = (end - start) / 3600
    minutes = ((end - start) % 3600) / 60
    print("Finished Training! Took {} hours and {} minutes\n".format(
        hours, minutes))
Example #11
File: eval.py Project: Dtananaev/tf_SfM
def evaluate():
    """Eval SUN3D for a number of steps."""
    with tf.Graph().as_default() as g:
        # Get images and labels for SUN3D.
        images, depths, transforms = data.read_dataset(eval_data=True)
        # Build a Graph that computes the logits predictions from the
        # inference model.

        result, resulttransform = model.inference(images, False)
        depths = lss.inverse(depths)
        result = lss.inverse(result)
        depths = tf.slice(depths, (0, 0, 0, 0),
                          (BATCH_SIZE, IMAGE_SIZE_H, IMAGE_SIZE_W, 1))

        # Calculate predictions.
        scale_inv_error = evalfunct.scinv(result, depths)
        L1_relative_error = evalfunct.L1rel(result, depths)
        L1_inverse_error = evalfunct.L1inv(result, depths)
        L1_transform = tf.reduce_mean(tf.abs(resulttransform - transforms))
        # Restore the moving average version of the learned variables for eval.
        variable_averages = tf.train.ExponentialMovingAverage(
            model.MOVING_AVERAGE_DECAY)
        variables_to_restore = variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()

        summary_writer = tf.summary.FileWriter(TEST_LOG, g)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.7)
        config = tf.ConfigProto(gpu_options=gpu_options)
        while True:
            print('Start evaluation')
            eval_once(result, config, saver, summary_writer, scale_inv_error,
                      L1_relative_error, L1_inverse_error, L1_transform,
                      summary_op)
            if EVAL_RUN_ONCE:
                print('end of evaluation')
                break
            time.sleep(EVAL_INTERVAL_SECS)
Example #12
def test(config, exp_name, data_path):

    print("Testing...\n")
    is_dev = config['is_dev']
    print("\n***{} MODE***\n".format('DEV' if is_dev else 'TEST'))
    results_dir = os.path.join(data_path, exp_name)
    params_path = os.path.join(data_path, exp_name, 'params_single.cpkl')
    if not os.path.exists(params_path):
        print("No trained params found, quitting.")
        return

    data_set = data.read_dataset(data_path, dev_mode=is_dev)
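    # Pool train, dev, and test triples for the negative sampler (presumably so
    # sampled negatives never coincide with a known true triple).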
    all_data = copy.copy(data_set['train'])
    all_data.extend(data_set['dev'])
    all_data.extend(data_set['test'])
    neg_sampler = data.NegativeSampler(all_data)
    # Initializing the model changes config.
    model = models.get_model(config, neg_sampler)
    params = data.load_params(params_path, model)
    print("Number of Test Samples {}".format(len(data_set['test'])))
    evaluater = TestEvaluater(model, neg_sampler, params, is_dev, results_dir)
    evaluate(data_set['test'], evaluater, results_dir, is_dev)
Example #13
def test(config, exp_name, data_path):

    print("Testing...\n")
    is_dev = config['is_dev']
    print("\n***{} MODE***\n".format('DEV' if is_dev else 'TEST'))
    results_dir = os.path.join(data_path, exp_name)
    params_path = os.path.join(data_path, exp_name, 'params.torch')
    if not os.path.exists(params_path):
        print("No trained params found, quitting.")
        return

    data_set = data.read_dataset(data_path, results_dir, dev_mode=is_dev)

    print("Number of Test Samples {}".format(len(data_set['test'])))
    print("Vocabulary size: {}".format(constants.vocab_size))

    # Initializing the model changes config.
    print("Loading model params")

    model = models.TCNN(constants.vocab_size, config['ent_dim'])

    if torch.cuda.is_available():
        model.cuda()
        model.load_state_dict(
            torch.load(os.path.join(results_dir, "params.torch")))
        print("Using GPU {}".format(torch.cuda.current_device()))
    else:
        print("Using CPU")
        torch.set_num_threads(56)
        model.load_state_dict(
            torch.load(os.path.join(results_dir, "params.torch"), map_location=lambda storage, loc: storage))

    print("Model keys: {}".format(model.state_dict().keys()))
    
    check_embeddings(model, data_set['test'])
    model.eval()
    loss = evaluate(model, data_set['test'])
    print("Test KL Divergence Loss {}".format(loss))
Example #14
def train():
    tr, va, te = read_dataset('../mnist.pkl.gz')
    binarizer = LabelBinarizer().fit(range(10))

    x = tf.placeholder(tf.float32, [None, 784])
    y = tf.placeholder(tf.float32, [None, 10])
    keep_prob = tf.placeholder(tf.float32)
    preds = model.inference(x, keep_prob)
    loss, total_loss = model.loss(preds, y)
    acc = model.evaluation(preds, y)
    # learning rate: 0.1
    train_op = model.training(total_loss, 0.1)

    init = tf.initialize_all_variables()
    sess = tf.Session()
    sess.run(init)
    for i in xrange(10000):
        batch_xs, batch_ys = tr.next_batch(50)
        if i % 100 == 0:
            train_acc = acc.eval(feed_dict={
                x: batch_xs,
                y: binarizer.transform(batch_ys),
                keep_prob: 1.0
            },
                                 session=sess)
            print "step: {0}, training accuracy {1}".format(i, train_acc)
            validation_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc,
                                              va, sess)
            print("Validation accuracy : {0}".format(validation_accuracy))
        train_op.run(feed_dict={
            x: batch_xs,
            y: binarizer.transform(batch_ys),
            keep_prob: 0.5
        },
                     session=sess)

    test_accuracy = getAccuracy(x, y, keep_prob, binarizer, acc, te, sess)
    print("Test accuracy : ", test_accuracy)
Example #15
def visualize(config, exp_name, data_path):

    results_dir = os.path.join(data_path, exp_name)
    params_path = os.path.join(data_path, exp_name, 'params.torch')
    if not os.path.exists(params_path):
        print("No trained params found, quitting.")
        return

    data_set = data.read_dataset(data_path, results_dir, dev_mode=True, gen_neg=False)

    # Initializing the model changes config.
    print("Loading model params for visualization")
    model = models.TCNN(constants.vocab_size, config['ent_dim'])
    model.load_state_dict(torch.load(os.path.join(results_dir, "params.torch"), map_location=lambda storage, loc: storage))
    model.eval()
    if torch.cuda.is_available():
        model.cuda()
        print("Using GPU {}".format(torch.cuda.current_device()))
        torch.cuda.manual_seed(241984)
    else:
        print("Using CPU")
        torch.manual_seed(241984)

    count = 1
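    # Process the dev set in chunks of 1000 examples and dump what
    # model.visualize() returns for each conv layer to disk, one file per chunk.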
    for s in util.chunk(data_set['dev'], 1000):
        conv_1_weights, conv_2_weights, conv_3_weights = model.visualize(util.get_tuples(s))

        print("Layer: {}".format(conv_1_weights))
        np.save(open(results_dir + '/conv_1_weights.' + str(count), 'wb'), conv_1_weights)
        print("Layer: {}".format(conv_2_weights))
        np.save(open(results_dir + '/conv_2_weights.' + str(count), 'wb'), conv_2_weights)
        print("Layer: {}".format(conv_3_weights))
        np.save(open(results_dir + '/conv_3_weights.' + str(count), 'wb'), conv_3_weights)

        count += 1
Example #16
def train(config, exp_name, data_path):
    # Read train and dev data, set dev mode = True
    results_dir = os.path.join(data_path, exp_name)
    if os.path.exists(results_dir):
        print("{} already exists, no need to train.\n".format(results_dir))
        return
    os.makedirs(results_dir)
    json.dump(config, open(os.path.join(results_dir, 'config.json'), 'w'),
              sort_keys=True, separators=(',\n', ': '))
    data_set = data.read_dataset(data_path, results_dir, dev_mode=True)
    is_dev = config['is_dev']
    print("\n***{} MODE***\n".format('DEV' if is_dev else 'TEST'))
    print("Number of training data points {}".format(len(data_set['train'])))
    print("Number of dev data points {}".format(len(data_set['dev'])))
    print("Number of test data points {}".format(len(data_set['test'])))
    print("Vocabulary size: {}".format(constants.vocab_size))
        
    model = models.TCNN(constants.vocab_size, config['ent_dim'])

    if torch.cuda.is_available():
        model.cuda()
        print("Using GPU {}".format(torch.cuda.current_device()))
    else:
        print("Using CPU")
        torch.set_num_threads(56)

    grad_descent = optimizer.GradientDescent(data_set['train'], data_set['test'],
                                             config, results_dir, model)

    print('Training...\n')
    start = time.time()
    grad_descent.minimize()
    end = time.time()
    hours = int((end - start) / 3600)
    minutes = ((end - start) % 3600) / 60
    print("Finished Training! Took {} hours and {} minutes\n".format(hours, minutes))
Example #17
import os

from constants import TASK1_DATA_PATH
from data import read_dataset
from utils import accuracy, average_distance, mean

if __name__ == "__main__":
    predictions_dir = os.path.join('output', 'dh0p1')
    acc = {'low': [], 'medium': [], 'high': []}
    dist = {'low': [], 'medium': [], 'high': []}
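    # Score every prediction file against the corresponding dev gold data,
    # grouped by training-data size (low/medium/high).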
    for filename in sorted(os.listdir(predictions_dir)):
        for dataset in ['low', 'medium', 'high']:
            if dataset in filename and dataset == filename.split('-')[-2]:
                language = '-'.join(filename.split('-')[:-2])
                _, _, predictions = read_dataset(os.path.join(predictions_dir, filename))
                _, _, truth = read_dataset(os.path.join(TASK1_DATA_PATH, '{}-dev'.format(language)))

                print('{}[task 1/{}]: {:.4f}, {:.4f}'.format(language, dataset, accuracy(predictions, truth), average_distance(predictions, truth)))
                acc[dataset].append(accuracy(predictions, truth))
                dist[dataset].append(average_distance(predictions, truth))

    print()
    print()
    for dataset in ['low', 'medium', 'high']:
        print('Average[{}]: {:.4f}, {:.4f}'.format(dataset, mean(acc[dataset]), mean(dist[dataset])))
Example #18
train_path = 'resources/traing_dataset.csv'
valid_path = 'resources/valid_dataset.csv'
test_path = 'resources/test_dataset.csv'


# Setting for learning
batch_size = 100
iteration = 10
epochs = 10
valid_size = 5

X, Y, is_training, cost, optimizer, accuracy, merged = alexnet()

# Read dataset.
classes = ['daisy', 'dandelion', 'rose', 'sunflower', 'tulip']
train = read_dataset(train_path)
valid = read_dataset(valid_path)
train_labels = train[classes]
valid_labels = valid[classes]

sess = tf.Session()

# For tensorboard
logdir = 'log/'
train_writer = tf.summary.FileWriter(logdir + '/train', sess.graph)
valid_writer = tf.summary.FileWriter(logdir + '/valid')

# Initialize variables
init = tf.global_variables_initializer()
sess.run(init)
Example #19
                    total_loss += loss_value
        print("\nLoss per sentence: %.3f" % (total_loss/len(traindata)))
        print("Example outputs:")
        for s in traindata[:5]:
            for i, fields in enumerate(s):
                wf, lemma, msd = fields
                if (iscandidatemsd(msd) or (msd == NONE and lemma != NONE))\
                   and random() < SAMPLETRAIN:
                    print("INPUT:", s[i][LEMMA], "OUTPUT:",
                          generate(i, s, id2char),
                          "GOLD:", wf)
                    break

        devacc, devlev = eval(devdata, id2char)
        print("Development set accuracy: %.2f" % (100*devacc))
        print("Development set avg. Levenshtein distance: %.2f" % devlev)
        print()


if __name__=='__main__':
    traindata, wf2id, lemma2id, char2id, msd2id = read_dataset(sysargv[1])
    devinputdata, _, _, _, _ = read_dataset(sysargv[2])
    devgolddata, _, _, _, _ = read_dataset(sysargv[3])

    id2char = {id:char for char,id in char2id.items()}
    init_model(wf2id,lemma2id,char2id,msd2id)
    train(traindata, [devinputdata, devgolddata],
          wf2id, lemma2id, char2id, id2char, msd2id, 20)
    eval([devinputdata, devgolddata], id2char, generating=1,
         outf=open("%s-out" % sysargv[2], "w"))
Example #20
File: train.py Project: Dtananaev/tf_SfM
def train():
    """Train SUN3D for a number of steps."""
    with tf.Graph().as_default(), tf.device('/gpu:0'):
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for SUN3D.
        images, gtdepths, gttransforms = data.read_dataset(eval_data=False)

        pdepth, ptransforms = model.inference(images, True)

        # Calculate loss.
        loss = model.loss(pdepth, gtdepths, gttransforms, ptransforms)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = model.train(loss, global_step)

        config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=LOG_DEVICE_PLACEMENT,
            intra_op_parallelism_threads=NUM_PREPROCESS_THREADS)
        config.gpu_options.allow_growth = True

        #config.gpu_options.per_process_gpu_memory_fraction = 0.4

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = BATCH_SIZE
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                save_checkpoint_secs=3600,
                checkpoint_dir=TRAIN_LOG,
                hooks=[
                    tf.train.StopAtStepHook(last_step=NUM_ITER),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=config) as mon_sess:
            while not mon_sess.should_stop():
                print(mon_sess.run(loss))
                mon_sess.run(train_op)
Example #21
def main(argv):
    keep_prob = tf.placeholder(tf.float32, name="keep_prob")
    image = tf.placeholder(tf.float32,
                           shape=[FLAGS.batch_size, IMAGE_SIZE, IMAGE_SIZE, 3],
                           name="input_image")
    annotation = tf.placeholder(
        tf.int32,
        shape=[FLAGS.batch_size, IMAGE_SIZE, IMAGE_SIZE, 1],
        name="annotation")

    pred_annotation, logits = inference(image)
    tf.summary.image("input_image", image)
    tf.summary.image("ground_truth", tf.cast(annotation, tf.uint8))
    tf.summary.image("pred_annotation", tf.cast(pred_annotation, tf.uint8))

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
                                                       labels=tf.squeeze(
                                                           annotation,
                                                           squeeze_dims=[3])))
    tf.summary.scalar("loss", loss)

    trainable_var = tf.trainable_variables()
    train_op = train(FLAGS.learning_rate, loss, trainable_var)
    summary_op = tf.summary.merge_all()

    train_records, valid_records = data.read_dataset(FLAGS.data_dir)
    print("Train records:", len(train_records))
    print("Valid records:", len(valid_records))
    train_reader = BatchReader(train_records, {
        'resize': True,
        'resize_size': IMAGE_SIZE
    })

    sess = tf.Session()
    saver = tf.train.Saver(max_to_keep=10)
    summary_writer = tf.summary.FileWriter(
        os.path.join(FLAGS.model_dir, "logs"), sess.graph)

    sess.run(tf.global_variables_initializer())
    ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        print("Model restored from", ckpt.model_checkpoint_path)
    else:
        print("initialize new model")

    for itr in xrange(MAX_ITERATION):
        train_images, train_annotations = train_reader.next_batch(
            FLAGS.batch_size)
        feed_dict = {
            image: train_images,
            annotation: train_annotations,
            keep_prob: 0.8
        }
        train_loss, _, pred_result, summary_str = sess.run(
            [loss, train_op, pred_annotation, summary_op], feed_dict=feed_dict)
        summary_writer.add_summary(summary_str, itr)
        print("Time: %d, Step: %d, Train loss: %g" %
              (time.time(), itr, train_loss))
        if itr % 10 == 0 and itr > 0:
            saver.save(sess, FLAGS.model_dir + "model.ckpt", itr)
            print(pred_result[0])
Example #22
    optimE = optim.Adam(encoder.parameters(), lr=config.getfloat('training', 'lr')*0.01) 
    optimG = optim.Adam(generator.parameters(), lr=config.getfloat('training', 'lr'))
    optimD = optim.Adam(discriminator.parameters(), lr=config.getfloat('training', 'lr'))

    '''
    Quake_Smart_seq2 = data.read_dataset(_path+"../data/Quake_Smart-seq2/data.h5")
    Quake_10x = data.read_dataset(_path+"../data/Quake_10x/data.h5")
    merge = {"A":Quake_Smart_seq2, "B":Quake_10x}
    mergedexpr, mergedl = data.merge_datasets(merge)
    s = mergedexpr.sum(axis=1)
    x = (mergedexpr.T/s).T
    x = x * 10000
    x,y,z,w = data.split_data(x, mergedl, test_size=0.01)
    '''
    
    Baron_human = data.read_dataset(_path + "../data/Baron_human/data.h5")
    Muraro = data.read_dataset(_path + "../data/Muraro/data.h5")
    Enge = data.read_dataset(_path + "../data/Enge/data.h5")
    Segerstolpe = data.read_dataset(_path + "../data/Segerstolpe/data.h5")
    Xin_2016 = data.read_dataset(_path + "../data/Xin_2016/data.h5")
    Lawlor = data.read_dataset(_path + "../data/Lawlor/data.h5")
    merge = {'Baron_human': Baron_human, 'Muraro': Muraro, 'Enge': Enge,
             'Segerstolpe': Segerstolpe, 'Xin_2016': Xin_2016, 'Lawlor': Lawlor}
    mergedexpr, mergedl = data.merge_datasets(merge)

    s = mergedexpr.sum(axis=1)
    x = (mergedexpr.T / s).T
    x = x * 10000
    #x = x[: ,:1000]
    whole_set = dataset.Single(x, mergedl)
    
Example #23
    with open(os.path.join(M2M_ALIGNER_PATH, 'output'), 'r',
              encoding='utf8') as file:
        for i, line in enumerate(file):
            if 'NO ALIGNMENT' in line:
                alignments.append([-1] * len(tgts[i]))
                continue

            src, tgt = line.split('\t')
            src = src.replace('|', '')
            tgt = tgt.replace('|', '')

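            # Walk the aligned character pairs: '_' on the source side marks an
            # insertion, which gets alignment index -1.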
            seq_alignment = []
            src_i = 0
            for token1, token2 in zip(src, tgt):
                if token1 != '_':
                    seq_alignment.append(src_i)
                    src_i += 1
                else:
                    seq_alignment.append(-1)

            alignments.append(seq_alignment)

    return alignments


if __name__ == "__main__":
    lemmas, tags, inflected_forms = read_dataset(
        os.path.join(TASK1_DATA_PATH, 'hindi-train-high'))
    alignments = one_one_alignment([list(word) for word in lemmas],
                                   [list(word) for word in inflected_forms])
    print(alignments)
Example #24
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from data import read_dataset

mnist = read_dataset("Data")

sess = tf.InteractiveSession()

x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.zeros([784, 50]))
b = tf.Variable(tf.zeros([50]))


def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)


def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
Example #25
File: dataset.py Project: tyfei216/GCH
            list_of_length.append(i[1] - i[0])
        print("length of all types:", list_of_length)

    def __getitem__(self, index):
        index = self.index[index]
        label = self.train_Y[index]
        r1 = random.randint(
            0, self.st[label][1] - self.st[label][0] - 1) + self.st[label][0]
        r2 = random.randint(
            0,
            len(self) - self.st[label][1] + self.st[label][0] - 1)
        r2 = (r2 + self.st[label][1]) % len(self)
        r1 = self.index[r1]
        r2 = self.index[r2]
        assert self.train_Y[index] == self.train_Y[r1]
        assert self.train_Y[index] != self.train_Y[r2]
        return self.train_X[index], self.train_X[r1], self.train_X[
            r2], self.train_Y[index]


if __name__ == '__main__':
    a = data.read_dataset("../data/Plasschaert/data.h5")
    #a.exprs = a.exprs[:,:1000]
    x, y, z, w = data.getdata(a)
    print(x.shape)
    dataset = Single(x, z)
    print(dataset.catagories)
    dl = DataLoader(dataset, batch_size=5)
    for i, j in enumerate(dl):
        pass
Example #26
def generate_entry(model_name,
                   hyperparameters,
                   datasets=('low', 'medium', 'high'),
                   use_hierarchical_attention=False,
                   use_ptr_gen=True,
                   test_data='test',
                   write_hyperparameter=False,
                   output_folder=None,
                   resume=False):

    languages = get_languages()

    if output_folder is None:
        output_folder = os.path.join('output', model_name)
    if not resume:
        os.makedirs(output_folder)

    if write_hyperparameter:
        with open(os.path.join(output_folder, 'hyperparameters'),
                  'w',
                  encoding='utf8') as file:
            file.write(hyperparameters)

    for language in tqdm(sorted(languages)):
        for dataset in datasets:
            if resume and os.path.exists(
                    os.path.join(output_folder, '{}-{}-out'.format(
                        language, dataset))):
                continue
            lr = hyperparameters['lr'][dataset]
            embedding_size = hyperparameters['embedding_size'][dataset]
            hidden_size = hyperparameters['hidden_size'][dataset]
            clip = hyperparameters['clip'][dataset]
            dropout_p = hyperparameters['dropout_p'][dataset]
            alpha = hyperparameters['alpha'][dataset]
            beta = hyperparameters['beta'][dataset]
            patience = hyperparameters['patience'][dataset]
            epochs_extension = hyperparameters['epochs_extension'][dataset]

            experiment_name = "{}_{}_{}_lr{}_em{}_hd_{}_clip{}_p{}_a{}_b_{}_{}".format(
                model_name, language, dataset, lr, embedding_size, hidden_size,
                str(clip), dropout_p, alpha, beta, int(time.time()))

            try:
                model_inputs_train, model_inputs_val, labels_train, labels_val, \
                vocab = package.data.load_data(language, dataset, test_data=test_data, use_external_val_data=True,
                                               val_ratio=0.2, random_state=42)
            except FileNotFoundError:
                continue

            model = package.net.Model(
                vocab,
                embedding_size=embedding_size,
                hidden_size=hidden_size,
                use_hierarchical_attention=use_hierarchical_attention,
                use_ptr_gen=use_ptr_gen,
                dropout_p=dropout_p).to(device)
            optimizer = optim.Adam(lr=lr, params=model.parameters())
            loss_fn = package.loss.Criterion(vocab, alpha, beta)

            writer = SummaryWriter('runs/' + experiment_name)
            model_save_dir = os.path.join('./saved_models', experiment_name)
            os.makedirs(model_save_dir)

            epochs = hyperparameters['epochs'][dataset]
            train_and_evaluate(model_inputs_train,
                               labels_train,
                               model_inputs_val,
                               labels_val,
                               model,
                               optimizer,
                               loss_fn,
                               epochs=epochs,
                               batch_size=32,
                               model_save_dir=model_save_dir,
                               show_progress=False,
                               writer=writer,
                               clip=clip)
            epochs_trained = epochs

            # Load best performing model on validation set
            best_state = torch.load(os.path.join(model_save_dir, 'best.model'))
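            # Keep extending training until the best validation checkpoint is at
            # least `patience` epochs old, then restore its weights.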
            while epochs_trained - best_state['epoch_num'] < patience:
                train_and_evaluate(model_inputs_train,
                                   labels_train,
                                   model_inputs_val,
                                   labels_val,
                                   model,
                                   optimizer,
                                   loss_fn,
                                   epochs=epochs_extension,
                                   batch_size=32,
                                   model_save_dir=model_save_dir,
                                   show_progress=False,
                                   writer=writer,
                                   clip=clip,
                                   starting_epoch=epochs_trained + 1,
                                   initial_best_val_acc=best_state['val_acc'])
                epochs_trained += epochs_extension
                best_state = torch.load(
                    os.path.join(model_save_dir, 'best.model'))
            model.load_state_dict(best_state['model_state'])

            if test_data == 'dev':
                dev_file = os.path.join(TASK1_DATA_PATH,
                                        '{}-dev'.format(language))
                lemmas_test, tags_test, _ = read_dataset(dev_file)
            elif test_data == 'test':
                test_file = os.path.join(TASK1_DATA_PATH,
                                         '{}-covered-test'.format(language))
                lemmas_test, tags_test = read_covered_dataset(test_file)
            else:
                raise ValueError

            file_path = os.path.join(output_folder,
                                     '{}-{}-out'.format(language, dataset))
            generate_output(model, lemmas_test, tags_test, file_path)
Example #27
    encoder = model.get_encoder(config, "M").cuda()
    discriminator = model.get_discriminator(config).cuda()
    generator = model.get_generator(config).cuda()
    encoder = encoder.cpu()
    encoder = encoder.cuda()
    #classifier = model.get_classifier(config).cuda()
    #gpu_tracker.track()
    #optimC = optim.Adam(classifier.parameters(), lr=config.getfloat('training', 'lr'))
    optimE = optim.Adam(encoder.parameters(),
                        lr=config.getfloat('training', 'lr') * 0.01)
    optimG = optim.Adam(generator.parameters(),
                        lr=config.getfloat('training', 'lr'))
    optimD = optim.Adam(discriminator.parameters(),
                        lr=config.getfloat('training', 'lr'))

    Quake_Smart_seq2 = data.read_dataset(_path +
                                         "../data/Quake_Smart-seq2/data.h5")
    Quake_10x = data.read_dataset(_path + "../data/Quake_10x/data.h5")
    merge = {"A": Quake_Smart_seq2, "B": Quake_10x}
    mergedexpr, mergedl = data.merge_datasets(merge)
    s = mergedexpr.sum(axis=1)
    x = (mergedexpr.T / s).T
    x = x * 10000
    x, y, z, w = data.split_data(x, mergedl, test_size=0.01)
    '''
    Baron_human = data.read_dataset(_path+"../data/Baron_human/data.h5")
    Muraro = data.read_dataset(_path+"../data/Muraro/data.h5")
    Enge = data.read_dataset(_path+"../data/Enge/data.h5")
    Segerstolpe = data.read_dataset(_path+"../data/Segerstolpe/data.h5")
    Xin_2016 = data.read_dataset(_path+"../data/Xin_2016/data.h5")
    Lawlor = data.read_dataset(_path+"../data/Lawlor/data.h5")
    merge = {'Baron_human':Baron_human, 'Muraro':Muraro, 'Enge':Enge, 'Segerstolpe':Segerstolpe, 
Example #28
def main():
    """
    Main script for training and evaluating a GSDT.
    """
    args = parse_args()
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    device = to_device(args.device)

    trn_x, trn_y, test_x, test_y = data.read_dataset(args.data)
    in_features = trn_x.shape[1]
    out_classes = trn_y.max() + 1
    model = models.GSDT(in_features, out_classes, args.depth, args.branch,
                        args.cov, args.rank).to(device)

    trn_loader = to_loader(trn_x,
                           trn_y,
                           batch_size=args.batch_size,
                           shuffle=True)
    test_loader = to_loader(test_x, test_y, batch_size=args.batch_size)
    loss_func = models.TreeLoss(args.loss_type, lamda=args.lamda)
    optimizer1 = optim.Adam(model.parameters(), lr=args.lr)
    optimizer2 = optim.Adam(model.parameters(), lr=1e-2)

    logs = []
    for epoch in range(1, args.epochs + 1):
        model.train()
        loss1_sum, loss2_sum, count = 0, 0, 0
        for x, y in trn_loader:
            x = x.to(device)
            y = y.to(device)
            loss1, loss2 = loss_func(model, x, y)

            optimizer1.zero_grad()
            (loss1 + loss2).backward()
            optimizer1.step()

            loss1_sum += loss1.item() * x.size(0)
            loss2_sum += loss2.item() * x.size(0)
            count += x.size(0)

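        # Halfway through the epochs, call fit_leaves with the second optimizer
        # (presumably to refit the tree's leaf parameters).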
        if epoch == args.epochs // 2:
            fit_leaves(model, trn_loader, device, optimizer2)

        trn_acc = evaluate(model, trn_loader, device)
        test_acc = evaluate(model, test_loader, device)
        logs.append(
            (epoch, loss1_sum / count, loss2_sum / count, trn_acc, test_acc))

    if args.save:
        model_path = '{}/{}/{}.model'.format(args.out, args.data, args.seed)
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        torch.save(model.state_dict(), model_path)

    df = pd.DataFrame(logs)
    log_path = '{}/{}/{}.log'.format(args.out, args.data, args.seed)
    os.makedirs(os.path.dirname(log_path), exist_ok=True)
    df.to_csv(log_path,
              index=False,
              sep='\t',
              header=False,
              float_format='%.4f')

    trn_acc = evaluate(model, trn_loader, device)
    test_acc = evaluate(model, test_loader, device)
    result = np.array([trn_acc, test_acc])
    np.save('{}/{}/{}'.format(args.out, args.data, args.seed), result)
Example #29
def load_data(language, dataset, test_data='dev', val_ratio=0.2, random_state=42, use_external_val_data=False):
    """Loads training data."""

    train_dataset = os.path.join(TASK1_DATA_PATH, '{}-train-{}'.format(language, dataset))
    lemmas, tags, inflected_forms = read_dataset(train_dataset)
    train_data_size = len(lemmas)

    if val_ratio*train_data_size > 1000:
        val_ratio = 1000/train_data_size
    val_dataset = None

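    # Prefer an external validation file: the dev set when it is not the test
    # data, otherwise the largest other training split that exists.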
    if use_external_val_data:
        dev_dataset = os.path.join(TASK1_DATA_PATH, '{}-dev'.format(language))
        high_dataset = os.path.join(TASK1_DATA_PATH, '{}-train-high'.format(language))
        medium_dataset = os.path.join(TASK1_DATA_PATH, '{}-train-medium'.format(language))
        low_dataset = os.path.join(TASK1_DATA_PATH, '{}-train-low'.format(language))

        if test_data != 'dev':
            val_dataset = dev_dataset
        elif os.path.exists(high_dataset) and train_dataset != high_dataset:
            val_dataset = high_dataset
        elif os.path.exists(medium_dataset) and train_dataset != medium_dataset:
            val_dataset = medium_dataset
        elif os.path.exists(low_dataset) and train_dataset != low_dataset:
            val_dataset = low_dataset

        if val_dataset is not None:
            lemmas_val, tags_val, inflected_forms_val = read_dataset(val_dataset)

    if val_dataset is not None and len(lemmas_val) >= val_ratio*train_data_size:
        lemmas_train, tags_train, inflected_forms_train = lemmas, tags, inflected_forms

        val_data = list(zip(lemmas_val, tags_val, inflected_forms_val))
        random.seed(random_state)
        val_data_size = int(min(max(val_ratio*train_data_size, 100), len(lemmas_val)))
        val_data = random.sample(val_data, val_data_size)
        lemmas_val, tags_val, inflected_forms_val = zip(*val_data)
        lemmas_val, tags_val, inflected_forms_val = list(lemmas_val), list(tags_val), list(inflected_forms_val)
    else:
        lemmas_train, lemmas_val, tags_train, tags_val, inflected_forms_train, inflected_forms_val = train_test_split(
            lemmas, tags, inflected_forms, test_size=val_ratio, random_state=random_state)

    train_data_size = len(lemmas_train)
    val_data_size = len(lemmas_val)

    if test_data == 'dev':
        dev_data = os.path.join(TASK1_DATA_PATH, '{}-dev'.format(language))
        lemmas_test, tags_test, _ = read_dataset(dev_data)
    elif test_data == 'test':
        test_data = os.path.join(TASK1_DATA_PATH, '{}-covered-test'.format(language))
        lemmas_test, tags_test = read_covered_dataset(test_data)
    else:
        lemmas_test, tags_test, inflected_forms_test = [], [], []

    vocab = Vocab(lemmas_train+lemmas_val+lemmas_test, tags_train+tags_val+tags_test, inflected_forms_train)

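    # Precompute character alignments and per-step p_gen targets (presumably
    # copy-vs-generate indicators for the pointer-generator decoder).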
    alignments_train = get_alignment(lemmas_train, inflected_forms_train, vocab)
    p_gens_train = get_p_gens([[vocab.START_CHAR] + list(lemma) + [vocab.STOP_CHAR] for lemma in lemmas_train],
                              [list(inflected_form) + [vocab.STOP_CHAR] for inflected_form in inflected_forms_train], alignments_train)

    alignments_val = get_alignment(lemmas_train+lemmas_val, inflected_forms_train+inflected_forms_val, vocab)[train_data_size:]
    p_gens_val = get_p_gens([[vocab.START_CHAR] + list(lemma) + [vocab.STOP_CHAR] for lemma in lemmas_val],
                              [list(inflected_form) + [vocab.STOP_CHAR] for inflected_form in inflected_forms_val], alignments_val)

    lemmas_indices = vocab.words_to_indices(lemmas_train+lemmas_val, start_char=True, stop_char=True)
    tags_indices = vocab.tag_to_indices(tags_train+tags_val)
    inflected_forms_indices = vocab.words_to_indices(inflected_forms_train+inflected_forms_val)

    model_inputs_train = list(zip(lemmas_indices[:train_data_size], tags_indices[:train_data_size]))
    labels_train = list(zip(inflected_forms_indices[:train_data_size], alignments_train, p_gens_train))

    model_inputs_val = list(zip(lemmas_indices[train_data_size:], tags_indices[train_data_size:]))
    labels_val = list(zip(inflected_forms_indices[train_data_size:], alignments_val, p_gens_val))

    return model_inputs_train, model_inputs_val, labels_train, labels_val, vocab
Example #30
from sys import argv

from baseline import eval
from data import read_dataset

if __name__ == "__main__":
    sysdata, _, _, _, _ = read_dataset(argv[1])
    golddata, _, _, _, _ = read_dataset(argv[2])
    acc, lev = eval([sysdata, golddata], id2char={}, generating=0)
    print("Accuracy: %.2f" % (100 * acc))
    print("Avg. Levenshtein distance: %.2f" % (lev))
Example #31
        #    print(i)
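        # Brute-force k-NN: compute the Euclidean distance from sample i to every
        # other sample, using a large sentinel for the self-distance.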
        dis = []
        for j in range(data.shape[0]):
            if i == j:
                dis.append(10000000000.0)
            else:
                dis.append(np.sqrt(((data[i] - data[j])**2).sum()))
        index = np.argsort(dis)
        knn.append(index[:k])
        if label is not None:
            for j in range(k):
                if label[index[j]] == label[i]:
                    gcnt += 1

    if label is not None:
        print(gcnt, gcnt / float(len(label) * k))

    return knn


if __name__ == '__main__':
    a = data.read_dataset("../data/Adam/data.h5")
    #a.exprs = a.exprs[:,:1000]
    x, y, z, w = data.getdata(a)

    knn = buildKNN(x, label=z, k=10)

    with open("./data/Adamknn.pkl", 'wb') as f:
        pickle.dump(knn, f)

    np.save()
Example #32
                        (lang, devlev))
                    print()
            break


if __name__ == '__main__':

    exp_name = str(sysargv[5])
    exp_path = 'dumped/' + exp_name
    global wf2id_dict, lemma2id_dict, char2id_dict, id2char_dict, msd2id_dict, msd2id_split, id2msd_split, languages
    languages = str(sysargv[1]).split(',')
    traindata, devinputdata, devgolddata, wf2id_dict, lemma2id_dict, char2id_dict, msd2id_dict, id2char_dict, msd2id_split_monolingual, id2msd_split = {},{},{},{},{},{},{},{},{},{}

    for lang in languages:
        traindata[lang], wf2id_dict[lang], lemma2id_dict[lang], char2id_dict[lang], msd2id_dict[lang], \
            msd2id_split_monolingual[lang] = read_dataset(sysargv[2].format(lang))
        devinputdata[lang], _, _, _, _, _ = read_dataset(
            sysargv[3].format(lang))
        devgolddata[lang], _, _, _, _, _ = read_dataset(
            sysargv[4].format(lang))

        id2char_dict[lang] = {
            id: char
            for char, id in char2id_dict[lang].items()
        }

    all_msd_splits = []
    for i, lang in enumerate(languages):
        if i == 0: all_msd_splits += msd2id_split_monolingual[lang].keys()
        else:
            for m in msd2id_split_monolingual[lang].keys():