Example #1
def combine_regression_data():
    utils.write_data(
        input("Output File: "),
        utils.join_data(
            [input("file 1: "), input("file 2: ")],
            expt=[5, 3],
            d_filter=[[
                utils.sanitize_lower, utils.sanitize_lower, float, float, float
            ], [utils.sanitize_lower, utils.sanitize_lower, float]]))
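The d_filter argument appears to supply one converter per column for each input file: two lowercase-sanitized string columns plus three floats for the first file, two strings plus one float for the second, with expt=[5, 3] presumably giving the matching expected column counts. A minimal hedged sketch of how such a per-column filter list could be applied to one parsed row; apply_filter and str.lower (standing in for utils.sanitize_lower) are hypothetical illustrations, not the real utils internals:

# Hypothetical per-row use of a column-filter list; not the actual utils.join_data behavior.
def apply_filter(row, col_filter):
    # row: raw string fields; col_filter: one converter callable per column
    return [f(value) for f, value in zip(col_filter, row)]

row = ["GeneA", "Sample1", "0.12", "3.4", "7"]
print(apply_filter(row, [str.lower, str.lower, float, float, float]))
# ['genea', 'sample1', 0.12, 3.4, 7.0]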
Example #2
    def sample_topk(self,
                    prefix,
                    end_tag=(u' ', u'.', u',', u'\n', u'\t'),
                    k=5,
                    B=100,
                    N=30):
        # B, N: width, depth of beam search
        prefix = u" " + prefix
        prefix = utils.split_raw(prefix, self.text_modeling)
        prefix = np.array(list(map(self.vocab.get, prefix)))
        current_state = self.model.initial_rnn_state(1)
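        # run the prefix (all but its last token) through the RNN to build up state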
        for x in prefix[:-1]:
            _, current_state, _ = self.model.sample_output(
                self.sess, x, current_state)
        end_tag = [self.vocab.get(e, -1) for e in end_tag]

        # not so efficient beam search. just initial version
        candidates = [{
            'p': 1.,
            'seq': prefix[1:],
            'rnn_state': current_state,
            'done': False
        }]
        for depth in range(len(prefix) - 1, N):
            new_candidates = []
            for d in candidates:
                if d['done']:
                    new_candidates.append(d)
                    continue
                _, next_rnn_state, next_x_prob = self.model.sample_output(
                    self.sess, d['seq'][-1], d['rnn_state'])
                for next_x in range(len(next_x_prob)):
                    next_p = d['p'] * next_x_prob[next_x]
                    next_seq = np.concatenate((d['seq'], [next_x]))
                    done = next_x in end_tag
                    new_candidates.append({
                        'p': next_p,
                        'seq': next_seq,
                        'rnn_state': next_rnn_state,
                        'done': done
                    })
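            # keep only the B most probable candidates for the next depth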
            candidates = []
            new_candidates.sort(key=lambda x: x['p'], reverse=True)
            for idx in range(min(B, len(new_candidates))):
                candidates.append(new_candidates[idx])
        result = []
        for idx in range(min(k, len(candidates))):
            d = candidates[idx]
            seq = list(map(self.inv_vocab.get, d['seq']))
            result.append((utils.join_data(seq, self.text_modeling), d['p']))
        return result
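For orientation, a hedged usage sketch: `sampler` stands for a trained instance of the class this method belongs to (hypothetical here). The call returns up to k (completion text, beam probability) pairs sorted by probability, matching what the method builds in `result`:

# Hypothetical call; `sampler` is an assumed trained instance of the class above.
completions = sampler.sample_topk(u"the quick", k=3, B=50, N=20)
for text, prob in completions:
    # each entry is (text rebuilt with utils.join_data, accumulated beam probability)
    print(u"{:.6f}  {}".format(prob, text))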
Example #3
    def random_sentence(self, prefix=u"", seq_length=100):
        prefix = u" " + prefix
        prefix = utils.split_raw(prefix, self.text_modeling)
        prefix = np.array(list(map(self.vocab.get, prefix)))
        current_state = self.model.initial_rnn_state(1)
        output = []
        # run the prefix through the RNN, keeping its tokens in the output
        for x in prefix[:-1]:
            output.append(x)
            _, current_state, _ = self.model.sample_output(
                self.sess, x, current_state)
        x = prefix[-1]
        output.append(x)
        # sample new tokens until the requested sequence length is reached
        while len(output) < seq_length:
            x, current_state, _ = self.model.sample_output(
                self.sess, x, current_state)
            output.append(x)
        # map ids back to tokens and join them into the output string
        output = list(map(self.inv_vocab.get, output))
        output_str = utils.join_data(output, self.text_modeling)
        return output_str
Example #4
def setup_data(npath, ppath):
    from ml_data import SequenceDinucLabelsProperties

    global data
    global limits
    global lengths
    global shared_input_length

    # Join data into a single input vector
    data = join_data(SequenceDinucLabelsProperties(npath, ppath))

    # Get lengths of input vectors
    lengths = get_input_lengths(data)

    # Define limits of data type on single vector
    limits = calc_limits(data)

    # Shared input vector length (fixed at 79 for this feature set)
    shared_input_length = 79
Example #5
def setup_data(npath, ppath):
    from ml_data import SimpleHistData
    from ml_data import DinucCrossCovarData

    global data
    global limits
    global lengths
    global shared_input_length

    # Join data into a single input vector
    data = join_data(
        SimpleHistData(npath, ppath, k=4, upto=True),
        DinucCrossCovarData(npath, ppath, k=3, upto=True),
    )

    # Get lengths of input vectors
    lengths = get_input_lengths(data)

    # Define limits of data type on single vector
    limits = calc_limits(data)

    # Compute shared input vector length
    shared_input_length = limits[-1][-1]
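Both setup_data variants rely on the same apparent convention: join_data concatenates the per-source feature vectors for each sample, get_input_lengths records each source's length, and calc_limits turns those lengths into (start, end) index pairs into the joined vector, so limits[-1][-1] is the total joined length. That reading is inferred from how the results are used here rather than from documented ml_data behavior; the sketch below illustrates it with hypothetical stand-ins:

# Hypothetical illustration of the assumed limits convention; not the real ml_data helpers.
lengths = [8, 5]                       # e.g. two feature sources of length 8 and 5
limits = []
start = 0
for n in lengths:
    limits.append((start, start + n))  # (start, end) slice of this source in the joined vector
    start += n
# limits == [(0, 8), (8, 13)], so limits[-1][-1] == 13 is the shared input length
shared_input_length = limits[-1][-1]
print(shared_input_length)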
Example #6
if args.load_dir is not None:
    print "Continue from {}".format(args.load_dir)
    saver.restore(sess, args.load_dir)

start_time = time.time()
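# main training loop: one pass over the training batches per epoch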
for epoch in range(args.n_epochs):
    losses = []
    n_batch = train_loader.n_batch(args.batch_size, args.seq_length)
    for idx, (batch_x, batch_y) in enumerate(
            train_loader.get_batch(args.batch_size, args.seq_length)):
        loss = model.run_train_op(sess, train_op, batch_x, batch_y,
                                  model.initial_rnn_state(args.batch_size))
        losses.append(loss)
        print "Epoch {} ({} / {}), loss: {:.4f}, elapsed time: {:.1f}s".format(
            epoch, idx, n_batch, loss,
            time.time() - start_time)
    writer.add_summary(
        sess.run(loss_summary, feed_dict={loss_log: np.mean(losses)}), epoch)

    saver.save(sess, args.save_dir)

    # sample a 100-token preview after each epoch, starting from a space character
    output = []
    x = train_loader.vocab.get(unichr(32))
    current_state = model.initial_rnn_state(1)
    for _ in range(100):
        x, current_state, _ = model.sample_output(sess, x, current_state)
        output.append(x)
    output = list(map(train_loader.inv_vocab.get, output))
    print output
    print utils.join_data(output, args.text_modeling)
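After each epoch the script prints both the raw list of sampled tokens and the text reassembled by utils.join_data. A toy sketch of that last id-to-text round trip, with made-up vocab contents and the assumption that join_data reduces to plain concatenation for character-level text_modeling:

# Toy round trip from sampled ids back to text; vocab contents and the join are assumptions.
inv_vocab = {0: u'h', 1: u'i', 2: u' '}
output = [0, 1, 2, 0, 1]
tokens = list(map(inv_vocab.get, output))   # [u'h', u'i', u' ', u'h', u'i']
text = u''.join(tokens)                     # assumed join_data behavior for char-level modeling
print(text)                                 # hi hi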