Example #1
def inference(params):
    embedding_size = params['embedding_size']
    vocab_size = params['vocab_size']
    sentence_len = params['num_words_before'] + params['num_words_after']
    embedding_wd = utils.get_dict_value(params, 'embedding_wd', 0.0)
    embedding_device = utils.get_dict_value(params, 'embedding_device', None)
    embedding_initializer = utils.get_dict_value(params,
                                                 'embedding_initializer', None)
    embedding_keep_prob = utils.get_dict_value(params, 'embedding_keep_prob',
                                               None)  # None disables dropout below
    print("USING EMBEDDING DEVICE %s" % embedding_device)
    if embedding_device is not None:
        with tf.device(embedding_device):
            embedding_matrix = nlp.variable_with_weight_decay(
                'embedding_matrix', [vocab_size, embedding_size],
                initializer=embedding_initializer,
                wd=embedding_wd)
    else:
        embedding_matrix = nlp.variable_with_weight_decay(
            'embedding_matrix', [vocab_size, embedding_size],
            initializer=embedding_initializer,
            wd=embedding_wd)
    if embedding_keep_prob is not None and embedding_keep_prob < 1.0:
        [embedding_matrix], _ = core.dropout([embedding_matrix],
                                             [embedding_keep_prob])
    input_sentence = tf.placeholder(tf.int32, [None, sentence_len], 'sentence')
    emb_sentence = tf.nn.embedding_lookup(embedding_matrix, input_sentence,
                                          name='emb_sentence')
    enc_sentence, _ = sentence_encoder(emb_sentence, params)

    return enc_sentence, None
Example #2
def optimizer(optimizer_param, loss_nodes, learning_rate, var_lists=None):
    # this version has gradient clipping
    optimizer_nodes = []
    max_grad_norm = utils.get_dict_value(optimizer_param, 'max_grad_norm', 5)

    # if var_lists is None, then make it a list of None matching the # of loss nodes
    if var_lists is None:
        var_lists = [None] * len(loss_nodes)

    # just create adam optimizers
    for loss_node, var_list in zip(loss_nodes, var_lists):
        loss = loss_node

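        # optionally fold losses registered in the graph's REGULARIZATION_LOSSES
        # collection (e.g. weight decay terms) into the training objective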
        if utils.get_dict_value(optimizer_param,
                                trainer.ENABLE_REGULARIZATION_PARAM_NAME,
                                False):
            reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            reg_constant = 1.0  # already have wd, make this parametrizable
            loss += reg_constant * sum(reg_losses)
        if utils.get_dict_value(optimizer_param, 'optimizer',
                                'adam') == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        else:
            print("USING SGD OPTIMIZER WITH LR OF %s" % learning_rate)
            optimizer = tf.train.GradientDescentOptimizer(
                learning_rate=learning_rate)
        grads_vars = optimizer.compute_gradients(loss, var_list=var_list)
        grads = [g for (g, v) in grads_vars]
        variables = [v for (g, v) in grads_vars]
        if max_grad_norm > 0:
            # clip the global gradient norm before applying the update
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        train_op = optimizer.apply_gradients(zip(grads, variables))
        optimizer_nodes.append(train_op)
    return optimizer_nodes
Example #3
def _gen_data(params, sentence, num_before, num_after, null_sample_factor=0):
    global max_value
    vocab_size = utils.get_dict_value(params, 'vocab_size', 256)
    start_char = utils.get_dict_value(params, 'start_char', 1)
    slen = len(sentence)
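    # log the largest out-of-vocabulary character code seen so far, then clamp
    # every character id to vocab_size - 1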
    z = [ord(x) for x in sentence if ord(x) > vocab_size - 1]
    if len(z) > 0 and max(z) > max_value:
        max_value = max(z)
        print('max_value = %s' % max_value)
    sentence = [min(ord(x), vocab_size - 1) for x in sentence]
    sentence = ([0] * (num_before - 1) + [start_char] + sentence +
                [0] * (num_after - 1))

    null_list = []
    pos_list = []
    keychars = [ord(',')]
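    # every comma position yields a positive window (its context with the comma
    # removed); every position also yields a candidate "null" (negative) window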
    for i in range(num_before, num_before + slen):
        if sentence[i] in keychars:
            pos_list.append(sentence[i - num_before:i] +
                            sentence[i + 1:i + num_after + 1])
        null_list.append(sentence[i - num_before:i] +
                         sentence[i:i + num_after])

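    # null_sample_factor < 0 keeps roughly as many null windows as positives,
    # > 0 keeps that multiple of the positive count, and 0 keeps them all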
    if null_sample_factor < 0:
        random.shuffle(null_list)
        null_list = null_list[:((len(pos_list) + 1))]
    elif null_sample_factor > 0:
        random.shuffle(null_list)
        null_list = null_list[:(null_sample_factor * (len(pos_list) + 1))]
    result = [[x, 0] for x in null_list] + [[x, 1] for x in pos_list]
    for x in result:
        yield x
Example #4
def sentence_encoder(emb_sentence, params, name='encoded_sentence'):
	"""
	@param emb_sentence:
	@param params:
	@return:
	"""
	conv_num_features = utils.get_dict_value(params, 'conv_num_features', [[100,100,100], [100]])
	conv_widths = utils.get_dict_value(params, 'conv_widths', [[2,3,4],[3]])
	conv_keep_probs = utils.get_dict_value(params, 'conv_keep_probs', 0.5)
	mlp_config = utils.get_dict_value(params, 'mlp_config', [512])
	bipass_conv = utils.get_dict_value(params, 'bipass_conv', False)
	mlp_activations = utils.get_dict_value(params, 'mlp_activations', 'sigmoid')
	mlp_dropout_keep_probs = utils.get_dict_value(params, 'mlp_keep_probs', 0.9)
	use_no_conv_path = utils.get_dict_value(params, 'use_no_conv_path', False)

	weight_wd_regularization = utils.get_dict_value(params, 'weight_wd_regularization', 0.0)
	bias_wd_regularization = utils.get_dict_value(params, 'bias_wd_regularization', 0.0)

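	# build one conv1d branch per (conv_num_features, conv_widths) pair over the
	# embedded sentence (bipass_conv skips the convolutions entirely and
	# use_no_conv_path adds the raw embedding as an extra branch), then concat
	# the branches and feed them through the fully connected network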
	if bipass_conv:
		conv_group = [emb_sentence]
	else:
		if use_no_conv_path:
			conv_group = [emb_sentence]
		else:
			conv_group = []
		for i, (conv_num_feature, conv_width) in enumerate(zip(conv_num_features, conv_widths)):
			conv_out = nlp.conv1d_array(emb_sentence, conv_num_feature, conv_width,name='conv%s'%(str(i)),
										w_wds=weight_wd_regularization,
										b_wds=bias_wd_regularization, keep_probs=conv_keep_probs)
			conv_group.append(conv_out)
	conv_out, _ = misc.concat(conv_group)
	mlp_out, _ = mlp.fully_connected_network(conv_out, mlp_config, layer_activations=mlp_activations, dropout_keep_probs=mlp_dropout_keep_probs)
	return [tf.identity(mlp_out[0], name=name)], {}
Example #5
	def load(self, model_dir):
		self._model_dir = model_dir
		self._paramsfile = os.path.join(self._model_dir, 'params.py')
		self._params = utils.load_param_file(self._paramsfile)
		ckpt = os.path.join(utils.get_dict_value(self._params,'output_location'),
												utils.get_dict_value(self._params, 'model_name') + '.ckpt')
		self._e = Evaluator.load2(ckpt)
Example #6
 def __init__(self, file_list, indexer=None, params=None):
     self._file_list = file_list
     self._cur_list = []
     self._next_file = 0
     self._keywords = [',']
     self._num_before = utils.get_dict_value(params, 'num_words_before', 5)
     self._num_after = utils.get_dict_value(params, 'num_words_after', 5)
     self.load_next_file()
     self._indexer = indexer
     self._current_epoch = 0
     self._current_index = 0
Example #7
def inference(params):
    feature_count = utils.get_dict_value(params, 'feature_count')
    mlp_config = utils.get_dict_value(params, 'mlp_config')
    mlp_activations = utils.get_dict_value(params, 'mlp_activations')
    mlp_dropout_keep_probs = utils.get_dict_value(params,
                                                  'mlp_dropout_keep_probs')
    x = tf.placeholder(tf.float32, [None, feature_count], 'features')
    mlp_out, _ = mlp.fully_connected_network(
        [x],
        mlp_config,
        layer_activations=mlp_activations,
        dropout_keep_probs=mlp_dropout_keep_probs)
    return mlp_out, _
Example #8
	def load(self, model_dir):
		self._model_dir = model_dir
		self._paramsfile = os.path.join(self._model_dir, 'params.py')
		self._params = utils.load_param_file(self._paramsfile)
		self._num_before = utils.get_dict_value(self._params, "num_words_before")
		self._num_after = utils.get_dict_value(self._params, "num_words_after")
		ckpt = os.path.join(utils.get_dict_value(self._params,'output_location'),
												utils.get_dict_value(self._params, 'model_name') + '.ckpt')
		vocab_file = os.path.join(utils.get_dict_value(self._params, 'output_location'), 'vocab.pkl')
		self._e = Evaluator.load2(ckpt)
		self._i = TextIndexer.from_file(vocab_file)
		self._keywords = self._params['keywords']
		self._id_to_word = self._params['id_to_keyword']
Example #9
def contrastive(network, name='contrastive_loss', params=None):
    """
    Implement contrastive loss as given in LeCun's paper.
      http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    @param network: input network that contains 3 nodes (left, right, label)
    @return:
    #label, margin, y_weight=1,l2_name=None, l2_norm_name=None, loss_name=None, params=None
    """
    #    assert(isinstance(network, list) and (len(network)==3), 'losses.contrastive: input must contain 3 nodes')
    with tf.variable_scope(name):
        margin = common_utils.get_dict_value(params, 'contrastive_loss_margin',
                                             128)
        l2_name = common_utils.get_dict_value(params, 'l2_name', 'l2')
        l2_norm_name = common_utils.get_dict_value(params, 'l2_norm_name',
                                                   'l2_norm')
        left_feature = network[0]
        right_feature = network[1]
        label = network[2]
        one = tf.constant(1.0, dtype=tf.float32)
        zero = tf.constant(0.0, dtype=tf.float32)
        margin_constant = tf.constant(margin, dtype=tf.float32)
        label_sum = tf.reduce_sum(label)
        # rebalance the labels: when the batch contains positives, scale them by
        # batch_size / num_positives so both loss terms contribute comparably
        Y = tf.cond(
            label_sum > 0, lambda: tf.multiply(
                tf.divide(tf.cast(tf.size(label), dtype=tf.float32), label_sum),
                label), lambda: label)
        one_minus_y = tf.subtract(one, Y, name='one_minus_y')
        N = tf.constant(1 / float(right_feature.get_shape().as_list()[1]))
        Dw2 = tf.reduce_sum(tf.square(tf.subtract(left_feature, right_feature)),
                            1,
                            name=l2_name)
        # margin term from the paper: Y * max(0, margin - N * ||left - right||)^2
        right_term = tf.multiply(
            Y,
            tf.square(
                tf.maximum(
                    zero,
                    tf.subtract(margin_constant,
                                tf.multiply(N, tf.sqrt(Dw2, name=l2_norm_name))))),
            name='right_term')
        # distance term from the paper: (1 - Y) * N^2 * ||left - right||^2
        left_term = tf.multiply(one_minus_y,
                                tf.multiply(tf.square(N), Dw2),
                                name='left_term')
        loss = tf.multiply(tf.constant(0.5),
                           tf.reduce_mean(tf.add(right_term, left_term)),
                           name=name)
    return [loss], {}
Example #10
    def _adam_optimizer(optimizer_param,
                        loss_nodes,
                        learning_rate,
                        var_lists=None):
        """
        Default optimizer; uses the Adam optimizer.

        @param optimizer_param: dict
        @param loss_nodes: list of tensorflow nodes
        @param var_lists: list of lists of variables to optimize
        @return:
        a list of tensorflow optimizer nodes. This list has the same length as loss_nodes.
        """
        optimizer_nodes = []

        # if var_lists is None, then make it a list of None matching the # of loss nodes
        if var_lists is None:
            var_lists = [None] * len(loss_nodes)

        # just create adam optimizers
        for loss_node, var_list in zip(loss_nodes, var_lists):
            loss = loss_node

            if utils.get_dict_value(optimizer_param,
                                    ENABLE_REGULARIZATION_PARAM_NAME, False):
                reg_losses = tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES)
                reg_constant = 1.0  # already have wd, make this parametrizable
                loss += reg_constant * sum(reg_losses)
            min_node = tf.train.AdamOptimizer(learning_rate).minimize(
                loss, var_list=var_list)
            optimizer_nodes.append(min_node)
        return optimizer_nodes
Example #11
def _default_train_iteration_done(trainer, epoch, index, iteration_count,
                                  loss_value, training_done, run_results,
                                  params):

    stats = params['stats']
    next_batch_time = np.mean(stats['next_batch_time_list'])
    training_time = np.mean(stats['training_time_list'])
    overhead_time = np.mean(stats['overhead_time_list'])

    if iteration_count == 1:
        trainer._training_log_file = open(
            os.path.join(utils.get_dict_value(params, 'output_location'),
                         'training_log.txt'), 'w')
        trainer._training_log_file.write(
            "%s,%s,%s,%s,%s,%s,%s,%s\n" %
            ('epoch', 'iteration', 'time', 'loss', 'next_batch_time',
             'training_time', 'overhead_time', 'efficiency'))

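    # the final column is training efficiency: training_time as a fraction of
    # (next_batch_time + training_time + overhead_time)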
    msg = ("%02d, %04d, %s, %s, %0.4f, %0.5f, %0.5f, %0.5f" %
           (epoch, iteration_count, time(), loss_value, next_batch_time,
            training_time, overhead_time, training_time /
            sum([next_batch_time, training_time, overhead_time])))
    if "eval_results" in params:
        eval_results = params['eval_results']
        for x in eval_results:
            msg += ", %0.4f" % x

    print('%s' % msg)
    trainer._training_log_file.write('%s\n' % msg)
    trainer._training_log_file.flush()

    if trainer._model_log_db is not None:
        trainer._model_log_db.on_update(epoch, index, iteration_count,
                                        loss_value, msg)
    return False
Example #12
def train_iteration_done(trainer, epoch, index, iteration_count, loss_value, training_done, run_results, params):
	if iteration_count == 1:
		trainer._out_file = open(os.path.join(utils.get_dict_value(params,'output_location'), 'training_log.txt'), 'w')

	msg = ("%s, %s"%(time(), loss_value))
	print('%s: %s' % (iteration_count, msg))
	trainer._out_file.write('%s\n'%msg)
	trainer._out_file.flush()
Example #13
def train_iteration_done(trainer, epoch, index, iteration_count, loss_value,
                         training_done, run_results, params):
    if hasattr(trainer, 'last_epoch') and trainer.last_epoch != epoch:
        # lr decay
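        # exponential decay per epoch past the start epoch:
        # new_lr = learning_rate * learning_rate_decay ** (epoch - learning_rate_decay_start_epoch)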
        lrd_epoch_start = utils.get_dict_value(
            params, 'learning_rate_decay_start_epoch', -1)
        if epoch > lrd_epoch_start:
            lr_decay = utils.get_dict_value(params, 'learning_rate_decay', -1)
            lr = utils.get_dict_value(params, 'learning_rate', 0.001)
            if lr_decay > 0:
                new_lr = lr * (lr_decay**(epoch - lrd_epoch_start))
                print("NEW LEARNING RATE %s" % new_lr)
                trainer.set_learning_rate(new_lr)

    params['eval_results'] = [run_results['tpp']]
    trainer.last_epoch = epoch
    return framework.trainer._default_train_iteration_done(
        trainer, epoch, index, iteration_count, loss_value, training_done,
        run_results, params)
Example #14
def main(argv):
    try:
        argv = FLAGS(argv)  # parse flags
    except gflags.FlagsError as e:
        print('%s\nUsage: %s ARGS\n%s' % (e, sys.argv[0], FLAGS))
        sys.exit(1)
    print(FLAGS.paramsfile)
    params = utils.load_param_file(FLAGS.paramsfile)
    data = load_results(params)
    fig, ax = plt.subplots()
    ax.plot([(x * 8192) / 1000000 for x in data[1]], data[8])

    ax.set(xlabel='Million Records Seen',
           ylabel='Accuracy @ 1',
           title=params['model_name'])
    ax.grid()

    max_value = np.max(data[8])
    #	plt.ylim((.75,math.ceil(max_value*10)/10))
    #plt.ylim((.75,1))
    fig.savefig(
        os.path.join(utils.get_dict_value(params, 'output_location'),
                     "accuracy.png"))
    plt.show(block=False)

    fig, ax = plt.subplots()
    ax.plot([(x * 8192) / 1000000 for x in data[1]], data[3])

    ax.set(xlabel='Million Records Seen',
           ylabel='Loss',
           title=params['model_name'])
    ax.grid()

    min_value = np.min(data[3])
    #	plt.ylim((math.floor(min_value*10)/10,1))
    #plt.ylim((.75,1))
    fig.savefig(
        os.path.join(utils.get_dict_value(params, 'output_location'),
                     "loss.png"))
    plt.show(block=False)

    input("Press enter to exit...")
Example #15
def gen_data(dataobj,
             tokens,
             keywords,
             num_before=5,
             num_after=5,
             pad_tok="<pad>",
             null_sample_factor=0,
             add_redundant_keyword_data=True,
             use_negative_only_data=True,
             ignore_negative_data=False,
             add_keyword_removal_data=False):
    dataobj._mean = np.mean(dataobj._y_count)
    dataobj._std = np.std(dataobj._y_count)
    dataobj._max = np.max(dataobj._y_count)
    dataobj._min = np.min(dataobj._y_count)
    params = dataobj._params
    sampling = utils.get_dict_value(params, 'data_sampling', 'freeform')

    tokens = [pad_tok] * num_before + tokens + [pad_tok] * (num_after + 5)
    class_offset = 1
    if ignore_negative_data:
        class_offset = 0
    results = []
    unk_list = []
    no_insert_list = []
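    # slide over the original tokens (padding added above); when the center token
    # is a keyword, emit its context window with the keyword removed, labeled with
    # the keyword's class id; 'uniform' sampling only keeps examples while that
    # class count is still near the current minimum, which balances the classes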
    for toki in range(num_before, len(tokens) - num_before - 4):
        tok0 = tokens[toki]
        if tok0 in keywords:
            ki = keywords[tok0]
            if (sampling=='freeform') \
             or ((sampling == 'uniform') and (dataobj._y_count[ki+class_offset] < dataobj._min + 5)) \
             :
                #dataobj._y_count[ki+class_offset] < dataobj._min + 5:# + dataobj._std * 1.0:
                results.append( \
                 (tokens[(toki - num_before):toki] + tokens[(toki + 1):(toki + num_after + 1)], \
                  ki + class_offset))


#		else:
# add unk
#			if 'unk' in keywords:
#				ki = keywords.index('unk')
#				unk_list.append((tokens[(toki-num_before):toki]+tokens[(toki+1):(toki+num_after+1)], ki + class_offset))
#		no_insert_list.append((tokens[(toki-num_before):toki]+tokens[(toki):(toki+num_after)], 0))
#	num_to_add = min([int(len(results)/len(keywords)),len(no_insert_list),len(unk_list)])
#	if num_to_add == 0 and len(results)>0:
#		num_to_add = 1
#	random.shuffle(no_insert_list)
#	random.shuffle(unk_list)
#	if num_to_add > 0:
#		results += no_insert_list[:num_to_add]
#		results += unk_list[:num_to_add]
    return results
Example #16
def load_results(params):
    traininglog_file = os.path.join(
        utils.get_dict_value(params, 'output_location'), 'training_log.txt')
    data = []
    with open(traininglog_file, 'r') as f:
        csvdata = csv.reader(f, delimiter=',')
        for row in csvdata:
            for i, col in enumerate(row):
                if i >= len(data):
                    data.append([])
                data[i].append(float(col))
    return data
Example #17
def inference(params):
    embedding_size = params['embedding_size']
    sentence_len = params['num_before'] + params['num_after']
    embedding_wd = utils.get_dict_value(params, 'embedding_wd')
    embedding_device = utils.get_dict_value(params, 'embedding_device')
    embedding_initializer = utils.get_dict_value(params,
                                                 'embedding_initializer')
    embedding_keep_prob = utils.get_dict_value(params, 'embedding_keep_prob')
    word_embedding_size = utils.get_dict_value(params, 'word_embedding_size',
                                               embedding_size)
    vocab_size = utils.get_dict_value(params, 'vocab_size', 256)
    if embedding_device is not None:
        with tf.device(embedding_device):
            word_embedding_matrix = nlp.variable_with_weight_decay(
                'word_embedding_matrix', [vocab_size, word_embedding_size],
                initializer=embedding_initializer,
                wd=embedding_wd)
    else:
        word_embedding_matrix = nlp.variable_with_weight_decay(
            'word_embedding_matrix', [vocab_size, word_embedding_size],
            initializer=embedding_initializer,
            wd=embedding_wd)

    input_sentence = tf.placeholder(tf.int32, [None, sentence_len], 'sentence')
    emb_sentence = tf.nn.embedding_lookup(word_embedding_matrix,
                                          input_sentence, name='emb_word')
    if embedding_keep_prob is not None and embedding_keep_prob < 1.0:
        [emb_sentence], _ = core.dropout([emb_sentence], [embedding_keep_prob])
    enc_sentence, _ = encoder(emb_sentence, params)

    return enc_sentence, None
Example #18
 def __init__(self,
              params=None,
              files=[['features_000.npy', 'scores_000.npy'],
                     ['features_001.npy', 'scores_001.npy'],
                     ['features_002.npy', 'scores_002.npy'],
                     ['features_003.npy', 'scores_003.npy']]):
     self._files = files
     self._current_file = 0
     self._current_index = 0
     self._current_epoch = 0
     self._num_records_seen = 0
     self._separate_epochs = True
     self._num_minibatches = 0
     self._data_dir = utils.get_dict_value(params, 'data_dir')
     self.load_next_file()
Example #19
def generate_model_input_sentences(tokens, params):
    pad_tok = '<pad>'
    num_before = params['num_words_before']
    num_after = params['num_words_after']
    start_token = utils.get_dict_value(params, 'start_token')
    if start_token is not None and len(start_token) > 0:
        tokens = ([pad_tok] * (num_before - 1) + [start_token] + tokens +
                  [pad_tok] * (num_after + 5))
    else:
        tokens = [pad_tok] * num_before + tokens + [pad_tok] * (num_after + 5)
    result = []
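    # one model input per original token position: num_before tokens of left
    # context followed by num_after tokens starting at the position itself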
    for toki in range(num_before, len(tokens) - num_before - 5):
        result.append(tokens[toki - num_before:toki] +
                      tokens[toki:toki + num_after])
    return result
Example #20
	def load(self, model_dir):
		self._model_dir = model_dir
		self._paramsfile = os.path.join(self._model_dir, 'params.py')
		self._params = utils.load_param_file(self._paramsfile)
		self._num_before = utils.get_dict_value(self._params, "num_words_before")
		self._num_after = utils.get_dict_value(self._params, "num_words_after")
		ckpt = os.path.join(utils.get_dict_value(self._params,'output_location'),
												utils.get_dict_value(self._params, 'model_name') + '.ckpt')
		vocab_file = os.path.join(utils.get_dict_value(self._params, 'output_location'), 'vocab.pkl')
		self._e = Evaluator.load2(ckpt)
		self._i = TextIndexer.from_file(vocab_file)
		with open(os.path.join(
				utils.get_dict_value(self._params, 'output_location'),
				'keywords.pkl'), 'rb') as f:
			keywords = pickle.load(f)
		self._params['keywords'] = keywords
		self._keywords = self._params['keywords']
		self._keyword_map, self._keyword_list = gen_keywords(self._params)
Example #21
 def __init__(self,
              tellme_datadir='/mnt/work/tellme/data',
              datafiles=['trn1.npy', 'trn2.npy'],
              params={}):
     self._tcid_count = 2
     ticid_mapfile = os.path.join(tellme_datadir, "tcid.map")
     with open(ticid_mapfile, "r") as f:
         for line in f:
             value = line.strip("\r\n").split("\t")
             if (int(value[1]) > 1):
                 self._tcid_count += 1
         self._tcid_count += 1
     self._data_chunks = []
     print("self._tcid_count = %s" % self._tcid_count)
     print("Loading data...")
     self._separate_epochs = utils.get_dict_value(params, "separate_epochs",
                                                  False)
     self._tcids_data = np.load(os.path.join(tellme_datadir, datafiles[0]))
     self._timing_info_data = np.load(
         os.path.join(tellme_datadir, datafiles[1]))
     print("done loading data!")
     self._current_epoch = 0
     self._current_index = 0
     self._num_minibatches = 0
Example #22
def eval(params,
         save_accuracy_file=True,
         batch_size=5000,
         num_batches=20,
         topn=1,
         verbose=True):
    ckpt = os.path.join(utils.get_dict_value(params, 'output_location'),
                        utils.get_dict_value(params, 'model_name') + '.ckpt')
    accuracy_file = os.path.join(
        utils.get_dict_value(params, 'output_location'), 'accuracy.txt')
    e = Evaluator.load2(ckpt)
    if verbose:
        e.dump_variable_sizes()
    training_data = TellmeData(tellme_datadir='/mnt/work/tellme/data',
                               datafiles=['tst1.npy', 'tst2.npy'])
    correct_list = []
    incorrect_list = []
    dt_list = []
    #	num_test_records = 0000 #22596471
    for i in range(num_batches):
        batch = training_data.next_batch(batch_size=batch_size)
        batch_y = batch['y']
        del batch['y']
        bef = time()
        [r] = e.eval(batch, {'sm_decision'})
        aft = time()
        dt_list.append(aft - bef)
        ccorrect = 0
        cincorrect = 0
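        # a prediction counts as correct when the ground-truth class is among the
        # top-n scores; np.argpartition(model, -topn)[-topn:] returns the indices
        # of the n largest entries (in no particular order)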
        for model, gt in zip(r, batch_y):
            topn_idx = np.argpartition(model, -topn)[-topn:]
            #model_predict = np.argmax(model)
            if gt in topn_idx:  #model_predict == int(gt):
                ccorrect += 1
            else:
                cincorrect += 1
        correct_list.append(ccorrect)
        incorrect_list.append(cincorrect)
        if verbose:
            print('accuracy = %0.4f' % (ccorrect / (ccorrect + cincorrect)))
    accuracy_list = [
        c / (c + ic) for c, ic in zip(correct_list, incorrect_list)
    ]
    correct = np.sum(correct_list)
    incorrect = np.sum(incorrect_list)
    total_accuracy = np.mean(
        accuracy_list)  #(correct / (correct + incorrect));
    accuracy_std = np.std(accuracy_list)
    accuracy_sem = accuracy_std / np.sqrt(len(correct_list))
    if save_accuracy_file:
        f = open(accuracy_file, 'a')
        f.write('%s %s\n' % (time(), total_accuracy))
        f.close()
    if verbose:
        print('accuracy = %0.4f +/- %0.4f (std=%0.4f)' %
              (total_accuracy, accuracy_sem, accuracy_std))
    dt_mean = np.mean(dt_list)
    dt_std = np.std(dt_list)
    if verbose:
        print("dt_mean = %0.4f dt_std = %0.4f" % (dt_mean, dt_std))
    return total_accuracy, accuracy_sem, accuracy_std
Example #23
from framework.utils.data.text_indexer import TextIndexer
from word_classifier.data import ClassifierData
import framework.subgraph.losses as losses
import framework.utils.common as utils
import data
from framework.trainer import Trainer, _default_train_iteration_done
from time import time
import pickle
import model
import os
import shutil
import copy
import numpy as np

param_file = 'params.py'
params = utils.load_param_file(param_file)
params['num_classes'] = len(params['keywords'])+1
indexer = TextIndexer.from_txt_file(utils.get_dict_value(params, 'vocab_file'), max_size=utils.get_dict_value(params,'max_vocab_size',-1))
indexer.add_token('<pad>')
indexer.add_token('unk')
output_indexer = copy.deepcopy(indexer)
output_indexer.add_token('<blank>')
os.makedirs(utils.get_dict_value(params,'output_location'), exist_ok=True)
indexer.save_vocab_as_pkl(os.path.join(utils.get_dict_value(params,'output_location'), 'vocab.pkl'))

files_to_copy = [param_file]
for file in files_to_copy:
	shutil.copyfile(file,os.path.join(utils.get_dict_value(params,'output_location'), file))

params['vocab_size'] = indexer.vocab_size()

if 'training_data_dir' in params:
	training_data = ClassifierData.get_training_data(base_dir=params['training_data_dir'], indexer=indexer, params=params,
Example #24
from framework.utils.data.text_indexer import TextIndexer
from word_classifier.data import ClassifierData
import framework.subgraph.losses as losses
import framework.utils.common as utils
from framework.trainer import Trainer, _default_train_iteration_done
from time import time
import model
import os
import shutil

param_file = 'params.py'
params = utils.load_param_file(param_file)
if not utils.get_dict_value(params, 'ignore_negative_data', False):
    params['num_classes'] = len(params['keywords']) + 1
else:
    params['num_classes'] = len(params['keywords'])
indexer = TextIndexer.from_txt_file(utils.get_dict_value(params, 'vocab_file'))
indexer.add_token('<pad>')
indexer.add_token('unk')
os.makedirs(utils.get_dict_value(params, 'output_location'), exist_ok=True)
indexer.save_vocab_as_pkl(
    os.path.join(utils.get_dict_value(params, 'output_location'), 'vocab.pkl'))
shutil.copyfile(
    param_file,
    os.path.join(utils.get_dict_value(params, 'output_location'), param_file))

params['vocab_size'] = indexer.vocab_size()
training_data = ClassifierData.get_monolingual_training(
    base_dir=params['monolingual_dir'], indexer=indexer, params=params)

Example #25
def eval(params,
				 save_accuracy_file=True,
				 batch_size=5000,
				 num_batches=20,
				 topn=1,
				 verbose=True):
	num_before = utils.get_dict_value(params, "num_words_before")
	num_after = utils.get_dict_value(params, "num_words_after")
	ckpt = os.path.join(utils.get_dict_value(params,'output_location'),
											utils.get_dict_value(params, 'model_name') + '.ckpt')
	accuracy_file = os.path.join(utils.get_dict_value(params,'output_location'),
											'accuracy.txt')
	vocab_file = os.path.join(utils.get_dict_value(params, 'output_location'), 'vocab.pkl')
	keywords_file = os.path.join(utils.get_dict_value(params, 'output_location'), 'keywords.pkl')
	e = Evaluator.load2(ckpt)
	i = TextIndexer.from_file(vocab_file)
	#test_sentence = "<S> ___ quick brown fox jumped over the lazy dog"
	test_sentence = "<S> ___ is no way to know whether it will work"
	#test_sentence = "<S> ___ house is on fire"
#	test_sentence = "<S> ___ in your best interest to lie"
#	test_sentence = "<S> ___ yours and I cannot touch it"
	#test_sentence = "<S> I ate a ___ and an apple"
	#test_sentence = "<S> I have to take ___ life away"
#	test_sentence = "<S> ___ may and it is raining"
	#test_sentence = "<S> This will take ___ before it will actually work"
	#test_sentence = "<S> this is probably bigger ___ that"
#	test_sentence = "<S> ___ is no place like home"
	#test_sentence = "I have ___ of money"
	#test_sentence = "<S> I think I ___ have it"
	test_sentence = "<S> don 't forget to get orange , banana , and ___ ."
#	test_sentence = "<S> in the heat ___ the night"
#	test_sentence = "<S> in the river , ___ the boat"
#	test_sentence = "<S> nothing can be ___ from the truth"
#	test_sentence = "<S> the ___ knot will unwind"
#	test_sentence = "<S> if you keep playing, you will ___ ."
	test_sentence = "<s> I ate a ___ of oranges ."
#	test_sentence = "<s> I ate a ___ and oranges ."
#	test_sentence = "<s> I live in a ___ ."
#	test_sentence = "<s> I ate a ___ of oranges ."
	test_sentence = "<s> I ate a ___ and oranges ."
	test_sentence = "<s> I live in a ___ ."
	test_sentence = "<s> I have seen it on him , and can ___ to it ."
	test_sentence = "<s> the thieves ___ the library and got very little for their pains ."

	# input data
	with open('/mnt/work/NeuralRewriting/eval/small_eval_data.json') as f:
		data = json.load(f)
	with open(keywords_file, 'rb') as f:
		k = pickle.load(f)

	unk_list = []
	for q in data:
		query_word = q['query_word']
		orig_sent = q['orig_sent']
		options = q['options']
		orig_sent = orig_sent.replace(query_word, "___")
		orig_sent = "<s> " + orig_sent
		test_sentence = orig_sent.lower()
		split_sentence = list(split_sentence_for_eval(test_sentence.split(), ["___"], num_before, num_after))
#		print(split_sentence[0][0])
		_, sentence, _, _ = i.index_wordlist(split_sentence[0][0])
		bef = time()
		r = e.eval({'sentence': [sentence]}, {'sm_decision'})
		aft = time()
		sm = r[0][0]

		for o in options:
			synonym = o['synonym']
			if synonym not in k:
				score = -1000
				unk_list += [synonym]
			else:
				score = math.log(sm[k.index(synonym)])
			o['clmtV1'] = score
			print(score)

	# save output
	with open('/mnt/work/NeuralRewriting/eval/small_eval_data_out.json','w') as f:
		json.dump(data,f)
Example #26
#    params['logfile'].write(msg)
#    params['logfile'].write('\n')


def train_iteration_done(trainer, epoch, index, iteration_count, loss_value,
                         training_done, run_results, params):
    if iteration_count == 1:
        trainer._out_file = open('output.txt', 'w')

    msg = ("%s, %s" % (time(), loss_value))
    print('%s: %s' % (iteration_count, msg))
    trainer._out_file.write('%s\n' % msg)


trainer = Trainer(inference=model.inference,
                  batch_size=utils.get_dict_value(params, 'batch_size', 128),
                  loss=losses.softmax_xentropy,
                  model_output_location="./output",
                  name=MODEL_NAME,
                  training_data=training_data,
                  train_iteration_done=train_iteration_done,
                  params=params)

trainer.run(restore_latest_ckpt=False,
            save_network=True,
            save_ckpt=True,
            mini_batches_between_checkpoint=utils.get_dict_value(
                params, 'mini_batches_between_checkpoint', 1000),
            additional_nodes_to_evaluate=['encoded_sentence'],
            on_checkpoint_saved=on_checkpoint_saved)
Example #27
 def get_model_name(self):
     return utils.get_dict_value(self._params, 'model_name',
                                 '_UNKNOWN_MODEL_')
Example #28
from framework.utils.data.text_indexer import TextIndexer
from framework.evaluator import Evaluator
import framework.utils.common as utils
import word_classifier.data as data
import os
import urllib.parse
from time import time
from urllib.parse import urlparse
from http.server import BaseHTTPRequestHandler, HTTPServer
run_server = True


paramsfile = "params.py"
data_base_dir = ""
http_port = 8080
params = utils.load_param_file(paramsfile)

vocab_file = os.path.join(utils.get_dict_value(params,'output_location'), 'vocab.pkl')
ckpt = os.path.join(utils.get_dict_value(params,'output_location'),
										utils.get_dict_value(params, 'model_name') + '.ckpt')
print(ckpt)
e = Evaluator.load2(ckpt)
i = TextIndexer.from_file(vocab_file)

num_before = utils.get_dict_value(params, "num_words_before")
num_after = utils.get_dict_value(params, "num_words_after")
pad_tok = utils.get_dict_value(params, "pad_tok", '<pad>')


def split_sentence_for_eval(sentence, keywords, num_before, num_after):
	result = data.gen_data(sentence, keywords, num_before=num_before, num_after=num_after,
                          ignore_negative_data=True, add_redundant_keyword_data=False)
	return result
Example #29
from framework.utils.data.text_indexer import TextIndexer
from framework.evaluator import Evaluator
import framework.utils.common as utils
import os
import numpy as np
run_server = False

params = utils.load_param_file('params.py')

vocab_file = os.path.join(utils.get_dict_value(params, 'output_location'),
                          'vocab.pkl')
ckpt = os.path.join(utils.get_dict_value(params, 'output_location'),
                    utils.get_dict_value(params, 'model_name') + '.ckpt')

sentences = ['The apple , which is rotten is not edible']

e = Evaluator.load2(ckpt)
i = TextIndexer.from_file(vocab_file)

num_before = utils.get_dict_value(params, "num_words_before")
num_after = utils.get_dict_value(params, "num_words_after")
pad_tok = utils.get_dict_value(params, "pad_tok", '<pad>')

sentence = "In simple terms , high precision means that an algorithm " \
   "returned substantially more relevant results than irrelevant ones , while" \
   " high recall means that an algorithm returned most of the relevant results ."

sentence = "<S> In simple terms , high precision means that algorithm " \
   "returned substantially more relevant results than irrelevant ones , while" \
   " high recall means that algorithm returned most of relevant results ."
#sentence = "<S> Precision can be seen as measure of exactness or quality , "\
Example #30
from framework.utils.data.text_indexer import TextIndexer
from tellme.data import TellmeData
import framework.subgraph.losses as losses
import framework.utils.common as utils
from framework.trainer import Trainer
from eval import eval
from time import time
import numpy as np
import model
import os
import shutil

param_file = 'params.py'
params = utils.load_param_file(param_file)
os.makedirs(utils.get_dict_value(params, 'output_location'), exist_ok=True)
shutil.copyfile(
    param_file,
    os.path.join(utils.get_dict_value(params, 'output_location'), param_file))

training_data = TellmeData()
params['vocab_size'] = training_data.get_tcid_count()
params['num_classes'] = training_data.get_tcid_count()


def on_checkpoint_saved(trainer, params, save_path):
    msg = 'saved checkpoint: ' + save_path
    print(msg)
    accuracy, accuracy_sem, accuracy_std = eval(params)
    params['eval_results'] = [accuracy, accuracy_sem, accuracy_std]