import numpy as np
import tensorflow as tf


def input_fn(labeled: DataSet, unlabeled: DataSet, labeled_size: int, unlabeled_size: int):
    input_dict = {}

    # labeled data
    labeled = labeled.get_batch(labeled_size)
    input_dict['labeled_inputs'] = tf.constant(np.array(labeled.inputs()))
    input_dict['labeled_sequence_length'] = tf.constant(labeled.lengths())
    input_dict['labeled_mask'] = tf.constant(labeled.masks())
    input_dict['labeled_size'] = tf.constant(labeled.size())
    labels = tf.constant(labeled.labels())

    # unlabeled data
    unlabeled = unlabeled.get_batch(unlabeled_size)
    input_dict['unlabeled_inputs'] = tf.constant(np.array(unlabeled.inputs()))
    input_dict['unlabeled_sequence_length'] = tf.constant(unlabeled.lengths())
    input_dict['unlabeled_mask'] = tf.constant(unlabeled.masks())
    input_dict['unlabeled_size'] = tf.constant(unlabeled.size())

    return input_dict, labels
def input_fn(labeled: DataSet, unlabeled: DataSet = None, size: int = BATCH_SIZE):
    input_dict = {}

    # treat an empty unlabeled set the same as no unlabeled set
    if unlabeled is not None and unlabeled.size() == 0:
        unlabeled = None

    # labeled data
    labeled = labeled.get_batch(size)
    input_dict['labeled_inputs'] = tf.constant(np.array(labeled.inputs()))
    input_dict['labeled_sequence_length'] = tf.constant(labeled.lengths())
    input_dict['labeled_mask'] = tf.constant(labeled.masks())
    labels = tf.constant(labeled.labels())

    # unlabeled data: fall back to the labeled batch when none is available
    if unlabeled is None:
        unlabeled = labeled
    else:
        unlabeled = unlabeled.get_batch(labeled.size())
    input_dict['unlabeled_inputs'] = tf.constant(np.array(unlabeled.inputs()))
    input_dict['unlabeled_sequence_length'] = tf.constant(unlabeled.lengths())
    input_dict['unlabeled_mask'] = tf.constant(unlabeled.masks())
    input_dict['unlabeled_size'] = tf.constant(unlabeled.size())
    input_dict['unlabeled_target'] = tf.constant(unlabeled.labels())

    return input_dict, labels
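# Both input_fn variants assume a DataSet whose get_batch() returns an object
# exposing inputs(), lengths(), masks(), labels(), and size(). A minimal,
# hypothetical stand-in for that interface (useful for exercising the input
# functions in isolation; not the project's actual DataSet) might look like:
class FakeDataSet:
    def __init__(self, inputs, lengths, masks, labels):
        self._inputs = inputs    # padded token ids, [num_examples, max_len]
        self._lengths = lengths  # true sequence lengths, [num_examples]
        self._masks = masks      # 1.0 for real tokens, 0.0 for padding
        self._labels = labels    # padded slot label ids, [num_examples, max_len]

    def get_batch(self, size):
        # The real implementation presumably samples; this stub just truncates.
        return FakeDataSet(self._inputs[:size], self._lengths[:size],
                           self._masks[:size], self._labels[:size])

    def inputs(self):
        return self._inputs

    def lengths(self):
        return self._lengths

    def masks(self):
        return self._masks

    def labels(self):
        return self._labels

    def size(self):
        return len(self._inputs)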
def run(cls, dev, test, labeled_slot, labeled_train, unlabeled_slot, unlabeled_train, steps, gpu_memory):
    training_set = DataSet(labeled_slot, labeled_train)
    validation_set = DataSet(labeled_slot, dev)
    test_set = DataSet(labeled_slot, test)
    unlabeled_set = DataSet(unlabeled_slot, unlabeled_train)

    print('# training_set (%d)' % training_set.size())
    print('# validation_set (%d)' % validation_set.size())
    print('# test_set (%d)' % test_set.size())
    print('# unlabeled_set (%d)' % unlabeled_set.size())

    classifier = tf.contrib.learn.Estimator(
        model_fn=SlotFilling.rnn_model_fn,
        params={
            'num_slot': training_set.num_classes(),
            'num_pos': unlabeled_set.num_classes(),
            'drop_out': DROP_OUT,
            'embedding_dimension': EMBEDDING_DIMENSION,
            'vocab_size': DataSet.vocab_size(),
            'unlabeled': unlabeled_set.size() > 0
        },
        config=tf.contrib.learn.RunConfig(
            gpu_memory_fraction=gpu_memory,
            save_checkpoints_secs=30,
        ),
        model_dir='./model')

    validation_metrics = {
        'accuracy': tf.contrib.learn.MetricSpec(
            metric_fn=tf.contrib.metrics.streaming_accuracy,
            prediction_key='predictions',
            weight_key='labeled_mask')
    }

    monitor = tf.contrib.learn.monitors.ValidationMonitor(
        input_fn=lambda: SlotFilling.input_fn(
            validation_set, unlabeled_set, validation_set.size(), 1),
        eval_steps=1,
        every_n_steps=50,
        metrics=validation_metrics,
        early_stopping_metric='loss',
        early_stopping_metric_minimize=True,
        early_stopping_rounds=300)

    classifier.fit(
        input_fn=lambda: SlotFilling.input_fn(
            training_set, unlabeled_set, training_set.size(), 500),
        monitors=[monitor],
        steps=steps)

    predictions = classifier.predict(input_fn=lambda: SlotFilling.input_fn(
        test_set, unlabeled_set, test_set.size(), 1))

    # per-token evaluation against the gold slot labels
    slot_correct = 0
    slot_no_match = 0
    slot_mismatch = 0
    slot_over_match = 0
    for i, p in enumerate(predictions):
        target = test_set.labels()[i][:test_set.lengths()[i]]
        prediction = list(p['predictions'])[:test_set.lengths()[i]]
        for expected, actual in zip(target, prediction):
            actual = int(actual)
            if expected == actual:
                slot_correct += 1
            elif test_set.get_slot(actual) == 'o':
                # a real slot was predicted as the outside label
                slot_no_match += 1
            elif test_set.get_slot(expected) == 'o':
                # the outside label was predicted as a slot
                slot_over_match += 1
            else:
                slot_mismatch += 1

    return {
        'accuracy': slot_correct / sum(test_set.lengths()),
        'correct': slot_correct,
        'no_match': slot_no_match,
        'mismatch': slot_mismatch,
        'over_match': slot_over_match,
    }
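# The four counters above bucket every token by how its prediction relates to
# the outside label 'o'. A self-contained illustration of the same bucketing,
# using a made-up id-to-slot mapping in place of test_set.get_slot():
slot_names = {0: 'o', 1: 'b-from_city', 2: 'b-to_city'}  # hypothetical label map


def bucket(expected, actual):
    if expected == actual:
        return 'correct'
    if slot_names[actual] == 'o':
        return 'no_match'    # a real slot was missed (predicted as 'o')
    if slot_names[expected] == 'o':
        return 'over_match'  # 'o' was wrongly predicted as a slot
    return 'mismatch'        # one slot confused with another


assert bucket(1, 1) == 'correct'
assert bucket(1, 0) == 'no_match'
assert bucket(0, 2) == 'over_match'
assert bucket(1, 2) == 'mismatch'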
import os

config = config_plain
# experiments = experiments[5:6]

if not os.path.exists('./out'):
    os.mkdir('./out')

# for vocab size: DataSet.vocab_size() is read below
DataSet('./data/atis.pkl.slot', './data/atis.pkl.train')
DataSet('./data/atis.pos.slot', './data/atis.pos.train')

slot = common['slot']
validation_set = DataSet(slot, common['dev'])
test_set = DataSet(slot, common['test'])

print('# Experiments (%d)' % len(experiments))
print('# validation_set (%d)' % validation_set.size())
print('# test_set (%d)' % test_set.size())

pos_model = None
if 'pos_model' in config:
    pos_set = DataSet('./data/atis.pos.slot', './data/atis.pos.train')
    print('# Pre-training')
    print('# POS training set (%d)' % pos_set.size())
    pos_model = PosTagging.run(
        training_set=pos_set,
        steps=config['pos_model'],
        gpu_memory=0.2,
        random_seed=RANDOM_SEED,
        vocab_size=DataSet.vocab_size(),
        drop_out=config['drop_out'],
        cell_size=CELL_SIZE,