def testReadFromTextFile(self):
  reader = sentence_io.ConllSentenceReader(self.filepath, self.batch_size)
  self.assertParseable(reader, self.batch_size, False)
  self.assertParseable(reader, self.batch_size, False)
  self.assertParseable(reader, 14, True)
  self.assertParseable(reader, 0, True)
  self.assertParseable(reader, 0, True)
def testReadAndProjectivize(self):
  reader = sentence_io.ConllSentenceReader(
      self.filepath, self.batch_size, projectivize=True)
  self.assertParseable(reader, self.batch_size, False)
  self.assertParseable(reader, self.batch_size, False)
  self.assertParseable(reader, 14, True)
  self.assertParseable(reader, 0, True)
  self.assertParseable(reader, 0, True)
def testReadFirstSentence(self):
  reader = sentence_io.ConllSentenceReader(self.filepath, 1)
  sentences, last = reader.read()
  self.assertEqual(1, len(sentences))
  pb = sentence_pb2.Sentence()
  pb.ParseFromString(sentences[0])
  self.assertFalse(last)
  self.assertEqual(
      u'I knew I could do it properly if given the right kind of support .',
      pb.text)
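# Note: the assertParseable helper used by the tests above is not part of this
# excerpt. A minimal sketch of what it is assumed to do, given that
# reader.read() returns a (serialized_sentences, is_last_batch) pair as in
# testReadFirstSentence:
def assertParseable(self, reader, expected_num, expected_last):
  sentences, last = reader.read()
  self.assertEqual(expected_num, len(sentences))
  self.assertEqual(expected_last, last)
  for serialized in sentences:
    # Every record should deserialize into a valid Sentence proto.
    sentence_pb2.Sentence().ParseFromString(serialized)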
def main(unused_argv):
  # Validate that we have a parser saved model passed to this script.
  if FLAGS.parser_saved_model is None:
    tf.logging.fatal('A parser saved model must be provided.')

  # Parse the flags containing lists, using regular expressions.
  # This matches and extracts key=value pairs.
  component_beam_sizes = re.findall(r'([^=,]+)=(\d+)',
                                    FLAGS.inference_beam_size)
  tf.logging.info('Found beam size dict %s' % component_beam_sizes)

  # This matches strings separated by a comma. Does not return any empty
  # strings.
  components_to_locally_normalize = re.findall(r'[^,]+',
                                               FLAGS.locally_normalize)
  tf.logging.info('Found local normalization dict %s' %
                  components_to_locally_normalize)

  # Create a session config with the requested number of threads.
  session_config = tf.ConfigProto(
      log_device_placement=False,
      intra_op_parallelism_threads=FLAGS.threads,
      inter_op_parallelism_threads=FLAGS.threads)

  # Get the segmented input data for the parser, either by running the
  # segmenter ourselves or by simply reading it from the CoNLL file.
  if FLAGS.segmenter_saved_model is None:
    # If no segmenter was provided, we must use the data from the CoNLL file.
    input_file = FLAGS.input_file
    parser_input = sentence_io.ConllSentenceReader(input_file).corpus()
    use_gold_segmentation = True
  else:
    # If a segmenter was provided, use it.
    segmenter_input = get_segmenter_corpus(FLAGS.input_file, FLAGS.text_format)
    parser_input = run_segmenter(segmenter_input, FLAGS.segmenter_saved_model,
                                 session_config, FLAGS.max_batch_size,
                                 FLAGS.timeline_output_file)
    use_gold_segmentation = False

  # Now that we have parser input data, parse.
  processed = run_parser(parser_input, FLAGS.parser_saved_model,
                         session_config, component_beam_sizes,
                         components_to_locally_normalize,
                         FLAGS.max_batch_size, FLAGS.timeline_output_file)

  if FLAGS.output_file:
    print_output(FLAGS.output_file, FLAGS.text_format, use_gold_segmentation,
                 processed)
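# For reference, a quick illustration of how the two regexes above split the
# list-valued flags (the flag values shown are hypothetical examples, not
# defaults):
#
#   re.findall(r'([^=,]+)=(\d+)', 'parser=8,tagger=4')
#     => [('parser', '8'), ('tagger', '4')]
#   re.findall(r'[^,]+', 'parser,tagger')
#     => ['parser', 'tagger']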
def get_segmenter_corpus(input_data_path, use_text_format):
  """Reads in a character corpus for segmenting."""
  # Read in the documents.
  tf.logging.info('Reading documents...')
  if use_text_format:
    char_corpus = sentence_io.FormatSentenceReader(
        input_data_path, 'untokenized-text').corpus()
  else:
    input_corpus = sentence_io.ConllSentenceReader(input_data_path).corpus()
    with tf.Session(graph=tf.Graph()) as tmp_session:
      char_input = gen_parser_ops.char_token_generator(input_corpus)
      char_corpus = tmp_session.run(char_input)
    check.Eq(len(input_corpus), len(char_corpus))
  return char_corpus
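# Hypothetical usage of get_segmenter_corpus (the path below is an example,
# not a default): read an untokenized text file as a character corpus for the
# segmenter.
#
#   char_corpus = get_segmenter_corpus('/tmp/input.txt', use_text_format=True)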
def main(unused_argv):
  logging.set_verbosity(logging.INFO)

  if not gfile.IsDirectory(FLAGS.resource_path):
    gfile.MakeDirs(FLAGS.resource_path)

  # Constructs lexical resources for SyntaxNet in the given resource path, from
  # the training data.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    lexicon.build_lexicon(FLAGS.resource_path, FLAGS.training_corpus_path)

  # Construct the "lookahead" ComponentSpec. This is a simple right-to-left RNN
  # sequence model, which encodes the context to the right of each token. It
  # has no loss except for the downstream components.
  char2word = spec_builder.ComponentSpecBuilder('char_lstm')
  char2word.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork', hidden_layer_sizes='256')
  char2word.set_transition_system(name='char-shift-only', left_to_right='true')
  char2word.add_fixed_feature(
      name='chars', fml='char-input.text-char', embedding_dim=16)
  char2word.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  lookahead = spec_builder.ComponentSpecBuilder('lookahead')
  lookahead.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork', hidden_layer_sizes='256')
  lookahead.set_transition_system(name='shift-only', left_to_right='false')
  lookahead.add_link(
      source=char2word, fml='input.last-char-focus', embedding_dim=32)
  lookahead.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the ComponentSpec for tagging. This is a simple left-to-right RNN
  # sequence tagger.
  tagger = spec_builder.ComponentSpecBuilder('tagger')
  tagger.set_network_unit(
      name='wrapped_units.LayerNormBasicLSTMNetwork', hidden_layer_sizes='256')
  tagger.set_transition_system(name='tagger')
  tagger.add_token_link(source=lookahead, fml='input.focus', embedding_dim=32)
  tagger.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  # Construct the ComponentSpec for parsing.
  parser = spec_builder.ComponentSpecBuilder('parser')
  parser.set_network_unit(
      name='FeedForwardNetwork',
      hidden_layer_sizes='256',
      layer_norm_hidden='True')
  parser.set_transition_system(name='arc-standard')
  parser.add_token_link(source=lookahead, fml='input.focus', embedding_dim=32)
  parser.add_token_link(
      source=tagger,
      fml='input.focus stack.focus stack(1).focus',
      embedding_dim=32)

  # Recurrent connection for the arc-standard parser. For both tokens on the
  # stack, we connect to the last time step to either SHIFT or REDUCE that
  # token. This allows the parser to build up compositional representations of
  # phrases.
  parser.add_link(
      source=parser,  # recurrent connection
      name='rnn-stack',  # unique identifier
      fml='stack.focus stack(1).focus',  # look for both stack tokens
      source_translator='shift-reduce-step',  # maps token indices -> step
      embedding_dim=32)  # project down to 32 dims

  parser.fill_from_resources(FLAGS.resource_path, FLAGS.tf_master)

  master_spec = spec_pb2.MasterSpec()
  master_spec.component.extend(
      [char2word.spec, lookahead.spec, tagger.spec, parser.spec])
  logging.info('Constructed master spec: %s', str(master_spec))

  hyperparam_config = spec_pb2.GridPoint()
  hyperparam_config.decay_steps = 128000
  hyperparam_config.learning_rate = 0.001
  hyperparam_config.learning_method = 'adam'
  hyperparam_config.adam_beta1 = 0.9
  hyperparam_config.adam_beta2 = 0.9
  hyperparam_config.adam_eps = 0.0001
  hyperparam_config.gradient_clip_norm = 1
  hyperparam_config.self_norm_alpha = 1.0
  hyperparam_config.use_moving_average = True
  hyperparam_config.dropout_rate = 0.7
  hyperparam_config.seed = 1

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    component_targets = spec_builder.default_targets_from_spec(master_spec)
    trainers = [
        builder.add_training_from_config(target)
        for target in component_targets
    ]
    assert len(trainers) == 2
    annotator = builder.add_annotation()
    builder.add_saver()

  # Read in serialized protos from training data.
  training_set = sentence_io.ConllSentenceReader(
      FLAGS.training_corpus_path,
      projectivize=FLAGS.projectivize_training_set).corpus()
  dev_set = sentence_io.ConllSentenceReader(
      FLAGS.dev_corpus_path, projectivize=False).corpus()

  # Ready to train!
  logging.info('Training on %d sentences.', len(training_set))
  logging.info('Tuning on %d sentences.', len(dev_set))

  pretrain_steps = [100, 0]
  tagger_steps = 1000
  train_steps = [tagger_steps, 8 * tagger_steps]

  tf.logging.info('Creating TensorFlow checkpoint dir...')
  gfile.MakeDirs(os.path.dirname(FLAGS.checkpoint_filename))
  summary_writer = trainer_lib.get_summary_writer(FLAGS.tensorboard_dir)

  with tf.Session(FLAGS.tf_master, graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    trainer_lib.run_training(sess, trainers, annotator,
                             evaluation.parser_summaries, pretrain_steps,
                             train_steps, training_set, dev_set, dev_set,
                             FLAGS.batch_size, summary_writer,
                             FLAGS.report_every, builder.saver,
                             FLAGS.checkpoint_filename)
def main(unused_argv):
  tf.logging.set_verbosity(tf.logging.INFO)

  check.NotNone(FLAGS.model_dir, '--model_dir is required')
  check.Ne(FLAGS.pretrain_steps is None, FLAGS.pretrain_epochs is None,
           'Exactly one of --pretrain_steps or --pretrain_epochs is required')
  check.Ne(FLAGS.train_steps is None, FLAGS.train_epochs is None,
           'Exactly one of --train_steps or --train_epochs is required')

  config_path = os.path.join(FLAGS.model_dir, 'config.txt')
  master_path = os.path.join(FLAGS.model_dir, 'master.pbtxt')
  hyperparameters_path = os.path.join(FLAGS.model_dir, 'hyperparameters.pbtxt')
  targets_path = os.path.join(FLAGS.model_dir, 'targets.pbtxt')
  checkpoint_path = os.path.join(FLAGS.model_dir, 'checkpoints/best')
  tensorboard_dir = os.path.join(FLAGS.model_dir, 'tensorboard')

  with tf.gfile.FastGFile(config_path) as config_file:
    config = collections.defaultdict(bool,
                                     ast.literal_eval(config_file.read()))
  train_corpus_path = config['train_corpus_path']
  tune_corpus_path = config['tune_corpus_path']
  projectivize_train_corpus = config['projectivize_train_corpus']

  master = _read_text_proto(master_path, spec_pb2.MasterSpec)
  hyperparameters = _read_text_proto(hyperparameters_path, spec_pb2.GridPoint)
  targets = spec_builder.default_targets_from_spec(master)
  if tf.gfile.Exists(targets_path):
    targets = _read_text_proto(targets_path, spec_pb2.TrainingGridSpec).target

  # Build the TensorFlow graph.
  graph = tf.Graph()
  with graph.as_default():
    tf.set_random_seed(hyperparameters.seed)
    builder = graph_builder.MasterBuilder(master, hyperparameters)
    trainers = [
        builder.add_training_from_config(target) for target in targets
    ]
    annotator = builder.add_annotation()
    builder.add_saver()

  # Read in serialized protos from training data.
  train_corpus = sentence_io.ConllSentenceReader(
      train_corpus_path, projectivize=projectivize_train_corpus).corpus()
  tune_corpus = sentence_io.ConllSentenceReader(
      tune_corpus_path, projectivize=False).corpus()
  gold_tune_corpus = tune_corpus

  # Convert to char-based corpora, if requested.
  if config['convert_to_char_corpora']:
    # NB: Do not convert the |gold_tune_corpus|, which should remain word-based
    # for segmentation evaluation purposes.
    train_corpus = _convert_to_char_corpus(train_corpus)
    tune_corpus = _convert_to_char_corpus(tune_corpus)

  pretrain_steps = _get_steps(FLAGS.pretrain_steps, FLAGS.pretrain_epochs,
                              len(train_corpus))
  train_steps = _get_steps(FLAGS.train_steps, FLAGS.train_epochs,
                           len(train_corpus))
  check.Eq(len(targets), len(pretrain_steps),
           'Length mismatch between training targets and --pretrain_steps')
  check.Eq(len(targets), len(train_steps),
           'Length mismatch between training targets and --train_steps')

  # Ready to train!
  tf.logging.info('Training on %d sentences.', len(train_corpus))
  tf.logging.info('Tuning on %d sentences.', len(tune_corpus))

  tf.logging.info('Creating TensorFlow checkpoint dir...')
  summary_writer = trainer_lib.get_summary_writer(tensorboard_dir)

  checkpoint_dir = os.path.dirname(checkpoint_path)
  if tf.gfile.IsDirectory(checkpoint_dir):
    tf.gfile.DeleteRecursively(checkpoint_dir)
  elif tf.gfile.Exists(checkpoint_dir):
    tf.gfile.Remove(checkpoint_dir)
  tf.gfile.MakeDirs(checkpoint_dir)

  with tf.Session(FLAGS.tf_master, graph=graph) as sess:
    # Make sure to re-initialize all underlying state.
    sess.run(tf.global_variables_initializer())
    trainer_lib.run_training(sess, trainers, annotator,
                             evaluation.parser_summaries, pretrain_steps,
                             train_steps, train_corpus, tune_corpus,
                             gold_tune_corpus, FLAGS.batch_size,
                             summary_writer, FLAGS.report_every,
                             builder.saver, checkpoint_path)

  tf.logging.info('Best checkpoint written to:\n%s', checkpoint_path)
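# The _get_steps helper called above is not shown in this excerpt. A plausible
# sketch, assuming the --*_steps flags hold comma-separated per-target step
# counts, the --*_epochs flags hold comma-separated per-target epoch counts,
# and one epoch corresponds to one pass over the training corpus:
#
#   def _get_steps(steps_flag, epochs_flag, corpus_length):
#     if steps_flag:
#       return [int(steps) for steps in steps_flag.split(',')]
#     return [corpus_length * int(epochs) for epochs in epochs_flag.split(',')]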
def main(unused_argv):
  tf.logging.set_verbosity(tf.logging.INFO)

  # Parse the flags containing lists, using regular expressions.
  # This matches and extracts key=value pairs.
  component_beam_sizes = re.findall(r'([^=,]+)=(\d+)',
                                    FLAGS.inference_beam_size)
  # This matches strings separated by a comma. Does not return any empty
  # strings.
  components_to_locally_normalize = re.findall(r'[^,]+',
                                               FLAGS.locally_normalize)

  # Reads master spec.
  master_spec = spec_pb2.MasterSpec()
  with gfile.FastGFile(FLAGS.master_spec) as fin:
    text_format.Parse(fin.read(), master_spec)

  # Rewrite resource locations.
  if FLAGS.resource_dir:
    for component in master_spec.component:
      for resource in component.resource:
        for part in resource.part:
          part.file_pattern = os.path.join(FLAGS.resource_dir,
                                           part.file_pattern)

  if FLAGS.complete_master_spec:
    spec_builder.complete_master_spec(master_spec, None, FLAGS.resource_dir)

  # Graph building.
  tf.logging.info('Building the graph')
  g = tf.Graph()
  with g.as_default(), tf.device('/device:CPU:0'):
    hyperparam_config = spec_pb2.GridPoint()
    hyperparam_config.use_moving_average = True
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    annotator = builder.add_annotation()
    builder.add_saver()

  tf.logging.info('Reading documents...')
  input_corpus = sentence_io.ConllSentenceReader(FLAGS.input_file).corpus()

  session_config = tf.ConfigProto(
      log_device_placement=False,
      intra_op_parallelism_threads=FLAGS.threads,
      inter_op_parallelism_threads=FLAGS.threads)

  with tf.Session(graph=g, config=session_config) as sess:
    tf.logging.info('Initializing variables...')
    sess.run(tf.global_variables_initializer())
    tf.logging.info('Loading from checkpoint...')
    sess.run('save/restore_all', {'save/Const:0': FLAGS.checkpoint_file})

    tf.logging.info('Processing sentences...')

    processed = []
    start_time = time.time()
    run_metadata = tf.RunMetadata()
    for start in range(0, len(input_corpus), FLAGS.max_batch_size):
      end = min(start + FLAGS.max_batch_size, len(input_corpus))
      feed_dict = {annotator['input_batch']: input_corpus[start:end]}
      for comp, beam_size in component_beam_sizes:
        feed_dict['%s/InferenceBeamSize:0' % comp] = beam_size
      for comp in components_to_locally_normalize:
        feed_dict['%s/LocallyNormalize:0' % comp] = True
      if FLAGS.timeline_output_file and end == len(input_corpus):
        serialized_annotations = sess.run(
            annotator['annotations'],
            feed_dict=feed_dict,
            options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
            run_metadata=run_metadata)
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        with open(FLAGS.timeline_output_file, 'w') as trace_file:
          trace_file.write(trace.generate_chrome_trace_format())
      else:
        serialized_annotations = sess.run(
            annotator['annotations'], feed_dict=feed_dict)
      processed.extend(serialized_annotations)

    tf.logging.info('Processed %d documents in %.2f seconds.',
                    len(input_corpus), time.time() - start_time)
    pos, uas, las = evaluation.calculate_parse_metrics(input_corpus, processed)
    if FLAGS.log_file:
      with gfile.GFile(FLAGS.log_file, 'w') as f:
        f.write('%s\t%f\t%f\t%f\n' % (FLAGS.language_name, pos, uas, las))

    if FLAGS.output_file:
      with gfile.GFile(FLAGS.output_file, 'w') as f:
        for serialized_sentence in processed:
          sentence = sentence_pb2.Sentence()
          sentence.ParseFromString(serialized_sentence)
          f.write(text_format.MessageToString(sentence) + '\n\n')
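# For reference, the metrics line written to --log_file above is tab-separated:
# language name, POS accuracy, UAS, LAS. A hypothetical example line:
#
#   'en\t0.950000\t0.910000\t0.890000\n'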
def main(unused_argv):
  # Parse the flags containing lists, using regular expressions.
  # This matches and extracts key=value pairs.
  component_beam_sizes = re.findall(r'([^=,]+)=(\d+)',
                                    FLAGS.inference_beam_size)
  # This matches strings separated by a comma. Does not return any empty
  # strings.
  components_to_locally_normalize = re.findall(r'[^,]+',
                                               FLAGS.locally_normalize)

  ## SEGMENTATION ##

  if not FLAGS.use_gold_segmentation:
    # Reads master spec.
    master_spec = spec_pb2.MasterSpec()
    with gfile.FastGFile(FLAGS.segmenter_master_spec) as fin:
      text_format.Parse(fin.read(), master_spec)

    if FLAGS.complete_master_spec:
      spec_builder.complete_master_spec(master_spec, None,
                                        FLAGS.segmenter_resource_dir)

    # Graph building.
    tf.logging.info('Building the graph')
    g = tf.Graph()
    with g.as_default(), tf.device('/device:CPU:0'):
      hyperparam_config = spec_pb2.GridPoint()
      hyperparam_config.use_moving_average = True
      builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
      annotator = builder.add_annotation()
      builder.add_saver()

    tf.logging.info('Reading documents...')
    input_corpus = sentence_io.ConllSentenceReader(FLAGS.input_file).corpus()
    with tf.Session(graph=tf.Graph()) as tmp_session:
      char_input = gen_parser_ops.char_token_generator(input_corpus)
      char_corpus = tmp_session.run(char_input)
    check.Eq(len(input_corpus), len(char_corpus))

    session_config = tf.ConfigProto(
        log_device_placement=False,
        intra_op_parallelism_threads=FLAGS.threads,
        inter_op_parallelism_threads=FLAGS.threads)

    with tf.Session(graph=g, config=session_config) as sess:
      tf.logging.info('Initializing variables...')
      sess.run(tf.global_variables_initializer())
      tf.logging.info('Loading from checkpoint...')
      sess.run('save/restore_all',
               {'save/Const:0': FLAGS.segmenter_checkpoint_file})

      tf.logging.info('Processing sentences...')

      processed = []
      start_time = time.time()
      run_metadata = tf.RunMetadata()
      for start in range(0, len(char_corpus), FLAGS.max_batch_size):
        end = min(start + FLAGS.max_batch_size, len(char_corpus))
        feed_dict = {annotator['input_batch']: char_corpus[start:end]}
        if FLAGS.timeline_output_file and end == len(char_corpus):
          serialized_annotations = sess.run(
              annotator['annotations'],
              feed_dict=feed_dict,
              options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
              run_metadata=run_metadata)
          trace = timeline.Timeline(step_stats=run_metadata.step_stats)
          with open(FLAGS.timeline_output_file, 'w') as trace_file:
            trace_file.write(trace.generate_chrome_trace_format())
        else:
          serialized_annotations = sess.run(
              annotator['annotations'], feed_dict=feed_dict)
        processed.extend(serialized_annotations)

      tf.logging.info('Processed %d documents in %.2f seconds.',
                      len(char_corpus), time.time() - start_time)
      input_corpus = processed
  else:
    input_corpus = sentence_io.ConllSentenceReader(FLAGS.input_file).corpus()

  ## PARSING ##

  # Reads master spec.
  master_spec = spec_pb2.MasterSpec()
  with gfile.FastGFile(FLAGS.parser_master_spec) as fin:
    text_format.Parse(fin.read(), master_spec)

  if FLAGS.complete_master_spec:
    spec_builder.complete_master_spec(master_spec, None,
                                      FLAGS.parser_resource_dir)

  # Graph building.
  tf.logging.info('Building the graph')
  g = tf.Graph()
  with g.as_default(), tf.device('/device:CPU:0'):
    hyperparam_config = spec_pb2.GridPoint()
    hyperparam_config.use_moving_average = True
    builder = graph_builder.MasterBuilder(master_spec, hyperparam_config)
    annotator = builder.add_annotation()
    builder.add_saver()

  tf.logging.info('Reading documents...')

  session_config = tf.ConfigProto(
      log_device_placement=False,
      intra_op_parallelism_threads=FLAGS.threads,
      inter_op_parallelism_threads=FLAGS.threads)

  with tf.Session(graph=g, config=session_config) as sess:
    tf.logging.info('Initializing variables...')
    sess.run(tf.global_variables_initializer())
    tf.logging.info('Loading from checkpoint...')
    sess.run('save/restore_all', {'save/Const:0': FLAGS.parser_checkpoint_file})

    tf.logging.info('Processing sentences...')

    processed = []
    start_time = time.time()
    run_metadata = tf.RunMetadata()
    for start in range(0, len(input_corpus), FLAGS.max_batch_size):
      end = min(start + FLAGS.max_batch_size, len(input_corpus))
      feed_dict = {annotator['input_batch']: input_corpus[start:end]}
      for comp, beam_size in component_beam_sizes:
        feed_dict['%s/InferenceBeamSize:0' % comp] = beam_size
      for comp in components_to_locally_normalize:
        feed_dict['%s/LocallyNormalize:0' % comp] = True
      if FLAGS.timeline_output_file and end == len(input_corpus):
        serialized_annotations = sess.run(
            annotator['annotations'],
            feed_dict=feed_dict,
            options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
            run_metadata=run_metadata)
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        with open(FLAGS.timeline_output_file, 'w') as trace_file:
          trace_file.write(trace.generate_chrome_trace_format())
      else:
        serialized_annotations = sess.run(
            annotator['annotations'], feed_dict=feed_dict)
      processed.extend(serialized_annotations)

    tf.logging.info('Processed %d documents in %.2f seconds.',
                    len(input_corpus), time.time() - start_time)

  if FLAGS.output_file:
    with gfile.GFile(FLAGS.output_file, 'w') as f:
      for serialized_sentence in processed:
        sentence = sentence_pb2.Sentence()
        sentence.ParseFromString(serialized_sentence)
        f.write('#' + sentence.text.encode('utf-8') + '\n')
        for i, token in enumerate(sentence.token):
          head = token.head + 1
          f.write('%s\t%s\t_\t_\t_\t_\t%d\t%s\t_\t_\n' %
                  (i + 1, token.word.encode('utf-8'), head,
                   token.label.encode('utf-8')))
        f.write('\n\n')
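# For reference, each token line written above follows the 10-column CoNLL
# layout (ID and FORM in the first two columns, HEAD and DEPREL in columns 7
# and 8), with the remaining columns left as '_'. A hypothetical output line:
#
#   '1\tJohn\t_\t_\t_\t_\t2\tnsubj\t_\t_\n'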