def __init__(self, filepath, batch_size=32,
             projectivize=False, morph_to_pos=False):
  """Builds a reading pipeline over a CoNLL sentence corpus file."""
  self._graph = tf.Graph()
  self._session = tf.Session(graph=self._graph)
  task_context_str = """
      input {
        name: 'documents'
        record_format: 'conll-sentence'
        Part {
          file_pattern: '%s'
        }
      }""" % filepath
  if morph_to_pos:
    task_context_str += """
      Parameter {
        name: "join_category_to_pos"
        value: "true"
      }
      Parameter {
        name: "add_pos_as_attribute"
        value: "true"
      }
      Parameter {
        name: "serialize_morph_to_pos"
        value: "true"
      }
      """
  with self._graph.as_default():
    self._source, self._is_last = gen_parser_ops.document_source(
        task_context_str=task_context_str, batch_size=batch_size)
    # Always drop malformed trees; projectivize only on request.
    self._source = gen_parser_ops.well_formed_filter(self._source)
    if projectivize:
      self._source = gen_parser_ops.projectivize_filter(self._source)
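# Usage sketch (not in the original source): __init__ only wires up the ops;
# a caller still has to run them through the private session to get batches.
# The method name `read` is an assumption made here for illustration.
def read(self):
  """Returns one batch of serialized sentences and an end-of-corpus flag."""
  # Each run() materializes one batch from the document_source pipeline,
  # already passed through the well-formed (and optional projectivize) filters.
  sentences, is_last = self._session.run([self._source, self._is_last])
  return sentences, is_last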
def main(unused_argv):
  """Builds the lexicon, preprocesses the training corpus, and trains."""
  logging.set_verbosity(logging.INFO)
  if not gfile.IsDirectory(OutputPath('')):
    gfile.MakeDirs(OutputPath(''))

  # Rewrite context.
  RewriteContext()

  # Creates necessary term maps.
  if FLAGS.compute_lexicon:
    logging.info('Computing lexicon...')
    with tf.Session(FLAGS.tf_master) as sess:
      gen_parser_ops.lexicon_builder(
          task_context=OutputPath('context'),
          corpus_name=FLAGS.training_corpus).run()
  with tf.Session(FLAGS.tf_master) as sess:
    feature_sizes, domain_sizes, embedding_dims, num_actions = sess.run(
        gen_parser_ops.feature_size(task_context=OutputPath('context'),
                                    arg_prefix=FLAGS.arg_prefix))

  # Filter out malformed trees and projectivize the rest.
  if FLAGS.projectivize_training_set:
    logging.info('Preprocessing...')
    with tf.Session(FLAGS.tf_master) as sess:
      source, last = gen_parser_ops.document_source(
          task_context=OutputPath('context'),
          batch_size=FLAGS.batch_size,
          corpus_name=FLAGS.training_corpus)
      sink = gen_parser_ops.document_sink(
          task_context=OutputPath('context'),
          corpus_name='projectivized-training-corpus',
          documents=gen_parser_ops.projectivize_filter(
              gen_parser_ops.well_formed_filter(
                  source, task_context=OutputPath('context')),
              task_context=OutputPath('context')))
      while True:
        tf_last, _ = sess.run([last, sink])
        if tf_last:
          break

  logging.info('Training...')
  with tf.Session(FLAGS.tf_master) as sess:
    Train(sess, num_actions, feature_sizes, domain_sizes, embedding_dims)
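# A hedged sketch of the flag definitions that main() above relies on. The
# flag names match the FLAGS references in main(), but every default value
# and help string here is an assumption, not taken from the original file.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('tf_master', '', 'TensorFlow master to connect to.')
flags.DEFINE_string('arg_prefix', None, 'Prefix for context parameters.')
flags.DEFINE_string('training_corpus', 'training-corpus',
                    'Name of the corpus to train on.')
flags.DEFINE_integer('batch_size', 32, 'Number of sentences per batch.')
flags.DEFINE_boolean('compute_lexicon', False,
                     'Whether to build term maps before training.')
flags.DEFINE_boolean('projectivize_training_set', True,
                     'Whether to rewrite non-projective training trees.')

if __name__ == '__main__':
  tf.app.run()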
def __init__(self, filepath, record_format, batch_size=32,
             check_well_formed=False, projectivize=False, morph_to_pos=False):
  """Like the CoNLL reader above, but with a configurable record format
  and an optional (rather than unconditional) well-formedness filter."""
  self._graph = tf.Graph()
  self._session = tf.Session(graph=self._graph)
  task_context_str = """
      input {
        name: 'documents'
        record_format: '%s'
        Part {
          file_pattern: '%s'
        }
      }""" % (record_format, filepath)
  if morph_to_pos:
    task_context_str += """
      Parameter {
        name: "join_category_to_pos"
        value: "true"
      }
      Parameter {
        name: "add_pos_as_attribute"
        value: "true"
      }
      Parameter {
        name: "serialize_morph_to_pos"
        value: "true"
      }
      """
  with self._graph.as_default():
    self._source, self._is_last = gen_parser_ops.document_source(
        task_context_str=task_context_str, batch_size=batch_size)
    if check_well_formed:
      self._source = gen_parser_ops.well_formed_filter(self._source)
    if projectivize:
      self._source = gen_parser_ops.projectivize_filter(self._source)
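# Hedged sketch, not from the original source: draining the whole corpus.
# It mirrors the while/sess.run loop used in main() above; the method name
# `corpus` is assumed here for illustration only.
def corpus(self):
  """Yields successive batches until the source reports the last one."""
  while True:
    sentences, is_last = self._session.run([self._source, self._is_last])
    yield sentences
    if is_last:
      break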