def parse_file(self, dataset, graph_outputs, sess, output_dir=None, output_filename=None, print_time=True):
    """Run the parser over a single CoNLL-U file and write (or print) the predictions."""

    probability_tensors = graph_outputs.probabilities
    input_filename = dataset.conllu_files[0]
    graph_outputs.restart_timer()
    for i, indices in enumerate(dataset.batch_iterator(shuffle=False)):
        with Timer('Parsing batch %d' % i):
            tokens, lengths = dataset.get_tokens(indices)
            feed_dict = dataset.set_placeholders(indices)
            probabilities = sess.run(probability_tensors, feed_dict=feed_dict)
            # Convert probability tensors into discrete predictions, then map indices
            # back to surface strings with each output vocab
            predictions = graph_outputs.probs_to_preds(probabilities, lengths)
            tokens.update({vocab.field: vocab[predictions[vocab.field]]
                           for vocab in self.output_vocabs})
            graph_outputs.cache_predictions(tokens, indices)

    with Timer('Dumping predictions'):
        if output_dir is None and output_filename is None:
            graph_outputs.print_current_predictions()
        else:
            input_dir, input_filename = os.path.split(input_filename)
            if output_dir is None:
                output_dir = os.path.join(self.save_dir, 'parsed', input_dir)
            elif output_filename is None:
                output_filename = input_filename
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            output_filename = os.path.join(output_dir, output_filename)
            with codecs.open(output_filename, 'w', encoding='utf-8') as f:
                graph_outputs.dump_current_predictions(f)

    if print_time:
        print('\033[92mParsing 1 file took {:0.1f} seconds\033[0m'.format(
            time.time() - graph_outputs.time))
    return
def parse(self, conllu_files, output_dir=None, output_filename=None):
    """Build the parsing graph, restore the latest checkpoint, and parse the given CoNLL-U files."""

    with Timer('Building dataset'):
        parseset = conllu_dataset.CoNLLUDataset(conllu_files, self.vocabs, config=self._config)

    if output_filename:
        assert len(conllu_files) == 1, "output_filename can only be specified for one input file"

    # Record whether the dependency-tree and semantic-graph labels are factorized
    factored_deptree = None
    factored_semgraph = None
    for vocab in self.output_vocabs:
        if vocab.field == 'deprel':
            factored_deptree = vocab.factorized
        elif vocab.field == 'semrel':
            factored_semgraph = vocab.factorized

    with Timer('Building TF'):
        with tf.variable_scope(self.classname, reuse=False):
            parse_graph = self.build_graph(reuse=True)
            parse_outputs = DevOutputs(*parse_graph, load=False,
                                       factored_deptree=factored_deptree,
                                       factored_semgraph=factored_semgraph,
                                       config=self._config)
        parse_tensors = parse_outputs.accuracies
        # Only restore the variables that were saved during training
        all_variables = set(tf.global_variables())
        non_save_variables = set(tf.get_collection('non_save_variables'))
        save_variables = all_variables - non_save_variables
        saver = tf.train.Saver(list(save_variables), max_to_keep=1)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True
    with tf.Session(config=config) as sess:
        with Timer('Initializing non_save variables'):
            print(list(non_save_variables))
            sess.run(tf.variables_initializer(list(non_save_variables)))
        with Timer('Restoring save variables'):
            saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
        if len(conllu_files) == 1 or output_filename is not None:
            with Timer('Parsing file'):
                self.parse_file(parseset, parse_outputs, sess,
                                output_dir=output_dir, output_filename=output_filename)
        else:
            with Timer('Parsing files'):
                self.parse_files(parseset, parse_outputs, sess, output_dir=output_dir)
    return
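# A minimal usage sketch of the parse() entry point above. This is illustrative only:
# `GraphParserNetwork`, `config`, and the file paths are assumptions rather than names
# defined in this module, and a trained checkpoint must already exist under save_dir.
#
#     network = GraphParserNetwork(config=config)
#     network.parse(['data/en_test.conllu'], output_dir='parsed')  # one output file per input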
def parse_files(self, dataset, graph_outputs, sess, output_dir=None, print_time=True):
    """Run the parser over every CoNLL-U file in the dataset, writing one parsed file per input."""

    probability_tensors = graph_outputs.probabilities
    graph_outputs.restart_timer()
    for input_filename in dataset.conllu_files:
        for i, indices in enumerate(dataset.batch_iterator(shuffle=False)):
            with Timer('batch {}'.format(i)):
                tokens, lengths = dataset.get_tokens(indices)
                feed_dict = dataset.set_placeholders(indices)
                probabilities = sess.run(probability_tensors, feed_dict=feed_dict)
                predictions = graph_outputs.probs_to_preds(probabilities, lengths)
                tokens.update({vocab.field: vocab[predictions[vocab.field]]
                               for vocab in self.output_vocabs})
                graph_outputs.cache_predictions(tokens, indices)

        # Write this file's predictions, mirroring the input directory layout when no
        # output_dir is given
        input_dir, input_filename = os.path.split(input_filename)
        if output_dir is None:
            file_output_dir = os.path.join(self.save_dir, 'parsed', input_dir)
        else:
            file_output_dir = output_dir
        if not os.path.exists(file_output_dir):
            os.makedirs(file_output_dir)
        output_filename = os.path.join(file_output_dir, input_filename)
        with codecs.open(output_filename, 'w', encoding='utf-8') as f:
            graph_outputs.dump_current_predictions(f)

        # Load the next conllu file
        dataset.load_next()

    if print_time:
        n_files = len(dataset.conllu_files)
        print('\033[92mParsing {} file{} took {:0.1f} seconds\033[0m'.format(
            n_files, 's' if n_files > 1 else '', time.time() - graph_outputs.time))
    return
def __init__(self, input_networks=set(), config=None):
    """Set up the vocabularies shared with any input networks and create the global step."""

    with Timer('Initializing the network (including pretrained vocab)'):
        self._config = config
        #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        self._input_networks = input_networks
        input_network_classes = set(input_network.classname for input_network in self._input_networks)
        assert input_network_classes == set(self.input_network_classes), \
            'Not all input networks were passed in to {}'.format(self.classname)

        # Collect the vocabs already instantiated by the input networks, making sure
        # they all share a single instance of each vocab class
        extant_vocabs = {}
        for input_network in self.input_networks:
            for vocab in input_network.vocabs:
                if vocab.classname in extant_vocabs:
                    assert vocab is extant_vocabs[vocab.classname], \
                        "Two input networks have different instances of {}".format(vocab.classname)
                else:
                    extant_vocabs[vocab.classname] = vocab

        if 'IDIndexVocab' in extant_vocabs:
            self._id_vocab = extant_vocabs['IDIndexVocab']
        else:
            self._id_vocab = vocabs.IDIndexVocab(config=config)
            extant_vocabs['IDIndexVocab'] = self._id_vocab

        # Input vocabs: reuse an existing instance or load/count a fresh one
        self._input_vocabs = []
        for input_vocab_classname in self.input_vocab_classes:
            if input_vocab_classname in extant_vocabs:
                self._input_vocabs.append(extant_vocabs[input_vocab_classname])
            else:
                VocabClass = getattr(vocabs, input_vocab_classname)
                vocab = VocabClass(config=config)
                vocab.load() or vocab.count(self.train_conllus)
                self._input_vocabs.append(vocab)
                extant_vocabs[input_vocab_classname] = vocab

        # Output vocabs
        self._output_vocabs = []
        for output_vocab_classname in self.output_vocab_classes:
            if output_vocab_classname in extant_vocabs:
                self._output_vocabs.append(extant_vocabs[output_vocab_classname])
            else:
                VocabClass = getattr(vocabs, output_vocab_classname)
                vocab = VocabClass(config=config)
                vocab.load() or vocab.count(self.train_conllus)
                self._output_vocabs.append(vocab)
                extant_vocabs[output_vocab_classname] = vocab

        # Throughput vocabs
        self._throughput_vocabs = []
        for throughput_vocab_classname in self.throughput_vocab_classes:
            if throughput_vocab_classname in extant_vocabs:
                self._throughput_vocabs.append(extant_vocabs[throughput_vocab_classname])
            else:
                VocabClass = getattr(vocabs, throughput_vocab_classname)
                vocab = VocabClass(config=config)
                vocab.load() or vocab.count(self.train_conllus)
                self._throughput_vocabs.append(vocab)
                extant_vocabs[throughput_vocab_classname] = vocab

        with tf.variable_scope(self.classname, reuse=False):
            self.global_step = tf.Variable(0., trainable=False, name='Global_step')
        self._vocabs = set(extant_vocabs.values())
    return