def parse_generator(self):
    """Yield parser output for ``self.current_input`` while keeping the model loaded.

    This is a (hacky) way to keep the graph, the session and the restored
    checkpoint alive between parses. Every time ``__next__()`` is called on
    this generator, the data found in ``self.current_input`` (which should be
    an open file or a StringIO) is parsed and the text written by
    ``parseset.write_probs`` is yielded as a single string.

    The generator never finishes on its own: the graph and the session are
    only released when the generator is closed or garbage-collected.

    Yields:
        str: the contents of the StringIO that ``write_probs`` wrote to.
    """
    with tf.Graph().as_default():
        config_proto = tf.compat.v1.ConfigProto()
        # GPU memory is always allowed to grow on demand here; the
        # per_process_gpu_memory_fraction setting is not consulted.
        config_proto.gpu_options.allow_growth = True
        with tf.compat.v1.Session(config=config_proto) as sess:
            # load the model and prep the parse set
            print("SELF.TRAIN_FILES", self.train_files, file=sys.stderr)
            self.add_file_vocabs(self.train_files)
            self.setup_vocabs()
            trainset = Trainset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
            with tf.compat.v1.variable_scope(self.name.title()):
                # Only called for its effect on the graph; the returned
                # tensors are not needed for parsing.
                # NOTE(review): presumably this creates the variables the
                # Saver restores below -- confirm.
                trainset()
            saver = tf.compat.v1.train.Saver(self.save_vars, max_to_keep=1)
            for var in self.non_save_vars:
                sess.run(var.initializer)
            saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))

            # create parseset outside of the while loop
            parseset = Parseset.from_configurable(self, self.vocabs, parse_files=self.current_input, nlp_model=self.nlp_model)
            with tf.compat.v1.variable_scope(self.name.title(), reuse=True):
                parse_tensors = parseset(moving_params=self.optimizer)
            parse_outputs = [parse_tensors[parse_key] for parse_key in parseset.parse_keys]

            while True:
                self.prune_vocabs()
                # add new vocabulary items from the current data
                self.add_file_vocabs([self.current_input])
                # this creates new buckets for current data
                parseset.reinit(self.vocabs, self.current_input)
                probs = []
                sents = []
                for feed_dict, tokens in parseset.iterbatches(shuffle=False):
                    probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
                    sents.append(tokens)
                outp = io.StringIO()
                parseset.write_probs(sents, outp, probs, parseset._metadata)
                yield outp.getvalue()
def print_accuracy(self, accumulators, time, prefix='Train'):
    """Print a one-line, colored accuracy summary to stderr.

    The accumulators are first converted with ``self.process_accumulators``;
    the resulting mapping must provide the ``Loss``, ``TS``, ``SS`` and
    ``Seq_rate`` entries consumed by the format template below.

    Args:
        accumulators: raw accumulated values, passed to
            ``self.process_accumulators``.
        time: forwarded to ``self.process_accumulators`` as ``time=``.
        prefix: label printed in bold at the start of the line.
    """
    stats = self.process_accumulators(accumulators, time=time)
    # (label, value format, color) for each column of the summary line
    columns = (
        ('Loss:', '{Loss:7.3f}', 'bright_red'),
        ('TS:', '{TS:5.2f}%', 'bright_cyan'),
        ('SS:', '{SS:5.2f}%', 'bright_green'),
        ('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta'),
    )
    fields = [color_pattern(label, fmt, color) for label, fmt, color in columns]
    template = ctext('{0} ', 'bold') + ' | '.join(fields)
    print(template.format(prefix, **stats), file=sys.stderr)
def from_dataset(cls, dataset, *args, **kwargs):
    """Build a multibucket from the multibuckets that ``dataset`` iterates over.

    The new object is created with ``cls.from_configurable``; its ``_indices``
    are copied from the last multibucket yielded by ``dataset`` and one
    ``Bucket`` is created per position of that multibucket.

    Args:
        dataset: iterable of multibuckets; must also expose ``verbose``.
        *args, **kwargs: forwarded to ``cls.from_configurable`` and to
            ``Bucket.from_dataset``.

    Returns:
        The newly built multibucket.

    Raises:
        ValueError: if iterating ``dataset`` yields nothing (previously this
            surfaced as an opaque ``UnboundLocalError``).
    """
    multibucket = cls.from_configurable(dataset, *args, **kwargs)

    members = list(dataset)
    if not members:
        raise ValueError('Cannot build a multibucket from an empty dataset')
    # Only the last member is used as the template.
    # NOTE(review): presumably all members are expected to share the same
    # indices; that is not verified here.
    template = members[-1]

    multibucket._indices = np.array(template.indices)
    multibucket._buckets = [
        Bucket.from_dataset(dataset, i, *args, **kwargs)
        for i in range(len(template))
    ]

    if dataset.verbose:
        for bucket in multibucket:
            shape = ' x '.join(str(x) for x in bucket.indices.shape)
            print('Bucket {name} is {shape}'.format(
                name=bucket.name,
                shape=ctext(shape, 'bright_blue')), file=sys.stderr)
    return multibucket
def parse(self, input_files, output_dir=None, output_file=None):
    """Restore the latest checkpoint and parse every input, writing the results.

    Args:
        input_files: a single input, a tuple/list of inputs, or a generator
            of inputs. Each input is either a path or an ``io.StringIO``.
        output_dir: directory for the output of path inputs. When both
            ``output_dir`` and ``output_file`` are None, ``self.save_dir`` is
            used.
        output_file: explicit output target. Required for ``io.StringIO``
            inputs (where it is expected to be an open file) and not allowed
            together with more than one input.

    For a path input without ``output_file``, the output keeps the input's
    file name inside ``output_dir``; if that is the input's own directory the
    name is prefixed with ``parsed-`` so the input is not overwritten.

    Raises:
        ValueError: if several inputs are given together with ``output_file``.
        AssertionError: if an ``io.StringIO`` input is given without
            ``output_file``.
    """
    if not isinstance(input_files, types.GeneratorType):
        if not isinstance(input_files, (tuple, list)):
            input_files = [input_files]
        if len(input_files) > 1 and output_file is not None:
            raise ValueError(
                'Cannot provide a value for --output_file when parsing multiple files'
            )

    with tf.Graph().as_default():
        config_proto = tf.ConfigProto()
        if self.per_process_gpu_memory_fraction == -1:
            config_proto.gpu_options.allow_growth = True
        else:
            config_proto.gpu_options.per_process_gpu_memory_fraction = self.per_process_gpu_memory_fraction
        with tf.Session(config=config_proto) as sess:
            # load the model and prep the parse set
            print("SELF.TRAIN_FILES", self.train_files, file=sys.stderr)
            self.add_file_vocabs(self.train_files)
            self.setup_vocabs()
            trainset = Trainset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
            with tf.variable_scope(self.name.title()):
                # Only called for its effect on the graph; the returned
                # tensors are not needed for parsing.
                # NOTE(review): presumably this creates the variables the
                # Saver restores below -- confirm.
                trainset()
            saver = tf.train.Saver(self.save_vars, max_to_keep=1)
            for var in self.non_save_vars:
                sess.run(var.initializer)
            saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))

            start_time = time.time()
            for input_file in input_files:
                self.add_file_vocabs([input_file])
                parseset = Parseset.from_configurable(
                    self, self.vocabs, parse_files=input_file, nlp_model=self.nlp_model)
                with tf.variable_scope(self.name.title(), reuse=True):
                    parse_tensors = parseset(moving_params=self.optimizer)
                parse_outputs = [
                    parse_tensors[parse_key] for parse_key in parseset.parse_keys
                ]

                if not isinstance(input_file, io.StringIO):
                    input_dir, input_file = os.path.split(input_file)
                    if output_dir is None and output_file is None:
                        output_dir = self.save_dir
                    if output_dir == input_dir and output_file is None:
                        # writing next to the input: avoid overwriting it
                        output_path = os.path.join(input_dir, 'parsed-' + input_file)
                    elif output_file is None:
                        output_path = os.path.join(output_dir, input_file)
                    else:
                        output_path = output_file
                else:
                    assert output_file is not None
                    output_path = output_file  # The expectation is for this to be an open file

                probs = []
                sents = []
                for feed_dict, tokens in parseset.iterbatches(shuffle=False):
                    probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
                    sents.append(tokens)
                parseset.write_probs(sents, output_path, probs, parseset._metadata)
                del parseset
            del trainset

    if self.verbose:
        elapsed = time.time() - start_time
        try:
            n_files = len(input_files)
        except TypeError:
            # input_files may be a generator, which has no len()
            message = 'Parsing took {} seconds'.format(elapsed)
        else:
            message = 'Parsing {0} file(s) took {1} seconds'.format(n_files, elapsed)
        print(ctext(message, 'bright_green'), file=sys.stderr)
    return
def train(self, load=False):
    """Train the model, then parse the training and parse files.

    One optimizer step is run per batch of ``trainset.iterbatches()``. On the
    first iteration and then every ``self.validate_every`` iterations the
    validation set is evaluated, the history is updated and, if the accuracy
    returned by ``validset.update_history`` beats the best seen so far, the
    model and ``history.pkl`` are saved to ``self.save_dir``.

    Training ends when ``self.max_train_iters`` is reached (checked at the
    start of each epoch) or when there has been no improvement for
    ``self.quit_after_n_iters_without_improvement`` iterations and more than
    ``self.min_train_iters`` iterations have run. Afterwards the latest
    checkpoint is restored and every file in ``self.train_files`` and
    ``self.parse_files`` is parsed into ``self.save_dir``.

    Args:
        load: when true, resume from ``history.pkl`` and the latest
            checkpoint found in ``self.save_dir``.
    """
    # prep the configurables
    self.add_file_vocabs(self.parse_files)
    self.setup_vocabs()
    trainset = Trainset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
    with tf.variable_scope(self.name.title()):
        train_tensors = trainset()
    print("train_tensors: ", train_tensors)
    train = self.optimizer(tf.losses.get_total_loss())
    train_outputs = [
        train_tensors[train_key] for train_key in trainset.train_keys
    ]
    saver = tf.train.Saver(self.save_vars, max_to_keep=1)
    validset = Parseset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
    with tf.variable_scope(self.name.title(), reuse=True):
        valid_tensors = validset(moving_params=self.optimizer)
    valid_outputs = [
        valid_tensors[train_key] for train_key in validset.train_keys
    ]
    valid_outputs2 = [
        valid_tensors[valid_key] for valid_key in validset.valid_keys
    ]

    current_acc = 0
    best_acc = 0
    n_iters_since_improvement = 0

    # calling these properties is inefficient so we save them in separate variables
    min_train_iters = self.min_train_iters
    max_train_iters = self.max_train_iters
    validate_every = self.validate_every
    save_every = self.save_every  # NOTE(review): currently unused in this method
    verbose = self.verbose
    quit_after_n_iters_without_improvement = self.quit_after_n_iters_without_improvement

    # load or prep the history
    if load:
        # The history is written in binary mode below, so it has to be read
        # back in binary mode; the context manager closes the file.
        # NOTE: pickle can execute arbitrary code -- only load a history.pkl
        # from a trusted save_dir.
        with open(os.path.join(self.save_dir, 'history.pkl'), 'rb') as f:
            self.history = pkl.load(f)
    else:
        self.history = {
            'train': defaultdict(list),
            'valid': defaultdict(list)
        }

    # start up the session
    config_proto = tf.ConfigProto()
    # GPU memory is always allowed to grow on demand here; the
    # per_process_gpu_memory_fraction setting is not consulted.
    config_proto.gpu_options.allow_growth = True
    with tf.Session(config=config_proto) as sess:
        sess.run(tf.global_variables_initializer())
        if load:
            saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
        total_train_iters = sess.run(self.global_step)
        train_accumulators = np.zeros(len(train_outputs))
        train_time = 0

        # training loop
        while total_train_iters < max_train_iters:
            print(total_train_iters)
            for feed_dict in trainset.iterbatches():
                start_time = time.time()
                # the last fetched value belongs to the train op and is dropped
                batch_values = sess.run(train_outputs + [train], feed_dict=feed_dict)[:-1]
                batch_time = time.time() - start_time

                # update accumulators
                total_train_iters += 1
                n_iters_since_improvement += 1
                train_accumulators += batch_values
                train_time += batch_time

                # possibly validate
                if total_train_iters == 1 or (total_train_iters % validate_every == 0):
                    valid_accumulators = np.zeros(len(train_outputs))
                    valid_time = 0
                    with codecs.open(os.path.join(self.save_dir, 'sanity_check'),
                                     'w', encoding='utf-8', errors='ignore') as f:
                        for feed_dict, sents in validset.iterbatches(return_check=True):
                            start_time = time.time()
                            batch_values = sess.run(valid_outputs + valid_outputs2, feed_dict=feed_dict)
                            batch_time = time.time() - start_time

                            # update accumulators
                            valid_accumulators += batch_values[:len(valid_outputs)]
                            valid_preds = batch_values[len(valid_outputs):]
                            valid_time += batch_time
                            validset.check(valid_preds, sents, f)

                    # update history
                    trainset.update_history(self.history['train'], train_accumulators)
                    current_acc = validset.update_history(self.history['valid'], valid_accumulators)

                    # print
                    if verbose:
                        print(ctext('{0:6d}'.format(int(total_train_iters)), 'bold') + ')')
                        sys.stdout.flush()
                        trainset.print_accuracy(train_accumulators, train_time)
                        validset.print_accuracy(valid_accumulators, valid_time)
                    train_accumulators = np.zeros(len(train_outputs))
                    train_time = 0

                    if current_acc > best_acc:
                        if verbose:
                            print(ctext('Saving model...', 'bright_yellow'), file=sys.stderr)
                            sys.stderr.flush()
                        best_acc = current_acc
                        n_iters_since_improvement = 0
                        saver.save(
                            sess,
                            os.path.join(self.save_dir, self.name.lower()),
                            #global_step=self.global_epoch,
                            write_meta_graph=False)
                        with open(os.path.join(self.save_dir, 'history.pkl'), 'wb') as f:
                            pkl.dump(dict(self.history), f)
                    elif (n_iters_since_improvement >= quit_after_n_iters_without_improvement
                          and total_train_iters > min_train_iters):
                        # early stopping: leave the epoch, then the while loop
                        break
            else:
                # We've completed one epoch
                if total_train_iters <= min_train_iters:
                    saver.save(
                        sess,
                        os.path.join(self.save_dir, self.name.lower()),
                        #global_step=self.global_epoch,
                        write_meta_graph=False)
                    with open(os.path.join(self.save_dir, 'history.pkl'), 'wb') as f:
                        pkl.dump(dict(self.history), f)
                sess.run(self.global_epoch.assign_add(1.))
                continue
            # only reached when the epoch was left through `break`
            break

        # Now parse the training and testing files
        input_files = self.train_files + self.parse_files
        saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
        # Time the whole parsing pass so that the message below, which
        # reports the number of files, covers all of them.
        parse_start_time = time.time()
        for input_file in input_files:
            parseset = Parseset.from_configurable(self, self.vocabs, parse_files=input_file, nlp_model=self.nlp_model)
            with tf.variable_scope(self.name.title(), reuse=True):
                parse_tensors = parseset(moving_params=self.optimizer)
            parse_outputs = [
                parse_tensors[parse_key] for parse_key in parseset.parse_keys
            ]
            # outputs keep the input's file name and go to the save directory
            output_path = os.path.join(self.save_dir, os.path.basename(input_file))

            probs = []
            sents = []
            for feed_dict, tokens in parseset.iterbatches(shuffle=False):
                probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
                sents.append(tokens)
            parseset.write_probs(sents, output_path, probs, parseset._metadata)

    if self.verbose:
        print(ctext(
            'Parsing {0} file(s) took {1} seconds'.format(
                len(input_files), time.time() - parse_start_time),
            'bright_green'), file=sys.stderr)
    return