# NOTE: these methods assume the module-level imports of the original file,
# roughly: os, sys, time, codecs, cPickle as pkl, os.path as op, numpy as np,
# tensorflow as tf, collections.defaultdict, plus the package's own Trainset,
# Parseset, Bucket, ctext, and color_pattern.

def parse(self, input_files, output_dir=None, output_file=None):
  """Parse one or more input files with the saved model and write the results."""

  if not isinstance(input_files, (tuple, list)):
    input_files = [input_files]
  if len(input_files) > 1 and output_file is not None:
    raise ValueError('Cannot provide a value for --output_file when parsing multiple files')
  self.add_file_vocabs(input_files)

  start_time = time.time()
  for input_file in input_files:
    with tf.Graph().as_default():
      config_proto = tf.ConfigProto()
      if self.per_process_gpu_memory_fraction == -1:
        config_proto.gpu_options.allow_growth = True
      else:
        config_proto.gpu_options.per_process_gpu_memory_fraction = self.per_process_gpu_memory_fraction
      with tf.Session(config=config_proto) as sess:
        # load the model and prep the parse set
        self.setup_vocabs()
        trainset = Trainset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
        with tf.variable_scope(self.name.title()):
          train_tensors = trainset()
        train_outputs = [train_tensors[train_key] for train_key in trainset.train_keys]
        saver = tf.train.Saver(self.save_vars, max_to_keep=1)
        for var in self.non_save_vars:
          sess.run(var.initializer)
        saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))

        # Iterate through files and batches
        parseset = Parseset.from_configurable(self, self.vocabs, parse_files=input_file, nlp_model=self.nlp_model)
        with tf.variable_scope(self.name.title(), reuse=True):
          parse_tensors = parseset(moving_params=self.optimizer)
        parse_outputs = [parse_tensors[parse_key] for parse_key in parseset.parse_keys]

        # pick the output path: default to the save directory, and avoid
        # overwriting the input when writing back into the input directory
        input_dir, input_file = os.path.split(input_file)
        if output_dir is None and output_file is None:
          output_dir = self.save_dir
        if output_dir == input_dir and output_file is None:
          output_path = os.path.join(input_dir, 'parsed-'+input_file)
        elif output_file is None:
          output_path = os.path.join(output_dir, input_file)
        else:
          output_path = output_file

        probs = []
        sents = []
        for feed_dict, tokens in parseset.iterbatches(shuffle=False):
          probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
          sents.append(tokens)
        parseset.write_probs(sents, output_path, probs)
      del trainset
      del parseset
      print('Finished one file')
  if self.verbose:
    print(ctext('Parsing {0} file(s) took {1} seconds'.format(len(input_files), time.time()-start_time), 'bright_green'))
  return
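# Usage sketch (hedged): `parse` is a method of this repo's network class; the
# construction below is illustrative, and `Parser.from_configurable(config)` is
# an assumed entry point, not the repo's confirmed API.
#
#   network = Parser.from_configurable(config)                    # hypothetical setup
#   network.parse(['dev.conllu', 'test.conllu'], output_dir='parsed/')
#   network.parse('test.conllu', output_file='parsed/test.conllu')
#   # passing output_file together with multiple inputs raises ValueError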
def print_accuracy(self, accumulators, time, prefix='Train'):
  """Pretty-print the accumulated loss/accuracy statistics for one interval."""

  # NB: the `time` parameter shadows the `time` module inside this method
  acc_dict = self.process_accumulators(accumulators, time=time)
  strings = []
  strings.append(color_pattern('Loss:', '{Loss:7.3f}', 'bright_red'))
  strings.append(color_pattern('TS:', '{TS:5.2f}%', 'bright_cyan'))
  strings.append(color_pattern('SS:', '{SS:5.2f}%', 'bright_green'))
  strings.append(color_pattern('Speed:', '{Seq_rate:6.1f} seqs/sec', 'bright_magenta'))
  string = ctext('{0} ', 'bold') + ' | '.join(strings)
  print(string.format(prefix, **acc_dict))
  return
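# Example (hedged): assuming process_accumulators returns a dict such as
# {'Loss': 1.234, 'TS': 92.10, 'SS': 55.00, 'Seq_rate': 120.5}, the line
# printed for prefix='Train' looks roughly like (ANSI colors omitted):
#
#   Train Loss:   1.234 | TS: 92.10% | SS: 55.00% | Speed:  120.5 seqs/sec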
@classmethod
def from_dataset(cls, dataset, *args, **kwargs):
  """Build a multibucket (and its buckets) from an existing dataset."""

  multibucket = cls.from_configurable(dataset, *args, **kwargs)
  indices = []
  for multibucket_ in dataset:
    indices.append(multibucket_.indices)
  # sanity check (disabled): every multibucket in the dataset should index its
  # sentences identically
  # for i in xrange(1, len(indices)):
  #   assert np.equal(indices[0].astype(int), indices[i].astype(int)).all()
  multibucket._indices = np.array(multibucket_.indices)
  buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs) for i in xrange(len(multibucket_))]
  multibucket._buckets = buckets
  if dataset.verbose:
    for bucket in multibucket:
      print('Bucket {name} is {shape}'.format(name=bucket.name, shape=ctext(' x '.join(str(x) for x in bucket.indices.shape), 'bright_blue')))
  return multibucket
@classmethod
def from_dataset(cls, dataset, *args, **kwargs):
  """Build a multibucket (and its buckets) from an existing dataset."""

  multibucket = cls.from_configurable(dataset, *args, **kwargs)
  indices = []
  for multibucket_ in dataset:
    indices.append(multibucket_.indices)
  # Here the batches are built by giving each sentence its batch id and its
  # relative id within that batch (sentence 1 in 4:5 === sentence 1 of the
  # corpus placed in batch 4 at position 5)
  # for i in xrange(1, len(indices)):
  #   assert np.equal(indices[0].astype(int), indices[i].astype(int)).all()
  multibucket._indices = np.array(multibucket_.indices)
  buckets = [Bucket.from_dataset(dataset, i, *args, **kwargs) for i in xrange(len(multibucket_))]
  multibucket._buckets = buckets
  if dataset.verbose:
    for bucket in multibucket:
      print('Bucket {name} is {shape}'.format(name=bucket.name, shape=ctext(' x '.join(str(x) for x in bucket.indices.shape), 'bright_blue')))
  return multibucket
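# Illustration (hedged): per the comment above, each sentence is addressed by
# a (bucket id, position in bucket) pair. With two buckets padded to lengths
# 10 and 20, a corpus with sentence lengths [7, 18, 9] could be laid out as
#   sentence 0 -> bucket 0, position 0
#   sentence 1 -> bucket 1, position 0
#   sentence 2 -> bucket 0, position 1
# so the verbose report would print e.g. "Bucket len10 is 2 x 10" (sentences
# x padded length); the bucket names and shapes here are assumptions.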
def train(self, load=False):
  """Train the model, validating periodically and checkpointing the best score."""

  # prep the configurables
  self.add_file_vocabs(self.parse_files)
  self.setup_vocabs()
  trainset = Trainset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
  with tf.variable_scope(self.name.title()):
    train_tensors = trainset()
    train = self.optimizer(tf.losses.get_total_loss())
  train_outputs = [train_tensors[train_key] for train_key in trainset.train_keys]
  saver = tf.train.Saver(self.save_vars, max_to_keep=1)
  validset = Parseset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
  with tf.variable_scope(self.name.title(), reuse=True):
    valid_tensors = validset(moving_params=self.optimizer)
  valid_outputs = [valid_tensors[train_key] for train_key in validset.train_keys]
  valid_outputs2 = [valid_tensors[valid_key] for valid_key in validset.valid_keys]
  current_acc = 0
  best_acc = 0
  n_iters_since_improvement = 0
  n_iters_in_epoch = 0

  # calling these properties is inefficient so we save them in separate variables
  min_train_iters = self.min_train_iters
  max_train_iters = self.max_train_iters
  validate_every = self.validate_every
  save_every = self.save_every
  verbose = self.verbose
  quit_after_n_iters_without_improvement = self.quit_after_n_iters_without_improvement

  # load or prep the history
  if load:
    self.history = pkl.load(open(os.path.join(self.save_dir, 'history.pkl')))
  else:
    self.history = {'train': defaultdict(list), 'valid': defaultdict(list)}

  # start up the session
  config_proto = tf.ConfigProto()
  if self.per_process_gpu_memory_fraction == -1:
    config_proto.gpu_options.allow_growth = True
  else:
    config_proto.gpu_options.per_process_gpu_memory_fraction = self.per_process_gpu_memory_fraction
  with tf.Session(config=config_proto) as sess:
    sess.run(tf.global_variables_initializer())
    if load:
      saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
    total_train_iters = sess.run(self.global_step)
    train_accumulators = np.zeros(len(train_outputs))
    train_time = 0
    # training loop
    while total_train_iters < max_train_iters:
      for feed_dict in trainset.iterbatches():
        start_time = time.time()
        batch_values = sess.run(train_outputs + [train], feed_dict=feed_dict)[:-1]
        batch_time = time.time() - start_time
        # update accumulators
        total_train_iters += 1
        n_iters_since_improvement += 1
        train_accumulators += batch_values
        train_time += batch_time
        # possibly validate
        if total_train_iters == 1 or (total_train_iters % validate_every == 0):
          valid_accumulators = np.zeros(len(train_outputs))
          valid_time = 0
          with codecs.open(os.path.join(self.save_dir, 'sanity_check'), 'w', encoding='utf-8', errors='ignore') as f:
            for feed_dict, sents in validset.iterbatches(return_check=True):
              start_time = time.time()
              batch_values = sess.run(valid_outputs + valid_outputs2, feed_dict=feed_dict)
              batch_time = time.time() - start_time
              # update accumulators
              valid_accumulators += batch_values[:len(valid_outputs)]
              valid_preds = batch_values[len(valid_outputs):]
              valid_time += batch_time
              validset.check(valid_preds, sents, f)
          # update history
          trainset.update_history(self.history['train'], train_accumulators)
          current_acc = validset.update_history(self.history['valid'], valid_accumulators)
          # print
          if verbose:
            print(ctext('{0:6d}'.format(int(total_train_iters)), 'bold') + ')')
            trainset.print_accuracy(train_accumulators, train_time)
            validset.print_accuracy(valid_accumulators, valid_time)
          train_accumulators = np.zeros(len(train_outputs))
          train_time = 0
          if current_acc > best_acc:
            if verbose:
              print(ctext('Saving model...', 'bright_yellow'))
            best_acc = current_acc
            n_iters_since_improvement = 0
            saver.save(sess,
                       os.path.join(self.save_dir, self.name.lower()),
                       #global_step=self.global_epoch,
                       write_meta_graph=False)
            with open(os.path.join(self.save_dir, 'history.pkl'), 'w') as f:
              pkl.dump(dict(self.history), f)
          elif n_iters_since_improvement >= quit_after_n_iters_without_improvement and total_train_iters > min_train_iters:
            # too long without improvement: leave the batch loop (and, via the
            # for/else below, the whole training loop)
            break
      else:
        # We've completed one epoch without breaking
        if total_train_iters <= min_train_iters:
          saver.save(sess,
                     os.path.join(self.save_dir, self.name.lower()),
                     #global_step=self.global_epoch,
                     write_meta_graph=False)
          with open(os.path.join(self.save_dir, 'history.pkl'), 'w') as f:
            pkl.dump(dict(self.history), f)
        sess.run(self.global_epoch.assign_add(1.))
        continue
      break

    # Now parse the training and testing files with the best checkpoint
    input_files = self.train_files + self.parse_files
    saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
    start_time = time.time()  # time the whole parsing pass
    for input_file in input_files:
      parseset = Parseset.from_configurable(self, self.vocabs, parse_files=input_file, nlp_model=self.nlp_model)
      with tf.variable_scope(self.name.title(), reuse=True):
        parse_tensors = parseset(moving_params=self.optimizer)
      parse_outputs = [parse_tensors[parse_key] for parse_key in parseset.parse_keys]

      input_dir, input_file = os.path.split(input_file)
      output_dir = self.save_dir
      output_file = input_file

      probs = []
      sents = []
      for feed_dict, tokens in parseset.iterbatches(shuffle=False):
        probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
        sents.append(tokens)
      parseset.write_probs(sents, os.path.join(output_dir, output_file), probs)
    if self.verbose:
      print(ctext('Parsing {0} file(s) took {1} seconds'.format(len(input_files), time.time()-start_time), 'bright_green'))
  return
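# Control-flow sketch (hedged): both train() variants rely on Python's
# for/else idiom to turn an inner-loop break into an outer-loop exit.
# `training`, `batches`, and `should_stop` below are placeholders, not names
# from this repo.
#
#   while training:
#     for batch in batches:
#       if should_stop():
#         break        # leaves the for loop and skips its else clause
#     else:
#       continue       # epoch ended normally: go to the next while iteration
#     break            # only reached after the inner break: end training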
def train(self, load=False):
  """Train with a wall-clock budget, stopping when improvement per hour stalls."""

  print('TRAIN')
  # prep the configurables
  self.add_file_vocabs(self.parse_files)

  # train
  trainset = Trainset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
  with tf.variable_scope(self.name.title()):
    train_tensors = trainset()
    train = self.optimizer(tf.losses.get_total_loss())
  train_outputs = [train_tensors[train_key] for train_key in trainset.train_keys]
  saver = tf.train.Saver(self.save_vars, max_to_keep=1)

  # valid
  validset = Parseset.from_configurable(self, self.vocabs, nlp_model=self.nlp_model)
  with tf.variable_scope(self.name.title(), reuse=True):
    valid_tensors = validset(moving_params=self.optimizer)
  valid_outputs = [valid_tensors[train_key] for train_key in validset.train_keys]
  valid_outputs2 = [valid_tensors[valid_key] for valid_key in validset.valid_keys]

  # init
  current_acc = 0
  best_acc = 0
  n_iters_since_improvement = 0
  n_iters_in_epoch = 0
  # calling these properties is inefficient so we save them in separate variables
  min_train_iters = self.min_train_iters
  max_train_time = self.max_train_time
  min_percent_per_hour = self.min_percent_per_hour
  validate_every = self.validate_every
  save_every = self.save_every
  verbose = self.verbose
  quit_after_n_iters_without_improvement = self.quit_after_n_iters_without_improvement

  # if there is no history file, don't try to load
  if not op.isfile(os.path.join(self.save_dir, 'history.pkl')):
    load = False
  # load or prep the history
  if load:
    self.history = pkl.load(open(os.path.join(self.save_dir, 'history.pkl')))
  else:
    self.history = {'train': defaultdict(list), 'valid': defaultdict(list)}

  # stopping criterion: rolling windows over the last `range_scores`
  # validation scores and their training times
  range_scores = 20
  last_scores = np.repeat(0.0, range_scores)
  last_train_time = np.repeat(1.0, range_scores)

  # start up the session
  last_training_time = time.time()
  # processor configuration
  config_proto = tf.ConfigProto()
  # config_proto = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
  # config_proto.gpu_options.per_process_gpu_memory_fraction = 0.5
  # config_proto.gpu_options.allow_growth = True
  with tf.Session(config=config_proto) as sess:  # session initialization
    # train_writer = tf.summary.FileWriter('./logs/1/train', sess.graph)
    sess.run(tf.global_variables_initializer())
    if load:
      # restore a saved session
      saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
    total_train_iters = sess.run(self.global_step)
    train_accumulators = np.zeros(len(train_outputs))
    train_time = 0
    # training loop
    while sess.run(self.global_time) < max_train_time:
      for feed_dict in trainset.iterbatches():
        sys.stdout.write("-")
        sys.stdout.flush()
        start_time = time.time()
        batch_values = sess.run(train_outputs + [train], feed_dict=feed_dict)[:-1]
        batch_time = time.time() - start_time
        # merge = tf.summary.merge_all()
        # summary = sess.run(merge, feed_dict=feed_dict)
        # train_writer.add_summary(summary, total_train_iters)
        # update accumulators
        total_train_iters += 1
        n_iters_since_improvement += 1
        train_accumulators += batch_values
        train_time += batch_time
        # track total training time (in minutes) across sessions
        sess.run(self.global_time.assign_add((time.time() - last_training_time) / 60.0))
        last_training_time = time.time()
        # possibly validate
        if total_train_iters == 1 or (total_train_iters % validate_every == 0):
          valid_accumulators = np.zeros(len(train_outputs))
          valid_time = 0
          print(ctext('\nStarting sanity check...', 'bright_yellow'))
          with codecs.open(os.path.join(self.save_dir, 'sanity_check'), 'w', encoding='utf-8', errors='ignore') as f:
            for feed_dict, sents in validset.iterbatches(return_check=True):
              start_time = time.time()
              batch_values = sess.run(valid_outputs + valid_outputs2, feed_dict=feed_dict)
              batch_time = time.time() - start_time
              # update accumulators
              valid_accumulators += batch_values[:len(valid_outputs)]
              valid_preds = batch_values[len(valid_outputs):]
              valid_time += batch_time
              # validset.check(valid_preds, sents, f)
          print(ctext('End of sanity check', 'bright_yellow'))
          # update history
          trainset.update_history(self.history['train'], train_accumulators)
          current_acc = validset.update_history(self.history['valid'], valid_accumulators)
          # print
          if verbose:
            print(ctext('{0:6d}'.format(int(total_train_iters)), 'bold') + ')')
            trainset.print_accuracy(train_accumulators, train_time)
            validset.print_accuracy(valid_accumulators, valid_time)
            print("Training time so far: ", sess.run(self.global_time), "min")
          train_accumulators = np.zeros(len(train_outputs))

          # evaluate the stopping criterion: average accuracy gain per hour
          last_scores[range_scores - 1] = current_acc
          last_train_time[range_scores - 1] = train_time
          avg_percent_per_sec = np.diff(last_scores) / last_train_time[1:]
          avg_percent_per_sec = np.mean(avg_percent_per_sec)
          avg_percent_per_hour = avg_percent_per_sec * 3600
          print(int(avg_percent_per_hour), "% per hour")
          stop_criteria = avg_percent_per_hour < min_percent_per_hour and total_train_iters > min_train_iters
          stop_criteria = stop_criteria or sess.run(self.global_time) > max_train_time
          # logs
          print("avg_percent_per_hour ", avg_percent_per_hour)
          print("min_percent_per_hour ", min_percent_per_hour)
          print("total_train_iters ", total_train_iters)
          print("min_train_iters ", min_train_iters)
          print("global_time ", sess.run(self.global_time))
          print("max_train_time ", max_train_time)
          print("avg_percent_per_hour < min_percent_per_hour ", avg_percent_per_hour < min_percent_per_hour)
          print("total_train_iters > min_train_iters ", total_train_iters > min_train_iters)
          print("sess.run(self.global_time) > max_train_time ", sess.run(self.global_time) > max_train_time)
          print("stop_criteria ", stop_criteria)
          print("best acc ", best_acc)

          # best model
          if current_acc > best_acc:
            if verbose:
              print(ctext('Saving model...', 'bright_yellow'))
            best_acc = current_acc
            n_iters_since_improvement = 0
            saver.save(sess,
                       os.path.join(self.save_dir, self.name.lower()),
                       #global_step=self.global_epoch,
                       write_meta_graph=False)
            with open(os.path.join(self.save_dir, 'history.pkl'), 'w') as f:
              pkl.dump(dict(self.history), f)
            if verbose:
              print(ctext('Saved!', 'bright_yellow'))
          # stopping criterion
          if stop_criteria:
            break
          # shift the stopping-criterion windows
          for i in range(1, range_scores):
            last_scores[i - 1] = last_scores[i]
            last_train_time[i - 1] = last_train_time[i]
          train_time = 0
      else:
        # We've completed one epoch without breaking
        if total_train_iters <= min_train_iters:
          saver.save(sess,
                     os.path.join(self.save_dir, self.name.lower()),
                     #global_step=self.global_epoch,
                     write_meta_graph=False)
          with open(os.path.join(self.save_dir, 'history.pkl'), 'w') as f:
            pkl.dump(dict(self.history), f)
        sess.run(self.global_epoch.assign_add(1.))
        continue
      break

    # Now parse the training and testing files with the best checkpoint
    input_files = self.train_files + self.parse_files
    saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
    start_time = time.time()  # time the whole parsing pass
    for input_file in input_files:
      parseset = Parseset.from_configurable(self, self.vocabs, parse_files=input_file, nlp_model=self.nlp_model)
      with tf.variable_scope(self.name.title(), reuse=True):
        parse_tensors = parseset(moving_params=self.optimizer)
      parse_outputs = [parse_tensors[parse_key] for parse_key in parseset.parse_keys]

      input_dir, input_file = os.path.split(input_file)
      output_dir = self.save_dir
      output_file = input_file

      probs = []
      sents = []
      for feed_dict, tokens in parseset.iterbatches(shuffle=False):
        probs.append(sess.run(parse_outputs, feed_dict=feed_dict))
        sents.append(tokens)
      parseset.write_probs(sents, os.path.join(output_dir, output_file), probs)
    if self.verbose:
      print(ctext('Parsing {0} file(s) took {1} seconds'.format(len(input_files), time.time() - start_time), 'bright_green'))
  return
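# Worked example (hedged) of the improvement-rate criterion above: with a
# window of the last 3 validation scores [80.0, 80.5, 80.8] (%), each interval
# taking 600 s of training,
#   per-second gains = diff([80.0, 80.5, 80.8]) / [600, 600]
#                    = [0.5/600, 0.3/600]
#   mean             = 0.000666...  % per second
#   per hour         = 0.000666... * 3600 ~= 2.4 % per hour
# so training stops once this rate falls below min_percent_per_hour (provided
# total_train_iters > min_train_iters), or when global_time exceeds
# max_train_time.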