def __init__(self, Batch, BatchAP, MiniBatchSize, chkpt_dir):
    # Assumes module-level imports/globals: itertools, numpy as np, pandas as pd,
    # tensorflow as tf, os, multiprocessing.cpu_count, the input placeholder `x`,
    # and the model() / load_filterbanks() helpers.
    self.MiniBatchSize = MiniBatchSize
    self.Batch = Batch

    # Build every (anchor, positive) pair of utterances per speaker.
    ap_pairs = []
    for spk in self.Batch['speaker_id'].unique():
        files = list(self.Batch[self.Batch['speaker_id'] == spk]['filename'])
        for i in itertools.combinations(files, 2):
            ap_pairs.append([i[0], i[1], spk])
    self.BatchAP = pd.DataFrame(ap_pairs, columns=['anchor', 'positive', 'speaker_id'])

    self.Cores = cpu_count() // 2  # integer division so Partitions stays an int
    self.Partitions = min(self.Cores, MiniBatchSize * 3)
    self.Speakers = self.Batch['speaker_id'].unique().tolist()
    self.APlen = np.inf

    # Restore the embedding network and pre-compute one embedding per file.
    embeddings = model(x, 1)
    tf.get_variable_scope().reuse_variables()
    saver = tf.train.Saver()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if os.path.exists(chkpt_dir + '/checkpoint'):
            saver.restore(sess, chkpt_dir + '/model.ckpt')
            print('restoring !')
        else:
            print('Checkpoint File Not Found !')
        xs = self.Batch['filename'].apply(load_filterbanks).tolist()
        filenames = self.Batch['filename'].tolist()
        embds = {}
        for _x in range(len(xs)):  # iterate over indices, not the list itself
            embds[filenames[_x]] = sess.run(embeddings, feed_dict={x: xs[_x]})
        self.embds = embds
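# NOTE: load_filterbanks() is referenced above but not shown here. A minimal sketch
# of what it might look like, assuming wav input and log mel filterbank features via
# python_speech_features (the library choice and nfilt value are assumptions, not
# taken from the source):
import numpy as np
from scipy.io import wavfile
from python_speech_features import logfbank

def load_filterbanks(path, nfilt=40):
    # Read the waveform and compute log mel filterbank energies, giving a
    # (num_frames, nfilt) float array to feed the embedding model.
    rate, signal = wavfile.read(path)
    feats = logfbank(signal, samplerate=rate, nfilt=nfilt)
    return feats.astype(np.float32)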
def train(self):
    # iteration number
    global_step = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    # training graph
    iterator = self._data_layer()
    input_layer, labels, _ = iterator.get_next()
    logits = model(input_layer, self.num_labels, training=True)
    total_loss = self._loss_functions(logits, labels)
    optimizer = self._optimizer(total_loss, global_step)

    # summary ops and placeholders
    summ_op, mean_loss = self._summaries(logits)

    # don't allocate entire gpu memory
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(self.checkpoint_path, sess.graph)
        saver = tf.train.Saver(max_to_keep=None)  # keep all checkpoints
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)

        # resume training if a checkpoint exists
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        initial_step = global_step.eval()

        # train the model
        streaming_loss = 0
        for i in range(initial_step, self.num_iter + 1):
            _, loss_batch = sess.run([optimizer, total_loss])

            # log training statistics
            streaming_loss += loss_batch
            if i % self.log_iter == self.log_iter - 1:
                streaming_loss /= self.log_iter
                print(i + 1, streaming_loss)
                summary = sess.run(summ_op, feed_dict={mean_loss: streaming_loss})
                writer.add_summary(summary, global_step=i)
                streaming_loss = 0

            # save model
            if i % self.save_iter == self.save_iter - 1:
                saver.save(sess,
                           os.path.join(self.checkpoint_path, 'checkpoint'),
                           global_step=global_step)
                print("Model saved!")

        writer.close()
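# _summaries() above is not shown. A plausible minimal version, assuming it only
# tracks the streaming mean loss (the logits argument is accepted but unused in
# this sketch):
def _summaries(self, logits):
    # Placeholder fed with the averaged loss every log_iter steps.
    mean_loss = tf.placeholder(tf.float32, shape=(), name='mean_loss')
    loss_summary = tf.summary.scalar('mean_loss', mean_loss)
    summ_op = tf.summary.merge([loss_summary])
    return summ_op, mean_loss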
def __init__(self, checkpoint_path='./checkpoints/'):
    self.checkpoint_path = checkpoint_path
    self.num_labels = 3
    # Single-utterance input of shape (num_frames, num_features); expand_dims
    # adds the batch dimension the model expects.
    self.input_layer = tf.placeholder(tf.float32, shape=[None, None], name='input')
    logits = model(tf.expand_dims(self.input_layer, axis=0),
                   num_labels=self.num_labels, training=False)
    self.preds = tf.squeeze(tf.nn.softmax(logits))
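# The inference class above only builds the graph; a companion predict() helper
# might look like this (a sketch only; the session and checkpoint handling below
# are assumptions, not taken from the source):
def predict(self, features):
    # features: (num_frames, num_features) array for a single utterance.
    saver = tf.train.Saver()
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
        probs = sess.run(self.preds, feed_dict={self.input_layer: features})
    # Returns the softmax probabilities over the num_labels classes.
    return probs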
def train():
    BatchLoader = MultiTaskBatchManager(Data=train_df, batch_size=batch_size,
                                        Ntasks=5, Nepochs=num_epoch)
    print('steps per epoch = %s' % (len(train_df) // batch_size))
    with T_graph.as_default():
        tower_grads = []
        global_step = tf.Variable(0, name='global_step', trainable=False)
        x = tf.placeholder('float32', shape=[batch_size, None])
        y = tf.placeholder('int32', shape=[batch_size])
        optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
        available_gpus = get_available_gpus()
        num_clones = len(available_gpus)
        #x, y, flag = train_data.GetBatch()
        #x = tf.convert_to_tensor(x, dtype=tf.float32)
        #y = tf.convert_to_tensor(y, dtype=tf.int32)
        #batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue([x, y], capacity=2 * num_clones)
        print('Number of clones = %d' % num_clones)
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(num_clones):
                with tf.device(available_gpus[i]):
                    #batch_x, batch_y = batch_queue.dequeue()
                    # Distribute data among all clones equally.
                    step = int(batch_size / float(num_clones))
                    # Network outputs.
                    prediction = model(x[i * step:(i + 1) * step], step, total_speakers)
                    label_onehot = tf.one_hot(y[i * step:(i + 1) * step],
                                              depth=total_speakers + 1)
                    #prediction = model(batch_x, batch_size, total_speakers)
                    #label_onehot = tf.one_hot(batch_y, depth=total_speakers + 1)
                    SOFTMAX = tf.nn.softmax_cross_entropy_with_logits(
                        logits=prediction, labels=label_onehot)
                    with tf.name_scope('loss'):
                        loss = tf.reduce_mean(SOFTMAX)
                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()
                    # Calculate the gradients for the batch of data on this tower.
                    grads = optimizer.compute_gradients(loss)
                    tower_grads.append(grads)
        grads = average_gradients(tower_grads)
        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)

        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
                                                              global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        summaries.add(tf.summary.scalar('loss', loss))
        summary_op = tf.summary.merge(list(summaries))

        with tf.Session(graph=T_graph,
                        config=tf.ConfigProto(allow_soft_placement=True,
                                              log_device_placement=False)) as sess:
            saver = tf.train.Saver()
            summary_writer = tf.summary.FileWriter(chkpt_dir, graph=T_graph)
            sess.run(tf.global_variables_initializer())
            tf.train.start_queue_runners(sess=sess)
            if os.path.exists(chkpt_dir + '/checkpoint'):
                print('restoring !!')
                saver.restore(sess, chkpt_file)
            elif not os.path.exists(chkpt_dir):
                os.mkdir(chkpt_dir)

            print('Training Started !!')
            isrunning = True
            stepcount = 0
            steploss = 0
            epoch_loss = 0
            nep = 0
            while isrunning:
                stepcount += 1
                #st = time.time()
                batch_x, batch_y, flag, isrunning = BatchLoader.next_batch()
                #print('data loading time %s' % (time.time() - st))
                if isrunning:
                    #st = time.time()
                    # Run train_op (gradient update plus EMA update) rather than
                    # apply_gradient_op alone, so the moving averages are tracked.
                    _, c, summary, g = sess.run(
                        [train_op, loss, summary_op, global_step],
                        feed_dict={x: batch_x, y: batch_y})
                    #print('training time %s' % (time.time() - st))
                    steploss += c
                    epoch_loss += c
                    if stepcount % 100 == 0:
                        save_path = saver.save(sess, chkpt_file)
                        summary_writer.add_summary(summary, global_step=g)
                        print('step_loss : %s ' % steploss)
                        logging.info('step_loss : %s' % steploss)
                        steploss = 0
                    # len(train_df) // batch_size steps make up one epoch.
                    if stepcount % (len(train_df) // batch_size) == 0:
                        logging.info('Epoch %d loss : %f' % (nep, epoch_loss))
                        print('Epoch %d loss : %f' % (nep, epoch_loss))
                        epoch_loss = 0
                        nep += 1
            logging.info('last batch %d loss : %f' % (nep, epoch_loss))
            print('last batch %d loss : %f' % (nep, epoch_loss))
            BatchLoader.close()
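# average_gradients() is used by both multi-GPU train() functions but is not shown
# in this file. The usual tower-averaging helper (as in the classic TensorFlow
# multi-GPU examples) looks roughly like this; treat it as a sketch, not the
# project's exact implementation:
def average_gradients(tower_grads):
    # tower_grads: list (one entry per GPU) of lists of (gradient, variable) pairs.
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Stack the per-tower gradients for this variable and average them.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, axis=0), axis=0)
        # All towers share the variable, so take it from the first tower.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads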
def train():
    BatchLoader = MultiTaskBatchManager(Data=train_df, batch_size=batch_size,
                                        Ntasks=5, Nepochs=num_epoch)
    with T_graph.as_default():
        tower_grads = []
        global_step = tf.Variable(0, name='global_step', trainable=False)
        x = tf.placeholder('float32')
        alpha = tf.placeholder('float32')
        optimizer = tf.train.AdamOptimizer(learning_rate=args.lr)
        available_gpus = get_available_gpus()
        num_clones = len(available_gpus)
        print('Number of clones = %d' % num_clones)
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(num_clones):
                with tf.device(available_gpus[i]):
                    # Network outputs: one L2-normalised embedding per utterance.
                    prediction = model(x[i], batch_size * 3, total_speakers)
                    prediction = tf.nn.l2_normalize(prediction, 1, 1e-10,
                                                    name='embeddings')
                    with tf.name_scope('loss'):
                        loss = triplet_loss(prediction, alpha)
                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()
                    # Calculate the gradients for the batch of data on this tower.
                    grads = optimizer.compute_gradients(loss)
                    tower_grads.append(grads)
        grads = average_gradients(tower_grads)
        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = optimizer.apply_gradients(grads, global_step=global_step)

        # Track the moving averages of all trainable variables.
        MOVING_AVERAGE_DECAY = 0.9999
        variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,
                                                              global_step)
        variables_averages_op = variable_averages.apply(tf.trainable_variables())
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        summaries.add(tf.summary.scalar('loss', loss))
        summary_op = tf.summary.merge(list(summaries))

        with tf.Session(graph=T_graph,
                        config=tf.ConfigProto(allow_soft_placement=True,
                                              log_device_placement=False)) as sess:
            saver = tf.train.Saver()
            summary_writer = tf.summary.FileWriter(chkpt_dir, graph=T_graph)
            sess.run(tf.global_variables_initializer())
            tf.train.start_queue_runners(sess=sess)
            if os.path.exists(chkpt_dir + '/checkpoint'):
                print('restoring !!')
                saver.restore(sess, chkpt_file)
            elif not os.path.exists(chkpt_dir):
                os.mkdir(chkpt_dir)

            print('Training Started !!')
            isrunning = True
            stepcount = 0
            steploss = 0
            while isrunning:
                stepcount += 1
                #st = time.time()
                # Collect one mini-batch per clone; stop when the loader is done.
                batch_xs = []
                for _ in range(num_clones):
                    batch_x, batch_y, flag, isrunning = BatchLoader.next_batch()
                    batch_xs.append(batch_x)
                    if not isrunning:
                        break
                if not isrunning:
                    break
                #print('data loading time %s' % (time.time() - st))
                #st = time.time()
                # Run train_op (gradient update plus EMA update) rather than
                # apply_gradient_op alone, so the moving averages are tracked.
                _, c, summary, g = sess.run(
                    [train_op, loss, summary_op, global_step],
                    feed_dict={x: batch_xs, alpha: 0.1})
                #print('training time %s' % (time.time() - st))
                steploss += c
                if stepcount % 100 == 0:
                    save_path = saver.save(sess, chkpt_file)
                    summary_writer.add_summary(summary, global_step=g)
                    print('step_loss : %s ' % steploss)
                    steploss = 0
            BatchLoader.close()
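# triplet_loss() above is not defined in this file. Assuming each tower's batch of
# batch_size * 3 embeddings is laid out as [anchors | positives | negatives] in
# equal thirds (an assumption about the batch layout, not confirmed by the source),
# a FaceNet-style margin loss would be:
def triplet_loss(embeddings, alpha):
    # Split the stacked embeddings into anchor, positive, and negative thirds.
    anchor, positive, negative = tf.split(embeddings, num_or_size_splits=3, axis=0)
    pos_dist = tf.reduce_sum(tf.square(anchor - positive), axis=1)
    neg_dist = tf.reduce_sum(tf.square(anchor - negative), axis=1)
    basic_loss = pos_dist - neg_dist + alpha
    # Only triplets that violate the margin contribute to the loss.
    return tf.reduce_mean(tf.maximum(basic_loss, 0.0))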