def __init__(self, n_classes, input_ch, input_time,
             batch_norm=True, batch_norm_alpha=0.1):
    super(EmbeddingShallowCNN, self).__init__()
    self.batch_norm = batch_norm
    self.batch_norm_alpha = batch_norm_alpha
    self.n_classes = n_classes
    self.num_filters = 50
    self.convnet = nn.Sequential(
        nn.Conv2d(1, self.num_filters, (1, 25), stride=1),
        nn.Conv2d(self.num_filters, self.num_filters, (62, 1), stride=1),
        nn.BatchNorm2d(self.num_filters, eps=1e-05, momentum=0.1,
                       affine=True, track_running_stats=True),
        nn.ELU(),
        nn.AvgPool2d(kernel_size=(1, 75), stride=(1, 15)),  # 1,149
    )
    self.convnet.eval()
    out = self.convnet(
        np_to_var(np.ones((1, 1, input_ch, input_time), dtype=np.float32)))
    temp = out.view(out.size()[0], -1)
    n_out_time = temp.cpu().data.numpy().shape[1]
    self.num_hidden = n_out_time
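# A minimal, self-contained sketch (not part of the original model code) of the
# dummy-forward trick used above: run a ones-filled input through the conv stack once
# and read off the flattened feature size instead of deriving it by hand.
# `infer_flat_features` and the toy network below are illustrative names, not repo APIs.
import numpy as np
import torch
import torch.nn as nn


def infer_flat_features(convnet, input_ch, input_time):
    """Return the flattened feature size of `convnet` for a (1, input_ch, input_time) input."""
    convnet.eval()
    with torch.no_grad():
        dummy = torch.from_numpy(
            np.ones((1, 1, input_ch, input_time), dtype=np.float32))
        out = convnet(dummy)
    return out.view(out.size(0), -1).size(1)


# Example with a shallow stack similar to the one above (62 channels, 1000 samples):
_toy = nn.Sequential(nn.Conv2d(1, 4, (1, 25)),
                     nn.Conv2d(4, 4, (62, 1)),
                     nn.AvgPool2d((1, 75), (1, 15)))
# infer_flat_features(_toy, 62, 1000) -> 244  (4 filters * 1 * 61 time steps)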
def __init__(self, batch_norm=True, batch_norm_alpha=0.1):
    super(EEGNet_v2, self).__init__()
    self.batch_norm = batch_norm
    self.batch_norm_alpha = batch_norm_alpha
    n_ch1 = 100
    n_ch2 = 100
    n_ch3 = 100
    self.convnet = nn.Sequential(
        # nn.Conv3d(1, 50, (1, 1, 10), 1),
        # nn.Conv3d(50, 50, (1, 1, 10), 1),
        nn.Conv2d(1, 8, kernel_size=(1, 64), stride=1),
        nn.BatchNorm2d(8),
        nn.Conv2d(8, 16, kernel_size=(62, 1), stride=1, groups=8),
        nn.BatchNorm2d(16),
        nn.ELU(),
        nn.AvgPool2d(kernel_size=(1, 4)),
        nn.Dropout(p=0.25),
        nn.Conv2d(16, 16, kernel_size=(1, 6), groups=16),
        nn.Conv2d(16, 16, kernel_size=(1, 1)),
        nn.BatchNorm2d(16),
        nn.ELU(),
        nn.AvgPool2d(kernel_size=(1, 8)),
        nn.Dropout(p=0.25),
    )
    self.convnet.eval()
    out = self.convnet(
        np_to_var(np.ones((1, 1, 62, 1000), dtype=np.float32)))
    n_out_time = out.cpu().data.numpy().shape[3]
    # self.final_conv_length = n_out_time
    self.num_hidden = out.size()[1] * out.size()[2] * out.size()[3]
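# Hedged illustration (not from the original repository): the pair
# Conv2d(16, 16, (1, 6), groups=16) followed by Conv2d(16, 16, (1, 1)) above forms a
# depthwise-separable convolution, as in EEGNet. A standalone sketch of that pattern;
# `SeparableConv2d` is an illustrative name, not an existing class in this code base.
import torch
import torch.nn as nn


class SeparableConv2d(nn.Module):
    """Depthwise conv (one filter per input channel) followed by a 1x1 pointwise conv."""

    def __init__(self, in_ch, out_ch, kernel_size):
        super(SeparableConv2d, self).__init__()
        self.depthwise = nn.Conv2d(in_ch, in_ch, kernel_size,
                                   groups=in_ch, bias=False)
        self.pointwise = nn.Conv2d(in_ch, out_ch, kernel_size=1, bias=False)

    def forward(self, x):
        return self.pointwise(self.depthwise(x))


# Shape check on a feature map like the one reaching that block (batch, 16, 1, T):
# SeparableConv2d(16, 16, (1, 6))(torch.randn(1, 16, 1, 234)).shape -> (1, 16, 1, 229)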
def __init__(self, batch_norm=True, batch_norm_alpha=0.1):
    super(Deep4NetWs, self).__init__()
    self.batch_norm = batch_norm
    self.batch_norm_alpha = batch_norm_alpha
    self.convnet = nn.Sequential(
        L.Conv2d(1, 25, kernel_size=(1, 10), stride=1),
        L.Conv2d(25, 25, kernel_size=(62, 1), stride=1, bias=not self.batch_norm),
        L.BatchNorm2d(25),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
        nn.Dropout(p=0.5),
        L.Conv2d(25, 50, kernel_size=(1, 10), stride=1, bias=not self.batch_norm),
        L.BatchNorm2d(50),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
        nn.Dropout(p=0.5),
        L.Conv2d(50, 100, kernel_size=(1, 10), stride=1, bias=not self.batch_norm),
        L.BatchNorm2d(100),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
        nn.Dropout(p=0.5),
        L.Conv2d(100, 100, kernel_size=(1, 10), stride=1),
        L.BatchNorm2d(100),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
    )
    self.convnet.eval()
    out = self.convnet(
        np_to_var(np.ones((1, 1, 62, 400), dtype=np.float32)))
    n_out_time = out.cpu().data.numpy().shape[3]
    self.final_conv_length = n_out_time
    self.num_hidden = self.final_conv_length * 100
def __init__(self, n_classes, input_ch, input_time,
             batch_norm=True, batch_norm_alpha=0.1):
    super(Deep4Net_origin, self).__init__()
    self.batch_norm = batch_norm
    self.batch_norm_alpha = batch_norm_alpha
    self.n_classes = n_classes
    n_ch1 = 25
    n_ch2 = 50
    n_ch3 = 100
    self.n_ch4 = 200
    if self.batch_norm:
        self.convnet = nn.Sequential(
            nn.Conv2d(1, n_ch1, kernel_size=(1, 10), stride=1),
            nn.Conv2d(n_ch1, n_ch1, kernel_size=(input_ch, 1), stride=1,
                      bias=not self.batch_norm),
            nn.BatchNorm2d(n_ch1, momentum=self.batch_norm_alpha,
                           affine=True, eps=1e-5),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
            nn.Dropout(p=0.5),
            nn.Conv2d(n_ch1, n_ch2, kernel_size=(1, 10), stride=1,
                      bias=not self.batch_norm),
            nn.BatchNorm2d(n_ch2, momentum=self.batch_norm_alpha,
                           affine=True, eps=1e-5),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
            nn.Dropout(p=0.5),
            nn.Conv2d(n_ch2, n_ch3, kernel_size=(1, 10), stride=1,
                      bias=not self.batch_norm),
            nn.BatchNorm2d(n_ch3, momentum=self.batch_norm_alpha,
                           affine=True, eps=1e-5),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
            nn.Dropout(p=0.5),
            nn.Conv2d(n_ch3, self.n_ch4, kernel_size=(1, 10), stride=1),
            nn.BatchNorm2d(self.n_ch4, momentum=self.batch_norm_alpha,
                           affine=True, eps=1e-5),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
        )
    else:
        self.convnet = nn.Sequential(
            nn.Conv2d(1, n_ch1, kernel_size=(1, 10), stride=1),
            nn.Conv2d(n_ch1, n_ch1, kernel_size=(input_ch, 1), stride=1,
                      bias=not self.batch_norm),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
            nn.Dropout(p=0.5),
            nn.Conv2d(n_ch1, n_ch2, kernel_size=(1, 10), stride=1,
                      bias=not self.batch_norm),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
            nn.Dropout(p=0.5),
            nn.Conv2d(n_ch2, n_ch3, kernel_size=(1, 10), stride=1,
                      bias=not self.batch_norm),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
            nn.Dropout(p=0.5),
            nn.Conv2d(n_ch3, self.n_ch4, kernel_size=(1, 10), stride=1),
            nn.ELU(),
            nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
        )
    self.convnet.eval()
    out = self.convnet(
        np_to_var(np.ones((1, 1, input_ch, input_time), dtype=np.float32)))
    n_out_time = out.cpu().data.numpy().shape[3]
    self.final_conv_length = n_out_time
    self.num_hidden = self.final_conv_length * self.n_ch4
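# Hedged arithmetic check (illustration only, not repository code): each valid conv or
# pool layer shrinks the time axis to floor((n - k) / s) + 1. For the batch-norm branch
# above with input_time = 1000, the four conv(1, 10) / maxpool(1, 3, stride 3) stages
# leave final_conv_length = 7, so num_hidden = 7 * 200 = 1400.
def _out_len(n, k, s=1):
    return (n - k) // s + 1


_t = 1000
for _k, _s in [(10, 1), (3, 3)] * 4:
    _t = _out_len(_t, _k, _s)
# _t is now 7; num_hidden = 7 * 200 = 1400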
def __init__(self, batch_norm=True, batch_norm_alpha=0.1):
    super(DWConvNet2, self).__init__()
    self.batch_norm = batch_norm
    self.batch_norm_alpha = batch_norm_alpha
    n_ch1 = 100
    n_ch2 = 100
    n_ch3 = 100
    self.convnet = nn.Sequential(
        # nn.Conv3d(1, 50, (1, 1, 10), 1),
        # nn.Conv3d(50, 50, (1, 1, 10), 1),
        nn.Conv2d(62, 62, kernel_size=(1, 30), stride=1, groups=62),
        nn.Conv2d(62, n_ch1, 1, 1, 0, 1, 1, bias=False),
        # nn.ReLU(),
        # nn.Conv2d(n_ch1, n_ch1, kernel_size=(62, 1), stride=1, bias=not self.batch_norm),
        nn.BatchNorm2d(n_ch1, momentum=self.batch_norm_alpha, affine=True, eps=1e-5),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
        nn.Dropout(p=0.5),
        nn.Conv2d(n_ch1, n_ch2, kernel_size=(1, 10), stride=1, bias=not self.batch_norm),
        nn.BatchNorm2d(n_ch2, momentum=self.batch_norm_alpha, affine=True, eps=1e-5),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
        nn.Dropout(p=0.5),
        nn.Conv2d(n_ch2, n_ch3, kernel_size=(1, 10), stride=1, bias=not self.batch_norm),
        nn.BatchNorm2d(n_ch3, momentum=self.batch_norm_alpha, affine=True, eps=1e-5),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 2)),
        nn.Dropout(p=0.5),
        nn.Conv2d(n_ch3, n_ch3, kernel_size=(1, 10), stride=1),
        nn.BatchNorm2d(n_ch3, momentum=self.batch_norm_alpha, affine=True, eps=1e-5),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
    )
    self.convnet.eval()
    out = self.convnet(
        np_to_var(np.ones((1, 62, 1, 400), dtype=np.float32)))
    n_out_time = out.cpu().data.numpy().shape[3]
    self.final_conv_length = n_out_time
    self.num_hidden = self.final_conv_length * 100
def __init__(self, n_classes, batch_norm=True, batch_norm_alpha=0.1):
    super(Deep4Net_origin, self).__init__()
    self.batch_norm = batch_norm
    self.batch_norm_alpha = batch_norm_alpha
    self.n_classes = n_classes
    n_ch1 = 25
    n_ch2 = 50
    n_ch3 = 100
    n_ch4 = 200
    self.convnet = nn.Sequential(
        nn.Conv2d(1, n_ch1, kernel_size=(1, 10), stride=1),
        # nn.ELU(),
        # nn.BatchNorm2d(n_ch1,
        #                momentum=self.batch_norm_alpha,
        #                affine=True,
        #                eps=1e-5),
        nn.Conv2d(n_ch1, n_ch1, kernel_size=(62, 1), stride=1, bias=not self.batch_norm),
        nn.BatchNorm2d(n_ch1, momentum=self.batch_norm_alpha, affine=True, eps=1e-5),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
        nn.Dropout(p=0.5),
        nn.Conv2d(n_ch1, n_ch2, kernel_size=(1, 10), stride=1, bias=not self.batch_norm),
        nn.BatchNorm2d(n_ch2, momentum=self.batch_norm_alpha, affine=True, eps=1e-5),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
        nn.Dropout(p=0.5),
        nn.Conv2d(n_ch2, n_ch3, kernel_size=(1, 10), stride=1, bias=not self.batch_norm),
        nn.BatchNorm2d(n_ch3, momentum=self.batch_norm_alpha, affine=True, eps=1e-5),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
        nn.Dropout(p=0.5),
        nn.Conv2d(n_ch3, n_ch4, kernel_size=(1, 10), stride=1),
        nn.BatchNorm2d(n_ch4, momentum=self.batch_norm_alpha, affine=True, eps=1e-5),
        nn.ELU(),
        nn.MaxPool2d(kernel_size=(1, 3), stride=(1, 3)),
    )
    self.convnet.eval()
    out = self.convnet(np_to_var(np.ones(
        (1, 1, 62, 1000), dtype=np.float32)))
    n_out_time = out.cpu().data.numpy().shape[3]
    self.final_conv_length = n_out_time
    self.num_hidden = self.final_conv_length * n_ch4
    # self.fc = nn.Sequential(nn.Linear(1000, 1000),
    #                         nn.ReLU())
    # self.fc = nn.Linear(1000, 1000)
    # Classifier conv spans the remaining time dimension (dim 3 after the (62, 1)
    # spatial conv), collapsing it to one logit per class. The original code passed
    # self.n_ch4 (never assigned) and a (final_conv_length, 1) kernel, which does not
    # match the (batch, n_ch4, 1, final_conv_length) feature map produced above.
    self.convnet.add_module(
        "conv_classifier",
        nn.Conv2d(
            n_ch4,
            self.n_classes,
            (1, self.final_conv_length),
            bias=True,
        ),
    )
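# Hedged sketch (illustration only, not repository code): how the appended
# "conv_classifier" collapses the remaining time dimension into one logit per class.
# The shapes below assume a 62-channel, 1000-sample input, for which
# final_conv_length works out to 7; n_classes = 4 is chosen just for the example.
import torch
import torch.nn as nn

_features = torch.randn(8, 200, 1, 7)                # (batch, n_ch4, 1, final_conv_length)
_classifier = nn.Conv2d(200, 4, (1, 7), bias=True)   # example classifier conv
_logits = _classifier(_features)                     # -> (8, 4, 1, 1)
_logits = _logits.view(_logits.size(0), -1)          # -> (8, 4), one score per class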
def train():
    # Getting settings from config.py
    max_len = cfg.MAX_TOKEN_LEN
    num_token = cfg.NUM_OF_TOKEN
    imw = cfg.IMW
    imh = cfg.IMH

    # Training params
    is_train = True
    batch_size_const = cfg.GPU_BATCH_SIZE
    num_ite_to_update = cfg.NUM_ITE_TO_UPDATE
    lr = cfg.LR
    momentum = cfg.MOMENTUM
    lr_decay = cfg.LR_DECAY
    max_grad = cfg.MAX_GRAD_CLIP
    num_e = cfg.NUM_EPOCH

    # Tracking/Saving
    last_e = -1
    global_step = 0
    running_loss = 0
    num_ite_to_log = cfg.NUM_ITE_TO_LOG
    num_ite_to_vis = cfg.NUM_ITE_TO_VIS
    num_epoch_to_save = cfg.NUM_EPOCH_TO_SAVE
    all_loss = []
    save_name = cfg.SAVE_NAME
    meta_name = cfg.META_NAME
    vis_path = cfg.VIS_PATH
    use_cuda = cfg.CUDA and torch.cuda.is_available()
    save_path = cfg.MODEL_FOLDER
    dataset_path = cfg.DATASET_PATH + 'CROHME2013_data/TrainINKML/'
    subset_list = cfg.SUBSET_LIST
    scale_factors = cfg.SCALE_FACTORS

    # Load the vocab dictionary for display purposes
    _, id_to_word = get_gt.build_vocab('mathsymbolclass.txt')

    # Initialize the network and load its weights
    net = AGRU()
    save_files = glob.glob(save_path + save_name + '*.dat')
    meta_files = glob.glob(save_path + meta_name + '*.dat')
    if len(save_files) > 0:
        save_file = sorted(save_files)[-1]
        print('Loading network weights saved at %s...' % save_file)
        loadobj = torch.load(save_file)
        net.load_state_dict(loadobj['state_dict'])
        last_e, running_loss, all_loss, lr = util.load_list(
            sorted(meta_files)[-1])
        print('Loading done.')
    if use_cuda:
        net.cuda()

    # For debugging
    if not is_train:
        net.train(False)

    # Get a list of convolutional layers
    conv_layers = util.get_layers(net, lambda x: type(x) == type(net.conv1_3))

    # Get conv parameters
    conv_params = []
    for c in conv_layers:
        for p in c.parameters():
            if p.requires_grad:
                conv_params.append(p)

    # Get a list of trainable layers that are not convolutional
    other_layers = util.get_layers(
        net,
        lambda x: type(x) != type(net.conv1_3) and hasattr(x, 'parameters'))
    other_layers = other_layers[1:]  # The first layer is attend_GRU.AGRU

    # Get GRU parameters
    gru_params = []
    for l in other_layers:
        for p in l.parameters():
            gru_params.append(p)

    # Set different learning rates for conv layers and GRU layers
    optimizer = optim.Adam([{'params': gru_params},
                            {'params': conv_params, 'lr': lr}],
                           lr=lr)

    # Loss function
    criterion = nn.CrossEntropyLoss(ignore_index=1)

    # Get full paths to train inkml files, and create a list of scale factors
    # to be used for rendering train images
    inkml_list = []
    scale_list = []
    for i, subset in enumerate(subset_list):
        subset_inkml_list = glob.glob(dataset_path + subset + '*.inkml')
        inkml_list += subset_inkml_list
        scale_list += [scale_factors[i]] * len(subset_inkml_list)
    inkml_list = np.asarray(inkml_list)
    scale_list = np.asarray(scale_list)
    #inkml_list = inkml_list[0:120]
    #scale_list = scale_list[0:120]

    num_train = len(inkml_list)
    num_ite = int(np.ceil(1.0 * num_train / batch_size_const))

    # Main train loop
    optimizer.zero_grad()
    for e in range(last_e + 1, num_e):
        permu_ind = np.random.permutation(range(num_train))
        inkml_list = inkml_list[permu_ind.astype(int)]
        scale_list = scale_list[permu_ind.astype(int)]

        if e % cfg.NUM_EPOCH_TO_DECAY == cfg.NUM_EPOCH_TO_DECAY - 1:
            lr = lr * lr_decay
            print('Current learning rate: %.8f' % lr)
            optimizer.param_groups[0]['lr'] = lr
            optimizer.param_groups[1]['lr'] = lr

        for i in range(num_ite):
            batch_idx = range(i * batch_size_const, (i + 1) * batch_size_const)
            if batch_idx[-1] >= num_train:
                batch_idx = range(i * batch_size_const, num_train)
            batch_size = len(batch_idx)

            batch_x = util.batch_data(inkml_list[batch_idx],
                                      scale_list[batch_idx], is_train)
            batch_y_np = util.batch_target(inkml_list[batch_idx])
            batch_y = util.np_to_var(batch_y_np, use_cuda)

            pred_y, attention = net(batch_x, batch_y)

            # Convert the 3D tensor to a 2D matrix of shape
            # (batch_size*MAX_TOKEN_LEN, NUM_OF_TOKEN) to compute the log loss
            pred_y = pred_y.view(-1, num_token)

            # Remove the <start> token from the target vector & prediction vector
            batch_y = batch_y.view(batch_size, max_len)
            batch_y = batch_y[:, 1:].contiguous()
            batch_y = batch_y.view(-1)
            pred_y = pred_y.view(batch_size, max_len, num_token)
            pred_y = pred_y[:, 1:].contiguous()
            pred_y = pred_y.view(batch_size * (max_len - 1), num_token)

            loss = criterion(pred_y, batch_y)
            loss.backward()
            running_loss += loss.data[0]

            if global_step % num_ite_to_update == (num_ite_to_update - 1):
                util.grad_clip(net, max_grad)
                optimizer.step()
                optimizer.zero_grad()
                running_loss /= num_ite_to_update
                all_loss.append(running_loss)
                running_loss = 0

            # Printing stuff to the console
            if global_step % num_ite_to_log == (num_ite_to_log - 1):
                print('Finished ite %d/%d, epoch %d/%d, loss: %.5f' %
                      (i, num_ite, e, num_e, all_loss[-1]))

                # Printing prediction and target
                pred_y_np = util.var_to_np(pred_y, use_cuda)
                pred_y_np = np.reshape(pred_y_np,
                                       (batch_size, max_len - 1, num_token))
                # Only display the first sample in the batch
                pred_y_np = pred_y_np[0, 0:40, :]
                pred_y_np = np.argmax(pred_y_np, axis=1)
                pred_list = [id_to_word[idx] for idx in list(pred_y_np)]
                print('Prediction: %s' % ' '.join(pred_list))

                batch_y_np = np.reshape(batch_y_np, (batch_size, max_len))
                batch_y_np = batch_y_np[0, 1:40]
                target_list = [id_to_word[idx] for idx in list(batch_y_np)]
                print('Target: %s\n' % ' '.join(target_list))

            if global_step % num_ite_to_vis == (num_ite_to_vis - 1):
                tmp_x = util.var_to_np(batch_x, use_cuda)[0, :, :, :]
                tmp_x = np.transpose(tmp_x, (1, 2, 0))[:, :, 0:3]
                attention_np = attention.data.cpu().numpy()[0, 2:, :, :]
                for k in range(10):
                    attention_k = attention_np[k, :, :] / np.max(
                        attention_np[k, :, :]) * 0.8
                    attention_k = (scipy.misc.imresize(
                        attention_k, 16.0, interp='bicubic')) / 255.0
                    tmp_x = scipy.misc.imresize(tmp_x, attention_k.shape)
                    attention_k = np.repeat(np.expand_dims(attention_k, 2), 3, 2)
                    attention_k = attention_k * 255
                    attention_k += tmp_x
                    attention_k /= 2.0
                    attention_k[attention_k > 255] = 255
                    attention_k = attention_k.astype(np.uint8)
                    scipy.misc.imsave(vis_path + ('%02d.jpg' % k), attention_k)

                plt.clf()
                plt.plot(all_loss)
                plt.show()
                plt.savefig(vis_path + 'loss.png')

            global_step += 1

        if e % num_epoch_to_save == (num_epoch_to_save - 1):
            print('Saving at epoch %d/%d' % (e, num_e))
            torch.save(
                {'state_dict': net.state_dict(),
                 'opt': optimizer.state_dict()},
                save_path + save_name + ('_%03d' % e) + '.dat')
            metadata = [e, running_loss, all_loss, lr]
            util.save_list(metadata,
                           save_path + meta_name + ('_%03d' % e) + '.dat')
        last_e = e
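# Hedged sketch (illustration only, not repository code) of the gradient-accumulation
# pattern used in train() above: backward() on every mini-batch, but step()/zero_grad()
# only every `accum_steps` iterations, so the effective batch size is
# GPU_BATCH_SIZE * NUM_ITE_TO_UPDATE. All names below are illustrative.
import torch
import torch.nn as nn
import torch.optim as optim


def _grad_accumulation_demo(accum_steps=4):
    """Toy loop: gradients accumulate across steps, one optimizer update per window."""
    model = nn.Linear(10, 2)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    optimizer.zero_grad()
    for step in range(20):
        x = torch.randn(8, 10)
        y = torch.randint(0, 2, (8,))
        loss = criterion(model(x), y)
        loss.backward()  # gradients add up across iterations
        if step % accum_steps == accum_steps - 1:
            # clip_grad_norm_ stands in for util.grad_clip here
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()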
def test():
    # Getting settings from config.py
    max_len = cfg.MAX_TOKEN_LEN
    num_token = cfg.NUM_OF_TOKEN
    imw = cfg.IMW
    imh = cfg.IMH

    # Testing params
    is_train = False
    batch_size = 1

    # Tracking/Saving
    num_ite_to_log = cfg.NUM_ITE_TO_LOG
    num_ite_to_vis = cfg.NUM_ITE_TO_VIS
    save_name = cfg.SAVE_NAME
    test_name = cfg.TEST_NAME
    vis_path = cfg.VIS_PATH
    use_cuda = cfg.CUDA and torch.cuda.is_available()
    save_path = cfg.MODEL_FOLDER
    dataset_path = cfg.DATASET_PATH + 'CROHME2013_data/TestINKML/'
    scale_factor = cfg.TEST_SCALE_FACTOR

    # Load the vocab dictionary for display purposes
    word_to_id, id_to_word = get_gt.build_vocab('mathsymbolclass.txt')
    start_id = word_to_id['<s>']
    stop_id = word_to_id['</s>']

    # Initialize the network and load its weights
    net = AGRU()
    save_files = glob.glob(save_path + save_name + '*.dat')
    if len(save_files) > 0:
        save_file = sorted(save_files)[-1]
        print('Loading network weights saved at %s...' % save_file)
        loadobj = torch.load(save_file)
        net.load_state_dict(loadobj['state_dict'])
        print('Loading done.')
    if use_cuda:
        net.cuda()

    # For debugging
    if not is_train:
        net.train(False)

    # Get full paths to test inkml files, create a list of scale factors
    # to be used for rendering test images
    inkml_list = glob.glob(dataset_path + '*.inkml')
    scale_list = [scale_factor] * len(inkml_list)
    inkml_list = np.asarray(inkml_list)
    scale_list = np.asarray(scale_list)
    #inkml_list = inkml_list[0:120]
    #scale_list = scale_list[0:120]

    num_test = len(inkml_list)
    num_ite = int(np.ceil(1.0 * num_test / batch_size))

    # Exact match and word error rate
    em = []
    wer = []
    all_pred = []
    all_gt = []

    # Main test loop
    for i in range(num_ite):
        batch_idx = range(i * batch_size, (i + 1) * batch_size)
        if batch_idx[-1] >= num_test:
            batch_idx = range(i * batch_size, num_test)
        batch_size = len(batch_idx)

        batch_x = util.batch_data(inkml_list[batch_idx],
                                  scale_list[batch_idx], is_train)
        batch_y_np = util.batch_target(inkml_list[batch_idx])
        batch_y = util.np_to_var(batch_y_np, use_cuda)

        #pred_y, attention = net(batch_x, batch_y)
        pred_y, attention = net.beam_search(batch_x, start_id, stop_id)
        pred_y = util.var_to_np(pred_y, use_cuda)
        pred_y = np.argmax(pred_y, 2)
        batch_y = np.reshape(batch_y_np, (batch_size, max_len))

        print('Finished ite %d/%d.' % (i, num_ite))

        j = 0
        pred_string = pred_y[j, :]
        pred_string = [id_to_word[idx] for idx in list(pred_string)]
        gt_string = batch_y[0, :]
        gt_string = [id_to_word[idx] for idx in list(gt_string)]
        all_pred.append(pred_string)
        all_gt.append(gt_string)
        em.append(util.exact_match(pred_string, gt_string))
        if '</s>' in pred_string:
            pred_string = pred_string[0:pred_string.index('</s>') + 1]
            gt_string = gt_string[0:gt_string.index('</s>') + 1]
            wer.append(util.levenshtein_distance(pred_string, gt_string))

        if i % 4 == 0:
            continue

        # Printing stuff to the console
        print('Prediction: %s' % ' '.join(pred_string))
        print('Target: %s\n' % ' '.join(gt_string))

        # Save attention to files for visualization
        file_name = ntpath.basename(inkml_list[batch_idx[j]])[:-6]
        vis_path_j = vis_path + file_name + '/'
        if not os.path.exists(vis_path_j):
            os.makedirs(vis_path_j)
        tmp_x = np.sum(batch_x.data.cpu().numpy()[j, :, :, :], axis=0)
        attention_np = attention.data.cpu().numpy()[j, 1:, :, :]
        pred_string = pred_string[1:]
        for k, word in enumerate(pred_string):
            word = word.replace('/', 'slash_')
            attention_k = attention_np[k, :, :] / np.max(
                attention_np[k, :, :]) * 0.8
            attention_k = (scipy.misc.imresize(attention_k, 16.0)) / 255.0
            tmp_x = scipy.misc.imresize(tmp_x, attention_k.shape)
            attention_k += tmp_x
            attention_k[attention_k > 1] = 1
            try:
                scipy.misc.imsave(vis_path_j + ('%02d_%s.jpg' % (k, word)),
                                  attention_k)
            except FileNotFoundError:
                pdb.set_trace()
            if word == '<slash_s>':
                break
        #pdb.set_trace()

    print("Exact match count: %d/%d" % (sum(em), len(em)))
    print("Word error rate: %.5f" % (np.mean(wer)))
    pdb.set_trace()
    util.save_list([em, wer, all_pred, all_gt],
                   save_path + test_name + '.dat')
    pdb.set_trace()
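# Hedged sketch (illustration only): util.exact_match and util.levenshtein_distance are
# repository helpers whose implementations are not shown here. A standard edit distance
# between token sequences, as used for the word error rate above, could look like this.
def levenshtein(a, b):
    """Minimum number of insertions, deletions, and substitutions turning a into b."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        cur = [i]
        for j, cb in enumerate(b, 1):
            cur.append(min(prev[j] + 1,                 # deletion
                           cur[j - 1] + 1,              # insertion
                           prev[j - 1] + (ca != cb)))   # substitution
        prev = cur
    return prev[-1]


# levenshtein('x + 1'.split(), 'x + 2'.split()) -> 1
# Exact match is simply: pred_tokens == gt_tokens.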