def cifar_ten_basic_recognition():
    batch_size = 256
    train_loader = data_preprocessing.load_cifar_ten.get_train_loader(batch_size)
    test_loader = data_preprocessing.load_cifar_ten.get_test_loader(batch_size)

    # test_mdrnn_cell()
    # test_mdrnn()

    input_height = 32
    input_width = 32
    input_channels = 3
    hidden_states_size = 32
    # https://stackoverflow.com/questions/45027234/strange-loss-curve-while-training-lstm-with-keras
    # Possibly a batch size of 128 leads to more instability in training?
    # batch_size = 128
    compute_multi_directional = False
    # https://discuss.pytorch.org/t/dropout-changing-between-training-mode-and-eval-mode/6833
    use_dropout = False

    # TODO: Add gradient clipping? This might also make training more stable.
    # Interesting links with tips on how to fix training:
    # https://blog.slavv.com/37-reasons-why-your-neural-network-is-not-working-4020854bd607
    # https://discuss.pytorch.org/t/about-torch-nn-utils-clip-grad-norm/13873
    # https://discuss.pytorch.org/t/proper-way-to-do-gradient-clipping/191

    input_size = SizeTwoDimensional.create_size_two_dimensional(input_height, input_width)
    # with torch.autograd.profiler.profile(use_cuda=False) as prof:
    train_mdrnn(train_loader, test_loader, input_channels, input_size,
                hidden_states_size, batch_size, compute_multi_directional, use_dropout)
def forward(self, x):
    # Earlier variant for list-structured input and output, kept for reference:
    # if self.input_and_output_are_lists:
    #     tensor_list_chunking = TensorListChunking.create_tensor_list_chunking(x, self.block_size)
    #     x_chunked = tensor_list_chunking.\
    #         chunk_tensor_list_into_blocks_concatenate_along_batch_dimension(x, True)
    #     output = self.multi_dimensional_lstm(x_chunked)
    #     output_ordered_back_to_input_format = tensor_list_chunking.\
    #         dechunk_block_tensor_concatenated_along_batch_dimension(output)
    #     return output_ordered_back_to_input_format
    # else:

    original_size = SizeTwoDimensional.create_size_two_dimensional(x.size(2), x.size(3))
    # Tensor chunking is created dynamically, so that every batch may have a different
    # two-dimensional size (within each batch, examples must still be of the same size).
    tensor_chunking = TensorChunking.create_tensor_chunking(original_size, self.block_size)
    x_chunked = tensor_chunking.chunk_tensor_into_blocks_concatenate_along_batch_dimension(x)
    output = self.multi_dimensional_lstm(x_chunked)
    output_ordered_back_to_input_format = tensor_chunking.\
        dechunk_block_tensor_concatenated_along_batch_dimension(output)
    return output_ordered_back_to_input_format
def test_tensor_list_block_chunking_followed_by_dechunking_reconstructs_original_multiple_block_rows(
        tensors_all_have_same_height: bool):
    tensor_one = torch.arange(1., 33.).view(2, 4, 4)
    tensor_two = torch.arange(33., 65.).view(2, 4, 4)
    block_size = SizeTwoDimensional.create_size_two_dimensional(2, 2)
    test_tensor_list_block_chunking_followed_by_dechunking_reconstructs_original(
        tensor_one, tensor_two, block_size, tensors_all_have_same_height)
def test_tensor_block_chunking_followed_by_dechunking_reconstructs_original():
    tensor = torch.arange(1., 97.).view(2, 2, 4, 6)
    if Utils.use_cuda():
        tensor = tensor.cuda()
    print(tensor)
    print("tensor[0, 0, :, :]: " + str(tensor[0, 0, :, :]))

    # chunking = chunk_tensor_into_blocks_return_as_list(
    #     tensor, SizeTwoDimensional.create_size_two_dimensional(2, 2))
    # print("chunking: " + str(chunking))
    # for item in chunking:
    #     print("item.size(): " + str(item.size()))

    original_size = SizeTwoDimensional.create_size_two_dimensional(4, 6)
    block_size = SizeTwoDimensional.create_size_two_dimensional(2, 2)
    tensor_chunking = TensorChunking.create_tensor_chunking(original_size, block_size)
    chunking = tensor_chunking.chunk_tensor_into_blocks_concatenate_along_batch_dimension(tensor)
    print("chunking: " + str(chunking))
    print("chunking.size(): " + str(chunking.size()))

    dechunked_tensor = tensor_chunking.dechunk_block_tensor_concatenated_along_batch_dimension(chunking)
    print("dechunked_tensor: " + str(dechunked_tensor))

    # https://stackoverflow.com/questions/32996281/how-to-check-if-two-torch-tensors-or-matrices-are-equal
    # https://discuss.pytorch.org/t/tensor-math-logical-operations-any-and-all-functions/6624
    tensors_are_equal = torch.eq(tensor, dechunked_tensor).all()
    print("tensors_are_equal: " + str(tensors_are_equal))
    if not tensors_are_equal:
        raise RuntimeError("Error: original tensor " + str(tensor) +
                           " and dechunked tensor " + str(dechunked_tensor) +
                           " are not equal")
    else:
        print("Success: original tensor and dechunked(chunked(tensor)) are equal")
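# Illustrative sketch (not part of the original code): what "chunking a tensor into
# blocks concatenated along the batch dimension" amounts to, using only plain torch
# reshape/permute operations. The helper name `chunk_into_blocks_sketch` is
# hypothetical, and the exact block ordering of the project's TensorChunking class
# may differ; the round-trip test above is the authoritative check.
def chunk_into_blocks_sketch(tensor, block_height: int, block_width: int):
    batch_size, channels, height, width = tensor.size()
    # Split height and width into grids of blocks: (b, c, h/bh, bh, w/bw, bw)
    blocks = tensor.view(batch_size, channels,
                         height // block_height, block_height,
                         width // block_width, block_width)
    # Move the grid dimensions next to the batch dimension and flatten them into it,
    # yielding (b * blocks_per_image, c, bh, bw)
    blocks = blocks.permute(0, 2, 4, 1, 3, 5).contiguous()
    return blocks.view(-1, channels, block_height, block_width)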
def test_tensor_block_stacking():
    tensor = torch.arange(1., 17.).view(4, 4)
    print("original tensor: " + str(tensor))
    result = TensorBlockStacking.rescale_tensor_by_stacking_tensor_blocks(
        tensor, SizeTwoDimensional.create_size_two_dimensional(2, 2), 1)
    print("result: " + str(result))
    expected_result = torch.Tensor([[[1., 3.], [9., 11.]],
                                    [[2., 4.], [10., 12.]],
                                    [[5., 7.], [13., 15.]],
                                    [[6., 8.], [14., 16.]]])
    if not util.tensor_utils.TensorUtils.tensors_are_equal(result, expected_result):
        raise RuntimeError("Error: expected the result to be equal to: " +
                           str(expected_result) + " but got: " + str(result))
def get_original_sizes_from_tensor_list(tensor_list: list):
    result = list([])
    for x in tensor_list:
        if TensorUtils.number_of_dimensions(x) != 3:
            raise RuntimeError("Error: tensor x with size " + str(x.size()) +
                               " does not have 3 dimensions, as required")
        original_size = SizeTwoDimensional.create_size_two_dimensional(x.size(1), x.size(2))
        result.append(original_size)
    return result
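# Usage sketch (hypothetical tensors): each list element is a (channels, height, width)
# image tensor, and the function collects the per-image two-dimensional sizes:
# sizes = get_original_sizes_from_tensor_list([torch.zeros(1, 64, 128),
#                                              torch.zeros(1, 64, 256)])
# yields the sizes (64, 128) and (64, 256) as SizeTwoDimensional objects.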
def compute_activations_with_block_mdlstm(self, x):
    network_consumed_block_size = SizeTwoDimensional(
        self.get_real_network().get_height_reduction_factor(),
        self.get_real_network().get_width_reduction_factor())

    # # Plot two-row images for debugging
    # for element in x:
    #     if element.size(1) > 64:
    #         print("image to be plotted size: " + str(element.size()))
    #         element_without_channel_dimension = element.squeeze(0)
    #         util.image_visualization.imshow_tensor_2d(element_without_channel_dimension)

    tensor_list_chunking = TensorListChunking.create_tensor_list_chunking(
        x, network_consumed_block_size)
    # Chunk the input
    input_chunked = tensor_list_chunking.\
        chunk_tensor_list_into_blocks_concatenate_along_batch_dimension(x, False)
    # Debugging: check that the de-chunked version recovers the original
    ModuleIOStructuring.\
        check_dechunking_chunked_tensor_list_recovers_original(tensor_list_chunking, x, input_chunked)
    # Compute the activations on the chunked input
    activations_chunked = self.network(input_chunked)
    # De-chunk the chunked activations
    activations = NetworkToSoftMaxNetwork.dechunk_activations(activations_chunked, tensor_list_chunking)
    multiple_output_directions = (self.input_network_produces_multiple_output_directions
                                  or self.use_example_packing)
    return NetworkToSoftMaxNetwork.get_activations_single_tensor_and_activation_heights_and_widths(
        activations, multiple_output_directions)
def train_mdrnn(train_loader, test_loader, input_channels: int, input_size: SizeTwoDimensional,
                hidden_states_size: int, batch_size,
                compute_multi_directional: bool, use_dropout: bool):
    import torch.optim as optim

    criterion = nn.CrossEntropyLoss()

    # Earlier model variants, kept for reference:
    # multi_dimensional_rnn = MultiDimensionalRNN.create_multi_dimensional_rnn(
    #     hidden_states_size, batch_size, compute_multi_directional, nonlinearity="sigmoid")
    # multi_dimensional_rnn = MultiDimensionalRNNFast.create_multi_dimensional_rnn_fast(
    #     hidden_states_size, batch_size, compute_multi_directional, use_dropout,
    #     nonlinearity="sigmoid")
    # multi_dimensional_rnn = MultiDimensionalLSTM.create_multi_dimensional_lstm(
    #     hidden_states_size, batch_size, compute_multi_directional, use_dropout,
    #     nonlinearity="sigmoid")
    # multi_dimensional_rnn = MultiDimensionalLSTM.create_multi_dimensional_lstm_fast(
    #     input_channels, hidden_states_size, compute_multi_directional, use_dropout,
    #     nonlinearity="sigmoid")
    # multi_dimensional_rnn = BlockMultiDimensionalLSTM.create_block_multi_dimensional_lstm(
    #     input_channels, hidden_states_size, mdlstm_block_size, compute_multi_directional,
    #     use_dropout, nonlinearity="sigmoid")
    #
    # block_strided_convolution_block_size = SizeTwoDimensional.create_size_two_dimensional(4, 4)
    # output_channels = mdlstm_block_size.width * mdlstm_block_size.height * hidden_states_size
    # multi_dimensional_rnn = BlockMultiDimensionalLSTMLayerPair.\
    #     create_block_multi_dimensional_lstm_layer_pair(input_channels, hidden_states_size,
    #                                                    output_channels, mdlstm_block_size,
    #                                                    block_strided_convolution_block_size,
    #                                                    compute_multi_directional, use_dropout,
    #                                                    nonlinearity="tanh")
    #
    # An intermediate test case: first a layer pair consisting of a BlockMultiDimensionalLSTM
    # layer followed by a block-strided convolution layer, then an additional single
    # block_strided_convolution layer (as opposed to another full layer pair):
    # mdlstm_block_size = SizeTwoDimensional.create_size_two_dimensional(4, 4)
    # block_strided_convolution_block_size = SizeTwoDimensional.create_size_two_dimensional(4, 4)
    # multi_dimensional_rnn = BlockMultiDimensionalLSTMLayerPairStacking.\
    #     create_one_layer_pair_plus_second_block_convolution_layer_network(
    #         hidden_states_size, mdlstm_block_size, block_strided_convolution_block_size)
    #
    # The same intermediate test case, but with an additional single MDLSTM layer as the
    # final layer instead:
    # multi_dimensional_rnn = BlockMultiDimensionalLSTMLayerPairStacking.\
    #     create_one_layer_pair_plus_second_block_mdlstm_layer_network(
    #         hidden_states_size, mdlstm_block_size, block_strided_convolution_block_size)

    # http://pytorch.org/docs/master/notes/cuda.html
    device = torch.device("cuda:0")
    # device_ids lists all the GPUs that may be used for parallelization;
    # device is the initial device the model will be put on, and must be
    # included in device_ids.
    # device_ids = [0, 1]
    device_ids = [0]

    mdlstm_block_size = SizeTwoDimensional.create_size_two_dimensional(4, 2)
    block_strided_convolution_block_size = SizeTwoDimensional.create_size_two_dimensional(4, 2)
    multi_dimensional_rnn = MultiDimensionalLSTMLayerPairStacking.\
        create_two_layer_pair_network(hidden_states_size, mdlstm_block_size,
                                      block_strided_convolution_block_size, False)

    network = MultiDimensionalRNNToSingleClassNetwork.\
        create_multi_dimensional_rnn_to_single_class_network(multi_dimensional_rnn, input_size)
    # multi_dimensional_rnn = Net()

    if Utils.use_cuda():
        # multi_dimensional_rnn = multi_dimensional_rnn.cuda()
        network = nn.DataParallel(network, device_ids=device_ids)
        network.to(device)

    # Debugging prints of the per-direction parameters, kept for reference:
    # print("multi_dimensional_rnn.module.mdlstm_direction_one_parameters."
    #       "parallel_memory_state_column_computation: " +
    #       str(multi_dimensional_rnn.module.mdlstm_direction_one_parameters.
    #           parallel_memory_state_column_computation))
    # print("multi_dimensional_rnn.module.mdlstm_direction_one_parameters."
    #       "parallel_memory_state_column_computation.parallel_convolution.bias: " +
    #       str(multi_dimensional_rnn.module.mdlstm_direction_one_parameters.
    #           parallel_memory_state_column_computation.parallel_convolution.bias))
    # print("multi_dimensional_rnn.module.mdlstm_direction_one_parameters."
    #       "parallel_hidden_state_column_computation.parallel_convolution.bias: " +
    #       str(multi_dimensional_rnn.module.mdlstm_direction_one_parameters.
    #           parallel_hidden_state_column_computation.parallel_convolution.bias))

    print_number_of_parameters(multi_dimensional_rnn)

    # optimizer = optim.SGD(multi_dimensional_rnn.parameters(), lr=0.001, momentum=0.9)
    # Adding some weight decay seems to do magic, see: http://pytorch.org/docs/master/optim.html
    optimizer = optim.SGD(network.parameters(), lr=0.001, momentum=0.9, weight_decay=1e-5)
    # Faster learning:
    # optimizer = optim.SGD(multi_dimensional_rnn.parameters(), lr=0.01, momentum=0.9)

    start = time.time()
    num_gradient_corrections = 0

    for epoch in range(4):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            # get the inputs
            inputs, labels = data
            if Utils.use_cuda():
                inputs = inputs.to(device)
                labels = labels.to(device)
            # Set requires_grad(True) directly and only for the input;
            # the labels need no gradient
            inputs.requires_grad_(True)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            time_start_network_forward = time.time()
            outputs = network(inputs)
            # print("Time used for network forward: " + str(util.timing.time_since(time_start_network_forward)))
            time_start_loss_computation = time.time()
            loss = criterion(outputs, labels)
            # print("Time used for loss computation: " + str(util.timing.time_since(time_start_loss_computation)))

            # Debugging-only: trace bad gradients, then abort. This must stay disabled
            # for normal training, since it raises after the first batch.
            # get_dot = modules.find_bad_gradients.register_hooks(outputs)
            # loss.backward()
            # dot = get_dot()
            # dot.save('mdlstm_find_bad_gradients.dot')
            # render('dot', 'png', 'mdlstm_find_bad_gradients.dot')
            # raise RuntimeError("stopping after find bad gradients")

            time_start_loss_backward = time.time()
            loss.backward()
            # print("Time used for loss backward: " + str(util.timing.time_since(time_start_loss_backward)))

            # Perform gradient clipping
            made_gradient_norm_based_correction = clip_gradient(multi_dimensional_rnn)
            if made_gradient_norm_based_correction:
                num_gradient_corrections += 1
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            # if i % 2000 == 1999:  # print every 2000 mini-batches
            # See: https://stackoverflow.com/questions/5598181/python-multiple-prints-on-the-same-line
            # print(str(i) + ",", end="", flush=True)
            if i % 100 == 99:  # print every 100 mini-batches
                running_time = time.time() - start
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 100) +
                      " Running time: " + str(running_time))
                print("Number of gradient norm-based corrections: " + str(num_gradient_corrections))
                running_loss = 0.0
                num_gradient_corrections = 0

    print('Finished Training')

    # Run evaluation
    # multi_dimensional_rnn.set_training(False)  # Normal case
    network.module.set_training(False)  # When using DataParallel
    evaluate_mdrnn(test_loader, network, batch_size, device)
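# Sketch of a norm-based gradient clipping helper in the spirit of the clip_gradient
# call above (an assumption: the project's own clip_gradient is defined elsewhere and
# may differ, including in its threshold). It uses the standard
# torch.nn.utils.clip_grad_norm_ API and reports whether a correction was made,
# matching how the training loop counts corrections.
def clip_gradient_sketch(model, max_norm: float = 10.0) -> bool:
    total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    # A correction was applied when the pre-clipping norm exceeded the threshold
    return float(total_norm) > max_norm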
def dechunk_activations(activations_chunked, tensor_list_chunking):
    return tensor_list_chunking.\
        dechunk_block_tensor_concatenated_along_batch_dimension_changed_block_size(
            activations_chunked, SizeTwoDimensional(1, 1))
def compute_forward_from_chunked_input_using_portions(self, x_chunked, tensor_list_chunking):
    # Sum the results for multiple directions contained in chunks of the result
    if self.compute_multi_directional:
        cat_list = list([])
        data_portions = torch.chunk(x_chunked, 4, 0)
        for data_portion in data_portions:
            data_portion_conv_result = self.compute_forward_one_directional(data_portion)
            data_portion_results_per_direction = torch.chunk(data_portion_conv_result, 4, 1)
            data_portion_result = torch.sum(torch.stack(data_portion_results_per_direction, 0), 0)
            cat_list.append(data_portion_result)
        result = torch.cat(cat_list, 0)
    else:
        result = self.compute_forward_one_directional(x_chunked)

    if self.use_example_packing:
        result = tensor_list_chunking.\
            dechunk_block_tensor_concatenated_along_batch_dimension_changed_block_size(
                result, SizeTwoDimensional(1, 1))
    return result
def compute_forward_from_chunked_input(self, x_chunked, tensor_list_chunking):
    result = self.compute_forward_one_directional(x_chunked)

    # If the input and output are lists, the output of the convolution and activation
    # function must be converted back to the original list format:
    # if self.input_and_output_are_list:
    #     convolution_output_size = SizeTwoDimensional.create_size_two_dimensional(1, 1)
    #     output_ordered_back_to_input_format = tensor_list_chunking.\
    #         dechunk_block_tensor_concatenated_along_batch_dimension_changed_block_size(
    #             result, convolution_output_size)
    #     return output_ordered_back_to_input_format

    # Sum the results for multiple directions contained in chunks of the result.
    # If the weights are shared across directions, this summation has already been
    # done over the inputs before computing the convolution.
    if self.compute_multi_directional and not self.share_weights_across_directions:
        result = BlockStridedConvolution.chunk_four_parts_on_channel_dimension_and_sum(result)
        # result = TensorUtils.sum_list_of_tensors(results_per_direction)

    if self.use_example_packing:
        result = tensor_list_chunking.\
            dechunk_block_tensor_concatenated_along_batch_dimension_changed_block_size(
                result, SizeTwoDimensional(1, 1))
    return result
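# Illustrative sketch (an assumption, not the project's code) of the per-direction
# summation performed by chunk_four_parts_on_channel_dimension_and_sum above: the four
# direction results are stored as equal chunks along the channel dimension and are
# summed into a single tensor, mirroring the pattern used in
# compute_forward_from_chunked_input_using_portions.
def sum_four_direction_chunks_sketch(result):
    # result: (batch, 4 * channels, height, width) -> (batch, channels, height, width)
    results_per_direction = torch.chunk(result, 4, 1)
    return torch.sum(torch.stack(results_per_direction, 0), 0)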
def get_output_size_two_dimensional(self, input_size: SizeTwoDimensional):
    # The block-strided convolution reduces each spatial dimension by the
    # corresponding block-size factor
    block_size = self.block_size
    height = input_size.height // block_size.height
    width = input_size.width // block_size.width
    return SizeTwoDimensional.create_size_two_dimensional(height, width)
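# Example: with the 4 x 2 blocks used in train_mdrnn above, a 32 x 32 input yields
# an 8 x 16 output size (32 / 4 = 8 in height, 32 / 2 = 16 in width).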