# These fragments assume the imports below plus the project's ctc_utils
# helper module (resize, normalize, word_separator, sparse_tensor_to_strs).
import cv2
import numpy as np
import tensorflow as tf  # TensorFlow 1.x API is used throughout

import ctc_utils


def getValidation(self, params):
    # Build (and cache) the whole validation set as a single padded batch.
    if self.validation_dict is None:
        images = []
        labels = []

        # Read files
        for sample_filepath in self.validation_list:
            sample_fullpath = self.corpus_dirpath + '/' + sample_filepath + '/' + sample_filepath

            # IMAGE (grayscale is assumed)
            sample_img = cv2.imread(sample_fullpath + '.png', cv2.IMREAD_GRAYSCALE)
            height = params['img_height']
            sample_img = ctc_utils.resize(sample_img, height)
            images.append(ctc_utils.normalize(sample_img))

            # GROUND TRUTH
            if self.semantic:
                sample_full_filepath = sample_fullpath + '.semantic'
            else:
                sample_full_filepath = sample_fullpath + '.agnostic'

            sample_gt_file = open(sample_full_filepath, 'r')
            sample_gt_plain = sample_gt_file.readline().rstrip().split(ctc_utils.word_separator())
            sample_gt_file.close()

            labels.append([self.word2int[lab] for lab in sample_gt_plain])

        # Transform to batch: pad every image to the widest one
        image_widths = [img.shape[1] for img in images]
        max_image_width = max(image_widths)

        batch_images = np.ones(shape=[len(self.validation_list),
                                      params['img_height'],
                                      max_image_width,
                                      params['img_channels']],
                               dtype=np.float32) * self.PAD_COLUMN

        for i, img in enumerate(images):
            batch_images[i, 0:img.shape[0], 0:img.shape[1], 0] = img

        # LENGTH: each pooling layer divides the width, so the CTC
        # sequence length is the padded width after all reductions
        width_reduction = 1
        for i in range(params['conv_blocks']):
            width_reduction = width_reduction * params['conv_pooling_size'][i][1]

        # Integer division: CTC expects integer sequence lengths
        lengths = [batch_images.shape[2] // width_reduction] * batch_images.shape[0]

        self.validation_dict = {
            'inputs': batch_images,
            'seq_lengths': np.asarray(lengths),
            'targets': labels,
        }

    return self.validation_dict, len(self.validation_list)
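# The loader above leans on two small image helpers from ctc_utils. A minimal
# sketch of both is below; the exact behavior of the real module may differ
# (this assumes "resize to a fixed height, keep aspect ratio" and
# "invert + scale to [0, 1]"), so treat it as illustrative, not authoritative.

def resize(img, height):
    # Scale to the target height while preserving the aspect ratio.
    width = int(img.shape[1] * height / img.shape[0])
    return cv2.resize(img, (width, height))


def normalize(img):
    # Map the white background to 0 and ink to positive values in [0, 1].
    return (255.0 - img) / 255.0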
def nextBatch(self, params):
    # Simplified training-batch variant with a hard-coded batch size (16)
    # and image height (128).
    images = []
    labels = []

    for i in range(16):
        temp_filepath = self.training_data[self.curr_idx]
        full_path = self.data_dirpath + '/' + temp_filepath + '/' + temp_filepath

        # IMAGE (grayscale)
        sample_img = cv2.imread(full_path + '.png', cv2.IMREAD_GRAYSCALE)
        height = 128
        sample_img = ctc_utils.resize(sample_img, height)
        images.append(ctc_utils.normalize(sample_img))

        # GROUND TRUTH (semantic encoding)
        sample_full_filepath = full_path + '.semantic'
        gt_file = open(sample_full_filepath, 'r')
        gt_list = gt_file.readline().rstrip().split(ctc_utils.word_separator())
        gt_file.close()

        labels.append([self.word2int[lab] for lab in gt_list])

        # Advance the cursor, wrapping around at the end of an epoch
        self.curr_idx = (self.curr_idx + 1) % len(self.training_data)

    # Pad every image to the widest one in the batch
    image_widths = [img.shape[1] for img in images]
    max_width = max(image_widths)

    batch_images = np.ones(shape=[16, 128, max_width, 1],
                           dtype=np.float32) * self.PAD_COLUMN
    for i, img in enumerate(images):
        batch_images[i, 0:img.shape[0], 0:img.shape[1], 0] = img

    # The original referenced an undefined `conv_pool`; the pooling sizes
    # come from params, as in the other loaders.
    width_reduction = 1
    for i in range(4):
        width_reduction = width_reduction * params['conv_pooling_size'][i][1]

    lengths = [batch_images.shape[2] // width_reduction] * batch_images.shape[0]

    return {
        'inputs': batch_images,
        'seq_lengths': np.asarray(lengths),
        'targets': labels,
    }
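# The 'targets' the batch returns are ragged Python lists; TF1's
# tf.nn.ctc_loss wants them as a sparse (indices, values, shape) triple.
# A minimal conversion sketch follows (the real ctc_utils may provide an
# equivalent helper; this version is an assumption):

def sparse_tuple_from(sequences):
    indices = []
    values = []
    for n, seq in enumerate(sequences):
        indices.extend([(n, t) for t in range(len(seq))])
        values.extend(seq)
    indices = np.asarray(indices, dtype=np.int64)
    values = np.asarray(values, dtype=np.int32)
    shape = np.asarray([len(sequences), max(len(s) for s in sequences)],
                       dtype=np.int64)
    return indices, values, shape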
# Inference fragment: decode a set of pre-segmented staff images.
# `inputs`, `corpus`, `input`, `seq_len`, `rnn_keep_prob`, `int2word`
# and `logits` are defined earlier in the original script.

# Constants that are saved inside the model itself
WIDTH_REDUCTION, HEIGHT = sess.run([width_reduction_tensor, height_tensor])

decoded, _ = tf.nn.ctc_greedy_decoder(logits, seq_len)

results = []
minres = 10
maxres = -1

for x_in in inputs:
    imgpath = f'{corpus}/{x_in}/{x_in}.jpg'
    image = cv2.imread(imgpath, cv2.IMREAD_GRAYSCALE)
    image = ctc_utils.resize(image, HEIGHT)
    image = ctc_utils.normalize(image)
    image = np.asarray(image).reshape(1, image.shape[0], image.shape[1], 1)

    # Integer division: CTC expects integer sequence lengths
    seq_lengths = [image.shape[2] // WIDTH_REDUCTION]

    prediction = sess.run(decoded, feed_dict={
        input: image,
        seq_len: seq_lengths,
        rnn_keep_prob: 1.0,
    })

    str_predictions = ctc_utils.sparse_tensor_to_strs(prediction)
    output = ""
    for w in str_predictions[0]:
        # Tokens are concatenated back-to-back; insert a separator here
        # if human-readable output is needed.
        output += str(int2word[w])
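# ctc_utils.sparse_tensor_to_strs turns the SparseTensorValue returned by
# ctc_greedy_decoder back into one label list per batch element. A minimal
# sketch of that behavior (the real helper may differ in its details):

def sparse_tensor_to_strs(sparse_tensor):
    indices = sparse_tensor[0][0]      # (row, col) position of each label
    values = sparse_tensor[0][1]       # the label ids themselves
    dense_shape = sparse_tensor[0][2]  # (batch_size, max_decoded_length)
    strs = [[] for _ in range(dense_shape[0])]
    for idx, (row, _col) in enumerate(indices):
        strs[row].append(values[idx])
    return strs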
def nextBatch(self, params, mode='Train'):
    images = []
    labels = []

    # Read files
    for _ in range(params['batch_size']):
        if mode == 'Train':
            sample_filepath = self.training_list[self.current_idx]
        elif mode == 'Validation':
            sample_filepath = self.validation_list[self.current_val_idx]
        sample_fullpath = self.corpus_dirpath + '/' + sample_filepath + '/' + sample_filepath

        # IMAGE (grayscale is assumed)
        if self.distortions:
            sample_img = cv2.imread(sample_fullpath + '_distorted.jpg', cv2.IMREAD_GRAYSCALE)
        else:
            sample_img = cv2.imread(sample_fullpath + '.png', cv2.IMREAD_GRAYSCALE)

        height = params['img_height']
        sample_img = ctc_utils.resize(sample_img, height)
        images.append(ctc_utils.normalize(sample_img))

        # GROUND TRUTH
        if self.semantic:
            sample_full_filepath = sample_fullpath + '.semantic'
        else:
            sample_full_filepath = sample_fullpath + '.agnostic'

        sample_gt_file = open(sample_full_filepath, 'r')
        sample_gt_plain = sample_gt_file.readline().rstrip().split(ctc_utils.word_separator())
        sample_gt_file.close()

        labels.append([self.word2int[lab] for lab in sample_gt_plain])

        # Advance the cursor for the requested split, wrapping at the end
        if mode == 'Train':
            self.current_idx = (self.current_idx + 1) % len(self.training_list)
        elif mode == 'Validation':
            self.current_val_idx = (self.current_val_idx + 1) % len(self.validation_list)

    # Transform to batch: pad every image to the widest one
    image_widths = [img.shape[1] for img in images]
    max_image_width = max(image_widths)

    batch_images = np.ones(shape=[params['batch_size'],
                                  params['img_height'],
                                  max_image_width,
                                  params['img_channels']],
                           dtype=np.float32) * self.PAD_COLUMN

    for i, img in enumerate(images):
        batch_images[i, 0:img.shape[0], 0:img.shape[1], 0] = img

    # LENGTH: CTC sequence length is the padded width after all pooling
    width_reduction = 1
    for i in range(params['conv_blocks']):
        width_reduction = width_reduction * params['conv_pooling_size'][i][1]

    lengths = [batch_images.shape[2] // width_reduction] * batch_images.shape[0]

    return {
        'inputs': batch_images,
        'seq_lengths': np.asarray(lengths),
        'targets': labels,
    }
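# A hedged sketch of how a training loop might consume nextBatch. The
# placeholder/op names (model_input, seq_lengths_ph, targets_ph, loss_op,
# train_op) and num_steps are assumptions, not the project's actual graph
# names; sparse_tuple_from is the sketch shown earlier.
for step in range(num_steps):
    batch = data_loader.nextBatch(params, mode='Train')
    sparse_targets = sparse_tuple_from(batch['targets'])
    _, loss = sess.run([train_op, loss_op], feed_dict={
        model_input: batch['inputs'],
        seq_lengths_ph: batch['seq_lengths'],
        targets_ph: sparse_targets,
        rnn_keep_prob: 0.5,
    })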
def main(ms_file_name, line_freq, output_file):
    tf.reset_default_graph()
    sess = tf.InteractiveSession()

    # Load vocabulary
    int2word = read_vocab("models/vocabulary_semantic.txt")

    # Restore weights
    model = "models/semantic_model.meta"
    saver = tf.train.import_meta_graph(model)
    saver.restore(sess, model[:-5])

    graph = tf.get_default_graph()
    model_input = graph.get_tensor_by_name("model_input:0")
    seq_len = graph.get_tensor_by_name("seq_lengths:0")
    rnn_keep_prob = graph.get_tensor_by_name("keep_prob:0")
    height_tensor = graph.get_tensor_by_name("input_height:0")
    width_reduction_tensor = graph.get_tensor_by_name("width_reduction:0")
    logits = tf.get_collection("logits")[0]

    # Constants that are saved inside the model itself
    WIDTH_REDUCTION, HEIGHT = sess.run([width_reduction_tensor, height_tensor])

    decoded, _ = tf.nn.ctc_greedy_decoder(logits, seq_len)

    # Split the music score into staff lines
    print(f"Processing {ms_file_name}\n")
    lines = split_score(ms_file_name, line_freq)

    output = open(output_file, "w")

    # Process each staff line and append its symbols to the output file
    for idx, line in enumerate(lines):
        # Write the line image to the samples directory for inspection
        print(f"./samples/sample{idx}.png\n")
        cv2.imwrite(f"./samples/sample{idx}.png", line)

        gray = cv2.cvtColor(line, cv2.COLOR_BGR2GRAY)
        image = ctc_utils.resize(gray, HEIGHT)
        image = ctc_utils.normalize(image)
        image = np.asarray(image).reshape(1, image.shape[0], -1, 1)

        seq_lengths = [image.shape[2] // WIDTH_REDUCTION]

        prediction = sess.run(decoded, feed_dict={
            model_input: image,
            seq_len: seq_lengths,
            rnn_keep_prob: 1.0,
        })

        str_predictions = ctc_utils.sparse_tensor_to_strs(prediction)
        for w in str_predictions[0]:
            description = int2word[w]
            notation, v1, v2 = parse_description(description)
            if v1 != "tie":
                if notation == "barline":
                    output.write("### ----------------\n")
                elif notation == "note" or notation == "gracenote":
                    output.write(f'- ["{notation}", "{v1}", "{v2}"]\n')
                elif notation == "rest":
                    output.write(f'- ["rest", "{v1}"]\n')

    output.close()
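# parse_description is defined elsewhere in the original script. Semantic
# vocabulary tokens look like "note-C4_quarter", "rest-quarter", or bare
# "barline", so a plausible version under that assumption (illustrative
# only, not the author's implementation) is:

def parse_description(description):
    if '-' not in description:
        return description, "", ""       # e.g. "barline" or "tie"
    notation, rest = description.split('-', 1)
    if '_' in rest:
        v1, v2 = rest.split('_', 1)      # e.g. "C4", "quarter"
    else:
        v1, v2 = rest, ""                # e.g. "rest-quarter"
    return notation, v1, v2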
def predict(image):
    tf.reset_default_graph()
    sess = tf.InteractiveSession()

    voc_file = 'vocabulary_agnostic.txt'
    model = './Models/model.hdf5-69000.meta'

    # Read the dictionary mapping integer ids to vocabulary words
    dict_file = open(voc_file, "r")
    dict_list = dict_file.read().splitlines()
    int2word = dict()
    for word in dict_list:
        word_idx = len(int2word)
        int2word[word_idx] = word
    dict_file.close()

    # Restore weights
    saver = tf.train.import_meta_graph(model)
    saver.restore(sess, model[:-5])

    graph = tf.get_default_graph()
    # Renamed from `input` to avoid shadowing the Python builtin
    model_input = graph.get_tensor_by_name("model_input:0")
    seq_len = graph.get_tensor_by_name("seq_lengths:0")
    rnn_keep_prob = graph.get_tensor_by_name("keep_prob:0")
    height_tensor = graph.get_tensor_by_name("input_height:0")
    width_reduction_tensor = graph.get_tensor_by_name("width_reduction:0")
    logits = tf.get_collection("logits")[0]

    # Constants that are saved inside the model itself
    WIDTH_REDUCTION, HEIGHT = sess.run([width_reduction_tensor, height_tensor])

    decoded, _ = tf.nn.ctc_greedy_decoder(logits, seq_len)

    image = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
    image = ctc_utils.resize(image, HEIGHT)
    image = ctc_utils.normalize(image)
    image = np.asarray(image).reshape(1, image.shape[0], image.shape[1], 1)

    seq_lengths = [image.shape[2] // WIDTH_REDUCTION]

    prediction = sess.run(decoded, feed_dict={
        model_input: image,
        seq_len: seq_lengths,
        rnn_keep_prob: 1.0,
    })

    str_predictions = ctc_utils.sparse_tensor_to_strs(prediction)

    notes = []
    for w in str_predictions[0]:
        temp = int2word[w].split('.')
        print(temp)
        if len(temp) != 2:
            continue
        symbol, des = temp
        if symbol == 'note':
            length, note = des.split('-', 1)
            if 'beamed' in length:
                length = 'eighth'  # beamed notes treated as eighth notes ("eigth" typo fixed)
            notes.append((length, notes_dict[note]))
        elif symbol == 'rest':
            length, _ = des.split('-', 1)
            notes.append((length, 'rest'))

    return notes
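# A hedged usage sketch: notes_dict (mapping agnostic pitch descriptors to
# note names) is defined elsewhere in the original script, and the input
# path here is hypothetical. This only shows the call shape and the
# (length, pitch) tuples that predict returns.
if __name__ == '__main__':
    notes = predict('./samples/staff_line.png')
    for length, pitch in notes:
        print(f'{length}: {pitch}')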