Example #1
        # Run the detection model on the current frame (batch dimension added for the graph input)
        im_height, im_width, _ = frame.shape
        tf_frame = np.expand_dims(frame, axis=0)
        output_dict = run_inference_for_single_image(tf_frame, detection_graph)

        for i in range(output_dict['num_detections']):
            if output_dict['detection_scores'][i] < args.detection_th:
                continue
            ymin, xmin, ymax, xmax = output_dict['detection_boxes'][i]
            left, right, top, bottom = int(xmin * im_width), int(
                xmax * im_width), int(ymin * im_height), int(ymax * im_height)
            cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255), 2)

            # Crop the detected box and preprocess it for the CRNN input
            box_frame = cv2.resize(frame[top:bottom, left:right],
                                   (crnn.width, crnn.height))
            box_frame = cv2.cvtColor(box_frame, cv2.COLOR_BGR2GRAY)
            box_frame = np.expand_dims(box_frame, axis=2)
            box_frame = box_frame / 255.0
            box_frame = np.expand_dims(box_frame, axis=0)

            # Text recognition
            out = crnn.predict(box_frame)
            out_text = []
            for x in out:
                for p in x:
                    if int(p) != -1:  # skip CTC blank/padding positions
                        out_text.append(crnn.char_list[int(p)])
            cv2.putText(frame, ''.join(out_text), (left, top - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 2)

        cv2.imshow('Text Detection and Text Recognition', frame)
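
The snippet above assumes it is running inside a per-frame video loop, with `detection_graph` loaded elsewhere and `run_inference_for_single_image` wrapping a frozen TF1 detection graph (in the style of the TensorFlow Object Detection API tutorial). A minimal sketch of that assumed scaffolding, where every name is an assumption:

import cv2
import numpy as np
import tensorflow as tf

def run_inference_for_single_image(image, graph):
    # Feed the batched image through the frozen graph and fetch the detection outputs
    with graph.as_default():
        with tf.compat.v1.Session() as sess:
            names = ['num_detections', 'detection_boxes', 'detection_scores']
            tensors = {n: graph.get_tensor_by_name(n + ':0') for n in names}
            image_tensor = graph.get_tensor_by_name('image_tensor:0')
            output_dict = sess.run(tensors, feed_dict={image_tensor: image})
    # Strip the batch dimension and cast the detection count to int
    output_dict['num_detections'] = int(output_dict['num_detections'][0])
    output_dict['detection_boxes'] = output_dict['detection_boxes'][0]
    output_dict['detection_scores'] = output_dict['detection_scores'][0]
    return output_dict

cap = cv2.VideoCapture(0)
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    # ... detection, cropping and CRNN recognition as in the snippet above ...
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
cap.release()
cv2.destroyAllWindows()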
Example #2
def train_model(spectr_dir,
                train_csv,
                test_csv,
                model_type,
                use_cache=False,
                resume=False):

    print('Loading data... ')

    if not use_cache:
        x_train, y_train = load_data_from_spectrogram_dir(
            spectr_dir, ['h', 'p'], train_csv)
        x_test, y_test_ori = load_data_from_spectrogram_dir(
            spectr_dir, ['h', 'p'], test_csv)

        print('Saving data to cache...')
        np.save('train_data.npy', x_train)
        np.save('train_label.npy', y_train)
        np.save('val_data.npy', x_test)
        np.save('val_label.npy', y_test_ori)
    else:
        print('Loading data from cache...')
        x_train = np.load('train_data.npy')
        y_train = np.load('train_label.npy')
        x_test = np.load('val_data.npy')
        y_test_ori = np.load('val_label.npy')
        print(y_train.shape)

    label_count = np.bincount(y_train)
    print('Train label count: ', label_count)

    from sklearn.utils import class_weight
    y_train = list(y_train)
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.unique(y_train),
                                                      y=y_train)
    # Keras expects class weights as a {class_index: weight} dict
    class_weights = dict(enumerate(class_weights))

    print('class weights:', class_weights)

    print('-' * 130)
    print('Model train')
    print('-' * 130)

    input_shape = (rows, cols, channels)

    from model import ResnetBuilder, CRNN, simple_CNN

    if model_type == "resnet18":
        model = ResnetBuilder.build_resnet_18(input_shape, num_classes)
    elif model_type == "resnet34":
        model = ResnetBuilder.build_resnet_18(input_shape, num_classes)
    elif model_type == "CRNN":
        model = CRNN(input_shape, num_classes)
    else:
        model = simple_CNN(input_shape, num_classes)

    optimizer = Adadelta(0.1, rho=0.7)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    x_train = x_train.reshape(x_train.shape[0], rows, cols, channels)
    x_test = x_test.reshape(x_test.shape[0], rows, cols, channels)
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test_ori, num_classes)

    # Resume training from a previously saved checkpoint
    if resume:
        model.load_weights('./model/' + model_name)

    # note: newer tf.keras versions name this metric 'val_accuracy'
    checkpoint = ModelCheckpoint('./model/' + model_name,
                                 monitor='val_acc',
                                 verbose=1,
                                 save_best_only=True,
                                 mode='max')
    early_stop = EarlyStopping(monitor='val_acc', patience=10, mode='max')
    callbacks_list = [checkpoint, early_stop]

    print('epochs', epochs)

    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test),
              verbose=1,
              shuffle=True,
              callbacks=callbacks_list,
              class_weight=class_weights)

    print('Report')
    y_prob = model.predict(x_test)
    y_pred = y_prob.argmax(axis=-1)
    print('test_y', np.bincount(y_test_ori))
    print(classification_report(y_test_ori, y_pred, target_names=CLASSES))

    model.save(model_name)
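
A hypothetical invocation of the function above; the paths and the model_type value are placeholders, and the module-level globals the function relies on (rows, cols, channels, num_classes, epochs, batch_size, model_name, CLASSES) are assumed to be defined elsewhere:

train_model(spectr_dir='spectrograms/',
            train_csv='train.csv',
            test_csv='test.csv',
            model_type='resnet18',
            use_cache=False,
            resume=False)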

Example #3

class OCRTrainer:
    def __init__(self, args):
        self.args = args

        self.dataset = DataSet(self.args.dataset, self.args.batch_size)
        self.model = CRNN(args.lr, self.dataset.img_width, self.dataset.img_height, self.dataset.total_labels)
        
        # restoring the model weights        
        if self.args.resume is not None:
            print(f"Resuming from checkpoint: {self.args.resume}")
            self.model.load_weights(self.args.resume)

        # inverse chord map: single symbol -> chord-quality name, used by decode_chord
        self.inverse_chord_map = {
            "Δ": "major",
            "M": "major",
            "m": "minor",
            "+": "augmented",
            "-": "diminished",
            "o": "diminished",
            "ø": "half diminished",
            "#": "sharp",
            "b": "flat",
            # "<": "mychord",
        }


    def decode_chord(self, pred_texts_batch):
        chord_batch = []
        for preds in pred_texts_batch:
            chord = ""
            i = 0
            # iterate over the chord prediction
            while i < len(preds):
                if preds[i:i+3] in ("Maj", "maj"):
                    chord += "major"
                    i += 3
                elif preds[i:i+3] == "min":
                    chord += "minor"
                    i += 3
                elif preds[i:i+3] == "dim":
                    chord += "diminished"
                    i += 3
                elif (preds[i] == "-") and (preds[i-1] in ("C", "D", "E", "F", "G", "A", "B")):
                    chord += "minor"
                    i += 1
                # elif preds[i:i+3] == "9\\6":
                #     chord += "6add9"
                #     i += 3
                elif preds[i:i+5] == "[UNK]": # ctc's unknown token
                    i += 5
                else:
                    # single character: look up the inverse chord map,
                    # falling back to the character itself
                    chord += self.inverse_chord_map.get(preds[i], preds[i])
                    i += 1
            # append the one completed chord
            chord_batch.append(chord)
        return chord_batch
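
    # Example (hypothetical inputs): decode_chord(["C-7", "FΔ"]) returns
    # ["Cminor7", "Fmajor"] under the rules and mapping above.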

    def train(self):
        # Train the model
        min_loss = float("inf")
        saved = ""
        for ep in range(self.args.epochs):
            loss = 0
            for idx, batch in enumerate(self.dataset.train_dataset):
                batch_images = batch["image"]
                batch_labels = batch["label"]
                loss += self.model.train_on_batch(batch_images, batch_labels)
            loss /= len(self.dataset.train_dataset)
            if loss < min_loss:
                self.model.save_weights("./checkpoint/best.h5")
                min_loss = loss
                saved = "Saving model at ./checkpoint/best.h5"
            print("Epoch {}/{}\tLoss = {:.5f}\t{}".format(ep+1, self.args.epochs, loss, saved))
            saved = ""
            # LR schedule: decay the learning rate by 10x every `interval` epochs
            if (ep + 1) % self.args.interval == 0:
                keras.backend.set_value(self.model.optimizer.lr,
                                        keras.backend.get_value(self.model.optimizer.lr) * 0.1)
        self.eval()

    def eval(self):
        # Check predictions on one batch (taken here from the training set)
        for batch in self.dataset.train_dataset.take(1):
            batch_images = batch["image"]
            batch_labels = batch["label"]
            # model prediction
            preds = self.model.predict(batch_images)
            # ctc decoding
            pred_texts = self.decode_batch_predictions(preds)
            # chord decoding
            pred_chord = self.decode_chord(pred_texts)
            # labels decoding
            orig_texts = []
            for label in batch_labels:
                label = tf.strings.reduce_join(self.dataset.num_to_char(label)).numpy().decode("utf-8")
                orig_texts.append(label)

            # plotting: up to 16 samples on a 4x4 grid
            _, ax = plt.subplots(4, 4, figsize=(15, 5))
            for i in range(min(len(pred_chord), 16)):
                # plotting from the RNN only model
                # img = (batch_images[i, :, :] * 255).numpy().astype(np.uint8)
                # plotting from the CNN + RNN model
                img = (batch_images[i, :, :, 0] * 255).numpy().astype(np.uint8)
                img = img.T
                title = f"{pred_chord[i]}"
                ax[i // 4, i % 4].imshow(img, cmap="gray")
                ax[i // 4, i % 4].set_title(title)
                ax[i // 4, i % 4].axis("off")
        plt.savefig('eval.png')
        plt.show()


    def eval_chords(self):
        correct = 0
        for idx, batch in enumerate(self.dataset.train_dataset):
            batch_images = batch["image"]
            batch_labels = batch["label"]
            preds = self.model.predict(batch_images)
            pred_texts = self.decode_batch_predictions(preds)
            pred_chord = self.decode_chord(pred_texts)
            orig_texts = []
            for label in batch_labels:
                label = tf.strings.reduce_join(self.dataset.num_to_char(label)).numpy().decode("utf-8")
                orig_texts.append(label)
            # comparing predicted OCR text with labels to compute accuracy
            for i, j in zip(orig_texts, pred_texts):
                correct += 1 if i == j else 0
        print("Accuracy: {:.2f}%".format((correct/self.dataset.len)*100))


    def eval_img(self, path):
        # passing the single image to the dataloader
        img = self.dataset.encode_single_sample(path, "None")["image"]
        # adding batch dim
        img = tf.expand_dims(img, 0)
        
        # the numpy and PIL image way
        # imgp = Image.open(path).convert('L')
        # imgp = imgp.resize(size=(self.dataset.img_width, self.dataset.img_height))
        # imgp = np.asarray(imgp)
        # imgp = imgp.T
        # imgp = np.expand_dims(imgp, 0)
        # imgp = np.expand_dims(imgp, -1)
        # imgn = imgp/imgp.max()

        preds = self.model.predict(img)
        pred_texts = self.decode_batch_predictions(preds)
        pred_chord = self.decode_chord(pred_texts)[0]
        print(f"Predicted OCR: {pred_texts[0]}, Predicted Chord: {pred_chord}")

        # plotting from the RNN only model
        # img = (img[0, :, :] * 255).numpy().astype(np.uint8)

        # plotting from the CNN + RNN model
        # img = (img[0, :, :, 0] * 255).numpy().astype(np.uint8)

        # for plotting
        # img = img.T
        # plt.imshow(img, cmap="gray")
        # plt.title(f"{pred_chord}")
        # plt.axis("off")
        # plt.show()


    # A utility function to decode the output of the network
    def decode_batch_predictions(self, pred):
        input_len = np.ones(pred.shape[0]) * pred.shape[1]
        # Use greedy search. For complex tasks, you can use beam search
        results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0][
            :, :self.dataset.max_length
        ]
        # Iterate over the results and get back the text
        output_text = []
        for res in results:
            res = tf.strings.reduce_join(self.dataset.num_to_char(res)).numpy().decode("utf-8")
            output_text.append(res)
        return output_text
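
A minimal driver for the trainer above, as a sketch: the flag names mirror the attributes the class reads from self.args (dataset, batch_size, lr, resume, epochs, interval), and the defaults are assumptions.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True, help='Path to the dataset')
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--resume', default=None, help='Optional checkpoint to resume from')
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--interval', type=int, default=20, help='Epochs between 10x LR decays')
    args = parser.parse_args()

    trainer = OCRTrainer(args)
    trainer.train()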
Example #4
import argparse

import cv2
import numpy as np

from crnn import CRNN  # assumed import path for the CRNN model definition


def preprocess_image_cv(path):
    # assumed signature and grayscale read step (the function is invoked with an image path below)
    image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    h, w = image.shape
    # pad with white pixels up to the fixed 128x32 CRNN input size
    if w < 128:
        image = np.concatenate((image, np.ones((h, 128 - w)) * 255), axis=1)
    if h < 32:
        image = np.concatenate((image, np.ones((32 - h, 128)) * 255))
    image = np.expand_dims(image, axis=2)
    image = image / 255.0
    image = np.expand_dims(image, axis=0)
    return image


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_image',
                        required=True,
                        help='Path to input image')
    args = parser.parse_args()

    image = preprocess_image_cv(args.input_image)

    crnn = CRNN()
    crnn.load_weights('crnn_model.h5')
    out = crnn.predict(image)

    for x in out:
        print('predicted: ', end='')
        for p in x:
            if int(p) != -1:  # -1 marks CTC blank/padding positions
                print(crnn.char_list[int(p)], end='')
        print('')
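
Run as a script, this looks something like `python predict.py --input_image word.png` (the script and image names are placeholders). The -1 entries skipped above are, presumably, the blank/padding positions produced by CTC decoding inside crnn.predict.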