def train():
    """Train the CRNN on the CAPTCHA dataset.

    Checkpoints the weights to 'output/weight.pth' whenever the validation
    loss matches or beats the best seen so far, writes loss curves and a
    log every epoch, and drives an LR scheduler off the validation loss.
    """
    print('start training ...........')

    # Hyper-parameters.
    batch_size = 16
    num_epochs = 50
    learning_rate = 0.1

    # Vocabulary: lowercase letters + digits.
    label_converter = LabelConverter(char_set=string.ascii_lowercase + string.digits)
    vocab_size = label_converter.get_vocab_size()

    device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")
    model = CRNN(vocab_size=vocab_size).to(device)
    # model.load_state_dict(torch.load('output/weight.pth', map_location=device))

    train_loader, val_loader = get_loader('data/CAPTCHA Images/', batch_size=batch_size)

    optimizer = optim.SGD(model.parameters(), lr=learning_rate,
                          momentum=0.9, nesterov=True)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    # scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 10, 2)

    train_losses, val_losses = [], []
    for epoch in range(num_epochs):
        train_epoch_loss = fit(epoch, model, optimizer, label_converter,
                               device, train_loader, phase='training')
        val_epoch_loss = fit(epoch, model, optimizer, label_converter,
                             device, val_loader, phase='validation')
        print('-----------------------------------------')

        # Save whenever validation loss ties or improves on the best so far
        # (val_losses does not yet contain this epoch's value here).
        if epoch == 0 or val_epoch_loss <= np.min(val_losses):
            torch.save(model.state_dict(), 'output/weight.pth')

        train_losses.append(train_epoch_loss)
        val_losses.append(val_epoch_loss)
        write_figure('output', train_losses, val_losses)
        write_log('output', epoch, train_epoch_loss, val_epoch_loss)
        scheduler.step(val_epoch_loss)
def predict(model_path, im_path, norm_height=32, norm_width=128, device='cpu'):
    '''
    predict a new image using a trained model
    :param model_path: path of the saved model
    :param im_path: path of an image
    :param norm_height: image normalization height
    :param norm_width: image normalization width
    :param device: 'cpu' or 'cuda'
    '''
    # step 1: initialize a model and put it on device
    model = CRNN().to(device)

    # step 2: load state_dict from saved model
    map_loc = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    checkpoint = torch.load(model_path, map_location=map_loc)
    model.load_state_dict(checkpoint['state_dict'])
    print('[Info] Load model from {}'.format(model_path))

    # step 3: initialize the label converter
    label_converter = LabelConverter()

    # step 4: read image and normalization
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((norm_height, norm_width)),
        transforms.ToTensor()
    ])
    im = cv2.imread(im_path)
    if im is None:
        raise AssertionError(
            f'the image {im_path} may not exist, please check it.')
    # add the batch dimension after transforming
    x = transform(im).unsqueeze(0)

    # step 5: run model
    model.eval()
    with torch.no_grad():
        logits, _ = model(x)
    raw_pred = logits.argmax(2)
    pred = label_converter.decode(raw_pred)[0]
    print('prediction: {}\n'.format(pred))

    # visualize probabilities output by CTC
    savepath = os.path.splitext(im_path)[0] + '_vis.jpg'
    visual_ctc_results(im, logits, savepath)
class CaptchaDataset(Dataset):
    """Dataset of captcha images paired with encoded text labels.

    :param dataset_metadata_df: dataframe whose rows hold (image_path, raw_label)
    :param vocab: character vocabulary used by the label converter
    :param is_external_img: if True, source images are RGBA and are composited
        onto a white background to obtain RGB before preprocessing
    """

    def __init__(self, dataset_metadata_df, vocab, is_external_img=False):
        self.dataset_metadata_df = dataset_metadata_df
        self.vocab = vocab
        self.label_converter = LabelConverter(self.vocab)
        self.is_external_img = is_external_img
        # Fix: build the transform pipeline once here instead of
        # reconstructing it on every __getitem__ call.
        self.preprocess = transforms.Compose([
            transforms.Resize(289),
            # transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ])

    def __len__(self):
        return len(self.dataset_metadata_df)

    def __getitem__(self, idx):
        """Return (image_tensor, encoded_label) for row *idx* of the metadata."""
        if torch.is_tensor(idx):
            idx = idx.tolist()
        img_metadata = self.dataset_metadata_df.iloc[idx]
        img_path = img_metadata[0]
        raw_label = img_metadata[1]
        image = Image.open(img_path)
        if self.is_external_img:
            # External images have 4 channels (RGBA): paste onto a white
            # background using index 3 (the alpha channel) as the mask.
            background = Image.new("RGB", image.size, (255, 255, 255))
            background.paste(image, mask=image.split()[3])
            image = background
        image = self.preprocess(image)
        label = self.label_converter.encode(raw_label)
        return (image, label)
import glob
import os
import string  # Fix: `string.ascii_lowercase` was used below without this import

import numpy as np
import torch
import torchvision.transforms.functional as F
import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm  # Fix: was imported twice

from model import CRNN
from dataset import CaptchaImagesDataset
from utils import LabelConverter

if __name__ == '__main__':
    device = torch.device("cuda:0" if (torch.cuda.is_available()) else "cpu")

    # Vocabulary: lowercase letters + digits, matching training.
    label_converter = LabelConverter(char_set=string.ascii_lowercase + string.digits)
    vocab_size = label_converter.get_vocab_size()

    model = CRNN(vocab_size=vocab_size).to(device)
    model.load_state_dict(torch.load('output/weight.pth', map_location=device))
    model.eval()

    correct = 0.0
    image_list = glob.glob('data/CAPTCHA Images/test/*')
    for image in tqdm(image_list):
        # The ground truth is the filename stem, e.g. "abc12.png" -> "abc12".
        # NOTE(review): splitting on '/' assumes POSIX paths; os.path.basename
        # would be portable — confirm target platform before changing.
        ground_truth = image.split('/')[-1].split('.')[0]
        image = Image.open(image).convert('RGB')
        image = F.to_tensor(image).unsqueeze(0).to(device)
        output = model(image)
        # Greedy decoding: per-timestep argmax over the vocabulary.
        encoded_text = output.squeeze().argmax(1)
testset_percentage = 0.01

if not isRandomTestSet:
    # NOTE(review): deterministic test-set handling is not implemented yet.
    for input_test_folder in input_test_folders:
        pass

# Collect (image_path, label_path) pairs from every training folder.
train_data = []
bar = Bar('Processing input folders', max=len(input_train_folders))
for input_train_folder in input_train_folders:
    bar.next()
    labels_paths = glob.glob('{}/labels_voc/*'.format(input_train_folder))
    if len(labels_paths) == 0:
        raise Exception('ARE YOU SURE THERE IS LABELS IN THE FOLDER {}/labels_voc/ ?'.format(input_train_folder))
    bar2 = Bar('Processing labels in {}'.format(input_train_folder), max=len(labels_paths))
    for label_path in labels_paths[:]:
        image_path = LabelConverter.get_image_path_from_label_path(label_path)
        # Skip entries whose image is missing or unreadable.
        if not verify_image(image_path):
            print("LOL IS BROKEN")
            bar2.next()
            continue
        train_data.append([image_path, label_path])
        bar2.next()
    bar2.finish()
bar.finish()

shuffle(train_data)

if isRandomTestSet:
    # First `testset_percentage` of the shuffled data becomes the test set.
    split_at = int(len(train_data) * testset_percentage)
    test_set = train_data[:split_at]
    train_set = train_data[split_at:]
    print('The dataset has a total of {} images, splitted across {} training images and {} test images'.format(len(train_data), len(train_set), len(test_set)))
if __name__ == "__main__":
    # Alternative vocabularies used in earlier experiments:
    # 01 vocab
    # vocab = "01"
    # train_dataset_path = '/Users/tomtalpir/dev/tom/captcha_project/CaptchasRNN/generated_images_1590229754'
    # Digits vocab
    # vocab = string.digits
    # train_dataset_path = '/Users/tomtalpir/dev/tom/captcha_project/CaptchasRNN/generated_images_1591000952'

    # Active configuration: lowercase ASCII letters only.
    vocab = string.ascii_lowercase
    train_dataset_path = '/Users/tomtalpir/dev/tom/captcha_project/CaptchasRNN/local_train_lowercase_ascii'
    # vocab = string.ascii_lowercase + string.digits
    # train_dataset_path = '/Users/tomtalpir/dev/tom/captcha_project/CaptchasRNN/local_train_lowercase_ascii'

    lc = LabelConverter(vocab)

    # Build a tiny (2-sample) claptcha test loader.
    claptcha_test_dataset_path = '/Users/tomtalpir/dev/tom/captcha_project/CaptchasRNN/claptcha_test'
    claptcha_test_dataset_metadata_df = get_metadata_df(claptcha_test_dataset_path)
    claptcha_test_dataset_metadata_df = claptcha_test_dataset_metadata_df.head(2)
    claptcha_test_dataset = CaptchaDataset(claptcha_test_dataset_metadata_df, vocab)
    claptcha_test_dataset_loader = torch.utils.data.DataLoader(
        claptcha_test_dataset,
        batch_size=200,
        shuffle=True,
        collate_fn=custom_collate_func)

    train_dataset_metadata_df = get_metadata_df(train_dataset_path)
import xml.etree.ElementTree as ET
import os
import utils.LabelConverter as LabelConverter
from PIL import Image
from progress.bar import Bar

# Sample of the expected input format: one detection per line,
# "<image_path> <class_id> <confidence> <xmin> <ymin> <xmax> <ymax>".
input_test_data = "/media/esteve/1615F2A532ED483C/Ubuntu/ML/fish_dataset/imagenet_dataset/imagenet_split_renamed/images/easy_classes_n01497118_277.jpg 1 0.603365 118 173 376 284 \n/media/esteve/1615F2A532ED483C/Ubuntu/ML/fish_dataset/imagenet_dataset/imagenet_split_renamed/images/easy_classes_n01497118_278.jpg 1 0.807335 89 -8 383 354 \n/media/esteve/1615F2A532ED483C/Ubuntu/ML/fish_dataset/imagenet_dataset/imagenet_split_renamed/images/easy_classes_n01497118_279.jpg 1 0.717453 122 25 438 355 \n/media/esteve/1615F2A532ED483C/Ubuntu/ML/fish_dataset/imagenet_dataset/imagenet_split_renamed/images/easy_classes_n01497118_427.jpg 1 0.971344 -8 6 504 270 \n/media/esteve/1615F2A532ED483C/Ubuntu/ML/fish_dataset/imagenet_dataset/imagenet_split_renamed/images/easy_classes_n01497118_280.jpg 1 0.956263 237 152 489 274 \n/media/esteve/1615F2A532ED483C/Ubuntu/ML/fish_dataset/imagenet_dataset/imagenet_split_renamed/images/easy_classes_n01497118_280.jpg 1 0.798489 49 79 418 199"

input_file = '/media/esteve/1615F2A532ED483C/Ubuntu/ML/fish_dataset/imagenet_dataset/imagenet_split_renamed/out.txt'
# Fix: close the input file deterministically instead of leaking the handle.
with open(input_file, 'r') as f:
    input_data = f.read().split('\n')

bar = Bar('Creating xml files...', max=len(input_data))
for line in input_data[:]:
    bar.next()
    data = line.split(' ')
    label_file_path = LabelConverter.get_label_path_from_image_path(data[0])
    try:
        # Reuse an existing annotation file when present.
        tree = ET.parse(label_file_path)
        root = tree.getroot()
    except FileNotFoundError:
        # No annotation yet: create a fresh VOC-style skeleton.
        root = ET.Element('annotation')
        tree = ET.ElementTree(root)
        folder = ET.SubElement(root, 'folder')
        folder.text = '/'.join(label_file_path.split('/')[:-1])
        filename = ET.SubElement(root, 'filename')
        filename.text = label_file_path.split('/')[-1]
        source = ET.SubElement(root, 'source')
        ET.SubElement(source, 'database').text = "Selflabeled"
def train_val(
        train_im_dir='data/train', val_im_dir='data/train',  # data path configs
        norm_height=32, norm_width=128,  # image normalization configs
        n_epochs=20, batch_size=4, lr=1e-4,  # training configs
        model_save_epoch=5, model_save_dir='models',  # model saving configs
        load_pretrain=False, pretrain_path=None,  # pretrained model configs
        device='cpu'):
    '''
    The main training procedure
    ----------------------------
    :param train_im_dir: path to directory with training images and ground-truth file
    :param val_im_dir: path to directory with validation images and ground-truth file
    :param norm_height: image normalization height
    :param norm_width: image normalization width
    :param n_epochs: number of training epochs
    :param batch_size: training and validation batch size
    :param lr: learning rate
    :param model_save_epoch: save model after each {model_save_epoch} epochs
    :param model_save_dir: path to save the model
    :param load_pretrain: whether to load a pretrained model
    :param pretrain_path: path of the pretrained model
    :param device: 'cpu' or 'cuda'
    '''
    # step 1: initialize training and validation data loaders
    # please see ListDataset and dataLoader (line 19 and line 92) in utils.py for details
    trainloader = dataLoader(train_im_dir, norm_height, norm_width,
                             batch_size, training=True)
    valloader = dataLoader(val_im_dir, norm_height, norm_width,
                           batch_size, training=False)

    # step 2: initialize the label converter
    # please see LabelConverter (line 112) in utils.py for details
    label_converter = LabelConverter()

    # step 3: initialize the model
    model = CRNN().to(device)
    if load_pretrain:
        try:
            map_loc = torch.device('cuda') if torch.cuda.is_available() \
                else torch.device('cpu')
            checkpoint = torch.load(pretrain_path, map_location=map_loc)
            model.load_state_dict(checkpoint['state_dict'])
            print(f'[Info] load pretrained model from {pretrain_path}')
        except Exception as e:
            print(f'[Warning] load pretrain model failed, the reason is:\n {e}')
            print('[Warning] the model will be trained from scratch!')

    # step 4: define CTC loss function and optimizer
    # -- CTC loss function in PyTorch is nn.CTCLoss()
    # note that the first input of nn.CTCLoss() is logarithmized probabilities
    # please refer to the following document to look up its usage
    # https://pytorch.org/docs/stable/generated/torch.nn.CTCLoss.html#torch.nn.CTCLoss
    criterion = nn.CTCLoss()
    optimizer = optim.Adam(model.parameters(), lr)

    # step 5: training & validation
    # two lists to save training loss and validation accuracy for each epoch
    losses, accuracies = [], []
    for epoch in range(n_epochs):
        # train
        print('\nEpoch [{}/{}] start ...'.format(epoch + 1, n_epochs))
        train_loss = train_one_epoch(model, trainloader, optimizer,
                                     criterion, label_converter, device)
        losses.append(train_loss)

        # validation
        accuracy = val_one_epoch(model, valloader, label_converter, device)
        accuracies.append(accuracy)

        # show information of the epoch
        print('train loss = {:.3f}, validation word accuracy = {:.1f}%'.format(
            train_loss, 100 * accuracy))

        # save model
        if (epoch + 1) % model_save_epoch == 0:
            model_save_path = os.path.join(
                model_save_dir, 'model_epoch{}.pth'.format(epoch + 1))
            torch.save({'state_dict': model.state_dict()}, model_save_path)
            print('[Info] model saved in {}'.format(model_save_path))

    # draw the loss and accuracy curve
    plot_loss_and_accuracies(losses, accuracies)
def __init__(self, dataset_metadata_df, vocab, is_external_img=False):
    """Store dataset metadata and set up a label converter for *vocab*.

    :param dataset_metadata_df: dataframe of (image_path, raw_label) rows
    :param vocab: character vocabulary for label encoding
    :param is_external_img: whether images come from the external (RGBA) set
    """
    self.dataset_metadata_df = dataset_metadata_df
    self.vocab = vocab
    self.is_external_img = is_external_img
    self.label_converter = LabelConverter(vocab)
def main():
    """Entry point: train, validate, or run single-image recognition.

    Command-line flags:
      --train     train the network on an 80/20 split of the dataset
      --validate  evaluate a saved model over the whole dataset
      (neither)   recognize the configured test image with the saved model
    """
    parser = argparse.ArgumentParser(
        description='Todo Bicig handwritten text recognition')
    parser.add_argument('--train', action='store_true', help='train the NN')
    parser.add_argument('--validate', action='store_true',
                        help='validate the NN')
    args = parser.parse_args()

    dataset = TodoDataset(Params.dataset_path, Params.image_size)
    converter = LabelConverter(dataset.char_set)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Output size is vocab + 1 (extra class for the CTC blank).
    n_classes = len(dataset.char_set) + 1

    if args.train or args.validate:
        #train_loss = []
        #val_loss = []
        #char_error = []
        #word_acc = []
        if args.train:
            # split on train and validation sets (80/20)
            train_size = int(0.8 * len(dataset))
            val_size = len(dataset) - train_size
            train_set, val_set = random_split(dataset, [train_size, val_size])
            train_dataloader = DataLoader(train_set, batch_size=32,
                                          shuffle=True, num_workers=10)
            val_dataloader = DataLoader(val_set, batch_size=32,
                                        shuffle=True, num_workers=10)

            # training model
            model = Model(1024, n_classes)
            loss = torch.nn.CTCLoss()
            optimizer = torch.optim.Adam(model.parameters(), lr=1.0e-4,
                                         amsgrad=True)
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                        step_size=30,
                                                        gamma=0.1)
            #train_loss, val_loss, char_error, word_acc = train_model(model=model, loss=loss, optimizer=optimizer, scheduler=scheduler, num_epochs=70, train_dataloader=train_dataloader, val_dataloader=val_dataloader, converter=converter)
            torch.save(model.state_dict(), 'model/model.pth')

        if args.validate:
            # use all dataset
            val_dataloader = DataLoader(dataset, batch_size=32, shuffle=True,
                                        num_workers=10)
            loss = torch.nn.CTCLoss()
            model = Model(1024, n_classes)
            model.load_state_dict(
                torch.load('model/model.pth', map_location=device))
            #val_loss, char_error_rate, word_accuracy = validate_model(model, loss, val_dataloader, converter)
    else:
        # Inference mode: load the saved model and recognize one image.
        model = Model(1024, n_classes)
        model.load_state_dict(
            torch.load('model/model.pth', map_location=device))
        image = ImagePreprocess().resize_image(
            cv.imread(Params.test_image, cv.IMREAD_GRAYSCALE),
            Params.image_size)
        result = recognize(model, image, converter)
        print(result)