def __init__(self, data_dir, label_path):
    self.data_dir = data_dir
    # Label name -> id mapping read from the label file, plus its inverse.
    self.label2id = create_label_dict(label_path)
    self.id2label = {j: i for i, j in self.label2id.items()}
    self.tokenizer = AutoTokenizer.from_pretrained("chinese-roberta-wwm-ext")
    self.bad_data = 0  # counts malformed records encountered while reading
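# create_label_dict is not shown in this snippet. A minimal sketch of what it
# plausibly does, assuming a plain-text label file with one label per line
# (the file format and this helper body are assumptions, not the repo's code):
def create_label_dict(label_path):
    label2id = {}
    with open(label_path, encoding='utf-8') as f:
        for line in f:
            label = line.strip()
            if label and label not in label2id:
                label2id[label] = len(label2id)
    return label2id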
train_pairs, train_max = ioutils.read_corpus(args.train, args.lower,
                                             args.lang, args.ratio)
logger.info('Reading validation data')
valid_pairs, valid_max = ioutils.read_corpus(args.validation, args.lower,
                                             args.lang)
logger.info('Reading test data')
test_pairs, test_max = ioutils.read_corpus(args.test, args.lower, args.lang)

logger.info('Reading word embeddings')
word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab)
max_len = None
# embeddings = utils.normalize_embeddings(embeddings)  # optional normalization
logger.debug('Embeddings have shape {} (including unknown, padding and null)'
             .format(embeddings.shape))

logger.info('Converting words to indices')
# Find out which labels occur in the data (more flexible across datasets).
label_dict = utils.create_label_dict(train_pairs)
train_data = utils.create_dataset(train_pairs, word_dict, label_dict,
                                  max_len, max_len)
valid_data = utils.create_dataset(valid_pairs, word_dict, label_dict,
                                  max_len, max_len)
test_data = utils.create_dataset(test_pairs, word_dict, label_dict,
                                 max_len, max_len)

ioutils.write_extra_embeddings(embeddings, args.save)
ioutils.write_params(args.save, lowercase=args.lower, language=args.lang,
                     mode=args.mode)
ioutils.write_label_dict(label_dict, args.save)
weights, bias = ioutils.load_weights(args.weights)

msg = '{} sentences have shape {} (firsts) and {} (seconds)'
logger.debug(msg.format('Training', train_data.sentences1.shape,
                        train_data.sentences2.shape))
logger.debug(msg.format('Validation', valid_data.sentences1.shape,
                        valid_data.sentences2.shape))
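# utils.create_dataset is defined elsewhere. A minimal sketch of the
# word-to-index step it presumably performs, assuming reserved '<unk>' and
# '<pad>' entries in word_dict (the names and padding scheme are assumptions):
import numpy as np

def sentence_to_indices(tokens, word_dict, max_len):
    unk = word_dict['<unk>']
    ids = [word_dict.get(t, unk) for t in tokens][:max_len]
    # Right-pad so every sentence in a batch has the same length.
    ids += [word_dict['<pad>']] * (max_len - len(ids))
    return np.array(ids, dtype=np.int64)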
def RUN(batchsize, lr):
    num_epochs = 50
    device = config.device
    if device is None:
        device = utils.get_default_device()

    label_dict = utils.create_label_dict(config.symbols)
    revdict = {i: sym for i, sym in enumerate(config.symbols)}

    model = InceptFC.FC_Model()
    # model = Resnet.ResNet50(3, 97)  # alternative backbone
    model.to(device)
    print(config.checkpath)
    checkpoint = torch.load(config.checkpath, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("MODEL LOADED")
    model.train()

    # Freeze the early convolutional blocks; fine-tune the rest.
    for name, child in model.named_children():
        if name in ['conv_block1', 'conv_block2', 'conv1']:
            print(name + ' is frozen')
            for param in child.parameters():
                param.requires_grad = False
        else:
            print(name + ' is unfrozen')
            for param in child.parameters():
                param.requires_grad = True

    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=lr, weight_decay=lr / 10.)

    finepath = config.data_dir_path
    myvalpath = "/home/ubuntu/data/ocr/kdeval/good/images/"
    valid_paths = [join(myvalpath, f) for f in listdir(myvalpath)
                   if isfile(join(myvalpath, f))]
    refinement_ratio = [0.5]

    checkpath = os.path.dirname(config.checkpath)
    checkpath = join(checkpath, "FineTune2")
    os.makedirs(checkpath, exist_ok=True)

    p = ('runs/Inceptfinalrun/hypergridfine_tune/LR'
         + str(int(1000000 * lr)) + 'BS' + str(batchsize))
    writer = SummaryWriter(p)

    fineds = [f for f in listdir(finepath) if isfile(join(finepath, f))]
    for epoch_fine in range(num_epochs):
        random.shuffle(fineds)
        ds_train = DataUtils.FINEIMGDS(label_dict, finepath, fineds)
        train_gen = torch.utils.data.DataLoader(ds_train, batch_size=batchsize,
                                                shuffle=True, num_workers=6,
                                                pin_memory=True)
        train_gen = DataUtils.DeviceDataLoader(train_gen, device)
        result = ModelUtils.fit_fine(model, train_gen, optimizer)
        loss_epoch = result.item()
        print("MEAN LOSS ON EPOCH {} is : {}".format(epoch_fine, loss_epoch))

        # SAVE WEIGHT AFTER FINETUNE PER EPOCH (currently disabled):
        # torch.save({'epoch': epoch_fine,
        #             'model_state_dict': model.state_dict(),
        #             'optimizer_state_dict': optimizer.state_dict(),
        #             'loss': loss_epoch},
        #            os.path.join(checkpath, 'fine-epoch-{}.pt'.format(epoch_fine)))

        # TensorBoard logging.
        writer.add_scalar('Training loss per epoch', loss_epoch, epoch_fine)

        # Validation on the held-out "good" PDF pages.
        pdf_acc = []
        weight = []
        for imgpath in tqdm(valid_paths, desc="TEST"):
            with io.open(imgpath, 'rb') as image_file:
                content = image_file.read()
            jsonpath = ("/home/ubuntu/data/ocr/kdeval/good/json/"
                        + os.path.splitext(os.path.basename(imgpath))[0]
                        + ".json")
            with open(jsonpath) as f:
                bounds = json.load(f)
            bounds = bounds_refine(bounds, imgpath, 0.48)
            ds = get_ds(imgpath, bounds)
            ds_train = DataUtils.EVALIMGDS(label_dict, ds)
            train_gen = torch.utils.data.DataLoader(ds_train, batch_size=64,
                                                    shuffle=False, num_workers=6,
                                                    pin_memory=True)
            train_gen = DataUtils.DeviceDataLoader(train_gen, device)
            result = ModelUtils.evaluate(model, train_gen)
            # Weight each image's accuracy by its number of character boxes.
            pdf_acc.append(len(bounds) * result['val_acc'])
            weight.append(len(bounds))

        print("EPOCHFINE={} Validation Accuracy Mean on GOOD pdf is {}".format(
            epoch_fine, sum(pdf_acc) / sum(weight)))
        writer.add_scalar('validation acc per epoch',
                          sum(pdf_acc) / sum(weight), epoch_fine)
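# DataUtils.DeviceDataLoader is used above but not shown. A common pattern it
# likely follows is wrapping a DataLoader and moving each batch to the target
# device; this sketch assumes each batch is a tuple of tensors:
class DeviceDataLoader:
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        for batch in self.dl:
            # non_blocking pairs with pin_memory=True for async host-to-GPU copies.
            yield tuple(t.to(self.device, non_blocking=True) for t in batch)

    def __len__(self):
        return len(self.dl)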
        labels.append(label)
        # Top-left (vertex 0) and bottom-right (vertex 2) corners of the box.
        coord.append((bound["vertices"][0]['x'], bound["vertices"][0]['y'],
                      bound["vertices"][2]['x'], bound["vertices"][2]['y']))
        # image.save(str(uuid.uuid1()) + '_handwritten.png')  # debug dump
    return ds, coord, labels, wordid, seq


if __name__ == '__main__':
    device = config.device
    if device is None:
        device = utils.get_default_device()
    label_dict = utils.create_label_dict(config.symbols)
    revdict = {i: sym for i, sym in enumerate(config.symbols)}
    model = InceptFC.FC_Model()
    # model = Resnet.ResNet50(3, 97)  # alternative backbone
    model.to(device)
    print(config.checkpath)
    checkpoint = torch.load(config.checkpath, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("MODEL LOADED")
    model.train()
    pdf_acc = []
    weight = []
    mypath = join(config.pdfdata, "images")
    imgpaths = [join(mypath, f) for f in listdir(mypath)
                if isfile(join(mypath, f))]
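# The pdf_acc / weight bookkeeping used in these evaluation scripts computes a
# box-count-weighted mean accuracy over images; the same aggregation as a small
# helper (hypothetical name, not from the repo):
def weighted_mean_accuracy(per_image_acc, box_counts):
    total = sum(box_counts)
    return sum(a * n for a, n in zip(per_image_acc, box_counts)) / total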
import argparse
import json

from tqdm import tqdm
from texttable import Texttable

import datamodels
import evaluate
import models
import utils
from datamodels.DataModel import collate_fn
from configs import BertConfig
import evaluate_reference

parser = argparse.ArgumentParser()
parser.add_argument('--path', default='checkpoints/Bert_Lstm_Crf_best.pt')
parser.add_argument('--outpath', default='./badcase/badcase.txt')

id2label = utils.create_label_dict('dataset/tool_data/label.txt', reverse=True)


def cut_name(data_list, max_length=5):
    # Truncate every entry to at most max_length characters.
    res = []
    for i in data_list:
        if len(i) > max_length:
            i = i[:max_length]
        res.append(i)
    return res


def get_raw_text(path):
    # The input is JSONL: one JSON object per line, each with a 'text' field.
    lines = [json.loads(line.strip()) for line in open(path) if line.strip()]
    texts = [line['text'] for line in lines]
    return texts
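# Usage sketch for the two helpers above (the file path and its contents are
# illustrative, not from the repo):
#   dev.jsonl contains one record per line, e.g. {"text": "..."}
# texts = get_raw_text('dataset/tool_data/dev.jsonl')
# cut_name(['Jonathan', 'Ann'])  # -> ['Jonat', 'Ann']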
def __init__(self, data_dir, label_path, sen_max_length):
    self.data_dir = data_dir
    self.label2id = create_label_dict(label_path)
    self.id2label = {j: i for i, j in self.label2id.items()}
    self.tokenizer = AutoTokenizer.from_pretrained("./bert-base-uncased")
    self.sen_max_length = sen_max_length
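# How sen_max_length is typically applied with a Hugging Face tokenizer; a
# hedged sketch, since the reader's call site is not shown here (the sentence
# and the length 128 are illustrative):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./bert-base-uncased")
enc = tokenizer(
    "An example sentence.",
    max_length=128,            # stands in for self.sen_max_length
    truncation=True,           # clip inputs longer than max_length
    padding="max_length",      # pad shorter inputs up to max_length
    return_tensors="pt",
)
# enc["input_ids"] has shape (1, 128); enc["attention_mask"] marks real tokens.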
def __init__(self, data_dir, label_path, sen_max_length):
    self.data_dir = data_dir
    self.label2id = create_label_dict(label_path)
    self.id2label = {j: i for i, j in self.label2id.items()}
    self.sen_max_length = sen_max_length