Example #1
def __init__(self, data_dir, label_path):
    self.data_dir = data_dir
    self.label2id = create_label_dict(label_path)
    self.id2label = {j: i for i, j in self.label2id.items()}
    self.tokenizer = AutoTokenizer.from_pretrained(
        "chinese-roberta-wwm-ext")
    self.bad_data = 0
Example #2
    train_pairs, train_max = ioutils.read_corpus(args.train, args.lower, args.lang, args.ratio)
    logger.info('Reading validation data')
    valid_pairs, valid_max = ioutils.read_corpus(args.validation, args.lower, args.lang)
    logger.info('Reading test data')
    test_pairs, test_max = ioutils.read_corpus(args.test, args.lower, args.lang)
    logger.info('Reading word embeddings')
    word_dict, embeddings = ioutils.load_embeddings(args.embeddings, args.vocab)
    max_len = None
    #print(train_pairs)
    #embeddings = utils.normalize_embeddings(embeddings)
    logger.debug('Embeddings have shape {} (including unknown, padding and null)'
                 .format(embeddings.shape))

    logger.info('Converting words to indices')
    # find out which labels are there in the data (more flexible to different datasets)
    label_dict = utils.create_label_dict(train_pairs)
    train_data = utils.create_dataset(train_pairs, word_dict, label_dict, max_len, max_len)
    valid_data = utils.create_dataset(valid_pairs, word_dict, label_dict, max_len, max_len)
    test_data = utils.create_dataset(test_pairs, word_dict, label_dict, max_len, max_len)

    #print(train_data.sizes1)
    ioutils.write_extra_embeddings(embeddings, args.save)
    ioutils.write_params(args.save, lowercase=args.lower, language=args.lang, mode=args.mode)
    ioutils.write_label_dict(label_dict, args.save)
    weights, bias = ioutils.load_weights(args.weights)

    msg = '{} sentences have shape {} (firsts) and {} (seconds)'
    logger.debug(msg.format('Training',
                            train_data.sentences1.shape,
                            train_data.sentences2.shape))
    logger.debug(msg.format('Validation',
                            valid_data.sentences1.shape,
                            valid_data.sentences2.shape))
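
The comment above notes that utils.create_label_dict(train_pairs) derives the label set from the training data itself rather than hard-coding it. The helper's implementation is not shown on this page; a minimal sketch of what it plausibly does, assuming each pair is a (sentence1, sentence2, label) tuple and that distinct labels are simply enumerated (the function name here is hypothetical), might be:

def create_label_dict_from_pairs(pairs):
    # Hypothetical sketch: collect every distinct label seen in the
    # (sentence1, sentence2, label) tuples and assign it an integer id.
    labels = sorted({label for _, _, label in pairs})
    return {label: idx for idx, label in enumerate(labels)}

# e.g. NLI-style data would yield {'contradiction': 0, 'entailment': 1, 'neutral': 2}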
Example #3
def RUN(batchsize, lr):
    #batchsize=config.batchsize
    #lr=config.learning_rate
    #num_epochs=config.num_epochs
    num_epochs = 50
    device = config.device
    if device is None:
        device = utils.get_default_device()
    label_dict = utils.create_label_dict(config.symbols)
    revdict = {}
    for i, sym in enumerate(config.symbols):
        revdict[i] = sym
    model = InceptFC.FC_Model()
    #model=Resnet.ResNet50(3,97)
    model.to(device)
    print(config.checkpath)
    checkpoint = torch.load(config.checkpath, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("MODEL LOADED")
    model.train()
    for name, child in model.named_children():
        if name in ['conv_block1', "conv_block2", "conv1"]:
            print(name + ' is frozen')
            for param in child.parameters():
                param.requires_grad = False
        else:
            print(name + ' is unfrozen')
            for param in child.parameters():
                param.requires_grad = True

    # Only the parameters left trainable above are handed to the optimizer.
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        model.parameters()),
                                 lr=lr,
                                 weight_decay=lr / 10.)
    finepath = config.data_dir_path
    myvalpath = "/home/ubuntu/data/ocr/kdeval/good/images/"
    valid_paths = [
        join(myvalpath, f) for f in listdir(myvalpath)
        if isfile(join(myvalpath, f))
    ]
    refinement_ratio = [0.5]
    checkpath = os.path.dirname(config.checkpath)
    checkpath = join(checkpath, "FineTune2")
    os.makedirs(checkpath, exist_ok=True)
    p = 'runs/Inceptfinalrun/hypergridfine_tune/LR' + str(int(
        1000000 * lr)) + 'BS' + str(batchsize)
    writer = SummaryWriter(p)
    fineds = [f for f in listdir(finepath) if isfile(join(finepath, f))]
    for epoch_fine in range(num_epochs):
        random.shuffle(fineds)
        ds_train = DataUtils.FINEIMGDS(label_dict, finepath, fineds)
        train_gen = torch.utils.data.DataLoader(ds_train,
                                                batch_size=batchsize,
                                                shuffle=True,
                                                num_workers=6,
                                                pin_memory=True)
        train_gen = DataUtils.DeviceDataLoader(train_gen, device)
        result = ModelUtils.fit_fine(model, train_gen, optimizer)
        loss_epoch = result.item()
        print("MEAN LOSS ON EPOCH {} is : {}".format(epoch_fine, loss_epoch))
        ## SAVE WEIGHT AFTER FINETUNE PER EPOCH
        '''
        torch.save({
                    'epoch': epoch_fine,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss_epoch,
                    }, os.path.join(checkpath, 'fine-epoch-{}.pt'.format(epoch_fine)))
        '''
        ## WRITER TENSORBOARD
        writer.add_scalar('Training loss per epoch', loss_epoch, epoch_fine)

        ###############################################################
        ####### CHECK FOR VALIDATION+
        pdf_acc = []
        weight = []
        for imgpath in tqdm(valid_paths, desc="TEST"):
            with io.open(imgpath, 'rb') as image_file:
                content = image_file.read()
            jsonpath = "/home/ubuntu/data/ocr/kdeval/good/json/" + os.path.splitext(
                os.path.basename(imgpath))[0] + ".json"
            with open(jsonpath) as f:
                bounds = json.load(f)
            bounds = bounds_refine(bounds, imgpath, 0.48)
            #print("Characters in Image=",len(bounds))
            ds = get_ds(imgpath, bounds)
            ds_train = DataUtils.EVALIMGDS(label_dict, ds)
            train_gen = torch.utils.data.DataLoader(ds_train,
                                                    batch_size=64,
                                                    shuffle=False,
                                                    num_workers=6,
                                                    pin_memory=True)
            train_gen = DataUtils.DeviceDataLoader(train_gen, device)
            result = ModelUtils.evaluate(model, train_gen)
            pdf_acc.append(len(bounds) * result['val_acc'])
            weight.append(len(bounds))
        print("EPOCHFINE={} Validation Accuracy Mean on GOOD pdf is {}".format(
            epoch_fine,
            sum(pdf_acc) / sum(weight)))
        writer.add_scalar('validation acc per epoch',
                          sum(pdf_acc) / sum(weight), epoch_fine)
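
For reference, the validation block above weights each image's accuracy by its number of character bounds, so the reported number is a character-weighted mean over the whole evaluation set. A small standalone illustration of the same computation (the helper name is ours, not part of the example):

def weighted_accuracy(accs, counts):
    # Overall accuracy when each per-image accuracy is weighted by its character count.
    return sum(a * n for a, n in zip(accs, counts)) / sum(counts)

# 90% on a 200-character page and 50% on a 50-character page -> 0.82 overall
print(weighted_accuracy([0.9, 0.5], [200, 50]))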
Example #4
        labels.append(label)
        # (x_min, y_min, x_max, y_max) taken from opposite corners of the bounding polygon
        coord.append((bound["vertices"][0]['x'],
                      bound["vertices"][0]['y'],
                      bound["vertices"][2]['x'],
                      bound["vertices"][2]['y']))

    #image.save(str(uuid.uuid1()) + '_handwritten.png')
    return ds, coord, labels, wordid, seq



if __name__ == '__main__':
    device = config.device
    if device is None:
        device = utils.get_default_device()
    label_dict = utils.create_label_dict(config.symbols)
    revdict = {}
    for i, sym in enumerate(config.symbols):
        revdict[i] = sym
    model = InceptFC.FC_Model()
    #model=Resnet.ResNet50(3,97)
    model.to(device)
    print(config.checkpath)
    checkpoint = torch.load(config.checkpath, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("MODEL LOADED")
    model.train()
    pdf_acc = []
    weight = []
    mypath = join(config.pdfdata, "images")
    imgpaths = [join(mypath, f) for f in listdir(mypath) if isfile(join(mypath, f))]
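
Both this example and Example #3 build label_dict from config.symbols together with the reverse mapping revdict, so that the class indices predicted by the model can be turned back into characters. A minimal sketch of that decoding step, assuming the model outputs a (batch, num_classes) logits tensor (the helper name is hypothetical):

import torch

def decode_predictions(logits, revdict):
    # Argmax over the class dimension, then map each index back to its symbol.
    indices = torch.argmax(logits, dim=1)
    return [revdict[int(i)] for i in indices]

revdict = {0: 'a', 1: 'b', 2: 'c'}
logits = torch.tensor([[0.1, 2.0, 0.3],
                       [1.5, 0.2, 0.1]])
print(decode_predictions(logits, revdict))  # ['b', 'a']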
Example #5
import argparse

from tqdm import tqdm
from texttable import Texttable
import json

import datamodels
import evaluate
import models
import utils
from datamodels.DataModel import collate_fn
from configs import BertConfig
import evaluate_reference

parser = argparse.ArgumentParser()
parser.add_argument('--path', default='checkpoints/Bert_Lstm_Crf_best.pt')
parser.add_argument('--outpath', default='./badcase/badcase.txt')
id2label = utils.create_label_dict('dataset/tool_data/label.txt', reverse=True)

def cut_name(data_list, max_length=5):
    res = []
    for i in data_list:
        if len(i) > max_length:
            i = i[:max_length]
        res.append(i)
    return res

def get_raw_text(path):
    with open(path) as f:
        lines = [json.loads(line.strip()) for line in f if line.strip()]
    texts = [line['text'] for line in lines]
    return texts
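
Every example on this page calls create_label_dict, but the helper itself is not shown, and the examples clearly come from different projects (here it takes a label file path, while Examples #3 and #4 pass a list of symbols). For this example's usage, where reverse=True yields the id-to-label mapping used as id2label, a minimal reconstruction under the assumption of one label per line might be:

def create_label_dict(label_path, reverse=False):
    # Hypothetical reconstruction: map each label (one per line) to its line index,
    # or return the inverse mapping when reverse=True.
    with open(label_path, encoding='utf-8') as f:
        labels = [line.strip() for line in f if line.strip()]
    if reverse:
        return {idx: label for idx, label in enumerate(labels)}
    return {label: idx for idx, label in enumerate(labels)}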

Example #6
def __init__(self, data_dir, label_path, sen_max_length):
    self.data_dir = data_dir
    self.label2id = create_label_dict(label_path)
    self.id2label = {j: i for i, j in self.label2id.items()}
    self.tokenizer = AutoTokenizer.from_pretrained("./bert-base-uncased")
    self.sen_max_length = sen_max_length
Example #7
def __init__(self, data_dir, label_path, sen_max_length):
    self.data_dir = data_dir
    self.label2id = create_label_dict(label_path)
    self.id2label = {j: i for i, j in self.label2id.items()}
    self.sen_max_length = sen_max_length