Example #1
def torch_read_tfrecord(image_path, part_idx=0):
    tfrecord_path = image_path + '_part_' + str(part_idx) + ".tfrecord"
    index_path = image_path + '_part_' + str(part_idx) + ".idx"
    # index_path =None
    description = {
        "image": "byte",
        "label": "int",
        "index": "int",
        "name": "byte"
    }
    batch_size = 1
    num_worker = 6
    dataset = TFRecordDataset(tfrecord_path,
                              index_path,
                              description,
                              shuffle_queue_size=batch_size * num_worker,
                              transform=decode_image)
    loader = torch.utils.data.DataLoader(dataset,
                                         batch_size=batch_size,
                                         shuffle=False,
                                         pin_memory=False,
                                         drop_last=False,
                                         num_workers=num_worker)
    for i, data in enumerate(tqdm(loader), start=1):
        print("data", i, len(data["label"]), data["label"])
def tfrec_extract(filename):
    global train_row
    global test_row
    global train
    global test
    tfrecord_path = os.path.join(tfrec_dir, filename)
    index_path = tfrecord_path.replace('.tfrec', '.index')
    
    if 'train' in filename:
        savedir = train_dir
    else:
        savedir = test_dir
    dataset = TFRecordDataset(tfrecord_path, index_path, transform=decode_image)
    loader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False)
    for data in T(loader):
        # print(len(loader))
        if 'train' in filename:
            train_row += 1
        else:
            test_row += 1
        img_name = data['image_name'].squeeze().data.cpu().numpy().copy()
        img_name = os.path.join(savedir, ''.join(map(chr, img_name)))
        img_name += '.jpg'
        image_file = data['image'].squeeze().data.cpu().numpy()
        cv2.imwrite(img_name, image_file)
        del data['image']
        del data['image_name']
        for k, v in data.items():
            if 'train' in filename:
                train.loc[train_row, 'image_name'] = img_name
                train.loc[train_row, k] = v.squeeze().data.cpu().numpy()
                train.loc[train_row, 'tfrec'] = filename.replace('.tfrec', '')
            else:
                test.loc[test_row, 'image_name'] = img_name
                test.loc[test_row, k] = v.squeeze().data.cpu().numpy()
                test.loc[test_row, 'tfrec'] = filename.replace('.tfrec', '')
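The two snippets above pass a decode_image transform and point at pre-built .idx/.index files, neither of which is shown. A minimal sketch of what such a transform might look like (the "image" key follows the description dict above; the cv2-based decoding and the index-generation command are assumptions based on the tfrecord package's documented usage):

import cv2


def decode_image(features):
    # "image" arrives as a flat uint8 buffer holding an encoded (e.g. JPEG) image;
    # decode it in place so the DataLoader yields ready-to-use arrays.
    features["image"] = cv2.imdecode(features["image"], -1)
    return features

# Index files (the .idx/.index paths above) can be generated with the helper
# bundled in the tfrecord package, e.g.:
#   python -m tfrecord.tools.tfrecord2idx data.tfrecord data.index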
Example #3
    def __init__(self):
        tfrecord_path = cfg['datasets']['train']['tfrecord_path']

        self.HR_size = cfg['datasets']['train']['HR_size']
        # self.batch_size = cfg['datasets']['train']['batch_size']

        self.dataset = TFRecordDataset(tfrecord_path, None)
        self.loader = iter(torch.utils.data.DataLoader(self.dataset, batch_size=1))
Example #4
    def __init__(self):
        tfrecord_path = cfg['datasets']['train']['tfrecord_path']
        self.mask_dir = cfg['datasets']['train']['masks']
        self.mask_files = glob.glob(self.mask_dir + '/**/*.png', recursive=True)

        self.HR_size = cfg['datasets']['train']['HR_size']
        # self.batch_size = cfg['datasets']['train']['batch_size']

        self.dataset = TFRecordDataset(tfrecord_path, None)
        self.loader = iter(torch.utils.data.DataLoader(self.dataset, batch_size=1))
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    auto_encoder = AE().to(device)
    optimizer = torch.optim.Adam(auto_encoder.parameters(),
                                 lr=LR,
                                 weight_decay=0.01)
    loss_func = nn.MSELoss()
    # load input data
    # index_path = None
    description = {"vol_raw": "byte"}
    test_dataset = TFRecordDataset(tfrecord_path,
                                   index_path=None,
                                   description=description)
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=batch_size)
    # data = next(iter(loader))
    # print(data)
    for epoch in range(n_epochs):
        loss = 0
        for step, batch in enumerate(test_loader):
            # ===================== forward =========================
            input_x = input_pipeline(batch['vol_raw'].float())
            encoder_out, decoder_out = auto_encoder(input_x)
            decoder_out = decoder_out.float()
            # reconstruction loss; the original targeted brain_img, which is not
            # defined in this snippet, so the preprocessed input is used instead
            loss = loss_func(decoder_out, input_x)

            # ===================== backward =========================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # --------------------accuracy begin-------------------------#
            # index of the maximum activation along dim 1
            pred = decoder_out.argmax(dim=1)
            # correct_num += torch.eq(pred, brain_labels).sum().float().item()
            # correct += (prediction == brain_labels).sum().item()  # correct predictions in this batch
            # out_class = (decoder_out[:] > 0).float()  # map entries > 0 to 1, others to 0
            # right_num = torch.sum(b_x == out_class).float()  # number of correctly classified values
            # precision = correct / decoder_out.shape[0]  # accuracy
            # --------------------accuracy end-------------------------#

            if step % 100 == 0:
                print('Epoch:{}, Train_loss:{:.8f}'.format(epoch, loss.item()))
Example #6
def load_dataset(transform=None):
    filename = "zoom_blur_1"
    return TFRecordDataset(
        data_path=imagenetc_path + filename + '.tfrecords',
        index_path=imagenetc_path + filename + '.tfrecords_index',
        description={'height': 'int',
                     'width': 'int',
                     'depth': 'int',
                     'corruption_type': 'byte',
                     'severity_level': 'int',
                     'class_label': 'byte',
                     'image_raw': 'byte'},
        transform=transform)
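The snippet above does not show how the raw fields become an image. A hypothetical transform, under the assumption that image_raw stores a raw pixel buffer whose shape is given by the height/width/depth fields (if it instead holds an encoded JPEG/PNG, cv2.imdecode would be needed):

import numpy as np


def decode_imagenetc(features):
    # Hypothetical helper: reshape the flat uint8 buffer using the stored
    # height/width/depth fields.
    h = int(features['height'])
    w = int(features['width'])
    d = int(features['depth'])
    features['image_raw'] = features['image_raw'].reshape(h, w, d)
    return features

# e.g. dataset = load_dataset(transform=decode_imagenetc)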
Example #7
 def __init__(self,
              tfrecord_path,
              description=None,
              index_path=None,
              batch_size=16,
              transform_fn=None):
     if description is None:
         description = {"image": "byte", "label": "float"}
     self.dataset = TFRecordDataset(tfrecord_path,
                                    index_path,
                                    description,
                                    transform=transform_fn)
     self.loader = torch.utils.data.DataLoader(self.dataset,
                                               batch_size=batch_size)
def viz(model_name, tf_records_path, record_num, word_num, table_num,
        pred_thresh):

    model_path = "C:/Users/Jesper/Desktop/TableRecognition/Table_Detection_and_Recognition/Table_Recognition/models/{}/model.pt".format(
        model_name)
    model = VexMoutNet()
    model.load_state_dict(torch.load(model_path))
    model.eval()

    path = "C:/Users/Jesper/Desktop/TableRecognition/Table_Detection_and_Recognition/Table_Recognition/Data/{}".format(
        tf_records_path)
    files = os.listdir(path)
    record = files[record_num]
    device = torch.device("cpu")

    batch_size = 1
    #variables for tfrecord loader
    index_path = None
    tfrecord_description = {
        "imgs": "float",
        "num_words": "int",
        "vertex_features": "float",
        "adjacency_matrix_cells": "int",
        "adjacency_matrix_cols": "int",
        "adjacency_matrix_rows": "int",
        "num_edges": 'int',
        "edge_indexes": 'int'
    }

    tfrecord_path = os.path.join(path, record)
    dataset = TFRecordDataset(tfrecord_path, index_path, tfrecord_description)
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    # grab batch number `table_num` from the loader
    for idx, batch in enumerate(loader):
        if idx == table_num:
            break

    data_dict = tfrecord_preparer(batch, device=device, batch_size=batch_size)

    preds_dict = model(data_dict, device, 0.5)
    for k, v in preds_dict.items():
        side = int(math.sqrt(v.shape[0]))
        preds_dict[k] = torch.sigmoid(v.reshape(side, side))

    img_cells = visualize(word_num, data_dict, preds_dict, pred_thresh,
                          'cells')
    img_rows = visualize(word_num, data_dict, preds_dict, pred_thresh, 'rows')
    img_cols = visualize(word_num, data_dict, preds_dict, pred_thresh, 'cols')
    return img_cells, img_rows, img_cols
Example #9
 def __getitem__(self, idx):
     tfr_file, n = self.data_index[idx]
     # load tfrecord
     dataset = TFRecordDataset(tfr_file,
                               tfr_file.replace('tfrecords', 'index'))
     single_data = list(dataset)[n]
     img = single_data['data'].reshape(
         single_data['shape']).astype('float32')
     # normalize image to [0, 1]
     img = (img / 2**(8 - self.n_bits_x)).round() / (2.**self.n_bits_x)
     # apply transformation
     img = self.transform(img)
     # return img
     return img, single_data['label'][0]
Example #10
def main(dataset, split, tfr_path, lmdb_path):
    assert split in {'train', 'validation'}

    # create target directory
    if not os.path.exists(lmdb_path):
        os.makedirs(lmdb_path, exist_ok=True)
    if dataset == 'celeba' and split in {'train', 'validation'}:
        num_shards = {'train': 120, 'validation': 40}[split]
        lmdb_path = os.path.join(lmdb_path, '%s.lmdb' % split)
        tfrecord_path_template = os.path.join(
            tfr_path, '%s/%s-r08-s-%04d-of-%04d.tfrecords')
    elif dataset == 'imagenet-oord_32':
        num_shards = {'train': 2000, 'validation': 80}[split]
        # imagenet_oord_lmdb_path += '_32'
        lmdb_path = os.path.join(lmdb_path, '%s.lmdb' % split)
        tfrecord_path_template = os.path.join(
            tfr_path, '%s/%s-r05-s-%04d-of-%04d.tfrecords')
    elif dataset == 'imagenet-oord_64':
        num_shards = {'train': 2000, 'validation': 80}[split]
        # imagenet_oord_lmdb_path += '_64'
        lmdb_path = os.path.join(lmdb_path, '%s.lmdb' % split)
        tfrecord_path_template = os.path.join(
            tfr_path, '%s/%s-r06-s-%04d-of-%04d.tfrecords')
    else:
        raise NotImplementedError

    # create lmdb
    env = lmdb.open(lmdb_path, map_size=1e12)
    count = 0
    with env.begin(write=True) as txn:
        for tf_ind in range(num_shards):
            # read tf_record
            tfrecord_path = tfrecord_path_template % (split, split, tf_ind,
                                                      num_shards)
            index_path = None
            description = {'shape': 'int', 'data': 'byte', 'label': 'int'}
            dataset = TFRecordDataset(tfrecord_path, index_path, description)
            loader = torch.utils.data.DataLoader(dataset, batch_size=1)

            # put the data in lmdb
            for data in loader:
                im = data['data'][0].cpu().numpy()
                txn.put(str(count).encode(), im)
                count += 1
                if count % 100 == 0:
                    print(count)

        print('added %d items to the LMDB dataset.' % count)
def get_train_loader(conf, worker_rank, use_hdf5=True, use_tfrecord=False):
    if use_hdf5:
        class_num = 144752
        # class_num = 1013232
        train_sampler = None
        hdf5file = str(conf.ms1m_folder) + '/datasets_miracle_v2_part' + str(worker_rank) + '.hdf5'
        # hdf5file = '/sdd_data/100WID_part' + str(worker_rank) + '.hdf5'
        print("using hdf5 file,", hdf5file, ",DataLoader with multi process")
        # datah5=dataset_h5(hdf5file)
        datah5 = dataset_h5_concurrency(hdf5file)
        # loader =DataLoader(dataset=datah5, batch_size=conf.batch_size,  shuffle=False, pin_memory=conf.pin_memory,
        #                     num_workers=0)
        loader = DataLoader(dataset=datah5, batch_size=conf.batch_size, shuffle=True, pin_memory=conf.pin_memory,
                            num_workers=conf.num_workers)

    elif use_tfrecord:
        # class_num=144752
        # tfrecord_path =str(conf.ms1m_folder) + '/datasets_miracle_v2_part_' + str(worker_rank) + '.tfrecord'
        # index_path = str(conf.ms1m_folder) + '/datasets_miracle_v2_part_' + str(worker_rank) + '.idx'
        # tfrecord_path = str(conf.ms1m_folder) + '/100WID_part_' + str(worker_rank) + '.tfrecord'
        # index_path = str(conf.ms1m_folder) + '/100WID_part_' + str(worker_rank) + '.idx'
        tfrecord_path = str(conf.data_path) + '/100WID_part_' + str(worker_rank) + '.tfrecord'
        index_path = str(conf.data_path) + '/100WID_part_' + str(worker_rank) + '.idx'
        class_num = 1013232
        train_sampler = None
        print("use tfrecord file", tfrecord_path)
        description = {"image": "byte", "label": "int", "index": "int", "name": "byte"}
        dataset = TFRecordDataset(tfrecord_path, index_path, description,
                                  shuffle_queue_size=conf.batch_size * conf.num_workers * 10,
                                  transform=decode_image)
        # dataset = TFRecordDataset(tfrecord_path, index_path, description, transform=decode_image)
        # loader = torch.utils.data.DataLoader(dataset,  batch_size=conf.batch_size ,shuffle=False,   pin_memory=conf.pin_memory,
        #                     num_workers=conf.num_workers,drop_last=False,sampler=train_sampler)
        loader = DataLoaderX(dataset, batch_size=conf.batch_size, shuffle=False,
                                             pin_memory=conf.pin_memory,
                                             num_workers=conf.num_workers, drop_last=False, sampler=train_sampler)
    else:
        print("use data folder")
        # ds, class_num = get_train_dataset(conf.ms1m_folder / 'datasets_miracle_v2')
        ds, class_num = get_train_dataset(conf.data_path / '100WID')

        train_sampler = torch.utils.data.distributed.DistributedSampler(ds)
        loader = DataLoader(ds, batch_size=conf.batch_size, shuffle=(train_sampler is None),
                            pin_memory=conf.pin_memory, num_workers=0, drop_last=False,
                            sampler=train_sampler)
        # loader = DataLoader(ds, batch_size=conf.batch_size, shuffle=(train_sampler is None), pin_memory=conf.pin_memory,
        #                     num_workers=conf.num_workers, drop_last=False, sampler=train_sampler)
        print('ms1m ImageFolder generated: class', class_num)
    print("check Data class num:", class_num)
    return loader, class_num, train_sampler
 def build_loader(self):
     train_sampler = None
     print("read tfrecord file:", self.tfrecord_path)
     description = {
         "image": "byte",
         "label": "int",
         "index": "int",
         "name": "byte"
     }
     dataset = TFRecordDataset(self.tfrecord_path,
                               self.index_path,
                               description,
                               shuffle_queue_size=self.batch_size *
                               self.num_workers * 10,
                               transform=decode_image)
     self.sample_loader = DataLoaderX(dataset,
                                      batch_size=self.batch_size,
                                      shuffle=False,
                                      pin_memory=self.pin_memory,
                                      num_workers=self.num_workers,
                                      drop_last=False,
                                      sampler=train_sampler)
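Both loaders above rely on a DataLoaderX class that the snippets never define. A common definition, assumed here, wraps the standard DataLoader with prefetch_generator's BackgroundGenerator so the next batch is prepared in a background thread:

import torch
from prefetch_generator import BackgroundGenerator


class DataLoaderX(torch.utils.data.DataLoader):
    # Assumed implementation: identical to DataLoader, except that iteration
    # is wrapped in a BackgroundGenerator so the next batch is prefetched
    # while the current one is being consumed.
    def __iter__(self):
        return BackgroundGenerator(super().__iter__())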
#######################################################################################################

for epoch in range(num_epochs):
    train_loss = 0
    val_loss = 0
    ct_train = 0
    ct_val = 0

    model.train()
    #load filenames of folder:
    tfrecord_files = os.listdir(Train_path)
    loop = tqdm(enumerate(tfrecord_files), total=len(tfrecord_files))
    for idx, record in loop:

        tfrecord_path = os.path.join(Train_path, record)
        dataset = TFRecordDataset(tfrecord_path, config.index_path,
                                  config.tfrecord_description)
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

        for batch in loader:
            ct_train += 1
            data_dict = tfrecord_preparer(batch,
                                          device=device,
                                          batch_size=batch_size)

            optimizer.zero_grad()

            loss_cells, loss_cols, loss_rows, stat_dict = model(
                data_dict, device, prediction_thres)

            total_loss = loss_cells + loss_cols + loss_rows
Example #14
    albert_pretrain.cuda()
    print(albert_pretrain.device)

# Create optimizer
optimizer = Lamb(
    [{"params": [p for n, p in list(albert_pretrain.named_parameters())]}],
    lr=LEARNING_RATE)

# FP16
albert_pretrain, optimizer = amp.initialize(albert_pretrain,
                                            optimizer,
                                            opt_level="O2")

albert_pretrain.train()
dataset = TFRecordDataset(pretrain_file, index_path=None, description=feat_map)
loader = torch.utils.data.DataLoader(dataset,
                                     batch_size=BATCH_SIZE,
                                     drop_last=True)

tmp_loss = 0
start_time = time.time()

if os.path.isfile('pretrain_checkpoint'):
    print(f"--- Load from checkpoint ---")
    checkpoint = torch.load("pretrain_checkpoint")
    albert_pretrain.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    epoch = checkpoint['epoch']
    loss = checkpoint['loss']
    losses = checkpoint['losses']
Example #15
from grid import *
MIXUP = False
GRIDMASK = False
NUMLABEL = 100
EPOCHS = 200

# SET GPU ID 0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

#CONFIGURE DATA
tfrecord_train = "/home/lizhaochen/FYP/data/cifar-100-data-im-0.05/train.tfrecords"
tfrecord_val = "/home/lizhaochen/FYP/data/cifar-100-data-im-0.05/eval.tfrecords"
index_path = None
description = {"image": "byte", "label": "int"}
train_dataset = TFRecordDataset(tfrecord_train, index_path, description)
val_dataset = TFRecordDataset(tfrecord_val, index_path, description)
trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=512)
valloader = torch.utils.data.DataLoader(val_dataset, batch_size=64)


# function for MIX UP
def mixup_data(x, y, alpha=1.0):
    '''Returns mixed inputs, pairs of targets, and lambda'''
    if alpha > 0:
        lam = np.random.beta(alpha, alpha)
    else:
        lam = 1

    batch_size = x.size()[0]
    index = torch.randperm(batch_size).cuda()

    # standard mixup: convex combination of each sample with a shuffled partner
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam
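The loss side of mixup is not shown in this excerpt; the usual companion (a sketch, assuming a standard criterion such as CrossEntropyLoss) interpolates the loss with the same lambda:

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    # Interpolate the loss between the two target sets with the mixing weight.
    return lam * criterion(pred, y_a) + (1 - lam) * criterion(pred, y_b)

# Hypothetical usage inside a training step:
#   inputs, targets_a, targets_b, lam = mixup_data(images, labels)
#   outputs = model(inputs)
#   loss = mixup_criterion(criterion, outputs, targets_a, targets_b, lam)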
Example #16
def predict(args, model):
    tf.gfile.MakeDirs(args.output_dir)
    #read prediction candidates from entire prediction input jsonl.gz file
    candidates_dict = read_candidates(args.predict_file)
    #Prediction!
    logging.info("start predicting!")

    #set up accumulators and the shard iterator used below
    full_tydi_pred_dict = {}
    total_num_examples = 0
    shards_iter = enumerate(
        ((f, 0, 0)
         for f in sorted(tf.gfile.Glob(args.precomputed_predict_file))), 1)

    #iterate over the shards and collect per-shard results
    for shard_num, (shard_filename, shard_num_examples,
                    shard_num_features) in shards_iter:
        all_results = []
        total_num_examples += shard_num_examples
        logging.info(
            "Shard %d: Running prediction for %s; %d examples, %d features.",
            shard_num, shard_filename, shard_num_examples, shard_num_features)
        print(shard_filename)
        #use tfrecord_dataset to read tfrecord into dataset for pytorch
        eval_dataset = TFRecordDataset(shard_filename, index_path=None)
        eval_dataloader = DataLoader(eval_dataset,
                                     batch_size=args.predict_batch_size,
                                     shuffle=False)

        for step, batch in enumerate(eval_dataloader):
            #switch the model to evaluation mode and disable gradient tracking
            model.eval()
            with torch.no_grad():
                outputs = model(
                    is_training=False,
                    input_ids=batch["input_ids"].long().to(DEVICE),
                    attention_mask=batch['input_mask'].long().to(DEVICE),
                    token_type_ids=batch['segment_ids'].long().to(DEVICE),
                )
                #print(torch.max(outputs[0], -1))
                #write results into RawResult format for post-process
                for num, (i, j, k) in enumerate(
                        zip(outputs[0], outputs[1], outputs[2])):
                    unique_ids = int(batch['unique_ids'][num])
                    start_logits = [float(x) for x in i]
                    end_logits = [float(x) for x in j]
                    answer_type_logits = [float(x) for x in k]
                    all_results.append(
                        RawResult(unique_id=unique_ids,
                                  start_logits=start_logits,
                                  end_logits=end_logits,
                                  answer_type_logits=answer_type_logits))
            print('At step %d of shard %s' % (step, shard_filename))

        predict_features = [
            tf.train.Example.FromString(r)
            for r in tf.python_io.tf_record_iterator(shard_filename)
        ]

        logging.info("Shard %d: Post-processing predictions.", shard_num)
        logging.info(
            "  Num candidate examples loaded (includes all shards): %d",
            len(candidates_dict))
        logging.info("  Num candidate features loaded: %d",
                     len(predict_features))
        logging.info("  Num prediction result features: %d", len(all_results))
        logging.info("  Num shard features: %d", shard_num_features)

        #pass candidates dict, raw results and features to postproc for later use
        tydi_pred_dict = postproc.compute_pred_dict(
            candidates_dict,
            predict_features, [r._asdict() for r in all_results],
            candidate_beam=args.candidate_beam)

        logging.info("Shard %d: Post-processed predictions.", shard_num)
        logging.info("  Num shard examples: %d", shard_num_examples)
        logging.info("  Num post-processed results: %d", len(tydi_pred_dict))
        if shard_num_examples != len(tydi_pred_dict):
            logging.warning("  Num missing predictions: %d",
                            shard_num_examples - len(tydi_pred_dict))
        for key, value in tydi_pred_dict.items():
            if key in full_tydi_pred_dict:
                logging.warning("ERROR: '%s' already in full_tydi_pred_dict!",
                                key)
            full_tydi_pred_dict[key] = value
        #break
    #Finish up predictions for all shards and start logging
    logging.info("Prediction finished for all shards.")
    logging.info("  Total input examples: %d", total_num_examples)
    logging.info("  Total output predictions: %d", len(full_tydi_pred_dict))

    with tf.gfile.Open(args.output_prediction_file, "w") as output_file:
        for prediction in full_tydi_pred_dict.values():
            output_file.write((json.dumps(prediction) + "\n").encode())
Example #17
# @Description  :
# https://github.com/spotify/tfreader
# https://www.w3cschool.cn/tensorflow_python

from pathlib import Path

# Torch
# https://github.com/vahidk/tfrecord
import torch
from tfrecord.torch.dataset import TFRecordDataset

p = "/Users/yuanjie/Desktop/Projects/Spark/MIPush/test-output.tfrecord"
tfrecord_paths = list(map(str, Path(p).glob("part*")))
index_path = None
description = {"id": "int", "feature": "int"}
dataset = TFRecordDataset(tfrecord_paths[0], index_path, description)
loader = torch.utils.data.DataLoader(dataset, batch_size=32)

data = next(iter(loader))
print(data)

# TF
# https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset
import tensorflow as tf

# d = tf.data.TFRecordDataset(input_file)
# d = d.shard(num_workers, worker_index)
# d = d.repeat(num_epochs)
# d = d.shuffle(shuffle_buffer_size)
# d = d.map(parser_fn, num_parallel_calls=num_map_threads)
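For reference, a minimal runnable sketch of the tf.data pipeline hinted at by the commented lines above, parsing the same two integer fields (the FixedLenFeature shapes are an assumption; use tf.io.VarLenFeature if "feature" holds a list rather than a scalar):

def _parse(serialized_example):
    # mirror the {"id": "int", "feature": "int"} description used above,
    # assuming both fields are scalar int64 values
    feature_spec = {
        "id": tf.io.FixedLenFeature([], tf.int64),
        "feature": tf.io.FixedLenFeature([], tf.int64),
    }
    return tf.io.parse_single_example(serialized_example, feature_spec)


d = tf.data.TFRecordDataset(tfrecord_paths)
d = d.map(_parse, num_parallel_calls=tf.data.AUTOTUNE)
d = d.batch(32)
print(next(iter(d)))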