def __init__(
    self,
    urls,
    *,
    length=None,
    open_fn=gopen.reader,
    handler=reraise_exception,
    tarhandler=None,
    prepare_for_worker=True,
    initial_pipeline=None,
    shard_selection=worker_urls,
):
    """Initialize the dataset from a list of shard URLs or a brace-expandable URL pattern."""
    tarhandler = handler if tarhandler is None else tarhandler
    IterableDataset.__init__(self)
    SampleIterator.__init__(
        self,
        initial_pipeline=initial_pipeline,
        tarhandler=tarhandler,
        open_fn=open_fn,
    )
    # A string pattern like "shards-{0000..0099}.tar" is expanded into a list of URLs.
    if isinstance(urls, str):
        urls = list(braceexpand.braceexpand(urls))
    self.urls = urls
    self.length = length
    self.handler = handler
    self.total = 0
    self.reseed_hook = do_nothing
    self.node_selection = identity
    self.shard_selection = shard_selection
    self.shard_shuffle = identity

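# The constructor above accepts either an explicit list of shard URLs or a single
# brace pattern. A small, self-contained illustration of the string form (the shard
# names below are made up for the example):
import braceexpand

urls = list(braceexpand.braceexpand("shards-{0000..0003}.tar"))
print(urls)  # ['shards-0000.tar', 'shards-0001.tar', 'shards-0002.tar', 'shards-0003.tar']
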
def list_connected_datapipes(scan_obj, exclude_primitive):
    f = io.BytesIO()
    p = pickle.Pickler(f)  # Not going to work for lambdas, but dill infinite loops on typing and can't be used as is

    def stub_pickler(obj):
        return stub_unpickler, ()

    captured_connections = []

    def getstate_hook(obj):
        state = {}
        for k, v in obj.__dict__.items():
            if callable(v) or isinstance(v, PRIMITIVE):
                continue
            state[k] = v
        return state

    def reduce_hook(obj):
        if obj == scan_obj:
            raise NotImplementedError
        else:
            captured_connections.append(obj)
            return stub_unpickler, ()

    # TODO(VitalyFedyunin): Better do it as `with` context for safety
    IterableDataset.set_reduce_ex_hook(reduce_hook)
    if exclude_primitive:
        IterableDataset.set_getstate_hook(getstate_hook)
    p.dump(scan_obj)
    IterableDataset.set_reduce_ex_hook(None)
    if exclude_primitive:
        IterableDataset.set_getstate_hook(None)
    return captured_connections

import glob
import math

import numpy as np
from torch.utils.data import DataLoader


def train_val_split(path, batch_size=64, decoder=None):
    """Split the files matching `path` 90/10 into train and validation loaders."""
    samples = glob.glob(path)
    np.random.shuffle(samples)
    # Roughly 100k+ samples in total: ~90k for training, ~10k+ for validation.
    nb_train = math.ceil(0.9 * len(samples))
    train_samples = samples[:nb_train]
    # Note: IterableDataset here is the custom file-list dataset defined elsewhere
    # in this collection, not torch.utils.data.IterableDataset itself.
    train_dataset = IterableDataset(train_samples, transform=None, decoder=decoder)
    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    val_samples = samples[nb_train:]
    val_dataset = IterableDataset(val_samples, transform=None, decoder=decoder)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    return train_loader, val_loader, train_samples, val_samples

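# Minimal usage sketch for train_val_split; the glob pattern below is a placeholder,
# and decoder is left at its default (the snippet does not show what a decoder
# callable is expected to look like).
train_loader, val_loader, train_samples, val_samples = train_val_split(
    "/path/to/captcha/*.png",  # hypothetical sample location
    batch_size=32,
)
print(len(train_samples), "train samples,", len(val_samples), "val samples")
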
def __init__(
    self, dataset=None, workers=4, output_size=100, pin_memory=True, prefetch=-1
):
    """Fan `dataset` out across `workers` processes that feed a shared output queue."""
    IterableDataset.__init__(self)
    omp_warning()
    self.output_queue = mp.Queue(output_size)
    self.pin_memory = pin_memory
    self.jobs = []
    for i in range(workers):
        job = mp.Process(
            target=_parallel_job,
            args=(dataset, i, workers, prefetch, self.output_queue),
            daemon=True,
        )
        self.jobs.append(job)
        job.start()
    D("started")

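# _parallel_job is referenced above but not included in the snippet. A minimal sketch,
# assuming each worker process simply iterates the dataset and pushes samples onto the
# shared queue; the index/workers/prefetch handling here is illustrative only, not the
# actual implementation.
def _parallel_job_sketch(dataset, index, num_workers, prefetch, output_queue):
    count = 0
    for sample in dataset:
        output_queue.put(sample)
        count += 1
        # Stop early if a non-negative prefetch limit is given (assumption about semantics).
        if prefetch >= 0 and count >= prefetch:
            break
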
def list_connected_datapipes(scan_obj):
    f = io.BytesIO()
    p = pickle.Pickler(f)  # Not going to work for lambdas, but dill infinite loops on typing and can't be used as is

    def stub_pickler(obj):
        return stub_unpickler, ()

    captured_connections = []

    def reduce_hook(obj):
        if obj == scan_obj:
            raise NotImplementedError
        else:
            captured_connections.append(obj)
            return stub_unpickler, ()

    # TODO(VitalyFedyunin): Better do it as `with` context for safety
    IterableDataset.set_reduce_ex_hook(reduce_hook)
    p.dump(scan_obj)
    IterableDataset.set_reduce_ex_hook(None)
    return captured_connections

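# Both versions of list_connected_datapipes above call stub_unpickler without defining
# it. The reduce hooks only need a callable that reconstructs a throwaway stand-in
# object during pickling, so a minimal placeholder could look like this (an assumption,
# not necessarily the original definition):
def stub_unpickler():
    return "STUB"
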
def __init__(self):
    IterableDataset.__init__(self)
    self.images_and_density_maps = pipeline_results
    self.image_transform = torch_transforms.Compose([
        torch_transforms.ToTensor()
    ])

def torchIterableDataset():
    from torch.utils.data import Dataset, IterableDataset, DataLoader
    import torchvision.transforms as transforms
    import numpy as np
    from PIL import Image
    import os
    # These were used below but missing from the original imports.
    import glob
    import math
    import cv2

    train_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.RandomHorizontalFlip(),  # randomly flip the image horizontally
        transforms.RandomRotation(15),      # randomly rotate the image
        # convert the image to a Tensor and normalize values to [0, 1] (data normalization)
        transforms.ToTensor(),
    ])
    test_transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.ToTensor(),
    ])

    # Note: this class shadows the torch IterableDataset name within this function.
    class IterableDataset(IterableDataset):
        def __init__(self, filepath, transform=None):
            # super().__init__()
            self.filepath = filepath
            img_size = (50, 120)
            self.width, self.height = img_size[1], img_size[0]
            self.transform = transform

        def parseFile(self, filepath):
            with open(filepath, 'r') as f:
                for line in f:
                    token = line.strip('\n').strip(' ')
                    print('-----', line, token)
                    yield from token

        # def get_stream(self, filepath):
        #     from itertools import cycle
        #     return self.parseFile(filepath)

        def read_img(self, img_dir):
            # print(img_dir)
            for img in img_dir:
                print(img)
                # Binarize the grayscale image and stack it into 3 identical channels.
                img_gray = Image.open(img).convert('L')
                img_two = img_gray.point(lambda x: 255 if x > 129 else 0)
                one_channel = cv2.resize(np.array(img_two), (self.width, self.height))
                x = np.array([one_channel, one_channel, one_channel]).transpose(1, 2, 0)
                # x = cv2.resize(cv2.imread(img), (self.width, self.height))
                if self.transform is not None:
                    x = self.transform(x)
                # print(np.array(x).transpose(1, 2, 0).shape)
                # cv2.imshow('new', np.array(x).transpose(1, 2, 0))
                # cv2.waitKey(0)
                # The 4-character label is taken from the filename (e.g. "ab12.png").
                y = [keys.get(i) for i in img[-8:-4].lower()]
                # print('---', x, y)
                yield x, np.array(y)

        def __iter__(self):
            return self.read_img(self.filepath)

    samples = glob.glob(r'/Users/faith/Downloads/captcha-dataset/label/*png')
    # Remove files whose basename is not 8 characters (i.e. not a 4-char label + ".png").
    for s in samples:
        if len(os.path.basename(s)) != 8:
            print(s)
            os.remove(s)
    np.random.shuffle(samples)
    # Roughly 100k+ samples in total: ~90k for training, ~10k+ for validation.
    nb_train = math.ceil(0.9 * len(samples))
    train_samples = samples[:nb_train]
    test_samples = samples[nb_train:]
    train_dataset = IterableDataset(train_samples, transform=train_transform)
    train_loader = DataLoader(train_dataset, batch_size=2)
    print(train_loader)

    # Character vocabulary: digits 0-9 followed by lowercase letters a-z.
    letter_list = [chr(i) for i in range(97, 123)]
    char_list = [str(i) for i in range(0, 10)] + letter_list
    keys = {}
    values = {}
    for i, c in enumerate(char_list):
        keys[c] = i
        values[i] = c

    for i, data in enumerate(train_loader):
        print(i, data)
        break

    # test_dataset = IterableDataset(test_samples, transform=test_transform)
    # test_loader = DataLoader(test_dataset, batch_size=100)

    import pretrainedmodels
    import torch.nn as nn

    class CaptchaModel(nn.Module):
        def __init__(self, num_classes=len(keys)):
            super(CaptchaModel, self).__init__()
            model_name = 'xception'
            self.model = pretrainedmodels.__dict__[model_name](
                num_classes=1000, pretrained='imagenet')
            conv1 = self.model.conv1
            self.model.conv1 = nn.Conv2d(in_channels=3,
                                         out_channels=conv1.out_channels,
                                         kernel_size=conv1.kernel_size,
                                         stride=conv1.stride,
                                         padding=conv1.padding,
                                         bias=conv1.bias is not None)
            # copy pretrained weights (the second assignment is a no-op with in_channels=3)
            self.model.conv1.weight.data[:, :3, :, :] = conv1.weight.data
            self.model.conv1.weight.data[:, 3:, :, :] = conv1.weight.data[:, :1, :, :]
            self.model.avgpool = nn.AdaptiveAvgPool2d(1)
            in_features = self.model.last_linear.in_features
            self.model.last_linear = nn.Linear(in_features, num_classes)

        def forward(self, x):
            # Four outputs, presumably one per captcha character; note this runs the
            # backbone four separate times.
            return self.model(x), self.model(x), self.model(x), self.model(x)