Example #1
    # Imports assumed by this snippet: pandas as pd, torchvision.transforms as tf,
    # and fastai's untar_data, URLs and get_image_files. `MyPath`, `label_func` and
    # `splitter` are project-specific helpers defined elsewhere on the class.
    def __init__(self, root=MyPath.db_root_dir('bird'), split='train', transform=None):
        super(Birds, self).__init__()

        self.split = split          # 'train' or 'test'; used below to select the file list
        self.transform = transform
        self.resize = tf.Resize(256)

        # Download and unpack the CUB-200-2011 dataset with fastai.
        path = untar_data(URLs.CUB_200_2011)
        self.files = get_image_files(path/"images")

        # Class index -> class name (sorted for a deterministic ordering) and the reverse map.
        self.label = dict(enumerate(sorted(set(self.files.map(self.label_func)))))
        self.labels = {name: idx for idx, name in self.label.items()}

        # train_test_split.txt holds one "<image_id> <is_training_image>" pair per line.
        self.df = pd.read_csv(path/'train_test_split.txt', delimiter=' ',
                              header=None, names=['img_id', 'is_train'])

        if self.split == 'train':
            self.file_index = [r['img_id'] for r in self.df.to_dict('records') if r['is_train'] == 1]
        else:
            self.file_index = [r['img_id'] for r in self.df.to_dict('records') if r['is_train'] == 0]
        self.Files = [f for f in self.files if self.splitter(f) in self.file_index]
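
    # __len__ and __getitem__ are not shown in the original snippet; the sketch below is a
    # hypothetical completion inferred from the attributes set up in __init__ (it assumes
    # PIL is imported and that `label_func` returns the class name of a file).
    def __len__(self):
        return len(self.Files)

    def __getitem__(self, idx):
        fname = self.Files[idx]
        img = PIL.Image.open(fname).convert('RGB')
        img = self.resize(img)
        if self.transform is not None:
            img = self.transform(img)
        target = self.labels[self.label_func(fname)]   # class name -> integer label
        return img, target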
Example #2
# Imports needed by this snippet. `tokenizer` must be provided by the surrounding module;
# a spaCy tokenizer is assumed here (e.g. tokenizer = spacy.blank("en").tokenizer).
from collections import Counter, defaultdict

import pandas as pd
from tqdm import tqdm
from fastai.data.external import untar_data, URLs


def setup_data(vocab_size, min_frequency):
    def _create_vocab(df, vocab_size, min_frequency=1):
        counter = Counter()
        print(f"Starting parsing docs, in total {len(df.values)}")
        for doc in tqdm(df.values.tolist()):
            doc_counter = Counter([
                token.text for token in tokenizer(doc)
                if not token.is_stop and token.is_alpha
            ])
            counter += doc_counter

        vocab_strings = [
            token for token, count in counter.most_common(vocab_size)
            if count >= min_frequency
        ]
        # create a dictionary with a default of -1 for word not existing in our vocab
        vocab = defaultdict(
            lambda: -1,
            {value: key
             for key, value in enumerate(vocab_strings)})
        print(
            f"Created vocab of size {len(vocab)}. Most common words are {vocab_strings[:10]}"
        )
        return vocab, vocab_strings

    path = untar_data(URLs.WIKITEXT)
    df = pd.read_csv(path / 'train.csv', header=None).apply(lambda x: x[0],
                                                            axis=1)
    vocab, vocab_strings = _create_vocab(df, vocab_size, min_frequency)
    return df, vocab, vocab_strings
Example #3
def read_mnist():
    path = untar_data(URLs.MNIST_SAMPLE)

    threes_t = load_lazy('/tmp/mnist_sample_stacked3.pt',
                         (path / 'train' / '3').ls().sorted())
    seven_t = load_lazy('/tmp/mnist_sample_stacked7.pt',
                        (path / 'train' / '7').ls().sorted())

    threes_t_v = load_lazy('/tmp/mnist_sample_stacked3_valid.pt',
                           (path / 'valid' / '3').ls().sorted())
    seven_t_v = load_lazy('/tmp/mnist_sample_stacked7_valid.pt',
                          (path / 'valid' / '7').ls().sorted())

    return threes_t, seven_t, threes_t_v, seven_t_v
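
# `load_lazy` is not part of the snippet above; a minimal sketch of what such a helper
# might do, assuming it caches a stacked image tensor on disk (hypothetical):
from pathlib import Path

import numpy as np
import torch
from PIL import Image


def load_lazy(cache_path, image_files):
    """Return a stacked float tensor of the images, caching it at cache_path."""
    cache = Path(cache_path)
    if cache.exists():
        return torch.load(cache)
    stacked = torch.stack(
        [torch.tensor(np.array(Image.open(f)), dtype=torch.float32) for f in image_files]
    ) / 255.0
    torch.save(stacked, cache)
    return stacked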
Example #4
def get_paths():
    '''
    Download sample of COCO dataset
    Sample 10k images from COCO dataset
    Split train/val 80/20
    Return:
    train_paths (list), val_paths (list): image paths
    '''
    coco_path = untar_data(URLs.COCO_SAMPLE)
    coco_path = str(coco_path) + "/train_sample"
    
    paths = glob.glob(coco_path + "/*.jpg") # Grabbing all the image file names
    np.random.seed(123)
    paths_subset = np.random.choice(paths, 10_000, replace=False) # choosing 10,000 images randomly
    rand_idxs = np.random.permutation(10_000)
    train_idxs = rand_idxs[:8000] # choosing the first 8000 as training set
    val_idxs = rand_idxs[8000:] # choosing last 2000 as validation set
    return paths_subset[train_idxs], paths_subset[val_idxs]
Example #5
# - how to pass along a custom `splitter` to `Learner` to take advantage of transfer learning

# ## Preparing the data

# To make our data ready for training a model, we need to create a `DataLoaders` object in fastai. It is just a wrapper around a training `DataLoader` and a validation `DataLoader`, so if you already have your own PyTorch dataloaders, you can create such an object directly.
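#
# For example, that direct route would look like this (a toy sketch with random tensors,
# not data from this tutorial):

import torch
from torch.utils.data import DataLoader, TensorDataset
from fastai.data.core import DataLoaders

train_ds = TensorDataset(torch.randn(80, 3, 224, 224), torch.randint(0, 2, (80,)))
valid_ds = TensorDataset(torch.randn(20, 3, 224, 224), torch.randint(0, 2, (20,)))
dls = DataLoaders(DataLoader(train_ds, batch_size=16, shuffle=True),
                  DataLoader(valid_ds, batch_size=16))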
#
# Here we don't have anything ready yet. Usually, when using PyTorch, the first step is to create a `Dataset` that is then wrapped inside a `DataLoader`. We will do this first, then see how to change this `Dataset` into a `Transform` that will let us take advantage of fastai's functionality for showing a batch or using data augmentation on the GPU. Lastly we will see how we can customize the data block API and create our own new `TransformBlock`.

# ### Purely in PyTorch

# To begin with, we will only use PyTorch and PIL to create a `Dataset` and see how to get this inside fastai. The only helper functions from fastai we will use are `untar_data` (to download and untar the dataset) and `get_image_files` (that looks for all images in a folder recursively). Here, we will use the [Oxford-IIIT Pet Dataset](https://www.robots.ox.ac.uk/~vgg/data/pets/).


# `untar_data` returns a `pathlib.Path` object with the location of the decompressed dataset, and in this case, all the images are in an images subfolder:

path = untar_data(URLs.PETS)
files = get_image_files(path / "images")
files[0]

# We can open the first image with PIL and have a look at it:


img = PIL.Image.open(files[0])
img

# Let's wrap all the standard preprocessing (resize, conversion to tensor, dividing by 255 and reordering of the channels) in one helper function:



def open_image(fname, size=224):
    # Resize, convert to a tensor, reorder the channels to CxHxW and scale to [0, 1].
    img = PIL.Image.open(fname).convert('RGB')
    img = img.resize((size, size))
    t = torch.Tensor(np.array(img))
    return t.permute(2, 0, 1).float() / 255.0

Example #6
# Dataset methods for image colorization: each image is converted to the L*a*b* color
# space; the L channel (scaled to [-1, 1]) is the input and the ab channels (scaled to
# roughly [-1, 1]) are the target. Requires PIL's Image, skimage.color.rgb2lab and
# torchvision.transforms.
  def __getitem__(self, idx):
    img = Image.open(self.paths[idx]).convert("RGB")
    img = self.transforms(img)
    img = np.array(img)

    lab_img = rgb2lab(img).astype("float32")
    lab_img = transforms.ToTensor()(lab_img)

    L = lab_img[[0], ...] / 50. - 1.
    ab = lab_img[[1, 2], ...] / 110.

    return {"L": L, "ab": ab}

  def __len__(self):
    return len(self.paths)

root = str(untar_data(URLs.COCO_SAMPLE)) + "/train_sample"

paths = glob.glob(root + "/*.jpg")

np.random.seed(42)
paths_subset = np.random.choice(paths, 12_000, replace=False)

rand_idxs = np.random.permutation(12_000)
train_idxs = rand_idxs[:10_000]
val_idxs = rand_idxs[10_000:]

train_paths = paths_subset[train_idxs]
val_paths = paths_subset[val_idxs]

train_dset = TrainingDataset(train_paths)
val_dset = ValidationDataset(val_paths)
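
# For reference, TrainingDataset and ValidationDataset (used above) are not defined in
# this snippet. A minimal sketch of what they might look like (the transform choices and
# image size are assumptions), reusing the __getitem__/__len__ shown earlier:
from torch.utils.data import Dataset
from torchvision import transforms

class ColorizationDataset(Dataset):
    def __init__(self, paths, tfms):
        self.paths = list(paths)
        self.transforms = tfms
    # __getitem__ and __len__ as in the methods shown earlier in this example

class TrainingDataset(ColorizationDataset):
    def __init__(self, paths):
        super().__init__(paths, transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomHorizontalFlip(),   # light augmentation for training only
        ]))

class ValidationDataset(ColorizationDataset):
    def __init__(self, paths):
        super().__init__(paths, transforms.Resize((256, 256)))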
"""
Downloads test data used in CI
"""
from IPython import get_ipython
from fastai.data.external import untar_data, URLs
from fastai.torch_core import parallel
import pickle

urls = [
    'ADULT_SAMPLE', 'BIWI_SAMPLE', 'CAMVID_TINY', 'CIFAR', 'COCO_TINY', 'IMDB',
    'IMDB_SAMPLE', 'ML_SAMPLE', 'MNIST', 'MNIST_SAMPLE', 'MNIST_TINY', 'PETS'
]
url_list = [URLs.__dict__[k] for k in urls]
files = []
for u in url_list:
    print(f'Downloading {u}')
    files.append(untar_data(u))
Example #8
    #     splitter=RandomSplitter(0.1)
    #     )
    # dls_lm = dls_lm.dataloaders(df_all, bs=64, seq_len=72)
    # print(dls_lm.show_batch(max_n=3))

    # learn = language_model_learner(
    #     dls_lm, AWD_LSTM,
    #     metrics=[accuracy, Perplexity()]).to_fp16()
    # print(learn.model)

    # print(learn.lr_find())
    # learn.fine_tune(5, 1e-2, cbs=TensorBoardCallback(PATH_TENSORBOARD, trace_model=True))

    #%%
    # Prepare IMDB data
    path = untar_data(URLs.IMDB)
    bs = 32

    # Fine-tune pretrained language model (based on wikitext) to the IMDB corpus
    get_imdb = partial(get_text_files, folders=["train", "test", "unsup"])
    dls_lm = DataBlock(blocks=TextBlock.from_folder(path,
                                                    is_lm=True,
                                                    n_workers=4),
                       get_items=get_imdb,
                       splitter=RandomSplitter(0.1))
    dls_lm = dls_lm.dataloaders(path, path=path, bs=bs, seq_len=80)
    print(dls_lm.show_batch(max_n=3))

    # #%%
    # learn = language_model_learner(
    #     dls_lm, AWD_LSTM, drop_mult=0.3,
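    # The commented-out learner above is cut off; a sketch of how this step typically
    # continues, mirroring the commented block at the top of this example (the epoch
    # count and learning rate are assumptions):
    # learn = language_model_learner(
    #     dls_lm, AWD_LSTM, drop_mult=0.3,
    #     metrics=[accuracy, Perplexity()]).to_fp16()
    # learn.fine_tune(5, 1e-2)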
Example #9
from pathlib import Path
from pprint import pprint

import torch
from fastai.data.external import untar_data, URLs
from fastai.learner import Learner
from fastai.metrics import Precision, Recall, accuracy
from fastai.vision.augment import aug_transforms
from fastai.vision.core import PILImageBW
from fastai.vision.data import ImageBlock, ImageDataLoaders
from pytorch_model_summary import summary

from ml_for_programmers.config import Config

# %%
config = Config()
mnist_root = config.data_dir_path / "external/mnist_fastai"
mnist_root.mkdir(parents=True, exist_ok=True)

# %%
mnist_dir = untar_data(URLs.MNIST, dest=mnist_root)
mnist_dir

# %%
Path.BASE_PATH = mnist_dir
print(mnist_dir.ls(), "\n")
pprint(sorted((mnist_dir / "training").ls()))

# %%
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# %%
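# A possible next step (not part of the original excerpt): build grayscale MNIST
# dataloaders with the data block API. The block/splitter choices below are assumptions
# based on the imports above and the 'training'/'testing' layout of URLs.MNIST.
from fastai.data.block import DataBlock, CategoryBlock
from fastai.data.transforms import get_image_files, parent_label, GrandparentSplitter

mnist_block = DataBlock(
    blocks=(ImageBlock(cls=PILImageBW), CategoryBlock),  # grayscale images, class labels
    get_items=get_image_files,
    splitter=GrandparentSplitter(train_name="training", valid_name="testing"),
    get_y=parent_label,
)
dls = mnist_block.dataloaders(mnist_dir, bs=64, device=device)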