示例#1
0
def huajianji(root):
    """Download the Huajianji dataset (Chinese classical literature).

    "Hua Jian Ji" is a collection of Chinese poetry compiled during the
    Five Dynasties and Ten Kingdoms period and the first anthology of
    literati ci poetry in Chinese literary history, containing classic
    works by Wen Tingjun, Wei Zhuang and others.

    Data storage directory:
    root = `/user/.../mydata`
    huajianji data:
    `root/huajianji/huajianji.json`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/huajianji`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/huajianji`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'huajianji')
    url = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/huajianji.json"
    # Keep the upstream file name (huajianji.json) for the local copy.
    rq.files(url, path_join(data_dir, url.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('huajianji dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#2
0
def ci_song(root):
    """Download the Song ci dataset (Chinese classical literature).

    Song ci and Tang poetry are twin artistic peaks of Chinese classical
    poetry; the "Complete Song Ci" gathers ci poems spanning the roughly
    three hundred years of the Song Dynasty in five volumes.

    Data storage directory:
    root = `/user/.../mydata`
    ci_song data:
    `root/ci_song/ci_song.json`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/ci_song`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/ci_song`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'ci_song')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/ci_song.json'
    # Keep the upstream file name (ci_song.json) for the local copy.
    rq.files(url, path_join(data_dir, url.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('ci_song dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#3
0
def mnist_kuzushiji_kanji(root):
    """Download Kuzushiji-Kanji from https://github.com/rois-codh/kmnist.

    Kuzushiji-Kanji is a large, highly imbalanced 64x64 dataset of 3832
    Kanji characters with 140,426 images of both common and rare characters.

    Attention: if exist dirs `root/mnist_kuzushiji_kanji`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji_kanji data:
    `root/mnist_kuzushiji_kanji/train/U+55C7/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+7F8E/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+9593/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/mnist_kuzushiji_kanji`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_kuzushiji_kanji`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji_kanji', make_root_dir=False)
    url = "http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar"
    archive = gfile.path_join(root, url.split('/')[-1])
    rq.files(url, archive)
    # Extract into the task dir, then rename the upstream folder to `train`.
    un_tar(archive, task_path)
    gfile.rename(gfile.path_join(task_path, 'kkanji2'),
                 gfile.path_join(task_path, 'train'))
    gfile.remove(gfile.path_join(root, 'kkanji.tar'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('mnist_kuzushiji_kanji dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#4
0
def poet_tang(root):
    """Download the Tang poetry dataset (Chinese classical literature).

    The "Complete Tang Poetry", compiled in 1705 (44th year of Qing
    Emperor Kangxi) by Peng Dingqiu and others, collects more than
    48,900 poems by over 2,200 poets in 900 volumes plus 12 volumes of
    catalogues.

    Data storage directory:
    root = `/user/.../mydata`
    poet_tang data:
    `root/poet_tang/poet_tang.json`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/poet_tang`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/poet_tang`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'poet_tang')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/poetry_tang.json.bz2'
    archive = path_join(data_dir, 'poet_tang.json.bz2')
    rq.files(url, archive)
    # Decompress in place, then drop the .bz2 archive.
    un_bz2(archive)
    remove(archive)
    minutes, seconds = divmod(time.time() - t0, 60)
    print('poet_tang dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#5
0
def poet_song(root):
    """Download the Song poetry dataset (Chinese classical literature).

    After the high prosperity of Tang poetry, Song poetry developed new
    ideas in content and artistic expression, produced many excellent
    writers and schools, and deeply influenced poetry of the Yuan, Ming
    and Qing dynasties.

    Data storage directory:
    root = `/user/.../mydata`
    poetry_song data:
    `root/poet_song/poet_song.json`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/poet_song`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/poet_song`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'poet_song')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/poetry_song.json.bz2'
    archive = path_join(data_dir, 'poet_song.json.bz2')
    rq.files(url, archive)
    # Decompress in place, then drop the .bz2 archive.
    un_bz2(archive)
    remove(archive)
    minutes, seconds = divmod(time.time() - t0, 60)
    print('poet_song dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#6
0
文件: _dm.py 项目: Hourout/tensordata
def arrhythmia(root):
    """Arrhythmia dataset from http://archive.ics.uci.edu/ml/datasets/Arrhythmia.
    
    This database contains 279 attributes, 206 of which are linear
    valued and the rest are nominal. 
    
    Data storage directory:
    root = `/user/.../mydata`
    arrhythmia data: 
    `root/arrhythmia/arrhythmia.txt`
    `root/arrhythmia/introduce.txt`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/arrhythmia`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/arrhythmia`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'arrhythmia')
    # The UCI `.names` file is the dataset description; saved as introduce.txt.
    url_introduce = 'http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.names'
    url_txt = 'http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data'
    rq.files(url_introduce,
             gfile.path_join(task_path, 'introduce.txt'),
             verbose=0)
    # The raw `.data` table is stored locally as arrhythmia.txt.
    rq.table(url_txt, gfile.path_join(task_path, 'arrhythmia.txt'))
    print('arrhythmia dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path
示例#7
0
def shijing(root):
    """Download the Shijing dataset (Chinese classical literature).

    The "Book of Songs" (Shijing) is the earliest collection of ancient
    Chinese poetry, gathering poems from the early Western Zhou Dynasty
    to the middle of the Spring and Autumn period (11th to 6th century
    BC), 311 pieces in total, six of which survive as titles only.

    Data storage directory:
    root = `/user/.../mydata`
    shijing data:
    `root/shijing/shijing.json`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/shijing`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/shijing`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'shijing')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/shijing.json'
    # Keep the upstream file name (shijing.json) for the local copy.
    rq.files(url, path_join(data_dir, url.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('shijing dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#8
0
def poetry_SouthernTang(root):
    """Download the Southern Tang poetry dataset (Chinese classical literature).

    "The Southern Tang Dynasty's two masters' ci" collects works of the
    Southern Tang rulers Li Jing and Li Yu. The book was compiled in the
    Southern Song Dynasty, and later generations produced various editions.

    Data storage directory:
    root = `/user/.../mydata`
    poetry_SouthernTang data:
    `root/poetry_SouthernTang/poetry_SouthernTang.json`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/poetry_SouthernTang`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/poetry_SouthernTang`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'poetry_SouthernTang')
    # Upstream file is named nantang_erzhu_poetry.json; it is stored
    # locally under the dataset's own name.
    url = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/nantang_erzhu_poetry.json"
    rq.files(url, path_join(data_dir, 'poetry_SouthernTang.json'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('poetry_SouthernTang dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#9
0
def economist(root, date, mode='pdf'):
    """The Economist from https://github.com/nailperry-zd/The-Economist.
    
    Data storage directory:
    root = `/user/.../mydata`
    economist data: 
    `root/...(pdf or epub or mobi)`
    Args:
        root: str, Store the absolute path of the data directory.
        date: str, eg:'2019-01-01'.
        mode: str, one of ['pdf', 'epub', 'mobi'].
    Returns:
        Store the absolute path of the data directory, is `root/...(pdf or epub or mobi)`.
    """
    start = time.time()
    assert mode in ['pdf', 'epub',
                    'mobi'], "`mode` should be one of ['pdf', 'epub', 'mobi']."
    # Issues are published weekly starting 2017-05-06; reject any `date`
    # that is before that anchor or not an exact multiple of 7 days after it.
    t = divmod((pd.to_datetime(date) - pd.to_datetime('2017-05-06')).days, 7)
    if t[0] < 0 or t[1] > 0:
        raise ValueError("No book that meets the date.")
    task_path = assert_dirs(root)
    t = 'https://github.com/nailperry-zd/The-Economist/raw/master/{}'.format(
        date)
    # NOTE(review): scrapes the GitHub directory listing HTML and keeps lines
    # that link to files of the requested format — fragile against GitHub
    # page-markup changes; verify against the live page layout.
    t = [
        i for i in requests.get(t).content.decode('utf-8').split('\n')
        if ('The-Economist/blob/master/{}'.format(date) in i) & (mode in i)
    ]
    # Extract the file name from the `title="..."` attribute of the first
    # matching anchor line (i[7:-1] strips `title="` and the closing quote).
    url = 'https://github.com/nailperry-zd/The-Economist/raw/master/' + date + '/' + [
        i[7:-1] for i in t[0].split(' ') if 'title' in i
    ][0]
    task_path = path_join(task_path, url.split('/')[-1])
    rq.files(url, task_path, verbose=1)
    print('economist dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path
示例#10
0
def arxiv(root, ids, new_name=None):
    """Download a paper PDF from https://arxiv.org.

    Data storage directory:
    root = `/user/.../mydata`
    `ids`.pdf data:
    `root/arxiv/`ids`.pdf` or `root/arxiv/`new_name`.pdf`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/arxiv`,
              root should be `/user/.../mydata`.
        ids: str, arxiv paper id.
             example:ids = '1605.09782' mean you want get paper links https://arxiv.org/abs/1605.09782.
        new_name: str, default None. if not None, download file path is `root/arxiv/new_name.pdf`.
    Returns:
        Store the absolute path of the data directory, is `root/arxiv`.
    """
    t0 = time.time()
    assert gfile.isdir(root), '`root` should be directory.'
    assert isinstance(ids, str), '`ids` type should be str.'
    # Local file name defaults to the paper id unless a new name is given.
    stem = ids if new_name is None else new_name
    task_path = gfile.path_join(root, 'arxiv', stem + '.pdf')
    gfile.makedirs(gfile.path_join(root, 'arxiv'))
    # Remove a stale copy (if any) before re-downloading.
    gfile.remove(task_path)
    rq.files('https://arxiv.org/pdf/' + str(ids) + '.pdf', task_path)
    minutes, seconds = divmod(time.time() - t0, 60)
    print('arxiv paper download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#11
0
def mnist_fashion(root):
    """Download Fashion-MNIST, Zalando's article-image dataset.

    Fashion mnist is a drop-in replacement of the original MNIST dataset
    from https://github.com/zalandoresearch/fashion-mnist.
    Each sample is an gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if exist dirs `root/mnist_fashion`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_fashion data:
    `root/mnist_fashion/train/0/xx.png`
    `root/mnist_fashion/train/2/xx.png`
    `root/mnist_fashion/train/6/xx.png`
    `root/mnist_fashion/test/0/xx.png`
    `root/mnist_fashion/test/2/xx.png`
    `root/mnist_fashion/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/mnist_fashion`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_fashion`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'mnist_fashion')
    urls = ['http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
            'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
            'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
            'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz']
    for u in urls:
        rq.files(u, gfile.path_join(task_path, u.split('/')[-1]))

    def _read_labels(fname):
        # IDX label file: 8-byte header, then one uint8 label per sample.
        with gzip.open(gfile.path_join(task_path, fname), 'rb') as f:
            return np.frombuffer(f.read(), np.uint8, offset=8)

    def _read_images(fname, count):
        # IDX image file: 16-byte header, then 28x28 uint8 pixels per sample.
        with gzip.open(gfile.path_join(task_path, fname), 'rb') as f:
            return np.frombuffer(f.read(), np.uint8, offset=16).reshape(count, 28, 28)

    train_label = _read_labels('train-labels-idx1-ubyte.gz')
    train = _read_images('train-images-idx3-ubyte.gz', len(train_label))
    test_label = _read_labels('t10k-labels-idx1-ubyte.gz')
    test = _read_images('t10k-images-idx3-ubyte.gz', len(test_label))
    # One sub-directory per class label; images saved as <index>.png.
    for split, images, labels in (('train', train, train_label),
                                  ('test', test, test_label)):
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for idx in range(images.shape[0]):
            save_image(gfile.path_join(task_path, split, str(labels[idx]), str(idx)+'.png'),
                       array_to_image(images[idx].reshape(28, 28, 1)))
    # Drop the downloaded archives once the PNGs are written.
    for u in urls:
        gfile.remove(gfile.path_join(task_path, u.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('mnist_fashion dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#12
0
def cifar100(root, fine_label=True):
    """CIFAR100 image classification dataset from https://www.cs.toronto.edu/~kriz/cifar.html
    
    Each sample is an image (in 3D NDArray) with shape (32, 32, 3).
    
    Attention: if exist dirs `root/cifar100`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    cifar100 data: 
    `root/cifar100/train/0/xx.png`
    `root/cifar100/train/2/xx.png`
    `root/cifar100/train/6/xx.png`
    `root/cifar100/test/0/xx.png`
    `root/cifar100/test/2/xx.png`
    `root/cifar100/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/cifar100`,
              root should be `/user/.../mydata`.
        fine_label: bool, default True.
                    Whether to load the fine-grained (100 classes) or 
                    coarse-grained (20 super-classes) labels.
    Returns:
        Store the absolute path of the data directory, is `root/cifar100`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'cifar100')
    url = 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/cifar100/cifar-100-binary.tar.gz'
    rq.files(url, gfile.path_join(task_path, url.split('/')[-1]))
    with tarfile.open(gfile.path_join(task_path, url.split('/')[-1])) as t:
        t.extractall(task_path)
    # Snapshot of extracted/archive entries; all removed after conversion.
    noise_flie = gfile.listdir(task_path)
    with open(gfile.path_join(task_path, 'train.bin'), 'rb') as fin:
        # Each record: 2 label bytes (coarse, fine) + 3072 pixel bytes.
        data = np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3072 + 2)
        train = data[:, 2:].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
        # Column 0 is the coarse label, column 1 the fine label; `0 + fine_label`
        # selects between them via bool-to-int coercion.
        train_label = data[:, 0 + fine_label].astype(np.int32)
    for i in set(train_label):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(i)))
    for idx in range(train.shape[0]):
        save_image(
            gfile.path_join(task_path, 'train', str(train_label[idx]),
                            str(idx) + '.png'), array_to_image(train[idx]))
    with open(gfile.path_join(task_path, 'test.bin'), 'rb') as fin:
        data = np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3072 + 2)
        test = data[:, 2:].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
        test_label = data[:, 0 + fine_label].astype(np.int32)
    for i in set(test_label):
        gfile.makedirs(gfile.path_join(task_path, 'test', str(i)))
    for idx in range(test.shape[0]):
        save_image(
            gfile.path_join(task_path, 'test', str(test_label[idx]),
                            str(idx) + '.png'), array_to_image(test[idx]))
    for file in noise_flie:
        gfile.remove(gfile.path_join(task_path, file))
    print('cifar100 dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path
示例#13
0
def mnist(root):
    """Download the MNIST handwritten digits dataset from http://yann.lecun.com/exdb/mnist

    Each sample is an gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if exist dirs `root/mnist`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist data:
    `root/mnist/train/0/xx.png`
    `root/mnist/train/2/xx.png`
    `root/mnist/train/6/xx.png`
    `root/mnist/test/0/xx.png`
    `root/mnist/test/2/xx.png`
    `root/mnist/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/mnist`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'mnist')
    urls = ['https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-labels-idx1-ubyte.gz',
            'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-images-idx3-ubyte.gz',
            'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-labels-idx1-ubyte.gz',
            'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-images-idx3-ubyte.gz']
    for u in urls:
        rq.files(u, gfile.path_join(task_path, u.split('/')[-1]))

    def _read_labels(fname):
        # IDX label file: 8-byte header, then one uint8 label per sample.
        with gzip.open(gfile.path_join(task_path, fname), 'rb') as f:
            return np.frombuffer(f.read(), np.uint8, offset=8)

    def _read_images(fname, count):
        # IDX image file: 16-byte header, then 28x28 uint8 pixels per sample.
        with gzip.open(gfile.path_join(task_path, fname), 'rb') as f:
            return np.frombuffer(f.read(), np.uint8, offset=16).reshape(count, 28, 28)

    train_label = _read_labels('train-labels-idx1-ubyte.gz')
    train = _read_images('train-images-idx3-ubyte.gz', len(train_label))
    test_label = _read_labels('t10k-labels-idx1-ubyte.gz')
    test = _read_images('t10k-images-idx3-ubyte.gz', len(test_label))
    # One sub-directory per digit class; images saved as <index>.png.
    for split, images, labels in (('train', train, train_label),
                                  ('test', test, test_label)):
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for idx in range(images.shape[0]):
            save_image(gfile.path_join(task_path, split, str(labels[idx]), str(idx)+'.png'),
                       array_to_image(images[idx].reshape(28, 28, 1)))
    # Drop the downloaded archives once the PNGs are written.
    for u in urls:
        gfile.remove(gfile.path_join(task_path, u.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('mnist dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#14
0
def mnist_kuzushiji10(root):
    """Download Kuzushiji-MNIST from https://github.com/rois-codh/kmnist.

    Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset
    (28x28 grayscale, 70,000 images), provided in the original MNIST
    format as well as a NumPy format. Since MNIST restricts us to 10
    classes, one character was chosen to represent each of the 10 rows
    of Hiragana.

    Each sample is an gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if exist dirs `root/mnist_kuzushiji10`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji10 data:
    `root/mnist_kuzushiji10/train/0/xx.png`
    `root/mnist_kuzushiji10/train/2/xx.png`
    `root/mnist_kuzushiji10/train/6/xx.png`
    `root/mnist_kuzushiji10/test/0/xx.png`
    `root/mnist_kuzushiji10/test/2/xx.png`
    `root/mnist_kuzushiji10/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/mnist_kuzushiji10`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_kuzushiji10`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji10')
    urls = ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz']
    for u in urls:
        rq.files(u, gfile.path_join(task_path, u.split('/')[-1]))

    def _load(fname):
        # Each .npz archive stores a single array under the key 'arr_0'.
        return np.load(gfile.path_join(task_path, fname))['arr_0']

    train = _load('kmnist-train-imgs.npz')
    train_label = _load('kmnist-train-labels.npz')
    test = _load('kmnist-test-imgs.npz')
    test_label = _load('kmnist-test-labels.npz')
    # One sub-directory per class label; images saved as <index>.png.
    for split, images, labels in (('train', train, train_label),
                                  ('test', test, test_label)):
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for idx in range(images.shape[0]):
            save_image(gfile.path_join(task_path, split, str(labels[idx]), str(idx)+'.png'),
                       array_to_image(images[idx].reshape(28, 28, 1)))
    # Drop the downloaded archives once the PNGs are written.
    for u in urls:
        gfile.remove(gfile.path_join(task_path, u.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('mnist_kuzushiji10 dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#15
0
def mnist_kuzushiji49(root):
    """Download Kuzushiji-49 from https://github.com/rois-codh/kmnist.

    Kuzushiji-49, as the name suggests, has 49 classes (28x28 grayscale,
    270,912 images); it is a much larger, but imbalanced dataset containing
    48 Hiragana characters and one Hiragana iteration mark.

    Each sample is an gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if exist dirs `root/mnist_kuzushiji49`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji49 data:
    `root/mnist_kuzushiji49/train/0/xx.png`
    `root/mnist_kuzushiji49/train/2/xx.png`
    `root/mnist_kuzushiji49/train/6/xx.png`
    `root/mnist_kuzushiji49/test/0/xx.png`
    `root/mnist_kuzushiji49/test/2/xx.png`
    `root/mnist_kuzushiji49/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/mnist_kuzushiji49`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_kuzushiji49`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji49')
    urls = ['http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-labels.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-imgs.npz',
            'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-labels.npz']
    for u in urls:
        rq.files(u, gfile.path_join(task_path, u.split('/')[-1]))

    def _load(fname):
        # Each .npz archive stores a single array under the key 'arr_0'.
        return np.load(gfile.path_join(task_path, fname))['arr_0']

    train = _load('k49-train-imgs.npz')
    train_label = _load('k49-train-labels.npz')
    test = _load('k49-test-imgs.npz')
    test_label = _load('k49-test-labels.npz')
    # One sub-directory per class label; images saved as <index>.png.
    for split, images, labels in (('train', train, train_label),
                                  ('test', test, test_label)):
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for idx in range(images.shape[0]):
            save_image(gfile.path_join(task_path, split, str(labels[idx]), str(idx)+'.png'),
                       array_to_image(images[idx].reshape(28, 28, 1)))
    # Drop the downloaded archives once the PNGs are written.
    for u in urls:
        gfile.remove(gfile.path_join(task_path, u.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('mnist_kuzushiji49 dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#16
0
文件: _dm.py 项目: Hourout/tensordata
def wine(root):
    """Title of Database: Wine recognition data
    Updated Sept 21, 1998 by C.Blake : Added attribute information
    
    These data are the results of a chemical analysis of
    wines grown in the same region in Italy but derived from three
    different cultivars.
    The analysis determined the quantities of 13 constituents
    found in each of the three types of wines. 
    
    Number of Instances
    class 1 59
    class 2 71
    class 3 48
    
    Data storage directory:
    root = `/user/.../mydata`
    wine data: 
    `root/wine/wine.txt`
    `root/wine/introduce.txt`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/wine`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/wine`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'wine')
    # The UCI `.names` file is the dataset description; saved as introduce.txt.
    url_introduce = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
    url_txt = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
    rq.files(url_introduce,
             gfile.path_join(task_path, 'introduce.txt'),
             verbose=0)
    # Data table saved as wine.txt with explicit column names
    # (first column is the class label, then the 13 constituents).
    rq.table(url_txt,
             gfile.path_join(task_path, 'wine.txt'),
             names=[
                 'label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
                 'Magnesium', 'Total phenols', 'Flavanoids',
                 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity',
                 'Hue', 'OD280/OD315 of diluted wines', 'Proline'
             ])
    print('wine dataset download completed, run time %d min %.2f sec' % divmod(
        (time.time() - start), 60))
    return task_path
示例#17
0
def coil100(root):
    """Download COIL100 from http://www.cs.columbia.edu/CAVE/software/softlib/coil-100.php

    "Columbia Object Image Library (COIL-100),"
    S. A. Nene, S. K. Nayar and H. Murase,
    Technical Report CUCS-006-96, February 1996.

    Each sample is an gray image (in 3D NDArray) with shape (128, 128, 1).
    Attention: if exist dirs `root/coil100`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    coil100 data:
    `root/coil100/train/0/xx.png`
    `root/coil100/train/2/xx.png`
    `root/coil100/train/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/coil100`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/coil100`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'coil100')
    url = "http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-100/coil-100.zip"
    archive = gfile.path_join(task_path, 'coil100.zip')
    rq.files(url, archive)
    un_zip(archive)
    # Index the extracted files; the object id is embedded in each file
    # name as `obj<ID>__<angle>...`, so strip `obj` and take what precedes `__`.
    frame = pd.DataFrame(gfile.listdir(gfile.path_join(task_path, 'coil100', 'coil-100')),
                         columns=['image'])
    frame['label'] = frame.image.map(lambda name: name.split('__')[0][3:])
    frame['image_old_path'] = frame.image.map(
        lambda name: gfile.path_join(task_path, 'coil100', 'coil-100', name))
    frame['image_new_path'] = (frame.label + '/' + frame.image).map(
        lambda rel: gfile.path_join(task_path, 'train', rel))
    for label in frame.label.unique():
        gfile.makedirs(gfile.path_join(task_path, 'train', label))
    for src, dst in zip(frame.image_old_path, frame.image_new_path):
        gfile.copy(src, dst)
    # Clean up the archive, the extraction dir, and a stray upstream script.
    gfile.remove(archive)
    gfile.remove(gfile.path_join(task_path, 'coil100'))
    gfile.remove(gfile.path_join(task_path, 'train', 'vertGroupppm2png.pl'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('coil100 dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#18
0
def caltech101(root):
    """Caltech101 dataset from http://www.vision.caltech.edu/Image_Datasets/Caltech101
    
    Pictures of objects belonging to 101 categories. 
    About 40 to 800 images per category.
    Most categories have about 50 images. 
    Collected in September 2003 by Fei-Fei Li, Marco Andreetto, 
    and Marc 'Aurelio Ranzato.  
    The size of each image is roughly 300 x 200 pixels.

    We have carefully clicked outlines of each object in these pictures, 
    these are included under the 'Annotations.tar'.
    There is also a matlab script to view the annotations, 'show_annotations.m'.
    
    Attention: if exist dirs `root/caltech101`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    caltech101 data: 
    `root/caltech101/train/accordion/xx.jpg`
    `root/caltech101/train/brain/xx.jpg`
    `root/caltech101/train/panda/xx.jpg`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/caltech101`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/caltech101`.
    """
    start = time.time()
    # NOTE(review): unlike the other dataset helpers, the archive is downloaded
    # into `root` (not task_path); make_root_dir=False presumably defers
    # creating `task_path` to un_tar — confirm against assert_dirs' contract.
    task_path = assert_dirs(root, 'caltech101', make_root_dir=False)
    url = 'http://www.vision.caltech.edu/Image_Datasets/Caltech101/101_ObjectCategories.tar.gz'
    rq.files(url, gfile.path_join(root, url.split('/')[-1]))
    # un_gz yields the .tar next to the .gz; un_tar extracts it into task_path.
    un_tar(un_gz(gfile.path_join(root, url.split('/')[-1])), task_path)
    # The archive's top-level folder becomes the conventional `train/` split.
    gfile.rename(gfile.path_join(task_path, '101_ObjectCategories'),
                 gfile.path_join(task_path, 'train'))
    # Remove both the downloaded .tar.gz and the intermediate .tar from `root`.
    for i in ['101_ObjectCategories.tar.gz', '101_ObjectCategories.tar']:
        gfile.remove(gfile.path_join(root, i))
    print('caltech101 dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path
示例#19
0
def abbreviation(root):
    """Chinese abbreviation datasets.
    
    datasets url:`https://github.com/zhangyics/Chinese-abbreviation-dataset`
    
    A corpus of Chinese abbreviation
    This is the dataset released by the paper "A Chinese Dataset with Negative Full 
    Forms for General Abbreviation Prediction".
    
    Data storage directory:
    root = `/user/.../mydata`
    Chinese abbreviation datasets data: 
    `root/chinese_abbreviation/train_set.txt`
    `root/chinese_abbreviation/test_set.txt`
    `root/chinese_abbreviation/dev_set.txt`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/chinese_abbreviation`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/chinese_abbreviation`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'chinese_abbreviation')
    # All three splits live under the same raw-GitHub directory; fetch each
    # into task_path under its original filename.
    base = "https://raw.githubusercontent.com/zhangyics/Chinese-abbreviation-dataset/master/"
    for filename in ('train_set.txt', 'test_set.txt', 'dev_set.txt'):
        rq.files(base + filename, path_join(task_path, filename))
    print(
        'chinese abbreviation dataset download completed, run time %d min %.2f sec'
        % divmod((time.time() - start), 60))
    return task_path
示例#20
0
文件: _dm.py 项目: Hourout/tensordata
def abalone(root):
    """Predicting the age of abalone from physical measurements.
    
    The age of abalone is determined by cutting the shell through the cone, 
    staining it, and counting the number of rings through a microscope -- a boring and
    time-consuming task.  Other measurements, which are easier to obtain, are
    used to predict the age.  Further information, such as weather patterns
    and location (hence food availability) may be required to solve the problem.
    
    Data storage directory:
    root = `/user/.../mydata`
    abalone data: 
    `root/abalone/abalone.txt`
    `root/abalone/introduce.txt`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/abalone`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/abalone`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'abalone')
    url_introduce = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.names'
    url_txt = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data'
    rq.files(url_introduce,
             gfile.path_join(task_path, 'introduce.txt'),
             verbose=0)
    # BUG FIX: a missing comma after 'Height' used to merge it with
    # 'Whole_weight' via implicit string concatenation ('HeightWhole_weight'),
    # producing 8 column names for the dataset's 9 columns.
    rq.table(url_txt,
             gfile.path_join(task_path, 'abalone.txt'),
             names=[
                 'Sex', 'Length', 'Diameter', 'Height',
                 'Whole_weight', 'Shucked_weight', 'Viscera_weight',
                 'Shell_weight', 'label'
             ])
    print('abalone dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path
示例#21
0
def lunyu(root):
    """Lunyu dataset from Chinese classical literature.
    
    The Chinese Confucian classics, "The Analects of Confucius" is 
    a collection of quotations of Confucius and his disciples. 
    It was written by Confucius disciples and re-transmission disciples, 
    and was written in the early period of the Warring States Period. 
    The book consists of 20 chapters and 492 chapters. 
    It is mainly composed of quotations and supplemented by narratives. 
    It mainly records the words and deeds of Confucius and his disciples, 
    and more concentratedly reflects Confucius' political opinions, 
    ethical thoughts, moral concepts and educational principles. 
    This book is one of the classic works of Confucianism. 
    It is also called "Four Books" with "University", 
    "The Doctrine of the Mean" and "Mencius",
    plus "The Book of Songs", "Shangshu", "Book of Rites", 
    "Zhou Yi", "Spring and Autumn", collectively called "four books". Five Classics."
    
    Data storage directory:
    root = `/user/.../mydata`
    lunyu data: 
    `root/lunyu/lunyu.json`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/lunyu`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/lunyu`.
    """
    begin = time.time()
    task_path = assert_dirs(root, 'lunyu')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/lunyu.json'
    # Save under the URL's basename (lunyu.json) inside task_path.
    filename = url.split('/')[-1]
    rq.files(url, path_join(task_path, filename))
    minutes, seconds = divmod(time.time() - begin, 60)
    print('lunyu dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#22
0
def youmengying(root):
    """Youmengying dataset from Chinese classical literature.
    
    "You Meng Ying" is an anthology of Zhang Chao's 
    creations by Qing Dynasty writers.
    
    Data storage directory:
    root = `/user/.../mydata`
    youmengying data: 
    `root/youmengying/youmengying.json`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/youmengying`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/youmengying`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'youmengying')
    source = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/youmengying.json"
    # Download the JSON corpus into task_path, keeping the remote basename.
    rq.files(source, path_join(task_path, source.split('/')[-1]))
    elapsed = divmod(time.time() - t0, 60)
    print('youmengying dataset download completed, run time %d min %.2f sec' % elapsed)
    return task_path
示例#23
0
def mnist_kannada(root):
    """kannada-MNIST from https://github.com/vinayprabhu/Kannada_MNIST.
    
    The Kannada-MNIST dataset was created an a drop-in substitute for the standard MNIST dataset.
    
    Each sample is an gray image (in 3D NDArray) with shape (28, 28, 1).
    
    Attention: if exist dirs `root/mnist_kannada`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kannada data: 
    `root/mnist_kannada/train/0/xx.png`
    `root/mnist_kannada/train/2/xx.png`
    `root/mnist_kannada/train/6/xx.png`
    `root/mnist_kannada/test/0/xx.png`
    `root/mnist_kannada/test/2/xx.png`
    `root/mnist_kannada/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/mnist_kannada`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_kannada`.
    """
    start = time.time()
    print('Downloading data from https://github.com/Hourout/datasets/releases/download/0.0.1/kannada_MNIST.zip')
    task_path = assert_dirs(root, 'mnist_kannada')
    archive = task_path + '/kannada_MNIST.zip'
    zip_path = rq.files('https://github.com/Hourout/datasets/releases/download/0.0.1/kannada_MNIST.zip', archive)
    unzip_path = un_zip(archive)
    # Each CSV row is: label followed by 784 pixel values (uint8).
    frames = {
        'train': pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_train.csv'), header=None, dtype='uint8'),
        'test': pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_test.csv'), header=None, dtype='uint8'),
    }
    # One directory per class label, mirrored under train/ and test/.
    for label in set(frames['train'][0]):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(label)))
        gfile.makedirs(gfile.path_join(task_path, 'test', str(label)))
    # Write every sample out as `<split>/<label>/<row_index>.png`.
    for split in ('train', 'test'):
        frame = frames[split]
        for row in range(len(frame)):
            image = array_to_image(frame.iloc[row, 1:].values.reshape(28, 28, 1))
            save_image(gfile.path_join(task_path, split, str(frame.iat[row, 0]), str(row) + '.png'), image)
    gfile.remove(zip_path)
    gfile.remove(unzip_path)
    print('mnist_kannada dataset download completed, run time %d min %.2f sec' % divmod((time.time() - start), 60))
    return task_path
示例#24
0
def _download(path):
    """Download one file described by a pipe-delimited spec.

    Args:
        path: str, formatted as `<dest_dir>|<url>`; the file is saved into
              `<dest_dir>` under the URL's basename.
    """
    # Split once instead of three times (the original re-split per access).
    parts = path.split('|')
    dest_dir, url = parts[0], parts[1]
    rq.files(url, gfile.path_join(dest_dir, url.split('/')[-1]))
示例#25
0
def stl10(root):
    """Stl10 dataset from http://ai.stanford.edu/~acoates/stl10
    
    The STL-10 dataset is an image recognition dataset for developing 
    unsupervised feature learning, deep learning, self-taught learning algorithms.
    It is inspired by the CIFAR-10 dataset but with some modifications. 
    In particular, each class has fewer labeled training examples than in CIFAR-10, 
    but a very large set of unlabeled examples is provided to learn image models 
    prior to supervised training. The primary challenge is to make use of the 
    unlabeled data (which comes from a similar but different 
    distribution from the labeled data) to build a useful prior. 
    We also expect that the higher resolution of this dataset (96x96) 
    will make it a challenging benchmark for developing 
    more scalable unsupervised learning methods.
    
    Attention: if exist dirs `root/stl10`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    stl10 data: 
    `root/stl10/train/1/xx.png`
    `root/stl10/train/4/xx.png`
    `root/stl10/train/8/xx.png`
    `root/stl10/test/1/xx.png`
    `root/stl10/test/4/xx.png`
    `root/stl10/test/8/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/stl10`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/stl10`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'stl10')
    url = "http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz"
    rq.files(url, gfile.path_join(task_path, url.split('/')[-1]))
    un_tar(un_gz(gfile.path_join(task_path, url.split('/')[-1])))

    def _read_images(name):
        # Binary files store uint8 samples as (N, 3, 96, 96); the
        # (0, 3, 2, 1) transpose converts each to HWC (column-major pixel
        # order in the upstream format).
        with open(gfile.path_join(task_path, 'stl10_binary/stl10_binary/' + name), 'rb') as fin:
            return np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3, 96, 96).transpose((0, 3, 2, 1))

    def _read_labels(name):
        # One uint8 class label per sample.
        with open(gfile.path_join(task_path, 'stl10_binary/stl10_binary/' + name), 'rb') as fin:
            return np.frombuffer(fin.read(), dtype=np.uint8)

    # Labeled splits: write each image to `<split>/<label>/<index>.png`.
    # (The original duplicated this code verbatim for test then train.)
    for split in ('test', 'train'):
        data = _read_images(split + '_X.bin')
        data_label = _read_labels(split + '_y.bin')
        for i in set(data_label):
            gfile.makedirs(gfile.path_join(task_path, split, str(i)))
        for idx in range(data.shape[0]):
            save_image(gfile.path_join(task_path, split, str(data_label[idx]), str(idx) + '.png'), array_to_image(data[idx]))

    # Unlabeled split: images only, flat directory.
    data = _read_images('unlabeled_X.bin')
    gfile.makedirs(gfile.path_join(task_path, 'unlabeled'))
    for idx in range(data.shape[0]):
        save_image(gfile.path_join(task_path, 'unlabeled', str(idx) + '.png'), array_to_image(data[idx]))

    gfile.remove(gfile.path_join(task_path, 'stl10_binary.tar.gz'))
    gfile.remove(gfile.path_join(task_path, 'stl10_binary.tar'))
    # CONSISTENCY FIX: was a bare `path_join` while every other call in this
    # function uses `gfile.path_join`.
    gfile.remove(gfile.path_join(task_path, 'stl10_binary'))
    print('stl10 dataset download completed, run time %d min %.2f sec' % divmod((time.time() - start), 60))
    return task_path