示例#1
0
def ctrip_hotel(root):
    """Download the Ctrip hotel reviews dataset.

    Source: `https://github.com/SophonPlus/ChineseNlpCorpus/blob/
    master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv`

    The corpus holds 7000+ review samples: more than 5,000 positive
    reviews and more than 2,000 negative reviews.

    Files written under `root`:
    `root/chinese_reviews_ctrip_hotel/chinese_reviews_ctrip_hotel.txt`
    `root/chinese_reviews_ctrip_hotel/chinese_reviews_ctrip_hotel.json`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/chinese_reviews_ctrip_hotel`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_reviews_ctrip_hotel`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'chinese_reviews_ctrip_hotel')
    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_ctrip_hotel/chinese_reviews_ctrip_hotel.json'
    table_source = 'https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv'
    rq.json(json_source, path_join(data_dir, 'chinese_reviews_ctrip_hotel.json'))
    rq.table(table_source, path_join(data_dir, 'chinese_reviews_ctrip_hotel.txt'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('chinese_reviews_ctrip_hotel dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#2
0
def sina_weibo(root):
    """Download the Chinese Sina weibo reviews dataset.

    The corpus holds 110,000+ samples: more than 59,000 positive
    reviews and more than 59,000 negative reviews.

    Files written under `root`:
    `root/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.json`
    `root/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.txt`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/chinese_reviews_sina_weibo`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_reviews_sina_weibo`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'chinese_reviews_sina_weibo')
    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.json'
    table_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.csv'
    rq.json(json_source, path_join(data_dir, 'chinese_reviews_sina_weibo.json'))
    rq.table(table_source, path_join(data_dir, 'chinese_reviews_sina_weibo.txt'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('chinese_reviews_sina_weibo dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#3
0
def sina_weibo_emotion4(root):
    """Download the Chinese Sina weibo 4-emotion reviews dataset.

    Contains 360,000+ samples over 4 emotions: about 200,000 joy
    samples, and more than 50,000 each for anger, disgust and low.

    Files written under `root`:
    `root/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4.json`
    `root/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4.txt`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/chinese_reviews_sina_weibo_emotion4`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_reviews_sina_weibo_emotion4`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'chinese_reviews_sina_weibo_emotion4')
    url_json = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4.json'
    url_txt = ['https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4_01.txt',
               'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4_02.txt',
               'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4_03.txt',]
    rq.json(url_json, path_join(task_path, 'chinese_reviews_sina_weibo_emotion4.json'))
    # Collect the shards in a list and concatenate once: calling
    # pd.concat inside the loop recopies the growing frame each pass.
    frames = []
    for url in url_txt:
        response = requests.get(url)
        # Fail fast on HTTP errors instead of parsing an error page as CSV.
        response.raise_for_status()
        frames.append(pd.read_csv(io.StringIO(response.content.decode('utf-8'))))
    data = pd.concat(frames)
    data.to_csv(path_join(task_path, 'chinese_reviews_sina_weibo_emotion4.txt'), index=False)
    print('chinese_reviews_sina_weibo_emotion4 dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
示例#4
0
def huajianji(root):
    """Download the Huajianji Chinese classical literature dataset.

    "Hua Jian Ji" is a collection of Chinese poetry compiled during
    the Five Dynasties and Ten Kingdoms period, the first collection
    of literati in the history of literature, edited by Zhao Chongxi.
    It contains 18 classic works of poetry by Wen Tingjun and
    Wei Zhuang, reflecting the subject orientation, aesthetic taste,
    physical style and artistic achievement of early ci poetry.

    Files written under `root`:
    `root/huajianji/huajianji.json`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/huajianji`.
    Returns:
        Absolute path of the dataset directory, `root/huajianji`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'huajianji')
    source = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/huajianji.json"
    rq.files(source, path_join(data_dir, source.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('huajianji dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#5
0
def adult(root):
    """Download the adult census income dataset.

    Extracted from the census bureau database at
    http://www.census.gov/ftp/pub/DES/www/welcome.html

    48842 instances, mix of continuous and discrete (train=32561, test=16281);
    45222 if instances with unknown values are removed (train=30162, test=15060).
    Duplicate or conflicting instances: 6.
    Class probabilities for the adult.all file:
    label '>50K'  : 23.93% / 24.78% (without unknowns),
    label '<=50K' : 76.07% / 75.22% (without unknowns).

    Files written under `root`:
    `root/adult/adult.txt`
    `root/adult/adult.json`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in `/user/.../mydata/adult`.
    Returns:
        Absolute path of the dataset directory, `root/adult`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'adult')
    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/adult/adult.json'
    txt_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/adult/adult.txt'
    rq.json(json_source, os.path.join(data_dir, 'adult.json'))
    rq.txt(txt_source, os.path.join(data_dir, 'adult.txt'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('adult dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#6
0
def poet_song(root):
    """Download the Song poetry Chinese classical literature dataset.

    "Full Song Poetry": after the high prosperity of Tang poetry,
    Song poetry brought new development and creation in ideological
    content and artistic expression, with many excellent writers and
    schools whose influence reached the Yuan, Ming and Qing dynasties.

    Files written under `root`:
    `root/poet_song/poet_song.json`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/poet_song`.
    Returns:
        Absolute path of the dataset directory, `root/poet_song`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'poet_song')
    source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/poetry_song.json.bz2'
    # Fetch the bz2 archive, decompress it in place, then drop the archive.
    archive = path_join(data_dir, 'poet_song.json.bz2')
    rq.files(source, archive)
    un_bz2(archive)
    remove(archive)
    minutes, seconds = divmod(time.time() - t0, 60)
    print('poet_song dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#7
0
def ci_song(root):
    """Download the Song ci Chinese classical literature dataset.

    "Full Song Ci" is one of the most important achievements of
    ancient Chinese book compilation in the past 100 years. Song ci
    and Tang poetry are the artistic peaks of Chinese classical
    poetry; together they are called the double jewels of Chinese
    literature. The book has five volumes collecting ci from three
    hundred years of the Song Dynasty.

    Files written under `root`:
    `root/ci_song/ci_song.json`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/ci_song`.
    Returns:
        Absolute path of the dataset directory, `root/ci_song`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'ci_song')
    source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/ci_song.json'
    rq.files(source, path_join(data_dir, source.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('ci_song dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#8
0
def poet_tang(root):
    """Download the Tang poetry Chinese classical literature dataset.

    "Full Tang Poetry" was compiled in the 44th year of Qing Emperor
    Kangxi (1705) by Peng Dingqiu, Shen Sanzeng, Yang Zhongna,
    Wang Shizhen, Wang Wei, Yu Mei, Xu Shuben, Che Dingjin,
    Pan Conglu, and Cha Yu: more than 48,900 poems by more than
    2,200 poets, in 900 volumes plus 12 volumes of catalogues.

    Files written under `root`:
    `root/poet_tang/poet_tang.json`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/poet_tang`.
    Returns:
        Absolute path of the dataset directory, `root/poet_tang`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'poet_tang')
    source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/poetry_tang.json.bz2'
    # Fetch the bz2 archive, decompress it in place, then drop the archive.
    archive = path_join(data_dir, 'poet_tang.json.bz2')
    rq.files(source, archive)
    un_bz2(archive)
    remove(archive)
    minutes, seconds = divmod(time.time() - t0, 60)
    print('poet_tang dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#9
0
def shijing(root):
    """Download the Shijing Chinese classical literature dataset.

    The Book of Songs is the earliest collection of poems in ancient
    Chinese poetry, gathering works from the early Western Zhou
    Dynasty to the middle of the Spring and Autumn period (the 11th
    to the 6th century BC). It includes 311 pieces, of which 6
    (Nan, Baihua, Huaying, Yukang, Chongwu, Yuyi) have only a title
    and no content, reflecting roughly five hundred years from the
    beginning to the late Zhou Dynasty.

    Files written under `root`:
    `root/shijing/shijing.json`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/shijing`.
    Returns:
        Absolute path of the dataset directory, `root/shijing`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'shijing')
    source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/shijing.json'
    rq.files(source, path_join(data_dir, source.split('/')[-1]))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('shijing dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#10
0
def poetry_SouthernTang(root):
    """Download the Southern Tang poetry dataset from Chinese classical literature.

    "The Southern Tang Dynasty's two main words" collects the ci of
    the Southern Tang rulers Li Jing and Li Yu. The book was written
    in the Southern Song Dynasty and later generations compiled
    various editions of it.

    Files written under `root`:
    `root/poetry_SouthernTang/poetry_SouthernTang.json`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/poetry_SouthernTang`.
    Returns:
        Absolute path of the dataset directory,
        `root/poetry_SouthernTang`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'poetry_SouthernTang')
    source = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/nantang_erzhu_poetry.json"
    rq.files(source, path_join(data_dir, 'poetry_SouthernTang.json'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('poetry_SouthernTang dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#11
0
def takeaway(root):
    """Download the Chinese takeaway reviews dataset.

    Source: `https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/waimai_10k/waimai_10k.csv`

    The corpus holds 12,000+ samples: more than 4,000 positive
    reviews and more than 8,000 negative reviews.

    Files written under `root`:
    `root/chinese_reviews_takeaway/chinese_reviews_takeaway.json`
    `root/chinese_reviews_takeaway/chinese_reviews_takeaway.txt`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/chinese_reviews_takeaway`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_reviews_takeaway`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'chinese_reviews_takeaway')
    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_takeaway/chinese_reviews_takeaway.json'
    table_source = 'https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/waimai_10k/waimai_10k.csv'
    rq.json(json_source, path_join(data_dir, 'chinese_reviews_takeaway.json'))
    rq.table(table_source, path_join(data_dir, 'chinese_reviews_takeaway.txt'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('chinese_reviews_takeaway dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#12
0
def douban_movies(root):
    """Download the Chinese douban movies reviews dataset.

    Covers 28 movies, over 700,000 users and over 2 million ratings.

    Files written under `root`:
    `root/chinese_reviews_douban_movies/chinese_reviews_douban_movies.json`
    `root/chinese_reviews_douban_movies/ratings.txt`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/chinese_reviews_douban_movies`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_reviews_douban_movies`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'chinese_reviews_douban_movies')
    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_douban_movies/chinese_reviews_douban_movies.json'
    movies_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_douban_movies/movies.txt'
    ratings_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_douban_movies/ratings.txt'
    rq.json(json_source, path_join(data_dir, 'chinese_reviews_douban_movies.json'))
    rq.table(movies_source, path_join(data_dir, 'movies.txt'))
    # The ratings table is sharded as ratings0.txt ... ratings12.txt.
    shard_urls = [ratings_source[:-4] + str(i) + ratings_source[-4:] for i in range(13)]
    with concurrent.futures.ProcessPoolExecutor() as executor:
        ratings = pd.concat(executor.map(_request_txt, shard_urls))
    ratings.to_csv(path_join(data_dir, 'ratings.txt'), index=False)
    minutes, seconds = divmod(time.time() - t0, 60)
    print('chinese_reviews_douban_movies dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#13
0
def abbreviation(root):
    """Download the Chinese abbreviation dataset.

    Source: `https://github.com/zhangyics/Chinese-abbreviation-dataset`

    A corpus of Chinese abbreviations released by the paper
    "A Chinese Dataset with Negative Full Forms for General
    Abbreviation Prediction".

    Files written under `root`:
    `root/chinese_abbreviation/train_set.txt`
    `root/chinese_abbreviation/test_set.txt`
    `root/chinese_abbreviation/dev_set.txt`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/chinese_abbreviation`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_abbreviation`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'chinese_abbreviation')
    # (source url, local file name) for each split.
    downloads = [
        ("https://raw.githubusercontent.com/zhangyics/Chinese-abbreviation-dataset/master/train_set.txt", 'train_set.txt'),
        ("https://raw.githubusercontent.com/zhangyics/Chinese-abbreviation-dataset/master/test_set.txt", 'test_set.txt'),
        ("https://raw.githubusercontent.com/zhangyics/Chinese-abbreviation-dataset/master/dev_set.txt", 'dev_set.txt'),
    ]
    for source, filename in downloads:
        rq.files(source, path_join(data_dir, filename))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('chinese abbreviation dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#14
0
File: _dm.py  Project: Hourout/tensordata
def titanic(root):
    """Download the Titanic dataset.

    'train.csv' contains the details of a subset of the passengers on
    board (891 to be exact) and, importantly, reveals whether they
    survived or not — the "ground truth". The `test.csv` dataset
    contains similar information but does not disclose the
    "ground truth" for each passenger.

    Files written under `root`:
    `root/titanic/train.csv`
    `root/titanic/test.csv`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/titanic`.
    Returns:
        Absolute path of the dataset directory, `root/titanic`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'titanic')
    train_source = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
    # The upstream evaluation split is stored locally as test.csv.
    test_source = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"
    rq.table(train_source, gfile.path_join(data_dir, 'train.csv'))
    rq.table(test_source, gfile.path_join(data_dir, 'test.csv'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('titanic dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#15
0
def boston_housing(root):
    """Download the Boston housing dataset (housing values in suburbs of Boston).

    Harrison, D. and Rubinfeld, D.L. (1978) Hedonic prices and the
    demand for clean air. J. Environ. Economics and Management 5, 81-102.
    Belsley D.A., Kuh, E. and Welsch, R.E. (1980) Regression
    Diagnostics. Identifying Influential Data and Sources of
    Collinearity. New York: Wiley.

    Files written under `root`:
    `root/boston_housing/boston_housing.txt`
    `root/boston_housing/boston_housing.json`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/boston_housing`.
    Returns:
        Absolute path of the dataset directory, `root/boston_housing`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'boston_housing')
    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/boston_house/boston_housing.json'
    txt_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/dm/boston_house/boston_housing.txt'
    rq.json(json_source, os.path.join(data_dir, 'boston_housing.json'))
    rq.txt(txt_source, os.path.join(data_dir, 'boston_housing.txt'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('boston_housing dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#16
0
def HIT(root):
    """Download the Harbin Institute of Technology Chinese stop-word list.

    The list contains 767 stop words.

    Files written under `root`:
    `root/chinese_stop_word_HIT/chinese_stop_word_HIT`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/chinese_stop_word_HIT`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_stop_word_HIT`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'chinese_stop_word_HIT')
    source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_stop_word/chinese_stop_word_HIT.txt'
    raw = requests.get(source).content.decode('utf-8')
    # One stop word per line; drop the trailing newline of each line.
    words = [line.rstrip('\n') for line in io.StringIO(raw)]
    pd.DataFrame(words).to_csv(path_join(data_dir, 'chinese_stop_word_HIT.txt'),
                               index=False,
                               header=None)
    minutes, seconds = divmod(time.time() - t0, 60)
    print('chinese_stop_word_HIT dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#17
0
File: _dm.py  Project: Hourout/tensordata
def arrhythmia(root):
    """Download the arrhythmia dataset from http://archive.ics.uci.edu/ml/datasets/Arrhythmia.

    The database contains 279 attributes, 206 of which are linear
    valued and the rest nominal.

    Files written under `root`:
    `root/arrhythmia/arrhythmia.txt`
    `root/arrhythmia/introduce.txt`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/arrhythmia`.
    Returns:
        Absolute path of the dataset directory, `root/arrhythmia`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'arrhythmia')
    introduce_source = 'http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.names'
    data_source = 'http://archive.ics.uci.edu/ml/machine-learning-databases/arrhythmia/arrhythmia.data'
    rq.files(introduce_source, gfile.path_join(data_dir, 'introduce.txt'), verbose=0)
    rq.table(data_source, gfile.path_join(data_dir, 'arrhythmia.txt'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('arrhythmia dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#18
0
def mnist_kuzushiji_kanji(root):
    """Download the Kuzushiji-Kanji dataset from https://github.com/rois-codh/kmnist.

    Kuzushiji-Kanji is a large and highly imbalanced 64x64 dataset of
    3832 Kanji characters, containing 140,426 images of both common
    and rare characters.

    Attention: if the directory `root/mnist_kuzushiji_kanji` exists,
    the api deletes and recreates it.
    Files written under `root`:
    `root/mnist_kuzushiji_kanji/train/U+55C7/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+7F8E/xx.png`
    `root/mnist_kuzushiji_kanji/train/U+9593/xx.png`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/mnist_kuzushiji_kanji`.
    Returns:
        Absolute path of the dataset directory,
        `root/mnist_kuzushiji_kanji`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji_kanji', make_root_dir=False)
    url = "http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar"
    # Download the tar next to the dataset dir, unpack, then delete it.
    archive = gfile.path_join(root, url.split('/')[-1])
    rq.files(url, archive)
    un_tar(archive, task_path)
    # Upstream unpacks as `kkanji2`; expose it as the `train` split.
    gfile.rename(gfile.path_join(task_path, 'kkanji2'), gfile.path_join(task_path, 'train'))
    gfile.remove(archive)
    minutes, seconds = divmod(time.time() - t0, 60)
    print('mnist_kuzushiji_kanji dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#19
0
def economist(root, date, mode='pdf'):
    """Download one issue of The Economist from https://github.com/nailperry-zd/The-Economist.
    
    Data storage directory:
    root = `/user/.../mydata`
    economist data: 
    `root/...(pdf or epub or mobi)`
    Args:
        root: str, Store the absolute path of the data directory.
        date: str, eg:'2019-01-01'.
        mode: str, one of ['pdf', 'epub', 'mobi'].
    Returns:
        Store the absolute path of the data directory, is `root/...(pdf or epub or mobi)`.
    """
    start = time.time()
    assert mode in ['pdf', 'epub',
                    'mobi'], "`mode` should be one of ['pdf', 'epub', 'mobi']."
    # Reject dates before 2017-05-06 (t[0] < 0) or not a whole number of
    # weeks after it (t[1] > 0) — presumably the repo holds one issue per
    # week on that weekday; verify against the upstream repository.
    t = divmod((pd.to_datetime(date) - pd.to_datetime('2017-05-06')).days, 7)
    if t[0] < 0 or t[1] > 0:
        raise ValueError("No book that meets the date.")
    task_path = assert_dirs(root)
    # Fetch the GitHub directory listing page for the requested date.
    t = 'https://github.com/nailperry-zd/The-Economist/raw/master/{}'.format(
        date)
    # Keep only the HTML lines that link a file of the requested format
    # inside that date's folder.
    t = [
        i for i in requests.get(t).content.decode('utf-8').split('\n')
        if ('The-Economist/blob/master/{}'.format(date) in i) & (mode in i)
    ]
    # Scrape the file name out of the first matching line's title="..."
    # attribute (i[7:-1] strips `title="` and the closing quote) and build
    # the raw-download URL. NOTE(review): fragile — depends on GitHub's
    # HTML markup staying stable.
    url = 'https://github.com/nailperry-zd/The-Economist/raw/master/' + date + '/' + [
        i[7:-1] for i in t[0].split(' ') if 'title' in i
    ][0]
    task_path = path_join(task_path, url.split('/')[-1])
    rq.files(url, task_path, verbose=1)
    print('economist dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path
示例#20
0
def famous_person(root):
    """Download the Chinese famous-person lexicon dataset.

    Source: `https://github.com/fighting41love/funNLP`

    The lexicon contains 13600+ samples.

    Files written under `root`:
    `root/chinese_lexicon_famous_person/chinese_lexicon_famous_person.txt`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/chinese_lexicon_famous_person`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_lexicon_famous_person`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'chinese_lexicon_famous_person')
    source = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_lexicon/chinese_lexicon_famous_person.txt"
    rq.table(source, path_join(data_dir, 'chinese_lexicon_famous_person.txt'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('chinese_lexicon_famous_person dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#21
0
def online_shopping_10_cats(root):
    """Download the Chinese online shopping reviews dataset.

    The corpus holds 60,000+ samples across about 10 categories
    (books, tablets, mobile phones, fruits, shampoos, water heaters,
    Mengniu, clothes, computers, hotels), with more than 30,000
    positive reviews and more than 30,000 negative reviews.

    Files written under `root`:
    `root/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.json`
    `root/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.txt`
    Args:
        root: str, absolute path of the parent data directory; passing
              `/user/.../mydata` stores the data in
              `/user/.../mydata/chinese_reviews_online_shopping_10_cats`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_reviews_online_shopping_10_cats`.
    """
    t0 = time.time()
    data_dir = assert_dirs(root, 'chinese_reviews_online_shopping_10_cats')
    json_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.json'
    table_source = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.csv'
    rq.json(json_source, path_join(data_dir, 'chinese_reviews_online_shopping_10_cats.json'))
    rq.table(table_source, path_join(data_dir, 'chinese_reviews_online_shopping_10_cats.txt'))
    minutes, seconds = divmod(time.time() - t0, 60)
    print('chinese_reviews_online_shopping_10_cats dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return data_dir
示例#22
0
def mnist_fashion(root):
    """A dataset of Zalando's article images consisting of fashion products.

    Fashion mnist datasets is a drop-in replacement of the original MNIST
    dataset from https://github.com/zalandoresearch/fashion-mnist.
    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if exist dirs `root/mnist_fashion`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_fashion data:
    `root/mnist_fashion/train/0/xx.png`
    `root/mnist_fashion/train/2/xx.png`
    `root/mnist_fashion/train/6/xx.png`
    `root/mnist_fashion/test/0/xx.png`
    `root/mnist_fashion/test/2/xx.png`
    `root/mnist_fashion/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example: if you want data path is `/user/.../mydata/mnist_fashion`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_fashion`.
    """
    began = time.time()
    task_path = assert_dirs(root, 'mnist_fashion')
    archives = ['http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz',
                'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
                'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz',
                'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz']
    for link in archives:
        rq.files(link, gfile.path_join(task_path, link.split('/')[-1]))

    def _labels(name):
        # IDX label file: 8-byte header, then one uint8 label per sample.
        with gzip.open(gfile.path_join(task_path, name), 'rb') as handle:
            return np.frombuffer(handle.read(), np.uint8, offset=8)

    def _images(name, labels):
        # IDX image file: 16-byte header, then 28x28 uint8 pixels per sample.
        with gzip.open(gfile.path_join(task_path, name), 'rb') as handle:
            return np.frombuffer(handle.read(), np.uint8, offset=16).reshape(len(labels), 28, 28)

    train_label = _labels('train-labels-idx1-ubyte.gz')
    train = _images('train-images-idx3-ubyte.gz', train_label)
    test_label = _labels('t10k-labels-idx1-ubyte.gz')
    test = _images('t10k-images-idx3-ubyte.gz', test_label)
    # Write one PNG per sample under <split>/<label>/<index>.png.
    for split, images, labels in (('train', train, train_label), ('test', test, test_label)):
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for idx in range(images.shape[0]):
            save_image(gfile.path_join(task_path, split, str(labels[idx]), str(idx) + '.png'),
                       array_to_image(images[idx].reshape(28, 28, 1)))
    # The raw .gz archives are no longer needed once PNGs are written.
    for link in archives:
        gfile.remove(gfile.path_join(task_path, link.split('/')[-1]))
    print('mnist_fashion dataset download completed, run time %d min %.2f sec' % divmod((time.time() - began), 60))
    return task_path
示例#23
0
def cifar100(root, fine_label=True):
    """CIFAR100 image classification dataset from https://www.cs.toronto.edu/~kriz/cifar.html
    
    Each sample is an image (in 3D NDArray) with shape (32, 32, 3).
    
    Attention: if exist dirs `root/cifar100`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    cifar100 data: 
    `root/cifar100/train/0/xx.png`
    `root/cifar100/train/2/xx.png`
    `root/cifar100/train/6/xx.png`
    `root/cifar100/test/0/xx.png`
    `root/cifar100/test/2/xx.png`
    `root/cifar100/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/cifar100`,
              root should be `/user/.../mydata`.
        fine_label: bool, default True.
                    Whether to load the fine-grained (100 classes) or 
                    coarse-grained (20 super-classes) labels.
    Returns:
        Store the absolute path of the data directory, is `root/cifar100`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'cifar100')
    url = 'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/cifar100/cifar-100-binary.tar.gz'
    rq.files(url, gfile.path_join(task_path, url.split('/')[-1]))
    with tarfile.open(gfile.path_join(task_path, url.split('/')[-1])) as t:
        t.extractall(task_path)
    # Snapshot the archive and extracted files now, so they can all be
    # removed after the PNGs have been written.
    noise_files = gfile.listdir(task_path)
    # Each record in the .bin files is 2 label bytes followed by 3072 pixel
    # bytes (3x32x32, channel-first); column 0 is the coarse label, column 1
    # the fine label, so `0 + fine_label` picks the requested granularity.
    with open(gfile.path_join(task_path, 'train.bin'), 'rb') as fin:
        data = np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3072 + 2)
        train = data[:, 2:].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
        train_label = data[:, 0 + fine_label].astype(np.int32)
    for i in set(train_label):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(i)))
    for idx in range(train.shape[0]):
        save_image(
            gfile.path_join(task_path, 'train', str(train_label[idx]),
                            str(idx) + '.png'), array_to_image(train[idx]))
    with open(gfile.path_join(task_path, 'test.bin'), 'rb') as fin:
        data = np.frombuffer(fin.read(), dtype=np.uint8).reshape(-1, 3072 + 2)
        test = data[:, 2:].reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1)
        test_label = data[:, 0 + fine_label].astype(np.int32)
    for i in set(test_label):
        gfile.makedirs(gfile.path_join(task_path, 'test', str(i)))
    for idx in range(test.shape[0]):
        save_image(
            gfile.path_join(task_path, 'test', str(test_label[idx]),
                            str(idx) + '.png'), array_to_image(test[idx]))
    for file in noise_files:
        gfile.remove(gfile.path_join(task_path, file))
    print('cifar100 dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path
示例#24
0
def mnist(root):
    """MNIST handwritten digits dataset from http://yann.lecun.com/exdb/mnist

    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if exist dirs `root/mnist`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist data:
    `root/mnist/train/0/xx.png`
    `root/mnist/train/2/xx.png`
    `root/mnist/train/6/xx.png`
    `root/mnist/test/0/xx.png`
    `root/mnist/test/2/xx.png`
    `root/mnist/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example: if you want data path is `/user/.../mydata/mnist`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist`.
    """
    t0 = time.time()
    task_path = assert_dirs(root, 'mnist')
    sources = ['https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-labels-idx1-ubyte.gz',
               'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/train-images-idx3-ubyte.gz',
               'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-labels-idx1-ubyte.gz',
               'https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/mnist/t10k-images-idx3-ubyte.gz']
    for src in sources:
        rq.files(src, gfile.path_join(task_path, src.split('/')[-1]))

    def _read_idx(name, offset):
        # Skip the IDX header (8 bytes for labels, 16 for images) and read
        # the remainder as flat uint8 data.
        with gzip.open(gfile.path_join(task_path, name), 'rb') as handle:
            return np.frombuffer(handle.read(), np.uint8, offset=offset)

    train_label = _read_idx('train-labels-idx1-ubyte.gz', 8)
    train = _read_idx('train-images-idx3-ubyte.gz', 16).reshape(len(train_label), 28, 28)
    test_label = _read_idx('t10k-labels-idx1-ubyte.gz', 8)
    test = _read_idx('t10k-images-idx3-ubyte.gz', 16).reshape(len(test_label), 28, 28)
    # Emit one PNG per sample under <split>/<label>/<index>.png.
    for split, images, labels in (('train', train, train_label), ('test', test, test_label)):
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for k in range(images.shape[0]):
            save_image(gfile.path_join(task_path, split, str(labels[k]), str(k) + '.png'),
                       array_to_image(images[k].reshape(28, 28, 1)))
    # Remove the downloaded archives once conversion is done.
    for src in sources:
        gfile.remove(gfile.path_join(task_path, src.split('/')[-1]))
    print('mnist dataset download completed, run time %d min %.2f sec' % divmod((time.time() - t0), 60))
    return task_path
示例#25
0
def mnist_kuzushiji10(root):
    """Kuzushiji-MNIST from https://github.com/rois-codh/kmnist.

    Kuzushiji-MNIST is a drop-in replacement for the MNIST dataset
    (28x28 grayscale, 70,000 images), provided in the original MNIST
    format as well as a NumPy format. Since MNIST restricts us to 10
    classes, one character was chosen to represent each of the 10 rows
    of Hiragana when creating Kuzushiji-MNIST.

    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if exist dirs `root/mnist_kuzushiji10`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji10 data:
    `root/mnist_kuzushiji10/train/0/xx.png`
    `root/mnist_kuzushiji10/train/2/xx.png`
    `root/mnist_kuzushiji10/train/6/xx.png`
    `root/mnist_kuzushiji10/test/0/xx.png`
    `root/mnist_kuzushiji10/test/2/xx.png`
    `root/mnist_kuzushiji10/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example: if you want data path is `/user/.../mydata/mnist_kuzushiji10`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_kuzushiji10`.
    """
    begin = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji10')
    downloads = ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz',
                 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz',
                 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz',
                 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz']
    for link in downloads:
        rq.files(link, gfile.path_join(task_path, link.split('/')[-1]))

    def _npz(name):
        # Each .npz holds a single unnamed array under the default key.
        return np.load(gfile.path_join(task_path, name))['arr_0']

    train = _npz('kmnist-train-imgs.npz')
    train_label = _npz('kmnist-train-labels.npz')
    test = _npz('kmnist-test-imgs.npz')
    test_label = _npz('kmnist-test-labels.npz')
    # Convert each split to one PNG per sample under <split>/<label>/.
    for split, images, labels in (('train', train, train_label), ('test', test, test_label)):
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for k in range(images.shape[0]):
            save_image(gfile.path_join(task_path, split, str(labels[k]), str(k) + '.png'),
                       array_to_image(images[k].reshape(28, 28, 1)))
    for link in downloads:
        gfile.remove(gfile.path_join(task_path, link.split('/')[-1]))
    print('mnist_kuzushiji10 dataset download completed, run time %d min %.2f sec' % divmod((time.time() - begin), 60))
    return task_path
示例#26
0
def mnist_kuzushiji49(root):
    """Kuzushiji-49 from https://github.com/rois-codh/kmnist.

    Kuzushiji-49, as the name suggests, has 49 classes (28x28 grayscale,
    270,912 images). It is a much larger, but imbalanced dataset
    containing 48 Hiragana characters and one Hiragana iteration mark.

    Each sample is a gray image (in 3D NDArray) with shape (28, 28, 1).

    Attention: if exist dirs `root/mnist_kuzushiji49`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kuzushiji49 data:
    `root/mnist_kuzushiji49/train/0/xx.png`
    `root/mnist_kuzushiji49/train/2/xx.png`
    `root/mnist_kuzushiji49/train/6/xx.png`
    `root/mnist_kuzushiji49/test/0/xx.png`
    `root/mnist_kuzushiji49/test/2/xx.png`
    `root/mnist_kuzushiji49/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example: if you want data path is `/user/.../mydata/mnist_kuzushiji49`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_kuzushiji49`.
    """
    stamp = time.time()
    task_path = assert_dirs(root, 'mnist_kuzushiji49')
    downloads = ['http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-imgs.npz',
                 'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-labels.npz',
                 'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-imgs.npz',
                 'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-labels.npz']
    for link in downloads:
        rq.files(link, gfile.path_join(task_path, link.split('/')[-1]))

    def _npz(name):
        # Each .npz stores a single array under NumPy's default key.
        return np.load(gfile.path_join(task_path, name))['arr_0']

    train = _npz('k49-train-imgs.npz')
    train_label = _npz('k49-train-labels.npz')
    test = _npz('k49-test-imgs.npz')
    test_label = _npz('k49-test-labels.npz')
    # Write each sample as <split>/<label>/<index>.png.
    for split, images, labels in (('train', train, train_label), ('test', test, test_label)):
        for label in set(labels):
            gfile.makedirs(gfile.path_join(task_path, split, str(label)))
        for k in range(images.shape[0]):
            save_image(gfile.path_join(task_path, split, str(labels[k]), str(k) + '.png'),
                       array_to_image(images[k].reshape(28, 28, 1)))
    for link in downloads:
        gfile.remove(gfile.path_join(task_path, link.split('/')[-1]))
    print('mnist_kuzushiji49 dataset download completed, run time %d min %.2f sec' % divmod((time.time() - stamp), 60))
    return task_path
示例#27
0
def mnist_tibetan(root):
    """Tibetan-MNIST from https://github.com/bat67/TibetanMNIST.
    
    An MNIST-style dataset of 28x28 grayscale images distributed as CSV
    files (one label column followed by 784 pixel columns), converted
    here to one PNG per sample. Note: only a `train` split is produced;
    the source CSVs are not pre-split into train/test.
    
    Each sample is an gray image (in 3D NDArray) with shape (28, 28, 1).
    
    Attention: if exist dirs `root/mnist_tibetan`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_tibetan data: 
    `root/mnist_tibetan/train/0/xx.png`
    `root/mnist_tibetan/train/2/xx.png`
    `root/mnist_tibetan/train/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/mnist_tibetan`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_tibetan`.
    """
    start = time.time()
    print('Downloading data from https://github.com/Hourout/datasets/tree/master/TibetanMNIST')
    task_path = assert_dirs(root, 'mnist_tibetan')
    url_list = ['https://raw.githubusercontent.com/Hourout/datasets/master/TibetanMNIST/TibetanMNIST_28_28_01.csv',
                'https://raw.githubusercontent.com/Hourout/datasets/master/TibetanMNIST/TibetanMNIST_28_28_02.csv']
    # Collect the frames first and concatenate once: pd.concat inside the
    # loop re-copies the accumulated DataFrame on every iteration.
    frames = []
    for url in url_list:
        s = requests.get(url).content
        frames.append(pd.read_csv(io.StringIO(s.decode('utf-8')), header=None, dtype='uint8'))
    data = pd.concat(frames)
    # Column 0 is the label; the remaining 784 columns are pixel values.
    train = data.loc[:, 1:].values.reshape(-1, 28, 28)
    train_label = data.loc[:, 0].values
    for i in set(train_label):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(i)))
    for idx in range(train.shape[0]):
        save_image(gfile.path_join(task_path, 'train', str(train_label[idx]), str(idx)+'.png'),
                   array_to_image(train[idx].reshape(28, 28, 1)))
    print('mnist_tibetan dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
示例#28
0
文件: _dm.py 项目: Hourout/tensordata
def wine(root):
    """Title of Database: Wine recognition data
    Updated Sept 21, 1998 by C.Blake : Added attribute information

    These data are the results of a chemical analysis of wines grown in
    the same region in Italy but derived from three different cultivars.
    The analysis determined the quantities of 13 constituents found in
    each of the three types of wines.

    Number of Instances
    class 1 59
    class 2 71
    class 3 48

    Data storage directory:
    root = `/user/.../mydata`
    wine data:
    `root/wine/wine.txt`
    `root/wine/wine.json`
    Args:
        root: str, Store the absolute path of the data directory.
              example: if you want data path is `/user/.../mydata/wine`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/wine`.
    """
    t_start = time.time()
    task_path = assert_dirs(root, 'wine')
    introduce_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.names'
    data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
    # Column names for the 13 constituents, preceded by the class label.
    columns = ['label', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash',
               'Magnesium', 'Total phenols', 'Flavanoids',
               'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity',
               'Hue', 'OD280/OD315 of diluted wines', 'Proline']
    # The .names file is a plain-text description; fetch it quietly.
    rq.files(introduce_url, gfile.path_join(task_path, 'introduce.txt'), verbose=0)
    rq.table(data_url, gfile.path_join(task_path, 'wine.txt'), names=columns)
    minutes, seconds = divmod(time.time() - t_start, 60)
    print('wine dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return task_path
示例#29
0
def mnist_kannada(root):
    """kannada-MNIST from https://github.com/vinayprabhu/Kannada_MNIST.
    
    The Kannada-MNIST dataset was created as a drop-in substitute for the standard MNIST dataset.
    
    Each sample is an gray image (in 3D NDArray) with shape (28, 28, 1).
    
    Attention: if exist dirs `root/mnist_kannada`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    mnist_kannada data: 
    `root/mnist_kannada/train/0/xx.png`
    `root/mnist_kannada/train/2/xx.png`
    `root/mnist_kannada/train/6/xx.png`
    `root/mnist_kannada/test/0/xx.png`
    `root/mnist_kannada/test/2/xx.png`
    `root/mnist_kannada/test/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/mnist_kannada`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/mnist_kannada`.
    """
    start = time.time()
    print('Downloading data from https://github.com/Hourout/datasets/releases/download/0.0.1/kannada_MNIST.zip')
    task_path = assert_dirs(root, 'mnist_kannada')
    zip_path = rq.files('https://github.com/Hourout/datasets/releases/download/0.0.1/kannada_MNIST.zip', task_path+'/kannada_MNIST.zip')
    unzip_path = un_zip(task_path+'/kannada_MNIST.zip')
    # Column 0 is the label; the remaining columns are the 784 pixel values.
    train = pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_train.csv'), header=None, dtype='uint8')
    test = pd.read_csv(gfile.path_join(task_path, 'kannada_MNIST/kannada_MNIST_test.csv'), header=None, dtype='uint8')
    # Create class directories from the union of train and test labels, so a
    # label that only occurs in the test split still gets its directory
    # (previously both splits were keyed off the train labels alone).
    for i in set(train[0]) | set(test[0]):
        gfile.makedirs(gfile.path_join(task_path, 'train', str(i)))
        gfile.makedirs(gfile.path_join(task_path, 'test', str(i)))
    for i in range(len(train)):
        save_image(gfile.path_join(task_path, 'train', str(train.iat[i, 0]), str(i)+'.png'),
                       array_to_image(train.iloc[i, 1:].values.reshape(28, 28, 1)))
    for i in range(len(test)):
        save_image(gfile.path_join(task_path, 'test', str(test.iat[i, 0]), str(i)+'.png'),
                       array_to_image(test.iloc[i, 1:].values.reshape(28, 28, 1)))
    # Drop the zip archive and the extracted CSV directory once converted.
    gfile.remove(zip_path)
    gfile.remove(unzip_path)
    print('mnist_kannada dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
示例#30
0
def coil100(root):
    """COIL100 dataset from http://www.cs.columbia.edu/CAVE/software/softlib/coil-100.php

    "Columbia Object Image Library (COIL-100),"
    S. A. Nene, S. K. Nayar and H. Murase,
    Technical Report CUCS-006-96, February 1996.

    Each sample is an image (in 3D NDArray) with shape (128, 128, 1).
    Attention: if exist dirs `root/coil100`, api will delete it and create it.
    Data storage directory:
    root = `/user/.../mydata`
    coil100 data:
    `root/coil100/train/0/xx.png`
    `root/coil100/train/2/xx.png`
    `root/coil100/train/6/xx.png`
    Args:
        root: str, Store the absolute path of the data directory.
              example: if you want data path is `/user/.../mydata/coil100`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/coil100`.
    """
    clock = time.time()
    task_path = assert_dirs(root, 'coil100')
    url = "http://www.cs.columbia.edu/CAVE/databases/SLAM_coil-20_coil-100/coil-100/coil-100.zip"
    rq.files(url, gfile.path_join(task_path, 'coil100.zip'))
    un_zip(gfile.path_join(task_path, 'coil100.zip'))
    extracted = gfile.path_join(task_path, 'coil100', 'coil-100')
    frame = pd.DataFrame(gfile.listdir(extracted), columns=['image'])
    # File names look like `objNN__angle.png`; the object id (after the
    # `obj` prefix, before the double underscore) becomes the class label.
    frame['label'] = frame.image.map(lambda name: name.split('__')[0][3:])
    for label in frame.label.unique():
        gfile.makedirs(gfile.path_join(task_path, 'train', label))
    # Copy every extracted file into train/<label>/<original name>.
    for name, label in zip(frame.image, frame.label):
        gfile.copy(gfile.path_join(extracted, name),
                   gfile.path_join(task_path, 'train', label, name))
    # Clean up the archive, the extraction directory, and the stray
    # conversion script that ships inside the zip.
    gfile.remove(gfile.path_join(task_path, 'coil100.zip'))
    gfile.remove(gfile.path_join(task_path, 'coil100'))
    gfile.remove(gfile.path_join(task_path, 'train', 'vertGroupppm2png.pl'))
    print('coil100 dataset download completed, run time %d min %.2f sec' % divmod((time.time() - clock), 60))
    return task_path