Exemplo n.º 1
0
def sina_weibo_emotion4(root):
    """Download the Chinese Sina weibo 4-emotion reviews dataset.

    The dataset contains 360,000+ samples labelled with 4 emotions:
    about 200,000 joy samples, and more than 50,000 each of anger,
    disgust and low.

    The review table is published as three CSV parts; they are downloaded
    in order, concatenated and written out as one file.

    Data storage directory:
    root = `/user/.../mydata`
    chinese_reviews_sina_weibo_emotion4 data:
    `root/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4.json`
    `root/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4.txt`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/chinese_reviews_sina_weibo_emotion4`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/chinese_reviews_sina_weibo_emotion4`.
    Raises:
        requests.HTTPError: if one of the CSV parts answers with a 4xx/5xx status.
    """
    start = time.time()
    task_path = assert_dirs(root, 'chinese_reviews_sina_weibo_emotion4')
    url_json = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4.json'
    url_txt = ['https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4_01.txt',
               'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4_02.txt',
               'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo_emotion4/chinese_reviews_sina_weibo_emotion4_03.txt',]
    rq.json(url_json, path_join(task_path, 'chinese_reviews_sina_weibo_emotion4.json'))
    frames = []
    for url in url_txt:
        response = requests.get(url)
        # Fail loudly on an HTTP error instead of parsing the error page as CSV.
        response.raise_for_status()
        frames.append(pd.read_csv(io.StringIO(response.content.decode('utf-8'))))
    # Concatenate once: growing a DataFrame inside the loop re-copies all
    # previously loaded rows for every additional part.
    data = pd.concat(frames)
    data.to_csv(path_join(task_path, 'chinese_reviews_sina_weibo_emotion4.txt'), index=False)
    print('chinese_reviews_sina_weibo_emotion4 dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
Exemplo n.º 2
0
def ctrip_hotel(root):
    """Fetch the Ctrip hotel reviews dataset into `root`.

    Source of the review table:
    `https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv`

    The corpus holds 7000+ samples: more than 5,000 positive reviews
    and more than 2,000 negative reviews.

    Files written:
    `root/chinese_reviews_ctrip_hotel/chinese_reviews_ctrip_hotel.json`
    `root/chinese_reviews_ctrip_hotel/chinese_reviews_ctrip_hotel.txt`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under
              `/user/.../mydata/chinese_reviews_ctrip_hotel`, pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/chinese_reviews_ctrip_hotel`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'chinese_reviews_ctrip_hotel')

    json_target = path_join(dataset_dir, 'chinese_reviews_ctrip_hotel.json')
    rq.json(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_ctrip_hotel/chinese_reviews_ctrip_hotel.json',
        json_target)

    table_target = path_join(dataset_dir, 'chinese_reviews_ctrip_hotel.txt')
    rq.table(
        'https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/ChnSentiCorp_htl_all/ChnSentiCorp_htl_all.csv',
        table_target)

    minutes, seconds = divmod(time.time() - began, 60)
    print('chinese_reviews_ctrip_hotel dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 3
0
def poet_tang(root):
    """Fetch the Tang poetry dataset (Chinese classical literature).

    "Full Tang Poetry" was compiled in the 44th year of the Kangxi
    Emperor of the Qing dynasty (1705). It has more than 48,900 poems
    by more than 2,200 poets, in 900 volumes plus 12 volumes of catalogues.

    The data is downloaded as a bz2 archive, unpacked with `un_bz2`, and
    the archive is deleted afterwards.

    Files written:
    `root/poet_tang/poet_tang.json`
    (assumes `un_bz2` unpacks next to the archive and drops the `.bz2`
    suffix -- TODO confirm)
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under `/user/.../mydata/poet_tang`,
              pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/poet_tang`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'poet_tang')
    archive = path_join(dataset_dir, 'poet_tang.json.bz2')

    rq.files(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/poetry_tang.json.bz2',
        archive)
    un_bz2(archive)
    remove(archive)

    minutes, seconds = divmod(time.time() - began, 60)
    print('poet_tang dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 4
0
def sina_weibo(root):
    """Fetch the Chinese Sina weibo reviews dataset into `root`.

    The corpus holds 110,000+ samples: more than 59,000 positive reviews
    and more than 59,000 negative reviews.

    Files written:
    `root/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.json`
    `root/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.txt`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under
              `/user/.../mydata/chinese_reviews_sina_weibo`, pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/chinese_reviews_sina_weibo`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'chinese_reviews_sina_weibo')

    json_target = path_join(dataset_dir, 'chinese_reviews_sina_weibo.json')
    rq.json(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.json',
        json_target)

    table_target = path_join(dataset_dir, 'chinese_reviews_sina_weibo.txt')
    rq.table(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_sina_weibo/chinese_reviews_sina_weibo.csv',
        table_target)

    minutes, seconds = divmod(time.time() - began, 60)
    print('chinese_reviews_sina_weibo dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 5
0
def douban_movies(root):
    """Download the Chinese douban movies reviews dataset.

    The dataset includes 28 movies, over 700,000 users and
    over 2 million ratings.

    The ratings table is published in 13 parts (`ratings0.txt` ...
    `ratings12.txt`); the parts are fetched concurrently, concatenated in
    part order and written out as a single `ratings.txt`.

    Data storage directory:
    root = `/user/.../mydata`
    chinese_reviews_douban_movies data:
    `root/chinese_reviews_douban_movies/chinese_reviews_douban_movies.json`
    `root/chinese_reviews_douban_movies/movies.txt`
    `root/chinese_reviews_douban_movies/ratings.txt`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/chinese_reviews_douban_movies`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/chinese_reviews_douban_movies`.
    """
    start = time.time()
    task_path = assert_dirs(root, 'chinese_reviews_douban_movies')
    url_json = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_douban_movies/chinese_reviews_douban_movies.json'
    url_movies = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_douban_movies/movies.txt'
    url_ratings = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_douban_movies/ratings.txt'
    rq.json(url_json, path_join(task_path, 'chinese_reviews_douban_movies.json'))
    rq.table(url_movies, path_join(task_path, 'movies.txt'))
    # Insert the part index before the '.txt' suffix: ratings0.txt ... ratings12.txt
    stem, suffix = url_ratings[:-4], url_ratings[-4:]
    part_urls = [stem + str(i) + suffix for i in range(13)]
    # The downloads are I/O-bound, so threads are sufficient. A process pool
    # additionally requires picklable callables and an importable main module
    # on spawn-start platforms, which makes it fragile inside a library call.
    # NOTE(review): `_request_txt` is defined outside this block; it is
    # presumably a plain download-and-parse helper returning a DataFrame --
    # confirm it holds no process-global state.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # `map` yields results in input order, so part order is preserved.
        parts = list(executor.map(_request_txt, part_urls))
    data = pd.concat(parts)
    data.to_csv(path_join(task_path, 'ratings.txt'), index=False)
    print('chinese_reviews_douban_movies dataset download completed, run time %d min %.2f sec' %divmod((time.time()-start), 60))
    return task_path
Exemplo n.º 6
0
def poet_song(root):
    """Fetch the Song poetry dataset (Chinese classical literature).

    "Full Song Poetry": following the flourishing of Tang poetry, Song
    poetry developed further in ideological content and artistic
    expression. Many excellent writers and schools emerged, with a
    far-reaching influence on the poetry of the Yuan, Ming and Qing.

    The data is downloaded as a bz2 archive, unpacked with `un_bz2`, and
    the archive is deleted afterwards.

    Files written:
    `root/poet_song/poet_song.json`
    (assumes `un_bz2` unpacks next to the archive and drops the `.bz2`
    suffix -- TODO confirm)
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under `/user/.../mydata/poet_song`,
              pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/poet_song`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'poet_song')
    archive = path_join(dataset_dir, 'poet_song.json.bz2')

    rq.files(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/poetry_song.json.bz2',
        archive)
    un_bz2(archive)
    remove(archive)

    minutes, seconds = divmod(time.time() - began, 60)
    print('poet_song dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 7
0
def online_shopping_10_cats(root):
    """Fetch the Chinese online shopping reviews dataset into `root`.

    The corpus holds 60,000+ samples across about 10 categories (books,
    tablets, mobile phones, fruits, shampoos, water heaters, Mengniu,
    clothes, computers, hotels): more than 30,000 positive reviews and
    more than 30,000 negative reviews.

    Files written:
    `root/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.json`
    `root/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.txt`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under
              `/user/.../mydata/chinese_reviews_online_shopping_10_cats`,
              pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_reviews_online_shopping_10_cats`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'chinese_reviews_online_shopping_10_cats')

    json_target = path_join(dataset_dir, 'chinese_reviews_online_shopping_10_cats.json')
    rq.json(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.json',
        json_target)

    table_target = path_join(dataset_dir, 'chinese_reviews_online_shopping_10_cats.txt')
    rq.table(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_online_shopping_10_cats/chinese_reviews_online_shopping_10_cats.csv',
        table_target)

    minutes, seconds = divmod(time.time() - began, 60)
    print('chinese_reviews_online_shopping_10_cats dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 8
0
def takeaway(root):
    """Fetch the Chinese takeaway reviews dataset into `root`.

    Source of the review table:
    `https://github.com/SophonPlus/ChineseNlpCorpus/blob/master/datasets/waimai_10k/waimai_10k.csv`

    The corpus holds 12,000+ samples: more than 4,000 positive reviews
    and more than 8,000 negative reviews.

    Files written:
    `root/chinese_reviews_takeaway/chinese_reviews_takeaway.json`
    `root/chinese_reviews_takeaway/chinese_reviews_takeaway.txt`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under
              `/user/.../mydata/chinese_reviews_takeaway`, pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/chinese_reviews_takeaway`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'chinese_reviews_takeaway')

    json_target = path_join(dataset_dir, 'chinese_reviews_takeaway.json')
    rq.json(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_reviews_takeaway/chinese_reviews_takeaway.json',
        json_target)

    table_target = path_join(dataset_dir, 'chinese_reviews_takeaway.txt')
    rq.table(
        'https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/waimai_10k/waimai_10k.csv',
        table_target)

    minutes, seconds = divmod(time.time() - began, 60)
    print('chinese_reviews_takeaway dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 9
0
def abbreviation(root):
    """Fetch the Chinese abbreviation dataset into `root`.

    Source repository: `https://github.com/zhangyics/Chinese-abbreviation-dataset`

    A corpus of Chinese abbreviations, released with the paper
    "A Chinese Dataset with Negative Full Forms for General
    Abbreviation Prediction".

    Files written:
    `root/chinese_abbreviation/train_set.txt`
    `root/chinese_abbreviation/test_set.txt`
    `root/chinese_abbreviation/dev_set.txt`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under
              `/user/.../mydata/chinese_abbreviation`, pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/chinese_abbreviation`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'chinese_abbreviation')

    # (remote file, local file name), fetched in this order.
    downloads = (
        ("https://raw.githubusercontent.com/zhangyics/Chinese-abbreviation-dataset/master/train_set.txt",
         'train_set.txt'),
        ("https://raw.githubusercontent.com/zhangyics/Chinese-abbreviation-dataset/master/test_set.txt",
         'test_set.txt'),
        ("https://raw.githubusercontent.com/zhangyics/Chinese-abbreviation-dataset/master/dev_set.txt",
         'dev_set.txt'),
    )
    for source_url, file_name in downloads:
        rq.files(source_url, path_join(dataset_dir, file_name))

    minutes, seconds = divmod(time.time() - began, 60)
    print('chinese abbreviation dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 10
0
def famous_person(root):
    """Fetch the Chinese lexicon of famous persons into `root`.

    Upstream project: `https://github.com/fighting41love/funNLP`

    The chinese_lexicon_famous_person dataset contains 13600+ samples.

    Files written:
    `root/chinese_lexicon_famous_person/chinese_lexicon_famous_person.txt`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under
              `/user/.../mydata/chinese_lexicon_famous_person`,
              pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory,
        `root/chinese_lexicon_famous_person`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'chinese_lexicon_famous_person')

    table_target = path_join(dataset_dir, 'chinese_lexicon_famous_person.txt')
    rq.table(
        "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_lexicon/chinese_lexicon_famous_person.txt",
        table_target)

    minutes, seconds = divmod(time.time() - began, 60)
    print('chinese_lexicon_famous_person dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 11
0
def ci_song(root):
    """Fetch the Song ci dataset (Chinese classical literature).

    Song ci and Tang poetry are regarded as the artistic peaks of
    Chinese classical poetry. "Full Song Ci" is a newer compilation,
    a counterpart to the Qing dynasty "Full Tang Poetry"; it has five
    volumes collecting ci from the three hundred years of the Song dynasty.

    Files written (the file keeps the last path segment of the URL):
    `root/ci_song/ci_song.json`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under `/user/.../mydata/ci_song`,
              pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/ci_song`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'ci_song')

    source_url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/ci_song.json'
    file_name = source_url.rsplit('/', 1)[-1]
    rq.files(source_url, path_join(dataset_dir, file_name))

    minutes, seconds = divmod(time.time() - began, 60)
    print('ci_song dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 12
0
def shijing(root):
    """Fetch the Shijing dataset (Chinese classical literature).

    The Book of Songs is the earliest collection of Chinese poetry. It
    gathers poems from the early Western Zhou dynasty to the middle of
    the Spring and Autumn period (11th to 6th century BC), 311 pieces in
    total, 6 of which survive as a title only, without content.

    Files written (the file keeps the last path segment of the URL):
    `root/shijing/shijing.json`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under `/user/.../mydata/shijing`,
              pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/shijing`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'shijing')

    source_url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/shijing.json'
    file_name = source_url.rsplit('/', 1)[-1]
    rq.files(source_url, path_join(dataset_dir, file_name))

    minutes, seconds = divmod(time.time() - began, 60)
    print('shijing dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 13
0
def poetry_SouthernTang(root):
    """Fetch the poetry_SouthernTang dataset (Chinese classical literature).

    The ci of the two rulers of the Southern Tang dynasty, Li Jing and
    his successor Li Yu. The collection was compiled in the Southern Song
    dynasty and has been re-edited in various versions since.

    Files written:
    `root/poetry_SouthernTang/poetry_SouthernTang.json`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under
              `/user/.../mydata/poetry_SouthernTang`, pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/poetry_SouthernTang`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'poetry_SouthernTang')

    # The remote file has a different name; it is stored under the dataset name.
    json_target = path_join(dataset_dir, 'poetry_SouthernTang.json')
    rq.files(
        "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/nantang_erzhu_poetry.json",
        json_target)

    minutes, seconds = divmod(time.time() - began, 60)
    print('poetry_SouthernTang dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 14
0
def HIT(root):
    """Download the Chinese stop word list of Harbin Institute of Technology.

    The dataset includes 767 stop words. The remote text file is read
    line by line and saved as a single-column file with no header row
    and no index column.

    Data storage directory:
    root = `/user/.../mydata`
    chinese_stop_word_HIT data:
    `root/chinese_stop_word_HIT/chinese_stop_word_HIT.txt`
    Args:
        root: str, Store the absolute path of the data directory.
              example:if you want data path is `/user/.../mydata/chinese_stop_word_HIT`,
              root should be `/user/.../mydata`.
    Returns:
        Store the absolute path of the data directory, is `root/chinese_stop_word_HIT`.
    Raises:
        requests.HTTPError: if the download answers with a 4xx/5xx status.
    """
    start = time.time()
    task_path = assert_dirs(root, 'chinese_stop_word_HIT')
    url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_stop_word/chinese_stop_word_HIT.txt'
    response = requests.get(url)
    # Fail loudly on an HTTP error instead of saving the error body as the
    # stop word list.
    response.raise_for_status()
    # Iterate the text line by line and drop only the '\n' characters, so any
    # other whitespace that is part of an entry is kept.
    words = [
        line.replace('\n', '')
        for line in io.StringIO(response.content.decode('utf-8'))
    ]
    data = pd.DataFrame(words)
    # `header=False` is the documented way to omit the header row.
    data.to_csv(path_join(task_path, 'chinese_stop_word_HIT.txt'),
                index=False,
                header=False)
    print(
        'chinese_stop_word_HIT dataset download completed, run time %d min %.2f sec'
        % divmod((time.time() - start), 60))
    return task_path
Exemplo n.º 15
0
def huajianji(root):
    """Fetch the Huajianji dataset (Chinese classical literature).

    "Hua Jian Ji" is a collection of Chinese ci poetry compiled during
    the Five Dynasties and Ten Kingdoms period, and the first literati
    ci anthology in the history of literature. It includes works by
    Wen Tingyun and Wei Zhuang, and reflects the subject matter,
    aesthetic taste, style and artistic achievement of early ci writing.

    Files written (the file keeps the last path segment of the URL):
    `root/huajianji/huajianji.json`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under `/user/.../mydata/huajianji`,
              pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/huajianji`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'huajianji')

    source_url = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/huajianji.json"
    file_name = source_url.rsplit('/', 1)[-1]
    rq.files(source_url, path_join(dataset_dir, file_name))

    minutes, seconds = divmod(time.time() - began, 60)
    print('huajianji dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 16
0
def economist(root, date, mode='pdf'):
    """Download one issue of The Economist from https://github.com/nailperry-zd/The-Economist.

    The issue's page is fetched, the first line that links to a file of
    the requested format is located, and the file name is read from that
    line's `title="..."` attribute.

    Data storage directory:
    root = `/user/.../mydata`
    economist data:
    `root/...(pdf or epub or mobi)`
    Args:
        root: str, Store the absolute path of the data directory.
        date: str, eg:'2019-01-05'. Must lie a whole number of weeks on or
              after '2017-05-06'. The string is inserted verbatim into the
              download URL (presumably the repository uses `YYYY-MM-DD`
              directory names -- confirm against the repository).
        mode: str, one of ['pdf', 'epub', 'mobi'].
    Returns:
        Store the absolute path of the downloaded file, is `root/...(pdf or epub or mobi)`.
    Raises:
        AssertionError: if `mode` is not one of the supported formats.
        ValueError: if `date` is not a valid issue date, or no file of the
            requested format can be found on the issue's page.
    """
    start = time.time()
    if mode not in ('pdf', 'epub', 'mobi'):
        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; the exception type is unchanged.
        raise AssertionError("`mode` should be one of ['pdf', 'epub', 'mobi'].")
    weeks, extra_days = divmod(
        (pd.to_datetime(date) - pd.to_datetime('2017-05-06')).days, 7)
    if weeks < 0 or extra_days > 0:
        raise ValueError("No book that meets the date.")
    task_path = assert_dirs(root)
    page_url = 'https://github.com/nailperry-zd/The-Economist/raw/master/{}'.format(
        date)
    link_marker = 'The-Economist/blob/master/{}'.format(date)
    page_lines = requests.get(page_url).content.decode('utf-8').split('\n')
    matches = [
        line for line in page_lines
        if link_marker in line and mode in line
    ]
    if not matches:
        # Previously surfaced as an opaque IndexError.
        raise ValueError("No book that meets the date.")
    # A token such as `title="name.pdf"`: strip the 7-character `title="`
    # prefix and the closing quote.
    titles = [
        token[7:-1] for token in matches[0].split(' ') if 'title' in token
    ]
    if not titles:
        raise ValueError("No book that meets the date.")
    url = 'https://github.com/nailperry-zd/The-Economist/raw/master/' + date + '/' + titles[0]
    task_path = path_join(task_path, url.split('/')[-1])
    rq.files(url, task_path, verbose=1)
    print('economist dataset download completed, run time %d min %.2f sec' %
          divmod((time.time() - start), 60))
    return task_path
Exemplo n.º 17
0
def SCU(root):
    """Fetch the Chinese stop word list of Sichuan University into `root`.

    The dataset includes 976 stop words.

    Files written:
    `root/chinese_stop_word_SCU/chinese_stop_word_SCU.txt`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under
              `/user/.../mydata/chinese_stop_word_SCU`, pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/chinese_stop_word_SCU`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'chinese_stop_word_SCU')

    table_target = path_join(dataset_dir, 'chinese_stop_word_SCU.txt')
    rq.table(
        'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/chinese_stop_word/chinese_stop_word_SCU.txt',
        table_target)

    minutes, seconds = divmod(time.time() - began, 60)
    print('chinese_stop_word_SCU dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 18
0
def lunyu(root):
    """Fetch the Lunyu dataset (Chinese classical literature).

    "The Analects of Confucius" is a Confucian classic: a collection of
    quotations of Confucius and his disciples, written down by his
    disciples and their own disciples in the early Warring States period.
    It has 20 books and 492 chapters, mainly quotations supplemented by
    narrative, recording the words and deeds of Confucius and his
    disciples and reflecting his political opinions, ethical thought,
    moral concepts and educational principles. Together with
    "Great Learning", "The Doctrine of the Mean" and "Mencius" it forms
    the "Four Books"; with "The Book of Songs", "Shangshu",
    "Book of Rites", "Zhou Yi" and "Spring and Autumn" these are known
    as the "Four Books and Five Classics".

    Files written (the file keeps the last path segment of the URL):
    `root/lunyu/lunyu.json`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under `/user/.../mydata/lunyu`,
              pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/lunyu`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'lunyu')

    source_url = 'https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/lunyu.json'
    file_name = source_url.rsplit('/', 1)[-1]
    rq.files(source_url, path_join(dataset_dir, file_name))

    minutes, seconds = divmod(time.time() - began, 60)
    print('lunyu dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir
Exemplo n.º 19
0
def youmengying(root):
    """Fetch the Youmengying dataset (Chinese classical literature).

    "You Meng Ying" is a collection of writings by the Qing dynasty
    writer Zhang Chao.

    Files written (the file keeps the last path segment of the URL):
    `root/youmengying/youmengying.json`
    Args:
        root: str, absolute path of the parent data directory.
              For example, to keep the data under `/user/.../mydata/youmengying`,
              pass `/user/.../mydata`.
    Returns:
        Absolute path of the dataset directory, `root/youmengying`.
    """
    began = time.time()
    dataset_dir = assert_dirs(root, 'youmengying')

    source_url = "https://raw.githubusercontent.com/Hourout/datasets/master/nlp/wenxue/youmengying.json"
    file_name = source_url.rsplit('/', 1)[-1]
    rq.files(source_url, path_join(dataset_dir, file_name))

    minutes, seconds = divmod(time.time() - began, 60)
    print('youmengying dataset download completed, run time %d min %.2f sec' % (minutes, seconds))
    return dataset_dir