예제 #1
0
파일: build.py 프로젝트: xhuvom/ParlAI
def build(opt):
    """Download the WebQuestions data and run the format conversion.

    Nothing happens when ``build_data.built`` already reports the target
    directory as built. Otherwise the directory is recreated, the two
    CodaLab bundles are fetched, and ``create_fb_format`` is run for the
    train, valid and test splits.

    Args:
        opt: option mapping; only ``opt['datapath']`` is read here.
    """
    dpath = opt['datapath'] + "/WebQuestions/"

    if build_data.built(dpath):
        return

    print("[building data: " + dpath + "]")
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download the data. After each download the file 'index.html' inside
    # dpath is renamed to the JSON file of the corresponding split.
    bundle_root = "https://worksheets.codalab.org/rest/bundles/"
    bundles = (
        ("0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/", 'train.json'),
        ("0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/", 'test.json'),
    )
    for bundle_suffix, json_name in bundles:
        build_data.download(dpath, bundle_root + bundle_suffix)
        build_data.move(dpath + 'index.html', dpath + json_name)

    # Both 'train' and 'valid' are generated from train.json; only 'test'
    # reads test.json.
    split_sources = (
        ('train', 'train.json'),
        ('valid', 'train.json'),
        ('test', 'test.json'),
    )
    for split_name, json_name in split_sources:
        create_fb_format(dpath, split_name, dpath + json_name)

    # Mark the data as built.
    build_data.mark_done(dpath)
예제 #2
0
def train_test_split(inpath, train, test, split, random_seed):
    """Randomly split the entries of ``inpath`` into train and test folders.

    RuCor does not ship a train/test split, so one is produced here by
    shuffling the directory listing and moving every entry into either the
    train or the test folder.

    Args:
        inpath: directory whose entries are split.
        train: destination folder for the train part.
        test: destination folder for the test part.
        split: forwarded to ``ShuffleSplit`` as ``test_size``.
        random_seed: forwarded to ``ShuffleSplit`` as ``random_state``.

    Returns:
        None
    """
    print('Start train-test splitting ...')
    entries = os.listdir(inpath)
    splitter = ShuffleSplit(1, test_size=split, random_state=random_seed)
    # The splitter is created with 1 as its first argument (presumably the
    # number of splits), so this loop is expected to run exactly once.
    for train_idx, test_idx in splitter.split(entries):
        train_set = [entries[i] for i in sorted(train_idx)]
        test_set = [entries[i] for i in sorted(test_idx)]

    # Train entries are moved first, then test entries.
    for subset, target_dir in ((train_set, train), (test_set, test)):
        for name in subset:
            build_data.move(os.path.join(inpath, name),
                            os.path.join(target_dir, name))
    print('End train-test splitts.')
    return None
예제 #3
0
def build(opt):
    """Download VisDial v0.9 and derive train/valid/test JSON files.

    Skipped entirely when ``build_data.built`` reports that this version is
    already present. Otherwise both archives are downloaded and unpacked,
    a validation set is carved out of the training dialogs, and the
    downloaded validation JSON is renamed so that it serves as the test set.

    Args:
        opt: option mapping; only ``opt['datapath']`` is read here.
    """
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if build_data.built(dpath, version):
        return

    print('[building data: ' + dpath + ']')

    # An older version may exist; if so, remove its outdated files.
    if build_data.built(dpath):
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download both archives first, then unpack both.
    fname1 = 'visdial_0.9_train.zip'
    fname2 = 'visdial_0.9_val.zip'
    url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
    for archive in (fname1, fname2):
        build_data.download(url + archive, dpath, archive)
    for archive in (fname1, fname2):
        build_data.untar(dpath, archive)

    print('processing unpacked files')
    # Path of the training JSON without its extension; the derived
    # '_train' / '_valid' files share this stem.
    train_stem = os.path.join(dpath, fname1.rsplit('.', 1)[0])
    json1 = train_stem + '.json'
    with open(json1) as t_json:
        train_data = json.load(t_json)

    # Shallow copies: validation shares all metadata with training but
    # starts with its own, empty dialog list.
    valid_data = train_data.copy()
    valid_data['data'] = dict(train_data['data'], dialogs=[])

    # Move up to num_valid dialogs from training to validation, picked at a
    # constant stride. Indices run from the end towards the start, so
    # removing one entry never shifts the entries still to be visited.
    num_valid = 1000
    train_dialogs = train_data['data']['dialogs']
    total = len(train_dialogs)
    step = total // (num_valid - 1)
    for idx in range(total - 1, 0, -step)[:num_valid]:
        valid_data['data']['dialogs'].append(train_dialogs.pop(idx))

    train_json = train_stem + '_train.json'
    valid_json = train_stem + '_valid.json'
    with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
        json.dump(train_data, t_out)
        json.dump(valid_data, v_out)
    os.remove(json1)

    # Use validation data as test.
    test_stem = os.path.join(dpath, fname2.rsplit('.', 1)[0])
    build_data.move(test_stem + '.json', test_stem + '_test.json')

    # Mark the data as built.
    build_data.mark_done(dpath, version)
예제 #4
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Fetch the VisDial v0.9 archives and prepare train/valid/test JSONs.

    The work is done only when ``build_data.built(dpath, version)`` is
    false. The training JSON is split into a training part and a validation
    part, and the downloaded validation JSON is renamed to act as test data.

    Args:
        opt: option mapping; only ``opt['datapath']`` is read here.
    """
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')

        stale_build_present = build_data.built(dpath)
        if stale_build_present:
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data, then unpack it (all downloads happen first).
        base_url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        train_zip = 'visdial_0.9_train.zip'
        val_zip = 'visdial_0.9_val.zip'
        archives = [train_zip, val_zip]
        for name in archives:
            build_data.download(base_url + name, dpath, name)
        for name in archives:
            build_data.untar(dpath, name)

        print('processing unpacked files')
        # The JSON files are named after the archives, minus the extension.
        train_base = os.path.join(dpath, train_zip.rsplit('.', 1)[0])
        val_base = os.path.join(dpath, val_zip.rsplit('.', 1)[0])
        original_train_json = train_base + '.json'

        with open(original_train_json) as src:
            train_data = json.load(src)

        # Validation data reuses the training metadata (shallow copies) with
        # a fresh, empty list of dialogs.
        valid_data = train_data.copy()
        valid_payload = train_data['data'].copy()
        valid_payload['dialogs'] = []
        valid_data['data'] = valid_payload

        # Use a constant stride to pick up to num_valid training dialogs.
        # Walking backwards keeps the remaining indices valid while entries
        # are being removed from the training list.
        num_valid = 1000
        dialogs = train_data['data']['dialogs']
        total = len(dialogs)
        step = total // (num_valid - 1)
        picked = range(total - 1, 0, -step)[:num_valid]
        for position in picked:
            valid_payload['dialogs'].append(dialogs[position])
            del dialogs[position]

        train_json = train_base + '_train.json'
        valid_json = train_base + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        os.remove(original_train_json)

        # Use validation data as test.
        build_data.move(val_base + '.json', val_base + '_test.json')

        # Mark the data as built.
        build_data.mark_done(dpath, version)
예제 #5
0
def build(opt):
    """Download the VisDial v0.9 resources and build train/valid/test JSONs.

    Does nothing when ``build_data.built(dpath, version)`` is true.
    Otherwise every entry of ``RESOURCES`` is downloaded into the data
    directory, a validation set is split off the training dialogs, and the
    second resource's JSON is renamed to serve as the test set.

    Args:
        opt: option mapping; only ``opt['datapath']`` is read here.
    """
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if build_data.built(dpath, version):
        return

    print('[building data: ' + dpath + ']')

    # An older version may exist; if so, remove its outdated files.
    if build_data.built(dpath):
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download the data.
    # NOTE(review): the JSON files read below are presumably produced by
    # download_file unpacking the archives - confirm against RESOURCES.
    for resource in RESOURCES:
        resource.download_file(dpath)

    print('processing unpacked files')
    # RESOURCES[0] is treated as the training archive; its JSON shares the
    # archive's name without the extension.
    train_stem = os.path.join(dpath, RESOURCES[0].file_name.rsplit('.', 1)[0])
    json1 = train_stem + '.json'
    with PathManager.open(json1) as t_json:
        train_data = json.load(t_json)

    # Shallow copies: validation keeps the training metadata but gets its
    # own, initially empty, dialog list.
    valid_data = train_data.copy()
    valid_data['data'] = dict(train_data['data'], dialogs=[])

    # Move up to num_valid dialogs into validation using a constant stride.
    # Indices descend, so a removal never disturbs indices still to come.
    num_valid = 1000
    train_dialogs = train_data['data']['dialogs']
    total = len(train_dialogs)
    step = total // (num_valid - 1)
    for idx in range(total - 1, 0, -step)[:num_valid]:
        valid_data['data']['dialogs'].append(train_dialogs.pop(idx))

    train_json = train_stem + '_train.json'
    valid_json = train_stem + '_valid.json'
    with PathManager.open(train_json, 'w') as t_out, \
            PathManager.open(valid_json, 'w') as v_out:
        json.dump(train_data, t_out)
        json.dump(valid_data, v_out)
    PathManager.rm(json1)

    # Use validation data as test (RESOURCES[1]).
    test_stem = os.path.join(dpath, RESOURCES[1].file_name.rsplit('.', 1)[0])
    build_data.move(test_stem + '.json', test_stem + '_test.json')

    # Mark the data as built.
    build_data.mark_done(dpath, version)
예제 #6
0
def train_test_split(inpath, train, test, split, random_seed):
    """Shuffle the entries of ``inpath`` and move them into two folders.

    Args:
        inpath: directory whose entries are distributed.
        train: folder receiving the train part.
        test: folder receiving the test part.
        split: passed to ``ShuffleSplit`` as ``test_size``.
        random_seed: passed to ``ShuffleSplit`` as ``random_state``.

    Returns:
        None
    """
    print('Start train-test splitting ...')
    listing = os.listdir(inpath)
    doc_split = ShuffleSplit(1, test_size=split, random_state=random_seed)
    # The splitter is built with 1 as its first argument (presumably the
    # number of splits), so the loop body is expected to run only once.
    for train_positions, test_positions in doc_split.split(listing):
        train_set = [listing[pos] for pos in sorted(train_positions)]
        test_set = [listing[pos] for pos in sorted(test_positions)]

    def _move_all(names, destination):
        # Relocate each named entry from inpath into destination.
        for name in names:
            build_data.move(os.path.join(inpath, name),
                            os.path.join(destination, name))

    _move_all(train_set, train)
    _move_all(test_set, test)
    print('End train-test splitts.')
    return None
예제 #7
0
def train_valid_test_split(inpath,
                           train_path,
                           valid_path,
                           test_path,
                           valid_ratio,
                           test_ratio,
                           seed=None):
    """Split the files in ``inpath`` into train/valid/test and move them.

    The sorted directory listing is split twice with ``train_test_split``
    (presumably scikit-learn's, as imported by this module): first the test
    part is separated, then the remainder is divided into train and valid.
    Every file is then moved into the folder of the part it landed in.

    Args:
        inpath: directory containing all data files.
        train_path: folder receiving the train files.
        valid_path: folder receiving the valid files.
        test_path: folder receiving the test files.
        valid_ratio: len(valid) / len(all_datafiles); must be > 0.
        test_ratio: len(test) / len(all_datafiles); must be > 0.
        seed: random seed, used as ``random_state`` for both splits.

    Returns:
        None

    Raises:
        AssertionError: if a ratio is not positive or the two ratios do not
            leave room for a train part (their sum must be below 1).
    """

    # The sum must be strictly below 1: with a sum of exactly 1 the second
    # split would be asked for a test fraction of ~1.0, leaving no train
    # files, and would only fail later with a less helpful error.
    assert valid_ratio + test_ratio < 1.0, (
        'valid_ratio + test_ratio must be < 1.0 to leave files for train')
    assert valid_ratio > 0 and test_ratio > 0, (
        'valid_ratio and test_ratio must both be positive')
    # Sorting makes the split depend only on the seed, not on listing order.
    source_files = sorted(os.listdir(inpath))

    train_valid, test = train_test_split(source_files,
                                         test_size=test_ratio,
                                         random_state=seed)
    # valid_ratio is relative to the whole dataset, so rescale it to the
    # share of the files that remain after removing the test part.
    train, valid = train_test_split(train_valid,
                                    test_size=valid_ratio / (1 - test_ratio),
                                    random_state=seed)

    print('train_valid_test_split: {}/{}/{}'.format(len(train), len(valid),
                                                    len(test)))
    for dataset, data_path in zip([train, valid, test],
                                  [train_path, valid_path, test_path]):
        for el in dataset:
            build_data.move(os.path.join(inpath, el),
                            os.path.join(data_path, el))
    return None
예제 #8
0
def build(opt):
    """Download and unpack the MovieDialog data, including the reddit task.

    Nothing is done when ``build_data.built`` already reports the target
    directory as built.

    Args:
        opt: option mapping; only ``opt['datapath']`` is read here.
    """
    dpath = opt['datapath'] + "/MovieDialog/"

    if build_data.built(dpath):
        return

    print("[building data: " + dpath + "]")
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download and unpack the main archive.
    fname = "moviedialog.tar.gz"
    main_url = "https://s3.amazonaws.com/fair-data/parlai/moviedialog/" + fname
    build_data.download(dpath, main_url)
    build_data.untar(dpath, fname)

    # The task4_reddit data is fetched separately through a short link. The
    # file found under the link's id is renamed to a .tgz name before it is
    # unpacked.
    reddit_dir = dpath + "/movie_dialog_dataset/task4_reddit/"
    downloaded_path = reddit_dir + "p6tyohj"
    tgz_path = reddit_dir + "p6tyohj.tgz"
    short_url = "http://tinyurl.com/" + "p6tyohj"
    build_data.download(reddit_dir, short_url)
    build_data.move(downloaded_path, tgz_path)
    build_data.untar(reddit_dir, "p6tyohj.tgz")

    # Mark the data as built.
    build_data.mark_done(dpath)
예제 #9
0
def build(opt):
    """Download and unpack the MovieDialog data, including the reddit task.

    Nothing is done when ``build_data.built`` already reports the target
    directory as built.

    Args:
        opt: option mapping; only ``opt['datapath']`` is read here.
    """
    dpath = os.path.join(opt['datapath'], 'MovieDialog')

    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download and unpack the main archive.
    fname = 'moviedialog.tar.gz'
    main_url = 'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + fname
    build_data.download(dpath, main_url)
    build_data.untar(dpath, fname)

    # The task4_reddit data is fetched separately through a short link. The
    # file found under the link's id is renamed to a .tgz name before it is
    # unpacked.
    reddit_dir = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
    downloaded_path = os.path.join(reddit_dir, 'p6tyohj')
    tgz_path = os.path.join(reddit_dir, 'p6tyohj.tgz')
    short_url = 'http://tinyurl.com/' + 'p6tyohj'
    build_data.download(reddit_dir, short_url)
    build_data.move(downloaded_path, tgz_path)
    build_data.untar(reddit_dir, 'p6tyohj.tgz')

    # Mark the data as built.
    build_data.mark_done(dpath)
예제 #10
0
def build(opt):
    """Fetch and unpack the dstc2 dataset from the datasets repository.

    The repository location is read from the ``DATASETS_URL`` environment
    variable. A ``file://`` location is handled by moving the archive from
    the local path into the data directory; anything else is downloaded.
    Nothing is done when ``build_data.built`` reports the data as built.

    Args:
        opt: option mapping; only ``opt['datapath']`` is read here.

    Raises:
        RuntimeError: if ``DATASETS_URL`` is unset or empty.
    """
    dpath = os.path.join(opt['datapath'], 'dstc2')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        # Validate the configuration before touching any existing data.
        # With a missing/empty base, urljoin() would silently return the
        # bare file name and the download would be attempted with a
        # schemeless "URL".
        ds_path = os.environ.get('DATASETS_URL')
        if not ds_path:
            raise RuntimeError(
                'DATASETS_URL environment variable is not set; '
                'cannot locate the dstc2 dataset')

        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        filename = 'dstc2.tar.gz'

        # Download the data.
        print('Trying to download a dataset %s from the repository' % filename)
        # NOTE(review): urljoin() replaces the last path segment of a base
        # that lacks a trailing slash - DATASETS_URL should end with '/'.
        url = urllib.parse.urljoin(ds_path, filename)
        if url.startswith('file://'):
            # Local repository: strip the 'file://' prefix and move the
            # archive into the data directory.
            build_data.move(url[7:], dpath)
        else:
            build_data.download(url, dpath, filename)
        build_data.untar(dpath, filename)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)