예제 #1
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download the QACNN archive and convert its question folders.

    Returns immediately when ``build_data.built`` already reports version
    ``v1.0`` for the target directory.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'QACNN')
    version = 'v1.0'

    if build_data.built(dpath, version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Something was built here under another version; drop the
        # outdated files before rebuilding.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch the archive from Google Drive and unpack it in place.
    archive = 'cnn.tgz'
    drive_id = '0BwmD_VLjROrfTTljRDVZMFJnVWM'
    build_data.download_from_google_drive(
        drive_id, os.path.join(dpath, archive))
    build_data.untar(dpath, archive)

    # Convert every split; the folder names differ from the split names.
    questions_dir = os.path.join(dpath, 'cnn', 'questions')
    splits = (
        ('train', 'training'),
        ('valid', 'validation'),
        ('test', 'test'),
    )
    for dtype, folder in splits:
        create_fb_format(dpath, dtype, os.path.join(questions_dir, folder))

    # Record the finished build.
    build_data.mark_done(dpath, version)
예제 #2
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download and unpack the MovieDialog data plus its reddit task archive.

    Two archives are fetched: the main dataset into the dataset directory
    and a second one into ``movie_dialog_dataset/task4_reddit`` below it.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = '1'
    dpath = os.path.join(opt['datapath'], 'MovieDialog')

    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    reddit_dir = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
    if build_data.built(dpath):
        # Remove files left over from a build with another version.
        build_data.remove_dir(dpath)
    for folder in (dpath, reddit_dir):
        build_data.make_dir(folder)

    # Download both archives first, then unpack them.
    main_archive = 'moviedialog.tar.gz'
    reddit_archive = 'p6tyohj.tgz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + main_archive,
        dpath,
        main_archive,
    )
    build_data.download(
        'http://tinyurl.com/' + 'p6tyohj', reddit_dir, reddit_archive)

    build_data.untar(dpath, main_archive)
    build_data.untar(reddit_dir, reddit_archive)

    # Record the finished build.
    build_data.mark_done(dpath, version_string=version)
예제 #3
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download WikiQA and convert its TSV files with ``create_fb_format``.

    Each of the train/dev/test TSV files is converted twice: once under
    the plain split name and once under the ``-filtered`` name.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = None
    dpath = os.path.join(opt['datapath'], 'WikiQA')

    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files left over from a build with another version.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    archive = 'wikiqa.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + archive,
        dpath,
        archive,
    )
    build_data.untar(dpath, archive)

    corpus_dir = os.path.join(dpath, 'WikiQACorpus')
    conversions = (
        ('train', 'WikiQA-train.tsv'),
        ('valid', 'WikiQA-dev.tsv'),
        ('test', 'WikiQA-test.tsv'),
        ('train-filtered', 'WikiQA-train.tsv'),
        ('valid-filtered', 'WikiQA-dev.tsv'),
        ('test-filtered', 'WikiQA-test.tsv'),
    )
    for dtype, tsv_name in conversions:
        create_fb_format(dpath, dtype, os.path.join(corpus_dir, tsv_name))

    # Record the finished build.
    build_data.mark_done(dpath, version_string=version)
예제 #4
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download and unpack the VQA-v2 question and annotation archives.

    All five archives are downloaded first and unpacked afterwards, in the
    same order.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = None
    dpath = os.path.join(opt['datapath'], 'VQA-v2')

    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    base_url = 'http://visualqa.org/data/mscoco/vqa/'
    archives = (
        'v2_Questions_Train_mscoco.zip',
        'v2_Questions_Val_mscoco.zip',
        'v2_Questions_Test_mscoco.zip',
        'v2_Annotations_Val_mscoco.zip',
        'v2_Annotations_Train_mscoco.zip',
    )

    # Download everything before unpacking anything.
    for archive in archives:
        build_data.download(base_url + archive, dpath, archive)
    for archive in archives:
        build_data.untar(dpath, archive)

    # Record the finished build.
    build_data.mark_done(dpath, version_string=version)
예제 #5
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download MCTest and convert the mc160/mc500 splits.

    Only the two test splits are converted with a separate answers file;
    the train and dev splits pass ``None`` in that position.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = None
    dpath = os.path.join(opt['datapath'], 'MCTest')

    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files left over from a build with another version.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    archive = 'mctest.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/mctest/' + archive,
        dpath,
        archive,
    )
    build_data.untar(dpath, archive)

    extracted = os.path.join(dpath, 'mctest')
    # (output name, story file, answers file or None)
    jobs = (
        ('train160', 'mc160.train', None),
        ('valid160', 'mc160.dev', None),
        ('test160', 'mc160.test', 'mc160.test.ans'),
        ('train500', 'mc500.train', None),
        ('valid500', 'mc500.dev', None),
        ('test500', 'mc500.test', 'mc500.test.ans'),
    )
    for dtype, story_file, answer_file in jobs:
        answers = None
        if answer_file is not None:
            answers = os.path.join(extracted, 'MCTestAnswers', answer_file)
        create_fb_format(
            dpath, dtype, os.path.join(extracted, 'MCTest', story_file), answers)

    # Record the finished build.
    build_data.mark_done(dpath, version_string=version)
예제 #6
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download NarrativeQA, fetch its stories and reorganise the files.

    After unpacking, the stories are downloaded, the CSV files are split
    with ``divide_csv_into_sets``, the per-set files are moved into their
    folders, and ``narrativeqa-master`` is renamed to ``narrative_qa``.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = None
    dpath = os.path.join(opt['datapath'], 'NarrativeQA')

    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files left over from a build with another version.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the repository archive.
    archive = 'narrative_qa.zip'
    build_data.download(NARRATIVE_QA_DOWNLOAD_URL, dpath, archive)
    build_data.untar(dpath, archive)

    print('downloading stories now')
    repo_dir = os.path.join(dpath, 'narrativeqa-master')
    download_stories(repo_dir)

    # The stories are expected under 'tmp'; rename that folder to 'stories'.
    shutil.move(os.path.join(repo_dir, 'tmp'),
                os.path.join(repo_dir, 'stories'))

    # Pull summaries.csv up to the repository root before splitting it.
    summaries_csv = os.path.join(repo_dir, 'summaries.csv')
    shutil.move(
        os.path.join(repo_dir, 'third_party', 'wikipedia', 'summaries.csv'),
        summaries_csv,
    )

    # Split summaries, questions and documents into their sets.
    csv_files = (
        summaries_csv,
        os.path.join(repo_dir, 'qaps.csv'),
        os.path.join(repo_dir, 'documents.csv'),
    )
    for csv_file in csv_files:
        divide_csv_into_sets(csv_file)

    # Move each set's files into that set's folder.
    make_folders(repo_dir)
    move_files(repo_dir)

    # Rename narrativeqa-master to narrative_qa.
    shutil.move(repo_dir, os.path.join(dpath, 'narrative_qa'))

    # Record the finished build.
    build_data.mark_done(dpath, version_string=version)
예제 #7
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download VisDial v0.9 and derive train/valid/test JSON files.

    The training archive is split into ``*_train.json`` and
    ``*_valid.json`` by moving evenly strided dialogs (at most 1000) out
    of the training set. The validation archive's JSON is renamed to
    ``*_test.json``.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')

        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'visdial_0.9_train.zip'
        fname2 = 'visdial_0.9_val.zip'

        url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)

        print('processing unpacked files')
        # Use 1000 examples from training set as validation.
        json1 = os.path.join(dpath, fname1.rsplit('.', 1)[0] + '.json')
        with open(json1) as t_json:
            train_data = json.load(t_json)

        # Copy the two outer dict levels so that replacing 'dialogs' on the
        # validation side leaves the training list untouched.
        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Use constant stride to pick examples.
        num_valid = 1000
        train_dialogs = train_data['data']['dialogs']
        total = len(train_dialogs)
        # Clamp the stride to 1: with fewer than num_valid - 1 dialogs the
        # integer division yields 0 and range() rejects a zero step.
        step = max(1, total // (num_valid - 1))
        # Walk from the end so deleting an item never shifts the indices
        # that are still to be visited.
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(train_dialogs[i])
            del train_dialogs[i]

        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        os.remove(json1)

        # Use validation data as test.
        json2 = os.path.join(dpath, fname2.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
예제 #8
0
def build(opt):
    """Download and unpack the wizard_of_wikipedia archive (version 1.0).

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = '1.0'
    archive = 'wizard_of_wikipedia.tgz'
    dpath = os.path.join(opt['datapath'], 'wizard_of_wikipedia')

    if build_data.built(dpath, version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files left over from a build with another version.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch, unpack, then record the finished build.
    build_data.download(
        'http://parl.ai/downloads/wizard_of_wikipedia/' + archive,
        dpath,
        archive,
    )
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version)
예제 #9
0
def build(opt):
    """Download DailyDialog and replace ``train.json`` with augmented data.

    Each line of ``train.json`` is parsed as JSON and reduced to the list
    of its utterance texts. ``augment_dataset`` is applied to the whole
    collection, and the result is re-packaged (with empty emotion/act/topic
    fields) and written back over the same file as one JSON array.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'dailydialog_augmented')
    # NOTE(review): this is the string 'None', not the None object; kept
    # as-is so an existing build marker still matches. Confirm the intent.
    version = 'None'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        remote_fname = 'dailydialog.tar.gz'
        local_fname = 'dailydialog_augmented.tar.gz'
        url = 'http://parl.ai/downloads/dailydialog/' + remote_fname
        build_data.download(url, dpath, local_fname)
        build_data.untar(dpath, local_fname)

        def package_dialog(dialog):
            """Wrap a list of utterance strings in the on-disk dialog layout."""
            return {
                "fold": "train",
                "topic": "",
                "dialogue": [
                    {'emotion': "", 'act': "", 'text': utterance}
                    for utterance in dialog
                ],
            }

        fpath = os.path.join(dpath, 'train.json')
        with open(fpath, mode='r+') as f:
            print('augmenting dailydialog')
            # One JSON object per line; keep only the utterance texts.
            data = [
                [turn['text'] for turn in json.loads(line)['dialogue']]
                for line in f
            ]
            augmented_data = [
                package_dialog(dialog) for dialog in augment_dataset(data)
            ]

            # Overwrite the file in place. Truncate afterwards so that a
            # payload shorter than the old content leaves no stale tail
            # bytes that would corrupt the JSON.
            f.seek(0)
            json.dump(augmented_data, f)
            f.truncate()

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
예제 #10
0
파일: build.py 프로젝트: sikopet/ParlAI
def build(opt):
    """Download InsuranceQA from GitHub and convert the V2 splits.

    The vocabulary and label-to-answer files are read first and then
    passed to ``create_fb_format`` for each of train/valid/test.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'InsuranceQA')

    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch the repository snapshot from GitHub and unpack it.
    archive = 'insuranceqa.zip'
    build_data.download(
        'https://github.com/shuzi/insuranceQA/archive/master.zip',
        dpath,
        archive,
        redownload=False,
    )
    build_data.untar(dpath, archive)

    # According to the author, V2 holds the latest data.
    v2_dir = os.path.join(dpath, 'insuranceQA-master/V2')

    # Vocabulary first; the label-to-answer reader needs it.
    d_vocab = read_vocab(os.path.join(v2_dir, "vocabulary"))
    d_label_answer = read_label2answer(
        os.path.join(v2_dir, "InsuranceQA.label2answer.token.encoded.gz"),
        d_vocab,
    )

    # TODO: right now it uses 100 by default, but 500, 1000, 1500
    # (# of label candidates) should also be available
    split_files = (
        ('train',
         "InsuranceQA.question.anslabel.token.100.pool.solr.train.encoded.gz"),
        ('valid',
         "InsuranceQA.question.anslabel.token.100.pool.solr.valid.encoded.gz"),
        ('test',
         "InsuranceQA.question.anslabel.token.100.pool.solr.test.encoded.gz"),
    )
    for dtype, gz_name in split_files:
        create_fb_format(
            dpath, dtype, os.path.join(v2_dir, gz_name), d_vocab,
            d_label_answer)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #11
0
def build_data_for_agent(opt):
    """Create the agent's folder tree and download any missing resources.

    The tree is rooted at ``<opt['model_file']>/<opt['language']>/agent``.
    The embeddings file, the character vocabulary and (only when
    ``opt['name'] == 'pretrain_model'``) the pretrained model are
    downloaded when they are not already on disk.

    Args:
        opt: options mapping; reads ``model_file``, ``language`` and
            ``name``.

    Returns:
        None.

    Raises:
        RuntimeError: when a download is interrupted by a
            ``RuntimeWarning``; the message says how to supply the file
            manually.
        KeyError: when the ``EMBEDDINGS_URL`` / ``MODELS_URL`` environment
            variable needed for a download is not set.
    """
    # get path to data directory and create folders tree
    dpath = join(opt['model_file'])
    # define languages
    language = opt['language']
    dpath = join(dpath, language, 'agent')
    build_data.make_dir(dpath)

    build_data.make_dir(join(dpath, 'embeddings'))
    build_data.make_dir(join(dpath, 'vocab'))
    build_data.make_dir(join(dpath, 'logs', opt['name']))

    if not isfile(join(dpath, 'embeddings', 'embeddings_lenta_100.vec')):
        print('[Download the word embeddings]...')
        try:
            embed_url = os.environ['EMBEDDINGS_URL'] + 'embeddings_lenta_100.vec'
            build_data.download(embed_url, join(dpath, 'embeddings'), 'embeddings_lenta_100.vec')
            print('[End of download the word embeddings]...')
        except RuntimeWarning as err:
            # Raising a plain string is itself a TypeError; wrap the hint
            # in a real exception and keep the original cause attached.
            raise RuntimeError(
                'To use your own embeddings, please, put the file embeddings_lenta_100.vec in the folder '
                '{0}'.format(join(dpath, 'embeddings'))) from err

    if not isfile(join(dpath, 'vocab', 'char_vocab.russian.txt')):
        print('[Download the chars vocalibary]...')
        try:
            vocab_url = os.environ['MODELS_URL'] + 'coreference/vocabs/char_vocab.russian.txt'
            build_data.download(vocab_url, join(dpath, 'vocab'), 'char_vocab.russian.txt')
            print('[End of download the chars vocalibary]...')
        except RuntimeWarning as err:
            # Point at 'vocab', the folder this function creates and checks.
            raise RuntimeError(
                'To use your own char vocalibary, please, put the file char_vocab.russian.txt in the folder '
                '{0}'.format(join(dpath, 'vocab'))) from err

    if opt['name'] == 'pretrain_model' and not isdir(join(dpath, 'logs', 'pretrain_model')):
        print('[Download the pretrain model]...')
        try:
            pretrain_url = os.environ['MODELS_URL'] + 'coreference/OpeanAI/pretrain_model.zip'
            build_data.download(pretrain_url, join(dpath, 'logs'), 'pretrain_model.zip')
            build_data.untar(join(dpath, 'logs'), 'pretrain_model.zip')
            print('[End of download pretrain model]...')
        except RuntimeWarning as err:
            raise RuntimeError(
                'To train your own model, please, change the variable --name in build.py:train_coreference '
                'to anything other than `pretrain_model`') from err

    build_data.make_dir(join(dpath, 'reports', 'response_files'))
    build_data.make_dir(join(dpath, 'reports', 'results'))
    build_data.make_dir(join(dpath, 'reports', 'predictions'))
    return None
예제 #12
0
def build(opt):
    """Download and unpack the dialog-bAbI archive unless already built.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')

    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    # Start from an empty directory.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (path, url).
    archive = 'dialog_babi.tar.gz'
    source = 'https://s3.amazonaws.com/fair-data/parlai/dialog_babi/' + archive
    build_data.download(dpath, source)
    build_data.untar(dpath, archive)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #13
0
def build(opt):
    """Fetch the dialog-bAbI archive into ``<datapath>/dialog-bAbI/``.

    Does nothing when ``build_data.built`` reports the directory as built.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = opt['datapath'] + "/dialog-bAbI/"
    if build_data.built(dpath):
        return

    print("[building data: " + dpath + "]")
    # Recreate the target directory from scratch.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (path, url).
    tarball = "dialog_babi.tar.gz"
    build_data.download(
        dpath,
        "https://s3.amazonaws.com/fair-data/parlai/dialog_babi/" + tarball,
    )
    build_data.untar(dpath, tarball)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #14
0
def build(opt):
    """Download and unpack the BookTest archive unless already built.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'BookTest')

    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    # Start from an empty directory.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (path, url).
    archive = 'booktest.tar.bz2'
    source = 'https://s3.amazonaws.com/fair-data/parlai/booktest/' + archive
    build_data.download(dpath, source)
    build_data.untar(dpath, archive)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #15
0
파일: build.py 프로젝트: xhuvom/ParlAI
def build(opt):
    """Fetch the BookTest archive into ``<datapath>/BookTest/``.

    Does nothing when ``build_data.built`` reports the directory as built.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = opt['datapath'] + "/BookTest/"
    if build_data.built(dpath):
        return

    print("[building data: " + dpath + "]")
    # Recreate the target directory from scratch.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (path, url).
    tarball = "booktest.tar.bz2"
    build_data.download(
        dpath,
        "https://s3.amazonaws.com/fair-data/parlai/booktest/" + tarball,
    )
    build_data.untar(dpath, tarball)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #16
0
파일: build.py 프로젝트: xhuvom/ParlAI
def build(opt):
    """Fetch the Ubuntu archive into ``<datapath>/Ubuntu/``.

    Does nothing when ``build_data.built`` reports the directory as built.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = opt['datapath'] + '/Ubuntu/'
    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    # Recreate the target directory from scratch.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (path, url).
    tarball = 'ubuntu.tar.gz'
    build_data.download(
        dpath,
        'https://s3.amazonaws.com/fair-data/parlai/ubuntu/' + tarball,
    )
    build_data.untar(dpath, tarball)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #17
0
def build(opt):
    """Download and unpack the WikiMovies archive unless already built.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'WikiMovies')

    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    # Start from an empty directory.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    archive = 'wikimovies.tar.gz'
    source = 'https://s3.amazonaws.com/fair-data/parlai/wikimovies/' + archive
    build_data.download(source, dpath, archive)
    build_data.untar(dpath, archive)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #18
0
파일: build.py 프로젝트: zwcdp/ParlAI
def build(opt):
    """Download the FVQA release zip from Dropbox and unpack it.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = None
    dpath = os.path.join(opt['datapath'], 'FVQA')

    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older version exists, so remove these outdated files.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # The remote file is stored locally under the name 'FVQA.zip'.
    local_name = 'FVQA.zip'
    build_data.download(
        'https://dl.dropboxusercontent.com/s/iyz6l7jhbt6jb7q/new_dataset_release.zip',  # noqa: E501
        dpath,
        local_name,
    )
    build_data.untar(dpath, local_name)

    # Record the finished build.
    build_data.mark_done(dpath, version_string=version)
예제 #19
0
파일: build.py 프로젝트: sagar-spkt/ParlAI
def build(opt):
    """Download the MetaLWOz train and test resources (version 1.0).

    ``RESOURCES[0]`` is downloaded into ``train`` and ``RESOURCES[1]``
    into ``test``; the held-out zip inside ``test`` is then unpacked.

    Args:
        opt: options mapping; only ``opt["datapath"]`` is read.
    """
    version = "1.0"
    dpath = os.path.join(opt["datapath"], "metalwoz")

    if build_data.built(dpath, version_string=version):
        return

    if build_data.built(dpath):
        # Remove files left over from a build with another version.
        build_data.remove_dir(dpath)

    train_dir = os.path.join(dpath, "train")
    test_dir = os.path.join(dpath, "test")
    build_data.make_dir(dpath)
    build_data.make_dir(os.path.join(train_dir, "dialogues"))
    build_data.make_dir(os.path.join(test_dir, "dialogues"))

    # Download both resources, then unpack the held-out archive.
    RESOURCES[0].download_file(train_dir)
    RESOURCES[1].download_file(test_dir)
    build_data.untar(test_dir, "dstc8_metalwoz_heldout.zip")

    build_data.mark_done(dpath, version_string=version)
예제 #20
0
파일: build.py 프로젝트: sikopet/ParlAI
def build(opt):
    """Download and unpack the personalized-dialog dataset from Dropbox.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'personalized-dialog')

    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    # Start from an empty directory.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Full link:
    # https://www.dropbox.com/s/4i9u4y24pt3paba/personalized-dialog-dataset.tar.gz?dl=1
    archive = 'personalized-dialog-dataset.tar.gz'
    source = 'https://www.dropbox.com/s/4i9u4y24pt3paba/' + archive + '?dl=1'
    build_data.download(source, dpath, archive)
    build_data.untar(dpath, archive)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #21
0
파일: build.py 프로젝트: simplecoka/cortx
def build(opt):
    """Fetch the two MetaLWOz resources and unpack the held-out test zip.

    Skipped when ``build_data.built`` already reports version ``1.0``.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'metalwoz')
    version = '1.0'
    if build_data.built(dpath, version_string=version):
        return

    if build_data.built(dpath):
        # Clear out a build made under another version.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    split_names = ('train', 'test')
    for split in split_names:
        build_data.make_dir(os.path.join(dpath, split, 'dialogues'))

    # RESOURCES[0] goes to 'train', RESOURCES[1] to 'test'.
    for index, split in enumerate(split_names):
        RESOURCES[index].download_file(os.path.join(dpath, split))

    build_data.untar(os.path.join(dpath, 'test'), 'dstc8_metalwoz_heldout.zip')
    build_data.mark_done(dpath, version_string=version)
예제 #22
0
def download_process_wikiqa(data_path='data'):
    """Download WikiQA into ``<data_path>/WikiQA`` and convert its TSV files.

    There is no "already built" check here: every call downloads, unpacks
    and converts again.

    Args:
        data_path: parent directory for the ``WikiQA`` folder.
    """
    dpath = os.path.join(data_path, 'WikiQA')
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    archive = 'wikiqa.tar.gz'
    build_data.download(
        'http://parl.ai/downloads/wikiqa/' + archive, dpath, archive)
    build_data.untar(dpath, archive)

    # Each TSV is converted under a plain and a '-filtered' name.
    corpus_dir = os.path.join(dpath, 'WikiQACorpus')
    conversions = (
        ('train', 'WikiQA-train.tsv'),
        ('valid', 'WikiQA-dev.tsv'),
        ('test', 'WikiQA-test.tsv'),
        ('train-filtered', 'WikiQA-train.tsv'),
        ('valid-filtered', 'WikiQA-dev.tsv'),
        ('test-filtered', 'WikiQA-test.tsv'),
    )
    for dtype, tsv_name in conversions:
        create_fb_format(dpath, dtype, os.path.join(corpus_dir, tsv_name))
예제 #23
0
def build(opt):
    """Download and unpack the SimpleQuestions archive unless already built.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'SimpleQuestions')

    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    # Start from an empty directory.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (path, url).
    archive = 'simplequestions.tar.gz'
    source = (
        'https://s3.amazonaws.com/fair-data/parlai/simplequestions/' + archive
    )
    build_data.download(dpath, source)
    build_data.untar(dpath, archive)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #24
0
파일: build.py 프로젝트: xhuvom/ParlAI
def build(opt):
    """Fetch the SimpleQuestions archive into ``<datapath>/SimpleQuestions/``.

    Does nothing when ``build_data.built`` reports the directory as built.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = opt['datapath'] + "/SimpleQuestions/"
    if build_data.built(dpath):
        return

    print("[building data: " + dpath + "]")
    # Recreate the target directory from scratch.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (path, url).
    tarball = "simplequestions.tar.gz"
    build_data.download(
        dpath,
        "https://s3.amazonaws.com/fair-data/parlai/simplequestions/" + tarball,
    )
    build_data.untar(dpath, tarball)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #25
0
def build(opt):
    """Download either the full Wikipedia extract or the summaries archive.

    The part of ``opt['task']`` after the last ``:`` selects the variant:
    ``all`` picks the full extract (stored under ``full``), anything else
    picks the summaries (stored under ``summary``). The task defaults to
    ``wikipedia:all``.

    Args:
        opt: options mapping; reads ``datapath`` and optionally ``task``.
    """
    root = os.path.join(opt['datapath'], 'wikipedia')
    task = opt.get('task', 'wikipedia:all')

    if task.split(':')[-1] == 'all':
        subdir, fname = 'full', 'wiki_full_extracted.tgz'
    else:
        subdir, fname = 'summary', "summaries.tgz"
    dpath = os.path.join(root, subdir)

    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    build_data.make_dir(dpath)
    build_data.download(
        'http://parl.ai/downloads/wikipedia/' + fname, dpath, fname)
    build_data.untar(dpath, fname)
    build_data.mark_done(dpath)
예제 #26
0
파일: 19675_build.py 프로젝트: xlrshop/Parl
def build(opt):
    """Download the dialog-bAbI-plus zip from Google Drive and unpack it.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = None
    archive = "dialog-bAbI-plus.zip"
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI-plus')

    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files left over from a build with another version.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch, unpack, then record the finished build.
    build_data.download(
        "https://drive.google.com/uc?export=download&id=0B2MvoQfXtqZmMTJqclpBdGN2bmc",
        dpath,
        archive,
    )
    build_data.untar(dpath, archive)

    build_data.mark_done(dpath, version)
예제 #27
0
def build(opt):
    """Download MCTest from parl.ai and convert the mc160/mc500 splits.

    Train and dev splits are converted with ``None`` as the answers file;
    the test splits take theirs from ``MCTestAnswers``.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    version = None
    dpath = os.path.join(opt['datapath'], 'MCTest')

    if build_data.built(dpath, version_string=version):
        return

    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files left over from a build with another version.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack the archive.
    archive = 'mctest.tar.gz'
    build_data.download(
        'http://parl.ai/downloads/mctest/' + archive, dpath, archive)
    build_data.untar(dpath, archive)

    extracted = os.path.join(dpath, 'mctest')
    # (output name, story file, answers file or None)
    jobs = [
        ('train160', 'mc160.train', None),
        ('valid160', 'mc160.dev', None),
        ('test160', 'mc160.test', 'mc160.test.ans'),
        ('train500', 'mc500.train', None),
        ('valid500', 'mc500.dev', None),
        ('test500', 'mc500.test', 'mc500.test.ans'),
    ]
    for dtype, story_file, answer_file in jobs:
        story_path = os.path.join(extracted, 'MCTest', story_file)
        if answer_file is None:
            answer_path = None
        else:
            answer_path = os.path.join(extracted, 'MCTestAnswers', answer_file)
        create_fb_format(dpath, dtype, story_path, answer_path)

    # Record the finished build.
    build_data.mark_done(dpath, version_string=version)
예제 #28
0
def build(opt):
    """Download the English OpenSubtitles archive and convert it.

    The archive is saved as ``OpenSubtitles.tar.gz`` inside the dataset
    directory, unpacked, and ``OpenSubtitles/en`` is passed to
    ``create_fb_format``.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = os.path.join(opt['datapath'], 'OpenSubtitles')

    if build_data.built(dpath):
        return

    print('[building data: ' + dpath + ']')
    # Start from an empty directory.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (target file, url).
    remote_name = 'download.php?f=OpenSubtitles/en.tar.gz'
    local_name = 'OpenSubtitles.tar.gz'
    source = ('http://opus.lingfil.uu.se/' + remote_name)
    build_data.download(os.path.join(dpath, local_name), source)
    build_data.untar(dpath, local_name)

    create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #29
0
파일: build.py 프로젝트: vtantia/ParlAI
def build(opt):
    """Download the personality_captions data and, if needed, its images.

    The text archive is fetched when the dataset directory is not marked
    built at version ``1.0``. Independently of that, ``download_images``
    is called when the ``images`` directory is not marked built and
    ``opt`` has no truthy ``yfcc_path``.

    Args:
        opt: options mapping; reads ``datapath`` and optionally
            ``yfcc_path``.
    """
    version = '1.0'
    archive = 'personality_captions.tgz'
    dpath = os.path.join(opt['datapath'], 'personality_captions')
    image_path = os.path.join(dpath, 'images')

    text_is_built = build_data.built(dpath, version)
    if not text_is_built:
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from a build with another version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        build_data.download(
            'http://parl.ai/downloads/personality_captions/' + archive,
            dpath,
            archive,
        )
        build_data.untar(dpath, archive)
        build_data.mark_done(dpath, version)

    # Images are skipped when already built or when a yfcc_path is given.
    if not (build_data.built(image_path, version) or opt.get('yfcc_path')):
        download_images(opt)
예제 #30
0
def build(opt):
    """Fetch the English OpenSubtitles archive and convert it.

    The archive is unpacked under its URL-escaped file name, then
    ``create_fb_format`` is run on the extracted ``OpenSubtitles/en``
    folder.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
    """
    dpath = opt['datapath'] + "/OpenSubtitles/"
    if build_data.built(dpath):
        return

    print("[building data: " + dpath + "]")
    # Recreate the target directory from scratch.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (path, url).
    remote_name = "download.php?f=OpenSubtitles/en.tar.gz"
    source = ("http://opus.lingfil.uu.se/" + remote_name)
    build_data.download(dpath, source)
    # The local file name has the '/' percent-encoded as %2F.
    build_data.untar(dpath, 'download.php?f=OpenSubtitles%2Fen.tar.gz')

    create_fb_format(dpath + '/OpenSubtitles/en/', dpath)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #31
0
def download(opt, path, fname, version='1.0'):
    """Download a ConvAI2 model file into ``<datapath>/models/<path>/<stem>``.

    ``<stem>`` is ``fname`` up to its first ``.`` (the whole name when it
    has none). The file is unpacked afterwards when its name contains
    ``.tgz`` or ``.gz``.

    Args:
        opt: options mapping; only ``opt['datapath']`` is read.
        path: sub-folder below ``models``.
        fname: remote file name, also used as the local file name.
        version: version string checked and recorded for the build.
    """
    stem = fname.split('.', 1)[0]
    dpath = os.path.join(opt['datapath'], 'models', path, stem)

    if build_data.built(dpath, version):
        return

    print('[downloading: ' + dpath + '/' + fname + ']')
    if build_data.built(dpath):
        # Remove files left over from a build with another version.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch the file; unpack only when it looks like a gzip archive.
    source = 'https://s3.amazonaws.com/fair-data/parlai/_models/convai2/' + fname
    build_data.download(source, dpath, fname)
    if any(marker in fname for marker in ('.tgz', '.gz')):
        build_data.untar(dpath, fname)

    # Record the finished build.
    build_data.mark_done(dpath, version)
예제 #32
0
def build(opt):
    """Build WikiMovies first, then fetch DBLL into ``<datapath>/DBLL/``.

    ``wikimovies_build.build`` is called on every invocation, before the
    DBLL "already built" check.

    Args:
        opt: options mapping; ``opt['datapath']`` is read here and ``opt``
            is passed on to the WikiMovies build.
    """
    # Depends upon another dataset, wikimovies, build that first.
    wikimovies_build.build(opt)

    dpath = opt['datapath'] + "/DBLL/"
    if build_data.built(dpath):
        return

    print("[building data: " + dpath + "]")
    # Recreate the target directory from scratch.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and unpack; download is called here as (path, url).
    tarball = "dbll.tgz"
    build_data.download(
        dpath, "https://s3.amazonaws.com/fair-data/parlai/dbll/" + tarball)
    build_data.untar(dpath, tarball)

    # Record the finished build.
    build_data.mark_done(dpath)
예제 #33
0
def build(opt):
    """Build the DBLL data directory under ``opt['datapath']``.

    Runs the wikimovies builder unconditionally, then downloads and unpacks
    ``dbll.tgz`` unless ``build_data.built`` says the directory is done.

    Args:
        opt: Options mapping; ``opt['datapath']`` is read here and the whole
            mapping is forwarded to ``wikimovies_build.build``.
    """
    # The wikimovies dataset is a prerequisite and is built first.
    wikimovies_build.build(opt)

    target = os.path.join(opt['datapath'], 'DBLL')
    done_already = build_data.built(target)
    if not done_already:
        print('[building data: ' + target + ']')
        # Recreate the directory from scratch.
        build_data.remove_dir(target)
        build_data.make_dir(target)

        # Download, then unpack.
        tarball = 'dbll.tgz'
        build_data.download(
            target, 'https://s3.amazonaws.com/fair-data/parlai/dbll/' + tarball
        )
        build_data.untar(target, tarball)

        # Flag the directory as built.
        build_data.mark_done(target)
예제 #34
0
def build(opt):
    """Download and unpack the TriviaQA archive under ``opt['datapath']``.

    Returns immediately when ``build_data.built`` already reports the target
    directory as built. ``version`` is ``None`` for this dataset.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = None
    root = os.path.join(opt['datapath'], 'TriviaQA')

    if build_data.built(root, version_string=version):
        return

    print('[building data: ' + root + ']')
    if build_data.built(root):
        # An earlier build is present; clear out its outdated files.
        build_data.remove_dir(root)
    build_data.make_dir(root)

    # Fetch the archive and unpack it in place.
    archive = 'triviaqa-rc.tar.gz'
    base_url = 'http://nlp.cs.washington.edu/triviaqa/data/'
    build_data.download(base_url + archive, root, archive)
    build_data.untar(root, archive)

    # Flag the directory as built.
    build_data.mark_done(root, version_string=version)
예제 #35
0
파일: build.py 프로젝트: reloadbrain/ParlAI
def build(opt):
    """Download and unpack the ConvAI2 archive under ``opt['datapath']``.

    Returns immediately when ``build_data.built`` reports the target
    directory as built for version ``'v3.0'``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = 'v3.0'
    data_dir = os.path.join(opt['datapath'], 'ConvAI2')

    if build_data.built(data_dir, version):
        return

    print('[building data: ' + data_dir + ']')
    if build_data.built(data_dir):
        # A build with another version tag is present; remove its files.
        build_data.remove_dir(data_dir)
    build_data.make_dir(data_dir)

    # Fetch the archive and unpack it in place.
    archive = 'convai2.tar.gz'
    source = 'https://s3.amazonaws.com/fair-data/parlai/convai2/' + archive
    build_data.download(source, data_dir, archive)
    build_data.untar(data_dir, archive)

    # Flag the directory as built for this version.
    build_data.mark_done(data_dir, version)
예제 #36
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download and unpack the Persona-Chat archive under ``opt['datapath']``.

    The work is skipped when ``build_data.built`` reports the target
    directory as built for version ``'v1.0'``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = 'v1.0'
    target = os.path.join(opt['datapath'], 'Persona-Chat')
    tarball = 'personachat.tgz'
    remote = 'https://s3.amazonaws.com/fair-data/parlai/personachat/' + tarball

    up_to_date = build_data.built(target, version)
    if not up_to_date:
        print('[building data: ' + target + ']')
        if build_data.built(target):
            # Files from an older build exist; remove them.
            build_data.remove_dir(target)
        build_data.make_dir(target)

        # Download, then unpack.
        build_data.download(remote, target, tarball)
        build_data.untar(target, tarball)

        # Flag the directory as built for this version.
        build_data.mark_done(target, version)
예제 #37
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Fetch the TriviaQA reading-comprehension archive and unpack it.

    The data lives in ``<opt['datapath']>/TriviaQA``. No work is done when
    ``build_data.built`` already reports that directory as built; the version
    tag used for this dataset is ``None``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    out_dir = os.path.join(opt['datapath'], 'TriviaQA')
    tag = None
    tarball = 'triviaqa-rc.tar.gz'
    remote = 'http://nlp.cs.washington.edu/triviaqa/data/' + tarball

    already_built = build_data.built(out_dir, version_string=tag)
    if not already_built:
        print('[building data: ' + out_dir + ']')
        if build_data.built(out_dir):
            # Outdated files from an older build are removed first.
            build_data.remove_dir(out_dir)
        build_data.make_dir(out_dir)

        # Download, then unpack.
        build_data.download(remote, out_dir, tarball)
        build_data.untar(out_dir, tarball)

        # Flag the directory as built.
        build_data.mark_done(out_dir, version_string=tag)
예제 #38
0
def build(opt):
    """Download and unpack the TalkTheWalk archive under ``opt['datapath']``.

    Returns immediately when ``build_data.built`` already reports the target
    directory as built. ``version`` is ``None`` for this dataset.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = None
    walk_dir = os.path.join(opt['datapath'], 'TalkTheWalk')

    if build_data.built(walk_dir, version_string=version):
        return

    print('[building data: ' + walk_dir + ']')
    if build_data.built(walk_dir):
        # An earlier build is present; clear out its outdated files.
        build_data.remove_dir(walk_dir)
    build_data.make_dir(walk_dir)

    # Fetch the archive and unpack it in place.
    archive = 'talkthewalk.tgz'
    source = (
        'https://s3.amazonaws.com/fair-data/parlai/projects/talkthewalk/'
        + archive
    )
    build_data.download(source, walk_dir, archive)
    build_data.untar(walk_dir, archive)

    # Flag the directory as built.
    build_data.mark_done(walk_dir, version_string=version)
예제 #39
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download and unpack the dialog-bAbI archive under ``opt['datapath']``.

    The work is skipped when ``build_data.built`` already reports the target
    directory as built. The version tag used for this dataset is ``None``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    target = os.path.join(opt['datapath'], 'dialog-bAbI')
    tag = None
    tarball = 'dialog_babi.tar.gz'
    remote = 'https://s3.amazonaws.com/fair-data/parlai/dialog_babi/' + tarball

    if not build_data.built(target, version_string=tag):
        print('[building data: ' + target + ']')
        leftover = build_data.built(target)
        if leftover:
            # Files from an older build exist; remove them.
            build_data.remove_dir(target)
        build_data.make_dir(target)

        # Download, then unpack.
        build_data.download(remote, target, tarball)
        build_data.untar(target, tarball)

        # Flag the directory as built.
        build_data.mark_done(target, version_string=tag)
예제 #40
0
파일: build.py 프로젝트: zwcdp/ParlAI
def build(opt):
    """Download and unpack the bAbI archive under ``opt['datapath']``.

    Returns immediately when ``build_data.built`` reports the target
    directory as built for the version tag. Note that the tag here is the
    string ``'None'``, not the ``None`` object.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = 'None'
    babi_dir = os.path.join(opt['datapath'], 'bAbI')

    if build_data.built(babi_dir, version_string=version):
        return

    print('[building data: ' + babi_dir + ']')
    if build_data.built(babi_dir):
        # An earlier build is present; clear out its outdated files.
        build_data.remove_dir(babi_dir)
    build_data.make_dir(babi_dir)

    # Fetch the archive and unpack it in place.
    archive = 'babi.tar.gz'
    source = 'http://parl.ai/downloads/babi/' + archive
    build_data.download(source, babi_dir, archive)
    build_data.untar(babi_dir, archive)

    # Flag the directory as built for this version tag.
    build_data.mark_done(babi_dir, version_string=version)
예제 #41
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download and unpack the personalized-dialog dataset.

    The data lives in ``<opt['datapath']>/personalized-dialog``. The work is
    skipped when ``build_data.built`` already reports that directory as
    built. The version tag used for this dataset is ``None``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    target = os.path.join(opt['datapath'], 'personalized-dialog')
    tag = None

    if build_data.built(target, version_string=tag):
        return

    print('[building data: ' + target + ']')
    if build_data.built(target):
        # Files from an older build exist; remove them.
        build_data.remove_dir(target)
    build_data.make_dir(target)

    # The archive is served from Dropbox; the URL carries a '?dl=1' query
    # string after the file name.
    tarball = 'personalized-dialog-dataset.tar.gz'
    remote = 'https://www.dropbox.com/s/4i9u4y24pt3paba/' + tarball + '?dl=1'
    build_data.download(remote, target, tarball)
    build_data.untar(target, tarball)

    # Flag the directory as built.
    build_data.mark_done(target, version_string=tag)
예제 #42
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download OpenSubtitles (English) and convert it with create_fb_format.

    Returns immediately when ``build_data.built`` reports the target
    directory as built for version ``'1'``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = '1'
    subs_dir = os.path.join(opt['datapath'], 'OpenSubtitles')

    if build_data.built(subs_dir, version_string=version):
        return

    print('[building data: ' + subs_dir + ']')
    if build_data.built(subs_dir):
        # An earlier build is present; clear out its outdated files.
        build_data.remove_dir(subs_dir)
    build_data.make_dir(subs_dir)

    # The remote name is a PHP query, so the file is saved under a plain
    # archive name before unpacking.
    archive = 'OpenSubtitles.tar.gz'
    source = 'http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz'
    build_data.download(source, subs_dir, archive)
    build_data.untar(subs_dir, archive)

    # Convert the unpacked 'OpenSubtitles/en' tree, writing into subs_dir.
    unpacked = os.path.join(subs_dir, 'OpenSubtitles', 'en')
    create_fb_format(unpacked, subs_dir)

    # Flag the directory as built for this version.
    build_data.mark_done(subs_dir, version_string=version)
예제 #43
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download and unpack the CLEVR v1.0 archive under ``opt['datapath']``.

    The work is skipped when ``build_data.built`` reports the target
    directory as built for version ``'v1.0'``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    target = os.path.join(opt['datapath'], 'CLEVR')
    tag = 'v1.0'
    zip_name = 'CLEVR_v1.0.zip'
    remote = 'https://s3-us-west-1.amazonaws.com/clevr/' + zip_name

    needs_build = not build_data.built(target, version_string=tag)
    if needs_build:
        print('[building data: ' + target + ']')
        if build_data.built(target):
            # Files from an older build exist; remove them.
            build_data.remove_dir(target)
        build_data.make_dir(target)

        # Download, then unpack.
        build_data.download(remote, target, zip_name)
        build_data.untar(target, zip_name)

        # Flag the directory as built for this version.
        build_data.mark_done(target, version_string=tag)
예제 #44
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download the DialogueQE data and rename its JSON files.

    The archive name and the unpacked JSON file names embed the version
    string. After unpacking, the train and test files are renamed to
    ``train.json`` and ``test.json``. Returns immediately when
    ``build_data.built`` reports the directory as built for this version.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = '1501534800'
    qe_dir = os.path.join(opt['datapath'], 'DialogueQE')

    if build_data.built(qe_dir, version_string=version):
        return

    print('[building data: ' + qe_dir + ']')
    if build_data.built(qe_dir):
        # An earlier build is present; clear out its files.
        build_data.remove_dir(qe_dir)
    build_data.make_dir(qe_dir)

    # Fetch the versioned archive and unpack it in place.
    archive = 'data_' + version + '.tar.gz'
    source = (
        'https://raw.githubusercontent.com/deepmipt/turing-data/master/'
        + archive
    )
    build_data.download(source, qe_dir, archive)
    build_data.untar(qe_dir, archive)

    # Drop the version suffix from the file names: train first, then test.
    renames = (('data_train_', 'train.json'), ('data_test_', 'test.json'))
    for prefix, final_name in renames:
        os.rename(
            os.path.join(qe_dir, prefix + version + '.json'),
            os.path.join(qe_dir, final_name),
        )

    # Flag the directory as built for this version.
    build_data.mark_done(qe_dir, version_string=version)
예제 #45
0
파일: build.py 프로젝트: ahiroto/ParlAI
def buildImage(opt):
    """Download and unpack the three COCO image archives.

    The images go to ``<opt['datapath']>/COCO-IMG``. All three archives are
    downloaded before any of them is unpacked. Returns immediately when
    ``build_data.built`` reports the directory as built for version ``'1'``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = '1'
    image_dir = os.path.join(opt['datapath'], 'COCO-IMG')

    if build_data.built(image_dir, version_string=version):
        return

    print('[building image data: ' + image_dir + ']')
    if build_data.built(image_dir):
        # An earlier build is present; clear out its outdated files.
        build_data.remove_dir(image_dir)
    build_data.make_dir(image_dir)

    base_url = 'https://s3.amazonaws.com/fair-data/parlai/COCO-IMG/'
    archives = ('train2014.zip', 'val2014.zip', 'test2015.zip')

    # First pass: download every archive.
    for archive in archives:
        build_data.download(base_url + archive, image_dir, archive)

    # Second pass: unpack them in the same order.
    for archive in archives:
        build_data.untar(image_dir, archive)

    # Flag the directory as built for this version.
    build_data.mark_done(image_dir, version_string=version)
예제 #46
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Build the DBLL data directory under ``opt['datapath']``.

    The wikimovies builder is always invoked first. ``dbll.tgz`` is then
    downloaded and unpacked unless ``build_data.built`` already reports the
    directory as built. The version tag used for this dataset is ``None``.

    Args:
        opt: Options mapping; ``opt['datapath']`` is read here and the whole
            mapping is forwarded to ``wikimovies_build.build``.
    """
    # DBLL depends upon the wikimovies dataset, so build that one first.
    wikimovies_build.build(opt)

    version = None
    dbll_dir = os.path.join(opt['datapath'], 'DBLL')

    if build_data.built(dbll_dir, version_string=version):
        return

    print('[building data: ' + dbll_dir + ']')
    if build_data.built(dbll_dir):
        # An earlier build is present; clear out its outdated files.
        build_data.remove_dir(dbll_dir)
    build_data.make_dir(dbll_dir)

    # Fetch the archive and unpack it in place.
    archive = 'dbll.tgz'
    source = 'https://s3.amazonaws.com/fair-data/parlai/dbll/' + archive
    build_data.download(source, dbll_dir, archive)
    build_data.untar(dbll_dir, archive)

    # Flag the directory as built.
    build_data.mark_done(dbll_dir, version_string=version)
예제 #47
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download and unpack the COPA resources under ``opt['datapath']``.

    The work is skipped when ``build_data.built`` already reports the target
    directory as built. The version tag used for this dataset is ``None``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    target = os.path.join(opt['datapath'], 'COPA')
    tag = None
    tarball = 'COPA-resources.tgz'
    remote = 'http://people.ict.usc.edu/~gordon/downloads/' + tarball

    if not build_data.built(target, version_string=tag):
        print('[building data: ' + target + ']')

        stale = build_data.built(target)
        if stale:
            # Files from an older build exist; remove them.
            build_data.remove_dir(target)
        build_data.make_dir(target)

        # Download the archive, then uncompress it.
        build_data.download(remote, target, tarball)
        build_data.untar(target, tarball)

        # Flag the directory as built.
        build_data.mark_done(target, version_string=tag)
예제 #48
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download the negotiation data from GitHub and unpack it.

    The master-branch zip of the end-to-end-negotiator repository is saved as
    ``negotiation.zip`` in ``<opt['datapath']>/negotiation``. Returns
    immediately when ``build_data.built`` already reports that directory as
    built. The version tag used for this dataset is ``None``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = None
    nego_dir = os.path.join(opt['datapath'], 'negotiation')

    if build_data.built(nego_dir, version_string=version):
        return

    print('[building data: ' + nego_dir + ']')

    # Make sure we start from a clean directory.
    if build_data.built(nego_dir):
        # An earlier build is present; clear out its outdated files.
        build_data.remove_dir(nego_dir)
    build_data.make_dir(nego_dir)

    # Fetch the repository snapshot, announcing the URL first.
    archive = 'negotiation.zip'
    source = (
        'https://github.com/facebookresearch/end-to-end-negotiator/'
        'archive/master.zip'
    )
    print('[downloading data from: ' + source + ']')
    build_data.download(source, nego_dir, archive)
    build_data.untar(nego_dir, archive)

    # Flag the directory as built.
    build_data.mark_done(nego_dir, version_string=version)
예제 #49
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download the InsuranceQA repository snapshot and parse both versions.

    After unpacking, ``ParseInsuranceQAV1.build`` and then
    ``ParseInsuranceQAV2.build`` are run on the data directory. Returns
    immediately when ``build_data.built`` reports the directory as built for
    version ``'1'``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = '1'
    qa_dir = os.path.join(opt['datapath'], 'InsuranceQA')

    if build_data.built(qa_dir, version_string=version):
        return

    print('[building data: ' + qa_dir + ']')
    if build_data.built(qa_dir):
        # An earlier build is present; clear out its outdated files.
        build_data.remove_dir(qa_dir)
    build_data.make_dir(qa_dir)

    # Fetch the GitHub snapshot, announcing the URL first.
    archive = 'insuranceqa.zip'
    source = 'https://github.com/shuzi/insuranceQA/archive/master.zip'
    print('[downloading data from: ' + source + ']')
    build_data.download(source, qa_dir, archive)
    build_data.untar(qa_dir, archive)

    # Run the V1 parser, then the V2 parser, over the unpacked data.
    ParseInsuranceQAV1.build(qa_dir)
    ParseInsuranceQAV2.build(qa_dir)

    # Flag the directory as built for this version.
    build_data.mark_done(qa_dir, version_string=version)
예제 #50
0
def build(opt):
    """Download the Cornell movie-dialogs corpus and convert it.

    After unpacking, ``create_fb_format`` is called with the corpus'
    ``movie_lines.txt`` and ``movie_conversations.txt`` files and the data
    directory. Returns immediately when ``build_data.built`` already reports
    the directory as built. The version tag used here is ``None``.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = None
    movie_dir = os.path.join(opt['datapath'], 'CornellMovie')

    if build_data.built(movie_dir, version_string=version):
        return

    print('[building data: ' + movie_dir + ']')
    if build_data.built(movie_dir):
        # An earlier build is present; clear out its outdated files.
        build_data.remove_dir(movie_dir)
    build_data.make_dir(movie_dir)

    # Fetch the archive and unpack it in place.
    archive = 'cornell_movie_dialogs_corpus.zip'
    source = 'http://www.mpi-sws.org/~cristian/data/' + archive
    build_data.download(source, movie_dir, archive)
    build_data.untar(movie_dir, archive)

    # The files are read from a sub-directory whose name contains a space.
    corpus_dir = os.path.join(movie_dir, 'cornell movie-dialogs corpus')
    lines_file = os.path.join(corpus_dir, 'movie_lines.txt')
    conversations_file = os.path.join(corpus_dir, 'movie_conversations.txt')
    create_fb_format(lines_file, conversations_file, movie_dir)

    # Flag the directory as built.
    build_data.mark_done(movie_dir, version_string=version)
예제 #51
0
파일: build.py 프로젝트: ahiroto/ParlAI
def build(opt):
    """Download the SCAN dataset and generate its train/valid/test files.

    The data lives in ``<opt['datapath']>/SCAN``. Nothing is done when
    ``build_data.built`` already reports that directory as built for version
    ``'v1.0'``. Otherwise the archive is downloaded and unpacked, and
    ``create_fb_format`` is run once per split.

    Args:
        opt: Options mapping; only ``opt['datapath']`` is read here.
    """
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'SCAN')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'scan.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/scan/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # NOTE(review): 'valid' is generated from the same source file as
        # 'train' (tasks_train_simple.txt); presumably create_fb_format
        # splits it by dtype -- confirm against that helper.
        train_src = os.path.join(dpath, 'tasks_train_simple.txt')
        test_src = os.path.join(dpath, 'tasks_test_simple.txt')
        create_fb_format(dpath, 'train', train_src)
        create_fb_format(dpath, 'valid', train_src)
        create_fb_format(dpath, 'test', test_src)

        # Mark the data as built.
        build_data.mark_done(dpath, version)