Пример #1
0
    def test_intro_stat_learning(self, cleanup, signup, testcfg):
        """Round-trip the intro-stat-learning sample files through a pipe.

        The files are imported into a local-only pipe and checked first.
        Unless ``testcfg['local']`` is truthy they are then pushed through the
        client from ``getapi()``, pulled again through the client from
        ``getapi2()`` and loaded with the pandas and dask keyword arguments
        stored in the pipe schema.
        """
        pipe_name = cfg_settings_islr['name']
        expected_files = [
            'Advertising.csv',
            'Advertising2.csv',
            'Auto.csv',
            'Ch10Ex11.csv',
            'College.csv',
            'Credit.csv',
            'Heart.csv',
            'Income1.csv',
            'Income2.csv',
            'LICENSE.md',
            'README.md',
        ]

        def open_local_pipe():
            # The local pipe is opened twice: before and after the push.
            return d6tpipe.PipeLocal(
                pipe_name, profile=cfg_profile, filecfg=cfg_cfgfname)

        # begin with the local repo only
        local_pipe = open_local_pipe()
        local_pipe.delete_files_local(confirm=False, delete_all=False)
        local_pipe.import_dir('tests/intro-stat-learning/')
        assert local_pipe.scan_local() == expected_files
        assert local_pipe.files() == []
        assert local_pipe.files(fromdb=False) == expected_files

        advertising = pd.read_csv(local_pipe.dirpath / 'Advertising.csv')
        assert not advertising.empty

        if not testcfg.get('local', False):
            # publish the repo and grant read access
            owner_api = getapi()
            d6tpipe.upsert_pipe(owner_api, cfg_settings_islr)
            read_grant = {"username": '******', "role": "read"}
            d6tpipe.upsert_permissions(owner_api, pipe_name, read_grant)

            remote_pipe = d6tpipe.Pipe(owner_api, pipe_name, mode='all')
            remote_pipe.delete_files_remote(confirm=False)
            assert remote_pipe.scan_remote(cached=False) == []
            assert remote_pipe.push() == expected_files

            # a freshly opened local pipe must now expose a non-empty schema
            local_pipe = open_local_pipe()
            assert len(local_pipe.schema) > 0

            # pull everything back through the second client
            reader_api = getapi2()
            reader_pipe = d6tpipe.Pipe(reader_api, pipe_name)
            reader_pipe.delete_files_local(confirm=False, delete_all=False)
            assert reader_pipe.pull() == expected_files

            csv_path = reader_pipe.dirpath / 'Advertising.csv'
            pandas_kwargs = reader_pipe.schema['pandas']
            advertising = pd.read_csv(csv_path, **pandas_kwargs)
            assert not advertising.empty

            import dask.dataframe as dd
            csv_paths = reader_pipe.filepaths(include='Advertising*.csv')
            dask_frame = dd.read_csv(csv_paths, **reader_pipe.schema['dask'])
            assert not dask_frame.compute().empty
            reader_pipe.delete_files_local(confirm=False, delete_all=False)

        local_pipe.delete_files_local(confirm=False, delete_all=True)
Пример #2
0
    def test_ftp(self, cleanup, signup, testcfg):
        """Create the ``test-ftp`` pipe from a JSON file and push/pull one CSV.

        Checks the configured remote path, then verifies that a single file
        written locally is previewed, pushed, pulled again and finally removed
        so that the remote scan is empty at the end.
        """
        pipe_name = 'test-ftp'
        api = getapi(testcfg.get('local', False))

        # quick create from the credentials file
        d6tpipe.upsert_pipe_json(
            api, 'tests/.creds-test.json', 'pipe-test-ftp')

        # the stored settings must point at the expected remote path
        _, settings = api.cnxn.pipes._(pipe_name).get()
        assert settings['options']['remotepath'] == '/utest/'

        # push/pull round trip with a single small CSV
        pipe = getpipe(api, name=pipe_name, mode='all')
        csv_name = 'test.csv'
        only_csv = [csv_name]
        frame = pd.DataFrame({'a': range(10)})
        frame.to_csv(pipe.dirpath / csv_name, index=False)

        assert pipe.scan_remote(cached=False) == []
        assert pipe.pull() == []
        assert pipe.push_preview() == only_csv
        assert pipe.push() == only_csv

        # drop the cached scan before pulling again
        pipe._cache_scan.clear()
        assert pipe.pull() == only_csv

        pipe.delete_files(confirm=False, all_local=True)
        assert pipe.scan_remote(cached=False) == []
Пример #3
0
    def test_d6tfree(self, cleanup, signup, testcfg):
        """Check a quick-created pipe: credentials, push/pull and permissions.

        The test body is skipped entirely (the method just returns) when
        ``testcfg['local']`` is truthy.
        """
        if testcfg.get('local', False):
            return

        pipe_name = 'utest-d6tfree'
        api = getapi(testcfg.get('local', False))

        def fetch_credentials(role):
            # Index 1 is taken from the call result; elsewhere in this test
            # the same API style is unpacked as a (response, data) pair.
            return api.cnxn.pipes._(pipe_name).credentials.get(
                query_params={'role': role})[1]

        def grant_second_user(role):
            d6tpipe.upsert_permissions(
                api, pipe_name, {"username": cfg_usr2, "role": role})

        # quick create with nothing but a name
        d6tpipe.upsert_pipe(api, {'name': pipe_name})
        _, settings = api.cnxn.pipes._(pipe_name).get()
        remotepath = settings['options']['remotepath']
        assert cfg_usr in remotepath
        assert pipe_name in remotepath
        assert settings['protocol'] == 's3'

        read_creds = fetch_credentials('read')
        write_creds = fetch_credentials('write')
        assert "aws_session_token" in read_creds
        assert "aws_session_token" in write_creds
        assert (read_creds['aws_access_key_id']
                != write_creds['aws_access_key_id'])

        # after a forced reset the read key must differ from the earlier one
        pipe = getpipe(api, name=pipe_name, mode='all')
        pipe._reset_credentials()
        renewed_read_creds = fetch_credentials('read')
        assert (renewed_read_creds['aws_access_key_id']
                != read_creds['aws_access_key_id'])

        # push/pull a file that lives in a sub-folder
        first_csv = 'folder/test.csv'
        second_csv = 'folder/test2.csv'
        pipe.delete_files_remote(confirm=False)

        frame = pd.DataFrame({'a': range(10)})
        first_path = pipe.dirpath / first_csv
        first_path.parent.mkdir(exist_ok=True)
        frame.to_csv(first_path, index=False)
        assert pipe.push() == [first_csv]
        pipe._cache_scan.clear()
        assert pipe.pull() == [first_csv]

        # permissions - no access: the second client is rejected with a 403
        guest_api = getapi2(testcfg.get('local', False))
        with pytest.raises(APIError, match='403'):
            guest_pipe = getpipe(guest_api, name=pipe_name, mode='all')
            guest_pipe.pull()

        # permissions - read: pulling works, pushing is refused
        grant_second_user("read")

        guest_pipe = getpipe(guest_api, name=pipe_name, mode='all')
        assert guest_pipe.role == 'read'
        assert guest_pipe.pull() == [first_csv]

        frame.to_csv(guest_pipe.dirpath / second_csv, index=False)
        with pytest.raises(ValueError, match='Read-only'):
            guest_pipe.push()

        # permissions - write: the second file can now be pushed as well
        grant_second_user("write")

        guest_pipe = getpipe(
            guest_api, name=pipe_name, mode='all', chk_empty=False)
        assert guest_pipe.role == 'write'
        assert guest_pipe.pull() == [first_csv]
        assert guest_pipe.push() == [first_csv, second_csv]

        # TODO: check that parent paths in s3 are not accessible
        # TODO: file include patterns, e.g. child pipes of 'demo-vendor'
        #       ('demo-vendor-daily' with options {'include': '*daily*.csv'},
        #       'demo-vendor-monthly' with {'include': '*monthly*.csv'})

        # cleanup
        pipe.delete_files_remote(confirm=False)
        assert pipe.scan_remote(cached=False) == []
        pipe.delete_files_local(confirm=False, delete_all=True)
Пример #4
0
    def test_pipes_pull(self, cleanup, signup, parentinit, pipeinit, testcfg):
        """Pull the default test pipe and verify listings, paths and access.

        Covers remote scanning (names and CRCs), pull and pull preview, local
        file listings and file paths, the ``PipeLocal`` view of the same pipe
        and, unless ``testcfg['local']`` is truthy, access for the client
        returned by ``getapi2``.
        """
        api = getapi(testcfg.get('local', False))
        pipe = getpipe(api)
        assert pipe.name in api.list_pipes()

        expected_crcs = [
            '8a9782e9efa8befa9752045ca506a62e',
            '5fe579d6b71031dad399e8d4ea82820b',
            '4c7da169df85253d7ff737dde1e7400b',
            'ca62a122993494e763fd1676cce95e76',
        ]

        # nothing pulled yet: the local listing is empty, the remote is not
        assert pipe.files() == []
        assert pipe.scan_remote() == cfg_filenames_chk
        _, remote_entries = pipe.scan_remote(attributes=True)
        assert _filenames(remote_entries) == cfg_filenames_chk
        remote_crcs = [entry['crc'] for entry in remote_entries]
        assert remote_crcs == expected_crcs

        # pulling registers the pipe locally and empties the preview
        assert api.list_local_pipes() == []
        assert pipe.pull_preview() == cfg_filenames_chk
        assert pipe.pull() == cfg_filenames_chk
        assert pipe.pull_preview() == []
        assert api.list_local_pipes() == [pipe.name]

        # file listings and both flavours of file paths
        assert pipe.files() == cfg_filenames_chk
        path_objects = pipe.filepaths()
        assert path_objects == [
            Path(pipe.dirpath) / name for name in pipe.files()
        ]
        path_strings = pipe.filepaths(aspathlib=False)
        assert path_strings == [
            str(Path(pipe.dirpath) / name) for name in pipe.files()
        ]

        # a pipe obtained with mode='all' previews every file again
        pipe_all = getpipe(api, chk_empty=False, mode='all')
        assert pipe_all.pull_preview() == cfg_filenames_chk

        # PipeLocal sees the same files and schema
        local_view = d6tpipe.PipeLocal(
            pipe_all.name, profile=cfg_profile, filecfg=cfg_cfgfname)
        assert local_view.files() == cfg_filenames_chk
        assert local_view.scan_local() == cfg_filenames_chk
        assert local_view.schema == cfg_settings_pipe['schema']

        # result is unused: this only has to load without raising
        first_csv = pipe_all.dirpath / cfg_filenames_chk[0]
        pd.read_csv(first_csv, **pipe_all.schema['pandas'])

        # permissions
        if not testcfg.get('local', False):
            guest_api = getapi2(testcfg.get('local', False))
            with pytest.raises(APIError, match='403'):
                guest_pipe = getpipe(
                    guest_api, name=cfg_pipe_name, mode='all')
                guest_pipe.pull()

            # read access is granted on the parent pipe, then the child pipe
            # is pulled (presumably the permission is inherited - confirm)
            read_grant = {"username": cfg_usr2, "role": "read"}
            _response, _data = d6tpipe.upsert_permissions(
                api, cfg_parent_name, read_grant)

            guest_pipe = getpipe(guest_api, name=cfg_pipe_name, mode='all')
            assert guest_pipe.pull() == cfg_filenames_chk

        # cleanup
        pipe_all.delete_files_local(confirm=False, delete_all=True)