def test_default_extractor_init():
    api: models.API = APIFactory(name='my nice api')
    creds_details = {
        'cookie': {
            'name': 'my-cookie',
            'value': 'my-value'
        },
        'auth': {
            'username': '******',
            'password': '******'
        },
        'auth_token': {
            'header': 'Token',
            'token': 'my-token'
        }
    }
    creds: models.Creds = CredsFactory(api_config=api,
                                       creds_details=creds_details)
    extractor = DefaultExtractor(api)
    assert extractor.auth == creds.creds_details['auth']
    assert extractor.headers['Cookie'] == 'my-cookie=my-value'
    assert extractor.auth_token == creds.creds_details['auth_token']
    assert extractor.headers == {
        'Cookie': 'my-cookie=my-value',
        'Token': 'my-token'
    }
    # the extractor can also be constructed from an API id or name
    extractor_with_id = DefaultExtractor(api.id)
    extractor_with_name = DefaultExtractor(api.name)
    assert extractor_with_id.api == extractor.api == extractor_with_name.api

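# A minimal sketch of the header assembly the assertions above imply. The
# creds_details layout comes from the test itself; the logic inside
# DefaultExtractor is an assumption, not taken from its source.
def _sketch_build_headers(creds_details: dict) -> dict:
    cookie = creds_details['cookie']
    token = creds_details['auth_token']
    return {
        'Cookie': f"{cookie['name']}={cookie['value']}",
        token['header']: token['token'],
    }
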
def test_fetch_generic_file(mock_generic_writer, mock_requires_update,
                            mock_smart_open, tmp_path):
    mock_requires_update.return_value = True
    mock_open_file = mock.Mock()
    mock_open_file.content_length = 1
    mock_smart_open.return_value.__enter__.return_value = mock_open_file
    mock_generic_writer.return_value = None
    api: models.API = APIFactory(name='my nice api')
    extractor = DefaultExtractor(api)
    cached_file: models.CachedFile = CachedFileFactory(
        url='url/to/file', url_params={'query': 'item'})
    with NamedTemporaryFile(dir=tmp_path) as tf:
        result = extractor._fetch_generic_file(
            cached_file.full_url,
            Path(tf.name),
            timedelta(days=7),
            url_params=cached_file.url_params,
            headers={'Bearer': 'Token'},
            auth={
                'username': '******',
                'password': '******'
            })
        assert result
        mock_requires_update.return_value = False
        assert not extractor._fetch_generic_file(
            cached_file.full_url, Path(tf.name), timedelta(days=7))

def test_pipeline_init_bad_fanout():
    api = APIFactory(name='my nice api')
    ex1 = DefaultExtractor(api)
    ex2 = DefaultExtractor(api)
    tf1 = DelimitedTableTransformer()
    tf2 = DelimitedTableTransformer()
    l1 = DelimitedFileLoader('out.csv')
    l2 = DelimitedFileLoader('out.csv')
    fanout = {ex1: [tf1, l1], tf1: [l1, l2], tf2: [l2]}
    with pytest.raises(InvalidPipelineError):
        ETLPipeline(fanout=fanout)
    fanout = {ex1: [tf1, tf2], tf1: [l1, l2], tf2: [tf2]}
    with pytest.raises(InvalidPipelineError):
        ETLPipeline(fanout=fanout)

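# The first fanout above wires an extractor straight to a loader and leaves
# tf2 with no upstream producer; the second contains a self-cycle (tf2 ->
# tf2). A sketch of just the cycle check, assuming that is one of the rules
# ETLPipeline enforces (the full validation is not shown here):
def _sketch_has_self_cycle(fanout: dict) -> bool:
    return any(stage in targets for stage, targets in fanout.items())
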
def test_fetch_ftp(mock_requires_update, mock_ftp_class, tmp_path):
    mock_requires_update.return_value = True
    mock_ftp = mock.Mock()
    mock_ftp.login.return_value = None
    mock_ftp.size.return_value = 11
    mock_ftp.retrbinary.return_value = None
    mock_ftp_class.return_value = mock_ftp
    api: models.API = APIFactory(name='my nice api')
    extractor = DefaultExtractor(api)
    cached_file: models.CachedFile = CachedFileFactory(url='url/to/file')
    with NamedTemporaryFile(dir=tmp_path) as tf:
        result = extractor._fetch_ftp_file(cached_file.full_url,
                                           Path(tf.name), timedelta(days=7))
        assert result
        mock_requires_update.return_value = False
        assert not extractor._fetch_ftp_file(cached_file.full_url,
                                             Path(tf.name),
                                             timedelta(days=7))

def test_ftp_writer():
    target = io.BytesIO()
    # tqdm's first positional argument is an iterable, so pass the size as
    # total explicitly
    bar = tqdm(total=1)
    DefaultExtractor._ftp_writer(target, b'hello world', bar=bar)
    target.seek(0)
    bar.close()
    assert target.read() == b'hello world'

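# _ftp_writer acts like an FTP.retrbinary callback bound to a target and a
# progress bar; a minimal sketch of that behavior (assumed, not the source):
def _sketch_ftp_writer(target, chunk: bytes, bar: tqdm) -> None:
    target.write(chunk)
    bar.update(len(chunk))
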
def test_fire_extractors(mock_extract):
    api = APIFactory(name='my nice api')
    ex1 = DefaultExtractor(api)
    ex2 = DefaultExtractor(api)
    tf1 = DelimitedTableTransformer()
    tf2 = DelimitedTableTransformer()
    pipeline = ETLPipeline(extractors=[ex1, ex2], transformers=[tf1, tf2])
    mock_extract.side_effect = [[Path('file1'), Path('file2')],
                                [Path('file3')]]
    result = pipeline._fire_extractors()
    assert result == {
        ex1: [Path('file1'), Path('file2')],
        ex2: [Path('file3')]
    }

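# The assertions imply _fire_extractors calls extract() on each extractor
# and maps the extractor to the paths it produced; a sketch of that contract
# (assumed, not the pipeline's actual code):
def _sketch_fire_extractors(extractors) -> dict:
    return {ex: ex.extract() for ex in extractors}
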
def test_generic_writer():
    size = 32768
    # tqdm's first positional argument is an iterable, so pass the size as
    # total explicitly
    bar = tqdm(total=size)
    letters = [chr(i) for i in range(65, 65 + 26)]
    data = bytes(''.join(random.choices(letters, k=size)), 'utf-8')
    source = io.BytesIO(data)
    source.seek(0)
    target = io.BytesIO()
    api: models.API = APIFactory(name='my nice api')
    extractor = DefaultExtractor(api)
    extractor._generic_writer(source, target, bar=bar)
    source.seek(0)
    target.seek(0)
    assert source.read() == target.read()

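# _generic_writer evidently streams source into target; a sketch of a
# chunked copy that ticks the bar (the chunk size is an illustrative
# assumption):
def _sketch_generic_writer(source, target, bar: tqdm,
                           chunk_size: int = 8192) -> None:
    while chunk := source.read(chunk_size):
        target.write(chunk)
        bar.update(len(chunk))
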
def test_handle_s3_urls():
    api = APIFactory(name='my nice api')
    extractor = DefaultExtractor(api)
    bad_s3_url = 's3://some-bucket/badly#formed-file'
    url = furl(bad_s3_url)
    result = extractor._handle_s3_urls(url)
    assert result == 's3://some-bucket/badly%23formed-file'
    bad_s3_url = 's3://some-bucket/badly&formed-file'
    url = furl(bad_s3_url)
    result = extractor._handle_s3_urls(url)
    assert result == 's3://some-bucket/badly%26formed-file'

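# The assertions pin down the behavior: characters such as '#' and '&' in
# the S3 key must be percent-encoded. A sketch of that escaping over the raw
# URL string (the real method takes a furl object and presumably reassembles
# any fragment that '#' split off; urllib.parse.quote is an assumed
# mechanism, not confirmed by the source):
from urllib.parse import quote

def _sketch_escape_s3_key(raw_url: str) -> str:
    bucket, _, key = raw_url[len('s3://'):].partition('/')
    return f"s3://{bucket}/{quote(key)}"
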
def test_update_cache_file(session, tmp_path):
    tf = NamedTemporaryFile(dir=tmp_path, delete=False)
    tf.write(b'hello world')
    tf.close()
    api: models.API = APIFactory(name='my nice api')
    extractor = DefaultExtractor(api)
    cached_file: models.CachedFile = CachedFileFactory(path=tf.name)
    extractor._update_file_cache(cached_file, Path(tf.name))
    cached_file = session.query(models.CachedFile).get(cached_file.id)
    assert cached_file.path == tf.name
    assert cached_file.hash == file_hash(Path(tf.name)).hexdigest()
    assert cached_file.last_download is not None
    assert cached_file.size == 11

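# The fields the assertions above expect _update_file_cache to persist; the
# attribute names come from the asserts, while the session handling and
# timestamp call are assumptions:
def _sketch_update_file_cache(cached_file, path: Path) -> None:
    cached_file.path = str(path)
    cached_file.hash = file_hash(path).hexdigest()
    cached_file.size = path.stat().st_size
    cached_file.last_download = datetime.datetime.now()
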
def test_pipeline_init_with_fanout(tmp_path):
    api = APIFactory(name='my nice api')
    ex1 = DefaultExtractor(api)
    ex2 = DefaultExtractor(api)
    tf1 = DelimitedTableTransformer()
    tf2 = DelimitedTableTransformer()
    l1 = DelimitedFileLoader('out.csv')
    l2 = DelimitedFileLoader('out.csv')
    fanout = {ex1: [tf1, tf2], tf1: [l1], tf2: [l2]}
    pipeline = ETLPipeline(extractors=[ex1, ex2],
                           transformers=[tf1, tf2],
                           loaders=[l1, l2],
                           fanout=fanout)
    assert pipeline.fanout == fanout

def test_pipeline_init_without_fanout():
    api = APIFactory(name='my nice api')
    ex1 = DefaultExtractor(api)
    ex2 = DefaultExtractor(api)
    tf1 = DelimitedTableTransformer()
    tf2 = DelimitedTableTransformer()
    l1 = DelimitedFileLoader('out.csv')
    l2 = DelimitedFileLoader('out.csv')
    pipeline = ETLPipeline(extractors=[ex1, ex2],
                           transformers=[tf1, tf2],
                           loaders=[l1, l2])
    # the default wiring: every extractor feeds every transformer, and every
    # transformer feeds every loader
    assert pipeline.fanout[ex1] == [tf1, tf2]
    assert pipeline.fanout[ex2] == [tf1, tf2]
    assert pipeline.fanout[tf1] == [l1, l2]
    assert pipeline.fanout[tf2] == [l1, l2]

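# A sketch of how that default fanout could be constructed (assumed, not the
# pipeline's actual code):
def _sketch_default_fanout(extractors, transformers, loaders) -> dict:
    fanout = {ex: list(transformers) for ex in extractors}
    fanout.update({tf: list(loaders) for tf in transformers})
    return fanout
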
def test_requires_update(tmp_path):
    api: models.API = APIFactory(name='my nice api')
    extractor = DefaultExtractor(api)
    # a file that does not exist always requires an update
    assert extractor._requires_update(Path('some/file'), 1, timedelta(days=1))
    tf = NamedTemporaryFile(dir=tmp_path, delete=False)
    tf.write(b'hello world')
    tf.close()
    # the file exists and matches the expected size
    assert not extractor._requires_update(Path(tf.name), 11)
    # unknown size, but the file is younger than the update interval
    assert not extractor._requires_update(Path(tf.name), -1,
                                          timedelta(days=1))
    # a zero interval means the file is always considered stale
    assert extractor._requires_update(Path(tf.name), -1, timedelta(0))

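# The contract the four assertions pin down, as a sketch: a missing file
# always needs a download; a file whose size matches does not; when the size
# is unknown (-1), freshness falls back to the update interval. The mtime
# comparison is an assumed mechanism:
def _sketch_requires_update(path: Path, expected_size: int,
                            interval: timedelta = timedelta(0)) -> bool:
    if not path.exists():
        return True
    stat = path.stat()
    if stat.st_size == expected_size:
        return False
    age = datetime.datetime.now() - datetime.datetime.fromtimestamp(
        stat.st_mtime)
    return age >= interval
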
def test_fire_transformers(mock_transform: mock.Mock, mock_load: mock.Mock):
    api = APIFactory(name='my nice api')
    ex1 = DefaultExtractor(api)
    tf1 = DelimitedTableTransformer()
    l1 = DelimitedFileLoader('out.csv')
    pipeline = ETLPipeline(extractors=[ex1], transformers=[tf1], loaders=[l1])
    extraction_results = {ex1: [Path('file1')]}
    mock_df = mock.Mock()
    mock_df.empty = False
    mock_transform.return_value = [mock_df]
    mock_load.return_value = None
    pipeline._fire_transformers(extraction_results)
    mock_transform.assert_called_once()
    mock_load.assert_called_once()

def test_get_file(mock_fetch_ftp, mock_fetch_generic, mock_update, mock_hash,
                  mock_stat):
    mock_update.return_value = None
    mock_hash.side_effect = [sha1(b'hash1'), sha1(b'hash2'), sha1(b'hash3')]
    mock_stat_result = mock.Mock()
    mock_stat_result.st_size = 1024
    mock_stat.side_effect = [
        mock_stat_result, mock_stat_result, mock_stat_result
    ]
    api = APIFactory(name='my nice api')
    source1: models.Source = SourceFactory(base_url='ftp://base/url',
                                           data_dir='data_dir')
    source2: models.Source = SourceFactory(base_url='http://base/url',
                                           data_dir='data_dir')
    cached_file: models.CachedFile = CachedFileFactory(
        url='file',
        source=source1,
        path='cached_file',
        url_params={'query': 'item'})
    extractor = DefaultExtractor(api)
    interval = datetime.timedelta(days=7)
    mock_fetch_ftp.return_value = True
    result = extractor.get_file(cached_file.id, cached_file.full_url,
                                cached_file.full_path, interval)
    # can't patch datetime
    assert isinstance(result['last_download'], datetime.datetime)
    del result['last_download']
    assert result == {
        'hash': sha1(b'hash1').hexdigest(),
        'id': 1,
        'size': 1024
    }
    cached_file: models.CachedFile = CachedFileFactory(
        url='file',
        source=source2,
        path='cached_file',
        url_params={'query': 'item'})
    mock_fetch_generic.return_value = True
    result = extractor.get_file(cached_file.id, cached_file.full_url,
                                cached_file.full_path, interval)
    # can't patch datetime
    assert isinstance(result['last_download'], datetime.datetime)
    del result['last_download']
    assert result == {
        'hash': sha1(b'hash2').hexdigest(),
        'id': 2,
        'size': 1024
    }
    # a fetch that reports no update yields None
    mock_fetch_generic.return_value = False
    assert extractor.get_file(cached_file.id, cached_file.full_url,
                              cached_file.full_path, interval) is None
    # an exception during the fetch also yields None (side_effect takes
    # precedence over return_value)
    mock_fetch_generic.side_effect = [ValueError('something bad happened')]
    assert extractor.get_file(cached_file.id, cached_file.full_url,
                              cached_file.full_path, interval) is None

def test_extract(mock_get_file, mock_pool):
    api: models.API = APIFactory()
    source: models.Source = SourceFactory(api_config=api, data_dir='data/dir')
    extractor = DefaultExtractor(api)
    cached_file1: models.CachedFile = CachedFileFactory(
        url='url/to/file1',
        path='path/to/file1',
        expected_mode=models.ExpectedMode.self,
        hash='hash1',
        source=source)
    cached_file2: models.CachedFile = CachedFileFactory(
        url='url/to/file2',
        path='path/to/file2',
        expected_mode=models.ExpectedMode.self,
        source=source)
    cached_file3: models.CachedFile = CachedFileFactory(
        url='url/to/file3',
        path='path/to/file3',
        expected_mode=models.ExpectedMode.self,
        source=source)
    mock_get_file.side_effect = [{
        'id': cached_file1.id,
        'hash': 'hash1',
        'last_download': datetime.datetime.now(),
        'size': 1
    }, {
        'id': cached_file2.id,
        'hash': 'hash2',
        'last_download': datetime.datetime.now(),
        'size': 1
    }, None]
    expected_files = extractor.extract()
    for i, ef in enumerate(expected_files):
        assert f'data/dir/path/to/file{i + 1}' in str(ef)

    # test with only missing files
    extractor = DefaultExtractor(api,
                                 skip_existing_files=True,
                                 on_disk_check='hash')
    mock_get_file.side_effect = [{
        'id': cached_file1.id,
        'hash': 'hash1',
        'last_download': datetime.datetime.now(),
        'size': 1
    }, {
        'id': cached_file2.id,
        'hash': 'hash2',
        'last_download': datetime.datetime.now(),
        'size': 1
    }, None]
    expected_files = extractor.extract()
    for i, ef in enumerate(expected_files):
        assert f'data/dir/path/to/file{i + 1}' in str(ef)

    # test with multiprocessing
    mock_future1 = mock.Mock()
    mock_future2 = mock.Mock()
    mock_future3 = mock.Mock()
    mock_future1.get.return_value = {
        'id': cached_file1.id,
        'hash': 'hash1',
        'last_download': datetime.datetime.now(),
        'size': 1
    }
    mock_future2.get.return_value = {
        'id': cached_file2.id,
        'hash': 'hash2',
        'last_download': datetime.datetime.now(),
        'size': 1
    }
    mock_future3.get.return_value = None
    mock_futures = mock.Mock()
    mock_futures.get.return_value = [mock_future1, mock_future2, mock_future3]
    mock_mp_pool = mock.Mock()
    mock_mp_pool.starmap_async.return_value = mock_futures
    mock_mp_pool.join.return_value = None
    mock_pool.__enter__.return_value = mock_mp_pool
    extractor = DefaultExtractor(api, concurrency='multiprocess')
    mock_get_file.side_effect = [{
        'id': cached_file1.id,
        'hash': 'hash1',
        'last_download': datetime.datetime.now(),
        'size': 1
    }, {
        'id': cached_file2.id,
        'hash': 'hash2',
        'last_download': datetime.datetime.now(),
        'size': 1
    }, None]
    expected_files = extractor.extract()
    for i, ef in enumerate(expected_files):
        assert f'data/dir/path/to/file{i + 1}' in str(ef)