def test_sample_dag(sqlite_client_and_tmp_dir, class_, identifier):
    """A one-task DAG producing a GenericProduct builds end to end.

    After ``dag.build()`` the file must exist on disk, and the product
    must report existence and persisted metadata via the sqlite client.
    """
    db_client, _ = sqlite_client_and_tmp_dir

    dag = DAG()
    product = GenericProduct('some_file.txt', client=db_client)
    PythonCallable(touch, product, dag)
    dag.build()

    # the task materialized the file...
    assert Path('some_file.txt').exists()
    # ...and the product tracks both existence and metadata
    assert product.exists()
    assert product.fetch_metadata() is not None
def test_delete_sqlite_backend(sqlite_client_and_tmp_dir, class_, identifier):
    """Deleting a GenericProduct removes it from the sqlite metadata backend."""
    db_client, tmp_dir = sqlite_client_and_tmp_dir
    product = GenericProduct('some_identifier.txt', client=db_client)

    # persist some metadata first so delete() has something to remove
    product.save_metadata({'metadata': 'value'})
    product.delete()

    assert not product.exists()
def test_upload_to_s3(s3, tmp_directory):
    """UploadToS3 pushes a local file into the bucket under the product key."""
    dag = DAG()
    # in-memory sqlite backend to store the GenericProduct's metadata
    dag.clients[GenericProduct] = SQLAlchemyClient('sqlite://')

    Path('somefile.txt').touch()
    UploadToS3('somefile.txt',
               GenericProduct('somefile-in-s3.txt'),
               dag,
               bucket='some-bucket',
               name='s3_upload')
    dag.build()

    uploaded = s3.list_objects(Bucket='some-bucket')['Contents']
    # exactly one object, keyed by the product identifier
    assert len(uploaded) == 1
    assert uploaded[0]['Key'] == 'somefile-in-s3.txt'
def make(date_):
    """Build the covid cases-vs-population DAG for one report date.

    Parameters
    ----------
    date_ : datetime.date or datetime.datetime
        Report date; formatted as YYYY.MM.DD and used both as the local
        data directory name and as the S3 key prefix.

    Returns
    -------
    DAG
        The assembled (not yet built) DAG.

    Notes
    -----
    Reads the module-level ``args.upload`` flag to decide whether S3
    upload tasks are appended.
    """
    date_str = date_.strftime('%Y.%m.%d')
    # all products for this date live under data/<date_str>/
    ROOT = Path('data', date_str)
    ROOT.mkdir(exist_ok=True, parents=True)

    dag = DAG()
    # task metadata is persisted in a local sqlite db
    client = SQLAlchemyClient('sqlite:///metadata.db')
    dag.clients[GenericProduct] = client
    # shell-script sources are loaded from the current directory
    loader = SourceLoader(path='.')

    # --- population data: download INEGI census zip, uncompress, aggregate ---
    source = 'https://www.inegi.org.mx/contenidos/programas/ccpv/2010/datosabiertos/iter_nal_2010_csv.zip'
    population_zip = DownloadFromURL(source,
                                     File(ROOT / 'population.zip'),
                                     dag,
                                     name='population.zip')
    population = PythonCallable(_uncompress,
                                File(ROOT / 'population'),
                                dag,
                                name='population')
    pop_by_state = PythonCallable(_pop_by_state,
                                  File(ROOT / 'pop_by_state.csv'),
                                  dag,
                                  name='pop_by_state')
    population_zip >> population >> pop_by_state

    # --- case reports: fetch PDFs and extract raw CSVs via shell scripts ---
    confirmed = ShellScript(loader['get_confirmed.sh'],
                            {'pdf': File(ROOT / 'confirmed.pdf'),
                             'csv': File(ROOT / 'confirmed.csv')},
                            dag,
                            params={'date_str': date_str})
    suspected = ShellScript(loader['get_suspected.sh'],
                            {'pdf': File(ROOT / 'suspected.pdf'),
                             'csv': File(ROOT / 'suspected.csv')},
                            dag,
                            params={'date_str': date_str})

    # groups: id, entity, sex, age, state, status — matched against the
    # text extracted from the ministry PDFs
    confirmed_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s{1}(Confirmado)')
    # NOTE(review): trailing separator is \s? here but \s{1} in
    # confirmed_regex — looks intentional (per-PDF quirk) but confirm
    suspected_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s?(Sospechoso)')

    clean_confirmed = PythonCallable(_clean,
                                     File(ROOT / 'confirmed_clean.csv'),
                                     dag,
                                     name='clean_confirmed',
                                     params={'regex': confirmed_regex})
    clean_suspected = PythonCallable(_clean,
                                     File(ROOT / 'suspected_clean.csv'),
                                     dag,
                                     name='clean_suspected',
                                     params={'regex': suspected_regex})

    # join cleaned cases with population counts
    agg = PythonCallable(_agg,
                         File(ROOT / 'cases_and_population.csv'),
                         dag,
                         name='cases_pop')

    confirmed >> clean_confirmed >> agg
    suspected >> clean_suspected >> agg
    pop_by_state >> agg

    # optionally mirror the cleaned/aggregated CSVs to S3; source strings
    # are placeholders resolved from the upstream products at render time
    if args.upload:
        upload_confirmed = UploadToS3(
            '{{upstream["clean_confirmed"]}}',
            GenericProduct('mx-health-ministry/{}/confirmed.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_confirmed')
        upload_suspected = UploadToS3(
            '{{upstream["clean_suspected"]}}',
            GenericProduct('mx-health-ministry/{}/suspected.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_suspected')
        clean_confirmed >> upload_confirmed
        clean_suspected >> upload_suspected

        upload_agg = UploadToS3(
            '{{upstream["cases_pop"]}}',
            GenericProduct('mx-health-ministry/{}/cases_pop.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_cases_pop')
        agg >> upload_agg

    return dag
kernelspec_name=None, static_analysis=False, kwargs={}) ], [ SQLScript, dict(source='CREATE TABLE {{product}} FROM some_table', kwargs={}) ], [SQLDump, dict(source='SELECT * FROM some_tablle', kwargs={})], ]) def test_init_source(class_, kwargs): assert class_._init_source(**kwargs) @pytest.mark.parametrize('Task, prod, source', [ (ShellScript, GenericProduct('file.txt'), 'touch {{product}}'), (SQLScript, GenericSQLRelation( ('name', 'table')), 'CREATE TABLE {{product}}'), (SQLDump, GenericProduct('file.txt'), 'SELECT * FROM {{upstream["key"]}}'), (SQLTransfer, GenericSQLRelation( ('name', 'table')), 'SELECT * FROM {{upstream["key"]}}'), (SQLUpload, GenericSQLRelation(('name', 'table')), 'some_file.txt'), (PostgresCopyFrom, PostgresRelation(('name', 'table')), 'file.parquet') ]) def test_task_init_source_with_placeholder_obj(Task, prod, source): """ Testing we can initialize a task with a Placeholder as the source argument """ dag = DAG() dag.clients[Task] = Mock() dag.clients[type(prod)] = Mock()
name='deaths_clean') deaths >> clean_deaths def _mortality_rate(upstream, product): pop = pd.read_csv(str(upstream['pop_clean'])) deaths = pd.read_csv(str(upstream['deaths_clean'])) df = deaths.merge(pop, on='country', how='left') df['deaths_over_100k'] = df.deaths / (df.population / 100_000) df.to_csv(str(product), index=False) rate = PythonCallable(_mortality_rate, File(ROOT / 'mortality_rate.csv'), dag, name='mortality_rate') (clean_deaths + clean) >> rate if args.upload: upload = UploadToS3('{{upstream["mortality_rate"]}}', GenericProduct('mortality_rate_s3'), dag, bucket='mx-covid-data', name='upload_s3') rate >> upload table = dag.build() print(table)
def make(date_):
    """Build the covid confirmed/suspected-cases DAG for one report date.

    Parameters
    ----------
    date_ : datetime.date or datetime.datetime
        Report date; formatted as YYYY.MM.DD and used both as the local
        data directory name and as the S3 key prefix.

    Returns
    -------
    DAG
        The assembled (not yet built) DAG.

    Notes
    -----
    Reads the module-level ``args.upload`` flag to decide whether S3
    upload tasks are appended.
    """
    date_str = date_.strftime('%Y.%m.%d')
    # all products for this date live under data/<date_str>/
    ROOT = Path('data', date_str)
    ROOT.mkdir(exist_ok=True, parents=True)

    dag = DAG()
    # task metadata is persisted in a local sqlite db
    client = SQLAlchemyClient('sqlite:///metadata.db')
    dag.clients[GenericProduct] = client
    # shell-script sources are loaded from the current directory
    loader = SourceLoader(path='.')

    # fetch the ministry PDFs and extract raw CSVs via shell scripts
    confirmed = ShellScript(loader['get_confirmed.sh'], {
        'pdf': File(ROOT / 'confirmed.pdf'),
        'csv': File(ROOT / 'confirmed.csv')
    }, dag, params={'date_str': date_str})
    suspected = ShellScript(loader['get_suspected.sh'], {
        'pdf': File(ROOT / 'suspected.pdf'),
        'csv': File(ROOT / 'suspected.csv')
    }, dag, params={'date_str': date_str})

    # groups: id, entity, sex, age, state, status.
    # NOTE(review): this regex expects comma separators for the last
    # fields while suspected_regex expects whitespace — looks like a
    # per-PDF layout difference; confirm before unifying
    confirmed_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+),(.+),(Confirmado)'
    )
    suspected_regex = re.compile(
        r'^(\d+)\s{1}([\w\s]+)\s{1}(FEMENINO|MASCULINO)\s{1}(\d+)\s{1}(.+)\s{1}(Sospechoso)'
    )

    clean_confirmed = PythonCallable(_clean,
                                     File(ROOT / 'confirmed_clean.csv'),
                                     dag,
                                     name='clean_confirmed',
                                     params={'regex': confirmed_regex})
    clean_suspected = PythonCallable(_clean,
                                     File(ROOT / 'suspected_clean.csv'),
                                     dag,
                                     name='clean_suspected',
                                     params={'regex': suspected_regex})

    confirmed >> clean_confirmed
    suspected >> clean_suspected

    # optionally mirror the cleaned CSVs to S3; source strings are
    # placeholders resolved from the upstream products at render time
    if args.upload:
        upload_confirmed = UploadToS3(
            '{{upstream["clean_confirmed"]}}',
            GenericProduct(
                'mx-health-ministry/{}/confirmed.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_confirmed')
        upload_suspected = UploadToS3(
            '{{upstream["clean_suspected"]}}',
            GenericProduct(
                'mx-health-ministry/{}/suspected.csv'.format(date_str)),
            dag,
            bucket='mx-covid-data',
            name='upload_mx_suspected')
        clean_confirmed >> upload_confirmed
        clean_suspected >> upload_suspected

    return dag