def connect(self, data_directory):
    path = os.environ.get(
        'IBIS_TEST_SQLITE_DATABASE', data_directory / 'ibis_testing.db'
    )
    path = Path(path)
    if not path.exists():
        pytest.skip('SQLite testing db {} does not exist'.format(path))
    return ibis.sqlite.connect(str(path))
def update(meta, source_path):
    path = Path(meta)
    click.echo('Updating {} recipe...'.format(path.parent))
    content = render(path)
    recipe = ruamel.yaml.round_trip_load(content)

    # update the necessary fields, skip leading 'v' in the version
    recipe['package']['version'] = ibis.__version__[1:]
    recipe['source'] = {'path': source_path}
    # XXX: because render will remove the {{ PYTHON }} variable
    recipe['build']['script'] = SCRIPT

    updated_content = ruamel.yaml.round_trip_dump(
        recipe, default_flow_style=False, width=sys.maxsize
    ).strip()
    if PY2:
        updated_content = updated_content.decode('utf-8')

    click.echo(updated_content)
    path.write_text(updated_content)
def data_directory():
    root = Path(__file__).absolute().parents[3]
    default = root / 'ci' / 'ibis-testing-data'
    datadir = os.environ.get('IBIS_TEST_DATA_DIRECTORY', default)
    datadir = Path(datadir)
    if not datadir.exists():
        pytest.skip('test data directory not found')
    return datadir
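# A hypothetical usage sketch, not part of the original suite: assuming the
# two functions above are exposed as pytest fixtures / backend hooks, a
# session-scoped connection fixture and a trivial smoke test could look like
# this (the names `con` and `test_functional_alltypes_exists` are made up for
# illustration).
@pytest.fixture(scope='session')
def con(data_directory):
    # connect the same way `connect` above does, skipping if the file is gone
    db_path = data_directory / 'ibis_testing.db'
    if not db_path.exists():
        pytest.skip('SQLite testing db {} does not exist'.format(db_path))
    return ibis.sqlite.connect(str(db_path))


def test_functional_alltypes_exists(con):
    assert 'functional_alltypes' in con.list_tables()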
def sqlite(database, schema, tables, data_directory, **params):
    database = Path(database)
    data_directory = Path(data_directory)
    logger.info('Initializing SQLite...')

    try:
        database.unlink()
    except OSError:
        pass

    params['database'] = str(database)
    engine = init_database('sqlite', params, schema, recreate=False)
    insert_tables(engine, tables, data_directory)
def parquet(tables, data_directory, ignore_missing_dependency, **params):
    try:
        import pyarrow as pa  # noqa: F401
        import pyarrow.parquet as pq  # noqa: F401
    except ImportError:
        msg = 'PyArrow dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    data_directory = Path(data_directory)
    for table, df in read_tables(tables, data_directory):
        if table == 'functional_alltypes':
            schema = pa.schema([
                pa.field('string_col', pa.string()),
                pa.field('date_string_col', pa.string()),
            ])
        else:
            schema = None
        arrow_table = pa.Table.from_pandas(df, schema=schema)
        target_path = data_directory / '{}.parquet'.format(table)
        pq.write_table(arrow_table, str(target_path))
def mysql(schema, tables, data_directory, **params):
    data_directory = Path(data_directory)
    click.echo('Initializing MySQL...')
    engine = init_database(
        'mysql+pymysql', params, schema, isolation_level='AUTOCOMMIT'
    )
    insert_tables(engine, tables, data_directory)
def mysql(schema, tables, data_directory, **params):
    data_directory = Path(data_directory)
    logger.info('Initializing MySQL...')
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        engine = init_database(
            'mysql+pymysql', params, schema, isolation_level='AUTOCOMMIT'
        )
    insert_tables(engine, tables, data_directory)
def clone(repo_uri, destination):
    if Path(destination).exists():
        return
    cmd = git['clone', repo_uri, destination]
    cmd(stdout=click.get_binary_stream('stdout'),
        stderr=click.get_binary_stream('stderr'))
def update(meta, source_path):
    path = Path(meta)
    click.echo('\nUpdating {} recipe...'.format(path.parent))
    content = render(path)
    recipe = ruamel.yaml.round_trip_load(content)

    # update the necessary fields, skip leading 'v' in the version
    recipe['package']['version'] = ibis.__version__[1:]
    recipe['source'] = {'path': source_path}

    updated_content = ruamel.yaml.round_trip_dump(
        recipe, default_flow_style=False
    )
    if PY2:
        updated_content = updated_content.decode('utf-8')
    path.write_text(updated_content)
def download(base_url, directory, name):
    directory = Path(directory)
    if not directory.exists():
        directory.mkdir()

    data_url = '{}/{}'.format(base_url, name)
    path = directory / name

    if not path.exists():
        download = curl[data_url, '-o', path, '-L']
        download(stdout=click.get_binary_stream('stdout'),
                 stderr=click.get_binary_stream('stderr'))
    else:
        logger.info('Skipping download: %s already exists', name)

    logger.info('Extracting archive to %s', directory)
    if path.suffix in ('.tar', '.gz', '.bz2', '.xz'):
        with tarfile.open(str(path), mode='r|gz') as f:
            f.extractall(path=str(directory))
def deploy(package_location, artifact_directory, architectures):
    artifact_dir = Path(artifact_directory)
    artifact_dir.mkdir(parents=True, exist_ok=True)

    package_loc = Path(package_location)
    assert package_loc.exists(), 'Path {} does not exist'.format(package_loc)

    for architecture in architectures:
        arch_artifact_directory = str(artifact_dir / architecture)
        arch_package_directory = str(package_loc / architecture)
        shutil.copytree(arch_package_directory, arch_artifact_directory)

    cmd = conda['index', artifact_directory]
    cmd(stdout=click.get_binary_stream('stdout'),
        stderr=click.get_binary_stream('stderr'))
def clickhouse(schema, tables, data_directory, **params):
    data_directory = Path(data_directory)
    logger.info('Initializing ClickHouse...')
    engine = init_database('clickhouse+native', params, schema)

    for table, df in read_tables(tables, data_directory):
        if table == 'batting':
            # float nan problem
            cols = df.select_dtypes([float]).columns
            df[cols] = df[cols].fillna(0).astype(int)
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')
        elif table == 'awards_players':
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')
        insert(engine, table, df)
def postgres(schema, tables, data_directory, **params):
    data_directory = Path(data_directory)
    logger.info('Initializing PostgreSQL...')
    engine = init_database(
        'postgresql', params, schema, isolation_level='AUTOCOMMIT'
    )

    query = "COPY {} FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',')"
    database = params['database']
    for table in tables:
        src = data_directory / '{}.csv'.format(table)
        load = psql['--host', params['host'],
                    '--port', params['port'],
                    '--username', params['user'],
                    '--dbname', database,
                    '--command', query.format(table)]
        with local.env(PGPASSWORD=params['password']):
            with src.open('r') as f:
                load(stdin=f)

    engine.execute('VACUUM FULL ANALYZE')
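# For reference, the plumbum command built above expands to roughly the
# following shell invocation (host/port/user/database values are examples,
# not taken from this repo):
#
#   PGPASSWORD=... psql --host localhost --port 5432 --username postgres \
#       --dbname ibis_testing \
#       --command "COPY diamonds FROM STDIN WITH (FORMAT CSV, HEADER TRUE, DELIMITER ',')"
#
# with the corresponding CSV file streamed in on stdin.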
def bigquery(data_directory, ignore_missing_dependency, **params):
    try:
        import google.api_core.exceptions
        from google.cloud import bigquery
    except ImportError:
        msg = 'google-cloud-bigquery dependency is missing'
        if ignore_missing_dependency:
            logger.warning('Ignored: %s', msg)
            return 0
        else:
            raise click.ClickException(msg)

    project_id = os.environ['GOOGLE_BIGQUERY_PROJECT_ID']
    bqclient = bigquery.Client(project=project_id)

    # Create testing dataset.
    testing_dataset = bqclient.dataset('testing')
    try:
        bqclient.create_dataset(bigquery.Dataset(testing_dataset))
    except google.api_core.exceptions.Conflict:
        pass  # Skip if already created.

    # Set up main data table.
    data_directory = Path(data_directory)
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    functional_alltypes_schema = []
    schema_path = data_directory / 'functional_alltypes_bigquery_schema.json'
    with open(str(schema_path)) as schemafile:
        schema_json = json.load(schemafile)
        for field in schema_json:
            functional_alltypes_schema.append(
                bigquery.SchemaField.from_api_repr(field))
    load_config = bigquery.LoadJobConfig()
    load_config.skip_leading_rows = 1  # skip the header row.
    load_config.schema = functional_alltypes_schema

    # Load main data table.
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes'),
            job_config=load_config).result()
        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Load an ingestion time partitioned table.
    functional_alltypes_path = data_directory / 'functional_alltypes.csv'
    with open(str(functional_alltypes_path), 'rb') as csvfile:
        load_config.time_partitioning = bigquery.TimePartitioning()
        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('functional_alltypes_parted'),
            job_config=load_config).result()
        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Create a table with complex data types (nested and repeated).
    struct_table_path = data_directory / 'struct_table.avro'
    with open(str(struct_table_path), 'rb') as avrofile:
        load_config = bigquery.LoadJobConfig()
        load_config.source_format = 'AVRO'
        job = bqclient.load_table_from_file(
            avrofile,
            testing_dataset.table('struct_table'),
            job_config=load_config).result()
        if job.error_result:
            raise click.ClickException(str(job.error_result))

    # Create empty date-partitioned table.
    date_table = bigquery.Table(testing_dataset.table('date_column_parted'))
    date_table.schema = [
        bigquery.SchemaField('my_date_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    date_table.time_partitioning = bigquery.TimePartitioning(
        field='my_date_parted_col')
    bqclient.create_table(date_table)

    # Create empty timestamp-partitioned tables.
    timestamp_table = bigquery.Table(
        testing_dataset.table('timestamp_column_parted'))
    timestamp_table.schema = [
        bigquery.SchemaField('my_timestamp_parted_col', 'DATE'),
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('int_col', 'INTEGER'),
    ]
    timestamp_table.time_partitioning = bigquery.TimePartitioning(
        field='my_timestamp_parted_col')
    bqclient.create_table(timestamp_table)

    # Create a table with a numeric column
    numeric_table = bigquery.Table(testing_dataset.table('numeric_table'))
    numeric_table.schema = [
        bigquery.SchemaField('string_col', 'STRING'),
        bigquery.SchemaField('numeric_col', 'NUMERIC'),
    ]
    bqclient.create_table(numeric_table)

    df = pd.read_csv(
        str(data_directory / 'functional_alltypes.csv'),
        usecols=['string_col', 'double_col'],
        header=0,
    )
    with tempfile.NamedTemporaryFile(mode='a+b') as csvfile:
        df.to_csv(csvfile, header=False, index=False)
        csvfile.seek(0)

        load_config = bigquery.LoadJobConfig()
        load_config.skip_leading_rows = 1  # skip the header row.
        load_config.schema = numeric_table.schema

        job = bqclient.load_table_from_file(
            csvfile,
            testing_dataset.table('numeric_table'),
            job_config=load_config).result()
        if job.error_result:
            raise click.ClickException(str(job.error_result))
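# Hypothetical post-load check, not part of the loader above: once the jobs
# finish, the same dataset could be inspected through ibis's BigQuery backend,
# for example:
#
#   con = ibis.bigquery.connect(project_id=project_id, dataset_id='testing')
#   assert 'functional_alltypes' in con.list_tables()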
def mapd(schema, tables, data_directory, **params):
    if sys.version_info.major < 3:
        logger.info('MapD backend is unavailable for Python 2.')
        return

    import pymapd

    data_directory = Path(data_directory)
    reserved_words = ['table', 'year', 'month']

    # connection
    logger.info('Initializing MapD...')
    if params['database'] != 'mapd':
        conn = pymapd.connect(host=params['host'],
                              user=params['user'],
                              password=params['password'],
                              port=params['port'],
                              dbname='mapd')
        stmt = 'CREATE DATABASE {}'.format(params['database'])
        try:
            conn.execute(stmt)
        except Exception:
            logger.exception('MapD DDL statement %r failed', stmt)
        conn.close()

    conn = pymapd.connect(host=params['host'],
                          user=params['user'],
                          password=params['password'],
                          port=params['port'],
                          dbname=params['database'])

    # create tables
    for stmt in filter(None, map(str.strip, schema.read().split(';'))):
        try:
            conn.execute(stmt)
        except Exception:
            logger.exception('MapD DDL statement \n%r\n failed', stmt)

    # import data
    for table, df in read_tables(tables, data_directory):
        if table == 'batting':
            # float nan problem
            cols = df.select_dtypes([float]).columns
            df[cols] = df[cols].fillna(0).astype(int)
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')
        elif table == 'awards_players':
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')

        # rename fields
        for df_col in df.columns:
            if ' ' in df_col or ':' in df_col:
                column = df_col.replace(' ', '_').replace(':', '_')
            elif df_col in reserved_words:
                column = '{}_'.format(df_col)
            else:
                continue
            df.rename(columns={df_col: column}, inplace=True)

        conn.load_table_columnar(table, df)

    conn.close()
def mapd(schema, tables, data_directory, **params):
    if sys.version_info[0] < 3:
        click.echo('[MAPD|EE] MapD backend is unavailable for Python 2.')
        return

    import pymapd

    data_directory = Path(data_directory)
    reserved_words = ['table', 'year', 'month']

    # connection
    click.echo('Initializing MapD...')
    if params['database'] != 'mapd':
        conn = pymapd.connect(host=params['host'],
                              user=params['user'],
                              password=params['password'],
                              port=params['port'],
                              dbname='mapd')
        try:
            conn.execute('CREATE DATABASE {}'.format(params['database']))
        except Exception as e:
            click.echo('[MAPD|WW]{}'.format(e))
        conn.close()

    conn = pymapd.connect(host=params['host'],
                          user=params['user'],
                          password=params['password'],
                          port=params['port'],
                          dbname=params['database'])

    # create tables
    for stmt in schema.read().split(';'):
        stmt = stmt.strip()
        if len(stmt):
            try:
                conn.execute(stmt)
            except Exception as e:
                click.echo('[MAPD|WW] {}'.format(str(e)))
    click.echo('[MAPD|II] Creating tables ... OK')

    # import data
    click.echo('[MAPD|II] Loading data ...')
    for table, df in read_tables(tables, data_directory):
        if table == 'batting':
            # float nan problem
            cols = df.select_dtypes([float]).columns
            df[cols] = df[cols].fillna(0).astype(int)
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')
        elif table == 'awards_players':
            # string None driver problem
            cols = df.select_dtypes([object]).columns
            df[cols] = df[cols].fillna('')

        # rename fields
        for df_col in df.columns:
            if ' ' in df_col or ':' in df_col:
                column = df_col.replace(' ', '_').replace(':', '_')
            elif df_col in reserved_words:
                column = '{}_'.format(df_col)
            else:
                continue
            df.rename(columns={df_col: column}, inplace=True)

        conn.load_table_columnar(table, df)

    conn.close()
    click.echo('[MAPD|II] Done!')
import os
import random
import shutil
import sys
import tempfile

import click
import ruamel.yaml

from jinja2 import Environment, FileSystemLoader
from plumbum.cmd import git, conda

import ibis

from ibis.compat import Path, PY2

IBIS_DIR = Path(__file__).parent.parent.absolute()


def render(path):
    env = Environment(loader=FileSystemLoader(str(path.parent)))
    template = env.get_template(path.name)
    return template.render()


@click.group()
def cli():
    pass


default_repo = 'https://github.com/conda-forge/ibis-framework-feedstock'
default_dest = os.path.join(
import json
import os
import sys
import tarfile
import tempfile
import warnings

import click
import six
import pandas as pd
import sqlalchemy as sa

from toolz import dissoc
from plumbum import local
from plumbum.cmd import curl, psql

import ibis
from ibis.compat import Path

SCRIPT_DIR = Path(__file__).parent.absolute()
DATA_DIR = Path(
    os.environ.get(
        'IBIS_TEST_DATA_DIRECTORY', SCRIPT_DIR / 'ibis-testing-data'
    )
)
TEST_TABLES = ['functional_alltypes', 'diamonds', 'batting', 'awards_players']

logger = ibis.util.get_logger('datamgr')


def recreate_database(driver, params, **kwargs):
    url = sa.engine.url.URL(driver, **dissoc(params, 'database'))
    engine = sa.create_engine(url, **kwargs)

    with engine.connect() as conn:
        conn.execute('DROP DATABASE IF EXISTS {}'.format(params['database']))
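# The loaders in this file rely on a few helpers (init_database, read_tables,
# insert, insert_tables) that are defined elsewhere in the full module. The
# following is a minimal sketch of what the table helpers could look like,
# assuming one <table>.csv file per table and a SQLAlchemy engine; it is an
# illustration, not the actual implementation.
def read_tables(names, data_directory):
    for name in names:
        path = data_directory / '{}.csv'.format(name)
        yield name, pd.read_csv(str(path), index_col=None, header=0)


def insert(engine, tablename, df):
    # append the dataframe to an already-created table
    df.to_sql(tablename, engine, index=False, if_exists='append')


def insert_tables(engine, names, data_directory):
    for table, df in read_tables(names, data_directory):
        insert(engine, table, df)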
def __init__(self, root):
    self.root = Path(str(root))
    self.dictionary = {}