def __init__(self, uri=None, # type: Optional[str] user=None, # type: Optional[str] password=None, # type: Optional[str] host=None, # type: Optional[str] port=9091, # type: Optional[int] dbname=None, # type: Optional[str] protocol='binary', # type: Optional[str] ): # type: (...) -> None if uri is not None: if not all([user is None, password is None, host is None, port == 9091, dbname is None, protocol == 'binary']): raise TypeError("Cannot specify both URI and other arguments") user, password, host, port, dbname, protocol = _parse_uri(uri) if host is None: raise TypeError("`host` parameter is required.") if protocol in ("http", "https"): if not host.startswith(protocol): # the THttpClient expects http[s]://localhost host = protocol + '://' + host transport = THttpClient.THttpClient("{}:{}".format(host, port)) proto = TJSONProtocol.TJSONProtocol(transport) socket = None elif protocol == "binary": socket = TSocket.TSocket(host, port) transport = TTransport.TBufferedTransport(socket) proto = TBinaryProtocol.TBinaryProtocolAccelerated(transport) else: raise ValueError("`protocol` should be one of", " ['http', 'https', 'binary'],", " got {} instead".format(protocol)) self._user = user self._password = password self._host = host self._port = port self._dbname = dbname self._transport = transport self._protocol = protocol self._socket = socket self._closed = 0 self._tdf = None try: self._transport.open() except TTransportException as e: if e.NOT_OPEN: err = OperationalError("Could not connect to database") six.raise_from(err, e) else: raise self._client = Client(proto) try: self._session = self._client.connect(user, password, dbname) except TMapDException as e: six.raise_from(_translate_exception(e), e)
def __init__( self, uri=None, user=None, password=None, host=None, port=6274, dbname=None, protocol='binary', sessionid=None, ): if sessionid is not None: if any([user, password, uri, dbname]): raise TypeError("Cannot specify sessionid with user, password," " dbname, or uri") if uri is not None: if not all([ user is None, password is None, host is None, port == 6274, dbname is None, protocol == 'binary' ]): raise TypeError("Cannot specify both URI and other arguments") user, password, host, port, dbname, protocol = _parse_uri(uri) if host is None: raise TypeError("`host` parameter is required.") if protocol in ("http", "https"): if not host.startswith(protocol): # the THttpClient expects http[s]://localhost host = protocol + '://' + host transport = THttpClient.THttpClient("{}:{}".format(host, port)) proto = TJSONProtocol.TJSONProtocol(transport) socket = None elif protocol == "binary": socket = TSocket.TSocket(host, port) transport = TTransport.TBufferedTransport(socket) proto = TBinaryProtocol.TBinaryProtocolAccelerated(transport) else: raise ValueError("`protocol` should be one of", " ['http', 'https', 'binary'],", " got {} instead".format(protocol)) self._user = user self._password = password self._host = host self._port = port self._dbname = dbname self._transport = transport self._protocol = protocol self._socket = socket self._closed = 0 self._tdf = None try: self._transport.open() except TTransportException as e: if e.NOT_OPEN: err = OperationalError("Could not connect to database") raise err from e else: raise self._client = Client(proto) try: # If a sessionid was passed, we should validate it if sessionid: self._session = sessionid self.get_tables() else: self._session = self._client.connect(user, password, dbname) except TMapDException as e: raise _translate_exception(e) from e except TTransportException: raise ValueError(f"Connection failed with port {port} and " f"protocol '{protocol}'. Try port 6274 for " "protocol == binary or 6273, 6278 or 443 for " "http[s]") # if OmniSci version <4.6, raise RuntimeError, as data import can be # incorrect for columnar date loads # Caused by https://github.com/omnisci/pymapd/pull/188 semver = self._client.get_version() if Version(semver.split("-")[0]) < Version("4.6"): raise RuntimeError(f"Version {semver} of OmniSci detected. " "Please use pymapd <0.11. See release notes " "for more details.")
class Connection: """Connect to your OmniSci database.""" def __init__( self, uri=None, user=None, password=None, host=None, port=6274, dbname=None, protocol='binary', sessionid=None, ): if sessionid is not None: if any([user, password, uri, dbname]): raise TypeError("Cannot specify sessionid with user, password," " dbname, or uri") if uri is not None: if not all([ user is None, password is None, host is None, port == 6274, dbname is None, protocol == 'binary' ]): raise TypeError("Cannot specify both URI and other arguments") user, password, host, port, dbname, protocol = _parse_uri(uri) if host is None: raise TypeError("`host` parameter is required.") if protocol in ("http", "https"): if not host.startswith(protocol): # the THttpClient expects http[s]://localhost host = protocol + '://' + host transport = THttpClient.THttpClient("{}:{}".format(host, port)) proto = TJSONProtocol.TJSONProtocol(transport) socket = None elif protocol == "binary": socket = TSocket.TSocket(host, port) transport = TTransport.TBufferedTransport(socket) proto = TBinaryProtocol.TBinaryProtocolAccelerated(transport) else: raise ValueError("`protocol` should be one of", " ['http', 'https', 'binary'],", " got {} instead".format(protocol)) self._user = user self._password = password self._host = host self._port = port self._dbname = dbname self._transport = transport self._protocol = protocol self._socket = socket self._closed = 0 self._tdf = None try: self._transport.open() except TTransportException as e: if e.NOT_OPEN: err = OperationalError("Could not connect to database") raise err from e else: raise self._client = Client(proto) try: # If a sessionid was passed, we should validate it if sessionid: self._session = sessionid self.get_tables() else: self._session = self._client.connect(user, password, dbname) except TMapDException as e: raise _translate_exception(e) from e except TTransportException: raise ValueError(f"Connection failed with port {port} and " f"protocol '{protocol}'. Try port 6274 for " "protocol == binary or 6273, 6278 or 443 for " "http[s]") # if OmniSci version <4.6, raise RuntimeError, as data import can be # incorrect for columnar date loads # Caused by https://github.com/omnisci/pymapd/pull/188 semver = self._client.get_version() if Version(semver.split("-")[0]) < Version("4.6"): raise RuntimeError(f"Version {semver} of OmniSci detected. " "Please use pymapd <0.11. See release notes " "for more details.") def __repr__(self): tpl = ('Connection(mapd://{user}:***@{host}:{port}/{dbname}?protocol' '={protocol})') return tpl.format(user=self._user, host=self._host, port=self._port, dbname=self._dbname, protocol=self._protocol) def __del__(self): self.close() def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close() @property def closed(self): return self._closed def close(self): """Disconnect from the database""" try: self._client.disconnect(self._session) except (TMapDException, AttributeError, TypeError): pass finally: self._closed = 1 def commit(self): """This is a noop, as OmniSci does not provide transactions. Implemented to comply with the DBI specification. """ return None def execute(self, operation, parameters=None): """Execute a SQL statement Parameters ---------- operation: str A SQL statement to exucute Returns ------- c: Cursor """ c = Cursor(self) return c.execute(operation.strip(), parameters=parameters) def cursor(self): """Create a new :class:`Cursor` object attached to this connection.""" return Cursor(self) def select_ipc_gpu(self, operation, parameters=None, device_id=0, first_n=-1, release_memory=True): """Execute a ``SELECT`` operation using GPU memory. Parameters ---------- operation: str A SQL statement parameters: dict, optional Parameters to insert into a parametrized query device_id: int GPU to return results to first_n: int, optional Number of records to return release_memory: bool, optional Call ``self.deallocate_ipc_gpu(df)`` after DataFrame created Returns ------- gdf: cudf.GpuDataFrame Notes ----- This method requires ``cudf`` and ``libcudf`` to be installed. An ``ImportError`` is raised if those aren't available. This method requires the Python code to be executed on the same machine where OmniSci running. """ try: from cudf.comm.gpuarrow import GpuArrowReader # noqa from cudf.dataframe import DataFrame # noqa except ImportError: raise ImportError("The 'cudf' package is required for " "`select_ipc_gpu`") if parameters is not None: operation = str(_bind_parameters(operation, parameters)) tdf = self._client.sql_execute_gdf(self._session, operation.strip(), device_id=device_id, first_n=first_n) self._tdf = tdf df = _parse_tdf_gpu(tdf) # Deallocate TDataFrame at OmniSci instance if release_memory: self.deallocate_ipc_gpu(df) return df def select_ipc(self, operation, parameters=None, first_n=-1, release_memory=True): """Execute a ``SELECT`` operation using CPU shared memory Parameters ---------- operation: str A SQL select statement parameters: dict, optional Parameters to insert for a parametrized query first_n: int, optional Number of records to return release_memory: bool, optional Call ``self.deallocate_ipc(df)`` after DataFrame created Returns ------- df: pandas.DataFrame Notes ----- This method requires the Python code to be executed on the same machine where OmniSci running. """ if parameters is not None: operation = str(_bind_parameters(operation, parameters)) tdf = self._client.sql_execute_df(self._session, operation.strip(), device_type=0, device_id=0, first_n=first_n) self._tdf = tdf sm_buf = load_buffer(tdf.sm_handle, tdf.sm_size) df_buf = load_buffer(tdf.df_handle, tdf.df_size) schema = _load_schema(sm_buf[0]) df = _load_data(df_buf[0], schema, tdf) # free shared memory from Python # https://github.com/omnisci/pymapd/issues/46 # https://github.com/omnisci/pymapd/issues/31 free_sm = shmdt(ctypes.cast(sm_buf[1], ctypes.c_void_p)) # noqa free_df = shmdt(ctypes.cast(df_buf[1], ctypes.c_void_p)) # noqa # Deallocate TDataFrame at OmniSci instance if release_memory: self.deallocate_ipc(df) return df def deallocate_ipc_gpu(self, df, device_id=0): """Deallocate a DataFrame using GPU memory. Parameters ---------- device_ids: int GPU which contains TDataFrame """ tdf = df.get_tdf() result = self._client.deallocate_df(session=self._session, df=tdf, device_type=TDeviceType.GPU, device_id=device_id) return result def deallocate_ipc(self, df, device_id=0): """Deallocate a DataFrame using CPU shared memory. Parameters ---------- device_id: int GPU which contains TDataFrame """ tdf = df.get_tdf() result = self._client.deallocate_df(session=self._session, df=tdf, device_type=TDeviceType.CPU, device_id=device_id) return result # -------------------------------------------------------------------------- # Convenience methods # -------------------------------------------------------------------------- def get_tables(self): """List all the tables in the database Examples -------- >>> con.get_tables() ['flights_2008_10k', 'stocks'] """ return self._client.get_tables(self._session) def get_table_details(self, table_name): """Get the column names and data types associated with a table. Parameters ---------- table_name: str Returns ------- details: List[tuples] Examples -------- >>> con.get_table_details('stocks') [ColumnDetails(name='date_', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT'), ColumnDetails(name='trans', type='STR', nullable=True, precision=0, scale=0, comp_param=32, encoding='DICT'), ... ] """ details = self._client.get_table_details(self._session, table_name) return _extract_column_details(details.row_desc) def create_table(self, table_name, data, preserve_index=False): """Create a table from a pandas.DataFrame Parameters ---------- table_name: str data: DataFrame preserve_index: bool, default False Whether to create a column in the table for the DataFrame index """ row_desc = build_row_desc(data, preserve_index=preserve_index) self._client.create_table(self._session, table_name, row_desc, TFileType.DELIMITED, TCreateParams(False)) def load_table(self, table_name, data, method='infer', preserve_index=False, create='infer'): """Load data into a table Parameters ---------- table_name: str data: pyarrow.Table, pandas.DataFrame, or iterable of tuples method: {'infer', 'columnar', 'rows', 'arrow'} Method to use for loading the data. Three options are available 1. ``pyarrow`` and Apache Arrow loader 2. columnar loader 3. row-wise loader The Arrow loader is typically the fastest, followed by the columnar loader, followed by the row-wise loader. If a DataFrame or ``pyarrow.Table`` is passed and ``pyarrow`` is installed, the Arrow-based loader will be used. If arrow isn't available, the columnar loader is used. Finally, ``data`` is an iterable of tuples the row-wise loader is used. preserve_index: bool, default False Whether to keep the index when loading a pandas DataFrame create: {"infer", True, False} Whether to issue a CREATE TABLE before inserting the data. * infer: check to see if the table already exists, and create a table if it does not * True: attempt to create the table, without checking if it exists * False: do not attempt to create the table See Also -------- load_table_arrow load_table_columnar """ if create not in ['infer', True, False]: raise ValueError(f"Unexpected value for create: '{create}'. " "Expected one of {'infer', True, False}") if create == 'infer': # ask the database if we already exist, creating if not create = table_name not in set( self._client.get_tables(self._session)) if create: self.create_table(table_name, data) if method == 'infer': if (isinstance(data, pd.DataFrame) or isinstance(data, pa.Table) or isinstance(data, pa.RecordBatch)): # noqa return self.load_table_arrow(table_name, data) elif (isinstance(data, pd.DataFrame)): return self.load_table_columnar(table_name, data) elif method == 'arrow': return self.load_table_arrow(table_name, data) elif method == 'columnar': return self.load_table_columnar(table_name, data) elif method != 'rows': raise TypeError( "Method must be one of {{'infer', 'arrow', " "'columnar', 'rows'}}. Got {} instead".format(method)) if isinstance(data, pd.DataFrame): # We need to convert a Pandas dataframe to a list of tuples before # loading row wise data = data.itertuples(index=preserve_index, name=None) input_data = _build_input_rows(data) self._client.load_table(self._session, table_name, input_data) def load_table_rowwise(self, table_name, data): """Load data into a table row-wise Parameters ---------- table_name: str data: Iterable of tuples Each element of `data` should be a row to be inserted See Also -------- load_table load_table_arrow load_table_columnar Examples -------- >>> data = [(1, 'a'), (2, 'b'), (3, 'c')] >>> con.load_table('bar', data) """ input_data = _build_input_rows(data) self._client.load_table(self._session, table_name, input_data) def load_table_columnar(self, table_name, data, preserve_index=False, chunk_size_bytes=0, col_names_from_schema=False): """Load a pandas DataFrame to the database using OmniSci's Thrift-based columnar format Parameters ---------- table_name: str data: DataFrame preserve_index: bool, default False Whether to include the index of a pandas DataFrame when writing. chunk_size_bytes: integer, default 0 Chunk the loading of columns to prevent large Thrift requests. A value of 0 means do not chunk and send the dataframe as a single request col_names_from_schema: bool, default False Read the existing table schema to determine the column names. This will read the schema of an existing table in OmniSci and match those names to the column names of the dataframe. This is for user convenience when loading from data that is unordered, especially handy when a table has a large number of columns. Examples -------- >>> df = pd.DataFrame({"a": [1, 2, 3], "b": ['d', 'e', 'f']}) >>> con.load_table_columnar('foo', df, preserve_index=False) See Also -------- load_table load_table_arrow load_table_rowwise Notes ----- Use ``pymapd >= 0.11.0`` while running with ``omnisci >= 4.6.0`` in order to avoid loading inconsistent values into DATE column. """ if isinstance(data, pd.DataFrame): table_details = self.get_table_details(table_name) # Validate that there are the same number of columns in the table # as there are in the dataframe. No point trying to load the data # if this is not the case if len(table_details) != len(data.columns): raise ValueError( 'Number of columns in dataframe ({}) does not \ match number of columns in OmniSci table \ ({})'.format(len(data.columns), len(table_details))) col_names = [i[0] for i in table_details] if \ col_names_from_schema \ else list(data) col_types = [(i[1], i[4]) for i in table_details] input_cols = _pandas_loaders.build_input_columnar( data, preserve_index=preserve_index, chunk_size_bytes=chunk_size_bytes, col_types=col_types, col_names=col_names) else: raise TypeError("Unknown type {}".format(type(data))) for cols in input_cols: self._client.load_table_binary_columnar(self._session, table_name, cols) def load_table_arrow(self, table_name, data, preserve_index=False): """Load a pandas.DataFrame or a pyarrow Table or RecordBatch to the database using Arrow columnar format for interchange Parameters ---------- table_name: str data: pandas.DataFrame, pyarrow.RecordBatch, pyarrow.Table preserve_index: bool, default False Whether to include the index of a pandas DataFrame when writing. Examples -------- >>> df = pd.DataFrame({"a": [1, 2, 3], "b": ['d', 'e', 'f']}) >>> con.load_table_arrow('foo', df, preserve_index=False) See Also -------- load_table load_table_columnar load_table_rowwise """ metadata = self.get_table_details(table_name) payload = _serialize_arrow_payload(data, metadata, preserve_index=preserve_index) self._client.load_table_binary_arrow(self._session, table_name, payload.to_pybytes()) def render_vega(self, vega, compression_level=1): """Render vega data on the database backend, returning the image as a PNG. Parameters ---------- vega: dict The vega specification to render. compression_level: int The level of compression for the rendered PNG. Ranges from 0 (low compression, faster) to 9 (high compression, slower). """ result = self._client.render_vega(self._session, widget_id=None, vega_json=vega, compression_level=compression_level, nonce=None) rendered_vega = RenderedVega(result) return rendered_vega def get_dashboards(self): """List all the dashboards in the database Examples -------- >>> con.get_dashboards() """ dashboards = self._client.get_dashboards(session=self._session) return dashboards def duplicate_dashboard(self, dashboard_id, new_name=None, source_remap=None): """ Duplicate an existing dashboard, returning the new dashboard id. Parameters ---------- dashboard_id: int The id of the dashboard to duplicate new_name: str The name for the new dashboard source_remap: dict EXPERIMENTAL A dictionary remapping table names. The old table name(s) should be keys of the dict, with each value being another dict with a 'name' key holding the new table value. This structure can be used later to support changing column names. Examples -------- >>> source_remap = {'oldtablename1': {'name': 'newtablename1'}, \ 'oldtablename2': {'name': 'newtablename2'}} >>> newdash = con.duplicate_dashboard(12345, "new dash", source_remap) """ source_remap = source_remap or {} d = self._client.get_dashboard(session=self._session, dashboard_id=dashboard_id) newdashname = new_name or '{0} (Copy)'.format(d.dashboard_name) d = change_dashboard_sources(d, source_remap) if source_remap else d new_dashboard_id = self._client.create_dashboard( session=self._session, dashboard_name=newdashname, dashboard_state=d.dashboard_state, image_hash='', dashboard_metadata=d.dashboard_metadata, ) return new_dashboard_id
class Connection(object): """Connect to your mapd database.""" def __init__( self, uri=None, # type: Optional[str] user=None, # type: Optional[str] password=None, # type: Optional[str] host=None, # type: Optional[str] port=9091, # type: Optional[int] dbname=None, # type: Optional[str] protocol='binary', # type: Optional[str] ): # type: (...) -> None if uri is not None: if not all([ user is None, password is None, host is None, port == 9091, dbname is None, protocol == 'binary' ]): raise TypeError("Cannot specify both URI and other arguments") user, password, host, port, dbname, protocol = _parse_uri(uri) if host is None: raise TypeError("`host` parameter is required.") if protocol in ("http", "https"): if not host.startswith(protocol): # the THttpClient expects http[s]://localhost host = protocol + '://' + host transport = THttpClient.THttpClient("{}:{}".format(host, port)) proto = TJSONProtocol.TJSONProtocol(transport) socket = None elif protocol == "binary": socket = TSocket.TSocket(host, port) transport = TTransport.TBufferedTransport(socket) proto = TBinaryProtocol.TBinaryProtocolAccelerated(transport) else: raise ValueError("`protocol` should be one of", " ['http', 'https', 'binary'],", " got {} instead".format(protocol)) self._user = user self._password = password self._host = host self._port = port self._dbname = dbname self._transport = transport self._protocol = protocol self._socket = socket self._closed = 0 self._tdf = None try: self._transport.open() except TTransportException as e: if e.NOT_OPEN: err = OperationalError("Could not connect to database") six.raise_from(err, e) else: raise self._client = Client(proto) try: self._session = self._client.connect(user, password, dbname) except TMapDException as e: six.raise_from(_translate_exception(e), e) def __repr__(self): # type: () -> str tpl = ('Connection(mapd://{user}:***@{host}:{port}/{dbname}?protocol' '={protocol})') return tpl.format(user=self._user, host=self._host, port=self._port, dbname=self._dbname, protocol=self._protocol) def __del__(self): # type: () -> None self.close() def __enter__(self): return self.cursor() def __exit__(self, exc_type, exc_val, exc_tb): pass @property def closed(self): return self._closed def close(self): # type: () -> None """Disconnect from the database""" try: self._client.disconnect(self._session) except (TMapDException, AttributeError): pass finally: self._closed = 1 def commit(self): # type: () -> None """This is a noop, as mapd does not provide transactions. Implementing to comply with the specification. """ return None def execute(self, operation, parameters=None): # type: (str, Optional[Tuple]) -> Cursor """Execute a SQL statement Parameters ---------- operation : str A SQL statement to exucute Returns ------- c : Cursor """ c = Cursor(self) return c.execute(operation, parameters=parameters) def cursor(self): # type: () -> Cursor """Create a new :class:`Cursor` object attached to this connection.""" return Cursor(self) def select_ipc_gpu(self, operation, parameters=None, device_id=0, first_n=-1): """Execute a ``SELECT`` operation using GPU memory. Parameters ---------- operation : str A SQL statement parameters : dict, optional Parameters to insert into a parametrized query device_id : int GPU to return results to Returns ------- gdf : pygdf.GpuDataFrame Notes ----- This requires the option ``pygdf`` and ``libgdf`` libraries. An ``ImportError`` is raised if those aren't available. """ try: from pygdf.gpuarrow import GpuArrowReader # noqa from pygdf.dataframe import DataFrame # noqa except ImportError: raise ImportError("The 'pygdf' package is required for " "`select_ipc_gpu`") if parameters is not None: operation = str(_bind_parameters(operation, parameters)) tdf = self._client.sql_execute_gdf(self._session, operation, device_id=device_id, first_n=first_n) self._tdf = tdf return _parse_tdf_gpu(tdf) def select_ipc(self, operation, parameters=None, first_n=-1): """Execute a ``SELECT`` operation using CPU shared memory Parameters ---------- operation : str A SQL select statement parameters : dict, optional Parameters to insert for a parametrized query Returns ------- df : pandas.DataFrame Notes ----- This method requires pandas and pyarrow to be installed """ try: import pyarrow # noqa except ImportError: raise ImportError("pyarrow is required for `select_ipc`") try: import pandas # noqa except ImportError: raise ImportError("pandas is required for `select_ipc`") from .shm import load_buffer if parameters is not None: operation = str(_bind_parameters(operation, parameters)) tdf = self._client.sql_execute_df(self._session, operation, device_type=0, device_id=0, first_n=first_n) self._tdf = tdf sm_buf = load_buffer(tdf.sm_handle, tdf.sm_size) df_buf = load_buffer(tdf.df_handle, tdf.df_size) schema = _load_schema(sm_buf) df = _load_data(df_buf, schema, tdf) return df def deallocate_ipc_gpu(self, df, device_id=0): # type: () -> None """Deallocate a DataFrame using GPU memory. Parameters ---------- device_id : int GPU which contains TDataFrame """ tdf = df.get_tdf() result = self._client.deallocate_df(session=self._session, df=tdf, device_type=TDeviceType.GPU, device_id=device_id) return result def deallocate_ipc(self, df, device_id=0): # type: () -> None """Deallocate a DataFrame using CPU shared memory. Parameters ---------- device_id : int GPU which contains TDataFrame """ tdf = df.get_tdf() result = self._client.deallocate_df(session=self._session, df=tdf, device_type=TDeviceType.CPU, device_id=device_id) return result # -------------------------------------------------------------------------- # Convenience methods # -------------------------------------------------------------------------- def get_tables(self): """List all the tables in the database Examples -------- >>> con.get_tables() ['flights_2008_10k', 'stocks'] """ return self._client.get_tables(self._session) def get_table_details(self, table_name): """Get the column names and data types associated with a table. Parameters ---------- table_name : str Returns ------- details : List[tuples] Examples -------- >>> con.get_table_details('stocks') [ColumnDetails(name='date_', type='STR', nullable=True, precision=0, scale=0, comp_param=32), ColumnDetails(name='trans', type='STR', nullable=True, precision=0, scale=0, comp_param=32), ... ] """ details = self._client.get_table_details(self._session, table_name) return _extract_column_details(details.row_desc) def create_table(self, table_name, data, preserve_index=False): """Create a table from a pandas.DataFrame Parameters ---------- table_name : str data : DataFrame preserve_index : bool, default False Whether to create a column in the table for the DataFrame index """ from mapd.ttypes import TTableType from ._pandas_loaders import build_row_desc row_desc = build_row_desc(data, preserve_index=preserve_index) self._client.create_table(self._session, table_name, row_desc, TTableType.DELIMITED) def load_table(self, table_name, data, method='infer', preserve_index=False, create='infer'): """Load data into a table Parameters ---------- table_name : str data : pyarrow.Table, pandas.DataFrame, or iterable of tuples method : {'infer', 'columnar', 'rows'} Method to use for loading the data. Three options are available 1. ``pyarrow`` and Apache Arrow loader 2. columnar loader 3. row-wise loader The Arrow loader is typically the fastest, followed by the columnar loader, followed by the row-wise loader. If a DataFrame or ``pyarrow.Table`` is passed and ``pyarrow`` is installed, the Arrow-based loader will be used. If arrow isn't available, the columnar loader is used. Finally, ``data`` is an iterable of tuples the row-wise loader is used. preserve_index : bool, default False Whether to keep the index when loading a pandas DataFrame create : {"infer", True, False} Whether to issue a CREATE TABLE before inserting the data. * infer : check to see if the table already exists, and create a table if it does not * True : attempt to create the table, without checking if it exists * False : do not attempt to create the table See Also -------- load_table_arrow load_table_columnar """ create = _check_create(create) if create == 'infer': # ask the database if we already exist, creating if not create = table_name not in set( self._client.get_tables(self._session)) if create: self.create_table(table_name, data) if method == 'infer': if (_is_pandas(data) or _is_arrow(data)) and _HAS_ARROW: return self.load_table_arrow(table_name, data) elif _is_pandas(data): return self.load_table_columnar(table_name, data) elif method == 'arrow': return self.load_table_arrow(table_name, data) elif method == 'columnar': return self.load_table_columnar(table_name, data) elif method != 'rows': raise TypeError( "Method must be one of {{'infer', 'arrow', " "'columnar', 'rows'}}. Got {} instead".format(method)) input_data = _build_input_rows(data) self._client.load_table(self._session, table_name, input_data) def load_table_rowwise(self, table_name, data): """Load data into a table row-wise Parameters ---------- table_name : str data : Iterable of tuples Each element of `data` should be a row to be inserted See Also -------- load_table load_table_arrow load_table_columnar Examples -------- >>> data = [(1, 'a'), (2, 'b'), (3, 'c')] >>> con.load_table('bar', data) """ input_data = _build_input_rows(data) self._client.load_table(self._session, table_name, input_data) def load_table_columnar(self, table_name, data, preserve_index=False): """Load a pandas DataFrame to the database using MapD's Thrift-based columnar format Parameters ---------- table_name : str data : DataFrame preserve_index : bool, default False Whether to include the index of a pandas DataFrame when writing. Examples -------- >>> df = pd.DataFrame({"a": [1, 2, 3], "b": ['d', 'e', 'f']}) >>> con.load_table_columnar('foo', df, preserve_index=False) See Also -------- load_table load_table_arrow load_table_rowwise """ from . import _pandas_loaders if _is_pandas(data): input_cols = _pandas_loaders.build_input_columnar( data, preserve_index=preserve_index) else: raise TypeError("Unknown type {}".format(type(data))) self._client.load_table_binary_columnar(self._session, table_name, input_cols) def load_table_arrow(self, table_name, data, preserve_index=False): """Load a pandas.DataFrame or a pyarrow Table or RecordBatch to the database using Arrow columnar format for interchange Parameters ---------- table_name : str data : pandas.DataFrame, pyarrow.RecordBatch, pyarrow.Table preserve_index : bool, default False Whether to include the index of a pandas DataFrame when writing. Examples -------- >>> df = pd.DataFrame({"a": [1, 2, 3], "b": ['d', 'e', 'f']}) >>> con.load_table_arrow('foo', df, preserve_index=False) See Also -------- load_table load_table_columnar load_table_rowwise """ metadata = self.get_table_details(table_name) from ._pandas_loaders import _serialize_arrow_payload payload = _serialize_arrow_payload(data, metadata, preserve_index=preserve_index) self._client.load_table_binary_arrow(self._session, table_name, payload.to_pybytes()) def render_vega(self, vega, compression_level=1): """Render vega data on the database backend, returning the image as a PNG. Parameters ---------- vega : dict The vega specification to render. compression_level: int The level of compression for the rendered PNG. Ranges from 0 (low compression, faster) to 9 (high compression, slower). """ result = self._client.render_vega(self._session, widget_id=None, vega_json=vega, compression_level=compression_level, nonce=None) rendered_vega = RenderedVega(result) return rendered_vega
class Connection(object): """Connect to your mapd database.""" def __init__( self, uri=None, # type: Optional[str] user=None, # type: Optional[str] password=None, # type: Optional[str] host=None, # type: Optional[str] port=9091, # type: Optional[int] dbname=None, # type: Optional[str] protocol='binary', # type: Optional[str] ): # type: (...) -> None if uri is not None: if not all([ user is None, password is None, host is None, port == 9091, dbname is None, protocol == 'binary' ]): raise TypeError("Cannot specify both URI and other arguments") user, password, host, port, dbname, protocol = _parse_uri(uri) if host is None: raise TypeError("`host` parameter is required.") if protocol == "http": if not host.startswith(protocol): # the THttpClient expects http[s]://localhost host = protocol + '://' + host transport = THttpClient.THttpClient("{}:{}".format(host, port)) proto = TJSONProtocol.TJSONProtocol(transport) socket = None elif protocol == "binary": socket = TSocket.TSocket(host, port) transport = TTransport.TBufferedTransport(socket) proto = TBinaryProtocol.TBinaryProtocolAccelerated(transport) else: raise ValueError("`protocol` should be one of ['http', 'binary'],", " got {} instead".format(protocol)) self._user = user self._password = password self._host = host self._port = port self._dbname = dbname self._transport = transport self._protocol = protocol self._socket = socket self._closed = 0 try: self._transport.open() except TTransportException as e: if e.NOT_OPEN: err = OperationalError("Could not connect to database") six.raise_from(err, e) else: raise self._client = Client(proto) try: self._session = self._client.connect(user, password, dbname) except TMapDException as e: six.raise_from(_translate_exception(e), e) def __repr__(self): # type: () -> str tpl = ('Connection(mapd://{user}:***@{host}:{port}/{dbname}?protocol' '={protocol})') return tpl.format(user=self._user, host=self._host, port=self._port, dbname=self._dbname, protocol=self._protocol) def __del__(self): # type: () -> None self.close() def __enter__(self): return self.cursor() def __exit__(self, exc_type, exc_val, exc_tb): pass @property def closed(self): return self._closed def close(self): # type: () -> None """Disconnect from the database""" try: self._client.disconnect(self._session) except (TMapDException, AttributeError): pass finally: self._closed = 1 def commit(self): # type: () -> None """This is a noop, as mapd does not provide transactions. Implementing to comply with the specification. """ return None def execute(self, operation, parameters=None): # type: (str, Optional[Tuple]) -> Cursor """Execute a SQL statement Parameters ---------- operation : str A SQL statement to exucute Returns ------- c : Cursor """ c = Cursor(self) return c.execute(operation, parameters=parameters) def cursor(self): # type: () -> Cursor """Create a new :class:`Cursor` object attached to this connection.""" return Cursor(self) def select_ipc_gpu(self, operation, parameters=None, device_id=0, first_n=-1): """Execute a ``SELECT`` operation using GPU memory. Parameters ---------- operation : str A SQL statement parameters : dict, optional Parameters to insert into a parametrized query device_id : int GPU to return results to Returns ------- gdf : pygdf.GpuDataFrame Notes ----- This requires the option ``pygdf`` and ``libgdf`` libraries. An ``ImportError`` is raised if those aren't available. """ try: from pygdf.gpuarrow import GpuArrowReader # noqa from pygdf.dataframe import DataFrame # noqa except ImportError: raise ImportError("The 'pygdf' package is required for " "`select_ipc_gpu`") if parameters is not None: operation = str(_bind_parameters(operation, parameters)) tdf = self._client.sql_execute_gdf(self._session, operation, device_id=device_id, first_n=first_n) return _parse_tdf_gpu(tdf) def select_ipc(self, operation, parameters=None, first_n=-1): """Execute a ``SELECT`` operation using CPU shared memory Parameters ---------- operation : str A SQL select statement parameters : dict, optional Parameters to insert for a parametrized query Returns ------- df : pandas.DataFrame Notes ----- This method requires pandas and pyarrow to be installed """ try: import pyarrow # noqa except ImportError: raise ImportError("pyarrow is required for `select_ipc`") try: import pandas # noqa except ImportError: raise ImportError("pandas is required for `select_ipc`") from .shm import load_buffer if parameters is not None: operation = str(_bind_parameters(operation, parameters)) tdf = self._client.sql_execute_df(self._session, operation, device_type=0, device_id=0, first_n=first_n) sm_buf = load_buffer(tdf.sm_handle, tdf.sm_size) df_buf = load_buffer(tdf.df_handle, tdf.df_size) schema = _load_schema(sm_buf) df = _load_data(df_buf, schema) return df # -------------------------------------------------------------------------- # Convenience methods # -------------------------------------------------------------------------- def get_tables(self): """List all the tables in the database Examples -------- >>> con.get_tables() ['flights_2008_10k', 'stocks'] """ return self._client.get_tables(self._session) def get_table_details(self, table_name): """Get the column names and data types associated with a table. Parameters ---------- table_name : str Returns ------- details : List[tuples] Examples -------- >>> con.get_table_details('stocks') [ColumnDetails(name='date_', type='STR', nullable=True, precision=0, scale=0, comp_param=32), ColumnDetails(name='trans', type='STR', nullable=True, precision=0, scale=0, comp_param=32), ... ] """ details = self._client.get_table_details(self._session, table_name) return _extract_column_details(details.row_desc) def load_table(self, table_name, data, method='infer'): """Load rows of data into a table Parameters ---------- table_name : str data : iterable of tuples method : {'infer', 'columnar', 'rows'} Method to use for loading the data. Pandas DataFrames can be loaded more quickly with :meth:`Connect.load_table_columnar` If ``data`` is a ``DataFrame``, that method will be used. Notes ----- Using a ``pandas.DataFrame`` should result in considerable faster performance than a list of tuples. Examples -------- >>> data = [(1, 'a'), (2, 'b'), (3, 'c')] >>> con.load_table('bar', data) See Also -------- load_table_columnar """ if method == 'infer': if _is_pandas(data): return self.load_table_columnar(table_name, data) elif method == 'columnar' and not _is_pandas(data): raise ValueError("'data' must be a DataFrame with " "`method='columnar'`. Got {} instead".format( type(data))) elif method != 'rows': raise ValueError("Method must be one of {{'infer', 'columnar', " "'rows'}}. Got {} instead".format(method)) input_data = _build_input_rows(data) self._client.load_table(self._session, table_name, input_data) def load_table_columnar(self, table_name, data, preserve_index=True): """Load a pandas DataFrame to the database Parameters ---------- table_name : str data : DataFrame preserve_index : bool, default True Whether to include the index of a pandas DataFrame when writing. Examples -------- >>> df = pd.DataFrame({"a": [1, 2, 3], "b": ['d', 'e', 'f']}) >>> con.load_table_columnar('foo', df, preserve_index=False) See Also -------- load_table """ from . import _pandas_loaders import pandas as pd if isinstance(data, pd.DataFrame): input_cols = _pandas_loaders.build_input_columnar( data, preserve_index=preserve_index) else: raise TypeError("Unknown type {}".format(type(data))) self._client.load_table_binary_columnar(self._session, table_name, input_cols)
class Connection(object): """Connect to your mapd database.""" def __init__( self, user=None, # type: Optional[str] password=None, # type: Optional[str] host=None, # type: Optional[str] port=9091, # type: Optional[int] dbname=None, # type: Optional[str] protocol='binary', # type: Optional[str] uri=None # type: Optional[str] ): # type: (...) -> None if uri is not None: if not all([ user is None, password is None, host is None, port == 9091, dbname is None, protocol == 'binary' ]): raise TypeError("Cannot specify both URI and other arguments") user, password, host, port, dbname, protocol = _parse_uri(uri) if host is None: raise TypeError("`host` parameter is required.") if protocol == "http": if not host.startswith(protocol): # the THttpClient expects http[s]://localhost host = protocol + '://' + host transport = THttpClient.THttpClient("{}:{}".format(host, port)) proto = TJSONProtocol.TJSONProtocol(transport) socket = None elif protocol == "binary": socket = TSocket.TSocket(host, port) transport = TTransport.TBufferedTransport(socket) proto = TBinaryProtocol.TBinaryProtocol(transport) else: raise TypeError("`protocol` should be one of ['http', 'binary'], ", "got {} instead".format(protocol)) self._user = user self._password = password self._host = host self._port = port self._dbname = dbname self._transport = transport self._protocol = protocol self._socket = socket self._closed = 0 try: self._transport.open() except TTransportException as e: if e.NOT_OPEN: err = OperationalError("Could not connect to database") six.raise_from(err, e) else: raise self._client = Client(proto) try: self._session = self._client.connect(user, password, dbname) except TMapDException as e: six.raise_from(_translate_exception(e), e) def __repr__(self): # type: () -> str tpl = ('Connection(mapd://{user}:***@{host}:{port}/{dbname}?protocol' '={protocol})') return tpl.format(user=self._user, host=self._host, port=self._port, dbname=self._dbname, protocol=self._protocol) def __del__(self): # type: () -> None self.close() def __enter__(self): return self.cursor() def __exit__(self, exc_type, exc_val, exc_tb): pass @property def closed(self): return self._closed def close(self): # type: () -> None """Disconnect from the database""" try: self._client.disconnect(self._session) except (TMapDException, AttributeError): pass finally: self._closed = 1 def commit(self): # type: () -> None """This is a noop, as mapd does not provide transactions. Implementing to comply with the specification. """ return None def execute(self, operation, parameters=None): # type: (str, Optional[Tuple]) -> Cursor """Execute a SQL statement Parameters ---------- operation : str A SQL statement to exucute Returns ------- c : Cursor """ c = Cursor(self) return c.execute(operation, parameters=parameters) def cursor(self): # type: () -> Cursor """Create a new :class:`Cursor` object attached to this connection.""" return Cursor(self) def select_ipc_gpu(self, operation, parameters=None, device_id=0, first_n=-1): """Execute a ``SELECT`` operation using GPU memory. Parameters ---------- operation : str A SQL statement parameters : dict, optional Parameters to insert into a parametrized query device_id : int GPU to return results to Returns ------- gdf : pygdf.GpuDataFrame Notes ----- This requires the option ``pygdf`` and ``libgdf`` libraries. An ``ImportError`` is raised if those aren't available. """ try: from pygdf.gpuarrow import GpuArrowReader # noqa from pygdf.dataframe import DataFrame # noqa except ImportError: raise ImportError("Install pygdf") if parameters is not None: operation = str(_bind_parameters(operation, parameters)) tdf = self._client.sql_execute_gdf(self._session, operation, device_id=device_id, first_n=first_n) return _parse_tdf_gpu(tdf) def select_ipc(self, operation, parameters=None, first_n=-1): """Execute a ``SELECT`` operation using CPU shared memory Parameters ---------- operation : str A SQL select statement parameters : dict, optional Parameters to insert for a parametrized query Returns ------- df : pandas.DataFrame Notes ----- This method requires pandas and pyarrow to be installed """ try: import pandas # noqa except ImportError: raise ImportError("pandas is required for `select_ipc`") try: import pyarrow # noqa except ImportError: raise ImportError("pyarrow is required for `select_ipc`") from .shm import load_buffer if parameters is not None: operation = str(_bind_parameters(operation, parameters)) tdf = self._client.sql_execute_df(self._session, operation, device_type=0, device_id=0, first_n=first_n) sm_buf = load_buffer(tdf.sm_handle, tdf.sm_size) df_buf = load_buffer(tdf.df_handle, tdf.df_size) schema = _load_schema(sm_buf) df = _load_data(df_buf, schema) return df