Пример #1
0
    def test_set_api_result_failed(self, mock_api, m_sleep):
        poller = _create_poller_mock("failed")

        result = CivisFuture(poller, (1, 2))
        assert result._state == 'FINISHED'
        with pytest.raises(CivisJobFailure):
            result.result()
        with pytest.raises(CivisJobFailure):
            result.outputs()
Пример #2
0
def run_template(id, arguments, JSONValue=False, client=None):
    """Run a template and return the results.

    Parameters
    ----------
    id: int
        The template id to be run.
    arguments: dict
        Dictionary of arguments to be passed to the template.
    JSONValue: bool, optional
        If True, will return the JSON output of the template.
        If False, will return the file ids associated with the
        output results.
    client: :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.

    Returns
    -------
    output: dict
        If JSONValue = False, dictionary of file ids with the keys
        being their output names.
        If JSONValue = True, JSON dict containing the results of the
        template run. Expects only a single JSON result. Will return
        nothing if either there is no JSON result or there is more
        than 1 JSON result.


    """
    if client is None:
        client = APIClient()
    job = client.scripts.post_custom(id, arguments=arguments)
    run = client.scripts.post_custom_runs(job.id)
    fut = CivisFuture(client.scripts.get_custom_runs, (job.id, run.id),
                      client=client)
    fut.result()
    outputs = client.scripts.list_custom_runs_outputs(job.id, run.id)
    if JSONValue:
        json_output = [
            o.value for o in outputs if o.object_type == "JSONValue"
        ]
        if len(json_output) == 0:
            log.warning("No JSON output for template {}".format(id))
            return
        if len(json_output) > 1:
            log.warning("More than 1 JSON output for template {}"
                        " -- returning only the first one.".format(id))
        # Note that the cast to a dict is to convert
        # an expected Response object.
        return dict(json_output[0])
    else:
        file_ids = {o.name: o.object_id for o in outputs}
        return file_ids
Пример #3
0
    def test_set_api_result_failed(self, mock_subscribe, mock_api):
        mock_pubnub = mock.Mock()
        mock_pubnub.unsubscribe_all.return_value = None
        mock_subscribe.return_value = mock_pubnub
        poller = mock.Mock()
        api_result = mock.Mock()
        api_result.state = 'failed'

        result = CivisFuture(poller, (1, 2))
        result._set_api_result(api_result)
        assert mock_pubnub.unsubscribe_all.call_count == 1
        assert result._state == 'FINISHED'
        with pytest.raises(CivisJobFailure):
            result.result()
Пример #4
0
def civis_to_multifile_csv(sql, database, job_name=None, api_key=None,
                           client=None, credential_id=None,
                           include_header=True,
                           compression='none', delimiter='|',
                           unquoted=False, prefix=None,
                           polling_interval=None, hidden=True):
    """Unload the result of SQL query and return presigned urls.

    This function is intended for unloading large queries/tables from redshift
    as it uses a 'PARALLEL ON' S3 unload. It returns a similar manifest file
    to conventional S3 UNLOAD statements except the CSV parts are accessible
    via both files endpoint IDs and presigned S3 urls.

    Parameters
    ----------
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Execute the query against this database. Can be the database name
        or ID.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be
        used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The database credential ID.  If ``None``, the default credential
        will be used.
    include_header: bool, optional
        If ``True`` include a key in the returned dictionary containing a list
        of column names. Default: ``True``.
    compression: str, optional
        Type of compression to use, if any. One of ``'none'``, ``'zip'``, or
        ``'gzip'``. Default ``'none'``.
    delimiter, str: optional
        Which delimiter to use, if any. One of ``','``, ``'\t'``, or
        ``'|'``. Default: ``'|'``.
    unquoted: bool, optional
        Whether or not to quote fields. Default: ``False``.
    prefix: str, optional
        A user specified filename prefix for the output file to have. Default:
        ``None``.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.

    Returns
    -------
    unload_manifest: dict
        A dictionary resembling an AWS manifest file. Has the following keys:
        ``'header'``, ``'query'``, ``'entries'``, respresenting the columns
        from the query, the query itself, and a list of dictionaries for each
        unloaded CSV part, each containing its file ``'id'``, ``'name'``,
        ``'size'``, and unsigned and signed S3 urls, ``'url'`` and
        ``'url_signed'``, respectively.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.my_big_table"
    >>> database = "my_database"
    >>> delimiter = "|"
    >>> manifest = civis_multipart_unload(sql, database, delimiter=delimiter)
    >>> ids = [file['id'] for file in manifest['files']]
    >>> buf = BytesIO()
    >>> civis_to_file(ids[0], buf)
    >>> buf.seek(0)
    >>> df = pd.read_csv(buf, delimiter=delimiter)

    See Also
    --------
    civis.APIClient.scripts.post_sql
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    delimiter = DELIMITERS.get(delimiter)
    assert delimiter, "delimiter must be one of {}".format(DELIMITERS.keys())

    csv_settings = dict(include_header=include_header,
                        compression=compression,
                        column_delimiter=delimiter,
                        unquoted=unquoted,
                        filename_prefix=prefix,
                        force_multifile=True)
    script_id, run_id = _sql_script(client, sql, database, job_name,
                                    credential_id, hidden,
                                    csv_settings=csv_settings)

    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval, client=client,
                      poll_on_creation=False)
    outputs = fut.result()["output"]
    if not outputs:
        raise EmptyResultError("Unload query {} returned no manifest."
                               .format(script_id))

    buf = io.BytesIO()
    civis_to_file(outputs[0]['file_id'], buf)
    txt = io.TextIOWrapper(buf, encoding='utf-8')
    txt.seek(0)
    unload_manifest = json.load(txt)

    return unload_manifest
Пример #5
0
def read_civis_sql(sql, database, use_pandas=False, job_name=None,
                   api_key=None, client=None, credential_id=None,
                   polling_interval=None, archive=False,
                   hidden=True, **kwargs):
    """Read data from Civis using a custom SQL string.

    Parameters
    ----------
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Execute the query against this database. Can be the database name
        or ID.
    use_pandas : bool, optional
        If ``True``, return a :class:`pandas:pandas.DataFrame`. Otherwise,
        return a list of results from :func:`python:csv.reader`.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be
        used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The database credential ID.  If ``None``, the default credential
        will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the import job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.
    **kwargs : kwargs
        Extra keyword arguments are passed into
        :func:`pandas:pandas.read_csv` if `use_pandas` is ``True`` or
        passed into :func:`python:csv.reader` if `use_pandas` is
        ``False``.

    Returns
    -------
    data : :class:`pandas:pandas.DataFrame` or list
        A list of rows (with header as first row) if `use_pandas` is
        ``False``, otherwise a `pandas` `DataFrame`. Note that if
        `use_pandas` is ``False``, no parsing of types is performed and
        each row will be a list of strings.

    Raises
    ------
    ImportError
        If `use_pandas` is ``True`` and `pandas` is not installed.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> df = read_civis_sql(sql, "my_database", use_pandas=True)
    >>> col_a = df["column_a"]

    >>> data = read_civis_sql(sql, "my_database")
    >>> columns = data.pop(0)
    >>> col_a_index = columns.index("column_a")
    >>> col_a = [row[col_a_index] for row in data]

    Notes
    -----
    This reads the data into memory.

    See Also
    --------
    civis.io.read_civis : Read directly into memory without SQL.
    civis.io.civis_to_csv : Write directly to a CSV file.
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if use_pandas and NO_PANDAS:
        raise ImportError("use_pandas is True but pandas is not installed.")
    if archive:
        warnings.warn("`archive` is deprecated and will be removed in v2.0.0. "
                      "Use `hidden` instead.", FutureWarning)
    script_id, run_id = _sql_script(client, sql, database,
                                    job_name, credential_id,
                                    hidden=hidden)
    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval, client=client,
                      poll_on_creation=False)
    if archive:

        def f(x):
            return client.scripts.put_sql_archive(script_id, True)

        fut.add_done_callback(f)
    fut.result()
    outputs = client.scripts.get_sql_runs(script_id, run_id)["output"]
    if not outputs:
        raise EmptyResultError("Query {} returned no output."
                               .format(script_id))
    url = outputs[0]["path"]
    if use_pandas:
        data = pd.read_csv(url, **kwargs)
    else:
        r = requests.get(url)
        r.raise_for_status()
        data = list(csv.reader(StringIO(r.text), **kwargs))
    return data
Пример #6
0
def read_civis_sql(sql,
                   database,
                   use_pandas=False,
                   job_name=None,
                   api_key=None,
                   client=None,
                   credential_id=None,
                   polling_interval=None,
                   archive=False,
                   hidden=True,
                   **kwargs):
    """Read data from Civis using a custom SQL string.

    The custom SQL string will be executed twice; once to attempt to
    retrieve headers and once to retrieve the data. This is done to
    use a more performant method for retrieving the data. The first
    execution of the custom SQL is controlled such that changes in
    state cannot occur (e.g., INSERT, UPDATE, DELETE, etc.).

    Parameters
    ----------
    sql : str, optional
        The SQL select string to be executed.
    database : str or int
        Execute the query against this database. Can be the database name
        or ID.
    use_pandas : bool, optional
        If ``True``, return a :class:`pandas:pandas.DataFrame`. Otherwise,
        return a list of results from :func:`python:csv.reader`.
    job_name : str, optional
        A name to give the job. If omitted, a random job name will be
        used.
    api_key : DEPRECATED str, optional
        Your Civis API key. If not given, the :envvar:`CIVIS_API_KEY`
        environment variable will be used.
    client : :class:`civis.APIClient`, optional
        If not provided, an :class:`civis.APIClient` object will be
        created from the :envvar:`CIVIS_API_KEY`.
    credential_id : str or int, optional
        The database credential ID.  If ``None``, the default credential
        will be used.
    polling_interval : int or float, optional
        Number of seconds to wait between checks for query completion.
    archive : bool, optional (deprecated)
        If ``True``, archive the import job as soon as it completes.
    hidden : bool, optional
        If ``True`` (the default), this job will not appear in the Civis UI.
    **kwargs : kwargs
        Extra keyword arguments are passed into
        :func:`pandas:pandas.read_csv` if `use_pandas` is ``True`` or
        passed into :func:`python:csv.reader` if `use_pandas` is
        ``False``.

    Returns
    -------
    data : :class:`pandas:pandas.DataFrame` or list
        A list of rows (with header as first row) if `use_pandas` is
        ``False``, otherwise a `pandas` `DataFrame`. Note that if
        `use_pandas` is ``False``, no parsing of types is performed and
        each row will be a list of strings.

    Raises
    ------
    ImportError
        If `use_pandas` is ``True`` and `pandas` is not installed.

    Examples
    --------
    >>> sql = "SELECT * FROM schema.table"
    >>> df = read_civis_sql(sql, "my_database", use_pandas=True)
    >>> col_a = df["column_a"]

    >>> data = read_civis_sql(sql, "my_database")
    >>> columns = data.pop(0)
    >>> col_a_index = columns.index("column_a")
    >>> col_a = [row[col_a_index] for row in data]

    Notes
    -----
    This reads the data into memory.

    See Also
    --------
    civis.io.read_civis : Read directly into memory without SQL.
    civis.io.civis_to_csv : Write directly to a CSV file.
    """
    if client is None:
        client = APIClient(api_key=api_key, resources='all')
    if use_pandas and NO_PANDAS:
        raise ImportError("use_pandas is True but pandas is not installed.")
    if archive:
        warnings.warn(
            "`archive` is deprecated and will be removed in v2.0.0. "
            "Use `hidden` instead.", FutureWarning)

    db_id = client.get_database_id(database)
    credential_id = credential_id or client.default_credential

    # determine if we can request headers separately; if we can then Platform
    # will perform a parallel unload which is significantly more performant
    # we start by assuming headers are requested
    ovrd_include_header, headers = _include_header(client, sql, True, db_id,
                                                   credential_id,
                                                   polling_interval)

    # if we retrieved headers then we are performing a parallel unload
    # in which case we need to specify backslash as the escapechar
    if headers is not None:
        kwargs['escapechar'] = '\\'

    csv_settings = dict(include_header=ovrd_include_header, compression='gzip')

    script_id, run_id = _sql_script(client,
                                    sql,
                                    db_id,
                                    job_name,
                                    credential_id,
                                    csv_settings=csv_settings,
                                    hidden=hidden)
    fut = CivisFuture(client.scripts.get_sql_runs, (script_id, run_id),
                      polling_interval=polling_interval,
                      client=client,
                      poll_on_creation=False)
    if archive:

        def f(x):
            return client.scripts.put_sql_archive(script_id, True)

        fut.add_done_callback(f)
    fut.result()
    outputs = client.scripts.get_sql_runs(script_id, run_id)["output"]
    if not outputs:
        raise EmptyResultError(
            "Query {} returned no output.".format(script_id))

    url = outputs[0]["path"]
    file_id = outputs[0]["file_id"]
    log.debug('Exported results to Civis file %s (%s)',
              outputs[0]["output_name"], file_id)

    if use_pandas:
        # allows users to enter their own names parameter
        _kwargs = {'names': headers}
        _kwargs.update(kwargs)
        _kwargs['compression'] = 'gzip'

        data = pd.read_csv(url, **_kwargs)
    else:
        response = requests.get(url, stream=True)
        response.raise_for_status()

        with StringIO() as buf:
            if headers:
                buf.write(','.join(headers) + '\n')
            _decompress_stream(response, buf, write_bytes=False)
            buf.seek(0)
            data = list(csv.reader(buf, **kwargs))

    return data