Example #1
def test_make_chunks_splits_with_num_zero(self):
    res = make_chunks(0)
    expected = []
    self.assertEqual(expected, res)
Example #2
def get_thor_file(connection,
                  thor_file,
                  max_workers=10,
                  chunk_size='auto',
                  max_attempts=3,
                  max_sleep=60,
                  dtype=None):
    """
    Return a thor file as a pandas.DataFrame.

    Note: Ordering of the resulting DataFrame is
    not deterministic and may not be the same as on the HPCC cluster.

    Parameters
    ----------
    connection: hpycc.Connection
        HPCC Connection instance, see also `Connection`.
    thor_file: str
        Name of thor file to be downloaded.
    max_workers: int, optional
        Number of concurrent threads to use when downloading file.
        Warning: too many may cause instability! 10 by default.
    chunk_size: int, optional
        Size of chunks to use when downloading file. If 'auto',
        this is rows / workers, capped at 325,000; if that suggests
        fewer than 10,000 rows per chunk, the file is fetched in a
        single chunk. If given explicitly, no limits are enforced.
        'auto' by default.
    max_attempts: int, optional
        Maximum number of times a chunk should attempt to be
        downloaded in the case of an exception being raised.
        3 by default.
    max_sleep: int, optional
        Maximum time, in seconds, to sleep between attempts.
        The true sleep time is a random int between
        `max_sleep` * 0.75 and `max_sleep`. 60 by default.
    dtype: type name or dict of col -> type, optional
        Data type for data or columns. E.g. {'a': np.float64, 'b':
        np.int32}. If converters are specified, they will be applied
        INSTEAD of dtype conversion. If None, or columns are missing
        from the provided dict, they will be converted to one of
        bool, str or int based on the HPCC datatype. None by default.

    Returns
    -------
    df: pandas.DataFrame
        Thor file as a pandas.DataFrame.

    See Also
    --------
    save_thor_file

    Examples
    --------
    >>> import hpycc
    >>> import pandas
    >>> conn = hpycc.Connection("user")
    >>> df = pandas.DataFrame({"col1": [1, 2, 3]})
    >>> df.to_csv("example.csv", index=False)
    >>> hpycc.spray_file(conn, "example.csv", "example")
    >>> hpycc.get_thor_file(conn, "example")
        col1
    0     1
    1     2
    2     3

    >>> import hpycc
    >>> import pandas
    >>> conn = hpycc.Connection("user")
    >>> df = pandas.DataFrame({"col1": [1, 2, 3]})
    >>> df.to_csv("example.csv", index=False)
    >>> hpycc.spray_file(conn, "example.csv", "example")
    >>> hpycc.get_thor_file(conn, "example", dtype=str)
        col1
    0     '1'
    1     '2'
    2     '3'

    """

    resp = connection.get_chunk_from_hpcc(thor_file, 0, 1, max_attempts,
                                          max_sleep)
    try:
        wuresultresponse = resp["WUResultResponse"]
        schema_str = wuresultresponse["Result"]["XmlSchema"]["xml"]
        schema = parse_schema_from_xml(schema_str)
        schema = apply_custom_dtypes(schema, dtype)
        num_rows = wuresultresponse["Total"]
    except (KeyError, TypeError) as exc:
        msg = "Can't find schema in returned json: {}".format(resp)
        raise type(exc)(msg) from exc

    if chunk_size == 'auto':  # Automagically optimise. TODO: we could use width too.
        suggested_size = ceil(num_rows / max_workers)
        # Don't chunk small stuff; cap big stuff so there are more chunks than workers.
        chunk_size = num_rows if suggested_size < 10000 else min(suggested_size, 325000)

    if not num_rows:  # If there are no rows to fetch, return an empty DataFrame.
        return pd.DataFrame(columns=schema.keys())

    chunks = filechunker.make_chunks(num_rows, chunk_size)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(connection.get_logical_file_chunk, thor_file,
                            start_row, n_rows, max_attempts, max_sleep)
            for start_row, n_rows in chunks
        ]

    results = {key: [] for key in schema.keys()}
    for future in as_completed(futures):
        chunk = future.result()
        for key in results:
            results[key].extend(chunk[key])
        del chunk
    results = pd.DataFrame(results)

    for col, details in schema.items():
        typ = details['type']
        if details['is_a_set']:  # TODO: Nested DataFrames are also caught here. Open issue to fix.
            results[col] = results[col].map(
                lambda x: [typ(i) for i in x["Item"]])
        else:
            try:
                results[col] = results[col].astype(typ)
            except OverflowError:  # Ints too large to convert cleanly; fall back to float.
                results[col] = results[col].astype('float')
    return results
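
The 'auto' chunk-size rule above is easy to sanity-check in isolation. The helper below is a hypothetical restatement of that arithmetic (the name `auto_chunk_size` is not part of hpycc); it mirrors the branch in `get_thor_file` so the behaviour can be checked without a cluster.

from math import ceil


def auto_chunk_size(num_rows, max_workers=10):
    # Mirror of get_thor_file's 'auto' logic: aim for rows / workers,
    # fetch small files in a single chunk, and cap chunks at 325,000 rows
    # so large files produce more chunks than workers.
    suggested_size = ceil(num_rows / max_workers)
    if suggested_size < 10000:
        return num_rows
    return min(suggested_size, 325000)


assert auto_chunk_size(50000, 10) == 50000        # 5,000 per worker -> one chunk
assert auto_chunk_size(2000000, 10) == 200000     # plain rows / workers
assert auto_chunk_size(10000000, 10) == 325000    # capped
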
Example #3
def test_make_chunks_chunksize_equal_zero(self):
    with self.assertRaises(ZeroDivisionError):
        make_chunks(10, 0)
Example #4
def test_make_chunks_uses_10000_as_default_chunksize(self):
    res = make_chunks(10000)
    expected = [(0, 10000)]
    self.assertEqual(expected, res)
Example #5
def test_make_chunks_chunks_num_greater_than_chunksize(self):
    res = make_chunks(10, 3)
    expected = [(0, 3), (3, 3), (6, 3), (9, 1)]
    self.assertEqual(expected, res)
Example #6
def test_make_chunks_chunks_num_less_than_chunksize(self):
    res = make_chunks(3, 10)
    expected = [(0, 3)]
    self.assertEqual(expected, res)
Example #7
def test_make_chunks_chunks_sum_correctly(self):
    res = make_chunks(500, 3)
    summed = sum([i[1] for i in res])
    self.assertEqual(summed, 500)
Example #8
def test_make_chunks_splits_with_two_full_chunks(self):
    res = make_chunks(20, 10)
    expected = [(0, 10), (10, 10)]
    self.assertEqual(expected, res)
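
Taken together, the make_chunks tests above pin down the contract: chunks are (start_row, num_rows) tuples, the default chunk size is 10,000, the final chunk holds the remainder, and a chunk size of 0 raises ZeroDivisionError. A minimal sketch consistent with those tests (the real filechunker.make_chunks may differ in details) is:

from math import ceil


def make_chunks(num, chunk_size=10000):
    # Split `num` rows into (start_row, num_rows) tuples of at most
    # `chunk_size` rows each; chunk_size=0 raises ZeroDivisionError.
    num_chunks = ceil(num / chunk_size)
    return [(i * chunk_size, min(chunk_size, num - i * chunk_size))
            for i in range(num_chunks)]
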
Example #9
def spray_file(connection, source_file, logical_file, overwrite=False,
               expire=None, chunk_size=100000, max_workers=5,
               delete_workunit=True):
    """
    Spray a file to an HPCC logical file, bypassing the landing zone.

    Parameters
    ----------
    connection: `Connection`
        HPCC Connection instance, see also `Connection`.
    source_file: str, pd.DataFrame
        A pandas DataFrame or the path to a csv.
    logical_file: str
        Logical file name on THOR.
    overwrite: bool, optional
        Should the file overwrite any pre-existing logical file.
        False by default.
    chunk_size: int, optional
        Size of chunks to use when spraying file. 100000 by
        default.
    max_workers: int, optional
        Number of concurrent threads to use when spraying.
        Warning: too many will likely cause either your machine or
        your cluster to crash! 5 by default.
    expire: int, optional
        Number of days until the produced logical file expires.
        None (i.e. no expiry) by default.
    delete_workunit: bool, optional
        Delete the workunit once completed. True by default.

    Returns
    -------
    None

    """
    if isinstance(source_file, pd.DataFrame):
        df = source_file
    elif isinstance(source_file, str):
        df = pd.read_csv(source_file, encoding='latin')
    else:
        raise TypeError(
            "source_file must be a pandas.DataFrame or a path to a "
            "csv, got {}".format(type(source_file)))

    if not logical_file.startswith('~'):
        # Actually emit the warning (requires `import warnings` at module level).
        warnings.warn("Logical file name (%s) does not start with '~', so it "
                      "may not be sprayed to root." % logical_file,
                      SyntaxWarning)

    record_set = _make_record_set(df)

    chunks = make_chunks(len(df), chunk_size=chunk_size)

    print('Note: any unicode characters will be converted to ASCII. If you '
          'are getting odd errors, you may want to clean up your unicode '
          'data before spraying.')
    stringified_rows = (_stringify_rows(df, start_row, num_rows)
                        for start_row, num_rows in chunks)

    target_names = ["~TEMPHPYCC::{}from{}to{}".format(
            logical_file.replace("~", ""), start_row, start_row + num_rows)
        for start_row, num_rows in chunks]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(_spray_stringified_data, connection, row,
                            record_set, name, overwrite, delete_workunit)
            for row, name in zip(stringified_rows, target_names)]
        wait(futures)
        for future in futures:
            future.result()  # Re-raise any exceptions from the worker threads.

    _concatenate_logical_files(connection, target_names, logical_file,
                               record_set, overwrite, expire, delete_workunit)

    for tmp in target_names:
        delete_logical_file(connection, tmp, delete_workunit)
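
For completeness, a minimal end-to-end usage sketch, assuming a reachable HPCC cluster; the "user" credentials and the "~example::sprayed" name are placeholders:

import pandas as pd
import hpycc

conn = hpycc.Connection("user")
df = pd.DataFrame({"col1": range(250000), "col2": ["x"] * 250000})

# 250,000 rows with chunk_size=100000 gives three temporary logical files,
# which spray_file concatenates into the target and then deletes.
hpycc.spray_file(conn, df, "~example::sprayed", overwrite=True,
                 chunk_size=100000, max_workers=5)

# Read it back; ordering is not guaranteed to match the original frame.
round_trip = hpycc.get_thor_file(conn, "~example::sprayed")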