Example #1
import pytest
import pyarrow as pa
from collections.abc import Iterator


def test_key_value_metadata():
    m = pa.KeyValueMetadata({'a': 'A', 'b': 'B'})
    assert len(m) == 2
    assert m['a'] == b'A'
    assert m[b'a'] == b'A'
    assert m['b'] == b'B'
    assert 'a' in m
    assert b'a' in m
    assert 'c' not in m

    m1 = pa.KeyValueMetadata({'a': 'A', 'b': 'B'})
    m2 = pa.KeyValueMetadata(a='A', b='B')
    m3 = pa.KeyValueMetadata([('a', 'A'), ('b', 'B')])

    assert m1 != 2
    assert m1 == m2
    assert m2 == m3
    assert m1 == {'a': 'A', 'b': 'B'}
    assert m1 != {'a': 'A', 'b': 'C'}

    with pytest.raises(TypeError):
        pa.KeyValueMetadata({'a': 1})
    with pytest.raises(TypeError):
        pa.KeyValueMetadata({1: 'a'})
    with pytest.raises(TypeError):
        pa.KeyValueMetadata(a=1)

    expected = [(b'a', b'A'), (b'b', b'B')]
    result = [(k, v) for k, v in m3.items()]
    assert result == expected
    assert list(m3.items()) == expected
    assert list(m3.keys()) == [b'a', b'b']
    assert list(m3.values()) == [b'A', b'B']
    assert len(m3) == 2

    # test duplicate key support
    md = pa.KeyValueMetadata([
        ('a', 'alpha'),
        ('b', 'beta'),
        ('a', 'Alpha'),
        ('a', 'ALPHA'),
    ], b='BETA')

    expected = [(b'a', b'alpha'), (b'b', b'beta'), (b'a', b'Alpha'),
                (b'a', b'ALPHA'), (b'b', b'BETA')]
    assert len(md) == 5
    assert isinstance(md.keys(), Iterator)
    assert isinstance(md.values(), Iterator)
    assert isinstance(md.items(), Iterator)
    assert list(md.items()) == expected
    assert list(md.keys()) == [k for k, _ in expected]
    assert list(md.values()) == [v for _, v in expected]

    # lookups return the first occurrence
    assert md['a'] == b'alpha'
    assert md['b'] == b'beta'
    assert md.get_all('a') == [b'alpha', b'Alpha', b'ALPHA']
    assert md.get_all('b') == [b'beta', b'BETA']
    assert md.get_all('unknown') == []
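
# Usage sketch (not part of the test above): KeyValueMetadata is typically
# attached to a schema. The 'origin' key and the schema are hypothetical.
schema = pa.schema([pa.field('x', pa.int64())])
schema = schema.with_metadata(pa.KeyValueMetadata({'origin': 'demo'}))
assert schema.metadata == {b'origin': b'demo'}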
Example #2
import pytest
import pyarrow as pa


def test_key_value_metadata_duplicates():
    meta = pa.KeyValueMetadata({'a': '1', 'b': '2'})

    with pytest.raises(KeyError):
        pa.KeyValueMetadata(meta, a='3')
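
# Sketch for contrast (matching the behavior shown in Example #1): duplicate
# keys are accepted when passed as a list of pairs; only merging an existing
# metadata object with a conflicting keyword argument raises KeyError.
dup = pa.KeyValueMetadata([('a', '1'), ('a', '3')])
assert dup.get_all('a') == [b'1', b'3']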
Example #3
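# This function appears to be excerpted from pyarrow/pandas_compat.py;
# _get_columns_to_convert and construct_metadata are private helpers defined
# in the same module and are not part of the public API.
import pyarrow as pa
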
def dataframe_to_arrays(df,
                        schema,
                        preserve_index,
                        nthreads=1,
                        columns=None,
                        safe=True):
    (all_names, column_names, index_column_names, index_descriptors,
     index_columns, columns_to_convert,
     convert_fields) = _get_columns_to_convert(df, schema, preserve_index,
                                               columns)

    # NOTE(wesm): If nthreads=None, then we use a heuristic to decide whether
    # using a thread pool is worth it. Currently the heuristic is whether
    # nrows > 100 * ncols.
    if nthreads is None:
        nrows, ncols = len(df), len(df.columns)
        if nrows > ncols * 100:
            nthreads = pa.cpu_count()
        else:
            nthreads = 1

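    # Convert one pandas column to a pyarrow Array, enforcing the schema
    # field's type and nullability when a field is given.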
    def convert_column(col, field):
        if field is None:
            field_nullable = True
            type_ = None
        else:
            field_nullable = field.nullable
            type_ = field.type

        try:
            result = pa.array(col, type=type_, from_pandas=True, safe=safe)
        except (pa.ArrowInvalid, pa.ArrowNotImplementedError,
                pa.ArrowTypeError) as e:
            e.args += (
                "Conversion failed for column {!s} with type {!s}".format(
                    col.name, col.dtype), )
            raise e
        if not field_nullable and result.null_count > 0:
            raise ValueError("Field {} was non-nullable but pandas column "
                             "had {} null values".format(
                                 str(field), result.null_count))
        return result

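    # Convert the columns serially for nthreads == 1, otherwise fan the work
    # out across a thread pool.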
    if nthreads == 1:
        arrays = [
            convert_column(c, f)
            for c, f in zip(columns_to_convert, convert_fields)
        ]
    else:
        from concurrent import futures
        with futures.ThreadPoolExecutor(nthreads) as executor:
            arrays = list(
                executor.map(convert_column, columns_to_convert,
                             convert_fields))

    types = [x.type for x in arrays]

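    # No schema was supplied: infer one from the converted arrays' types.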
    if schema is None:
        fields = []
        for name, type_ in zip(all_names, types):
            name = name if name is not None else 'None'
            fields.append(pa.field(name, type_))
        schema = pa.schema(fields)

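    # Record pandas-specific metadata (column names, index info) on the
    # schema so the original DataFrame structure can be reconstructed.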
    pandas_metadata = construct_metadata(df, column_names, index_columns,
                                         index_descriptors, preserve_index,
                                         types)
    metadata = pa.KeyValueMetadata(schema.metadata or {}, **pandas_metadata)
    schema = schema.with_metadata(metadata)

    return arrays, schema
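
# Usage sketch: dataframe_to_arrays is internal; the public entry point that
# exercises this conversion is pa.Table.from_pandas. A minimal example,
# assuming pandas is installed:
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})
table = pa.Table.from_pandas(df)
assert b'pandas' in table.schema.metadata  # written by construct_metadata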