Example #1
def itermergeduplicates(table, key, missing):
    it = iter(table)
    hdr, it = iterpeek(it)
    flds = list(map(text_type, hdr))

    # determine output fields
    if isinstance(key, string_types):
        outhdr = [key]
        keyflds = set([key])
    else:
        outhdr = list(key)
        keyflds = set(key)
    valflds = [f for f in flds if f not in keyflds]
    valfldidxs = [flds.index(f) for f in valflds]
    outhdr.extend(valflds)
    yield tuple(outhdr)

    # do the work
    for k, grp in rowgroupby(it, key):
        grp = list(grp)
        if isinstance(key, string_types):
            outrow = [k]
        else:
            outrow = list(k)
        mergedvals = [
            set(row[i] for row in grp if len(row) > i and row[i] != missing)
            for i in valfldidxs
        ]
        normedvals = [
            vals.pop() if len(vals) == 1 else
            missing if len(vals) == 0 else Conflict(vals)
            for vals in mergedvals
        ]
        outrow.extend(normedvals)
        yield tuple(outrow)
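
All of the snippets on this page lean on petl's internal iterpeek helper: it looks ahead at the first item (or first n items) of an iterator and returns both the peeked value(s) and an equivalent iterator that still yields every item, so a header or a sample can be inspected without losing rows. A minimal sketch of that contract, inferred only from the behaviour visible in these examples (a single item when n is 1, a list otherwise):

from itertools import chain, islice

def iterpeek_sketch(it, n=1):
    """Peek at up to n items, then return (peeked, restored_iterator)."""
    it = iter(it)
    head = list(islice(it, n))     # materialise the lookahead
    restored = chain(head, it)     # stitch it back in front of the tail
    if n == 1:
        return head[0], restored   # single-item form: hdr, it = iterpeek(it)
    return head, restored          # sample form: peek, it = iterpeek(it, sample)
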
Example #2
def valuestoarray(vals, dtype=None, count=-1, sample=1000):
    """
    Load values from a table column into a `numpy <http://www.numpy.org/>`_
    array, e.g.::

        >>> import petl as etl
        >>> table = [('foo', 'bar', 'baz'),
        ...          ('apples', 1, 2.5),
        ...          ('oranges', 3, 4.4),
        ...          ('pears', 7, .1)]
        >>> table = etl.wrap(table)
        >>> table.values('bar').array()
        array([1, 3, 7])
        >>> # specify dtype
        ... table.values('bar').array(dtype='i4')
        array([1, 3, 7], dtype=int32)

    """

    import numpy as np
    it = iter(vals)
    if dtype is None:
        peek, it = iterpeek(it, sample)
        dtype = np.array(peek).dtype
    a = np.fromiter(it, dtype=dtype, count=count)
    return a
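
When no dtype is given, valuestoarray materialises up to sample values and lets numpy choose a common type for the whole column. The same inference can be reproduced directly:

import numpy as np

# numpy picks the narrowest common dtype for the sample, and that dtype
# then governs np.fromiter over the rest of the column
print(np.array([1, 3, 7]).dtype)      # int64 (platform dependent)
print(np.array([1, 3, 7.5]).dtype)    # float64, promoted by the float
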
Example #3
def iterrowreduce(source, key, reducer, header):
    if header is None:
        # output header from source
        header, source = iterpeek(source)
    yield tuple(header)
    for key, rows in rowgroupby(source, key):
        yield tuple(reducer(key, rows))
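
This generator appears to back petl's public rowreduce, which groups rows by key and applies a reducer to each group; because rowgroupby groups consecutive rows (like itertools.groupby), the input must already be sorted by the key. A hedged usage sketch (the reducer name sumbar is just illustrative):

import petl as etl

table1 = [['foo', 'bar'],
          ['a', 3],
          ['a', 7],
          ['b', 2]]

def sumbar(key, rows):
    # the reducer receives the group key and an iterable of that group's rows
    return [key, sum(row[1] for row in rows)]

table2 = etl.rowreduce(table1, key='foo', reducer=sumbar,
                       header=['foo', 'barsum'])
# yields ('foo', 'barsum'), then ('a', 10) and ('b', 2)
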
Example #4
def _build_schema_from_values(table, sample):
    # table2: try not to advance the iterator
    samples, table2 = iterpeek(table, sample + 1)
    props = fieldnames(samples)
    peek = skip(samples, 1)
    schema_fields = _build_schema_fields_from_values(peek, props)
    schema_source = _build_schema_with(schema_fields)
    return schema_source, table2
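
The pattern here is: peek sample + 1 rows (the header plus up to sample data rows), read the field names from the peeked header, and build the schema from the data rows alone, while table2 remains a full, un-advanced table. A rough illustration using the iterpeek sketch from above (fieldnames, skip and the _build_schema_* helpers are petl internals):

rows = iter([('name', 'qty'), ('apples', 1), ('pears', 7)])
samples, rows2 = iterpeek_sketch(rows, 3)        # header + up to 2 data rows
header, data = samples[0], samples[1:]           # skip(samples, 1) drops the header
coltypes = [type(v).__name__ for v in data[0]]   # naive per-column type guess
print(header, coltypes)                          # ('name', 'qty') ['str', 'int']
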
Example #5
def tobcolz(table, dtype=None, sample=1000, **kwargs):
    """Load data into a bcolz ctable, e.g.::

        >>> import petl as etl
        >>> table = [('foo', 'bar', 'baz'),
        ...          ('apples', 1, 2.5),
        ...          ('oranges', 3, 4.4),
        ...          ('pears', 7, .1)]
        >>> ctbl = etl.tobcolz(table)
        >>> ctbl
        ctable((3,), [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])
          nbytes: 132; cbytes: 1023.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
        [('apples', 1, 2.5) ('oranges', 3, 4.4) ('pears', 7, 0.1)]
        >>> ctbl.names
        ['foo', 'bar', 'baz']
        >>> ctbl['foo']
        carray((3,), <U7)
          nbytes := 84; cbytes := 511.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
          chunklen := 18724; chunksize: 524272; blocksize: 0
        ['apples' 'oranges' 'pears']

    Other keyword arguments are passed through to the ctable constructor.

    .. versionadded:: 1.1.0

    """

    import bcolz
    import numpy as np

    it = iter(table)
    peek, it = iterpeek(it, sample)
    hdr = next(it)
    # numpy is fussy about having tuples, so make sure each row is a tuple
    it = (tuple(row) for row in it)
    flds = list(map(text_type, hdr))
    dtype = construct_dtype(flds, peek, dtype)

    # create ctable
    kwargs.setdefault('expectedlen', 1000000)
    kwargs.setdefault('mode', 'w')
    ctbl = bcolz.ctable(np.array([], dtype=dtype), **kwargs)

    # fill chunk-wise
    chunklen = sum(ctbl.cols[name].chunklen
                   for name in ctbl.names) // len(ctbl.names)
    while True:
        data = list(itertools.islice(it, chunklen))
        data = np.array(data, dtype=dtype)
        ctbl.append(data)
        if len(data) < chunklen:
            break

    ctbl.flush()
    return ctbl
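
The fill loop is a standard chunking idiom: repeatedly slice a fixed-size block off the iterator and stop after the first short block. Factored out on its own (a generic sketch, not part of petl):

import itertools

def iter_chunks(it, size):
    # yield successive lists of up to `size` items; a short (possibly
    # empty) final block signals exhaustion, as in the loop above
    while True:
        block = list(itertools.islice(it, size))
        if block:
            yield block
        if len(block) < size:
            return

for block in iter_chunks(iter(range(7)), 3):
    print(block)    # [0, 1, 2] then [3, 4, 5] then [6]
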
Example #6
def _fix_missing_headers(table, schema):
    '''add missing column headers from the schema'''
    if schema is None or 'fields' not in schema:
        return table
    # table2: try not to advance the iterator
    sample, table2 = iterpeek(table, 2)
    cols = fieldnames(sample)
    headers = _get_schema_header_names(schema)
    if len(cols) >= len(headers):
        return table2
    table3 = setheader(table2, headers)
    return table3
Example #7
def _fix_missing_headers(table, schema):
    '''add missing column headers from the schema'''
    if schema is None or 'fields' not in schema:
        return table
    # table2: try not to advance the iterator
    sample, table2 = iterpeek(table, 2)
    cols = fieldnames(sample)
    fields = schema.get('fields')
    if len(cols) >= len(fields):
        return table2
    header = [field.get('name') for field in fields]
    table3 = setheader(table2, header)
    return table3
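
Both variants end by delegating to petl's setheader, which replaces a table's header row; a hedged usage sketch:

import petl as etl

# table whose header row is missing the second column name
table1 = [['foo'], ['a', 1], ['b', 2]]
table2 = etl.setheader(table1, ['foo', 'bar'])   # swap in the full header
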
Example #8
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    lit = iter(left)
    lhdr = next(lit)

    rhdr, rit = iterpeek(right)  # need the whole lot to pass to lookup
    rlookup = lookupone(rit, rkey, strict=False)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from left table
    lgetk = operator.itemgetter(*lkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(str(lprefix) + str(f))
                  for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(str(rprefix) + str(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join rows
    def joinrows(_lrow, _rrow):
        # start with the left row
        _outrow = list(_lrow)
        # extend with non-key values from the right row
        _outrow.extend(rgetv(_rrow))
        return tuple(_outrow)

    for lrow in lit:
        k = lgetk(lrow)
        if k in rlookup:
            rrow = rlookup[k]
            yield joinrows(lrow, rrow)
        else:
            outrow = list(lrow)  # start with the left row
            # extend with missing values in place of the right row
            outrow.extend([missing] * len(rvind))
            yield tuple(outrow)
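
This looks like the engine behind petl's hashlookupjoin: the right table is loaded into a one-row-per-key dict (lookupone with strict=False keeps the first match), so each left row joins in constant time, and missing fills in where no match exists. A hedged sketch against the public API:

import petl as etl

left = [['id', 'color'], [1, 'blue'], [2, 'red'], [4, 'purple']]
right = [['id', 'shape'], [1, 'circle'], [2, 'square']]
joined = etl.hashlookupjoin(left, right, key='id', missing=None)
# id 4 has no match on the right, so its 'shape' comes out as None
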
Example #9
def iterjlines(f, header, missing):
    it = iter(f)

    if header is None:
        header = list()
        peek, it = iterpeek(it, 1)
        json_obj = json.loads(peek)
        if hasattr(json_obj, 'keys'):
            header += [k for k in json_obj.keys() if k not in header]
    yield tuple(header)

    for o in it:
        json_obj = json.loads(o)
        yield tuple(json_obj[f] if f in json_obj else missing for f in header)
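
A small driver for this generator (assuming iterpeek is in scope): note that header discovery only peeks one line, so keys that first appear in later records are silently dropped.

import io

f = io.StringIO('{"foo": "a", "bar": 1}\n{"foo": "b"}\n')
for row in iterjlines(f, header=None, missing=None):
    print(row)
# ('foo', 'bar')
# ('a', 1)
# ('b', None)
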
Example #10
def iterdicts(dicts, header, sample, missing):
    it = iter(dicts)

    # determine header row
    if header is None:
        # discover fields
        header = list()
        peek, it = iterpeek(it, sample)
        for o in peek:
            if hasattr(o, 'keys'):
                header += [k for k in o.keys() if k not in header]
    yield tuple(header)

    # generate data rows
    for o in it:
        yield tuple(o[f] if f in o else missing for f in header)
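
This generator drives petl's public fromdicts; unlike the JSON-lines reader above, it peeks up to sample records and unions their keys, so fields that first appear later in the sample still make it into the header. A hedged usage sketch:

import petl as etl

dicts = [{'foo': 'a', 'bar': 1},
         {'foo': 'b', 'baz': 2}]    # 'baz' appears only in the second record
table = etl.fromdicts(dicts)
# header becomes ('foo', 'bar', 'baz'); absent values fill with None
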
Example #11
def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Load data from the given `table` into a
    `numpy <http://www.numpy.org/>`_ structured array. E.g.::

        >>> import petl as etl
        >>> table = [('foo', 'bar', 'baz'),
        ...          ('apples', 1, 2.5),
        ...          ('oranges', 3, 4.4),
        ...          ('pears', 7, .1)]
        >>> a = etl.toarray(table)
        >>> a
        array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)],
              dtype=(numpy.record, [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')]))
        >>> # the dtype can be specified as a string
        ... a = etl.toarray(table, dtype='a4, i2, f4')
        >>> a
        array([(b'appl', 1, 2.5), (b'oran', 3, 4.400000095367432),
               (b'pear', 7, 0.10000000149011612)],
              dtype=[('foo', 'S4'), ('bar', '<i2'), ('baz', '<f4')])
        >>> # the dtype can also be partially specified
        ... a = etl.toarray(table, dtype={'foo': 'a4'})
        >>> a
        array([(b'appl', 1, 2.5), (b'oran', 3, 4.4), (b'pear', 7, 0.1)],
              dtype=[('foo', 'S4'), ('bar', '<i8'), ('baz', '<f8')])

    If the dtype is not completely specified, `sample` rows will be
    examined to infer an appropriate dtype.

    """
    
    import numpy as np
    it = iter(table)
    peek, it = iterpeek(it, sample)
    hdr = next(it)
    flds = list(map(str, hdr))
    dtype = construct_dtype(flds, peek, dtype)

    # numpy is fussy about having tuples, so make sure each row is a tuple
    it = (tuple(row) for row in it)
    sa = np.fromiter(it, dtype=dtype, count=count)

    return sa
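
For the reverse direction petl provides fromarray, so a table can round-trip through a structured array; a hedged sketch:

import petl as etl

table = [('foo', 'bar'), ('apples', 1), ('pears', 7)]
a = etl.toarray(table)
table2 = etl.fromarray(a)    # field names are recovered from the array's dtype
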
Example #12
def toxml(table,
          target=None,
          root=None,
          head=None,
          rows=None,
          prologue=None,
          epilogue=None,
          style='tag',
          encoding='utf-8'):
    """
    Write the table into a new xml file according to elements defined in the
    function arguments.

    The `root`, `head` and `rows` (string, optional) arguments define the tags
    and the nesting of the xml file. Each one defines xml elements with tags
    separated by slashes (`/`), as in `root/level/tag`. They can have an
    arbitrary number of tags, each adding a nesting level for the header or
    the records/rows written in the xml file.

    For details on tag naming and nesting rules check the xml `specification`_
    or xml `references`_.

    The `rows` argument defines the elements for each row of data to be
    written in the xml file. When specified, it must have at least 2 tags
    defining the tags for `row/column`. Additional tags add nesting levels
    enclosing all records/rows/lines.

    The `head` argument is similar to `rows`, but applies only to the single
    header row of field names. When specified, it must have at least 2 tags
    for `fields/name` and any remaining tags increase nesting.

    The `root` argument defines the elements enclosing `head` and `rows`, and
    is required when using `head` in order to produce a valid xml document.

    When none of these arguments is specified, they default to tags that
    generate output similar to an html table:
    `root='table', head='thead/tr/th', rows='tbody/tr/td'`.

    The `prologue` argument (string, optional) can be a snippet of valid xml
    that will be inserted before the other elements in the xml. It can
    optionally specify the `XML prolog` of the file.

    The `epilogue` argument (string, optional) can be a snippet of valid xml
    that will be inserted after all other xml elements except the root closing
    tag. It must specify a closing tag if the `root` argument is not specified.

    The `style` argument selects the format of the elements in the xml file.
    It can be `tag` (default), `name`, `attribute` or a custom string to
    format each row via
    `str.format <http://docs.python.org/library/stdtypes.html#str.format>`_.

    Example usage for writing files::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2]]
        >>> etl.toxml(table1, 'example.file4.xml')
        >>> # note the default output is similar to an html table:
        >>> print(open('example.file4.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <table><thead>
         <tr><th>foo</th><th>bar</th></tr>
        </thead><tbody>
         <tr><td>a</td><td>1</td></tr>
         <tr><td>b</td><td>2</td></tr>
        </tbody></table>
        >>> # define the nesting in xml file:
        >>> etl.toxml(table1, 'example.file5.xml', rows='plan/line/cell')
        >>> print(open('example.file5.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <plan>
         <line><cell>a</cell><cell>1</cell></line>
         <line><cell>b</cell><cell>2</cell></line>
        </plan>
        >>> # choose other style:
        >>> etl.toxml(table1, 'example.file6.xml', rows='row/col', style='attribute')
        >>> print(open('example.file6.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <row>
         <col foo="a" bar="1" />
         <col foo="b" bar="2" />
        </row>
        >>> etl.toxml(table1, 'example.file6.xml', rows='row/col', style='name')
        >>> print(open('example.file6.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <row>
         <col><foo>a</foo><bar>1</bar></col>
         <col><foo>b</foo><bar>2</bar></col>
        </row>

    The `toxml()` function is just a wrapper over :func:`petl.io.text.totext`.
    For advanced cases use a template with `totext()` for generating xml files.

    .. versionadded:: 1.7.0

    .. _specification: https://www.w3.org/TR/xml/
    .. _references: https://www.w3schools.com/xml/xml_syntax.asp

    """
    if not root and not head and not rows:
        root = 'table'
        head = 'thead/tr/th'
        rows = 'tbody/tr/td'

    sample, table2 = iterpeek(table, 2)
    props = fieldnames(sample)

    top = _build_xml_header(style, props, root, head, rows, prologue, encoding)
    template = _build_cols(style, props, rows, True)
    bottom = _build_xml_footer(style, epilogue, rows, root)

    totext(table2,
           source=target,
           encoding=encoding,
           errors='strict',
           template=template,
           prologue=top,
           epilogue=bottom)
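
Since toxml is a thin wrapper over totext, the same prologue/template/epilogue machinery is available directly for arbitrary text formats; a hedged sketch using str.format-style field placeholders (the file name is illustrative):

import petl as etl

table1 = [['foo', 'bar'], ['a', 1], ['b', 2]]
etl.totext(table1, 'example.txt',
           template='- {foo}: {bar}\n',    # one str.format render per data row
           prologue='items:\n',
           epilogue='(end)\n')
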
Example #13
def tohdf5(table, source, where=None, name=None, create=False, drop=False,
           description=None, title='', filters=None, expectedrows=10000, 
           chunkshape=None, byteorder=None, createparents=False,
           sample=1000):
    """
    Write to an HDF5 table. If `create` is `False`, the table is assumed to
    already exist, and an attempt is made to truncate it before loading. If
    `create` is `True`, a new table will be created, and if `drop` is `True`,
    any existing table will be dropped first. If `description` is `None`,
    the description will be guessed. E.g.::

        >>> import petl as etl
        >>> table1 = (('foo', 'bar'),
        ...           (1, b'asdfgh'),
        ...           (2, b'qwerty'),
        ...           (3, b'zxcvbn'))
        >>> etl.tohdf5(table1, 'example.h5', '/testgroup', 'testtable',
        ...            drop=True, create=True, createparents=True)
        >>> etl.fromhdf5('example.h5', '/testgroup', 'testtable')
        +-----+-----------+
        | foo | bar       |
        +=====+===========+
        |   1 | b'asdfgh' |
        +-----+-----------+
        |   2 | b'qwerty' |
        +-----+-----------+
        |   3 | b'zxcvbn' |
        +-----+-----------+

    """

    import tables
    it = iter(table)
    
    if create:
        with _get_hdf5_file(source, mode='a') as h5file:

            if drop:
                try:
                    h5file.get_node(where, name)
                except tables.NoSuchNodeError:
                    pass
                else:
                    h5file.remove_node(where, name)

            # determine datatype
            if description is None:
                peek, it = iterpeek(it, sample)
                # use a numpy dtype
                description = guessdtype(peek)

            # create the table
            h5file.create_table(where, name, description,
                                title=title,
                                filters=filters,
                                expectedrows=expectedrows,
                                chunkshape=chunkshape,
                                byteorder=byteorder,
                                createparents=createparents)

    with _get_hdf5_table(source, where, name, mode='a') as h5table:

        # truncate the existing table
        h5table.truncate(0)
        
        # load the data
        _insert(it, h5table)
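
When description is None, the table structure is guessed from the sampled rows via guessdtype, a petl numpy helper. The idea can be illustrated with plain numpy (illustrative only, not petl's actual implementation):

import numpy as np

rows = [(1, b'asdfgh'), (2, b'qwerty')]
guessed = np.rec.fromrecords(rows, names=['foo', 'bar']).dtype
print(guessed)    # e.g. (numpy.record, [('foo', '<i8'), ('bar', 'S6')])
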
Example #14
def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Load data from the given `table` into a
    `numpy <http://www.numpy.org/>`_ structured array. E.g.::

        >>> import petl as etl
        >>> table = [('foo', 'bar', 'baz'),
        ...          ('apples', 1, 2.5),
        ...          ('oranges', 3, 4.4),
        ...          ('pears', 7, .1)]
        >>> a = etl.toarray(table)
        >>> a
        array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)],
              dtype=[('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])
        >>> # the dtype can be specified as a string
        ... a = etl.toarray(table, dtype='a4, i2, f4')
        >>> a
        array([(b'appl', 1, 2.5), (b'oran', 3, 4.400000095367432),
               (b'pear', 7, 0.10000000149011612)],
              dtype=[('foo', 'S4'), ('bar', '<i2'), ('baz', '<f4')])
        >>> # the dtype can also be partially specified
        ... a = etl.toarray(table, dtype={'foo': 'a4'})
        >>> a
        array([(b'appl', 1, 2.5), (b'oran', 3, 4.4), (b'pear', 7, 0.1)],
              dtype=[('foo', 'S4'), ('bar', '<i8'), ('baz', '<f8')])

    If the dtype is not completely specified, `sample` rows will be
    examined to infer an appropriate dtype.

    """
    
    import numpy as np
    it = iter(table)
    peek, it = iterpeek(it, sample)
    hdr = next(it)
    flds = list(map(str, hdr))

    if dtype is None:
        dtype = guessdtype(peek)

    elif isinstance(dtype, string_types):
        # insert field names from source table
        typestrings = [s.strip() for s in dtype.split(',')]
        dtype = [(f, t) for f, t in zip(flds, typestrings)]

    elif (isinstance(dtype, dict)
          and ('names' not in dtype or 'formats' not in dtype)):
        # allow for partial specification of dtype
        cols = columns(peek)
        newdtype = {'names': [], 'formats': []}
        for f in flds:
            newdtype['names'].append(f)
            if f in dtype and isinstance(dtype[f], tuple):
                # assume fully specified
                newdtype['formats'].append(dtype[f][0])
            elif f not in dtype:
                # not specified at all
                a = np.array(cols[f])
                newdtype['formats'].append(a.dtype)
            else:
                # assume the format was specified directly
                newdtype['formats'].append(dtype[f])
        dtype = newdtype

    else:
        pass  # leave dtype as-is

    # numpy is fussy about having tuples, so make sure each row is a tuple
    it = (tuple(row) for row in it)
    sa = np.fromiter(it, dtype=dtype, count=count)

    return sa
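
The partial-specification branch builds the parallel names/formats mapping that numpy's dtype constructor accepts; the result is equivalent to a plain list of (name, format) pairs:

import numpy as np

dt = np.dtype({'names': ['foo', 'bar'], 'formats': ['S4', '<i8']})
print(dt)    # dtype([('foo', 'S4'), ('bar', '<i8')])
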