def itermergeduplicates(table, key, missing):
    it = iter(table)
    hdr, it = iterpeek(it)
    flds = list(map(text_type, hdr))

    # determine output fields
    if isinstance(key, string_types):
        outhdr = [key]
        keyflds = set([key])
    else:
        outhdr = list(key)
        keyflds = set(key)
    valflds = [f for f in flds if f not in keyflds]
    valfldidxs = [flds.index(f) for f in valflds]
    outhdr.extend(valflds)
    yield tuple(outhdr)

    # do the work
    for k, grp in rowgroupby(it, key):
        grp = list(grp)
        if isinstance(key, string_types):
            outrow = [k]
        else:
            outrow = list(k)
        mergedvals = [set(row[i] for row in grp
                          if len(row) > i and row[i] != missing)
                      for i in valfldidxs]
        normedvals = [vals.pop() if len(vals) == 1
                      else missing if len(vals) == 0
                      else Conflict(vals)
                      for vals in mergedvals]
        outrow.extend(normedvals)
        yield tuple(outrow)
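# Illustrative sketch of the merge semantics above, via petl's public
# mergeduplicates() wrapper (example values made up): per key, a field with
# one distinct non-missing value merges to that value, and disagreeing
# values become a Conflict.
import petl as etl

example = [['key', 'a', 'b'],
           ['x', 1, None],
           ['x', 1, 2],
           ['x', 3, 2]]
merged = etl.mergeduplicates(example, 'key', missing=None)
# field 'a' saw {1, 3} -> Conflict; field 'b' saw {2} (None ignored) -> 2
print(etl.look(merged))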
def valuestoarray(vals, dtype=None, count=-1, sample=1000):
    """
    Load values from a table column into a `numpy
    <http://www.numpy.org/>`_ array, e.g.::

        >>> import petl as etl
        >>> table = [('foo', 'bar', 'baz'),
        ...          ('apples', 1, 2.5),
        ...          ('oranges', 3, 4.4),
        ...          ('pears', 7, .1)]
        >>> table = etl.wrap(table)
        >>> table.values('bar').array()
        array([1, 3, 7])
        >>> # specify dtype
        ... table.values('bar').array(dtype='i4')
        array([1, 3, 7], dtype=int32)

    """
    import numpy as np
    it = iter(vals)
    if dtype is None:
        peek, it = iterpeek(it, sample)
        dtype = np.array(peek).dtype
    a = np.fromiter(it, dtype=dtype, count=count)
    return a
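# The dtype inference above leans on numpy's type promotion over the sampled
# values; a minimal standalone sketch of that behaviour (numpy only):
import numpy as np

print(np.array([1, 3, 7]).dtype)    # integer dtype (e.g. int64)
print(np.array([1, 3.5, 7]).dtype)  # float64: mixing ints and floats promotes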
def iterrowreduce(source, key, reducer, header):
    if header is None:
        # output header from source
        header, source = iterpeek(source)
    yield tuple(header)
    for key, rows in rowgroupby(source, key):
        yield tuple(reducer(key, rows))
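# Usage sketch of the reducer contract above, via petl's public rowreduce():
# the reducer is called once per key group with (key, iterable_of_rows) and
# returns a single output row.
import petl as etl

table = [['foo', 'bar'],
         ['a', 3],
         ['a', 7],
         ['b', 2]]

def sumbar(key, rows):
    return [key, sum(row[1] for row in rows)]

print(etl.look(etl.rowreduce(table, key='foo', reducer=sumbar,
                             header=['foo', 'barsum'])))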
def _build_schema_from_values(table, sample):
    # table2: try not to advance iterators
    samples, table2 = iterpeek(table, sample + 1)
    props = fieldnames(samples)
    peek = skip(samples, 1)
    schema_fields = _build_schema_fields_from_values(peek, props)
    schema_source = _build_schema_with(schema_fields)
    return schema_source, table2
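# For context: iterpeek(it, n) hands back a sample plus an iterator that
# still yields every original item, so sampling does not consume rows. A
# standalone sketch of the pattern using itertools (not petl's actual
# implementation, which also special-cases n == 1):
import itertools

def peek(iterable, n):
    it = iter(iterable)
    head = list(itertools.islice(it, n))
    return head, itertools.chain(head, it)

head, restored = peek(range(5), 2)
print(head)            # [0, 1]
print(list(restored))  # [0, 1, 2, 3, 4] - nothing lost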
def tobcolz(table, dtype=None, sample=1000, **kwargs):
    """Load data into a bcolz ctable, e.g.::

        >>> import petl as etl
        >>> table = [('foo', 'bar', 'baz'),
        ...          ('apples', 1, 2.5),
        ...          ('oranges', 3, 4.4),
        ...          ('pears', 7, .1)]
        >>> ctbl = etl.tobcolz(table)
        >>> ctbl
        ctable((3,), [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])
          nbytes: 132; cbytes: 1023.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
        [('apples', 1, 2.5) ('oranges', 3, 4.4) ('pears', 7, 0.1)]
        >>> ctbl.names
        ['foo', 'bar', 'baz']
        >>> ctbl['foo']
        carray((3,), <U7)
          nbytes := 84; cbytes := 511.98 KB; ratio: 0.00
          cparams := cparams(clevel=5, shuffle=1, cname='lz4', quantize=0)
          chunklen := 18724; chunksize: 524272; blocksize: 0
        ['apples' 'oranges' 'pears']

    Other keyword arguments are passed through to the ctable constructor.

    .. versionadded:: 1.1.0

    """
    import bcolz
    import numpy as np

    it = iter(table)
    peek, it = iterpeek(it, sample)
    hdr = next(it)
    # numpy is fussy about having tuples, need to make sure
    it = (tuple(row) for row in it)
    flds = list(map(text_type, hdr))
    dtype = construct_dtype(flds, peek, dtype)

    # create ctable
    kwargs.setdefault('expectedlen', 1000000)
    kwargs.setdefault('mode', 'w')
    ctbl = bcolz.ctable(np.array([], dtype=dtype), **kwargs)

    # fill chunk-wise
    chunklen = sum(ctbl.cols[name].chunklen
                   for name in ctbl.names) // len(ctbl.names)
    while True:
        data = list(itertools.islice(it, chunklen))
        data = np.array(data, dtype=dtype)
        ctbl.append(data)
        if len(data) < chunklen:
            break

    ctbl.flush()
    return ctbl
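# The fill loop above is the usual itertools.islice batching idiom; a
# standalone sketch (it differs slightly from tobcolz in that it stops
# before yielding a final empty batch):
import itertools

def batches(iterable, size):
    it = iter(iterable)
    while True:
        batch = list(itertools.islice(it, size))
        if not batch:
            break
        yield batch

print(list(batches(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]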
def _fix_missing_headers(table, schema):
    '''add missing column headers from schema'''
    if schema is None or 'fields' not in schema:
        return table
    # table2: try not to advance iterators
    sample, table2 = iterpeek(table, 2)
    cols = fieldnames(sample)
    headers = _get_schema_header_names(schema)
    if len(cols) >= len(headers):
        return table2
    table3 = setheader(table2, headers)
    return table3
def _fix_missing_headers(table, schema):
    '''add missing column headers from schema'''
    if schema is None or 'fields' not in schema:
        return table
    # table2: try not to advance iterators
    sample, table2 = iterpeek(table, 2)
    cols = fieldnames(sample)
    fields = schema.get('fields')
    if len(cols) >= len(fields):
        return table2
    header = [field.get('name') for field in fields]
    table3 = setheader(table2, header)
    return table3
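# Sketch of what the fix above produces, using petl's public setheader();
# the schema layout follows the frictionless-style {'fields': [...]} dicts
# assumed by both variants (example values made up):
import petl as etl

schema = {'fields': [{'name': 'id'}, {'name': 'name'}, {'name': 'age'}]}
table = [['id', 'name'],  # header is shorter than the schema
         [1, 'a'],
         [2, 'b']]
header = [field.get('name') for field in schema['fields']]
print(etl.look(etl.setheader(table, header)))  # header now has all 3 names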
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    lit = iter(left)
    lhdr = next(lit)

    rhdr, rit = iterpeek(right)  # need the whole lot to pass to lookup
    rlookup = lookupone(rit, rkey, strict=False)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from left table
    lgetk = operator.itemgetter(*lkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join rows
    def joinrows(_lrow, _rrow):
        # start with the left row
        _outrow = list(_lrow)
        # extend with non-key values from the right row
        _outrow.extend(rgetv(_rrow))
        return tuple(_outrow)

    for lrow in lit:
        k = lgetk(lrow)
        if k in rlookup:
            rrow = rlookup[k]
            yield joinrows(lrow, rrow)
        else:
            outrow = list(lrow)  # start with the left row
            # extend with missing values in place of the right row
            outrow.extend([missing] * len(rvind))
            yield tuple(outrow)
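# Usage sketch via petl's public hashlookupjoin(), which drives the
# generator above: every left row appears exactly once, matched against at
# most one right row (lookupone with strict=False keeps the first match).
import petl as etl

left = [['id', 'color', 'cost'],
        [1, 'blue', 12],
        [2, 'red', 8],
        [3, 'purple', 4]]
right = [['id', 'shape', 'size'],
         [1, 'circle', 'big'],
         [1, 'circle', 'small'],
         [3, 'square', 'tiny']]
# id=1 takes the first matching right row; id=2 is padded with the missing
# value (None by default) for shape and size
print(etl.look(etl.hashlookupjoin(left, right, key='id')))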
def iterjlines(f, header, missing):
    it = iter(f)

    if header is None:
        header = list()
        peek, it = iterpeek(it, 1)
        json_obj = json.loads(peek)
        if hasattr(json_obj, 'keys'):
            header += [k for k in json_obj.keys() if k not in header]
    yield tuple(header)

    for o in it:
        json_obj = json.loads(o)
        yield tuple(json_obj[f] if f in json_obj else missing
                    for f in header)
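# Minimal sketch of the json-lines contract above (assumes iterjlines and
# the petl helpers it uses are importable): the first line is peeked to
# discover the header, and absent fields fall back to `missing`.
import io

f = io.StringIO('{"a": 1, "b": 2}\n{"a": 3}\n')
for row in iterjlines(f, header=None, missing=None):
    print(row)
# ('a', 'b')
# (1, 2)
# (3, None)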
def iterdicts(dicts, header, sample, missing):
    it = iter(dicts)

    # determine header row
    if header is None:
        # discover fields
        header = list()
        peek, it = iterpeek(it, sample)
        for o in peek:
            if hasattr(o, 'keys'):
                header += [k for k in o.keys() if k not in header]
    yield tuple(header)

    # generate data rows
    for o in it:
        yield tuple(o[f] if f in o else missing for f in header)
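# Usage sketch via petl's public fromdicts(), which drives the generator
# above: the header is discovered from a sample of the dicts, and absent
# keys are filled with the missing value.
import petl as etl

dicts = [{'foo': 'a', 'bar': 1},
         {'foo': 'b', 'bar': 2},
         {'foo': 'c', 'baz': 3}]
# header becomes ('foo', 'bar', 'baz'); the 'baz' cells of the first two
# rows and the 'bar' cell of the third are None
print(etl.look(etl.fromdicts(dicts)))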
def toarray(table, dtype=None, count=-1, sample=1000): """ Load data from the given `table` into a `numpy <http://www.numpy.org/>`_ structured array. E.g.:: >>> import petl as etl >>> table = [('foo', 'bar', 'baz'), ... ('apples', 1, 2.5), ... ('oranges', 3, 4.4), ... ('pears', 7, .1)] >>> a = etl.toarray(table) >>> a array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)], dtype=(numpy.record, [('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')])) >>> # the dtype can be specified as a string ... a = etl.toarray(table, dtype='a4, i2, f4') >>> a array([(b'appl', 1, 2.5), (b'oran', 3, 4.400000095367432), (b'pear', 7, 0.10000000149011612)], dtype=[('foo', 'S4'), ('bar', '<i2'), ('baz', '<f4')]) >>> # the dtype can also be partially specified ... a = etl.toarray(table, dtype={'foo': 'a4'}) >>> a array([(b'appl', 1, 2.5), (b'oran', 3, 4.4), (b'pear', 7, 0.1)], dtype=[('foo', 'S4'), ('bar', '<i8'), ('baz', '<f8')]) If the dtype is not completely specified, `sample` rows will be examined to infer an appropriate dtype. """ import numpy as np it = iter(table) peek, it = iterpeek(it, sample) hdr = next(it) flds = list(map(str, hdr)) dtype = construct_dtype(flds, peek, dtype) # numpy is fussy about having tuples, need to make sure it = (tuple(row) for row in it) sa = np.fromiter(it, dtype=dtype, count=count) return sa
def toxml(table, target=None, root=None, head=None, rows=None,
          prologue=None, epilogue=None, style='tag', encoding='utf-8'):
    """
    Write the table into a new xml file according to elements defined in the
    function arguments.

    The `root`, `head` and `rows` (string, optional) arguments define the
    tags and the nesting of the xml file. Each one defines xml elements with
    tags separated by slashes (`/`) like in `root/level/tag`. They can have
    an arbitrary number of tags, each adding a nesting level for the header
    or record/row written in the xml file.

    For details on tag naming and nesting rules check the xml
    `specification`_ or xml `references`_.

    The `rows` argument defines the elements for each row of data to be
    written in the xml file. When specified, it must have at least 2 tags
    for defining the tags for `row/column`. Additional tags will add nesting
    enclosing all records/rows/lines.

    The `head` argument is similar to `rows`, but applies only to one
    line/row of header with fieldnames. When specified, it must have at
    least 2 tags for `fields/name` and the remaining will increase nesting.

    The `root` argument defines the elements enclosing `head` and `rows` and
    is required when using `head` in order to produce valid xml documents.

    When none of these arguments is specified, they will default to tags
    that generate output similar to a html table:
    `root='table', head='thead/tr/th', rows='tbody/tr/td'`.

    The `prologue` argument (string, optional) could be a snippet of valid
    xml that will be inserted before other elements in the xml. It can
    optionally specify the `XML Prolog` of the file.

    The `epilogue` argument (string, optional) could be a snippet of valid
    xml that will be inserted after all other xml elements except the root
    closing tag. It must specify a closing tag if the `root` argument is
    not specified.

    The `style` argument selects the format of the elements in the xml file.
    It can be `tag` (default), `name`, `attribute` or a custom string to
    format each row via
    `str.format <http://docs.python.org/library/stdtypes.html#str.format>`_.

    Example usage for writing files::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2]]
        >>> etl.toxml(table1, 'example.file4.xml')
        >>> # see what we did is similar to a html table:
        >>> print(open('example.file4.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <table><thead>
         <tr><th>foo</th><th>bar</th></tr>
        </thead><tbody>
         <tr><td>a</td><td>1</td></tr>
         <tr><td>b</td><td>2</td></tr>
        </tbody></table>
        >>> # define the nesting in xml file:
        >>> etl.toxml(table1, 'example.file5.xml', rows='plan/line/cell')
        >>> print(open('example.file5.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <plan>
         <line><cell>a</cell><cell>1</cell></line>
         <line><cell>b</cell><cell>2</cell></line>
        </plan>
        >>> # choose other style:
        >>> etl.toxml(table1, 'example.file6.xml', rows='row/col', style='attribute')
        >>> print(open('example.file6.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <row>
         <col foo="a" bar="1" />
         <col foo="b" bar="2" />
        </row>
        >>> etl.toxml(table1, 'example.file6.xml', rows='row/col', style='name')
        >>> print(open('example.file6.xml').read())
        <?xml version="1.0" encoding="UTF-8"?>
        <row>
         <col><foo>a</foo><bar>1</bar></col>
         <col><foo>b</foo><bar>2</bar></col>
        </row>

    The `toxml()` function is just a wrapper over
    :func:`petl.io.text.totext`. For advanced cases use a template with
    `totext()` for generating xml files.

    .. versionadded:: 1.7.0

    .. _specification: https://www.w3.org/TR/xml/
    .. _references: https://www.w3schools.com/xml/xml_syntax.asp
    """
    if not root and not head and not rows:
        root = 'table'
        head = 'thead/tr/th'
        rows = 'tbody/tr/td'

    sample, table2 = iterpeek(table, 2)
    props = fieldnames(sample)

    top = _build_xml_header(style, props, root, head, rows, prologue,
                            encoding)
    template = _build_cols(style, props, rows, True)
    bottom = _build_xml_footer(style, epilogue, rows, root)

    totext(table2, source=target, encoding=encoding, errors='strict',
           template=template, prologue=top, epilogue=bottom)
def tohdf5(table, source, where=None, name=None, create=False, drop=False,
           description=None, title='', filters=None, expectedrows=10000,
           chunkshape=None, byteorder=None, createparents=False,
           sample=1000):
    """
    Write to an HDF5 table. If `create` is `False`, assumes the table
    already exists, and attempts to truncate it before loading. If `create`
    is `True`, a new table will be created, and if `drop` is True, any
    existing table will be dropped first. If `description` is `None`, the
    description will be guessed. E.g.::

        >>> import petl as etl
        >>> table1 = (('foo', 'bar'),
        ...           (1, b'asdfgh'),
        ...           (2, b'qwerty'),
        ...           (3, b'zxcvbn'))
        >>> etl.tohdf5(table1, 'example.h5', '/testgroup', 'testtable',
        ...            drop=True, create=True, createparents=True)
        >>> etl.fromhdf5('example.h5', '/testgroup', 'testtable')
        +-----+-----------+
        | foo | bar       |
        +=====+===========+
        |   1 | b'asdfgh' |
        +-----+-----------+
        |   2 | b'qwerty' |
        +-----+-----------+
        |   3 | b'zxcvbn' |
        +-----+-----------+

    """
    import tables
    it = iter(table)

    if create:
        with _get_hdf5_file(source, mode='a') as h5file:

            if drop:
                try:
                    h5file.get_node(where, name)
                except tables.NoSuchNodeError:
                    pass
                else:
                    h5file.remove_node(where, name)

            # determine datatype
            if description is None:
                peek, it = iterpeek(it, sample)
                # use a numpy dtype
                description = guessdtype(peek)

            # create the table
            h5file.create_table(where, name, description,
                                title=title,
                                filters=filters,
                                expectedrows=expectedrows,
                                chunkshape=chunkshape,
                                byteorder=byteorder,
                                createparents=createparents)

    with _get_hdf5_table(source, where, name, mode='a') as h5table:

        # truncate the existing table
        h5table.truncate(0)

        # load the data
        _insert(it, h5table)
def toarray(table, dtype=None, count=-1, sample=1000): """ Load data from the given `table` into a `numpy <http://www.numpy.org/>`_ structured array. E.g.:: >>> import petl as etl >>> table = [('foo', 'bar', 'baz'), ... ('apples', 1, 2.5), ... ('oranges', 3, 4.4), ... ('pears', 7, .1)] >>> a = etl.toarray(table) >>> a array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)], dtype=[('foo', '<U7'), ('bar', '<i8'), ('baz', '<f8')]) >>> # the dtype can be specified as a string ... a = etl.toarray(table, dtype='a4, i2, f4') >>> a array([(b'appl', 1, 2.5), (b'oran', 3, 4.400000095367432), (b'pear', 7, 0.10000000149011612)], dtype=[('foo', 'S4'), ('bar', '<i2'), ('baz', '<f4')]) >>> # the dtype can also be partially specified ... a = etl.toarray(table, dtype={'foo': 'a4'}) >>> a array([(b'appl', 1, 2.5), (b'oran', 3, 4.4), (b'pear', 7, 0.1)], dtype=[('foo', 'S4'), ('bar', '<i8'), ('baz', '<f8')]) If the dtype is not completely specified, `sample` rows will be examined to infer an appropriate dtype. """ import numpy as np it = iter(table) peek, it = iterpeek(it, sample) hdr = next(it) flds = list(map(str, hdr)) if dtype is None: dtype = guessdtype(peek) elif isinstance(dtype, string_types): # insert field names from source table typestrings = [s.strip() for s in dtype.split(',')] dtype = [(f, t) for f, t in zip(flds, typestrings)] elif (isinstance(dtype, dict) and ('names' not in dtype or 'formats' not in dtype)): # allow for partial specification of dtype cols = columns(peek) newdtype = {'names': [], 'formats': []} for f in flds: newdtype['names'].append(f) if f in dtype and isinstance(dtype[f], tuple): # assume fully specified newdtype['formats'].append(dtype[f][0]) elif f not in dtype: # not specified at all a = np.array(cols[f]) newdtype['formats'].append(a.dtype) else: # assume directly specified, just need to add offset newdtype['formats'].append(dtype[f]) dtype = newdtype else: pass # leave dtype as-is # numpy is fussy about having tuples, need to make sure it = (tuple(row) for row in it) sa = np.fromiter(it, dtype=dtype, count=count) return sa