def __iter__(self):
    """Yield each worksheet row as a tuple of cell values."""
    # prefer implementation using xlutils.view as dates are automatically
    # converted
    if self.use_view:
        from petl.io import xlutils_view
        source = read_source_from_arg(self.filename)
        # read the whole workbook into memory before parsing
        with source.open('rb') as source2:
            source3 = source2.read()
        wb = xlutils_view.View(source3, **self.kwargs)
        # select worksheet: first sheet by default, else by index or name
        if self.sheet is None:
            ws = wb[0]
        else:
            ws = wb[self.sheet]
        for row in ws:
            yield tuple(row)
    else:
        import xlrd
        source = read_source_from_arg(self.filename)
        with source.open('rb') as source2:
            source3 = source2.read()
        # on_demand=True defers loading of individual sheets
        with xlrd.open_workbook(file_contents=source3,
                                on_demand=True,
                                **self.kwargs) as wb:
            if self.sheet is None:
                ws = wb.sheet_by_index(0)
            elif isinstance(self.sheet, int):
                ws = wb.sheet_by_index(self.sheet)
            else:
                ws = wb.sheet_by_name(str(self.sheet))
            # xrange presumably supplied by petl's py2/3 compat layer —
            # TODO confirm it is imported at module level
            for rownum in xrange(ws.nrows):
                yield tuple(ws.row_values(rownum))
def frompickle(source=None):
    """
    Extract a table from data pickled in the given file. The rows in the
    table should have been pickled to the file one at a time. E.g.::

        >>> import petl as etl
        >>> import pickle
        >>> # set up a file to demonstrate with
        ... with open('example.p', 'wb') as f:
        ...     pickle.dump(['foo', 'bar'], f)
        ...     pickle.dump(['a', 1], f)
        ...     pickle.dump(['b', 2], f)
        ...     pickle.dump(['c', 2.5], f)
        ...
        >>> # now demonstrate the use of frompickle()
        ... table1 = etl.frompickle('example.p')
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | 1   |
        +-----+-----+
        | 'b' | 2   |
        +-----+-----+
        | 'c' | 2.5 |
        +-----+-----+

    """
    # resolve the source argument (path, URL, etc.) then wrap in a lazy view
    resolved = read_source_from_arg(source)
    return PickleView(resolved)
def appendxlsx(tbl, filename, sheet=None, write_header=False):
    """
    Appends rows to an existing Excel .xlsx file.

    """
    import openpyxl

    # load the existing workbook in writable mode
    source = read_source_from_arg(filename)
    with source.open('rb') as source2:
        wb = openpyxl.load_workbook(filename=source2, read_only=False)

    # resolve the destination worksheet: first sheet by default, otherwise
    # by position or by name
    if sheet is None:
        ws = wb[wb.sheetnames[0]]
    elif isinstance(sheet, int):
        ws = wb[wb.sheetnames[sheet]]
    else:
        ws = wb[str(sheet)]

    if write_header:
        # include the header row, coerced to text
        source_rows = iter(tbl)
        header_fields = [text_type(f) for f in next(source_rows)]
        rows = itertools.chain([header_fields], source_rows)
    else:
        # data rows only
        rows = data(tbl)

    for row in rows:
        ws.append(row)

    # write the modified workbook back out
    target = write_source_from_arg(filename)
    with target.open('wb') as target2:
        wb.save(target2)
def frompickle(source=None):
    """
    Returns a table providing access to the data pickled in the given file.
    The rows in the table should have been pickled to the file one at a
    time. E.g.::

        >>> import pickle
        >>> # set up a file to demonstrate with
        ... with open('test.dat', 'wb') as f:
        ...     pickle.dump(['foo', 'bar'], f)
        ...     pickle.dump(['a', 1], f)
        ...     pickle.dump(['b', 2], f)
        ...     pickle.dump(['c', 2.5], f)
        ...
        >>> from petl import frompickle, look
        >>> testdat = frompickle('test.dat')
        >>> look(testdat)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | 1     |
        +-------+-------+
        | 'b'   | 2     |
        +-------+-------+
        | 'c'   | 2.5   |
        +-------+-------+

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    """
    return PickleView(read_source_from_arg(source))
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> from petl import fromjson, look
        >>> data = '[{"foo": "a", "bar": 1}, {"foo": "b", "bar": 2}, {"foo": "c", "bar": 2}]'
        >>> with open('example1.json', 'w') as f:
        ...     f.write(data)
        ...
        >>> table1 = fromjson('example1.json')
        >>> look(table1)
        +--------+--------+
        | u'foo' | u'bar' |
        +========+========+
        | u'a'   | 1      |
        +--------+--------+
        | u'b'   | 2      |
        +--------+--------+
        | u'c'   | 2      |
        +--------+--------+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see
    also :func:`fromdicts`.

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    .. versionadded:: 0.5

    """
    resolved = read_source_from_arg(source)
    return JsonView(resolved, *args, **kwargs)
def __iter__(self):
    """Yield each row of the .xlsx worksheet as a tuple of cell values."""
    import openpyxl
    source = read_source_from_arg(self.filename)
    with source.open('rb') as source2:
        wb = openpyxl.load_workbook(filename=source2,
                                    read_only=self.read_only,
                                    **self.kwargs)
    # select worksheet: first sheet by default, else by index or name
    if self.sheet is None:
        ws = wb[wb.sheetnames[0]]
    elif isinstance(self.sheet, int):
        ws = wb[wb.sheetnames[self.sheet]]
    else:
        ws = wb[str(self.sheet)]
    # an explicit range string (e.g. 'A1:C10') wins over min/max bounds
    if self.range_string is not None:
        rows = ws[self.range_string]
    else:
        rows = ws.iter_rows(min_row=self.min_row,
                            min_col=self.min_col,
                            max_row=self.max_row,
                            max_col=self.max_col)
    for row in rows:
        yield tuple(cell.value for cell in row)
    # explicitly release the underlying zip archive so the file handle is
    # not left open after iteration completes
    try:
        wb._archive.close()
    except AttributeError:
        # just here in case openpyxl stops exposing an _archive property.
        pass
def fromfwf(source=None, widths=None, encoding=None, errors='strict',
            header=None, skiprows=None, skipfooter=None):
    """
    Extract a table (as defined by the petl package) from lines in the
    given fixed-width file. E.g.::

        >>> import petl as etl
        >>> # set up a sample file
        ... text = ' 18 5 2\\n2018 5 2\\n20180502'
        >>> with open('example.txt', 'w') as f:
        ...     f.write(text)
        ...
        28
        >>> table1 = etl.fromfwf('example.txt', widths=[4, 2, 2])
        >>> table1
        +--------+------+------+
        | lines  |      |      |
        +========+======+======+
        | ' 18'  | ' 5' | ' 2' |
        +--------+------+------+
        | '2018' | ' 5' | ' 2' |
        +--------+------+------+
        | '2018' | '05' | '02' |
        +--------+------+------+

        >>> # specify headers for the file
        ... table1 = etl.fromfwf('example.txt', widths=[4, 2, 2],
        ...                      header=['year', 'month', 'day'])
        >>> table1
        +--------+-------+------+
        | year   | month | day  |
        +========+=======+======+
        | ' 18'  | ' 5'  | ' 2' |
        +--------+-------+------+
        | '2018' | ' 5'  | ' 2' |
        +--------+-------+------+
        | '2018' | '05'  | '02' |
        +--------+-------+------+

    :param widths: list or tuple of int column widths (required)
    :raises AttributeError: if `widths` is missing or not a list/tuple
        (AttributeError kept for backward compatibility with existing
        callers)
    """
    # validate widths up front so failures are immediate and clear
    if widths is None:
        raise AttributeError('No field widths provided!')
    if not isinstance(widths, (list, tuple)):
        raise AttributeError(
            'Field widths must be a tuple or list of field widths!')
    # a Table instance is passed through unchanged; anything else (path,
    # URL, ...) is resolved to a readable source
    if not isinstance(source, Table):
        source = read_source_from_arg(source)
    return FixedTextView(source, widths=widths, header=header,
                         encoding=encoding, errors=errors,
                         skiprows=skiprows, skipfooter=skipfooter)
def fromtext(source=None, encoding=None, errors=None, strip=None,
             header=('lines', )):
    """
    Extract a table from lines in the given text file. E.g.::

        >>> import petl as etl
        >>> # setup example file
        ... text = 'a,1\\nb,2\\nc,2\\n'
        >>> with open('example.txt', 'w') as f:
        ...     f.write(text)
        ...
        12
        >>> table1 = etl.fromtext('example.txt')
        >>> table1
        +-------+
        | lines |
        +=======+
        | 'a,1' |
        +-------+
        | 'b,2' |
        +-------+
        | 'c,2' |
        +-------+

    Each line becomes a single-field row, which can then be post-processed,
    e.g., with capture()::

        >>> table2 = table1.capture('lines', '(.*),(.*)$', ['foo', 'bar'])

    Note that the strip() function is called on each line, which by default
    will remove leading and trailing whitespace, including the end-of-line
    character - use the `strip` keyword argument to specify alternative
    characters to strip. Set the `strip` argument to `False` to disable
    this behaviour and leave line endings in place.

    """
    resolved = read_source_from_arg(source)
    return TextView(resolved, header=header, encoding=encoding,
                    errors=errors, strip=strip)
def fromucsv(source=None, dialect=csv.excel, encoding='utf-8', **kwargs):
    """
    Returns a table containing unicode data extracted from a delimited file
    via the given encoding. Like :func:`fromcsv` but accepts an additional
    ``encoding`` argument which should be one of the Python supported
    encodings. See also :mod:`codecs`.

    .. versionadded:: 0.19

    """
    return UnicodeCSVView(source=read_source_from_arg(source),
                          dialect=dialect, encoding=encoding, **kwargs)
def fromutext(source=None, header=(u'lines',), encoding='utf-8', strip=None):
    """
    Construct a table from lines in the given text file via the given
    encoding. Like :func:`fromtext` but accepts an additional ``encoding``
    argument which should be one of the Python supported encodings. See
    also :mod:`codecs`.

    .. versionadded:: 0.19

    """
    return UnicodeTextView(read_source_from_arg(source), header,
                           encoding=encoding, strip=strip)
def fromtext(source=None, header=('lines',), strip=None):
    """
    Construct a table from lines in the given text file. E.g.::

        >>> # example data
        ... with open('test.txt', 'w') as f:
        ...     f.write('a\\t1\\n')
        ...     f.write('b\\t2\\n')
        ...     f.write('c\\t3\\n')
        ...
        >>> from petl import fromtext, look
        >>> table1 = fromtext('test.txt')
        >>> look(table1)
        +--------------+
        | 'lines'      |
        +==============+
        | 'a\\t1'      |
        +--------------+
        | 'b\\t2'      |
        +--------------+
        | 'c\\t3'      |
        +--------------+

    The :func:`fromtext` function provides a starting point for custom
    handling of text files, e.g., in combination with :func:`capture` to
    split each line into multiple fields.

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    .. versionchanged:: 0.4
        The strip() function is called on each line, which by default will
        remove leading and trailing whitespace, including the end-of-line
        character - use the `strip` keyword argument to specify alternative
        characters to strip.

    """
    return TextView(read_source_from_arg(source), header, strip=strip)
def fromtext(source=None, encoding=None, errors='strict', strip=None,
             header=('lines',)):
    """
    Extract a table from lines in the given text file. E.g.::

        >>> import petl as etl
        >>> # setup example file
        ... text = 'a,1\\nb,2\\nc,2\\n'
        >>> with open('example.txt', 'w') as f:
        ...     f.write(text)
        ...
        12
        >>> table1 = etl.fromtext('example.txt')
        >>> table1
        +-------+
        | lines |
        +=======+
        | 'a,1' |
        +-------+
        | 'b,2' |
        +-------+
        | 'c,2' |
        +-------+

    Each line becomes a single-field row, which can then be post-processed,
    e.g., with capture()::

        >>> table2 = table1.capture('lines', '(.*),(.*)$', ['foo', 'bar'])

    Note that the strip() function is called on each line, which by default
    will remove leading and trailing whitespace, including the end-of-line
    character - use the `strip` keyword argument to specify alternative
    characters to strip. Set the `strip` argument to `False` to disable
    this behaviour and leave line endings in place.

    """
    resolved = read_source_from_arg(source)
    return TextView(resolved, header=header, encoding=encoding,
                    errors=errors, strip=strip)
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.json', header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | 1   |
        +-----+-----+
        | 'b' | 2   |
        +-----+-----+
        | 'c' | 2   |
        +-----+-----+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see
    also :func:`petl.io.json.fromdicts`.

    .. versionchanged:: 1.1.0
        If no `header` is specified, fields will be discovered by sampling
        keys from the first `sample` objects in `source`. The header will be
        constructed from keys in the order discovered. Note that this
        ordering may not be stable, and therefore it may be advisable to
        specify an explicit `header` or to use another function like
        :func:`petl.transform.headers.sortheader` on the resulting table to
        guarantee stability.

    """
    resolved = read_source_from_arg(source)
    return JsonView(resolved, *args, **kwargs)
def _load_or_create_workbook(filename, mode, sheet):
    """Open the existing workbook at `filename`, or create a fresh
    write-only workbook when the file is absent or will be clobbered."""
    # FileNotFoundError does not exist on py2; IOError is its closest match
    if PY3:
        missing_file_error = FileNotFoundError
    else:
        missing_file_error = IOError
    import openpyxl
    workbook = None
    # only attempt to load an existing file when the mode requires it:
    # 'overwrite' (and 'replace' of the whole file) always start fresh
    needs_existing = not (mode == "overwrite"
                          or (mode == "replace" and sheet is None))
    if needs_existing:
        try:
            source = read_source_from_arg(filename)
            with source.open('rb') as source2:
                workbook = openpyxl.load_workbook(filename=source2,
                                                  read_only=False)
        except missing_file_error:
            workbook = None
    if workbook is None:
        workbook = openpyxl.Workbook(write_only=True)
    return workbook
def fromcsv(source=None, encoding=None, errors='strict', **csvargs):
    """
    Extract a table from a delimited file. E.g.::

        >>> import petl as etl
        >>> import csv
        >>> # set up a CSV file to demonstrate with
        ... table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> with open('example.csv', 'w') as f:
        ...     writer = csv.writer(f)
        ...     writer.writerows(table1)
        ...
        >>> # now demonstrate the use of fromcsv()
        ... table2 = etl.fromcsv('example.csv')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    The `source` argument is the path of the delimited file, all other
    keyword arguments are passed to :func:`csv.reader`. So, e.g., to
    override the delimiter from the default CSV dialect, provide the
    `delimiter` keyword argument.

    Note that all data values are strings, and any intended numeric values
    will need to be converted, see also
    :func:`petl.transform.conversions.convert`.

    """
    # default to the standard 'excel' dialect unless the caller overrides
    csvargs.setdefault('dialect', 'excel')
    resolved = read_source_from_arg(source)
    return fromcsv_impl(source=resolved, encoding=encoding, errors=errors,
                        **csvargs)
def fromcsv(source=None, encoding=None, errors='strict', header=None,
            **csvargs):
    """
    Extract a table from a delimited file. E.g.::

        >>> import petl as etl
        >>> import csv
        >>> # set up a CSV file to demonstrate with
        ... table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> with open('example.csv', 'w') as f:
        ...     writer = csv.writer(f)
        ...     writer.writerows(table1)
        ...
        >>> # now demonstrate the use of fromcsv()
        ... table2 = etl.fromcsv('example.csv')
        >>> table2
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    The `source` argument is the path of the delimited file, all other
    keyword arguments are passed to :func:`csv.reader`. So, e.g., to
    override the delimiter from the default CSV dialect, provide the
    `delimiter` keyword argument.

    Note that all data values are strings, and any intended numeric values
    will need to be converted, see also
    :func:`petl.transform.conversions.convert`.

    """
    # default to the standard 'excel' dialect unless the caller overrides
    csvargs.setdefault('dialect', 'excel')
    resolved = read_source_from_arg(source)
    return fromcsv_impl(source=resolved, encoding=encoding, errors=errors,
                        header=header, **csvargs)
def fromcsv(source=None, dialect=csv.excel, **kwargs):
    """
    Wrapper for the standard :func:`csv.reader` function. Returns a table
    providing access to the data in the given delimited file. E.g.::

        >>> import csv
        >>> # set up a CSV file to demonstrate with
        ... with open('test.csv', 'wb') as f:
        ...     writer = csv.writer(f)
        ...     writer.writerow(['foo', 'bar'])
        ...     writer.writerow(['a', 1])
        ...     writer.writerow(['b', 2])
        ...     writer.writerow(['c', 2])
        ...
        >>> # now demonstrate the use of petl.fromcsv
        ... from petl import fromcsv, look
        >>> testcsv = fromcsv('test.csv')
        >>> look(testcsv)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | '1'   |
        +-------+-------+
        | 'b'   | '2'   |
        +-------+-------+
        | 'c'   | '2'   |
        +-------+-------+

    The `filename` argument is the path of the delimited file, all other
    keyword arguments are passed to :func:`csv.reader`. So, e.g., to
    override the delimiter from the default CSV dialect, provide the
    `delimiter` keyword argument.

    Note that all data values are strings, and any intended numeric values
    will need to be converted, see also :func:`convert`.

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    """
    return CSVView(source=read_source_from_arg(source), dialect=dialect,
                   **kwargs)
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.json')
        >>> table1
        +-----+-----+
        | bar | foo |
        +=====+=====+
        | 1   | 'a' |
        +-----+-----+
        | 2   | 'b' |
        +-----+-----+
        | 2   | 'c' |
        +-----+-----+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see
    also :func:`petl.io.json.fromdicts`.

    """
    resolved = read_source_from_arg(source)
    return JsonView(resolved, *args, **kwargs)
def fromxml(source, *args, **kwargs):
    """
    Extract data from an XML file. E.g.::

        >>> import petl as etl
        >>> # setup a file to demonstrate with
        ... d = '''<table>
        ...     <tr>
        ...         <td>foo</td><td>bar</td>
        ...     </tr>
        ...     <tr>
        ...         <td>a</td><td>1</td>
        ...     </tr>
        ...     <tr>
        ...         <td>b</td><td>2</td>
        ...     </tr>
        ...     <tr>
        ...         <td>c</td><td>2</td>
        ...     </tr>
        ... </table>'''
        >>> with open('example1.xml', 'w') as f:
        ...     f.write(d)
        ...
        212
        >>> table1 = etl.fromxml('example1.xml', 'tr', 'td')
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    If the data values are stored in an attribute, provide the attribute
    name as an extra positional argument, e.g.
    ``etl.fromxml('example2.xml', 'tr', 'td', 'v')``.

    Data values can also be extracted by providing a mapping of field names
    to element paths, e.g.
    ``etl.fromxml('example3.xml', 'row', {'foo': 'foo', 'bar': ('baz/bar', 'v')})``.
    If `lxml <http://lxml.de/>`_ is installed, full XPath expressions can
    be used.

    Note that the implementation is currently **not** streaming, i.e., the
    whole document is loaded into memory.

    If multiple elements match a given field, all values are reported as a
    tuple.

    If there is more than one element name used for row values, a tuple or
    list of paths can be provided, e.g.,
    ``fromxml('example.html', './/tr', ('th', 'td'))``.

    Optionally a custom parser can be provided, e.g.::

        >>> from lxml import etree # doctest: +SKIP
        ... my_parser = etree.XMLParser(resolve_entities=False) # doctest: +SKIP
        ... table4 = etl.fromxml('example1.xml', 'tr', 'td', parser=my_parser) # doctest: +SKIP

    """
    resolved = read_source_from_arg(source)
    return XmlView(resolved, *args, **kwargs)
def fromxml(source, *args, **kwargs):
    """
    Extract data from an XML file. E.g.::

        >>> import petl as etl
        >>> # setup a file to demonstrate with
        ... d = '''<table>
        ...     <tr>
        ...         <td>foo</td><td>bar</td>
        ...     </tr>
        ...     <tr>
        ...         <td>a</td><td>1</td>
        ...     </tr>
        ...     <tr>
        ...         <td>b</td><td>2</td>
        ...     </tr>
        ...     <tr>
        ...         <td>c</td><td>2</td>
        ...     </tr>
        ... </table>'''
        >>> with open('example1.xml', 'w') as f:
        ...     f.write(d)
        ...
        212
        >>> table1 = etl.fromxml('example1.xml', 'tr', 'td')
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | '1' |
        +-----+-----+
        | 'b' | '2' |
        +-----+-----+
        | 'c' | '2' |
        +-----+-----+

    If the data values are stored in an attribute, provide the attribute
    name as an extra positional argument, e.g.
    ``etl.fromxml('example2.xml', 'tr', 'td', 'v')``.

    Data values can also be extracted by providing a mapping of field names
    to element paths, e.g.
    ``etl.fromxml('example3.xml', 'row', {'foo': 'foo', 'bar': ('baz/bar', 'v')})``.

    Note that the implementation is currently *not* streaming, i.e., the
    whole document is loaded into memory.

    If multiple elements match a given field, all values are reported as a
    tuple.

    If there is more than one element name used for row values, a tuple or
    list of paths can be provided, e.g.,
    ``fromxml('example.html', './/tr', ('th', 'td'))``.

    """
    resolved = read_source_from_arg(source)
    return XmlView(resolved, *args, **kwargs)
def fromxml(source, *args, **kwargs):
    """
    Access data in an XML file. E.g.::

        >>> from petl import fromxml, look
        >>> data = \"\"\"<table>
        ...     <tr>
        ...         <td>foo</td><td>bar</td>
        ...     </tr>
        ...     <tr>
        ...         <td>a</td><td>1</td>
        ...     </tr>
        ...     <tr>
        ...         <td>b</td><td>2</td>
        ...     </tr>
        ...     <tr>
        ...         <td>c</td><td>2</td>
        ...     </tr>
        ... </table>\"\"\"
        >>> with open('example1.xml', 'w') as f:
        ...     f.write(data)
        ...     f.close()
        ...
        >>> table1 = fromxml('example1.xml', 'tr', 'td')
        >>> look(table1)
        +-------+-------+
        | 'foo' | 'bar' |
        +=======+=======+
        | 'a'   | '1'   |
        +-------+-------+
        | 'b'   | '2'   |
        +-------+-------+
        | 'c'   | '2'   |
        +-------+-------+

    If the data values are stored in an attribute, provide the attribute
    name as an extra positional argument, e.g.
    ``fromxml('example2.xml', 'tr', 'td', 'v')``.

    Data values can also be extracted by providing a mapping of field names
    to element paths, e.g.
    ``fromxml('example3.xml', 'row', {'foo': 'foo', 'bar': ('baz/bar', 'v')})``.

    Note that the implementation is currently *not* streaming, i.e., the
    whole document is loaded into memory.

    Supports transparent reading from URLs, ``.gz`` and ``.bz2`` files.

    .. versionadded:: 0.4

    .. versionchanged:: 0.6
        If multiple elements match a given field, all values are reported as
        a tuple.

    .. versionchanged:: 0.25
        If there is more than one element name used for row values, a tuple
        or list of paths can be provided, e.g.,
        ``fromxml('example.html', './/tr', ('th', 'td'))``.

    """
    resolved = read_source_from_arg(source)
    return XmlView(resolved, *args, **kwargs)
def fromjson(source, *args, **kwargs):
    """
    Extract data from a JSON file. The file must contain a JSON array as
    the top level object, and each member of the array will be treated as a
    row of data. E.g.::

        >>> import petl as etl
        >>> data = '''
        ... [{"foo": "a", "bar": 1},
        ... {"foo": "b", "bar": 2},
        ... {"foo": "c", "bar": 2}]
        ... '''
        >>> with open('example.file1.json', 'w') as f:
        ...     f.write(data)
        ...
        74
        >>> table1 = etl.fromjson('example.file1.json', header=['foo', 'bar'])
        >>> table1
        +-----+-----+
        | foo | bar |
        +=====+=====+
        | 'a' | 1   |
        +-----+-----+
        | 'b' | 2   |
        +-----+-----+
        | 'c' | 2   |
        +-----+-----+

    Setting argument `lines` to `True` will enable to infer the document as
    a JSON lines document. For more details about JSON lines please visit
    https://jsonlines.org/.

        >>> import petl as etl
        >>> data_with_jlines = '''{"name": "Gilbert", "wins": [["straight", "7S"], ["one pair", "10H"]]}
        ... {"name": "Alexa", "wins": [["two pair", "4S"], ["two pair", "9S"]]}
        ... {"name": "May", "wins": []}
        ... {"name": "Deloise", "wins": [["three of a kind", "5S"]]}'''
        ...
        >>> with open('example.file2.json', 'w') as f:
        ...     f.write(data_with_jlines)
        ...
        223
        >>> table2 = etl.fromjson('example.file2.json', lines=True)
        >>> table2
        +-----------+-------------------------------------------+
        | name      | wins                                      |
        +===========+===========================================+
        | 'Gilbert' | [['straight', '7S'], ['one pair', '10H']] |
        +-----------+-------------------------------------------+
        | 'Alexa'   | [['two pair', '4S'], ['two pair', '9S']]  |
        +-----------+-------------------------------------------+
        | 'May'     | []                                        |
        +-----------+-------------------------------------------+
        | 'Deloise' | [['three of a kind', '5S']]               |
        +-----------+-------------------------------------------+

    If your JSON file does not fit this structure, you will need to parse it
    via :func:`json.load` and select the array to treat as the data, see
    also :func:`petl.io.json.fromdicts`.

    .. versionchanged:: 1.1.0
        If no `header` is specified, fields will be discovered by sampling
        keys from the first `sample` objects in `source`. The header will be
        constructed from keys in the order discovered. Note that this
        ordering may not be stable, and therefore it may be advisable to
        specify an explicit `header` or to use another function like
        :func:`petl.transform.headers.sortheader` on the resulting table to
        guarantee stability.

    """
    resolved = read_source_from_arg(source)
    return JsonView(resolved, *args, **kwargs)
def fromavro(source, limit=None, skip=0, **avro_args):
    """Extract a table from the records of an avro file.

    The `source` argument (string or file-like or fastavro.reader) can
    either be the path of the file, a file-like input stream or a instance
    from fastavro.reader.

    The `limit` and `skip` arguments can be used to limit the range of rows
    to extract.

    The `sample` argument (int, optional) defines how many rows are
    inspected for discovering the field types and building a schema for the
    avro file when the `schema` argument is not passed.

    The rows fields read from file can have scalar values like int, string,
    float, datetime, date and decimal but can also have compound types like
    enum, :ref:`array <array_schema>`, map, union and record. The fields
    types can also have recursive structures defined in
    :ref:`complex schemas <complex_schema>`.

    Also types with :ref:`logical types <logical_schema>` types are read
    and translated to corresponding python types: long timestamp-millis and
    long timestamp-micros: datetime.datetime, int date: datetime.date,
    bytes decimal and fixed decimal: Decimal, int time-millis and long
    time-micros: datetime.time.

    Example usage for reading files::

        >>> # set up a Avro file to demonstrate with
        ...
        >>> schema1 = {
        ...     'doc': 'Some people records.',
        ...     'name': 'People',
        ...     'namespace': 'test',
        ...     'type': 'record',
        ...     'fields': [
        ...         {'name': 'name', 'type': 'string'},
        ...         {'name': 'friends', 'type': 'int'},
        ...         {'name': 'age', 'type': 'int'},
        ...     ]
        ... }
        ...
        >>> records1 = [
        ...     {'name': 'Bob', 'friends': 42, 'age': 33},
        ...     {'name': 'Jim', 'friends': 13, 'age': 69},
        ...     {'name': 'Joe', 'friends': 86, 'age': 17},
        ...     {'name': 'Ted', 'friends': 23, 'age': 51}
        ... ]
        ...
        >>> import fastavro
        >>> parsed_schema1 = fastavro.parse_schema(schema1)
        >>> with open('example-file-to-read.avro', 'wb') as f1:
        ...     fastavro.writer(f1, parsed_schema1, records1)
        ...
        >>> # now demonstrate the use of fromavro()
        >>> import petl as etl
        >>> tbl1 = etl.fromavro('example-file-to-read.avro')
        >>> tbl1
        +-------+---------+-----+
        | name  | friends | age |
        +=======+=========+=====+
        | 'Bob' | 42      | 33  |
        +-------+---------+-----+
        | 'Jim' | 13      | 69  |
        +-------+---------+-----+
        | 'Joe' | 86      | 17  |
        +-------+---------+-----+
        | 'Ted' | 23      | 51  |
        +-------+---------+-----+

    .. versionadded:: 1.4.0

    """
    avro_source = read_source_from_arg(source)
    return AvroView(source=avro_source, limit=limit, skip=skip, **avro_args)