def __iter__(self):
    # prefer implementation using xlutils.view as dates are automatically
    # converted
    if self.use_view:
        try:
            import xlutils.view
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message_utils)
        else:
            wb = xlutils.view.View(self.filename)
            if self.sheet is None:
                ws = wb[0]
            else:
                ws = wb[self.sheet]
            return (tuple(row) for row in ws)
    else:
        try:
            import xlrd
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)
        else:
            with xlrd.open_workbook(filename=self.filename,
                                    on_demand=True) as wb:
                if self.sheet is None:
                    ws = wb.sheet_by_index(0)
                elif isinstance(self.sheet, int):
                    ws = wb.sheet_by_index(self.sheet)
                else:
                    ws = wb.sheet_by_name(str(self.sheet))
                return (tuple(ws.row_values(rownum))
                        for rownum in range(ws.nrows))
def tupletrees(table, facet, start='start', stop='stop', value=None):
    """
    Construct faceted interval trees for the given table, where each node in
    the tree is a row of the table.

    """

    try:
        import bx.intervals
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    it = iter(table)
    fields = it.next()
    assert start in fields, 'start field not recognised'
    assert stop in fields, 'stop field not recognised'
    getstart = itemgetter(fields.index(start))
    getstop = itemgetter(fields.index(stop))
    if value is None:
        getvalue = tuple
    else:
        valueindices = asindices(fields, value)
        assert len(valueindices) > 0, 'invalid value field specification'
        getvalue = itemgetter(*valueindices)
    keyindices = asindices(fields, facet)
    assert len(keyindices) > 0, 'invalid key'
    getkey = itemgetter(*keyindices)

    trees = dict()
    for row in it:
        k = getkey(row)
        if k not in trees:
            trees[k] = bx.intervals.intersection.IntervalTree()
        trees[k].add(getstart(row), getstop(row), getvalue(row))
    return trees
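# Illustrative usage sketch (not part of the original source): build faceted
# interval trees from a small table and query one facet. The field and facet
# names below are assumptions chosen for the example; requires bx-python.
example = [['chrom', 'start', 'stop', 'name'],
           ['chr1', 10, 20, 'a'],
           ['chr1', 15, 30, 'b'],
           ['chr2', 5, 12, 'c']]
trees = tupletrees(example, 'chrom')
hits = trees['chr1'].find(12, 18)  # rows on chr1 overlapping [12, 18)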
def __iter__(self):
    try:
        import vcf as pyvcf
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    reader = pyvcf.Reader(filename=self.filename)

    # determine header
    if isinstance(self.samples, (list, tuple)):
        # specific samples requested
        yield fixed_fields + tuple(self.samples)
    elif self.samples:
        # all samples
        yield fixed_fields + tuple(reader.samples)
    else:
        # no samples
        yield fixed_fields

    # fetch region?
    if None not in {self.chrom, self.start}:
        it = reader.fetch(self.chrom, self.start, self.end)
    else:
        it = reader

    # yield data
    for rec in it:
        out = tuple(getattr(rec, f) for f in fixed_fields)
        if isinstance(self.samples, (list, tuple)):
            # specific samples requested
            out += tuple(rec.genotype(s) for s in self.samples)
        elif self.samples:
            # all samples
            out += tuple(rec.samples)
        yield out
def todataframe(table, index=None, exclude=None, columns=None,
                coerce_float=False, nrows=None):
    """
    Convenience function to load data from the given `table` into a pandas
    DataFrame.

    .. versionadded:: 0.14

    """

    try:
        import pandas as pd
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        l = list(table)
        data = l[1:]
        if columns is None:
            columns = l[0]
        return pd.DataFrame.from_records(data, index=index, exclude=exclude,
                                         columns=columns,
                                         coerce_float=coerce_float,
                                         nrows=nrows)
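# Illustrative usage sketch (not part of the original source): load a small
# petl-style table into a DataFrame; the data are arbitrary example values.
tbl = [['foo', 'bar'], ['a', 1], ['b', 2]]
df = todataframe(tbl)
# df has columns 'foo' and 'bar' and two rows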
def display(tbl, limit=None, **kwargs):
    """
    Display a table inline within an IPython notebook. E.g.::

        In [0]: from petlx.ipython import display
                tbl = [['foo', 'bar'], ['a', 1], ['b', 2]]
                display(tbl)

    Alternatively, using the fluent style::

        In [0]: import petl.interactive as etl
                import petlx.ipython
                tbl = etl.wrap([['foo', 'bar'], ['a', 1], ['b', 2]])
                tbl.display()

    .. versionadded:: 0.5

    """

    try:
        from IPython.core.display import display_html
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        html = repr_html(tbl, limit=limit, **kwargs)
        display_html(html, raw=True)
def _get_hdf5_table(source, where, name, mode='r'):
    try:
        import tables
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    # allow for polymorphic args
    if isinstance(source, tables.Table):
        h5file = None
        h5tbl = source
    else:
        if isinstance(source, basestring):
            # assume it's the name of an HDF5 file
            h5file = tables.openFile(source, mode=mode)
        elif isinstance(source, tables.File):
            h5file = source
        else:
            raise Exception('invalid source argument, expected file name or '
                            'tables.File or tables.Table object, found: %r'
                            % source)
        h5tbl = h5file.getNode(where, name=name)
        assert isinstance(h5tbl, tables.Table), \
            'node is not a table: %r' % h5tbl
    return h5file, h5tbl
def unpackcall(tbl, *keys, **kwargs):
    """
    Unpack the call column. E.g.::

        >>> from petlx.vcf import fromvcf, unpackinfo, meltsamples, unpackcall
        >>> from petl import look, cutout
        >>> t1 = fromvcf('../fixture/sample.vcf')
        >>> t2 = meltsamples(t1)
        >>> t3 = unpackcall(t2)
        >>> t4 = cutout(t3, 'INFO')
        >>> look(t4)
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | 'CHROM' | 'POS' | 'ID'        | 'REF' | 'ALT' | 'QUAL' | 'FILTER' | 'SAMPLE'  | 'GT'  | 'GQ' | 'DP' | 'HQ'         |
        +=========+=======+=============+=======+=======+========+==========+===========+=======+======+======+==============+
        | '19'    | 111   | None        | 'A'   | [C]   | 9.6    | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 111   | None        | 'A'   | [C]   | 9.6    | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 111   | None        | 'A'   | [C]   | 9.6    | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 112   | None        | 'A'   | [G]   | 10     | []       | 'NA00001' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 112   | None        | 'A'   | [G]   | 10     | []       | 'NA00002' | '0|0' | None | None | [10, 10]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '19'    | 112   | None        | 'A'   | [G]   | 10     | []       | 'NA00003' | '0/1' | None | None | [3, 3]       |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   | 29     | []       | 'NA00001' | '0|0' | 48   | 1    | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   | 29     | []       | 'NA00002' | '1|0' | 48   | 8    | [51, 51]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 14370 | 'rs6054257' | 'G'   | [A]   | 29     | []       | 'NA00003' | '1/1' | 43   | 5    | [None, None] |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+
        | '20'    | 17330 | None        | 'T'   | [A]   | 3      | ['q10']  | 'NA00001' | '0|0' | 49   | 3    | [58, 50]     |
        +---------+-------+-------------+-------+-------+--------+----------+-----------+-------+------+------+--------------+

    .. versionadded:: 0.5

    """

    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all FORMAT
            keys = reader.formats.keys()
        else:
            # enable sampling of keys from data
            tbl = convert(tbl, 'CALL', lambda v: v.data._asdict())
    result = unpackdict(tbl, 'CALL', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result
def __init__(self, tree=None, proximity=0):
    try:
        import bx.intervals
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    if tree is None:
        self.tree = bx.intervals.intersection.IntervalTree()
    else:
        self.tree = tree
    self.proximity = proximity
def torecarray(*args, **kwargs):
    """
    Convenient shorthand for ``toarray(...).view(np.recarray)``.

    .. versionadded:: 0.5.1

    """

    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        return toarray(*args, **kwargs).view(np.recarray)
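# Illustrative usage sketch (not part of the original source): columns become
# attributes on the returned numpy record array; the data are example values.
tbl = [['foo', 'bar'], ['a', 1], ['b', 2]]
ra = torecarray(tbl)
bar = ra.bar  # e.g. array([1, 2])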
def guessdtype(table):
    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        # get numpy to infer dtype
        it = iter(table)
        fields = it.next()
        rows = tuple(it)
        dtype = np.rec.array(rows).dtype
        dtype.names = fields
        return dtype
def iterindex(index_or_dirname, indexname, docnum_field):
    try:
        import whoosh
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        if isinstance(index_or_dirname, basestring):
            dirname = index_or_dirname
            index = whoosh.index.open_dir(dirname, indexname=indexname,
                                          readonly=True)
            needs_closing = True
        elif isinstance(index_or_dirname, whoosh.index.Index):
            index = index_or_dirname
            needs_closing = False
        else:
            raise Exception('expected string or index, found %r'
                            % index_or_dirname)

        try:

            if docnum_field is None:

                # figure out the field names
                fields = tuple(index.schema.stored_names())
                yield fields

                # yield all documents
                astuple = operator.itemgetter(*index.schema.stored_names())
                for _, stored_fields_dict in index.reader().iter_docs():
                    yield astuple(stored_fields_dict)

            else:

                # figure out the field names
                fields = (docnum_field, ) + tuple(index.schema.stored_names())
                yield fields

                # yield all documents
                astuple = operator.itemgetter(*index.schema.stored_names())
                for docnum, stored_fields_dict in index.reader().iter_docs():
                    yield (docnum, ) + astuple(stored_fields_dict)

        except:
            raise
        finally:
            if needs_closing:
                # close the index if we're the ones who opened it
                index.close()
def valuestoarray(vals, dtype=None, count=-1, sample=1000):
    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:
        it = iter(vals)
        if dtype is None:
            peek, it = iterpeek(it, sample)
            dtype = np.array(peek).dtype
        a = np.fromiter(it, dtype=dtype, count=count)
        return a
def toxlsx(tbl, filename, sheet=None, encoding='utf-8'):
    """
    Write a table to a new Excel (.xlsx) file.

    .. versionadded:: 0.15

    """

    try:
        import openpyxl
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    wb = openpyxl.Workbook(optimized_write=True, encoding=encoding)
    ws = wb.create_sheet(title=sheet)
    for row in tbl:
        ws.append(row)
    wb.save(filename)
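# Illustrative usage sketch (not part of the original source): write a small
# table to a new .xlsx file; the file and sheet names are arbitrary examples.
tbl = [['foo', 'bar'], ['a', 1], ['b', 2]]
toxlsx(tbl, 'example.xlsx', sheet='Sheet1')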
def toxls(tbl, filename, sheet, encoding='ascii', style_compression=0,
          styles=None):
    """
    Write a table to a new Excel (.xls) file.

    .. versionadded:: 0.15

    """

    try:
        import xlwt
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message_write)
    else:
        wb = xlwt.Workbook(encoding=encoding,
                           style_compression=style_compression)
        ws = wb.add_sheet(sheet)

        if styles is None:
            # simple version, don't worry about styles
            for r, row in enumerate(tbl):
                for c, label in enumerate(row):
                    ws.write(r, c, label=label)
        else:
            # handle styles
            it = iter(tbl)
            fields = it.next()
            for c, label in enumerate(fields):
                ws.write(0, c, label=label)
                if label not in styles:
                    styles[label] = xlwt.Style.default_style
            # convert to list for easy zipping
            styles = [styles[f] for f in fields]
            for r, row in enumerate(it):
                for c, (label, style) in enumerate(
                        izip_longest(row, styles, fillvalue=None)):
                    if style is None:
                        style = xlwt.Style.default_style
                    ws.write(r + 1, c, label=label, style=style)

        wb.save(filename)
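# Illustrative usage sketch (not part of the original source): write to .xls,
# applying a bold xlwt style to the 'foo' column; the file and sheet names are
# arbitrary examples.
import xlwt
tbl = [['foo', 'bar'], ['a', 1], ['b', 2]]
toxls(tbl, 'example.xls', 'Sheet1',
      styles={'foo': xlwt.easyxf('font: bold on')})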
def recordtree(table, start='start', stop='stop'):
    """
    Construct an interval tree for the given table, where each node in the
    tree is a row of the table represented as a hybrid tuple/dictionary-style
    record object.

    """

    try:
        import bx.intervals
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    getstart = attrgetter(start)
    getstop = attrgetter(stop)

    tree = bx.intervals.intersection.IntervalTree()
    for rec in records(table):
        tree.add(getstart(rec), getstop(rec), rec)
    return tree
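# Illustrative usage sketch (not part of the original source): query the tree
# for rows overlapping an interval; the field names follow the defaults and
# the data are arbitrary example values.
tbl = [['start', 'stop', 'name'], [10, 20, 'a'], [15, 30, 'b']]
tree = recordtree(tbl)
hits = tree.find(12, 18)  # record objects overlapping [12, 18)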
def __iter__(self):
    try:
        from pysam import Tabixfile, asTuple
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    f = Tabixfile(self.filename, mode='r')
    try:

        # header row
        if self.header is not None:
            yield self.header
        else:
            # assume last header line has fields
            h = list(f.header)
            if len(h) > 0:
                yield tuple(h[-1].split('\t'))

        # data rows
        for row in f.fetch(reference=self.reference, start=self.start,
                           end=self.end, region=self.region,
                           parser=asTuple()):
            yield tuple(row)

    except:
        raise
    finally:
        f.close()
def make_sqlalchemy_table(table, tablename, schema=None, constraints=True,
                          metadata=None):
    """
    Create an SQLAlchemy table based on a :mod:`petl` table.

    Parameters
    ----------

    table : sequence of sequences (petl table)
        Table data to use to infer types etc.
    tablename : string
        Name of the table
    schema : string
        Name of the database schema to create the table in
    constraints : bool
        If True use length and nullable constraints
    metadata : sqlalchemy.MetaData
        Custom table metadata

    """

    try:
        import sqlalchemy
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    if not metadata:
        metadata = sqlalchemy.MetaData()

    sql_table = sqlalchemy.Table(tablename, metadata, schema=schema)

    fields = header(table)
    cols = columns(table)

    for f in fields:
        sql_column = make_sqlalchemy_column(cols[f], f,
                                            constraints=constraints)
        sql_table.append_column(sql_column)

    return sql_table
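# Illustrative usage sketch (not part of the original source): build a table
# definition from example data and create it on an in-memory SQLite engine
# (the engine URL is an assumption chosen for the example).
import sqlalchemy
tbl = [['foo', 'bar'], ['a', 1], ['b', 2]]
sql_table = make_sqlalchemy_table(tbl, 'example')
engine = sqlalchemy.create_engine('sqlite://')
sql_table.metadata.create_all(engine)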
def __iter__(self):
    try:
        import openpyxl
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    use_iterators = self.range is None
    wb = openpyxl.load_workbook(filename=self.filename,
                                use_iterators=use_iterators,
                                **self.kwargs)
    if self.sheet is None:
        ws = wb.get_sheet_by_name(wb.get_sheet_names()[0])
    elif isinstance(self.sheet, int):
        ws = wb.get_sheet_by_name(wb.get_sheet_names()[self.sheet])
    else:
        ws = wb.get_sheet_by_name(str(self.sheet))
    if self.range is not None:
        return (tuple(cell.value for cell in row)
                for row in ws.range(self.range))
    else:
        return (tuple(cell.value for cell in row)
                for row in ws.iter_rows())
def recordtrees(table, facet, start='start', stop='stop'):
    """
    Construct faceted interval trees for the given table, where each node in
    the tree is a row of the table represented as a hybrid
    tuple/dictionary-style record object.

    """

    try:
        import bx.intervals
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    getstart = attrgetter(start)
    getstop = attrgetter(stop)
    getkey = attrgetter(facet)

    trees = dict()
    for rec in records(table):
        k = getkey(rec)
        if k not in trees:
            trees[k] = bx.intervals.intersection.IntervalTree()
        trees[k].add(getstart(rec), getstop(rec), rec)
    return trees
def make_create_table_statement(table, tablename, schema=None,
                                constraints=True, metadata=None,
                                dialect=None):
    """
    Generate a CREATE TABLE statement based on a :mod:`petl` table.

    Parameters
    ----------

    table : sequence of sequences (petl table)
        Table data to use to infer types etc.
    tablename : string
        Name of the table
    schema : string
        Name of the database schema to create the table in
    constraints : bool
        If True use length and nullable constraints
    metadata : sqlalchemy.MetaData
        Custom table metadata
    dialect : string
        One of {'access', 'sybase', 'sqlite', 'informix', 'firebird', 'mysql',
        'oracle', 'maxdb', 'postgresql', 'mssql'}

    """

    try:
        import sqlalchemy
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    sql_table = make_sqlalchemy_table(table, tablename, schema=schema,
                                      constraints=constraints,
                                      metadata=metadata)

    if dialect:
        module = __import__('sqlalchemy.dialects.%s' % DIALECTS[dialect],
                            fromlist=['dialect'])
        sql_dialect = module.dialect()
    else:
        sql_dialect = None

    return unicode(sqlalchemy.schema.CreateTable(sql_table)
                   .compile(dialect=sql_dialect)).strip() + ';'
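# Illustrative usage sketch (not part of the original source): generate DDL
# for example data, targeting the PostgreSQL dialect.
tbl = [['foo', 'bar'], ['a', 1], ['b', 2]]
ddl = make_create_table_statement(tbl, 'example', dialect='postgresql')
# ddl is a unicode string ending in ';', e.g. 'CREATE TABLE example (...);'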
def make_sqlalchemy_column(col, colname, constraints=True):
    """
    Infer an appropriate SQLAlchemy column type based on a sequence of values.

    Parameters
    ----------

    col : sequence
        A sequence of values to use to infer type, length etc.
    colname : string
        Name of column
    constraints : bool
        If True use length and nullable constraints

    """

    try:
        import sqlalchemy
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)

    col_not_none = [v for v in col if v is not None]
    sql_column_kwargs = {}
    sql_type_kwargs = {}

    if len(col_not_none) == 0:
        sql_column_type = sqlalchemy.String
        if constraints:
            sql_type_kwargs['length'] = NULL_COLUMN_MAX_LENGTH

    elif all(isinstance(v, bool) for v in col_not_none):
        sql_column_type = sqlalchemy.Boolean

    elif all(isinstance(v, int) for v in col_not_none):
        if (max(col_not_none) > SQL_INTEGER_MAX
                or min(col_not_none) < SQL_INTEGER_MIN):
            sql_column_type = sqlalchemy.BigInteger
        else:
            sql_column_type = sqlalchemy.Integer

    elif all(isinstance(v, long) for v in col_not_none):
        sql_column_type = sqlalchemy.BigInteger

    elif all(isinstance(v, (int, long)) for v in col_not_none):
        sql_column_type = sqlalchemy.BigInteger

    elif all(isinstance(v, (int, long, float)) for v in col_not_none):
        sql_column_type = sqlalchemy.Float

    elif all(isinstance(v, datetime.date) for v in col_not_none):
        sql_column_type = sqlalchemy.Date

    elif all(isinstance(v, datetime.time) for v in col_not_none):
        sql_column_type = sqlalchemy.Time

    elif all(isinstance(v, datetime.datetime) for v in col_not_none):
        sql_column_type = sqlalchemy.DateTime

    else:
        sql_column_type = sqlalchemy.String
        if constraints:
            sql_type_kwargs['length'] = max([len(unicode(v)) for v in col])

    if constraints:
        sql_column_kwargs['nullable'] = len(col_not_none) < len(col)

    return sqlalchemy.Column(colname, sql_column_type(**sql_type_kwargs),
                             **sql_column_kwargs)
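# Illustrative usage sketch (not part of the original source): infer a column
# type from a sequence of example values; the None makes the column nullable.
col = make_sqlalchemy_column([1, 2, None], 'baz')
# -> an Integer column named 'baz' with nullable=True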
def tohdf5(table, source, where=None, name=None, create=False,
           description=None, title='', filters=None, expectedrows=10000,
           chunkshape=None, byteorder=None, createparents=False,
           sample=1000):
    """
    Write to an HDF5 table. If `create` is `False`, assumes the table already
    exists, and attempts to truncate it before loading. If `create` is `True`,
    any existing table is dropped, and a new table is created; if
    `description` is None, the datatype will be guessed. E.g.::

        >>> from petl import look
        >>> look(table1)
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

        >>> from petlx.hdf5 import tohdf5, fromhdf5
        >>> tohdf5(table1, 'test1.h5', '/testgroup', 'testtable', create=True, createparents=True)
        >>> look(fromhdf5('test1.h5', '/testgroup', 'testtable'))
        +-------+----------+
        | 'foo' | 'bar'    |
        +=======+==========+
        | 1     | 'asdfgh' |
        +-------+----------+
        | 2     | 'qwerty' |
        +-------+----------+
        | 3     | 'zxcvbn' |
        +-------+----------+

    See also :func:`appendhdf5`.

    .. versionadded:: 0.3

    """

    it = iter(table)

    if create:

        try:
            import tables
        except ImportError as e:
            raise UnsatisfiedDependency(e, dep_message)

        if isinstance(source, basestring):
            # assume it's the name of an HDF5 file
            h5file = tables.openFile(source, mode='a')  # don't replace the whole file!
        elif isinstance(source, tables.File):
            h5file = source
        else:
            raise Exception('invalid source argument, expected file name or '
                            'tables.File, found: %r' % source)

        # determine datatype
        if description is None:
            peek, it = iterpeek(it, sample)
            # use a numpy dtype
            description = guessdtype(peek)

        # check if the table node already exists
        try:
            h5table = h5file.getNode(where, name)
        except tables.NoSuchNodeError:
            pass
        else:
            # drop the node
            h5file.removeNode(where, name)

        # create the table
        h5table = h5file.createTable(where, name, description,
                                     title=title,
                                     filters=filters,
                                     expectedrows=expectedrows,
                                     chunkshape=chunkshape,
                                     byteorder=byteorder,
                                     createparents=createparents)

    else:
        h5file, h5table = _get_hdf5_table(source, where, name, mode='a')

    try:
        # truncate the existing table
        h5table.truncate(0)

        # load the data
        _insert(it, h5table)

    finally:
        if isinstance(source, basestring):
            # close the file if we opened it here
            h5file.close()
def toindex(tbl, index_or_dirname, schema=None, indexname=None, merge=False,
            optimize=False):
    """
    Load all rows from `tbl` into a Whoosh index. N.B., this will clear any
    existing data in the index before loading. E.g.::

        >>> from petl import look
        >>> from petlx.index import toindex, fromindex
        >>> # here is the table we want to load into an index
        ... look(tbl)
        +--------+------+------+-------+--------------------------------------------------+
        | 'f0'   | 'f1' | 'f2' | 'f3'  | 'f4'                                             |
        +========+======+======+=======+==================================================+
        | u'AAA' | 12   | 4.3  | True  | datetime.datetime(2014, 6, 30, 14, 7, 2, 333199) |
        +--------+------+------+-------+--------------------------------------------------+
        | u'BBB' | 6    | 3.4  | False | datetime.datetime(1900, 1, 31, 0, 0)             |
        +--------+------+------+-------+--------------------------------------------------+
        | u'CCC' | 42   | 7.8  | True  | datetime.datetime(2100, 12, 25, 0, 0)            |
        +--------+------+------+-------+--------------------------------------------------+

        >>> # define a schema for the index
        ... from whoosh.fields import *
        >>> schema = Schema(f0=TEXT(stored=True),
        ...                 f1=NUMERIC(int, stored=True),
        ...                 f2=NUMERIC(float, stored=True),
        ...                 f3=BOOLEAN(stored=True),
        ...                 f4=DATETIME(stored=True))
        >>> # load data
        ... toindex(tbl, 'tmp/example', schema=schema)
        >>> # look what it did
        ... look(fromindex('tmp/example'))
        +--------+------+------+-------+--------------------------------------------------+
        | 'f0'   | 'f1' | 'f2' | 'f3'  | 'f4'                                             |
        +========+======+======+=======+==================================================+
        | u'AAA' | 12   | 4.3  | True  | datetime.datetime(2014, 6, 30, 14, 7, 2, 333199) |
        +--------+------+------+-------+--------------------------------------------------+
        | u'BBB' | 6    | 3.4  | False | datetime.datetime(1900, 1, 31, 0, 0)             |
        +--------+------+------+-------+--------------------------------------------------+
        | u'CCC' | 42   | 7.8  | True  | datetime.datetime(2100, 12, 25, 0, 0)            |
        +--------+------+------+-------+--------------------------------------------------+

    .. versionadded:: 0.16

    Parameters
    ----------

    tbl
        A table-like object (row container) containing the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are
        stored in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    """

    try:
        import whoosh
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        # deal with polymorphic argument
        if isinstance(index_or_dirname, basestring):
            dirname = index_or_dirname
            index = whoosh.index.create_in(dirname, schema,
                                           indexname=indexname)
            needs_closing = True
        elif isinstance(index_or_dirname, whoosh.index.Index):
            index = index_or_dirname
            needs_closing = False
        else:
            raise Exception('expected string or index, found %r'
                            % index_or_dirname)

        writer = index.writer()
        try:
            for d in dicts(tbl):
                writer.add_document(**d)
            writer.commit(merge=merge, optimize=optimize,
                          mergetype=whoosh.writing.CLEAR)
        except:
            writer.cancel()
            raise
        finally:
            if needs_closing:
                index.close()
def appendindex(tbl, index_or_dirname, indexname=None, merge=True,
                optimize=False):
    """
    Load all rows from `tbl` into a Whoosh index, adding them to any existing
    data in the index.

    .. versionadded:: 0.16

    Parameters
    ----------

    tbl
        A table-like object (row container) containing the data to be loaded.
    index_or_dirname
        Either an instance of `whoosh.index.Index` or a string containing the
        directory path where the index is to be stored.
    indexname
        String containing the name of the index, if multiple indexes are
        stored in the same directory.
    merge
        Merge small segments during commit?
    optimize
        Merge all segments together?

    """

    try:
        import whoosh
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        # deal with polymorphic argument
        if isinstance(index_or_dirname, basestring):
            dirname = index_or_dirname
            index = whoosh.index.open_dir(dirname, indexname=indexname,
                                          readonly=False)
            needs_closing = True
        elif isinstance(index_or_dirname, whoosh.index.Index):
            index = index_or_dirname
            needs_closing = False
        else:
            raise Exception('expected string or index, found %r'
                            % index_or_dirname)

        writer = index.writer()
        try:
            for d in dicts(tbl):
                writer.add_document(**d)
            writer.commit(merge=merge, optimize=optimize)
        except Exception as e:
            writer.cancel()
            raise
        finally:
            if needs_closing:
                index.close()
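# Illustrative usage sketch (not part of the original source): append rows to
# an index previously created as in toindex() above; the directory path and
# field names are arbitrary examples and must correspond to the index schema.
more = [['f0', 'f1'], [u'DDD', 99]]
appendindex(more, 'tmp/example')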
def toarray(table, dtype=None, count=-1, sample=1000):
    """
    Convenience function to load data from the given `table` into a numpy
    structured array. E.g.::

        >>> from petl import look
        >>> from petlx.array import toarray
        >>> look(table)
        +-----------+-------+-------+
        | 'foo'     | 'bar' | 'baz' |
        +===========+=======+=======+
        | 'apples'  | 1     | 2.5   |
        +-----------+-------+-------+
        | 'oranges' | 3     | 4.4   |
        +-----------+-------+-------+
        | 'pears'   | 7     | 0.1   |
        +-----------+-------+-------+

        >>> a = toarray(table)
        >>> a
        array([('apples', 1, 2.5), ('oranges', 3, 4.4), ('pears', 7, 0.1)],
              dtype=[('foo', '|S7'), ('bar', '<i8'), ('baz', '<f8')])
        >>> a['foo']
        array(['apples', 'oranges', 'pears'],
              dtype='|S7')
        >>> a['bar']
        array([1, 3, 7])
        >>> a['baz']
        array([ 2.5,  4.4,  0.1])
        >>> a['foo'][0]
        'apples'
        >>> a['bar'][1]
        3
        >>> a['baz'][2]
        0.10000000000000001

    If no datatype is specified, `sample` rows will be examined to infer an
    appropriate datatype for each field.

    The datatype can be specified as a string, e.g.:

        >>> a = toarray(table, dtype='a4, i2, f4')
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.400000095367432),
               ('pear', 7, 0.10000000149011612)],
              dtype=[('foo', '|S4'), ('bar', '<i2'), ('baz', '<f4')])

    The datatype can also be partially specified, in which case datatypes will
    be inferred for other fields, e.g.:

        >>> a = toarray(table, dtype={'foo': 'a4'})
        >>> a
        array([('appl', 1, 2.5), ('oran', 3, 4.4), ('pear', 7, 0.1)],
              dtype=[('foo', '|S4'), ('bar', '<i8'), ('baz', '<f8')])

    """

    try:
        import numpy as np
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        it = iter(table)
        peek, it = iterpeek(it, sample)
        fields = it.next()

        if dtype is None:
            dtype = guessdtype(peek)

        elif isinstance(dtype, basestring):
            # insert field names from source table
            typestrings = [s.strip() for s in dtype.split(',')]
            dtype = [(f, t) for f, t in zip(fields, typestrings)]

        elif (isinstance(dtype, dict)
              and ('names' not in dtype or 'formats' not in dtype)):
            # allow for partial specification of dtype
            cols = columns(peek)
            newdtype = {'names': [], 'formats': []}
            for f in fields:
                newdtype['names'].append(f)
                if f in dtype and isinstance(dtype[f], tuple):
                    # assume fully specified
                    newdtype['formats'].append(dtype[f][0])
                elif f not in dtype:
                    # not specified at all
                    a = np.array(cols[f])
                    newdtype['formats'].append(a.dtype)
                else:
                    # assume directly specified, just need to add offset
                    newdtype['formats'].append(dtype[f])
            dtype = newdtype

        else:
            pass  # leave dtype as-is

        # numpy is fussy about having tuples, need to make sure
        it = (tuple(row) for row in it)
        sa = np.fromiter(it, dtype=dtype, count=count)

        return sa
def itersearchindex(index_or_dirname, query, limit, pagenum, pagelen,
                    indexname, docnum_field, score_field, fieldboosts,
                    search_kwargs):
    try:
        import whoosh
        import whoosh.qparser
    except ImportError as e:
        raise UnsatisfiedDependency(e, dep_message)
    else:

        if isinstance(index_or_dirname, basestring):
            dirname = index_or_dirname
            index = whoosh.index.open_dir(dirname, indexname=indexname,
                                          readonly=True)
            needs_closing = True
        elif isinstance(index_or_dirname, whoosh.index.Index):
            index = index_or_dirname
            needs_closing = False
        else:
            raise Exception('expected string or index, found %r'
                            % index_or_dirname)

        try:

            # figure out header
            fields = tuple()
            if docnum_field is not None:
                fields += (docnum_field, )
            if score_field is not None:
                fields += (score_field, )
            stored_names = tuple(index.schema.stored_names())
            fields += stored_names
            yield fields

            # parse the query
            if isinstance(query, basestring):
                # search all fields by default
                parser = whoosh.qparser.MultifieldParser(
                    index.schema.names(), index.schema,
                    fieldboosts=fieldboosts)
                query = parser.parse(query)
            elif isinstance(query, whoosh.query.Query):
                pass
            else:
                raise Exception(
                    'expected string or whoosh.query.Query, found %r' % query)

            # make a function to turn docs into tuples
            astuple = operator.itemgetter(*index.schema.stored_names())

            with index.searcher() as searcher:
                if limit is not None:
                    results = searcher.search(query, limit=limit,
                                              **search_kwargs)
                else:
                    results = searcher.search_page(query, pagenum,
                                                   pagelen=pagelen,
                                                   **search_kwargs)

                if docnum_field is None and score_field is None:

                    for doc in results:
                        yield astuple(doc)

                else:

                    for (docnum, score), doc in itertools.izip(
                            results.items(), results):
                        row = tuple()
                        if docnum_field is not None:
                            row += (docnum, )
                        if score_field is not None:
                            row += (score, )
                        row += astuple(doc)
                        yield row

        except:
            raise
        finally:
            if needs_closing:
                # close the index if we're the ones who opened it
                index.close()
def unpackinfo(tbl, *keys, **kwargs):
    """
    Unpack the INFO field into separate fields. E.g.::

        >>> from petlx.vcf import fromvcf, unpackinfo
        >>> from petl import look
        >>> t1 = fromvcf('../fixture/sample.vcf', samples=False)
        >>> look(t1)
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | 'CHROM' | 'POS'   | 'ID'        | 'REF' | 'ALT'     | 'QUAL' | 'FILTER' | 'INFO'                                                                                  |
        +=========+=========+=============+=======+===========+========+==========+=========================================================================================+
        | '19'    | 111     | None        | 'A'   | [C]       | 9.6    | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '19'    | 112     | None        | 'A'   | [G]       | 10     | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 14370   | 'rs6054257' | 'G'   | [A]       | 29     | []       | OrderedDict([('NS', 3), ('DP', 14), ('AF', [0.5]), ('DB', True), ('H2', True)])        |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 17330   | None        | 'T'   | [A]       | 3      | ['q10']  | OrderedDict([('NS', 3), ('DP', 11), ('AF', [0.017])])                                  |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1110696 | 'rs6040355' | 'A'   | [G, T]    | 67     | []       | OrderedDict([('NS', 2), ('DP', 10), ('AF', [0.333, 0.667]), ('AA', 'T'), ('DB', True)]) |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1230237 | None        | 'T'   | [None]    | 47     | []       | OrderedDict([('NS', 3), ('DP', 13), ('AA', 'T')])                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1234567 | 'microsat1' | 'G'   | [GA, GAC] | 50     | []       | OrderedDict([('NS', 3), ('DP', 9), ('AA', 'G'), ('AN', 6), ('AC', [3, 1])])            |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | '20'    | 1235237 | None        | 'T'   | [None]    | None   | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+
        | 'X'     | 10      | 'rsTest'    | 'AC'  | [A, ATG]  | 10     | []       | {}                                                                                      |
        +---------+---------+-------------+-------+-----------+--------+----------+-----------------------------------------------------------------------------------------+

        >>> t2 = unpackinfo(t1)
        >>> look(t2)
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | 'CHROM' | 'POS'   | 'ID'        | 'REF' | 'ALT'     | 'QUAL' | 'FILTER' | 'NS' | 'AN' | 'AC'   | 'DP' | 'AF'           | 'AA' | 'DB' | 'H2' |
        +=========+=========+=============+=======+===========+========+==========+======+======+========+======+================+======+======+======+
        | '19'    | 111     | None        | 'A'   | [C]       | 9.6    | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '19'    | 112     | None        | 'A'   | [G]       | 10     | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 14370   | 'rs6054257' | 'G'   | [A]       | 29     | []       | 3    | None | None   | 14   | [0.5]          | None | True | True |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 17330   | None        | 'T'   | [A]       | 3      | ['q10']  | 3    | None | None   | 11   | [0.017]        | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1110696 | 'rs6040355' | 'A'   | [G, T]    | 67     | []       | 2    | None | None   | 10   | [0.333, 0.667] | 'T'  | True | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1230237 | None        | 'T'   | [None]    | 47     | []       | 3    | None | None   | 13   | None           | 'T'  | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1234567 | 'microsat1' | 'G'   | [GA, GAC] | 50     | []       | 3    | 6    | [3, 1] | 9    | None           | 'G'  | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | '20'    | 1235237 | None        | 'T'   | [None]    | None   | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+
        | 'X'     | 10      | 'rsTest'    | 'AC'  | [A, ATG]  | 10     | []       | None | None | None   | None | None           | None | None | None |
        +---------+---------+-------------+-------+-----------+--------+----------+------+------+--------+------+----------------+------+------+------+

    .. versionadded:: 0.5

    """

    if not keys:
        if hasattr(tbl, 'filename'):
            try:
                import vcf as pyvcf
            except ImportError as e:
                raise UnsatisfiedDependency(e, dep_message)
            reader = pyvcf.Reader(filename=tbl.filename)
            # all INFO
            keys = reader.infos.keys()
    result = unpackdict(tbl, 'INFO', keys=keys)
    if 'prefix' in kwargs:
        result = rename(result, {k: kwargs['prefix'] + k for k in keys})
    if hasattr(tbl, 'filename'):
        return VCFWrapper(result, tbl.filename)
    else:
        return result