def __iter__(self):
    """Yield the header with the suffix appended to every field name,
    then pass all data rows through unchanged."""
    source = iter(self.table)
    hdr = next(source)
    # stringify the suffix once, outside the header comprehension
    suffix = text_type(self.suffix)
    yield tuple(text_type(f) + suffix for f in hdr)
    for row in source:
        yield row
def itersearch(table, pattern, field, flags, complement):
    """Yield the header of `table` then every data row for which `pattern`
    matches (or, when `complement` is true, fails to match).

    Parameters: `pattern`/`flags` are passed to :func:`re.compile`;
    `field` selects the field(s) searched (None means search every value
    in the row). Values are stringified before matching.
    """
    prog = re.compile(pattern, flags)
    it = iter(table)
    hdr = next(it)
    yield tuple(hdr)

    # build the row predicate
    # (fix: dropped the unused `flds = list(map(text_type, hdr))` local;
    # use `def` rather than lambda assignment per PEP 8)
    if field is None:
        # search whole row
        def test(r):
            return any(prog.search(text_type(v)) for v in r)
    else:
        indices = asindices(hdr, field)
        if len(indices) == 1:
            index = indices[0]

            def test(r):
                return prog.search(text_type(r[index]))
        else:
            getvals = operator.itemgetter(*indices)

            def test(r):
                return any(prog.search(text_type(v)) for v in getvals(r))

    if not complement:
        # complement==False, return rows that match
        for row in it:
            if test(row):
                yield tuple(row)
    else:
        # complement==True, return rows that do not match
        for row in it:
            if not test(row):
                yield tuple(row)
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """Right join where the left table has been pre-loaded into `llookup`
    (a mapping from key value to list of left rows); right rows with no
    left match are padded with `missing` on the left side."""
    lit = iter(left)
    rit = iter(right)
    lhdr = next(lit)
    rhdr = next(rit)

    # positions of the key fields in each header
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # extracts the key value from a right row
    rgetk = operator.itemgetter(*rkind)

    # positions of the non-key fields of the right table; in the output,
    # key fields come from the left table only, to avoid duplication
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # assemble the output header, applying optional prefixes
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [text_type(lprefix) + text_type(f) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend(text_type(rprefix) + text_type(f)
                      for f in rgetv(rhdr))
    yield tuple(outhdr)

    for rrow in rit:
        k = rgetk(rrow)
        if k in llookup:
            # matched: one output row per matching left row
            for lrow in llookup[k]:
                outrow = list(lrow)
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)
        else:
            # unmatched: fill the left side with `missing`, but copy the
            # key values across from the right row
            outrow = [missing] * len(lhdr)
            for li, ri in zip(lkind, rkind):
                outrow[li] = rrow[ri]
            outrow.extend(rgetv(rrow))
            yield tuple(outrow)
def iterunpack(source, field, newfields, include_original, missing):
    """Yield rows from `source` with the sequence-valued `field` unpacked
    into `newfields`; values shorter than the number of new fields are
    padded with `missing`.

    `field` may be a field name or an int index (negative indices are
    accepted and normalized).
    """
    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    if field in flds:
        field_index = flds.index(field)
    elif isinstance(field, int) and -len(flds) <= field < len(flds):
        # fix: bound the index on both sides and normalize negative
        # indices, otherwise the `i != field_index` exclusion below
        # silently fails to drop the original column
        field_index = field if field >= 0 else len(flds) + field
        field = flds[field_index]
    else:
        raise ArgumentError(
            'field invalid: must be either field name or index')

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        outhdr.remove(field)
    if isinstance(newfields, (list, tuple)):
        outhdr.extend(newfields)
        nunpack = len(newfields)
    elif isinstance(newfields, int):
        nunpack = newfields
        # auto-name the new fields after the original, 1-based
        newfields = [
            text_type(field) + text_type(i + 1) for i in range(newfields)
        ]
        outhdr.extend(newfields)
    elif newfields is None:
        nunpack = 0
    else:
        raise ArgumentError(
            'newfields argument must be list or tuple of field '
            'names, or int (number of values to unpack)')
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        if nunpack > 0:
            # fix: only inspect len(value) when unpacking is requested,
            # so unsized values no longer raise when newfields is None
            nvals = len(value)
            if nvals >= nunpack:
                newvals = value[:nunpack]
            else:
                newvals = list(value) + ([missing] * (nunpack - nvals))
            out_row.extend(newvals)
        yield tuple(out_row)
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """Left join using a one-row-per-key hash lookup built from the right
    table; left rows with no match are padded with `missing`."""
    lit = iter(left)
    lhdr = next(lit)
    # peek the right header but keep the whole table to build the lookup
    rhdr, rit = iterpeek(right)
    rlookup = lookupone(rit, rkey, strict=False)

    # positions of the key fields in each header
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # extracts the key value from a left row
    lgetk = operator.itemgetter(*lkind)

    # positions of the non-key fields of the right table; in the output,
    # key fields come from the left table only, to avoid duplication
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # assemble the output header, applying optional prefixes
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [text_type(lprefix) + text_type(f) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend(text_type(rprefix) + text_type(f)
                      for f in rgetv(rhdr))
    yield tuple(outhdr)

    npad = len(rvind)
    for lrow in lit:
        k = lgetk(lrow)
        outrow = list(lrow)  # start with the left row
        if k in rlookup:
            # matched: extend with non-key values from the right row
            outrow.extend(rgetv(rlookup[k]))
        else:
            # unmatched: extend with missing values instead
            outrow.extend([missing] * npad)
        yield tuple(outrow)
def make_create_table_statement(table, tablename, schema=None,
                                constraints=True, metadata=None,
                                dialect=None):
    """
    Generate a CREATE TABLE statement based on data in `table`.

    Keyword arguments:

    table : table container
        Table data to use to infer types etc.
    tablename : text
        Name of the table
    schema : text
        Name of the database schema to create the table in
    constraints : bool
        If True use length and nullable constraints
    metadata : sqlalchemy.MetaData
        Custom table metadata
    dialect : text
        One of {'access', 'sybase', 'sqlite', 'informix', 'firebird',
        'mysql', 'oracle', 'maxdb', 'postgresql', 'mssql'}

    """
    import sqlalchemy

    sql_table = make_sqlalchemy_table(table, tablename, schema=schema,
                                      constraints=constraints,
                                      metadata=metadata)

    sql_dialect = None
    if dialect:
        # load the requested dialect module dynamically and instantiate
        # its dialect class for statement compilation
        dialect_module = __import__(
            "sqlalchemy.dialects.%s" % DIALECTS[dialect],
            fromlist=["dialect"])
        sql_dialect = dialect_module.dialect()

    statement = sqlalchemy.schema.CreateTable(sql_table).compile(
        dialect=sql_dialect)
    return text_type(statement).strip()
def _display_html(table, limit=0, vrepr=None, index_header=None, caption=None,
                  tr_style=None, td_styles=None, encoding=None, truncate=None,
                  epilogue=None):
    """Render `table` as an HTML string, limited to `limit` rows, with an
    optional epilogue paragraph (or an ellipsis marker when rows were
    cut off)."""
    # fill in defaults from configuration / locale where not supplied
    if limit == 0:
        limit = config.display_limit
    if vrepr is None:
        vrepr = config.display_vrepr
    if index_header is None:
        index_header = config.display_index_header
    if encoding is None:
        encoding = locale.getpreferredencoding()

    table, overflow = _vis_overflow(table, limit)
    buf = MemorySource()
    tohtml(table, buf, encoding=encoding, index_header=index_header,
           vrepr=vrepr, caption=caption, tr_style=tr_style,
           td_styles=td_styles, truncate=truncate)
    html = text_type(buf.getvalue(), encoding)

    if epilogue:
        html += '<p>%s</p>' % epilogue
    elif overflow:
        # signal that some rows were omitted
        html += '<p><strong>...</strong></p>'
    return html
def __repr__(self):
    """Show the field name followed by up to five sample values, with a
    trailing ellipsis when more values exist."""
    # take six so we can tell whether there is anything beyond five
    samples = [repr(v) for v in islice(self, 6)]
    out = text_type(self.field) + ': ' + ', '.join(samples[:5])
    if len(samples) > 5:
        out += ', ...'
    return out
def __iter__(self):
    """Yield a header row then data rows fetched from the tabix-indexed
    file; the file handle is always closed when iteration ends.

    Uses `self.header` verbatim if given, otherwise assumes the last
    header line of the tabix file contains the field names.
    """
    from pysam import Tabixfile, asTuple
    f = Tabixfile(self.filename, mode='r')
    # fix: the original wrapped this in `try: ... except: raise finally:`;
    # a bare except that immediately re-raises is dead code, and the
    # finally clause alone guarantees the file is closed
    try:
        # header row
        if self.header is not None:
            yield self.header
        else:
            # assume last header line has fields
            h = list(f.header)
            if len(h) > 0:
                header_line = text_type(h[-1], encoding='ascii')
                yield tuple(header_line.split('\t'))
        # data rows
        for row in f.fetch(reference=self.reference, start=self.start,
                           end=self.stop, region=self.region,
                           parser=asTuple()):
            yield tuple(row)
    finally:
        f.close()
def iterunpack(source, field, newfields, include_original, missing):
    """Yield rows from `source` with the sequence-valued `field` unpacked
    into `newfields`; values shorter than the number of new fields are
    padded with `missing`.

    `field` may be a field name or an int index (negative indices are
    accepted and normalized).
    """
    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    if field in flds:
        field_index = flds.index(field)
    elif isinstance(field, int) and -len(flds) <= field < len(flds):
        # fix: bound the index on both sides and normalize negative
        # indices, otherwise the `i != field_index` exclusion below
        # silently fails to drop the original column
        field_index = field if field >= 0 else len(flds) + field
        field = flds[field_index]
    else:
        raise ArgumentError("field invalid: must be either field name or index")

    # determine output fields
    outhdr = list(flds)
    if not include_original:
        outhdr.remove(field)
    if isinstance(newfields, (list, tuple)):
        outhdr.extend(newfields)
        nunpack = len(newfields)
    elif isinstance(newfields, int):
        nunpack = newfields
        # auto-name the new fields after the original, 1-based
        newfields = [text_type(field) + text_type(i + 1) for i in range(newfields)]
        outhdr.extend(newfields)
    elif newfields is None:
        nunpack = 0
    else:
        raise ArgumentError(
            "newfields argument must be list or tuple of field "
            "names, or int (number of values to unpack)"
        )
    yield tuple(outhdr)

    # construct the output data
    for row in it:
        value = row[field_index]
        if include_original:
            out_row = list(row)
        else:
            out_row = [v for i, v in enumerate(row) if i != field_index]
        if nunpack > 0:
            # fix: only inspect len(value) when unpacking is requested,
            # so unsized values no longer raise when newfields is None
            nvals = len(value)
            if nvals >= nunpack:
                newvals = value[:nunpack]
            else:
                newvals = list(value) + ([missing] * (nunpack - nvals))
            out_row.extend(newvals)
        yield tuple(out_row)
def __iter__(self):
    """Yield the header with every field name passed through
    normalize_name (replacing illegal characters), then pass data rows
    through unchanged."""
    source = iter(self.table)
    hdr = next(source)
    yield tuple(text_type(normalize_name(f, self.illegal_v)) for f in hdr)
    for row in source:
        yield row
def itercrossjoin(sources, prefix):
    """Yield the cartesian product of the data rows of `sources`; when
    `prefix` is true, each output field name is prefixed with its
    (one-based) source number."""
    # assemble the combined output header
    outhdr = []
    for i, src in enumerate(sources):
        if prefix:
            # use one-based numbering
            outhdr.extend(text_type(i + 1) + '_' + text_type(f)
                          for f in header(src))
        else:
            outhdr.extend(header(src))
    yield tuple(outhdr)

    # cartesian product of the data rows of all sources
    datasrcs = [data(src) for src in sources]
    for combination in itertools.product(*datasrcs):
        outrow = []
        for row in combination:
            outrow.extend(row)
        yield tuple(outrow)
def itercrossjoin(sources, prefix):
    """Yield the cartesian product of the data rows of `sources`; when
    `prefix` is true, each output field name is prefixed with its
    (one-based) source number."""
    # assemble the combined output header
    outhdr = []
    for n, src in enumerate(sources, start=1):
        hdr = header(src)
        if prefix:
            # one-based source numbering
            outhdr.extend(text_type(n) + '_' + text_type(f) for f in hdr)
        else:
            outhdr.extend(hdr)
    yield tuple(outhdr)

    # cartesian product of the data rows of all sources, each output row
    # being the concatenation of one row per source
    datasrcs = [data(src) for src in sources]
    for combination in itertools.product(*datasrcs):
        yield tuple(v for row in combination for v in row)
def _ordered_dict_iterator(table):
    """Yield every data row of `table` as an OrderedDict keyed by the
    stringified header fields; short rows are padded with None."""
    it = iter(table)
    flds = [text_type(f) for f in next(it)]
    for row in it:
        pairs = []
        for i, f in enumerate(flds):
            try:
                v = row[i]
            except IndexError:
                # short row: pad the missing trailing values with None
                v = None
            pairs.append((f, v))
        yield OrderedDict(pairs)
def asdict(hdr, row, missing=None):
    """Return `row` as a dict keyed by the stringified fields of `hdr`;
    values absent from a short row are set to `missing`."""
    flds = [text_type(f) for f in hdr]
    try:
        # fast path: assume the row covers every field
        items = [(flds[i], row[i]) for i in range(len(flds))]
    except IndexError:
        # short row, fall back to a slower element-wise loop
        items = []
        for i, f in enumerate(flds):
            try:
                v = row[i]
            except IndexError:
                v = missing
            items.append((f, v))
    return dict(items)
def fieldnames(table):
    """
    Return the string values of the header row. If the header row
    contains only strings, then this function is equivalent to header(),
    i.e.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'], ['a', 1], ['b', 2]]
        >>> etl.fieldnames(table)
        ('foo', 'bar')
        >>> etl.header(table)
        ('foo', 'bar')

    """

    return tuple(map(text_type, header(table)))
def issorted(table, key=None, reverse=False, strict=False):
    """
    Return True if the table is ordered (i.e., sorted) by the given key.
    E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 1, True],
        ...           ['b', 3, True],
        ...           ['b', 2]]
        >>> etl.issorted(table1, key='foo')
        True
        >>> etl.issorted(table1, key='bar')
        False
        >>> etl.issorted(table1, key='foo', strict=True)
        False
        >>> etl.issorted(table1, key='foo', reverse=True)
        False

    """

    # determine the operator to use when comparing rows: it must hold
    # between each row and its predecessor for the table to be sorted
    if reverse and strict:
        op = operator.lt
    elif reverse and not strict:
        op = operator.le
    elif strict:
        op = operator.gt
    else:
        op = operator.ge
    it = iter(table)
    flds = [text_type(f) for f in next(it)]
    # fix: a header-only table previously raised StopIteration from
    # next(it); a table with no data rows is trivially sorted
    _empty = object()
    if key is None:
        prev = next(it, _empty)
        if prev is _empty:
            return True
        for curr in it:
            if not op(curr, prev):
                return False
            prev = curr
    else:
        getkey = comparable_itemgetter(*asindices(flds, key))
        prev = next(it, _empty)
        if prev is _empty:
            return True
        prevkey = getkey(prev)
        for curr in it:
            currkey = getkey(curr)
            if not op(currkey, prevkey):
                return False
            prevkey = currkey
    return True
def __iter__(self):
    """Yield a header built from the field names, then `numrows` rows of
    generated data; re-seeding makes the output identical on every
    iteration."""
    numrows = self.numrows
    fields = self.fields.copy()
    # N.B., we want this to be stable, i.e., same data each time the
    # table is iterated over
    random.seed(self.seed)
    # header row
    yield tuple(text_type(f) for f in fields.keys())
    # data rows
    for _ in xrange(numrows):
        if self.wait:
            # artificial delay to simulate a slow data source
            time.sleep(self.wait)
        yield tuple(fields[f]() for f in fields)
def make_create_table_statement(table, tablename, schema=None,
                                constraints=True, metadata=None,
                                dialect=None):
    """
    Generate a CREATE TABLE statement based on data in `table`.

    Keyword arguments:

    table : table container
        Table data to use to infer types etc.
    tablename : text
        Name of the table
    schema : text
        Name of the database schema to create the table in
    constraints : bool
        If True use length and nullable constraints
    metadata : sqlalchemy.MetaData
        Custom table metadata
    dialect : text
        One of {'access', 'sybase', 'sqlite', 'informix', 'firebird',
        'mysql', 'oracle', 'maxdb', 'postgresql', 'mssql'}

    """
    import sqlalchemy

    sql_table = make_sqlalchemy_table(table, tablename, schema=schema,
                                      constraints=constraints,
                                      metadata=metadata)

    if dialect:
        # dynamically import the dialect module and instantiate the
        # dialect class used to compile the statement
        mod = __import__('sqlalchemy.dialects.%s' % DIALECTS[dialect],
                         fromlist=['dialect'])
        sql_dialect = mod.dialect()
    else:
        sql_dialect = None

    compiled = sqlalchemy.schema.CreateTable(sql_table).compile(
        dialect=sql_dialect)
    return text_type(compiled).strip()
def iterlookupjoin(left, right, lkey, rkey, missing=None, lprefix=None,
                   rprefix=None):
    """Merge-style left join where, for each left row, at most one
    matching right row is used (the first row of its key group);
    unmatched left rows are padded with `missing`.

    Both tables are assumed to be sorted by their key fields (the
    itertools.groupby-based merge depends on this).
    """
    lit = iter(left)
    rit = iter(right)
    lhdr = next(lit)
    rhdr = next(rit)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = operator.itemgetter(*lkind)
    rgetk = operator.itemgetter(*rkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join two groups of rows
    def joinrows(_lrowgrp, _rrowgrp):
        if _rrowgrp is None:
            for lrow in _lrowgrp:
                outrow = list(lrow)  # start with the left row
                # extend with missing values in place of the right row
                outrow.extend([missing] * len(rvind))
                yield tuple(outrow)
        else:
            rrow = next(iter(_rrowgrp))  # pick first arbitrarily
            for lrow in _lrowgrp:
                # start with the left row
                outrow = list(lrow)
                # extend with non-key values from the right row
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)
    lrowgrp = []

    # loop until *either* of the iterators is exhausted
    lkval, rkval = None, None  # initialise here to handle empty tables
    try:
        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        rkval, rrowgrp = next(rgit)
        while True:
            if lkval < rkval:
                for row in joinrows(lrowgrp, None):
                    yield tuple(row)
                # advance left
                lkval, lrowgrp = next(lgit)
            elif lkval > rkval:
                # advance right
                rkval, rrowgrp = next(rgit)
            else:
                for row in joinrows(lrowgrp, rrowgrp):
                    yield tuple(row)
                # advance both
                lkval, lrowgrp = next(lgit)
                rkval, rrowgrp = next(rgit)
    except StopIteration:
        pass

    # make sure any left rows remaining are yielded
    # fix: comparing the raw None sentinels (`lkval > rkval`) raises
    # TypeError on Python 3 when either table has no data rows; a group
    # is left hanging only if the left side actually produced one and
    # either the right side produced nothing or its key fell behind
    if lkval is not None and (rkval is None or lkval > rkval):
        # yield anything that got left hanging
        for row in joinrows(lrowgrp, None):
            yield tuple(row)
    # yield the rest
    for lkval, lrowgrp in lgit:
        for row in joinrows(lrowgrp, None):
            yield tuple(row)
def __unicode__(self):
    # Python 2 text protocol: render the wrapped object as text
    return text_type(self.obj)
def make_sqlalchemy_column(col, colname, constraints=True):
    """
    Infer an appropriate SQLAlchemy column type based on a sequence of
    values.

    Keyword arguments:

    col : sequence
        A sequence of values to use to infer type, length etc.
    colname : string
        Name of column
    constraints : bool
        If True use length and nullable constraints

    """
    import sqlalchemy

    col_not_none = [v for v in col if v is not None]
    sql_column_kwargs = {}
    sql_type_kwargs = {}

    def _all(tp):
        # true when every non-None value is an instance of tp
        return all(isinstance(v, tp) for v in col_not_none)

    if not col_not_none:
        # no values to go on; fall back to a bounded string column
        sql_column_type = sqlalchemy.String
        if constraints:
            sql_type_kwargs["length"] = NULL_COLUMN_MAX_LENGTH
    elif _all(bool):
        sql_column_type = sqlalchemy.Boolean
    elif _all(int):
        # widen to BigInteger when any value escapes the 32-bit range
        if (max(col_not_none) > SQL_INTEGER_MAX
                or min(col_not_none) < SQL_INTEGER_MIN):
            sql_column_type = sqlalchemy.BigInteger
        else:
            sql_column_type = sqlalchemy.Integer
    elif _all(long) or _all((int, long)):
        sql_column_type = sqlalchemy.BigInteger
    elif _all((int, long, float)):
        sql_column_type = sqlalchemy.Float
    elif _all(datetime.datetime):
        sql_column_type = sqlalchemy.DateTime
    elif _all(datetime.date):
        sql_column_type = sqlalchemy.Date
    elif _all(datetime.time):
        sql_column_type = sqlalchemy.Time
    else:
        sql_column_type = sqlalchemy.String
        if constraints:
            # size to the longest textual representation (Nones included)
            sql_type_kwargs["length"] = max(len(text_type(v)) for v in col)

    if constraints:
        sql_column_kwargs["nullable"] = len(col_not_none) < len(col)

    return sqlalchemy.Column(colname, sql_column_type(**sql_type_kwargs),
                             **sql_column_kwargs)
def __setitem__(self, item, value):
    # store under the stringified key so later lookups by field name are
    # consistent regardless of the original key type
    self.fields[text_type(item)] = value
def make_sqlalchemy_column(col, colname, constraints=True):
    """
    Infer an appropriate SQLAlchemy column type based on a sequence of
    values.

    Keyword arguments:

    col : sequence
        A sequence of values to use to infer type, length etc.
    colname : string
        Name of column
    constraints : bool
        If True use length and nullable constraints

    """
    import sqlalchemy
    non_null = [v for v in col if v is not None]
    column_kwargs = {}
    type_kwargs = {}
    if len(non_null) == 0:
        # nothing to infer from; default to a bounded string column
        column_type = sqlalchemy.String
        if constraints:
            type_kwargs['length'] = NULL_COLUMN_MAX_LENGTH
    elif all(isinstance(v, bool) for v in non_null):
        column_type = sqlalchemy.Boolean
    elif all(isinstance(v, int) for v in non_null):
        # choose 32- or 64-bit integer depending on the value range
        out_of_range = (max(non_null) > SQL_INTEGER_MAX
                        or min(non_null) < SQL_INTEGER_MIN)
        column_type = (sqlalchemy.BigInteger if out_of_range
                       else sqlalchemy.Integer)
    elif all(isinstance(v, long) for v in non_null):
        column_type = sqlalchemy.BigInteger
    elif all(isinstance(v, (int, long)) for v in non_null):
        column_type = sqlalchemy.BigInteger
    elif all(isinstance(v, (int, long, float)) for v in non_null):
        column_type = sqlalchemy.Float
    elif all(isinstance(v, datetime.datetime) for v in non_null):
        column_type = sqlalchemy.DateTime
    elif all(isinstance(v, datetime.date) for v in non_null):
        column_type = sqlalchemy.Date
    elif all(isinstance(v, datetime.time) for v in non_null):
        column_type = sqlalchemy.Time
    else:
        column_type = sqlalchemy.String
        if constraints:
            # size to the longest textual representation, Nones included
            type_kwargs['length'] = max(len(text_type(v)) for v in col)
    if constraints:
        column_kwargs['nullable'] = len(non_null) < len(col)
    return sqlalchemy.Column(colname, column_type(**type_kwargs),
                             **column_kwargs)