def join(keys, tables): """Merge a list of `Table` objects using `keys` to group rows""" # Make new (merged) Table fields fields = OrderedDict() for table in tables: fields.update(table.fields) # TODO: may raise an error if a same field is different in some tables # Check if all keys are inside merged Table's fields fields_keys = set(fields.keys()) for key in keys: if key not in fields_keys: raise ValueError('Invalid key: "{}"'.format(key)) # Group rows by key, without missing ordering none_fields = lambda: OrderedDict({field: None for field in fields.keys()}) data = OrderedDict() for table in tables: for row in table: row_key = tuple([getattr(row, key) for key in keys]) if row_key not in data: data[row_key] = none_fields() data[row_key].update(row._asdict()) merged = Table(fields=fields) merged.extend(data.values()) return merged
def create_table(data, meta=None, fields=None, skip_header=True, import_fields=None, samples=None, force_types=None, *args, **kwargs): """Create a rows.Table object based on data rows and some configurations - `skip_header` is only used if `fields` is set - `samples` is only used if `fields` is `None`. If samples=None, all data is filled in memory - use with caution. - `force_types` is only used if `fields` is `None` - `import_fields` can be used either if `fields` is set or not, the resulting fields will seek its order - `fields` must always be in the same order as the data """ table_rows = iter(data) force_types = force_types or {} if import_fields is not None: import_fields = make_header(import_fields) if fields is None: # autodetect field types # TODO: may add `type_hints` parameter so autodetection can be easier # (plugins may specify some possible field types). header = make_header(next(table_rows)) if samples is not None: sample_rows = list(islice(table_rows, 0, samples)) table_rows = chain(sample_rows, table_rows) else: sample_rows = table_rows = list(table_rows) # Detect field types using only the desired columns detected_fields = detect_types( header, sample_rows, skip_indexes=[ index for index, field in enumerate(header) if field in force_types or field not in ( import_fields or header) ], *args, **kwargs) # Check if any field was added during detecting process new_fields = [ field_name for field_name in detected_fields.keys() if field_name not in header ] # Finally create the `fields` with both header and new field names, # based on detected fields `and force_types` fields = OrderedDict([(field_name, detected_fields.get(field_name, TextField)) for field_name in header + new_fields]) fields.update(force_types) # Update `header` and `import_fields` based on new `fields` header = list(fields.keys()) if import_fields is None: import_fields = header else: # using provided field types if not isinstance(fields, OrderedDict): raise ValueError("`fields` must be an `OrderedDict`") if skip_header: # If we're skipping the header probably this row is not trustable # (can be data or garbage). _ = next(table_rows) header = make_header(list(fields.keys())) if import_fields is None: import_fields = header fields = OrderedDict([(field_name, fields[key]) for field_name, key in zip(header, fields)]) diff = set(import_fields) - set(header) if diff: field_names = ", ".join('"{}"'.format(field) for field in diff) raise ValueError("Invalid field names: {}".format(field_names)) fields = OrderedDict([(field_name, fields[field_name]) for field_name in import_fields]) get_row = get_items(*map(header.index, import_fields)) table = Table(fields=fields, meta=meta) table.extend(dict(zip(import_fields, get_row(row))) for row in table_rows) return table
def create_table( data, meta=None, fields=None, skip_header=True, import_fields=None, samples=None, force_types=None, *args, **kwargs ): """Create a rows.Table object based on data rows and some configurations - `skip_header` is only used if `fields` is set - `samples` is only used if `fields` is `None`. If samples=None, all data is filled in memory - use with caution. - `force_types` is only used if `fields` is `None` - `import_fields` can be used either if `fields` is set or not, the resulting fields will seek its order - `fields` must always be in the same order as the data """ table_rows = iter(data) force_types = force_types or {} if import_fields is not None: import_fields = make_header(import_fields) if fields is None: # autodetect field types # TODO: may add `type_hints` parameter so autodetection can be easier # (plugins may specify some possible field types). header = make_header(next(table_rows)) if samples is not None: sample_rows = list(islice(table_rows, 0, samples)) table_rows = chain(sample_rows, table_rows) else: sample_rows = table_rows = list(table_rows) # Detect field types using only the desired columns detected_fields = detect_types( header, sample_rows, skip_indexes=[ index for index, field in enumerate(header) if field in force_types or field not in (import_fields or header) ], *args, **kwargs ) # Check if any field was added during detecting process new_fields = [ field_name for field_name in detected_fields.keys() if field_name not in header ] # Finally create the `fields` with both header and new field names, # based on detected fields `and force_types` fields = OrderedDict( [ (field_name, detected_fields.get(field_name, TextField)) for field_name in header + new_fields ] ) fields.update(force_types) # Update `header` and `import_fields` based on new `fields` header = list(fields.keys()) if import_fields is None: import_fields = header else: # using provided field types if not isinstance(fields, OrderedDict): raise ValueError("`fields` must be an `OrderedDict`") if skip_header: # If we're skipping the header probably this row is not trustable # (can be data or garbage). _ = next(table_rows) header = make_header(list(fields.keys())) if import_fields is None: import_fields = header fields = OrderedDict( [(field_name, fields[key]) for field_name, key in zip(header, fields)] ) diff = set(import_fields) - set(header) if diff: field_names = ", ".join('"{}"'.format(field) for field in diff) raise ValueError("Invalid field names: {}".format(field_names)) fields = OrderedDict( [(field_name, fields[field_name]) for field_name in import_fields] ) get_row = get_items(*map(header.index, import_fields)) table = Table(fields=fields, meta=meta) table.extend(dict(zip(import_fields, get_row(row))) for row in table_rows) return table