def read(self, reader):
    """Load the map from ``reader``, replacing the values currently held.

    The first row yielded by ``reader`` is used as the header.  It must
    contain ``self.ref_field_name`` and every name in
    ``self.transformer.output_field_names``.  Every following row is
    reordered to ``self.field_names`` order, so that the ref comes first
    and the value fields follow.

    ``self.values`` and ``self.next_ref`` are only updated once all rows
    have been read and validated.

    Raises:
        MissingFieldError: a required field is absent from the header; the
            error is constructed with the name of the missing field.
        DuplicateValuesError: two rows carry the same value fields.
        DuplicateRefsError: two rows carry the same ref.
        ValueError: a ref cannot be converted with ``int``.
    """
    rows = iter(reader)
    # The builtin next() works on any iterator, unlike the .next() method.
    header = next(rows)

    if self.ref_field_name not in header:
        raise MissingFieldError(self.ref_field_name)
    for field_name in self.transformer.output_field_names:
        if field_name not in header:
            # Report the field that is actually missing, not the ref field.
            raise MissingFieldError(field_name)

    # first field is ID
    field_maps = FieldMaps()
    for input_field_name in self.field_names:
        field_maps.add(input_field_name, input_field_name)
    map_transformer = SimpleTransformer(field_maps)
    map_transformer.bind(header)

    count = 0
    values = dict()
    for row in rows:
        transformed_row = map_transformer.transform(row)
        ref = int(transformed_row[0])
        value = transformed_row[1:]
        values[value] = ref
        count += 1

    # A repeated value overwrites its dict entry, so fewer keys than rows
    # means at least one value occurred twice.
    if count != len(values):
        raise DuplicateValuesError()
    # Likewise, fewer distinct refs than rows means a ref was reused.
    if count != len(set(values.values())):
        raise DuplicateRefsError()

    self.values = values
    # A header-only file is a valid empty map; max() would raise on it, so
    # fall back to the same starting ref that __init__ uses.
    self.next_ref = max(values.values()) + 1 if values else 0
def bind(self, header):
    """Bind to ``header``, keeping every field not in ``self.fields_to_remove``.

    Stores the kept names, in header order, as ``self.output_field_names``
    and builds a ``SimpleTransformer`` whose field maps register each kept
    name under the same input and output name.
    """
    kept = []
    for name in header:
        if name in self.fields_to_remove:
            continue
        kept.append(name)
    self.output_field_names = tuple(kept)

    passthrough_maps = FieldMaps()
    for name in self.output_field_names:
        passthrough_maps.add(name, name)

    self.transformer = SimpleTransformer(passthrough_maps)
    self.transformer.bind(header)
class ExtractMap(Transformer):
    """Transformer that adds a ref field backed by a ``Map``.

    The input fields named in ``map_fields_spec`` are dropped from the
    output unless ``keep_fields`` is true.  Once bound, ``transform`` and
    ``output_field_names`` are delegated to an internal
    ``SimpleTransformer``; both require ``bind`` to have been called first,
    because ``self.transformer`` is ``None`` until then.
    """

    def __init__(self, map_fields_spec, ref_field_spec, keep_fields=False):
        mapped_fields = FieldMaps()
        mapped_fields.parse_from(map_fields_spec)

        if keep_fields:
            self.fields_to_remove = set()
        else:
            self.fields_to_remove = mapped_fields.input_field_names

        # TODO: this is ugly, beautify
        ref_spec = FieldMaps().parse_field_map_string(ref_field_spec)
        self.map = Map(mapped_fields, ref_spec.output_field_name)
        self.ref_field_name = ref_spec.input_field_name
        self.transformer = None

    def bind(self, header):
        """Build and bind the internal transformer for ``header``.

        Header fields not in ``self.fields_to_remove`` are registered under
        their own name, followed by one extra field named
        ``self.ref_field_name`` whose extractor is ``RefField(self.map)``.
        """
        # TODO: DRY: copied from RemoveFields
        # except for adding the ref field
        # (extract common stuff into ProxyTransformer?)
        kept = [name for name in header if name not in self.fields_to_remove]

        output_maps = FieldMaps()
        for name in kept:
            output_maps.add(name, name)
        output_maps.add(
            input_field_name=None,
            output_field_name=self.ref_field_name,
            extractor_field=RefField(self.map))

        self.transformer = SimpleTransformer(output_maps)
        self.transformer.bind(header)

    def read_map(self, reader):
        """Load the underlying map from ``reader``."""
        self.map.read(reader)

    def write_map(self, writer):
        """Write the underlying map to ``writer``."""
        self.map.write(writer)

    @property
    def map_changed(self):
        """The ``changed`` flag of the underlying map."""
        return self.map.changed

    @property
    def output_field_names(self):
        """Output field names of the bound internal transformer."""
        return self.transformer.output_field_names

    @property
    def transform(self):
        """The ``transform`` callable of the bound internal transformer."""
        return self.transformer.transform
class ExtractMap(Transformer):
    """Replace a group of fields by a single ref field looked up in a ``Map``.

    ``map_fields_spec`` describes the fields handed to the map and
    ``ref_field_spec`` describes the ref field.  With ``keep_fields`` left
    false, the input fields of ``map_fields_spec`` are removed from the
    output.  ``bind`` must be called before ``transform`` or
    ``output_field_names`` is used, since both forward to a transformer that
    only exists after binding.
    """

    def __init__(self, map_fields_spec, ref_field_spec, keep_fields=False):
        map_field_maps = FieldMaps()
        map_field_maps.parse_from(map_fields_spec)

        self.fields_to_remove = set()
        if not keep_fields:
            self.fields_to_remove = map_field_maps.input_field_names

        # TODO: this is ugly, beautify
        parsed_ref = FieldMaps().parse_field_map_string(ref_field_spec)
        self.map = Map(map_field_maps, parsed_ref.output_field_name)
        self.ref_field_name = parsed_ref.input_field_name
        self.transformer = None

    def bind(self, header):
        """Create the internal transformer for ``header`` and bind it.

        Every header field that is not in ``self.fields_to_remove`` is kept
        under its own name.  A further field named ``self.ref_field_name``
        is appended, with ``RefField(self.map)`` as its extractor.
        """
        # TODO: DRY: copied from RemoveFields
        # except for adding the ref field
        # (extract common stuff into ProxyTransformer?)
        surviving = []
        for candidate in header:
            if candidate not in self.fields_to_remove:
                surviving.append(candidate)

        bound_maps = FieldMaps()
        for candidate in surviving:
            bound_maps.add(candidate, candidate)
        bound_maps.add(input_field_name=None,
                       output_field_name=self.ref_field_name,
                       extractor_field=RefField(self.map))

        self.transformer = SimpleTransformer(bound_maps)
        self.transformer.bind(header)

    def read_map(self, reader):
        """Forward to ``self.map.read``."""
        self.map.read(reader)

    def write_map(self, writer):
        """Forward to ``self.map.write``."""
        self.map.write(writer)

    @property
    def map_changed(self):
        """Value of ``self.map.changed``."""
        return self.map.changed

    @property
    def output_field_names(self):
        """Value of ``self.transformer.output_field_names``."""
        return self.transformer.output_field_names

    @property
    def transform(self):
        """Value of ``self.transformer.transform``."""
        return self.transformer.transform
def bind(self, header):
    """Build and bind the internal transformer for ``header``.

    Header fields not listed in ``self.fields_to_remove`` are registered
    under their own name.  One more field, named ``self.ref_field_name``,
    is then added with ``RefField(self.map)`` as its extractor and no input
    field.
    """
    # TODO: DRY: copied from RemoveFields
    # except for adding the ref field
    # (extract common stuff into ProxyTransformer?)
    kept = [name for name in header if name not in self.fields_to_remove]

    output_maps = FieldMaps()
    for name in kept:
        output_maps.add(name, name)
    output_maps.add(
        input_field_name=None,
        output_field_name=self.ref_field_name,
        extractor_field=RefField(self.map))

    self.transformer = SimpleTransformer(output_maps)
    self.transformer.bind(header)
class RemoveFields(Transformer):
    """Transformer that drops the fields listed in ``fields_to_remove``.

    ``bind`` must be called before ``transform`` is used: the internal
    transformer that ``transform`` forwards to is ``None`` until then.
    """

    def __init__(self, fields_to_remove):
        self.fields_to_remove = fields_to_remove
        self.transformer = None

    def bind(self, header):
        """Bind to ``header``, keeping the fields that are not to be removed.

        The kept names are stored in header order as
        ``self.output_field_names``, and each is registered in the internal
        transformer's field maps under the same input and output name.
        """
        kept = []
        for name in header:
            if name in self.fields_to_remove:
                continue
            kept.append(name)
        self.output_field_names = tuple(kept)

        passthrough_maps = FieldMaps()
        for name in self.output_field_names:
            passthrough_maps.add(name, name)

        self.transformer = SimpleTransformer(passthrough_maps)
        self.transformer.bind(header)

    @property
    def transform(self):
        """The ``transform`` callable of the bound internal transformer."""
        return self.transformer.transform
def bind(self, header):
    """Create the internal transformer for ``header`` and bind it.

    Each header field that is not in ``self.fields_to_remove`` is kept
    under its own name.  A further field named ``self.ref_field_name`` is
    appended; it has no input field and uses ``RefField(self.map)`` as its
    extractor.
    """
    # TODO: DRY: copied from RemoveFields
    # except for adding the ref field
    # (extract common stuff into ProxyTransformer?)
    surviving = []
    for candidate in header:
        if candidate not in self.fields_to_remove:
            surviving.append(candidate)

    bound_maps = FieldMaps()
    for candidate in surviving:
        bound_maps.add(candidate, candidate)
    bound_maps.add(input_field_name=None,
                   output_field_name=self.ref_field_name,
                   extractor_field=RefField(self.map))

    self.transformer = SimpleTransformer(bound_maps)
    self.transformer.bind(header)
def __init__(self, map_field_maps, ref_field_name):
    """Create an empty map.

    ``map_field_maps`` is wrapped in a ``SimpleTransformer`` kept as
    ``self.transformer``; ``ref_field_name`` is stored unchanged.  The map
    starts with no values, and the first ref to be handed out is 0.
    """
    self.transformer = SimpleTransformer(map_field_maps)
    self.ref_field_name = ref_field_name
    self.values, self.next_ref = dict(), 0
class Map(object):
    """Table assigning an integer ref to each distinct set of mapped values.

    ``values`` maps the transformer's output for a row to its ref, and
    ``next_ref`` is the ref that the next unseen value will receive.
    """

    # Class-level default; translate() sets it to True on the instance
    # whenever it allocates a new ref.
    changed = False

    def __init__(self, map_field_maps, ref_field_name):
        """Create an empty map whose refs start at 0."""
        self.transformer = SimpleTransformer(map_field_maps)
        self.ref_field_name = ref_field_name
        self.values = dict()
        self.next_ref = 0

    def read(self, reader):
        """Load the map from ``reader``, replacing the values currently held.

        The first row yielded by ``reader`` is used as the header.  It must
        contain ``self.ref_field_name`` and every name in
        ``self.transformer.output_field_names``.  Every following row is
        reordered to ``self.field_names`` order, so that the ref comes
        first and the value fields follow.

        ``self.values`` and ``self.next_ref`` are only updated once all
        rows have been read and validated.

        Raises:
            MissingFieldError: a required field is absent from the header;
                the error is constructed with the name of the missing field.
            DuplicateValuesError: two rows carry the same value fields.
            DuplicateRefsError: two rows carry the same ref.
            ValueError: a ref cannot be converted with ``int``.
        """
        rows = iter(reader)
        # The builtin next() works on any iterator, unlike the .next() method.
        header = next(rows)

        if self.ref_field_name not in header:
            raise MissingFieldError(self.ref_field_name)
        for field_name in self.transformer.output_field_names:
            if field_name not in header:
                # Report the field that is actually missing.
                raise MissingFieldError(field_name)

        # first field is ID
        field_maps = FieldMaps()
        for input_field_name in self.field_names:
            field_maps.add(input_field_name, input_field_name)
        map_transformer = SimpleTransformer(field_maps)
        map_transformer.bind(header)

        count = 0
        values = dict()
        for row in rows:
            transformed_row = map_transformer.transform(row)
            ref = int(transformed_row[0])
            value = transformed_row[1:]
            values[value] = ref
            count += 1

        # A repeated value overwrites its dict entry, so fewer keys than
        # rows means at least one value occurred twice.
        if count != len(values):
            raise DuplicateValuesError()
        # Likewise, fewer distinct refs than rows means a ref was reused.
        if count != len(set(values.values())):
            raise DuplicateRefsError()

        self.values = values
        # A header-only file (what write() emits for an empty map) is valid;
        # max() would raise on it, so restart at the initial ref instead.
        self.next_ref = max(values.values()) + 1 if values else 0

    def write(self, writer):
        """Write the header row, then one ``(ref, *value)`` row per entry."""
        writer.writerow(self.field_names)
        # items() rather than iteritems() so this runs on Python 2 and 3.
        for value, ref in self.values.items():
            writer.writerow((ref,) + tuple(value))

    def translate(self, input_row):
        """Return the ref for ``input_row``, allocating one if it is unseen.

        The lookup key is ``self.transformer.transform(input_row)``.  When
        a new ref is allocated, ``next_ref`` is advanced and ``changed`` is
        set to True.
        """
        key = self.transformer.transform(input_row)
        ref = self.values.setdefault(key, self.next_ref)
        # Existing refs are always below next_ref, so equality means
        # setdefault has just inserted the key.
        if ref == self.next_ref:
            self.next_ref += 1
            self.changed = True
        return ref

    def bind(self, header):
        """Bind the value transformer to ``header``."""
        self.transformer.bind(header)

    @property
    def field_names(self):
        """The ref field name followed by the transformer's output names."""
        return (self.ref_field_name,) + self.transformer.output_field_names
def select(input_file, output_file, transform_spec):
    """Run the transformation described by ``transform_spec`` over CSV data.

    Rows are read from ``input_file`` with ``csv.reader`` and written to
    ``output_file`` with ``csv.writer``.  The spec is parsed into a
    ``FieldMaps``, and a ``SimpleTransformer`` built from it processes the
    rows.
    """
    source_rows = csv.reader(input_file)
    sink = csv.writer(output_file)

    spec_maps = FieldMaps()
    spec_maps.parse_from(transform_spec)

    transformer = SimpleTransformer(spec_maps)
    transformer.process(source_rows, sink)
class Map(object):
    """Table assigning an integer ref to each distinct set of mapped values.

    ``values`` maps the transformer's output for a row to its ref, and
    ``next_ref`` is the ref that the next unseen value will receive.
    """

    # Class-level default; translate() sets it to True on the instance
    # whenever it allocates a new ref.
    changed = False

    def __init__(self, map_field_maps, ref_field_name):
        """Create an empty map whose refs start at 0."""
        self.transformer = SimpleTransformer(map_field_maps)
        self.ref_field_name = ref_field_name
        self.values = dict()
        self.next_ref = 0

    def read(self, reader):
        """Load the map from ``reader``, replacing the values currently held.

        The first row yielded by ``reader`` is used as the header.  It must
        contain ``self.ref_field_name`` and every name in
        ``self.transformer.output_field_names``.  Every following row is
        reordered to ``self.field_names`` order, so that the ref comes
        first and the value fields follow.

        ``self.values`` and ``self.next_ref`` are only updated once all
        rows have been read and validated.

        Raises:
            MissingFieldError: a required field is absent from the header;
                the error is constructed with the name of the missing field.
            DuplicateValuesError: two rows carry the same value fields.
            DuplicateRefsError: two rows carry the same ref.
            ValueError: a ref cannot be converted with ``int``.
        """
        rows = iter(reader)
        # The builtin next() works on any iterator, unlike the .next() method.
        header = next(rows)

        if self.ref_field_name not in header:
            raise MissingFieldError(self.ref_field_name)
        for field_name in self.transformer.output_field_names:
            if field_name not in header:
                # Report the field that is actually missing.
                raise MissingFieldError(field_name)

        # first field is ID
        field_maps = FieldMaps()
        for input_field_name in self.field_names:
            field_maps.add(input_field_name, input_field_name)
        map_transformer = SimpleTransformer(field_maps)
        map_transformer.bind(header)

        count = 0
        values = dict()
        for row in rows:
            transformed_row = map_transformer.transform(row)
            ref = int(transformed_row[0])
            value = transformed_row[1:]
            values[value] = ref
            count += 1

        # A repeated value overwrites its dict entry, so fewer keys than
        # rows means at least one value occurred twice.
        if count != len(values):
            raise DuplicateValuesError()
        # Likewise, fewer distinct refs than rows means a ref was reused.
        if count != len(set(values.values())):
            raise DuplicateRefsError()

        self.values = values
        # A header-only file (what write() emits for an empty map) is valid;
        # max() would raise on it, so restart at the initial ref instead.
        self.next_ref = max(values.values()) + 1 if values else 0

    def write(self, writer):
        """Write the header row, then one ``(ref, *value)`` row per entry."""
        writer.writerow(self.field_names)
        # items() rather than iteritems() so this runs on Python 2 and 3.
        for value, ref in self.values.items():
            writer.writerow((ref,) + tuple(value))

    def translate(self, input_row):
        """Return the ref for ``input_row``, allocating one if it is unseen.

        The lookup key is ``self.transformer.transform(input_row)``.  When
        a new ref is allocated, ``next_ref`` is advanced and ``changed`` is
        set to True.
        """
        key = self.transformer.transform(input_row)
        ref = self.values.setdefault(key, self.next_ref)
        # Existing refs are always below next_ref, so equality means
        # setdefault has just inserted the key.
        if ref == self.next_ref:
            self.next_ref += 1
            self.changed = True
        return ref

    def bind(self, header):
        """Bind the value transformer to ``header``."""
        self.transformer.bind(header)

    @property
    def field_names(self):
        """The ref field name followed by the transformer's output names."""
        return (self.ref_field_name,) + self.transformer.output_field_names