def apply_map(map_path, data_path, out_file):
    """Apply a JSON mapping to data, and write the output.

    Reads the header row of the CSV at *data_path*, applies the mapping
    loaded from *map_path* to each input field name, and dumps the result
    as a JSON object to *out_file*. Progress and a summary are printed
    to stdout.

    Args:
        map_path (str): Path to mapping file
        data_path (str): Path to data file
        out_file (file): output stream

    Return:
        None
    """
    with open(map_path, 'r') as map_file:
        # 'latin-1' is the same codec as the old 'latin_1' alias; normalized
        # to match find_duplicates().
        mapping = mapper.Mapping(map_file, encoding='latin-1')
    # newline='' is the csv-module-recommended way to open CSV files;
    # the old 'rU' mode was removed in Python 3.11.
    with open(data_path, 'r', newline='') as data_file:
        data_csv = csv.reader(data_file)
        # First row is the header listing the input field names.
        # next() builtin replaces the Python-2-only .next() method.
        input_fields = next(data_csv)

    # Map each field; collect the JSON form of each result.
    d = {}
    matched, nomatch = mapping.apply(input_fields)
    for field, m in matched.items():
        d[field] = m.as_json()
        print('Mapped {} => {}'.format(field, m.field))
    for field in nomatch:
        print('* No mapping found for input field: {}'.format(field))
        # Record unmatched fields with a None target so the output JSON
        # still covers every input field.
        d[field] = mapper.MapItem(field, None).as_json()

    # Write mapping as a JSON. Keep best-effort behavior, but report the
    # failure instead of silently swallowing every exception.
    try:
        json.dump(d, out_file, ensure_ascii=True)
    except (TypeError, ValueError, OSError) as err:
        print('** Error: While writing:\n{}'.format(err))

    # write stats
    print('Mapped {} fields: {} OK and {} did not match'.format(
        len(input_fields), len(matched), len(nomatch)))
def find_duplicates(map_path, data_path, out_file):
    """Find duplicates created by a given mapping on a given input file.

    Two or more source fields in the input header that map to the same
    destination field are reported, one line per destination, as
    ``(count) destination: src1 | src2 | ...``.

    Args:
        map_path (str): Path to mapping file
        data_path (str): Path to data file
        out_file (file): output stream

    Return:
        None
    """
    with open(map_path, "r") as map_file:
        mapping = mapper.Mapping(map_file, encoding='latin-1')
    # newline='' replaces the 'rU' mode removed in Python 3.11;
    # next() builtin replaces the Python-2-only .next() method.
    with open(data_path, "r", newline='') as data_file:
        hdr = next(csv.reader(data_file))

    # seen_values: destination -> first source seen for it
    # dup: destination -> list of all sources mapping to it (only if >= 2)
    seen_values, dup = {}, {}
    for src in hdr:
        value = mapping.get(src, None)
        if value is None:
            # Header field has no mapping; not a duplicate concern.
            continue
        dst = value.field
        if dst in seen_values:
            # this is a duplicate destination
            # BUG FIX: the check must be on `dst` (the key of `dup`), not
            # `src` — the old `src in dup` test was essentially never true,
            # so a third duplicate source clobbered the accumulated list.
            if dst in dup:
                # we already have >= 1 duplicates: extend the list
                dup[dst].append(src)
            else:
                # first duplicate: record both the original and this source
                dup[dst] = [seen_values[dst], src]
        else:
            seen_values[dst] = src

    # print results
    for value, keys in dup.items():
        keylist = ' | '.join(keys)
        out_file.write(
            "({n:d}) {v}: {kl}\n".format(
                n=len(keys), v=value, kl=keylist,
            ),
        )