def append_files_to_dataset(files, dataset):
    """Append every row of the given CSV files to ``dataset``.

    The files are read as semicolon-delimited CSV in binary mode and each
    cell is decoded from latin-1 to ``unicode``.  The target schema and the
    mapping from CSV header names to schema keys both come from
    ``crp.build_new_schema(files, dataset.schema)``.

    Each row is written via ``dataset.write_row`` as a dict holding every
    schema key; keys the row does not supply keep the value ``None``.

    :param files: iterable of paths of the CSV files to read.
    :param dataset: object exposing ``schema`` and ``write_row(dict)``.
    :raises KeyError: if a row carries a header missing from the header map.
    """
    header_map, schema = crp.build_new_schema(files, dataset.schema)
    # Every written row starts from this all-None template so that it always
    # carries the complete set of schema keys.
    template = {k: None for k in schema}
    for f in files:
        with open(f, "rb") as handle:
            for row in csv.DictReader(handle, delimiter=";"):
                instance = copy.copy(template)
                for k, v in row.iteritems():
                    if v is None:
                        # DictReader pads rows that are shorter than the
                        # header with None (its default ``restval``);
                        # unicode(None, ...) would raise TypeError and abort
                        # the import, so keep the template's None instead.
                        continue
                    instance[header_map[k]] = unicode(v, "latin-1")
                dataset.write_row(instance)
def test_build_new_schema(self, mock_open, mock_csv):
    # Two input files whose readers both report the headers attr1/attr2,
    # merged into an existing schema that only knows attr1.
    current_schema = ["attr1"]
    paths = [None, None]

    # Every csv.DictReader(...) call hands back the same fake reader.
    reader = mock.Mock(fieldnames=["attr1", "attr2"])
    mock_csv.DictReader = mock.Mock(return_value=reader)

    header_map, schema = crp.build_new_schema(paths, current_schema)

    # One open() per input file.
    self.assertEqual(mock_open.call_count, 2)
    self.assertEqual(header_map, {"attr1": "attr1", "attr2": "attr2"})
    self.assertEqual(schema, ["attr1", "attr2"])