def test_wrong_pivot_config(self):
    """A pivot_config missing either of its two columns raises ValueError."""
    incomplete_configs = (
        # Only the value column is given.
        {"pivot_config": {"value_column": "v_col"}},
        # Only the pivot column is given.
        {"pivot_config": {"pivot_column": "p_col"}},
    )
    for incomplete in incomplete_configs:
        with self.assertRaises(ValueError):
            TableConfig(incomplete)
def _list_s3_keys(prefix):
    """Return the keys of all objects in ``BUCKET`` stored under ``prefix``.

    The listing is requested page by page: as long as a response is flagged
    as truncated, the next page is fetched with the continuation token so
    that no keys are dropped.

    NOTE(review): assumes ``s3_client`` is a boto3 S3 client -- confirm.
    """
    keys = []
    request = {"Bucket": BUCKET, "Prefix": prefix}
    while True:
        response = s3_client.list_objects_v2(**request)
        # "Contents" is left out of the response when nothing matches.
        keys.extend(entry["Key"] for entry in response.get("Contents", []))
        if not response.get("IsTruncated"):
            return keys
        request["ContinuationToken"] = response["NextContinuationToken"]


def _csv_key(xls_key, input_prefix, output_prefix):
    """Map an input object key to the key of its converted CSV file.

    The input prefix is stripped, everything from the last ".xls"
    (case-insensitive, so ".XLSX" matches too) is dropped, and ".csv" is
    appended. A name without ".xls" is kept whole instead of being sliced
    with the -1 that ``rfind`` returns, which would chop its last character.
    """
    filename = xls_key[len(input_prefix):]
    extension_start = filename.lower().rfind(".xls")
    stem = filename if extension_start == -1 else filename[:extension_start]
    return output_prefix + stem + ".csv"


def xls_to_csv(event, context):
    """Lambda handler: convert every object under the input prefix to CSV.

    Args:
        event: Lambda event; parsed with ``Config.from_lambda_event``.
        context: Lambda context; unused.

    Returns:
        The step data as a dict (via ``asdict``), with ``s3_input_prefixes``
        pointing at the intermediate output prefix and ``status`` set to
        ``"OK"``.

    Raises:
        ValueError: If the step does not have exactly one input dataset, or
            if no objects are found under the input prefix.
    """
    config = Config.from_lambda_event(event)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data
    input_prefixes = step_data.s3_input_prefixes

    if step_data.input_count < 1:
        raise ValueError("No input dataset prefix defined")
    if step_data.input_count > 1:
        raise ValueError(f"Too many dataset inputs: {input_prefixes}")

    input_dataset = list(input_prefixes)[0]
    input_prefix = input_prefixes[input_dataset]
    output_prefix = (
        output_dataset.s3_prefix.replace("%stage%", "intermediate")
        + config.task
        + "/"
    )

    table_config = TableConfig(config.task_config)

    xls_keys = _list_s3_keys(input_prefix)
    if not xls_keys:
        # Fail with a clear message rather than a bare KeyError on "Contents".
        raise ValueError(f"No objects found under input prefix: {input_prefix}")

    for xls_key in xls_keys:
        convert_to_csv(
            xls_key,
            _csv_key(xls_key, input_prefix, output_prefix),
            table_config,
        )

    # The next pipeline step reads its input from where the CSVs were written.
    config.payload.step_data.s3_input_prefixes = {
        output_dataset.id: output_prefix
    }
    config.payload.step_data.status = "OK"
    return asdict(config.payload.step_data)
def test_no_config(self):
    """TableConfig(None) exposes the default settings.

    Expected defaults: sheet 0, no column names, a header row, and a first
    table source starting at row 1, column 1.
    """
    defaults = TableConfig(None)

    self.assertEqual(defaults.sheet_name, 0)
    self.assertEqual(defaults.column_names, None)
    self.assertEqual(defaults.table_has_header, True)

    first_source = defaults.table_sources[0]
    self.assertEqual(first_source.start_row, 1)
    self.assertEqual(first_source.start_col, 1)
def test_column_names(self):
    """Configured column names are exposed unchanged on the config."""
    expected_names = ["A", "B", "C"]
    settings = {
        "sheet_name": "foo",
        # Pass a copy so the expectation stays independent of the config.
        "column_names": list(expected_names),
        "table_has_header": True,
        "table_sources": [],
    }

    parsed = TableConfig(settings)

    self.assertEqual(parsed.column_names, expected_names)
def test_malformed_column_names(self):
    """Integer column names are rejected with a TypeError."""
    bad_settings = {
        "sheet_name": "foo",
        "column_names": [1, 2, 3],
        "table_has_header": True,
        "table_sources": [],
    }
    self.assertRaises(TypeError, TableConfig, bad_settings)
def test_pivot_config(self):
    """Both pivot_config columns are readable from the parsed config."""
    pivot_settings = {"pivot_column": "p_col", "value_column": "v_col"}

    parsed = TableConfig({"pivot_config": pivot_settings})

    self.assertEqual(parsed.pivot_config.pivot_column, "p_col")
    self.assertEqual(parsed.pivot_config.value_column, "v_col")
def test_befolkning_pivot(self):
    """Convert Befolking_test_data.xlsx with a pivot on "Alder".

    The result must be a pandas DataFrame with two rows whose column ``99``
    holds the values 0 and 3, in that order.
    """
    pivot_settings = {
        "pivot_column": "Alder",
        "value_column": "Antall personer",
    }
    converter = TableConverter(TableConfig({"pivot_config": pivot_settings}))

    workbook_path = os.path.join(CWD, "data", "Befolking_test_data.xlsx")
    workbook = converter.read_excel_table(workbook_path)
    frame = converter.convert_table(workbook)

    self.assertEqual(type(frame), pd.DataFrame)
    self.assertEqual(len(frame.index), 2)

    column_99 = list(frame[99])
    self.assertEqual(column_99[0], 0)
    self.assertEqual(column_99[1], 3)
def test_extra_col(self):
    """The extra column definition and its cell position are parsed.

    The dtype given as the string "int" must come back as the builtin
    ``int``, and the table source must keep its extra_row / extra_col.
    """
    source = {
        "start_row": 3,
        "start_col": 14,
        "extra_row": 13,
        "extra_col": 37,
    }
    extra_column = {"name": "year", "dtype": "int"}

    parsed = TableConfig(
        {
            "sheet_name": "foo",
            "column_names": ["A"],
            "table_has_header": True,
            "table_sources": [source],
            "extra_col": extra_column,
        }
    )

    self.assertEqual(parsed.extra_col.name, "year")
    self.assertEqual(parsed.extra_col.dtype, int)

    first_source = parsed.table_sources[0]
    self.assertEqual(first_source.extra_row, 13)
    self.assertEqual(first_source.extra_col, 37)
def test_table_sources(self):
    """Several table sources keep their order and start positions."""
    # (start_row, start_col) for each source, in configuration order.
    expected_starts = [(13, 37), (3, 14)]

    parsed = TableConfig(
        {
            "sheet_name": "foo",
            "column_names": ["A"],
            "table_has_header": True,
            "table_sources": [
                {"start_row": row, "start_col": col}
                for row, col in expected_starts
            ],
        }
    )

    for index, (row, col) in enumerate(expected_starts):
        self.assertEqual(parsed.table_sources[index].start_row, row)
        self.assertEqual(parsed.table_sources[index].start_col, col)
import os import sys import unittest from copy import copy import pandas as pd import xlrd from okdata.pipeline.converters.xls.TableConfig import TableConfig from okdata.pipeline.converters.xls.TableConverter import TableConverter CWD = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.join(CWD, "..")) empty_config = TableConfig(None) config = TableConfig( { "sheet_name": "Sheet1", "table_has_header": True, "column_names": ["A", "B"], "table_sources": [{"start_row": 1, "start_col": 1}], } ) wrong_sheet_name_config = TableConfig( { "sheet_name": "this sheet does not exist", "table_has_header": True, "column_names": ["A", "B"], "table_sources": [{"start_row": 1, "start_col": 1}],
def test_missing_column_names(self):
    """A headerless table without column names raises ValueError."""
    headerless = {"table_has_header": False}
    self.assertRaises(ValueError, TableConfig, headerless)
def test_malformed_config(self):
    """Passing a list as the configuration raises TypeError."""
    not_a_mapping = []
    self.assertRaises(TypeError, TableConfig, not_a_mapping)