Пример #1
0
 def test_wrong_pivot_config(self):
     """A pivot config that omits one of its two columns is rejected.

     Each dict below leaves out either ``pivot_column`` or
     ``value_column``; building a ``TableConfig`` from it must raise
     ``ValueError``.
     """
     incomplete_configs = (
         {"pivot_config": {"value_column": "v_col"}},  # no pivot_column
         {"pivot_config": {"pivot_column": "p_col"}},  # no value_column
     )
     for raw_config in incomplete_configs:
         with self.assertRaises(ValueError):
             TableConfig(raw_config)
Пример #2
0
def xls_to_csv(event, context):
    """Convert every object under the step's single input prefix to CSV.

    Builds a ``Config`` from the Lambda ``event``, lists the objects stored
    under the only input prefix in ``BUCKET`` and hands each one to
    ``convert_to_csv``. The output key is the output prefix followed by the
    object's name (relative to the input prefix) with everything from the
    last ``.xls`` onwards replaced by ``.csv``.

    Args:
        event: Lambda event, passed to ``Config.from_lambda_event``.
        context: Lambda context; not used.

    Returns:
        ``asdict`` of the step data after its ``s3_input_prefixes`` has been
        pointed at the output prefix and its ``status`` set to ``"OK"``.

    Raises:
        ValueError: If the step data reports fewer or more than one input.
        KeyError: If the listing response has no ``"Contents"`` entry
            (presumably when nothing is stored under the prefix).
    """
    config = Config.from_lambda_event(event)
    output_dataset = config.payload.output_dataset
    step_data = config.payload.step_data

    input_prefixes = step_data.s3_input_prefixes
    if step_data.input_count < 1:
        raise ValueError("No input dataset prefix defined")
    if step_data.input_count > 1:
        raise ValueError(f"Too many dataset inputs: {input_prefixes}")

    input_dataset = list(input_prefixes)[0]
    input_prefix = input_prefixes[input_dataset]
    output_prefix = (
        output_dataset.s3_prefix.replace("%stage%", "intermediate")
        + config.task
        + "/"
    )
    table_config = TableConfig(config.task_config)

    # NOTE(review): only one listing call is made. If ``s3_client`` is a
    # boto3 S3 client, a single call returns a limited page of keys -
    # confirm whether pagination is needed for large datasets.
    response = s3_client.list_objects_v2(Bucket=BUCKET, Prefix=input_prefix)

    for content in response["Contents"]:
        xls_key = content["Key"]

        # Object name relative to the input prefix.
        filename = xls_key[len(input_prefix):]

        # Strip the (case-insensitive) ".xls"/".xlsx" extension. ``rfind``
        # returns -1 when the name has no such extension; slicing with -1
        # would silently drop the last character, so keep the whole name.
        extension_start = filename.lower().rfind(".xls")
        if extension_start == -1:
            filename_prefix = filename
        else:
            filename_prefix = filename[:extension_start]

        output = output_prefix + filename_prefix + ".csv"

        convert_to_csv(xls_key, output, table_config)

    # The converted files become the input of the next pipeline step.
    config.payload.step_data.s3_input_prefixes = {
        output_dataset.id: output_prefix
    }
    config.payload.step_data.status = "OK"
    return asdict(config.payload.step_data)
Пример #3
0
 def test_no_config(self):
     """``TableConfig(None)`` falls back to its default settings.

     Expected defaults: sheet index 0, no column names, a header row, and
     a first table source starting at row 1, column 1.
     """
     defaults = TableConfig(None)

     self.assertEqual(defaults.sheet_name, 0)
     self.assertEqual(defaults.column_names, None)
     self.assertEqual(defaults.table_has_header, True)

     first_source = defaults.table_sources[0]
     self.assertEqual(first_source.start_row, 1)
     self.assertEqual(first_source.start_col, 1)
Пример #4
0
 def test_column_names(self):
     """Column names given in the raw config are exposed unchanged."""
     raw_config = {
         "sheet_name": "foo",
         "column_names": ["A", "B", "C"],
         "table_has_header": True,
         "table_sources": [],
     }
     parsed = TableConfig(raw_config)
     self.assertEqual(parsed.column_names, ["A", "B", "C"])
Пример #5
0
 def test_malformed_column_names(self):
     """Integer column names make ``TableConfig`` raise ``TypeError``."""
     raw_config = {
         "sheet_name": "foo",
         "column_names": [1, 2, 3],
         "table_has_header": True,
         "table_sources": [],
     }
     # Callable form of assertRaises: invokes TableConfig(raw_config).
     self.assertRaises(TypeError, TableConfig, raw_config)
Пример #6
0
 def test_pivot_config(self):
     """Both pivot settings are readable from the parsed pivot config."""
     pivot_settings = {
         "pivot_column": "p_col",
         "value_column": "v_col"
     }
     parsed = TableConfig({"pivot_config": pivot_settings})

     self.assertEqual(parsed.pivot_config.pivot_column, "p_col")
     self.assertEqual(parsed.pivot_config.value_column, "v_col")
Пример #7
0
 def test_befolkning_pivot(self):
     """Convert the test workbook with a pivot on the ``Alder`` column.

     The converted result must be a ``pandas.DataFrame`` with two rows
     whose column ``99`` holds 0 and 3.
     """
     pivot_settings = {
         "pivot_column": "Alder",
         "value_column": "Antall personer",
     }
     table_config = TableConfig({"pivot_config": pivot_settings})
     converter = TableConverter(table_config)

     workbook_path = os.path.join(CWD, "data", "Befolking_test_data.xlsx")
     workbook = converter.read_excel_table(workbook_path)
     frame = converter.convert_table(workbook)

     self.assertEqual(type(frame), pd.DataFrame)
     self.assertEqual(len(frame.index), 2)

     column_99 = list(frame[99])
     self.assertEqual(column_99[0], 0)
     self.assertEqual(column_99[1], 3)
Пример #8
0
 def test_extra_col(self):
     """The extra-column settings are parsed at both levels.

     The top-level ``extra_col`` entry yields a name and a dtype (the
     string ``"int"`` is expected to come back as the builtin ``int``),
     and the table source keeps its own ``extra_row``/``extra_col``.
     """
     source = {
         "start_row": 3,
         "start_col": 14,
         "extra_row": 13,
         "extra_col": 37
     }
     raw_config = {
         "sheet_name": "foo",
         "column_names": ["A"],
         "table_has_header": True,
         "table_sources": [source],
         "extra_col": {"name": "year", "dtype": "int"},
     }
     parsed = TableConfig(raw_config)

     self.assertEqual(parsed.extra_col.name, "year")
     self.assertEqual(parsed.extra_col.dtype, int)

     first_source = parsed.table_sources[0]
     self.assertEqual(first_source.extra_row, 13)
     self.assertEqual(first_source.extra_col, 37)
Пример #9
0
 def test_table_sources(self):
     """Several table sources are parsed and kept in the given order."""
     expected_starts = [(13, 37), (3, 14)]  # (start_row, start_col)
     raw_config = {
         "sheet_name": "foo",
         "column_names": ["A"],
         "table_has_header": True,
         "table_sources": [
             {"start_row": 13, "start_col": 37},
             {"start_row": 3, "start_col": 14},
         ],
     }
     parsed = TableConfig(raw_config)

     for position, (row, col) in enumerate(expected_starts):
         source = parsed.table_sources[position]
         self.assertEqual(source.start_row, row)
         self.assertEqual(source.start_col, col)
Пример #10
0
import os
import sys
import unittest
from copy import copy

import pandas as pd
import xlrd

from okdata.pipeline.converters.xls.TableConfig import TableConfig
from okdata.pipeline.converters.xls.TableConverter import TableConverter

# Absolute, symlink-resolved directory that contains this file.
CWD = os.path.dirname(os.path.realpath(__file__))
# Add the parent directory of this file to the module search path.
sys.path.append(os.path.join(CWD, ".."))

# Table configuration built from no settings at all.
empty_config = TableConfig(None)

# Table configuration for a sheet named "Sheet1": the table has a header,
# columns "A" and "B", and a single source starting at row 1, column 1.
config = TableConfig(
    {
        "sheet_name": "Sheet1",
        "table_has_header": True,
        "column_names": ["A", "B"],
        "table_sources": [{"start_row": 1, "start_col": 1}],
    }
)

wrong_sheet_name_config = TableConfig(
    {
        "sheet_name": "this sheet does not exist",
        "table_has_header": True,
        "column_names": ["A", "B"],
        "table_sources": [{"start_row": 1, "start_col": 1}],
Пример #11
0
 def test_missing_column_names(self):
     """A headerless table without ``column_names`` raises ``ValueError``."""
     headerless_without_names = {"table_has_header": False}
     # Callable form of assertRaises: invokes TableConfig(<dict>).
     self.assertRaises(ValueError, TableConfig, headerless_without_names)
Пример #12
0
 def test_malformed_config(self):
     """Passing a list as the config raises ``TypeError``."""
     not_a_mapping = []
     # Callable form of assertRaises: invokes TableConfig(not_a_mapping).
     self.assertRaises(TypeError, TableConfig, not_a_mapping)