Python Table.concat 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: parsons.etl.table

클래스/타입: Table

메소드/함수: concat

hotexamples.com에서의 예제들: 4

Python Table.concat - 4개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 parsons.etl.table.Table.concat에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

Table(30)

from_csv(12)

from_json(5)

table(4)

from_csv_string(4)

select_rows(4)

concat(4)

unpack_dict(4)

unpack_list(4)

to_csv(3)

rename_column(3)

map_columns(3)

long_table(3)

to_dicts(2)

get_normalized_column_name(2)

remove_null_rows(2)

move_column(2)

add_column(2)

unpack_nested_columns_as_rows(2)

materialize(2)

cut(2)

get_column_max_width(2)

convert_columns_to_str(1)

append_csv(1)

chunk(1)

coalesce_columns(1)

to_zip_csv(1)

to_s3_csv(1)

to_redshift(1)

to_postgres(1)

to_petl(1)

to_json(1)

to_html(1)

column_data(1)

convert_column(1)

to_civis(1)

sort(1)

get_columns_type_stats(1)

set_header(1)

convert_table(1)

row_data(1)

empty_column(1)

remove_column(1)

reduce_rows(1)

fill_column(1)

materialize_to_file(1)

fillna_column(1)

match_columns(1)

from_columns(1)

from_postgres(1)

예제 #1

파일 보기

    def query_records(self, table_from=None):
        """
        Query records in a Quickbase table. This follows the patterns laid out
        in Quickbase query documentaiton, located here:
        https://help.quickbase.com/api-guide/componentsquery.html

        `Args:`
            from: str
                The ID of a Quickbase resource (i.e. a table) to query.
        `Returns:`
            Table Class
        """
        req_resp = \
            (self.client.request(f'{self.api_hostname}/records/query',
                                 'POST',
                                 json={"from": table_from}).json())

        resp_tbl = Table(req_resp['data'])
        cleaned_tbl = Table()

        for row in resp_tbl:
            row_dict = {}
            for column in resp_tbl.columns:
                row_dict[column] = row[column]['value']
            cleaned_tbl.concat(Table([row_dict]))

        column_resp = req_resp['fields']
        column_map = {}
        for entry in column_resp:
            column_map[str(entry['id'])] = entry['label'].lower().strip()

        for column in cleaned_tbl.columns:
            cleaned_tbl.rename_column(column, column_map[column])

        return cleaned_tbl

예제 #2

파일 보기

파일: test_etl.py 프로젝트: shepardjma/parsons

class TestParsonsTable(unittest.TestCase):
    def setUp(self):

        # Create Table object
        self.lst = [{
            'a': 1,
            'b': 2,
            'c': 3
        }, {
            'a': 4,
            'b': 5,
            'c': 6
        }, {
            'a': 7,
            'b': 8,
            'c': 9
        }, {
            'a': 10,
            'b': 11,
            'c': 12
        }, {
            'a': 13,
            'b': 14,
            'c': 15
        }]
        self.lst_dicts = [{'first': 'Bob', 'last': 'Smith'}]
        self.tbl = Table(self.lst_dicts)

        # Create a tmp dir
        os.mkdir('tmp')

    def tearDown(self):

        # Delete tmp folder and files
        shutil.rmtree('tmp')

    def test_from_list_of_dicts(self):

        tbl = Table(self.lst)

        # Test Iterate and is list like
        self.assertEqual(tbl[0], {'a': 1, 'b': 2, 'c': 3})

    def test_from_list_of_lists(self):

        list_of_lists = [
            ['a', 'b', 'c'],
            [1, 2, 3],
            [4, 5, 6],
        ]
        tbl = Table(list_of_lists)

        self.assertEqual(tbl[0], {'a': 1, 'b': 2, 'c': 3})

    def test_from_petl(self):

        nrows = 10
        ptbl = petl.dummytable(numrows=nrows)
        tbl = Table(ptbl)
        self.assertEqual(tbl.num_rows, nrows)

    def test_from_invalid_list(self):

        # Tests that a table can't be created from a list of invalid items
        list_of_invalid = [1, 2, 3]
        self.assertRaises(ValueError, Table, list_of_invalid)

    def test_from_empty_petl(self):
        self.assertRaises(ValueError, Table, None)

    def test_from_empty_list(self):
        # Just ensure this doesn't throw an error
        Table()
        Table([])
        Table([[]])

    def test_materialize(self):
        # Simple test that materializing doesn't change the table
        tbl_materialized = Table(self.lst_dicts)
        tbl_materialized.materialize()

        assert_matching_tables(self.tbl, tbl_materialized)

    def test_materialize_to_file(self):
        # Simple test that materializing doesn't change the table
        tbl_materialized = Table(self.lst_dicts)
        tbl_materialized.materialize_to_file()

        assert_matching_tables(self.tbl, tbl_materialized)

    def test_empty_column(self):
        # Test that returns True on an empty column and False on a populated one.

        tbl = Table([['a', 'b'], ['1', None], ['2', None]])

        self.assertTrue(tbl.empty_column('b'))
        self.assertFalse(tbl.empty_column('a'))

    def test_from_columns(self):

        header = ['col1', 'col2']
        col1 = [1, 2, 3]
        col2 = ['a', 'b', 'c']
        tbl = Table.from_columns([col1, col2], header=header)

        self.assertEqual(tbl[0], {'col1': 1, 'col2': 'a'})

    # Removing this test since it is an optional dependency.
    """
    def test_from_datafame(self):

        import pandas

        # Assert creates table without index
        tbl = Table(self.lst)
        tbl_from_df = Table.from_dataframe(tbl.to_dataframe())
        assert_matching_tables(tbl, tbl_from_df)


    def test_to_dataframe(self):

        # Is a dataframe
        self.assertIsInstance(self.tbl.to_dataframe(), pandas.core.frame.DataFrame)
    """

    def test_to_petl(self):

        # Is a petl table
        self.assertIsInstance(self.tbl.to_petl(), petl.io.json.DictsView)

    def test_to_html(self):

        html_file = 'tmp/test.html'

        # Test writing file
        self.tbl.to_html(html_file)

        # Test written correctly
        html = ("<table class='petl'>\n"
                "<thead>\n"
                "<tr>\n"
                "<th>first</th>\n"
                "<th>last</th>\n"
                "</tr>\n"
                "</thead>\n"
                "<tbody>\n"
                "<tr>\n"
                "<td>Bob</td>\n"
                "<td>Smith</td>\n"
                "</tr>\n"
                "</tbody>\n"
                "</table>\n")
        with open(html_file, 'r') as f:
            self.assertEqual(f.read(), html)

    def test_to_temp_html(self):

        # Test write to object
        path = self.tbl.to_html()

        # Written correctly
        html = ("<table class='petl'>\n"
                "<thead>\n"
                "<tr>\n"
                "<th>first</th>\n"
                "<th>last</th>\n"
                "</tr>\n"
                "</thead>\n"
                "<tbody>\n"
                "<tr>\n"
                "<td>Bob</td>\n"
                "<td>Smith</td>\n"
                "</tr>\n"
                "</tbody>\n"
                "</table>\n")
        with open(path, 'r') as f:
            self.assertEqual(f.read(), html)

    def _assert_expected_csv(self, path, orig_tbl):
        result_tbl = Table.from_csv(path)
        assert_matching_tables(orig_tbl, result_tbl)

    def test_to_from_csv(self):
        path = 'tmp/test.csv'
        self.tbl.to_csv(path)
        self._assert_expected_csv(path, self.tbl)
        os.remove(path)

    def test_to_from_csv_compressed(self):
        path = 'tmp/test.csv.gz'
        self.tbl.to_csv(path)
        self._assert_expected_csv(path, self.tbl)
        os.remove(path)

    def test_to_from_temp_csv(self):
        path = self.tbl.to_csv()
        self._assert_expected_csv(path, self.tbl)

    def test_to_from_temp_csv_compressed(self):
        path = self.tbl.to_csv(temp_file_compression='gzip')
        self._assert_expected_csv(path, self.tbl)

    def test_from_csv_string(self):
        path = self.tbl.to_csv()
        # Pull the file into a string
        with open(path, 'r') as f:
            str = f.read()

        result_tbl = Table.from_csv_string(str)
        assert_matching_tables(self.tbl, result_tbl)

    def test_append_csv_compressed(self):
        path = self.tbl.to_csv(temp_file_compression='gzip')
        append_tbl = Table([{'first': 'Mary', 'last': 'Nichols'}])
        append_tbl.append_csv(path)

        result_tbl = Table.from_csv(path)
        # Combine tables, so we can check the resulting file
        self.tbl.concat(append_tbl)
        assert_matching_tables(self.tbl, result_tbl)

    def test_from_csv_raises_on_empty_file(self):
        # Create empty file
        path = 'tmp/empty.csv'
        open(path, 'a').close()

        self.assertRaises(ValueError, Table.from_csv, path)

    def test_to_csv_zip(self):

        try:
            # Test using the to_csv() method
            self.tbl.to_csv('myzip.zip')
            zip_archive.unzip_archive('myzip.zip')
            assert_matching_tables(self.tbl, Table.from_csv('myzip.csv'))

            # Test using the to_csv_zip() method
            self.tbl.to_zip_csv('myzip.zip')
            zip_archive.unzip_archive('myzip.zip')
            assert_matching_tables(self.tbl, Table.from_csv('myzip.csv'))
        finally:
            os.unlink('myzip.zip')
            os.unlink('myzip.csv')

    def test_to_civis(self):

        # Not really sure the best way to do this at the moment.
        pass

    def test_to_from_json(self):
        path = 'tmp/test.json'
        self.tbl.to_json(path)

        result_tbl = Table.from_json(path)
        assert_matching_tables(self.tbl, result_tbl)
        os.remove(path)

    def test_to_from_json_compressed(self):
        path = 'tmp/test.json.gz'
        self.tbl.to_json(path)

        result_tbl = Table.from_json(path)
        assert_matching_tables(self.tbl, result_tbl)
        os.remove(path)

    def test_to_from_temp_json(self):
        path = self.tbl.to_json()
        result_tbl = Table.from_json(path)
        assert_matching_tables(self.tbl, result_tbl)

    def test_to_from_temp_json_compressed(self):
        path = self.tbl.to_json(temp_file_compression='gzip')
        result_tbl = Table.from_json(path)
        assert_matching_tables(self.tbl, result_tbl)

    def test_to_from_json_line_delimited(self):
        path = 'tmp/test.json'
        self.tbl.to_json(path, line_delimited=True)

        result_tbl = Table.from_json(path, line_delimited=True)
        assert_matching_tables(self.tbl, result_tbl)
        os.remove(path)

    def test_to_from_json_line_delimited_compressed(self):
        path = 'tmp/test.json.gz'
        self.tbl.to_json(path, line_delimited=True)

        result_tbl = Table.from_json(path, line_delimited=True)
        assert_matching_tables(self.tbl, result_tbl)
        os.remove(path)

    def test_columns(self):
        # Test that columns are listed correctly
        self.assertEqual(self.tbl.columns, ['first', 'last'])

    def test_add_column(self):
        # Test that a new column is added correctly
        self.tbl.add_column('middle', index=1)
        self.assertEqual(self.tbl.columns[1], 'middle')

    def test_column_add_dupe(self):
        # Test that we can't add an existing column name
        self.assertRaises(ValueError, self.tbl.add_column, 'first')

    def test_remove_column(self):
        # Test that column is removed correctly
        self.tbl.remove_column('first')
        self.assertNotEqual(self.tbl.data[0], 'first')

    def test_rename_column(self):
        # Test that you can rename a column
        self.tbl.rename_column('first', 'f')
        self.assertEqual(self.tbl.columns[0], 'f')

    def test_column_rename_dupe(self):
        # Test that we can't rename to a column that already exists
        self.assertRaises(ValueError, self.tbl.rename_column, 'last', 'first')

    def test_fill_column(self):
        # Test that the column is filled
        tbl = Table(self.lst)

        # Fixed Value
        tbl.fill_column('c', 0)
        self.assertEqual(list(tbl.table['c']), [0] * tbl.num_rows)

        # Calculated Value
        tbl.fill_column('c', lambda x: x['b'] * 2)
        self.assertEqual(list(tbl.table['c']), [x['b'] * 2 for x in self.lst])

    def test_fillna_column(self):
        # Test that None values in the column are filled

        self.lst = [{
            'a': 1,
            'b': 2,
            'c': 3
        }, {
            'a': 4,
            'b': 5,
            'c': None
        }, {
            'a': 7,
            'b': 8,
            'c': 9
        }, {
            'a': 10,
            'b': 11,
            'c': None
        }, {
            'a': 13,
            'b': 14,
            'c': 15
        }]

        # Fixed Value only
        tbl = Table(self.lst)
        tbl.fillna_column('c', 0)
        self.assertEqual(list(tbl.table['c']), [3, 0, 9, 0, 15])

    def test_move_column(self):
        # Test moving a column from end to front
        self.tbl.move_column('last', 0)
        self.assertEqual(self.tbl.columns[0], 'last')

    def test_convert_column(self):
        # Test that column updates
        self.tbl.convert_column('first', 'upper')
        self.assertEqual(self.tbl[0], {'first': 'BOB', 'last': 'Smith'})

    def test_convert_columns_to_str(self):
        # Test that all columns are string
        mixed_raw = [{
            'col1': 1,
            'col2': 2,
            'col3': 3
        }, {
            'col1': 'one',
            'col2': 2,
            'col3': [3, 'three', 3.0]
        }, {
            'col1': {
                'one': 1,
                "two": 2.0
            },
            'col2': None,
            "col3": 'three'
        }]
        tbl = Table(mixed_raw)
        tbl.convert_columns_to_str()

        cols = tbl.get_columns_type_stats()
        type_set = {i for x in cols for i in x['type']}
        self.assertTrue('str' in type_set and len(type_set) == 1)

    def test_convert_table(self):
        # Test that the table updates
        self.tbl.convert_table('upper')
        self.assertEqual(self.tbl[0], {'first': 'BOB', 'last': 'SMITH'})

    def test_coalesce_columns(self):
        # Test coalescing into an existing column
        test_raw = [
            {
                'first': 'Bob',
                'last': 'Smith',
                'lastname': None
            },
            {
                'first': 'Jane',
                'last': '',
                'lastname': 'Doe'
            },
            {
                'first': 'Mary',
                'last': 'Simpson',
                'lastname': 'Peters'
            },
        ]
        tbl = Table(test_raw)
        tbl.coalesce_columns('last', ['last', 'lastname'])

        expected = Table([
            {
                'first': 'Bob',
                'last': 'Smith'
            },
            {
                'first': 'Jane',
                'last': 'Doe'
            },
            {
                'first': 'Mary',
                'last': 'Simpson'
            },
        ])
        assert_matching_tables(tbl, expected)

        # Test coalescing into a new column
        tbl = Table(test_raw)
        tbl.coalesce_columns('new_last', ['last', 'lastname'])
        expected = Table([
            {
                'first': 'Bob',
                'new_last': 'Smith'
            },
            {
                'first': 'Jane',
                'new_last': 'Doe'
            },
            {
                'first': 'Mary',
                'new_last': 'Simpson'
            },
        ])
        assert_matching_tables(tbl, expected)

    def test_unpack_dict(self):

        test_dict = [{'a': 1, 'b': {'nest1': 1, 'nest2': 2}}]
        test_table = Table(test_dict)

        # Test that dict at the top level
        test_table.unpack_dict('b', prepend=False)
        self.assertEqual(test_table.columns, ['a', 'nest1', 'nest2'])

    def test_unpack_list(self):

        test_table = Table([{'a': 1, 'b': [1, 2, 3]}])

        # Test that list at the top level
        test_table.unpack_list('b', replace=True)
        self.assertEqual(['a', 'b_0', 'b_1', 'b_2'], test_table.columns)

    def test_unpack_list_with_mixed_col(self):

        # Test unpacking column with non-list items
        mixed_tbl = Table([{
            'id': 1,
            'tag': [1, 2, None, 4]
        }, {
            'id': 2,
            'tag': None
        }])
        tbl_unpacked = Table(mixed_tbl.unpack_list('tag'))

        # Make sure result has the right number of columns
        self.assertEqual(len(tbl_unpacked.columns), 5)

        result_table = Table([{
            'id': 1,
            'tag_0': 1,
            'tag_1': 2,
            'tag_2': None,
            'tag_3': 4
        }, {
            'id': 2,
            'tag_0': None,
            'tag_1': None,
            'tag_2': None,
            'tag_3': None
        }])

        # Check that the values for both rows are distributed correctly
        self.assertEqual(result_table.data[0] + result_table.data[1],
                         tbl_unpacked.data[0] + tbl_unpacked.data[1])

    def test_unpack_nested_columns_as_rows(self):

        # A Table with mixed content
        test_table = Table([{
            'id': 1,
            'nested': {
                'A': 1,
                'B': 2,
                'C': 3
            },
            'extra': 'hi'
        }, {
            'id': 2,
            'nested': {
                'A': 4,
                'B': 5,
                'I': 6
            },
            'extra': 'hi'
        }, {
            'id': 3,
            'nested': 'string!',
            'extra': 'hi'
        }, {
            'id': 4,
            'nested': None,
            'extra': 'hi'
        }, {
            'id': 5,
            'nested': ['this!', 'is!', 'a!', 'list!'],
            'extra': 'hi'
        }])

        standalone = test_table.unpack_nested_columns_as_rows('nested')

        # Check that the columns are as expected
        self.assertEqual(['uid', 'id', 'nested', 'value'], standalone.columns)

        # Check that the row count is as expected
        self.assertEqual(standalone.num_rows, 11)

        # Check that the uids are unique, indicating that each row is unique
        self.assertEqual(len({row['uid'] for row in standalone}), 11)

    def test_unpack_nested_columns_as_rows_expanded(self):

        test_table = Table([{
            'id': 1,
            'nested': {
                'A': 1,
                'B': 2,
                'C': 3
            },
            'extra': 'hi'
        }, {
            'id': 2,
            'nested': {
                'A': 4,
                'B': 5,
                'I': 6
            },
            'extra': 'hi'
        }, {
            'id': 3,
            'nested': 'string!',
            'extra': 'hi'
        }, {
            'id': 4,
            'nested': None,
            'extra': 'hi'
        }, {
            'id': 5,
            'nested': ['this!', 'is!', 'a!', 'list!'],
            'extra': 'hi'
        }])

        expanded = test_table.unpack_nested_columns_as_rows(
            'nested', expand_original=True)

        # Check that the columns are as expected
        self.assertEqual(['uid', 'id', 'extra', 'nested', 'nested_value'],
                         expanded.columns)

        # Check that the row count is as expected
        self.assertEqual(expanded.num_rows, 12)

        # Check that the uids are unique, indicating that each row is unique
        self.assertEqual(len({row['uid'] for row in expanded}), 12)

    def test_cut(self):

        # Test that the cut works correctly
        cut_tbl = self.tbl.cut('first')
        self.assertEqual(cut_tbl.columns, ['first'])

    def test_row_select(self):

        tbl = Table([['foo', 'bar', 'baz'], ['c', 4, 9.3], ['a', 2, 88.2],
                     ['b', 1, 23.3]])
        expected = Table([{'foo': 'a', 'bar': 2, 'baz': 88.2}])

        # Try with this method
        select_tbl = tbl.select_rows("{foo} == 'a' and {baz} > 88.1")
        self.assertEqual(select_tbl.data[0], expected.data[0])

        # And try with this method
        select_tbl2 = tbl.select_rows(
            lambda row: row.foo == 'a' and row.baz > 88.1)
        self.assertEqual(select_tbl2.data[0], expected.data[0])

    def test_remove_null_rows(self):

        # Test that null rows are removed from a single column
        null_table = Table([{'a': 1, 'b': 2}, {'a': 1, 'b': None}])
        self.assertEqual(null_table.remove_null_rows('b').num_rows, 1)

        # Teest that null rows are removed from multiple columns
        null_table = Table([{
            'a': 1,
            'b': 2,
            'c': 3
        }, {
            'a': 1,
            'b': None,
            'c': 3
        }])
        self.assertEqual(null_table.remove_null_rows(['b', 'c']).num_rows, 1)

    def test_long_table(self):

        # Create a long table, that is 4 rows long
        tbl = Table([{'id': 1, 'tag': [1, 2, 3, 4]}])
        self.assertEqual(tbl.long_table(['id'], 'tag').num_rows, 4)

        # Assert that column has been dropped
        self.assertEqual(tbl.columns, ['id'])

        # Assert that column has been retained
        tbl_keep = Table([{'id': 1, 'tag': [1, 2, 3, 4]}])
        tbl_keep.long_table(['id'], 'tag', retain_original=True)
        self.assertEqual(tbl_keep.columns, ['id', 'tag'])

    def test_long_table_with_na(self):

        # Create a long table that is 4 rows long
        tbl = Table([{'id': 1, 'tag': [1, 2, 3, 4]}, {'id': 2, 'tag': None}])
        self.assertEqual(tbl.long_table(['id'], 'tag').num_rows, 4)

        # Assert that column has been dropped
        self.assertEqual(tbl.columns, ['id'])

        # Assert that column has been retained
        tbl_keep = Table([{
            'id': 1,
            'tag': [1, 2, 3, 4]
        }, {
            'id': 2,
            'tag': None
        }])
        tbl_keep.long_table(['id'], 'tag', retain_original=True)
        self.assertEqual(tbl_keep.columns, ['id', 'tag'])

    def test_rows(self):
        # Test that there is only one row in the table
        self.assertEqual(self.tbl.num_rows, 1)

    def test_first(self):
        # Test that the first value in the table is returned.
        self.assertEqual(self.tbl.first, 'Bob')

        # Test empty value returns None
        empty_tbl = Table([[1], [], [3]])
        self.assertIsNone(empty_tbl.first)

    def test_get_item(self):
        # Test indexing on table

        # Test a valid column
        tbl = Table(self.lst)
        lst = [1, 4, 7, 10, 13]
        self.assertEqual(tbl['a'], lst)

        # Test a valid row
        row = {'a': 4, 'b': 5, 'c': 6}
        self.assertEqual(tbl[1], row)

    def test_column_data(self):
        # Test that that the data in the column is returned as a list

        # Test a valid column
        tbl = Table(self.lst)
        lst = [1, 4, 7, 10, 13]
        self.assertEqual(tbl.column_data('a'), lst)

        # Test an invalid column
        self.assertRaises(TypeError, tbl['c'])

    def test_row_data(self):

        # Test a valid column
        tbl = Table(self.lst)
        row = {'a': 4, 'b': 5, 'c': 6}
        self.assertEqual(tbl.row_data(1), row)

    def test_stack(self):
        tbl1 = self.tbl
        tbl2 = Table([{'first': 'Mary', 'last': 'Nichols'}])
        # Different column names shouldn't matter for stack()
        tbl3 = Table([{'f': 'Lucy', 'l': 'Peterson'}])
        tbl1.stack(tbl2, tbl3)

        expected_tbl = Table(petl.stack(tbl1.table, tbl2.table, tbl3.table))
        assert_matching_tables(expected_tbl, tbl1)

    def test_concat(self):
        tbl1 = self.tbl
        tbl2 = Table([{'first': 'Mary', 'last': 'Nichols'}])
        tbl3 = Table([{'first': 'Lucy', 'last': 'Peterson'}])
        tbl1.concat(tbl2, tbl3)

        expected_tbl = Table(petl.cat(tbl1.table, tbl2.table, tbl3.table))
        assert_matching_tables(expected_tbl, tbl1)

    def test_chunk(self):

        test_table = Table(petl.randomtable(3, 499, seed=42))
        chunks = test_table.chunk(100)

        # Assert rows of each is 100
        for c in chunks[:3]:
            self.assertEqual(100, c.num_rows)

        # Assert last table is 99
        self.assertEqual(99, chunks[4].num_rows)

    def test_match_columns(self):
        raw = [
            {
                'first name': 'Mary',
                'LASTNAME': 'Nichols',
                'Middle__Name': 'D'
            },
            {
                'first name': 'Lucy',
                'LASTNAME': 'Peterson',
                'Middle__Name': 'S'
            },
        ]
        tbl = Table(raw)

        desired_raw = [
            {
                'first_name': 'Mary',
                'middle_name': 'D',
                'last_name': 'Nichols'
            },
            {
                'first_name': 'Lucy',
                'middle_name': 'S',
                'last_name': 'Peterson'
            },
        ]
        desired_tbl = Table(desired_raw)

        # Test with fuzzy matching
        tbl.match_columns(desired_tbl.columns)
        assert_matching_tables(desired_tbl, tbl)

        # Test disable fuzzy matching, and fail due due to the missing cols
        self.assertRaises(TypeError,
                          Table(raw).match_columns,
                          desired_tbl.columns,
                          fuzzy_match=False,
                          if_missing_columns='fail')

        # Test disable fuzzy matching, and fail due to the extra cols
        self.assertRaises(TypeError,
                          Table(raw).match_columns,
                          desired_tbl.columns,
                          fuzzy_match=False,
                          if_extra_columns='fail')

        # Test table that already has the right columns, shouldn't need fuzzy match
        tbl = Table(desired_raw)
        tbl.match_columns(desired_tbl.columns,
                          fuzzy_match=False,
                          if_missing_columns='fail',
                          if_extra_columns='fail')
        assert_matching_tables(desired_tbl, tbl)

        # Test table with missing col, verify the missing col gets added by default
        tbl = Table([
            {
                'first name': 'Mary',
                'LASTNAME': 'Nichols'
            },
            {
                'first name': 'Lucy',
                'LASTNAME': 'Peterson'
            },
        ])
        tbl.match_columns(desired_tbl.columns)
        desired_tbl = (
            Table(desired_raw).remove_column('middle_name').add_column(
                'middle_name', index=1))
        assert_matching_tables(desired_tbl, tbl)

        # Test table with extra col, verify the extra col gets removed by default
        tbl = Table([
            {
                'first name': 'Mary',
                'LASTNAME': 'Nichols',
                'Age': 32,
                'Middle__Name': 'D'
            },
            {
                'first name': 'Lucy',
                'LASTNAME': 'Peterson',
                'Age': 26,
                'Middle__Name': 'S'
            },
        ])
        desired_tbl = Table(desired_raw)
        tbl.match_columns(desired_tbl.columns)
        assert_matching_tables(desired_tbl, tbl)

        # Test table with two columns that normalize the same and aren't in desired cols, verify
        # they both get removed.
        tbl = Table([
            {
                'first name': 'Mary',
                'LASTNAME': 'Nichols',
                'Age': 32,
                'Middle__Name': 'D',
                'AGE': None
            },
            {
                'first name': 'Lucy',
                'LASTNAME': 'Peterson',
                'Age': 26,
                'Middle__Name': 'S',
                'AGE': None
            },
        ])
        tbl.match_columns(desired_tbl.columns)
        assert_matching_tables(desired_tbl, tbl)

        # Test table with two columns that match desired cols, verify only the first gets kept.
        tbl = Table([
            {
                'first name': 'Mary',
                'LASTNAME': 'Nichols',
                'First Name': None,
                'Middle__Name': 'D'
            },
            {
                'first name': 'Lucy',
                'LASTNAME': 'Peterson',
                'First Name': None,
                'Middle__Name': 'S'
            },
        ])
        tbl.match_columns(desired_tbl.columns)
        assert_matching_tables(desired_tbl, tbl)

    def test_to_dicts(self):
        self.assertEqual(self.lst, Table(self.lst).to_dicts())
        self.assertEqual(self.lst_dicts, self.tbl.to_dicts())

    def test_reduce_rows(self):
        table = [['foo', 'bar'], ['a', 3], ['a', 7], ['b', 2], ['b', 1],
                 ['b', 9], ['c', 4]]
        expected = [{
            "foo": "a",
            "barsum": 10
        }, {
            "foo": "b",
            "barsum": 12
        }, {
            "foo": "c",
            "barsum": 4
        }]

        ptable = Table(table)

        ptable.reduce_rows(
            'foo', lambda key, rows: [key, sum(row[1] for row in rows)],
            ['foo', 'barsum'])

        self.assertEqual(expected, ptable.to_dicts())

    def test_map_columns(self):

        input_tbl = Table([['fn', 'ln'], ['J', 'B']])

        expected_tbl = Table([['first_name', 'last_name'], ['J', 'B']])

        column_map = {
            'first_name': ['fn', 'first'],
            'last_name': ['last', 'ln']
        }

        input_tbl.map_columns(column_map)

        assert_matching_tables(input_tbl, expected_tbl)

    def test_get_column_max_with(self):

        tbl = Table([['a', 'b', 'c'],
                     ['wide_text', False, 'slightly longer text'],
                     ['text', 2, 'byte_text🏽‍⚕️✊🏽🤩']])

        # Basic test
        self.assertEqual(tbl.get_column_max_width('a'), 9)

        # Doesn't break for non-strings
        self.assertEqual(tbl.get_column_max_width('b'), 5)

        # Evaluates based on byte length rather than char length
        self.assertEqual(tbl.get_column_max_width('c'), 33)

    def test_sort(self):

        # Test basic sort
        unsorted_tbl = Table([['a', 'b'], [3, 1], [2, 2], [1, 3]])
        sorted_tbl = unsorted_tbl.sort()
        self.assertEqual(sorted_tbl[0], {'a': 1, 'b': 3})

        # Test column sort
        unsorted_tbl = Table([['a', 'b'], [3, 1], [2, 2], [1, 3]])
        sorted_tbl = unsorted_tbl.sort('b')
        self.assertEqual(sorted_tbl[0], {'a': 3, 'b': 1})

        # Test reverse sort
        unsorted_tbl = Table([['a', 'b'], [3, 1], [2, 2], [1, 3]])
        sorted_tbl = unsorted_tbl.sort(reverse=True)
        self.assertEqual(sorted_tbl[0], {'a': 3, 'b': 1})

    def test_set_header(self):

        # Rename columns
        tbl = Table([['one', 'two'], [1, 2], [3, 4]])
        new_tbl = tbl.set_header(['oneone', 'twotwo'])

        self.assertEqual(new_tbl[0], {'oneone': 1, 'twotwo': 2})

        # Change number of columns
        tbl = Table([['one', 'two'], [1, 2], [3, 4]])
        new_tbl = tbl.set_header(['one'])

        self.assertEqual(new_tbl[0], {'one': 1})

    def test_bool(self):
        empty = Table()
        not_empty = Table([{'one': 1, 'two': 2}])

        self.assertEqual(not empty, True)
        self.assertEqual(not not_empty, False)

예제 #3

파일 보기

파일: etl.py 프로젝트: kcym-3c/parsons

    def unpack_nested_columns_as_rows(self,
                                      column,
                                      key='id',
                                      expand_original=False):
        """
        Unpack list or dict values from one column into separate rows.
        Not recommended for JSON columns (i.e. lists of dicts), but can handle columns
        with any mix of types. Makes use of PETL's `melt()` method.

        `Args:`
            column: str
                The column name to unpack
            key: str
                The column to use as a key when unpacking. Defaults to `id`
            expand_original: boolean or int
                If `True`: Add resulting unpacked rows (with all other columns) to original
                If `int`: Add to original unless the max added per key is above the given number
                If `False` (default): Return unpacked rows (with `key` column only) as standalone
                Removes packed list and dict rows from original either way.
        `Returns:`
            If `expand_original`, original table with packed rows replaced by unpacked rows
            Otherwise, standalone table with key column and unpacked values only
        """

        if isinstance(expand_original, int) and expand_original is not True:
            lengths = {
                len(row[column])
                for row in self if isinstance(row[column], (dict, list))
            }
            max_len = sorted(lengths, reverse=True)[0]
            if max_len > expand_original:
                expand_original = False

        if expand_original:
            # Include all columns and filter out other non-dict types in table_list
            table = self
            table_list = table.select_rows(
                lambda row: isinstance(row[column], list))
        else:
            # Otherwise, include only key and column, but keep all non-dict types in table_list
            table = self.cut(key, column)
            table_list = table.select_rows(
                lambda row: not isinstance(row[column], dict))

        # All the columns other than column to ignore while melting
        ignore_cols = table.columns
        ignore_cols.remove(column)

        # Unpack lists as separate columns
        table_list.unpack_list(column, replace=True)

        # Rename the columns to retain only the number
        for col in table_list.columns:
            if f'{column}_' in col:
                table_list.rename_column(col, col.replace(f'{column}_', ""))

        # Filter dicts and unpack as separate columns
        table_dict = table.select_rows(
            lambda row: isinstance(row[column], dict))
        table_dict.unpack_dict(column, prepend=False)

        from parsons.etl.table import Table

        # Use melt to pivot both sets of columns into their own Tables and clean out None values
        melted_list = Table(petl.melt(table_list.table, ignore_cols))
        melted_dict = Table(petl.melt(table_dict.table, ignore_cols))

        melted_list.remove_null_rows('value')
        melted_dict.remove_null_rows('value')

        melted_list.rename_column('variable', column)
        melted_dict.rename_column('variable', column)

        # Combine the list and dict Tables
        melted_list.concat(melted_dict)

        import hashlib

        if expand_original:
            # Add unpacked rows to the original table (minus packed rows)
            orig = self.select_rows(
                lambda row: not isinstance(row[column], (dict, list)))
            orig.concat(melted_list)
            # Add unique id column by hashing all the other fields
            if 'uid' not in self.columns:
                orig.add_column(
                    'uid', lambda row: hashlib.md5(
                        str.encode(''.join([str(x)
                                            for x in row]))).hexdigest())
                orig.move_column('uid', 0)

            # Rename value column in case this is done again to this Table
            orig.rename_column('value', f'{column}_value')

            # Keep column next to column_value
            orig.move_column(column, -1)
            output = orig
        else:
            orig = self.remove_column(column)
            # Add unique id column by hashing all the other fields
            melted_list.add_column(
                'uid', lambda row: hashlib.md5(
                    str.encode(''.join([str(x) for x in row]))).hexdigest())
            melted_list.move_column('uid', 0)
            output = melted_list

        self = orig
        return output

예제 #4

파일 보기

    def copy_rows(self, source_table_name, destination_table_name, cutoff,
                  order_by, **kwargs):
        """
        Copy the rows from the source to the destination.

        `Args:`
            source_table_name: str
                Full table path (e.g. ``my_schema.my_table``)
            destination_table_name: str
                Full table path (e.g. ``my_schema.my_table``)
            cutoff:
                Start value to use as a minimum for incremental updates.
            order_by:
                Column to use to order the data to ensure a stable sort.
            **kwargs: args
                Optional copy arguments for destination database.
        `Returns:`
            ``None``
        """

        # Create the table objects
        source_table = self.source_db.table(source_table_name)

        # Initialize the Parsons table we will use to store rows before writing
        buffer = Table()

        # Track the number of retries we have left before giving up
        retries_left = self.retries + 1

        total_rows_downloaded = 0
        total_rows_written = 0
        rows_buffered = 0

        # Keep going until we break out
        while True:
            try:
                # Get the records to load into the database
                if cutoff:
                    # If we have a cutoff, we are loading data incrementally -- filter out
                    # any data before our cutoff
                    rows = source_table.get_new_rows(
                        primary_key=order_by,
                        cutoff_value=cutoff,
                        offset=total_rows_downloaded,
                        chunk_size=self.read_chunk_size)
                else:
                    # Get a chunk
                    rows = source_table.get_rows(
                        offset=total_rows_downloaded,
                        chunk_size=self.read_chunk_size,
                        order_by=order_by)

                number_of_rows = rows.num_rows
                total_rows_downloaded += number_of_rows

                # If we didn't get any data, exit the loop -- there's nothing to load
                if number_of_rows == 0:
                    # If we have any rows that are unwritten, flush them to the destination database
                    if rows_buffered > 0:
                        self.dest_db.copy(buffer,
                                          destination_table_name,
                                          if_exists='append',
                                          **kwargs)
                        total_rows_written += rows_buffered

                        # Reset the buffer
                        rows_buffered = 0
                        buffer = Table()

                    break

                # Add the new rows to our buffer
                buffer.concat(rows)
                rows_buffered += number_of_rows

                # If our buffer reaches our write threshold, write it out
                if rows_buffered >= self.write_chunk_size:
                    self.dest_db.copy(buffer,
                                      destination_table_name,
                                      if_exists='append',
                                      **kwargs)
                    total_rows_written += rows_buffered

                    # Reset the buffer
                    rows_buffered = 0
                    buffer = Table()

            except Exception:
                # Tick down the number of retries
                retries_left -= 1

                # If we are out of retries, fail
                if retries_left == 0:
                    logger.debug('No retries remaining')
                    raise

                # Otherwise, log the exception and try again
                logger.exception('Unhandled error copying data; retrying')

        return total_rows_written