def test_remove_null_rows(self):
    # A null value in a single checked column should drop that row
    single_col = Table([{'a': 1, 'b': 2}, {'a': 1, 'b': None}])
    self.assertEqual(single_col.remove_null_rows('b').num_rows, 1)

    # A null value in any of several checked columns should also drop the row
    multi_col = Table([
        {'a': 1, 'b': 2, 'c': 3},
        {'a': 1, 'b': None, 'c': 3},
    ])
    self.assertEqual(multi_col.remove_null_rows(['b', 'c']).num_rows, 1)
def unpack_nested_columns_as_rows(self, column, key='id', expand_original=False):
    """
    Unpack list or dict values from one column into separate rows.
    Not recommended for JSON columns (i.e. lists of dicts), but can handle
    columns with any mix of types. Makes use of PETL's `melt()` method.

    `Args:`
        column: str
            The column name to unpack
        key: str
            The column to use as a key when unpacking. Defaults to `id`
        expand_original: boolean or int
            If `True`: Add resulting unpacked rows (with all other columns) to original
            If `int`: Add to original unless the max added per key is above the given number
            If `False` (default): Return unpacked rows (with `key` column only) as standalone
            Removes packed list and dict rows from original either way.

    `Returns:`
        If `expand_original`, original table with packed rows replaced by unpacked rows
        Otherwise, standalone table with key column and unpacked values only
    """
    import hashlib

    from parsons.etl.table import Table

    # Integer threshold mode: only expand in place when the largest packed
    # value would add no more than `expand_original` rows per key.
    # (`expand_original is not True` guards against bool being an int.)
    if isinstance(expand_original, int) and expand_original is not True:
        lengths = {
            len(row[column]) for row in self
            if isinstance(row[column], (dict, list))
        }
        # max(..., default=0) avoids an IndexError when no rows are packed
        max_len = max(lengths, default=0)
        if max_len > expand_original:
            expand_original = False

    if expand_original:
        # Include all columns and filter out other non-dict types in table_list
        table = self
        table_list = table.select_rows(
            lambda row: isinstance(row[column], list))
    else:
        # Otherwise, include only key and column, but keep all non-dict types in table_list
        table = self.cut(key, column)
        table_list = table.select_rows(
            lambda row: not isinstance(row[column], dict))

    # All the columns other than column to ignore while melting
    ignore_cols = table.columns
    ignore_cols.remove(column)

    # Unpack lists as separate columns
    table_list.unpack_list(column, replace=True)

    # Rename the columns to retain only the number
    for col in table_list.columns:
        if f'{column}_' in col:
            table_list.rename_column(col, col.replace(f'{column}_', ""))

    # Filter dicts and unpack as separate columns
    table_dict = table.select_rows(
        lambda row: isinstance(row[column], dict))
    table_dict.unpack_dict(column, prepend=False)

    # Use melt to pivot both sets of columns into their own Tables
    # and clean out None values
    melted_list = Table(petl.melt(table_list.table, ignore_cols))
    melted_dict = Table(petl.melt(table_dict.table, ignore_cols))

    melted_list.remove_null_rows('value')
    melted_dict.remove_null_rows('value')

    melted_list.rename_column('variable', column)
    melted_dict.rename_column('variable', column)

    # Combine the list and dict Tables
    melted_list.concat(melted_dict)

    if expand_original:
        # Add unpacked rows to the original table (minus packed rows)
        orig = self.select_rows(
            lambda row: not isinstance(row[column], (dict, list)))
        orig.concat(melted_list)
        # Add unique id column by hashing all the other fields
        if 'uid' not in self.columns:
            orig.add_column(
                'uid',
                lambda row: hashlib.md5(
                    str.encode(''.join([str(x) for x in row]))).hexdigest())
            orig.move_column('uid', 0)
        # Rename value column in case this is done again to this Table
        orig.rename_column('value', f'{column}_value')
        # Keep column next to column_value
        orig.move_column(column, -1)
        output = orig
    else:
        orig = self.remove_column(column)
        # Add unique id column by hashing all the other fields
        melted_list.add_column(
            'uid',
            lambda row: hashlib.md5(
                str.encode(''.join([str(x) for x in row]))).hexdigest())
        melted_list.move_column('uid', 0)
        output = melted_list

    # BUG FIX: the original `self = orig` only rebound the local name and
    # never touched the caller's table, so packed rows were NOT removed from
    # the original as the docstring promises. Repoint this Table's underlying
    # petl table at the cleaned data instead.
    self.table = orig.table
    return output