def test_table_init(self):
    tm = read_table_json("example/meta_data/db1/teams.json")
    self.assertTrue(tm.database is None)
    gtd = tm.glue_table_definition("full_db_path")
    self.assertTrue(
        gtd["StorageDescriptor"]["Location"] == "full_db_path/teams/"
    )

def test_add_remove_table(self): db = read_database_folder("example/meta_data/db1/") self.assertRaises(ValueError, db.remove_table, "not_a_table") db.remove_table("employees") tns = db.table_names self.assertEqual(set(tns), set(["teams", "pay"])) emp_table = read_table_json("example/meta_data/db1/employees.json") db.add_table(emp_table) t = all(t in ["teams", "employees", "pay"] for t in db.table_names) self.assertTrue(t) self.assertRaises(ValueError, db.add_table, "not a table obj") self.assertRaises(ValueError, db.add_table, emp_table)
import pyarrow as pa
import pyarrow.csv as csv
import pyarrow.parquet as pq
from pyarrow import fs

# REGION, read_table_json and convert_meta_col_to_arrow_tuple are assumed to
# be defined elsewhere in this module.


def read_csv_write_to_parquet(local_data_path, s3_path, local_meta_path):
    # pyarrow's S3FileSystem expects bucket/key paths without the scheme
    if s3_path.startswith("s3://"):
        s3_path = s3_path.replace("s3://", "", 1)

    local = fs.LocalFileSystem()
    s3 = fs.S3FileSystem(region=REGION)

    # Read the local CSV into an Arrow table
    with local.open_input_stream(local_data_path) as f:
        tab = csv.read_csv(f)

    # Build an Arrow schema from the table metadata, skipping partition columns
    metadata = read_table_json(local_meta_path)
    arrow_cols = []
    for col in metadata.columns:
        if col["name"] not in metadata.partitions:
            arrow_cols.append(convert_meta_col_to_arrow_tuple(col))
    s = pa.schema(arrow_cols)

    # Cast the table to the metadata schema and write it to S3 as parquet
    tab = tab.cast(s)
    with s3.open_output_stream(s3_path) as f:
        pq.write_table(tab, f)

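# A minimal usage sketch of read_csv_write_to_parquet. The data path and S3
# destination below are hypothetical placeholders, not files shipped with
# this repo; the metadata path reuses the example metadata referenced above.
if __name__ == "__main__":
    read_csv_write_to_parquet(
        local_data_path="data/teams.csv",                     # hypothetical local CSV
        s3_path="s3://my-bucket/db1/teams/teams.parquet",     # hypothetical destination
        local_meta_path="example/meta_data/db1/teams.json",
    )
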
def create_glue_database(self):
    """Creates glue database"""
    # Create database based on db_schema
    db = DatabaseMeta(**self.db_schema)
    for table_name, data_paths in self.meta_and_files.items():
        tm = read_table_json(data_paths["meta_path"], database=db)
        tm.data_format = "parquet"
        if tm.partitions:
            raise AttributeError(
                "Automated lookup tables can only be "
                "partitioned by their GitHub release"
            )
        # Add a release column as the first file partition to every table
        tm.add_column(
            name="release",
            type="character",
            description="github release tag of this lookup",
        )
        tm.partitions = ["release"]
        db.add_table(tm)
    db.create_glue_database(delete_if_exists=True)
    db.refresh_all_table_partitions()

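# A hedged sketch of the inputs create_glue_database reads: db_schema is
# unpacked into DatabaseMeta, and meta_and_files maps each table name to its
# metadata path. The keys and values below are assumptions for illustration,
# inferred from the attribute access above, not the documented interface.
example_db_schema = {
    "name": "lookup_db",           # hypothetical glue database name
    "bucket": "my-lookup-bucket",  # hypothetical S3 bucket for the data
    "base_folder": "lookups",      # hypothetical base folder within the bucket
}
example_meta_and_files = {
    "teams": {"meta_path": "example/meta_data/db1/teams.json"},
}
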
def test_glue_specific_table(self):
    t = read_table_json("example/meta_data/db1/pay.json")
    glue_def = t.glue_table_definition("db_path")
    self.assertTrue(
        glue_def["Parameters"]["skip.header.line.count"] == "1"
    )