def test_union_all(self, mock_flat_table):
    """Check that ``union_all`` over three one-table collections exposes all three names."""
    table_names = ("MCO", "DCIR", "MCO_CE")
    # One collection per name, built in the same order as the names above.
    collections = [
        FlatTableCollection({table_name: mock_flat_table})
        for table_name in table_names
    ]
    merged = FlatTableCollection.union_all(collections)
    self.assertEqual(set(table_names), merged.flat_table_names())
def plot_and_save_flattening_stat(
    json, pdf_path, figsize=(8, 5), id_col="NUM_ENQ", date_col="EXE_SOI_DTD", years=None
):
    """
    Visualize the flattening statistics and save the result in a pdf.

    The metadata is loaded with ``FlatTableCollection.load`` and every flat
    table it contains is handed to ``save_plots``.

    :param json: flat table meta data, must be a ``str``
    :param pdf_path: the path of the pdf in which the stats are saved
    :param figsize: size of the figure, default = (8, 5)
    :param id_col: 'str' identity column, default = 'NUM_ENQ'
    :param date_col: 'str' date column used for the 'group by' statement,
        default = 'EXE_SOI_DTD'
    :param years: a list of special years for which the data will be loaded,
        default is None
    :raises AssertionError: if ``json`` is not a ``str``
    :return: None
    """
    if not isinstance(json, str):
        # Raised explicitly instead of through an ``assert`` statement so the
        # check survives ``python -O``; the exception type and message are
        # kept unchanged for existing callers.
        raise AssertionError("expected a string in the json format")

    metadata = FlatTableCollection.load(json)
    flat_tables = list(metadata.flat_tables.values())
    save_plots(
        registry,
        pdf_path,
        flat_tables,
        figsize=figsize,
        id_col=id_col,
        date_col=date_col,
        years=years,
    )
def test_difference(self, mock_flat_table):
    """Check that ``difference`` against a DCIR-only collection leaves just MCO."""
    left = FlatTableCollection(dict.fromkeys(("MCO", "DCIR"), mock_flat_table))
    right = FlatTableCollection(dict.fromkeys(("DCIR",), mock_flat_table))
    remaining = left.difference(right)
    self.assertEqual({"MCO"}, remaining.flat_table_names())
def testFromJson(
    self, mock_flat_table_read_data_frame, mock_single_table_read_data_frame
):
    """Build a collection with ``FlatTableCollection.from_json`` and check its table names.

    Both mock arguments are given a three-row Spark DataFrame as their return
    value. The metadata describes five single tables and one flat table
    ("DCIR") built from them.
    """

    def make_frame():
        # Called once per mock so each one gets its own DataFrame instance.
        columns = {
            "NUM_ENQ": ["1", "2", "3"],
            "EXE_SOI_DTD": ["01/01/2015", "01/02/2016", "01/03/2017"],
        }
        return self.spark.createDataFrame(pd.DataFrame(columns))

    mock_flat_table_read_data_frame.return_value = make_frame()
    mock_single_table_read_data_frame.return_value = make_frame()

    # The literal is kept exactly as it is in the file, including its layout.
    metadata_json = """ { "class_name" : "fr.polytechnique.cmap.cnam.flattening.FlatteningMainJoin$", "start_timestamp" : "2019-09-26T13:30:24Z", "end_timestamp" : "2019-09-26T17:02:25Z", "operations" : [{ "output_table" : "ER_UCD_F", "output_path" : "/user/ds/CNAM243/flattening/single_table", "output_type" : "single_table", "sources" : ["/shared/Observapur/raw_data/DCIR_2010/ER_UCD_F_2010.CSV", "/shared/Observapur/raw_data/DCIR_2011/ER_UCD_F_2011.CSV", "/shared/Observapur/raw_data/DCIR_2012/ER_UCD_F_2012.CSV", "/shared/Observapur/raw_data/DCIR_2013/ER_UCD_F_2013.CSV", "/shared/Observapur/raw_data/DCIR_2014/ER_UCD_F_2014.CSV"], "join_keys" : [] }, { "output_table" : "ER_ETE_F", "output_path" : "/user/ds/CNAM243/flattening/single_table", "output_type" : "single_table", "sources" : ["/shared/Observapur/raw_data/DCIR_2010/ER_ETE_F_2010.CSV", "/shared/Observapur/raw_data/DCIR_2011/ER_ETE_F_2011.CSV", "/shared/Observapur/raw_data/DCIR_2012/ER_ETE_F_2012.CSV", "/shared/Observapur/raw_data/DCIR_2013/ER_ETE_F_2013.CSV", "/shared/Observapur/raw_data/DCIR_2014/ER_ETE_F_2014.CSV"], "join_keys" : [] }, { "output_table" : "ER_PHA_F", "output_path" : "/user/ds/CNAM243/flattening/single_table", "output_type" : "single_table", "sources" : ["/shared/Observapur/raw_data/DCIR_2010/ER_PHA_F_2010.CSV", "/shared/Observapur/raw_data/DCIR_2011/ER_PHA_F_2011.CSV", "/shared/Observapur/raw_data/DCIR_2012/ER_PHA_F_2012.CSV", "/shared/Observapur/raw_data/DCIR_2013/ER_PHA_F_2013.CSV", "/shared/Observapur/raw_data/DCIR_2014/ER_PHA_F_2014.CSV"], 
"join_keys" : [] }, { "output_table" : "ER_PRS_F", "output_path" : "/user/ds/CNAM243/flattening/single_table", "output_type" : "single_table", "sources" : ["/shared/Observapur/raw_data/DCIR_2010/ER_PRS_F_2010.CSV", "/shared/Observapur/raw_data/DCIR_2011/ER_PRS_F_2011.CSV", "/shared/Observapur/raw_data/DCIR_2012/ER_PRS_F_2012.CSV", "/shared/Observapur/raw_data/DCIR_2013/ER_PRS_F_2013.CSV", "/shared/Observapur/raw_data/DCIR_2014/ER_PRS_F_2014.CSV"], "join_keys" : [] }, { "output_table" : "ER_CAM_F", "output_path" : "/user/ds/CNAM243/flattening/single_table", "output_type" : "single_table", "sources" : ["/shared/Observapur/raw_data/DCIR_2010/ER_CAM_F_2010.CSV", "/shared/Observapur/raw_data/DCIR_2011/ER_CAM_F_2011.CSV", "/shared/Observapur/raw_data/DCIR_2012/ER_CAM_F_2012.CSV", "/shared/Observapur/raw_data/DCIR_2013/ER_CAM_F_2013.CSV", "/shared/Observapur/raw_data/DCIR_2014/ER_CAM_F_2014.CSV"], "join_keys" : [] }, { "output_table" : "DCIR", "output_path" : "/user/ds/CNAM243/flattening/flat_table", "output_type" : "flat_table", "sources" : ["ER_PRS_F", "ER_UCD_F", "ER_CAM_F", "ER_ETE_F", "ER_PHA_F"], "join_keys" : ["DCT_ORD_NUM", "FLX_DIS_DTD", "FLX_EMT_NUM", "FLX_EMT_ORD", "FLX_EMT_TYP", "FLX_TRT_DTD", "ORG_CLE_NUM", "PRS_ORD_NUM", "REM_TYP_AFF"] }] } """
    collection = FlatTableCollection.from_json(metadata_json)

    # Only the flat table is listed at collection level.
    self.assertEqual({"DCIR"}, collection.flat_table_names())

    single_names_of_dcir = {
        "ER_PRS_F",
        "ER_UCD_F",
        "ER_CAM_F",
        "ER_ETE_F",
        "ER_PHA_F",
    }
    self.assertEqual(
        single_names_of_dcir, collection.single_table_names_from_flat_table("DCIR")
    )
    # A flat table that is not in the metadata yields an empty set.
    self.assertEqual(set(), collection.single_table_names_from_flat_table("MCO"))

    dcir = collection.get("DCIR")
    # NOTE(review): `characteristics` is compared with the table name, exactly
    # as before — confirm this is the intended expected value.
    for attribute in ("name", "characteristics"):
        self.assertEqual(
            "ER_PRS_F", getattr(dcir.single_tables.get("ER_PRS_F"), attribute)
        )
def test_add_flat_table(self, mock_flat_table):
    """Check that ``add_flat_table`` makes the new name visible next to the initial ones."""
    initial_tables = dict.fromkeys(("MCO", "DCIR"), mock_flat_table)
    collection = FlatTableCollection(initial_tables)

    collection.add_flat_table("MCO_CE", mock_flat_table)

    self.assertEqual({"MCO", "DCIR", "MCO_CE"}, collection.flat_table_names())
def test_get(self, mock_flat_table):
    """Check that ``get`` returns the stored table and raises ``KeyError`` for an unknown name."""
    repo = FlatTableCollection({"MCO": mock_flat_table, "DCIR": mock_flat_table})
    # `assertEquals` is a deprecated alias that was removed in Python 3.12;
    # `assertEqual` is the supported spelling.
    self.assertEqual(repo.get("MCO"), mock_flat_table)
    self.assertRaises(KeyError, repo.get, "MCO_CE")
def test_exists(self, mock_flat_table):
    """Check that ``exists`` is truthy for a name the collection was built with."""
    known_tables = dict.fromkeys(("MCO", "DCIR"), mock_flat_table)
    collection = FlatTableCollection(known_tables)
    self.assertTrue(collection.exists("MCO"))