def test_c_bind_pair(self):
    """Column-bind two frames and compare the result with pandas concat."""
    left = Frame(self.sds, self.df_cb_1)
    right = Frame(self.sds, self.df_cb_2)
    combined = left.cbind(right).compute()
    self.assertTrue(isinstance(combined, pd.DataFrame))
    expected = pd.concat([self.df_cb_1, self.df_cb_2], axis=1)
    self.assertTrue(expected.equals(combined))
def test_r_bind_pair(self):
    """Row-bind two frames and compare the result with pandas concat."""
    top = Frame(self.sds, self.df_rb_1)
    bottom = Frame(self.sds, self.df_rb_2)
    combined = top.rbind(bottom).compute()
    self.assertTrue(isinstance(combined, pd.DataFrame))
    expected = pd.concat([self.df_rb_1, self.df_rb_2], ignore_index=True)
    self.assertTrue(expected.equals(combined))
def test_r_bind_triple_twostep(self):
    """Row-bind three frames in two separate steps and compare with pandas."""
    first = Frame(self.sds, self.df_rb_1)
    second = Frame(self.sds, self.df_rb_2)
    third = Frame(self.sds, self.df_rb_3)
    # First bind is materialized, then fed back in as a fresh frame.
    intermediate = first.rbind(second).compute()
    combined = Frame(self.sds, intermediate).rbind(third).compute()
    self.assertTrue(isinstance(combined, pd.DataFrame))
    expected = pd.concat(
        [self.df_rb_1, self.df_rb_2, self.df_rb_3], ignore_index=True
    )
    self.assertTrue(expected.equals(combined))
def read(self, path: os.PathLike, **kwargs: Dict[str, VALID_INPUT_TYPES]) -> 'OperationNode':
    """Read a file from disk into an operation node.

    Supported formats include: CSV, Matrix Market (coordinate),
    Text (i,j,v), and SystemDS binary.
    See: http://apache.github.io/systemds/site/dml-language-reference#readwrite-built-in-functions
    for more details.

    :param path: path of the file on disk.
    :param kwargs: named read arguments (e.g. ``data_type``, ``format``,
        ``value_type``) forwarded to the DML ``read`` built-in.
    :return: an OperationNode containing the read data.
    """
    data_type = kwargs.get("data_type", None)
    file_format = kwargs.get("format", None)
    if data_type == "frame":
        # DML expects string-valued named arguments to be quoted.
        kwargs["data_type"] = f'"{data_type}"'
        if isinstance(file_format, str):
            kwargs["format"] = f'"{kwargs["format"]}"'
        return Frame(self, None, f'"{path}"', **kwargs)
    elif data_type == "scalar":
        kwargs["data_type"] = f'"{data_type}"'
        value_type = kwargs.get("value_type", None)
        if value_type == "string":
            kwargs["value_type"] = f'"{kwargs["value_type"]}"'
        return OperationNode(
            self,
            "read",
            [f'"{path}"'],
            named_input_nodes=kwargs,
            shape=(-1, ),
            output_type=OutputType.SCALAR,
        )
    # Default: read as a matrix; the shape is unknown until execution.
    return OperationNode(
        self, "read", [f'"{path}"'], named_input_nodes=kwargs, shape=(-1, )
    )
def test_apply_recode_bin(self):
    """Encode a frame via a recode/bin JSON spec, then re-apply the metadata.

    Checks that transform_encode produces a numeric matrix with the same
    shape as the input, that the metadata frame reflects the spec (distinct
    recode values, requested bin counts), and that transform_apply with the
    produced metadata yields a matrix of the same shape.
    """
    with open(self.JSPEC_PATH) as jspec_file:
        JSPEC = json.load(jspec_file)
    F1 = self.sds.read(
        self.HOMES_PATH,
        data_type="frame",
        schema=self.HOMES_SCHEMA,
        format="csv",
        header=True,
    )
    pd_F1 = F1.compute()
    jspec = self.sds.read(self.JSPEC_PATH, data_type="scalar", value_type="string")
    X, M = F1.transform_encode(spec=jspec).compute()
    self.assertTrue(isinstance(X, np.ndarray))
    self.assertTrue(isinstance(M, pd.DataFrame))
    self.assertTrue(X.shape == pd_F1.shape)
    self.assertTrue(np.all(np.isreal(X)))
    # Recoded columns: metadata must list one entry per distinct value.
    for col_name in JSPEC["recode"]:
        self.assertTrue(M[col_name].nunique() == pd_F1[col_name].nunique())
    # Binned columns: metadata must list exactly the requested bin count.
    for binning in JSPEC["bin"]:
        col_name = binning["name"]
        self.assertTrue(M[col_name].nunique() == binning["numbins"])
    X2 = F1.transform_apply(spec=jspec, meta=Frame(self.sds, M)).compute()
    self.assertTrue(X.shape == X2.shape)
    self.assertTrue(np.all(np.isreal(X2)))
def test_write_read_csv(self):
    """Round-trip a frame through a CSV file on disk."""
    out_path = self.temp_dir + "02"
    Frame(self.sds, self.df).write(out_path, header=True, format="csv").compute()
    read_back = self.sds.read(out_path, data_type="frame", format="csv").compute()
    self.assertTrue(isinstance(read_back, pd.DataFrame))
    self.assertTrue(self.df.equals(read_back))
def test_write_read_binary(self):
    """Round-trip a frame through the default (binary) on-disk format."""
    out_path = self.temp_dir + "01"
    Frame(self.sds, self.df).write(out_path).compute()
    read_back = self.sds.read(out_path, data_type="frame").compute()
    # Compare raw values; column metadata may differ after a binary round-trip.
    self.assertTrue((self.df.values == read_back.values).all())