예제 #1
0
 def test_c_bind_pair(self):
     """Column-bind two frames and compare with ``pd.concat(..., axis=1)``.

     Both inputs are wrapped in ``Frame`` nodes, combined with ``cbind`` and
     computed; the result must be a pandas DataFrame equal to the
     column-wise concatenation of the two source DataFrames.
     """
     f1 = Frame(self.sds, self.df_cb_1)
     f2 = Frame(self.sds, self.df_cb_2)
     result_df = f1.cbind(f2).compute()
     # assertIsInstance reports the offending type on failure, whereas
     # assertTrue(isinstance(...)) only reports "False is not true".
     self.assertIsInstance(result_df, pd.DataFrame)
     target_df = pd.concat([self.df_cb_1, self.df_cb_2], axis=1)
     self.assertTrue(target_df.equals(result_df))
예제 #2
0
 def test_r_bind_pair(self):
     """Row-bind two frames and compare with ``pd.concat(..., ignore_index=True)``.

     Both inputs are wrapped in ``Frame`` nodes, combined with ``rbind`` and
     computed; the result must be a pandas DataFrame equal to the row-wise
     concatenation of the two source DataFrames with a fresh index.
     """
     f1 = Frame(self.sds, self.df_rb_1)
     f2 = Frame(self.sds, self.df_rb_2)
     result_df = f1.rbind(f2).compute()
     # assertIsInstance reports the offending type on failure, whereas
     # assertTrue(isinstance(...)) only reports "False is not true".
     self.assertIsInstance(result_df, pd.DataFrame)
     target_df = pd.concat([self.df_rb_1, self.df_rb_2], ignore_index=True)
     self.assertTrue(target_df.equals(result_df))
예제 #3
0
 def test_write_read_csv(self):
     """Write ``self.df`` as CSV with a header, read it back, and compare.

     The frame read back must be a pandas DataFrame equal to the one that
     was written.
     """
     # Build the target path once so the write and the read cannot drift apart.
     csv_path = self.temp_dir + "02"
     frame = Frame(self.sds, self.df)
     frame.write(csv_path, header=True, format="csv").compute()
     NX = self.sds.read(csv_path,
                        data_type="frame",
                        format="csv")
     result_df = NX.compute()
     # assertIsInstance reports the offending type on failure, whereas
     # assertTrue(isinstance(...)) only reports "False is not true".
     self.assertIsInstance(result_df, pd.DataFrame)
     self.assertTrue(self.df.equals(result_df))
예제 #4
0
 def read(self, path: os.PathLike,
          **kwargs: Dict[str, VALID_INPUT_TYPES]) -> 'OperationNode':
     """ Read a file from disk. Supported types include:
     CSV, Matrix Market(coordinate), Text(i,j,v), SystemDS Binary
     See: http://apache.github.io/systemds/site/dml-language-reference#readwrite-built-in-functions for more details

     :param path: location of the file; it is wrapped in double quotes
         before being handed to the created node.
     :param kwargs: named arguments forwarded to the created node. The
         ``data_type``, ``format`` and ``value_type`` entries are inspected
         and, in the cases handled below, replaced in place by a
         double-quoted copy of their value.
     :return: an Operation Node, containing the read data.
     """
     quoted_path = f'"{path}"'
     data_type = kwargs.get("data_type", None)

     # Frames get their own node type; only a string format is quoted.
     if data_type == "frame":
         kwargs["data_type"] = f'"{data_type}"'
         file_format = kwargs.get("format", None)
         if isinstance(file_format, str):
             kwargs["format"] = f'"{file_format}"'
         return Frame(self, None, quoted_path, **kwargs)

     # String scalars are the only scalars returned with an explicit
     # SCALAR output type.
     if data_type == "scalar":
         kwargs["data_type"] = f'"{data_type}"'
         value_type = kwargs.get("value_type", None)
         if value_type == "string":
             kwargs["value_type"] = f'"{value_type}"'
             return OperationNode(self,
                                  "read", [quoted_path],
                                  named_input_nodes=kwargs,
                                  shape=(-1, ),
                                  output_type=OutputType.SCALAR)

     # NOTE(review): scalars whose value_type is not "string" also end up
     # here, i.e. without output_type=OutputType.SCALAR and with value_type
     # left unquoted -- confirm this fall-through is intended.
     return OperationNode(self,
                          "read", [quoted_path],
                          named_input_nodes=kwargs,
                          shape=(-1, ))
예제 #5
0
    def test_apply_recode_bin(self):
        """Encode a frame with a recode/bin spec, then re-apply the metadata.

        The spec is loaded twice: once as a Python dict (to drive the
        assertions) and once through ``self.sds.read`` as a string scalar
        (to pass to the transform calls). ``transform_encode`` must return a
        real-valued ndarray with the same shape as the input frame, plus a
        metadata DataFrame whose per-column distinct counts match the spec.
        ``transform_apply`` with that metadata must return an array of the
        same shape.
        """
        with open(self.JSPEC_PATH) as jspec_file:
            JSPEC = json.load(jspec_file)
        F1 = self.sds.read(
            self.HOMES_PATH,
            data_type="frame",
            schema=self.HOMES_SCHEMA,
            format="csv",
            header=True,
        )
        pd_F1 = F1.compute()
        jspec = self.sds.read(self.JSPEC_PATH, data_type="scalar", value_type="string")
        X, M = F1.transform_encode(spec=jspec).compute()
        self.assertIsInstance(X, np.ndarray)
        self.assertIsInstance(M, pd.DataFrame)
        self.assertEqual(X.shape, pd_F1.shape)
        self.assertTrue(np.all(np.isreal(X)))

        # Recoded columns: the metadata must list as many distinct values as
        # the source column has.
        for col_name in JSPEC["recode"]:
            # Explicit existence check; previously a column missing from the
            # input only surfaced as a KeyError from an otherwise unused lookup.
            self.assertIn(col_name, pd_F1.columns)
            self.assertEqual(M[col_name].nunique(), pd_F1[col_name].nunique())

        # Binned columns: the metadata must list exactly ``numbins`` entries.
        for binning in JSPEC["bin"]:
            col_name = binning["name"]
            self.assertIn(col_name, pd_F1.columns)
            self.assertEqual(M[col_name].nunique(), binning["numbins"])

        X2 = F1.transform_apply(spec=jspec, meta=Frame(self.sds, M)).compute()
        self.assertEqual(X.shape, X2.shape)
        self.assertTrue(np.all(np.isreal(X2)))
예제 #6
0
 def test_r_bind_triple_twostep(self):
     """Row-bind three frames in two separately computed steps.

     The first two frames are bound and computed to a pandas DataFrame; that
     intermediate result is wrapped in a new ``Frame`` and bound with the
     third. The final result must equal the row-wise concatenation of all
     three source DataFrames with a fresh index.
     """
     f1 = Frame(self.sds, self.df_rb_1)
     f2 = Frame(self.sds, self.df_rb_2)
     f3 = Frame(self.sds, self.df_rb_3)
     tmp_df = f1.rbind(f2).compute()
     result_df = Frame(self.sds, tmp_df).rbind(f3).compute()
     # assertIsInstance reports the offending type on failure, whereas
     # assertTrue(isinstance(...)) only reports "False is not true".
     self.assertIsInstance(result_df, pd.DataFrame)
     target_df = pd.concat([self.df_rb_1, self.df_rb_2, self.df_rb_3], ignore_index=True)
     self.assertTrue(target_df.equals(result_df))
예제 #7
0
 def test_write_read_binary(self):
     """Write ``self.df`` without a format argument, read it back, and compare.

     Only the underlying values are compared element-wise; index and column
     labels are not checked.
     """
     target_path = self.temp_dir + "01"
     Frame(self.sds, self.df).write(target_path).compute()
     loaded_df = self.sds.read(target_path, data_type="frame").compute()
     values_match = self.df.values == loaded_df.values
     self.assertTrue(values_match.all())