    def fit_ftdata(self, x_ft, y_ft, x_ab, y_ab):
        """ Fit the model with featurized data as input

        :param x_ft: x_featurized
        :param y_ft: y_featurized
        :param x_ab: x_inverse_featurized
        :param y_ab: y_inverse_featurized
        """
        self.clf0 = CLF(**self.params).fit(x_ft,
                                           y_ft != 0)  # causal or confounded?
        self.clf1 = CLF(**self.params).fit(x_ab,
                                           y_ab == 1)  # causal or anticausal?
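A companion sketch (not in the original source) of how the two classifiers fitted above are typically combined at prediction time; it assumes CLF follows the scikit-learn classifier API (e.g. RandomForestClassifier), so predict_proba is available, and it mirrors the scoring at the end of this page:

    def predict_ftdata(self, x_ft):
        # P(a causal link exists in either direction), from clf0
        p_causal = self.clf0.predict_proba(x_ft)[:, 1]
        # P(the direction is A->B rather than B->A), from clf1
        p_forward = self.clf1.predict_proba(x_ft)[:, 1]
        # Signed score in [-1, 1]: sign gives the direction, magnitude the
        # confidence that any causal link exists at all.
        return p_causal * (2 * p_forward - 1)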
Example #2
    def fit(self, x, y, is_q=False):  # x, y = original data and labels: x = (npairs, 2), y = (npairs, 1)
        '''Take the full dataframe and labels as input: x and y are the original "data" and "labels" respectively.
        The output is a trained CLF.
        The "train" matrix is built by 1) going through each row separately, 2) featurizing each row twice (once for A->B and once for B->A), and 3) stacking the two blocks vertically.
        The "labels" are the given labels (the y variable) duplicated with flipped signs, so both the A->B and B->A directions are covered.
        Example:
        obj.fit(data, labels)
        '''

        train = np.vstack((
            np.array([
                self.featurize_row(row.iloc[0], row.iloc[1], is_q=is_q)
                for idx, row in x.iterrows()
            ]),  # (npairs, 3m)  # does this really work for quantum?!
            np.array([
                self.featurize_row(row.iloc[1], row.iloc[0], is_q=is_q)
                for idx, row in x.iterrows()
            ])))  # (npairs, 3m)
        # train: (2*npairs, 3m)
        labels = np.vstack((y, -y)).ravel()  # (2*npairs,)
        verbose = 1 if self.verbose else 0
        self.clf = CLF(verbose=verbose,
                       min_samples_leaf=self.L,
                       n_estimators=self.E,
                       max_depth=self.max_depth,
                       n_jobs=self.njobs).fit(train, labels)
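To make the label duplication concrete, here is a tiny standalone illustration (not part of the original code) of what np.vstack((y, -y)).ravel() produces:

import numpy as np

y = np.array([1, -1, 0])             # A->B, B->A, no causal link
labels = np.vstack((y, -y)).ravel()  # array([ 1, -1,  0, -1,  1,  0])
# The first half labels the A->B featurizations, the second half the
# reversed B->A featurizations, with the direction label flipped in sign.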
Example #3
    def fit(self, x, y):
        # CAUTION: x and y must already be featurized arrays (preprocessed above), not raw DataFrames.
        print('training CLF ..')
        verbose = 1 if self.verbose else 0
        # FIXME: the training labels are heavily imbalanced.
        self.clf = CLF(verbose=verbose,
                       min_samples_leaf=self.L,
                       n_estimators=self.E,
                       max_depth=self.max_depth,
                       n_jobs=self.njobs).fit(x, y)
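One possible way to address the FIXME above, assuming CLF is sklearn.ensemble.RandomForestClassifier (class_weight is a real parameter of that class); this is a sketch, not the original author's fix:

        # Hypothetical variant: reweight classes inversely to their frequency.
        self.clf = CLF(verbose=verbose,
                       min_samples_leaf=self.L,
                       n_estimators=self.E,
                       max_depth=self.max_depth,
                       n_jobs=self.njobs,
                       class_weight='balanced').fit(x, y)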
Example #4
    def fit(self, x, y):
        """Train the model.

        Args:
            x: pandas.DataFrame of the data
            y: targets
        """
        train = np.vstack((np.array([self.featurize_row(row.iloc[0],
                                                        row.iloc[1]) for idx, row in x.iterrows()]),
                           np.array([self.featurize_row(row.iloc[1],
                                                        row.iloc[0]) for idx, row in x.iterrows()])))
        labels = np.vstack((y, -y)).ravel()
        verbose = 1 if self.verbose else 0
        self.clf = CLF(verbose=verbose,
                       min_samples_leaf=self.L,
                       n_estimators=self.E,
                       max_depth=self.max_depth,
                       n_jobs=self.n_jobs).fit(train, labels)
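A hypothetical usage sketch for the fit method above (model, the column names, and the synthetic data are all assumptions, not from the original source); each DataFrame cell holds the full sample vector of one variable in the pair:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
a = rng.normal(size=500)
b = a ** 2 + 0.1 * rng.normal(size=500)     # B is a noisy function of A
pairs = pd.DataFrame({'A': [a], 'B': [b]})  # one row = one variable pair
labels = np.array([1])                      # +1 encodes A -> B
model.fit(pairs, labels)                    # model: an instance of the class above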
Example #5
    def fit(self, x, y):
        """Train the model.

        Args:
            x (pd.DataFrame): CEPC format dataframe containing the pairs
            y (pd.DataFrame or np.ndarray): labels associated to the pairs
        """
        train = np.vstack((np.array([self.featurize_row(row.iloc[0],
                                                        row.iloc[1]) for idx, row in x.iterrows()]),
                           np.array([self.featurize_row(row.iloc[1],
                                                        row.iloc[0]) for idx, row in x.iterrows()])))
        labels = np.vstack((y, -y)).ravel()
        verbose = 1 if self.verbose else 0
        self.clf = CLF(verbose=verbose,
                       min_samples_leaf=self.L,
                       n_estimators=self.E,
                       max_depth=self.max_depth,
                       n_jobs=self.njobs).fit(train, labels)
y_te = np.hstack((y_te, -y_te))  # direction labels flip sign for the reversed pairs
d_tr = np.hstack((d_tr, d_tr))   # dependence labels are direction-invariant
d_te = np.hstack((d_te, d_te))
x_ab = x_tr[(y_tr == 1) | (y_tr == -1)]  # keep only the directed (causal) pairs
y_ab = y_tr[(y_tr == 1) | (y_tr == -1)]

params = {
    'random_state': 0,
    'n_estimators': E,
    'max_features': None,
    'max_depth': 50,
    'min_samples_leaf': 10,
    'verbose': 10
}

# NOTE: this second dict overwrites the one above; only these settings take effect.
params = {
    'random_state': 0,
    'n_estimators': E,
    'min_samples_leaf': L,
    'n_jobs': 16
}

clf0 = CLF(**params).fit(x_tr, y_tr != 0)  # causal or confounded?
clf1 = CLF(**params).fit(x_ab, y_ab == 1)  # causal or anticausal?
clfd = CLF(**params).fit(x_tr, d_tr)  # dependent or independent?

p_te = clf0.predict_proba(x_te)[:, 1] * (2 * clf1.predict_proba(x_te)[:, 1] - 1)

print([score(y_te, p_te), clf0.score(x_te, y_te != 0), clfd.score(x_te, d_te)])
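For reference, a short interpretation sketch (not from the original script) of the composite score p_te: the clf0 factor gates whether any causal link exists, while the clf1 factor maps the direction probability into [-1, 1].

# p_te > 0 suggests A->B, p_te < 0 suggests B->A, and |p_te| near 0 suggests
# a confounded/independent pair (the clf0 gate is small).
decisions = np.sign(p_te) * (np.abs(p_te) > 0.5)  # 0.5 is a hypothetical threshold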