def predict_proba(self, dataset, device=None, idx=0):
    """Infer causal directions using the trained NCC pairwise model.

    Args:
        dataset (tuple): Couple of np.ndarray variables to classify
        device (str): Device to run the algorithm on
            (defaults to ``cdt.SETTINGS.default_device``)
        idx (int): (unused) index of the pair, kept for API consistency

    Returns:
        float: Causation score (value: 1 if a->b and -1 if b->a)
    """
    a, b = dataset
    device = SETTINGS.get_default(device=device)
    if self.model is None:
        raise ValueError('Model has to be trained before making predictions.')
    if len(np.array(a).shape) == 1:
        a = np.array(a).reshape((-1, 1))
        b = np.array(b).reshape((-1, 1))
    m = np.hstack((a, b))
    m = scale(m)
    m = m.astype('float32')
    m = th.from_numpy(m).t().unsqueeze(0)
    m = m.to(device)
    return (self.model(m).data.cpu().numpy() - .5) * 2
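
# Usage sketch for ``predict_proba`` (an assumption, not part of this file:
# the method belongs to a fitted pairwise model such as
# ``cdt.causality.pairwise.NCC``, here named ``obj``):
#
# >>> import numpy as np
# >>> a = np.random.normal(size=500)
# >>> b = a ** 2 + .1 * np.random.normal(size=500)  # a plausibly causes b
# >>> score = obj.predict_proba((a, b))
# >>> # score close to 1 suggests a -> b, close to -1 suggests b -> a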
def predict_dataset(self, df, device=None, verbose=None):
    """Predict the causal direction of all pairs in the dataframe.

    Args:
        df (pd.DataFrame): CEPC format dataframe containing the pairs
        device (str): cuda or cpu device (defaults to
            ``cdt.SETTINGS.default_device``)
        verbose (bool): verbosity (defaults to ``cdt.SETTINGS.verbose``)

    Returns:
        pandas.DataFrame: dataframe containing the predicted causation
        coefficients
    """
    verbose, device = SETTINGS.get_default(('verbose', verbose),
                                           ('device', device))
    dataset = []
    for _, row in df.iterrows():
        a = row['A'].reshape((len(row['A']), 1))
        b = row['B'].reshape((len(row['B']), 1))
        m = np.hstack((a, b))
        m = m.astype('float32')
        m = th.from_numpy(m).t().unsqueeze(0)
        dataset.append(m)
    dataset = [m.to(device) for m in dataset]
    return pd.DataFrame(
        (th.cat([self.model(m) for m, _ in
                 zip(dataset, trange(len(dataset), disable=not verbose))],
                0).data.cpu().numpy() - .5) * 2)
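
# Usage sketch for ``predict_dataset`` (assumption: ``df`` follows the CEPC
# layout, one pair per row with ``np.ndarray`` cells in columns 'A' and 'B';
# ``obj`` is a fitted model as above):
#
# >>> import numpy as np
# >>> import pandas as pd
# >>> df = pd.DataFrame({'A': [np.random.normal(size=500)],
# ...                    'B': [np.random.normal(size=500)]})
# >>> scores = obj.predict_dataset(df)  # one coefficient per pair, in [-1, 1]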
def __init__(self, score='nonlinear', cutoff=0.001, variablesel=True,
             selmethod='gamboost', pruning=False, prunmethod='gam',
             njobs=None, verbose=None):
    """Init the model and its available arguments."""
    if not RPackages.CAM:
        raise ImportError("R Package CAM is not available.")

    super(CAM_with_score, self).__init__()
    self.scores = {'nonlinear': 'SEMGAM',
                   'linear': 'SEMLIN'}
    self.var_selection = {'gamboost': 'selGamBoost',
                          'gam': 'selGam',
                          'lasso': 'selLasso',
                          'linear': 'selLm',
                          'linearboost': 'selLmBoost'}
    self.arguments = {'{FOLDER}': '/tmp/cdt_CAM/',
                      '{FILE_TRAIN}': 'train_data.csv',
                      '{FILE_VALID}': 'valid_data.csv',
                      '{TARGETS_TRAIN}': 'targets_train.csv',
                      '{TARGETS_VALID}': 'targets_valid.csv',
                      '{SCORE}': 'SEMGAM',
                      '{VARSEL}': 'TRUE',
                      '{SELMETHOD}': 'selGamBoost',
                      '{PRUNING}': 'TRUE',
                      '{PRUNMETHOD}': 'selGam',
                      '{NJOBS}': str(SETTINGS.NJOBS),
                      '{CUTOFF}': str(0.001),
                      '{VERBOSE}': 'FALSE',
                      '{OUTPUT}': 'result.csv'}
    self.score = score
    self.cutoff = cutoff
    self.variablesel = variablesel
    self.selmethod = selmethod
    self.pruning = pruning
    self.prunmethod = prunmethod
    self.njobs = SETTINGS.get_default(njobs=njobs)
    self.verbose = SETTINGS.get_default(verbose=verbose)
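
# Instantiation sketch (assumption: ``CAM_with_score`` mirrors
# ``cdt.causality.graph.CAM``, so the string options are translated to the
# R CAM package through ``self.scores`` and ``self.var_selection`` above):
#
# >>> cam = CAM_with_score(score='nonlinear', variablesel=True,
# ...                      selmethod='gamboost', pruning=True,
# ...                      prunmethod='gam', cutoff=0.001)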
def __init__(self, verbose=None):
    """Init the model and its available arguments."""
    if not RPackages.pcalg:
        raise ImportError("R Package pcalg is not available.")

    super().__init__()
    self.arguments = {'{FOLDER}': '/tmp/cdt_pc/',
                      '{FILE}': 'data.csv',
                      '{SKELETON}': 'FALSE',
                      '{GAPS}': 'fixedgaps.csv',
                      '{REGIMES}': 'regimes.csv',
                      '{TARGETS}': 'targets.csv',
                      '{VERBOSE}': 'FALSE',
                      '{ALPHA}': '1e-2',
                      '{OUTPUT}': 'result.csv'}
    # ``verbose=None`` lets ``SETTINGS.get_default`` fall back to the
    # global ``cdt.SETTINGS.verbose`` value.
    self.verbose = SETTINGS.get_default(verbose=verbose)
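
# The ``{PLACEHOLDER}`` keys above are meant to be substituted into an R
# script template before it is executed; a minimal sketch of that mechanism
# (the template below is illustrative, not the actual cdt script):
#
# >>> template = 'dataset <- read.csv(file.path("{FOLDER}", "{FILE}"));'
# >>> args = {'{FOLDER}': '/tmp/cdt_pc/', '{FILE}': 'data.csv'}
# >>> for placeholder, value in args.items():
# ...     template = template.replace(placeholder, value)
# >>> template
# 'dataset <- read.csv(file.path("/tmp/cdt_pc/", "data.csv"));'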
def generate(self, npairs, npoints=500, rescale=True, njobs=None):
    """Generate causal pairs, such that one variable causes the other.

    Args:
        npairs (int): Number of pairs of variables to generate.
        npoints (int): Number of data points to generate.
        rescale (bool): Rescale the output to zero mean and unit variance.
        njobs (int): Number of parallel jobs to execute. Defaults to
            ``cdt.SETTINGS.NJOBS``.

    Returns:
        tuple: (pandas.DataFrame, pandas.DataFrame) data and the
        corresponding labels. The data is in the
        ``(SampleID, a (numpy.ndarray), b (numpy.ndarray))`` format.
    """
    def generate_pair(npoints, label, rescale):
        root = self.initial_generator(npoints)[:, np.newaxis]
        cause = self.mechanism(1, npoints, self.noise,
                               noise_coeff=self.noise_coeff)(root)
        effect = self.mechanism(1, npoints, self.noise,
                                noise_coeff=self.noise_coeff)(cause).squeeze(1)
        cause = cause.squeeze(1)
        if rescale:
            cause = scale(cause)
            effect = scale(effect)
        return (cause, effect) if label == 1 else (effect, cause)

    # njobs is resolved for API consistency; the generation below runs
    # sequentially.
    njobs = SETTINGS.get_default(njobs=njobs)
    self.labels = (np.random.randint(2, size=npairs) - .5) * 2
    output = [generate_pair(npoints, self.labels[i], rescale)
              for i in range(npairs)]
    self.data = pd.DataFrame(output, columns=['A', 'B'])
    self.labels = pd.DataFrame(self.labels, dtype='int32',
                               columns=['label'])
    return self.data, self.labels
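
# Usage sketch for ``generate`` (assumption: the surrounding class is used
# like ``cdt.data.CausalPairGenerator``; the constructor argument is
# illustrative):
#
# >>> from cdt.data import CausalPairGenerator
# >>> gen = CausalPairGenerator('polynomial')
# >>> data, labels = gen.generate(100, npoints=500)
# >>> data.columns            # Index(['A', 'B'], dtype='object')
# >>> labels['label'].unique()  # values in {-1, 1}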
def predict_list(self, l, device=None, verbose=None):
    """Predict the causal direction of all pairs in the list.

    Args:
        l (list): CEPC format list containing the pairs
        verbose (bool): verbosity (defaults to ``cdt.SETTINGS.verbose``)
        device (str): cuda or cpu device (defaults to
            ``cdt.SETTINGS.default_device``)

    Returns:
        list: list containing the predicted causation coefficients
    """
    verbose, device = SETTINGS.get_default(('verbose', verbose),
                                           ('device', device))
    # Move each pair to the target device before scoring it.
    return [self.model(m.t().unsqueeze(0).to(device)) for m in l]
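
# Usage sketch for ``predict_list`` (assumption: ``l`` holds one
# ``torch.Tensor`` of shape (npoints, 2) per pair, matching the
# ``m.t().unsqueeze(0)`` reshape above):
#
# >>> import torch as th
# >>> pairs = [th.randn(500, 2) for _ in range(10)]
# >>> coeffs = obj.predict_list(pairs)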
def train(self, X_tr, y_tr, X_val, y_val, epochs=50, batch_size=32,
          verbose=None, device=None, **kwargs):
    """Train the model and log train/validation metrics at each epoch.

    Args:
        X_tr (iterable): training pairs
        y_tr (iterable): labels associated to the training pairs
        X_val (iterable): validation pairs
        y_val (iterable): labels associated to the validation pairs
        epochs (int): number of train epochs
        batch_size (int): size of batch
        verbose (bool): verbosity (defaults to ``cdt.SETTINGS.verbose``)
        device (str): cuda or cpu device (defaults to
            ``cdt.SETTINGS.default_device``)
        **kwargs: unused; accepted for API compatibility

    Returns:
        dict: log of the metrics recorded during training
    """
    verbose, device = SETTINGS.get_default(('verbose', verbose),
                                           ('device', device))
    model = self.model.to(device)
    y = th.Tensor(y_tr).to(device)
    dataset = [th.Tensor(x).t().to(device) for x in X_tr]
    dat = Dataset(dataset, y, device, batch_size)
    data_per_epoch = len(dataset) // batch_size

    self.model.eval()
    self.log_values(*self.compute_values(X_tr, y_tr, device), 'train')
    self.log_values(*self.compute_values(X_val, y_val, device), 'validation')
    with trange(epochs, desc="Epochs", disable=not verbose) as te:
        for _ in te:
            self.model.train()
            with trange(data_per_epoch,
                        desc="Batches of 2*{}".format(batch_size),
                        disable=not (verbose
                                     and batch_size == len(dataset))) as t:
                output = []
                labels = []
                for batch, label in dat:
                    # Augment the batch with flipped pairs to enforce the
                    # symmetry of the classifier.
                    symmetric_batch, symmetric_label = th_enforce_symmetry(
                        batch, label, self.anti)
                    batch += symmetric_batch
                    label = th.cat((label, symmetric_label))
                    self.opt.zero_grad()
                    out = th.stack([model(m.t().unsqueeze(0))
                                    for m in batch], 0).squeeze()
                    loss = self.criterion(out, label)
                    loss.backward()
                    output.append(expit(out.data.cpu()))
                    t.set_postfix(loss=loss.item())
                    self.opt.step()
                    labels.append(label.data.cpu())
                # Threshold the sigmoid outputs at .5 and compare them to
                # the labels to compute the epoch accuracy.
                length = th.cat(output, 0).data.cpu().numpy().size
                acc = th.where(th.cat(output, 0).data.cpu() > .5,
                               th.ones((length, 1)).data.cpu(),
                               th.zeros((length, 1)).data.cpu()) - \
                    th.cat(labels, 0).data.cpu()
                Acc = 1 - acc.abs().mean().item()
                te.set_postfix(Acc=Acc)
            self.model.eval()
            self.log_values(*self.compute_values(X_tr, y_tr, device),
                            'train')
            self.log_values(*self.compute_values(X_val, y_val, device),
                            'validation')
    return self.log_dict
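
# ``th_enforce_symmetry`` augments each batch with the flipped pairs so that
# the classifier learns a symmetric decision. A minimal sketch of the idea
# (assumption: the real helper may differ; each sample is a 2 x npoints
# tensor and the labels live in {0, 1}):
#
# >>> import torch as th
# >>> def flip_pairs(batch, label):
# ...     # reversing dim 0 turns each (a, b) pair into (b, a), so the
# ...     # direction label flips as well
# ...     return [m.flip(0) for m in batch], 1 - label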
def _fit(self, x_tr, y_tr, epochs=50, batch_size=32, learning_rate=0.01,
         verbose=None, device=None, half=True):
    """Fit the NCC model.

    Args:
        x_tr (pd.DataFrame): CEPC format dataframe containing the pairs
        y_tr (pd.DataFrame or np.ndarray): labels associated to the pairs
        epochs (int): number of train epochs
        batch_size (int): size of batch
        learning_rate (float): learning rate of the RMSprop optimizer
        verbose (bool): verbosity (defaults to ``cdt.SETTINGS.verbose``)
        device (str): cuda or cpu device (defaults to
            ``cdt.SETTINGS.default_device``)
        half (bool): halve the batch size, as the symmetry augmentation
            doubles each batch during training

    Returns:
        list: training accuracy recorded at each epoch
    """
    if half:
        # The symmetry augmentation below doubles each batch, so halve
        # the batch size to keep the effective size unchanged.
        batch_size //= 2
    if batch_size > len(x_tr):
        batch_size = len(x_tr)
    verbose, device = SETTINGS.get_default(('verbose', verbose),
                                           ('device', device))
    model = self.model
    opt = th.optim.RMSprop(model.parameters(), lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()
    model = model.to(device)
    y = th.Tensor(y_tr).to(device)
    dataset = [th.Tensor(x).t().to(device) for x in x_tr]
    da = Dataset(dataset, y, device, batch_size)
    data_per_epoch = len(dataset) // batch_size
    train_accuracy = []
    with trange(epochs, desc="Epochs", disable=not verbose) as te:
        for _ in te:
            with trange(data_per_epoch,
                        desc="Batches of 2*{}".format(batch_size),
                        disable=not (verbose
                                     and batch_size == len(dataset))) as t:
                output = []
                labels = []
                for batch, label in da:
                    symmetric_batch, symmetric_label = th_enforce_symmetry(
                        batch, label)
                    batch += symmetric_batch
                    label = th.cat((label, symmetric_label))
                    opt.zero_grad()
                    out = th.stack([model(m.t().unsqueeze(0))
                                    for m in batch], 0).squeeze(2)
                    loss = criterion(out, label)
                    loss.backward()
                    output.append(expit(out.data.cpu()))
                    t.set_postfix(loss=loss.item())
                    opt.step()
                    labels.append(label.data.cpu())
                # Threshold the sigmoid outputs at .5 and compare them to
                # the labels to compute the epoch accuracy.
                length = th.cat(output, 0).data.cpu().numpy().size
                acc = th.where(th.cat(output, 0).data.cpu() > .5,
                               th.ones((length, 1)).data.cpu(),
                               th.zeros((length, 1)).data.cpu()) - \
                    th.cat(labels, 0).data.cpu()
                Acc = 1 - acc.abs().mean().item()
                te.set_postfix(Acc=Acc)
                train_accuracy.append(Acc)
    return train_accuracy
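
# Usage sketch for ``_fit`` (assumption: ``x_tr`` is an iterable of
# (npoints, 2) arrays, one per CEPC pair, and ``y_tr`` holds 0/1 direction
# labels, as required by ``BCEWithLogitsLoss``; ``_fit`` is private and
# normally reached through the public ``fit`` wrapper):
#
# >>> accs = obj._fit(x_tr, y_tr, epochs=100, batch_size=32,
# ...                 learning_rate=0.01, device='cpu')
# >>> accs[-1]   # training accuracy of the last epoch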