Exemplo n.º 1
0
    def testClassifierFromDf(self):
        """Fit a BNClassifier three ways (DataFrame split, csv path, whole
        DataFrame) and check all three learn the same 8-node network.

        Also checks that learning with no prior raises gum.DatabaseError when
        some parent instantiations are missing from the data.
        """
        csvfile = self.agrumSrcDir('miniasia.csv')

        df_asia = pd.read_csv(csvfile)
        asia_target_column = 'lung_cancer'

        # train on the first 9000 rows, predict on the last 1000
        # (slices may overlap on a small file — fine for a smoke test)
        x_train_asia = df_asia[:9000].drop(asia_target_column, axis=1)
        y_train_asia = df_asia[:9000][asia_target_column]

        x_test_asia = df_asia[-1000:].drop(asia_target_column, axis=1)

        classif2 = skbn.BNClassifier()
        classif2.fit(x_train_asia, y_train_asia)

        self.assertEqual(classif2.bn.size(), 8)

        self.assertEqual(classif2.target, asia_target_column)
        # assertLessEqual reports both operands on failure, unlike assertTrue
        self.assertLessEqual(classif2.threshold, 1)

        yproba = classif2.predict_proba(x_test_asia)
        self.assertEqual(yproba.shape, (299, 2))
        self.assertEqual(yproba[0].sum(), 1)

        ypred = classif2.predict(x_test_asia)
        self.assertEqual(ypred.shape, (299, ))
        self.assertIn(ypred[0], [0, 1])

        self.assertGreater(classif2.MarkovBlanket.size(), 0)

        # fitting directly from the csv path must give the same structure
        classif3 = skbn.BNClassifier()
        classif3.fit(data=csvfile, targetName="lung_cancer")

        self.assertEqual(classif3.bn.size(), 8)

        self.assertEqual(classif3.target, asia_target_column)
        self.assertLessEqual(classif3.threshold, 1)

        # fitting from a whole DataFrame must also give the same structure
        df = pd.read_csv(csvfile)
        classif4 = skbn.BNClassifier()
        classif4.fit(data=df, targetName="lung_cancer")

        self.assertEqual(classif4.bn.size(), 8)

        self.assertEqual(classif4.target, asia_target_column)
        self.assertLessEqual(classif4.threshold, 1)

        # some instantiation of parents are missing : No prior should lead to division by 0
        classif5 = skbn.BNClassifier(aPriori="NoPrior")
        with self.assertRaises(gum.DatabaseError):
            classif5.fit(x_train_asia, y_train_asia)
Exemplo n.º 2
0
    def test_with_discretization(self):
        """Check preparedData() output when continuous columns are discretized.

        Column X1 (floats, > discretizationThreshold distinct values) must come
        back as interval labels; boolean column X3 must keep its labels.
        """
        X = pd.DataFrame([
            [1, 1.5, "A", True],
            [2, 2.6, "B", False],
            [3, 3.14, "B", True],
            [1, 0.5, "A", False],
            [1, 0.15, "A", True],
        ])
        y = [3, 2, 3, 1, 2]
        classifier = skbn.BNClassifier(discretizationThreshold=3,
                                       discretizationNbBins=3)
        classifier.fit(X, y)
        res = classifier.preparedData(X, y)
        # assertEquals is a deprecated alias removed in Python 3.12
        self.assertEqual(res["X1"][1], "[2.23333;3.14)")
        self.assertEqual(str(res["X3"][3]), "False")

        # a second, different dataset prepared with the same fitted classifier
        X = pd.DataFrame([
            [1, 0, "A", True],
            [1, 4, "B", False],
            [2, 3.11, "B", True],
            [2, 0.5, "A", False],
            [3, 0.15, "A", True],
            [3, 203, "A", True],
        ])
        y = [3, 2, 3, 1, 2, 1]
        res = classifier.preparedData(X, y)
        self.assertEqual(res["X1"][0], "(0.15;0.833333[")
        self.assertEqual(str(res["X3"][2]), "True")
Exemplo n.º 3
0
 def test_with_nparray(self):
     """Fit from a plain numpy array: columns get synthetic names x0, x1."""
     iris = datasets.load_iris()
     X = iris.data[:, 0:
                   2]  # we only take the first two features for visualization
     y = iris.target
     classifier = skbn.BNClassifier(discretizationThreshold=3,
                                    discretizationNbBins=3)
     classifier.fit(X, y)
     res = classifier.preparedData(X, y)
     # X0 and X1 are discretized so the labels should start with '[' but the rest is random (chosen by load_iris)...
     # assertEquals is a deprecated alias removed in Python 3.12
     self.assertEqual(res["x0"][149][0], "[")
     self.assertEqual(res["x1"][149][0], "[")
Exemplo n.º 4
0
    def testFitFromCsv(self):
        """Fit a BNClassifier directly from a csv file path and sanity-check
        the learned network (size, target, threshold, Markov blanket)."""
        csvfile = self.agrumSrcDir('miniasia.csv')

        asia_target_column = 'lung_cancer'

        classif1 = skbn.BNClassifier()
        classif1.fit(data=csvfile, targetName=asia_target_column)

        self.assertEqual(classif1.bn.size(), 8)
        self.assertEqual(classif1.target, asia_target_column)
        # assertLessEqual reports both operands on failure, unlike assertTrue
        self.assertLessEqual(classif1.threshold, 1)

        self.assertGreater(classif1.MarkovBlanket.size(), 0)
Exemplo n.º 5
0
    def test_no_discretization(self):
        """Check preparedData() when no discretization is configured: values
        keep their original labels, and an unseen value raises OutOfBounds."""
        X = pd.DataFrame([
            [1, 1.5, "A", True],
            [2, 2.6, "B", False],
            [3, 3.14, "B", True],
            [1, 0.5, "A", False],
            [1, 0.15, "A", True],
        ])
        y = [3, 2, 3, 1, 2]
        classifier = skbn.BNClassifier()
        classifier.fit(X, y)
        res = classifier.preparedData(X, y)
        # assertEquals is a deprecated alias removed in Python 3.12
        self.assertEqual(str(res["X1"][1]), "2.6")
        self.assertEqual(str(res["X3"][3]), "False")

        # value 0 in X1 was never seen during fit: without discretization
        # there is no bin for it, so preparedData must raise
        X = pd.DataFrame([[1, 0, "A", True]])
        y = [3]
        with self.assertRaises(gum.OutOfBounds):
            res = classifier.preparedData(X, y)
Exemplo n.º 6
0
 def test_with_file(self):
     """Fit from a csv path and run preparedData() on the same file."""
     classifier = skbn.BNClassifier()
     classifier.fit(data=self.agrumSrcDir("miniasia.csv"),
                    targetName="dyspnoea")
     res = classifier.preparedData(data=self.agrumSrcDir("miniasia.csv"))
     # assertEquals is a deprecated alias removed in Python 3.12
     self.assertEqual(str(res["lung_cancer"][0]), "0")
Exemplo n.º 7
0
def _computepoints(bn, csv_name, target, label, show_progress=True, with_labels=True, significant_digits=10):
  """
  Compute the ROC curve points.

  Parameters
  ----------
  bn : pyAgrum.BayesNet
    a Bayesian network
  csv_name : str
    a csv filename
  target : str
    the target
  label : str
    the target's label
  show_progress : bool
    indicates if the resulting curve must be printed
  with_labels : bool
    if False, `label` is resolved to its index in the target's domain and the
    csv is read without labels
  significant_digits:
    number of significant digits when computing probabilities

  Returns
  -------
  tuple (res, totalP, totalN)
    where res is a list of (proba,isWellClassified) for each line of csv_name.

  Raises
  ------
  ValueError
    if `with_labels` is False and `label` is not a label of `target`.
  """
  idTarget = bn.idFromName(target)
  label = str(label)

  if not with_labels:
    # resolve the textual label to its index in the target variable's domain
    idLabel = -1
    for i in range(bn.variable(idTarget).domainSize()):
      if bn.variable(idTarget).label(i) == label:
        idLabel = i
        break
    # raise instead of assert: asserts are stripped under `python -O`
    if idLabel < 0:
      raise ValueError(f"Label '{label}' not found in target '{target}'")
  else:
    idLabel = label

  classifier = skbn.BNClassifier(significant_digit=significant_digits)

  if show_progress:
    # tqdm is optional:
    # pylint: disable=import-outside-toplevel
    from tqdm import tqdm
    pbar = tqdm(total=_lines_count(csv_name) - 1, desc=csv_name,
                bar_format='{desc}: {percentage:3.0f}%|{bar}|')

  classifier.fromTrainedModel(bn, target, idLabel)
  # as a Binary classifier, y will be a list of True (good classification) and False (bad one)
  X, y = classifier.XYfromCSV(csv_name, with_labels=with_labels, target=target)
  predictions = classifier.predict_proba(X)

  totalP = np.count_nonzero(y)
  totalN = len(y) - totalP
  res = []
  # pair each positive-class probability with its ground truth
  for prediction, truth in zip(predictions, y):
    res.append((prediction[1], truth))

    if show_progress:
      pbar.update()

  if show_progress:
    pbar.close()

  return res, totalP, totalN