def test_check_zero(self):
    """check_zero must hand back a non-zero value for zero-ish inputs.

    Probes an integer zero, a float zero and a tiny positive float.
    """
    for probe in (0, 0.0, 1e-23):
        guarded = check_zero(probe)
        self.assertEqual(guarded != 0., True)
def KappaMulti(ha, hb, y):
    """Kappa statistic between two label vectors, for any number of classes.

    Args:
        ha: labels predicted by the first classifier (one per sample).
        hb: labels predicted by the second classifier (one per sample).
        y: reference labels; used for the sample count ``len(y)`` and,
            together with ``ha`` and ``hb``, to enumerate the label set.

    Returns:
        Tuple ``(kappa, theta1, theta2)`` where ``theta1`` is the observed
        agreement rate (diagonal of the contingency table over m),
        ``theta2`` is the agreement expected from the marginals
        (sum over labels of row total * column total, over m**2), and
        ``kappa = (theta1 - theta2) / check_zero(1 - theta2)``.

    Notes:
        The marginals are the full row / column totals of the contingency
        table.  For two labels this is identical to the
        ``(a + b) * (a + c) + (c + d) * (b + d)`` form used by the binary
        kappa; for three or more labels a product of
        "diagonal + one off-diagonal cell" terms is not a row total.
        When only a single label occurs, ``theta2`` is 1 and the result is
        decided by ``check_zero``.
    """
    vY = np.unique(np.concatenate([y, ha, hb]))
    dY = len(vY)  # number of labels / classes
    ha = np.array(ha)
    hb = np.array(hb)
    m = len(y)  # number of instances / samples

    # Contingency table: Cij[i, j] counts samples where ha says vY[i]
    # and hb says vY[j].
    Cij = np.zeros(shape=(dY, dY))
    for i in range(dY):
        for j in range(dY):
            Cij[i, j] = np.sum((ha == vY[i]) & (hb == vY[j]))

    # Observed agreement: both classifiers output the same label.
    c_diagonal = [Cij[i, i] for i in range(dY)]
    theta1 = np.sum(c_diagonal) / float(m)

    # Chance agreement from the marginal totals of each classifier.
    c_row_sum = np.sum(Cij, axis=1)
    c_col_sum = np.sum(Cij, axis=0)
    theta2 = np.sum(np.multiply(c_row_sum, c_col_sum)) / (float(m) ** 2)

    ans = (theta1 - theta2) / check_zero(1. - theta2)
    return ans, theta1, theta2
def KLD(p, q):
    """Sum of ``p[i] * log(p[i] / q[i])`` over two weight vectors.

    Both inputs are copied into DTY_FLT arrays and, when their total is
    not exactly 1.0, divided by that total.  Every divisor and every log
    argument goes through ``check_zero`` first.

    Args:
        p: first sequence of non-normalised or normalised weights.
        q: second sequence, indexed in parallel with ``p``.

    Returns:
        The accumulated sum (``0.`` for empty ``p``).
    """
    p = np.array(p, dtype=DTY_FLT)
    q = np.array(q, dtype=DTY_FLT)

    mass_p = np.sum(p)
    if mass_p != 1.0:
        p /= check_zero(mass_p)
    mass_q = np.sum(q)
    if mass_q != 1.0:
        q /= check_zero(mass_q)

    divergence = 0.
    for k, p_k in enumerate(p):
        ratio = p_k / check_zero(q[k])
        divergence += p_k * np.log(check_zero(ratio))
    return divergence
def AdaBoostSelectTraining(X_trn, y_trn, weight):
    """Build a shuffled training set by calling ``resample`` once per label.

    For every distinct label the matching rows, labels and weights are
    selected, the weights are rescaled to sum to one within that label
    (guarded by ``check_zero``) and handed to ``resample``.  The per-label
    results are concatenated and shuffled with a RandomState seeded from
    the wall clock.

    Args:
        X_trn: training features, one row per sample.
        y_trn: training labels.
        weight: one weight per sample.

    Returns:
        ``(wX, wy)`` as plain Python lists.

    NOTE(review): ``resample`` is defined elsewhere; presumably it draws
    samples in proportion to the given weights -- confirm.
    """
    features = np.array(X_trn, dtype=DTY_FLT)
    labels = np.array(y_trn, dtype=DTY_INT)
    sample_w = np.array(weight, dtype=DTY_FLT)

    drawn_X, drawn_y = [], []
    for label in np.unique(labels):
        mask = (labels == label)
        class_w = sample_w[mask]  # boolean indexing -> private copy
        class_w /= check_zero(np.sum(class_w))
        picked_X, picked_y = resample(
            features[mask].tolist(), labels[mask].tolist(), class_w.tolist())
        drawn_X.append(deepcopy(picked_X))
        drawn_y.append(deepcopy(picked_y))

    pooled_X = np.concatenate(drawn_X, axis=0)
    pooled_y = np.concatenate(drawn_y, axis=0)

    # Time-derived seed: the shuffle differs from call to call.
    seed = int(time.time() * GAP_MID % GAP_INF)
    shuffler = np.random.RandomState(seed)
    order = list(range(len(pooled_y)))
    shuffler.shuffle(order)

    wX = pooled_X[order].tolist()
    wy = pooled_y[order].tolist()
    gc.collect()
    return deepcopy(wX), deepcopy(wy)  # list
def AdaBoostEnsembleAlgorithm(X_trn, y_trn, name_cls, nb_cls):
    """Train ``nb_cls`` base classifiers with AdaBoost-style re-weighting.

    Labels are expected in {0, 1}; they are mapped to {-1, +1} only for the
    weight update.  The returned ``alpha`` coefficients are specific to
    this AdaBoost procedure.

    Args:
        X_trn: training features, one row per sample.
        y_trn: training labels in {0, 1}.
        name_cls: identifier forwarded to ``individual`` to build each
            base classifier.
        nb_cls: number of base classifiers to train.

    Returns:
        ``(alpha, clfs)``: the coefficients divided by their sum, and the
        list of trained classifiers (deep copies).

    Notes:
        Works for ``nb_cls == 1`` as well: no cleanup statement refers to
        names that are only bound when a next round exists.

    NOTE(review): alpha uses ``np.log2`` while the weight update uses
    ``np.exp``; textbook AdaBoost uses the natural log -- confirm this is
    intended.
    """
    clfs = []
    nb_trn = len(y_trn)
    y_arr = np.array(y_trn)  # converted once, reused in every round

    # initial: uniform weights for the first round
    weight = np.zeros((nb_cls, nb_trn), dtype=DTY_FLT)
    em = [0.0] * nb_cls
    alpha = [0.0] * nb_cls
    weight[0] = np.ones(nb_trn, dtype=DTY_FLT) / nb_trn

    for k in range(nb_cls):
        # Up to 21 attempts to obtain a learner with weighted error in
        # [0, 0.5); the last attempt is kept if none qualifies.
        for _ in range(21):
            # resample the data according to the current weights
            wX, wy = AdaBoostSelectTraining(X_trn, y_trn, weight[k].tolist())
            # train a base classifier and run it on ORIGINAL training data
            clf = individual(name_cls, wX, wy)
            inspect = clf.predict(X_trn)
            # weighted error rate
            em[k] = np.sum(weight[k] * (inspect != y_arr))
            if 0. <= em[k] < 0.5:
                break
        clfs.append(deepcopy(clf))

        # calculate alpha
        alpha[k] = 0.5 * np.log2(check_zero((1. - em[k]) / check_zero(em[k])))

        # update weights: +1 where prediction and label agree, -1 otherwise
        # (both mapped from {0, 1} to {-1, +1})
        if k + 1 < nb_cls:
            agreement = (y_arr * 2 - 1) * (inspect * 2 - 1)
            weight[k + 1] = weight[k] * np.exp(-1. * alpha[k] * agreement)
            zm = np.sum(weight[k + 1])
            weight[k + 1] /= check_zero(zm)

    # regularization: make the coefficients sum to 1; the divisor is
    # guarded like every other division in this function.
    am = check_zero(np.sum(alpha))
    alpha = [i / am for i in alpha]
    return alpha, clfs
def Interrater_agreement_multiclass(yt, y, m, nb_cls):
    """Interrater-agreement style diversity over an ensemble's predictions.

    Args:
        yt: predictions of the individual classifiers; compared against
            ``y`` and reduced over axis 1 and axis 0, so it must be 2-D
            (assumes shape (nb_cls, m) -- TODO confirm).
        y: reference labels.
        m: number of samples.
        nb_cls: number of individual classifiers.

    Returns:
        ``1 - numerator / check_zero(denominator)`` with the numerator
        built from the per-sample count of correct classifiers and the
        denominator from the mean accuracy ``p_bar``.
    """
    y = np.array(y, dtype=DTY_INT)
    yt = np.array(yt, dtype=DTY_INT)

    hits = (yt == y)
    # mean accuracy over all classifier/sample pairs
    p_bar = np.sum(np.sum(hits, axis=1)) / (float(m) * nb_cls)
    # per column: how many classifiers match the reference label
    rho_x = np.sum(hits, axis=0)

    numerator = np.sum(rho_x * (nb_cls - rho_x)) / float(nb_cls)
    denominator = m * (nb_cls - 1.) * p_bar * (1. - p_bar)
    return 1. - numerator / check_zero(denominator)
def angle(a, b):
    """Return the angle between vectors ``a`` and ``b`` via ``np.arccos``.

    Args:
        a: first vector (any sequence accepted by ``np.array``).
        b: second vector, same length as ``a``.

    Returns:
        ``arccos(a . b / (|a| * |b|))``; the product of norms is passed
        through ``check_zero`` before dividing.
    """
    a = np.array(a)
    b = np.array(b)
    prod = np.sum(a * b)            # dot product  a . b
    len1 = np.sqrt(np.sum(a * a))   # |a|
    len2 = np.sqrt(np.sum(b * b))   # |b|
    cos_theta = prod / check_zero(len1 * len2)
    # Floating-point rounding can push the ratio marginally outside
    # [-1, 1] for (anti-)parallel vectors, which would make arccos
    # return NaN; clamp it back into the valid domain.
    cos_theta = np.clip(cos_theta, -1., 1.)
    return np.arccos(cos_theta)
def test_Kappa_Statistic(self):
    """Binary and multiclass kappa must agree on two-label data.

    Also checks that the multiclass kappa equals
    ``(theta1 - theta2) / check_zero(1 - theta2)`` for the thetas it returns.
    """
    nb_inst = 100
    y_a, yt_a, y_b, yt_b = negative_generate_simulate(nb_inst, 2)
    first_a, second_a = yt_a
    first_b, second_b = yt_b

    # binary kappa gives the same value on both simulated data sets
    bin_a = Kappa_Statistic_binary(first_a, second_a, nb_inst)
    bin_b = Kappa_Statistic_binary(first_b, second_b, nb_inst)
    self.assertEqual(bin_a, bin_b)

    # multiclass kappa: same on both data sets, and equal to the binary one
    multi_a = Kappa_Statistic_multiclass(first_a, second_a, y_a, nb_inst)
    multi_b = Kappa_Statistic_multiclass(first_b, second_b, y_b, nb_inst)
    self.assertEqual(all(np.array(multi_a) == np.array(multi_b)), True)
    self.assertEqual(bin_a, multi_a[0])
    self.assertEqual(bin_b, multi_b[0])

    # returned kappa is consistent with the returned thetas
    y_c, yt_c = generate_simulated_data(nb_inst, 7, 2)
    kappa, theta1, theta2 = Kappa_Statistic_multiclass(
        yt_c[0], yt_c[1], y_c, nb_inst)
    self.assertEqual((theta1 - theta2) / check_zero(1. - theta2), kappa)
def Coincident_Failure_multiclass(yt, y, m, nb_cls):
    """Coincident-failure style diversity over an ensemble's predictions.

    ``pi[k]`` is the fraction of samples on which exactly ``k`` classifiers
    disagree with the reference label.

    Args:
        yt: predictions of the individual classifiers, reduced over axis 0
            (assumes shape (nb_cls, m) -- TODO confirm).
        y: reference labels.
        m: number of samples.
        nb_cls: number of individual classifiers.

    Returns:
        ``0.`` when ``pi[0] == 1``; otherwise, when ``pi[0] < 1``, the sum
        of ``pi[k] * (nb_cls - k) / (nb_cls - 1)`` for k = 1..nb_cls divided
        by ``check_zero(1 - pi[0])``.  ``None`` if neither comparison holds.
    """
    y = np.array(y, dtype=DTY_INT)
    yt = np.array(yt, dtype=DTY_INT)
    nb_wrong = np.sum(yt != y, axis=0)
    pi = [np.sum(nb_wrong == k) / float(m) for k in range(nb_cls + 1)]

    if pi[0] == 1.:
        return 0.
    if not pi[0] < 1.:
        return None

    total = 0.
    for k in range(1, nb_cls + 1):
        total += pi[k] * (nb_cls - k) / (nb_cls - 1.)
    return total / check_zero(1. - pi[0])
def Generalized_Diversity_multiclass(yt, y, m, nb_cls):
    """Generalized-diversity style measure over an ensemble's predictions.

    With ``pi[k]`` the fraction of samples on which exactly ``k``
    classifiers disagree with the reference label:

        p_1 = sum_k pi[k] * k / nb_cls
        p_2 = sum_k pi[k] * k * (k - 1) / (nb_cls * (nb_cls - 1))

    Args:
        yt: predictions of the individual classifiers, reduced over axis 0
            (assumes shape (nb_cls, m) -- TODO confirm).
        y: reference labels.
        m: number of samples.
        nb_cls: number of individual classifiers.

    Returns:
        ``1 - p_2 / check_zero(p_1)``.
    """
    y = np.array(y, dtype=DTY_INT)
    yt = np.array(yt, dtype=DTY_INT)
    nb_wrong = np.sum(yt != y, axis=0)

    p_1 = 0.
    p_2 = 0.
    for k in range(1, nb_cls + 1):
        frac = np.sum(nb_wrong == k) / float(m)
        p_1 += frac * k / nb_cls
        p_2 += frac * (k * (k - 1.) / nb_cls / (nb_cls - 1.))
    return 1. - p_2 / check_zero(p_1)
def Kappa_Statistic_multiclass(hi, hj, y, m):
    """Kappa statistic between two classifiers, for any number of classes.

    Args:
        hi: labels predicted by the first classifier.
        hj: labels predicted by the second classifier.
        y: reference labels; used with ``hi`` and ``hj`` to count the
            distinct labels and forwarded to
            ``multiclass_contingency_table``.
        m: number of instances / samples.

    Returns:
        Tuple ``(kappa, theta1, theta2)`` where ``theta1`` is the diagonal
        of the contingency table over ``m``, ``theta2`` is the sum over
        labels of (row total * column total) over ``m**2``, and
        ``kappa = (theta1 - theta2) / check_zero(1 - theta2)``.

    Notes:
        The marginals are full row / column totals.  With two labels this
        matches the ``(a + b) * (a + c) + (c + d) * (b + d)`` form of the
        binary kappa; with three or more labels a product of
        "diagonal + one off-diagonal cell" terms is not a row total.
        Assumes the table is square with one row/column per distinct
        label -- TODO confirm against ``multiclass_contingency_table``.
    """
    vY = np.unique(np.concatenate([y, hi, hj]))  # L
    dY = len(vY)
    Cij = multiclass_contingency_table(hi, hj, y)
    Cij = np.array(Cij, dtype=DTY_FLT)

    # Observed agreement: both classifiers output the same label.
    c_diagonal = [Cij[i, i] for i in range(dY)]
    theta1 = np.sum(c_diagonal) / float(m)

    # Chance agreement from the marginal totals of each classifier.
    c_row_sum = np.sum(Cij, axis=1)
    c_col_sum = np.sum(Cij, axis=0)
    theta2 = np.sum(np.multiply(c_row_sum, c_col_sum)) / (float(m) ** 2)

    ans = (theta1 - theta2) / check_zero(1. - theta2)
    return ans, theta1, theta2
def Kappa_Statistic_binary(hi, hj, m):
    """Kappa statistic for two classifiers from a 2x2 contingency table.

    Args:
        hi: predictions of the first classifier.
        hj: predictions of the second classifier.
        m: number of samples.

    Returns:
        ``(Theta_1 - Theta_2) / check_zero(1 - Theta_2)`` where, with
        ``a, b, c, d`` the four values from ``contingency_table(hi, hj)``,
        ``Theta_1 = (a + d) / m`` and
        ``Theta_2 = ((a + b)(a + c) + (c + d)(b + d)) / m**2``.
    """
    a, b, c, d = contingency_table(hi, hj)
    nb_inst = float(m)
    observed = (a + d) / nb_inst
    by_chance = ((a + b) * (a + c) + (c + d) * (b + d)) / (nb_inst**2)
    return (observed - by_chance) / check_zero(1. - by_chance)
def Correlation_Coefficient_binary(hi, hj):
    """Correlation coefficient of two classifiers from a 2x2 table.

    Args:
        hi: predictions of the first classifier.
        hj: predictions of the second classifier.

    Returns:
        ``(a*d - b*c) / check_zero(sqrt((a+b)(a+c)(c+d)(b+d)))`` with
        ``a, b, c, d`` the four values from ``contingency_table(hi, hj)``.
    """
    a, b, c, d = contingency_table(hi, hj)
    marginal_product = (a + b) * (a + c) * (c + d) * (b + d)
    scale = np.sqrt(marginal_product)
    covariance = a * d - b * c
    return covariance / check_zero(scale)
def Q_Statistic_binary(hi, hj):
    """Q statistic of two classifiers from a 2x2 contingency table.

    Args:
        hi: predictions of the first classifier.
        hj: predictions of the second classifier.

    Returns:
        ``(a*d - b*c) / check_zero(a*d + b*c)`` with ``a, b, c, d`` the
        four values from ``contingency_table(hi, hj)``.
    """
    a, b, c, d = contingency_table(hi, hj)
    main_diag = a * d
    anti_diag = b * c
    return (main_diag - anti_diag) / check_zero(main_diag + anti_diag)
def Entropy_sk_multiclass(yt, y, m, nb_cls):
    """Entropy-style diversity measure over an ensemble's predictions.

    For each entry ``rho`` returned by ``number_individuals_correctly``
    the smaller of ``rho`` and ``nb_cls - rho`` is taken; the total is
    divided by ``m`` and by ``nb_cls - ceil(nb_cls / 2)`` (guarded by
    ``check_zero``).

    Args:
        yt: predictions of the individual classifiers.
        y: reference labels.
        m: number of samples.
        nb_cls: number of individual classifiers.

    Returns:
        The normalised sum described above.
    """
    correct = np.array(number_individuals_correctly(yt, y), dtype=DTY_FLT)
    incorrect = nb_cls - correct
    smaller_side = [min(c, w) for c, w in zip(correct, incorrect)]
    scale = nb_cls - np.ceil(nb_cls / 2.)
    return np.sum(smaller_side) / float(m) / check_zero(scale)