Exemplo n.º 1
0
    def MDLPC_criterion(self, data, feature, cut_point):
        '''
        Determine whether splitting *data* at *cut_point* on *feature* is
        accepted according to the MDLPC criterion (Fayyad & Irani, 1993).

        :param data: dataframe containing the feature column and the class
                     column named by ``self._class_name``
        :param feature: feature of interest
        :param cut_point: proposed cut point
        :return: True if the information gain of the split exceeds the MDLP
                 threshold, False otherwise
        '''
        N = len(data)  # number of examples in the current partition
        # Guard: log(N - 1, 2) is undefined for N <= 1, and a partition of
        # fewer than two rows cannot be split anyway.
        if N < 2:
            return False

        # Split by cut_point. No deep copy is needed: the data is only read,
        # and boolean indexing already yields new frames.
        data_left = data[data[feature] <= cut_point]
        data_right = data[data[feature] > cut_point]

        # Information gain obtained when splitting data at cut_point.
        cut_point_gain = cut_point_information_gain(dataset=data, cut_point=cut_point,
                                                    feature_label=feature, class_label=self._class_name)

        # Delta term of the MDLPC criterion. NOTE(review): the paper uses
        # log2(3^k - 2); this code uses log2(3^k), kept as-is to preserve
        # the original behavior.
        partition_entropy = entropy(data[self._class_name])
        k = len(data[self._class_name].unique())
        k_left = len(data_left[self._class_name].unique())
        k_right = len(data_right[self._class_name].unique())
        entropy_left = entropy(data_left[self._class_name])
        entropy_right = entropy(data_right[self._class_name])
        delta = log(3 ** k, 2) - (k * partition_entropy) \
                + (k_left * entropy_left) + (k_right * entropy_right)

        # Accept the split only when the gain clears the MDLP threshold.
        gain_threshold = (log(N - 1, 2) + delta) / N
        return cut_point_gain > gain_threshold
Exemplo n.º 2
0
    def MDLPC_criterion(self, data, feature, cut_point):
        '''
        Decide whether the proposed split of *data* at *cut_point* on
        *feature* passes the MDLPC acceptance test.

        :param feature: feature of interest
        :param cut_point: proposed cut_point
        :return: True/False, whether to accept the partition
        '''
        # Work on a copy and carve it into the two candidate halves.
        frame = data.copy(deep=True)
        below = frame[frame[feature] <= cut_point]
        above = frame[frame[feature] > cut_point]

        # Information gain achieved by cutting at cut_point.
        split_gain = cut_point_information_gain(dataset=frame, cut_point=cut_point,
                                                feature_label=feature, class_label=self._class_name)

        # Ingredients of the MDLP delta correction term.
        labels = frame[self._class_name]
        n_rows = len(frame)
        whole_entropy = entropy(labels)
        ent_below = entropy(below[self._class_name])
        ent_above = entropy(above[self._class_name])
        classes = len(labels.unique())
        classes_below = len(below[self._class_name].unique())
        classes_above = len(above[self._class_name].unique())

        delta = (log(3 ** classes, 2)
                 - classes * whole_entropy
                 + classes_below * ent_below
                 + classes_above * ent_above)

        # Accept only when the gain clears the MDLP threshold.
        threshold = (log(n_rows - 1, 2) + delta) / n_rows
        if split_gain > threshold:
            return True
        return False
Exemplo n.º 3
0
def Gain(S, A):
    """Return the information gain of attribute *A* over example set *S*.

    Gain(S, A) = Entropy(S) - sum_v (|S_v| / |S|) * Entropy(S_v), where S_v
    is the subset of S whose value for attribute A is v.

    Relies on module-level names defined elsewhere in this file:
    ``masterkey`` (the class column key), ``countPN`` and ``entropy``.

    :param S: iterable of example mappings (each indexable by A and masterkey)
    :param A: attribute key to evaluate
    :return: information gain as a number
    """
    total = len(S)
    pos, neg = countPN(S, masterkey)
    base_entropy = entropy(pos, neg)

    # Count positive/negative class examples for each value of A.
    # (The original also reserved a third slot per value as scratch space
    # for the entropy; it is computed directly below instead.)
    counts = {}
    for example in S:
        value = example[A]
        if value not in counts:
            counts[value] = [0, 0]
        if example[masterkey] == "yes":
            counts[value][0] += 1
        elif example[masterkey] == "no":
            counts[value][1] += 1

    # Subtract the size-weighted entropy of each value subset.
    weighted_entropy = 0
    for p, n in counts.values():
        weighted_entropy += ((p + n) / total) * entropy(p, n)

    return base_entropy - weighted_entropy
Exemplo n.º 4
0
def compute_weighted_entropy(attribute_vals, ys):
    """Return the entropy of *ys* weighted by the groups that
    *attribute_vals* induces.

    Labels in ``ys`` are grouped by the parallel value in
    ``attribute_vals``; the result is sum over groups of
    (group size / total) * entropy(group labels).

    Relies on an ``entropy(labels)`` function defined elsewhere in this
    file.

    :param attribute_vals: iterable of group keys, parallel to ``ys``
    :param ys: list of class labels
    :return: weighted entropy as a float
    """
    # Pivot labels by their attribute (match) value. Plain zip is
    # equivalent here and works on Python 3, unlike the removed
    # Python-2-only itertools.izip.
    ys_by_match = defaultdict(list)
    for is_match, y in zip(attribute_vals, ys):
        ys_by_match[is_match].append(y)

    # Entropy of each group, weighted by the group's share of all items.
    # Iterate .values() directly -- the group key itself is not needed.
    num_items = float(len(ys))
    wt_entropy = 0.0
    for lst in ys_by_match.values():
        wt_entropy += (len(lst) / num_items) * entropy(lst)
    return wt_entropy
Exemplo n.º 5
0
    def set_entropy(self):
        """Assign an entropy value to every node of the tree.

        Performs a breadth-first traversal from ``self.root``; each node's
        entropy is computed from the ``count`` values of its direct
        children, and leaves (no children) get entropy 0.
        """
        tic = time()
        pending = Queue()
        pending.put(self.root)

        while not pending.empty():
            current = pending.get()
            child_counts = []
            for child in current.children.values():
                pending.put(child)
                child_counts.append(child.count)
            # A leaf has no child distribution to measure.
            if not child_counts:
                current.entropy = 0
            else:
                current.entropy = entropy(np.array(child_counts))
Exemplo n.º 6
0
])
# NOTE(review): the `])` above closes a `testManual` array literal that begins
# before this chunk; its contents are not visible from here.
testCol = ['EGITIM', 'YAS', 'CINSIYET']  # feature columns (Turkish: education, age, gender)
testtData = pd.DataFrame(testManual, columns=testCol)

# Training frame: the same features plus the class column 'KABUL'
# (Turkish: acceptance). `testDataManual` is defined before this chunk.
columns2 = ['EGITIM', 'YAS', 'CINSIYET', 'KABUL']
testDataManual = pd.DataFrame(testDataManual, columns=columns2)
# data = 'Qualitative_Bankruptcy.data.csv'
# cov19 = 'TimeAge.csv'
# df = pd.DataFrame(np.array(pd.read_csv(data)))
# columns = ['Industrial Risk', 'Management Risk', 'Financial Flexibility', 'Credibility', 'Competitiveness',
#            'Operating Risk', 'Class']
#

# NOTE(review): this path is assigned but never used below -- the model is
# built from `testDataManual`, not from this CSV. Presumably leftover from an
# earlier experiment; TODO confirm before deleting.
data = 'Qualitative_Bankruptcy.data.csv'
# # df = pd.DataFrame(pd.read_csv(data))
# # blueWins = df[['blueWins']]
# # df.drop(columns=['blueWins'],inplace=True)
# # df.insert(len(df.columns), 'blueWins', blueWins)
# # print(df.head())

# Build the model on the manual training data, fit it, then classify the rows
# of `testtData`. NOTE(review): `entropy` here appears to be a class (a
# decision-tree-style model) defined elsewhere, not an entropy function --
# confirm against its definition.
e = entropy(testDataManual, 'karci', resultCol='KABUL', column_list=columns2)
e.calc()
e.result(testtData)
# for index, row in testtData.iterrows():
#
#     print(row['YAS'])
# rangei =100
# rangej = 1
# for i in range(0,rangei):
#     for j in range(0,rangej):
#         print('\rData Loading {:.5f}% {} {}'.format((i+j/(rangej-1 if rangej != 1 else rangej))/(rangei)*100,i,j), end='\n', flush=False)