Example #1
import math

def _entropy(data, target_attr):
    """
    Calculates the entropy of the given data set with respect to the
    distribution of `target_attr` values across its instances.
    """
    # Distribution of target-attribute values as (value, count) pairs
    value_dist = frequencies([instance[target_attr] for instance in data])
    sum_frequencies = float(sum(count for _, count in value_dist))
    data_entropy = 0.0
    for _, frequency in value_dist:
        # Weight each value's surprisal by its probability of occurring
        p_value = frequency / sum_frequencies
        data_entropy += -p_value * math.log(p_value, 2)
    return data_entropy
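Both this function and the next lean on a `frequencies` helper from the `dtree` module that is not shown here. Based on the test in Example #3 (input `[1, 2, 2, 3]` yields `[(1, 1), (2, 2), (3, 1)]`), a minimal sketch could be:

from collections import Counter

def frequencies(values):
    """
    Returns (value, count) pairs for the given iterable, e.g.
    frequencies([1, 2, 2, 3]) -> [(1, 1), (2, 2), (3, 1)].
    """
    # Counter tallies occurrences; items() yields (value, count) pairs
    return list(Counter(values).items())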
Example #2
def information_gain(data, split_attr, target_attr):
    """
    Calculates the information gain (reduction in entropy) that would
    result by splitting the data on `split_attr`
    """
    data_entropy = _entropy(data, target_attr)
    spl_att_val_dist = frequencies([instance[split_attr] for instance in data])
    total_subset_entropy = 0.0

    # For each value of the attribute we are splitting on, compute the
    # entropy (with respect to the target attribute) of the subset of
    # instances with that value, weighted by the value's probability
    for value, value_freq in spl_att_val_dist:
        p_value = value_freq / float(len(data))
        data_subset = get_matching_instances(data, split_attr, value)
        total_subset_entropy += p_value * _entropy(data_subset, target_attr)

    # Subtract the subset entropy after we split on `split_attr` from
    # the current data set entropy, giving us the information gain from
    # splitting on `split_attr`
    return data_entropy - total_subset_entropy
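`information_gain` additionally relies on a `get_matching_instances` helper that is not shown in these examples. From its call site it evidently filters the data down to the instances whose `split_attr` equals a given value; a minimal sketch under that assumption:

def get_matching_instances(data, attr, value):
    """
    Returns the subset of `data` whose `attr` field equals `value`.
    """
    # Keep only the instances carrying the given attribute value
    return [instance for instance in data if instance[attr] == value]

With these helpers in place, a toy run (the dataset below is made up for illustration) behaves as expected:

data = [
    {"outlook": "sunny", "play": "no"},
    {"outlook": "sunny", "play": "no"},
    {"outlook": "rainy", "play": "yes"},
    {"outlook": "rainy", "play": "yes"},
]
# "outlook" perfectly separates "play", so the gain equals the full
# entropy of "play" (1.0 bit)
print(information_gain(data, "outlook", "play"))  # -> 1.0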
Example #3
def test_frequencies(self):
    """ Tests dtree.frequencies """
    input_ = [1, 2, 2, 3]
    expected = [(1, 1), (2, 2), (3, 1)]
    actual = dtree.frequencies(input_)
    self.assertEqual(set(expected), set(actual))
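This test method is presumably defined inside a `unittest.TestCase` subclass; a minimal self-contained harness (the class name here is an assumption) would be:

import unittest
import dtree  # module under test, assumed to provide frequencies()

class DtreeTest(unittest.TestCase):
    def test_frequencies(self):
        """ Tests dtree.frequencies """
        input_ = [1, 2, 2, 3]
        expected = [(1, 1), (2, 2), (3, 1)]
        actual = dtree.frequencies(input_)
        # Compare as sets since the pair ordering is unspecified
        self.assertEqual(set(expected), set(actual))

if __name__ == "__main__":
    unittest.main()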