Example #1
0
def entropy(table, field=-1, num_categories=2):
    """Total entropy of the category distribution found in *table*.

    Computes ``-sum(p(x) * log(p(x), num_categories))`` over the unique
    values of `field`, where ``p(x)`` is the relative frequency of x as
    reported by ``count_unique``.  Higher entropy means the categorizations
    are more evenly mixed.

    Args:
        table: sequence of rows, or an object exposing a zero-argument
            ``count()`` (presumably a queryset-like object — TODO confirm).
        field: row index/key whose values are the categories (default: last).
        num_categories: logarithm base; 2 yields entropy in bits.

    Returns:
        float: the entropy; 0.0 for an empty table.
    """
    from math import log
    counts = count_unique(table, field)
    entropy = 0.0
    try:
        N = table.count()
    except (AttributeError, TypeError):
        # Plain sequences have no zero-argument .count() (list.count needs
        # an argument and raises TypeError); fall back to len().
        N = len(table)
    for k in counts:
        p = float(counts[k]) / N
        entropy -= p * log(p, num_categories)
    return entropy
Example #2
0
def entropy(table, field=-1, num_categories=2):
    """Return the total entropy of the categorizations in *table*.

    Equivalent to ``-sum(p * log(p, num_categories))`` taken over the
    relative frequencies of each distinct value of `field`; measures how
    different each categorization is from the others.
    """
    from math import log
    tally = count_unique(table, field)
    try:
        total = table.count()
    except:
        total = len(table)
    probabilities = (float(freq) / total for freq in tally.values())
    return -sum(p * log(p, num_categories) for p in probabilities)
Example #3
0
def gini_impurity(table, field=-1):
    """Gini impurity of the category assignments in *table*.

    Returns the probability, in [0, 1], that the wrong category/prediction
    has been assigned: the sum of ``p_i * p_j`` over all ordered pairs of
    distinct categories i != j.

    Args:
        table: sequence of rows, or an object exposing a zero-argument
            ``count()`` (presumably a queryset-like object — TODO confirm).
        field: row index/key holding the category (default: last).

    Returns:
        float: the impurity; 0.0 for an empty or single-category table.
    """
    try:
        N = table.count()
    except (AttributeError, TypeError):
        # Plain sequences: .count() requires an argument, so fall back.
        N = len(table)
    counts = count_unique(table, field)
    impurity = 0.0
    for k1 in counts:
        p1 = float(counts[k1]) / N
        for k2 in counts:
            if k1 != k2:  # only mismatched category pairs contribute
                p2 = float(counts[k2]) / N
                impurity += p1 * p2
    return impurity
Example #4
0
def gini_impurity(table, field=-1):
    """Gini impurity evaluation of predictions.

    Returns the probability [0, 1], that the wrong category/prediction has
    been assigned.
    """
    try:
        size = table.count()
    except:
        size = len(table)
    freq = count_unique(table, field)
    score = 0.0
    for cat in freq:
        p_cat = float(freq[cat]) / size
        for other in freq:
            if cat == other:
                continue
            score += p_cat * (float(freq[other]) / size)
    return score
Example #5
0
def entropy_and_impurity(table, field=-1, num_categories=2):
    """Compute entropy and Gini impurity of the categorizations in one pass.

    Returns ``(entropy, impurity)``: entropy is
    ``-sum(p * log(p, num_categories))`` over the category frequencies;
    impurity is the probability, in [0, 1], that the wrong
    category/prediction has been assigned.

    Args:
        table: sequence of rows, or an object exposing a zero-argument
            ``count()`` (presumably a queryset-like object — TODO confirm).
        field: row index/key holding the category (default: last).
        num_categories: logarithm base for the entropy term.

    >>> entropy_and_impurity(tobes_data, -1)  # doctest: +ELLIPSIS
    (1.50524..., 0.6328125)
    """
    from math import log
    try:
        N = table.count()
    except (AttributeError, TypeError):
        # Plain sequences: .count() requires an argument, so fall back.
        N = len(table)
    counts = count_unique(table, field)
    impurity = 0.0
    entropy = 0.0
    for k1 in counts:
        p1 = float(counts[k1]) / N
        entropy -= p1 * log(p1, num_categories)
        for k2 in counts:
            if k1 != k2:  # only mismatched category pairs contribute
                p2 = float(counts[k2]) / N
                impurity += p1 * p2
    return entropy, impurity
Example #6
0
def entropy_and_impurity(table, field=-1, num_categories=2):
    """Return (entropy, impurity) for the categorizations in *table*.

    The impurity term is the Gini impurity: the probability [0, 1] that the
    wrong category/prediction has been assigned.

    >>> entropy_and_impurity(tobes_data, -1)  # doctest: +ELLIPSIS
    (1.50524..., 0.6328125)
    """
    from math import log
    try:
        total = table.count()
    except:
        total = len(table)
    freq = count_unique(table, field)
    ent = 0.0
    gini = 0.0
    for cat in freq:
        p = float(freq[cat]) / total
        ent -= p * log(p, num_categories)
        for other in freq:
            if cat == other:
                continue
            gini += p * (float(freq[other]) / total)
    return ent, gini
Example #7
0
def build_tree(table, field=-1, scoref=entropy, ignore_fields=('pk', 'id')):
    """Build a classification decision tree.

    Greedily picks, at each node, the (column, value) split with the best
    information gain according to `scoref`, recursing until no split
    improves the score; leaves hold the category counts for the target
    field.

    Args:
        table: sequence of rows — tuples/lists, or dicts keyed by field
            name — or an object exposing a zero-argument ``count()``.
        field: index of the target/category column (default -1, the last);
            for dict rows this indexes the sorted, filtered key list.
        scoref: scoring function, e.g. ``entropy`` or ``gini_impurity``.
        ignore_fields: dict keys never considered as split candidates.

    Returns:
        DecisionNode: the tree root; an empty node for an empty table.

    >>> print_tree(build_tree(tobes_data))  # doctest: +NORMALIZE_WHITESPACE
    0:google? 
      T-> 3:21? 
          T-> {'Premium': 3}
          F-> 2:yes? 
              T-> {'Basic': 1}
              F-> {'None': 1}
      F-> 0:slashdot? 
          T-> {'None': 3}
          F-> 2:yes? 
              T-> {'Basic': 4}
              F-> 3:21? 
                  T-> {'Basic': 1}
                  F-> {'None': 3}
    """
    try:
        N = len(table)
    except TypeError:  # no __len__, e.g. a queryset-like object
        try:
            N = table.count()
        except (AttributeError, TypeError):
            N = 0

    if not N:
        return DecisionNode()

    current_score = scoref(table)
    # NOTE(review): scoref is called without `field`, so a non-default
    # target column is scored at scoref's own default — confirm intended.

    # Track the best split found so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    # For dict rows, build a stable, filtered ordering of the keys so that
    # integer column indices are meaningful; plain rows use positions.
    if isinstance(table[0], dict) and isinstance(field, int):
        keys = sorted(tuple(k for k in table[0] if k not in ignore_fields))
        M = len(keys)
    else:
        M = len(table[0])
        keys = range(M)
    keyed_field = keys[field]

    for col in range(M):
        keyed_col = keys[col]
        if keyed_col == keyed_field:
            continue  # never split on the target column itself
        # Candidate split values: every distinct value in this column.
        column_values = set()
        for row in table:
            column_values.add(get(row, keyed_col))
        # Try dividing the table up for each value in this column.
        for value in column_values:
            (set1, set2) = divide(table, field=col, target=value)
            set1, set2 = tuple(set1), tuple(set2)

            # Information gain of this split over the current node.
            p = float(len(set1)) / N
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    # Create the sub branches.
    if best_gain > 0:
        # BUGFIX: propagate field/scoref/ignore_fields into the recursion;
        # previously the subtrees were always built with the defaults.
        true_branch = build_tree(best_sets[0], field=field, scoref=scoref,
                                 ignore_fields=ignore_fields)
        false_branch = build_tree(best_sets[1], field=field, scoref=scoref,
                                  ignore_fields=ignore_fields)
        return DecisionNode(col=best_criteria[0],
                            value=best_criteria[1],
                            tb=true_branch,
                            fb=false_branch)
    # No split improves the score: emit a leaf with the category counts.
    return DecisionNode(results=count_unique(table, field=keyed_field))
Example #8
0
def build_tree(table, field=-1, scoref=entropy, ignore_fields=('pk', 'id')):
    """Build a classification decision tree.

    At each node the (column, value) split with the highest information
    gain (per `scoref`) is chosen; recursion stops when no split improves
    the score, producing a leaf of category counts for the target field.

    Args:
        table: sequence of rows — tuples/lists, or dicts keyed by field
            name — or an object exposing a zero-argument ``count()``.
        field: index of the target/category column (default -1, the last);
            for dict rows this indexes the sorted, filtered key list.
        scoref: scoring function, e.g. ``entropy`` or ``gini_impurity``.
        ignore_fields: dict keys never considered as split candidates.

    Returns:
        DecisionNode: the tree root; an empty node for an empty table.

    >>> print_tree(build_tree(tobes_data))  # doctest: +NORMALIZE_WHITESPACE
    0:google? 
      T-> 3:21? 
          T-> {'Premium': 3}
          F-> 2:yes? 
              T-> {'Basic': 1}
              F-> {'None': 1}
      F-> 0:slashdot? 
          T-> {'None': 3}
          F-> 2:yes? 
              T-> {'Basic': 4}
              F-> 3:21? 
                  T-> {'Basic': 1}
                  F-> {'None': 3}
    """
    try:
        N = len(table)
    except TypeError:  # no __len__, e.g. a queryset-like object
        try:
            N = table.count()
        except (AttributeError, TypeError):
            N = 0

    if not N:
        return DecisionNode()

    current_score = scoref(table)
    # NOTE(review): scoref is called without `field`, so a non-default
    # target column is scored at scoref's own default — confirm intended.

    # Track the best split found so far.
    best_gain = 0.0
    best_criteria = None
    best_sets = None

    # For dict rows, build a stable, filtered ordering of the keys so that
    # integer column indices are meaningful; plain rows use positions.
    if isinstance(table[0], dict) and isinstance(field, int):
        keys = sorted(tuple(k for k in table[0] if k not in ignore_fields))
        M = len(keys)
    else:
        M = len(table[0])
        keys = range(M)
    keyed_field = keys[field]

    for col in range(M):
        keyed_col = keys[col]
        if keyed_col == keyed_field:
            continue  # never split on the target column itself
        # Candidate split values: every distinct value in this column.
        column_values = set()
        for row in table:
            column_values.add(get(row, keyed_col))
        # Try dividing the table up for each value in this column.
        for value in column_values:
            (set1, set2) = divide(table, field=col, target=value)
            set1, set2 = tuple(set1), tuple(set2)

            # Information gain of this split over the current node.
            p = float(len(set1)) / N
            gain = current_score - p * scoref(set1) - (1 - p) * scoref(set2)
            if gain > best_gain and len(set1) > 0 and len(set2) > 0:
                best_gain = gain
                best_criteria = (col, value)
                best_sets = (set1, set2)

    # Create the sub branches.
    if best_gain > 0:
        # BUGFIX: propagate field/scoref/ignore_fields into the recursion;
        # previously the subtrees were always built with the defaults.
        true_branch = build_tree(best_sets[0], field=field, scoref=scoref,
                                 ignore_fields=ignore_fields)
        false_branch = build_tree(best_sets[1], field=field, scoref=scoref,
                                  ignore_fields=ignore_fields)
        return DecisionNode(col=best_criteria[0], value=best_criteria[1],
                            tb=true_branch, fb=false_branch)
    # No split improves the score: emit a leaf with the category counts.
    return DecisionNode(results=count_unique(table, field=keyed_field))