Exemplo n.º 1
0
def outer_su(idadf1,
             key1,
             idadf2,
             key2,
             target=None,
             features1=None,
             features2=None):
    """
    Compute the symmetric uncertainty coefficients between a set of features
    and a target taken from two different IdaDataFrames joined on a key.

    The two tables are combined with a FULL OUTER JOIN inside a temporary
    view, ``su`` is evaluated on that view, and the view is dropped again
    whether or not ``su`` succeeds.

    This is experimental.

    Parameters
    ----------
    idadf1 : IdaDataFrame
        Left side of the join (SQL alias ``a``). The target is resolved
        against this table.
    key1 : str
        Join column of ``idadf1``.
    idadf2 : IdaDataFrame
        Right side of the join (SQL alias ``b``).
    key2 : str
        Join column of ``idadf2``. It is excluded from the features of
        ``idadf2``.
    target : optional
        Target column of ``idadf1``; validated by ``_check_input``.
        NOTE(review): the code concatenates it with a string prefix, so a
        single column name is presumably expected -- confirm.
    features1, features2 : optional
        Feature columns of ``idadf1`` / ``idadf2``; validated by
        ``_check_input``.

    Returns
    -------
    Whatever ``su`` returns for the joined view. Columns of the view are
    named ``a.<column>`` and ``b.<column>``.

    Raises
    ------
    ValueError
        If ``key1`` is not a column of ``idadf1`` or ``key2`` is not a
        column of ``idadf2``.
    """
    target1, features1 = _check_input(idadf1, target, features1)
    # Only the validated feature list of the second table is needed.
    _, features2 = _check_input(idadf2, None, features2)

    if key1 not in idadf1.columns:
        raise ValueError("%s is not a column in idadf1" % (key1,))
    if key2 not in idadf2.columns:
        raise ValueError("%s is not a column in idadf2" % (key2,))

    condition = 'a."%s" = b."%s"' % (key1, key2)

    # Drop the join key from the right-hand features without mutating the
    # list in place (it may be the caller's own list).
    features2 = [feature for feature in features2 if feature != key2]

    afeaturesas = ", ".join(
        'a."%s" as "a.%s" ' % (feature, feature) for feature in features1)
    bfeaturesas = ", ".join(
        'b."%s" as "b.%s" ' % (feature, feature) for feature in features2)

    # Skip empty fragments so that an empty feature list does not leave a
    # stray comma in the SELECT list.
    selectlist = [part for part in (afeaturesas, bfeaturesas) if part]

    if target1 is not None:
        selectlist.append('a."%s" as "a.%s" ' % (target1, target1))
        atarget = "a." + target1
    else:
        atarget = None

    abfeatures = (["a." + feature for feature in features1] +
                  ["b." + feature for feature in features2])
    selectstr = ", ".join(selectlist)

    expression = "SELECT %s FROM %s as a FULL OUTER JOIN %s as b ON %s" % (
        selectstr, idadf1.name, idadf2.name, condition)

    viewname = idadf1._idadb._create_view_from_expression(expression)

    try:
        idadf_join = ibmdbpy.IdaDataFrame(idadf1._idadb, viewname)
        return su(idadf_join, target=atarget, features=abfeatures)
    finally:
        # The view is only a scratch object; always clean it up.
        idadf1._idadb.drop_view(viewname)
def outer_su(idadf1, key1, idadf2, key2, target=None, features1=None, features2=None):
    """
    Compute the symmetric uncertainty coefficients between a set of features
    and a target from two different IdaDataFrames joined on a particular key.

    A temporary view holding the FULL OUTER JOIN of both tables is created,
    ``su`` is run on it, and the view is dropped afterwards in every case.

    This is experimental.

    Parameters
    ----------
    idadf1, idadf2 : IdaDataFrame
        Left (alias ``a``) and right (alias ``b``) side of the join.
    key1, key2 : str
        Join columns of ``idadf1`` and ``idadf2`` respectively. ``key2`` is
        never used as a feature of ``idadf2``.
    target : optional
        Target column of ``idadf1``, validated by ``_check_input``.
    features1, features2 : optional
        Feature columns of each table, validated by ``_check_input``.

    Returns
    -------
    The result of ``su`` on the joined view, whose columns carry an
    ``a.`` or ``b.`` prefix.

    Raises
    ------
    ValueError
        If one of the keys is not a column of its IdaDataFrame.
    """
    target1, features1 = _check_input(idadf1, target, features1)
    features2 = _check_input(idadf2, None, features2)[1]

    # Report the offending key name in the message.
    if key1 not in idadf1.columns:
        raise ValueError("%s is not a column in idadf1" % (key1,))
    if key2 not in idadf2.columns:
        raise ValueError("%s is not a column in idadf2" % (key2,))

    condition = "a.\"%s\" = b.\"%s\"" % (key1, key2)

    # Build a new list instead of removing in place, so a list owned by the
    # caller is left untouched.
    features2 = [name for name in features2 if name != key2]

    def _aliased(prefix, names):
        # '<prefix>."col" as "<prefix>.col" ' for every column name.
        return ", ".join(
            "%s.\"%s\" as \"%s.%s\" " % (prefix, name, prefix, name)
            for name in names)

    selectlist = []
    for fragment in (_aliased("a", features1), _aliased("b", features2)):
        # An empty feature list must not contribute an empty SELECT item.
        if fragment:
            selectlist.append(fragment)

    atarget = None
    if target1 is not None:
        selectlist.append(_aliased("a", [target1]))
        atarget = "a." + target1

    abfeatures = ["a." + name for name in features1]
    abfeatures += ["b." + name for name in features2]

    expression = "SELECT %s FROM %s as a FULL OUTER JOIN %s as b ON %s" % (
        ", ".join(selectlist), idadf1.name, idadf2.name, condition)

    viewname = idadf1._idadb._create_view_from_expression(expression)

    try:
        idadf_join = ibmdbpy.IdaDataFrame(idadf1._idadb, viewname)
        return su(idadf_join, target=atarget, features=abfeatures)
    finally:
        # Scratch view: drop it regardless of success or failure.
        idadf1._idadb.drop_view(viewname)
Exemplo n.º 3
0
def chisquared(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the Chi-Squared statistics coefficients between a set of features
    and a set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, or Pandas.Series (sorted in descending order) if only
    one target, or a scalar if there is only one target and one feature.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Chi-squared as defined in
    A Comparative Study on Feature Selection and Classification Methods Using
    Gene Expression Profiles and Proteomic Patterns. (GIW02F006)

    The scalability of this approach is not very good. Should not be used on
    high dimensional data.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> chisquared(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)
    # Cache of group-by counts, keyed by a column name (marginal counts) or
    # by a (feature, target) tuple (joint counts).
    count_dict = dict()
    length = len(idadf)

    values = OrderedDict()

    for t in target:
        if t not in values:
            values[t] = OrderedDict()
        features_notarget = [x for x in features if (x != t)]

        ### Compute
        for feature in features_notarget:
            if feature not in values:
                values[feature] = OrderedDict()
            # Skip pairs that were already filled in by symmetry.
            if t not in values[feature]:
                if t not in count_dict:
                    count = idadf.count_groupby(t)
                    count_serie = count["count"]
                    count_serie.index = count[t]
                    count_dict[t] = count_serie

                # Marginal counts per target class.
                C = dict(count_dict[t])

                if feature not in count_dict:
                    count = idadf.count_groupby(feature)
                    count_serie = count["count"]
                    count_serie.index = count[feature]
                    count_dict[feature] = count_serie

                # Marginal counts per feature class.
                R = dict(count_dict[feature])

                if (feature, t) not in count_dict:
                    count_dict[(feature, t)] = idadf.count_groupby([feature, t])

                count = count_dict[(feature, t)]

                chi = 0
                for target_class in C.keys():
                    count_target = count[count[t] == target_class][[feature, "count"]]
                    A_target = count_target['count']
                    A_target.index = count_target[feature]

                    # Only (feature class, target class) combinations that
                    # appear in the joint counts contribute to the sum.
                    for feature_class in A_target.index:
                        a = A_target[feature_class]                       # observed
                        e = R[feature_class] * C[target_class] / length   # expected
                        chi += ((a - e)**2) / e

                values[t][feature] = chi   # chisquared is symmetric
                if feature in target:
                    values[feature][t] = chi

    result = pd.DataFrame(values).fillna(np.nan)
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            # Series.sort() no longer exists in pandas; sort_values is the
            # in-place equivalent.
            result.sort_values(inplace=True, ascending=False)

    return result
Exemplo n.º 4
0
def info_gain(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the information gain / mutual information coefficients between a
    set of features and a set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, or Pandas.Series (sorted in descending order) if only
    one target, or a scalar if there is only one target and one feature.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> info_gain(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    # Cache of single-column "raw" entropies so each column is queried once.
    entropy_dict = OrderedDict()
    length = len(idadf)
    loglength = log(length)

    values = OrderedDict()

    for t in target:
        if t not in values:
            values[t] = OrderedDict()
        features_notarget = [x for x in features if (x != t)]

        for feature in features_notarget:
            if feature not in values:
                values[feature] = OrderedDict()
            # Skip pairs that were already filled in by symmetry.
            if t not in values[feature]:
                if t not in entropy_dict:
                    entropy_dict[t] = entropy(idadf, t, mode="raw")
                if feature not in entropy_dict:
                    entropy_dict[feature] = entropy(idadf, feature, mode="raw")
                join_entropy = entropy(idadf, [t] + [feature], mode="raw")

                # The raw values are rescaled by the row count and divided
                # by log(2). NOTE(review): presumably this normalises the
                # un-normalised entropies and converts to bits -- confirm
                # against entropy(mode="raw").
                value = ((entropy_dict[t] + entropy_dict[feature] - join_entropy) / length + loglength) / log(2)
                values[t][feature] = value
                if feature in target:
                    values[feature][t] = value

    result = pd.DataFrame(values).fillna(np.nan)
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            # Series.sort() no longer exists in pandas; sort_values is the
            # in-place equivalent.
            result.sort_values(inplace=True, ascending=False)

    return result
Exemplo n.º 5
0
def ttest(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the t-statistics values of a set of features against a set of
    target attributes.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against which the t-statistics values will
        be computed. Per default, consider all columns

    features : str or list of str, optional
        A column or list of columns for which the t-statistics values will be
        computed against each target attributes. Per default, consider all
        columns, except non numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, or Pandas.Series (sorted in descending order) if only
    one target, or a scalar if there is only one target and one feature.

    Raises
    ------
    TypeError
        If the features argument or the data set does not contains any
        numerical features. Raise TypeError.

    Notes
    -----
    This implements the "modified" ttest as defined in the paper
    A Modified T-test feature Selection Method and Its Application on
    the HapMap Genotype Data (Zhou et al.)

    The target columns should be categorical, while the feature columns should
    be numerical.

    For each feature the reported value is the maximum of the statistic over
    all classes of the target.

    The scalability of this approach is not very good. Should not be used on
    high dimensional data.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> ttest(idadf,"CLASS")
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)
    ttest_dict = OrderedDict()
    length = len(idadf)

    # Per-target caches, so each target is only queried once.
    S_dict = dict()
    M_dict = dict()
    class_mean_dict = dict()

    numerical_columns = idadf._get_numerical_columns()

    # Filter out non numerical columns
    features = [feature for feature in features if feature in numerical_columns]
    if not features:
        raise TypeError("No numerical features.")

    #mean = idadf[features].mean() # This is broken
    mean = idadf.mean()

    if target is None:
        target = list(idadf.columns)

    for t in target:
        features_notarget = [x for x in features if (x != t)]

        if t not in M_dict:
            count = idadf.count_groupby(t)
            target_count = count["count"]
            target_count.index = count[t]
            M_dict[t] = np.sqrt(1/target_count + 1/length)

        if t not in S_dict:
            S_dict[t] = idadf.within_class_std(target=t, features=features_notarget)

        if t not in class_mean_dict:
            class_mean_dict[t] = idadf.mean_groupby(t, features=features_notarget)

        M = M_dict[t]
        S = S_dict[t]
        class_mean = class_mean_dict[t]

        ttest_dict[t] = OrderedDict()
        for feature in features_notarget:
            ttest_dict[t][feature] = OrderedDict()
            for target_class in class_mean.index:
                numerator = abs(class_mean.loc[target_class][feature] - mean[feature])
                denominator = M[target_class] * S[feature]

                ttest_dict[t][feature][target_class] = numerator / denominator

        # Collapse the per-class statistics to their maximum.
        for feature in features_notarget:
            ttest_dict[t][feature] = max(ttest_dict[t][feature].values())

    result = pd.DataFrame(ttest_dict)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            # Series.sort() no longer exists in pandas; sort_values is the
            # in-place equivalent.
            result.sort_values(inplace=True, ascending=False)
    else:
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    return result
Exemplo n.º 6
0
def ttest(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the t-statistics values of a set of features against a set of
    target attributes.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against which the t-statistics values will
        be computed. Per default, consider all columns

    features : str or list of str, optional
        A column or list of columns for which the t-statistics values will be
        computed against each target attributes. Per default, consider all
        columns, except non numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, or Pandas.Series (sorted in descending order) if only
    one target, or a scalar if there is only one target and one feature.

    Raises
    ------
    TypeError
        If the features argument or the data set does not contains any
        numerical features. Raise TypeError.

    Notes
    -----
    This implements the "modified" ttest as defined in the paper
    A Modified T-test feature Selection Method and Its Application on
    the HapMap Genotype Data (Zhou et al.)

    The target columns should be categorical, while the feature columns should
    be numerical.

    The value kept for a feature is the largest statistic found over the
    classes of the target.

    The scalability of this approach is not very good. Should not be used on
    high dimensional data.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> ttest(idadf,"CLASS")
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)
    ttest_dict = OrderedDict()
    length = len(idadf)

    # Caches keyed by target column; they only matter if a target repeats.
    S_dict = dict()
    M_dict = dict()
    class_mean_dict = dict()

    numerical_columns = idadf._get_numerical_columns()

    # Filter out non numerical columns
    features = [
        feature for feature in features if feature in numerical_columns
    ]
    if not features:
        raise TypeError("No numerical features.")

    #mean = idadf[features].mean() # This is broken
    mean = idadf.mean()

    if target is None:
        target = list(idadf.columns)

    for t in target:
        features_notarget = [x for x in features if (x != t)]

        if t not in M_dict:
            count = idadf.count_groupby(t)
            target_count = count["count"]
            target_count.index = count[t]
            M_dict[t] = np.sqrt(1 / target_count + 1 / length)

        if t not in S_dict:
            S_dict[t] = idadf.within_class_std(target=t,
                                               features=features_notarget)

        if t not in class_mean_dict:
            class_mean_dict[t] = idadf.mean_groupby(t,
                                                    features=features_notarget)

        M = M_dict[t]
        S = S_dict[t]
        class_mean = class_mean_dict[t]

        per_feature = OrderedDict()
        for feature in features_notarget:
            # Largest |class mean - overall mean| / (M * S) over all classes.
            per_feature[feature] = max(
                abs(class_mean.loc[target_class][feature] - mean[feature]) /
                (M[target_class] * S[feature])
                for target_class in class_mean.index)
        ttest_dict[t] = per_feature

    result = pd.DataFrame(ttest_dict)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            # Series.sort() no longer exists in pandas; sort_values is the
            # in-place equivalent.
            result.sort_values(inplace=True, ascending=False)
    else:
        order = [x for x in result.columns if x in features
                 ] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    return result
Exemplo n.º 7
0
def gini_pairwise(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the conditional gini coefficients between a set of features and a
    set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame (missing cells filled with 0), or Pandas.Series (sorted
    in ascending order) if only one target, or a scalar if there is only one
    target and one feature.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Each (target, feature) pair is evaluated by one scalar SQL query.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> gini_pairwise(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    gini_dict = OrderedDict()
    length = len(idadf)

    # Query template; placeholders in order: row count, feature, table name,
    # target, feature, feature. Built once because it does not depend on the
    # loop variables.
    query = (
        "SELECT SUM((POWER(c,2) - gini)/c)/%s FROM " +
        "(SELECT SUM(POWER(count,2)) as gini, SUM(count) as c FROM " +
        "(SELECT CAST(COUNT(*) AS FLOAT) AS count, \"%s\" FROM %s GROUP BY \"%s\",\"%s\") "
        + "GROUP BY \"%s\")")

    for t in target:
        gini_dict[t] = OrderedDict()
        features_notarget = [x for x in features if (x != t)]

        for feature in features_notarget:
            query0 = query % (length, feature, idadf.name, t, feature, feature)
            gini_dict[t][feature] = idadf.ida_scalar_query(query0)

    result = pd.DataFrame(gini_dict).fillna(np.nan)

    if len(result.columns) > 1:
        order = [x for x in result.columns if x in features
                 ] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    result = result.dropna(axis=1, how="all")

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            # sort_values returns a new Series unless inplace=True; without
            # it the sorted result was silently thrown away.
            result.sort_values(inplace=True, ascending=True)
    else:
        result = result.fillna(0)

    return result
Exemplo n.º 8
0
def pearson(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the pearson correlation coefficients between a set of features and a
    set of target in an IdaDataFrame. Provide more granularity than
    IdaDataFrame.corr

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, or Pandas.Series (sorted in descending order) if only
    one target, or a scalar if there is only one target and one feature.
    When target and features are equal, the result of ``idadf.corr`` is
    returned directly.

    Raises
    ------
    TypeError
        If a feature or a target column is not numerical.

    Notes
    -----
    Input columns as target and features should be numerical.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> pearson(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    value_dict = OrderedDict()

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError(
                "Correlation-based measure not available for non-numerical column %s"
                % feature)

    if target == features:
        return idadf.corr(features=features, ignore_indexer=ignore_indexer)

    # Validate the targets themselves. The check must look at ``t``; testing
    # the loop variable left over from the feature loop never rejects
    # anything and fails with NameError when there are no features.
    for t in target:
        if t not in numerical_columns:
            raise TypeError(
                "Correlation-based measure not available for non-numerical column %s"
                % t)

    for t in target:
        value_dict[t] = OrderedDict()

        features_notarget = [x for x in features if x != t]

        if len(features_notarget) < 64:
            # Few enough columns: one query computes every correlation.
            agg_list = [
                "CORRELATION(\"%s\",\"%s\")" % (x, t)
                for x in features_notarget
            ]
            agg_string = ', '.join(agg_list)
            name = idadf.internal_state.current_state
            data = idadf.ida_query("SELECT %s FROM %s" %
                                   (agg_string, name),
                                   first_row_only=True)
        else:
            # Many columns: split the work into several queries and
            # concatenate the returned rows.
            chunkgen = chunklist(features_notarget, 100)
            data = ()
            for chunk in chunkgen:
                agg_list = [
                    "CORRELATION(\"%s\",\"%s\")" % (x, t) for x in chunk
                ]
                agg_string = ', '.join(agg_list)

                name = idadf.internal_state.current_state
                data += idadf.ida_query("SELECT %s FROM %s" %
                                        (agg_string, name),
                                        first_row_only=True)

        # Values come back in the same order as the requested columns.
        for i, feature in enumerate(features_notarget):
            value_dict[t][feature] = data[i]

    ### Fill the matrix
    # Cells that were not computed (a column against itself) default to 1.
    result = pd.DataFrame(value_dict).fillna(1)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            result.sort_values(inplace=True, ascending=False)
    else:
        order = [x for x in result.columns if x in features
                 ] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    return result
Exemplo n.º 9
0
def spearman(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the spearman rho correlation coefficients between a set of features
    and a set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target

    Raises
    ------
    TypeError
        If one of the features is not a numerical column.

    Notes
    -----
    Input columns as target and features should be numerical.
    This function is a wrapper for pearson: every numerical column involved
    is replaced by its rank in a temporary view, pearson is run on that view,
    and the view is dropped afterwards.
    The scalability of this approach is not very good. Should not be used on
    high dimensional data.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> spearman(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError(
                "Correlation-based measure not available for non-numerical column %s"
                % feature)

    if ignore_indexer is True:
        if idadf.indexer:
            if idadf.indexer in numerical_columns:
                # Guarded: list.remove raises ValueError when the indexer
                # is not (or no longer) part of the features.
                if idadf.indexer in features:
                    features.remove(idadf.indexer)

    numerical_features = [x for x in features if x in numerical_columns]
    numerical_targets = [x for x in target if x in numerical_columns]

    # Union of features and targets, de-duplicated while keeping a stable
    # order (features first) so the view and the result do not depend on
    # set iteration order.
    rank_columns = []
    for column in numerical_features + numerical_targets:
        if column not in rank_columns:
            rank_columns.append(column)

    agg_list = [
        "CAST(RANK() OVER (ORDER BY \"%s\") AS INTEGER) AS \"%s\"" % (x, x)
        for x in rank_columns
    ]
    agg_string = ', '.join(agg_list)

    expression = "SELECT %s FROM %s" % (agg_string, idadf.name)

    viewname = idadf._idadb._create_view_from_expression(expression)

    try:
        idadf_rank = ibmdbpy.IdaDataFrame(idadf._idadb, viewname)
        return pearson(idadf_rank,
                       target=target,
                       features=rank_columns,
                       ignore_indexer=ignore_indexer)
    finally:
        # The rank view is only a scratch object; always drop it.
        idadf._idadb.drop_view(viewname)
Exemplo n.º 10
0
def gain_ratio(idadf,
               target=None,
               features=None,
               symmetry=True,
               ignore_indexer=True):
    """
    Compute the gain ratio coefficients between a set of features and a
    set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns against to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    symmetry : bool, default: True
        If True, compute the symmetric gain ratio as defined by
        [Lopez de Mantaras 1991]. Otherwise, the asymmetric gain ratio.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame (missing cells filled with 1), or Pandas.Series (sorted
    in ascending order) if only one target, or a scalar if there is only one
    target and one feature.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> gain_ratio(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    # Cache of single-column "raw" entropies so each column is queried once.
    entropy_dict = dict()
    length = len(idadf)
    values = OrderedDict()
    # Added to the raw entropy terms below. NOTE(review): presumably this
    # compensates for the un-normalised entropy(mode="raw") -- confirm.
    corrector = length * np.log(length)

    for t in target:
        if t not in values:
            values[t] = OrderedDict()
        features_notarget = [x for x in features if (x != t)]

        for feature in features_notarget:
            if feature not in values:
                values[feature] = OrderedDict()

            if t not in values[feature]:  # i.e. it was not already computed
                if t not in entropy_dict:
                    entropy_dict[t] = entropy(idadf, t, mode="raw")
                if feature not in entropy_dict:
                    entropy_dict[feature] = entropy(idadf, feature, mode="raw")

                join_entropy = entropy(idadf, [t] + [feature], mode="raw")
                disjoin_entropy = entropy_dict[t] + entropy_dict[feature]
                # Local names deliberately differ from the info_gain and
                # gain_ratio functions to avoid shadowing them.
                gain = (disjoin_entropy - join_entropy)

                if symmetry:
                    ratio = (gain + corrector) / (
                        disjoin_entropy + 2 * corrector
                    )  # 2* because symmetric
                    values[t][feature] = ratio
                    if feature in target:
                        values[feature][t] = ratio
                else:
                    # Asymmetric: normalise by the entropy of the column the
                    # value is stored under.
                    ratio_target = (gain + corrector) / (entropy_dict[t] +
                                                         corrector)
                    values[t][feature] = ratio_target
                    if feature in target:
                        ratio_feature = (gain + corrector) / (
                            entropy_dict[feature] + corrector)
                        values[feature][t] = ratio_feature

    ### Fill the matrix
    result = pd.DataFrame(values).fillna(np.nan)
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        order = [x for x in result.columns if x in features
                 ] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            # Series.sort() no longer exists in pandas; sort_values is the
            # in-place equivalent.
            result.sort_values(inplace=True, ascending=True)
    else:
        result = result.fillna(1)

    return result
Exemplo n.º 11
0
def info_gain(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the pairwise information gain (mutual information) between a set
    of target columns and a set of feature columns of an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, Pandas.Series or scalar
        A DataFrame when several target columns remain, a Series sorted in
        descending order when a single target column remains, and a single
        value when the result holds exactly one cell.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    For a pair (t, f) the value is computed from the "raw" entropies as
    ``((raw(t) + raw(f) - raw(t, f)) / n + log(n)) / log(2)`` where ``n`` is
    ``len(idadf)``.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> info_gain(idadf)
    """
    # Validate / normalise the requested columns
    target, features = _check_input(idadf, target, features, ignore_indexer)

    raw_entropies = OrderedDict()  # cache: column name -> raw entropy
    n_rows = len(idadf)
    log_n_rows = log(n_rows)

    scores = OrderedDict()

    for t in target:
        scores.setdefault(t, OrderedDict())

        for feature in features:
            if feature == t:
                continue
            scores.setdefault(feature, OrderedDict())
            if t in scores[feature]:
                # The mirrored cell was already filled for an earlier target
                continue

            # Single-column entropies are computed once and reused
            for column in (t, feature):
                if column not in raw_entropies:
                    raw_entropies[column] = entropy(idadf, column, mode="raw")
            joint_raw = entropy(idadf, [t, feature], mode="raw")

            score = (
                (raw_entropies[t] + raw_entropies[feature] - joint_raw) /
                n_rows + log_n_rows) / log(2)
            scores[t][feature] = score
            if feature in target:
                # Mutual information is symmetric: fill the mirrored cell too
                scores[feature][t] = score

    frame = pd.DataFrame(scores).fillna(np.nan)
    frame = frame.dropna(axis=1, how="all")
    n_columns = len(frame.columns)

    if n_columns > 1:
        # Rows follow the column order first, then the remaining features
        present = [x for x in frame.columns if x in features]
        missing = [x for x in features if x not in frame.columns]
        return frame.reindex(present + missing)

    if n_columns != 1:
        return frame

    if len(frame) == 1:
        return frame.iloc[0, 0]

    ranking = frame[frame.columns[0]].copy()
    ranking.sort_values(inplace=True, ascending=False)
    return ranking
Exemplo n.º 12
0
def chisquared(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the Chi-Squared statistics coefficients between a set of features
    and a set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, Pandas.Series or scalar
        A DataFrame when several target columns remain, a Series sorted in
        descending order when a single target column remains, and a single
        value when the result holds exactly one cell.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Chi-squared as defined in
    A Comparative Study on Feature Selection and Classification Methods Using
    Gene Expression Profiles and Proteomic Patterns. (GIW02F006)

    Only the (feature class, target class) combinations returned by
    ``count_groupby`` contribute to the sum.

    The scalability of this approach is not very good. Should not be used on
    high dimensional data.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> chisquared(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)
    # Cache of group-by counts, keyed by column name (marginal counts, stored
    # as a Series indexed by class) or by (feature, target) (joint counts).
    count_dict = dict()
    length = len(idadf)

    values = OrderedDict()

    for t in target:
        if t not in values:
            values[t] = OrderedDict()
        features_notarget = [x for x in features if (x != t)]

        ### Compute
        for feature in features_notarget:
            if feature not in values:
                values[feature] = OrderedDict()
            if t in values[feature]:
                # Already filled as the mirrored cell of an earlier target
                continue

            # Marginal counts per class, computed once per column
            for column in (t, feature):
                if column not in count_dict:
                    count = idadf.count_groupby(column)
                    count_serie = count["count"]
                    count_serie.index = count[column]
                    count_dict[column] = count_serie

            C = dict(count_dict[t])        # target class -> count
            R = dict(count_dict[feature])  # feature class -> count

            if (feature, t) not in count_dict:
                count_dict[(feature, t)] = idadf.count_groupby([feature, t])

            count = count_dict[(feature, t)]

            chi = 0
            for target_class in C.keys():
                count_target = count[count[t] == target_class][[
                    feature, "count"
                ]]
                A_target = count_target['count']
                A_target.index = count_target[feature]

                for feature_class in A_target.index:
                    a = A_target[feature_class]  # observed count
                    # expected count under independence
                    e = R[feature_class] * C[target_class] / length
                    chi += ((a - e)**2) / e

            values[t][feature] = chi  # chisquared is symmetric
            if feature in target:
                values[feature][t] = chi

    result = pd.DataFrame(values).fillna(np.nan)
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        order = [x for x in result.columns if x in features
                 ] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            # sort_values returns a new Series; the result must be rebound,
            # otherwise the ranking is silently left unsorted.
            result = result[result.columns[0]].sort_values(ascending=False)

    return result
Exemplo n.º 13
0
def gini_pairwise(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the conditional gini coefficients between a set of features and a
    set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, Pandas.Series or scalar
        A DataFrame (missing cells filled with 0) when several target columns
        remain, a Series sorted in ascending order when a single target
        column remains, and a single value when the result holds exactly one
        cell.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    One scalar SQL query is issued per (target, feature) pair.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> gini_pairwise(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    gini_dict = OrderedDict()
    length = len(idadf)

    # Placeholders, in order: row count, feature, table name, target,
    # feature, feature. The inner query counts rows per (target, feature)
    # class pair, the middle one aggregates per feature class, and the outer
    # one sums the per-class terms and divides by the row count.
    query = ("SELECT SUM((POWER(c,2) - gini)/c)/%s FROM "+
    "(SELECT SUM(POWER(count,2)) as gini, SUM(count) as c FROM "+
    "(SELECT CAST(COUNT(*) AS FLOAT) AS count, \"%s\" FROM %s GROUP BY \"%s\",\"%s\") "+
    "GROUP BY \"%s\")")

    for t in target:
        gini_dict[t] = OrderedDict()
        features_notarget = [x for x in features if (x != t)]

        for feature in features_notarget:
            query0 = query % (length, feature, idadf.name, t, feature, feature)
            gini_dict[t][feature] = idadf.ida_scalar_query(query0)

    result = pd.DataFrame(gini_dict).fillna(np.nan)

    if len(result.columns) > 1:
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    result = result.dropna(axis=1, how="all")

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            # Series.sort was deprecated and later removed from pandas;
            # sort_values is the supported equivalent.
            result = result[result.columns[0]].sort_values(ascending=True)
    else:
        result = result.fillna(0)

    return result
Exemplo n.º 14
0
def pearson(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the pearson correlation coefficients between a set of features and a
    set of target in an IdaDataFrame. Provide more granularity than
    IdaDataFrame.corr

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, Pandas.Series or scalar
        The output of ``idadf.corr`` when target and features are the same
        list. Otherwise a DataFrame (missing cells filled with 1) when there
        are several targets, a Series sorted in descending order when there
        is a single target, and a single value when the result holds exactly
        one cell.

    Raises
    ------
    TypeError
        If one of the feature or target columns is not numerical.

    Notes
    -----
    Input columns as target and features should be numerical.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> pearson(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%feature)

    if target == features:
        return idadf.corr(features = features, ignore_indexer=ignore_indexer)

    # Validate each target itself (not the leftover loop variable from the
    # feature check above).
    for t in target:
        if t not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%t)

    value_dict = OrderedDict()

    for t in target:
        value_dict[t] = OrderedDict()

        features_notarget = [x for x in features if x != t]
        if not features_notarget:
            # Nothing to correlate with this target; an empty select list
            # would produce an invalid SQL statement.
            continue

        if len(features_notarget) < 64:
            chunks = [features_notarget]
        else:
            # NOTE(review): the chunk size (100) is larger than the
            # threshold above (64) -- confirm which limit is intended.
            chunks = chunklist(features_notarget, 100)

        data = []
        for chunk in chunks:
            agg_list = ["CORRELATION(\"%s\",\"%s\")"%(x, t) for x in chunk]
            agg_string = ', '.join(agg_list)

            name = idadf.internal_state.current_state
            data.extend(idadf.ida_query("SELECT %s FROM %s"%(agg_string, name), first_row_only = True))

        for i, feature in enumerate(features_notarget):
            value_dict[t][feature] = data[i]

    ### Fill the matrix
    result = pd.DataFrame(value_dict).fillna(1)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            # Series.sort was deprecated and later removed from pandas;
            # sort_values is the supported equivalent.
            result = result[result.columns[0]].sort_values(ascending=False)
    else:
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    return result
Exemplo n.º 15
0
def spearman(idadf, target=None, features = None, ignore_indexer=True):
    """
    Compute the spearman rho correlation coefficients between a set of features
    and a set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target

    Raises
    ------
    TypeError
        If one of the feature columns is not numerical.

    Notes
    -----
    Input columns as target and features should be numerical.
    This function is a wrapper for pearson: it creates a temporary view in
    which every selected column is replaced by its rank, runs pearson on that
    view and drops the view afterwards.
    The scalability of this approach is not very good. Should not be used on
    high dimensional data.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> spearman(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%feature)

    if ignore_indexer is True and idadf.indexer:
        # Only remove the indexer when it is actually listed; list.remove
        # raises ValueError on a missing element.
        if idadf.indexer in numerical_columns and idadf.indexer in features:
            features.remove(idadf.indexer)

    numerical_features = [x for x in features if x in numerical_columns]
    numerical_targets = [x for x in target if x in numerical_columns]

    # Ordered, duplicate-free union of features and targets, so that the
    # column order of the view (and of the result) is deterministic.
    ranked_columns = []
    for column in numerical_features + numerical_targets:
        if column not in ranked_columns:
            ranked_columns.append(column)

    agg_list = ["CAST(RANK() OVER (ORDER BY \"%s\") AS INTEGER) AS \"%s\""%(x, x) for x in ranked_columns]
    agg_string = ', '.join(agg_list)

    expression = "SELECT %s FROM %s"%(agg_string, idadf.name)

    viewname = idadf._idadb._create_view_from_expression(expression)

    try:
        idadf_rank = ibmdbpy.IdaDataFrame(idadf._idadb, viewname)
        return pearson(idadf_rank, target = target, features=ranked_columns, ignore_indexer=ignore_indexer)
    finally:
        # Always drop the temporary view, whether pearson succeeded or not
        idadf._idadb.drop_view(viewname)
    
    
    
    
 
        
        

        
Exemplo n.º 16
0
def gain_ratio(idadf, target=None, features=None, symmetry=True, ignore_indexer=True):
    """
    Compute the gain ratio coefficients between a set of features and a
    set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all columns.

    symmetry : bool, default: True
        If True, compute the symmetric gain ratio as defined by
        [Lopez de Mantaras 1991]. Otherwise, the asymmetric gain ratio.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, Pandas.Series or scalar
        A DataFrame (missing cells filled with 1) when several target columns
        remain, a Series sorted in ascending order when a single target
        column remains, and a single value when the result holds exactly one
        cell.

    Notes
    -----
    Input columns as target and features should be categorical, otherwise
    this measure does not make much sense.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> gain_ratio(idadf)
    """
    # Check input
    target, features = _check_input(idadf, target, features, ignore_indexer)

    entropy_dict = dict()  # cache: column name -> raw entropy
    length = len(idadf)
    values = OrderedDict()
    # Correction term n * log(n) added to the raw entropies in the ratios below
    corrector = length * np.log(length)

    for t in target:
        if t not in values:
            values[t] = OrderedDict()
        features_notarget = [x for x in features if (x != t)]

        for feature in features_notarget:
            if feature not in values:
                values[feature] = OrderedDict()

            if t in values[feature]:
                # Already filled as the mirrored cell of an earlier target
                continue

            if t not in entropy_dict:
                entropy_dict[t] = entropy(idadf, t, mode="raw")
            if feature not in entropy_dict:
                entropy_dict[feature] = entropy(idadf, feature, mode="raw")

            join_entropy = entropy(idadf, [t] + [feature], mode="raw")
            disjoin_entropy = entropy_dict[t] + entropy_dict[feature]
            # Local names deliberately differ from the module-level
            # info_gain / gain_ratio functions to avoid shadowing them.
            gain = disjoin_entropy - join_entropy

            if symmetry:
                ratio = (gain + corrector) / (disjoin_entropy + 2 * corrector)  # 2* because symmetric
                values[t][feature] = ratio
                if feature in target:
                    values[feature][t] = ratio
            else:
                # Asymmetric: normalise by the entropy of the column the
                # cell belongs to, so the mirrored cell gets its own ratio.
                values[t][feature] = (gain + corrector) / (entropy_dict[t] + corrector)
                if feature in target:
                    values[feature][t] = (gain + corrector) / (entropy_dict[feature] + corrector)

    ### Fill the matrix
    result = pd.DataFrame(values).fillna(np.nan)
    result = result.dropna(axis=1, how="all")

    if len(result.columns) > 1:
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            # Series.sort was deprecated and later removed from pandas;
            # sort_values is the supported equivalent.
            result = result[result.columns[0]].sort_values(ascending=True)
    else:
        result = result.fillna(1)

    return result