예제 #1
0
def corr(idadf, features=None, ignore_indexer=True):
    """
    Compute the pairwise correlation matrix between numerical columns of an
    IdaDataFrame, using the in-database CORRELATION aggregate.

    See IdaDataFrame.corr

    Parameters
    ----------
    idadf : IdaDataFrame

    features : list of str, optional
        Columns to correlate. Per default, all numerical columns of idadf.

    ignore_indexer : bool, default: True
        Per default, the column declared as indexer in idadf is removed from
        the set of numerical columns.

    Returns
    -------
    pandas.DataFrame
        Symmetric matrix indexed by feature name on both axes. Cells for
        which no value was retrieved (the diagonal) are filled with 1.
        A scalar is returned instead when the matrix has a single cell, and
        None (after printing a message) when idadf has no numerical column.

    Raises
    ------
    TypeError
        If idadf is an IdaSeries, or if one of the requested features is
        not among the numerical columns.
    """
    if isinstance(idadf, ibmdbpy.IdaSeries):
        raise TypeError("corr() missing 1 required positional argument: 'other'")

    numerical_columns = idadf._get_numerical_columns()

    if not numerical_columns:
        print(idadf.name + " has no numeric columns")
        return

    if ignore_indexer is True:
        indexer = idadf.indexer
        if indexer:
            # Build a filtered copy rather than calling .remove(): the list
            # handed out by idadf may be shared with the IdaDataFrame (not
            # visible from here), so it is left untouched.
            numerical_columns = [c for c in numerical_columns if c != indexer]

    if features is not None:
        for feature in features:
            if feature not in numerical_columns:
                raise TypeError("Correlation-based measure not available for non-numerical columns %s"%feature)
    else:
        features = numerical_columns

    if len(features) < 2:
        # No pair to correlate: issuing the query would produce an empty
        # SELECT list. The matrix is trivially empty or a single 1.0.
        result = pd.DataFrame({f: {f: 1.0} for f in features})
    else:
        pairs = list(itertools.combinations(features, 2))

        # Original author's note: the limit of variables for an SQL
        # statement is 4096, i.e 64^2. Below that threshold a single query
        # is sent, otherwise the pairs are processed 100 at a time.
        if len(features) < 64:
            chunks = [pairs]
        else:
            chunks = chunklist(pairs, 100)

        values = OrderedDict()
        for chunk in chunks:
            agg_string = ', '.join(
                "CORRELATION(\"%s\",\"%s\")" % (pair[0], pair[1])
                for pair in chunk)
            name = idadf.internal_state.current_state
            data = idadf.ida_query("SELECT %s FROM %s" % (agg_string, name),
                                   first_row_only=True)

            # The i-th value of the returned row belongs to the i-th pair;
            # store it in both directions to get a symmetric matrix.
            for i, (left, right) in enumerate(chunk):
                values.setdefault(left, OrderedDict())[right] = data[i]
                values.setdefault(right, OrderedDict())[left] = data[i]

        result = pd.DataFrame(values).fillna(1)

    # Put the rows in the same order as the columns.
    result = result.reindex(result.columns)
    if len(result) == 1:
        # Positional access: label-based result[0] would look for a column
        # literally named 0 and fail.
        result = result.iloc[0, 0]

    return result
예제 #2
0
def corr(idadf, features=None, ignore_indexer=True):
    """
    Compute the pairwise correlation matrix between numerical columns of an
    IdaDataFrame, using the in-database CORRELATION aggregate.

    See IdaDataFrame.corr

    Parameters
    ----------
    idadf : IdaDataFrame

    features : list of str, optional
        Columns to correlate. Per default, all numerical columns of idadf.

    ignore_indexer : bool, default: True
        Per default, the column declared as indexer in idadf is removed from
        the set of numerical columns.

    Returns
    -------
    pandas.DataFrame
        Symmetric matrix indexed by feature name on both axes. Cells for
        which no value was retrieved (the diagonal) are filled with 1.
        A scalar is returned instead when the matrix has a single cell, and
        None (after printing a message) when idadf has no numerical column.

    Raises
    ------
    TypeError
        If idadf is an IdaSeries, or if one of the requested features is
        not among the numerical columns.
    """
    if isinstance(idadf, ibmdbpy.IdaSeries):
        raise TypeError(
            "corr() missing 1 required positional argument: 'other'")

    numerical_columns = idadf._get_numerical_columns()

    if not numerical_columns:
        print(idadf.name + " has no numeric columns")
        return

    if ignore_indexer is True:
        indexer = idadf.indexer
        if indexer:
            # Build a filtered copy rather than calling .remove(): the list
            # handed out by idadf may be shared with the IdaDataFrame (not
            # visible from here), so it is left untouched.
            numerical_columns = [c for c in numerical_columns if c != indexer]

    if features is not None:
        for feature in features:
            if feature not in numerical_columns:
                raise TypeError(
                    "Correlation-based measure not available for non-numerical columns %s"
                    % feature)
    else:
        features = numerical_columns

    if len(features) < 2:
        # No pair to correlate: issuing the query would produce an empty
        # SELECT list. The matrix is trivially empty or a single 1.0.
        result = pd.DataFrame({f: {f: 1.0} for f in features})
    else:
        pairs = list(itertools.combinations(features, 2))

        # Original author's note: the limit of variables for an SQL
        # statement is 4096, i.e 64^2. Below that threshold a single query
        # is sent, otherwise the pairs are processed 100 at a time.
        if len(features) < 64:
            chunks = [pairs]
        else:
            chunks = chunklist(pairs, 100)

        values = OrderedDict()
        for chunk in chunks:
            agg_string = ', '.join(
                "CORRELATION(\"%s\",\"%s\")" % (pair[0], pair[1])
                for pair in chunk)
            name = idadf.internal_state.current_state
            data = idadf.ida_query("SELECT %s FROM %s" % (agg_string, name),
                                   first_row_only=True)

            # The i-th value of the returned row belongs to the i-th pair;
            # store it in both directions to get a symmetric matrix.
            for i, (left, right) in enumerate(chunk):
                values.setdefault(left, OrderedDict())[right] = data[i]
                values.setdefault(right, OrderedDict())[left] = data[i]

        result = pd.DataFrame(values).fillna(1)

    # Put the rows in the same order as the columns.
    result = result.reindex(result.columns)
    if len(result) == 1:
        # Positional access: label-based result[0] would look for a column
        # literally named 0 and fail.
        result = result.iloc[0, 0]

    return result
예제 #3
0
def pearson(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the pearson correlation coefficients between a set of features
    and a set of targets in an IdaDataFrame. Provides more granularity than
    IdaDataFrame.corr

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, or Pandas.Series if only one target (sorted in
    descending order), or a scalar if there is exactly one target and one
    feature. When target and features are equal, the result of
    idadf.corr is returned.

    Raises
    ------
    TypeError
        If a feature or a target is not a numerical column.

    Notes
    -----
    Input columns as target and features should be numerical.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> pearson(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    # NOTE(review): _check_input is defined elsewhere; it presumably
    # normalises target and features into lists -- confirm.
    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError(
                "Correlation-based measure not available for non-numerical column %s"
                % feature)

    if target == features:
        return idadf.corr(features=features, ignore_indexer=ignore_indexer)

    # Validate each target itself (the previous code re-tested the last
    # feature here, so non-numerical targets were never rejected).
    for t in target:
        if t not in numerical_columns:
            raise TypeError(
                "Correlation-based measure not available for non-numerical column %s"
                % t)

    value_dict = OrderedDict()

    for t in target:
        value_dict[t] = OrderedDict()

        features_notarget = [x for x in features if x != t]
        if not features_notarget:
            # Nothing to correlate against: avoid an empty SELECT list.
            continue

        # One query below 64 features, otherwise batches of 100 columns.
        if len(features_notarget) < 64:
            chunks = [features_notarget]
        else:
            chunks = chunklist(features_notarget, 100)

        data = []
        for chunk in chunks:
            agg_string = ', '.join(
                "CORRELATION(\"%s\",\"%s\")" % (x, t) for x in chunk)
            name = idadf.internal_state.current_state
            # extend() accepts any row sequence, not only tuples.
            data.extend(
                idadf.ida_query("SELECT %s FROM %s" % (agg_string, name),
                                first_row_only=True))

        # Values come back in the order the features were requested.
        for i, feature in enumerate(features_notarget):
            value_dict[t][feature] = data[i]

    ### Fill the matrix
    # Missing cells (a target that is also a feature) are set to 1.
    result = pd.DataFrame(value_dict).fillna(1)

    if len(result.columns) == 1:
        if len(result) == 1:
            return result.iloc[0, 0]
        # Single target: return a Series sorted by decreasing correlation.
        return result[result.columns[0]].sort_values(ascending=False)

    # Rows: targets that are also features first, then remaining features.
    order = [x for x in result.columns if x in features
             ] + [x for x in features if x not in result.columns]
    return result.reindex(order)
예제 #4
0
def pearson(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the pearson correlation coefficients between a set of features
    and a set of targets in an IdaDataFrame. Provides more granularity than
    IdaDataFrame.corr

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame, or Pandas.Series if only one target (sorted in
    descending order), or a scalar if there is exactly one target and one
    feature. When target and features are equal, the result of
    idadf.corr is returned.

    Raises
    ------
    TypeError
        If a feature or a target is not a numerical column.

    Notes
    -----
    Input columns as target and features should be numerical.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> pearson(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    # NOTE(review): _check_input is defined elsewhere; it presumably
    # normalises target and features into lists -- confirm.
    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%feature)

    if target == features:
        return idadf.corr(features = features, ignore_indexer=ignore_indexer)

    # Validate each target itself (the previous code re-tested the last
    # feature here, so non-numerical targets were never rejected).
    for t in target:
        if t not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%t)

    value_dict = OrderedDict()

    for t in target:
        value_dict[t] = OrderedDict()

        features_notarget = [x for x in features if x != t]
        if not features_notarget:
            # Nothing to correlate against: avoid an empty SELECT list.
            continue

        # One query below 64 features, otherwise batches of 100 columns.
        if len(features_notarget) < 64:
            chunks = [features_notarget]
        else:
            chunks = chunklist(features_notarget, 100)

        data = []
        for chunk in chunks:
            agg_list = ["CORRELATION(\"%s\",\"%s\")"%(x, t) for x in chunk]
            agg_string = ', '.join(agg_list)
            name = idadf.internal_state.current_state
            # extend() accepts any row sequence, not only tuples.
            data.extend(idadf.ida_query("SELECT %s FROM %s"%(agg_string, name), first_row_only = True))

        # Values come back in the order the features were requested.
        for i, feature in enumerate(features_notarget):
            value_dict[t][feature] = data[i]

    ### Fill the matrix
    # Missing cells (a target that is also a feature) are set to 1.
    result = pd.DataFrame(value_dict).fillna(1)

    if len(result.columns) == 1:
        if len(result) == 1:
            return result.iloc[0,0]
        # Single target: return a Series sorted by decreasing correlation.
        # Series.sort() no longer exists in pandas; sort_values replaces it.
        return result[result.columns[0]].sort_values(ascending = False)

    # Rows: targets that are also features first, then remaining features.
    order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
    return result.reindex(order)