def corr(idadf, features=None, ignore_indexer=True):
    """
    See IdaDataFrame.corr

    Build the matrix of pairwise correlations between numerical columns by
    evaluating the SQL ``CORRELATION`` aggregate for every pair of features.

    Parameters
    ----------
    idadf : IdaDataFrame
    features : list of str, optional
        Columns to correlate. Per default, all numerical columns are used.
    ignore_indexer : bool, default: True
        If True, the column declared as indexer is removed from the list of
        numerical columns before the features are validated.

    Returns
    -------
    pandas.DataFrame
        One row and one column per feature. Cells without a retrieved value
        (this includes the diagonal) are filled with 1.
        Nothing is returned, after printing a message, when idadf has no
        numerical column.

    Raises
    ------
    TypeError
        If idadf is an IdaSeries, or if one of the given features is not in
        the list of numerical columns.
    """
    if isinstance(idadf, ibmdbpy.IdaSeries):
        raise TypeError("corr() missing 1 required positional argument: 'other'")

    # TODO: catch case n <= 1
    numerical_columns = idadf._get_numerical_columns()
    if not numerical_columns:
        print(idadf.name + " has no numeric columns")
        return

    if ignore_indexer is True and idadf.indexer and idadf.indexer in numerical_columns:
        numerical_columns.remove(idadf.indexer)

    if features is None:
        features = numerical_columns
    else:
        for candidate in features:
            if candidate not in numerical_columns:
                raise TypeError(
                    "Correlation-based measure not available for non-numerical columns %s" % candidate)

    matrix = OrderedDict()
    pairs = list(itertools.combinations(features, 2))

    def _collect(batch, make_row):
        # A single SELECT evaluates every CORRELATION of the batch; the
        # answer is one row holding one value per pair, in batch order.
        expressions = ['CORRELATION("%s","%s")' % (left, right) for left, right in batch]
        select_list = ', '.join(expressions)
        table = idadf.internal_state.current_state
        answer = idadf.ida_query("SELECT %s FROM %s" % (select_list, table),
                                 first_row_only=True)
        # Store each value twice so that the matrix is symmetric.
        for position, (left, right) in enumerate(batch):
            matrix.setdefault(left, make_row())[right] = answer[position]
            matrix.setdefault(right, make_row())[left] = answer[position]

    if len(features) < 64:  # the limit of variables for an SQL statement is 4096, i.e 64^2
        _collect(pairs, dict)
    else:
        # Too many pairs for one statement: query them 100 at a time.
        for chunk in chunklist(pairs, 100):
            _collect(chunk, OrderedDict)

    # NOTE(review): fillna(1) fills the diagonal, but also any other cell for
    # which the query returned no value - confirm that this is intended.
    result = pd.DataFrame(matrix).fillna(1)

    # Give the rows the same order as the columns.
    result = result.reindex(result.columns)
    if len(result) == 1:
        # NOTE(review): this selects the column labelled 0 - confirm intent.
        result = result[0]
    return result
def corr(idadf, features=None, ignore_indexer=True):
    """
    See IdaDataFrame.corr

    Query the SQL ``CORRELATION`` aggregate for every pair of features and
    arrange the values in a symmetric pandas.DataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame
    features : list of str, optional
        Columns to correlate. Per default, all numerical columns are used.
    ignore_indexer : bool, default: True
        If True, the column declared as indexer is dropped from the list of
        numerical columns before the features are checked against it.

    Returns
    -------
    pandas.DataFrame
        Rows and columns are labelled with the feature names. Cells that
        received no value (this includes the diagonal) are filled with 1.
        When idadf has no numerical column a message is printed and nothing
        is returned.

    Raises
    ------
    TypeError
        If idadf is an IdaSeries, or if a given feature is not part of the
        numerical columns.
    """
    if isinstance(idadf, ibmdbpy.IdaSeries):
        raise TypeError("corr() missing 1 required positional argument: 'other'")

    # TODO: catch case n <= 1
    numerical_columns = idadf._get_numerical_columns()
    if not numerical_columns:
        print(idadf.name + " has no numeric columns")
        return

    if ignore_indexer is True:
        indexer_is_numerical = idadf.indexer and idadf.indexer in numerical_columns
        if indexer_is_numerical:
            numerical_columns.remove(idadf.indexer)

    if features is None:
        features = numerical_columns
    else:
        for name_to_check in features:
            if name_to_check not in numerical_columns:
                raise TypeError(
                    "Correlation-based measure not available for non-numerical columns %s" % name_to_check)

    cells = OrderedDict()
    all_pairs = list(itertools.combinations(features, 2))

    if len(features) < 64:  # the limit of variables for an SQL statement is 4096, i.e 64^2
        # Everything fits into one statement.
        batches = [all_pairs]
        inner_type = dict
    else:
        # Otherwise the pairs are queried 100 at a time.
        batches = chunklist(all_pairs, 100)
        inner_type = OrderedDict

    for batch in batches:
        fragments = []
        for first, second in batch:
            fragments.append('CORRELATION("%s","%s")' % (first, second))
        select_list = ', '.join(fragments)

        source = idadf.internal_state.current_state
        row = idadf.ida_query("SELECT %s FROM %s" % (select_list, source),
                              first_row_only=True)

        # The row holds one value per pair, in the order of the batch; each
        # value is written to both (first, second) and (second, first).
        for offset, (first, second) in enumerate(batch):
            for key in (first, second):
                if key not in cells:
                    cells[key] = inner_type()
            cells[first][second] = row[offset]
            cells[second][first] = row[offset]

    # NOTE(review): fillna(1) fills the diagonal, but also any other cell for
    # which the query returned no value - confirm that this is intended.
    result = pd.DataFrame(cells).fillna(1)

    # Order the rows like the columns.
    result = result.reindex(result.columns)
    if len(result) == 1:
        # NOTE(review): this selects the column labelled 0 - confirm intent.
        result = result[0]
    return result
def pearson(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the pearson correlation coefficients between a set of features and
    a set of targets in an IdaDataFrame. Provides more granularity than
    IdaDataFrame.corr

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target
        With one target, the series is sorted in descending order. With one
        target and a single remaining feature, the single value is returned.
        If target and features are the same, the result of idadf.corr is
        returned as is.

    Raises
    ------
    TypeError
        If a feature or a target is not a numerical column.

    Notes
    -----
    Input columns as target and features should be numerical.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> pearson(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError(
                "Correlation-based measure not available for non-numerical column %s"
                % feature)

    if target == features:
        return idadf.corr(features=features, ignore_indexer=ignore_indexer)

    # Each target must be validated on its own name. Testing the leftover
    # loop variable of the feature check would never reject a target and
    # would fail with a NameError when there are no features at all.
    for t in target:
        if t not in numerical_columns:
            raise TypeError(
                "Correlation-based measure not available for non-numerical column %s"
                % t)

    def _correlations_with(t, columns):
        # One SELECT returning CORRELATION(column, t) for each given column.
        agg_list = ['CORRELATION("%s","%s")' % (x, t) for x in columns]
        agg_string = ', '.join(agg_list)
        name = idadf.internal_state.current_state
        return idadf.ida_query("SELECT %s FROM %s" % (agg_string, name),
                               first_row_only=True)

    value_dict = OrderedDict()
    for t in target:
        value_dict[t] = OrderedDict()
        # A column is not correlated against itself; that cell is filled below.
        features_notarget = [x for x in features if x != t]

        if len(features_notarget) < 64:
            data = _correlations_with(t, features_notarget)
        else:
            # Long lists are queried 100 columns at a time. The partial rows
            # are gathered in a list so that any row sequence type works.
            data = []
            for chunk in chunklist(features_notarget, 100):
                data.extend(_correlations_with(t, chunk))

        for i, feature in enumerate(features_notarget):
            value_dict[t][feature] = data[i]

    # Fill the matrix: cells without a value (e.g. a target against itself)
    # are set to 1.
    result = pd.DataFrame(value_dict).fillna(1)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            result = result[result.columns[0]].copy()
            result.sort_values(inplace=True, ascending=False)
    else:
        # Rows: first the targets that are also features, then the other features.
        order = [x for x in result.columns if x in features
                 ] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    return result
def pearson(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the pearson correlation coefficients between a set of features and
    a set of targets in an IdaDataFrame. Provides more granularity than
    IdaDataFrame.corr

    Parameters
    ----------
    idadf : IdaDataFrame

    target : str or list of str, optional
        A column or list of columns to be used as target. Per default,
        consider all columns

    features : str or list of str, optional
        A column or list of columns to be used as features. Per default,
        consider all numerical columns.

    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target
        With one target, the series is sorted in descending order. With one
        target and a single remaining feature, the single value is returned.
        If target and features are the same, the result of idadf.corr is
        returned as is.

    Raises
    ------
    TypeError
        If a feature or a target is not a numerical column.

    Notes
    -----
    Input columns as target and features should be numerical.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> pearson(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%feature)

    if target == features:
        return idadf.corr(features=features, ignore_indexer=ignore_indexer)

    # Validate each target by its own name. Checking the variable left over
    # from the feature loop would let every target through, and would fail
    # with a NameError when the feature list is empty.
    for t in target:
        if t not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s"%t)

    value_dict = OrderedDict()
    for t in target:
        value_dict[t] = OrderedDict()
        # The target itself is left out; its own cell is filled below.
        features_notarget = [x for x in features if x != t]

        if len(features_notarget) < 64:
            column_batches = [features_notarget]
        else:
            # Long lists are queried 100 columns at a time.
            column_batches = chunklist(features_notarget, 100)

        # Partial rows are gathered in a list, so that the row returned by
        # ida_query may be any sequence type and not only a tuple.
        data = []
        for batch in column_batches:
            agg_list = ["CORRELATION(\"%s\",\"%s\")"%(x, t) for x in batch]
            agg_string = ', '.join(agg_list)
            name = idadf.internal_state.current_state
            data.extend(idadf.ida_query("SELECT %s FROM %s"%(agg_string, name),
                                        first_row_only=True))

        for i, feature in enumerate(features_notarget):
            value_dict[t][feature] = data[i]

    # Fill the matrix: cells without a value (e.g. a target against itself)
    # are set to 1.
    result = pd.DataFrame(value_dict).fillna(1)

    if len(result.columns) == 1:
        if len(result) == 1:
            result = result.iloc[0, 0]
        else:
            # Series.sort no longer exists in pandas; sort_values returns the
            # series ordered by value.
            result = result[result.columns[0]].sort_values(ascending=False)
    else:
        # Rows: first the targets that are also features, then the other features.
        order = [x for x in result.columns if x in features] + [x for x in features if x not in result.columns]
        result = result.reindex(order)

    return result