def outer_su(idadf1, key1, idadf2, key2, target=None, features1=None, features2=None):
    """
    Compute the symmetric uncertainty coefficients between a set of features
    and a set of targets from two different IdaDataFrames, joined with a FULL
    OUTER JOIN on a particular key. This is experimental.

    Parameters
    ----------
    idadf1 : IdaDataFrame
        Left-hand IdaDataFrame; also the one the target is looked up in.
    key1 : str
        Join column in ``idadf1``.
    idadf2 : IdaDataFrame
        Right-hand IdaDataFrame.
    key2 : str
        Join column in ``idadf2``.
    target : str or list of str, optional
        Target column(s) from ``idadf1``. Per default, consider all columns.
    features1 : str or list of str, optional
        Feature columns from ``idadf1``. Per default, consider all columns.
    features2 : str or list of str, optional
        Feature columns from ``idadf2``. Per default, consider all columns.

    Returns
    -------
    Result of ``su`` computed over the joined data set.

    Raises
    ------
    ValueError
        If ``key1`` (resp. ``key2``) is not a column of the corresponding
        IdaDataFrame.
    """
    target1, features1 = _check_input(idadf1, target, features1)
    target2, features2 = _check_input(idadf2, None, features2)

    # BUG FIX: the original messages contained a "%s" placeholder but never
    # interpolated the offending key name into it.
    if key1 not in idadf1.columns:
        raise ValueError("%s is not a column in idadf1" % key1)
    if key2 not in idadf2.columns:
        raise ValueError("%s is not a column in idadf2" % key2)

    condition = "a.\"%s\" = b.\"%s\"" % (key1, key2)

    # Do not select the join key twice.
    if key2 in features2:
        features2.remove(key2)

    afeaturesas = ", ".join(["a.\"%s\" as \"a.%s\" " % (feature, feature)
                             for feature in features1])
    bfeaturesas = ", ".join(["b.\"%s\" as \"b.%s\" " % (feature, feature)
                             for feature in features2])

    selectlist = [afeaturesas, bfeaturesas]

    if target1 is not None:
        atargetas = ", ".join(["a.\"%s\" as \"a.%s\" " % (tar, tar)
                               for tar in [target1]])
        selectlist.append(atargetas)
        atarget = "a." + target1
    else:
        atarget = None

    abfeatures = (["a." + feature for feature in features1] +
                  ["b." + feature for feature in features2])
    selectstr = ", ".join(selectlist)

    expression = "SELECT %s FROM %s as a FULL OUTER JOIN %s as b ON %s" % (
        selectstr, idadf1.name, idadf2.name, condition)

    viewname = idadf1._idadb._create_view_from_expression(expression)
    # Always drop the temporary view, even when su raises.
    try:
        idadf_join = ibmdbpy.IdaDataFrame(idadf1._idadb, viewname)
        return su(idadf_join, target=atarget, features=abfeatures)
    finally:
        idadf1._idadb.drop_view(viewname)
def idaview(request, idadb, idadf):
    """
    IdaDataFrame fixture to be used for the whole testing session.
    Open a view based on idadf fixture.
    """
    view_name = "TEST_VIEW_ibmdbpy"

    def drop_session_view():
        # Best-effort cleanup at the end of the testing session.
        try:
            idadb.drop_view(view_name)
            idadb.commit()
        except:
            pass

    request.addfinalizer(drop_session_view)

    # Start from a clean slate if a previous run left the view behind.
    if idadb.exists_view(view_name):
        idadb.drop_view(view_name)
    idadb._create_view(idadf, view_name)
    return ibmdbpy.IdaDataFrame(idadb, view_name)
def idaview_tmp(request, idadb, idadf):
    """
    IdaDataFrame fixture to be used by destructive and semi-destructive
    functions. To be considered as a temporary DataFrame that is created
    and destroyed for each function that requires it.
    Opens a view based on idadf fixture.
    """
    view_name = "TEST_VIEW_ibmdbpy_TMP"

    def drop_tmp_view():
        # Best-effort cleanup when the requesting function finishes.
        try:
            idadb.drop_view(view_name)
            idadb.commit()
        except:
            pass

    request.addfinalizer(drop_tmp_view)

    # Recreate the view from scratch if a stale one exists.
    if idadb.exists_view(view_name):
        idadb.drop_view(view_name)
    idadb._create_view(idadf, view_name)
    return ibmdbpy.IdaDataFrame(idadb, view_name)
def predict(self, idadf, column_id=None, outtable=None, outtableProb=None, mestimation=False):
    """
    Use the Naive Bayes predict stored procedure to apply a Naive Bayes
    model to generate classification predictions for a data set.

    Parameters
    ----------
    idadf : IdaDataFrame
        IdaDataFrame to be used as input.
    column_id : str, optional
        The column of the input table that identifies a unique instance ID.
        By default, the same id column that is specified in the stored
        procedure to build the model.
    outtable : str, optional
        The name of the output table where the predictions are stored. It
        should contain only alphanumerical characters and underscores. All
        lower case characters will be converted to upper case characters.
        If this parameter is not specified, it is generated automatically.
        If the parameter corresponds to an existing table in the database,
        it will be replaced.
    outtableProb : str, optional
        The name of the output table where the probabilities for each of
        the classes are stored, with the same naming constraints as
        ``outtable``. If this parameter is not specified, the table is not
        created. If the parameter corresponds to an existing table in the
        database, it will be replaced.
    mestimation : flag, default: False
        A flag that indicates the use of m-estimation for probabilities.
        This kind of estimation might be slower than other ones, but it
        might produce better results for small or unbalanced data sets.

    Returns
    -------
    IdaDataFrame
        IdaDataFrame containing the classification decision for each
        datapoints referenced by their ID.
    """
    if not isinstance(idadf, ibmdbpy.IdaDataFrame):
        raise TypeError("Argument should be an IdaDataFrame")
    idadf._idadb._check_procedure("PREDICT_NAIVEBAYES", "Prediction for Naive Bayes")

    # Resolve and validate the instance-ID column.
    if column_id is None:
        column_id = self._column_id
    if column_id not in idadf.columns:
        raise ValueError("No id columns is available in IdaDataFrame:" +
                         column_id +
                         ". Either create a new ID column using add_column_id function" +
                         " or give the name of a column that can be used as ID")

    if self._idadb is None:
        raise IdaNaiveBayesError("The Naive Bayes model was not trained before.")

    # Pick or validate output table names; drop pre-existing leftovers.
    if outtable is None:
        outtable = idadf._idadb._get_valid_tablename('PREDICT_NAIVEBAYES_')
    else:
        outtable = ibmdbpy.utils.check_tablename(outtable)
        if idadf._idadb.exists_table(outtable):
            idadf._idadb.drop_table(outtable)

    if outtableProb is not None:
        outtableProb = ibmdbpy.utils.check_tablename(outtableProb)
        if idadf._idadb.exists_table(outtableProb):
            idadf._idadb.drop_table(outtableProb)

    self.outtable = outtable
    self.outtableProb = outtableProb
    self.mestimation = mestimation

    # Run the stored procedure against a temporary view of idadf's
    # current state; the view is always removed afterwards.
    idadf.internal_state._create_view()
    view_of_input = idadf.internal_state.current_state

    try:
        idadf._idadb._call_stored_procedure("IDAX.PREDICT_NAIVEBAYES ",
                                            model=self.modelname,
                                            intable=view_of_input,
                                            id=column_id,
                                            outtable=self.outtable,
                                            outtableProb=self.outtableProb,
                                            mestimation=self.mestimation)
    finally:
        idadf.internal_state._delete_view()
        idadf._idadb._autocommit()

    self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, self.outtable)
    return self.labels_
def predict(self, idadf, column_id=None, outtable=None):
    """
    Apply the K-means clustering model to new data.

    Parameters
    ----------
    idadf : IdaDataFrame
        IdaDataFrame to be used as input.
    column_id : str
        The column of the input table that identifies a unique instance ID.
        Default: the same id column that is specified in the stored
        procedure to build the model.
    outtable : str
        The name of the output table where the assigned clusters are
        stored. If this parameter is not specified, it is generated
        automatically. If the parameter corresponds to an existing table
        in the database, it is replaced.

    Returns
    -------
    IdaDataFrame
        IdaDataFrame containing the closest cluster for each data point
        referenced by its ID.
    """
    # CONSISTENCY FIX: use isinstance (like the other predict methods) so
    # IdaDataFrame subclasses are accepted as well.
    if not isinstance(idadf, ibmdbpy.IdaDataFrame):
        raise TypeError("Argument should be an IdaDataFrame")

    # Resolve and validate the instance-ID column.
    if column_id is None:
        column_id = self._column_id
    if column_id not in idadf.columns:
        raise ValueError("No id columns is available in IdaDataFrame:" +
                         column_id +
                         ". Either create a new ID column using add_column_id function" +
                         " or give the name of a column that can be used as ID")

    if self._idadb is None:
        raise IdaKMeansError("No KMeans model was trained before")

    if outtable is None:
        # BUG FIX: this names an output *table*, not a model — use the same
        # helper as the Naive Bayes predict does.
        outtable = idadf._idadb._get_valid_tablename('PREDICT_KMEANS_')
    else:
        # NOTE(review): a previously stored self.outtable silently overrides
        # the explicit argument; kept as-is for backward compatibility.
        if self.outtable:
            outtable = self.outtable
        outtable = ibmdbpy.utils.check_tablename(outtable)

    if idadf._idadb.exists_table(outtable):
        idadf._idadb.drop_table(outtable)
    self.outtable = outtable

    # Run the stored procedure against a temporary view of idadf's
    # current state; the view is always removed afterwards.
    idadf.internal_state._create_view()
    tmp_view_name = idadf.internal_state.current_state
    if "." in tmp_view_name:
        tmp_view_name = tmp_view_name.split('.')[-1]

    try:
        idadf._idadb._call_stored_procedure("IDAX.PREDICT_KMEANS ",
                                            model=self.modelname,
                                            intable=tmp_view_name,
                                            id=column_id,
                                            outtable=self.outtable)
    finally:
        idadf.internal_state._delete_view()
        idadf._idadb.commit()

    self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, outtable, indexer=column_id)
    return self.labels_
def predict(self, idadf, outtable=None, transaction_id=None, item_id=None, type="rules", limit=1, sort=None): """ Apply the rules and patterns of an association rules model to other transactions. You can apply all rules or only specific rules according to specified criteria. Parameters ---------- idadf : IdaDataFrame IdaDataFrame to be used as input. outtable : str, optional The name of the output table in which the mapping between the input sequences and the associated rules or patterns is written. If the parameter corresponds to an existing table in the database, it is replaced. transaction_id : str, optional The column of the input table that identifies the transaction ID. By default, this is the same tid column that is specified in the stored procedure to build the model. item_id : str, optional The column of the input table that identifies an item of the transaction. By default, this is the same item column that is specified in the stored procedure to build the model. type : str, optional, default : "rules" The type of information that is written in the output table. The following values are possible: ‘rules’ and ‘patterns’. limit : int, optional, >=1, default: 1 The maximum number of rules or patterns that is written in the output table for each input sequence. sort : str or list, optional A list of keywords that indicates the order in which the rules or patterns are written in the output table. The order of the list is descending. The items are separated by semicolons. The following values are possible: ‘support’, ‘confidence’, ‘lift’, and ‘length’. The ‘confidence’ value can only be specified if the type parameter is ‘rules’. If the type parameter is ‘rules’, the default is: support;confidence;length. If the type parameter is ‘patterns’, the default is: support;lift;length. Notes ----- When "type" is set to "rules", it looks like nothing is returned. 
""" if not isinstance(idadf, ibmdbpy.IdaDataFrame): raise TypeError("Argument should be an IdaDataFrame") if sort is not None: sort = ';'.join(sort) if transaction_id is None: transaction_id = self.transaction_id if item_id is None: item_id = self.item_id # Check the ID if transaction_id not in idadf.columns: raise ValueError("Transaction id column" + transaction_id + " is not available in IdaDataFrame.") if self._idadb is None: raise IdaAssociationRulesError( "No Association rules model was trained before.") # The version where we don't replace the outtable if it exists but raise an exception #if outtable is not None: # if idadf._idadb.exists_table(outtable): # raise ValueError("Table "+ outtable +" already exists.") #else: # outtable = idadf._idadb._get_valid_modelname('PREDICT_ASSOCRULES_') if self.outtable is None: self.outtable = idadf._idadb._get_valid_tablename('NAIVEBAYES_') else: self.outtable = ibmdbpy.utils.check_tablename(self.outtable) if idadf._idadb.exists_table(self.outtable): idadf._idadb.drop_table(self.outtable) self.outtable = outtable self.type = type self.limit = limit self.sort = sort # Create a temporay view idadf.internal_state._create_view() tmp_view_name = idadf.internal_state.current_state if "." in tmp_view_name: tmp_view_name = tmp_view_name.split('.')[-1] try: idadf._idadb._call_stored_procedure("IDAX.PREDICT_ASSOCRULES ", model=self.modelname, intable=tmp_view_name, outtable=outtable, tid=transaction_id, item=item_id, type=type, limit=limit, sort=sort) except: raise finally: idadf.internal_state._delete_view() idadf._cursor.commit() self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, outtable) return self.labels_
def spearman(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the spearman rho correlation coefficients between a set of
    features and a set of target in an IdaDataFrame.

    Parameters
    ----------
    idadf : IdaDataFrame
    target : str or list of str, optional
        A column or list of columns against to be used as target.
        Per default, consider all columns
    features : str or list of str, optional
        A column or list of columns to be used as features.
        Per default, consider all columns.
    ignore_indexer : bool, default: True
        Per default, ignore the column declared as indexer in idadf

    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target

    Notes
    -----
    Input columns as target and features should be numerical.
    This function is a wrapper for pearson: values are replaced by their
    in-database rank, and pearson is computed on the ranks.
    The scalability of this approach is not very good. Should not be used
    on high dimensional data.

    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> spearman(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError("Correlation-based measure not available for non-numerical column %s" % feature)

    if ignore_indexer is True:
        if idadf.indexer:
            # BUG FIX: also guard on membership in *features*; the indexer
            # can be numerical yet absent from a user-supplied feature list,
            # in which case list.remove would raise ValueError.
            if idadf.indexer in numerical_columns and idadf.indexer in features:
                features.remove(idadf.indexer)

    # NOTE: the original re-checked "if features is None" here, but features
    # is always assigned above, so that branch was unreachable and removed.
    numerical_features = [x for x in features if x in numerical_columns]
    numerical_targets = [x for x in target if x in numerical_columns]
    numerical_features = list(set(numerical_features) | set(numerical_targets))

    # Replace each column by its rank, computed in-database.
    agg_list = ["CAST(RANK() OVER (ORDER BY \"%s\") AS INTEGER) AS \"%s\"" % (x, x)
                for x in numerical_features]
    agg_string = ', '.join(agg_list)

    expression = "SELECT %s FROM %s" % (agg_string, idadf.name)

    viewname = idadf._idadb._create_view_from_expression(expression)
    # Always drop the temporary ranked view, even when pearson raises.
    try:
        idadf_rank = ibmdbpy.IdaDataFrame(idadf._idadb, viewname)
        return pearson(idadf_rank, target=target,
                       features=numerical_features,
                       ignore_indexer=ignore_indexer)
    finally:
        idadf._idadb.drop_view(viewname)
def test_idadf_empty(self, idadb, df):
    """An IdaDataFrame opened on a freshly created table reports empty is True."""
    table_name = "TEST_EMPTY_3496593727406047264076"
    idadb._create_table(df, table_name)
    # BUG FIX: drop the table even when the assertion fails, so a failed run
    # does not leave the fixture database polluted.
    try:
        to_test = ibmdbpy.IdaDataFrame(idadb, table_name)
        # .empty must be exactly the boolean True, not merely truthy.
        assert (to_test.empty is True)
    finally:
        idadb.drop_table(table_name)
def discretize(idadf, columns=None, disc="em", target=None, bins=None, outtable=None, clear_existing=False):
    """
    Discretize a set of numerical columns from an IdaDataFrame and returns
    an IdaDataFrame open on the discretized version of the dataset.

    Parameters
    ----------
    idadf : IdaDataFrame
    columns : str or list of str, optional
        A column or list of columns to be discretized
    disc : "ef", "em", "ew", "ewn" default: "em"
        Discretization method to be used
            - ef: Discretization bins of equal frequency
            - em: Discretization bins of minimal entropy
            - ew: Discretization bins of equal width
            - ewn: Discretization bins of equal width with human-friendly limits
    target : str
        Target column again which the discretization will be done.
        Relevant only for "em" discretization.
    bins : int, optional
        Number of bins. Not relevant for "em" discretization.
    outtable : str, optional
        The name of the output table where the assigned clusters are
        stored. If this parameter is not specified, it is generated
        automatically. If the parameter corresponds to an existing table
        in the database, it is replaced.
    clear_existing : bool, default: False
        If set to True, a table will be replaced when a table with the same
        name already exists in the database.

    Returns
    -------
    IdaDataFrame
        IdaDataFrame opened on the discretized data set.
    """
    if columns is None:
        columns = idadf._get_numerical_columns()
        if target is not None:
            # BUG FIX: the original filtered on "columns != target", which
            # compares the whole list to target (always True), so the target
            # column was never actually excluded.
            columns = [x for x in columns if x != target]
    else:
        if isinstance(columns, six.string_types):
            columns = [columns]

    # Resolve which IDAX discretization procedure to call.
    stored_proc = _check(idadf, columns, disc, target, bins, outtable)

    bound_outtable = idadf._idadb._get_valid_tablename('DISC_BOUNDS_%s_' % idadf.tablename)
    intable = idadf.name  # either the table or a view on the top
    incolumn = "\";\"".join(columns)

    # Calculate bounds
    idadf._idadb._call_stored_procedure("IDAX.%s" % stored_proc,
                                        outtable=bound_outtable,
                                        intable=intable,
                                        incolumn=incolumn,
                                        target=target,
                                        bins=bins)

    # Create discretized dataset
    if outtable is None:
        disc_outtable = idadf._idadb._get_valid_tablename('DISC_%s_' % idadf.tablename)
    else:
        if clear_existing is True:
            # Best-effort drop: ignore the error if the table does not exist.
            try:
                idadf._idadb.drop_table(outtable)
            except:
                pass
        disc_outtable = outtable

    # Always drop the intermediate bounds table, even on failure.
    try:
        idadf._idadb._call_stored_procedure("IDAX.APPLY_DISC",
                                            outtable=disc_outtable,
                                            intable=intable,
                                            btable=bound_outtable,
                                            replace="T")
    finally:
        idadf._idadb.drop_table(bound_outtable)

    return ibmdbpy.IdaDataFrame(idadf._idadb, disc_outtable)