def plot(self):
    """
    ---------------------------------------------------------------------------
    Draws the model if the number of predictors is 2 or 3.
    """
    if (2 <= len(self.X) <= 3):
        vDataFrame(self.name, self.cursor).scatter(columns=self.X,
                                                   catcol="dbscan_cluster",
                                                   max_cardinality=100,
                                                   max_nb_points=10000)
    else:
        raise ValueError("Clustering Plots are only available in 2D or 3D")
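# Hypothetical usage sketch (names assumed, not taken from the source): once a
# DBSCAN model `model` has been fitted on 2 or 3 predictors, plot() scatters
# the labeled relation colored by the "dbscan_cluster" column.
#
#     model.plot()  # raises ValueError if more than 3 predictors were used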
def deploy_to_DB(self, name: str, view: bool = True, cutoff: float = -1):
    """
    ---------------------------------------------------------------------------
    Deploys the model in the Vertica DB by creating a relation.

    Parameters
    ----------
    name: str
        Relation name. It must include the schema (the default schema is
        public).
    view: bool, optional
        If set to False, a table will be created instead of a view.
    cutoff: float, optional
        Probability cutoff. If this number is not between 0 and 1, a column
        corresponding to the probability of belonging to class 1 will be
        generated.

    Returns
    -------
    vDataFrame
        the vDataFrame of the new relation.
    """
    check_types([("name", name, [str], False),
                 ("view", view, [bool], False),
                 ("cutoff", cutoff, [int, float], False)])
    relation = "TABLE" if not (view) else "VIEW"
    sql = "CREATE {} {} AS SELECT {}, {} AS {} FROM {}".format(
        relation, name, ", ".join(self.X), self.deploySQL(cutoff),
        self.y, self.test_relation)
    self.cursor.execute(sql)
    return vDataFrame(name, self.cursor)
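# Hypothetical usage sketch (relation and column names are assumptions): with a
# fitted binary classifier `model`, deploy_to_DB materializes its predictions
# as a relation that can then be queried like any other table or view.
#
#     scored = model.deploy_to_DB("public.titanic_scored", view=True, cutoff=0.5)
#     # leaving cutoff outside [0, 1] (the default -1) exports the raw
#     # probability of class 1 instead of a hard prediction
#     proba = model.deploy_to_DB("public.titanic_proba")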
def plot(self, voronoi: bool = False):
    """
    ---------------------------------------------------------------------------
    Draws the KMeans clusters.

    Parameters
    ----------
    voronoi: bool, optional
        If set to True, a Voronoi plot will be drawn. It is only available
        for KMeans using 2 predictors.
    """
    if (voronoi):
        if (len(self.X) == 2):
            from vertica_ml_python.learn.plot import voronoi_plot
            query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'centers')".format(self.name)
            self.cursor.execute(query)
            clusters = self.cursor.fetchall()
            voronoi_plot(clusters=clusters, columns=self.X)
        else:
            raise ValueError("Voronoi Plots are only available in 2D")
    else:
        vdf = vDataFrame(self.input_relation, self.cursor)
        self.predict(vdf, "kmeans_cluster")
        if (len(self.X) <= 3):
            vdf.scatter(columns=self.X,
                        catcol="kmeans_cluster",
                        max_cardinality=100,
                        max_nb_points=10000)
        else:
            raise ValueError("Clustering Plots are only available in 2D or 3D")
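# Hypothetical usage sketch: for a KMeans model `model` fitted on exactly two
# predictors, both plotting modes are available.
#
#     model.plot()              # scatter plot colored by "kmeans_cluster"
#     model.plot(voronoi=True)  # Voronoi diagram built from the model centers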
def to_vdf(self):
    """
    ---------------------------------------------------------------------------
    Creates a vDataFrame of the model.

    Returns
    -------
    vDataFrame
        model vDataFrame
    """
    return (vDataFrame(self.name, self.cursor))
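# Hypothetical usage sketch: to_vdf wraps the model relation (self.name) in a
# vDataFrame so it can be explored with the usual vDataFrame methods.
#
#     vdf = model.to_vdf()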
def Balance(name: str,
            input_relation: str,
            y: str,
            cursor=None,
            method: str = "hybrid",
            ratio: float = 0.5):
    """
    ---------------------------------------------------------------------------
    Creates a view with an equal distribution of the input data based on the
    response_column.

    Parameters
    ----------
    name: str
        Name of the view.
    input_relation: str
        Relation used to create the new relation.
    y: str
        Response column.
    cursor: DBcursor, optional
        Vertica DB cursor.
    method: str, optional
        Method used to do the balancing.
            hybrid : Performs over-sampling and under-sampling on different
                     classes so that each class is equally represented.
            over   : Over-samples on all classes, with the exception of the
                     most majority class, towards the most majority class's
                     cardinality.
            under  : Under-samples on all classes, with the exception of the
                     most minority class, towards the most minority class's
                     cardinality.
    ratio: float, optional
        The desired ratio between the majority class and the minority class.
        This value has no effect when used with the 'hybrid' balance method.

    Returns
    -------
    vDataFrame
        vDataFrame of the created view
    """
    check_types([("name", name, [str], False),
                 ("input_relation", input_relation, [str], False),
                 ("y", y, [str], False),
                 ("method", method, ["hybrid", "over", "under"], True),
                 ("ratio", ratio, [float], False)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    method = method.lower()
    sql = "SELECT BALANCE('{}', '{}', '{}', '{}_sampling' USING PARAMETERS sampling_ratio = {})".format(
        name, input_relation, y, method, ratio)
    cursor.execute(sql)
    return (vDataFrame(name, cursor))
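# Hypothetical usage sketch (relation and column names are assumptions): Balance
# wraps Vertica's BALANCE meta-function and returns a vDataFrame built on the
# newly created view.
#
#     balanced = Balance(name="public.titanic_balanced",
#                        input_relation="public.titanic",
#                        y="survived",
#                        cursor=cursor,
#                        method="under")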
def load_amazon(cursor=None, schema: str = 'public', name: str = 'amazon'):
    """
    ---------------------------------------------------------------------------
    Ingests the amazon dataset in the Vertica DB (Dataset ideal for TS and
    Regression). If a table with the same name and schema already exists,
    this function will create a vDataFrame from the input relation.

    Parameters
    ----------
    cursor: DBcursor, optional
        Vertica DB cursor.
    schema: str, optional
        Schema of the new relation. The default schema is public.
    name: str, optional
        Name of the new relation.

    Returns
    -------
    vDataFrame
        the amazon vDataFrame.

    See Also
    --------
    load_iris         : Ingests the iris dataset in the Vertica DB.
                        (Clustering / Classification).
    load_market       : Ingests the market dataset in the Vertica DB.
                        (Basic Data Exploration).
    load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
                        (Time Series / Regression).
    load_titanic      : Ingests the titanic dataset in the Vertica DB.
                        (Classification).
    load_winequality  : Ingests the winequality dataset in the Vertica DB.
                        (Regression / Classification).
    """
    check_types([("schema", schema, [str], False),
                 ("name", name, [str], False)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    except:
        cursor.execute(
            "CREATE TABLE {}.{}(\"number\" Integer, \"date\" Date, \"state\" Varchar(32));"
            .format(str_column(schema), str_column(name)))
        try:
            path = os.path.dirname(
                vertica_ml_python.__file__) + "/learn/data/amazon.csv"
            query = "COPY {}.{}(\"number\", \"date\", \"state\") FROM {} DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;".format(
                str_column(schema), str_column(name), "{}")
            if ("vertica_python" in str(type(cursor))):
                with open(path, "r") as fs:
                    cursor.copy(query.format('STDIN'), fs)
            else:
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            vdf = vDataFrame(name, cursor, schema=schema)
        except:
            cursor.execute("DROP TABLE {}.{}".format(str_column(schema),
                                                     str_column(name)))
            raise
    return (vdf)
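# Hypothetical usage sketch: load_amazon ingests the CSV shipped with the
# package on first use; if public.amazon already exists, it simply wraps the
# existing table in a vDataFrame.
#
#     amazon = load_amazon(cursor)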