예제 #1
0
    def plot(self):
        """
        ---------------------------------------------------------------------------
        Draws the model if the number of predictors is 2 or 3.

        The relation named after the model is drawn as a scatter plot of the
        predictors, categorised by its "dbscan_cluster" column.

        Raises
        ------
        ValueError
            If the model does not use 2 or 3 predictors.
        """
        nb_predictors = len(self.X)
        # Guard clause: only 2D and 3D scatter plots are supported.
        if not (2 <= nb_predictors <= 3):
            raise ValueError("Clustering Plots are only available in 2D or 3D")
        model_relation = vDataFrame(self.name, self.cursor)
        model_relation.scatter(columns=self.X,
                               catcol="dbscan_cluster",
                               max_cardinality=100,
                               max_nb_points=10000)
예제 #2
0
    def deploy_to_DB(self, name: str, view: bool = True, cutoff: float = -1):
        """
        ---------------------------------------------------------------------------
        Deploys the model in the Vertica DB by creating a relation.

        The relation selects the predictors and the model's deployment
        expression (aliased as the response column) from the test relation.

        Parameters
        ----------
        name: str
            Relation name. It must include the schema (the default schema is
            public).
        view: bool
            If set to false, it will create a table instead of a view.
        cutoff: float, optional
            Probability cutoff. If this number is not between 0 and 1, a column
            corresponding to the probability to be of class 1 will be generated.

        Returns
        -------
        vDataFrame
            the vDataFrame of the new relation.
        """
        check_types([("name", name, [str], False),
                     ("view", view, [bool], False),
                     ("cutoff", cutoff, [int, float], False)])
        relation_type = "VIEW" if view else "TABLE"
        predictors = ", ".join(self.X)
        prediction = self.deploySQL(cutoff)
        statement = "CREATE {} {} AS SELECT {}, {} AS {} FROM {}".format(
            relation_type, name, predictors, prediction, self.y,
            self.test_relation)
        self.cursor.execute(statement)
        return vDataFrame(name, self.cursor)
예제 #3
0
    def plot(self, voronoi: bool = False):
        """
        ---------------------------------------------------------------------------
        Draws the KMeans clusters.

        Parameters
        ----------
        voronoi: bool, optional
            If set to true, a voronoi plot will be drawn. It is only available for
            KMeans using 2 predictors.

        Raises
        ------
        ValueError
            If voronoi is set and the model does not use exactly 2 predictors,
            or if voronoi is not set and the model uses more than 3 predictors.
        """
        nb_predictors = len(self.X)
        if voronoi:
            if nb_predictors != 2:
                raise ValueError("Voronoi Plots are only available in 2D")
            from vertica_ml_python.learn.plot import voronoi_plot
            # The cluster centers are fetched from the model's 'centers'
            # attribute.
            query = "SELECT GET_MODEL_ATTRIBUTE(USING PARAMETERS model_name = '{}', attr_name = 'centers')".format(
                self.name)
            self.cursor.execute(query)
            clusters = self.cursor.fetchall()
            voronoi_plot(clusters=clusters, columns=self.X)
            return
        # Validate the dimension first so that an unsupported model fails
        # before the vDataFrame is built and the prediction is computed.
        if nb_predictors > 3:
            raise ValueError(
                "Clustering Plots are only available in 2D or 3D")
        vdf = vDataFrame(self.input_relation, self.cursor)
        self.predict(vdf, "kmeans_cluster")
        vdf.scatter(columns=self.X,
                    catcol="kmeans_cluster",
                    max_cardinality=100,
                    max_nb_points=10000)
예제 #4
0
	def to_vdf(self):
		"""
		---------------------------------------------------------------------------
		Creates a vDataFrame of the model.

		The vDataFrame is built from the model name and the model cursor.

		Returns
		-------
		vDataFrame
			model vDataFrame
		"""
		model_vdf = vDataFrame(self.name, self.cursor)
		return model_vdf
예제 #5
0
def Balance(name: str,
            input_relation: str,
            y: str,
            cursor=None,
            method: str = "hybrid",
            ratio: float = 0.5):
    """
    ---------------------------------------------------------------------------
    Creates a view with an equal distribution of the input data based on the
    response_column.

    Parameters
    ----------
    name: str
        Name of the view.
    input_relation: str
        Relation used to create the new relation.
    y: str
        Response column.
    cursor: DBcursor, optional
        Vertica DB cursor. If it is not provided, a cursor is created using
        the auto-connection.
    method: str, optional
        Method used to do the balancing.
            hybrid : Performs over-sampling and under-sampling on different
                classes so each class is equally represented.
            over   : Over-samples on all classes, with the exception of the
                most majority class, towards the most majority class's
                cardinality.
            under  : Under-samples on all classes, with the exception of the
                most minority class, towards the most minority class's
                cardinality.
    ratio: int / float, optional
        The desired ratio between the majority class and the minority class.
        This value has no effect when used with balance method 'hybrid'.

    Returns
    -------
    vDataFrame
        vDataFrame of the created view
    """
    # Integers (e.g. ratio = 1) are valid ratios, so both numeric types are
    # accepted.
    check_types([("name", name, [str], False),
                 ("input_relation", input_relation, [str], False),
                 ("y", y, [str], False),
                 ("method", method, ["hybrid", "over", "under"], True),
                 ("ratio", ratio, [int, float], False)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    method = method.lower()

    def as_sql_literal(value):
        # Values are embedded in single-quoted SQL literals: double any single
        # quote so that it cannot terminate the literal.
        return str(value).replace("'", "''")

    sql = "SELECT BALANCE('{}', '{}', '{}', '{}_sampling' USING PARAMETERS sampling_ratio = {})".format(
        as_sql_literal(name), as_sql_literal(input_relation),
        as_sql_literal(y), method, ratio)
    cursor.execute(sql)
    return vDataFrame(name, cursor)
예제 #6
0
def load_amazon(cursor=None, schema: str = 'public', name: str = 'amazon'):
    """
    ---------------------------------------------------------------------------
    Ingests the amazon dataset in the Vertica DB (Dataset ideal for TS and
    Regression). If a table with the same name and schema already exists,
    this function will create a vDataFrame from the input relation.

    Parameters
    ----------
    cursor: DBcursor, optional
        Vertica DB cursor. If it is not provided, a cursor is created using
        the auto-connection.
    schema: str, optional
        Schema of the new relation. The default schema is public.
    name: str, optional
        Name of the new relation.

    Returns
    -------
    vDataFrame
        the amazon vDataFrame.

    Raises
    ------
    Exception
        Any error raised while loading the data is re-raised after the newly
        created table has been dropped.

    See Also
    --------
    load_iris         : Ingests the iris dataset in the Vertica DB.
        (Clustering / Classification).
    load_market       : Ingests the market dataset in the Vertica DB.
        (Basic Data Exploration).
    load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
        (Time Series / Regression).
    load_titanic      : Ingests the titanic dataset in the Vertica DB.
        (Classification).
    load_winequality  : Ingests the winequality dataset in the Vertica DB.
        (Regression / Classification).
    """
    check_types([("schema", schema, [str], False),
                 ("name", name, [str], False)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        vdf = vDataFrame(name, cursor, schema=schema)
    except Exception:
        # Only ordinary errors trigger the ingestion fallback; interrupts such
        # as KeyboardInterrupt / SystemExit must propagate instead of creating
        # a table.
        cursor.execute(
            "CREATE TABLE {}.{}(\"number\" Integer, \"date\" Date, \"state\" Varchar(32));"
            .format(str_column(schema), str_column(name)))
        try:
            path = os.path.join(os.path.dirname(vertica_ml_python.__file__),
                                "learn", "data", "amazon.csv")
            # The data source placeholder is left as "{}" and filled below,
            # depending on the kind of cursor.
            query = "COPY {}.{}(\"number\", \"date\", \"state\") FROM {} DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;".format(
                str_column(schema), str_column(name), "{}")
            if ("vertica_python" in str(type(cursor))):
                # vertica_python cursors stream the file through STDIN.
                with open(path, "r") as fs:
                    cursor.copy(query.format('STDIN'), fs)
            else:
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            vdf = vDataFrame(name, cursor, schema=schema)
        except BaseException:
            # Cleanup on any failure (interrupts included): drop the table
            # created above, then re-raise the original error.
            cursor.execute("DROP TABLE {}.{}".format(str_column(schema),
                                                     str_column(name)))
            raise
    return vdf