示例#1
0
def load_dataset(schema: str,
                 name: str,
                 dtype: dict,
                 copy_cols: list = None,
                 dataset_name: str = ""):
    """
    General function to ingest one of the packaged datasets.

    Parameters
    ----------
    schema: str
        Schema of the relation (falls back to v_temp_schema when empty).
    name: str
        Name of the relation to load or create.
    dtype: dict
        Column-name -> Vertica-type mapping used to create the table.
    copy_cols: list, optional
        Columns listed in the COPY statement; defaults to all keys of dtype.
    dataset_name: str, optional
        Base name of the CSV file shipped under verticapy/data/.

    Returns
    -------
    vDataFrame
        vDataFrame built on the ingested (or pre-existing) relation.
    """
    check_types([("schema", schema, [str]), ("name", name, [str])])

    # Avoid the shared-mutable-default pitfall: normalize None to a new list.
    if copy_cols is None:
        copy_cols = []

    try:

        # Fast path: the relation already exists, just wrap it.
        vdf = vDataFrame(name, schema=schema)

    except Exception:

        # The relation does not exist yet: create it and COPY the CSV in.
        name = quote_ident(name)
        schema = "v_temp_schema" if not (schema) else quote_ident(schema)
        create_table(table_name=name, dtype=dtype, schema=schema)

        try:

            path = os.path.dirname(verticapy.__file__)
            path += f"/data/{dataset_name}.csv"
            if not (copy_cols):
                copy_cols = [quote_ident(col) for col in dtype]
            copy_cols = ", ".join(copy_cols)
            query = ("COPY {0}.{1}({2}) FROM {3} DELIMITER ',' NULL '' "
                     "ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;").format(
                         schema, name, copy_cols, "{}")

            cur = current_cursor()

            if isinstance(cur, vertica_python.vertica.cursor.Cursor):

                # Native vertica-python cursor: stream the file via STDIN.
                query = query.format("STDIN")
                executeSQL(query,
                           title="Ingesting the data.",
                           method="copy",
                           path=path)

            else:

                # Other cursors: have the server read the local file.
                query = query.format("LOCAL '{0}'".format(path))
                executeSQL(query, title="Ingesting the data.")

            executeSQL("COMMIT;", title="Commit.")
            vdf = vDataFrame(name, schema=schema)

        except Exception:

            # Ingestion failed: drop the half-created table, then re-raise.
            drop(schema + "." + name, method="table")
            raise

    return vdf
示例#2
0
 def test_vDF_sql(self, titanic_vd):
     # Build a vDataFrame directly from a SQL statement; the leading
     # comment and the trailing ';;' are deliberately part of the query.
     sql = """-- Selecting some columns \n
              SELECT 
                 age, 
                 fare 
              FROM titanic 
              WHERE age IS NOT NULL;;"""
     result = vDataFrame(sql=sql)
     assert result.shape() == (997, 2)
     # usecols restricts the projection to a subset of the query's columns.
     result = vDataFrame(sql=sql, usecols=["age"])
     assert result.shape() == (997, 1)
示例#3
0
def load_dataset_num(table_name: str = "dataset_num", schema: str = "public"):
    """Create and load a small, purely numerical test dataset.

    Drops any pre-existing relation of the same name, recreates it with an
    INT id column plus four FLOAT columns, inserts ten fixed rows, and
    returns a vDataFrame built on the new relation.
    """
    rows = [
        [1, 7.2, 3.6, 6.1, 2.5],
        [2, 7.7, 2.8, 6.7, 2.0],
        [3, 7.7, 3.0, 6.1, 2.3],
        [4, 7.9, 3.8, 6.4, 2.0],
        [5, 4.4, 2.9, 1.4, 0.2],
        [6, 4.6, 3.6, 1.0, 0.2],
        [7, 4.7, 3.2, 1.6, 0.2],
        [8, 6.5, 2.8, 4.6, 1.5],
        [9, 6.8, 2.8, 4.8, 1.4],
        [10, 7.0, 3.2, 4.7, 1.4],
    ]
    relation = "{}.{}".format(quote_ident(schema), quote_ident(table_name))

    # Start from a clean slate, then recreate the table.
    drop(name=relation, method="table")
    columns = {"Id": "INT"}
    for idx in range(1, 5):
        columns["col{}".format(idx)] = "FLOAT"
    create_table(table_name=table_name, schema=schema, dtype=columns)
    insert_into(table_name=table_name, schema=schema, data=rows, copy=False)

    return vDataFrame(input_relation=relation)
def tr_data_vd(base):
    """Fixture: create public.tr_data, yield it as a vDataFrame, drop it after.

    The table mixes a numeric response (transportation) with categorical
    predictors, matching the other dataset fixtures in this module.
    """
    base.cursor.execute("DROP TABLE IF EXISTS public.tr_data")
    base.cursor.execute(
        'CREATE TABLE IF NOT EXISTS public.tr_data(Id INT, transportation INT, gender VARCHAR, "owned cars" INT, cost VARCHAR, income CHAR(4))'
    )
    # Fixed fixture rows; the loop below emits the same INSERT statements
    # the previous hand-written version did, one per row.
    rows = [
        (1, 0, "Male", 0, "Cheap", "Low"),
        (2, 0, "Male", 1, "Cheap", "Med"),
        (3, 1, "Female", 1, "Cheap", "Med"),
        (4, 0, "Female", 0, "Cheap", "Low"),
        (5, 0, "Male", 1, "Cheap", "Med"),
        (6, 1, "Male", 0, "Standard", "Med"),
        (7, 1, "Female", 1, "Standard", "Med"),
        (8, 2, "Female", 1, "Expensive", "Hig"),
        (9, 2, "Male", 2, "Expensive", "Med"),
        (10, 2, "Female", 2, "Expensive", "Hig"),
    ]
    for row_id, transportation, gender, owned_cars, cost, income in rows:
        base.cursor.execute(
            "INSERT INTO tr_data VALUES ({}, {}, '{}', {}, '{}', '{}')".format(
                row_id, transportation, gender, owned_cars, cost, income))
    base.cursor.execute("COMMIT")

    tr_data = vDataFrame(input_relation="public.tr_data", cursor=base.cursor)
    yield tr_data
    # Teardown: drop the table, suppressing any deprecation warnings raised.
    with warnings.catch_warnings(record=True) as w:
        drop(name="public.tr_data", cursor=base.cursor)
    def test_creating_vDF_using_input_relation_vcolumns(
            self, base, titanic_vd):
        # Restrict the vDataFrame to two columns of the titanic relation.
        titanic_subset = vDataFrame(
            input_relation="public.titanic",
            usecols=["age", "survived"],
            cursor=base.cursor,
        )
        assert titanic_subset["survived"].count() == 1234
示例#6
0
def load_dataset_reg(table_name: str = "dataset_reg", schema: str = "public"):
    """Create and load a small mixed-type regression test dataset.

    Drops any pre-existing relation of the same name, recreates it with a
    numeric response (transportation) plus categorical predictors, inserts
    ten fixed rows, and returns a vDataFrame built on the new relation.
    """
    rows = [
        [1, 0, "Male", 0, "Cheap", "Low"],
        [2, 0, "Male", 1, "Cheap", "Med"],
        [3, 1, "Female", 1, "Cheap", "Med"],
        [4, 0, "Female", 0, "Cheap", "Low"],
        [5, 0, "Male", 1, "Cheap", "Med"],
        [6, 1, "Male", 0, "Standard", "Med"],
        [7, 1, "Female", 1, "Standard", "Med"],
        [8, 2, "Female", 1, "Expensive", "Hig"],
        [9, 2, "Male", 2, "Expensive", "Med"],
        [10, 2, "Female", 2, "Expensive", "Hig"],
    ]
    relation = "{}.{}".format(quote_ident(schema), quote_ident(table_name))

    # Start from a clean slate, then recreate the table.
    drop(name=relation, method="table")
    create_table(
        table_name=table_name,
        schema=schema,
        dtype={
            "Id": "INT",
            "transportation": "INT",
            "gender": "VARCHAR",
            "owned cars": "INT",
            "cost": "VARCHAR",
            "income": "CHAR(4)",
        },
    )
    insert_into(table_name=table_name, schema=schema, data=rows, copy=False)

    return vDataFrame(input_relation=relation)
示例#7
0
def Balance(
    name: str,
    input_relation: str,
    y: str,
    cursor=None,
    method: str = "hybrid",
    ratio: float = 0.5,
):
    """
---------------------------------------------------------------------------
Creates a view with an equal distribution of the input data based on the 
response_column.
 
Parameters
----------
name: str
    Name of the the view.
input_relation: str
    Relation to use to create the new relation.
y: str
    Response column.
cursor: DBcursor, optional
    Vertica DB cursor.
method: str, optional
    Method to use to do the balancing.
        hybrid : Performs over-sampling and under-sampling on different 
            classes so each class is equally represented.
        over   : Over-samples on all classes, with the exception of the 
            most majority class, towards the most majority class's cardinality. 
        under  : Under-samples on all classes, with the exception of the most 
            minority class, towards the most minority class's cardinality.
ratio: float, optional
    The desired ratio between the majority class and the minority class. This 
    value has no effect when used with balance method 'hybrid'.

Returns
-------
vDataFrame
    vDataFrame of the created view
    """
    # Normalize BEFORE validation: check_types compares against the
    # lowercase allowed values, so e.g. "Hybrid" must be lowered first
    # (previously the lowering happened after the check and mixed-case
    # inputs were rejected).
    method = method.lower()
    check_types(
        [
            ("name", name, [str],),
            ("input_relation", input_relation, [str],),
            ("y", y, [str],),
            ("method", method, ["hybrid", "over", "under"],),
            ("ratio", ratio, [float],),
        ]
    )
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    # The BALANCE meta-function requires Vertica >= 8.1.1.
    version(cursor=cursor, condition=[8, 1, 1])
    sql = "SELECT BALANCE('{}', '{}', '{}', '{}_sampling' USING PARAMETERS sampling_ratio = {})".format(
        name, input_relation, y, method, ratio
    )
    cursor.execute(sql)
    return vDataFrame(name, cursor)
示例#8
0
 def test_to_memmodel(self, model):
     """The in-memory model must agree with to_python() and with Vertica SQL."""
     mmodel = model.to_memmodel()
     samples = [["Male", 0, "Cheap", "Low"], ["Female", 3, "Expensive", "Hig"]]
     res = mmodel.predict(samples)
     res_py = model.to_python()(samples)
     assert res[0] == res_py[0]
     assert res[1] == res_py[1]
     res = mmodel.predict_proba(samples)
     res_py = model.to_python(return_proba=True)(samples)
     # Compare every class probability of both sample rows.
     for i in range(2):
         for j in range(3):
             assert res[i][j] == res_py[i][j]
     vdf = vDataFrame("public.xgbc_data")
     cols = ['"Gender"', '"owned cars"', '"cost"', '"income"']
     vdf["prediction_sql"] = mmodel.predict_sql(cols)
     # Compute the per-class probability expressions ONCE instead of
     # re-deriving them three times with identical arguments.
     proba_sql = mmodel.predict_proba_sql(cols)
     for k in range(3):
         vdf["prediction_proba_sql_{}".format(k)] = proba_sql[k]
     model.predict(vdf, name="prediction_vertica_sql")
     for k in range(3):
         model.predict_proba(
             vdf,
             name="prediction_proba_vertica_sql_{}".format(k),
             pos_label=model.classes_[k],
         )
     score = vdf.score("prediction_sql", "prediction_vertica_sql", "accuracy")
     assert score == pytest.approx(1.0)
     # The SQL-generated probabilities must match Vertica's own predictions.
     for k in range(3):
         score = vdf.score(
             "prediction_proba_sql_{}".format(k),
             "prediction_proba_vertica_sql_{}".format(k),
             "r2",
         )
         assert score == pytest.approx(1.0)
示例#9
0
def load_dataset(cursor, schema: str, name: str, str_create: str,
                 str_copy: str, dataset_name: str):
    """
    General function to ingest a packaged dataset.

    Parameters
    ----------
    cursor
        Database cursor (validated/normalized by check_cursor).
    schema: str
        Schema of the relation.
    name: str
        Name of the relation to load or create.
    str_create: str
        Column definition list for the CREATE TABLE statement.
    str_copy: str
        Column list for the COPY statement.
    dataset_name: str
        Base name of the CSV file shipped under verticapy/learn/data/.

    Returns
    -------
    vDataFrame
        vDataFrame built on the ingested (or pre-existing) relation.
    """
    check_types([(
        "schema",
        schema,
        [str],
    ), (
        "name",
        name,
        [str],
    )])
    cursor = check_cursor(cursor)[0]
    try:
        # Fast path: the relation already exists, just wrap it.
        vdf = vDataFrame(name, cursor, schema=schema)
    except Exception:
        # The relation does not exist yet: create it and COPY the CSV in.
        cursor.execute("CREATE TABLE {}.{}({});".format(
            str_column(schema),
            str_column(name),
            str_create,
        ))
        try:
            path = os.path.dirname(
                verticapy.__file__) + "/learn/data/{}.csv".format(dataset_name)
            query = "COPY {}.{}({}) FROM {} DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;".format(
                str_column(schema), str_column(name), str_copy, "{}")
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native vertica-python cursor: stream the CSV through STDIN.
                with open(path, "r") as fs:
                    cursor.copy(query.format("STDIN"), fs)
            else:
                # Other cursors: have the server read the local file.
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except Exception:
            # Ingestion failed: drop the half-created table, then re-raise.
            cursor.execute("DROP TABLE {}.{}".format(str_column(schema),
                                                     str_column(name)))
            raise
    return vdf
示例#10
0
    def predict(self):
        """
	---------------------------------------------------------------------------
	Creates a vDataFrame of the model.

	Returns
	-------
	vDataFrame
 		the vDataFrame including the prediction.
		"""
        return vDataFrame(self.name, self.cursor)
 def test_to_memmodel(self, model):
     # The in-memory model and the exported Python function must agree.
     mmodel = model.to_memmodel()
     samples = [["Male", 0, "Cheap", "Low"],
                ["Female", 1, "Expensive", "Low"]]
     res = mmodel.predict(samples)
     res_py = model.to_python()(samples)
     assert res[0] == res_py[0]
     assert res[1] == res_py[1]
     # The generated SQL must reproduce Vertica's in-database prediction.
     vdf = vDataFrame("public.rfr_data")
     vdf["prediction_sql"] = mmodel.predict_sql(
         ['"Gender"', '"owned cars"', '"cost"', '"income"'])
     model.predict(vdf, name="prediction_vertica_sql")
     r2 = vdf.score("prediction_sql", "prediction_vertica_sql", "r2")
     assert r2 == pytest.approx(1.0)
示例#12
0
 def test_to_memmodel(self, model):
     # The in-memory model and the exported Python function must agree.
     mmodel = model.to_memmodel()
     samples = [[5.006, 3.418, 1.464, 0.244],
                [3.0, 11.0, 1993.0, 0.0]]
     res = mmodel.predict(samples)
     res_py = model.to_python()(samples)
     assert res[0] == res_py[0]
     assert res[1] == res_py[1]
     # The generated SQL must reproduce Vertica's in-database prediction.
     vdf = vDataFrame("public.bsk_data")
     vdf["prediction_sql"] = mmodel.predict_sql(
         ["col1", "col2", "col3", "col4"])
     model.predict(vdf, name="prediction_vertica_sql")
     accuracy = vdf.score("prediction_sql", "prediction_vertica_sql",
                          "accuracy")
     assert accuracy == pytest.approx(1.0)
示例#13
0
def bsk_data_vd(base):
    """Fixture: create public.bsk_data, yield it as a vDataFrame, drop it after.

    The table holds an INT id plus four FLOAT feature columns.
    """
    base.cursor.execute("DROP TABLE IF EXISTS public.bsk_data")
    base.cursor.execute(
        "CREATE TABLE IF NOT EXISTS public.bsk_data(Id INT, col1 FLOAT, col2 FLOAT, col3 FLOAT, col4 FLOAT)"
    )
    # Fixed fixture rows; the loop below emits the same INSERT statements
    # the previous hand-written version did, one per row.
    rows = [
        (1, 7.2, 3.6, 6.1, 2.5),
        (2, 7.7, 2.8, 6.7, 2.0),
        (3, 7.7, 3.0, 6.1, 2.3),
        (4, 7.9, 3.8, 6.4, 2.0),
        (5, 4.4, 2.9, 1.4, 0.2),
        (6, 4.6, 3.6, 1.0, 0.2),
        (7, 4.7, 3.2, 1.6, 0.2),
        (8, 6.5, 2.8, 4.6, 1.5),
        (9, 6.8, 2.8, 4.8, 1.4),
        (10, 7.0, 3.2, 4.7, 1.4),
    ]
    for row in rows:
        base.cursor.execute(
            "INSERT INTO bsk_data VALUES ({}, {}, {}, {}, {})".format(*row))
    base.cursor.execute("COMMIT")

    bsk_data = vDataFrame(input_relation="public.bsk_data", cursor=base.cursor)
    yield bsk_data
    # Teardown: drop the table, suppressing any deprecation warnings raised.
    with warnings.catch_warnings(record=True) as w:
        drop_table(name="public.bsk_data", cursor=base.cursor)
示例#14
0
def load_winequality(cursor=None, schema: str = "public", name: str = "winequality"):
    """
---------------------------------------------------------------------------
Ingests the winequality dataset in the Vertica DB (Dataset ideal for Regression
and Classification). If a table with the same name and schema already exists,
this function will create a vDataFrame from the input relation.

Parameters
----------
cursor: DBcursor, optional
    Vertica DB cursor.
schema: str, optional
    Schema of the new relation. The default schema is public.
name: str, optional
    Name of the new relation.

Returns
-------
vDataFrame
    the winequality vDataFrame.

See Also
--------
load_amazon       : Ingests the amazon dataset in the Vertica DB.
    (Time Series / Regression).
load_commodities  : Ingests the commodities dataset in the Vertica DB.
    (Time Series / Regression).
load_iris         : Ingests the iris dataset in the Vertica DB.
    (Clustering / Classification).
load_market       : Ingests the market dataset in the Vertica DB.
    (Basic Data Exploration).
load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
    (Time Series / Regression).
load_titanic      : Ingests the titanic dataset in the Vertica DB.
    (Classification).
    """
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        # Fast path: the relation already exists, just wrap it.
        vdf = vDataFrame(name, cursor, schema=schema)
    except Exception:
        # The relation does not exist yet: create it and COPY the CSV in.
        cursor.execute(
            'CREATE TABLE {}.{}("fixed_acidity" Numeric(6,3), "volatile_acidity" Numeric(7,4), "citric_acid" Numeric(6,3), "residual_sugar" Numeric(7,3), "chlorides" Float, "free_sulfur_dioxide" Numeric(7,2), "total_sulfur_dioxide" Numeric(7,2), "density" Float, "pH" Numeric(6,3), "sulphates" Numeric(6,3), "alcohol" Float, "quality" Integer, "good" Integer, "color" Varchar(20));'.format(
                str_column(schema), str_column(name)
            )
        )
        try:
            path = os.path.dirname(verticapy.__file__) + "/learn/data/winequality.csv"
            query = 'COPY {}.{}("fixed_acidity", "volatile_acidity", "citric_acid", "residual_sugar", "chlorides", "free_sulfur_dioxide", "total_sulfur_dioxide", "density", "pH", "sulphates", "alcohol", "quality", "good", "color") FROM {} DELIMITER \',\' NULL \'\' ENCLOSED BY \'"\' ESCAPE AS \'\\\' SKIP 1;'.format(
                str_column(schema), str_column(name), "{}"
            )
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native vertica-python cursor: stream the CSV through STDIN.
                with open(path, "r") as fs:
                    cursor.copy(query.format("STDIN"), fs)
            else:
                # Other cursors: have the server read the local file.
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except Exception:
            # Ingestion failed: drop the half-created table, then re-raise.
            cursor.execute(
                "DROP TABLE {}.{}".format(str_column(schema), str_column(name))
            )
            raise
    return vdf
示例#15
0
 def test_vDF_to_db(self, titanic_vd):
     drop("verticapy_titanic_tmp")

     def _run_case(relation_type, catalog, drop_method, prefix=""):
         # Export a filtered copy of titanic as the given relation type,
         # validate its shape / split column / catalog entry, and always
         # drop the relation afterwards (also on failure).
         try:
             titanic_vd.copy().to_db(
                 name="verticapy_titanic_tmp",
                 usecols=["age", "fare", "survived"],
                 relation_type=relation_type,
                 db_filter="age > 40",
                 nb_split=3,
             )
             exported = vDataFrame(prefix + "verticapy_titanic_tmp")
             assert exported.shape() == (220, 4)
             assert exported["_verticapy_split_"].min() == 0
             assert exported["_verticapy_split_"].max() == 2
             current_cursor().execute(
                 "SELECT table_name FROM {} WHERE table_name = 'verticapy_titanic_tmp'".format(
                     catalog)
             )
             row = current_cursor().fetchone()
             assert row[0] == "verticapy_titanic_tmp"
         except:
             if drop_method:
                 drop("verticapy_titanic_tmp", method=drop_method)
             else:
                 drop("verticapy_titanic_tmp")
             raise
         if drop_method:
             drop("verticapy_titanic_tmp", method=drop_method)
         else:
             drop("verticapy_titanic_tmp")

     # testing relation_type = view
     _run_case("view", "view_columns", "view")
     # testing relation_type = table
     _run_case("table", "columns", None)
     # testing relation_type = temporary table
     _run_case("temporary", "columns", None)
     # testing relation_type = temporary local table (lands in v_temp_schema)
     _run_case("local", "columns", None, prefix="v_temp_schema.")
示例#16
0
def load_titanic(cursor=None, schema: str = "public", name: str = "titanic"):
    """
---------------------------------------------------------------------------
Ingests the titanic dataset in the Vertica DB (Dataset ideal for
Classification). If a table with the same name and schema already exists,
this function will create a vDataFrame from the input relation.

Parameters
----------
cursor: DBcursor, optional
    Vertica DB cursor.
schema: str, optional
    Schema of the new relation. The default schema is public.
name: str, optional
    Name of the new relation.

Returns
-------
vDataFrame
    the titanic vDataFrame.

See Also
--------
load_amazon       : Ingests the amazon dataset in the Vertica DB.
    (Time Series / Regression).
load_commodities  : Ingests the commodities dataset in the Vertica DB.
    (Time Series / Regression).
load_iris         : Ingests the iris dataset in the Vertica DB.
    (Clustering / Classification).
load_market       : Ingests the market dataset in the Vertica DB.
    (Basic Data Exploration).
load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
    (Time Series / Regression).
load_winequality  : Ingests the winequality dataset in the Vertica DB.
    (Regression / Classification).
    """
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        # Fast path: the relation already exists, just wrap it.
        vdf = vDataFrame(name, cursor, schema=schema)
    except Exception:
        # The relation does not exist yet: create it and COPY the CSV in.
        cursor.execute(
            'CREATE TABLE {}.{}("pclass" Integer, "survived" Integer, "name" Varchar(164), "sex" Varchar(20), "age" Numeric(6,3), "sibsp" Integer, "parch" Integer, "ticket" Varchar(36), "fare" Numeric(10,5), "cabin" Varchar(30), "embarked" Varchar(20), "boat" Varchar(100), "body" Integer, "home.dest" Varchar(100));'.format(
                str_column(schema), str_column(name)
            )
        )
        try:
            path = os.path.dirname(verticapy.__file__) + "/learn/data/titanic.csv"
            query = 'COPY {}.{}("pclass", "survived", "name", "sex", "age", "sibsp", "parch", "ticket", "fare", "cabin", "embarked", "boat", "body", "home.dest") FROM {} DELIMITER \',\' NULL \'\' ENCLOSED BY \'"\' ESCAPE AS \'\\\' SKIP 1;'.format(
                str_column(schema), str_column(name), "{}"
            )
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native vertica-python cursor: stream the CSV through STDIN.
                with open(path, "r") as fs:
                    cursor.copy(query.format("STDIN"), fs)
            else:
                # Other cursors: have the server read the local file.
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except Exception:
            # Ingestion failed: drop the half-created table, then re-raise.
            cursor.execute(
                "DROP TABLE {}.{}".format(str_column(schema), str_column(name))
            )
            raise
    return vdf
示例#17
0
    def test_creating_vDF_using_input_relation_schema(self, base, titanic_vd):
        # Build a vDataFrame by passing relation name and schema separately.
        titanic_frame = vDataFrame(
            input_relation="titanic",
            schema="public",
            cursor=base.cursor,
        )
        assert titanic_frame["pclass"].count() == 1234
示例#18
0
def load_amazon(cursor=None, schema: str = "public", name: str = "amazon"):
    """
---------------------------------------------------------------------------
Ingests the amazon dataset in the Vertica DB (Dataset ideal for TS and
Regression). If a table with the same name and schema already exists,
this function will create a vDataFrame from the input relation.

Parameters
----------
cursor: DBcursor, optional
    Vertica DB cursor.
schema: str, optional
    Schema of the new relation. The default schema is public.
name: str, optional
    Name of the new relation.

Returns
-------
vDataFrame
    the amazon vDataFrame.

See Also
--------
load_commodities  : Ingests the commodities dataset in the Vertica DB.
    (Time Series / Regression).
load_iris         : Ingests the iris dataset in the Vertica DB.
    (Clustering / Classification).
load_market       : Ingests the market dataset in the Vertica DB.
    (Basic Data Exploration).
load_smart_meters : Ingests the smart meters dataset in the Vertica DB.
    (Time Series / Regression).
load_titanic      : Ingests the titanic dataset in the Vertica DB.
    (Classification).
load_winequality  : Ingests the winequality dataset in the Vertica DB.
    (Regression / Classification).
    """
    check_types([("schema", schema, [str],), ("name", name, [str],)])
    if not (cursor):
        cursor = read_auto_connect().cursor()
    else:
        check_cursor(cursor)
    try:
        # Fast path: the relation already exists, just wrap it.
        vdf = vDataFrame(name, cursor, schema=schema)
    except Exception:
        # The relation does not exist yet: create it and COPY the CSV in.
        cursor.execute(
            'CREATE TABLE {}.{}("date" Date, "state" Varchar(32), "number" Integer);'.format(
                str_column(schema), str_column(name)
            )
        )
        try:
            path = os.path.dirname(verticapy.__file__) + "/learn/data/amazon.csv"
            query = "COPY {}.{}(\"date\", \"state\", \"number\") FROM {} DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;".format(
                str_column(schema), str_column(name), "{}"
            )
            import vertica_python

            if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                # Native vertica-python cursor: stream the CSV through STDIN.
                with open(path, "r") as fs:
                    cursor.copy(query.format("STDIN"), fs)
            else:
                # Other cursors: have the server read the local file.
                cursor.execute(query.format("LOCAL '{}'".format(path)))
            cursor.execute("COMMIT;")
            vdf = vDataFrame(name, cursor, schema=schema)
        except Exception:
            # Ingestion failed: drop the half-created table, then re-raise.
            cursor.execute(
                "DROP TABLE {}.{}".format(str_column(schema), str_column(name))
            )
            raise
    return vdf
示例#19
0
    def fit(
        self,
        input_relation: (str, vDataFrame),
        X: list = [],
        key_columns: list = [],
        index: str = "",
    ):
        """
	---------------------------------------------------------------------------
	Trains the model.

	Parameters
	----------
	input_relation: str/vDataFrame
		Train relation.
	X: list, optional
		List of the predictors. If empty, all the numerical vcolumns will be used.
	key_columns: list, optional
		Columns not used during the algorithm computation but which will be used
		to create the final relation.
	index: str, optional
		Index to use to identify each row separately. It is highly recommanded to
		have one already in the main table to avoid creation of temporary tables.

	Returns
	-------
	object
 		self
		"""
        check_types([
            (
                "input_relation",
                input_relation,
                [str, vDataFrame],
            ),
            (
                "X",
                X,
                [list],
            ),
            (
                "key_columns",
                key_columns,
                [list],
            ),
            (
                "index",
                index,
                [str],
            ),
        ])
        self.cursor = check_cursor(self.cursor, input_relation, True)[0]
        check_model(name=self.name, cursor=self.cursor)
        if isinstance(input_relation, vDataFrame):
            if not (X):
                X = input_relation.numcol()
            input_relation = input_relation.__genSQL__()
        else:
            if not (X):
                X = vDataFrame(input_relation, self.cursor).numcol()
        X = [str_column(column) for column in X]
        self.X = X
        self.key_columns = [str_column(column) for column in key_columns]
        self.input_relation = input_relation
        schema, relation = schema_relation(input_relation)
        cursor = self.cursor

        def drop_temp_elem(cursor):
            try:
                cursor.execute(
                    "DROP TABLE IF EXISTS v_temp_schema.VERTICAPY_MAIN_{}".
                    format(get_session(cursor)))
                cursor.execute(
                    "DROP TABLE IF EXISTS v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS"
                )
            except:
                pass

        try:
            if not (index):
                index = "id"
                main_table = "VERTICAPY_MAIN_{}".format(
                    get_session(self.cursor))
                drop_temp_elem(cursor)
                sql = "CREATE LOCAL TEMPORARY TABLE {} ON COMMIT PRESERVE ROWS AS SELECT ROW_NUMBER() OVER() AS id, {} FROM {} WHERE {}".format(
                    main_table,
                    ", ".join(X + key_columns),
                    self.input_relation,
                    " AND ".join(["{} IS NOT NULL".format(item)
                                  for item in X]),
                )
                executeSQL(cursor, sql, "Computing the DBSCAN Table - STEP 0.")
            else:
                cursor.execute("SELECT {} FROM {} LIMIT 10".format(
                    ", ".join(X + key_columns + [index]), self.input_relation))
                main_table = self.input_relation
            sql = [
                "POWER(ABS(x.{} - y.{}), {})".format(X[i], X[i],
                                                     self.parameters["p"])
                for i in range(len(X))
            ]
            distance = "POWER({}, 1 / {})".format(" + ".join(sql),
                                                  self.parameters["p"])
            sql = "SELECT x.{} AS node_id, y.{} AS nn_id, {} AS distance FROM {} AS x CROSS JOIN {} AS y".format(
                index, index, distance, main_table, main_table)
            sql = "SELECT node_id, nn_id, SUM(CASE WHEN distance <= {} THEN 1 ELSE 0 END) OVER (PARTITION BY node_id) AS density, distance FROM ({}) distance_table".format(
                self.parameters["eps"], sql)
            sql = "SELECT node_id, nn_id FROM ({}) VERTICAPY_SUBTABLE WHERE density > {} AND distance < {} AND node_id != nn_id".format(
                sql, self.parameters["min_samples"], self.parameters["eps"])
            executeSQL(cursor, sql, "Computing the DBSCAN Table - STEP 1.")
            graph = cursor.fetchall()
            main_nodes = list(
                dict.fromkeys([elem[0] for elem in graph] +
                              [elem[1] for elem in graph]))
            clusters = {}
            for elem in main_nodes:
                clusters[elem] = None
            i = 0
            while graph:
                node = graph[0][0]
                node_neighbor = graph[0][1]
                if (clusters[node] == None) and (clusters[node_neighbor]
                                                 == None):
                    clusters[node] = i
                    clusters[node_neighbor] = i
                    i = i + 1
                else:
                    if clusters[node] != None and clusters[
                            node_neighbor] == None:
                        clusters[node_neighbor] = clusters[node]
                    elif clusters[node_neighbor] != None and clusters[
                            node] == None:
                        clusters[node] = clusters[node_neighbor]
                del graph[0]
            try:
                f = open("VERTICAPY_DBSCAN_CLUSTERS_ID.csv", "w")
                for elem in clusters:
                    f.write("{}, {}\n".format(elem, clusters[elem]))
                f.close()
                try:
                    cursor.execute(
                        "DROP TABLE IF EXISTS v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS"
                    )
                except:
                    pass
                cursor.execute(
                    "CREATE LOCAL TEMPORARY TABLE VERTICAPY_DBSCAN_CLUSTERS(node_id int, cluster int) ON COMMIT PRESERVE ROWS"
                )
                if isinstance(cursor, vertica_python.vertica.cursor.Cursor):
                    with open("./VERTICAPY_DBSCAN_CLUSTERS_ID.csv", "r") as fs:
                        cursor.copy(
                            "COPY v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS(node_id, cluster) FROM STDIN DELIMITER ',' ESCAPE AS '\\';",
                            fs,
                        )
                else:
                    cursor.execute(
                        "COPY v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS(node_id, cluster) FROM LOCAL './VERTICAPY_DBSCAN_CLUSTERS_ID.csv' DELIMITER ',' ESCAPE AS '\\';"
                    )
                try:
                    cursor.execute("COMMIT")
                except:
                    pass
                os.remove("VERTICAPY_DBSCAN_CLUSTERS_ID.csv")
            except:
                os.remove("VERTICAPY_DBSCAN_CLUSTERS_ID.csv")
                raise
            self.n_cluster_ = i
            executeSQL(
                cursor,
                "CREATE TABLE {} AS SELECT {}, COALESCE(cluster, -1) AS dbscan_cluster FROM v_temp_schema.{} AS x LEFT JOIN v_temp_schema.VERTICAPY_DBSCAN_CLUSTERS AS y ON x.{} = y.node_id"
                .format(self.name, ", ".join(self.X + self.key_columns),
                        main_table, index),
                "Computing the DBSCAN Table - STEP 2.",
            )
            cursor.execute(
                "SELECT COUNT(*) FROM {} WHERE dbscan_cluster = -1".format(
                    self.name))
            self.n_noise_ = cursor.fetchone()[0]
        except:
            drop_temp_elem(cursor)
            raise
        drop_temp_elem(cursor)
        model_save = {
            "type": "DBSCAN",
            "input_relation": self.input_relation,
            "key_columns": self.key_columns,
            "X": self.X,
            "p": self.parameters["p"],
            "eps": self.parameters["eps"],
            "min_samples": self.parameters["min_samples"],
            "n_cluster": self.n_cluster_,
            "n_noise": self.n_noise_,
        }
        insert_verticapy_schema(
            model_name=self.name,
            model_type="DBSCAN",
            model_save=model_save,
            cursor=self.cursor,
        )
        return self
 def test_vDF_to_db(self, titanic_vd):
     """Exercise vDataFrame.to_db for every relation_type, checking the
     exported relation's shape, split column, and catalog entry, and
     dropping the relation afterwards (also on failure)."""
     cursor = titanic_vd._VERTICAPY_VARIABLES_["cursor"]
     # Remove any relation left over by a previous failed run.
     try:
         with warnings.catch_warnings(record=True) as w:
             drop("verticapy_titanic_tmp", cursor, method="view")
             drop("verticapy_titanic_tmp", cursor, method="table")
     except:
         pass

     def run_case(relation_type, catalog, lookup_name, drop_kwargs):
         # Export, validate, then drop -- re-raising after cleanup on failure.
         try:
             titanic_vd.copy().to_db(
                 name="verticapy_titanic_tmp",
                 usecols=["age", "fare", "survived"],
                 relation_type=relation_type,
                 db_filter="age > 40",
                 nb_split=3,
             )
             exported = vDataFrame(lookup_name, cursor=cursor)
             assert exported.shape() == (220, 4)
             assert exported["_verticapy_split_"].min() == 0
             assert exported["_verticapy_split_"].max() == 2
             cursor.execute(
                 "SELECT table_name FROM "
                 + catalog
                 + " WHERE table_name = 'verticapy_titanic_tmp'"
             )
             assert cursor.fetchone()[0] == "verticapy_titanic_tmp"
         except:
             with warnings.catch_warnings(record=True) as w:
                 drop("verticapy_titanic_tmp", cursor, **drop_kwargs)
             raise
         with warnings.catch_warnings(record=True) as w:
             drop("verticapy_titanic_tmp", cursor, **drop_kwargs)

     # testing relation_type = view
     run_case("view", "view_columns", "verticapy_titanic_tmp", {"method": "view"})
     # testing relation_type = table
     run_case("table", "columns", "verticapy_titanic_tmp", {})
     # testing relation_type = temporary table
     run_case("temporary", "columns", "verticapy_titanic_tmp", {})
     # testing relation_type = temporary local table (lives in v_temp_schema)
     run_case("local", "columns", "v_temp_schema.verticapy_titanic_tmp", {})
示例#21
0
    def fit(self, input_relation: (str, vDataFrame), X: list = []):
        """
	---------------------------------------------------------------------------
	Trains the model.

	Parameters
	----------
	input_relation: str/vDataFrame
		Train relation.
	X: list
		List of the predictors. If empty, all the columns will be used.

	Returns
	-------
	object
 		self
		"""
        check_types([(
            "input_relation",
            input_relation,
            [str, vDataFrame],
        ), (
            "X",
            X,
            [list],
        )])
        self.cursor = check_cursor(self.cursor, input_relation, True)[0]
        if isinstance(input_relation, vDataFrame):
            if not (X):
                X = input_relation.get_columns()
            self.input_relation = input_relation.__genSQL__()
        else:
            if not (X):
                X = vDataFrame(input_relation, self.cursor).get_columns()
            self.input_relation = input_relation
        self.X = [str_column(elem) for elem in X]
        schema, relation = schema_relation(self.name)
        schema = str_column(schema)
        tmp_name = "{}.VERTICAPY_COUNT_VECTORIZER_{}".format(
            schema, get_session(self.cursor))
        try:
            self.drop()
        except:
            pass
        sql = "CREATE TABLE {}(id identity(2000) primary key, text varchar({})) ORDER BY id SEGMENTED BY HASH(id) ALL NODES KSAFE;"
        executeSQL(
            self.cursor,
            sql.format(tmp_name, self.parameters["max_text_size"]),
            "Computing the CountVectorizer - STEP 0.",
        )
        text = (" || ".join(self.X) if not (self.parameters["lowercase"]) else
                "LOWER({})".format(" || ".join(self.X)))
        if self.parameters["ignore_special"]:
            text = "REGEXP_REPLACE({}, '[^a-zA-Z0-9\\s]+', '')".format(text)
        sql = "INSERT INTO {}(text) SELECT {} FROM {}".format(
            tmp_name, text, self.input_relation)
        executeSQL(self.cursor, sql, "Computing the CountVectorizer - STEP 1.")
        sql = "CREATE TEXT INDEX {} ON {}(id, text) stemmer NONE;".format(
            self.name, tmp_name)
        executeSQL(self.cursor, sql, "Computing the CountVectorizer - STEP 2.")
        stop_words = "SELECT token FROM (SELECT token, cnt / SUM(cnt) OVER () AS df, rnk FROM (SELECT token, COUNT(*) AS cnt, RANK() OVER (ORDER BY COUNT(*) DESC) AS rnk FROM {} GROUP BY 1) VERTICAPY_SUBTABLE) VERTICAPY_SUBTABLE WHERE not(df BETWEEN {} AND {})".format(
            self.name, self.parameters["min_df"], self.parameters["max_df"])
        if self.parameters["max_features"] > 0:
            stop_words += " OR (rnk > {})".format(
                self.parameters["max_features"])
        self.cursor.execute(stop_words)
        self.stop_words_ = [item[0] for item in self.cursor.fetchall()]
        self.cursor.execute(self.deploySQL())
        self.vocabulary_ = [item[0] for item in self.cursor.fetchall()]
        self.countvectorizer_table = tmp_name
        model_save = {
            "type": "CountVectorizer",
            "input_relation": self.input_relation,
            "X": self.X,
            "countvectorizer_table": tmp_name,
            "lowercase": self.parameters["lowercase"],
            "max_df": self.parameters["max_df"],
            "min_df": self.parameters["min_df"],
            "max_features": self.parameters["max_features"],
            "ignore_special": self.parameters["ignore_special"],
            "max_text_size": self.parameters["max_text_size"],
            "vocabulary": self.vocabulary_,
            "stop_words": self.stop_words_,
        }
        insert_verticapy_schema(
            model_name=self.name,
            model_type="CountVectorizer",
            model_save=model_save,
            cursor=self.cursor,
        )
        return self
示例#22
0
    def fit(self, input_relation: Union[str, vDataFrame], X: list = []):
        """Train the CountVectorizer model.

        Parameters
        ----------
        input_relation: str / vDataFrame
            Training relation.
        X: list
            Predictor columns; all the columns are used when empty.

        Returns
        -------
        object
            self
        """
        if isinstance(X, str):
            X = [X]
        check_types(
            [("input_relation", input_relation, [str, vDataFrame]), ("X", X, [list])]
        )
        # Either overwrite an existing model or fail loudly if one exists.
        if verticapy.options["overwrite_model"]:
            self.drop()
        else:
            does_model_exist(name=self.name, raise_error=True)
        # Resolve the training relation and the default predictor list.
        if isinstance(input_relation, vDataFrame):
            if not X:
                X = input_relation.get_columns()
            self.input_relation = input_relation.__genSQL__()
        else:
            if not X:
                X = vDataFrame(input_relation).get_columns()
            self.input_relation = input_relation
        self.X = [quote_ident(col) for col in X]
        schema = quote_ident(schema_relation(self.name)[0])
        tmp_name = gen_tmp_name(schema=schema, name="countvectorizer")
        # Best-effort removal of a previous model with the same name.
        try:
            self.drop()
        except:
            pass
        create_query = """CREATE TABLE {0}(id identity(2000) primary key, 
                                  text varchar({1})) 
                 ORDER BY id SEGMENTED BY HASH(id) ALL NODES KSAFE;"""
        executeSQL(
            create_query.format(tmp_name, self.parameters["max_text_size"]),
            title="Computing the CountVectorizer [Step 0].",
        )
        # Build the SQL expression producing the text to index.
        text_expr = " || ".join(self.X)
        if self.parameters["lowercase"]:
            text_expr = "LOWER({})".format(text_expr)
        if self.parameters["ignore_special"]:
            text_expr = "REGEXP_REPLACE({}, '[^a-zA-Z0-9\\s]+', '')".format(text_expr)
        executeSQL(
            "INSERT INTO {}(text) SELECT {} FROM {}".format(
                tmp_name, text_expr, self.input_relation
            ),
            "Computing the CountVectorizer [Step 1].",
        )
        executeSQL(
            "CREATE TEXT INDEX {} ON {}(id, text) stemmer NONE;".format(
                self.name, tmp_name
            ),
            "Computing the CountVectorizer [Step 2].",
        )
        # Tokens whose document frequency falls outside [min_df, max_df]
        # (or whose rank exceeds max_features) become stop words.
        stop_words_query = """SELECT 
                            token 
                        FROM 
                            (SELECT 
                                token, 
                                cnt / SUM(cnt) OVER () AS df, 
                                rnk 
                            FROM 
                                (SELECT 
                                    token, 
                                    COUNT(*) AS cnt, 
                                    RANK() OVER (ORDER BY COUNT(*) DESC) AS rnk 
                                 FROM {0} GROUP BY 1) VERTICAPY_SUBTABLE) VERTICAPY_SUBTABLE 
                                 WHERE not(df BETWEEN {1} AND {2})""".format(
            self.name, self.parameters["min_df"], self.parameters["max_df"]
        )
        if self.parameters["max_features"] > 0:
            stop_words_query += " OR (rnk > {})".format(self.parameters["max_features"])
        self.stop_words_ = [
            row[0]
            for row in executeSQL(
                stop_words_query, print_time_sql=False, method="fetchall"
            )
        ]
        self.vocabulary_ = [
            row[0]
            for row in executeSQL(
                self.deploySQL(), print_time_sql=False, method="fetchall"
            )
        ]
        self.countvectorizer_table = tmp_name
        # Persist the model metadata in the VerticaPy model catalog.
        model_save = {
            "type": "CountVectorizer",
            "input_relation": self.input_relation,
            "X": self.X,
            "countvectorizer_table": tmp_name,
            "lowercase": self.parameters["lowercase"],
            "max_df": self.parameters["max_df"],
            "min_df": self.parameters["min_df"],
            "max_features": self.parameters["max_features"],
            "ignore_special": self.parameters["ignore_special"],
            "max_text_size": self.parameters["max_text_size"],
            "vocabulary": self.vocabulary_,
            "stop_words": self.stop_words_,
        }
        insert_verticapy_schema(
            model_name=self.name, model_type="CountVectorizer", model_save=model_save,
        )
        return self
    def test_creating_vDF_using_input_relation(self, base, titanic_vd):
        """Build a vDataFrame from a relation name and check a column count."""
        vdf = vDataFrame(input_relation="public.titanic", cursor=base.cursor)
        assert vdf["pclass"].count() == 1234
示例#24
0
    def fit(
        self,
        input_relation: (str, vDataFrame),
        X: list = [],
        key_columns: list = [],
        index: str = "",
    ):
        """
	---------------------------------------------------------------------------
	Trains the model.

	Parameters
	----------
	input_relation: str/vDataFrame
		Training relation.
	X: list, optional
		List of the predictors. If empty, all the numerical vcolumns will be used.
	key_columns: list, optional
		Columns not used during the algorithm computation but which will be used
		to create the final relation.
	index: str, optional
		Index used to identify each row separately. It is highly recommanded to
        have one already in the main table to avoid creating temporary tables.

	Returns
	-------
	object
 		self
		"""
        if isinstance(key_columns, str):
            key_columns = [key_columns]
        if isinstance(X, str):
            X = [X]
        check_types(
            [
                ("input_relation", input_relation, [str, vDataFrame]),
                ("X", X, [list]),
                ("key_columns", key_columns, [list]),
                ("index", index, [str]),
            ]
        )
        if verticapy.options["overwrite_model"]:
            self.drop()
        else:
            does_model_exist(name=self.name, raise_error=True)
        if isinstance(input_relation, vDataFrame):
            if not (X):
                X = input_relation.numcol()
            input_relation = input_relation.__genSQL__()
        else:
            if not (X):
                X = vDataFrame(input_relation).numcol()
        X = [quote_ident(column) for column in X]
        self.X = X
        self.key_columns = [quote_ident(column) for column in key_columns]
        self.input_relation = input_relation
        schema, relation = schema_relation(input_relation)
        name_main, name_dbscan_clusters = (
            gen_tmp_name(name="main"),
            gen_tmp_name(name="clusters"),
        )
        try:
            if not (index):
                index = "id"
                drop(f"v_temp_schema.{name_main}", method="table")
                sql = """CREATE LOCAL TEMPORARY TABLE {0} 
                         ON COMMIT PRESERVE ROWS AS 
                         SELECT 
                            ROW_NUMBER() OVER() AS id, 
                            {1} 
                         FROM {2} 
                         WHERE {3}""".format(
                    name_main,
                    ", ".join(X + key_columns),
                    self.input_relation,
                    " AND ".join([f"{item} IS NOT NULL" for item in X]),
                )
                executeSQL(sql, title="Computing the DBSCAN Table [Step 0]")
            else:
                executeSQL(
                    "SELECT {0} FROM {1} LIMIT 10".format(
                        ", ".join(X + key_columns + [index]), self.input_relation
                    ),
                    print_time_sql=False,
                )
                name_main = self.input_relation
            sql = [
                "POWER(ABS(x.{0} - y.{0}), {1})".format(X[i], self.parameters["p"])
                for i in range(len(X))
            ]
            distance = "POWER({0}, 1 / {1})".format(
                " + ".join(sql), self.parameters["p"]
            )
            sql = """SELECT 
                        x.{0} AS node_id, 
                        y.{0} AS nn_id, 
                        {1} AS distance 
                     FROM {2} AS x 
                     CROSS JOIN {2} AS y""".format(
                index, distance, name_main
            )
            sql = """SELECT 
                        node_id, 
                        nn_id, 
                        SUM(CASE WHEN distance <= {0} THEN 1 ELSE 0 END) 
                            OVER (PARTITION BY node_id) AS density, 
                        distance 
                     FROM ({1}) distance_table""".format(
                self.parameters["eps"], sql
            )
            if isinstance(verticapy.options["random_state"], int):
                order_by = "ORDER BY node_id, nn_id"
            else:
                order_by = ""
            sql = """SELECT 
                        node_id, 
                        nn_id 
                     FROM ({0}) VERTICAPY_SUBTABLE 
                     WHERE density > {1} 
                        AND distance < {2} 
                        AND node_id != nn_id {3}""".format(
                sql, self.parameters["min_samples"], self.parameters["eps"], order_by,
            )
            graph = executeSQL(
                sql, title="Computing the DBSCAN Table [Step 1]", method="fetchall"
            )
            main_nodes = list(
                dict.fromkeys([elem[0] for elem in graph] + [elem[1] for elem in graph])
            )
            clusters = {}
            for elem in main_nodes:
                clusters[elem] = None
            i = 0
            while graph:
                node = graph[0][0]
                node_neighbor = graph[0][1]
                if (clusters[node] == None) and (clusters[node_neighbor] == None):
                    clusters[node] = i
                    clusters[node_neighbor] = i
                    i = i + 1
                else:
                    if clusters[node] != None and clusters[node_neighbor] == None:
                        clusters[node_neighbor] = clusters[node]
                    elif clusters[node_neighbor] != None and clusters[node] == None:
                        clusters[node] = clusters[node_neighbor]
                del graph[0]
            try:
                f = open("{}.csv".format(name_dbscan_clusters), "w")
                for elem in clusters:
                    f.write("{}, {}\n".format(elem, clusters[elem]))
                f.close()
                drop("v_temp_schema.{}".format(name_dbscan_clusters), method="table")
                executeSQL(
                    (
                        f"CREATE LOCAL TEMPORARY TABLE {name_dbscan_clusters}"
                        "(node_id int, cluster int) ON COMMIT PRESERVE ROWS"
                    ),
                    print_time_sql=False,
                )
                if isinstance(current_cursor(), vertica_python.vertica.cursor.Cursor):
                    executeSQL(
                        (
                            f"COPY v_temp_schema.{name_dbscan_clusters}(node_id, cluster)"
                            " FROM STDIN DELIMITER ',' ESCAPE AS '\\';"
                        ),
                        method="copy",
                        print_time_sql=False,
                        path=f"./{name_dbscan_clusters}.csv",
                    )
                else:
                    executeSQL(
                        """COPY v_temp_schema.{0}(node_id, cluster) 
                           FROM LOCAL './{0}.csv' DELIMITER ',' ESCAPE AS '\\';""".format(
                            name_dbscan_clusters
                        ),
                        print_time_sql=False,
                    )
                executeSQL("COMMIT;", print_time_sql=False)
                os.remove(f"{name_dbscan_clusters}.csv")
            except:
                os.remove(f"{name_dbscan_clusters}.csv")
                raise
            self.n_cluster_ = i
            executeSQL(
                """CREATE TABLE {0} AS 
                   SELECT 
                        {1}, 
                        COALESCE(cluster, -1) AS dbscan_cluster 
                   FROM v_temp_schema.{2} AS x 
                   LEFT JOIN v_temp_schema.{3} AS y 
                   ON x.{4} = y.node_id""".format(
                    self.name,
                    ", ".join(self.X + self.key_columns),
                    name_main,
                    name_dbscan_clusters,
                    index,
                ),
                title="Computing the DBSCAN Table [Step 2]",
            )
            self.n_noise_ = executeSQL(
                "SELECT COUNT(*) FROM {0} WHERE dbscan_cluster = -1".format(self.name),
                method="fetchfirstelem",
                print_time_sql=False,
            )
        except:
            drop(f"v_temp_schema.{name_main}", method="table")
            drop(f"v_temp_schema.{name_dbscan_clusters}", method="table")
            raise
        drop(f"v_temp_schema.{name_main}", method="table")
        drop(f"v_temp_schema.{name_dbscan_clusters}", method="table")
        model_save = {
            "type": "DBSCAN",
            "input_relation": self.input_relation,
            "key_columns": self.key_columns,
            "X": self.X,
            "p": self.parameters["p"],
            "eps": self.parameters["eps"],
            "min_samples": self.parameters["min_samples"],
            "n_cluster": self.n_cluster_,
            "n_noise": self.n_noise_,
        }
        insert_verticapy_schema(
            model_name=self.name, model_type="DBSCAN", model_save=model_save,
        )
        return self