def load_titanic(cursor, schema: str = 'public', name='titanic'): try: query = "CREATE TABLE {}.{}(\"pclass\" Integer, \"survived\" Integer, \"name\" Varchar(164), \"sex\" Varchar(20), \"age\" Numeric(6,3), \"sibsp\" Integer, \"parch\" Integer, \"ticket\" Varchar(36), \"fare\" Numeric(10,5), \"cabin\" Varchar(30), \"embarked\" Varchar(20), \"boat\" Varchar(100), \"body\" Integer, \"home.dest\" Varchar(100));" query += "COPY {}.{}(\"pclass\", \"survived\", \"name\", \"sex\", \"age\", \"sibsp\", \"parch\", \"ticket\", \"fare\", \"cabin\", \"embarked\", \"boat\", \"body\", \"home.dest\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;" query = query.format( str_column(schema), str_column(name), str_column(schema), str_column(name), os.path.dirname(vertica_ml_python.__file__) + "/learn/data/titanic.csv") cursor.execute(query) vdf = vDataframe(name, cursor, schema=schema) except: vdf = vDataframe(name, cursor, schema=schema) return (vdf)
def load_smart_meters(cursor, schema: str = 'public', name='smart_meters'): try: query = "CREATE TABLE {}.{}(\"time\" Timestamp, \"val\" Numeric(11,7), \"id\" Integer);" query += "COPY {}.{}(\"time\", \"val\", \"id\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;" query = query.format( str_column(schema), str_column(name), str_column(schema), str_column(name), os.path.dirname(vertica_ml_python.__file__) + "/learn/data/smart_meters.csv") cursor.execute(query) vdf = vDataframe(name, cursor, schema=schema) except: vdf = vDataframe(name, cursor, schema=schema) return (vdf)
def load_iris(cursor, schema: str = 'public', name='iris'): try: query = "CREATE TABLE {}.{}(\"SepalLengthCm\" Numeric(5,2), \"SepalWidthCm\" Numeric(5,2), \"PetalLengthCm\" Numeric(5,2), \"PetalWidthCm\" Numeric(5,2), \"Species\" Varchar(30));" query += "COPY {}.{}(\"Id\" FILLER Integer, \"SepalLengthCm\", \"SepalWidthCm\", \"PetalLengthCm\", \"PetalWidthCm\", \"Species\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;" query = query.format( str_column(schema), str_column(name), str_column(schema), str_column(name), os.path.dirname(vertica_ml_python.__file__) + "/learn/data/iris.csv") cursor.execute(query) vdf = vDataframe(name, cursor, schema=schema) except: vdf = vDataframe(name, cursor, schema=schema) return (vdf)
def load_amazon(cursor, schema: str = 'public', name='amazon'): try: query = "CREATE TABLE {}.{}(\"number\" Integer, \"date\" Date, \"state\" Varchar(32));" query += "COPY {}.{}(\"number\", \"date\", \"state\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;" query = query.format( str_column(schema), str_column(name), str_column(schema), str_column(name), os.path.dirname(vertica_ml_python.__file__) + "/learn/data/amazon.csv") cursor.execute(query) vdf = vDataframe(name, cursor, schema=schema) except: vdf = vDataframe(name, cursor, schema=schema) return (vdf)
def load_winequality(cursor, schema: str = 'public', name='winequality'): try: query = "CREATE TABLE {}.{}(\"fixed_acidity\" Numeric(6,3), \"volatile_acidity\" Numeric(7,4), \"citric_acid\" Numeric(6,3), \"residual_sugar\" Numeric(7,3), \"chlorides\" Float, \"free_sulfur_dioxide\" Numeric(7,2), \"total_sulfur_dioxide\" Numeric(7,2), \"density\" Float, \"pH\" Numeric(6,3), \"sulphates\" Numeric(6,3), \"alcohol\" Float, \"quality\" Integer, \"good\" Integer, \"color\" Varchar(20));" query += "COPY {}.{}(\"fixed_acidity\", \"volatile_acidity\", \"citric_acid\", \"residual_sugar\", \"chlorides\", \"free_sulfur_dioxide\", \"total_sulfur_dioxide\", \"density\", \"pH\", \"sulphates\", \"alcohol\", \"quality\", \"good\", \"color\") FROM LOCAL '{}' DELIMITER ',' NULL '' ENCLOSED BY '\"' ESCAPE AS '\\' SKIP 1;" query = query.format( str_column(schema), str_column(name), str_column(schema), str_column(name), os.path.dirname(vertica_ml_python.__file__) + "/learn/data/winequality.csv") cursor.execute(query) vdf = vDataframe(name, cursor, schema=schema) except: vdf = vDataframe(name, cursor, schema=schema) return (vdf)
def plot(self): from vertica_ml_python import vDataframe vdf = vDataframe(self.input_relation, self.cursor) self.add_to_vdf(vdf, "kmeans_cluster") if (len(self.X) <= 3): vdf.scatter(columns = self.X, catcol = "kmeans_cluster", max_cardinality = 100, max_nb_points = 10000) else: raise ValueError("Clustering Plots are only available in 2D or 3D")
def vdf_from_relation(relation: str, name: str = "VDF", cursor=None, dsn: str = "", schema: str = "public"): check_types([("relation", relation, [str], False), ("name", name, [str], False), ("dsn", dsn, [str], False), ("schema", schema, [str], False)]) name = ''.join(ch for ch in name if ch.isalnum()) from vertica_ml_python import vDataframe vdf = vDataframe("", empty=True) vdf.dsn = dsn if (cursor == None): from vertica_ml_python import vertica_cursor cursor = vertica_cursor(dsn) vdf.input_relation = name vdf.main_relation = relation vdf.schema = schema vdf.cursor = cursor vdf.query_on = False vdf.time_on = False cursor.execute("DROP TABLE IF EXISTS {}._vpython_{}_test_;".format( str_column(schema), name)) cursor.execute( "CREATE TEMPORARY TABLE {}._vpython_{}_test_ ON COMMIT PRESERVE ROWS AS SELECT * FROM {} LIMIT 10;" .format(str_column(schema), name, relation)) cursor.execute( "SELECT column_name, data_type FROM columns WHERE table_name = '_vpython_{}_test_' AND table_schema = '{}'" .format(name, schema.replace('"', ''))) result = cursor.fetchall() cursor.execute("DROP TABLE IF EXISTS {}._vpython_{}_test_;".format( str_column(schema), name)) vdf.columns = ['"' + item[0] + '"' for item in result] vdf.where = [] vdf.order_by = ['' for i in range(100)] vdf.exclude_columns = [] vdf.history = [] vdf.saving = [] for column, ctype in result: column = '"' + column + '"' from vertica_ml_python.vcolumn import vColumn new_vColumn = vColumn(column, parent=vdf, transformations=[ (column, ctype, category_from_type(ctype=ctype)) ]) setattr(vdf, column, new_vColumn) setattr(vdf, column[1:-1], new_vColumn) return (vdf)
def to_vdf(self): from vertica_ml_python import vDataframe return (vDataframe(self.name, self.cursor))
def plot(self): from vertica_ml_python import vDataframe if (len(self.X) <= 3): vDataframe(self.name, self.cursor).scatter(columns = self.X, catcol = "dbscan_cluster", max_cardinality = 100, max_nb_points = 10000) else: raise ValueError("Clustering Plots are only available in 2D or 3D")
def read_json(path: str, cursor, schema: str = 'public', table_name: str = '', usecols: list = [], new_name: dict = {}, insert: bool = False): check_types([("schema", schema, [str], False), ("table_name", table_name, [str], False), ("usecols", usecols, [list], False), ("new_name", new_name, [dict], False), ("insert", insert, [bool], False)]) file = path.split("/")[-1] file_extension = file[-4:len(file)] if (file_extension != 'json'): raise ValueError("The file extension is incorrect !") table_name = table_name if ( table_name) else path.split("/")[-1].split(".json")[0] query = "SELECT column_name, data_type FROM columns WHERE table_name='{}' AND table_schema='{}'".format( table_name, schema) cursor.execute(query) column_name = cursor.fetchall() if ((column_name != []) and not (insert)): raise Exception("The table \"{}\".\"{}\" already exists !".format( schema, table_name)) elif ((column_name == []) and (insert)): raise Exception("The table \"{}\".\"{}\" doesn't exist !".format( schema, table_name)) else: input_relation, flex_name = '"{}"."{}"'.format( schema, table_name), "_vpython" + str( np.random.randint(10000000)) + "_flex_" cursor.execute( "CREATE FLEX LOCAL TEMP TABLE {}(x int) ON COMMIT PRESERVE ROWS;". format(flex_name)) cursor.execute("COPY {} FROM LOCAL '{}' PARSER FJSONPARSER();".format( flex_name, path)) cursor.execute( "SELECT compute_flextable_keys('{}');".format(flex_name)) cursor.execute( "SELECT key_name, data_type_guess FROM {}_keys".format(flex_name)) result = cursor.fetchall() dtype = {} for column_dtype in result: try: if ("Varchar" not in column_dtype[1]): cursor.execute('SELECT "{}"::{} FROM {} LIMIT 1000'.format( column_dtype[0], column_dtype[1], flex_name)) dtype[column_dtype[0]] = column_dtype[1] except: dtype[column_dtype[0]] = "Varchar(100)" if not (insert): cols = [column for column in dtype ] if not (usecols) else [column for column in usecols] for i, column in enumerate(cols): cols[i] = '"{}"::{} AS "{}"'.format(column.replace( '"', ''), dtype[column], new_name[column]) if ( column in new_name) else '"{}"::{}'.format( column.replace('"', ''), dtype[column]) cursor.execute("CREATE TABLE {} AS SELECT {} FROM {}".format( input_relation, ", ".join(cols), flex_name)) print("The table {} has been successfully created.".format( input_relation)) else: column_name_dtype = {} for elem in column_name: column_name_dtype[elem[0]] = elem[1] final_cols = {} for column in column_name_dtype: final_cols[column] = None for column in column_name_dtype: if column in dtype: final_cols[column] = column else: for col in new_name: if (new_name[col] == column): final_cols[column] = col final_transformation = [] for column in final_cols: final_transformation += [ 'NULL AS "{}"'.format(column) ] if (final_cols[column] == None) else [ '"{}"::{} AS "{}"'.format( final_cols[column], column_name_dtype[column], column) ] cursor.execute("INSERT INTO {} SELECT {} FROM {}".format( input_relation, ", ".join(final_transformation), flex_name)) cursor.execute("DROP TABLE " + flex_name) from vertica_ml_python import vDataframe return vDataframe(table_name, cursor, schema=schema)
def read_csv(path: str, cursor, schema: str = 'public', table_name: str = '', delimiter: str = ',', header: bool = True, header_names: list = [], null: str = '', enclosed_by: str = '"', escape: str = '\\', genSQL: bool = False, parse_n_lines: int = -1, insert: bool = False): check_types([("schema", schema, [str], False), ("table_name", table_name, [str], False), ("delimiter", delimiter, [str], False), ("header", header, [bool], False), ("header_names", header_names, [list], False), ("null", null, [str], False), ("enclosed_by", enclosed_by, [str], False), ("escape", escape, [str], False), ("genSQL", genSQL, [bool], False), ("parse_n_lines", parse_n_lines, [int, float], False), ("insert", insert, [bool], False)]) file = path.split("/")[-1] file_extension = file[-3:len(file)] if (file_extension != 'csv'): raise ValueError("The file extension is incorrect !") table_name = table_name if ( table_name) else path.split("/")[-1].split(".csv")[0] query = "SELECT column_name FROM columns WHERE table_name='{}' AND table_schema='{}'".format( table_name, schema) cursor.execute(query) result = cursor.fetchall() if ((result != []) and not (insert)): raise Exception("The table \"{}\".\"{}\" already exists !".format( schema, table_name)) elif ((result == []) and (insert)): raise Exception("The table \"{}\".\"{}\" doesn't exist !".format( schema, table_name)) else: input_relation = '"{}"."{}"'.format(schema, table_name) f = open(path, 'r') file_header = f.readline().replace('\n', '').replace('"', '').split(delimiter) f.close() if ((header_names == []) and (header)): header_names = file_header elif (len(file_header) > len(header_names)): header_names += [ "ucol{}".format(i + len(header_names)) for i in range(len(file_header) - len(header_names)) ] if ((parse_n_lines > 0) and not (insert)): f = open(path, 'r') f2 = open(path[0:-4] + "_vpython_copy.csv", 'w') for i in range(parse_n_lines + int(header)): line = f.readline() f2.write(line) f.close() f2.close() path_test = path[0:-4] + "_vpython_copy.csv" else: path_test = path query = "" if not (insert): dtype = pcsv(path_test, cursor, delimiter, header, header_names, null, enclosed_by, escape) if (parse_n_lines > 0): os.remove(path[0:-4] + "_vpython_copy.csv") query = "CREATE TABLE {}({});\n".format( input_relation, ", ".join([ '"{}" {}'.format(column, dtype[column]) for column in header_names ])) skip = " SKIP 1" if (header) else "" query += "COPY {}({}) FROM LOCAL '{}' DELIMITER '{}' NULL '{}' ENCLOSED BY '{}' ESCAPE AS '{}'{};".format( input_relation, ", ".join(['"' + column + '"' for column in header_names]), path, delimiter, null, enclosed_by, escape, skip) if (genSQL): print(query) else: cursor.execute(query) if not (insert): print("The table {} has been successfully created.".format( input_relation)) from vertica_ml_python import vDataframe return vDataframe(table_name, cursor, schema=schema)