def convert_struct_field(column: StructField) -> TableDefinition.Column:
    """Build the Tableau Hyper column definition for a Spark StructField.

    The field's Spark data type is translated to a Hyper SqlType (FloatType is
    stored as double; DecimalType keeps its scale but has its precision capped
    at 18), and the field's ``nullable`` flag becomes the column nullability.

    Raises:
        ValueError: if the field's data type has no mapping below.
    """
    spark_type = column.dataType

    # Candidates are compared in this order; the SqlType factory is only
    # invoked for the entry that matches.
    simple_types = (
        (IntegerType(), SqlType.int),
        (LongType(), SqlType.big_int),
        (ShortType(), SqlType.small_int),
        (DoubleType(), SqlType.double),
        (FloatType(), SqlType.double),
        (BooleanType(), SqlType.bool),
        (DateType(), SqlType.date),
        (TimestampType(), SqlType.timestamp),
        (StringType(), SqlType.text),
    )

    for candidate, make_sql_type in simple_types:
        if spark_type == candidate:
            sql_type = make_sql_type()
            break
    else:
        # DecimalType carries precision/scale, so it cannot be matched by
        # equality against a single instance; it is detected by its string form.
        if not str(spark_type).startswith("DecimalType"):
            raise ValueError(f'Invalid StructField datatype for column `{column.name}` : {column.dataType}')
        # Max precision is only up to 18 decimal places in Tableau Hyper API
        capped_precision = min(spark_type.precision, 18)
        sql_type = SqlType.numeric(capped_precision, spark_type.scale)

    if column.nullable:
        nullable = NULLABLE
    else:
        nullable = NOT_NULLABLE

    return TableDefinition.Column(name=column.name, type=sql_type, nullability=nullable)
def fn_convert_to_hyper_types(given_type):
    """Return the Hyper SqlType for a detected type label.

    ``given_type`` is a label such as ``'int'``, ``'date-YMD'`` or
    ``'datetime-24-YMD'``.  Labels that are not listed below fall back to
    ``SqlType.text()``.
    """
    hyper_type_by_label = {
        'empty': SqlType.text(),
        'bool': SqlType.bool(),
        'int': SqlType.big_int(),
        'float-dot': SqlType.double(),
        'date-YMD': SqlType.date(),
        'date-MDY': SqlType.date(),
        'date-DMY': SqlType.date(),
        'time-24': SqlType.time(),
        'time-12': SqlType.time(),
        'datetime-24-YMD': SqlType.timestamp(),
        'datetime-12-MDY': SqlType.timestamp(),
        'datetime-24-DMY': SqlType.timestamp(),
        'str': SqlType.text(),
    }
    try:
        return hyper_type_by_label[given_type]
    except KeyError:
        # Unknown labels are stored as text.
        return SqlType.text()
def createHyperTable():
    """Build the "Page" table definition from the column config.

    Each entry returned by ``getConfig()`` supplies a ``'ColumnName'`` and a
    ``'DataType'``: ``"String"`` becomes text, ``"Date"`` becomes date and any
    other value becomes big_int.  Returns the populated TableDefinition.
    """
    columns = getConfig()
    page_table = TableDefinition(table_name="Page")

    for position in range(len(columns)):
        spec = columns[position]
        kind = spec['DataType']
        hyper_type = (
            SqlType.text() if kind == "String"
            else SqlType.date() if kind == "Date"
            else SqlType.big_int()
        )
        page_table.add_column(spec['ColumnName'], hyper_type)

    return page_table
def df_to_extract(df, output_path):
    '''
    Write a Pandas dataframe to a Tableau Hyper extract file.

    The extract is created (replacing any existing file) with a single table
    ``Extract.Extract`` whose columns mirror the dataframe's columns.

    Parameters
    ----------
    df (pandas dataframe): Dataframe to turn into a Tableau extract
    output_path (str): Where to create the Tableau extract

    Raises
    ------
    KeyError: if a column's dtype is not one of the dtypes mapped below
    '''
    # NaN values are replaced with 0.0 before anything is written
    df = df.replace(np.nan, 0.0, regex=True)

    print('Creating Tableau data extract...')
    with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyper, \
            Connection(hyper.endpoint, output_path, CreateMode.CREATE_AND_REPLACE) as connection:
        # The schema has to exist before the table can be created inside it
        connection.catalog.create_schema('Extract')

        # pandas dtype name -> Hyper column type
        sql_type_by_dtype = {
            'int32': SqlType.int(),
            'int64': SqlType.big_int(),
            'float32': SqlType.double(),
            'float64': SqlType.double(),
            'datetime64[ns]': SqlType.date(),
            'object': SqlType.text(),
        }

        # One Hyper column per dataframe column, in dataframe order
        column_defs = [
            TableDefinition.Column(header, sql_type_by_dtype[str(df[header].dtype)])
            for header in list(df)
        ]

        extract_table = TableDefinition(TableName('Extract', 'Extract'), column_defs)
        connection.catalog.create_table(extract_table)

        # Rows are queued one at a time and sent with a single execute()
        with Inserter(connection, extract_table) as inserter:
            for _, record in df.iterrows():
                inserter.add_row(record)
            inserter.execute()
def test_convert_struct_field(self):
    """Check name, nullability and type for string, date and timestamp fields."""
    # (field name, Spark type, nullable flag, expected nullability, expected Hyper type)
    cases = [
        # ensure strings can be converted correctly
        ('first_name', StringType(), False, NOT_NULLABLE, SqlType.text()),
        # ensure dates can be converted correctly
        ('update_date', DateType(), True, NULLABLE, SqlType.date()),
        # ensure timestamps can be converted correctly
        ('created_at', TimestampType(), False, NOT_NULLABLE, SqlType.timestamp()),
    ]

    for field_name, spark_type, is_nullable, expected_nullability, expected_type in cases:
        source_col = StructField(field_name, spark_type, is_nullable)
        converted_col = convert_struct_field(source_col)
        assert (converted_col.name == Name(field_name))
        assert (converted_col.nullability is expected_nullability)
        assert (converted_col.type == expected_type)
def _hyper_sql_type(self, source_column):
    """
    Looks up the Hyper column type for a source column

    source_column (obj): Source column (Instance of google.cloud.bigquery.schema.SchemaField);
        only its ``field_type`` attribute is read

    Returns a tableauhyperapi.SqlType Object
    Raises LookupError (after logging the message) when the field type has no mapping
    """
    bigquery_type = source_column.field_type

    hyper_type_by_bigquery_type = {
        "BOOL": SqlType.bool(),
        "BYTES": SqlType.bytes(),
        "DATE": SqlType.date(),
        "DATETIME": SqlType.timestamp(),
        "INT64": SqlType.big_int(),
        "INTEGER": SqlType.int(),
        "NUMERIC": SqlType.numeric(18, 9),
        "FLOAT64": SqlType.double(),
        "STRING": SqlType.text(),
        "TIME": SqlType.time(),
        "TIMESTAMP": SqlType.timestamp_tz(),
    }

    if bigquery_type not in hyper_type_by_bigquery_type:
        error_message = "No Hyper SqlType defined for BigQuery source type: {}".format(
            bigquery_type
        )
        logger.error(error_message)
        raise LookupError(error_message)

    hyper_type = hyper_type_by_bigquery_type[bigquery_type]
    logger.debug(
        "Translated source column type {} to Hyper SqlType {}".format(
            bigquery_type, hyper_type
        )
    )
    return hyper_type
def _campaign_table_definition(table_name):
    """Return the TableDefinition for the campaign table named ``table_name``."""
    return TableDefinition(table_name, [
        # Below columns are extracted from Report data API
        TableDefinition.Column('Row_ID', SqlType.big_int()),
        TableDefinition.Column('Inserted Date', SqlType.date()),
        TableDefinition.Column('Date', SqlType.date()),
        TableDefinition.Column('Account Id', SqlType.varchar(50)),
        TableDefinition.Column('Account Name', SqlType.text()),
        TableDefinition.Column('Campaign Id', SqlType.varchar(50)),
        TableDefinition.Column('Campaign Name', SqlType.text()),
        TableDefinition.Column('Impressions', SqlType.big_int()),
        TableDefinition.Column('Clicks', SqlType.big_int()),
        TableDefinition.Column('Reach', SqlType.big_int()),
        TableDefinition.Column('Spend', SqlType.double()),
        TableDefinition.Column('Frequency', SqlType.double()),
    ])


def _insert_campaign_rows(connection, table_name, result):
    """Insert every row of ``result`` into ``table_name``; return the row count."""
    row_count = len(result["Campaign Id"])
    with Inserter(connection, table_name) as inserter:
        # NOTE(review): result.loc[i, ...] with i in 0..n-1 presumes a default
        # integer index on ``result`` - confirm against the caller.
        for i in range(row_count):
            inserter.add_row([
                i + 1,  # Row_ID is 1-based
                datetime.today(),
                datetime.strptime(result.loc[i, "Date"], '%Y-%m-%d'),
                str(result.loc[i, "Account Id"]),
                str(result.loc[i, "Account Name"]),
                str(result.loc[i, "Campaign Id"]),
                str(result.loc[i, "Campaign Name"]),
                int(result.loc[i, "Impressions"]),
                int(result.loc[i, "Clicks"]),
                int(result.loc[i, "Reach"]),
                float(result.loc[i, "Spend"]),
                float(result.loc[i, "Frequency"]),
            ])
        inserter.execute()
    return row_count


def _duplicate_delete_query(table_name):
    """Return the DELETE statement that keeps only the highest Row_ID per duplicate group."""
    group_columns = (
        'Date', 'Campaign Id', 'Campaign Name', 'Account Id', 'Impressions',
        'Clicks', 'Account Name', 'Reach', 'Spend', 'Frequency',
    )
    group_by = ",".join(f'"{name}"' for name in group_columns)
    return (
        f'DELETE FROM {table_name} WHERE "Row_ID" NOT IN('
        f'SELECT MAX("Row_ID") FROM {table_name} '
        f'GROUP BY {group_by})'
    )


def Full_refresh(result):
    """Rebuild ``Facebook_campaigns.hyper`` from scratch using ``result``.

    Starts a Hyper process, creates (replacing any existing file) the
    ``Extract.Campaign_data`` table, inserts one row per entry of ``result``
    and finally deletes rows that are identical in every column except
    ``Row_ID``, keeping the one with the highest ``Row_ID``.

    Args:
        result: table-like object read through ``result["Campaign Id"]`` and
            ``result.loc[i, column]``; presumably a pandas DataFrame - TODO
            confirm.  The ``Date`` values must be ``'%Y-%m-%d'`` strings.

    On a ``HyperException`` the error is written to the log file,
    ``SendEmailMessage()`` is called and the interpreter exits via
    ``sys.exit()``.
    """
    LogFileWrite("Running Full refresh")

    # Bound up front so the cleanup and diagnostic code below can never hit an
    # unbound name (e.g. when the Hyper process itself fails to start), which
    # previously replaced the real error with a NameError.
    hyperprocess = None
    connection = None
    table_name = TableName('Extract', 'Campaign_data')

    try:
        with HyperProcess(telemetry=Telemetry.DO_NOT_SEND_USAGE_DATA_TO_TABLEAU) as hyperprocess:
            print("The HyperProcess has started.")
            LogFileWrite("The HyperProcess has started.")
            print(hyperprocess.is_open)

            if hyperprocess.is_open:
                with Connection(hyperprocess.endpoint, 'Facebook_campaigns.hyper',
                                CreateMode.CREATE_AND_REPLACE) as connection:
                    if connection.is_open:
                        print("The connection to the Hyper file is open.")
                        LogFileWrite("The connection to the Hyper file is open.")

                        connection.catalog.create_schema('Extract')
                        data_table = _campaign_table_definition(table_name)
                        print("The table is defined.")
                        LogFileWrite("Successfully Facebook Campaign Table is defined")
                        connection.catalog.create_table(data_table)

                        # The reported count is the number of rows actually
                        # inserted (the counter used to start at 1).
                        inserted_rows = _insert_campaign_rows(connection, table_name, result)
                        print("Instered Rows are " + str(inserted_rows))
                        LogFileWrite("Instered Rows are " + str(inserted_rows))

                        connection.execute_command(_duplicate_delete_query(table_name))
                        print("Deleted Duplicate rows")
                        LogFileWrite("Successfully deleted Duplicate rows")
                    else:
                        print("unable to open connection to hyper file")
                        LogFileWrite("unable to open connection to hyper file")

                # The same message is emitted whether or not an explicit close
                # was still needed.
                if connection.is_open:
                    connection.close()
                print("Connection to Hyper file closed")
                LogFileWrite("Connection to Hyper file closed")
                print("Connection is open or closed" + str(connection.is_open))
            else:
                print("Unable to start the Hyper process ")
                LogFileWrite("Unable to start the Hyper process ")

        if hyperprocess.is_open:
            hyperprocess.close()
            print("Forcefully shutted down the Hyper Process")
            LogFileWrite("Forcefully shutted down the Hyper Process")
        else:
            print("Hyper process is shutted down")
            LogFileWrite("Hyper process is shutted down")

        # ``connection`` stays None when the process never reported open.
        connection_open = connection is not None and connection.is_open
        print("Connection is open or closed" + str(connection_open))
        print("process is open or closed" + str(hyperprocess.is_open))
    except HyperException as ex:
        LogFileWrite("There is exception in starting Tableau Hyper Process. Exiting...")
        LogFileWrite(str(ex))
        # Either object may not exist yet, depending on where the failure happened.
        if connection is not None:
            connection.close()
        if hyperprocess is not None:
            hyperprocess.close()
        SendEmailMessage()
        sys.exit()
import yaml from pathlib import Path from tableauhyperapi import HyperProcess, Telemetry, \ Connection, CreateMode, \ NOT_NULLABLE, NULLABLE, SqlType, TableDefinition, \ Inserter, \ escape_name, escape_string_literal, \ HyperException #Create table definition to be used in Hyper file workbook_table = TableDefinition( table_name="Workbooks", columns=[ TableDefinition.Column("Project Id", SqlType.text(), NOT_NULLABLE), TableDefinition.Column("Content URL", SqlType.text(), NOT_NULLABLE), TableDefinition.Column("Created At", SqlType.date(), NOT_NULLABLE), TableDefinition.Column("Id", SqlType.text(), NOT_NULLABLE), TableDefinition.Column("Project Name", SqlType.text(), NOT_NULLABLE), TableDefinition.Column("size", SqlType.big_int(), NOT_NULLABLE), TableDefinition.Column("Updated At", SqlType.date(), NOT_NULLABLE), TableDefinition.Column("Name", SqlType.text(), NOT_NULLABLE) ]) #Name of hyper file that will be generated locally and deployed to Tableau path_to_database = Path("workbooks.hyper") #Name of project where Hyper file will be deployed #This project must exist on your server PROJECT = 'HyperTest' def main():
import os
import json
import sys
from tableauhyperapi import HyperProcess, Telemetry, Connection, CreateMode, NOT_NULLABLE, NULLABLE, SqlType, TableDefinition, escape_string_literal

# Look up a SqlType by its name through a dict
sql_type_dict = {
    'BIG_INT': SqlType.big_int(),
    'TEXT': SqlType.text(),
    'DOUBLE': SqlType.double(),
    'DATE': SqlType.date(),
    'TIMESTAMP': SqlType.timestamp(),
}

# Look up the nullability constant by 'YES' / 'NO' through a dict
nullable_dict = {
    'YES': NULLABLE,
    'NO': NOT_NULLABLE,
}


def create_column_def(table_def_dict: dict):
    """Turn a column-spec dict into a list of TableDefinition.Column objects.

    ``table_def_dict`` maps each column name to a dict with a ``'type'`` key
    (a key of ``sql_type_dict``) and a ``'nullable'`` key (``'YES'`` or
    ``'NO'``).  Columns are returned in the dict's iteration order; an unknown
    type or nullable value raises KeyError.
    """
    return [
        TableDefinition.Column(
            column_name,
            sql_type_dict[spec['type']],
            nullable_dict[spec['nullable']])
        for column_name, spec in table_def_dict.items()
    ]
# with HyperProcess(Telemetry.SEND_USAGE_DATA_TO_TABLEAU, 'myapp' ) as hyper: # Step 2: Create the the .hyper file, replace it if it already exists with Connection(endpoint=hyper.endpoint, create_mode=CreateMode.CREATE_AND_REPLACE, database=path_to_hyper) as connection: # Step 3: Create the schema connection.catalog.create_schema('Extract') # Step 4: Create the table definition schema = TableDefinition(table_name=TableName('Extract','Extract'), columns=[ TableDefinition.Column('name', SqlType.text()), TableDefinition.Column('date', SqlType.date()), TableDefinition.Column('temperature', SqlType.double()), TableDefinition.Column('chance_precipitation', SqlType.double()), TableDefinition.Column('precipitation', SqlType.double()), TableDefinition.Column('wind_speed', SqlType.double()), TableDefinition.Column('wind_gust', SqlType.double()), TableDefinition.Column('visiblity', SqlType.double()), TableDefinition.Column('cloud_cover', SqlType.double()), TableDefinition.Column('relative_humidity', SqlType.double()), TableDefinition.Column('moon_phase', SqlType.double()), TableDefinition.Column('condition', SqlType.text()), ]) # Step 5: Create the table in the connection catalog connection.catalog.create_table(schema)
escape_name, escape_string_literal, \ HyperException # Table Definitions required to create tables orders_table = TableDefinition( # Since the table name is not prefixed with an explicit schema name, the table will reside in the default "public" namespace. table_name="Orders", columns=[ TableDefinition.Column(name="Address ID", type=SqlType.small_int(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Customer ID", type=SqlType.text(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Order Date", type=SqlType.date(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Order ID", type=SqlType.text(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Ship Date", type=SqlType.date(), nullability=NULLABLE), TableDefinition.Column(name="Ship Mode", type=SqlType.text(), nullability=NULLABLE) ]) customer_table = TableDefinition( # Since the table name is not prefixed with an explicit schema name, the table will reside in the default "public" namespace. table_name="Customer",
.option("sep", cf.delimiter) \ .load(cf.input_file_path) col = list(df.dtypes) print(len(col)) for i in range(len(col)): col[i] = list(col[i]) col[i][1] = type_[col[i][1]] print(col, len(col)) xyz = [ TableDefinition.Column('cdl_uuid', SqlType.varchar(30), NULLABLE), TableDefinition.Column('cdl_frequency', SqlType.varchar(30), NULLABLE), TableDefinition.Column('cdl_effective_date', SqlType.date(), NULLABLE), TableDefinition.Column('cdl_run_identifier', SqlType.varchar(30), NULLABLE), TableDefinition.Column('customer_id', SqlType.varchar(30), NULLABLE), TableDefinition.Column('customer_type', SqlType.varchar(30), NULLABLE), TableDefinition.Column('customer_name', SqlType.varchar(30), NULLABLE), TableDefinition.Column('customer_address', SqlType.varchar(100), NULLABLE), TableDefinition.Column('city', SqlType.varchar(30), NULLABLE), TableDefinition.Column('state', SqlType.varchar(30), NULLABLE), TableDefinition.Column('zip', SqlType.varchar(30), NULLABLE), TableDefinition.Column('phone', SqlType.varchar(30), NULLABLE), TableDefinition.Column('fax', SqlType.varchar(30), NULLABLE), TableDefinition.Column('email', SqlType.varchar(30), NULLABLE), TableDefinition.Column('customer_status', SqlType.varchar(30), NULLABLE), TableDefinition.Column('metropolitan_statistical_area_name', SqlType.varchar(100), NULLABLE), TableDefinition.Column('blocked_account', SqlType.varchar(30), NULLABLE), TableDefinition.Column('phs', SqlType.varchar(30), NULLABLE),
def sparkConnect():
    """Load the configured source into Spark and derive Hyper column definitions.

    Behaviour depends on ``cf.file_type``:

    * ``'csv'``  - the file at ``cf.input_file_path`` is read directly.
    * ``'jdbc'`` - ``cf.table`` is read over JDBC, written out as CSV under
      ``/home/hari/HyperConverter/test`` and ``cf.input_file_path`` is
      repointed at the written part files.

    Each Spark column dtype is translated through the module-level ``type_``
    mapping and then turned into a NULLABLE ``TableDefinition.Column``.

    Returns:
        list of TableDefinition.Column, one per column whose mapped type is
        handled below.

    Raises:
        ValueError: if ``cf.file_type`` is neither ``'csv'`` nor ``'jdbc'``
            (previously this surfaced as a NameError on ``df``).
    """
    if cf.file_type == 'csv':
        # fetching DF from spark filestore
        df = spark.read.format(cf.file_type) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .load(cf.input_file_path)
    elif cf.file_type == 'jdbc':
        # fetching table from db from databricks
        df = spark.read.format("jdbc") \
            .option("driver", cf.driver) \
            .option("url", cf.url) \
            .option("dbtable", cf.table) \
            .option("user", cf.user) \
            .option("password", cf.password) \
            .option("inferSchema", cf.infer_schema) \
            .option("header", cf.first_row_is_header) \
            .load()
        # The option key was misspelled "enoding", so the configured charset
        # was never applied to the written CSV.
        df.write.format("csv") \
            .option("encoding", cf.charset) \
            .option("header", cf.first_row_is_header) \
            .option("sep", cf.delimiter) \
            .save('/home/hari/HyperConverter/test')
        path = glob.glob('/home/hari/HyperConverter/test/part*.csv')
        # NOTE(review): the first assignment is immediately overwritten, so the
        # effective value is the full list of part files. It is kept because
        # path[0] fails fast when nothing was written - confirm which form
        # downstream readers expect.
        cf.input_file_path = path[0]
        cf.input_file_path = path
        print('\n', cf.input_file_path, '\n')
    else:
        raise ValueError(f"Unsupported file type: {cf.file_type!r}")

    col = list(df.dtypes)
    print(col)
    print(len(col))
    # (column name, Spark dtype) -> [column name, mapped type name]
    col = [[name, type_[spark_dtype]] for name, spark_dtype in col]

    # Mapped type names that need no extra arguments.
    simple_types = {
        'int': SqlType.int,
        'date': SqlType.date,
        'bool': SqlType.bool,
        'big_int': SqlType.big_int,
        'double': SqlType.double,
    }

    x = []
    for i, j in col:
        print(i, j)
        if j == 'varchar':
            max_row = df.agg({i: "max"}).collect()[0]
            max_value = max_row["max({})".format(i)]
            if max_value is not None:
                # NOTE(review): this is the length of the lexicographically
                # largest value, not of the longest one, and values longer than
                # 40 characters fall into the 30 bucket - confirm the intent.
                max_length = len(max_value)
                max_length = 100 if 19 <= max_length <= 40 else 30
            else:
                # Column holds no non-null values.
                max_length = 35
            print(max_length)
            x.append(
                TableDefinition.Column(i, SqlType.varchar(max_length), NULLABLE))
        elif j in simple_types:
            x.append(TableDefinition.Column(i, simple_types[j](), NULLABLE))
        elif j == 'numeric':
            x.append(
                TableDefinition.Column(i, SqlType.numeric(10, 4), NULLABLE))
        elif j == 'text':
            print("this is culprate", i, j)
            x.append(TableDefinition.Column(i, SqlType.text(), NULLABLE))
        # NOTE(review): any other mapped type name is skipped without a column,
        # as before - confirm that this is intended.
    print(x)
    print(len(x))
    return x
Connection, CreateMode, \ NOT_NULLABLE, NULLABLE, SqlType, TableDefinition, \ Inserter, \ escape_name, escape_string_literal, \ HyperException data_table = TableDefinition( # Since the table name is not prefixed with an explicit schema name, the table will reside in the default "public" namespace. table_name="COVID-19", columns=[ TableDefinition.Column("Country/Region", SqlType.text(), NOT_NULLABLE), TableDefinition.Column("Province/State", SqlType.text(), NULLABLE), TableDefinition.Column("Latitude", SqlType.double(), NOT_NULLABLE), TableDefinition.Column("Longitude", SqlType.double(), NOT_NULLABLE), TableDefinition.Column("Case_Type", SqlType.text(), NOT_NULLABLE), TableDefinition.Column("Date", SqlType.date(), NOT_NULLABLE), TableDefinition.Column("Cases", SqlType.big_int(), NOT_NULLABLE), TableDefinition.Column("Difference", SqlType.big_int(), NOT_NULLABLE), TableDefinition.Column("Last_Update_Date", SqlType.date(), NOT_NULLABLE), ]) def run_create_hyper_file_from_csv(): """ An example demonstrating loading data from a csv into a new Hyper file """ print("EXAMPLE - Load data from CSV into table in new Hyper file") path_to_database = Path("covid-19.hyper")
from tableauhyperapi import HyperProcess, Telemetry, \ Connection, CreateMode, \ NOT_NULLABLE, NULLABLE, SqlType, TableDefinition, \ Inserter, \ escape_name, escape_string_literal, \ HyperException # Table Definitions required to create tables orders_table = TableDefinition( # Since the table name is not prefixed with an explicit schema name, the table will reside in the default "public" namespace. table_name="Orders", columns=[ TableDefinition.Column(name="Address ID", type=SqlType.small_int(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Customer ID", type=SqlType.text(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Order Date", type=SqlType.date(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Order ID", type=SqlType.text(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Ship Date", type=SqlType.date(), nullability=NULLABLE), TableDefinition.Column(name="Ship Mode", type=SqlType.text(), nullability=NULLABLE) ] ) customer_table = TableDefinition( # Since the table name is not prefixed with an explicit schema name, the table will reside in the default "public" namespace. table_name="Customer", columns=[ TableDefinition.Column(name="Customer ID", type=SqlType.text(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Customer Name", type=SqlType.text(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Loyalty Reward Points", type=SqlType.big_int(), nullability=NOT_NULLABLE), TableDefinition.Column(name="Segment", type=SqlType.text(), nullability=NOT_NULLABLE) ]