def create_mysql_connection():
    """Create a SQLAlchemy engine for the configured MySQL database.

    Host, database, user and password are read from the 'DB' section via
    ``get_param`` and assembled into a ``mysql+mysqlconnector://`` URL.

    Returns:
        The engine returned by ``sqlalchemy.create_engine``.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    # Recovering DB parameters
    hostname = get_param('DB', 'mysql_hostname')
    db_name = get_param('DB', 'db_name')
    user_name = get_param('DB', 'user_name')
    pwd = get_param('DB', 'pwd')

    # Building connection string. Credentials are percent-encoded because
    # characters such as '@', ':', '/' or '%' would otherwise be read as URL
    # delimiters/escapes and yield a wrong or unparsable connection URL.
    db_connection_str = (
        'mysql+mysqlconnector://'
        + quote(user_name, safe='') + ':' + quote(pwd, safe='')
        + '@' + hostname + '/' + db_name
    )
    return sqlalchemy.create_engine(db_connection_str)
def read_table_to_df(dim):
    """Read every row of a MySQL table into a pandas DataFrame.

    Args:
        dim: Name of the table to read. It is concatenated into the query
            text as-is (table identifiers cannot be bound as query
            parameters), so it must come from trusted configuration.

    Returns:
        The DataFrame produced by ``pd.read_sql`` for ``select * FROM <dim>``.
    """
    hostname = get_param('DB', 'mysql_hostname')
    db_name = get_param('DB', 'db_name')
    user_name = get_param('DB', 'user_name')
    pwd = get_param('DB', 'pwd')

    db_connection = mysql.connector.connect(
        host=hostname, user=user_name, password=pwd, database=db_name)
    try:
        query = "select * FROM " + dim
        # Without chunksize, read_sql returns a fully materialised frame, so
        # the connection can be released as soon as the call returns.
        return pd.read_sql(query, con=db_connection)
    finally:
        # A connection is opened on every call; always close it so repeated
        # lookups do not leak server connections.
        db_connection.close()
def load_compound_dims(source_data):
    """Build and load the dimension tables fed by several source columns.

    The file named by the ('DIM', 'compound') parameter is read with
    ``get_dictionary``; it maps each dimension key to the list of source
    columns contributing values to that dimension. For every key the values
    of all its columns are stacked, de-duplicated, sorted and numbered from 1,
    then written to table ``dim_<key>`` with ``load_table(..., 'replace')``.

    Args:
        source_data: Source frame; must contain every configured column.
            ``dim_by_column`` is assumed to return a frame holding a
            ``<key>_desc`` column -- TODO confirm against its definition.

    Returns:
        List of dimension keys that were loaded.

    Raises:
        Exception: If a configured column is missing from ``source_data``, a
            dimension has no source columns, or not every configured
            dimension ended up in the created list.
    """
    dim_file = get_param('DIM', 'compound')
    compound_dim = get_dictionary(dim_file)
    created_dims = []

    for key in compound_dim:
        desc_column = key + '_desc'
        # Reset per dimension so nothing leaks over from the previous key.
        df_acum = None

        for column_name in compound_dim[key]:
            if column_name not in source_data:
                raise Exception("ERROR: The column "+ column_name + " does not exist - Verify and correct " + dim_file)

            if df_acum is None:
                df_acum = dim_by_column(source_data, key, column_name, 0)
            else:
                # Continue numbering after the rows accumulated so far.
                base_id = len(df_acum.index)
                df_acum = pd.concat(
                    [df_acum, dim_by_column(source_data, key, column_name, base_id)],
                    ignore_index=True)

            # Keep only the distinct, sorted, non-null descriptions with a
            # fresh 0..n-1 index; ids are assigned from that index below.
            df_acum = (
                df_acum[desc_column]
                .drop_duplicates()
                .sort_values()
                .reset_index(drop=True)
                .dropna()
                .to_frame()
            )

        if df_acum is None:
            raise Exception("ERROR: The dimension " + key + " has no source columns - Verify and correct " + dim_file)

        # Copy before adding a column so the assignment targets an
        # independent frame rather than the result of dropna().
        clean_df = df_acum.dropna().copy()
        clean_df['Id'] = clean_df.index + 1
        final_df = clean_df[['Id', desc_column]]
        load_table('dim_' + key, final_df, 'replace')
        created_dims.append(key)

    if len(compound_dim) != len(created_dims):
        raise Exception(
            "ERROR: there were some dimensions that couldn't been created - \n"
            " Dimension List: " + ','.join(list(compound_dim.keys()))
            + "\nCreated Dimensions: " + ','.join(created_dims))

    print("Compound dims created: " + ', '.join(created_dims))
    return created_dims
def load_default_dims(source_data):
    """Build and load the dimension tables fed by a single source column.

    The file named by the ('DIM', 'default') parameter is read with
    ``get_dictionary``; it maps each dimension key to one source column. For
    every key the frame returned by ``dim_by_column`` has its null rows
    dropped and is written to table ``dim_<key>`` with
    ``load_table(..., 'replace')``.

    Args:
        source_data: Source frame; must contain every configured column.

    Returns:
        List of dimension keys that were loaded successfully.

    Raises:
        Exception: If a configured column is missing from ``source_data``, or
            if loading failed for at least one dimension.
    """
    dim_file = get_param('DIM', 'default')
    default_dim = get_dictionary(dim_file)
    created_dims = []

    for key in default_dim:
        column_name = default_dim[key]
        if column_name not in source_data:
            raise Exception("ERROR: The column "+ column_name + " does not exist - Verify and correct " + dim_file)

        clean_df = dim_by_column(source_data, key, column_name, 0).dropna()
        try:
            load_table('dim_' + key, clean_df, 'replace')
        except Exception as err:
            # Report and keep going so the remaining dimensions still load;
            # the failed key is not recorded, which makes the summary check
            # below raise with the full picture.
            print("An error occurred while loading default dimension " + key + ": " + str(err))
            continue
        created_dims.append(key)

    if len(default_dim) != len(created_dims):
        raise Exception(
            "ERROR: there were some dimensions that couldn't been created - \n"
            " Dimension List: " + ','.join(list(default_dim.keys()))
            + "\nCreated Dimensions: " + ','.join(created_dims))

    print("Default dims created: " + ', '.join(created_dims))
    return created_dims
def lookup_compound_dim(df):
    """Apply the compound-dimension lookups to ``df``.

    The file named by the ('DIM', 'compound') parameter maps each dimension
    key to a list of columns. For every key, table ``dim_<key>`` is read and
    ``replace_attrib_for_id`` is called once per listed column, each call
    receiving the result of the previous one.

    Args:
        df: Frame whose configured columns are to be processed.

    Returns:
        The frame returned by the last ``replace_attrib_for_id`` call, or
        ``df`` itself when the configuration lists no columns.

    Raises:
        Exception: If a configured column is missing from ``df``.
    """
    dim_file = get_param('DIM', 'compound')
    compound_dim = get_dictionary(dim_file)
    replaced_dims = []

    # Seed with the input so the first replacement starts from ``df`` and an
    # empty configuration returns it unchanged instead of hitting an unbound
    # local.
    replaced_df = df

    for key in compound_dim:
        dim_df = read_table_to_df("dim_" + key)
        for column_name in compound_dim[key]:
            if column_name not in df:
                raise Exception("ERROR: The column "+ column_name + " does not exist - Verify and correct " + dim_file)
            replaced_df = replace_attrib_for_id(replaced_df, dim_df, column_name, key + "_desc")
        replaced_dims.append(key)

    if len(compound_dim) != len(replaced_dims):
        raise Exception(
            "ERROR: there were some dimensions that couldn't been created - \n"
            " Dimension List: " + ','.join(list(compound_dim.keys()))
            + "\nCreated Dimensions: " + ','.join(replaced_dims))

    return replaced_df
def lookup_default_dim(df):
    """Apply the default-dimension lookups to ``df``.

    The file named by the ('DIM', 'default') parameter maps each dimension
    key to one column. For every key, table ``dim_<key>`` is read and
    ``replace_attrib_for_id`` is called for that column, each call receiving
    the result of the previous one.

    Args:
        df: Frame whose configured columns are to be processed.

    Returns:
        The frame returned by the last ``replace_attrib_for_id`` call, or
        ``df`` itself when the configuration is empty.

    Raises:
        Exception: If a configured column is missing from ``df``.
    """
    dim_file = get_param('DIM', 'default')
    default_dim = get_dictionary(dim_file)
    replaced_dims = []

    # Seed with the input so the first replacement starts from ``df`` and an
    # empty configuration returns it unchanged instead of hitting an unbound
    # local.
    replaced_df = df

    for key in default_dim:
        column_name = default_dim[key]
        if column_name not in df:
            raise Exception("ERROR: The column "+ column_name + " does not exist - Verify and correct " + dim_file)
        dim_df = read_table_to_df("dim_" + key)
        replaced_df = replace_attrib_for_id(replaced_df, dim_df, column_name, key + "_desc")
        replaced_dims.append(key)

    if len(default_dim) != len(replaced_dims):
        raise Exception(
            "ERROR: there were some dimensions that couldn't been created - \n"
            " Dimension List: " + ','.join(list(default_dim.keys()))
            + "\nCreated Dimensions: " + ','.join(replaced_dims))

    return replaced_df
def extract_data():
    """Download the configured dataset and return it as a DataFrame.

    The client name, dataset identifier and row limit are read from the 'API'
    section via ``get_param``. Column names are normalised: spaces become
    underscores and the misspelling 'sytem' is corrected to 'system'.

    Returns:
        DataFrame built from the records returned by the API client.

    Raises:
        Exception: Whatever the download or the frame construction raised;
            it is re-raised after the notice is printed.
    """
    # Recovering API parameters
    client_name = get_param('API', 'client_name')
    file_name = get_param('API', 'file_name')
    limit_size = get_param('API', 'limit_size')

    client = Socrata(client_name, None)
    try:
        results = client.get(file_name, limit=limit_size)
        df = pd.DataFrame.from_records(results)
        df.columns = df.columns.str.replace(' ', '_')
        df.columns = df.columns.str.replace('sytem', 'system')
    except Exception:
        print("Something went wrong while retrieving data.")
        # Propagate the real cause; falling through to ``return df`` would
        # only raise an unrelated UnboundLocalError.
        raise
    finally:
        # Release the client's underlying HTTP session.
        client.close()
    return df