def drop_nan_rows(df, threshold, ttype, opstat, display=True):
    """
    * --------------------------------------------------------------------------
    * function : drop rows with nans greater than or equal to a threshold
    *
    * parms :
    *  df        - dataframe
    *  threshold - threshold value (percent of columns when ttype is
    *              dim.BY_PERCENT, otherwise an absolute nan count)
    *  ttype     - threshold type
    *  opstat    - opStatus object used to record any exception
    *  display   - show the running clock and record the script line
    *
    * returns :
    *  [dropcount, len(df)] - rows dropped and resulting row count
    * --------------------------------------------------------
    """
    import math

    if (display):
        clock = RunningClock()
        clock.start()

    # bound before the try so the return below can never hit an unbound
    # local when an exception fires early (original defect)
    dropcount = 0

    try:
        if (ttype == dim.BY_PERCENT):
            # percent threshold is taken against the number of columns
            thold = math.floor(len(df.columns) * (float(threshold) * 0.01))
        else:
            thold = math.floor(float(threshold))

        nanslist = df.isnull().sum(axis=1).tolist()

        # boolean keep-mask : keep rows whose nan count is below the threshold
        # (original aliased criteria = nanslist; a fresh list is equivalent)
        criteria = [False] * len(nanslist)
        for i in range(len(nanslist)):
            if (nanslist[i] < thold):
                criteria[i] = True
            else:
                criteria[i] = False
                dropcount = dropcount + 1

        if (dropcount > 0):
            df = df[criteria]
            cfg.set_dfc_dataframe_df(
                cfg.get_config_value(cfg.CURRENT_INSPECTION_DF), df)

    except Exception as e:
        opstat.store_exception("Error dropping nan rows\n ", e)
        display_exception(opstat)

    if (display):
        clock.stop()

        #make scriptable
        # NOTE(review): the replayed call passes only (threshold, False) and
        # omits df, ttype and opstat - confirm the replay signature
        add_to_script([
            "# Drop NAN Rows ",
            "from dfcleanser.data_inspection.data_inspection_control import drop_nan_rows",
            "drop_nan_rows(" + str(threshold) + ",False)"
        ], opstat)

    return ([dropcount, len(df)])
def drop_column_names_row(display=True):
    """
    * --------------------------------------------------------
    * function : drop the column names row
    *
    * parms :
    *  display - display flag
    *
    * returns :
    *  opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    try:
        df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
        collist = df.columns.tolist()

        # NOTE(review): labels is the FULL column list with axis=1, so this
        # removes every column from the dataframe - confirm that is really
        # the intent of "drop the column names row"
        df.drop(labels=collist, axis=1, inplace=True)

        if (display):
            #make scriptable
            # fixed: the original script line imported change_column_names
            # (without the "import" keyword) and called drop_column_names -
            # neither of which would replay this step
            add_to_script(["# drop column names row",
                           "from dfcleanser.data_transform.data_transform_dataframe_control import drop_column_names_row",
                           "drop_column_names_row(False)"], opstat)

    except Exception as e:
        opstat.store_exception("Unable to change column names ", e)

    return (opstat)
def sort_df_index(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : sort df indices
    *
    * parms :
    *  parms   - transform parms
    *  display - display flag
    *
    * returns :
    *  opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    fparms = get_parms_for_input(parms, dftw.df_sort_index_transform_input_idList)

    levels = fparms[0]
    if (len(levels) > 0):
        # level names arrive as a bracketed string : "[a,b]" -> ["a","b"]
        levels = levels.lstrip("[")
        levels = levels.rstrip("]")
        levels = levels.split(",")
    else:
        levels = None

    ascending = (fparms[2] == "True")
    kind = fparms[3]
    na_position = fparms[4]

    if (opstat.get_status()):
        try:
            df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
            df.sort_index(axis=0, level=levels, ascending=ascending,
                          inplace=True, kind=kind, na_position=na_position)

            if (display):
                #make scriptable
                # fixed: the from line was missing the "import" keyword and
                # the script comment described the wrong operation
                add_to_script(["# sort df index",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import sort_df_index",
                               "sort_df_index(" + json.dumps(parms[1]) + ",False)"], opstat)

        except Exception as e:
            opstat.store_exception("Unable to sort df index : ", e)

    return (opstat)
def change_column_names(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : change column names
    *
    * parms :
    *  parms   - transform parms
    *  display - display flag
    *
    * returns :
    *  opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    try:
        fparms = get_parms_for_input(parms, dftw.df_change_row_transform_input_idList)

        ccolname = fparms[0]
        ncolname = fparms[1]

        if ((len(ccolname) < 1) or (len(ncolname) < 1)):
            opstat.set_status(False)
            if (len(ccolname) < 1):
                opstat.set_errorMsg("current_column_name is invalid")
            else:
                # fixed: both branches originally reported
                # "current_column_name is invalid"
                opstat.set_errorMsg("new_column_name is invalid")
        else:
            collist = cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns.tolist()

            try:
                found = collist.index(ccolname)
            except Exception:
                opstat.set_status(False)
                opstat.set_errorMsg("current_column_name is not in df")

            if (opstat.get_status()):
                collist[found] = ncolname
                cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns = collist

                if (display):
                    #make scriptable
                    # fixed: the from line was missing the "import" keyword
                    add_to_script(["# change column names",
                                   "from dfcleanser.data_transform.data_transform_dataframe_control import change_column_names",
                                   "change_column_names(" + json.dumps(parms) + ",False)"], opstat)

    except Exception as e:
        opstat.store_exception("Unable to change column names ", e)

    return (opstat)
def drop_nan_cols(df, threshold, ttype, opstat, display=True):
    """
    * --------------------------------------------------------------------------
    * function : drop cols with nans greater than or equal to a threshold
    *
    * parms :
    *  df        - dataframe
    *  threshold - threshold value (percent of rows when ttype is
    *              dim.BY_PERCENT, otherwise an absolute nan count)
    *  ttype     - threshold type
    *  opstat    - opStatus object used to record any exception
    *  display   - show the running clock and record the script line
    *
    * returns :
    *  number of columns dropped
    * --------------------------------------------------------
    """
    import math

    if (display):
        clock = RunningClock()
        clock.start()

    # bound before the try so the return below can never hit an unbound
    # local when an exception fires early (original defect)
    droplist = []

    try:
        if (ttype == dim.BY_PERCENT):
            # percent threshold is taken against the number of rows
            thold = math.floor(len(df) * (float(threshold) * 0.01))
        else:
            thold = math.floor(float(threshold))

        df_cols = df.columns
        colswithnulls = df.isnull().sum()

        for i in range(len(colswithnulls)):
            if (colswithnulls[i] >= thold):
                droplist.append(df_cols[i])

        if (len(droplist) > 0):
            df.drop(droplist, axis=1, inplace=True)

    except Exception as e:
        opstat.store_exception("Error dropping nan cols\n ", e)

    if (display):
        clock.stop()

        #make scriptable
        # NOTE(review): the replayed call passes only (threshold, False) and
        # omits df, ttype and opstat - confirm the replay signature
        add_to_script([
            "# Drop NAN Cols ",
            "from dfcleanser.data_inspection.data_inspection_control import drop_nan_cols",
            "drop_nan_cols(" + str(threshold) + ",False)"
        ], opstat)

    return (len(droplist))
def save_column_names_row(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : save column names row to a file
    *
    * parms :
    *  parms   - transform parms
    *  display - display flag
    *
    * returns :
    *  [opstat, filename] - operation status and the file written
    * --------------------------------------------------------
    """
    opstat = opStatus()

    # bound before the try so the except message and the return value never
    # reference an unbound local (original defect)
    filename = ""

    try:
        fparms = get_parms_for_input(parms, dftw.df_save_row_transform_input_idList)

        filename = fparms[0]
        if (len(filename) == 0):
            # default the filename from the current imported data source
            filename = "./" + cfg.get_config_value(cfg.CURRENT_IMPORTED_DATA_SOURCE_KEY)
            filename = filename + "_column_names.json"

        # see if save col names row
        if (len(filename) > 0):
            colids = cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns.tolist()

            # save the column names row as a json file (the with-block
            # closes the file; the explicit close() was redundant)
            with open(filename, 'w') as colid_file:
                json.dump(colids, colid_file)

        if (display):
            #make scriptable
            # fixed: the from line was missing the "import" keyword
            add_to_script(["# save column names row",
                           "from dfcleanser.data_transform.data_transform_dataframe_control import save_column_names_row",
                           "save_column_names_row(" + json.dumps(parms) + ",False)"], opstat)

    except Exception as e:
        opstat.store_exception("Unable to save column names file to : " + filename, e)

    return ([opstat, filename])
def process_custom_export(fparms, exportId, display=True):
    """
    * --------------------------------------------------------------------------
    * function : custom export - execute user-supplied export code
    *
    * parms :
    *  fparms   - export parms; fparms[0] is the custom code text
    *  exportId - export id used as the config key prefix
    *  display  - record the script line when True
    *
    * returns : opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    # NOTE(review): newlines are replaced with "<br/>" BEFORE the exec
    # below, so multi-line custom code would become a single invalid line -
    # confirm whether this replace was meant only for script storage
    fparms[0] = fparms[0].replace("\n", "<br/>")

    try:
        # SECURITY(review): exec of caller-supplied text - confirm this
        # only ever runs the notebook user's own code
        exec(fparms[0])
    except Exception as e:
        opstat.store_exception("Unable to export custom", e)

    if (opstat.get_status()):

        if (display):
            #make scriptable
            script = [
                "# Export Custom ",
                "from dfcleanser.data_export.data_export_control import process_custom_export",
                "process_custom_export(" + json.dumps(fparms) + "," +
                str(exportId) + ",False)"
            ]
            add_to_script(script, opstat)

        if (len(fparms) > 0):
            # remember the custom export parms under the export id
            cfg.set_config_value(exportId + "Parms", "custom")
            cfg.set_config_value(cfg.CURRENT_EXPORTED_FILE_NAME_KEY,
                                 "custom", True)

    return (opstat)
def export_pandas_sqltable(sqltableparms, dbcondict, exportid, display=True):
    """
    * --------------------------------------------------------------------------
    * function : export pandas dataframe into sql table
    *
    * parms :
    *  sqltableparms - export parms : [df name, table name, to_sql parms,
    *                  ..., additional-parms json at index 8]
    *  dbcondict     - db connector dict (None -> use stored current db id)
    *  exportid      - export id used as the config key prefix
    *  display       - record the script line when True
    *
    * returns : (export_notes, opstat) - connector string (or "") and status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    import dfcleanser.common.db_utils as dbu
    dbcon = dbu.dbConnector()

    from dfcleanser.common.db_utils import grab_connection_parms

    # no connector dict supplied : rebuild one from the stored current db id
    if (dbcondict == None):
        parmslist = get_stored_con_Parms(
            cfg.get_config_value(cfg.CURRENT_DB_ID_KEY))
        dbcondict = set_dbcon_dict(cfg.get_config_value(cfg.CURRENT_DB_ID_KEY),
                                   parmslist)
        dbconparms = grab_connection_parms(dbcondict)
    else:
        dbconparms = grab_connection_parms(dbcondict)

    dbcon.set_ConnectionParms(dbconparms)
    dbconnector = dbcon.connect_to_db(dbu.SQLALCHEMY, opstat)

    if (opstat.get_status()):

        if (len(sqltableparms) == 0):
            opstat.set_status(False)
            opstat.set_errorMsg("No Export parameters defined")
        elif (sqltableparms[0] == ""):
            opstat.set_status(False)
            # fixed typo : "selcted"
            opstat.set_errorMsg("No dataframe selected to export")
        elif (sqltableparms[1] == ""):
            opstat.set_status(False)
            # fixed typo : "tabl;e selcted"
            opstat.set_errorMsg("No table selected to export to")
        else:
            df = cfg.get_dfc_dataframe_df(sqltableparms[0])
            labellist = dew.pandas_export_sqltable_labelList

            try:
                # keyword names, values and types for the to_sql parms
                sqlkeys = [labellist[2], labellist[3], labellist[4],
                           labellist[5], labellist[6], labellist[7]]
                sqlvals = [sqltableparms[2], sqltableparms[3], sqltableparms[4],
                           sqltableparms[5], sqltableparms[6], sqltableparms[7]]
                sqltypes = [STRING_PARM, STRING_PARM, BOOLEAN_PARM,
                            STRING_PARM, INT_PARM, DICT_PARM]
                sqlparms = {}
                sqladdlparms = {}
            except Exception as e:
                opstat.set_status(False)
                opstat.store_exception("Error parsing Export parms", e)

            if (opstat.get_status()):
                try:
                    sqlparms = get_function_parms(sqlkeys, sqlvals, sqltypes)

                    if (not (sqltableparms[8] == "")):
                        sqladdlparms = json.loads(sqltableparms[8])
                        # fixed: dict.keys() is not indexable in Python 3 -
                        # the original indexed loop raised TypeError; merge
                        # the additional parms directly instead
                        sqlparms.update(sqladdlparms)

                except Exception as e:
                    opstat.set_status(False)
                    opstat.store_exception(
                        "Error parsing Export additional parms", e)

                if (opstat.get_status()):
                    try:
                        df.to_sql(sqltableparms[1], dbconnector, **sqlparms)
                    except Exception as e:
                        opstat.store_exception(
                            "Unable to export to sql table", e)

    export_notes = ""

    if (opstat.get_status()):

        if (display):
            #make scriptable
            # fixed: the from line read "... export export_pandas_sqltable"
            add_to_script([
                "# Export SQL Table ",
                "from dfcleanser.data_export.data_export_control import export_pandas_sqltable",
                "export_pandas_sqltable(" + json.dumps(sqltableparms) + "," +
                json.dumps(dbcondict) + "," + str(exportid) + ",False)"
            ], opstat)

        export_notes = dbu.get_SQLAlchemy_connector_string(dbconparms)

        if (len(sqltableparms) > 0):
            cfg.set_config_value(exportid + "Parms", sqltableparms)
            cfg.set_config_value(cfg.CURRENT_EXPORTED_FILE_NAME_KEY,
                                 sqltableparms[0], True)

    return (export_notes, opstat)
def export_pandas_html(fparms, exportId, labellist, display=True):
    """
    * --------------------------------------------------------------------------
    * function : pandas html export
    *
    * parms :
    *  fparms    - export parms : [df name, file name, to_html parms, ...,
    *              additional-parms json at index 6]
    *  exportId  - export id used as the config key prefix
    *  labellist - parm label list
    *  display   - record the script line when True
    *
    * returns : opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    if (len(fparms) == 0):
        opstat.set_status(False)
        opstat.set_errorMsg("No Export parameters defined")
    else:
        try:
            # keyword names, values and types for the to_html parms
            htmlkeys = [labellist[2], labellist[3], labellist[4], labellist[5]]
            htmlvals = [fparms[2], fparms[3], fparms[4], fparms[5]]
            htmltypes = [INT_PARM, BOOLEAN_PARM, BOOLEAN_PARM, STRING_PARM]
            htmlparms = {}
            htmladdlparms = {}
        except Exception as e:
            opstat.store_exception("Error parsing import parms", e)

        if (opstat.get_status()):
            try:
                htmlparms = get_function_parms(htmlkeys, htmlvals, htmltypes)

                if (not (fparms[6] == "")):
                    htmladdlparms = json.loads(fparms[6])
                    # fixed: dict.keys() is not indexable in Python 3 - the
                    # original indexed loop raised TypeError; merge directly
                    htmlparms.update(htmladdlparms)

            except Exception as e:
                opstat.store_exception("Unable to get additional parms", e)

        if (opstat.get_status()):
            if (fparms[0] == ""):
                opstat.set_status(False)
                # fixed typo : "slected"
                opstat.set_errorMsg("No dataframe selected")
            else:
                df = cfg.get_dfc_dataframe_df(fparms[0])

                try:
                    if (len(htmlparms) > 0):
                        df.to_html(fparms[1], **htmlparms)
                    else:
                        df.to_html(fparms[1])
                except Exception as e:
                    opstat.store_exception(
                        "Unable to export html file" + fparms[0], e)

        if (opstat.get_status()):

            if (display):
                #make scriptable
                # fixed: the import line was missing the leading "from "
                script = [
                    "# Export HTML File ",
                    "from dfcleanser.data_export.data_export_control import export_pandas_html",
                    "export_pandas_html(" + json.dumps(fparms) + "," +
                    str(exportId) + "," + json.dumps(labellist) + ",False)"
                ]
                add_to_script(script, opstat)

            if (len(fparms) > 0):
                cfg.set_config_value(exportId + "Parms", fparms)
                cfg.set_config_value(cfg.CURRENT_EXPORTED_FILE_NAME_KEY,
                                     fparms[0], True)

    return (opstat)
def drop_duplicate_rows(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : drop df duplicate rows
    *
    * parms :
    *  parms   - transform parms
    *  display - display flag
    *
    * returns :
    *  opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    fparms = get_parms_for_input(parms, dftw.df_drop_dups_transform_input_idList)

    colnames = fparms[0]
    if (len(colnames) == 0):
        colnames = None

    drop = (fparms[2] == "Drop")

    keep = fparms[3]
    if (keep == "False"):
        keep = False

    df = cfg.get_current_chapter_df(cfg.DataTransform_ID)

    if (not (colnames is None)):
        if (not drop):
            # "keep" mode : consider every column NOT named in colnames
            # NOTE(review): colnames is still the raw input string here, so
            # "in colnames" is a substring test - confirm intended
            fcolnames = []
            colslist = df.columns.tolist()
            for i in range(len(colslist)):
                if (not (colslist[i] in colnames)):
                    fcolnames.append(colslist[i])
            colnames = fcolnames

    if (opstat.get_status()):
        try:
            df.drop_duplicates(colnames, keep=keep, inplace=True)

            if (display):
                #make scriptable
                # fixed: the from line was missing the "import" keyword
                add_to_script(["# drop duplicate rows",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import drop_duplicate_rows",
                               "drop_duplicate_rows(" + json.dumps(parms) + ",False)"], opstat)

        except Exception as e:
            # fixed: colnames may be None or a list - the original str
            # concatenation raised TypeError inside the handler
            opstat.store_exception("Unable to drop duplicate rows : " + str(colnames), e)

    return (opstat)
def process_sort_by_column(parms,display=True) :
    """
    * --------------------------------------------------------------------------
    * function : sort by column transform option
    *
    * parms :
    *  parms    - associated parms
    *  display  - display results flag
    *
    * returns :
    *  opstat - operation status
    * --------------------------------------------------------
    """

    opstat = opStatus()

    fparms = get_parms_for_input(parms,dftw.sort_column_input_idList)

    colname = fparms[0]

    sortorder = fparms[1]
    if(sortorder == "True") :
        sortorder = True
    else :
        sortorder = False

    # strip the single quotes the input form wraps around these values
    sortkind = fparms[2]
    sortkind = sortkind.lstrip("'")
    sortkind = sortkind.rstrip("'")

    naposition = fparms[3]
    naposition = naposition.lstrip("'")
    naposition = naposition.rstrip("'")

    resetrowids = fparms[4]
    if(resetrowids == "True") :
        resetrowids = True
    else :
        resetrowids = False

    if(opstat.get_status()) :

        try :

            df = cfg.get_current_chapter_df(cfg.DataTransform_ID)
            df.sort_values(colname,axis=0,ascending=sortorder,inplace=True,kind=sortkind,na_position=naposition)

            if(resetrowids) :
                from dfcleanser.data_transform.data_transform_dataframe_control import reset_df_index
                # NOTE(review): reset_df_index is defined in this module as
                # reset_df_index(parms, display=True) - calling it with no
                # arguments raises TypeError, which lands in the except
                # below; confirm the intended parms for this call
                opstat = reset_df_index()

            if(display) :
                #make scriptable
                add_to_script(["# sort by column ",
                               "from dfcleanser.data_transform.data_transform_columns_control import process_sort_by_column",
                               "process_sort_by_column(" + json.dumps(parms) + ",False)"],opstat)

            # NOTE(review): a SUCCESS message stored via set_errorMsg -
            # presumably the UI renders errorMsg as a status line; verify
            opstat.set_errorMsg("df sorted by column '" + colname + "' successfully.")

        except Exception as e:
            opstat.store_exception("Sort df By Column Error : "+colname,e)

    # clear the cached input parms for this transform
    cfg.drop_config_value(dftw.sort_column_input_id+"Parms")

    return(opstat)
def append_to_df_index(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : append column to df indices
    *
    * parms :
    *  parms   - transform parms
    *  display - display flag
    *
    * returns :
    *  opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    fparms = get_parms_for_input(parms, dftw.df_append_index_transform_input_idList)

    # column names arrive as a bracketed string : "[a,b]" -> ["a","b"]
    colnames = fparms[0]
    colnames = colnames.lstrip("[")
    colnames = colnames.rstrip("]")
    colnames = colnames.split(",")

    if (len(colnames) == 0):
        # NOTE(review): str.split never returns an empty list (empty input
        # yields [""]), so this branch is unreachable as written
        opstat.set_status(False)
        opstat.set_errorMsg("column names list is empty")
    else:
        df = cfg.get_current_chapter_df(cfg.DataTransform_ID)

        drop = (fparms[2] == "True")
        verify = (fparms[3] == "True")

        try:
            # append=True adds the columns to the existing index levels
            df.set_index(keys=colnames, drop=drop, append=True,
                         inplace=True, verify_integrity=verify)

            if (display):
                #make scriptable
                # fixed: the from line was missing the "import" keyword
                add_to_script(["# append to df index",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import append_to_df_index",
                               "append_to_df_index(" + json.dumps(parms[1]) + ",False)"], opstat)

        except Exception as e:
            # fixed: colnames is a list - the original str concatenation
            # raised TypeError inside the handler
            opstat.store_exception("Unable to append to df index : " + str(colnames), e)

    return (opstat)
def set_df_index(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : set df indices
    *
    * parms :
    *  parms   - transform parms
    *  display - display flag
    *
    * returns :
    *  opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    fparms = get_parms_for_input(parms, dftw.df_set_index_transform_input_idList)

    colnames = fparms[0]
    if (len(colnames) == 0):
        opstat.set_status(False)
        opstat.set_errorMsg("column names list is empty")
    else:
        # column names arrive as a bracketed string : "[a,b]" -> ["a","b"]
        colnames = colnames.lstrip("[")
        colnames = colnames.rstrip("]")
        colnames = colnames.split(",")

        drop = (fparms[2] == "True")
        verify = (fparms[3] == "True")

        if (opstat.get_status()):
            try:
                df = cfg.get_current_chapter_df(cfg.DataTransform_ID)

                # NOTE(review): append=True appends the columns to the
                # existing index rather than replacing it - confirm that is
                # intended for a "set index" operation
                df.set_index(colnames, drop=drop, append=True,
                             inplace=True, verify_integrity=verify)
                cfg.set_dfc_dataframe_df(
                    cfg.get_config_value(cfg.CURRENT_TRANSFORM_DF), df)

                if (display):
                    #make scriptable
                    # fixed: the from line was missing the "import" keyword
                    add_to_script(["# set df index",
                                   "from dfcleanser.data_transform.data_transform_dataframe_control import set_df_index",
                                   "set_df_index(" + json.dumps(parms[1]) + ",False)"], opstat)

            except Exception as e:
                opstat.store_exception("Unable to set index of column(s) : " + str(colnames), e)

    return (opstat)
def reset_df_index(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : reset df indices
    *
    * parms :
    *  parms   - transform parms
    *  display - display flag
    *
    * returns :
    *  opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    df = cfg.get_current_chapter_df(cfg.DataTransform_ID)

    fparms = get_parms_for_input(parms, dftw.df_reset_index_transform_input_idList)

    drop_levels = fparms[0]
    if (len(drop_levels) > 0):
        # level names arrive as a bracketed string : "[a,b]" -> ["a","b"]
        drop_levels = drop_levels.lstrip("[")
        drop_levels = drop_levels.rstrip("]")
        drop_levels = drop_levels.split(",")

        if (drop_levels[0] == "All"):
            # "All" : expand to every named level of the current index
            drop_levels = []
            index_columns = df.index.names
            if (len(index_columns) > 0):
                for i in range(len(index_columns)):
                    if (not (index_columns[i] is None)):
                        drop_levels.append(index_columns[i])
    else:
        drop_levels = None

    # NOTE(review): the flag is inverted relative to pandas' drop parm -
    # fparms[2] == "True" yields drop=False; presumably the UI asks
    # "keep as column?" - confirm against the input widget
    if (fparms[2] == "True"):
        drop = False
    else:
        drop = True

    if (opstat.get_status()):
        try:
            df.reset_index(level=drop_levels, drop=drop, inplace=True)

            if (display):
                #make scriptable
                # fixed: the from line was missing the "import" keyword
                add_to_script(["# reset df index",
                               "from dfcleanser.data_transform.data_transform_dataframe_control import reset_df_index",
                               "reset_df_index(" + json.dumps(parms[1]) + ",False)"], opstat)

        except Exception as e:
            opstat.store_exception("Unable to reset df index : ", e)

    return (opstat)
def add_column_names_row(parms, display=True):
    """
    * --------------------------------------------------------------------------
    * function : add a column names row
    *
    * parms :
    *  parms   - transform parms : [filename, column list]
    *  display - display flag
    *
    * returns :
    *  opstat - operation status
    * --------------------------------------------------------
    """
    opstat = opStatus()

    try:
        fparms = get_parms_for_input(parms, dftw.df_add_row_transform_input_idList)

        filename = fparms[0]
        collist = fparms[1]

        if (len(filename) == 0):
            filename = "None"
        if (len(collist) == 0):
            collist = "None"
        else:
            collist = collist.replace("'", "")
            collist = collist.split(",")

        if ((not (filename == "None")) or (not (collist == "None"))):

            # a filename takes precedence over an explicit column list
            if (not (filename == "None")):
                try:
                    with open(filename, 'r') as colid_file:
                        colids = json.load(colid_file)
                except Exception as e:
                    # flag failure so we don't fall through to using an
                    # unbound colids below (original defect)
                    opstat.set_status(False)
                    opstat.store_exception("Unable to open column names file" + filename, e)
            else:
                colids = collist

            if (opstat.get_status()):
                cfg.get_current_chapter_df(cfg.CURRENT_TRANSFORM_DF).columns = colids

                if (display):
                    #make scriptable
                    # fixed: the from line was missing the "import" keyword
                    add_to_script(["# Add Column Names Row",
                                   "from dfcleanser.data_transform.data_transform_dataframe_control import add_column_names_row",
                                   "add_column_names_row(" + single_quote(filename) + "," + json.dumps(collist) + ",False)"], opstat)

        else:
            opstat.set_status(False)
            opstat.set_errorMsg("No Column List or filename defined")

    except Exception as e:
        opstat.store_exception("Unable to add column names", e)

    return (opstat)