def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True,
                  mismatch=None, advanced_stats=True):
    """
    Return statistical information about specific columns in json format.

    :param df: Dataframe to be processed
    :param columns: Columns that you want to profile
    :param buckets: Create buckets divided by range. Each bin is equal.
    :param infer: try to infer the column dataType
    :param relative_error: relative error when the percentile is calculated.
        0 more precision/slow 1 less precision/faster
    :param approx_count: Use the function approx_count_distinct or countDistinct.
        approx_count_distinct is faster
    :param mismatch: optional mismatch spec forwarded to count_by_dtypes
    :param advanced_stats: if True also compute the expensive aggregations
        (stddev, kurtosis, percentiles, ...) via columns_agg
    :return: json object (dict) keyed by metadata and per-column info
    """
    columns = parse_columns(df, columns)

    # Initialize Objects
    logger.print("Processing Stats For columns...")

    # Get columns data types. This is necessary to make the pertinent histogram calculations.
    count_by_data_type = df.cols.count_by_dtypes(columns, infer=infer, mismatch=mismatch)
    # Deep copy so the "mismatch" entry can be dropped without mutating the
    # original counts, which are stored verbatim under each column's "stats".
    count_by_data_type_no_mismatch = copy.deepcopy(count_by_data_type)

    # Info from all the columns
    type_details = {}

    for col_name in columns:
        # Do not let the mismatch counter compete when electing the dominant dtype
        if "mismatch" in count_by_data_type_no_mismatch[col_name]:
            count_by_data_type_no_mismatch[col_name].pop("mismatch")

        # Get the greatest count by column data type (the column's dominant dtype)
        greatest_data_type_count = max(count_by_data_type_no_mismatch[col_name],
                                       key=count_by_data_type_no_mismatch[col_name].get)
        # Map the raw python dtype to a profiler category (e.g. numeric/categorical)
        cat = PYTHON_TO_PROFILER.get(greatest_data_type_count)

        assign(type_details, col_name + ".dtype", greatest_data_type_count, dict)
        assign(type_details, col_name + ".type", cat, dict)
        assign(type_details, col_name + ".stats", count_by_data_type[col_name], dict)

    # Count the categorical, numerical, boolean and date columns
    count_types = {}
    for value in type_details.values():
        name = value["dtype"]
        if name in count_types:
            count_types[name] += 1
        else:
            count_types[name] = 1

    # List the data types this data set have
    dtypes = [key for key, value in count_types.items() if value > 0]

    columns_info = {}
    columns_info["count_types"] = fill_missing_col_types(count_types)
    columns_info["total_count_dtypes"] = len(dtypes)
    columns_info["dtypes_list"] = dtypes
    columns_info["columns"] = type_details

    # Aggregation
    stats = self.columns_agg(df, columns, buckets, relative_error, approx_count, advanced_stats)

    # Calculate Frequency (only for non-numeric columns: numeric ones get histograms)
    logger.print("Processing Frequency ...")
    df_freq = df.cols.select(columns, data_type=PYSPARK_NUMERIC_TYPES, invert=True)
    freq = None
    if df_freq is not None:
        freq = df_freq.cols.frequency("*", buckets, True, self.rows_count)

    for col_name in columns:
        col_info = {}
        assign(col_info, "stats", stats[col_name], dict)

        if freq is not None:
            if col_name in freq:
                assign(col_info, "frequency", freq[col_name])

        assign(col_info, "name", col_name)
        assign(col_info, "column_dtype", columns_info["columns"][col_name]['dtype'])
        assign(col_info, "dtypes_stats", columns_info["columns"][col_name]['stats'])
        assign(col_info, "column_type", columns_info["columns"][col_name]['type'])
        assign(columns_info, "columns." + col_name, col_info, dict)

        # NOTE: assigned after col_info is linked into columns_info; this still
        # shows up in the output because assign stores the same dict reference.
        assign(col_info, "id", df.cols.get_meta(col_name, "id"))

    return columns_info
def __init__(self, session=None, master="local[*]", app_name="optimus", checkpoint=False, path=None,
             file_system="local", verbose=False, server=False, repositories=None, packages=None, jars=None,
             driver_class_path=None, options=None, additional_options=None, comm=None, load_avro=False,
             cache=True):
    """
    Transform and roll out

    :param session: An existing Spark session to reuse. If None a new one is created.
    :param master: 'Master', 'local' or ip address to a cluster
    :param app_name: Spark app name
    :param checkpoint: If True create a checkpoint folder
    :param path: path to the checkpoint folder
    :param file_system: 'local' or 'hadoop'
    :param verbose: enable verbose logging
    :param server: if True start the Optimus server
    :param repositories: List of additional maven repositories for package lookup.
    :type repositories: (list[str])
    :param packages: Spark packages that should be installed.
    :type packages: (list[str])
    :param jars: Full paths to jar files that we want to include to the session.
    :type jars: (list[str])
    :param driver_class_path: Full paths to jars to put on the driver class path.
    :type driver_class_path: (list[str])
    :param options: Configuration options that are passed to spark-submit.
        See `the list of possible options
        <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
        Note that any options set already through PYSPARK_SUBMIT_ARGS will override these.
    :type options: (dict[str,str])
    :param additional_options:
    :param comm: if True create a Comm instance, otherwise use the value as the instance
    :param load_avro: "2.4" or "2.3" to load the matching avro spark package
    :param cache: enable dataframe caching
    """
    self.preserve = False

    Optimus.cache = cache

    # FIX: jars/driver_class_path previously defaulted to mutable lists ([]),
    # which are shared across all calls. Default to None and normalize here.
    jars = [] if jars is None else jars
    driver_class_path = [] if driver_class_path is None else driver_class_path

    if comm is True:
        Comm.instance = Comm()
    else:
        Comm.instance = comm

    if session is None:
        # Creating Spark Session
        # If a Spark session is not passed by argument create one
        self.master = master
        self.app_name = app_name

        if options is None:
            options = {}
        self.options = options

        # Initialize as lists
        self.packages = val_to_list(packages)
        self.repositories = val_to_list(repositories)
        self.jars = val_to_list(jars)
        self.driver_class_path = val_to_list(driver_class_path)

        self.additional_options = additional_options

        self.verbose(verbose)

        # Because avro depends on an external package you can decide if it should be loaded
        if load_avro == "2.4":
            self._add_spark_packages(["org.apache.spark:spark-avro_2.12:2.4.3"])
        elif load_avro == "2.3":
            self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

        # JDBC drivers bundled with Optimus, registered on both the jar list
        # and the driver class path so connections work in local mode too.
        jdbc_jars = [
            "/jars/RedshiftJDBC42-1.2.16.1027.jar",
            "/jars/mysql-connector-java-8.0.16.jar",
            "/jars/ojdbc8.jar",
            "/jars/postgresql-42.2.5.jar",
            "/jars/presto-jdbc-0.224.jar",
            "/jars/spark-cassandra-connector_2.11-2.4.1.jar",
            "/jars/sqlite-jdbc-3.27.2.1.jar",
            "/jars/mssql-jdbc-7.4.1.jre8.jar",
        ]

        self._add_jars(absolute_path(jdbc_jars, "uri"))
        self._add_driver_class_path(absolute_path(jdbc_jars, "posix"))

        self._create_session()

        if path is None:
            path = os.getcwd()

        if checkpoint is True:
            self._set_check_point_folder(path, file_system)
    else:
        # If a session is passed by arguments just save the reference
        Spark.instance = Spark().load(session)

    # Initialize Spark
    logger.print("""
             ____        __  _
            / __ \____  / /_(_)___ ___  __  _______
           / / / / __ \/ __/ / __ `__ \/ / / / ___/
          / /_/ / /_/ / /_/ / / / / / / /_/ (__  )
          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/
              /_/
              """)

    logger.print(STARTING_OPTIMUS)

    if server:
        logger.print("Starting Optimus Server...")
        s = Server()
        s.start()
        self.server_instance = s

    logger.print(SUCCESS)

    self.create = Create()
    self.load = Load()
    self.read = self.spark.read

    # Create singleton profiler
    Profiler.instance = Profiler()
    self.profiler = Profiler.instance
    self.ml = ML()

    # Set global output as html
    self.output("html")
def print_check_point_config(filesystem): logger.print( "Setting checkpoint folder %s. If you are in a cluster initialize Optimus with master='your_ip' as param", filesystem)
def _set_check_point_folder(path, file_system): """ Function that receives a workspace path where a folder is created. This folder will store temporal dataframes when user writes the .checkPoint(). :param path: Location of the dataset (string). :param file_system: Describes if file system is local or hadoop file system. """ print_check_point_config(file_system) if file_system == "hadoop": folder_path = path + "/" + "checkPointFolder" Optimus.delete_check_point_folder(path=path, file_system=file_system) # Creating file: logger.print("Creating the hadoop folder...") command = "hadoop fs -mkdir " + folder_path logger.print("$" + command) os.system(command) logger.print("Hadoop folder created. \n") logger.print("Setting created folder as checkpoint folder...") Spark.instance.sc.setCheckpointDir(folder_path) elif file_system == "local": # Folder path: folder_path = path + "/" + "checkPointFolder" # Checking if tempFolder exits: logger.print("Deleting previous folder if exists...") if os.path.isdir(folder_path): # Deletes folder if exits: rmtree(folder_path) logger.print("Creating the checkpoint directory...") # Creates new folder: os.mkdir(folder_path) Spark.instance.sc.setCheckpointDir(dirName="file:///" + folder_path) else: RaiseIt.value_error(file_system, ["hadoop", "local"])
def delete_check_point_folder(path, file_system): """ Function that deletes the temporal folder where temp files were stored. The path required is the same provided by user in setCheckPointFolder(). :param path: path where the info will be saved :param file_system: Describes if file system is local or hadoop file system. :return: """ if file_system == "hadoop": # Folder path: folder_path = path + "/" + "checkPointFolder" logger.print("Deleting checkpoint folder...") command = "hadoop fs -rm -r " + folder_path os.system(command) logger.print("$" + command) logger.print("Folder deleted.") elif file_system == "local": logger.print("Deleting checkpoint folder...") # Folder path: folder_path = path + "/" + "checkPointFolder" # Checking if tempFolder exits: if os.path.isdir(folder_path): # Deletes folder if exits: rmtree(folder_path) # Creates new folder: logger.print("Folder deleted.") else: logger.print("Folder deleted.") else: RaiseIt.value_error(file_system, ["hadoop", "local"])
def columns_agg(df, columns, buckets=10, relative_error=RELATIVE_ERROR, approx_count=True):
    """
    Compute aggregate statistics (uniques, min/max, stddev, kurtosis, mean,
    skewness, sum, variance, zeros, percentiles, nulls) plus histograms for
    the given columns. Columns are processed in batches of BATCH_SIZE because
    sending 100+ columns in a single aggregation causes problems.

    :param df: Dataframe to be processed
    :param columns: Columns to aggregate
    :param buckets: histogram bucket count (may be reduced per column, capped at MAX_BUCKETS)
    :param relative_error: relative error for percentile calculation
    :param approx_count: use approx_count_distinct instead of countDistinct
    :return: dict keyed by column name with the merged aggregation results
    """
    columns = parse_columns(df, columns)
    n = BATCH_SIZE
    list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]
    # we have problems sending +100 columns at the same time. Process in batch
    result = {}
    for i, cols in enumerate(list_columns):
        logger.print("Batch Stats {BATCH_NUMBER}. Processing columns{COLUMNS}".
                     format(BATCH_NUMBER=i, COLUMNS=cols))

        funcs = [count_uniques_agg]
        exprs = df.cols.create_exprs(cols, funcs, approx_count)

        # TODO: in basic calculations funcs = [F.min, F.max]
        funcs = [F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance, zeros_agg]
        exprs.extend(df.cols.create_exprs(cols, funcs))

        # TODO: None in basic calculation
        funcs = [percentile_agg]
        exprs.extend(df.cols.create_exprs(cols, funcs, df, [0.05, 0.25, 0.5, 0.75, 0.95], relative_error))

        funcs = [count_na_agg]
        exprs.extend(df.cols.create_exprs(cols, funcs, df))
        result.update(df.cols.exec_agg(exprs))

    # Histogram pass: one expression per numeric column, re-batched.
    exprs = []
    n = BATCH_SIZE
    result_hist = {}
    list_columns = [columns[i * n:(i + 1) * n] for i in range((len(columns) + n - 1) // n)]
    for i, cols in enumerate(list_columns):
        logger.print("Batch Histogram {BATCH_NUMBER}. Processing columns{COLUMNS}".
                     format(BATCH_NUMBER=i, COLUMNS=cols))
        funcs = [hist_agg]
        for col_name in cols:
            # Only process histograms for numeric columns. Other data types use frequency.
            if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                min_max = {
                    "min": result[col_name]["min"],
                    "max": result[col_name]["max"]
                }
                # Derive the bucket count from the number of uniques, capped at MAX_BUCKETS
                buckets = result[col_name]["count_uniques"] - 1
                if buckets > MAX_BUCKETS:
                    buckets = MAX_BUCKETS
                elif buckets == 0:
                    buckets = 1
                exprs.extend(df.cols.create_exprs(col_name, funcs, df, buckets, min_max))
        # NOTE(review): exprs is never reset between histogram batches, so each
        # batch re-executes every earlier batch's expressions — confirm whether
        # this accumulation is intended or a performance bug.
        agg_result = df.cols.exec_agg(exprs)
        if agg_result is not None:
            result_hist.update(agg_result)

    # Merge the histogram results into the per-column stats
    for col_name in result:
        if col_name in result_hist:
            result[col_name].update(result_hist[col_name])
    return result
def create(self, obj, method, suffix=None, output="df", additional_method=None, *args, **kwargs):
    """
    This is a helper function that outputs python tests for Spark Dataframes.

    :param obj: Object to be tested
    :param method: Method to be tested
    :param suffix: The test name will be created using the method param.
        suffix will add a string in case you want to customize the test name.
    :param output: can be a 'df' or a 'json'
    :param additional_method: extra no-arg method chained after the main call
    :param args: Arguments to be used in the method
    :param kwargs: Keyword arguments to be used in the functions
    :return:
    """
    buffer = []

    def add_buffer(value):
        # Every generated line is indented one tab inside the test function
        buffer.append("\t" + value)

    # Create the test name from method + optional additional method + suffix
    name = []
    if method is not None:
        name.append(method.replace(".", "_"))
    if additional_method is not None:
        name.append(additional_method)
    if suffix is not None:
        name.append(suffix)

    test_name = "_".join(name)

    func_test_name = "test_" + test_name + "()"
    filename = test_name + ".test"

    print("Creating {test} test function...".format(test=func_test_name))
    logger.print(func_test_name)

    add_buffer("@staticmethod\n")
    add_buffer("def " + func_test_name + ":\n")

    source = "source_df"
    if obj is None:
        # Use the main df
        df_func = self.df
    elif isinstance(obj, pyspark.sql.dataframe.DataFrame):
        source_df = "\tsource_df=op.create.df(" + obj.export() + ")\n"
        df_func = obj
        add_buffer(source_df)
    else:
        source = get_var_name(obj)
        df_func = obj

    # Render positional arguments as source code
    _args = []
    for v in args:
        if is_str(v):
            _args.append("'" + v + "'")
        elif is_numeric(v):
            _args.append(str(v))
        elif is_list(v):
            if is_list_of_strings(v):
                lst = ["'" + x + "'" for x in v]
            else:
                # FIX: 'lst' was unbound (NameError) for lists whose elements
                # were neither strings, numbers nor tuples; str() covers all.
                lst = [str(x) for x in v]
            _args.append('[' + ','.join(lst) + ']')
        elif is_function(v):
            _args.append(v.__qualname__)
        else:
            _args.append(get_var_name(v))

    _args = ','.join(_args)
    _kwargs = []

    # Render keyword arguments as source code
    for k, v in kwargs.items():
        if is_str(v):
            v = "'" + v + "'"
        _kwargs.append(k + "=" + str(v))

    # Separator if we have positional and keyword arguments
    separator = ""
    if (not is_list_empty(args)) & (not is_list_empty(kwargs)):
        separator = ","

    if method is None:
        add_buffer("\tactual_df = source_df\n")
    else:
        am = ""
        if additional_method:
            am = "." + additional_method + "()"

        add_buffer("\tactual_df =" + source + "." + method + "(" + _args + separator + ','.join(
            _kwargs) + ")" + am + "\n")

    # Apply the function to the dataframe to capture the expected output
    if method is None:
        df_result = self.op.create.df(*args, **kwargs)
    else:
        # Here we construct the method to be applied to the source object
        for f in method.split("."):
            df_func = getattr(df_func, f)

        df_result = df_func(*args, **kwargs)

    # Additional Methods
    if additional_method is not None:
        df_result = getattr(df_result, additional_method)()

    if output == "df":
        df_result.table()
        expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
    elif output == "json":
        print(df_result)
        if is_str(df_result):
            df_result = "'" + df_result + "'"
        else:
            df_result = str(df_result)
        add_buffer("\tactual_df =json_enconding(actual_df)\n")
        expected = "\texpected_value =json_enconding(" + df_result + ")\n"
    else:
        expected = "\t\n"

    add_buffer(expected)

    # Output assertion
    if output == "df":
        add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
    elif output == "json":
        add_buffer("\tassert (expected_value == actual_df)\n")

    filename = self.path + "//" + filename
    if not os.path.exists(os.path.dirname(filename)):
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError as exc:
            # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # FIX: the file handle was never closed; use a context manager.
    with open(filename, 'w', encoding='utf-8') as test_file:
        for b in buffer:
            test_file.write(b)
def create(self, df, func, suffix=None, output="df", *args, **kwargs):
    """
    This is a helper function that outputs python tests for Spark Dataframes.

    :param df: Spark Dataframe
    :param suffix: The create method will try to create a test function with
        the func param given. If you want to test a function with different
        params you can use suffix.
    :param func: Spark dataframe function to be tested
    :param output: can be a 'df' or a 'json'
    :param args: Arguments to be used in the function
    :param kwargs: Keyword arguments to be used in the functions
    :return:
    """
    buffer = []

    def add_buffer(value):
        # Every generated line is indented one tab inside the test function
        buffer.append("\t" + value)

    if suffix is None:
        suffix = ""
    else:
        suffix = "_" + suffix

    # Create func test name. If func is None we just test the create.df
    # function and do not transform the data frame in any way.
    if func is None:
        func_test_name = "test_" + "create_df" + suffix + "()"
        filename = "create_df" + suffix + ".test"
    else:
        func_test_name = "test_" + func.replace(".", "_") + suffix + "()"
        filename = func.replace(".", "_") + suffix + ".test"

    print("Creating {test} test function...".format(test=func_test_name))
    logger.print(func_test_name)

    add_buffer("@staticmethod\n")
    add_buffer("def " + func_test_name + ":\n")

    source = "source_df"
    if df is None:
        # Use the main df
        df_func = self.df
    elif isinstance(df, pyspark.sql.dataframe.DataFrame):
        source_df = "\tsource_df=op.create.df(" + df.export() + ")\n"
        df_func = df
        add_buffer(source_df)
    else:
        # TODO: op is not supposed to be hardcoded
        source = "op"
        df_func = df

    # Render positional arguments as source code.
    # NOTE: values of unrecognized types are silently skipped (original behavior).
    _args = []
    for v in args:
        if is_str(v):
            _args.append("'" + v + "'")
        elif is_numeric(v):
            _args.append(str(v))
        elif is_list(v):
            if is_list_of_strings(v):
                lst = ["'" + x + "'" for x in v]
            else:
                # FIX: 'lst' was unbound (NameError) for lists whose elements
                # were neither strings, numbers nor tuples; str() covers all.
                lst = [str(x) for x in v]
            _args.append('[' + ','.join(lst) + ']')
        elif is_function(v):
            _args.append(v.__qualname__)

    _args = ','.join(_args)
    _kwargs = []

    # Render keyword arguments as source code
    for k, v in kwargs.items():
        if is_str(v):
            v = "'" + v + "'"
        _kwargs.append(k + "=" + str(v))

    # Separator if we have positional and keyword arguments
    separator = ""
    if (not is_list_empty(args)) & (not is_list_empty(kwargs)):
        separator = ","

    if func is None:
        add_buffer("\tactual_df = source_df\n")
    else:
        add_buffer("\tactual_df =" + source + "." + func + "(" + _args + separator + ','.join(_kwargs) + ")\n")

    # Apply the function to the dataframe to capture the expected output
    if func is None:
        df_result = self.op.create.df(*args, **kwargs)
    else:
        # Here we construct the method to be applied to the source object
        for f in func.split("."):
            df_func = getattr(df_func, f)

        df_result = df_func(*args, **kwargs)

    if output == "df":
        df_result.table()
        expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
    elif output == "json":
        print(df_result)
        if is_str(df_result):
            df_result = "'" + df_result + "'"
        else:
            df_result = str(df_result)
        add_buffer("\tactual_df =json_enconding(actual_df)\n")
        expected = "\texpected_value =json_enconding(" + df_result + ")\n"
    else:
        expected = "\t\n"

    add_buffer(expected)

    # Output assertion
    if output == "df":
        add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
    elif output == "json":
        add_buffer("\tassert (expected_value == actual_df)\n")

    filename = self.path + "//" + filename
    if not os.path.exists(os.path.dirname(filename)):
        try:
            os.makedirs(os.path.dirname(filename))
        except OSError as exc:
            # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # FIX: the file handle was never closed; use a context manager.
    with open(filename, 'w', encoding='utf-8') as test_file:
        for b in buffer:
            test_file.write(b)
def columns(df, columns, buckets=40, infer=False, relative_error=1):
    """
    Return statistical information about specific columns in json format.

    :param df: Dataframe to be processed
    :param columns: Columns that you want to profile
    :param buckets: Create buckets divided by range. Each bin is equal.
    :param infer: try to infer the column data type
    :param relative_error: relative error when the percentile is calculated.
        0 is more exact as slow 1 more error and faster
    :return: json object with the profile of every requested column
    """
    columns = parse_columns(df, columns)

    # Get just a sample to infer the column data type
    # sample_size_number = sample_size(rows_count, 95.0, 2.0)
    # fraction = sample_size_number / rows_count
    # sample = df.sample(False, fraction, seed=1)

    # Initialize Objects
    columns_info = {}
    columns_info['columns'] = {}

    rows_count = df.count()
    columns_info['rows_count'] = humanize.intword(rows_count)
    count_dtypes = Profiler.count_data_types(df, columns, infer)
    columns_info["count_types"] = count_dtypes["count_types"]
    columns_info['size'] = humanize.naturalsize(df.size())

    # Cast columns to the data type inferred by count_data_types()
    df = Profiler.cast_columns(df, columns, count_dtypes).cache()

    # Calculate stats
    stats = Profiler.general_stats(df, columns)

    for col_name in columns:
        col_info = {}
        logger.print("------------------------------")
        logger.print("Processing column '" + col_name + "'...")
        columns_info['columns'][col_name] = {}

        col_info["stats"] = stats[col_name]
        col_info.update(Profiler.frequency(df, col_name, buckets))
        col_info.update(Profiler.stats_by_column(col_name, stats, count_dtypes, rows_count))

        col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype']
        col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details']

        column_type = count_dtypes["columns"][col_name]['type']

        # Histogram flavor depends on the inferred column category:
        # numeric ranges, string buckets, or per-date counts.
        if column_type == "numeric":
            col_info["stats"].update(Profiler.extra_numeric_stats(df, col_name, stats, relative_error))
            col_info["hist"] = df.cols.hist(col_name, stats[col_name]["min"], stats[col_name]["max"], buckets)

        if column_type == "categorical" or column_type == "array":
            col_info["hist"] = Profiler.hist_string(df, col_name, buckets)

        if column_type == "date":
            col_info["hist"] = Profiler.hist_date(df, col_name)

        columns_info['columns'][col_name] = col_info

    return columns_info
def optimus(engine=Engine.DASK.value, *args, **kwargs): """ This is the entry point to initialize the selected engine. :param engine: A string identifying an engine :classL`Engine`. :param args: :param kwargs: :return: """ logger.print("ENGINE", engine) # lemmatizer nltk.download('wordnet', quiet=True) # Stopwords nltk.download('stopwords', quiet=True) # Init engine if engine == Engine.PANDAS.value: from optimus.engines.pandas.engine import PandasEngine op = PandasEngine(*args, **kwargs) elif engine == Engine.VAEX.value: from optimus.engines.vaex.engine import VaexEngine op = VaexEngine(*args, **kwargs) elif engine == Engine.SPARK.value: from optimus.engines.spark.engine import SparkEngine op = SparkEngine(*args, **kwargs) elif engine == Engine.DASK.value: from optimus.engines.dask.engine import DaskEngine op = DaskEngine(*args, **kwargs) elif engine == Engine.IBIS.value: from optimus.engines.ibis.engine import IbisEngine op = IbisEngine(*args, **kwargs) elif engine == Engine.CUDF.value: from optimus.engines.cudf.engine import CUDFEngine op = CUDFEngine(*args, **kwargs) elif engine == Engine.DASK_CUDF.value: from optimus.engines.dask_cudf.engine import DaskCUDFEngine op = DaskCUDFEngine(*args, **kwargs) else: RaiseIt.value_error(engine, Engine.list()) # Set cupy yo user RMM def switch_to_rmm_allocator(): import rmm import cupy cupy.cuda.set_allocator(rmm.rmm_cupy_allocator) return True if engine == Engine.CUDF.value: switch_to_rmm_allocator() if engine == Engine.DASK_CUDF.value: if op.client: op.client.run(switch_to_rmm_allocator) return op
def parse_columns(df, cols_args, get_args=False, is_regex=None, filter_by_column_dtypes=None,
                  accepts_missing_cols=False, invert=False):
    """
    Return a list of columns and check that columns exists in the dataframe.
    Accept '*' as parameter in which case return a list of all columns in the dataframe.
    Also accept a regex.
    If a list of tuples is given, the first element of each tuple is the column
    name and the rest are params. These params can be used to create custom
    transformation functions. You can find an example in cols().cast()

    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts * as param to return all the string columns in the dataframe
    :param get_args: if True return (columns, attrs) instead of just columns
    :param is_regex: Use True if col_attrs is a regex
    :param filter_by_column_dtypes: A data type for which a columns list is going be filtered
    :param accepts_missing_cols: if true not check if column exist in the dataframe
    :param invert: Invert the final selection. For example if you want to select not integers
    :return: A list of columns string names (or None if filtering left nothing)
    """
    if not is_dataframe(df):
        RaiseIt.type_error(df, "Dataframe")

    attrs = None

    # Resolve the raw cols_args into a concrete list of column names
    if is_regex is True:
        # Match column names against the regex in cols_args[0]
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))
    elif cols_args == "*" or cols_args is None:
        # '*' (or nothing) means every column in the dataframe
        cols = df.columns
    # In case we have a list of tuples, the first element of the tuple is taken
    # as the column name and the rest as params. We can use the params in a
    # custom function as follows:
    #   def func(attrs): return attrs[0] + 1
    #   df.cols().apply([('col_1',1,2),('cols_2', 3 ,4)], func)
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # Extract a specific position in the tuple
        cols = [(i[0:1][0]) for i in cols_args]
        attrs = [(i[1:]) for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index (integers are positional column references)
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)
    if is_list_of_list(filter_by_column_dtypes):
        # Flatten nested dtype lists
        filter_by_column_dtypes = [item for sublist in filter_by_column_dtypes for item in sublist]

    columns_residual = None

    # If necessary filter the columns by data type
    if filter_by_column_dtypes:
        # Get columns for every data type
        columns_filtered = filter_col_name_by_dtypes(df, filter_by_column_dtypes)

        # Intersect the columns filtered per data type from the whole dataframe
        # with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns did not match the filtered data type
        columns_residual = list(OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols

    cols_params = []
    if invert:
        # Select everything EXCEPT the resolved columns
        final_columns = list(OrderedSet(df.cols.names()) - OrderedSet(final_columns))

    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    if columns_residual:
        logger.print("%s %s %s", ",".join(escape_columns(columns_residual)),
                     "column(s) was not processed because is/are not",
                     ",".join(filter_by_column_dtypes))

    # if because of filtering we got 0 columns return None
    if len(cols_params) == 0:
        cols_params = None
        logger.print("Outputting 0 columns after filtering. Is this expected?")

    return cols_params
def _count_data_types(col_name):
    """
    Function for determine if register value is float or int or string.

    Closure: reads `df`, `infer`, `stats` and `df_count` from the enclosing
    scope — TODO confirm against the enclosing function.

    :param col_name: column whose values are classified
    :return: dict with the dominant 'dtype', its profiler 'type' category and
        per-dtype count 'details'
    """
    # If String, process the data to try to infer which data type is inside.
    # This is a kind of optimization: we do not need to analyze the data if
    # the column data type is integer or boolean, etc.
    temp = col_name + "_type"
    col_data_type = df.cols.dtypes(col_name)

    # Parse dtype: normalize Spark type names to the profiler's vocabulary
    if col_data_type == "smallint" or col_data_type == "tinyint":
        col_data_type = "int"
    elif col_data_type == "float" or col_data_type == "double":
        col_data_type = "decimal"
    elif col_data_type.find("array") >= 0:
        col_data_type = "array"

    count_by_data_type = {}
    count_empty_strings = 0

    if infer is True and col_data_type == "string":
        # Classify every value of the string column and count per inferred type
        logger.print("Processing column '" + col_name + "'...")
        types = collect_as_dict(df
                                .h_repartition(col_name=col_name)
                                .withColumn(temp, fbdt(col_name, get_type=True))
                                .groupBy(temp).count()
                                )
        for row in types:
            count_by_data_type[row[temp]] = row["count"]

        count_empty_strings = df.where(F.col(col_name) == '').count()
    else:
        # if boolean not support count na
        if "count_na" in stats[col_name]:
            nulls = stats[col_name]["count_na"]
            count_by_data_type[col_data_type] = int(df_count) - nulls
            count_by_data_type["null"] = nulls

    count_by_data_type = fill_missing_var_types(count_by_data_type)

    # Subtract white spaces to the total string count
    null_missed_count = {"null": count_by_data_type['null'],
                         "missing": count_empty_strings,
                         }

    # Get the greatest count by column data type (the dominant dtype wins)
    greatest_data_type_count = max(count_by_data_type, key=count_by_data_type.get)

    # Map the dominant dtype to a profiler category
    if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
        cat = "categorical"
    elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
        cat = "numeric"
    elif greatest_data_type_count == "date":
        cat = "date"
    elif greatest_data_type_count == "array":
        cat = "array"
    elif greatest_data_type_count == "binary":
        cat = "binary"
    elif greatest_data_type_count == "null":
        cat = "null"
    else:
        cat = None

    col = {}
    col['dtype'] = greatest_data_type_count
    col['type'] = cat
    col['details'] = {**count_by_data_type, **null_missed_count}

    return col
def columns(self, df, columns, buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True):
    """
    Return statistical information about specific columns in json format.

    :param df: Dataframe to be processed
    :param columns: Columns that you want to profile
    :param buckets: Create buckets divided by range. Each bin is equal.
    :param infer: try to infer the column datatype
    :param relative_error: relative error when the percentile is calculated.
        0 is more exact as slow 1 more error and faster
    :param approx_count: Use the function approx_count_distinct or countDistinct.
        approx_count_distinct is faster
    :return: json object
    """
    columns = parse_columns(df, columns)
    self.rows_count = df.count()
    self.cols_count = len(df.columns)

    # Initialize Objects
    columns_info = {}
    columns_info['columns'] = {}
    columns_info['name'] = df._name
    columns_info['rows_count'] = humanize.intword(self.rows_count)

    logger.print("Processing General Stats...")
    stats = Profiler.general_stats(df, columns, buckets, relative_error, approx_count)
    count_dtypes = self._count_data_types(df, columns, infer, stats)

    columns_info["count_types"] = count_dtypes["count_types"]
    columns_info['size'] = humanize.naturalsize(df.size())

    # Cast columns to the data type infer by count_data_types()
    # df = Profiler.cast_columns(df, columns, count_dtypes).cache()

    # Calculate stats
    logger.print("Processing Frequency ...")
    freq = df.cols.frequency(columns, buckets, True, self.rows_count)

    # Missing values summary across all requested columns
    total_count_na = 0
    for col_name in columns:
        total_count_na = total_count_na + stats[col_name]["count_na"]

    columns_info["summary"] = {}
    columns_info["summary"]['missing_count'] = total_count_na
    # FIX: guard against division by zero on an empty dataframe
    if self.rows_count:
        columns_info["summary"]['p_missing'] = round(total_count_na / self.rows_count * 100, 2)
    else:
        columns_info["summary"]['p_missing'] = 0

    # Assemble the per-column payload
    for col_name in columns:
        col_info = {}
        col_info["stats"] = stats[col_name]

        # FIX: guard the lookup — frequency may not contain every column.
        # This matches columns_stats(), which checks membership before indexing.
        if freq is not None and col_name in freq:
            col_info["frequency"] = freq[col_name]

        col_info["stats"].update(self.extra_stats(df, col_name, stats))

        col_info['name'] = col_name
        col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype']
        col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details']
        col_info['column_type'] = count_dtypes["columns"][col_name]['type']

        columns_info['columns'][col_name] = col_info

    return columns_info
def csv(path, sep=',', header=True, infer_schema=True, na_values=None, encoding="utf-8", n_rows=-1, cache=False, quoting=0, lineterminator=None, error_bad_lines=False, engine="c", keep_default_na=False, na_filter=False, null_value=None, storage_options=None, conn=None, n_partitions=1, *args, **kwargs): """ Return a dataframe from a csv file. It is the same read.csv Spark function with some predefined params :param path: path or location of the file. :param sep: usually delimiter mark are ',' or ';'. :param header: tell the function whether dataset has a header row. True default. :param infer_schema: infers the input schema automatically from data. :param null_value: :param charset: It requires one extra pass over the data. True default. :return dataFrame """ path = unquote_path(path) if cache is False: prepare_path.cache_clear() if conn is not None: path = conn.path(path) storage_options = conn.storage_options remove_param = "chunk_size" if kwargs.get(remove_param): # This is handle in this way to preserve compatibility with others dataframe technologies. logger.print( f"{remove_param} is not supported. Used to preserve compatibility with Optimus Pandas" ) kwargs.pop(remove_param) try: # From the panda docs using na_filter # Detect missing value markers (empty strings and the value of na_values). In data without any NAs, # passing na_filter=False can improve the performance of reading a large file. dfd = vaex.read_csv(path, sep=sep, header=0 if header else None, encoding=encoding, quoting=quoting, lineterminator=lineterminator, error_bad_lines=error_bad_lines, keep_default_na=True, na_values=None, engine=engine, na_filter=na_filter, storage_options=storage_options, *args, **kwargs) if n_rows > -1: dfd = vaex.from_pandas(dfd.head(n=n_rows), npartitions=1).reset_index(drop=True) df = VaexDataFrame(dfd) df.meta = Meta.set(df.meta, value={ "file_name": path, "name": ntpath.basename(path) }) except IOError as error: logger.print(error) raise return df
def columns_agg(self, df, columns, buckets=10, relative_error=RELATIVE_ERROR, approx_count=True,
                advanced_stats=True):
    """
    Compute aggregation statistics (count_uniques, min/max, count_na, and optionally
    stddev/kurtosis/mean/skewness/sum/variance/zeros/percentiles plus histograms and
    derived stats) for the given columns, processing columns in batches.

    :param df: Dataframe to be processed.
    :param columns: Columns to aggregate.
    :param buckets: Requested number of histogram buckets (capped per column below).
    :param relative_error: Relative error for the percentile calculation.
    :param approx_count: Use approx_count_distinct instead of countDistinct (faster).
    :param advanced_stats: When True also compute the advanced statistics and percentiles.
    :return: dict mapping column name -> dict of computed statistics.
    """
    columns = parse_columns(df, columns)
    n = BATCH_SIZE
    list_columns = [
        columns[i * n:(i + 1) * n]
        for i in range((len(columns) + n - 1) // n)
    ]
    # we have problems sending +100 columns at the same time. Processing in batch
    result = {}
    for i, cols in enumerate(list_columns):
        logger.print(
            "Batch Stats {BATCH_NUMBER}. Processing columns{COLUMNS}".
            format(BATCH_NUMBER=i, COLUMNS=cols))

        # Count uniques is necessary for calculate the histogram buckets
        funcs = [count_uniques_agg]
        exprs = df.cols.create_exprs(cols, funcs, approx_count)

        funcs = [F.min, F.max]
        exprs.extend(df.cols.create_exprs(cols, funcs))

        funcs = [count_na_agg]
        exprs.extend(df.cols.create_exprs(cols, funcs, df))

        if advanced_stats is True:
            funcs = [
                F.stddev, F.kurtosis, F.mean, F.skewness, F.sum, F.variance,
                zeros_agg
            ]
            exprs.extend(df.cols.create_exprs(cols, funcs))

            # TODO: None in basic calculation
            funcs = [percentile_agg]
            exprs.extend(
                df.cols.create_exprs(cols, funcs, df,
                                     [0.05, 0.25, 0.5, 0.75, 0.95],
                                     relative_error))

        result.update(df.cols.exec_agg(exprs))

    n = BATCH_SIZE
    result_hist = {}
    list_columns = [
        columns[i * n:(i + 1) * n]
        for i in range((len(columns) + n - 1) // n)
    ]
    for i, cols in enumerate(list_columns):
        logger.print(
            "Batch Histogram {BATCH_NUMBER}. Processing columns{COLUMNS}".
            format(BATCH_NUMBER=i, COLUMNS=cols))
        funcs = [hist_agg]
        # NOTE(review): `exprs` is not re-initialized here, so it still carries the
        # expressions from the last stats batch above and accumulates across histogram
        # batches — looks like a bug (should probably be `exprs = []` per batch); confirm.
        for col_name in cols:
            # Only process histogram for numeric columns. For other data types using frequency
            if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                min_max = {
                    "min": result[col_name]["min"],
                    "max": result[col_name]["max"]
                }
                # Cap the bucket count by the number of distinct values (minus one),
                # bounded to [1, MAX_BUCKETS].
                buckets = result[col_name]["count_uniques"] - 1
                if buckets > MAX_BUCKETS:
                    buckets = MAX_BUCKETS
                elif buckets == 0:
                    buckets = 1
                exprs.extend(
                    df.cols.create_exprs(col_name, funcs, df, buckets,
                                         min_max))

        agg_result = df.cols.exec_agg(exprs)
        if agg_result is not None:
            result_hist.update(agg_result)

    # Merge results: fold histogram results into the per-column stats dicts
    for col_name in result:
        if col_name in result_hist:
            result[col_name].update(result_hist[col_name])

    def extra_columns_stats(df, col_name, stats):
        """
        Specific Stats for numeric columns (range, median, IQR, coefficient of
        variation, MAD) plus NA/unique percentages for every column.
        :param df: Dataframe being profiled.
        :param col_name: Column to derive stats for.
        :param stats: Previously computed aggregation results (keyed by column).
        :return: dict of derived statistics for this column.
        """
        col_info = {}

        max_value = stats[col_name]["max"]
        min_value = stats[col_name]["min"]

        if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
            stddev = stats[col_name]['stddev']
            mean = stats[col_name]['mean']
            quantile = stats[col_name]["percentile"]

            if max_value is not None and min_value is not None:
                col_info['range'] = max_value - min_value
            else:
                col_info['range'] = None

            col_info['median'] = quantile["0.5"]
            q1 = quantile["0.25"]
            q3 = quantile["0.75"]
            if q1 is not None and q3 is not None:
                col_info['interquartile_range'] = q3 - q1
            else:
                col_info['interquartile_range'] = None

            if mean != 0 and mean is not None:
                col_info['coef_variation'] = round((stddev / mean), 5)
            else:
                col_info['coef_variation'] = None

            # NOTE(review): mad is computed twice when not None (once for the check,
            # once for the rounded value) — a second dataframe pass; confirm if intended.
            mad = df.cols.mad(col_name)
            if mad is not None:
                col_info['mad'] = round(df.cols.mad(col_name), 5)
            else:
                col_info['mad'] = None

        # Lazily compute the total row count once and cache it on the profiler.
        if self.rows_count is None:
            self.rows_count = df.count()
        col_info['p_count_na'] = round(
            (stats[col_name]['count_na'] * 100) / self.rows_count, 2)
        col_info['p_count_uniques'] = round(
            (stats[col_name]['count_uniques'] * 100) / self.rows_count, 2)
        return col_info

    if advanced_stats is True:
        for col_name in columns:
            result.update(extra_columns_stats(df, col_name, result))

    return result
def _count_data_types(col_name):
    """
    Determine the predominant data type of a column's values (float, int, string, ...)
    and classify the column into a profiler category.

    :param col_name: Name of the column to analyze.
    :return: dict with keys 'dtype' (predominant type name), 'type' (profiler
        category: categorical/numeric/date/bool/array/null) and 'details'
        (per-type counts plus null/missing counts).
    """
    logger.print("Processing column '" + col_name + "'...")

    # If String, process the data to try to infer which data type is inside. This a kind of optimization.
    # We do not need to analyze the data if the column data type is integer or boolean.etc
    temp = col_name + "_type"
    col_data_type = df.cols.dtypes(col_name)

    count_by_data_type = {}
    count_empty_strings = 0

    if infer is True and col_data_type == "string":
        # Tag every value with its inferred type, then count occurrences per type.
        types = (df
                 .h_repartition(col_name=col_name)
                 .withColumn(temp, fbdt(col_name, get_type=True))
                 .groupBy(temp).count()
                 .to_json())
        for row in types:
            count_by_data_type[row[temp]] = row["count"]
        count_empty_strings = df.where(F.col(col_name) == '').count()
    else:
        # No inference needed: everything non-null has the declared dtype.
        nulls = df.cols.count_na(col_name)
        count_by_data_type[col_data_type] = int(df.count()) - nulls
        count_by_data_type["null"] = nulls

    count_by_data_type = fill_missing_var_types(count_by_data_type)

    # Subtract white spaces to the total string count
    data_types_count = {t: count_by_data_type[t]
                        for t in ("string", "bool", "int", "float", "double", "date", "array")}
    null_missed_count = {"null": count_by_data_type['null'],
                         "missing": count_empty_strings,
                         }
    # Get the greatest count by column data type
    greatest_data_type_count = max(data_types_count, key=data_types_count.get)

    # BUG FIX: the original compared strings with `is` (identity), which only works by
    # accident of CPython string interning and raises a SyntaxWarning on modern Python.
    # A dict lookup with `.get` is both correct (equality-based) and clearer.
    category_by_dtype = {
        "string": "categorical",
        "int": "numeric",
        "float": "numeric",
        "double": "numeric",
        "date": "date",
        "bool": "bool",
        "array": "array",
    }
    cat = category_by_dtype.get(greatest_data_type_count, "null")

    col = {}
    col['dtype'] = greatest_data_type_count
    col['type'] = cat
    col['details'] = {**data_types_count, **null_missed_count}

    return col
def columns_stats(self, df, columns, buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True):
    """
    Return statistical information about a specific column in json format

    :param df: Dataframe to be processed
    :param columns: Columns that you want to profile
    :param buckets: Create buckets divided by range. Each bin is equal.
    :param infer: try to infer the column datatype
    :param relative_error: relative error when the percentile is calculated.
        0 is more exact as slow 1 more error and faster
    :param approx_count: Use the function approx_count_distinct or countDistinct.
        approx_count_distinct is faster
    :return: json object
    """
    columns = parse_columns(df, columns)

    # Initialize Objects
    logger.print("Processing Stats For columns...")

    # Get columns data types. This is necessary to make the pertinent histogram calculations.
    type_details = self._count_data_types(df, columns, infer)

    # Count the categorical, numerical, boolean and date columns
    count_types = {}
    for value in type_details.values():
        name = value["dtype"]
        if name in count_types:
            count_types[name] += 1
        else:
            count_types[name] = 1

    # List the data types this data set have
    total = 0
    dtypes = []
    for key, value in count_types.items():
        if value > 0:
            dtypes.append(key)
            total = total + 1

    count_types = fill_missing_col_types(count_types)

    columns_info = {}
    columns_info["count_types"] = count_types
    columns_info["total_count_dtypes"] = total
    columns_info["dtypes_list"] = dtypes
    columns_info["columns"] = type_details

    # Aggregation
    # NOTE(review): this is an unbound call — `df` is passed where `columns_agg`'s
    # first positional (self) would bind if its signature matches the sibling
    # `columns_agg(self, df, columns, ...)` elsewhere in this code base; confirm the
    # intended signature, otherwise every argument is shifted by one.
    stats = Profiler.columns_agg(df, columns, buckets, relative_error,
                                 approx_count)

    # Calculate Frequency
    logger.print("Processing Frequency ...")
    df_freq = df.cols.select("*",
                             data_type=PYSPARK_NUMERIC_TYPES,
                             invert=True)
    freq = None
    if df_freq is not None:
        # Frequency is only computed for the non-numeric columns selected above.
        freq = df_freq.cols.frequency("*", buckets, True, self.rows_count)

    # Calculate percentage
    for col_name in columns:
        col_info = {}
        assign(col_info, "stats", stats[col_name], dict)

        if freq is not None:
            if col_name in freq:
                assign(col_info, "frequency", freq[col_name])

        # NOTE(review): assumes the extra stats are merged for every column, not only
        # those with a frequency — the original indentation was ambiguous; confirm.
        col_info["stats"].update(
            self.extra_columns_stats(df, col_name, stats))

        assign(col_info, "name", col_name)
        assign(col_info, "column_dtype",
               columns_info["columns"][col_name]['dtype'])
        # NOTE(review): looks up a 'stats' key on the type details, but the sibling
        # _count_data_types in this file stores that data under 'details' — verify
        # which helper actually feeds this method.
        assign(col_info, "dtypes_stats",
               columns_info["columns"][col_name]['stats'])
        assign(col_info, "column_type",
               columns_info["columns"][col_name]['type'])
        assign(columns_info, "columns." + col_name, col_info, dict)

    return columns_info
def timed(*args, **kw):
    """Invoke the wrapped ``method``, log how long it took (in seconds, rounded
    to two decimals), and pass its return value through unchanged."""
    started_at = timeit.default_timer()
    outcome = method(*args, **kw)
    elapsed = round(timeit.default_timer() - started_at, 2)
    logger.print("{name}() executed in {time} sec".format(name=method.__name__, time=elapsed))
    return outcome
def run(self, df, collection_name=None, func_request=None, func_response=None, return_type="json", calls=60,
        period=60, max_tries=8):
    """
    Read a the url key from a mongo collection an make a request to a service

    :param df: Dataframe to me loaded to the enricher collection.
    :param collection_name: Custom collection to save the data.
    :param func_request: help to create a custom request
    :param func_response: help to create a custom response
    :param return_type: "json" or "text" — how the HTTP response body is decoded
    :param calls: how many call can you make
    :param period: in which period ot time can the call be made
    :param max_tries: how many retries should we do
    :return: the enriched dataframe (or None after printing a message when there
        is nothing to process)
    """
    # Load the dataframe data in the enricher
    if is_(df, DataFrame):
        # Add a surrogate id column so results can be joined back later.
        df = df.create_id(COL_ID)

    # Load the dataframe data in the enricher
    self.send(df)

    if collection_name is None:
        collection_name = self.collection_name
    collection = self.get_collection(collection_name)

    # Get data that is not yet enriched
    cursor = collection.find({COL_RESULTS: {"$exists": False}})

    # NOTE(review): Cursor.count() was deprecated and removed in pymongo 4.x
    # (use count_documents on the collection) — confirm the pinned pymongo version.
    total_docs = cursor.count(True)

    if func_request is None:
        func_request = requests.get

    collection = self.get_collection(collection_name)

    # Rate-limited, retrying wrapper around the request function:
    # at most `calls` calls per `period` seconds, with exponential backoff
    # up to `max_tries` attempts when the limit is hit.
    @on_exception(expo, RateLimitException, max_tries=max_tries)
    @limits(calls=calls, period=period)
    def _func_request(v):
        return func_request(v)

    if total_docs > 0:
        for c in tqdm_notebook(cursor, total=total_docs, desc='Processing...'):

            # Send request to the API
            response = _func_request(c)

            mongo_id = c["_id"]

            if response.status_code == 200:
                if return_type == "json":
                    response = json.loads(response.text)
                elif return_type == "text":
                    response = response.text

                # Process the result with an external function
                if is_function(func_response):
                    response = func_response(response)

                # Update the mongo id with the result
                # NOTE(review): find_and_modify is deprecated in pymongo —
                # find_one_and_update is the modern equivalent; confirm version.
                collection.find_and_modify(query={"_id": mongo_id},
                                           update={"$set": {COL_RESULTS: response}},
                                           upsert=False, full_response=True)
            else:
                # The response key will remain blank so we can filter it to try in future request
                logger.print(response.status_code)

        # Append the data in enrichment to the dataframe
        logger.print("Appending collection info into the dataframe")

        # TODO: An elegant way to handle pickling?
        # take care to the pickling: copy the connection parameters into plain
        # locals so the UDF closure does not capture `self` (not picklable).
        host = self.host
        port = self.port
        db_name = self.db_name

        @pandas_udf('string', PandasUDFType.SCALAR)
        def func(value):
            # More about pickling: the import lives inside the UDF so each
            # executor creates its own MongoClient.
            from pymongo import MongoClient
            _client = MongoClient(host, port)
            _db = _client[db_name]
            _collection = _db[collection_name]

            def func_serie(serie):
                # Look up the enrichment result for a single id.
                _cursor = _collection.find_one({COL_ID: serie},
                                               projection={"_id": 0, COL_RESULTS: 1})
                return _cursor[COL_RESULTS]

            return value.apply(func_serie)

        df = df.withColumn(COL_RESULTS, func(df[COL_ID])).cols.drop(COL_ID).run()

        # If the process is finished, flush the Mongo collection
        self.flush()
        return df
    else:
        print("No records available to process")
def __init__(self, master="local[*]", app_name="optimus"):
    """
    Create the Optimus session wrapper and start (or reuse) a SparkSession.

    :param master: Sets the Spark master URL to connect to, such as 'local' to run locally, 'local[4]' to run
    locally with 4 cores, or spark://master:7077 to run on a Spark standalone cluster.
    :param app_name: Sets a name for the application, which will be shown in the Spark web UI
    """
    self.master = master
    self.app_name = app_name

    logger.print(JUST_CHECKING)
    logger.print("-----")

    # Verify the environment variables Spark needs are present before starting.
    required_env_vars = [
        "SPARK_HOME",
        "HADOOP_HOME",
        "PYSPARK_PYTHON",
        "PYSPARK_DRIVER_PYTHON",
        "PYSPARK_SUBMIT_ARGS",
        "JAVA_HOME",
    ]
    check_env_vars(required_env_vars)

    # Pandas UDFs depend on pyarrow; warn (but continue) when it is missing.
    if is_pyarrow_installed() is True:
        logger.print("Pyarrow Installed")
    else:
        logger.print(
            "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
        )

    logger.print("-----")
    logger.print(STARTING_SPARK)

    # Build the spark session step by step.
    builder = SparkSession.builder.appName(app_name).master(master)
    builder = builder.config("spark.executor.heartbeatInterval", "110")
    builder = builder.config("spark.jars.packages", "ml.combust.mleap:mleap-spark_2.11:0.13.0")
    self._spark = builder.getOrCreate()
    # .option("driver", "org.postgresql.Driver")

    self._sc = self._spark.sparkContext
    logger.print("Spark Version:" + self._sc.version)