def rename(self, columns_old_new=None, func=None):
    """
    Changes the name of one or more columns in a dataframe.
    :param columns_old_new: List of tuples. Each tuple has the following form: (oldColumnName, newColumnName).
    :param func: can be lower, upper or any string transformation function
    """
    df = self

    # Apply a transformation function to every column name
    if is_function(func):
        exprs = [F.col(c).alias(func(c)) for c in df.columns]
        df = df.select(exprs)

    elif is_list_of_tuples(columns_old_new):
        # Check that the 1st element in the tuple is a valid set of columns
        validate_columns_names(self, columns_old_new)
        for c in columns_old_new:
            old_col_name = c[0]
            if is_str(old_col_name):
                df = df.withColumnRenamed(old_col_name, c[1])
            elif is_int(old_col_name):
                # An int in the first position is treated as a column index
                df = df.withColumnRenamed(self.schema.names[old_col_name], c[1])

    return df
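
# Usage sketch (illustrative, not from the library source): assumes an Optimus
# instance `op` whose DataFrames expose rename through a `cols` namespace, and
# that `op.create.df` accepts a (schema, rows) pair in this form.
df = op.create.df([("id", "int"), ("value", "string")],
                  [(1, "a"), (2, "b")])

# Rename by (old, new) tuples; an int in the first position is a column index
df = df.cols.rename([("id", "user_id"), (1, "label")])

# Or apply a string transformation function to every column name
df = df.cols.rename(func=str.upper)
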
def create(self, df, func, suffix=None, output="df", *args, **kwargs):
    """
    This is a helper function that outputs Python tests for Spark Dataframes.
    :param df: Spark Dataframe
    :param func: Spark dataframe function to be tested
    :param suffix: The create method will try to create a test function with the func param given.
    If you want to test a function with different params you can use suffix.
    :param output: can be a 'df' or a 'json'
    :param args: Arguments to be used in the function
    :param kwargs: Keyword arguments to be used in the function
    :return: the generated test code as a string
    """
    buffer = []

    def add_buffer(value):
        buffer.append("\t" + value)

    if suffix is None:
        suffix = ""
    else:
        suffix = "_" + suffix

    # Create the test function name. If func is None we just test the create.df function
    # and do not transform the dataframe in any way
    if func is None:
        func_test_name = "test_" + "create_df" + suffix + "()"
    else:
        func_test_name = "test_" + func.replace(".", "_") + suffix + "()"

    print("Creating {test} test function...".format(test=func_test_name))
    logger.print(func_test_name)

    add_buffer("@staticmethod\n")
    add_buffer("def " + func_test_name + ":\n")

    source = "source_df"
    if df is None:
        # Use the main df
        df_func = self.df
    elif isinstance(df, pyspark.sql.dataframe.DataFrame):
        source_df = "\tsource_df=op.create.df(" + df.export() + ")\n"
        df_func = df
        add_buffer(source_df)
    else:
        # TODO: op is not supposed to be hardcoded
        source = "op"
        df_func = df

    # Process simple arguments
    _args = []
    for v in args:
        if is_str(v):
            _args.append("'" + v + "'")
        elif is_numeric(v):
            _args.append(str(v))
        elif is_list(v):
            if is_list_of_strings(v):
                lst = ["'" + x + "'" for x in v]
            elif is_list_of_numeric(v) or is_list_of_tuples(v):
                lst = [str(x) for x in v]
            else:
                # Fallback so lst is always defined
                lst = [str(x) for x in v]
            _args.append('[' + ','.join(lst) + ']')
        elif is_function(v):
            _args.append(v.__qualname__)
        # else:
        #     import marshal
        #     code_string = marshal.dumps(v.__code__)
        #     add_buffer("\tfunction = '" + code_string + "'\n")
        #     import marshal, types
        #     code = marshal.loads(code_string)
        #     func = types.FunctionType(code, globals(), "some_func_name")

    _args = ','.join(_args)

    # Process keyword arguments
    _kwargs = []
    for k, v in kwargs.items():
        if is_str(v):
            v = "'" + v + "'"
        _kwargs.append(k + "=" + str(v))

    # Separator if we have both positional and keyword arguments
    separator = ""
    if (not is_list_empty(args)) and (not is_list_empty(kwargs)):
        separator = ","

    if func is None:
        add_buffer("\tactual_df = source_df\n")
    else:
        add_buffer("\tactual_df = " + source + "." + func + "(" + _args + separator + ','.join(_kwargs) + ")\n")

    # Apply the function to the dataframe
    if func is None:
        df_result = self.op.create.df(*args, **kwargs)
    else:
        # Here we construct the method to be applied to the source object
        for f in func.split("."):
            df_func = getattr(df_func, f)
        df_result = df_func(*args, **kwargs)

    if output == "df":
        expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
    elif output == "json":
        if is_str(df_result):
            df_result = "'" + df_result + "'"
        else:
            df_result = str(df_result)
        add_buffer("\tactual_df = json_enconding(actual_df)\n")
        expected = "\texpected_value = json_enconding(" + df_result + ")\n"
    else:
        expected = "\t\n"

    add_buffer(expected)

    if output == "df":
        add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
    elif output == "json":
        add_buffer("\tassert (expected_value == actual_df)\n")

    return "".join(buffer)
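
# Usage sketch (hypothetical): `t` stands for an instance of the test-creator
# class this method belongs to, and the dataframe built here is assumed to
# implement `.export()` as the method above expects.
source_df = op.create.df([("name", "string"), ("age", "int")],
                         [("alice", 30), ("bob", 25)])

# Generate a pytest-style test for cols.rename; positional arguments after
# `output` are forwarded to the tested function.
test_code = t.create(source_df, "cols.rename", "list", "df",
                     [("name", "first_name")])
print(test_code)
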
def run(self, df, func_request=None, func_response=None, return_type="json", calls=60, period=60, max_tries=8):
    """
    Read the url key from a mongo collection and make a request to a service.
    :param df: Dataframe to be loaded to the enricher collection.
    :param func_request: helps to create a custom request
    :param func_response: helps to create a custom response
    :param calls: how many calls can be made per period of time
    :param period: the period of time, in seconds, in which the calls can be made
    :param max_tries: how many retries should we do
    :param return_type: 'json' or 'text'
    :return: the enriched dataframe
    """
    if is_dataframe(df):
        df = df.create_id(COL_ID)

    # Load the dataframe data into the enricher
    self.load(df)

    collection_name = self.collection_name
    collection = self.get_collection(collection_name)

    # Get data that is not yet enriched
    cursor = collection.find({COL_RESULTS: {"$exists": False}})

    total_docs = cursor.count(True)

    if func_request is None:
        func_request = requests.get

    @on_exception(expo, RateLimitException, max_tries=max_tries)
    @limits(calls=calls, period=period)
    def _func_request(v):
        return func_request(v)

    if total_docs > 0:
        for c in tqdm_notebook(cursor, total=total_docs, desc='Processing...'):
            # Send request to the API
            response = _func_request(c)
            mongo_id = c["_id"]

            if response.status_code == 200:
                if return_type == "json":
                    response = json.loads(response.text)
                elif return_type == "text":
                    response = response.text

                # Process the result with an external function
                if is_function(func_response):
                    response = func_response(response)

                # Update the mongo id with the result
                collection.find_and_modify(query={"_id": mongo_id},
                                           update={"$set": {COL_RESULTS: response}},
                                           upsert=False, full_response=True)
            else:
                # The response key will remain blank so we can filter it to retry in a future request
                logger.print(response.status_code)

        # Append the enriched data to the dataframe
        logger.print("Appending collection info into the dataframe")

        # TODO: A more elegant way to handle pickling?
        # The connection params are captured as plain values so the UDF closure can be pickled
        host = self.host
        port = self.port
        db_name = self.db_name

        @pandas_udf('string', PandasUDFType.SCALAR)
        def func(value):
            # Each executor opens its own Mongo connection
            from pymongo import MongoClient
            _client = MongoClient(host, port)
            _db = _client[db_name]
            _collection = _db[collection_name]

            def func_serie(serie):
                _cursor = _collection.find_one({COL_ID: serie},
                                               projection={"_id": 0, COL_RESULTS: 1})
                return _cursor[COL_RESULTS]

            return value.apply(func_serie)

        df = df.withColumn(COL_RESULTS, func(df[COL_ID])).cols.drop(COL_ID).run()

        # If the process is finished, flush the Mongo collection
        self.flush()
        return df
    else:
        print("No records available to process")
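
# Usage sketch (hypothetical): `enricher` is an instance of this class already
# configured with Mongo connection details; the "url" field name and the
# "result" key in the payload are assumptions for illustration only.
import requests

def my_request(doc):
    # Build the HTTP request from the url stored in each mongo document
    return requests.get(doc["url"])

def my_response(payload):
    # Keep only the part of the JSON payload we want to store
    return payload.get("result")

enriched_df = enricher.run(df,
                           func_request=my_request,
                           func_response=my_response,
                           return_type="json",
                           calls=60, period=60, max_tries=8)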