示例#1
0
    def rename(columns_old_new=None, func=None):
        """
        Rename one or more columns of the dataframe.
        :param columns_old_new: List of tuples. Each tuple has the following form: (oldColumnName, newColumnName).
        The old column can be given by name (str) or by its position (int) in the schema.
        :param func: can be lower, upper or any string transformation function. When a function is given it is
        applied to every column name and columns_old_new is not used.
        :return: Dataframe with the renamed columns
        """

        # A transformation function wins: every column is aliased with func(name)
        if is_function(func):
            aliased = [F.col(name).alias(func(name)) for name in self.columns]
            return self.select(aliased)

        # Nothing usable was passed, hand the dataframe back untouched
        if not is_list_of_tuples(columns_old_new):
            return self

        # Check that the 1st element in the tuple is a valid set of columns
        validate_columns_names(self, columns_old_new)

        renamed = self
        for pair in columns_old_new:
            current = pair[0]
            if is_str(current):
                renamed = renamed.withColumnRenamed(current, pair[1])
            elif is_int(current):
                # Positions are resolved against the original schema, not the partially renamed one
                renamed = renamed.withColumnRenamed(self.schema.names[current], pair[1])

        return renamed
示例#2
0
文件: test.py 项目: schatzr/Optimus
    def create(self, df, func, suffix=None, output="df", *args, **kwargs):
        """
        Helper that generates the source code of a python test for a Spark Dataframe function.

        The function under test is also executed here, so its result can be embedded in the generated
        code as the expected value.
        :param df: Spark Dataframe used as source. If None, self.df is used. Any object that is not a Spark
        Dataframe is used as is and referenced as 'op' in the generated code.
        :param func: Dotted name of the function to be tested, resolved attribute by attribute on the source
        object. If None only the create.df function is tested and the dataframe is not transformed.
        :param suffix: The create method will try to create a test function with the func param given.
        If you want to test a function with different params you can use suffix.
        :param output: can be a 'df' or a 'json'. Any other value generates no expected value and no assert.
        :param args: Arguments to be used in the function. Only strings, numbers, lists and functions are
        written to the generated call; any other type is skipped in the generated code.
        :param kwargs: Keyword arguments to be used in the function
        :return: String with the source code of the test function
        """

        buffer = []

        def add_buffer(value):
            buffer.append("\t" + value)

        suffix = "" if suffix is None else "_" + suffix

        # Create func test name. If func is None we just test the create.df function and do not transform the
        # data frame in any way
        if func is None:
            func_test_name = "test_" + "create_df" + suffix + "()"
        else:
            func_test_name = "test_" + func.replace(".", "_") + suffix + "()"

        print("Creating {test} test function...".format(test=func_test_name))
        logger.print(func_test_name)

        add_buffer("@staticmethod\n")
        add_buffer("def " + func_test_name + ":\n")

        source = "source_df"
        if df is None:
            # Use the main df
            df_func = self.df
        elif isinstance(df, pyspark.sql.dataframe.DataFrame):
            add_buffer("\tsource_df=op.create.df(" + df.export() + ")\n")
            df_func = df
        else:
            # TODO: op is not supposed to be hardcoded
            source = "op"
            df_func = df

        # Process simple arguments
        _args = []
        for v in args:
            if is_str(v):
                _args.append("'" + v + "'")
            elif is_numeric(v):
                _args.append(str(v))
            elif is_list(v):
                if is_list_of_strings(v):
                    lst = ["'" + x + "'" for x in v]
                elif is_list_of_numeric(v) or is_list_of_tuples(v):
                    lst = [str(x) for x in v]
                else:
                    # Lists that match none of the checks above used to leave `lst` unbound (or stale from a
                    # previous argument). Fall back to the literal representation of every element.
                    lst = [repr(x) for x in v]

                _args.append('[' + ','.join(lst) + ']')
            elif is_function(v):
                _args.append(v.__qualname__)
            # Any other argument type is not written to the generated call

        _args = ','.join(_args)

        # Process keywords arguments
        _kwargs = []
        for k, v in kwargs.items():
            if is_str(v):
                v = "'" + v + "'"
            _kwargs.append(k + "=" + str(v))
        _kwargs = ','.join(_kwargs)

        # Separator only if both positional and keyword arguments were actually rendered. Checking the rendered
        # strings avoids emitting "func(,key=value)" when every positional argument was skipped.
        separator = "," if _args and _kwargs else ""

        if func is None:
            add_buffer("\tactual_df = source_df\n")
            df_result = self.op.create.df(*args, **kwargs)
        else:
            add_buffer("\tactual_df =" + source + "." + func + "(" + _args + separator + _kwargs + ")\n")

            # Here we construct the method to be applied to the source object
            for f in func.split("."):
                df_func = getattr(df_func, f)

            # Apply function to the dataframe
            df_result = df_func(*args, **kwargs)

        if output == "df":
            expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
        elif output == "json":
            if is_str(df_result):
                df_result = "'" + df_result + "'"
            else:
                df_result = str(df_result)
            add_buffer("\tactual_df =json_enconding(actual_df)\n")

            expected = "\texpected_value =json_enconding(" + df_result + ")\n"
        else:
            expected = "\t\n"

        add_buffer(expected)

        if output == "df":
            add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
        elif output == "json":
            add_buffer("\tassert (expected_value == actual_df)\n")

        return "".join(buffer)
示例#3
0
    def run(self,
            df,
            func_request=None,
            func_response=None,
            return_type="json",
            calls=60,
            period=60,
            max_tries=8):
        """
        Send every document of the enricher collection that has no result yet to a service, store the
        responses in the collection and append them to the dataframe.
        :param df: Dataframe to be loaded to the enricher collection.
        :param func_request: callable that receives a mongo document and makes the request. requests.get is
        used if None
        :param func_response: optional callable used to process the response before it is stored
        :param return_type: 'json' parses the response body with json.loads, 'text' keeps the body as text.
        Any other value leaves the response object untouched.
        :param calls: how many calls can you make by period of time
        :param period: in which period of time can the calls be made, in seconds
        :param max_tries: how many retries should we do when the rate limit is hit
        :return: Dataframe with the results in the COL_RESULTS column and without the COL_ID column. None if
        there were no documents to process.
        """

        if is_dataframe(df):
            df = df.create_id(COL_ID)

        # Load the dataframe data in the enricher
        self.load(df)

        collection_name = self.collection_name
        collection = self.get_collection(collection_name)

        # Get data that is not yet enriched
        cursor = collection.find({COL_RESULTS: {"$exists": False}})

        # NOTE(review): Cursor.count() and Collection.find_and_modify() (used below) were removed in PyMongo 4.
        # Migrate to count_documents() / find_one_and_update() once the supported PyMongo version is confirmed.
        total_docs = cursor.count(True)

        if func_request is None:
            func_request = requests.get

        @on_exception(expo, RateLimitException, max_tries=max_tries)
        @limits(calls=calls, period=period)
        def _func_request(v):
            return func_request(v)

        if total_docs > 0:
            for c in tqdm_notebook(cursor,
                                   total=total_docs,
                                   desc='Processing...'):

                # Send request to the API
                response = _func_request(c)

                mongo_id = c["_id"]

                if response.status_code == 200:
                    if return_type == "json":
                        response = json.loads(response.text)
                    elif return_type == "text":
                        response = response.text

                    # Process the result with an external function
                    if is_function(func_response):
                        response = func_response(response)

                    # Update the mongo id with the result
                    collection.find_and_modify(
                        query={"_id": mongo_id},
                        update={"$set": {
                            COL_RESULTS: response
                        }},
                        upsert=False,
                        full_response=True)
                else:
                    # The response key will remain blank so we can filter it to try in future request
                    logger.print(response.status_code)

            # Append the data in enrichment to the dataframe

            logger.print("Appending collection info into the dataframe")
            # TODO: An elegant way to handle pickling?
            # Copy the connection settings to plain locals so the udf closure does not capture self
            host = self.host
            port = self.port
            db_name = self.db_name

            @pandas_udf('string', PandasUDFType.SCALAR)
            def func(value):
                # Imported and connected inside the udf, because of pickling
                from pymongo import MongoClient
                _client = MongoClient(host, port)
                try:
                    _collection = _client[db_name][collection_name]

                    def func_serie(serie):
                        doc = _collection.find_one({COL_ID: serie},
                                                   projection={
                                                       "_id": 0,
                                                       COL_RESULTS: 1
                                                   })
                        # Documents whose request failed have no result; return None instead of raising
                        if doc is None:
                            return None
                        return doc.get(COL_RESULTS)

                    return value.apply(func_serie)
                finally:
                    # Release the connection opened for this call of the udf
                    _client.close()

            df = df.withColumn(COL_RESULTS,
                               func(df[COL_ID])).cols.drop(COL_ID).run()

            # If the process is finished, flush the Mongo collection
            self.flush()
            return df
        else:
            print("No records available to process")