Example #1
0
    def process_features(self, df, cols_by_type):
        """Process features before histogram filling.

        Specifically, in this case convert timestamp features to nanoseconds

        :param df: input data frame
        :param cols_by_type: dict of column lists keyed by type; key "dt" holds timestamp columns
        :return: output data frame with converted timestamp features
        :rtype: DataFrame
        """
        # alias the frame so downstream value counting operates on a distinct reference
        converted = df.alias('')

        # Timestamp columns are rewritten as nanoseconds since 1970-01-01.
        # histogrammar does not support long integers yet, hence the float cast
        # instead of bigint.
        for column in cols_by_type["dt"]:
            self.logger.debug(
                'Converting column "{col}" of type "{type}" to nanosec.'.format(col=column, type=self.var_dtype[column])
            )
            converted = converted.withColumn(column, sparkcol(column).cast("float") * 1e9)

        # attach histogrammar's convenience methods to the resulting frame
        hg.sparksql.addMethods(converted)

        return converted
Example #2
0
    def get_nunique(self, df, columns=None):
        """Return dict with (approximate) number of unique entries for given columns.

        Uses Spark's approxCountDistinct, so counts are approximate, not exact.

        :param df: input (spark) data frame
        :param columns: columns to select (optional); defaults to all columns of df
        :return: dict mapping column name to approximate distinct count
        :rtype: dict
        """
        # NOTE: default changed from a mutable `[]` to `None` to avoid the
        # shared-mutable-default pitfall; `if not columns` treats both the same,
        # so callers passing [] keep the old behavior.
        if not columns:
            columns = df.columns
        # one aggregation pass computing all approximate distinct counts at once
        qdf = df.agg(*(approxCountDistinct(sparkcol(c)).alias(c) for c in columns))
        # single-row result: transpose and take the first (only) column as a dict
        return qdf.toPandas().T[0].to_dict()