def smvStrCat(head, *others):
    """Concatenate multiple columns into a single string column.

    Unlike Spark's `concat`/`concat_ws`, which return null when ANY input
    is null, this returns null only when ALL inputs are null; otherwise
    null inputs are coalesced to the empty string.

    Two call forms are supported:
        - smvStrCat(sep, col1, col2, ...)
        - smvStrCat(col1, col2, ...)

    Args:
        head (str or Column): separator string, or the first column
        others (\*Column): the remaining columns to concatenate

    Returns:
        (Column): a StringType column
    """
    if isinstance(head, basestring):
        sep, cols = head, list(others)
    elif isinstance(head, Column):
        sep, cols = "", [head] + list(others)
    else:
        raise RuntimeError("first parameter must be either a String or a Column")
    app = SmvApp.getInstance()
    jStrCat = app._jvm.org.tresamigos.smv.python.SmvPythonHelper.smvStrCat
    return Column(jStrCat(sep, smv_copy_array(app.sc, *cols)))
def create_smv_pyclient(self, arglist):
    """Construct and return a smvPyClient instance.

    Args:
        arglist (list(str)): python argument list to forward to the client

    Returns:
        the java-side smvPyClient produced by SmvPyClientFactory.init
    """
    # The java factory expects the python arg list as a java String array.
    java_args = smv_copy_array(self.sc, *arglist)
    factory = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory
    return factory.init(java_args, self.sqlContext._ssql_ctx)
def smvPivotCoalesce(self, pivotCols, valueCols, baseOutput):
    """Perform SmvPivot, then coalesce the output.

    Args:
        pivotCols (list(list(str))): lists of column names to pivot
        valueCols (list(str)): names of value columns to coalesce
        baseOutput (list(str)): expected names of the pivoted columns

    Returns:
        (DataFrame): result of the pivot + coalesce
    """
    sc = self.df._sc
    jdf = self.sgd.smvPivotCoalesce(
        smv_copy_array(sc, *pivotCols),
        smv_copy_array(sc, *valueCols),
        smv_copy_array(sc, *baseOutput))
    return DataFrame(jdf, self.df.sql_ctx)
def dependencies(self):
    """Return the urns of the datasets this module depends on.

    Returns:
        java array (via smv_copy_array) of urn strings, one per dataset
        returned by requiresDS()
    """
    # Try/except block is a short-term solution (read: hack) to ensure that
    # the user gets a full stack trace when SmvPyDataSet user-defined methods
    # cause errors
    try:
        arr = smv_copy_array(self.smvApp.sc, *[x.urn() for x in self.requiresDS()])
    except BaseException:
        traceback.print_exc()
        # Bare `raise` re-raises with the original traceback intact;
        # `raise err` would rebind the raise site to this line and lose it.
        raise
    return arr
def smvDedupByKey(self, *keys):
    """Remove duplicate records from the DataFrame by arbitrarily keeping
    the first record from each set of records sharing the same primary key
    or key combination.

    Args:
        keys (\*string or \*Column): the column names or Columns on which to apply dedup

    Example:
        Given rows (id, product, Company):
        (1, A, C1), (1, C, C2), (2, B, C3), (2, B, C4)

        >>> df.dedupByKey("id")
        keeps (1, A, C1) and (2, B, C3)

        >>> df.dedupByKey("id", "product")
        keeps (1, A, C1), (1, C, C2) and (2, B, C3)

    Returns:
        (DataFrame): a DataFrame without duplicates for the specified keys
    """
    jkeys = smv_copy_array(self._sc, *keys)
    deduped = self._jPythonHelper.smvDedupByKey(self._jdf, jkeys)
    return DataFrame(deduped, self._sql_ctx)
def smvRenameField(self, *namePairs):
    """Rename one or more fields of a `DataFrame`.

    Args:
        namePairs (\*tuple): (source column name, target column name) pairs

    Example:
        >>> df.smvRenameField(("a", "aa"), ("c", "cc"))

    Returns:
        (DataFrame): the DataFrame with renamed fields
    """
    jpairs = smv_copy_array(self._sc, *namePairs)
    renamed = self._jPythonHelper.smvRenameField(self._jdf, jpairs)
    return DataFrame(renamed, self._sql_ctx)
def smvDesc(self, *colDescs):
    """Add column descriptions.

    Args:
        colDescs (\*tuple): (column name, description) pairs

    Example:
        >>> df.smvDesc(("a", "description of col a"), ("b", "description of col b"))

    Returns:
        (DataFrame): the DataFrame with column descriptions added
    """
    jdescs = smv_copy_array(self._sc, *colDescs)
    described = self._jPythonHelper.smvDesc(self._jdf, jdescs)
    return DataFrame(described, self._sql_ctx)
def smvRemoveDesc(self, *colNames):
    """Remove the descriptions of the given columns from the DataFrame.

    Args:
        colNames (\*string): names of columns whose description is removed

    Example:
        >>> df.smvRemoveDesc("col_a", "col_b")

    Returns:
        (DataFrame): the DataFrame with column descriptions removed
    """
    jnames = smv_copy_array(self._sc, *colNames)
    stripped = self._jPythonHelper.smvRemoveDesc(self._jdf, jnames)
    return DataFrame(stripped, self._sql_ctx)
def smvSelectMinus(self, *cols):
    """Remove one or more columns from the current DataFrame.

    Args:
        cols (\*string or \*Column): column names or Columns to drop

    Example:
        >>> df.smvSelectMinus("col1", "col2")
        >>> df.smvSelectMinus(col("col1"), col("col2"))

    Returns:
        (DataFrame): the resulting DataFrame after removal of columns
    """
    jcols = smv_copy_array(self._sc, *cols)
    remaining = self._jPythonHelper.smvSelectMinus(self._jdf, jcols)
    return DataFrame(remaining, self._sql_ctx)
def smvTopNRecs(self, maxElems, *cols):
    """For each group, return the top N records according to a given ordering.

    Args:
        maxElems (int): maximum number of records to keep per group
        cols (\*str): columns defining the ordering

    Example:
        # keep the 3 largest amt records for each id
        >>> df.smvGroupBy("id").smvTopNRecs(3, col("amt").desc())

    Returns:
        (DataFrame): result of taking top records from groups
    """
    jorder = smv_copy_array(self.df._sc, *cols)
    jdf = self.sgd.smvTopNRecs(maxElems, jorder)
    return DataFrame(jdf, self.df.sql_ctx)
def smvJoinMultipleByKey(self, keys, joinType = 'inner'):
    """Create multiple DF join builder

    It is used in conjunction with `joinWith` and `doJoin`

    Args:
        keys (list(string)): a list of column names on which to apply the join
        joinType (string): choose one of ['inner', 'outer', 'leftouter', 'rightouter', 'leftsemi']

    Example:
        >>> df.smvJoinMultipleByKey(["k1", "k2"], "inner").joinWith(df2, "_df2").joinWith(df3, "_df3", "leftouter").doJoin()

    Returns:
        (SmvMultiJoin): the builder object for the multi join operation
    """
    # Docstring example fixed: the method is `smvJoinMultipleByKey`, not `joinMultipleByKey`.
    jdf = self._jPythonHelper.smvJoinMultipleByKey(self._jdf, smv_copy_array(self._sc, *keys), joinType)
    return SmvMultiJoin(self._sql_ctx, jdf)
def smvHashKey(head, *others):
    """Create an MD5 hash over concatenated columns, returning
    "prefix" + a 32-character MD5 hex string as a unique key.

    MD5's collision rate on real data records can be ignored. The only
    known colliding inputs are carefully constructed 512-bit (64-byte)
    messages (see https://marc-stevens.nl/research/md5-1block-collision/),
    e.g. the pair differing by two bits:

    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa200a8284bf36e8e4b55b35f427593d849676da0d1555d8360fb5f07fea2
    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa202a8284bf36e8e4b55b35f427593d849676da0d1d55d8360fb5f07fea2

    which both hash to 008ee33a9d58b51cfeb425b0959121c9. Random collisions
    only become likely as the data size approaches 2^64 records (MD5 is
    128-bit), far beyond the record counts we handle (a billion is about
    2^30). Therefore MD5 over the primary key columns is good enough to
    create a unique key.

    Two call forms are supported:
        - smvHashKey(prefix, col1, col2, ...)
        - smvHashKey(col1, col2, ...)

    Args:
        prefix (String): prefix of the returned string
        others (\*Column): columns to include in the hash

    Returns:
        (Column): a StringType column holding Prefix + MD5 Hex string
    """
    if isinstance(head, basestring):
        pre, cols = head, list(others)
    elif isinstance(head, Column):
        pre, cols = "", [head] + list(others)
    else:
        raise RuntimeError("first parameter must be either a String or a Column")
    app = SmvApp.getInstance()
    jHashKey = app._jvm.org.tresamigos.smv.python.SmvPythonHelper.smvHashKey
    return Column(jHashKey(pre, smv_copy_array(app.sc, *cols)))
def smvPivotSum(self, pivotCols, valueCols, baseOutput):
    """Perform SmvPivot, then sum the results.

    The caller supplies the expected pivoted output column names so no
    extra action on the input DataFrame is needed. If an empty sequence is
    provided, the base output columns are extracted from the values in the
    pivot columns (this forces an action on the entire DataFrame!).

    Args:
        pivotCols (list(list(str))): lists of column names to pivot
        valueCols (list(str)): names of value columns to sum
        baseOutput (list(str)): expected names of the pivoted columns

    Example:
        Given rows (id, month, product, count):
        (1, 5/14, A, 100), (1, 6/14, B, 200), (1, 5/14, B, 300)

        >>> df.smvGroupBy("id").smvPivotSum([["month", "product"]], ["count"], ["5_14_A", "5_14_B", "6_14_A", "6_14_B"])

        yields one row per id with columns count_5_14_A=100,
        count_5_14_B=300, count_6_14_A=NULL, count_6_14_B=200.

    Returns:
        (DataFrame): result of the pivot + sum
    """
    sc = self.df._sc
    jdf = self.sgd.smvPivotSum(
        smv_copy_array(sc, *pivotCols),
        smv_copy_array(sc, *valueCols),
        smv_copy_array(sc, *baseOutput))
    return DataFrame(jdf, self.df.sql_ctx)
def smvExpandStruct(self, *cols):
    """Expand struct-type columns into a group of flat columns.

    Args:
        cols (\*string): names of the struct columns to expand

    Example:
        input DF: [id: string, address: struct<state:string, zip:string, street:string>]

        >>> df.smvExpandStruct("address")

        output DF: [id: string, state: string, zip: string, street: string]

    Returns:
        (DataFrame): DF with expanded columns
    """
    jcols = smv_copy_array(self._sc, *cols)
    expanded = self._jPythonHelper.smvExpandStruct(self._jdf, jcols)
    return DataFrame(expanded, self._sql_ctx)
def smvGroupBy(self, *cols):
    """Like groupBy, but creates an `SmvGroupedData` object instead of
    GroupedData.

    See [[org.tresamigos.smv.SmvGroupedDataFunc]] for the list of functions
    that can be applied to the grouped data.

    Args:
        cols (\*string or \*Column): column names or Column objects to group on

    Note:
        This is going away shortly and user will be able to use standard
        Spark `groupBy` method directly.

    Example:
        >>> df.smvGroupBy(col("k"))
        >>> df.smvGroupBy("k")

    Returns:
        (SmvGroupedData): grouped data object
    """
    jcols = smv_copy_array(self._sc, *cols)
    jGrouped = self._jPythonHelper.smvGroupBy(self._jdf, jcols)
    return SmvGroupedData(self.df, jGrouped)
def _moduleUrnsForStage(self, stageName, fn):
    """Collect the urns of objects in a stage's python modules for which
    ``fn(obj)`` is truthy.

    Walks every python package under the stage package (if one exists),
    imports each non-package module, and scans its attributes.

    Args:
        stageName (str): name of the stage package to walk
        fn (callable): predicate applied to each module attribute

    Returns:
        java array (via smv_copy_array) of the matching objects' urns
    """
    # `walk_packages` can generate AttributeError if the system has
    # Gtk modules, which are not designed to use with reflection or
    # introspection. Best action to take in this situation is probably
    # to simply suppress the error.
    def err(name): pass
        # print("Error importing module %s" % name)
        # t, v, tb = sys.exc_info()
        # print("type is {0}, value is {1}".format(t, v))

    buf = []
    # import the stage and only walk the packages in the path of that stage, recursively
    try:
        stagemod = __import__(stageName)
    except:
        # NOTE(review): bare except deliberately swallows ANY import failure,
        # not just ImportError — may be a scala-only stage
        pass
    else:
        for loader, name, is_pkg in pkgutil.walk_packages(stagemod.__path__, stagemod.__name__ + '.', onerror=err):
            # The additional "." is necessary to prevent false positive, e.g. stage_2.M1 matches stage
            if name.startswith(stageName + ".") and not is_pkg:
                # __import__ returns the top-level package; descend attribute
                # by attribute to reach the leaf module
                pymod = __import__(name)
                for c in name.split('.')[1:]:
                    pymod = getattr(pymod, c)
                for n in dir(pymod):
                    obj = getattr(pymod, n)
                    try:
                        # Class should have an fqn which begins with the stageName.
                        # Each package will contain among other things all of
                        # the modules that were imported into it, and we need
                        # to exclude these (so that we only count each module once)
                        if fn(obj) and obj.fqn().startswith(name):
                            buf.append(obj.urn())
                    except AttributeError:
                        # attributes without fqn()/urn() are not SMV datasets
                        continue
    return smv_copy_array(self.smvApp.sc, *buf)
def _smvBinHist(self, *colWithBin):
    """Compute a binned histogram for each (column, bin-size) pair.

    Args:
        colWithBin (\*tuple): (column name, bin size) pairs; the bin size
            is coerced to float before crossing into the JVM

    Returns:
        result of the java-side smvBinHist call
    """
    for elem in colWithBin:
        # fixed typo in the first message: "paraeter" -> "parameter"
        assert type(elem) is tuple, "smvBinHist takes a list of tuple(string, double) as parameter"
        assert len(elem) == 2, "smvBinHist takes a list of tuple(string, double) as parameter"
    # comprehension instead of map+lambda; multiplying by 1.0 coerces ints to float
    insureDouble = [(col, binSize * 1.0) for (col, binSize) in colWithBin]
    return self._jPythonHelper.smvBinHist(self._jdf, smv_copy_array(self._sc, *insureDouble))
def _smvConcatHist(self, *cols):
    """Compute a histogram over the concatenation of the given columns."""
    jcols = smv_copy_array(self._sc, *cols)
    return self._jPythonHelper.smvConcatHist(self._jdf, jcols)
def __doFill(*valueCols):
    # `self` and `orderCols` are captured from the enclosing scope.
    sc = self.df._sc
    jdf = self.sgd.smvFillNullWithPrevValue(
        smv_copy_array(sc, *orderCols),
        smv_copy_array(sc, *valueCols))
    return DataFrame(jdf, self.df.sql_ctx)
def _check(*dfothers):
    # `self` and `keyColName` are captured from the enclosing scope.
    jothers = smv_copy_array(self._sc, *dfothers)
    checked = self._jPythonHelper.smvOverlapCheck(self._jdf, keyColName, jothers)
    return DataFrame(checked, self._sql_ctx)
def _withOrder(*orderCols):
    # `self` and `keys` are captured from the enclosing scope.
    deduped = self._jPythonHelper.smvDedupByKeyWithOrder(
        self._jdf,
        smv_copy_array(self._sc, *keys),
        smv_copy_array(self._sc, *orderCols))
    return DataFrame(deduped, self._sql_ctx)