def create_spark_model_vectors_df(
        self, df: DataFrame) -> _UserResourceFeatureVectorMapping:
    """Train ALS collaborative-filtering model(s) on *df* and return the
    resulting user and resource latent-feature-vector mappings.

    When ``self.separate_tenants`` is set, one model is trained per tenant
    (on that tenant's rows only) and the per-tenant mapping DataFrames are
    unioned; otherwise a single model is trained on the whole frame.

    Parameters
    ----------
    df : DataFrame
        Input frame; assumed to contain the tenant, indexed-user,
        indexed-resource and scaled-likelihood columns configured on
        ``self`` — TODO confirm against caller.

    Returns
    -------
    _UserResourceFeatureVectorMapping
        Wrapper around the per-user and per-resource vector DataFrames.
    """
    tenant_col = self.tenant_col
    indexed_user_col = self.indexed_user_col
    user_vec_col = self.user_vec_col
    indexed_res_col = self.indexed_res_col
    res_vec_col = self.res_vec_col

    # Cached because it is consumed twice: count() here and, when
    # training per tenant, an orderBy().collect() below.
    distinct_tenants = df.select(tenant_col).distinct().cache()
    num_tenants = distinct_tenants.count()
    separate_tenants = self.separate_tenants

    if self.num_blocks is not None:
        num_blocks = self.num_blocks
    else:
        # Default: one block per tenant for the combined model; a fixed
        # 10 blocks when each (smaller) per-tenant model is trained alone.
        num_blocks = num_tenants if not separate_tenants else 10

    als = ALS(
        rank=self.rank_param,
        maxIter=self.max_iter,
        regParam=self.reg_param,
        numUserBlocks=num_blocks,
        numItemBlocks=num_blocks,
        implicitPrefs=self.apply_implicit_cf,
        userCol=indexed_user_col,
        itemCol=indexed_res_col,
        ratingCol=self.scaled_likelihood_col,
        nonnegative=True,
        coldStartStrategy='drop',
    )
    # alpha is meaningful only for the implicit-feedback formulation;
    # set it only when explicitly configured.
    if self.alpha_param is not None:
        als.setAlpha(self.alpha_param)

    if separate_tenants:
        tenants = [
            row[tenant_col]
            for row in distinct_tenants.orderBy(tenant_col).collect()
        ]
        # Fully consumed at this point — release the cached blocks.
        distinct_tenants.unpersist()

        user_mapping_df: Optional[DataFrame] = None
        res_mapping_df: Optional[DataFrame] = None
        for curr_tenant in tenants:
            # NOTE(review): curr_df is cached for training but never
            # unpersisted; the returned mappings may still depend on its
            # lineage via _train_cf, so releasing it here is not safe to
            # do blindly — confirm before changing.
            curr_df = df.filter(f.col(tenant_col) == curr_tenant).cache()
            curr_user_mapping_df, curr_res_mapping_df = self._train_cf(
                als, curr_df)
            if user_mapping_df is None:
                user_mapping_df = curr_user_mapping_df
            else:
                user_mapping_df = user_mapping_df.union(curr_user_mapping_df)
            if res_mapping_df is None:
                res_mapping_df = curr_res_mapping_df
            else:
                res_mapping_df = res_mapping_df.union(curr_res_mapping_df)
    else:
        # Only count() was needed from the cached frame in this branch.
        distinct_tenants.unpersist()
        user_mapping_df, res_mapping_df = self._train_cf(als, df)

    assert user_mapping_df is not None and res_mapping_df is not None
    return _UserResourceFeatureVectorMapping(
        tenant_col, indexed_user_col, user_vec_col, indexed_res_col,
        res_vec_col, None, None, None, user_mapping_df, res_mapping_df)