示例#1
0
    def create_spark_model_vectors_df(
            self, df: DataFrame) -> _UserResourceFeatureVectorMapping:
        """Train ALS on ``df`` and package the resulting user/resource mappings.

        An ``ALS`` estimator is configured from this object's attributes
        (``rank_param``, ``max_iter``, ``reg_param``, ``num_blocks``,
        ``apply_implicit_cf``, ``alpha_param`` and the column-name
        attributes). When ``self.separate_tenants`` is true, ``df`` is
        filtered per distinct value of ``self.tenant_col`` (in sorted order),
        ``self._train_cf`` is called once per tenant, and the per-tenant
        results are unioned. Otherwise ``self._train_cf`` is called once on
        the whole ``df``.

        Args:
            df: Input DataFrame; must contain ``self.tenant_col`` plus the
                columns handed to ``ALS`` as user, item and rating columns.

        Returns:
            A ``_UserResourceFeatureVectorMapping`` built from the column
            names and the two mapping DataFrames returned by ``_train_cf``.

        Raises:
            AssertionError: If no mapping was produced, which happens when
                ``separate_tenants`` is true and ``df`` has no tenants.
        """
        tenant_col = self.tenant_col
        indexed_user_col = self.indexed_user_col
        user_vec_col = self.user_vec_col
        indexed_res_col = self.indexed_res_col
        res_vec_col = self.res_vec_col
        separate_tenants = self.separate_tenants

        num_blocks = self.num_blocks
        if num_blocks is None:
            # Default block count: a fixed 10 when each tenant is trained on
            # its own, otherwise one block per distinct tenant. The distinct
            # count is a Spark job, so it is only run when actually needed.
            num_blocks = 10 if separate_tenants else (
                df.select(tenant_col).distinct().count())

        als = ALS(rank=self.rank_param,
                  maxIter=self.max_iter,
                  regParam=self.reg_param,
                  numUserBlocks=num_blocks,
                  numItemBlocks=num_blocks,
                  implicitPrefs=self.apply_implicit_cf,
                  userCol=indexed_user_col,
                  itemCol=indexed_res_col,
                  ratingCol=self.scaled_likelihood_col,
                  nonnegative=True,
                  coldStartStrategy='drop')

        alpha = self.alpha_param

        if alpha is not None:
            als.setAlpha(alpha)

        user_mapping_df: Optional[DataFrame] = None
        res_mapping_df: Optional[DataFrame] = None

        if separate_tenants:
            # The distinct tenants are materialised exactly once here, so no
            # cache is required (and none is left behind afterwards).
            tenants = [
                row[tenant_col]
                for row in df.select(tenant_col).distinct().orderBy(
                    tenant_col).collect()
            ]

            for curr_tenant in tenants:
                # NOTE(review): this per-tenant cache is never unpersisted.
                # Whether it can be released right after training depends on
                # whether _train_cf's results still reference curr_df lazily
                # -- confirm against _train_cf before adding unpersist().
                curr_df = df.filter(f.col(tenant_col) == curr_tenant).cache()
                curr_user_mapping_df, curr_res_mapping_df = self._train_cf(
                    als, curr_df)

                user_mapping_df = user_mapping_df.union(
                    curr_user_mapping_df
                ) if user_mapping_df is not None else curr_user_mapping_df

                res_mapping_df = res_mapping_df.union(
                    curr_res_mapping_df
                ) if res_mapping_df is not None else curr_res_mapping_df
        else:
            user_mapping_df, res_mapping_df = self._train_cf(als, df)

        # Both stay None only if the per-tenant loop never ran (no tenants).
        assert user_mapping_df is not None and res_mapping_df is not None

        return _UserResourceFeatureVectorMapping(tenant_col, indexed_user_col,
                                                 user_vec_col, indexed_res_col,
                                                 res_vec_col, None, None, None,
                                                 user_mapping_df,
                                                 res_mapping_df)