def test_generate(self):
        """Check ``Similarity.generate`` for supported and unsupported types.

        Reads the feature fixture, casts every column whose name does not
        contain ``'id'`` to integer, and asserts that the first object
        returned by ``generate()`` has as many rows and columns (``.shape``)
        as the fixture has rows, for both ``'cosine'`` and ``'euclidean'``.
        For an unsupported similarity type, ``generate()`` must raise
        ``ValueError``.
        """
        df_features = self.spark.read.csv(
            'tests/fixtures/similarity/features.csv', header=True)

        # Cast all non-id columns in one projection instead of chaining a
        # withColumn call per column. Iterating ``df_features.columns`` keeps
        # the original column order, and ``alias`` keeps the original names.
        df_features_int = df_features.select([
            f.col(name) if 'id' in name
            else f.col(name).cast(IntegerType()).alias(name)
            for name in df_features.columns
        ])

        # count() is a Spark action; evaluate it once and reuse the result
        # for every shape assertion below.
        expected_size = df_features.count()

        for similarity_type in ('cosine', 'euclidean'):
            similarity = Similarity(df_features=df_features_int,
                                    similarity_type=similarity_type)

            # Only the first returned object is checked here; the second
            # one is intentionally ignored, as in the original assertions.
            pd_df_similarity, _ = similarity.generate()

            self.assertEqual(pd_df_similarity.shape[0], expected_size,
                             msg=similarity_type)
            self.assertEqual(pd_df_similarity.shape[1], expected_size,
                             msg=similarity_type)

        # Construction happens outside the assertRaises context on purpose:
        # the error is expected from generate(), not from the constructor.
        similarity_fail = Similarity(df_features=df_features_int,
                                     similarity_type='test')
        with self.assertRaises(ValueError):
            similarity_fail.generate()
# Example #2
0
# One timestamp per run; every output directory below is nested under it.
etl_created = create_timestamp()

# --- Features: preprocess the labels and persist the result as CSV. ---
preprocessor = Preprocess(
    df_labels=df_labels,
    columns=COLUMNS,
    index_column=INDEX_COLUMN,
)
df_recipe_features = preprocessor.preprocess()
pd_df_recipe_features = df_recipe_features.toPandas()

features_dir = f'output/{etl_created}/features'
os.makedirs(features_dir)
pd_df_recipe_features.to_csv(
    f'{features_dir}/features.csv',
    index=False,
)

# --- Similarities: generate() yields the wide table first, then the long. ---
similarity = Similarity(
    df_features=df_recipe_features,
    index_column=INDEX_COLUMN,
    similarity_type=SIMILARITY_TYPE,
)
similarities = similarity.generate()
pd_df_similarities_wide, pd_df_similarities_long = (
    similarities[0],
    similarities[1],
)

similarities_dir = f'output/{etl_created}/similarities'
os.makedirs(similarities_dir)
# The wide table is written with its index, the long table without it.
pd_df_similarities_wide.to_csv(
    f'{similarities_dir}/similarities_wide.csv',
    index=True,
)
pd_df_similarities_long.to_csv(
    f'{similarities_dir}/similarities_long.csv',
    index=False,
)

# --- Parameters: record the settings used for this run. ---
parameters_dir = f'output/{etl_created}/parameters'
os.makedirs(parameters_dir)
pd_df_parameters = create_parameters_table(
    similarity_type=SIMILARITY_TYPE,
    index_column=INDEX_COLUMN,
    columns=COLUMNS,
)
pd_df_parameters.to_csv(
    f'{parameters_dir}/parameters.csv',
    index=False,
)