def test_generate(self):
    """Check that ``Similarity.generate`` yields a square matrix per item.

    Loads the feature fixture, casts every non-id column to integer and
    verifies that, for both the cosine and euclidean similarity types, the
    first element returned by ``generate()`` has as many rows and columns
    as the fixture has records. Finally asserts that an unsupported
    similarity type makes ``generate()`` raise ``ValueError``.
    """
    df_features = self.spark.read.csv(
        'tests/fixtures/similarity/features.csv', header=True)

    # The CSV is read without a schema, so feature columns are cast
    # explicitly; columns whose name contains 'id' are left untouched.
    columns_to_convert = [
        name for name in df_features.columns if 'id' not in name
    ]
    df_features_int = df_features
    for name in columns_to_convert:
        df_features_int = df_features_int.withColumn(
            name, f.col(name).cast(IntegerType()))

    # count() is a Spark action: evaluate it once instead of once per
    # assertion.
    expected_size = df_features.count()

    for similarity_type in ('cosine', 'euclidean'):
        similarity = Similarity(df_features=df_features_int,
                                similarity_type=similarity_type)
        pd_df_similarity, _ = similarity.generate()
        # The similarity type is passed as the failure message so a broken
        # metric is identifiable from the test report.
        self.assertEqual(pd_df_similarity.shape[0], expected_size,
                         similarity_type)
        self.assertEqual(pd_df_similarity.shape[1], expected_size,
                         similarity_type)

    similarity_fail = Similarity(df_features=df_features_int,
                                 similarity_type='test')
    with self.assertRaises(ValueError):
        similarity_fail.generate()
# Timestamp identifying this run; every output below is written under
# output/<etl_created>/.
etl_created = create_timestamp()

# --- Features -------------------------------------------------------------
preprocessor = Preprocess(
    df_labels=df_labels,
    columns=COLUMNS,
    index_column=INDEX_COLUMN,
)
df_recipe_features = preprocessor.preprocess()
pd_df_recipe_features = df_recipe_features.toPandas()

features_dir = f'output/{etl_created}/features'
os.makedirs(features_dir)
features_csv = f'{features_dir}/features.csv'
pd_df_recipe_features.to_csv(features_csv, index=False)

# --- Similarities ---------------------------------------------------------
similarity = Similarity(
    df_features=df_recipe_features,
    index_column=INDEX_COLUMN,
    similarity_type=SIMILARITY_TYPE,
)
similarities = similarity.generate()
# generate() result: wide table first, long table second.
pd_df_similarities_wide, pd_df_similarities_long = (
    similarities[0],
    similarities[1],
)

similarities_dir = f'output/{etl_created}/similarities'
os.makedirs(similarities_dir)
similarities_wide_csv = f'{similarities_dir}/similarities_wide.csv'
similarities_long_csv = f'{similarities_dir}/similarities_long.csv'
# The wide table keeps its index in the CSV; the long table does not.
pd_df_similarities_wide.to_csv(similarities_wide_csv, index=True)
pd_df_similarities_long.to_csv(similarities_long_csv, index=False)

# --- Parameters -----------------------------------------------------------
parameters_dir = f'output/{etl_created}/parameters'
os.makedirs(parameters_dir)
pd_df_parameters = create_parameters_table(
    similarity_type=SIMILARITY_TYPE,
    index_column=INDEX_COLUMN,
    columns=COLUMNS,
)
parameters_csv = f'{parameters_dir}/parameters.csv'
pd_df_parameters.to_csv(parameters_csv, index=False)