def test_field_exceptions(self):
    """Duplicate FieldConfig ids for one field name must raise ValueError.

    Exercises the three ways duplicates can be introduced: through the
    field_dict constructor argument, through add_multiple_config, and
    through repeated add_single_config calls with the same id.
    """
    duplicate_a = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
    duplicate_b = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
    duplicates = [duplicate_a, duplicate_b]

    # Case 1: duplicates handed over directly inside the field_dict argument
    with self.assertRaises(ValueError):
        bad_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                        "movielens_test", {"test": duplicates})
        ContentAnalyzer(bad_config).fit()

    # Case 2: duplicates registered in bulk for a single field name
    with self.assertRaises(ValueError):
        bad_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
        bad_config.add_multiple_config("test", duplicates)
        ContentAnalyzer(bad_config).fit()

    # Case 3: duplicates appended one at a time for the same field name
    with self.assertRaises(ValueError):
        bad_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
        bad_config.add_single_config("test", duplicate_a)
        bad_config.add_single_config("test", duplicate_b)
        ContentAnalyzer(bad_config).fit()
def test_create_content_embedding(self):
    """Fit an analyzer producing word embeddings for 'Title' and verify the
    serialized content holds an EmbeddingField whose value is an ndarray."""
    ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id=['imdbID'],
        output_directory="movielens_test_embedding",
    )
    ca_config.add_multiple_config(
        field_name='Title',
        config_list=[FieldConfig(
            WordEmbeddingTechnique(Gensim('glove-twitter-25')),
            NLTK(lemmatization=True, stopwords_removal=True))])
    ContentAnalyzer(ca_config).fit()

    # Find the generated output directory and inspect one serialized content.
    for entry in os.listdir(THIS_DIR):
        entry_path = os.path.join(THIS_DIR, entry)
        if os.path.isdir(entry_path) and 'movielens_test_embedding' in str(entry):
            with lzma.open(os.path.join(entry_path, 'tt0113497.xz'), 'r') as file:
                loaded = pickle.load(file)
                self.assertIsInstance(loaded.get_field("Title")[0], EmbeddingField)
                self.assertIsInstance(loaded.get_field("Title")[0].value, np.ndarray)
            break
def test_create_content_tfidf(self):
    """Fit an analyzer producing a tf-idf representation for 'Title' and
    verify the serialized content holds a FeaturesBagField with a dict value."""
    ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id='imdbID',
        output_directory="movielens_test_tfidf",
    )
    ca_config.add_multiple_config(
        field_name='Title',
        config_list=[FieldConfig(SkLearnTfIdf())])
    ContentAnalyzer(ca_config).fit()

    # Find the generated output directory and inspect one serialized content.
    for entry in os.listdir(THIS_DIR):
        entry_path = os.path.join(THIS_DIR, entry)
        if os.path.isdir(entry_path) and 'movielens_test_tfidf' in str(entry):
            with lzma.open(os.path.join(entry_path, 'tt0113497.xz'), 'r') as file:
                loaded = pickle.load(file)
                self.assertIsInstance(loaded.get_field("Title")[0], FeaturesBagField)
                self.assertIsInstance(loaded.get_field("Title")[0].value, dict)
            break
def test_create_contents_in_index(self):
    """Fit an analyzer that also writes 'Title' representations to search and
    keyword indexes, then verify the serialized content exposes IndexField
    entries with string values for the indexed representations."""
    output_dir = os.path.join(THIS_DIR, "movielens_test_original_index")
    ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id=['imdbID'],
        output_directory=output_dir,
    )
    ca_config.add_multiple_config(
        field_name='Title',
        config_list=[
            # original text into a search index, with an explicit id
            FieldConfig(OriginalData(), NLTK(lemmatization=True, stopwords_removal=True),
                        SearchIndex(os.path.join(output_dir, "index")), "test_search"),
            # tf-idf into a keyword index, with an explicit id
            FieldConfig(SkLearnTfIdf(), NLTK(),
                        KeywordIndex(os.path.join(output_dir, "index1")), "test_keyword"),
            # original text into the search index, no explicit id
            FieldConfig(OriginalData(), NLTK(),
                        SearchIndex(os.path.join(output_dir, "index"))),
        ])
    ContentAnalyzer(ca_config).fit()

    # Find the generated output directory and inspect one serialized content.
    for entry in os.listdir(THIS_DIR):
        entry_path = os.path.join(THIS_DIR, entry)
        if os.path.isdir(entry_path) and 'movielens_test_original_index' in str(entry):
            with lzma.open(os.path.join(entry_path, 'tt0113497.xz'), 'r') as file:
                loaded = pickle.load(file)
                title_field = loaded.get_field("Title")
                self.assertIsInstance(title_field[0], IndexField)
                self.assertIsInstance(title_field[0].value, str)
                self.assertIsInstance(title_field[1], IndexField)
                self.assertIsInstance(title_field[1].value, str)
            break
def test_fit_export_json(self):
    """With export_json=True the analyzer must also emit a contents.json file
    containing one entry per source item, keyed by '<field>#<index>'."""
    ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id=['imdbID'],
        output_directory=self.out_dir,
        export_json=True
    )
    ca_config.add_single_config('Plot', FieldConfig(OriginalData()))
    ca_config.add_single_config('Plot', FieldConfig(SkLearnTfIdf()))
    ca_config.add_single_config('imdbRating', FieldConfig())
    ContentAnalyzer(ca_config).fit()

    json_path = os.path.join(self.out_dir, 'contents.json')
    self.assertTrue(os.path.isfile(json_path))

    exported = list(JSONFile(json_path))
    self.assertEqual(len(exported), 20)
    # every exported content must carry all three serialized representations
    for exported_content in exported:
        self.assertIn('Plot#0', exported_content)
        self.assertIn('Plot#1', exported_content)
        self.assertIn('imdbRating#0', exported_content)

# def doCleanups(self) -> None:
#     if os.path.isdir(self.out_dir):
#         shutil.rmtree(self.out_dir)
def test_exogenous_exceptions(self):
    """Duplicate ExogenousConfig ids must raise ValueError.

    Exercises both ways duplicates can be introduced: through the
    exogenous_representation_list constructor argument and through repeated
    add_single_exogenous calls with the same id.
    """
    duplicate_a = ExogenousConfig(
        DBPediaMappingTechnique('dbo:Film', 'Title'), "test")
    duplicate_b = ExogenousConfig(
        DBPediaMappingTechnique('dbo:Film', 'Title'), "test")
    duplicates = [duplicate_a, duplicate_b]

    # Case 1: duplicates handed over directly in the constructor list
    with self.assertRaises(ValueError):
        bad_config = ItemAnalyzerConfig(
            JSONFile(movies_info_reduced), ["imdbID"], "movielens_test",
            exogenous_representation_list=duplicates)
        ContentAnalyzer(bad_config).fit()

    # Case 2: duplicates appended one at a time
    with self.assertRaises(ValueError):
        bad_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
        bad_config.add_single_exogenous(duplicate_a)
        bad_config.add_single_exogenous(duplicate_b)
        ContentAnalyzer(bad_config).fit()
def test_decode_field_data_embedding(self):
    """Decoding a source that already stores embedding data must yield an
    EmbeddingField with an ndarray value even with a default FieldConfig."""
    ca_config = ItemAnalyzerConfig(
        source=JSONFile(decode_embedding),
        id=['imdbID'],
        output_directory=decode_path + 'movies_embedding_'
    )
    ca_config.add_multiple_config(
        field_name='Title',
        config_list=[FieldConfig()]
    )
    ContentAnalyzer(config=ca_config).fit()

    # Find the generated output directory and inspect one serialized content.
    for entry in os.listdir(decode_path):
        entry_path = os.path.join(decode_path, entry)
        if os.path.isdir(entry_path) and 'movies_embedding_' in str(entry):
            with lzma.open(os.path.join(entry_path, 'tt0113497.xz'), 'r') as file:
                loaded = pickle.load(file)
                self.assertIsInstance(loaded.get_field("Title")[0], EmbeddingField)
                self.assertIsInstance(loaded.get_field("Title")[0].value, np.ndarray)
            break
def test_create_content(self):
    """End-to-end fit with an entity-linking field technique plus a DBPedia
    exogenous technique; the serialized 'Title' field must be a
    FeaturesBagField carrying a dict."""
    title_config = FieldConfig(BabelPyEntityLinking())
    dbpedia_config = ExogenousConfig(DBPediaMappingTechnique('Film', 'EN', 'Title'))

    ca_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
    ca_config.add_single_config("Title", title_config)
    ca_config.add_single_exogenous(dbpedia_config)
    ContentAnalyzer(ca_config).fit()

    # Find the generated output directory and inspect one serialized content.
    for entry in os.listdir(THIS_DIR):
        entry_path = os.path.join(THIS_DIR, entry)
        if os.path.isdir(entry_path) and 'movielens_test' in str(entry):
            with lzma.open(os.path.join(entry_path, 'tt0113497.xz'), 'r') as file:
                loaded = pickle.load(file)
                self.assertIsInstance(loaded.get_field("Title")[0], FeaturesBagField)
                self.assertIsInstance(loaded.get_field("Title")[0].value, dict)
            break
        timestamp_field_name='timestamp',
    )
ratings_frame = ratings_importer.import_ratings()

RatingsImporter(
    source: RawInformationSource,
    from_id_column: Union[str, int] = 0,
    to_id_column: Union[str, int] = 1,
    score_column: Union[str, int] = 2,
    timestamp_column: Union[str, int] = None,
    score_processor: RatingProcessor = None
)

ri = RatingsImporter(CSVFile(ratings_filename))
ri.import_ratings()
ri.add_score_column(
    score_column='title_review',
    column_name='title_sentiment',
    score_processor=TextBlobSentimentAnalysis()
)

from orange_cb_recsys.content_analyzer import ItemAnalyzerConfig, JSONFile

config = ItemAnalyzerConfig(
    source=JSONFile(raw_source),
    id="imdbId",
    output_directory='movies_codified/'
)