def test_field_exceptions(self):
    """Duplicate FieldConfig ids for the same field name must raise ValueError.

    Covers the three ways duplicates can be introduced: passing a field_dict
    with duplicate ids straight to the analyzer config, setting the whole
    FieldConfig list for a field name at once, and appending configs one by
    one to the same field name.
    """
    first_config = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
    second_config = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
    duplicated = [first_config, second_config]

    # Case 1: duplicates handed over inside the field_dict argument.
    with self.assertRaises(ValueError):
        conf = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                  "movielens_test", {"test": duplicated})
        ContentAnalyzer(conf).fit()

    # Case 2: duplicates set via add_multiple_config for one field name.
    with self.assertRaises(ValueError):
        conf = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                  "movielens_test")
        conf.add_multiple_config("test", duplicated)
        ContentAnalyzer(conf).fit()

    # Case 3: duplicates appended one at a time via add_single_config.
    with self.assertRaises(ValueError):
        conf = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"],
                                  "movielens_test")
        conf.add_single_config("test", first_config)
        conf.add_single_config("test", second_config)
        ContentAnalyzer(conf).fit()
def test_fit_export_json(self):
    """fit() with export_json=True must also serialize the contents to JSON.

    Builds two Plot representations and one imdbRating representation, runs
    the analyzer, then checks the exported file exists and that each of the
    20 processed contents carries all three representation keys.
    """
    ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id=['imdbID'],
        output_directory=self.out_dir,
        export_json=True
    )
    ca_config.add_single_config('Plot', FieldConfig(OriginalData()))
    ca_config.add_single_config('Plot', FieldConfig(SkLearnTfIdf()))
    ca_config.add_single_config('imdbRating', FieldConfig())

    ContentAnalyzer(ca_config).fit()

    json_path = os.path.join(self.out_dir, 'contents.json')
    self.assertTrue(os.path.isfile(json_path))

    exported = list(JSONFile(json_path))
    self.assertEqual(len(exported), 20)
    for item in exported:
        self.assertIn('Plot#0', item)
        self.assertIn('Plot#1', item)
        self.assertIn('imdbRating#0', item)

# Cleanup hook kept for reference; currently disabled in the original file.
# def doCleanups(self) -> None:
#     if os.path.isdir(self.out_dir):
#         shutil.rmtree(self.out_dir)
def test_create_content(self):
    # Glob pattern for movielens test contents under the test tree.
    file_path_content_analyzer = os.path.join(
        THIS_DIR, "../../test/content_analyzer/movielens_test*")
    # Represent the Plot field via BabelPy entity linking.
    entity_linking_pipeline = FieldRepresentationPipeline(
        BabelPyEntityLinking())
    plot_config = FieldConfig(None)
    plot_config.append_pipeline(entity_linking_pipeline)
    # NOTE(review): `file_path` is not defined in this fragment (only
    # `file_path_content_analyzer` is) — presumably a module-level
    # constant; verify against the enclosing file.
    content_analyzer_config = ContentAnalyzerConfig(
        'ITEM', JSONFile(file_path), ["imdbID"], "movielens_test")
    content_analyzer_config.append_field_config("Plot", plot_config)
    content_analyzer = ContentAnalyzer(content_analyzer_config)
    content_analyzer.fit()
    # Closing delimiter of a triple-quoted block opened before this
    # fragment — this test appears to be commented out in the original.
    """
def test_valid_id(self):
    """Ids made of letters, digits, '-' and '_' are accepted by FieldConfig."""
    for good_id in ('test', 'test_valid', 'test-valid', 'test1-valid2', '1_2-3_'):
        self.assertIsNotNone(FieldConfig(id=good_id))
def test_invalid_id(self):
    """Ids containing '.', '#', whitespace or '/' must raise ValueError."""
    for bad_id in ('.in.vali.d', '#in#vali#d', ' ', 'is invalid', 'is/inva/lid'):
        with self.assertRaises(ValueError):
            FieldConfig(id=bad_id)
def test_decode_field_data_embedding(self):
    """Decoding a pre-serialized embedding field yields an EmbeddingField
    whose value is a numpy array (representation '0' of Title)."""
    source_path = os.path.join(
        THIS_DIR, "../../datasets/test_decode/movies_title_embedding.json")
    decode_dir = os.path.join(THIS_DIR, "../../datasets/test_decode/")

    ca_config = ContentAnalyzerConfig(
        content_type='Item',
        source=JSONFile(source_path),
        id_field_name_list=['imdbID'],
        output_directory=decode_dir + 'movies_embedding_')
    # content_technique=None: the field value is decoded as-is, not recomputed.
    ca_config.append_field_config(
        field_name='Title',
        field_config=FieldConfig(pipelines_list=[
            FieldRepresentationPipeline(content_technique=None),
        ]))
    ContentAnalyzer(config=ca_config).fit()

    # Locate the timestamped output directory and inspect one known content.
    for entry in os.listdir(decode_dir):
        if os.path.isdir(os.path.join(decode_dir, entry)) \
                and 'movies_embedding_' in str(entry):
            with lzma.open(os.path.join(decode_dir, entry, 'tt0113497.xz'), 'r') as file:
                content = pickle.load(file)
                title_repr = content.get_field("Title").get_representation('0')
                self.assertIsInstance(title_repr, EmbeddingField)
                self.assertIsInstance(title_repr.value, np.ndarray)
            break
def test_create_content_embedding(self):
    """A Gensim word-embedding technique on Title must produce an
    EmbeddingField backed by a numpy array."""
    ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id=['imdbID'],
        output_directory="movielens_test_embedding",
    )
    ca_config.add_multiple_config(
        field_name='Title',
        config_list=[FieldConfig(
            WordEmbeddingTechnique(Gensim('glove-twitter-25')),
            NLTK(lemmatization=True, stopwords_removal=True))])

    ContentAnalyzer(ca_config).fit()

    # Locate the timestamped output directory and inspect one known content.
    for entry in os.listdir(THIS_DIR):
        if os.path.isdir(os.path.join(THIS_DIR, entry)) \
                and 'movielens_test_embedding' in str(entry):
            with lzma.open(os.path.join(THIS_DIR, entry, 'tt0113497.xz'), 'r') as file:
                content = pickle.load(file)
                title_field = content.get_field("Title")[0]
                self.assertIsInstance(title_field, EmbeddingField)
                self.assertIsInstance(title_field.value, np.ndarray)
            break
def test_create_content_tfidf(self):
    """Sklearn tf-idf on Title must yield a FeaturesBagField with a dict value."""
    ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id='imdbID',
        output_directory="movielens_test_tfidf",
    )
    ca_config.add_multiple_config(
        field_name='Title',
        config_list=[FieldConfig(SkLearnTfIdf())])

    ContentAnalyzer(ca_config).fit()

    # Locate the timestamped output directory and inspect one known content.
    for entry in os.listdir(THIS_DIR):
        if os.path.isdir(os.path.join(THIS_DIR, entry)) \
                and 'movielens_test_tfidf' in str(entry):
            with lzma.open(os.path.join(THIS_DIR, entry, 'tt0113497.xz'), 'r') as file:
                content = pickle.load(file)
                title_field = content.get_field("Title")[0]
                self.assertIsInstance(title_field, FeaturesBagField)
                self.assertIsInstance(title_field.value, dict)
            break
def test_create_content(self):
    """Entity-linking pipeline on the Plot field runs end to end."""
    # Resolve the dataset path whether tests run from the test directory
    # or from the repository root (EAFP: probe the first candidate).
    dataset = '../../datasets/movies_info_reduced.json'
    try:
        open(dataset).close()
    except FileNotFoundError:
        dataset = 'datasets/movies_info_reduced.json'

    linking_pipeline = FieldRepresentationPipeline(BabelPyEntityLinking())
    plot_conf = FieldConfig(None)
    plot_conf.append_pipeline(linking_pipeline)

    analyzer_conf = ContentAnalyzerConfig(
        'ITEM', JSONFile(dataset), ["imdbID"], "movielens_test")
    analyzer_conf.append_field_config("Plot", plot_conf)

    ContentAnalyzer(analyzer_conf).fit()
def test_create_contents_in_index(self):
    """Title representations stored in search/keyword indexes come back as
    IndexField instances with string values."""
    index_out = os.path.join(THIS_DIR, "movielens_test_original_index")

    ca_config = ItemAnalyzerConfig(
        source=JSONFile(movies_info_reduced),
        id=['imdbID'],
        output_directory=index_out,
    )
    ca_config.add_multiple_config(
        field_name='Title',
        config_list=[
            # Original text, preprocessed, stored in a search index.
            FieldConfig(OriginalData(),
                        NLTK(lemmatization=True, stopwords_removal=True),
                        SearchIndex(os.path.join(index_out, "index")),
                        "test_search"),
            # Tf-idf representation stored in a separate keyword index.
            FieldConfig(SkLearnTfIdf(),
                        NLTK(),
                        KeywordIndex(os.path.join(index_out, "index1")),
                        "test_keyword"),
            # Second, unnamed original-data config sharing the search index.
            FieldConfig(OriginalData(),
                        NLTK(),
                        SearchIndex(os.path.join(index_out, "index")))
        ])

    ContentAnalyzer(ca_config).fit()

    # Locate the timestamped output directory and check both indexed
    # representations of one known content.
    for entry in os.listdir(THIS_DIR):
        if os.path.isdir(os.path.join(THIS_DIR, entry)) \
                and 'movielens_test_original_index' in str(entry):
            with lzma.open(os.path.join(THIS_DIR, entry, 'tt0113497.xz'), 'r') as file:
                content = pickle.load(file)
                for rep_idx in (0, 1):
                    self.assertIsInstance(
                        content.get_field("Title")[rep_idx], IndexField)
                    self.assertIsInstance(
                        content.get_field("Title")[rep_idx].value, str)
            break
def test_create_content_search_index(self):
    """Indexing the Title field with SearchIndexing completes without errors."""
    # NOTE(review): `file_path` is a module-level constant not visible here.
    ca_config = ContentAnalyzerConfig(
        content_type='Item',
        source=JSONFile(file_path),
        id_field_name_list=['imdbID'],
        output_directory='movielens_test')
    ca_config.append_field_config(
        field_name='Title',
        field_config=FieldConfig(pipelines_list=[
            FieldRepresentationPipeline(content_technique=SearchIndexing())
        ]))
    ContentAnalyzer(ca_config).fit()
def test_create_content(self):
    """Entity linking plus a DBPedia exogenous technique produce a
    dict-valued FeaturesBagField representation for Title."""
    title_conf = FieldConfig(BabelPyEntityLinking())
    exo_conf = ExogenousConfig(DBPediaMappingTechnique('Film', 'EN', 'Title'))

    analyzer_conf = ItemAnalyzerConfig(
        JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
    analyzer_conf.add_single_config("Title", title_conf)
    analyzer_conf.add_single_exogenous(exo_conf)

    ContentAnalyzer(analyzer_conf).fit()

    # Locate the timestamped output directory and inspect one known content.
    for entry in os.listdir(THIS_DIR):
        if os.path.isdir(os.path.join(THIS_DIR, entry)) \
                and 'movielens_test' in str(entry):
            with lzma.open(os.path.join(THIS_DIR, entry, 'tt0113497.xz'), 'r') as file:
                content = pickle.load(file)
                title_field = content.get_field("Title")[0]
                self.assertIsInstance(title_field, FeaturesBagField)
                self.assertIsInstance(title_field.value, dict)
            break
def test_decode_field_data_embedding(self):
    """A serialized embedding decodes back into an EmbeddingField holding a
    numpy array."""
    ca_config = ItemAnalyzerConfig(
        source=JSONFile(decode_embedding),
        id=['imdbID'],
        output_directory=decode_path + 'movies_embedding_'
    )
    # Default FieldConfig: the stored value is decoded, not recomputed.
    ca_config.add_multiple_config(field_name='Title',
                                  config_list=[FieldConfig()])
    ContentAnalyzer(config=ca_config).fit()

    # Locate the timestamped output directory and inspect one known content.
    for entry in os.listdir(decode_path):
        if os.path.isdir(os.path.join(decode_path, entry)) \
                and 'movies_embedding_' in str(entry):
            with lzma.open(os.path.join(decode_path, entry, 'tt0113497.xz'), 'r') as file:
                content = pickle.load(file)
                title_field = content.get_field("Title")[0]
                self.assertIsInstance(title_field, EmbeddingField)
                self.assertIsInstance(title_field.value, np.ndarray)
            break
def test_create_content_embedding(self):
    """Doc-granularity centroid embedding over the Title field runs end to end."""
    # NOTE(review): `file_path` is a module-level constant not visible here.
    ca_config = ContentAnalyzerConfig(
        content_type='Item',
        source=JSONFile(file_path),
        id_field_name_list=['imdbID'],
        output_directory="movielens_test",
    )
    ca_config.append_field_config(
        field_name='Title',
        field_config=FieldConfig(pipelines_list=[
            FieldRepresentationPipeline(
                preprocessor_list=[
                    NLTK(lemmatization=True, stopwords_removal=True)
                ],
                content_technique=EmbeddingTechnique(
                    combining_technique=Centroid(),
                    embedding_source=GensimDownloader(name='glove-twitter-25'),
                    granularity='doc'))
        ]))
    ContentAnalyzer(ca_config).fit()
# Dataset and output locations for the users side of example 2.
users_filename = '../../../datasets/examples/users_70.dat'
users_output_dir = '../../../contents/examples/ex_2/users_'

# NOTE(review): `movies_filename` / `movies_output_dir` are defined outside
# this fragment — confirm against the full script.
movies_ca_config = ContentAnalyzerConfig(
    content_type='Item',
    source=JSONFile(movies_filename),
    id_field_name_list=['imdbID'],
    output_directory=movies_output_dir,
)
# Title and Year are both represented with a plain Lucene TF-IDF technique.
movies_ca_config.append_field_config(
    field_name='Title',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf())
    ]))
movies_ca_config.append_field_config(
    field_name='Year',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf())
    ]))
# Genre is preprocessed with NLTK (lemmatization + stopword removal) before
# TF-IDF. NOTE(review): this call is truncated — its closing `]))` lies
# past the visible chunk.
movies_ca_config.append_field_config(
    field_name='Genre',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(preprocessor_list=[
            NLTK(lemmatization=True, stopwords_removal=True)
        ], content_technique=LuceneTfIdf())
# Dataset and output locations for the users side of example 1.
users_filename = '../../../datasets/examples/users_70.dat'
users_output_dir = '../../../contents/examples/ex_1/users_'

# NOTE(review): `movies_filename` / `movies_output_dir` are defined outside
# this fragment — confirm against the full script.
movies_ca_config = ContentAnalyzerConfig(
    content_type='Item',
    source=JSONFile(movies_filename),
    id_field_name_list=['imdbID'],
    output_directory=movies_output_dir,
)
# Title and Year are both represented with a plain Lucene TF-IDF technique.
movies_ca_config.append_field_config(
    field_name='Title',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf())
    ]))
movies_ca_config.append_field_config(
    field_name='Year',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf())
    ]))
# Genre is preprocessed with NLTK (lemmatization + stopword removal) before
# TF-IDF. NOTE(review): this call is truncated — its closing `]))` lies
# past the visible chunk.
movies_ca_config.append_field_config(
    field_name='Genre',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(preprocessor_list=[
            NLTK(lemmatization=True, stopwords_removal=True)
        ], content_technique=LuceneTfIdf())
# Tail of a constructor call whose opening lies before this fragment —
# NOTE(review): presumably a user analyzer config; confirm in the full script.
    source = JSONFile(users_filename),
    id = '0',
    output_dir = users_output_dir,
)
# NOTE(review): this config is constructed but never bound to a name in the
# visible fragment — verify whether the assignment was lost.
ItemAnalyzerConfig(
    source = JSONFile(items_filename),
    id = 'imdbID',
    output_dir = items_output_dir,
)
# Title and Year are represented with a plain Lucene TF-IDF technique.
movies_ca_config.append_field_config(
    field_name='Title',
    field_config=FieldConfig(
        pipelines_list=[FieldRepresentationPipeline(
            content_technique=LuceneTfIdf())]
    )
)
movies_ca_config.append_field_config(
    field_name='Year',
    field_config=FieldConfig(
        pipelines_list=[FieldRepresentationPipeline(
            content_technique=LuceneTfIdf())]
    )
)
# NOTE(review): this final call is truncated — the rest of the Genre config
# lies past the visible chunk.
movies_ca_config.append_field_config(
    field_name='Genre',