Example #1
    def test_field_exceptions(self):
        # Test that the check on FieldConfig ids (one list of FieldConfigs per field name in the
        # field_dict of the content analyzer) works. Three cases are covered: passing a field_dict
        # with duplicate ids to the analyzer config, setting a FieldConfig list containing
        # duplicates for a specific field_name, and appending a FieldConfig whose id is already
        # present in the list associated with that field_name.

        config_1 = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
        config_2 = FieldConfig(SkLearnTfIdf(), NLTK(), id="test")
        config_list = [config_1, config_2]
        field_dict = dict()
        field_dict["test"] = config_list

        with self.assertRaises(ValueError):
            config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test", field_dict)
            ContentAnalyzer(config).fit()

        with self.assertRaises(ValueError):
            config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
            config.add_multiple_config("test", config_list)
            ContentAnalyzer(config).fit()

        with self.assertRaises(ValueError):
            config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
            config.add_single_config("test", config_1)
            config.add_single_config("test", config_2)
            ContentAnalyzer(config).fit()
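For contrast, a minimal sketch of the non-failing case. It reuses only names that appear in the tests on this page (movies_info_reduced, the 'Title' field, the same constructors) and is an illustration under those assumptions, not part of the test suite: when the two FieldConfigs for the same field carry distinct ids, the duplicate-id check has nothing to flag and fit() can complete.

config_a = FieldConfig(SkLearnTfIdf(), NLTK(), id="tfidf_raw")   # distinct id
config_b = FieldConfig(SkLearnTfIdf(), NLTK(), id="tfidf_alt")   # distinct id

config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
config.add_multiple_config("Title", [config_a, config_b])        # no duplicate ids for 'Title'
ContentAnalyzer(config).fit()                                    # no ValueError expected here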
Example #2
    def test_fit_export_json(self):
        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory=self.out_dir,
            export_json=True
        )

        movies_ca_config.add_single_config('Plot', FieldConfig(OriginalData()))
        movies_ca_config.add_single_config('Plot', FieldConfig(SkLearnTfIdf()))
        movies_ca_config.add_single_config('imdbRating', FieldConfig())

        ContentAnalyzer(movies_ca_config).fit()

        self.assertTrue(os.path.isfile(os.path.join(self.out_dir, 'contents.json')))
        processed_source = list(JSONFile(os.path.join(self.out_dir, 'contents.json')))

        self.assertEqual(len(processed_source), 20)
        for processed_content in processed_source:
            self.assertIn('Plot#0', processed_content)
            self.assertIn('Plot#1', processed_content)
            self.assertIn('imdbRating#0', processed_content)

    # def doCleanups(self) -> None:
    #     if os.path.isdir(self.out_dir):
    #         shutil.rmtree(self.out_dir)
Example #3
    def test_create_content(self):
        file_path_content_analyzer = os.path.join(
            THIS_DIR, "../../test/content_analyzer/movielens_test*")
        entity_linking_pipeline = FieldRepresentationPipeline(
            BabelPyEntityLinking())
        plot_config = FieldConfig(None)
        plot_config.append_pipeline(entity_linking_pipeline)
        content_analyzer_config = ContentAnalyzerConfig(
            'ITEM', JSONFile(file_path), ["imdbID"], "movielens_test")
        content_analyzer_config.append_field_config("Plot", plot_config)
        content_analyzer = ContentAnalyzer(content_analyzer_config)
        content_analyzer.fit()
Example #4
    def test_valid_id(self):
        valid_object = FieldConfig(id='test')
        self.assertIsNotNone(valid_object)

        valid_object = FieldConfig(id='test_valid')
        self.assertIsNotNone(valid_object)

        valid_object = FieldConfig(id='test-valid')
        self.assertIsNotNone(valid_object)

        valid_object = FieldConfig(id='test1-valid2')
        self.assertIsNotNone(valid_object)

        valid_object = FieldConfig(id='1_2-3_')
        self.assertIsNotNone(valid_object)
Example #5
    def test_invalid_id(self):
        with self.assertRaises(ValueError):
            FieldConfig(id='.in.vali.d')

        with self.assertRaises(ValueError):
            FieldConfig(id='#in#vali#d')

        with self.assertRaises(ValueError):
            FieldConfig(id='     ')

        with self.assertRaises(ValueError):
            FieldConfig(id='is invalid')

        with self.assertRaises(ValueError):
            FieldConfig(id='is/inva/lid')
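Taken together, Examples #4 and #5 suggest that a FieldConfig id may contain only letters, digits, underscores and hyphens, while dots, '#', slashes, spaces and blank ids are rejected. The stand-alone check below is a hypothetical sketch consistent with those cases, not the library's own validation, which may apply additional rules.

import re

# Hypothetical validator mirroring the valid/invalid ids exercised above;
# not the library's implementation.
_ID_PATTERN = re.compile(r'[A-Za-z0-9_-]+')

def looks_like_valid_field_config_id(custom_id: str) -> bool:
    """True if the id is non-empty and uses only letters, digits, '_' or '-'."""
    return bool(_ID_PATTERN.fullmatch(custom_id))

assert looks_like_valid_field_config_id('test1-valid2')
assert looks_like_valid_field_config_id('1_2-3_')
assert not looks_like_valid_field_config_id('is invalid')
assert not looks_like_valid_field_config_id('.in.vali.d')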
Example #6
    def test_decode_field_data_embedding(self):
        file_path_test_decode = os.path.join(
            THIS_DIR, "../../datasets/test_decode/movies_title_embedding.json")
        test_dir = os.path.join(THIS_DIR, "../../datasets/test_decode/")

        movies_ca_config = ContentAnalyzerConfig(
            content_type='Item',
            source=JSONFile(file_path_test_decode),
            id_field_name_list=['imdbID'],
            output_directory=test_dir + 'movies_embedding_')

        movies_ca_config.append_field_config(
            field_name='Title',
            field_config=FieldConfig(pipelines_list=[
                FieldRepresentationPipeline(content_technique=None),
            ]))
        ContentAnalyzer(config=movies_ca_config).fit()

        for name in os.listdir(test_dir):
            if os.path.isdir(os.path.join(test_dir, name)) \
                    and 'movies_embedding_' in str(name):
                with lzma.open(os.path.join(test_dir, name, 'tt0113497.xz'),
                               'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(
                        content.get_field("Title").get_representation('0'),
                        EmbeddingField)
                    self.assertIsInstance(
                        content.get_field("Title").get_representation(
                            '0').value, np.ndarray)
                    break
Example #7
    def test_create_content_embedding(self):
        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory="movielens_test_embedding",
        )

        movies_ca_config.add_multiple_config(
            field_name='Title',
            config_list=[FieldConfig(
                    WordEmbeddingTechnique(Gensim('glove-twitter-25')),
                    NLTK(lemmatization=True, stopwords_removal=True))])

        content_analyzer = ContentAnalyzer(movies_ca_config)
        content_analyzer.fit()

        for name in os.listdir(THIS_DIR):
            if os.path.isdir(os.path.join(THIS_DIR, name)) \
                    and 'movielens_test_embedding' in str(name):

                with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'), 'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(content.get_field("Title")[0], EmbeddingField)
                    self.assertIsInstance(content.get_field("Title")[0].value, np.ndarray)
                    break
Example #8
    def test_create_content_tfidf(self):
        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id='imdbID',
            output_directory="movielens_test_tfidf",
        )

        movies_ca_config.add_multiple_config(
            field_name='Title', config_list=[FieldConfig(SkLearnTfIdf())])

        content_analyzer = ContentAnalyzer(movies_ca_config)
        content_analyzer.fit()

        for name in os.listdir(THIS_DIR):
            if os.path.isdir(os.path.join(THIS_DIR, name)) \
                    and 'movielens_test_tfidf' in str(name):

                with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'),
                               'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(
                        content.get_field("Title")[0], FeaturesBagField)
                    self.assertIsInstance(
                        content.get_field("Title")[0].value, dict)
                    break
Example #9
    def test_create_content(self):
        filepath = '../../datasets/movies_info_reduced.json'
        try:
            with open(filepath):
                pass
        except FileNotFoundError:
            filepath = 'datasets/movies_info_reduced.json'

        entity_linking_pipeline = FieldRepresentationPipeline(
            BabelPyEntityLinking())
        plot_config = FieldConfig(None)
        plot_config.append_pipeline(entity_linking_pipeline)
        content_analyzer_config = ContentAnalyzerConfig(
            'ITEM', JSONFile(filepath), ["imdbID"], "movielens_test")
        content_analyzer_config.append_field_config("Plot", plot_config)
        content_analyzer = ContentAnalyzer(content_analyzer_config)
        content_analyzer.fit()
Example #10
    def test_create_contents_in_index(self):
        output_dir = os.path.join(THIS_DIR, "movielens_test_original_index")
        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(movies_info_reduced),
            id=['imdbID'],
            output_directory=output_dir,
        )

        movies_ca_config.add_multiple_config(
            field_name='Title',
            config_list=[
                FieldConfig(OriginalData(),
                            NLTK(lemmatization=True, stopwords_removal=True),
                            SearchIndex(os.path.join(output_dir, "index")),
                            "test_search"),
                FieldConfig(SkLearnTfIdf(), NLTK(),
                            KeywordIndex(os.path.join(output_dir, "index1")),
                            "test_keyword"),
                FieldConfig(OriginalData(), NLTK(),
                            SearchIndex(os.path.join(output_dir, "index")))
            ])

        content_analyzer = ContentAnalyzer(movies_ca_config)
        content_analyzer.fit()

        for name in os.listdir(THIS_DIR):
            if os.path.isdir(os.path.join(THIS_DIR, name)) \
                    and 'movielens_test_original_index' in str(name):

                with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'),
                               'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(
                        content.get_field("Title")[0], IndexField)
                    self.assertIsInstance(
                        content.get_field("Title")[0].value, str)
                    self.assertIsInstance(
                        content.get_field("Title")[1], IndexField)
                    self.assertIsInstance(
                        content.get_field("Title")[1].value, str)
                    break
Example #11
    def test_create_content_search_index(self):
        movies_ca_config = ContentAnalyzerConfig(
            content_type='Item',
            source=JSONFile(file_path),
            id_field_name_list=['imdbID'],
            output_directory='movielens_test')

        movies_ca_config.append_field_config(
            field_name='Title',
            field_config=FieldConfig(pipelines_list=[
                FieldRepresentationPipeline(content_technique=SearchIndexing())
            ]))

        content_analyzer = ContentAnalyzer(movies_ca_config)
        content_analyzer.fit()
Example #12
    def test_create_content(self):
        plot_config = FieldConfig(BabelPyEntityLinking())
        exogenous_config = ExogenousConfig(DBPediaMappingTechnique('Film', 'EN', 'Title'))
        content_analyzer_config = ItemAnalyzerConfig(JSONFile(movies_info_reduced), ["imdbID"], "movielens_test")
        content_analyzer_config.add_single_config("Title", plot_config)
        content_analyzer_config.add_single_exogenous(exogenous_config)
        content_analyzer = ContentAnalyzer(content_analyzer_config)
        content_analyzer.fit()

        for name in os.listdir(THIS_DIR):
            if os.path.isdir(os.path.join(THIS_DIR, name)) \
                    and 'movielens_test' in str(name):

                with lzma.open(os.path.join(THIS_DIR, name, 'tt0113497.xz'), 'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(content.get_field("Title")[0], FeaturesBagField)
                    self.assertIsInstance(content.get_field("Title")[0].value, dict)
                    break
Example #13
    def test_decode_field_data_embedding(self):
        movies_ca_config = ItemAnalyzerConfig(
            source=JSONFile(decode_embedding),
            id=['imdbID'],
            output_directory=decode_path + 'movies_embedding_'
        )

        movies_ca_config.add_multiple_config(
            field_name='Title',
            config_list=[FieldConfig()]
        )
        ContentAnalyzer(config=movies_ca_config).fit()

        for name in os.listdir(decode_path):
            if os.path.isdir(os.path.join(decode_path, name)) \
                    and 'movies_embedding_' in str(name):
                with lzma.open(os.path.join(decode_path, name, 'tt0113497.xz'), 'r') as file:
                    content = pickle.load(file)

                    self.assertIsInstance(content.get_field("Title")[0], EmbeddingField)
                    self.assertIsInstance(content.get_field("Title")[0].value, np.ndarray)
                    break
Example #14
    def test_create_content_embedding(self):
        movies_ca_config = ContentAnalyzerConfig(
            content_type='Item',
            source=JSONFile(file_path),
            id_field_name_list=['imdbID'],
            output_directory="movielens_test",
        )

        movies_ca_config.append_field_config(
            field_name='Title',
            field_config=FieldConfig(pipelines_list=[
                FieldRepresentationPipeline(
                    preprocessor_list=[
                        NLTK(lemmatization=True, stopwords_removal=True)
                    ],
                    content_technique=EmbeddingTechnique(
                        combining_technique=Centroid(),
                        embedding_source=GensimDownloader(
                            name='glove-twitter-25'),
                        granularity='doc'))
            ]))

        content_analyzer = ContentAnalyzer(movies_ca_config)
        content_analyzer.fit()
Example #15
users_filename = '../../../datasets/examples/users_70.dat'

users_output_dir = '../../../contents/examples/ex_2/users_'

movies_ca_config = ContentAnalyzerConfig(
    content_type='Item',
    source=JSONFile(movies_filename),
    id_field_name_list=['imdbID'],
    output_directory=movies_output_dir,
)

movies_ca_config.append_field_config(
    field_name='Title',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf())
    ]))

movies_ca_config.append_field_config(
    field_name='Year',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf())
    ]))

movies_ca_config.append_field_config(
    field_name='Genre',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(
            preprocessor_list=[
                NLTK(lemmatization=True, stopwords_removal=True)
            ],
            content_technique=LuceneTfIdf())
    ]))
Example #16
users_filename = '../../../datasets/examples/users_70.dat'

users_output_dir = '../../../contents/examples/ex_1/users_'

movies_ca_config = ContentAnalyzerConfig(
    content_type='Item',
    source=JSONFile(movies_filename),
    id_field_name_list=['imdbID'],
    output_directory=movies_output_dir,
)

movies_ca_config.append_field_config(
    field_name='Title',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf())
    ]))

movies_ca_config.append_field_config(
    field_name='Year',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(content_technique=LuceneTfIdf())
    ]))

movies_ca_config.append_field_config(
    field_name='Genre',
    field_config=FieldConfig(pipelines_list=[
        FieldRepresentationPipeline(
            preprocessor_list=[
                NLTK(lemmatization=True, stopwords_removal=True)
            ],
            content_technique=LuceneTfIdf())
    ]))
Example #17
    source = JSONFile(users_filename),
    id = '0',
    output_dir = users_output_dir,
)

ItemAnalyzerConfig(
    source = JSONFile(items_filename),
    id = 'imdbID',
    output_dir = items_output_dir,
)


movies_ca_config.append_field_config(
    field_name='Title',
    field_config=FieldConfig(
        pipelines_list=[FieldRepresentationPipeline(
            content_technique=LuceneTfIdf())]
    )
)


movies_ca_config.append_field_config(
    field_name='Year',
    field_config=FieldConfig(
        pipelines_list=[FieldRepresentationPipeline(
            content_technique=LuceneTfIdf())]
    )
)


movies_ca_config.append_field_config(
    field_name='Genre',