Example #1
    def test_extract_corpus(self):
        preprocessor = NLTK(stopwords_removal=True, stemming=True)
        fields = ["Title", "Released"]
        expected = [['jumanji', '15', 'dec', '1995'],
                    ['grumpier', 'old', 'men', '22', 'dec', '1995'],
                    ['toy', 'stori', '22', 'nov', '1995'],
                    ['father', 'bride', 'part', 'ii', '08', 'dec', '1995'],
                    ['heat', '15', 'dec', '1995'],
                    ['tom', 'huck', '22', 'dec', '1995'],
                    ['wait', 'exhal', '22', 'dec', '1995'],
                    ['sabrina', '15', 'dec', '1995'],
                    ['dracula', ':', 'dead', 'love', '22', 'dec', '1995'],
                    ['nixon', '05', 'jan', '1996'],
                    ['american', 'presid', '17', 'nov', '1995'],
                    ['goldeney', '17', 'nov', '1995'],
                    ['balto', '22', 'dec', '1995'],
                    ['cutthroat', 'island', '22', 'dec', '1995'],
                    ['casino', '22', 'nov', '1995'],
                    ['sudden', 'death', '22', 'dec', '1995'],
                    ['sens', 'sensibl', '26', 'jan', '1996'],
                    ['four', 'room', '25', 'dec', '1995'],
                    ['money', 'train', '22', 'nov', '1995'],
                    [
                        'ace', 'ventura', ':', 'natur', 'call', '10', 'nov',
                        '1995'
                    ]]
        file_path = os.path.join(THIS_DIR,
                                 "../../../datasets/movies_info_reduced.json")
        src = JSONFile(file_path)
        learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
        generated = learner.extract_corpus()

        self.assertEqual(generated, expected)
Example #2
 def test_fit(self):
     path = os.path.join(THIS_DIR, "../../../datasets/d2v_test_data.json")
     doc2vec = GensimDoc2Vec(source=JSONFile(file_path=path),
                             preprocessor=NLTK(),
                             field_list=["doc_field"])
     doc2vec.fit()
     self.assertIsInstance(doc2vec.model, gensim.models.doc2vec.Doc2Vec)
Example #3
 def test_fit(self):
     file_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     random_indexing = GensimRandomIndexing(JSONFile(file_path), NLTK(),
                                            ['Genre', 'Plot'])
     random_indexing.fit()
     self.assertIsInstance(random_indexing.model,
                           gensim.models.rpmodel.RpModel)
Example #4
 def test_fit(self):
     file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json")
     preprocessor = NLTK(stopwords_removal=True)
     fields = ["Plot"]
     src = JSONFile(file_path)
     learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
     learner.fit()
     self.assertIsInstance(learner.model, gensim.models.lsimodel.LsiModel)
Example #5
 def test_fit(self):
     file_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     field_list = ['Title', 'Year', 'Genre']
     word2vec = GensimWord2Vec(source=JSONFile(file_path),
                               preprocessor=NLTK(),
                               field_list=field_list)
     word2vec.fit()
     self.assertIsInstance(word2vec.model, gensim.models.word2vec.Word2Vec)
Example #6
 def test_fit(self):
     field_list = ['Title', 'Year', 'Genre']
     file_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     fast_text = GensimFastText(source=JSONFile(file_path),
                                preprocessor=NLTK(),
                                field_list=field_list)
     fast_text.fit()
     self.assertIsInstance(fast_text.model, gensim.models.fasttext.FastText)
Example #7
    def test_fit(self):
        model_path = os.path.join(THIS_DIR, "model_test_Lsa")
        learner = GensimLatentSemanticAnalysis(model_path, True)
        learner.fit(source=JSONFile(file_path),
                    field_list=["Plot", "Genre"],
                    preprocessor_list=[NLTK()])
        model_path += ".model"

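        # the learner should produce a non-zero embedding and persist the model file to disk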
        self.assertEqual(learner.get_embedding("ace").any(), True)
        self.assertEqual(pl.Path(model_path).resolve().is_file(), True)
Example #8
 def __init__(self, source: RawInformationSource,
              preprocessor: TextProcessor, field_list: List[str]):
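     """
     Keep a reference to the raw data source, the text preprocessor (a default
     NLTK() instance is used when None is passed) and the list of fields to
     learn from; the model is left as None until fit() builds it.
     """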
     self.__source: RawInformationSource = source
     if preprocessor is None:
         self.__preprocessor: TextProcessor = NLTK()
     else:
         self.__preprocessor: TextProcessor = preprocessor
     self.__preprocessor.set_lang("")
     self.__field_list = field_list
     self.__model = None
Example #9
    def test_fit(self):
        file_path = '../../../datasets/movies_info_reduced.json'
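        # resolve the dataset path whether the test is run from the test directory or from the repository root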
        try:
            with open(file_path):
                pass
        except FileNotFoundError:
            file_path = 'datasets/movies_info_reduced.json'

        GensimRandomIndexing(JSONFile(file_path), NLTK(),
                             ['Genre', 'Plot']).fit()
Example #10
    def test_produce_content(self):
        technique = SkLearnTfIdf()
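        # configure the field to refactor, the pipeline and the preprocessors before building the TF-IDF representation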
        technique.field_need_refactor = "Plot"
        technique.pipeline_need_refactor = str(1)
        technique.processor_list = [NLTK()]
        technique.dataset_refactor(JSONFile(file_path), ["imdbID"])
        features_bag_test = technique.produce_content("test", "tt0113497",
                                                      "Plot")
        features = features_bag_test.value

        self.assertLess(features['the'], 0.15)
Example #11
    def test_produce_content_string(self):
        source = [{"field": "50"}]
        with open(self.file_name, 'w') as f:
            json.dump(source, f)

        # If preprocessors are specified, the framework should import the field value as a str
        result = self.technique.produce_content("field", [NLTK(stopwords_removal=True)], JSONFile(self.file_name))

        self.assertIsInstance(result[0], SimpleField)
        self.assertIsInstance(result[0].value, str)

        source = [{"field": 50}]
        with open(self.file_name, 'w') as f:
            json.dump(source, f)

        # Even when the raw value is an int, specifying preprocessors forces the field to be imported as a str
        result = self.technique.produce_content("field", [NLTK(stopwords_removal=True)], JSONFile(self.file_name))

        self.assertIsInstance(result[0], SimpleField)
        self.assertIsInstance(result[0].value, str)
Example #12
 def test_save(self):
     preprocessor = NLTK(stopwords_removal=True)
     fields = ["Plot"]
     file_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     src = JSONFile(file_path)
     learner = GensimWord2Vec(src, preprocessor, fields)
     learner.fit()
     learner.save()
     self.assertIsInstance(learner.model, gensim.models.word2vec.Word2Vec)
     """
Example #13
    def test_fit(self):

        try:
            path = "datasets/d2v_test_data.json"
            with open(path):
                pass
        except FileNotFoundError:
            path = "../../../datasets/d2v_test_data.json"

        GensimDoc2Vec(source=JSONFile(file_path=path),
                      preprocessor=NLTK(),
                      field_list=["doc_field"]).fit()
Example #14
    def test_fit(self):
        field_list = ['Title', 'Year', 'Genre']

        file_path = '../../../datasets/movies_info_reduced.json'
        try:
            with open(file_path):
                pass
        except FileNotFoundError:
            file_path = 'datasets/movies_info_reduced.json'

        GensimFastText(source=JSONFile(file_path),
                       preprocessor=NLTK(),
                       field_list=field_list).fit()
Example #15
 def test_save(self):
     self.skipTest("_")
     preprocessor = NLTK(stopwords_removal=True)
     fields = ["Plot"]
     try:
         src = JSONFile("datasets/movies_info_reduced.json")
         learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
         learner.fit()
     except FileNotFoundError:
         src = JSONFile("../../../datasets/movies_info_reduced.json")
         learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
         learner.fit()
     learner.save()
Example #16
    def test_fit(self):
        preprocessor = NLTK(stopwords_removal=True)
        fields = ["Plot"]

        file_path = "datasets/movies_info_reduced.json"
        try:
            with open(file_path):
                pass
        except FileNotFoundError:
            file_path = "../../../datasets/movies_info_reduced.json"

        src = JSONFile(file_path)
        learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
        learner.fit()
Example #17
    def test_produce_content(self):
        try:
            technique = WhooshTfIdf()
            technique.field_need_refactor = "Plot"
            technique.pipeline_need_refactor = str(1)
            technique.processor_list = [NLTK()]
            technique.dataset_refactor(JSONFile(file_path), ["imdbID"])
            features_bag_test = technique.produce_content(
                "test", "tt0113497", "Plot")
            features = features_bag_test.value

            self.assertEqual(features['years'], 0.6989700043360189)
        except AttributeError:
            self.fail("Couldn't load feature bag!")
Example #18
 def test_save(self):
     preprocessor = NLTK(stopwords_removal=True)
     fields = ["Plot"]
     file_path = os.path.join(THIS_DIR,
                              "../../../datasets/movies_info_reduced.json")
     src = JSONFile(file_path)
     learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
     learner.fit()
     learner.save()
     """
     path = os.path.join(THIS_DIR, "*.model")
     x = sorted(glob.glob(path))[-1]
     dynamic_path = pl.Path(x)
     self.assertEqual((str(dynamic_path), dynamic_path.is_file()), (str(dynamic_path), True))
     """
     self.assertIsInstance(learner.model, gensim.models.lsimodel.LsiModel)
Example #19
    def test_produce_content(self):
        file_path = '../../../datasets/movies_info_reduced.json'
        try:
            with open(file_path):
                pass
        except FileNotFoundError:
            file_path = 'datasets/movies_info_reduced.json'

        technique = LuceneTfIdf()
        technique.set_field_need_refactor("Plot")
        technique.set_pipeline_need_refactor(str(1))
        technique.set_processor_list([NLTK()])
        technique.dataset_refactor(JSONFile(file_path), ["imdbID"])
        features_bag_test = technique.produce_content("test", "tt0113497", "Plot")
        features = features_bag_test.get_value()

        self.assertEqual(features['years'], 0.6989700043360189)
Example #20
    def test_fasttext(self):
        # note: the model is trained on the fly because a saved pre-trained model file would be too heavy to ship with the tests
        source = GensimFastText("./test_source_fasttext",
                                auto_save=False,
                                min_count=1)
        source.fit(source=JSONFile(file_path),
                   field_list=["Plot"],
                   preprocessor_list=[NLTK()])
        vector_size = 100
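        # load() is expected to return one embedding of this dimensionality per requested word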
        result = source.load(["first", "remote"])

        self.assertEqual(len(result), 2)
        self.assertEqual(len(result[0]), vector_size)
        self.assertEqual(result[1].any(), True)

        self.assertWordEmbeddingMatches(source, result[0], "first")
        self.assertWordEmbeddingMatches(source, result[1], "remote")
Example #21
    def test_produce_content(self):
        file_path = '../../../datasets/movies_info_reduced.json'
        try:
            with open(file_path):
                pass
        except FileNotFoundError:
            file_path = 'datasets/movies_info_reduced.json'

        technique = SkLearnTfIdf()
        technique.field_need_refactor = "Plot"
        technique.pipeline_need_refactor = str(1)
        technique.processor_list = [NLTK()]
        technique.dataset_refactor(JSONFile(file_path), ["imdbID"])
        features_bag_test = technique.produce_content("test", "tt0113497",
                                                      "Plot")
        features = features_bag_test.value

        self.assertLess(features['the'], 0.15)
Example #22
    def test_process(self):
        # Test for stop words removal only (URL tagging is enabled but the input contains no URLs)
        nltka = NLTK(stopwords_removal=True, url_tagging=True)
        nltka.set_lang("")
        self.assertEqual(
            nltka.process(
                "The striped bats are hanging on their feet for the best"),
            ["striped", "bats", "hanging", "feet", "best"])

        # Test for only stemming
        nltka.set_stemming(True)
        nltka.set_stopwords_removal(False)
        self.assertEqual(
            nltka.process(
                "My name is Francesco and I am a student at the University of the city of Bari"
            ), [
                "my", "name", "is", "francesco", "and", "i", "am", "a",
                "student", "at", "the", "univers", "of", "the", "citi", "of",
                "bari"
            ])
        nltka.set_stemming(False)

        # Test for only lemmatization
        nltka.set_lemmatization(True)
        self.assertEqual(
            nltka.process(
                "The striped bats are hanging on their feet for best"), [
                    "The", "strip", "bat", "be", "hang", "on", "their", "foot",
                    "for", "best"
                ])

        # Test for lemmatization with multiple whitespaces removal
        nltka.set_strip_multiple_whitespaces(True)
        self.assertEqual(
            nltka.process(
                "The   striped  bats    are    hanging   on   their    feet   for  best"
            ), [
                "The", "strip", "bat", "be", "hang", "on", "their", "foot",
                "for", "best"
            ])

        # Test for lemmatization with multiple whitespaces removal and URL tagging
        nltka.set_url_tagging(True)
        self.assertEqual(
            nltka.process(
                "The   striped http://facebook.com bats https://github.com   are   http://facebook.com hanging   on   their    feet   for  best  http://twitter.it"
            ), [
                "The", "strip", "<URL>", "bat", "<URL>", "be", "<URL>", "hang",
                "on", "their", "foot", "for", "best", "<URL>"
            ])

        # Test for lemmatization, multiple whitespaces removal, URL tagging and stemming
        nltka.set_stemming(True)
        self.assertEqual(
            nltka.process(
                "The   striped http://facebook.com bats https://github.com   are   http://facebook.com hanging   on   their    feet   for  best  http://twitter.it"
            ), [
                "the", "strip", "<url>", "bat", "<url>", "be", "<url>", "hang",
                "on", "their", "foot", "for", "best", "<url>"
            ])

        # Test for lemmatization, multiple whitespaces removal, URL tagging, stemming, stop words removal
        nltka.set_stopwords_removal(True)
        self.assertEqual(
            nltka.process(
                "The   striped http://facebook.com bats https://github.com   are   http://facebook.com hanging   on   their    feet   for  best  http://twitter.it"
            ), [
                "strip", "<url>", "bat", "<url>", "<url>", "hang", "foot",
                "best", "<url>"
            ])

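        # Test for named entity recognition: the result is an nltk Tree rather than a flat token list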
        nltka.set_named_entity_recognition(True)
        nltka.set_stopwords_removal(False)
        nltka.set_stemming(False)
        nltka.set_lemmatization(False)
        result = nltka.process(
            "Facebook was fined by Hewlett Packard for spending 100€ to buy Cristiano Ronaldo from Juventus"
        )

        self.assertEqual(
            result,
            Tree('S', [
                Tree('PERSON', [('Facebook', 'NNP')]), ('was', 'VBD'),
                ('fined', 'VBN'), ('by', 'IN'),
                Tree('PERSON', [('Hewlett', 'NNP'), ('Packard', 'NNP')]),
                ('for', 'IN'), ('spending', 'VBG'), ('100€', 'CD'),
                ('to', 'TO'), ('buy', 'VB'),
                Tree('PERSON', [('Cristiano', 'NNP'), ('Ronaldo', 'NNP')]),
                ('from', 'IN'),
                Tree('GPE', [('Juventus', 'NNP')])
            ]))