def test_extract_corpus(self):
    preprocessor = NLTK(stopwords_removal=True, stemming=True)
    fields = ["Title", "Released"]
    expected = [['jumanji', '15', 'dec', '1995'],
                ['grumpier', 'old', 'men', '22', 'dec', '1995'],
                ['toy', 'stori', '22', 'nov', '1995'],
                ['father', 'bride', 'part', 'ii', '08', 'dec', '1995'],
                ['heat', '15', 'dec', '1995'],
                ['tom', 'huck', '22', 'dec', '1995'],
                ['wait', 'exhal', '22', 'dec', '1995'],
                ['sabrina', '15', 'dec', '1995'],
                ['dracula', ':', 'dead', 'love', '22', 'dec', '1995'],
                ['nixon', '05', 'jan', '1996'],
                ['american', 'presid', '17', 'nov', '1995'],
                ['goldeney', '17', 'nov', '1995'],
                ['balto', '22', 'dec', '1995'],
                ['cutthroat', 'island', '22', 'dec', '1995'],
                ['casino', '22', 'nov', '1995'],
                ['sudden', 'death', '22', 'dec', '1995'],
                ['sens', 'sensibl', '26', 'jan', '1996'],
                ['four', 'room', '25', 'dec', '1995'],
                ['money', 'train', '22', 'nov', '1995'],
                ['ace', 'ventura', ':', 'natur', 'call', '10', 'nov', '1995']]
    file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json")
    src = JSONFile(file_path)
    learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
    generated = learner.extract_corpus()
    self.assertEqual(generated, expected)
def test_fit(self):
    path = os.path.join(THIS_DIR, "../../../datasets/d2v_test_data.json")
    doc2vec = GensimDoc2Vec(source=JSONFile(file_path=path), preprocessor=NLTK(), field_list=["doc_field"])
    doc2vec.fit()
    self.assertIsInstance(doc2vec.model, gensim.models.doc2vec.Doc2Vec)
def test_fit(self): file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json") random_indexing = GensimRandomIndexing(JSONFile(file_path), NLTK(), ['Genre', 'Plot']) random_indexing.fit() self.assertIsInstance(random_indexing.model, gensim.models.rpmodel.RpModel)
def test_fit(self): file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json") preprocessor = NLTK(stopwords_removal=True) fields = ["Plot"] src = JSONFile(file_path) learner = GensimLatentSemanticAnalysis(src, preprocessor, fields) learner.fit() self.assertIsInstance(learner.model, gensim.models.lsimodel.LsiModel)
def test_fit(self): file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json") field_list = ['Title', 'Year', 'Genre'] word2vec = GensimWord2Vec(source=JSONFile(file_path), preprocessor=NLTK(), field_list=field_list) word2vec.fit() self.assertIsInstance(word2vec.model, gensim.models.word2vec.Word2Vec)
def test_fit(self):
    field_list = ['Title', 'Year', 'Genre']
    file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json")
    fast_text = GensimFastText(source=JSONFile(file_path), preprocessor=NLTK(), field_list=field_list)
    fast_text.fit()
    self.assertIsInstance(fast_text.model, gensim.models.fasttext.FastText)
def test_fit(self): model_path = os.path.join(THIS_DIR, "/model_test_Lsa") learner = GensimLatentSemanticAnalysis(model_path, True) learner.fit(source=JSONFile(file_path), field_list=["Plot", "Genre"], preprocessor_list=[NLTK()]) model_path += ".model" self.assertEqual(learner.get_embedding("ace").any(), True) self.assertEqual(pl.Path(model_path).resolve().is_file(), True)
def __init__(self, source: RawInformationSource, preprocessor: TextProcessor, field_list: List[str]):
    self.__source: RawInformationSource = source
    # fall back to a default NLTK preprocessor when none is given
    if preprocessor is None:
        self.__preprocessor: TextProcessor = NLTK()
    else:
        self.__preprocessor: TextProcessor = preprocessor
    self.__preprocessor.set_lang("")
    self.__field_list = field_list
    self.__model = None
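# Hedged usage sketch (not part of the original tests): given the constructor above,
# passing preprocessor=None should fall back to the default NLTK() processor.
# The class, dataset path, and model check are assumed from the surrounding tests.
def test_default_preprocessor_sketch(self):
    file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json")
    learner = GensimWord2Vec(JSONFile(file_path), None, ["Plot"])
    learner.fit()
    self.assertIsInstance(learner.model, gensim.models.word2vec.Word2Vec)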
def test_fit(self):
    file_path = '../../../datasets/movies_info_reduced.json'
    try:
        with open(file_path):
            pass
    except FileNotFoundError:
        file_path = 'datasets/movies_info_reduced.json'
    GensimRandomIndexing(JSONFile(file_path), NLTK(), ['Genre', 'Plot']).fit()
def test_produce_content(self):
    file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json")
    technique = SkLearnTfIdf()
    technique.field_need_refactor = "Plot"
    technique.pipeline_need_refactor = str(1)
    technique.processor_list = [NLTK()]
    technique.dataset_refactor(JSONFile(file_path), ["imdbID"])
    features_bag_test = technique.produce_content("test", "tt0113497", "Plot")
    features = features_bag_test.value
    self.assertLess(features['the'], 0.15)
def test_produce_content_string(self):
    source = [{"field": "50"}]
    with open(self.file_name, 'w') as f:
        json.dump(source, f)
    # If preprocessors are specified, the framework should import the field value as a str
    result = self.technique.produce_content("field", [NLTK(stopwords_removal=True)], JSONFile(self.file_name))
    self.assertIsInstance(result[0], SimpleField)
    self.assertIsInstance(result[0].value, str)

    source = [{"field": 50}]
    with open(self.file_name, 'w') as f:
        json.dump(source, f)
    # Even when the raw value is numeric, specifying a preprocessor should make the framework import it as a str
    result = self.technique.produce_content("field", [NLTK(stopwords_removal=True)], JSONFile(self.file_name))
    self.assertIsInstance(result[0], SimpleField)
    self.assertIsInstance(result[0].value, str)
def test_save(self): preprocessor = NLTK(stopwords_removal=True) fields = ["Plot"] file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json") src = JSONFile(file_path) learner = GensimWord2Vec(src, preprocessor, fields) learner.fit() learner.save() self.assertIsInstance(learner.model, gensim.models.word2vec.Word2Vec) """
def test_fit(self): try: path = "datasets/d2v_test_data.json" with open(path): pass except FileNotFoundError: path = "../../../datasets/d2v_test_data.json" GensimDoc2Vec(source=JSONFile(file_path=path), preprocessor=NLTK(), field_list=["doc_field"]).fit()
def test_fit(self):
    field_list = ['Title', 'Year', 'Genre']
    file_path = '../../../datasets/movies_info_reduced.json'
    try:
        with open(file_path):
            pass
    except FileNotFoundError:
        file_path = 'datasets/movies_info_reduced.json'
    GensimFastText(source=JSONFile(file_path), preprocessor=NLTK(), field_list=field_list).fit()
def test_save(self):
    self.skipTest("_")
    preprocessor = NLTK(stopwords_removal=True)
    fields = ["Plot"]
    try:
        src = JSONFile("datasets/movies_info_reduced.json")
        learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
        learner.fit()
    except FileNotFoundError:
        src = JSONFile("../../../datasets/movies_info_reduced.json")
        learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
        learner.fit()
    learner.save()
def test_fit(self):
    preprocessor = NLTK(stopwords_removal=True)
    fields = ["Plot"]
    file_path = "datasets/movies_info_reduced.json"
    try:
        with open(file_path):
            pass
    except FileNotFoundError:
        file_path = "../../../datasets/movies_info_reduced.json"
    src = JSONFile(file_path)
    learner = GensimLatentSemanticAnalysis(src, preprocessor, fields)
    learner.fit()
def test_produce_content(self):
    file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json")
    try:
        technique = WhooshTfIdf()
        technique.field_need_refactor = "Plot"
        technique.pipeline_need_refactor = str(1)
        technique.processor_list = [NLTK()]
        technique.dataset_refactor(JSONFile(file_path), ["imdbID"])
        features_bag_test = technique.produce_content("test", "tt0113497", "Plot")
        features = features_bag_test.value
        self.assertEqual(features['years'], 0.6989700043360189)
    except AttributeError:
        self.fail("Couldn't load feature bag!")
def test_save(self): preprocessor = NLTK(stopwords_removal=True) fields = ["Plot"] file_path = os.path.join(THIS_DIR, "../../../datasets/movies_info_reduced.json") src = JSONFile(file_path) learner = GensimLatentSemanticAnalysis(src, preprocessor, fields) learner.fit() learner.save() """ path = os.path.join(THIS_DIR, "*.model") x = sorted(glob.glob(path))[-1] dynamic_path = pl.Path(x) self.assertEqual((str(dynamic_path), dynamic_path.is_file()), (str(dynamic_path), True)) """ self.assertIsInstance(learner.model, gensim.models.lsimodel.LsiModel)
def test_produce_content(self):
    file_path = '../../../datasets/movies_info_reduced.json'
    try:
        with open(file_path):
            pass
    except FileNotFoundError:
        file_path = 'datasets/movies_info_reduced.json'
    technique = LuceneTfIdf()
    technique.set_field_need_refactor("Plot")
    technique.set_pipeline_need_refactor(str(1))
    technique.set_processor_list([NLTK()])
    technique.dataset_refactor(JSONFile(file_path), ["imdbID"])
    features_bag_test = technique.produce_content("test", "tt0113497", "Plot")
    features = features_bag_test.get_value()
    self.assertEqual(features['years'], 0.6989700043360189)
def test_fasttext(self):
    # note: the fastText model is trained here instead of being loaded, because the saved model file would be too heavy to include
    source = GensimFastText("./test_source_fasttext", auto_save=False, min_count=1)
    source.fit(source=JSONFile(file_path), field_list=["Plot"], preprocessor_list=[NLTK()])
    vector_size = 100
    result = source.load(["first", "remote"])
    self.assertEqual(len(result), 2)
    self.assertEqual(len(result[0]), vector_size)
    self.assertEqual(result[1].any(), True)
    self.assertWordEmbeddingMatches(source, result[0], "first")
    self.assertWordEmbeddingMatches(source, result[1], "remote")
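# Hedged sketch (not part of the original tests) of the custom assertion used above.
# It assumes the learner exposes get_embedding(word), as in the LSA test earlier;
# the real helper in the test suite may be implemented differently.
def assertWordEmbeddingMatches(self, source, embedding, word):
    import numpy as np
    # the loaded vector for `word` should match the embedding the learner produces for it
    np.testing.assert_array_almost_equal(embedding, source.get_embedding(word))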
def test_produce_content(self):
    file_path = '../../../datasets/movies_info_reduced.json'
    try:
        with open(file_path):
            pass
    except FileNotFoundError:
        file_path = 'datasets/movies_info_reduced.json'
    technique = SkLearnTfIdf()
    technique.field_need_refactor = "Plot"
    technique.pipeline_need_refactor = str(1)
    technique.processor_list = [NLTK()]
    technique.dataset_refactor(JSONFile(file_path), ["imdbID"])
    features_bag_test = technique.produce_content("test", "tt0113497", "Plot")
    features = features_bag_test.value
    self.assertLess(features['the'], 0.15)
def test_process(self):
    # Test for stop words removal only
    nltka = NLTK(stopwords_removal=True, url_tagging=True)
    nltka.set_lang("")
    self.assertEqual(
        nltka.process("The striped bats are hanging on their feet for the best"),
        ["striped", "bats", "hanging", "feet", "best"])

    # Test for stemming only
    nltka.set_stemming(True)
    nltka.set_stopwords_removal(False)
    self.assertEqual(
        nltka.process("My name is Francesco and I am a student at the University of the city of Bari"),
        ["my", "name", "is", "francesco", "and", "i", "am", "a", "student",
         "at", "the", "univers", "of", "the", "citi", "of", "bari"])
    nltka.set_stemming(False)

    # Test for lemmatization only
    nltka.set_lemmatization(True)
    self.assertEqual(
        nltka.process("The striped bats are hanging on their feet for best"),
        ["The", "strip", "bat", "be", "hang", "on", "their", "foot", "for", "best"])

    # Test for lemmatization with multiple whitespaces removal
    # (extra spaces in the input are assumed; the flattened original collapsed them)
    nltka.set_strip_multiple_whitespaces(True)
    self.assertEqual(
        nltka.process("The   striped  bats are    hanging on their   feet for best"),
        ["The", "strip", "bat", "be", "hang", "on", "their", "foot", "for", "best"])

    # Test for lemmatization with multiple whitespaces removal and URL tagging
    nltka.set_url_tagging(True)
    self.assertEqual(
        nltka.process("The striped http://facebook.com bats https://github.com are http://facebook.com hanging on their feet for best http://twitter.it"),
        ["The", "strip", "<URL>", "bat", "<URL>", "be", "<URL>", "hang", "on",
         "their", "foot", "for", "best", "<URL>"])

    # Test for lemmatization, multiple whitespaces removal, URL tagging and stemming
    nltka.set_stemming(True)
    self.assertEqual(
        nltka.process("The striped http://facebook.com bats https://github.com are http://facebook.com hanging on their feet for best http://twitter.it"),
        ["the", "strip", "<url>", "bat", "<url>", "be", "<url>", "hang", "on",
         "their", "foot", "for", "best", "<url>"])

    # Test for lemmatization, multiple whitespaces removal, URL tagging, stemming and stop words removal
    nltka.set_stopwords_removal(True)
    self.assertEqual(
        nltka.process("The striped http://facebook.com bats https://github.com are http://facebook.com hanging on their feet for best http://twitter.it"),
        ["strip", "<url>", "bat", "<url>", "<url>", "hang", "foot", "best", "<url>"])

    # Test for named entity recognition
    nltka.set_named_entity_recognition(True)
    nltka.set_stopwords_removal(False)
    nltka.set_stemming(False)
    nltka.set_lemmatization(False)
    result = nltka.process(
        "Facebook was fined by Hewlett Packard for spending 100€ to buy Cristiano Ronaldo from Juventus")
    self.assertEqual(
        result,
        Tree('S', [
            Tree('PERSON', [('Facebook', 'NNP')]),
            ('was', 'VBD'),
            ('fined', 'VBN'),
            ('by', 'IN'),
            Tree('PERSON', [('Hewlett', 'NNP'), ('Packard', 'NNP')]),
            ('for', 'IN'),
            ('spending', 'VBG'),
            ('100€', 'CD'),
            ('to', 'TO'),
            ('buy', 'VB'),
            Tree('PERSON', [('Cristiano', 'NNP'), ('Ronaldo', 'NNP')]),
            ('from', 'IN'),
            Tree('GPE', [('Juventus', 'NNP')])
        ]))