def setUpClass(cls):
    """
    Create the fixtures shared by all tests of this class.

    Stores a FileHandler and two CourseData objects on the class: one
    describing a plain text file and one describing an html link. Both use
    the same id, position and content.
    """
    # values both fixtures have in common
    sharedFields = dict(id=1, position=1, dataContent="Hallo")

    cls.fileHandler = FileHandler()
    cls.courseDataText = CourseData(name="test.txt", dataType="txt", **sharedFields)
    cls.courseDataHtml = CourseData(name="www.test.de", dataType="html", **sharedFields)
def get_data_link(self, data, count_data, moodleCourse, courseLecturer, response, moodleHeader):
    """
    Extract the download link of one course file and wrap it in a CourseData.

    The link is looked up in two attributes of the entry's anchor element:
    "onclick" is preferred, "href" is the fallback. A missing "href" is
    replaced by the placeholder "Hidden Data", so the error below is only
    raised when "onclick" yields nothing and "href" is an empty string.

    :param data: selector of the file entry, queried with 'a/@onclick' and 'a/@href'
    :param count_data: position of the previous file; it is incremented by one
    :param moodleCourse: course the file belongs to (its name is used in the error message)
    :param courseLecturer: list of lecturers (the first one's name is used in the error message)
    :param response: response of the crawled page (its url is used in the error message)
    :param moodleHeader: header of the moodle section, passed on to the CourseData
    :return: tuple of the new CourseData and the incremented count_data
    :raises Exception: if no usable link could be found
    """
    onclick_link = data.xpath('a/@onclick').extract_first()
    href_link = data.xpath('a/@href').extract_first()
    if onclick_link is None:
        onclick_link = ""
    if href_link is None:
        href_link = "Hidden Data"

    # strip the JavaScript call around the url, e.g. window.open('...'); return false;
    for js_fragment in ("window.open(", "); return false;", "'"):
        onclick_link = onclick_link.replace(js_fragment, "")

    data_link = onclick_link
    if not data_link:
        if not href_link:
            # an empty lecturer list must not hide the real problem behind an IndexError
            lecturer_name = courseLecturer[0].name if courseLecturer else "unbekannt"
            # the message has to be interpolated here: Exception() does not
            # apply %-formatting to additional positional arguments
            raise Exception(
                "Die %d. Datei hat keinen Link für den Download. Bitte prüfe den Kurs: %s; Dozent: "
                "%s; Link: %s" % (count_data, moodleCourse.name, lecturer_name, response.url))
        # onclick does not exist => fall back to the value of href
        data_link = href_link

    count_data += 1
    courseData = CourseData(link=data_link, course=moodleCourse, moodleHeader=moodleHeader,
                            position=count_data, isNew=1)
    return courseData, count_data
def textProcessing(self, courseData: CourseData):
    """
    Control logic of the text processing for one CourseData.

    The file is written to the temporary path, its text is read, cleaned and
    summarized, and the result (or the error text) is stored in the database.
    The temporary file is removed afterwards, also when one of the steps
    raises.

    :param courseData: The CourseData which shall be processed
    """
    self.error = ""

    # update the view, because this function runs in the same process as the GUI
    QtWidgets.QApplication.processEvents()

    logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)
    logging.info("Beginn der Textaufbereitung Datei: %s", courseData.name)

    # save the CourseData to the tempPath and get the location of the file back
    dataLocation = self.fileHandler.saveFile(courseData, self.tempPath)
    try:
        # read the text from the file
        text = self._readTextFromFile(courseData, dataLocation)

        # update the view, because this function runs in the same process as the GUI
        QtWidgets.QApplication.processEvents()

        if text:
            # keep the raw text, then clean and summarize it
            courseData.fullText = text
            cleanText = self.textCleaner.cleaningText(text)
            courseData = self._summarizeText(courseData, cleanText)
        elif self.error == "":
            # nothing was read and no step reported a reason for it
            self.error = "Die Datei besteht wahrscheinlich nur aus Bildern. Es gibt keinen Text"

        # hand an error that came up during the processing over to the CourseData
        if self.error:
            courseData.error = self.error

        # update the CourseData in the database
        self.database.UpdateCourseDataTextFields(courseData)
    finally:
        # remove the temporary file even if a step above failed
        self.fileHandler.deleteFile(dataLocation)
def _summarizeText(self, courseData: CourseData, cleanText: str) -> CourseData:
    """
    Create the abstract and the word frequency for the clean text of a CourseData.

    If the language of the text is reported as 'uncertain', or the language
    check or the summarizer raises its exception, nothing is summarized and
    the reason is stored in self.error instead.

    :param courseData: The data which shall get an abstract
    :param cleanText: The clean text from the data
    :return: The CourseData object; on success its fields abstract and
             abstractWordFrequency are updated.
    """
    try:
        language = self.checkLanguage.checkLanguage(cleanText)
        if language == 'uncertain':
            # the language could not be determined, so it cannot be summarized
            self.error = "Die Sprache der Datei wird nicht unterstuetzt."
        else:
            logging.info("Versuch der Textzusammenfassung der Datei %s", courseData.name)
            summarizer = self.textSummarizer
            # abstract: summary requested with the value 5
            courseData.abstract = summarizer.summarize_Text(cleanText, 5, language=language)
            # word frequency: requested with the value 10
            courseData.abstractWordFrequency = summarizer.frequency_Words(cleanText, 10, language=language)
        QtWidgets.QApplication.processEvents()
    except ExceptionCheckLanguage as languageError:
        # the language detection failed: report its message
        reason = str(languageError)
        logging.warning("Die Datei: %s %s", courseData.name, reason)
        self.error = reason
    except ExceptionTextAbstraction:
        # the summarizer failed: report a fixed message
        reason = "Die Datei besitzt weniger Sätze oder Wörter, als die Zusammenfassung benötigt."
        logging.warning("Die Datei: %s %s", courseData.name, reason)
        self.error = reason
    return courseData
def get_data_link(self, data: Selector, count_data: int, moodleCourse: MoodleCourse, courseLecturer: list,
                  response, moodleHeader: str) -> (CourseData, int):
    """
    Search the data selector for the data link and increment count_data by one.

    :param data: the selector which contains the data link
    :param count_data: the position of the previous data in the course
    :param moodleCourse: the moodleCourse which was found
    :param courseLecturer: the list of lecturers (the first one's name is used in the error message)
    :param response: the response of the crawled page (its url is used in the error message)
    :param moodleHeader: the header of the moodle section
    :return: the CourseData with the link and the position of the course data
    :raises Exception: if no usable link could be found
    """
    # One data entry can store two different links: one in "onclick" and one
    # in "href". "onclick" is preferred, because it opens the CourseData
    # directly. A missing "href" means the data is hidden from the user and is
    # replaced by a placeholder.
    onclick_link = data.xpath('a/@onclick').extract_first()
    href_link = data.xpath('a/@href').extract_first()
    if onclick_link is None:
        onclick_link = ""
    if href_link is None:
        href_link = "Hidden Data"

    # remove the JavaScript call for the browser from the data link
    for js_fragment in ("window.open(", "); return false;", "'"):
        onclick_link = onclick_link.replace(js_fragment, "")

    # check that at least one link contains characters
    data_link = onclick_link
    if not data_link:
        if not href_link:
            # an empty lecturer list must not hide the real problem behind an IndexError
            lecturer_name = courseLecturer[0].name if courseLecturer else "unbekannt"
            # the message has to be interpolated here: Exception() does not
            # apply %-formatting to additional positional arguments
            raise Exception(
                "Die %d. Datei hat keinen Link für den Download. Bitte prüfe den Kurs: %s; Dozent: "
                "%s; Link: %s" % (count_data, moodleCourse.name, lecturer_name, response.url))
        # onclick does not exist => use the value of href as the data link
        data_link = href_link

    count_data += 1
    courseData = CourseData(link=data_link, course=moodleCourse, moodleHeader=moodleHeader,
                            position=count_data, isNew=1)
    return courseData, count_data
def test_textProcessor_ExceptionCheckLanguage(self, mock_apply_async, mock_pool):
    """
    Test that the text processing works right if the language check raises
    an ExceptionCheckLanguage.

    Procedure:
        1. Set the mocking results
        2. call the function
        ---------
    Verification:
        3. check if file handler was called right
        4. check if multiprocessing was called right
        5. check if text cleaner was called right
        6. check if check language was called right
        7. check that summarize text was not called
        8. check that frequency_words was not called
        9. check if update course data text fields was called right
        10. check if the course data object is right
    """
    def raise_check_language_error(text):
        raise ExceptionCheckLanguage("ExceptionCheckLanguage")

    dummyCourseData = CourseData(id=1, name="test.pdf", dataType="pdf")
    mock_database = Mock(spec=DatabaseManager)
    mock_fileToText = Mock(spec=FileToText)
    mock_textAbstraction = Mock(spec=TextAbstraction)
    mock_fileHandler = Mock(spec=FileHandler)
    mock_checkLanguage = Mock(spec=CheckLanguage)
    mock_textCleaner = Mock(spec=TextCleaner)

    textProcesser = TextProcessing(database=mock_database, fileHandler=mock_fileHandler, path="C:/",
                                   textSummarizer=mock_textAbstraction, fileToText=mock_fileToText,
                                   checkLanguage=mock_checkLanguage, textCleaner=mock_textCleaner)

    # mocking results
    resultFullText = "Das ist ein Volltext"
    resultCleanText = "Das ist ein sauberer Text"

    # set mocking results: pool -> apply_async(...) -> get() returns the full text
    mock_get = MagicMock()
    mock_get_function = MagicMock(return_value=resultFullText)
    mock_get.get = mock_get_function
    mock_apply_async.apply_async.return_value = mock_get
    mock_pool.return_value = mock_apply_async

    mock_fileHandler.saveFile.return_value = "C:/test.pdf"
    mock_fileToText.fileToText.return_value = resultFullText
    mock_textCleaner.cleaningText.return_value = resultCleanText
    mock_checkLanguage.checkLanguage.side_effect = raise_check_language_error

    textProcesser.textProcessing(dummyCourseData)

    # check if FileHandler was called right
    mock_fileHandler.saveFile.assert_called_with(dummyCourseData, "C:/")
    self.assertTrue(mock_fileHandler.saveFile.called)

    # check if multiprocessing was called right
    # (verify the call happened before indexing into the recorded calls)
    self.assertTrue(mock_apply_async.apply_async.called)
    called_args = mock_apply_async.apply_async.call_args_list[0][1]
    self.assertEqual(called_args['args'], ("C:/test.pdf", "pdf"))
    self.assertEqual(called_args['func'], mock_fileToText.fileToText)

    # check if text cleaner was called right
    mock_textCleaner.cleaningText.assert_called_with(resultFullText)
    self.assertTrue(mock_textCleaner.cleaningText.called)

    # check if check language was called right
    mock_checkLanguage.checkLanguage.assert_called_with(resultCleanText)
    self.assertTrue(mock_checkLanguage.checkLanguage.called)

    # check if summarize text was not called
    self.assertFalse(mock_textAbstraction.summarize_Text.called)

    # check if frequency_words was not called
    self.assertFalse(mock_textAbstraction.frequency_Words.called)

    # check if update course data text fields was called right
    mock_database.UpdateCourseDataTextFields.assert_called_with(dummyCourseData)
    self.assertTrue(mock_database.UpdateCourseDataTextFields.called)

    # check if the course data object is right
    self.assertEqual(dummyCourseData.fullText, resultFullText)
    self.assertIsNone(dummyCourseData.abstract)
    self.assertIsNone(dummyCourseData.abstractWordFrequency)
    self.assertEqual(dummyCourseData.error, "ExceptionCheckLanguage")