Пример #1
0
 def setUpClass(cls):
     """
     Create the fixtures stored on the test class: a FileHandler and two
     CourseData objects (a text file and an html link) with the same
     id, position and content.
     """
     cls.fileHandler = FileHandler()

     # (class attribute, name, dataType) of every CourseData fixture
     fixtures = (
         ("courseDataText", "test.txt", "txt"),
         ("courseDataHtml", "www.test.de", "html"),
     )
     for attribute, fixtureName, fixtureType in fixtures:
         fixture = CourseData(id=1, position=1, name=fixtureName,
                              dataType=fixtureType, dataContent="Hallo")
         setattr(cls, attribute, fixture)
            def get_data_link(self, data, count_data, moodleCourse, courseLecturer,
                              response, moodleHeader):
                """
                Search the data selector for the download link and wrap it in a CourseData.

                The link is read from the ``onclick`` attribute of the anchor; if that
                attribute yields nothing, the value of ``href`` is used instead. A missing
                ``href`` attribute is stored as the placeholder "Hidden Data".

                :param data: the selector whose ``a`` child carries the link attributes
                :param count_data: the number of data counted so far; it is increased by 1
                :param moodleCourse: the course the data belongs to
                :param courseLecturer: the list of lecturers; the first one is named in the error
                :param response: the crawled response; its url is named in the error
                :param moodleHeader: the header of the moodle section
                :return: the new CourseData and the increased count_data
                :raises Exception: if ``onclick`` yields no link and ``href`` is empty
                """
                data_link = data.xpath('a/@onclick').extract_first()
                href_link = data.xpath('a/@href').extract_first()

                if data_link is None:
                    data_link = ""
                if href_link is None:
                    href_link = "Hidden Data"

                # strip the javascript call around the link, so only the link itself remains
                for js_fragment in ("window.open(", "); return false;", "'"):
                    data_link = data_link.replace(js_fragment, "")

                if not data_link:
                    if not href_link:
                        # do not let an empty lecturer list hide the real error behind an IndexError
                        lecturer_name = courseLecturer[0].name if courseLecturer else "unbekannt"
                        # Exception does not interpolate its arguments like logging does,
                        # therefore the message has to be formatted before raising
                        raise Exception(
                            "Die %d. Datei hat keinen Link für den Download. Bitte prüfe den Kurs: %s; Dozent: "
                            "%s; Link: %s" % (count_data, moodleCourse.name, lecturer_name, response.url))
                    # if onclick does not exist => set the data link at the value of href
                    data_link = href_link

                count_data += 1
                courseData = CourseData(link=data_link, course=moodleCourse, moodleHeader=moodleHeader,
                                        position=count_data,
                                        isNew=1)
                return courseData, count_data
Пример #3
0
    def textProcessing(self, courseData: CourseData):
        """
        textProcessing is the control logic of the processing. It saves the CourseData as a
        temporary file, reads the text out of it, cleans the text and summarizes it. The
        result, or the error which popped up, is written to the CourseData and the CourseData
        is updated in the database.

        The temporary file is removed at the end, also if one of the processing steps raises.

        :param courseData: The CourseData which shall be processed
        """
        self.error = ""
        # update the View, because the function runs in the same process as the GUI
        QtWidgets.QApplication.processEvents()
        logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

        logging.info("Beginn der Textaufbereitung Datei: %s", courseData.name)
        # save the CourseData to the tempPath and get the dataLocation back
        dataLocation = self.fileHandler.saveFile(courseData, self.tempPath)

        try:
            # read the text from the file
            text = self._readTextFromFile(courseData, dataLocation)
            # update the View, because the function runs in the same process as the GUI
            QtWidgets.QApplication.processEvents()

            if text:
                # keep the read text, then clean and summarize it
                courseData.fullText = text
                cleanText = self.textCleaner.cleaningText(text)
                courseData = self._summarizeText(courseData, cleanText)
            elif self.error == "":
                # no text and no error reported so far => write a default error
                self.error = "Die Datei besteht wahrscheinlich nur aus Bildern. Es gibt keinen Text"

            # hand an error which popped up in the process over to the CourseData
            if self.error:
                courseData.error = self.error
            # update the CourseData in the database
            self.database.UpdateCourseDataTextFields(courseData)
        finally:
            # remove the temporary file, no matter whether the processing succeeded
            self.fileHandler.deleteFile(dataLocation)
Пример #4
0
    def _summarizeText(self, courseData: CourseData, cleanText: str) -> CourseData:
        """
        Detect the language of the clean text and, if it is not 'uncertain', store an
        abstract and the word frequency of the text in the CourseData.

        If the language is 'uncertain' or one of the steps raises an
        ExceptionCheckLanguage or ExceptionTextAbstraction, the reason is stored in
        self.error and the CourseData fields stay as they are.

        :param courseData: The data which shall get an abstract
        :param cleanText: The clean text from the data
        :return: The CourseData object which was passed in. The fields which may be updated
        are abstract and abstractWordFrequency.
        """
        try:
            detectedLanguage = self.checkLanguage.checkLanguage(cleanText)
            languageSupported = detectedLanguage != 'uncertain'
            if not languageSupported:
                # the language could not be determined => report it as not supported
                self.error = "Die Sprache der Datei wird nicht unterstuetzt."
            else:
                logging.info("Versuch der Textzusammenfassung der Datei %s", courseData.name)
                # abstract built with the argument 5
                courseData.abstract = self.textSummarizer.summarize_Text(
                    cleanText, 5, language=detectedLanguage)
                # word frequency built with the argument 10
                courseData.abstractWordFrequency = self.textSummarizer.frequency_Words(
                    cleanText, 10, language=detectedLanguage)
            QtWidgets.QApplication.processEvents()
        except ExceptionCheckLanguage as languageError:
            # the language detection failed => take over its message as the error text
            message = str(languageError)
            logging.warning("Die Datei: %s %s", courseData.name, message)
            self.error = message
        except ExceptionTextAbstraction:
            # the summarizer failed => report the fixed error text below
            message = "Die Datei besitzt weniger Sätze oder Wörter, als die Zusammenfassung benötigt."
            logging.warning("Die Datei: %s %s", courseData.name, message)
            self.error = message
        return courseData
Пример #5
0
    def get_data_link(self, data: Selector, count_data: int, moodleCourse: MoodleCourse, courseLecturer: list, response, moodleHeader: str) -> (CourseData, int):
        """
        Search the data Selector for the data link and increase count_data by 1.

        :param data: the selector which contains the data link
        :param count_data: the position of the data in the course
        :param moodleCourse: the moodleCourse which was found
        :param courseLecturer: the list of lecturers; the first one is named in the error
        :param response: the crawled response; its url is named in the error
        :param moodleHeader: the header of the moodle section
        :return: the CourseData with the link and the increased position of the course data
        :raises Exception: if "onclick" yields no link and "href" is empty
        """
        # one data can store 2 different links:
        # one link can be in "onclick", another one in "href".
        # If "href" is missing, it is a hidden data for the user.
        # "onclick" is preferred, because it opens the CourseData directly.
        data_link = data.xpath('a/@onclick').extract_first()
        href_link = data.xpath('a/@href').extract_first()

        if data_link is None:
            data_link = ""
        if href_link is None:
            href_link = "Hidden Data"

        # strip the javascript call around the link, so only the link itself remains
        for js_fragment in ("window.open(", "); return false;", "'"):
            data_link = data_link.replace(js_fragment, "")

        if not data_link:
            if not href_link:
                # do not let an empty lecturer list hide the real error behind an IndexError
                lecturer_name = courseLecturer[0].name if courseLecturer else "unbekannt"
                # Exception does not interpolate its arguments like logging does,
                # therefore the message has to be formatted before raising
                raise Exception("Die %d. Datei hat keinen Link für den Download. Bitte prüfe den Kurs: %s; Dozent: "
                                "%s; Link: %s" % (count_data, moodleCourse.name, lecturer_name, response.url))
            # if onclick does not exist => set the data link at the value of href
            data_link = href_link

        count_data += 1
        courseData = CourseData(link=data_link, course=moodleCourse, moodleHeader=moodleHeader, position=count_data,
                                isNew=1)
        return courseData, count_data
Пример #6
0
    def test_textProcessor_ExceptionCheckLanguage(self, mock_apply_async, mock_pool):
        """
        Test if the text processing works right, if the language check raises an
        ExceptionCheckLanguage
        Procedure:
            1. Set the mocking results
            2. call the function
            ---------
            Verification:
            3. check if file handler was called right
            4. check if multiprocessing was called right
            5. check if text cleaner was called right
            6. check if check language was called right
            7. check that summarize text was not called
            8. check that frequency_words was not called
            9. check if update course data text fields was called right
            10. check if the course data object is right

        :param mock_apply_async: injected mock, used as the return value of mock_pool
        :param mock_pool: injected mock (presumably from patch decorators outside this
            snippet - verify against the full file)
        """
        dummyCourseData = CourseData(id=1, name="test.pdf", dataType="pdf")

        mock_database = Mock(spec=DatabaseManager)
        mock_fileToText = Mock(spec=FileToText)
        mock_textAbstraction = Mock(spec=TextAbstraction)
        mock_fileHandler = Mock(spec=FileHandler)
        mock_checkLanguage = Mock(spec=CheckLanguage)
        mock_textCleaner = Mock(spec=TextCleaner)
        textProcesser = TextProcessing(database=mock_database, fileHandler=mock_fileHandler, path="C:/",
                                       textSummarizer=mock_textAbstraction, fileToText=mock_fileToText,
                                       checkLanguage=mock_checkLanguage, textCleaner=mock_textCleaner)

        # mocking results
        resultFullText = "Das ist ein Volltext"
        resultCleanText = "Das ist ein sauberer Text"

        # set mocking results
        mock_get = MagicMock()
        mock_get.get = MagicMock(return_value=resultFullText)
        mock_apply_async.apply_async.return_value = mock_get
        mock_pool.return_value = mock_apply_async
        mock_fileHandler.saveFile.return_value = "C:/test.pdf"
        mock_fileToText.fileToText.return_value = resultFullText
        mock_textCleaner.cleaningText.return_value = resultCleanText
        # a mock raises an exception instance which is set as its side_effect
        mock_checkLanguage.checkLanguage.side_effect = ExceptionCheckLanguage("ExceptionCheckLanguage")

        textProcesser.textProcessing(dummyCourseData)

        # check if FileHandler was called right
        # (assert_called_with already fails if the mock was never called)
        mock_fileHandler.saveFile.assert_called_with(dummyCourseData, "C:/")

        # check if multiprocessing was called right; the call itself is asserted first,
        # so a missing call fails as an assertion and not as an IndexError
        self.assertTrue(mock_apply_async.apply_async.called)
        called_args = mock_apply_async.apply_async.call_args_list[0][1]
        self.assertEqual(called_args['args'], ("C:/test.pdf", "pdf"))
        self.assertEqual(called_args['func'], mock_fileToText.fileToText)

        # check if text cleaner was called right
        mock_textCleaner.cleaningText.assert_called_with(resultFullText)

        # check if check language was called right
        mock_checkLanguage.checkLanguage.assert_called_with(resultCleanText)

        # check if summarize text was not called
        self.assertFalse(mock_textAbstraction.summarize_Text.called)

        # check if frequency_words was not called
        self.assertFalse(mock_textAbstraction.frequency_Words.called)

        # check if update course data text fields was called right
        mock_database.UpdateCourseDataTextFields.assert_called_with(dummyCourseData)

        # check if the course data object is right
        self.assertEqual(dummyCourseData.fullText, resultFullText)
        self.assertIsNone(dummyCourseData.abstract)
        self.assertIsNone(dummyCourseData.abstractWordFrequency)
        self.assertEqual(dummyCourseData.error, "ExceptionCheckLanguage")