def calculateInverseDocumentFrequency(self):
        """Compute a sentence-level inverse document frequency per important word.

        Every tokenized sentence is treated as one "document". For each
        distinct word returned by ``processStopwords(self.newsArticle)`` the
        stored value is ``len(sentences) / n``, where ``n`` is the number of
        sentences for which ``word in sentence`` is true (a case-sensitive
        substring match when sentences are strings). No logarithm is applied;
        the raw ratio is returned.

        Returns:
            dict: word -> ratio, keyed in order of the word's first appearance
            in the word list. Words contained in no sentence are omitted, and
            an empty sentence list yields an empty dict.
        """
        wordList = processStopwords(self.newsArticle)
        sentences = self.getTokenizedSentences()
        totalSentences = len(sentences)
        inverseDocumentFrequencyDictionary = {}

        # Deduplicate while keeping first-seen order: a word that is listed
        # several times must not have its sentence count added up once per
        # listing, otherwise its ratio shrinks with its term frequency.
        for word in dict.fromkeys(wordList):
            sentencesWithWord = sum(1 for sentence in sentences if word in sentence)
            # Skip words found nowhere; this also avoids dividing by zero.
            if sentencesWithWord:
                inverseDocumentFrequencyDictionary[word] = \
                    totalSentences / sentencesWithWord

        return inverseDocumentFrequencyDictionary
    def scoreSentences(self) -> dict:
        """Score each tokenized sentence by the important words it contains.

        A frequency table is built with ``Counter`` over
        ``processStopwords(self.newsArticle)``. For every sentence, the
        frequencies of all table words that occur as a substring of the
        lower-cased sentence are summed, and the sum is floor-divided by the
        sentence's word count as reported by ``TextBlob(sentence).words``.

        Returns:
            dict: maps the first 10 characters of a sentence to its integer
            score. Every sentence gets an entry: a sentence that matches no
            table word, or for which TextBlob reports zero words, scores 0.
            Sentences sharing the same 10-character prefix share one key; the
            later sentence's own score replaces the earlier one.
        """
        freqTable = Counter(processStopwords(self.newsArticle))
        sentences = self.getTokenizedSentences()
        sentenceValue = dict()

        for sentence in sentences:
            sentenceKey = sentence[:10]
            wordCountInSentence = len(TextBlob(sentence).words)
            loweredSentence = sentence.lower()  # hoisted out of the word loop

            # Accumulate locally so a sentence never builds on the score that
            # another sentence with the same key prefix already stored.
            totalFrequency = sum(
                frequency
                for wordValue, frequency in freqTable.items()
                if wordValue in loweredSentence
            )

            # Guard the division: a sentence without any words scores 0
            # instead of raising ZeroDivisionError.
            if wordCountInSentence:
                sentenceValue[sentenceKey] = totalFrequency // wordCountInSentence
            else:
                sentenceValue[sentenceKey] = 0

        return sentenceValue
    def calculateTermFrequency(self):
        """Build a per-word frequency ratio for the current article.

        If ``self.newsArticle.getArticle()`` equals the empty string, a notice
        is printed and ``None`` is returned.

        Otherwise, for every entry of ``processStopwords(self.newsArticle)``
        and every tokenized sentence for which ``word in sentence`` holds, the
        word's counter is increased by one and its ratio is refreshed to
        ``counter / len(self.newsArticle.getArticle())``.

        NOTE(review): the denominator is ``len()`` of the article object; if
        the article is a string that is a character count, not a word count.
        Confirm this is intended.
        NOTE(review): a word listed more than once by ``processStopwords`` is
        counted once per listing per matching sentence. Confirm this is
        intended.

        Returns:
            dict | None: word -> ratio for words found in at least one
            sentence, or ``None`` when the article is empty.
        """
        if self.newsArticle.getArticle() == "":
            print("No Article from calculateTermFrequency")
            return None

        sentenceList = self.getTokenizedSentences()
        importantWords = processStopwords(self.newsArticle)
        hitCounts = {}
        ratios = {}

        for term in importantWords:
            for text in sentenceList:
                if term not in text:
                    continue
                hitCounts[term] = hitCounts.get(term, 0) + 1
                ratios[term] = hitCounts[term] / len(self.newsArticle.getArticle())

        return ratios
Пример #4
0
                baseNewsArticle = BaseNewsArticle(heading=heading,
                                                  article=article,
                                                  summary=given_summary)
                print(
                    f'Article #{line_count}\nHeading:\n{heading}  \n\nSummary:\n{given_summary} '
                    f'\n\nArticle:\n{article}\n')
                print(
                    f'Summary using library:\n{convertListToString(generated_summary)}\n'
                )

                processSentences = ProcessSentences(baseNewsArticle)
                lengthOfHeading = len(heading.strip().split(" "))
                numberOfWordsInArticle = baseNewsArticle.getTotalNumberOfWords(
                )
                numberOfImpWords = len(processStopwords(baseNewsArticle))
                numberOfSentences = len(
                    processSentences.getTokenizedSentences())
                termFrequency = getDictionaryAsString(
                    processSentences.calculateTermFrequency(),
                    Type.TERM_FREQUENCY)
                inverseDocumentFrequency = getDictionaryAsString(
                    processSentences.calculateInverseDocumentFrequency(),
                    Type.INVERSE_DOCUMENT_FREQUENCY)

                termUniqueness = getDictionaryAsString(
                    processSentences.calculateTermUniqueness(),
                    Type.TERM_UNIQUENESS)
                numberOfStopWordsRemoved = numberOfWordsInArticle - numberOfImpWords
                lengthOfGivenSummary = len(given_summary.strip().split(" "))
                medium_summary = processSentences.generate_summary(1.5)