Exemplo n.º 1
0
class Corpus:
    """
    A set of speeches to be analyzed together.

    The constructor takes a list of file names from the speeches folder,
    for example ``["1977.txt", "1980.txt"]``. If the list is empty, the
    corpus is created with the complete collection of speeches (from 1975
    to 2017).

    Attributes set by the constructor:
        corpus: PlaintextCorpusReader over the selected files.
        speech: one Speech built from the raw text, words, sentences and
            paragraphs of the whole corpus.
        speeches: result of ``build_speeches_dict(corpus)``; used here as a
            mapping whose values expose ``tokens`` and ``speech_to_dict()``
            (keys appear to be file ids).
        years: ints parsed from the part of each file id before the first
            ``'.'``.
        complementary: ComplementaryCorpus built from the files of the
            speeches folder that are not in this corpus, or None when the
            corpus already holds the complete collection.
        unique_words: tokens of this corpus that never occur in the
            complementary corpus (duplicates kept), or None when there is
            no complementary corpus.
    """

    # Folder that holds one plain-text file per speech.
    _SPEECHES_DIR = './speeches'

    def __init__(self, files):
        use_all_speeches = not files
        # '.*' is a regular expression that selects every file in the folder.
        selection = '.*' if use_all_speeches else files
        self.corpus = PlaintextCorpusReader(self._SPEECHES_DIR, selection)
        self.speech = Speech(self.corpus.raw(), self.corpus.words(),
                             self.corpus.sents(), self.corpus.paras(), None,
                             None, None, None)
        self.speeches = build_speeches_dict(self.corpus)
        self.years = [
            int(fileid.split('.')[0]) for fileid in self.corpus.fileids()
        ]
        if use_all_speeches:
            # Nothing is left outside the corpus to compare against.
            self.complementary = None
            self.unique_words = None
        else:
            own_files = {str(year) + '.txt' for year in self.years}
            # Sorted so the complementary corpus is built in the same file
            # order on every run (set iteration order is not stable).
            complementary_files = sorted(
                set(os.listdir(self._SPEECHES_DIR)) - own_files)
            self.complementary = ComplementaryCorpus(complementary_files)
            self.unique_words = self._tokens_missing_from(
                self.speech.tokens, self.complementary.speech.tokens)

    @staticmethod
    def _tokens_missing_from(tokens, other_tokens):
        """
        :param tokens: iterable of tokens to filter
        :param other_tokens: iterable of tokens to exclude
        :return: list of the items of ``tokens`` that do not occur in
        ``other_tokens``, in their original order and with duplicates kept
        """
        # One set build instead of a linear scan per token.
        # Assumes tokens are hashable (e.g. strings).
        excluded = set(other_tokens)
        return [token for token in tokens if token not in excluded]

    def to_speeches_list(self):
        """
        :return: list with the ``speech_to_dict()`` result of every speech
        in the corpus, in the iteration order of ``self.speeches``
        """
        return [speech.speech_to_dict() for speech in self.speeches.values()]

    def print_graph(self, my_words):
        """
        Plots, for each word of my_words, how often it occurs in each speech.

        :param my_words: list of words whose frequency is to be plotted.
        Tokens are lower-cased before the comparison, so an entry that
        contains upper-case characters never matches.
        :return: None; the plot is drawn by ``ConditionalFreqDist.plot()``
        """
        def occurrences():
            for fileid, speech in self.speeches.items():
                for token in speech.tokens:
                    # Lower-case once per token, not once per target word.
                    lowered = token.lower()
                    for target in my_words:
                        if lowered == target:
                            yield target, fileid

        cfd = nltk.ConditionalFreqDist(occurrences())
        cfd.plot()

    def get_files(self):
        """
        :return: list of files in the Corpus object
        """
        return self.corpus.fileids()

    def unique_words_freq(self):
        """
        :return: the words in the corpus object that are unique to that corpus
        (i.e., these words don't appear in the rest of the speeches) as a
        list of (word, count) pairs, most common first. When the corpus
        contains all speeches an explanatory message string is returned
        instead.
        """
        if self.unique_words is None:
            return "The corpus contains all speeches, so no comparison can be made"
        return nltk.FreqDist(self.unique_words).most_common()

    def radiography(self):
        """
        Prints the lexical radiography of the corpus: period covered, number
        of speeches, word counts, frequency distributions, hapaxes, unique
        words and the most frequent bigrams and trigrams.

        :return: None; everything is written to standard output
        """
        # Count once: every words() call returns a new view to be measured.
        total_words = len(self.corpus.words())
        # min/max rather than first/last: the files may have been given to
        # the constructor in any order.
        print("Lexical data for period from " + str(min(self.years)) +
              " to " + str(max(self.years)))
        print(str(len(self.years)) + " total speeches")
        print(str(total_words) + " total words")
        print(
            str(total_words / len(self.get_files())) +
            " words per speech")
        print("Frequency distribution:")
        print(self.speech.frequencies())
        print("Content words frequency distribution:")
        print(self.speech.most_frequent_content_words())
        print("Hapaxes:")
        print(self.speech.hapaxes())
        print("Unique words frequency distribution:")
        print(self.unique_words_freq())
        print("Most frequent content bigrams:")
        print(self.speech.most_frequent_bigrams())
        print("Most frequent content trigrams:")
        print(self.speech.most_frequent_trigrams())
        print("#######################################")