예제 #1
0
 def LDA(self, num_topics, num_words, vw_path=None):
     """Fit a Vowpal Wabbit LDA model on ``self.para_list``, save it, print topics.

     The fitted model is stored on ``self.ldamodel`` and written to
     ``model/lda_model``.

     Args:
         num_topics: Number of topics passed to ``LdaVowpalWabbit``.
         num_words: Number of words per topic in the printed summary.
         vw_path: Optional path to the Vowpal Wabbit executable. When
             omitted, the previously hard-coded binary location is used.
     """
     import os  # local import: the file's import block is not visible here

     if vw_path is None:
         vw_path = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'

     # self.para_list is fed to corpora.Dictionary and doc2bow, so it is
     # expected to hold one token list per paragraph (confirm with the caller).
     dictionary = corpora.Dictionary(self.para_list)
     doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
     self.ldamodel = LdaVowpalWabbit(vw_path,
                                     doc_term_matrix,
                                     num_topics=num_topics,
                                     id2word=dictionary)

     # Create the output directory up front so saving works on a fresh checkout.
     os.makedirs('model', exist_ok=True)
     self.ldamodel.save('model/lda_model')
     # The printed summary always requests 10 topics, independent of num_topics.
     print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))
 def setUp(self):
     """Prepare the topic fixtures and the topic models used by the tests.

     Two hand-written topic sets are stored: ``topics1`` cleanly separates
     system-human interaction from graphs, while ``topics2`` mixes them, so
     coherence measures for ``topics1`` should come out higher.

     An ``LdaModel`` is always built. The Mallet and Vowpal Wabbit models are
     only built when the ``MALLET_HOME`` / ``VOWPAL_WABBIT_PATH`` environment
     variables are set; otherwise the matching path attribute is ``None``.
     """
     self.topics1 = [
         ['human', 'computer', 'system', 'interface'],
         ['graph', 'minors', 'trees', 'eps'],
     ]
     self.topics2 = [
         ['user', 'graph', 'minors', 'system'],
         ['time', 'graph', 'survey', 'minors'],
     ]

     self.ldamodel = LdaModel(
         corpus=corpus,
         id2word=dictionary,
         num_topics=2,
         passes=0,
         iterations=0,
     )

     # Mallet wrapper: only available when MALLET_HOME points at an install.
     mallet_root = os.environ.get('MALLET_HOME')
     if mallet_root:
         self.mallet_path = os.path.join(mallet_root, 'bin', 'mallet')
         self.malletmodel = LdaMallet(
             mallet_path=self.mallet_path,
             corpus=corpus,
             id2word=dictionary,
             num_topics=2,
             iterations=0,
         )
     else:
         self.mallet_path = None

     # Vowpal Wabbit wrapper: only available when the binary path is exported.
     vw_binary = os.environ.get('VOWPAL_WABBIT_PATH')
     if vw_binary:
         self.vw_path = vw_binary
         self.vwmodel = LdaVowpalWabbit(
             self.vw_path,
             corpus=corpus,
             id2word=dictionary,
             num_topics=2,
             passes=0,
         )
     else:
         msg = "Environment variable 'VOWPAL_WABBIT_PATH' not specified, skipping sanity checks for LDA Model"
         logging.info(msg)
         self.vw_path = None
예제 #3
0
class Contract_Reader():
    """Load a directory of contract texts, optionally clean them, and run LDA.

    Constructing an instance runs the whole pipeline: read the corpus, clean
    paragraphs and/or sentences when the config asks for it, print corpus
    statistics, fit and save an LDA model, and write one word-cloud image per
    topic.
    """

    # Default Vowpal Wabbit executable used by LDA() when no path is given.
    VW_PATH = '/mnt/APPDATA/Project_Mafia/omkhalil/vowpal_binaries/vw-7.20150623'

    def __init__(self, config):
        """Read the corpus described by ``config`` and run the full pipeline.

        Args:
            config: Object exposing ``textpath``, ``clean_paragraphs``,
                ``clean_sentences``, ``num_topics`` and ``num_words``.
        """
        print('Filepath for texts = ', config.textpath)
        # Raw string: '\.' is not a valid escape in a normal string literal.
        self.corpus = PCR(config.textpath,
                          r'.*\.txt',
                          encoding='utf-16',
                          para_block_reader=read_line_block)
        # Defined up front so later steps can tell "not cleaned" from "missing".
        self.para_list = None
        self.sents_list = None
        if config.clean_paragraphs == 'yes':
            self.clean(config, mode='para')
        if config.clean_sentences == 'yes':
            self.clean(config, mode='sent')
        # Corpus summaries
        self.corpus_info()
        self.LDA(config.num_topics, config.num_words)
        self.plot(config.num_words)

    def _raw_paragraphs(self):
        """Return each corpus paragraph flattened into a single token list.

        Paragraphs arrive as lists of sentences, each a list of tokens.
        """
        return [
            list(itertools.chain.from_iterable(para))
            for para in self.corpus.paras()
        ]

    @staticmethod
    def _normalize(text, stop, exclude, lemma):
        """Lower-case ``text``, drop stop words and punctuation, lemmatize.

        Args:
            text: Whitespace-separated text of one document.
            stop: Collection of lower-case words to remove.
            exclude: Collection of single characters to strip.
            lemma: Object with a ``lemmatize(word)`` method.

        Returns:
            The normalized words joined by single spaces.
        """
        stop_free = " ".join(
            word for word in text.lower().split() if word not in stop)
        # Stripping happens after stop-word removal, so a token made only of
        # punctuation becomes empty and disappears in the split below.
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        return " ".join(lemma.lemmatize(word) for word in punc_free.split())

    def _clean_documents(self, documents, stop, exclude, lemma):
        """Normalize each document string and return one token list per document."""
        normalized = [
            self._normalize(doc, stop, exclude, lemma) for doc in documents
        ]
        if normalized:  # an empty corpus has no sample to show
            print(normalized[0])
        tokenized = [doc.split() for doc in normalized]
        if tokenized:
            print(tokenized[0])
        return tokenized

    def clean(self, config, mode='sent'):
        """Clean paragraphs or sentences into lists of normalized tokens.

        Args:
            config: Unused here; kept so existing callers keep working.
            mode: ``'para'`` fills ``self.para_list``; ``'sent'`` fills
                ``self.sents_list``. Any other value does nothing.
        """
        stop = set(stopwords.words('english'))
        exclude = set(string.punctuation)
        lemma = WNL()
        if mode == 'para':
            documents = [" ".join(para) for para in self._raw_paragraphs()]
            self.para_list = self._clean_documents(
                documents, stop, exclude, lemma)
        if mode == 'sent':
            # One string per sentence rather than a list of token lists.
            documents = [" ".join(sent) for sent in self.corpus.sents()]
            self.sents_list = self._clean_documents(
                documents, stop, exclude, lemma)

    def LDA(self, num_topics, num_words, vw_path=None):
        """Fit a Vowpal Wabbit LDA model on the paragraphs, save it, print topics.

        The fitted model is stored on ``self.ldamodel`` and written to
        ``model/lda_model``.

        Args:
            num_topics: Number of topics passed to ``LdaVowpalWabbit``.
            num_words: Number of words per topic in the printed summary.
            vw_path: Optional path to the Vowpal Wabbit executable; defaults
                to ``VW_PATH``.
        """
        import os

        # Paragraph cleaning is optional; without it, use the raw tokens
        # instead of failing on a missing attribute.
        if getattr(self, 'para_list', None) is None:
            self.para_list = self._raw_paragraphs()

        dictionary = corpora.Dictionary(self.para_list)
        doc_term_matrix = [dictionary.doc2bow(para) for para in self.para_list]
        self.ldamodel = LdaVowpalWabbit(vw_path or self.VW_PATH,
                                        doc_term_matrix,
                                        num_topics=num_topics,
                                        id2word=dictionary)
        # Create the output directory up front so saving works on a fresh checkout.
        os.makedirs('model', exist_ok=True)
        self.ldamodel.save('model/lda_model')
        # The printed summary always requests 10 topics, independent of num_topics.
        print(self.ldamodel.print_topics(num_topics=10, num_words=num_words))

    def plot(self, num_words):
        """Save one word-cloud image per topic under ``plots/``.

        Args:
            num_words: Number of words requested from each topic.
        """
        import os

        os.makedirs('plots', exist_ok=True)
        for t in range(self.ldamodel.num_topics):
            fig = plt.figure()
            # Each show_topic pair is flipped before building the mapping.
            # NOTE(review): assumes pairs arrive as (weight, word); confirm
            # against the installed gensim version.
            weights = dict(
                reversed(x) for x in self.ldamodel.show_topic(t, num_words))
            plt.imshow(WordCloud().fit_words(weights))
            plt.axis("off")
            plt.title("Topic #" + str(t))
            plt.savefig('plots/topic' + str(t))
            # Close the figure so a model with many topics does not leave
            # one open figure per topic.
            plt.close(fig)

    def corpus_info(self):
        """
        Print summary information about the status of the corpus.

        Reports file, paragraph and sentence counts, sentences per paragraph,
        word count, vocabulary size and lexical diversity. Ratios are
        reported as 0.0 when their denominator is zero.
        """
        paragraphs = self.corpus.paras()  # fetched once, used for both stats
        fids = len(self.corpus.fileids())
        paras = len(paragraphs)
        sents = len(self.corpus.sents())
        sperp = (sum(len(para) for para in paragraphs) / float(paras)
                 if paras else 0.0)
        tokens = FreqDist(self.corpus.words())
        count = sum(tokens.values())
        vocab = len(tokens)
        lexdiv = float(count) / float(vocab) if vocab else 0.0

        print(
            ("Text corpus contains {} files\n"
             "Composed of {} paragraphs and {} sentences.\n"
             "{:0.3f} sentences per paragraph\n"
             "Word count of {} with a vocabulary of {}\n"
             "lexical diversity is {:0.3f}").format(fids, paras, sents, sperp,
                                                    count, vocab, lexdiv))