Exemplo n.º 1
0
 def __tokenize_sentence(self, processed):
     """
     Tokenize a preprocessed sentence, stripping sentence-level
     punctuation such as commas while keeping intra-word dashes
     (e.g. the hyphen in 'morning-after').
     Intended for internal use only.
     """
     token_list = Preprocessor.get_processed_tokens(processed)
     self.tokens = token_list
Exemplo n.º 2
0
    def get_idf_array(self):
        """
        Use external corpus to get IDF scores
        for cluster centroid calculations.

        Builds a binary document-term incidence matrix over the chosen
        NLTK corpus (Brown by default, Reuters when ``self.args.corpus ==
        'R'``) and computes smoothed IDF as log10(n / (df + 1)).

        :return: numpy array of idf values, stored in ``self.idf_array``
        """
        # Select the reference corpus: Reuters when requested, Brown otherwise.
        corpus = reuters if self.args.corpus == 'R' else brown
        num_words = Vectors().num_unique_words
        doc_ids = corpus.fileids()  # hoisted: original called fileids() twice
        n = len(doc_ids)  # number of documents in corpus
        # Incidence matrix: cell (d, w) is 1 iff word w occurs in document d.
        docs_word_matrix = np.zeros([n, num_words])
        for doc_idx, doc_id in enumerate(doc_ids):
            words_in_doc = set()
            for sent in corpus.sents(doc_id):
                proc_s = Preprocessor.get_processed_tokens(
                    Preprocessor.get_processed_sentence(' '.join(sent)))
                if proc_s:
                    words_in_doc.update(proc_s)
            for word in words_in_doc:
                word_idx = WordMap.id_of(word)
                # Bug fix: the original `if word_idx:` used truthiness, which
                # silently dropped the word whose mapped id is 0.
                # assumes WordMap.id_of returns None for unmapped words — TODO confirm
                if word_idx is not None:
                    docs_word_matrix[doc_idx, word_idx] = 1

        # Document frequency per word; +1 smoothing avoids divide-by-zero
        # for words that never occur in the corpus.
        docs_per_word = np.sum(docs_word_matrix, axis=0)
        self.idf_array = np.log10(np.divide(n, docs_per_word + 1))

        return self.idf_array