def get_idf_array(self):
    """
    Use an external corpus to get IDF scores for cluster centroid calculations
    :return: numpy array of idf values
    """
    corpus = brown
    if self.args.corpus == 'R':
        corpus = reuters
    num_words = Vectors().num_unique_words
    n = len(corpus.fileids())  # number of documents in corpus
    docs_word_matrix = np.zeros([n, num_words])
    for doc_idx, doc_id in enumerate(corpus.fileids()):
        sentences = list(corpus.sents(doc_id))
        words_in_doc = set()
        for s in sentences:
            s = ' '.join(s)
            proc_s = Preprocessor.get_processed_tokens(Preprocessor.get_processed_sentence(s))
            if proc_s:
                words_in_doc = words_in_doc.union(proc_s)
        for word in words_in_doc:
            word_idx = WordMap.id_of(word)
            if word_idx is not None:  # compare to None: 0 is a valid word id and would be skipped by a bare truthiness test
                docs_word_matrix[doc_idx, word_idx] = 1
    docs_per_word = np.sum(docs_word_matrix, axis=0)
    self.idf_array = np.log10(np.divide(n, docs_per_word + 1))  # add one to avoid divide-by-zero error
    return self.idf_array
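
# --- Illustrative sketch (not part of the class above). It reproduces the
# smoothed-IDF step of get_idf_array on a tiny hand-built binary
# document-word matrix; toy_matrix and n_docs are hypothetical names used
# only for this demonstration.
import numpy as np

toy_matrix = np.array([[1, 1, 0],   # doc 0 contains words 0 and 1
                       [1, 0, 0],   # doc 1 contains word 0 only
                       [1, 0, 1]])  # doc 2 contains words 0 and 2
n_docs = toy_matrix.shape[0]
docs_per_word = toy_matrix.sum(axis=0)        # -> [3, 1, 1]
idf = np.log10(n_docs / (docs_per_word + 1))  # same +1 smoothing as above
print(idf)  # word 0 occurs in every doc, so it gets the lowest (here negative) idf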
def __init__(self, raw_sentence, sent_pos, doc_id=None):
    """
    Initialize the Sentence class with the plain/raw and tokenized versions of
    the sentence, its position in the document, and the document id
    :param raw_sentence: the original sentence text
    :param sent_pos: position of the sentence within its document
    :param doc_id: id of the document the sentence belongs to
    """
    self.raw_sentence = ' '.join(raw_sentence.rstrip().split())  # collapse runs of whitespace
    self.raw_sentence = Preprocessor.strip_beginning(self.raw_sentence)
    self.tokens = []
    self.processed = Preprocessor.get_processed_sentence(self.raw_sentence)
    self.__tokenize_sentence(self.processed)
    self.sent_pos = int(sent_pos)  # position of sentence in document
    self.doc_id = doc_id
    self.vector = []  # placeholder
    self.order_by = self.sent_pos
    # score attributes, filled in later by the scoring stages
    self.c_score = self.p_score = self.f_score = None
    self.mead_score = self.lda_scores = self.melda_scores = None
    self.compressed = self.raw_sentence
    # update global mapping of words to indices
    WordMap.add_words(self.tokens)  # make sure self.tokens is the right thing here
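
# --- Illustrative usage sketch, assuming the surrounding Sentence class and
# its Preprocessor/WordMap dependencies are importable; the sentence text and
# doc_id below are made-up examples, not values from this repo.
# sent = Sentence("  The   quick brown fox jumped.  ", sent_pos=0, doc_id="doc-01")
# sent.raw_sentence  -> "The quick brown fox jumped."  (whitespace collapsed)
# sent.sent_pos      -> 0
# sent.order_by      -> 0   (defaults to the document position)
# WordMap now also contains this sentence's processed tokens.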