def preprocess_one(self, raw_file):
    '''
    Takes a single document (raw_file) and calculates all that is
    necessary to incorporate that document into the existing set of
    documents: registers it, extends the token vocabulary, resizes the
    tf/idf matrices, and recomputes the tf-idf matrix.

    Args:
        raw_file: text (string)
    '''
    logger.info("Preprocessing one...")

    # create document
    document = preprocess_one(raw_file)
    if document.identifier in self.documents:
        logger.info("Document already exists.")
        return

    # update documents
    self.documents[document.identifier] = document
    self.docs_no = len(self.documents)
    self.iterative_docs.append(document.identifier)

    # update tokens: assign a fresh column index to every unseen token
    for token in document.bag:
        if token not in self.tokens:
            self.tokens[token] = self.tokens_no
            self.tokens_no += 1

    new_tf_shape = (self.docs_no, self.tokens_no)
    new_idf_shape = (1, self.tokens_no)

    # resize matrices (lil format allows cheap element-wise assignment)
    tf_lil = resize_tf(self.tf, new_tf_shape)
    idf_lil = resize_idf(self.idf, new_idf_shape)

    # update tf: normalize by the most frequent token in this document.
    # default=0 avoids a ValueError for an empty bag (the loop below is
    # then a no-op, so max_freq is never actually used as a divisor).
    max_freq = max(document.bag.values(), default=0)
    doc_index = self.docs_no - 1  # row of the newly added document
    for token, freq in document.bag.items():
        token_index = self.tokens[token]
        tf_lil[doc_index, token_index] = tf(freq, max_freq)

    # update idf: one more document contains each of these tokens
    for token in document.bag:
        index = self.tokens[token]
        idf_lil[0, index] += 1

    # convert back to csr (faster multiplication)
    self.tf = tf_lil.tocsr()
    self.idf = idf_lil.tocsr()

    # TODO: multiply only last row
    self.tf_idf = self.tf.multiply(self.idf)
def preprocess_one(self, raw_file):
    '''
    Takes a single document (raw_file) and calculates all that is
    necessary to incorporate that document into the existing set of
    documents: records its per-token occurrence counts in docs_bag and
    stores the document itself.

    Args:
        raw_file: text (string)
    '''
    logger.info("Preprocessing one...")

    # create document
    document = preprocess_one(raw_file)

    # update docs_bag dict: map token -> {document id -> occurrence count}
    doc_id = document.identifier
    for token, occurrences in document.bag.items():
        self.docs_bag.setdefault(token, {})[doc_id] = occurrences

    # add the document to the documents set
    self.documents[doc_id] = document