def get_bag_of_words_matrix(batch, training=True):
    """
    Retrieve the bag of words matrix for a batch.

    The matrix is deserialized from the file located at
    env_paths.get_bow_matrix_path(training, int(batch)) and wrapped with
    array() before being returned.

    @param batch: the number of the batch. It is converted with int() before
    the path lookup, so numeric strings are accepted as well.
    @param training: forwarded to env_paths.get_bow_matrix_path to select the
    path (presumably training data versus test data; confirm against
    env_paths).
    @return: the result of array() applied to the deserialized matrix.
    """
    # Use a context manager so the file handle is closed deterministically,
    # even if deserialization raises.
    with open(env_paths.get_bow_matrix_path(training, int(batch)), "rb") as bow_file:
        return array(s.load(bow_file))
def __generate_word_matrix(self, index_lookup):
    """
    Generate a BOW matrix for every batch and serialize it to disk.

    Rows and columns of each matrix correspond to documents and words
    respectively. The list of batches is loaded from
    env_paths.get_batches_path(self.training). For each batch the document
    list is loaded, every token occurrence is counted in the matrix, and the
    matrix is dumped (as a nested list) to
    env_paths.get_bow_matrix_path(self.training, batch). A progress line is
    printed after each batch.

    @param index_lookup: A dictionary mapping each token to the column that
    should be incremented in the word matrix. Tokens that are not keys of
    this dictionary are skipped.
    """
    with open(env_paths.get_batches_path(self.training), "rb") as batches_file:
        batches = s.load(batches_file)
    length = len(batches)

    for processed, batch in enumerate(batches, 1):
        with open(env_paths.get_doc_list_path(self.training, batch), "rb") as docs_file:
            docs_list = s.load(docs_file)

        bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)])
        for row, doc in enumerate(docs_list):
            for token in doc:
                # Keep the try body limited to the lookup that can raise.
                try:
                    col = index_lookup[token]
                except KeyError:
                    # The word is not found in the dictionary: ignore it.
                    continue
                bag_of_words_matrix[row, col] += 1

        # Serialize bag of words. The context manager guarantees that the
        # output is flushed and closed before the next batch is processed.
        with open(env_paths.get_bow_matrix_path(self.training, batch), "wb") as bow_file:
            s.dump(bag_of_words_matrix.tolist(), bow_file)

        # Single-argument parenthesised print: same output under the print
        # statement and the print function.
        print("Processed %s of %s batches" % (processed, length))
def __generate_word_matrix(self, index_lookup):
    """
    Generate a BOW matrix for every batch and serialize it to disk.

    Rows and columns of each matrix correspond to documents and words
    respectively. The list of batches is loaded from
    env_paths.get_batches_path(self.training). For each batch the document
    list is loaded, every token occurrence is counted in the matrix, and the
    matrix is dumped (as a nested list) to
    env_paths.get_bow_matrix_path(self.training, batch). A progress line is
    printed after each batch.

    @param index_lookup: A dictionary mapping each token to the column that
    should be incremented in the word matrix. Tokens that are not keys of
    this dictionary are skipped.
    """
    with open(env_paths.get_batches_path(self.training), "rb") as batches_file:
        batches = s.load(batches_file)
    length = len(batches)

    for processed, batch in enumerate(batches, 1):
        with open(env_paths.get_doc_list_path(self.training, batch), "rb") as docs_file:
            docs_list = s.load(docs_file)

        bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)])
        for row, doc in enumerate(docs_list):
            for token in doc:
                # Keep the try body limited to the lookup that can raise.
                try:
                    col = index_lookup[token]
                except KeyError:
                    # The word is not found in the dictionary: ignore it.
                    continue
                bag_of_words_matrix[row, col] += 1

        # Serialize bag of words. The context manager guarantees that the
        # output is flushed and closed before the next batch is processed.
        with open(env_paths.get_bow_matrix_path(self.training, batch), "wb") as bow_file:
            s.dump(bag_of_words_matrix.tolist(), bow_file)

        # Single-argument parenthesised print: same output under the print
        # statement and the print function.
        print('Processed %s of %s batches' % (processed, length))
def save_batch(batch, batch_lbl, batchno, training):
    """
    Pickle a batch and its labels to their respective files.

    The labels are written to
    env_paths.get_class_indices_path(training, batchno) and the batch itself
    to env_paths.get_bow_matrix_path(training, batchno).

    @param batch: the object to store at the bag of words matrix path.
    @param batch_lbl: the object to store at the class indices path.
    @param batchno: the batch identifier forwarded to the env_paths lookups.
    @param training: forwarded to the env_paths lookups to select the paths.
    """
    # Context managers ensure each file is flushed and closed as soon as its
    # dump finishes, instead of relying on garbage collection.
    with open(env_paths.get_class_indices_path(training, batchno), "wb") as lbl_file:
        pickle.dump(batch_lbl, lbl_file)
    with open(env_paths.get_bow_matrix_path(training, batchno), "wb") as bow_file:
        pickle.dump(batch, bow_file)