# Module-level imports assumed by the functions below. `s` is this module's
# pickle alias and `env_paths` is the project module that resolves data-file
# locations; both names are taken from the calls in the code, the exact
# import paths are assumptions.
import os
import cPickle as s
from random import shuffle

from numpy import array

import env_paths


def get_document_class(row, batch, training=True):
    """
    The class of the document corresponding to a row in a batch.

    @param row: row index in the bag-of-words matrix of the batch.
    @param batch: the number of the batch.
    @param training: whether to look up the training set or the test set.
    """
    class_indices_for_batch = s.load(open(env_paths.get_class_indices_path(training, batch), "rb"))
    class_names_for_batch = s.load(open(env_paths.get_class_names_path(training), "rb"))
    return class_names_for_batch[class_indices_for_batch[row]]
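# Example usage (a sketch: the row and batch numbers below are hypothetical
# and assume the pickled batch files have already been written to disk):
#
#   label = get_document_class(row=3, batch=0, training=True)
#   print "Row 3 of batch 0 belongs to class:", label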
def __read_docs_from_filesystem(self):
    """
    Read all docs and assign them to batches, so that each doc category is
    represented equally across batches.
    """
    docs_names = []
    docs_names_split = []
    class_indices = []
    class_indices_split = []
    class_names = []
    batches = []

    print "Generating class indices and docs names list."
    doc_count = 0
    for folder in self.paths:
        docs_names_split.append([])
        class_indices_split.append([])
        class_names.append(folder.split("/")[-1])
        if self.trainingset_size is None:
            # Process all data in the specified folders.
            docs = os.listdir(folder)
        elif self.trainingset_attributes is None:
            # A trainingset_size is given: take the leading fraction of the
            # docs as the training split.
            docs = os.listdir(folder)[:int(len(os.listdir(folder)) * self.trainingset_size)]
        else:
            # Otherwise take the trailing fraction as the test split.
            docs = os.listdir(folder)[int(len(os.listdir(folder)) * self.trainingset_size):]
        for doc in docs:
            if doc.endswith(".p"):
                # Record the path and class index of the document.
                docs_names_split[-1].append(folder + "/" + doc)
                class_indices_split[-1].append(len(class_names) - 1)
                doc_count += 1

    if doc_count == 0:  # No stemmed docs were found.
        print "Documents have not been stemmed. Please stem documents in order to create bag of words matrices."
        return 0

    # Ensure that batches contain an equal number of docs from each category.
    print "Arranging the documents."
    if doc_count < self.batchsize:
        print "Number of documents must be at least the batchsize. Please revise the batchsize."
        return 0
    number_of_batches = doc_count / self.batchsize
    number_of_classes = len(self.paths)
    batches_collected_class_indices = []
    batches_collected_docs_names = []

    # Calculate the per-batch quota of each category.
    d = {}
    for i in range(len(class_indices_split)):
        d[i] = float(len(class_indices_split[i])) / number_of_batches

    count = 0
    for i in range(number_of_batches):
        batch_class_indices = []
        batch_docs_names = []
        # Index the quotas by class so d_tmp[j] matches class_indices_split[j].
        d_tmp = array([int(d[j]) for j in range(number_of_classes)])
        while True:
            if (len(batch_class_indices) == self.batchsize
                    and not doc_count - count < self.batchsize) or count == doc_count:
                break
            if len(d_tmp[d_tmp > 0]) == 0:
                break
            for j in range(number_of_classes):
                if (len(batch_class_indices) == self.batchsize
                        and not doc_count - count < self.batchsize) or count == doc_count:
                    break
                if len(class_indices_split[j]) > 0 and d_tmp[j] != 0:
                    batch_class_indices.append(class_indices_split[j].pop(0))
                    batch_docs_names.append(docs_names_split[j].pop(0))
                    d_tmp[j] -= 1
                    count += 1
        batches_collected_class_indices.append(batch_class_indices)
        batches_collected_docs_names.append(batch_docs_names)

    for i in range(number_of_batches):
        # The last batch absorbs the remainder left over by integer division.
        bsize = self.batchsize if i < number_of_batches - 1 else self.batchsize + (doc_count % self.batchsize)
        batch_class_indices = batches_collected_class_indices[i]
        batch_docs_names = batches_collected_docs_names[i]
        # Top up the batch with leftover docs until it reaches its size.
        while len(batch_class_indices) < bsize:
            for j in range(number_of_classes):
                if len(batch_class_indices) == bsize:
                    break
                if len(class_indices_split[j]) > 0:
                    batch_class_indices.append(class_indices_split[j].pop(0))
                    batch_docs_names.append(docs_names_split[j].pop(0))
        # Shuffle the batch.
        batch_class_indices_shuf = []
        batch_docs_names_shuf = []
        index_shuf = range(len(batch_class_indices))
        shuffle(index_shuf)
        for k in index_shuf:
            batch_class_indices_shuf.append(batch_class_indices[k])
            batch_docs_names_shuf.append(batch_docs_names[k])
        # Append the batch to the full lists.
        class_indices += batch_class_indices_shuf
        docs_names += batch_docs_names_shuf

    print "Reading and saving docs from file system"
    count = 0
    class_indices_batch = []
    docs_names_batch = []
    docs_list = []
    for i in xrange(len(class_indices)):
        if not count == 0 and count % self.batchsize == 0:
            # Save the batch when batchsize is reached, unless the remaining
            # docs belong to the (larger) final batch.
            if not (len(class_indices) - count) < self.batchsize:
                print "Read ", str(count), " of ", len(class_indices)
                self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
                batches.append(count)
                # Reset the lists.
                docs_list = []
                docs_names_batch = []
                class_indices_batch = []
        d = s.load(open(docs_names[i], "rb"))
        docs_list.append(d)
        docs_names_batch.append(docs_names[i])
        class_indices_batch.append(class_indices[i])
        count += 1
    # Save the remaining docs.
    if len(docs_list) > 0:
        print "Read ", str(count), " of ", len(class_indices)
        self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
        batches.append(count)

    s.dump(class_names, open(env_paths.get_class_names_path(self.training), "wb"))
    s.dump(batches, open(env_paths.get_batches_path(self.training), "wb"))
    return 1
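# A minimal, self-contained sketch of the round-robin balancing idea used by
# __read_docs_from_filesystem above: draw documents class by class so every
# batch carries an (approximately) equal share of each category. The helper
# below is illustrative only (toy in-memory data, no file I/O or shuffling)
# and is not part of the pipeline.
def _demo_balanced_batching():
    splits = [["s1", "s2", "s3", "s4"], ["t1", "t2"]]  # docs for 2 toy classes
    batchsize = 3
    doc_count = 6
    number_of_batches = doc_count / batchsize  # integer division, as above
    # Per-batch quota for each class, mirroring the dict `d` above.
    per_batch = [int(float(len(c)) / number_of_batches) for c in splits]
    batches = []
    for _ in range(number_of_batches):
        batch = []
        quota = list(per_batch)
        # Draw docs class by class until the batch is full or quotas run out.
        while len(batch) < batchsize and any(q > 0 for q in quota):
            for j in range(len(splits)):
                if len(batch) == batchsize:
                    break
                if len(splits[j]) > 0 and quota[j] > 0:
                    batch.append(splits[j].pop(0))
                    quota[j] -= 1
        batches.append(batch)
    return batches  # [['s1', 't1', 's2'], ['s3', 't2', 's4']]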
def get_all_class_names():
    """
    Get all class names for the training set.
    """
    # The argument is passed positionally, consistent with the other
    # get_class_names_path calls in this module.
    return s.load(open(env_paths.get_class_names_path(True), "rb"))
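# Example usage (assumes __read_docs_from_filesystem has already dumped the
# class-names pickle for the training set):
#
#   for name in get_all_class_names():
#       print name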