def compare_real_data_to_reconstructed_data():
    """
    Visualise the first document of each of (up to) 10 classes in the first
    test batch next to its DBN reconstruction.

    Loads the trained DBN weights and the test batches, picks the first
    document encountered for each distinct class index, pushes each through
    the network and hands real/reconstructed pairs to the visualiser.
    """
    weights = s.load(open(env_paths.get_dbn_weight_path(), "rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False), "rb"))
    class_indices = s.load(open(env_paths.get_class_indices_path(False, batches[0]).replace(".0", ""), "rb"))
    batch = batches[0]
    data = data_processing.get_bag_of_words_matrix(batch, training=False)
    # Map class index -> first document of that class. The original code
    # named this variable `dict`, shadowing the builtin; renamed.
    class_to_doc = {}
    for i in range(len(class_indices)):
        idx = class_indices[i]
        if idx in class_to_doc:  # O(1) membership instead of `in dict.keys()`
            continue
        class_to_doc[idx] = data[i]
        if len(class_to_doc) >= 10:  # one representative per class, max 10
            break
    print(class_to_doc.keys())
    data_points = class_to_doc.values()
    output_data_points = []
    for d in data_points:
        d = append(d, 1.)  # append the bias unit before the forward pass
        out = generate_output_data(d, weights)
        output_data_points.append(out)
    visualise_data_points(data_points, output_data_points)
def __generate_word_matrix(self, index_lookup): """ Generate a BOW matrix with rows, columns corresponding to documents, words respectively. @param index_lookup: A dictionary with keys for the attributes. In order to know which colounm should be incremented in word_matrix. """ batches = s.load(open(env_paths.get_batches_path(self.training), "rb")) length = len(batches) processed = 1 for batch in batches: docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb")) bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)]) row = 0 for doc in docs_list: for token in doc: try: # If word is not found in the dictionary col = index_lookup[token] bag_of_words_matrix[row, col] += 1 except KeyError: continue row += 1 # Serialize bag of words s.dump(bag_of_words_matrix.tolist(), open(env_paths.get_bow_matrix_path(self.training, batch), "wb")) print "Processed " + str(processed) + " of " + str(length) + " batches" processed += 1
def get_batch_list(training=True):
    """
    Retrieve the list containing the batch numbers.

    @param training: is this the training set or the test set.
    """
    batches_file = open(env_paths.get_batches_path(training), "rb")
    return s.load(batches_file)
def __generate_word_matrix(self, index_lookup):
    """
    Generate a BOW matrix with rows, columns corresponding to documents, words respectively.

    One matrix is built and serialized per batch; entry [row, col] is the count
    of word `col` in document `row`.

    @param index_lookup: A dictionary with keys for the attributes. In order to know
        which column should be incremented in word_matrix.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    length = len(batches)
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        # One row per document, one column per attribute word.
        bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)])
        row = 0
        for doc in docs_list:
            for token in doc:
                try:
                    # If word is not found in the dictionary, KeyError skips it.
                    col = index_lookup[token]
                    bag_of_words_matrix[row, col] += 1
                except KeyError:
                    continue
            row += 1
        # Serialize bag of words
        s.dump(bag_of_words_matrix.tolist(), open(env_paths.get_bow_matrix_path(self.training, batch), "wb"))
        print 'Processed ' + str(processed) + ' of ' + str(length) + ' batches'
        processed += 1
def __set_attributes(self):
    """
    Set the attributes containing of a list of words of all attributes in the bag of words matrix.

    Two passes over the serialized batches: first collect the deduplicated set
    of all tokens (optionally intersected with the acceptance list), then count
    occurrences and keep only the `self.max_words_matrix` most frequent words.

    @return: The generated list of words acting as attributes for the BOWs.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    length = len(batches)
    attributes = []
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        tmp_attributes = list(
            set(sorted(list(chain(*docs_list))))
        )  # Retrieve the each word of the docs list in a sorted list
        attributes += tmp_attributes
        attributes = list(
            set(sorted(attributes))
        )  # Sort the attributes list so that there is no 2 occurrences of the same word.
        if not self.acceptance_lst == None:
            attributes = list(
                set(attributes).intersection(self.acceptance_lst)
            )  # Only consider words in the acceptance list.
        print "Processed attribute " + str(processed) + " of " + str(length) + " batches"
        processed += 1
    # Find attributes of the most common words.
    # Keys are initialised to None; first occurrence sets the count to 1 below.
    d = dict.fromkeys(attributes)
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        words = list(list(chain(*docs_list)))
        for w in words:
            try:
                if d[w] == None:
                    d[w] = 1
                else:
                    d[w] += 1
            except KeyError:
                # Word is not in the (possibly acceptance-filtered) vocabulary.
                continue
        print "Processed summing " + str(processed) + " of " + str(length) + " batches"
        processed += 1
    # Sort ascending by count and keep the tail of max_words_matrix words.
    sorted_att = sorted(d.items(), key=lambda x: x[1])
    sorted_att = sorted_att[len(sorted_att) - self.max_words_matrix :]
    attributes = [elem[0] for elem in sorted_att]
    # Serialize attributes
    s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
    return attributes
def __set_attributes(self):
    """
    Set the attributes containing of a list of words of all attributes in the bag of words matrix.

    Two passes over the serialized batches: first collect the set of all tokens
    (optionally intersected with the acceptance list), then count occurrences
    and keep only the `self.max_words_matrix` most frequent words.

    @return: The generated list of words acting as attributes for the BOWs.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    length = len(batches)
    attributes = set()
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        # A set already deduplicates; the original sorted-then-set dance was redundant.
        attributes.update(chain(*docs_list))
        if self.acceptance_lst is not None:  # identity check instead of `== None`
            attributes.intersection_update(self.acceptance_lst)  # Only consider words in the acceptance list.
        print('Processed attribute ' + str(processed) + ' of ' + str(length) + ' batches')
        processed += 1
    # Find attributes of the most common words.
    d = dict.fromkeys(attributes, 0)  # start counts at 0 instead of None
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        for w in chain(*docs_list):
            if w in d:  # ignore tokens outside the vocabulary
                d[w] += 1
        print('Processed summing ' + str(processed) + ' of ' + str(length) + ' batches')
        processed += 1
    # Sort ascending by count and keep the tail of max_words_matrix words.
    sorted_att = sorted(d.items(), key=lambda item: item[1])
    # Guard against max_words_matrix > vocabulary size: the original negative
    # slice start would silently return fewer words than are available.
    start = max(0, len(sorted_att) - self.max_words_matrix)
    attributes = [word for word, _ in sorted_att[start:]]
    # Serialize attributes
    s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
    return attributes
def compare_real_data_to_reconstructed_data_random():
    """
    Visualise 10 randomly chosen test documents next to their DBN reconstructions.

    Picks a test batch at random, samples 10 document indices from it, runs each
    document (with an appended bias unit) through the network and hands the
    real/reconstructed pairs to the visualiser.
    """
    weights = s.load(open(env_paths.get_dbn_weight_path(), "rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False), "rb"))
    batch = choice(batches)  # make sure to pick batch at random
    data = data_processing.get_bag_of_words_matrix(batch, training=False)
    # choose 10 data points at random
    data_points = [data[idx] for idx in random.randint(0, len(data), 10)]
    output_data_points = [generate_output_data(append(d, 1.), weights) for d in data_points]
    visualise_data_points(data_points, output_data_points)
def save_batches(batches, training):
    """
    Serialize the batch list to its canonical path.

    @param batches: the list of batch numbers to persist.
    @param training: whether this is the training set or the test set.
    """
    out_file = open(env_paths.get_batches_path(training), "wb")
    pickle.dump(batches, out_file)
def __read_docs_from_filesystem(self):
    """
    Read all docs and assign them to batches, so that each doc category is represented equally across batches.

    Phase 1: walk self.paths, collecting per-class document names and class
    indices (honouring trainingset_size / trainingset_attributes slicing).
    Phase 2: distribute documents round-robin across batches proportionally to
    each class's size, top up short batches, and shuffle within each batch.
    Phase 3: load each document, serialize them batch by batch, and persist
    the class names and batch list.

    @return: 1 on success, 0 if no stemmed docs were found or batchsize is too large.
    """
    docs_names = []
    docs_names_split = []
    class_indices = []
    class_indices_split = []
    class_names = []
    batches = []
    print "Generating class indices and docs names list."
    doc_count = 0
    for folder in self.paths:
        docs_names_split.append([])
        class_indices_split.append([])
        # Class name is the last path component of the folder.
        class_names.append(folder.split("/")[len(folder.split("/")) - 1])
        if self.trainingset_size == None:  # If data processing should be done on all data in the specified folders.
            docs = os.listdir(folder)
        elif (
            not self.trainingset_size == None and self.trainingset_attributes == None
        ):  # If data processing should be done on parts of the docs in the specified folders - for training and testing purposes.
            docs = os.listdir(folder)[: int(len(os.listdir(folder)) * self.trainingset_size)]
        else:  # If data processing should be done on a test set.
            docs = os.listdir(folder)[int(len(os.listdir(folder)) * self.trainingset_size) :]
        for doc in docs:
            if doc.endswith(".p"):  # Append the name of the document to the list containing document names.
                docs_names_split[-1].append(folder + "/" + doc)
                class_indices_split[-1].append(len(class_names) - 1)
                doc_count += 1
    if len(docs_names_split) == 0:  # Check if docs have been stemmed.
        print "Documents have not been stemmed. Please stem documents in order to create bag of words matrices."
        return 0
    # Ensure that batches contain an even amount of docs from each category.
    print "Arranging the documents."
    if doc_count < self.batchsize:
        print "Number of documents must be greater than batchsize. Please revise the batchsize."
        return 0
    # NOTE: Python 2 integer division — any remainder is folded into the last batch.
    number_of_batches = doc_count / self.batchsize
    number_of_classes = len(self.paths)
    batches_collected_class_indices = []
    batches_collected_docs_names = []
    # Calculate fraction of category in each batch.
    d = {}
    for i in range(len(class_indices_split)):
        d[i] = float(len(class_indices_split[i])) / number_of_batches
    count = 0
    for i in range(number_of_batches):
        batch_class_indices = []
        batch_docs_names = []
        # Per-batch quota per class (floor of the fractional share).
        d_tmp = array([int(v) for v in d.values()])
        while True:
            # Stop when the batch is full (and enough docs remain for another
            # batch) or when every document has been consumed.
            if (
                (len(batch_class_indices) == self.batchsize)
                and (not doc_count - count < self.batchsize)
                or (count == doc_count)
            ):
                break
            if len(d_tmp[d_tmp > 0]) == 0:  # all per-class quotas exhausted
                break
            for j in range(number_of_classes):
                if (
                    (len(batch_class_indices) == self.batchsize)
                    and (not doc_count - count < self.batchsize)
                    or (count == doc_count)
                ):
                    break
                if len(class_indices_split[j]) > 0 and d_tmp[j] != 0:
                    batch_class_indices.append(class_indices_split[j].pop(0))
                    batch_docs_names.append(docs_names_split[j].pop(0))
                    d_tmp[j] -= 1
                    count += 1
        batches_collected_class_indices.append(batch_class_indices)
        batches_collected_docs_names.append(batch_docs_names)
    for i in range(number_of_batches):
        # Last batch absorbs the remainder docs.
        bsize = self.batchsize if i < number_of_batches - 1 else self.batchsize + (doc_count % self.batchsize)
        batch_class_indices = batches_collected_class_indices[i]
        batch_docs_names = batches_collected_docs_names[i]
        if len(batch_class_indices) < bsize:
            # Top up short batches from whatever classes still have documents.
            while True:
                if len(batch_class_indices) == bsize:
                    break
                for j in range(number_of_classes):
                    if len(batch_class_indices) == bsize:
                        break
                    if len(class_indices_split[j]) > 0:
                        batch_class_indices.append(class_indices_split[j].pop(0))
                        batch_docs_names.append(docs_names_split[j].pop(0))
        # Shuffle the batch
        batch_class_indices_shuf = []
        batch_docs_names_shuf = []
        index_shuf = range(len(batch_class_indices))
        shuffle(index_shuf)
        for k in index_shuf:
            batch_class_indices_shuf.append(batch_class_indices[k])
            batch_docs_names_shuf.append(batch_docs_names[k])
        # Append batch to full lists
        class_indices += batch_class_indices_shuf
        docs_names += batch_docs_names_shuf
    print "Reading and saving docs from file system"
    count = 0
    class_indices_batch = []
    docs_names_batch = []
    docs_list = []
    for i in xrange(len(class_indices)):
        if (
            not count == 0 and (count % self.batchsize) == 0
        ):  # Save the batch if batchsize is reached or if the last document has been read.
            # Only save if a full batchsize of docs still remains; otherwise the
            # tail is accumulated and saved after the loop.
            if not (len(class_indices) - count) < self.batchsize:
                print "Read ", str(count), " of ", len(class_indices)
                self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
                batches.append(count)
                # Reset the lists
                docs_list = []
                docs_names_batch = []
                class_indices_batch = []
        d = s.load(open(docs_names[i], "rb"))
        docs_list.append(d)
        docs_names_batch.append(docs_names[i])
        class_indices_batch.append(class_indices[i])
        count += 1
    # Save the remaining docs
    if len(docs_list) > 0:
        print "Read ", str(count), " of ", len(class_indices)
        self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
        batches.append(count)
    s.dump(class_names, open(env_paths.get_class_names_path(self.training), "wb"))
    s.dump(batches, open(env_paths.get_batches_path(self.training), "wb"))
    return 1
def __read_docs_from_filesystem(self):
    """
    Read all docs and assign them to batches, so that each doc category is represented equally across batches.

    Phase 1: walk self.paths, collecting per-class document names and class
    indices (honouring trainingset_size / trainingset_attributes slicing).
    Phase 2: distribute documents round-robin across batches proportionally to
    each class's size, top up short batches, and shuffle within each batch.
    Phase 3: load each document, serialize them batch by batch, and persist
    the class names and batch list.

    @return: 1 on success, 0 if no stemmed docs were found or batchsize is too large.
    """
    docs_names = []
    docs_names_split = []
    class_indices = []
    class_indices_split = []
    class_names = []
    batches = []
    print 'Generating class indices and docs names list.'
    doc_count = 0
    for folder in self.paths:
        docs_names_split.append([])
        class_indices_split.append([])
        # Class name is the last path component of the folder.
        class_names.append(folder.split('/')[len(folder.split('/')) - 1])
        if self.trainingset_size == None:  # If data processing should be done on all data in the specified folders.
            docs = os.listdir(folder)
        elif not self.trainingset_size == None and self.trainingset_attributes == None:  # If data processing should be done on parts of the docs in the specified folders - for training and testing purposes.
            docs = os.listdir(folder)[:int(len(os.listdir(folder)) * self.trainingset_size)]
        else:  # If data processing should be done on a test set.
            docs = os.listdir(folder)[int(len(os.listdir(folder)) * self.trainingset_size):]
        for doc in docs:
            if doc.endswith('.p'):  # Append the name of the document to the list containing document names.
                docs_names_split[-1].append(folder + '/' + doc)
                class_indices_split[-1].append(len(class_names) - 1)
                doc_count += 1
    if len(docs_names_split) == 0:  # Check if docs have been stemmed.
        print 'Documents have not been stemmed. Please stem documents in order to create bag of words matrices.'
        return 0
    # Ensure that batches contain an even amount of docs from each category.
    print 'Arranging the documents.'
    if doc_count < self.batchsize:
        print 'Number of documents must be greater than batchsize. Please revise the batchsize.'
        return 0
    # NOTE: Python 2 integer division — any remainder is folded into the last batch.
    number_of_batches = doc_count / self.batchsize
    number_of_classes = len(self.paths)
    batches_collected_class_indices = []
    batches_collected_docs_names = []
    # Calculate fraction of category in each batch.
    d = {}
    for i in range(len(class_indices_split)):
        d[i] = float(len(class_indices_split[i])) / number_of_batches
    count = 0
    for i in range(number_of_batches):
        batch_class_indices = []
        batch_docs_names = []
        # Per-batch quota per class (floor of the fractional share).
        d_tmp = array([int(v) for v in d.values()])
        while True:
            # Stop when the batch is full (and enough docs remain for another
            # batch) or when every document has been consumed.
            if (len(batch_class_indices) == self.batchsize) and (not doc_count - count < self.batchsize) or (
                    count == doc_count):
                break
            if len(d_tmp[d_tmp > 0]) == 0:  # all per-class quotas exhausted
                break
            for j in range(number_of_classes):
                if (len(batch_class_indices) == self.batchsize) and (not doc_count - count < self.batchsize) or (
                        count == doc_count):
                    break
                if len(class_indices_split[j]) > 0 and d_tmp[j] != 0:
                    batch_class_indices.append(class_indices_split[j].pop(0))
                    batch_docs_names.append(docs_names_split[j].pop(0))
                    d_tmp[j] -= 1
                    count += 1
        batches_collected_class_indices.append(batch_class_indices)
        batches_collected_docs_names.append(batch_docs_names)
    for i in range(number_of_batches):
        # Last batch absorbs the remainder docs.
        bsize = self.batchsize if i < number_of_batches - 1 else self.batchsize + (doc_count % self.batchsize)
        batch_class_indices = batches_collected_class_indices[i]
        batch_docs_names = batches_collected_docs_names[i]
        if len(batch_class_indices) < bsize:
            # Top up short batches from whatever classes still have documents.
            while True:
                if len(batch_class_indices) == bsize:
                    break
                for j in range(number_of_classes):
                    if len(batch_class_indices) == bsize:
                        break
                    if len(class_indices_split[j]) > 0:
                        batch_class_indices.append(class_indices_split[j].pop(0))
                        batch_docs_names.append(docs_names_split[j].pop(0))
        # Shuffle the batch
        batch_class_indices_shuf = []
        batch_docs_names_shuf = []
        index_shuf = range(len(batch_class_indices))
        shuffle(index_shuf)
        for k in index_shuf:
            batch_class_indices_shuf.append(batch_class_indices[k])
            batch_docs_names_shuf.append(batch_docs_names[k])
        # Append batch to full lists
        class_indices += batch_class_indices_shuf
        docs_names += batch_docs_names_shuf
    print 'Reading and saving docs from file system'
    count = 0
    class_indices_batch = []
    docs_names_batch = []
    docs_list = []
    for i in xrange(len(class_indices)):
        if not count == 0 and (
                count % self.batchsize) == 0:  # Save the batch if batchsize is reached or if the last document has been read.
            # Only save if a full batchsize of docs still remains; otherwise the
            # tail is accumulated and saved after the loop.
            if not (len(class_indices) - count) < self.batchsize:
                print 'Read ', str(count), ' of ', len(class_indices)
                self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
                batches.append(count)
                # Reset the lists
                docs_list = []
                docs_names_batch = []
                class_indices_batch = []
        d = s.load(open(docs_names[i], 'rb'))
        docs_list.append(d)
        docs_names_batch.append(docs_names[i])
        class_indices_batch.append(class_indices[i])
        count += 1
    # Save the remaining docs
    if len(docs_list) > 0:
        print 'Read ', str(count), ' of ', len(class_indices)
        self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
        batches.append(count)
    s.dump(class_names, open(env_paths.get_class_names_path(self.training), "wb"))
    s.dump(batches, open(env_paths.get_batches_path(self.training), "wb"))
    return 1