def get_char_id_map(self):
    class_label_id_map = {}
    char_freq_map = {}
    all_files = []
    print("Reading the corpus and counting character frequencies")
    utils.find_all_files(self.parameters['train_corpus_dir'], all_files)
    count = 0
    for file_name in all_files:
        # The parent directory name is the class label.
        class_name = file_name.split("/")[-2]
        if class_name not in class_label_id_map:
            class_label_id_map[class_name] = len(class_label_id_map)
        this_class_sample_size = 0
        for lines in utils.read_lines_small_file(file_name):
            for line in lines:
                line = preProcess.filtUrl(line)
                line = line.replace(' ', '').replace('\n', '')
                if len(line) < 10:
                    continue
                count += 1
                this_class_sample_size += 1
                # if this_class_sample_size == 100: break
                if count % 10000 == 0:
                    print("Read", count, "lines so far; character-set size is",
                          len(char_freq_map))
                for char in line:
                    char_freq_map[char] = char_freq_map.get(char, 0) + 1
    print("Assigning an id to each character")
    char_id_map = {'unk': 0, 'pad_char': 1, 'stop_char': 2}
    id_char_map = {0: 'unk', 1: 'pad_char', 2: 'stop_char'}
    init_char_id_map_size = len(char_id_map)
    # Keep the most frequent characters, reserving room for the special ids.
    char_freq_list = sorted(char_freq_map.items(),
                            key=lambda x: x[1],
                            reverse=True)[:self.parameters['char_set_size']
                                          - len(id_char_map)]
    for i in range(len(char_freq_list)):
        [char, _] = char_freq_list[i]
        if char not in stop_chars:  # stop_chars is defined elsewhere in the module
            # Note: skipping a stop char leaves a gap at id i + init_char_id_map_size.
            char_id_map[char] = i + init_char_id_map_size
            id_char_map[i + init_char_id_map_size] = char
    with open(self.parameters['char_id_map_file'], 'wb') as f:
        pickle.dump(char_id_map, f)
    print(char_id_map.keys())
    with open(self.parameters['id_char_map_file'], 'wb') as f:
        pickle.dump(id_char_map, f)
    with open(self.parameters['class_label_id_map_file'], 'wb') as f:
        pickle.dump(class_label_id_map, f)
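# The classifier methods below call self.trans_char2id, which is not shown in
# these snippets. A minimal sketch of what it likely does, assuming the pickled
# char_id_map produced by get_char_id_map above; the fixed-length padding with
# 'pad_char' is an assumption, not taken from the source.
import pickle

def trans_char2id_sketch(text, char_id_map_file, max_text_length=100):
    with open(char_id_map_file, 'rb') as f:
        char_id_map = pickle.load(f)
    unk_id = char_id_map['unk']        # id 0: out-of-vocabulary characters
    pad_id = char_id_map['pad_char']   # id 1: padding
    # Map each character to its id, falling back to 'unk'.
    id_list = [char_id_map.get(ch, unk_id) for ch in text[:max_text_length]]
    # Pad to a fixed length so batches stack into a rectangular array (assumed).
    id_list += [pad_id] * (max_text_length - len(id_list))
    return id_list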
def load_test_data(self):
    x_batch = []
    y_batch = []
    class_num_map = {}
    test_file_list = utils.find_all_files(self.parameters['test_corpus_dir'], [])
    lines = []
    for file_name in test_file_list:
        lines += utils.read_lines_small_file(file_name)
    random.shuffle(lines)
    for text_file in lines:
        [text, file_name] = text_file
        text = preProcess.filtUrl(text)
        text = text.replace(" ", '').replace(
            '\n', '')[:self.parameters['max_text_length']]
        if len(text) == 0:
            continue
        id_list = self.trans_char2id(text)
        class_label = file_name.split('/')[-2]
        class_num_map[class_label] = class_num_map.get(class_label, 0) + 1
        # if class_num_map[class_label] > 20: continue  # optional per-class cap
        class_label_one_hot = self.class_label_one_hot[class_label]
        x_batch.append(id_list)
        y_batch.append(class_label_one_hot)
    x_batch = np.array(x_batch)
    y_batch = np.array(y_batch)
    return x_batch, y_batch
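# self.class_label_one_hot is used throughout but never constructed in these
# snippets. A plausible sketch, assuming the class_label_id_map pickled by
# get_char_id_map; the helper name and dtype are assumptions.
import pickle
import numpy as np

def build_class_label_one_hot(class_label_id_map_file):
    with open(class_label_id_map_file, 'rb') as f:
        class_label_id_map = pickle.load(f)
    num_classes = len(class_label_id_map)
    one_hot = {}
    for label, idx in class_label_id_map.items():
        vec = np.zeros(num_classes, dtype=np.float32)
        vec[idx] = 1.0  # one-hot row for this class
        one_hot[label] = vec
    return one_hot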
def fit(self, if_static_embeding=True):
    file_list = utils.find_all_files(self.parameters['train_corpus_dir'], [])
    test_input, test_output = self.load_test_data()
    count = 0
    batch_size = 50
    for epoch in range(10000):
        x_batch = []
        y_batch = []
        lines = []
        for file_name in file_list:
            lines += utils.read_lines_small_file(file_name)
        # Shuffle the order.
        random.shuffle(lines)
        for text_file in lines:
            [text, file_name] = text_file
            text = preProcess.filtUrl(text)
            text = text.replace(" ", '').replace(
                '\n', '')[:self.parameters['max_text_length']]
            id_list = self.trans_char2id(text)
            class_label = file_name.split('/')[-2]
            class_label_one_hot = self.class_label_one_hot[class_label]
            count += 1
            x_batch.append(id_list)
            y_batch.append(class_label_one_hot)
        x_batch = np.array(x_batch)
        y_batch = np.array(y_batch)
        # Train on mini-batches.
        for i in range(0, y_batch.shape[0], batch_size):
            a_x_batch = x_batch[i:i + batch_size, :]
            a_y_batch = y_batch[i:i + batch_size, :]
            Y, prob_dist, _, loss_1, accuracy = self.sess.run(
                [self.Y, self.prob_dist, self.train, self.losses, self.accuracy],
                feed_dict={self.X: a_x_batch, self.Y: a_y_batch})
        # Evaluate on the test set and log a summary after each epoch.
        loss, accuracy = self.sess.run(
            [self.losses, self.accuracy],
            feed_dict={self.X: test_input, self.Y: test_output})
        merg = self.sess.run(
            self.merged,
            feed_dict={self.X: test_input, self.Y: test_output})
        self.writer.add_summary(merg, epoch)
        print("epoch", epoch, "last train-batch loss", loss_1,
              "test loss", loss, "test accuracy", accuracy)
        self.saver.save(self.sess, self.parameters['check_points_dir'] + '/model')
    tf.reset_default_graph()
    self.writer.close()
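# The fit methods here assume a prebuilt TF1 graph exposing self.X, self.Y,
# self.losses, self.train, self.accuracy (and, above, self.prob_dist and
# self.merged). A minimal standalone sketch of such a graph; the architecture
# (mean-pooled char embeddings into one dense layer) is an assumption, not the
# source's actual model.
import tensorflow as tf  # TF1-style API, matching the sess.run calls above

def build_toy_graph(char_set_size=5000, max_text_length=100,
                    num_classes=10, emb_dim=128):
    X = tf.placeholder(tf.int32, [None, max_text_length], name='X')
    Y = tf.placeholder(tf.float32, [None, num_classes], name='Y')
    embeddings = tf.get_variable('char_emb', [char_set_size, emb_dim])
    pooled = tf.reduce_mean(tf.nn.embedding_lookup(embeddings, X), axis=1)
    logits = tf.layers.dense(pooled, num_classes)
    prob_dist = tf.nn.softmax(logits)
    losses = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y, logits=logits))
    train = tf.train.AdamOptimizer(1e-3).minimize(losses)
    correct = tf.equal(tf.argmax(prob_dist, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    return X, Y, prob_dist, losses, train, accuracy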
def fit(self, if_static_embeding=True):
    model = None
    file_list = utils.find_all_files(
        self.parameters['train_corpus_for_embedding'], [])
    print("Number of training files:", len(file_list))
    count = 0
    x_batch = []
    step = 0
    random.shuffle(file_list)
    for file_name in file_list:
        lines = utils.read_lines_small_file(file_name)
        # Each line is '#'-separated; field 6 holds the post body.
        lines = list(map(lambda x: x.split('#'), lines))
        lines = list(
            filter(lambda x: len(x) == 8 and len(x[6]) > 50, lines))
        # Strip crawler markup left around the post body.
        lines = list(map(lambda x: x[6].split('kabukabu')[1].replace(
            'd_post_content j_d_post_content clearfix"> ', ''), lines))
        text = ''.join(lines).replace(' ', '')
        if len(text) == 0:
            continue
        text = list(text)  # character-level tokens
        count += 1
        x_batch.append(text)
        if len(x_batch) == 10:
            # Shuffle the order.
            random_index = list(range(10))
            random.shuffle(random_index)
            x_batch = np.array(x_batch)[random_index]
            # Train: build the model on the first batch, then update it
            # incrementally on later batches.
            print("Training batch", step)
            if model is None:
                model = Word2Vec(x_batch, size=200, window=5,
                                 min_count=5, workers=8, iter=200)
            else:
                model.build_vocab(x_batch, update=True)
                model.train(x_batch, total_examples=x_batch.shape[0],
                            epochs=200)
            step += 1
            x_batch = []
        if count % 50 == 0 and model is not None:
            # Periodically checkpoint the embedding model.
            model.save("./model/word2vec.model")
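# A quick usage sketch for the saved embedding, using gensim's standard
# Word2Vec API; the query character '好' is only an illustrative placeholder
# and will raise KeyError if it was pruned by min_count.
from gensim.models import Word2Vec

model = Word2Vec.load("./model/word2vec.model")
vector = model.wv['好']                        # 200-dim vector for one character
print(model.wv.most_similar('好', topn=5))     # nearest neighbours by cosine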
def fit(self, if_static_embeding=True):
    file_list = utils.find_all_files(self.parameters['train_corpus_dir'], [])
    test_input, test_output = self.load_test_data()
    count = 0
    x_batch = []
    y_batch = []
    for epoch in range(1000):
        random.shuffle(file_list)
        for file_name in file_list:
            print(file_name)
            lines = utils.read_lines_small_file(file_name)
            text = self.get_title_content(lines)
            if len(text) == 0:
                continue
            id_list = self.trans_char2id(text)
            class_label = file_name.split('/')[-2]
            # print(self.class_label_one_hot)  # debug
            class_label_one_hot = self.class_label_one_hot[class_label]
            count += 1
            x_batch.append(id_list)
            y_batch.append(class_label_one_hot)
            if len(x_batch) == 500:
                # Shuffle the order.
                random_index = list(range(500))
                random.shuffle(random_index)
                x_batch = np.array(x_batch)[random_index]
                y_batch = np.array(y_batch)[random_index]
                # Train on this batch and print the loss.
                _, loss, accuracy = self.sess.run(
                    [self.train, self.losses, self.accuracy],
                    feed_dict={self.X: x_batch, self.Y: y_batch})
                x_batch = []
                y_batch = []
                if count % 5000 == 0:
                    print('epoch', epoch, 'loss is', loss,
                          'accuracy is', accuracy)
                    loss, accuracy = self.sess.run(
                        [self.losses, self.accuracy],
                        feed_dict={self.X: test_input, self.Y: test_output})
                    print("test-set loss:", loss, "accuracy:", accuracy)
                    self.saver.save(
                        self.sess,
                        self.parameters['check_points_dir'] + '/model')
def parse_original_xmls(dirname, pickle=True):
    # Cache parsed annotations next to the XML files.
    pickle_file = os.path.join(dirname, 'annotation.pkl') if pickle else None
    if pickle and os.path.isfile(pickle_file):
        logging.info("Loading annotations from file %s" % pickle_file)
        with open(pickle_file, 'rb') as f:  # binary mode works for any pickle protocol
            annotations = cPickle.load(f)
        logging.info("Load annotations complete")
    else:
        logging.info("Reading annotations")
        annotations = []
        xml_files = find_all_files(dirname, '.xml')
        for f in xml_files:
            annotations.append(parse(f))
        if pickle and not os.path.isfile(pickle_file):
            logging.info("Saving annotations to file %s" % pickle_file)
            with open(pickle_file, 'wb') as f:
                cPickle.dump(annotations, f)
    return annotations
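# Example call; the directory path is illustrative. The first run parses every
# .xml file and writes annotation.pkl; later runs load the cached pickle.
annotations = parse_original_xmls('/data/annotations')  # hypothetical path
print(len(annotations), "annotations loaded")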
def cleanup_checkpoints(self) -> None:
    if not self.is_remove_old_checkpoint:
        # Do nothing if the model does not save latest checkpoints
        # or if all checkpoints are kept.
        return
    checkpoint_paths = find_all_files(
        checkpoint_dir=self.log_dir,
        search_pattern=self.latest_checkpoint_pattern)
    # Sort by recency (largest step first).
    checkpoint_paths.sort(
        key=lambda x: int(
            re.search(self.latest_checkpoint_pattern, x.name).group(1)),
        reverse=True)
    # Remove everything beyond the newest num_latest_checkpoints_kept files.
    for checkpoint_path in checkpoint_paths[self.num_latest_checkpoints_kept:]:
        print(f"Removing old checkpoint \"{checkpoint_path}\"", flush=True)
        checkpoint_path.unlink()
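# A standalone sketch of the sort-by-step logic above; the filename scheme
# 'ckpt_<step>.pt' and the capture-group pattern are illustrative, not from
# the source.
import re

pattern = r"ckpt_(\d+)\.pt"
names = ["ckpt_100.pt", "ckpt_2000.pt", "ckpt_500.pt"]
names.sort(key=lambda n: int(re.search(pattern, n).group(1)), reverse=True)
print(names)         # ['ckpt_2000.pt', 'ckpt_500.pt', 'ckpt_100.pt']
keep = 2
print(names[keep:])  # would be deleted: ['ckpt_100.pt']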
def load_test_data(self):
    x_batch = []
    y_batch = []
    class_num_map = {}
    test_file_list = utils.find_all_files(self.parameters['test_corpus_dir'], [])
    for file_name in test_file_list:
        lines = utils.read_lines_small_file(file_name)
        text = self.get_title_content(lines)
        if len(text) == 0:
            continue
        id_list = self.trans_char2id(text)
        class_label = file_name.split('/')[-2]
        class_num_map[class_label] = class_num_map.get(class_label, 0) + 1
        if class_num_map[class_label] > 20:
            # Cap the test set at 20 samples per class.
            continue
        # print(self.class_label_one_hot)  # debug
        class_label_one_hot = self.class_label_one_hot[class_label]
        x_batch.append(id_list)
        y_batch.append(class_label_one_hot)
    x_batch = np.array(x_batch)
    y_batch = np.array(y_batch)
    return x_batch, y_batch
def execute_operation(self):
    # Call different functions depending on the radio-button selection and
    # build dictionaries for report generation.
    folder = self.katalog_entry.get()
    ext = self.ext_entry.get()
    keyword = self.keyword_entry.get()
    date = self.date_entry.get()
    if self.radiovar.get() == 1:
        if folder:
            if os.path.isdir(folder):
                list_tmp = utils.find_all_files(folder)
                self.match_hashset += utils.verify_files(list_tmp)
                utils.create_dict(folder, self.allfiles, list_tmp)
                self.display_results(list_tmp)
            else:
                tkinter.messagebox.showerror(
                    'Error', 'This is not a valid directory!')
        else:
            tkinter.messagebox.showerror('Error', 'You must specify a directory!')
    elif self.radiovar.get() == 2:
        if folder and ext:
            if os.path.isdir(folder):
                list_tmp = utils.find_specific_files(folder, ext)
                self.match_hashset += utils.verify_files(list_tmp)
                utils.create_dict(ext, self.specificfiles, list_tmp)
                self.display_results(list_tmp)
            else:
                tkinter.messagebox.showerror(
                    'Error', 'This is not a valid directory!')
        else:
            tkinter.messagebox.showerror(
                'Error', 'You must specify both a directory and a file extension!')
    elif self.radiovar.get() == 3:
        if folder and ext and keyword:
            if os.path.isdir(folder):
                list_tmp = utils.search_files(folder, ext, keyword)
                self.match_hashset += utils.verify_files(list_tmp)
                utils.create_dict(keyword, self.infofiles, list_tmp)
                self.display_results(list_tmp)
            else:
                tkinter.messagebox.showerror(
                    'Error', 'This is not a valid directory!')
        else:
            tkinter.messagebox.showerror(
                'Error', 'You must specify a directory, a file extension, and a keyword!')
    elif self.radiovar.get() == 4:
        if folder and date:
            if os.path.isdir(folder):
                list_tmp = utils.find_modified_files(folder, date)
                self.match_hashset += utils.verify_files(list_tmp)
                utils.create_dict(date, self.datefiles, list_tmp)
                self.display_results(list_tmp)
            else:
                tkinter.messagebox.showerror(
                    'Error', 'This is not a valid directory!')
        else:
            tkinter.messagebox.showerror(
                'Error', 'You must specify a directory and a date!')
def main():
    allfiles = dict()
    specificfiles = dict()
    infofiles = dict()
    datefiles = dict()
    match_hashset = list()
    while True:
        print("\n")
        print("################################################")
        print("# [1]Search [2]Encryption [3]File Difference #")
        print("# [4]System Info [5]Generate report #")
        print('# q or "exit" to exit #')
        print("################################################")
        ch = input("$ ")
        # Search in files
        if ch == "1":
            while True:
                print("\n")
                print("##########################################")
                print("# [1] Find all files [2] File Extension #")
                print("# [3] By date [4] Search in files #")
                print('# q or "back" to go back #')
                print("##########################################")
                ch2 = input("$ ")
                if ch2 == "1":
                    path = input("$ Path to folder: ")
                    if path == "q" or path == "back":
                        break
                    list_tmp = utils.find_all_files(path)
                    utils.create_dict(path, allfiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)
                if ch2 == "2":
                    ext = input("$ Extension: ")
                    if ext == "q" or ext == "back":
                        break
                    folder = input("$ Path to folder: ")
                    if folder == "q" or folder == "back":
                        break
                    list_tmp = utils.find_specific_files(folder, ext)
                    utils.create_dict(ext, specificfiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)
                if ch2 == "3":
                    folder = input("$ Path to folder: ")
                    if folder == "q" or folder == "back":
                        break
                    date = input("$ Date (Ex format: 2020-03-03): ")
                    if date == "q" or date == "back":
                        break
                    list_tmp = utils.find_modified_files(folder, date)
                    utils.create_dict(date, datefiles, list_tmp)
                    # Accumulate matches like the other branches
                    # (the original overwrote match_hashset here).
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)
                if ch2 == "4":
                    folder = input("$ Path to folder: ")
                    if folder == "q" or folder == "back":
                        break
                    ext = input("$ Extension: ")
                    if ext == "q" or ext == "back":
                        break
                    keyword = input("$ Keyword: ")
                    if keyword == "q" or keyword == "back":
                        break
                    list_tmp = utils.search_files(folder, ext, keyword)
                    utils.create_dict(keyword, infofiles, list_tmp)
                    match_hashset += utils.verify_files(list_tmp)
                    print_results(list_tmp)
                if ch2 == "q" or ch2 == "back":
                    break
        # Encryption
        if ch == "2":
            while True:
                print("\n")
                print("###########################")
                print("# [1] Encrypt [2] Decrypt #")
                print('# q or "back" to go back #')
                print("###########################")
                ch2 = input("$ ")
                if ch2 == "1":
                    filename = input("$ Path to file: ")
                    if filename == "q" or filename == "back":
                        break
                    utils.encrypt_file(filename)
                    print(filename + " has been encrypted.")
                if ch2 == "2":
                    filename = input("$ Path to file: ")
                    if filename == "q" or filename == "back":
                        break
                    utils.decrypt_file(filename)
                    print(filename + " has been decrypted.")
                if ch2 == "q" or ch2 == "back":
                    break
        # File Difference
        if ch == "3":
            while True:
                print("\n")
                print(' q or "back" to go back')
                file1 = input("$ File 1: ")
                if file1 == "q" or file1 == "back":
                    break
                file2 = input("$ File 2: ")
                if file2 == "q" or file2 == "back":
                    break
                file1_diff, file2_diff = utils.word_difference(file1, file2)
                print()
                print("Words in file 1, but not in file 2:")
                print_results(file1_diff)
                print("Words in file 2, but not in file 1:")
                print_results(file2_diff)
        # System info
        if ch == "4":
            print_results(utils.system_information())
        # Generate report
        if ch == "5":
            dictionary = dict()
            dictionary['sys'] = utils.system_information()
            dictionary['hashset'] = match_hashset
            dictionary['allfiles'] = allfiles
            dictionary['extfiles'] = specificfiles
            dictionary['infofiles'] = infofiles
            dictionary['datefiles'] = datefiles
            utils.gen_report(dictionary)
            print("The report has been generated!")
        if ch == "q" or ch == "exit":
            print("\n")
            print(" Cya! ")
            print("\n")
            break