def remove_stop_postag(dataset, output_dir):
    utils.mkdir(output_dir)
    stack = os.listdir(dataset)
    # print('loading data in ' + dataset)
    total_doc = 0
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as fr:
                data = unicodedata.normalize('NFKC', fr.read().strip())
            original_content = tokenizer.predict(data)
            content = map(lambda x: ViPosTagger.postagging(x),
                          spliter.split(original_content))
            clean_content = []
            for info in content:
                sen = []
                for i in range(len(info[0])):
                    if is_exist(info[1][i]):
                        sen.append(info[0][i])
                clean_content.append(u' '.join(sen))
            with open(os.path.join(output_dir, os.path.basename(file_name)),
                      'w', encoding='utf-8') as fw:
                if len(clean_content) > 0:
                    fw.write(u'\n'.join(clean_content))
                else:
                    fw.write(original_content)
            total_doc += 1
def load_dataset_from_disk(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    print(list_samples)
    print('load_data in ' + dataset)
    # list files and folders in the dataset directory
    stack = os.listdir(dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('%s' % file_path)
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
            # tokenize the content
            content = r.run(tokenizer.predict(content))
            # the label is the name of the directory containing file_path
            dir_name = utils.get_dir_name(file_path)
            list_samples[dir_name].append(content)
    print('')
    return list_samples
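# The functions in this file rely on a few project helpers (utils.mkdir,
# utils.push_data_to_stack, utils.get_dir_name) that are not shown here.
# Below is a minimal sketch of what they likely do, inferred from how they
# are called; the real implementations in utils may differ.
import os


def mkdir(path):
    # create the directory if it does not already exist
    if not os.path.exists(path):
        os.makedirs(path)


def push_data_to_stack(stack, file_path, file_name):
    # push the entries of a sub-directory onto the stack, keeping their
    # paths relative to the dataset root so os.path.join(dataset, ...) works
    for child in os.listdir(file_path):
        stack.append(os.path.join(file_name, child))


def get_dir_name(file_path):
    # name of the directory that directly contains file_path (used as label)
    return os.path.basename(os.path.dirname(file_path))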
def load_document_content(dataset, documents):
    stack = os.listdir(dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = dataset + '/' + file_name
        if os.path.isdir(file_path):
            # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                try:
                    raw_content = unicodedata.normalize(
                        'NFKC', f.read().strip()).split(u'\n')
                    new_content = []
                    for i, sen in enumerate(raw_content):
                        if i == 0:
                            # highlight the title
                            sen = u'<h2>' + sen + u'</h2>'
                        elif i == 1:
                            sen = u'<h5>' + sen + u'</h5>'
                        else:
                            sen = sen + u'<br>'
                        new_content.append(sen)
                    documents.update(
                        {raw_content[0].lower(): u'\n'.join(new_content)})
                except Exception:
                    # skip files that cannot be read or parsed
                    continue
def build_vocab(dataset, output_vocab, root_dir, title_map):
    vectorizer = TfidfVectorizer(
        ngram_range=(1, 1), max_df=0.6, min_df=1,
        stop_words=utils.load_data_from_list(
            os.path.join(root_dir, 'stopwords.txt')))
    stack = os.listdir(dataset)
    contents = []
    titles = []
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            base = os.path.basename(file_name)
            titles.append(title_map[base])
            contents.append(content.lower())
    # relax max_df on small corpora so no document ends up with an empty vector
    if len(contents) < 50:
        vectorizer.max_df = 1.0
    vectorizer.fit(contents)
    with open(output_vocab, 'w', encoding='utf-8') as f:
        vocab = {w: i for i, w in enumerate(vectorizer.vocabulary_.keys())}
        f.write(u'\n'.join(vocab.keys()))
    return contents, titles
def update_title_map(dataset, title_map):
    stack = os.listdir(dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = unicodedata.normalize('NFKC', f.read().strip())
            # the first line of each document is treated as its title
            title = data.lower().split(u'\n')[0]
            base = os.path.basename(file_name)
            title_map.update({base: title})
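# One possible way to wire update_title_map and build_vocab together,
# assuming `dataset` is a directory of plain-text documents whose first
# line is the title and `root_dir` contains stopwords.txt. The driver
# function and the vocab.txt path are illustrative, not part of the
# original code.
def build_vocab_for_dataset(dataset, root_dir):
    title_map = {}
    update_title_map(dataset, title_map)
    contents, titles = build_vocab(dataset,
                                   os.path.join(root_dir, 'vocab.txt'),
                                   root_dir, title_map)
    return contents, titles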
def load_data_from_file(dataset):
    data = {}
    stack = os.listdir(dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            img = cv2.imread(file_path)
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            # the loan id is encoded in the file name: <...>_<...>_<loan_id>.<ext>
            loan_id = file_name.split('_')[2].split('.')[0]
            data[loan_id] = gray
    return data
def load_data_from_file(dataset):
    data = {}
    stack = os.listdir(dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            img = cv.imread(file_path)
            loan_id = file_name.split('_')[2].split('.')[0]
            # pad first to the dataset-wide maximum, then to the final input size
            img = padding_img(img, max_width, max_height)
            img = padding_img(img, final_width, final_height)
            data[loan_id] = img
    return data
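# padding_img is not defined in this snippet. A minimal sketch of the
# assumed behaviour: pad the image with black borders so it reaches the
# requested width/height without resizing the content. The real helper
# may centre the image or pad with a different colour.
import cv2 as cv


def padding_img(img, target_width, target_height):
    h, w = img.shape[:2]
    bottom = max(target_height - h, 0)
    right = max(target_width - w, 0)
    return cv.copyMakeBorder(img, 0, bottom, 0, right,
                             cv.BORDER_CONSTANT, value=0)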
def load_dataset(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print(file_path)
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
            content = r.run(tokenizer.predict(content))
            dir_name = utils.get_dir_name(file_path)
            list_samples[dir_name].append(content)
    return list_samples
def parse_training_data(dataset, output):
    docs = []
    utils.mkdir(output)
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = dataset + '/' + file_name
        if os.path.isdir(file_path):
            # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            # otherwise read and parse the file
            with open(file_path, 'r', encoding='utf-8') as ff:
                content = ff.read()
            bs = BeautifulSoup(content, 'html.parser')
            docs.append([bs.text])
            # with open(output + '/' + file_name, 'w', encoding='utf-8') as f:
            #     f.write(bs.text)
    return docs
def load_data_from_file(dataset):
    data = {}
    stack = os.listdir(dataset)
    max_h, max_w = 0, 0
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            img = cv.imread(file_path)
            loan_id = file_name.split('_')[2].split('.')[0]
            data[loan_id] = img
            # track the largest image dimensions seen so far
            if img.shape[0] > max_h:
                max_h = img.shape[0]
            if img.shape[1] > max_w:
                max_w = img.shape[1]
    # print(f'{max_h} {max_w}')
    return data
def count_tokens():
    print('count tokens...')
    statistic = {name: {} for name in my_map.name2label.keys()}
    stack = os.listdir(tokenized_dataset)
    print('loading data in ' + tokenized_dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(tokenized_dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                label = utils.get_dir_name(file_path)
                for sen in fp:
                    sen = sen.strip()
                    tag = ViPosTagger.postagging(sen)
                    # keep only the tokens tagged as nouns
                    tokens = [tag[0][i] for i in range(len(tag[0]))
                              if tag[1][i] == u'N']
                    update_count_tokens(statistic, label, tokens)
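# update_count_tokens is not shown here. A plausible sketch, assuming
# `statistic` maps each label to a token-frequency dict; the original
# helper may differ.
def update_count_tokens(statistic, label, tokens):
    counter = statistic[label]
    for token in tokens:
        counter[token] = counter.get(token, 0) + 1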
def tokenizer_dataset():
    utils.mkdir(tokenized_dataset)
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while len(stack) > 0:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
            content = r.run(tokenizer.predict(content))
            # mirror the source directory structure under tokenized_dataset
            dir_name = utils.get_dir_name(file_path)
            output_dir = os.path.join(tokenized_dataset, dir_name)
            utils.mkdir(output_dir)
            name = os.path.basename(file_path)
            with open(os.path.join(output_dir, name), 'w',
                      encoding='utf-8') as fw:
                fw.write(content)
    print('')