# Assumed to be imported earlier in this module: os, sys, unicodedata, the
# project helpers (utils, utilities, tokenizer, spliter, my_regex, my_map, r,
# is_exist) and pyvi's ViPosTagger.


def remove_stop_postag(dataset, output_dir):
    utils.mkdir(output_dir)
    stack = os.listdir(dataset)
    # print('loading data in ' + dataset)
    total_doc = 0
    while stack:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as fr:
                data = unicodedata.normalize('NFKC', fr.read().strip())
                original_content = tokenizer.predict(data)
                # POS-tag every sentence of the tokenized document
                content = [ViPosTagger.postagging(sen)
                           for sen in spliter.split(original_content)]
                clean_content = []
                for info in content:
                    sen = []
                    for i in range(len(info[0])):
                        # keep only tokens whose POS tag passes the filter
                        if is_exist(info[1][i]):
                            sen.append(info[0][i])
                    clean_content.append(u' '.join(sen))
            with open(os.path.join(output_dir, os.path.basename(file_name)),
                      'w', encoding='utf-8') as fw:
                if len(clean_content) > 0:
                    fw.write(u'\n'.join(clean_content))
                else:
                    fw.write(original_content)
            total_doc += 1
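# The traversal above (and in the loaders below) relies on
# utils.push_data_to_stack to walk nested directories. A minimal,
# self-contained sketch of that pattern, where the helper is a hypothetical
# stand-in assuming it prefixes child entries with their parent directory so
# the joined paths stay valid:
#
#     def push_data_to_stack(stack, dir_path, dir_name):
#         for child in os.listdir(dir_path):
#             stack.append(os.path.join(dir_name, child))
#
#     def iter_files(root):
#         stack = os.listdir(root)
#         while stack:
#             name = stack.pop()
#             path = os.path.join(root, name)
#             if os.path.isdir(path):
#                 push_data_to_stack(stack, path, name)
#             else:
#                 yield path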
def load_dataset_from_list(list_samples, remove_tags=False):
    result = []
    for sample in list_samples:
        if remove_tags:
            # drop everything after the '[tags] : ' marker
            sample = sample.split(u'[tags] : ')
            sample = sample[0]
        sample = r.run(tokenizer.predict(sample))
        result.append(sample)
    return result
def preprocessing(data, tokenize=True):
    data = unicodedata.normalize('NFKC', data)
    if tokenize:
        data = tokenizer.predict(data)
    # strip URLs, e-mail addresses, dates/times and numbers
    data = my_regex.detect_url.sub(u'', data)
    data = my_regex.detect_url2.sub(u'', data)
    data = my_regex.detect_email.sub(u'', data)
    data = my_regex.detect_datetime.sub(u'', data)
    data = my_regex.detect_num.sub(u'', data)
    # pad the kept punctuation marks with spaces, then drop the rest
    data = my_regex.normalize_special_mark.sub(r' \g<special_mark> ', data)
    data = my_regex.detect_exception_chars.sub(u'', data)
    data = my_regex.detect_special_mark.sub(u'', data)
    data = my_regex.detect_special_mark2.sub(u'', data)
    data = my_regex.detect_special_mark3.sub(u'', data)
    # collapse consecutive whitespace into a single space
    data = my_regex.normalize_space.sub(u' ', data)
    return data.strip()
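# The my_regex patterns above are project-specific; a rough, self-contained
# sketch of the same cleanup idea using only the stdlib. The patterns below
# are illustrative guesses, not the project's actual definitions:
#
#     import re
#
#     detect_url = re.compile(r'https?://\S+')
#     detect_email = re.compile(r'\S+@\S+\.\S+')
#     detect_num = re.compile(r'\d+')
#     normalize_space = re.compile(r'\s+')
#
#     def simple_clean(text):
#         text = unicodedata.normalize('NFKC', text)
#         text = detect_url.sub(u'', text)
#         text = detect_email.sub(u'', text)
#         text = detect_num.sub(u'', text)
#         text = normalize_space.sub(u' ', text)
#         return text.strip()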
def load_dataset(dataset):
    list_samples = {k: [] for k in my_map.name2label.keys()}
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while stack:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            # if it is a directory, push its contents onto the stack
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print(file_path)
            with open(file_path, 'r', encoding='utf-16') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                content = r.run(tokenizer.predict(content))
                dir_name = utils.get_dir_name(file_path)
                # group samples by the label derived from the directory name
                list_samples[dir_name].append(content)
    return list_samples
def load_dataset_from_disk(dataset, remove_tags=False):
    list_samples = []
    stack = os.listdir(dataset)
    print('loading data in ' + dataset)
    while stack:
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if os.path.isdir(file_path):
            utilities.push_data_to_stack(stack, file_path, file_name)
        else:
            # overwrite the same console line with the current file path
            print('\r%s' % file_path, end='')
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                content = unicodedata.normalize('NFKC', fp.read())
                if remove_tags:
                    # drop everything after the '[tags] : ' marker
                    content = content.split(u'[tags] : ')
                    content = content[0]
                content = r.run(tokenizer.predict(content))
                list_samples.append(content)
    print('')
    return list_samples
def load_dataset_from_list(list_samples):
    result = []
    for sample in list_samples:
        sample = r.run(tokenizer.predict(sample))
        result.append(sample)
    return result
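# Hypothetical usage of the loaders above; the paths are placeholders and the
# project modules must already be importable for this to run:
#
#     if __name__ == '__main__':
#         samples_by_label = load_dataset('dataset/train')        # dict: label -> docs
#         raw_samples = load_dataset_from_disk('dataset/test', remove_tags=True)
#         cleaned = load_dataset_from_list(raw_samples)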