def tokenize(text): """ tokenize text for word segmentation :param text: raw text input :return: tokenize text """ text = Text(text) specials = ["==>", "->", "\.\.\.", ">>"] digit = "\d+([\.,_]\d+)+" email = "(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)" web = "^(http[s]?://)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$" datetime = [ "\d{1,2}\/\d{1,2}(\/\d+)?", "\d{1,2}-\d{1,2}(-\d+)?", ] word = "\w+" non_word = "[^\w\s]" abbreviations = [ "[A-ZĐ]+\.", "Tp\.", "Mr\.", "Mrs\.", "Ms\.", "Dr\.", "ThS\." ] patterns = [] patterns.extend(abbreviations) patterns.extend(specials) patterns.extend([web, email]) patterns.extend(datetime) patterns.extend([digit, non_word, word]) patterns = "(" + "|".join(patterns) + ")" if sys.version_info < (3, 0): patterns = patterns.decode('utf-8') tokens = re.findall(patterns, text, re.UNICODE) return u" ".join(["%s" % token[0] for token in tokens])
def revise_vlsp2013_wtk_dataset(source_file, dest_file):
    copyfile(source_file, dest_file)
    with open(source_file) as f:
        content = Text(f.read())
    sentences = content.split("\n\n")
    ignores = load_ignores()
    if basename(source_file) == "train.txt":
        corpus_id = "train"
        ignores_id = ignores["train"]
    else:
        corpus_id = "test"
        ignores_id = ignores["test"]
    last_index = len(sentences) - 1
    with open(dest_file, "w") as f:
        for i, sentence in enumerate(sentences):
            id = i + 1
            if id in ignores_id:
                continue
            nodes = sentence.split("\n")
            nodes = [node.split("\t") for node in nodes]
            sent_id = f"# sent_id = {corpus_id}-s{id}\n"
            text = " ".join([node[0] for node in nodes])
            text = f"# text = {text}\n"
            content = sent_id + text + sentence
            if i != last_index:
                content += "\n\n"
            else:
                content += "\n"
            f.write(content)
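# Illustrative shape of one revised sentence block in dest_file (the token
# lines themselves are copied through unchanged from source_file):
#
#   # sent_id = train-s1
#   # text = <tokens of the sentence joined by spaces>
#   <original tab-separated token lines>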
def test_1(self):
    dictionary = Dictionary.Instance()
    senses = dictionary.lookup(Text("đi"))
    self.assertEqual(22, len(senses))
    sense = senses[0]
    self.assertEqual("V", sense["pos"])
    self.assertGreater(len(sense["definition"]), 0)
def tokenize(text):
    """
    tokenize text for word segmentation
    :param text: raw text input
    :return: tokenized text
    """
    # `patterns` is the module-level tokenization regex defined elsewhere in this file
    text = Text(text)
    text = text.replace("\t", " ")
    tokens = re.findall(patterns, text)
    return u" ".join([token[0] for token in tokens])
def tokenize(text, format=None):
    """Tokenize text for word segmentation.

    :param text: raw text input
    :return: a list of tokens, or a space-joined string if format == "text"
    """
    text = Text(text.lower())
    text = text.replace("\t", " ")
    tokens = re.findall(patterns, text)
    tokens = [token[0] for token in tokens]
    if format == "text":
        return " ".join(tokens)
    else:
        return tokens
def tokenize(text, format=None, tag=False):
    """
    tokenize text for word segmentation
    :param text: raw text input
    :return: tokenized text
    """
    # `patterns` and `extract_match` are module-level helpers defined elsewhere in this file
    text = Text(text)
    text = text.replace("\t", " ")
    matches = [m for m in re.finditer(patterns, text)]
    tokens = [extract_match(m) for m in matches]
    if tag:
        return tokens
    tokens = [token[0] for token in tokens]
    if format == "text":
        return " ".join(tokens)
    return tokens
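# A minimal usage sketch; since `patterns` and `extract_match` are defined
# elsewhere, the exact tagged output depends on those definitions.
tokens = tokenize("Chào mừng đến với Việt Nam")               # list of surface tokens
line = tokenize("Chào mừng đến với Việt Nam", format="text")  # tokens joined by spaces
tagged = tokenize("Chào mừng đến với Việt Nam", tag=True)     # raw extract_match results (presumably token/tag pairs)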
def validate_utf8(file):
    base_name = basename(file)
    detector = UniversalDetector()
    detector.reset()
    with open(file, "rb") as f:
        for i, line in enumerate(f):
            detector.feed(line)
            if detector.done or i > 1000:
                break
    detector.close()
    result = detector.result
    if not (result["encoding"] == "utf-8" and result["confidence"] >= 0.99):
        warn(message=f"File {file} should be encoded in UTF-8", level=1)
        sys.exit(1)
    with open(file, "r") as f:
        content = f.read()
    normalized_nfc_content = Text(content)
    if normalized_nfc_content != content:
        warn(
            message=f"File {base_name} should be normalized to NFC",
            error_type="Format nfc-normalized-failed",
            file=base_name,
            level=1
        )
def validate_utf8(file):
    base_name = basename(file)
    text = b''
    with open(file, "rb") as f:
        for i, line in enumerate(f):
            if i < 1000:
                text += line
            else:
                break
    results = charset_normalizer.from_bytes(text).best()
    if not (results.encoding == "utf-8" and results.coherence >= 0.99):
        warn(message=f"File {file} should be encoded in UTF-8", level=1)
        sys.exit(1)
    with open(file, "r") as f:
        content = f.read()
    normalized_nfc_content = Text(content)
    if normalized_nfc_content != content:
        warn(
            message=f"File {base_name} should be normalized to NFC",
            error_type="Format nfc-normalized-failed",
            file=base_name,
            level=1
        )
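# `Text()` above is assumed to return the NFC-normalized form of its input
# (which is what the "nfc-normalized-failed" warning implies); a standalone
# equivalent of that check using only the standard library would be:
import unicodedata


def is_nfc_normalized(content):
    # True when the text is already in Unicode NFC form.
    return unicodedata.normalize("NFC", content) == content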
def extract_text(i, s):
    # Compare the gold tokens of sentence i with the output of tokenize();
    # print a diff for each mismatch and stop after more than 30 of them.
    if i in incorrect or i in laters:
        return
    global count
    global cases
    s = Text(s)
    tokens_tags = [token.split("\t") for token in s.split("\n")]
    tokens = [token_tag[0] for token_tag in tokens_tags]
    text = " ".join(tokens)
    extract_tokens = tokenize(text)
    extract_text = " ".join(extract_tokens)
    if tokens != extract_tokens:
        count += 1
        print("==========")
        print(i)
        differ = difflib.Differ()
        diff = differ.compare([text], [extract_text])
        print("\n".join(diff))
        cases.append(i)
    if count > 30:
        print(cases)
        sys.exit(1)
def test_classify_simple_case(self):
    text = u"HLV ngoại đòi gần tỷ mỗi tháng dẫn dắt tuyển Việt Nam 54"
    actual = classify(text)[0]
    expected = Text("The thao")
    self.assertEqual(actual, expected)
def test_text_1(self):
    input = u"đi học"
    output = Text(input)
    self.assertTrue(is_unicode(output))
def test_text_4(self):
    # string in bytes
    input = u"đi học".encode("utf-8")
    output = Text(input)
    self.assertTrue(is_unicode(output))
def test_text_3(self):
    # string with combining Unicode characters (unicode tổ hợp)
    input = u"cộng hòa xã hội"
    output = Text(input)
    self.assertTrue(is_unicode(output))
def convert_text(x):
    try:
        return Text(x)
    except:
        pass
    return ""
import json
from os.path import join

import requests

from underthesea.feature_engineering.text import Text
from underthesea.util.file_io import write

url = "http://localhost:8000/api/corpora/"
headers = {
    'Content-type': 'application/json',
    'Accept': 'application/json'
}
r = requests.get(url, headers=headers)
content = Text(json.dumps(r.json(), ensure_ascii=False))
write(join("data", "20171017.json"), content)
def read_utf16(filename):
    with open(filename, 'rb') as f:
        content = f.read()
    content = content.decode("utf-16")
    return Text(content)
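# Hypothetical usage: read a UTF-16 encoded corpus file and re-save it as
# UTF-8 (the file names here are illustrative only).
content = read_utf16("corpus_utf16.txt")
with open("corpus_utf8.txt", "w", encoding="utf-8") as f:
    f.write(content)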
def test_2(self):
    dictionary = Dictionary.Instance()
    word = dictionary.lookup(Text("không có từ này"))
    self.assertEqual(None, word)