Example #1
import re
import sys

from underthesea.feature_engineering.text import Text


def tokenize(text):
    """
    tokenize text for word segmentation
    :param text: raw text input
    :return: tokenized text
    """
    text = Text(text)
    specials = ["==>", "->", r"\.\.\.", ">>"]
    digit = r"\d+([\.,_]\d+)+"
    email = r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)"
    web = r"^(http[s]?://)?(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+$"
    datetime = [
        r"\d{1,2}\/\d{1,2}(\/\d+)?",
        r"\d{1,2}-\d{1,2}(-\d+)?",
    ]
    word = r"\w+"
    non_word = r"[^\w\s]"
    abbreviations = [
        r"[A-ZĐ]+\.", r"Tp\.", r"Mr\.", r"Mrs\.", r"Ms\.", r"Dr\.", r"ThS\."
    ]
    ]

    patterns = []
    patterns.extend(abbreviations)
    patterns.extend(specials)
    patterns.extend([web, email])
    patterns.extend(datetime)
    patterns.extend([digit, non_word, word])

    patterns = "(" + "|".join(patterns) + ")"
    if sys.version_info < (3, 0):
        patterns = patterns.decode('utf-8')
    tokens = re.findall(patterns, text, re.UNICODE)
    return u" ".join(["%s" % token[0] for token in tokens])
Example #2
def revise_vlsp2013_wtk_dataset(source_file, dest_file):
    copyfile(source_file, dest_file)
    with open(source_file) as f:
        content = Text(f.read())
        sentences = content.split("\n\n")
    ignores = load_ignores()
    if basename(source_file) == "train.txt":
        corpus_id = "train"
        ignores_id = ignores["train"]
    else:
        corpus_id = "test"
        ignores_id = ignores["test"]
    last_index = len(sentences) - 1
    with open(dest_file, "w") as f:
        for i, sentence in enumerate(sentences):
            id = i + 1
            if id in ignores_id:
                continue
            nodes = sentence.split("\n")
            nodes = [node.split("\t") for node in nodes]
            sent_id = f"# sent_id = {corpus_id}-s{id}\n"
            text = " ".join([node[0] for node in nodes])
            text = f"# text = {text}\n"
            content = sent_id + text + sentence
            if i != last_index:
                content += "\n\n"
            else:
                content += "\n"
            f.write(content)
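For illustration, the two header lines the function prepends can be derived by hand from a hypothetical tab-separated block (the sentence below is invented, not taken from the VLSP2013 corpus):

sentence = "Học sinh\tN\nđi\tV\nhọc\tV"         # one "token<TAB>tag" pair per line
nodes = [node.split("\t") for node in sentence.split("\n")]
print(" ".join(node[0] for node in nodes))       # -> Học sinh đi học
# with corpus_id = "train" and id = 1, the rewritten block starts with:
#   # sent_id = train-s1
#   # text = Học sinh đi học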
Example #3
 def test_1(self):
     dictionary = Dictionary.Instance()
     senses = dictionary.lookup(Text("đi"))
     self.assertEqual(22, len(senses))
     sense = senses[0]
     self.assertEqual("V", sense["pos"])
     self.assertGreater(len(sense["definition"]), 0)
Example #4
def tokenize(text):
    """
    tokenize text for word segmentation

    :param text: raw text input
    :return: tokenized text
    """
    text = Text(text)
    text = text.replace("\t", " ")
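    # patterns is assumed to be a regex defined at module level (not shown in this excerpt)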
    tokens = re.findall(patterns, text)
    return u" ".join([token[0] for token in tokens])
Example #5
def tokenize(text, format=None):
    """Tokenize text for word segmentation.

    :param text: raw text input
    :param format: "text" to return a space-joined string, otherwise a list of tokens
    :return: tokenized text
    """
    text = Text(text.lower())
    text = text.replace("\t", " ")
    tokens = re.findall(patterns, text)
    tokens = [token[0] for token in tokens]
    if format == "text":
        return " ".join(tokens)
    else:
        return tokens
Example #6
def tokenize(text, format=None, tag=False):
    """
    tokenize text for word segmentation

    :param text: raw text input
    :param format: "text" to return a space-joined string, otherwise a list of tokens
    :param tag: if True, return the raw matches from extract_match instead of plain tokens
    :return: tokenized text
    """
    text = Text(text)
    text = text.replace("\t", " ")
    matches = [m for m in re.finditer(patterns, text)]
    tokens = [extract_match(m) for m in matches]

    if tag:
        return tokens

    tokens = [token[0] for token in tokens]
    if format == "text":
        return " ".join(tokens)

    return tokens
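A usage sketch of the three return modes; the input is hypothetical, and the module-level patterns and the extract_match helper are assumed to be in scope:

s = "Chào mừng đến Tp. HCM"      # hypothetical input
tokenize(s)                       # default: list of plain tokens
tokenize(s, format="text")        # single space-joined string
tokenize(s, tag=True)             # raw matches from extract_match (index 0 of each is the token text)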
Example #7
def validate_utf8(file):
    base_name = basename(file)
    detector = UniversalDetector()
    detector.reset()
    with open(file, "rb") as f:
        for i, line in enumerate(f):
            detector.feed(line)
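            # stop once the detector is confident or after roughly 1000 lines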
            if detector.done or i > 1000:
                break
    detector.close()
    result = detector.result
    if not (result["encoding"] == "utf-8" and result["confidence"] >= 0.99):
        warn(message=f"File {file} should encoding with UTF-8", level=1)
        sys.exit(1)
    with open(file, "r") as f:
        content = f.read()
    normalized_nfc_content = Text(content)
    if normalized_nfc_content != content:
        warn(message=f"File {base_name} should normalized to NFC",
             error_type="Format nfc-normalized-failed",
             file=base_name, level=1)
Example #8
def validate_utf8(file):
    base_name = basename(file)
    text = b''
    with open(file, "rb") as f:
        for i, line in enumerate(f):
            if i < 1000:
                text += line
            else:
                break
    results = charset_normalizer.from_bytes(text).best()
    if not (results.encoding == "utf-8" and results.coherence >= 0.99):
        warn(message=f"File {file} should encoding with UTF-8", level=1)
        sys.exit(1)
    with open(file, "r") as f:
        content = f.read()
    normalized_nfc_content = Text(content)
    if normalized_nfc_content != content:
        warn(message=f"File {base_name} should normalized to NFC",
             error_type="Format nfc-normalized-failed",
             file=base_name,
             level=1)
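A minimal usage sketch; the path is hypothetical. A file that is not detected as UTF-8 terminates the process, while content that is not NFC-normalized only triggers a warning:

validate_utf8("corpus/train.txt")   # hypothetical path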
Example #9
def extract_text(i, s):
    if i in incorrect or i in laters:
        return
    global count
    global cases
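    # count and cases, like incorrect and laters above, hold module-level state shared across calls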
    s = Text(s)
    tokens_tags = [token.split("\t") for token in s.split("\n")]
    tokens = [token_tag[0] for token_tag in tokens_tags]
    text = " ".join(tokens)
    extract_tokens = tokenize(text)
    extract_text = " ".join(extract_tokens)

    if tokens != extract_tokens:
        count += 1
        print("==========")
        print(i)
        differ = difflib.Differ()
        diff = differ.compare([text], [extract_text])
        print("\n".join(diff))
        cases.append(i)
    if count > 30:
        print(cases)
        sys.exit(1)
Example #10
 def test_classify_simple_case(self):
     text = u"HLV ngoại đòi gần tỷ mỗi tháng dẫn dắt tuyển Việt Nam 54"
     actual = classify(text)[0]
     expected = Text("The thao")
     self.assertEqual(actual, expected)
Example #11
 def test_text_1(self):
     input = u"đi học"
     output = Text(input)
     self.assertTrue(is_unicode(output))
Example #12
 def test_text_4(self):
     # byte string (UTF-8 encoded)
     input = u"đi học".encode("utf-8")
     output = Text(input)
     self.assertTrue(is_unicode(output))
Example #13
 def test_text_3(self):
     # string with combining (decomposed) unicode characters
     input = u"cộng hòa xã hội"
     output = Text(input)
     self.assertTrue(is_unicode(output))
Example #14
 def convert_text(x):
     try:
         return Text(x)
     except Exception:
         pass
     return ""
Example #15
import requests
import json
from os.path import join

from underthesea.feature_engineering.text import Text
from underthesea.util.file_io import write

url = "http://localhost:8000/api/corpora/"
headers = {
    'Content-type': 'application/json',
    'Accept': 'application/json'}
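# fetch the corpus list from the local API and write it out as NFC-normalized JSON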
r = requests.get(url, headers=headers)
content = Text(json.dumps(r.json(), ensure_ascii=False))
write(join("data", "20171017.json"), content)
Example #16
def read_utf16(filename):
    with open(filename, 'rb') as f:
        content = f.read()
        content = content.decode("utf-16")
        return Text(content)
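A minimal usage sketch; the path is hypothetical. decode("utf-16") consumes the byte-order mark, and Text() returns the NFC-normalized string:

content = read_utf16("data/corpus_utf16.txt")   # hypothetical path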
Example #17
 def test_2(self):
     dictionary = Dictionary.Instance()
     word = dictionary.lookup(Text("không có từ này"))
     self.assertEqual(None, word)