def parse_model(self, S, brand, folder):
    """Split a "brand + rest" product string into (model, extra) parts.

    Looks up the longest brand+model prefix in the per-folder trie and
    separates the matched model name from any trailing text (e.g. a
    serial / variant suffix).

    Args:
        S: original product string, expected to start with *brand*.
        brand: brand name prefix of S.
        folder: folder/category key into self.__brands_models.

    Returns:
        (model, extra): extra is None when the model consumes the whole
        string; (None, None) when folder is unknown or nothing matches.
    """
    if folder is None:
        return None, None
    if folder not in self.__brands_models:
        return None, None
    # BUG FIX: this lookup was commented out, leaving `_brands_models`
    # undefined — every call then raised NameError at the
    # `longest_prefix` call below.
    _brands_models: StringTrie = self.__brands_models.get(folder)
    lower = normalize_vn(normalize(S.lower()))
    b_low = normalize_vn(normalize(brand.lower()))
    b_split = b_low.split()
    ext = lower[len(b_low):]
    ww = ext.split()
    # Try the longest candidate key first: brand + up to 3 following words,
    # then progressively shorter keys down to the brand alone.
    for i in range(3, -1, -1):
        key = "/".join(b_split + ww[:i])
        found = _brands_models.longest_prefix(key)
        if found.value is not None:
            val = normalize_vn(normalize(found.value))
            # Locate where the matched model ends inside the original string.
            # NOTE(review): str.find returns -1 when the normalized value is
            # not present in `lower`; start_pos then degrades to len(val)-1 —
            # confirm inputs guarantee a hit.
            start_pos = lower.find(val, len(b_low) - 1) + len(val)
            # print("---model_len ", S, len(S), start_pos, S[start_pos:])
            if start_pos >= len(S):
                # Model consumes the rest of the string: no extra part.
                return found.value, None
            else:
                ML = str(S[len(b_low):start_pos]).strip()
                EX = str(S[start_pos:]).strip()
                EX = EX.replace("_", " ").strip()
                return ML, EX
    return None, None
def load_folder(self, folder):
    """Load the brand/model CSV for *folder* and build its lookup tries.

    Populates self.__brands_models (brand+model -> model) and
    self.__models_brands (unique model -> "Brand:Model") for the folder.
    """
    dic_csv = self.__map_dict.get(folder)
    if dic_csv is None:
        return
    logging.info(f"Read {dic_csv}")
    brands, models = self.read_brand(dic_csv)

    brand_trie: StringTrie = StringTrie()
    brand_model_trie: StringTrie = StringTrie()
    model_brand_trie: StringTrie = StringTrie()

    # Index every brand under its normalized, slash-joined key.
    for raw_brand in brands:
        normalized = normalize_vn(normalize(raw_brand.lower()))
        brand_trie["/".join(normalized.split())] = raw_brand

    # Count occurrences of each model name so that only unique names are
    # allowed to map back to a brand (e.g. Galaxy S20 -> Samsung).
    model_counts = Counter(
        model for _, model in models if not (model is None or model is np.nan)
    )

    for raw_brand, raw_model in models:
        if raw_model is None or raw_model is np.nan:
            continue
        brand_norm = normalize_vn(normalize(raw_brand.lower()))
        model_norm = normalize_vn(normalize(raw_model.lower()))
        combined_key = "/".join(" ".join([brand_norm, model_norm]).split())
        brand_model_trie[combined_key] = raw_model
        # Reverse lookup (model -> brand) only for unambiguous model names:
        # at least 2 characters, not purely numeric, unique in the file.
        if len(raw_model) < 2:
            continue
        if raw_model.isdigit():
            continue
        if model_counts.get(raw_model) > 1:
            continue
        model_brand_trie["/".join(model_norm.split())] = f"{raw_brand}:{raw_model}"

    # NOTE: the brand trie is built but intentionally not stored (the
    # original kept `self.__brands[folder] = ...` commented out).
    self.__brands_models[folder] = brand_model_trie
    self.__models_brands[folder] = model_brand_trie
def danh_sach_tu_don():
    """Analyze the Vietnamese single-syllable word lists (scratch script).

    Loads the syllable inventory and the two-syllable compound list, then
    prints the syllables that never appear inside a compound.  Everything
    after the ``exit()`` call below is currently unreachable.
    """
    # Normalized syllable inventories from two source files.
    vn_syll_1 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "vietnamese-syllables.txt"))])
    vn_syll_2 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "arr_word_2021-03-19.txt"))])
    # Two-syllable compound words, split into their component syllables.
    vn_tu_ghep_1 = [normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "arr_2_word_2021-03-19.txt"))]
    syll_tu_ghep = [r.split() for r in vn_tu_ghep_1]
    # Syllables observed as the first (B) / second (I) element of a compound.
    syll_b_ghep = set([r[0] for r in syll_tu_ghep if len(r) > 1])
    syll_i_ghep = set([r[1] for r in syll_tu_ghep if len(r) > 1])
    # am_tiet_co_the_la_B = vn_syll_1.intersection(syll_b_ghep)
    # am_tiet_co_the_la_I = vn_syll_1.intersection(syll_i_ghep)
    # am_tiet_kho_nam_trong_tu_ghep = [x for x in vn_syll_1 if x not in syll_b_ghep and x not in syll_i_ghep]
    # Syllables that do / do not occur inside compounds.
    am_tiet_nam_trong_tu_ghep = vn_syll_1.intersection(list(syll_b_ghep) + list(syll_i_ghep))
    am_tiet_kho_nam_trong_tu_ghep = [x for x in vn_syll_1 if x not in syll_b_ghep and x not in syll_i_ghep]
    print(am_tiet_kho_nam_trong_tu_ghep)
    # NOTE(review): hard exit — all code below is dead until this is removed.
    exit()
    # combine = vn_syll_1.intersection(vn_syll_2)
    # NOTE(review): `combine` is only assigned in the commented-out line
    # above, so the statements below would raise NameError if the exit()
    # were removed — confirm intended behavior before reviving this tail.
    not_in = [x for x in vn_syll_2 if x not in combine]
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.new.txt"), list(combine))
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.ko.txt"), list(not_in))
    write_lines(
        path.join("data", "vietnamese", "vietnamese-syllables.ko_the_ghep.txt"),
        list(vn_syll_1.intersection(vn_tu_ghep_1)),
    )
def detect(self, text: str, folder=0):
    """Detect taxonomy entities (brand/model products, persons, orgs, ...) in *text*.

    Args:
        text: raw input text to scan.
        folder: category id (a ``Folder`` enum value); selects which extra
            passes run (XE / SO_HOA add the brand+model product scan;
            GIAI_TRI / THE_THAO add the artist/football pass).

    Returns:
        A pair ``(taxonomy, obj_tokens)``: taxonomy is a list of
        ``"value(TYPE)"`` strings; obj_tokens is the text of the remaining
        merged B-* chunks not classified by the earlier passes.
    """
    taxonomy = []
    ner = Semi_Supervised_Doc_Ner()  # __init__
    ori_words = tokenize(text)
    # Normalized (lower-cased, accent-folded) form of each token.
    key_words = [normalize_vn(normalize(W.lower())) for W in ori_words]
    # Candidate positions: multi-char tokens that are Title-case, ALL-CAPS,
    # or the special-cased word "iphone".
    checking_pos = [
        i for i, W in enumerate(ori_words)
        if len(W) > 1 and (W[0].istitle() or W.isupper() or W.lower() == "iphone")
    ]
    # Per-token tag; "X" = unassigned.
    map_token_types = ["X" for W in ori_words]
    len_sent = len(key_words)
    # if folder == Folder.SO_HOA.value:
    #     print("--------Folder.SO_HOA.value----------")
    if folder in [Folder.XE.value, Folder.SO_HOA.value]:
        # Find spans that start with a brand name.
        # e.g.: Samsung [A-Z]...., Toyota [A-Z]....
        brand_pos, cur_pos = [], -1
        for I in checking_pos:
            if I <= cur_pos:
                continue
            b_start, b_end, b_value = self.find_brand(key_words, I, folder)
            if b_value is not None:
                brand_pos.append((b_start, b_end, b_value))
                # Skip positions already consumed by this brand span.
                cur_pos = b_end - 1 if I < b_end - 1 else I
        brands = set()
        for b_start, b_end, b_value in brand_pos:
            for I in range(b_start, b_end):
                if map_token_types[I] != "X":
                    continue
                map_token_types[I] = "BRAND"
            brand_str = " ".join(ori_words[b_start:b_end])
            # Chunk right after the brand that may contain the model name.
            checking_str = found_obj_via_str(ori_words, b_end)
            if checking_str is None:
                taxonomy.append(f"{b_value}(BRAND)")
                continue
            S = " ".join([brand_str, checking_str])
            # print("==> brand_str:", S, "---", brand_str)
            MN, MX = self.parse_model(S, brand_str, folder)
            # print("---> S MN, MX :", S, MN, MX, ner_words[i][0])
            if MX is not None:
                taxonomy.append(f"{b_value}:{MN}:{MX}(PRODUCT)")
            elif MN is not None:
                taxonomy.append(f"{b_value}:{MN}(PRODUCT)")
            else:
                # No model matched: record as an insert candidate.
                p_name = xe_year_str(f"{b_value}:{checking_str}")
                taxonomy.append(f"{p_name}(PRODUCT-INSERT)")
            if MN is not None:
                # Tag the model tokens (and serial tokens, if present).
                len_mx = len(MN.split())
                for j, XX in enumerate(MN.split()):
                    map_token_types[b_end + j] = "MODEL"
                if MX is not None:
                    for j, XX in enumerate(MX.split()):
                        map_token_types[b_end + len_mx + j] = "SERIAL"
            else:
                for j, checking_str in enumerate(checking_str.split()):
                    map_token_types[b_end + j] = "MODEL"
            brands.add(brand_str)
        # Tokens tagged above are skipped; now look for special product codes
        # that imply a brand on their own, e.g.:
        #   GT-S7260 -> Samsung Galaxy Star Pro
        #   V8 Vantage -> Aston Martin V8 Vantage
        #   Range Rover Velar -> Land Rover Range Rover Velar
        cur_pos = -1
        for I in checking_pos:
            if map_token_types[I] != "X":
                # print(map_token_types[I], ori_words[I])
                continue
            # NOTE(review): brand_pos holds (start, end, value) tuples, so
            # this int-membership test is always False — confirm whether it
            # should check span start indices instead.
            if I in brand_pos:
                continue
            if I <= cur_pos:
                continue
            m_start, m_end, m_value = self.find_model(key_words, I, folder)
            if m_value is not None:
                ori_word_i = ori_words[m_start]
                # Skip plain Vietnamese syllables — not product codes.
                if normalize_vn(normalize(ori_word_i)) in âm_tiết_việt_nam:
                    continue
                # Accept ALL-CAPS codes of 2+ chars or Title-case words of 4+.
                ok_rules = (ori_word_i.isupper() and len(ori_word_i) >= 2
                            ) or (ori_word_i.istitle() and len(ori_word_i) >= 4)
                if not ok_rules:
                    continue
                # print("-- m_start, m_end, m_value:", m_start, m_end, m_value)
                taxonomy.append(f"{m_value}(PRODUCT)")
                # NOTE(review): b_start/b_end/b_value are stale leftovers of
                # the previous loop — this likely should use
                # m_start/m_end/m_value. Confirm intent.
                brand_pos.append((b_start, b_end, b_value))
                cur_pos = b_end - 1 if I < b_end - 1 else I
                for J in range(m_start, m_end):
                    map_token_types[J] = "MODEL"
                # print(m_start, m_end)
                # Also extend the model with the following capitalized or
                # numeric tokens (up to 5).
                if folder == Folder.SO_HOA.value:
                    extend_str = []
                    for J in range(m_end, min(m_end + 5, len_sent)):
                        if ori_words[J].islower():
                            break
                        if ori_words[J].isdigit():
                            extend_str.append(ori_words[J])
                            map_token_types[J] = "MODEL"
                        elif ori_words[J][0].istitle():
                            extend_str.append(ori_words[J])
                            map_token_types[J] = "MODEL"
                        else:
                            break
                    if len(extend_str) > 0:
                        taxonomy.append(
                            f"{m_value}:{' '.join(extend_str)}(PRODUCT)")
    # Other objects, after the special XE/etc. entities were parsed:
    # run NER sentence by sentence (split on . ? ! ;).
    ner_words, s_tokens = [], []
    for W in ori_words:
        s_tokens.append(W)
        if W in ".?!;":
            ner_words += ner.tokenlize(" ".join(s_tokens))
            s_tokens = []
    if len(s_tokens) > 0:
        ner_words += ner.tokenlize(" ".join(s_tokens))
    del s_tokens
    # print("ner_words:", ner_words)
    # Re-align NER tokens (which may be "_"-joined multiword units) with the
    # per-token tags computed above; earlier BRAND/MODEL/SERIAL tags win.
    c_pos, len_type = 0, len(map_token_types)
    for i, W in enumerate(ner_words):
        for x in W[0].split("_"):
            j = c_pos
            if ner_words[i][1] not in ["MODEL", "BRAND", "SERIAL"]:
                if j < len_type and map_token_types[j] != "X":
                    ner_words[i] = (ner_words[i][0], map_token_types[j])
            c_pos += 1
    # print("taxonomy:", taxonomy)
    # return taxonomy, None
    # -- Step 2: check PERSON entities.
    # print(">> ner_words: ", ner_words)
    optimizer_tokens = self.tokens_and_fixed(ner_words)
    if folder in [Folder.XE.value, Folder.SO_HOA.value]:
        # Extra sweep for models that appear far from the brand mention.
        for B in brands:
            for i, W in enumerate(optimizer_tokens):
                if W[1] == "BRAND":
                    continue
                # NOTE(review): "B-MISC" is listed twice in this filter.
                if W[1] not in [
                        "B-FAL", "B-ORG", "B-MISC", "B-PER", "B-MISC", "B-GEO"
                ]:
                    continue
                S = " ".join([B, W[0]])
                MN, MX = self.parse_model(S, B, folder)
                if MX is not None:
                    taxonomy.append(f"{B}:{MN}:{MX}(PRODUCT)")
                    optimizer_tokens[i] = (optimizer_tokens[i][0], "MODEL")
                elif MN is not None:
                    taxonomy.append(f"{B}:{MN}(PRODUCT)")
                    optimizer_tokens[i] = (optimizer_tokens[i][0], "MODEL")
    object_tags = set(["B-GEO", "B-PER", "B-ORG", "B-MISC", "B-FAL"])
    # All B-* tokens except positions/times are person/object candidates.
    persons = [
        (i, T[0], T[1]) for i, T in enumerate(optimizer_tokens)
        if T[1] != "B-POS" and T[1] != "B-TIME" and T[1].startswith("B-")
    ]
    remove_pos = []
    if folder in [Folder.GIAI_TRI.value, Folder.THE_THAO.value]:
        for i, S, P in persons:
            pros = self.__vocal.get_pros(S)
            if pros is not None and len(pros) > 0:
                # Map the boolean property vector to named properties.
                pros = {
                    self.__prop_map.get(K): V
                    for K, V in enumerate(pros) if V == True
                }
                if "is_artist" in pros and P in object_tags:
                    taxonomy.append(f"{S}(ARTIST)")
                    remove_pos.append(i)
                elif "is_football" in pros and P in object_tags:
                    taxonomy.append(f"{S}(FOOTBALL)")
                    remove_pos.append(i)
    for i, S, P in persons:
        if i in remove_pos:
            continue
        # Tokens preceded by an "I" tag are likely already accurate.
        if i > 0 and optimizer_tokens[i - 1][1] == "I":
            continue
        pros = self.__vocal.get_pros(S)
        if pros is not None and len(pros) > 0:
            pros = {
                self.__prop_map.get(K): V
                for K, V in enumerate(pros) if V == True
            }
            # if "is_artist" in pros and P in object_tags:
            #     taxonomy.append(f"{S}(ARTIST)")
            #     remove_pos.append(i)
            # elif "is_football" in pros and P in object_tags:
            #     taxonomy.append(f"{S}(FOOTBALL)")
            #     remove_pos.append(i)
            if "is_per" in pros and P in object_tags:
                taxonomy.append(f"{S}(PERSON)")
                remove_pos.append(i)
            elif "is_org" in pros and P in object_tags:
                taxonomy.append(f"{S}(ORG)")
                remove_pos.append(i)
            elif "is_misc" in pros and P in object_tags:
                taxonomy.append(f"{S}(MISC)")
                remove_pos.append(i)
            # NOTE(review): `P in "B-LOC"` is a substring test, not
            # equality — confirm `P == "B-LOC"` was intended.
            elif "is_loc" in pros and P in "B-LOC":
                taxonomy.append(f"{S}(LOC)")
                remove_pos.append(i)
            elif "is_geo" in pros and P in object_tags:
                taxonomy.append(f"{S}(GEO)")
                remove_pos.append(i)
            elif "is_fal" in pros and P in object_tags:
                taxonomy.append(f"{S}(FAL)")
                remove_pos.append(i)
    # print(">> optimizer_tokens: ", optimizer_tokens)
    # Merge adjacent chunks, then collect the leftover B-* object text.
    merge_tokens = chunk_nealy_obj.merge(optimizer_tokens)
    # print('>> merge_tokens:', merge_tokens)
    obj_tokens = [
        self.to_text(merge_tokens, i) for i, T in enumerate(merge_tokens)
        if T[1] not in ["BRAND", "MODEL", "B-POS", "B-TIME", "B-CUR"]
        and T[1].startswith("B-") and i not in remove_pos
    ]
    # print("Còn lại:", obj_tokens)
    return taxonomy, obj_tokens
def danh_sach_tu_don():
    """Analyze the Vietnamese single-syllable word lists (scratch script).

    NOTE(review): this redefines ``danh_sach_tu_don`` — an earlier definition
    with the same name exists in this module and is shadowed by this one.
    As in that version, everything after ``exit()`` is unreachable.
    """
    # Normalized syllable inventories from two source files.
    vn_syll_1 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "vietnamese-syllables.txt"))])
    vn_syll_2 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "arr_word_2021-03-19.txt"))])
    # Two-syllable compound words, split into their component syllables.
    vn_tu_ghep_1 = [normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "arr_2_word_2021-03-19.txt"))]
    syll_tu_ghep = [r.split() for r in vn_tu_ghep_1]
    # Syllables observed as the first (B) / second (I) element of a compound.
    syll_b_ghep = set([r[0] for r in syll_tu_ghep if len(r) > 1])
    syll_i_ghep = set([r[1] for r in syll_tu_ghep if len(r) > 1])
    # am_tiet_co_the_la_B = vn_syll_1.intersection(syll_b_ghep)
    # am_tiet_co_the_la_I = vn_syll_1.intersection(syll_i_ghep)
    # am_tiet_kho_nam_trong_tu_ghep = [x for x in vn_syll_1 if x not in syll_b_ghep and x not in syll_i_ghep]
    # Syllables that do / do not occur inside compounds.
    am_tiet_nam_trong_tu_ghep = vn_syll_1.intersection(list(syll_b_ghep) + list(syll_i_ghep))
    am_tiet_kho_nam_trong_tu_ghep = [x for x in vn_syll_1 if x not in syll_b_ghep and x not in syll_i_ghep]
    print(am_tiet_kho_nam_trong_tu_ghep)
    # NOTE(review): hard exit — all code below is dead until this is removed.
    exit()
    # combine = vn_syll_1.intersection(vn_syll_2)
    # NOTE(review): `combine` is only assigned in the commented-out line
    # above, so the statements below would raise NameError if the exit()
    # were removed — confirm intended behavior before reviving this tail.
    not_in = [x for x in vn_syll_2 if x not in combine]
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.new.txt"), list(combine))
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.ko.txt"), list(not_in))
    write_lines(
        path.join("data", "vietnamese", "vietnamese-syllables.ko_the_ghep.txt"),
        list(vn_syll_1.intersection(vn_tu_ghep_1)),
    )
    # Diff the syllable inventory against a second snapshot and write the
    # syllables not yet present in it.
    vn_syll_1 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "vietnamese-syllables.txt"))])
    vn_syll_2 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "vietnamese-syllables.1.txt"))])
    da_co = vn_syll_1.intersection(vn_syll_2)
    new_words = [c for c in vn_syll_1 if c not in da_co]
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.chua_co.txt"), list(new_words))