def parse_model(self, S, brand, folder):
    """Split a "brand + rest" product string into (model, extra) parts.

    Looks up the longest brand+model prefix in the per-folder trie and
    separates the matched model name from any trailing text (e.g. a
    serial / variant suffix).

    Args:
        S: original product string, expected to start with *brand*.
        brand: brand name prefix of S.
        folder: folder/category key into self.__brands_models.

    Returns:
        (model, extra): extra is None when the model consumes the whole
        string; (None, None) when folder is unknown or nothing matches.
    """
    if folder is None:
        return None, None
    if folder not in self.__brands_models:
        return None, None
    # BUG FIX: this lookup was commented out, leaving `_brands_models`
    # undefined — every call then raised NameError at the
    # `longest_prefix` call below.
    _brands_models: StringTrie = self.__brands_models.get(folder)
    lower = normalize_vn(normalize(S.lower()))
    b_low = normalize_vn(normalize(brand.lower()))
    b_split = b_low.split()
    ext = lower[len(b_low):]
    ww = ext.split()
    # Try the longest candidate key first: brand + up to 3 following words,
    # then progressively shorter keys down to the brand alone.
    for i in range(3, -1, -1):
        key = "/".join(b_split + ww[:i])
        found = _brands_models.longest_prefix(key)
        if found.value is not None:
            val = normalize_vn(normalize(found.value))
            # Locate where the matched model ends inside the original string.
            # NOTE(review): str.find returns -1 when the normalized value is
            # not present in `lower`; start_pos then degrades to len(val)-1 —
            # confirm inputs guarantee a hit.
            start_pos = lower.find(val, len(b_low) - 1) + len(val)
            # print("---model_len ", S, len(S), start_pos, S[start_pos:])
            if start_pos >= len(S):
                # Model consumes the rest of the string: no extra part.
                return found.value, None
            else:
                ML = str(S[len(b_low):start_pos]).strip()
                EX = str(S[start_pos:]).strip()
                EX = EX.replace("_", " ").strip()
                return ML, EX
    return None, None
def load_folder(self, folder):
    """Load the brand/model CSV for *folder* and build its lookup tries.

    Populates self.__brands_models (brand+model -> model) and
    self.__models_brands (unique model -> "Brand:Model") for the folder.
    """
    dic_csv = self.__map_dict.get(folder)
    if dic_csv is None:
        return
    logging.info(f"Read {dic_csv}")
    brands, models = self.read_brand(dic_csv)

    brand_trie: StringTrie = StringTrie()
    brand_model_trie: StringTrie = StringTrie()
    model_brand_trie: StringTrie = StringTrie()

    # Index every brand under its normalized, slash-joined key.
    for raw_brand in brands:
        normalized = normalize_vn(normalize(raw_brand.lower()))
        brand_trie["/".join(normalized.split())] = raw_brand

    # Count occurrences of each model name so that only unique names are
    # allowed to map back to a brand (e.g. Galaxy S20 -> Samsung).
    model_counts = Counter(
        model for _, model in models if not (model is None or model is np.nan)
    )

    for raw_brand, raw_model in models:
        if raw_model is None or raw_model is np.nan:
            continue
        brand_norm = normalize_vn(normalize(raw_brand.lower()))
        model_norm = normalize_vn(normalize(raw_model.lower()))
        combined_key = "/".join(" ".join([brand_norm, model_norm]).split())
        brand_model_trie[combined_key] = raw_model
        # Reverse lookup (model -> brand) only for unambiguous model names:
        # at least 2 characters, not purely numeric, unique in the file.
        if len(raw_model) < 2:
            continue
        if raw_model.isdigit():
            continue
        if model_counts.get(raw_model) > 1:
            continue
        model_brand_trie["/".join(model_norm.split())] = f"{raw_brand}:{raw_model}"

    # NOTE: the brand trie is built but intentionally not stored (the
    # original kept `self.__brands[folder] = ...` commented out).
    self.__brands_models[folder] = brand_model_trie
    self.__models_brands[folder] = model_brand_trie
def danh_sach_tu_don():
    """Analyze the Vietnamese single-syllable word lists (scratch script).

    Loads the syllable inventory and the two-syllable compound list, then
    prints the syllables that never appear inside a compound.  Everything
    after the ``exit()`` call below is currently unreachable.
    """
    # Normalized syllable inventories from two source files.
    vn_syll_1 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "vietnamese-syllables.txt"))])
    vn_syll_2 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "arr_word_2021-03-19.txt"))])
    # Two-syllable compound words, split into their component syllables.
    vn_tu_ghep_1 = [normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "arr_2_word_2021-03-19.txt"))]
    syll_tu_ghep = [r.split() for r in vn_tu_ghep_1]
    # Syllables observed as the first (B) / second (I) element of a compound.
    syll_b_ghep = set([r[0] for r in syll_tu_ghep if len(r) > 1])
    syll_i_ghep = set([r[1] for r in syll_tu_ghep if len(r) > 1])
    # am_tiet_co_the_la_B = vn_syll_1.intersection(syll_b_ghep)
    # am_tiet_co_the_la_I = vn_syll_1.intersection(syll_i_ghep)
    # am_tiet_kho_nam_trong_tu_ghep = [x for x in vn_syll_1 if x not in syll_b_ghep and x not in syll_i_ghep]
    # Syllables that do / do not occur inside compounds.
    am_tiet_nam_trong_tu_ghep = vn_syll_1.intersection(list(syll_b_ghep) + list(syll_i_ghep))
    am_tiet_kho_nam_trong_tu_ghep = [x for x in vn_syll_1 if x not in syll_b_ghep and x not in syll_i_ghep]
    print(am_tiet_kho_nam_trong_tu_ghep)
    # NOTE(review): hard exit — all code below is dead until this is removed.
    exit()
    # combine = vn_syll_1.intersection(vn_syll_2)
    # NOTE(review): `combine` is only assigned in the commented-out line
    # above, so the statements below would raise NameError if the exit()
    # were removed — confirm intended behavior before reviving this tail.
    not_in = [x for x in vn_syll_2 if x not in combine]
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.new.txt"), list(combine))
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.ko.txt"), list(not_in))
    write_lines(
        path.join("data", "vietnamese", "vietnamese-syllables.ko_the_ghep.txt"),
        list(vn_syll_1.intersection(vn_tu_ghep_1)),
    )
def detect(self, text: str, folder=0):
    """Detect taxonomy entities (brand/model products, persons, orgs, ...) in *text*.

    Args:
        text: raw input text to scan.
        folder: category id (a ``Folder`` enum value); selects which extra
            passes run (XE / SO_HOA add the brand+model product scan;
            GIAI_TRI / THE_THAO add the artist/football pass).

    Returns:
        A pair ``(taxonomy, obj_tokens)``: taxonomy is a list of
        ``"value(TYPE)"`` strings; obj_tokens is the text of the remaining
        merged B-* chunks not classified by the earlier passes.
    """
    taxonomy = []
    ner = Semi_Supervised_Doc_Ner()  # __init__
    ori_words = tokenize(text)
    # Normalized (lower-cased, accent-folded) form of each token.
    key_words = [normalize_vn(normalize(W.lower())) for W in ori_words]
    # Candidate positions: multi-char tokens that are Title-case, ALL-CAPS,
    # or the special-cased word "iphone".
    checking_pos = [
        i for i, W in enumerate(ori_words)
        if len(W) > 1 and (W[0].istitle() or W.isupper() or W.lower() == "iphone")
    ]
    # Per-token tag; "X" = unassigned.
    map_token_types = ["X" for W in ori_words]
    len_sent = len(key_words)
    # if folder == Folder.SO_HOA.value:
    #     print("--------Folder.SO_HOA.value----------")
    if folder in [Folder.XE.value, Folder.SO_HOA.value]:
        # Find spans that start with a brand name.
        # e.g.: Samsung [A-Z]...., Toyota [A-Z]....
        brand_pos, cur_pos = [], -1
        for I in checking_pos:
            if I <= cur_pos:
                continue
            b_start, b_end, b_value = self.find_brand(key_words, I, folder)
            if b_value is not None:
                brand_pos.append((b_start, b_end, b_value))
                # Skip positions already consumed by this brand span.
                cur_pos = b_end - 1 if I < b_end - 1 else I
        brands = set()
        for b_start, b_end, b_value in brand_pos:
            for I in range(b_start, b_end):
                if map_token_types[I] != "X":
                    continue
                map_token_types[I] = "BRAND"
            brand_str = " ".join(ori_words[b_start:b_end])
            # Chunk right after the brand that may contain the model name.
            checking_str = found_obj_via_str(ori_words, b_end)
            if checking_str is None:
                taxonomy.append(f"{b_value}(BRAND)")
                continue
            S = " ".join([brand_str, checking_str])
            # print("==> brand_str:", S, "---", brand_str)
            MN, MX = self.parse_model(S, brand_str, folder)
            # print("---> S MN, MX :", S, MN, MX, ner_words[i][0])
            if MX is not None:
                taxonomy.append(f"{b_value}:{MN}:{MX}(PRODUCT)")
            elif MN is not None:
                taxonomy.append(f"{b_value}:{MN}(PRODUCT)")
            else:
                # No model matched: record as an insert candidate.
                p_name = xe_year_str(f"{b_value}:{checking_str}")
                taxonomy.append(f"{p_name}(PRODUCT-INSERT)")
            if MN is not None:
                # Tag the model tokens (and serial tokens, if present).
                len_mx = len(MN.split())
                for j, XX in enumerate(MN.split()):
                    map_token_types[b_end + j] = "MODEL"
                if MX is not None:
                    for j, XX in enumerate(MX.split()):
                        map_token_types[b_end + len_mx + j] = "SERIAL"
            else:
                for j, checking_str in enumerate(checking_str.split()):
                    map_token_types[b_end + j] = "MODEL"
            brands.add(brand_str)
        # Tokens tagged above are skipped; now look for special product codes
        # that imply a brand on their own, e.g.:
        #   GT-S7260 -> Samsung Galaxy Star Pro
        #   V8 Vantage -> Aston Martin V8 Vantage
        #   Range Rover Velar -> Land Rover Range Rover Velar
        cur_pos = -1
        for I in checking_pos:
            if map_token_types[I] != "X":
                # print(map_token_types[I], ori_words[I])
                continue
            # NOTE(review): brand_pos holds (start, end, value) tuples, so
            # this int-membership test is always False — confirm whether it
            # should check span start indices instead.
            if I in brand_pos:
                continue
            if I <= cur_pos:
                continue
            m_start, m_end, m_value = self.find_model(key_words, I, folder)
            if m_value is not None:
                ori_word_i = ori_words[m_start]
                # Skip plain Vietnamese syllables — not product codes.
                if normalize_vn(normalize(ori_word_i)) in âm_tiết_việt_nam:
                    continue
                # Accept ALL-CAPS codes of 2+ chars or Title-case words of 4+.
                ok_rules = (ori_word_i.isupper() and len(ori_word_i) >= 2
                            ) or (ori_word_i.istitle() and len(ori_word_i) >= 4)
                if not ok_rules:
                    continue
                # print("-- m_start, m_end, m_value:", m_start, m_end, m_value)
                taxonomy.append(f"{m_value}(PRODUCT)")
                # NOTE(review): b_start/b_end/b_value are stale leftovers of
                # the previous loop — this likely should use
                # m_start/m_end/m_value. Confirm intent.
                brand_pos.append((b_start, b_end, b_value))
                cur_pos = b_end - 1 if I < b_end - 1 else I
                for J in range(m_start, m_end):
                    map_token_types[J] = "MODEL"
                # print(m_start, m_end)
                # Also extend the model with the following capitalized or
                # numeric tokens (up to 5).
                if folder == Folder.SO_HOA.value:
                    extend_str = []
                    for J in range(m_end, min(m_end + 5, len_sent)):
                        if ori_words[J].islower():
                            break
                        if ori_words[J].isdigit():
                            extend_str.append(ori_words[J])
                            map_token_types[J] = "MODEL"
                        elif ori_words[J][0].istitle():
                            extend_str.append(ori_words[J])
                            map_token_types[J] = "MODEL"
                        else:
                            break
                    if len(extend_str) > 0:
                        taxonomy.append(
                            f"{m_value}:{' '.join(extend_str)}(PRODUCT)")
    # Other objects, after the special XE/etc. entities were parsed:
    # run NER sentence by sentence (split on . ? ! ;).
    ner_words, s_tokens = [], []
    for W in ori_words:
        s_tokens.append(W)
        if W in ".?!;":
            ner_words += ner.tokenlize(" ".join(s_tokens))
            s_tokens = []
    if len(s_tokens) > 0:
        ner_words += ner.tokenlize(" ".join(s_tokens))
    del s_tokens
    # print("ner_words:", ner_words)
    # Re-align NER tokens (which may be "_"-joined multiword units) with the
    # per-token tags computed above; earlier BRAND/MODEL/SERIAL tags win.
    c_pos, len_type = 0, len(map_token_types)
    for i, W in enumerate(ner_words):
        for x in W[0].split("_"):
            j = c_pos
            if ner_words[i][1] not in ["MODEL", "BRAND", "SERIAL"]:
                if j < len_type and map_token_types[j] != "X":
                    ner_words[i] = (ner_words[i][0], map_token_types[j])
            c_pos += 1
    # print("taxonomy:", taxonomy)
    # return taxonomy, None
    # -- Step 2: check PERSON entities.
    # print(">> ner_words: ", ner_words)
    optimizer_tokens = self.tokens_and_fixed(ner_words)
    if folder in [Folder.XE.value, Folder.SO_HOA.value]:
        # Extra sweep for models that appear far from the brand mention.
        for B in brands:
            for i, W in enumerate(optimizer_tokens):
                if W[1] == "BRAND":
                    continue
                # NOTE(review): "B-MISC" is listed twice in this filter.
                if W[1] not in [
                        "B-FAL", "B-ORG", "B-MISC", "B-PER", "B-MISC", "B-GEO"
                ]:
                    continue
                S = " ".join([B, W[0]])
                MN, MX = self.parse_model(S, B, folder)
                if MX is not None:
                    taxonomy.append(f"{B}:{MN}:{MX}(PRODUCT)")
                    optimizer_tokens[i] = (optimizer_tokens[i][0], "MODEL")
                elif MN is not None:
                    taxonomy.append(f"{B}:{MN}(PRODUCT)")
                    optimizer_tokens[i] = (optimizer_tokens[i][0], "MODEL")
    object_tags = set(["B-GEO", "B-PER", "B-ORG", "B-MISC", "B-FAL"])
    # All B-* tokens except positions/times are person/object candidates.
    persons = [
        (i, T[0], T[1]) for i, T in enumerate(optimizer_tokens)
        if T[1] != "B-POS" and T[1] != "B-TIME" and T[1].startswith("B-")
    ]
    remove_pos = []
    if folder in [Folder.GIAI_TRI.value, Folder.THE_THAO.value]:
        for i, S, P in persons:
            pros = self.__vocal.get_pros(S)
            if pros is not None and len(pros) > 0:
                # Map the boolean property vector to named properties.
                pros = {
                    self.__prop_map.get(K): V
                    for K, V in enumerate(pros) if V == True
                }
                if "is_artist" in pros and P in object_tags:
                    taxonomy.append(f"{S}(ARTIST)")
                    remove_pos.append(i)
                elif "is_football" in pros and P in object_tags:
                    taxonomy.append(f"{S}(FOOTBALL)")
                    remove_pos.append(i)
    for i, S, P in persons:
        if i in remove_pos:
            continue
        # Tokens preceded by an "I" tag are likely already accurate.
        if i > 0 and optimizer_tokens[i - 1][1] == "I":
            continue
        pros = self.__vocal.get_pros(S)
        if pros is not None and len(pros) > 0:
            pros = {
                self.__prop_map.get(K): V
                for K, V in enumerate(pros) if V == True
            }
            # if "is_artist" in pros and P in object_tags:
            #     taxonomy.append(f"{S}(ARTIST)")
            #     remove_pos.append(i)
            # elif "is_football" in pros and P in object_tags:
            #     taxonomy.append(f"{S}(FOOTBALL)")
            #     remove_pos.append(i)
            if "is_per" in pros and P in object_tags:
                taxonomy.append(f"{S}(PERSON)")
                remove_pos.append(i)
            elif "is_org" in pros and P in object_tags:
                taxonomy.append(f"{S}(ORG)")
                remove_pos.append(i)
            elif "is_misc" in pros and P in object_tags:
                taxonomy.append(f"{S}(MISC)")
                remove_pos.append(i)
            # NOTE(review): `P in "B-LOC"` is a substring test, not
            # equality — confirm `P == "B-LOC"` was intended.
            elif "is_loc" in pros and P in "B-LOC":
                taxonomy.append(f"{S}(LOC)")
                remove_pos.append(i)
            elif "is_geo" in pros and P in object_tags:
                taxonomy.append(f"{S}(GEO)")
                remove_pos.append(i)
            elif "is_fal" in pros and P in object_tags:
                taxonomy.append(f"{S}(FAL)")
                remove_pos.append(i)
    # print(">> optimizer_tokens: ", optimizer_tokens)
    # Merge adjacent chunks, then collect the leftover B-* object text.
    merge_tokens = chunk_nealy_obj.merge(optimizer_tokens)
    # print('>> merge_tokens:', merge_tokens)
    obj_tokens = [
        self.to_text(merge_tokens, i) for i, T in enumerate(merge_tokens)
        if T[1] not in ["BRAND", "MODEL", "B-POS", "B-TIME", "B-CUR"]
        and T[1].startswith("B-") and i not in remove_pos
    ]
    # print("Còn lại:", obj_tokens)
    return taxonomy, obj_tokens
def danh_sach_tu_don():
    """Analyze the Vietnamese single-syllable word lists (scratch script).

    NOTE(review): this redefines ``danh_sach_tu_don`` — an earlier definition
    with the same name exists in this module and is shadowed by this one.
    As in that version, everything after ``exit()`` is unreachable.
    """
    # Normalized syllable inventories from two source files.
    vn_syll_1 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "vietnamese-syllables.txt"))])
    vn_syll_2 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "arr_word_2021-03-19.txt"))])
    # Two-syllable compound words, split into their component syllables.
    vn_tu_ghep_1 = [normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "arr_2_word_2021-03-19.txt"))]
    syll_tu_ghep = [r.split() for r in vn_tu_ghep_1]
    # Syllables observed as the first (B) / second (I) element of a compound.
    syll_b_ghep = set([r[0] for r in syll_tu_ghep if len(r) > 1])
    syll_i_ghep = set([r[1] for r in syll_tu_ghep if len(r) > 1])
    # am_tiet_co_the_la_B = vn_syll_1.intersection(syll_b_ghep)
    # am_tiet_co_the_la_I = vn_syll_1.intersection(syll_i_ghep)
    # am_tiet_kho_nam_trong_tu_ghep = [x for x in vn_syll_1 if x not in syll_b_ghep and x not in syll_i_ghep]
    # Syllables that do / do not occur inside compounds.
    am_tiet_nam_trong_tu_ghep = vn_syll_1.intersection(list(syll_b_ghep) + list(syll_i_ghep))
    am_tiet_kho_nam_trong_tu_ghep = [x for x in vn_syll_1 if x not in syll_b_ghep and x not in syll_i_ghep]
    print(am_tiet_kho_nam_trong_tu_ghep)
    # NOTE(review): hard exit — all code below is dead until this is removed.
    exit()
    # combine = vn_syll_1.intersection(vn_syll_2)
    # NOTE(review): `combine` is only assigned in the commented-out line
    # above, so the statements below would raise NameError if the exit()
    # were removed — confirm intended behavior before reviving this tail.
    not_in = [x for x in vn_syll_2 if x not in combine]
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.new.txt"), list(combine))
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.ko.txt"), list(not_in))
    write_lines(
        path.join("data", "vietnamese", "vietnamese-syllables.ko_the_ghep.txt"),
        list(vn_syll_1.intersection(vn_tu_ghep_1)),
    )
    # Diff the syllable inventory against a second snapshot and write the
    # syllables not yet present in it.
    vn_syll_1 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "vietnamese-syllables.txt"))])
    vn_syll_2 = set([normalize_vn(x) for x in readlines(path.join("data", "vietnamese", "vietnamese-syllables.1.txt"))])
    da_co = vn_syll_1.intersection(vn_syll_2)
    new_words = [c for c in vn_syll_1 if c not in da_co]
    write_lines(path.join("data", "vietnamese", "vietnamese-syllables.chua_co.txt"), list(new_words))