def remove(self, word):
    try:
        w_name = word.get("word")
        x = normalize(w_name)
        token_type = word.get("tokenlize")
        if token_type == 1:
            self.__vn_dict.remove(x)
        elif token_type == 2:
            self.__given_names.remove(x)
        elif token_type == 3:
            self.__location.remove(x)
        elif token_type == 4:
            if x in self.__geo:
                self.__geo.remove(x)
        elif token_type == 5:
            sl = x.split()
            key = sl[0]
            if key in self.__hard_dict:
                self.__hard_dict[key].data.remove(x)
                if len(self.__hard_dict[key].data) < 1:
                    del self.__hard_dict[key]
                    logging.info("del key (%s)" % key)
                else:
                    # Recompute max_len: the removed entry may have been the longest
                    self.__hard_dict[key].max_len = max(
                        len(w.split()) for w in self.__hard_dict[key].data
                    )
        logging.info("Vocabulary::Disable (word=%s, token_type=%s)" % (x, token_type))
        logging.info("Vocabulary::Remove (%s)" % word.get("word"))
    except Exception as e:
        logging.error(str(e), exc_info=True)
def create_token_type_db(word_list):
    cur_dir = conf.vocobulary_path
    bin_vocal = path.join(cur_dir, "word_types.bin")
    result = [x for x in word_list if x.get("pos_tag") is not None and len(x.get("pos_tag")) > 0]
    logging.info("Words with a pos_tag: %s" % len(result))
    letters = set([r.get("word")[0] for r in result])
    letters = [i.lower() for i in letters if i.isalpha()]
    # Map each normalized word to its POS tags
    dct = {normalize(r.get("word")): r.get("pos_tag") for r in result}
    # Bucket words by first letter so lookups only touch a small sub-dict
    dictionary = {l: {} for l in letters}
    for x in dct:
        try:
            dictionary[x[0]][x] = dct.get(x)
        except KeyError:
            logging.error(x)
    dump_to_file(dictionary, bin_vocal)
    logging.info("Save as : %s" % bin_vocal)
    return True
def insert(self, word):
    try:
        w_name = word.get("word")
        x = normalize(w_name)
        token_type = word.get("tokenlize")
        if token_type == 1:
            self.__vn_dict.add(x)
        elif token_type == 2:
            self.__given_names.add(x)
        elif token_type == 3:
            self.__location.add(x)
        elif token_type == 4:
            self.__geo.add(x)  # was self.geo; remove() uses self.__geo
        elif token_type == 5:
            sl = x.split()
            key = sl[0]
            if key not in self.__hard_dict:
                self.__hard_dict[key] = Struct(max_len=0, data=set())
            self.__hard_dict[key].data.add(x)
            if self.__hard_dict[key].max_len < len(sl):
                self.__hard_dict[key].max_len = len(sl)
            logging.debug(self.__hard_dict[key])  # was a bare print()
        logging.info("Vocabulary::Add (%s)" % word.get("word"))
    except Exception as e:
        logging.error(str(e), exc_info=True)
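# A minimal standalone sketch of the first-syllable bucket that insert()/remove()
# maintain for multi-word entries (token_type == 5). "Struct" is assumed here to
# be a simple attribute container; the project's own helper may differ.
#
# from types import SimpleNamespace as Struct
#
# hard_dict = {}
#
# def add_phrase(phrase: str):
#     syllables = phrase.split()
#     key = syllables[0]                                  # bucket by first syllable
#     if key not in hard_dict:
#         hard_dict[key] = Struct(max_len=0, data=set())
#     hard_dict[key].data.add(phrase)
#     # max_len bounds how far a matcher must look ahead from this first syllable
#     hard_dict[key].max_len = max(hard_dict[key].max_len, len(syllables))
#
# add_phrase("an ninh")
# add_phrase("an toàn thông tin")
# print(hard_dict["an"].max_len)                          # 3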
def get_middle_names(self) -> set:
    if self.__middle_names is None:
        filename = get_data_file("middle_names.txt", folder="vietnamese")
        logging.info("%s loaded" % filename)
        self.__middle_names = set(normalize(x) for x in readlines(filename))
    return self.__middle_names
def get_family_names(self) -> set:
    if self.__family_names is None:
        filename = get_data_file("family_names.txt", folder="vietnamese")
        self.__family_names = set(
            normalize(x) for x in readlines(filename) if len(x.strip()) > 0
        )
    return self.__family_names
def init_db_caching():
    from language_vn.vocabulary.database import VocalSqlLite

    cli = VocalSqlLite()
    word_list = cli.fetchAll("select * from Dictionary")
    for row in word_list:
        row["word"] = normalize_vn(normalize(row["word"]))
    df = pd.DataFrame(word_list)
    create_word_token_db(df)
    create_entity_dic_v2(df)
def get_short_word(self) -> set:
    if self.__short_word is None:
        filename = get_data_file("short_words.txt", folder="vietnamese")
        logging.info("%s loaded" % filename)
        self.__short_word = set(
            normalize(x) for x in readlines(filename) if len(x.strip()) > 0
        )
    return self.__short_word
def insert(self, word):
    try:
        if word.get("pos_tag") is None:
            return
        w_name = word.get("word")
        x = normalize(w_name.lower())
        self.__dict.setdefault(x[0], {})[x] = word.get("pos_tag")
        logging.info("Vocabulary::Insert (%s value %s)" % (word.get("word"), self.lookup(x)))
    except Exception as e:
        logging.error(str(e), exc_info=True)
def remove(self, word): try: w_name = word.get("word") x = normalize(w_name.lower()) if self.__dict.get(x[0]) is None: return if self.__dict.get(x[0]).get(x) is None: return del self.__dict[x[0]][x] logging.info("Vocabulary::Remove (%s) " % (word.get("word"))) except Exception as e: logging.error(str(e), exc_info=True)
def create_entity_dict(word_list):
    cur_dir = conf.vocobulary_path
    named_vocal_bin = path.join(cur_dir, "entity_named.dic.bin")
    result = [x for x in word_list if x.get("named") is not None and len(x.get("named")) > 0]
    logging.info("Total results with a named field: %s" % len(result))
    dct = {i: set() for i in ENTITY.values()}
    letters = set([r.get("word")[0] for r in result if len(r.get("word")) > 1])
    letters = [i.lower() for i in letters if i.isalpha()]
    for r in result:
        if r.get("named") is None or len(r.get("named")) < 1:
            continue
        wwl = normalize(r.get("word").lower())
        for v in r.get("named"):
            dct[v].add(wwl)
    counter, dct_words, root = 0, defaultdict(int), StringTrie()
    for feature in dct:
        for x in dct[feature]:
            try:
                if not x[0].isalpha():
                    continue
                ww = x.split()
                first_word = ww[0]
                # Track the longest entity (in syllables) starting with each first word
                if dct_words[first_word] < len(ww):
                    dct_words[first_word] = len(ww)
                bits = get_bits(dct, x)
                root[x] = bits
                counter += 1
            except Exception as e:
                logging.error(str(e), exc_info=True)
                exit()
    dictionary = {"root": root, "length": counter, "start_sylls": dct_words}
    logging.info("Total defined names : %s" % counter)
    dump_to_file(dictionary, named_vocal_bin)
    logging.info("Save as : %s" % named_vocal_bin)
    return True
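# A hedged sketch of how the pickled trie could be queried at tagging time,
# assuming StringTrie is pygtrie.StringTrie with a space separator (the project's
# actual trie class and bit layout may differ).
#
# import pygtrie
#
# root = pygtrie.StringTrie(separator=" ")
# root["anthony"] = 0b0001          # e.g. a PERSON bit from get_bits
# root["anthony joshua"] = 0b0001   # the longest stored entry wins at lookup
#
# step = root.longest_prefix("anthony joshua fights tonight")
# if step:
#     print(step.key, bin(step.value))   # 'anthony joshua' 0b1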
def create_vocabulary_keywords(word_list):
    result = [x for x in word_list if x.get("weight") is not None and x.get("weight") > 0]
    logging.info("Total words with weight > 0: %s" % len(result))
    dct = {}
    for x in result:
        word = normalize(x.get("word")).replace(" ", "_")
        if len(word) < 1:
            continue
        dct[word] = x.get("weight")
    output = path.join(conf.vocobulary_path, "word_weight.bin")
    dump_to_file(dct, output)
    logging.info("Save as : %s" % output)
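# Hedged usage sketch: reading word_weight.bin back. dump_to_file is assumed to
# be a plain pickle writer here; the real helper may add compression or a codec.
#
# import pickle
#
# with open("word_weight.bin", "rb") as fh:      # path shortened for illustration
#     word_weight = pickle.load(fh)
#
# print(word_weight.get("an_toàn", 0))           # 0 when a keyword is unknown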
def lookup(self, token):
    x = normalize(token)
    if not x:  # guard: an empty normalized token has no first letter
        return None
    return self.__dict.get(x[0], {}).get(x)
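# Standalone sketch of the first-letter bucketing shared by insert/lookup/remove:
# the outer dict keys on the word's first character, so each lookup only touches
# a small bucket (normalize is stubbed out here).
#
# pos_dict = {}
#
# def insert(word, pos_tags):
#     pos_dict.setdefault(word[0], {})[word] = pos_tags
#
# def lookup(word):
#     return pos_dict.get(word[0], {}).get(word)
#
# insert("an", ["A"])
# print(lookup("an"))   # ['A']
# print(lookup("ba"))   # None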
def __init__(self, token, tag=None, debug=False):
    self.__is_hard_dict = False
    self.__debug = debug
    self.__form = token.strip()  # original token text
    self.__word = token.lower()  # lower-cased token
    self.__word_is_vn = False    # whether the word is a Vietnamese syllable
    wws = normalize(self.__word).strip().split()
    self.__key_node = self.__word if len(wws) < 1 else wws[0]
    self.__tag = tag
    self.__label = "<X>"
    self.__length = len(token.split("_"))
    self.vi = True
    # Nearest preceding prefix label, e.g. Công_ty X X X X X An_Tâm:
    # which labeled prefix sits closest before this word
    self.lpnl = "<X>"
    # Feature fixes - supervision for NER
    self.props = {}
    # Build features following VLSP 2016
    shapes = ["U" if c.istitle() else "L" for c in self.__form]
    self.shape = "".join(shapes[:4])
    self.shaped = "".join(shapes[:2])
    try:
        upper_shape = ["U" if c[0].istitle() else "L" for c in self.__form.split("_")]
    except Exception:
        upper_shape = ["L", "L"]
    self.ti_shape = "".join(upper_shape[:4])
    self.ti_shaped = "".join(upper_shape[:2])
    self.ti = self.__form.istitle()
    if self.__form[0].istitle():
        # The token only counts as Vietnamese if every syllable is Vietnamese
        syllables_vi = [syllable_is_vietnamese(c) for c in self.__form.split("_")]
        if any(not c for c in syllables_vi):
            self.vi = False
    self.mix = bool(MIX_CASE.search(self.__form))
    self.acronym = bool(ACRONYM.search(self.__form))
    self.hyp = bool(HYPHEN.search(self.__form))
    self.da = bool(DATE.search(self.__form))
    self.na = isName(self.__form)
    self.ed = bool(ENDS_WITH_DIGIT.search(self.__form))
    self.wei = bool(UNIT.search(self.__form))
    self.digits_alphabet = get_digit_and_alphabet(self.__form)
    self.digits_hyphens = get_digit_and_char(self.__form, "-")
    self.digits_backslash = get_digit_and_char(self.__form, "/")
    self.digits_comma = get_digit_and_char(self.__form, ",")   # d&, digits and comma, "10,000"
    self.digits_period = get_digit_and_char(self.__form, ".")  # d&. digits and period, "10.000"
    self.wp = self.word_upper()
    self.au = self.__form.isupper()
    self.al = self.__form.islower()
    self.ad = self.__form.isdigit()
    self.d_2d = self.__length == 2 and self.ad
    self.d_4d = self.__length == 4 and self.ad
    self.ao = get_all_other(self.__form)
    self.cu = contains_upper(self.__form)
    self.cl = contains_lower(self.__form)
    self.ca = contains_alpha(self.__form)
    self.cd = contains_digit(self.__form)
    self.cs = contains_symbol(self.__form)
    self.plu = contains_plus(self.__form)
    self.sc = bool(SCORE.search(self.__form))
    self.ugram = self.__length >= 3
    if self.da or self.ad:
        return
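# Illustration of the VLSP-style shape features computed above, on a sample token.
#
# token = "Hà_Nội"
# shapes = ["U" if c.istitle() else "L" for c in token]
# print("".join(shapes[:4]))        # 'ULLU' -- character shape of the first 4 chars
# upper_shape = ["U" if w[0].istitle() else "L" for w in token.split("_")]
# print("".join(upper_shape[:2]))   # 'UU'   -- per-syllable capitalization shape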
def build_node_key(self, w: str):
    # First syllable of the normalized word: the bucket key for __hard_dict
    return normalize(w).replace("_", " ").split()[0]
def to_key(self, w: str):
    # Full normalized, space-separated form: the lookup key itself
    return normalize(w).replace("_", " ").strip()
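# Sketch of how the two key helpers relate (normalize is stubbed out here):
# to_key yields the full lookup key, build_node_key the first-syllable bucket key.
#
# w = "An_Toàn_Thông_Tin"
# flat = w.replace("_", " ").strip()   # what to_key returns after normalize
# print(flat)                          # 'An Toàn Thông Tin'
# print(flat.split()[0])               # 'An' -- the node (bucket) key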