예제 #1
0
 def remove(self, word):
     """Remove one entry from the store selected by its ``"tokenlize"`` type.

     ``word`` is read with ``.get``: ``"word"`` is the text (passed through
     ``normalize``) and ``"tokenlize"`` selects the store:

     * 1 -> ``__vn_dict``, 2 -> ``__given_names``, 3 -> ``__location``,
       4 -> ``__geo``;
     * 5 -> ``__hard_dict``, keyed by the first whitespace-separated piece
       of the normalized text;
     * anything else -> ``__vn_dict``, with an extra "Disable" log line.

     Removing a text that is not stored is a no-op for every type (the
     original only guarded the geo branch). Exceptions are logged and
     swallowed; nothing is returned.
     """
     try:
         x = normalize(word.get("word"))
         token_type = word.get("tokenlize")
         if token_type == 1:
             if x in self.__vn_dict:
                 self.__vn_dict.remove(x)
         elif token_type == 2:
             if x in self.__given_names:
                 self.__given_names.remove(x)
         elif token_type == 3:
             if x in self.__location:
                 self.__location.remove(x)
         elif token_type == 4:
             if x in self.__geo:
                 self.__geo.remove(x)
         elif token_type == 5:
             sl = x.split()
             # An empty normalized text has no key, so there is nothing to remove.
             if sl and sl[0] in self.__hard_dict:
                 key = sl[0]
                 entry = self.__hard_dict[key]
                 if x in entry.data:
                     entry.data.remove(x)
                 if len(entry.data) < 1:
                     del self.__hard_dict[key]
                     logging.info("del key (%s) " % key)
                 else:
                     # Recompute the longest remaining entry (in pieces);
                     # the previous update could only ever grow max_len,
                     # which is the wrong direction after a removal.
                     entry.max_len = max(len(w.split()) for w in entry.data)
         else:
             if x in self.__vn_dict:
                 self.__vn_dict.remove(x)
             logging.info("Vocabulary::Disable (word=%s,token_type=%s) " %
                          (x, token_type))
         logging.info("Vocabulary::Remove (%s) " % (word.get("word")))
     except Exception as e:
         # Best-effort API: callers are never interrupted by a failed removal.
         logging.error(str(e), exc_info=True)
예제 #2
0
def create_token_type_db(word_list):
    """Build the first-letter-bucketed word -> pos_tag table and dump it.

    Args:
        word_list: iterable of mappings read with ``.get``; the keys used
            are ``"word"`` and ``"pos_tag"``.

    Entries without a non-empty ``"pos_tag"`` are ignored. A bucket is
    created for the lower-cased first character of every tagged word whose
    first character is alphabetic. Each word is then stored under
    ``normalize(word)`` in the bucket named by the first character of that
    normalized form; when no such bucket exists the word is reported through
    ``logging.error`` and skipped. The result is written with
    ``dump_to_file`` to ``word_types.bin`` under ``conf.vocobulary_path``.

    Returns:
        True once the file has been written.
    """
    bin_vocal = path.join(conf.vocobulary_path, "word_types.bin")

    tagged = [x for x in word_list if x.get("pos_tag")]
    logging.info("Len word exist pos_tag : %s" % len(tagged))

    # One bucket per alphabetic first letter; rows with an empty word are
    # skipped here instead of crashing the whole build on word[0].
    dictionary = {}
    for r in tagged:
        word = r.get("word")
        if word and word[0].isalpha():
            dictionary.setdefault(word[0].lower(), {})

    # Later rows win when two words normalize to the same key.
    dct = {
        normalize(r.get("word")): r.get("pos_tag")
        for r in tagged
        if r.get("word")
    }

    for x, tags in dct.items():
        # x[:1] is "" for an empty key, which never names a bucket.
        bucket = dictionary.get(x[:1])
        if bucket is None:
            logging.error(x)
            continue
        bucket[x] = tags

    dump_to_file(dictionary, bin_vocal)
    logging.info("Save as : %s" % bin_vocal)
    return True
예제 #3
0
 def insert(self, word):
     """Add one entry to the store selected by its ``"tokenlize"`` type.

     ``word`` is read with ``.get``: ``"word"`` is the text (passed through
     ``normalize``) and ``"tokenlize"`` selects the store:

     * 1 -> ``__vn_dict``, 2 -> ``__given_names``, 3 -> ``__location``,
       4 -> ``self.geo``;
     * 5 -> ``__hard_dict``: the entry is filed under the first
       whitespace-separated piece of the normalized text, and that key's
       ``max_len`` is raised to the entry's piece count when it is larger.

     Any other type stores nothing but is still logged as added.
     Exceptions are logged and swallowed; nothing is returned.
     """
     try:
         x = normalize(word.get("word"))
         token_type = word.get("tokenlize")
         if token_type == 1:
             self.__vn_dict.add(x)
         elif token_type == 2:
             self.__given_names.add(x)
         elif token_type == 3:
             self.__location.add(x)
         elif token_type == 4:
             # NOTE(review): remove() touches self.__geo directly while this
             # goes through self.geo - presumably a property over the same
             # set; confirm.
             self.geo.add(x)
         elif token_type == 5:
             sl = x.split()
             key = sl[0]
             if key not in self.__hard_dict:
                 self.__hard_dict[key] = Struct(max_len=0, data=set([]))
             entry = self.__hard_dict[key]
             entry.data.add(x)
             if entry.max_len < len(sl):
                 entry.max_len = len(sl)
             # Was a bare print(); keep the diagnostic but off stdout.
             logging.debug("Vocabulary::hard_dict[%s] = %s", key, entry)
         logging.info("Vocabulary::Add (%s) " % (word.get("word")))
     except Exception as e:
         # Best-effort API: callers are never interrupted by a failed insert.
         logging.error(str(e), exc_info=True)
예제 #4
0
 def get_middle_names(self) -> set:
     """Return the set of normalized middle names, loading it on first use.

     The names are read from ``middle_names.txt`` (resolved through
     ``get_data_file`` in the ``vietnamese`` folder), each line is passed
     through ``normalize``, and the resulting set is cached on the instance.
     """
     if self.__middle_names is None:
         filename = get_data_file("middle_names.txt", folder="vietnamese")
         logging.info("%s loaded" % filename)
         # Skip blank lines like get_family_names/get_short_word do, so a
         # blank line does not contribute an entry of its own.
         self.__middle_names = {
             normalize(x) for x in readlines(filename) if len(x.strip()) > 0
         }
     return self.__middle_names
예제 #5
0
 def get_family_names(self) -> set:
     """Return the cached set of normalized family names.

     On the first call the names are read from ``family_names.txt``
     (resolved through ``get_data_file`` in the ``vietnamese`` folder);
     blank lines are skipped and every other line is passed through
     ``normalize``.
     """
     if self.__family_names is not None:
         return self.__family_names

     source = get_data_file("family_names.txt", folder="vietnamese")
     names = set()
     for line in readlines(source):
         if line.strip():
             names.add(normalize(line))
     self.__family_names = names
     return self.__family_names
예제 #6
0
def init_db_caching():
    """Rebuild the cached vocabulary files from the ``Dictionary`` table.

    Every row fetched through ``VocalSqlLite`` has its ``"word"`` field
    rewritten to ``normalize_vn(normalize(word))``; the rows are then
    wrapped in a DataFrame that is handed to ``create_word_token_db`` and
    ``create_entity_dic_v2`` in that order.
    """
    from language_vn.vocabulary.database import VocalSqlLite

    rows = VocalSqlLite().fetchAll("select * from Dictionary")
    for row in rows:
        row["word"] = normalize_vn(normalize(row["word"]))

    frame = pd.DataFrame(rows)
    create_word_token_db(frame)
    create_entity_dic_v2(frame)
예제 #7
0
 def get_short_word(self) -> set:
     """Return the cached set of normalized short words.

     On the first call the words are read from ``short_words.txt``
     (resolved through ``get_data_file`` in the ``vietnamese`` folder);
     blank lines are skipped and every other line is passed through
     ``normalize``.
     """
     if self.__short_word is not None:
         return self.__short_word

     source = get_data_file("short_words.txt", folder="vietnamese")
     logging.info("%s loaded" % source)
     non_blank = (line for line in readlines(source) if line.strip())
     self.__short_word = {normalize(line) for line in non_blank}
     return self.__short_word
예제 #8
0
 def insert(self, word):
     """Store a word's ``"pos_tag"`` in the first-character-bucketed index.

     ``word`` is read with ``.get``. Nothing happens when ``"pos_tag"`` is
     None. Otherwise the lower-cased, normalized ``"word"`` becomes the key
     inside the bucket named by its first character (created on demand).
     Exceptions are logged and swallowed.
     """
     try:
         tags = word.get("pos_tag")
         if tags is None:
             return

         x = normalize(word.get("word").lower())
         head = x[0]
         bucket = self.__dict.get(head)
         if bucket is None:
             bucket = dict()
             self.__dict[head] = bucket
         bucket[x] = tags

         message = "Vocabulary::Insert (%s value %s) " % (word.get("word"), self.lookup(x))
         logging.info(message)
     except Exception as e:
         logging.error(str(e), exc_info=True)
예제 #9
0
 def remove(self, word):
     """Delete a word from the first-character-bucketed index.

     The key is the lower-cased, normalized ``"word"`` field. The call is a
     silent no-op when the bucket for the key's first character is missing
     or the key has no stored value. Exceptions are logged and swallowed.
     """
     try:
         x = normalize(word.get("word").lower())
         bucket = self.__dict.get(x[0])
         if bucket is None or bucket.get(x) is None:
             return
         del bucket[x]
         logging.info("Vocabulary::Remove (%s) " % (word.get("word")))
     except Exception as e:
         logging.error(str(e), exc_info=True)
예제 #10
0
def create_entity_dict(word_list):
    """Build the named-entity lookup structure and dump it to disk.

    Args:
        word_list: iterable of mappings read with ``.get``; the keys used
            are ``"word"`` and ``"named"`` (an iterable of labels, each of
            which must be one of ``ENTITY.values()``).

    Rows without a non-empty ``"named"`` are ignored. Every remaining word
    is lower-cased, normalized and grouped per label. For each grouped word
    whose first character is alphabetic:

    * ``root[word]`` is set to ``get_bits(groups, word)``;
    * ``start_sylls[first piece]`` tracks the largest number of
      whitespace-separated pieces seen among words starting with it;
    * ``length`` is incremented (a word listed under several labels is
      counted once per label).

    The dict ``{"root", "length", "start_sylls"}`` is written with
    ``dump_to_file`` to ``entity_named.dic.bin`` under
    ``conf.vocobulary_path``.

    Returns:
        True once the file has been written.

    Raises:
        Any exception raised while indexing a word is logged and re-raised.
        The previous version called ``exit()`` here, which ended the whole
        process with a success status.
    """
    named_vocal_bin = path.join(conf.vocobulary_path, "entity_named.dic.bin")

    result = [x for x in word_list if x.get("named")]
    logging.info("Total result exist named: %s" % len(result))

    dct = {i: set([]) for i in ENTITY.values()}
    for r in result:
        wwl = normalize(r.get("word").lower())
        for v in r.get("named"):
            dct[v].add(wwl)

    counter, dct_words, root = 0, defaultdict(int), StringTrie()
    for words in dct.values():
        for x in words:
            # Empty keys have no first character; non-alphabetic ones are
            # deliberately left out of the trie.
            if not x or not x[0].isalpha():
                continue
            ww = x.split()
            try:
                # defaultdict(int) yields 0 for an unseen first piece.
                dct_words[ww[0]] = max(dct_words[ww[0]], len(ww))
                root[x] = get_bits(dct, x)
                counter += 1
            except Exception as e:
                logging.error(str(e), exc_info=True)
                raise

    dictionary = {"root": root, "length": counter, "start_sylls": dct_words}
    logging.info("Total defined names : %s" % counter)
    dump_to_file(dictionary, named_vocal_bin)
    logging.info("Save as : %s" % named_vocal_bin)
    return True
예제 #11
0
def create_vocabulary_keywords(word_list):
    """Dump a ``word -> weight`` table for every positively weighted word.

    Rows of ``word_list`` are read with ``.get``; only those whose
    ``"weight"`` is not None and greater than zero are kept. The key is the
    normalized ``"word"`` with spaces replaced by underscores, and rows
    whose key comes out empty are dropped. The table is written with
    ``dump_to_file`` to ``word_weight.bin`` under ``conf.vocobulary_path``.
    """
    weighted = []
    for item in word_list:
        weight = item.get("weight")
        if weight is not None and weight > 0:
            weighted.append(item)
    print(">> Total words weight> 0 : %s" % len(weighted))

    weights_by_word = {}
    for item in weighted:
        key = normalize(item.get("word")).replace(" ", "_")
        if len(key) >= 1:
            weights_by_word[key] = item.get("weight")

    target = path.join(conf.vocobulary_path, "word_weight.bin")
    dump_to_file(weights_by_word, target)
    logging.info("Save as : %s" % target)
예제 #12
0
 def lookup(self, token):
     """Return the value stored for ``token``, or None when it is unknown.

     The token is passed through ``normalize`` and searched in the bucket
     named by the first character of the normalized form. A token that
     normalizes to an empty string yields None instead of raising
     IndexError.
     """
     x = normalize(token)
     if not x:
         return None
     # NOTE(review): insert()/remove() lower-case the word before
     # normalizing and this does not - confirm normalize() already folds case.
     return self.__dict.get(x[0], {}).get(x)
예제 #13
0
    def __init__(self, token, tag=None, debug=False):
        """Wrap a token and precompute its surface-form features.

        Args:
            token: raw token text. It is split on "_" to count its parts,
                so multi-part tokens are presumably underscore-joined
                syllables - confirm against the tokenizer.
            tag: stored unchanged on the instance.
            debug: stored unchanged on the instance.

        NOTE(review): an empty or whitespace-only ``token`` raises
        IndexError at the ``self.__form[0]`` check below.
        """
        self.__is_hard_dict = False
        self.__debug = debug
        self.__form = token.strip()  # original token text, whitespace-stripped
        self.__word = token.lower()  # lower-cased token (not stripped)
        self.__word_is_vn = False  # whether the word is a Vietnamese syllable
        # Key node: first whitespace-separated piece of the normalized
        # lower-cased token, or the lower-cased token itself if there is none.
        wws = normalize(self.__word).strip().split()
        if len(wws) < 1:
            self.__key_node = self.__word
        else:
            self.__key_node = wws[0]
        self.__tag = tag
        self.__label = "<X>"
        self.__length = len(token.split("_"))  # number of "_"-separated parts
        self.vi = True
        self.lpnl = "<X>"  # label of the nearest preceding prefix, e.g. in "Công_ty X X X X X An_Tâm": which prefix entry is closest to this word
        # Feature fix - supervision-for-NER
        self.props = {}

        # Build the features following VLSP 2016
        # Per-character shape: "U" for a title-case character, "L" otherwise.
        shapes = ["U" if c.istitle() else "L" for c in self.__form]
        self.shape = "".join(shapes[:4])
        self.shaped = "".join(shapes[:2])
        # Per-part shape, based on the first character of each "_" part.
        # An empty part (e.g. from "a__b") makes c[0] fail, and the fallback
        # then replaces the whole shape with "LL".
        try:
            upper_shape = ["U" if c[0].istitle() else "L" for c in self.__form.split("_")]
        except:
            upper_shape = "LL"
        self.ti_shape = "".join(upper_shape[:4])
        self.ti_shaped = "".join(upper_shape[:2])
        self.ti = self.__form.istitle()

        # For tokens starting with a title-case character, clear `vi` as
        # soon as any "_" part fails syllable_is_vietnamese.
        if self.__form[0].istitle():
            # print(">> self.__form.istitle(): ", self.__form)
            syllabels_vi = [syllable_is_vietnamese(c) for c in self.__form.split("_")]
            syllabels_vi = [c for c in syllabels_vi if not c]
            if len(syllabels_vi) > 0:
                self.vi = False

        # Pattern-based flags: True when the module-level pattern matches
        # somewhere in the original form.
        self.mix = True if MIX_CASE.search(self.__form) else False
        self.acronym = True if ACRONYM.search(self.__form) else False
        self.hyp = True if HYPHEN.search(self.__form) else False
        self.da = True if DATE.search(self.__form) else False
        self.na = isName(self.__form)
        self.ed = True if ENDS_WITH_DIGIT.search(self.__form) else False
        self.wei = True if UNIT.search(self.__form) else False
        self.digits_alphabet = get_digit_and_alphabet(self.__form)
        self.digits_hyphens = get_digit_and_char(self.__form, "-")
        self.digits_backslash = get_digit_and_char(self.__form, "/")
        # d&, contains digits and comma, e.g. "10,000"
        self.digits_comma = get_digit_and_char(self.__form, ",")
        # d&. contains digits and period, e.g. "10.000"
        self.digits_period = get_digit_and_char(self.__form, ".")
        self.wp = self.word_upper()
        self.au = self.__form.isupper()
        self.al = self.__form.islower()
        self.ad = self.__form.isdigit()
        # NOTE(review): __length counts "_"-separated parts, and a form that
        # contains "_" is never isdigit(), so these two flags look like they
        # can never be True - confirm whether character length was intended.
        self.d_2d = self.__length == 2 and self.ad
        self.d_4d = self.__length == 4 and self.ad
        self.ao = get_all_other(self.__form)
        self.cu = contains_upper(self.__form)
        self.cl = contains_lower(self.__form)
        self.ca = contains_alpha(self.__form)
        self.cd = contains_digit(self.__form)
        self.cs = contains_symbol(self.__form)
        self.plu = contains_plus(self.__form)
        self.sc = True if SCORE.search(self.__form) else False
        self.ugram = self.__length >= 3  # three or more "_"-separated parts
        # Early exit for date-like or all-digit tokens; nothing follows this
        # guard in the visible part of the method.
        if self.da or self.ad:
            return
예제 #14
0
 def build_node_key(self, w: str):
     """Return the first piece of ``w`` after normalizing and treating "_" as a space.

     Raises IndexError when nothing is left after splitting.
     """
     spaced = normalize(w).replace("_", " ")
     pieces = spaced.split()
     return pieces[0]
예제 #15
0
 def to_key(self, w: str):
     """Return ``w`` normalized, with "_" replaced by spaces and outer whitespace stripped."""
     spaced = normalize(w).replace("_", " ")
     return spaced.strip()