예제 #1
0
 def reload(self):
     """Rebuild the RDR rule tree and reopen the CRF tagger from data files.

     NOTE(review): when the model file is missing this only logs an error;
     the tagger is still asked to open the same path afterwards.
     """
     rules_path = get_data_file("named.rdr", folder="vietnamese")
     tree = SCRDRTree()
     self.__root = tree
     tree.constructSCRDRtreeFromRDRfile(rules_path)

     model_path = get_data_file("ner.crf.bin", folder="models")
     if not path.isfile(model_path):
         logging.error("Model %s not found " % model_path)

     tagger: pycrfsuite.Tagger = pycrfsuite.Tagger()
     self.__crf = tagger
     tagger.open(model_path)

     if not self.__debug:
         return
     label_text = str(tagger.labels())
     logging.info("Labels in model : %s" % label_text)
예제 #2
0
 def __init__(self, debug=False):
     """Load the RDR rule tree and the CRF model used by this tagger.

     Args:
         debug: Stored on the instance as the debug flag.

     Raises:
         SystemExit: With status 1 when the CRF model file does not exist.
     """
     cdr_file = get_data_file("named.rdr", folder="vietnamese")
     self.__root = SCRDRTree()
     self.__root.constructSCRDRtreeFromRDRfile(cdr_file)
     crf_file = get_data_file("ner.crf.bin", folder="models")
     if not path.isfile(crf_file):
         logging.error("Model %s not found " % crf_file)
         print("Model %s not found " % crf_file)
         # The bare ``exit()`` helper is injected by ``site`` for interactive
         # use and, called without arguments, reports success (status 0).
         # Raise SystemExit directly with a failing status instead.
         raise SystemExit(1)
     self.__crf: pycrfsuite.Tagger = pycrfsuite.Tagger()
     self.__crf.open(crf_file)
     self.__nlp = None
     self.__debug = debug
     self.__adapter: DocFeatures = DocFeatures()
     logging.info("Labels in model(Semi_Supervised_Doc_Ner) : %s" %
                  str(self.__crf.labels()))
예제 #3
0
 def get_middle_names(self) -> set:
     """Return the cached set of normalized middle names.

     On first use the set is built from every line of ``middle_names.txt``
     (no blank-line filtering is applied) and stored on the instance.
     """
     if self.__middle_names is not None:
         return self.__middle_names

     source = get_data_file("middle_names.txt", folder="vietnamese")
     logging.info("%s loaded" % source)
     self.__middle_names = {normalize(line) for line in readlines(source)}
     return self.__middle_names
예제 #4
0
 def add_given_name(self, word):
     """Register ``word`` as a given name if it is not known yet.

     The normalized form is checked against the cached given-name set; when
     absent, the raw ``word`` is appended to ``names.txt`` and the normalized
     form is added to the cache.

     Args:
         word: The name to add, as written by the caller.
     """
     x = normalize(word)
     # Make sure the given-name cache itself is populated before testing
     # membership (previously the unrelated vocabulary cache was loaded,
     # leaving ``__given_names`` possibly ``None``).
     self.get_given_name()
     if x not in self.__given_names:
         filename = get_data_file("names.txt", folder="vocals")
         self.write_append(word=word, filename=filename)
         self.__given_names.add(x)
예제 #5
0
 def add_hard_dict(self, word):
     """Append ``word`` to ``hard_dict.txt`` when its normalized form is new.

     NOTE(review): this uses ``in`` and ``.add`` on ``self.__hard_dict``, so
     it assumes a set-like container -- confirm against what
     ``get_hard_dict`` actually builds.
     """
     normalized = normalize(word)
     self.get_hard_dict()
     if normalized in self.__hard_dict:
         return

     target = get_data_file("hard_dict.txt", folder="vietnamese")
     self.write_append(filename=target, word=word)
     self.__hard_dict.add(normalized)
예제 #6
0
 def get_short_word(self) -> set:
     """Return the cached set of normalized entries from ``short_words.txt``.

     The file is read once, on first use; lines that are empty after
     stripping are skipped.
     """
     if self.__short_word is not None:
         return self.__short_word

     source = get_data_file("short_words.txt", folder="vietnamese")
     logging.info("%s loaded" % source)
     self.__short_word = {
         normalize(line) for line in readlines(source) if line.strip()
     }
     return self.__short_word
예제 #7
0
 def add_custom_dict_vn(self, word):
     """Append ``word`` to ``vocabulary_build.txt`` if it is not yet known.

     The normalized form is looked up in the vocabulary cache (loaded on
     demand); new words are written to the file and added to the cache.
     """
     normalized = normalize(word)
     self.get_vn_dict()
     if normalized in self.__vn_dict:
         return

     target = get_data_file("vocabulary_build.txt", folder="vocals")
     self.write_append(filename=target, word=word)
     self.__vn_dict.add(normalized)
예제 #8
0
 def get_family_names(self) -> set:
     """Return the cached set of normalized entries from ``family_names.txt``.

     The file is read once, on first use; lines that are empty after
     stripping are skipped.
     """
     if self.__family_names is not None:
         return self.__family_names

     source = get_data_file("family_names.txt", folder="vietnamese")
     self.__family_names = {
         normalize(line) for line in readlines(source) if line.strip()
     }
     return self.__family_names
예제 #9
0
 def _load(self):
     """Populate the instance from ``entity_named.dic.bin``.

     Reads the ``root``, ``start_sylls``, ``length`` and ``pros_map`` entries
     of the loaded object; ``pros_map`` is stored inverted (value -> key).

     Raises:
         SystemExit: With status 1 when the data file does not exist.
     """
     logging.info("{}.load()".format(self.__class__.__name__))
     filename = get_data_file("entity_named.dic.bin", folder="vocabulary")
     if not path.isfile(filename):
         logging.error("Please check {}".format(filename))
         # ``exit()`` is a ``site`` helper for interactive sessions and would
         # report success (status 0); raise SystemExit with a failing status.
         raise SystemExit(1)
     dct = load_from_file(filename)
     self.__root = dct.get("root")
     self.__start_sylls = dct.get("start_sylls")
     self.__length = dct.get("length")
     pros_map = dct.get("pros_map")
     # Swap keys and values of the stored mapping.
     self.__pros_map = {pros_map.get(K): K for K in pros_map}
     logging.info("{}.done(length={},map={})".format(self.__class__.__name__, self.__length, self.__pros_map))
예제 #10
0
 def get_given_name(self) -> set:
     """Return the cached set of normalized given names.

     On first use the set is assembled from ``names.txt`` and
     ``company.txt``; lines that are empty after stripping are skipped.
     """
     if self.__given_names is not None:
         return self.__given_names

     sources = {"names.txt", "company.txt"}
     collected = set()
     for basename in sources:
         source = get_data_file(basename, folder="vocals")
         logging.info("%s loaded" % source)
         collected.update(
             normalize(line) for line in readlines(source) if line.strip()
         )
     self.__given_names = collected
     return self.__given_names
예제 #11
0
    def __load(self):
        """Reset the cached dictionaries and reload them from the binary file.

        Every lazily loaded cache is first cleared, then the object stored in
        ``word_tokenlizer.bin`` is read and its ``vn_dict_ugram``,
        ``vn_dict``, ``given_names``, ``geo``, ``location`` and ``name_vn``
        entries are copied onto the instance. ``__hard_dict`` is not taken
        from the file and stays ``None`` after this call.

        Raises:
            SystemExit: With status 1 when the data file does not exist.
        """
        # Reset cached state.
        self.__vn_dict = None
        self.__short_word = None
        self.__location = None
        self.__location_lv_3 = None
        self.__vn_dict_ugram = None
        self.__first_sent_word = None
        self.__family_names = None
        self.__middle_names = None
        self.__max_ugram = 4
        self.__given_names = None
        self.__hard_dict = None

        filename = get_data_file("word_tokenlizer.bin", folder="vocabulary")
        if not path.isfile(filename):
            logging.error("%s not found. please check data..." % filename)
            # ``exit()`` is a ``site`` helper for interactive sessions and
            # would report success (status 0); raise SystemExit with a
            # failing status instead.
            raise SystemExit(1)

        dct = load_from_file(filename)

        # The "vn_dict_ugram" entry is unpacked as a pair:
        # (maximum n-gram length, n-gram dictionary).
        max_ugram, vn_dict_ugram = dct.get("vn_dict_ugram")
        self.__max_ugram = max_ugram
        self.__vn_dict_ugram = vn_dict_ugram
        logging.info("Size of (self.__vn_dict_ugram): %s" %
                     len(self.__vn_dict_ugram))

        self.__vn_dict = dct.get("vn_dict")
        logging.info("Size of (self.__vn_dict): %s" % len(self.__vn_dict))

        self.__given_names = dct.get("given_names")
        logging.info("Size of (self.__given_names): %s" %
                     len(self.__given_names))

        self.__geo = dct.get("geo")
        logging.info("Size of (self.__geo): %s" % len(self.__geo))

        self.__location = dct.get("location")
        logging.info("Size of (self.__location): %s" % len(self.__location))

        self.__tên_riêng = dct.get("name_vn")
        logging.info("Size of (self.__tên_riêng): %s" % len(self.__tên_riêng))
예제 #12
0
 def get_vn_dict(self) -> set:
     """Return the cached set of normalized vocabulary entries.

     On first use the set is assembled from the four vocabulary text files
     in the ``vocals`` folder; lines that are empty after stripping are
     skipped.
     """
     if self.__vn_dict is not None:
         return self.__vn_dict

     sources = {
         "vocabulary_standard.txt",
         "vocabulary_build.txt",
         "vocabulary.txt",
         "animal.txt",
     }
     collected = set()
     for basename in sources:
         source = get_data_file(basename, folder="vocals")
         logging.info("%s loaded" % source)
         collected.update(
             normalize(line) for line in readlines(source) if line.strip()
         )
     self.__vn_dict = collected
     return self.__vn_dict
예제 #13
0
    def get_hard_dict(self):
        """Return the cached hard dictionary, building it on first use.

        Normalized, non-blank lines of ``hard_dict.txt`` are grouped by their
        first two whitespace-separated tokens. Each group key maps to a dict
        with ``data`` (the set of full phrases in the group) and ``max_len``
        (the largest token count among them).
        """
        if self.__hard_dict is not None:
            return self.__hard_dict

        source = get_data_file("hard_dict.txt", folder="vietnamese")
        logging.info("%s loaded" % source)
        phrases = [
            normalize(line) for line in readlines(source) if line.strip()
        ]

        members, longest = {}, {}
        for phrase in phrases:
            tokens = phrase.split()
            prefix = " ".join(tokens[:2])
            members.setdefault(prefix, set()).add(phrase)
            longest[prefix] = max(longest.get(prefix, 0), len(tokens))

        self.__hard_dict = {
            prefix: dict(max_len=longest.get(prefix),
                         data=members.get(prefix))
            for prefix in members
        }
        return self.__hard_dict
예제 #14
0
def get_phó_từ():
    """Load ``photu.txt`` (presumably the adverb list) via ``data_loader``."""
    return data_loader(
        get_data_file("photu.txt", folder="vietnamese/dictionary"))
예제 #15
0
def get_tu_tieng_nuoc_ngoai_thong_dung():
    """Load ``tu_tieng_nuoc_ngoai_thong_dung.txt`` via ``data_loader``."""
    source = get_data_file(
        "tu_tieng_nuoc_ngoai_thong_dung.txt", folder="vietnamese")
    return data_loader(source)
예제 #16
0
def get_tu_don_chi_nam_trong_tu_ghep():
    """Load ``tu_don_chi_nam_trong_tu_ghep.txt`` via ``data_loader``."""
    source = get_data_file(
        "tu_don_chi_nam_trong_tu_ghep.txt", folder="vietnamese")
    return data_loader(source)
예제 #17
0
def get_english_words():
    """Return the lines of ``english.txt``, stripped and lower-cased."""
    source = get_data_file("english.txt")
    return [line.strip().lower() for line in readlines(source)]
예제 #18
0
def get_stop_words():
    """Load ``stopwords.txt`` from the ``vietnamese`` folder via ``data_loader``."""
    return data_loader(get_data_file("stopwords.txt", folder="vietnamese"))
예제 #19
0
def get_âm_tiết_việt_nam():
    """Return the lines of ``vietnamese-syllables.txt``, stripped and lower-cased."""
    source = get_data_file("vietnamese-syllables.txt", folder="vietnamese")
    return [line.strip().lower() for line in readlines(source)]
예제 #20
0
 def reload_rdr(self):
     """Replace the rule tree with a fresh one built from ``named.rdr``."""
     rules_path = get_data_file("named.rdr", folder="vietnamese")
     tree = SCRDRTree()
     # Install the new tree first, then fill it from the rule file.
     self.__root = tree
     tree.constructSCRDRtreeFromRDRfile(rules_path)
예제 #21
0
def get_tính_từ():
    """Load ``tinhtu.txt`` (presumably the adjective list) via ``data_loader``."""
    return data_loader(
        get_data_file("tinhtu.txt", folder="vietnamese/dictionary"))
예제 #22
0
def get_âm_tiết_đặt_tên_thông_dụng():
    """Load ``syll_names.txt`` from the ``vietnamese`` folder via ``data_loader``."""
    return data_loader(get_data_file("syll_names.txt", folder="vietnamese"))
예제 #23
0
def get_đại_từ_nhân_xưng():
    """Load ``danhtunhanxung.txt`` (presumably personal pronouns) via ``data_loader``."""
    source = get_data_file(
        "danhtunhanxung.txt", folder="vietnamese/dictionary")
    return data_loader(source)
예제 #24
0
def get_giới_từ():
    """Load ``gioi_tu.txt`` (presumably the preposition list) via ``data_loader``."""
    return data_loader(
        get_data_file("gioi_tu.txt", folder="vietnamese/dictionary"))
예제 #25
0
def get_liên_từ():
    """Load ``lientu.txt`` (presumably the conjunction list) via ``data_loader``."""
    return data_loader(
        get_data_file("lientu.txt", folder="vietnamese/dictionary"))
예제 #26
0
 def reload(self):
     """Replace the rule tree with a fresh one built from ``word_tokenlize.rdr``."""
     rules_path = get_data_file("word_tokenlize.rdr", folder="vietnamese")
     tree = SCRDRTree()
     # Install the new tree first, then fill it from the rule file.
     self.__root = tree
     tree.constructSCRDRtreeFromRDRfile(rules_path)
예제 #27
0
def get_động_từ():
    """Load ``dongtu.txt`` (presumably the verb list) via ``data_loader``."""
    return data_loader(
        get_data_file("dongtu.txt", folder="vietnamese/dictionary"))