def get_middle_names(self) -> set:
    """Return the cached set of middle names, loading it on first use.

    On the first call the file resolved by
    ``get_data_file("middle_names.txt", folder="vietnamese")`` is read, every
    non-blank line is passed through ``normalize`` and the resulting set is
    stored on the instance. Later calls return that same set object.

    Returns:
        set: The normalized entries of ``middle_names.txt``.
    """
    if self.__middle_names is None:
        filename = get_data_file("middle_names.txt", folder="vietnamese")
        logging.info("%s loaded" % filename)
        # Skip blank lines, as the sibling loaders do, so that a stray empty
        # line in the data file does not contribute an entry to the set.
        self.__middle_names = {
            normalize(line) for line in readlines(filename) if line.strip()
        }
    return self.__middle_names
def get_short_word(self) -> set:
    """Return the cached set of short words, building it on the first call.

    The entries come from the file resolved by
    ``get_data_file("short_words.txt", folder="vietnamese")``. Blank lines are
    ignored and every remaining line is passed through ``normalize``.

    Returns:
        set: The normalized entries of ``short_words.txt``.
    """
    words = self.__short_word
    if words is None:
        source_file = get_data_file("short_words.txt", folder="vietnamese")
        logging.info("%s loaded" % source_file)
        words = set()
        for raw in readlines(source_file):
            if raw.strip():
                words.add(normalize(raw))
        # Publish the set only once it has been fully populated.
        self.__short_word = words
    return words
def get_first_sent_word(self) -> set:
    """Return the cached set read from ``first_words.txt``, loading it once.

    The file is looked up in the instance's ``__cur_dir`` directory. Blank
    lines are dropped and the remaining lines are passed through
    ``normalize`` before being collected into a set.

    Returns:
        set: The normalized entries of ``first_words.txt``.
    """
    if self.__first_sent_word is not None:
        return self.__first_sent_word

    word_file = path.join(self.__cur_dir, "first_words.txt")
    logging.info("%s loaded" % word_file)
    non_blank = (line for line in readlines(word_file) if line.strip())
    self.__first_sent_word = set(map(normalize, non_blank))
    return self.__first_sent_word
def get_family_names(self) -> set:
    """Return the cached set of family names, loading it on first use.

    On the first call the file resolved by
    ``get_data_file("family_names.txt", folder="vietnamese")`` is read, blank
    lines are skipped, each remaining line is passed through ``normalize``
    and the resulting set is stored on the instance.

    Returns:
        set: The normalized entries of ``family_names.txt``.
    """
    if self.__family_names is None:
        filename = get_data_file("family_names.txt", folder="vietnamese")
        # Emit the same "loaded" message as the sibling loaders so that every
        # data file that gets read shows up in the log.
        logging.info("%s loaded" % filename)
        self.__family_names = {
            normalize(line) for line in readlines(filename) if line.strip()
        }
    return self.__family_names
def get_location(self) -> set:
    """Return the cached set of location entries, loading it on first use.

    The entries are the union of ``loc.lv2.txt`` and ``loc.lv2.fix.txt``,
    both looked up in the instance's ``__cur_dir`` directory. Blank lines are
    ignored and every other line is passed through ``normalize``.

    Returns:
        set: The normalized entries of both files.
    """
    if self.__location is None:
        files = {"loc.lv2.txt", "loc.lv2.fix.txt"}
        merged = set()
        for basename in files:
            full_path = path.join(self.__cur_dir, basename)
            logging.info("%s loaded" % full_path)
            merged.update(
                normalize(line) for line in readlines(full_path) if line.strip()
            )
        self.__location = merged
    return self.__location
def get_given_name(self) -> set:
    """Return the cached set built from ``names.txt`` and ``company.txt``.

    Both files are resolved with ``get_data_file(..., folder="vocals")``. On
    the first call their non-blank lines are passed through ``normalize`` and
    merged into one set, which is stored on the instance and returned by
    every later call.

    Returns:
        set: The normalized entries of both files.
    """
    if self.__given_names is not None:
        return self.__given_names

    collected = []
    for data_name in {"names.txt", "company.txt"}:
        data_path = get_data_file(data_name, folder="vocals")
        logging.info("%s loaded" % data_path)
        for entry in readlines(data_path):
            if entry.strip():
                collected.append(normalize(entry))
    self.__given_names = set(collected)
    return self.__given_names
def get_location_lv3(self) -> tuple:
    """Return ``(max_length, entries)`` for ``loc.lv3.txt``, loading it once.

    On the first call ``loc.lv3.txt`` is read from the instance's
    ``__cur_dir`` directory. Blank lines are skipped and the remaining lines
    are passed through ``normalize``. The result is cached on the instance.

    Returns:
        tuple: ``(max_length, entries)`` where ``entries`` is the set of
        normalized lines and ``max_length`` is the largest number of
        whitespace-separated tokens in any entry (``0`` when the file
        contributes no entries).
    """
    if self.__location_lv_3 is None:
        files = {"loc.lv3.txt"}
        entries = set()
        for f in files:
            filename = path.join(self.__cur_dir, f)
            logging.info("%s loaded" % filename)
            entries.update(
                normalize(x) for x in readlines(filename) if x.strip()
            )
        # default=0 keeps a file with no usable lines from raising ValueError.
        max_length = max((len(x.split()) for x in entries), default=0)
        self.__location_lv_3 = (max_length, entries)
    return self.__location_lv_3
def get_vn_dict(self) -> set:
    """Return the cached vocabulary set, building it on the first call.

    The set merges ``vocabulary_standard.txt``, ``vocabulary_build.txt``,
    ``vocabulary.txt`` and ``animal.txt``, each resolved with
    ``get_data_file(..., folder="vocals")``. Blank lines are skipped and the
    remaining lines are passed through ``normalize``.

    Returns:
        set: The normalized entries of all four files.
    """
    vocabulary = self.__vn_dict
    if vocabulary is None:
        sources = {
            "vocabulary_standard.txt",
            "vocabulary_build.txt",
            "vocabulary.txt",
            "animal.txt",
        }
        vocabulary = set()
        for source in sources:
            location = get_data_file(source, folder="vocals")
            logging.info("%s loaded" % location)
            vocabulary |= {
                normalize(row) for row in readlines(location) if row.strip()
            }
        self.__vn_dict = vocabulary
    return vocabulary
def get_hard_dict(self):
    """Return the cached prefix index built from ``hard_dict.txt``.

    On the first call the file resolved by
    ``get_data_file("hard_dict.txt", folder="vietnamese")`` is read, blank
    lines are skipped and each remaining line is passed through
    ``normalize``. Entries are grouped by their first two
    whitespace-separated tokens joined with a single space.

    Returns:
        dict: Maps each prefix to ``{"max_len": int, "data": set}``, where
        ``data`` holds every entry sharing that prefix and ``max_len`` is the
        largest token count among them.
    """
    if self.__hard_dict is not None:
        return self.__hard_dict

    source_file = get_data_file("hard_dict.txt", folder="vietnamese")
    logging.info("%s loaded" % source_file)
    phrases = [
        normalize(line) for line in readlines(source_file) if line.strip()
    ]

    grouped = {}
    for phrase in phrases:
        tokens = phrase.split()
        prefix = " ".join(tokens[:2])
        # One record per prefix replaces the parallel "entries" and "size" maps.
        bucket = grouped.setdefault(prefix, dict(max_len=0, data=set()))
        bucket["data"].add(phrase)
        bucket["max_len"] = max(bucket["max_len"], len(tokens))

    self.__hard_dict = grouped
    return self.__hard_dict