def initialize(self, dictionary=None):
    """Populate ``self.FREQ`` and ``self.total``, reusing a cache file if usable.

    When ``dictionary`` is truthy it is resolved with ``_get_abs_path`` and
    stored as ``self.dictionary``; the call returns immediately if that same
    path is already initialized.  Otherwise the current ``self.dictionary``
    is used as-is.

    The cache file is read when it exists and either the default dictionary
    is in use or the cache is newer than the dictionary file.  If the cache
    cannot be used, the data comes from
    ``self.gen_pfdict(self.get_dict_file())`` and is then written back to the
    cache.  A failure to write the cache is logged and does not abort
    initialization.

    Args:
        dictionary: Optional dictionary path.  Falsy means "keep using
            ``self.dictionary``".

    Returns:
        None.  On completion ``self.initialized`` is ``True``.
    """
    if dictionary:
        abs_path = _get_abs_path(dictionary)
        if self.dictionary == abs_path and self.initialized:
            return
        self.dictionary = abs_path
        self.initialized = False
    else:
        abs_path = self.dictionary

    with self.lock:
        # If a lock is registered for this dictionary (see the rebuild branch
        # below), acquire and release it so that we wait for its holder.
        try:
            with DICT_WRITING[abs_path]:
                pass
        except KeyError:
            pass
        if self.initialized:
            return

        default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
        t1 = time.time()

        if self.cache_file:
            cache_file = self.cache_file
        # default dictionary
        elif abs_path == DEFAULT_DICT:
            cache_file = "jieba.cache"
        # custom dictionary: one cache file per dictionary path
        else:
            cache_file = "jieba.u%s.cache" % md5(
                abs_path.encode('utf-8', 'replace')).hexdigest()
        cache_file = os.path.join(
            self.tmp_dir or tempfile.gettempdir(), cache_file)
        # os.path.join returns cache_file unchanged when it is absolute, so
        # take the directory from the final path rather than from tmp_dir.
        tmpdir = os.path.dirname(cache_file)

        load_from_cache_fail = True
        if os.path.isfile(cache_file) and (
                abs_path == DEFAULT_DICT or
                os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
            default_logger.debug("Loading model from cache %s" % cache_file)
            # NOTE(review): marshal.load trusts the file contents, and the
            # cache may live in a shared temp directory -- confirm this is
            # acceptable for the deployment.
            try:
                with open(cache_file, 'rb') as cf:
                    self.FREQ, self.total = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                # Unreadable cache: fall through and rebuild it.
                load_from_cache_fail = True

        if load_from_cache_fail:
            wlock = DICT_WRITING.get(abs_path, threading.RLock())
            DICT_WRITING[abs_path] = wlock
            try:
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(
                        self.get_dict_file())
                    default_logger.debug("Dumping model to file cache %s" % cache_file)
                    fpath = None
                    try:
                        # Create the temp file next to the cache file to
                        # prevent moving across different filesystems.
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")
                        # mkstemp leaves deletion to the caller: do not
                        # strand the temp file when the dump or move failed.
                        if fpath is not None:
                            try:
                                os.remove(fpath)
                            except OSError:
                                pass
            finally:
                # Unregister the lock even if the rebuild raised.
                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

        self.initialized = True
        default_logger.debug("Loading model cost %.3f seconds." % (time.time() - t1))
        default_logger.debug("Prefix dict has been built succesfully.")
def initialize(self, dictionary=None):
    """Populate ``self.FREQ`` and ``self.total``, reusing a cache file if usable.

    When ``dictionary`` is truthy it is resolved with ``_get_abs_path`` and
    stored as ``self.dictionary``; the call returns immediately if that same
    path is already initialized.  Otherwise the current ``self.dictionary``
    is used as-is.

    The cache file is read only when it exists and is newer than the
    dictionary file.  If the cache cannot be used, the data comes from
    ``self.gen_pfdict(abs_path)`` and is then written back to the cache.  A
    failure to write the cache is logged and does not abort initialization.

    Args:
        dictionary: Optional dictionary path.  Falsy means "keep using
            ``self.dictionary``".

    Returns:
        None.  On completion ``self.initialized`` is ``True``.
    """
    if dictionary:
        abs_path = _get_abs_path(dictionary)
        if self.dictionary == abs_path and self.initialized:
            return
        self.dictionary = abs_path
        self.initialized = False
    else:
        abs_path = self.dictionary

    with self.lock:
        # If a lock is registered for this dictionary (see the rebuild branch
        # below), acquire and release it so that we wait for its holder.
        try:
            with DICT_WRITING[abs_path]:
                pass
        except KeyError:
            pass
        if self.initialized:
            return

        default_logger.debug("Building prefix dict from %s ..." % abs_path)
        t1 = time.time()

        if self.cache_file:
            cache_file = self.cache_file
        # default dictionary
        elif abs_path == DEFAULT_DICT:
            cache_file = "jieba.cache"
        # custom dictionary: one cache file per dictionary path
        else:
            cache_file = "jieba.u%s.cache" % md5(
                abs_path.encode('utf-8', 'replace')).hexdigest()
        cache_file = os.path.join(
            self.tmp_dir or tempfile.gettempdir(), cache_file)
        # os.path.join returns cache_file unchanged when it is absolute, so
        # take the directory from the final path rather than from tmp_dir.
        tmpdir = os.path.dirname(cache_file)

        load_from_cache_fail = True
        cache_is_fresh = (
            os.path.isfile(cache_file) and
            os.path.getmtime(cache_file) > os.path.getmtime(abs_path))
        if cache_is_fresh:
            default_logger.debug(
                "Loading model from cache %s" % cache_file)
            # NOTE(review): marshal.load trusts the file contents, and the
            # cache may live in a shared temp directory -- confirm this is
            # acceptable for the deployment.
            try:
                with open(cache_file, 'rb') as cf:
                    self.FREQ, self.total = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                # Unreadable cache: fall through and rebuild it.
                load_from_cache_fail = True

        if load_from_cache_fail:
            wlock = DICT_WRITING.get(abs_path, threading.RLock())
            DICT_WRITING[abs_path] = wlock
            try:
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(abs_path)
                    default_logger.debug(
                        "Dumping model to file cache %s" % cache_file)
                    fpath = None
                    try:
                        # Create the temp file next to the cache file to
                        # prevent moving across different filesystems.
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")
                        # mkstemp leaves deletion to the caller: do not
                        # strand the temp file when the dump or move failed.
                        if fpath is not None:
                            try:
                                os.remove(fpath)
                            except OSError:
                                pass
            finally:
                # Unregister the lock even if the rebuild raised.
                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

        self.initialized = True
        default_logger.debug(
            "Loading model cost %.3f seconds." % (time.time() - t1))
        default_logger.debug("Prefix dict has been built succesfully.")
def initialize(self, DICTIONARY):
    """Populate ``self.words`` and ``self.freqs``, reusing ``dict.cache`` if usable.

    ``DICTIONARY`` is joined onto the current working directory (an absolute
    ``DICTIONARY`` is returned unchanged by ``os.path.join``) and stored as
    ``self.dictionary``.  Nothing else happens if ``self.initialized`` is
    already true.

    The cache file is read when it exists and either ``abs_path`` equals
    ``DEFAULT_DICT`` or the cache is newer than the dictionary file.  If the
    cache cannot be used, the data comes from
    ``self.gen_pfdict(self.get_dict_file())`` and is then written back to the
    cache.  A failure to write the cache is logged and does not abort
    initialization.

    Args:
        DICTIONARY: Dictionary file name or path.

    Returns:
        None.  On completion ``self.initialized`` is ``True``.
    """
    abs_path = os.path.join(os.getcwd(), DICTIONARY)
    self.dictionary = abs_path

    with self.lock:
        # If a lock is registered for this dictionary (see the rebuild branch
        # below), acquire and release it so that we wait for its holder.
        try:
            with DICT_WRITING[abs_path]:
                pass
        except KeyError:
            pass
        if self.initialized:
            return

        default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
        t1 = time.time()

        # NOTE(review): self.cache_file is consulted, but its value is then
        # unconditionally overwritten, so the cache is always "dict.cache"
        # relative to the current working directory.  Behaviour is kept
        # as-is here -- confirm whether self.cache_file should win instead.
        if self.cache_file:
            cache_file = self.cache_file
        cache_file = "dict.cache"

        # dirname of a bare file name is '', which makes mkstemp below create
        # its file relative to the current working directory as well.
        tmpdir = os.path.dirname(cache_file)

        load_from_cache_fail = True
        # Taken on later runs, once a usable cache file exists.
        if os.path.isfile(cache_file) and (
                abs_path == DEFAULT_DICT or
                os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
            default_logger.debug("Loading model from cache %s" % cache_file)
            # NOTE(review): marshal.load trusts the file contents -- confirm
            # the working directory is not writable by untrusted parties.
            try:
                with open(cache_file, 'rb') as cf:
                    # Deserialize the binary stream back into objects.
                    self.words, self.freqs = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                # Unreadable cache: fall through and rebuild it.
                load_from_cache_fail = True

        # Taken on the first run, or whenever the cache was unusable.
        if load_from_cache_fail:
            wlock = DICT_WRITING.get(abs_path, threading.RLock())
            DICT_WRITING[abs_path] = wlock
            try:
                with wlock:
                    self.words, self.freqs = self.gen_pfdict(
                        self.get_dict_file())
                    default_logger.debug("Dumping model to file cache %s" % cache_file)
                    fpath = None
                    try:
                        # mkstemp returns (OS-level file descriptor, path).
                        # Creating the file in tmpdir prevents moving across
                        # different filesystems.
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            # Serialize the objects to a binary stream.
                            marshal.dump(
                                (self.words, self.freqs), temp_cache_file)
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")
                        # mkstemp leaves deletion to the caller: do not
                        # strand the temp file when the dump or move failed.
                        if fpath is not None:
                            try:
                                os.remove(fpath)
                            except OSError:
                                pass
            finally:
                # Unregister the lock even if the rebuild raised.
                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

        self.initialized = True
        default_logger.debug("Loading model cost %.3f seconds." % (time.time() - t1))
        default_logger.debug("Prefix dict has been built succesfully.")
def _initialize(self, dictionary=None):
    """Populate ``self.yes``/``no``/``others``/``filtered`` as sets.

    When ``dictionary`` is truthy it is resolved with ``_get_abs_path`` and
    stored as ``self.dictionary``; the call returns immediately if that same
    path is already initialized.  Otherwise the current ``self.dictionary``
    is used as-is.

    The cache file is read when it exists and either the default dictionary
    is in use or the cache is newer than the dictionary file.  If the cache
    cannot be used, the four collections come from
    ``self._load_simple_dict(self.dictionary)`` and are then written back to
    the cache.  A failure to write the cache is logged and does not abort
    initialization.  Whichever path supplied them, the four attributes are
    converted to ``set`` before ``self.initialized`` is set.

    Args:
        dictionary: Optional dictionary path.  Falsy means "keep using
            ``self.dictionary``".

    Returns:
        None.  On completion ``self.initialized`` is ``True``.
    """
    if dictionary:
        abs_path = _get_abs_path(dictionary)
        if self.dictionary == abs_path and self.initialized:
            return
        self.dictionary = abs_path
        self.initialized = False
    else:
        abs_path = self.dictionary

    with self.lock:
        # If a lock is registered for this dictionary (see the rebuild branch
        # below), acquire and release it so that we wait for its holder.
        try:
            with DICT_WRITING[abs_path]:
                pass
        except KeyError:
            pass
        if self.initialized:
            return

        default_logger.debug(
            "Building from %s ..." % (abs_path or "the default dictionary")
        )
        t1 = time.time()

        if self.cache_file:
            cache_file = self.cache_file
        # default dictionary
        elif abs_path == DEFAULT_DICT:
            cache_file = "yn.cache"
        # custom dictionary: one cache file per dictionary path
        else:
            cache_file = (
                "yn.u%s.cache" % md5(abs_path.encode("utf-8", "replace")).hexdigest()
            )
        cache_file = os.path.join(self.tmp_dir or tempfile.gettempdir(), cache_file)
        # os.path.join returns cache_file unchanged when it is absolute, so
        # take the directory from the final path rather than from tmp_dir.
        tmpdir = os.path.dirname(cache_file)

        load_from_cache_fail = True
        if os.path.isfile(cache_file) and (
            abs_path == DEFAULT_DICT
            or os.path.getmtime(cache_file) > os.path.getmtime(abs_path)
        ):
            default_logger.debug("Loading model from cache %s" % cache_file)
            # NOTE(review): marshal.load trusts the file contents, and the
            # cache may live in a shared temp directory -- confirm this is
            # acceptable for the deployment.
            try:
                with open(cache_file, "rb") as cf:
                    self.yes, self.no, self.others, self.filtered = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                # Unreadable cache: fall through and rebuild it.
                load_from_cache_fail = True

        if load_from_cache_fail:
            wlock = DICT_WRITING.get(abs_path, threading.RLock())
            DICT_WRITING[abs_path] = wlock
            try:
                with wlock:
                    self.yes, self.no, self.others, self.filtered = self._load_simple_dict(
                        self.dictionary
                    )
                    default_logger.debug("Dumping model to file cache %s" % cache_file)
                    fpath = None
                    try:
                        # Create the temp file next to the cache file to
                        # prevent moving across different filesystems.
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        with os.fdopen(fd, "wb") as temp_cache_file:
                            marshal.dump(
                                (self.yes, self.no, self.others, self.filtered),
                                temp_cache_file,
                            )
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")
                        # mkstemp leaves deletion to the caller: do not
                        # strand the temp file when the dump or move failed.
                        if fpath is not None:
                            try:
                                os.remove(fpath)
                            except OSError:
                                pass
            finally:
                # Unregister the lock even if the rebuild raised.
                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

        # Normalize to sets regardless of which path supplied the data.
        self.yes = set(self.yes)
        self.no = set(self.no)
        self.filtered = set(self.filtered)
        self.others = set(self.others)

        self.initialized = True
        default_logger.debug("Loading dict cost %.3f seconds." % (time.time() - t1))
        default_logger.debug("simple dict has been built successfully.")
def initialize(self, dictionary=None):
    """Populate ``self.FREQ`` and ``self.total``, reusing a cache file if usable.

    ``abs_path`` is the absolute path of the dictionary.  When the caller
    passes ``dictionary`` it is resolved with ``_get_abs_path`` and stored as
    ``self.dictionary``; the call returns immediately if that same path is
    already initialized.  Otherwise the current ``self.dictionary`` is used
    as-is.

    The cache file is read when it exists and either the default dictionary
    is in use or the cache is newer than the dictionary file.  If the cache
    cannot be used, the data comes from
    ``self.gen_pfdict(self.get_dict_file())`` and is then written back to the
    cache.  A failure to write the cache is logged and does not abort
    initialization.

    Args:
        dictionary: Optional dictionary path.  Falsy means "keep using
            ``self.dictionary``".

    Returns:
        None.  On completion ``self.initialized`` is ``True``.
    """
    if dictionary:
        abs_path = _get_abs_path(dictionary)
        if self.dictionary == abs_path and self.initialized:
            # This dictionary is already loaded.
            return
        self.dictionary = abs_path
        self.initialized = False
    else:
        abs_path = self.dictionary

    # The whole load runs under self.lock.
    with self.lock:
        # The body is only `pass`, but entering the `with` acquires the lock
        # registered for this dictionary (see the rebuild branch below), so
        # this waits for whoever currently holds it.  No registered lock
        # means there is nothing to wait for.
        try:
            with DICT_WRITING[abs_path]:
                pass
        except KeyError:
            pass
        # Already initialized: nothing left to do.
        if self.initialized:
            return

        default_logger.debug("Building prefix dict from %s ..." % (abs_path or 'the default dictionary'))
        t1 = time.time()

        # Pick the cache file name.
        if self.cache_file:
            cache_file = self.cache_file
        # default dictionary
        elif abs_path == DEFAULT_DICT:
            cache_file = "jieba.cache"
        # custom dictionary: one cache file per dictionary path
        else:
            cache_file = "jieba.u%s.cache" % md5(
                abs_path.encode('utf-8', 'replace')).hexdigest()
        # tempfile.gettempdir() supplies a directory for temporary files when
        # self.tmp_dir is not set.
        cache_file = os.path.join(
            self.tmp_dir or tempfile.gettempdir(), cache_file)
        # os.path.join returns cache_file unchanged when it is absolute, so
        # take the cache directory from the final path.
        tmpdir = os.path.dirname(cache_file)

        load_from_cache_fail = True
        # Use the cache only if it is a regular file and either the default
        # dictionary is in use or the cache was modified after the custom
        # dictionary (os.path.getmtime gives the last-modification time).
        if os.path.isfile(cache_file) and (
                abs_path == DEFAULT_DICT or
                os.path.getmtime(cache_file) > os.path.getmtime(abs_path)):
            default_logger.debug("Loading model from cache %s" % cache_file)
            # NOTE(review): marshal.load trusts the file contents, and the
            # cache may live in a shared temp directory -- confirm this is
            # acceptable for the deployment.
            try:
                with open(cache_file, 'rb') as cf:
                    # marshal.dump / marshal.load store and restore Python
                    # objects.
                    self.FREQ, self.total = marshal.load(cf)
                load_from_cache_fail = False
            except Exception:
                # Unreadable cache: fall through and rebuild it.
                load_from_cache_fail = True

        # Cache unusable: rebuild from the dictionary file, then regenerate
        # the cache file.
        if load_from_cache_fail:
            # Register the lock under the dictionary path so that other
            # callers can find it through DICT_WRITING.
            wlock = DICT_WRITING.get(abs_path, threading.RLock())
            DICT_WRITING[abs_path] = wlock
            try:
                # Held for the rebuild and the cache write.
                with wlock:
                    self.FREQ, self.total = self.gen_pfdict(
                        self.get_dict_file())
                    default_logger.debug("Dumping model to file cache %s" % cache_file)
                    fpath = None
                    try:
                        # mkstemp returns a file descriptor and the path of
                        # the new file.  Creating it in the cache directory
                        # prevents moving across different filesystems.
                        fd, fpath = tempfile.mkstemp(dir=tmpdir)
                        # os.fdopen wraps the descriptor in a file object.
                        with os.fdopen(fd, 'wb') as temp_cache_file:
                            marshal.dump(
                                (self.FREQ, self.total), temp_cache_file)
                        # Move the finished temp file over cache_file.
                        _replace_file(fpath, cache_file)
                    except Exception:
                        default_logger.exception("Dump cache file failed.")
                        # mkstemp leaves deletion to the caller: do not
                        # strand the temp file when the dump or move failed.
                        if fpath is not None:
                            try:
                                os.remove(fpath)
                            except OSError:
                                pass
            finally:
                # Unregister the lock even if the rebuild raised.
                try:
                    del DICT_WRITING[abs_path]
                except KeyError:
                    pass

        # From here on self.FREQ and self.total have been assigned.
        self.initialized = True
        default_logger.debug("Loading model cost %.3f seconds." % (time.time() - t1))
        default_logger.debug("Prefix dict has been built successfully.")