def delete_sync_log(root, app_name):
    sync_folder = join(root, '.sync')
    log_path = join(sync_folder, '%s.log' % app_name)
    try:
        delete_file(log_path, to_trash=False)
    except:
        pass
def get_sync_cursor(root, app_name='farbox_bucket'):
    sync_folder = join(root, '.sync')
    cursor_path = join(sync_folder, '%s.cursor' % app_name)
    if os.path.isfile(cursor_path):
        with open(cursor_path) as f:
            return f.read()
    return ''
def delete_sync_cursor(root, app_name):
    sync_folder = join(root, '.sync')
    cursor_path = join(sync_folder, '%s.cursor' % app_name)
    try:
        delete_file(cursor_path, to_trash=False)
    except:
        pass
def get_path_with_dot_allowed(root, *keywords):
    possible_paths = []
    for keyword in keywords:
        possible_paths.append(join(root, '.%s' % keyword))
        possible_paths.append(join(root, keyword))
    path = None  # by default
    for path in possible_paths:
        if os.path.exists(path):
            return path
    return path
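# A minimal usage sketch for get_path_with_dot_allowed (the "site_root" path
# below is a hypothetical placeholder): the dot-prefixed (hidden) variant is
# preferred, and the plain name is the fallback.
#
#   template_folder = get_path_with_dot_allowed('/my/site_root', 'template')
#   # -> '/my/site_root/.template' if it exists, else '/my/site_root/template'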
def sync_find_files_to_delete(root_path, app_name, as_dict=False):
    sync_data_folder = get_sync_data_folder(root_path, app_name)
    if not os.path.isdir(sync_data_folder):  # never synced before
        return []
    files = sync_loop_local_filesystem(root_path, app_name=app_name, check_md5=False)  # same_slash already applied
    data_filenames = os.listdir(sync_data_folder)
    old_file_paths = []
    old_dir_paths = set()
    for data_filename in data_filenames:
        data_filepath = join(sync_data_folder, data_filename)
        try:
            with open(data_filepath) as f:
                data = json.loads(f.read())
            filepath = data.get('filepath')
            is_dir = data.get('is_dir', False)
            if data.get('is_relative'):
                filepath = join(root_path, filepath)
            if filepath:
                filepath = same_slash(filepath)
                old_file_paths.append(filepath)
                if is_dir:
                    old_dir_paths.add(filepath)
        except:
            pass
    _filepaths_to_delete = list(set(old_file_paths) - set(files))
    # put folder-type paths at the end, so plain files are deleted before their folders
    filepaths_to_delete = []
    dirs_to_delete = []
    for path in _filepaths_to_delete:
        # todo: try to detect whether the file lives on iCloud
        is_dir = path in old_dir_paths
        if not is_dir:
            filepaths_to_delete.append(path)
        else:
            dirs_to_delete.append(path)
    filepaths_to_delete += dirs_to_delete
    if as_dict:
        filepaths_to_delete_as_dict = []
        for filepath in filepaths_to_delete:
            is_dir = filepath in old_dir_paths
            filepaths_to_delete_as_dict.append(
                dict(path=filepath, filepath=filepath, is_dir=is_dir))
        return filepaths_to_delete_as_dict
    else:
        return filepaths_to_delete
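# With as_dict=True, the returned entries look like the following (paths are
# hypothetical), with folders ordered after plain files:
#
#   [{'path': '/root/a.md', 'filepath': '/root/a.md', 'is_dir': False},
#    {'path': '/root/old_dir', 'filepath': '/root/old_dir', 'is_dir': True}]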
def make_sure_archive_folder(filepath):
    # Make sure the .Archive folder corresponding to a filepath exists;
    # it is mainly used for version management.
    folder_path = same_slash(os.path.dirname(filepath))
    archive_path = join(folder_path, '.Archive')
    if not os.path.isdir(archive_path):
        os.makedirs(archive_path)
    return archive_path
def sync_loop_local_filesystem(root_path, app_name, check_md5=True, extra_should_sync_func=None):
    root_path = same_slash(root_path)
    if not os.path.isdir(root_path):  # the root folder does not exist, nothing to do
        return []
    file_paths = []
    for parent, folders, files in os.walk(root_path):
        if is_a_hidden_path(parent):
            continue
        elif not is_real(parent):  # skip symlinked folders
            continue
        for fs in [files, folders]:
            for filename in fs:
                filepath = join(parent, filename)
                # check whether it is already in the local sync database
                if not should_sync(filepath, root_path, app_name, check_md5,
                                   extra_should_sync_func=extra_should_sync_func):
                    continue
                file_paths.append(filepath)
    return file_paths
def __init__(self, server_node, root, private_key=None, should_encrypt_file=True,
             files_info_filepath=None, app_name_for_sync='farbox_bucket',
             should_sync_file_func=None, auto_clean_bucket=True):
    self.server_node = server_node
    self.root = root
    self.private_key = private_key
    self.should_encrypt_file = should_encrypt_file
    self.auto_clean_bucket = auto_clean_bucket
    self.files_info_filepath = files_info_filepath or join(root, '.files_info.json')
    files_info = load_json_file(self.files_info_filepath)
    if not isinstance(files_info, dict):
        files_info = {}
    ipfs_files = files_info.setdefault('files', {})
    if not isinstance(ipfs_files, dict):
        ipfs_files = {}
        files_info['files'] = ipfs_files
    self.ipfs_files = ipfs_files
    self.files_info = files_info
    self.app_name_for_sync = app_name_for_sync or 'farbox_bucket'
    self.files_info_on_server = {}  # the files' info from the remote server side
    # pass a relative path to this func; it returns True/False to sync or not
    self.should_sync_file_func = should_sync_file_func
def clear_sync_meta_data(root, app_name='farbox_bucket'):
    data_folder = get_sync_data_folder(root, app_name)
    delete_file(data_folder, to_trash=False)
    delete_sync_cursor(root, app_name)  # remove the sync meta data (cursor)
    delete_sync_log(root, app_name)  # remove the sync log
    files_info_filepath = join(root, '.files_info.json')
    delete_file(files_info_filepath, to_trash=True)
def default_set_cursor_func(root, cursor):
    cursor_file = join(root, ".farbox.cursor")
    try:
        with open(cursor_file, "wb") as f:  # binary mode, so encode the cursor first
            f.write(to_bytes(cursor))
        return True
    except:
        return False
def get_file_versions_folder(filepath):
    filepath = same_slash(filepath)
    if not os.path.isfile(filepath):  # the source file does not exist, nothing to do
        return  # ignore
    archive_path = make_sure_archive_folder(filepath)  # make sure the .Archive folder exists
    filename = os.path.split(filepath)[-1]
    versions_folder = join(archive_path, filename)
    return versions_folder
def default_get_cursor_func(root):
    if not os.path.isdir(root):
        return
    cursor_file = join(root, ".farbox.cursor")
    if os.path.isfile(cursor_file):
        return read_file(cursor_file)
    else:
        return None
def do_record_sync_log(self, log):
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log = to_bytes('%s %s\n\n' % (now, log))
    sync_log_filepath = join(self.root, '.sync/%s_sync.log' % self.app_name_for_sync)
    try:
        make_sure_path(sync_log_filepath)
        with open(sync_log_filepath, 'ab') as f:  # append in binary mode, `log` is bytes
            f.write(log)
    except:
        pass
def store_sync_from_log(root, log):
    now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    log = smart_str('%s %s\n\n' % (now, log))
    sync_log_filepath = join(root, '.sync/farbox_sync_from.log')
    try:
        make_sure_path(sync_log_filepath)
        with open(sync_log_filepath, 'a') as f:
            f.write(log)
    except:
        pass
def sync_for_deleted_files(self):
    # handle files that have been deleted locally
    synced = False
    filepaths_to_delete_data = sync_find_files_to_delete(
        self.root, app_name=self.app_name_for_sync, as_dict=True)
    for filepath_to_delete_data in filepaths_to_delete_data:
        filepath_to_delete = filepath_to_delete_data['filepath']
        is_dir = filepath_to_delete_data.get('is_dir', False)
        relative_path = get_relative_path(filepath_to_delete, root=self.root)
        ipfs_to_delete = self.ipfs_files.pop(relative_path, None)
        if isinstance(ipfs_to_delete, dict):
            ipfs_hash_to_delete = ipfs_to_delete.get('hash')
        else:
            ipfs_hash_to_delete = ipfs_to_delete
        self.remove_file_from_ipfs(ipfs_hash_to_delete)
        # is_deleted=True, send the md5 value as the version
        md5_value = filepath_to_delete_data.get('md5')
        compiler_sync_worker = FarBoxSyncCompilerWorker(
            server_node=self.server_node,
            root=self.root,
            filepath=filepath_to_delete,
            is_deleted=True,
            is_dir=is_dir,
            private_key=self.private_key,
            should_encrypt_file=self.should_encrypt_file,
            ipfs_key=ipfs_hash_to_delete,
            version=md5_value,
            auto_clean_bucket=self.auto_clean_bucket,
            files_info=self.files_info,
        )
        sync_status = compiler_sync_worker.sync()
        self.record_sync_log(filepath=filepath_to_delete, sync_status=sync_status, is_deleted=True)
        if sync_status and sync_status.get('code') == 200:
            synced = True
        # at last, mark the status as synced
        after_sync_deleted(filepath_to_delete, root=self.root, app_name=self.app_name_for_sync)
    # files on the server but not on the local side: clean configs_for_files;
    # this should run after self.sync_for_updated_files, so self.files_info_on_server is available
    files_info_on_server = get_value_from_data(self.files_info_on_server, 'message.files') or {}
    for relative_path in files_info_on_server.keys():
        abs_filepath = join(self.root, relative_path)
        if not os.path.isfile(abs_filepath):
            self.ipfs_files.pop(relative_path, None)
            synced = True
    return synced
def get_sync_data(filepath, root, app_name):
    # Get the synced information for a filepath: locate the corresponding md5
    # data file, which stores the necessary info (md5 & synced_at) used to
    # decide whether the current file needs to be synced.
    filepath = same_slash(filepath)
    data_path = get_sync_data_filepath(filepath, root, app_name)
    if os.path.isfile(data_path):
        try:
            with open(data_path) as f:
                data = json.loads(f.read())
            if isinstance(data, dict):
                if data.get('is_relative'):  # stored as a relative path, convert back to absolute
                    data['filepath'] = join(root, data['filepath'])
                return data
        except:
            pass
    return {}  # final fallback
def sync_site_folder_simply(node, root, private_key, should_encrypt_file=False,
                            app_name_for_sync=None, print_log=True,
                            exclude_rpath_func=None):
    if not node or not root or not private_key:
        return  # ignore
    if not os.path.isdir(root):
        return  # ignore
    if not is_valid_private_key(private_key):
        return  # ignore
    now = time.time()
    app_name_for_sync = app_name_for_sync or 'farbox_bucket'
    site_folder_status_config_filepath = join(
        root, '.%s_site_folder_status.json' % app_name_for_sync)
    site_folder_status = load_json_file(site_folder_status_config_filepath) or {}
    bucket = get_bucket_by_private_key(private_key)
    old_bucket = site_folder_status.get('bucket')
    old_node = site_folder_status.get('node')
    if bucket != old_bucket or node != old_node:
        # the bucket or node changed, reset the sync state
        clear_sync_meta_data(root=root, app_name=app_name_for_sync)
        site_folder_status['bucket'] = bucket
        site_folder_status['node'] = node
        # reset the cached config md5 values as well
        for key in list(site_folder_status.keys()):
            if key.endswith('_md5'):
                site_folder_status.pop(key, None)
    # dump_template first
    template_folder = get_path_with_dot_allowed(root, 'template')
    if os.path.isdir(template_folder):
        pages_data = get_pages_data(template_folder)
        current_pages_md5 = get_md5(json_dumps(pages_data, indent=4))
        old_pages_md5 = site_folder_status.get('pages_md5')
        if current_pages_md5 != old_pages_md5:  # the template has changed
            old_pages_data = site_folder_status.get('pages') or {}
            sync_status = dump_pages(node=node, private_key=private_key,
                                     pages_dir=template_folder,
                                     old_pages_data=old_pages_data)
            sync_status_code = sync_status.get('code')
            if sync_status_code != 200:
                if print_log:
                    print(sync_status.get('message'))
                return
            else:  # update pages_md5
                site_folder_status['pages_md5'] = current_pages_md5
                site_folder_status['pages'] = pages_data
                if print_log:
                    print('template is changed and synced')
    # update files first
    files_changed = sync_folder_simply(node=node, root=root, private_key=private_key,
                                       should_encrypt_file=should_encrypt_file,
                                       app_name_for_sync=app_name_for_sync,
                                       exclude_rpath_func=exclude_rpath_func)
    # update configs
    for config_type in allowed_bucket_config_types:
        sync_bucket_config(site_folder_status, root=root, node=node,
                           private_key=private_key, config_type=config_type,
                           print_log=print_log)
    # store the site_folder_status
    dump_json_file(filepath=site_folder_status_config_filepath, data=site_folder_status)
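# A minimal usage sketch for sync_site_folder_simply; the node address and the
# private key below are hypothetical placeholders:
#
#   sync_site_folder_simply(
#       node='node.example.com',
#       root='/path/to/my_site',
#       private_key=my_private_key,  # assumed to be loaded elsewhere
#   )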
def create_file_version(filepath, force=False, min_time_diff=60, history_max_versions=150):
    # `force` means create the version unconditionally.
    # Copy the file at filepath into a dedicated folder; each copy becomes a "version".
    filepath = same_slash(filepath)
    # for Markdown files only
    if not _is_a_markdown_file(filepath):
        return
    if not os.path.exists(filepath):
        return  # ignore
    if os.path.isdir(filepath):
        return
    with open(filepath, 'rb') as f:
        raw_content = f.read()
    if not raw_content:
        return  # blank content, ignore
    raw_content = smart_str(raw_content)
    now = datetime.datetime.now()
    now_str = now.strftime('%Y-%m-%d %H-%M-%S')
    version_folder = get_file_versions_folder(filepath)
    if not version_folder:
        return  # ignore
    version_file_path = join(version_folder, now_str + os.path.splitext(filepath)[1])
    if not os.path.isdir(version_folder):
        os.makedirs(version_folder)
    versions_file_names = os.listdir(version_folder)
    versions_file_names = [name for name in versions_file_names
                           if re.search(r'\d{4}-\d{1,2}-\d{1,2}', name)]
    versions_file_names.sort()
    versions_file_names.reverse()  # after reversing, the newest comes first
    file_size = os.path.getsize(filepath)
    now = time.time()
    if versions_file_names and file_size < 30 * 1024:
        # for files under 30k, do a further check before creating a version
        last_version = versions_file_names[0]
        last_path = join(version_folder, last_version)
        last_mtime = os.path.getmtime(last_path)
        with open(last_path) as f:
            last_content = f.read()
        if last_content == raw_content:  # content unchanged, ignore
            return
        length_diff = abs(len(last_content) - len(raw_content))
        if length_diff < 30 or (0 < (now - last_mtime) < min_time_diff and not force):
            # the content length changed by less than 30 chars, or the last
            # version was created within min_time_diff: ignore
            return  # ignore
    elif versions_file_names:
        # create a version at most once per min_time_diff
        last_version = versions_file_names[0]
        last_path = join(version_folder, last_version)
        last_mtime = os.path.getmtime(last_path)
        if 0 < (now - last_mtime) < min_time_diff and not force:
            return  # ignore
    if file_size < 10 * 1024:  # under 10k
        max_versions = history_max_versions
        if max_versions < 0:
            max_versions = 0
    elif file_size < 100 * 1024:  # under 100k
        max_versions = 80
    elif file_size < 500 * 1024:
        max_versions = 50
    else:
        max_versions = 20
    if not max_versions:  # versions are not allowed, ignore
        return
    for version_name_to_delete in versions_file_names[max_versions:]:  # remove the oldest extra versions
        file_path_to_delete = join(version_folder, version_name_to_delete)
        try:
            os.remove(file_path_to_delete)
        except IOError:
            pass
    try:
        with open(version_file_path, 'wb') as new_f:
            new_f.write(raw_content)
    except IOError:  # failed to write the version file
        return
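# A minimal usage sketch for create_file_version (the path is hypothetical):
#
#   create_file_version('/my/site/posts/hello.md')               # throttled by min_time_diff
#   create_file_version('/my/site/posts/hello.md', force=True)   # version despite the time throttle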
def get_sync_data_filepath(filepath, root, app_name):
    data_folder = make_sure_sync_data_folder(root, app_name)
    data_filename = get_sync_data_filename(filepath, root)
    data_filepath = join(data_folder, data_filename)
    return data_filepath
def make_sure_sync_log_path(root, app_name):
    make_sure_sync_data_folder(root, app_name)
    sync_folder = join(root, '.sync')
    log_path = join(sync_folder, '%s.log' % app_name)
    return log_path
def get_sync_log_path(root, app_name):
    sync_folder = join(root, '.sync')
    log_path = join(sync_folder, '%s.log' % app_name)
    return log_path
def get_sync_data_folder(root, app_name):
    # the folder that stores the sync information
    data_path = join(root, '.sync/%s' % app_name)
    return data_path
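# Resulting on-disk layout under a synced root, assuming the default app_name
# 'farbox_bucket' (derived from the path helpers above):
#
#   <root>/.sync/farbox_bucket/        # per-file sync data (JSON)
#   <root>/.sync/farbox_bucket.cursor  # sync cursor
#   <root>/.sync/farbox_bucket.log     # sync log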
def store_sync_cursor(root, cursor, app_name):
    make_sure_sync_data_folder(root, app_name)
    sync_folder = join(root, '.sync')
    cursor_path = join(sync_folder, '%s.cursor' % app_name)
    with open(cursor_path, 'w') as f:
        f.write(cursor)
def sync_from_farbox(root, private_key, node, get_cursor_func=None, save_cursor_func=None,
                     before_file_sync_func=None, after_file_sync_func=None, per_page=30):
    get_cursor_func = get_cursor_func or partial(default_get_cursor_func, root)
    save_cursor_func = save_cursor_func or partial(default_set_cursor_func, root)
    cursor = get_cursor_func()
    message = dict(per_page=per_page)
    if cursor:
        message["cursor"] = cursor
    records = send_message(node, private_key, action="show_records", message=message)
    if not isinstance(records, (list, tuple)):
        if settings.DEBUG:
            print("error: records from node is not of list/tuple type")
        return
    #if settings.DEBUG:
    #    print("get %s records from node" % len(records))
    last_cursor = None
    error_happened = False
    will_continue = False
    if len(records) == per_page:
        will_continue = True
    for record in records:
        if not isinstance(record, dict):
            continue
        record_id = record.get("_id")
        server_side_file_version = record.get("version")
        last_cursor = record_id
        path = record.get("path")
        if not record_id or not path:
            continue
        is_dir = record.get("is_dir")
        if is_dir:
            continue
        is_deleted = record.get("is_deleted")
        if is_deleted:
            # to avoid deleting files by mistake, ignore the is_deleted logic
            continue
        abs_filepath = join(root, path)
        if server_side_file_version and os.path.isfile(abs_filepath):
            if get_md5_for_file(abs_filepath) == server_side_file_version:
                # the identical file already exists locally
                #if settings.DEBUG:
                #    print("has same file on server side for %s" % abs_filepath)
                continue
        # start downloading the file;
        # a 302 redirect is followed automatically, yielding the final 200 response
        response = send_message(node, private_key, action="download_file",
                                message=dict(record_id=record_id),
                                timeout=120, return_response=True)
        if not response:
            error_happened = True
            continue
        if response.status_code == 404:  # just ignore a 404
            continue
        elif response.status_code not in [200, 201]:
            error_happened = True
            continue
        raw_file_content = response.content
        if not raw_file_content:
            continue
        # pre-save hook, e.g. for storing a version
        if before_file_sync_func and hasattr(before_file_sync_func, "__call__"):
            before_file_sync_func(abs_filepath)
        # if the file already exists, move it to the trash before saving, so the
        # user has a chance to undo; send2trash is not always reliable on
        # Windows, so on failure we just pass
        if os.path.isfile(abs_filepath) and send2trash is not None:
            try:
                send2trash.send2trash(abs_filepath)
            except:
                pass
        try:
            make_sure_path(abs_filepath)
            with open(abs_filepath, "wb") as f:
                f.write(smart_str(raw_file_content))
            if settings.DEBUG:
                print("downloaded %s" % abs_filepath)
        except:
            if settings.DEBUG:
                print_error()
            error_happened = True
        # post-save hook
        if after_file_sync_func and hasattr(after_file_sync_func, "__call__"):
            after_file_sync_func(abs_filepath)
        # record the log
        store_sync_from_log(root, abs_filepath)
    if not error_happened and last_cursor:
        # only save the cursor when no error happened, to keep the synced data as consistent as possible
        cursor_saved = save_cursor_func(last_cursor)
        if cursor_saved and will_continue:
            # a full page was returned, keep going with the next page
            sync_from_farbox(root=root, node=node, private_key=private_key,
                             get_cursor_func=get_cursor_func, save_cursor_func=save_cursor_func,
                             before_file_sync_func=before_file_sync_func,
                             after_file_sync_func=after_file_sync_func, per_page=per_page)
        else:
            if cursor_saved:
                store_sync_from_log(root, "records updated")
            else:
                store_sync_from_log(root, "sync finished, no need to update")
            if settings.DEBUG:
                print("sync-from finished, %s records" % len(records))
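# A minimal usage sketch for sync_from_farbox; the node address and the key are
# hypothetical placeholders, and create_file_version (above) is wired in as the
# pre-save hook so an existing file gets archived before it is overwritten:
#
#   sync_from_farbox(
#       root='/path/to/my_site',
#       private_key=my_private_key,   # assumed to be loaded elsewhere
#       node='node.example.com',
#       before_file_sync_func=create_file_version,
#   )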