def merge_part_files(self, file_dir, left_file, right_file, new_file):
    '''file_dir: directory containing the part files
    left_file: leading part file
    right_file: trailing part file
    new_file: name of the merged part file
    '''
    try:
        absolute_new_file = os.path.join(file_dir, new_file)
        absolute_left_file = os.path.join(file_dir, left_file)
        absolute_right_file = os.path.join(file_dir, right_file)
        with open(absolute_left_file, mode='ab') as lf:
            with open(absolute_right_file, mode='rb') as rf:
                while True:
                    buf = rf.read(1 * 1024 * 1024)
                    if len(buf) == 0:
                        break
                    lf.write(buf)
                lf.flush()
        Utils.try_remove(absolute_new_file)
        shutil.move(absolute_left_file, absolute_new_file)
        self.aios_print('File merge finished:', f'[{left_file}]', f'[{right_file}]')
        file_size = os.path.getsize(absolute_new_file)
        self.aios_print('New file name:', f'[{new_file}]', file_size,
                        'block:', int(file_size / (5 * 1024 * 1024)))
    finally:
        if os.path.exists(absolute_right_file):
            os.rename(absolute_right_file, absolute_right_file + '.deleted')
def single_file_handler(self, single_chunk_files):
    '''Handle files that were uploaded as a single chunk.'''
    succ_list = []
    err_list = []
    wait_add_tasks = []
    wait_update_tasks = []
    taskdao = TaskDAO()
    tenant_id = g.tenant_id if hasattr(g, 'tenant_id') else 0
    batches = _.uniq(
        [Utils.get_batch(i.get('dir_path')) for i in single_chunk_files])
    task_ids = [i.get('file_key') for i in single_chunk_files]
    exist_tasks = taskdao.get_tasks(tenant_id, batches, task_ids)
    exist_tasks_map = {}
    for exist_task in exist_tasks:
        exist_tasks_map[exist_task.task_id] = exist_task.id
    for single_chunk_file in single_chunk_files:
        try:
            file_key = single_chunk_file.get('file_key')
            dir_path = single_chunk_file.get('dir_path')
            file_name = single_chunk_file.get('file_name')
            tenant_id = single_chunk_file.get('tenant_id')
            user_id = single_chunk_file.get('user_id')
            merged_file = os.path.join(dir_path, file_name)
            final_merged_file = os.path.join(dir_path, f'{file_key}.1')
            shutil.move(final_merged_file, merged_file)
            task_json = {
                'created_by': user_id,
                'tenant_id': tenant_id,
                'task_id': file_key,
                'chunks': '1',
                'status': TASK_STATUS_MERGED,
                'size': os.path.getsize(merged_file),
                'link': {
                    'host': Utils.get_host(dir_path)
                },
                'batch': Utils.get_batch(dir_path)
            }
            if exist_tasks_map.get(file_key):
                task_json['id'] = exist_tasks_map.get(file_key)
                wait_update_tasks.append(task_json)
            else:
                wait_add_tasks.append(task_json)
            succ_list.append({'file_key': file_key, 'curr_chunk': 1})
        except Exception as err:
            err_list.append({'file_key': file_key, 'curr_chunk': 1})
    if len(wait_add_tasks):
        taskdao.bulk_add(wait_add_tasks)
    if len(wait_update_tasks):
        taskdao.bulk_update(wait_update_tasks)
    return succ_list, err_list
def get_standard_sub_dir(self, sub_dir):
    '''
    #### Get the standard sub-directory mounted inside the container; create it if it does not exist.
    '''
    sub_dir = sub_dir.strip(os.path.sep).replace('/', os.path.sep)
    # Absolute path of the sub-directory that can be accessed directly inside the container
    absolute_dir_path = os.path.join(Utils.mount_point(),
                                     sub_dir.strip(os.path.sep))
    Utils.try_makedirs(absolute_dir_path)
    return absolute_dir_path
def start_thread_save_db(self, msg_list):
    taskdao = TaskDAO()

    # Save the cache to the database using background threads
    def start_thread(x):
        batch = Utils.get_batch(x)
        thread_key = taskdao.thread_key.format(batch)
        if not self.aios_redis.get(thread_key):
            threading.Thread(target=taskdao.save_to_db,
                             args=(g.user_id, g.tenant_id, batch),
                             daemon=True).start()
            self.aios_print('Started the scheduled flush thread', batch)
            self.aios_redis.set(thread_key, 'Running',
                                taskdao.THREAD_KEY_TIMEOUT)
        elif self.aios_redis.ttl(thread_key) < 10:
            threading.Thread(target=taskdao.save_to_db,
                             args=(g.user_id, g.tenant_id, batch),
                             daemon=True).start()
            self.aios_print('Thread already exists but its key is about to expire; restarting the scheduled flush thread', batch)
            self.aios_redis.set(thread_key, 'Running',
                                taskdao.THREAD_KEY_TIMEOUT)
        else:
            self.aios_print('Thread already exists, no need to start another', batch)

    # Start multiple threads, one per unique host
    _.chain(msg_list). \
        map_(lambda x: Utils.get_host(x.get('dir_path'))). \
        uniq(). \
        for_each(start_thread). \
        value()
def delete(sub_path):
    """
    @@@
    #### File deletion
    in: path
    Parameters:
        {
            # Directory where the file is stored in the NFS storage area;
            # it can be located as <mount point>/<sub_path>
            "sub_path": '<module name>/<sub directory>/.../<file name>'
        }
    Return value: 204 | 200
    @@@
    """
    try:
        if not sub_path or not os.path.basename(sub_path):
            return bad_request('sub_path')
        store_path = os.path.join(Utils.mount_point(), sub_path)
        if os.path.exists(store_path):
            os.remove(store_path)
            # Remove the parent directories if they are now empty
            if not os.listdir(os.path.dirname(store_path)):
                os.rmdir(os.path.dirname(store_path))
                if not os.listdir(os.path.dirname(os.path.dirname(store_path))):
                    os.rmdir(os.path.dirname(os.path.dirname(store_path)))
            return standard_response(None, 204)
        else:
            return standard_response(None)
    except Exception as err:
        return standard_expection(str(err))
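# ---------------------------------------------------------------------------
# Illustrative client-side sketch (not part of the original module): calling the deletion
# endpoint above. BASE_URL and the route prefix are assumptions; only the <sub_path>
# semantics come from the docstring.
# ---------------------------------------------------------------------------
import requests

BASE_URL = 'http://localhost:5000/files'  # hypothetical mount point of this blueprint


def delete_example():
    # Deletes <mount point>/datasource/123/example.csv from the NFS storage area.
    # Expected status: 204 when the file existed and was removed, 200 otherwise.
    resp = requests.delete(f'{BASE_URL}/datasource/123/example.csv')
    return resp.status_code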
def simple_multi_files_upload():
    """Simple batch file upload; individual files should preferably not exceed 20MB."""
    try:
        if len(request.files) == 0:
            return standard_expection('No file was provided')
        link_dict_list = []
        wait_add_records = []
        user_id = g.user_id if hasattr(g, 'user_id') else 0
        for file_key in request.files:
            file = request.files[file_key]
            [(file_name, file_extension)] = re.findall(r'(\S+)\.(\S+)$', file.filename)
            full_dir_path = os.path.join(Utils.mount_point(), str(user_id))
            if not os.path.exists(full_dir_path):
                os.makedirs(full_dir_path)
            task_id = str(uuid.uuid4())
            file_name = '{}.{}'.format(task_id, file_extension)
            save_path = os.path.join(full_dir_path, file_name)
            file.save(save_path)
            host = save_path.replace(current_app.config['FILE_STORAGE_DIR'], '', 1)
            http = '{}/{}'.format(current_app.config['FILE_STORAGE_HOST'], host)
            link_dict = {'http': http.replace('\\', '/'), 'host': host}
            wait_add_records.append({
                'task_id': task_id,
                'chunks': -1,
                'status': TASK_STATUS_NONE,
                'size': os.path.getsize(save_path),
                'link': link_dict,
                'created_by': g.user_id if hasattr(g, 'user_id') else 0,
                'tenant_id': g.tenant_id if hasattr(g, 'tenant_id') else 0
            })
            link_dict_list.append(link_dict)
        if len(wait_add_records):
            TaskModel.bulk_insert_mappings(wait_add_records)
        return standard_response(link_dict_list, 201)
    except Exception as err:
        return standard_expection(str(err))
def notify_thread_stop(self, msg_list):
    taskdao = TaskDAO()

    def stop_thread(x):
        batch = Utils.get_batch(x)
        thread_key = taskdao.thread_key.format(batch)
        self.aios_redis.set(thread_key, 'Stop')

    _.chain(msg_list). \
        map_(lambda x: Utils.get_host(x.get('dir_path'))). \
        uniq(). \
        for_each(stop_thread). \
        value()
def simple_file_upload():
    """
    @@@
    #### File upload interface
    in: body
    Parameters:
        {
            "file": file object
            # Directory where the file is stored in the NFS storage area;
            # it can be located as <mount point>/<sub_dir>
            "sub_dir": '<data source id>/<dataset id>/<virtual directory id>/<virtual directory id>/...'
        }
    Return value:
        {
            "data": 'ok'
        }, 201
    @@@
    """
    try:
        file = request.files['file']
        arr = os.path.splitext(file.filename)
        file_extension = arr[-1]
        if not file:
            return bad_request('file')
        user_id = g.user_id if hasattr(g, 'user_id') else 0
        full_dir_path = os.path.join(Utils.mount_point(), str(user_id))
        if not os.path.exists(full_dir_path):
            os.makedirs(full_dir_path)
        file_name = '{}{}'.format(str(uuid.uuid4()), file_extension)
        file.save(os.path.join(full_dir_path, file_name))
        host = os.path.join(full_dir_path, file_name)
        http = '{}/{}'.format(current_app.config['FILE_STORAGE_HOST'], host)
        link_dict = {'http': http.replace('\\', '/'), 'host': host}
        return standard_response(link_dict, 201)
    except Exception as err:
        return standard_expection(str(err))
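# ---------------------------------------------------------------------------
# Illustrative client-side sketch (not part of the original module): posting a single file
# to the simple upload endpoint above. SIMPLE_UPLOAD_URL is an assumption; the exact
# response envelope comes from standard_response, which is not shown here.
# ---------------------------------------------------------------------------
import requests

SIMPLE_UPLOAD_URL = 'http://localhost:5000/files/simple'  # hypothetical route


def simple_upload_example(path):
    with open(path, 'rb') as f:
        resp = requests.post(SIMPLE_UPLOAD_URL, files={'file': f})
    # Expected: HTTP 201 with a link dict like {'http': ..., 'host': ...}
    return resp.status_code, resp.json()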
def porter_running(self, task_id, total_chunk, part_file, merged_file,
                   tenant_id, user_id, timeout=60):
    try:
        from run import app as inner_app
        from app import aios_redis
        with inner_app.app_context():
            file_cache = FileCache(task_id)
            chunk = 1
            start = time.time()
            task = self.get_task(tenant_id, task_id)
            if task is None:
                args = {
                    'task_id': task_id,
                    'tenant_id': tenant_id,
                    'created_by': user_id
                }
                task = TaskModel(**args)
            task.status = TASK_STATUS_MERGING
            task.save()
            wait_remove_files = []
            with open(merged_file, mode='ab+') as dst_f:
                print(datetime.datetime.now(), f"{task_id} start while")
                while chunk <= total_chunk and time.time() - start <= timeout:
                    src_part_file = f'{part_file}{chunk}'
                    # The mere existence of the file cannot be used to decide whether it can
                    # be read: under concurrency the file may exist while its content is still
                    # being written to disk. After src_part_file is saved, the chunk status is
                    # flagged, so that flag is checked here in real time instead.
                    # print(datetime.datetime.now(), f"get redis chunk. {task_id} get chunk: {chunk}",
                    #       {file_cache.get_chunk_status(chunk)})
                    if file_cache.get_chunk_status(chunk):
                        # Refresh the waiting deadline whenever a chunk is consumed
                        start = time.time()
                        with open(src_part_file, mode='rb') as src_f:
                            src_f_binary = src_f.read()
                            dst_f.write(src_f_binary)
                        # Record the chunk number in the database
                        task.chunks = '{},{}'.format(task.chunks, chunk).strip(',')
                        task.save()
                        file_cache.inc_counter()
                        chunk += 1
                        # Queue the part file for deletion
                        wait_remove_files.append(src_part_file)
                    else:
                        time.sleep(0.5)
                print(datetime.datetime.now(), "chunk:", chunk,
                      "total_chunk:", total_chunk, "end while")
            host = merged_file.replace(
                current_app.config['FILE_STORAGE_DIR'], '').replace(os.path.sep, '/')
            http = f'{current_app.config["FILE_STORAGE_HOST"]}/{host}'
            task.link = {'http': http.replace('\\', '/'), 'host': host}
            task.size = os.path.getsize(merged_file)
            task.status = TASK_STATUS_MERGED
            task.save()
            # Notify the main process of the file merge progress
            file_cache.set_task_completed(task.link)
            print('⭐porter_running⭐', 'finished', task.link)
            if len(wait_remove_files):
                for wait_remove_file in wait_remove_files:
                    Utils.try_remove(wait_remove_file)
    except Exception as e:
        print(e)
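# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): porter_running relies on FileCache
# flags to know when a chunk is fully on disk. The real FileCache class is defined elsewhere
# in this project; the class below is only a guess at how such flags could be backed by
# Redis, with made-up key names and TTL.
# ---------------------------------------------------------------------------
import redis


class FileCacheSketch:
    def __init__(self, task_id, client=None, ttl=3600):
        self.task_id = task_id
        self.redis = client or redis.Redis()
        self.ttl = ttl

    def set_chunk_status(self, chunk):
        # Flag the chunk as fully written to disk so the merge thread may read it.
        self.redis.set(f'upload:{self.task_id}:chunk:{chunk}', 1, ex=self.ttl)

    def get_chunk_status(self, chunk):
        # True once the chunk has been flagged by the upload request.
        return self.redis.get(f'upload:{self.task_id}:chunk:{chunk}') is not None

    def inc_counter(self):
        # Count of chunks already appended to the merged file.
        return self.redis.incr(f'upload:{self.task_id}:counter')

    def get_counter(self):
        return self.redis.get(f'upload:{self.task_id}:counter')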
def multi_file_handler(self, msg):
    '''Handle the write of a single file inside a child process.'''
    try:
        # When sqlalchemy is loaded inside an upload process, the database connection
        # pool size of each process must be limited.
        os.environ['SQLALCHEMY_POOL_SIZE'] = '1'
        from run import app as inner_app
        from app import aios_redis
        with inner_app.app_context():
            self.aios_redis = aios_redis
            file_key = msg.get('file_key')
            dir_path = msg.get('dir_path')
            file_name = msg.get('file_name')
            curr_chunk = msg.get('curr_chunk')
            total_chunks = msg.get('total_chunks')
            tenant_id = msg.get('tenant_id')
            user_id = msg.get('user_id')
            cache_expired_time = msg.get('cache_expired_time')
            wait_lock = f'plus_uploader:lock:{file_key}'
            while not self.aios_redis.setnx(wait_lock, f'lock.{curr_chunk}'):
                # self.aios_print(file_key, curr_chunk, 'task waiting...')
                time.sleep(0.001)
            else:
                self.lock = f'{wait_lock} by {curr_chunk}'
                self.aios_print(file_key, curr_chunk,
                                f'current lock: lock.{curr_chunk}')
                # The lock is now held exclusively.
                # wait_lock timeout = number of existing but unmerged chunks
                #                     * estimated expiry time of a single chunk
                parts = len(
                    _.filter_(
                        os.listdir(dir_path),
                        lambda x: '-' not in x and '.deleted' not in x))
                count = max(parts, 1)
                self.aios_print(f'File directory {dir_path}, chunks currently present: {parts}')
                self.aios_redis.expire(wait_lock, cache_expired_time * count)
                # self.aios_print(f'Adjusted lock timeout, {cache_expired_time * count}')
                # Update the task status
                args = {
                    'task_id': file_key,
                    'tenant_id': tenant_id,
                    'created_by': user_id,
                    'updated_by': user_id,
                    'batch': Utils.get_batch(dir_path),
                    'status': TASK_STATUS_NONE,
                    'chunks': '',
                    'size': 0,
                    'link': {
                        'host': Utils.get_host(dir_path)
                    }
                }
                taskdao = TaskDAO()
                # task_json = taskdao.get_task_from_cache(tenant_id, file_key)
                task_json = taskdao.get_task(tenant_id, file_key, json=True)
                if task_json is None:
                    task_json = taskdao.add(args)
                if task_json['status'] != TASK_STATUS_MERGED:
                    task_json['status'] = TASK_STATUS_MERGING
                    # taskdao.update(task_json)
                    TaskModel.bulk_update_mappings([task_json])
                    # Merge the chunks.
                    # Full path of the merged file.
                    merged_file = os.path.join(dir_path, file_name)
                    merge_process = self.partation_merge(
                        dir_path, file_key, total_chunks)
                    merge_process = _.map_(
                        merge_process, lambda x: x.replace(f'{file_key}.', ''))
                    except_complete_name = f'1-{total_chunks}' if total_chunks > 1 else '1'
                    # Check completeness, e.g. ['1-701'] against except_complete_name '1-701'
                    self.aios_print('Checking completeness', merge_process,
                                    'except_complete_name', except_complete_name)
                    if except_complete_name in merge_process:
                        for f in os.listdir(dir_path):
                            self.aios_print('Comparing file name', f, except_complete_name)
                            if f.startswith(f'{file_key}.') and (
                                    not f.endswith(except_complete_name)):
                                Utils.try_remove(os.path.join(dir_path, f))
                                self.aios_print('Removed leftover abnormal file', f)
                        # Rename the merged file.
                        final_merged_file = os.path.join(
                            dir_path, f'{file_key}.{except_complete_name}')
                        shutil.move(final_merged_file, merged_file)
                        self.aios_print('Renamed file', final_merged_file, '>>', merged_file)
                        # Record the task status.
                        task_json['chunks'] = ','.join(
                            [str(i + 1) for i in range(total_chunks)])
                        task_json['status'] = TASK_STATUS_MERGED
                        task_json['size'] = os.path.getsize(merged_file)
                        task_json['link'] = {
                            'host': Utils.get_host(dir_path)
                        }
                        # Merge finished.
                        self.aios_print(f'{curr_chunk}.merge finished')
                    else:
                        # e.g. ['1', '3-5', '10']
                        covert_process = []
                        for section in merge_process:
                            if '-' in section:
                                [left, right] = section.split('-')
                                covert_process.extend([
                                    str(i)
                                    for i in range(int(left), int(right) + 1)
                                ])
                            else:
                                covert_process.append(section)
                        task_json['chunks'] = ','.join(covert_process)
                        # Not complete yet.
                        self.aios_print(f'{curr_chunk}.incomplete, keep waiting')
                    # Save to the cache.
                    # taskdao.update(task_json)
                    self.aios_print(f'Updating status, {task_json}')
                    TaskModel.bulk_update_mappings([task_json])
                else:
                    # Already merged; no need to merge again, exit directly.
                    self.aios_print(f'{curr_chunk}.already merged, no need to merge again')
                # Release the lock.
                self.aios_redis.delete(wait_lock)
                self.aios_print(f'Finished.{curr_chunk}/{total_chunks}')
                # Clean up chunks marked for deletion.
                for file in os.listdir(dir_path):
                    if file.endswith('.deleted'):
                        self.aios_print('Cleaning up chunk marked for deletion', file)
                        Utils.try_remove(os.path.join(dir_path, file))
            return {'file_key': file_key, 'curr_chunk': curr_chunk}, None
    except Exception as err:
        import traceback
        traceback.print_exc()
        print('multi_file_handler', err)
        return None, {
            'file_key': msg['file_key'],
            'curr_chunk': msg['curr_chunk']
        }
def file_upload_async():
    print("start", request.form)
    task_id = request.form.get('task_id')
    file_cache = FileCache(task_id)
    key = None
    fileHandler = FileHandler()
    file = request.files['file']
    sub_dir = request.form.get('sub_dir') or "/upload"
    # chunkNumber starts from 1
    chunk = request.form.get('chunkNumber', type=int, default=1)
    totalChunks = request.form.get('totalChunks', type=int)
    if not file:
        return bad_request('file')
    if not sub_dir:
        return bad_request('sub_dir')
    if not task_id:
        return bad_request('task_id')
    if not totalChunks:
        return bad_request('totalChunks')
    try:
        user_id = g.user_id if hasattr(g, 'user_id') else 0
        absolute_dir_path = os.path.join(Utils.get_upload_path(),
                                         str(user_id), str(task_id))
        Utils.try_makedirs(absolute_dir_path)
        if file_cache.get_chunk_status(chunk):
            return standard_expection(
                f'Chunk {chunk} of file {task_id} has already been uploaded; please do not repeat the operation!')
        tenant_id = g.tenant_id if hasattr(g, 'tenant_id') else 0
        exist_task = TaskModel.query.filter(
            TaskModel.tenant_id == tenant_id,
            TaskModel.task_id == task_id).first()
        if exist_task and exist_task.status == TASK_STATUS_MERGED:
            return standard_expection(
                f'File {task_id} has already been uploaded; please do not repeat the operation!')
        file.save(os.path.join(absolute_dir_path, f'{task_id}{chunk}'))
        file_cache.set_chunk_status(chunk)
        key = task_id
        fileHandler.log_print(key, chunk, f'{chunk}/{totalChunks}')
        # On the first chunk, start a background thread that monitors the chunk status
        # and merges the chunks in parallel.
        if chunk == 1:
            target_filename = request.form.get('filename')
            merged_file = os.path.join(absolute_dir_path, target_filename)
            part_file = os.path.join(absolute_dir_path, key)
            args = (key, totalChunks, part_file, merged_file, tenant_id, user_id)
            threading.Thread(target=fileHandler.porter_running,
                             args=args, daemon=True).start()
        while not file_cache.lock():
            # print(datetime.datetime.now(), f"{task_id} set lock waiting chunk {chunk}")
            fileHandler.log_print(key, chunk, 'task waiting')
            time.sleep(0.2)
        else:
            # print(datetime.datetime.now(), f"{task_id} set lock success chunk {chunk}")
            # ============================ exclusive section ============================
            # Initialise the chunk numbers that still need handling and pick them off one
            # by one; the last chunk polls the background thread's merge status.
            file_cache.ready_chunks(totalChunks)
            # Remove the current chunk number from the chunks list
            chunks_values = file_cache.get_ready_chunks()
            _.remove(chunks_values, lambda x: x == chunk)
            file_cache.set_ready_chunks(chunks_values)
            # Release the file task lock so that intermediate chunks can respond to the client promptly
            file_cache.release_lock()
            # ============================ exclusive section END ============================
            # When this is the last chunk
            if len(chunks_values) == 0:
                task = None
                # Poll until the chunks are complete
                is_completed = False
                start = time.time()
                while is_completed is False and time.time() - start <= current_app.config['REDIS_CACHE_EXPIRE_FILE']:
                    time.sleep(0.5)
                    counter = file_cache.get_counter()
                    if counter is None:
                        # The background merge thread may not have created the counter key yet
                        continue
                    if isinstance(counter, bytes):
                        counter = int(counter)
                    print(datetime.datetime.now(),
                          f'#### counter {counter}, total chunks {totalChunks}')
                    # Expected chunk list [1...totalChunks]:
                    #   [i + 1 for i in range(totalChunks)]
                    # Current chunk list [1...counter]:
                    #   [i + 1 for i in range(counter)]
                    expect_partitions = sorted(
                        [i + 1 for i in range(totalChunks)])
                    current_partitions = sorted(
                        [i + 1 for i in range(counter)])
                    is_completed = expect_partitions == current_partitions
                    fileHandler.log_print(
                        key, chunk, 'is_completed:{}'.format(is_completed))
                else:
                    # When several users upload files at the same time, porter_running may
                    # lag behind, so wait a little while longer.
                    wait_start = time.time()
                    task_link = file_cache.get_task_completed()
                    print(datetime.datetime.now(),
                          f'#### task link {task_link}, wait_start: {wait_start}')
                    while is_completed is True and time.time() - wait_start <= 2000 and task_link is None:
                        time.sleep(0.5)
                        task_link = file_cache.get_task_completed()
                        # print(datetime.datetime.now(), '#### task link', task_link)
                    if task_link is None:
                        return standard_expection('File check timed out.')
                    return standard_response(task_link, 200)
            else:
                # Intermediate chunk, just finish
                fileHandler.log_print(key, chunk, 'is_completed:False')
                print(datetime.datetime.now(), "is_completed:False", key, chunk)
                return standard_response(str(chunk), 200)
    except Exception as err:
        print("error: ", err)
        import traceback
        traceback.print_exc()
        return standard_expection('Upload failed!')
    finally:
        file_cache.release_lock()
        print("end", request.form)
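# ---------------------------------------------------------------------------
# Illustrative client-side sketch (not part of the original module): it drives the chunked
# upload handler above. The form field names (task_id, sub_dir, chunkNumber, totalChunks,
# filename, file) come from file_upload_async; UPLOAD_URL and CHUNK_SIZE are assumptions,
# since the route and the client-side chunk size are not shown here.
# ---------------------------------------------------------------------------
import math
import os
import uuid

import requests

UPLOAD_URL = 'http://localhost:5000/files/upload_async'  # hypothetical route
CHUNK_SIZE = 5 * 1024 * 1024  # assumed 5MB per chunk


def upload_in_chunks(path):
    task_id = str(uuid.uuid4())
    total = math.ceil(os.path.getsize(path) / CHUNK_SIZE)
    with open(path, 'rb') as f:
        for number in range(1, total + 1):
            form = {
                'task_id': task_id,
                'sub_dir': '/upload',
                'chunkNumber': str(number),
                'totalChunks': str(total),
                'filename': os.path.basename(path),
            }
            # Each request carries one chunk; the response to the final chunk is expected
            # to contain the merged file's link.
            resp = requests.post(UPLOAD_URL, data=form,
                                 files={'file': f.read(CHUNK_SIZE)})
            resp.raise_for_status()
    return resp.json()

# Example: upload_in_chunks('big_dataset.zip')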
def send_static_file(path):
    dir_path = os.path.join(Utils.mount_point(),
                            current_app.config['UPLOAD_FOLDER'])
    return send_from_directory(dir_path, path, as_attachment=True)