def handle_paraphrasing_export_origin():
    """
    Export paraphrasing data.
    :return: the zipped export file as a download
    """
    export_key = comm_util.get_time_keyword()
    export_path = __DEFAULT_PATH + '/export/' + export_key
    export_batch_path = export_path + '/batch'
    export_zipfile = export_path + '.zip'
    comm_util.create_dirs(export_batch_path)

    all_batch_status_file = export_path + '/all_batch_status.json'
    all_batch_status = []
    batch_status_list = db[TBL_BATCHSTATUS].find()
    for status_item in batch_status_list:
        user_name = status_item["login_name"]
        # Export the user's batch list
        for batch_item in status_item[TERM_PARAPHRASING]:
            batch_id = batch_item["batch_id"]
            all_batch_status.append({
                "login_name": user_name,
                "batch_id": batch_id,
                "batch_progress": batch_item['batch_progress'],
                "batch_description": batch_item['batch_description']
            })
            dialogue_list = db[TBL_PARAPHRASING].find({
                'login_name': user_name,
                "batch_id": batch_id
            })
            user_dialogues = {}
            for dialogue_item in dialogue_list:
                dialogue = dialogue_item["dialogue"]
                user_dialogues[dialogue["dialogue_id"]] = dialogue
            # Export the user's batch file
            dstfile = os.path.join(export_batch_path, batch_id + ".json")
            comm_util.save_json_cn_file(user_dialogues, dstfile)

    # Export the consolidated batch-status file
    comm_util.save_json_cn_file(all_batch_status, all_batch_status_file)
    # Compress the export folder, then remove the folder itself
    comm_util.compress_folder(export_zipfile, export_path)
    comm_util.rm_folder_tree(export_path)
    # Download the compressed file
    return send_file(export_zipfile, as_attachment=True)
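
# For reference, a sketch of the archive the export above produces, derived
# from the paths built in handle_paraphrasing_export_origin (<key> is the
# time keyword; the exact root inside the zip depends on how
# comm_util.compress_folder archives the folder):
#
#   <key>.zip
#   ├── all_batch_status.json        # consolidated per-user batch statuses
#   └── batch/
#       └── <batch_id>.json          # dialogues keyed by dialogue_id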
def handle_annotating_export_multiwox():
    """
    Export annotating data in MultiWOZ format.
    :return: the zipped export file as a download
    """
    export_key = comm_util.get_time_keyword()
    export_path = __DEFAULT_PATH + '/export/' + export_key
    export_batch_path = export_path + '/batch'
    export_zipfile = export_path + '.zip'
    comm_util.create_dirs(export_batch_path)

    error_docs = []
    error_path = __DEFAULT_PATH + '/error/EXP' + export_key
    comm_util.create_dirs(error_path)
    error_docsfile = error_path + '.json'
    error_zipfile = error_path + '.zip'

    all_batch_status_file = export_path + '/all_batch_status.json'
    all_batch_status = []
    batch_status_list = db[TBL_BATCHSTATUS].find()
    for status_item in batch_status_list:
        user_name = status_item["login_name"]
        # Export the user's batch list
        for batch_item in status_item[TERM_ANNOTATING]:
            batch_id = batch_item["batch_id"]
            all_batch_status.append({
                "login_name": user_name,
                "batch_id": batch_id,
                "batch_progress": batch_item['batch_progress'],
                "batch_description": batch_item['batch_description']
            })
            dialogue_list = db[TBL_ANNOTATING].find({
                'login_name': user_name,
                "batch_id": batch_id
            })
            user_dialogues = {}
            for dialogue_item in dialogue_list:
                dialogue = dialogue_item["dialogue"]
                user_dialogues[dialogue["dialogue_id"]] = dialogue
            # Export the user's batch file
            dstfile = os.path.join(export_batch_path, batch_id + ".json")
            error_file = os.path.join(error_path, batch_id + ".json")
            try:
                target_dialogueDict = DataConvertor.to_dataset(
                    user_dialogues, data_set='MultiWOZ')
                comm_util.save_json_cn_file(target_dialogueDict, dstfile)
            except Exception as e:
                print('Json transfer fails, INFO: %s' % str(e))
                error_docs.append({
                    "batch_id": batch_id,
                    "batch_file": batch_id + ".json",
                    "exception": str(e),
                })
                # Fall back to exporting the unconverted dialogues
                comm_util.save_json_cn_file(user_dialogues, error_file)

    # Export the consolidated batch-status file
    comm_util.save_json_cn_file(all_batch_status, all_batch_status_file)
    # Compress the export folder, then remove the folder itself
    comm_util.compress_folder(export_zipfile, export_path)
    comm_util.rm_folder_tree(export_path)

    if len(error_docs) > 0:
        error_time = comm_util.get_time_stamp()
        # Save the error log
        error_info = {
            "error_type": "EXPORT",
            "operator": "Administrator",
            "error_docs": error_docs,
            "error_time": error_time
        }
        comm_util.save_json_cn_file(error_info, error_docsfile)
        # Compress the error folder, then remove the originals
        comm_util.compress_folder(error_zipfile, error_path)
        comm_util.rm_folder_tree(error_path)

    # Download the compressed file
    return send_file(export_zipfile, as_attachment=True)
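
# When a MultiWOZ conversion fails above, two error artifacts are left behind
# (names derived from the code; EXP<key> is the export time keyword):
#
#   error/EXP<key>.json   # error log: batch_id, batch_file, exception
#   error/EXP<key>.zip    # raw, unconverted dialogues of the failed batches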
def handle_batch_allocation():
    """
    Allocate batches to a user.
    :return: JSON response with an allocation result code
    """
    data = request.get_json()
    batch_category = data['batch_category']
    batch_id_list = data['batch_id_list']
    allocate_login_name = data['allocate_login_name']
    unallocate_path = __DEFAULT_PATH + '/unallocated/' + batch_category

    error_key = comm_util.get_time_keyword()
    error_time = comm_util.get_time_stamp()
    error_path = __DEFAULT_PATH + '/error/DUP' + error_key
    comm_util.create_dirs(error_path)
    error_docs = []
    error_docsfile = error_path + '.json'
    error_zipfile = error_path + '.zip'

    batch_status = db[TBL_BATCHSTATUS].find_one(
        {'login_name': allocate_login_name})
    if batch_status is None:
        batch_status = comm_util.generate_default_batchstatus(
            allocate_login_name)

    for batch in batch_id_list:
        batch_id = batch['batch_id']
        metadata_name = batch['metadata_name']
        srcfile = os.path.join(unallocate_path, metadata_name,
                               batch_id + '.json')
        duplicatefile = os.path.join(error_path, batch_id + '.json')
        # Has this batch already been allocated to the user?
        dup_query = {
            'login_name': allocate_login_name,
            batch_category: {
                '$elemMatch': {
                    'batch_id': batch_id
                }
            }
        }
        batch_info = db[TBL_BATCHSTATUS].find_one(dup_query)
        if batch_info:
            # Same batch already allocated to this user: record as a duplicate
            print('{} already exists'.format(batch_id))
            shutil.move(srcfile, duplicatefile)  # move to the duplicates folder
            error_docs.append({
                "batch_id": batch_id,
                "batch_category": batch_category,
                "batch_file": batch_id + '.json',
                "exception": 'batch file duplicated'
            })
        else:
            batch_status[batch_category].append({
                "batch_id": batch_id,
                "batch_progress": "0%",
                "metadata_name": metadata_name,
                "batch_description": "Initialized"
            })
            dialogue_name = "dialogue_" + batch_category
            dialogues = comm_util.load_json_file(srcfile)
            for key, value in dialogues.items():
                value["dialogue_id"] = key
                # Add an activation flag to support the first data save
                value["activated"] = False
                value["status"] = "PROCESSING"
                db[dialogue_name].save({
                    'login_name': allocate_login_name,
                    "batch_id": batch_id,
                    "metadata_name": metadata_name,
                    DATA_VERSION: CURRENT_VERSION,
                    "dialogue": value
                })
            os.remove(srcfile)  # remove the now-allocated source file

    # Persist the updated batch-status document
    db[TBL_BATCHSTATUS].save(batch_status)

    if len(error_docs) > 0:
        # Save the error log
        error_info = {
            "error_type": "ALLOCATION",
            "operator": allocate_login_name,
            "error_docs": error_docs,
            "error_time": error_time
        }
        comm_util.save_json_file(error_info, error_docsfile)
        # Compress the error folder
        comm_util.compress_folder(error_zipfile, error_path)
        responseObject = {
            "code": 100,
            "msg": gettext(u'msgErrorAllocateBatch')
        }
    else:
        responseObject = {
            "code": 200,
            "msg": gettext(u'msgSuccessAllocateBatch')
        }
    # Remove the error-collection folder
    comm_util.rm_folder_tree(error_path)
    return jsonify(responseObject)
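
# A sketch of the JSON payload handle_batch_allocation expects, inferred from
# the keys read via request.get_json() above (all values are illustrative):
#
#   {
#       "batch_category": "paraphrasing",
#       "allocate_login_name": "user01",
#       "batch_id_list": [
#           {"batch_id": "batch_0001", "metadata_name": "MultiWOZ"}
#       ]
#   }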
def handle_upload_batch_allocation(content_category):
    """
    Upload a batch archive and stage its files for allocation.
    :return: JSON response with an upload result code
    """
    # Create the unallocated file paths. Note: missing folders must be
    # created up front, or the save below fails with a missing-path error.
    raw_unzip_path = __DEFAULT_PATH + '/raw/' + comm_util.get_time_keyword()
    comm_util.create_dirs(raw_unzip_path)
    raw_path = __DEFAULT_PATH + '/raw'
    unallocate_path = __DEFAULT_PATH + '/unallocated'

    error_key = comm_util.get_time_keyword()
    error_time = comm_util.get_time_stamp()
    error_path = __DEFAULT_PATH + '/error/UPD' + error_key
    comm_util.create_dirs(error_path)

    f = request.files['batch_file']
    upload_file_name = os.path.join(raw_path, secure_filename(f.filename))
    f.save(upload_file_name)
    # Unpack the uploaded archive
    comm_util.uncompress_file(upload_file_name, raw_unzip_path)

    manifest_file = raw_unzip_path + '/MANIFEST.json'
    if not os.path.exists(manifest_file):
        responseObject = {"code": 100, "msg": gettext(u'msgMANIFESTNotExists')}
        return jsonify(responseObject)

    manifest = comm_util.load_json_file(manifest_file)
    if "Batch-category" not in manifest:
        responseObject = {
            "code": 100,
            "msg": gettext(u'msgBatchCategoryNotDefined')
        }
        return jsonify(responseObject)
    if "metadata_name" not in manifest:
        responseObject = {
            "code": 100,
            "msg": gettext(u'msgMetadataNameNotDefined')
        }
        return jsonify(responseObject)

    batch_category = manifest["Batch-category"]
    metadata_name = manifest["metadata_name"]
    if metadata_name == 'NA':
        # When metadata_name is NA, skip the metadata existence check
        pass
    else:
        metadata = db[TBL_METADATA].find_one({
            'category': batch_category,
            'metadata_name': metadata_name
        })
        if metadata is None:
            responseObject = {
                "code": 100,
                "msg": gettext(u'msgMetadataNotExisting:[{0}][{1}]').format(
                    batch_category, metadata_name)
            }
            return jsonify(responseObject)

    error_docs = []
    error_docsfile = error_path + '.json'
    error_zipfile = error_path + '.zip'
    for root, dirs, files in os.walk(raw_unzip_path):
        for raw_filename in files:
            if raw_filename == 'MANIFEST.json':
                continue
            raw_file = os.path.join(root, raw_filename)
            # Create the target staging path
            target_path = os.path.join(unallocate_path, batch_category,
                                       metadata_name)
            comm_util.create_dirs(target_path)
            target_file = os.path.join(target_path, raw_filename)
            error_file = os.path.join(error_path, raw_filename)
            try:
                if batch_category == TERM_ANNOTATING:
                    if content_category == 'JSON':
                        raw_dialogueDict = comm_util.load_json_file(raw_file)
                        target_dialogueDict = DataConvertor.from_dataset(
                            raw_dialogueDict, data_set=metadata_name)
                        comm_util.save_json_file(target_dialogueDict,
                                                 target_file)
                    elif content_category == 'RAW':
                        target_dialogueDict = DataLoader.load_raw_data(
                            raw_file)
                        comm_util.save_json_file(target_dialogueDict,
                                                 target_file)
                else:
                    if content_category == 'JSON':
                        # Parse and re-save the JSON rather than moving the
                        # file, so malformed JSON cannot slip into the
                        # staging folder.
                        raw_dialogueDict = comm_util.load_json_file(raw_file)
                        comm_util.save_json_file(raw_dialogueDict, target_file)
                    elif content_category == 'RAW':
                        pass
            except Exception as e:
                print('Json transfer fails, INFO: %s' % str(e))
                error_docs.append({
                    "batch_id": raw_filename.replace('.json', ''),
                    "batch_file": raw_filename,
                    "exception": str(e),
                })
                shutil.copy(raw_file, error_file)  # keep a copy for diagnosis

    # Remove the unpacked upload folder
    comm_util.rm_folder_tree(raw_unzip_path)

    if len(error_docs) > 0:
        # Save the error log
        error_info = {
            "error_type": "UPLOAD",
            "operator": "Administrator",
            "error_docs": error_docs,
            "error_time": error_time
        }
        comm_util.save_json_file(error_info, error_docsfile)
        # Compress the error folder
        comm_util.compress_folder(error_zipfile, error_path)
        responseObject = {
            "code": 100,
            "msg": gettext(u'msgErrorUploadBatch'),
            "error_docsfile": error_docsfile,
            "error_docs": error_docs
        }
    else:
        responseObject = {
            "code": 200,
            "msg": gettext(u'msgSuccessUploadBatch')
        }
    # Remove the error folder
    comm_util.rm_folder_tree(error_path)
    return jsonify(responseObject)
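
# A sketch of the MANIFEST.json the upload handler expects at the archive
# root, based on the two keys it validates (values are illustrative;
# metadata_name may be 'NA' to skip the metadata existence check):
#
#   {
#       "Batch-category": "annotating",
#       "metadata_name": "MultiWOZ"
#   }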
def handle_consistency_check():
    """
    Check data consistency between the batch-status list and the
    dialogue collections.
    :return: JSON response
    """
    error_key = comm_util.get_time_keyword()
    error_time = comm_util.get_time_stamp()
    error_path = __DEFAULT_PATH + '/error/CHK' + error_key
    comm_util.create_dirs(error_path)
    error_docs = []
    error_docsfile = error_path + '.json'
    error_zipfile = error_path + '.zip'

    general_query = {}
    batch_status_list = db[TBL_BATCHSTATUS].find(general_query)
    for batch_item in batch_status_list:
        login_name = batch_item['login_name']
        # Check the annotating list for completeness
        annotating_list = batch_item[TERM_ANNOTATING]
        for annotating_item in annotating_list:
            batch_id = annotating_item['batch_id']
            dialogue_list = db[TBL_ANNOTATING].find({
                'login_name': login_name,
                "batch_id": batch_id
            })
            if dialogue_list.count() == 0:
                error_docs.append(
                    'annotating list does not match the Status list')
        # Check the paraphrasing list for completeness
        paraphrasing_list = batch_item[TERM_PARAPHRASING]
        for paraphrasing_item in paraphrasing_list:
            batch_id = paraphrasing_item['batch_id']
            dialogue_list = db[TBL_PARAPHRASING].find({
                'login_name': login_name,
                "batch_id": batch_id
            })
            if dialogue_list.count() == 0:
                error_docs.append(
                    'paraphrasing list does not match the Status list')

    # Aggregate dialogues per (login_name, batch_id) and verify each pair
    # exists in the Status list
    pipeline = [
        {
            "$group": {
                "_id": {
                    'login_name': '$login_name',
                    'batch_id': '$batch_id'
                },
                "count": {
                    "$sum": 1
                }
            }
        }
    ]
    annotating_aggregate = db[TBL_ANNOTATING].aggregate(pipeline)
    for annotating_item in annotating_aggregate:
        condition = annotating_item['_id']
        login_name = condition['login_name']
        batch_id = condition['batch_id']
        existing_query = {
            'login_name': login_name,
            'annotating': {
                '$elemMatch': {
                    'batch_id': batch_id
                }
            }
        }
        batch_info = db[TBL_BATCHSTATUS].find_one(existing_query)
        if batch_info is None:
            error_docs.append(
                'annotating dialogue missing from the Status list, '
                'login_name:={}, batch_id:={}'.format(login_name, batch_id))

    paraphrasing_aggregate = db[TBL_PARAPHRASING].aggregate(pipeline)
    for paraphrasing_item in paraphrasing_aggregate:
        condition = paraphrasing_item['_id']
        login_name = condition['login_name']
        batch_id = condition['batch_id']
        existing_query = {
            'login_name': login_name,
            'paraphrasing': {
                '$elemMatch': {
                    'batch_id': batch_id
                }
            }
        }
        batch_info = db[TBL_BATCHSTATUS].find_one(existing_query)
        if batch_info is None:
            error_docs.append(
                'paraphrasing dialogue missing from the Status list, '
                'login_name:={}, batch_id:={}'.format(login_name, batch_id))

    if len(error_docs) > 0:
        # Save the error log
        error_info = {
            "error_type": "CHECK",
            "operator": "Administrator",
            "error_docs": error_docs,
            "error_time": error_time
        }
        comm_util.save_json_cn_file(error_info, error_docsfile)
        error_file = os.path.join(error_path, 'error.log')
        comm_util.save_txt_file(error_docs, error_file)
        # Compress the error folder, then remove it
        comm_util.compress_folder(error_zipfile, error_path)
        comm_util.rm_folder_tree(error_path)

    responseObject = {"status": "success"}
    return jsonify(responseObject)
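
# Note: Cursor.count(), used in the completeness checks above, is deprecated
# and removed in PyMongo 4. A minimal sketch of the same emptiness check with
# count_documents, assuming the same `db` handle (_batch_has_dialogues is a
# hypothetical helper, not part of the original code):
def _batch_has_dialogues(collection_name, login_name, batch_id):
    """Return True if at least one dialogue exists for the user/batch pair."""
    return db[collection_name].count_documents(
        {'login_name': login_name, 'batch_id': batch_id}, limit=1) > 0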