示例#1
0
def handle_paraphrasing_export_origin():
    """
    Export all paraphrasing data as a zip archive and send it for download.

    :return: Flask file-download response for the generated zip file.
    """
    export_key = comm_util.get_time_keyword()
    export_path = __DEFAULT_PATH + '/export/' + export_key
    export_batch_path = __DEFAULT_PATH + '/export/' + export_key + '/batch'
    export_zipfile = export_path + '.zip'
    comm_util.create_dirs(export_batch_path)

    all_batch_status_file = export_path + '/all_batch_status.json'
    all_batch_status = []

    for status_item in db[TBL_BATCHSTATUS].find():
        user_name = status_item["login_name"]

        # Export every paraphrasing batch owned by this user.
        for batch_item in status_item[TERM_PARAPHRASING]:
            batch_id = batch_item["batch_id"]
            all_batch_status.append({
                "login_name": user_name,
                "batch_id": batch_id,
                "batch_progress": batch_item['batch_progress'],
                "batch_description": batch_item['batch_description'],
            })

            cursor = db[TBL_PARAPHRASING].find({
                'login_name': user_name,
                "batch_id": batch_id
            })
            # Re-key the batch's dialogues by their dialogue_id.
            user_dialogues = {
                item["dialogue"]["dialogue_id"]: item["dialogue"]
                for item in cursor
            }

            # Write this batch's dialogues into its own JSON file.
            dstfile = os.path.join(export_batch_path, batch_id + ".json")
            comm_util.save_json_cn_file(user_dialogues, dstfile)

    # Write the aggregated batch-status list.
    comm_util.save_json_cn_file(all_batch_status, all_batch_status_file)

    # Zip the export folder, then drop the uncompressed copy.
    comm_util.compress_folder(export_zipfile, export_path)
    comm_util.rm_folder_tree(export_path)

    # Stream the archive back to the client.
    return send_file(export_zipfile, as_attachment=True)
示例#2
0
def handle_annotating_export_multiwox():
    """
    Export annotating data converted to the MultiWOZ format as a zip archive.

    Batches that fail conversion are saved in their raw form to an error
    folder, which is zipped alongside an error log.

    :return: Flask file-download response for the generated export zip file.
    """
    export_key = comm_util.get_time_keyword()
    export_path = __DEFAULT_PATH + '/export/' + export_key
    export_batch_path = __DEFAULT_PATH + '/export/' + export_key + '/batch'
    export_zipfile = export_path + '.zip'
    comm_util.create_dirs(export_batch_path)

    error_docs = []
    error_path = __DEFAULT_PATH + '/error/EXP' + export_key
    comm_util.create_dirs(error_path)

    error_docsfile = error_path + '.json'
    error_zipfile = error_path + '.zip'

    all_batch_status_file = export_path + '/all_batch_status.json'
    all_batch_status = []

    batch_status_list = db[TBL_BATCHSTATUS].find()
    for status_item in batch_status_list:
        user_name = status_item["login_name"]

        # Export every annotating batch owned by this user.
        for batch_item in status_item[TERM_ANNOTATING]:
            batch_id = batch_item["batch_id"]
            all_batch_status.append({
                "login_name": user_name,
                "batch_id": batch_id,
                "batch_progress": batch_item['batch_progress'],
                "batch_description": batch_item['batch_description']
            })

            dialogue_list = db[TBL_ANNOTATING].find({
                'login_name': user_name,
                "batch_id": batch_id
            })
            user_dialogues = {}
            for dialogue_item in dialogue_list:
                dialogue = dialogue_item["dialogue"]
                user_dialogues[dialogue["dialogue_id"]] = dialogue

            # Convert and export this batch; failures go to the error folder.
            dstfile = os.path.join(export_batch_path, batch_id + ".json")
            error_file = os.path.join(error_path, batch_id + ".json")
            try:
                target_dialogueDict = DataConvertor.to_dataset(
                    user_dialogues, data_set='MultiWOZ')
                comm_util.save_json_cn_file(target_dialogueDict, dstfile)
            except Exception as e:
                print('Json transfer fails, INFO: %s' % str(e))
                error_docs.append({
                    "batch_id": batch_id,
                    "batch_file": batch_id + ".json",
                    "exception": str(e),
                })
                # Fall back to exporting the unconverted dialogues.
                comm_util.save_json_cn_file(user_dialogues, error_file)

    # Write the aggregated batch-status list.
    comm_util.save_json_cn_file(all_batch_status, all_batch_status_file)

    # Zip the export folder, then drop the uncompressed copy.
    comm_util.compress_folder(export_zipfile, export_path)
    comm_util.rm_folder_tree(export_path)

    if len(error_docs) > 0:
        error_time = comm_util.get_time_stamp()
        # Save the error log and zip the failed batches.
        error_info = {
            "error_type": "EXPORT",
            "operator": "Administrator",
            "error_docs": error_docs,
            "error_time": error_time
        }
        comm_util.save_json_cn_file(error_info, error_docsfile)
        comm_util.compress_folder(error_zipfile, error_path)

    # Always remove the error staging folder; previously an empty folder was
    # left behind whenever the export completed without conversion errors.
    comm_util.rm_folder_tree(error_path)

    # Stream the archive back to the client.
    return send_file(export_zipfile, as_attachment=True)
示例#3
0
def handle_batch_allocation():
    """
    Allocate batch files to a user.

    Reads the allocation request from the JSON request body, loads each
    requested batch's dialogues into the per-category dialogue collection,
    records already-allocated batches as duplicates in an error log, and
    returns a JSON status object.

    :return: Flask JSON response with code 200 on success, 100 on error.
    """
    data = request.get_json()

    batch_category = data['batch_category']
    batch_id_list = data['batch_id_list']
    allocate_login_name = data['allocate_login_name']

    unallocate_path = __DEFAULT_PATH + '/unallocated/' + batch_category

    error_key = comm_util.get_time_keyword()
    error_time = comm_util.get_time_stamp()
    error_path = __DEFAULT_PATH + '/error/DUP' + error_key
    comm_util.create_dirs(error_path)

    error_docs = []
    error_docsfile = error_path + '.json'
    error_zipfile = error_path + '.zip'

    batch_status = db[TBL_BATCHSTATUS].find_one(
        {'login_name': allocate_login_name})
    if batch_status is None:
        # First allocation for this user: start from a fresh status record.
        batch_status = comm_util.generate_default_batchstatus(
            allocate_login_name)

    for batch in batch_id_list:
        batch_id = batch['batch_id']
        metadata_name = batch['metadata_name']
        srcfile = os.path.join(unallocate_path, metadata_name,
                               batch_id + '.json')
        duplicatefile = os.path.join(error_path, batch_id + '.json')

        # Does this user already own a batch with the same id in this category?
        dup_query = {
            'login_name': allocate_login_name,
            batch_category: {
                '$elemMatch': {
                    'batch_id': batch_id
                }
            }
        }
        batch_info = db[TBL_BATCHSTATUS].find_one(dup_query)

        # A batch already allocated to this user is recorded as a duplicate.
        if batch_info:
            print('{} already exists'.format(batch_id))
            shutil.move(srcfile, duplicatefile)  # park it in the error folder
            error_docs.append({
                "batch_id": batch_id,
                "batch_category": batch_category,
                "batch_file": batch_id + '.json',
                "exception": 'batch file duplicated'
            })
        else:
            batch_status[batch_category].append({
                "batch_id": batch_id,
                "batch_progress": "0%",
                "metadata_name": metadata_name,
                "batch_description": "Initialized"
            })
            dialogue_name = "dialogue_" + batch_category
            dialogues = comm_util.load_json_file(srcfile)
            for key, value in dialogues.items():
                value["dialogue_id"] = key
                # Mark as not yet activated so the first data save works.
                value["activated"] = False
                value["status"] = "PROCESSING"
                # NOTE(review): Collection.save() is deprecated and removed in
                # PyMongo 4 — migrate to insert_one/replace_one when upgrading.
                db[dialogue_name].save({
                    'login_name': allocate_login_name,
                    "batch_id": batch_id,
                    "metadata_name": metadata_name,
                    DATA_VERSION: CURRENT_VERSION,
                    "dialogue": value
                })

            os.remove(srcfile)  # consumed: drop the unallocated source file

    # Persist the updated batch-status document.
    db[TBL_BATCHSTATUS].save(batch_status)

    if len(error_docs) > 0:
        # Save the error log and zip the duplicated files.
        error_info = {
            "error_type": "ALLOCATION",
            "operator": allocate_login_name,
            "error_docs": error_docs,
            "error_time": error_time
        }
        comm_util.save_json_file(error_info, error_docsfile)
        comm_util.compress_folder(error_zipfile, error_path)

        responseObject = {
            "code": 100,
            "msg": gettext(u'msgErrorAllocateBatch')
        }
    else:
        responseObject = {
            "code": 200,
            "msg": gettext(u'msgSuccessAllocateBatch')
        }

    # Remove the error staging folder (the zip, if any, was already created).
    comm_util.rm_folder_tree(error_path)

    return jsonify(responseObject)
示例#4
0
def handle_upload_batch_allocation(content_category):
    """
    Upload a batch archive and unpack its files into the unallocated pool.

    The uploaded zip must contain a MANIFEST.json declaring "Batch-category"
    and "metadata_name". Files that fail conversion are copied to an error
    folder and reported in the response.

    :param content_category: 'JSON' or 'RAW' — how the uploaded files are
        parsed before being placed in the unallocated pool.
    :return: Flask JSON response with code 200 on success, 100 on error.
    """
    # Create the staging folder for the unzipped upload.
    raw_unzip_path = __DEFAULT_PATH + '/raw/' + comm_util.get_time_keyword()
    comm_util.create_dirs(raw_unzip_path)

    raw_path = __DEFAULT_PATH + '/raw'
    unallocate_path = __DEFAULT_PATH + '/unallocated'

    error_key = comm_util.get_time_keyword()
    error_time = comm_util.get_time_stamp()
    error_path = __DEFAULT_PATH + '/error/UPD' + error_key
    comm_util.create_dirs(error_path)

    f = request.files['batch_file']
    # Target folders must exist before saving, or the save fails.
    upload_file_name = os.path.join(raw_path, secure_filename(f.filename))
    f.save(upload_file_name)

    # Unpack the uploaded archive.
    comm_util.uncompress_file(upload_file_name, raw_unzip_path)

    manifest_file = raw_unzip_path + '/MANIFEST.json'
    if not os.path.exists(manifest_file):
        responseObject = {"code": 100, "msg": gettext(u'msgMANIFESTNotExists')}
        return jsonify(responseObject)

    manifest = comm_util.load_json_file(manifest_file)
    if "Batch-category" not in manifest:
        responseObject = {
            "code": 100,
            "msg": gettext(u'msgBatchCategoryNotDefined')
        }
        return jsonify(responseObject)

    if "metadata_name" not in manifest:
        responseObject = {
            "code": 100,
            "msg": gettext(u'msgMetadataNameNotDefined')
        }
        return jsonify(responseObject)

    batch_category = manifest["Batch-category"]
    metadata_name = manifest["metadata_name"]

    # 'NA' means "skip the metadata existence check".
    if metadata_name != 'NA':
        metadata = db[TBL_METADATA].find_one({
            'category': batch_category,
            'metadata_name': metadata_name
        })
        if metadata is None:
            responseObject = {
                "code":
                100,
                "msg":
                gettext(u'msgMetadataNotExisting:[{0}][{1}]').format(
                    batch_category, metadata_name)
            }
            return jsonify(responseObject)

    error_docs = []
    error_docsfile = error_path + '.json'
    error_zipfile = error_path + '.zip'
    for root, dirs, files in os.walk(raw_unzip_path):
        for raw_filename in files:
            if raw_filename == 'MANIFEST.json':
                continue

            raw_file = os.path.join(root, raw_filename)

            # Create the target pool path for this category/metadata pair.
            target_path = unallocate_path + '/' + batch_category + '/' + metadata_name
            comm_util.create_dirs(target_path)

            target_file = unallocate_path + '/' + batch_category + '/' + metadata_name + '/' + raw_filename
            error_file = os.path.join(error_path, raw_filename)
            try:
                if batch_category == TERM_ANNOTATING:
                    if content_category == 'JSON':
                        raw_dialogueDict = comm_util.load_json_file(raw_file)
                        target_dialogueDict = DataConvertor.from_dataset(
                            raw_dialogueDict, data_set=metadata_name)
                        comm_util.save_json_file(target_dialogueDict,
                                                 target_file)
                    elif content_category == 'RAW':
                        target_dialogueDict = DataLoader.load_raw_data(
                            raw_file)
                        comm_util.save_json_file(target_dialogueDict,
                                                 target_file)
                else:
                    if content_category == 'JSON':
                        # Round-trip through the JSON parser so malformed
                        # files are rejected instead of entering the pool.
                        raw_dialogueDict = comm_util.load_json_file(raw_file)
                        comm_util.save_json_file(raw_dialogueDict, target_file)
                    elif content_category == 'RAW':
                        pass

            except Exception as e:
                print('Json transfer fails, INFO: %s' % str(e))
                error_docs.append({
                    "batch_id": raw_filename.replace('.json', ''),
                    "batch_file": raw_filename,
                    "exception": str(e),
                })
                shutil.copy(raw_file, error_file)  # keep the failing file

    # Remove the unzip staging folder.
    comm_util.rm_folder_tree(raw_unzip_path)

    if len(error_docs) > 0:
        # Save the error log and zip the failing files.
        error_info = {
            "error_type": "UPLOAD",
            "operator": "Administrator",
            "error_docs": error_docs,
            "error_time": error_time
        }
        comm_util.save_json_file(error_info, error_docsfile)
        comm_util.compress_folder(error_zipfile, error_path)

        responseObject = {
            "code": 100,
            "msg": gettext(u'msgErrorUploadBatch'),
            "error_docsfile": error_docsfile,
            "error_docs": error_docs
        }
    else:

        responseObject = {
            "code": 200,
            "msg": gettext(u'msgSuccessUploadBatch')
        }

    # Remove the error staging folder.
    comm_util.rm_folder_tree(error_path)

    return jsonify(responseObject)
示例#5
0
def handle_consistency_check():
    """
    Check consistency between the batch-status lists and the dialogue
    collections, in both directions:

    1. every batch listed in a user's status record has dialogues stored;
    2. every (login_name, batch_id) group of stored dialogues appears in
       that user's status record.

    Any mismatches are written to an error log and zipped.

    :return: Flask JSON response {"status": "success"}.
    """

    error_key = comm_util.get_time_keyword()
    error_time = comm_util.get_time_stamp()
    error_path = __DEFAULT_PATH + '/error/CHK' + error_key
    comm_util.create_dirs(error_path)

    error_docs = []

    error_docsfile = error_path + '.json'
    error_zipfile = error_path + '.zip'

    general_query = {}
    batch_status_list = db[TBL_BATCHSTATUS].find(general_query)
    for batch_item in batch_status_list:
        login_name = batch_item['login_name']

        # Direction 1a: every annotating status entry must have dialogues.
        annotating_list = batch_item[TERM_ANNOTATING]
        for annotating_item in annotating_list:
            batch_id = annotating_item['batch_id']
            # NOTE(review): Cursor.count() is deprecated (removed in
            # PyMongo 4) — switch to count_documents() when upgrading.
            dialogue_list = db[TBL_ANNOTATING].find({
                'login_name': login_name,
                "batch_id": batch_id
            })
            if dialogue_list.count() == 0:
                # Was 'annotating列表与Status列表比匹配' — typo for 不匹配, and
                # the entry carried no identifiers, so it was undiagnosable.
                error_docs.append(
                    'annotating列表与Status列表不匹配,login_name:={}, batch_id:={}'
                    .format(login_name, batch_id))

        # Direction 1b: every paraphrasing status entry must have dialogues.
        paraphrasing_list = batch_item[TERM_PARAPHRASING]
        for paraphrasing_item in paraphrasing_list:
            batch_id = paraphrasing_item['batch_id']
            dialogue_list = db[TBL_PARAPHRASING].find({
                'login_name': login_name,
                "batch_id": batch_id
            })
            if dialogue_list.count() == 0:
                error_docs.append(
                    'paraphrasing列表与Status列表不匹配,login_name:={}, batch_id:={}'
                    .format(login_name, batch_id))

    # Direction 2: group stored dialogues by (login_name, batch_id) and make
    # sure each group is listed in the owner's status record.
    pipeline = [
        {
            "$group": {
                "_id": {
                    'login_name': '$login_name',
                    'batch_id': '$batch_id'
                },
                "count": {
                    "$sum": 1
                }
            }
        }
    ]
    annotating_aggregate = db[TBL_ANNOTATING].aggregate(pipeline)
    for annotating_item in annotating_aggregate:
        condition = annotating_item['_id']
        login_name = condition['login_name']
        batch_id = condition['batch_id']
        existing_query = {
            'login_name': login_name,
            'annotating': {
                '$elemMatch': {
                    'batch_id': batch_id
                }
            }
        }
        batch_info = db[TBL_BATCHSTATUS].find_one(existing_query)
        if batch_info is None:
            error_docs.append(
                'annotating的dialogue在Status列表中不存在,login_name:={}, batch_id:={}'
                .format(login_name, batch_id))

    paraphrasing_aggregate = db[TBL_PARAPHRASING].aggregate(pipeline)
    for paraphrasing_item in paraphrasing_aggregate:
        condition = paraphrasing_item['_id']
        login_name = condition['login_name']
        batch_id = condition['batch_id']
        existing_query = {
            'login_name': login_name,
            'paraphrasing': {
                '$elemMatch': {
                    'batch_id': batch_id
                }
            }
        }
        batch_info = db[TBL_BATCHSTATUS].find_one(existing_query)
        if batch_info is None:
            error_docs.append(
                'paraphrasing的dialogue在Status列表中不存在,login_name:={}, batch_id:={}'
                .format(login_name, batch_id))

    if len(error_docs) > 0:
        # Save the error log (JSON + plain text) and zip it.
        error_info = {
            "error_type": "CHECK",
            "operator": "Administrator",
            "error_docs": error_docs,
            "error_time": error_time
        }
        comm_util.save_json_cn_file(error_info, error_docsfile)

        error_file = os.path.join(error_path, 'error.log')
        comm_util.save_txt_file(error_docs, error_file)
        comm_util.compress_folder(error_zipfile, error_path)
        comm_util.rm_folder_tree(error_path)

    responseObject = {"status": "success"}
    return jsonify(responseObject)