def get_mark_job_data_by_ids(self, mark_job_ids, args, doc_type_key="doc_type", prefix='NER'):
        items = []
        for mark_job_id in mark_job_ids:
            doc_type = DocTypeModel().get_by_mark_job_id(mark_job_id)
            result = {
                "prefix": prefix,  # TODO: 与MQ确认传参是否适配
                doc_type_key: DocTypeSchema().dump(doc_type),
                "docs": [],
                "tasks": [],
                "mark_job_id": mark_job_id,
            }
            data = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids([mark_job_id])

            for task, doc in data:
                # extraction logic: keep only tasks whose results hit the requested doc terms
                if args.get('doc_term_ids'):
                    if isinstance(task.mark_task_result, list) \
                            and Common.check_doc_term_include(task.mark_task_result, 'doc_term_id',
                                                              args['doc_term_ids']):
                        result['docs'].append(DocSchema().dump(doc))
                        result['tasks'].append(MarkTaskSchema().dump(task))
                # entity-relation logic: keep only tasks whose results hit the requested relations
                # (elif, so a term-filtered task is not appended a second time by the else below)
                elif args.get('doc_relation_ids'):
                    if isinstance(task.mark_task_result, list) and Common.check_doc_relation_include(
                            task.mark_task_result, 'relation_id', args['doc_relation_ids']):
                        result['docs'].append(DocSchema().dump(doc))
                        result['tasks'].append(MarkTaskSchema().dump(task))
                # no filter supplied: include every task
                else:
                    result['docs'].append(DocSchema().dump(doc))
                    result['tasks'].append(MarkTaskSchema().dump(task))
            items.append(result)
        return items
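Common.check_doc_term_include is defined elsewhere in the project; a hypothetical reconstruction of the membership test the call site above implies (name, signature, and annotation-dict shape are all assumptions):

def check_include(mark_task_result, key, wanted_ids):
    """Return True if any annotation dict carries `key` with a value in wanted_ids."""
    wanted = set(wanted_ids)
    return any(item.get(key) in wanted
               for item in mark_task_result if isinstance(item, dict))

assert check_include([{"doc_term_id": 3}], "doc_term_id", [1, 3])
assert not check_include([{"doc_term_id": 2}], "doc_term_id", [1, 3])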
Example No. 2
def add_file_handler(*loggers: logging.Logger) -> None:
    log_dir_path = os.path.join(BASE_PATH, 'logs')
    Common.make_dirs(log_dir_path)
    log_file_path = os.path.join(log_dir_path, time.strftime(
        '%Y-%m-%d', time.localtime(time.time())) + '.log')
    file_handler = TimedRotatingFileHandler(
        log_file_path, when='D', interval=1, backupCount=7,
        encoding=None, delay=False, utc=False)
    file_handler.setFormatter(logging.Formatter(
        '%(request_id)s - %(asctime)s - %(levelname)s - %(filename)s - %(funcName)s - %(lineno)s - %(message)s'))
    file_handler.addFilter(RequestIdFilter())
    file_handler.setLevel(logging.INFO)

    for target_logger in loggers:
        target_logger.addHandler(file_handler)
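The wiring above depends on project pieces (BASE_PATH, Common.make_dirs, RequestIdFilter). A self-contained, runnable sketch of the same daily-rotation setup using only the standard library, with the request-id field dropped from the format string:

import logging
import os
import time
from logging.handlers import TimedRotatingFileHandler

log_dir = os.path.join(os.getcwd(), 'logs')   # stand-in for BASE_PATH/logs
os.makedirs(log_dir, exist_ok=True)           # stand-in for Common.make_dirs

handler = TimedRotatingFileHandler(
    os.path.join(log_dir, time.strftime('%Y-%m-%d') + '.log'),
    when='D', interval=1, backupCount=7)
handler.setFormatter(logging.Formatter(
    '%(asctime)s - %(levelname)s - %(filename)s - %(funcName)s - %(lineno)s - %(message)s'))
handler.setLevel(logging.INFO)

demo_logger = logging.getLogger('demo')
demo_logger.setLevel(logging.INFO)
demo_logger.addHandler(handler)
demo_logger.info('file handler attached')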
Example No. 3
 def get_user_task_with_doc_and_doc_type(nlp_task_id,
                                         current_user: CurrentUser, args):
     q = session.query(UserTask, DocType, Doc) \
         .join(MarkTask, MarkTask.mark_task_id == UserTask.mark_task_id) \
         .join(MarkJob, MarkJob.mark_job_id == MarkTask.mark_job_id) \
         .join(DocType, DocType.doc_type_id == MarkJob.doc_type_id) \
         .join(Doc, Doc.doc_id == MarkTask.doc_id) \
         .filter(
         DocType.nlp_task_id == nlp_task_id,
         ~UserTask.is_deleted,
         ~MarkTask.is_deleted,
         ~Doc.is_deleted
     )
     # TODO
     # permission filtering by role
     if current_user.user_role in [
             RoleEnum.manager.value, RoleEnum.guest.value
     ]:
         q = q.filter(DocType.group_id.in_(current_user.user_groups))
     elif current_user.user_role in [RoleEnum.reviewer.value]:
         q = q.filter(
             func.json_contains(MarkJob.reviewer_ids,
                                str(current_user.user_id)))
     elif current_user.user_role in [RoleEnum.annotator.value]:
         # q = q.filter(func.json_contains(MarkJob.annotator_ids, str(current_user.user_id)))
         q = q.filter(UserTask.annotator_id == current_user.user_id)
     if args.get('job_id'):
         q = q.filter(MarkTask.mark_job_id == args['job_id'])
     if args.get('doc_type_id'):
         q = q.filter(MarkJob.doc_type_id == args['doc_type_id'])
     if args['task_state']:
         q = q.filter(MarkTask.mark_task_status ==
                      status_str2int_mapper().get(args['task_state']))
     if args['query']:
         q = q.filter(Doc.doc_raw_name.like(f'%{args["query"]}%'))
     q = q.group_by(UserTask)
     count = q.count()
     processing_count = q.filter(
         MarkTask.mark_task_status == int(StatusEnum.processing)).count()
     if args['order_by'] and isinstance(args['order_by'], str):
         if args['order_by'][1:] == 'task_id':
             args['order_by'] = args['order_by'][0] + 'mark_task_id'
         q = Common().order_by_model_fields(q, UserTask, [args['order_by']])
     items = []
     for user_task, doc_type, doc in q.offset(args['offset']).limit(
             args['limit']).all():
         user_task.doc = doc
         user_task.doc_type = doc_type
         items.append(user_task)
     return count, count - processing_count, items
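The pattern above counts the filtered query once, counts the processing subset on a derived query, then pages with offset/limit. A minimal standalone sketch of that flow against an in-memory SQLite database, with Task as a stand-in model:

from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import Session, declarative_base

Base = declarative_base()

class Task(Base):  # stand-in for MarkTask
    __tablename__ = 'task'
    id = Column(Integer, primary_key=True)
    status = Column(Integer)  # 1 = processing, 2 = done
    name = Column(String)

engine = create_engine('sqlite://')
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all(Task(status=1 if i % 3 == 0 else 2, name=f'doc{i}')
                    for i in range(10))
    session.commit()

    q = session.query(Task).filter(Task.name.like('%doc%'))
    count = q.count()                                    # total matching rows
    processing = q.filter(Task.status == 1).count()      # narrower count; base q is unchanged
    page = q.order_by(Task.id).offset(0).limit(5).all()  # one page of results
    print(count, count - processing, [t.name for t in page])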
Example No. 4
 def get(self):
     nlp_task_id = Common().get_nlp_task_id_by_route()
     result = DocTypeService().get_doc_type_info_by_nlp_task_by_user(
         nlp_task_id=nlp_task_id, current_user=self.get_current_user())
     return {
         "message": "请求成功",
         "result": result,
     }, 200
Example No. 5
 def get(self: Resource) -> typing.Tuple[typing.Dict, int]:
     """
     获取所有条款,分页,可选排除条件exclude_terms_ids
     """
     result = Common().get_wordseg_doc_terms()
     return {
                "message": "请求成功",
                "result": result
            }, 200
Example No. 6
    def get(self: Resource, job_id: int) -> typing.Tuple[typing.Dict, int]:
        nlp_task_id = Common().get_nlp_task_id_by_route()

        # get predict job
        predict_job = PredictService().get_predict_job_by_id(
            nlp_task_id=nlp_task_id,
            predict_job_id=job_id,
            current_user=self.get_current_user())
        result = PredictJobSchema().dump(predict_job)
        return {"message": "请求成功", "result": result}, 200
Example No. 7
    def get(self: Resource, args: typing.Dict,
            job_id: int) -> typing.Tuple[typing.Dict, int]:

        nlp_task_id = Common().get_nlp_task_id_by_route()
        file_path = PredictService().export_predict_file(
            nlp_task_id=nlp_task_id,
            predict_job_id=job_id,
            offset=args["offset"])

        return {"message": "请求成功", "file_path": file_path}, 200
Example No. 8
 def post(self: Resource, args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
     """
     创建一个文档类型包括它的条款
     """
     args.update({'nlp_task_id': Common().get_nlp_task_id_by_route()})
     args.update({"group_id": self.get_current_user().user_groups[0]})
     result = DocTypeService().create_doc_type(self.get_current_user(), args)
     return {
                "message": "创建成功",
                "result": result,
            }, 201
Example No. 9
    def post(self: Resource,
             args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
        files = args['files']
        job_type = Common().check_job_type_by_files(files)
        if not job_type:
            abort(400, message='请上传全部纯文本文档(txt/csv)或者全部电子文档(pdf/word文档)')
        else:
            args['mark_job_type'] = job_type

        result = MarkJobService().create_mark_job(files, NlpTaskEnum.relation,
                                                  args)

        return {"message": "创建成功", "result": result}, 201
Example No. 10
 def get(self: Resource, args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
     """
     获取所有条款,分页,可选排除条件exclude_terms_ids
     """
     nlp_task_id = Common().get_nlp_task_id_by_route()
     args.update({
         'nlp_task_id': nlp_task_id
     })
     result, count = DocTermService().get_doc_term_list(args)
     return {
                "message": "请求成功",
                "result": result,
                "count": count,
            }, 200
Example No. 11
 def get(self: Resource, args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
     """
     获取所有文档条款
     :param args:
     :return:
     """
     nlp_task_id = Common().get_nlp_task_id_by_route()
     args.update({
         'nlp_task_id': nlp_task_id
     })
     result, count = DocTypeService().get_doc_type(self.get_current_user(), args)
     return {
                "message": "请求成功",
                "result": result,
                "count": count,
            }, 200
Example No. 12
 def post(self: Resource,
          args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
     files = args["files"]
     assign_mode = args["assign_mode"]
     if assign_mode == AssignModeEnum.together:
         abort(400, message="不支持共同标注")
     job_type = Common().check_job_type_by_files(files)
     if job_type != "text":
         abort(400, message="请上传纯文本文档(txt/csv)")
     else:
         args['mark_job_type'] = job_type
     try:
         result = MarkJobService().create_mark_job(files,
                                                   NlpTaskEnum.wordseg,
                                                   args)
         return {"message": "创建成功", "result": result}, 201
     except TypeError:
         abort(400, message="上传文件类型错误")
Example No. 13
    def export_multi_mark_file(nlp_task_id, mark_job_id_list):
        mark_job_list = MarkJobModel().get_by_mark_job_id_list(mark_job_id_list=mark_job_id_list)

        # name the export directory
        export_dir_path = os.path.join(
            'upload/export', 'classify_mark_job_{}_{}'.format(','.join([str(job_id) for job_id in mark_job_id_list]),
                                                              datetime.now().strftime("%Y%m%d%H%M%S")))
        os.mkdir(export_dir_path)

        # get all (count, status, mark_job_id) tuple
        all_count = MarkTaskModel().count_mark_task_status(mark_job_ids=mark_job_id_list)
        # convert to a nested dict
        all_status_dict = Common().tuple_list2dict(all_count)
        for mark_job in mark_job_list:  # iterate over every job
            if mark_job.mark_job_status not in (StatusEnum.success, StatusEnum.approved):  # skip unsuccessful jobs
                continue
            # skip jobs whose tasks have not all been approved yet
            if not (len(all_status_dict[mark_job.mark_job_id]) == 1 and
                    int(StatusEnum.approved) in all_status_dict[mark_job.mark_job_id]):
                continue

            export_file_path = os.path.join(
                'upload/export', '{}_mark_job_{}'.format(NlpTaskEnum(nlp_task_id).name, mark_job.mark_job_id))
            # check the previous export; if nothing has been updated since, reuse it directly
            last_exported_file = export_sync.get_last_export_file(job=mark_job, export_file_path=export_file_path)
            if last_exported_file:
                shutil.copy(
                    last_exported_file, os.path.join(export_dir_path, '标注任务{}.csv'.format(mark_job.mark_job_id)))
                continue

            # rebuild from scratch
            export_fileset = FileSet(folder=export_file_path)
            mark_task_and_doc_list = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids(
                mark_job_ids=[mark_job.mark_job_id])
            file_path = export_sync.generate_classify_file(
                task_and_doc_list=mark_task_and_doc_list, export_fileset=export_fileset)
            shutil.copy(file_path, os.path.join(export_dir_path, '标注任务{}.csv'.format(mark_job.mark_job_id)))

        if not os.listdir(export_dir_path):
            raise ValueError("所选标注任务中没有完成审核的任务,请重新选择")
        shutil.make_archive(export_dir_path, 'zip', export_dir_path)  # pack everything into a zip
        return export_dir_path + ".zip"
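The packaging step at the end uses only the standard library; a self-contained sketch of the same empty-directory guard followed by shutil.make_archive:

import os
import shutil
import tempfile

export_dir = tempfile.mkdtemp(prefix='export_demo_')
with open(os.path.join(export_dir, 'job_1.csv'), 'w', encoding='utf-8') as f:
    f.write('text,label\n')

if not os.listdir(export_dir):
    raise ValueError('nothing to export')      # mirrors the guard above
archive = shutil.make_archive(export_dir, 'zip', export_dir)
print(archive)                                 # export_dir + '.zip'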
Example No. 14
    def get(self: Resource, args: Dict[str,
                                       Any]) -> Tuple[Dict[str, Any], int]:
        """
        获取模型记录,分页
        """
        nlp_task_id = Common.get_nlp_task_id_by_route()

        count, train_job_list = ModelService().get_train_job_list_by_nlp_task_id(
            nlp_task_id=nlp_task_id,
            doc_type_id=args['doc_type_id'],
            search=args['query'],
            offset=args['offset'],
            limit=args['limit'],
            current_user=self.get_current_user())

        result = TrainJobSchema().dump(train_job_list, many=True)
        return {
            "message": "请求成功",
            "result": result,
            "count": count,
        }, 200
Example No. 15
 def get(self: Resource,
         args: typing.Dict) -> typing.Tuple[typing.Dict, int]:
     nlp_task_id = Common().get_nlp_task_id_by_route()
     order_by = args["order_by"][1:]
     order_by_desc = args["order_by"][0] == "-"
     count, predict_job_list = PredictService().get_predict_job_list_by_nlp_task_id(
         nlp_task_id=nlp_task_id,
         doc_type_id=args['doc_type_id'],
         search=args['query'],
         order_by=order_by,
         order_by_desc=order_by_desc,
         offset=args['offset'],
         limit=args['limit'],
         current_user=self.get_current_user())
     # get the serialized result
     result = PredictJobSchema().dump(predict_job_list, many=True)
     return {
         "message": "请求成功",
         "result": result,
         "count": count,
     }, 200
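The order_by convention used here (a leading '-' for descending, '+' or nothing for ascending) is easy to exercise in isolation; a minimal sketch, with parse_order_by as a hypothetical helper name:

def parse_order_by(order_by: str, default_field: str = "created_time"):
    """Split a '+field'/'-field' ordering token into (field, descending)."""
    if not order_by:
        return default_field, False
    descending = order_by[0] == "-"
    field = order_by[1:] if order_by[0] in "+-" else order_by
    return field, descending

assert parse_order_by("-created_time") == ("created_time", True)
assert parse_order_by("+mark_task_id") == ("mark_task_id", False)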
Example No. 16
    def get_preview_and_next_mark_task_id(current_user, nlp_task_id, task_id,
                                          args):
        # Doc is joined here so the doc_raw_name filter below has a valid FROM entry
        q = session.query(MarkTask.mark_task_id) \
            .outerjoin(UserTask, UserTask.mark_task_id == MarkTask.mark_task_id) \
            .join(MarkJob, MarkJob.mark_job_id == MarkTask.mark_job_id) \
            .join(DocType, DocType.doc_type_id == MarkJob.doc_type_id) \
            .join(Doc, Doc.doc_id == MarkTask.doc_id) \
            .filter(
            DocType.nlp_task_id == nlp_task_id,
            MarkTask.mark_task_status != int(StatusEnum.processing),
            ~MarkTask.is_deleted,
            or_(~UserTask.is_deleted, UserTask.is_deleted.is_(None)),
            ~MarkJob.is_deleted,
            ~DocType.is_deleted
        )

        if args.get('job_id'):
            q = q.filter(MarkJob.mark_job_id == args['job_id'])
        if args.get("task_state"):
            q = q.filter(MarkTask.mark_task_status == args.get("task_state"))
        if args.get("query"):
            q = q.filter(Doc.doc_raw_name.contains(args.get("query")))

        if current_user.user_role in [
                RoleEnum.manager.value, RoleEnum.guest.value
        ]:
            q = q.filter(DocType.group_id.in_(current_user.user_groups))
        elif current_user.user_role in [RoleEnum.reviewer.value]:
            q = q.filter(
                func.json_contains(MarkJob.reviewer_ids,
                                   str(current_user.user_id)))
        elif current_user.user_role in [RoleEnum.annotator.value]:
            q = q.filter(
                func.json_contains(MarkJob.annotator_ids,
                                   str(current_user.user_id)))

        q1 = Common().order_by_model_fields(
            q.filter(MarkTask.mark_task_id < task_id), MarkTask,
            ['-mark_task_id'])
        q2 = Common().order_by_model_fields(
            q.filter(MarkTask.mark_task_id > task_id), MarkTask,
            ['+mark_task_id'])

        next_task_id = q1.limit(1).first()     # nearest task with a smaller id
        preview_task_id = q2.limit(1).first()  # nearest task with a larger id
        return (preview_task_id[0] if preview_task_id else None,
                next_task_id[0] if next_task_id else None)
Example No. 17
    def export_mark_file(nlp_task_id, mark_job_id, offset=50):
        mark_job = MarkJobModel().get_by_id(mark_job_id)

        if mark_job.mark_job_status not in (StatusEnum.approved, StatusEnum.success):
            abort(400, message="有失败或未完成任务,不能导出")

        all_count = MarkTaskModel().count_mark_task_status(mark_job_ids=[mark_job_id])
        # convert 3 element tuple to a nested dict
        all_status_dict = Common().tuple_list2dict(all_count)

        if not (len(all_status_dict[mark_job_id]) == 1 and int(StatusEnum.approved) in all_status_dict[mark_job_id]):
            abort(400, message="有未标注或未审核任务,不能导出")

        export_file_path = os.path.join(
            'upload/export', '{}_mark_job_{}'.format(NlpTaskEnum(nlp_task_id).name, mark_job_id))
        # check the previous export; if nothing has been updated since, return it directly
        last_exported_file = export_sync.get_last_export_file(job=mark_job, export_file_path=export_file_path)
        if last_exported_file:
            return last_exported_file

        # rebuild from scratch
        export_fileset = FileSet(folder=export_file_path)
        mark_task_and_doc_list = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids(mark_job_ids=[mark_job_id])

        if nlp_task_id == int(NlpTaskEnum.extract):
            doc_terms = DocTermModel().get_by_filter(limit=99999, doc_type_id=mark_job.doc_type_id)
            file_path = export_sync.generate_extract_file(task_and_doc_list=mark_task_and_doc_list,
                                                          export_fileset=export_fileset, doc_terms=doc_terms,
                                                          offset=offset)
        elif nlp_task_id == int(NlpTaskEnum.classify):
            file_path = export_sync.generate_classify_file(task_and_doc_list=mark_task_and_doc_list,
                                                           export_fileset=export_fileset)
        elif nlp_task_id == int(NlpTaskEnum.wordseg):
            file_path = export_sync.generate_wordseg_file(task_and_doc_list=mark_task_and_doc_list,
                                                          export_fileset=export_fileset)
        else:
            abort(400, message="该任务无法导出")
        return file_path
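The if/elif chain keyed on nlp_task_id lends itself to a dict dispatch; a sketch of that alternative with a stand-in enum and generator functions (the real generate_* helpers take more arguments):

from enum import IntEnum

class NlpTask(IntEnum):  # stand-in for NlpTaskEnum
    extract = 1
    classify = 2
    wordseg = 3

def gen_extract():  return 'extract.csv'
def gen_classify(): return 'classify.csv'
def gen_wordseg():  return 'wordseg.csv'

GENERATORS = {
    NlpTask.extract: gen_extract,
    NlpTask.classify: gen_classify,
    NlpTask.wordseg: gen_wordseg,
}

def export(nlp_task_id: int) -> str:
    try:
        return GENERATORS[NlpTask(nlp_task_id)]()
    except (ValueError, KeyError):
        raise ValueError('this task type cannot be exported')

assert export(2) == 'classify.csv'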
Example No. 18
 def import_labeled_wordseg_files(self, f, mark_job: MarkJob):
     # Step 1. Save temp file
     # Step 2. Pre-process labeled file, generate raw content and labeled results
     # Step 3. Save labeled information into database
     doc_unique_name, doc_relative_path = upload_fileset.save_file(f.filename, f.stream.read())
     corpus_doc_unique_name_list = []
     # each line of the labeled txt is one sample and should be stored as one doc in the database
     try:
         labeled_corpus_list = []
         with open(doc_relative_path, encoding='utf-8-sig') as fr:
             lines = fr.readlines()
             for line in lines:
                 line = line.replace("\n", "").strip()
                 if len(line) < 2:
                     continue
                 ws_raw_content = Common().restore_sentence(line)
                 doc_unique_name, _ = upload_fileset.save_file(f.filename, ws_raw_content)
                 corpus_doc_unique_name_list.append(doc_unique_name)
                 labeled_corpus_list.append(
                     [lc.rsplit("/", maxsplit=1) for lc in line.replace("  ", " ").split(" ")])
     except Exception as e:
         logger.exception(e)
         raise ValueError("分词标注数据格式有误")
     # bulk insert docs
     doc_list = [dict(doc_raw_name=f.filename, doc_unique_name=d) for d in corpus_doc_unique_name_list]
     doc_entity_list = DocModel().bulk_create(doc_list)
     task_list = []
     for i in range(len(doc_entity_list)):
         task_list.append(dict(
             doc_id=doc_entity_list[i].doc_id,
             mark_job_id=mark_job.mark_job_id,
             mark_task_result=labeled_corpus_list[i],
             mark_task_status=int(StatusEnum.approved)
         ))
     task_entity_list = MarkTaskModel().bulk_create(task_list)
     return task_entity_list
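Each labeled line is parsed by splitting on spaces and then rsplitting every token on '/' once; a runnable sketch of just that parsing step, mirroring the loop above:

def parse_wordseg_line(line: str):
    """Split 'word/tag word/tag' into [[word, tag], ...] pairs."""
    line = line.replace("\n", "").strip()
    return [token.rsplit("/", maxsplit=1)
            for token in line.replace("  ", " ").split(" ")]

assert parse_wordseg_line("今天/t 天气/n 好/a") == [["今天", "t"], ["天气", "n"], ["好", "a"]]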
Example No. 19
    def get_doc_type_info_by_nlp_task_by_user(nlp_task_id, current_user: CurrentUser):
        """
        Fetch the doc_type info shown on the admin dashboard home page
        """
        result = []
        # get doc_type list by user
        _, doc_type_list = DocTypeModel().get_by_nlp_task_id_by_user(nlp_task_id=nlp_task_id, current_user=current_user)
        for doc_type, terms in doc_type_list:
            doc_type.doc_terms = [int(t) for t in terms.split(",")] if terms is not None else []
        doc_type_list = [d[0] for d in doc_type_list]
        doc_type_list = [{"doc_type": DocTypeSchema().dump(doc_type)} for doc_type in doc_type_list]

        # get all job count and approved job count
        all_status, all_marked_status = MarkTaskModel().count_status_by_user(nlp_task_id=nlp_task_id, current_user=current_user)

        # calculate marked mark_job count and all mark_job for each doc_type
        all_status_dict = Common().tuple_list2dict(all_status)
        all_marked_status_dict = Common().tuple_list2dict(all_marked_status)

        for doc_type in doc_type_list:
            doc_type_id = doc_type["doc_type"]["doc_type_id"]
            mark_job_count = len(all_status_dict.get(doc_type_id, {}))
            marked_mark_job_count = 0
            for _mark_job_id, _count_sum in all_status_dict.get(doc_type_id, {}).items():
                if _count_sum == all_marked_status_dict.get(doc_type_id, {}).get(_mark_job_id, 0):
                    marked_mark_job_count += 1
            doc_type.update(progress_state={"job_num": mark_job_count,
                                            "labeled_job_number": marked_mark_job_count,
                                            "progress_rate": round(marked_mark_job_count / mark_job_count, 2) if mark_job_count > 0 else 0})

            # get latest evaluation result if exists
            latest_evaluate = EvaluateTaskModel().get_latest_evaluate_by_doc_type_id(nlp_task_id=nlp_task_id,
                                                                                     doc_type_id=doc_type_id)
            if latest_evaluate:
                doc_type.update(evaluate=EvaluateTaskSchema().dump(latest_evaluate))
            result.append(doc_type)
        return result
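Common().tuple_list2dict appears throughout these examples, folding (value, inner_key, outer_key) rows into a nested dict; a hypothetical reconstruction inferred from the call sites (the real helper may differ):

def tuple_list2dict(tuple_list):
    """Fold (value, inner_key, outer_key) rows into {outer: {inner: value}}."""
    nested = {}
    for value, inner_key, outer_key in tuple_list:
        nested.setdefault(outer_key, {})[inner_key] = value
    return nested

assert tuple_list2dict([(3, 1, 10), (5, 2, 10)]) == {10: {1: 3, 2: 5}}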
Example No. 20
 def get_mark_task_with_doc_and_doc_type(self, nlp_task_id,
                                         current_user: CurrentUser, args):
     q = session.query(MarkTask, DocType, Doc) \
         .join(MarkJob, MarkJob.mark_job_id == MarkTask.mark_job_id) \
         .join(DocType, DocType.doc_type_id == MarkJob.doc_type_id) \
         .join(Doc, Doc.doc_id == MarkTask.doc_id) \
         .filter(
         DocType.nlp_task_id == nlp_task_id,
         ~DocType.is_deleted,
         ~MarkTask.is_deleted,
         ~Doc.is_deleted
     )
     # TODO
     # permission filtering by role
     if current_user.user_role in [
             RoleEnum.manager.value, RoleEnum.guest.value
     ]:
         q = q.filter(DocType.group_id.in_(current_user.user_groups))
     elif current_user.user_role in [RoleEnum.reviewer.value]:
         q = q.filter(
             func.json_contains(MarkJob.reviewer_ids,
                                str(current_user.user_id)))
     elif current_user.user_role in [RoleEnum.annotator.value]:
         q = q.filter(
             func.json_contains(MarkJob.annotator_ids,
                                str(current_user.user_id)))
     if args.get('job_id'):
         q = q.filter(MarkTask.mark_job_id == args['job_id'])
     if args.get('doc_type_id'):
         q = q.filter(MarkJob.doc_type_id == args['doc_type_id'])
     if args['task_state']:
         q = q.filter(MarkTask.mark_task_status ==
                      status_str2int_mapper().get(args['task_state']))
     if args['query']:
         q = q.filter(Doc.doc_raw_name.like(f'%{args["query"]}%'))
     q = q.group_by(MarkTask)
     count = q.count()
     processing_count = q.filter(
         MarkTask.mark_task_status == int(StatusEnum.processing)).count()
     if args['order_by'] and isinstance(args['order_by'], str):
         if args['order_by'][1:] == 'task_id':
             args['order_by'] = args['order_by'][0] + 'mark_task_id'
         q = Common().order_by_model_fields(q, MarkTask, [args['order_by']])
     items = []
     results = q.offset(args['offset']).limit(args['limit']).all()
     mark_task_ids = [mark_task.mark_task_id for mark_task, _, _ in results]
     user_task_map = self._get_user_task_map(
         mark_task_ids,
         select_keys=(UserTask))  # alternatively: (UserTask.annotator_id, UserTask.mark_task_id)
     for mark_task, doc_type, doc in results:
         # build a fresh placeholder per task: a single shared instance would be
         # mutated on every iteration, overwriting earlier tasks' results
         placeholder = UserTask(
             annotator_id=0,
             is_deleted=False,
             user_task_status=StatusEnum.labeled.value)
         placeholder.user_task_result = mark_task.mark_task_result
         user_task_list = user_task_map.get(str(mark_task.mark_task_id),
                                            [placeholder])
         mark_task.user_task_list = user_task_list
         mark_task.doc = doc
         mark_task.doc_type = doc_type
         items.append(mark_task)
     return count, count - processing_count, items
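_get_user_task_map is private to the service; a hypothetical reconstruction of its grouping behavior, using plain dicts as stand-ins for UserTask rows (the string keys match the .get(str(...)) lookup above):

from collections import defaultdict

def build_user_task_map(user_tasks):
    """Group user tasks by their mark_task_id, keyed as strings."""
    grouped = defaultdict(list)
    for task in user_tasks:
        grouped[str(task["mark_task_id"])].append(task)
    return dict(grouped)

tasks = [{"mark_task_id": 1, "annotator_id": 7},
         {"mark_task_id": 1, "annotator_id": 8}]
assert len(build_user_task_map(tasks)["1"]) == 2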