示例#1
0
    def create_evaluate_task_by_train_job_id(train_job_id, evaluate_task_name, evaluate_task_desc, mark_job_ids, doc_term_ids, doc_relation_ids, use_rule=0):
        """
        Create an evaluate task for the train task of ``train_job_id`` and queue it.

        The evaluate task is located through the train job rather than the
        train task. If retraining logic is added later this has to switch to
        ``train_task_id``; it is kept as-is because train jobs and train tasks
        are currently one-to-one.

        NOTE(review): the incoming ``doc_term_ids`` argument is never read. It is
        replaced by the term ids looked up from ``doc_relation_ids`` right before
        the task is pushed -- confirm this is intended.

        Returns the newly created evaluate task.
        """
        # Resolve the train job, its doc type (with terms attached) and the train task.
        job = TrainJobModel().get_by_id(train_job_id)
        doc_type = DocTypeModel().get_by_id(job.doc_type_id)
        doc_type.doc_term_list = DocTermModel().get_by_filter(limit=99999, doc_type_id=doc_type.doc_type_id)

        nlp_task = NlpTaskEnum(doc_type.nlp_task_id)
        _count, train_tasks = TrainTaskModel().get_by_filter(train_job_id=train_job_id)
        # Only the first train task returned for the job is used.
        train_task = train_tasks[0]

        # Persist the evaluate task in "processing" state.
        evaluate_task = EvaluateTaskModel().create(
            evaluate_task_name=evaluate_task_name,
            evaluate_task_desc=evaluate_task_desc,
            train_task_id=train_task.train_task_id,
            evaluate_task_status=int(StatusEnum.processing),
        )

        # Link the evaluate task to every selected mark job in one bulk insert.
        m2m_rows = []
        for mark_job_id in mark_job_ids:
            m2m_rows.append({"evaluate_task_id": evaluate_task.evaluate_task_id, "mark_job_id": mark_job_id})
        EvaluateM2mMarkModel().bulk_create(m2m_rows)

        # Derive the term ids from the relations, then push to the evaluate redis queue.
        relation_terms = RelationM2mTermModel().get_by_filter(
            limit=99999,
            doc_relation_ids=[int(rl) for rl in doc_relation_ids],
        )
        doc_term_ids = [str(term.doc_term_id) for term in relation_terms]
        push_evaluate_task_to_redis(nlp_task, evaluate_task, train_task, doc_type, mark_job_ids, doc_term_ids, doc_relation_ids, use_rule)
        session.commit()
        return evaluate_task
 def create_doc_type(current_user: CurrentUser, args):
     """
     Create a doc type together with its doc terms and return it serialised.

     ``args`` is mutated: ``doc_term_list`` is popped out of it, and
     ``group_id`` is filled in from the user's first group when it is missing
     or smaller than 1. If the user has no groups the request is aborted
     with HTTP 403.

     Returns the result of ``DocTypeSchema().dumps`` for the new doc type.
     """
     term_payloads = args.pop('doc_term_list')

     if 'group_id' not in args or args['group_id'] < 1:
         if not current_user.user_groups:
             abort(403, message="当前角色禁止创建项目,请切换角色操作")
         else:
             args['group_id'] = current_user.user_groups[0]

     doc_type = DocTypeModel().create(**args)

     # Every term payload must point at the doc type that was just created.
     for payload in term_payloads:
         payload.update(doc_type_id=doc_type.doc_type_id)
     doc_type.doc_term_list = DocTermModel().bulk_create(term_payloads)

     session.commit()
     return DocTypeSchema().dumps(doc_type)
    def import_mark_job(self, files, args, nlp_task):
        """
        Create an approved mark job and import already-labeled files into it.

        Each file is handed to the importer matching ``nlp_task`` (classify,
        extract or wordseg). Any other task raises ``TypeError`` once the first
        file is reached, i.e. after the job has already been created.

        Returns the mark job dumped through ``MarkJobSchema``.
        """
        # Result is discarded; presumably an existence check on the doc type -- TODO confirm.
        DocTypeModel().get_by_id(args['doc_type_id'])

        job = MarkJobModel().create(
            mark_job_name=args['mark_job_name'],
            mark_job_type=args['mark_job_type'],
            mark_job_desc=args.get('mark_job_desc'),
            doc_type_id=args['doc_type_id'],
            mark_job_status=int(StatusEnum.approved),
            assign_mode='average',
        )

        # Collected for parity with the importers' return values; not read afterwards.
        imported_tasks = []
        for upload in files:
            if nlp_task == NlpTaskEnum.classify:
                importer = self.import_labeled_classify_files
            elif nlp_task == NlpTaskEnum.extract:
                importer = self.import_labeled_extract_files
            elif nlp_task == NlpTaskEnum.wordseg:
                importer = self.import_labeled_wordseg_files
            else:
                raise TypeError('nlp_task illegal')
            imported_tasks.extend(importer(upload, job))

        session.commit()
        return MarkJobSchema().dump(job)
    def get_mark_job_data_by_ids(self, mark_job_ids, args, doc_type_key="doc_type", prefix='NER'):
        """
        Collect, for each mark job, its doc type plus the docs and mark tasks to export.

        Filtering is driven by ``args``:

        * neither ``doc_term_ids`` nor ``doc_relation_ids`` given: every
          (task, doc) pair of the job is included;
        * otherwise a pair is included only when ``task.mark_task_result`` is a
          list and it matches at least one of the supplied filters
          (``Common.check_doc_term_include`` for terms,
          ``Common.check_doc_relation_include`` for relations).

        Each pair is appended at most once. Previously the ``else`` branch was
        bound only to the relation check, so when just ``doc_term_ids`` was
        given every pair was appended unfiltered (and matching pairs twice),
        and a pair matching both filters was duplicated.

        :param mark_job_ids: ids of the mark jobs to export.
        :param args: mapping that may carry ``doc_term_ids`` / ``doc_relation_ids``.
        :param doc_type_key: key under which the dumped doc type is stored.
        :param prefix: value stored under ``"prefix"`` in every item.
        :return: list with one dict per mark job (``prefix``, doc type, ``docs``,
            ``tasks``, ``mark_job_id``).
        """
        # The filters do not change between jobs, so read them once.
        term_ids = args.get('doc_term_ids')
        relation_ids = args.get('doc_relation_ids')
        unfiltered = not term_ids and not relation_ids

        items = []
        for mark_job_id in mark_job_ids:
            doc_type = DocTypeModel().get_by_mark_job_id(mark_job_id)
            result = {
                "prefix": prefix,  # TODO: confirm with MQ that this argument is compatible
                doc_type_key: DocTypeSchema().dump(doc_type),
                "docs": [],
                "tasks": [],
                "mark_job_id": mark_job_id,
            }
            data = MarkTaskModel().get_mark_task_and_doc_by_mark_job_ids([mark_job_id])

            for task, doc in data:
                if unfiltered:
                    keep = True
                elif not isinstance(task.mark_task_result, list):
                    # Filters can only be evaluated against list-shaped results.
                    keep = False
                else:
                    # Extraction filter first, then the entity-relation filter.
                    keep = bool(
                        term_ids
                        and Common.check_doc_term_include(task.mark_task_result, 'doc_term_id', term_ids)
                    ) or bool(
                        relation_ids
                        and Common.check_doc_relation_include(task.mark_task_result, 'relation_id', relation_ids)
                    )

                if keep:
                    result['docs'].append(DocSchema().dump(doc))
                    result['tasks'].append(MarkTaskSchema().dump(task))
            items.append(result)
        return items
 def get_doc_type(current_user: CurrentUser, args):
     """
     Page through the doc types visible to ``current_user``.

     Reads ``mark_job_ids`` (optional, defaults to an empty list),
     ``nlp_task_id``, ``offset`` and ``limit`` from ``args`` and forwards them
     to ``DocTypeModel.get_by_mark_job_ids``.

     Returns a ``(serialised_doc_types, count)`` tuple.
     """
     query = dict(
         mark_job_ids=args.get('mark_job_ids', []),
         nlp_task_id=args["nlp_task_id"],
         current_user=current_user,
         offset=args["offset"],
         limit=args["limit"],
     )
     total, doc_types = DocTypeModel().get_by_mark_job_ids(**query)
     return DocTypeSchema(many=True).dump(doc_types), total
    def re_pre_label_mark_job(self, mark_job_ids, nlp_task):
        """
        Re-send pre-labeling messages for the unlabeled tasks of the given mark jobs.

        Aborts with HTTP 400 (listing the offending project names) when any doc
        type behind the mark jobs is not returned by
        ``DocTypeModel.get_online_ids_by_ids``, i.e. has no online model.

        NOTE(review): ``pipe`` is created and executed here but nothing in this
        method queues commands on it, and each task is passed three times to
        ``push_mark_task_message`` -- confirm both against that helper.
        """
        pipe = r.pipeline()

        # Doc types referenced by the requested mark jobs.
        jobs = MarkJobModel().get_by_ids(mark_job_ids)
        doc_type_ids = {job.doc_type_id for job in jobs}

        # Doc types among them that have an online model.
        online_doc_type_ids = DocTypeModel().get_online_ids_by_ids(doc_type_ids)

        # Any doc type without an online model makes the whole request invalid.
        offline_ids = doc_type_ids - online_doc_type_ids
        if offline_ids:
            offline_doc_types = DocTypeModel().get_by_ids(offline_ids)
            names = '、'.join(doc_type.doc_type_name for doc_type in offline_doc_types)
            abort(400, message='项目:{},没有上线模型'.format(names))

        # One message per unlabeled mark task across all requested jobs.
        pending_tasks = MarkTaskModel().get_unlabel_tasks_by_mark_job_ids(mark_job_ids)
        for pending in pending_tasks:
            self.push_mark_task_message(pending, pending, pending, business=f"{nlp_task.name}_label")

        pipe.execute()
    def create_relation(doc_type_id: int, doc_term_ids: typing.List, doc_relation_name: str):
        """
        Create a relation between two doc terms of a doc type.

        Raises ``ValueError`` when the doc type lookup is falsy, or when the
        lookup of ``doc_term_ids`` does not yield exactly two doc terms.

        Returns a dict with ``doc_relation_name`` and the new ``doc_relation_id``.
        """
        doc_type = DocTypeModel().get_by_id(doc_type_id)
        if not doc_type:
            raise ValueError(f"DocType {doc_type_id} 不存在")

        matched_terms = DocTermModel().get_by_filter(doc_term_ids=doc_term_ids)
        if len(matched_terms) != 2:
            raise ValueError("DocTerm 不存在或已被删除")

        relation = DocTermModel().create_relation(doc_relation_name, doc_term_ids, doc_type_id=doc_type_id)
        session.commit()
        return dict(
            doc_relation_name=doc_relation_name,
            doc_relation_id=relation.doc_relation_id,
        )
    def update_doc_type(args, doc_type_id):
        """
        Update a doc type, upsert the submitted doc terms and delete the rest.

        Every entry of ``args["doc_term_list"]`` (when present and non-empty) is
        stamped with ``doc_type_id`` and sent to ``DocTermModel.bulk_update``.
        Doc terms that existed before the call but whose id is not among the
        submitted entries are then deleted. The session is committed after
        each of the two phases.

        Returns the updated doc type dumped through ``DocTypeSchema``.
        """
        item = DocTypeModel().update(doc_type_id, **args)

        # Snapshot the ids that exist before any term is touched.
        previous_terms = DocTermModel().get_by_filter(doc_type_id=doc_type_id)
        previous_ids = [term.doc_term_id for term in previous_terms]

        submitted_ids = []
        submitted_terms = args.get("doc_term_list")
        if submitted_terms:
            for term in submitted_terms:
                term.update({"doc_type_id": doc_type_id})
                # Entries without an id are recorded as 0.
                submitted_ids.append(term.get("doc_term_id", 0))
            DocTermModel().bulk_update(submitted_terms)
        session.commit()

        # Remove doc terms that were not part of the submission.
        stale_ids = [term_id for term_id in previous_ids if term_id not in submitted_ids]
        for term_id in stale_ids:
            DocTermModel().delete(term_id)
        session.commit()
        return DocTypeSchema().dump(item)
    def get_doc_type_info_by_nlp_task_by_user(nlp_task_id, current_user: CurrentUser):
        """
        Build the doc_type entries for the management hall home page.

        Each returned dict holds the serialised doc type under ``"doc_type"``,
        a ``"progress_state"`` (``job_num``, ``labeled_job_number``,
        ``progress_rate``) and, when one exists, the latest evaluation under
        ``"evaluate"``.

        A mark job counts as labeled when its total from the first
        ``count_status_by_user`` result equals its value in the second one
        (missing values count as 0).
        """
        # Doc types visible to the user, each paired with a comma-separated term id string.
        _, rows = DocTypeModel().get_by_nlp_task_id_by_user(nlp_task_id=nlp_task_id, current_user=current_user)
        for doc_type, terms in rows:
            if terms is None:
                doc_type.doc_terms = []
            else:
                doc_type.doc_terms = [int(t) for t in terms.split(",")]
        entries = [{"doc_type": DocTypeSchema().dump(row[0])} for row in rows]

        # Per-job totals and per-job marked counts, used below as
        # {doc_type_id: {mark_job_id: count}}.
        all_status, all_marked_status = MarkTaskModel().count_status_by_user(nlp_task_id=nlp_task_id, current_user=current_user)
        totals_by_doc_type = Common().tuple_list2dict(all_status)
        marked_by_doc_type = Common().tuple_list2dict(all_marked_status)

        result = []
        for entry in entries:
            doc_type_id = entry["doc_type"]["doc_type_id"]
            job_totals = totals_by_doc_type.get(doc_type_id, {})
            job_num = len(job_totals)

            # A job is fully labeled when its marked count equals its total.
            labeled_jobs = 0
            for job_id, total in job_totals.items():
                marked = marked_by_doc_type.get(doc_type_id, {}).get(job_id, 0)
                if total == marked:
                    labeled_jobs += 1

            if job_num > 0:
                rate = round(labeled_jobs / job_num, 2)
            else:
                rate = 0
            entry.update(progress_state={"job_num": job_num,
                                         "labeled_job_number": labeled_jobs,
                                         "progress_rate": rate})

            # Attach the latest evaluation result if there is one.
            latest = EvaluateTaskModel().get_latest_evaluate_by_doc_type_id(nlp_task_id=nlp_task_id,
                                                                            doc_type_id=doc_type_id)
            if latest:
                entry.update(evaluate=EvaluateTaskSchema().dump(latest))
            result.append(entry)
        return result
示例#10
0
 def create_doc_type():
     """
     Seed eleven fixed test doc types when ``DocTypeModel().get_all()`` is empty.

     Ids run from 1 to 11 in the order listed below; every row uses
     ``app_id=1`` and ``created_by=1``. Does nothing when any doc type
     already exists.
     """
     from app.model import DocTypeModel
     if len(DocTypeModel().get_all()) != 0:
         return

     # (doc_type_name, nlp task) in id order.
     seeds = [
         ("测试抽取项目1", NlpTaskEnum.extract),
         ("测试抽取项目2", NlpTaskEnum.extract),
         ("测试抽取项目3", NlpTaskEnum.extract),
         ("测试抽取项目4", NlpTaskEnum.extract),
         ("测试分类项目1", NlpTaskEnum.classify),
         ("测试分类项目2", NlpTaskEnum.classify),
         ("测试分类项目3", NlpTaskEnum.classify),
         ("测试关系项目1", NlpTaskEnum.relation),
         ("测试关系项目2", NlpTaskEnum.relation),
         ("测试分词项目1", NlpTaskEnum.wordseg),
         ("测试分词项目2", NlpTaskEnum.wordseg),
     ]
     doc_types = [
         dict(app_id=1,
              created_by=1,
              doc_type_id=position,
              doc_type_name=name,
              nlp_task_id=int(task))
         for position, (name, task) in enumerate(seeds, start=1)
     ]
     DocTypeModel().bulk_create(doc_types)
     session.commit()
示例#11
0
 def get(self):
     """Return the result of ``DocTypeModel().get_all()`` wrapped by ``jsonify``."""
     doc_types = DocTypeModel().get_all()
     return jsonify(doc_types)
 def set_favoriate_doc_type(doc_type_id, is_favorite: bool):
     """
     Set the ``is_favorite`` flag of a doc type and return it serialised.

     NOTE(review): no ``session.commit()`` is issued here, unlike the other
     update helpers in this file -- confirm that is intended.
     """
     updated = DocTypeModel().update(doc_type_id=doc_type_id, is_favorite=is_favorite)
     schema = DocTypeSchema()
     return schema.dump(updated)
    def get_doc_type_items(doc_type_id: int):
        """Return the doc type, with its doc terms attached, dumped through ``DocTypeSchema``."""
        doc_type = DocTypeModel().get_by_id(doc_type_id)
        terms = DocTermModel().get_by_filter(doc_type_id=doc_type_id)
        doc_type.doc_term_list = terms
        return DocTypeSchema().dump(doc_type)
 def delete_doc_type(doc_type_id):
     """Delete the doc type with the given id and commit the session."""
     model = DocTypeModel()
     model.delete(doc_type_id)
     session.commit()
 def get_by_id_and_user_group(doc_type_id, group_id):
     """Look up a doc type by id and group id via ``DocTypeModel.get_by_id_by_user_group``."""
     return DocTypeModel().get_by_id_by_user_group(_id=doc_type_id, group_id=group_id)
 def get_by_id(doc_type_id):
     """Return whatever ``DocTypeModel.get_by_id`` yields for ``doc_type_id``."""
     return DocTypeModel().get_by_id(doc_type_id)
 def create_relation_doc_type(args):
     """Create a doc type from ``args``, commit, and return it dumped through ``DocTypeSchema``."""
     created = DocTypeModel().create(**args)
     session.commit()
     return DocTypeSchema().dump(created)
 def update_relation_doc_type(args, doc_type_id):
     """Update the doc type with ``args``, commit, and return it dumped through ``DocTypeSchema``."""
     updated = DocTypeModel().update(doc_type_id, **args)
     session.commit()
     serialised = DocTypeSchema().dump(updated)
     return serialised
 def check_doc_type_name_exists(doc_type_name):
     """Return the result of ``DocTypeModel.if_exists_by_name`` for ``doc_type_name``."""
     exists = DocTypeModel().if_exists_by_name(doc_type_name)
     return exists