Пример #1
0
def statute2id():
    """Convert statute content into its id form and store it.

    Selects statutes with ``trainCount >= 10``, a non-zero ``flag`` and no
    ``contentid`` field yet, passes each ``contentWords2`` value through
    ``seq2id`` and writes the result back as ``contentid``.

    :return: None
    """
    from flow.wordvector import seq2id

    statutes_set = dbutil.get_mongodb_conn().statutes
    pending_filter = {
        "trainCount": {"$gte": 10},
        "flag": {"$ne": 0},
        "contentid": {"$exists": False},
    }

    for statute in statutes_set.find(pending_filter):
        new_fields = {"contentid": seq2id(statute["contentWords2"])}
        statutes_set.update(
            {"_id": statute["_id"]},  # match exactly this statute
            {"$set": new_fields},  # field to write
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )
Пример #2
0
def case2id():
    """Convert case content into its id form and store it.

    Selects cases whose ``flag`` is non-zero and below 5 and that have no
    ``ygscid`` field yet, passes each ``ygscWords2`` value through ``seq2id``
    and writes the result back as ``ygscid``.

    :return: None
    """
    from flow.wordvector import seq2id

    cases_set = dbutil.get_mongodb_conn().cases
    pending_filter = {
        "flag": {"$ne": 0, "$lt": 5},
        "ygscid": {"$exists": False},
    }
    pending_cases = cases_set.find(
        pending_filter, no_cursor_timeout=True).batch_size(10)

    for case in pending_cases:
        new_fields = {"ygscid": seq2id(case["ygscWords2"])}
        cases_set.update(
            {"_id": case["_id"]},  # match exactly this case
            {"$set": new_fields},  # field to write
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )
Пример #3
0
def sampling_train(total_num=10000):
    """Move candidate cases (flag 12) to flag 2 until ``total_num`` are taken.

    A case qualifies when its ``ygscWords2`` string splits into more than 10
    and fewer than 30 space-separated words. For every qualifying case the
    ``sampleTrainCount`` counter of each statute listed in ``ftids`` is
    incremented by one.

    The cursor is opened with ``no_cursor_timeout=True`` and the loop can
    stop before the cursor is exhausted, so the cursor is closed explicitly
    rather than left to garbage collection.

    :param total_num: number of cases to sample before stopping
    :return: None
    """
    logger = myutil.getLogger("sample.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes

    num = 0
    cursor = cases_set.find(
        {"flag": 12}, no_cursor_timeout=True).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # record the case currently processed
            ygsc_words_2 = line["ygscWords2"].split(" ")

            if 10 < len(ygsc_words_2) < 30:
                num += 1
                cases_set.update(
                    {"_id": line["_id"]},  # match exactly this case
                    {'$set': {
                        "flag": 2
                    }},
                    upsert=False,  # never insert when no document matches
                    multi=False,  # update at most one document
                )
                for ftid in line["ftids"]:
                    statutes_set.update(
                        {"_id": ftid},
                        {'$inc': {
                            "sampleTrainCount": 1
                        }},
                        upsert=False,  # never insert when no document matches
                        multi=False,  # update at most one document
                    )
            if num == total_num:
                break
    finally:
        # Release the server-side cursor even on early exit or error.
        cursor.close()
Пример #4
0
def case_fenci_second_patch():
    """Re-admit patched flag-10 cases whose word count is acceptable.

    Visits cases with ``flag == 10`` that carry a ``patch`` field. When the
    ``ygscWords2`` string splits into more than 3 and at most 80 words, the
    case is set to flag 2 and ``trainCount`` is incremented on every statute
    listed in its ``ftids``.

    :return: None
    """
    logger = myutil.getLogger("fenci_patch.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes

    patched_filter = {"flag": 10, "patch": {"$exists": True}}
    patched_cases = cases_set.find(
        patched_filter, no_cursor_timeout=True).batch_size(10)

    for case in patched_cases:
        logger.info(case["_id"])  # record the case currently processed
        word_total = len(case["ygscWords2"].split(" "))
        if not 3 < word_total <= 80:
            continue

        cases_set.update(
            {"_id": case["_id"]},  # match exactly this case
            {"$set": {"flag": 2}},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )
        for statute_id in case["ftids"]:
            statutes_set.update(
                {"_id": statute_id},
                {"$inc": {"trainCount": 1}},
                upsert=False,  # never insert when no document matches
                multi=False,  # update at most one document
            )
Пример #5
0
def runCnn(ygscid, candi_statutes, type="cnn"):
    """Score candidate statutes against one case with the loaded model.

    Each candidate's ``contentid`` is fetched from the ``statutes``
    collection and stacked into ``s2`` through
    ``myutil.append_and_pad_2d_array``; ``ygscid`` is repeated once per
    candidate to form ``s1``. Candidates whose prediction is at least 0.5
    are recommended.

    :param ygscid: id sequence of the case, repeated for every candidate
    :param candi_statutes: ids (``_id`` values) of the candidate statutes
    :param type: model type forwarded to ``__loadModel``
    :return: tuple ``(recommended statute ids, flattened model outputs)``;
        both are empty when ``candi_statutes`` is empty
    """
    __loadModel(type)

    # With no candidates s2 would stay None and `s2.shape` would fail, so
    # answer with empty results instead of running the model.
    if len(candi_statutes) == 0:
        return [], np.array([])

    db = dbutil.get_mongodb_conn()
    statutes_set = db.statutes

    # 1: build the input data
    s2 = None
    for ftid in candi_statutes:
        statute = statutes_set.find_one({"_id": ftid}, {"contentid": 1})
        s2 = myutil.append_and_pad_2d_array(s2,
                                            np.array([statute["contentid"]]))
    s1 = np.repeat([ygscid], s2.shape[0], axis=0)  # one copy of s1 per s2 row
    label = np.repeat([[0]], s2.shape[0], axis=0)  # placeholder labels, one per row

    # 2: run the model with dropout disabled (keep probability 1.0)
    y = model.sess.run(
        [model.predict_op],
        feed_dict={
            model.input_s1: s1,
            model.input_s2: s2,
            model.input_y: label,
            model.dropout_keep_prob: 1.0
        })

    # 3: map predictions >= 0.5 back to the candidate statute ids
    pred = np.greater_equal(y, 0.5).astype(np.int32).reshape(-1)
    recom_index = np.where(pred == 1)[0]
    recom_statutes = [candi_statutes[i] for i in recom_index]
    return recom_statutes, np.asarray(y).reshape(-1)
Пример #6
0
def __transToData(xml_names):
    """Assemble paired case/statute inputs and labels for the given cases.

    For every case id, the case's ``ygscid`` is repeated once per positive
    (``ftids``) and negative (``negftids``) statute and appended to ``s1``;
    the matching statutes' ``contentid`` values are appended to ``s2``;
    ``label`` receives 1 for each positive and 0 for each negative statute.

    NOTE(review): the visible body ends without a ``return`` statement, so
    as written the function returns ``None`` and the built arrays are
    discarded -- presumably the tail of the function is missing; confirm
    against the original source.

    :param xml_names: iterable of case ``_id`` values
    """
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes

    s1 = None
    s2 = None
    label = np.array([], dtype=np.int32)
    for xml_name in xml_names:
        # Fetch only the fields needed to build the pairs.
        case = cases_set.find_one({"_id": xml_name}, {
            "ygscid": 1,
            "ftids": 1,
            "negftids": 1
        })

        # 1: repeat the plaintiff-claim ids once per positive and negative statute
        ygsc_array = np.repeat([case["ygscid"]],
                               len(case["ftids"]) + len(case["negftids"]),
                               axis=0)
        s1 = myutil.append_and_pad_2d_array(s1, ygsc_array)
        # Positives are labelled 1, negatives 0, in the same order as below.
        label = np.append(label, [1] * len(case["ftids"]) +
                          [0] * len(case["negftids"]))
        for ftid in case["ftids"] + case["negftids"]:
            statute = statutes_set.find_one({"_id": ftid}, {"contentid": 1})
            s2 = myutil.append_and_pad_2d_array(
                s2, np.array([statute["contentid"]]))
Пример #7
0
def trainDataPrepare(nn_type="cnn", recom_num=35, sim_type="lda"):
    """Build and save the LR training inputs and targets from flag-2 cases.

    For each flag-2 case that has a ``sim_type`` field, the first
    ``recom_num`` entries of that field are the candidate statutes. The
    input row comes from ``__get_lrinput``; the target marks with 1 every
    candidate that also appears in the case's ``ftids``. The concatenated
    arrays are dumped with joblib to ``checkpoint/lr_trainx.pk`` and
    ``checkpoint/lr_trainy.pk``.

    :param nn_type: model type forwarded to ``__get_lrinput``
    :param recom_num: maximum number of candidate statutes per case
    :param sim_type: name of the case field holding the candidate list
    :return: None
    """
    inputs = []
    targets = []

    cases_set = dbutil.get_mongodb_conn().cases
    case_filter = {"flag": 2, sim_type: {"$exists": True}}
    wanted_fields = {"_id": 1, "ftids": 1, sim_type: 1, "ygscid": 1}
    train_cases = cases_set.find(
        case_filter, wanted_fields, no_cursor_timeout=True).batch_size(20)

    for case in train_cases:
        # 1: candidate statute set, capped at recom_num
        statute_num = min(len(case[sim_type]), recom_num)
        candidates = case[sim_type][:statute_num]

        # 2: model input for this case
        inputs.append(
            __get_lrinput(case["ygscid"], candidates, nn_type, recom_num))

        # 3: target vector, 1 where the candidate is actually cited
        hits = np.zeros(statute_num, dtype=np.int32)
        for position, statute_id in enumerate(candidates):
            if statute_id in case["ftids"]:
                hits[position] = 1
        targets.append(hits)

    # 4: persist both arrays
    train_x = np.concatenate(inputs, axis=0)
    train_y = np.concatenate(targets, axis=0)
    outputs = (
        ("checkpoint/lr_trainx.pk", train_x),
        ("checkpoint/lr_trainy.pk", train_y),
    )
    for path, payload in outputs:
        with open(path, "wb") as sink:
            joblib.dump(payload, sink)
Пример #8
0
def prepareLabels(flag=2):
    """Prepare the multi-hot ``label`` field for the cases of one data set.

    With ``flag == 2`` the statute-to-index dictionary is built on the fly
    (label length is the number of statutes that have ``sampleTrainCount``)
    and saved to ``checkpoint/statute_dict.pk`` at the end. With any other
    flag the dictionary is loaded from that file, and a case that cites a
    statute missing from it is logged and left without a label.

    :param flag: ``flag`` value of the cases to label
    :return: None
    """
    logger = myutil.getLogger("label.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    building_dict = flag == 2

    statute_dict = {}
    next_index = 0
    if building_dict:
        statute_num = db.statutes.count(
            {"sampleTrainCount": {"$exists": True}})
    else:
        with open("checkpoint/statute_dict.pk", "rb") as source:
            statute_dict = joblib.load(source)
        statute_num = len(statute_dict)

    flagged_cases = cases_set.find(
        {"flag": flag}, {"ftids": 1}, no_cursor_timeout=True).batch_size(20)
    for case in flagged_cases:
        logger.info(case["_id"])
        label = [0] * statute_num
        for ftid in case["ftids"]:
            if ftid in statute_dict:
                label[statute_dict[ftid]] = 1  # known statute: mark it
                continue
            if not building_dict:
                logger.error("出现不在训练集的法条:%s" % case["_id"])
                break
            # Unseen statute while building: give it the next free index.
            statute_dict[ftid] = next_index
            label[next_index] = 1
            next_index += 1
        else:
            # Reached only when no unknown statute aborted the loop.
            cases_set.update(
                {"_id": case["_id"]},  # match exactly this case
                {"$set": {"label": label}},
                upsert=False,  # never insert when no document matches
                multi=False,  # update at most one document
            )

    # Persist the dictionary only when it was built in this run.
    if building_dict:
        with open("checkpoint/statute_dict.pk", "wb") as sink:
            joblib.dump(statute_dict, sink)
Пример #9
0
def case_fenci_second():
    """Second-pass word filtering for cases that lack ``ygscWords2``.

    For each case with a non-zero ``flag`` and no ``ygscWords2`` field, the
    words of ``ygscWords`` are kept when they exist in the ``words``
    collection, pass ``__not_stopwords`` and do not repeat any of the last
    five kept words. Cases left with fewer than 3 or more than 80 words get
    flag 10; otherwise, for flag-2 cases, ``trainCount`` is incremented on
    every statute in ``ftids``. The filtered words and the flag are saved.

    :return: None
    """
    logger = myutil.getLogger("fenci.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    words_set = db.words

    todo_filter = {
        "flag": {"$ne": 0},
        "ygscWords2": {"$exists": False},
    }
    for case in cases_set.find(
            todo_filter, no_cursor_timeout=True).batch_size(10):
        logger.info(case["_id"])  # record the case currently processed
        flag = case["flag"]
        kept_words = []

        # 1: filter the words
        for word in case["ygscWords"].split(" "):
            # 1.1: drop words unknown to the words collection or rejected
            #      by __not_stopwords
            word_db = words_set.find_one({"_id": word})
            if word_db is None or not __not_stopwords(word_db):
                continue
            # 1.2: skip a word already among the last five kept words
            if word not in kept_words[-5:]:
                kept_words.append(word)

        # 2: cases that end up too short or too long are set aside
        kept_total = len(kept_words)
        if kept_total < 3 or kept_total > 80:
            flag = 10
        elif flag == 2:  # training set: count statute citations
            for statute_id in case["ftids"]:
                statutes_set.update(
                    {"_id": statute_id},
                    {"$inc": {"trainCount": 1}},
                    upsert=False,  # never insert when no document matches
                    multi=False,  # update at most one document
                )

        cases_set.update(
            {"_id": case["_id"]},  # match exactly this case
            {"$set": {
                "ygscWords2": " ".join(kept_words),
                "flag": flag,
            }},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )
Пример #10
0
def evaluateLR(model="cnn", recom_num=35, sim_type="lda"):
    """Evaluate the LR recommender on the flag-4 cases.

    For every flag-4 case that has a ``sim_type`` field, the first
    ``recom_num`` candidates are passed to ``runLR`` and the per-case
    precision and recall against ``ftids`` are accumulated. The totals go
    to ``__print_total_precise_and_recall``.

    :param model: model name forwarded to ``runLR`` and used as report label
    :param recom_num: maximum number of candidate statutes per case
    :param sim_type: name of the case field holding the candidate list
    :return: None
    """
    from flow.lr import runLR

    # Column names for per-case results; nothing below reads them because
    # writing per-case results back to the database is currently disabled.
    column_prefix = "LR" + model
    model_precise_name = column_prefix + "Precise"
    model_recall_name = column_prefix + "Recall"

    totals = {"precise": 0, "recall": 0, "cases": 0}

    cases_set = dbutil.get_mongodb_conn().cases
    case_filter = {"flag": 4, sim_type: {"$exists": True}}
    wanted_fields = {"_id": 1, "ftids": 1, sim_type: 1, "ygscid": 1}
    test_cases = cases_set.find(
        case_filter, wanted_fields, no_cursor_timeout=True).batch_size(20)

    for case in test_cases:
        # 1: candidate statute set, capped at recom_num
        statute_num = min(len(case[sim_type]), recom_num)
        candidates = case[sim_type][:statute_num]

        # 2: run the model
        recommended = runLR(case["ygscid"], candidates, model, recom_num)

        # 3: per-case precision and recall
        case_precise, case_recall = __get_precise_and_recall(
            recommended, case["ftids"])

        # 4: accumulate
        totals["precise"] += case_precise
        totals["recall"] += case_recall
        totals["cases"] += 1

    # Overall precision and recall
    __print_total_precise_and_recall(
        totals["precise"], totals["recall"], totals["cases"], model)
Пример #11
0
def lda_recom_num():
    """Print a histogram of ``lda`` list lengths over the flag-4 cases.

    Lengths are bucketed by integer division by 10; the printed dict maps
    each bucket index to the number of cases falling into it.

    :return: None
    """
    cases_set = dbutil.get_mongodb_conn().cases
    histogram = dict()

    flagged = cases_set.find(
        {"flag": 4}, {"lda": 1}, no_cursor_timeout=True).batch_size(10)
    for case in flagged:
        bucket = len(case["lda"]) // 10
        histogram[bucket] = histogram.get(bucket, 0) + 1

    print(histogram)
Пример #12
0
def statute_refed_num(statute_num=1405):
    """Print statistics on how often statutes are cited by flag-4 cases.

    Sums the ``label`` vectors of all flag-4 cases element-wise, drops the
    positions that stay zero and prints max, min, average and count of the
    remaining values. When nothing is cited at all, zeros are printed
    instead of failing on an empty array.

    :param statute_num: length of the ``label`` vectors; the default keeps
        the previously hard-coded value
    :return: None
    """
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases

    statutes_vec = numpy.zeros(statute_num)

    for line in cases_set.find({"flag": 4}, {"label": 1},
                               no_cursor_timeout=True).batch_size(10):
        statutes_vec += line["label"]

    # Keep only the statutes that were cited at least once.
    statutes_vec = statutes_vec[numpy.nonzero(statutes_vec)]
    if statutes_vec.size == 0:
        # numpy.max/min raise ValueError on an empty array.
        print("max:%f, min:%f, avg:%f, num:%d" % (0.0, 0.0, 0.0, 0))
        return
    print("max:%f, min:%f, avg:%f, num:%d" %
          (numpy.max(statutes_vec), numpy.min(statutes_vec),
           numpy.average(statutes_vec), statutes_vec.shape[0]))
Пример #13
0
def runRules(oriStatutes):
    """Extend a statute list with the consequents of applicable rules.

    A rule from the ``rules`` collection applies when every statute in its
    ``from`` list is among ``oriStatutes``; the statutes in its ``to`` list
    are then added to the result.

    :param oriStatutes: list of statute ids to start from
    :return: list with the original statutes plus all added ones
    """
    # Rules whose antecedent shares at least one statute with the input
    rules_set = dbutil.get_mongodb_conn().rules
    candidate_rules = rules_set.find({"from": {"$in": oriStatutes}})

    # Collect the consequents of fully satisfied rules
    given = set(oriStatutes)
    expanded = set(oriStatutes)
    for rule in candidate_rules:
        antecedent = set(rule["from"])
        if antecedent <= given:
            expanded.update(rule["to"])
    return list(expanded)
Пример #14
0
def statutes_fenci():
    """Segment every statute's ``content`` and store it as ``contentWords``.

    The words returned by ``fenci`` are joined with single spaces.

    :return: None
    """
    statutes_set = dbutil.get_mongodb_conn().statutes  # created on first use if absent
    for statute in statutes_set.find():
        segmented = fenci(statute["content"])
        statutes_set.update(
            {"_id": statute["_id"]},  # match exactly this statute
            {"$set": {"contentWords": " ".join(segmented)}},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )
Пример #15
0
def evaluateSim(recom_num, type="lda"):  # only the flag-4 set needs this
    """Evaluate the top ``recom_num`` similarity candidates of flag-4 cases.

    For every flag-4 case that has a ``type`` field, precision and recall of
    its first ``recom_num`` entries against ``ftids`` are computed, stored
    on the case under ``<type><recom_num>Precise`` / ``...Recall`` and
    accumulated for the overall report.

    :param recom_num: number of leading candidates to evaluate
    :param type: name of the case field holding the candidate list
    :return: None
    """
    # Column names used to store the per-case results
    model_precise_name = type + str(recom_num) + "Precise"
    model_recall_name = type + str(recom_num) + "Recall"

    precise_total = 0
    recall_total = 0
    evaluated = 0

    cases_set = dbutil.get_mongodb_conn().cases
    case_filter = {"flag": 4, type: {"$exists": True}}
    wanted_fields = {"_id": 1, "ftids": 1, type: 1}
    test_cases = cases_set.find(
        case_filter, wanted_fields, no_cursor_timeout=True).batch_size(20)

    for case in test_cases:
        # 1: recommendation list, capped at recom_num
        statute_num = min(len(case[type]), recom_num)
        recommended = case[type][:statute_num]

        # 2: per-case precision and recall
        case_precise, case_recall = __get_precise_and_recall(
            recommended, case["ftids"])

        # 3: accumulate
        precise_total += case_precise
        recall_total += case_recall
        evaluated += 1

        # 4: store the per-case result
        per_case_fields = {
            model_precise_name: case_precise,
            model_recall_name: case_recall,
        }
        cases_set.update(
            {"_id": case["_id"]},  # match exactly this case
            {"$set": per_case_fields},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )

    # Overall precision and recall
    __print_total_precise_and_recall(
        precise_total, recall_total, evaluated, type + str(recom_num))
Пример #16
0
def print_stopwords():
    """Print the ids of very frequent words on one space-separated line.

    A word is printed when its ``aynum`` is at least 170 and its
    ``totalCount`` is at least 50000.

    :return: None
    """
    words_set = dbutil.get_mongodb_conn().words
    frequent_filter = {
        "aynum": {"$gte": 170},
        "totalCount": {"$gte": 50000},
    }
    frequent_words = words_set.find(
        frequent_filter, no_cursor_timeout=True).batch_size(10)
    for word_doc in frequent_words:
        print(word_doc["_id"], end=" ")
Пример #17
0
def clean_by_statutes():
    """Flag cases citing unknown statutes and count citations per data set.

    Visits cases whose ``flag`` is non-zero and below 5 and that have no
    ``clean`` field. A case citing any statute outside
    ``get_statutes_set()`` gets flag 9. For cases that remain at flag 3 or
    4, a counter is incremented on every cited statute (``testCount`` for
    flag 3, ``validCount`` for flag 4). Every visited case is marked with
    ``clean: 0``.

    :return: None
    """
    logger = myutil.getLogger("clean.log")
    allowed_statutes = get_statutes_set()
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes

    unclean_filter = {
        "flag": {"$ne": 0, "$lt": 5},
        "clean": {"$exists": False},
    }
    for case in cases_set.find(
            unclean_filter, no_cursor_timeout=True).batch_size(10):
        logger.info(case["_id"])  # record the case currently processed
        flag = case["flag"]

        if any(ftid not in allowed_statutes for ftid in case["ftids"]):
            flag = 9

        if flag in (3, 4):
            # The original comments call flag 3 the test set and the other
            # branch the validation set.
            counter_field = "testCount" if flag == 3 else "validCount"
            for statute_id in case["ftids"]:
                statutes_set.update(
                    {"_id": statute_id},
                    {"$inc": {counter_field: 1}},
                    upsert=False,  # never insert when no document matches
                    multi=False,  # update at most one document
                )

        cases_set.update(
            {"_id": case["_id"]},  # match exactly this case
            {"$set": {"flag": flag, "clean": 0}},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )
Пример #18
0
def get_statutes_set():
    """Return the ids of statutes with ``trainCount >= 10`` and non-zero flag.

    :return: set of statute ``_id`` values
    """
    statutes_set = dbutil.get_mongodb_conn().statutes
    eligible_filter = {
        "trainCount": {"$gte": 10},
        "flag": {"$ne": 0},
    }
    matches = statutes_set.find(eligible_filter, {"_id": 1})
    return {statute["_id"] for statute in matches}
Пример #19
0
def evaluateMulti(multi_model="svm"):
    """Evaluate a multi-label model on the flag-4 cases.

    For each flag-4 case, ``runMulti`` predicts a label vector from
    ``ygscWords2``; precision and recall against the stored ``label`` are
    accumulated and passed to ``__print_total_precise_and_recall``.

    Both denominators are floored at 0.001. Previously only the precision
    denominator was, so a case whose ``label`` sums to zero produced
    0/0 = NaN and turned the whole recall total into NaN.

    :param multi_model: model name forwarded to ``runMulti`` and used as
        report label
    :return: None
    """
    from flow.multiLabel import runMulti
    import numpy as np

    precise_sum = 0
    recall_sum = 0
    case_num = 0

    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    for line in cases_set.find({
            "flag": 4
    }, {
            "_id": 1,
            "ygscWords2": 1,
            "label": 1
    },
                               no_cursor_timeout=True).batch_size(20):
        # 1: predict the label vector for this case
        recom_statute = runMulti([line["ygscWords2"]], multi_model)[0]

        # 2: per-case precision and recall, both with a guarded denominator
        predict_sum = np.maximum(
            np.sum(recom_statute).astype(np.float32), 0.001)
        score_sum = np.maximum(
            np.sum(line["label"]).astype(np.float32), 0.001)
        predict_right = np.sum(np.logical_and(
            recom_statute, line["label"])).astype(np.float32)
        case_precise = np.mean(predict_right / predict_sum)
        case_recall = np.mean(predict_right / score_sum)

        # 3: accumulate
        precise_sum += case_precise
        recall_sum += case_recall
        case_num += 1

        # 4: writing the per-case results back to the database is disabled.

    # Overall precision and recall
    __print_total_precise_and_recall(precise_sum, recall_sum, case_num,
                                     multi_model)
Пример #20
0
def compute_words_aynum():
    """Store on each word how many ``ayCount`` entries reach a count of 5.

    The number of values in the word's ``ayCount`` mapping that are at
    least 5 is written to its ``aynum`` field.

    :return: None
    """
    words_set = dbutil.get_mongodb_conn().words
    for word_doc in words_set.find():
        frequent_entries = sum(
            1 for count in word_doc["ayCount"].values() if count >= 5)
        words_set.update(
            {"_id": word_doc["_id"]},  # match exactly this word
            {"$set": {"aynum": frequent_entries}},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )
Пример #21
0
def clearSim(type="lda"):
    """Clear the stored similarity results from all flag-4 cases.

    :param type: name of the case field to unset
    :return: None
    """
    cases_set = dbutil.get_mongodb_conn().cases
    selector = {"flag": 4}  # which cases to touch
    removal = {"$unset": {type: 1}}  # drop the field
    cases_set.update(
        selector,
        removal,
        upsert=False,  # never insert when no document matches
        multi=True,  # apply to every matching case, not just the first
    )
Пример #22
0
def fine_rule_param():
    """Grid-search ``minsup`` and ``minconf`` for association-rule mining.

    For every combination of ``minsup`` in 5, 10, ..., 30 and ``minconf``
    in 0.7, 0.8, 0.9 the ``rules`` collection is dropped, rules are mined
    again with ``trainRules`` and evaluated with ``evaluateRules``.

    The confidence grid uses ``np.linspace`` because
    ``np.arange(0.7, 1, 0.1)`` yields a fourth value at or just above 1.0
    through floating-point rounding. That added a pointless final run per
    ``minsup`` and left the ``rules`` collection trained with that
    threshold.

    :return: None
    """
    from flow.rules import trainRules
    import numpy as np

    __load_logger()
    db = dbutil.get_mongodb_conn()
    rules_set = db.rules

    minconf_grid = np.linspace(0.7, 0.9, num=3)  # 0.7, 0.8, 0.9
    for minsup in range(5, 31, 5):
        for minconf in minconf_grid:
            logger.info(
                "#############rules: minsup=%d, minconf=%f ###########" %
                (minsup, minconf))
            # Start from an empty collection so runs do not mix rules.
            rules_set.drop()
            trainRules(minsup, minconf)
            evaluateRules(text_model="cnn")
Пример #23
0
def evaluateRules(text_model="cnn"):
    """Evaluate rule-based expansion of ``text_model`` results on flag-4 cases.

    For each flag-4 case, the list stored under ``text_model`` is expanded
    by ``runRules``. Precision and recall against ``ftids`` are stored on
    the case (``rules``, ``rulesPrecise``, ``rulesRecall``) and accumulated
    for the overall report.

    :param text_model: name of the case field holding the statutes to expand
    :return: None
    """
    from flow.rules import runRules

    precise_total = 0
    recall_total = 0
    evaluated = 0

    cases_set = dbutil.get_mongodb_conn().cases
    wanted_fields = {"_id": 1, "ftids": 1, text_model: 1}
    test_cases = cases_set.find(
        {"flag": 4}, wanted_fields, no_cursor_timeout=True).batch_size(20)

    for case in test_cases:
        # 1: expand the text-model result with the mined rules
        expanded = runRules(case[text_model])

        # 2: per-case precision and recall
        case_precise, case_recall = __get_precise_and_recall(
            expanded, case["ftids"])

        # 3: accumulate
        precise_total += case_precise
        recall_total += case_recall
        evaluated += 1

        # 4: store the per-case result
        per_case_fields = {
            "rules": expanded,
            "rulesPrecise": case_precise,
            "rulesRecall": case_recall,
        }
        cases_set.update(
            {"_id": case["_id"]},  # match exactly this case
            {"$set": per_case_fields},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )

    # Overall precision and recall
    __print_total_precise_and_recall(
        precise_total, recall_total, evaluated, "rules")
Пример #24
0
def case_ref_num(flag):
    """Print max, min, average and count of statutes cited per case.

    Only cases with the given ``flag`` are considered; the number of cited
    statutes is the length of ``ftids``. When no case matches, zeros are
    printed instead of dividing by zero, and the minimum is tracked without
    an upper sentinel so counts above 1000 are reported correctly.

    :param flag: ``flag`` value of the cases to inspect
    :return: None
    """
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases

    max_num = 0
    min_num = None  # unknown until the first case is seen
    ref_sum = 0
    case_num = 0
    for line in cases_set.find({"flag": flag}, {"ftids": 1},
                               no_cursor_timeout=True).batch_size(10):
        ref_num = len(line["ftids"])
        ref_sum += ref_num
        case_num += 1
        if ref_num > max_num:
            max_num = ref_num
        if min_num is None or ref_num < min_num:
            min_num = ref_num

    if case_num == 0:
        print("max:%d, min:%d, avg:%f, num:%d" % (0, 0, 0.0, 0))
        return

    print("max:%d, min:%d, avg:%f, num:%d" %
          (max_num, min_num, (ref_sum / case_num), case_num))
Пример #25
0
def runCandiStatutes(simCases):
    """Collect candidate statutes from similar cases, ranked by score.

    Each statute cited by a similar case (its ``ftids``) receives that
    case's similarity score; scores of a statute cited by several cases
    are summed.

    :param simCases: iterable of ``(case id, similarity score)`` pairs
    :return: list of ``(ftid, score)`` tuples, highest score first
    """
    cases_set = dbutil.get_mongodb_conn().cases
    score_by_statute = {}
    for case_id, case_score in simCases:
        cited = cases_set.find_one({"_id": case_id}, {"ftids": 1})["ftids"]
        for statute_id in cited:
            previous = score_by_statute.get(statute_id, 0)
            score_by_statute[statute_id] = previous + case_score

    # Rank the candidates by accumulated score, descending
    ranked = list(score_by_statute.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked
Пример #26
0
def genSim(flag=4, type="lda", sim_case_num=30):
    """Generate the candidate statute set of each case via similar cases.

    Processes cases with the given ``flag`` that do not yet have a ``type``
    field and stores at most 500 candidate statute ids under that field.

    :param flag: 2 for training, 4 for test
    :param type: "lda", "svd" or "tfidf"
    :param sim_case_num: number of similar cases to retrieve
    :return: None
    """
    from flow.similarCases import runSimilarCases, runCandiStatutes

    cases_set = dbutil.get_mongodb_conn().cases
    todo_filter = {"flag": flag, type: {"$exists": False}}
    wanted_fields = {"_id": 1, "ygscWords2": 1}
    todo_cases = cases_set.find(
        todo_filter, wanted_fields, no_cursor_timeout=True).batch_size(20)

    for case in todo_cases:
        # 1: similar cases for this case's words
        similar = runSimilarCases([case["ygscWords2"]], type, sim_case_num)

        # 2: candidate statutes derived from the similar cases
        ranked = runCandiStatutes(similar[0])

        # 3: keep the ids of at most 500 leading candidates
        candidate_ids = [pair[0] for pair in ranked[:500]]

        # 4: store
        cases_set.update(
            {"_id": case["_id"]},  # match exactly this case
            {"$set": {type: candidate_ids}},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )
Пример #27
0
def case_fenci_second_patch_test():
    """Move flag-10 cases with an acceptable word count to flag 4.

    A case qualifies when its ``ygscWords2`` string splits into more than 3
    and at most 80 space-separated words.

    :return: None
    """
    logger = myutil.getLogger("fenci_patch.log")
    cases_set = dbutil.get_mongodb_conn().cases

    set_aside = cases_set.find(
        {"flag": 10}, no_cursor_timeout=True).batch_size(10)
    for case in set_aside:
        logger.info(case["_id"])  # record the case currently processed
        word_total = len(case["ygscWords2"].split(" "))
        if not 3 < word_total <= 80:
            continue
        cases_set.update(
            {"_id": case["_id"]},  # match exactly this case
            {"$set": {"flag": 4}},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )
Пример #28
0
def trainRules(minsup, minconf):
    """Mine association rules from flag-2 cases and store them.

    The ``ftids`` lists of all flag-2 cases are passed to
    ``fptree.generateRules``; the result is reduced by ``__prunedRules``
    and each remaining rule is inserted into the ``rules`` collection as
    ``{"from": [...], "to": [...]}``.

    :param minsup: minimum support forwarded to ``fptree.generateRules``
    :param minconf: minimum confidence forwarded to ``fptree.generateRules``
    :return: None
    """
    # Connection and collections
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    rules_set = db.rules

    # Mine the rules from the statute lists of the training cases
    transactions = []
    for case in cases_set.find({"flag": 2}, {"ftids": 1},
                               no_cursor_timeout=True):
        transactions.append(case["ftids"])
    mined = fptree.generateRules(transactions, minsup, minconf)

    # Simplify the rule set
    mined = __prunedRules(mined)

    # Store the rules
    for rule in mined:
        antecedent, consequent = rule[0], rule[1]
        rules_set.insert({"from": list(antecedent), "to": list(consequent)})
Пример #29
0
def sampling_test(total_num=1000):
    """Move candidate cases (flag 14) to flag 4 until ``total_num`` are taken.

    A case qualifies when its ``ygscWords2`` string splits into more than 10
    and fewer than 30 space-separated words and every statute in its
    ``ftids`` exists with a ``sampleTrainCount`` field.

    The cursor is opened with ``no_cursor_timeout=True`` and the loop can
    stop before the cursor is exhausted, so the cursor is closed explicitly
    rather than left to garbage collection.

    :param total_num: number of cases to sample before stopping
    :return: None
    """
    logger = myutil.getLogger("sample_test.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes

    num = 0
    cursor = cases_set.find(
        {"flag": 14}, no_cursor_timeout=True).batch_size(10)
    try:
        for line in cursor:
            logger.info(line["_id"])  # record the case currently processed
            ygsc_words_2 = line["ygscWords2"].split(" ")

            if 10 < len(ygsc_words_2) < 30:
                # all() stops at the first statute that is not found, so no
                # further lookups are issued once the case is rejected.
                ftlegal = all(
                    statutes_set.find_one({
                        "_id": ftid,
                        "sampleTrainCount": {
                            "$exists": True
                        }
                    }) is not None
                    for ftid in line["ftids"])
                if ftlegal:
                    num += 1
                    cases_set.update(
                        {"_id": line["_id"]},  # match exactly this case
                        {'$set': {
                            "flag": 4
                        }},
                        upsert=False,  # never insert when no document matches
                        multi=False,  # update at most one document
                    )

            if num == total_num:
                break
    finally:
        # Release the server-side cursor even on early exit or error.
        cursor.close()
Пример #30
0
def statutes_fenci_second():
    """Second-pass word filtering for statutes that lack ``contentWords2``.

    The words of ``contentWords`` are kept when they exist in the ``words``
    collection, pass ``__not_stopwords`` and do not repeat any of the last
    five kept words. A statute left with more than 80 words gets flag 0.
    The filtered words and the flag are saved.

    :return: None
    """
    db = dbutil.get_mongodb_conn()
    statutes_set = db.statutes  # created on first use if absent
    words_set = db.words

    for statute in statutes_set.find({"contentWords2": {"$exists": False}}):
        flag = statute["flag"]
        kept_words = []

        for word in statute["contentWords"].split(" "):
            # 1.1: drop words unknown to the words collection or rejected
            #      by __not_stopwords
            word_db = words_set.find_one({"_id": word})
            if word_db is None or not __not_stopwords(word_db):
                continue
            # 1.2: skip a word already among the last five kept words
            if word not in kept_words[-5:]:
                kept_words.append(word)

        # Disable statutes that are still too long
        if len(kept_words) > 80:
            flag = 0

        filtered_fields = {
            "contentWords2": " ".join(kept_words),
            "flag": flag,
        }
        statutes_set.update(
            {"_id": statute["_id"]},  # match exactly this statute
            {"$set": filtered_fields},
            upsert=False,  # never insert when no document matches
            multi=False,  # update at most one document
        )