def statute2id():
    """Convert each eligible statute's content words into an id sequence.

    Eligible statutes are active (flag != 0), cited by at least 10 training
    cases, and not yet converted; the result is stored as ``contentid``.
    """
    from flow.wordvector import seq2id

    statutes = dbutil.get_mongodb_conn().statutes
    query = {
        "trainCount": {"$gte": 10},
        "flag": {"$ne": 0},
        "contentid": {"$exists": False},
    }
    for doc in statutes.find(query):
        statutes.update(
            {"_id": doc["_id"]},
            {'$set': {"contentid": seq2id(doc["contentWords2"])}},
            upsert=False,  # never insert on a missing match
            multi=False,   # touch only the matched document
        )
def case2id():
    """Convert each eligible case's claim words into an id sequence.

    Eligible cases have 0 != flag < 5 and no ``ygscid`` yet; the converted
    id sequence is stored as ``ygscid``.
    """
    from flow.wordvector import seq2id

    cases = dbutil.get_mongodb_conn().cases
    query = {
        "flag": {"$ne": 0, "$lt": 5},
        "ygscid": {"$exists": False},
    }
    cursor = cases.find(query, no_cursor_timeout=True).batch_size(10)
    for doc in cursor:
        cases.update(
            {"_id": doc["_id"]},
            {'$set': {"ygscid": seq2id(doc["ygscWords2"])}},
            upsert=False,  # never insert on a missing match
            multi=False,   # touch only the matched document
        )
def sampling_train(total_num=10000):
    # Sample up to ``total_num`` cases from the pool (flag 12) into the
    # training set (flag 2). A case qualifies when its cleaned claim word
    # count lies in (10, 30); for each sampled case, sampleTrainCount is
    # incremented on every statute it cites.
    logger = myutil.getLogger("sample.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    num = 0
    for line in cases_set.find({
            "flag": 12
    }, no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        ygsc_words_2 = line["ygscWords2"].split(" ")
        if 10 < len(ygsc_words_2) < 30:
            num += 1
            cases_set.update(
                {"_id": line["_id"]},  # filter
                {'$set': {
                    "flag": 2
                }},  # move case into the training set
                upsert=False,  # do not insert when nothing matches
                multi=False,  # update only the first matched document
            )
            for ftid in line["ftids"]:
                statutes_set.update(
                    {"_id": ftid},
                    {'$inc': {
                        "sampleTrainCount": 1
                    }},
                    upsert=False,  # do not insert when nothing matches
                    multi=False,  # update only the first matched document
                )
            if num == total_num:
                break
def case_fenci_second_patch():
    # Re-admit previously rejected cases (flag 10) that carry the "patch"
    # marker: when the cleaned claim word count is in (3, 80], move the
    # case back into the training set (flag 2) and bump trainCount on
    # every statute it cites.
    logger = myutil.getLogger("fenci_patch.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    for line in cases_set.find({
            "flag": 10,
            "patch": {
                "$exists": True
            }
    }, no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        ygsc_words_2 = line["ygscWords2"].split(" ")
        if 3 < len(ygsc_words_2) <= 80:
            cases_set.update(
                {"_id": line["_id"]},  # filter
                {'$set': {
                    "flag": 2
                }},  # move case into the training set
                upsert=False,  # do not insert when nothing matches
                multi=False,  # update only the first matched document
            )
            for ftid in line["ftids"]:
                statutes_set.update(
                    {"_id": ftid},
                    {'$inc': {
                        "trainCount": 1
                    }},
                    upsert=False,  # do not insert when nothing matches
                    multi=False,  # update only the first matched document
                )
def runCnn(ygscid, candi_statutes, type="cnn"):
    # Score candidate statutes against one case's claim id sequence with the
    # loaded neural model; return (recommended statute ids, raw scores).
    # NOTE(review): relies on module-level ``model`` populated by
    # __loadModel — confirm it is a TF session wrapper as used below.
    __loadModel(type)
    db = dbutil.get_mongodb_conn()
    statutes_set = db.statutes
    # 1: build the model input
    s2 = None
    for ftid in candi_statutes:
        statute = statutes_set.find_one({"_id": ftid}, {"contentid": 1})
        s2 = myutil.append_and_pad_2d_array(s2, np.array([statute["contentid"]]))
    s1 = np.repeat([ygscid], s2.shape[0], axis=0)  # tile the claim to match s2
    label = np.repeat([[0]], s2.shape[0], axis=0)  # dummy labels, one per row
    # run the model with dropout disabled (keep prob 1.0)
    y = model.sess.run(
        [model.predict_op],
        feed_dict={
            model.input_s1: s1,
            model.input_s2: s2,
            model.input_y: label,
            model.dropout_keep_prob: 1.0
        })
    # threshold the scores at 0.5 and map indices back to statute ids
    pred = np.greater_equal(y, 0.5).astype(np.int32).reshape(-1)
    recom_index = np.where(pred == 1)[0]
    recom_statutes = [candi_statutes[i] for i in recom_index]
    return recom_statutes, np.asarray(y).reshape(-1)
def __transToData(xml_names):
    # Build paired NN inputs for a batch of cases: s1 repeats each case's
    # claim id sequence once per (positive + negative) statute, s2 holds the
    # matching statute content ids, and label marks positives (1) and
    # negatives (0).
    # NOTE(review): no return statement is visible in this chunk — presumably
    # the function continues or should return (s1, s2, label); confirm.
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    s1 = None
    s2 = None
    label = np.array([], dtype=np.int32)
    for xml_name in xml_names:
        case = cases_set.find_one({"_id": xml_name}, {
            "ygscid": 1,
            "ftids": 1,
            "negftids": 1
        })
        # 1: repeat the claim (positives + negatives) times
        ygsc_array = np.repeat([case["ygscid"]],
                               len(case["ftids"]) + len(case["negftids"]),
                               axis=0)
        s1 = myutil.append_and_pad_2d_array(s1, ygsc_array)
        label = np.append(label,
                          [1] * len(case["ftids"]) + [0] * len(case["negftids"]))
        for ftid in case["ftids"] + case["negftids"]:
            statute = statutes_set.find_one({"_id": ftid}, {"contentid": 1})
            s2 = myutil.append_and_pad_2d_array(
                s2, np.array([statute["contentid"]]))
def trainDataPrepare(nn_type="cnn", recom_num=35, sim_type="lda"):
    # Build training data for the LR re-ranker: for every training case,
    # x = NN-derived features of its top candidate statutes, y = 0/1 vector
    # marking which candidates are actually cited. Both arrays are dumped to
    # checkpoint files.
    train_x = []
    train_y = []
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    for line in cases_set.find({"flag": 2, sim_type: {"$exists": True}},
                               {"_id": 1, "ftids": 1, sim_type: 1,
                                "ygscid": 1},
                               no_cursor_timeout=True).batch_size(20):
        # 1: candidate statutes = top recom_num of the similarity list
        statute_num = min(len(line[sim_type]), recom_num)
        candi_statute = line[sim_type][:statute_num]
        # 2: build LR input features
        lrinput = __get_lrinput(line["ygscid"], candi_statute, nn_type,
                                recom_num)
        train_x.append(lrinput)
        # 3: build LR targets: 1 where the candidate is actually cited
        lroutput = np.zeros(statute_num, dtype=np.int32)
        for i, ftid in enumerate(candi_statute):
            if ftid in line["ftids"]:
                lroutput[i] = 1
        train_y.append(lroutput)
    # 4: persist to checkpoint files
    train_x = np.concatenate(train_x, axis=0)
    train_y = np.concatenate(train_y, axis=0)
    with open("checkpoint/lr_trainx.pk", "wb") as file:
        joblib.dump(train_x, file)
    with open("checkpoint/lr_trainy.pk", "wb") as file:
        joblib.dump(train_y, file)
def prepareLabels(flag=2):
    '''
    Prepare the multi-label 0/1 vectors for the training or validation set.

    :param flag: 2 = training set (builds and saves the statute->index
                 mapping); any other value reuses the saved mapping and
                 rejects cases citing unseen statutes.
    :return:
    '''
    logger = myutil.getLogger("label.log")
    statute_dict = {}
    statute_index = 0
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    if flag == 2:
        statutes_set = db.statutes
        # label length = number of statutes seen during training sampling
        statute_num = statutes_set.count(
            {"sampleTrainCount": {
                "$exists": True
            }})
    else:
        with open("checkpoint/statute_dict.pk", "rb") as file:
            statute_dict = joblib.load(file)
        statute_num = len(statute_dict)
    for line in cases_set.find({
            "flag": flag
    }, {
            "ftids": 1
    }, no_cursor_timeout=True).batch_size(20):
        logger.info(line["_id"])
        label = [0 for i in range(statute_num)]
        legal = True
        for ftid in line["ftids"]:
            if ftid in statute_dict:
                label[statute_dict[ftid]] = 1  # known statute: set its slot
            else:
                if flag == 2:
                    statute_dict[ftid] = statute_index  # register new statute
                    label[statute_index] = 1  # mark its (new) slot
                    statute_index += 1  # advance the index counter
                else:
                    logger.error("出现不在训练集的法条:%s" % line["_id"])
                    legal = False
                    break
        if legal:
            cases_set.update(
                {"_id": line["_id"]},  # filter
                {'$set': {
                    "label": label
                }},  # store the label vector
                upsert=False,  # do not insert when nothing matches
                multi=False,  # update only the first matched document
            )
    # persist the statute->index mapping for later validation/test runs
    if flag == 2:
        with open("checkpoint/statute_dict.pk", "wb") as file:
            joblib.dump(statute_dict, file)
def case_fenci_second():
    # Second-pass filtering of each case's segmented claim (ygscWords):
    # drop stopwords/low-frequency words and words repeated within the last
    # 5 kept words, storing the result as ygscWords2. Cases whose cleaned
    # length is outside [3, 80] are rejected (flag 10); cases staying in the
    # training set (flag 2) bump trainCount on every statute they cite.
    logger = myutil.getLogger("fenci.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    words_set = db.words
    for line in cases_set.find(
        {
            "flag": {
                "$ne": 0
            },
            "ygscWords2": {
                "$exists": False
            }
        },
            no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        flag = line["flag"]
        ygsc_words = line["ygscWords"].split(" ")
        ygsc_words_2 = []
        # 1: word filtering
        for word in ygsc_words:
            # 1.1: keep only words in the words table that are not stopwords
            #      or low-frequency (for non-training sets this also drops
            #      words never seen in training)
            word_db = words_set.find_one({"_id": word})
            if word_db is not None and __not_stopwords(word_db):
                # 1.2: skip if the word repeats within the last 5 kept words
                found = False
                end = len(ygsc_words_2)
                start = max(0, end - 5)
                for i in range(start, end):
                    if ygsc_words_2[i] == word:
                        found = True
                        break
                if not found:
                    ygsc_words_2.append(word)
        # 2: reject texts that end up too short or too long (short ones work
        #    well; long ones tend to contain person/place names)
        if len(ygsc_words_2) < 3 or len(ygsc_words_2) > 80:
            flag = 10
        elif flag == 2:  # training set: count statute citations
            for ftid in line["ftids"]:
                statutes_set.update(
                    {"_id": ftid},
                    {'$inc': {
                        "trainCount": 1
                    }},
                    upsert=False,  # do not insert when nothing matches
                    multi=False,  # update only the first matched document
                )
        cases_set.update(
            {"_id": line["_id"]},  # filter
            {'$set': {
                "ygscWords2": " ".join(ygsc_words_2),
                "flag": flag
            }},  # store cleaned words and possibly the new flag
            upsert=False,  # do not insert when nothing matches
            multi=False,  # update only the first matched document
        )
def evaluateLR(model="cnn", recom_num=35, sim_type="lda"):
    '''
    Evaluate the LR re-ranker (built on the ``model`` text model) over the
    test set (flag 4); prints average precision and recall.
    :return:
    '''
    from flow.lr import runLR
    # column names that would be used when storing per-case results
    model_precise_name = "LR" + model + "Precise"
    model_recall_name = "LR" + model + "Recall"
    # running sums
    precise_sum = 0
    recall_sum = 0
    case_num = 0
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    for line in cases_set.find({
            "flag": 4,
            sim_type: {
                "$exists": True
            }
    }, {
            "_id": 1,
            "ftids": 1,
            sim_type: 1,
            "ygscid": 1
    }, no_cursor_timeout=True).batch_size(20):
        # 1: candidate statutes = top recom_num of the similarity list
        statute_num = min(len(line[sim_type]), recom_num)
        candi_statute = line[sim_type][:statute_num]
        # 2: run the LR model
        recom_statute = runLR(line["ygscid"], candi_statute, model, recom_num)
        # 3: per-case precision / recall
        case_precise, case_recall = __get_precise_and_recall(
            recom_statute, line["ftids"])
        # 4: accumulate
        precise_sum += case_precise
        recall_sum += case_recall
        case_num += 1
        # # 5: (deliberately disabled) store per-case results in the db
        # cases_set.update(
        #     {"_id": line["_id"]},  # filter
        #     {'$set': {model: recom_statute,
        #               model_precise_name: case_precise,
        #               model_recall_name: case_recall
        #               }},  # update
        #     upsert=False,
        #     multi=False,
        # )
    # overall precision / recall
    __print_total_precise_and_recall(precise_sum, recall_sum, case_num, model)
def lda_recom_num():
    """Print a histogram of LDA candidate-list lengths over test cases.

    Lengths are bucketed by tens (bucket = length // 10).
    """
    cases = dbutil.get_mongodb_conn().cases
    histogram = {}
    cursor = cases.find({"flag": 4}, {"lda": 1},
                        no_cursor_timeout=True).batch_size(10)
    for doc in cursor:
        bucket = len(doc["lda"]) // 10
        histogram[bucket] = histogram.get(bucket, 0) + 1
    print(histogram)
def statute_refed_num(statute_num=1405):
    """Print max/min/avg citation counts over statutes actually cited.

    Sums the per-case 0/1 label vectors of all test cases (flag 4) and
    reports statistics over the non-zero entries.

    :param statute_num: length of the stored label vectors (defaults to
        1405, the historical statute count; previously hard-coded).
    """
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_vec = numpy.zeros(statute_num)
    for line in cases_set.find({"flag": 4}, {"label": 1},
                               no_cursor_timeout=True).batch_size(10):
        # element-wise accumulate this case's 0/1 label vector
        label = line["label"]
        statutes_vec += label
    # keep only statutes cited at least once
    statutes_vec = statutes_vec[numpy.nonzero(statutes_vec)]
    print("max:%f, min:%f, avg:%f, num:%d" %
          (numpy.max(statutes_vec), numpy.min(statutes_vec),
           numpy.average(statutes_vec), statutes_vec.shape[0]))
def runRules(oriStatutes):
    """Expand a statute list with association-rule consequents.

    A rule fires only when its entire antecedent ("from") is contained in
    the input list; the union of the input and all fired consequents ("to")
    is returned as a list.
    """
    rules_col = dbutil.get_mongodb_conn().rules
    base = set(oriStatutes)
    expanded = set(oriStatutes)
    # candidate rules: any rule whose antecedent overlaps the input
    for rule in rules_col.find({"from": {"$in": oriStatutes}}):
        if base >= set(rule["from"]):  # full antecedent present -> fire
            expanded |= set(rule["to"])
    return list(expanded)
def statutes_fenci():
    """Segment every statute's content text and store it as contentWords."""
    statutes = dbutil.get_mongodb_conn().statutes  # auto-created collection
    for doc in statutes.find():
        segmented = " ".join(fenci(doc["content"]))
        statutes.update(
            {"_id": doc["_id"]},
            {'$set': {"contentWords": segmented}},
            upsert=False,  # never insert on a missing match
            multi=False,   # touch only the matched document
        )
def evaluateSim(recom_num, type="lda"):
    # Evaluate the raw similarity recommendation (top ``recom_num`` ids of
    # the precomputed ``type`` list) — run on the test set only. Per-case
    # precision/recall are stored back on the case; averages are printed.
    # column names used when writing results back to the db
    model_precise_name = type + str(recom_num) + "Precise"
    model_recall_name = type + str(recom_num) + "Recall"
    # running sums
    precise_sum = 0
    recall_sum = 0
    case_num = 0
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    for line in cases_set.find({
            "flag": 4,
            type: {
                "$exists": True
            }
    }, {
            "_id": 1,
            "ftids": 1,
            type: 1
    }, no_cursor_timeout=True).batch_size(20):
        # 1: take the top recom_num recommendations
        statute_num = min(len(line[type]), recom_num)
        recom_statute = line[type][:statute_num]
        # 2: per-case precision / recall
        case_precise, case_recall = __get_precise_and_recall(
            recom_statute, line["ftids"])
        # 3: accumulate
        precise_sum += case_precise
        recall_sum += case_recall
        case_num += 1
        # 4: store per-case results
        cases_set.update(
            {"_id": line["_id"]},  # filter
            {
                '$set': {
                    model_precise_name: case_precise,
                    model_recall_name: case_recall
                }
            },  # update
            upsert=False,  # do not insert when nothing matches
            multi=False,  # update only the first matched document
        )
    # overall precision / recall
    __print_total_precise_and_recall(precise_sum, recall_sum, case_num,
                                     type + str(recom_num))
def print_stopwords():
    """Print, space-separated, the words frequent enough to act as stopwords.

    A word qualifies when it appears in >= 170 case categories and has a
    total count of >= 50000.
    """
    words = dbutil.get_mongodb_conn().words
    query = {
        "aynum": {"$gte": 170},
        "totalCount": {"$gte": 50000},
    }
    for doc in words.find(query, no_cursor_timeout=True).batch_size(10):
        print(doc["_id"], end=" ")
def clean_by_statutes():
    # Reject cases that cite any statute outside the accepted statute set
    # (flag -> 9). For surviving test (flag 3) / validation (flag 4) cases,
    # increment the per-statute testCount / validCount, then stamp every
    # processed case as cleaned (clean: 0).
    logger = myutil.getLogger("clean.log")
    statutes_list = get_statutes_set()
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    for line in cases_set.find(
        {
            "flag": {
                "$ne": 0,
                "$lt": 5
            },
            "clean": {
                "$exists": False
            }
        },
            no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        flag = line["flag"]
        for ftid in line["ftids"]:
            if ftid not in statutes_list:
                flag = 9
                break
        if flag == 3 or flag == 4:  # test / validation set statistics
            if flag == 3:  # test set
                col = "testCount"
            else:
                col = "validCount"
            for ftid in line["ftids"]:
                statutes_set.update(
                    {"_id": ftid},
                    {'$inc': {
                        col: 1
                    }},
                    upsert=False,  # do not insert when nothing matches
                    multi=False,  # update only the first matched document
                )
        cases_set.update(
            {"_id": line["_id"]},  # filter
            {'$set': {
                "flag": flag,
                "clean": 0
            }},  # store the (possibly new) flag and the cleaned marker
            upsert=False,  # do not insert when nothing matches
            multi=False,  # update only the first matched document
        )
def get_statutes_set():
    """Return the set of ids of active statutes cited by >= 10 training cases."""
    statutes = dbutil.get_mongodb_conn().statutes
    query = {
        "trainCount": {"$gte": 10},
        "flag": {"$ne": 0},
    }
    return {doc["_id"] for doc in statutes.find(query, {"_id": 1})}
def evaluateMulti(multi_model="svm"):
    # Evaluate a multi-label classifier over the test set (flag 4):
    # predictions are compared against the stored 0/1 label vectors and
    # average precision/recall are printed.
    from flow.multiLabel import runMulti
    import numpy as np
    precise_sum = 0
    recall_sum = 0
    case_num = 0
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    for line in cases_set.find({
            "flag": 4
    }, {
            "_id": 1,
            "ygscWords2": 1,
            "label": 1
    }, no_cursor_timeout=True).batch_size(20):
        recom_statute = runMulti([line["ygscWords2"]], multi_model)[0]
        # 2: precision / recall (0.001 floor avoids division by zero when
        #    the model predicts nothing)
        predict_sum = np.maximum(
            np.sum(recom_statute).astype(np.float32), 0.001)
        score_sum = np.sum(line["label"]).astype(np.float32)
        predict_right = np.sum(np.logical_and(
            recom_statute, line["label"])).astype(np.float32)
        case_precise = np.mean(predict_right / predict_sum)
        case_recall = np.mean(predict_right / score_sum)
        # 3: accumulate
        precise_sum += case_precise
        recall_sum += case_recall
        case_num += 1
        # 4: (deliberately disabled) store per-case results in the db
        # cases_set.update(
        #     {"_id": line["_id"]},  # filter
        #     {'$set': {multi_model: recom_statute,
        #               multi_model + "Precise": case_precise,
        #               multi_model + "Recall": case_recall
        #               }},  # update
        #     upsert=False,
        #     multi=False,
        # )
    # overall precision / recall
    __print_total_precise_and_recall(precise_sum, recall_sum, case_num,
                                     multi_model)
def compute_words_aynum():
    """Compute and store ``aynum`` for every word.

    ``aynum`` is the number of case categories (entries of the word's
    ``ayCount`` map) in which the word occurs at least 5 times.
    """
    db = dbutil.get_mongodb_conn()
    words_set = db.words
    for line in words_set.find():
        # Only the per-category counts matter; the category codes (the keys)
        # were previously unpacked but never used.
        aynum = sum(1 for ay_count in line["ayCount"].values()
                    if ay_count >= 5)
        words_set.update(
            {"_id": line["_id"]},
            {'$set': {
                "aynum": aynum
            }},
            upsert=False,  # do not insert when nothing matches
            multi=False,  # update only the first matched document
        )
def clearSim(type="lda"):
    '''
    Remove the similarity-recommendation field named ``type`` from every
    test case (flag 4).
    :param type:
    :return:
    '''
    cases = dbutil.get_mongodb_conn().cases
    cases.update(
        {"flag": 4},
        {'$unset': {type: 1}},
        upsert=False,  # never insert on a missing match
        multi=True,    # clear the field on every matching case
    )
def fine_rule_param():
    """Grid-search the association-rule mining parameters.

    For each (minsup, minconf) combination the rules collection is dropped,
    rules are re-mined, and the result is evaluated with the CNN text model.
    """
    from flow.rules import trainRules
    import numpy as np
    __load_logger()
    rules_col = dbutil.get_mongodb_conn().rules
    for minsup in range(5, 31, 5):
        for minconf in np.arange(0.7, 1, 0.1):
            logger.info(
                "#############rules: minsup=%d, minconf=%f ###########" %
                (minsup, minconf))
            rules_col.drop()  # start each round from an empty rule table
            trainRules(minsup, minconf)
            evaluateRules(text_model="cnn")
def evaluateRules(text_model="cnn"):
    # Evaluate rule-based expansion of the text model's recommendations over
    # the test set (flag 4); per-case results are written back on the case
    # and overall averages printed.
    from flow.rules import runRules
    precise_sum = 0
    recall_sum = 0
    case_num = 0
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    for line in cases_set.find({
            "flag": 4
    }, {
            "_id": 1,
            "ftids": 1,
            text_model: 1
    }, no_cursor_timeout=True).batch_size(20):
        recom_statute = runRules(line[text_model])
        # 2: per-case precision / recall
        case_precise, case_recall = __get_precise_and_recall(
            recom_statute, line["ftids"])
        # 3: accumulate
        precise_sum += case_precise
        recall_sum += case_recall
        case_num += 1
        # 4: store per-case results
        cases_set.update(
            {"_id": line["_id"]},  # filter
            {
                '$set': {
                    "rules": recom_statute,
                    "rulesPrecise": case_precise,
                    "rulesRecall": case_recall
                }
            },  # update
            upsert=False,  # do not insert when nothing matches
            multi=False,  # update only the first matched document
        )
    # overall precision / recall
    __print_total_precise_and_recall(precise_sum, recall_sum, case_num,
                                     "rules")
def case_ref_num(flag):
    """Print max/min/avg number of statutes cited by cases with ``flag``.

    Previously raised ZeroDivisionError when no case matched the flag; now
    reports that explicitly instead.
    """
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    max_num = 0
    min_num = 1000
    ref_sum = 0
    case_num = 0
    for line in cases_set.find({"flag": flag}, {"ftids": 1},
                               no_cursor_timeout=True).batch_size(10):
        ref_num = len(line["ftids"])
        ref_sum += ref_num
        case_num += 1
        if ref_num > max_num:
            max_num = ref_num
        if ref_num < min_num:
            min_num = ref_num
    if case_num == 0:
        # avoid ZeroDivisionError on an empty result set
        print("no cases found for flag:%s" % flag)
        return
    print("max:%d, min:%d, avg:%f, num:%d" %
          (max_num, min_num, (ref_sum / case_num), case_num))
def runCandiStatutes(simCases):
    '''
    Collect candidate statutes from a list of similar cases.
    :param simCases: list of (case_id, similarity_score) pairs
    :return: list of (ftid, accumulated_score), sorted by score descending
    '''
    cases = dbutil.get_mongodb_conn().cases
    scores = {}
    for sim_id, sim_score in simCases:
        neighbour = cases.find_one({"_id": sim_id}, {"ftids": 1})
        # each cited statute accumulates the neighbour's similarity score
        for ftid in neighbour["ftids"]:
            scores[ftid] = scores.get(ftid, 0) + sim_score
    # rank candidates by accumulated score, highest first
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)
def genSim(flag=4, type="lda", sim_case_num=30):
    '''
    Generate each case's candidate statute list via similar-case
    recommendation and store it under the field named by ``type``.
    :param flag: 2 = training set, 4 = test set
    :param type: "lda", "svd" or "tfidf"
    :param sim_case_num: number of similar cases to retrieve
    :return:
    '''
    from flow.similarCases import runSimilarCases, runCandiStatutes
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    for line in cases_set.find({
            "flag": flag,
            type: {
                "$exists": False
            }
    }, {
            "_id": 1,
            "ygscWords2": 1
    }, no_cursor_timeout=True).batch_size(20):
        # 1: retrieve similar cases for this claim text
        simCases = runSimilarCases([line["ygscWords2"]], type, sim_case_num)
        # 2: derive candidate statutes from the similar cases
        sorted_candi_statutes = runCandiStatutes(simCases[0])
        # 3: keep at most 500 statute ids
        statute_num = min(len(sorted_candi_statutes), 500)
        recom_statute = [
            ftid for (ftid, ft_score) in sorted_candi_statutes[:statute_num]
        ]
        # 4: store
        cases_set.update(
            {"_id": line["_id"]},  # filter
            {'$set': {
                type: recom_statute
            }},  # update
            upsert=False,  # do not insert when nothing matches
            multi=False,  # update only the first matched document
        )
def case_fenci_second_patch_test():
    """Re-admit rejected cases (flag 10) into the test set.

    A case qualifies when its cleaned claim word count is in (3, 80];
    qualifying cases get flag 4.
    """
    logger = myutil.getLogger("fenci_patch.log")
    cases = dbutil.get_mongodb_conn().cases
    cursor = cases.find({"flag": 10}, no_cursor_timeout=True).batch_size(10)
    for doc in cursor:
        logger.info(doc["_id"])  # log the current xml id
        word_count = len(doc["ygscWords2"].split(" "))
        if 3 < word_count <= 80:
            cases.update(
                {"_id": doc["_id"]},
                {'$set': {"flag": 4}},
                upsert=False,  # never insert on a missing match
                multi=False,   # touch only the matched document
            )
def trainRules(minsup, minconf):
    '''
    Mine association rules over the statute citations of training cases
    and persist them into the rules collection.
    :return:
    '''
    # connections
    db = dbutil.get_mongodb_conn()
    cases = db.cases
    rules_col = db.rules
    # gather each training case's statute list as one transaction
    transactions = []
    for case in cases.find({"flag": 2}, {"ftids": 1}, no_cursor_timeout=True):
        transactions.append(case["ftids"])
    mined = fptree.generateRules(transactions, minsup, minconf)
    # drop redundant rules
    mined = __prunedRules(mined)
    # persist
    for rule in mined:
        rules_col.insert({"from": list(rule[0]), "to": list(rule[1])})
def sampling_test(total_num=1000):
    # Sample up to ``total_num`` cases from the pool (flag 14) into the test
    # set (flag 4). A case qualifies when its cleaned claim word count is in
    # (10, 30) AND every statute it cites was seen during training sampling
    # (i.e. has sampleTrainCount).
    logger = myutil.getLogger("sample_test.log")
    db = dbutil.get_mongodb_conn()
    cases_set = db.cases
    statutes_set = db.statutes
    num = 0
    for line in cases_set.find({
            "flag": 14
    }, no_cursor_timeout=True).batch_size(10):
        logger.info(line["_id"])  # log the current xml id
        ygsc_words_2 = line["ygscWords2"].split(" ")
        if 10 < len(ygsc_words_2) < 30:
            ftlegal = True
            for ftid in line["ftids"]:
                statute_db = statutes_set.find_one({
                    "_id": ftid,
                    "sampleTrainCount": {
                        "$exists": True
                    }
                })
                if statute_db is None:  # cited statute unseen in training
                    ftlegal = False
                    break
            if ftlegal:
                num += 1
                cases_set.update(
                    {"_id": line["_id"]},  # filter
                    {'$set': {
                        "flag": 4
                    }},  # move case into the test set
                    upsert=False,  # do not insert when nothing matches
                    multi=False,  # update only the first matched document
                )
                if num == total_num:
                    break
def statutes_fenci_second():
    # Second-pass filtering of each statute's segmented content: drop
    # stopwords/low-frequency words and words repeated within the last 5
    # kept words, storing the result as contentWords2. Statutes whose
    # cleaned content exceeds 80 words are deactivated (flag 0).
    db = dbutil.get_mongodb_conn()
    statutes_set = db.statutes  # statutes collection (auto-created)
    words_set = db.words
    for line in statutes_set.find({"contentWords2": {"$exists": False}}):
        flag = line["flag"]
        content_words = line["contentWords"].split(" ")
        content_words_2 = []
        for word in content_words:
            # 1.1: keep only words in the words table that are not
            #      stopwords or low-frequency
            word_db = words_set.find_one({"_id": word})
            if word_db is not None and __not_stopwords(word_db):
                # 1.2: skip if the word repeats within the last 5 kept words
                found = False
                end = len(content_words_2)
                start = max(0, end - 5)
                for i in range(start, end):
                    if content_words_2[i] == word:
                        found = True
                        break
                if not found:
                    content_words_2.append(word)
        # deactivate statutes whose cleaned content is too long
        if len(content_words_2) > 80:
            flag = 0
        statutes_set.update(
            {"_id": line["_id"]},  # filter
            {
                '$set': {
                    "contentWords2": " ".join(content_words_2),
                    "flag": flag
                }
            },  # store cleaned words and possibly the new flag
            upsert=False,  # do not insert when nothing matches
            multi=False,  # update only the first matched document
        )