def epoch_step(self, logs=None):
    """Record one epoch's metric values, persist the history, and plot curves.

    Parameters
    ----------
    logs : dict or None
        Mapping of metric name -> value for this epoch (e.g. ``loss``,
        ``valid_loss``). ``None`` means no new metrics.

    Side effects: appends to ``self.H``, writes ``self.json_path`` (if set),
    creates ``self.paths`` on the first epoch, and saves one figure per base
    metric from the second epoch on.
    """
    # Bug fix: the original used a mutable default `logs={}`, which is shared
    # across calls and can leak state between invocations.
    if logs is None:
        logs = {}
    for k, v in logs.items():
        history = self.H.get(k, [])
        # Bug fix: the original tested `isinstance(v, np.float)`; `np.float`
        # was an alias for the builtin `float` and was removed in NumPy 1.24,
        # so that line crashes on modern NumPy. Non-builtin floats (e.g.
        # np.float32) are not JSON-serializable, hence the conversion.
        if not isinstance(v, float):
            v = round(float(v), 4)
        history.append(v)
        self.H[k] = history
    # Persist the accumulated history when a JSON path is configured.
    if self.json_path is not None:
        piop.write_json(self.json_path, self.H)
    # First epoch: build the per-metric output figure paths once.
    if len(self.H["loss"]) == 1:
        self.paths = {
            key: self.file_dir / (self.arch + f'_{key.upper()}')
            for key in self.H.keys()
        }
    # From the second epoch on, plot train/valid (and optional test) curves.
    if len(self.H["loss"]) > 1:
        # Base metrics carry no underscore; `valid_x`/`test_x` pair with `x`,
        # so the pairs must appear together in self.H.
        keys = [key for key in self.H if '_' not in key]
        for key in keys:
            epochs = np.arange(0, len(self.H[key]))
            plt.style.use("ggplot")
            plt.figure()
            plt.plot(epochs, self.H[key], label=f"train_{key}")
            plt.plot(epochs, self.H[f"valid_{key}"], label=f"valid_{key}")
            if self.add_test:
                plt.plot(epochs, self.H[f"test_{key}"], label=f"test_{key}")
            plt.legend()
            plt.xlabel("Epoch #")
            plt.ylabel(key)
            plt.title(f"Training {key} [Epoch {len(self.H[key])}]")
            plt.savefig(str(self.paths[key]))
            plt.close()
# NOTE(review): this span is the interior of a larger aggregation routine —
# `file`, `post`, `cate`, `tmp`, `res`, the `*_list` accumulators and the
# helpers (get_common, filter_require, get_need_item, get_dtime_item, piop,
# IGNORE, NEEDPOS) are all defined outside this chunk; confirm against the
# enclosing loop before relying on these comments.

# Read one segmented/POS-tagged category file and accumulate its fields.
fname = os.path.join(segpos_path, file)
cate_data = piop.read_json(fname)
tmp_duty_list.extend(cate_data['duty'])
tmp_require_list.extend(cate_data['require'])
tmp_dtimes_list.extend(cate_data['dtimes'])

# Per-post aggregation: common duty terms first, then requirement terms
# filtered against the duty terms via filter_require.
tmp_duty = get_common(tmp_duty_list, IGNORE, NEEDPOS)
tmp_require = filter_require(
    tmp_duty, get_common(tmp_require_list, IGNORE, NEEDPOS))
tmp[post]['duty'] = get_need_item(tmp_duty)
tmp[post]['require'] = get_need_item(tmp_require)
tmp[post]['demand'] = get_dtime_item(tmp_dtimes_list)

# Roll the per-post lists up into the category-level accumulators.
duty_list.extend(tmp_duty_list)
require_list.extend(tmp_require_list)
dtimes_list.extend(tmp_dtimes_list)

# Category-level aggregation mirrors the per-post logic above.
duty = get_common(duty_list, IGNORE, NEEDPOS)
require = filter_require(duty, get_common(require_list, IGNORE, NEEDPOS))
res[cate]['duty'] = get_need_item(duty)
res[cate]['require'] = get_need_item(require)
res[cate]['demand'] = get_dtime_item(dtimes_list)
res[cate]['posts'] = tmp

# Persist the full model; despite the .txt suffix the payload is JSON.
piop.write_json(os.path.join(model_path, "model.txt"), res,
                indent=4, ensure_ascii=False)
# NOTE(review): this `return` closes a function whose `def` lies before this
# chunk; `res` is the result assembled above it.
return res


def flat_all_cates():
    """Flatten the nested cate -> sub-cate -> post mappings into 4-tuples.

    Returns a list of ``(cate, scate, gw, title)`` tuples: the ``;``-separated
    post names in ``cate_gw`` are paired index-by-index with the
    ``;``-separated titles in ``cate_url``.
    """
    all_items = []
    for cate, sub_cate in cate_gw.items():
        for scate, gangwei in sub_cate.items():
            gangwei_list = gangwei.split(";")
            gangwei_title_list = cate_url[cate][scate].split(";")
            for i, gw in enumerate(gangwei_list):
                # Assumes both `;` lists have equal length — TODO confirm.
                all_items.append((cate, scate, gw, gangwei_title_list[i]))
    return all_items


if __name__ == '__main__':
    all_items = flat_all_cates()
    for item in all_items:
        url_dict = {}
        # NOTE(review): item is (cate, scate, gw, title), so the local names
        # `gangwei`/`zw` actually hold the sub-category and post respectively.
        cate = item[0]
        gangwei = item[1]
        zw = item[2]
        title = item[3]
        urlist = get_urlist(title)
        # Fewer than 30 full pages (presumably 15 URLs per page): log it.
        # NOTE(review): grouping inferred from the collapsed source — only the
        # print appears to belong to this branch; verify against the original.
        if len(urlist) < 15*30:
            print(cate, "\t", gangwei, "\t", zw, "\t", len(urlist))
        url_dict['position'] = cate + "_" + gangwei + "_" + zw + "_" + title
        url_dict['urlist'] = urlist
        out_fpath = os.path.join(urlist_path, url_dict['position'] + ".txt")
        piop.write_json(out_fpath, url_dict, indent=4, ensure_ascii=False)
def test_write_json():
    """Smoke-test write_json with a small mixed ASCII/CJK payload."""
    out_path = os.path.join(DATA_PATH, 'outjson.json')
    payload = {
        "outjson1": "this is line 1.",
        "outjson2": "这是第二行。",
    }
    write_json(out_path, payload, indent=4, ensure_ascii=False)
def segpos(text: str) -> list:
    """Tokenize *text* with the Baidu lexer and return (word, pos) pairs.

    Returns
    -------
    list of (str, str)
        One ``(word, pos)`` tuple per lexer item; empty on API failure.
    """
    res = []
    try:
        bd_resp = client.lexer(text)
    except Exception as e:
        # Bug fix: the original only printed and fell through to the loop
        # below, crashing with NameError because bd_resp was never bound.
        # Keep the best-effort behavior by returning the empty result.
        print("BaiDu Error:", e)
        return res
    for item in bd_resp['items']:
        # Each item carries the surface form under 'item' and its POS tag.
        res.append((item['item'], item['pos']))
    return res


if __name__ == '__main__':
    for file in os.listdir(extract_path):
        # Skip hidden files such as .DS_Store.
        if file.startswith('.'):
            continue
        fname = os.path.join(extract_path, file)
        outname = os.path.join(segpos_path, file)
        # Resume support: skip inputs whose output already exists.
        if os.path.exists(outname):
            continue
        cate, fduty, frequire, fdtimes = get_cate_res(fname)
        data = {
            'cate': cate,
            'duty': fduty,
            'require': frequire,
            'dtimes': fdtimes,
        }
        piop.write_json(outname, data, indent=4, ensure_ascii=False)