def reset(self, start_at):
    """Rewind the cached history to its first ``start_at`` entries.

    Reloads ``self.H`` from ``self.json_path`` (when that path is set and
    the file exists) and truncates every stored series to ``start_at``
    items. Does nothing when ``start_at`` is not positive or when no JSON
    path is configured.
    """
    if start_at <= 0:
        return
    if self.json_path is None:
        return
    if not self.json_path.exists():
        return
    self.H = piop.read_json(self.json_path)
    for key in self.H:
        self.H[key] = self.H[key][:start_at]
def batch_process(batch_file: str):
    """Load items from *batch_file* and ingest them into the graph.

    Runs in two passes: first a node is created and pushed for every
    item, then relation updates run for every item — so all relation
    targets already exist in the graph before any update touches them.
    """
    items = piop.read_json(batch_file)
    for entry in items:
        graph.push(create_node(entry))
    for entry in items:
        RelationUpdate(entry).update_all()
def check_repeat_url():
    """Report duplicate URLs and list/disk count mismatches.

    For every non-hidden file under ``urlist_path``: prints the file name
    with unique/raw counts when the recorded URL list contains
    duplicates, prints the file name with file/url counts when the
    number of non-hidden downloaded files differs from the URL count,
    and prints "wrong" when the file count differs even from the
    de-duplicated URL count.
    """
    for fname in os.listdir(urlist_path):
        if fname.startswith('.'):
            continue
        data = piop.read_json(os.path.join(urlist_path, fname))
        fpath = os.path.join(html_path, data['position'])
        urlist = data['urlist']
        unique_count = len(set(urlist))
        if unique_count != len(urlist):
            print(fname, unique_count, len(urlist))
        filenum = sum(1 for f in os.listdir(fpath) if not f.startswith('.'))
        urlnum = len(urlist)
        if filenum != urlnum:
            print(fname, filenum, urlnum)
        if filenum != unique_count:
            print("wrong")
def download():
    """Download every pending URL's page source into its HTML directory.

    For each non-hidden url-list file under ``urlist_path``, fetches
    every URL whose page is not already saved, writing the page source
    under ``html_path``/<position>. Sleeps 1-3 s between fetches to
    throttle requests. The browser is restarted every 10 pages to keep
    Chrome's memory in check, and is always shut down on exit.

    Fixes vs. the original:
    - ``i % 10 == 0`` restarted the browser right after the very first
      download (i == 0); now skipped for i == 0.
    - ``driver.close()`` only closes the window and can leave the
      chromedriver process alive; ``driver.quit()`` ends the session.
    - The driver is now always quit via try/finally, so no browser
      process leaks on normal exit or on an exception.
    """
    driver = webdriver.Chrome()
    try:
        for file in os.listdir(urlist_path):
            if file[0] == '.':
                continue
            data = piop.read_json(os.path.join(urlist_path, file))
            fpath = os.path.join(html_path, data['position'])
            piop.check_dir(fpath)
            urlist = data['urlist']
            for i, url in enumerate(urlist):
                fname = os.path.split(url)[-1]
                if os.path.exists(os.path.join(fpath, fname)):
                    continue  # already downloaded
                driver.get(url)
                write_html(os.path.join(fpath, fname), driver.page_source)
                time.sleep(random.randint(1, 3))
                # Restart the browser every 10 pages (but not on the first).
                if i > 0 and i % 10 == 0:
                    driver.quit()
                    driver = webdriver.Chrome()
    finally:
        driver.quit()
def check():
    """Cross-check downloaded HTML files against the recorded URL lists.

    Collects every URL (de-duplicated per url-list file) and every file
    name found under ``html_path``, prints raw/unique counts for both
    sides, and asserts the totals match.

    Fix vs. the original: a local ``fpath`` was built (with fragile
    string concatenation instead of ``os.path.join``) but never used —
    removed as dead code.
    """
    allist = []
    for file in os.listdir(urlist_path):
        if file[0] == '.':
            continue
        data = piop.read_json(os.path.join(urlist_path, file))
        # De-duplicate per list: each unique URL yields one saved file.
        allist.extend(set(data['urlist']))
    print(len(allist), len(set(allist)))
    allfiles = []
    for pname in os.listdir(html_path):
        if pname[0] == '.':
            continue
        # NOTE(review): hidden files inside the per-position directories
        # are intentionally NOT filtered here, matching the original.
        allfiles.extend(os.listdir(os.path.join(html_path, pname)))
    print(len(allfiles), len(set(allfiles)))
    assert len(allfiles) == len(allist)
    assert len(set(allfiles)) == len(set(allist))
def get_cate_res(cate_extract_file: str):
    """Extract category, duty, requirement and dtime data from one file.

    Parameters
    ----------
    cate_extract_file : str
        Path of a JSON extraction file readable by ``piop.read_json``;
        each item is expected to carry 'dtime', 'require', 'duty' and
        (optionally) 'category' keys.

    Returns
    -------
    tuple
        ``(cate, duty, require, dtimes)`` where ``cate`` is the category
        of the last successfully processed item ("" when none), ``duty``
        and ``require`` are flattened ``segpos`` results, and ``dtimes``
        collects every item's 'dtime' (even for skipped items).
    """
    cate_item = piop.read_json(cate_extract_file)
    dtimes, require, duty = [], [], []
    # Fix: initialize before the loop — previously, if the input was
    # empty or every item hit a `continue`, `cate` was unbound at the
    # `return` and raised UnboundLocalError.
    cate = ""
    for item in cate_item:
        dtimes.append(item['dtime'])
        # Skip items whose requirement text is too short to segment.
        if len(item['require']) < 10:
            continue
        try:
            item_require = segpos(item['require'])
            require.extend(item_require)
        except Exception as e:
            print("GET ITEM ERROR.", e)
            continue
        try:
            item_duty = segpos(item['duty'])
            duty.extend(item_duty)
        except Exception as e:
            print("GET ITEM ERROR.", e)
            continue
        try:
            cate = item['category']
        except Exception:
            cate = ""
    return cate, duty, require, dtimes
return res if __name__ == '__main__': segpos_files = sorted(os.listdir(segpos_path)) res = pmag.MagicDict() for cate, _post in cate_gw.items(): duty_list, require_list, dtimes_list = [], [], [] tmp = pmag.MagicDict() for post, job in _post.items(): tmp_duty_list, tmp_require_list, tmp_dtimes_list = [], [], [] tag = cate + "_" + post for file in segpos_files: if tag in file: fname = os.path.join(segpos_path, file) cate_data = piop.read_json(fname) tmp_duty_list.extend(cate_data['duty']) tmp_require_list.extend(cate_data['require']) tmp_dtimes_list.extend(cate_data['dtimes']) tmp_duty = get_common(tmp_duty_list, IGNORE, NEEDPOS) tmp_require = filter_require( tmp_duty, get_common(tmp_require_list, IGNORE, NEEDPOS)) tmp[post]['duty'] = get_need_item(tmp_duty) tmp[post]['require'] = get_need_item(tmp_require) tmp[post]['demand'] = get_dtime_item(tmp_dtimes_list) duty_list.extend(tmp_duty_list) require_list.extend(tmp_require_list) dtimes_list.extend(tmp_dtimes_list)
def test_read_json():
    """read_json should parse a JSON file into a dict, preserving
    non-ASCII (Chinese) values."""
    data = read_json(os.path.join(DATA_PATH, 'json.json'))
    # isinstance is the idiomatic type check; `type(x) == dict` fails for
    # dict subclasses and is flagged by linters (E721).
    assert isinstance(data, dict)
    assert data == {"json1": "this is line 1", "json2": "这是第二行。"}
import json import math import os from pnlp import piop, pmag ROOT_PATH = os.path.dirname(os.path.abspath(__file__)) MODEL_PATH = os.path.join(ROOT_PATH, "data", "model.txt") IGNORE_PATH = os.path.join(ROOT_PATH, "data", "ignore.txt") MODEL = piop.read_json(MODEL_PATH) IGNORE = piop.read_lines(IGNORE_PATH) def get_demand_normfactor(): """ Get demand normalization factor Parameters ----------- Returns -------- demand normalization factor, int type """ csfs, ilfs, factors = [], [], [] for cate, _others in MODEL.items(): for post, others in _others['posts'].items(): item = MODEL[cate]['posts'][post]['demand'] csf = item.get('continuous_freq', 0) ilf = item.get('interval_freq', 0) phf = item.get('publish_freq', 0) # factor = (csf + ilf) / 2 * phf