def last_finish_reason():
    # Default to FINISHED when no crawl state has been persisted yet.
    reason = FINISHED
    process_file = Path(settings.get('PERSIST_FILE'))
    if process_file.is_file():
        process = load_json(settings.get('PERSIST_FILE'))
        reason = process['finish_reason']
    return reason
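# Usage sketch: decide on startup whether the previous run ended cleanly
# before resuming. The two branch functions below are illustrative
# assumptions, not part of the code above.
if last_finish_reason() == FINISHED:
    start_new_crawl()     # hypothetical entry point
else:
    resume_last_crawl()   # hypothetical entry point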
def __init__(self, settings, default_process=default_process):
    self.process_file_path = settings.get('PERSIST_FILE')
    self.next_process: dict = None
    self.default_process = default_process
    self.settings = settings
    # Build the ordered list of province names the crawler iterates over.
    provinces_file_path = os.path.join(settings.get('PROVINCE_DIR'), 'court_region.json')
    self.provinces = [p['name'] for p in load_json(provinces_file_path)]
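# A minimal court_region.json that satisfies the loading above; only the
# 'name' field is actually read here, any further fields are assumptions.
sample_court_region = [
    {'name': '北京市'},
    {'name': '上海市'},
]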
def __pre_request(self):
    province_list = load_json(
        os.path.join(
            '/Users/stack/code/py3/wenshu/judgement_spider/judgement_spider/util/provinces',
            'province.json'))
    for p in province_list:
        self.provinces.append(p['name'])
    self.decoder = Decoder(
        '/Users/stack/code/py3/wenshu/judgement_spider/judgement_spider/public'
    )
    self.guid = self.__get_guid()
def __init__(self, config_path):
    # is_file must be called; the bare method reference is always truthy,
    # so the original check could never fail.
    if not Path(config_path).is_file():
        raise Exception('Please provide your email config')
    self.config: dict = load_json(config_path)
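# Usage sketch: the constructor only guarantees that the file exists and
# parses as JSON into self.config; the path and every key shown below are
# hypothetical.
# {
#     "smtp_host": "smtp.example.com",
#     "sender": "me@example.com",
#     "password": "..."
# }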
from judgement_spider.util.toolbox import load_json, dump_json
import os
# import urllib.parse

dir = '/Users/stack/code/py3/wenshu/judgement_spider/judgement_spider/util/provinces'

if __name__ == "__main__":
    provinces = load_json(os.path.join(dir, 'province.json'))
    courts = {}
    for p in provinces:
        # The original encode('utf-8').decode('utf-8') round trip is a no-op
        # on a Python 3 str, so the name is used directly.
        p_name = p['name']
        json_path = os.path.join(dir, '{}.json'.format(p_name))
        p_courts = load_json(json_path)
        p_c = []
        for court in p_courts:
            obj = {
                'court': court['court'],
                'province': court['province'],
                'region': court['region'],
                # 'leval' mirrors the key spelling used in the source files.
                'leval': court['leval'],
                'key': court['key'],
            }
            p_c.append(obj)
            print(obj)
        courts[p_name] = p_c
    dump_json(os.path.join(dir, 'converted.json'), courts)
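# Shape of the converted.json written above: province name -> list of court
# records carrying exactly the five keys copied in the loop (values are
# illustrative):
# {
#     "北京市": [
#         {"court": "...", "province": "...", "region": "...",
#          "leval": "...", "key": "..."},
#         ...
#     ]
# }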
def configure(self):
    settings = self.settings
    next_process: dict = None
    if not Path(self.process_file_path).is_file():
        # No persisted state yet: start from the default process.
        next_process = self.default_process
    else:
        date_to_crawl = None
        index_to_crawl = None
        province_to_crawl = None
        current_tried_times = None
        last_process: dict = load_json(self.process_file_path)
        logger.info('Last process is {}'.format(last_process))
        last_index = int(last_process['last_index'])
        last_date = str_to_datetime(last_process['last_date'])
        finish_reason = last_process['finish_reason']
        last_tried_times = int(last_process['last_tried_times'])
        last_province = last_process['last_province']
        last_province_idx = self.provinces.index(last_province)
        # Default to 0 when 'all_indexes' is absent so the comparisons below
        # stay defined (the original left the name unbound in that case).
        all_indexes = int(last_process.get('all_indexes', 0))

        need_change_param = False
        # We finished and reached either the recorded page count or the
        # configured maximum index: change the province or the date.
        if finish_reason == FINISHED and (
                last_index >= all_indexes
                or last_index >= settings.getint('INDEXES_PER_DEPTH', 20)):
            need_change_param = True
        # We need a retry, but the retry budget is spent and the index cannot
        # advance further because it already reached the maximum.
        if (finish_reason == NEED_RETRY
                and last_tried_times == settings.getint('MAX_TRIED_TIMES', 2)
                and (last_index >= all_indexes
                     or last_index >= settings.getint('INDEXES_PER_DEPTH', 20))):
            need_change_param = True
        if finish_reason == DATE_FINISHED:
            need_change_param = True
        elif finish_reason in [VALIDATION, REDIRECT, CANCELLED, SHUT_DOWN]:
            # Interrupted runs keep the same parameters; handled below.
            pass
        elif finish_reason == NEED_RETRY:
            pass

        if need_change_param:
            # Either keep the province unchanged and step the date back, or
            # move to the next province and restart from the first date.
            if last_province_idx != len(self.provinces) - 1:
                # We have not reached the last province yet.
                if str_to_datetime(settings.get('STOP_DATE', '2018-01-01')) < last_date - TIME_DELTA_REGION:
                    # Step the date back and keep the province unchanged.
                    date_to_crawl = last_date - TIME_DELTA_REGION
                    province_to_crawl = last_province
                else:
                    # The date range is exhausted: move to the next province.
                    province_to_crawl = self.provinces[last_province_idx + 1]
                    date_to_crawl = str_to_datetime(START_DATE)
                index_to_crawl = START_INDEX
            else:
                # We have arrived at the last province.
                if str_to_datetime(settings.get('STOP_DATE', '2018-01-01')) < last_date - TIME_DELTA_REGION:
                    date_to_crawl = last_date - TIME_DELTA_REGION
                    province_to_crawl = last_province
                    index_to_crawl = START_INDEX
                else:
                    # Every province and every date back to 2018 is finished;
                    # the crawl parameters stay None.
                    pass
            current_tried_times = 1
        else:
            # No need to change the province or the date.
            date_to_crawl = last_date
            province_to_crawl = last_province
            if finish_reason in [REDIRECT, VALIDATION, SHUT_DOWN, CANCELLED, NEED_RETRY]:
                # Retry the same index and count the attempt.
                current_tried_times = last_tried_times + 1
                index_to_crawl = last_index
            elif finish_reason == NETWORK_ERROR:
                # Retry the same index without consuming an attempt.
                current_tried_times = last_tried_times
                index_to_crawl = last_index
            else:
                # Move on to the next index page.
                index_to_crawl = last_index + 1
                current_tried_times = 1
        next_process = {
            'date_to_crawl': datetime_to_str(date_to_crawl),
            'province_to_crawl': province_to_crawl,
            'index_to_crawl': index_to_crawl,
            'current_tried_times': current_tried_times,
            'last_all_indexes': all_indexes,
        }
    logger.info('Process to feed is {}'.format(next_process))
    self.settings.set('PARAM', next_process)
    self.settings.set('MANAGER', self)
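# A minimal persisted process file that configure() above can consume,
# assembled from the keys it reads; every value is illustrative, and the
# finish_reason string assumes the reason constants are plain strings.
sample_last_process = {
    'last_index': 3,
    'last_date': '2019-06-01',      # parsed by str_to_datetime
    'finish_reason': 'FINISHED',
    'last_tried_times': 1,
    'last_province': '北京市',
    'all_indexes': 12,              # total index pages seen at this depth
}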