示例#1
0
def last_finish_reason():
    """Return the finish reason recorded in the persisted process file.

    Falls back to the FINISHED constant when no persist file exists yet
    (i.e. on the very first run).
    """
    reason = FINISHED
    # Resolve the setting once; the original read 'PERSIST_FILE' twice.
    persist_path = settings.get('PERSIST_FILE')
    if Path(persist_path).is_file():
        process = load_json(persist_path)
        reason = process['finish_reason']
    return reason
示例#2
0
    def __init__(self, settings, default_process=default_process):
        """Store crawl settings and load the list of province names.

        Province names come from PROVINCE_DIR/court_region.json; only the
        'name' field of each entry is kept.
        """
        self.process_file_path = settings.get('PERSIST_FILE')
        self.next_process: dict = None
        self.default_process = default_process
        self.settings = settings

        provinces_file_path = os.path.join(
            settings.get('PROVINCE_DIR'), 'court_region.json')
        # Keep just the names; the rest of the record is not needed here.
        self.provinces = [entry['name'] for entry in load_json(provinces_file_path)]
示例#3
0
    def __pre_request(self):
        """Load province names, create the decoder, and fetch a fresh guid."""
        # NOTE(review): these absolute developer-machine paths are hard-coded;
        # presumably they should come from settings like the rest of the
        # project — confirm before deploying elsewhere.
        provinces_dir = '/Users/stack/code/py3/wenshu/judgement_spider/judgement_spider/util/provinces'
        for entry in load_json(os.path.join(provinces_dir, 'province.json')):
            self.provinces.append(entry['name'])

        self.decoder = Decoder(
            '/Users/stack/code/py3/wenshu/judgement_spider/judgement_spider/public'
        )
        self.guid = self.__get_guid()
示例#4
0
 def __init__(self, config_path):
     """Load the e-mail configuration dict from *config_path*.

     Raises:
         Exception: if *config_path* does not point to an existing file.
     """
     # Bug fix: is_file is a method — without the call parentheses the bound
     # method object is always truthy, so the guard could never raise.
     if not Path(config_path).is_file():
         raise Exception('Please provide your email config')
     self.config: dict = load_json(config_path)
示例#5
0
from judgement_spider.util.toolbox import load_json, dump_json
import os
# import urllib.parse

# Renamed from `dir`, which shadowed the builtin of the same name.
DATA_DIR = '/Users/stack/code/py3/wenshu/judgement_spider/judgement_spider/util/provinces'

if __name__ == "__main__":
    # Merge every per-province court file into a single mapping of
    # {province_name: [court records]} and write it to converted.json.
    provinces = load_json(os.path.join(DATA_DIR, 'province.json'))
    courts = {}
    for p in provinces:
        # The .encode('utf-8').decode('utf-8') round-trip was a no-op on a
        # str and has been removed.
        p_name = p['name']
        json_path = os.path.join(DATA_DIR, '{}.json'.format(p_name))
        p_courts = load_json(json_path)
        p_c = []
        for court in p_courts:
            # 'leval' (sic) is the key actually used in the source data.
            obj = {
                'court': court['court'],
                'province': court['province'],
                'region': court['region'],
                'leval': court['leval'],
                'key': court['key'],
            }
            p_c.append(obj)
            print(obj)

        courts[p_name] = p_c
    dump_json(os.path.join(DATA_DIR, 'converted.json'), courts)
示例#6
0
    def configure(self):
        """Derive the next crawl parameters from the persisted last process.

        Reads the persisted process file (if present), decides whether the
        date / province / index must advance based on the recorded finish
        reason, then publishes the resulting parameter dict through
        settings['PARAM'] and this manager through settings['MANAGER'].
        """
        settings = self.settings
        next_process: dict = None

        if not Path(self.process_file_path).is_file():
            # First run: no persisted state, start from the default process.
            # Bug fix: use the attribute stored in __init__ rather than the
            # module-level global of the same name.
            next_process = self.default_process
        else:
            date_to_crawl = None
            index_to_crawl = None
            province_to_crawl = None
            current_tried_times = None

            last_process: dict = load_json(self.process_file_path)

            # Consistency fix: use the module logger like the rest of this
            # method instead of the root logger via logging.info.
            logger.info('Last process is {}'.format(last_process))
            last_index = int(last_process['last_index'])
            last_date = str_to_datetime(last_process['last_date'])
            finish_reason = last_process['finish_reason']
            last_tried_times = int(last_process['last_tried_times'])
            last_province = last_process['last_province']
            last_province_idx = self.provinces.index(last_province)

            # Bug fix: all_indexes was only bound when the key existed, so the
            # comparisons below raised NameError otherwise. Default to 0 so a
            # missing total forces a parameter change instead of crashing.
            all_indexes = last_process.get('all_indexes', 0)

            need_change_param = False
            # We finished, or we met the max index: change province or date.
            if (finish_reason == FINISHED) and (last_index >= all_indexes
                                                or last_index >= settings.getint('INDEXES_PER_DEPTH', 20)
                                                ):
                need_change_param = True
            # We need a retry but have met the max tried times, and we cannot
            # advance the index because we already hit the max index.
            # Bug fix: this branch read the misspelled key 'INDEX_PER_DEPTH'
            # while the branch above uses 'INDEXES_PER_DEPTH'.
            if (finish_reason == NEED_RETRY) and \
                (last_tried_times == settings.getint('MAX_TRIED_TIMES', 2)) and \
                    (last_index >= all_indexes or last_index >= settings.getint('INDEXES_PER_DEPTH', 20)):
                need_change_param = True
            if finish_reason == DATE_FINISHED:
                need_change_param = True

            elif finish_reason in [VALIDATION, REDIRECT, CANCELLED, SHUT_DOWN]:
                pass
            elif finish_reason == NEED_RETRY:
                pass

            if need_change_param:
                # 1. keep province unchanged and change date, or
                # 2. change province and restart the date range.
                # We have not arrived at the last province yet:
                if last_province_idx != len(self.provinces) - 1:
                    # Change date, keep province unchanged.
                    if str_to_datetime(settings.get('STOP_DATE', '2018-01-01')) < last_date - TIME_DELTA_REGION:
                        date_to_crawl = last_date - TIME_DELTA_REGION
                        province_to_crawl = last_province
                    else:
                        # We cannot move the date further back; change province.
                        province_to_crawl = self.provinces[last_province_idx + 1]
                        date_to_crawl = str_to_datetime(START_DATE)
                    index_to_crawl = START_INDEX

                else:
                    # We have arrived at the last province.
                    if str_to_datetime(settings.get('STOP_DATE', '2018-01-01')) < last_date - TIME_DELTA_REGION:
                        date_to_crawl = last_date - TIME_DELTA_REGION
                        province_to_crawl = last_province
                        index_to_crawl = START_INDEX
                    else:
                        # Every province and every date up to STOP_DATE is
                        # done; the crawl parameters stay None.
                        pass

                current_tried_times = 1
            else:
                # No need to change province or date.
                date_to_crawl = last_date
                province_to_crawl = last_province
                if finish_reason in [REDIRECT, VALIDATION, SHUT_DOWN, CANCELLED, NEED_RETRY]:
                    current_tried_times = last_tried_times + 1
                    index_to_crawl = last_index
                elif finish_reason == NETWORK_ERROR:
                    # Network errors do not count against the retry budget.
                    current_tried_times = last_tried_times
                    index_to_crawl = last_index
                else:
                    index_to_crawl = last_index + 1
                    current_tried_times = 1
            next_process = {
                'date_to_crawl': datetime_to_str(date_to_crawl),
                'province_to_crawl': province_to_crawl,
                'index_to_crawl': index_to_crawl,
                'current_tried_times': current_tried_times,
                # Bug fix: indexing last_process directly raised KeyError when
                # 'all_indexes' was absent; reuse the defaulted value above.
                'last_all_indexes': all_indexes,
            }
        logger.info('Process to feed is {}'.format(next_process))
        self.settings.set('PARAM', next_process)
        self.settings.set('MANAGER', self)