def test_spider():
    """Smoke-test Spider against the live service.

    Builds one search condition (district 西藏自治区, 1991-01-01 to
    2018-09-15), prints the aggregated tree content for it, then pages
    through the matching content list and prints every item.

    Network-only test helper: no return value, no assertions.
    """
    from session import Session
    from condition import Condition
    from spider import Spider
    from parameter import Parameter
    from datetime import datetime

    s = Session()
    c = Condition()
    spider = Spider(sess=s)

    # Build the condition once and reuse it; Condition methods are fluent
    # and return new instances (see crawl_by_district, which chains them
    # the same way), so sharing `cond` is safe.
    cond = c.district('西藏自治区').date(datetime(1991, 1, 1),
                                    datetime(2018, 9, 15))

    print(spider.tree_content(param=Parameter(param=str(cond), sess=s)))

    # page: items per page; order: sort key; direction: asc / desc
    for i in spider.content_list(param=Parameter(param=str(cond), sess=s),
                                 page=20,
                                 order='法院层级',
                                 direction='asc'):
        print(i)
def court(self, condition: Condition, district: str, start_court: str = None):
    """Yield crawl partitions by court for one district.

    Splits the result set for *district* into chunks small enough to page
    (the service appears to cap listable results around 200 — TODO confirm),
    drilling from province level down to mid/base courts only when needed.

    :param condition: base search condition; narrowed by ``district`` here
    :param district: district name used to scope the query
    :param start_court: resume point; only meaningful for level-2 (middle)
        courts — yields are skipped until this court name is seen
    :return: generator of ``(court_name, court_level, court_indicator, count)``
    """
    level_count = {'高级法院': 0, '中级法院': 0, '基层法院': 0}
    condition = condition.district(district)
    # Aggregated counts per court level for this district.
    info = self.tree_content(
        Parameter(param=str(condition), sess=self.sess))['法院层级']
    # satisfy == True means every level's count is small enough (<= 200)
    # to be fetched directly without drilling into individual courts.
    satisfy = True
    for item in info['ParamList']:
        if item['IntValue'] > 200:
            satisfy = False
        if item['Key'] in level_count:
            level_count[item['Key']] = item['IntValue']
    if satisfy:
        # NOTE(review): here the second tuple element is the level NAME
        # (e.g. '高级法院'), while every other branch yields an int level
        # (1/2/3). Callers comparing court[1] == 2 will never match these —
        # confirm whether Condition.court accepts level names.
        for k, v in level_count.items():
            if v > 0:
                yield None, k, True, v
    else:
        # start == False while we are skipping ahead to start_court.
        start = start_court is None
        if start and level_count['高级法院'] > 0:
            # Province-level (1) courts: count is always small enough.
            yield None, 1, True, level_count['高级法院']
        middle = self.court_tree_content(condition, parval=district)['中级法院']
        # Process smaller courts first (ascending by count).
        for d in sorted(middle['ParamList'],
                        key=lambda item: item['IntValue'],
                        reverse=False):
            mid_court = d['Key']
            if not start:
                if mid_court == start_court:
                    start = True
            if start:
                if 0 < d['IntValue'] < 200:
                    # Middle court small enough to fetch as one chunk.
                    yield mid_court, 2, False, d['IntValue']
                else:
                    # Too big: split into the middle court itself (indicator
                    # True => exclude sub-courts) plus each base court.
                    base = self.court_tree_content(
                        condition.court(mid_court, 2, False),
                        parval=mid_court)['基层法院']
                    if d['IntValue'] - base['IntValue'] > 0:
                        yield mid_court, 2, True, d['IntValue'] - base[
                            'IntValue']
                    for g in sorted(base['ParamList'],
                                    key=lambda item: item['IntValue'],
                                    reverse=False):
                        base_court = g['Key']
                        if g['IntValue'] > 0:
                            yield base_court, 3, False, g['IntValue']
def crawl_by_district():
    """Crawl the full dataset district by district, with resume and retry.

    Nested loop structure: district -> time interval -> (court, when the
    interval holds > 200 items) -> paged content list. Each level keeps its
    own retry counter; after ``max_retry`` consecutive ErrorList failures
    the session's proxy is switched and the counter resets, so the loop
    never gives up permanently. Resume points (district / date / court)
    come from Config.start and are consumed once, so later iterations
    start from the beginning.

    Output: one printed item per line appended to ./data/data <ts>.txt.
    """
    # Read config
    start_dist, start_date, start_court = None, None, None
    start_info = Config.start
    if hasattr(start_info, 'district') and start_info.district is not None:
        start_dist = start_info.district
        logging.info('Start District: {}'.format(start_dist))
    if hasattr(start_info, 'date') and start_info.date is not None:
        start_date = start_info.date
        logging.info('Start Date: {}'.format(start_date.strftime("%Y-%m-%d")))
    if hasattr(start_info, 'court') and start_info.court is not None:
        start_court = start_info.court
        logging.info('Start Court: {}'.format(start_court))
    max_retry = Config.config.max_retry
    # Append mode: restarted runs keep writing into a fresh timestamped file.
    data_file = open('./data/data {}.txt'.format(
        datetime.now().strftime('%Y-%m-%d %H-%M-%S')), 'a', encoding='utf-8')
    s = Session()
    c = Condition()
    spider = Spider(sess=s)
    total_success = False
    while not total_success:
        try:
            # start == False while skipping districts before the resume point.
            if start_dist is not None:
                start = False
            else:
                start = True
            # log the distribution of district
            with open('district_list.txt', 'w', encoding='utf-8') as f:
                print(json.dumps(list(spider.district(condition=c)),
                                 ensure_ascii=False), file=f)
            for dist in spider.district(condition=c):
                # Find the district to start
                if not start:
                    if dist == start_dist:
                        start = True
                    else:
                        continue
                logging.info(dist)
                c1 = c.district(dist)
                # If time_interval is interrupted, continue from the start_date
                # (consumed once: later districts start from the beginning).
                cur_date = start_date
                start_date = None
                # Variables for retry
                dist_success = False
                dist_retry = max_retry
                while not dist_success:
                    try:
                        for time_interval in spider.time_interval(
                                condition=c1, start_date=cur_date):
                            # time_interval: (start_date, end_date, count)
                            logging.info('{0} {1} {2} {3}'.format(
                                dist,
                                time_interval[0].strftime('%Y-%m-%d'),
                                time_interval[1].strftime('%Y-%m-%d'),
                                time_interval[2]))
                            # Remember progress so a retry of this district
                            # resumes from the current interval.
                            cur_date = time_interval[0]
                            time_success = False
                            time_retry = max_retry
                            index = 1
                            c2 = c1.date(time_interval[0], time_interval[1])
                            # start_court is also consumed once.
                            cur_court = start_court
                            start_court = None
                            while not time_success:
                                if time_interval[2] > 200:
                                    # Interval too large to page directly:
                                    # partition by court via spider.court.
                                    try:
                                        for court in spider.court(
                                                condition=c2,
                                                district=dist,
                                                start_court=cur_court):
                                            logging.info(
                                                '{0} {1} {2} {3} {4} {5} {6}'.
                                                format(
                                                    dist,
                                                    time_interval[0].strftime(
                                                        '%Y-%m-%d'),
                                                    time_interval[1].strftime(
                                                        '%Y-%m-%d'),
                                                    court[0], court[1],
                                                    court[2], court[3]))
                                            # Only level-2 courts are valid
                                            # resume points (see court()).
                                            if court[1] == 2:
                                                cur_court = court[0]
                                            # NOTE(review): the flattened
                                            # original is ambiguous here; the
                                            # per-court fetch below is taken
                                            # to run for EVERY yielded court,
                                            # not only level 2 — confirm.
                                            court_success = False
                                            court_retry = max_retry
                                            index = 1
                                            c3 = c2.court(*court[0:3])
                                            while not court_success:
                                                try:
                                                    for item, idx in spider.content_list(
                                                            param=Parameter(
                                                                param=str(c3),
                                                                sess=s),
                                                            page=20,
                                                            order='法院层级',
                                                            direction='asc',
                                                            index=index):
                                                        print(item,
                                                              file=data_file)
                                                        # Track page index so
                                                        # a retry resumes
                                                        # mid-listing.
                                                        index = idx
                                                    court_success = True
                                                except ErrorList as e:
                                                    logging.error(
                                                        'Error when fetch content list: {0}'
                                                        .format(str(e)))
                                                    court_retry -= 1
                                                    if court_retry <= 0:
                                                        s.switch_proxy()
                                                        court_retry = max_retry
                                        time_success = True
                                    except ErrorList as e:
                                        logging.error(
                                            'Error when fetch court: {0}'.
                                            format(str(e)))
                                        time_retry -= 1
                                        if time_retry <= 0:
                                            s.switch_proxy()
                                            time_retry = max_retry
                                else:
                                    # Interval small enough: page it directly.
                                    try:
                                        for item, idx in spider.content_list(
                                                param=Parameter(param=str(c2),
                                                                sess=s),
                                                page=20,
                                                order='法院层级',
                                                direction='asc',
                                                index=index):
                                            print(item, file=data_file)
                                            index = idx
                                            # print(item['id'], item['name'])
                                            # try:
                                            #     spider.download_doc(item['id'])
                                            # except:
                                            #     print(item['id'], file=error_log)
                                        time_success = True
                                    except ErrorList as e:
                                        logging.error(
                                            'Error when fetch content list: {0}'
                                            .format(str(e)))
                                        time_retry -= 1
                                        if time_retry <= 0:
                                            s.switch_proxy()
                                            time_retry = max_retry
                        dist_success = True
                    except ErrorList as e:
                        logging.error(
                            'Error when fetch time interval: {0}'.format(
                                str(e)))
                        dist_retry -= 1
                        if dist_retry <= 0:
                            s.switch_proxy()
                            dist_retry = max_retry
            total_success = True
        except ErrorList as e:
            logging.error('Error when fetch dist information: {0}'.format(
                str(e)))
            s.switch_proxy()
    data_file.close()