class shipxy_crawler_class():
    def __init__(self):
        self.crawler_status = True
        self.error_msg_list = []
        self.db_rollback_dict = {}
        self.script_name = os.path.basename(__file__)
        self.get_shipxy_thread_list = []
        # At least 6 threads are needed for fetching ship details, otherwise
        # the hot and cold zones cannot be finished within one hour
        self.get_ship_detail_quantity_limit = 6
        self.ship_type_dict = {
            x.type: x.name for x in ShipTypeShipxy.query.all()
        }
        self.navistatus_type_dict = {
            x.type: x.name for x in NavistatusTypeShipxy.query.all()
        }

    def get_shipxy_thread(self, db_result_dict_for_func):
        try:
            # data_token_result = self.sc.getareashipssimple(db_result_dict_for_func['coor_1'], db_result_dict_for_func['coor_2'])
            data_token_result = self.sc.getareashipssimple(
                [db_result_dict_for_func['lu_lat'], db_result_dict_for_func['lu_lng']],
                [db_result_dict_for_func['rd_lat'], db_result_dict_for_func['rd_lng']])
        except:
            print(traceback.format_exc())
            self.db_rollback_dict[db_result_dict_for_func['id']] = db_result_dict_for_func
        if db_result_dict_for_func['id'] in self.db_rollback_dict:
            return
        try:
            time.sleep(self.gernal_sleep_time)
            if db_result_dict_for_func['id'] in self.db_rollback_dict:
                return
            batch_load_list = []
            if data_token_result['status'] != 0 or 'data' not in data_token_result:
                self.crawler_status = False
                self.error_msg_list.append(
                    '{}: 取得data token失敗, 船訊網 API 回傳 {}'.format(
                        self.script_name, data_token_result))
                return
            if not data_token_result['count']:
                print('{}: 區域 {} 內無船隻資料'.format(self.script_name,
                                               db_result_dict_for_func['id']))
                return
            area_result_list = []
            area_data_list = self.sc.area_info(data_token_result['data'])
            for area_data in area_data_list:
                # Skip entries without a usable MMSI
                if not area_data.get('mmsi') \
                        or area_data['mmsi'] == 0 \
                        or area_data['mmsi'] == '0':
                    continue
                area_result_list.append(area_data)
            # if len(area_data_list) != len(area_result_list):
            #     print('\n'.join([
            #         '-' * 20,
            #         '{}/{}'.format(len(area_data_list), len(area_result_list)),
            #         '{}, {}'.format(db_result_dict_for_func['lu_lat'], db_result_dict_for_func['lu_lng']),
            #         '{}, {}'.format(db_result_dict_for_func['rd_lat'], db_result_dict_for_func['rd_lng']),
            #         json.dumps(area_data_list, ensure_ascii=False, indent=4),
            #         '-' * 20
            #     ]))
            if not area_result_list:
                print('{}: 區域 {} 內無可爬的船隻資料'.format(
                    self.script_name, db_result_dict_for_func['id']))
                self.es.batch_load(batch_load_list)
                return
            gsdc = get_ship_detail_class(self.sc)
            thread_start_time = time.time()
            for index, area_data in enumerate(area_result_list):
                # Stop mechanism for crawls that run too long
                if time_to_stop():
                    break
                try:
                    # Each account sends at most one request per second,
                    # to keep the account from being locked
                    while [x.is_alive() for x in gsdc.get_ship_detail_thread_list].count(True) \
                            >= math.floor(
                                (self.get_shipxy_thread_limit_tmp *
                                 self.get_ship_detail_quantity_limit) /
                                ([x.is_alive() for x in self.get_shipxy_thread_list].count(True) + 1)):
                        # Stop mechanism for crawls that run too long
                        if time_to_stop():
                            break
                        continue
                except:
                    lll = [
                        traceback.format_exc(),
                        '{}'.format([x.is_alive() for x in gsdc.get_ship_detail_thread_list].count(True)),
                        '{}'.format(self.get_shipxy_thread_limit_tmp),
                        '{}'.format(self.get_ship_detail_quantity_limit),
                        '{}'.format([x.is_alive() for x in self.get_shipxy_thread_list].count(True)),
                    ]
                    raise Exception('\n'.join(lll))
                remove_index_list = []
                for index, thread in enumerate(gsdc.get_ship_detail_thread_list):
                    if not thread.is_alive():
                        remove_index_list.append(index)
                remove_index_list.reverse()
                for index in remove_index_list:
                    del gsdc.get_ship_detail_thread_list[index]
                thread = threading.Thread(target=gsdc.get_ship_detail,
                                          args=(area_data, ),
                                          daemon=True)
                thread.start()
                gsdc.get_ship_detail_thread_list.append(thread)
                time.sleep(self.gernal_sleep_time)
            while [x.is_alive() for x in gsdc.get_ship_detail_thread_list].count(True):
                # Stop mechanism for crawls that run too long
                if time_to_stop():
                    break
                continue
            print('爬取區域: {}, 耗費時間: {}, 船隻數量: {}'.format(
                db_result_dict_for_func['id'],
                round((time.time() - thread_start_time), 1),
                len(area_result_list)))
            # Stop mechanism for crawls that run too long
            if not time_to_stop():
                if len(area_result_list) >= 100 and not gsdc.thread_result_dict:
                    self.crawler_status = False
                    self.error_msg_list.append('\n'.join(list(set(gsdc.error_msg_list))))
                    return
                elif len(area_result_list) >= 100 \
                        and int(len(area_result_list) * 0.8) > len(list(gsdc.thread_result_dict.keys())):
                    self.db_rollback_dict[db_result_dict_for_func['id']] = db_result_dict_for_func
                    self.error_msg_list.append('\n'.join(list(set(gsdc.error_msg_list))))
                    return
            id_list = []
            for area_data in area_result_list:
                if area_data['mmsi'] not in gsdc.thread_result_dict:
                    continue
                id_list.append('{}_{}'.format(
                    area_data['mmsi'],
                    gsdc.thread_result_dict[area_data['mmsi']]['lastdyn']))
            if id_list:
                es_ship_ids = set([
                    data['_id'] for data in self.es.scan(
                        {'query': {'bool': {'must': [{'terms': {'_id': id_list}}]}}},
                        app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME'])
                ])
            else:
                es_ship_ids = set()
            # print('\n'.join([
            #     'area_result_list len: {}'.format(len(area_result_list)),
            #     'id_list len: {}'.format(len(id_list)),
            #     'es_ship_ids len: {}'.format(len(list(es_ship_ids))),
            #     'area_result_list-id_list= {}'.format(len(area_result_list) - len(id_list)),
            #     'id_list-es_ship_ids= {}'.format(len(id_list) - len(list(es_ship_ids)))
            # ]))
            delete_list = []
            for index, area_data in enumerate(area_result_list):
                if area_data['mmsi'] not in gsdc.thread_result_dict:
                    print('{} : 未取得船隻 {} 之詳細資訊,略過之'.format(
                        self.script_name, area_data['mmsi']))
                    continue
                dictionary = deepcopy(gsdc.thread_result_dict[area_data['mmsi']])
                dictionary['_index'] = app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME']
                dictionary['_type'] = '_doc'
                dictionary['_id'] = '{}_{}'.format(area_data['mmsi'], dictionary['lastdyn'])
                if dictionary['_id'] in es_ship_ids:
                    continue
                # dictionary['area_list_id'] = db_result_dict_for_func['area_list_id']
                dictionary['nationality'] = self.mmsi_dict[dictionary['mmsi'][:3]] \
                    if dictionary['mmsi'][:3] in self.mmsi_dict else None
                dictionary['cog'] = dictionary['cog'] / 100  # course over ground
                dictionary['draught'] = dictionary['draught'] / 1000  # draught
                dictionary['hdg'] = dictionary['hdg'] / 100  # heading
                # # shipxy sometimes reports heading 51100 while the website shows heading 0
                # if dictionary['hdg'] > 360:
                #     dictionary['hdg'] = 0
                for key in ['lat', 'lon']:
                    dictionary.pop(key)
                dictionary['latitude'] = area_data['lat']  # latitude
                dictionary['longitude'] = area_data['lng']  # longitude
                dictionary['sog'] = round(dictionary['sog'] / 5133 * 10, 2)  # speed over ground, in knots
                dictionary['length'] = dictionary['length'] / 10  # ship length
                dictionary['lineWidth'] = area_data['lineWidth']
                dictionary['width'] = dictionary['width'] / 10  # ship width
                dictionary['lastdyn_active'] = area_data['lastdyn_active']  # whether data can still be fetched
                dictionary['offset'] = area_data['offset']
                dictionary['rot'] = area_data.get('rot')
                dictionary['rotate'] = area_data['rotate']
                dictionary['shiptype'] = area_data['shiptype']
                dictionary['state'] = area_data['state']
                dictionary['state_color'] = area_data['state_color']
                dictionary['istop'] = area_data['istop']
                dictionary['tracks'] = area_data['tracks']
                dictionary['tcname'] = s2tw_converter(dictionary['cnname'])
                dictionary['utc_timestamp'] = dictionary.pop('lastdyn')
                dictionary['time'] = (
                    datetime.utcfromtimestamp(dictionary['utc_timestamp']) +
                    timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
                if dictionary['type'] not in self.ship_type_dict:
                    sts_db_result = ShipTypeShipxy.query.filter(
                        ShipTypeShipxy.type == dictionary['type']).first()
                    if sts_db_result:
                        self.ship_type_dict[sts_db_result.type] = sts_db_result.name
                    else:
                        self.ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = self.ship_type_dict[dictionary['type']]
                if dictionary['navistatus'] not in self.navistatus_type_dict:
                    nt_db_result = NavistatusTypeShipxy.query.filter(
                        NavistatusTypeShipxy.type == dictionary['navistatus']).first()
                    if nt_db_result:
                        self.navistatus_type_dict[nt_db_result.type] = nt_db_result.name
                    else:
                        self.navistatus_type_dict[dictionary['navistatus']] = None
                dictionary['navistatus_text'] = self.navistatus_type_dict[dictionary['navistatus']]
                dictionary['_routing'] = '{}'.format(
                    (datetime.utcfromtimestamp(dictionary['utc_timestamp']) +
                     timedelta(hours=8)).year)
                # dictionary['updatetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                # Strip surrounding whitespace from string values;
                # empty strings (and the literal 'NULL') become null
                for key in list(dictionary.keys()):
                    if type(dictionary[key]) is not str:
                        continue
                    dictionary[key] = dictionary[key].strip()
                    if not dictionary[key] or dictionary[key] == 'NULL':
                        dictionary[key] = None
                batch_load_list.append(dictionary)
            if delete_list:
                # Several machines may crawl the same area; if one of them has
                # already deleted the data, this delete would fail, so ignore errors
                try:
                    self.es.delete_data(delete_list)
                except:
                    pass
            if batch_load_list:
                self.es.batch_load(batch_load_list)
            del (delete_list, batch_load_list)
        except:
            msg_list = [
                'ip: {}'.format(get_external_ip()),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ]
            print('\n\n'.join(msg_list))
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(msg_list))
            ggg.send_email()

    def shipxy_crawler_func(self):
        try:
            ip = get_external_ip()
        except:
            ip = '取得IP失敗'
        try:
            # Check whether the same scheduled job is still running on this machine
            if check_same_process_still_running(self.script_name):
                # Counting this process, two or more identical jobs are running
                print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
                return
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
            # line_notify_pusher(msg)
            return
        try:
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME']):
                print(self.es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME'],
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['MAPPING_FILEPATH']))
            db_result_list = AreaList.query.with_entities(
                AreaList.id,
                AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
            if not db_result_list:
                print('{}: 無區域的排程區間資料'.format(self.script_name))
                return
            crawl_span_dict = {
                db_result.id: db_result.crawl_span for db_result in db_result_list
            }
            query_sort_conds = [SubAreaList.area_list_id]
            query_sort_conds.extend([x.id for x in db_result_list])
            self.cold_zone_ids = set([
                db_result.id for db_result in AreaList.query.filter(
                    AreaList.enable == 1, AreaList.name.like('%冷區%')).all()
            ])
            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2, MMSI_Info.alpha_3).all():
                self.mmsi_dict[db_result.mmsi] = \
                    db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2
            self.sc = ShipXY_Crawler()
            cookies_list = []
            for db_result in ShipxyAccount.query.filter(
                    ShipxyAccount.enable == 1, ShipxyAccount.updating == 0,
                    ShipxyAccount.updated_time >= (datetime.now() - timedelta(days=1))).all():
                if not db_result.cookies:
                    continue
                cookies_list.append(deepcopy(db_result.cookies))
            if not cookies_list:
                raise Exception('{}: 無可用之帳號'.format(self.script_name))
            self.sc.update_cookies_list(cookies_list)
            del cookies_list
            while True:
                if not self.crawler_status:
                    raise Exception('\n'.join(list(set(self.error_msg_list))))
                elif self.get_shipxy_thread_list and \
                        [x.is_alive() for x in self.get_shipxy_thread_list].count(True) \
                        >= self.get_shipxy_thread_limit_tmp:
                    continue
                remove_index_list = []
                for index, thread in enumerate(self.get_shipxy_thread_list):
                    if not thread.is_alive():
                        remove_index_list.append(index)
                remove_index_list.reverse()
                for index in remove_index_list:
                    del self.get_shipxy_thread_list[index]
                cookies_list = []
                for cookies in self.sc.cookies_list:
                    if 'SERVERID' in cookies:
                        SERVERID_list = cookies['SERVERID'].split('|')
                        SERVERID_list[1] = '{}'.format(time.time())
                        SERVERID_list[2] = '{}'.format(time.time())
                        cookies['SERVERID'] = '|'.join(SERVERID_list)
                    cookies_list.append(cookies)
                self.sc.update_cookies_list(cookies_list)
                del cookies_list
                db_result = CrawlerMachine.query.filter(CrawlerMachine.ip == ip).first()
                if not db_result:
                    db_result = CrawlerMachine(ip=ip)
                db_result.updatedAt = datetime.now()
                db.session.add(db_result)
                db.session.commit()
                machine_quantity = CrawlerMachine.query.filter(
                    CrawlerMachine.updatedAt >= (datetime.now() - timedelta(hours=1))).count()
                if not machine_quantity:
                    machine_quantity += 1
                # Each account may query an area on average once per second,
                # to keep accounts from being locked.
                # Formula: (1 second / (available accounts / machine count)) - (time already spent this round)
                self.gernal_sleep_time = (1 / len(self.sc.cookies_list)) * machine_quantity * 1.5
                self.get_shipxy_thread_limit_tmp = math.floor(
                    (len(self.sc.cookies_list) / machine_quantity) /
                    self.get_ship_detail_quantity_limit)
                if not self.get_shipxy_thread_limit_tmp:
                    raise Exception('\n'.join([
                        '{}: 帳號總數量未達可爬取之帳號最小數量\n'.format(self.script_name),
                        '最小數量定義的算式為\n',
                        '可用帳號之數量({}) 除以 機器總數({}) 除以 每個 thread 取得船隻詳細資料的子程序上限值({}) 後取最小值整數'.format(
                            len(self.sc.cookies_list), machine_quantity,
                            self.get_ship_detail_quantity_limit)
                    ]))
                if self.db_rollback_dict:
                    for db_result_id in list(self.db_rollback_dict.keys()):
                        db_result = SubAreaList.query.filter(
                            SubAreaList.id == db_result_id).first()
                        db_result.crawler_time = self.db_rollback_dict[db_result_id]['crawler_time']
                        db_result.next_time = self.db_rollback_dict[db_result_id]['next_time']
                        db.session.add(db_result)
                        del self.db_rollback_dict[db_result_id]
                    db.session.commit()
                db_result = SubAreaList.query.filter(
                    SubAreaList.enable == 1, SubAreaList.web == 'shipxy',
                    or_(SubAreaList.next_time <= datetime.now(),
                        SubAreaList.next_time == None),
                    or_(*[
                        SubAreaList.area_list_id == id
                        for id in crawl_span_dict.keys()
                    ])).order_by(sqlalchemy.func.field(*query_sort_conds),
                                 asc(SubAreaList.next_time), func.random()).first()
                if not db_result:
                    if [x.is_alive() for x in self.get_shipxy_thread_list].count(True):
                        print('{}: 無需要爬取的區域, 等待仍在執行的區域爬取子程序結束中,如果所有子程序執行結束且無任何需爬取的區域,程式將會結束'.format(self.script_name))
                        while [x.is_alive() for x in self.get_shipxy_thread_list].count(True):
                            # If some area is due to be crawled again, resume crawling
                            if not datetime.now().minute \
                                    or datetime.now().minute in crawl_span_dict.values():
                                break
                        # continue here instead of return: if the worker threads finish
                        # right past minute 30 or minute 0, there will again be areas to crawl
                        continue
                    else:
                        print('{}: 無需要爬取的區域, 程式結束, 時間: {}'.format(
                            self.script_name, datetime.now()))
                        return
                get_shipxy_thread_input = deepcopy(db_result.json())
                if db_result.area_list_id not in crawl_span_dict:
                    crawl_span_dict[db_result.area_list_id] = AreaList.query.filter(
                        AreaList.id == db_result.area_list_id).first().crawl_span
                crawler_time = datetime.now() - timedelta(
                    minutes=datetime.now().minute % crawl_span_dict[db_result.area_list_id])
                db_result.crawler_time = datetime.strptime(
                    crawler_time.strftime('%Y-%m-%d %H:%M:00'), '%Y-%m-%d %H:%M:%S')
                db_result.next_time = db_result.crawler_time + timedelta(
                    minutes=crawl_span_dict[db_result.area_list_id])
                db.session.add(db_result)
                db.session.commit()
                # Skip degenerate areas whose bounding box has zero width or height
                if db_result.lu_lat == db_result.rd_lat \
                        or db_result.lu_lng == db_result.rd_lng:
                    continue
                db.session.rollback()
                db.session.close()
                thread = threading.Thread(target=self.get_shipxy_thread,
                                          args=(get_shipxy_thread_input, ),
                                          daemon=True)
                thread.start()
                self.get_shipxy_thread_list.append(thread)
                # ###############################
                # for thread in self.get_shipxy_thread_list:
                #     thread.join()
                # return
                # ###############################
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
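# ------------------------------------------------------------------------------
# The crawler above throttles its workers by counting live threads with
# `[x.is_alive() for x in threads].count(True)` inside a busy-wait loop.
# Below is a minimal, self-contained sketch of that pattern; `run_bounded`,
# `worker`, and the limit of 6 are illustrative names, not part of the crawler.
import threading
import time


def run_bounded(jobs, worker, limit=6):
    # Run worker(job) for each job while keeping at most `limit` threads alive.
    threads = []
    for job in jobs:
        # Wait for a free slot; a short sleep avoids pegging a CPU core the way
        # the bare `continue` busy-wait in the crawler does.
        while [t.is_alive() for t in threads].count(True) >= limit:
            time.sleep(0.1)
        # Prune finished threads so the list does not grow without bound.
        threads = [t for t in threads if t.is_alive()]
        t = threading.Thread(target=worker, args=(job, ), daemon=True)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()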
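# ------------------------------------------------------------------------------
# get_shipxy_thread deduplicates against Elasticsearch by scanning for the
# candidate '_id's first and skipping any that already exist. A hedged sketch
# of the same pattern using the official elasticsearch-py helpers; the Elastic
# wrapper above presumably does something similar, and the client setup and
# index name here are illustrative only.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan


def existing_ids(client, index_name, candidate_ids):
    # Return the subset of candidate_ids already present in index_name.
    query = {'query': {'bool': {'must': [{'terms': {'_id': candidate_ids}}]}}}
    return {hit['_id'] for hit in scan(client, query=query, index=index_name)}

# client = Elasticsearch(['http://localhost:9200'])
# fresh_docs = [d for d in docs
#               if d['_id'] not in existing_ids(client, 'shipxy', [d['_id'] for d in docs])]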
class dcard_crawler():
    def __init__(self):
        self.rdbra = RequestDcardByRESTfulAPI()
        self.es = Elastic(
            host=app.config['ES_SETTING']['CONNECTION']['HOST'],
            port=app.config['ES_SETTING']['CONNECTION']['PORT'],
            username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
            password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
        # for index_category, index_info in app.config['ES_SETTING']['ES_INDEX'].items():
        #     self.es.create_index(index_info['INDEX_NAME'], index_info['MAPPING_FILEPATH'])
        self.article_es_key_list = []
        with open(app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['MAPPING_FILEPATH'], 'r') as f:
            for key in json.loads(f.read())['mappings']['properties']:
                self.article_es_key_list.append(key)
        self.comment_es_key_list = []
        with open(app.config['ES_SETTING']['ES_INDEX']['COMMENT']['MAPPING_FILEPATH'], 'r') as f:
            for key in json.loads(f.read())['mappings']['properties']:
                self.comment_es_key_list.append(key)
        self.exist_index = {}

    ################################################################################
    # Utilities

    def crawler_run_over_multi_hours(self, start_time, hours=run_hours_limit):
        return (start_time + timedelta(hours=hours)) <= datetime.now()

    def gen_article_url(self, forum_alias, article_id):
        return 'https://www.dcard.tw/f/{}/p/{}'.format(forum_alias, article_id)

    def batch_load_retryer(self, input_batch_load_list):
        # In case the ES host is temporarily unreachable, probe the bulk load
        # in a retry loop first
        retry_n = 0
        while retry_n < retry_n_limit:
            try:
                self.es.batch_load(input_batch_load_list)
                break
            except:
                retry_n += 1
        # Once the retries are used up, try one last time; if that also fails,
        # the exception propagates and triggers the error mail
        if retry_n >= retry_n_limit:
            self.es.batch_load(input_batch_load_list)

    def format_dcard_article(self, input_dict):
        input_dict = format_datetime_dict(input_dict)
        dictionary = {
            '_id': input_dict['id'],
            '_index': app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                datetime.strptime(input_dict['createdAt'], '%Y-%m-%d %H:%M:%S').year),
            '_type': '_doc',
            'url': self.gen_article_url(input_dict['forumAlias'], input_dict['id'])
        }
        for key in self.article_es_key_list:
            if key in input_dict:
                dictionary[key] = input_dict[key]
        dictionary['websiteId'] = input_dict['forumId']
        dictionary['website'] = input_dict['forumName']
        dictionary['websiteAlias'] = input_dict['forumAlias']
        dictionary['time'] = input_dict['createdAt']
        dictionary['update_time'] = input_dict['updatedAt']
        dictionary['db_update_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        dictionary['reactionCount'] = input_dict['likeCount']
        if dictionary['_index'] not in self.exist_index:
            if not self.es.check_index_exist(dictionary['_index']):
                self.es.create_index(
                    dictionary['_index'],
                    app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['MAPPING_FILEPATH'])
            self.exist_index[dictionary['_index']] = True
        dictionary['reactions'] = {
            x['id']: x['count'] for x in dictionary['reactions']
        }
        dictionary['media_data'] = []
        url_dict = {x['url']: None for x in input_dict['media']}
        for mediaMeta_dict in input_dict['mediaMeta']:
            if mediaMeta_dict.get('normalizedUrl') \
                    and mediaMeta_dict.get('url') in url_dict:
                dictionary['media_data'].append({
                    'url': mediaMeta_dict['url'],
                    'normalizedUrl': mediaMeta_dict['normalizedUrl']
                })
        return dictionary

    def format_dcard_comment(self, input_dict, year, websiteId, website):
        input_dict = format_datetime_dict(input_dict)
        dictionary = {
            '_id': input_dict['id'],
            '_index': app.config['ES_SETTING']['ES_INDEX']['COMMENT']['INDEX_NAME_TEMPLATE'].format(year),
            '_type': '_doc'
        }
        for key in self.comment_es_key_list:
            if key in input_dict:
                dictionary[key] = input_dict[key]
        dictionary['media_data'] = []
        url_dict = {x['url']: None for x in input_dict['mediaMeta']}
        for mediaMeta_dict in input_dict['mediaMeta']:
            if mediaMeta_dict.get('normalizedUrl') \
                    and mediaMeta_dict.get('url') in url_dict:
                dictionary['media_data'].append({
                    'url': mediaMeta_dict['url'],
                    'normalizedUrl': mediaMeta_dict['normalizedUrl']
                })
        dictionary['key_no'] = input_dict['postId']
        dictionary['reactionCount'] = input_dict.get('likeCount')
        dictionary['time'] = input_dict['createdAt']
        dictionary['update_time'] = input_dict['updatedAt']
        dictionary['db_update_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        dictionary['websiteId'] = websiteId
        dictionary['website'] = website
        dictionary['websiteAlias'] = input_dict['websiteAlias']
        return dictionary

    ################################################################################

    def dcard_forums_crawler(self, sub_script_name):
        db.session.close()
        # Check whether the same scheduled job is still running on this machine
        if check_duplicate_process(sub_script_name):
            # Counting this process, two or more identical jobs are running
            print('{}: 有相同排程尚在執行({})'.format(sub_script_name, 1))
            return
        print(sub_script_name)
        err_code_startw = 0
        forums = self.rdbra.get_forums()
        # If the forum list cannot be fetched, this host has been blocked by Dcard
        if not forums:
            raise Exception('排程名稱: {}, 訊息: 機器被鎖({}, {})'.format(
                sub_script_name, err_code_startw, 1))
        print('排程名稱: {}, 訊息: Dcard看板共有 {} 個'.format(sub_script_name, len(forums)))
        forum_id_dict = {forum['id']: forum for forum in forums}
        db_forum_id_dict = {
            db_result.id: db_result for db_result in DcardForums.query.all()
        }
        print('排程名稱: {}, 訊息: DB內 Dcard看板有 {} 個'.format(
            sub_script_name, len(db_forum_id_dict)))
        for forum_id in list(set(forum_id_dict.keys()) - set(db_forum_id_dict.keys())):
            forum = forum_id_dict[forum_id]
            forum['pc_l30d'] = forum['postCount']['last30Days']
            forum['backtrack'] = 0
            forum['enable'] = 1
            db_forum_id_dict[forum['id']] = True
            db.session.add(DcardForums(**forum))
        nnn = 0
        for forum_id in list(set(db_forum_id_dict.keys()) - set(forum_id_dict.keys())):
            db_result = db_forum_id_dict[forum_id]
            db_result.exist = 0
            db.session.add(db_result)
            nnn += 1
        print('排程名稱: {}, 訊息: 存在於DB內,但Dcard已經關版的看板有 {} 個'.format(
            sub_script_name, nnn))
        DcardForums.query.filter(DcardForums.ac_time == None).update({'ac_status': 0})
        DcardForums.query.filter(DcardForums.cc_time == None).update({'cc_status': 0})
        db.session.commit()

    def dcard_article_crawler(self, sub_script_name):
        # Check whether the same scheduled job is still running on this machine
        if check_duplicate_process(sub_script_name):
            # Counting this process, two or more identical jobs are running
            print('{}: 有相同排程尚在執行({})'.format(sub_script_name, 1))
            return
        DcardForums.query.filter(DcardForums.ac_status == 1).update({
            'ac_status': 0,
            'ac_time': None
        })
        db.session.commit()
        err_code_startw = 1
        try:
            while True:
                db.session.rollback()
                db.session.close()
                DcardForums.query.filter(
                    DcardForums.ac_status == 1, DcardForums.ac_time != None,
                    DcardForums.ac_time <= (datetime.now() - timedelta(hours=6))
                ).update({'ac_status': 0})
                db.session.commit()
                forum_alias_db_result = DcardForums.query.filter(
                    DcardForums.ac_status == 0, DcardForums.enable == 1,
                    DcardForums.exist == 1, DcardForums.pc_l30d != 0).order_by(
                        asc(DcardForums.ac_time), asc(DcardForums.pc_l30d)).first()
                if not forum_alias_db_result:
                    break
                # The comment crawler has not caught up with the article progress;
                # no articles to crawl
                if (forum_alias_db_result.ac_time and forum_alias_db_result.cc_time
                        and forum_alias_db_result.ac_time > forum_alias_db_result.cc_time) \
                        or (not forum_alias_db_result.cc_time and forum_alias_db_result.ac_time):
                    print('排程名稱: {}, 訊息: {}'.format(
                        sub_script_name, '留言排程尚未追上文章排程進度,無文章可爬取'))
                    break
                crawler_start_time = datetime.now()
                forum_alias_db_result.ac_time = crawler_start_time
                forum_alias_db_result.ac_status = 1
                db.session.add(forum_alias_db_result)
                db.session.commit()
                forum_alias = forum_alias_db_result.alias
                before_id = None
                finish_status = False
                while not finish_status:
                    if not self.rdbra.request_dcard_status():
                        raise Exception('排程名稱: {}, 訊息: 機器被鎖({}, {})'.format(
                            sub_script_name, err_code_startw, 1))
                    params = pop_dict_empty_value_key({
                        'before': before_id,
                        'popular': 'false'
                    })
                    article_list = self.rdbra.get_article_list(forum_alias, params)
                    if not article_list:
                        print('排程名稱: {}, 無文章'.format(sub_script_name))
                        break
                    batch_load_list = []
                    before_id = '{}'.format(article_list[-1]['id'])
                    print('排程名稱: {}, 工作內容: 爬取看板 {} 七天內文章中, 排程啟動時間: {}, 回溯進度: {}'.format(
                        sub_script_name, forum_alias,
                        crawler_start_time.strftime('%Y-%m-%d %H:%M:%S'),
                        article_list[0]['createdAt']))
                    for article in article_list:
                        tmp_dict = self.rdbra.get_article_content(article['id'])
                        time.sleep(2)
                        if not tmp_dict or not tmp_dict.get('forumAlias'):
                            continue
                        batch_load_dict = self.format_dcard_article(tmp_dict)
                        if not batch_load_dict.get('content') and self.es.search_by_id(
                                app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME'],
                                '_doc', batch_load_dict['_id'])['found']:
                            continue
                        elif datetime.strptime(batch_load_dict['time'], '%Y-%m-%d %H:%M:%S') \
                                <= (crawler_start_time - timedelta(days=7)):
                            # Extra check: if the forum has had no articles within the
                            # last 7 days, still crawl at least the newest one into ES
                            # so the "have past articles been imported" check can run
                            if not self.es.count(
                                    {"query": {"bool": {"must": [{"match": {"websiteAlias": forum_alias}}]}}},
                                    app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                                        datetime.strptime(batch_load_dict['time'], '%Y-%m-%d %H:%M:%S').year)) \
                                    and not batch_load_list:
                                pass
                            else:
                                print('排程名稱: {}, 看板{}過去七天內文章已爬完'.format(
                                    sub_script_name, forum_alias))
                                finish_status = True
                                break
                        batch_load_list.append(batch_load_dict)
                    if batch_load_list:
                        self.batch_load_retryer(batch_load_list)
                forum_alias_db_result.ac_status = 0
                db.session.add(forum_alias_db_result)
                db.session.commit()
            while True:
                db.session.rollback()
                db.session.close()
                crawler_start_time = datetime.now()
                forum_alias_db_result = DcardForums.query.filter(
                    DcardForums.enable == 1, DcardForums.backtrack == 1,
                    DcardForums.exist == 1).order_by(asc(DcardForums.ac_time)).first()
                if not forum_alias_db_result:
                    print('排程名稱: {}, 訊息: 無需回溯的看板'.format(sub_script_name))
                    break
                forum_alias = forum_alias_db_result.alias
                # All forums together only get one hour for backtracking,
                # to avoid holding everything else up
                if self.crawler_run_over_multi_hours(crawler_start_time, hours=1):
                    print('排程名稱: {}, 訊息: {}'.format(
                        sub_script_name, '回溯執行超過一個小時,停止回溯看板 {}'.format(forum_alias)))
                    break
                # From here on, backtrack the forum's past articles
                print('排程名稱: {}, 開始檢查看板{}過去文章是否已匯入'.format(
                    sub_script_name, forum_alias))
                query = {
                    "from": 0,
                    "size": 1,
                    "sort": [{"time": "asc"}],
                    "query": {"term": {"websiteAlias": forum_alias}}
                }
                before_id = None
                year_n = 0
                es_result = None
                while True:
                    index_name = app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                        crawler_start_time.year - year_n)
                    if not self.es.check_index_exist(index_name):
                        break
                    tmp_es_result = self.es.search(query, index_name)
                    if tmp_es_result['hits']['hits']:
                        es_result = deepcopy(tmp_es_result)
                    else:
                        break
                    # Walk back one index year at a time, keeping the earliest hit
                    year_n += 1
                if not es_result or not es_result['hits']['hits']:
                    print('排程名稱: {}, 看板{}無任何文章'.format(sub_script_name, forum_alias))
                    continue
                before_id = '{}'.format(es_result['hits']['hits'][0]['_id'])
                finish_status = False
                while not finish_status:
                    # All forums together only get one hour for backtracking
                    if self.crawler_run_over_multi_hours(crawler_start_time, hours=1):
                        print('排程名稱: {}, 訊息: {}'.format(
                            sub_script_name, '回溯執行超過一個小時,停止回溯看板 {}'.format(forum_alias)))
                        break
                    params = {'before': before_id, 'popular': 'false'}
                    for key in list(params.keys()):
                        if not params[key]:
                            params.pop(key)
                    article_list = self.rdbra.get_article_list(forum_alias, params)
                    if not article_list:
                        print('排程名稱: {}, 看板{}已完成回溯所有文章'.format(
                            sub_script_name, forum_alias))
                        break
                    batch_load_list = []
                    before_id = '{}'.format(article_list[-1]['id'])
                    print('排程名稱: {}, 工作內容: 爬取看板 {} 七天內文章中, 排程啟動時間: {}, 回溯進度: {}'.format(
                        sub_script_name, forum_alias,
                        crawler_start_time.strftime('%Y-%m-%d %H:%M:%S'),
                        article_list[0]['createdAt']))
                    for article in article_list:
                        tmp_dict = self.rdbra.get_article_content(article['id'])
                        time.sleep(2)
                        if not tmp_dict or not tmp_dict.get('forumAlias'):
                            continue
                        batch_load_dict = self.format_dcard_article(tmp_dict)
                        batch_load_list.append(batch_load_dict)
                    if batch_load_list:
                        self.batch_load_retryer(batch_load_list)
                forum_alias_db_result.ac_status = 0
                db.session.add(forum_alias_db_result)
                db.session.commit()
        except Exception as e:
            subject = 'Dcard排程 {} 出現錯誤'.format(sub_script_name)
            message_list = [
                '{}\n'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                '{}\n'.format(str(e)),
                '{}\n'.format(traceback.format_exc())
            ]
            print(traceback.format_exc())
            gs = GmailSender(app.config['GOOGLE_SENDER_CONF']['FROM_ADDRESS'],
                             app.config['GOOGLE_SENDER_CONF']['RECEIVER_LIST'],
                             subject, '\n'.join(message_list))
            gs.send_email()

    def dcard_comment_crawler(self, sub_script_name):
        # Check whether the same scheduled job is still running on this machine
        if check_duplicate_process(sub_script_name):
            # Counting this process, two or more identical jobs are running
            print('{}: 有相同排程尚在執行({})'.format(sub_script_name, 1))
            return
        DcardForums.query.filter(DcardForums.cc_status == 1).update({
            'cc_status': 0,
            'cc_time': None
        })
        db.session.commit()
        err_code_startw = 2
        try:
            while True:
                db.session.rollback()
                db.session.close()
                DcardForums.query.filter(
                    DcardForums.cc_status == 1, DcardForums.cc_time != None,
                    DcardForums.cc_time <= (datetime.now() - timedelta(hours=6))
                ).update({'cc_status': 0})
                db.session.commit()
                forum_alias_db_result = DcardForums.query.filter(
                    DcardForums.enable == 1, DcardForums.ac_time != None,
                    DcardForums.ac_status == 0, DcardForums.cc_status == 0,
                    DcardForums.exist == 1, DcardForums.pc_l30d != 0).order_by(
                        asc(DcardForums.cc_time), asc(DcardForums.ac_time),
                        asc(DcardForums.pc_l30d)).first()
                if not forum_alias_db_result:
                    print('排程名稱: {}, 訊息: {} ({})'.format(
                        sub_script_name, '文章排程尚未追上留言排程進度,無留言可爬取', 1))
                    break
                elif forum_alias_db_result.cc_time \
                        and forum_alias_db_result.ac_time < forum_alias_db_result.cc_time:
                    print('排程名稱: {}, 訊息: {} ({})'.format(
                        sub_script_name, '文章排程尚未追上留言排程進度,無留言可爬取', 2))
                    break
                print(forum_alias_db_result.alias)
                crawler_start_time = datetime.now()
                forum_alias = forum_alias_db_result.alias
                forum_alias_db_result.cc_time = crawler_start_time
                forum_alias_db_result.cc_status = 1
                db.session.add(forum_alias_db_result)
                db.session.commit()
                article_index_year_list = [forum_alias_db_result.ac_time.year]
                if (forum_alias_db_result.ac_time - timedelta(days=7)).year \
                        != forum_alias_db_result.ac_time.year:
                    article_index_year_list.append(forum_alias_db_result.ac_time.year - 1)
                for article_index_year in article_index_year_list:
                    article_index_name = app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(article_index_year)
                    comment_index_name = app.config['ES_SETTING']['ES_INDEX']['COMMENT']['INDEX_NAME_TEMPLATE'].format(article_index_year)
                    if article_index_name not in self.exist_index \
                            and not self.es.check_index_exist(article_index_name):
                        continue
                    else:
                        self.exist_index[article_index_name] = True
                    if comment_index_name not in self.exist_index:
                        if not self.es.check_index_exist(comment_index_name):
                            self.es.create_index(
                                comment_index_name,
                                app.config['ES_SETTING']['ES_INDEX']['COMMENT']['MAPPING_FILEPATH'])
                        self.exist_index[comment_index_name] = True
                    article_query = {
                        "from": 0,
                        "size": 100,
                        "sort": [{"time": "asc"}],
                        "query": {
                            "bool": {
                                "must": [
                                    {"term": {"websiteAlias": forum_alias}},
                                    {"range": {"commentCount": {"gt": 0}}}
                                ]
                            }
                        }
                    }
                    comment_query = {
                        "from": 0,
                        "size": 1,
                        "query": {
                            "bool": {
                                "must": [{"term": {"websiteAlias": forum_alias}}]
                            }
                        }
                    }
                    # If this is not the forum's first comment-crawl run,
                    # add the time range filter
                    if self.es.search(
                            comment_query,
                            app.config['ES_SETTING']['ES_INDEX']['COMMENT']['INDEX_NAME_TEMPLATE'].format(
                                article_index_year))['hits']['hits']:
                        article_query['query']['bool']['must'].append({
                            "range": {
                                "time": {
                                    "gte": (forum_alias_db_result.ac_time -
                                            timedelta(days=7)).strftime('%Y-%m-%d %H:%M:%S'),
                                    "lte": forum_alias_db_result.ac_time.strftime('%Y-%m-%d %H:%M:%S')
                                }
                            }
                        })
                    while True:
                        es_result = self.es.search(
                            article_query,
                            app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                                article_index_year))
                        if not es_result['hits']['hits']:
                            break
                        article_query['from'] += article_query['size']
                        print('排程名稱: {}, 工作內容: 爬取看板 {} 七天內文章中, 排程啟動時間: {}, 回溯進度: {}'.format(
                            sub_script_name, forum_alias,
                            crawler_start_time.strftime('%Y-%m-%d %H:%M:%S'),
                            es_result['hits']['hits'][0]['_source']['time']))
                        for article_dict in es_result['hits']['hits']:
                            comment_list = self.rdbra.get_article_comments_by_num(
                                article_dict['_id'], input_sleep_time=2)
                            time.sleep(2)
                            if not comment_list:
                                continue
                            batch_load_list = []
                            for comment_dict in comment_list:
                                comment_dict['websiteAlias'] = forum_alias
                                batch_load_dict = self.format_dcard_comment(
                                    comment_dict, article_index_year,
                                    article_dict['_source']['websiteId'],
                                    article_dict['_source']['website'])
                                if not batch_load_dict.get('content') or (
                                        not batch_load_dict['content']
                                        and self.es.search_by_id(
                                            comment_index_name, '_doc',
                                            batch_load_dict['_id'])['found']):
                                    continue
                                batch_load_list.append(batch_load_dict)
                            if batch_load_list:
                                self.batch_load_retryer(batch_load_list)
                forum_alias_db_result.cc_status = 0
                db.session.add(forum_alias_db_result)
                db.session.commit()
                if forum_alias_db_result.backtrack != 1:
                    continue
                db.session.rollback()
                db.session.close()
                crawler_start_time = datetime.now()
                forum_alias_db_result.cc_time = crawler_start_time
                forum_alias_db_result.cc_status = 1
                db.session.add(forum_alias_db_result)
                db.session.commit()
                # From here on, backtrack the comments of the forum's past articles
                print('排程名稱: {}, 開始檢查看板{}過去文章留言是否已匯入'.format(
                    sub_script_name, forum_alias))
                year_n = 0
                article_earliest_time = None
                article_query = {
                    "from": 0,
                    "size": 1,
                    "sort": [{"time": "asc"}],
                    "query": {
                        "bool": {
                            "must": [
                                {"term": {"websiteAlias": forum_alias}},
                                {"range": {"commentCount": {"gt": 0}}}
                            ]
                        }
                    }
                }
                while True:
                    index_name = app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                        crawler_start_time.year - year_n)
                    if index_name not in self.exist_index:
                        if not self.es.check_index_exist(index_name):
                            break
                        self.exist_index[index_name] = True
                    es_result = self.es.search(article_query, index_name)
                    if not es_result['hits']['hits']:
                        break
                    article_earliest_time = datetime.strptime(
                        es_result['hits']['hits'][0]['_source']['time'],
                        '%Y-%m-%d %H:%M:%S')
                    year_n += 1
                year_n = 0
                comment_earliest_article_time = None
                comment_query = {
                    "from": 0,
                    "size": 1,
                    "sort": [{"postId": "asc"}],
                    "query": {
                        "bool": {
                            "must": [{"term": {"websiteAlias": forum_alias}}]
                        }
                    }
                }
                while True:
                    index_name = app.config['ES_SETTING']['ES_INDEX']['COMMENT']['INDEX_NAME_TEMPLATE'].format(
                        crawler_start_time.year - year_n)
                    if index_name not in self.exist_index:
                        if not self.es.check_index_exist(index_name):
                            break
                        self.exist_index[index_name] = True
                    es_result = self.es.search(comment_query, index_name)
                    if not es_result['hits']['hits']:
                        break
                    comment_earliest_article_time = datetime.strptime(
                        self.es.search_by_id(
                            app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                                crawler_start_time.year - year_n),
                            '_doc',
                            '{}'.format(es_result['hits']['hits'][0]['_source']['postId']))['_source']['time'],
                        '%Y-%m-%d %H:%M:%S')
                    year_n += 1
                if not article_earliest_time:
                    print('排程名稱: {} 訊息: 看板{}沒有已爬入ES的文章,故略過留言回溯'.format(
                        sub_script_name, forum_alias))
                    continue
                if not comment_earliest_article_time:
                    # No comments stored for this forum yet, so there is no
                    # backtrack cursor to start from
                    continue
                article_query = {
                    "from": 0,
                    "size": 10,
                    "sort": [{"time": "desc"}],
                    "query": {
                        "bool": {
                            "must": [
                                {"term": {"websiteAlias": forum_alias}},
                                {"range": {"commentCount": {"gt": 0}}},
                                {"range": {"time": {"lt": comment_earliest_article_time.strftime('%Y-%m-%d %H:%M:%S')}}}
                            ]
                        }
                    }
                }
                traceback_stop_status = False
                while comment_earliest_article_time > article_earliest_time:
                    article_index_name = app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                        comment_earliest_article_time.year)
                    if article_index_name not in self.exist_index:
                        if self.es.check_index_exist(article_index_name):
                            self.exist_index[article_index_name] = True
                        else:
                            break
                    # Each forum only gets one hour for backtracking,
                    # to avoid holding everything else up
                    elif self.crawler_run_over_multi_hours(crawler_start_time, 1):
                        print('排程名稱: {}, 訊息: {}'.format(
                            sub_script_name, '回溯執行超過一個小時,停止回溯留言 {}'.format(forum_alias)))
                        traceback_stop_status = True
                        break
                    es_result = self.es.search(
                        article_query,
                        app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                            comment_earliest_article_time.year))
                    if not es_result['hits']['hits']:
                        comment_earliest_article_time = datetime.strptime(
                            '{}-12-31 23:59:59'.format(comment_earliest_article_time.year - 1),
                            '%Y-%m-%d %H:%M:%S')
                        continue
                    print('排程名稱: {}, 工作內容: 爬取看板 {} 七天內文章中, 排程啟動時間: {}, 回溯進度: {}'.format(
                        sub_script_name, forum_alias,
                        crawler_start_time.strftime('%Y-%m-%d %H:%M:%S'),
                        es_result['hits']['hits'][0]['_source']['time']))
                    for article_dict in es_result['hits']['hits']:
                        comment_list = self.rdbra.get_article_comments_by_num(
                            article_dict['_id'], input_sleep_time=2)
                        time.sleep(2)
                        if not comment_list:
                            continue
                        batch_load_list = []
                        for comment_dict in comment_list:
                            comment_dict['websiteAlias'] = forum_alias
                            batch_load_dict = self.format_dcard_comment(
                                comment_dict,
                                datetime.strptime(article_dict['_source']['time'],
                                                  '%Y-%m-%d %H:%M:%S').year,
                                article_dict['_source']['websiteId'],
                                article_dict['_source']['website'])
                            if not batch_load_dict.get('content') or (
                                    not batch_load_dict['content']
                                    and self.es.search_by_id(
                                        batch_load_dict['_index'], '_doc',
                                        batch_load_dict['_id'])['found']):
                                continue
                            batch_load_list.append(batch_load_dict)
                        if batch_load_list:
                            self.batch_load_retryer(batch_load_list)
                    del article_query['query']['bool']['must'][-1]
                    comment_earliest_article_time = datetime.strptime(
                        es_result['hits']['hits'][-1]['_source']['time'],
                        '%Y-%m-%d %H:%M:%S')
                    article_query['query']['bool']['must'].append({
                        'range': {
                            'time': {
                                'lt': es_result['hits']['hits'][-1]['_source']['time']
                            }
                        }
                    })
                    article_query['from'] += article_query['size']
                forum_alias_db_result.cc_status = 0
                db.session.add(forum_alias_db_result)
                db.session.commit()
                # All forums together only get one hour to backtrack comments
                if traceback_stop_status:
                    break
        except Exception as e:
            subject = 'Dcard排程 {} 出現錯誤'.format(sub_script_name)
            message_list = [
                '{}\n'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                '{}\n'.format(str(e)),
                '{}\n'.format(traceback.format_exc())
            ]
            print(traceback.format_exc())
            gs = GmailSender(app.config['GOOGLE_SENDER_CONF']['FROM_ADDRESS'],
                             app.config['GOOGLE_SENDER_CONF']['RECEIVER_LIST'],
                             subject, '\n'.join(message_list))
            gs.send_email()
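# ------------------------------------------------------------------------------
# A hedged sketch of how dcard_crawler might be driven. In the real deployment
# each sub-crawler is presumably launched from its own scheduled script; the
# sub_script_name values below are illustrative placeholders.
def run_dcard_crawlers():
    dc = dcard_crawler()
    dc.dcard_forums_crawler('dcard_forums_crawler.py')    # sync the forum list
    dc.dcard_article_crawler('dcard_article_crawler.py')  # recent articles + backtrack
    dc.dcard_comment_crawler('dcard_comment_crawler.py')  # comments for crawled articles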
class myships_crawler_class():
    def __init__(self):
        self.machine_serial = int(os.environ.get('SERIAL', '0'))
        self.script_name = os.path.basename(__file__)
        self.err_msg_list = []
        self.thread_list = []
        self.thread_max_count = 30
        self.err_count = 0
        self.err_count_max = 3
        self.no_data_count = 0
        self.no_data_count_max = 10
        self.machine_count = len(serial_list)
        self.ship_detail_dict = {}
        self.headers = {
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Macintosh Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
        }
        # self.comparison_dict = {
        #     'updatetime': 'updatetime',
        #     'eta_timestamp': 'eta',
        #     'callsign': 'callsign',
        #     'cog': 'cog',
        #     'dest': 'destPort',
        #     'draught': 'draught',
        #     'hdg': 'heading',
        #     'imo': 'imo',
        #     'latitude': 'lat',
        #     'length': 'length',
        #     'longitude': 'lon',
        #     'mmsi': 'mmsi',
        #     'name': 'shipnameEn',
        #     'navistatus': 'aisNavStatus',
        #     'rot': 'rot',
        #     'shipid': 'shipId',
        #     'sog': 'sog',
        #     'utc_timestamp': 'posTime',
        #     'type': 'shiptype',
        #     'width': 'breadth',
        #     'y': 'y',
        #     'v': 'v'
        # }

    def err_msg_generator(self, err_msg):
        return '\n'.join(
            [datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self.ip, err_msg])

    def save2es(self):
        batch_load_list = []
        for key_id in list(self.ship_detail_dict.keys()):
            id_list = [
                '{}_{}'.format(ship_detail_dict['mmsi'], ship_detail_dict['posTime'])
                for ship_detail_dict in self.ship_detail_dict[key_id]
            ]
            if id_list:
                es_ship_ids = set([
                    data['_id'] for data in self.es.scan(
                        {'query': {'bool': {'must': [{'terms': {'_id': id_list}}]}}},
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'])
                ])
            else:
                es_ship_ids = set()
            for ship_detail_dict in self.ship_detail_dict[key_id]:
                _id = '{}_{}'.format(ship_detail_dict['mmsi'], ship_detail_dict['posTime'])
                if _id in es_ship_ids:
                    continue
                dictionary = {
                    '_index': app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                    '_type': '_doc',
                    '_id': _id,
                    '_routing': '{}'.format(
                        (datetime.utcfromtimestamp(ship_detail_dict['posTime']) +
                         timedelta(hours=8)).year),
                    'updatetime': ship_detail_dict['updatetime'],
                    'eta_timestamp': ship_detail_dict['eta'],
                    'time': (datetime.utcfromtimestamp(ship_detail_dict['posTime']) +
                             timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'),
                    'callsign': ship_detail_dict['callsign'],
                    'nationality': self.mmsi_dict[ship_detail_dict['mmsi'][:3]]
                    if ship_detail_dict['mmsi'][:3] in self.mmsi_dict else None,
                    'cog': ship_detail_dict['cog'] / 10 if ship_detail_dict['cog'] else None,
                    'dest': ship_detail_dict['destPort'],
                    'draught': ship_detail_dict['draught'] / 10 if ship_detail_dict['draught'] else None,
                    'hdg': ship_detail_dict['heading'],
                    'imo': ship_detail_dict['imo'],
                    'latitude': ship_detail_dict['lat'] / 600000,
                    'length': ship_detail_dict['length'],
                    'longitude': ship_detail_dict['lon'] / 600000,
                    'mmsi': ship_detail_dict['mmsi'],
                    'name': ship_detail_dict['shipnameEn'],
                    'navistatus': ship_detail_dict['aisNavStatus'],
                    'rot': ship_detail_dict['rot'],
                    'shipid': ship_detail_dict['shipId'],
                    'sog': ship_detail_dict['sog'] / 10 if ship_detail_dict['sog'] else None,
                    'utc_timestamp': ship_detail_dict['posTime'],
                    'type': ship_detail_dict['shiptype'],
                    'width': ship_detail_dict['breadth'],
                    'y': ship_detail_dict['shiptype'],
                    'v': ship_detail_dict['aisNavStatus']
                }
                # type sometimes contains garbage such as '6857-d&0' or '1607U No.158'
                try:
                    dictionary['type'] = int(dictionary['type'])
                except:
                    dictionary['type'] = None
                try:
                    dictionary['navistatus'] = int(dictionary['navistatus'])
                except:
                    dictionary['navistatus'] = None
                if ship_detail_dict['eta']:
                    eta_datetime = datetime.utcfromtimestamp(ship_detail_dict['eta'])
                    dictionary['eta'] = eta_datetime.strftime('%m-%d %H:%M')
                    dictionary['eta_datetime'] = eta_datetime.strftime('%Y-%m-%d %H:%M:%S')
                else:
                    dictionary['eta'] = None
                    dictionary['eta_datetime'] = None
                if dictionary['type'] in self.ship_type_dict:
                    dictionary['type_text'] = self.ship_type_dict[dictionary['type']]
                if dictionary['navistatus'] in self.navistatus_type_dict:
                    dictionary['navistatus_text'] = self.navistatus_type_dict[dictionary['navistatus']]
                dictionary['y'] = dictionary['type']
                dictionary['v'] = dictionary['navistatus']
                batch_load_list.append(dictionary)
            del self.ship_detail_dict[key_id]
        self.es.batch_load(batch_load_list)

    def get_ship_detail(self, shipId_list_for_func):
        input_json = {"shipId": ','.join(shipId_list_for_func)}
        try:
            rsp = RequestsRetryer(
                'post', {
                    'url': app.config['CRAWLER_SETTING']['MYSHIPS']['SHIP_DETAIL'],
                    'headers': self.headers,
                    'json': input_json,
                    'timeout': 180
                },
                req_retry_limit=3,
                req_retry_sleeptime=5)
            rsp.close()
        except:
            self.err_count += 1
            self.err_msg_list.append(self.err_msg_generator(traceback.format_exc()))
            return
        if rsp.status_code != 200:
            self.err_count += 1
            self.err_msg_list.append(self.err_msg_generator(rsp.text))
            return
        try:
            rsp_result = rsp.json()
        except:
            self.err_count += 1
            self.err_msg_list.append(self.err_msg_generator(traceback.format_exc()))
            return
        if rsp_result['code'] != '0' or rsp_result['message'] != '成功':
            pprint(rsp_result)
            self.err_count += 1
            self.err_msg_list.append(self.err_msg_generator(rsp.text))
            return
        key_id = f'{uuid4()}'
        self.ship_detail_dict[key_id] = []
        for ship_detail_dict in rsp_result['data']:
            # Skip records missing the minimum set of position fields
            if not ship_detail_dict['mmsi'] \
                    or not ship_detail_dict['lon'] \
                    or not ship_detail_dict['lat'] \
                    or not ship_detail_dict['posTime']:
                continue
            ship_detail_dict['v'] = None
            ship_detail_dict['y'] = None
            ship_detail_dict['updatetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self.ship_detail_dict[key_id].append(ship_detail_dict)
        if not self.ship_detail_dict[key_id]:
            self.no_data_count += 1
            return

    # Sample SHIP_DETAIL response:
    # {
    #     "code": "0",
    #     "count": 1,
    #     "message": "成功",
    #     "data": [
    #         {
    #             "posTime": 1606964270,
    #             "lon": 70842352,
    #             "lat": 14671966,
    #             "sog": 0,
    #             "cog": 2612,
    #             "heading": 359,
    #             "rot": 0,
    #             "aisNavStatus": "0",
    #             "mmsi": "413698780",
    #             "shipnameEn": "JIN YING 6",
    #             "imo": "7549747",
    #             "callsign": "BSKF",
    #             "shiptype": "60",
    #             "length": 37,
    #             "breadth": 10,
    #             "eta": 1599067020,
    #             "destPort": "XIAMEN",
    #             "draught": 18,
    #             "shipId": 2029
    #         }
    #     ]
    # }

    def myships_crawler_func(self):
        try:
            self.ip = get_external_ip()
        except:
            self.ip = None
        try:
            # Check whether the same scheduled job is still running on this machine
            if check_same_process_still_running(self.script_name):
                # Counting this process, two or more identical jobs are running
                print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
                return
            if not self.ip:
                raise Exception('無法取得 IP')
            try:
                rsp = requests.get(
                    app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'],
                    timeout=60)
                rsp.close()
            except:
                raise Exception('無法連線至寶船網網頁 :\n{}'.format(traceback.format_exc()))
            if rsp.status_code != 200:
                raise Exception('寶船網網頁無法正確連線 :\n{}'.format(rsp.text))
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME']):
                print(self.es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['MAPPING_FILEPATH']))
            self.ship_type_dict = {
                x.type: x.name for x in ShipTypeMyships.query.all()
            }
            self.navistatus_type_dict = {
                x.type: x.name for x in NavistatusTypeMyships.query.all()
            }
            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2, MMSI_Info.alpha_3).all():
                self.mmsi_dict[db_result.mmsi] = \
                    db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2
            # start_n = 4000000 + self.machine_serial
            start_n = self.machine_serial
            while True:
                # Stop shortly before the next hourly run starts
                if datetime.now().minute >= 59 and datetime.now().second > 30:
                    return
                print(start_n)
                end_n = start_n + 1000 * self.machine_count
                shipId_list = [
                    f'{i}' for i in range(start_n, end_n, self.machine_count)
                ]
                start_n = end_n
                thread = threading.Thread(target=self.get_ship_detail,
                                          args=(shipId_list, ),
                                          daemon=True)
                thread.start()
                self.thread_list.append(thread)
                # for thread in self.thread_list:
                #     thread.join()
                # pprint(self.ship_detail_dict)
                # if self.ship_detail_dict:
                #     self.save2es()
                # pprint(self.ship_detail_dict)
                # return
                while [thread.is_alive() for thread in self.thread_list].count(True) \
                        >= self.thread_max_count:
                    continue
                delete_index_list = []
                for index, thread in enumerate(self.thread_list):
                    if not thread.is_alive():
                        delete_index_list.append(index)
                delete_index_list.reverse()
                for index in delete_index_list:
                    del self.thread_list[index]
                if self.err_count >= self.err_count_max:
                    raise Exception('\n\n'.join(self.err_msg_list))
                if self.no_data_count >= self.no_data_count_max:
                    break
                if self.ship_detail_dict:
                    self.save2es()
                time.sleep(1)
            for thread in self.thread_list:
                thread.join()
            if self.ship_detail_dict:
                self.save2es()
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(self.ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            print('\n\n'.join(self.err_msg_list))
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(self.err_msg_list))
            ggg.send_email()
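# ------------------------------------------------------------------------------
# myships_crawler_func partitions the shipId space across machines with
# range(start_n, end_n, machine_count) plus a per-machine SERIAL offset.
# A minimal sketch of that interleaved sharding; `shard_ids` is an
# illustrative helper, not part of the crawler above.
def shard_ids(serial, machine_count, start=0, batch=1000):
    # Yield successive `batch`-sized lists of shipIds owned by machine `serial`.
    start_n = start + serial
    while True:
        end_n = start_n + batch * machine_count
        yield [f'{i}' for i in range(start_n, end_n, machine_count)]
        start_n = end_n

# e.g. with three machines, SERIAL=1 takes shipIds 1, 4, 7, ... in 1000-ID batches:
# batches = shard_ids(serial=1, machine_count=3)
# first_batch = next(batches)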
def check_rumor_func():
    script_name = os.path.basename(__file__)
    es = Elastic(host=['118.163.94.26'],
                 port=17377,
                 username='******',
                 password='******')
    query_template = {
        "from": 0,
        "size": 10000,
        "sort": [{
            "create_time": "asc"
        }],
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "source": 'mygopen'
                    }
                }, {
                    "exists": {
                        'field': 'file'
                    }
                }, {
                    "exists": {
                        'field': 'message'
                    }
                }]
            }
        }
    }
    create_time_gt_old = None
    create_time_gt = (datetime.strptime(
        es.search(query_template, 'fakenews@rumor_grouping')['hits']['hits'][0]
        ['_source']['create_time'], '%Y-%m-%d %H:%M:%S') -
                      timedelta(seconds=1)).strftime('%Y-%m-%d %H:%M:%S')
    dictionary_list_1 = []
    dictionary_list_2 = []
    while True:
        if create_time_gt_old and create_time_gt_old == create_time_gt:
            break
        print(create_time_gt)
        query = deepcopy(query_template)
        query['query']['bool']['must'].append(
            {"range": {
                "create_time": {
                    "gt": create_time_gt
                }
            }})
        search_result = es.search(query, 'fakenews@rumor_grouping')
        if not search_result['hits']['hits']:
            break
        for data in search_result['hits']['hits']:
            dictionary = OrderedDict()
            dictionary['id'] = data['_id']
            dictionary.update(data['_source'])
            if len(dictionary['file']) == 1:
                dictionary_list_1.append(dictionary)
            else:
                dictionary_list_2.append(dictionary)
        # Advance the cursor from the batch just fetched; the original re-ran
        # query_template here, so the cursor stopped moving after one page.
        create_time_gt_old = deepcopy(create_time_gt)
        create_time_gt = search_result['hits']['hits'][-1]['_source'][
            'create_time']
    export_dict = {
        'rumor_grouping檢查-一個檔案.xlsx': dictionary_list_1[:100],
        'rumor_grouping檢查-多個檔案.xlsx': dictionary_list_2[:100],
    }
    for excel_filename, dictionary_list in export_dict.items():
        # The original hard-coded the single-file workbook and
        # dictionary_list_1 on both iterations; use the loop variables so each
        # list lands in its own file.
        excel_output_path = './{}'.format(excel_filename)
        content = json.dumps(dictionary_list, ensure_ascii=False, indent=4)
        tmp_json_path = 'tmp.json'
        f = open(tmp_json_path, 'w')
        f.write(content)
        f.close()
        pandas.read_json(tmp_json_path).to_excel(excel_output_path,
                                                 sheet_name='rumor_grouping',
                                                 index=False)
        if os.path.exists(tmp_json_path):
            os.remove(tmp_json_path)
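# #############################
# check_rumor_func pages through fakenews@rumor_grouping with a manual
# create_time cursor, which can skip or repeat documents that share a
# timestamp.  Below is a minimal sketch of the same scan using Elasticsearch's
# search_after; it assumes the Elastic.search(query, index) wrapper forwards
# the body unchanged, and that the cluster permits sorting on _id as a
# tiebreaker (otherwise substitute any unique keyword field).
def scan_rumor_grouping(es, index='fakenews@rumor_grouping', page_size=1000):
    query = {
        'size': page_size,
        'sort': [{'create_time': 'asc'}, {'_id': 'asc'}],
        'query': {'bool': {'must': [
            {'term': {'source': 'mygopen'}},
            {'exists': {'field': 'file'}},
            {'exists': {'field': 'message'}},
        ]}},
    }
    search_after = None
    while True:
        if search_after:
            query['search_after'] = search_after
        hits = es.search(query, index)['hits']['hits']
        if not hits:
            break
        for hit in hits:
            yield hit
        # resume after the last hit's sort values; no date arithmetic needed
        search_after = hits[-1]['sort']
# #############################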
def myships_crawler_func(): script_name = os.path.basename(__file__) # 檢查這台機器是否有同排程還在執行 if check_same_process_still_running(script_name): # 代表包含這個程式在內,有兩個以上相同的排程正在運行 print('{}: 有相同排程尚在執行({})'.format(script_name, 1)) return try: es = Elastic( host=app.config['ES_SETTING']['CONNECTION']['HOST'], port=app.config['ES_SETTING']['CONNECTION']['PORT'], username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'], password=app.config['ES_SETTING']['CONNECTION']['PASSWORD']) if not es.check_index_exist(app.config['ES_SETTING']['INDEX_INFO'] ['MYSHIPS']['INDEX_NAME']): print( es.create_index( app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO'] ['MYSHIPS']['MAPPING_FILEPATH'])) db_result_list = AreaList.query.with_entities( AreaList.id, AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all() if not db_result_list: print('{}: 無區域的排程區間資料'.format(script_name)) return crawl_span_dict = { db_result.id: db_result.crawl_span for db_result in db_result_list } query_sort_conds = [SubAreaList.area_list_id] query_sort_conds.extend([x.id for x in db_result_list]) cold_zone_ids = set([ db_result.id for db_result in AreaList.query.filter( AreaList.enable == 1, AreaList.name.like('%冷區%')).all() ]) ship_type_dict = {x.type: x.name for x in ShipTypeMyships.query.all()} navistatus_type_dict = { x.type: x.name for x in NavistatusTypeMyships.query.all() } mc = Myships_Crawler() while True: db.session.rollback() db.session.close() batch_load_list = [] db_result = SubAreaList.query.filter( SubAreaList.enable == 1, SubAreaList.web == 'myships', or_(SubAreaList.next_time <= datetime.now(), SubAreaList.next_time == None), or_(*[ SubAreaList.area_list_id == id for id in crawl_span_dict.keys() ])).order_by(sqlalchemy.func.field(*query_sort_conds), asc(SubAreaList.next_time), func.random()).first() if not db_result: print('{}: 完成'.format(script_name)) return print('{}: 爬取區域 {} 中'.format(script_name, db_result.id)) crawler_time = datetime.now() - timedelta( minutes=datetime.now().minute % crawl_span_dict[db_result.area_list_id]) old_crawler_time = deepcopy(db_result.crawler_time) old_next_time = deepcopy(db_result.next_time) db_result.crawler_time = datetime.strptime( crawler_time.strftime('%Y-%m-%d %H:%M:00'), '%Y-%m-%d %H:%M:%S') db_result.next_time = db_result.crawler_time + timedelta( minutes=crawl_span_dict[db_result.area_list_id]) db.session.add(db_result) db.session.commit() if db_result.lu_lat==db_result.rd_lat \ or db_result.lu_lng==db_result.rd_lng: continue # ma_cookies_list = [x.cookies for x in MyshipsAccount.query.filter(MyshipsAccount.enable==1, MyshipsAccount.updating==0).all()] ma_cookies_list = [] try: area_result = mc.area_info( min([db_result.lu_lat, db_result.rd_lat]), min([db_result.lu_lng, db_result.rd_lng]), max([db_result.lu_lat, db_result.rd_lat]), max([db_result.lu_lng, db_result.rd_lng]), ma_cookies_list) except: db_result.crawler_time = old_crawler_time db_result.next_time = old_next_time db.session.add(db_result) db.session.commit() msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '{}\n{}'.format('取得區域船隻資料出現錯誤,請檢查是資策會端網路出現錯誤還是寶船網網站異常', traceback.format_exc()), ]) print(msg) ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) ggg.send_email() continue if area_result['code'] != '0': msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '\n'.join([ 
'取得寶船網區域船隻資料時出現錯誤', '{}'.format(area_result), 'id :{}'.format(db_result.id), 'area_list_id :{}'.format(db_result.area_list_id), '{}'.format({ 'age': 1440, 'rgn': mc.check_trans2myships_coord( [[ min([db_result.lu_lat, db_result.rd_lat]), min([db_result.lu_lng, db_result.rd_lng]) ], [ max([db_result.lu_lat, db_result.rd_lat]), max([db_result.lu_lng, db_result.rd_lng]) ]]) }), '{}'.format([[db_result.lu_lat, db_result.lu_lng], [db_result.rd_lat, db_result.rd_lng]]) ]) ]) print(msg) ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) ggg.send_email() continue tmp_area_data_list = area_result.pop('data') area_result['data'] = [] for area_data in tmp_area_data_list: if not area_data.get('m') \ or area_data['m']=='0': continue area_result['data'].append(area_data) # 該區域沒有任何船隻資料的話,略過 if not area_result['data']: print('{}: Skip Area {}'.format(script_name, db_result.id)) continue else: print('{}: 區域 {} 有 {} 艘船隻'.format(script_name, db_result.id, len(area_result['data']))) id_list = [] ship_data_dict = mc.ship_info( [area_data['i'] for area_data in area_result['data']]) for area_data in area_result['data']: if area_data['i'] not in ship_data_dict: print(area_data['i']) continue id_list.append('{}_{}'.format( area_data['m'], ship_data_dict[area_data['i']]['posTime'])) if id_list: es_ship_ids = set([ data['_id'] for data in es.scan( { 'query': { 'bool': { 'must': [{ 'terms': { '_id': id_list } }] } } }, app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME']) ]) else: es_ship_ids = set() for area_data in area_result['data']: if area_data['i'] not in ship_data_dict: print(area_data['i']) continue ship_data = ship_data_dict[area_data['i']] try: dictionary = { '_index': app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], '_type': '_doc', '_id': '{}_{}'.format(area_data['m'], ship_data['posTime']), '_routing': '{}'.format( (datetime.utcfromtimestamp(ship_data['posTime']) + timedelta(hours=8)).year) if ship_data['posTime'] else None, 'updatetime': ship_data['updatetime'] if ship_data.get('updatetime') else datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'eta_timestamp': ship_data['eta'], 'eta': area_data['r'], 'time': (datetime.utcfromtimestamp(ship_data['posTime']) + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'), 'y': area_data['y'] } except: msg = '\n'.join([ traceback.format_exc(), '{}'.format(area_data), '{}'.format(ship_data) ]) ggg = GmailSender( '船隻爬蟲出現錯誤-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) ggg.send_email() continue if dictionary['_id'] in es_ship_ids: continue try: dictionary['v'] = int(area_data['v']) except: dictionary['v'] = None if not dictionary['eta_timestamp']: dictionary['eta_timestamp'] = None dictionary['eta_datetime'] = None else: dictionary['eta_datetime'] = ( datetime.utcfromtimestamp(dictionary['eta_timestamp']) + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S') for source_key, new_key in format_batch_load_dict.items(): if source_key in area_data: dictionary[new_key] = area_data[source_key] elif source_key in ship_data: dictionary[new_key] = ship_data[source_key] dictionary['shipid'] = '{}'.format(dictionary['shipid']) for key, divisor in format_data_content.items(): if dictionary.get(key): dictionary[key] = round(dictionary[key] / divisor, 6) for key in list(dictionary.keys()): if type(dictionary[key]) is not str: continue dictionary[key] = dictionary[key].strip() if not dictionary[key] or dictionary[key] == 'NULL': dictionary[key] = None for key in ['navistatus', 
'rot', 'type', 'y']:
                    # The original wrote `type(dictionary[key] is not int)`,
                    # which takes the type of a boolean and is always truthy.
                    if dictionary.get(key) and type(
                            dictionary[key]) is not int:
                        dictionary[key] = int(dictionary[key])
                if dictionary['type'] not in ship_type_dict:
                    sts_db_result = ShipTypeMyships.query.filter(
                        ShipTypeMyships.type == dictionary['type']).first()
                    if sts_db_result:
                        ship_type_dict[sts_db_result.type] = sts_db_result.name
                    else:
                        ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = ship_type_dict[dictionary['type']]
                if dictionary['navistatus'] not in navistatus_type_dict:
                    nt_db_result = NavistatusTypeMyships.query.filter(
                        NavistatusTypeMyships.type ==
                        dictionary['navistatus']).first()
                    if nt_db_result:
                        navistatus_type_dict[
                            nt_db_result.type] = nt_db_result.name
                    else:
                        navistatus_type_dict[dictionary['navistatus']] = None
                dictionary['navistatus_text'] = navistatus_type_dict[
                    dictionary['navistatus']]
                batch_load_list.append(dictionary)
            if batch_load_list:
                es.batch_load(batch_load_list)
            # #############################
            # if len(batch_load_list)>2:
            #     return
            # #############################
    except:
        msg = '\n\n'.join([
            'ip: {}'.format(get_external_ip()),
            '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
        ggg.send_email()
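# #############################
# The coercion loop above casts navistatus/rot/type/y to int one key at a
# time (the original wrote `type(dictionary[key] is not int)`, which checks
# the type of a boolean and is therefore always truthy).  Below is a minimal
# helper that states the intent directly; the name coerce_int_fields is an
# assumption for illustration, not part of the original crawler.
def coerce_int_fields(dictionary, keys=('navistatus', 'rot', 'type', 'y')):
    """Cast the listed keys to int in place, skipping falsy/missing values."""
    for key in keys:
        value = dictionary.get(key)
        if value and not isinstance(value, int):
            # may still raise for garbage values such as '6857-d&0'
            dictionary[key] = int(value)
    return dictionary
# #############################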
class myships_crawler_class(): def __init__(self): self.script_name = os.path.basename(__file__) self.thread_error_count = 0 self.err_msg_list = [] self.thread_max_count = 20 self.thread_list = [] self.rollback_id_list = [] self.thread_error_count_max = self.thread_max_count * 2 self.time_dict = {'old_crawler_time': {}, 'old_next_time': {}} def myships_thread(self, db_result_dict): try: batch_load_list = [] try: area_result = self.mc.area_info( min([db_result_dict['lu_lat'], db_result_dict['rd_lat']]), min([db_result_dict['lu_lng'], db_result_dict['rd_lng']]), max([db_result_dict['lu_lat'], db_result_dict['rd_lat']]), max([db_result_dict['lu_lng'], db_result_dict['rd_lng']])) except: self.thread_error_count += 1 self.rollback_id_list.append(db_result_dict['id']) msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '{}\n{}'.format('取得區域船隻資料出現錯誤,請檢查是資策會端網路出現錯誤還是寶船網網站異常', traceback.format_exc()), ]) print(msg) self.err_msg_list.append(msg) # ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) # ggg.send_email() return if area_result['code'] != '0': self.thread_error_count += 1 msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '\n'.join([ '取得寶船網區域船隻資料時出現錯誤', '{}'.format(area_result), 'id :{}'.format(db_result_dict['id']), 'area_list_id :{}'.format( db_result_dict['area_list_id']), '{}'.format([[ db_result_dict['lu_lat'], db_result_dict['lu_lng'] ], [ db_result_dict['rd_lat'], db_result_dict['rd_lng'] ]]) ]) ]) print(msg) self.err_msg_list.append(msg) # ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) # ggg.send_email() return tmp_area_data_list = area_result.pop('data') area_result['data'] = [] for area_data in tmp_area_data_list: if not area_data.get('m') \ or area_data['m']=='0': continue area_result['data'].append(area_data) # 該區域沒有任何船隻資料的話,略過 if not area_result['data']: print('{}: Skip Area {}'.format(self.script_name, db_result_dict['id'])) return else: print('{}: 區域 {} 有 {} 艘船隻'.format(self.script_name, db_result_dict['id'], len(area_result['data']))) id_list = [] ship_data_dict = self.mc.ship_info( [area_data['i'] for area_data in area_result['data']]) for area_data in area_result['data']: if area_data['i'] not in ship_data_dict: print(area_data['i']) continue id_list.append('{}_{}'.format(area_data['m'], area_data['t'])) if id_list: es_ship_ids = set([ data['_id'] for data in self.es.scan( { 'query': { 'bool': { 'must': [{ 'terms': { '_id': id_list } }] } } }, app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME']) ]) else: es_ship_ids = set() for area_data in area_result['data']: if area_data['i'] not in ship_data_dict: print(area_data['i']) continue # 有時候拉船隻詳細資料 posTime 會是 Null, 這時改為區域船隻資料的船隻資料時間點 ship_data = ship_data_dict[area_data['i']] try: dictionary = { '_index': app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], '_type': '_doc', '_id': '{}_{}'.format(area_data['m'], area_data['t']), '_routing': '{}'.format( (datetime.utcfromtimestamp(area_data['t']) + timedelta(hours=8)).year) if area_data['t'] else None, 'updatetime': area_data['updatetime'] if area_data.get('updatetime') else datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'eta_timestamp': ship_data['eta'], 'eta': area_data['r'], 'time': (datetime.utcfromtimestamp(area_data['t']) + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'), 'y': area_data['y'] } 
                except:
                    self.thread_error_count += 1
                    msg = '\n'.join([
                        traceback.format_exc(), '{}'.format(area_data),
                        '{}'.format(ship_data)
                    ])
                    print(msg)
                    self.err_msg_list.append(msg)
                    ggg = GmailSender(
                        '船隻爬蟲船隻資料出現異常-{}'.format(self.script_name),
                        app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
                    ggg.send_email()
                    continue
                if dictionary['_id'] in es_ship_ids:
                    continue
                try:
                    dictionary['v'] = int(area_data['v'])
                except:
                    dictionary['v'] = None
                if not dictionary['eta_timestamp']:
                    dictionary['eta_timestamp'] = None
                    dictionary['eta_datetime'] = None
                else:
                    dictionary['eta_datetime'] = (
                        datetime.utcfromtimestamp(dictionary['eta_timestamp'])
                        + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
                for source_key, new_key in format_batch_load_dict.items():
                    if source_key in area_data:
                        dictionary[new_key] = area_data[source_key]
                    elif source_key in ship_data:
                        dictionary[new_key] = ship_data[source_key]
                dictionary['shipid'] = '{}'.format(dictionary['shipid'])
                for key, divisor in format_data_content.items():
                    if dictionary.get(key):
                        dictionary[key] = round(dictionary[key] / divisor, 6)
                for key in list(dictionary.keys()):
                    if type(dictionary[key]) is not str:
                        continue
                    dictionary[key] = dictionary[key].strip()
                    if not dictionary[key] or dictionary[key] == 'NULL':
                        dictionary[key] = None
                for key in ['navistatus', 'rot', 'type', 'y']:
                    # The original wrote `type(dictionary[key] is not int)`,
                    # which takes the type of a boolean and is always truthy.
                    if dictionary.get(key) and type(
                            dictionary[key]) is not int:
                        dictionary[key] = int(dictionary[key])
                if dictionary['type'] not in self.ship_type_dict:
                    sts_db_result = ShipTypeMyships.query.filter(
                        ShipTypeMyships.type == dictionary['type']).first()
                    if sts_db_result:
                        self.ship_type_dict[
                            sts_db_result.type] = sts_db_result.name
                    else:
                        self.ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = self.ship_type_dict[
                    dictionary['type']]
                if dictionary['navistatus'] not in self.navistatus_type_dict:
                    nt_db_result = NavistatusTypeMyships.query.filter(
                        NavistatusTypeMyships.type ==
                        dictionary['navistatus']).first()
                    if nt_db_result:
                        self.navistatus_type_dict[
                            nt_db_result.type] = nt_db_result.name
                    else:
                        self.navistatus_type_dict[
                            dictionary['navistatus']] = None
                dictionary['navistatus_text'] = self.navistatus_type_dict[
                    dictionary['navistatus']]
                batch_load_list.append(dictionary)
            if batch_load_list:
                self.es.batch_load(batch_load_list)
        except:
            self.thread_error_count += 1
            msg = traceback.format_exc()
            print(msg)
            self.err_msg_list.append(msg)

    def myships_crawler_func(self):
        # Check whether the same scheduled job is already running on this
        # machine (i.e. two or more copies of this script, including this one).
        if check_same_process_still_running(self.script_name):
            print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
            return
        try:
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['MYSHIPS']['MAPPING_FILEPATH']))
            db_result_list = AreaList.query.with_entities(
                AreaList.id,
                AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
            if not db_result_list:
                print('{}: 無區域的排程區間資料'.format(self.script_name))
                return
            self.crawl_span_dict = {
                db_result.id: db_result.crawl_span
                for db_result in db_result_list
            }
            self.query_sort_conds = [SubAreaList.area_list_id]
            self.query_sort_conds.extend([x.id for x in db_result_list])
            self.ship_type_dict = {
                x.type: x.name
                for x in
ShipTypeMyships.query.all() } self.navistatus_type_dict = { x.type: x.name for x in NavistatusTypeMyships.query.all() } self.mc = Myships_Crawler() while True: if datetime.now().minute>57 \ and datetime.now().second>30: return db.session.rollback() db.session.close() if self.thread_error_count >= self.thread_error_count_max: self.err_msg_list = list(set(self.err_msg_list)) raise Exception('\n\n'.join(self.err_msg_list)) del_index_list = [] for index, thread in enumerate(self.thread_list): if not thread.is_alive(): del_index_list.append(index) del_index_list.reverse() for index in del_index_list: del (self.thread_list[index]) if self.rollback_id_list: while self.rollback_id_list: db_result = SubAreaList.query.filter( SubAreaList.id == self.rollback_id_list[0]).first() db_result.crawler_time = self.time_dict[ 'old_crawler_time'][self.rollback_id_list[0]] db_result.next_time = self.time_dict['old_next_time'][ self.rollback_id_list[0]] db.session.add(db_result) del (self.time_dict['old_crawler_time'][ self.rollback_id_list[0]]) del (self.time_dict['old_next_time'][ self.rollback_id_list[0]]) del (self.rollback_id_list[0]) db.session.commit() db_result = SubAreaList.query.filter( SubAreaList.enable == 1, SubAreaList.web == 'myships', or_(SubAreaList.next_time <= datetime.now(), SubAreaList.next_time == None), or_(*[ SubAreaList.area_list_id == id for id in self.crawl_span_dict.keys() ])).order_by(sqlalchemy.func.field(*self.query_sort_conds), asc(SubAreaList.next_time), func.random()).first() if not db_result: if self.thread_list: for thread in self.thread_list: thread.join() continue print('{}: 完成'.format(self.script_name)) return print('{}: 爬取區域 {} 中'.format(self.script_name, db_result.id)) crawler_time = datetime.now() - timedelta( minutes=datetime.now().minute % self.crawl_span_dict[db_result.area_list_id]) self.time_dict['old_crawler_time'][db_result.id] = deepcopy( db_result.crawler_time) self.time_dict['old_next_time'][db_result.id] = deepcopy( db_result.next_time) db_result.crawler_time = datetime.strptime( crawler_time.strftime('%Y-%m-%d %H:%M:00'), '%Y-%m-%d %H:%M:%S') db_result.next_time = db_result.crawler_time + timedelta( minutes=self.crawl_span_dict[db_result.area_list_id]) db.session.add(db_result) db.session.commit() if db_result.lu_lat==db_result.rd_lat \ or db_result.lu_lng==db_result.rd_lng: continue self.mc.set_cookies_list([ x.cookies for x in MyshipsAccount.query.filter( MyshipsAccount.enable == 1, MyshipsAccount.updating == 0, MyshipsAccount.updated_time >= ( datetime.now() - timedelta(hours=1))).all() ]) thread = threading.Thread(target=self.myships_thread, args=(db_result.json(), ), daemon=True) thread.start() time.sleep(2) self.thread_list.append(thread) while [x.is_alive() for x in self.thread_list ].count(True) >= self.thread_max_count: continue if self.err_msg_list: self.err_msg_list = list(set(self.err_msg_list)) ggg = GmailSender( '船隻爬蟲 {} 執行完成,但途中有部份錯誤'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], '\n\n'.join(self.err_msg_list)) ggg.send_email() except: msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')), traceback.format_exc() ]) print(msg) self.err_msg_list.append(msg) self.err_msg_list = list(set(self.err_msg_list)) ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], '\n\n'.join(self.err_msg_list)) ggg.send_email()
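# #############################
# Both copies of myships_crawler_func schedule each area by rounding "now"
# down to a multiple of its crawl_span and setting next_time one span later
# (e.g. at 14:37 with a 15-minute span the window is 14:30 and next_time is
# 14:45).  Below is a minimal self-contained sketch of that arithmetic; the
# function name crawl_window is an assumption for illustration.
from datetime import datetime, timedelta


def crawl_window(now, crawl_span_minutes):
    """Return (crawler_time, next_time) aligned to crawl_span_minutes."""
    aligned = now.replace(second=0, microsecond=0) - timedelta(
        minutes=now.minute % crawl_span_minutes)
    return aligned, aligned + timedelta(minutes=crawl_span_minutes)


# crawl_window(datetime(2021, 1, 1, 14, 37, 12), 15)
# -> (datetime(2021, 1, 1, 14, 30), datetime(2021, 1, 1, 14, 45))
# #############################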
def mygopen_crawler_func(): script_name = os.path.basename(__file__) try: es = Elastic(host=[app.config['ES_SETTING']['CONNECTION']['HOST']], port=app.config['ES_SETTING']['CONNECTION']['PORT'], username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'], password=app.config['ES_SETTING']['CONNECTION']['PASSWORD']) if not es.check_index_exist(app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['INDEX_NAME']): es.create_index(app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['MAPPING_FILEPATH']) if not es.check_index_exist(app.config['ES_SETTING']['INDEX_INFO']['MEDIA']['INDEX_NAME']): es.create_index(app.config['ES_SETTING']['INDEX_INFO']['MEDIA']['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']['MEDIA']['MAPPING_FILEPATH']) query = { 'from' : 0, 'size' : 1, "sort" : [ { "article_publish_time" : "desc" } ] } es_dictionary_list = es.search(query, app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['INDEX_NAME'])['hits']['hits'] es_newest_article_time = None if not es_dictionary_list else datetime.strptime(es_dictionary_list[0]['_source']['article_publish_time'], '%Y-%m-%d %H:%M:%S') offset_n = 0 limit_n = 10 finish_status = False while not finish_status: print('爬取 MyGoPen Web 中, offset_n: {}'.format(offset_n)) article_list = get_MyGoPen_article_list(offset_n, limit_n) offset_n+=limit_n if not article_list: break for article_dict_index, article_dict in enumerate(article_list): print('爬取 MyGoPen Web 中,offset_n: {}, {}/{}'.format(offset_n, article_dict_index, len(article_list))) dictionary = format_api_rsp(article_dict, es) if es_newest_article_time and datetime.strptime(dictionary['article_publish_time'], '%Y-%m-%d %H:%M:%S')<=es_newest_article_time: finish_status = True break if not dictionary['article_url']: pprint(dictionary) print('無法透過API取得文章網址') continue dictionary['_type'] = '_doc' dictionary['_index'] = app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['INDEX_NAME'] m = hashlib.md5() m.update(dictionary['article_url'].encode("utf-8")) dictionary['_id'] = m.hexdigest() es.batch_load([dictionary]) except Exception as e: error_msg = '\n'.join( [ '{}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '{}'.format(traceback.format_exc()) ] ) print(error_msg) lnm.send_msg(error_msg) gs = GmailSender( 'MyGoPen排程出現錯誤-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['RECEIVER_LIST'], error_msg ) gs.send_email()
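# #############################
# mygopen_crawler_func derives each article's Elasticsearch _id from the MD5
# of its URL, so re-crawling the same article overwrites the existing document
# instead of inserting a duplicate.  Below is a minimal sketch of that idiom;
# the helper name url_to_doc_id is an assumption for illustration.
import hashlib


def url_to_doc_id(article_url):
    """Stable 32-character hex document id derived from the article URL."""
    return hashlib.md5(article_url.encode('utf-8')).hexdigest()
# #############################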
class myships_crawler_class(): def __init__(self): self.machine_serial = int(os.environ.get('SERIAL', '0')) self.script_name = os.path.basename(__file__) self.err_msg_list = [] self.thread_list = [] self.thread_max_count = 100 self.err_count = 0 self.err_count_max = 200 self.no_data_count = 0 self.no_data_count_max = 10 self.machine_count = 1 self.ship_detail_dict = {} self.headers = { 'Connection': 'close', 'User-Agent': 'Mozilla/5.0 (Macintosh Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36' } self.save2es_thread = threading.Thread(target=self.save2es, daemon=True) def err_msg_generator(self, err_msg): return ('\n'.join( [datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self.ip, err_msg])) def save2es(self): batch_load_list = [] for key_id in list(self.ship_detail_dict.keys()): id_list = [ '{}_{}'.format(ship_detail_dict['mmsi'], ship_detail_dict['posTime']) for ship_detail_dict in self.ship_detail_dict[key_id] ] if id_list: es_ship_ids = set([ data['_id'] for data in self.es.scan( { 'query': { 'bool': { 'must': [{ 'terms': { '_id': id_list } }] } } }, app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME']) ]) else: es_ship_ids = set() for ship_detail_dict in self.ship_detail_dict.pop(key_id): _id = '{}_{}'.format(ship_detail_dict['mmsi'], ship_detail_dict['posTime']) if _id in es_ship_ids: continue dictionary = { '_index': app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], '_type': '_doc', '_id': _id, '_routing': '{}'.format( (datetime.utcfromtimestamp(ship_detail_dict['posTime']) + timedelta(hours=8)).year), 'updatetime': ship_detail_dict['updatetime'], 'eta_timestamp': ship_detail_dict['eta'], 'time': (datetime.utcfromtimestamp(ship_detail_dict['posTime']) + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'), 'callsign': ship_detail_dict['callsign'], 'nationality': self.mmsi_dict[ship_detail_dict['mmsi'][:3]] if ship_detail_dict['mmsi'][:3] in self.mmsi_dict else None, 'cog': ship_detail_dict['cog'] / 10 if ship_detail_dict['cog'] else None, 'dest': ship_detail_dict['destPort'], 'draught': ship_detail_dict['draught'] / 10 if ship_detail_dict['draught'] else None, 'hdg': ship_detail_dict['heading'], 'imo': ship_detail_dict['imo'], 'latitude': ship_detail_dict['lat'] / 600000, 'length': ship_detail_dict['length'], 'longitude': ship_detail_dict['lon'] / 600000, 'mmsi': ship_detail_dict['mmsi'], 'name': ship_detail_dict['shipnameEn'], 'navistatus': ship_detail_dict['aisNavStatus'], 'rot': ship_detail_dict['rot'], 'shipid': ship_detail_dict['shipId'], 'sog': ship_detail_dict['sog'] / 10 if ship_detail_dict['sog'] else None, 'utc_timestamp': ship_detail_dict['posTime'], 'type': ship_detail_dict['shiptype'], 'width': ship_detail_dict['breadth'], 'y': ship_detail_dict['shiptype'], 'v': ship_detail_dict['aisNavStatus'] } try: # type 有時會出現亂碼,如「6857-d&0」、「1607U No.158」 dictionary['type'] = int(dictionary['type']) except: dictionary['type'] = None try: dictionary['navistatus'] = int(dictionary['navistatus']) except: dictionary['navistatus'] = None if ship_detail_dict['eta']: eta_datetime = datetime.utcfromtimestamp( ship_detail_dict['eta']) dictionary['eta'] = eta_datetime.strftime('%m-%d %H:%M') dictionary['eta_datetime'] = eta_datetime.strftime( '%Y-%m-%d %H:%M:%S') else: dictionary['eta'] = None dictionary['eta_datetime'] = None if dictionary['type'] in self.ship_type_dict: dictionary['type_text'] = self.ship_type_dict[ dictionary['type']] if dictionary['navistatus'] in self.navistatus_type_dict: 
dictionary['navistatus_text'] = self.navistatus_type_dict[ dictionary['navistatus']] dictionary['y'] = dictionary['type'] dictionary['v'] = dictionary['navistatus'] batch_load_list.append(dictionary) try: self.es.batch_load(batch_load_list) except: self.err_count += 100 def get_ship_detail(self, shipId_list_for_func): input_json = {"shipId": ','.join(shipId_list_for_func)} try: rsp = RequestsRetryer( 'post', { 'url': app.config['CRAWLER_SETTING']['MYSHIPS']['SHIP_DETAIL'], 'headers': self.headers, 'json': input_json, 'timeout': 180, 'cookies': self.cookies_list }, req_retry_limit=3, req_retry_sleeptime=5) rsp.close() except: print(traceback.format_exc()) self.err_count += 1 self.err_msg_list.append( self.err_msg_generator(traceback.format_exc())) return if rsp.status_code != 200: print(rsp.text) self.err_count += 1 self.err_msg_list.append(self.err_msg_generator(rsp.text)) return try: rsp_result = rsp.json() except: print(traceback.format_exc()) self.err_count += 1 self.err_msg_list.append( self.err_msg_generator(traceback.format_exc())) return if rsp_result['code']!='0' \ or rsp_result['message']!='成功': pprint(rsp_result) self.err_count += 1 self.err_msg_list.append(self.err_msg_generator(rsp.text)) return key_id = f'{uuid4()}' self.ship_detail_dict[key_id] = [] for ship_detail_dict in rsp_result['data']: if not ship_detail_dict['mmsi'] \ or not ship_detail_dict['lon'] \ or not ship_detail_dict['lat'] \ or not ship_detail_dict['posTime']: continue ship_detail_dict['v'] = None ship_detail_dict['y'] = None ship_detail_dict['updatetime'] = datetime.now().strftime( '%Y-%m-%d %H:%M:%S') self.ship_detail_dict[key_id].append(ship_detail_dict) if not self.ship_detail_dict[key_id]: self.no_data_count += 1 return def myships_crawler_func(self): print(datetime.now()) try: self.ip = get_external_ip() except: self.ip = None try: # 檢查這台機器是否有同排程還在執行 if check_same_process_still_running(self.script_name): # 代表包含這個程式在內,有兩個以上相同的排程正在運行 print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1)) return if not self.ip: raise (Exception('無法取得 IP')) try: rsp = requests.get( app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'], timeout=60) rsp.close() except: raise Exception('無法連線至寶船網網頁 :\n{}'.format( traceback.format_exc())) if rsp.status_code != 200: raise Exception('寶船網網頁無法正確連線 :\n{}'.format(rsp.text)) try: rsp = requests.get( 'http://{}:{}/'.format( 'localhost', app.config['ES_SETTING']['CONNECTION']['PORT']), auth=HTTPBasicAuth( app.config['ES_SETTING']['CONNECTION']['ACCOUNT'], app.config['ES_SETTING']['CONNECTION']['PASSWORD'])) rsp.close() except: raise Exception(traceback.format_exc()) if rsp.status_code != 200: raise Exception('無法連線至資策會 ES 主機') self.es = Elastic( host=app.config['ES_SETTING']['CONNECTION']['HOST'], port=app.config['ES_SETTING']['CONNECTION']['PORT'], username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'], password=app.config['ES_SETTING']['CONNECTION']['PASSWORD']) if not self.es.check_index_exist( app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME']): print( self.es.create_index( app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO'] ['MYSHIPS']['MAPPING_FILEPATH'])) self.ship_type_dict = { x.type: x.name for x in ShipTypeMyships.query.all() } self.navistatus_type_dict = { x.type: x.name for x in NavistatusTypeMyships.query.all() } self.mmsi_dict = {} for db_result in MMSI_Info.query.with_entities( MMSI_Info.mmsi, MMSI_Info.alpha_2, MMSI_Info.alpha_3).all(): self.mmsi_dict[ db_result. 
                    mmsi] = db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2
            print('帳戶檢查登入狀態中')
            account_login_timestamp = time.time()
            account_login_span = 1800
            try:
                ship_account_login_func()
                self.cookies_list = ([
                    x.cookies for x in MyshipsAccount.query.filter(
                        MyshipsAccount.enable == 1,
                        MyshipsAccount.updating == 0,
                        MyshipsAccount.updated_time >= (
                            datetime.now() - timedelta(hours=1))).all()
                ])
                if not self.cookies_list:
                    self.cookies_list.append({})
            except:
                print('帳號登入失敗')
                print(traceback.format_exc())
                self.cookies_list = [{}]
            # start_n = deepcopy(4000000+self.machine_serial)
            start_n = deepcopy(self.machine_serial)
            # start_n = 1660000
            while True:
                # Stop in the last 30 seconds of the hour; the original
                # compared `minute > 59`, which can never be true.
                if datetime.now().minute >= 59 \
                        and datetime.now().second > 30:
                    return
                print(start_n)
                if (time.time() -
                        account_login_timestamp) >= account_login_span:
                    print(
                        f'帳戶距離上次登入時間超過 {account_login_span} 秒,等待所有 Thread 結束並重新登入後,將繼續執行'
                    )
                    for thread in self.thread_list:
                        thread.join()
                    print('帳戶重新登入中')
                    account_login_timestamp = time.time()
                    try:
                        ship_account_login_func()
                        self.cookies_list = ([
                            x.cookies for x in MyshipsAccount.query.filter(
                                MyshipsAccount.enable == 1,
                                MyshipsAccount.updating == 0,
                                MyshipsAccount.updated_time >=
                                (datetime.now() - timedelta(hours=1))).all()
                        ])
                        if not self.cookies_list:
                            self.cookies_list.append({})
                    except:
                        print('帳號登入失敗')
                        print(traceback.format_exc())
                        self.cookies_list = [{}]
                end_n = start_n + 1000 * self.machine_count
                shipId_list = [
                    f'{i}' for i in range(start_n, end_n, self.machine_count)
                ]
                start_n = deepcopy(end_n)
                t1 = time.time()
                thread = threading.Thread(target=self.get_ship_detail,
                                          args=(shipId_list, ),
                                          daemon=True)
                thread.start()
                self.thread_list.append(thread)
                thread_sleep_time = 1 - (time.time() - t1)
                if thread_sleep_time > 0:
                    time.sleep(thread_sleep_time)
                # for thread in self.thread_list:
                #     thread.join()
                # pprint(self.ship_detail_dict)
                # if self.ship_detail_dict:
                #     self.save2es()
                # pprint(self.ship_detail_dict)
                # return
                while [thread.is_alive() for thread in self.thread_list
                       ].count(True) >= self.thread_max_count:
                    continue
                delete_index_list = []
                for index, thread in enumerate(self.thread_list):
                    if not thread.is_alive():
                        delete_index_list.append(index)
                delete_index_list.reverse()
                for index in delete_index_list:
                    del self.thread_list[index]
                if self.err_count >= self.err_count_max:
                    raise Exception('\n\n'.join(self.err_msg_list))
                if self.no_data_count >= self.no_data_count_max:
                    break
                if self.ship_detail_dict and not self.save2es_thread.is_alive(
                ):
                    self.save2es_thread = threading.Thread(
                        target=self.save2es, daemon=True)
                    self.save2es_thread.start()
            print('完成爬取,等待 Thread 結束')
            for thread in self.thread_list:
                thread.join()
            print('Thread 結束, 正在將最後剩餘資料存入 ES 中')
            while self.save2es_thread.is_alive():
                continue
            if self.ship_detail_dict:
                self.save2es()
            print('結束')
            print(datetime.now())
            exit()
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(self.ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            print('\n\n'.join(self.err_msg_list))
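# #############################
# The spawn loop above paces itself to roughly one shipId batch per second by
# sleeping for `1 - (time.time() - t1)` after each thread start.  Below is a
# minimal sketch of the same pacing as a reusable helper; the class name
# RatePacer is an assumption for illustration, not part of the crawler.
import time


class RatePacer:
    """Sleep just long enough to keep successive calls >= interval apart."""

    def __init__(self, interval=1.0):
        self.interval = interval
        self._last = None

    def wait(self):
        if self._last is not None:
            remaining = self.interval - (time.time() - self._last)
            if remaining > 0:
                time.sleep(remaining)
        self._last = time.time()


# usage sketch inside the spawn loop:
#     pacer = RatePacer(1.0)
#     pacer.wait()  # replaces the manual t1 / thread_sleep_time bookkeeping
#     thread.start()
# #############################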