def get_shipxy_thread(self, db_result_dict_for_func):
    try:
        # data_token_result = self.sc.getareashipssimple(db_result_dict_for_func['coor_1'], db_result_dict_for_func['coor_2'])
        data_token_result = self.sc.getareashipssimple(
            [db_result_dict_for_func['lu_lat'], db_result_dict_for_func['lu_lng']],
            [db_result_dict_for_func['rd_lat'], db_result_dict_for_func['rd_lng']])
    except Exception:
        print(traceback.format_exc())
        self.db_rollback_dict[db_result_dict_for_func['id']] = db_result_dict_for_func

    if db_result_dict_for_func['id'] in self.db_rollback_dict:
        return

    try:
        time.sleep(self.gernal_sleep_time)
        if db_result_dict_for_func['id'] in self.db_rollback_dict:
            return

        batch_load_list = []
        if data_token_result['status'] != 0 or 'data' not in data_token_result:
            self.crawler_status = False
            self.error_msg_list.append(
                '{}: failed to get data token; the Shipxy API returned {}'.format(
                    self.script_name, data_token_result))
            return
        if not data_token_result['count']:
            print('{}: no ship data in area {}'.format(
                self.script_name, db_result_dict_for_func['id']))
            return

        # Drop records without a usable MMSI
        area_result_list = []
        area_data_list = self.sc.area_info(data_token_result['data'])
        for area_data in area_data_list:
            if not area_data.get('mmsi') \
                    or area_data['mmsi'] == 0 \
                    or area_data['mmsi'] == '0':
                continue
            area_result_list.append(area_data)
        # if len(area_data_list) != len(area_result_list):
        #     print('\n'.join([
        #         '-' * 20,
        #         '{}/{}'.format(len(area_data_list), len(area_result_list)),
        #         '{}, {}'.format(db_result_dict_for_func['lu_lat'], db_result_dict_for_func['lu_lng']),
        #         '{}, {}'.format(db_result_dict_for_func['rd_lat'], db_result_dict_for_func['rd_lng']),
        #         json.dumps(area_data_list, ensure_ascii=False, indent=4),
        #         '-' * 20
        #     ]))

        if not area_result_list:
            print('{}: no crawlable ship data in area {}'.format(
                self.script_name, db_result_dict_for_func['id']))
            self.es.batch_load(batch_load_list)
            return

        gsdc = get_ship_detail_class(self.sc)
        thread_start_time = time.time()
        for area_data in area_result_list:
            # Stop mechanism for crawls that run too long
            if time_to_stop():
                break
            try:
                # Each account should issue at most one request per second,
                # to avoid the account being locked
                while [
                        x.is_alive() for x in gsdc.get_ship_detail_thread_list
                ].count(True) >= math.floor(
                        (self.get_shipxy_thread_limit_tmp *
                         self.get_ship_detail_quantity_limit) /
                        ([x.is_alive() for x in self.get_shipxy_thread_list
                          ].count(True) + 1)):
                    # Stop mechanism for crawls that run too long
                    if time_to_stop():
                        break
                    continue
            except Exception:
                lll = [
                    traceback.format_exc(),
                    '{}'.format([
                        x.is_alive() for x in gsdc.get_ship_detail_thread_list
                    ].count(True)),
                    '{}'.format(self.get_shipxy_thread_limit_tmp),
                    '{}'.format(self.get_ship_detail_quantity_limit),
                    '{}'.format([
                        x.is_alive() for x in self.get_shipxy_thread_list
                    ].count(True)),
                ]
                raise Exception('\n'.join(lll))

            # Drop finished detail threads before spawning a new one
            remove_index_list = []
            for t_index, thread in enumerate(gsdc.get_ship_detail_thread_list):
                if not thread.is_alive():
                    remove_index_list.append(t_index)
            remove_index_list.reverse()
            for t_index in remove_index_list:
                del gsdc.get_ship_detail_thread_list[t_index]

            thread = threading.Thread(target=gsdc.get_ship_detail,
                                      args=(area_data, ),
                                      daemon=True)
            thread.start()
            gsdc.get_ship_detail_thread_list.append(thread)
            time.sleep(self.gernal_sleep_time)

        # Wait for every detail thread to finish
        while [x.is_alive() for x in gsdc.get_ship_detail_thread_list].count(True):
            # Stop mechanism for crawls that run too long
            if time_to_stop():
                break
            continue

        print('Crawled area: {}, elapsed: {}s, ship count: {}'.format(
            db_result_dict_for_func['id'],
            round((time.time() - thread_start_time), 1),
            len(area_result_list)))

        # Stop mechanism for crawls that run too long
        if not time_to_stop():
            if len(area_result_list) >= 100 and not gsdc.thread_result_dict:
                self.crawler_status = False
                self.error_msg_list.append('\n'.join(list(set(gsdc.error_msg_list))))
                return
            elif len(area_result_list) >= 100 and int(
                    len(area_result_list) * 0.8) > len(
                        list(gsdc.thread_result_dict.keys())):
                # Fewer than 80% of the ships returned details; roll the area back for a retry
                self.db_rollback_dict[db_result_dict_for_func['id']] = db_result_dict_for_func
                self.error_msg_list.append('\n'.join(list(set(gsdc.error_msg_list))))
                return

        id_list = []
        for area_data in area_result_list:
            if area_data['mmsi'] not in gsdc.thread_result_dict:
                continue
            id_list.append('{}_{}'.format(
                area_data['mmsi'],
                gsdc.thread_result_dict[area_data['mmsi']]['lastdyn']))
        if id_list:
            es_ship_ids = set([
                data['_id'] for data in self.es.scan(
                    {'query': {'bool': {'must': [{'terms': {'_id': id_list}}]}}},
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME'])
            ])
        else:
            es_ship_ids = set()
        # print('\n'.join([
        #     'area_result_list len: {}'.format(len(area_result_list)),
        #     'id_list len: {}'.format(len(id_list)),
        #     'es_ship_ids len: {}'.format(len(list(es_ship_ids))),
        #     'area_result_list-id_list= {}'.format(len(area_result_list) - len(id_list)),
        #     'id_list-es_ship_ids= {}'.format(len(id_list) - len(list(es_ship_ids)))
        # ]))

        delete_list = []
        for area_data in area_result_list:
            if area_data['mmsi'] not in gsdc.thread_result_dict:
                print('{}: no detail data for ship {}; skipping it'.format(
                    self.script_name, area_data['mmsi']))
                continue
            dictionary = deepcopy(gsdc.thread_result_dict[area_data['mmsi']])
            dictionary['_index'] = app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME']
            dictionary['_type'] = '_doc'
            dictionary['_id'] = '{}_{}'.format(area_data['mmsi'], dictionary['lastdyn'])
            if dictionary['_id'] in es_ship_ids:
                continue
            # dictionary['area_list_id'] = db_result_dict_for_func['area_list_id']
            dictionary['nationality'] = self.mmsi_dict[
                dictionary['mmsi'][:3]] if dictionary['mmsi'][:3] in self.mmsi_dict else None
            dictionary['cog'] = dictionary['cog'] / 100  # course over ground
            dictionary['draught'] = dictionary['draught'] / 1000  # draught
            dictionary['hdg'] = dictionary['hdg'] / 100  # heading
            # # Shipxy sometimes reports heading as 51100 while the site shows 0
            # if dictionary['hdg'] > 360:
            #     dictionary['hdg'] = 0
            for key in ['lat', 'lon']:
                dictionary.pop(key)
            dictionary['latitude'] = area_data['lat']  # latitude
            dictionary['longitude'] = area_data['lng']  # longitude
            dictionary['sog'] = round(dictionary['sog'] / 5133 * 10, 2)  # speed in knots
            dictionary['length'] = dictionary['length'] / 10  # ship length
            dictionary['lineWidth'] = area_data['lineWidth']
            dictionary['width'] = dictionary['width'] / 10  # ship width
            dictionary['lastdyn_active'] = area_data['lastdyn_active']  # whether data can still be fetched
            dictionary['offset'] = area_data['offset']
            dictionary['rot'] = area_data.get('rot')
            dictionary['rotate'] = area_data['rotate']
            dictionary['shiptype'] = area_data['shiptype']
            dictionary['state'] = area_data['state']
            dictionary['state_color'] = area_data['state_color']
            dictionary['istop'] = area_data['istop']
            dictionary['tracks'] = area_data['tracks']
            dictionary['tcname'] = s2tw_converter(dictionary['cnname'])
            dictionary['utc_timestamp'] = dictionary.pop('lastdyn')
            dictionary['time'] = (
                datetime.utcfromtimestamp(dictionary['utc_timestamp']) +
                timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
            if dictionary['type'] not in self.ship_type_dict:
                sts_db_result = ShipTypeShipxy.query.filter(
                    ShipTypeShipxy.type == dictionary['type']).first()
                if sts_db_result:
                    self.ship_type_dict[sts_db_result.type] = sts_db_result.name
                else:
                    self.ship_type_dict[dictionary['type']] = None
            dictionary['type_text'] = self.ship_type_dict[dictionary['type']]
            if dictionary['navistatus'] not in self.navistatus_type_dict:
                nt_db_result = NavistatusTypeShipxy.query.filter(
                    NavistatusTypeShipxy.type == dictionary['navistatus']).first()
                if nt_db_result:
                    self.navistatus_type_dict[nt_db_result.type] = nt_db_result.name
                else:
                    self.navistatus_type_dict[dictionary['navistatus']] = None
            dictionary['navistatus_text'] = self.navistatus_type_dict[
                dictionary['navistatus']]
            dictionary['_routing'] = '{}'.format(
                (datetime.utcfromtimestamp(dictionary['utc_timestamp']) +
                 timedelta(hours=8)).year)
            # dictionary['updatetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            # Strip whitespace from string values;
            # empty strings and the literal 'NULL' become None
            for key in list(dictionary.keys()):
                if type(dictionary[key]) is not str:
                    continue
                dictionary[key] = dictionary[key].strip()
                if not dictionary[key] or dictionary[key] == 'NULL':
                    dictionary[key] = None
            batch_load_list.append(dictionary)

        if delete_list:
            # Three machines may crawl the same area; if one of them already
            # deleted the data, this delete would fail, so ignore errors here
            try:
                self.es.delete_data(delete_list)
            except Exception:
                pass
        if batch_load_list:
            self.es.batch_load(batch_load_list)
        del delete_list, batch_load_list
    except Exception:
        msg_list = [
            'ip: {}'.format(get_external_ip()),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ]
        print('\n\n'.join(msg_list))
        ggg = GmailSender('Ship crawler error - {}'.format(self.script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                          '\n\n'.join(msg_list))
        ggg.send_email()
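# Illustrative sketch (not part of the crawler) of the detail-thread cap
# computed in the throttling loop above: the total detail budget
# (get_shipxy_thread_limit_tmp * get_ship_detail_quantity_limit) is shared
# among the area threads currently alive, and the +1 guards against division
# by zero. All numbers below are made up for the example.
def _demo_detail_thread_cap():
    import math

    def detail_thread_cap(area_thread_limit, detail_quantity_limit,
                          live_area_threads):
        return math.floor((area_thread_limit * detail_quantity_limit) /
                          (live_area_threads + 1))

    # A limit of 3 area threads with 5 detail requests each, while 2 area
    # threads are alive, allows at most 5 live detail threads:
    print(detail_thread_cap(3, 5, 2))  # -> 5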
def ship_account_login_func():
    script_name = os.path.basename(__file__)
    try:
        # Check whether the same scheduled job is still running on this machine
        if check_same_process_still_running(script_name):
            # Two or more instances of this schedule (including this one) are running
            print('{}: the same schedule is still running ({})'.format(script_name, 1))
            return

        # When only one account needs logging in, stagger the hosts so they do
        # not race to log in with the same account (SERIAL is an env string,
        # so convert it before sleeping)
        try:
            time.sleep(int(os.environ.get('SERIAL')))
        except Exception:
            pass

        with open(app.config['CRAWLER_SETTING']['MYSHIPS']['JS_DEMIX_FILEPATH'], 'r') as f:
            js_content = f.read()

        conds_list = [
            # Log in newly added accounts
            [MyshipsAccount.enable == None],
            # If the re-login schedule crashed mid-update, `updating` stays at 1,
            # so retry accounts whose update failed
            [
                MyshipsAccount.updating == 1,
                MyshipsAccount.updated_time <= (datetime.now() - timedelta(minutes=30))
            ],
            # Re-login accounts that logged in more than 23 hours ago; the crawler
            # only uses accounts that logged in within a day, so refresh by hour 22
            [
                MyshipsAccount.enable == 1,
                MyshipsAccount.updating == 0,
                or_(
                    MyshipsAccount.updated_time <= (datetime.now() - timedelta(minutes=30)),
                    MyshipsAccount.updated_time == None)
            ]
        ]
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
            'Connection': 'keep-alive'
        }
        err_msg_list = []
        account_be_banned = []
        for conds in conds_list:
            while True:
                db.session.rollback()
                db.session.close()
                db_result = MyshipsAccount.query.filter(*conds).order_by(func.random()).first()
                if not db_result:
                    break
                db_result.updating = 1
                db.session.add(db_result)
                db.session.commit()
                db_result.updating = 0
                session = requests.Session()
                # e.g. {'time': 1607309565213, 'pwdSign': '50aa7d92baa11df578ad254e252928c8'}
                complie_result = execjs_compile(js_content).call('shipencode')
                complie_result['user'] = db_result.account
                try:
                    rsp = session.post(app.config['CRAWLER_SETTING']['MYSHIPS']['LOGIN'],
                                       headers=headers,
                                       json=complie_result,
                                       timeout=180)
                    rsp.close()
                except Exception:
                    raise Exception(traceback.format_exc())
                try:
                    rsp_result = rsp.json()
                except Exception:
                    raise Exception(rsp.text)
                if rsp_result['code'] == '0':
                    pprint(rsp_result)
                    err_msg_list.append(f"{rsp_result}")
                    db_result.cookies = deepcopy(session.cookies.get_dict())
                    db_result.enable = 1
                else:
                    db_result.enable = 0
                    account_be_banned.append(db_result.account)
                db.session.add(db_result)
                db.session.commit()

        account_be_banned = list(set(account_be_banned))
        account_be_banned.sort()
        if account_be_banned:
            msg_list = [
                'ip: {}'.format(get_external_ip()),
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                'Some accounts were banned; the banned accounts are listed below',
                '\n'.join(account_be_banned),
                '\n'.join(list(set(err_msg_list)))
            ]
            msg = '\n\n'.join(msg_list)
            print(msg)
            ggg = GmailSender('Ship crawler status notice - {}'.format(script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
    except Exception:
        msg_list = [
            'ip: {}'.format(get_external_ip()),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ]
        print('\n\n'.join(msg_list))
        ggg = GmailSender('Ship crawler error - {}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                          '\n\n'.join(msg_list))
        ggg.send_email()
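# Hedged sketch of the password-sign step above, assuming the project's
# execjs_compile() helper wraps the standard PyExecJS package (execjs.compile).
# The JS file and its shipencode() function come from the myships site itself;
# the path and account name below are hypothetical.
def _demo_shipencode_payload():
    import execjs

    with open('demix.js', 'r') as f:  # hypothetical path
        ctx = execjs.compile(f.read())
    payload = ctx.call('shipencode')  # e.g. {'time': ..., 'pwdSign': '...'}
    payload['user'] = 'some_account'  # hypothetical account name
    return payload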
def myships_crawler_func(self):
    try:
        self.ip = get_external_ip()
    except Exception:
        self.ip = None
    try:
        # Check whether the same scheduled job is still running on this machine
        if check_same_process_still_running(self.script_name):
            # Two or more instances of this schedule (including this one) are running
            print('{}: the same schedule is still running ({})'.format(self.script_name, 1))
            return
        if not self.ip:
            raise Exception('Unable to obtain IP')
        try:
            rsp = requests.get(
                app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'],
                timeout=60)
            rsp.close()
        except Exception:
            raise Exception('Unable to connect to the myships site:\n{}'.format(
                traceback.format_exc()))
        if rsp.status_code != 200:
            raise Exception('The myships site did not respond correctly:\n{}'.format(rsp.text))

        self.es = Elastic(
            host=app.config['ES_SETTING']['CONNECTION']['HOST'],
            port=app.config['ES_SETTING']['CONNECTION']['PORT'],
            username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
            password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
        if not self.es.check_index_exist(
                app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME']):
            print(
                self.es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['MAPPING_FILEPATH']))

        self.ship_type_dict = {
            x.type: x.name
            for x in ShipTypeMyships.query.all()
        }
        self.navistatus_type_dict = {
            x.type: x.name
            for x in NavistatusTypeMyships.query.all()
        }
        self.mmsi_dict = {}
        for db_result in MMSI_Info.query.with_entities(
                MMSI_Info.mmsi, MMSI_Info.alpha_2, MMSI_Info.alpha_3).all():
            self.mmsi_dict[db_result.mmsi] = (
                db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2)

        # start_n = deepcopy(4000000 + self.machine_serial)
        start_n = deepcopy(self.machine_serial)
        while True:
            # Stop once the hour is almost over (HH:59:30)
            if datetime.now().minute >= 59 \
                    and datetime.now().second > 30:
                return
            print(start_n)
            end_n = start_n + 1000 * self.machine_count
            shipId_list = [
                f'{i}' for i in range(start_n, end_n, self.machine_count)
            ]
            start_n = deepcopy(end_n)
            thread = threading.Thread(target=self.get_ship_detail,
                                      args=(shipId_list, ),
                                      daemon=True)
            thread.start()
            self.thread_list.append(thread)
            # for thread in self.thread_list:
            #     thread.join()
            # pprint(self.ship_detail_dict)
            # if self.ship_detail_dict:
            #     self.save2es()
            # pprint(self.ship_detail_dict)
            # return
            while [thread.is_alive() for thread in self.thread_list
                   ].count(True) >= self.thread_max_count:
                continue
            delete_index_list = []
            for index, thread in enumerate(self.thread_list):
                if not thread.is_alive():
                    delete_index_list.append(index)
            delete_index_list.reverse()
            for index in delete_index_list:
                del self.thread_list[index]
            if self.err_count >= self.err_count_max:
                raise Exception('\n\n'.join(self.err_msg_list))
            if self.no_data_count >= self.no_data_count_max:
                break
            if self.ship_detail_dict:
                self.save2es()
            time.sleep(1)

        for thread in self.thread_list:
            thread.join()
        if self.ship_detail_dict:
            self.save2es()
    except Exception:
        msg = '\n\n'.join([
            'ip: {}'.format(self.ip),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        self.err_msg_list.append(msg)
        self.err_msg_list = list(set(self.err_msg_list))
        print('\n\n'.join(self.err_msg_list))
        ggg = GmailSender('Ship crawler error - {}'.format(self.script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                          '\n\n'.join(self.err_msg_list))
        ggg.send_email()
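# Illustrative sketch of the shipId sharding used above: machine k of N
# starts at its own serial number and takes every N-th ID, so the machines
# interleave over the ID space without overlap. Values are made up.
def _demo_shipid_sharding():
    machine_count = 3
    for machine_serial in range(machine_count):
        start_n = machine_serial
        end_n = start_n + 1000 * machine_count
        shard = [f'{i}' for i in range(start_n, end_n, machine_count)]
        print(machine_serial, shard[:3], len(shard))
    # 0 ['0', '3', '6'] 1000
    # 1 ['1', '4', '7'] 1000
    # 2 ['2', '5', '8'] 1000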
def shipxy_crawler_func(self):
    try:
        ip = get_external_ip()
    except Exception:
        ip = 'failed to obtain IP'
    try:
        # Check whether the same scheduled job is still running on this machine
        if check_same_process_still_running(self.script_name):
            # Two or more instances of this schedule (including this one) are running
            print('{}: the same schedule is still running ({})'.format(self.script_name, 1))
            return
    except Exception:
        msg = '\n\n'.join([
            'ip: {}'.format(ip),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        ggg = GmailSender('Ship crawler error - {}'.format(self.script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
        ggg.send_email()
        # line_notify_pusher(msg)
        return
    try:
        self.es = Elastic(
            host=app.config['ES_SETTING']['CONNECTION']['HOST'],
            port=app.config['ES_SETTING']['CONNECTION']['PORT'],
            username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
            password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
        if not self.es.check_index_exist(
                app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME']):
            print(
                self.es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME'],
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['MAPPING_FILEPATH']))

        db_result_list = AreaList.query.with_entities(
            AreaList.id,
            AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
        if not db_result_list:
            print('{}: no crawl-span schedule data for any area'.format(self.script_name))
            return
        crawl_span_dict = {
            db_result.id: db_result.crawl_span
            for db_result in db_result_list
        }
        query_sort_conds = [SubAreaList.area_list_id]
        query_sort_conds.extend([x.id for x in db_result_list])
        self.cold_zone_ids = set([
            db_result.id for db_result in AreaList.query.filter(
                AreaList.enable == 1, AreaList.name.like('%冷區%')).all()
        ])  # '冷區' = "cold zone"
        self.mmsi_dict = {}
        for db_result in MMSI_Info.query.with_entities(
                MMSI_Info.mmsi, MMSI_Info.alpha_2, MMSI_Info.alpha_3).all():
            self.mmsi_dict[db_result.mmsi] = (
                db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2)

        self.sc = ShipXY_Crawler()
        cookies_list = []
        for db_result in ShipxyAccount.query.filter(
                ShipxyAccount.enable == 1, ShipxyAccount.updating == 0,
                ShipxyAccount.updated_time >=
                (datetime.now() - timedelta(days=1))).all():
            if not db_result.cookies:
                continue
            cookies_list.append(deepcopy(db_result.cookies))
        if not cookies_list:
            raise Exception('{}: no usable accounts'.format(self.script_name))
        self.sc.update_cookies_list(cookies_list)
        del cookies_list

        while True:
            if not self.crawler_status:
                raise Exception('\n'.join(list(set(self.error_msg_list))))
            elif self.get_shipxy_thread_list and [
                    x.is_alive() for x in self.get_shipxy_thread_list
            ].count(True) >= self.get_shipxy_thread_limit_tmp:
                continue

            remove_index_list = []
            for index, thread in enumerate(self.get_shipxy_thread_list):
                if not thread.is_alive():
                    remove_index_list.append(index)
            remove_index_list.reverse()
            for index in remove_index_list:
                del self.get_shipxy_thread_list[index]

            # Refresh the SERVERID cookie timestamps so the sessions stay valid
            cookies_list = []
            for cookies in self.sc.cookies_list:
                if 'SERVERID' in cookies:
                    SERVERID_list = cookies['SERVERID'].split('|')
                    SERVERID_list[1] = '{}'.format(time.time())
                    SERVERID_list[2] = '{}'.format(time.time())
                    cookies['SERVERID'] = '|'.join(SERVERID_list)
                cookies_list.append(cookies)
            self.sc.update_cookies_list(cookies_list)
            del cookies_list

            db_result = CrawlerMachine.query.filter(
                CrawlerMachine.ip == ip).first()
            if not db_result:
                db_result = CrawlerMachine(ip=ip)
            db_result.updatedAt = datetime.now()
            db.session.add(db_result)
            db.session.commit()
            machine_quantity = CrawlerMachine.query.filter(
                CrawlerMachine.updatedAt >=
                (datetime.now() - timedelta(hours=1))).count()
            if not machine_quantity:
                machine_quantity += 1

            # Each account may query an area on average once per second, to avoid
            # the account being locked. The formula is:
            # (1 second / (usable accounts / machine count)) - (time already spent this round)
            self.gernal_sleep_time = (
                1 / len(self.sc.cookies_list)) * machine_quantity * 1.5
            self.get_shipxy_thread_limit_tmp = (math.floor(
                (len(self.sc.cookies_list) / machine_quantity) /
                self.get_ship_detail_quantity_limit))
            if not self.get_shipxy_thread_limit_tmp:
                raise Exception('\n'.join([
                    '{}: the total account count is below the minimum required to crawl\n'.format(
                        self.script_name),
                    'The minimum is defined as\n',
                    'the number of usable accounts ({}) divided by the machine count ({}) '
                    'divided by the per-thread limit on ship-detail subprocesses ({}), floored'
                    .format(len(self.sc.cookies_list), machine_quantity,
                            self.get_ship_detail_quantity_limit)
                ]))

            if self.db_rollback_dict:
                # Restore the schedule times of areas whose crawl failed
                for db_result_id in list(self.db_rollback_dict.keys()):
                    db_result = SubAreaList.query.filter(
                        SubAreaList.id == db_result_id).first()
                    db_result.crawler_time = self.db_rollback_dict[db_result_id]['crawler_time']
                    db_result.next_time = self.db_rollback_dict[db_result_id]['next_time']
                    db.session.add(db_result)
                    del self.db_rollback_dict[db_result_id]
                db.session.commit()

            db_result = SubAreaList.query.filter(
                SubAreaList.enable == 1, SubAreaList.web == 'shipxy',
                or_(SubAreaList.next_time <= datetime.now(),
                    SubAreaList.next_time == None),
                or_(*[
                    SubAreaList.area_list_id == id
                    for id in crawl_span_dict.keys()
                ])).order_by(sqlalchemy.func.field(*query_sort_conds),
                             asc(SubAreaList.next_time), func.random()).first()
            if not db_result:
                if [x.is_alive() for x in self.get_shipxy_thread_list].count(True):
                    print('{}: no area needs crawling; waiting for the running area-crawl '
                          'subprocesses to finish. If they all finish and no area needs '
                          'crawling, the program will exit'.format(self.script_name))
                    while [
                            x.is_alive() for x in self.get_shipxy_thread_list
                    ].count(True):
                        # If it is time for some area to be crawled again, resume crawling
                        if not datetime.now().minute \
                                or datetime.now().minute in crawl_span_dict.values():
                            break
                    # `continue` rather than `return`: if the subprocesses finish just
                    # past minute 0 or 30, there will be areas to crawl again
                    continue
                else:
                    print('{}: no area needs crawling; program finished at {}'.format(
                        self.script_name, datetime.now()))
                    return

            get_shipxy_thread_input = deepcopy(db_result.json())
            if db_result.area_list_id not in crawl_span_dict:
                crawl_span_dict[db_result.area_list_id] = AreaList.query.filter(
                    AreaList.id == db_result.area_list_id).first().crawl_span
            crawler_time = datetime.now() - timedelta(
                minutes=datetime.now().minute %
                crawl_span_dict[db_result.area_list_id])
            db_result.crawler_time = datetime.strptime(
                crawler_time.strftime('%Y-%m-%d %H:%M:00'), '%Y-%m-%d %H:%M:%S')
            db_result.next_time = db_result.crawler_time + timedelta(
                minutes=crawl_span_dict[db_result.area_list_id])
            db.session.add(db_result)
            db.session.commit()
            if db_result.lu_lat == db_result.rd_lat \
                    or db_result.lu_lng == db_result.rd_lng:
                continue
            db.session.rollback()
            db.session.close()
            thread = threading.Thread(target=self.get_shipxy_thread,
                                      args=(get_shipxy_thread_input, ),
                                      daemon=True)
            thread.start()
            self.get_shipxy_thread_list.append(thread)
            # ###############################
            # for thread in self.get_shipxy_thread_list:
            #     thread.join()
            # return
            # ###############################
    except Exception:
        msg = '\n\n'.join([
            'ip: {}'.format(ip),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        ggg = GmailSender('Ship crawler error - {}'.format(self.script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
        ggg.send_email()
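# Worked example (illustrative numbers only) of the two pacing formulas
# above: 20 usable cookie sets shared by 2 machines, with a per-thread
# detail budget of 5.
def _demo_shipxy_pacing():
    import math

    cookies, machines, detail_quantity_limit = 20, 2, 5
    # Sleep between area requests: (1 second / accounts) * machines, padded 1.5x
    gernal_sleep_time = (1 / cookies) * machines * 1.5
    # Area-thread limit: accounts per machine, divided by the detail budget
    thread_limit = math.floor((cookies / machines) / detail_quantity_limit)
    print(gernal_sleep_time)  # 0.15 seconds
    print(thread_limit)       # 2 concurrent area threads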
def myships_crawler_func():
    script_name = os.path.basename(__file__)
    # Check whether the same scheduled job is still running on this machine
    if check_same_process_still_running(script_name):
        # Two or more instances of this schedule (including this one) are running
        print('{}: the same schedule is still running ({})'.format(script_name, 1))
        return
    try:
        es = Elastic(
            host=app.config['ES_SETTING']['CONNECTION']['HOST'],
            port=app.config['ES_SETTING']['CONNECTION']['PORT'],
            username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
            password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
        if not es.check_index_exist(
                app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME']):
            print(
                es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['MAPPING_FILEPATH']))

        db_result_list = AreaList.query.with_entities(
            AreaList.id,
            AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
        if not db_result_list:
            print('{}: no crawl-span schedule data for any area'.format(script_name))
            return
        crawl_span_dict = {
            db_result.id: db_result.crawl_span
            for db_result in db_result_list
        }
        query_sort_conds = [SubAreaList.area_list_id]
        query_sort_conds.extend([x.id for x in db_result_list])
        cold_zone_ids = set([
            db_result.id for db_result in AreaList.query.filter(
                AreaList.enable == 1, AreaList.name.like('%冷區%')).all()
        ])  # '冷區' = "cold zone"
        ship_type_dict = {x.type: x.name for x in ShipTypeMyships.query.all()}
        navistatus_type_dict = {
            x.type: x.name
            for x in NavistatusTypeMyships.query.all()
        }
        mc = Myships_Crawler()
        while True:
            db.session.rollback()
            db.session.close()
            batch_load_list = []
            db_result = SubAreaList.query.filter(
                SubAreaList.enable == 1, SubAreaList.web == 'myships',
                or_(SubAreaList.next_time <= datetime.now(),
                    SubAreaList.next_time == None),
                or_(*[
                    SubAreaList.area_list_id == id
                    for id in crawl_span_dict.keys()
                ])).order_by(sqlalchemy.func.field(*query_sort_conds),
                             asc(SubAreaList.next_time), func.random()).first()
            if not db_result:
                print('{}: done'.format(script_name))
                return
            print('{}: crawling area {}'.format(script_name, db_result.id))
            crawler_time = datetime.now() - timedelta(
                minutes=datetime.now().minute %
                crawl_span_dict[db_result.area_list_id])
            old_crawler_time = deepcopy(db_result.crawler_time)
            old_next_time = deepcopy(db_result.next_time)
            db_result.crawler_time = datetime.strptime(
                crawler_time.strftime('%Y-%m-%d %H:%M:00'), '%Y-%m-%d %H:%M:%S')
            db_result.next_time = db_result.crawler_time + timedelta(
                minutes=crawl_span_dict[db_result.area_list_id])
            db.session.add(db_result)
            db.session.commit()
            if db_result.lu_lat == db_result.rd_lat \
                    or db_result.lu_lng == db_result.rd_lng:
                continue
            # ma_cookies_list = [x.cookies for x in MyshipsAccount.query.filter(MyshipsAccount.enable==1, MyshipsAccount.updating==0).all()]
            ma_cookies_list = []
            try:
                area_result = mc.area_info(
                    min([db_result.lu_lat, db_result.rd_lat]),
                    min([db_result.lu_lng, db_result.rd_lng]),
                    max([db_result.lu_lat, db_result.rd_lat]),
                    max([db_result.lu_lng, db_result.rd_lng]),
                    ma_cookies_list)
            except Exception:
                # Put the area back on the schedule before reporting the failure
                db_result.crawler_time = old_crawler_time
                db_result.next_time = old_next_time
                db.session.add(db_result)
                db.session.commit()
                msg = '\n\n'.join([
                    'ip: {}'.format(get_external_ip()),
                    'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    '{}\n{}'.format(
                        'Error fetching area ship data; check whether the III-side '
                        'network or the myships site is at fault',
                        traceback.format_exc()),
                ])
                print(msg)
                ggg = GmailSender('Ship crawler error - {}'.format(script_name),
                                  app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
                ggg.send_email()
                continue
            if area_result['code'] != '0':
                msg = '\n\n'.join([
                    'ip: {}'.format(get_external_ip()),
                    'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    '\n'.join([
                        'Error fetching myships area ship data',
                        '{}'.format(area_result),
                        'id :{}'.format(db_result.id),
                        'area_list_id :{}'.format(db_result.area_list_id),
                        '{}'.format({
                            'age': 1440,
                            'rgn': mc.check_trans2myships_coord([
                                [min([db_result.lu_lat, db_result.rd_lat]),
                                 min([db_result.lu_lng, db_result.rd_lng])],
                                [max([db_result.lu_lat, db_result.rd_lat]),
                                 max([db_result.lu_lng, db_result.rd_lng])]
                            ])
                        }),
                        '{}'.format([[db_result.lu_lat, db_result.lu_lng],
                                     [db_result.rd_lat, db_result.rd_lng]])
                    ])
                ])
                print(msg)
                ggg = GmailSender('Ship crawler error - {}'.format(script_name),
                                  app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
                ggg.send_email()
                continue

            # Drop records without a usable MMSI ('m')
            tmp_area_data_list = area_result.pop('data')
            area_result['data'] = []
            for area_data in tmp_area_data_list:
                if not area_data.get('m') \
                        or area_data['m'] == '0':
                    continue
                area_result['data'].append(area_data)
            # Skip the area if it has no ship data at all
            if not area_result['data']:
                print('{}: Skip Area {}'.format(script_name, db_result.id))
                continue
            else:
                print('{}: area {} has {} ships'.format(
                    script_name, db_result.id, len(area_result['data'])))

            id_list = []
            ship_data_dict = mc.ship_info(
                [area_data['i'] for area_data in area_result['data']])
            for area_data in area_result['data']:
                if area_data['i'] not in ship_data_dict:
                    print(area_data['i'])
                    continue
                id_list.append('{}_{}'.format(
                    area_data['m'], ship_data_dict[area_data['i']]['posTime']))
            if id_list:
                es_ship_ids = set([
                    data['_id'] for data in es.scan(
                        {'query': {'bool': {'must': [{'terms': {'_id': id_list}}]}}},
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'])
                ])
            else:
                es_ship_ids = set()

            for area_data in area_result['data']:
                if area_data['i'] not in ship_data_dict:
                    print(area_data['i'])
                    continue
                ship_data = ship_data_dict[area_data['i']]
                try:
                    dictionary = {
                        '_index':
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                        '_type': '_doc',
                        '_id': '{}_{}'.format(area_data['m'], ship_data['posTime']),
                        '_routing':
                        '{}'.format(
                            (datetime.utcfromtimestamp(ship_data['posTime']) +
                             timedelta(hours=8)).year) if ship_data['posTime'] else None,
                        'updatetime':
                        ship_data['updatetime'] if ship_data.get('updatetime') else
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'eta_timestamp': ship_data['eta'],
                        'eta': area_data['r'],
                        'time':
                        (datetime.utcfromtimestamp(ship_data['posTime']) +
                         timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'),
                        'y': area_data['y']
                    }
                except Exception:
                    msg = '\n'.join([
                        traceback.format_exc(),
                        '{}'.format(area_data),
                        '{}'.format(ship_data)
                    ])
                    ggg = GmailSender('Ship crawler error - {}'.format(script_name),
                                      app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                                      msg)
                    ggg.send_email()
                    continue
                if dictionary['_id'] in es_ship_ids:
                    continue
                try:
                    dictionary['v'] = int(area_data['v'])
                except Exception:
                    dictionary['v'] = None
                if not dictionary['eta_timestamp']:
                    dictionary['eta_timestamp'] = None
                    dictionary['eta_datetime'] = None
                else:
                    dictionary['eta_datetime'] = (
                        datetime.utcfromtimestamp(dictionary['eta_timestamp']) +
                        timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
                for source_key, new_key in format_batch_load_dict.items():
                    if source_key in area_data:
                        dictionary[new_key] = area_data[source_key]
                    elif source_key in ship_data:
                        dictionary[new_key] = ship_data[source_key]
                dictionary['shipid'] = '{}'.format(dictionary['shipid'])
                for key, divisor in format_data_content.items():
                    if dictionary.get(key):
                        dictionary[key] = round(dictionary[key] / divisor, 6)
                # Strip whitespace from string values;
                # empty strings and the literal 'NULL' become None
                for key in list(dictionary.keys()):
                    if type(dictionary[key]) is not str:
                        continue
                    dictionary[key] = dictionary[key].strip()
                    if not dictionary[key] or dictionary[key] == 'NULL':
                        dictionary[key] = None
                for key in ['navistatus', 'rot', 'type', 'y']:
                    if dictionary.get(key) and type(dictionary[key]) is not int:
                        dictionary[key] = int(dictionary[key])
                if dictionary['type'] not in ship_type_dict:
                    sts_db_result = ShipTypeMyships.query.filter(
                        ShipTypeMyships.type == dictionary['type']).first()
                    if sts_db_result:
                        ship_type_dict[sts_db_result.type] = sts_db_result.name
                    else:
                        ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = ship_type_dict[dictionary['type']]
                if dictionary['navistatus'] not in navistatus_type_dict:
                    nt_db_result = NavistatusTypeMyships.query.filter(
                        NavistatusTypeMyships.type == dictionary['navistatus']).first()
                    if nt_db_result:
                        navistatus_type_dict[nt_db_result.type] = nt_db_result.name
                    else:
                        navistatus_type_dict[dictionary['navistatus']] = None
                dictionary['navistatus_text'] = navistatus_type_dict[
                    dictionary['navistatus']]
                batch_load_list.append(dictionary)
            if batch_load_list:
                es.batch_load(batch_load_list)
            # #############################
            # if len(batch_load_list) > 2:
            #     return
            # #############################
    except Exception:
        msg = '\n\n'.join([
            'ip: {}'.format(get_external_ip()),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        ggg = GmailSender('Ship crawler error - {}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
        ggg.send_email()
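# Self-contained sketch of the record-normalization rules above: selected
# numeric fields are divided by a fixed divisor, strings are stripped, and
# empty strings or the literal 'NULL' become None. The divisor table only
# mirrors what format_data_content is used for; its keys and values are
# assumptions for the example.
def _demo_normalize_record():
    divisors = {'lat': 1000000, 'lng': 1000000, 'sog': 10}  # assumed divisors

    def normalize(record):
        for key, divisor in divisors.items():
            if record.get(key):
                record[key] = round(record[key] / divisor, 6)
        for key, value in list(record.items()):
            if isinstance(value, str):
                value = value.strip()
                record[key] = value if value and value != 'NULL' else None
        return record

    print(normalize({'lat': 25123456, 'name': '  NULL ', 'sog': 87}))
    # {'lat': 25.123456, 'name': None, 'sog': 8.7}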
def myships_crawler_func(self):
    # Check whether the same scheduled job is still running on this machine
    if check_same_process_still_running(self.script_name):
        # Two or more instances of this schedule (including this one) are running
        print('{}: the same schedule is still running ({})'.format(self.script_name, 1))
        return
    try:
        self.es = Elastic(
            host=app.config['ES_SETTING']['CONNECTION']['HOST'],
            port=app.config['ES_SETTING']['CONNECTION']['PORT'],
            username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
            password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
        if not self.es.check_index_exist(
                app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME']):
            print(
                self.es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['MAPPING_FILEPATH']))

        db_result_list = AreaList.query.with_entities(
            AreaList.id,
            AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
        if not db_result_list:
            print('{}: no crawl-span schedule data for any area'.format(self.script_name))
            return
        self.crawl_span_dict = {
            db_result.id: db_result.crawl_span
            for db_result in db_result_list
        }
        self.query_sort_conds = [SubAreaList.area_list_id]
        self.query_sort_conds.extend([x.id for x in db_result_list])
        self.ship_type_dict = {
            x.type: x.name
            for x in ShipTypeMyships.query.all()
        }
        self.navistatus_type_dict = {
            x.type: x.name
            for x in NavistatusTypeMyships.query.all()
        }
        self.mc = Myships_Crawler()
        while True:
            # Stop once the hour is almost over (HH:57:30)
            if datetime.now().minute > 57 \
                    and datetime.now().second > 30:
                return
            db.session.rollback()
            db.session.close()
            if self.thread_error_count >= self.thread_error_count_max:
                self.err_msg_list = list(set(self.err_msg_list))
                raise Exception('\n\n'.join(self.err_msg_list))
            del_index_list = []
            for index, thread in enumerate(self.thread_list):
                if not thread.is_alive():
                    del_index_list.append(index)
            del_index_list.reverse()
            for index in del_index_list:
                del self.thread_list[index]
            if self.rollback_id_list:
                # Restore the schedule times of areas whose thread failed
                while self.rollback_id_list:
                    db_result = SubAreaList.query.filter(
                        SubAreaList.id == self.rollback_id_list[0]).first()
                    db_result.crawler_time = self.time_dict[
                        'old_crawler_time'][self.rollback_id_list[0]]
                    db_result.next_time = self.time_dict['old_next_time'][
                        self.rollback_id_list[0]]
                    db.session.add(db_result)
                    del self.time_dict['old_crawler_time'][self.rollback_id_list[0]]
                    del self.time_dict['old_next_time'][self.rollback_id_list[0]]
                    del self.rollback_id_list[0]
                db.session.commit()
            db_result = SubAreaList.query.filter(
                SubAreaList.enable == 1, SubAreaList.web == 'myships',
                or_(SubAreaList.next_time <= datetime.now(),
                    SubAreaList.next_time == None),
                or_(*[
                    SubAreaList.area_list_id == id
                    for id in self.crawl_span_dict.keys()
                ])).order_by(sqlalchemy.func.field(*self.query_sort_conds),
                             asc(SubAreaList.next_time), func.random()).first()
            if not db_result:
                if self.thread_list:
                    for thread in self.thread_list:
                        thread.join()
                    continue
                print('{}: done'.format(self.script_name))
                return
            print('{}: crawling area {}'.format(self.script_name, db_result.id))
            crawler_time = datetime.now() - timedelta(
                minutes=datetime.now().minute %
                self.crawl_span_dict[db_result.area_list_id])
            self.time_dict['old_crawler_time'][db_result.id] = deepcopy(
                db_result.crawler_time)
            self.time_dict['old_next_time'][db_result.id] = deepcopy(
                db_result.next_time)
            db_result.crawler_time = datetime.strptime(
                crawler_time.strftime('%Y-%m-%d %H:%M:00'), '%Y-%m-%d %H:%M:%S')
            db_result.next_time = db_result.crawler_time + timedelta(
                minutes=self.crawl_span_dict[db_result.area_list_id])
            db.session.add(db_result)
            db.session.commit()
            if db_result.lu_lat == db_result.rd_lat \
                    or db_result.lu_lng == db_result.rd_lng:
                continue
            self.mc.set_cookies_list([
                x.cookies for x in MyshipsAccount.query.filter(
                    MyshipsAccount.enable == 1,
                    MyshipsAccount.updating == 0,
                    MyshipsAccount.updated_time >=
                    (datetime.now() - timedelta(hours=1))).all()
            ])
            thread = threading.Thread(target=self.myships_thread,
                                      args=(db_result.json(), ),
                                      daemon=True)
            thread.start()
            time.sleep(2)
            self.thread_list.append(thread)
            while [x.is_alive() for x in self.thread_list
                   ].count(True) >= self.thread_max_count:
                continue
        if self.err_msg_list:
            self.err_msg_list = list(set(self.err_msg_list))
            ggg = GmailSender(
                'Ship crawler {} finished, but with some errors along the way'.format(
                    self.script_name),
                app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                '\n\n'.join(self.err_msg_list))
            ggg.send_email()
    except Exception:
        msg = '\n\n'.join([
            'ip: {}'.format(get_external_ip()),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        self.err_msg_list.append(msg)
        self.err_msg_list = list(set(self.err_msg_list))
        ggg = GmailSender('Ship crawler error - {}'.format(self.script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                          '\n\n'.join(self.err_msg_list))
        ggg.send_email()
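# Sketch of the scheduling arithmetic repeated across these crawlers: when an
# area is picked up, crawler_time is "now" truncated down to the area's
# crawl_span grid (in minutes), and next_time is one span later.
def _demo_schedule_times():
    from datetime import datetime, timedelta

    def schedule_times(now, crawl_span):
        crawler_time = (now - timedelta(minutes=now.minute % crawl_span)
                        ).replace(second=0, microsecond=0)
        return crawler_time, crawler_time + timedelta(minutes=crawl_span)

    print(schedule_times(datetime(2020, 12, 7, 10, 52, 45), 30))
    # (datetime(2020, 12, 7, 10, 30), datetime(2020, 12, 7, 11, 0))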
def myships_thread(self, db_result_dict):
    try:
        batch_load_list = []
        try:
            area_result = self.mc.area_info(
                min([db_result_dict['lu_lat'], db_result_dict['rd_lat']]),
                min([db_result_dict['lu_lng'], db_result_dict['rd_lng']]),
                max([db_result_dict['lu_lat'], db_result_dict['rd_lat']]),
                max([db_result_dict['lu_lng'], db_result_dict['rd_lng']]))
        except Exception:
            self.thread_error_count += 1
            self.rollback_id_list.append(db_result_dict['id'])
            msg = '\n\n'.join([
                'ip: {}'.format(get_external_ip()),
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                '{}\n{}'.format(
                    'Error fetching area ship data; check whether the III-side '
                    'network or the myships site is at fault',
                    traceback.format_exc()),
            ])
            print(msg)
            self.err_msg_list.append(msg)
            # ggg = GmailSender('Ship crawler error - {}'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            # ggg.send_email()
            return
        if area_result['code'] != '0':
            self.thread_error_count += 1
            msg = '\n\n'.join([
                'ip: {}'.format(get_external_ip()),
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                '\n'.join([
                    'Error fetching myships area ship data',
                    '{}'.format(area_result),
                    'id :{}'.format(db_result_dict['id']),
                    'area_list_id :{}'.format(db_result_dict['area_list_id']),
                    '{}'.format([[db_result_dict['lu_lat'], db_result_dict['lu_lng']],
                                 [db_result_dict['rd_lat'], db_result_dict['rd_lng']]])
                ])
            ])
            print(msg)
            self.err_msg_list.append(msg)
            # ggg = GmailSender('Ship crawler error - {}'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            # ggg.send_email()
            return

        # Drop records without a usable MMSI ('m')
        tmp_area_data_list = area_result.pop('data')
        area_result['data'] = []
        for area_data in tmp_area_data_list:
            if not area_data.get('m') \
                    or area_data['m'] == '0':
                continue
            area_result['data'].append(area_data)
        # Skip the area if it has no ship data at all
        if not area_result['data']:
            print('{}: Skip Area {}'.format(self.script_name, db_result_dict['id']))
            return
        else:
            print('{}: area {} has {} ships'.format(
                self.script_name, db_result_dict['id'], len(area_result['data'])))

        id_list = []
        ship_data_dict = self.mc.ship_info(
            [area_data['i'] for area_data in area_result['data']])
        for area_data in area_result['data']:
            if area_data['i'] not in ship_data_dict:
                print(area_data['i'])
                continue
            id_list.append('{}_{}'.format(area_data['m'], area_data['t']))
        if id_list:
            es_ship_ids = set([
                data['_id'] for data in self.es.scan(
                    {'query': {'bool': {'must': [{'terms': {'_id': id_list}}]}}},
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'])
            ])
        else:
            es_ship_ids = set()

        for area_data in area_result['data']:
            if area_data['i'] not in ship_data_dict:
                print(area_data['i'])
                continue
            # posTime in the ship detail data is sometimes null; in that case
            # use the timestamp from the area ship data ('t') instead
            ship_data = ship_data_dict[area_data['i']]
            try:
                dictionary = {
                    '_index':
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                    '_type': '_doc',
                    '_id': '{}_{}'.format(area_data['m'], area_data['t']),
                    '_routing':
                    '{}'.format(
                        (datetime.utcfromtimestamp(area_data['t']) +
                         timedelta(hours=8)).year) if area_data['t'] else None,
                    'updatetime':
                    area_data['updatetime'] if area_data.get('updatetime') else
                    datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'eta_timestamp': ship_data['eta'],
                    'eta': area_data['r'],
                    'time':
                    (datetime.utcfromtimestamp(area_data['t']) +
                     timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'),
                    'y': area_data['y']
                }
            except Exception:
                self.thread_error_count += 1
                msg = '\n'.join([
                    traceback.format_exc(),
                    '{}'.format(area_data),
                    '{}'.format(ship_data)
                ])
                print(msg)
                self.err_msg_list.append(msg)
                ggg = GmailSender(
                    'Ship crawler data anomaly - {}'.format(self.script_name),
                    app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
                ggg.send_email()
                continue
            if dictionary['_id'] in es_ship_ids:
                continue
            try:
                dictionary['v'] = int(area_data['v'])
            except Exception:
                dictionary['v'] = None
            if not dictionary['eta_timestamp']:
                dictionary['eta_timestamp'] = None
                dictionary['eta_datetime'] = None
            else:
                dictionary['eta_datetime'] = (
                    datetime.utcfromtimestamp(dictionary['eta_timestamp']) +
                    timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
            for source_key, new_key in format_batch_load_dict.items():
                if source_key in area_data:
                    dictionary[new_key] = area_data[source_key]
                elif source_key in ship_data:
                    dictionary[new_key] = ship_data[source_key]
            dictionary['shipid'] = '{}'.format(dictionary['shipid'])
            for key, divisor in format_data_content.items():
                if dictionary.get(key):
                    dictionary[key] = round(dictionary[key] / divisor, 6)
            # Strip whitespace from string values;
            # empty strings and the literal 'NULL' become None
            for key in list(dictionary.keys()):
                if type(dictionary[key]) is not str:
                    continue
                dictionary[key] = dictionary[key].strip()
                if not dictionary[key] or dictionary[key] == 'NULL':
                    dictionary[key] = None
            for key in ['navistatus', 'rot', 'type', 'y']:
                if dictionary.get(key) and type(dictionary[key]) is not int:
                    dictionary[key] = int(dictionary[key])
            if dictionary['type'] not in self.ship_type_dict:
                sts_db_result = ShipTypeMyships.query.filter(
                    ShipTypeMyships.type == dictionary['type']).first()
                if sts_db_result:
                    self.ship_type_dict[sts_db_result.type] = sts_db_result.name
                else:
                    self.ship_type_dict[dictionary['type']] = None
            dictionary['type_text'] = self.ship_type_dict[dictionary['type']]
            if dictionary['navistatus'] not in self.navistatus_type_dict:
                nt_db_result = NavistatusTypeMyships.query.filter(
                    NavistatusTypeMyships.type == dictionary['navistatus']).first()
                if nt_db_result:
                    self.navistatus_type_dict[nt_db_result.type] = nt_db_result.name
                else:
                    self.navistatus_type_dict[dictionary['navistatus']] = None
            dictionary['navistatus_text'] = self.navistatus_type_dict[
                dictionary['navistatus']]
            batch_load_list.append(dictionary)
        if batch_load_list:
            self.es.batch_load(batch_load_list)
    except Exception:
        self.thread_error_count += 1
        msg = traceback.format_exc()
        print(msg)
        self.err_msg_list.append(msg)
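# Sketch of the timestamp handling shared by both crawlers: position times
# arrive as UTC epoch seconds and are stored as UTC+8 local time, and the
# ES _routing key is the year of that local time.
def _demo_time_and_routing():
    from datetime import datetime, timedelta

    pos_time = 1607309565  # illustrative epoch seconds
    local = datetime.utcfromtimestamp(pos_time) + timedelta(hours=8)
    doc_time = local.strftime('%Y-%m-%d %H:%M:%S')
    routing = '{}'.format(local.year)
    print(doc_time, routing)  # 2020-12-07 10:52:45 2020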
def ship_account_login_func():
    script_name = os.path.basename(__file__)
    try:
        # Check whether the same scheduled job is still running on this machine
        if check_same_process_still_running(script_name):
            # Two or more instances of this schedule (including this one) are running
            print('{}: the same schedule is still running ({})'.format(script_name, 1))
            return
        # Example cookie set:
        # {"SERVERID": "ce54c768aca7be22386d8a7ce24ecdae|1596424213|1596424207", ".UserAuth2": "DC41F8152480DF00C47C6EA6666546EFD31E197BB9DCD08A6D47A60FE95B8D15A5963DA3196A79A31CD4DCABD9D15BC24D1D3B6AA57B108A9BF1C4350DAA0A12D2352E089006D4B5B285875C837BBC8A26E5069E1657CE48636716B1A820826E4AA7D4DF86AC7AA714B37C615B2A49AC245CB0FFEC011405D9F3F22085AC55D998184EF5", "FD857C2AF68165D4": "vyH3H7apfudRFtw8Dvd2z9dXvgh6/nhEsXkCr3rtsqflVSiS4EwmTNvclp+SBvVC", "ASP.NET_SessionId": "rp0u0ognzh3qzci3mnfdvjoi"}
        sc = ShipXY_Crawler()
        conds_list = [
            # Log in newly added accounts
            [ShipxyAccount.enable == None],
            # If the re-login schedule crashed mid-update, `updating` stays at 1,
            # so retry accounts whose update failed
            [
                ShipxyAccount.updating == 1,
                ShipxyAccount.updated_time <= (datetime.now() - timedelta(minutes=30))
            ],
            # Re-login accounts that logged in more than 23 hours ago; the Shipxy
            # crawler only uses accounts that logged in within a day, so refresh
            # by hour 22 at the latest
            [
                ShipxyAccount.enable == 1,
                ShipxyAccount.updating == 0,
                or_(
                    ShipxyAccount.updated_time <= (datetime.now() - timedelta(hours=22)),
                    ShipxyAccount.updated_time == None)
            ]
        ]
        account_be_banned = []
        for conds in conds_list:
            while True:
                db.session.rollback()
                db.session.close()
                db_result = ShipxyAccount.query.filter(*conds).order_by(
                    func.random()).first()
                if not db_result:
                    break
                db_result.updating = 1
                db.session.add(db_result)
                db.session.commit()
                db_result.updating = 0
                db_result.cookies = sc.login(db_result.account,
                                             db_result.password,
                                             db_result.cookies)
                if '.UserAuth2' not in db_result.cookies \
                        or 'FD857C2AF68165D4' not in db_result.cookies:
                    db_result.enable = 0
                    account_be_banned.append(db_result.account)
                else:
                    db_result.enable = 1
                db.session.add(db_result)
                db.session.commit()
        account_be_banned = list(set(account_be_banned))
        account_be_banned.sort()
        if account_be_banned:
            msg_list = [
                'ip: {}'.format(get_external_ip()),
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                'Some accounts were banned; the banned accounts are listed below',
                '\n'.join(account_be_banned)
            ]
            ggg = GmailSender('Ship crawler status notice - {}'.format(script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(msg_list))
            ggg.send_email()
    except Exception:
        msg_list = [
            'ip: {}'.format(get_external_ip()),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ]
        print('\n\n'.join(msg_list))
        ggg = GmailSender('Ship crawler error - {}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                          '\n\n'.join(msg_list))
        ggg.send_email()
def myships_crawler_func(self):
    print(datetime.now())
    try:
        self.ip = get_external_ip()
    except Exception:
        self.ip = None
    try:
        # Check whether the same scheduled job is still running on this machine
        if check_same_process_still_running(self.script_name):
            # Two or more instances of this schedule (including this one) are running
            print('{}: the same schedule is still running ({})'.format(self.script_name, 1))
            return
        if not self.ip:
            raise Exception('Unable to obtain IP')
        try:
            rsp = requests.get(
                app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'],
                timeout=60)
            rsp.close()
        except Exception:
            raise Exception('Unable to connect to the myships site:\n{}'.format(
                traceback.format_exc()))
        if rsp.status_code != 200:
            raise Exception('The myships site did not respond correctly:\n{}'.format(rsp.text))
        try:
            rsp = requests.get(
                'http://{}:{}/'.format(
                    'localhost', app.config['ES_SETTING']['CONNECTION']['PORT']),
                auth=HTTPBasicAuth(
                    app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                    app.config['ES_SETTING']['CONNECTION']['PASSWORD']))
            rsp.close()
        except Exception:
            raise Exception(traceback.format_exc())
        if rsp.status_code != 200:
            raise Exception('Unable to connect to the III ES host')

        self.es = Elastic(
            host=app.config['ES_SETTING']['CONNECTION']['HOST'],
            port=app.config['ES_SETTING']['CONNECTION']['PORT'],
            username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
            password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
        if not self.es.check_index_exist(
                app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME']):
            print(
                self.es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['MAPPING_FILEPATH']))

        self.ship_type_dict = {
            x.type: x.name
            for x in ShipTypeMyships.query.all()
        }
        self.navistatus_type_dict = {
            x.type: x.name
            for x in NavistatusTypeMyships.query.all()
        }
        self.mmsi_dict = {}
        for db_result in MMSI_Info.query.with_entities(
                MMSI_Info.mmsi, MMSI_Info.alpha_2, MMSI_Info.alpha_3).all():
            self.mmsi_dict[db_result.mmsi] = (
                db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2)

        print('Checking account login status')
        account_login_timestamp = time.time()
        account_login_span = 1800
        try:
            ship_account_login_func()
            self.cookies_list = ([
                x.cookies for x in MyshipsAccount.query.filter(
                    MyshipsAccount.enable == 1,
                    MyshipsAccount.updating == 0,
                    MyshipsAccount.updated_time >=
                    (datetime.now() - timedelta(hours=1))).all()
            ])
            if not self.cookies_list:
                self.cookies_list.append({})
        except Exception:
            print('Account login failed')
            print(traceback.format_exc())
            self.cookies_list = [{}]

        # start_n = deepcopy(4000000 + self.machine_serial)
        start_n = deepcopy(self.machine_serial)
        # start_n = 1660000
        while True:
            # Stop once the hour is almost over (HH:59:30)
            if datetime.now().minute >= 59 \
                    and datetime.now().second > 30:
                return
            print(start_n)
            if (time.time() - account_login_timestamp) >= account_login_span:
                print(f'More than {account_login_span} seconds since the accounts '
                      'last logged in; waiting for all threads to finish and '
                      're-logging in before continuing')
                for thread in self.thread_list:
                    thread.join()
                print('Re-logging in the accounts')
                account_login_timestamp = time.time()
                try:
                    ship_account_login_func()
                    self.cookies_list = ([
                        x.cookies for x in MyshipsAccount.query.filter(
                            MyshipsAccount.enable == 1,
                            MyshipsAccount.updating == 0,
                            MyshipsAccount.updated_time >=
                            (datetime.now() - timedelta(hours=1))).all()
                    ])
                    if not self.cookies_list:
                        self.cookies_list.append({})
                except Exception:
                    print('Account login failed')
                    print(traceback.format_exc())
                    self.cookies_list = [{}]
            end_n = start_n + 1000 * self.machine_count
            shipId_list = [
                f'{i}' for i in range(start_n, end_n, self.machine_count)
            ]
            start_n = deepcopy(end_n)
            t1 = time.time()
            thread = threading.Thread(target=self.get_ship_detail,
                                      args=(shipId_list, ),
                                      daemon=True)
            thread.start()
            self.thread_list.append(thread)
            # Pace thread creation to roughly one per second
            thread_sleep_time = 1 - (time.time() - t1)
            if thread_sleep_time > 0:
                time.sleep(thread_sleep_time)
            # for thread in self.thread_list:
            #     thread.join()
            # pprint(self.ship_detail_dict)
            # if self.ship_detail_dict:
            #     self.save2es()
            # pprint(self.ship_detail_dict)
            # return
            while [thread.is_alive() for thread in self.thread_list
                   ].count(True) >= self.thread_max_count:
                continue
            delete_index_list = []
            for index, thread in enumerate(self.thread_list):
                if not thread.is_alive():
                    delete_index_list.append(index)
            delete_index_list.reverse()
            for index in delete_index_list:
                del self.thread_list[index]
            if self.err_count >= self.err_count_max:
                raise Exception('\n\n'.join(self.err_msg_list))
            if self.no_data_count >= self.no_data_count_max:
                break
            if self.ship_detail_dict and not self.save2es_thread.is_alive():
                self.save2es_thread = threading.Thread(target=self.save2es,
                                                       daemon=True)
                self.save2es_thread.start()

        print('Crawl finished; waiting for threads to end')
        for thread in self.thread_list:
            thread.join()
        print('Threads ended; saving the remaining data to ES')
        while self.save2es_thread.is_alive():
            continue
        if self.ship_detail_dict:
            self.save2es()
        print('Done')
        print(datetime.now())
        exit()
    except Exception:
        msg = '\n\n'.join([
            'ip: {}'.format(self.ip),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        self.err_msg_list.append(msg)
        self.err_msg_list = list(set(self.err_msg_list))
        print('\n\n'.join(self.err_msg_list))
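# Sketch of the once-per-second thread pacing used above: measure how long
# the spawn took and sleep off the remainder of the interval so threads
# start at most once per second.
def _demo_paced_spawn():
    import time

    def paced(spawn, interval=1.0):
        t1 = time.time()
        spawn()
        remaining = interval - (time.time() - t1)
        if remaining > 0:
            time.sleep(remaining)

    paced(lambda: print('spawned'))  # the next spawn may start ~1s after this one began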