示例#1
0
    def get_shipxy_thread(self, db_result_dict_for_func):
        """Crawl one shipxy.com sub-area and bulk-load the ships into ES.

        Intended to run as a worker thread.  ``db_result_dict_for_func`` is a
        dict built from one ``SubAreaList`` row; the keys used here are
        ``id``, ``lu_lat``/``lu_lng`` and ``rd_lat``/``rd_lng`` (presumably
        upper-left / lower-right corners of the bounding rectangle — confirm
        against the DB schema).  On failure the row is queued in
        ``self.db_rollback_dict`` so the main loop can roll its schedule back
        and retry.  Any unexpected error is reported by e-mail.
        """
        try:
            # data_token_result = self.sc.getareashipssimple(db_result_dict_for_func['coor_1'], db_result_dict_for_func['coor_2'])
            # Fetch the simple ship list for the area's bounding rectangle.
            data_token_result = self.sc.getareashipssimple([
                db_result_dict_for_func['lu_lat'],
                db_result_dict_for_func['lu_lng']
            ], [
                db_result_dict_for_func['rd_lat'],
                db_result_dict_for_func['rd_lng']
            ])
        except:
            print(traceback.format_exc())
            # Queue this sub-area for rollback so it will be re-crawled later.
            self.db_rollback_dict[
                db_result_dict_for_func['id']] = db_result_dict_for_func
        if db_result_dict_for_func['id'] in self.db_rollback_dict:
            return
        try:
            # Pace requests so each account averages at most ~1 query/second.
            time.sleep(self.gernal_sleep_time)
            if db_result_dict_for_func['id'] in self.db_rollback_dict:
                return
            batch_load_list = []

            # Non-zero status or missing payload: flag the whole crawler run
            # as failed so the main loop aborts.
            if data_token_result['status']!=0 \
            or 'data' not in data_token_result:
                self.crawler_status = False
                self.error_msg_list.append(
                    '{}: 取得data token失敗, 船訊網 API 回傳 {}'.format(
                        self.script_name, data_token_result))
                return

            if not data_token_result['count']:
                print('{}: 區域 {} 內無船隻資料'.format(self.script_name,
                                                db_result_dict_for_func['id']))
                return

            # Keep only ships with a usable (non-zero, non-empty) MMSI.
            area_result_list = []
            area_data_list = self.sc.area_info(data_token_result['data'])
            for area_data in area_data_list:
                if not area_data.get('mmsi') \
                or area_data['mmsi']==0 \
                or area_data['mmsi']=='0':
                    continue
                area_result_list.append(area_data)

            # if len(area_data_list)!=len(area_result_list):
            #     print(
            #         '\n'.join(
            #             [
            #                 '-'*20,
            #                 '{}/{}'.format(len(area_data_list), len(area_result_list)),
            #                 '{}, {}'.format(db_result_dict_for_func['lu_lat'], db_result_dict_for_func['lu_lng']),
            #                 '{}, {}'.format(db_result_dict_for_func['rd_lat'], db_result_dict_for_func['rd_lng']),
            #                 json.dumps(area_data_list, ensure_ascii=False, indent=4),
            #                 '-'*20
            #             ]
            #         )
            #     )

            if not area_result_list:
                print('{}: 區域 {} 內無可爬的船隻資料'.format(
                    self.script_name, db_result_dict_for_func['id']))
                self.es.batch_load(batch_load_list)
                return

            gsdc = get_ship_detail_class(self.sc)
            thread_start_time = time.time()

            for index, area_data in enumerate(area_result_list):
                # Stop mechanism for crawls that run too long.
                if time_to_stop():
                    break
                try:
                    # Each account must wait at least 1 second between
                    # requests to avoid getting locked: busy-wait while the
                    # number of live detail threads exceeds the per-area
                    # quota (total detail budget divided by live area
                    # threads + 1).
                    while [
                            x.is_alive()
                            for x in gsdc.get_ship_detail_thread_list
                    ].count(True) >= math.floor(
                        (self.get_shipxy_thread_limit_tmp *
                         self.get_ship_detail_quantity_limit) /
                        ([x.is_alive()
                          for x in self.get_shipxy_thread_list].count(True) +
                         1)):
                        # Stop mechanism for crawls that run too long.
                        if time_to_stop():
                            break
                        continue
                except:
                    # Surface the raw numbers behind the quota computation
                    # above to ease debugging (e.g. a zero divisor).
                    lll = [
                        traceback.format_exc(),
                        '{}'.format([
                            x.is_alive()
                            for x in gsdc.get_ship_detail_thread_list
                        ].count(True)),
                        '{}'.format(self.get_shipxy_thread_limit_tmp),
                        '{}'.format(self.get_ship_detail_quantity_limit),
                        '{}'.format([
                            x.is_alive() for x in self.get_shipxy_thread_list
                        ].count(True)),
                    ]
                    raise Exception('\n'.join(lll))
                # Prune finished detail threads (delete back-to-front so the
                # remaining indices stay valid).
                remove_index_list = []
                for index, thread in enumerate(
                        gsdc.get_ship_detail_thread_list):
                    if not thread.is_alive():
                        remove_index_list.append(index)
                remove_index_list.reverse()
                for index in remove_index_list:
                    del (gsdc.get_ship_detail_thread_list[index])
                # Spawn one detail-fetch thread per ship.
                thread = threading.Thread(target=gsdc.get_ship_detail,
                                          args=(area_data, ),
                                          daemon=True)
                thread.start()
                gsdc.get_ship_detail_thread_list.append(thread)
                time.sleep(self.gernal_sleep_time)

            # Wait for all detail threads to finish (busy-wait).
            while [x.is_alive()
                   for x in gsdc.get_ship_detail_thread_list].count(True):
                # Stop mechanism for crawls that run too long.
                if time_to_stop():
                    break
                continue

            print('爬取區域: {}, 耗費時間: {}, 船隻數量: {}'.format(
                db_result_dict_for_func['id'],
                round((time.time() - thread_start_time), 1),
                len(area_result_list)))

            # Stop mechanism for crawls that run too long.
            if not time_to_stop():
                # Large area (>=100 ships) with zero detail results: fatal.
                if len(area_result_list
                       ) >= 100 and not gsdc.thread_result_dict:
                    self.crawler_status = False
                    self.error_msg_list.append('\n'.join(
                        list(set(gsdc.error_msg_list))))
                    return
                # Large area with <80% detail coverage: roll back and retry.
                elif len(area_result_list) >= 100 and int(
                        len(area_result_list) * 0.8) > len(
                            list(gsdc.thread_result_dict.keys())):
                    self.db_rollback_dict[db_result_dict_for_func[
                        'id']] = db_result_dict_for_func
                    self.error_msg_list.append('\n'.join(
                        list(set(gsdc.error_msg_list))))
                    return

            # Build the candidate ES document ids ("<mmsi>_<lastdyn>") and
            # look up which already exist, so duplicates are skipped below.
            id_list = []
            for area_data in area_result_list:
                if area_data['mmsi'] not in gsdc.thread_result_dict:
                    continue
                id_list.append('{}_{}'.format(
                    area_data['mmsi'],
                    gsdc.thread_result_dict[area_data['mmsi']]['lastdyn']))
            if id_list:
                es_ship_ids = set([
                    data['_id'] for data in self.es.scan(
                        {
                            'query': {
                                'bool': {
                                    'must': [{
                                        'terms': {
                                            '_id': id_list
                                        }
                                    }]
                                }
                            }
                        }, app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']
                        ['INDEX_NAME'])
                ])
            else:
                es_ship_ids = set()

            # print(
            #     '\n'.join(
            #         [
            #             'area_result_list len: {}'.format(len(area_result_list)),
            #             'id_list len: {}'.format(len(id_list)),
            #             'es_ship_ids len: {}'.format(len(list(es_ship_ids))),
            #             'area_result_list-id_list= {}'.format(len(area_result_list)-len(id_list)),
            #             'id_list-es_ship_ids= {}'.format(len(id_list)-len(list(es_ship_ids)))
            #         ]
            #     )
            # )

            # NOTE(review): delete_list is never populated in this method, so
            # the delete branch near the end is currently dead code.
            delete_list = []
            for index, area_data in enumerate(area_result_list):
                if area_data['mmsi'] not in gsdc.thread_result_dict:
                    print('{} : 未取得船隻 {} 之詳細資訊,略過之'.format(
                        self.script_name, area_data['mmsi']))
                    continue
                dictionary = deepcopy(
                    gsdc.thread_result_dict[area_data['mmsi']])
                dictionary['latitude'] = area_data['lat']  # latitude
                dictionary['longitude'] = area_data['lng']  # longitude

                # NOTE(review): the deepcopy below re-creates `dictionary`,
                # discarding the copy (and the lat/lng just set) above — the
                # four lines above look like dead code; lat/lng are set again
                # further down.
                dictionary = deepcopy(
                    gsdc.thread_result_dict[area_data['mmsi']])
                dictionary['_index'] = app.config['ES_SETTING']['INDEX_INFO'][
                    'SHIPXY']['INDEX_NAME']
                dictionary['_type'] = '_doc'
                dictionary['_id'] = '{}_{}'.format(area_data['mmsi'],
                                                   dictionary['lastdyn'])

                # Skip documents that already exist in ES.
                if dictionary['_id'] in es_ship_ids:
                    continue

                # dictionary['area_list_id'] = db_result_dict_for_func['area_list_id']

                # First 3 MMSI digits are the MID, which maps to a flag country.
                dictionary['nationality'] = self.mmsi_dict[
                    dictionary['mmsi']
                    [:3]] if dictionary['mmsi'][:3] in self.mmsi_dict else None

                dictionary['cog'] = dictionary['cog'] / 100  # course over ground
                dictionary['draught'] = dictionary['draught'] / 1000  # draught
                dictionary['hdg'] = dictionary['hdg'] / 100  # heading
                # # shipxy.com may report hdg as 51100 while the website shows heading 0
                # if dictionary['hdg']>360:
                #     dictionary['hdg'] = 0
                # Drop raw lat/lon; the area-scan coordinates are used instead.
                for key in ['lat', 'lon']:
                    dictionary.pop(key)
                dictionary['latitude'] = area_data['lat']  # latitude
                dictionary['longitude'] = area_data['lng']  # longitude
                dictionary['sog'] = round(dictionary['sog'] / 5133 * 10,
                                          2)  # speed in knots
                dictionary['length'] = dictionary['length'] / 10  # ship length
                dictionary['lineWidth'] = area_data['lineWidth']
                dictionary['width'] = dictionary['width'] / 10  # ship width

                dictionary['lastdyn_active'] = area_data[
                    'lastdyn_active']  # whether data can still be fetched
                dictionary['offset'] = area_data['offset']
                dictionary['rot'] = area_data.get('rot')
                dictionary['rotate'] = area_data['rotate']
                dictionary['shiptype'] = area_data['shiptype']
                dictionary['state'] = area_data['state']
                dictionary['state_color'] = area_data['state_color']
                dictionary['istop'] = area_data['istop']
                dictionary['tracks'] = area_data['tracks']

                # Simplified -> traditional Chinese name.
                dictionary['tcname'] = s2tw_converter(dictionary['cnname'])

                dictionary['utc_timestamp'] = dictionary.pop('lastdyn')

                # Epoch seconds -> UTC+8 local time string.
                dictionary['time'] = (
                    datetime.utcfromtimestamp(dictionary['utc_timestamp']) +
                    timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')

                # Lazily cache ship-type names from the DB.
                if dictionary['type'] not in self.ship_type_dict:
                    sts_db_result = ShipTypeShipxy.query.filter(
                        ShipTypeShipxy.type == dictionary['type']).first()
                    if sts_db_result:
                        self.ship_type_dict[
                            sts_db_result.type] = sts_db_result.name
                    else:
                        self.ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = self.ship_type_dict[
                    dictionary['type']]

                # Lazily cache navigation-status names from the DB.
                if dictionary['navistatus'] not in self.navistatus_type_dict:
                    nt_db_result = NavistatusTypeShipxy.query.filter(
                        NavistatusTypeShipxy.type ==
                        dictionary['navistatus']).first()
                    if nt_db_result:
                        self.navistatus_type_dict[
                            nt_db_result.type] = nt_db_result.name
                    else:
                        self.navistatus_type_dict[
                            dictionary['navistatus']] = None
                dictionary['navistatus_text'] = self.navistatus_type_dict[
                    dictionary['navistatus']]

                # Route documents by the (UTC+8) year of the record.
                dictionary['_routing'] = '{}'.format(
                    (datetime.utcfromtimestamp(dictionary['utc_timestamp']) +
                     timedelta(hours=8)).year)

                # dictionary['updatetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                # Strip surrounding whitespace from every string value;
                # empty strings (or the literal 'NULL') become None.
                for key in list(dictionary.keys()):
                    if type(dictionary[key]) is not str:
                        continue
                    dictionary[key] = dictionary[key].strip()
                    if not dictionary[key] or dictionary[key] == 'NULL':
                        dictionary[key] = None

                batch_load_list.append(dictionary)
            if delete_list:
                # Multiple machines may crawl the same area; if another
                # machine already deleted the data, this delete would fail —
                # so failures are ignored.
                try:
                    self.es.delete_data(delete_list)
                except:
                    pass
            if batch_load_list:
                self.es.batch_load(batch_load_list)
            del (delete_list, batch_load_list)
        except:
            # Report any unexpected failure by e-mail.
            msg_list = [
                'ip: {}'.format(get_external_ip()),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ]
            print('\n\n'.join(msg_list))
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(msg_list))
            ggg.send_email()
示例#2
0
def ship_account_login_func():
    """Log in myships accounts that need a fresh session and save cookies.

    Processes three batches in order: (1) accounts never logged in,
    (2) accounts stuck mid-update for over 30 minutes (a previous run
    crashed), and (3) enabled accounts whose login is close to expiring.
    A successful login stores the session cookies on the account row; a
    failed one disables the account.  Banned accounts are reported by
    e-mail, as is any unexpected failure.
    """
    script_name = os.path.basename(__file__)
    try:
        # Skip when another instance of this same schedule is already
        # running on this machine.
        if check_same_process_still_running(script_name):
            # i.e. at least two identical schedules (incl. this one) running
            print('{}: 有相同排程尚在執行({})'.format(script_name, 1))
            return

        # Stagger start-up per host (SERIAL env var, seconds) so multiple
        # machines don't race for the same single account.
        # BUGFIX: os.environ.get() returns a str (or None), which
        # time.sleep() rejects — the original call always raised TypeError
        # into the silent except, so the stagger never happened.
        try:
            time.sleep(float(os.environ.get('SERIAL') or 0))
        except Exception:
            pass

        # Obfuscated login-signature JS shipped with the crawler settings.
        with open(app.config['CRAWLER_SETTING']['MYSHIPS']
                  ['JS_DEMIX_FILEPATH'], 'r') as f:
            js_content = f.read()

        conds_list = [
            # Newly added accounts that have never been logged in.
            [
                MyshipsAccount.enable == None
            ],

            # If a re-login schedule crashed half-way, `updating` stays 1;
            # retry accounts stuck in that state for over 30 minutes.
            [
                MyshipsAccount.updating == 1,
                MyshipsAccount.updated_time <= (datetime.now() -
                                                timedelta(minutes=30))
            ],

            # Refresh logins before they expire: the crawler only uses
            # accounts logged in within the last day, so re-login early.
            [
                MyshipsAccount.enable == 1,
                MyshipsAccount.updating == 0,
                or_(
                    MyshipsAccount.updated_time <= (datetime.now() -
                                                    timedelta(minutes=30)),
                    MyshipsAccount.updated_time == None
                )
            ]
        ]
        headers = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
            'Connection':'keep-alive'
        }
        err_msg_list = []
        account_be_banned = []
        for conds in conds_list:
            while True:
                db.session.rollback()
                db.session.close()
                # Pick one matching account at random and flag it as
                # updating so other hosts skip it.
                db_result = MyshipsAccount.query.filter(*conds).order_by(
                    func.random()).first()
                if not db_result:
                    break
                db_result.updating = 1
                db.session.add(db_result)
                db.session.commit()

                db_result.updating = 0
                session = requests.Session()

                # Produces e.g. {'time': 1607309565213, 'pwdSign': '50aa7d92baa11df578ad254e252928c8'}
                complie_result = execjs_compile(js_content).call('shipencode')
                complie_result['user'] = db_result.account

                try:
                    rsp = session.post(
                        app.config['CRAWLER_SETTING']['MYSHIPS']['LOGIN'],
                        headers=headers, json=complie_result, timeout=180)
                    rsp.close()
                except Exception:
                    raise Exception(traceback.format_exc())
                try:
                    rsp_result = rsp.json()
                except Exception:
                    raise Exception(rsp.text)
                if rsp_result['code'] == '0':
                    # code '0' is success: persist the session cookies.
                    # NOTE(review): the response is also appended to
                    # err_msg_list — presumably to include raw responses in
                    # the notification mail; confirm intent.
                    pprint(rsp_result)
                    err_msg_list.append(f"{rsp_result}")
                    db_result.cookies = deepcopy(session.cookies.get_dict())
                    db_result.enable = 1
                else:
                    db_result.enable = 0
                    account_be_banned.append(db_result.account)
                db.session.add(db_result)
                db.session.commit()
        account_be_banned = sorted(set(account_be_banned))
        if account_be_banned:
            msg_list = [
                'ip: {}'.format(get_external_ip()),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                '有帳號被封鎖,下述為被封鎖之帳號',
                '\n'.join(account_be_banned),
                '\n'.join(list(set(err_msg_list)))
                ]
            msg = '\n\n'.join(msg_list)
            print(msg)
            ggg = GmailSender('船隻爬蟲狀況通知-{}'.format(script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
    except Exception:
        # Report any unexpected failure by e-mail.
        msg_list = [
            'ip: {}'.format(get_external_ip()),
            '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ]
        print('\n\n'.join(msg_list))
        ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                          '\n\n'.join(msg_list))
        ggg.send_email()
    def myships_crawler_func(self):
        """Main myships (寶船網) crawling loop.

        Walks ship ids in strides of ``self.machine_count`` (offset by this
        machine's serial) in batches of 1000 ids, fetching details in worker
        threads and periodically flushing collected records to
        Elasticsearch.  Stops in the last ~30 seconds of each hour or after
        too many empty batches; too many errors raise and trigger the
        e-mail report.
        """
        try:
            self.ip = get_external_ip()
        except Exception:
            self.ip = None
        try:
            # Skip when another instance of this schedule is already running
            # on this machine.
            if check_same_process_still_running(self.script_name):
                # i.e. at least two identical schedules (incl. this one) running
                print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
                return
            if not self.ip:
                raise Exception('無法取得 IP')
            # Sanity-check that the myships site is reachable before starting.
            try:
                rsp = requests.get(
                    app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'],
                    timeout=60)
                rsp.close()
            except Exception:
                raise Exception('無法連線至寶船網網頁 :\n{}'.format(
                    traceback.format_exc()))
            if rsp.status_code != 200:
                raise Exception('寶船網網頁無法正確連線 :\n{}'.format(rsp.text))

            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])

            # Create the target index on first run.
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['MYSHIPS']['MAPPING_FILEPATH']))

            # Lookup tables loaded once up-front.
            self.ship_type_dict = {
                x.type: x.name
                for x in ShipTypeMyships.query.all()
            }
            self.navistatus_type_dict = {
                x.type: x.name
                for x in NavistatusTypeMyships.query.all()
            }
            # MMSI prefix (MID) -> country code, preferring alpha-3.
            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2,
                    MMSI_Info.alpha_3).all():
                self.mmsi_dict[db_result.mmsi] = (
                    db_result.alpha_3
                    if db_result.alpha_3 else db_result.alpha_2)

            # start_n = 4000000 + self.machine_serial
            start_n = self.machine_serial
            while True:
                # BUGFIX: was `minute > 59`, which can never be true
                # (minute is 0-59), so the end-of-hour stop never fired.
                # Stop during the last ~30 seconds of the hour.
                now = datetime.now()
                if now.minute >= 59 and now.second > 30:
                    return
                print(start_n)
                # Each machine takes every machine_count-th id; 1000 ids
                # per batch.
                end_n = start_n + 1000 * self.machine_count
                shipId_list = [
                    f'{i}' for i in range(start_n, end_n, self.machine_count)
                ]
                start_n = end_n

                thread = threading.Thread(target=self.get_ship_detail,
                                          args=(shipId_list, ),
                                          daemon=True)
                thread.start()
                self.thread_list.append(thread)

                # Throttle: wait for a worker slot to free up.  Sleep
                # briefly instead of busy-spinning so we don't peg a core.
                while [thread.is_alive() for thread in self.thread_list
                       ].count(True) >= self.thread_max_count:
                    time.sleep(0.1)
                # Prune finished threads (delete back-to-front so the
                # remaining indices stay valid).
                delete_index_list = [
                    index for index, thread in enumerate(self.thread_list)
                    if not thread.is_alive()
                ]
                for index in reversed(delete_index_list):
                    del self.thread_list[index]
                if self.err_count >= self.err_count_max:
                    raise Exception('\n\n'.join(self.err_msg_list))
                if self.no_data_count >= self.no_data_count_max:
                    break
                if self.ship_detail_dict:
                    self.save2es()
                time.sleep(1)
            # Drain remaining workers and flush anything still collected.
            for thread in self.thread_list:
                thread.join()
            if self.ship_detail_dict:
                self.save2es()
        except Exception:
            # Report any unexpected failure by e-mail.
            msg = '\n\n'.join([
                'ip: {}'.format(self.ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            print('\n\n'.join(self.err_msg_list))
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(self.err_msg_list))
            ggg.send_email()
示例#4
0
    def shipxy_crawler_func(self):
        """Main shipxy.com crawling loop.

        Dispatches one ``get_shipxy_thread`` worker per sub-area that is due
        for crawling, throttling by the number of usable accounts and the
        number of crawler machines seen in the last hour.  Failed sub-areas
        (collected in ``self.db_rollback_dict``) have their schedule rolled
        back so they are retried.  Any failure is reported by e-mail.
        """
        try:
            ip = get_external_ip()
        except:
            ip = '取得IP失敗'
        try:
            # Skip when another instance of this schedule is still running
            # on this machine.
            if check_same_process_still_running(self.script_name):
                # i.e. at least two identical schedules (incl. this one) running
                print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
                return
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
            # line_notify_pusher(msg)
            return

        try:
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
            # Create the target index on first run.
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['SHIPXY']['MAPPING_FILEPATH']))

            # Area schedules, ordered by crawl interval (minutes).
            db_result_list = AreaList.query.with_entities(
                AreaList.id,
                AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
            if not db_result_list:
                print('{}: 無區域的排程區間資料'.format(self.script_name))
                return
            crawl_span_dict = {
                db_result.id: db_result.crawl_span
                for db_result in db_result_list
            }
            # Args for SQL FIELD() ordering: shorter crawl spans first.
            query_sort_conds = [SubAreaList.area_list_id]
            query_sort_conds.extend([x.id for x in db_result_list])

            # "Cold zone" area ids (name contains 冷區).
            self.cold_zone_ids = set([
                db_result.id for db_result in AreaList.query.filter(
                    AreaList.enable == 1, AreaList.name.like('%冷區%')).all()
            ])

            # MMSI prefix (MID) -> country code, preferring alpha-3.
            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2,
                    MMSI_Info.alpha_3).all():
                self.mmsi_dict[
                    db_result.
                    mmsi] = db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2

            self.sc = ShipXY_Crawler()

            # Collect cookies of accounts logged in within the last day.
            cookies_list = []
            for db_result in ShipxyAccount.query.filter(
                    ShipxyAccount.enable == 1, ShipxyAccount.updating == 0,
                    ShipxyAccount.updated_time >=
                (datetime.now() - timedelta(days=1))).all():
                if not db_result.cookies:
                    continue
                cookies_list.append(deepcopy(db_result.cookies))
            if not cookies_list:
                raise Exception('{}: 無可用之帳號'.format(self.script_name))
            self.sc.update_cookies_list(cookies_list)

            del (cookies_list)

            while True:
                if not self.crawler_status:
                    raise Exception('\n'.join(list(set(self.error_msg_list))))
                # Busy-wait while the number of live area threads is at the
                # current limit.
                elif self.get_shipxy_thread_list and [
                        x.is_alive() for x in self.get_shipxy_thread_list
                ].count(True) >= self.get_shipxy_thread_limit_tmp:
                    continue
                # Prune finished area threads (delete back-to-front so the
                # remaining indices stay valid).
                remove_index_list = []
                for index, thread in enumerate(self.get_shipxy_thread_list):
                    if not thread.is_alive():
                        remove_index_list.append(index)
                remove_index_list.reverse()
                for index in remove_index_list:
                    del (self.get_shipxy_thread_list[index])

                # Refresh the SERVERID cookie timestamps so the sticky
                # sessions stay valid.
                cookies_list = []
                for cookies in self.sc.cookies_list:
                    if 'SERVERID' in cookies:
                        SERVERID_list = cookies['SERVERID'].split('|')
                        SERVERID_list[1] = '{}'.format(time.time())
                        SERVERID_list[2] = '{}'.format(time.time())
                        cookies['SERVERID'] = '|'.join(SERVERID_list)
                    cookies_list.append(cookies)
                self.sc.update_cookies_list(cookies_list)
                del (cookies_list)

                # Heartbeat: register this machine so all machines can count
                # how many crawlers were active in the last hour.
                db_result = CrawlerMachine.query.filter(
                    CrawlerMachine.ip == ip).first()
                if not db_result:
                    db_result = CrawlerMachine(ip=ip)
                db_result.updatedAt = datetime.now()
                db.session.add(db_result)
                db.session.commit()

                machine_quantity = CrawlerMachine.query.filter(
                    CrawlerMachine.updatedAt >= (datetime.now() -
                                                 timedelta(hours=1))).count()
                if not machine_quantity:
                    machine_quantity += 1
                # Each account may only query one area per second on average,
                # to avoid being locked.  Original formula: (1 second /
                # (usable accounts / machine count)) - (time elapsed this
                # round).
                # NOTE(review): the code multiplies by 1.5 and does not
                # subtract elapsed time, unlike the stated formula — confirm
                # which is intended.
                self.gernal_sleep_time = (
                    1 / len(self.sc.cookies_list)) * machine_quantity * 1.5

                self.get_shipxy_thread_limit_tmp = (math.floor(
                    (len(self.sc.cookies_list) / machine_quantity) /
                    self.get_ship_detail_quantity_limit))
                if not self.get_shipxy_thread_limit_tmp:
                    raise Exception('\n'.join([
                        '{}: 帳號總數量未達可爬取之帳號最小數量\n'.format(self.script_name),
                        '最小數量定義的算式為\n',
                        '可用帳號之數量({}) 除以 機器總數({}) 除以 每個 thread 取得船隻詳細資料的子程序上限值({}) 後取最小值整數'
                        .format(len(self.sc.cookies_list), machine_quantity,
                                self.get_ship_detail_quantity_limit)
                    ]))

                # Restore the schedule of sub-areas whose crawl failed, so
                # they get picked up again.
                if self.db_rollback_dict:
                    for db_result_id in list(self.db_rollback_dict.keys()):
                        db_result = SubAreaList.query.filter(
                            SubAreaList.id == db_result_id).first()
                        db_result.crawler_time = self.db_rollback_dict[
                            db_result_id]['crawler_time']
                        db_result.next_time = self.db_rollback_dict[
                            db_result_id]['next_time']
                        db.session.add(db_result)
                        del (self.db_rollback_dict[db_result_id])
                    db.session.commit()

                # Pick the next due sub-area: shorter crawl spans first, then
                # earliest next_time, random tie-break.
                db_result = SubAreaList.query.filter(
                    SubAreaList.enable == 1, SubAreaList.web == 'shipxy',
                    or_(SubAreaList.next_time <= datetime.now(),
                        SubAreaList.next_time == None),
                    or_(*[
                        SubAreaList.area_list_id == id
                        for id in crawl_span_dict.keys()
                    ])).order_by(sqlalchemy.func.field(*query_sort_conds),
                                 asc(SubAreaList.next_time),
                                 func.random()).first()

                if not db_result:
                    if [x.is_alive()
                            for x in self.get_shipxy_thread_list].count(True):
                        print(
                            '{}: 無需要爬取的區域, 等待仍在執行的的區域爬取子程序結束中,如果所有子程序執行結束且無任何需爬取的區域,程式將會結束'
                            .format(self.script_name))
                        while [
                                x.is_alive()
                                for x in self.get_shipxy_thread_list
                        ].count(True):
                            # If it is time for some area to be re-crawled,
                            # go back to crawling.
                            if not datetime.now().minute \
                            or datetime.now().minute in crawl_span_dict.values():
                                break
                        # `continue` instead of `return`: if the workers
                        # finish right past minute 0/30, new areas may be
                        # due again.
                        continue
                    else:
                        print('{}: 無需要爬取的區域, 程式結束, 時間: {}'.format(
                            self.script_name, datetime.now()))
                    return

                get_shipxy_thread_input = deepcopy(db_result.json())

                # Look up this area's crawl interval if not already cached.
                if db_result.area_list_id not in crawl_span_dict:
                    crawl_span_dict[
                        db_result.area_list_id] = AreaList.query.filter(
                            AreaList.id ==
                            db_result.area_list_id).first().crawl_span
                # Snap crawler_time down to the current crawl-span slot and
                # schedule the next run one span later.
                crawler_time = datetime.now() - timedelta(
                    minutes=datetime.now().minute %
                    crawl_span_dict[db_result.area_list_id])
                db_result.crawler_time = datetime.strptime(
                    crawler_time.strftime('%Y-%m-%d %H:%M:00'),
                    '%Y-%m-%d %H:%M:%S')
                db_result.next_time = db_result.crawler_time + timedelta(
                    minutes=crawl_span_dict[db_result.area_list_id])
                db.session.add(db_result)
                db.session.commit()

                # Skip degenerate (zero-width / zero-height) rectangles.
                if db_result.lu_lat==db_result.rd_lat \
                or db_result.lu_lng==db_result.rd_lng:
                    continue

                db.session.rollback()
                db.session.close()

                thread = threading.Thread(target=self.get_shipxy_thread,
                                          args=(get_shipxy_thread_input, ),
                                          daemon=True)
                thread.start()
                self.get_shipxy_thread_list.append(thread)

                # ###############################
                # for thread in self.get_shipxy_thread_list:
                #     thread.join()
                # return
                # ###############################
        except:
            # Report any unexpected failure by e-mail.
            msg = '\n\n'.join([
                'ip: {}'.format(ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
示例#5
0
def myships_crawler_func():
    """Crawl ship positions from the myships site for every due sub-area.

    For each enabled ``SubAreaList`` row whose ``next_time`` has passed,
    fetch the area's ships, enrich them with per-ship detail, normalize
    each record, and bulk-load the new ones (deduplicated by ES ``_id``)
    into Elasticsearch. Failures are printed and reported by email via
    ``GmailSender``; the loop then moves on to the next area.
    """
    script_name = os.path.basename(__file__)
    # Skip this run if the same scheduled script is already executing on
    # this machine (two identical processes would race on the DB rows).
    if check_same_process_still_running(script_name):
        print('{}: 有相同排程尚在執行({})'.format(script_name, 1))
        return
    try:
        es = Elastic(
            host=app.config['ES_SETTING']['CONNECTION']['HOST'],
            port=app.config['ES_SETTING']['CONNECTION']['PORT'],
            username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
            password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])

        # Create the target index (with its mapping) on first use.
        if not es.check_index_exist(app.config['ES_SETTING']['INDEX_INFO']
                                    ['MYSHIPS']['INDEX_NAME']):
            print(
                es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                    ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                    ['MYSHIPS']['MAPPING_FILEPATH']))

        db_result_list = AreaList.query.with_entities(
            AreaList.id,
            AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
        if not db_result_list:
            print('{}: 無區域的排程區間資料'.format(script_name))
            return
        # area_list_id -> crawl interval in minutes.
        crawl_span_dict = {
            db_result.id: db_result.crawl_span
            for db_result in db_result_list
        }
        # FIELD() ordering so areas with the shortest crawl_span come first.
        query_sort_conds = [SubAreaList.area_list_id]
        query_sort_conds.extend([x.id for x in db_result_list])

        # Lookup tables mapping numeric codes to display names; missing
        # codes are resolved lazily from the DB inside the loop below.
        ship_type_dict = {x.type: x.name for x in ShipTypeMyships.query.all()}
        navistatus_type_dict = {
            x.type: x.name
            for x in NavistatusTypeMyships.query.all()
        }

        mc = Myships_Crawler()
        while True:
            # Start each iteration from a clean session state.
            db.session.rollback()
            db.session.close()
            batch_load_list = []

            # Pick the next due sub-area; random() breaks ties so parallel
            # runs spread over areas instead of colliding on one row.
            db_result = SubAreaList.query.filter(
                SubAreaList.enable == 1, SubAreaList.web == 'myships',
                or_(SubAreaList.next_time <= datetime.now(),
                    SubAreaList.next_time == None),
                or_(*[
                    SubAreaList.area_list_id == id
                    for id in crawl_span_dict.keys()
                ])).order_by(sqlalchemy.func.field(*query_sort_conds),
                             asc(SubAreaList.next_time),
                             func.random()).first()
            if not db_result:
                print('{}: 完成'.format(script_name))
                return
            print('{}: 爬取區域 {} 中'.format(script_name, db_result.id))
            # Align crawler_time to the area's crawl-span boundary, then
            # schedule the next crawl one span later.
            crawler_time = datetime.now() - timedelta(
                minutes=datetime.now().minute %
                crawl_span_dict[db_result.area_list_id])
            # Keep the previous timestamps so a crawl failure can roll back.
            old_crawler_time = deepcopy(db_result.crawler_time)
            old_next_time = deepcopy(db_result.next_time)
            db_result.crawler_time = datetime.strptime(
                crawler_time.strftime('%Y-%m-%d %H:%M:00'),
                '%Y-%m-%d %H:%M:%S')
            db_result.next_time = db_result.crawler_time + timedelta(
                minutes=crawl_span_dict[db_result.area_list_id])
            db.session.add(db_result)
            db.session.commit()

            # Degenerate (zero-area) rectangles cannot contain ships.
            if db_result.lu_lat==db_result.rd_lat \
            or db_result.lu_lng==db_result.rd_lng:
                continue

            ma_cookies_list = []
            try:
                # Query with the bounding box normalized to
                # (min lat, min lng, max lat, max lng).
                area_result = mc.area_info(
                    min([db_result.lu_lat, db_result.rd_lat]),
                    min([db_result.lu_lng, db_result.rd_lng]),
                    max([db_result.lu_lat, db_result.rd_lat]),
                    max([db_result.lu_lng, db_result.rd_lng]), ma_cookies_list)
            except:
                # Crawl failed: restore the schedule so the area is retried.
                db_result.crawler_time = old_crawler_time
                db_result.next_time = old_next_time
                db.session.add(db_result)
                db.session.commit()

                msg = '\n\n'.join([
                    'ip: {}'.format(get_external_ip()),
                    '時間: {}'.format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    '{}\n{}'.format('取得區域船隻資料出現錯誤,請檢查是資策會端網路出現錯誤還是寶船網網站異常',
                                    traceback.format_exc()),
                ])
                print(msg)
                ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name),
                                  app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                                  msg)
                ggg.send_email()
                continue

            # Non-zero code means the site rejected the request; report the
            # exact query (including the site's coordinate encoding) by mail.
            if area_result['code'] != '0':
                msg = '\n\n'.join([
                    'ip: {}'.format(get_external_ip()), '時間: {}'.format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    '\n'.join([
                        '取得寶船網區域船隻資料時出現錯誤', '{}'.format(area_result),
                        'id :{}'.format(db_result.id),
                        'area_list_id :{}'.format(db_result.area_list_id),
                        '{}'.format({
                            'age':
                            1440,
                            'rgn':
                            mc.check_trans2myships_coord(
                                [[
                                    min([db_result.lu_lat, db_result.rd_lat]),
                                    min([db_result.lu_lng, db_result.rd_lng])
                                ],
                                 [
                                     max([db_result.lu_lat, db_result.rd_lat]),
                                     max([db_result.lu_lng, db_result.rd_lng])
                                 ]])
                        }), '{}'.format([[db_result.lu_lat, db_result.lu_lng],
                                         [db_result.rd_lat, db_result.rd_lng]])
                    ])
                ])
                print(msg)
                ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name),
                                  app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                                  msg)
                ggg.send_email()
                continue

            # Drop records without a usable MMSI ('m').
            tmp_area_data_list = area_result.pop('data')
            area_result['data'] = []

            for area_data in tmp_area_data_list:
                if not area_data.get('m') \
                or area_data['m']=='0':
                    continue
                area_result['data'].append(area_data)

            # 該區域沒有任何船隻資料的話,略過
            if not area_result['data']:
                print('{}: Skip Area {}'.format(script_name, db_result.id))
                continue
            else:
                print('{}: 區域 {} 有 {} 艘船隻'.format(script_name, db_result.id,
                                                  len(area_result['data'])))

            # Build the candidate document ids (mmsi_posTime) and look up
            # which of them already exist in ES, to skip duplicates.
            id_list = []
            ship_data_dict = mc.ship_info(
                [area_data['i'] for area_data in area_result['data']])
            for area_data in area_result['data']:
                if area_data['i'] not in ship_data_dict:
                    print(area_data['i'])
                    continue
                id_list.append('{}_{}'.format(
                    area_data['m'], ship_data_dict[area_data['i']]['posTime']))
            if id_list:
                es_ship_ids = set([
                    data['_id'] for data in es.scan(
                        {
                            'query': {
                                'bool': {
                                    'must': [{
                                        'terms': {
                                            '_id': id_list
                                        }
                                    }]
                                }
                            }
                        }, app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'])
                ])
            else:
                es_ship_ids = set()

            for area_data in area_result['data']:
                if area_data['i'] not in ship_data_dict:
                    print(area_data['i'])
                    continue
                ship_data = ship_data_dict[area_data['i']]
                try:
                    # posTime is epoch seconds; +8h converts UTC to the
                    # site's local time, and the year is used for routing.
                    dictionary = {
                        '_index':
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'],
                        '_type':
                        '_doc',
                        '_id':
                        '{}_{}'.format(area_data['m'], ship_data['posTime']),
                        '_routing':
                        '{}'.format(
                            (datetime.utcfromtimestamp(ship_data['posTime']) +
                             timedelta(hours=8)).year)
                        if ship_data['posTime'] else None,
                        'updatetime':
                        ship_data['updatetime'] if ship_data.get('updatetime')
                        else datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'eta_timestamp':
                        ship_data['eta'],
                        'eta':
                        area_data['r'],
                        'time':
                        (datetime.utcfromtimestamp(ship_data['posTime']) +
                         timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'),
                        'y':
                        area_data['y']
                    }
                except:
                    msg = '\n'.join([
                        traceback.format_exc(), '{}'.format(area_data),
                        '{}'.format(ship_data)
                    ])
                    ggg = GmailSender(
                        '船隻爬蟲出現錯誤-{}'.format(script_name),
                        app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
                    ggg.send_email()
                    continue

                # Already stored in ES; skip the duplicate.
                if dictionary['_id'] in es_ship_ids:
                    continue

                try:
                    dictionary['v'] = int(area_data['v'])
                except:
                    dictionary['v'] = None

                if not dictionary['eta_timestamp']:
                    dictionary['eta_timestamp'] = None
                    dictionary['eta_datetime'] = None
                else:
                    dictionary['eta_datetime'] = (
                        datetime.utcfromtimestamp(dictionary['eta_timestamp'])
                        + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')

                # Map raw source keys to document field names; the area
                # record wins over the per-ship record when both have a key.
                for source_key, new_key in format_batch_load_dict.items():
                    if source_key in area_data:
                        dictionary[new_key] = area_data[source_key]
                    elif source_key in ship_data:
                        dictionary[new_key] = ship_data[source_key]

                dictionary['shipid'] = '{}'.format(dictionary['shipid'])

                # Scale fixed-point fields (e.g. coordinates) into decimals.
                for key, divisor in format_data_content.items():
                    if dictionary.get(key):
                        dictionary[key] = round(dictionary[key] / divisor, 6)

                # Normalize strings: strip whitespace, map ''/'NULL' -> None.
                for key in list(dictionary.keys()):
                    if type(dictionary[key]) is not str:
                        continue
                    dictionary[key] = dictionary[key].strip()
                    if not dictionary[key] or dictionary[key] == 'NULL':
                        dictionary[key] = None
                for key in ['navistatus', 'rot', 'type', 'y']:
                    # Fix: the original tested `type(dictionary[key] is not
                    # int)`, i.e. type(bool) — always truthy. The intended
                    # check is on the value's own type.
                    if dictionary.get(key) and type(
                            dictionary[key]) is not int:
                        dictionary[key] = int(dictionary[key])

                # Resolve unknown ship-type codes from the DB (cache miss),
                # falling back to None so the lookup never raises.
                if dictionary['type'] not in ship_type_dict:
                    sts_db_result = ShipTypeMyships.query.filter(
                        ShipTypeMyships.type == dictionary['type']).first()
                    if sts_db_result:
                        ship_type_dict[sts_db_result.type] = sts_db_result.name
                    else:
                        ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = ship_type_dict[dictionary['type']]

                # Same lazy resolution for navigation-status codes.
                if dictionary['navistatus'] not in navistatus_type_dict:
                    nt_db_result = NavistatusTypeMyships.query.filter(
                        NavistatusTypeMyships.type ==
                        dictionary['navistatus']).first()
                    if nt_db_result:
                        navistatus_type_dict[
                            nt_db_result.type] = nt_db_result.name
                    else:
                        navistatus_type_dict[dictionary['navistatus']] = None
                dictionary['navistatus_text'] = navistatus_type_dict[
                    dictionary['navistatus']]

                batch_load_list.append(dictionary)

            if batch_load_list:
                es.batch_load(batch_load_list)
    except:
        msg = '\n\n'.join([
            'ip: {}'.format(get_external_ip()),
            '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
        ggg.send_email()
    def myships_crawler_func(self):
        """Dispatch ``myships_thread`` workers for every due sub-area.

        Repeatedly picks the next due ``SubAreaList`` row, stamps its
        ``crawler_time``/``next_time``, and hands the row (as a dict) to a
        daemon worker thread. Stops near the end of the hour, raises when
        too many thread errors accumulate, and — once no area is due and
        all workers have finished — mails any accumulated partial errors.
        """
        # Skip this run if the same scheduled script is already executing
        # on this machine.
        if check_same_process_still_running(self.script_name):
            print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
            return
        try:
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])

            # Create the target index (with its mapping) on first use.
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['MYSHIPS']['MAPPING_FILEPATH']))

            db_result_list = AreaList.query.with_entities(
                AreaList.id,
                AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
            if not db_result_list:
                print('{}: 無區域的排程區間資料'.format(self.script_name))
                return
            # area_list_id -> crawl interval in minutes.
            self.crawl_span_dict = {
                db_result.id: db_result.crawl_span
                for db_result in db_result_list
            }
            # FIELD() ordering: shortest crawl_span areas come first.
            self.query_sort_conds = [SubAreaList.area_list_id]
            self.query_sort_conds.extend([x.id for x in db_result_list])

            self.ship_type_dict = {
                x.type: x.name
                for x in ShipTypeMyships.query.all()
            }
            self.navistatus_type_dict = {
                x.type: x.name
                for x in NavistatusTypeMyships.query.all()
            }

            self.mc = Myships_Crawler()
            while True:
                # Bail out near the top of the hour so the next scheduled
                # run starts from a clean slate.
                if datetime.now().minute>57 \
                and datetime.now().second>30:
                    return
                db.session.rollback()
                db.session.close()

                # Too many worker failures: surface them as one exception.
                if self.thread_error_count >= self.thread_error_count_max:
                    self.err_msg_list = list(set(self.err_msg_list))
                    raise Exception('\n\n'.join(self.err_msg_list))

                # Reap finished worker threads (delete from the end so the
                # remaining indices stay valid).
                del_index_list = []
                for index, thread in enumerate(self.thread_list):
                    if not thread.is_alive():
                        del_index_list.append(index)
                del_index_list.reverse()
                for index in del_index_list:
                    del (self.thread_list[index])

                # Restore the schedule of areas whose worker failed, so
                # they get retried.
                if self.rollback_id_list:
                    while self.rollback_id_list:
                        db_result = SubAreaList.query.filter(
                            SubAreaList.id ==
                            self.rollback_id_list[0]).first()
                        db_result.crawler_time = self.time_dict[
                            'old_crawler_time'][self.rollback_id_list[0]]
                        db_result.next_time = self.time_dict['old_next_time'][
                            self.rollback_id_list[0]]
                        db.session.add(db_result)
                        del (self.time_dict['old_crawler_time'][
                            self.rollback_id_list[0]])
                        del (self.time_dict['old_next_time'][
                            self.rollback_id_list[0]])
                        del (self.rollback_id_list[0])
                    db.session.commit()

                # Pick the next due sub-area; random() breaks ties.
                db_result = SubAreaList.query.filter(
                    SubAreaList.enable == 1, SubAreaList.web == 'myships',
                    or_(SubAreaList.next_time <= datetime.now(),
                        SubAreaList.next_time == None),
                    or_(*[
                        SubAreaList.area_list_id == id
                        for id in self.crawl_span_dict.keys()
                    ])).order_by(sqlalchemy.func.field(*self.query_sort_conds),
                                 asc(SubAreaList.next_time),
                                 func.random()).first()
                if not db_result:
                    # Nothing due: wait for in-flight workers, then re-check
                    # (a failed worker may have re-queued its area).
                    if self.thread_list:
                        for thread in self.thread_list:
                            thread.join()
                        continue
                    print('{}: 完成'.format(self.script_name))
                    # Fix: was `return`, which made the partial-error email
                    # below unreachable; break so it runs on completion.
                    break
                print('{}: 爬取區域 {} 中'.format(self.script_name, db_result.id))
                # Align crawler_time to the crawl-span boundary and keep the
                # old timestamps so a worker failure can roll them back.
                crawler_time = datetime.now() - timedelta(
                    minutes=datetime.now().minute %
                    self.crawl_span_dict[db_result.area_list_id])
                self.time_dict['old_crawler_time'][db_result.id] = deepcopy(
                    db_result.crawler_time)
                self.time_dict['old_next_time'][db_result.id] = deepcopy(
                    db_result.next_time)
                db_result.crawler_time = datetime.strptime(
                    crawler_time.strftime('%Y-%m-%d %H:%M:00'),
                    '%Y-%m-%d %H:%M:%S')
                db_result.next_time = db_result.crawler_time + timedelta(
                    minutes=self.crawl_span_dict[db_result.area_list_id])
                db.session.add(db_result)
                db.session.commit()

                # Degenerate (zero-area) rectangles cannot contain ships.
                if db_result.lu_lat==db_result.rd_lat \
                or db_result.lu_lng==db_result.rd_lng:
                    continue

                # Only use account cookies refreshed within the last hour
                # and not currently being re-logged-in.
                self.mc.set_cookies_list([
                    x.cookies for x in MyshipsAccount.query.filter(
                        MyshipsAccount.enable == 1, MyshipsAccount.updating ==
                        0, MyshipsAccount.updated_time >= (
                            datetime.now() - timedelta(hours=1))).all()
                ])

                thread = threading.Thread(target=self.myships_thread,
                                          args=(db_result.json(), ),
                                          daemon=True)
                thread.start()
                time.sleep(2)
                self.thread_list.append(thread)
                while [x.is_alive() for x in self.thread_list
                       ].count(True) >= self.thread_max_count:
                    # Fix: was a bare `continue` busy-wait that pinned a
                    # CPU core; yield while the thread pool is full.
                    time.sleep(0.5)

            # Reached via `break` above: run finished, but some workers may
            # have logged non-fatal errors — report them once, deduplicated.
            if self.err_msg_list:
                self.err_msg_list = list(set(self.err_msg_list))
                ggg = GmailSender(
                    '船隻爬蟲 {} 執行完成,但途中有部份錯誤'.format(self.script_name),
                    app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                    '\n\n'.join(self.err_msg_list))
                ggg.send_email()
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(get_external_ip()),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(self.err_msg_list))
            ggg.send_email()
    def myships_thread(self, db_result_dict):
        """Worker: crawl one sub-area, normalize its ships, bulk-load to ES.

        ``db_result_dict`` is a ``SubAreaList`` row serialized to a dict
        (``id``, ``area_list_id``, ``lu_lat``/``lu_lng``/``rd_lat``/``rd_lng``).
        On a crawl failure the area id is queued on ``self.rollback_id_list``
        so the dispatcher restores its schedule; errors are accumulated in
        ``self.err_msg_list`` and counted in ``self.thread_error_count``.
        """
        try:
            batch_load_list = []
            try:
                # Query with the bounding box normalized to
                # (min lat, min lng, max lat, max lng).
                area_result = self.mc.area_info(
                    min([db_result_dict['lu_lat'], db_result_dict['rd_lat']]),
                    min([db_result_dict['lu_lng'], db_result_dict['rd_lng']]),
                    max([db_result_dict['lu_lat'], db_result_dict['rd_lat']]),
                    max([db_result_dict['lu_lng'], db_result_dict['rd_lng']]))
            except:
                self.thread_error_count += 1
                # Let the dispatcher restore this area's schedule.
                self.rollback_id_list.append(db_result_dict['id'])
                msg = '\n\n'.join([
                    'ip: {}'.format(get_external_ip()),
                    '時間: {}'.format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    '{}\n{}'.format('取得區域船隻資料出現錯誤,請檢查是資策會端網路出現錯誤還是寶船網網站異常',
                                    traceback.format_exc()),
                ])
                print(msg)
                # Per-thread emailing is intentionally disabled; errors are
                # aggregated in err_msg_list and mailed by the dispatcher.
                self.err_msg_list.append(msg)
                return

            # Non-zero code means the site rejected the request.
            if area_result['code'] != '0':
                self.thread_error_count += 1
                msg = '\n\n'.join([
                    'ip: {}'.format(get_external_ip()), '時間: {}'.format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    '\n'.join([
                        '取得寶船網區域船隻資料時出現錯誤', '{}'.format(area_result),
                        'id :{}'.format(db_result_dict['id']),
                        'area_list_id :{}'.format(
                            db_result_dict['area_list_id']),
                        '{}'.format([[
                            db_result_dict['lu_lat'], db_result_dict['lu_lng']
                        ], [
                            db_result_dict['rd_lat'], db_result_dict['rd_lng']
                        ]])
                    ])
                ])
                print(msg)
                # Aggregated for the dispatcher's summary email (see above).
                self.err_msg_list.append(msg)
                return

            # Drop records without a usable MMSI ('m').
            tmp_area_data_list = area_result.pop('data')
            area_result['data'] = []

            for area_data in tmp_area_data_list:
                if not area_data.get('m') \
                or area_data['m']=='0':
                    continue
                area_result['data'].append(area_data)

            # 該區域沒有任何船隻資料的話,略過
            if not area_result['data']:
                print('{}: Skip Area {}'.format(self.script_name,
                                                db_result_dict['id']))
                return
            else:
                print('{}: 區域 {} 有 {} 艘船隻'.format(self.script_name,
                                                  db_result_dict['id'],
                                                  len(area_result['data'])))

            # Build candidate document ids (mmsi_timestamp) and find which
            # already exist in ES so duplicates can be skipped.
            id_list = []
            ship_data_dict = self.mc.ship_info(
                [area_data['i'] for area_data in area_result['data']])
            for area_data in area_result['data']:
                if area_data['i'] not in ship_data_dict:
                    print(area_data['i'])
                    continue
                id_list.append('{}_{}'.format(area_data['m'], area_data['t']))
            if id_list:
                es_ship_ids = set([
                    data['_id'] for data in self.es.scan(
                        {
                            'query': {
                                'bool': {
                                    'must': [{
                                        'terms': {
                                            '_id': id_list
                                        }
                                    }]
                                }
                            }
                        }, app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'])
                ])
            else:
                es_ship_ids = set()

            for area_data in area_result['data']:
                if area_data['i'] not in ship_data_dict:
                    print(area_data['i'])
                    continue
                # The ship-detail posTime is sometimes Null, so the area
                # record's timestamp ('t') is used for ids and times here.
                ship_data = ship_data_dict[area_data['i']]
                try:
                    # 't' is epoch seconds; +8h converts UTC to the site's
                    # local time, and the year is used for ES routing.
                    dictionary = {
                        '_index':
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'],
                        '_type':
                        '_doc',
                        '_id':
                        '{}_{}'.format(area_data['m'], area_data['t']),
                        '_routing':
                        '{}'.format(
                            (datetime.utcfromtimestamp(area_data['t']) +
                             timedelta(hours=8)).year)
                        if area_data['t'] else None,
                        'updatetime':
                        area_data['updatetime'] if area_data.get('updatetime')
                        else datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'eta_timestamp':
                        ship_data['eta'],
                        'eta':
                        area_data['r'],
                        'time':
                        (datetime.utcfromtimestamp(area_data['t']) +
                         timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'),
                        'y':
                        area_data['y']
                    }
                except:
                    self.thread_error_count += 1
                    msg = '\n'.join([
                        traceback.format_exc(), '{}'.format(area_data),
                        '{}'.format(ship_data)
                    ])
                    print(msg)
                    self.err_msg_list.append(msg)
                    ggg = GmailSender(
                        '船隻爬蟲船隻資料出現異常-{}'.format(self.script_name),
                        app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
                    ggg.send_email()
                    continue

                # Already stored in ES; skip the duplicate.
                if dictionary['_id'] in es_ship_ids:
                    continue

                try:
                    dictionary['v'] = int(area_data['v'])
                except:
                    dictionary['v'] = None

                if not dictionary['eta_timestamp']:
                    dictionary['eta_timestamp'] = None
                    dictionary['eta_datetime'] = None
                else:
                    dictionary['eta_datetime'] = (
                        datetime.utcfromtimestamp(dictionary['eta_timestamp'])
                        + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')

                # Map raw source keys to document field names; the area
                # record wins over the per-ship record when both have a key.
                for source_key, new_key in format_batch_load_dict.items():
                    if source_key in area_data:
                        dictionary[new_key] = area_data[source_key]
                    elif source_key in ship_data:
                        dictionary[new_key] = ship_data[source_key]

                dictionary['shipid'] = '{}'.format(dictionary['shipid'])

                # Scale fixed-point fields (e.g. coordinates) into decimals.
                for key, divisor in format_data_content.items():
                    if dictionary.get(key):
                        dictionary[key] = round(dictionary[key] / divisor, 6)

                # Normalize strings: strip whitespace, map ''/'NULL' -> None.
                for key in list(dictionary.keys()):
                    if type(dictionary[key]) is not str:
                        continue
                    dictionary[key] = dictionary[key].strip()
                    if not dictionary[key] or dictionary[key] == 'NULL':
                        dictionary[key] = None
                for key in ['navistatus', 'rot', 'type', 'y']:
                    # Fix: the original tested `type(dictionary[key] is not
                    # int)`, i.e. type(bool) — always truthy. The intended
                    # check is on the value's own type.
                    if dictionary.get(key) and type(
                            dictionary[key]) is not int:
                        dictionary[key] = int(dictionary[key])

                # Resolve unknown ship-type codes from the DB (cache miss),
                # falling back to None so the lookup never raises.
                if dictionary['type'] not in self.ship_type_dict:
                    sts_db_result = ShipTypeMyships.query.filter(
                        ShipTypeMyships.type == dictionary['type']).first()
                    if sts_db_result:
                        self.ship_type_dict[
                            sts_db_result.type] = sts_db_result.name
                    else:
                        self.ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = self.ship_type_dict[
                    dictionary['type']]

                # Same lazy resolution for navigation-status codes.
                if dictionary['navistatus'] not in self.navistatus_type_dict:
                    nt_db_result = NavistatusTypeMyships.query.filter(
                        NavistatusTypeMyships.type ==
                        dictionary['navistatus']).first()
                    if nt_db_result:
                        self.navistatus_type_dict[
                            nt_db_result.type] = nt_db_result.name
                    else:
                        self.navistatus_type_dict[
                            dictionary['navistatus']] = None
                dictionary['navistatus_text'] = self.navistatus_type_dict[
                    dictionary['navistatus']]

                batch_load_list.append(dictionary)
            if batch_load_list:
                self.es.batch_load(batch_load_list)
        except:
            self.thread_error_count += 1
            msg = traceback.format_exc()
            print(msg)
            self.err_msg_list.append(msg)
示例#8
0
def ship_account_login_func():
    """Refresh login cookies for all ShipXY crawler accounts in the DB.

    Selects accounts in three passes (never-logged-in, stuck mid-update,
    last refreshed over 22 hours ago), re-logs each one in through
    ``ShipXY_Crawler.login``, disables accounts whose login no longer
    yields the expected auth cookies, and emails a notification listing
    any banned accounts.  Unexpected errors are also reported by email.
    """
    script_name = os.path.basename(__file__)
    try:
        # Skip if another instance of this scheduled script is already
        # running on this machine.
        if check_same_process_still_running(script_name):
            # Two or more identical schedules (including this process) are running.
            print('{}: 有相同排程尚在執行({})'.format(script_name, 1))
            return

        # Sample cookie payload returned by a successful login:
        # {"SERVERID": "ce54c768aca7be22386d8a7ce24ecdae|1596424213|1596424207", ".UserAuth2": "DC41F8152480DF00C47C6EA6666546EFD31E197BB9DCD08A6D47A60FE95B8D15A5963DA3196A79A31CD4DCABD9D15BC24D1D3B6AA57B108A9BF1C4350DAA0A12D2352E089006D4B5B285875C837BBC8A26E5069E1657CE48636716B1A820826E4AA7D4DF86AC7AA714B37C615B2A49AC245CB0FFEC011405D9F3F22085AC55D998184EF5", "FD857C2AF68165D4": "vyH3H7apfudRFtw8Dvd2z9dXvgh6/nhEsXkCr3rtsqflVSiS4EwmTNvclp+SBvVC", "ASP.NET_SessionId": "rp0u0ognzh3qzci3mnfdvjoi"}

        sc = ShipXY_Crawler()
        conds_list = [
            # Pass 1: newly added accounts that have never been enabled.
            [ShipxyAccount.enable == None],

            # Pass 2: if a previous re-login schedule crashed mid-update,
            # `updating` stays stuck at 1 — retry accounts that have been
            # in that state for more than 30 minutes.
            [
                ShipxyAccount.updating == 1, ShipxyAccount.updated_time <=
                (datetime.now() - timedelta(minutes=30))
            ],

            # Pass 3: re-login accounts last refreshed over 22 hours ago;
            # the ShipXY crawler only uses accounts logged in within the
            # last day, so they must be refreshed by hour 22 at the latest.
            [
                ShipxyAccount.enable == 1, ShipxyAccount.updating == 0,
                or_(
                    ShipxyAccount.updated_time <=
                    (datetime.now() - timedelta(hours=22)),
                    ShipxyAccount.updated_time == None)
            ]
        ]
        account_be_banned = []
        for conds in conds_list:
            while True:
                # Fresh session per pick so this worker sees up-to-date
                # `updating` flags written by other processes.
                db.session.rollback()
                db.session.close()
                db_result = ShipxyAccount.query.filter(*conds).order_by(
                    func.random()).first()
                if not db_result:
                    break
                # Mark the row in-progress before the slow login call so
                # other schedulers do not pick the same account.
                db_result.updating = 1
                db.session.add(db_result)
                db.session.commit()

                db_result.updating = 0
                db_result.cookies = sc.login(db_result.account,
                                             db_result.password,
                                             db_result.cookies)
                # Missing auth cookies means login failed — treat the
                # account as banned and disable it.
                if '.UserAuth2' not in db_result.cookies or 'FD857C2AF68165D4' not in db_result.cookies:
                    db_result.enable = 0
                    account_be_banned.append(db_result.account)
                else:
                    db_result.enable = 1

                db.session.add(db_result)
                db.session.commit()
            # NOTE(review): account_be_banned is never reset between passes,
            # so accounts banned in an earlier pass are re-reported (and the
            # email re-sent) on every later pass — confirm if intended.
            account_be_banned = list(set(account_be_banned))
            account_be_banned.sort()
            if account_be_banned:
                msg_list = [
                    'ip: {}'.format(get_external_ip()), '時間: {}'.format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    '有帳號被封鎖,下述為被封鎖之帳號', '\n'.join(account_be_banned)
                ]
                ggg = GmailSender('船隻爬蟲狀況通知-{}'.format(script_name),
                                  app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                                  '\n\n'.join(msg_list))
                ggg.send_email()
    except:
        # Report any unexpected failure by email with ip/time/traceback.
        msg_list = [
            'ip: {}'.format(get_external_ip()),
            '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ]
        print('\n\n'.join(msg_list))
        ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                          '\n\n'.join(msg_list))
        ggg.send_email()
    def myships_crawler_func(self):
        """Main entry point for the Myships (寶船網) ship-detail crawler.

        Verifies preconditions (no duplicate schedule, external IP,
        site and Elasticsearch reachability), loads lookup tables,
        logs in crawler accounts, then spawns worker threads that fetch
        ship details by id range and bulk-load the results into ES.
        Runs until the no-data threshold is hit or the clock reaches the
        last 30 seconds of the hour; errors are collected and printed.

        Fix: the original hour-end guard tested ``minute > 59``, which can
        never be true (``minute`` is 0-59), so the crawler never yielded
        to the next scheduled run.  It now tests ``minute >= 59`` on a
        single ``datetime.now()`` snapshot (two separate calls could
        straddle a minute boundary).
        """
        print(datetime.now())
        try:
            self.ip = get_external_ip()
        except:
            self.ip = None
        try:
            # Skip if another instance of this scheduled script is
            # already running on this machine.
            if check_same_process_still_running(self.script_name):
                # Two or more identical schedules (including this one) are running.
                print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
                return
            if not self.ip:
                raise (Exception('無法取得 IP'))
            # Sanity-check that the Myships site is reachable.
            try:
                rsp = requests.get(
                    app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'],
                    timeout=60)
                rsp.close()
            except:
                raise Exception('無法連線至寶船網網頁 :\n{}'.format(
                    traceback.format_exc()))
            if rsp.status_code != 200:
                raise Exception('寶船網網頁無法正確連線 :\n{}'.format(rsp.text))

            # Sanity-check that the Elasticsearch host is reachable
            # (probed via localhost on the configured port).
            try:
                rsp = requests.get(
                    'http://{}:{}/'.format(
                        'localhost',
                        app.config['ES_SETTING']['CONNECTION']['PORT']),
                    auth=HTTPBasicAuth(
                        app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                        app.config['ES_SETTING']['CONNECTION']['PASSWORD']))
                rsp.close()
            except:
                raise Exception(traceback.format_exc())
            if rsp.status_code != 200:
                raise Exception('無法連線至資策會 ES 主機')

            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])

            # Create the target index (with its mapping) on first run.
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['MYSHIPS']['MAPPING_FILEPATH']))

            # Lookup tables: ship type code -> name, navistatus code -> name.
            self.ship_type_dict = {
                x.type: x.name
                for x in ShipTypeMyships.query.all()
            }
            self.navistatus_type_dict = {
                x.type: x.name
                for x in NavistatusTypeMyships.query.all()
            }
            # Maps MMSI (presumably the country prefix — confirm against
            # MMSI_Info) to a country code, preferring alpha_3 over alpha_2.
            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2,
                    MMSI_Info.alpha_3).all():
                self.mmsi_dict[
                    db_result.
                    mmsi] = db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2

            print('帳戶檢查登入狀態中')
            # Accounts are re-logged-in every `account_login_span` seconds.
            account_login_timestamp = time.time()
            account_login_span = 1800
            try:
                ship_account_login_func()
                # Only use accounts successfully refreshed within the hour.
                self.cookies_list = ([
                    x.cookies for x in MyshipsAccount.query.filter(
                        MyshipsAccount.enable == 1, MyshipsAccount.updating ==
                        0, MyshipsAccount.updated_time >= (
                            datetime.now() - timedelta(hours=1))).all()
                ])
                if not self.cookies_list:
                    self.cookies_list.append({})
            except:
                print('帳號登入失敗')
                print(traceback.format_exc())
                # Fall back to an anonymous (cookie-less) session.
                self.cookies_list = [{}]

            # Each machine starts at its own serial offset and strides by
            # machine_count, so machines crawl disjoint ship-id sets.
            # start_n = deepcopy(4000000+self.machine_serial)
            start_n = deepcopy(self.machine_serial)
            # start_n = 1660000
            while True:
                # Yield to the next scheduled run during the last ~30
                # seconds of the hour.  (Original tested `minute > 59`,
                # which is never true; see docstring.)
                now = datetime.now()
                if now.minute >= 59 and now.second > 30:
                    return
                print(start_n)

                if (time.time() -
                        account_login_timestamp) >= account_login_span:
                    print(
                        f'帳戶距離上次登入時間超過 {account_login_span} 秒,等待所有 Thread 結束並重新登入後,將繼續執行'
                    )
                    for thread in self.thread_list:
                        thread.join()
                    print('帳戶重新登入中')
                    account_login_timestamp = time.time()
                    try:
                        ship_account_login_func()
                        self.cookies_list = ([
                            x.cookies for x in MyshipsAccount.query.filter(
                                MyshipsAccount.enable == 1, MyshipsAccount.
                                updating == 0, MyshipsAccount.updated_time >=
                                (datetime.now() - timedelta(hours=1))).all()
                        ])
                        if not self.cookies_list:
                            self.cookies_list.append({})
                    except:
                        print('帳號登入失敗')
                        print(traceback.format_exc())
                        self.cookies_list = [{}]

                # Next batch: 1000 ids for this machine, interleaved
                # across machines by stepping machine_count.
                end_n = start_n + 1000 * self.machine_count
                shipId_list = [
                    f'{i}' for i in range(start_n, end_n, self.machine_count)
                ]
                start_n = deepcopy(end_n)

                t1 = time.time()
                thread = threading.Thread(target=self.get_ship_detail,
                                          args=(shipId_list, ),
                                          daemon=True)
                thread.start()
                self.thread_list.append(thread)
                # Throttle thread creation to roughly one per second.
                thread_sleep_time = 1 - (time.time() - t1)
                if thread_sleep_time > 0:
                    time.sleep(thread_sleep_time)

                # for thread in self.thread_list:
                #     thread.join()
                # pprint(self.ship_detail_dict)
                # if self.ship_detail_dict:
                #     self.save2es()
                # pprint(self.ship_detail_dict)
                # return

                # Wait until the live thread count drops below the cap.
                while [thread.is_alive() for thread in self.thread_list
                       ].count(True) >= self.thread_max_count:
                    continue
                # Prune finished threads; delete from the tail so the
                # remaining indices stay valid.
                delete_index_list = []
                for index, thread in enumerate(self.thread_list):
                    if not thread.is_alive():
                        delete_index_list.append(index)
                delete_index_list.reverse()
                for index in delete_index_list:
                    del (self.thread_list[index])
                if self.err_count >= self.err_count_max:
                    raise Exception('\n\n'.join(self.err_msg_list))
                if self.no_data_count >= self.no_data_count_max:
                    break
                # Flush buffered ship details to ES in the background
                # when no flush is already in progress.
                if self.ship_detail_dict and not self.save2es_thread.is_alive(
                ):
                    self.save2es_thread = threading.Thread(target=self.save2es,
                                                           daemon=True)
                    self.save2es_thread.start()
            print('完成爬取,等待 Thread 結束')
            for thread in self.thread_list:
                thread.join()
            print('Thread 結束, 正在將最後剩餘資料存入 ES 中')
            while self.save2es_thread.is_alive():
                continue
            if self.ship_detail_dict:
                self.save2es()
            print('結束')
            print(datetime.now())
            exit()
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(self.ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            print('\n\n'.join(self.err_msg_list))