Example #1
    def shipxy_crawler_func(self):
        try:
            ip = get_external_ip()
        except Exception:
            ip = 'failed to get IP'
        try:
            # Check whether this machine already has the same scheduled job running
            if check_same_process_still_running(self.script_name):
                # Counting this program, two or more identical scheduled jobs are running
                print('{}: identical scheduled job still running ({})'.format(self.script_name, 1))
                return
        except Exception:
            msg = '\n\n'.join([
                'ip: {}'.format(ip),
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            ggg = GmailSender('Ship crawler error-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
            # line_notify_pusher(msg)
            return

        try:
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['SHIPXY']['MAPPING_FILEPATH']))

            db_result_list = AreaList.query.with_entities(
                AreaList.id,
                AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
            if not db_result_list:
                print('{}: no area schedule-interval data'.format(self.script_name))
                return
            crawl_span_dict = {
                db_result.id: db_result.crawl_span
                for db_result in db_result_list
            }
            query_sort_conds = [SubAreaList.area_list_id]
            query_sort_conds.extend([x.id for x in db_result_list])

            self.cold_zone_ids = set([
                db_result.id for db_result in AreaList.query.filter(
                    AreaList.enable == 1, AreaList.name.like('%冷區%')).all()
            ])

            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2,
                    MMSI_Info.alpha_3).all():
                self.mmsi_dict[
                    db_result.
                    mmsi] = db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2

            self.sc = ShipXY_Crawler()

            cookies_list = []
            for db_result in ShipxyAccount.query.filter(
                    ShipxyAccount.enable == 1, ShipxyAccount.updating == 0,
                    ShipxyAccount.updated_time >=
                (datetime.now() - timedelta(days=1))).all():
                if not db_result.cookies:
                    continue
                cookies_list.append(deepcopy(db_result.cookies))
            if not cookies_list:
                raise Exception('{}: no usable accounts'.format(self.script_name))
            self.sc.update_cookies_list(cookies_list)

            del (cookies_list)

            while True:
                if not self.crawler_status:
                    raise Exception('\n'.join(list(set(self.error_msg_list))))
                elif self.get_shipxy_thread_list and [
                        x.is_alive() for x in self.get_shipxy_thread_list
                ].count(True) >= self.get_shipxy_thread_limit_tmp:
                    time.sleep(0.1)  # yield briefly instead of hot-spinning at full CPU
                    continue
                remove_index_list = []
                for index, thread in enumerate(self.get_shipxy_thread_list):
                    if not thread.is_alive():
                        remove_index_list.append(index)
                remove_index_list.reverse()
                for index in remove_index_list:
                    del (self.get_shipxy_thread_list[index])

                cookies_list = []
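                # Refresh the two timestamp fields in each SERVERID cookie so the
                # session looks recently active; judging from the sample cookie later
                # in this file, the format is 'server_id|timestamp|timestamp', e.g.
                # 'ce54c768aca7be22386d8a7ce24ecdae|1596424213|1596424207'.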
                for cookies in self.sc.cookies_list:
                    if 'SERVERID' in cookies:
                        SERVERID_list = cookies['SERVERID'].split('|')
                        SERVERID_list[1] = '{}'.format(time.time())
                        SERVERID_list[2] = '{}'.format(time.time())
                        cookies['SERVERID'] = '|'.join(SERVERID_list)
                    cookies_list.append(cookies)
                self.sc.update_cookies_list(cookies_list)
                del (cookies_list)

                db_result = CrawlerMachine.query.filter(
                    CrawlerMachine.ip == ip).first()
                if not db_result:
                    db_result = CrawlerMachine(ip=ip)
                db_result.updatedAt = datetime.now()
                db.session.add(db_result)
                db.session.commit()
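                # The committed row is this machine's heartbeat; machine_quantity
                # below counts machines whose heartbeat is less than an hour old.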

                machine_quantity = CrawlerMachine.query.filter(
                    CrawlerMachine.updatedAt >= (datetime.now() -
                                                 timedelta(hours=1))).count()
                if not machine_quantity:
                    machine_quantity += 1
                # On average each account may only query one area per second, to
                # avoid the account being locked
                # Stated formula: (one second / (usable accounts / machine count))
                # - (time already elapsed this round); the code below applies a 1.5x
                # safety factor instead of subtracting the elapsed time
                self.gernal_sleep_time = (
                    1 / len(self.sc.cookies_list)) * machine_quantity * 1.5
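                # Rough worked example, assuming 10 usable accounts and 2 machines:
                # (1 / 10) * 2 * 1.5 = 0.3 s between area queries on this machine.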

                self.get_shipxy_thread_limit_tmp = (math.floor(
                    (len(self.sc.cookies_list) / machine_quantity) /
                    self.get_ship_detail_quantity_limit))
                if not self.get_shipxy_thread_limit_tmp:
                    raise Exception('\n'.join([
                        '{}: total account count is below the minimum needed to crawl\n'.format(self.script_name),
                        'the minimum is defined as\n',
                        'usable accounts ({}) divided by machine count ({}) divided by the per-thread ship-detail subprocess limit ({}), rounded down to an integer'
                        .format(len(self.sc.cookies_list), machine_quantity,
                                self.get_ship_detail_quantity_limit)
                    ]))
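                # Rough worked example, assuming 12 usable accounts, 2 machines and a
                # per-thread detail limit of 3: floor((12 / 2) / 3) = 2 threads allowed.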

                if self.db_rollback_dict:
                    for db_result_id in list(self.db_rollback_dict.keys()):
                        db_result = SubAreaList.query.filter(
                            SubAreaList.id == db_result_id).first()
                        db_result.crawler_time = self.db_rollback_dict[
                            db_result_id]['crawler_time']
                        db_result.next_time = self.db_rollback_dict[
                            db_result_id]['next_time']
                        db.session.add(db_result)
                        del (self.db_rollback_dict[db_result_id])
                    db.session.commit()

                db_result = SubAreaList.query.filter(
                    SubAreaList.enable == 1, SubAreaList.web == 'shipxy',
                    or_(SubAreaList.next_time <= datetime.now(),
                        SubAreaList.next_time == None),
                    or_(*[
                        SubAreaList.area_list_id == id
                        for id in crawl_span_dict.keys()
                    ])).order_by(sqlalchemy.func.field(*query_sort_conds),
                                 asc(SubAreaList.next_time),
                                 func.random()).first()

                if not db_result:
                    if [x.is_alive()
                            for x in self.get_shipxy_thread_list].count(True):
                        print(
                            '{}: no areas need crawling; waiting for running area-crawl subthreads to finish. If they all finish and no area needs crawling, the program will exit'
                            .format(self.script_name))
                        while [
                                x.is_alive()
                                for x in self.get_shipxy_thread_list
                        ].count(True):
                            # once the clock hits a time when some area is due again,
                            # go back to crawling areas
                            if not datetime.now().minute \
                            or datetime.now().minute in crawl_span_dict.values():
                                break
                            time.sleep(1)  # avoid hot-spinning while waiting
                        # continue rather than return: if the subthreads finish just
                        # as the clock passes :30 or :00, areas will be due again
                        continue
                    else:
                        print('{}: no areas need crawling; program exiting, time: {}'.format(
                            self.script_name, datetime.now()))
                    return

                get_shipxy_thread_input = deepcopy(db_result.json())

                if db_result.area_list_id not in crawl_span_dict:
                    crawl_span_dict[
                        db_result.area_list_id] = AreaList.query.filter(
                            AreaList.id ==
                            db_result.area_list_id).first().crawl_span
                crawler_time = datetime.now() - timedelta(
                    minutes=datetime.now().minute %
                    crawl_span_dict[db_result.area_list_id])
                db_result.crawler_time = datetime.strptime(
                    crawler_time.strftime('%Y-%m-%d %H:%M:00'),
                    '%Y-%m-%d %H:%M:%S')
                db_result.next_time = db_result.crawler_time + timedelta(
                    minutes=crawl_span_dict[db_result.area_list_id])
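                # Rough worked example, assuming crawl_span = 30 and now = 14:47:
                # crawler_time is floored to 14:30 and next_time becomes 15:00.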
                db.session.add(db_result)
                db.session.commit()

                if db_result.lu_lat==db_result.rd_lat \
                or db_result.lu_lng==db_result.rd_lng:
                    continue

                db.session.rollback()
                db.session.close()

                thread = threading.Thread(target=self.get_shipxy_thread,
                                          args=(get_shipxy_thread_input, ),
                                          daemon=True)
                thread.start()
                self.get_shipxy_thread_list.append(thread)

                # ###############################
                # for thread in self.get_shipxy_thread_list:
                #     thread.join()
                # return
                # ###############################
        except Exception:
            msg = '\n\n'.join([
                'ip: {}'.format(ip),
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            ggg = GmailSender('Ship crawler error-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
Example #2
def ship_account_login_func():
    script_name = os.path.basename(__file__)
    try:
        # Check whether this machine already has the same scheduled job running
        if check_same_process_still_running(script_name):
            # Counting this program, two or more identical scheduled jobs are running
            print('{}: identical scheduled job still running ({})'.format(script_name, 1))
            return

        # When only one account needs login, stagger hosts by the SERIAL env var so
        # different hosts don't race to log in with the same account
        try:
            time.sleep(int(os.environ.get('SERIAL')))
        except (TypeError, ValueError):
            pass

        with open(app.config['CRAWLER_SETTING']['MYSHIPS']['JS_DEMIX_FILEPATH'], 'r') as f:
            js_content = f.read()

        conds_list = [
            # Log in newly added accounts
            [
                MyshipsAccount.enable==None
            ],

            # If the re-login job crashed while an account update was in progress,
            # that account's updating flag stays stuck at 1, so retry failed accounts
            [
                MyshipsAccount.updating==1, 
                MyshipsAccount.updated_time<=(datetime.now()-timedelta(minutes=30))
            ],

            # Re-login accounts logged in more than 23 hours ago; the 船訊網 crawler
            # fetches accounts that logged in within the past day, so refresh them by
            # the 22-hour mark at the latest (note: the condition below actually uses
            # a 30-minute window, unlike the 22-hour window in the shipxy variant
            # later in this file)
            [
                MyshipsAccount.enable==1, 
                MyshipsAccount.updating==0,
                or_(
                    MyshipsAccount.updated_time<=(datetime.now()-timedelta(minutes=30)),
                    MyshipsAccount.updated_time==None
                )
            ]
        ]
        headers = {
            'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36',
            'Connection':'keep-alive'
        }
        err_msg_list = []
        account_be_banned = []
        for conds in conds_list:
            while True:
                db.session.rollback()
                db.session.close()
                db_result = MyshipsAccount.query.filter(*conds).order_by(func.random()).first()
                if not db_result:
                    break
                db_result.updating = 1
                db.session.add(db_result)
                db.session.commit()
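                # Committing updating=1 acts as a soft lock: other hosts running this
                # job skip the account row while its cookies are being refreshed.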

                db_result.updating = 0
                session = requests.Session()

                # sample result: {'time': 1607309565213, 'pwdSign': '50aa7d92baa11df578ad254e252928c8'}
                compile_result = execjs_compile(js_content).call('shipencode')
                compile_result['user'] = db_result.account

                try:
                    rsp = session.post(app.config['CRAWLER_SETTING']['MYSHIPS']['LOGIN'], headers=headers, json=compile_result, timeout=180)
                    rsp.close()
                except Exception:
                    raise Exception(traceback.format_exc())
                try:
                    rsp_result = rsp.json()
                except Exception:
                    raise Exception(rsp.text)
                if rsp_result['code']=='0':
                    pprint(rsp_result)
                    err_msg_list.append(f"{rsp_result}")
                    db_result.cookies = deepcopy(session.cookies.get_dict())
                    db_result.enable = 1
                else:
                    db_result.enable = 0
                    account_be_banned.append(db_result.account)
                db.session.add(db_result)
                db.session.commit()
        account_be_banned = list(set(account_be_banned))
        account_be_banned.sort()
        if account_be_banned:
            msg_list = [
                'ip: {}'.format(get_external_ip()), 
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                'Some accounts were banned; the banned accounts are listed below',
                '\n'.join(account_be_banned),
                '\n'.join(list(set(err_msg_list)))
                ]
            msg = '\n\n'.join(msg_list)
            print(msg)
            ggg = GmailSender('Ship crawler status notice-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
    except Exception:
        msg_list = ['ip: {}'.format(get_external_ip()), 'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')), traceback.format_exc()]
        print('\n\n'.join(msg_list))
        ggg = GmailSender('Ship crawler error-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], '\n\n'.join(msg_list))
        ggg.send_email()
Example #3
    def myships_crawler_func(self):
        try:
            self.ip = get_external_ip()
        except Exception:
            self.ip = None
        try:
            # Check whether this machine already has the same scheduled job running
            if check_same_process_still_running(self.script_name):
                # Counting this program, two or more identical scheduled jobs are running
                print('{}: identical scheduled job still running ({})'.format(self.script_name, 1))
                return
            if not self.ip:
                raise Exception('failed to get IP')
            try:
                rsp = requests.get(
                    app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'],
                    timeout=60)
                rsp.close()
            except Exception:
                raise Exception('cannot reach the 寶船網 (Myships) site:\n{}'.format(
                    traceback.format_exc()))
            if rsp.status_code != 200:
                raise Exception('寶船網 site responded abnormally:\n{}'.format(rsp.text))

            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])

            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['MYSHIPS']['MAPPING_FILEPATH']))

            self.ship_type_dict = {
                x.type: x.name
                for x in ShipTypeMyships.query.all()
            }
            self.navistatus_type_dict = {
                x.type: x.name
                for x in NavistatusTypeMyships.query.all()
            }
            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2,
                    MMSI_Info.alpha_3).all():
                self.mmsi_dict[
                    db_result.
                    mmsi] = db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2

            # start_n = deepcopy(4000000+self.machine_serial)
            start_n = deepcopy(self.machine_serial)
            while True:
                # stop near the top of the hour (minute maxes out at 59)
                if datetime.now().minute >= 59 \
                and datetime.now().second > 30:
                    return
                print(start_n)
                end_n = start_n + 1000 * self.machine_count
                shipId_list = [
                    f'{i}' for i in range(start_n, end_n, self.machine_count)
                ]
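                # Rough worked example, assuming machine_serial=2, machine_count=4 and
                # start_n=2: this batch is the 1000 ids 2, 6, 10, ..., 3998, so the
                # four machines stripe the id space without overlap.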
                start_n = deepcopy(end_n)

                thread = threading.Thread(target=self.get_ship_detail,
                                          args=(shipId_list, ),
                                          daemon=True)
                thread.start()
                self.thread_list.append(thread)

                # for thread in self.thread_list:
                #     thread.join()
                # pprint(self.ship_detail_dict)
                # if self.ship_detail_dict:
                #     self.save2es()
                # pprint(self.ship_detail_dict)
                # return

                while [thread.is_alive() for thread in self.thread_list
                       ].count(True) >= self.thread_max_count:
                    time.sleep(0.1)  # yield briefly instead of hot-spinning
                delete_index_list = []
                for index, thread in enumerate(self.thread_list):
                    if not thread.is_alive():
                        delete_index_list.append(index)
                delete_index_list.reverse()
                for index in delete_index_list:
                    del (self.thread_list[index])
                if self.err_count >= self.err_count_max:
                    raise Exception('\n\n'.join(self.err_msg_list))
                if self.no_data_count >= self.no_data_count_max:
                    break
                if self.ship_detail_dict:
                    self.save2es()
                time.sleep(1)
            for thread in self.thread_list:
                thread.join()
            if self.ship_detail_dict:
                self.save2es()
        except Exception:
            msg = '\n\n'.join([
                'ip: {}'.format(self.ip),
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            print('\n\n'.join(self.err_msg_list))
            ggg = GmailSender('Ship crawler error-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(self.err_msg_list))
            ggg.send_email()
Example #4
def myships_crawler_func():
    script_name = os.path.basename(__file__)
    # Check whether this machine already has the same scheduled job running
    if check_same_process_still_running(script_name):
        # Counting this program, two or more identical scheduled jobs are running
        print('{}: identical scheduled job still running ({})'.format(script_name, 1))
        return
    try:
        es = Elastic(
            host=app.config['ES_SETTING']['CONNECTION']['HOST'],
            port=app.config['ES_SETTING']['CONNECTION']['PORT'],
            username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
            password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])

        if not es.check_index_exist(app.config['ES_SETTING']['INDEX_INFO']
                                    ['MYSHIPS']['INDEX_NAME']):
            print(
                es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                    ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                    ['MYSHIPS']['MAPPING_FILEPATH']))

        db_result_list = AreaList.query.with_entities(
            AreaList.id,
            AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
        if not db_result_list:
            print('{}: no area schedule-interval data'.format(script_name))
            return
        crawl_span_dict = {
            db_result.id: db_result.crawl_span
            for db_result in db_result_list
        }
        query_sort_conds = [SubAreaList.area_list_id]
        query_sort_conds.extend([x.id for x in db_result_list])

        cold_zone_ids = set([
            db_result.id for db_result in AreaList.query.filter(
                AreaList.enable == 1, AreaList.name.like('%冷區%')).all()
        ])

        ship_type_dict = {x.type: x.name for x in ShipTypeMyships.query.all()}
        navistatus_type_dict = {
            x.type: x.name
            for x in NavistatusTypeMyships.query.all()
        }

        mc = Myships_Crawler()
        while True:
            db.session.rollback()
            db.session.close()
            batch_load_list = []

            db_result = SubAreaList.query.filter(
                SubAreaList.enable == 1, SubAreaList.web == 'myships',
                or_(SubAreaList.next_time <= datetime.now(),
                    SubAreaList.next_time == None),
                or_(*[
                    SubAreaList.area_list_id == id
                    for id in crawl_span_dict.keys()
                ])).order_by(sqlalchemy.func.field(*query_sort_conds),
                             asc(SubAreaList.next_time),
                             func.random()).first()
            if not db_result:
                print('{}: done'.format(script_name))
                return
            print('{}: crawling area {}'.format(script_name, db_result.id))
            crawler_time = datetime.now() - timedelta(
                minutes=datetime.now().minute %
                crawl_span_dict[db_result.area_list_id])
            old_crawler_time = deepcopy(db_result.crawler_time)
            old_next_time = deepcopy(db_result.next_time)
            db_result.crawler_time = datetime.strptime(
                crawler_time.strftime('%Y-%m-%d %H:%M:00'),
                '%Y-%m-%d %H:%M:%S')
            db_result.next_time = db_result.crawler_time + timedelta(
                minutes=crawl_span_dict[db_result.area_list_id])
            db.session.add(db_result)
            db.session.commit()

            if db_result.lu_lat==db_result.rd_lat \
            or db_result.lu_lng==db_result.rd_lng:
                continue

            # ma_cookies_list = [x.cookies for x in MyshipsAccount.query.filter(MyshipsAccount.enable==1, MyshipsAccount.updating==0).all()]
            ma_cookies_list = []
            try:
                area_result = mc.area_info(
                    min([db_result.lu_lat, db_result.rd_lat]),
                    min([db_result.lu_lng, db_result.rd_lng]),
                    max([db_result.lu_lat, db_result.rd_lat]),
                    max([db_result.lu_lng, db_result.rd_lng]), ma_cookies_list)
            except Exception:
                db_result.crawler_time = old_crawler_time
                db_result.next_time = old_next_time
                db.session.add(db_result)
                db.session.commit()

                msg = '\n\n'.join([
                    'ip: {}'.format(get_external_ip()),
                    'time: {}'.format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    '{}\n{}'.format('error fetching area ship data; check whether the 資策會 (III) side network or the 寶船網 site is at fault',
                                    traceback.format_exc()),
                ])
                print(msg)
                ggg = GmailSender('Ship crawler error-{}'.format(script_name),
                                  app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                                  msg)
                ggg.send_email()
                continue

            if area_result['code'] != '0':
                msg = '\n\n'.join([
                    'ip: {}'.format(get_external_ip()), 'time: {}'.format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    '\n'.join([
                        'error while fetching 寶船網 area ship data', '{}'.format(area_result),
                        'id :{}'.format(db_result.id),
                        'area_list_id :{}'.format(db_result.area_list_id),
                        '{}'.format({
                            'age':
                            1440,
                            'rgn':
                            mc.check_trans2myships_coord(
                                [[
                                    min([db_result.lu_lat, db_result.rd_lat]),
                                    min([db_result.lu_lng, db_result.rd_lng])
                                ],
                                 [
                                     max([db_result.lu_lat, db_result.rd_lat]),
                                     max([db_result.lu_lng, db_result.rd_lng])
                                 ]])
                        }), '{}'.format([[db_result.lu_lat, db_result.lu_lng],
                                         [db_result.rd_lat, db_result.rd_lng]])
                    ])
                ])
                print(msg)
                ggg = GmailSender('Ship crawler error-{}'.format(script_name),
                                  app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                                  msg)
                ggg.send_email()
                continue

            tmp_area_data_list = area_result.pop('data')
            area_result['data'] = []

            for area_data in tmp_area_data_list:
                if not area_data.get('m') \
                or area_data['m']=='0':
                    continue
                area_result['data'].append(area_data)

            # Skip this area if it has no ship data at all
            if not area_result['data']:
                print('{}: Skip Area {}'.format(script_name, db_result.id))
                continue
            else:
                print('{}: area {} has {} ships'.format(script_name, db_result.id,
                                                        len(area_result['data'])))

            id_list = []
            ship_data_dict = mc.ship_info(
                [area_data['i'] for area_data in area_result['data']])
            for area_data in area_result['data']:
                if area_data['i'] not in ship_data_dict:
                    print(area_data['i'])
                    continue
                id_list.append('{}_{}'.format(
                    area_data['m'], ship_data_dict[area_data['i']]['posTime']))
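            # each _id is '{m}_{posTime}' (m apparently the MMSI); the ES scan below
            # looks up which of these ids already exist so duplicates can be skipped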
            if id_list:
                es_ship_ids = set([
                    data['_id'] for data in es.scan(
                        {
                            'query': {
                                'bool': {
                                    'must': [{
                                        'terms': {
                                            '_id': id_list
                                        }
                                    }]
                                }
                            }
                        }, app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'])
                ])
            else:
                es_ship_ids = set()

            for area_data in area_result['data']:
                if area_data['i'] not in ship_data_dict:
                    print(area_data['i'])
                    continue
                ship_data = ship_data_dict[area_data['i']]
                try:
                    dictionary = {
                        '_index':
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'],
                        '_type':
                        '_doc',
                        '_id':
                        '{}_{}'.format(area_data['m'], ship_data['posTime']),
                        '_routing':
                        '{}'.format(
                            (datetime.utcfromtimestamp(ship_data['posTime']) +
                             timedelta(hours=8)).year)
                        if ship_data['posTime'] else None,
                        'updatetime':
                        ship_data['updatetime'] if ship_data.get('updatetime')
                        else datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'eta_timestamp':
                        ship_data['eta'],
                        'eta':
                        area_data['r'],
                        'time':
                        (datetime.utcfromtimestamp(ship_data['posTime']) +
                         timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'),
                        'y':
                        area_data['y']
                    }
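                    # posTime is a UTC epoch timestamp; the +8 hour shift converts it
                    # to Taiwan time, and _routing shards documents by that local year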
                except Exception:
                    msg = '\n'.join([
                        traceback.format_exc(), '{}'.format(area_data),
                        '{}'.format(ship_data)
                    ])
                    ggg = GmailSender(
                        'Ship crawler error-{}'.format(script_name),
                        app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
                    ggg.send_email()
                    continue

                if dictionary['_id'] in es_ship_ids:
                    continue

                try:
                    dictionary['v'] = int(area_data['v'])
                except (KeyError, TypeError, ValueError):
                    dictionary['v'] = None

                if not dictionary['eta_timestamp']:
                    dictionary['eta_timestamp'] = None
                    dictionary['eta_datetime'] = None
                else:
                    dictionary['eta_datetime'] = (
                        datetime.utcfromtimestamp(dictionary['eta_timestamp'])
                        + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')

                for source_key, new_key in format_batch_load_dict.items():
                    if source_key in area_data:
                        dictionary[new_key] = area_data[source_key]
                    elif source_key in ship_data:
                        dictionary[new_key] = ship_data[source_key]

                dictionary['shipid'] = '{}'.format(dictionary['shipid'])

                for key, divisor in format_data_content.items():
                    if dictionary.get(key):
                        dictionary[key] = round(dictionary[key] / divisor, 6)
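                # format_data_content presumably maps fields stored as scaled integers
                # in the raw feed (e.g. coordinates x 10^6) to their divisors; rounding
                # to 6 decimal places keeps full coordinate precision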

                for key in list(dictionary.keys()):
                    if type(dictionary[key]) is not str:
                        continue
                    dictionary[key] = dictionary[key].strip()
                    if not dictionary[key] or dictionary[key] == 'NULL':
                        dictionary[key] = None
                for key in ['navistatus', 'rot', 'type', 'y']:
                    if dictionary.get(key) and type(
                            dictionary[key]) is not int:
                        dictionary[key] = int(dictionary[key])

                if dictionary['type'] not in ship_type_dict:
                    sts_db_result = ShipTypeMyships.query.filter(
                        ShipTypeMyships.type == dictionary['type']).first()
                    if sts_db_result:
                        ship_type_dict[sts_db_result.type] = sts_db_result.name
                    else:
                        ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = ship_type_dict[dictionary['type']]

                if dictionary['navistatus'] not in navistatus_type_dict:
                    nt_db_result = NavistatusTypeMyships.query.filter(
                        NavistatusTypeMyships.type ==
                        dictionary['navistatus']).first()
                    if nt_db_result:
                        navistatus_type_dict[
                            nt_db_result.type] = nt_db_result.name
                    else:
                        navistatus_type_dict[dictionary['navistatus']] = None
                dictionary['navistatus_text'] = navistatus_type_dict[
                    dictionary['navistatus']]

                batch_load_list.append(dictionary)

            if batch_load_list:
                es.batch_load(batch_load_list)
            # #############################
            # if len(batch_load_list)>2:
            #     return
            # #############################
    except Exception:
        msg = '\n\n'.join([
            'ip: {}'.format(get_external_ip()),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        ggg = GmailSender('Ship crawler error-{}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
        ggg.send_email()
Example #5
    def myships_crawler_func(self):
        # Check whether this machine already has the same scheduled job running
        if check_same_process_still_running(self.script_name):
            # Counting this program, two or more identical scheduled jobs are running
            print('{}: identical scheduled job still running ({})'.format(self.script_name, 1))
            return
        try:
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])

            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['MYSHIPS']['MAPPING_FILEPATH']))

            db_result_list = AreaList.query.with_entities(
                AreaList.id,
                AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
            if not db_result_list:
                print('{}: no area schedule-interval data'.format(self.script_name))
                return
            self.crawl_span_dict = {
                db_result.id: db_result.crawl_span
                for db_result in db_result_list
            }
            self.query_sort_conds = [SubAreaList.area_list_id]
            self.query_sort_conds.extend([x.id for x in db_result_list])

            self.ship_type_dict = {
                x.type: x.name
                for x in ShipTypeMyships.query.all()
            }
            self.navistatus_type_dict = {
                x.type: x.name
                for x in NavistatusTypeMyships.query.all()
            }

            self.mc = Myships_Crawler()
            while True:
                if datetime.now().minute>57 \
                and datetime.now().second>30:
                    return
                db.session.rollback()
                db.session.close()

                if self.thread_error_count >= self.thread_error_count_max:
                    self.err_msg_list = list(set(self.err_msg_list))
                    raise Exception('\n\n'.join(self.err_msg_list))

                del_index_list = []
                for index, thread in enumerate(self.thread_list):
                    if not thread.is_alive():
                        del_index_list.append(index)
                del_index_list.reverse()
                for index in del_index_list:
                    del (self.thread_list[index])
                if self.rollback_id_list:
                    while self.rollback_id_list:
                        db_result = SubAreaList.query.filter(
                            SubAreaList.id ==
                            self.rollback_id_list[0]).first()
                        db_result.crawler_time = self.time_dict[
                            'old_crawler_time'][self.rollback_id_list[0]]
                        db_result.next_time = self.time_dict['old_next_time'][
                            self.rollback_id_list[0]]
                        db.session.add(db_result)
                        del (self.time_dict['old_crawler_time'][
                            self.rollback_id_list[0]])
                        del (self.time_dict['old_next_time'][
                            self.rollback_id_list[0]])
                        del (self.rollback_id_list[0])
                    db.session.commit()

                db_result = SubAreaList.query.filter(
                    SubAreaList.enable == 1, SubAreaList.web == 'myships',
                    or_(SubAreaList.next_time <= datetime.now(),
                        SubAreaList.next_time == None),
                    or_(*[
                        SubAreaList.area_list_id == id
                        for id in self.crawl_span_dict.keys()
                    ])).order_by(sqlalchemy.func.field(*self.query_sort_conds),
                                 asc(SubAreaList.next_time),
                                 func.random()).first()
                if not db_result:
                    if self.thread_list:
                        for thread in self.thread_list:
                            thread.join()
                        continue
                    print('{}: done'.format(self.script_name))
                    return
                print('{}: crawling area {}'.format(self.script_name, db_result.id))
                crawler_time = datetime.now() - timedelta(
                    minutes=datetime.now().minute %
                    self.crawl_span_dict[db_result.area_list_id])
                self.time_dict['old_crawler_time'][db_result.id] = deepcopy(
                    db_result.crawler_time)
                self.time_dict['old_next_time'][db_result.id] = deepcopy(
                    db_result.next_time)
                db_result.crawler_time = datetime.strptime(
                    crawler_time.strftime('%Y-%m-%d %H:%M:00'),
                    '%Y-%m-%d %H:%M:%S')
                db_result.next_time = db_result.crawler_time + timedelta(
                    minutes=self.crawl_span_dict[db_result.area_list_id])
                db.session.add(db_result)
                db.session.commit()

                if db_result.lu_lat==db_result.rd_lat \
                or db_result.lu_lng==db_result.rd_lng:
                    continue

                self.mc.set_cookies_list([
                    x.cookies for x in MyshipsAccount.query.filter(
                        MyshipsAccount.enable == 1, MyshipsAccount.updating ==
                        0, MyshipsAccount.updated_time >= (
                            datetime.now() - timedelta(hours=1))).all()
                ])
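                # only cookies from accounts refreshed within the last hour are handed
                # to the crawler, matching the account re-login job's cadence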

                thread = threading.Thread(target=self.myships_thread,
                                          args=(db_result.json(), ),
                                          daemon=True)
                thread.start()
                time.sleep(2)
                self.thread_list.append(thread)
                while [x.is_alive() for x in self.thread_list
                       ].count(True) >= self.thread_max_count:
                    time.sleep(0.1)  # yield briefly instead of hot-spinning

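            # NOTE: the while-loop above exits only via return or an exception, so the
            # completion notice below is unreachable as written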
            if self.err_msg_list:
                self.err_msg_list = list(set(self.err_msg_list))
                ggg = GmailSender(
                    'Ship crawler {} finished, but with some errors along the way'.format(self.script_name),
                    app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                    '\n\n'.join(self.err_msg_list))
                ggg.send_email()
        except Exception:
            msg = '\n\n'.join([
                'ip: {}'.format(get_external_ip()),
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            ggg = GmailSender('Ship crawler error-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(self.err_msg_list))
            ggg.send_email()
Example #6
def ship_account_login_func():
    script_name = os.path.basename(__file__)
    try:
        # Check whether this machine already has the same scheduled job running
        if check_same_process_still_running(script_name):
            # Counting this program, two or more identical scheduled jobs are running
            print('{}: identical scheduled job still running ({})'.format(script_name, 1))
            return

        # {"SERVERID": "ce54c768aca7be22386d8a7ce24ecdae|1596424213|1596424207", ".UserAuth2": "DC41F8152480DF00C47C6EA6666546EFD31E197BB9DCD08A6D47A60FE95B8D15A5963DA3196A79A31CD4DCABD9D15BC24D1D3B6AA57B108A9BF1C4350DAA0A12D2352E089006D4B5B285875C837BBC8A26E5069E1657CE48636716B1A820826E4AA7D4DF86AC7AA714B37C615B2A49AC245CB0FFEC011405D9F3F22085AC55D998184EF5", "FD857C2AF68165D4": "vyH3H7apfudRFtw8Dvd2z9dXvgh6/nhEsXkCr3rtsqflVSiS4EwmTNvclp+SBvVC", "ASP.NET_SessionId": "rp0u0ognzh3qzci3mnfdvjoi"}

        sc = ShipXY_Crawler()
        conds_list = [
            # Log in newly added accounts
            [ShipxyAccount.enable == None],

            # If the re-login job crashed while an account update was in progress,
            # that account's updating flag stays stuck at 1, so retry failed accounts
            [
                ShipxyAccount.updating == 1, ShipxyAccount.updated_time <=
                (datetime.now() - timedelta(minutes=30))
            ],

            # Re-login accounts logged in more than 23 hours ago; the 船訊網 crawler
            # fetches accounts that logged in within the past day, so refresh them by
            # the 22-hour mark at the latest
            [
                ShipxyAccount.enable == 1, ShipxyAccount.updating == 0,
                or_(
                    ShipxyAccount.updated_time <=
                    (datetime.now() - timedelta(hours=22)),
                    ShipxyAccount.updated_time == None)
            ]
        ]
        account_be_banned = []
        for conds in conds_list:
            while True:
                db.session.rollback()
                db.session.close()
                db_result = ShipxyAccount.query.filter(*conds).order_by(
                    func.random()).first()
                if not db_result:
                    break
                db_result.updating = 1
                db.session.add(db_result)
                db.session.commit()

                db_result.updating = 0
                db_result.cookies = sc.login(db_result.account,
                                             db_result.password,
                                             db_result.cookies)
                if '.UserAuth2' not in db_result.cookies or 'FD857C2AF68165D4' not in db_result.cookies:
                    db_result.enable = 0
                    account_be_banned.append(db_result.account)
                else:
                    db_result.enable = 1

                db.session.add(db_result)
                db.session.commit()
            account_be_banned = list(set(account_be_banned))
            account_be_banned.sort()
            if account_be_banned:
                msg_list = [
                    'ip: {}'.format(get_external_ip()), 'time: {}'.format(
                        datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                    'Some accounts were banned; the banned accounts are listed below', '\n'.join(account_be_banned)
                ]
                ggg = GmailSender('Ship crawler status notice-{}'.format(script_name),
                                  app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                                  '\n\n'.join(msg_list))
                ggg.send_email()
    except Exception:
        msg_list = [
            'ip: {}'.format(get_external_ip()),
            'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ]
        print('\n\n'.join(msg_list))
        ggg = GmailSender('Ship crawler error-{}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                          '\n\n'.join(msg_list))
        ggg.send_email()
Example #7
    def myships_crawler_func(self):
        print(datetime.now())
        try:
            self.ip = get_external_ip()
        except Exception:
            self.ip = None
        try:
            # Check whether this machine already has the same scheduled job running
            if check_same_process_still_running(self.script_name):
                # Counting this program, two or more identical scheduled jobs are running
                print('{}: identical scheduled job still running ({})'.format(self.script_name, 1))
                return
            if not self.ip:
                raise Exception('failed to get IP')
            try:
                rsp = requests.get(
                    app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'],
                    timeout=60)
                rsp.close()
            except Exception:
                raise Exception('cannot reach the 寶船網 (Myships) site:\n{}'.format(
                    traceback.format_exc()))
            if rsp.status_code != 200:
                raise Exception('寶船網 site responded abnormally:\n{}'.format(rsp.text))

            try:
                rsp = requests.get(
                    'http://{}:{}/'.format(
                        'localhost',
                        app.config['ES_SETTING']['CONNECTION']['PORT']),
                    auth=HTTPBasicAuth(
                        app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                        app.config['ES_SETTING']['CONNECTION']['PASSWORD']))
                rsp.close()
            except Exception:
                raise Exception(traceback.format_exc())
            if rsp.status_code != 200:
                raise Exception('cannot connect to the 資策會 (III) ES host')

            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])

            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['MYSHIPS']['MAPPING_FILEPATH']))

            self.ship_type_dict = {
                x.type: x.name
                for x in ShipTypeMyships.query.all()
            }
            self.navistatus_type_dict = {
                x.type: x.name
                for x in NavistatusTypeMyships.query.all()
            }
            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2,
                    MMSI_Info.alpha_3).all():
                self.mmsi_dict[
                    db_result.
                    mmsi] = db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2

            print('checking account login status')
            account_login_timestamp = time.time()
            account_login_span = 1800
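            # re-login every 1800 s (30 minutes), well inside the one-hour freshness
            # window used when collecting cookies below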
            try:
                ship_account_login_func()
                self.cookies_list = ([
                    x.cookies for x in MyshipsAccount.query.filter(
                        MyshipsAccount.enable == 1, MyshipsAccount.updating ==
                        0, MyshipsAccount.updated_time >= (
                            datetime.now() - timedelta(hours=1))).all()
                ])
                if not self.cookies_list:
                    self.cookies_list.append({})
            except Exception:
                print('account login failed')
                print(traceback.format_exc())
                self.cookies_list = [{}]

            # start_n = deepcopy(4000000+self.machine_serial)
            start_n = deepcopy(self.machine_serial)
            # start_n = 1660000
            while True:
                # stop near the top of the hour (minute maxes out at 59)
                if datetime.now().minute >= 59 \
                and datetime.now().second > 30:
                    return
                print(start_n)

                if (time.time() -
                        account_login_timestamp) >= account_login_span:
                    print(
                        f'more than {account_login_span} s since the last account login; waiting for all threads to finish and re-logging in before continuing'
                    )
                    for thread in self.thread_list:
                        thread.join()
                    print('re-logging accounts in')
                    account_login_timestamp = time.time()
                    try:
                        ship_account_login_func()
                        self.cookies_list = ([
                            x.cookies for x in MyshipsAccount.query.filter(
                                MyshipsAccount.enable == 1, MyshipsAccount.
                                updating == 0, MyshipsAccount.updated_time >=
                                (datetime.now() - timedelta(hours=1))).all()
                        ])
                        if not self.cookies_list:
                            self.cookies_list.append({})
                    except Exception:
                        print('account login failed')
                        print(traceback.format_exc())
                        self.cookies_list = [{}]

                end_n = start_n + 1000 * self.machine_count
                shipId_list = [
                    f'{i}' for i in range(start_n, end_n, self.machine_count)
                ]
                start_n = deepcopy(end_n)

                t1 = time.time()
                thread = threading.Thread(target=self.get_ship_detail,
                                          args=(shipId_list, ),
                                          daemon=True)
                thread.start()
                self.thread_list.append(thread)
                thread_sleep_time = 1 - (time.time() - t1)
                if thread_sleep_time > 0:
                    time.sleep(thread_sleep_time)
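                # the t1 / thread_sleep_time bookkeeping above paces launches to at
                # most one get_ship_detail thread per second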

                # for thread in self.thread_list:
                #     thread.join()
                # pprint(self.ship_detail_dict)
                # if self.ship_detail_dict:
                #     self.save2es()
                # pprint(self.ship_detail_dict)
                # return

                while [thread.is_alive() for thread in self.thread_list
                       ].count(True) >= self.thread_max_count:
                    time.sleep(0.1)  # yield briefly instead of hot-spinning
                delete_index_list = []
                for index, thread in enumerate(self.thread_list):
                    if not thread.is_alive():
                        delete_index_list.append(index)
                delete_index_list.reverse()
                for index in delete_index_list:
                    del (self.thread_list[index])
                if self.err_count >= self.err_count_max:
                    raise Exception('\n\n'.join(self.err_msg_list))
                if self.no_data_count >= self.no_data_count_max:
                    break
                if self.ship_detail_dict and not self.save2es_thread.is_alive(
                ):
                    self.save2es_thread = threading.Thread(target=self.save2es,
                                                           daemon=True)
                    self.save2es_thread.start()
            print('crawling finished; waiting for threads to end')
            for thread in self.thread_list:
                thread.join()
            print('threads finished; saving the last remaining data to ES')
            while self.save2es_thread.is_alive():
                time.sleep(0.1)  # wait for the save thread without hot-spinning
            if self.ship_detail_dict:
                self.save2es()
            print('done')
            print(datetime.now())
            exit()
        except Exception:
            msg = '\n\n'.join([
                'ip: {}'.format(self.ip),
                'time: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            print('\n\n'.join(self.err_msg_list))