class shipxy_crawler_class():
    def __init__(self):
        self.crawler_status = True
        self.error_msg_list = []
        self.db_rollback_dict = {}
        self.script_name = os.path.basename(__file__)
        self.get_shipxy_thread_list = []
        # At least 6 threads are needed for fetching ship details, otherwise
        # the hot and cold zones cannot be finished within one hour
        self.get_ship_detail_quantity_limit = 6
        self.ship_type_dict = {
            x.type: x.name for x in ShipTypeShipxy.query.all()
        }
        self.navistatus_type_dict = {
            x.type: x.name for x in NavistatusTypeShipxy.query.all()
        }

    def get_shipxy_thread(self, db_result_dict_for_func):
        try:
            # data_token_result = self.sc.getareashipssimple(db_result_dict_for_func['coor_1'], db_result_dict_for_func['coor_2'])
            data_token_result = self.sc.getareashipssimple(
                [db_result_dict_for_func['lu_lat'], db_result_dict_for_func['lu_lng']],
                [db_result_dict_for_func['rd_lat'], db_result_dict_for_func['rd_lng']])
        except:
            print(traceback.format_exc())
            self.db_rollback_dict[db_result_dict_for_func['id']] = db_result_dict_for_func
        if db_result_dict_for_func['id'] in self.db_rollback_dict:
            return
        try:
            time.sleep(self.gernal_sleep_time)
            if db_result_dict_for_func['id'] in self.db_rollback_dict:
                return
            batch_load_list = []
            if data_token_result['status'] != 0 or 'data' not in data_token_result:
                self.crawler_status = False
                self.error_msg_list.append(
                    '{}: 取得data token失敗, 船訊網 API 回傳 {}'.format(
                        self.script_name, data_token_result))
                return
            if not data_token_result['count']:
                print('{}: 區域 {} 內無船隻資料'.format(self.script_name,
                                               db_result_dict_for_func['id']))
                return
            area_result_list = []
            area_data_list = self.sc.area_info(data_token_result['data'])
            for area_data in area_data_list:
                # Skip entries without a usable MMSI
                if not area_data.get('mmsi') \
                        or area_data['mmsi'] == 0 \
                        or area_data['mmsi'] == '0':
                    continue
                area_result_list.append(area_data)
            # if len(area_data_list) != len(area_result_list):
            #     print('\n'.join([
            #         '-' * 20,
            #         '{}/{}'.format(len(area_data_list), len(area_result_list)),
            #         '{}, {}'.format(db_result_dict_for_func['lu_lat'], db_result_dict_for_func['lu_lng']),
            #         '{}, {}'.format(db_result_dict_for_func['rd_lat'], db_result_dict_for_func['rd_lng']),
            #         json.dumps(area_data_list, ensure_ascii=False, indent=4),
            #         '-' * 20
            #     ]))
            if not area_result_list:
                print('{}: 區域 {} 內無可爬的船隻資料'.format(
                    self.script_name, db_result_dict_for_func['id']))
                self.es.batch_load(batch_load_list)
                return
            gsdc = get_ship_detail_class(self.sc)
            thread_start_time = time.time()
            for index, area_data in enumerate(area_result_list):
                # Stop mechanism for crawls that run too long
                if time_to_stop():
                    break
                try:
                    # Each account sends at most one request per second,
                    # to keep the account from being locked
                    while [x.is_alive() for x in gsdc.get_ship_detail_thread_list].count(True) \
                            >= math.floor(
                                (self.get_shipxy_thread_limit_tmp *
                                 self.get_ship_detail_quantity_limit) /
                                ([x.is_alive() for x in self.get_shipxy_thread_list].count(True) + 1)):
                        # Stop mechanism for crawls that run too long
                        if time_to_stop():
                            break
                        continue
                except:
                    lll = [
                        traceback.format_exc(),
                        '{}'.format([x.is_alive() for x in gsdc.get_ship_detail_thread_list].count(True)),
                        '{}'.format(self.get_shipxy_thread_limit_tmp),
                        '{}'.format(self.get_ship_detail_quantity_limit),
                        '{}'.format([x.is_alive() for x in self.get_shipxy_thread_list].count(True)),
                    ]
                    raise Exception('\n'.join(lll))
                remove_index_list = []
                for index, thread in enumerate(gsdc.get_ship_detail_thread_list):
                    if not thread.is_alive():
                        remove_index_list.append(index)
                remove_index_list.reverse()
                for index in remove_index_list:
                    del gsdc.get_ship_detail_thread_list[index]
                thread = threading.Thread(target=gsdc.get_ship_detail,
                                          args=(area_data, ),
                                          daemon=True)
                thread.start()
                gsdc.get_ship_detail_thread_list.append(thread)
                time.sleep(self.gernal_sleep_time)
            while [x.is_alive() for x in gsdc.get_ship_detail_thread_list].count(True):
                # Stop mechanism for crawls that run too long
                if time_to_stop():
                    break
                continue
            print('爬取區域: {}, 耗費時間: {}, 船隻數量: {}'.format(
                db_result_dict_for_func['id'],
                round((time.time() - thread_start_time), 1),
                len(area_result_list)))
            # Stop mechanism for crawls that run too long
            if not time_to_stop():
                if len(area_result_list) >= 100 and not gsdc.thread_result_dict:
                    self.crawler_status = False
                    self.error_msg_list.append('\n'.join(list(set(gsdc.error_msg_list))))
                    return
                elif len(area_result_list) >= 100 \
                        and int(len(area_result_list) * 0.8) > len(list(gsdc.thread_result_dict.keys())):
                    self.db_rollback_dict[db_result_dict_for_func['id']] = db_result_dict_for_func
                    self.error_msg_list.append('\n'.join(list(set(gsdc.error_msg_list))))
                    return
            id_list = []
            for area_data in area_result_list:
                if area_data['mmsi'] not in gsdc.thread_result_dict:
                    continue
                id_list.append('{}_{}'.format(
                    area_data['mmsi'],
                    gsdc.thread_result_dict[area_data['mmsi']]['lastdyn']))
            if id_list:
                es_ship_ids = set([
                    data['_id'] for data in self.es.scan(
                        {'query': {'bool': {'must': [{'terms': {'_id': id_list}}]}}},
                        app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME'])
                ])
            else:
                es_ship_ids = set()
            # print('\n'.join([
            #     'area_result_list len: {}'.format(len(area_result_list)),
            #     'id_list len: {}'.format(len(id_list)),
            #     'es_ship_ids len: {}'.format(len(list(es_ship_ids))),
            #     'area_result_list-id_list= {}'.format(len(area_result_list) - len(id_list)),
            #     'id_list-es_ship_ids= {}'.format(len(id_list) - len(list(es_ship_ids)))
            # ]))
            delete_list = []
            for index, area_data in enumerate(area_result_list):
                if area_data['mmsi'] not in gsdc.thread_result_dict:
                    print('{} : 未取得船隻 {} 之詳細資訊,略過之'.format(
                        self.script_name, area_data['mmsi']))
                    continue
                dictionary = deepcopy(gsdc.thread_result_dict[area_data['mmsi']])
                dictionary['_index'] = app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME']
                dictionary['_type'] = '_doc'
                dictionary['_id'] = '{}_{}'.format(area_data['mmsi'], dictionary['lastdyn'])
                if dictionary['_id'] in es_ship_ids:
                    continue
                # dictionary['area_list_id'] = db_result_dict_for_func['area_list_id']
                dictionary['nationality'] = self.mmsi_dict[dictionary['mmsi'][:3]] \
                    if dictionary['mmsi'][:3] in self.mmsi_dict else None
                dictionary['cog'] = dictionary['cog'] / 100  # course over ground
                dictionary['draught'] = dictionary['draught'] / 1000  # draught
                dictionary['hdg'] = dictionary['hdg'] / 100  # heading
                # # shipxy sometimes reports heading 51100 while the website shows heading 0
                # if dictionary['hdg'] > 360:
                #     dictionary['hdg'] = 0
                for key in ['lat', 'lon']:
                    dictionary.pop(key)
                dictionary['latitude'] = area_data['lat']  # latitude
                dictionary['longitude'] = area_data['lng']  # longitude
                dictionary['sog'] = round(dictionary['sog'] / 5133 * 10, 2)  # speed over ground, in knots
                dictionary['length'] = dictionary['length'] / 10  # ship length
                dictionary['lineWidth'] = area_data['lineWidth']
                dictionary['width'] = dictionary['width'] / 10  # ship width
                dictionary['lastdyn_active'] = area_data['lastdyn_active']  # whether data can still be fetched
                dictionary['offset'] = area_data['offset']
                dictionary['rot'] = area_data.get('rot')
                dictionary['rotate'] = area_data['rotate']
                dictionary['shiptype'] = area_data['shiptype']
                dictionary['state'] = area_data['state']
                dictionary['state_color'] = area_data['state_color']
                dictionary['istop'] = area_data['istop']
                dictionary['tracks'] = area_data['tracks']
                dictionary['tcname'] = s2tw_converter(dictionary['cnname'])
                dictionary['utc_timestamp'] = dictionary.pop('lastdyn')
                dictionary['time'] = (
                    datetime.utcfromtimestamp(dictionary['utc_timestamp']) +
                    timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
                if dictionary['type'] not in self.ship_type_dict:
                    sts_db_result = ShipTypeShipxy.query.filter(
                        ShipTypeShipxy.type == dictionary['type']).first()
                    if sts_db_result:
                        self.ship_type_dict[sts_db_result.type] = sts_db_result.name
                    else:
                        self.ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = self.ship_type_dict[dictionary['type']]
                if dictionary['navistatus'] not in self.navistatus_type_dict:
                    nt_db_result = NavistatusTypeShipxy.query.filter(
                        NavistatusTypeShipxy.type == dictionary['navistatus']).first()
                    if nt_db_result:
                        self.navistatus_type_dict[nt_db_result.type] = nt_db_result.name
                    else:
                        self.navistatus_type_dict[dictionary['navistatus']] = None
                dictionary['navistatus_text'] = self.navistatus_type_dict[dictionary['navistatus']]
                dictionary['_routing'] = '{}'.format(
                    (datetime.utcfromtimestamp(dictionary['utc_timestamp']) +
                     timedelta(hours=8)).year)
                # dictionary['updatetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                # Strip surrounding whitespace from string values;
                # empty strings (and the literal 'NULL') become null
                for key in list(dictionary.keys()):
                    if type(dictionary[key]) is not str:
                        continue
                    dictionary[key] = dictionary[key].strip()
                    if not dictionary[key] or dictionary[key] == 'NULL':
                        dictionary[key] = None
                batch_load_list.append(dictionary)
            if delete_list:
                # Several machines may crawl the same area; if one of them has
                # already deleted the data, this delete would fail, so ignore errors
                try:
                    self.es.delete_data(delete_list)
                except:
                    pass
            if batch_load_list:
                self.es.batch_load(batch_load_list)
            del (delete_list, batch_load_list)
        except:
            msg_list = [
                'ip: {}'.format(get_external_ip()),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ]
            print('\n\n'.join(msg_list))
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(msg_list))
            ggg.send_email()

    def shipxy_crawler_func(self):
        try:
            ip = get_external_ip()
        except:
            ip = '取得IP失敗'
        try:
            # Check whether the same scheduled job is still running on this machine
            if check_same_process_still_running(self.script_name):
                # Counting this process, two or more identical jobs are running
                print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
                return
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
            # line_notify_pusher(msg)
            return
        try:
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME']):
                print(self.es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['INDEX_NAME'],
                    app.config['ES_SETTING']['INDEX_INFO']['SHIPXY']['MAPPING_FILEPATH']))
            db_result_list = AreaList.query.with_entities(
                AreaList.id,
                AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
            if not db_result_list:
                print('{}: 無區域的排程區間資料'.format(self.script_name))
                return
            crawl_span_dict = {
                db_result.id: db_result.crawl_span for db_result in db_result_list
            }
            query_sort_conds = [SubAreaList.area_list_id]
            query_sort_conds.extend([x.id for x in db_result_list])
            self.cold_zone_ids = set([
                db_result.id for db_result in AreaList.query.filter(
                    AreaList.enable == 1, AreaList.name.like('%冷區%')).all()
            ])
            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2, MMSI_Info.alpha_3).all():
                self.mmsi_dict[db_result.mmsi] = \
                    db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2
            self.sc = ShipXY_Crawler()
            cookies_list = []
            for db_result in ShipxyAccount.query.filter(
                    ShipxyAccount.enable == 1, ShipxyAccount.updating == 0,
                    ShipxyAccount.updated_time >= (datetime.now() - timedelta(days=1))).all():
                if not db_result.cookies:
                    continue
                cookies_list.append(deepcopy(db_result.cookies))
            if not cookies_list:
                raise Exception('{}: 無可用之帳號'.format(self.script_name))
            self.sc.update_cookies_list(cookies_list)
            del cookies_list
            while True:
                if not self.crawler_status:
                    raise Exception('\n'.join(list(set(self.error_msg_list))))
                elif self.get_shipxy_thread_list and \
                        [x.is_alive() for x in self.get_shipxy_thread_list].count(True) \
                        >= self.get_shipxy_thread_limit_tmp:
                    continue
                remove_index_list = []
                for index, thread in enumerate(self.get_shipxy_thread_list):
                    if not thread.is_alive():
                        remove_index_list.append(index)
                remove_index_list.reverse()
                for index in remove_index_list:
                    del self.get_shipxy_thread_list[index]
                cookies_list = []
                for cookies in self.sc.cookies_list:
                    if 'SERVERID' in cookies:
                        SERVERID_list = cookies['SERVERID'].split('|')
                        SERVERID_list[1] = '{}'.format(time.time())
                        SERVERID_list[2] = '{}'.format(time.time())
                        cookies['SERVERID'] = '|'.join(SERVERID_list)
                    cookies_list.append(cookies)
                self.sc.update_cookies_list(cookies_list)
                del cookies_list
                db_result = CrawlerMachine.query.filter(CrawlerMachine.ip == ip).first()
                if not db_result:
                    db_result = CrawlerMachine(ip=ip)
                db_result.updatedAt = datetime.now()
                db.session.add(db_result)
                db.session.commit()
                machine_quantity = CrawlerMachine.query.filter(
                    CrawlerMachine.updatedAt >= (datetime.now() - timedelta(hours=1))).count()
                if not machine_quantity:
                    machine_quantity += 1
                # Each account may query an area on average once per second,
                # to keep accounts from being locked.
                # Formula: (1 second / (available accounts / machine count)) - (time already spent this round)
                self.gernal_sleep_time = (1 / len(self.sc.cookies_list)) * machine_quantity * 1.5
                self.get_shipxy_thread_limit_tmp = math.floor(
                    (len(self.sc.cookies_list) / machine_quantity) /
                    self.get_ship_detail_quantity_limit)
                if not self.get_shipxy_thread_limit_tmp:
                    raise Exception('\n'.join([
                        '{}: 帳號總數量未達可爬取之帳號最小數量\n'.format(self.script_name),
                        '最小數量定義的算式為\n',
                        '可用帳號之數量({}) 除以 機器總數({}) 除以 每個 thread 取得船隻詳細資料的子程序上限值({}) 後取最小值整數'.format(
                            len(self.sc.cookies_list), machine_quantity,
                            self.get_ship_detail_quantity_limit)
                    ]))
                if self.db_rollback_dict:
                    for db_result_id in list(self.db_rollback_dict.keys()):
                        db_result = SubAreaList.query.filter(
                            SubAreaList.id == db_result_id).first()
                        db_result.crawler_time = self.db_rollback_dict[db_result_id]['crawler_time']
                        db_result.next_time = self.db_rollback_dict[db_result_id]['next_time']
                        db.session.add(db_result)
                        del self.db_rollback_dict[db_result_id]
                    db.session.commit()
                db_result = SubAreaList.query.filter(
                    SubAreaList.enable == 1, SubAreaList.web == 'shipxy',
                    or_(SubAreaList.next_time <= datetime.now(),
                        SubAreaList.next_time == None),
                    or_(*[
                        SubAreaList.area_list_id == id
                        for id in crawl_span_dict.keys()
                    ])).order_by(sqlalchemy.func.field(*query_sort_conds),
                                 asc(SubAreaList.next_time), func.random()).first()
                if not db_result:
                    if [x.is_alive() for x in self.get_shipxy_thread_list].count(True):
                        print('{}: 無需要爬取的區域, 等待仍在執行的區域爬取子程序結束中,如果所有子程序執行結束且無任何需爬取的區域,程式將會結束'.format(self.script_name))
                        while [x.is_alive() for x in self.get_shipxy_thread_list].count(True):
                            # If some area is due to be crawled again, resume crawling
                            if not datetime.now().minute \
                                    or datetime.now().minute in crawl_span_dict.values():
                                break
                        # continue here instead of return: if the worker threads finish
                        # right past minute 30 or minute 0, there will again be areas to crawl
                        continue
                    else:
                        print('{}: 無需要爬取的區域, 程式結束, 時間: {}'.format(
                            self.script_name, datetime.now()))
                        return
                get_shipxy_thread_input = deepcopy(db_result.json())
                if db_result.area_list_id not in crawl_span_dict:
                    crawl_span_dict[db_result.area_list_id] = AreaList.query.filter(
                        AreaList.id == db_result.area_list_id).first().crawl_span
                crawler_time = datetime.now() - timedelta(
                    minutes=datetime.now().minute % crawl_span_dict[db_result.area_list_id])
                db_result.crawler_time = datetime.strptime(
                    crawler_time.strftime('%Y-%m-%d %H:%M:00'), '%Y-%m-%d %H:%M:%S')
                db_result.next_time = db_result.crawler_time + timedelta(
                    minutes=crawl_span_dict[db_result.area_list_id])
                db.session.add(db_result)
                db.session.commit()
                # Skip degenerate areas whose bounding box has zero width or height
                if db_result.lu_lat == db_result.rd_lat \
                        or db_result.lu_lng == db_result.rd_lng:
                    continue
                db.session.rollback()
                db.session.close()
                thread = threading.Thread(target=self.get_shipxy_thread,
                                          args=(get_shipxy_thread_input, ),
                                          daemon=True)
                thread.start()
                self.get_shipxy_thread_list.append(thread)
                # ###############################
                # for thread in self.get_shipxy_thread_list:
                #     thread.join()
                # return
                # ###############################
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
            ggg.send_email()
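# ------------------------------------------------------------------------------
# The crawler above throttles its workers by counting live threads with
# `[x.is_alive() for x in threads].count(True)` inside a busy-wait loop.
# Below is a minimal, self-contained sketch of that pattern; `run_bounded`,
# `worker`, and the limit of 6 are illustrative names, not part of the crawler.
import threading
import time


def run_bounded(jobs, worker, limit=6):
    # Run worker(job) for each job while keeping at most `limit` threads alive.
    threads = []
    for job in jobs:
        # Wait for a free slot; a short sleep avoids pegging a CPU core the way
        # the bare `continue` busy-wait in the crawler does.
        while [t.is_alive() for t in threads].count(True) >= limit:
            time.sleep(0.1)
        # Prune finished threads so the list does not grow without bound.
        threads = [t for t in threads if t.is_alive()]
        t = threading.Thread(target=worker, args=(job, ), daemon=True)
        t.start()
        threads.append(t)
    for t in threads:
        t.join()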
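# ------------------------------------------------------------------------------
# get_shipxy_thread deduplicates against Elasticsearch by scanning for the
# candidate '_id's first and skipping any that already exist. A hedged sketch
# of the same pattern using the official elasticsearch-py helpers; the Elastic
# wrapper above presumably does something similar, and the client setup and
# index name here are illustrative only.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan


def existing_ids(client, index_name, candidate_ids):
    # Return the subset of candidate_ids already present in index_name.
    query = {'query': {'bool': {'must': [{'terms': {'_id': candidate_ids}}]}}}
    return {hit['_id'] for hit in scan(client, query=query, index=index_name)}

# client = Elasticsearch(['http://localhost:9200'])
# fresh_docs = [d for d in docs
#               if d['_id'] not in existing_ids(client, 'shipxy', [d['_id'] for d in docs])]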
class dcard_crawler():
    def __init__(self):
        self.rdbra = RequestDcardByRESTfulAPI()
        self.es = Elastic(
            host=app.config['ES_SETTING']['CONNECTION']['HOST'],
            port=app.config['ES_SETTING']['CONNECTION']['PORT'],
            username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
            password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
        # for index_category, index_info in app.config['ES_SETTING']['ES_INDEX'].items():
        #     self.es.create_index(index_info['INDEX_NAME'], index_info['MAPPING_FILEPATH'])
        self.article_es_key_list = []
        with open(app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['MAPPING_FILEPATH'], 'r') as f:
            for key in json.loads(f.read())['mappings']['properties']:
                self.article_es_key_list.append(key)
        self.comment_es_key_list = []
        with open(app.config['ES_SETTING']['ES_INDEX']['COMMENT']['MAPPING_FILEPATH'], 'r') as f:
            for key in json.loads(f.read())['mappings']['properties']:
                self.comment_es_key_list.append(key)
        self.exist_index = {}

    ################################################################################
    # Utilities

    def crawler_run_over_multi_hours(self, start_time, hours=run_hours_limit):
        return (start_time + timedelta(hours=hours)) <= datetime.now()

    def gen_article_url(self, forum_alias, article_id):
        return 'https://www.dcard.tw/f/{}/p/{}'.format(forum_alias, article_id)

    def batch_load_retryer(self, input_batch_load_list):
        # In case the ES host is temporarily unreachable, probe the bulk load
        # in a retry loop first
        retry_n = 0
        while retry_n < retry_n_limit:
            try:
                self.es.batch_load(input_batch_load_list)
                break
            except:
                retry_n += 1
        # Once the retries are used up, try one last time; if that also fails,
        # the exception propagates and triggers the error mail
        if retry_n >= retry_n_limit:
            self.es.batch_load(input_batch_load_list)

    def format_dcard_article(self, input_dict):
        input_dict = format_datetime_dict(input_dict)
        dictionary = {
            '_id': input_dict['id'],
            '_index': app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                datetime.strptime(input_dict['createdAt'], '%Y-%m-%d %H:%M:%S').year),
            '_type': '_doc',
            'url': self.gen_article_url(input_dict['forumAlias'], input_dict['id'])
        }
        for key in self.article_es_key_list:
            if key in input_dict:
                dictionary[key] = input_dict[key]
        dictionary['websiteId'] = input_dict['forumId']
        dictionary['website'] = input_dict['forumName']
        dictionary['websiteAlias'] = input_dict['forumAlias']
        dictionary['time'] = input_dict['createdAt']
        dictionary['update_time'] = input_dict['updatedAt']
        dictionary['db_update_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        dictionary['reactionCount'] = input_dict['likeCount']
        if dictionary['_index'] not in self.exist_index:
            if not self.es.check_index_exist(dictionary['_index']):
                self.es.create_index(
                    dictionary['_index'],
                    app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['MAPPING_FILEPATH'])
            self.exist_index[dictionary['_index']] = True
        dictionary['reactions'] = {
            x['id']: x['count'] for x in dictionary['reactions']
        }
        dictionary['media_data'] = []
        url_dict = {x['url']: None for x in input_dict['media']}
        for mediaMeta_dict in input_dict['mediaMeta']:
            if mediaMeta_dict.get('normalizedUrl') \
                    and mediaMeta_dict.get('url') in url_dict:
                dictionary['media_data'].append({
                    'url': mediaMeta_dict['url'],
                    'normalizedUrl': mediaMeta_dict['normalizedUrl']
                })
        return dictionary

    def format_dcard_comment(self, input_dict, year, websiteId, website):
        input_dict = format_datetime_dict(input_dict)
        dictionary = {
            '_id': input_dict['id'],
            '_index': app.config['ES_SETTING']['ES_INDEX']['COMMENT']['INDEX_NAME_TEMPLATE'].format(year),
            '_type': '_doc'
        }
        for key in self.comment_es_key_list:
            if key in input_dict:
                dictionary[key] = input_dict[key]
        dictionary['media_data'] = []
        url_dict = {x['url']: None for x in input_dict['mediaMeta']}
        for mediaMeta_dict in input_dict['mediaMeta']:
            if mediaMeta_dict.get('normalizedUrl') \
                    and mediaMeta_dict.get('url') in url_dict:
                dictionary['media_data'].append({
                    'url': mediaMeta_dict['url'],
                    'normalizedUrl': mediaMeta_dict['normalizedUrl']
                })
        dictionary['key_no'] = input_dict['postId']
        dictionary['reactionCount'] = input_dict.get('likeCount')
        dictionary['time'] = input_dict['createdAt']
        dictionary['update_time'] = input_dict['updatedAt']
        dictionary['db_update_time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        dictionary['websiteId'] = websiteId
        dictionary['website'] = website
        dictionary['websiteAlias'] = input_dict['websiteAlias']
        return dictionary

    ################################################################################

    def dcard_forums_crawler(self, sub_script_name):
        db.session.close()
        # Check whether the same scheduled job is still running on this machine
        if check_duplicate_process(sub_script_name):
            # Counting this process, two or more identical jobs are running
            print('{}: 有相同排程尚在執行({})'.format(sub_script_name, 1))
            return
        print(sub_script_name)
        err_code_startw = 0
        forums = self.rdbra.get_forums()
        # If the forum list cannot be fetched, this host has been blocked by Dcard
        if not forums:
            raise Exception('排程名稱: {}, 訊息: 機器被鎖({}, {})'.format(
                sub_script_name, err_code_startw, 1))
        print('排程名稱: {}, 訊息: Dcard看板共有 {} 個'.format(sub_script_name, len(forums)))
        forum_id_dict = {forum['id']: forum for forum in forums}
        db_forum_id_dict = {
            db_result.id: db_result for db_result in DcardForums.query.all()
        }
        print('排程名稱: {}, 訊息: DB內 Dcard看板有 {} 個'.format(
            sub_script_name, len(db_forum_id_dict)))
        for forum_id in list(set(forum_id_dict.keys()) - set(db_forum_id_dict.keys())):
            forum = forum_id_dict[forum_id]
            forum['pc_l30d'] = forum['postCount']['last30Days']
            forum['backtrack'] = 0
            forum['enable'] = 1
            db_forum_id_dict[forum['id']] = True
            db.session.add(DcardForums(**forum))
        nnn = 0
        for forum_id in list(set(db_forum_id_dict.keys()) - set(forum_id_dict.keys())):
            db_result = db_forum_id_dict[forum_id]
            db_result.exist = 0
            db.session.add(db_result)
            nnn += 1
        print('排程名稱: {}, 訊息: 存在於DB內,但Dcard已經關版的看板有 {} 個'.format(
            sub_script_name, nnn))
        DcardForums.query.filter(DcardForums.ac_time == None).update({'ac_status': 0})
        DcardForums.query.filter(DcardForums.cc_time == None).update({'cc_status': 0})
        db.session.commit()

    def dcard_article_crawler(self, sub_script_name):
        # Check whether the same scheduled job is still running on this machine
        if check_duplicate_process(sub_script_name):
            # Counting this process, two or more identical jobs are running
            print('{}: 有相同排程尚在執行({})'.format(sub_script_name, 1))
            return
        DcardForums.query.filter(DcardForums.ac_status == 1).update({
            'ac_status': 0,
            'ac_time': None
        })
        db.session.commit()
        err_code_startw = 1
        try:
            while True:
                db.session.rollback()
                db.session.close()
                DcardForums.query.filter(
                    DcardForums.ac_status == 1, DcardForums.ac_time != None,
                    DcardForums.ac_time <= (datetime.now() - timedelta(hours=6))
                ).update({'ac_status': 0})
                db.session.commit()
                forum_alias_db_result = DcardForums.query.filter(
                    DcardForums.ac_status == 0, DcardForums.enable == 1,
                    DcardForums.exist == 1, DcardForums.pc_l30d != 0).order_by(
                        asc(DcardForums.ac_time), asc(DcardForums.pc_l30d)).first()
                if not forum_alias_db_result:
                    break
                # The comment crawler has not caught up with the article progress;
                # no articles to crawl
                if (forum_alias_db_result.ac_time and forum_alias_db_result.cc_time
                        and forum_alias_db_result.ac_time > forum_alias_db_result.cc_time) \
                        or (not forum_alias_db_result.cc_time and forum_alias_db_result.ac_time):
                    print('排程名稱: {}, 訊息: {}'.format(
                        sub_script_name, '留言排程尚未追上文章排程進度,無文章可爬取'))
                    break
                crawler_start_time = datetime.now()
                forum_alias_db_result.ac_time = crawler_start_time
                forum_alias_db_result.ac_status = 1
                db.session.add(forum_alias_db_result)
                db.session.commit()
                forum_alias = forum_alias_db_result.alias
                before_id = None
                finish_status = False
                while not finish_status:
                    if not self.rdbra.request_dcard_status():
                        raise Exception('排程名稱: {}, 訊息: 機器被鎖({}, {})'.format(
                            sub_script_name, err_code_startw, 1))
                    params = pop_dict_empty_value_key({
                        'before': before_id,
                        'popular': 'false'
                    })
                    article_list = self.rdbra.get_article_list(forum_alias, params)
                    if not article_list:
                        print('排程名稱: {}, 無文章'.format(sub_script_name))
                        break
                    batch_load_list = []
                    before_id = '{}'.format(article_list[-1]['id'])
                    print('排程名稱: {}, 工作內容: 爬取看板 {} 七天內文章中, 排程啟動時間: {}, 回溯進度: {}'.format(
                        sub_script_name, forum_alias,
                        crawler_start_time.strftime('%Y-%m-%d %H:%M:%S'),
                        article_list[0]['createdAt']))
                    for article in article_list:
                        tmp_dict = self.rdbra.get_article_content(article['id'])
                        time.sleep(2)
                        if not tmp_dict or not tmp_dict.get('forumAlias'):
                            continue
                        batch_load_dict = self.format_dcard_article(tmp_dict)
                        if not batch_load_dict.get('content') and self.es.search_by_id(
                                app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME'],
                                '_doc', batch_load_dict['_id'])['found']:
                            continue
                        elif datetime.strptime(batch_load_dict['time'], '%Y-%m-%d %H:%M:%S') \
                                <= (crawler_start_time - timedelta(days=7)):
                            # Extra check: if the forum has had no articles within the
                            # last 7 days, still crawl at least the newest one into ES
                            # so the "have past articles been imported" check can run
                            if not self.es.count(
                                    {"query": {"bool": {"must": [{"match": {"websiteAlias": forum_alias}}]}}},
                                    app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                                        datetime.strptime(batch_load_dict['time'], '%Y-%m-%d %H:%M:%S').year)) \
                                    and not batch_load_list:
                                pass
                            else:
                                print('排程名稱: {}, 看板{}過去七天內文章已爬完'.format(
                                    sub_script_name, forum_alias))
                                finish_status = True
                                break
                        batch_load_list.append(batch_load_dict)
                    if batch_load_list:
                        self.batch_load_retryer(batch_load_list)
                forum_alias_db_result.ac_status = 0
                db.session.add(forum_alias_db_result)
                db.session.commit()
            while True:
                db.session.rollback()
                db.session.close()
                crawler_start_time = datetime.now()
                forum_alias_db_result = DcardForums.query.filter(
                    DcardForums.enable == 1, DcardForums.backtrack == 1,
                    DcardForums.exist == 1).order_by(asc(DcardForums.ac_time)).first()
                if not forum_alias_db_result:
                    print('排程名稱: {}, 訊息: 無需回溯的看板'.format(sub_script_name))
                    break
                forum_alias = forum_alias_db_result.alias
                # All forums together only get one hour for backtracking,
                # to avoid holding everything else up
                if self.crawler_run_over_multi_hours(crawler_start_time, hours=1):
                    print('排程名稱: {}, 訊息: {}'.format(
                        sub_script_name, '回溯執行超過一個小時,停止回溯看板 {}'.format(forum_alias)))
                    break
                # From here on, backtrack the forum's past articles
                print('排程名稱: {}, 開始檢查看板{}過去文章是否已匯入'.format(
                    sub_script_name, forum_alias))
                query = {
                    "from": 0,
                    "size": 1,
                    "sort": [{"time": "asc"}],
                    "query": {"term": {"websiteAlias": forum_alias}}
                }
                before_id = None
                year_n = 0
                es_result = None
                while True:
                    index_name = app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                        crawler_start_time.year - year_n)
                    if not self.es.check_index_exist(index_name):
                        break
                    tmp_es_result = self.es.search(query, index_name)
                    if tmp_es_result['hits']['hits']:
                        es_result = deepcopy(tmp_es_result)
                    else:
                        break
                    # Walk back one index year at a time, keeping the earliest hit
                    year_n += 1
                if not es_result or not es_result['hits']['hits']:
                    print('排程名稱: {}, 看板{}無任何文章'.format(sub_script_name, forum_alias))
                    continue
                before_id = '{}'.format(es_result['hits']['hits'][0]['_id'])
                finish_status = False
                while not finish_status:
                    # All forums together only get one hour for backtracking
                    if self.crawler_run_over_multi_hours(crawler_start_time, hours=1):
                        print('排程名稱: {}, 訊息: {}'.format(
                            sub_script_name, '回溯執行超過一個小時,停止回溯看板 {}'.format(forum_alias)))
                        break
                    params = {'before': before_id, 'popular': 'false'}
                    for key in list(params.keys()):
                        if not params[key]:
                            params.pop(key)
                    article_list = self.rdbra.get_article_list(forum_alias, params)
                    if not article_list:
                        print('排程名稱: {}, 看板{}已完成回溯所有文章'.format(
                            sub_script_name, forum_alias))
                        break
                    batch_load_list = []
                    before_id = '{}'.format(article_list[-1]['id'])
                    print('排程名稱: {}, 工作內容: 爬取看板 {} 七天內文章中, 排程啟動時間: {}, 回溯進度: {}'.format(
                        sub_script_name, forum_alias,
                        crawler_start_time.strftime('%Y-%m-%d %H:%M:%S'),
                        article_list[0]['createdAt']))
                    for article in article_list:
                        tmp_dict = self.rdbra.get_article_content(article['id'])
                        time.sleep(2)
                        if not tmp_dict or not tmp_dict.get('forumAlias'):
                            continue
                        batch_load_dict = self.format_dcard_article(tmp_dict)
                        batch_load_list.append(batch_load_dict)
                    if batch_load_list:
                        self.batch_load_retryer(batch_load_list)
                forum_alias_db_result.ac_status = 0
                db.session.add(forum_alias_db_result)
                db.session.commit()
        except Exception as e:
            subject = 'Dcard排程 {} 出現錯誤'.format(sub_script_name)
            message_list = [
                '{}\n'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                '{}\n'.format(str(e)),
                '{}\n'.format(traceback.format_exc())
            ]
            print(traceback.format_exc())
            gs = GmailSender(app.config['GOOGLE_SENDER_CONF']['FROM_ADDRESS'],
                             app.config['GOOGLE_SENDER_CONF']['RECEIVER_LIST'],
                             subject, '\n'.join(message_list))
            gs.send_email()

    def dcard_comment_crawler(self, sub_script_name):
        # Check whether the same scheduled job is still running on this machine
        if check_duplicate_process(sub_script_name):
            # Counting this process, two or more identical jobs are running
            print('{}: 有相同排程尚在執行({})'.format(sub_script_name, 1))
            return
        DcardForums.query.filter(DcardForums.cc_status == 1).update({
            'cc_status': 0,
            'cc_time': None
        })
        db.session.commit()
        err_code_startw = 2
        try:
            while True:
                db.session.rollback()
                db.session.close()
                DcardForums.query.filter(
                    DcardForums.cc_status == 1, DcardForums.cc_time != None,
                    DcardForums.cc_time <= (datetime.now() - timedelta(hours=6))
                ).update({'cc_status': 0})
                db.session.commit()
                forum_alias_db_result = DcardForums.query.filter(
                    DcardForums.enable == 1, DcardForums.ac_time != None,
                    DcardForums.ac_status == 0, DcardForums.cc_status == 0,
                    DcardForums.exist == 1, DcardForums.pc_l30d != 0).order_by(
                        asc(DcardForums.cc_time), asc(DcardForums.ac_time),
                        asc(DcardForums.pc_l30d)).first()
                if not forum_alias_db_result:
                    print('排程名稱: {}, 訊息: {} ({})'.format(
                        sub_script_name, '文章排程尚未追上留言排程進度,無留言可爬取', 1))
                    break
                elif forum_alias_db_result.cc_time \
                        and forum_alias_db_result.ac_time < forum_alias_db_result.cc_time:
                    print('排程名稱: {}, 訊息: {} ({})'.format(
                        sub_script_name, '文章排程尚未追上留言排程進度,無留言可爬取', 2))
                    break
                print(forum_alias_db_result.alias)
                crawler_start_time = datetime.now()
                forum_alias = forum_alias_db_result.alias
                forum_alias_db_result.cc_time = crawler_start_time
                forum_alias_db_result.cc_status = 1
                db.session.add(forum_alias_db_result)
                db.session.commit()
                article_index_year_list = [forum_alias_db_result.ac_time.year]
                if (forum_alias_db_result.ac_time - timedelta(days=7)).year \
                        != forum_alias_db_result.ac_time.year:
                    article_index_year_list.append(forum_alias_db_result.ac_time.year - 1)
                for article_index_year in article_index_year_list:
                    article_index_name = app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(article_index_year)
                    comment_index_name = app.config['ES_SETTING']['ES_INDEX']['COMMENT']['INDEX_NAME_TEMPLATE'].format(article_index_year)
                    if article_index_name not in self.exist_index \
                            and not self.es.check_index_exist(article_index_name):
                        continue
                    else:
                        self.exist_index[article_index_name] = True
                    if comment_index_name not in self.exist_index:
                        if not self.es.check_index_exist(comment_index_name):
                            self.es.create_index(
                                comment_index_name,
                                app.config['ES_SETTING']['ES_INDEX']['COMMENT']['MAPPING_FILEPATH'])
                        self.exist_index[comment_index_name] = True
                    article_query = {
                        "from": 0,
                        "size": 100,
                        "sort": [{"time": "asc"}],
                        "query": {
                            "bool": {
                                "must": [
                                    {"term": {"websiteAlias": forum_alias}},
                                    {"range": {"commentCount": {"gt": 0}}}
                                ]
                            }
                        }
                    }
                    comment_query = {
                        "from": 0,
                        "size": 1,
                        "query": {
                            "bool": {
                                "must": [{"term": {"websiteAlias": forum_alias}}]
                            }
                        }
                    }
                    # If this is not the forum's first comment-crawl run,
                    # add the time range filter
                    if self.es.search(
                            comment_query,
                            app.config['ES_SETTING']['ES_INDEX']['COMMENT']['INDEX_NAME_TEMPLATE'].format(
                                article_index_year))['hits']['hits']:
                        article_query['query']['bool']['must'].append({
                            "range": {
                                "time": {
                                    "gte": (forum_alias_db_result.ac_time -
                                            timedelta(days=7)).strftime('%Y-%m-%d %H:%M:%S'),
                                    "lte": forum_alias_db_result.ac_time.strftime('%Y-%m-%d %H:%M:%S')
                                }
                            }
                        })
                    while True:
                        es_result = self.es.search(
                            article_query,
                            app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                                article_index_year))
                        if not es_result['hits']['hits']:
                            break
                        article_query['from'] += article_query['size']
                        print('排程名稱: {}, 工作內容: 爬取看板 {} 七天內文章中, 排程啟動時間: {}, 回溯進度: {}'.format(
                            sub_script_name, forum_alias,
                            crawler_start_time.strftime('%Y-%m-%d %H:%M:%S'),
                            es_result['hits']['hits'][0]['_source']['time']))
                        for article_dict in es_result['hits']['hits']:
                            comment_list = self.rdbra.get_article_comments_by_num(
                                article_dict['_id'], input_sleep_time=2)
                            time.sleep(2)
                            if not comment_list:
                                continue
                            batch_load_list = []
                            for comment_dict in comment_list:
                                comment_dict['websiteAlias'] = forum_alias
                                batch_load_dict = self.format_dcard_comment(
                                    comment_dict, article_index_year,
                                    article_dict['_source']['websiteId'],
                                    article_dict['_source']['website'])
                                if not batch_load_dict.get('content') or (
                                        not batch_load_dict['content']
                                        and self.es.search_by_id(
                                            comment_index_name, '_doc',
                                            batch_load_dict['_id'])['found']):
                                    continue
                                batch_load_list.append(batch_load_dict)
                            if batch_load_list:
                                self.batch_load_retryer(batch_load_list)
                forum_alias_db_result.cc_status = 0
                db.session.add(forum_alias_db_result)
                db.session.commit()
                if forum_alias_db_result.backtrack != 1:
                    continue
                db.session.rollback()
                db.session.close()
                crawler_start_time = datetime.now()
                forum_alias_db_result.cc_time = crawler_start_time
                forum_alias_db_result.cc_status = 1
                db.session.add(forum_alias_db_result)
                db.session.commit()
                # From here on, backtrack the comments of the forum's past articles
                print('排程名稱: {}, 開始檢查看板{}過去文章留言是否已匯入'.format(
                    sub_script_name, forum_alias))
                year_n = 0
                article_earliest_time = None
                article_query = {
                    "from": 0,
                    "size": 1,
                    "sort": [{"time": "asc"}],
                    "query": {
                        "bool": {
                            "must": [
                                {"term": {"websiteAlias": forum_alias}},
                                {"range": {"commentCount": {"gt": 0}}}
                            ]
                        }
                    }
                }
                while True:
                    index_name = app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                        crawler_start_time.year - year_n)
                    if index_name not in self.exist_index:
                        if not self.es.check_index_exist(index_name):
                            break
                        self.exist_index[index_name] = True
                    es_result = self.es.search(article_query, index_name)
                    if not es_result['hits']['hits']:
                        break
                    article_earliest_time = datetime.strptime(
                        es_result['hits']['hits'][0]['_source']['time'],
                        '%Y-%m-%d %H:%M:%S')
                    year_n += 1
                year_n = 0
                comment_earliest_article_time = None
                comment_query = {
                    "from": 0,
                    "size": 1,
                    "sort": [{"postId": "asc"}],
                    "query": {
                        "bool": {
                            "must": [{"term": {"websiteAlias": forum_alias}}]
                        }
                    }
                }
                while True:
                    index_name = app.config['ES_SETTING']['ES_INDEX']['COMMENT']['INDEX_NAME_TEMPLATE'].format(
                        crawler_start_time.year - year_n)
                    if index_name not in self.exist_index:
                        if not self.es.check_index_exist(index_name):
                            break
                        self.exist_index[index_name] = True
                    es_result = self.es.search(comment_query, index_name)
                    if not es_result['hits']['hits']:
                        break
                    comment_earliest_article_time = datetime.strptime(
                        self.es.search_by_id(
                            app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                                crawler_start_time.year - year_n),
                            '_doc',
                            '{}'.format(es_result['hits']['hits'][0]['_source']['postId']))['_source']['time'],
                        '%Y-%m-%d %H:%M:%S')
                    year_n += 1
                if not article_earliest_time:
                    print('排程名稱: {} 訊息: 看板{}沒有已爬入ES的文章,故略過留言回溯'.format(
                        sub_script_name, forum_alias))
                    continue
                if not comment_earliest_article_time:
                    # No comments stored for this forum yet, so there is no
                    # backtrack cursor to start from
                    continue
                article_query = {
                    "from": 0,
                    "size": 10,
                    "sort": [{"time": "desc"}],
                    "query": {
                        "bool": {
                            "must": [
                                {"term": {"websiteAlias": forum_alias}},
                                {"range": {"commentCount": {"gt": 0}}},
                                {"range": {"time": {"lt": comment_earliest_article_time.strftime('%Y-%m-%d %H:%M:%S')}}}
                            ]
                        }
                    }
                }
                traceback_stop_status = False
                while comment_earliest_article_time > article_earliest_time:
                    article_index_name = app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                        comment_earliest_article_time.year)
                    if article_index_name not in self.exist_index:
                        if self.es.check_index_exist(article_index_name):
                            self.exist_index[article_index_name] = True
                        else:
                            break
                    # Each forum only gets one hour for backtracking,
                    # to avoid holding everything else up
                    elif self.crawler_run_over_multi_hours(crawler_start_time, 1):
                        print('排程名稱: {}, 訊息: {}'.format(
                            sub_script_name, '回溯執行超過一個小時,停止回溯留言 {}'.format(forum_alias)))
                        traceback_stop_status = True
                        break
                    es_result = self.es.search(
                        article_query,
                        app.config['ES_SETTING']['ES_INDEX']['ARTICLE']['INDEX_NAME_TEMPLATE'].format(
                            comment_earliest_article_time.year))
                    if not es_result['hits']['hits']:
                        comment_earliest_article_time = datetime.strptime(
                            '{}-12-31 23:59:59'.format(comment_earliest_article_time.year - 1),
                            '%Y-%m-%d %H:%M:%S')
                        continue
                    print('排程名稱: {}, 工作內容: 爬取看板 {} 七天內文章中, 排程啟動時間: {}, 回溯進度: {}'.format(
                        sub_script_name, forum_alias,
                        crawler_start_time.strftime('%Y-%m-%d %H:%M:%S'),
                        es_result['hits']['hits'][0]['_source']['time']))
                    for article_dict in es_result['hits']['hits']:
                        comment_list = self.rdbra.get_article_comments_by_num(
                            article_dict['_id'], input_sleep_time=2)
                        time.sleep(2)
                        if not comment_list:
                            continue
                        batch_load_list = []
                        for comment_dict in comment_list:
                            comment_dict['websiteAlias'] = forum_alias
                            batch_load_dict = self.format_dcard_comment(
                                comment_dict,
                                datetime.strptime(article_dict['_source']['time'],
                                                  '%Y-%m-%d %H:%M:%S').year,
                                article_dict['_source']['websiteId'],
                                article_dict['_source']['website'])
                            if not batch_load_dict.get('content') or (
                                    not batch_load_dict['content']
                                    and self.es.search_by_id(
                                        batch_load_dict['_index'], '_doc',
                                        batch_load_dict['_id'])['found']):
                                continue
                            batch_load_list.append(batch_load_dict)
                        if batch_load_list:
                            self.batch_load_retryer(batch_load_list)
                    del article_query['query']['bool']['must'][-1]
                    comment_earliest_article_time = datetime.strptime(
                        es_result['hits']['hits'][-1]['_source']['time'],
                        '%Y-%m-%d %H:%M:%S')
                    article_query['query']['bool']['must'].append({
                        'range': {
                            'time': {
                                'lt': es_result['hits']['hits'][-1]['_source']['time']
                            }
                        }
                    })
                    article_query['from'] += article_query['size']
                forum_alias_db_result.cc_status = 0
                db.session.add(forum_alias_db_result)
                db.session.commit()
                # All forums together only get one hour to backtrack comments
                if traceback_stop_status:
                    break
        except Exception as e:
            subject = 'Dcard排程 {} 出現錯誤'.format(sub_script_name)
            message_list = [
                '{}\n'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                '{}\n'.format(str(e)),
                '{}\n'.format(traceback.format_exc())
            ]
            print(traceback.format_exc())
            gs = GmailSender(app.config['GOOGLE_SENDER_CONF']['FROM_ADDRESS'],
                             app.config['GOOGLE_SENDER_CONF']['RECEIVER_LIST'],
                             subject, '\n'.join(message_list))
            gs.send_email()
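# ------------------------------------------------------------------------------
# A hedged sketch of how dcard_crawler might be driven. In the real deployment
# each sub-crawler is presumably launched from its own scheduled script; the
# sub_script_name values below are illustrative placeholders.
def run_dcard_crawlers():
    dc = dcard_crawler()
    dc.dcard_forums_crawler('dcard_forums_crawler.py')    # sync the forum list
    dc.dcard_article_crawler('dcard_article_crawler.py')  # recent articles + backtrack
    dc.dcard_comment_crawler('dcard_comment_crawler.py')  # comments for crawled articles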
class myships_crawler_class():
    def __init__(self):
        self.machine_serial = int(os.environ.get('SERIAL', '0'))
        self.script_name = os.path.basename(__file__)
        self.err_msg_list = []
        self.thread_list = []
        self.thread_max_count = 30
        self.err_count = 0
        self.err_count_max = 3
        self.no_data_count = 0
        self.no_data_count_max = 10
        self.machine_count = len(serial_list)
        self.ship_detail_dict = {}
        self.headers = {
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Macintosh Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
        }
        # self.comparison_dict = {
        #     'updatetime': 'updatetime',
        #     'eta_timestamp': 'eta',
        #     'callsign': 'callsign',
        #     'cog': 'cog',
        #     'dest': 'destPort',
        #     'draught': 'draught',
        #     'hdg': 'heading',
        #     'imo': 'imo',
        #     'latitude': 'lat',
        #     'length': 'length',
        #     'longitude': 'lon',
        #     'mmsi': 'mmsi',
        #     'name': 'shipnameEn',
        #     'navistatus': 'aisNavStatus',
        #     'rot': 'rot',
        #     'shipid': 'shipId',
        #     'sog': 'sog',
        #     'utc_timestamp': 'posTime',
        #     'type': 'shiptype',
        #     'width': 'breadth',
        #     'y': 'y',
        #     'v': 'v'
        # }

    def err_msg_generator(self, err_msg):
        return '\n'.join(
            [datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self.ip, err_msg])

    def save2es(self):
        batch_load_list = []
        for key_id in list(self.ship_detail_dict.keys()):
            id_list = [
                '{}_{}'.format(ship_detail_dict['mmsi'], ship_detail_dict['posTime'])
                for ship_detail_dict in self.ship_detail_dict[key_id]
            ]
            if id_list:
                es_ship_ids = set([
                    data['_id'] for data in self.es.scan(
                        {'query': {'bool': {'must': [{'terms': {'_id': id_list}}]}}},
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'])
                ])
            else:
                es_ship_ids = set()
            for ship_detail_dict in self.ship_detail_dict[key_id]:
                _id = '{}_{}'.format(ship_detail_dict['mmsi'], ship_detail_dict['posTime'])
                if _id in es_ship_ids:
                    continue
                dictionary = {
                    '_index': app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                    '_type': '_doc',
                    '_id': _id,
                    '_routing': '{}'.format(
                        (datetime.utcfromtimestamp(ship_detail_dict['posTime']) +
                         timedelta(hours=8)).year),
                    'updatetime': ship_detail_dict['updatetime'],
                    'eta_timestamp': ship_detail_dict['eta'],
                    'time': (datetime.utcfromtimestamp(ship_detail_dict['posTime']) +
                             timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'),
                    'callsign': ship_detail_dict['callsign'],
                    'nationality': self.mmsi_dict[ship_detail_dict['mmsi'][:3]]
                    if ship_detail_dict['mmsi'][:3] in self.mmsi_dict else None,
                    'cog': ship_detail_dict['cog'] / 10 if ship_detail_dict['cog'] else None,
                    'dest': ship_detail_dict['destPort'],
                    'draught': ship_detail_dict['draught'] / 10 if ship_detail_dict['draught'] else None,
                    'hdg': ship_detail_dict['heading'],
                    'imo': ship_detail_dict['imo'],
                    'latitude': ship_detail_dict['lat'] / 600000,
                    'length': ship_detail_dict['length'],
                    'longitude': ship_detail_dict['lon'] / 600000,
                    'mmsi': ship_detail_dict['mmsi'],
                    'name': ship_detail_dict['shipnameEn'],
                    'navistatus': ship_detail_dict['aisNavStatus'],
                    'rot': ship_detail_dict['rot'],
                    'shipid': ship_detail_dict['shipId'],
                    'sog': ship_detail_dict['sog'] / 10 if ship_detail_dict['sog'] else None,
                    'utc_timestamp': ship_detail_dict['posTime'],
                    'type': ship_detail_dict['shiptype'],
                    'width': ship_detail_dict['breadth'],
                    'y': ship_detail_dict['shiptype'],
                    'v': ship_detail_dict['aisNavStatus']
                }
                # type sometimes contains garbage such as '6857-d&0' or '1607U No.158'
                try:
                    dictionary['type'] = int(dictionary['type'])
                except:
                    dictionary['type'] = None
                try:
                    dictionary['navistatus'] = int(dictionary['navistatus'])
                except:
                    dictionary['navistatus'] = None
                if ship_detail_dict['eta']:
                    eta_datetime = datetime.utcfromtimestamp(ship_detail_dict['eta'])
                    dictionary['eta'] = eta_datetime.strftime('%m-%d %H:%M')
                    dictionary['eta_datetime'] = eta_datetime.strftime('%Y-%m-%d %H:%M:%S')
                else:
                    dictionary['eta'] = None
                    dictionary['eta_datetime'] = None
                if dictionary['type'] in self.ship_type_dict:
                    dictionary['type_text'] = self.ship_type_dict[dictionary['type']]
                if dictionary['navistatus'] in self.navistatus_type_dict:
                    dictionary['navistatus_text'] = self.navistatus_type_dict[dictionary['navistatus']]
                dictionary['y'] = dictionary['type']
                dictionary['v'] = dictionary['navistatus']
                batch_load_list.append(dictionary)
            del self.ship_detail_dict[key_id]
        self.es.batch_load(batch_load_list)

    def get_ship_detail(self, shipId_list_for_func):
        input_json = {"shipId": ','.join(shipId_list_for_func)}
        try:
            rsp = RequestsRetryer(
                'post', {
                    'url': app.config['CRAWLER_SETTING']['MYSHIPS']['SHIP_DETAIL'],
                    'headers': self.headers,
                    'json': input_json,
                    'timeout': 180
                },
                req_retry_limit=3,
                req_retry_sleeptime=5)
            rsp.close()
        except:
            self.err_count += 1
            self.err_msg_list.append(self.err_msg_generator(traceback.format_exc()))
            return
        if rsp.status_code != 200:
            self.err_count += 1
            self.err_msg_list.append(self.err_msg_generator(rsp.text))
            return
        try:
            rsp_result = rsp.json()
        except:
            self.err_count += 1
            self.err_msg_list.append(self.err_msg_generator(traceback.format_exc()))
            return
        if rsp_result['code'] != '0' or rsp_result['message'] != '成功':
            pprint(rsp_result)
            self.err_count += 1
            self.err_msg_list.append(self.err_msg_generator(rsp.text))
            return
        key_id = f'{uuid4()}'
        self.ship_detail_dict[key_id] = []
        for ship_detail_dict in rsp_result['data']:
            # Skip records missing the minimum set of position fields
            if not ship_detail_dict['mmsi'] \
                    or not ship_detail_dict['lon'] \
                    or not ship_detail_dict['lat'] \
                    or not ship_detail_dict['posTime']:
                continue
            ship_detail_dict['v'] = None
            ship_detail_dict['y'] = None
            ship_detail_dict['updatetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            self.ship_detail_dict[key_id].append(ship_detail_dict)
        if not self.ship_detail_dict[key_id]:
            self.no_data_count += 1
            return

    # Sample SHIP_DETAIL response:
    # {
    #     "code": "0",
    #     "count": 1,
    #     "message": "成功",
    #     "data": [
    #         {
    #             "posTime": 1606964270,
    #             "lon": 70842352,
    #             "lat": 14671966,
    #             "sog": 0,
    #             "cog": 2612,
    #             "heading": 359,
    #             "rot": 0,
    #             "aisNavStatus": "0",
    #             "mmsi": "413698780",
    #             "shipnameEn": "JIN YING 6",
    #             "imo": "7549747",
    #             "callsign": "BSKF",
    #             "shiptype": "60",
    #             "length": 37,
    #             "breadth": 10,
    #             "eta": 1599067020,
    #             "destPort": "XIAMEN",
    #             "draught": 18,
    #             "shipId": 2029
    #         }
    #     ]
    # }

    def myships_crawler_func(self):
        try:
            self.ip = get_external_ip()
        except:
            self.ip = None
        try:
            # Check whether the same scheduled job is still running on this machine
            if check_same_process_still_running(self.script_name):
                # Counting this process, two or more identical jobs are running
                print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
                return
            if not self.ip:
                raise Exception('無法取得 IP')
            try:
                rsp = requests.get(
                    app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'],
                    timeout=60)
                rsp.close()
            except:
                raise Exception('無法連線至寶船網網頁 :\n{}'.format(traceback.format_exc()))
            if rsp.status_code != 200:
                raise Exception('寶船網網頁無法正確連線 :\n{}'.format(rsp.text))
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME']):
                print(self.es.create_index(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['INDEX_NAME'],
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']['MAPPING_FILEPATH']))
            self.ship_type_dict = {
                x.type: x.name for x in ShipTypeMyships.query.all()
            }
            self.navistatus_type_dict = {
                x.type: x.name for x in NavistatusTypeMyships.query.all()
            }
            self.mmsi_dict = {}
            for db_result in MMSI_Info.query.with_entities(
                    MMSI_Info.mmsi, MMSI_Info.alpha_2, MMSI_Info.alpha_3).all():
                self.mmsi_dict[db_result.mmsi] = \
                    db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2
            # start_n = 4000000 + self.machine_serial
            start_n = self.machine_serial
            while True:
                # Stop shortly before the next hourly run starts
                if datetime.now().minute >= 59 and datetime.now().second > 30:
                    return
                print(start_n)
                end_n = start_n + 1000 * self.machine_count
                shipId_list = [
                    f'{i}' for i in range(start_n, end_n, self.machine_count)
                ]
                start_n = end_n
                thread = threading.Thread(target=self.get_ship_detail,
                                          args=(shipId_list, ),
                                          daemon=True)
                thread.start()
                self.thread_list.append(thread)
                # for thread in self.thread_list:
                #     thread.join()
                # pprint(self.ship_detail_dict)
                # if self.ship_detail_dict:
                #     self.save2es()
                # pprint(self.ship_detail_dict)
                # return
                while [thread.is_alive() for thread in self.thread_list].count(True) \
                        >= self.thread_max_count:
                    continue
                delete_index_list = []
                for index, thread in enumerate(self.thread_list):
                    if not thread.is_alive():
                        delete_index_list.append(index)
                delete_index_list.reverse()
                for index in delete_index_list:
                    del self.thread_list[index]
                if self.err_count >= self.err_count_max:
                    raise Exception('\n\n'.join(self.err_msg_list))
                if self.no_data_count >= self.no_data_count_max:
                    break
                if self.ship_detail_dict:
                    self.save2es()
                time.sleep(1)
            for thread in self.thread_list:
                thread.join()
            if self.ship_detail_dict:
                self.save2es()
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(self.ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            print('\n\n'.join(self.err_msg_list))
            ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name),
                              app.config['GOOGLE_SENDER_CONF']['TO_LIST'],
                              '\n\n'.join(self.err_msg_list))
            ggg.send_email()
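# ------------------------------------------------------------------------------
# myships_crawler_func partitions the shipId space across machines with
# range(start_n, end_n, machine_count) plus a per-machine SERIAL offset.
# A minimal sketch of that interleaved sharding; `shard_ids` is an
# illustrative helper, not part of the crawler above.
def shard_ids(serial, machine_count, start=0, batch=1000):
    # Yield successive `batch`-sized lists of shipIds owned by machine `serial`.
    start_n = start + serial
    while True:
        end_n = start_n + batch * machine_count
        yield [f'{i}' for i in range(start_n, end_n, machine_count)]
        start_n = end_n

# e.g. with three machines, SERIAL=1 takes shipIds 1, 4, 7, ... in 1000-ID batches:
# batches = shard_ids(serial=1, machine_count=3)
# first_batch = next(batches)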
def check_rumor_func():
    script_name = os.path.basename(__file__)
    es = Elastic(host=['118.163.94.26'],
                 port=17377,
                 username='******',
                 password='******')
    query_template = {
        "from": 0,
        "size": 10000,
        "sort": [{
            "create_time": "asc"
        }],
        "query": {
            "bool": {
                "must": [{
                    "term": {
                        "source": 'mygopen'
                    }
                }, {
                    "exists": {
                        'field': 'file'
                    }
                }, {
                    "exists": {
                        'field': 'message'
                    }
                }]
            }
        }
    }
    create_time_gt_old = None
    create_time_gt = (datetime.strptime(
        es.search(query_template, 'fakenews@rumor_grouping')['hits']['hits'][0]
        ['_source']['create_time'], '%Y-%m-%d %H:%M:%S') -
                      timedelta(seconds=1)).strftime('%Y-%m-%d %H:%M:%S')
    dictionary_list_1 = []
    dictionary_list_2 = []
    while True:
        if create_time_gt_old and create_time_gt_old == create_time_gt:
            break
        print(create_time_gt)
        query = deepcopy(query_template)
        query['query']['bool']['must'].append(
            {"range": {
                "create_time": {
                    "gt": create_time_gt
                }
            }})
        search_result = es.search(query, 'fakenews@rumor_grouping')
        if not search_result['hits']['hits']:
            break
        for data in search_result['hits']['hits']:
            dictionary = OrderedDict()
            dictionary['id'] = data['_id']
            dictionary.update(data['_source'])
            if len(dictionary['file']) == 1:
                dictionary_list_1.append(dictionary)
            else:
                dictionary_list_2.append(dictionary)
        # Advance the cursor from the batch just fetched; the original re-ran
        # query_template here, so the cursor stopped moving after one page.
        create_time_gt_old = deepcopy(create_time_gt)
        create_time_gt = search_result['hits']['hits'][-1]['_source'][
            'create_time']
    export_dict = {
        'rumor_grouping檢查-一個檔案.xlsx': dictionary_list_1[:100],
        'rumor_grouping檢查-多個檔案.xlsx': dictionary_list_2[:100],
    }
    for excel_filename, dictionary_list in export_dict.items():
        # The original hard-coded the single-file workbook and
        # dictionary_list_1 on both iterations; use the loop variables so each
        # list lands in its own file.
        excel_output_path = './{}'.format(excel_filename)
        content = json.dumps(dictionary_list, ensure_ascii=False, indent=4)
        tmp_json_path = 'tmp.json'
        f = open(tmp_json_path, 'w')
        f.write(content)
        f.close()
        pandas.read_json(tmp_json_path).to_excel(excel_output_path,
                                                 sheet_name='rumor_grouping',
                                                 index=False)
        if os.path.exists(tmp_json_path):
            os.remove(tmp_json_path)
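# #############################
# check_rumor_func pages through fakenews@rumor_grouping with a manual
# create_time cursor, which can skip or repeat documents that share a
# timestamp.  Below is a minimal sketch of the same scan using Elasticsearch's
# search_after; it assumes the Elastic.search(query, index) wrapper forwards
# the body unchanged, and that the cluster permits sorting on _id as a
# tiebreaker (otherwise substitute any unique keyword field).
def scan_rumor_grouping(es, index='fakenews@rumor_grouping', page_size=1000):
    query = {
        'size': page_size,
        'sort': [{'create_time': 'asc'}, {'_id': 'asc'}],
        'query': {'bool': {'must': [
            {'term': {'source': 'mygopen'}},
            {'exists': {'field': 'file'}},
            {'exists': {'field': 'message'}},
        ]}},
    }
    search_after = None
    while True:
        if search_after:
            query['search_after'] = search_after
        hits = es.search(query, index)['hits']['hits']
        if not hits:
            break
        for hit in hits:
            yield hit
        # resume after the last hit's sort values; no date arithmetic needed
        search_after = hits[-1]['sort']
# #############################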
def myships_crawler_func(): script_name = os.path.basename(__file__) # 檢查這台機器是否有同排程還在執行 if check_same_process_still_running(script_name): # 代表包含這個程式在內,有兩個以上相同的排程正在運行 print('{}: 有相同排程尚在執行({})'.format(script_name, 1)) return try: es = Elastic( host=app.config['ES_SETTING']['CONNECTION']['HOST'], port=app.config['ES_SETTING']['CONNECTION']['PORT'], username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'], password=app.config['ES_SETTING']['CONNECTION']['PASSWORD']) if not es.check_index_exist(app.config['ES_SETTING']['INDEX_INFO'] ['MYSHIPS']['INDEX_NAME']): print( es.create_index( app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO'] ['MYSHIPS']['MAPPING_FILEPATH'])) db_result_list = AreaList.query.with_entities( AreaList.id, AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all() if not db_result_list: print('{}: 無區域的排程區間資料'.format(script_name)) return crawl_span_dict = { db_result.id: db_result.crawl_span for db_result in db_result_list } query_sort_conds = [SubAreaList.area_list_id] query_sort_conds.extend([x.id for x in db_result_list]) cold_zone_ids = set([ db_result.id for db_result in AreaList.query.filter( AreaList.enable == 1, AreaList.name.like('%冷區%')).all() ]) ship_type_dict = {x.type: x.name for x in ShipTypeMyships.query.all()} navistatus_type_dict = { x.type: x.name for x in NavistatusTypeMyships.query.all() } mc = Myships_Crawler() while True: db.session.rollback() db.session.close() batch_load_list = [] db_result = SubAreaList.query.filter( SubAreaList.enable == 1, SubAreaList.web == 'myships', or_(SubAreaList.next_time <= datetime.now(), SubAreaList.next_time == None), or_(*[ SubAreaList.area_list_id == id for id in crawl_span_dict.keys() ])).order_by(sqlalchemy.func.field(*query_sort_conds), asc(SubAreaList.next_time), func.random()).first() if not db_result: print('{}: 完成'.format(script_name)) return print('{}: 爬取區域 {} 中'.format(script_name, db_result.id)) crawler_time = datetime.now() - timedelta( minutes=datetime.now().minute % crawl_span_dict[db_result.area_list_id]) old_crawler_time = deepcopy(db_result.crawler_time) old_next_time = deepcopy(db_result.next_time) db_result.crawler_time = datetime.strptime( crawler_time.strftime('%Y-%m-%d %H:%M:00'), '%Y-%m-%d %H:%M:%S') db_result.next_time = db_result.crawler_time + timedelta( minutes=crawl_span_dict[db_result.area_list_id]) db.session.add(db_result) db.session.commit() if db_result.lu_lat==db_result.rd_lat \ or db_result.lu_lng==db_result.rd_lng: continue # ma_cookies_list = [x.cookies for x in MyshipsAccount.query.filter(MyshipsAccount.enable==1, MyshipsAccount.updating==0).all()] ma_cookies_list = [] try: area_result = mc.area_info( min([db_result.lu_lat, db_result.rd_lat]), min([db_result.lu_lng, db_result.rd_lng]), max([db_result.lu_lat, db_result.rd_lat]), max([db_result.lu_lng, db_result.rd_lng]), ma_cookies_list) except: db_result.crawler_time = old_crawler_time db_result.next_time = old_next_time db.session.add(db_result) db.session.commit() msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '{}\n{}'.format('取得區域船隻資料出現錯誤,請檢查是資策會端網路出現錯誤還是寶船網網站異常', traceback.format_exc()), ]) print(msg) ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) ggg.send_email() continue if area_result['code'] != '0': msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '\n'.join([ 
'取得寶船網區域船隻資料時出現錯誤', '{}'.format(area_result), 'id :{}'.format(db_result.id), 'area_list_id :{}'.format(db_result.area_list_id), '{}'.format({ 'age': 1440, 'rgn': mc.check_trans2myships_coord( [[ min([db_result.lu_lat, db_result.rd_lat]), min([db_result.lu_lng, db_result.rd_lng]) ], [ max([db_result.lu_lat, db_result.rd_lat]), max([db_result.lu_lng, db_result.rd_lng]) ]]) }), '{}'.format([[db_result.lu_lat, db_result.lu_lng], [db_result.rd_lat, db_result.rd_lng]]) ]) ]) print(msg) ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) ggg.send_email() continue tmp_area_data_list = area_result.pop('data') area_result['data'] = [] for area_data in tmp_area_data_list: if not area_data.get('m') \ or area_data['m']=='0': continue area_result['data'].append(area_data) # 該區域沒有任何船隻資料的話,略過 if not area_result['data']: print('{}: Skip Area {}'.format(script_name, db_result.id)) continue else: print('{}: 區域 {} 有 {} 艘船隻'.format(script_name, db_result.id, len(area_result['data']))) id_list = [] ship_data_dict = mc.ship_info( [area_data['i'] for area_data in area_result['data']]) for area_data in area_result['data']: if area_data['i'] not in ship_data_dict: print(area_data['i']) continue id_list.append('{}_{}'.format( area_data['m'], ship_data_dict[area_data['i']]['posTime'])) if id_list: es_ship_ids = set([ data['_id'] for data in es.scan( { 'query': { 'bool': { 'must': [{ 'terms': { '_id': id_list } }] } } }, app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME']) ]) else: es_ship_ids = set() for area_data in area_result['data']: if area_data['i'] not in ship_data_dict: print(area_data['i']) continue ship_data = ship_data_dict[area_data['i']] try: dictionary = { '_index': app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], '_type': '_doc', '_id': '{}_{}'.format(area_data['m'], ship_data['posTime']), '_routing': '{}'.format( (datetime.utcfromtimestamp(ship_data['posTime']) + timedelta(hours=8)).year) if ship_data['posTime'] else None, 'updatetime': ship_data['updatetime'] if ship_data.get('updatetime') else datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'eta_timestamp': ship_data['eta'], 'eta': area_data['r'], 'time': (datetime.utcfromtimestamp(ship_data['posTime']) + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'), 'y': area_data['y'] } except: msg = '\n'.join([ traceback.format_exc(), '{}'.format(area_data), '{}'.format(ship_data) ]) ggg = GmailSender( '船隻爬蟲出現錯誤-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) ggg.send_email() continue if dictionary['_id'] in es_ship_ids: continue try: dictionary['v'] = int(area_data['v']) except: dictionary['v'] = None if not dictionary['eta_timestamp']: dictionary['eta_timestamp'] = None dictionary['eta_datetime'] = None else: dictionary['eta_datetime'] = ( datetime.utcfromtimestamp(dictionary['eta_timestamp']) + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S') for source_key, new_key in format_batch_load_dict.items(): if source_key in area_data: dictionary[new_key] = area_data[source_key] elif source_key in ship_data: dictionary[new_key] = ship_data[source_key] dictionary['shipid'] = '{}'.format(dictionary['shipid']) for key, divisor in format_data_content.items(): if dictionary.get(key): dictionary[key] = round(dictionary[key] / divisor, 6) for key in list(dictionary.keys()): if type(dictionary[key]) is not str: continue dictionary[key] = dictionary[key].strip() if not dictionary[key] or dictionary[key] == 'NULL': dictionary[key] = None for key in ['navistatus', 
'rot', 'type', 'y']:
                    # The original wrote `type(dictionary[key] is not int)`,
                    # which takes the type of a boolean and is always truthy.
                    if dictionary.get(key) and type(
                            dictionary[key]) is not int:
                        dictionary[key] = int(dictionary[key])
                if dictionary['type'] not in ship_type_dict:
                    sts_db_result = ShipTypeMyships.query.filter(
                        ShipTypeMyships.type == dictionary['type']).first()
                    if sts_db_result:
                        ship_type_dict[sts_db_result.type] = sts_db_result.name
                    else:
                        ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = ship_type_dict[dictionary['type']]
                if dictionary['navistatus'] not in navistatus_type_dict:
                    nt_db_result = NavistatusTypeMyships.query.filter(
                        NavistatusTypeMyships.type ==
                        dictionary['navistatus']).first()
                    if nt_db_result:
                        navistatus_type_dict[
                            nt_db_result.type] = nt_db_result.name
                    else:
                        navistatus_type_dict[dictionary['navistatus']] = None
                dictionary['navistatus_text'] = navistatus_type_dict[
                    dictionary['navistatus']]
                batch_load_list.append(dictionary)
            if batch_load_list:
                es.batch_load(batch_load_list)
            # #############################
            # if len(batch_load_list)>2:
            #     return
            # #############################
    except:
        msg = '\n\n'.join([
            'ip: {}'.format(get_external_ip()),
            '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
            traceback.format_exc()
        ])
        print(msg)
        ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(script_name),
                          app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
        ggg.send_email()
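# #############################
# The coercion loop above casts navistatus/rot/type/y to int one key at a
# time (the original wrote `type(dictionary[key] is not int)`, which checks
# the type of a boolean and is therefore always truthy).  Below is a minimal
# helper that states the intent directly; the name coerce_int_fields is an
# assumption for illustration, not part of the original crawler.
def coerce_int_fields(dictionary, keys=('navistatus', 'rot', 'type', 'y')):
    """Cast the listed keys to int in place, skipping falsy/missing values."""
    for key in keys:
        value = dictionary.get(key)
        if value and not isinstance(value, int):
            # may still raise for garbage values such as '6857-d&0'
            dictionary[key] = int(value)
    return dictionary
# #############################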
class myships_crawler_class(): def __init__(self): self.script_name = os.path.basename(__file__) self.thread_error_count = 0 self.err_msg_list = [] self.thread_max_count = 20 self.thread_list = [] self.rollback_id_list = [] self.thread_error_count_max = self.thread_max_count * 2 self.time_dict = {'old_crawler_time': {}, 'old_next_time': {}} def myships_thread(self, db_result_dict): try: batch_load_list = [] try: area_result = self.mc.area_info( min([db_result_dict['lu_lat'], db_result_dict['rd_lat']]), min([db_result_dict['lu_lng'], db_result_dict['rd_lng']]), max([db_result_dict['lu_lat'], db_result_dict['rd_lat']]), max([db_result_dict['lu_lng'], db_result_dict['rd_lng']])) except: self.thread_error_count += 1 self.rollback_id_list.append(db_result_dict['id']) msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '{}\n{}'.format('取得區域船隻資料出現錯誤,請檢查是資策會端網路出現錯誤還是寶船網網站異常', traceback.format_exc()), ]) print(msg) self.err_msg_list.append(msg) # ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) # ggg.send_email() return if area_result['code'] != '0': self.thread_error_count += 1 msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format( datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '\n'.join([ '取得寶船網區域船隻資料時出現錯誤', '{}'.format(area_result), 'id :{}'.format(db_result_dict['id']), 'area_list_id :{}'.format( db_result_dict['area_list_id']), '{}'.format([[ db_result_dict['lu_lat'], db_result_dict['lu_lng'] ], [ db_result_dict['rd_lat'], db_result_dict['rd_lng'] ]]) ]) ]) print(msg) self.err_msg_list.append(msg) # ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg) # ggg.send_email() return tmp_area_data_list = area_result.pop('data') area_result['data'] = [] for area_data in tmp_area_data_list: if not area_data.get('m') \ or area_data['m']=='0': continue area_result['data'].append(area_data) # 該區域沒有任何船隻資料的話,略過 if not area_result['data']: print('{}: Skip Area {}'.format(self.script_name, db_result_dict['id'])) return else: print('{}: 區域 {} 有 {} 艘船隻'.format(self.script_name, db_result_dict['id'], len(area_result['data']))) id_list = [] ship_data_dict = self.mc.ship_info( [area_data['i'] for area_data in area_result['data']]) for area_data in area_result['data']: if area_data['i'] not in ship_data_dict: print(area_data['i']) continue id_list.append('{}_{}'.format(area_data['m'], area_data['t'])) if id_list: es_ship_ids = set([ data['_id'] for data in self.es.scan( { 'query': { 'bool': { 'must': [{ 'terms': { '_id': id_list } }] } } }, app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME']) ]) else: es_ship_ids = set() for area_data in area_result['data']: if area_data['i'] not in ship_data_dict: print(area_data['i']) continue # 有時候拉船隻詳細資料 posTime 會是 Null, 這時改為區域船隻資料的船隻資料時間點 ship_data = ship_data_dict[area_data['i']] try: dictionary = { '_index': app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], '_type': '_doc', '_id': '{}_{}'.format(area_data['m'], area_data['t']), '_routing': '{}'.format( (datetime.utcfromtimestamp(area_data['t']) + timedelta(hours=8)).year) if area_data['t'] else None, 'updatetime': area_data['updatetime'] if area_data.get('updatetime') else datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'eta_timestamp': ship_data['eta'], 'eta': area_data['r'], 'time': (datetime.utcfromtimestamp(area_data['t']) + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'), 'y': area_data['y'] } 
                except:
                    self.thread_error_count += 1
                    msg = '\n'.join([
                        traceback.format_exc(), '{}'.format(area_data),
                        '{}'.format(ship_data)
                    ])
                    print(msg)
                    self.err_msg_list.append(msg)
                    ggg = GmailSender(
                        '船隻爬蟲船隻資料出現異常-{}'.format(self.script_name),
                        app.config['GOOGLE_SENDER_CONF']['TO_LIST'], msg)
                    ggg.send_email()
                    continue
                if dictionary['_id'] in es_ship_ids:
                    continue
                try:
                    dictionary['v'] = int(area_data['v'])
                except:
                    dictionary['v'] = None
                if not dictionary['eta_timestamp']:
                    dictionary['eta_timestamp'] = None
                    dictionary['eta_datetime'] = None
                else:
                    dictionary['eta_datetime'] = (
                        datetime.utcfromtimestamp(dictionary['eta_timestamp'])
                        + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S')
                for source_key, new_key in format_batch_load_dict.items():
                    if source_key in area_data:
                        dictionary[new_key] = area_data[source_key]
                    elif source_key in ship_data:
                        dictionary[new_key] = ship_data[source_key]
                dictionary['shipid'] = '{}'.format(dictionary['shipid'])
                for key, divisor in format_data_content.items():
                    if dictionary.get(key):
                        dictionary[key] = round(dictionary[key] / divisor, 6)
                for key in list(dictionary.keys()):
                    if type(dictionary[key]) is not str:
                        continue
                    dictionary[key] = dictionary[key].strip()
                    if not dictionary[key] or dictionary[key] == 'NULL':
                        dictionary[key] = None
                for key in ['navistatus', 'rot', 'type', 'y']:
                    # The original wrote `type(dictionary[key] is not int)`,
                    # which takes the type of a boolean and is always truthy.
                    if dictionary.get(key) and type(
                            dictionary[key]) is not int:
                        dictionary[key] = int(dictionary[key])
                if dictionary['type'] not in self.ship_type_dict:
                    sts_db_result = ShipTypeMyships.query.filter(
                        ShipTypeMyships.type == dictionary['type']).first()
                    if sts_db_result:
                        self.ship_type_dict[
                            sts_db_result.type] = sts_db_result.name
                    else:
                        self.ship_type_dict[dictionary['type']] = None
                dictionary['type_text'] = self.ship_type_dict[
                    dictionary['type']]
                if dictionary['navistatus'] not in self.navistatus_type_dict:
                    nt_db_result = NavistatusTypeMyships.query.filter(
                        NavistatusTypeMyships.type ==
                        dictionary['navistatus']).first()
                    if nt_db_result:
                        self.navistatus_type_dict[
                            nt_db_result.type] = nt_db_result.name
                    else:
                        self.navistatus_type_dict[
                            dictionary['navistatus']] = None
                dictionary['navistatus_text'] = self.navistatus_type_dict[
                    dictionary['navistatus']]
                batch_load_list.append(dictionary)
            if batch_load_list:
                self.es.batch_load(batch_load_list)
        except:
            self.thread_error_count += 1
            msg = traceback.format_exc()
            print(msg)
            self.err_msg_list.append(msg)

    def myships_crawler_func(self):
        # Check whether the same scheduled job is already running on this
        # machine (i.e. two or more copies of this script, including this one).
        if check_same_process_still_running(self.script_name):
            print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1))
            return
        try:
            self.es = Elastic(
                host=app.config['ES_SETTING']['CONNECTION']['HOST'],
                port=app.config['ES_SETTING']['CONNECTION']['PORT'],
                username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'],
                password=app.config['ES_SETTING']['CONNECTION']['PASSWORD'])
            if not self.es.check_index_exist(
                    app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                ['INDEX_NAME']):
                print(
                    self.es.create_index(
                        app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS']
                        ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']
                        ['MYSHIPS']['MAPPING_FILEPATH']))
            db_result_list = AreaList.query.with_entities(
                AreaList.id,
                AreaList.crawl_span).order_by(asc(AreaList.crawl_span)).all()
            if not db_result_list:
                print('{}: 無區域的排程區間資料'.format(self.script_name))
                return
            self.crawl_span_dict = {
                db_result.id: db_result.crawl_span
                for db_result in db_result_list
            }
            self.query_sort_conds = [SubAreaList.area_list_id]
            self.query_sort_conds.extend([x.id for x in db_result_list])
            self.ship_type_dict = {
                x.type: x.name
                for x in
ShipTypeMyships.query.all() } self.navistatus_type_dict = { x.type: x.name for x in NavistatusTypeMyships.query.all() } self.mc = Myships_Crawler() while True: if datetime.now().minute>57 \ and datetime.now().second>30: return db.session.rollback() db.session.close() if self.thread_error_count >= self.thread_error_count_max: self.err_msg_list = list(set(self.err_msg_list)) raise Exception('\n\n'.join(self.err_msg_list)) del_index_list = [] for index, thread in enumerate(self.thread_list): if not thread.is_alive(): del_index_list.append(index) del_index_list.reverse() for index in del_index_list: del (self.thread_list[index]) if self.rollback_id_list: while self.rollback_id_list: db_result = SubAreaList.query.filter( SubAreaList.id == self.rollback_id_list[0]).first() db_result.crawler_time = self.time_dict[ 'old_crawler_time'][self.rollback_id_list[0]] db_result.next_time = self.time_dict['old_next_time'][ self.rollback_id_list[0]] db.session.add(db_result) del (self.time_dict['old_crawler_time'][ self.rollback_id_list[0]]) del (self.time_dict['old_next_time'][ self.rollback_id_list[0]]) del (self.rollback_id_list[0]) db.session.commit() db_result = SubAreaList.query.filter( SubAreaList.enable == 1, SubAreaList.web == 'myships', or_(SubAreaList.next_time <= datetime.now(), SubAreaList.next_time == None), or_(*[ SubAreaList.area_list_id == id for id in self.crawl_span_dict.keys() ])).order_by(sqlalchemy.func.field(*self.query_sort_conds), asc(SubAreaList.next_time), func.random()).first() if not db_result: if self.thread_list: for thread in self.thread_list: thread.join() continue print('{}: 完成'.format(self.script_name)) return print('{}: 爬取區域 {} 中'.format(self.script_name, db_result.id)) crawler_time = datetime.now() - timedelta( minutes=datetime.now().minute % self.crawl_span_dict[db_result.area_list_id]) self.time_dict['old_crawler_time'][db_result.id] = deepcopy( db_result.crawler_time) self.time_dict['old_next_time'][db_result.id] = deepcopy( db_result.next_time) db_result.crawler_time = datetime.strptime( crawler_time.strftime('%Y-%m-%d %H:%M:00'), '%Y-%m-%d %H:%M:%S') db_result.next_time = db_result.crawler_time + timedelta( minutes=self.crawl_span_dict[db_result.area_list_id]) db.session.add(db_result) db.session.commit() if db_result.lu_lat==db_result.rd_lat \ or db_result.lu_lng==db_result.rd_lng: continue self.mc.set_cookies_list([ x.cookies for x in MyshipsAccount.query.filter( MyshipsAccount.enable == 1, MyshipsAccount.updating == 0, MyshipsAccount.updated_time >= ( datetime.now() - timedelta(hours=1))).all() ]) thread = threading.Thread(target=self.myships_thread, args=(db_result.json(), ), daemon=True) thread.start() time.sleep(2) self.thread_list.append(thread) while [x.is_alive() for x in self.thread_list ].count(True) >= self.thread_max_count: continue if self.err_msg_list: self.err_msg_list = list(set(self.err_msg_list)) ggg = GmailSender( '船隻爬蟲 {} 執行完成,但途中有部份錯誤'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], '\n\n'.join(self.err_msg_list)) ggg.send_email() except: msg = '\n\n'.join([ 'ip: {}'.format(get_external_ip()), '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')), traceback.format_exc() ]) print(msg) self.err_msg_list.append(msg) self.err_msg_list = list(set(self.err_msg_list)) ggg = GmailSender('船隻爬蟲出現錯誤-{}'.format(self.script_name), app.config['GOOGLE_SENDER_CONF']['TO_LIST'], '\n\n'.join(self.err_msg_list)) ggg.send_email()
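# #############################
# Both copies of myships_crawler_func schedule each area by rounding "now"
# down to a multiple of its crawl_span and setting next_time one span later
# (e.g. at 14:37 with a 15-minute span the window is 14:30 and next_time is
# 14:45).  Below is a minimal self-contained sketch of that arithmetic; the
# function name crawl_window is an assumption for illustration.
from datetime import datetime, timedelta


def crawl_window(now, crawl_span_minutes):
    """Return (crawler_time, next_time) aligned to crawl_span_minutes."""
    aligned = now.replace(second=0, microsecond=0) - timedelta(
        minutes=now.minute % crawl_span_minutes)
    return aligned, aligned + timedelta(minutes=crawl_span_minutes)


# crawl_window(datetime(2021, 1, 1, 14, 37, 12), 15)
# -> (datetime(2021, 1, 1, 14, 30), datetime(2021, 1, 1, 14, 45))
# #############################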
def mygopen_crawler_func(): script_name = os.path.basename(__file__) try: es = Elastic(host=[app.config['ES_SETTING']['CONNECTION']['HOST']], port=app.config['ES_SETTING']['CONNECTION']['PORT'], username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'], password=app.config['ES_SETTING']['CONNECTION']['PASSWORD']) if not es.check_index_exist(app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['INDEX_NAME']): es.create_index(app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['MAPPING_FILEPATH']) if not es.check_index_exist(app.config['ES_SETTING']['INDEX_INFO']['MEDIA']['INDEX_NAME']): es.create_index(app.config['ES_SETTING']['INDEX_INFO']['MEDIA']['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO']['MEDIA']['MAPPING_FILEPATH']) query = { 'from' : 0, 'size' : 1, "sort" : [ { "article_publish_time" : "desc" } ] } es_dictionary_list = es.search(query, app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['INDEX_NAME'])['hits']['hits'] es_newest_article_time = None if not es_dictionary_list else datetime.strptime(es_dictionary_list[0]['_source']['article_publish_time'], '%Y-%m-%d %H:%M:%S') offset_n = 0 limit_n = 10 finish_status = False while not finish_status: print('爬取 MyGoPen Web 中, offset_n: {}'.format(offset_n)) article_list = get_MyGoPen_article_list(offset_n, limit_n) offset_n+=limit_n if not article_list: break for article_dict_index, article_dict in enumerate(article_list): print('爬取 MyGoPen Web 中,offset_n: {}, {}/{}'.format(offset_n, article_dict_index, len(article_list))) dictionary = format_api_rsp(article_dict, es) if es_newest_article_time and datetime.strptime(dictionary['article_publish_time'], '%Y-%m-%d %H:%M:%S')<=es_newest_article_time: finish_status = True break if not dictionary['article_url']: pprint(dictionary) print('無法透過API取得文章網址') continue dictionary['_type'] = '_doc' dictionary['_index'] = app.config['ES_SETTING']['INDEX_INFO']['ARTICLE']['INDEX_NAME'] m = hashlib.md5() m.update(dictionary['article_url'].encode("utf-8")) dictionary['_id'] = m.hexdigest() es.batch_load([dictionary]) except Exception as e: error_msg = '\n'.join( [ '{}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')), '{}'.format(traceback.format_exc()) ] ) print(error_msg) lnm.send_msg(error_msg) gs = GmailSender( 'MyGoPen排程出現錯誤-{}'.format(script_name), app.config['GOOGLE_SENDER_CONF']['RECEIVER_LIST'], error_msg ) gs.send_email()
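# #############################
# mygopen_crawler_func derives each article's Elasticsearch _id from the MD5
# of its URL, so re-crawling the same article overwrites the existing document
# instead of inserting a duplicate.  Below is a minimal sketch of that idiom;
# the helper name url_to_doc_id is an assumption for illustration.
import hashlib


def url_to_doc_id(article_url):
    """Stable 32-character hex document id derived from the article URL."""
    return hashlib.md5(article_url.encode('utf-8')).hexdigest()
# #############################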
class myships_crawler_class(): def __init__(self): self.machine_serial = int(os.environ.get('SERIAL', '0')) self.script_name = os.path.basename(__file__) self.err_msg_list = [] self.thread_list = [] self.thread_max_count = 100 self.err_count = 0 self.err_count_max = 200 self.no_data_count = 0 self.no_data_count_max = 10 self.machine_count = 1 self.ship_detail_dict = {} self.headers = { 'Connection': 'close', 'User-Agent': 'Mozilla/5.0 (Macintosh Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36' } self.save2es_thread = threading.Thread(target=self.save2es, daemon=True) def err_msg_generator(self, err_msg): return ('\n'.join( [datetime.now().strftime('%Y-%m-%d %H:%M:%S'), self.ip, err_msg])) def save2es(self): batch_load_list = [] for key_id in list(self.ship_detail_dict.keys()): id_list = [ '{}_{}'.format(ship_detail_dict['mmsi'], ship_detail_dict['posTime']) for ship_detail_dict in self.ship_detail_dict[key_id] ] if id_list: es_ship_ids = set([ data['_id'] for data in self.es.scan( { 'query': { 'bool': { 'must': [{ 'terms': { '_id': id_list } }] } } }, app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME']) ]) else: es_ship_ids = set() for ship_detail_dict in self.ship_detail_dict.pop(key_id): _id = '{}_{}'.format(ship_detail_dict['mmsi'], ship_detail_dict['posTime']) if _id in es_ship_ids: continue dictionary = { '_index': app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], '_type': '_doc', '_id': _id, '_routing': '{}'.format( (datetime.utcfromtimestamp(ship_detail_dict['posTime']) + timedelta(hours=8)).year), 'updatetime': ship_detail_dict['updatetime'], 'eta_timestamp': ship_detail_dict['eta'], 'time': (datetime.utcfromtimestamp(ship_detail_dict['posTime']) + timedelta(hours=8)).strftime('%Y-%m-%d %H:%M:%S'), 'callsign': ship_detail_dict['callsign'], 'nationality': self.mmsi_dict[ship_detail_dict['mmsi'][:3]] if ship_detail_dict['mmsi'][:3] in self.mmsi_dict else None, 'cog': ship_detail_dict['cog'] / 10 if ship_detail_dict['cog'] else None, 'dest': ship_detail_dict['destPort'], 'draught': ship_detail_dict['draught'] / 10 if ship_detail_dict['draught'] else None, 'hdg': ship_detail_dict['heading'], 'imo': ship_detail_dict['imo'], 'latitude': ship_detail_dict['lat'] / 600000, 'length': ship_detail_dict['length'], 'longitude': ship_detail_dict['lon'] / 600000, 'mmsi': ship_detail_dict['mmsi'], 'name': ship_detail_dict['shipnameEn'], 'navistatus': ship_detail_dict['aisNavStatus'], 'rot': ship_detail_dict['rot'], 'shipid': ship_detail_dict['shipId'], 'sog': ship_detail_dict['sog'] / 10 if ship_detail_dict['sog'] else None, 'utc_timestamp': ship_detail_dict['posTime'], 'type': ship_detail_dict['shiptype'], 'width': ship_detail_dict['breadth'], 'y': ship_detail_dict['shiptype'], 'v': ship_detail_dict['aisNavStatus'] } try: # type 有時會出現亂碼,如「6857-d&0」、「1607U No.158」 dictionary['type'] = int(dictionary['type']) except: dictionary['type'] = None try: dictionary['navistatus'] = int(dictionary['navistatus']) except: dictionary['navistatus'] = None if ship_detail_dict['eta']: eta_datetime = datetime.utcfromtimestamp( ship_detail_dict['eta']) dictionary['eta'] = eta_datetime.strftime('%m-%d %H:%M') dictionary['eta_datetime'] = eta_datetime.strftime( '%Y-%m-%d %H:%M:%S') else: dictionary['eta'] = None dictionary['eta_datetime'] = None if dictionary['type'] in self.ship_type_dict: dictionary['type_text'] = self.ship_type_dict[ dictionary['type']] if dictionary['navistatus'] in self.navistatus_type_dict: 
dictionary['navistatus_text'] = self.navistatus_type_dict[ dictionary['navistatus']] dictionary['y'] = dictionary['type'] dictionary['v'] = dictionary['navistatus'] batch_load_list.append(dictionary) try: self.es.batch_load(batch_load_list) except: self.err_count += 100 def get_ship_detail(self, shipId_list_for_func): input_json = {"shipId": ','.join(shipId_list_for_func)} try: rsp = RequestsRetryer( 'post', { 'url': app.config['CRAWLER_SETTING']['MYSHIPS']['SHIP_DETAIL'], 'headers': self.headers, 'json': input_json, 'timeout': 180, 'cookies': self.cookies_list }, req_retry_limit=3, req_retry_sleeptime=5) rsp.close() except: print(traceback.format_exc()) self.err_count += 1 self.err_msg_list.append( self.err_msg_generator(traceback.format_exc())) return if rsp.status_code != 200: print(rsp.text) self.err_count += 1 self.err_msg_list.append(self.err_msg_generator(rsp.text)) return try: rsp_result = rsp.json() except: print(traceback.format_exc()) self.err_count += 1 self.err_msg_list.append( self.err_msg_generator(traceback.format_exc())) return if rsp_result['code']!='0' \ or rsp_result['message']!='成功': pprint(rsp_result) self.err_count += 1 self.err_msg_list.append(self.err_msg_generator(rsp.text)) return key_id = f'{uuid4()}' self.ship_detail_dict[key_id] = [] for ship_detail_dict in rsp_result['data']: if not ship_detail_dict['mmsi'] \ or not ship_detail_dict['lon'] \ or not ship_detail_dict['lat'] \ or not ship_detail_dict['posTime']: continue ship_detail_dict['v'] = None ship_detail_dict['y'] = None ship_detail_dict['updatetime'] = datetime.now().strftime( '%Y-%m-%d %H:%M:%S') self.ship_detail_dict[key_id].append(ship_detail_dict) if not self.ship_detail_dict[key_id]: self.no_data_count += 1 return def myships_crawler_func(self): print(datetime.now()) try: self.ip = get_external_ip() except: self.ip = None try: # 檢查這台機器是否有同排程還在執行 if check_same_process_still_running(self.script_name): # 代表包含這個程式在內,有兩個以上相同的排程正在運行 print('{}: 有相同排程尚在執行({})'.format(self.script_name, 1)) return if not self.ip: raise (Exception('無法取得 IP')) try: rsp = requests.get( app.config['CRAWLER_SETTING']['MYSHIPS']['HOST_DOMAIN'], timeout=60) rsp.close() except: raise Exception('無法連線至寶船網網頁 :\n{}'.format( traceback.format_exc())) if rsp.status_code != 200: raise Exception('寶船網網頁無法正確連線 :\n{}'.format(rsp.text)) try: rsp = requests.get( 'http://{}:{}/'.format( 'localhost', app.config['ES_SETTING']['CONNECTION']['PORT']), auth=HTTPBasicAuth( app.config['ES_SETTING']['CONNECTION']['ACCOUNT'], app.config['ES_SETTING']['CONNECTION']['PASSWORD'])) rsp.close() except: raise Exception(traceback.format_exc()) if rsp.status_code != 200: raise Exception('無法連線至資策會 ES 主機') self.es = Elastic( host=app.config['ES_SETTING']['CONNECTION']['HOST'], port=app.config['ES_SETTING']['CONNECTION']['PORT'], username=app.config['ES_SETTING']['CONNECTION']['ACCOUNT'], password=app.config['ES_SETTING']['CONNECTION']['PASSWORD']) if not self.es.check_index_exist( app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME']): print( self.es.create_index( app.config['ES_SETTING']['INDEX_INFO']['MYSHIPS'] ['INDEX_NAME'], app.config['ES_SETTING']['INDEX_INFO'] ['MYSHIPS']['MAPPING_FILEPATH'])) self.ship_type_dict = { x.type: x.name for x in ShipTypeMyships.query.all() } self.navistatus_type_dict = { x.type: x.name for x in NavistatusTypeMyships.query.all() } self.mmsi_dict = {} for db_result in MMSI_Info.query.with_entities( MMSI_Info.mmsi, MMSI_Info.alpha_2, MMSI_Info.alpha_3).all(): self.mmsi_dict[ db_result. 
                    mmsi] = db_result.alpha_3 if db_result.alpha_3 else db_result.alpha_2
            print('帳戶檢查登入狀態中')
            account_login_timestamp = time.time()
            account_login_span = 1800
            try:
                ship_account_login_func()
                self.cookies_list = ([
                    x.cookies for x in MyshipsAccount.query.filter(
                        MyshipsAccount.enable == 1,
                        MyshipsAccount.updating == 0,
                        MyshipsAccount.updated_time >= (
                            datetime.now() - timedelta(hours=1))).all()
                ])
                if not self.cookies_list:
                    self.cookies_list.append({})
            except:
                print('帳號登入失敗')
                print(traceback.format_exc())
                self.cookies_list = [{}]
            # start_n = deepcopy(4000000+self.machine_serial)
            start_n = deepcopy(self.machine_serial)
            # start_n = 1660000
            while True:
                # Stop in the last 30 seconds of the hour; the original
                # compared `minute > 59`, which can never be true.
                if datetime.now().minute >= 59 \
                        and datetime.now().second > 30:
                    return
                print(start_n)
                if (time.time() -
                        account_login_timestamp) >= account_login_span:
                    print(
                        f'帳戶距離上次登入時間超過 {account_login_span} 秒,等待所有 Thread 結束並重新登入後,將繼續執行'
                    )
                    for thread in self.thread_list:
                        thread.join()
                    print('帳戶重新登入中')
                    account_login_timestamp = time.time()
                    try:
                        ship_account_login_func()
                        self.cookies_list = ([
                            x.cookies for x in MyshipsAccount.query.filter(
                                MyshipsAccount.enable == 1,
                                MyshipsAccount.updating == 0,
                                MyshipsAccount.updated_time >=
                                (datetime.now() - timedelta(hours=1))).all()
                        ])
                        if not self.cookies_list:
                            self.cookies_list.append({})
                    except:
                        print('帳號登入失敗')
                        print(traceback.format_exc())
                        self.cookies_list = [{}]
                end_n = start_n + 1000 * self.machine_count
                shipId_list = [
                    f'{i}' for i in range(start_n, end_n, self.machine_count)
                ]
                start_n = deepcopy(end_n)
                t1 = time.time()
                thread = threading.Thread(target=self.get_ship_detail,
                                          args=(shipId_list, ),
                                          daemon=True)
                thread.start()
                self.thread_list.append(thread)
                thread_sleep_time = 1 - (time.time() - t1)
                if thread_sleep_time > 0:
                    time.sleep(thread_sleep_time)
                # for thread in self.thread_list:
                #     thread.join()
                # pprint(self.ship_detail_dict)
                # if self.ship_detail_dict:
                #     self.save2es()
                # pprint(self.ship_detail_dict)
                # return
                while [thread.is_alive() for thread in self.thread_list
                       ].count(True) >= self.thread_max_count:
                    continue
                delete_index_list = []
                for index, thread in enumerate(self.thread_list):
                    if not thread.is_alive():
                        delete_index_list.append(index)
                delete_index_list.reverse()
                for index in delete_index_list:
                    del self.thread_list[index]
                if self.err_count >= self.err_count_max:
                    raise Exception('\n\n'.join(self.err_msg_list))
                if self.no_data_count >= self.no_data_count_max:
                    break
                if self.ship_detail_dict and not self.save2es_thread.is_alive(
                ):
                    self.save2es_thread = threading.Thread(
                        target=self.save2es, daemon=True)
                    self.save2es_thread.start()
            print('完成爬取,等待 Thread 結束')
            for thread in self.thread_list:
                thread.join()
            print('Thread 結束, 正在將最後剩餘資料存入 ES 中')
            while self.save2es_thread.is_alive():
                continue
            if self.ship_detail_dict:
                self.save2es()
            print('結束')
            print(datetime.now())
            exit()
        except:
            msg = '\n\n'.join([
                'ip: {}'.format(self.ip),
                '時間: {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S')),
                traceback.format_exc()
            ])
            print(msg)
            self.err_msg_list.append(msg)
            self.err_msg_list = list(set(self.err_msg_list))
            print('\n\n'.join(self.err_msg_list))
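# #############################
# The spawn loop above paces itself to roughly one shipId batch per second by
# sleeping for `1 - (time.time() - t1)` after each thread start.  Below is a
# minimal sketch of the same pacing as a reusable helper; the class name
# RatePacer is an assumption for illustration, not part of the crawler.
import time


class RatePacer:
    """Sleep just long enough to keep successive calls >= interval apart."""

    def __init__(self, interval=1.0):
        self.interval = interval
        self._last = None

    def wait(self):
        if self._last is not None:
            remaining = self.interval - (time.time() - self._last)
            if remaining > 0:
                time.sleep(remaining)
        self._last = time.time()


# usage sketch inside the spawn loop:
#     pacer = RatePacer(1.0)
#     pacer.wait()  # replaces the manual t1 / thread_sleep_time bookkeeping
#     thread.start()
# #############################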