Example #1
    def __init__(self):
        self.pool = redis.ConnectionPool(
            host=ConfManage.getString("REDIS_HOST"),
            port=ConfManage.getInt("REDIS_PORT"),
            db=ConfManage.getInt("REDIS_DB"),
            password=ConfManage.getString("REDIS_PASSWORD"),
            max_connections=ConfManage.getInt("REDIS_MAX_CONNECTIONS"),
            decode_responses=True,
            socket_keepalive=True)

        self.conn = redis.StrictRedis(connection_pool=self.pool,
                                      socket_connect_timeout=5)
        self.logger = Logger.get_instance(ConfManage.getString("LOG_REQ_NAME"))
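A minimal standalone sketch (not part of the original class) of how the pooled connection above might be exercised; the host, port and key name are purely illustrative.

import redis

pool = redis.ConnectionPool(host="127.0.0.1", port=6379, db=0, decode_responses=True)
conn = redis.StrictRedis(connection_pool=pool)
conn.set("healthcheck", "ok", ex=60)   # decode_responses=True means get() returns str, not bytes
assert conn.get("healthcheck") == "ok"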
Example #2
 def __init__(self,
              host=ConfManage.getString("HBASE_HOST"),
              port=ConfManage.getInt("HBASE_PORT")):
     self.timezone = ConfManage.getString("ARROW_TIMEZONE")
     self.host = host
     self.port = port
     # self.connection = Connection(host=self.host,port=self.port,table_prefix=ConfManage.getString("HBASE_PREFIX"))
     self.connPool = RConnectionPool(
         size=ConfManage.getInt("HBASE_CONN_SIZE"),
         host=self.host,
         port=self.port,
         # timeout=10,
         table_prefix=ConfManage.getString("HBASE_PREFIX"))
Example #3
def get_run_time(args_date, shift_days=0, floored=True):
    run_time = None
    if args_date is not None and len(args_date) > 0:
        run_time = arrow.get(args_date).replace(tzinfo=ConfManage.getString(
            "ARROW_TIMEZONE"))  # .shift(hours=ENV_ARROW_TZSHIFT)
    else:
        run_time = arrow.now(tz=ConfManage.getString(
            "ARROW_TIMEZONE"))  # .shift(hours=ENV_ARROW_TZSHIFT)
    if floored:
        run_time = run_time.floor('day')  # floor('day') already zeroes hour/minute/second
    run_time = run_time.shift(days=shift_days) if shift_days != 0 else run_time
    return run_time
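A brief usage sketch of get_run_time, assuming ARROW_TIMEZONE is configured; the date string is purely illustrative.

run_time = get_run_time(None)                           # start of the current day in ARROW_TIMEZONE
yesterday = get_run_time("2021-06-01", shift_days=-1)   # midnight of 2021-05-31
now_exact = get_run_time(None, floored=False)           # current timestamp, not floored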
Example #4
def init_pickle_cache():
    RELOAD_PICKLE_CACHE_KEY = ConfManage.getString("RELOAD_PICKLE_CACHE_KEY")
    logger.info('start init_pickle_cache RELOAD_PICKLE_CACHE_KEY={}'.format(
        RELOAD_PICKLE_CACHE_KEY))
    if RELOAD_PICKLE_CACHE_KEY is not None:
        result = reload_pickle_cache(RELOAD_PICKLE_CACHE_KEY)
        logger.info('init_pickle_cache result={}'.format(result))
Example #5
 def __init__(self, client_class, host=None, port=None, timeout=None):
     host = host if host is not None else ConfManage.getString(
         "THRIFT_HOST")
     port = port if port is not None else ConfManage.getInt("THRIFT_PORT")
     timeout = timeout if timeout is not None else ConfManage.getInt(
         "THRIFT_TIMEOUT")
     socket = TSocket.TSocket(host, port)
     socket.setTimeout(timeout)
     self.transport = TTransport.TBufferedTransport(socket)
     protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
     # Create a client
     self.client = client_class.Client(protocol)
Example #6
  def save_mode(self, realtime=None, postfix=None):
      if self.model is None:
          raise Exception("1602:No model available; call the train method first to produce one")
      else:
          if realtime is not None:
              estimator_key = '%s_%s_%s' % (self.estimator_name, realtime, self.target)
          else:
              estimator_key = '%s_%s' % (self.estimator_name, self.target)
          try:
              if self.estimator_name == "tf":
                  self.model.save("pickles/{app_mode}-{zone}-{estimator_key}".format(
                      app_mode=ConfManage.getString("APP_MODE"),
                      zone=ConfManage.getString("ZONE"),
                      estimator_key=estimator_key))
              else:
                  self.model.save_model(
                      "pickles/{app_mode}-{zone}-{estimator_key}".format(
                          app_mode=ConfManage.getString("APP_MODE"),
                          zone=ConfManage.getString("ZONE"),
                          estimator_key=estimator_key))
          except AttributeError:  # models without save/save_model (non-XGBoost) are persisted as a .pkl file
              save_pickle(self.model, estimator_key + (postfix or ''), using_joblib=True)  # guard against postfix=None
          logger.info('Estimator Key: {}'.format(estimator_key))
Example #7
def preprocess(date, pickle, estimator, predict_target, holdout, mode,
               shift_days):
    data = load_pickle(pickle)
    try:
        run_time = get_run_time(date)
        logger.info('Run-Time: %s' % run_time.format(loggable))
        run_time = run_time.shift(days=shift_days).ceil('day')  # ceil('day') already maxes out hour/minute/second
        start_time = run_time.shift(
            days=-ConfManage.getInt("TRAINING_INTERVAL")).floor('day')
        logger.info('Targeted Training Interval %d [%s - %s]' % \
                    (ConfManage.getInt("TRAINING_INTERVAL"), start_time.format(loggable), run_time.format(loggable)))
        logger.info('Preprocessing with Estimator %s (%s)' % (estimator, mode))
        # Import the ETA estimator class:
        module_tmp = importlib.import_module('tools.eta.{}_{}'.format(
            estimator, predict_target))
        class_tmp = getattr(
            module_tmp, '{}{}'.format(estimator.capitalize(),
                                      predict_target.capitalize()))
        estimator_obj = class_tmp()

        # Data processing (ETL)
        data = estimator_obj.etl(data)
        # Remove outliers
        data = estimator_obj.filter_data(data)
        if data is not None and 'order_time' in data.columns:
            # Select data within the training time window
            data = data.loc[(data.order_time > start_time)
                            & (data.order_time < run_time)]
            order_times = data.order_time
            interval_count = len(
                order_times.apply(
                    lambda order_time: order_time.date()).unique())
            logger.info('Available Training Interval %d/%d [%s - %s]' % (interval_count, ConfManage.getInt("TRAINING_INTERVAL"), \
                         order_times.min().format(loggable), order_times.max().format(loggable)))
            # Model training:
            estimator_obj.preprocess(data, mode, holdout)
            Logger.resource_checkpoint('post-preprocess')
        else:
            raise Exception(
                "Data not yet obtained. Please run `python collect.py` first!")
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interrupted at %s' % arrow.now(
            tz=ConfManage.getString("ARROW_TIMEZONE")).format(loggable))
    logger.info('Releasing Logger...')
    # Logger.release_instance()
    return 0
Example #8
def main():
    logger = Logger.get_instance(ConfManage.getString("LOG_CRON_NAME"))
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--date', help='Date', type=str)
    parser.add_argument('-p', '--pickle', type=str, help='Dataset pickle name', default='data')
    parser.add_argument('estimator', help='Algorithm to use', nargs='?', type=str, default='xgb')
    parser.add_argument('predict_target', help='Prediction target', nargs='?', type=str, default='accept')
    parser.add_argument('-f', '--feature-selected', help='Use feature selection', action='store_true')
    parser.add_argument('-w', '--withhold', help='Whether to save results to the BI database', action='store_true')
    parser.add_argument("-s", "--shift_days", help="Shift the run time by this many days", type=int, default=-1)
    args = parser.parse_args()
    logger.info('Arguments: estimator=%s, predict-target=%s, feature-selected=%r, withhold-bi-insertion=%r' % \
                (args.estimator, args.predict_target, args.feature_selected, args.withhold))
    try:
        process(logger, args.pickle, args.estimator, args.predict_target, args.withhold, args.date, args.shift_days)
    except TestDataEmpty:
        logger.error('Test Data Empty!')
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interrupted at %s' % arrow.now(tz=ConfManage.getString("ARROW_TIMEZONE")).format(loggable))
    logger.info('Releasing Logger...')
Example #9
    def load_model_cache(self, name='undefined', using_joblib=False):

        cache_key = 'pickle_cache_{}'.format(name)
        ret = self.cache.get(cache_key)
        if ret is None:
            logger.debug('load_pickle_cache, fetch from raw pickle')
            path = "pickles/{app_mode}-{zone}-{estmator_key}".format(app_mode=ConfManage.getString("APP_MODE"),
                                                                     zone=ConfManage.getString("ZONE"),
                                                                     estmator_key=name)
            if name[:3] == "xgb":
                ret = xgb.Booster(model_file=path)
            elif name[:2] == "tf":
                ret = tf.keras.models.load_model(path, compile=False)
                ret.compile(optimizer=self.estimator().get_optimizer(), loss=self.estimator().loss_class,  # TODO: self.estimator() is called here without prior initialisation
                            metrics=['mae', 'mse'])
        else:
            ret = load_pickle(name, using_joblib)
        if ret is not None:
            cached = self.cache.set(cache_key, ret, ConfManage.getInt("PICKLE_CACHE_EXPIRE"))
            logger.debug('load_pickle_cache, set cache, cache_key={}, status={}'.format(cache_key, cached))
        else:
            logger.debug('load_pickle_cache, fetch from cache, cache_key={}'.format(cache_key))
        return ret
Example #10
 def data_url(self,
              table,
              topic,
              start_time=None,
              end_time=None,
              columns=None,
              record_path=None,
              meta=None,
              timeout=10.,
              **kwargs):
     """
     调用API,可加table_name值查询,无法用于多条件查询
     :param table
     :param topic: string
     :param time_start: int eg:20190711000000 查询开始时间
     :param time_end: 查询终止时间
     :param kwargs: table_name值
     :return: pd.DataFrame
     """
     route = ConfManage.getString(
         "DATA_API_PREFIX") + '/' + table + '/' + topic
     for k, v in kwargs.items():
         if isinstance(v, list):
             object_id = ','.join([str(i) for i in v])
             route += '/' + object_id
         elif isinstance(v, int):
             object_id = str(v)
             route += '/' + object_id
     if start_time is not None and end_time is not None:
         params = remove_none(
             dict(time_start=dataApiTimeFmt(start_time),
                  time_end=dataApiTimeFmt(end_time)))
     else:
         params = None
     response = self.get(route=route, queries=params, timeout=timeout)
      if isinstance(response, (str, bytes)):
          response = json.loads(response)
     if response['error'] == 0:
         j2df = json_normalize(response['data'],
                               record_path=record_path,
                               meta=meta)
         if columns:
             try:
                 j2df = j2df.loc[:, columns]
             except KeyError:
                 return j2df
         return j2df
     else:
         raise DataApiException(response['err_msg'])
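A hypothetical call, assuming `api` is an instance of the client class above and that an orders/accepted endpoint exists on the Data API; the table, topic and column names are illustrative only.

df = api.data_url("orders", "accepted",
                  start_time=20190711000000,
                  end_time=20190712000000,
                  columns=["order_id", "order_time"])
print(len(df))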
Example #11
 def get_distance(self,
                  traffic,
                  starting_coordinates,
                  destination_coordinates,
                  timeout=None,
                  zone="",
                  **kwargs):
     if self.client is None:
         raise ConnectionNotEstablished(
             'OsrmApiClient is not yet initiated.')
     if not isinstance(starting_coordinates,
                       tuple) or len(starting_coordinates) != 2:
         raise ValueError(
             'Given param: `starting_coordinates` is of wrong type or not paired.'
         )
     if not isinstance(destination_coordinates,
                       tuple) or len(destination_coordinates) != 2:
         raise ValueError(
             'Given param: `destination_coordinates` is of wrong type or not paired.'
         )
     route = zone + ConfManage.getString(
         "OSRM_API_%s_ROUTE" % traffic.upper())
     route += '%.6f,%.6f;%.6f,%.6f' % ( \
         starting_coordinates[0], starting_coordinates[1],
         destination_coordinates[0], destination_coordinates[1]
     )
     if len(kwargs) > 0:
         params = kwargs
     else:
         params = None
     try:
         result = self.client.get(route, params, timeout=timeout)
     except Exception as err:
         logger.error('OsrmapiError link={}, Msg:{}'.format(
             ConfManage.getString("OSRM_API_ENDPOINT") + route, err))
         result = {"routes": [{"distance": -1}]}
     return result['routes'][0]["distance"]
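A hypothetical call, assuming osrm_api_client = OsrmApi() as in the other snippets; the coordinate pairs are illustrative, and "driving" is resolved to the OSRM_API_DRIVING_ROUTE config key by the code above.

distance = osrm_api_client.get_distance(
    "driving",            # looked up as OSRM_API_DRIVING_ROUTE
    (114.158, 22.285),    # starting coordinates (illustrative)
    (114.170, 22.300),    # destination coordinates (illustrative)
    timeout=5)
print("distance (m): %s" % distance)  # -1 when the request fails, per the fallback above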
Example #12
class EtaMetricAll(Base):
    """Object Relational Model class used to establish connection to eta_accept_metrics table"""
    db_name = ConfManage.getString("BI_MYSQL_DBNAME")
    table_name = 'eta_metrics_all'
    columns_msg = {
        'id': ['integer unsigned', 'auto_increment primary key'],
        'prediction_date': ['date', 'not null'],
        'model': ['varchar(30)', 'not null'],
        'mae': ['float unsigned', 'not null'],
        'mse': ['float unsigned', 'not null'],
        'r2': ['float unsigned', 'not null'],
        'limit_N_percent': ['float unsigned', 'not null'],
        'valid_count': ['integer unsigned', 'not null'],
        'total_count': ['integer unsigned', 'not null']
    }

    def checkBidata(self, model, prediction_date):
        sql = "SELECT * FROM {table_name} WHERE prediction_date='{prediction_date}' AND model='{model}'" \
            .format(table_name=EtaMetricAll.table_name, prediction_date=prediction_date, model=model)
        df = read_sql(sql=sql, con=self.sql_conn)
        return len(df) != 0
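A hypothetical usage sketch, assuming the Base class provides a working sql_conn and that EtaMetricAll can be instantiated without arguments; the model name and date are illustrative.

metrics = EtaMetricAll()
if not metrics.checkBidata(model='xgb_accept', prediction_date='2021-06-01'):
    print('no metrics stored yet for this model/date')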
Example #13
from tools.caster import chunks
from tools.pickler import delete_pickle, load_pickle, save_pickle
from tools.modeler import get_model
from tools.timer import get_run_time, LOGGABLE_ARROW_FORMAT as loggable
from tools.parallel import multi_thread

if sys.version_info[:2] in [(2, 6), (2, 7)]:
    reload(sys)
    sys.setdefaultencoding('utf-8')
elif sys.version_info[:2] in [(3, 6), (3, 7)]:
    # pylint: disable=E0401, E0611, E1101
    import importlib

    importlib.reload(sys)

ENV_ARROW_TIMEZONE = ConfManage.getString("ARROW_TIMEZONE")
ENV_ARROW_TZSHIFT = ConfManage.getInt("ARROW_TZSHIFT")
ENV_DATA_API_TIMERANGE = ConfManage.getInt("DATA_API_TIMERANGE")
ENV_ZONE = ConfManage.getString("ZONE")
logger = Logger.get_instance(ConfManage.getString("LOG_CRON_NAME"))
client = ApiClient()
osrm_api_client = OsrmApi()


def fetch_model_info(id_list, model_name='order', col=None, chunk_size=500):
    model = get_model(model_name)
    result = None
    id_chunks = chunks(id_list, chunk_size)
    for id_chunk in id_chunks:
        if result is None:
            result = model.fetch_in(
Example #14
class Cache:
    """缓存系统工厂类,根据环境变量CACHE_TYPE指定使用哪种缓存方式"""
    cache_type = ConfManage.getString("CACHE_TYPE")
    _instance = {}

    CACHE_TYPE_LOCAL = 'Local'

    CACHE_TYPE_REDIS = 'Redis'

    def __init__(self, cache_type=None):
        if cache_type is None:
            if Cache.cache_type == "redis":
                self.client = RedisCache()
            elif Cache.cache_type == "local":
                self.client = LocalCache()
            else:
                raise Exception(
                    '1303:CACHE_TYPE set error, must be "local" or "redis".')
        else:
            self.client = cache_type

    @staticmethod
    def get_instance(cache_type=None):
        """Get Singleton instance of Cache"""
        cache_type = cache_type if cache_type is not None else Cache.CACHE_TYPE_LOCAL
        if cache_type not in Cache._instance:
            if cache_type.capitalize() == Cache.CACHE_TYPE_LOCAL:
                instance = LocalCache()
            elif cache_type.capitalize() == Cache.CACHE_TYPE_REDIS:
                instance = RedisCache()
            else:
                raise Exception('1303:cache type error.')
            Cache._instance[cache_type] = instance
        return Cache._instance[cache_type]

    def toCache(self, cacheKey=None, age=ConfManage.getInt("CACHE_AGE")):
        def getData(func):
            def save(*args, **kwargs):
                defineKey = None
                if "cache_key" in kwargs:
                    defineKey = kwargs["cache_key"]
                actucal_key = defineKey if defineKey else cacheKey
                retry = 4
                while True:
                    data = self.client.get(actucal_key)
                    if data is None:
                        if self.client.set_mutex(actucal_key, 2):
                            try:
                                data = func(*args, **kwargs).to_json()
                                self.client.set(actucal_key, data, age)
                                self.client.delete(actucal_key + "_mutex")
                            except Exception:
                                self.client.delete(actucal_key + "_mutex")
                                raise
                            break
                        else:
                            time.sleep(0.5)
                            retry -= 1
                            if retry == 0:
                                logger.error(
                                    "Cache msg: Get cache data fail while retry 4 times"
                                )
                                raise Exception(
                                    "1302:Get cache data fail while retry 4 times"
                                )
                    else:
                        extime = self.client.ttl(actucal_key)
                        if extime <= 8:
                            if self.client.set_mutex(actucal_key, 2):
                                try:
                                    data = func(*args, **kwargs).to_json()
                                    self.client.set(actucal_key, data, age)
                                    self.client.delete(actucal_key + "_mutex")
                                except Exception:
                                    logger.error(
                                        "Cache msg:get {} failed, return old date"
                                        .format(kwargs["topic"]))
                                    self.client.delete(actucal_key + "_mutex")
                                    return data
                                break
                        else:
                            break
                return pandas.read_json(data)

            return save

        return getData
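A hypothetical usage sketch of the toCache decorator: the decorated function must return an object with a .to_json() method (for example a pandas DataFrame), because the wrapper stores JSON and reads it back with pandas.read_json(); the function and key names are illustrative.

import pandas

cache = Cache()  # backend selected from the CACHE_TYPE configuration ("local" or "redis")

@cache.toCache(cacheKey="orders_df")
def fetch_orders():
    # stand-in for a real data fetch; must return something exposing .to_json()
    return pandas.DataFrame({"order_id": [1, 2, 3]})

df = fetch_orders()  # first call computes and caches; later calls return the cached frame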
Example #15
def main():
    """Obtain Information from Data-API and MySQL Database"""
    logger = Logger.get_instance(ConfManage.getString("LOG_CRON_NAME"))
    Logger.resource_checkpoint('init')
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--clear',
                        help='Clear previously saved pickles.',
                        action='store_true')
    parser.add_argument('-r',
                        '--reverse',
                        help='Whether to clear previous data.',
                        action='store_true')
    parser.add_argument('-d',
                        '--date',
                        help='Date used for calculation.',
                        type=str)
    parser.add_argument('-p',
                        '--pickle',
                        type=str,
                        default='stream_data',
                        help='Pickle name for saving latest data-collection.')
    parser.add_argument('-u',
                        '--updata',
                        type=bool,
                        help='Merge data with new feature to data.pkl.',
                        default=False)
    parser.add_argument('-f',
                        '--funtion',
                        help='Update new feature from the given function.')
    parser.add_argument(
        '-m',
        '--merge_on',
        help='Field names to join on. Must be found in both DataFrames.')
    args = parser.parse_args()

    # Update with new data:
    if args.updata:
        data = load_pickle(args.pickle)
        if args.funtion is not None and args.merge_on is not None:
            update_data(logger, args.funtion, data, args.merge_on)
        else:
            logger.error(
                'funtion and merge_on are None; please provide the corresponding parameters'
            )
        return
    # Clear all sub-pickles:
    if args.clear:
        clear_pickles(logger)
        return

    is_reverse = args.reverse  # collect data forwards or backwards (a store_true flag is never None)
    # Work out the time window
    run_time = get_run_time(None, 0, False)
    logger.info('Run-Time: %s' % run_time)
    collect_date = None if args.date is None else get_run_time(args.date)
    logger.info('Collect-Date: %s' % collect_date)
    end_time = run_time.shift(days=-1).ceil('day')
    logger.info('End-Time: %s' % end_time)
    # Load existing data
    pickled = load_pickle(args.pickle)
    collected_count = 0
    if pickled is not None and isinstance(
            pickled, pd.DataFrame) and 'order_time' in pickled.columns:
        order_times = pickled['order_time']
        del pickled  # Release pickle
        collected_count = len(
            order_times.apply(lambda order_time: order_time.date()).unique())
        collected_start_time = order_times.min()
        logger.info('Min collected order_time Date: %s' %
                    collected_start_time.format(loggable))
        collected_end_time = order_times.max()
        logger.info('Max collected order_time Date: %s' %
                    collected_end_time.format(loggable))

        if collect_date is not None:
            if collect_date > end_time:
                logger.warning(
                    'collect_date cannot be greater than end_time: {} > {}'.
                    format(collect_date.format(loggable),
                           end_time.format(loggable)))
                return
            if collect_date < collected_start_time.floor('day'):
                start_time = collect_date.floor('day')
                end_time = collected_start_time.shift(days=-1).ceil('day')
            elif collected_start_time.floor(
                    'day') <= collect_date <= collected_end_time.ceil('day'):
                trim_data(logger, collect_date, is_reverse)
                if is_reverse:
                    start_time = collected_start_time.floor('day')
                    end_time = collect_date.ceil('day')
                else:
                    start_time = collect_date.floor('day')
                    end_time = collected_end_time.ceil('day')
            elif collect_date > collected_end_time.ceil('day'):
                start_time = collected_end_time.shift(days=1).floor('day')
                end_time = collect_date.ceil('day')
            else:
                logger.warning('collect_date invalid: {}'.format(collect_date))
                return
        else:
            if collected_end_time >= end_time:
                logger.info('Targeted Run-Time already in Collection-Interval')
                return
            else:
                start_time = collected_end_time.shift(days=1).floor('day')

        gap = start_time.shift(days=-1).date() - end_time.date()
        gap_days = gap.days

    else:
        logger.info('Data empty!')
        gap_days = -ConfManage.getInt("COLLECTION_GAP")
        start_time = end_time.shift(days=gap_days + 1).floor('day')

    logger.info('Total Collection Interval: %d/%d [%s - %s]' %
                (collected_count, ConfManage.getInt("COLLECTION_INTERVAL"),
                 start_time.format(loggable), end_time.format(loggable)))

    if gap_days >= 0:
        logger.info('Targeted Run-Time already in Collection-Interval')
        return

    logger.info('Gap: %d' % gap_days)
    logger.info(
        'Gap Interval: %d [%s - %s]' %
        (gap_days, start_time.format(loggable), end_time.format(loggable)))
    try:
        # Collect day by day when more than one day of data is missing
        for i in range(-gap_days, 0, -1):
            end_time = start_time.ceil('day')
            logger.info('Collecting data in [{} - {}]'.format(
                start_time.format(loggable), end_time.format(loggable)))
            collect(logger, start_time, end_time, args.pickle)
            logger.info('Success collect data in [{} - {}] \n\n'.format(
                start_time.format(loggable), end_time.format(loggable)))
            start_time = start_time.shift(days=1)
        trim_outdated(logger, run_time, args.pickle)  # without the env variable, keeps only the most recent 30 days of data
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interrupted at {}'.format(arrow.utcnow()))
    logger.info('Releasing Logger...')
    return 0
Example #16
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Collect real-time data
import argparse
import arrow
import pandas as pd
import traceback
from client.api_client import ApiClient
from configs.ConfManage import ConfManage
from tools.logger import Logger
from tools.pickler import delete_pickle, load_pickle, save_pickle
from tools.timer import get_run_time, LOGGABLE_ARROW_FORMAT as loggable

client = ApiClient()

ENV_ARROW_TIMEZONE = ConfManage.getString("ARROW_TIMEZONE")
ENV_ARROW_TZSHIFT = ConfManage.getInt("ARROW_TZSHIFT")
ENV_DATA_API_TIMERANGE = ConfManage.getInt("DATA_API_TIMERANGE")
ENV_ZONE = ConfManage.getString("ZONE")


def collect_batch_data(start_time, end_time, table, topic, columns=None):
    st = start_time
    et = end_time
    data_df = pd.DataFrame()
    hours_interval = int(
        24 /
        ENV_DATA_API_TIMERANGE) if 24 % ENV_DATA_API_TIMERANGE == 0 else 12
    while st < et:
        snt = st.shift(hours=hours_interval)
        data_df = data_df.append(
Example #17
class ApiClient(object):
    """
    Fetch data via HBase or via the URL (Data) API, depending on configuration.
    """
    def __init__(self):
        self.conf = ConfManage()
        if self.conf.getString("API_TYPE") == "URL":
            self.client = self.dataapi_client()
        else:
            self.client = self.hbase_client()

    @staticmethod
    def client():
        if ConfManage.getString("API_TYPE") == "URL":
            client = ApiClient.dataapi_client()
        else:
            client = ApiClient.hbase_client()
        return client

    @staticmethod
    def hbase_client():
        return HbaseClient()

    @staticmethod
    def dataapi_client():
        return SimpleHttpClient()

    def set_client(self, client):
        self.client = client
        return self.client

    def get_client(self):
        return self.client

    def get_data(self, **kwargs):
        data = self.client.get_data(**kwargs)
        return data

    def get_cache_data(self, key, **kwargs):
        """
        Fetch cached data.
        Args:
            key: cache key
            **kwargs:

        Returns: pd.DataFrame

        """
        update_cache_time = self.conf.getInt("CACHE_UPDATE")
        retry = 3
        while True:
            data = cache.get(key)
            if data is None:
                if cache.set_mutex(key, 2):
                    try:
                        data = self.get_data(**kwargs).to_json()
                        cache.set(key, data)
                        cache.delete(key + "_mutex")
                    except Exception:
                        cache.delete(key + "_mutex")
                        raise
                    break
                else:
                    time.sleep(1)
                    retry -= 1
                    if retry == 0:
                        logger.error("Cache msg: Get cache data fail while retry three times, key: {}".format(key))
                        raise Exception("1302:Get cache data fail while retry three times, key: {}".format(key))
            else:
                extime = cache.ttl(key)
                if extime <= update_cache_time:
                    if cache.set_mutex(key, 2):
                        try:
                            data = self.get_data(**kwargs).to_json()
                            cache.set(key, data)
                            cache.delete(key + "_mutex")
                        except Exception:
                            logger.error("Cache msg:get {} failed, return old date".format(key))
                            cache.delete(key + "_mutex")
                            return data
                        break
                else:
                    break
        return read_json(data)

    def collect_batch_data(self, start_time, end_time, table, topic, columns=None, record_path=None, meta=None):
        st = start_time
        et = end_time
        data_df = pd.DataFrame()
        hours_interval = int(24 / ENV_DATA_API_TIMERANGE) if 24 % ENV_DATA_API_TIMERANGE == 0 else 12
        while st < et:
            snt = st.shift(hours=hours_interval)
            data_df = data_df.append(
                self.get_data(table=table, topic=topic, start_time=st.shift(seconds=1), end_time=snt, columns=columns,
                                record_path=record_path, meta=meta))
            st = snt
        return data_df

    def fetch_data(self, start_time, end_time, table, topic, columns=None, record_path=None, meta=None,
                   save_file_prefix="order_accept_"):
        df = load_pickle(save_file_prefix + topic)
        if df is None:
            df = self.collect_batch_data(start_time=start_time, end_time=end_time, table=table, topic=topic,
                                         columns=columns, record_path=record_path, meta=meta)
            save_pickle(df, save_file_prefix + topic)
        else:
            if len(df) == 0:
                df = self.collect_batch_data(start_time=start_time, end_time=end_time, table=table, topic=topic,
                                        columns=columns, record_path=record_path, meta=meta)
                save_pickle(df, save_file_prefix + topic)
        logger.info('Fetch %s (Count): %d' % (topic, len(df)))
        return df
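A hypothetical call chaining the pieces above; the table and topic names are illustrative, and get_run_time is assumed to come from tools.timer as in the other snippets.

api = ApiClient()
end_time = get_run_time(None).shift(days=-1).ceil('day')
start_time = end_time.shift(days=-7).floor('day')
orders = api.fetch_data(start_time, end_time, table="orders", topic="accepted",
                        columns=["order_id", "order_time"])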
Example #18
# coding=utf-8

from thrift import Thrift
from thrift.protocol import TBinaryProtocol
from thrift.Thrift import TApplicationException
from thrift.transport import TSocket, TTransport
from configs.ConfManage import ConfManage
from tools.logger import Logger

logger = Logger.get_instance(ConfManage.getString("LOG_BASE_NAME"))


class ThriftClient:
    '''thrift client'''
    def __init__(self, client_class, host=None, port=None, timeout=None):
        host = host if host is not None else ConfManage.getString(
            "THRIFT_HOST")
        port = port if port is not None else ConfManage.getInt("THRIFT_PORT")
        timeout = timeout if timeout is not None else ConfManage.getInt(
            "THRIFT_TIMEOUT")
        socket = TSocket.TSocket(host, port)
        socket.setTimeout(timeout)
        self.transport = TTransport.TBufferedTransport(socket)
        protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
        # Create a client
        self.client = client_class.Client(protocol)

    def close(self):
        if self.transport.isOpen():
            self.transport.close()
            # logger.info('thrift transport IS CLOSED!')
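A hypothetical usage sketch; EtaService stands in for any Thrift-generated module exposing a Client class (it is not part of this codebase), and host/port/timeout fall back to the THRIFT_* configuration values.

client = ThriftClient(EtaService)  # EtaService is an illustrative generated module
client.transport.open()            # the constructor builds, but does not open, the transport
try:
    result = client.client.ping()  # ping() is an illustrative service method
finally:
    client.close()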
Example #19
 def __init__(self, log_name):
     dictConfig({
         'version': 1,
         'disable_existing_loggers': True,
          # Log formatters
         'formatters': {
             'verbose': {
                 'format':
                 "[%(asctime)s][%(filename)s][%(levelname)s]: %(message)s",
                 'datefmt': "%Y-%m-%d %H:%M:%S"
             },
             'simple': {
                 'format': '%(levelname)s %(message)s'
             },
         },
         'handlers': {
             'null': {
                 'level': ConfManage.getInt("LOG_LEVEL"),
                 'class': 'logging.FileHandler',
                 'filename': 'logs/{}.log'.format(log_name),
             },
             'console': {
                 'level': ConfManage.getInt("LOG_LEVEL"),
                 'class': 'logging.StreamHandler',
                 'formatter': 'verbose',
             },
             'thread': {
                 'level': ConfManage.getInt("LOG_LEVEL"),
                 'class': 'logging.handlers.TimedRotatingFileHandler',
                 'when': "D",
                  # keep at most 10 backup files
                 'backupCount': 10,
                 # If delay is true,
                 # then file opening is deferred until the first call to emit().
                 'delay': True,
                 'formatter': 'verbose',
                 'filename': 'logs/{}.log'.format(log_name),
             },
             'process': {
                 'level': ConfManage.getInt("LOG_LEVEL"),
                  # without a concurrency-safe handler, log records can be lost when multiple instances write to the same file
                 'class': 'cloghandler.ConcurrentRotatingFileHandler',
                  # rotate the log when it reaches maxBytes (1024**3 bytes here, i.e. 1 GiB)
                 'maxBytes': 1024 * 1024 * 1024,
                  # keep at most 10 backup files
                 'backupCount': 10,
                 'delay': True,
                 'formatter': 'verbose',
                 'filename': 'logs/{}.log'.format(log_name),
             },
         },
         'loggers': {
             ConfManage.getString("LOG_BASE_NAME"): {
                 'handlers': ['process'],
                 'level': ConfManage.getInt("LOG_LEVEL"),
             },
             ConfManage.getString("LOG_CRON_NAME"): {
                 'handlers': ['thread'],
                 'level': ConfManage.getInt("LOG_LEVEL"),
             },
             ConfManage.getString("LOG_REQ_NAME"): {
                 'handlers': ['process'],
                 'level': ConfManage.getInt("LOG_LEVEL"),
             },
         }
     })
     self.logger = logging.getLogger(log_name)
     streamHandler = StreamHandler(sys.stdout)
     self.logger.addHandler(streamHandler)
Example #20
# AUTHOR: 	[email protected]
# DESCRIPTION:
#
# HISTORY:
# *************************************************************
"""Module definining useful pickle-related methods"""
import os
import sys
from pickle import dump, load, PicklingError, UnpicklingError, HIGHEST_PROTOCOL
from configs.ConfManage import ConfManage
from tools.logger import Logger
from tools.cache import Cache
from sklearn.externals import joblib
from tools.po_cache_ret import POCacheRet

logger = Logger.get_instance(ConfManage.getString("LOG_BASE_NAME"))
PICKLE_FOLDER = ConfManage.getString("PICKLE_FOLDER")
cache = Cache.get_instance()
pickle_prefix = '%s-%s-' % (ConfManage.getString("APP_MODE"),
                            ConfManage.getString("ZONE"))

if sys.version_info[:2] in [(2, 6), (2, 7)]:
    reload(sys)
    sys.setdefaultencoding('utf-8')
elif sys.version_info[:2] in [(3, 6), (3, 7)]:
    # pylint: disable=E0401, E0611, E1101
    import importlib

    importlib.reload(sys)

Example #21
 def __init__(self):
     self.logger = Logger.get_instance(
         ConfManage.getString("LOG_BASE_NAME"))
     self.cache = Cache().client
     self.cache_expire = ConfManage.getInt("OSRM_CACHE_EXPIRE")
Example #22
 def client():
     if ConfManage.getString("API_TYPE") == "URL":
         client = ApiClient.dataapi_client()
     else:
         client = ApiClient.hbase_client()
     return client
Example #23
from models_bi.eta_metric_all import EtaMetricAll
from configs import conf
from tools.logger import Logger
from configs.ConfManage import ConfManage
import os
import arrow
from core.process import process
from argparse import ArgumentParser
from core.stream_collect import main
logger = Logger.get_instance(ConfManage.getString("LOG_CRON_NAME"))
ENV_APP_MODE = ConfManage.getString("APP_MODE")
ENV_ZONE = ConfManage.getString("ZONE")

def check_collect():
    """检查collect是否采集"""
    pickle_prefix = os.getenv('ENV_PICKLE_PREFIX', '')
    if pickle_prefix == '' and ENV_APP_MODE != 'release':
        pickle_prefix = '%s-%s-' % (ENV_APP_MODE, ENV_ZONE)
    filename = '%sdata.pkl' % (pickle_prefix)
    file = './pickles/%s' % (filename)
    collect_ctimes = 3
    while collect_ctimes > 0:
        try:
            datacreattime = arrow.get(os.path.getctime(file)).date()
        except FileNotFoundError as err:
            logger.info(err)
            main()
            datacreattime = arrow.get(os.path.getctime(file)).date()
        today = arrow.now().date()
        if datacreattime != today:
Example #24
 def __init__(self):
     self.client = SimpleHttpClient(
         "http://" + ConfManage.getString("OSRM_API_ENDPOINT"))
     self.S2_cache = S2()