예제 #1
0
def run():
    """Bootstrap and serve the Thrift RPC endpoint with a process pool."""
    # Wire the request handler into a Thrift processor.
    service_handler = EstimateTaskDurationsHandler()
    # service_handler.settimeout = ConfManage.getInt("THRIFT_TIMEOUT")
    service_processor = RProcessor(service_handler)
    listen_socket = TSocket.TServerSocket(port=ConfManage.getInt("THRIFT_PORT"))
    transport_factory = TTransport.TBufferedTransportFactory()
    protocol_factory = TBinaryProtocol.TBinaryProtocolFactory()
    # Assemble the pooled server.
    server = RTProcessPoolServer(service_processor, listen_socket,
                                 transport_factory, protocol_factory)
    print('Starting Thrift {} at port: {}'.format(server.__class__.__name__, ConfManage.getInt("THRIFT_PORT")))
    server.setClient()
    server.setNumThreads(ConfManage.getInt("THRIFT_THREAD_COUNT"))
    server.setNumWorkers(ConfManage.getInt("PROCESS_NUM"))

    def _graceful_stop(signum, frame):
        # Kill the worker processes first, then ask the serve loop to exit.
        for worker in server.workers:
            requirements_logger.info('Terminating worker: %s' % worker)
            worker.terminate()
        requirements_logger.info('Requesting server to stop()')
        try:
            server.stop()
        except Exception:
            pass

    # Arm a one-shot alarm that drives the graceful-shutdown path.
    signal.signal(signal.SIGALRM, _graceful_stop)
    signal.alarm(4)
    # Reap zombie child processes automatically.
    signal.signal(signal.SIGCHLD, signal.SIG_IGN)
    server.serve()
예제 #2
0
 def __init__(self, client_class, host=None, port=None, timeout=None):
     """Open a buffered binary Thrift transport and build a service client.

     Any of host/port/timeout left as None falls back to the
     THRIFT_HOST / THRIFT_PORT / THRIFT_TIMEOUT configuration values.
     """
     if host is None:
         host = ConfManage.getString("THRIFT_HOST")
     if port is None:
         port = ConfManage.getInt("THRIFT_PORT")
     if timeout is None:
         timeout = ConfManage.getInt("THRIFT_TIMEOUT")
     sock = TSocket.TSocket(host, port)
     sock.setTimeout(timeout)
     self.transport = TTransport.TBufferedTransport(sock)
     # Create a client bound to the binary protocol over the buffered transport.
     protocol = TBinaryProtocol.TBinaryProtocol(self.transport)
     self.client = client_class.Client(protocol)
예제 #3
0
    def __init__(self):
        """Create a shared Redis connection pool and a client bound to it.

        Fix: connection-level options must be given to the ConnectionPool.
        Keyword arguments such as socket_connect_timeout passed to
        StrictRedis are silently ignored whenever an explicit
        connection_pool is supplied, so the timeout never took effect.
        """
        self.pool = redis.ConnectionPool(
            host=ConfManage.getString("REDIS_HOST"),
            port=ConfManage.getInt("REDIS_PORT"),
            db=ConfManage.getInt("REDIS_DB"),
            password=ConfManage.getString("REDIS_PASSWORD"),
            max_connections=ConfManage.getInt("REDIS_MAX_CONNECTIONS"),
            decode_responses=True,
            socket_keepalive=True,
            # Moved from the StrictRedis() call: it only applies on the pool.
            socket_connect_timeout=5)

        self.conn = redis.StrictRedis(connection_pool=self.pool)
        self.logger = Logger.get_instance(ConfManage.getString("LOG_REQ_NAME"))
예제 #4
0
 def __init__(self, host=None, port=None):
     """Set up an HBase connection pool.

     host/port default to the HBASE_HOST / HBASE_PORT configuration
     values, resolved at call time.  The previous defaults called
     ConfManage at function-definition (module-import) time, freezing
     whatever the config held at import; this matches the None-sentinel
     pattern used by the Thrift client wrapper.
     """
     host = host if host is not None else ConfManage.getString("HBASE_HOST")
     port = port if port is not None else ConfManage.getInt("HBASE_PORT")
     self.timezone = ConfManage.getString("ARROW_TIMEZONE")
     self.host = host
     self.port = port
     # self.connection = Connection(host=self.host,port=self.port,table_prefix=ConfManage.getString("HBASE_PREFIX"))
     self.connPool = RConnectionPool(
         size=ConfManage.getInt("HBASE_CONN_SIZE"),
         host=self.host,
         port=self.port,
         # timeout=10,
         table_prefix=ConfManage.getString("HBASE_PREFIX"))
예제 #5
0
def preprocess(date, pickle, estimator, predict_target, holdout, mode,
               shift_days):
    """Train an ETA estimator on a time-windowed slice of pickled data.

    Args:
        date: reference date string used to resolve the run time.
        pickle: name of the pickle holding previously collected data.
        estimator: estimator key (e.g. "xgb"); selects the module
            tools.eta.<estimator>_<predict_target>.
        predict_target: prediction target key (e.g. "accept").
        holdout: whether to hold out part of the data for validation.
        mode: training mode label, forwarded to the estimator.
        shift_days: number of days to shift the run time before windowing.

    Returns:
        int: always 0; failures are logged instead of propagated.
    """
    data = load_pickle(pickle)
    try:
        run_time = get_run_time(date)
        logger.info('Run-Time: %s' % run_time.format(loggable))
        # Snap the window end to the last second of the shifted day.
        run_time = run_time.shift(days=shift_days).ceil('day').ceil(
            'hour').ceil('minute').ceil('second')
        # Window start: TRAINING_INTERVAL days back, snapped to day start.
        start_time = run_time.shift(
            days=-ConfManage.getInt("TRAINING_INTERVAL")).floor('day').floor(
                'hour').floor('minute').floor('second')
        logger.info('Targeted Training Interval %d [%s - %s]' % \
                    (ConfManage.getInt("TRAINING_INTERVAL"), start_time.format(loggable), run_time.format(loggable)))
        logger.info('Preprocessing with Estimator %s (%s)' % (estimator, mode))
        # Import the eta estimator class dynamically,
        # e.g. tools.eta.xgb_accept -> XgbAccept.
        module_tmp = importlib.import_module('tools.eta.{}_{}'.format(
            estimator, predict_target))
        class_tmp = getattr(
            module_tmp, '{}{}'.format(estimator.capitalize(),
                                      predict_target.capitalize()))
        estimator_obj = class_tmp()

        # Transform the raw data.
        data = estimator_obj.etl(data)
        # Drop outliers.
        data = estimator_obj.filter_data(data)
        # NOTE(review): guards on a 'time' column but filters on order_time —
        # confirm both columns are guaranteed by etl()/filter_data().
        if data is not None and 'time' in data.columns:
            # Keep only rows inside the training window.
            data = data.loc[(data.order_time > start_time)
                            & (data.order_time < run_time)]
            order_times = data.order_time
            # Count the distinct calendar days actually present.
            interval_count = len(
                order_times.apply(
                    lambda order_time: order_time.date()).unique())
            logger.info('Available Training Interval %d/%d [%s - %s]' % (interval_count, ConfManage.getInt("TRAINING_INTERVAL"), \
                         order_times.min().format(loggable), order_times.max().format(loggable)))
            # Train the model.
            estimator_obj.preprocess(data, mode, holdout)
            Logger.resource_checkpoint('post-preprocess')
        else:
            raise Exception(
                "Data not yet obtained. Please run `python collect.py` first!")
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interupted at %s' % arrow.now(
            tz=ConfManage.getString("ARROW_TIMEZONE")).format(loggable))
    logger.info('Releasing Logger...')
    # Logger.release_instance()
    return 0
예제 #6
0
 def __init__(self, end_point, use_ssl=False):
     """Create a pooled requests session for the given endpoint.

     Mounts one HTTPAdapter on the scheme selected by use_ssl, sized
     from the HTTP_MAX_CONNECTIONS configuration value.
     """
     self.end_point = end_point
     self.conn = requests.Session()
     scheme = "https://" if use_ssl else "http://"
     self.conn.mount(
         scheme,
         HTTPAdapter(
             pool_connections=os.cpu_count() - 1,
             pool_maxsize=ConfManage.getInt("HTTP_MAX_CONNECTIONS"),
             max_retries=3))
예제 #7
0
 def set(self, key, data, age=ConfManage.getInt("CACHE_AGE")):
     """Store data under key; age is a TTL in seconds, -1 means no expiry."""
     with self.lock:
         self.mem[key] = data
         # -1 marks the entry as immortal; otherwise jitter the expiry a
         # little so keys written together do not all lapse at once.
         self.time[key] = -1 if age == -1 else (
             time.time() + age + randrange(start=0, stop=10, step=1))
         return True
예제 #8
0
    def __init__(self):
        """Build a pooled SQLAlchemy engine for this model's MySQL database."""
        sql_config = {
            'host': conf.MYSQL_HOST, 'port': conf.MYSQL_PORT,
            'user': conf.MYSQL_USER, 'pwd': conf.MYSQL_PASS,
            'dbname': self.__class__.db_name
        }
        url = "mysql+pymysql://{user}:{pwd}@{host}:{port}/{dbname}?charset=utf8".format(**sql_config)
        # Bounded queue pool: no overflow beyond the configured size, and
        # recycle connections before MySQL's idle timeout can drop them.
        self.sql_pool = create_engine(
            url,
            max_overflow=0,
            pool_size=ConfManage.getInt("MYSQL_DB_CONNECTIONS"),
            poolclass=pool.QueuePool,
            pool_recycle=450)
예제 #9
0
def trim_outdated(logger, run_time, pickle_name):
    """Drop rows older than the collection interval from a pickled frame."""
    frame = load_pickle(pickle_name)
    if frame is None:
        return
    # Everything before this cutoff (snapped to end of day) is discarded.
    cutoff = run_time.shift(days=-ConfManage.getInt("COLLECTION_INTERVAL"))
    frame = frame.loc[frame['order_time'] > cutoff.ceil('day')]
    if save_pickle(frame, pickle_name):
        logger.info('Successfully Trimmed outdated! [ {} - {} ]'.format(
            cutoff.shift(days=1).floor('day').format(loggable),
            run_time.format(loggable)))
예제 #10
0
    def toCache(self, cacheKey=None, age=ConfManage.getInt("CACHE_AGE")):
        """Decorator factory: cache a DataFrame-returning function as JSON.

        The wrapped function's result (.to_json()) is stored under
        cacheKey — or kwargs["cache_key"] when the caller supplies one —
        with TTL `age`.  A "<key>_mutex" entry serialises regeneration so
        only one caller rebuilds the value at a time.
        """
        def getData(func):
            def save(*args, **kwargs):
                defineKey = None
                if "cache_key" in kwargs:
                    defineKey = kwargs["cache_key"]
                # A caller-supplied key wins over the decorator's default.
                actucal_key = defineKey if defineKey else cacheKey
                retry = 4
                while True:
                    data = self.client.get(actucal_key)
                    if data is None:
                        # Cache miss: take the mutex and rebuild the value.
                        if self.client.set_mutex(actucal_key, 2):
                            try:
                                data = func(*args, **kwargs).to_json()
                                self.client.set(actucal_key, data, age)
                                self.client.delete(actucal_key + "_mutex")
                            except Exception:
                                # Always release the mutex before re-raising.
                                self.client.delete(actucal_key + "_mutex")
                                raise
                            break
                        else:
                            # Someone else is rebuilding; wait, then retry.
                            time.sleep(0.5)
                            retry -= 1
                            if retry == 0:
                                logger.error(
                                    "Cache msg: Get cache data fail while retry 4 times"
                                )
                                raise Exception(
                                    "1302:Get cache data fail while retry 4 times"
                                )
                    else:
                        # Cache hit: refresh proactively when close to expiry.
                        extime = self.client.ttl(actucal_key)
                        if extime <= 8:
                            if self.client.set_mutex(actucal_key, 2):
                                try:
                                    data = func(*args, **kwargs).to_json()
                                    self.client.set(actucal_key, data, age)
                                    self.client.delete(actucal_key + "_mutex")
                                except Exception:
                                    # Refresh failed: serve the stale value.
                                    # NOTE(review): assumes kwargs contains
                                    # "topic"; otherwise this log line raises
                                    # KeyError — confirm callers always pass it.
                                    logger.error(
                                        "Cache msg:get {} failed, return old date"
                                        .format(kwargs["topic"]))
                                    self.client.delete(actucal_key + "_mutex")
                                    return data
                                break
                            # NOTE(review): when the mutex is held here the
                            # loop retries immediately with no sleep and no
                            # retry decrement — confirm this busy-wait is
                            # intended.
                        else:
                            break
                return pandas.read_json(data)

            return save

        return getData
예제 #11
0
def multi_process(functions, args=(), kwds=None, processnum=None):
    """Run functions concurrently in a process pool and wait for completion.

    Args:
        functions: sequence of picklable callables.
        args: optional sequence of positional-argument tuples, one per
            function; empty means no positional args for any of them.
        kwds: optional sequence of keyword-argument dicts, one per function.
            (Previously a mutable default list, shared across calls.)
        processnum: pool size; defaults to the PROCESS_NUM config value.

    Returns:
        list of AsyncResult objects in call order; call .get() on each
        to obtain the corresponding function's return value.
    """
    # Resolve the default per call to avoid the shared-mutable-default pitfall.
    kwds = [] if kwds is None else kwds
    processnum = ConfManage.getInt("PROCESS_NUM") if processnum is None else processnum
    pool = Pool(processnum)
    results = []
    for i in range(len(functions)):
        arg = () if len(args) == 0 else args[i]
        kwd = {} if len(kwds) == 0 else kwds[i]
        results.append(pool.apply_async(functions[i], args=arg, kwds=kwd))
    pool.close()
    pool.join()
    return results
예제 #12
0
 def lnglat_to_cellid(self, longitude, latitude):
     """Convert a lng/lat coordinate to an S2 cell id at the configured level.

     Raises:
         ValueError: if latitude is outside [-90, 90], longitude is
             outside [-180, 180], or S2_LEVEL exceeds the S2 maximum of 30.
     """
     LEVEL = ConfManage.getInt("S2_LEVEL")
     if latitude > 90 or latitude < -90:
         raise ValueError('4002:latitude out of range (-90,90)')
     elif longitude > 180 or longitude < -180:
         # Fixed: this message previously (wrongly) said "latitude".
         raise ValueError('4002:longitude out of range (-180,180)')
     elif LEVEL > 30:
         # Fixed typo: "litter" -> "less".
         raise ValueError('4009:level must be less than 30')
     else:
         latlng = LatLng.from_degrees(latitude, longitude)
         cell_id = CellId.from_lat_lng(latlng)
         level_cell_id = cell_id.parent(LEVEL)
         return level_cell_id.id()
예제 #13
0
    def cv(self, x_train, y_train, model=None, cv_round=5):
        """Cross-validate the configured estimator.

        Args:
            x_train (DataFrame): training features.
            y_train (DataFrame): training labels.
            model (str): path of an existing model for incremental
                training (use with care on small data sets — extra trees
                change the model).
            cv_round (int): number of CV folds.

        Returns:
            xgboost cv history for "xgb", otherwise GridSearchCV.cv_results_.
        """
        jobs = ConfManage.getInt("PARALLEL_NUM")

        if self.estimator_name == "xgb":
            import xgboost as xgb
            self.parameters["n_jobs"] = jobs
            train_matrix = xgb.DMatrix(x_train, y_train)
            return xgb.cv(self.parameters, train_matrix, nfold=cv_round, metrics="rmse")

        # Only surface sklearn's verbose output when debug logging is enabled.
        log_level = ConfManage.getInt("LOG_LEVEL")
        verbosity = log_level if log_level == logging.DEBUG else 0
        searcher = GridSearchCV(estimator=self.estimator,
                                param_grid=self.parameters,
                                scoring='neg_mean_squared_error',
                                verbose=verbosity,
                                n_jobs=jobs,
                                cv=cv_round)
        searcher.fit(x_train, y_train)
        return searcher.cv_results_
예제 #14
0
def multi_thread(functions, args=None, kwds=None, threadnum=None):
    """Run functions concurrently in a thread pool and wait for completion.

    Args:
        functions: sequence of callables.
        args: optional sequence of positional-argument tuples, one per
            function; empty means no positional args for any of them.
        kwds: optional sequence of keyword-argument dicts, one per function.
            (Both were previously mutable default lists, shared across calls.)
        threadnum: pool size; defaults to the PARALLEL_NUM config value.

    Returns:
        list of AsyncResult objects in call order; call .get() on each
        to obtain the corresponding function's return value.
    """
    # Resolve defaults per call to avoid the shared-mutable-default pitfall.
    args = [] if args is None else args
    kwds = [] if kwds is None else kwds
    threadnum = ConfManage.getInt("PARALLEL_NUM") if threadnum is None else threadnum
    logger.debug('mulit_thread processes={}'.format(threadnum))
    pool = ThreadPool(threadnum)
    results = []
    for i in range(len(functions)):
        arg = () if len(args) == 0 else args[i]
        kwd = {} if len(kwds) == 0 else kwds[i]
        results.append(pool.apply_async(functions[i], args=arg, kwds=kwd))
    pool.close()
    pool.join()
    return results
예제 #15
0
    def train(self, x_train, y_train, cv_round=5, model=None):
        """Fit the configured estimator on the training set.

        Args:
            x_train (DataFrame): training features.
            y_train (DataFrame): training labels.
            cv_round (int): unused here; kept for interface symmetry with cv().
            model (str): existing xgboost model path for incremental training
                (use with care on small data sets — extra trees change the model).
        """
        jobs = ConfManage.getInt("PARALLEL_NUM")

        if self.estimator_name != "xgb":
            self.model = self.estimator(**self.parameters).fit(x_train, y_train)
            return
        self.parameters["n_jobs"] = jobs
        if model is not None:
            # Incremental runs add a smaller batch of trees on top.
            self.parameters["n_estimators"] = 50
        self.model = self.estimator(**self.parameters).fit(x_train, y_train, xgb_model=model)
예제 #16
0
def load_pickle_cache(name='undefined', using_joblib=False):
    """Load a pickle by name, memoised through the shared cache."""
    cache_key = 'pickle_cache_{}'.format(name)
    cached_value = cache.get(cache_key)
    if cached_value is not None:
        logger.debug(
            'load_pickle_cache, fetch from cache, cache_key={}'.format(
                cache_key))
        return cached_value
    # Cache miss: read the raw pickle and populate the cache on success.
    logger.debug('load_pickle_cache, fetch from raw pickle')
    loaded = load_pickle(name, using_joblib)
    if loaded is not None:
        status = cache.set(cache_key, loaded,
                           ConfManage.getInt("PICKLE_CACHE_EXPIRE"))
        logger.debug(
            'load_pickle_cache, set cache, cache_key={}, status={}'.format(
                cache_key, status))
    return loaded
예제 #17
0
    def load_model_cache(self, name='undefined', using_joblib=False):
        """Load a trained model, preferring the in-process cache.

        NOTE(review): the branch logic looks inverted relative to
        load_pickle_cache(): on a cache *hit* (ret is not None) the value
        is overwritten via load_pickle(), and the two debug messages at
        the bottom appear swapped ("set cache" on the load path, "fetch
        from cache" on the None path).  Confirm the intended flow.
        """
        cache_key = 'pickle_cache_{}'.format(name)
        ret = self.cache.get(cache_key)
        if ret is None:
            logger.debug('load_pickle_cache, fetch from raw pickle')
            # Model files live under pickles/<app_mode>-<zone>-<name>.
            path = "pickles/{app_mode}-{zone}-{estmator_key}".format(app_mode=ConfManage.getString("APP_MODE"),
                                                                     zone=ConfManage.getString("ZONE"),
                                                                     estmator_key=name)
            if name[:3] == "xgb":
                ret = xgb.Booster(model_file=path)
            elif name[:2] == "tf":
                # Keras models are saved uncompiled; restore optimizer/loss here.
                ret = tf.keras.models.load_model(path, compile=False)
                ret.compile(optimizer=self.estimator().get_optimizer(), loss=self.estimator().loss_class,  # TODO: self.estimator() is constructed without initialisation settings
                            metrics=['mae', 'mse'])
        else:
            ret = load_pickle(name, using_joblib)
        if ret is not None:
            cached = self.cache.set(cache_key, ret, ConfManage.getInt("PICKLE_CACHE_EXPIRE"))
            logger.debug('load_pickle_cache, set cache, cache_key={}, status={}'.format(cache_key, cached))
        else:
            logger.debug('load_pickle_cache, fetch from cache, cache_key={}'.format(cache_key))
        return ret
예제 #18
0
from tools.pickler import delete_pickle, load_pickle, save_pickle
from tools.modeler import get_model
from tools.timer import get_run_time, LOGGABLE_ARROW_FORMAT as loggable
from tools.parallel import multi_thread

if sys.version_info[:2] in [(2, 6), (2, 7)]:
    # Python 2: force UTF-8 as the process-wide default string encoding.
    reload(sys)
    sys.setdefaultencoding('utf-8')
elif sys.version_info[:2] in [(3, 6), (3, 7)]:
    # pylint: disable=E0401, E0611, E1101
    import importlib

    # NOTE(review): reloading sys on Python 3 has no encoding effect;
    # presumably kept for parity with the Python 2 branch — confirm.
    importlib.reload(sys)

# Environment-derived module-level constants and shared clients.
ENV_ARROW_TIMEZONE = ConfManage.getString("ARROW_TIMEZONE")
ENV_ARROW_TZSHIFT = ConfManage.getInt("ARROW_TZSHIFT")
ENV_DATA_API_TIMERANGE = ConfManage.getInt("DATA_API_TIMERANGE")
ENV_ZONE = ConfManage.getString("ZONE")
logger = Logger.get_instance(ConfManage.getString("LOG_CRON_NAME"))
client = ApiClient()
osrm_api_client = OsrmApi()


def fetch_model_info(id_list, model_name='order', col=None, chunk_size=500):
    """Fetch rows for id_list from the named model in chunks of chunk_size.

    NOTE(review): the visible body only assigns `result` on the first
    chunk and never returns it — this function appears truncated here;
    confirm against the full source before relying on it.
    """
    model = get_model(model_name)
    result = None
    id_chunks = chunks(id_list, chunk_size)
    for id_chunk in id_chunks:
        if result is None:
            result = model.fetch_in(
                id_chunk) if col is None else model.fetch_in(id_chunk, col=col)
예제 #19
0
class ApiClient(object):
    """
    Dispatch data reads to either the HBase client or the HTTP data API,
    depending on the API_TYPE configuration value.
    """
    def __init__(self):
        self.conf = ConfManage()
        # "URL" -> HTTP data API; anything else -> HBase.
        # NOTE(review): the instance attribute self.client shadows the
        # staticmethod of the same name below — confirm that is intended.
        if self.conf.getString("API_TYPE") == "URL":
            self.client = self.dataapi_client()
        else:
            self.client = self.hbase_client()

    @staticmethod
    def client():
        """Build a backend client from config without instantiating ApiClient."""
        if ConfManage.getString("API_TYPE") == "URL":
            client = ApiClient.dataapi_client()
        else:
            client = ApiClient.hbase_client()
        return client

    @staticmethod
    def hbase_client():
        """Return a new HBase-backed client."""
        return HbaseClient()

    @staticmethod
    def dataapi_client():
        """Return a new HTTP data-API client."""
        return SimpleHttpClient()

    def set_client(self, client):
        """Override the active backend client (e.g. for tests); returns it."""
        self.client = client
        return self.client

    def get_client(self):
        """Return the active backend client."""
        return self.client

    def get_data(self, **kwargs):
        """Fetch data through the active client; kwargs are passed through."""
        data = self.client.get_data(**kwargs)
        return data

    def get_cache_data(self, key, **kwargs):
        """
        Fetch data through the shared cache, regenerating under a mutex.

        Args:
            key: cache key
            **kwargs: forwarded to get_data() when the value must be rebuilt.

        Returns: pd.DataFrame

        """
        # TTL threshold below which the entry is refreshed proactively.
        update_cache_time = self.conf.getInt("CACHE_UPDATE")
        retry = 3
        while True:
            data = cache.get(key)
            if data is None:
                # Cache miss: the mutex ensures a single rebuilder at a time.
                if cache.set_mutex(key, 2):
                    try:
                        data = self.get_data(**kwargs).to_json()
                        cache.set(key, data)
                        cache.delete(key + "_mutex")
                    except Exception:
                        # Release the mutex before propagating the failure.
                        cache.delete(key + "_mutex")
                        raise
                    break
                else:
                    # Another worker is rebuilding; wait, then retry.
                    time.sleep(1)
                    retry -= 1
                    if retry == 0:
                        logger.error("Cache msg: Get cache data fail while retry three times, key: {}".format(key))
                        raise Exception("1302:Get cache data fail while retry three times, key: {}".format(key))
            else:
                # Cache hit: refresh when close to expiry, else serve as-is.
                extime = cache.ttl(key)
                if extime <= update_cache_time:
                    if cache.set_mutex(key, 2):
                        try:
                            data = self.get_data(**kwargs).to_json()
                            cache.set(key, data)
                            cache.delete(key + "_mutex")
                        except Exception:
                            # Refresh failed: fall back to the stale value.
                            logger.error("Cache msg:get {} failed, return old date".format(key))
                            cache.delete(key + "_mutex")
                            return data
                        break
                else:
                    break
        return read_json(data)

    def collect_batch_data(self, start_time, end_time, table, topic, columns=None, record_path=None, meta=None):
        """Fetch [start_time, end_time] in fixed-size hour slices and concatenate.

        NOTE(review): DataFrame.append is deprecated (removed in pandas 2.x);
        confirm the pinned pandas version still provides it.
        """
        st = start_time
        et = end_time
        data_df = pd.DataFrame()
        # Slice size derived from DATA_API_TIMERANGE requests/day; 12h fallback.
        hours_interval = int(24 / ENV_DATA_API_TIMERANGE) if 24 % ENV_DATA_API_TIMERANGE == 0 else 12
        while st < et:
            snt = st.shift(hours=hours_interval)
            # shift(seconds=1) avoids re-fetching the boundary second.
            data_df = data_df.append(
                self.get_data(table=table, topic=topic, start_time=st.shift(seconds=1), end_time=snt, columns=columns,
                                record_path=record_path, meta=meta))
            st = snt
        return data_df

    def fetch_data(self, start_time, end_time, table, topic, columns=None, record_path=None, meta=None,
                   save_file_prefix="order_accept_"):
        """Load topic data from its pickle, collecting and saving when absent or empty."""
        df = load_pickle(save_file_prefix + topic)
        if df is None:
            df = self.collect_batch_data(start_time=start_time, end_time=end_time, table=table, topic=topic,
                                         columns=columns, record_path=record_path, meta=meta)
            save_pickle(df, save_file_prefix + topic)
        else:
            if len(df) == 0:
                df = self.collect_batch_data(start_time=start_time, end_time=end_time, table=table, topic=topic,
                                        columns=columns, record_path=record_path, meta=meta)
                save_pickle(df, save_file_prefix + topic)
        logger.info('Fetch %s (Count): %d' % (topic, len(df)))
        return df
예제 #20
0
    parser.add_argument('predict_target',
                        help='目标值',
                        nargs='?',
                        type=str,
                        default='accept')
    parser.add_argument('--holdout', help='是否拆分3/7数据训练模型',
                        action='store_true')  # True or Flase
    parser.add_argument("-s",
                        "--shift_days",
                        help="The last few days",
                        type=int,
                        default=-1)  # True or Flase
    args = parser.parse_args()
    date = args.date
    pickle = args.pickle
    optimal = args.optimal
    estimator = args.estimator
    predict_target = args.predict_target
    holdout = args.holdout
    shift_days = args.shift_days

    mode = predict_target if optimal else 'optimal'
    logger.info('Arguments: estimator=%s, predict-target=%s, mode=%s' %
                (estimator, predict_target, mode))
    logger.info('Environment-Configs: training-interval=%d' %
                (ConfManage.getInt("TRAINING_INTERVAL")))
    Logger.resource_checkpoint('post-argparse')

    preprocess(date, pickle, estimator, predict_target, holdout, mode,
               shift_days)
예제 #21
0
        raise TypeError("datetime isn't arrow.Arrow")

    quarter = datetime.date().month
    if quarter in (1, 2, 3):
        str_quarter = 1
    elif quarter in (4, 5, 6):
        str_quarter = 2
    elif quarter in (7, 8, 9):
        str_quarter = 3
    else:
        str_quarter = 4
    return str_quarter


# Timer configuration: warn when a timed function exceeds this many seconds.
timepiece_time = ConfManage.getInt("TIMEPIECE_TIME")
timeout = float(timepiece_time) if timepiece_time else 5


def timepiece(timeout=timeout, run=ConfManage.getBool("TIMEPIECE_RUN"), msg=0):
    def starttest(fun):
        def fun_run(*args, **kwargs):
            if run:
                starttime = time.time()
                res = fun(*args, **kwargs)
                endtime = time.time()
                totaltime = round(endtime - starttime, 4)
                if totaltime >= timeout:
                    logger.warning("FunTimeout({}s) funtion={}, msg:{}".format(
                        timeout, fun.__name__, kwargs if msg else None))
                else:
예제 #22
0
 def __init__(self, log_name):
     """Configure the logging tree via dictConfig and attach a stdout handler.

     NOTE(review): the 'null' handler is actually a FileHandler despite its
     name, and disable_existing_loggers=True will silence any logger
     created before this runs — confirm both are intended.
     """
     dictConfig({
         'version': 1,
         'disable_existing_loggers': True,
         # Log record formatters.
         'formatters': {
             'verbose': {
                 'format':
                 "[%(asctime)s][%(filename)s][%(levelname)s]: %(message)s",
                 'datefmt': "%Y-%m-%d %H:%M:%S"
             },
             'simple': {
                 'format': '%(levelname)s %(message)s'
             },
         },
         'handlers': {
             'null': {
                 'level': ConfManage.getInt("LOG_LEVEL"),
                 'class': 'logging.FileHandler',
                 'filename': 'logs/{}.log'.format(log_name),
             },
             'console': {
                 'level': ConfManage.getInt("LOG_LEVEL"),
                 'class': 'logging.StreamHandler',
                 'formatter': 'verbose',
             },
             'thread': {
                 'level': ConfManage.getInt("LOG_LEVEL"),
                 'class': 'logging.handlers.TimedRotatingFileHandler',
                 'when': "D",
                 # Keep at most 10 rotated files.
                 'backupCount': 10,
                 # If delay is true,
                 # then file opening is deferred until the first call to emit().
                 'delay': True,
                 'formatter': 'verbose',
                 'filename': 'logs/{}.log'.format(log_name),
             },
             'process': {
                 'level': ConfManage.getInt("LOG_LEVEL"),
                 # Without a concurrency-safe handler class, multi-process
                 # logging drops records.
                 'class': 'cloghandler.ConcurrentRotatingFileHandler',
                 # Rotate when the file reaches 1 GB (1024**3 bytes).
                 'maxBytes': 1024 * 1024 * 1024,
                 # Keep at most 10 rotated files.
                 'backupCount': 10,
                 'delay': True,
                 'formatter': 'verbose',
                 'filename': 'logs/{}.log'.format(log_name),
             },
         },
         'loggers': {
             ConfManage.getString("LOG_BASE_NAME"): {
                 'handlers': ['process'],
                 'level': ConfManage.getInt("LOG_LEVEL"),
             },
             ConfManage.getString("LOG_CRON_NAME"): {
                 'handlers': ['thread'],
                 'level': ConfManage.getInt("LOG_LEVEL"),
             },
             ConfManage.getString("LOG_REQ_NAME"): {
                 'handlers': ['process'],
                 'level': ConfManage.getInt("LOG_LEVEL"),
             },
         }
     })
     self.logger = logging.getLogger(log_name)
     # Mirror everything to stdout as well.
     streamHandler = StreamHandler(sys.stdout)
     self.logger.addHandler(streamHandler)
예제 #23
0
def main():
    """Obtain Information from Data-API and MySQL Database"""
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--clear', \
                        help='Clear previously saved pickles.', action='store_true')
    parser.add_argument('-r', '--reverse', \
                        help='whether clear previously data.', action='store_true')
    parser.add_argument('-d', '--date', \
                        help='Date used for calculation.', type=str)
    parser.add_argument('-p', '--pickle', type=str, \
                        help='Pickle name for saving latest data-collection.', default='data')
    parser.add_argument('-u', '--updata', type=bool, \
                        help='Merge data with new feature to data.pkl.', default=False)
    parser.add_argument('-f', '--funtion', \
                        help='Update new feature from funtion.')
    parser.add_argument('-m', '--merge_on', \
                        help='Field names to join on. Must be found in both DataFrames.')
    args = parser.parse_args()

    # Update mode: merge a newly computed feature into the existing pickle.
    if args.updata:
        data = load_pickle(args.pickle)
        if args.funtion is not None and args.merge_on is not None:
            update_data(logger, args.funtion, data, args.merge_on, args.pickle)
        else:
            logger.error(
                'Funtion and Merge_on is None, Please provide corresponding parameters'
            )
        return

    # Clear all saved pickles and exit.
    if args.clear:
        clear_pickles(logger)
        return

    # NOTE(review): with action='store_true', args.reverse is False/True and
    # never None, so is_reverse is always True — confirm the intent.
    is_reverse = False if args.reverse is None else True  # collect forwards or backwards
    run_time = get_run_time(None, 0, False)
    logger.info('Run-Time: %s' % run_time)
    collect_date = None if args.date is None else get_run_time(args.date)
    logger.info('Collect-Date: %s' % collect_date)
    # Last timestamp the collection should cover (end of yesterday).
    end_time = run_time.shift(days=-1).ceil('day')
    logger.info('End-Time: %s' % end_time)

    pickled = load_pickle(args.pickle)
    collected_start_time = None
    collected_count = 0
    if pickled is not None and isinstance(
            pickled, pd.DataFrame) and 'time' in pickled.columns:
        times = pickled['time']
        del pickled
        # Distinct calendar days already collected.
        collected_count = len(
            times.apply(lambda order_time: order_time.date()).unique())
        collected_start_time = times.min()
        logger.info('Min collected order_time Date: %s' %
                    collected_start_time.format(loggable))
        collected_end_time = times.max()
        logger.info('Max collected order_time Date: %s' %
                    collected_end_time.format(loggable))

        if collect_date is not None:
            if collect_date > end_time:
                logger.warning(
                    'collect_date can not greater then end_time {} > {}'.
                    format(collect_date.format(loggable),
                           end_time.format(loggable)))
                return
            # Extend the window backwards or forwards past the collected range.
            # NOTE(review): if collect_date falls inside the already-collected
            # range, start_time is never assigned and the code below raises
            # UnboundLocalError — confirm whether that path can occur.
            if collect_date < collected_start_time.floor('day'):
                start_time = collect_date.floor('day')
                end_time = collected_start_time.shift(days=-1).ceil('day')
            elif collect_date > collected_end_time.ceil('day'):
                start_time = collected_end_time.shift(days=1).floor('day')
                end_time = collect_date.ceil('day')
            else:
                logger.warning('collect_data invalid. {}'.format(collect_date))
        else:
            if collected_end_time >= end_time:
                logger.info('Targeted Run-Time already in Collection-Interval')
                return
            else:
                start_time = collected_end_time.shift(days=1).floor('day')

        # Negative gap_days means there are days left to collect.
        gap = start_time.shift(days=-1).date() - end_time.date()
        gap_days = gap.days

    else:
        logger.info('Data empty!')
        gap_days = -ConfManage.getInt("COLLECTION_GAP")
        start_time = end_time.shift(days=gap_days + 1).floor('day')

    logger.info('Total Collection Interval: %d/%d [%s - %s]' %
                (collected_count, ConfManage.getInt("COLLECTION_INTERVAL"),
                 start_time.format(loggable), end_time.format(loggable)))

    if gap_days >= 0:
        logger.info('Targeted Run-Time already in Collection-Interval')
        return

    logger.info('Gap: %d' % (gap_days))
    logger.info(
        'Gap Interval: %d [%s - %s]' %
        (gap_days, start_time.format(loggable), end_time.format(loggable)))
    try:
        # Collect day by day for any gap of one day or more.
        for i in range(-gap_days, 0, -1):
            end_time = start_time.ceil('day')
            logger.info('Collecting data in [{} - {}]'.format(
                start_time.format(loggable), end_time.format(loggable)))
            collect(logger, start_time, end_time, args.pickle)
            logger.info('Success collect data in [{} - {}] \n\n'.format(
                start_time.format(loggable), end_time.format(loggable)))
            start_time = start_time.shift(days=1)
        trim_outdated(logger, run_time, args.pickle)  # defaults to keeping the most recent 30 days when unconfigured
    except (AttributeError, ValueError) as err:
        logger.error(err)
        logger.error('Trace: {}'.format(traceback.format_exc()))
    except KeyboardInterrupt:
        logger.info('Process manually interupted at {}'.format(arrow.utcnow()))
    logger.info('Releasing Logger...')
    return 0
예제 #24
0
 def set(self, key, data, age=ConfManage.getInt("CACHE_AGE")):
     """Write key=data to Redis; a positive age sets a jittered TTL."""
     # Spread expirations out by up to 10 s; age <= 0 means no expiry.
     expiry = None if age <= 0 else age + randrange(start=0, stop=10, step=1)
     with self.conn as conn:
         return conn.set(key, data, expiry)
예제 #25
0
 def __init__(self):
     # Shared application logger.
     self.logger = Logger.get_instance(
         ConfManage.getString("LOG_BASE_NAME"))
     # Cache client and TTL (seconds) for OSRM route lookups.
     self.cache = Cache().client
     self.cache_expire = ConfManage.getInt("OSRM_CACHE_EXPIRE")