Example #1
def test_multiprocessing_safety(mongo_host, library_name):
    # Create/initialize the library in the parent process, then spawn the children and start them at roughly the same time
    total_processes = 64
    total_writes_per_child = 100

    register_get_auth_hook(my_auth_hook)

    global MY_ARCTIC
    MY_ARCTIC = Arctic(mongo_host=mongo_host)

    MY_ARCTIC.initialize_library(library_name, VERSION_STORE)
    assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)

    processes = [Process(target=f, args=(library_name, total_writes_per_child, True)) for _ in range(total_processes)]

    for p in processes:
        p.start()

    for p in processes:
        p.join()

    for p in processes:
        assert p.exitcode == 0

    assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)
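These multiprocessing tests rely on a module-level worker f, an auth hook my_auth_hook and the global MY_ARCTIC, none of which are shown on this page. A minimal sketch of what such a worker might look like (the body, the spin-wait and the meaning of the third argument are assumptions, not the original code):

import os

def f(library_name, total_writes, verify_lib):
    # Hypothetical worker: spin until the parent process has created the
    # library (needed when children are started before initialize_library),
    # then perform a burst of writes from this child process.
    while not MY_ARCTIC.library_exists(library_name):
        pass
    lib = MY_ARCTIC[library_name]
    pid = os.getpid()
    for i in range(total_writes):
        lib.write("sym_{}_{}".format(pid, i), [pid, i])
    if verify_lib:
        assert isinstance(MY_ARCTIC[library_name], VersionStore)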
Example #2
def test_multiprocessing_safety(mongo_host, library_name):
    # Create/initialize the library in the parent process, then spawn the children and start them at roughly the same time
    total_processes = 64
    total_writes_per_child = 100

    register_get_auth_hook(my_auth_hook)

    global MY_ARCTIC
    MY_ARCTIC = Arctic(mongo_host=mongo_host)

    MY_ARCTIC.initialize_library(library_name, VERSION_STORE)
    assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)

    processes = [
        Process(target=f, args=(library_name, total_writes_per_child, True))
        for _ in range(total_processes)
    ]

    for p in processes:
        p.start()

    for p in processes:
        p.join()

    for p in processes:
        assert p.exitcode == 0

    assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)
Example #3
def test_multiprocessing_safety_parent_children_race(mongo_host, library_name):
    # Create the Arctic instance, then fork/start the children immediately (without waiting)
    total_iterations = 12
    total_processes = 6
    total_writes_per_child = 20

    global MY_ARCTIC

    for i in range(total_iterations):
        processes = list()

        MY_ARCTIC = Arctic(mongo_host=mongo_host)
        for j in range(total_processes):
            p = Process(target=f, args=(library_name, total_writes_per_child, False))
            p.start()  # start immediately; don't wait for all child processes to be created first
            processes.append(p)

        MY_ARCTIC.initialize_library(library_name, VERSION_STORE)  # this unblocks the spinning children

        for p in processes:
            p.join()

        for p in processes:
            assert p.exitcode == 0

        MY_ARCTIC.reset()

    assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)
Example #4
def test_multiprocessing_safety_parent_children_race(mongo_host, library_name):
    # Create the Arctic instance, then fork/start the children immediately (without waiting)
    total_iterations = 12
    total_processes = 6
    total_writes_per_child = 20

    global MY_ARCTIC

    for i in range(total_iterations):
        processes = list()

        MY_ARCTIC = Arctic(mongo_host=mongo_host)
        for j in range(total_processes):
            p = Process(target=f, args=(library_name, total_writes_per_child, False))
            p.start()  # start immediately; don't wait for all child processes to be created first
            processes.append(p)

        MY_ARCTIC.initialize_library(library_name, VERSION_STORE)  # this will unblock spinning children

        for p in processes:
            p.join()

        for p in processes:
            assert p.exitcode == 0

        MY_ARCTIC.reset()

    assert isinstance(MY_ARCTIC.get_library(library_name), VersionStore)
Example #5
def _arctic_loader():
    host = Arctic(arctic_opts['host'])
    lib = host.get_library(arctic_opts['library'])
    read_kwargs = {}
    start, end = map(arctic_opts.get, ['start', 'end'])
    if start and end:
        read_kwargs['chunk_range'] = pd.date_range(start, end)
    data = lib.read(arctic_opts['node'], **read_kwargs)
    if isinstance(data, VersionedItem):
        data = data.data
    return data
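_arctic_loader reads its connection settings from a module-level arctic_opts dict that is not shown here; a plausible shape, with placeholder values matching the lookups above:

arctic_opts = {
    'host': 'localhost',      # MongoDB host passed to Arctic()
    'library': 'my_library',  # Arctic library to read from
    'node': 'MY_SYMBOL',      # symbol to read
    'start': '2021-01-01',    # optional: start of the chunk_range
    'end': '2021-12-31',      # optional: end of the chunk_range
}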
Example #6
def update_pool_cmd(ctx, pool_id, start, end, libname, mongo_uri):
    config = Config()
    ctx.obj["start"] = start
    ctx.obj["end"] = end

    start, end = datetime.datetime.strptime(start, "%Y%m%d"), \
        datetime.datetime.strptime(end, "%Y%m%d")

    # for dumping usage
    pool = ctx.obj["pools"][pool_id]()
    logger.info(f"Proccessing Pool: {pool.__class__.__name__.lower()}")
    symbols = pool.variables.keys()

    # if possible, load from the arctic MongoDB store, then fill in the missing data
    try:
        store = Arctic(mongo_uri)
        logger.info(f"Connecting mongodb from: URI={mongo_uri}.")

        # This could be improved: fetch asynchronously, since every symbol may have a different length

        data_ = {}
        if store.library_exists(libname):
            lib = store.get_library(libname)
            start_ = start

            for sym in symbols:
                try:
                    d = lib.read(sym).data
                    data_[sym] = d
                    if d.index[-1] > start_:  # check whether the last stored date is past the requested start
                        start_ = d.index[-1]
                except arctic.exceptions.NoDataFoundException as e:
                    logger.info(e)
            start = start_

    # update from start, in order to override the start-date data point
    except pymongo.errors.ServerSelectionTimeoutError as e:
        click.echo(str(e))

    data = pool.get_batch(symbols, start, end)
    # merge data by replacing
    if data_:
        for sym, d in data.items():
            if sym in data_:  # symbols that were missing from the DB have nothing to merge
                d = pd.merge(data_[sym], d)  # join the stored and newly fetched data on their common columns
            data[sym] = d
    # dataset
    for symb, var in pool.variables.items():
        logger.info(f"symbol: {symb}")
        yield var, data[symb]
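Note that pd.merge joins the stored and fetched frames on their common columns rather than overwriting overlapping rows. If the intent is "newly fetched data wins, stored data fills the gaps", DataFrame.combine_first expresses that directly; a sketch, assuming both frames share the same index and columns:

for sym, d in data.items():
    if sym in data_:
        # values from the fresh frame d take precedence; rows only present
        # in the stored frame are kept
        data[sym] = d.combine_first(data_[sym])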
Example #7
def loader_func(**kwargs):
    try:
        from arctic import Arctic
        from arctic.store.versioned_item import VersionedItem
    except ImportError:
        raise ImportError(
            'In order to use the arctic loader you must install arctic!')
    host = Arctic(kwargs.get('host'))
    lib = host.get_library(kwargs.get('library'))
    read_kwargs = {}
    start, end = (kwargs.get(p) for p in ['start', 'end'])
    if start and end:
        read_kwargs['chunk_range'] = pd.date_range(start, end)
    data = lib.read(kwargs.get('node'), **read_kwargs)
    if isinstance(data, VersionedItem):
        data = data.data
    return data
Example #8
def loader_func(**kwargs):
    try:
        from arctic import Arctic
        from arctic.store.versioned_item import VersionedItem
    except ImportError:
        raise ImportError(
            "In order to use the arctic loader you must install arctic!")
    host = Arctic(kwargs.get("host"))
    lib = host.get_library(kwargs.get("library"))
    read_kwargs = {}
    start, end = (kwargs.get(p) for p in ["start", "end"])
    if start and end:
        read_kwargs["chunk_range"] = pd.date_range(start, end)
    data = lib.read(kwargs.get("node"), **read_kwargs)
    if isinstance(data, VersionedItem):
        data = data.data
    return data
Example #9
    def poke(self, context):
        hook = MongoHook(self.mongo_conn_id, libname=self.libname)
        client = hook.get_conn()
        store = Arctic(client)

        self.log.info(
            f'Poking for {self.mongo_conn_id}, {self.libname}: {self.symbol}')

        try:
            if store.library_exists(self.libname):
                lib = store.get_library(self.libname)
                if lib.has_symbol(self.symbol):
                    return self.python_call_back(
                        self.meta,
                        lib.read_meta(self.symbol).metadata)
        except OSError:
            return False
        return False
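The sensor delegates the actual decision to self.python_call_back, which is called with the expected metadata (self.meta) and the metadata stored in Arctic. A minimal callback, purely as an assumption about that contract:

def metadata_is_fresh(expected_meta, stored_meta):
    # Hypothetical callback: let poke() succeed once the stored metadata
    # covers the expected as-of date.
    if not stored_meta or 'asof' not in stored_meta:
        return False
    return stored_meta['asof'] >= expected_meta['asof']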
Example #10
def _arctic_loader():
    try:
        from arctic import Arctic
        from arctic.store.versioned_item import VersionedItem
    except BaseException as ex:
        logger.exception('In order to use the arctic loader you must install ahl.core!')
        raise ex
    host = Arctic(arctic_opts['host'])
    lib = host.get_library(arctic_opts['library'])
    read_kwargs = {}
    start, end = map(arctic_opts.get, ['start', 'end'])
    if start and end:
        read_kwargs['chunk_range'] = pd.date_range(start, end)
    data = lib.read(arctic_opts['node'], **read_kwargs)
    if isinstance(data, VersionedItem):
        data = data.data
    return data
Example #11
async def update_basekets(model_id, start, end):
    from akira.akira_models.basket.utils import get_model
    # updating model
    store = Arctic(os.environ.get("MONGODB_URI",
                                  "localhost:27017"))
    lib = store.get_library("akira.tickers")
    spec = data_spec[model_id]
    cols = {}
    for symbol in spec["symbols"]:
        cols[symbol] = lib.read(
            symbol, date_range=DateRange(
                start=start, end=end))

    data = pd.concat(cols, axis=1)
    model_cls_spec = model_spec[model_id]
    model = get_model(model_id)(**model_cls_spec)
    model.fit(data)
    # submit trades
    return model
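data_spec and model_spec are module-level lookup tables keyed by model_id; they are not shown in this example, so the layout below is an assumption consistent with how they are used above:

data_spec = {
    'fx-basket': {'symbols': ['USDJPY', 'EURJPY', 'GBPJPY']},  # columns read from akira.tickers
}
model_spec = {
    'fx-basket': {'window': 60, 'n_components': 2},  # keyword arguments for the model class
}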
Example #12
class Data(object):
    def __init__(self, dbname, offline=False):
        self.libraries = {}
        self.distributor = Distributer.default()

        if offline:
            self.db = OfflineDB()
            log.critical('WARNING Running in offline mode')
            return

        self.db = Arctic(dbname)

        # initialize databases
        for field in FIELDS:
            try:
                self.libraries[field] = _getLib(self.db, field)
            except (arctic.exceptions.LibraryNotFoundException,
                    ServerSelectionTimeoutError):
                log.critical('Arctic not available, is mongo offline??')
                raise

    def cache(self, symbols=None, fields=None, delete=False):
        fields = fields or FIELDS
        symbols = symbols or p.symbolsDF().index.values.tolist()

        to_delete, to_fill, to_update = self.initialize(symbols, fields)

        if delete:
            # prune data
            self.delete(to_delete)

        self.backfill(to_fill)
        self.update(to_update)
        self.validate()

    def delete(self, to_delete=None):
        # delete data no longer needed
        for field in to_delete:
            for symbol in to_delete[field]:
                log.critical('Deleting %s from %s' % (symbol, field))
                self.libraries[field].delete(symbol)

    def backfill(self, to_fill):
        # backfill data if necessary
        for field in to_fill:
            log.critical('Backfilling %d items' % len(to_fill[field]))
            lib = self.libraries[field]

            for symbol, data in whichBackfill(field)(self.distributor,
                                                     to_fill[field]):
                log.critical('Filling %s for %s' % (symbol, field))
                data_orig = lib.read(symbol).data
                _appendIfNecessary(lib, symbol, data_orig, data)

    def update(self, to_update):
        # update data if necessary
        for field in to_update:
            log.critical('Updating %d items' % len(to_update[field]))
            lib = self.libraries[field]

            for symbol, data in whichFetch(field)(self.distributor,
                                                  to_update[field]):
                log.critical('Updating %s for %s' % (symbol, field))
                data_orig = self.libraries[field].read(symbol).data
                _appendIfNecessary(lib, symbol, data_orig, data)

    def initialize(self, symbols=None, fields=None):
        '''setup db'''
        fields = fields or FIELDS
        symbols = symbols or p.symbolsDF().index.values.tolist()

        to_fill = {}
        to_update = {}
        to_delete = {}

        _empty = pd.DataFrame()

        # initialize database and collect what to update
        for field in FIELDS:
            if field not in to_fill:
                to_fill[field] = []
            if field not in to_update:
                to_update[field] = []
            if field not in to_delete:
                to_delete[field] = []

            library = self.libraries[field]
            all_symbols = library.list_symbols()

            for symbol in symbols:
                symbol = symbol.upper()
                if symbol not in all_symbols:
                    log.critical('Initializing %s for %s' % (symbol, field))
                    to_fill[field].append(symbol)
                    library.write(symbol,
                                  _empty,
                                  metadata={'timestamp': never()})

                else:
                    metadata = library.read_metadata(symbol.upper()).metadata
                    if not metadata or not metadata.get('timestamp'):
                        to_fill[field].append(symbol)
                    elif metadata.get('timestamp', never()) <= never():
                        to_fill[field].append(symbol)
                    elif metadata.get('timestamp',
                                      never()) < _updateTime(field):
                        to_update[field].append(symbol)

            for symbol in set(all_symbols) - set(symbols):
                to_delete[field].append(symbol)
        return to_delete, to_fill, to_update

    def validate(self, symbols=None, fields=None):
        '''look for missing data'''
        fields = fields or FIELDS
        symbols = symbols or p.symbolsDF().index.values.tolist()
        to_refill = {}
        self.initialize(symbols, fields)

        for field in FIELDS:
            tick_start_date = today()
            daily_start_date = today()
            fail_count = 0
            print_fail = False
            dates = business_days(last_month(), yesterday())
            to_refill[field] = []

            if _skip(field):
                continue

            dbs = self.db.list_libraries()
            if field not in dbs:
                log.critical('VALIDATION FAILED %s' % field)
                continue

            lib = self.db.get_library(field)
            all_symbols = lib.list_symbols()

            for symbol in symbols:
                symbol = symbol.upper()

                # if fail count too high, autofail all for speed
                if fail_count > .2 * len(all_symbols):
                    if not print_fail:
                        log.critical('VALIDATION THRESHOLD REACHED for %s' %
                                     field)
                        print_fail = True

                    if _skip(field, symbol):
                        continue

                    to_refill[field].append(symbol)
                    if field == 'DAILY':
                        daily_start_date = dates[0]
                    if field == 'TICK':
                        tick_start_date = dates[0]
                    continue

                if _skip(field, symbol):
                    continue

                if symbol not in all_symbols:
                    to_refill[field].append(symbol)
                    log.critical('VALIDATION FAILED %s for %s' %
                                 (symbol, field))
                    fail_count += 1
                    continue

                data = lib.read(symbol).data

                if data.empty:
                    log.critical('VALIDATION FAILED - DATA EMPTY %s for %s' %
                                 (symbol, field))
                    to_refill[field].append(symbol)
                    fail_count += 1
                    continue

                elif field == 'TICK':
                    for date in dates:
                        if date not in data.index:
                            log.critical(
                                'VALIDATION FAILED - DATA MISSING %s for %s : %s'
                                % (symbol, field, date.strftime('%Y%m%d')))
                            to_refill[field].append(symbol)
                            tick_start_date = min(
                                tick_start_date,
                                date) if tick_start_date is not None else date
                            fail_count += 1
                            break

                elif field == 'DAILY':
                    for date in dates:
                        if date not in data.index:
                            log.critical(
                                'VALIDATION FAILED - DATA MISSING %s for %s : %s'
                                % (symbol, field, date.strftime('%Y%m%d')))
                            to_refill[field].append(symbol)
                            daily_start_date = min(
                                daily_start_date,
                                date) if daily_start_date is not None else date
                            fail_count += 1
                            break

        # backfill data if necessary
        for field in to_refill:
            lib = self.libraries[field]

            if field == 'TICK':
                log.critical(
                    'Backfilling %d items for %s - %s' %
                    (len(to_refill[field]), field, str(tick_start_date)))
            elif field == 'DAILY':
                log.critical(
                    'Backfilling %d items for %s - %s' %
                    (len(to_refill[field]), field, str(daily_start_date)))
            else:
                log.critical('Backfilling %d items for %s' %
                             (len(to_refill[field]), field))

            for symbol, data in whichBackfill(field)(self.distributor,
                                                     to_refill[field],
                                                     from_=tick_start_date):
                log.critical('Updating %s for %s' % (symbol, field))

                data_orig = lib.read(symbol).data
                _appendIfNecessary(lib, symbol, data_orig, data)

    def read(self, symbol, field, fetch=True, fill=False):
        field = field.upper()
        symbol = symbol.upper()

        if field == 'QUOTE':
            # don't cache; quote data is instantaneous
            return p.quoteDF(symbol)
        elif field == 'COMPOSITION':
            return refetch(field, symbol)

        if field not in self.libraries and not fetch:
            return pd.DataFrame()

        lib = _getLib(self.db, field)

        if not lib.has_symbol(symbol):
            if not fetch:
                return pd.DataFrame()
            df = pd.DataFrame()
        else:
            df = lib.read(symbol).data
            metadata = lib.read_metadata(symbol).metadata

        if fetch:
            if df.empty or not metadata or not metadata.get('timestamp') or \
               metadata.get('timestamp', never()) <= never() or \
               metadata.get('timestamp', never()) < _updateTime(field):

                df = refetch(field, symbol)
                if fill:
                    lib.write(symbol, df, metadata={'timestamp': datetime.now()})
        return df
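The Data class leans on module-level helpers such as _getLib and _appendIfNecessary that are defined elsewhere in the package. A minimal sketch of the append helper, under the assumption that it only adds rows that are not already stored:

def _appendIfNecessary(lib, symbol, data_orig, data):
    # Hypothetical helper: drop rows already present, append the rest and
    # stamp the symbol with a fresh update time.
    if data is None or data.empty:
        return
    if not data_orig.empty:
        data = data[~data.index.isin(data_orig.index)]
        data = pd.concat([data_orig, data]).sort_index()
    lib.write(symbol, data, metadata={'timestamp': datetime.now()})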