Example #1
    def test_diff(self):
        branch_props = elasticsearch.Cluster(
            host="http://localhost").get_index("debug_active_data",
                                               "active_data").get_properties()
        debug_props = elasticsearch.Cluster(host="http://localhost").get_index(
            "debug", "bz_etl").get_properties()

        elasticsearch.diff_schema(branch_props, debug_props)
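diff_schema compares the properties of two index mappings and reports the differences. For orientation, a self-contained sketch of what such a diff does over plain dicts (an illustration, not pyLibrary's implementation):

# RECURSIVE WALK OVER TWO ES-STYLE "properties" TREES, REPORTING PATHS
# THAT EXIST ON ONLY ONE SIDE (illustrative only)
def diff_props(a, b, path=""):
    result = []
    for key in set(a) | set(b):
        full = path + "." + key if path else key
        if key not in b:
            result.append(("only in first", full))
        elif key not in a:
            result.append(("only in second", full))
        elif isinstance(a[key], dict) and isinstance(b[key], dict):
            result.extend(diff_props(a[key], b[key], full))
    return result

left = {"etl": {"timestamp": {"type": "double"}}}
right = {"etl": {"timestamp": {"type": "double"}, "id": {"type": "keyword"}}}
print(diff_props(left, right))  # [('only in second', 'etl.id')]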
Example #2
    def __init__(
        self,
        rollover_field,      # the FIELD with a timestamp to use for determining which index to push to
        rollover_interval,   # duration between roll-over to new index
        rollover_max,        # remove old indexes, do not add old records
        schema,              # es schema
        queue_size=10000,    # number of documents to queue in memory
        batch_size=5000,     # number of documents to push at once
        typed=None,          # indicate if we expect typed JSON
        kwargs=None          # plus additional ES settings
    ):
        if kwargs.tjson != None:  # tjson IS THE OLD NAME FOR typed; REJECT IT
            Log.error("not expected")
        if typed == None:  # typed MUST BE GIVEN EXPLICITLY; == None ALSO MATCHES mo_dots' Null
            Log.error("not expected")

        schema.settings.index.max_result_window = 100000  # REQUIRED FOR ACTIVEDATA NESTED QUERIES
        schema.settings.index.max_inner_result_window = 100000  # REQUIRED FOR ACTIVEDATA NESTED QUERIES

        self.settings = kwargs
        self.locker = Lock("lock for rollover_index")
        self.rollover_field = jx.get(rollover_field)
        self.rollover_interval = self.settings.rollover_interval = Duration(rollover_interval)
        self.rollover_max = self.settings.rollover_max = Duration(rollover_max)
        self.known_queues = {}  # MAP DATE TO INDEX
        self.cluster = elasticsearch.Cluster(self.settings)
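This constructor belongs to a rollover index: each queued document is routed to an index chosen from its rollover_field timestamp, one index per rollover_interval. A self-contained sketch of that date-to-index mapping (the <prefix>YYYYMMDD_HHMMSS suffix is an assumed naming convention for illustration, not the class's guaranteed format):

from datetime import datetime, timezone

def rollover_index_name(prefix, when, interval_days=7):
    # FLOOR THE TIMESTAMP TO THE START OF ITS INTERVAL, THEN NAME THE INDEX
    epoch_days = int(when.timestamp() // 86400)
    start = epoch_days - epoch_days % interval_days
    floor = datetime.fromtimestamp(start * 86400, tz=timezone.utc)
    return prefix + floor.strftime("%Y%m%d_%H%M%S")

doc_time = datetime(2019, 5, 3, tzinfo=timezone.utc)
print(rollover_index_name("unittest", doc_time))  # unittest20190502_000000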
Example #3
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({
            "from": branches.index,
            "format": "list",
            "limit": 10000
        }).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend([{
                "id": b.name + " " + b.locale,
                "value": b
            } for b in found_branches])
            es.flush()

        try:
            return UniqueIndex(["name", "locale"],
                               data=found_branches,
                               fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            branches.schema = branches_schema
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            return get_branches(hg, branches, kwargs=kwargs)  # RETRY NOW THAT THE INDEX EXISTS
        Log.error("problem getting branches", cause=e)
Example #4
    def __init__(
            self,
            host,
            index,  # THE NAME OF THE SNOWFLAKE (IF WRITING)
            alias=None,  # THE NAME OF THE SNOWFLAKE (FOR READING)
            type=None,
            name=None,  # THE FULL NAME OF THE TABLE (THE NESTED PATH INTO THE SNOWFLAKE)
            port=9200,
            read_only=True,
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            wait_for_active_shards=1,  # ES WRITE CONSISTENCY (https://www.elastic.co/guide/en/elasticsearch/reference/1.7/docs-index_.html#index-consistency)
            typed=None,
            kwargs=None):
        Container.__init__(self)
        if not container.config.default:
            container.config.default = {
                "type": "elasticsearch",
                "settings": unwrap(kwargs)
            }
        self.edges = Data()  # SET EARLY, SO OTHER PROCESSES CAN REQUEST IT
        self.worker = None
        self.settings = kwargs
        self._namespace = ElasticsearchMetadata(kwargs=kwargs)
        self.name = name = self._namespace._find_alias(
            coalesce(alias, index, name))
        if read_only:
            self.es = elasticsearch.Alias(alias=name,
                                          index=None,
                                          kwargs=kwargs)
        else:
            self.es = elasticsearch.Cluster(kwargs=kwargs).get_index(
                read_only=read_only, kwargs=kwargs)

        self._ensure_max_result_window_set(name)
        self.settings.type = self.es.settings.type
        self.stats = QueryStats(self.es.cluster)

        columns = self.snowflake.columns  # ABSOLUTE COLUMNS
        is_typed = any(c.es_column == EXISTS_TYPE for c in columns)

        if typed == None:
            # SWITCH ON TYPED MODE
            self.typed = is_typed
        else:
            if is_typed != typed:
                Log.error(
                    "Expecting given typed {{typed}} to match {{is_typed}}",
                    typed=typed,
                    is_typed=is_typed)
            self.typed = typed

        if not typed:
            # ADD EXISTENCE COLUMNS
            all_paths = {'.': None}  # MAP FROM path TO parent TO MAKE A TREE

            def nested_path_of(v):
                if v == '.':
                    return ('.', )
                return (v, ) + nested_path_of(all_paths[v])

            query_paths = sort_using_key(set(
                step for path in self.snowflake.query_paths for step in path),
                                         key=lambda p: len(split_field(p)))
            for step in query_paths:
                if step in all_paths:
                    continue
                else:
                    best = '.'
                    for candidate in all_paths.keys():
                        if startswith_field(step, candidate):
                            if startswith_field(candidate, best):
                                best = candidate
                    all_paths[step] = best
            for p in all_paths.keys():
                if p == ".":
                    nested_path = ('.', )
                else:
                    nested_path = nested_path_of(p)[1:]

                jx_type = (OBJECT if p == "." else NESTED)
                self.namespace.meta.columns.add(
                    Column(name=p,
                           es_column=p,
                           es_index=self.name,
                           es_type=jx_type,
                           jx_type=jx_type,
                           cardinality=1,
                           nested_path=nested_path,
                           multi=1001 if jx_type is NESTED else 1,
                           last_updated=Date.now()))
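The existence-column loop builds a tree of query paths: each path's parent is its longest prefix already known, and nested_path_of walks from a path up to the root. A self-contained sketch with plain strings standing in for mo_dots' split_field/startswith_field:

def is_prefix(path, prefix):
    # "." IS THE ROOT AND PREFIXES EVERYTHING; OTHERWISE MATCH ON A DOT
    # BOUNDARY SO "ab" IS NOT MISTAKEN FOR A CHILD OF "a"
    return prefix == "." or path == prefix or path.startswith(prefix + ".")

def build_tree(paths):
    tree = {".": None}  # MAP FROM path TO parent
    for step in sorted(paths, key=lambda p: p.count(".")):
        best = "."
        for candidate in tree:
            if is_prefix(step, candidate) and is_prefix(candidate, best):
                best = candidate
        tree[step] = best
    return tree

def nested_path_of(tree, v):
    if v == ".":
        return (".",)
    return (v,) + nested_path_of(tree, tree[v])

print(nested_path_of(build_tree(["a", "a.b", "a.b.c"]), "a.b.c"))
# ('a.b.c', 'a.b', 'a', '.')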
Example #5
    def __init__(
        self,
        hg=None,  # CONNECT TO hg
        repo=None,  # CONNECTION INFO FOR ES CACHE
        use_cache=False,  # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
        timeout=30 * SECOND,
        kwargs=None,
    ):
        if not _hg_branches:
            _late_imports()

        if not is_text(repo.index):
            Log.error("Expecting 'index' parameter")
        self.repo_locker = Lock()
        self.moves_locker = Lock()
        self.todo = mo_threads.Queue("todo for hg daemon",
                                     max=DAEMON_QUEUE_SIZE)
        self.settings = kwargs
        self.timeout = Duration(timeout)
        self.last_cache_miss = Date.now()

        # VERIFY CONNECTIVITY
        with Explanation("Test connect with hg"):
            http.head(self.settings.hg.url)

        set_default(repo, {
            "type": "revision",
            "schema": revision_schema,
        })
        kwargs.branches = set_default(
            {
                "index": repo.index + "-branches",
                "type": "branch",
            },
            repo,
        )
        moves = set_default(
            {
                "index": repo.index + "-moves",
            },
            repo,
        )

        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        cluster = elasticsearch.Cluster(kwargs=repo)
        self.repo = cluster.get_or_create_index(kwargs=repo)
        self.moves = cluster.get_or_create_index(kwargs=moves)

        def setup_es(please_stop):
            with suppress_exception:
                self.repo.add_alias()
            with suppress_exception:
                self.moves.add_alias()

            with suppress_exception:
                self.repo.set_refresh_interval(seconds=1)
            with suppress_exception:
                self.moves.set_refresh_interval(seconds=1)

        Thread.run("setup_es", setup_es)
        Thread.run("hg daemon", self._daemon)
Example #6
def setup():
    global config

    config = startup.read_settings(
        default_filename=os.environ.get('ACTIVEDATA_CONFIG'),
        defs=[{
            "name": ["--process_num", "--process"],
            "help": "Additional port offset (for multiple Flask processes",
            "type": int,
            "dest": "process_num",
            "default": 0,
            "required": False
        }])

    constants.set(config.constants)
    Log.start(config.debug)

    agg_bulk.S3_CONFIG = config.bulk.s3

    File.new_instance("activedata.pid").write(text(machine_metadata.pid))

    # PIPE REQUEST LOGS TO ES DEBUG
    if config.request_logs:
        cluster = elasticsearch.Cluster(config.request_logs)
        request_logger = cluster.get_or_create_index(config.request_logs)
        active_data.request_log_queue = request_logger.threaded_queue(
            max_size=2000, period=1)

    if config.dockerflow:

        def backend_check():
            http.get_json(config.elasticsearch.host + ":" +
                          text(config.elasticsearch.port))

        dockerflow(flask_app, backend_check)
    else:
        # IF NOT USING DOCKERFLOW, THEN RESPOND WITH A SIMPLER __version__
        add_version(flask_app)

    # SETUP DEFAULT CONTAINER, SO THERE IS SOMETHING TO QUERY
    container.config.default = {
        "type": "elasticsearch",
        "settings": config.elasticsearch.copy()
    }

    # TRIGGER FIRST INSTANCE
    if config.saved_queries:
        setattr(save_query, "query_finder", SaveQueries(config.saved_queries))

    # STARTUP QUERY STATS
    QueryStats(elasticsearch.Cluster(config.elasticsearch))

    if config.flask.port and config.args.process_num:
        config.flask.port += config.args.process_num

    # TURN ON /exit FOR WINDOWS DEBUGGING
    if config.flask.debug or config.flask.allow_exit:
        config.flask.allow_exit = None  # CLEAR THE FLAG; THE /exit ROUTE IS ADDED EXPLICITLY BELOW
        Log.warning("ActiveData is in debug mode")
        flask_app.add_url_rule('/exit', 'exit', _exit)

    if config.flask.ssl_context:
        if config.args.process_num:
            Log.error("can not serve ssl and multiple Flask instances at once")
        setup_flask_ssl()

    # ENSURE MAIN THREAD SHUTDOWN TRIGGERS Flask SHUTDOWN
    MAIN_THREAD.stopped.then(exit)
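The --process_num flag lets several Flask processes run side by side, each on its own port; the offset logic reduces to this sketch:

def flask_port(base_port, process_num):
    # FIRST PROCESS (0) KEEPS THE BASE PORT; OTHERS GET AN OFFSET
    return base_port + process_num if base_port and process_num else base_port

print(flask_port(8080, 0))  # 8080
print(flask_port(8080, 2))  # 8082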