Example #1
    def __init__(
        self,
        host,
        user,
        password,
        table,
        meta,       # REDSHIFT COPY COMMAND REQUIRES A BUCKET TO HOLD PARAMETERS
        database=None,
        port=5439,
        settings=None
    ):
        self.settings = settings
        self.db = Redshift(settings)
        INDEX_CACHE[settings.table] = wrap({"name":settings.table})  # HACK TO GET parse_columns TO WORK
        columns = parse_columns(settings.table, settings.mapping.test_result.properties)
        nested = [c.name for c in columns if c.type == "nested"]
        self.columns = wrap([c for c in columns if c.type not in ["object"] and not any(c.name.startswith(n+".") for n in nested)])

        try:
            self.db.execute("""
                CREATE TABLE {{table_name}} (
                    "_id" character varying UNIQUE,
                    {{columns}}
                )""", {
                "table_name": self.db.quote_column(settings.table),
                "columns": SQL(",\n".join(self.db.quote_column(c.name) + " " + self.db.es_type2pg_type(c.type) for c in self.columns))
            }, retry=False)
        except Exception, e:
            if "already exists" in e:
                Log.alert("Table {{table}} exists in Redshift",  table= settings.table)
            else:
                Log.error("Could not make table", e)
Example #2
    def __init__(
            self,
            index,  # NAME OF THE INDEX, EITHER ALIAS NAME OR FULL VERSION NAME
            type,  # SCHEMA NAME
            alias=None,
            explore_metadata=True,  # PROBING THE CLUSTER FOR METADATA IS ALLOWED
            timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
            debug=False,  # DO NOT SHOW THE DEBUG STATEMENTS
            settings=None):
        if index == None or type == None:
            Log.error("not allowed")
        if index == alias:
            Log.error("must have a unique index name")

        self.cluster_state = None
        self.cluster_metadata = None
        self.debug = debug
        if self.debug:
            Log.alert("elasticsearch debugging for index {{index}} is on",
                      index=settings.index)

        self.settings = settings
        self.cluster = Cluster(settings)

        try:
            index = self.get_index(index)
            if index and alias == None:
                settings.alias = settings.index
                settings.index = index
            if index == None:
                Log.error("not allowed")
        except Exception, e:
            # EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER
            pass
Example #3
    def __init__(
        self,
        index,  # NAME OF THE INDEX, EITHER ALIAS NAME OR FULL VERSION NAME
        type,  # SCHEMA NAME
        alias=None,
        explore_metadata=True,  # PROBING THE CLUSTER FOR METADATA IS ALLOWED
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        debug=False,  # DO NOT SHOW THE DEBUG STATEMENTS
        settings=None
    ):
        if index==None or type==None:
            Log.error("not allowed")
        if index == alias:
            Log.error("must have a unique index name")

        self.cluster_state = None
        self.cluster_metadata = None
        self.debug = debug
        if self.debug:
            Log.alert("elasticsearch debugging for index {{index}} is on",  index= settings.index)

        self.settings = settings
        self.cluster = Cluster(settings)

        try:
            index = self.get_index(index)
            if index and alias==None:
                settings.alias = settings.index
                settings.index = index
            if index==None:
                Log.error("not allowed")
        except Exception, e:
            # EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER
            pass
Example #4
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(Thread.STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == Thread.STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now() - TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "es_index": c.es_index,
                            "es_column": c.es_column
                        }
                    }
                })
            Log.note("Could not get {{col.es_index}}.{{col.es_column}} info",
                     col=c)
Example #5
 def _monitor(self, please_stop):
     self.service.wait()
     if DEBUG:
         Log.alert(
             "{{name}} stopped with returncode={{returncode}}", name=self.name, returncode=self.service.returncode
         )
     self.stdin.add(Thread.STOP)
     self.service_stopped.go()
Example #6
 def _monitor(self, please_stop):
     self.service.wait()
     if self.debug:
         Log.alert("{{name}} stopped with returncode={{returncode}}",
                   name=self.name,
                   returncode=self.service.returncode)
     self.stdin.add(Thread.STOP)
     self.service_stopped.go()
Example #7
 def try_till_response(self, *args, **kwargs):
     while True:
         try:
             response = self.server.get(*args, **kwargs)
             return response
         except Exception, e:
             e = Except.wrap(e)
             if "No connection could be made because the target machine actively refused it" in e:
                 Log.alert("Problem connecting")
             else:
                 Log.error("Server raised exception", e)
Example #8
    def rollback(self):
        if self.pending:
            pending, self.pending = self.pending, []

            for p in pending:
                m = Message()
                m.set_body(p.get_body())
                self.queue.write(m)

            for p in pending:
                self.queue.delete_message(p)

            if self.settings.debug:
                Log.alert("{{num}} messages returned to queue", num=len(pending))
Example #9
    def rollback(self):
        if self.pending:
            pending, self.pending = self.pending, []

            for p in pending:
                m = Message()
                m.set_body(p.get_body())
                self.queue.write(m)

            for p in pending:
                self.queue.delete_message(p)

            if self.settings.debug:
                Log.alert("{{num}} messages returned to queue",
                          num=len(pending))
Example #10
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = _get_branches_from_hg(settings.hg)

        es = elasticsearch.Cluster(settings=settings.hg.branches).get_or_create_index(settings=settings.hg.branches)
        es.add_alias()
        es.extend({"id": b.name + " " + b.locale, "value": b} for b in branches)
        Log.alert("DONE!")
    except Exception, e:
        Log.error("Problem with etl", e)
Example #11
 def __init__(
     self,
     bucket,  # NAME OF THE BUCKET
     aws_access_key_id=None,  # CREDENTIAL
     aws_secret_access_key=None,  # CREDENTIAL
     region=None,  # NAME OF AWS REGION, REQUIRED FOR SOME BUCKETS
     public=False,
     debug=False,
     settings=None
 ):
     self.uid = None
     self.bucket = s3.Bucket(settings=settings)
     Log.alert("Using {{bucket}} for S3 storage", bucket=self.bucket.name)
     self.temp_queue = PersistentQueue(bucket + "_queue.txt")
     self._figure_out_start_point()
     self.push_to_s3 = Thread.run("pushing to " + bucket, self._worker)
Example #12
    def __init__(
        self,
        alias,  # NAME OF THE ALIAS
        type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
        explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
        debug=False,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        settings=None
    ):
        self.debug = debug
        if self.debug:
            Log.alert("Elasticsearch debugging on {{index|quote}} is on",  index= settings.index)
        if alias == None:
            Log.error("Alias can not be None")
        self.settings = settings
        self.cluster = Cluster(settings)

        if type == None:
            if not explore_metadata:
                Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata.  Do not know what to do now.")

            indices = self.cluster.get_metadata().indices
            if not self.settings.alias or self.settings.alias==self.settings.index:
                alias_list = self.cluster.get("/_alias/"+self.settings.index)
                candidates = [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()]
                full_name = jx.sort(candidates, 0).last()[0]
                index = self.cluster.get("/" + full_name + "/_mapping")[full_name]
            else:
                index = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index]

            # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
            max_prop = -1
            for _type, mapping in index.mappings.items():
                if _type == "_default_":
                    continue
                num_prop = len(mapping.properties.keys())
                if max_prop < num_prop:
                    max_prop = num_prop
                    self.settings.type = _type
                    type = _type

            if type == None:
                Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))

        self.path = "/" + alias + "/" + type
Example #13
    def __init__(
            self,
            host,
            user,
            password,
            table,
            meta,  # REDSHIFT COPY COMMAND REQUIRES A BUCKET TO HOLD PARAMETERS
            database=None,
            port=5439,
            settings=None):
        self.settings = settings
        self.db = Redshift(settings)
        INDEX_CACHE[settings.table] = wrap(
            {"name": settings.table})  # HACK TO GET parse_columns TO WORK
        columns = parse_columns(settings.table,
                                settings.mapping.test_result.properties)
        nested = [c.name for c in columns if c.type == "nested"]
        self.columns = wrap([
            c for c in columns if c.type not in ["object"] and not any(
                c.name.startswith(n + ".") for n in nested)
        ])

        try:
            self.db.execute("""
                CREATE TABLE {{table_name}} (
                    "_id" character varying UNIQUE,
                    {{columns}}
                )""", {
                "table_name":
                self.db.quote_column(settings.table),
                "columns":
                SQL(",\n".join(
                    self.db.quote_column(c.name) + " " +
                    self.db.es_type2pg_type(c.type) for c in self.columns))
            },
                            retry=False)
        except Exception, e:
            if "already exists" in e:
                Log.alert("Table {{table}} exists in Redshift",
                          table=settings.table)
            else:
                Log.error("Could not make table", e)
Example #14
def main():

    try:
        settings = startup.read_settings()
        constants.set(settings.constants)
        Log.start(settings.debug)

        branches = get_branches(settings.hg)

        es = elasticsearch.Cluster(
            settings=settings.hg.branches).get_or_create_index(
                settings=settings.hg.branches)
        es.add_alias()
        es.extend({
            "id": b.name + " " + b.locale,
            "value": b
        } for b in branches)
        Log.alert("DONE!")
    except Exception, e:
        Log.error("Problem with etl", e)
Example #15
    def __init__(
            self,
            service_url,  # location of the ActiveData server we are testing
            backend_es,  # the ElasticSearch settings for filling the backend
            fastTesting=False,
            settings=None):
        if backend_es.schema == None:
            Log.error("Expecting backed_es to have a schema defined")

        letters = unicode(ascii_lowercase)
        self.random_letter = letters[int(Date.now().unix / 30) % 26]
        self.service_url = service_url
        self.backend_es = backend_es
        self.settings = settings
        self._es_test_settings = None
        self._es_cluster = None
        self._index = None

        if not containers.config.default:
            containers.config.default = {
                "type": "elasticsearch",
                "settings": backend_es
            }

        if not fastTesting:
            self.server = http
        else:
            Log.alert(
                "TESTS WILL RUN FAST, BUT NOT ALL TESTS ARE RUN!\nEnsure the `file://tests/config/elasticsearch.json#fastTesting=true` to turn on the network response tests."
            )
            # WE WILL USE THE ActiveServer CODE, AND CONNECT TO ES DIRECTLY.
            # THIS MAKES FOR SLIGHTLY FASTER TEST TIMES BECAUSE THE PROXY IS
            # MISSING
            self.server = FakeHttp()
            containers.config.default = {
                "type": "elasticsearch",
                "settings": settings.backend_es.copy()
            }
Example #16
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(Thread.STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == Thread.STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
Example #17
def parse_short_desc(bug):
    parts = bug.short_desc.split("|")
    if len(parts) in [2, 3]:
        bug.result.test = parts[0].strip()
        bug.result.message = parts[1].strip()
    elif any(map(parts[0].strip().endswith, [".html", ".py", ".js", ".xul"])) and len(parts)>2:
        bug.result.test = parts[0].strip()
        bug.result.message = parts[1].strip()
    elif len(parts) >= 4:
        set_default(bug.result, parse_status(parts[0]))
        bug.result.test = parts[1].strip()
        bug.result.message = parts[3].strip()
    elif any(black in bug.short_desc for black in blacklist):
        Log.note("IGNORED {{line}}", line=bug.short_desc)
    elif bug.bug_id in [1165765]:
        Log.note("IGNORED {{line}}", line=bug.short_desc)
    elif "###" in bug.short_desc:
        bug.short_desc = bug.short_desc.replace("###", " | ")
        parse_short_desc(bug)
    else:
        Log.alert("can not handle {{bug_id}}: {{line}}", line=bug.short_desc, bug_id=bug.bug_id)

    if bug.result.test.lower().startswith("intermittent "):
        bug.result.test = bug.result.test[13:]
Example #18
    def _dispatch_work(self, source_block):
        """
        source_block POINTS TO THE bucket AND key TO PROCESS
        :return: False IF THERE IS NOTHING LEFT TO DO
        """
        source_keys = listwrap(coalesce(source_block.key, source_block.keys))

        if not isinstance(source_block.bucket, basestring):  # FIX MISTAKE
            source_block.bucket = source_block.bucket.bucket
        bucket = source_block.bucket
        work_actions = [
            w for w in self.settings.workers if w.source.bucket == bucket
        ]

        if not work_actions:
            Log.note(
                "No worker defined for records from {{bucket}}, {{action}}.\n{{message|indent}}",
                bucket=source_block.bucket,
                message=source_block,
                action="skipping"
                if self.settings.keep_unknown_on_queue else "deleting")
            return not self.settings.keep_unknown_on_queue

        for action in work_actions:
            try:
                source_key = unicode(source_keys[0])
                if len(source_keys) > 1:
                    multi_source = action._source
                    source = ConcatSources(
                        [multi_source.get_key(k) for k in source_keys])
                    source_key = MIN(source_key)
                else:
                    source = action._source.get_key(source_key)
                    source_key = source.key

                Log.note("Execute {{action}} on bucket={{source}} key={{key}}",
                         action=action.name,
                         source=source_block.bucket,
                         key=source_key)

                if action.transform_type == "bulk":
                    old_keys = set()
                else:
                    old_keys = action._destination.keys(
                        prefix=source_block.key)

                new_keys = set(
                    action._transformer(source_key,
                                        source,
                                        action._destination,
                                        resources=self.resources,
                                        please_stop=self.please_stop))

                #VERIFY KEYS
                if len(new_keys) == 1 and list(new_keys)[0] == source_key:
                    pass  # ok
                else:
                    etls = map(key2etl, new_keys)
                    etls = qb.sort(etls, "id")
                    for i, e in enumerate(etls):
                        if i != e.id:
                            Log.error(
                                "expecting keys to have dense order: {{ids}}",
                                ids=etls.id)
                    #VERIFY KEYS EXIST
                    if hasattr(action._destination, "get_key"):
                        for k in new_keys:
                            action._destination.get_key(k)

                for n in action._notify:
                    for k in new_keys:
                        n.add(k)

                if action.transform_type == "bulk":
                    continue

                # DUE TO BUGS THIS INVARIANT IS NOW BROKEN
                # TODO: FIGURE OUT HOW TO FIX THIS (CHANGE NAME OF THE SOURCE BLOCK KEY?)
                # for n in new_keys:
                #     if not n.startswith(source_key):
                #         Log.error("Expecting new keys ({{new_key}}) to start with source key ({{source_key}})",  new_key= n,  source_key= source_key)

                if not new_keys and old_keys:
                    Log.alert(
                        "Expecting some new keys after etl of {{source_key}}, especially since there were old ones\n{{old_keys}}",
                        old_keys=old_keys,
                        source_key=source_key)
                    continue
                elif not new_keys:
                    Log.alert(
                        "Expecting some new keys after processing {{source_key}}",
                        old_keys=old_keys,
                        source_key=source_key)
                    continue

                for k in new_keys:
                    if len(k.split(".")
                           ) == 3 and action.destination.type != "test_result":
                        Log.error(
                            "two dots have not been needed yet, this is a consitency check"
                        )

                delete_me = old_keys - new_keys
                if delete_me:
                    if action.destination.bucket == "ekyle-test-result":
                        for k in delete_me:
                            action._destination.delete_key(k)
                    else:
                        Log.note("delete keys?\n{{list}}",
                                 list=sorted(delete_me))
                        # for k in delete_me:
                # WE DO NOT PUT KEYS ON WORK QUEUE IF ALREADY NOTIFYING SOME OTHER
                # AND NOT GOING TO AN S3 BUCKET
                if not action._notify and isinstance(
                        action._destination, (aws.s3.Bucket, S3Bucket)):
                    for k in old_keys | new_keys:
                        self.work_queue.add(
                            Dict(bucket=action.destination.bucket, key=k))
            except Exception, e:
                if "Key {{key}} does not exist" in e:
                    err = Log.warning
                elif "multiple keys in {{bucket}}" in e:
                    err = Log.warning
                    if source_block.bucket == "ekyle-test-result":
                        for k in action._source.list(
                                prefix=key_prefix(source_key)):
                            action._source.delete_key(strip_extension(k.key))
                elif "expecting keys to have dense order" in e:
                    err = Log.warning
                    if source_block.bucket == "ekyle-test-result":
                        # WE KNOW OF THIS ETL MISTAKE, REPROCESS
                        self.work_queue.add({
                            "key":
                            unicode(key_prefix(source_key)),
                            "bucket":
                            "ekyle-pulse-logger"
                        })
                elif "Expecting a pure key" in e:
                    err = Log.warning
                else:
                    err = Log.error

                err(
                    "Problem transforming {{action}} on bucket={{source}} key={{key}} to destination={{destination}}",
                    {
                        "action":
                        action.name,
                        "source":
                        source_block.bucket,
                        "key":
                        source_key,
                        "destination":
                        coalesce(action.destination.name,
                                 action.destination.index)
                    }, e)
Example #19
    buildbot_summary.etl = {
        "id": 0,
        "name": "unittest",
        "timestamp": Date.now().unix,
        "source": etl_header,
        "type": "join",
        "revision": get_git_revision(),
        "duration": timer.duration
    }
    buildbot_summary.run.stats = summary.stats
    buildbot_summary.run.stats.duration = summary.stats.end_time - summary.stats.start_time

    if DEBUG:
        age = Date.now() - Date(buildbot_summary.run.stats.start_time)
        if age > Duration.DAY:
            Log.alert("Test is {{days|round(decimal=1)}} days old",
                      days=age / Duration.DAY)
        Log.note("Done\n{{data|indent}}", data=buildbot_summary.run.stats)

    new_keys = []
    new_data = []

    if not summary.tests:
        key = source_key + ".0"
        new_keys.append(key)

        new_data.append({"id": key, "value": buildbot_summary})
    else:
        for i, t in enumerate(summary.tests):
            key = source_key + "." + unicode(i)
            new_keys.append(key)
Example #20
    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in ["object", "nested"]:
            Log.error("not supported")
        try:
            if c.table == "meta.columns":
                with self.columns.locker:
                    partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.columns, c.abs_name) if g[c.abs_name] != None])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                    })
                return
            if c.table == "meta.tables":
                with self.columns.locker:
                    partitions = qb.sort([g[c.abs_name] for g, _ in qb.groupby(self.tables, c.abs_name) if g[c.abs_name] != None])
                    self.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return

            es_index = c.table.split(".")[0]
            result = self.default_es.post("/"+es_index+"/_search", data={
                "aggs": {c.name: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value)
            if cardinality == None:
                Log.error("logic error")

            query = Dict(size=0)
            if c.type in ["object", "nested"]:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.columns.locker:
                    self.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return
            elif c.nested_path:
                query.aggs[literal_field(c.name)] = {
                    "nested": {"path": listwrap(c.nested_path)[0]},
                    "aggs": {"_nested": {"terms": {"field": c.abs_name, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.name)] = {"terms": {"field": c.abs_name, "size": 0}}

            result = self.default_es.post("/"+es_index+"/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = qb.sort(aggs._nested.buckets.key)
            else:
                parts = qb.sort(aggs.buckets.key)

            Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.columns.locker:
                self.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
        except Exception, e:
            if "IndexMissingException" in e and c.table.startswith("testing"):
                Log.alert("{{col.table}} does not exist", col=c)
            else:
                self.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"table": c.table, "abs_name": c.abs_name}}
                })
                Log.warning("Could not get {{col.table}}.{{col.abs_name}} info", col=c, cause=e)
Example #21
        output_bytes = convert.unicode2utf8(convert.value2json(result))
        return wrap({
            "status_code": 200,
            "all_content": output_bytes,
            "content": output_bytes
        })


global_settings = jsons.ref.get("file://tests/config/elasticsearch.json")
constants.set(global_settings.constants)
NEXT = 0

container_types = Dict(elasticsearch=ESUtils, sqlite=SQLiteUtils)

# read_alternate_settings
utils = None
try:
    filename = os.environ.get("TEST_CONFIG")
    if filename:
        global_settings = jsons.ref.get("file://" + filename)
    else:
        Log.alert(
            "No TEST_CONFIG environment variable to point to config file.  Using /tests/config/elasticsearch.json"
        )

    if not global_settings.use:
        Log.error('Must have a {"use": type} set in the config file')
    utils = container_types[global_settings.use](global_settings)
except Exception, e:
    Log.warning("problem", e)
Example #22
    def _dispatch_work(self, source_block):
        """
        source_block POINTS TO THE bucket AND key TO PROCESS
        :return: False IF THERE IS NOTHING LEFT TO DO
        """
        source_keys = listwrap(coalesce(source_block.key, source_block.keys))

        if not isinstance(source_block.bucket, basestring):  # FIX MISTAKE
            source_block.bucket = source_block.bucket.bucket
        bucket = source_block.bucket
        work_actions = [w for w in self.settings.workers if w.source.bucket == bucket]

        if not work_actions:
            Log.note("No worker defined for records from {{bucket}}, {{action}}.\n{{message|indent}}",
                bucket= source_block.bucket,
                message= source_block,
                action= "skipping" if self.settings.keep_unknown_on_queue else "deleting")
            return not self.settings.keep_unknown_on_queue

        for action in work_actions:
            try:
                source_key = unicode(source_keys[0])
                if len(source_keys) > 1:
                    multi_source = action._source
                    source = ConcatSources([multi_source.get_key(k) for k in source_keys])
                    source_key = MIN(source_key)
                else:
                    source = action._source.get_key(source_key)
                    source_key = source.key

                Log.note("Execute {{action}} on bucket={{source}} key={{key}}",
                    action= action.name,
                    source= source_block.bucket,
                    key= source_key)

                if action.transform_type == "bulk":
                    old_keys = set()
                else:
                    old_keys = action._destination.keys(prefix=source_block.key)

                new_keys = set(action._transformer(source_key, source, action._destination, resources=self.resources, please_stop=self.please_stop))

                #VERIFY KEYS
                if len(new_keys) == 1 and list(new_keys)[0] == source_key:
                    pass  # ok
                else:
                    etls = map(key2etl, new_keys)
                    etls = qb.sort(etls, "id")
                    for i, e in enumerate(etls):
                        if i != e.id:
                            Log.error("expecting keys to have dense order: {{ids}}", ids=etls.id)
                    #VERIFY KEYS EXIST
                    if hasattr(action._destination, "get_key"):
                        for k in new_keys:
                            action._destination.get_key(k)

                for n in action._notify:
                    for k in new_keys:
                        n.add(k)

                if action.transform_type == "bulk":
                    continue

                # DUE TO BUGS THIS INVARIANT IS NOW BROKEN
                # TODO: FIGURE OUT HOW TO FIX THIS (CHANGE NAME OF THE SOURCE BLOCK KEY?)
                # for n in new_keys:
                #     if not n.startswith(source_key):
                #         Log.error("Expecting new keys ({{new_key}}) to start with source key ({{source_key}})",  new_key= n,  source_key= source_key)

                if not new_keys and old_keys:
                    Log.alert("Expecting some new keys after etl of {{source_key}}, especially since there were old ones\n{{old_keys}}",
                        old_keys= old_keys,
                        source_key= source_key)
                    continue
                elif not new_keys:
                    Log.alert("Expecting some new keys after processing {{source_key}}",
                        old_keys= old_keys,
                        source_key= source_key)
                    continue

                for k in new_keys:
                    if len(k.split(".")) == 3 and action.destination.type!="test_result":
                        Log.error("two dots have not been needed yet, this is a consitency check")

                delete_me = old_keys - new_keys
                if delete_me:
                    if action.destination.bucket == "ekyle-test-result":
                        for k in delete_me:
                            action._destination.delete_key(k)
                    else:
                        Log.note("delete keys?\n{{list}}",  list= sorted(delete_me))
                        # for k in delete_me:
                # WE DO NOT PUT KEYS ON WORK QUEUE IF ALREADY NOTIFYING SOME OTHER
                # AND NOT GOING TO AN S3 BUCKET
                if not action._notify and isinstance(action._destination, (aws.s3.Bucket, S3Bucket)):
                    for k in old_keys | new_keys:
                        self.work_queue.add(Dict(
                            bucket=action.destination.bucket,
                            key=k
                        ))
            except Exception, e:
                if "Key {{key}} does not exist" in e:
                    err = Log.warning
                elif "multiple keys in {{bucket}}" in e:
                    err = Log.warning
                    if source_block.bucket=="ekyle-test-result":
                        for k in action._source.list(prefix=key_prefix(source_key)):
                            action._source.delete_key(strip_extension(k.key))
                elif "expecting keys to have dense order" in e:
                    err = Log.warning
                    if source_block.bucket=="ekyle-test-result":
                        # WE KNOW OF THIS ETL MISTAKE, REPROCESS
                        self.work_queue.add({
                            "key": unicode(key_prefix(source_key)),
                            "bucket": "ekyle-pulse-logger"
                        })
                elif "Expecting a pure key" in e:
                    err = Log.warning
                else:
                    err = Log.error

                err("Problem transforming {{action}} on bucket={{source}} key={{key}} to destination={{destination}}", {
                    "action": action.name,
                    "source": source_block.bucket,
                    "key": source_key,
                    "destination": coalesce(action.destination.name, action.destination.index)
                }, e)
Example #23
class Index(Features):
    """
    AN ElasticSearch INDEX LIFETIME MANAGEMENT TOOL

    ElasticSearch'S REST INTERFACE WORKS WELL WITH PYTHON AND JAVASCRIPT
    SO HARDLY ANY LIBRARY IS REQUIRED.  IT IS SIMPLER TO MAKE HTTP CALLS
    DIRECTLY TO ES USING YOUR FAVORITE HTTP LIBRARY.  I HAVE SOME
    CONVENIENCE FUNCTIONS HERE, BUT IT'S BETTER TO MAKE YOUR OWN.

    THIS CLASS IS TO HELP DURING ETL, CREATING INDEXES, MANAGING ALIASES
    AND REMOVING INDEXES WHEN THEY HAVE BEEN REPLACED.  IT USES A STANDARD
    SUFFIX (YYYYMMDD-HHMMSS) TO TRACK AGE AND RELATIONSHIP TO THE ALIAS,
    IF ANY YET.

    """
    @use_settings
    def __init__(
        self,
        index,  # NAME OF THE INDEX, EITHER ALIAS NAME OR FULL VERSION NAME
        type=None,  # SCHEMA NAME, (DEFAULT TO TYPE IN INDEX, IF ONLY ONE)
        alias=None,
        explore_metadata=True,  # PROBING THE CLUSTER FOR METADATA IS ALLOWED
        read_only=True,
        tjson=False,  # STORED AS TYPED JSON
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        debug=False,  # DO NOT SHOW THE DEBUG STATEMENTS
        settings=None
    ):
        if index==None:
            Log.error("not allowed")
        if index == alias:
            Log.error("must have a unique index name")

        self.cluster_state = None
        self.debug = debug
        self.settings = settings
        self.cluster = Cluster(settings)

        try:
            full_index = self.get_index(index)
            if full_index and alias==None:
                settings.alias = settings.index
                settings.index = full_index
            if full_index==None:
                Log.error("not allowed")
            if type == None:
                # NO type PROVIDED, MAYBE THERE IS A SUITABLE DEFAULT?
                with self.cluster.metadata_locker:
                    index_ = self.cluster._metadata.indices[self.settings.index]
                if not index_:
                    indices = self.cluster.get_metadata().indices
                    index_ = indices[self.settings.index]

                candidate_types = list(index_.mappings.keys())
                if len(candidate_types) != 1:
                    Log.error("Expecting `type` parameter")
                self.settings.type = type = candidate_types[0]
        except Exception, e:
            # EXPLORING (get_metadata()) IS NOT ALLOWED ON THE PUBLIC CLUSTER
            Log.error("not expected", cause=e)

        if not type:
            Log.error("not allowed")

        self.path = "/" + full_index + "/" + type

        if self.debug:
            Log.alert("elasticsearch debugging for {{url}} is on", url=self.url)
Example #24
class Cluster(object):

    @use_settings
    def __new__(cls, host, port=9200, settings=None):
        if not isinstance(port, int):
            Log.error("port must be integer")
        cluster = known_clusters.get((host, port))
        if cluster:
            return cluster

        cluster = object.__new__(cls)
        known_clusters[(host, port)] = cluster
        return cluster

    @use_settings
    def __init__(self, host, port=9200, explore_metadata=True, settings=None):
        """
        settings.explore_metadata == True - IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
        settings.timeout == NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        """
        if hasattr(self, "settings"):
            return

        self.settings = settings
        self.cluster_state = None
        self._metadata = None
        self.metadata_locker = Lock()
        self.debug = settings.debug
        self.version = None
        self.path = settings.host + ":" + unicode(settings.port)
        self.get_metadata()

    @use_settings
    def get_or_create_index(
        self,
        index,
        alias=None,
        schema=None,
        limit_replicas=None,
        read_only=False,
        tjson=False,
        settings=None
    ):
        best = self._get_best(settings)
        if not best:
            output = self.create_index(settings=settings, schema=schema, limit_replicas=limit_replicas)
            return output
        elif best.alias != None:
            settings.alias = best.alias
            settings.index = best.index
        elif settings.alias == None:
            settings.alias = settings.index
            settings.index = best.index

        index = settings.index
        meta = self.get_metadata()
        columns = parse_properties(index, [], meta.indices[index].mappings.values()[0].properties)
        if len(columns)!=0:
            settings.tjson = tjson or any(c.name.endswith("$value") for c in columns)

        return Index(settings)

    def _get_best(self, settings):
        from pyLibrary.queries import jx
        aliases = self.get_aliases()
        indexes = jx.sort([
            a
            for a in aliases
            if (a.alias == settings.index and settings.alias == None) or
            (re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or
            (a.index == settings.index and (a.alias == None or a.alias == settings.alias))
        ], "index")
        return indexes.last()

    @use_settings
    def get_index(self, index, type=None, alias=None, read_only=True, settings=None):
        """
        TESTS THAT THE INDEX EXISTS BEFORE RETURNING A HANDLE
        """
        if read_only:
            # GET EXACT MATCH, OR ALIAS
            aliases = self.get_aliases()
            if index in aliases.index:
                return Index(settings)
            if index in aliases.alias:
                match = [a for a in aliases if a.alias == index][0]
                settings.alias = match.alias
                settings.index = match.index
                return Index(settings)
            Log.error("Can not find index {{index_name}}", index_name=settings.index)
        else:
            # GET BEST MATCH, INCLUDING PROTOTYPE
            best = self._get_best(settings)
            if not best:
                Log.error("Can not find index {{index_name}}", index_name=settings.index)

            if best.alias != None:
                settings.alias = best.alias
                settings.index = best.index
            elif settings.alias == None:
                settings.alias = settings.index
                settings.index = best.index
            return Index(settings)

    def get_alias(self, alias):
        """
        RETURN REFERENCE TO ALIAS (MANY INDEXES)
        USER MUST BE SURE NOT TO SEND UPDATES
        """
        aliases = self.get_aliases()
        if alias in aliases.alias:
            settings = self.settings.copy()
            settings.alias = alias
            settings.index = alias
            return Index(read_only=True, settings=settings)
        Log.error("Can not find any index with alias {{alias_name}}",  alias_name= alias)

    def get_prototype(self, alias):
        """
        RETURN ALL INDEXES THAT ARE INTENDED TO BE GIVEN alias, BUT HAVE NO
        ALIAS YET BECAUSE INCOMPLETE
        """
        output = sort([
            a.index
            for a in self.get_aliases()
            if re.match(re.escape(alias) + "\\d{8}_\\d{6}", a.index) and not a.alias
        ])
        return output

    @use_settings
    def create_index(
        self,
        index,
        alias=None,
        schema=None,
        limit_replicas=None,
        read_only=False,
        tjson=False,
        settings=None
    ):
        if not settings.alias:
            settings.alias = settings.index
            index = settings.index = proto_name(settings.alias)

        if settings.alias == index:
            Log.error("Expecting index name to conform to pattern")

        if settings.schema_file:
            Log.error('schema_file attribute not supported.  Use {"$ref":<filename>} instead')

        if schema == None:
            Log.error("Expecting a schema")
        elif isinstance(schema, basestring):
            schema = convert.json2value(schema, leaves=True)
        else:
            schema = convert.json2value(convert.value2json(schema), leaves=True)

        if limit_replicas:
            # DO NOT ASK FOR TOO MANY REPLICAS
            health = self.get("/_cluster/health")
            if schema.settings.index.number_of_replicas >= health.number_of_nodes:
                Log.warning("Reduced number of replicas: {{from}} requested, {{to}} realized",
                    {"from": schema.settings.index.number_of_replicas},
                    to= health.number_of_nodes - 1
                )
                schema.settings.index.number_of_replicas = health.number_of_nodes - 1

        self.post(
            "/" + index,
            data=schema,
            headers={"Content-Type": "application/json"}
        )

        # CONFIRM INDEX EXISTS
        while True:
            try:
                state = self.get("/_cluster/state")
                if index in state.metadata.indices:
                    break
                Log.note("Waiting for index {{index}} to appear", index=index)
            except Exception, e:
                Log.warning("Problem while waiting for index {{index}} to appear", index=index, cause=e)
            Thread.sleep(seconds=1)
        Log.alert("Made new index {{index|quote}}", index=index)

        es = Index(settings=settings)
        return es
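Every example above calls Log.alert() the same way: a moustache-style template string plus keyword arguments that fill its {{placeholders}}, the same convention these code bases use for Log.note(), Log.warning() and Log.error(). The sketch below restates that pattern in isolation; the import path, function name and parameters are illustrative assumptions, not taken from any one example.

    # Minimal sketch of the template-plus-keywords pattern shared by the
    # Log.alert() calls above.
    # ASSUMPTION: the import path is a guess at where Log lives in these code
    # bases; point it at your project's logging module.
    from pyLibrary.debugs.logs import Log

    def report_queue(queue_name, pending):
        if not pending:
            # {{name}} is filled from the keyword argument of the same name
            Log.alert("queue {{name}} is empty", name=queue_name)
            return
        Log.note("{{num}} messages pending on {{name}}", num=len(pending), name=queue_name)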