Example #1
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    cluster = elasticsearch.Cluster(branches)
    try:
        es = cluster.get_index(kwargs=branches, read_only=False)
        esq = jx_elasticsearch.new_instance(branches)
        found_branches = esq.query({"from": "branches", "format": "list", "limit": 10000}).data

        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        e = Except.wrap(e)
        if "Can not find index " in e:
            set_default(branches, {"schema": branches_schema})
            es = cluster.get_or_create_index(branches)
            es.add_alias()
            return get_branches(kwargs=kwargs)
        Log.error("problem getting branches", cause=e)
Example #2
 def test_big_result_works(self):
     result = http.post_json(global_settings.testing.query,
                             data={
                                 "from": "unittest",
                                 "where": {
                                     "and": [{
                                         "gte": {
                                             "run.timestamp":
                                             Date.today() - DAY
                                         }
                                     }, {
                                         "lt": {
                                             "run.timestamp": Date.today()
                                         }
                                     }, {
                                         "eq": {
                                             "result.ok": False
                                         }
                                     }]
                                 },
                                 "format": "list",
                                 "limit": 10000
                             })
     if result.template:
         result = Except.new_instance(result)
         Log.error("problem with call", cause=result)
     Log.note("Got {{num}} test failures", num=len(result.data))
Example #3
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
Example #4
 def __init__(self,
              host,
              index,
              type="query",
              max_size=10,
              batch_size=10,
              kwargs=None):
     """
     settings ARE FOR THE ELASTICSEARCH INDEX
     """
     es = Cluster(kwargs).get_or_create_index(
         schema=convert.json2value(convert.value2json(SCHEMA), leaves=True),
         limit_replicas=True,
         kwargs=kwargs
     )
     #ENSURE THE TYPE EXISTS FOR PROBING
     try:
         es.add({
             "id": "dummy",
             "value": {
                 "hash": "dummy",
                 "create_time": Date.now(),
                 "last_used": Date.now(),
                 "query": {}
             }
         })
     except Exception as e:
         Log.warning("Problem saving query", cause=e)
Example #5
    def not_monitor(self, please_stop):
        Log.alert("metadata scan has been disabled")
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            c = self.todo.pop()
            if c == THREAD_STOP:
                break

            if not c.last_updated or c.last_updated >= Date.now()-TOO_OLD:
                continue

            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear":[
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            if DEBUG:
                Log.note("Could not get {{col.es_index}}.{{col.es_column}} info", col=c)
Example #6
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "time"
        self.NULL = Null
        self.min = Date(self.min)
        self.max = Date(self.max)
        self.interval = Duration(self.interval)

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            Log.error("not implemented yet")

            # VERIFY PARTITIONS DO NOT OVERLAP
            return
        elif not all([self.min, self.max, self.interval]):
            Log.error("Can not handle missing parameter")

        self.key = "min"
        self.partitions = wrap([
            {"min": v, "max": v + self.interval, "dataIndex": i}
            for i, v in enumerate(Date.range(self.min, self.max, self.interval))
        ])
Example #7
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                table = Table(
                    name=short_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)
Example #8
def get_branches(hg, branches, kwargs=None):
    # TRY ES
    try:
        es = elasticsearch.Cluster(kwargs=branches).get_index(kwargs=branches, read_only=False)

        query = {
            "query": {"match_all": {}},
            "size": 10000
        }

        found_branches = es.search(query).hits.hits._source
        # IF IT IS TOO OLD, THEN PULL FROM HG
        oldest = Date(MAX(found_branches.etl.timestamp))
        if oldest == None or Date.now() - oldest > OLD_BRANCH:
            found_branches = _get_branches_from_hg(hg)
            es.extend({"id": b.name + " " + b.locale, "value": b} for b in found_branches)
            es.flush()

        try:
            return UniqueIndex(["name", "locale"], data=found_branches, fail_on_dup=False)
        except Exception as e:
            Log.error("Bad branch in ES index", cause=e)
    except Exception as e:
        if "Can not find index " in e:
            set_default(branches, {"schema": branches_schema})
            es = elasticsearch.Cluster(kwargs=branches).get_or_create_index(kwargs=branches)
            es.add_alias()
            return get_branches(kwargs=kwargs)
        Log.error("problem getting branches", cause=e)
Example #9
 def test_create_date(self):
     if PY3:
         from datetime import timezone
         test = Date(datetime(2020, 3, 21, 0, 0, 0, 0, timezone.utc))
     else:
         test = Date(datetime(2020, 3, 21, 0, 0, 0, 0))
     self.assertEqual(test, 1584748800)
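The expected value in this assertion can be checked with the standard library alone; a minimal sketch using only `datetime` (not the `Date` wrapper):

from datetime import datetime, timezone

# 2020-03-21 00:00:00 UTC, EXPRESSED AS SECONDS SINCE THE EPOCH
assert datetime(2020, 3, 21, tzinfo=timezone.utc).timestamp() == 1584748800.0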
Example #10
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "time"
        self.NULL = Null
        self.min = Date(self.min)
        self.max = Date(self.max)
        self.interval = Duration(self.interval)
        self.sort = Null

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            Log.error("not implemented yet")

            # VERIFY PARTITIONS DO NOT OVERLAP
            return

        self.verify_attributes_not_null(["min", "max", "interval"])
        self.key = "min"
        self.partitions = wrap([
            {"min": v, "max": v + self.interval, "dataIndex": i}
            for i, v in enumerate(Date.range(self.min, self.max, self.interval))
        ])
Example #11
    def _db_load(self):
        self.last_load = Date.now()

        try:
            self.es_index = self.es_cluster.get_index(
                id=ID,
                index=META_COLUMNS_NAME,
                type=META_COLUMNS_TYPE_NAME,
                read_only=False)

            result = self.es_index.search({
                "query": {
                    "bool": {
                        "should": [
                            {
                                "bool": {
                                    "must_not": {
                                        "exists": {
                                            "field": "cardinality.~n~"
                                        }
                                    }
                                }
                            },
                            {  # ASSUME UNUSED COLUMNS DO NOT EXIST
                                "range": {
                                    "cardinality.~n~": {
                                        "gte": 0
                                    }
                                }
                            },
                        ]
                    }
                },
                "sort": ["es_index.~s~", "name.~s~", "es_column.~s~"],
                "size":
                10000,
            })

            with Timer("adding columns to structure"):
                for r in result.hits.hits._source:
                    col = doc_to_column(r)
                    if col:
                        self._add(col)

            Log.note("{{num}} columns loaded", num=result.hits.total)
            if not self.data.get(META_COLUMNS_NAME):
                Log.error("metadata missing from index!")

        except Exception as e:
            metadata = self.es_cluster.get_metadata(after=Date.now())
            if any(
                    index.startswith(META_COLUMNS_NAME)
                    for index in metadata.indices.keys()):
                Log.error("metadata already exists!", cause=e)

            Log.warning("no {{index}} exists, making one",
                        index=META_COLUMNS_NAME,
                        cause=e)
            self._db_create()
Example #12
def parse_hg_date(date):
    if isinstance(date, text_type):
        return Date(date)
    elif isinstance(date, list):
        # FIRST ELEMENT OF THE (timestamp, time_zone) TUPLE, WHERE timestamp IS GMT
        return Date(date[0])
    else:
        Log.error("Can not deal with date like {{date|json}}", date=date)
Example #13
    def _get_queue(self, row):
        row = wrap(row)
        if row.json:
            row.value, row.json = json2value(row.json), None
        timestamp = Date(self.rollover_field(row.value))
        if timestamp == None:
            return Null
        elif timestamp < Date.today() - self.rollover_max:
            return DATA_TOO_OLD

        rounded_timestamp = timestamp.floor(self.rollover_interval)
        with self.locker:
            queue = self.known_queues.get(rounded_timestamp.unix)
        if queue == None:
            candidates = sort_using_key(
                filter(
                    lambda r: re.match(
                        re.escape(self.settings.index) + r"\d\d\d\d\d\d\d\d_\d\d\d\d\d\d$",
                        r['index']
                    ),
                    self.cluster.get_aliases()
                ),
                key=lambda r: r['index']
            )
            best = None
            for c in candidates:
                c = wrap(c)
                c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
                if timestamp > c.date:
                    best = c
            if not best or rounded_timestamp > best.date:
                if rounded_timestamp < wrap(candidates[-1]).date:
                    es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
                else:
                    try:
                        es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                        es.add_alias(self.settings.index)
                    except Exception as e:
                        e = Except.wrap(e)
                        if "IndexAlreadyExistsException" not in e:
                            Log.error("Problem creating index", cause=e)
                        return self._get_queue(row)  # TRY AGAIN
            else:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

            def refresh(please_stop):
                try:
                    es.set_refresh_interval(seconds=60 * 10, timeout=5)
                except Exception:
                    Log.note("Could not set refresh interval for {{index}}", index=es.settings.index)

            Thread.run("refresh", refresh)

            self._delete_old_indexes(candidates)
            threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
            with self.locker:
                queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
        return queue
Example #14
    def test_timing(self):
        if self.not_real_service():
            return

        test = wrap({
            "query": {
                "from": {
                    "type": "elasticsearch",
                    "settings": {
                        "host": ES_CLUSTER_LOCATION,
                        "index": "unittest",
                        "type": "test_result"
                    }
                },
                "select": [{
                    "name": "count",
                    "value": "run.duration",
                    "aggregate": "count"
                }, {
                    "name": "total",
                    "value": "run.duration",
                    "aggregate": "sum"
                }],
                "edges": [{
                    "name": "chunk",
                    "value": ["run.suite", "run.chunk"]
                }, "result.ok"],
                "where": {
                    "and": [{
                        "lt": {
                            "timestamp": Date.floor(Date.now()).milli / 1000
                        }
                    }, {
                        "gte": {
                            "timestamp":
                            Date.floor(Date.now() - (Duration.DAY * 7),
                                       Duration.DAY).milli / 1000
                        }
                    }]
                },
                "format":
                "cube",
                "samples": {
                    "limit": 30
                }
            }
        })

        query = unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.testing.query, data=query)
            if response.status_code != 200:
                error(response)
        result = json2value(utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #15
    def test_timeout(self):
        def test(please_stop):
            Till(seconds=10).wait()

        now = Date.now()
        thread = Thread.run("sleeper", test)
        Till(seconds=0.5).wait()
        thread.stop()
        self.assertGreater(now.unix+1, Date.now().unix, "Expecting quick stop")
        Log.note("done")
Example #16
def doc_to_column(doc):
    try:
        doc = wrap(untyped(doc))

        # I HAVE MANAGED TO MAKE MANY MISTAKES WRITING COLUMNS TO ES. HERE ARE THE FIXES

        # FIX
        if not doc.last_updated:
            doc.last_updated = Date.now() - YEAR

        # FIX
        if doc.es_type == None:
            if doc.jx_type == OBJECT:
                doc.es_type = "object"
            else:
                Log.warning("{{doc}} has no es_type", doc=doc)

        # FIX
        doc.multi = 1001 if doc.es_type == "nested" else doc.multi

        # FIX
        doc.nested_path = tuple(listwrap(doc.nested_path))
        if last(split_field(
                doc.es_column)) == NESTED_TYPE and doc.es_type != "nested":
            doc.es_type = "nested"
            doc.jx_type = NESTED
            doc.multi = 1001
            doc.last_updated = Date.now()

        # FIX
        expected_nested_path = get_nested_path(doc.es_column)
        if len(doc.nested_path) > 1 and doc.nested_path[-2] == '.':
            doc.nested_path = doc.nested_path[:-1]

        # FIX
        if untype_path(doc.es_column) == doc.es_column:
            if doc.nested_path != (".", ):
                if doc.es_index in {"repo"}:
                    pass
                else:
                    Log.note("not expected")
                    doc.nested_path = expected_nested_path
        else:
            if doc.nested_path != expected_nested_path:
                doc.nested_path = expected_nested_path

        # FIX
        if last(split_field(doc.es_column)) == EXISTS_TYPE:
            doc.jx_type = EXISTS

        return Column(**doc)
    except Exception:
        doc.nested_path = ["."]
        mark_as_deleted(Column(**doc))
        return None
Example #17
    def save(self, query):
        query.meta = None
        json = convert.value2json(query)
        hash = convert.unicode2utf8(json)

        # TRY MANY HASHES AT ONCE
        hashes = [None] * HASH_BLOCK_SIZE
        for i in range(HASH_BLOCK_SIZE):
            hash = hashlib.sha1(hash).digest()
            hashes[i] = hash

        short_hashes = [
            convert.bytes2base64(h[0:6]).replace("/", "_") for h in hashes
        ]
        available = {h: True for h in short_hashes}

        existing = self.es.query({
            "from": {
                "type": "elasticsearch",
                "settings": self.es.settings
            },
            "where": {
                "terms": {
                    "hash": short_hashes
                }
            },
            "meta": {
                "timeout": "2second"
            }
        })

        for e in Cube(select=existing.select,
                      edges=existing.edges,
                      data=existing.data).values():
            if e.query == json:
                return e.hash
            available[e.hash] = False

        # THIS WILL THROW AN ERROR IF THERE ARE NONE, HOW UNLUCKY!
        best = [h for h in short_hashes if available[h]][0]

        self.queue.add({
            "id": best,
            "value": {
                "hash": best,
                "create_time": Date.now(),
                "last_used": Date.now(),
                "query": json
            }
        })

        Log.note("Saved query as {{hash}}", hash=best)

        return best
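The block of candidate hashes above comes from repeatedly SHA-1 hashing the previous digest and keeping a URL-safe base64 prefix of each round; a self-contained sketch of that derivation (the function name is illustrative):

import hashlib
from base64 import b64encode

def short_hashes(json_text, block_size=10):
    # CHAIN SHA-1 OVER THE PREVIOUS DIGEST, KEEPING A 6-BYTE BASE64 PREFIX OF EACH ROUND
    digest = json_text.encode("utf8")
    out = []
    for _ in range(block_size):
        digest = hashlib.sha1(digest).digest()
        out.append(b64encode(digest[:6]).decode("ascii").replace("/", "_"))
    return out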
Example #18
 def setUpClass(self):
     # REMOVE OLD INDEXES
     cluster = elasticsearch.Cluster(test_jx.global_settings.backend_es)
     aliases = cluster.get_aliases()
     for a in aliases:
         try:
             if a.index.startswith("testing_"):
                 create_time = Date(a.index[-15:], "%Y%m%d_%H%M%S")  # EXAMPLE testing_0ef53e45b320160118_180420
                 if create_time < Date.now() - 10 * MINUTE:
                     cluster.delete_index(a.index)
          except Exception as e:
             Log.warning("Problem removing {{index|quote}}", index=a.index, cause=e)
Example #19
    def trigger_job(self):
        while not self.please_stop:
            now = Date.now()
            next = now + DAY

            for j in self.jobs:
                if j.next_run_time < now:
                    j.next_run_time = next_run(j)
                    self.run_job(j)

                next = Date.min(next, j.next_run_time)

            (Till(till=next) | self.please_stop).wait()
Example #20
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(
            query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(name=es_index_name,
                              url=None,
                              query_path=None,
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)
Example #21
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated <
                                Date.now() - TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note(
                                    "Old columns wth dates {{dates|json}}",
                                    dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue),
                                                list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10 * MINUTE).seconds))
                if column:
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}",
                                 table=column.es_index,
                                 column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now() - TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(
                                TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning(
                            "problem getting cardinality for {{column.name}}",
                            column=column,
                            cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #22
def next_run(job):
    if job.settings.start_next:
        formula = next_run(job.settings.start_next)
    elif job.settings.start_interval:
        formula = "now|" + job.settings.start_interval + "+" + job.settings.start_interval
    else:
        Log.error("Expecting `start_next` or `start_interval` for job {{job}}",
                  job=job.name)

    now = Date.now()
    next = Date(formula)
    if next < now:
        Log.error("{{formula|quote}} does not calculate a future date")
    return next
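The "now|interval + interval" formula amounts to flooring the current time to the interval boundary and adding one interval; a rough stand-in with plain unix arithmetic (this sketch assumes a fixed interval in seconds and skips the formula parser entirely):

import time
from datetime import datetime, timezone

def next_run_simple(interval_seconds):
    # FLOOR "NOW" TO THE INTERVAL BOUNDARY, THEN ADD ONE INTERVAL
    now = time.time()
    next_ts = (int(now) // interval_seconds) * interval_seconds + interval_seconds
    return datetime.fromtimestamp(next_ts, tz=timezone.utc)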
Example #23
 def test_time_variables(self):
     test = {
         "metadata": {},
         "data": test_data_1,
         "query": {
             "from": TEST_TABLE,
             "edges": [
                 {
                     "value": "t",
                     "domain": {
                         "type": "time",
                         "min": "today-week",
                         "max": "today",
                         "interval": "day"
                     }
                 }
             ],
             "select": {
                 "value": "v", "aggregate": "sum"
             }
         },
         "expecting_list": {
             "meta": {"format": "list"},
             "data": [r for r in expected_list_1]
         },
         "expecting_table": {
             "meta": {"format": "table"},
             "header": ["t", "v"],
             "data": [[r.t, r.v] for r in expected_list_1]
         },
         "expecting_cube": {
             "meta": {"format": "cube"},
             "edges": [
                 {
                     "name": "t",
                     "domain": {
                         "type": "time",
                         "key": "min",
                         "min": Date("today-week").unix,
                         "max": TODAY.unix,
                         "interval": DAY.seconds,
                         "partitions": [{"min": r.t, "max": (Date(r.t) + DAY).unix} for r in expected_list_1 if r.t != None]
                     }
                 }
             ],
             "data": {"v": [r.v for r in expected_list_1]}
         }
     }
     self.utils.execute_tests(test)
Example #24
    def _delete_old_indexes(self, candidates):
        for c in candidates:
            timestamp = unicode2Date(c.index[-15:], "%Y%m%d_%H%M%S")
            if timestamp + self.rollover_interval < Date.today() - self.rollover_max:
                # Log.warning("Will delete {{index}}", index=c.index)
                try:
                    self.cluster.delete_index(c.index)
                except Exception as e:
                    Log.warning("could not delete index {{index}}", index=c.index, cause=e)
        for t, q in items(self.known_queues):
            if unix2Date(t) + self.rollover_interval < Date.today() - self.rollover_max:
                with self.locker:
                    del self.known_queues[t]

        pass
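The expiry test above depends only on the timestamp suffix baked into each index name; a standalone version of that check (the %Y%m%d_%H%M%S suffix format is taken from the snippet, everything else is illustrative):

from datetime import datetime, timedelta

def is_expired(index_name, rollover_interval, rollover_max, now=None):
    # INDEX NAMES END WITH A %Y%m%d_%H%M%S SUFFIX, E.G. unittest20160118_180420
    created = datetime.strptime(index_name[-15:], "%Y%m%d_%H%M%S")
    now = now or datetime.utcnow()
    return created + rollover_interval < now - rollover_max

# EXAMPLE: WEEKLY INDEXES KEPT FOR FOUR WEEKS
is_expired("unittest20160118_180420", timedelta(weeks=1), timedelta(weeks=4))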
Example #25
    def _delete_old_indexes(self, candidates):
        for c in candidates:
            timestamp = unicode2Date(c.index[-15:], "%Y%m%d_%H%M%S")
            if timestamp + self.rollover_interval < Date.today() - self.rollover_max:
                # Log.warning("Will delete {{index}}", index=c.index)
                try:
                    self.cluster.delete_index(c.index)
                except Exception as e:
                    Log.warning("could not delete index {{index}}", index=c.index, cause=e)
        for t, q in list(self.known_queues.items()):
            if unix2Date(t) + self.rollover_interval < Date.today() - self.rollover_max:
                with self.locker:
                    del self.known_queues[t]

        pass
Example #26
    def _get_queue(self, row):
        row = wrap(row)
        if row.json:
            row.value, row.json = json2value(row.json), None
        timestamp = Date(self.rollover_field(row.value))
        if timestamp == None:
            return Null
        elif timestamp < Date.today() - self.rollover_max:
            return DATA_TOO_OLD

        rounded_timestamp = timestamp.floor(self.rollover_interval)
        with self.locker:
            queue = self.known_queues.get(rounded_timestamp.unix)
        if queue == None:
            candidates = jx.run({
                "from": ListContainer(".", self.cluster.get_aliases()),
                "where": {"regex": {"index": self.settings.index + "\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
                "sort": "index"
            })
            best = None
            for c in candidates:
                c = wrap(c)
                c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
                if timestamp > c.date:
                    best = c
            if not best or rounded_timestamp > best.date:
                if rounded_timestamp < wrap(candidates[-1]).date:
                    es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
                else:
                    try:
                        es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                        es.add_alias(self.settings.index)
                    except Exception as e:
                        e = Except.wrap(e)
                        if "IndexAlreadyExistsException" not in e:
                            Log.error("Problem creating index", cause=e)
                        return self._get_queue(row)  # TRY AGAIN
            else:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

            with suppress_exception:
                es.set_refresh_interval(seconds=60 * 5, timeout=5)

            self._delete_old_indexes(candidates)
            threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
            with self.locker:
                queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
        return queue
Example #27
    def _get_queue(self, row):
        row = wrap(row)
        if row.json:
            row.value, row.json = json2value(row.json), None
        timestamp = Date(self.rollover_field(row.value))
        if timestamp == None:
            return Null
        elif timestamp < Date.today() - self.rollover_max:
            return DATA_TOO_OLD

        rounded_timestamp = timestamp.floor(self.rollover_interval)
        with self.locker:
            queue = self.known_queues.get(rounded_timestamp.unix)
        if queue == None:
            candidates = jx.run({
                "from": ListContainer(".", self.cluster.get_aliases()),
                "where": {"regex": {"index": self.settings.index + "\d\d\d\d\d\d\d\d_\d\d\d\d\d\d"}},
                "sort": "index"
            })
            best = None
            for c in candidates:
                c = wrap(c)
                c.date = unicode2Date(c.index[-15:], elasticsearch.INDEX_DATE_FORMAT)
                if timestamp > c.date:
                    best = c
            if not best or rounded_timestamp > best.date:
                if rounded_timestamp < wrap(candidates[-1]).date:
                    es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)
                else:
                    try:
                        es = self.cluster.create_index(create_timestamp=rounded_timestamp, kwargs=self.settings)
                        es.add_alias(self.settings.index)
                    except Exception as e:
                        e = Except.wrap(e)
                        if "IndexAlreadyExistsException" not in e:
                            Log.error("Problem creating index", cause=e)
                        return self._get_queue(row)  # TRY AGAIN
            else:
                es = self.cluster.get_or_create_index(read_only=False, alias=best.alias, index=best.index, kwargs=self.settings)

            with suppress_exception:
                es.set_refresh_interval(seconds=60 * 5, timeout=5)

            self._delete_old_indexes(candidates)
            threaded_queue = es.threaded_queue(max_size=self.settings.queue_size, batch_size=self.settings.batch_size, silent=True)
            with self.locker:
                queue = self.known_queues[rounded_timestamp.unix] = threaded_queue
        return queue
Example #28
    def _update_from_es(self, please_stop):
        try:
            last_extract = Date.now()
            while not please_stop:
                now = Date.now()
                try:
                    if (now - last_extract).seconds > COLUMN_EXTRACT_PERIOD:
                        result = self.es_index.search({
                            "query": {
                                "range": {
                                    "last_updated.~n~": {
                                        "gte": self.last_load
                                    }
                                }
                            },
                            "sort":
                            ["es_index.~s~", "name.~s~", "es_column.~s~"],
                            "from":
                            0,
                            "size":
                            10000,
                        })
                        last_extract = now

                        with self.locker:
                            for r in result.hits.hits._source:
                                c = doc_to_column(r)
                                if c:
                                    self._add(c)
                                    self.last_load = MAX(
                                        (self.last_load, c.last_updated))

                    while not please_stop:
                        updates = self.for_es_update.pop_all()
                        if not updates:
                            break

                        DEBUG and updates and Log.note(
                            "{{num}} columns to push to db", num=len(updates))
                        self.es_index.extend([{
                            "value": column.__dict__()
                        } for column in updates])
                except Exception as e:
                    Log.warning("problem updating database", cause=e)

                (Till(seconds=COLUMN_LOAD_PERIOD) | please_stop).wait()
        finally:
            Log.note("done")
Example #29
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "time"
        self.NULL = Null
        self.min = Date(self.min)
        self.max = Date(self.max)
        self.interval = Duration(self.interval)
        self.sort = Null

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            Log.error("not implemented yet")

            # VERIFY PARTITIONS DO NOT OVERLAP
            return

        self.verify_attributes_not_null(["min", "max", "interval"])
        self.key = "min"
        self.partitions = wrap([
            {"min": v, "max": v + self.interval, "dataIndex": i}
            for i, v in enumerate(Date.range(self.min, self.max, self.interval))
        ])
Example #30
    def test_two_simple(self):
        today = Date.today()

        result = replace_vars('"{{today|week}}" "{{today}}d"')
        expect = '"' + unicode(today.floor(WEEK).unix) + '" "' + unicode(
            today.unix) + 'd"'
        self.assertEqual(result, expect)
Example #31
    def _upsert_column(self, c):
        # ASSUMING THE self.meta.columns.locker IS HELD
        existing_columns = self.meta.columns.find(c.es_index, c.names["."])
        if not existing_columns:
            self.meta.columns.add(c)
            self.todo.add(c)

            if ENABLE_META_SCAN:
                if DEBUG:
                    Log.note("todo: {{table}}::{{column}}", table=c.es_index, column=c.es_column)
                # MARK meta.columns AS DIRTY TOO
                cols = self.meta.columns.find("meta.columns", None)
                for cc in cols:
                    cc.partitions = cc.cardinality = None
                    cc.last_updated = Date.now()
                self.todo.extend(cols)
        else:
            canonical = existing_columns[0]
            if canonical is not c:
                set_default(c.names, canonical.names)
                for key in Column.__slots__:
                    canonical[key] = c[key]
            if DEBUG:
                Log.note("todo: {{table}}::{{column}}", table=canonical.es_index, column=canonical.es_column)
            self.todo.add(canonical)
Example #32
    def _update_meta(self):
        if not self.dirty:
            return

        now = Date.now()
        for mc in META_COLUMNS_DESC.columns:
            count = 0
            values = set()
            objects = 0
            multi = 1
            for column in self._all_columns():
                value = column[mc.name]
                if value == None:
                    pass
                else:
                    count += 1
                    if is_list(value):
                        multi = max(multi, len(value))
                        try:
                            values |= set(value)
                        except Exception:
                            objects += len(value)
                    elif is_data(value):
                        objects += 1
                    else:
                        values.add(value)
            mc.count = count
            mc.cardinality = len(values) + objects
            mc.partitions = jx.sort(values)
            mc.multi = multi
            mc.last_updated = now

        META_COLUMNS_DESC.last_updated = now
        self.dirty = False
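The per-column statistics above follow a simple accumulation; a stripped-down version over a plain list of values (this ignores the separate objects counter for unhashable values and the ColumnList internals):

def column_stats(values):
    # COUNT NON-NULL VALUES, DISTINCT VALUES, AND THE LARGEST MULTI-VALUE LENGTH
    count, distinct, multi = 0, set(), 1
    for v in values:
        if v is None:
            continue
        count += 1
        if isinstance(v, list):
            multi = max(multi, len(v))
            distinct |= set(v)
        else:
            distinct.add(v)
    return {"count": count, "cardinality": len(distinct), "multi": multi}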
Example #33
    def _db_load(self):
        self.last_load = Date.now()

        result = self._query(
            SQL_SELECT
            + "name"
            + SQL_FROM
            + "sqlite_master"
            + SQL_WHERE
            + SQL_AND.join(["name=" + db_table_name, "type=" + quote_value("table")])
        )
        if not result.data:
            self._db_create()
            return

        result = self._query(
            SQL_SELECT
            + all_columns
            + SQL_FROM
            + db_table_name
            + SQL_ORDERBY
            + sql_list(map(quote_column, ["es_index", "name", "es_column"]))
        )

        with self.locker:
            for r in result.data:
                c = row_to_column(result.header, r)
                self._add(c)
Example #34
    def _update_meta(self):
        if not self.dirty:
            return

        for mcl in self.data.get("meta.columns").values():
            for mc in mcl:
                count = 0
                values = set()
                objects = 0
                multi = 1
                for column in self._all_columns():
                    value = column[mc.name]
                    if value == None:
                        pass
                    else:
                        count += 1
                        if is_list(value):
                            multi = max(multi, len(value))
                            try:
                                values |= set(value)
                            except Exception:
                                objects += len(value)
                        elif is_data(value):
                            objects += 1
                        else:
                            values.add(value)
                mc.count = count
                mc.cardinality = len(values) + objects
                mc.partitions = jx.sort(values)
                mc.multi = multi
                mc.last_updated = Date.now()
        self.dirty = False
Example #35
    def __init__(self, **desc):
        Domain.__init__(self, **desc)
        self.type = "time"
        self.NULL = Null
        self.min = Date(self.min)
        self.max = Date(self.max)
        self.interval = Duration(self.interval)

        if self.partitions:
            # IGNORE THE min, max, interval
            if not self.key:
                Log.error("Must have a key value")

            Log.error("not implemented yet")

            # VERIFY PARTITIONS DO NOT OVERLAP
            return
        elif not all([self.min, self.max, self.interval]):
            Log.error("Can not handle missing parameter")

        self.key = "min"
        self.partitions = wrap([
            {"min": v, "max": v + self.interval, "dataIndex": i}
            for i, v in enumerate(Date.range(self.min, self.max, self.interval))
        ])
Example #36
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #37
def record_request(request, query_, data, error):
    try:
        if request_log_queue == None:
            return

        if data and len(data) > 10000:
            data = data[:10000]

        log = wrap({
            "timestamp": Date.now(),
            "http_user_agent": request.headers.get("user_agent"),
            "http_accept_encoding": request.headers.get("accept_encoding"),
            "path": request.headers.environ["werkzeug.request"].full_path,
            "content_length": request.headers.get("content_length"),
            "remote_addr": request.remote_addr,
            "query": query_,
            "data": data,
            "error": error
        })
        log["from"] = request.headers.get("from")
        request_log_queue.add({"value": log})
    except Exception as e:
        Log.warning("Can not record", cause=e)
Example #38
    def _update_meta(self):
        if not self.dirty:
            return

        for mcl in self.data.get("meta.columns").values():
            for mc in mcl:
                count = 0
                values = set()
                objects = 0
                multi = 1
                for column in self._all_columns():
                    value = column[mc.names["."]]
                    if value == None:
                        pass
                    else:
                        count += 1
                        if isinstance(value, list):
                            multi = max(multi, len(value))
                            try:
                                values |= set(value)
                            except Exception:
                                objects += len(value)
                        elif isinstance(value, Mapping):
                            objects += 1
                        else:
                            values.add(value)
                mc.count = count
                mc.cardinality = len(values) + objects
                mc.partitions = jx.sort(values)
                mc.multi = multi
                mc.last_updated = Date.now()
        self.dirty = False
Example #39
    def __init__(self, host, index, alias=None, name=None, port=9200, kwargs=None):
        global _elasticsearch
        if hasattr(self, "settings"):
            return

        from pyLibrary.queries.containers.list_usingPythonList import ListContainer
        from pyLibrary.env import elasticsearch as _elasticsearch

        self.settings = kwargs
        self.default_name = coalesce(name, alias, index)
        self.default_es = _elasticsearch.Cluster(kwargs=kwargs)
        self.todo = Queue("refresh metadata", max=100000, unique=True)

        self.es_metadata = Null
        self.last_es_metadata = Date.now()-OLD_METADATA

        self.meta=Data()
        table_columns = metadata_tables()
        column_columns = metadata_columns()
        self.meta.tables = ListContainer("meta.tables", [], wrap({c.names["."]: c for c in table_columns}))
        self.meta.columns = ColumnList()
        self.meta.columns.insert(column_columns)
        self.meta.columns.insert(table_columns)
        # TODO: fix monitor so it does not bring down ES
        if ENABLE_META_SCAN:
            self.worker = Thread.run("refresh metadata", self.monitor)
        else:
            self.worker = Thread.run("refresh metadata", self.not_monitor)
        return
Example #40
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)
Example #41
        def add_column(c, query_path):
            c.last_updated = Date.now()
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(c.names["."], query_path[0])

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    c.es_index = alias
                    self._upsert_column(c)
Example #42
    def monitor(self, please_stop):
        please_stop.on_go(lambda: self.todo.add(THREAD_STOP))
        while not please_stop:
            try:
                if not self.todo:
                    with self.meta.columns.locker:
                        old_columns = [
                            c
                            for c in self.meta.columns
                            if (c.last_updated == None or c.last_updated < Date.now()-TOO_OLD) and c.type not in STRUCT
                        ]
                        if old_columns:
                            if DEBUG:
                                Log.note("Old columns wth dates {{dates|json}}", dates=wrap(old_columns).last_updated)
                            self.todo.extend(old_columns)
                            # TEST CONSISTENCY
                            for c, d in product(list(self.todo.queue), list(self.todo.queue)):
                                if c.es_column == d.es_column and c.es_index == d.es_index and c != d:
                                    Log.error("")
                        else:
                            if DEBUG:
                                Log.note("no more metatdata to update")

                column = self.todo.pop(Till(seconds=(10*MINUTE).seconds))
                if column:
                    if DEBUG:
                        Log.note("update {{table}}.{{column}}", table=column.es_index, column=column.es_column)
                    if column.type in STRUCT:
                        with self.meta.columns.locker:
                            column.last_updated = Date.now()
                        continue
                    elif column.last_updated >= Date.now()-TOO_OLD:
                        continue
                    try:
                        self._update_cardinality(column)
                        if DEBUG and not column.es_index.startswith(TEST_TABLE_PREFIX):
                            Log.note("updated {{column.name}}", column=column)
                    except Exception as e:
                        Log.warning("problem getting cardinality for {{column.name}}", column=column, cause=e)
            except Exception as e:
                Log.warning("problem in cardinality monitor", cause=e)
Example #43
    def _get_columns(self, table=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        table_path = split_field(table)
        es_index = table_path[0]
        query_path = join_field(table_path[1:])
        meta = self.es_metadata.indices[es_index]
        if not meta or self.last_es_metadata < Date.now() - OLD_METADATA:
            self.es_metadata = self.default_es.get_metadata(force=True)
            meta = self.es_metadata.indices[es_index]

        for _, properties in meta.mappings.items():
            properties.properties["_id"] = {"type": "string", "index": "not_analyzed"}
            self._parse_properties(meta.index, properties, meta)
Example #44
    def output(*args, **kwargs):
        if kwargs:
            Log.error("Sorry, caching only works with ordered parameter, not keyword arguments")

        with cache_store.locker:
            if using_self:
                self = args[0]
                args = args[1:]
            else:
                self = cache_store

            now = Date.now()
            try:
                _cache = getattr(self, attr_name)
            except Exception:
                _cache = {}
                setattr(self, attr_name, _cache)

            if Random.int(100) == 0:
                # REMOVE OLD CACHE
                _cache = {k: v for k, v in _cache.items() if v.timeout == None or v.timeout > now}
                setattr(self, attr_name, _cache)

            timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null))

        if now >= timeout:
            value = func(self, *args)
            with cache_store.locker:
                _cache[args] = CacheElement(now + cache_store.timeout, args, value, None)
            return value

        if value == None:
            if exception == None:
                try:
                    value = func(self, *args)
                    with cache_store.locker:
                        _cache[args] = CacheElement(now + cache_store.timeout, args, value, None)
                    return value
                except Exception as e:
                    e = Except.wrap(e)
                    with cache_store.locker:
                        _cache[args] = CacheElement(now + cache_store.timeout, args, None, e)
                    raise e
            else:
                raise exception
        else:
            return value
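The decorator above memoizes per-argument results with a timeout and replays cached exceptions; a much smaller sketch of the same timeout-based memoization idea (no locking, no exception caching, names are illustrative):

import time
from functools import wraps

def cache_with_timeout(seconds):
    def decorator(func):
        store = {}  # MAPS args -> (expiry, value)

        @wraps(func)
        def wrapper(*args):
            now = time.time()
            hit = store.get(args)
            if hit and now < hit[0]:
                return hit[1]
            value = func(*args)
            store[args] = (now + seconds, value)
            return value
        return wrapper
    return decorator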
Example #45
    def _daemon(self, please_stop):
        while not please_stop:
            with Explanation("looking for work"):
                try:
                    branch, revisions = self.todo.pop(till=please_stop)
                except Exception as e:
                    if please_stop:
                        break
                    else:
                        raise e
                if branch.name in DAEMON_DO_NO_SCAN:
                    continue
                revisions = set(revisions)

                # FIND THE REVISIONS ON THIS BRANCH
                for r in list(revisions):
                    try:
                        rev = self.get_revision(Revision(branch=branch, changeset={"id": r}))
                        if DAEMON_DEBUG:
                            Log.note("found revision with push date {{date|datetime}}", date=rev.push.date)
                        revisions.discard(r)

                        if rev.etl.timestamp > Date.now() - (DAEMON_RECENT_HG_PULL * SECOND):
                            # SOME PUSHES ARE BIG, RUNNING THE RISK OTHER MACHINES ARE
                            # ALSO INTERESTED AND PERFORMING THE SAME SCAN. THIS DELAY
                            # WILL HAVE SMALL EFFECT ON THE MAJORITY OF SMALL PUSHES
                            # https://bugzilla.mozilla.org/show_bug.cgi?id=1417720
                            Till(seconds=Random.float(DAEMON_HG_INTERVAL*2)).wait()

                    except Exception as e:
                        Log.warning(
                            "Scanning {{branch}} {{revision|left(12)}}",
                            branch=branch.name,
                            revision=r,
                            cause=e
                        )
                        if "Read timed out" in e:
                            Till(seconds=DAEMON_WAIT_AFTER_TIMEOUT).wait()


                # FIND ANY BRANCH THAT MAY HAVE THIS REVISION
                for r in list(revisions):
                    self._find_revision(r)
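
The daemon above follows a common stoppable-worker pattern: pop work from a queue until signalled to stop, and log-and-continue on failures. A condensed, standard-library illustration of that pattern (the names daemon, todo, please_stop and handler are illustrative, not this module's API):

import queue
import threading

def daemon(todo, please_stop, handler):
    # POP WORK UNTIL ASKED TO STOP; THE SHORT get() TIMEOUT LETS US RE-CHECK THE STOP SIGNAL
    while not please_stop.is_set():
        try:
            work = todo.get(timeout=1)
        except queue.Empty:
            continue
        try:
            handler(work)
        except Exception as e:
            print("scan failed, will continue:", e)

todo = queue.Queue()
please_stop = threading.Event()
threading.Thread(target=daemon, args=(todo, please_stop, print), daemon=True).start()
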
Exemplo n.º 46
0
    def __init__(
        self,
        hg=None,        # CONNECT TO hg
        repo=None,      # CONNECTION INFO FOR ES CACHE
        branches=None,  # CONNECTION INFO FOR ES CACHE
        use_cache=False,   # True IF WE WILL USE THE ES FOR DOWNLOADING BRANCHES
        timeout=30 * SECOND,
        kwargs=None
    ):
        if not _hg_branches:
            _late_imports()

        self.es_locker = Lock()
        self.todo = mo_threads.Queue("todo for hg daemon", max=DAEMON_QUEUE_SIZE)

        self.settings = kwargs
        self.timeout = Duration(timeout)

        # VERIFY CONNECTIVITY
        with Explanation("Test connect with hg"):
            response = http.head(self.settings.hg.url)

        if branches == None:
            self.branches = _hg_branches.get_branches(kwargs=kwargs)
            self.es = None
            return

        self.last_cache_miss = Date.now()

        set_default(repo, {"schema": revision_schema})
        self.es = elasticsearch.Cluster(kwargs=repo).get_or_create_index(kwargs=repo)

        def setup_es(please_stop):
            with suppress_exception:
                self.es.add_alias()

            with suppress_exception:
                self.es.set_refresh_interval(seconds=1)

        Thread.run("setup_es", setup_es)
        self.branches = _hg_branches.get_branches(kwargs=kwargs)
        Thread.run("hg daemon", self._daemon)
Exemplo n.º 47
0
    def output(*args):
        with cache_store.locker:
            if using_self:
                self = args[0]
                args = args[1:]
            else:
                self = cache_store

            now = Date.now()
            try:
                _cache = getattr(self, attr_name)
            except Exception:
                _cache = {}
                setattr(self, attr_name, _cache)

            if Random.int(100) == 0:
                # REMOVE OLD CACHE
                _cache = {k: v for k, v in _cache.items() if v[0]==None or v[0] > now}
                setattr(self, attr_name, _cache)

            timeout, key, value, exception = _cache.get(args, (Null, Null, Null, Null))
Exemplo n.º 48
0
    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            if c.es_index == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            if c.es_index == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return

            es_index = c.es_index.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.names["."]: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Data(size=0)
            # HIGH CARDINALITY, OR EFFECTIVELY-UNIQUE COLUMN: RECORD COUNTS ONLY, DO NOT ENUMERATE PARTITIONS
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                if DEBUG:
                    Log.note("{{table}}.{{field}} has {{num}} parts", table=c.es_index, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            # NUMERIC COLUMNS WITH MANY DISTINCT VALUES: ALSO NOT WORTH ENUMERATING PARTITIONS
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                if DEBUG:
                    Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            # NESTED COLUMN: RUN THE terms AGGREGATION INSIDE ITS NESTED PATH
            elif len(c.nested_path) != 1:
                query.aggs[literal_field(c.names["."])] = {
                    "nested": {"path": c.nested_path[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.names["."])] = {"terms": {"field": c.es_column, "size": 0}}

            result = self.default_es.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            if DEBUG:
                Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception as e:
            if "IndexMissingException" in e and c.es_index.startswith(TEST_TABLE_PREFIX):
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear":[
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"names.\\.": ".", "es_index": c.es_index, "es_column": c.es_column}}
                })
                Log.warning("Could not get {{col.es_index}}.{{col.es_column}} info", col=c, cause=e)
Exemplo n.º 49
0
    def get_revision(self, revision, locale=None, get_diff=False, get_moves=True):
        """
        EXPECTING INCOMPLETE revision OBJECT
        RETURNS revision
        """
        rev = revision.changeset.id
        if not rev:
            return Null
        elif rev == "None":
            return Null
        elif revision.branch.name == None:
            return Null
        locale = coalesce(locale, revision.branch.locale, DEFAULT_LOCALE)
        output = self._get_from_elasticsearch(revision, locale=locale, get_diff=get_diff)
        if output:
            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            DEBUG and Log.note("Got hg ({{branch}}, {{locale}}, {{revision}}) from ES", branch=output.branch.name, locale=locale, revision=output.changeset.id)
            if output.push.date >= Date.now()-MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))
            if output.push.date:
                return output

        # RATE LIMIT CALLS TO HG (CACHE MISSES)
        next_cache_miss = self.last_cache_miss + (Random.float(WAIT_AFTER_CACHE_MISS * 2) * SECOND)
        self.last_cache_miss = Date.now()
        if next_cache_miss > self.last_cache_miss:
            Log.note("delaying next hg call for {{seconds|round(decimal=1)}}", seconds=next_cache_miss - self.last_cache_miss)
            Till(till=next_cache_miss.unix).wait()

        found_revision = copy(revision)
        if isinstance(found_revision.branch, (text_type, binary_type)):
            lower_name = found_revision.branch.lower()
        else:
            lower_name = found_revision.branch.name.lower()

        if not lower_name:
            Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

        b = found_revision.branch = self.branches[(lower_name, locale)]
        if not b:
            b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
            if not b:
                Log.warning("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)
                return Null

        if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
            self.branches = _hg_branches.get_branches(kwargs=self.settings)

        push = self._get_push(found_revision.branch, found_revision.changeset.id)

        url1 = found_revision.branch.url.rstrip("/") + "/json-info?node=" + found_revision.changeset.id[0:12]
        url2 = found_revision.branch.url.rstrip("/") + "/json-rev/" + found_revision.changeset.id[0:12]
        with Explanation("get revision from {{url}}", url=url1, debug=DEBUG):
            raw_rev2 = Null
            try:
                raw_rev1 = self._get_raw_json_info(url1, found_revision.branch)
                raw_rev2 = self._get_raw_json_rev(url2, found_revision.branch)
            except Exception as e:
                if "Hg denies it exists" in e:
                    raw_rev1 = Data(node=revision.changeset.id)
                else:
                    raise e
            output = self._normalize_revision(set_default(raw_rev1, raw_rev2), found_revision, push, get_diff, get_moves)
            if output.push.date >= Date.now()-MAX_TODO_AGE:
                self.todo.add((output.branch, listwrap(output.parents)))
                self.todo.add((output.branch, listwrap(output.children)))

            if not get_diff:  # DIFF IS BIG, DO NOT KEEP IT IF NOT NEEDED
                output.changeset.diff = None
            if not get_moves:
                output.changeset.moves = None
            return output
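
The cache-miss block near the top of get_revision rate-limits direct hg calls by scheduling a random delay after every miss, averaging WAIT_AFTER_CACHE_MISS seconds. A self-contained sketch of that behaviour; the constant's value here is an assumption, since it is defined outside this excerpt:

import random
import time

WAIT_AFTER_CACHE_MISS = 30  # SECONDS; ASSUMED VALUE
last_cache_miss = time.time()

def throttle_hg_call():
    # EACH MISS SETS A RANDOM DEADLINE (0..2x THE AVERAGE) BEFORE THE NEXT DIRECT hg CALL
    global last_cache_miss
    next_allowed = last_cache_miss + random.uniform(0, WAIT_AFTER_CACHE_MISS * 2)
    last_cache_miss = time.time()
    if next_allowed > last_cache_miss:
        time.sleep(next_allowed - last_cache_miss)
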
Exemplo n.º 50
0
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(
                        value, table_name, full_name, newpath, columns
                    )


METADATA_COLUMNS = (
    [
        Column(
            name=c,
            es_index="meta.columns",
            es_column=c,
            es_type="keyword",
            jx_type=STRING,
            last_updated=Date.now(),
            nested_path=ROOT_PATH,
        )
        for c in [
            "name",
            "es_type",
            "jx_type",
            "nested_path",
            "es_column",
            "es_index",
            "partitions",
        ]
    ]
    + [
        Column(
            name=c,
Exemplo n.º 51
0
    def _normalize_revision(self, r, found_revision, push, get_diff, get_moves):
        new_names = set(r.keys()) - KNOWN_TAGS
        if new_names and not r.tags:
            Log.warning(
                "hg is returning new property names {{names|quote}} for {{changeset}} from {{url}}",
                names=new_names,
                changeset=r.node,
                url=found_revision.branch.url
            )

        changeset = Changeset(
            id=r.node,
            id12=r.node[0:12],
            author=r.user,
            description=strings.limit(coalesce(r.description, r.desc), 2000),
            date=parse_hg_date(r.date),
            files=r.files,
            backedoutby=r.backedoutby if r.backedoutby else None,
            bug=self._extract_bug_id(r.description)
        )
        rev = Revision(
            branch=found_revision.branch,
            index=r.rev,
            changeset=changeset,
            parents=unwraplist(list(set(r.parents))),
            children=unwraplist(list(set(r.children))),
            push=push,
            phase=r.phase,
            bookmarks=unwraplist(r.bookmarks),
            landingsystem=r.landingsystem,
            etl={"timestamp": Date.now().unix, "machine": machine_metadata}
        )

        r.pushuser = None
        r.pushdate = None
        r.pushid = None
        r.node = None
        r.user = None
        r.desc = None
        r.description = None
        r.date = None
        r.files = None
        r.backedoutby = None
        r.parents = None
        r.children = None
        r.bookmarks = None
        r.landingsystem = None

        set_default(rev, r)

        # ADD THE DIFF
        if get_diff:
            rev.changeset.diff = self._get_json_diff_from_hg(rev)
        if get_moves:
            rev.changeset.moves = self._get_moves_from_hg(rev)

        try:
            _id = coalesce(rev.changeset.id12, "") + "-" + rev.branch.name + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
            with self.es_locker:
                self.es.add({"id": _id, "value": rev})
        except Exception as e:
            e = Except.wrap(e)
            Log.warning("Did not save to ES, waiting {{duration}} seconds", duration=WAIT_AFTER_NODE_FAILURE, cause=e)
            Till(seconds=WAIT_AFTER_NODE_FAILURE).wait()
            if "FORBIDDEN/12/index read-only" in e:
                pass  # KNOWN FAILURE MODE

        return rev
Exemplo n.º 52
0
def _get_single_branch_from_hg(settings, description, dir):
    if dir == "users":
        return []
    response = http.get(settings.url + "/" + dir)
    doc = BeautifulSoup(response.all_content, "html.parser")

    output = []
    try:
        all_branches = doc("table")[0]
    except Exception:
        return []

    for i, b in enumerate(all_branches("tr")):
        if i == 0:
            continue  # IGNORE HEADER
        columns = b("td")

        try:
            path = columns[0].a.get('href')
            if path == "/":
                continue

            name, desc, last_used = [c.text.strip() for c in columns][0:3]

            if last_used.startswith('at'):
                last_used = last_used[2:]

            detail = Data(
                name=name.lower(),
                locale=DEFAULT_LOCALE,
                parent_name=description,
                url=settings.url + path,
                description=desc,
                last_used=Date(last_used),
                etl={"timestamp": Date.now()}
            )
            if detail.description == "unknown":
                detail.description = None

            # SOME BRANCHES HAVE NAME COLLISIONS, IGNORE LEAST POPULAR
            if path in [
                "/projects/dxr/",                   # moved to webtools
                "/build/compare-locales/",          # ?build team likes to clone?
                "/build/puppet/",                   # ?build team likes to clone?
                "/SeaMonkey/puppet/",               # looses the popularity contest
                "/releases/gaia-l10n/v1_2/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_3/en-US/",  # use default branch
                "/releases/gaia-l10n/v1_4/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_0/en-US/",  # use default branch
                "/releases/gaia-l10n/v2_1/en-US/",  # use default branch
                "/build/autoland/"
            ]:
                continue

            # MARKUP BRANCH IF LOCALE SPECIFIC
            if path.startswith("/l10n-central"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "mozilla-central"
            elif path.startswith("/releases/l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = _path[-2].lower()
            elif path.startswith("/releases/gaia-l10n/"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "gaia-" + _path[-2][1::]
            elif path.startswith("/weave-l10n"):
                _path = path.strip("/").split("/")
                detail.locale = _path[-1]
                detail.name = "weave"

            if BRANCH_WHITELIST is not None:
                found = False
                for br in BRANCH_WHITELIST:
                    if br in str(detail.name):
                        found = True
                        break
                if not found:
                    continue

            Log.note("Branch {{name}} {{locale}}", name=detail.name, locale=detail.locale)
            output.append(detail)
        except Exception as e:
            Log.warning("branch digestion problem", cause=e)

    return output
Exemplo n.º 53
0
def proto_name(prefix, timestamp=None):
    if not timestamp:
        timestamp = Date.now()
    else:
        timestamp = Date(timestamp)
    return prefix + timestamp.format(INDEX_DATE_FORMAT)
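
INDEX_DATE_FORMAT is defined outside this excerpt, so the format string below is an assumption; as a self-contained illustration of the naming scheme:

from datetime import datetime

def proto_name_sketch(prefix, timestamp=None, fmt="%Y%m%d_%H%M%S"):  # fmt IS AN ASSUMED FORMAT
    ts = timestamp if timestamp else datetime.utcnow()
    return prefix + ts.strftime(fmt)

print(proto_name_sketch("testing_", datetime(2018, 1, 15)))  # -> testing_20180115_000000
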
Exemplo n.º 54
0
def _get_schema_from_list(frum, table_name, parent, nested_path, columns):
    """
    :param frum: The list
    :param table_name: Name of the table this list holds records for
    :param parent: parent path
    :param nested_path: each nested array, in reverse order
    :param columns: map from full name to column definition
    :return:
    """

    for d in frum:
        row_type = python_type_to_json_type[d.__class__]

        if row_type != "object":
            # EXPECTING PRIMITIVE VALUE
            full_name = parent
            column = columns[full_name]
            if not column:
                column = Column(
                    name=concat_field(table_name, full_name),
                    es_column=full_name,
                    es_index=".",
                    es_type=d.__class__.__name__,
                    jx_type=None,  # WILL BE SET BELOW
                    last_updated=Date.now(),
                    nested_path=nested_path,
                )
                columns.add(column)
            column.es_type = _merge_python_type(column.es_type, d.__class__)
            column.jx_type = python_type_to_json_type[column.es_type]
        else:
            for name, value in d.items():
                full_name = concat_field(parent, name)
                column = columns[full_name]
                if not column:
                    column = Column(
                        name=concat_field(table_name, full_name),
                        es_column=full_name,
                        es_index=".",
                        es_type=value.__class__.__name__,
                        jx_type=None,  # WILL BE SET BELOW
                        last_updated=Date.now(),
                        nested_path=nested_path,
                    )
                    columns.add(column)
                if is_container(value):  # GET TYPE OF MULTIVALUE
                    v = list(value)
                    if len(v) == 0:
                        this_type = none_type.__name__
                    elif len(v) == 1:
                        this_type = v[0].__class__.__name__
                    else:
                        this_type = reduce(
                            _merge_python_type, (vi.__class__.__name__ for vi in value)
                        )
                else:
                    this_type = value.__class__.__name__
                column.es_type = _merge_python_type(column.es_type, this_type)
                column.jx_type = python_type_to_json_type[column.es_type]

                if this_type in {"object", "dict", "Mapping", "Data"}:
                    _get_schema_from_list(
                        [value], table_name, full_name, nested_path, columns
                    )
                elif this_type in {"list", "FlatList"}:
                    np = listwrap(nested_path)
                    newpath = unwraplist([join_field(split_field(np[0]) + [name])] + np)
                    _get_schema_from_list(
                        value, table_name, full_name, newpath, columns
                    )
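
A tiny, self-contained illustration of the same idea, flattening records into a map from dotted column name to the set of python types seen so far; the real function above additionally tracks nested paths, merges types, and assigns JSON types via Column objects:

def sketch_schema(records, parent="", columns=None):
    # WALK EACH RECORD, RECORDING THE PYTHON TYPES SEEN AT EACH DOTTED PATH
    columns = {} if columns is None else columns
    for rec in records:
        for name, value in rec.items():
            full_name = parent + "." + name if parent else name
            columns.setdefault(full_name, set()).add(type(value).__name__)
            if isinstance(value, dict):
                sketch_schema([value], full_name, columns)
            elif isinstance(value, list):
                sketch_schema([v for v in value if isinstance(v, dict)], full_name, columns)
    return columns

print(sketch_schema([{"run": {"timestamp": 1510000000}, "result": {"ok": False}}]))
# -> {'run': {'dict'}, 'run.timestamp': {'int'}, 'result': {'dict'}, 'result.ok': {'bool'}}
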