Example #1
def sort_table(result):
    """
    SORT ROWS IN TABLE, EVEN IF ELEMENTS ARE JSON
    """
    data = wrap([{unicode(i): v for i, v in enumerate(row) if v != None} for row in result.data])
    sort_columns = jx.sort(set(jx.get_columns(data, leaves=True).name))
    data = jx.sort(data, sort_columns)
    result.data = [tuple(row[unicode(i)] for i in range(len(result.header))) for row in data]
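A hedged usage sketch for the helper above: result is assumed to be a wrapped structure carrying header and data, as in the surrounding test utilities; the wrap import path and the sample rows are assumptions for illustration only.

from pyLibrary.dot import wrap  # IMPORT PATH ASSUMED; SOME VERSIONS EXPOSE wrap ELSEWHERE

result = wrap({
    "header": ["a", "b"],
    "data": [(3, "x"), (1, "z"), (2, None)],
})
sort_table(result)
# EVERY LEAF COLUMN IS USED AS A SORT KEY, SO THE FIRST COLUMN DOMINATES HERE;
# result.data BECOMES ROUGHLY [(1, "z"), (2, Null), (3, "x")], WITH MISSING CELLS AS Null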
Example #2
def compare_to_expected(query, result, expect):
    query = wrap(query)
    expect = wrap(expect)

    if result.meta.format == "table":
        assertAlmostEqual(set(result.header), set(expect.header))

        # MAP FROM expected COLUMN TO result COLUMN
        mapping = zip(*zip(*filter(
            lambda v: v[0][1] == v[1][1],
            itertools.product(enumerate(expect.header), enumerate(
                result.header))))[1])[0]
        result.header = [result.header[m] for m in mapping]

        if result.data:
            columns = zip(*unwrap(result.data))
            result.data = zip(*[columns[m] for m in mapping])

        if not query.sort:
            sort_table(result)
            sort_table(expect)
    elif result.meta.format == "list":
        if query["from"].startswith("meta."):
            pass
        else:
            query = QueryOp.wrap(query)

        if not query.sort:
            try:
                #result.data MAY BE A LIST OF VALUES, NOT OBJECTS
                data_columns = jx.sort(
                    set(jx.get_columns(result.data, leaves=True))
                    | set(jx.get_columns(expect.data, leaves=True)), "name")
            except Exception:
                data_columns = [{"name": "."}]

            sort_order = listwrap(coalesce(query.edges,
                                           query.groupby)) + data_columns

            if isinstance(expect.data, list):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception:
                    pass
Example #3
    def insert_list(self, table_name, records):
        if not records:
            return

        columns = set()
        for r in records:
            columns |= set(r.keys())
        columns = jx.sort(columns)

        try:
            self.execute(
                "DELETE FROM " + self.quote_column(table_name) +
                " WHERE _id IN {{ids}}",
                {"ids": self.quote_column([r["_id"] for r in records])})

            command = \
                "INSERT INTO " + self.quote_column(table_name) + "(" + \
                ",".join([self.quote_column(k) for k in columns]) + \
                ") VALUES " + ",\n".join([
                    "(" + ",".join([self.quote_value(r.get(k, None)) for k in columns]) + ")"
                    for r in records
                ])
            self.execute(command)
        except Exception as e:
            Log.error("problem with insert", e)
Example #4
    def _get_columns(self, table=None, metadata=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        if not metadata:
            metadata = self.default_es.get_metadata(force=True)

        def parse_all(please_stop):
            for abs_index, meta in jx.sort(metadata.indices.items(), {
                    "value": 0,
                    "sort": -1
            }):
                if meta.index != abs_index:
                    continue

                for _, properties in meta.mappings.items():
                    if please_stop:
                        return
                    self._parse_properties(abs_index, properties, meta)

        if table:
            for abs_index, meta in jx.sort(metadata.indices.items(), {
                    "value": 0,
                    "sort": -1
            }):
                if table == meta.index:
                    for _, properties in meta.mappings.items():
                        self._parse_properties(abs_index, properties, meta)
                    return
                if table == abs_index:
                    self._get_columns(table=meta.index, metadata=metadata)
                    return
        else:
            self.parser = Thread.run("parse properties", parse_all)
Example #5
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                table = Table(
                    name=short_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)
Example #6
 def get_columns(self,
                 table_name,
                 column_name=None,
                 fail_when_not_found=False):
     """
     RETURN METADATA COLUMNS
     """
     try:
         with self.meta.columns.locker:
             columns = [
                 c for c in self.meta.columns.data
                 if c.table == table_name and (
                     column_name is None or c.name == column_name)
             ]
         if columns:
             columns = jx.sort(columns, "name")
             if fail_when_not_found:
                 # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                 while len(self.todo) and not all(
                         columns.get("last_updated")):
                     Log.note(
                         "waiting for columns to update {{columns|json}}",
                         columns=[
                             c.table + "." + c.es_column for c in columns
                             if not c.last_updated
                         ])
                     Thread.sleep(seconds=1)
                 return columns
             elif all(columns.get("last_updated")):
                 return columns
     except Exception as e:
         Log.error("Not expected", cause=e)
Example #7
    def _get_columns(self, table=None, metadata=None):
        # TODO: HANDLE MORE THAN ONE ES, MAP TABLE SHORT_NAME TO ES INSTANCE
        if not metadata:
            metadata = self.default_es.get_metadata(force=True)

        def parse_all(please_stop):
            for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                if meta.index != abs_index:
                    continue

                for _, properties in meta.mappings.items():
                    if please_stop:
                        return
                    self._parse_properties(abs_index, properties, meta)

        if table:
            for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                if table == meta.index:
                    for _, properties in meta.mappings.items():
                        self._parse_properties(abs_index, properties, meta)
                    return
                if table == abs_index:
                    self._get_columns(table=meta.index, metadata=metadata)
                    return
        else:
            self.parser = Thread.run("parse properties", parse_all)
Example #8
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            short_name = join_field(split_field(table_name)[0:1])
            table = self.get_table(short_name)[0]

            if not table:
                table = Table(
                    name=short_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=short_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=short_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(table_name, column_name)
            if columns:
                columns = jx.sort(columns, "name")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                    Thread.sleep(seconds=1)
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)
Example #9
    def insert_list(self, table_name, records):
        if not records:
            return

        columns = set()
        for r in records:
            columns |= set(r.keys())
        columns = jx.sort(columns)

        try:
            self.execute(
                "DELETE FROM " + self.quote_column(table_name) + " WHERE _id IN {{ids}}",
                {"ids": self.quote_column([r["_id"] for r in records])}
            )

            command = \
                "INSERT INTO " + self.quote_column(table_name) + "(" + \
                ",".join([self.quote_column(k) for k in columns]) + \
                ") VALUES " + ",\n".join([
                    "(" + ",".join([self.quote_value(r.get(k, None)) for k in columns]) + ")"
                    for r in records
                ])
            self.execute(command)
        except Exception as e:
            Log.error("problem with insert", e)
Example #10
def groupby(data,
            keys=None,
            size=None,
            min_size=None,
            max_size=None,
            contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: list of (keys, values) PAIRS, WHERE
                 keys IS IN LEAF FORM (FOR USE WITH THE {"eq": terms} OPERATOR)
                 values IS A GENERATOR OF ALL VALUES THAT MATCH keys
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from pyLibrary.queries import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        accessor = jx_expression_to_function(TupleOp(
            "tuple",
            keys))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

        def _output():
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i:]
                    start = i
                    prev = curr
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start::]

        return _output()
    except Exception as e:
        Log.error("Problem grouping", cause=e)
Example #11
        def parse_all(please_stop):
            for abs_index, meta in jx.sort(metadata.indices.items(), {"value": 0, "sort": -1}):
                if meta.index != abs_index:
                    continue

                for _, properties in meta.mappings.items():
                    if please_stop:
                        return
                    self._parse_properties(abs_index, properties, meta)
Example #12
    def done_count(self):
        columns = map(unicode, range(len(self.fields)))
        parts = wrap([{unicode(i): p for i, p in enumerate(part)} for part in set(self.parts)])
        self.parts = None
        sorted_parts = jx.sort(parts, columns)

        self.edge.domain = self.domain = SimpleSetDomain(
            key="value",
            partitions=[{"value": tuple(v[k] for k in columns), "dataIndex": i} for i, v in enumerate(sorted_parts)]
        )
Example #13
    def done_count(self):
        columns = map(unicode, range(len(self.fields)))
        parts = wrap([{unicode(i): p for i, p in enumerate(part)} for part in set(self.parts)])
        self.parts = None
        sorted_parts = jx.sort(parts, columns)

        self.edge.domain = self.domain = SimpleSetDomain(
            key="value",
            partitions=[{"value": tuple(v[k] for k in columns), "dataIndex": i} for i, v in enumerate(sorted_parts)]
        )
Example #14
def groupby(data, keys=None, size=None, min_size=None, max_size=None, contiguous=False):
    """
    :param data:
    :param keys:
    :param size:
    :param min_size:
    :param max_size:
    :param contiguous: MAINTAIN THE ORDER OF THE DATA, STARTING THE NEW GROUP WHEN THE SELECTOR CHANGES
    :return: list of (keys, values) PAIRS, WHERE
                 keys IS IN LEAF FORM (FOR USE WITH THE {"eq": terms} OPERATOR)
                 values IS A GENERATOR OF ALL VALUES THAT MATCH keys
    """
    if isinstance(data, Container):
        return data.groupby(keys)

    if size != None or min_size != None or max_size != None:
        if size != None:
            max_size = size
        return groupby_min_max_size(data, min_size=min_size, max_size=max_size)

    try:
        keys = listwrap(keys)
        if not contiguous:
            from pyLibrary.queries import jx
            data = jx.sort(data, keys)

        if not data:
            return Null

        if any(isinstance(k, Expression) for k in keys):
            Log.error("can not handle expressions")
        else:
            accessor = jx_expression_to_function(jx_expression({"tuple": keys}))  # CAN RETURN Null, WHICH DOES NOT PLAY WELL WITH __cmp__

        def _output():
            start = 0
            prev = accessor(data[0])
            for i, d in enumerate(data):
                curr = accessor(d)
                if curr != prev:
                    group = {}
                    for k, gg in zip(keys, prev):
                        group[k] = gg
                    yield Data(group), data[start:i:]
                    start = i
                    prev = curr
            group = {}
            for k, gg in zip(keys, prev):
                group[k] = gg
            yield Data(group), data[start::]

        return _output()
    except Exception as e:
        Log.error("Problem grouping", cause=e)
Example #15
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value
    """

    # REQUEST VALUES IN FIRST DIMENSION
    q1 = query.copy()
    q1.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for i, v in enumerate(values1):
            FromES.facets[s.name + "," + str(i)] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter": simplify_esfilter({"and": [
                    query.where,
                    {"term": {query.edges[0].value: v}}
                ]})
            }

    data = es09.util.post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for k, f in data.facets.items():
        values2.update(f.terms.term)
    values2 = jx.sort(values2)
    term2index = {v: i for i, v in enumerate(values2)}
    query.edges[1].domain.partitions = DictList([{"name": v, "value": v} for v in values2])

    # MAKE CUBE
    output = {}
    dims = [len(values1), len(values2)]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        coord = facetName.split(",")
        s = [s for s in select if s.name == coord[0]][0]
        i1 = int(coord[1])
        for term in facet.terms:
            i2 = term2index[term.term]
            output[s.name][(i1, i2)] = term[aggregates[s.aggregate]]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
Example #16
 def _get_best(self, settings):
     from pyLibrary.queries import jx
     aliases = self.get_aliases()
     indexes = jx.sort([
         a
         for a in aliases
         if (a.alias == settings.index and settings.alias == None) or
         (re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or
         (a.index == settings.index and (a.alias == None or a.alias == settings.alias))
     ], "index")
     return indexes.last()
Example #17
 def _get_best(self, settings):
     from pyLibrary.queries import jx
     aliases = self.get_aliases()
     indexes = jx.sort([
         a
         for a in aliases
         if (a.alias == settings.index and settings.alias == None) or
         (re.match(re.escape(settings.index) + r'\d{8}_\d{6}', a.index) and settings.alias == None) or
         (a.index == settings.index and (settings.alias == None or a.alias == None or a.alias == settings.alias))
     ], "index")
     return indexes.last()
Example #18
        def parse_all(please_stop):
            for abs_index, meta in jx.sort(metadata.indices.items(), {
                    "value": 0,
                    "sort": -1
            }):
                if meta.index != abs_index:
                    continue

                for _, properties in meta.mappings.items():
                    if please_stop:
                        return
                    self._parse_properties(abs_index, properties, meta)
Example #19
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(
            query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(name=es_index_name,
                              url=None,
                              query_path=None,
                              timestamp=Date.now())
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note(
                            "waiting for columns to update {{columns|json}}",
                            columns=[
                                c.es_index + "." + c.es_column for c in columns
                                if not c.last_updated
                            ])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}",
                      table=table_name,
                      column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)
Example #20
 def running_instances(self):
     # FIND THE BIGGEST, MOST EXPENSIVE REQUESTS
     instances = self._get_managed_instances()
     for r in instances:
         try:
             r.markup = self.price_lookup[r.instance_type, r.placement]
         except Exception as e:
             Log.error("No pricing!!!", e)
     instances = jx.sort(instances, [
         {"value": "markup.type.utility", "sort": -1},
         {"value": "markup.estimated_value", "sort": 1}
     ])
     return instances
Example #21
def json_schema_to_markdown(schema):
    from pyLibrary.queries import jx

    def _md_code(code):
        return "`" + code + "`"

    def _md_italic(value):
        return "*" + value + "*"

    def _inner(schema, parent_name, indent):
        more_lines = []
        for k, v in schema.items():
            full_name = concat_field(parent_name, k)
            details = indent + "* " + _md_code(full_name)
            if v.type:
                details += " - " + _md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(
                    _inner(v.properties, full_name, indent + "  "))
        return more_lines

    lines = []
    if schema.title:
        lines.append("#" + schema.title)

    lines.append(schema.description)
    lines.append("")

    for k, v in jx.sort(schema.properties.items(), 0):
        full_name = k
        if v.type in ["object", "array", "nested"]:
            lines.append("##" + _md_code(full_name) + " Property")
            if v.description:
                lines.append(v.description)
            lines.append("")

            if v.type in ["object", "array", "nested"]:
                lines.extend(_inner(v.properties, full_name, "  "))
        else:
            lines.append("##" + _md_code(full_name) + " (" + v.type + ")")
            if v.description:
                lines.append(v.description)

    return "\n".join(lines)
Example #22
def json_schema_to_markdown(schema):
    from pyLibrary.queries import jx

    def _md_code(code):
        return "`"+code+"`"

    def _md_italic(value):
        return "*"+value+"*"

    def _inner(schema, parent_name, indent):
        more_lines = []
        for k,v in schema.items():
            full_name = join_field(split_field(parent_name)+[k])
            details = indent+"* "+_md_code(full_name)
            if v.type:
                details += " - "+_md_italic(v.type)
            else:
                Log.error("{{full_name}} is missing type", full_name=full_name)
            if v.description:
                details += " " + v.description
            more_lines.append(details)

            if v.type in ["object", "array", "nested"]:
                more_lines.extend(_inner(v.properties, full_name, indent+"  "))
        return more_lines

    lines = []
    if schema.title:
        lines.append("#"+schema.title)

    lines.append(schema.description)
    lines.append("")

    for k, v in jx.sort(schema.properties.items(), 0):
        full_name = k
        if v.type in ["object", "array", "nested"]:
            lines.append("##"+_md_code(full_name)+" Property")
            if v.description:
                lines.append(v.description)
            lines.append("")

            if v.type in ["object", "array", "nested"]:
                lines.extend(_inner(v.properties, full_name, "  "))
        else:
            lines.append("##"+_md_code(full_name)+" ("+v.type+")")
            if v.description:
                lines.append(v.description)

    return "\n".join(lines)
Example #23
    def __init__(
        self,
        alias,  # NAME OF THE ALIAS
        type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
        explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
        debug=False,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        kwargs=None
    ):
        self.debug = debug
        if self.debug:
            Log.alert("Elasticsearch debugging on {{index|quote}} is on",  index= kwargs.index)
        if alias == None:
            Log.error("Alias can not be None")
        self.settings = kwargs
        self.cluster = Cluster(kwargs)

        if type == None:
            if not explore_metadata:
                Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata.  Do not know what to do now.")

            if not self.settings.alias or self.settings.alias==self.settings.index:
                alias_list = self.cluster.get("/_alias")
                candidates = (
                    [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()] +
                    [(name, Null) for name, i in alias_list.items() if self.settings.index==name]
                )
                full_name = jx.sort(candidates, 0).last()[0]
                if not full_name:
                    Log.error("No index by name of {{name}}", name=self.settings.index)
                mappings = self.cluster.get("/" + full_name + "/_mapping")[full_name]
            else:
                mappings = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index]

            # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
            max_prop = -1
            for _type, mapping in mappings.mappings.items():
                if _type == "_default_":
                    continue
                num_prop = len(mapping.properties.keys())
                if max_prop < num_prop:
                    max_prop = num_prop
                    self.settings.type = _type
                    type = _type

            if type == None:
                Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))

        self.path = "/" + alias + "/" + type
Example #24
    def get_markup(self, branch, revision, task_id=None, buildername=None, timestamp=None):
        # TRY CACHE
        if not branch or not revision:
            Log.error("expecting branch and revision")

        if self.settings.use_cache:
            if task_id:
                _filter = {"term": {"task.id": task_id}}
            else:
                _filter = {"term": {"ref_data_name": buildername}}

            query = {
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"and": [
                        _filter,
                        {"term": {"repo.branch": branch}},
                        {"prefix": {"repo.revision": revision}},
                        {"or": [
                            {"range": {"etl.timestamp": {"gte": (Date.now() - HOUR).unix}}},
                            {"range": {"job.timing.last_modified": {"lt": (Date.now() - DAY).unix}}}
                        ]}
                    ]}
                }},
                "size": 10000
            }

            try:
                docs = self.cache.search(query, timeout=120).hits.hits
            except Exception as e:
                docs = None
                Log.warning("Bad ES call, fall back to TH", cause=e)

            if not docs:
                pass
            elif len(docs) == 1:
                if DEBUG:
                    Log.note("Used ES cache to get TH details on {{value|quote}}", value=coalesce(task_id, buildername))
                return docs[0]._source
            elif timestamp == None:
                Log.error("timestamp required to find best match")
            else:
                # MISSING docs._source.job.timing.end WHEN A PLACEHOLDER WAS ADDED
                # TODO: SHOULD DELETE OVERLAPPING PLACEHOLDER RECORDS
                timestamp = Date(timestamp).unix
                best_index = jx.sort([(i, abs(coalesce(e, 0) - timestamp)) for i, e in enumerate(docs._source.job.timing.end)], 1)[0][0]
                return docs[best_index]._source
Example #25
    def __init__(
        self,
        alias,  # NAME OF THE ALIAS
        type=None,  # SCHEMA NAME, WILL HUNT FOR ONE IF None
        explore_metadata=True,  # IF PROBING THE CLUSTER FOR METADATA IS ALLOWED
        debug=False,
        timeout=None,  # NUMBER OF SECONDS TO WAIT FOR RESPONSE, OR SECONDS TO WAIT FOR DOWNLOAD (PASSED TO requests)
        kwargs=None
    ):
        self.debug = debug
        if self.debug:
            Log.alert("Elasticsearch debugging on {{index|quote}} is on",  index= kwargs.index)
        if alias == None:
            Log.error("Alias can not be None")
        self.settings = kwargs
        self.cluster = Cluster(kwargs)

        if type == None:
            if not explore_metadata:
                Log.error("Alias() was given no `type` (aka schema) and not allowed to explore metadata.  Do not know what to do now.")

            if not self.settings.alias or self.settings.alias==self.settings.index:
                alias_list = self.cluster.get("/_alias")
                candidates = (
                    [(name, i) for name, i in alias_list.items() if self.settings.index in i.aliases.keys()] +
                    [(name, Null) for name, i in alias_list.items() if self.settings.index==name]
                )
                full_name = jx.sort(candidates, 0).last()[0]
                mappings = self.cluster.get("/" + full_name + "/_mapping")[full_name]
            else:
                mappings = self.cluster.get("/"+self.settings.index+"/_mapping")[self.settings.index]

            # FIND MAPPING WITH MOST PROPERTIES (AND ASSUME THAT IS THE CANONICAL TYPE)
            max_prop = -1
            for _type, mapping in mappings.mappings.items():
                if _type == "_default_":
                    continue
                num_prop = len(mapping.properties.keys())
                if max_prop < num_prop:
                    max_prop = num_prop
                    self.settings.type = _type
                    type = _type

            if type == None:
                Log.error("Can not find schema type for index {{index}}", index=coalesce(self.settings.alias, self.settings.index))

        self.path = "/" + alias + "/" + type
Example #26
    def __init__(self, edge, query, limit):
        AggsDecoder.__init__(self, edge, query, limit)
        self.domain = edge.domain

        # WE ASSUME IF THE VARIABLES MATCH, THEN THE SORT TERM AND EDGE TERM MATCH, AND WE SORT BY TERM
        self.sorted = None
        edge_var = edge.value.vars()
        for s in query.sort:
            if not edge_var - s.value.vars():
                self.sorted = {1: "asc", -1: "desc"}[s.sort]
                domain = self.domain
                key = self.domain.key
                domain.partitions = parts = jx.sort(domain.partitions, {
                    "value": key,
                    "sort": s.sort
                })
                domain.map = {i: p for i, p in enumerate(parts)}
Example #27
    def get_columns(self, table_name, column_name=None, force=False):
        """
        RETURN METADATA COLUMNS
        """
        table_path = split_field(table_name)
        es_index_name = table_path[0]
        query_path = join_field(table_path[1:])
        table = self.get_table(es_index_name)[0]
        abs_column_name = None if column_name == None else concat_field(query_path, column_name)

        try:
            # LAST TIME WE GOT INFO FOR THIS TABLE
            if not table:
                table = Table(
                    name=es_index_name,
                    url=None,
                    query_path=None,
                    timestamp=Date.now()
                )
                with self.meta.tables.locker:
                    self.meta.tables.add(table)
                self._get_columns(table=es_index_name)
            elif force or table.timestamp == None or table.timestamp < Date.now() - MAX_COLUMN_METADATA_AGE:
                table.timestamp = Date.now()
                self._get_columns(table=es_index_name)

            with self.meta.columns.locker:
                columns = self.meta.columns.find(es_index_name, column_name)
            if columns:
                columns = jx.sort(columns, "names.\.")
                # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                while len(self.todo) and not all(columns.get("last_updated")):
                    if DEBUG:
                        Log.note("waiting for columns to update {{columns|json}}", columns=[c.es_index+"."+c.es_column for c in columns if not c.last_updated])
                    Till(seconds=1).wait()
                return columns
        except Exception as e:
            Log.error("Not expected", cause=e)

        if abs_column_name:
            Log.error("no columns matching {{table}}.{{column}}", table=table_name, column=abs_column_name)
        else:
            self._get_columns(table=table_name)  # TO TEST WHAT HAPPENED
            Log.error("no columns for {{table}}?!", table=table_name)
Example #28
 def get_columns(self, table_name, column_name=None, fail_when_not_found=False):
     """
     RETURN METADATA COLUMNS
     """
     try:
         with self.meta.columns.locker:
             columns = [c for c in self.meta.columns.data if c.table == table_name and (column_name is None or c.name==column_name)]
         if columns:
             columns = jx.sort(columns, "name")
             if fail_when_not_found:
                 # AT LEAST WAIT FOR THE COLUMNS TO UPDATE
                 while len(self.todo) and not all(columns.get("last_updated")):
                     Log.note("waiting for columns to update {{columns|json}}", columns=[c.table+"."+c.es_column for c in columns if not c.last_updated])
                     Thread.sleep(seconds=1)
                 return columns
             elif all(columns.get("last_updated")):
                 return columns
     except Exception as e:
         Log.error("Not expected", cause=e)
Example #29
    def _figure_out_start_point(self):
        # RECOVER FROM THE QUEUE
        acc = []
        while True:
            d = self.temp_queue.pop(timeout=ZERO)
            if d:
                acc.append(d)
            else:
                break
        self.temp_queue.rollback()

        if acc:
            # WAS IN THE MIDDLE OF A BATCH, FIND count
            data = acc[-1]
            today_ = data[UID_PATH].split(".")[0]
            todays_batch_count = int(data[UID_PATH].split(".")[1])
            count = todays_batch_count * BATCH_SIZE + data.etl.id + 1
            if DEBUG:
                Log.note(
                    "Next uid from queue is {{uid}}.{{count}}",
                    count=count % BATCH_SIZE,
                    uid=today_ + "." + unicode(todays_batch_count)
                )
            self.uid = UID(count)
            return

        # FIND LAST WHOLE BATCH FROM TODAY
        today_ = unicode(today())
        todays_keys = self.bucket.keys(prefix=unicode(today_))
        if not todays_keys:
            if DEBUG:
                Log.note("Next uid is {{uid}}.{{count}}", count=0, uid=today_+".0")
            self.uid = UID()
            return

        todays_batch_count = jx.sort(int(k.split(".")[1]) for k in todays_keys).last() + 1
        max_key = today_ + "." + unicode(todays_batch_count)

        if DEBUG:
            Log.note("Next uid is {{uid}}", uid=max_key)
        count = todays_batch_count * BATCH_SIZE
        self.uid = UID(count)
Example #30
    def get_metadata(self, force=False):
        if not self.settings.explore_metadata:
            Log.error("Metadata exploration has been disabled")

        if not self._metadata or force:
            response = self.get("/_cluster/state", retry={"times": 5}, timeout=3)
            with self.metadata_locker:
                self._metadata = wrap(response.metadata)
                # REPLICATE MAPPING OVER ALL ALIASES
                indices = self._metadata.indices
                for i, m in jx.sort(indices.items(), {"value": {"offset": 0}, "sort": -1}):
                    m.index = i
                    for a in m.aliases:
                        if not indices[a]:
                            indices[a] = {"index": i}
                self.cluster_state = wrap(self.get("/"))
                self.version = self.cluster_state.version.number
            return self._metadata

        return self._metadata
Example #31
    def get_index(self, alias):
        """
        RETURN THE INDEX USED BY THIS alias
        """
        alias_list = self.cluster.get_aliases()
        output = jx.sort(set([
            a.index
            for a in alias_list
            if a.alias == alias or
                a.index == alias or
                (re.match(re.escape(alias) + "\\d{8}_\\d{6}", a.index) and a.index != alias)
        ]))

        if len(output) > 1:
            Log.error("only one index with given alias==\"{{alias}}\" expected",  alias= alias)

        if not output:
            return Null

        return output.last()
Example #32
    def get_metadata(self, force=False):
        if not self.settings.explore_metadata:
            Log.error("Metadata exploration has been disabled")

        if not self._metadata or force:
            response = self.get("/_cluster/state", retry={"times": 3}, timeout=30)
            with self.metadata_locker:
                self._metadata = wrap(response.metadata)
                # REPLICATE MAPPING OVER ALL ALIASES
                indices = self._metadata.indices
                for i, m in jx.sort(indices.items(), {"value": {"offset": 0}, "sort": -1}):
                    m.index = i
                    for a in m.aliases:
                        if not indices[a]:
                            indices[a] = m
                self.cluster_state = wrap(self.get("/"))
                self.version = self.cluster_state.version.number
            return self._metadata

        return self._metadata
Example #33
    def insert_list(self, table_name, records):
        if not records:
            return

        keys = set()
        for r in records:
            keys |= set(r.keys())
        keys = jx.sort(keys)

        try:
            command = \
                "INSERT INTO " + self.quote_column(table_name) + "(" + \
                ",".join([self.quote_column(k) for k in keys]) + \
                ") VALUES " + ",\n".join([
                    "(" + ",".join([self.quote_value(r[k]) for k in keys]) + ")"
                    for r in records
                ])
            self.execute(command)
        except Exception as e:
            Log.error("problem with record: {{record}}",  record= records, cause=e)
Example #34
    def get_index(self, alias):
        """
        RETURN THE INDEX USED BY THIS alias
        """
        alias_list = self.cluster.get_aliases()
        output = jx.sort(set([
            a.index
            for a in alias_list
            if a.alias == alias or
                a.index == alias or
                (re.match(re.escape(alias) + "\\d{8}_\\d{6}", a.index) and a.index != alias)
        ]))

        if len(output) > 1:
            Log.error("only one index with given alias==\"{{alias}}\" expected",  alias= alias)

        if not output:
            return Null

        return output.last()
Example #35
    def get_schema(self, retry=True):
        if self.settings.explore_metadata:
            indices = self.cluster.get_metadata().indices
            if not self.settings.alias or self.settings.alias==self.settings.index:
                #PARTIALLY DEFINED settings
                candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
                # TODO: MERGE THE mappings OF ALL candidates, DO NOT JUST PICK THE LAST ONE

                index = "dummy value"
                schema = wrap({"_routing": {}, "properties": {}})
                for _, ind in jx.sort(candidates, {"value": 0, "sort": -1}):
                    mapping = ind.mappings[self.settings.type]
                    set_default(schema._routing, mapping._routing)
                    schema.properties = _merge_mapping(schema.properties, mapping.properties)
            else:
                #FULLY DEFINED settings
                index = indices[self.settings.index]
                schema = index.mappings[self.settings.type]

            if index == None and retry:
                #TRY AGAIN, JUST IN CASE
                self.cluster.cluster_state = None
                return self.get_schema(retry=False)

            #TODO: REMOVE THIS BUG CORRECTION
            if not schema and self.settings.type == "test_result":
                schema = index.mappings["test_results"]
            # DONE BUG CORRECTION

            if not schema:
                Log.error(
                    "ElasticSearch index ({{index}}) does not have type ({{type}})",
                    index=self.settings.index,
                    type=self.settings.type
                )
            return schema
        else:
            mapping = self.cluster.get(self.path + "/_mapping")
            if not mapping[self.settings.type]:
                Log.error("{{index}} does not have type {{type}}", self.settings)
            return wrap({"mappings": mapping[self.settings.type]})
Example #36
    def get_schema(self, retry=True):
        if self.settings.explore_metadata:
            indices = self.cluster.get_metadata().indices
            if not self.settings.alias or self.settings.alias==self.settings.index:
                #PARTIALLY DEFINED settings
                candidates = [(name, i) for name, i in indices.items() if self.settings.index in i.aliases]
                # TODO: MERGE THE mappings OF ALL candidates, DO NOT JUST PICK THE LAST ONE

                index = "dummy value"
                schema = wrap({"_routing": {}, "properties": {}})
                for _, ind in jx.sort(candidates, {"value": 0, "sort": -1}):
                    mapping = ind.mappings[self.settings.type]
                    set_default(schema._routing, mapping._routing)
                    schema.properties = _merge_mapping(schema.properties, mapping.properties)
            else:
                #FULLY DEFINED settings
                index = indices[self.settings.index]
                schema = index.mappings[self.settings.type]

            if index == None and retry:
                #TRY AGAIN, JUST IN CASE
                self.cluster.cluster_state = None
                return self.get_schema(retry=False)

            #TODO: REMOVE THIS BUG CORRECTION
            if not schema and self.settings.type == "test_result":
                schema = index.mappings["test_results"]
            # DONE BUG CORRECTION

            if not schema:
                Log.error(
                    "ElasticSearch index ({{index}}) does not have type ({{type}})",
                    index=self.settings.index,
                    type=self.settings.type
                )
            return schema
        else:
            mapping = self.cluster.get(self.path + "/_mapping")
            if not mapping[self.settings.type]:
                Log.error("{{index}} does not have type {{type}}", self.settings)
            return wrap({"mappings": mapping[self.settings.type]})
Example #37
    def save_money(self, remaining_budget, net_new_utility):
        remove_spot_requests = wrap([])

        # FIRST CANCEL THE PENDING REQUESTS
        if remaining_budget < 0:
            requests = self._get_managed_spot_requests()
            for r in requests:
                if r.status.code in PENDING_STATUS_CODES | PROBABLY_NOT_FOR_A_WHILE | MIGHT_HAPPEN:
                    remove_spot_requests.append(r.id)
                    net_new_utility += self.settings.utility[r.launch_specification.instance_type].utility
                    remaining_budget += r.price

        instances = jx.sort(self.running_instances(), "markup.estimated_value")

        remove_list = wrap([])
        for s in instances:
            if remaining_budget >= 0:
                break
            remove_list.append(s)
            net_new_utility += coalesce(s.markup.type.utility, 0)
            remaining_budget += coalesce(s.request.bid_price, s.markup.price_80, s.markup.current_price)

        if not remove_list:
            return remaining_budget, net_new_utility

        # SEND SHUTDOWN TO EACH INSTANCE
        Log.warning("Shutdown {{instances}} to save money!", instances=remove_list.id)
        for i in remove_list:
            try:
                self.instance_manager.teardown(i)
            except Exception as e:
                Log.warning("Teardown of {{id}} failed", id=i.id, cause=e)

        remove_spot_requests.extend(remove_list.spot_instance_request_id)

        # TERMINATE INSTANCES
        self.ec2_conn.terminate_instances(instance_ids=remove_list.id)

        # TERMINATE SPOT REQUESTS
        self.ec2_conn.cancel_spot_instance_requests(request_ids=remove_spot_requests)
        return remaining_budget, net_new_utility
Example #38
    def insert_list(self, table_name, records):
        if not records:
            return

        keys = set()
        for r in records:
            keys |= set(r.keys())
        keys = jx.sort(keys)

        try:
            command = \
                "INSERT INTO " + self.quote_column(table_name) + "(" + \
                ",".join([self.quote_column(k) for k in keys]) + \
                ") VALUES " + ",\n".join([
                    "(" + ",".join([self.quote_value(r[k]) for k in keys]) + ")"
                    for r in records
                ])
            self.execute(command)
        except Exception as e:
            Log.error("problem with record: {{record}}",
                      record=records,
                      cause=e)
Example #39
def loop_all_days(destination, please_stop):
    try:
        today = Date.today()

        # WHICH DAYS DO WE NEED TO CALCULATE
        # ALL BUILD DATES WITH ETL TIMESTAMP OF A WEEK AGO
        # ALL BUILD DATES THAT HAVE NOT BEEN PROCESSED YET
        build_dates = http.post_json(config.source.url, json={
            "from": "unittest",
            "edges": [
                {
                    "name": "date",
                    "value": "build.date",
                    "allowNulls": False,
                    "domain": {
                        "type": "time",
                        "min": "today-week",
                        "max": "eod",
                        "interval": "day"
                    }
                }
            ],
            "where": {"gte": {"etl.timestamp": (today - WEEK).unix}},
            "sort": {"value": "build.date", "sort": -1},
            "limit": 14,
            "format": "list"
        })

        build_dates.data = jx.sort(build_dates.data, {"value": "date", "sort": -1})

        for d in build_dates.data:
            if please_stop:
                return
            agg(Date(d.date), destination, please_stop=please_stop)
    finally:
        please_stop.go()
Example #40
def int_list_packer(term, values):
    """
    return singletons, ranges and exclusions
    """
    DENSITY = 10  # a range can have holes, this is inverse of the hole density
    MIN_RANGE = 20  # min members before a range is allowed to be used

    singletons = set()
    ranges = []
    exclude = set()

    sorted = jx.sort(values)

    last = sorted[0]
    curr_start = last
    curr_excl = set()

    for v in sorted[1::]:
        if v <= last + 1:
            pass
        elif v - last > 3:
            # big step, how do we deal with it?
            if last == curr_start:
                # not a range yet, so just add as singleton
                singletons.add(last)
            elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
                # small ranges are singletons, sparse ranges are singletons
                singletons |= set(range(curr_start, last + 1))
                singletons -= curr_excl
            else:
                # big enough, and dense enough range
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
            curr_start = v
            curr_excl = set()
        else:
            if 1 + last - curr_start >= len(curr_excl) * DENSITY:
                # high density, keep track of excluded and continue
                add_me = set(range(last + 1, v))
                curr_excl |= add_me
            elif 1 + last - curr_start - len(curr_excl) < MIN_RANGE:
                # not big enough, convert range to singletons
                new_singles = set(range(curr_start, last + 1)) - curr_excl
                singletons = singletons | new_singles

                curr_start = v
                curr_excl = set()
            else:
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
                curr_start = v
                curr_excl = set()
        last = v

    if last == curr_start:
        # not a range yet, so just add as singleton
        singletons.add(last)
    elif last - curr_start - len(curr_excl) < MIN_RANGE or ((last - curr_start) < len(curr_excl) * DENSITY):
        # small ranges are singletons, sparse ranges are singletons
        singletons |= set(range(curr_start, last + 1))
        singletons -= curr_excl
    else:
        # big enough, and dense enough range
        ranges.append({"gte": curr_start, "lte": last})
        exclude |= curr_excl

    if ranges:
        r = {"or": [{"range": {term: r}} for r in ranges]}
        if exclude:
            r = {"and": [r, {"not": {"terms": {term: jx.sort(exclude)}}}]}
        if singletons:
            return {"or": [
                {"terms": {term: jx.sort(singletons)}},
                r
            ]}
        else:
            return r
    else:
        raise Except("no packing possible")
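To make the return shape described in the docstring concrete, a small hedged example follows. The field name and input values are invented, and the exact filter depends on the DENSITY and MIN_RANGE thresholds above.

# HYPOTHETICAL INPUT: ONE DENSE RUN OF ids PLUS TWO STRAGGLERS
values = list(range(1000, 1100)) + [7, 2000]
es_filter = int_list_packer("build.id", values)
# EXPECTED SHAPE, GIVEN THE THRESHOLDS ABOVE:
# {"or": [
#     {"terms": {"build.id": [7, 2000]}},
#     {"or": [{"range": {"build.id": {"gte": 1000, "lte": 1099}}}]}
# ]}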
Example #41
def process_batch(todo, coverage_index, coverage_summary_index, settings, please_stop):
    for not_summarized in todo:
        if please_stop:
            return True

        # IS THERE MORE THAN ONE COVERAGE FILE FOR THIS REVISION?
        Log.note("Find dups for file {{file}}", file=not_summarized.source.file.name)
        dups = http.post_json(settings.url, json={
            "from": "coverage",
            "select": [
                {"name": "max_id", "value": "etl.source.id", "aggregate": "max"},
                {"name": "min_id", "value": "etl.source.id", "aggregate": "min"}
            ],
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url"
            ],
            "limit": 100000,
            "format": "list"
        })

        dups_found = False
        for d in dups.data:
            if d.max_id != d.min_id:
                dups_found = True

                Log.note(
                    "removing dups {{details|json}}\n{{dups|json|indent}}",
                    details={
                        "id": int(d.max_id),
                        "test": d.test.url,
                        "source": not_summarized.source.file.name,
                        "revision": not_summarized.build.revision12
                    }
                )

                # FIND ALL INDEXES
                all_indexes = [
                    p.index
                    for p in coverage_index.cluster.get_aliases()
                    if p.alias == coverage_index.settings.alias
                ]
                for index_name in all_indexes:
                    elasticsearch.Index(index=index_name, read_only=False, cluster=coverage_index.cluster).delete_record({"and": [
                        {"not": {"term": {"etl.source.id": int(d.max_id)}}},
                        {"term": {"test.url": d.test.url}},
                        {"term": {"source.file.name": not_summarized.source.file.name}},
                        {"term": {"build.revision12": not_summarized.build.revision12}}
                    ]})
        if dups_found:
            continue

        # LIST ALL TESTS THAT COVER THIS FILE, AND THE LINES COVERED
        test_count = http.post_json(settings.url, json={
            "from": "coverage.source.file.covered",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url",
                "line"
            ],
            "limit": 100000,
            "format": "list"
        })

        all_tests_covering_file = UNION(test_count.data.get("test.url"))
        num_tests = len(all_tests_covering_file)
        max_siblings = num_tests - 1
        Log.note(
            "{{filename}} rev {{revision}} is covered by {{num}} tests",
            filename=not_summarized.source.file.name,
            num=num_tests,
            revision=not_summarized.build.revision12
        )
        line_summary = list(
            (k, unwrap(wrap(list(v)).get("test.url")))
            for k, v in jx.groupby(test_count.data, keys="line")
        )

        # PULL THE RAW RECORD FOR MODIFICATION
        file_level_coverage_records = http.post_json(settings.url, json={
            "from": "coverage",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"in": {"test.url": all_tests_covering_file}},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }}
            ]},
            "limit": 100000,
            "format": "list"
        })

        for test_name in all_tests_covering_file:
            siblings = [len(test_names)-1 for g, test_names in line_summary if test_name in test_names]
            min_siblings = MIN(siblings)
            coverage_candidates = jx.filter(file_level_coverage_records.data, lambda row, rownum, rows: row.test.url == test_name)
            if coverage_candidates:

                if len(coverage_candidates) > 1 and any(coverage_candidates[0]._id != c._id for c in coverage_candidates):
                    Log.warning(
                        "Duplicate coverage\n{{cov|json|indent}}",
                        cov=[{"_id": c._id, "run": c.run, "test": c.test} for c in coverage_candidates]
                    )

                # MORE THAN ONE COVERAGE CANDIDATE CAN HAPPEN WHEN THE SAME TEST IS IN TWO DIFFERENT CHUNKS OF THE SAME SUITE
                for coverage_record in coverage_candidates:
                    coverage_record.source.file.max_test_siblings = max_siblings
                    coverage_record.source.file.min_line_siblings = min_siblings
                    coverage_record.source.file.score = (max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
            else:
                example = http.post_json(settings.url, json={
                    "from": "coverage",
                    "where": {"eq": {
                        "test.url": test_name,
                        "source.file.name": not_summarized.source.file.name,
                        "build.revision12": not_summarized.build.revision12
                    }},
                    "limit": 1,
                    "format": "list"
                })

                Log.warning(
                    "{{test|quote}} rev {{revision}} appears to have no coverage for {{file|quote}}!\n{{example|json|indent}}",
                    test=test_name,
                    file=not_summarized.source.file.name,
                    revision=not_summarized.build.revision12,
                    example=example.data[0]
                )

        bad_example = [d for d in file_level_coverage_records.data if d["source.file.min_line_siblings"] == None]
        if bad_example:
            Log.warning("expecting all records to have summary. Example:\n{{example}}", example=bad_example[0])

        rows = [{"id": d._id, "value": d} for d in file_level_coverage_records.data]
        coverage_summary_index.extend(rows)
        coverage_index.extend(rows)

        all_test_summary = []
        for g, records in jx.groupby(file_level_coverage_records.data, "source.file.name"):
            cov = UNION(records.source.file.covered)
            uncov = UNION(records.source.file.uncovered)
            coverage = {
                "_id": "|".join([records[0].build.revision12, g["source.file.name"]]),  # SOMETHING UNIQUE, IN CASE WE RECALCULATE
                "source": {
                    "file": {
                        "name": g["source.file.name"],
                        "is_file": True,
                        "covered": jx.sort(cov, "line"),
                        "uncovered": jx.sort(uncov),
                        "total_covered": len(cov),
                        "total_uncovered": len(uncov),
                        "min_line_siblings": 0  # PLACEHOLDER TO INDICATE DONE
                    }
                },
                "build": records[0].build,
                "repo": records[0].repo,
                "run": records[0].run,
                "etl": {"timestamp": Date.now()}
            }
            all_test_summary.append(coverage)

        sum_rows = [{"id": d["_id"], "value": d} for d in all_test_summary]
        coverage_summary_index.extend(sum_rows)

        if DEBUG:
            coverage_index.refresh()
            todo = http.post_json(settings.url, json={
                "from": "coverage",
                "where": {"and": [
                    {"missing": "source.method.name"},
                    {"missing": "source.file.min_line_siblings"},
                    {"eq": {"source.file.name": not_summarized.source.file.name}},
                    {"eq": {"build.revision12": not_summarized.build.revision12}}
                ]},
                "format": "list",
                "limit": 10
            })
            if todo.data:
                Log.error("Failure to update")
Example #42
 def sort(self, sort):
     return ListContainer("from "+self.name, jx.sort(self.data, sort, already_normalized=True), self.schema)
Example #43
    def _edges_op(self, query, frum):
        index_to_column = {}  # MAP FROM INDEX TO COLUMN (OR SELECT CLAUSE)
        outer_selects = [
        ]  # EVERY SELECT CLAUSE (NOT TO BE USED ON ALL TABLES, OF COURSE)
        tables = []
        base_table = split_field(frum)[0]
        path = join_field(split_field(frum)[1:])
        nest_to_alias = {
            nested_path: "__" + unichr(ord('a') + i) + "__"
            for i, (nested_path,
                    sub_table) in enumerate(self.nested_tables.items())
        }

        columns = self._get_sql_schema(frum)

        tables = []
        for n, a in nest_to_alias.items():
            if startswith_field(path, n):
                tables.append({"nest": n, "alias": a})
        tables = jx.sort(tables, {"value": {"length": "nest"}})

        from_sql = join_field(
            [base_table] + split_field(tables[0].nest)) + " " + tables[0].alias
        previous = tables[0]
        for t in tables[1::]:
            from_sql += "\nLEFT JOIN\n" + join_field(
                [base_table] + split_field(t.nest)
            ) + " " + t.alias + " ON " + t.alias + "." + PARENT + " = " + previous.alias + "." + GUID

        # SHIFT THE COLUMN DEFINITIONS BASED ON THE NESTED QUERY DEPTH
        ons = []
        join_types = []
        wheres = []
        not_ons = ["__exists__ IS NULL"]
        groupby = []
        not_groupby = []
        orderby = []
        domains = []
        select_clause = [
            "1 __exists__"  # USED TO DISTINGUISH BETWEEN NULL-BECAUSE-LEFT-JOIN AND NULL-BECAUSE-NULL-VALUE
        ]

        for edge_index, query_edge in enumerate(query.edges):
            edge_alias = "e" + unicode(edge_index)

            if query_edge.value:
                edge_values = [
                    p for c in query_edge.value.to_sql(self).sql
                    for p in c.items()
                ]
            elif not query_edge.value and any(
                    query_edge.domain.partitions.where):
                case = "CASE "
                for pp, p in enumerate(query_edge.domain.partitions):
                    w = p.where.to_sql(self)[0].sql.b
                    t = quote_value(pp)
                    case += " WHEN " + w + " THEN " + t
                case += " ELSE NULL END "
                edge_values = [("n", case)]
            elif query_edge.range:
                edge_values = query_edge.range.min.to_sql(self)[0].sql.items(
                ) + query_edge.range.max.to_sql(self)[0].sql.items()
Example #44
def _es_terms2(es, mvel, query):
    """
    WE ASSUME THERE ARE JUST TWO EDGES, AND EACH HAS A SIMPLE value
    """

    # REQUEST VALUES IN FIRST DIMENSION
    q1 = query.copy()
    q1.edges = query.edges[0:1:]
    values1 = es_terms(es, mvel, q1).edges[0].domain.partitions.value

    select = listwrap(query.select)
    FromES = build_es_query(query)
    for s in select:
        for i, v in enumerate(values1):
            FromES.facets[s.name + "," + str(i)] = {
                "terms": {
                    "field": query.edges[1].value,
                    "size": coalesce(query.limit, 200000)
                },
                "facet_filter":
                simplify_esfilter({
                    "and": [query.where, {
                        "term": {
                            query.edges[0].value: v
                        }
                    }]
                })
            }

    data = es09.util.post(es, FromES, query.limit)

    # UNION ALL TERMS FROM SECOND DIMENSION
    values2 = set()
    for k, f in data.facets.items():
        values2.update(f.terms.term)
    values2 = jx.sort(values2)
    term2index = {v: i for i, v in enumerate(values2)}
    query.edges[1].domain.partitions = FlatList([{
        "name": v,
        "value": v
    } for v in values2])

    # MAKE CUBE
    output = {}
    dims = [len(values1), len(values2)]
    for s in select:
        output[s.name] = Matrix(*dims)

    # FILL CUBE
    # EXPECTING ONLY SELECT CLAUSE FACETS
    for facetName, facet in data.facets.items():
        coord = facetName.split(",")
        s = [s for s in select if s.name == coord[0]][0]
        i1 = int(coord[1])
        for term in facet.terms:
            i2 = term2index[term.term]
            output[s.name][(i1, i2)] = term[aggregates[s.aggregate]]

    cube = Cube(query.select, query.edges, output)
    cube.query = query
    return cube
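Each facet name above encodes a coordinate as "<select name>,<index into values1>", and the facet's terms supply the second coordinate. A rough sketch of that unpacking with plain dicts (hypothetical facet data, no Matrix or es09 involved):

# hypothetical facet results: the name is "<select>,<i1>", terms map edge-2 values to counts
facets = {
    "count,0": {"x": 3, "y": 1},
    "count,1": {"y": 7},
}
values1 = ["a", "b"]  # first edge, one facet per value
values2 = sorted({t for f in facets.values() for t in f})  # union of all second-edge terms
term2index = {v: i for i, v in enumerate(values2)}

# fill a dense 2D grid, one cell per (edge1, edge2) pair
output = [[None] * len(values2) for _ in values1]
for facet_name, terms in facets.items():
    name, i1 = facet_name.split(",")
    for term, value in terms.items():
        output[int(i1)][term2index[term]] = value

print(output)  # [[3, 1], [None, 7]]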
Example #45
                except Exception, _:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception, _:
                    pass

    elif result.meta.format == "cube" and len(
            result.edges
    ) == 1 and result.edges[0].name == "rownum" and not query.sort:
        header = list(result.data.keys())

        result.data = cube2list(result.data)
        result.data = jx.sort(result.data, header)
        result.data = list2cube(result.data, header)

        expect.data = cube2list(expect.data)
        expect.data = jx.sort(expect.data, header)
        expect.data = list2cube(expect.data, header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=6)


def cube2list(c):
    rows = zip(*[[(k, v) for v in a] for k, a in c.items()])
    rows = [dict(r) for r in rows]
    return rows
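A tiny worked call (hypothetical input) makes the transposition concrete: each key of the cube becomes a column, and position i across all columns becomes row i.

# hypothetical "cube": column name -> array of values, all the same length
c = {"a": [1, 2, 3], "b": ["x", "y", "z"]}
print(cube2list(c))
# three row dicts: {"a": 1, "b": "x"}, {"a": 2, "b": "y"}, {"a": 3, "b": "z"}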
Example #46
 def done_count(self):
     self.edge.domain = self.domain = SimpleSetDomain(
         partitions=jx.sort(set(self.parts))
     )
     self.parts = None
     self.computed_domain = True
Example #47
    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in ["object", "nested"]:
            Log.error("not supported")
        try:
            if c.table == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[c.es_column]
                        for g, _ in jx.groupby(self.meta.columns, c.es_column)
                        if g[c.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "table": c.table,
                                "es_column": c.es_column
                            }
                        }
                    })
                return
            if c.table == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([
                        g[c.es_column]
                        for g, _ in jx.groupby(self.meta.tables, c.es_column)
                        if g[c.es_column] != None
                    ])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {
                            "eq": {
                                "table": c.table,
                                "name": c.name
                            }
                        }
                    })
                return

            es_index = c.table.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search",
                                          data={
                                              "aggs": {
                                                  c.name: _counting_query(c)
                                              },
                                              "size": 0
                                          })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value,
                                   0 if r.doc_count == 0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Dict(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count
                                      ) or (count >= 1000
                                            and cardinality / count > 0.99):
                Log.note("{{table}}.{{field}} has {{num}} parts",
                         table=c.table,
                         field=c.es_column,
                         num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": c.es_index,
                                "es_column": c.es_column
                            }
                        }
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                Log.note("{{field}} has {{num}} parts",
                         field=c.name,
                         num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": c.es_index,
                                "es_column": c.es_column
                            }
                        }
                    })
                return
            elif c.nested_path:
                query.aggs[literal_field(c.name)] = {
                    "nested": {
                        "path": listwrap(c.nested_path)[0]
                    },
                    "aggs": {
                        "_nested": {
                            "terms": {
                                "field": c.es_column,
                                "size": 0
                            }
                        }
                    }
                }
            else:
                query.aggs[literal_field(c.name)] = {
                    "terms": {
                        "field": c.es_column,
                        "size": 0
                    }
                }

            result = self.default_es.post("/" + es_index + "/_search",
                                          data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {
                        "eq": {
                            "es_index": c.es_index,
                            "es_column": c.es_column
                        }
                    }
                })
        except Exception, e:
            if "IndexMissingException" in e and c.table.startswith(
                    TEST_TABLE_PREFIX):
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {
                            "eq": {
                                "es_index": c.es_index,
                                "es_column": c.es_column
                            }
                        }
                    })
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {
                        "eq": {
                            "table": c.table,
                            "es_column": c.es_column
                        }
                    }
                })
                Log.warning(
                    "Could not get {{col.table}}.{{col.es_column}} info",
                    col=c,
                    cause=e)
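The thresholds above amount to a rule for when a column's distinct values are worth keeping as partitions: very high cardinality, id-like columns (cardinality equal to the row count), nearly-unique columns on large tables, and wide numeric columns are all skipped. A minimal sketch of that rule in isolation (plain function; the is_numeric flag stands in for the ES numeric-type check):

def should_store_partitions(count, cardinality, is_numeric):
    # too many distinct values to be useful as an "edge"
    if cardinality > 1000:
        return False
    # every row is distinct, probably an id-like column
    if count >= 30 and cardinality == count:
        return False
    # nearly unique on a large table
    if count >= 1000 and float(cardinality) / count > 0.99:
        return False
    # numeric columns with many values are better treated as ranges
    if is_numeric and cardinality > 30:
        return False
    return True

print(should_store_partitions(count=500, cardinality=12, is_numeric=False))     # True
print(should_store_partitions(count=5000, cardinality=4999, is_numeric=False))  # False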
Example #48
    def _update_cardinality(self, c):
        """
        QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
        """
        if c.type in STRUCT:
            Log.error("not supported")
        try:
            if c.table == "meta.columns":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.columns),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "es_column": c.es_column}}
                    })
                return
            if c.table == "meta.tables":
                with self.meta.columns.locker:
                    partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                    self.meta.columns.update({
                        "set": {
                            "partitions": partitions,
                            "count": len(self.meta.tables),
                            "cardinality": len(partitions),
                            "last_updated": Date.now()
                        },
                        "where": {"eq": {"table": c.table, "name": c.name}}
                    })
                return

            es_index = c.table.split(".")[0]
            result = self.default_es.post("/" + es_index + "/_search", data={
                "aggs": {c.name: _counting_query(c)},
                "size": 0
            })
            r = result.aggregations.values()[0]
            count = result.hits.total
            cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count==0 else None)
            if cardinality == None:
                Log.error("logic error")

            query = Dict(size=0)
            if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
                Log.note("{{table}}.{{field}} has {{num}} parts", table=c.table, field=c.es_column, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
                Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": count,
                            "cardinality": cardinality,
                            "last_updated": Date.now()
                        },
                        "clear": ["partitions"],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
                return
            elif c.nested_path:
                query.aggs[literal_field(c.name)] = {
                    "nested": {"path": listwrap(c.nested_path)[0]},
                    "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
                }
            else:
                query.aggs[literal_field(c.name)] = {"terms": {"field": c.es_column, "size": 0}}

            result = self.default_es.post("/" + es_index + "/_search", data=query)

            aggs = result.aggregations.values()[0]
            if aggs._nested:
                parts = jx.sort(aggs._nested.buckets.key)
            else:
                parts = jx.sort(aggs.buckets.key)

            Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "partitions": parts,
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        except Exception, e:
            if "IndexMissingException" in e and c.table.startswith(TEST_TABLE_PREFIX):
                with self.meta.columns.locker:
                    self.meta.columns.update({
                        "set": {
                            "count": 0,
                            "cardinality": 0,
                            "last_updated": Date.now()
                        },
                        "clear":[
                            "partitions"
                        ],
                        "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                    })
            else:
                self.meta.columns.update({
                    "set": {
                        "last_updated": Date.now()
                    },
                    "clear": [
                        "count",
                        "cardinality",
                        "partitions",
                    ],
                    "where": {"eq": {"table": c.table, "es_column": c.es_column}}
                })
                Log.warning("Could not get {{col.table}}.{{col.es_column}} info", col=c, cause=e)
Example #49
 def sort(self, sort):
     return ListContainer("from " + self.name,
                          jx.sort(self.data, sort, already_normalized=True),
                          self.schema)
Example #50
def int_list_packer(term, values):
    """
    return singletons, ranges and exclusions
    """
    DENSITY = 10  # a range can have holes; this is the inverse of the maximum hole density
    MIN_RANGE = 20  # min members before a range is allowed to be used

    singletons = set()
    ranges = []
    exclude = set()

    sorted = jx.sort(values)

    last = sorted[0]
    curr_start = last
    curr_excl = set()

    for v in sorted[1::]:
        if v <= last + 1:
            pass
        elif v - last > 3:
            # big step, how do we deal with it?
            if last == curr_start:
                # not a range yet, so just add as singleton
                singletons.add(last)
            elif last - curr_start - len(curr_excl) < MIN_RANGE or (
                (last - curr_start) < len(curr_excl) * DENSITY):
                # small ranges are singletons, sparse ranges are singletons
                singletons |= set(range(curr_start, last + 1))
                singletons -= curr_excl
            else:
                # big enough, and dense enough range
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
            curr_start = v
            curr_excl = set()
        else:
            if 1 + last - curr_start >= len(curr_excl) * DENSITY:
                # high density, keep track of excluded and continue
                add_me = set(range(last + 1, v))
                curr_excl |= add_me
            elif 1 + last - curr_start - len(curr_excl) < MIN_RANGE:
                # not big enough, convert range to singletons
                new_singles = set(range(curr_start, last + 1)) - curr_excl
                singletons = singletons | new_singles

                curr_start = v
                curr_excl = set()
            else:
                ranges.append({"gte": curr_start, "lte": last})
                exclude |= curr_excl
                curr_start = v
                curr_excl = set()
        last = v

    if last == curr_start:
        # not a range yet, so just add as singleton
        singletons.add(last)
    elif last - curr_start - len(curr_excl) < MIN_RANGE or (
        (last - curr_start) < len(curr_excl) * DENSITY):
        # small ranges are singletons, sparse ranges are singletons
        singletons |= set(range(curr_start, last + 1))
        singletons -= curr_excl
    else:
        # big enough, and dense enough range
        ranges.append({"gte": curr_start, "lte": last})
        exclude |= curr_excl

    if ranges:
        r = {"or": [{"range": {term: r}} for r in ranges]}
        if exclude:
            r = {"and": [r, {"not": {"terms": {term: jx.sort(exclude)}}}]}
        if singletons:
            return {"or": [{"terms": {term: jx.sort(singletons)}}, r]}
        else:
            return r
    else:
        raise Except("no packing possible")
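To show the intent without the density and exclusion bookkeeping above, here is a much simpler sketch that only collapses strictly consecutive integers into range filters and leaves the rest as terms; it approximates the idea and is not a substitute for int_list_packer:

def simple_int_packer(term, values, min_range=20):
    # collapse runs of consecutive integers, then emit range/terms filters
    values = sorted(set(values))
    runs, start, prev = [], values[0], values[0]
    for v in values[1:]:
        if v == prev + 1:
            prev = v
            continue
        runs.append((start, prev))
        start = prev = v
    runs.append((start, prev))

    singletons, ranges = [], []
    for lo, hi in runs:
        if hi - lo + 1 >= min_range:
            ranges.append({"range": {term: {"gte": lo, "lte": hi}}})
        else:
            singletons.extend(range(lo, hi + 1))

    clauses = ranges + ([{"terms": {term: singletons}}] if singletons else [])
    return clauses[0] if len(clauses) == 1 else {"or": clauses}

print(simple_int_packer("bug_id", [1, 2, 3, 99] + list(range(500, 540))))
# {"or": [{"range": {"bug_id": {"gte": 500, "lte": 539}}}, {"terms": {"bug_id": [1, 2, 3, 99]}}]}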
Example #51
def loop(source, coverage_summary_index, settings, please_stop):
    try:
        cluster = elasticsearch.Cluster(source)
        aliases = cluster.get_aliases()
        candidates = []
        for pairs in aliases:
            if pairs.alias == source.index:
                candidates.append(pairs.index)
        candidates = jx.sort(candidates, {".": "desc"})

        for index_name in candidates:
            coverage_index = elasticsearch.Index(index=index_name, read_only=False, settings=source)
            push_date_filter = unicode2Date(coverage_index.settings.index[-15::], elasticsearch.INDEX_DATE_FORMAT)

            while not please_stop:
                # IDENTIFY NEW WORK
                Log.note("Working on index {{index}}", index=index_name)
                coverage_index.refresh()

                todo = http.post_json(settings.url, json={
                    "from": "coverage",
                    "groupby": ["source.file.name", "build.revision12"],
                    "where": {"and": [
                        {"missing": "source.method.name"},
                        {"missing": "source.file.min_line_siblings"},
                        {"gte": {"repo.push.date": push_date_filter}}
                    ]},
                    "format": "list",
                    "limit": coalesce(settings.batch_size, 100)
                })

                if not todo.data:
                    break

                queue = Queue("pending source files to review")
                queue.extend(todo.data[0:coalesce(settings.batch_size, 100):])

                threads = [
                    Thread.run(
                        "processor" + unicode(i),
                        process_batch,
                        queue,
                        coverage_index,
                        coverage_summary_index,
                        settings,
                        please_stop=please_stop
                    )
                    for i in range(NUM_THREAD)
                ]

                # ADD STOP MESSAGE
                queue.add(Thread.STOP)

                # WAIT FOR THEM TO COMPLETE
                for t in threads:
                    t.join()

        please_stop.go()
        return

    except Exception, e:
        Log.warning("Problem processing", cause=e)
Example #52
            if isinstance(expect.data, list):
                try:
                    expect.data = jx.sort(expect.data, sort_order.name)
                except Exception, _:
                    pass

            if isinstance(result.data, list):
                try:
                    result.data = jx.sort(result.data, sort_order.name)
                except Exception, _:
                    pass

    elif result.meta.format == "cube" and len(result.edges) == 1 and result.edges[0].name == "rownum" and not query.sort:
        result_data, result_header = cube2list(result.data)
        result_data = unwrap(jx.sort(result_data, result_header))
        result.data = list2cube(result_data, result_header)

        expect_data, expect_header = cube2list(expect.data)
        expect_data = jx.sort(expect_data, expect_header)
        expect.data = list2cube(expect_data, expect_header)

    # CONFIRM MATCH
    assertAlmostEqual(result, expect, places=places)


def cube2list(cube):
    """
    RETURNS header SO THAT THE ORIGINAL CUBE CAN BE RECREATED
    :param cube: A dict WITH VALUES BEING A MULTIDIMENSIONAL ARRAY OF UNIFORM VALUES
    :return: (rows, header) TUPLE
Example #53
    def query(self, query):
        """
        :param query:  JSON Query Expression, SET `format="container"` TO MAKE NEW TABLE OF RESULT
        :return:
        """
        if not startswith_field(query['from'], self.name):
            Log.error("Expecting table, or some nested table")
        frum, query['from'] = query['from'], self
        query = QueryOp.wrap(query, self.columns)

        # TYPE CONFLICTS MUST NOW BE RESOLVED DURING
        # TYPE-SPECIFIC QUERY NORMALIZATION
        # vars_ = query.vars(exclude_select=True)
        # type_map = {
        #     v: c.es_column
        #     for v in vars_
        #     if v in self.columns and len([c for c in self.columns[v] if c.type != "nested"]) == 1
        #     for c in self.columns[v]
        #     if c.type != "nested"
        # }
        #
        # sql_query = query.map(type_map)
        query = query

        new_table = "temp_" + unique_name()

        if query.format == "container":
            create_table = "CREATE TABLE " + quote_table(new_table) + " AS "
        else:
            create_table = ""

        if query.groupby:
            op, index_to_columns = self._groupby_op(query, frum)
            command = create_table + op
        elif query.edges or any(a != "none"
                                for a in listwrap(query.select).aggregate):
            op, index_to_columns = self._edges_op(query, frum)
            command = create_table + op
        else:
            op = self._set_op(query, frum)
            return op

        if query.sort:
            command += "\nORDER BY " + ",\n".join(
                "(" + sql[t] + ") IS NULL" +
                (" DESC" if s.sort == -1 else "") + ",\n" + sql[t] +
                (" DESC" if s.sort == -1 else "")
                for s, sql in [(s, s.value.to_sql(self)[0].sql)
                               for s in query.sort] for t in "bns" if sql[t])

        result = self.db.query(command)

        column_names = query.edges.name + query.groupby.name + listwrap(
            query.select).name
        if query.format == "container":
            output = QueryTable(new_table,
                                db=self.db,
                                uid=self.uid,
                                exists=True)
        elif query.format == "cube" or (not query.format and query.edges):
            if len(query.edges) == 0 and len(query.groupby) == 0:
                data = {n: Data() for n in column_names}
                for s in index_to_columns.values():
                    data[s.push_name][s.push_child] = unwrap(
                        s.pull(result.data[0]))
                return Data(data=unwrap(data), meta={"format": "cube"})

            if not result.data:
                edges = []
                dims = []
                for i, e in enumerate(query.edges + query.groupby):
                    allowNulls = coalesce(e.allowNulls, True)

                    if e.domain.type == "set" and e.domain.partitions:
                        domain = SimpleSetDomain(
                            partitions=e.domain.partitions.name)
                    elif e.domain.type == "range":
                        domain = e.domain
                    elif isinstance(e.value, TupleOp):
                        pulls = jx.sort([
                            c for c in index_to_columns.values()
                            if c.push_name == e.name
                        ], "push_child").pull
                        parts = [
                            tuple(p(d) for p in pulls) for d in result.data
                        ]
                        domain = SimpleSetDomain(
                            partitions=jx.sort(set(parts)))
                    else:
                        domain = SimpleSetDomain(partitions=[])

                    dims.append(1 if allowNulls else 0)
                    edges.append(
                        Data(name=e.name, allowNulls=allowNulls,
                             domain=domain))

                zeros = [
                    0 if s.aggregate == "count"
                    and index_to_columns[si].push_child == "." else Data
                    for si, s in enumerate(listwrap(query.select))
                ]
                data = {
                    s.name: Matrix(dims=dims, zeros=zeros[si])
                    for si, s in enumerate(listwrap(query.select))
                }

                if isinstance(query.select, list):
                    select = [{"name": s.name} for s in query.select]
                else:
                    select = {"name": query.select.name}

                return Data(meta={"format": "cube"},
                            edges=edges,
                            select=select,
                            data={k: v.cube
                                  for k, v in data.items()})

            columns = None

            edges = []
            dims = []
            for g in query.groupby:
                g.is_groupby = True

            for i, e in enumerate(query.edges + query.groupby):
                allowNulls = coalesce(e.allowNulls, True)

                if e.domain.type == "set" and e.domain.partitions:
                    domain = SimpleSetDomain(
                        partitions=e.domain.partitions.name)
                elif e.domain.type == "range":
                    domain = e.domain
                elif e.domain.type == "time":
                    domain = wrap(mo_json.scrub(e.domain))
                elif e.domain.type == "duration":
                    domain = wrap(mo_json.scrub(e.domain))
                elif isinstance(e.value, TupleOp):
                    pulls = jx.sort([
                        c for c in index_to_columns.values()
                        if c.push_name == e.name
                    ], "push_child").pull
                    parts = [tuple(p(d) for p in pulls) for d in result.data]
                    domain = SimpleSetDomain(partitions=jx.sort(set(parts)))
                else:
                    if not columns:
                        columns = zip(*result.data)
                    parts = set(columns[i])
                    if e.is_groupby and None in parts:
                        allowNulls = True
                    parts -= {None}
                    domain = SimpleSetDomain(partitions=jx.sort(parts))

                dims.append(len(domain.partitions) + (1 if allowNulls else 0))
                edges.append(
                    Data(name=e.name, allowNulls=allowNulls, domain=domain))

            zeros = [
                0 if s.aggregate == "count"
                and index_to_columns[si].push_child == "." else Data
                for si, s in enumerate(listwrap(query.select))
            ]
            data_cubes = {
                s.name: Matrix(dims=dims, zeros=zeros[si])
                for si, s in enumerate(listwrap(query.select))
            }
            r2c = index_to_coordinate(
                dims)  # WORKS BECAUSE THE DATABASE SORTED THE EDGES TO CONFORM
            for rownum, row in enumerate(result.data):
                coord = r2c(rownum)

                for i, s in enumerate(index_to_columns.values()):
                    if s.is_edge:
                        continue
                    if s.push_child == ".":
                        data_cubes[s.push_name][coord] = s.pull(row)
                    else:
                        data_cubes[s.push_name][coord][s.push_child] = s.pull(
                            row)

            if isinstance(query.select, list):
                select = [{"name": s.name} for s in query.select]
            else:
                select = {"name": query.select.name}

            return Data(meta={"format": "cube"},
                        edges=edges,
                        select=select,
                        data={k: v.cube
                              for k, v in data_cubes.items()})
        elif query.format == "table" or (not query.format and query.groupby):
            data = []
            for d in result.data:
                row = [None for _ in column_names]
                for s in index_to_columns.values():
                    if s.push_child == ".":
                        row[s.push_column] = s.pull(d)
                    elif s.num_push_columns:
                        tuple_value = row[s.push_column]
                        if tuple_value == None:
                            tuple_value = row[
                                s.push_column] = [None] * s.num_push_columns
                        tuple_value[s.push_child] = s.pull(d)
                    elif row[s.push_column] == None:
                        row[s.push_column] = Data()
                        row[s.push_column][s.push_child] = s.pull(d)
                    else:
                        row[s.push_column][s.push_child] = s.pull(d)
                data.append(tuple(unwrap(r) for r in row))

            output = Data(meta={"format": "table"},
                          header=column_names,
                          data=data)
        elif query.format == "list" or (not query.edges and not query.groupby):

            if not query.edges and not query.groupby and any(
                    listwrap(query.select).aggregate):
                if isinstance(query.select, list):
                    data = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            data[c.push_name] = c.pull(result.data[0])
                        else:
                            data[c.push_name][c.push_child] = c.pull(
                                result.data[0])

                    output = Data(meta={"format": "value"}, data=data)
                else:
                    data = Data()
                    for s in index_to_columns.values():
                        data[s.push_child] = s.pull(result.data[0])

                    output = Data(meta={"format": "value"}, data=unwrap(data))
            else:
                data = []
                for rownum in result.data:
                    row = Data()
                    for c in index_to_columns.values():
                        if c.push_child == ".":
                            row[c.push_name] = c.pull(rownum)
                        elif c.num_push_columns:
                            tuple_value = row[c.push_name]
                            if not tuple_value:
                                tuple_value = row[
                                    c.push_name] = [None] * c.num_push_columns
                            tuple_value[c.push_child] = c.pull(rownum)
                        else:
                            row[c.push_name][c.push_child] = c.pull(rownum)

                    data.append(row)

                output = Data(meta={"format": "list"}, data=data)
        else:
            Log.error("unknown format {{format}}", format=query.format)

        return output
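index_to_coordinate works because the rows come back sorted to match the edges, so a flat row number can be unravelled into cube coordinates with repeated divmod. A minimal sketch of that unravelling (hypothetical dims; it assumes the last edge varies fastest and is not the library's actual implementation):

def row_to_coord(dims):
    # dims are the edge sizes; rows are assumed sorted so the last edge varies fastest
    def r2c(rownum):
        coord = []
        for size in reversed(dims):
            rownum, i = divmod(rownum, size)
            coord.append(i)
        return tuple(reversed(coord))
    return r2c

r2c = row_to_coord([2, 3])  # a 2x3 cube
print([r2c(n) for n in range(6)])
# [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]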
Example #54
 def done_count(self):
     self.edge.domain = self.domain = SimpleSetDomain(
         partitions=jx.sort(set(self.parts)))
     self.parts = None
     self.computed_domain = True
Example #55
def unquote(value):
    if not value:
        return Null

    try:
        return int(value)
    except Exception:
        pass

    try:
        return float(value)
    except Exception:
        pass

    return value

tab_data = File("resources/EC2.csv").read()
lines = map(strings.trim, tab_data.split("\n"))
header = lines[0].split(",")
rows = [r.split(",") for r in lines[1:] if r]
data = wrap([{h: unquote(r[c]) for c, h in enumerate(header)} for r in rows])


for d in data:
    d.utility = Math.min(d.memory, d.storage/50, 60)
    d.drives["$ref"] = "#" + unicode(d.num_drives) + "_ephemeral_drives"
    d.discount = 0

Log.note("{{data|json(False)}}", data=[d for d in data if d.utility])

Log.note("{{data|json}}", data={d.instance_type: {"num": d.num_drives, "size": d.storage} for d in jx.sort(data, "instance_type")})
Example #56
    def get_markup(self,
                   branch,
                   revision,
                   task_id=None,
                   buildername=None,
                   timestamp=None):
        # TRY CACHE
        if not branch or not revision:
            Log.error("expecting branch and revision")

        if self.settings.use_cache:
            if task_id:
                _filter = {"term": {"task.id": task_id}}
            else:
                _filter = {"term": {"ref_data_name": buildername}}

            query = {
                "query": {
                    "filtered": {
                        "query": {
                            "match_all": {}
                        },
                        "filter": {
                            "and": [
                                _filter, {
                                    "term": {
                                        "repo.branch": branch
                                    }
                                }, {
                                    "prefix": {
                                        "repo.revision": revision
                                    }
                                }, {
                                    "or": [{
                                        "range": {
                                            "etl.timestamp": {
                                                "gte": (Date.now() - HOUR).unix
                                            }
                                        }
                                    }, {
                                        "range": {
                                            "job.timing.last_modified": {
                                                "lt": (Date.now() - DAY).unix
                                            }
                                        }
                                    }]
                                }
                            ]
                        }
                    }
                },
                "size": 10000
            }

            try:
                docs = self.cache.search(query, timeout=120).hits.hits
            except Exception, e:
                docs = None
                Log.warning("Bad ES call, fall back to TH", cause=e)

            if not docs:
                pass
            elif len(docs) == 1:
                if DEBUG:
                    Log.note(
                        "Used ES cache to get TH details on {{value|quote}}",
                        value=coalesce(task_id, buildername))
                return docs[0]._source
            elif timestamp == None:
                Log.error("timestamp required to find best match")
            else:
                # MISSING docs._source.job.timing.end WHEN A PLACEHOLDER WAS ADDED
                # TODO: SHOULD DELETE OVERLAPPING PLACEHOLDER RECORDS
                timestamp = Date(timestamp).unix
                best_index = jx.sort(
                    [(i, abs(coalesce(e, 0) - timestamp))
                     for i, e in enumerate(docs._source.job.timing.end)],
                    1)[0][0]
                return docs[best_index]._source
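The final fallback picks, among several cached records, the one whose job end time is closest to the requested timestamp. The same selection can be sketched with plain min() over indices (hypothetical end times; the real code sorts jx-style tuples instead):

timestamp = 1469000000
end_times = [1468990000, 1469000500, None]  # None: a placeholder record with no end time

best_index = min(
    range(len(end_times)),
    key=lambda i: abs((end_times[i] or 0) - timestamp)  # coalesce missing end times to 0, as above
)
print(best_index)  # 1, the record ending 500 seconds after the requested time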