Example #1
def es_setop(es, query):
    schema = query.frum.schema
    new_select, all_paths, split_select, var_to_columns = pre_process(query)

    es_query = setop_to_es_queries(query, all_paths, split_select,
                                   var_to_columns)
    size = coalesce(query.limit, DEFAULT_LIMIT)
    sort = jx_sort_to_es_sort(query.sort, schema)
    for q in es_query:
        q.size = size
        q.sort = sort

    with Timer("call to ES", verbose=DEBUG) as call_timer:
        results = es.multisearch(es_query)

    T = []
    for result in results:
        T.extend(result.hits.hits)

    try:
        formatter, _, mime_type = set_formatters[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
Example #2
def es_setop(es, query):
    schema = query.frum.schema
    query_path = schema.query_path[0]

    new_select, split_select = get_selects(query)

    split_wheres = split_expression_by_path(query.where, schema, lang=ES52)
    es_query = es_query_proto(query_path, split_select, split_wheres, schema)
    es_query.size = coalesce(query.limit, DEFAULT_LIMIT)
    es_query.sort = jx_sort_to_es_sort(query.sort, schema)

    with Timer("call to ES", silent=DEBUG) as call_timer:
        result = es.search(es_query)

    # Log.note("{{result}}", result=result)

    T = result.hits.hits

    try:
        formatter, _, mime_type = set_formatters[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
Example #3
    def test_lock_speed(self):
        SCALE = 1000 * 100

        with Timer("create"):
            locks = [_allocate_lock() for _ in range(SCALE)]

        with Timer("acquire"):
            for i in range(SCALE):
                locks[i].acquire()

        with Timer("release"):
            for i in range(SCALE):
                locks[i].release()
Example #4
def test_simple(filename):
    with Timer("simple time"):
        with codecs.open(filename, "r", encoding="utf-8") as f:
            for line in f:
                id = int(line.split("\t")[0])
                if id % 10000 == 0:
                    Log.note("{{id}}", id=id)
Example #5
    def test_multiple_agg_on_same_field(self):
        if self.not_real_service():
            return

        test = wrap({
            "query": {
                "from": {
                    "type": "elasticsearch",
                    "settings": {
                        "host": ES_CLUSTER_LOCATION,
                        "index": "unittest",
                        "type": "test_result"
                    }
                },
                "select": [{
                    "name": "max_bytes",
                    "value": "run.stats.bytes",
                    "aggregate": "max"
                }, {
                    "name": "count",
                    "value": "run.stats.bytes",
                    "aggregate": "count"
                }]
            }
        })

        query = unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.testing.query, data=query)
            if response.status_code != 200:
                error(response)
        result = json2value(utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #6
def format_cube(T, select, query=None):
    with Timer("format table"):
        table = format_table(T, select, query)

    if len(table.data) == 0:
        return Cube(
            select,
            edges=[{
                "name": "rownum",
                "domain": {
                    "type": "rownum",
                    "min": 0,
                    "max": 0,
                    "interval": 1
                }
            }],
            data={h: Matrix(list=[])
                  for i, h in enumerate(table.header)})

    cols = transpose(*unwrap(table.data))
    return Cube(
        select,
        edges=[{
            "name": "rownum",
            "domain": {
                "type": "rownum",
                "min": 0,
                "max": len(table.data),
                "interval": 1
            }
        }],
        data={h: Matrix(list=cols[i])
              for i, h in enumerate(table.header)})
Example #7
    def test_branch_count(self):
        if self.not_real_service():
            return

        test = wrap({"query": {
            "from": {
                "type": "elasticsearch",
                "settings": {
                    "host": ES_CLUSTER_LOCATION,
                    "index": "unittest",
                    "type": "test_result"
                }
            },
            "select": [
                {"aggregate": "count"},
            ],
            "edges": [
                "build.branch"
            ],
            "where": {"or": [
                {"missing": "build.id"}
                # {"gte": {"timestamp": Date.floor(Date.now() - (Duration.DAY * 7), Duration.DAY).milli / 1000}}
            ]},
            "format": "table"
        }})

        query = convert.unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #8
    def test_queue_speed(self):
        SCALE = 1000*10

        done = Signal("done")
        slow = Queue()
        q = ThreadedQueue("test queue", queue=slow)

        def empty(please_stop):
            while not please_stop:
                item = q.pop()
                if item is THREAD_STOP:
                    break

            done.go()

        Thread.run("empty", empty)

        timer = Timer("add {{num}} to queue", param={"num": SCALE})
        with timer:
            for i in range(SCALE):
                q.add(i)
            q.add(THREAD_STOP)
            Log.note("Done insert")
            done.wait()

        self.assertLess(timer.duration.seconds, 1.5, "Expecting queue to be fast")
Example #9
def get_raw_json(path):
    active_data_timer = Timer("total duration")
    body = flask.request.get_data()
    try:
        with active_data_timer:
            args = wrap(Data(**flask.request.args))
            limit = args.limit if args.limit else 10
            args.limit = None
            frum = wrap_from(path)
            result = jx.run(
                {
                    "from": path,
                    "where": {
                        "eq": args
                    },
                    "limit": limit,
                    "format": "list"
                }, frum)

            if isinstance(
                    result, Container
            ):  #TODO: REMOVE THIS CHECK, jx SHOULD ALWAYS RETURN Containers
                result = result.format("list")

        result.meta.active_data_response_time = active_data_timer.duration

        response_data = convert.unicode2utf8(
            convert.value2json(result.data, pretty=True))
        Log.note("Response is {{num}} bytes", num=len(response_data))
        return Response(response_data, status=200)
    except Exception as e:
        e = Except.wrap(e)
        return _send_error(active_data_timer, body, e)
Example #10
    def copy(self, keys, source, sample_only_filter=None, sample_size=None, done_copy=None):
        """
        :param keys: THE KEYS TO LOAD FROM source
        :param source: THE SOURCE (USUALLY S3 BUCKET)
        :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
        :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
        :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
        :return: LIST OF SUB-keys PUSHED INTO ES
        """
        num_keys = 0
        queue = None
        pending = []  # FOR WHEN WE DO NOT HAVE QUEUE YET
        for key in keys:
            timer = Timer("Process {{key}}", param={"key": key})
            try:
                with timer:
                    for rownum, line in enumerate(source.read_lines(strip_extension(key))):
                        if not line:
                            continue

                        if rownum > 0 and rownum % 1000 == 0:
                            Log.note("Ingested {{num}} records from {{key}} in bucket {{bucket}}", num=rownum, key=key, bucket=source.name)

                        row, please_stop = fix(rownum, line, source, sample_only_filter, sample_size)
                        num_keys += 1

                        if queue == None:
                            queue = self._get_queue(row)
                            if queue == None:
                                pending.append(row)
                                if len(pending) > 1000:
                                    self._get_queue(row)
                                    Log.error("first 1000 (key={{key}}) records have no indication what index to put data", key=tuple(keys)[0])
                                continue
                            elif queue is DATA_TOO_OLD:
                                break
                            if pending:
                                queue.extend(pending)
                                pending = []

                        queue.add(row)

                        if please_stop:
                            break
            except Exception as e:
                done_copy = None
                Log.warning("Could not process {{key}} after {{duration|round(places=2)}}seconds", key=key, duration=timer.duration.seconds, cause=e)

        if done_copy:
            if queue == None:
                done_copy()
            else:
                queue.add(done_copy)

        if pending:
            Log.error("Did not find an index to place the data for key={{key}}", key=tuple(keys)[0])

        Log.note("{{num}} keys from {{key|json}} added", num=num_keys, key=keys)
        return num_keys
Example #11
def test_simple_binary(filename):
    with Timer("simple binary time"):
        with io.open(filename, "rb") as f:
            for line in f:
                line = line.decode("utf-8")
                id = int(line.split("\t")[0])
                if id % 10000 == 0:
                    Log.note("{{id}}", id=id)
Example #12
def test_io(filename):
    with Timer("io time"):
        with io.open(filename, "r", buffering=2**25) as f:
            for line in f:
                line = line.decode("utf-8")
                id = int(line.split("\t")[0])
                if id % 10000 == 0:
                    Log.note("{{id}}", id=id)
Example #13
def test_buffered(filename):
    with Timer("buffered time"):
        with codecs.open(filename, "r", encoding="utf-8",
                         buffering=2**25) as f:
            for line in f:
                id = int(line.split("\t")[0])
                if id % 10000 == 0:
                    Log.note("{{id}}", id=id)
Example #14
    def test_timing(self):
        if self.not_real_service():
            return

        test = wrap({
            "query": {
                "from": {
                    "type": "elasticsearch",
                    "settings": {
                        "host": ES_CLUSTER_LOCATION,
                        "index": "unittest",
                        "type": "test_result"
                    }
                },
                "select": [{
                    "name": "count",
                    "value": "run.duration",
                    "aggregate": "count"
                }, {
                    "name": "total",
                    "value": "run.duration",
                    "aggregate": "sum"
                }],
                "edges": [{
                    "name": "chunk",
                    "value": ["run.suite", "run.chunk"]
                }, "result.ok"],
                "where": {
                    "and": [{
                        "lt": {
                            "timestamp": Date.floor(Date.now()).milli / 1000
                        }
                    }, {
                        "gte": {
                            "timestamp":
                            Date.floor(Date.now() - (Duration.DAY * 7),
                                       Duration.DAY).milli / 1000
                        }
                    }]
                },
                "format":
                "cube",
                "samples": {
                    "limit": 30
                }
            }
        })

        query = unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.testing.query, data=query)
            if response.status_code != 200:
                error(response)
        result = json2value(utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #15
def es_aggsop(es, frum, query):
    query = query.copy()  # WE WILL MARK UP THIS QUERY
    schema = frum.schema
    query_path = schema.query_path[0]
    selects = listwrap(query.select)

    acc, decoders, es_query = build_es_query(selects, query_path, schema,
                                             query)

    with Timer("ES query time", verbose=DEBUG) as es_duration:
        result = es.search(es_query)

    # Log.note("{{result}}", result=result)

    try:
        format_time = Timer("formatting", verbose=DEBUG)
        with format_time:
            # result.aggregations.doc_count = coalesce(result.aggregations.doc_count, result.hits.total)
            # IT APPEARS THE OLD doc_count IS GONE
            aggs = unwrap(result.aggregations)

            edges_formatter, groupby_formatter, value_formatter, mime_type = agg_formatters[
                query.format]
            if query.edges:
                output = edges_formatter(aggs, acc, query, decoders, selects)
            elif query.groupby:
                output = groupby_formatter(aggs, acc, query, decoders, selects)
            else:
                output = value_formatter(aggs, acc, query, decoders, selects)

        output.meta.timing.formatting = format_time.duration
        output.meta.timing.es_search = es_duration.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        if query.format not in agg_formatters:
            Log.error("Format {{format|quote}} not supported yet",
                      format=query.format,
                      cause=e)
        Log.error("Some problem", cause=e)
Example #16
    def test_simple_query(self):
        if self.not_real_service():
            return

        query = value2json({"from": "unittest"}).encode('utf8')
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.testing.query, data=query)
            if response.status_code != 200:
                error(response)
        result = json2value(response.all_content.decode('utf8'))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #17
    def test_simple_query(self):
        if self.not_real_service():
            return

        query = convert.unicode2utf8(convert.value2json({"from": "unittest"}))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #18
def es_setop(es, query):
    schema = query.frum.schema
    all_paths, split_decoders, var_to_columns = pre_process(query)
    new_select, split_select, flatten = get_selects(query)
    # THE SELECTS MAY BE REACHING DEEPER INTO THE NESTED RECORDS
    all_paths = list(
        reversed(sorted(set(split_select.keys()) | set(all_paths))))
    es_query = setop_to_es_queries(query, all_paths, split_select,
                                   var_to_columns)
    if not es_query:
        # NO QUERY TO SEND
        formatter, _, mime_type = set_formatters[query.format]
        output = formatter([], new_select, query)
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output

    size = coalesce(query.limit, DEFAULT_LIMIT)
    sort = jx_sort_to_es_sort(query.sort, schema)
    for q in es_query:
        q["size"] = size
        q["sort"] = sort

    with Timer("call to ES", verbose=DEBUG) as call_timer:
        results = es.multisearch(es_query)

    T = [copy(row) for row in flatten(results)]
    try:
        formatter, _, mime_type = set_formatters[query.format]

        with Timer("formatter", silent=True):
            output = formatter(T, new_select, query)
        output.meta.timing.es = call_timer.duration
        output.meta.content_type = mime_type
        output.meta.es_query = es_query
        return output
    except Exception as e:
        Log.error("problem formatting", e)
Example #19
    def setUpClass(cls):
        Log.start(settings.debug)
        with Timer("setup database"):
            try:
                with MySQL(schema=None, kwargs=settings.database) as db:
                    db.query("drop database testing")
            except Exception as e:
                if "Can't drop database " in e:
                    pass
                else:
                    Log.warning("problem removing db", cause=e)
            MySQL.execute_file("tests/resources/database.sql",
                               schema=None,
                               kwargs=settings.database)
Example #20
    def _parse_properties(self, abs_index, properties, meta):
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        abs_columns = _elasticsearch.parse_properties(abs_index, None,
                                                      properties.properties)
        abs_columns = abs_columns.filter(  # TODO: REMOVE WHEN jobs PROPERTY EXPLOSION IS CONTAINED
            lambda r: (
                not r.es_column.startswith("other.")
                and not r.es_column.startswith("previous_values.cf_")
                and not r.es_index.startswith("debug")
                and r.es_column.find("=") == -1
                and r.es_column.find(" ") == -1
            )
        )

        def add_column(c, query_path):
            c.last_updated = Date.now()
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(
                    c.names["."], query_path[0])

            with self.meta.columns.locker:
                self._upsert_column(c)
                for alias in meta.aliases:
                    c = copy(c)
                    c.es_index = alias
                    self._upsert_column(c)

        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(ROOT_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                for query_path in query_paths:
                    add_column(abs_column, query_path)
Example #21
    def write_lines(self, key, lines):
        self._verify_key_format(key)
        storage = self.bucket.new_key(key + ".json.gz")

        buff = TemporaryFile()
        archive = gzip.GzipFile(fileobj=buff, mode='w')
        count = 0
        for l in lines:
            if hasattr(l, "__iter__"):
                for ll in l:
                    archive.write(ll.encode("utf8"))
                    archive.write(b"\n")
                    count += 1
            else:
                archive.write(l.encode("utf8"))
                archive.write(b"\n")
                count += 1

        archive.close()
        file_length = buff.tell()

        retry = 3
        while retry:
            try:
                with Timer(
                        "Sending {{count}} lines in {{file_length|comma}} bytes for {{key}}",
                    {
                        "key": key,
                        "file_length": file_length,
                        "count": count
                    },
                        verbose=self.settings.debug):
                    buff.seek(0)
                    storage.set_contents_from_file(buff)
                break
            except Exception as e:
                e = Except.wrap(e)
                retry -= 1
                if retry == 0 or 'Access Denied' in e or "No space left on device" in e:
                    Log.error("could not push data to s3", cause=e)
                else:
                    Log.warning("could not push data to s3", cause=e)

        if self.settings.public:
            storage.set_acl('public-read')
        return
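Timer messages use the same moustache-style templates as Log: values come from a positional dict (as in write_lines above) or from param=..., and filters such as |comma and |round(...) work inside the braces. A tiny sketch, assuming that template syntax:

# Sketch only: demonstrates the positional parameter dict and a template filter.
with Timer("sent {{count}} lines in {{num_bytes|comma}} bytes",
           {"count": 1000, "num_bytes": 1234567}):
    pass  # the actual upload would happen here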
Example #22
    def _parse_properties(self, abs_index, properties, meta):
        # IT IS IMPORTANT THAT NESTED PROPERTIES NAME ALL COLUMNS, AND
        # ALL COLUMNS ARE GIVEN NAMES FOR ALL NESTED PROPERTIES
        def add_column(c, query_path):
            c.last_updated = Date.now() - TOO_OLD
            if query_path[0] != ".":
                c.names[query_path[0]] = relative_field(
                    c.names["."], query_path[0])

            with self.meta.columns.locker:
                for alias in meta.aliases:
                    c_ = copy(c)
                    c_.es_index = alias
                    self._upsert_column(c_)
                self._upsert_column(c)

        abs_columns = elasticsearch.parse_properties(abs_index, None,
                                                     properties.properties)
        self.abs_columns.update(abs_columns)
        with Timer("upserting {{num}} columns", {"num": len(abs_columns)},
                   debug=DEBUG):
            # LIST OF EVERY NESTED PATH
            query_paths = [[c.es_column] for c in abs_columns
                           if c.type == "nested"]
            for a, b in itertools.product(query_paths, query_paths):
                aa = a[0]
                bb = b[0]
                if aa and bb.startswith(aa):
                    for i, b_prefix in enumerate(b):
                        if len(b_prefix) > len(aa):
                            continue
                        if aa == b_prefix:
                            break  # SPLIT ALREADY FOUND
                        b.insert(i, aa)
                        break
            for q in query_paths:
                q.append(".")
            query_paths.append(SELF_PATH)

            # ADD RELATIVE COLUMNS
            for abs_column in abs_columns:
                abs_column = abs_column.__copy__()
                abs_column.type = es_type_to_json_type[abs_column.type]
                for query_path in query_paths:
                    add_column(abs_column, query_path)
        pass
Example #23
def test_binary(filename, buffering=2**14):
    with Timer("binary time (buffering=={{buffering}})",
               {"buffering": buffering}):
        remainder = ""
        with io.open(filename, "rb") as f:
            while True:
                block = f.read(buffering)
                if block == "":
                    if remainder == "":
                        return None
                    return remainder
                lines = (remainder + block).split("\n")
                for line in lines[:-1]:
                    line = line.decode("utf-8")
                    id = int(line.split("\t")[0])
                    if id % 10000 == 0:
                        Log.note("{{id}}", id=id)
                remainder = lines[-1]
Example #24
    def test_longest_running_tests(self):
        test = wrap({
            "query": {
                "sort": {
                    "sort": -1,
                    "field": "avg"
                },
                "from": {
                    "from":
                    "unittest",
                    "where": {
                        "and": [{
                            "gt": {
                                "build.date": "1439337600"
                            }
                        }]
                    },
                    "groupby": [
                        "build.platform", "build.type", "run.suite",
                        "result.test"
                    ],
                    "select": [{
                        "aggregate": "avg",
                        "name": "avg",
                        "value": "result.duration"
                    }],
                    "format":
                    "table",
                    "limit":
                    100
                },
                "limit": 100,
                "format": "list"
            }
        })
        query = unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.testing.query, data=query)
            if response.status_code != 200:
                error(response)
        result = json2value(utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #25
    def _test_queue_speed(self, test=False):
        SCALE = 1000 * 10

        done = Signal("done")
        slow = Queue()
        q = ThreadedQueue("test queue", slow_queue=slow)

        def empty(please_stop):
            while not please_stop:
                item = slow.pop()
                if item is THREAD_STOP:
                    break

            done.go()

        Thread.run("empty", empty)

        timer = Timer("add {{num}} to queue", param={"num": SCALE})
        with timer:
            for i in range(SCALE):
                q.add(i)
            q.add(THREAD_STOP)
            Log.note("Done insert")
            done.wait()

        Log.note(
            "{{num}} items through queue in {{seconds|round(3)}} seconds",
            num=SCALE,
            seconds=timer.duration.seconds,
        )
        if PY2 and "windows" not in platform.system().lower():
            expected_time = 15  # LINUX PY2 IS CRAZY SLOW
        elif PY3 and "windows" not in platform.system().lower():
            expected_time = 6  # LINUX PY3 IS SLOW
        else:
            expected_time = 6
        if test:
            self.assertLess(
                timer.duration.seconds,
                expected_time,
                "Expecting queue to be fast, not " +
                text(timer.duration.seconds) + " seconds",
            )
Example #26
    def test_failures_by_directory(self):
        if self.not_real_service():
            return

        test = wrap({"query": {
            "from": {
                "type": "elasticsearch",
                "settings": {
                    "host": ES_CLUSTER_LOCATION,
                    "index": "unittest",
                    "type": "test_result"
                }
            },
            "select": [
                {
                    "aggregate": "count"
                }
            ],
            "edges": [
                "result.test",
                "result.ok"
            ],
            "where": {
                "prefix": {
                    "result.test": "/"
                }
            },
            "format": "table"
        }})

        query = convert.unicode2utf8(convert.value2json(test.query))
        # EXECUTE QUERY
        with Timer("query"):
            response = http.get(self.service_url, data=query)
            if response.status_code != 200:
                error(response)
        result = convert.json2value(convert.utf82unicode(response.all_content))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #27
    def write_lines(self, key, lines):
        self._verify_key_format(key)
        storage = self.bucket.new_key(key + ".json.gz")

        buff = TemporaryFile()
        archive = gzip.GzipFile(fileobj=buff, mode='w')
        count = 0
        for l in lines:
            if hasattr(l, "__iter__"):
                for ll in l:
                    archive.write(ll.encode("utf8"))
                    archive.write(b"\n")
                    count += 1
            else:
                archive.write(l.encode("utf8"))
                archive.write(b"\n")
                count += 1
        archive.close()
        file_length = buff.tell()

        retry = 3
        while retry:
            try:
                with Timer(
                        "Sending {{count}} lines in {{file_length|comma}} bytes",
                    {
                        "file_length": file_length,
                        "count": count
                    },
                        debug=self.settings.debug):
                    buff.seek(0)
                    storage.set_contents_from_file(buff)
                break
            except Exception as e:
                Log.warning("could not push data to s3", cause=e)
                retry -= 1
Example #28
    def test_chunk_timing(self):
        if self.not_real_service():
            return

        test = wrap({"query": {
            "from": {
                "type": "elasticsearch",
                "settings": {
                    "host": ES_CLUSTER_LOCATION,
                    "index": "unittest",
                    "type": "test_result"
                }
            },
            "select": {"value": "run.stats.duration", "aggregate": "average"},
            "edges": [
                {"name": "chunk", "value": ["run.suite", "run.chunk"]}
            ],
            "where": {"and": [
                {"term": {"etl.id": 0}},
                {"gte": {"timestamp": Date.floor(Date.now() - (Duration.DAY * 7), Duration.DAY).milli / 1000}}
            ]},
            "format": "cube",
            "samples": {
                "limit": 30
            }
        }})

        query = value2json(test.query).encode('utf8')
        # EXECUTE QUERY
        with Timer("query"):
            response = self.utils.try_till_response(self.testing.query, data=query)
            if response.status_code != 200:
                error(response)
        result = json2value(response.all_content.decode('utf8'))

        Log.note("result\n{{result|indent}}", {"result": result})
Example #29
    def write_lines(self, key, lines):
        self._verify_key_format(key)
        storage = self.bucket.new_key(str(key + ".json.gz"))

        if VERIFY_UPLOAD:
            lines = list(lines)

        with mo_files.TempFile() as tempfile:
            with open(tempfile.abspath, "wb") as buff:
                DEBUG and Log.note("Temp file {{filename}}",
                                   filename=tempfile.abspath)
                archive = gzip.GzipFile(filename=str(key + ".json"),
                                        fileobj=buff,
                                        mode="w")
                count = 0
                for l in lines:
                    if is_many(l):
                        for ll in l:
                            archive.write(ll.encode("utf8"))
                            archive.write(b"\n")
                            count += 1
                    else:
                        archive.write(l.encode("utf8"))
                        archive.write(b"\n")
                        count += 1
                archive.close()

            retry = 3
            while retry:
                try:
                    with Timer(
                            "Sending {{count}} lines in {{file_length|comma}} bytes for {{key}}",
                        {
                            "key": key,
                            "file_length": tempfile.length,
                            "count": count
                        },
                            verbose=self.settings.debug,
                    ):
                        storage.set_contents_from_filename(
                            tempfile.abspath,
                            headers={"Content-Type": mimetype.GZIP})
                    break
                except Exception as e:
                    e = Except.wrap(e)
                    retry -= 1
                    if (retry == 0 or "Access Denied" in e
                            or "No space left on device" in e):
                        Log.error("could not push data to s3", cause=e)
                    else:
                        Log.warning("could not push data to s3, will retry",
                                    cause=e)

            if self.settings.public:
                storage.set_acl("public-read")

            if VERIFY_UPLOAD:
                try:
                    with open(tempfile.abspath, mode="rb") as source:
                        result = list(ibytes2ilines(
                            scompressed2ibytes(source)))
                        assertAlmostEqual(result,
                                          lines,
                                          msg="file is different")

                    # full_url = "https://"+self.name+".s3-us-west-2.amazonaws.com/"+storage.key.replace(":", "%3A")
                    # https://active-data-test-result.s3-us-west-2.amazonaws.com/tc.1524896%3A152488763.0.json.gz

                    # dest_bucket = s3.MultiBucket(bucket="self.name", kwargs=self.settings.aws)

                    result = list(self.read_lines(strip_extension(key)))
                    assertAlmostEqual(result,
                                      lines,
                                      msg="S3 is different")

                except Exception as e:
                    from activedata_etl.transforms import TRY_AGAIN_LATER

                    Log.error(TRY_AGAIN_LATER,
                              reason="did not pass verification",
                              cause=e)
        return
Example #30
    def copy(self,
             keys,
             source,
             sample_only_filter=None,
             sample_size=None,
             done_copy=None):
        """
        :param keys: THE KEYS TO LOAD FROM source
        :param source: THE SOURCE (USUALLY S3 BUCKET)
        :param sample_only_filter: SOME FILTER, IN CASE YOU DO NOT WANT TO SEND EVERYTHING
        :param sample_size: FOR RANDOM SAMPLE OF THE source DATA
        :param done_copy: CALLBACK, ADDED TO queue, TO FINISH THE TRANSACTION
        :return: LIST OF SUB-keys PUSHED INTO ES
        """
        num_keys = 0
        queue = None
        pending = []  # FOR WHEN WE DO NOT HAVE QUEUE YET
        for key in keys:
            timer = Timer("Process {{key}}",
                          param={"key": key},
                          silent=not DEBUG)
            try:
                with timer:
                    for rownum, line in enumerate(
                            source.read_lines(strip_extension(key))):
                        if not line:
                            continue

                        if rownum > 0 and rownum % 1000 == 0:
                            Log.note(
                                "Ingested {{num}} records from {{key}} in bucket {{bucket}}",
                                num=rownum,
                                key=key,
                                bucket=source.name)

                        insert_me, please_stop = fix(key, rownum, line, source,
                                                     sample_only_filter,
                                                     sample_size)
                        if insert_me == None:
                            continue
                        value = insert_me['value']

                        if '_id' not in value:
                            Log.warning(
                                "expecting an _id in all S3 records. If missing, there can be duplicates"
                            )

                        if queue == None:
                            queue = self._get_queue(insert_me)
                            if queue == None:
                                pending.append(insert_me)
                                if len(pending) > 1000:
                                    if done_copy:
                                        done_copy()
                                    Log.error(
                                        "first 1000 (key={{key}}) records for {{alias}} have no indication what index to put data",
                                        key=tuple(keys)[0],
                                        alias=self.settings.index)
                                continue
                            elif queue is DATA_TOO_OLD:
                                break
                            if pending:
                                queue.extend(pending)
                                pending = []

                        num_keys += 1
                        queue.add(insert_me)

                        if please_stop:
                            break
            except Exception as e:
                if KEY_IS_WRONG_FORMAT in e:
                    Log.warning(
                        "Could not process {{key}} because bad format. Never trying again.",
                        key=key,
                        cause=e)
                    pass
                elif CAN_NOT_DECODE_JSON in e:
                    Log.warning(
                        "Could not process {{key}} because of bad JSON. Never trying again.",
                        key=key,
                        cause=e)
                    pass
                else:
                    Log.warning(
                        "Could not process {{key}} after {{duration|round(places=2)}}seconds",
                        key=key,
                        duration=timer.duration.seconds,
                        cause=e)
                    done_copy = None

        if done_copy:
            if queue == None:
                done_copy()
            elif queue is DATA_TOO_OLD:
                done_copy()
            else:
                queue.add(done_copy)

        if [
                p for p in pending
                if wrap(p).value.task.state not in ('failed', 'exception')
        ]:
            Log.error(
                "Did not find an index for {{alias}} to place the data for key={{key}}",
                key=tuple(keys)[0],
                alias=self.settings.index)

        Log.note("{{num}} keys from {{key|json}} added",
                 num=num_keys,
                 key=keys)
        return num_keys