def _execute_backlog(self):
    if not self.backlog:
        return

    (backlog, self.backlog) = (self.backlog, [])
    if self.db.__module__.startswith("pymysql"):
        # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
        # https://github.com/PyMySQL/PyMySQL/issues/157
        for b in backlog:
            sql = self.preamble + b
            try:
                if self.debug:
                    Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(b)
            except Exception as e:
                Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)
            self.cursor.close()
            self.cursor = self.db.cursor()
    else:
        for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
            sql = self.preamble + ";\n".join(g)
            try:
                if self.debug:
                    Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
                self.cursor.execute(sql)
                self.cursor.close()
                self.cursor = self.db.cursor()
            except Exception as e:
                Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)
def decrypt(data, _key):
    """
    ACCEPT JSON OF ENCRYPTED DATA {"salt":s, "length":l, "data":d}
    """
    from pyLibrary.queries import jx

    # Key and iv have not been generated or provided, bail out
    if _key is None:
        Log.error("Expecting a key")

    _input = convert.json2value(data)

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(convert.base642bytearray(_input.salt))

    raw = convert.base642bytearray(_input.data)
    out_data = bytearray()
    for _, e in jx.groupby(raw, size=16):
        out_data.extend(aes_cbc_256.decrypt_block(e))

    return str(out_data[:_input.length]).decode("utf8")
def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            Till(seconds=1).wait()
            messages = wrap(self.queue.pop_all())
            if not messages:
                continue

            for g, mm in jx.groupby(messages, size=self.batch_size):
                scrubbed = []
                try:
                    for i, message in enumerate(mm):
                        if message is THREAD_STOP:
                            please_stop.go()
                            return
                        scrubbed.append(_deep_json_to_string(message, depth=3))
                finally:
                    self.es.extend(scrubbed)
            bad_count = 0
        except Exception as e:
            Log.warning("Problem inserting logs into ES", cause=e)
            bad_count += 1
            if bad_count > MAX_BAD_COUNT:
                Log.warning("Given up trying to write debug logs to ES index {{index}}", index=self.es.settings.index)
            Till(seconds=30).wait()

    # CONTINUE TO DRAIN THIS QUEUE
    while not please_stop:
        try:
            Till(seconds=1).wait()
            self.queue.pop_all()
        except Exception as e:
            Log.warning("Should not happen", cause=e)
def _insert_loop(self, please_stop=None):
    bad_count = 0
    while not please_stop:
        try:
            Thread.sleep(seconds=1)
            messages = wrap(self.queue.pop_all())
            if messages:
                # for m in messages:
                #     m.value.params = leafer(m.value.params)
                #     m.value.error = leafer(m.value.error)
                for g, mm in jx.groupby(messages, size=self.batch_size):
                    self.es.extend(mm)
                bad_count = 0
        except Exception as e:
            Log.warning("Problem inserting logs into ES", cause=e)
            bad_count += 1
            if bad_count > 5:
                break
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA {"salt":s, "length":l, "data":d}
    """
    from pyLibrary.queries import jx

    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Dict()
    output.type = "AES256"
    output.salt = convert.bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in jx.groupby(data, size=16):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = convert.bytes2base64(encrypted)
    json = convert.value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
def encrypt(text, _key, salt=None):
    """
    RETURN JSON OF ENCRYPTED DATA {"salt":s, "length":l, "data":d}
    """
    from pyLibrary.queries import jx

    if not isinstance(text, unicode):
        Log.error("only unicode is encrypted")
    if _key is None:
        Log.error("Expecting a key")
    if isinstance(_key, str):
        _key = bytearray(_key)
    if salt is None:
        salt = Random.bytes(16)

    data = bytearray(text.encode("utf8"))

    # Initialize encryption using key and iv
    key_expander_256 = key_expander.KeyExpander(256)
    expanded_key = key_expander_256.expand(_key)
    aes_cipher_256 = aes_cipher.AESCipher(expanded_key)
    aes_cbc_256 = cbc_mode.CBCMode(aes_cipher_256, 16)
    aes_cbc_256.set_iv(salt)

    output = Data()
    output.type = "AES256"
    output.salt = convert.bytes2base64(salt)
    output.length = len(data)

    encrypted = bytearray()
    for _, d in jx.groupby(data, size=16):
        encrypted.extend(aes_cbc_256.encrypt_block(d))
    output.data = convert.bytes2base64(encrypted)
    json = convert.value2json(output)

    if DEBUG:
        test = decrypt(json, _key)
        if test != text:
            Log.error("problem with encryption")

    return json
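# USAGE SKETCH (illustration only, not part of the library): round-trip a
# unicode string through encrypt()/decrypt() above. Assumes a 256-bit
# (32-byte) key; the key bytes below are made up for the example.
key = bytearray(b"0123456789abcdef0123456789abcdef")  # 32 bytes -> AES-256
ciphertext_json = encrypt(u"hello world", key)  # {"type": "AES256", "salt": ..., "length": 11, "data": ...}
assert decrypt(ciphertext_json, key) == u"hello world"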
def agg(today, destination, debug_filter=None, please_stop=None):
    """
    :param today: The day we are performing the calculation for
    :param destination: The ES index where we put the results
    :param debug_filter: Some extra limitation to go faster, and focus, for testing
    :param please_stop: Signal for stopping early
    :return: nothing
    """
    # GET LIST OF ALL TESTS, BY PLATFORM, TYPE, SUITE
    for suite in SUITES:
        domain = {"and": [
            {"prefix": {"run.suite": suite}},
            {"gt": {"build.date": (today - 3 * DAY).unix}},
            {"lt": {"build.date": (today + 4 * DAY).unix}},
            {"exists": "build.platform"},
            {"not": {"in": {"build.platform": EXCLUDE_PLATFORMS}}},
            {"not": {"in": {"build.branch": EXCLUDE_BRANCHES}}}
        ]}
        if debug_filter:
            domain['and'].append(debug_filter)

        _ = convert.value2json("\"\"")

        # WE CAN NOT PULL ALL TESTS, THERE ARE TOO MANY, SO DO ONE SUITE AT A TIME
        Log.note("Get summary of failures in {{suite}} for date {{date}}", suite=suite, date=today)
        suite_summary = http.post_json(config.source.url, json={
            "from": "unittest",
            "groupby": [
                {"name": "test", "value": "result.test"}
            ],
            "where": {"and": [
                domain,
                {"eq": {"result.ok": False}}
            ]},
            "format": "list",
            "limit": 100000
        })

        often_fail = jx.filter(suite_summary.data, {"gt": {"count": 1}})

        for g, tests in jx.groupby(often_fail, size=100):
            tests = wrap(tests)
            if please_stop:
                return

            Log.note("Collect stats on {{num}} tests", num=len(tests))
            tests_summary = http.post_json(config.source.url, json={
                "from": "unittest",
                "groupby": [
                    "run.suite",
                    {"name": "test", "value": "result.test"},
                    "build.platform",
                    "build.product",
                    "build.type",
                    "run.type"
                ],
                "select": [
                    {
                        "name": "date_fails",
                        "value": {
                            "mult": [
                                {"div": [{"sub": {"build.date": today + 0.5 * DAY}}, DAY.seconds]},
                                {"when": "result.ok", "then": 0, "else": 1}
                            ]
                        },
                        "aggregate": "stats"
                    },
                    {
                        "name": "date",
                        "value": {"div": [{"sub": {"build.date": today + 0.5 * DAY}}, DAY.seconds]},
                        "aggregate": "stats"
                    },
                    {
                        "name": "fails",
                        "value": {"when": "result.ok", "then": 0, "else": 1},
                        "aggregate": "stats"
                    }
                ],
                "where": {"and": [
                    domain,
                    {"in": {"result.test": tests}}
                ]},
                "format": "list",
                "limit": 100000
            })

            # FOR EACH TEST, CALCULATE THE "RECENTLY BAD" STATISTIC (linear regression slope)
            # THIS IS ONLY A ROUGH CALC FOR TESTING THE UI
            for t in tests_summary.data:
                try:
                    t._id = "-".join([
                        coalesce(t.build.product, ""),
                        t.build.platform,
                        coalesce(t.build.type, ""),
                        coalesce(t.run.type, ""),
                        t.run.suite,
                        t.test,
                        unicode(today.unix)
                    ])
                except Exception as e:
                    Log.error("text join problem", cause=e)
                t.timestamp = today
                t.average = t.fails.avg
                if t.date.var == 0:
                    t.slope = 0
                else:
                    # slope = cov(date, fails) / var(date)
                    t.slope = (t.date_fails.avg - t.date.avg * t.fails.avg) / t.date.var
                t.etl.timestamp = Date.now()

            # PUSH STATS TO ES
            docs = [{"id": t._id, "value": t} for t in tests_summary.data if t.fails.sum > 0]
            Log.note("Adding {{num}} test summaries", num=len(docs))
            destination.extend(docs)
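# WORKED EXAMPLE (illustration only): the slope above is the least-squares
# slope of fails against date, cov(date, fails) / var(date), computed from
# the aggregate stats. The numbers below are made up.
dates = [0.0, 1.0, 2.0, 3.0]  # build dates, in days relative to `today`
fails = [0, 0, 1, 1]          # 1 = a failed run on that date

n = float(len(dates))
date_avg = sum(dates) / n
fails_avg = sum(fails) / n
date_fails_avg = sum(d * f for d, f in zip(dates, fails)) / n
date_var = sum((d - date_avg) ** 2 for d in dates) / n

slope = (date_fails_avg - date_avg * fails_avg) / date_var
assert abs(slope - 0.4) < 1e-9  # positive slope: failures are trending up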
def groupby(self, keys):
    return jx.groupby(self.__iter__(), keys)
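# BEHAVIOR SKETCH (assumed semantics, not pyLibrary's implementation):
# jx.groupby is used two ways in this codebase. With `keys` it yields
# (group, rows) pairs; with `size=N` it yields (batch_index, batch) pairs
# of at most N items, as in the batching loops above. A rough pure-Python
# analogue of the size form:
def groupby_size(values, size):
    batch_index = 0
    batch = []
    for v in values:
        batch.append(v)
        if len(batch) == size:
            yield batch_index, batch
            batch_index += 1
            batch = []
    if batch:
        yield batch_index, batch

assert list(groupby_size("abcde", 2)) == [(0, ["a", "b"]), (1, ["c", "d"]), (2, ["e"])]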
def process_batch(todo, coverage_index, coverage_summary_index, settings, please_stop):
    for not_summarized in todo:
        if please_stop:
            return True

        # IS THERE MORE THAN ONE COVERAGE FILE FOR THIS REVISION?
        Log.note("Find dups for file {{file}}", file=not_summarized.source.file.name)
        dups = http.post_json(settings.url, json={
            "from": "coverage",
            "select": [
                {"name": "max_id", "value": "etl.source.id", "aggregate": "max"},
                {"name": "min_id", "value": "etl.source.id", "aggregate": "min"}
            ],
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url"
            ],
            "limit": 100000,
            "format": "list"
        })

        dups_found = False
        for d in dups.data:
            if d.max_id != d.min_id:
                dups_found = True

                Log.note(
                    "removing dups {{details|json}}\n{{dups|json|indent}}",
                    details={
                        "id": int(d.max_id),
                        "test": d.test.url,
                        "source": not_summarized.source.file.name,
                        "revision": not_summarized.build.revision12
                    }
                )

                # FIND ALL INDEXES
                all_indexes = [
                    p.index
                    for p in coverage_index.cluster.get_aliases()
                    if p.alias == coverage_index.settings.alias
                ]
                for index_name in all_indexes:
                    elasticsearch.Index(index=index_name, read_only=False, cluster=coverage_index.cluster).delete_record({"and": [
                        {"not": {"term": {"etl.source.id": int(d.max_id)}}},
                        {"term": {"test.url": d.test.url}},
                        {"term": {"source.file.name": not_summarized.source.file.name}},
                        {"term": {"build.revision12": not_summarized.build.revision12}}
                    ]})
        if dups_found:
            continue

        # LIST ALL TESTS THAT COVER THIS FILE, AND THE LINES COVERED
        test_count = http.post_json(settings.url, json={
            "from": "coverage.source.file.covered",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }},
            ]},
            "groupby": [
                "test.url",
                "line"
            ],
            "limit": 100000,
            "format": "list"
        })

        all_tests_covering_file = UNION(test_count.data.get("test.url"))
        num_tests = len(all_tests_covering_file)
        max_siblings = num_tests - 1
        Log.note(
            "{{filename}} rev {{revision}} is covered by {{num}} tests",
            filename=not_summarized.source.file.name,
            num=num_tests,
            revision=not_summarized.build.revision12
        )
        line_summary = list(
            (k, unwrap(wrap(list(v)).get("test.url")))
            for k, v in jx.groupby(test_count.data, keys="line")
        )

        # PULL THE RAW RECORD FOR MODIFICATION
        file_level_coverage_records = http.post_json(settings.url, json={
            "from": "coverage",
            "where": {"and": [
                {"missing": "source.method.name"},
                {"in": {"test.url": all_tests_covering_file}},
                {"eq": {
                    "source.file.name": not_summarized.source.file.name,
                    "build.revision12": not_summarized.build.revision12
                }}
            ]},
            "limit": 100000,
            "format": "list"
        })

        for test_name in all_tests_covering_file:
            siblings = [len(test_names) - 1 for g, test_names in line_summary if test_name in test_names]
            min_siblings = MIN(siblings)
            coverage_candidates = jx.filter(file_level_coverage_records.data, lambda row, rownum, rows: row.test.url == test_name)
            if coverage_candidates:
                if len(coverage_candidates) > 1 and any(coverage_candidates[0]._id != c._id for c in coverage_candidates):
                    Log.warning(
                        "Duplicate coverage\n{{cov|json|indent}}",
                        cov=[{"_id": c._id, "run": c.run, "test": c.test} for c in coverage_candidates]
                    )

                # MORE THAN ONE COVERAGE CANDIDATE CAN HAPPEN WHEN THE SAME TEST IS IN TWO DIFFERENT CHUNKS OF THE SAME SUITE
                for coverage_record in coverage_candidates:
                    coverage_record.source.file.max_test_siblings = max_siblings
                    coverage_record.source.file.min_line_siblings = min_siblings
                    coverage_record.source.file.score = (max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
            else:
                example = http.post_json(settings.url, json={
                    "from": "coverage",
                    "where": {"eq": {
                        "test.url": test_name,
                        "source.file.name": not_summarized.source.file.name,
                        "build.revision12": not_summarized.build.revision12
                    }},
                    "limit": 1,
                    "format": "list"
                })

                Log.warning(
                    "{{test|quote}} rev {{revision}} appears to have no coverage for {{file|quote}}!\n{{example|json|indent}}",
                    test=test_name,
                    file=not_summarized.source.file.name,
                    revision=not_summarized.build.revision12,
                    example=example.data[0]
                )

        bad_example = [d for d in file_level_coverage_records.data if d["source.file.min_line_siblings"] == None]
        if bad_example:
            Log.warning("expecting all records to have summary. Example:\n{{example}}", example=bad_example[0])

        rows = [{"id": d._id, "value": d} for d in file_level_coverage_records.data]
        coverage_summary_index.extend(rows)
        coverage_index.extend(rows)

        all_test_summary = []
        for g, records in jx.groupby(file_level_coverage_records.data, "source.file.name"):
            cov = UNION(records.source.file.covered)
            uncov = UNION(records.source.file.uncovered)
            coverage = {
                "_id": "|".join([records[0].build.revision12, g["source.file.name"]]),  # SOMETHING UNIQUE, IN CASE WE RECALCULATE
                "source": {
                    "file": {
                        "name": g["source.file.name"],
                        "is_file": True,
                        "covered": jx.sort(cov, "line"),
                        "uncovered": jx.sort(uncov),
                        "total_covered": len(cov),
                        "total_uncovered": len(uncov),
                        "min_line_siblings": 0  # PLACEHOLDER TO INDICATE DONE
                    }
                },
                "build": records[0].build,
                "repo": records[0].repo,
                "run": records[0].run,
                "etl": {"timestamp": Date.now()}
            }
            all_test_summary.append(coverage)

        sum_rows = [{"id": d["_id"], "value": d} for d in all_test_summary]
        coverage_summary_index.extend(sum_rows)

        if DEBUG:
            coverage_index.refresh()
            todo = http.post_json(settings.url, json={
                "from": "coverage",
                "where": {"and": [
                    {"missing": "source.method.name"},
                    {"missing": "source.file.min_line_siblings"},
                    {"eq": {"source.file.name": not_summarized.source.file.name}},
                    {"eq": {"build.revision12": not_summarized.build.revision12}}
                ]},
                "format": "list",
                "limit": 10
            })
            if todo.data:
                Log.error("Failure to update")
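# WORKED EXAMPLE (made-up counts): the score above rewards tests that cover
# rare lines. It is near 0 when this test's least-shared line is shared by
# every other test, and approaches 1 when it covers a line almost no one
# else does.
max_siblings = 9   # 10 tests cover the file
min_siblings = 1   # some line is covered by this test and only one other
score = float(max_siblings - min_siblings) / (max_siblings + min_siblings + 1)
assert abs(score - 8.0 / 11.0) < 1e-9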
            locals()[k] = v
        for k, f in fixes.items():
            try:
                _source[k] = eval(f)
            except Exception as e:
                if "Problem pulling pushlog" in e:
                    pass
                elif "can not find branch" in e:
                    pass
                else:
                    Log.warning("not evaluated {{expression}}", expression=f, cause=e)
        return _source

    for g, docs in jx.groupby(pending_ids, max_size=BATCH_SIZE):
        with Timer("Replicate {{num_docs}} documents", {"num_docs": len(docs)}):
            data = source.search({
                "query": {"filtered": {
                    "query": {"match_all": {}},
                    "filter": {"terms": {"_id": set(docs)}}
                }},
                "from": 0,
                "size": 200000,
                "sort": []
            })

            destination.extend([{"id": h._id, "value": fixer(h._source)} for h in data.hits.hits])

        if please_stop:
            break
if self.db.__module__.startswith("pymysql"):
    # BUG IN PYMYSQL: CAN NOT HANDLE MULTIPLE STATEMENTS
    # https://github.com/PyMySQL/PyMySQL/issues/157
    for b in backlog:
        sql = self.preamble + b
        try:
            if self.debug:
                Log.note("Execute SQL:\n{{sql|indent}}", sql=sql)
            self.cursor.execute(b)
        except Exception as e:
            Log.error("Can not execute sql:\n{{sql}}", sql=sql, cause=e)
        self.cursor.close()
        self.cursor = self.db.cursor()
else:
    for i, g in jx.groupby(backlog, size=MAX_BATCH_SIZE):
        sql = self.preamble + ";\n".join(g)
        try:
            if self.debug:
                Log.note("Execute block of SQL:\n{{sql|indent}}", sql=sql)
            self.cursor.execute(sql)
            self.cursor.close()
            self.cursor = self.db.cursor()
        except Exception as e:
            Log.error("Problem executing SQL:\n{{sql|indent}}", sql=sql, cause=e, stack_depth=1)

## Insert dictionary of values into table
def insert(self, table_name, record):
    keys = record.keys()
def _get_job_results_from_th(self, branch, revision):
    output = []

    with self.locker:
        waiting_threads = self.pending.get((branch, revision))
        if waiting_threads is None:
            sig = None
            waiting_threads = self.pending[(branch, revision)] = [output]
        else:
            sig = Signal()
            waiting_threads.append(sig)  # REGISTER THE SIGNAL WE WILL WAIT ON (was append(Signal()), which deadlocked)

    if sig is not None:
        Log.note("Holding thread for {{branch}}/{{revision}}", branch=branch, revision=revision)
        sig.wait_for_go()
        return waiting_threads[0]

    try:
        results = DictList()
        while True:
            response = self._rate_limited_get_json(expand_template(RESULT_SET_URL, {"branch": branch, "revision": revision[0:12]}))
            results.extend(response.results)
            if len(response.results) != 1000:
                break

        for g, repo_ids in jx.groupby(results.id, size=10):
            jobs = DictList()
            with Timer("Get {{num}} jobs", {"num": len(repo_ids)}, debug=DEBUG):
                while True:
                    response = self._rate_limited_get_json(expand_template(JOBS_URL, {"branch": branch, "offset": len(jobs), "result_set_id": ",".join(map(unicode, repo_ids))}))
                    jobs.extend(response.results)
                    if len(response.results) != 2000:
                        break

            with Timer("Get (up to {{num}}) details from TH", {"num": len(jobs)}, debug=DEBUG):
                details = []
                for _, ids in jx.groupby(jobs.id, size=40):
                    details.extend(self._rate_limited_get_json(
                        url=expand_template(DETAILS_URL, {"branch": branch, "job_id": ",".join(map(unicode, ids))}),
                        retry={"times": 3}
                    ).results)
                details = {k.job_guid: list(v) for k, v in jx.groupby(details, "job_guid")}

            with Timer("Get (up to {{num}}) stars from TH", {"num": len(jobs)}, debug=DEBUG):
                stars = []
                for _, ids in jx.groupby(jobs.id, size=40):
                    response = self._rate_limited_get_json(expand_template(JOB_BUG_MAP, {"branch": branch, "job_id": "&job_id=".join(map(unicode, ids))}))
                    stars.extend(response)
                stars = {k.job_id: list(v) for k, v in jx.groupby(stars, "job_id")}

            with Timer("Get notes from TH", debug=DEBUG):
                notes = []
                for jid in set([j.id for j in jobs if j.failure_classification_id != 1] + stars.keys()):
                    response = self._rate_limited_get_json(expand_template(NOTES_URL, {"branch": branch, "job_id": unicode(jid)}))
                    notes.extend(response)
                notes = {k.job_id: list(v) for k, v in jx.groupby(notes, "job_id")}

            for j in jobs:
                output.append(self._normalize_job_result(branch, revision, j, details, notes, stars))

        if output:
            with Timer("Write to ES cache", debug=DEBUG):
                self.cache.extend({"id": "-".join([c.repo.branch, unicode(c.job.id)]), "value": c} for c in output)
                try:
                    self.cache.flush()
                except Exception as e:
                    Log.warning("problem flushing. nevermind.", cause=e)
    finally:
        with self.locker:
            for p in waiting_threads[1:]:
                if DEBUG:
                    Log.note("releasing thread for {{branch}}/{{revision}}", branch=branch, revision=revision)
                p.go()
            self.pending[(branch, revision)] = None

    return output
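# PATTERN SKETCH (standard-library analogue, not the code above): the locker
# dance in _get_job_results_from_th coalesces concurrent requests for the
# same (branch, revision): the first caller fetches, later callers park on a
# signal and share the first caller's output. threading.Event stands in for
# Signal; fetch is any callable the caller supplies.
import threading

pending = {}  # key -> [shared_output, event, event, ...]
locker = threading.Lock()

def get_coalesced(key, fetch):
    with locker:
        waiters = pending.get(key)
        if waiters is None:
            sig = None
            waiters = pending[key] = [[]]  # first entry is the shared output
        else:
            sig = threading.Event()
            waiters.append(sig)

    if sig is not None:
        sig.wait()  # some other thread is already fetching this key
        return waiters[0]

    try:
        waiters[0].extend(fetch(key))  # only the first caller does the work
    finally:
        with locker:
            for w in waiters[1:]:
                w.set()  # release the waiting threads
            pending[key] = None

    return waiters[0]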
def _load_all_in_push(self, revision, locale=None):
    # http://hg.mozilla.org/mozilla-central/json-pushes?full=1&changeset=57c461500a0c
    found_revision = copy(revision)
    if isinstance(found_revision.branch, basestring):
        lower_name = found_revision.branch.lower()
    else:
        lower_name = found_revision.branch.name.lower()

    if not lower_name:
        Log.error("Defective revision? {{rev|json}}", rev=found_revision.branch)

    b = found_revision.branch = self.branches[(lower_name, locale)]
    if not b:
        b = found_revision.branch = self.branches[(lower_name, DEFAULT_LOCALE)]
        if not b:
            Log.error("can not find branch ({{branch}}, {{locale}})", branch=lower_name, locale=locale)

    if Date.now() - Date(b.etl.timestamp) > _OLD_BRANCH:
        self.branches = _hg_branches.get_branches(use_cache=True, settings=self.settings)

    url = found_revision.branch.url.rstrip("/") + "/json-pushes?full=1&changeset=" + found_revision.changeset.id
    Log.note(
        "Reading pushlog for revision ({{branch}}, {{locale}}, {{changeset}}): {{url}}",
        branch=found_revision.branch.name,
        locale=locale,
        changeset=found_revision.changeset.id,
        url=url,
    )

    try:
        data = self._get_and_retry(url, found_revision.branch)

        revs = []
        output = None
        for index, _push in data.items():
            push = Push(id=int(index), date=_push.date, user=_push.user)

            for _, ids in jx.groupby(_push.changesets.node, size=200):
                url_param = "&".join("node=" + c[0:12] for c in ids)

                url = found_revision.branch.url.rstrip("/") + "/json-info?" + url_param
                Log.note("Reading details from {{url}}", {"url": url})

                raw_revs = self._get_and_retry(url, found_revision.branch)
                for r in raw_revs.values():
                    rev = Revision(
                        branch=found_revision.branch,
                        index=r.rev,
                        changeset=Changeset(
                            id=r.node,
                            id12=r.node[0:12],
                            author=r.user,
                            description=r.description,
                            date=Date(r.date),
                            files=r.files,
                        ),
                        parents=unwraplist(r.parents),
                        children=unwraplist(r.children),
                        push=push,
                        etl={"timestamp": Date.now().unix},
                    )
                    if r.node == found_revision.changeset.id:
                        output = rev
                    if r.node[0:12] == found_revision.changeset.id[0:12]:
                        output = rev
                    _id = (
                        coalesce(rev.changeset.id12, "")
                        + "-" + rev.branch.name
                        + "-" + coalesce(rev.branch.locale, DEFAULT_LOCALE)
                    )
                    revs.append({"id": _id, "value": rev})
        self.es.extend(revs)
        return output
    except Exception as e:
        Log.error("Problem pulling pushlog from {{url}}", url=url, cause=e)
def _update_cardinality(self, c):
    """
    QUERY ES TO FIND CARDINALITY AND PARTITIONS FOR A SIMPLE COLUMN
    """
    if c.type in STRUCT:
        Log.error("not supported")
    try:
        if c.table == "meta.columns":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.columns, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.columns),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "es_column": c.es_column}}
                })
            return
        if c.table == "meta.tables":
            with self.meta.columns.locker:
                partitions = jx.sort([g[c.es_column] for g, _ in jx.groupby(self.meta.tables, c.es_column) if g[c.es_column] != None])
                self.meta.columns.update({
                    "set": {
                        "partitions": partitions,
                        "count": len(self.meta.tables),
                        "cardinality": len(partitions),
                        "last_updated": Date.now()
                    },
                    "where": {"eq": {"table": c.table, "name": c.name}}
                })
            return

        es_index = c.table.split(".")[0]
        result = self.default_es.post("/" + es_index + "/_search", data={
            "aggs": {c.name: _counting_query(c)},
            "size": 0
        })
        r = result.aggregations.values()[0]
        count = result.hits.total
        cardinality = coalesce(r.value, r._nested.value, 0 if r.doc_count == 0 else None)
        if cardinality == None:
            Log.error("logic error")

        query = Dict(size=0)
        if cardinality > 1000 or (count >= 30 and cardinality == count) or (count >= 1000 and cardinality / count > 0.99):
            Log.note("{{table}}.{{field}} has {{num}} parts", table=c.table, field=c.es_column, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.type in _elasticsearch.ES_NUMERIC_TYPES and cardinality > 30:
            Log.note("{{field}} has {{num}} parts", field=c.name, num=cardinality)
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": count,
                        "cardinality": cardinality,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
            return
        elif c.nested_path:
            query.aggs[literal_field(c.name)] = {
                "nested": {"path": listwrap(c.nested_path)[0]},
                "aggs": {"_nested": {"terms": {"field": c.es_column, "size": 0}}}
            }
        else:
            query.aggs[literal_field(c.name)] = {"terms": {"field": c.es_column, "size": 0}}

        result = self.default_es.post("/" + es_index + "/_search", data=query)
        aggs = result.aggregations.values()[0]
        if aggs._nested:
            parts = jx.sort(aggs._nested.buckets.key)
        else:
            parts = jx.sort(aggs.buckets.key)

        Log.note("{{field}} has {{parts}}", field=c.name, parts=parts)
        with self.meta.columns.locker:
            self.meta.columns.update({
                "set": {
                    "count": count,
                    "cardinality": cardinality,
                    "partitions": parts,
                    "last_updated": Date.now()
                },
                "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
            })
    except Exception as e:
        if "IndexMissingException" in e and c.table.startswith(TEST_TABLE_PREFIX):
            with self.meta.columns.locker:
                self.meta.columns.update({
                    "set": {
                        "count": 0,
                        "cardinality": 0,
                        "last_updated": Date.now()
                    },
                    "clear": ["partitions"],
                    "where": {"eq": {"es_index": c.es_index, "es_column": c.es_column}}
                })
        else:
            self.meta.columns.update({
                "set": {
                    "last_updated": Date.now()
                },
                "clear": [
                    "count",
                    "cardinality",
                    "partitions",
                ],
                "where": {"eq": {"table": c.table, "es_column": c.es_column}}
            })
        Log.warning("Could not get {{col.table}}.{{col.es_column}} info", col=c, cause=e)
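# HEURISTIC, ISOLATED (thresholds copied from the code above): decide when a
# column has too many distinct values to be worth materializing as
# partitions. float() guards against Python 2 integer division, which the
# inline expression above avoids only when its operands are already floats.
def too_many_parts(count, cardinality):
    if cardinality > 1000:
        return True  # plainly high-cardinality
    if count >= 30 and cardinality == count:
        return True  # every value distinct: an id-like column
    if count >= 1000 and float(cardinality) / count > 0.99:
        return True  # nearly every value distinct
    return False

assert too_many_parts(50, 7) is False      # few parts: worth listing them
assert too_many_parts(5000, 4996) is True  # effectively an identifier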