def __init__(self, pushlog_settings):
    with Timer("get pushlog"):
        if pushlog_settings.disable:
            all_pushlogs = []
        else:
            with DB(pushlog_settings) as db:
                all_pushlogs = db.query("""
                    SELECT
                        pl.`date`,
                        left(ch.node, 12) revision,
                        coalesce(bm.alt_name, br.name) branch
                    FROM
                        changesets ch
                    LEFT JOIN
                        pushlogs pl ON pl.id = ch.pushlog_id
                    LEFT JOIN
                        branches br ON br.id = pl.branch_id
                    LEFT JOIN
                        branch_map bm ON br.id = bm.id
                    WHERE
                        pl.date > {{oldest_date}}
                """, {"oldest_date": TOO_OLD})
        Log.note("Got pushlog, now indexing...")
        self.pushlog = wrap(Q.index(all_pushlogs, ["branch", "revision"])._data)
        self.locker = Lock()
        self.unknown_branches = set()
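
        # NOTE: Q.index ABOVE KEYS PUSHLOGS BY branch, THEN revision, SO
        # self.pushlog[branch][revision] GIVES THE LIST OF MATCHING PUSH ROWS
        # (SEE transform() BELOW)
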
def get_existing_ids(es, settings, branches):
    #FIND WHAT'S IN ES
    bad_ids = []
    int_ids = set()

    demand_pushlog = {"match_all":{}}
    if branches:
        demand_pushlog = {"or": [
            {"not": {"missing": {"field": "test_build.push_date"}}},
            {"not": {"missing": {"field": "test_build.no_pushlog"}}}
        ]}

    if settings.elasticsearch.debug and settings.production.step < 10:
        # DEBUG WITH A SMALL STEP SIZE: CHEAPER TO SIMPLY RELOAD THIS SMALL NUMBER
        return set()

    with ESQuery(es) as esq:
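        # FIND THE HIGHEST datazilla.id CURRENTLY IN ES TO BOUND THE SCAN BELOW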
        max_id = esq.query({
            "from": es.settings.alias,
            "select": {"value": "datazilla.id", "aggregate": "max"}
        })

        interval_size = 200000
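        # SCAN THE ID SPACE IN FIXED-SIZE WINDOWS; THE terms FACET BELOW RETURNS
        # AT MOST interval_size TERMS, SO EACH WINDOW IS SIZED TO MATCH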
        for mini, maxi in Q.intervals(settings.production.min, max_id+interval_size, interval_size):
            existing_ids = es.search({
                "query": {
                    "filtered": {
                        "query": {"match_all": {}},
                        "filter": {"and": [
                            {"range": {"datazilla.id": {"gte": mini, "lt": maxi}}},
                            demand_pushlog
                        ]}
                    }
                },
                "from": 0,
                "size": 0,
                "sort": [],
                "facets": {
                    "ids": {"terms": {"field": "datazilla.id", "size": interval_size}}
                }
            })

            for t in existing_ids.facets.ids.terms:
                try:
                    int_ids.add(int(t.term))
                except Exception, e:
                    bad_ids.append(t.term)

        existing_ids = int_ids
        Log.println("Number of ids in ES: " + str(len(existing_ids)))
        Log.println("BAD ids in ES: " + str(bad_ids))
        return existing_ids
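
    # A MINIMAL USAGE SKETCH (ASSUMED NAMES, NOT PART OF THE ORIGINAL FLOW):
    #     existing_ids = get_existing_ids(es, settings, branches)
    #     missing_ids = all_ids - existing_ids
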
    def transform(self, id, datazilla):
        try:
            r = datazilla.json_blob

            #ADD DATAZILLA MARKUP
            r.datazilla = {
                "id": id,
                "date_loaded": datazilla.date_loaded * 1000,
                "error_flag": datazilla.error_flag,
                "test_run_id": datazilla.test_run_id,
                "processed_flag": datazilla.processed_flag,
                "error_msg": datazilla.error_msg
            }

            #CONVERT UNIX TIMESTAMP TO MILLISECOND TIMESTAMP
            r.testrun.date *= 1000

            def mainthread_transform(r):
                if r == None:
                    return None

                output = Struct()

                for i in r.mainthread_readbytes:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].readbytes = i[0]
                r.mainthread_readbytes = None

                for i in r.mainthread_writebytes:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].writebytes = i[0]
                r.mainthread_writebytes = None

                for i in r.mainthread_readcount:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].readcount = i[0]
                r.mainthread_readcount = None

                for i in r.mainthread_writecount:
                    output[literal_field(i[1])].name = i[1]
                    output[literal_field(i[1])].writecount = i[0]
                r.mainthread_writecount = None

                r.mainthread = output.values()
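
            # EXAMPLE (ASSUMED ROW SHAPE [value, name], PER THE INDEXING ABOVE):
            #     r.mainthread_readbytes == [[2048, "a.js"], ...]
            # IS PIVOTED INTO
            #     r.mainthread == [{"name": "a.js", "readbytes": 2048, ...}, ...]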

            mainthread_transform(r.results_aux)
            mainthread_transform(r.results_xperf)

            #ADD PUSH LOG INFO
            try:
                branch = r.test_build.branch
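                # BRANCH NAMES ENDING IN "-Non-PGO" DENOTE NON-PGO BUILDS; STRIP
                # THE 8-CHARACTER SUFFIX SO THE PUSHLOG LOOKUP USES THE BASE BRANCH NAME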
                if branch.endswith("-Non-PGO"):
                    r.test_build.branch = branch
                    r.test_build.pgo = False
                    branch = branch[0:-8]
                else:
                    r.test_build.pgo = True

                with Profiler("get from pushlog"):
                    if not self.pushlog:
                        #NO PUSHLOG MEANS WE DO NOTHING TO MARKUP TEST RESULTS
                        pass
                    elif self.pushlog[branch]:
                        possible_dates = self.pushlog[branch][r.test_build.revision]
                        if possible_dates:
                            r.test_build.push_date = int(Math.round(possible_dates[0].date * 1000))
                        else:
                            if r.test_build.revision == 'NULL':
                                r.test_build.no_pushlog = True  # OOPS! SOMETHING BROKE
                            elif CNV.milli2datetime(Math.min(r.testrun.date, r.datazilla.date_loaded)) < PUSHLOG_TOO_OLD:
                                Log.note("{{branch}} @ {{revision}} has no pushlog, transforming anyway", r.test_build)
                                r.test_build.no_pushlog = True
                            else:
                                Log.note("{{branch}} @ {{revision}} has no pushlog, try again later", r.test_build)
                                return []  # TRY AGAIN LATER
                    else:
                        with self.locker:
                            if branch not in self.unknown_branches:
                                Log.note("Whole branch {{branch}} has no pushlog", {"branch":branch})
                                self.unknown_branches.add(branch)
                            if CNV.milli2datetime(Math.min(r.testrun.date, r.datazilla.date_loaded)) < PUSHLOG_TOO_OLD:
                                r.test_build.no_pushlog = True
                            else:
                                r.test_build.no_pushlog = True
                                #return [r]  #TODO: DO THIS IF WE FIGURE OUT HOW TO HANDLE THE VERY LARGE NUMBER OF RESULTS WITH NO PUSHLOG

            except Exception, e:
                Log.warning("{{branch}} @ {{revision}} has no pushlog", r.test_build, e)

            new_records = []

            # RECORD THE UNKNOWN PART OF THE TEST RESULTS
            remainder = r.copy()
            remainder.results = None
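            # (ASSUMED: THE FOUR STANDARD PROPERTIES ARE test_machine, datazilla,
            # testrun AND test_build; MORE THAN FOUR KEYS MEANS EXTRA MATERIAL TO KEEP)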
            if len(remainder.keys()) > 4:
                new_records.append(remainder)

            #RECORD TEST RESULTS
            total = StructList()
            if r.testrun.suite in ["dromaeo_css", "dromaeo_dom"]:
                #dromaeo IS SPECIAL, REPLICATES ARE IN SETS OF FIVE
                #RECORD ALL RESULTS
                for i, (test_name, replicates) in enumerate(r.results.items()):
                    for g, sub_results in Q.groupby(replicates, size=5):
                        new_record = Struct(
                            test_machine=r.test_machine,
                            datazilla=r.datazilla,
                            testrun=r.testrun,
                            test_build=r.test_build,
                            result={
                                "test_name": unicode(test_name) + "." + unicode(g),
                                "ordering": i,
                                "samples": sub_results
                            }
                        )
                        try:
                            s = stats(sub_results)
                            new_record.result.stats = s
                            total.append(s)
                        except Exception, e:
                            Log.warning("can not reduce series to moments", e)
                        new_records.append(new_record)
                        "ordering": -1,
                        "stats": geo_mean(total)
                    }
                )
                new_records.append(new_record)

                # ADD RECORD FOR GRAPH SERVER SUMMARY
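                # (ASSUMED INTENT: SORT THE PER-TEST MEANS AND DROP THE HIGHEST,
                # MIMICKING THE OLD GRAPH SERVER AGGREGATE)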
                new_record = Struct(
                    test_machine=r.test_machine,
                    datazilla=r.datazilla,
                    testrun=r.testrun,
                    test_build=r.test_build,
                    result={
                        "test_name": "summary_old",
                        "ordering": -1,
                        "stats": Stats(samples=Q.sort(total.mean)[:len(total)-1:])
                    }
                )
                new_records.append(new_record)

            return new_records
        except Exception, e:
            Log.error("Transformation failure on id={{id}}", {"id":id}, e)


def stats(values):
    """
    RETURN LOTS OF AGGREGATES
    """
    if values == None:
        return None
                            "length": len(CNV.object2JSON(line)),
                            "prefix": CNV.object2JSON(line)[0:130]
                        }, e)
        missing_ids = missing_ids - existing_ids

    #COPY MISSING DATA TO ES
    try:
        with ThreadedQueue(es, size=nvl(es.settings.batch_size, 100)) as es_sink:
            with ThreadedQueue(File(settings.param.output_file), size=50) as file_sink:
                simple_etl = functools.partial(etl, es_sink, file_sink, settings, transformer, max_existing_id)

                num_not_found = 0
                with Multithread(simple_etl, threads=settings.production.threads) as many:
                    results = many.execute([
                        {"id": id}
                        for id in Q.sort(missing_ids)[:nvl(settings.production.step, NUM_PER_BATCH)]
                    ])
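                    # ASSUMES DATAZILLA IDS ARE MOSTLY CONTIGUOUS: AFTER max_tries
                    # CONSECUTIVE MISSES WE ARE LIKELY PAST THE NEWEST ID, SO STOP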
                    for result in results:
                        if not result:
                            num_not_found += 1
                            if num_not_found > nvl(settings.production.max_tries, 10):
                                many.inbound.pop_all()  # CLEAR THE QUEUE OF OTHER WORK
                                many.stop()
                                break
                        else:
                            num_not_found = 0
    except (KeyboardInterrupt, SystemExit):
        Log.println("Shutdown Started, please be patient")
    except Exception, e:
        Log.error("Unusual shutdown!", e)