def extract_from_datazilla_using_id(es, settings, transformer):

    existing_ids = get_existing_ids(es, settings, transformer.pushlog.keys())
    max_existing_id = nvl(MAX(existing_ids), settings.production.min)
    holes = set(range(settings.production.min, max_existing_id)) - existing_ids
    missing_ids = set(range(settings.production.min, max_existing_id+nvl(settings.production.step, NUM_PER_BATCH))) - existing_ids

    Log.note("Number missing: {{num}}", {"num": len(missing_ids)})
    Log.note("Number in holes: {{num}}", {"num": len(holes)})
    #FASTER IF NO INDEXING IS ON
    es.set_refresh_interval(-1)

    #FILE IS FASTER THAN NETWORK
    if (len(holes) > 10000 or settings.args.scan_file or settings.args.restart) and File(settings.param.output_file).exists:
        #ASYNC PUSH TO ES IN BATCHES (DEFAULT batch_size: 100)
        with Timer("Scan file for missing ids"):
            with ThreadedQueue(es, size=nvl(es.settings.batch_size, 100)) as json_for_es:
                num = 0
                for line in File(settings.param.output_file):
                    try:
                        if len(line.strip()) == 0:
                            continue
                        col = line.split("\t")
                        id = int(col[0])
                        # if id==3003529:
                        #     Log.debug()
                        if id < settings.production.min:
                            continue
                        if id in existing_ids:
                            continue

                        if num > settings.production.step:
                            return
                        num += 1

                        with Profiler("decode and transform"):
                            data = CNV.JSON2object(col[-1])
                            if data.test_run_id:
                                with Profiler("transform"):
                                    data = transformer.transform(id, data)
                                json_for_es.extend({"value": d} for d in data)
                                Log.note("Added {{id}} from file", {"id": id})

                                existing_ids.add(id)
                            else:
                                Log.note("Skipped {{id}} from file (no test_run_id)", {"id": id})
                                num -= 1

                    except Exception, e:
                        Log.warning("Bad line id={{id}} ({{length}}bytes):\n\t{{prefix}}", {
                            "id": id,
                            "length": len(CNV.object2JSON(line)),
                            "prefix": CNV.object2JSON(line)[0:130]
                        }, e)
        missing_ids = missing_ids - existing_ids

    #COPY MISSING DATA TO ES
    try:
        with ThreadedQueue(es, size=nvl(es.settings.batch_size, 100)) as es_sink:
            with ThreadedQueue(File(settings.param.output_file), size=50) as file_sink:
                simple_etl = functools.partial(etl, *[es_sink, file_sink, settings, transformer, max_existing_id])

                num_not_found = 0
                with Multithread(simple_etl, threads=settings.production.threads) as many:
                    results = many.execute([
                        {"id": id}
                        for id in Q.sort(missing_ids)[:nvl(settings.production.step, NUM_PER_BATCH):]
                    ])
                    for result in results:
                        if not result:
                            num_not_found += 1
                            if num_not_found > nvl(settings.production.max_tries, 10):
                                many.inbound.pop_all()  # CLEAR THE QUEUE OF OTHER WORK
                                many.stop()
                                break  # STOP CONSUMING RESULTS AFTER TOO MANY MISSES (ASSUMED)
    finally:
        #RESTORE THE REFRESH INTERVAL DISABLED AT THE TOP OF THIS FUNCTION
        es.set_refresh_interval(1)
def geo_mean(values):
    """
    GIVEN AN ARRAY OF dicts, CALC THE GEO-MEAN ON EACH ATTRIBUTE
    """
    agg = Struct()
    for d in values:
        for k, v in d.items():
            if v != 0:
                agg[k] = nvl(agg[k], ZeroMoment.new_instance()) + Math.log(Math.abs(v))
    return {k: Math.exp(v.stats.mean) for k, v in agg.items()}
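# EXAMPLE FOR geo_mean ABOVE (ASSUMING ZeroMoment ACCUMULATES SAMPLES VIA + AND EXPOSES .stats.mean):
#     geo_mean([{"a": 2, "b": 1}, {"a": 8}]) -> {"a": 4.0, "b": 1.0}
# (log(2) + log(8)) / 2 == log(4), SO exp() GIVES 4.0; "b" CONTRIBUTES ONLY log(1) == 0, SO exp(0) == 1.0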
def main():
    try:
        settings = startup.read_settings(defs=[{
            "name": ["--no_restart", "--no_reset", "--no_redo", "--norestart", "--noreset", "--noredo"],
            "help": "do not allow creation of new index (for debugging rouge resets)",
            "action": "store_true",
            "dest": "no_restart"
        }, {
            "name": ["--restart", "--reset", "--redo"],
            "help": "force a reprocessing of all data",
            "action": "store_true",
            "dest": "restart"
        }, {
            "name": ["--file", "--scan_file", "--scanfile", "--use_file", "--usefile"],
            "help": "scan file for missing ids",
            "action": "store_true",
            "dest": "scan_file"
        }, {
            "name": ["--nofile", "--no_file", "--no-file"],
            "help": "do not scan file for missing ids",
            "action": "store_false",
            "dest": "scan_file"
        }])
        Log.start(settings.debug)

        with startup.SingleInstance(flavor_id=settings.args.filename):
            settings.production.threads = nvl(settings.production.threads, 1)
            settings.param.output_file = nvl(settings.param.output_file, "./results/raw_json_blobs.tab")

            transformer = DZ_to_ES(settings.pushlog)

            #RESET ONLY IF NEW Transform IS USED
            if settings.args.restart:
                es = Cluster(settings.elasticsearch).create_index(settings.elasticsearch)
                es.add_alias()
                es.delete_all_but_self()
                extract_from_datazilla_using_id(es, settings, transformer)
            else:
                es = Cluster(settings.elasticsearch).get_or_create_index(settings.elasticsearch)
                extract_from_datazilla_using_id(es, settings, transformer)
    except Exception, e:
        Log.error("Problem with etl", e)
def etl(es_sink, file_sink, settings, transformer, max_id, id):
    """
    PULL FROM DZ AND PUSH TO es_sink AND file_sink
    """

    # DEBUG GROWTH
    # with GC_LOCKER:
    #     try:
    #         if COUNTER.count % 100 == 0:
    #             # gc.collect()
    #             deltas, stats = objgraph.get_growth()
    #             Log.note("Deltas:\n{{deltas|indent}}", {"deltas": deltas})
    #     except Exception, e:
    #         Log.warning("objgraph problem", e)
    #
    #     COUNTER.count += 1

    url = settings.production.blob_url + "/" + str(id)
    try:
        with Timer("read {{id}} from DZ", {"id": id}):
            content = requests.get(url, timeout=nvl(settings.production.timeout, 30)).content
    except Exception, e:
        Log.warning("Failure to read from {{url}}", {"url": url}, e)
        return False
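
    #THE REST OF THIS FUNCTION IS A SKETCH: DECODE THE BLOB, RECORD IT TO file_sink,
    #TRANSFORM, AND PUSH TO es_sink, MIRRORING THE FILE-SCAN LOGIC IN
    #extract_from_datazilla_using_id().  THE LINE FORMAT AND NOT-FOUND HANDLING (VIA max_id)
    #ARE ASSUMPTIONS, NOT CONFIRMED BY THE SOURCE.
    try:
        data = CNV.JSON2object(content)
        file_sink.add(str(id) + "\t" + content + "\n")  #ASSUMED id<TAB>json LINE FORMAT, AS SCANNED ABOVE
        if data.test_run_id:
            with Profiler("transform"):
                data = transformer.transform(id, data)
            es_sink.extend({"value": d} for d in data)
            Log.note("Added {{id}} from DZ", {"id": id})
        else:
            Log.note("Skipped {{id}} from DZ (no test_run_id)", {"id": id})
        return True
    except Exception, e:
        Log.warning("Failure to process {{id}} from {{url}}", {"id": id, "url": url}, e)
        return False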

def copy_file_to_db(db_queue, filename):
    """
    READ id<TAB>json LINES FROM filename AND QUEUE SUMMARY RECORDS FOR THE DATABASE
    (FUNCTION NAME AND SIGNATURE ARE ASSUMED FROM THE FREE VARIABLES db_queue AND filename)
    """
    with ThreadedQueue(db_queue, 100) as records_for_db:
        added = set()

        for line in File(filename).iter():
            try:
                if len(line.strip()) == 0: continue
                col = line.split("\t")
                id = int(col[0])
                if id in added:
                    continue
                added.add(id)

                data = CNV.JSON2object(col[1])
                records_for_db.add({
                    "id": nvl(data.test_run_id, id),
                    "branch": data.json_blob.test_build.branch,
                    "name": data.json_blob.test_build.name,
                    "version": data.json_blob.test_build.version,
                    "suite": data.json_blob.testrun.suite,
                    "revision": data.json_blob.test_build.revision,
                    "date": data.json_blob.testrun.date
                })
                Log.note("Added {{id}} from file", {"id": data.test_run_id})
            except Exception, e:
                Log.warning("Bad line ({{length}}bytes):\n\t{{prefix}}", {
                    "length": len(CNV.object2JSON(line)),
                    "prefix": CNV.object2JSON(line)[0:130]
                }, e)

