Example #1
0
            def do():
                # Generator-based (pre-``async def``) coroutine driving one full
                # inspection run: register the "inspecting" status, feed document-id
                # batches to worker processes, merge per-batch results, compute
                # metadata / ES mapping, then register the final status.
                # NOTE(review): reads/writes many closure variables declared in the
                # enclosing method (mode, batch_size, got_error, sample, limit,
                # data_provider, registerer_obj, data_provider_type,
                # backend_provider, started_at, t0, kwargs) -- not visible in this
                # excerpt, confirm against the enclosing scope.
                yield from asyncio.sleep(0.0)  # yield once so the event loop can schedule us
                nonlocal mode

                # process info handed to the job manager for monitoring/display
                pinfo = {
                    "category": INSPECTOR_CATEGORY,
                    "source": "%s" % repr(data_provider),
                    "step": "",
                    "description": ""
                }
                # register begin of inspection (differ slightly depending on type)
                if data_provider_type == "source":
                    registerer_obj.register_status("inspecting",
                                                   subkey="inspect")
                elif data_provider_type == "build":
                    registerer_obj.register_status("inspecting",
                                                   transient=True,
                                                   init=True,
                                                   job={"step": "inspect"})

                self.logger.info(
                    "Running inspector on %s (type:%s,data_provider:%s)" %
                    (repr(data_provider), data_provider_type,
                     backend_provider))
                if sample is not None:
                    self.logger.info(
                        "Sample set to %s, inspect only a subset of data",
                        sample)
                if limit is None:
                    self.logger.info("Inspecting all the documents")
                else:
                    nonlocal batch_size
                    # adjust batch_size so we inspect only "limit" docs if batch is smaller than the limit
                    if batch_size > limit:
                        batch_size = limit
                    self.logger.info("Inspecting only %s documents", limit)
                # make it pickleable
                if data_provider_type == "source":
                    # because register_obj is also used to fetch data, it has to be unprepare() for pickling
                    registerer_obj.unprepare()
                else:
                    # NOTE: do not unprepare() the builder, we'll loose the target name
                    # (it's be randomly generated again) and we won't be able to register results
                    pass

                cnt = 0  # number of batches actually submitted to workers
                doccnt = 0  # approximate number of documents covered so far
                jobs = []  # worker futures, awaited all at once below
                # normalize mode param and prepare global results
                if type(mode) == str:
                    mode = [mode]

                # split "mode" into actual inspection modes + optional converters
                # applied to the final results
                converters, mode = btinspect.get_converters(mode)

                inspected = {}
                for m in mode:
                    inspected.setdefault(m, {})

                backend = create_backend(backend_provider).target_collection
                for ids in id_feeder(backend, batch_size=batch_size):
                    if sample is not None:
                        # probabilistic sampling: skip this whole batch with
                        # probability (1 - sample)
                        if random.random() > sample:
                            continue
                    cnt += 1
                    doccnt += batch_size
                    # NOTE(review): doccnt advances one whole batch at a time, so
                    # slightly more than "limit" docs may end up inspected
                    if limit and doccnt > limit:
                        break
                    pinfo["description"] = "batch #%s" % cnt

                    def batch_inspected(bnum, i, f):
                        # Done-callback for one worker future: merge that batch's
                        # result into the shared "inspected" map per mode, or
                        # record the error for the caller to re-raise later.
                        nonlocal inspected
                        nonlocal got_error
                        nonlocal mode
                        try:
                            res = f.result()
                            for m in mode:
                                inspected[m] = btinspect.merge_record(
                                    inspected[m], res[m], m)
                        except Exception as e:
                            got_error = e
                            self.logger.error(
                                "Error while inspecting data from batch #%s: %s"
                                % (bnum, e))
                            raise

                    pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
                    # all maps later and then generate the ES mapping from there
                    self.logger.info("Creating inspect worker for batch #%s" %
                                     cnt)
                    job = yield from self.job_manager.defer_to_process(
                        pinfo,
                        partial(inspect_data,
                                backend_provider,
                                ids,
                                mode=mode,
                                pre_mapping=pre_mapping,
                                **kwargs))
                    job.add_done_callback(partial(batch_inspected, cnt, ids))
                    jobs.append(job)

                # wait until every submitted batch worker has completed
                yield from asyncio.gather(*jobs)

                # compute metadata (they were skipped before)
                for m in mode:
                    if m == "mapping":
                        try:
                            # turn the merged intermediate mapping into an actual
                            # Elasticsearch mapping
                            inspected["mapping"] = es.generate_es_mapping(
                                inspected["mapping"])
                            # metadata for mapping only once generated
                            inspected = btinspect.compute_metadata(
                                inspected, m)
                        except es.MappingError as e:
                            # keep the pre-mapping plus errors so they can be
                            # registered/reported in fully_inspected() below
                            inspected["mapping"] = {
                                "pre-mapping": inspected["mapping"],
                                "errors": e.args[1]
                            }
                    else:
                        inspected = btinspect.compute_metadata(inspected, m)

                # just potential converters
                btinspect.run_converters(inspected, converters)

                def fully_inspected(res):
                    # Finalize results: stringify, scrub values mongo can't store,
                    # then register success/failure depending on provider type.
                    nonlocal got_error
                    try:
                        res = btinspect.stringify_inspect_doc(res)
                        _map = {"results": res}
                        _map["data_provider"] = repr(data_provider)
                        _map["started_at"] = started_at
                        _map["duration"] = timesofar(t0)

                        # when inspecting with "stats" mode, we can get huge number but mongo
                        # can't store more than 2^64, make sure to get rid of big nums there
                        def clean_big_nums(k, v):
                            # TODO: same with float/double? seems mongo handles more there ?
                            if isinstance(v, int) and v > 2**64:
                                return k, math.nan
                            else:
                                return k, v

                        dict_traverse(_map, clean_big_nums)
                        # register begin of inspection (differ slightly depending on type)
                        if "mapping" in mode and "errors" in res[
                                "mapping"] and "pre-mapping" in res["mapping"]:
                            # ES mapping generation failed earlier: register the
                            # failure and surface the error to the caller
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           inspect=_map)
                            got_error = InspectorError(
                                res["mapping"]["errors"])
                        else:
                            if data_provider_type == "source":
                                registerer_obj.register_status(
                                    "success", subkey="inspect", inspect=_map)
                            elif data_provider_type == "build":
                                registerer_obj.register_status(
                                    "success",
                                    job={"step": "inspect"},
                                    build={"inspect": _map})
                    except Exception as e:
                        self.logger.exception(
                            "Error while inspecting data: %s" % e)
                        got_error = e
                        if data_provider_type == "source":
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           err=repr(e))
                        elif data_provider_type == "build":
                            registerer_obj.register_status(
                                "failed", job={"err": repr(e)})

                fully_inspected(inspected)
                if data_provider_type is None:
                    return
                if got_error:
                    raise got_error
Example #2
0
            def do():
                # Generator-based (pre-``async def``) coroutine driving one full
                # inspection run: register the "inspecting" status, feed document-id
                # batches to worker processes, merge per-batch results, compute
                # metadata / ES mapping, then register the final status.
                # NOTE(review): reads/writes many closure variables declared in the
                # enclosing method (mode, batch_size, got_error, data_provider,
                # registerer_obj, data_provider_type, backend_provider, started_at,
                # t0, kwargs) -- not visible in this excerpt, confirm against the
                # enclosing scope.
                yield from asyncio.sleep(0.0)  # yield once so the event loop can schedule us
                nonlocal mode

                # process info handed to the job manager for monitoring/display
                pinfo = {
                    "category": INSPECTOR_CATEGORY,
                    "source": "%s" % repr(data_provider),
                    "step": "",
                    "description": ""
                }
                # register begin of inspection (differ slightly depending on type)
                if data_provider_type == "source":
                    registerer_obj.register_status("inspecting",
                                                   subkey="inspect")
                elif data_provider_type == "build":
                    registerer_obj.register_status("inspecting",
                                                   transient=True,
                                                   init=True,
                                                   job={"step": "inspect"})

                self.logger.info("Running inspector on %s (type:%s,data_provider:%s)" % \
                        (repr(data_provider),data_provider_type,backend_provider))
                # make it pickleable
                if data_provider_type == "source":
                    # because register_obj is also used to fetch data, it has to be unprepare() for pickling
                    registerer_obj.unprepare()
                else:
                    # NOTE: do not unprepare() the builder, we'll loose the target name
                    # (it's be randomly generated again) and we won't be able to register results
                    pass

                cnt = 0  # number of batches submitted to workers
                jobs = []  # worker futures, awaited all at once below
                # normalize mode param and prepare global results
                if type(mode) == str:
                    mode = [mode]
                inspected = {}
                for m in mode:
                    inspected.setdefault(m, {})

                backend = create_backend(backend_provider).target_collection
                for ids in id_feeder(backend, batch_size=batch_size):
                    cnt += 1
                    pinfo["description"] = "batch #%s" % cnt

                    def batch_inspected(bnum, i, f):
                        # Done-callback for one worker future: merge that batch's
                        # result into the shared "inspected" map per mode, or
                        # record the error for the caller to re-raise later.
                        nonlocal inspected
                        nonlocal got_error
                        nonlocal mode
                        try:
                            res = f.result()
                            for m in mode:
                                inspected[m] = btinspect.merge_record(
                                    inspected[m], res[m], m)
                        except Exception as e:
                            got_error = e
                            self.logger.error(
                                "Error while inspecting data from batch #%s: %s"
                                % (bnum, e))
                            raise

                    pre_mapping = "mapping" in mode  # we want to generate intermediate mapping so we can merge
                    # all maps later and then generate the ES mapping from there
                    self.logger.info("Creating inspect worker for batch #%s" %
                                     cnt)
                    job = yield from self.job_manager.defer_to_process(
                        pinfo,
                        partial(inspect_data,
                                backend_provider,
                                ids,
                                mode=mode,
                                pre_mapping=pre_mapping,
                                **kwargs))
                    job.add_done_callback(partial(batch_inspected, cnt, ids))
                    jobs.append(job)

                # wait until every submitted batch worker has completed
                yield from asyncio.gather(*jobs)

                # compute metadata (they were skipped before)
                for m in mode:
                    if m == "mapping":
                        try:
                            # turn the merged intermediate mapping into an actual
                            # Elasticsearch mapping
                            inspected["mapping"] = es.generate_es_mapping(
                                inspected["mapping"])
                            # metadata for mapping only once generated
                            inspected = btinspect.compute_metadata(
                                inspected, m)
                        except es.MappingError as e:
                            # keep the pre-mapping plus errors so they can be
                            # registered/reported in fully_inspected() below
                            inspected["mapping"] = {
                                "pre-mapping": inspected["mapping"],
                                "errors": e.args[1]
                            }
                    else:
                        inspected = btinspect.compute_metadata(inspected, m)

                def fully_inspected(res):
                    # Finalize results: stringify them, then register
                    # success/failure depending on provider type.
                    nonlocal got_error
                    try:
                        res = btinspect.stringify_inspect_doc(res)
                        _map = {"results": res}
                        _map["data_provider"] = repr(data_provider)
                        _map["started_at"] = started_at
                        _map["duration"] = timesofar(t0)
                        # register begin of inspection (differ slightly depending on type)
                        if "mapping" in mode and "errors" in res[
                                "mapping"] and "pre-mapping" in res["mapping"]:
                            # ES mapping generation failed earlier: register the
                            # failure and surface the error to the caller
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           inspect=_map)
                            got_error = InspectorError(
                                res["mapping"]["errors"])
                        else:
                            if data_provider_type == "source":
                                registerer_obj.register_status(
                                    "success", subkey="inspect", inspect=_map)
                            elif data_provider_type == "build":
                                registerer_obj.register_status(
                                    "success",
                                    job={"step": "inspect"},
                                    build={"inspect": _map})
                    except Exception as e:
                        self.logger.exception(
                            "Error while inspecting data: %s" % e)
                        got_error = e
                        if data_provider_type == "source":
                            registerer_obj.register_status("failed",
                                                           subkey="inspect",
                                                           err=repr(e))
                        elif data_provider_type == "build":
                            registerer_obj.register_status(
                                "failed", job={"err": repr(e)})

                fully_inspected(inspected)
                if data_provider_type is None:
                    return
                if got_error:
                    raise got_error