def insights_iter(file_path: pathlib.Path) -> Iterable[Prediction]:
    """Lazily deserialize each record of a JSONL file into a Prediction.

    The dacite config casts raw values into PredictionType where needed.
    """
    # Build the config once instead of on every iteration.
    cast_config = dacite.Config(cast=[PredictionType])
    for raw_item in jsonl_iter(file_path):
        yield dacite.from_dict(
            data_class=Prediction,
            data=raw_item,
            config=cast_config,
        )
def stream(self) -> ProductStream:
    """Return a ProductStream over this dataset's JSONL file.

    Transparently handles gzip-compressed dumps (``.gz`` suffix).
    """
    path_str = str(self.jsonl_path)
    iterator = (
        gzip_jsonl_iter(path_str)
        if path_str.endswith(".gz")
        else jsonl_iter(path_str)
    )
    return ProductStream(iterator)
def ocr_iter(
    source: Union[str, TextIO, pathlib.Path]
) -> Iterable[Tuple[Optional[str], Dict]]:
    """Yield ``(source, ocr_json)`` pairs from a variety of inputs.

    ``source`` may be:
    - a ``pathlib.Path`` to a JSONL file,
    - an open file object (anything that is not a ``str``),
    - a product barcode string (OCR JSON is then fetched remotely),
    - a string path to a directory of ``.json`` files, or to a single
      ``.json``/``.jsonl`` file.

    Fix vs. original: loop-local results no longer clobber the ``source``
    parameter (it was reassigned inside the barcode and directory branches),
    and dict keys are iterated directly instead of via ``.keys()``.
    """
    if isinstance(source, pathlib.Path):
        yield from ocr_content_iter(jsonl_iter(source))
    elif not isinstance(source, str):
        # File-like object (TextIO).
        yield from ocr_content_iter(jsonl_iter_fp(source))
    elif is_barcode(source):
        barcode: str = source
        image_data = fetch_images_for_ean(barcode)["product"]["images"]
        for image_id in image_data:
            # Non-digit ids are selected/derived images, not raw uploads.
            if image_id.isdigit():
                print("Getting OCR for image {}".format(image_id))
                data = get_json_for_image(barcode, image_id)
                image_source = get_source(image_id, barcode=barcode)
                if data:
                    yield image_source, data
    else:
        input_path = pathlib.Path(source)
        if not input_path.exists():
            print("Unrecognized input: {}".format(input_path))
            return
        if input_path.is_dir():
            for json_path in input_path.glob("**/*.json"):
                with open(str(json_path), "rb") as f:
                    file_source = get_source(
                        json_path.stem, json_path=str(json_path)
                    )
                    yield file_source, orjson.loads(f.read())
        else:
            if ".json" in input_path.suffixes:
                with open(str(input_path), "rb") as f:
                    yield None, orjson.loads(f.read())
            elif ".jsonl" in input_path.suffixes:
                yield from ocr_content_iter(jsonl_iter(input_path))
def insert_batch(
    data_path: pathlib.Path, model_name: str, model_version: str
) -> int:
    """Insert object-detection results from a JSONL dump into the DB.

    For each item, creates an ImagePrediction (detections with score > 0.1)
    and a LogoAnnotation for every detection with score >= 0.5. Items whose
    ``(model_name, source_image)`` key is already in the seen set, or whose
    image is unknown in the DB, are skipped.

    Returns the number of ImagePrediction rows created.

    Fix vs. original: the inner annotation loop reused the name ``item``,
    shadowing the outer loop variable — renamed to ``result`` to avoid the
    latent aliasing bug.
    """
    timestamp = datetime.datetime.utcnow()
    logger.info("Loading seen set...")
    seen_set = get_seen_set()
    logger.info("Seen set loaded")
    inserted = 0
    for item in tqdm.tqdm(jsonl_iter(data_path)):
        barcode = item["barcode"]
        source_image = generate_image_path(
            barcode=barcode, image_id=item["image_id"]
        )
        key = (model_name, source_image)
        if key in seen_set:
            continue
        image_instance = ImageModel.get_or_none(source_image=source_image)
        if image_instance is None:
            logger.warning("Unknown image in DB: {}".format(source_image))
            continue
        # Keep only sufficiently confident detections.
        results = [r for r in item["result"] if r["score"] > 0.1]
        # Generator avoids building a throwaway list; None when no results.
        max_confidence = max((r["score"] for r in results), default=None)
        inserted += 1
        image_prediction = ImagePrediction.create(
            type=TYPE,
            image=image_instance,
            timestamp=timestamp,
            model_name=model_name,
            model_version=model_version,
            data={"objects": results},
            max_confidence=max_confidence,
        )
        for i, result in enumerate(results):
            if result["score"] >= 0.5:
                LogoAnnotation.create(
                    image_prediction=image_prediction,
                    index=i,
                    score=result["score"],
                    bounding_box=result["bounding_box"],
                )
        seen_set.add(key)
    return inserted
def from_jsonl(self, file_path):
    """Import insights read from a JSONL file, never auto-applying them."""
    self.import_insights(jsonl_iter(file_path), automatic=False)
def insights_iter(file_path: pathlib.Path) -> Iterable[ProductInsights]:
    """Lazily parse each JSONL record into a ProductInsights instance."""
    for raw in jsonl_iter(file_path):
        yield ProductInsights.from_dict(raw)
def from_jsonl(self, file_path: pathlib.Path, server_domain: str):
    """Import insights from ``file_path`` for ``server_domain``.

    Insights are never marked as automatically applicable.
    """
    self.import_insights(
        jsonl_iter(file_path),
        server_domain=server_domain,
        automatic=False,
    )
import pathlib
from random import shuffle

from robotoff import settings
from robotoff.utils import dump_jsonl, jsonl_iter

# Split a category JSONL dataset into train/val/test (80/10/10) after
# shuffling it in place.
lang = "pt"
input_path: pathlib.Path = (
    settings.DATASET_DIR / "category" / "category_{}.jsonl".format(lang)
)

items = list(jsonl_iter(input_path))
shuffle(items)

val_count = len(items) // 10
splits = {
    "val": items[:val_count],
    "test": items[val_count : 2 * val_count],
    "train": items[2 * val_count :],
}
for split_name, split_items in splits.items():
    dump_jsonl(
        input_path.with_name("category_{}.{}.jsonl".format(lang, split_name)),
        split_items,
    )