def update(answers):
    """Updates accept_keys so that the stream can find new phrases."""
    log(f"RECIPE: Updating with {len(answers)} answers")
    for answer in answers:
        phrase = answer["text"]
        if answer["answer"] == "accept":
            accept_keys.append(phrase)

def ner_merge(
    dataset: str,
    recon_dataset: str,
    source: Union[str, Dataset],
    output_dir: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Stream a List of `recon.types.HardestExample` instances to prodigy
    for review/correction. Uses the Prodigy blocks interface to display
    prediction error information along with ner view
    """
    log("RECIPE: Starting recipe recon.ner_merge", locals())
    if isinstance(source, str):
        dataset = Dataset(recon_dataset).from_disk(source)
    else:
        dataset = source

    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)
    prodigy_raw_examples = DB.get_dataset(dataset)
    prodigy_examples = [Example(**eg) for eg in prodigy_raw_examples
                        if eg["answer"] == "accept"]
    prodigy_texts_to_examples = {e.text: e for e in prodigy_examples}

    prev_len = len(dataset)
    dataset.apply_("recon.v1.prodigy.merge_examples", prodigy_texts_to_examples)
    assert len(dataset) == prev_len

    if output_dir:
        log(f"RECIPE: Fixing {len(prodigy_examples)} examples in data")
        dataset.to_disk(output_dir)

def to_patterns(dataset=None, label=None, output_file=None):
    """
    Convert a list of seed phrases to a list of match patterns that can be
    used with ner.match. If no output file is specified, each pattern is
    printed so the recipe's output can be piped forward to ner.match.

    This is pretty much an exact copy of terms.to-patterns. The pattern for
    each example is just split on whitespace, so instead of:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new balance"}]}

    which won't match anything, you'll get:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    if label is None:
        prints(
            "--label is a required argument",
            "This is the label that will be assigned to all patterns "
            "created from terms collected in this dataset.",
            exits=1,
            error=True,
        )
    DB = connect()

    def get_pattern(term, label):
        return {
            "label": label,
            "pattern": [{"lower": t.lower()} for t in term["text"].split()],
        }

    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    if dataset is None:
        log("RECIPE: Reading input terms from sys.stdin")
        terms = (srsly.json_loads(line) for line in sys.stdin)
    else:
        if dataset not in DB:
            prints("Can't find dataset '{}'".format(dataset), exits=1, error=True)
        terms = DB.get_dataset(dataset)
        log("RECIPE: Reading {} input phrases from dataset {}".format(
            len(terms), dataset))
    if output_file:
        patterns = [
            get_pattern(term, label) for term in terms
            if term["answer"] == "accept"
        ]
        log("RECIPE: Generated {} patterns".format(len(patterns)))
        srsly.write_jsonl(output_file, patterns)
        prints("Exported {} patterns".format(len(patterns)), output_file)
    else:
        log("RECIPE: Outputting patterns")
        for term in terms:
            if term["answer"] == "accept":
                print(srsly.json_dumps(get_pattern(term, label)))

def model_stats(dataset, spacy_model, label=None, isPrf=False):
    """
    Evaluate the accuracy of a model on a dataset, with no training.
    Inspired by https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193/2
    found on https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193;
    basic model evaluation is adapted from the batch-train recipe.
    """
    log("RECIPE: Starting recipe ner.stats", locals())
    DB = connect()
    nlp = spacy.load(spacy_model)

    if isPrf:
        examples = gold_to_spacy(dataset, spacy_model)
        score = evaluate_prf(nlp, examples)
        print("Precision {:0.4f}\tRecall {:0.4f}\tF-score {:0.4f}".format(
            score['ents_p'], score['ents_r'], score['ents_f']))
    else:
        # ripped this from ner.batch-train recipe
        model = EntityRecognizer(nlp, label=label)
        evaldoc = merge_spans(DB.get_dataset(dataset))
        evals = list(split_sentences(model.orig_nlp, evaldoc))
        scores = model.evaluate(evals)
        print(
            "Accuracy {:0.4f}\tRight {:0.0f}\tWrong {:0.0f}\tUnknown {:0.0f}\t"
            "Entities {:0.0f}".format(scores['acc'], scores['right'],
                                      scores['wrong'], scores['unk'],
                                      scores['ents']))

def get_image_stream(stream, class_mapping_dict, ip, port, model_name, thresh):
    """Get the image stream with bounding box information.

    Arguments:
        stream (iterable): input image stream
        class_mapping_dict (dict): with key as int and value as class name
        ip (str): tensorflow serving IP
        port (str): tensorflow serving port
        model_name (str): model name in tensorflow serving
        thresh (float): score threshold for predictions

    Returns:
        A generator that constantly yields a prodigy task
    """
    for eg in stream:
        if not eg["image"].startswith("data"):
            msg = "Expected base64-encoded data URI, but got: '{}'."
            raise ValueError(msg.format(eg["image"][:100]))
        pil_image = Image.open(io.BytesIO(b64_uri_to_bytes(eg["image"])))
        predictions = get_predictions(eg, class_mapping_dict, ip, port, model_name)
        eg["width"] = pil_image.width
        eg["height"] = pil_image.height
        eg["spans"] = [get_span(pred, pil_image)
                       for pred in zip(*predictions) if pred[2] >= thresh]
        log("Using threshold {}, got {} predictions for file {}".format(
            thresh, len(eg["spans"]), eg["meta"]["file"]))
        task = copy.deepcopy(eg)
        yield task

def get_predictions(numpy_image, class_mapping_dict):
    """Gets predictions for a single image using a frozen model.

    Arguments:
        numpy_image (np.ndarray): A single numpy image
        class_mapping_dict (dict): with key as int and value as class name

    Returns:
        A tuple containing numpy arrays: (class_ids, class_names, scores, boxes)
    """
    global detection_graph
    global sess
    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
    detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
    detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
    detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
    num_detections = detection_graph.get_tensor_by_name('num_detections:0')
    image_np_expanded = np.expand_dims(numpy_image, axis=0)
    start_time = time()
    (boxes, scores, class_ids, num) = sess.run(
        [detection_boxes, detection_scores, detection_classes, num_detections],
        feed_dict={image_tensor: image_np_expanded})
    log("time taken for image shape {} is {} secs".format(
        numpy_image.shape, time() - start_time))
    boxes = np.squeeze(boxes)
    class_ids = np.squeeze(class_ids).astype(np.int32)
    class_names = np.array(
        [class_mapping_dict[class_id] for class_id in class_ids])
    scores = np.squeeze(scores)
    return (class_ids, class_names, scores, boxes)

def generic_tf_serving_client(data, ip, port, model_name, signature_name,
                              input_name, timeout=300):
    """A generic TensorFlow Serving client that predicts using the given data.

    Arguments:
        data (np.ndarray/bytes): A numpy array of data or bytes. No default
        ip (str): IP address of tensorflow serving. No default
        port (str/int): Port of tensorflow serving. No default
        model_name (str): Model name. No default
        signature_name (str): Signature name. No default
        input_name (str): Input tensor name. No default
        timeout (int): timeout for the API call. Default 300 secs

    Returns:
        Prediction protobuf
    """
    start_time = time()
    assert isinstance(data, (np.ndarray, bytes)), \
        "data must be a numpy array or bytes but got {}".format(type(data))
    channel = grpc.insecure_channel('{}:{}'.format(ip, port))
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    request = predict_pb2.PredictRequest()
    request.model_spec.name = model_name
    request.model_spec.signature_name = signature_name
    request.inputs[input_name].CopyFrom(
        tf.contrib.util.make_tensor_proto(data))
    result = stub.Predict(request, timeout)
    log(("time taken for prediction using model {} "
         "version {} is: {} secs").format(
             str(result.model_spec.name),
             result.model_spec.version.value,
             time() - start_time))
    return result

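# A minimal usage sketch for the client above, assuming a TF Serving instance
# is reachable at a hypothetical host/port and the model was exported with a
# "serving_default" signature and an input tensor called "inputs"; adjust the
# names to match your deployment.
def _example_generic_tf_serving_call():
    import numpy as np

    batch = np.random.rand(1, 224, 224, 3).astype(np.float32)
    result = generic_tf_serving_client(
        data=batch,
        ip="localhost",            # hypothetical serving host
        port=8500,                 # default TF Serving gRPC port
        model_name="my_detector",  # hypothetical model name
        signature_name="serving_default",
        input_name="inputs",
        timeout=300,
    )
    # The returned PredictResponse exposes outputs by tensor name, e.g.
    # result.outputs["detection_scores"].float_val
    print(result.model_spec.name, result.model_spec.version.value)
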
def fill_memory(ctrl):
    if memorize:
        examples = ctrl.db.get_dataset(dataset)
        log("RECIPE: Add {} examples from dataset '{}' to memory".format(
            len(examples), dataset))
        for eg in examples:
            memory[eg[TASK_HASH_ATTR]] = eg['answer']

def connect(db_id=None, db_settings=None):
    """Connect to the database.

    db_id (unicode): 'sqlite' (default), 'postgresql' or 'mysql'.
    db_settings (dict): Optional database connection parameters.
    RETURNS (prodigy.components.db.Database): The initialized database.
    """
    global _DB
    if _DB is not None:
        return _DB
    connectors = {
        'sqlite': connect_sqlite,
        'postgresql': connect_postgresql,
        'mysql': connect_mysql
    }
    user_dbs = get_entry_points('prodigy_db')
    if user_dbs:
        log("DB: Added {} connector(s) via entry points".format(len(user_dbs)))
    if db_id in user_dbs:
        _DB = user_dbs[db_id]
        return _DB
    config = get_config()
    if db_id in (True, False, None):
        db_id = config.get('db', 'sqlite')
    if db_settings in (True, False, None):
        config_db_settings = config.setdefault('db_settings', {})
        db_settings = config_db_settings.get(db_id, {})
    if db_id not in connectors:
        raise ValueError("Invalid database id: {}".format(db_id))
    db_name, db = connectors[db_id](**db_settings)
    _DB = Database(db, db_id, db_name)
    log("DB: Connecting to database {}".format(db_name), db_settings)
    return _DB

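# A short usage sketch for connect(), assuming the default SQLite backend;
# the settings dict mirrors what would normally live under "db_settings" in
# prodigy.json, and the dataset name is made up for illustration.
def _example_connect_usage():
    db = connect("sqlite", {"name": "prodigy.db", "path": "/tmp"})
    print(db.datasets)                       # all dataset names in the database
    examples = db.get_dataset("my_dataset")  # returns None if the set is missing
    print(len(examples or []))
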
def drop_dataset(self, name):
    """
    name (unicode): The name of the dataset to drop.
    RETURNS (bool): True if dataset was dropped.
    """
    dataset = Dataset.get(Dataset.name == name)
    query = Link.delete().where(Link.dataset == dataset.id)
    query.execute()
    query = Dataset.delete().where(Dataset.id == dataset.id)
    query.execute()
    self.db.commit()
    log("DB: Removed dataset '{}'".format(name))
    return True

def get_dataset(self, name, default=None):
    """
    name (unicode): The dataset name.
    default: Return value if dataset not in database.
    RETURNS (list): The examples in the dataset or default value.
    """
    if name not in self:
        return default
    dataset = Dataset.get(Dataset.name == name)
    examples = (Example.select().join(Link).join(Dataset)
                .where(Dataset.id == dataset.id)).execute()
    log("DB: Loading dataset '{}' ({} examples)".format(name, len(examples)))
    return [eg.load() for eg in examples]

def _create_dir(path):
    """A private function which creates a directory if it does not exist.

    Arguments:
        path (str): Directory path

    Returns:
        None
    """
    if not os.path.isdir(path):
        log("Creating a directory {}".format(path))
        os.mkdir(path)
    else:
        log("Directory {} already exists".format(path))

def get_predictions(single_stream, class_mapping_dict, ip, port, model_name):
    """Gets predictions for a single image using TensorFlow Serving.

    Arguments:
        single_stream (dict): A single prodigy task
        class_mapping_dict (dict): with key as int and value as class name
        ip (str): tensorflow serving IP
        port (str): tensorflow serving port
        model_name (str): model name in tensorflow serving

    Returns:
        A tuple containing numpy arrays: (class_ids, class_names, scores, boxes)
    """
    image_byte_stream = b64_uri_to_bytes(single_stream["image"])
    encoded_image_io = io.BytesIO(image_byte_stream)
    image = Image.open(encoded_image_io)
    width, height = image.size
    filename = str(single_stream["meta"]["file"])
    file_extension = filename.split(".")[-1].lower()
    if file_extension == "png":
        image_format = b'png'
    elif file_extension in ("jpg", "jpeg"):
        image_format = b'jpg'
    else:
        log(("Only 'png', 'jpeg' or 'jpg' files are supported by ODAPI. "
             "Got {}. Treating it as a 'jpg' file, which might cause "
             "errors.").format(file_extension))
        image_format = b'jpg'
    filename = filename.encode("utf-8")
    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(filename),
        'image/encoded': dataset_util.bytes_feature(image_byte_stream),
        'image/format': dataset_util.bytes_feature(image_format),
    }))
    boxes, class_ids, scores = tf_odapi_client(
        tf_example.SerializeToString(), ip, port, model_name,
        "serving_default", input_name="serialized_example", timeout=300)
    class_names = np.array(
        [class_mapping_dict[class_id] for class_id in class_ids])
    return (class_ids, class_names, scores, boxes)

def __init__(self, db, display_id='custom', display_name=None):
    """Initialize a database.

    db: A database object that can be initialized by peewee.
    display_id (unicode): Database ID used for logging, e.g. 'sqlite'.
    display_name (unicode): Database name used for logging, e.g. 'SQLite'.
    RETURNS (Database): The initialized database.
    """
    DB_PROXY.initialize(db)
    self.db_id = display_id
    self.db_name = display_name or get_display_name(db)
    log("DB: Initialising database {}".format(self.db_name))
    try:
        DB_PROXY.create_tables([User, Dataset, Example, Link], safe=True)
    except orm.OperationalError:
        pass
    self.db = DB_PROXY

def add_dataset(self, name, meta={}, session=False):
    """
    name (unicode): The name of the dataset to add.
    meta (dict): Optional dataset meta.
    session (bool): Whether the dataset is a session dataset.
    RETURNS (Dataset): The created dataset.
    """
    if any([char in name for char in (',', ' ')]):
        raise ValueError("Dataset name can't include commas or whitespace")
    try:
        dataset = Dataset.get(Dataset.name == name)
        log("DB: Getting dataset '{}'".format(name))
    except Dataset.DoesNotExist:
        log("DB: Creating dataset '{}'".format(name), meta)
        meta = ujson.dumps(meta, escape_forward_slashes=False)
        dataset = Dataset.create(name=name, meta=meta, session=session)
    return dataset

def get_stream():
    """Continue querying sense2vec whenever we get a new phrase, presenting
    examples to the user with a similarity above the threshold parameter."""
    nonlocal threshold
    while True:
        log(f"RECIPE: Looking for {n_similar} phrases most similar to "
            f"{len(accept_keys)} accepted keys")
        most_similar = s2v.most_similar(accept_keys, n=n_similar)
        log(f"RECIPE: Found {len(most_similar)} most similar phrases")
        n_skipped = 0
        n_duplicate = 0
        for key, score in most_similar:
            if score > threshold:
                word, sense = s2v.split_key(key)
                if (case_sensitive and word in seen) or (
                    not case_sensitive and word.lower() in seen
                ):
                    n_duplicate += 1
                    continue
                seen.add(word if case_sensitive else word.lower())
                # Make sure the score is a regular float, otherwise server
                # may fail when trying to serialize it to/from JSON
                meta = {"score": float(score), "sense": sense}
                yield {"text": key, "word": word, "sense": sense, "meta": meta}
            else:
                n_skipped += 1
        if n_skipped:
            log(f"RECIPE: Skipped {n_skipped} phrases below threshold "
                f"{threshold}")
        if n_skipped == len(most_similar) - n_duplicate:
            # No most similar phrases were found that are above the
            # threshold, so lower the threshold if it's not already 0 or
            # return an empty list so Prodigy shows "no tasks available"
            new_threshold = threshold - 0.1
            if new_threshold <= 0.0:
                log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                return []
            log(f"RECIPE: Lowering threshold from {threshold:.2} "
                f"to {new_threshold:.2}")
            threshold = new_threshold

def _export_saved_model(export_dir, estimator, odapi_configs):
    """Private function which exports a SavedModel from an estimator.

    Arguments:
        export_dir (str): directory to export temp SavedModels for TF serving
        estimator (tf.estimator.Estimator): detection model as a tf estimator
        odapi_configs (dict): Object Detection API pipeline.config object

    Returns:
        None
    """
    log("Exporting the model as SavedModel in {}".format(export_dir))
    # Just a placeholder
    pred_input_config = odapi_configs["eval_input_config"]
    predict_input_fn = create_predict_input_fn(odapi_configs["model"],
                                               pred_input_config)
    estimator.export_saved_model(export_dir_base=export_dir,
                                 serving_input_receiver_fn=predict_input_fn)
    log("Exported SavedModel!")

def evaluate(dataset, spacy_model, source, label='', api=None, loader=None,
             exclude=None):
    """
    Evaluate a text classification model and build an evaluation set from a
    stream.
    """
    log("RECIPE: Starting recipe attncat.eval", locals())
    nlp = spacy.load(spacy_model, disable=['tagger', 'parser', 'ner'])
    # Get attention layer weights from textcat
    textcat = nlp.get_pipe('textcat')
    assert textcat is not None
    with get_attention_weights(textcat) as attn_weights:
        stream = get_stream(source, api, loader)
        # Decorate items with attention data
        stream = attach_attention_data(stream, nlp, attn_weights)
        model = TextClassifier(nlp, label)
        log('RECIPE: Initialised TextClassifier with model {}'.format(
            spacy_model), model.nlp.meta)

        def on_exit(ctrl):
            examples = ctrl.db.get_dataset(dataset)
            data = dict(model.evaluate(examples))
            print(printers.tc_result(data))

        return {
            'view_id': 'html',
            'dataset': dataset,
            'stream': stream,
            'exclude': exclude,
            'on_exit': on_exit,
            'config': {
                'lang': nlp.lang,
                'labels': model.labels,
                'html_template': template_text
            }
        }

def add_examples(self, examples, datasets=tuple()):
    """
    examples (list): The examples to add.
    datasets (list): The names of the dataset(s) to add the examples to.
    """
    with self.db.atomic():
        ids = []
        for eg in examples:
            content = ujson.dumps(eg, escape_forward_slashes=False)
            eg = Example.create(input_hash=eg[INPUT_HASH_ATTR],
                                task_hash=eg[TASK_HASH_ATTR],
                                content=content)
            ids.append(eg.id)
    if type(datasets) is not tuple and type(datasets) is not list:
        raise ValueError(
            'datasets must be a tuple or list type, not: {}'.format(
                type(datasets)))
    for dataset in datasets:
        self.link(dataset, ids)
    log("DB: Added {} examples to {} datasets".format(
        len(examples), len(datasets)))

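# A small usage sketch for add_examples(), assuming the Prodigy helpers
# set_hashes and connect are importable as usual; the dataset name and task
# content are made up for illustration.
def _example_add_examples_usage():
    from prodigy import set_hashes
    from prodigy.components.db import connect

    db = connect()
    db.add_dataset("demo_dataset")  # hypothetical dataset name
    # Tasks need input/task hashes before they are stored.
    examples = [set_hashes({"text": "New Balance makes running shoes."})]
    db.add_examples(examples, datasets=["demo_dataset"])
    print(len(db.get_dataset("demo_dataset")))
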
def to_patterns(dataset, spacy_model, label, output_file="-",
                case_sensitive=False, dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to
    token-based match patterns that can be used with spaCy's EntityRuler or
    recipes like ner.match. If no output file is specified, the patterns are
    written to stdout. The examples are tokenized so that multi-token terms
    are represented correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        raise ValueError(f"Can't find dataset '{dataset}'")
    examples = DB.get_dataset(dataset)
    terms = [eg["text"] for eg in examples if eg["answer"] == "accept"]
    if case_sensitive:
        patterns = [[{"text": t.text} for t in nlp.make_doc(term)]
                    for term in terms]
    else:
        patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)]
                    for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns

def get_stream():
    """Continue querying sense2vec whenever we get a new phrase, presenting
    examples to the user with a similarity above the threshold parameter."""
    while True:
        log(f"RECIPE: Looking for {n_similar} phrases most similar to "
            f"{len(accept_keys)} accepted keys")
        most_similar = s2v.most_similar(accept_keys, n=n_similar)
        log(f"RECIPE: Found {len(most_similar)} most similar phrases")
        for key, score in most_similar:
            if key not in seen and score > threshold:
                seen.add(key)
                word, sense = s2v.split_key(key)
                # Make sure the score is a regular float, otherwise server
                # may fail when trying to serialize it to/from JSON
                meta = {"score": float(score)}
                yield {"text": key, "word": word, "sense": sense, "meta": meta}

def image_servingmodel(dataset, ip, port, model_name, label_map_path,
                       source=None, threshold=0.5, api=None, exclude=None,
                       use_display_name=False, label=None):
    log("RECIPE: Starting recipe image.servingmodel", locals())
    # keyed by class name
    reverse_class_mapping_dict = label_map_util.get_label_map_dict(
        label_map_path=label_map_path, use_display_name=use_display_name)
    if label is None:
        label = [k for k in reverse_class_mapping_dict.keys()]
    # keyed by int class id
    class_mapping_dict = {v: k for k, v in reverse_class_mapping_dict.items()}
    stream = get_stream(source, api=api, loader="images", input_key="image")
    stream = fetch_images(stream)
    return {
        "view_id": "image_manual",
        "dataset": dataset,
        "stream": get_image_stream(stream, class_mapping_dict, ip, port,
                                   model_name, float(threshold)),
        "exclude": exclude,
        'config': {
            'label': ', '.join(label) if label is not None else 'all',
            'labels': label,  # Selectable label options
        }
    }

def _write_tf_record(tasks, output_file, reverse_class_mapping_dict):
    """Private function which writes a training TF-Record file.

    Arguments:
        tasks (iterable): prodigy tasks
        output_file (str): output TF-Record filename
        reverse_class_mapping_dict (dict): key as class name and value as int

    Returns:
        the number of examples written
    """
    writer = tf.python_io.TFRecordWriter(output_file)
    counter = 0
    for task in tasks:
        if task['answer'] == 'accept':
            tf_example = create_a_tf_example(task, reverse_class_mapping_dict)
            writer.write(tf_example.SerializeToString())
            counter += 1
        else:
            continue
    writer.close()
    log("Successfully written {} annotations as TFRecords".format(counter))
    return counter

def pipe(source=None, api=None, loader=None, from_dataset=False, exclude=None):
    """
    Load examples from an input source, and print them as newline-delimited
    JSON. This makes it easy to filter the stream with command-line utilities
    such as `grep`. It's also often useful to inspect the stream, by piping
    to `less`.
    """
    DB = connect()
    if from_dataset:
        stream = DB.get_dataset(source)
    else:
        stream = get_stream(source, api, loader)
        stream = (set_hashes(eg) for eg in stream)
    if exclude:
        log("RECIPE: Excluding tasks from datasets: {}".format(
            ', '.join(exclude)))
        exclude_hashes = DB.get_input_hashes(*exclude)
        stream = filter_inputs(stream, exclude_hashes)
    try:
        for eg in stream:
            print(ujson.dumps(eg, escape_forward_slashes=False))
    except KeyboardInterrupt:
        pass

def evaluate(dataset, spacy_model, source, label='', api=None, loader=None,
             exclude=None):
    """
    Evaluate a text classification model and build an evaluation set from a
    stream.
    """
    log("RECIPE: Starting recipe attncat.eval", locals())
    nlp = spacy.load(spacy_model, disable=['tagger', 'parser', 'ner'])
    stream = get_stream(source, api, loader)
    stream = attach_structural_sensitivity_data(stream, nlp,
                                                label.split(',')[0])
    model = TextClassifier(nlp, label)
    log('RECIPE: Initialised TextClassifier with model {}'.format(spacy_model),
        model.nlp.meta)

    def on_exit(ctrl):
        examples = ctrl.db.get_dataset(dataset)
        data = dict(model.evaluate(examples))
        print(printers.tc_result(data))

    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'on_exit': on_exit,
        'config': {
            'lang': nlp.lang,
            'labels': model.labels,
            'html_template': template_text
        }
    }

def _tf_odapi_client(image, ip, port, model_name,
                     signature_name="detection_signature",
                     input_name="inputs", timeout=300):
    """Client for using TensorFlow Serving with the TensorFlow Object
    Detection API.

    Arguments:
        image (np.ndarray/bytes): A numpy array of data or bytes. No default
        ip (str): IP address of tensorflow serving. No default
        port (str/int): Port of tensorflow serving. No default
        model_name (str): Model name. No default
        signature_name (str): Signature name. Default "detection_signature"
        input_name (str): Input tensor name. Default "inputs"
        timeout (int): timeout for the API call. Default 300 secs

    Returns:
        a tuple containing numpy arrays of (boxes, classes, scores)
    """
    start_time = time()
    result = _generic_tf_serving_client(image, ip, port, model_name,
                                        signature_name, input_name, timeout)
    log("time taken for image shape {} is {} secs".format(
        image.shape, time() - start_time))
    # boxes are (ymin, xmin, ymax, xmax)
    boxes = np.array(result.outputs['detection_boxes'].float_val)
    classes = np.array(result.outputs['detection_classes'].float_val)
    scores = np.array(result.outputs['detection_scores'].float_val)
    boxes = boxes.reshape((len(scores), 4))
    classes = np.squeeze(classes.astype(np.int32))
    scores = np.squeeze(scores)
    return (boxes, classes, scores)

def to_patterns(dataset, spacy_model, label, output_file="-",
                case_sensitive=False, dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to
    token-based match patterns that can be used with spaCy's EntityRuler or
    recipes like ner.match. If no output file is specified, the patterns are
    written to stdout. The examples are tokenized so that multi-token terms
    are represented correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    For tokenization, you can either pass in the name of a spaCy model (e.g.
    if you're using a model with custom tokenization), or "blank:" plus the
    language code you want to use, e.g. blank:en or blank:de. Make sure to
    use the same language / tokenizer you're planning to use at runtime –
    otherwise your patterns may not match.
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    if spacy_model.startswith("blank:"):
        nlp = spacy.blank(spacy_model.replace("blank:", ""))
    else:
        nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)
    examples = DB.get_dataset(dataset)
    terms = set([eg["word"] for eg in examples if eg["answer"] == "accept"])
    if case_sensitive:
        patterns = [[{"text": t.text} for t in nlp.make_doc(term)]
                    for term in terms]
    else:
        terms = set([word.lower() for word in terms])
        patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)]
                    for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns

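# The patterns written by the recipe above can be loaded straight into spaCy's
# EntityRuler. A minimal sketch, assuming spaCy v2.x (matching the API style
# used in these recipes) and a hypothetical patterns.jsonl produced by the
# recipe with a SHOE_BRAND label.
def _example_entity_ruler_usage():
    import spacy
    import srsly
    from spacy.pipeline import EntityRuler

    nlp = spacy.blank("en")
    ruler = EntityRuler(nlp)
    ruler.add_patterns(list(srsly.read_jsonl("patterns.jsonl")))
    nlp.add_pipe(ruler)
    doc = nlp("I bought some New Balance sneakers.")
    print([(ent.text, ent.label_) for ent in doc.ents])
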
def image_tfodapimodel(dataset, frozen_model_path, label_map_path,
                       source=None, threshold=0.5, api=None, exclude=None,
                       use_display_name=False, label=None):
    log("RECIPE: Starting recipe image.tfodapimodel", locals())
    log("RECIPE: Loading frozen model")
    global detection_graph
    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(frozen_model_path, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')
    global sess
    sess = tf.Session(graph=detection_graph)
    log("RECIPE: Loaded frozen model")
    # keyed by class name
    reverse_class_mapping_dict = label_map_util.get_label_map_dict(
        label_map_path=label_map_path, use_display_name=use_display_name)
    if label is None:
        label = [k for k in reverse_class_mapping_dict.keys()]
    # keyed by int class id
    class_mapping_dict = {v: k for k, v in reverse_class_mapping_dict.items()}
    stream = get_stream(source, api=api, loader="images", input_key="image")
    stream = fetch_images(stream)
    return {
        "view_id": "image_manual",
        "dataset": dataset,
        "stream": get_image_stream(stream, class_mapping_dict,
                                   float(threshold)),
        "exclude": exclude,
        "on_exit": free_graph,
        'config': {
            'label': ', '.join(label) if label is not None else 'all',
            'labels': label,  # Selectable label options
        }
    }

def mark_custom(dataset, source=None, view_id=None, label='', api=None,
                loader=None, memorize=False, exclude=None):
    """
    Click through pre-prepared examples, with no model in the loop.
    """
    log('RECIPE: Starting recipe mark', locals())
    stream = list(get_stream(source, api, loader))
    counts = Counter()
    memory = {}

    def fill_memory(ctrl):
        if memorize:
            examples = ctrl.db.get_dataset(dataset)
            log("RECIPE: Add {} examples from dataset '{}' to memory".format(
                len(examples), dataset))
            for eg in examples:
                memory[eg[TASK_HASH_ATTR]] = eg['answer']

    def ask_questions(stream):
        for eg in stream:
            eg['time_loaded'] = datetime.now().isoformat()
            if TASK_HASH_ATTR in eg and eg[TASK_HASH_ATTR] in memory:
                answer = memory[eg[TASK_HASH_ATTR]]
                counts[answer] += 1
            else:
                if label:
                    eg['label'] = label
                yield eg

    def recv_answers(answers):
        for eg in answers:
            counts[eg['answer']] += 1
            memory[eg[TASK_HASH_ATTR]] = eg['answer']
            eg['time_returned'] = datetime.now().isoformat()

    def print_results(ctrl):
        print(printers.answers(counts))

    def get_progress(session=0, total=0, loss=0):
        progress = len(counts) / len(stream)
        return progress

    return {
        'view_id': view_id,
        'dataset': dataset,
        'stream': ask_questions(stream),
        'exclude': exclude,
        'update': recv_answers,
        'on_load': fill_memory,
        'on_exit': print_results,
        'config': {'label': label}
    }

def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=100,
    batch_size=5,
    case_sensitive=False,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases.
    For each seed term, the best matching sense according to the sense2vec
    vectors will be used. If no similar terms are found above the given
    threshold, the threshold is lowered by 0.1 and similar terms are requested
    again.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span>"
    accept_keys = []
    seen = set()
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1)
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        seen.add(best_word if case_sensitive else best_word.lower())
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {"score": 1.0, "sense": best_sense},
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept_keys = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        prev_words = [
            eg["word"] if case_sensitive else eg["word"].lower() for eg in prev
        ]
        accept_keys += prev_accept_keys
        seen.update(set(prev_words))
        log(f"RECIPE: Resuming from {len(prev)} previous examples "
            f"in dataset {dataset}")

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase, presenting
        examples to the user with a similarity above the threshold parameter."""
        nonlocal threshold
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            n_skipped = 0
            n_duplicate = 0
            for key, score in most_similar:
                if score > threshold:
                    word, sense = s2v.split_key(key)
                    if (case_sensitive and word in seen) or (
                        not case_sensitive and word.lower() in seen
                    ):
                        n_duplicate += 1
                        continue
                    seen.add(word if case_sensitive else word.lower())
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score), "sense": sense}
                    yield {"text": key, "word": word, "sense": sense,
                           "meta": meta}
                else:
                    n_skipped += 1
            if n_skipped:
                log(f"RECIPE: Skipped {n_skipped} phrases below threshold "
                    f"{threshold}")
            if n_skipped == len(most_similar) - n_duplicate:
                # No most similar phrases were found that are above the
                # threshold, so lower the threshold if it's not already 0 or
                # return an empty list so Prodigy shows "no tasks available"
                new_threshold = threshold - 0.1
                if new_threshold <= 0.0:
                    log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                    return []
                log(f"RECIPE: Lowering threshold from {threshold:.2} "
                    f"to {new_threshold:.2}")
                threshold = new_threshold

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {"batch_size": batch_size, "html_template": html_template},
    }