Example #1
 def update(answers):
     """Updates accept_keys so that the stream can find new phrases."""
     log(f"RECIPE: Updating with {len(answers)} answers")
     for answer in answers:
         phrase = answer["text"]
         if answer["answer"] == "accept":
             accept_keys.append(phrase)
Example #2
def ner_merge(
    dataset: str,
    recon_dataset: str,
    source: Union[str, Dataset],
    output_dir: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Stream a list of `recon.types.HardestExample` instances to Prodigy
    for review/correction. Uses the Prodigy blocks interface to display
    prediction-error information alongside the NER view.
    """
    log("RECIPE: Starting recipe recon.ner_merge", locals())
    # Keep the recon dataset in its own variable so it doesn't shadow the
    # Prodigy dataset name passed in as `dataset`.
    if isinstance(source, str):
        recon_ds = Dataset(recon_dataset).from_disk(source)
    else:
        recon_ds = source

    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)

    prodigy_raw_examples = DB.get_dataset(dataset)
    prodigy_examples = [Example(**eg) for eg in prodigy_raw_examples if eg["answer"] == "accept"]
    prodigy_texts_to_examples = {e.text: e for e in prodigy_examples}

    prev_len = len(recon_ds)
    recon_ds.apply_("recon.v1.prodigy.merge_examples", prodigy_texts_to_examples)
    assert len(recon_ds) == prev_len

    if output_dir:
        log(f"RECIPE: Merging {len(prodigy_examples)} corrected examples into the data")
        recon_ds.to_disk(output_dir)
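
# Hypothetical command-line usage, assuming this recipe is registered as
# "recon.ner_merge" and saved in recipe.py; the dataset names and paths are
# examples only:
#
#   prodigy recon.ner_merge my_prodigy_dataset my_recon_dataset ./corpus.jsonl \
#       --output-dir ./corpus_merged -F recipe.py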
Example #3
def to_patterns(dataset=None, label=None, output_file=None):
    """
    Convert a list of seed phrases to a list of match patterns that can be used
    with ner.match. If no output file is specified, each pattern is printed
    so the recipe's output can be piped forward to ner.match.

    This is pretty much an exact copy of terms.to-patterns. The pattern for
    each example is split on whitespace, so instead of:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new balance"}]}

    which won't match anything, you'll get:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    if label is None:
        prints(
            "--label is a required argument",
            "This is the label that will be assigned to all patterns "
            "created from terms collected in this dataset. ",
            exits=1,
            error=True,
        )

    DB = connect()

    def get_pattern(term, label):
        return {
            "label": label,
            "pattern": [{
                "lower": t.lower()
            } for t in term["text"].split()]
        }

    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    if dataset is None:
        log("RECIPE: Reading input terms from sys.stdin")
        terms = (srsly.json_loads(line) for line in sys.stdin)
    else:
        if dataset not in DB:
            prints("Can't find dataset '{}'".format(dataset),
                   exits=1,
                   error=True)
        terms = DB.get_dataset(dataset)
        log("RECIPE: Reading {} input phrases from dataset {}".format(
            len(terms), dataset))
    if output_file:
        patterns = [
            get_pattern(term, label) for term in terms
            if term["answer"] == "accept"
        ]
        log("RECIPE: Generated {} patterns".format(len(patterns)))
        srsly.write_jsonl(output_file, patterns)
        prints("Exported {} patterns".format(len(patterns)), output_file)
    else:
        log("RECIPE: Outputting patterns")
        for term in terms:
            if term["answer"] == "accept":
                print(srsly.json_dumps(get_pattern(term, label)))
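
# Worked example of the pattern construction used by get_pattern above. Since
# get_pattern is nested inside to_patterns, this re-creates the same logic as a
# standalone helper; the term text and label are hypothetical.
def make_pattern(term_text, label):
    return {"label": label,
            "pattern": [{"lower": t.lower()} for t in term_text.split()]}

print(make_pattern("new balance", "SHOE_BRAND"))
# -> {'label': 'SHOE_BRAND', 'pattern': [{'lower': 'new'}, {'lower': 'balance'}]}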
Example #4
def model_stats(dataset, spacy_model, label=None, isPrf=False):
    """
    Evaluate model accuracy of model based on dataset with no training
    inspired from https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193/2
    found on https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193
    got basic model evaluation by looking at the batch-train recipe
    """

    log("RECIPE: Starting recipe ner.stats", locals())
    DB = connect()
    nlp = spacy.load(spacy_model)

    if isPrf:
        examples = gold_to_spacy(dataset, spacy_model)
        score = evaluate_prf(nlp, examples)
        print("Precision {:0.4f}\tRecall {:0.4f}\tF-score {:0.4f}".format(
            score['ents_p'], score['ents_r'], score['ents_f']))

    else:
        # ripped this from ner.batch-train recipe
        model = EntityRecognizer(nlp, label=label)
        evaldoc = merge_spans(DB.get_dataset(dataset))
        evals = list(split_sentences(model.orig_nlp, evaldoc))

        scores = model.evaluate(evals)

        print(
            "Accuracy {:0.4f}\tRight {:0.0f}\tWrong {:0.0f}\tUnknown {:0.0f}\tEntities {:0.0f}"
            .format(scores['acc'], scores['right'], scores['wrong'],
                    scores['unk'], scores['ents']))
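
# gold_to_spacy and evaluate_prf are not shown in this snippet. Below is a
# minimal sketch of what evaluate_prf might look like using spaCy v2's
# GoldParse/Scorer API; the (text, annotations) tuple format of `examples`
# is an assumption.
from spacy.gold import GoldParse
from spacy.scorer import Scorer


def evaluate_prf(nlp, examples):
    """examples: list of (text, {"entities": [(start, end, label), ...]})."""
    scorer = Scorer()
    for text, annotations in examples:
        gold = GoldParse(nlp.make_doc(text), entities=annotations["entities"])
        scorer.score(nlp(text), gold)
    return scorer.scores  # includes 'ents_p', 'ents_r' and 'ents_f'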
Example #5
def get_image_stream(stream, class_mapping_dict, ip, port, model_name, thresh):
    """Function that gets the image stream with bounding box information

    Arguments:
        stream (iterable): input image stream
        class_mapping_dict (dict): with key as int and value as class name
        ip (str): tensorflow serving IP
        port (str): tensorflow serving port
        model_name (str): model name in tensorflow serving
        thresh (float): score threshold for predictions

    Returns:
        A generator that constantly yields a prodigy task
    """
    for eg in stream:
        if not eg["image"].startswith("data"):
            msg = "Expected base64-encoded data URI, but got: '{}'."
            raise ValueError(msg.format(eg["image"][:100]))

        pil_image = Image.open(io.BytesIO(b64_uri_to_bytes(eg["image"])))
        predictions = get_predictions(eg, class_mapping_dict,
                                      ip, port, model_name)
        eg["width"] = pil_image.width
        eg["height"] = pil_image.height
        eg["spans"] = [get_span(pred, pil_image)
                       for pred in zip(*predictions) if pred[2] >= thresh]
        log("Using threshold {}, got {} predictions for file {}".format(
            thresh, len(eg["spans"]), eg["meta"]["file"]))
        task = copy.deepcopy(eg)
        yield task
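
# get_span is not shown in this snippet. A minimal sketch of what it might look
# like, assuming Object Detection API-style normalized [ymin, xmin, ymax, xmax]
# boxes and Prodigy's image_manual span format with pixel-coordinate "points":
def get_span(prediction, pil_image):
    class_id, class_name, score, box = prediction
    ymin, xmin, ymax, xmax = box
    x1, y1 = xmin * pil_image.width, ymin * pil_image.height
    x2, y2 = xmax * pil_image.width, ymax * pil_image.height
    return {
        "label": str(class_name),
        "score": float(score),
        # Rectangle corners in pixel coordinates, clockwise from the top-left
        "points": [[x1, y1], [x2, y1], [x2, y2], [x1, y2]],
    }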
Example #6
def get_predictions(numpy_image, class_mapping_dict):
    """Gets predictions for a single image using Frozen Model

    Arguments:
        numpy_image (np.ndarray): A single numpy image
        class_mapping_dict (dict): with key as int and value as class name

    Returns:
        A tuple containing numpy arrays:
        (class_ids, class_names, scores, boxes)
    """
    global detection_graph
    global sess
    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
    detection_boxes = detection_graph.get_tensor_by_name('detection_boxes:0')
    detection_scores = detection_graph.get_tensor_by_name('detection_scores:0')
    detection_classes = detection_graph.get_tensor_by_name(
        'detection_classes:0')
    num_detections = detection_graph.get_tensor_by_name('num_detections:0')

    image_np_expanded = np.expand_dims(numpy_image, axis=0)
    start_time = time()
    (boxes, scores, class_ids, num) = sess.run(
        [detection_boxes, detection_scores, detection_classes, num_detections],
        feed_dict={image_tensor: image_np_expanded})
    log("time taken for image shape {} is {} secs".format(
        numpy_image.shape,
        time() - start_time))
    boxes = np.squeeze(boxes)
    class_ids = np.squeeze(class_ids).astype(np.int32)
    class_names = np.array(
        [class_mapping_dict[class_id] for class_id in class_ids])
    scores = np.squeeze(scores)
    return (class_ids, class_names, scores, boxes)
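
# Hypothetical usage of get_predictions, assuming the detection_graph / sess
# globals have already been initialised from a frozen model (as in the
# image.tfodapimodel recipe further below) and a class mapping like {1: "person"}.
# The image path is an example only.
import numpy as np
from PIL import Image

numpy_image = np.array(Image.open("example.jpg"))
class_ids, class_names, scores, boxes = get_predictions(numpy_image, {1: "person"})
keep = scores >= 0.5  # drop low-confidence detections
print(class_names[keep], boxes[keep])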
Example #7
def generic_tf_serving_client(data, ip, port, model_name,
                              signature_name, input_name, timeout=300):
    """A generic tensorflow serving client that predicts using given data

    Arguments:
        data (np.ndarray/bytes): A numpy array of data or bytes. No Default
        ip (str): IP address of tensorflow serving. No Default
        port (str/int): Port of tensorflow serving. No Default
        model_name (str): Model name. No Default
        signature_name (str): Signature name. No Default
        input_name (str): Input tensor name. No Default
        timeout (int): timeout for API call in seconds. Default 300

    returns:
        Prediction protobuf
    """
    start_time = time()
    assert isinstance(data, (np.ndarray, bytes)), \
        "data must be a numpy array or bytes but got {}".format(type(data))
    channel = grpc.insecure_channel('{}:{}'.format(ip, port))
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    request = predict_pb2.PredictRequest()
    request.model_spec.name = model_name
    request.model_spec.signature_name = signature_name
    request.inputs[input_name].CopyFrom(
        tf.contrib.util.make_tensor_proto(data))
    result = stub.Predict(request, timeout)
    log(("time taken for prediction using model {} "
         "version {} is: {} secs").format(
        str(result.model_spec.name), result.model_spec.version.value,
        time()-start_time))
    return result
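
# Hypothetical usage: send a batch of float32 data to a locally served model and
# read raw values back out of the response protobuf. The host, port, model name,
# signature, input name and output key ("scores") are all assumptions.
import numpy as np

dummy_batch = np.zeros((1, 224, 224, 3), dtype=np.float32)
response = generic_tf_serving_client(dummy_batch, "127.0.0.1", 8500, "my_model",
                                     "serving_default", "inputs")
scores = np.array(response.outputs["scores"].float_val)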
Example #8
 def fill_memory(ctrl):
     if memorize:
         examples = ctrl.db.get_dataset(dataset)
         log("RECIPE: Add {} examples from dataset '{}' to memory".format(
             len(examples), dataset))
         for eg in examples:
             memory[eg[TASK_HASH_ATTR]] = eg['answer']
Example #9
def connect(db_id=None, db_settings=None):
    """Connect to the database.

    db_id (unicode): 'sqlite' (default), 'postgresql' or 'mysql'.
    db_settings (dict): Optional database connection parameters.
    RETURNS (prodigy.components.db.Database): The initialized database.
    """
    global _DB
    if _DB is not None:
        return _DB
    connectors = {
        'sqlite': connect_sqlite,
        'postgresql': connect_postgresql,
        'mysql': connect_mysql
    }
    user_dbs = get_entry_points('prodigy_db')
    if user_dbs:
        log("DB: Added {} connector(s) via entry points".format(len(user_dbs)))
    if db_id in user_dbs:
        _DB = user_dbs[db_id]
        return _DB
    config = get_config()
    if db_id in (True, False, None):
        db_id = config.get('db', 'sqlite')
    if db_settings in (True, False, None):
        config_db_settings = config.setdefault('db_settings', {})
        db_settings = config_db_settings.get(db_id, {})
    if db_id not in connectors:
        raise ValueError("Invalid database id: {}".format(db_id))
    db_name, db = connectors[db_id](**db_settings)
    _DB = Database(db, db_id, db_name)
    log("DB: Connecting to database {}".format(db_name), db_settings)
    return _DB
Example #10
 def drop_dataset(self, name):
     """
     name (unicode): The name of the dataset to drop.
     RETURNS (bool): True if dataset was dropped.
     """
     dataset = Dataset.get(Dataset.name == name)
     query = Link.delete().where(Link.dataset == dataset.id)
     query.execute()
     query = Dataset.delete().where(Dataset.id == dataset.id)
     query.execute()
     self.db.commit()
     log("DB: Removed dataset '{}'".format(name))
     return True
Example #11
 def get_dataset(self, name, default=None):
     """
     name (unicode): The dataset name.
     default: Return value if dataset not in database.
     RETURNS (list): The examples in the dataset or default value.
     """
     if name not in self:
         return default
     dataset = Dataset.get(Dataset.name == name)
     examples = (Example.select().join(Link).join(Dataset).where(
         Dataset.id == dataset.id)).execute()
     log("DB: Loading dataset '{}' ({} examples)".format(
         name, len(examples)))
     return [eg.load() for eg in examples]
Example #12
def _create_dir(path):
    """A private function which creates a directory if it does not exists

    Arguments:
        path (str): Directory path

    Returns:
        None
    """
    if not os.path.isdir(path):
        log("Creating a directory {}".format(path))
        os.mkdir(path)
    else:
        log("Directory {} already  exists".format(path))
Example #13
def get_predictions(single_stream, class_mapping_dict, ip, port, model_name):
    """Gets predictions for a single image using Tensorflow serving

    Arguments:
        single_stream (dict): A single prodigy stream
        class_mapping_dict (dict): with key as int and value as class name
        ip (str): tensorflow serving IP
        port (str): tensorflow serving port
        model_name (str): model name in tensorflow serving

    Returns:
        A tuple containing numpy arrays:
        (class_ids, class_names, scores, boxes)
    """
    image_byte_stream = b64_uri_to_bytes(single_stream["image"])
    encoded_image_io = io.BytesIO(image_byte_stream)
    image = Image.open(encoded_image_io)
    width, height = image.size
    filename = str(single_stream["meta"]["file"])
    file_extension = filename.split(".")[-1].lower()
    if file_extension == "png":
        image_format = b'png'
    elif file_extension in ("jpg", "jpeg"):
        image_format = b'jpg'
    else:
        log(("Only 'png', 'jpeg' or 'jpg' files are supported by ODAPI. "
             "Got {}. Thus treating it as `jpg` file. "
             "Might cause errors".format(file_extension)
             ))
        image_format = b'jpg'

    filename = filename.encode("utf-8")
    tf_example = tf.train.Example(features=tf.train.Features(feature={
        'image/height': dataset_util.int64_feature(height),
        'image/width': dataset_util.int64_feature(width),
        'image/filename': dataset_util.bytes_feature(filename),
        'image/source_id': dataset_util.bytes_feature(filename),
        'image/encoded': dataset_util.bytes_feature(image_byte_stream),
        'image/format': dataset_util.bytes_feature(image_format),
    }))

    boxes, class_ids, scores = tf_odapi_client(tf_example.SerializeToString(),
                                               ip, port, model_name,
                                               "serving_default",
                                               input_name="serialized_example",
                                               timeout=300
                                               )
    class_names = np.array([class_mapping_dict[class_id]
                            for class_id in class_ids])
    return (class_ids, class_names, scores, boxes)
Example #14
    def __init__(self, db, display_id='custom', display_name=None):
        """Initialize a database.

        db: A database object that can be initialized by peewee.
        display_id (unicode): Database ID used for logging, e.g. 'sqlite'.
        display_name (unicode): Database name used for logging, e.g. 'SQLite'.
        RETURNS (Database): The initialized database.
        """
        DB_PROXY.initialize(db)
        self.db_id = display_id
        self.db_name = display_name or get_display_name(db)
        log("DB: Initialising database {}".format(self.db_name))
        try:
            DB_PROXY.create_tables([User, Dataset, Example, Link], safe=True)
        except orm.OperationalError:
            pass
        self.db = DB_PROXY
Example #15
 def add_dataset(self, name, meta={}, session=False):
     """
     name (unicode): The name of the dataset to add.
     meta (dict): Optional dataset meta.
     session (bool): Whether the dataset is a session dataset.
     RETURNS (list): The created dataset.
     """
     if any([char in name for char in (',', ' ')]):
         raise ValueError("Dataset name can't include commas or whitespace")
     try:
         dataset = Dataset.get(Dataset.name == name)
         log("DB: Getting dataset '{}'".format(name))
     except Dataset.DoesNotExist:
         log("DB: Creating dataset '{}'".format(name), meta)
         meta = ujson.dumps(meta, escape_forward_slashes=False)
         dataset = Dataset.create(name=name, meta=meta, session=session)
     return dataset
Example #16
 def get_stream():
     """Continue querying sense2vec whenever we get a new phrase and
     presenting examples to the user with a similarity above the threshold
     parameter."""
     nonlocal threshold
     while True:
         log(f"RECIPE: Looking for {n_similar} phrases most similar to "
             f"{len(accept_keys)} accepted keys")
         most_similar = s2v.most_similar(accept_keys, n=n_similar)
         log(f"RECIPE: Found {len(most_similar)} most similar phrases")
         n_skipped = 0
         n_duplicate = 0
         for key, score in most_similar:
             if score > threshold:
                 word, sense = s2v.split_key(key)
                 if (case_sensitive
                         and word in seen) or (not case_sensitive
                                               and word.lower() in seen):
                     n_duplicate += 1
                     continue
                 seen.add(word if case_sensitive else word.lower())
                 # Make sure the score is a regular float, otherwise server
                 # may fail when trying to serialize it to/from JSON
                 meta = {"score": float(score), "sense": sense}
                 yield {
                     "text": key,
                     "word": word,
                     "sense": sense,
                     "meta": meta
                 }
             else:
                 n_skipped += 1
         if n_skipped:
             log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}"
                 )
         if n_skipped == len(most_similar) - n_duplicate:
             # No most similar phrases were found that are above the
             # threshold, so lower the threshold if it's not already 0 or
             # return empty list so Prodigy shows "no tasks available"
             new_threshold = threshold - 0.1
             if new_threshold <= 0.0:
                 log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                 return []
             log(f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}"
                 )
             threshold = new_threshold
Example #17
def _export_saved_model(export_dir, estimator, odapi_configs):
    """Private function which exports a SavedModel from estimator
    Arguments:
        export_dir (str): directory to export temp SavedModels for TF serving
        estimator (tf.estimator.Estimator): detection model as tf estimator
        odapi_configs (dict): Object detection api pipeline.config object

    Returns:
        None
    """
    log("Exporting the model as SavedModel in {}".format(export_dir))
    # Just a placeholder
    pred_input_config = odapi_configs["eval_input_config"]
    predict_input_fn = create_predict_input_fn(odapi_configs["model"],
                                               pred_input_config)
    estimator.export_saved_model(export_dir_base=export_dir,
                                 serving_input_receiver_fn=predict_input_fn)
    log("Exported SavedModel!")
Example #18
def evaluate(dataset,
             spacy_model,
             source,
             label='',
             api=None,
             loader=None,
             exclude=None):
    """
    Evaluate a text classification model and build an evaluation set from a
    stream.
    """
    log("RECIPE: Starting recipe attncat.eval", locals())
    nlp = spacy.load(spacy_model, disable=['tagger', 'parser', 'ner'])
    # Get attention layer weights from textcat
    textcat = nlp.get_pipe('textcat')
    assert textcat is not None
    with get_attention_weights(textcat) as attn_weights:
        stream = get_stream(source, api, loader)
        # Decorate items with attention data
        stream = attach_attention_data(stream, nlp, attn_weights)
        model = TextClassifier(nlp, label)
        log(
            'RECIPE: Initialised TextClassifier with model {}'.format(
                spacy_model), model.nlp.meta)

    def on_exit(ctrl):
        examples = ctrl.db.get_dataset(dataset)
        data = dict(model.evaluate(examples))
        print(printers.tc_result(data))

    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'on_exit': on_exit,
        'config': {
            'lang': nlp.lang,
            'labels': model.labels,
            'html_template': template_text
        }
    }
Example #19
 def add_examples(self, examples, datasets=tuple()):
     """
     examples (list): The examples to add.
     datasets (list): The names of the dataset(s) to add the examples to.
     """
     with self.db.atomic():
         ids = []
         for eg in examples:
             content = ujson.dumps(eg, escape_forward_slashes=False)
             eg = Example.create(input_hash=eg[INPUT_HASH_ATTR],
                                 task_hash=eg[TASK_HASH_ATTR],
                                 content=content)
             ids.append(eg.id)
     if type(datasets) is not tuple and type(datasets) is not list:
         raise ValueError(
             'datasets must be a tuple or list type, not: {}'.format(
                 type(datasets)))
     for dataset in datasets:
         self.link(dataset, ids)
     log("DB: Added {} examples to {} datasets".format(
         len(examples), len(datasets)))
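
# Minimal usage sketch tying the Database helpers above together. The dataset
# name and example content are hypothetical; set_hashes adds the input/task
# hashes that add_examples expects.
from prodigy import set_hashes
from prodigy.components.db import connect

DB = connect()
DB.add_dataset("demo_dataset")
eg = set_hashes({"text": "New Balance makes running shoes", "answer": "accept"})
DB.add_examples([eg], datasets=["demo_dataset"])
print(len(DB.get_dataset("demo_dataset", default=[])))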
Example #20
def to_patterns(dataset,
                spacy_model,
                label,
                output_file="-",
                case_sensitive=False,
                dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to token-based
    match patterns that can be used with spaCy's EntityRuler or recipes like
    ner.match. If no output file is specified, the patterns are written to
    stdout. The examples are tokenized so that multi-token terms are represented
    correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        raise ValueError(f"Can't find dataset '{dataset}'")
    examples = DB.get_dataset(dataset)
    terms = [eg["text"] for eg in examples if eg["answer"] == "accept"]
    # Each pattern must be a list with one token dict per token
    if case_sensitive:
        patterns = [[{"text": t.text} for t in nlp.make_doc(term)]
                    for term in terms]
    else:
        patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)]
                    for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns
Example #21
 def get_stream():
     """Continue querying sense2vec whenever we get a new phrase and
     presenting examples to the user with a similarity above the threshold
     parameter."""
     while True:
         log(f"RECIPE: Looking for {n_similar} phrases most similar to "
             f"{len(accept_keys)} accepted keys")
         most_similar = s2v.most_similar(accept_keys, n=n_similar)
         log(f"RECIPE: Found {len(most_similar)} most similar phrases")
         for key, score in most_similar:
             if key not in seen and score > threshold:
                 seen.add(key)
                 word, sense = s2v.split_key(key)
                 # Make sure the score is a regular float, otherwise server
                 # may fail when trying to serialize it to/from JSON
                 meta = {"score": float(score)}
                 yield {
                     "text": key,
                     "word": word,
                     "sense": sense,
                     "meta": meta
                 }
Example #22
def image_servingmodel(dataset,
                       ip,
                       port,
                       model_name,
                       label_map_path,
                       source=None,
                       threshold=0.5,
                       api=None,
                       exclude=None,
                       use_display_name=False,
                       label=None):
    log("RECIPE: Starting recipe image.servingmodel", locals())

    # key class names
    reverse_class_mapping_dict = label_map_util.get_label_map_dict(
        label_map_path=label_map_path, use_display_name=use_display_name)
    if label is None:
        label = [k for k in reverse_class_mapping_dict.keys()]
    # key int
    class_mapping_dict = {v: k for k, v in reverse_class_mapping_dict.items()}
    stream = get_stream(source, api=api, loader="images", input_key="image")
    stream = fetch_images(stream)

    return {
        "view_id": "image_manual",
        "dataset": dataset,
        "stream": get_image_stream(stream, class_mapping_dict, ip, port,
                                   model_name, float(threshold)),
        "exclude": exclude,
        "config": {
            "label": ", ".join(label) if label is not None else "all",
            "labels": label,  # Selectable label options
        },
    }
Example #23
def _write_tf_record(tasks, output_file, reverse_class_mapping_dict):
    """Private function which writes training TF-Record file

    Arguments:
        tasks (iterable): prodigy's tasks
        output_file (str): output TF-Record filename
        reverse_class_mapping_dict (dict): key as class name and value as int

    Returns:
        the number of accepted examples written
    """
    writer = tf.python_io.TFRecordWriter(output_file)
    counter = 0
    for task in tasks:
        if task['answer'] == 'accept':
            tf_example = create_a_tf_example(task, reverse_class_mapping_dict)
            writer.write(tf_example.SerializeToString())
            counter += 1
        else:
            continue
    writer.close()
    log("Successfully written {} annotations as TFRecords".format(counter))
    return counter
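
# Hypothetical usage: export all accepted annotations from a Prodigy dataset to
# a TFRecord file. The dataset name and label map are assumptions; connect() is
# the database helper shown earlier, and create_a_tf_example is expected to
# exist alongside this function.
DB = connect()
tasks = DB.get_dataset("image_annotations")
n_written = _write_tf_record(tasks, "train.record", {"person": 1, "car": 2})
print("Wrote {} examples".format(n_written))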
Example #24
def pipe(source=None, api=None, loader=None, from_dataset=False, exclude=None):
    """
    Load examples from an input source, and print them as newline-delimited
    JSON. This makes it easy to filter the stream with command-line utilities
    such as `grep`. It's also often useful to inspect the stream, by piping to
    `less`.
    """
    DB = connect()
    if from_dataset:
        stream = DB.get_dataset(source)
    else:
        stream = get_stream(source, api, loader)
        stream = (set_hashes(eg) for eg in stream)
    if exclude:
        log("RECIPE: Excluding tasks from datasets: {}".format(
            ', '.join(exclude)))
        exclude_hashes = DB.get_input_hashes(*exclude)
        stream = filter_inputs(stream, exclude_hashes)
    try:
        for eg in stream:
            print(ujson.dumps(eg, escape_forward_slashes=False))
    except KeyboardInterrupt:
        pass
Example #25
def evaluate(dataset,
             spacy_model,
             source,
             label='',
             api=None,
             loader=None,
             exclude=None):
    """
    Evaluate a text classification model and build an evaluation set from a
    stream.
    """
    log("RECIPE: Starting recipe attncat.eval", locals())
    nlp = spacy.load(spacy_model, disable=['tagger', 'parser', 'ner'])
    stream = get_stream(source, api, loader)
    stream = attach_structural_sensitivity_data(stream, nlp,
                                                label.split(',')[0])
    model = TextClassifier(nlp, label)
    log('RECIPE: Initialised TextClassifier with model {}'.format(spacy_model),
        model.nlp.meta)

    def on_exit(ctrl):
        examples = ctrl.db.get_dataset(dataset)
        data = dict(model.evaluate(examples))
        print(printers.tc_result(data))

    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'on_exit': on_exit,
        'config': {
            'lang': nlp.lang,
            'labels': model.labels,
            'html_template': template_text
        }
    }
Example #26
def _tf_odapi_client(image,
                     ip,
                     port,
                     model_name,
                     signature_name="detection_signature",
                     input_name="inputs",
                     timeout=300):
    """Client for using Tensorflow Serving with Tensorflow Object Detection API

    Arguments:
        image (np.ndarray/bytes): A numpy array of image data or bytes. No Default
        ip (str): IP address of tensorflow serving. No Default
        port (str/int): Port of tensorflow serving. No Default
        model_name (str): Model name. No Default
        signature_name (str): Signature name. Default "detection_signature".
        input_name (str): Input tensor name. Default "inputs".
        timeout (str): timeout for API call. Default 300 secs

    returns:
        a tuple containing numpy arrays of (boxes, classes, scores)
    """
    start_time = time()
    result = _generic_tf_serving_client(image, ip, port, model_name,
                                        signature_name, input_name, timeout)
    log("time taken for image shape {} is {} secs".format(
        image.shape,
        time() - start_time))
    # boxes are [ymin, xmin, ymax, xmax]
    boxes = np.array(result.outputs['detection_boxes'].float_val)
    classes = np.array(result.outputs['detection_classes'].float_val)
    scores = np.array(result.outputs['detection_scores'].float_val)
    boxes = boxes.reshape((len(scores), 4))
    classes = np.squeeze(classes.astype(np.int32))
    scores = np.squeeze(scores)

    return (boxes, classes, scores)
Example #27
def to_patterns(dataset,
                spacy_model,
                label,
                output_file="-",
                case_sensitive=False,
                dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to token-based
    match patterns that can be used with spaCy's EntityRuler or recipes like
    ner.match. If no output file is specified, the patterns are written to
    stdout. The examples are tokenized so that multi-token terms are represented
    correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}

    For tokenization, you can either pass in the name of a spaCy model (e.g. if
    you're using a model with custom tokenization), or "blank:" plus the
    language code you want to use, e.g. blank:en or blank:de. Make sure to use
    the same language / tokenizer you're planning to use at runtime – otherwise
    your patterns may not match.
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    if spacy_model.startswith("blank:"):
        nlp = spacy.blank(spacy_model.replace("blank:", ""))
    else:
        nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)
    examples = DB.get_dataset(dataset)
    terms = set([eg["word"] for eg in examples if eg["answer"] == "accept"])
    if case_sensitive:
        patterns = [[{
            "text": t.text
        } for t in nlp.make_doc(term)] for term in terms]
    else:
        terms = set([word.lower() for word in terms])
        patterns = [[{
            "lower": t.lower_
        } for t in nlp.make_doc(term)] for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns
Example #28
def image_tfodapimodel(dataset,
                       frozen_model_path,
                       label_map_path,
                       source=None,
                       threshold=0.5,
                       api=None,
                       exclude=None,
                       use_display_name=False,
                       label=None):
    log("RECIPE: Starting recipe image.tfodapimodel", locals())
    log("RECIPE: Loading frozen model")
    global detection_graph
    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(frozen_model_path, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')
    global sess
    sess = tf.Session(graph=detection_graph)
    log("RECIPE: Loaded frozen model")
    # key class names
    reverse_class_mapping_dict = label_map_util.get_label_map_dict(
        label_map_path=label_map_path, use_display_name=use_display_name)
    if label is None:
        label = [k for k in reverse_class_mapping_dict.keys()]
    # key int
    class_mapping_dict = {v: k for k, v in reverse_class_mapping_dict.items()}
    stream = get_stream(source, api=api, loader="images", input_key="image")
    stream = fetch_images(stream)

    return {
        "view_id": "image_manual",
        "dataset": dataset,
        "stream": get_image_stream(stream, class_mapping_dict,
                                   float(threshold)),
        "exclude": exclude,
        "on_exit": free_graph,
        'config': {
            'label': ', '.join(label) if label is not None else 'all',
            'labels': label,  # Selectable label options,
        }
    }
Example #29
def mark_custom(dataset,
                source=None,
                view_id=None,
                label='',
                api=None,
                loader=None,
                memorize=False,
                exclude=None):
    """
    Click through pre-prepared examples, with no model in the loop.
    """
    log('RECIPE: Starting recipe mark', locals())
    stream = list(get_stream(source, api, loader))

    counts = Counter()
    memory = {}

    def fill_memory(ctrl):
        if memorize:
            examples = ctrl.db.get_dataset(dataset)
            log("RECIPE: Add {} examples from dataset '{}' to memory".format(
                len(examples), dataset))
            for eg in examples:
                memory[eg[TASK_HASH_ATTR]] = eg['answer']

    def ask_questions(stream):
        for eg in stream:
            eg['time_loaded'] = datetime.now().isoformat()
            if TASK_HASH_ATTR in eg and eg[TASK_HASH_ATTR] in memory:
                answer = memory[eg[TASK_HASH_ATTR]]
                counts[answer] += 1
            else:
                if label:
                    eg['label'] = label
                yield eg

    def recv_answers(answers):
        for eg in answers:
            counts[eg['answer']] += 1
            memory[eg[TASK_HASH_ATTR]] = eg['answer']
            eg['time_returned'] = datetime.now().isoformat()

    def print_results(ctrl):
        print(printers.answers(counts))

    def get_progress(session=0, total=0, loss=0):
        # Progress = fraction of the stream that has been answered
        progress = sum(counts.values()) / len(stream)
        return progress

    return {
        'view_id': view_id,
        'dataset': dataset,
        'stream': ask_questions(stream),
        'exclude': exclude,
        'update': recv_answers,
        'on_load': fill_memory,
        'on_exit': print_results,
        'config': {
            'label': label
        }
    }
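
# Hypothetical command-line usage, assuming this recipe is registered as
# "mark_custom" and saved in recipe.py; the dataset name, source file, view_id
# and label are examples only:
#
#   prodigy mark_custom my_dataset ./news_headlines.jsonl --view-id classification \
#       --label COMPANY --memorize -F recipe.py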
Example #30
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=100,
    batch_size=5,
    case_sensitive=False,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases. For
    each seed term, the best matching sense according to the sense2vec vectors
    will be used.

    If no similar terms are found above the given threshold, the threshold is
    lowered by 0.1 and similar terms are requested again.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span>"
    accept_keys = []
    seen = set()
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1)
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        seen.add(best_word if case_sensitive else best_word.lower())
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {
                "score": 1.0,
                "sense": best_sense
            },
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept_keys = [
            eg["text"] for eg in prev if eg["answer"] == "accept"
        ]
        prev_words = [
            eg["word"] if case_sensitive else eg["word"].lower() for eg in prev
        ]
        accept_keys += prev_accept_keys
        seen.update(set(prev_words))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}"
            )

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        nonlocal threshold
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            n_skipped = 0
            n_duplicate = 0
            for key, score in most_similar:
                if score > threshold:
                    word, sense = s2v.split_key(key)
                    if (case_sensitive
                            and word in seen) or (not case_sensitive
                                                  and word.lower() in seen):
                        n_duplicate += 1
                        continue
                    seen.add(word if case_sensitive else word.lower())
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score), "sense": sense}
                    yield {
                        "text": key,
                        "word": word,
                        "sense": sense,
                        "meta": meta
                    }
                else:
                    n_skipped += 1
            if n_skipped:
                log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}"
                    )
            if n_skipped == len(most_similar) - n_duplicate:
                # No most similar phrases were found that are above the
                # threshold, so lower the threshold if it's not already 0 or
                # return empty list so Prodigy shows "no tasks available"
                new_threshold = threshold - 0.1
                if new_threshold <= 0.0:
                    log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                    return []
                log(f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}"
                    )
                threshold = new_threshold

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {
            "batch_size": batch_size,
            "html_template": html_template
        },
    }
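
# Hypothetical command-line usage, assuming this recipe is registered as
# "sense2vec.teach" and saved in recipe.py; the dataset name, vectors path and
# seed terms are examples only:
#
#   prodigy sense2vec.teach shoe_brands ./s2v_reddit_2015_md \
#       --seeds "new balance, nike, adidas" -F recipe.py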