Example No. 1
def ner_manual(dataset, spacy_model, source, label=None, exclude=None):
    """
    Mark spans manually by token. Requires only a tokenizer and no entity
    recognizer, and doesn't do any active learning.
    """
    # Load the spaCy model for tokenization
    nlp = spacy.load(spacy_model)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        'view_id': 'ner_manual',  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'exclude': exclude,  # List of dataset names to exclude
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'label': ', '.join(label) if label is not None else 'all',
            'labels': label  # Selectable label options
        }
    }
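
The listings on this page omit the standard Prodigy recipe scaffolding (imports, the @prodigy.recipe decorator and the CLI call). A minimal sketch for Example No. 1, assuming Prodigy 1.x; the recipe name and argument annotations are illustrative, not taken from the original source:

import spacy
import prodigy
from prodigy.components.loaders import JSONL
from prodigy.components.preprocess import add_tokens
from prodigy.util import split_string

@prodigy.recipe(
    "ner.manual.custom",
    dataset=("Dataset to save annotations to", "positional", None, str),
    spacy_model=("Loadable spaCy pipeline for tokenization", "positional", None, str),
    source=("Path to the JSONL source file", "positional", None, str),
    label=("Comma-separated labels", "option", "l", split_string),
    exclude=("Comma-separated datasets to exclude", "option", "e", split_string),
)
def ner_manual(dataset, spacy_model, source, label=None, exclude=None):
    ...  # body as in Example No. 1 above

# Run with something like:
#   prodigy ner.manual.custom my_dataset en_core_web_sm ./data.jsonl --label PERSON,ORG -F recipe.py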
Example No. 2
def textcat_custom_model(dataset, source, model, label_binarizer, label, goal):
    """
    Use active learning-powered text classification with a scikit-learn model.
    """

    stream = JSONL(source)

    model = CustomModel(
        label=label, model_path=model, label_binarizer_path=label_binarizer
    )

    stream = prefer_high_scores(model(stream))

    total_accepted = 0

    def update(answers):
        # Called with each batch of answers; count how many were accepted
        nonlocal total_accepted
        accepted = [a for a in answers if a["answer"] == "accept"]
        total_accepted += len(accepted)

    def progress(*args, **kwargs):
        # Progress is the fraction of the acceptance goal reached; the small
        # epsilon keeps the progress bar from reporting exactly zero
        return total_accepted / goal + 0.00001

    return {
        "view_id": "classification",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "progress": progress,
        "config": {"label": label},
    }
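
CustomModel wraps a scorer around the stream: Prodigy's sorters such as prefer_high_scores only require a callable that yields (score, example) tuples. A minimal sketch of such a wrapper around a pickled scikit-learn pipeline; the class and its constructor arguments are assumptions inferred from the call above, not a Prodigy API:

import joblib

class CustomModel:
    def __init__(self, label, model_path, label_binarizer_path):
        self.label = label
        self.model = joblib.load(model_path)                       # e.g. a fitted sklearn Pipeline
        self.label_binarizer = joblib.load(label_binarizer_path)   # maps class names to columns

    def __call__(self, stream):
        for eg in stream:
            # Probability that the example belongs to the target label
            probs = self.model.predict_proba([eg["text"]])[0]
            idx = list(self.label_binarizer.classes_).index(self.label)
            score = float(probs[idx])
            eg["label"] = self.label
            eg["meta"] = dict(eg.get("meta", {}), score=score)
            yield score, eg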
Example No. 3
def image_caption_text_align(dataset: str, sourcefile: str):
    """Stream in images and corresponding text.
    """
    nlp = spacy.load("de_core_news_sm")

    stream = JSONL(sourcefile)
    stream = fetch_images(stream)
    stream = add_tokens(nlp, stream)

    blocks = [
        {"view_id": "image", "spans": []},
        {"view_id": "text_input",
         "field_id": "caption",
         "field_rows": 4,
         "field_autofocus": True},
        {"view_id": "ner_manual"}
    ]
    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "blocks",
        "config": {"blocks": blocks,
                   "lang": nlp.lang,
                   "labels": ["current image"]
                   }
    }
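
For this recipe, each input record presumably carries an "image" (path or URL, converted by fetch_images for the browser) and a "text" field for the ner_manual block. The shape below is an assumption based on the blocks used, with placeholder values:

# Assumed shape of one line of the JSONL source for the image + caption recipe
example_task = {
    "image": "https://example.com/photos/0001.jpg",  # fetched/encoded by fetch_images
    "text": "Ein Hund spielt im Park.",              # tokenized for the ner_manual block
}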
Example No. 4
def question_answering(dataset: str, source: str):
    """
    Annotate question/answer pairs with a custom HTML interface. Expects an
    input file with records that look like this:
        {"question": "What color is the sky?", "question_answer": "blue"}
    Important note: The "answer" field is reserved by Prodigy and will be set
    in the annotation UI ("accept", "reject" or "ignore"). That's why we're
    using "question_answer" here.
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # The HTML template to use. While we could also reformat the stream to
    # include a "html" field for each example, a template allows rendering
    # tasks without having to include the full HTML markup every time. All
    # task properties become available as Mustache-style variables.
    html_template = (
        "<div style='text-align: left; width: 100%'>"
        "<div style='padding: 20px; border-bottom: 1px solid #ccc'><strong>Question:</strong> {{question}}</div>"
        "<div style='padding: 20px'><strong>Answer:</strong> {{question_answer}}</div>"
        "</div>")

    return {
        "view_id": "html",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "config": {
            "html_template": html_template
        },  # Additional config
    }
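
The comments above mention the alternative of adding an "html" field per example instead of using a shared template; a sketch of that variant:

# Sketch of the alternative: pre-render the HTML for each task.
def add_html(stream):
    for eg in stream:
        eg["html"] = (
            "<div style='text-align: left'>"
            f"<p><strong>Question:</strong> {eg['question']}</p>"
            f"<p><strong>Answer:</strong> {eg['question_answer']}</p>"
            "</div>"
        )
        yield eg

# stream = add_html(JSONL(source)), then return {"view_id": "html", ...}
# without the "html_template" setting.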
Example No. 5
def ner_make_gold(dataset, spacy_model, source, label=None, exclude=None):
    """
    Create gold-standard data by correcting a model's predictions manually.
    """
    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    # Add the entities predicted by the model to the tasks in the stream
    stream = make_tasks(nlp, stream, label)

    return {
        'view_id': 'ner_manual',  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'exclude': exclude,  # List of dataset names to exclude
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'label': ', '.join(label) if label is not None else 'all',
            'labels': label  # Selectable label options
        }
    }
Example No. 6
def relations(dataset, file_path, annotator):
    """Annotate the sentiment of texts using different mood options."""
    stream = JSONL(file_path)  # load in the JSONL file

    # TODO need to remove previously annotated
    total_lines = count_lines(file_path)

    def progress(controller, update_return_value):
        return controller.total_annotated / total_lines

    def add_label(stream):
        for task in stream:
            task['label'] = f'({task["mention1"]}, {task["mention2"]})'
            yield task

    stream = add_label(stream)
    stream = add_options(stream)

    def before_db(examples):
        for e in examples:
            if 'created' not in e:
                e['created'] = iso8601_now()
            if 'annotator' not in e:
                e['annotator'] = annotator
        return examples

    return {
        'dataset': dataset,
        'stream': stream,
        'view_id': 'choice',
        'progress': progress,
        'before_db': before_db,
    }
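
This recipe relies on two small helpers that are not shown. Plausible minimal versions (the real project may define them differently):

from datetime import datetime, timezone

def count_lines(file_path):
    # Number of examples in the JSONL source, used as the progress denominator
    with open(file_path, encoding="utf8") as f:
        return sum(1 for _ in f)

def iso8601_now():
    # UTC timestamp attached to each example before it is written to the database
    return datetime.now(timezone.utc).isoformat()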
Example No. 7
def choice(dataset: str,
           source: str,
           options: List[str],
           multiple: bool = False):
    """
    Annotate data with multiple-choice options. The annotated examples will
    have an additional property `"accept": []` mapping to the ID(s) of the
    selected option(s).
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Add the options to all examples in the stream
    stream = add_options(stream, options)

    return {
        "view_id": "choice",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "config": {  # Additional config settings
            # Allow multiple choice if flag is set
            "choice_style": "multiple" if multiple else "single",
            # Automatically accept and "lock in" selected answers if only
            # single choice is allowed
            "choice_auto_accept": False if multiple else True,
        },
    }
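
add_options is not shown here. The choice interface expects each task to carry an "options" list of {"id", "text"} dicts, so a minimal version might look like this, assuming the options are passed in as plain strings:

# Possible implementation of the add_options helper used above.
def add_options(stream, options):
    option_dicts = [{"id": opt, "text": opt} for opt in options]
    for task in stream:
        task["options"] = option_dicts
        yield task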
Example No. 8
def ner_make_gold(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Create gold-standard data by correcting a model's predictions manually.
    """
    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    # Add the entities predicted by the model to the tasks in the stream
    stream = make_tasks(nlp, stream, label)

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
        },
    }
Example No. 9
def ner_manual(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Mark spans manually by token. Requires only a tokenizer and no entity
    recognizer, and doesn't do any active learning.
    """
    # Load the spaCy model for tokenization
    nlp = spacy.load(spacy_model)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
        },
    }
Example No. 10
def registers(dataset, file_path, annotator):
    """Annotate the sentiment of texts using different mood options."""
    stream = JSONL(file_path)     # load in the JSONL file
    # stream = add_options(stream)  # add options to each task

    # TODO need to remove previously annotated
    total_lines = count_lines(file_path)
    def progress(controller, update_return_value):
        return controller.total_annotated / total_lines

    def add_label(stream):
        for task in stream:
            task['label'] = task.get('doc_title')
            yield task
    stream = add_label(stream)
    stream = add_options(stream)

    def before_db(examples):
        for e in examples:
            if 'created' not in e:
                e['created'] = iso8601_now()
            if 'annotator' not in e:
                e['annotator'] = annotator
        return examples

    return {
        'dataset': dataset,
        'stream': stream,
        'view_id': 'choice',
        'progress': progress,
        'before_db': before_db,
        'config': {
            'javascript': JAVASCRIPT,
        },
    }
Example No. 11
def choice(dataset, source=None, options=None, multiple=False):
    """
    Annotate data with multiple-choice options. The annotated examples will
    have an additional property `"accept": []` mapping to the ID(s) of the
    selected option(s).
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Add the options to all examples in the stream
    stream = add_options(stream, options)

    return {
        'view_id': 'choice',  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'config': {  # Additional config settings
            # Allow multiple choice if flag is set
            'choice_style': 'multiple' if multiple else 'single',
            # Automatically accept and "lock in" selected answers if only
            # single choice is allowed
            'choice_auto_accept': False if multiple else True
        }
    }
Example No. 12
def textcat_custom_model(dataset, source, label=[]):
    """
    Use active learning-powered text classification with a custom model. To
    demonstrate how it works, this demo recipe uses a simple dummy model that
    "precits" random scores. But you can swap it out for any model of your
    choice, for example a text classification model implementation using
    PyTorch, TensorFlow or scikit-learn.
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Load the dummy model
    model = DummyModel(labels=label)

    # Use the prefer_uncertain sorter to focus on suggestions that the model
    # is most uncertain about (i.e. with a score closest to 0.5). The model
    # yields (score, example) tuples and the sorter yields just the example
    stream = prefer_uncertain(model(stream))

    # The update method is called every time Prodigy receives new answers from
    # the web app. It can be used to update the model in the loop.
    update = model.update

    return {
        'view_id': 'classification',  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'update': update,  # Update callback, called with batch of answers
        'config': {  # Additional config settings, mostly for app UI
            'label': ', '.join(label)
        }
    }
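
The docstring describes DummyModel as a stand-in that scores examples at random and exposes an update callback; a sketch consistent with that description:

import random

class DummyModel:
    def __init__(self, labels=None):
        self.labels = labels or []

    def __call__(self, stream):
        # Yield (score, example) tuples, as Prodigy's sorters expect
        for eg in stream:
            if self.labels:
                eg["label"] = random.choice(self.labels)
            yield random.random(), eg

    def update(self, answers):
        # A real model would update its weights from the answers here
        pass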
Example No. 13
def textcat_teach(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Collect the best possible training data for a text classification model
    with the model in the loop. Based on your annotations, Prodigy will decide
    which questions to ask next.
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Initialize Prodigy's text classifier model, which outputs
    # (score, example) tuples
    model = TextClassifier(nlp, label)

    if patterns is None:
        # No patterns are used, so just use the model to suggest examples
        # and only use the model's update method as the update callback
        predict = model
        update = model.update
    else:
        # Initialize the pattern matcher and load in the JSONL patterns.
        # Set the matcher to not label the highlighted spans, only the text.
        matcher = PatternMatcher(
            nlp,
            prior_correct=5.0,
            prior_incorrect=5.0,
            label_span=False,
            label_task=True,
        )
        matcher = matcher.from_disk(patterns)
        # Combine the text classification model and the matcher, interleave
        # their suggestions and update both at the same time
        predict, update = combine_models(model, matcher)

    # Use the prefer_uncertain sorter to focus on suggestions that the model
    # is most uncertain about (i.e. with a score closest to 0.5). The model
    # yields (score, example) tuples and the sorter yields just the example
    stream = prefer_uncertain(predict(stream))

    return {
        "view_id": "classification",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": update,  # Update callback, called with batch of answers
        "exclude": exclude,  # List of dataset names to exclude
        "config": {
            "lang": nlp.lang
        },  # Additional config settings, mostly for app UI
    }
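
The patterns file loaded by PatternMatcher.from_disk uses Prodigy's standard match-patterns format: one JSON object per line with a "label" and either an exact string or a spaCy token pattern. For instance (values are illustrative):

# Illustrative patterns in the format expected by PatternMatcher.from_disk()
example_patterns = [
    {"label": "POSITIVE", "pattern": [{"lower": "fantastic"}]},   # token pattern
    {"label": "POSITIVE", "pattern": "really enjoyed it"},        # exact string
]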
Example No. 14
def get_stream():
    stream = JSONL(file_path)
    tasks = []
    for eg in stream:
        # Copy the original text into a separate field for editing
        eg["edit_text"] = eg["text"]
        tasks.append(eg)
    return tasks
Example No. 15
def get_stream(html_path):
    # Load the directory of images and add options to each task

    stream = JSONL(html_path)

    for eg in stream:
        eg["options"] = OPTIONS
        yield eg
Example No. 16
def choice(dataset: str, source: str):
    """
    Rating pairwise model outputs with a preference slider
    """

    def add_options(stream, k=2):
        """Helper function to add options to every task in a stream."""
        for i, task in enumerate(stream):

            new_task = {
                'src_text': '',
                'id': '',
                'ref_text': '',
                'hyp_a_text': '',
                'hyp_a_id': '',
                'hyp_b_text': '',
                'hyp_b_id': '',
                'score': 0,
                'time_loaded': None,
                'time_updated': None,
                'winner': None
            }
            
            new_task['time_loaded'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            new_task['src_text'] = task.pop('src_texts')
            new_task['src_text_title'], new_task['src_text_body'] = clean_text_for_display(new_task['src_text'])
            new_task['id'] = task.pop('test_set_line_id')
            new_task['ref_text'] = task.pop('ref_texts') # remove from item to avoid considering for annotation
            random.seed(i) # set random seed as index of item in stream for reproducibility
            random_pair = random.sample(list(task.keys()), k=min(k, len(list(task.keys()))))
            new_task['hyp_a_id'], new_task['hyp_b_id'] = random_pair
            new_task['hyp_a_text'] = task[new_task['hyp_a_id']]
            new_task['hyp_b_text'] = task[new_task['hyp_b_id']]
            
            yield new_task

    stream = JSONL(source)
    stream = add_options(stream, 2)
    stream = (set_hashes(task, input_keys=("src_text",), task_keys=("hyp_a_text", "hyp_b_text")) for task in stream)
    
    question = "Which response is more specific to the review?"

    return {
        "view_id": "blocks",
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "config": {
            "blocks": [
                {"view_id": "html", "html_template": "<div><p><strong>Review:</strong></p><p class=src><strong>{{src_text_title}}</strong> {{src_text_body}}</p></div>"},
                {"view_id": "html", "html_template": "<h1 class=taskQuestion>{}</h1>".format(question)},
                {"view_id": "html", "html_template": "<div><p><strong>Response A:</strong></p><p class=hyp1>{{hyp_a_text}}</p></div><div><p><strong>Response B:</strong></p><p class=hyp2>{{hyp_b_text}}</p></div>"},
                {"view_id": "html", "html_template": pref_slider},
                ],
            "global_css": css,
            "javascript": javascript
            },
        }
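
clean_text_for_display is assumed to split a raw review into a title and a body for the HTML template; one plausible minimal version:

# Sketch of the clean_text_for_display helper assumed above (heuristic placeholder).
def clean_text_for_display(text):
    text = " ".join(text.split())            # collapse whitespace
    title, _, body = text.partition(". ")    # naive first-sentence title
    return title, body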
Example No. 17
def ner_teach(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    unsegmented: bool = False,
):
    """
    Collect the best possible training data for a named entity recognition
    model with the model in the loop. Based on your annotations, Prodigy will
    decide which questions to ask next.
    """
    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)

    if patterns is None:
        # No patterns are used, so just use the NER model to suggest examples
        # and only use the model's update method as the update callback
        predict = model
        update = model.update
    else:
        # Initialize the pattern matcher and load in the JSONL patterns
        matcher = PatternMatcher(nlp).from_disk(patterns)
        # Combine the NER model and the matcher and interleave their
        # suggestions and update both at the same time
        predict, update = combine_models(model, matcher)

    if not unsegmented:
        # Use spaCy to split text into sentences
        stream = split_sentences(nlp, stream)

    # Use the prefer_uncertain sorter to focus on suggestions that the model
    # is most uncertain about (i.e. with a score closest to 0.5). The model
    # yields (score, example) tuples and the sorter yields just the example
    stream = prefer_uncertain(predict(stream))

    return {
        "view_id": "ner",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": update,  # Update callback, called with batch of answers
        "exclude": exclude,  # List of dataset names to exclude
        "config": {
            "lang": nlp.lang
        },  # Additional config settings, mostly for app UI
    }
Example No. 18
def audio_annotation(dataset, source):
    stream = JSONL(source)
    stream = add_options(stream)
    stream = list(stream)
    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "progress": progress,
    }
Example No. 19
def facts_annotation(language: str):
  # define labels for annotation
  labels = ["Supports judgment", "Opposes judgment", "Lower court"]

  if language != "test":
    # Load the spaCy model for tokenization.
    nlp = spacy.load("{}_core_news_sm".format(language))
    stream = JSONL("./datasets/annotation_input_set_{}.jsonl".format(language))
  else:
    nlp = spacy.load("de_core_news_sm")
    stream = JSONL("./datasets/annotation_input_set_de_ex.jsonl")

  dataset = "annotations_{}".format(language)
  port = ports[language]


  # Tokenize the incoming examples and add a "tokens" property to each
  # example. Also handles pre-defined selected spans. Tokenization allows
  # faster highlighting, because the selection can "snap" to token boundaries.
  # If `use_chars` is True, tokens are split into individual characters, which enables
  # character based selection as opposed to default token based selection.
  stream = add_tokens(nlp, stream, use_chars=None)
  return {
    "dataset": dataset ,# Name of dataset_scrc to save annotations
    "view_id": "blocks",
    "stream": stream,
    "config": {
      "port": port,
      "blocks": [
        {"view_id": "html",
         "html_template": "<p style='float:left'>{{file_number}}</p>"},
        {"view_id": "html", "html_template": "<h1 style='float:left'>{{header}} – Judgment: {{judgment}}</h2>"},
        {"view_id": "html",
         "html_template": "<h2 style='float:left'>Facts</h2><a style='float:right' href='{{link}}' target='_blank'>Go to the court ruling</a>"},
        {"view_id": "spans_manual", "lang": nlp.lang, "labels": labels},
        {"view_id": "text_input","field_label":"Annotator comment on this ruling", "field_placeholder": "Type here...","field_rows": 5},
      ]
    },
  }
Example No. 20
def manual(dataset,
           label_type,
           labelid,
           label_type_cond,
           label_id_cond,
           exclude=None):
    """
    Mark spans manually by token. Requires only a tokenizer and no entity
    recognizer, and doesn't do any active learning.
    """

    source = "doc_input/examples_rdy_for_annotationprodfile.jsonl"
    # Load the spaCy model for tokenization
    # nlp = spacy.load(spacy_model)

    if label_type == "binary":
        label = [binary_labels[labelid]]
    elif label_type == "span":
        label = [span_labels[labelid]]
    elif label_type == "multi":
        label = multi_labels[labelid]

    else:
        raise Exception("need label type")

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    filtermodel = Model(label_type_cond=label_type_cond,
                        label_id_cond=label_id_cond)

    filteredstream = filtermodel(stream)
    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    # stream = add_tokens(nlp, stream)

    return {
        'view_id': 'ner_manual',  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': filteredstream,  # Incoming stream of examples
        'config': {  # Additional config settings, mostly for app UI
            'label': ', '.join(label),
            'labels': label  # Selectable label options
        }
    }
Example No. 21
def custom_recipe(dataset, jsonl_file):
    # Assumption: only the tokenizer is needed here, so a blank English
    # pipeline is used; swap in the model/language your project requires.
    nlp = spacy.blank("en")

    stream = JSONL(jsonl_file)
    stream = get_stream(stream)
    stream = add_tokens(nlp, stream)
    blocks = [{"view_id": "html"}, {"view_id": "ner_manual"}]
    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "blocks",
        "config": {
            "labels": ["LABEL1", "LABEL2", "LABEL3"],
            "blocks": blocks
        }
    }
Example No. 22
def compare(dataset, input_file, html_file):
    """
    Prodigy recipe for annotating pairs of iDISK concepts.
    """
    stream = JSONL(input_file)
    with open(html_file, 'r') as f:
        html_template = f.read()
    stream = add_to_stream(stream, html_template)

    return {
        "dataset": dataset,
        "stream": stream,
        "view_id": "choice",
        "exclude": [dataset],
        "config": {
            "html_template": html_template
        }
    }
Example No. 23
def custom_with_recipe_html_template():
    nlp = spacy.load(spacy_model)
    model = TextClassifier(nlp, labels, long_text=False)
    stream = JSONL(example_jsonl)
    stream = filter_duplicates(stream, by_input=True, by_task=False)

    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'exclude': [dataset],
        'update': model.update,
        'config': {
            'labels': labels,
            'html_template': template_text
        }
    }
Example No. 24
def qa(dataset, spacy_model, source, label="answer_span"):
    # load the source dataset, made of samples containing question and text pairs
    stream = JSONL(source)
    # load a spaCy model
    nlp = spacy.load(spacy_model)
    # and use it to tokenize the text
    stream = add_tokens(nlp, stream)

    return {
        "view_id": "ner_manual",
        "dataset": dataset,
        "stream": stream,
        "config": {
            "lang": nlp.lang,
            "label": label,
            "labels": label
        },
    }
Example No. 25
def parsing_check(dataset, source, attr):
    """
    The annotator gets a contextualized patent citation displayed
    - the patent citation is highlighted
    - the title (h3 + bold + purple) is the value of the parsed attribute (e.g. orgname)
    - the citation has an href linking to the patent webpage (on Google Patents), in case
      further inspection is needed. Note that there is no guarantee the link actually exists.
    The annotator faces a binary choice ACCEPT or REJECT.
    """
    def add_html(stream):
        for task in stream:
            span = task["spans"][0]
            root = "https://patents.google.com/patent/"
            suffix = span["orgname"] + span["original"]

            start, end = (span["start"], span["end"])
            text = task["text"]
            before = text[:start]
            span_ = text[start:end]
            after = text[end:]

            task["html"] = (
                f"<span style='background-color:#775ec2;color:white;font-size:130%;font-weight:bold;'>  "
                f"{str(span.get(attr))}  </span><br> \
                           {before} <span style='background-color: #fae284'><a \
                           href={root + suffix}>{span_}</a></span> \
                           {after}")
            yield task

    fmt = os.path.splitext(source)[-1]
    stream = JSONL(source) if fmt == ".jsonl" else JSON(source)
    stream = add_html(stream)

    return {
        "view_id": "blocks",
        "dataset": dataset,
        "stream": stream,
        "config": {
            "blocks": [{
                "view_id": "html"
            }]
        },  # add the blocks to the config
    }
Example No. 26
def ner_match(
    dataset: str,
    spacy_model: str,
    source: str,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    resume: bool = False,
):
    """
    Suggest phrases that match a given patterns file, and mark whether they
    are examples of the entity you're interested in. The patterns file can
    include exact strings or token patterns for use with spaCy's `Matcher`.
    """
    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Initialize the pattern matcher and load in the JSONL patterns
    matcher = PatternMatcher(nlp).from_disk(patterns)

    if resume:
        # Connect to the database using the settings from prodigy.json
        DB = connect()
        if dataset and dataset in DB:
            # Get the existing annotations and update the matcher
            existing = DB.get_dataset(dataset)
            matcher.update(existing)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Apply the matcher to the stream, which returns (score, example) tuples.
    # Filter out the scores to only yield the examples for annotations.
    stream = (eg for score, eg in matcher(stream))

    return {
        "view_id": "ner",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {
            "lang": nlp.lang
        },  # Additional config settings, mostly for app UI
    }
Example No. 27
def mark(dataset, source, view_id, exclude=None):
    """
    Click through pre-prepared examples, with no model in the loop.
    """
    counts = Counter()

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    def on_load(controller):
        # Check if current dataset is available in database. The on_load
        # callback receives the controller as an argument, which exposes the
        # database via controller.db
        if dataset in controller.db:
            examples = controller.db.get_dataset(dataset)
            for eg in examples:
                # Update counts with existing answers
                counts[eg['answer']] += 1

    def receive_answers(answers):
        for eg in answers:
            # Update counts with new answers
            counts[eg['answer']] += 1

    def on_exit(controller):
        # Output the total annotation counts
        print('Accept:', counts['accept'])
        print('Reject:', counts['reject'])
        print('Ignore:', counts['ignore'])
        print('Total: ', sum(counts.values()))

    return {
        'view_id': view_id,  # Annotation interface to use
        'dataset': dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'update': receive_answers,  # Update callback, called with answers
        'on_load': on_load,  # Called on first load
        'on_exit': on_exit  # Called when Prodigy server is stopped
    }
Example No. 28
def mark(dataset: str, source: str, view_id: str, exclude: Optional[List[str]] = None):
    """
    Click through pre-prepared examples, with no model in the loop.
    """
    counts = Counter()

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    def on_load(controller):
        # Check if current dataset is available in database. The on_load
        # callback receives the controller as an argument, which exposes the
        # database via controller.db
        if dataset in controller.db:
            examples = controller.db.get_dataset(dataset)
            for eg in examples:
                # Update counts with existing answers
                counts[eg["answer"]] += 1

    def receive_answers(answers):
        for eg in answers:
            # Update counts with new answers
            counts[eg["answer"]] += 1

    def on_exit(controller):
        # Output the total annotation counts
        print("Accept:", counts["accept"])
        print("Reject:", counts["reject"])
        print("Ignore:", counts["ignore"])
        print("Total: ", sum(counts.values()))

    return {
        "view_id": view_id,  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "update": receive_answers,  # Update callback, called with answers
        "on_load": on_load,  # Called on first load
        "on_exit": on_exit,  # Called when Prodigy server is stopped
    }

def image_recipe(dataset, source):
    # Count the examples in the source file to use as the progress denominator
    with open(source) as f:
        num_lines = sum(1 for _ in f)

    def progress(session, total):
        '''Show annotation progress as a fraction of the source examples.'''
        return total / num_lines

    stream = JSONL(source)
    stream = add_options(stream)

    return {
        'dataset': dataset,  # ID of dataset to store annotations
        'stream': stream,  #  stream of examples
        'progress': progress,  # annotation progress
        'view_id': 'choice',  # annotation interface
        'config': {
            "choice_style": "multiple",
            "instructions": "config/serp-help.html"
        }
    }

def COVIDKeywordsAnnotation(
    dataset_name: Optional[str] = 'entries',
    dataset_file: Optional[str] = None,
    spacy_model: Optional[str] = 'en_core_web_sm',
    dataset_exclude: Optional[List[str]] = None,
):
    """
    keywords annotation recipe

    :param dataset_name:
    :param dataset_file:
    :param spacy_model:
    :param dataset_exclude:
    :return:
    """

    # TEXT_STREAM_PIPELINE is the global variable that collects all text
    # processors, so that other functions outside this recipe can reuse the
    # same processing pipeline for the same task
    global TEXT_STREAM_PIPELINE
    # MONGO_COL_NAME is the global variable recording which MongoDB collection
    # to load data from when loading a paper by DOI
    global MONGO_COL_NAME

    # change the global variable MONGO_COL_NAME for later use when loading a paper by DOI
    MONGO_COL_NAME = dataset_name

    # Load the spaCy model for tokenization
    nlp = spacy.load(spacy_model)

    # get a text stream, which is a generator of [{'text': '', ...}]
    if dataset_file is None:
        if dataset_name in db.collection_names():
            stream = db_endless_sampling(dataset_name)
        else:
            raise ValueError(
                'Loading from database because dataset_file is not specified! '
                'However, collection {} does not exist!'.format(dataset_name))
    else:
        # Load the stream from a JSONL file and return a generator that yields a
        # dictionary for each example in the data.
        stream = JSONL(dataset_file)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    # stream = add_tokens(nlp, stream)
    TEXT_STREAM_PIPELINE.append(lambda x: add_tokens(nlp, x))

    # add keywords extraction to pipeline
    kw_extractor_1 = keywords_extraction.KeywordsExtractorRaKUn(
        name='RaKUn_0',
        distance_threshold=2,
        pair_diff_length=2,
        bigram_count_threhold=2,
        num_tokens=[1, 2, 3],
        max_similar=10,
        max_occurrence=3,
        score_threshold=None,
        use_longest_phrase=True,
        ignore_shorter_keywords=False,
    )
    TEXT_STREAM_PIPELINE.append(lambda x: stream_add_keywords_ML(
        x,
        kw_extractors=[
            kw_extractor_1,
        ],
        add_keywords_in_db=True,
    ))

    with open('keywords_annotation.html') as txt:
        template_text = txt.read()
    with open('keywords_annotation.js') as txt:
        script_text = txt.read()
    with open('custom_style.css') as txt:
        css_text = txt.read()

    # activate tasks
    TASK_DESCs = {
        'ner': 'highlight named entities',
        'textcat': 'select text categories',
        'summary': 'add text summary',
        'note': 'add text notes',
    }
    AVAILABLE_TASKS = set(TASK_DESCs.keys())
    all_task_blocks = []

    # add title blocks
    all_task_blocks.extend(get_paper_title_blocks())

    # add task desc blocks
    all_task_blocks.extend(
        get_task_desc_blocks([
            'mark whatever you think are keywords',
        ]))

    # add keywords ner blocks
    all_task_blocks.extend(get_ner_blocks(labels=['KEYWORD']))
    all_task_blocks.extend([
        {
            'view_id': 'html',
            'html_template': template_text,
        },
    ])

    # apply stream pipeline on text stream
    for stream_fun in TEXT_STREAM_PIPELINE:
        stream = stream_fun(stream)

    return {
        "view_id": "blocks",  # Annotation interface to use
        "dataset": dataset_name,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": dataset_exclude,  # List of dataset names to exclude
        "config": {  # Additional config settings, mostly for app UI
            'blocks': all_task_blocks,
            "lang": nlp.lang,
            'javascript': script_text,  # custom JS
            'global_css': css_text,  # custom CSS (loaded above from custom_style.css)
            'instant_submit': True,
        },
    }