Example No. 1
def gold_to_spacy(dataset, spacy_model, biluo=False):
    # Ripped from ner.gold_to_spacy. Only change is returning annotations instead of printing or saving
    DB = connect()
    examples = DB.get_dataset(dataset)
    examples = [eg for eg in examples if eg['answer'] == 'accept']
    if biluo:
        if not spacy_model:
            prints(
                "Exporting annotations in BILUO format requires a spaCy "
                "model for tokenization.",
                exits=1,
                error=True)
        nlp = spacy.load(spacy_model)
    annotations = []
    for eg in examples:
        entities = [(span['start'], span['end'], span['label'])
                    for span in eg.get('spans', [])]
        if biluo:
            doc = nlp(eg['text'])
            entities = spacy.gold.biluo_tags_from_offsets(doc, entities)
            annot_entry = [eg['text'], entities]
        else:
            annot_entry = [eg['text'], {'entities': entities}]
        annotations.append(annot_entry)

    return annotations
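
A minimal usage sketch for the helper above, assuming a hypothetical annotated dataset called "ner_sample" and the en_core_web_sm model; it converts the accepted annotations and writes them to a JSONL file with srsly:

import srsly

# Dataset and model names below are placeholders.
annotations = gold_to_spacy("ner_sample", "en_core_web_sm", biluo=False)
# Each entry is [text, {"entities": [(start, end, label), ...]}], i.e. spaCy's
# (v2-style) training format.
rows = [{"text": text, "entities": annot["entities"]} for text, annot in annotations]
srsly.write_jsonl("ner_sample_training.jsonl", rows)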
Example No. 2
def model_stats(dataset, spacy_model, label=None, isPrf=False):
    """
    Evaluate model accuracy of model based on dataset with no training
    inspired from https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193/2
    found on https://support.prodi.gy/t/evaluating-precision-and-recall-of-ner/193
    got basic model evaluation by looking at the batch-train recipe
    """

    log("RECIPE: Starting recipe ner.stats", locals())
    DB = connect()
    nlp = spacy.load(spacy_model)

    if isPrf:
        examples = gold_to_spacy(dataset, spacy_model)
        score = evaluate_prf(nlp, examples)
        print("Precision {:0.4f}\tRecall {:0.4f}\tF-score {:0.4f}".format(
            score['ents_p'], score['ents_r'], score['ents_f']))

    else:
        # ripped this from ner.batch-train recipe
        model = EntityRecognizer(nlp, label=label)
        evaldoc = merge_spans(DB.get_dataset(dataset))
        evals = list(split_sentences(model.orig_nlp, evaldoc))

        scores = model.evaluate(evals)

        print(
            "Accuracy {:0.4f}\tRight {:0.0f}\tWrong {:0.0f}\tUnknown {:0.0f}\tEntities {:0.0f}"
            .format(scores['acc'], scores['right'], scores['wrong'],
                    scores['unk'], scores['ents']))
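
A short, hedged example of calling the recipe function above directly; the dataset and model names are placeholders:

# Prints precision, recall and F-score for the annotations in the dataset.
model_stats("ner_eval_sample", "en_core_web_sm", isPrf=True)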
Example No. 3
def to_patterns(dataset,
                spacy_model,
                label,
                output_file="-",
                case_sensitive=False,
                dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to token-based
    match patterns that can be used with spaCy's EntityRuler or recipes like
    ner.match. If no output file is specified, the patterns are written to
    stdout. The examples are tokenized so that multi-token terms are represented
    correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        raise ValueError(f"Can't find dataset '{dataset}'")
    examples = DB.get_dataset(dataset)
    terms = [eg["text"] for eg in examples if eg["answer"] == "accept"]
    if case_sensitive:
        patterns = [[{"text": t.text} for t in nlp.make_doc(term)]
                    for term in terms]
    else:
        patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)]
                    for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns
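
The patterns written above follow the format expected by spaCy's EntityRuler; a minimal sketch of loading them back (spaCy v2.x API, file name is a placeholder):

import spacy
import srsly
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")
ruler = EntityRuler(nlp)
ruler.add_patterns(list(srsly.read_jsonl("shoe_patterns.jsonl")))  # placeholder path
nlp.add_pipe(ruler)

doc = nlp("I just bought some new balance sneakers")
print([(ent.text, ent.label_) for ent in doc.ents])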
Example No. 4
def ner_merge(
    dataset: str,
    recon_dataset: str,
    source: Union[str, Dataset],
    output_dir: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Stream a List of `recon.types.HardestExample` instances to prodigy
    for review/correction. Uses the Prodigy blocks interface to display
    prediction error information along with ner view
    """
    log("RECIPE: Starting recipe recon.ner_merge", locals())
    if isinstance(source, str):
        recon_ds = Dataset(recon_dataset).from_disk(source)
    else:
        recon_ds = source

    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)

    prodigy_raw_examples = DB.get_dataset(dataset)
    prodigy_examples = [Example(**eg) for eg in prodigy_raw_examples if eg["answer"] == "accept"]
    prodigy_texts_to_examples = {e.text: e for e in prodigy_examples}

    prev_len = len(recon_ds)
    recon_ds.apply_("recon.v1.prodigy.merge_examples", prodigy_texts_to_examples)
    assert len(recon_ds) == prev_len

    if output_dir:
        log(f"RECIPE: Fixing {len(prodigy_examples)} examples in data")
        recon_ds.to_disk(output_dir)
Example No. 5
def main(db_name):
    db = connect()
    examples = db.get_dataset(db_name)
    print("Total examples: ", len(examples))

    diffs = []
    for ex in examples:
        if 'time_returned' in ex.keys() and 'time_loaded' in ex.keys():
            date = parser.parse(ex['time_returned']).strftime("%Y-%m-%d")
            diff = parser.parse(ex['time_returned']) - parser.parse(
                ex['time_loaded'])
            diff = diff.total_seconds()
            diffs.append({
                "date": date,
                "coder": ex['active_coder'],
                "diff": diff,
                "id": ex['id'][-16:],
                'answer': ex['answer']
            })

    df = DataFrame(diffs)
    df.to_csv("/home/andy/multiuser_prodigy/coding_summary.csv")
    os.system(
        """/usr/bin/Rscript -e 'library(rmarkdown); rmarkdown::render("multiuser_prodigy/Report.Rmd", "html_document")'"""
    )
    os.system("""echo pwd""")
Example No. 6
def main():

    dataset_name = args.dataset_name  # the dataset you want to use

    #with open("settings.json", "r") as read_file:
    #	data = json.load(read_file)

    # Connect to the database using the prodigy.json file (Can also be found in slack)
    #db = connect(data["db"],data["db_settings"])
    # Prodigy automatically uses the settings in the 'prodigy.json' file in
    # this script's directory if the script is run from this directory
    db = connect()

    # The dataset will be returned as an object
    dataset = db.get_dataset(dataset_name)

    file_ext = "jsonl"  # modify this if you want it to be saved as a different file format

    out_path = args.output_path  # location where the dataset will be saved; defaults to "./" (the script's directory)

    # Name of the file being saved, we use uuid.uuid4() to avoid overwriting files
    outfile = os.path.join(
        out_path, f"{dataset_name}_download.{uuid.uuid4()}.{file_ext}")

    # If you're writing it as JSON, use .write_json instead; refer to the srsly
    # documentation for other formats, or handle the file writing yourself
    srsly.write_jsonl(outfile, dataset)
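
The download script above relies on a module-level args object; a plausible argparse setup (argument names inferred from the attributes used above) might look like this:

import argparse
import os
import uuid

import srsly
from prodigy.components.db import connect

# Hypothetical CLI wiring matching args.dataset_name and args.output_path.
parser = argparse.ArgumentParser(description="Download a Prodigy dataset to JSONL")
parser.add_argument("dataset_name", help="name of the Prodigy dataset to export")
parser.add_argument("--output_path", default="./", help="directory to write the export to")
args = parser.parse_args()

if __name__ == "__main__":
    main()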
Example No. 7
def eval_dataset(set_id):
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    counts = Counter()
    for eg in accepted:
        for model_id in eg["accept"]:
            counts[model_id] += 1
    preference, _ = counts.most_common(1)[0]
    ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}"
    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(
        f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    if counts["A"] == counts["B"]:
        msg.warn(f"No preference ({ratio})")
    else:
        pc = counts[preference] / sum(counts.values())
        msg.good(
            f"You preferred vectors {preference} with {ratio} ({pc:.0%})")
        msg.text(mapping[preference])
Example No. 8
def options(name):
    sentinel = False
    db = connect()
    if request.method == 'POST':
        if request.form['action'] == 'List':
            
            return render_template("options.html", greeting=name, lst=db.datasets, db=db)
        elif request.form['action'] == 'Create':
            stripped_input = request.form['create'].strip()
            description = request.form['describe'].strip()
            print(stripped_input)
            if stripped_input in db.datasets:  
                return render_template("error.html", msg="User must not use a dataset name that already exists")
            if stripped_input == "":  
                return render_template("error.html", msg="User must provide non empty dataset name")
            if ' ' in stripped_input or ',' in stripped_input:  
                return render_template("error.html", msg="Dataset name can't include commas or whitespace")
            db.add_dataset(stripped_input, {"description" : description, "author" : name})
        elif request.form['action'] == 'Delete':
            
            dataset_name = request.form['dataset_name']
        
            if not db.drop_dataset(dataset_name):
                return render_template("error.html", msg="Failed to drop dataset")
            return render_template("options.html", greeting=name, lst=db.datasets, db=db)
        elif request.form['action'] == 'Continue':
            
            dataset_name = request.form['continue']
            print('Dataset_name', dataset_name)
            
            spacy_model = request.form.get("models", None)
            print('Spacy_model', spacy_model)
            
        
            input_data = request.form['input_data'].strip()
            print('User_input', input_data)
            if input_data == '':
                return render_template("error.html", msg="User must provide an input dataset")
            # user labels (this should be a string separated by commas)
            input_labels = request.form['labels'].strip()
            print('User_input', input_labels)

            for coder_info in coder_list:
                if name in coder_info['name']:
                    sentinel = True
                    mp = MultiUser(name, coder_info['port'])
                    atexit.register(mp.kill_prodigies)
                    mp.make_prodigies(dataset_name, input_data, spacy_model, input_labels.split(","))
                    mp.start_prodigies()
            if not sentinel:
                return render_template("error.html", msg="Username does not exist")
                

        elif request.form['action'] == 'Print':
            dataset_name = request.form['dataset_name']
            lst = db.get_dataset(dataset_name)
            return render_template("output.html", lst=lst, name=dataset_name)
    
    return render_template("options.html", greeting=name, lst=db.datasets, db=db)
Example No. 9
def db_out(set_id, out_dir=None, answer=None, flagged_only=False, dry=False):
    """
    Export annotations from the database. Files will be exported in
    Prodigy's JSONL format.
    """
    DB = connect()
    if set_id not in DB:
        prints("Can't find '{}' in database {}.".format(set_id, DB.db_name),
               exits=1,
               error=True)
    examples = DB.get_dataset(set_id)
    if flagged_only:
        examples = [eg for eg in examples if eg.get('flagged')]
    if answer:
        examples = [eg for eg in examples if eg.get('answer') == answer]
    if out_dir is None:
        for eg in examples:
            print(ujson.dumps(eg, escape_forward_slashes=False))
    else:
        if not out_dir.exists():
            out_dir.mkdir()
        out_file = out_dir / '{}.jsonl'.format(set_id)
        if not dry:
            write_jsonl(out_file, examples)
        prints(
            "Exported {} annotations for '{}' from database {}".format(
                len(examples), set_id, DB.db_name), out_file.resolve())
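
A hedged example of calling this export helper directly; note that out_dir is expected to be a pathlib.Path (the function calls .exists() and .mkdir() on it), and the dataset name is a placeholder:

from pathlib import Path

# Print only flagged, accepted examples to stdout
db_out("my_dataset", answer="accept", flagged_only=True)

# Or write all accepted examples to ./exports/my_dataset.jsonl
db_out("my_dataset", out_dir=Path("exports"), answer="accept")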
Example No. 10
def to_patterns(dataset=None, label=None, output_file=None):
    """
    Convert a list of seed phrases to a list of match patterns that can be used
    with ner.match. If no output file is specified, each pattern is printed
    so the recipe's output can be piped forward to ner.match.

    This is pretty much an exact copy of terms.to-patterns.
    The pattern for each example is just split on whitespace so instead of:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new balance"}]}


    which won't match anything, you'll get:

        {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}
    """
    if label is None:
        prints(
            "--label is a required argument",
            "This is the label that will be assigned to all patterns "
            "created from terms collected in this dataset. ",
            exits=1,
            error=True,
        )

    DB = connect()

    def get_pattern(term, label):
        return {
            "label": label,
            "pattern": [{
                "lower": t.lower()
            } for t in term["text"].split()]
        }

    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    if dataset is None:
        log("RECIPE: Reading input terms from sys.stdin")
        terms = (srsly.json_loads(line) for line in sys.stdin)
    else:
        if dataset not in DB:
            prints("Can't find dataset '{}'".format(dataset),
                   exits=1,
                   error=True)
        terms = DB.get_dataset(dataset)
        log("RECIPE: Reading {} input phrases from dataset {}".format(
            len(terms), dataset))
    if output_file:
        patterns = [
            get_pattern(term, label) for term in terms
            if term["answer"] == "accept"
        ]
        log("RECIPE: Generated {} patterns".format(len(patterns)))
        srsly.write_jsonl(output_file, patterns)
        prints("Exported {} patterns".format(len(patterns)), output_file)
    else:
        log("RECIPE: Outputting patterns")
        for term in terms:
            if term["answer"] == "accept":
                print(srsly.json_dumps(get_pattern(term, label)))
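
To illustrate the whitespace-split logic the docstring describes, this is what the inner get_pattern helper produces for a two-word accepted term (replicated here since the helper is local to the recipe):

# Illustrative only: same logic as get_pattern() above.
term = {"text": "new balance", "answer": "accept"}
pattern = {"label": "SHOE_BRAND",
           "pattern": [{"lower": t.lower()} for t in term["text"].split()]}
print(pattern)
# {'label': 'SHOE_BRAND', 'pattern': [{'lower': 'new'}, {'lower': 'balance'}]}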
Example No. 11
def batch_train_custom_cumulate(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False,gpu_id = None):
    if(gpu_id == 0 and torch.cuda.is_available()):
        print("Using cuda")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        cudnn.benchmark = True
    if(n_iter ==1):
        print("one pass mode")
    print("batch_size",batch_size)
    #print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    # Load the annotations first so the model can be configured from them
    examples = DB.get_dataset(dataset)
    # derive the labels needed by TextClassifier from the annotations
    labels = sorted({eg['label'] for eg in examples})
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim=300).cuda()
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model, label_size=1, optimizer=optimizer, loss=criterion)
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        evals = []
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    start_time = datetime.now()
    if len(evals) > 0:
        model.init_eval(evals)
    interval = 100
    for fac in np.arange(interval, len(examples) + interval, interval):
        examples_fac = examples[:fac]
        batch_number = len(examples_fac) / batch_size
        for i in range(n_iter):
            if shuffle:
                print("it's shuffling")
                random.shuffle(examples)
            batch_idx = 0
            loss = 0
            for batch in cytoolz.partition_all(batch_size,
                                               tqdm.tqdm(examples_fac, leave=False)):
                batch = list(batch)
                loss += model.update(batch)
                batch_idx += 1
            acc = model.evaluate(evals)
            end_time = datetime.now() - start_time
            print_('Time: [{0} seconds], process: [{1}/{2}], Epoch: [{3}/{4}], step: [{5}/{6}], Loss: {7}, Acc: {8}'.format(
                end_time.seconds, fac, len(examples) // interval, i + 1, n_iter, batch_idx + 1,
                len(examples_fac) // batch_size, loss / batch_number, acc))
    return acc
Example No. 12
def stats(set_id=None,
          list_datasets=False,
          list_sessions=False,
          no_format=False):
    """
    Print Prodigy and database statistics. Specifying a dataset ID will show
    detailed stats for the set.
    """
    DB = connect()
    prodigy_stats = {
        'version': about.__version__,
        'location': str(Path(__file__).parent),
        'prodigy_home': PRODIGY_HOME,
        'platform': platform.platform(),
        'python_version': platform.python_version(),
        'database_name': DB.db_name,
        'database_id': DB.db_id,
        'total_datasets': len(DB.datasets),
        'total_sessions': len(DB.sessions)
    }
    print_stats('Prodigy stats', prodigy_stats, no_format=no_format)
    if (list_datasets or list_sessions) and len(DB.datasets):
        print_stats('Datasets', DB.datasets, no_format, False)
    if list_sessions and len(DB.sessions):
        print_stats('Sessions', DB.sessions, no_format, False)
    if set_id:
        if set_id not in DB:
            prints("Can't find '{}' in database {}.".format(
                set_id, DB.db_name),
                   exits=1,
                   error=True)
        examples = DB.get_dataset(set_id)
        meta = DB.get_meta(set_id)
        decisions = {'accept': 0, 'reject': 0, 'ignore': 0}
        for eg in examples:
            if 'answer' in eg:
                decisions[eg['answer']] += 1
            elif 'spans' in eg:
                for span in eg['spans']:
                    if 'answer' in span:
                        decisions[span['answer']] += 1
        dataset_stats = {
            'dataset': set_id,
            'created': meta.get('created'),
            'description': meta.get('description'),
            'author': meta.get('author'),
            'annotations': len(examples),
            'accept': decisions['accept'],
            'reject': decisions['reject'],
            'ignore': decisions['ignore']
        }
        print_stats("Dataset '{}'".format(set_id),
                    dataset_stats,
                    no_format=no_format)
Example No. 13
def batch_train_custom(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False):
    if(n_iter ==1):
        print("one pass mode")
    print("batch_size",batch_size)
    #print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    # Load the annotations first so the model can be configured from them
    examples = DB.get_dataset(dataset)
    # derive the labels needed by TextClassifier from the annotations
    labels = sorted({eg['label'] for eg in examples})
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim=300)
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model, label_size=1, optimizer=optimizer, loss=criterion)
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        evals = []
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    for i in range(n_iter):
        if shuffle:
            random.shuffle(examples)
        batch_idx = 1
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            #print(j)
            batch = list(batch)
            loss = model.update(batch)
            if len(evals) > 0 and batch_idx % (4 * batch_size) == 0:
                acc = model.evaluate(evals)     
                #print_(printers.tc_update(batch_idx, loss, acc))
                print('Epoch: [{0}/{1}], Step: [{2}/{3}], Loss: {4}, Validation Acc:{5}'.format( 
                   i+1, n_iter, batch_idx, len(examples)//batch_size, loss, acc))
            batch_idx += 1
    return acc
Example No. 14
def ner_silver_to_gold(
    silver_dataset: str,
    gold_dataset: str,
    spacy_model: str,
    label: Optional[List[str]] = None,
):
    """
    Take an existing "silver" dataset with binary accept/reject annotations,
    merge the annotations to find the best possible analysis given the
    constraints defined in the annotations, and manually edit it to create
    a perfect and complete "gold" dataset.
    """
    # Connect to the database using the settings from prodigy.json, check
    # that the silver dataset exists and load it
    DB = connect()
    if silver_dataset not in DB:
        raise ValueError("Can't find dataset '{}'.".format(silver_dataset))
    silver_data = DB.get_dataset(silver_dataset)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)
    if label is None:
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
        ner = nlp.get_pipe("ner")
        label = sorted(ner.labels)

    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)

    # Merge all annotations and find the best possible analyses
    stream = model.make_best(silver_data)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        "view_id": "ner_manual",  # Annotation interface to use
        "dataset": gold_dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "config": {  # Additional config settings, mostly for app UI
            "lang": nlp.lang,
            "labels": label,  # Selectable label options
        },
    }
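
Like the other custom recipes in this collection, the function above only becomes usable from the command line once it is registered with Prodigy; a minimal sketch, assuming the code lives in a file such as ner_silver_to_gold.py (hypothetical name):

import prodigy

# Hypothetical registration; the annotation tuples are illustrative.
@prodigy.recipe(
    "ner.silver-to-gold",
    silver_dataset=("Existing binary (silver) dataset", "positional", None, str),
    gold_dataset=("Dataset to save gold annotations to", "positional", None, str),
    spacy_model=("Loadable spaCy model", "positional", None, str),
)
def ner_silver_to_gold_recipe(silver_dataset, gold_dataset, spacy_model, label=None):
    return ner_silver_to_gold(silver_dataset, gold_dataset, spacy_model, label)

# It could then be started with something like:
#   prodigy ner.silver-to-gold silver_set gold_set en_core_web_sm -F ner_silver_to_gold.py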
Example No. 15
def ner_silver_to_gold(silver_dataset, gold_dataset, spacy_model, label=[]):
    """
    Take an existing "silver" dataset with binary accept/reject annotations,
    merge the annotations to find the best possible analysis given the
    constraints defined in the annotations, and manually edit it to create
    a perfect and complete "gold" dataset.
    """
    # Connect to the database using the settings from prodigy.json, check
    # that the silver dataset exists and load it
    DB = connect()
    if silver_dataset not in DB:
        raise ValueError("Can't find dataset '{}'.".format(silver_dataset))
    silver_data = DB.get_dataset(silver_dataset)

    # Load the spaCy model
    nlp = spacy.load(spacy_model)
    if not label:
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
        ner = nlp.get_pipe('ner')
        moves = ner.move_names
        label = [
            move.split('-')[1] for move in moves
            if move[0] in ('B', 'I', 'L', 'U')
        ]
        label = sorted(set(label))

    # Initialize Prodigy's entity recognizer model, which uses beam search to
    # find all possible analyses and outputs (score, example) tuples
    model = EntityRecognizer(nlp, label=label)

    # Merge all annotations and find the best possible analyses
    stream = model.make_best(silver_data)

    # Tokenize the incoming examples and add a "tokens" property to each
    # example. Also handles pre-defined selected spans. Tokenization allows
    # faster highlighting, because the selection can "snap" to token boundaries.
    stream = add_tokens(nlp, stream)

    return {
        'view_id': 'ner_manual',  # Annotation interface to use
        'dataset': gold_dataset,  # Name of dataset to save annotations
        'stream': stream,  # Incoming stream of examples
        'config': {  # Additional config settings, mostly for app UI
            'lang': nlp.lang,
            'labels': label  # Selectable label options
        }
    }
Example No. 16
def load_sad_manual(dataset: Text, path: Text) -> Dict:
    """Load accepted pyannote.sad.manual examples

    Parameters
    ----------
    dataset : str
        Dataset containing annotations.
    path : str
        Path to annotated file

    Returns
    -------
    file : dict
        Dictionary containing the following keys:
        "audio" (Path) : path to audio file
        "annotated" (Timeline) : part of the audio annotated and accepted
        "speech" (Timeline) : part of the audio accepted as speech
    """

    db = connect()

    examples = [
        eg
        for eg in db.get_dataset(dataset)
        if eg["recipe"] == "pyannote.sad.manual"
        and eg["path"] == path
        and eg["answer"] == "accept"
    ]

    speech = Timeline(
        segments=[
            Segment(span["start"], span["end"])
            for eg in examples
            for span in eg["audio_spans"]
        ],
    ).support()

    annotated = Timeline(segments=[Segment(**eg["chunk"]) for eg in examples]).support()

    prodigy.log(f"RECIPE: {path}: loaded speech regions")

    return {
        "audio": Path(path),
        "speech": speech,
        "annotated": annotated,
    }
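
A hedged usage sketch for the loader above; the dataset name and audio path are placeholders, and the returned "speech" and "annotated" values are pyannote.core Timeline objects that can be iterated over:

# Placeholder dataset name and audio path.
sad_file = load_sad_manual("sad_annotations", "/data/audio/interview.wav")

print(sad_file["audio"])            # Path to the audio file
for segment in sad_file["speech"]:  # each item is a pyannote.core.Segment
    print(f"speech from {segment.start:.1f}s to {segment.end:.1f}s")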
Example No. 17
def to_patterns(dataset,
                spacy_model,
                label,
                output_file="-",
                case_sensitive=False,
                dry=False):
    """
    Convert a dataset of phrases collected with sense2vec.teach to token-based
    match patterns that can be used with spaCy's EntityRuler or recipes like
    ner.match. If no output file is specified, the patterns are written to
    stdout. The examples are tokenized so that multi-token terms are represented
    correctly, e.g.:
    {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]}

    For tokenization, you can either pass in the name of a spaCy model (e.g. if
    you're using a model with custom tokenization), or "blank:" plus the
    language code you want to use, e.g. blank:en or blank:de. Make sure to use
    the same language / tokenizer you're planning to use at runtime – otherwise
    your patterns may not match.
    """
    log("RECIPE: Starting recipe sense2vec.to-patterns", locals())
    if spacy_model.startswith("blank:"):
        nlp = spacy.blank(spacy_model.replace("blank:", ""))
    else:
        nlp = spacy.load(spacy_model)
    log(f"RECIPE: Loaded spaCy model '{spacy_model}'")
    DB = connect()
    if dataset not in DB:
        msg.fail(f"Can't find dataset '{dataset}'", exits=1)
    examples = DB.get_dataset(dataset)
    terms = set([eg["word"] for eg in examples if eg["answer"] == "accept"])
    if case_sensitive:
        patterns = [[{
            "text": t.text
        } for t in nlp.make_doc(term)] for term in terms]
    else:
        terms = set([word.lower() for word in terms])
        patterns = [[{
            "lower": t.lower_
        } for t in nlp.make_doc(term)] for term in terms]
    patterns = [{"label": label, "pattern": pattern} for pattern in patterns]
    log(f"RECIPE: Generated {len(patterns)} patterns")
    if not dry:
        srsly.write_jsonl(output_file, patterns)
    return patterns
Example No. 18
def drop(set_id):
    """
    Remove a dataset. Can't be undone. For a list of all dataset and session
    IDs in the database, use `prodigy stats -ls`.
    """
    DB = connect()
    if set_id not in DB:
        prints("Can't find '{}' in database {}.".format(set_id, DB.db_name),
               exits=1,
               error=True)
    dropped = DB.drop_dataset(set_id)
    if not dropped:
        prints("Can't remove '{}' from database {}.".format(
            set_id, DB.db_name),
               exits=1,
               error=True)
    prints("Removed '{}' from database {}.".format(set_id, DB.db_name),
           exits=1)
Example No. 19
def run(ske_config,
        db_config,
        dataset_name,
        index1_table_name,
        index2_table_names=None,
        ske_translation_table_name=None):

    from prodigy.components.db import connect

    db = connect(db_id='postgresql', db_settings=db_config)

    # stream = stream_manager.stream_from_db_with_predictions(ske_config, db_config, index1_table_name)
    stream = stream_manager.stream_from_db_with_lmvr_keywords(
        ske_config, db_config, index1_table_name, index2_table_names,
        ske_translation_table_name)

    run_recipe(db, stream, dataset_name, db_config, index1_table_name,
               index2_table_names)
Example No. 20
    def load_dia_binary(self, path: Text):
        """Load existing examples as constraints for diarization

        This will set (or overwrite) the following attributes and return them
            * cannot_link_time
            * must_link_time
            * dont_know_time

        Parameters
        ----------
        path : Text
            Only load examples for this file.
        """

        db = connect()

        examples = [
            eg for eg in db.get_dataset(self.dataset)
            if eg["recipe"] == "pyannote.dia.binary" and eg["path"] == path
        ]

        cannot_link: CONSTRAINTS = [(eg["t1"], eg["t2"]) for eg in examples
                                    if eg["answer"] == "reject"]
        must_link: CONSTRAINTS = [(eg["t1"], eg["t2"]) for eg in examples
                                  if eg["answer"] == "accept"]
        dont_know: CONSTRAINTS = [(eg["t1"], eg["t2"]) for eg in examples
                                  if eg["answer"] not in ["accept", "reject"]]

        if len(cannot_link) > 0:
            prodigy.log(
                f"RECIPE: {path}: init: {len(cannot_link)} cannot link constraints"
            )
        if len(must_link) > 0:
            prodigy.log(
                f"RECIPE: {path}: init: {len(must_link)} must link constraints"
            )

        # expand list of "cannot link" constraints thanks to the following rule
        # (u != v) & (v == w) ==> u != w
        cannot_link = propagate_constraints(cannot_link, must_link)

        self.cannot_link_time = cannot_link
        self.must_link_time = must_link
        self.dont_know_time = dont_know
Example No. 21
def ner_match(
    dataset: str,
    spacy_model: str,
    source: str,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
    resume: bool = False,
):
    """
    Suggest phrases that match a given patterns file, and mark whether they
    are examples of the entity you're interested in. The patterns file can
    include exact strings or token patterns for use with spaCy's `Matcher`.
    """
    # Load the spaCy model
    nlp = spacy.load(spacy_model)

    # Initialize the pattern matcher and load in the JSONL patterns
    matcher = PatternMatcher(nlp).from_disk(patterns)

    if resume:
        # Connect to the database using the settings from prodigy.json
        DB = connect()
        if dataset and dataset in DB:
            # Get the existing annotations and update the matcher
            existing = DB.get_dataset(dataset)
            matcher.update(existing)

    # Load the stream from a JSONL file and return a generator that yields a
    # dictionary for each example in the data.
    stream = JSONL(source)

    # Apply the matcher to the stream, which returns (score, example) tuples.
    # Filter out the scores to only yield the examples for annotations.
    stream = (eg for score, eg in matcher(stream))

    return {
        "view_id": "ner",  # Annotation interface to use
        "dataset": dataset,  # Name of dataset to save annotations
        "stream": stream,  # Incoming stream of examples
        "exclude": exclude,  # List of dataset names to exclude
        "config": {
            "lang": nlp.lang
        },  # Additional config settings, mostly for app UI
    }
Example No. 22
def get_prodigy_data(dataset_name, db_config, ske_config) -> GoldDataContainer:

    from prodigy.components.db import connect

    db = connect(db_id='postgresql', db_settings=db_config)

    prodigy_data = db.get_dataset(dataset_name)

    if not prodigy_data:
        log_manager.info_global(
            f"Dataset {dataset_name} doesn't exist in the prodigy database!"
        )
        return

    log_manager.info_global(f"Loaded {len(prodigy_data)} entries")

    prodigy_gold_data_container = transform_to_gold_data(prodigy_data, db_config, ske_config)

    return prodigy_gold_data_container
Example No. 23
def dataset(set_id, description=None, author=None):
    """
    Create a new Prodigy dataset. This lets you assign meta information,
    like a description, and will add the new set to the database. In order to
    collect annotations and save the results, Prodigy expects a dataset ID to
    exist in the database.
    """
    DB = connect()
    if set_id in DB:
        prints("'{}' already exists in database {}.".format(
            set_id, DB.db_name),
               exits=True,
               error=True)
    meta = {'description': description, 'author': author}
    created = DB.add_dataset(set_id, meta)
    if not created:
        prints("Couldn't add {} to database {}.".format(set_id, DB.db_name),
               exits=1,
               error=True)
    prints("Successfully added '{}' to database {}.".format(
        set_id, DB.db_name))
Example No. 24
def pipe(source=None, api=None, loader=None, from_dataset=False, exclude=None):
    """
    Load examples from an input source, and print them as newline-delimited
    JSON. This makes it easy to filter the stream with command-line utilities
    such as `grep`. It's also often useful to inspect the stream, by piping to
    `less`.
    """
    DB = connect()
    if from_dataset:
        stream = DB.get_dataset(source)
    else:
        stream = get_stream(source, api, loader)
        stream = (set_hashes(eg) for eg in stream)
    if exclude:
        log("RECIPE: Excluding tasks from datasets: {}".format(
            ', '.join(exclude)))
        exclude_hashes = DB.get_input_hashes(*exclude)
        stream = filter_inputs(stream, exclude_hashes)
    try:
        for eg in stream:
            print(ujson.dumps(eg, escape_forward_slashes=False))
    except KeyboardInterrupt:
        pass
Example No. 25
def db_in(set_id,
          in_file,
          loader=None,
          answer='accept',
          overwrite=False,
          dry=False):
    """
    Import annotations to the database. Supports all formats loadable by
    Prodigy.
    """
    DB = connect()
    if not in_file.exists() or not in_file.is_file():
        prints("Not a valid input file.", in_file, exits=1, error=True)
    if set_id not in DB:
        prints("Can't find '{}' in database {}.".format(set_id, DB.db_name),
               "Maybe you misspelled the name or forgot to add the dataset "
               "using the `dataset` command?",
               exits=1,
               error=True)
    loader = get_loader(loader, file_path=in_file)
    annotations = loader(in_file)
    annotations = [set_hashes(eg) for eg in annotations]
    added_answers = 0
    for task in annotations:
        if 'answer' not in task or overwrite:
            task['answer'] = answer
            added_answers += 1
    session_id = get_timestamp_session_id()
    if not dry:
        DB.add_dataset(session_id, session=True)
        DB.add_examples(annotations, datasets=[set_id, session_id])
    prints(
        "Imported {} annotations for '{}' to database {}".format(
            len(annotations), set_id, DB.db_name),
        "Added '{}' answer to {} annotations".format(answer, added_answers),
        "Session ID: {}".format(session_id))
Example No. 26
def eval_dataset(set_id):
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    ignored = [eg for eg in data if eg["answer"] == "ignore"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    total_count = 0
    agree_count = 0
    for eg in accepted:
        total_count += len(eg.get("options", []))
        agree_count += len(eg.get("accept", []))
    msg.info(f"Evaluating data from '{set_id}'")
    msg.text(
        f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)")
    pc = agree_count / total_count
    text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})"
    if pc > 0.5:
        msg.good(text)
    else:
        msg.fail(text)
Example No. 27
def eval_dataset(set_id):
    """Output summary about user agreement with the model."""
    DB = connect()
    data = DB.get_dataset(set_id)
    accepted = [
        eg for eg in data if eg["answer"] == "accept" and eg.get("accept")
    ]
    rejected = [eg for eg in data if eg["answer"] == "reject"]
    if not accepted and not rejected:
        msg.warn("No annotations collected", exits=1)
    high_conf = 0.8
    agree_count = 0
    disagree_high_conf = len(
        [e for e in rejected if e["confidence"] > high_conf])
    for eg in accepted:
        choice = eg["accept"][0]
        score_choice = [
            o["score"] for o in eg["options"] if o["id"] == choice
        ][0]
        score_other = [
            o["score"] for o in eg["options"] if o["id"] != choice
        ][0]
        if score_choice > score_other:
            agree_count += 1
        elif eg["confidence"] > high_conf:
            disagree_high_conf += 1
    pc = agree_count / (len(accepted) + len(rejected))
    text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})"
    msg.info(f"Evaluating data from '{set_id}'")
    if pc > 0.5:
        msg.good(text)
    else:
        msg.fail(text)
    msg.text(
        f"You disagreed on {disagree_high_conf} high confidence scores")
    msg.text(f"You rejected {len(rejected)} suggestions as not similar")
Example No. 28
import os

from prodigy.components.db import connect  # import the database connector

# add custom home path for loading project db
os.environ['PRODIGY_HOME'] = '.'

db = connect()  # uses the settings in your prodigy.json
db.add_dataset('test_dataset')  # add a dataset
assert 'test_dataset' in db  # check that the dataset was added

examples = [{'text': 'hello world', '_task_hash': 123, '_input_hash': 456}]
db.add_examples(examples, ['test_dataset'])  # add examples to the dataset
dataset = db.get_dataset('test_dataset')  # retrieve a dataset

assert len(dataset) == 1  # check that the examples were added

db.drop_dataset('test_dataset')
assert 'test_dataset' not in db

db.close()
Example No. 29
def batch_train_increment(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False,gpu_id = None):
    """
    Batch train a new text classification model from annotations. Prodigy will
    export the best result to the output directory, and include a JSONL file of
    the training and evaluation examples. You can either supply a dataset ID
    containing the evaluation data, or choose to split off a percentage of
    examples for evaluation.
    """
    #log("RECIPE: Starting recipe textcat.batch-train", locals())
    if(gpu_id):
        spacy.util.use_gpu(gpu_id)
    if(n_iter ==1):
        print("one pass mode")
    print("batch_size",batch_size)
    print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim = 300)
        pt_model.embeds.weight.data.copy_(torch.from_numpy(nlp.vocab.vectors.data))
        model = PyTorchWrapper(pt_model)
        #textcat = TextCategorizer(nlp.vocab,model)
        textcat = Loss_TextCategorizer(nlp.vocab,model)
        nlp.add_pipe(textcat)
    examples = DB.get_dataset(dataset)
    labels = {eg['label'] for eg in examples}
    labels = list(sorted(labels))
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)
    if shuffle:    
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    # best_acc = {'accuracy': 0}
    # best_model = None
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))
    batch_idx = 0
    start_time = datetime.now()
    for batch in cytoolz.partition_all(batch_size,
                                       tqdm.tqdm(examples, leave=False)):
        batch = list(batch)
        for i in range(n_iter):
            loss = model.update(batch, revise=False, drop=dropout)
            if len(evals) > 0:
                #print("optimizer averages",model.optimizer.averages)
                with nlp.use_params(model.optimizer.averages):
                    acc = model.evaluate(tqdm.tqdm(evals, leave=False))
                #print_(printers.tc_update(i, loss, acc))
                end_time = datetime.now() -start_time
                print('Time:[{0} seconds], Epoch: [{1}/{2}], batch: [{3}/{4}], Loss:{5}, Accuracy:{6}'.format( 
                   end_time.seconds,i+1, n_iter, batch_idx+1, len(examples)//batch_size, loss, acc['accuracy']))
            batch_idx += 1
    return acc
Example No. 30
def batch_train(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=10, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False):
    """
    Batch train a new text classification model from annotations. Prodigy will
    export the best result to the output directory, and include a JSONL file of
    the training and evaluation examples. You can either supply a dataset ID
    containing the evaluation data, or choose to split off a percentage of
    examples for evaluation.
    """
    #log("RECIPE: Starting recipe textcat.batch-train", locals())
    print("batch_size",batch_size)
    print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim = 300)
        pt_model.embeds.weight.data.copy_(torch.from_numpy(nlp.vocab.vectors.data))
        model = PyTorchWrapper(pt_model)
        textcat = TextCategorizer(nlp.vocab,model)
        nlp.add_pipe(textcat)

        #pt_model = LSTMSentiment(embedding_dim = 100, hidden_dim =100, vocab_size=259136, label_size=2, batch_size=3, dropout=0.5)
        #model = PyTorchWrapper(pt_model)
        #nlp = spacy.load('/home/ysun/pytorchprodigy/')
        #textcat = TextCategorizer(nlp.vocab,model)
        #nlp.add_pipe(textcat)
    examples = DB.get_dataset(dataset)
    labels = {eg['label'] for eg in examples}
    labels = list(sorted(labels))
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)
    #log('RECIPE: Initialised TextClassifier with model {}'
    #    .format(input_model), model.nlp.meta)
    if shuffle:    
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))
    for i in range(n_iter):
        loss = 0.
        random.shuffle(examples)
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            batch = list(batch)
            loss += model.update(batch, revise=False, drop=dropout)
        if len(evals) > 0:
            with nlp.use_params(model.optimizer.averages):
                acc = model.evaluate(tqdm.tqdm(evals, leave=False))
                if acc['accuracy'] > best_acc['accuracy']:
                    best_acc = dict(acc)
                    best_model = nlp.to_bytes()
            print_(printers.tc_update(i, loss, acc))
    if len(evals) > 0:
        print_(printers.tc_result(best_acc))
    if output_model is not None:
        if best_model is not None:
            nlp = nlp.from_bytes(best_model)
        msg = export_model_data(output_model, nlp, examples, evals)
        print_(msg)
    return best_acc['accuracy']