Example #1
 def serve(self, coder, port):
     print(coder)
     #filename = "{0}{1}.jsonl".format(base, coder)
     prodigy.serve('toi_blocks',       # recipe
                   "prod_dec_2020_2",  # collection
                   coder, # input file, repurposed for coder
                   port=port)  # port
Example #2
 def serve_ner(self, ner_label, port):
     print(ner_label)
     # We can actually give everyone the same document. That'll simplify the
     # directory and the update process, and may help the training process.
     filename = "data/{0}.jsonl".format(ner_label)
     prodigy.serve('ner.teach', "multiuser_test", "trained_ner",
                   filename,  None, None, ner_label, None, "multiuser_test",
                   port=port)
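For comparison, the same ner.teach session could be started from the command line (assuming Prodigy's v1.x CLI, with a concrete label standing in for ner_label):

prodigy ner.teach multiuser_test trained_ner data/PERSON.jsonl --label PERSON --exclude multiuser_test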
Example #3
def info(title, port):
    print(title)
    print(f"module name: {__name__}")
    print(f"parent process: {os.getppid()}")
    print(f"process id: {os.getpid()} \n")
    prodigy.serve(
        "ner.manual", 
        "gsr_is_protest",  # db
        "en_core_web_sm",
        "data/raw_github-issue-titles.jsonl", # input file
        port=port  # port
    )
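Since info() prints the parent and child process IDs, it is evidently meant to run in worker processes; a minimal sketch of such a driver, using nothing beyond the standard library and the function above:

from multiprocessing import Process

# One Prodigy server per port, each in its own process.
jobs = [Process(target=info, args=(f"annotator-{port}", port))
        for port in (9001, 9002)]
for job in jobs:
    job.start()
for job in jobs:
    job.join()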
Example #4
 def serve(self, coder, port):
     print(coder)
     prodigy.serve(
         'manual_custom',  # recipe
         self.dataset,  # dataset to save it in
         coder,  # input file, repurposed for coder
         "ner_manual",  # view ID
         self.label,
         self.collection,  # api, repurposed to be collection
         None,  # loader
         True,  # memorize
         None,  # exclude
         port=port)  # port
Example #5
 def serve(self, coder, port):
     print(coder)
     base = "data/protest_for_classification_"
     filename = "{0}{1}.jsonl".format(base, coder)
     prodigy.serve(
         'mark_custom',  # recipe
         "gsr_is_protest",  # db
         filename,  # input file
         "classification",  # view ID
         "PROTEST",
         None,  # api
         None,  # loader
         True,  # memorize
         "gsr_is_protest",  # exclude
         port=port)  # port
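For reference, each per-coder file such as data/protest_for_classification_alice.jsonl is a Prodigy JSONL stream with one task per line; a hypothetical example line:

{"text": "Hundreds marched downtown on Saturday.", "meta": {"source": "gsr"}}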
Example #6
 def serve(self, dataset_name, coder, port, input_data, spacy_model,
           labels):
     print('Dataset name in DB:', dataset_name, ',',
           'Coder:', coder, ',',
           'Port #:', port, ',',
           'Dataset file input:', input_data, ',',
           'Spacy Model:', spacy_model, ',',
           'User Labels:', labels)
     prodigy.serve('ner.manual',
                   dataset_name,
                   spacy_model,
                   input_data,
                   None,
                   None,
                   labels,
                   None,
                   host=ip,
                   port=port)
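Note that host=ip relies on a module-level ip that the snippet does not show. A hypothetical call, assuming the method lives on a class named Annotator:

ip = "0.0.0.0"  # assumed module-level host used by serve() above

Annotator().serve("ner_dataset", "alice", 9001,
                  "data/input.jsonl", "en_core_web_sm", "PERSON,ORG")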
Example #7
 def serve_ner_manual(self, ner_label, port):
     print(ner_label)
     # We can actually give everyone the same document. That'll simplify the
     # directory and the update process, and may help the training process.
     #filename = "data/{0}.jsonl".format(ner_label)
     filename = "data/aljazeera_1.jsonl"
     prodigy.serve('ner.manual',
                   "arabic_ner_db",
                   "arabic_model",
                   filename,
                   None,
                   None,
                   ner_label,
                   "arabic_ner_db",
                   port=port)
Example #8
 def serve(self, coder, port):
     print(coder)
     #base = "data/protest_for_classification_"
     #filename = "{0}{1}.jsonl".format(base, coder)
     prodigy.serve(
         'mark_custom',  # recipe
         "new_test",  # db
         "/liveperson/data/alloy/prodigy/data/grillbot_textcat_data3.jsonl",  #path
         coder,  # input file, repurposed for coder
         "classification",  # view ID
         "PROTEST",
         None,  # api
         None,  # loader
         True,  # memorize
         "new_test",  # exclude
         port=port)  # port
Example #9
 def serve(self, coder, port):
     print(coder)
     prodigy.serve(
         self.recipe_name,
         self.dataset,
         self.model,
         coder,
         self.collection_name,
         self.db_name,
         self.label,
         #view_id=self.view_id,
         #label=self.label,
         #None, # api
         #None, # loader
         #True, # memorize
         #None, # exclude
         port=port,
     )  # port
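A minimal sketch of the instance attributes this method assumes (the attribute names come from the call above; every value here is a placeholder):

class AnnotationServer:
    recipe_name = "ner.teach"
    dataset = "my_dataset"
    model = "en_core_web_sm"
    collection_name = "my_collection"
    db_name = "my_db"
    label = "PERSON"
    # serve(self, coder, port) as defined above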
Example #10
def stream_pre_annotated():

    for text in get_basic_text_stream():

        cat_scores = nlp(text).cats
        options_accepted = []

        for o in options:
            if cat_scores[o["text"]] >= 0.5:
                options_accepted.append(o["id"])

        yield {
            "text": text,
            "options": options,
            "accept": options_accepted,
            "meta": {
                "score": str(cat_scores)
            }
        }


@prodigy.recipe("prodigy_textcat_pre_annotated_id")
def custom_recipe():

    return {
        "view_id": "choice",
        "dataset": "prodigy_standalone_dataset",
        "stream": stream_pre_annotated(),
    }


prodigy.serve("prodigy_textcat_pre_annotated_id", **prodigy_config)
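The snippet above relies on module-level names that are not shown; a minimal stand-in under explicit assumptions (the model name is hypothetical, the option texts must match the pipeline's textcat labels, and the text stream is assumed to come from the data_provider module of Example #12):

import spacy

from example_prodigy_standalones.data_provider import get_basic_text_stream

nlp = spacy.load("my_textcat_model")  # hypothetical textcat pipeline
options = [
    {"id": "POSITIVE", "text": "POSITIVE"},
    {"id": "NEGATIVE", "text": "NEGATIVE"},
]
prodigy_config = {"choice_style": "multiple"}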
Example #11

if __name__ == '__main__':

    parser = argparse.ArgumentParser(
        description='''This is a simple tool that runs the annotation server for NER datasets.
        For more information visit https://github.com/eellak/gsoc2018-3gm/wiki/''')

    required = parser.add_argument_group('required arguments')

    required.add_argument(
        '-dataset',
        help='Dataset file to store annotations',
        required=True)
    required.add_argument(
        '-input',
        help='Corpus to draw annotations from',
        required=True)
    required.add_argument(
        '-model',
        help='Model to train',
        required=True)

    args = parser.parse_args()

    data_set = args.dataset
    input_file = args.input
    model = args.model

    prodigy.serve('ner.manual', data_set, model, input_file)
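A hypothetical invocation of the script above (the file name is assumed):

python annotation_server.py -dataset ner_annotations -input corpus.jsonl -model el_core_news_sm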
Example #12
import prodigy

from example_prodigy_standalones import data_provider


# local dict overriding any prodigy.json file
prodigy_config = {
    "choice_style": "multiple"
}


def stream_most_basic():

    for t in data_provider.get_basic_text_stream():

        yield {
            "text": t,
            "options": data_provider.choice_options
        }


@prodigy.recipe("prodigy_most_basic_id")
def custom_recipe():

    return {
        "view_id": "choice",
        "dataset": "prodigy_standalone_dataset",
        "stream": stream_most_basic(),
    }


prodigy.serve("prodigy_most_basic_id", **prodigy_config)
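For completeness, a minimal stand-in for the data_provider module imported above, inferred from the two attributes the snippet uses (both the texts and the option ids are made up):

# example_prodigy_standalones/data_provider.py (hypothetical contents)

choice_options = [
    {"id": "POSITIVE", "text": "POSITIVE"},
    {"id": "NEGATIVE", "text": "NEGATIVE"},
]


def get_basic_text_stream():
    yield "This product exceeded my expectations."
    yield "The delivery was late and the box was damaged."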
Example #13
def run_recipe(db, stream, dataset_name, db_config, index1_table_name,
               index2_table_names):

    import prodigy

    @prodigy.recipe("cats_recipe")
    def choice():

        db_connection = None
        db_cursor = None

        # custom function to run when an annotation is complete
        def update(examples):

            log_manager.debug_global("Prodigy: updating ...")

            nonlocal db_connection
            nonlocal db_cursor

            db_connection, db_cursor = db_manager.open_db_connection(
                db_config, db_connection, db_cursor)

            assert db_connection and db_connection.closed == 0  # 0 means 'open'
            assert db_cursor and not db_cursor.closed

            for example in examples:
                try:

                    if index1_table_name and 'url' in example['meta']:
                        url = example['meta']['url']

                        log_manager.debug_global(
                            f"Storing annotation meta info for url={url} in table {index1_table_name} ..."
                        )

                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(
                                            index1_table_name),
                                        pk=sql.Identifier('url')),
                            {'value': url})

                    # TODO: this could be made safer to ensure
                    # that index2 won't be updated accidentally with 'already_annotated'
                    # when we are actually only streaming from index1.
                    #
                    # Currently the stream from index1 does not set 'docid' in example['meta'],
                    # but this may not be good to rely on.
                    if index2_table_names and 'docid' in example['meta']:
                        docid = example['meta']['docid']

                        log_manager.debug_global(
                            f"Storing annotation meta info for docid={docid} in table {index2_table_names['scores']} ..."
                        )

                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(
                                            index2_table_names['scores']),
                                        pk=sql.Identifier('docid')),
                            {'value': docid})

                    db_connection.commit()

                except Exception as ex:

                    log_manager.info_global(
                        f"Error storing an annotation in the database: {ex}")

                    db_connection.rollback()

        # Custom function to run when the user exits Prodigy.
        # TODO: closing the database connection here is not ideal because there
        # might be multiple users. It is harmless, though: the connection can be
        # reopened at the next update, and there is no better hook for it (see
        # https://prodi.gy/docs/custom-recipes). Placed here, it at least closes
        # the connection when the last user stops annotating.
        def on_exit(controller):

            log_manager.debug_global("Prodigy: exiting ...")

            db_manager.close_db_connection(db_connection, db_cursor)

        return {
            "view_id": "choice",
            "dataset": dataset_name,
            "stream": stream,
            "db": db,
            "update": update,
            "on_exit": on_exit,
        }

    log_manager.debug_global("Starting up the prodigy server ...")

    prodigy.serve(
        "cats_recipe",
        host="0.0.0.0",
        choice_style="multiple",
    )
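For reference, a minimal sketch of the kind of stream run_recipe() expects, inferred from the meta fields that update() reads back (the column names are assumptions):

def make_stream(rows):
    # Each task carries 'url' (index1) or 'docid' (index2) in its meta;
    # update() above uses these to mark the source row as already_annotated.
    for row in rows:
        yield {
            "text": row["text"],
            "options": [{"id": "CAT_A", "text": "Category A"},
                        {"id": "CAT_B", "text": "Category B"}],
            "meta": {"url": row["url"]},
        }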
Example #14
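The db handle used below is assumed to come from Prodigy's bundled database API:

from prodigy.components.db import connect

db = connect()  # uses the database settings from prodigy.json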
# create new dataset
db.add_dataset(dataset)
# add examples to the (existing!) dataset
db.add_examples(jsonl_annotations, datasets=[dataset])

reviewed_dataset = input("Enter NEW dataset name for reviewed transcription: ")
while reviewed_dataset in db:
    reviewed_dataset = input(
        "Dataset name already taken. Enter NEW dataset name for reviewed transcription: "
    )

print("\nPress ctrl+c when done transcribing\n")
# open Prodigy to transcribe manually / review the automatic transcription
# (--fetch-media loads in the audio as well)
prodigy.serve("audio.transcribe {} dataset:{} --fetch-media".format(
    reviewed_dataset, dataset))

#input("Type 'ok' when having saved the corrected transcription: ")

reviewed_transcription = db.get_dataset(reviewed_dataset)

# print(reviewed_transcription)

for dic in reviewed_transcription:
    final_transcription = dic[
        "transcript"]  # should really always be just one in this context
    # TypeError: list indices must be integers or slices, not str!!!
    # No dictionary in the list??? Print the types...

#prepare for annotation: 1 sentence at a time
"""labels = []
while input() != "ok":