def serve(self, coder, port):
    print(coder)
    #filename = "{0}{1}.jsonl".format(base, coder)
    prodigy.serve('toi_blocks',       # recipe
                  "prod_dec_2020_2",  # collection
                  coder,              # input file, repurposed for coder
                  port=port)          # port
def serve_ner(self, ner_label, port):
    print(ner_label)
    # We can actually give everyone the same document. That'll simplify the
    # directory and the update process, and may help the training process.
    filename = "data/{0}.jsonl".format(ner_label)
    prodigy.serve('ner.teach', "multiuser_test", "trained_ner", filename,
                  None, None, ner_label, None, "multiuser_test",
                  port=port)
def info(title, port):
    print(title)
    print(f"module name: {__name__}")
    print(f"parent process: {os.getppid()}")
    print(f"process id: {os.getpid()}\n")
    prodigy.serve(
        "ner.manual",
        "gsr_is_protest",                      # db
        "en_core_web_sm",
        "data/raw_github-issue-titles.jsonl",  # input file
        port=port                              # port
    )
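The pid/ppid prints above suggest one server process per annotator. A minimal driver sketch under that assumption, spawning one Prodigy instance per coder on its own port; the recipe, dataset, file names, and coder list here are placeholders, not from the original project:

import multiprocessing
import prodigy

def serve(coder, port):
    # hypothetical per-coder wrapper in the style of the methods above
    print(coder)
    prodigy.serve("textcat.manual my_dataset data/{}.jsonl --label PROTEST".format(coder),
                  port=port)

if __name__ == "__main__":
    processes = []
    for offset, coder in enumerate(["alice", "bob", "carol"]):
        # one Prodigy instance per annotator, each bound to its own port
        p = multiprocessing.Process(target=serve, args=(coder, 9010 + offset))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()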
def serve(self, coder, port):
    print(coder)
    prodigy.serve(
        'manual_custom',  # recipe
        self.dataset,     # dataset to save it in
        coder,            # input file, repurposed for coder
        "ner_manual",     # view ID
        self.label,
        self.collection,  # api, repurposed to be collection
        None,             # loader
        True,             # memorize
        None,             # exclude
        port=port)        # port
def serve(self, coder, port):
    print(coder)
    base = "data/protest_for_classification_"
    filename = "{0}{1}.jsonl".format(base, coder)
    prodigy.serve(
        'mark_custom',     # recipe
        "gsr_is_protest",  # db
        filename,          # input file
        "classification",  # view ID
        "PROTEST",
        None,              # api
        None,              # loader
        True,              # memorize
        "gsr_is_protest",  # exclude
        port=port)         # port
def serve(self, dataset_name, coder, port, input_data, spacy_model, labels):
    print('Dataset name in DB:', dataset_name, ',',
          'Coder:', coder, ',',
          'Port #:', port, ',',
          'Dataset file input:', input_data, ',',
          'Spacy Model:', spacy_model, ',',
          'User Labels:', labels)
    # `ip` is expected to be defined at module level (the host to bind to)
    prodigy.serve('ner.manual', dataset_name, spacy_model, input_data,
                  None, None, labels, None,
                  host=ip, port=port)
def serve_ner_manual(self, ner_label, port):
    print(ner_label)
    # We can actually give everyone the same document. That'll simplify the
    # directory and the update process, and may help the training process.
    #filename = "data/{0}.jsonl".format(ner_label)
    filename = "data/aljazeera_1.jsonl"
    prodigy.serve('ner.manual', "arabic_ner_db", "arabic_model", filename,
                  None, None, ner_label, "arabic_ner_db",
                  port=port)
def serve(self, coder, port):
    print(coder)
    #base = "data/protest_for_classification_"
    #filename = "{0}{1}.jsonl".format(base, coder)
    prodigy.serve(
        'mark_custom',     # recipe
        "new_test",        # db
        "/liveperson/data/alloy/prodigy/data/grillbot_textcat_data3.jsonl",  # path
        coder,             # input file, repurposed for coder
        "classification",  # view ID
        "PROTEST",
        None,              # api
        None,              # loaders
        True,              # memorize
        "new_test",        # exclude
        port=port)         # port
def serve(self, coder, port):
    print(coder)
    prodigy.serve(
        self.recipe_name,
        self.dataset,
        self.model,
        coder,
        self.collection_name,
        self.db_name,
        self.label,
        #view_id=self.view_id,
        #label=self.label,
        #None,  # api
        #None,  # loader
        #True,  # memorize
        #None,  # exclude
        port=port,  # port
    )
def stream_pre_annotated():
    for text in get_basic_text_stream():
        cat_scores = nlp(text).cats
        options_accepted = []
        for o in options:
            if cat_scores[o["text"]] >= 0.5:
                options_accepted.append(o["id"])
        yield {
            "text": text,
            "options": options,
            "accept": options_accepted,
            "meta": {"score": str(cat_scores)},
        }

@prodigy.recipe("prodigy_textcat_pre_annotated_id")
def custom_recipe():
    return {
        "view_id": "choice",
        "dataset": "prodigy_standalone_dataset",
        "stream": stream_pre_annotated(),
    }

prodigy.serve("prodigy_textcat_pre_annotated_id", **prodigy_config)
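The stream above assumes a trained textcat pipeline (nlp) whose labels match the option texts, plus shared options and prodigy_config objects. A minimal sketch of that assumed context; the model path and labels are placeholders:

import spacy

# Assumed context for stream_pre_annotated(); these names and values are
# illustrative, not taken from the original project.
nlp = spacy.load("./textcat_model")  # pipeline with a textcat component
options = [
    {"id": "sports", "text": "sports"},
    {"id": "politics", "text": "politics"},
]
prodigy_config = {"choice_style": "multiple"}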
import argparse

import prodigy

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='''This is a simple tool that runs the annotation server for NER datasets. For more information visit https://github.com/eellak/gsoc2018-3gm/wiki/''')
    required = parser.add_argument_group('required arguments')
    required.add_argument(
        '-dataset', help='Dataset file to store annotations', required=True)
    required.add_argument(
        '-input', help='Corpus to draw annotations from', required=True)
    required.add_argument(
        '-model', help='model to train', required=True)
    args = parser.parse_args()
    data_set = args.dataset
    input_file = args.input
    model = args.model
    prodigy.serve('ner.manual', data_set, model, input_file)
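A usage sketch for the script above; the file name is an assumption, and note the flags are single-dash as defined in the argparse setup:

# python3 annotation_server.py -dataset my_ner_dataset -input corpus.txt -model el_core_news_sm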
import prodigy

from example_prodigy_standalones import data_provider

# local dict overriding any prodigy.json file
prodigy_config = {
    "choice_style": "multiple"
}

def stream_most_basic():
    for t in data_provider.get_basic_text_stream():
        yield {
            "text": t,
            "options": data_provider.choice_options
        }

@prodigy.recipe("prodigy_most_basic_id")
def custom_recipe():
    return {
        "view_id": "choice",
        "dataset": "prodigy_standalone_dataset",
        "stream": stream_most_basic(),
    }

prodigy.serve("prodigy_most_basic_id", **prodigy_config)
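For reference, a minimal sketch of what the data_provider module is assumed to expose here; the texts and options are placeholders, not the project's real data:

# hypothetical contents of example_prodigy_standalones/data_provider.py
choice_options = [
    {"id": "sports", "text": "sports"},
    {"id": "politics", "text": "politics"},
]

def get_basic_text_stream():
    for text in ["First example text.", "Second example text."]:
        yield text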
def run_recipe(db, stream, dataset_name, db_config, index1_table_name, index2_table_names):
    import prodigy

    @prodigy.recipe("cats_recipe")
    def choice():
        db_connection = None
        db_cursor = None

        # custom function to run when an annotation is complete
        def update(examples):
            log_manager.debug_global("Prodigy: updating ...")
            nonlocal db_connection
            nonlocal db_cursor
            db_connection, db_cursor = db_manager.open_db_connection(
                db_config, db_connection, db_cursor)
            assert db_connection and db_connection.closed == 0  # 0 means 'open'
            assert db_cursor and not db_cursor.closed
            for example in examples:
                try:
                    if index1_table_name and 'url' in example['meta']:
                        url = example['meta']['url']
                        log_manager.debug_global(
                            f"Storing annotation meta info for url={url} in table {index1_table_name} ..."
                        )
                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(index1_table_name),
                                        pk=sql.Identifier('url')),
                            {'value': url})
                    # TODO: this could be made safer to ensure that index2 won't be
                    # updated accidentally with 'already_annotated' when we are
                    # actually only streaming from index1.
                    #
                    # Currently the stream from index1 does not set 'docid' in
                    # example['meta'], but this may not be good to rely on.
                    if index2_table_names and 'docid' in example['meta']:
                        docid = example['meta']['docid']
                        log_manager.debug_global(
                            f"Storing annotation meta info for docid={docid} in table {index2_table_names['scores']} ..."
                        )
                        db_cursor.execute(
                            sql.SQL("UPDATE {index_table_name} "
                                    "SET already_annotated = TRUE "
                                    "WHERE {pk} = %(value)s").format(
                                        index_table_name=sql.Identifier(index2_table_names['scores']),
                                        pk=sql.Identifier('docid')),
                            {'value': docid})
                    db_connection.commit()
                except Exception as ex:
                    log_manager.info_global(
                        f"Error storing an annotation in the database: {ex}")
                    db_connection.rollback()

        # custom function to run when the user exits Prodigy.
        # TODO: it is not ideal to close the database connection here because
        # there might be multiple users. But it won't hurt: the connection can be
        # reopened at the next update, and there is no better hook for it; see
        # https://prodi.gy/docs/custom-recipes
        # At least, put here, it will close the connection when the last user
        # stops annotating.
        def on_exit(controller):
            log_manager.debug_global("Prodigy: exiting ...")
            db_manager.close_db_connection(db_connection, db_cursor)

        return {
            "view_id": "choice",
            "dataset": dataset_name,
            "stream": stream,
            "db": db,
            "update": update,
            "on_exit": on_exit,
        }

    log_manager.debug_global("Starting up the prodigy server ...")
    prodigy.serve(
        "cats_recipe",
        host="0.0.0.0",
        choice_style="multiple",
    )
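A hypothetical invocation sketch for run_recipe; all values below are placeholders, and in the original project db is its own database component and the stream comes from the index tables:

example_stream = [
    {
        "text": "Some document text.",
        "options": [{"id": "relevant", "text": "relevant"}],
        "meta": {"url": "https://example.com/article-1"},
    },
]
example_db_config = {
    "host": "localhost",
    "port": 5432,
    "dbname": "annotations",
    "user": "prodigy",
    "password": "...",  # placeholder
}
run_recipe(
    db=None,  # placeholder; the project passes its own database component
    stream=example_stream,
    dataset_name="cats_dataset",
    db_config=example_db_config,
    index1_table_name="index1_urls",
    index2_table_names={"scores": "index2_scores"},
)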
# create new dataset
db.add_dataset(dataset)
# add examples to the (existing!) dataset
db.add_examples(jsonl_annotations, datasets=[dataset])

reviewed_dataset = input("Enter NEW dataset name for reviewed transcription: ")
while reviewed_dataset in db:
    reviewed_dataset = input(
        "Database name already taken. Enter NEW dataset name for reviewed transcription: "
    )

print("\nPress ctrl+c when done transcribing\n")

# open prodigy to transcribe manually / review the automatic transcription
# (--fetch-media loads in the audio as well)
prodigy.serve("audio.transcribe {} dataset:{} --fetch-media".format(
    reviewed_dataset, dataset))

#input("Type 'ok' when having saved the corrected transcription: ")
reviewed_transcription = db.get_dataset(reviewed_dataset)
#print(reviewed_transcription)

for dic in reviewed_transcription:
    # should really only ever be one example in this context
    final_transcription = dic["transcript"]
# TypeError: list indices must be integers or slices, not str!!!!
# no dictionary in the list??? print the types ....

# prepare for annotation: 1 sentence at a time
# labels = []
# while input() != "ok":
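The TypeError note above suggests the dataset sometimes comes back as something other than a list of dicts. A defensive sketch for extracting the transcripts, assuming each saved example is a dict carrying a "transcript" field:

final_transcripts = [
    eg["transcript"]
    for eg in db.get_dataset(reviewed_dataset)
    if isinstance(eg, dict) and "transcript" in eg
]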