def batch_train_custom(dataset, input_model=None, output_model=None, lang='en',
                       factor=1, dropout=0.2, n_iter=1, batch_size=10,
                       eval_id=None, eval_split=None, long_text=False,
                       silent=False, shuffle=False):
    """
    Train a text classifier on the annotations stored in `dataset`.

    When `input_model` is given, a spaCy pipeline is loaded and wrapped in a
    `TextClassifier`; otherwise a `FastText` network wrapped in
    `Prodigy_model` is built. The model is updated batch by batch and, when
    evaluation examples are available, evaluated every `4 * batch_size`
    batches.

    dataset: name of the dataset holding the training examples.
    input_model: spaCy model name/path, or None to build the custom model.
    factor: fraction of the training examples to keep (prefix slice).
    n_iter: number of passes over the training examples.
    batch_size: number of examples per update.
    eval_id: name of the dataset holding evaluation examples (optional).
    silent: forwarded to `get_print` to control the `print_` helper.
    shuffle: shuffle the training examples at the start of each pass.
    output_model, lang, dropout (only echoed in the config printout),
    eval_split, long_text (only used with `input_model`): accepted for
    signature compatibility with the other batch-train recipes.

    Returns the result of the last `model.evaluate` call, or None if no
    evaluation was performed.
    """
    if n_iter == 1:
        print("one pass mode")
    print("batch_size", batch_size)
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)

    # Load the data first: the spaCy branch below needs the examples to
    # derive the label set and the low_data flag.
    examples = DB.get_dataset(dataset)
    evals = []  # stays empty when no evaluation dataset is requested
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))

    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        # Same label derivation as the sibling batch_train recipes.
        labels = sorted({eg['label'] for eg in examples})
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim=300)
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model, label_size=1, optimizer=optimizer,
                              loss=criterion)

    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())

    acc = None  # returned as-is when evaluation never runs
    total_steps = len(examples) // batch_size
    for i in range(n_iter):
        if shuffle:
            random.shuffle(examples)
        batch_idx = 1
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            loss = model.update(list(batch))
            # Evaluating is expensive, so only do it every 4*batch_size steps.
            if len(evals) > 0 and batch_idx % (4 * batch_size) == 0:
                acc = model.evaluate(evals)
                print('Epoch: [{0}/{1}], Step: [{2}/{3}], Loss: {4}, Validation Acc:{5}'.format(
                    i + 1, n_iter, batch_idx, total_steps, loss, acc))
            batch_idx += 1
    return acc
def textcat_teach(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Model-in-the-loop annotation recipe for text classification.

    The examples read from `source` are scored by a `TextClassifier` (and,
    if `patterns` is given, also by a `PatternMatcher`), then passed through
    `prefer_uncertain` before being served for annotation.

    dataset: name of the dataset annotations are saved to.
    spacy_model: name or path of the spaCy model to load.
    source: path of the JSONL file to stream examples from.
    label: label(s) handed to the text classifier.
    patterns: optional path of a JSONL patterns file.
    exclude: optional list of dataset names to exclude.

    Returns the dictionary of recipe components.
    """
    # One dictionary per example, loaded from the JSONL source.
    stream = JSONL(source)
    nlp = spacy.load(spacy_model)
    # The classifier yields (score, example) tuples when called on a stream.
    model = TextClassifier(nlp, label)

    if patterns is not None:
        # The matcher labels the whole task rather than highlighted spans.
        matcher = PatternMatcher(
            nlp,
            prior_correct=5.0,
            prior_incorrect=5.0,
            label_span=False,
            label_task=True,
        ).from_disk(patterns)
        # Interleave suggestions from both and update both on every answer.
        predict, update = combine_models(model, matcher)
    else:
        # Model only: it both scores the stream and receives the updates.
        predict, update = model, model.update

    # The sorter consumes (score, example) tuples and yields examples only.
    stream = prefer_uncertain(predict(stream))

    components = {
        "view_id": "classification",  # annotation interface
        "dataset": dataset,  # where annotations are stored
        "stream": stream,  # incoming examples
        "update": update,  # called with each batch of answers
        "exclude": exclude,  # datasets whose examples are skipped
        "config": {"lang": nlp.lang},  # extra settings for the app
    }
    return components
def batch_train_custom_cumulate(dataset, input_model=None, output_model=None,
                                lang='en', factor=1, dropout=0.2, n_iter=1,
                                batch_size=10, eval_id=None, eval_split=None,
                                long_text=False, silent=False, shuffle=False,
                                gpu_id=None):
    """
    Train on cumulatively growing prefixes of the training examples.

    The examples are consumed in steps of 100: first the first 100, then
    the first 200, and so on. For every prefix the model is trained for
    `n_iter` passes and, when evaluation examples exist, evaluated after
    each pass.

    dataset: name of the dataset holding the training examples.
    input_model: spaCy model name/path, or None to build the custom model.
    factor: fraction of the training examples to keep (prefix slice).
    n_iter: number of passes per prefix.
    batch_size: number of examples per update.
    eval_id: name of the dataset holding evaluation examples (optional).
    silent: forwarded to `get_print` to control the `print_` helper.
    shuffle: shuffle the current prefix at the start of every pass.
    gpu_id: when 0 and CUDA is available, CUDA_VISIBLE_DEVICES is set to it.
    output_model, lang, dropout (only echoed in the config printout),
    eval_split, long_text (only used with `input_model`): accepted for
    signature compatibility with the other batch-train recipes.

    Returns the result of the last `model.evaluate` call, or None if no
    evaluation was performed.
    """
    if gpu_id == 0 and torch.cuda.is_available():
        print("Using cuda")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        cudnn.benchmark = True
    if n_iter == 1:
        print("one pass mode")
    print("batch_size", batch_size)
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)

    # Load the data first: the spaCy branch below needs the examples to
    # derive the label set and the low_data flag.
    examples = DB.get_dataset(dataset)
    evals = []  # stays empty when no evaluation dataset is requested
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))

    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        # Same label derivation as the sibling batch_train recipes.
        labels = sorted({eg['label'] for eg in examples})
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        # NOTE(review): .cuda() is unconditional, so this branch requires a
        # CUDA device regardless of gpu_id -- confirm that is intended.
        pt_model = FastText(vocab_size=684831, emb_dim=300).cuda()
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model, label_size=1, optimizer=optimizer,
                              loss=criterion)

    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())

    start_time = datetime.now()
    if len(evals) > 0:
        model.init_eval(evals)

    interval = 100
    acc = None  # returned as-is when evaluation never runs
    for fac in np.arange(interval, len(examples) + interval, interval):
        # Slicing copies, so shuffling the prefix leaves `examples` intact
        # and later prefixes still extend the same ordering.
        examples_fac = examples[:fac]
        for i in range(n_iter):
            if shuffle:
                print("it's shuffling")
                random.shuffle(examples_fac)
            batch_idx = 0
            loss = 0
            for batch in cytoolz.partition_all(
                    batch_size, tqdm.tqdm(examples_fac, leave=False)):
                loss += model.update(list(batch))
                batch_idx += 1
            if len(evals) > 0:
                acc = model.evaluate(evals)
            elapsed = datetime.now() - start_time
            # Average the accumulated loss over the batches actually run.
            mean_loss = loss / max(batch_idx, 1)
            print_('Time:[{0} seconds], process: [{1}/{2}], Epoch: [{3}/{4}], step: [{5}/{6}], Loss: {7},Acc:{8}'.format(
                elapsed.seconds, fac, len(examples) // interval, i + 1,
                n_iter, batch_idx + 1, len(examples_fac) // batch_size,
                mean_loss, acc))
    return acc
def custom_with_recipe_html_template():
    """
    Assemble the components of an HTML-template annotation recipe.

    Takes no arguments; `spacy_model`, `labels`, `example_jsonl`, `dataset`
    and `template_text` are read from the enclosing scope.

    Returns the dictionary of recipe components.
    """
    nlp = spacy.load(spacy_model)
    model = TextClassifier(nlp, labels, long_text=False)
    # Read the JSONL examples and filter duplicates by input, not by task.
    stream = filter_duplicates(JSONL(example_jsonl),
                               by_input=True, by_task=False)
    config = {
        'labels': labels,
        'html_template': template_text,
    }
    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        # The dataset being annotated is itself on the exclude list.
        'exclude': [dataset],
        'update': model.update,
        'config': config,
    }
def evaluate(dataset, spacy_model, source, label='', api=None, loader=None,
             exclude=None):
    """
    Build an evaluation set from a stream and score the model on exit.

    Every streamed item is decorated with attention data taken from the
    pipeline's 'textcat' component before being shown in the HTML
    interface. When the session ends, the model is evaluated on the
    examples stored in `dataset` and the result is printed.

    dataset: name of the dataset annotations are saved to and read from.
    spacy_model: name or path of the spaCy model to load.
    source, api, loader: forwarded to `get_stream`.
    label: label(s) handed to the text classifier.
    exclude: dataset names to exclude.

    Returns the dictionary of recipe components.
    """
    log("RECIPE: Starting recipe attncat.eval", locals())
    nlp = spacy.load(spacy_model, disable=['tagger', 'parser', 'ner'])

    # The attention weights are read off the text categorizer component.
    textcat = nlp.get_pipe('textcat')
    assert textcat is not None

    with get_attention_weights(textcat) as attn_weights:
        raw_stream = get_stream(source, api, loader)
        # Attach the attention data to each item of the stream.
        stream = attach_attention_data(raw_stream, nlp, attn_weights)

        model = TextClassifier(nlp, label)
        init_msg = 'RECIPE: Initialised TextClassifier with model {}'.format(
            spacy_model)
        log(init_msg, model.nlp.meta)

        def on_exit(ctrl):
            # Score the model on everything stored in the dataset so far.
            annotated = ctrl.db.get_dataset(dataset)
            scores = dict(model.evaluate(annotated))
            print(printers.tc_result(scores))

        config = {
            'lang': nlp.lang,
            'labels': model.labels,
            'html_template': template_text,
        }
        return {
            'view_id': 'html',
            'dataset': dataset,
            'stream': stream,
            'exclude': exclude,
            'on_exit': on_exit,
            'config': config,
        }
def evaluate(dataset, spacy_model, source, label='', api=None, loader=None,
             exclude=None):
    """
    Build an evaluation set from a stream and score the model on exit.

    Every streamed item is decorated by `attach_structural_sensitivity_data`
    for the first comma-separated entry of `label` before being shown in
    the HTML interface. When the session ends, the model is evaluated on
    the examples stored in `dataset` and the result is printed.

    dataset: name of the dataset annotations are saved to and read from.
    spacy_model: name or path of the spaCy model to load.
    source, api, loader: forwarded to `get_stream`.
    label: comma-separated label string; also handed to the classifier.
    exclude: dataset names to exclude.

    Returns the dictionary of recipe components.
    """
    log("RECIPE: Starting recipe attncat.eval", locals())
    nlp = spacy.load(spacy_model, disable=['tagger', 'parser', 'ner'])

    raw_stream = get_stream(source, api, loader)
    # Only the first label of the comma-separated list is passed on.
    first_label = label.split(',')[0]
    stream = attach_structural_sensitivity_data(raw_stream, nlp, first_label)

    model = TextClassifier(nlp, label)
    init_msg = 'RECIPE: Initialised TextClassifier with model {}'.format(
        spacy_model)
    log(init_msg, model.nlp.meta)

    def on_exit(ctrl):
        # Score the model on everything stored in the dataset so far.
        annotated = ctrl.db.get_dataset(dataset)
        scores = dict(model.evaluate(annotated))
        print(printers.tc_result(scores))

    config = {
        'lang': nlp.lang,
        'labels': model.labels,
        'html_template': template_text,
    }
    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'on_exit': on_exit,
        'config': config,
    }
def batch_train_increment(dataset, input_model=None, output_model=None,
                          lang='en', factor=1, dropout=0.2, n_iter=1,
                          batch_size=10, eval_id=None, eval_split=None,
                          long_text=False, silent=False, shuffle=False,
                          gpu_id=None):
    """
    Incrementally train a text classification model from annotations.

    The training examples are visited one batch at a time; each batch is
    used for `n_iter` consecutive updates and, when evaluation examples
    exist, the model is evaluated after every update. Nothing is written
    to disk: `output_model` is accepted but not used.

    You can either supply a dataset ID containing the evaluation data, or
    let `split_evals` split off a share of the examples for evaluation.

    dataset: name of the dataset holding the training examples.
    input_model: spaCy model name/path; when None, 'en_core_web_lg' is
        loaded and a FastText-based text categorizer is added to it.
    factor: fraction of the training examples to keep (prefix slice).
    dropout: forwarded to `model.update` as `drop`.
    n_iter: number of consecutive updates per batch.
    batch_size: number of examples per batch.
    eval_id: name of the dataset holding evaluation examples (optional).
    eval_split: forwarded to `split_evals` when `eval_id` is not given.
    long_text: split the examples into sentences before training.
    silent: forwarded to `get_print` to control the `print_` helper.
    shuffle: shuffle the examples before and after the evaluation split.
    gpu_id: GPU device id handed to `spacy.util.use_gpu`; None skips it.
    lang: accepted for signature compatibility, not used.

    Returns the result of the last `model.evaluate` call, or None if no
    evaluation was performed.
    """
    # `is not None` rather than truthiness, so that device 0 is honoured.
    if gpu_id is not None:
        spacy.util.use_gpu(gpu_id)
    if n_iter == 1:
        print("one pass mode")
    print("batch_size", batch_size)
    print(factor, type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)

    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim=300)
        # Initialise the embedding layer from the pipeline's word vectors.
        pt_model.embeds.weight.data.copy_(
            torch.from_numpy(nlp.vocab.vectors.data))
        wrapped = PyTorchWrapper(pt_model)
        textcat = Loss_TextCategorizer(nlp.vocab, wrapped)
        nlp.add_pipe(textcat)

    examples = DB.get_dataset(dataset)
    labels = sorted({eg['label'] for eg in examples})
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)

    if shuffle:
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")

    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)

    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))

    acc = None  # returned as-is when evaluation never runs
    batch_idx = 0
    total_batches = len(examples) // batch_size
    start_time = datetime.now()
    for batch in cytoolz.partition_all(batch_size,
                                       tqdm.tqdm(examples, leave=False)):
        batch = list(batch)
        for i in range(n_iter):
            loss = model.update(batch, revise=False, drop=dropout)
            if len(evals) > 0:
                # Evaluate with the optimizer's averaged parameters.
                with nlp.use_params(model.optimizer.averages):
                    acc = model.evaluate(tqdm.tqdm(evals, leave=False))
            elapsed = datetime.now() - start_time
            accuracy = acc['accuracy'] if acc is not None else None
            print('Time:[{0} seconds], Epoch: [{1}/{2}], batch: [{3}/{4}], Loss:{5}, Accuracy:{6}'.format(
                elapsed.seconds, i + 1, n_iter, batch_idx + 1,
                total_batches, loss, accuracy))
        batch_idx += 1
    return acc
def textcat_al(dataset, spacy_model, source=None, label='', api=None,
               patterns=None, loader=None, long_text=False, exclude=None):
    """
    Active-learning annotation recipe for text classification.

    The stream read from `source` is passed through `test_stream` together
    with the prediction function, and every batch of answers is fed back
    to the model (and to the pattern matcher, when `patterns` is given).

    dataset: name of the dataset annotations are saved to.
    spacy_model: a spaCy model name/path (str), an already constructed
        model object that is used as-is, or None to build a FastText-based
        text categorizer on top of 'en_core_web_lg'.
    source: forwarded to `get_stream`.
    label: label(s) handed to the text classifier.
    patterns: optional path of a patterns file for a `PatternMatcher`.
    long_text: forwarded to the text classifier.
    exclude: dataset names to exclude.
    api, loader: accepted for signature compatibility, not used.

    Returns the dictionary of recipe components. Its 'on_exit' callback
    returns the model, with or without the controller argument.
    """
    if spacy_model is None:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText_test(vocab_size=684831, emb_dim=300)
        # Initialise the embedding layer from the pipeline's word vectors.
        pt_model.embeds.weight.data.copy_(
            torch.from_numpy(nlp.vocab.vectors.data))
        textcat = Loss_TextCategorizer(nlp.vocab, PyTorchWrapper(pt_model))
        nlp.add_pipe(textcat)
        model = TextClassifier(nlp, label, long_text=long_text)
    elif isinstance(spacy_model, str):
        print("Load model ", spacy_model)
        nlp = spacy.load(spacy_model, disable=['ner', 'parser'])
        model = TextClassifier(nlp, label, long_text=long_text)
    else:
        # A ready-made model object was handed in; use it unchanged.
        model = spacy_model

    stream = get_stream(source, input_key='text')

    if patterns is None:
        predict = model
        update = model.update
    else:
        matcher = PatternMatcher(model.nlp, prior_correct=5.,
                                 prior_incorrect=5., label_span=False,
                                 label_task=True)
        matcher = matcher.from_disk(patterns)
        # Combine the textcat model with the PatternMatcher to annotate both
        # match results and predictions, and update both models.
        predict, update = combine_models(model, matcher)

    stream = test_stream(stream, predict)

    def updateDB(answers):
        # Go through `update` (not model.update) so that the pattern
        # matcher also learns from the answers when patterns are in use.
        update(answers)

    def on_exit(ctrl=None):
        # The controller is optional so the callback works both when
        # Prodigy passes it and when on_exit() is called directly.
        print("on_exit")
        return model

    return {
        'view_id': 'classification',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'update': updateDB,
        'on_exit': on_exit,
        'config': {'labels': model.labels, 'batch_size': 1},
    }
def batch_train(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=10, batch_size=10,
                eval_id=None, eval_split=None, long_text=False,
                silent=False, shuffle=False):
    """
    Batch train a text classification model from annotations.

    The model is trained for `n_iter` passes; after each pass it is
    evaluated (if evaluation examples exist) and the best-scoring state is
    remembered. When `output_model` is given, that best state is exported
    via `export_model_data` together with the training and evaluation
    examples. Evaluation data comes either from the dataset named by
    `eval_id` or from `split_evals`.

    Returns the best accuracy seen (0 if no evaluation ran).
    """
    print("batch_size", batch_size)
    print(factor, type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)

    if input_model is None:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim=300)
        # Initialise the embedding layer from the pipeline's word vectors.
        vectors = torch.from_numpy(nlp.vocab.vectors.data)
        pt_model.embeds.weight.data.copy_(vectors)
        wrapped = PyTorchWrapper(pt_model)
        nlp.add_pipe(TextCategorizer(nlp.vocab, wrapped))
    else:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))

    examples = DB.get_dataset(dataset)
    labels = sorted({eg['label'] for eg in examples})
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)

    if shuffle:
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")

    if eval_id:
        evals = DB.get_dataset(eval_id)
        loaded_msg = "Loaded {} evaluation examples from '{}'"
        print_(loaded_msg.format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        split_msg = "Using {}% of examples ({}) for evaluation"
        print_(split_msg.format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)

    keep = int(len(examples) * factor)
    examples = examples[:keep]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    has_evals = len(evals) > 0
    if has_evals:
        print_(printers.tc_update_header())

    best_acc = {'accuracy': 0}
    best_model = None
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))

    for epoch in range(n_iter):
        loss = 0.
        # Examples are reshuffled every pass, regardless of `shuffle`.
        random.shuffle(examples)
        progress = tqdm.tqdm(examples, leave=False)
        for chunk in cytoolz.partition_all(batch_size, progress):
            loss += model.update(list(chunk), revise=False, drop=dropout)
        if not has_evals:
            continue
        # Evaluate with the optimizer's averaged parameters and snapshot
        # the pipeline whenever the accuracy improves.
        with nlp.use_params(model.optimizer.averages):
            acc = model.evaluate(tqdm.tqdm(evals, leave=False))
            if acc['accuracy'] > best_acc['accuracy']:
                best_acc = dict(acc)
                best_model = nlp.to_bytes()
        print_(printers.tc_update(epoch, loss, acc))

    if has_evals:
        print_(printers.tc_result(best_acc))
    if output_model is not None:
        if best_model is not None:
            nlp = nlp.from_bytes(best_model)
        print_(export_model_data(output_model, nlp, examples, evals))
    return best_acc['accuracy']
all_labels.add(eg["label"]) textcat = nlp.create_pipe("textcat") for label in all_labels: textcat.add_label(label) textcat.begin_training() nlp.add_pipe(textcat) random.shuffle(examples) train_examples, evals, eval_split = split_evals( examples, eval_split) st.success( f"✅ Using **{len(train_examples)}** training examples " f"and **{len(evals)}** evaluation examples with " f"**{len(all_labels)}** label(s)") annot_model = TextClassifier( nlp, all_labels, low_data=len(train_examples) < 1000, exclusive_classes=exclusive, ) progress = st.progress(0) results = [] result_table = st.empty() best_acc = 0.0 for i in range(n_iter): loss = 0.0 random.shuffle(train_examples) for batch in minibatch(train_examples, size=10): batch = list(batch) loss += annot_model.update(batch, revise=False, drop=dropout) with nlp.use_params(annot_model.optimizer.averages):
def teach(dataset, spacy_model, source=None, label='', api=None, loader=None,
          seeds=None, long_text=False, exclude=None):
    """
    Model-in-the-loop annotation recipe with attention data attached.

    The stream is scored by a `TextClassifier` and passed through
    `prefer_uncertain`; examples found via seed terms are put in front of
    it when `seeds` is given. Every item is finally decorated with
    attention data taken from the pipeline's 'textcat' component.

    dataset: name of the dataset annotations are saved to.
    spacy_model: name or path of the spaCy model to load.
    source, api, loader: forwarded to `get_stream`.
    label: comma-separated label string.
    seeds: seed terms, or the name of a dataset to take them from.
    long_text: forwarded to the text classifier.
    exclude: dataset names to exclude.

    Returns the dictionary of recipe components.
    """
    log('RECIPE: Starting recipe attncat.teach', locals())
    DB = connect()
    nlp = spacy.load(spacy_model)
    log('RECIPE: Creating TextClassifier with model {}'.format(spacy_model))
    label_list = label.split(',')
    model = TextClassifier(nlp, label_list, long_text=long_text)
    stream = get_stream(source, api, loader, rehash=True, dedup=True,
                        input_key='text')

    # The attention weights are read off the text categorizer component.
    textcat = nlp.get_pipe('textcat')
    assert textcat is not None

    with get_attention_weights(textcat) as attn_weights:
        if seeds is not None:
            # `seeds` is either the name of a stored dataset or raw terms.
            from_dataset = isinstance(seeds, str) and seeds in DB
            if from_dataset:
                seeds = get_seeds_from_set(seeds, DB.get_dataset(seeds))
            else:
                seeds = get_seeds(seeds)
            # Collect the examples that mention seed terms.
            seeded = find_with_terms(stream, seeds, at_least=10,
                                     at_most=1000, give_up_after=10000)
            examples_with_seeds = list(seeded)
            for task in examples_with_seeds:
                task.setdefault('meta', {})['via_seed'] = True
            found_msg = "Found {} examples with seeds"
            print(found_msg.format(len(examples_with_seeds)))
            # Run them through the model, keeping only the tasks.
            examples_with_seeds = [
                scored[1] for scored in model(examples_with_seeds)
            ]

        # Ranking is continuous because model() is a generator: calls to
        # model.update() change the ordering of later examples.
        stream = prefer_uncertain(model(stream))

        if seeds:
            log("RECIPE: Prepending examples with seeds to the stream")
            stream = cytoolz.concat((examples_with_seeds, stream))

        # Attach the attention data to each item of the stream.
        stream = attach_attention_data(stream, nlp, attn_weights)

        config = {
            'lang': nlp.lang,
            'labels': model.labels,
            'html_template': template_text,
        }
        return {
            'view_id': 'html',
            'dataset': dataset,
            'stream': stream,
            'exclude': exclude,
            'update': model.update,
            'config': config,
        }
def textcat_custom(dataset, spacy_model, source=None, label='', api=None,
                   patterns=None, loader=None, long_text=False, exclude=None):
    """
    Active-learning annotation recipe backed by a custom FastText model.

    The stream read from `source` is passed through `test_stream` together
    with the model's prediction function, and every batch of answers is
    fed back to the model.

    dataset: name of the dataset annotations are saved to.
    spacy_model: a spaCy model name/path (str), an already constructed
        model object that is used as-is, or None to build the custom
        `Prodigy_model_cpu` model.
    source: forwarded to `get_stream`.
    label, long_text: only used when `spacy_model` is a string.
    exclude: dataset names to exclude.
    api, loader: accepted for signature compatibility, not used.
    patterns: accepted for signature compatibility; this recipe has no
        pattern matcher, so the value is ignored.

    Returns the dictionary of recipe components. Its 'on_exit' callback
    returns the model, with or without the controller argument.
    """
    if spacy_model is None:
        print("build your customized model")
        pt_model = FastText(vocab_size=50966, emb_dim=300)
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        vectorizer_path = "/liveperson/data/alloy/prodigy/data/newsgroup_all.jsonl"
        model = Prodigy_model_cpu(pt_model, vectorizer_path, None,
                                  label_size=1, optimizer=optimizer,
                                  loss=criterion)
    elif isinstance(spacy_model, str):
        print("Load model ", spacy_model)
        nlp = spacy.load(spacy_model, disable=['ner', 'parser'])
        model = TextClassifier(nlp, label, long_text=long_text)
    else:
        # A ready-made model object was handed in; use it unchanged.
        model = spacy_model

    stream = get_stream(source, input_key='text')

    # Bound unconditionally: previously `predict` only existed when
    # `patterns` was None, so passing patterns raised a NameError.
    # Fall back to calling the model itself when it has no `predict`
    # method (the sibling textcat_al recipe uses TextClassifier that way).
    predict = getattr(model, 'predict', model)
    stream = test_stream(stream, predict)

    def updateDB(answers):
        model.update(answers)

    def on_exit(ctrl=None):
        # The controller is optional so the callback works both when
        # Prodigy passes it and when on_exit() is called directly.
        print("on_exit")
        return model

    return {
        'view_id': 'classification',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'update': updateDB,
        'on_exit': on_exit,
        'config': {
            'labels': ['POSITIVE', 'NEGATIVE'],
            'batch_size': 32,
        },
    }
def textcat_log(dataset, spacy_model, source=None, label='', api=None,
                patterns=None, loader=None, long_text=False, exclude=None):
    """
    Active-learning annotation recipe backed by a logistic SGD classifier.

    The stream read from `source` is passed through `test_stream` together
    with the model's prediction function, and every batch of answers is
    fed back to the model.

    dataset: name of the dataset annotations are saved to.
    spacy_model: a spaCy model name/path (str), an already constructed
        model object that is used as-is, or None to build the custom
        `Prodigy_log_cpu` model.
    source: forwarded to `get_stream`.
    label, long_text: only used when `spacy_model` is a string.
    exclude: dataset names to exclude.
    api, loader: accepted for signature compatibility, not used.
    patterns: accepted for signature compatibility; this recipe has no
        pattern matcher, so the value is ignored.

    Returns the dictionary of recipe components. Its 'on_exit' callback
    returns the model, with or without the controller argument.
    """
    if spacy_model is None:
        print("build your customized model,log")
        # NOTE(review): newer scikit-learn releases spell this loss
        # "log_loss"; keep "log" only while the pinned version accepts it.
        pt_model = linear_model.SGDClassifier(loss="log")
        example_path = "/liveperson/data/alloy/prodigy/data/newsgroup_example.jsonl"
        vectorizer_path = "/liveperson/data/alloy/prodigy/data/newsgroup_all.jsonl"
        model = Prodigy_log_cpu(pt_model, 1, vectorizer_path, example_path)
    elif isinstance(spacy_model, str):
        print("Load model ", spacy_model)
        nlp = spacy.load(spacy_model, disable=['ner', 'parser'])
        model = TextClassifier(nlp, label, long_text=long_text)
    else:
        # A ready-made model object was handed in; use it unchanged.
        model = spacy_model

    stream = get_stream(source, input_key='text')

    # Bound unconditionally: previously `predict` only existed when
    # `patterns` was None, so passing patterns raised a NameError.
    # Fall back to calling the model itself when it has no `predict`
    # method (the sibling textcat_al recipe uses TextClassifier that way).
    predict = getattr(model, 'predict', model)
    stream = test_stream(stream, predict)

    def updateDB(answers):
        model.update(answers)

    def on_exit(ctrl=None):
        # The controller is optional so the callback works both when
        # Prodigy passes it and when on_exit() is called directly.
        print("on_exit")
        return model

    return {
        'view_id': 'classification',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'update': updateDB,
        'on_exit': on_exit,
        'config': {
            'labels': ['ORDER_STATUS'],
            'batch_size': 32,
        },
    }