def textcategorizerpredict():
    # Load a previously trained text categorizer from disk, append it to the
    # pipeline, and classify the text passed in the query string.
    textcat = TextCategorizer(nlp.vocab)
    textcat.from_disk('./textcat_try')
    nlp.add_pipe(textcat, last=True)
    doc = nlp(request.query.text)
    print(doc.cats)
    labels = []  # renamed from `list` to avoid shadowing the built-in
    for key in doc.cats:
        print(key)
        labels.append({'Label': key, 'Confidence': doc.cats[key]})
    print(labels)
    return {'Labels': labels}
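# A hedged sketch of wiring up the handler above: it reads request.query.text,
# which matches Bottle's request API, so registering it as a GET route might
# look like this (the framework choice and route path are assumptions, not
# part of the original snippet):
from bottle import request, route, run

route('/predict', callback=textcategorizerpredict)
# run(host='localhost', port=8080)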
def create_pipeline():
    nlp = spacy.blank('en')
    print("Start training")
    textcat = TextCategorizer(
        nlp.vocab,
        labels=['1', '2', '3', '4', '5'],
        architecture="simple_cnn",
    )
    nlp.add_pipe(textcat)
    return nlp
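# A minimal usage sketch for the factory above, assuming spaCy v2.x (where an
# unbuilt textcat model is allocated by begin_training); the sample text is
# illustrative and scores are only meaningful once the model is trained:
nlp = create_pipeline()
optimizer = nlp.begin_training()
doc = nlp("The battery lasts forever and setup was painless.")
print(doc.cats)  # scores for the labels '1'..'5'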
def test_textcat_learns_multilabel():
    random.seed(5)
    numpy.random.seed(5)
    docs = []
    nlp = Language()
    letters = ["a", "b", "c"]
    for w1 in letters:
        for w2 in letters:
            cats = {letter: float(w2 == letter) for letter in letters}
            docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
    random.shuffle(docs)
    textcat = TextCategorizer(nlp.vocab, width=8)
    for letter in letters:
        textcat.add_label(letter)
    optimizer = textcat.initialize(lambda: [])
    for i in range(30):
        losses = {}
        # Unpack as (doc, cats) so the annotations dict references each pair's
        # own labels; the original unpacked `cat` but referenced `cats`.
        examples = [Example.from_dict(doc, {"cats": cats}) for doc, cats in docs]
        textcat.update(examples, sgd=optimizer, losses=losses)
        random.shuffle(docs)
    for w1 in letters:
        for w2 in letters:
            doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
            truth = {letter: w2 == letter for letter in letters}
            textcat(doc)
            for cat, score in doc.cats.items():
                if not truth[cat]:
                    assert score < 0.5
                else:
                    assert score > 0.5
def test_textcat_learns_multilabel():
    random.seed(5)
    numpy.random.seed(5)
    docs = []
    nlp = Language()
    letters = ["a", "b", "c"]
    for w1 in letters:
        for w2 in letters:
            cats = {letter: float(w2 == letter) for letter in letters}
            docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
    random.shuffle(docs)
    model = TextCategorizer(nlp.vocab, width=8)
    for letter in letters:
        model.add_label(letter)
    optimizer = model.begin_training()
    for i in range(30):
        losses = {}
        Ys = [GoldParse(doc, cats=cats) for doc, cats in docs]
        Xs = [doc for doc, cats in docs]
        model.update(Xs, Ys, sgd=optimizer, losses=losses)
        random.shuffle(docs)
    for w1 in letters:
        for w2 in letters:
            doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
            truth = {letter: w2 == letter for letter in letters}
            model(doc)
            for cat, score in doc.cats.items():
                if not truth[cat]:
                    assert score < 0.5
                else:
                    assert score > 0.5
def create_textcat(nlp_model=None, model_dir=None, textcat_filename=None, custom_labels=None):
    # Add textcat to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if textcat_filename:
        print('[INFO] load existing textcat from ', os.path.join(model_dir, textcat_filename))
        textcat = TextCategorizer(nlp_model.vocab)
        textcat.from_disk(os.path.join(model_dir, textcat_filename))
        nlp_model.add_pipe(textcat, last=True)
    elif 'textcat' not in nlp_model.pipe_names:
        print('[INFO] create textcat from scratch')
        textcat = nlp_model.create_pipe('textcat')
        nlp_model.add_pipe(textcat, last=True)
        # add labels to the text classifier
        for custom_label in list(custom_labels):
            textcat.add_label(custom_label)
    else:
        # otherwise, get it, so we can add labels to it
        print('[INFO] load existing textcat from model-directory')
        textcat = nlp_model.get_pipe('textcat')
    return textcat
def create_pipeline(width, embed_size, vectors_model):
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    textcat = TextCategorizer(
        nlp.vocab,
        labels=["POSITIVE", "NEGATIVE"],
        model=build_textcat_model(Tok2Vec(width=width, embed_size=embed_size), 2, width),
    )
    nlp.add_pipe(textcat)
    return nlp
def create_pipeline(lang, width, embed_size, vectors):
    # Start from a blank pipeline unless a vectors model was given; the
    # original condition was inverted, loading vectors only when none were
    # supplied.
    if not vectors:
        nlp = spacy.blank(lang)
    else:
        print("Load vectors", vectors)
        nlp = spacy.load(vectors)
    print("Start training")
    tok2vec = Tok2Vec(width=width, embed_size=embed_size, pretrained_vectors=vectors)
    textcat = TextCategorizer(
        nlp.vocab,
        labels=["POSITIVE", "NEGATIVE"],
        model=build_textcat_model(tok2vec, 2, width),
    )
    nlp.add_pipe(textcat)
    return nlp
def create_pipeline(lang, width, embed_size, vectors):
    if vectors is None:
        nlp = spacy.blank(lang)
    else:
        print("Load vectors", vectors)
        nlp = spacy.load(vectors)
    print("Start training")
    tok2vec = Tok2Vec(width=width, embed_size=embed_size)
    textcat = TextCategorizer(
        nlp.vocab,
        labels=['1', '2', '3', '4'],
        model=build_textcat_model(tok2vec, 4, width),
    )
    nlp.add_pipe(textcat)
    return nlp
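# A minimal sketch of driving one of these factories; the width and
# embed_size values are illustrative assumptions, not values from the
# original code, and the training entry point assumes the same v2-era API
# the factories themselves use:
nlp = create_pipeline("en", width=64, embed_size=2000, vectors=None)
optimizer = nlp.begin_training()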
def test_bytes_serialize_issue_1105():
    nlp = spacy.lang.en.English()
    tokenizer = nlp.tokenizer
    textcat = TextCategorizer(tokenizer.vocab, labels=['ENTITY', 'ACTION', 'MODIFIER'])
    textcat_bytes = textcat.to_bytes()
import spacy
import json
import spacy.attrs
from spacy.pipeline import TextCategorizer
from collections import Counter
import re

nlp = spacy.load("en_core_web_sm")
textcat = TextCategorizer(nlp.vocab)


def main():
    with open('papers.json', 'r') as f:
        data = json.loads(f.read())
    text = ' '.join([x["abstract_text"] for x in data])
    print(text)
    doc = nlp(text)
    # processed = textcat(doc)
    # print(processed)
    # doc.count_by(spacy.attrs.IDS['ORG'])
    # counts_dict = doc.ents.count_by(spacy.attrs.IDS['POS'])
    # Print the human readable part of speech tags
    # for pos, count in counts_dict.items():
    #     human_readable_tag = doc.vocab[pos].text
    #     print(human_readable_tag, count)
    cnt = Counter()
    cnt_2 = Counter()
    ents = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    phrases = [chunk.text for chunk in doc.noun_chunks]
def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
    textcat.to_bytes()
textcat_file = None
current_time = strftime("%Y%m%d-%H%M%S", gmtime())
np.random.seed(random_seed)
print('Current time: ', current_time)

# Load spacy nlp model
print('Loading model...')
nlp_model = en_core_web_sm.load()
print('Configuring model')

# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if textcat_file:
    print(' load existing textcat from file', textcat_file)
    textcat = TextCategorizer(nlp_model.vocab)
    textcat.from_disk(os.path.join(model_dir, textcat_file))
    nlp_model.add_pipe(textcat, last=True)
else:
    if 'textcat' not in nlp_model.pipe_names:
        print(' create textcat from scratch')
        textcat = nlp_model.create_pipe('textcat')
        nlp_model.add_pipe(textcat, last=True)
        # add labels to the text classifier
        for custom_label in [
            "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
        ]:  # Enter custom labels here <---------------------
            textcat.add_label(custom_label)
    # otherwise, get it, so we can add labels to it
def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    textcat = TextCategorizer(en_vocab, model, threshold=0.5)
    textcat.to_bytes(exclude=["vocab"])
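# A hedged companion sketch, not from the original tests: round-trip the
# v3-style categorizer above through to_bytes/from_bytes and check that the
# serialized payloads match. Assumes the same DEFAULT_SINGLE_TEXTCAT_MODEL
# config and that an uninitialized model can absorb the serialized state.
def test_serialize_textcat_roundtrip(en_vocab):
    cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    textcat = TextCategorizer(en_vocab, model, threshold=0.5)
    data = textcat.to_bytes(exclude=["vocab"])
    # Restore into a second, freshly constructed categorizer.
    model2 = registry.resolve(cfg, validate=True)["model"]
    textcat2 = TextCategorizer(en_vocab, model2, threshold=0.5)
    textcat2.from_bytes(data)
    assert textcat2.to_bytes(exclude=["vocab"]) == data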
def batch_train(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=10, batch_size=10,
                eval_id=None, eval_split=None, long_text=False,
                silent=False, shuffle=False):
    """
    Batch train a new text classification model from annotations. Prodigy
    will export the best result to the output directory, and include a
    JSONL file of the training and evaluation examples. You can either
    supply a dataset ID containing the evaluation data, or choose to split
    off a percentage of examples for evaluation.
    """
    # log("RECIPE: Starting recipe textcat.batch-train", locals())
    print("batch_size", batch_size)
    print(factor, type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim=300)
        pt_model.embeds.weight.data.copy_(torch.from_numpy(nlp.vocab.vectors.data))
        model = PyTorchWrapper(pt_model)
        textcat = TextCategorizer(nlp.vocab, model)
        nlp.add_pipe(textcat)
    # pt_model = LSTMSentiment(embedding_dim=100, hidden_dim=100,
    #                          vocab_size=259136, label_size=2,
    #                          batch_size=3, dropout=0.5)
    # model = PyTorchWrapper(pt_model)
    # nlp = spacy.load('/home/ysun/pytorchprodigy/')
    # textcat = TextCategorizer(nlp.vocab, model)
    # nlp.add_pipe(textcat)
    examples = DB.get_dataset(dataset)
    labels = {eg['label'] for eg in examples}
    labels = list(sorted(labels))
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)
    # log('RECIPE: Initialised TextClassifier with model {}'
    #     .format(input_model), model.nlp.meta)
    if shuffle:
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor, len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))
    for i in range(n_iter):
        loss = 0.
        random.shuffle(examples)
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            batch = list(batch)
            loss += model.update(batch, revise=False, drop=dropout)
        if len(evals) > 0:
            with nlp.use_params(model.optimizer.averages):
                acc = model.evaluate(tqdm.tqdm(evals, leave=False))
                if acc['accuracy'] > best_acc['accuracy']:
                    best_acc = dict(acc)
                    best_model = nlp.to_bytes()
            print_(printers.tc_update(i, loss, acc))
    if len(evals) > 0:
        print_(printers.tc_result(best_acc))
    if output_model is not None:
        if best_model is not None:
            nlp = nlp.from_bytes(best_model)
        msg = export_model_data(output_model, nlp, examples, evals)
        print_(msg)
    return best_acc['accuracy']
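# Hypothetical invocation of the recipe function above; "my_dataset" and the
# output path are made-up placeholders, and a Prodigy database containing
# that dataset is assumed to exist:
best_accuracy = batch_train(
    "my_dataset",
    input_model=None,
    output_model="./textcat_model",
    n_iter=10,
    batch_size=10,
    eval_split=0.2,
    shuffle=True,
)
print("Best accuracy:", best_accuracy)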
def test_serialize_textcat_empty(en_vocab):
    # See issue #1105; this variant excludes the shared vocab from the
    # serialized payload.
    textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
    textcat.to_bytes(exclude=["vocab"])