Example #1
def textcategorizerpredict():
    # NOTE: `nlp` and `request` are assumed to come from the enclosing web app
    # (e.g. a Bottle-style route handler).
    textcat = TextCategorizer(nlp.vocab)
    textcat.from_disk('./textcat_try')
    nlp.add_pipe(textcat, last=True)
    doc = nlp(request.query.text)
    print(doc.cats)
    results = []
    for key in doc.cats:
        print(key)
        results.append({'Label': key, 'Confidence': doc.cats[key]})
    print(results)
    return {'Labels': results}
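The handler above reads request.query.text, which suggests a Bottle-style web app; below is a hedged sketch of how such an endpoint might be wired up. The route path, port, and model name are assumptions, not part of the original example.

# Hypothetical wiring for the handler above, assuming a Bottle-style app.
# The route path, port, and model name are illustrative assumptions.
import spacy
from bottle import Bottle, request

app = Bottle()
nlp = spacy.load('en_core_web_sm')  # module-level nlp the handler relies on

app.route('/predict', method='GET', callback=textcategorizerpredict)
# GET /predict?text=some+document  ->  {'Labels': [{'Label': ..., 'Confidence': ...}]}

if __name__ == '__main__':
    app.run(host='localhost', port=8080)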
Example #2
def create_pipeline():
    nlp = spacy.blank('en')
    print("Start training")
    textcat = TextCategorizer(nlp.vocab,
                              labels=['1', '2', '3', '4', '5'],
                              architecture="simple_cnn")
    nlp.add_pipe(textcat)
    return nlp
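For context, here is a hedged sketch of how a pipeline returned by create_pipeline() above might be trained with the spaCy v2-style nlp.update() API. The review texts and star-rating annotations are invented for illustration.

# Rough training-loop sketch (spaCy v2-style API); texts and "cats" values are
# made up for illustration only.
nlp = create_pipeline()
optimizer = nlp.begin_training()
train_data = [
    ("terrible, would not buy again",
     {"cats": {"1": 1.0, "2": 0.0, "3": 0.0, "4": 0.0, "5": 0.0}}),
    ("absolutely loved it",
     {"cats": {"1": 0.0, "2": 0.0, "3": 0.0, "4": 0.0, "5": 1.0}}),
]
for i in range(10):
    losses = {}
    for text, annotations in train_data:
        nlp.update([text], [annotations], sgd=optimizer, drop=0.2, losses=losses)
    print(i, losses)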
Example #3
def test_textcat_learns_multilabel():
    random.seed(5)
    numpy.random.seed(5)
    docs = []
    nlp = Language()
    letters = ["a", "b", "c"]
    for w1 in letters:
        for w2 in letters:
            cats = {letter: float(w2 == letter) for letter in letters}
            docs.append((Doc(nlp.vocab,
                             words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
    random.shuffle(docs)
    textcat = TextCategorizer(nlp.vocab, width=8)
    for letter in letters:
        textcat.add_label(letter)
    optimizer = textcat.initialize(lambda: [])
    for i in range(30):
        losses = {}
        examples = [
            Example.from_dict(doc, {"cats": cats}) for doc, cats in docs
        ]
        textcat.update(examples, sgd=optimizer, losses=losses)
        random.shuffle(docs)
    for w1 in letters:
        for w2 in letters:
            doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
            truth = {letter: w2 == letter for letter in letters}
            textcat(doc)
            for cat, score in doc.cats.items():
                if not truth[cat]:
                    assert score < 0.5
                else:
                    assert score > 0.5
Example #4
def test_textcat_learns_multilabel():
    random.seed(5)
    numpy.random.seed(5)
    docs = []
    nlp = Language()
    letters = ["a", "b", "c"]
    for w1 in letters:
        for w2 in letters:
            cats = {letter: float(w2 == letter) for letter in letters}
            docs.append((Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3), cats))
    random.shuffle(docs)
    model = TextCategorizer(nlp.vocab, width=8)
    for letter in letters:
        model.add_label(letter)
    optimizer = model.begin_training()
    for i in range(30):
        losses = {}
        Ys = [GoldParse(doc, cats=cats) for doc, cats in docs]
        Xs = [doc for doc, cats in docs]
        model.update(Xs, Ys, sgd=optimizer, losses=losses)
        random.shuffle(docs)
    for w1 in letters:
        for w2 in letters:
            doc = Doc(nlp.vocab, words=["d"] * 3 + [w1, w2] + ["d"] * 3)
            truth = {letter: w2 == letter for letter in letters}
            model(doc)
            for cat, score in doc.cats.items():
                if not truth[cat]:
                    assert score < 0.5
                else:
                    assert score > 0.5
Example #6
def create_textcat(nlp_model=None,
                   model_dir=None,
                   textcat_filename=None,
                   custom_labels=None):
    # Add textcat to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy

    if textcat_filename:
        print('[INFO]  load existing textcat from ',
              os.path.join(model_dir, textcat_filename))
        textcat = TextCategorizer(nlp_model.vocab)
        textcat.from_disk(os.path.join(model_dir, textcat_filename))
        nlp_model.add_pipe(textcat, last=True)
    else:
        if 'textcat' not in nlp_model.pipe_names:
            print('[INFO]  create textcat from scratch')
            textcat = nlp_model.create_pipe('textcat')
            nlp_model.add_pipe(textcat, last=True)
            # add label to text classifier
            for custom_label in list(custom_labels):
                textcat.add_label(custom_label)
        # otherwise, get it, so we can add labels to it
        else:
            print('[INFO] load existing textcat from model-directory')
            textcat = nlp_model.get_pipe('textcat')
    return textcat
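A short hypothetical call showing both paths through create_textcat() above; the model name, labels, and paths are placeholders.

# Hypothetical usage of create_textcat(); model name, labels, and paths are placeholders.
nlp = spacy.load('en_core_web_sm')

# Path 1: build a fresh classifier with custom labels.
textcat = create_textcat(nlp_model=nlp, custom_labels=['POSITIVE', 'NEGATIVE'])

# Path 2: reload a classifier previously saved to disk.
# textcat = create_textcat(nlp_model=nlp, model_dir='./models', textcat_filename='textcat')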
Example #7
def create_pipeline(width, embed_size, vectors_model):
    print("Load vectors")
    nlp = spacy.load(vectors_model)
    print("Start training")
    textcat = TextCategorizer(
        nlp.vocab,
        labels=["POSITIVE", "NEGATIVE"],
        model=build_textcat_model(Tok2Vec(width=width, embed_size=embed_size),
                                  2, width),
    )

    nlp.add_pipe(textcat)
    return nlp
Example #8
def create_pipeline(lang, width, embed_size, vectors):
    # Start from a blank pipeline when no pretrained vectors are given;
    # otherwise load the vectors model.
    if not vectors:
        nlp = spacy.blank(lang)
    else:
        print("Load vectors", vectors)
        nlp = spacy.load(vectors)
    print("Start training")
    tok2vec = Tok2Vec(width=width,
                      embed_size=embed_size,
                      pretrained_vectors=vectors)
    textcat = TextCategorizer(
        nlp.vocab,
        labels=["POSITIVE", "NEGATIVE"],
        model=build_textcat_model(tok2vec, 2, width),
    )
    nlp.add_pipe(textcat)
    return nlp
Example #9
def create_pipeline(lang, width, embed_size, vectors):
    if vectors is None:
        nlp = spacy.blank(lang)
    else:
        print("Load vectors", vectors)
        nlp = spacy.load(vectors)
    print("Start training")
    tok2vec = Tok2Vec(
        width=width,
        embed_size=embed_size,
    )
    textcat = TextCategorizer(
        nlp.vocab,
        labels=['1', '2', '3', '4'],
        model=build_textcat_model(tok2vec, 4, width),
    )
    nlp.add_pipe(textcat)
    return nlp
Example #10
def test_bytes_serialize_issue_1105():
    nlp = spacy.lang.en.English()
    tokenizer = nlp.tokenizer
    textcat = TextCategorizer(tokenizer.vocab,
                              labels=['ENTITY', 'ACTION', 'MODIFIER'])
    textcat_bytes = textcat.to_bytes()
Example #11
import spacy
import json
import spacy.attrs
from spacy.pipeline import TextCategorizer
from collections import Counter
import re

nlp = spacy.load("en_core_web_sm")
textcat = TextCategorizer(nlp.vocab)

def main():
    with open('papers.json', 'r') as f:
        data = json.loads(f.read())
    text = ' '.join([x["abstract_text"] for x in data])
    print(text)
    doc = nlp(text)
    # processed = textcat(doc)
    # print(processed)
    
    # doc.count_by(spacy.attrs.IDS['ORG'])

    # counts_dict = doc.ents.count_by(spacy.attrs.IDS['POS'])

    # Print the human readable part of speech tags
    # for pos, count in counts_dict.items():
    #     human_readable_tag = doc.vocab[pos].text
    #     print(human_readable_tag, count)
    cnt = Counter()
    cnt_2 = Counter()
    ents = [ent.text for ent in doc.ents if ent.label_ == "ORG"]
    phrases = [chunk.text for chunk in doc.noun_chunks]
Example #12
def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    textcat = TextCategorizer(en_vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
    textcat.to_bytes()
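As a follow-up to the serialization test above, a minimal round-trip sketch (assuming the same spaCy v2-style constructor) showing the bytes being loaded back with from_bytes(); the label set is taken from the test.

# Minimal round-trip sketch for the v2-style API used above.
from spacy.pipeline import TextCategorizer
from spacy.vocab import Vocab

vocab = Vocab()
textcat = TextCategorizer(vocab, labels=["ENTITY", "ACTION", "MODIFIER"])
data = textcat.to_bytes()

restored = TextCategorizer(vocab)
restored.from_bytes(data)   # config, including the labels, comes back from the bytes
print(restored.labels)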
Example #14
    textcat_file = None
    current_time = strftime("%Y%m%d-%H%M%S", gmtime())

    np.random.seed(random_seed)
    print('Current time: ', current_time)
    # Load spacy nlp model
    print('Loading model...')
    nlp_model = en_core_web_sm.load()

    print('Configuring model')
    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy

    if textcat_file:
        print('  load existing textcat from file', textcat_file)
        textcat = TextCategorizer(nlp_model.vocab)
        textcat.from_disk(os.path.join(model_dir, textcat_file))
        nlp_model.add_pipe(textcat, last=True)
    else:
        if 'textcat' not in nlp_model.pipe_names:
            print('  create textcat from scratch')
            textcat = nlp_model.create_pipe('textcat')
            nlp_model.add_pipe(textcat, last=True)
            # add label to text classifier
            for custom_label in [
                    "toxic", "severe_toxic", "obscene", "threat", "insult",
                    "identity_hate"
            ]:  # Enter custom labels here <---------------------
                textcat.add_label(custom_label)

        # otherwise, get it, so we can add labels to it
Example #15
def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    cfg = {"model": DEFAULT_SINGLE_TEXTCAT_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    textcat = TextCategorizer(en_vocab, model, threshold=0.5)
    textcat.to_bytes(exclude=["vocab"])
Example #16
def batch_train(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=10, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,
                shuffle=False):
    """
    Batch train a new text classification model from annotations. Prodigy will
    export the best result to the output directory, and include a JSONL file of
    the training and evaluation examples. You can either supply a dataset ID
    containing the evaluation data, or choose to split off a percentage of
    examples for evaluation.
    """
    #log("RECIPE: Starting recipe textcat.batch-train", locals())
    print("batch_size",batch_size)
    print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim=300)
        pt_model.embeds.weight.data.copy_(torch.from_numpy(nlp.vocab.vectors.data))
        model = PyTorchWrapper(pt_model)
        textcat = TextCategorizer(nlp.vocab, model)
        nlp.add_pipe(textcat)

        #pt_model = LSTMSentiment(embedding_dim = 100, hidden_dim =100, vocab_size=259136, label_size=2, batch_size=3, dropout=0.5)
        #model = PyTorchWrapper(pt_model)
        #nlp = spacy.load('/home/ysun/pytorchprodigy/')
        #textcat = TextCategorizer(nlp.vocab,model)
        #nlp.add_pipe(textcat)
    examples = DB.get_dataset(dataset)
    labels = {eg['label'] for eg in examples}
    labels = list(sorted(labels))
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)
    #log('RECIPE: Initialised TextClassifier with model {}'
    #    .format(input_model), model.nlp.meta)
    if shuffle:    
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))
    for i in range(n_iter):
        loss = 0.
        random.shuffle(examples)
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            batch = list(batch)
            loss += model.update(batch, revise=False, drop=dropout)
        if len(evals) > 0:
            with nlp.use_params(model.optimizer.averages):
                acc = model.evaluate(tqdm.tqdm(evals, leave=False))
                if acc['accuracy'] > best_acc['accuracy']:
                    best_acc = dict(acc)
                    best_model = nlp.to_bytes()
            print_(printers.tc_update(i, loss, acc))
    if len(evals) > 0:
        print_(printers.tc_result(best_acc))
    if output_model is not None:
        if best_model is not None:
            nlp = nlp.from_bytes(best_model)
        msg = export_model_data(output_model, nlp, examples, evals)
        print_(msg)
    return best_acc['accuracy']
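Finally, a hypothetical invocation of batch_train() above, assuming a Prodigy environment with an annotated dataset; the dataset name, output path, and hyperparameters are illustrative only.

# Hypothetical call; assumes Prodigy is installed and 'my_textcat_dataset' exists.
acc = batch_train('my_textcat_dataset',
                  output_model='./textcat_output',
                  n_iter=10,
                  batch_size=16,
                  eval_split=0.2,
                  shuffle=True)
print('best accuracy:', acc)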
Example #17
def test_serialize_textcat_empty(en_vocab):
    # See issue #1105
    textcat = TextCategorizer(en_vocab,
                              labels=["ENTITY", "ACTION", "MODIFIER"])
    textcat.to_bytes(exclude=["vocab"])