Python TextClassifier示例，prodigy.models.textcat.TextClassifier Python示例

示例#1

0

显示文件

文件： textcat_recipe.py 项目： ysunlp/data_science_challenge

def batch_train_custom(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False):
    """
    Train a classifier on a stored dataset in mini-batches and periodically
    report validation accuracy.

    dataset: name of the dataset loaded through ``DB.get_dataset``.
    input_model: spaCy model to load; when None a FastText-based
        ``Prodigy_model`` is built instead.
    factor: fraction of the loaded examples that is kept for training.
    dropout: only reported in the training configuration printout here.
    n_iter: number of passes over the training examples.
    batch_size: number of examples per ``model.update`` call.
    eval_id: name of the evaluation dataset; without it no evaluation runs.
    silent: forwarded to ``get_print``.
    shuffle: shuffle the examples at the start of every pass.
    output_model, lang, eval_split: accepted but not used by this function.

    Returns the result of the last ``model.evaluate`` call, or None when no
    evaluation was performed.
    """
    if n_iter == 1:
        print("one pass mode")
    print("batch_size",batch_size)
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    # Load the data first: the spaCy branch below needs both the examples
    # and the label set derived from them.
    examples = DB.get_dataset(dataset)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        labels = sorted({eg['label'] for eg in examples})
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim = 300)
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model,label_size=1,optimizer=optimizer,loss=criterion)
    # Default to "no evaluation data" so the checks below are always valid.
    evals = []
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    # Stays None unless at least one evaluation step is reached.
    acc = None
    for i in range(n_iter):
        if shuffle:
            random.shuffle(examples)
        batch_idx = 1
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            batch = list(batch)
            loss = model.update(batch)
            # Evaluate every (4 * batch_size)-th batch of the current pass.
            if len(evals) > 0 and batch_idx % (4 * batch_size) == 0:
                acc = model.evaluate(evals)
                print('Epoch: [{0}/{1}], Step: [{2}/{3}], Loss: {4}, Validation Acc:{5}'.format( 
                   i+1, n_iter, batch_idx, len(examples)//batch_size, loss, acc))
            batch_idx += 1
    return acc

示例#2

0

显示文件

文件： textcat_teach.py 项目： zlapp/prodigy-recipes

def textcat_teach(
    dataset: str,
    spacy_model: str,
    source: str,
    label: Optional[List[str]] = None,
    patterns: Optional[str] = None,
    exclude: Optional[List[str]] = None,
):
    """
    Build a model-in-the-loop annotation recipe for text classification.

    Examples are read from the JSONL file at `source`, scored by a
    TextClassifier wrapped around the spaCy model `spacy_model`, and sorted
    with `prefer_uncertain`. When `patterns` is given, a PatternMatcher loaded
    from that path is combined with the classifier. Returns the recipe
    components as a dictionary.
    """
    # Examples to annotate, read from the JSONL source.
    examples = JSONL(source)

    nlp = spacy.load(spacy_model)
    classifier = TextClassifier(nlp, label)

    if patterns is not None:
        # The matcher labels the whole task rather than highlighted spans.
        pattern_matcher = PatternMatcher(
            nlp,
            prior_correct=5.0,
            prior_incorrect=5.0,
            label_span=False,
            label_task=True,
        ).from_disk(patterns)
        # Combine the text classifier and the matcher into one predict
        # callable and one update callable.
        predict, update = combine_models(classifier, pattern_matcher)
    else:
        # No patterns: the classifier alone scores and gets updated.
        predict, update = classifier, classifier.update

    # Sort the scored examples with prefer_uncertain before serving them.
    ranked = prefer_uncertain(predict(examples))

    ui_config = {"lang": nlp.lang}
    components = {
        "view_id": "classification",
        "dataset": dataset,
        "stream": ranked,
        "update": update,
        "exclude": exclude,
        "config": ui_config,
    }
    return components

示例#3

0

显示文件

文件： textcat_recipe.py 项目： ysunlp/data_science_challenge

def batch_train_custom_cumulate(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False,gpu_id = None):
    """
    Train on cumulatively growing prefixes of a dataset and report the
    accuracy after every pass.

    The training examples are consumed in steps of 100: the first round uses
    the first 100 examples, the next one the first 200, and so on. Each round
    runs `n_iter` passes of mini-batch updates followed by an evaluation.

    dataset: name of the dataset loaded through ``DB.get_dataset``.
    input_model: spaCy model to load; when None a FastText-based
        ``Prodigy_model`` is built instead.
    factor: fraction of the loaded examples that is kept for training.
    dropout: only reported in the training configuration printout here.
    n_iter: passes per prefix.
    batch_size: number of examples per ``model.update`` call.
    eval_id: name of the evaluation dataset; without it no evaluation runs.
    silent: forwarded to ``get_print``.
    shuffle: shuffle the current prefix before every pass.
    gpu_id: when 0 and CUDA is available, CUDA_VISIBLE_DEVICES is set and
        cudnn benchmarking is enabled.
    output_model, lang, eval_split: accepted but not used by this function.

    Returns the result of the last ``model.evaluate`` call, or None when no
    evaluation was performed.
    """
    if gpu_id == 0 and torch.cuda.is_available():
        print("Using cuda")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        cudnn.benchmark = True
    if n_iter == 1:
        print("one pass mode")
    print("batch_size",batch_size)
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    # Load the data first: the spaCy branch below needs both the examples
    # and the label set derived from them.
    examples = DB.get_dataset(dataset)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        labels = sorted({eg['label'] for eg in examples})
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        # NOTE(review): .cuda() is unconditional, so this branch needs a CUDA
        # device regardless of gpu_id -- confirm that this is intended.
        pt_model = FastText(vocab_size=684831, emb_dim = 300).cuda()
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model,label_size=1,optimizer=optimizer,loss=criterion)
    # Default to "no evaluation data" so the checks below are always valid.
    evals = []
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    start_time = datetime.now()
    if len(evals) > 0:
        model.init_eval(evals)
    interval = 100
    # Stays None unless at least one evaluation is run.
    acc = None
    for fac in range(interval, len(examples) + interval, interval):
        # Slicing copies, so shuffling the prefix leaves `examples` untouched
        # and the next prefix is a true superset of this one.
        examples_fac = examples[:fac]
        batch_number = len(examples_fac) / batch_size
        for i in range(n_iter):
            if shuffle:
                print("it's shuffling")
                random.shuffle(examples_fac)
            batch_idx = 0
            loss = 0
            for batch in cytoolz.partition_all(batch_size,
                                               tqdm.tqdm(examples_fac, leave=False)):
                batch = list(batch)
                loss += model.update(batch)
                batch_idx += 1
            if len(evals) > 0:
                acc = model.evaluate(evals)
            # Wall-clock time elapsed since training started.
            end_time = datetime.now() - start_time
            print_('Time:[{0} seconds], process: [{1}/{2}], Epoch: [{3}/{4}], step: [{5}/{6}], Loss: {7},Acc:{8}'.format(
               end_time.seconds,fac, len(examples)//interval, i+1, n_iter, batch_idx+1, len(examples_fac)//batch_size, loss/batch_number, acc))
    return acc

示例#4

0

显示文件

文件： custom_template.py 项目： oneextrafact/prodigy-scratch

def custom_with_recipe_html_template():
    """
    Build the recipe components for an HTML-template annotation view.

    Reads `spacy_model`, `labels`, `example_jsonl`, `dataset` and
    `template_text` from the enclosing scope. The stream comes from the JSONL
    source with duplicates filtered by input, and the TextClassifier's
    `update` method is used as the update callback.
    """
    classifier = TextClassifier(spacy.load(spacy_model), labels,
                                long_text=False)
    tasks = filter_duplicates(JSONL(example_jsonl),
                              by_input=True, by_task=False)

    view_config = {'labels': labels, 'html_template': template_text}
    components = {
        'view_id': 'html',
        'dataset': dataset,
        'stream': tasks,
        # Exclude examples that are already stored in the target dataset.
        'exclude': [dataset],
        'update': classifier.update,
        'config': view_config,
    }
    return components

示例#5

0

显示文件

def evaluate(dataset,
             spacy_model,
             source,
             label='',
             api=None,
             loader=None,
             exclude=None):
    """
    Build an evaluation recipe whose tasks carry attention data.

    The stream from `source` is decorated with the attention weights of the
    model's 'textcat' pipe. On exit, the annotations stored in `dataset` are
    evaluated with the TextClassifier and the result is printed.
    """
    log("RECIPE: Starting recipe attncat.eval", locals())
    nlp = spacy.load(spacy_model, disable=['tagger', 'parser', 'ner'])
    # The attention weights are taken from the text categorizer pipe.
    textcat = nlp.get_pipe('textcat')
    assert textcat is not None
    with get_attention_weights(textcat) as attn_weights:
        # Load the stream and decorate its items with attention data.
        stream = attach_attention_data(get_stream(source, api, loader),
                                       nlp, attn_weights)
        model = TextClassifier(nlp, label)
        init_msg = 'RECIPE: Initialised TextClassifier with model {}'.format(
            spacy_model)
        log(init_msg, model.nlp.meta)

    def report_results(ctrl):
        # Evaluate the model on everything stored in the dataset.
        stored = ctrl.db.get_dataset(dataset)
        scores = dict(model.evaluate(stored))
        print(printers.tc_result(scores))

    view_config = {
        'lang': nlp.lang,
        'labels': model.labels,
        'html_template': template_text,
    }
    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'on_exit': report_results,
        'config': view_config,
    }

示例#6

0

显示文件

def evaluate(dataset,
             spacy_model,
             source,
             label='',
             api=None,
             loader=None,
             exclude=None):
    """
    Build an evaluation recipe whose tasks carry structural sensitivity data.

    The stream from `source` is decorated via
    `attach_structural_sensitivity_data` for the first entry of the
    comma-separated `label` string. On exit, the annotations stored in
    `dataset` are evaluated with the TextClassifier and the result is printed.
    """
    log("RECIPE: Starting recipe attncat.eval", locals())
    nlp = spacy.load(spacy_model, disable=['tagger', 'parser', 'ner'])
    # Only the first of the comma-separated labels is used for decoration.
    first_label = label.split(',')[0]
    stream = attach_structural_sensitivity_data(
        get_stream(source, api, loader), nlp, first_label)
    model = TextClassifier(nlp, label)
    init_msg = 'RECIPE: Initialised TextClassifier with model {}'.format(
        spacy_model)
    log(init_msg, model.nlp.meta)

    def report_results(ctrl):
        # Evaluate the model on everything stored in the dataset.
        stored = ctrl.db.get_dataset(dataset)
        scores = dict(model.evaluate(stored))
        print(printers.tc_result(scores))

    view_config = {
        'lang': nlp.lang,
        'labels': model.labels,
        'html_template': template_text,
    }
    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'on_exit': report_results,
        'config': view_config,
    }

示例#7

0

显示文件

文件： textcat_recipe.py 项目： ysunlp/data_science_challenge

def batch_train_increment(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False,gpu_id = None):
    """
    Train a text classification model incrementally, one batch at a time.

    Every batch is used for `n_iter` consecutive updates, and after each
    update the model is evaluated (if evaluation data exists) and a progress
    line is printed. No model is written to disk by this function.

    dataset: name of the dataset loaded through ``DB.get_dataset``.
    input_model: spaCy model to load; when None, 'en_core_web_lg' is loaded
        and a FastText-based text categorizer pipe is added to it.
    factor: fraction of the training examples that is kept.
    dropout: passed to ``model.update`` as `drop`.
    n_iter: number of consecutive updates per batch.
    batch_size: number of examples per batch.
    eval_id: name of the evaluation dataset; when missing, evaluation
        examples are split off the training data via ``split_evals``.
    eval_split: passed to ``split_evals``.
    long_text: split the examples into sentences before training.
    silent: forwarded to ``get_print``.
    shuffle: shuffle the examples before training.
    gpu_id: GPU device passed to ``spacy.util.use_gpu``; None keeps the
        default device.
    output_model, lang: accepted but not used by this function.

    Returns the result of the last evaluation, or None when no evaluation
    was performed.
    """
    # Compare against None explicitly: device id 0 is falsy but valid.
    if gpu_id is not None:
        spacy.util.use_gpu(gpu_id)
    if n_iter == 1:
        print("one pass mode")
    print("batch_size",batch_size)
    print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim = 300)
        # Initialise the embedding layer with the spaCy model's vectors.
        pt_model.embeds.weight.data.copy_(torch.from_numpy(nlp.vocab.vectors.data))
        wrapped = PyTorchWrapper(pt_model)
        textcat = Loss_TextCategorizer(nlp.vocab, wrapped)
        nlp.add_pipe(textcat)
    examples = DB.get_dataset(dataset)
    labels = sorted({eg['label'] for eg in examples})
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)
    if shuffle:    
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))
    batch_idx = 0
    # Stays None unless at least one evaluation is run.
    acc = None
    start_time = datetime.now()
    for batch in cytoolz.partition_all(batch_size,
                                       tqdm.tqdm(examples, leave=False)):
        batch = list(batch)
        for i in range(n_iter):
            loss = model.update(batch, revise=False, drop=dropout)
            if len(evals) > 0:
                # Evaluate with the optimizer's averaged parameters.
                with nlp.use_params(model.optimizer.averages):
                    acc = model.evaluate(tqdm.tqdm(evals, leave=False))
                end_time = datetime.now() - start_time
                print('Time:[{0} seconds], Epoch: [{1}/{2}], batch: [{3}/{4}], Loss:{5}, Accuracy:{6}'.format( 
                   end_time.seconds,i+1, n_iter, batch_idx+1, len(examples)//batch_size, loss, acc['accuracy']))
        # Count batches, not individual updates, so the reported batch
        # index stays within the reported total.
        batch_idx += 1
    return acc

示例#8

0

显示文件

文件： textcat_recipe.py 项目： ysunlp/data_science_challenge

def textcat_al(dataset, spacy_model,source=None, label='', api=None, patterns=None,
          loader=None, long_text=False, exclude=None):
    """
    Build an active-learning recipe for text classification with the model
    in the loop.

    dataset: name of the dataset the annotations are saved to.
    spacy_model: a spaCy model name/path to load, an already constructed
        model object that is used as is, or None to build a FastText-based
        text categorizer on top of 'en_core_web_lg'.
    source: passed to ``get_stream``.
    label: passed to ``TextClassifier`` as the labels.
    patterns: optional path of patterns loaded into a PatternMatcher that is
        combined with the model.
    long_text: passed to ``TextClassifier``.
    exclude: passed through in the returned components.
    api, loader: accepted but not used by this function.

    Returns the recipe components as a dictionary.
    """
    if spacy_model is None:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText_test(vocab_size=684831, emb_dim = 300)
        # Initialise the embedding layer with the spaCy model's vectors.
        pt_model.embeds.weight.data.copy_(torch.from_numpy(nlp.vocab.vectors.data))
        wrapped = PyTorchWrapper(pt_model)
        textcat = Loss_TextCategorizer(nlp.vocab, wrapped)
        nlp.add_pipe(textcat)
        model = TextClassifier(nlp, label, long_text=long_text)
    elif isinstance(spacy_model, str):
        print("Load model ",spacy_model)
        nlp = spacy.load(spacy_model, disable=['ner', 'parser'])
        model = TextClassifier(nlp, label, long_text=long_text)
    else:
        # Anything else is used directly as the model.
        model = spacy_model
    stream = get_stream(source,input_key = 'text')
    if patterns is None:
        predict = model
        update = model.update
    else:
        matcher = PatternMatcher(model.nlp, prior_correct=5.,
                                 prior_incorrect=5., label_span=False,
                                 label_task=True)
        matcher = matcher.from_disk(patterns)
        # Combine the textcat model with the PatternMatcher to annotate both
        # match results and predictions, and update both models.
        predict, update = combine_models(model, matcher)
    stream = test_stream(stream,predict)

    def updateDB(answers):
        # Use the selected update callback so that the PatternMatcher is
        # updated as well when patterns are in use.
        update(answers)

    def on_exit(ctrl=None):
        # The optional argument lets the callback be invoked with or
        # without a controller.
        print("on_exit")
        return model
    
    return {
        'view_id': 'classification',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'update': updateDB,
        'on_exit': on_exit,
        'config': {'labels': model.labels,'batch_size':1}
    }

示例#9

0

显示文件

文件： textcat_recipe.py 项目： ysunlp/data_science_challenge

def batch_train(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=10, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False):
    """
    Batch train a text classification model from the annotations in a dataset.

    The evaluation data comes either from the dataset `eval_id` or is split
    off the training examples via `split_evals`. The model state with the
    highest accuracy is remembered and, when `output_model` is given, passed
    to `export_model_data` together with the training and evaluation
    examples. Returns the best accuracy.
    """
    print("batch_size",batch_size)
    print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is None:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim = 300)
        # Initialise the embedding layer with the spaCy model's vectors.
        pt_model.embeds.weight.data.copy_(torch.from_numpy(nlp.vocab.vectors.data))
        wrapped = PyTorchWrapper(pt_model)
        nlp.add_pipe(TextCategorizer(nlp.vocab, wrapped))
    else:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))

    examples = DB.get_dataset(dataset)
    labels = sorted({eg['label'] for eg in examples})
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)

    if not shuffle:
        print("it's not shuffling")
    else:
        print("it's shuffling")
        random.shuffle(examples)

    if not eval_id:
        # No evaluation dataset given: split one off the training data.
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    else:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))

    if shuffle:
        random.shuffle(examples)
    keep = int(len(examples) * factor)
    examples = examples[:keep]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    has_evals = len(evals) > 0
    if has_evals:
        print_(printers.tc_update_header())

    best_acc = {'accuracy': 0}
    best_model = None
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))

    for epoch in range(n_iter):
        random.shuffle(examples)
        batches = cytoolz.partition_all(batch_size,
                                        tqdm.tqdm(examples, leave=False))
        # Accumulate the loss over all batches of this epoch.
        loss = sum((model.update(list(chunk), revise=False, drop=dropout)
                    for chunk in batches), 0.)
        if not has_evals:
            continue
        # Evaluate with the optimizer's averaged parameters and remember the
        # best-scoring model state.
        with nlp.use_params(model.optimizer.averages):
            acc = model.evaluate(tqdm.tqdm(evals, leave=False))
            if acc['accuracy'] > best_acc['accuracy']:
                best_acc = dict(acc)
                best_model = nlp.to_bytes()
        print_(printers.tc_update(epoch, loss, acc))

    if has_evals:
        print_(printers.tc_result(best_acc))
    if output_model is not None:
        if best_model is not None:
            nlp = nlp.from_bytes(best_model)
        print_(export_model_data(output_model, nlp, examples, evals))
    return best_acc['accuracy']

示例#10

0

显示文件

文件： prodigy_streamlit.py 项目： laugustyniak/discourse_summarization

         all_labels.add(eg["label"])
 textcat = nlp.create_pipe("textcat")
 for label in all_labels:
     textcat.add_label(label)
 textcat.begin_training()
 nlp.add_pipe(textcat)
 random.shuffle(examples)
 train_examples, evals, eval_split = split_evals(
     examples, eval_split)
 st.success(
     f"✅ Using **{len(train_examples)}** training examples "
     f"and **{len(evals)}** evaluation examples with "
     f"**{len(all_labels)}** label(s)")
 annot_model = TextClassifier(
     nlp,
     all_labels,
     low_data=len(train_examples) < 1000,
     exclusive_classes=exclusive,
 )
 progress = st.progress(0)
 results = []
 result_table = st.empty()
 best_acc = 0.0
 for i in range(n_iter):
     loss = 0.0
     random.shuffle(train_examples)
     for batch in minibatch(train_examples, size=10):
         batch = list(batch)
         loss += annot_model.update(batch,
                                    revise=False,
                                    drop=dropout)
     with nlp.use_params(annot_model.optimizer.averages):

示例#11

0

显示文件

def teach(dataset,
          spacy_model,
          source=None,
          label='',
          api=None,
          loader=None,
          seeds=None,
          long_text=False,
          exclude=None):
    """
    Build a model-in-the-loop annotation recipe whose tasks carry attention
    data.

    The stream is scored by a TextClassifier and sorted with
    `prefer_uncertain`. When `seeds` is given, examples containing seed terms
    are collected first and prepended to the stream. Every task is decorated
    with the attention weights of the model's 'textcat' pipe.
    """
    log('RECIPE: Starting recipe attncat.teach', locals())
    DB = connect()
    nlp = spacy.load(spacy_model)
    log('RECIPE: Creating TextClassifier with model {}'.format(spacy_model))
    model = TextClassifier(nlp, label.split(','), long_text=long_text)
    stream = get_stream(source, api, loader,
                        rehash=True, dedup=True, input_key='text')

    # The attention weights are taken from the text categorizer pipe.
    textcat = nlp.get_pipe('textcat')
    assert textcat is not None
    with get_attention_weights(textcat) as attn_weights:
        if seeds is not None:
            # `seeds` is either the name of a dataset in the database or
            # something get_seeds() can handle.
            from_dataset = isinstance(seeds, str) and seeds in DB
            seeds = (get_seeds_from_set(seeds, DB.get_dataset(seeds))
                     if from_dataset else get_seeds(seeds))
            # Collect examples that contain seed terms and mark them.
            seeded = list(find_with_terms(stream, seeds,
                                          at_least=10,
                                          at_most=1000,
                                          give_up_after=10000))
            for task in seeded:
                task.setdefault('meta', {})['via_seed'] = True
            print("Found {} examples with seeds".format(
                len(seeded)))
            seeded = [task for _, task in model(seeded)]
        # Score the stream with the model and sort it with prefer_uncertain.
        ranked = prefer_uncertain(model(stream))
        if seeds:
            # Seeded examples go first.
            log("RECIPE: Prepending examples with seeds to the stream")
            ranked = cytoolz.concat((seeded, ranked))

        # Decorate items with attention data.
        stream = attach_attention_data(ranked, nlp, attn_weights)

    view_config = {
        'lang': nlp.lang,
        'labels': model.labels,
        'html_template': template_text,
    }
    return {
        'view_id': 'html',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'update': model.update,
        'config': view_config,
    }

示例#12

0

显示文件

文件： textcat_recipe.py 项目： ysunlp/data_science_challenge

def textcat_custom(dataset,
                   spacy_model,
                   source=None,
                   label='',
                   api=None,
                   patterns=None,
                   loader=None,
                   long_text=False,
                   exclude=None):
    """
    Build an active-learning recipe around a custom FastText-based model.

    dataset: name of the dataset the annotations are saved to.
    spacy_model: a spaCy model name/path to wrap in a TextClassifier, an
        already constructed model object that is used as is, or None to
        build a ``Prodigy_model_cpu`` around a FastText network.
    source: passed to ``get_stream``.
    label: passed to ``TextClassifier`` as the labels.
    patterns: not supported by this recipe; a notice is printed and the
        value is ignored.
    long_text: passed to ``TextClassifier``.
    exclude: passed through in the returned components.
    api, loader: accepted but not used by this function.

    Returns the recipe components as a dictionary.
    """
    if spacy_model is None:
        print("build your customized model")
        pt_model = FastText(vocab_size=50966, emb_dim=300)
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        vectorizer_path = "/liveperson/data/alloy/prodigy/data/newsgroup_all.jsonl"
        model = Prodigy_model_cpu(pt_model,
                                  vectorizer_path,
                                  None,
                                  label_size=1,
                                  optimizer=optimizer,
                                  loss=criterion)
    elif isinstance(spacy_model, str):
        print("Load model ", spacy_model)
        nlp = spacy.load(spacy_model, disable=['ner', 'parser'])
        model = TextClassifier(nlp, label, long_text=long_text)
    else:
        # Anything else is used directly as the model.
        model = spacy_model

    stream = get_stream(source, input_key='text')
    if patterns is not None:
        # No pattern matcher is built here. Binding the callbacks
        # unconditionally avoids the NameError that a patterns argument
        # used to trigger further down.
        print("patterns are not supported by this recipe and will be ignored")
    predict = model.predict
    update = model.update

    stream = test_stream(stream, predict)

    def updateDB(answers):
        update(answers)

    def on_exit(ctrl=None):
        # The optional argument lets the callback be invoked with or
        # without a controller.
        print("on_exit")
        return model

    return {
        'view_id': 'classification',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'update': updateDB,
        'on_exit': on_exit,
        'config': {
            'labels': ['POSITIVE', 'NEGATIVE'],
            'batch_size': 32
        }
    }

示例#13

0

显示文件

文件： textcat_recipe.py 项目： ysunlp/data_science_challenge

def textcat_log(dataset,
                spacy_model,
                source=None,
                label='',
                api=None,
                patterns=None,
                loader=None,
                long_text=False,
                exclude=None):
    """
    Build an active-learning recipe around an SGD-trained linear classifier.

    dataset: name of the dataset the annotations are saved to.
    spacy_model: a spaCy model name/path to wrap in a TextClassifier, an
        already constructed model object that is used as is, or None to
        build a ``Prodigy_log_cpu`` around an ``SGDClassifier``.
    source: passed to ``get_stream``.
    label: passed to ``TextClassifier`` as the labels.
    patterns: not supported by this recipe; a notice is printed and the
        value is ignored.
    long_text: passed to ``TextClassifier``.
    exclude: passed through in the returned components.
    api, loader: accepted but not used by this function.

    Returns the recipe components as a dictionary.
    """
    if spacy_model is None:
        print("build your customized model,log")
        # NOTE(review): newer scikit-learn releases spell this loss
        # "log_loss" -- confirm against the pinned version.
        pt_model = linear_model.SGDClassifier(loss="log")
        example_path = "/liveperson/data/alloy/prodigy/data/newsgroup_example.jsonl"
        vectorizer_path = "/liveperson/data/alloy/prodigy/data/newsgroup_all.jsonl"
        model = Prodigy_log_cpu(pt_model, 1, vectorizer_path, example_path)
    elif isinstance(spacy_model, str):
        print("Load model ", spacy_model)
        nlp = spacy.load(spacy_model, disable=['ner', 'parser'])
        model = TextClassifier(nlp, label, long_text=long_text)
    else:
        # Anything else is used directly as the model.
        model = spacy_model

    stream = get_stream(source, input_key='text')
    if patterns is not None:
        # No pattern matcher is built here. Binding the callbacks
        # unconditionally avoids the NameError that a patterns argument
        # used to trigger further down.
        print("patterns are not supported by this recipe and will be ignored")
    predict = model.predict
    update = model.update

    stream = test_stream(stream, predict)

    def updateDB(answers):
        update(answers)

    def on_exit(ctrl=None):
        # The optional argument lets the callback be invoked with or
        # without a controller.
        print("on_exit")
        return model

    return {
        'view_id': 'classification',
        'dataset': dataset,
        'stream': stream,
        'exclude': exclude,
        'update': updateDB,
        'on_exit': on_exit,
        'config': {
            'labels': ['ORDER_STATUS'],
            'batch_size': 32
        }
    }