Python tc_update_header 예제들, prodigy.components.printers.tc_update_header Python 예제들

예제 #1

0

파일 보기

파일: textcat_recipe.py 프로젝트: ysunlp/data_science_challenge

def batch_train_custom_cumulate(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False,gpu_id = None):
    if(gpu_id == 0 and torch.cuda.is_available()):
        print("Using cuda")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        cudnn.benchmark = True
    if(n_iter ==1):
        print("one pass mode")
    print("batch_size",batch_size)
    #print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim = 300).cuda()
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model,label_size=1,optimizer=optimizer,loss=criterion)
    examples = DB.get_dataset(dataset)
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    start_time = datetime.now()
    if len(evals) > 0:
        model.init_eval(evals)
    interval = 100
    for fac in np.arange(interval,len(examples)+interval,interval):
        examples_fac = examples[:fac]
        batch_number = examples_fac/batch_size
        for i in range(n_iter):
            if shuffle:
                print("it's shuffling")
                random.shuffle(examples)
            batch_idx = 0
            loss = 0
            for batch in cytoolz.partition_all(batch_size,
                                               tqdm.tqdm(examples, leave=False)):
                batch = list(batch)
                loss += model.update(batch)
                batch_idx += 1
            acc = model.evaluate(evals)     
            print_('Time:[{0} seconds], process: [{1}/{2}], Epoch: [{3}/{4}], step: [{5}/{6}], Loss: {7},Acc:{8}'.format(
               end_time.seconds,fac, len(examples)//interval, i+1, n_iter, batch_idx+1, len(examples_fac)//batch_size, loss/batch_number, acc))
    return acc

예제 #2

0

파일 보기

파일: textcat_recipe.py 프로젝트: ysunlp/data_science_challenge

def batch_train_custom(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False):
    if(n_iter ==1):
        print("one pass mode")
    print("batch_size",batch_size)
    #print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim = 300)
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model,label_size=1,optimizer=optimizer,loss=criterion)
    examples = DB.get_dataset(dataset)
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    for i in range(n_iter):
        if shuffle:
            random.shuffle(examples)
        batch_idx = 1
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            #print(j)
            batch = list(batch)
            loss = model.update(batch)
            if len(evals) > 0 and batch_idx % (4 * batch_size) == 0:
                acc = model.evaluate(evals)     
                #print_(printers.tc_update(batch_idx, loss, acc))
                print('Epoch: [{0}/{1}], Step: [{2}/{3}], Loss: {4}, Validation Acc:{5}'.format( 
                   i+1, n_iter, batch_idx, len(examples)//batch_size, loss, acc))
            batch_idx += 1
    return acc

예제 #3

0

파일 보기

파일: textcat_recipe.py 프로젝트: ysunlp/data_science_challenge

def batch_train_increment(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False,gpu_id = None):
    """
    Batch train a new text classification model from annotations. Prodigy will
    export the best result to the output directory, and include a JSONL file of
    the training and evaluation examples. You can either supply a dataset ID
    containing the evaluation data, or choose to split off a percentage of
    examples for evaluation.
    """
    #log("RECIPE: Starting recipe textcat.batch-train", locals())
    if(gpu_id):
        spacy.util.use_gpu(gpu_id)
    if(n_iter ==1):
        print("one pass mode")
    print("batch_size",batch_size)
    print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim = 300)
        pt_model.embeds.weight.data.copy_(torch.from_numpy(nlp.vocab.vectors.data))
        model = PyTorchWrapper(pt_model)
        #textcat = TextCategorizer(nlp.vocab,model)
        textcat = Loss_TextCategorizer(nlp.vocab,model)
        nlp.add_pipe(textcat)
    examples = DB.get_dataset(dataset)
    labels = {eg['label'] for eg in examples}
    labels = list(sorted(labels))
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)
    if shuffle:    
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    # best_acc = {'accuracy': 0}
    # best_model = None
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))
    batch_idx = 0
    start_time = datetime.now()
    for batch in cytoolz.partition_all(batch_size,
                                       tqdm.tqdm(examples, leave=False)):
        batch = list(batch)
        for i in range(n_iter):
            loss = model.update(batch, revise=False, drop=dropout)
            if len(evals) > 0:
                #print("optimizer averages",model.optimizer.averages)
                with nlp.use_params(model.optimizer.averages):
                    acc = model.evaluate(tqdm.tqdm(evals, leave=False))
                #print_(printers.tc_update(i, loss, acc))
                end_time = datetime.now() -start_time
                print('Time:[{0} seconds], Epoch: [{1}/{2}], batch: [{3}/{4}], Loss:{5}, Accuracy:{6}'.format( 
                   end_time.seconds,i+1, n_iter, batch_idx+1, len(examples)//batch_size, loss, acc['accuracy']))
            batch_idx += 1
    return acc

예제 #4

0

파일 보기

파일: textcat_recipe.py 프로젝트: ysunlp/data_science_challenge

def batch_train(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=10, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False):
    """
    Batch train a new text classification model from annotations. Prodigy will
    export the best result to the output directory, and include a JSONL file of
    the training and evaluation examples. You can either supply a dataset ID
    containing the evaluation data, or choose to split off a percentage of
    examples for evaluation.
    """
    #log("RECIPE: Starting recipe textcat.batch-train", locals())
    print("batch_size",batch_size)
    print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
    else:
        print("build your customized model")
        nlp = spacy.load('en_core_web_lg')
        pt_model = FastText(vocab_size=684831, emb_dim = 300)
        pt_model.embeds.weight.data.copy_(torch.from_numpy(nlp.vocab.vectors.data))
        model = PyTorchWrapper(pt_model)
        textcat = TextCategorizer(nlp.vocab,model)
        nlp.add_pipe(textcat)

        #pt_model = LSTMSentiment(embedding_dim = 100, hidden_dim =100, vocab_size=259136, label_size=2, batch_size=3, dropout=0.5)
        #model = PyTorchWrapper(pt_model)
        #nlp = spacy.load('/home/ysun/pytorchprodigy/')
        #textcat = TextCategorizer(nlp.vocab,model)
        #nlp.add_pipe(textcat)
    examples = DB.get_dataset(dataset)
    labels = {eg['label'] for eg in examples}
    labels = list(sorted(labels))
    print(labels)
    model = TextClassifier(nlp, labels, long_text=long_text,
                           low_data=len(examples) < 1000)
    #log('RECIPE: Initialised TextClassifier with model {}'
    #    .format(input_model), model.nlp.meta)
    if shuffle:    
        print("it's shuffling")
        random.shuffle(examples)
    else:
        print("it's not shuffling")
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    else:
        examples, evals, eval_split = split_evals(examples, eval_split)
        print_("Using {}% of examples ({}) for evaluation"
               .format(round(eval_split * 100), len(evals)))
    if shuffle:
        random.shuffle(examples)
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    if long_text:
        examples = list(split_sentences(nlp, examples, min_length=False))
    for i in range(n_iter):
        loss = 0.
        random.shuffle(examples)
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            batch = list(batch)
            loss += model.update(batch, revise=False, drop=dropout)
        if len(evals) > 0:
            with nlp.use_params(model.optimizer.averages):
                acc = model.evaluate(tqdm.tqdm(evals, leave=False))
                if acc['accuracy'] > best_acc['accuracy']:
                    best_acc = dict(acc)
                    best_model = nlp.to_bytes()
            print_(printers.tc_update(i, loss, acc))
    if len(evals) > 0:
        print_(printers.tc_result(best_acc))
    if output_model is not None:
        if best_model is not None:
            nlp = nlp.from_bytes(best_model)
        msg = export_model_data(output_model, nlp, examples, evals)
        print_(msg)
    return best_acc['accuracy']