Example #1
def dump_model_result(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()

    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    with torch.no_grad():
        for i, (batch_x, batch_y) in enumerate(data_iterator):
            print('batch', i)
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()

            tag = batch_y['tag'].cuda()
            
            #pred = model(char, word, pos, spo, seq_len, tag)
            pred = model.predict(char, word, pos, spo, seq_len)  # labels?
            #labels = idx2label(pred['pred'], tag_vocab.idx2word)
            metrics({'pred': pred['pred'].cuda(), 'seq_len': seq_len}, {'tag': tag})
        eval_result = metrics.get_metric()
        metric_name = metrics.__class__.__name__
        eval_results[metric_name] = eval_result

    print("[tester] \n{}".format(_format_eval_results(eval_results)))
Example #2
 def __init__(self, tag_vocab, config):
     super(CommonSeqEvaluator, self).__init__()
     self._config = config
     self._vocab = Vocabulary()
     self._vocab.add_word_lst(tag_vocab.stoi.keys())
     self._evaluator = SpanFPreRecMetric(self._vocab, only_gross=False, f_type=config.evaluation.type)
     self._pad_index = tag_vocab.stoi['<pad>']
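A hedged sketch of how this evaluator might be driven, assuming a torchtext-style `tag_vocab` with a `stoi` dict (the full class, including evaluate and get_eval_output, appears in Example #25); `batches` is a hypothetical iterator:

evaluator = CommonSeqEvaluator(tag_vocab, config)
for batch_pred, batch_target in batches:  # hypothetical batch iterator
    evaluator.evaluate(batch_pred, batch_target)
result = evaluator.get_eval_output()      # aggregate, print, and save the table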
Example #3
def train(args):
    data = get_data(args)
    train_data = data['train']
    dev_data = data['dev']
    model = get_model(args)
    optimizer = get_optim(args)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    callbacks = []
    trainer = Trainer(
        train_data=train_data,
        model=model,
        optimizer=optimizer,
        loss=None,
        batch_size=args.batch_size,
        n_epochs=args.epochs,
        num_workers=4,
        metrics=SpanFPreRecMetric(tag_vocab=data['tag_vocab'],
                                  encoding_type=data['encoding_type'],
                                  ignore_labels=data['ignore_labels']),
        metric_key='f1',
        dev_data=dev_data,
        save_path=args.save_path,
        device=device,
        callbacks=callbacks,
        check_code_level=-1,
    )

    print(trainer.train())
Example #4
def test_each(config, models):
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    # NOTE: `tag_vocab` is assumed to be loaded elsewhere (cf. Example #1)
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    for model_name, model in zip(config.ensemble_models, models):
        print(model_name)
        tester = Tester(dev_data, model, metrics=metrics, device=config.device, batch_size=config.batch_size)
        tester.test()
Example #5
    def _get_trainer(self, models_folder):
        optimizer = optim.SGD(self.parameters(),
                              lr=self.config['lr'],
                              momentum=0.9)

        callbacks = []
        clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
        evaluate_callback = EvaluateCallback(
            self.data_bundle.get_dataset('test'))

        if self.config['warmup_steps'] > 0:
            warmup_callback = WarmupCallback(self.config['warmup_steps'],
                                             schedule='linear')
            callbacks.append(warmup_callback)
        callbacks.extend([clip_callback, evaluate_callback])

        return Trainer(self.data_bundle.get_dataset('train'),
                       self,
                       optimizer,
                       batch_size=self.config['batch_size'],
                       sampler=BucketSampler(),
                       num_workers=2,
                       n_epochs=100,
                       dev_data=self.data_bundle.get_dataset('dev'),
                       metrics=SpanFPreRecMetric(
                           tag_vocab=self.data_bundle.get_vocab('target'),
                           encoding_type=self.config['encoding_type']),
                       dev_batch_size=self.config['batch_size'] * 5,
                       callbacks=callbacks,
                       device=self.config['device'],
                       test_use_tqdm=False,
                       use_tqdm=True,
                       print_every=300,
                       save_path=models_folder)
Example #6
    def __init__(self, masker, task_lst, vocabs, optimizer, args):
        """
        :param model: 模型
        :param description: 模型描述
        :param task_lst: 任务列表
        :param optimizer: 优化器
        :param log_path: TensorboardX存储文件夹
        :param save_path: 模型存储位置
        :param accumulation_steps: 累积梯度
        :param print_every: 评估间隔
        """
        self.logger = fastNLP.logger

        self.masker = masker
        self.task_lst = task_lst
        self.save_path = args.save_path
        self.description = args.exp_name
        self.optim = optimizer
        self.vocabs = vocabs
        n_steps = int(len(task_lst) * len(task_lst[0].train_set) * 100 / args.batch_size) + 1
        args.n_steps = n_steps
        self.epoch_scheduler = get_scheduler(args, self.optim)
        self.scheduler = None
        self.logger.info('Using scheduler {}'.format(self.scheduler))
        self.accumulation_steps = args.accumulation_steps
        self.print_every = args.print_every
        self.batch_size = args.batch_size
        self.save_ep = args.save_ep

        include_tasks = args.tasks
        if include_tasks is None:
            self.empty_tasks = set()
        else:
            self.empty_tasks = set(range(len(self.task_lst))) - set(include_tasks)

        self.steps = 0
        self.best_acc = 0
        self.best_epoch = 0

        self.metrics = []
        for t in task_lst:
            if has_acc(t.task_name):
                self.metrics.append(AccuracyMetric())
            else:
                self.metrics.append(
                    SpanFPreRecMetric(
                        self.vocabs[t.task_name],
                        encoding_type="bioes" if t.task_name == "ner" else "bio",
                    ))
        # self.logger.info(self.metrics)

        tb_path = "eval" if args.evaluate else "train"
        self.summary_writer = SummaryWriter(os.path.join(args.tb_path, tb_path))
Example #7
def trainer(data_folder, write2model, write2vocab):
    data_bundle = PeopleDailyNERLoader().load(
        data_folder)  # this line reads the data at {data_dir} into a DataBundle
    data_bundle = PeopleDailyPipe().process(data_bundle)
    data_bundle.rename_field('chars', 'words')
    # persist the vocabularies
    targetVocab = dict(data_bundle.vocabs["target"])
    wordsVocab = dict(data_bundle.vocabs["words"])
    targetWc = dict(data_bundle.vocabs['target'].word_count)
    wordsWc = dict(data_bundle.vocabs['words'].word_count)
    with open(write2vocab, "w", encoding="utf-8") as VocabOut:
        VocabOut.write(
            json.dumps(
                {
                    "targetVocab": targetVocab,
                    "wordsVocab": wordsVocab,
                    "targetWc": targetWc,
                    "wordsWc": wordsWc
                },
                ensure_ascii=False))

    embed = BertEmbedding(vocab=data_bundle.get_vocab('words'),
                          model_dir_or_name='cn',
                          requires_grad=False,
                          auto_truncate=True)
    model = BiLSTMCRF(embed=embed,
                      num_classes=len(data_bundle.get_vocab('target')),
                      num_layers=1,
                      hidden_size=100,
                      dropout=0.5,
                      target_vocab=data_bundle.get_vocab('target'))

    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
    optimizer = Adam(model.parameters(), lr=2e-5)
    loss = LossInForward()
    device = 0 if torch.cuda.is_available() else 'cpu'
    # device = "cpu"
    trainer = Trainer(data_bundle.get_dataset('train'),
                      model,
                      loss=loss,
                      optimizer=optimizer,
                      batch_size=8,
                      dev_data=data_bundle.get_dataset('dev'),
                      metrics=metric,
                      device=device,
                      n_epochs=1)
    trainer.train()
    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)
    tester.test()
    saver = ModelSaver(write2model)
    saver.save_pytorch(model, param_only=False)
Example #8
def evaluate(args):
    data = get_data(args)
    test_data = data['test']
    model = load_model_from_path(args)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    tester = Tester(
        data=test_data,
        model=model,
        batch_size=args.batch_size,
        num_workers=2,
        device=device,
        metrics=SpanFPreRecMetric(tag_vocab=data['tag_vocab'],
                                  encoding_type=data['encoding_type'],
                                  ignore_labels=data['ignore_labels']),
    )
    print(tester.test())
Example #9
    def evaluate(self, data_samples, prefix=''):
        r"""
        :param DataSet data_samples: DataSet with Samples and Vocabs
        :return: dict obj to save metrics result

        """
        # NOTE: as excerpted, `data_samples` is unused; `data_bundle` and
        # `device` are assumed to exist in the enclosing scope.
        tester = Tester(data_bundle.get_dataset('test'),
                        self.model,
                        metrics=SpanFPreRecMetric(
                            tag_vocab=data_bundle.get_vocab('target'),
                            encoding_type='bio'),
                        batch_size=4,
                        device=device)

        return tester.test()['SpanFPreRecMetric']
Example #10
def predict(config, model):
    tag_vocab = pickle.load(open(os.path.join(config.data_path, config.tag_vocab_name), 'rb'))
    metrics = SpanFPreRecMetric(tag_vocab, pred='pred', seq_len='seq_len', target='tag')
    dev_data = pickle.load(open(os.path.join(config.data_path, config.dev_name), "rb"))
    char_vocab = pickle.load(open(os.path.join(config.data_path, config.char_vocab_name), "rb"))

    data_iterator = Batch(dev_data, config.batch_size, sampler=SequentialSampler(), as_numpy=False)
    model.cuda()

    schema = get_schemas(config.source_path)

    eval_results = {}
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    result = {}
    with torch.no_grad():
        for i, (batch_x, _) in enumerate(data_iterator):
            print('batch', i)
            char = batch_x['char'].cuda()
            word = batch_x['word'].cuda()
            pos = batch_x['pos'].cuda()
            spo = batch_x['spo'].cuda()
            seq_len = batch_x['seq_len'].cuda()
            
            #pred = model(char, word, pos, spo, seq_len, tag)
            pred = model.predict(char, word, pos, spo, seq_len)  # labels?

            texts = char2text(char.cpu().data, char_vocab.idx2word)
            labels = idx2label(pred['pred'].cpu().data, tag_vocab.idx2word)
            spos = idx2spo(schema, spo.cpu().data)
            result = label2spo(labels, texts, result, spos)

    return result
Example #11
    def _eval_epoch(self, dev=True):
        self.logger.info("Evaluating...")
        dev_loss = 0
        e_steps = 0
        avg_acc = 0
        dev_acc = {}
        self.model = self.masker.model
        self.model.eval()
        metrics = []
        for task in self.task_lst:
            if has_acc(task.task_name):
                metrics.append(fastNLP.AccuracyMetric())
            else:
                metrics.append(
                    SpanFPreRecMetric(
                        self.vocabs[task.task_name],
                        encoding_type="bioes" if task.task_name == "ner" else "bio",
                    ))

        with torch.no_grad():
            for i in range(len(self.task_lst)):
                corrects, samples = 0, 0
                task = find_task(i, self.task_lst)
                if task.task_id in self.empty_tasks:
                    continue
                if dev:
                    data_loader = task.dev_data_loader
                else:
                    data_loader = task.test_data_loader
                for batch in data_loader:
                    x, y = batch
                    batch_task_id = x["task_id"].cuda()
                    batch_x = x["x"].cuda()
                    batch_y = y["y"].cuda()
                    if "seq_len" in x:
                        seq_len = x["seq_len"].cuda()
                    else:
                        seq_len = None

                    self.masker.before_forward(batch_task_id[0].item())
                    # loss, pred = self.model(batch_task_id, batch_x, batch_y, seq_len)
                    if seq_len is not None:
                        out = self.model(batch_task_id, batch_x, batch_y, seq_len)
                    else:
                        out = self.model(batch_task_id, batch_x, batch_y)
                    loss, pred = out["loss"], out["pred"]
                    self.masker.after_forward(batch_task_id[0].item())

                    dev_loss += loss.item()
                    e_steps += 1

                    metrics[i].evaluate(pred, batch_y, seq_len)

                    samples += batch_x.size(0)

            for i in range(len(self.task_lst)):
                task = find_task(i, self.task_lst)
                eval_res = metrics[i].get_metric()
                dev_acc[task.task_name] = eval_res
                avg_acc += eval_res["acc"] if "acc" in eval_res else eval_res[
                    "f"]

        avg_acc /= len(self.task_lst) - len(self.empty_tasks)
        dev_acc["avg"] = avg_acc
        dev_loss = dev_loss / e_steps
        return dev_loss, dev_acc
Example #12
            for ins in all_data[target][key]:
                CWS_dataset.append(ins)
            del all_data[target][key]
    CWS_dataset.set_input('chars', 'target', 'seq_len')
    CWS_dataset.set_target('target', 'seq_len')
    all_data[target]['CWS-all'] = CWS_dataset

model = torch.load('best_model')

device = 0 if torch.cuda.is_available() else 'cpu'

metric1 = SegAppCharParseF1Metric(label_vocab['Parsing']['APP'])
metric2 = CWSMetric(label_vocab['Parsing']['APP'])
metrics = [metric1, metric2]

for key in all_data['test']:
    dataset = all_data['test'][key]
    if key.startswith('CWS'):
        tester = Tester(data=dataset, model=model,
                        metrics=SpanFPreRecMetric(tag_vocab=label_vocab['CWS']), device=device)
    elif key.startswith('POS'):
        tester = Tester(data=dataset, model=model,
                        metrics=SpanFPreRecMetric(tag_vocab=label_vocab['POS']), device=device)
    elif key.startswith('NER'):
        tester = Tester(data=dataset, model=model,
                        metrics=SpanFPreRecMetric(tag_vocab=label_vocab['NER']), device=device)
    else:
        tester = Tester(data=dataset, model=model, metrics=metrics, device=device)
    print(key)
    tester.test()
Example #13
def eval_mtl_single(args):
    global logger
    args = torch.load(os.path.join(args.save_path, "args"))
    print(args)
    logger.info(args)
    task_lst, vocabs = utils.get_data(args.data_path)
    task_db = task_lst[args.task_id]
    train_data = task_db.train_set
    dev_data = task_db.dev_set
    test_data = task_db.test_set
    task_name = task_db.task_name

    # text classification
    for ds in [train_data, dev_data, test_data]:
        ds.rename_field("words_idx", "x")
        ds.rename_field("label", "y")
        ds.set_input("x", "y", "task_id")
        ds.set_target("y")
    # seq label
    if task_name in SEQ_LABEL_TASK:
        for ds in [train_data, dev_data, test_data]:
            ds.set_input("seq_len")
            ds.set_target("seq_len")

    logger = utils.get_logger(__name__)
    logger.info("task name: {}, task id: {}".format(task_db.task_name, task_db.task_id))
    logger.info(
        "train len {}, dev len {}, test len {}".format(
            len(train_data), len(dev_data), len(test_data)
        )
    )

    # init model
    model = get_model(args, task_lst, vocabs)
    # logger.info('model: \n{}'.format(model))

    if task_name not in SEQ_LABEL_TASK or task_name == "pos":
        metrics = [
            AccuracyMetric(target="y"),
            # MetricInForward(val_name='loss')
        ]
    else:
        metrics = [
            SpanFPreRecMetric(
                tag_vocab=vocabs[task_name],
                pred="pred",
                target="y",
                seq_len="seq_len",
                encoding_type="bioes" if task_name == "ner" else "chunk",
            ),
            AccuracyMetric(target="y")
            # MetricInForward(val_name='loss')
        ]

    cur_best = 0.0
    init_best = None
    eval_time = 0
    paths = [path for path in os.listdir(args.save_path) if path.startswith("best")]
    paths = sorted(paths, key=lambda x: int(x.split("_")[1]))
    for path in paths:
        path = os.path.join(args.save_path, path)
        state = torch.load(path, map_location="cpu")
        model.load_state_dict(state)
        tester = Tester(
            test_data,
            model,
            metrics=metrics,
            batch_size=args.batch_size,
            num_workers=4,
            device="cuda",
            use_tqdm=False,
        )
        res = tester.test()
        val = 0.0
        for metric_name, metric_dict in res.items():
            if task_name == "pos" and "acc" in metric_dict:
                val = metric_dict["acc"]
                break
            elif "f" in metric_dict:
                val = metric_dict["f"]
                break

        if init_best is None:
            init_best = val
        logger.info(
            "No #%d: best %f, %s, path: %s, is better: %s",
            eval_time,
            val,
            tester._format_eval_results(res),
            path,
            val > init_best,
        )

        eval_time += 1
Example #14
@cache_results('caches/weibo-lstm.pkl', _refresh=False)
def get_data():
    data_bundle = WeiboNERLoader().load()
    data_bundle = ChineseNERPipe(encoding_type='bioes',
                                 bigram=True).process(data_bundle)
    char_embed = StaticEmbedding(data_bundle.get_vocab(C.CHAR_INPUT),
                                 model_dir_or_name='cn-fasttext')
    bigram_embed = StaticEmbedding(data_bundle.get_vocab('bigrams'),
                                   embedding_dim=100,
                                   min_freq=3)
    return data_bundle, char_embed, bigram_embed


data_bundle, char_embed, bigram_embed = get_data()
print(data_bundle)

model = CNBiLSTMCRFNER(char_embed,
                       num_classes=len(data_bundle.vocabs['target']),
                       bigram_embed=bigram_embed)

Trainer(data_bundle.datasets['train'],
        model,
        batch_size=20,
        metrics=SpanFPreRecMetric(data_bundle.vocabs['target'],
                                  encoding_type='bioes'),
        num_workers=2,
        dev_data=data_bundle.datasets['dev'],
        device=0).train()
Example #15
                          embeddings['word'],
                          hidden_size=args.hidden,
                          label_size=len(vocabs['label']),
                          device=args.device,
                          bidirectional=args.bi,
                          embed_dropout=args.embed_dropout,
                          output_dropout=args.output_dropout,
                          use_bigram=args.use_bigram)

loss = LossInForward()
encoding_type = 'bmeso'
if args.dataset == 'weibo':
    encoding_type = 'bio'
f1_metric = SpanFPreRecMetric(vocabs['label'],
                              pred='pred',
                              target='target',
                              seq_len='seq_len',
                              encoding_type=encoding_type)
acc_metric = AccuracyMetric(pred='pred', target='target', seq_len='seq_len')
metrics = [f1_metric, acc_metric]

if args.optim == 'adam':
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
elif args.optim == 'sgd':
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum)

callbacks = [
    FitlogCallback({
        'test': datasets['test'],
Example #16
                                 requires_grad=True, lower=True, word_dropout=0, dropout=0.5,
                                 only_norm_found_vector=normalize_embed)
    if char_embed is not None:
        embed = StackEmbedding([word_embed, char_embed], dropout=0, word_dropout=0.02)
    else:
        word_embed.word_drop = 0.02
        embed = word_embed
    data__ = data.get_vocab('words')
    data.rename_field('words', 'chars')
    return data, embed, data__, word_embed

data_bundle, embed, data__, word_embed = load_data()

model = TENER(tag_vocab=data_bundle.get_vocab('target'), embed=embed, num_layers=num_layers,
                d_model=d_model, n_head=n_heads,
                feedforward_dim=dim_feedforward, dropout=dropout,
                after_norm=after_norm, attn_type=attn_type,
                bi_embed=None,
                fc_dropout=fc_dropout,
                pos_embed=pos_embed,
                scale=attn_type == 'transformer')
# Change the model path before running
model_path = './w2v_n6'
states = torch.load(model_path).state_dict()
model.load_state_dict(states)

evaluation = Tester(data_bundle.get_dataset('test'),
                    model,
                    metrics=SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'), only_gross=False),
                    batch_size=batch_size)
evaluation.test()
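Because only_gross=False, the result dict carries per-label scores next to the overall 'f'/'pre'/'rec'. A hedged sketch of picking them out (the f-/pre-/rec- key pattern is the one also used in Example #25):

res = evaluation.test()['SpanFPreRecMetric']  # re-runs the Tester; returns a flat dict
per_label = {k: v for k, v in res.items() if k not in ('f', 'pre', 'rec')}
print(per_label)  # e.g. {'f-per': ..., 'pre-per': ..., 'rec-per': ...}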
Example #17
def train(config, task_name):
    train_data = pickle.load(
        open(os.path.join(config.data_path, config.train_name), "rb"))
    # debug
    if config.debug:
        train_data = train_data[0:100]
    dev_data = pickle.load(
        open(os.path.join(config.data_path, config.dev_name), "rb"))
    print(len(train_data), len(dev_data))
    # test_data = pickle.load(open(os.path.join(config.data_path, config.test_name), "rb"))
    # load w2v data
    # weight = pickle.load(open(os.path.join(config.data_path, config.weight_name), "rb"))

    word_vocab = pickle.load(
        open(os.path.join(config.data_path, config.word_vocab_name), "rb"))
    char_vocab = pickle.load(
        open(os.path.join(config.data_path, config.char_vocab_name), "rb"))
    pos_vocab = pickle.load(
        open(os.path.join(config.data_path, config.pos_vocab_name), "rb"))
    # spo_vocab = pickle.load(open(os.path.join(config.data_path, config.spo_vocab_name), "rb"))
    tag_vocab = pickle.load(
        open(os.path.join(config.data_path, config.tag_vocab_name), "rb"))
    print('word vocab', len(word_vocab))
    print('char vocab', len(char_vocab))
    print('pos vocab', len(pos_vocab))
    # print('spo vocab', len(spo_vocab))
    print('tag vocab', len(tag_vocab))

    schema = get_schemas(config.source_path)

    if task_name == 'bilstm_crf':
        model = AdvSeqLabel(
            char_init_embed=(len(char_vocab), config.char_embed_dim),
            word_init_embed=(len(word_vocab), config.word_embed_dim),
            pos_init_embed=(len(pos_vocab), config.pos_embed_dim),
            spo_embed_dim=len(schema),
            sentence_length=config.sentence_length,
            hidden_size=config.hidden_dim,
            num_classes=len(tag_vocab),
            dropout=config.dropout,
            id2words=tag_vocab.idx2word,
            encoding_type=config.encoding_type)
    elif task_name == 'trans_crf':
        model = TransformerSeqLabel(
            char_init_embed=(len(char_vocab), config.char_embed_dim),
            word_init_embed=(len(word_vocab), config.word_embed_dim),
            pos_init_embed=(len(pos_vocab), config.pos_embed_dim),
            spo_embed_dim=len(schema),
            num_classes=len(tag_vocab),
            id2words=tag_vocab.idx2word,
            encoding_type=config.encoding_type,
            num_layers=config.num_layers,
            inner_size=config.inner_size,
            key_size=config.key_size,
            value_size=config.value_size,
            num_head=config.num_head,
            dropout=config.dropout)

    optimizer = Adam(lr=config.lr, weight_decay=config.weight_decay)
    timing = TimingCallback()
    early_stop = EarlyStopCallback(config.patience)
    # loss = NLLLoss()
    logs = FitlogCallback(dev_data)
    metrics = SpanFPreRecMetric(tag_vocab,
                                pred='pred',
                                seq_len='seq_len',
                                target='tag')

    train_data.set_input('tag')
    dev_data.set_input('tag')
    dev_data.set_target('seq_len')
    trainer = Trainer(
        train_data=train_data,
        model=model,
        # loss=loss,
        metrics=metrics,
        metric_key='f',
        batch_size=config.batch_size,
        n_epochs=config.epoch,
        dev_data=dev_data,
        save_path=config.save_path,
        check_code_level=-1,
        print_every=config.print_every,
        validate_every=config.validate_every,
        optimizer=optimizer,
        use_tqdm=False,
        device=config.device,
        callbacks=[timing, early_stop, logs])
    trainer.train()

    # test result
    tester = Tester(dev_data,
                    model,
                    metrics=metrics,
                    device=config.device,
                    batch_size=config.batch_size)
    tester.test()
Example #18
              block_loss=True,
              input_dropout=0.5,
              hidden_dropout=0.2,
              inner_dropout=0.2)

print(model)

callbacks = [
    GradientClipCallback(clip_value=ops.gradient_clip, clip_type='value'),
]
metrics = []
metrics.append(
    SpanFPreRecMetric(
        tag_vocab=data.vocabs[Const.TARGET],
        encoding_type=encoding_type,
        pred=Const.OUTPUT,
        target=Const.TARGET,
        seq_len=Const.INPUT_LEN,
    ))


class LossMetric(MetricBase):
    def __init__(self, loss=None):
        super(LossMetric, self).__init__()
        self._init_param_map(loss=loss)
        self.total_loss = 0.0
        self.steps = 0

    def evaluate(self, loss):
        self.total_loss += float(loss)
        self.steps += 1
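    # Hedged completion: the excerpt ends here, but a MetricBase subclass
    # also needs a get_metric to report the aggregated value, e.g.:
    def get_metric(self, reset=True):
        avg_loss = self.total_loss / max(self.steps, 1)
        if reset:
            self.total_loss, self.steps = 0.0, 0
        return {'loss': avg_loss}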
Example #19
model = CharModel(embed=embed,
                  label_vocab=label_vocab,
                  pos_idx=pos_idx,
                  Parsing_rnn_layers=rnn_layers,
                  Parsing_arc_mlp_size=arc_mlp_size,
                  Parsing_label_mlp_size=label_mlp_size,
                  encoding_type='bmeso')

optimizer = AdamW(model.parameters(), lr=2e-5)

device = 0 if torch.cuda.is_available() else 'cpu'
callbacks = [WarmupCallback(warmup=0.1, schedule='linear')]

metric1 = SegAppCharParseF1Metric(label_vocab['Parsing']['APP'])
metric2 = CWSMetric(label_vocab['Parsing']['APP'])
metric3 = SpanFPreRecMetric(tag_vocab=label_vocab['POS'])
metrics = [metric1, metric2, metric3]

for target in ['train', 'test', 'dev']:
    CWS_dataset = DataSet()
    for key in task_list:
        if key.startswith('CWS'):
            for ins in all_data[target][key]:
                CWS_dataset.append(ins)
            del all_data[target][key]
    CWS_dataset.set_input('chars', 'target', 'seq_len', 'task_class')
    CWS_dataset.set_target('target', 'seq_len')
    all_data[target]['CWS-all'] = CWS_dataset

train_data = dict()
train_data['POS-ctb9'] = all_data['train']['POS-ctb9']
Example #20
def my_model_single_sentence(sentence):
    '''
    # Reproduce the Pipe's processing steps here, in order to obtain the vocabularies built from the training set
    from fastNLP.io import WeiboNERLoader

    # load the raw data
    data_bundle = WeiboNERLoader().load()

    # here we need the Vocabulary (word list) built from the raw data
    from fastNLP import Vocabulary
    from fastNLP.core.utils import iob2, iob2bioes
    from fastNLP.core.const import Const

    #encoding_type
    encoding_type: str = 'bio'

    if encoding_type == 'bio':
        convert_tag = iob2
    elif encoding_type == 'bioes':
        convert_tag = lambda words: iob2bioes(iob2(words))


    # convert the tags
    for name, dataset in data_bundle.datasets.items():
        dataset.apply_field(convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET)    
   
    # copy a chars column
    data_bundle.copy_field(field_name=Const.RAW_CHAR, new_field_name=Const.CHAR_INPUT, ignore_miss_dataset=True)


    input_field_names = [Const.CHAR_INPUT]
    target_field_names=Const.TARGET

    if isinstance(input_field_names, str):
        input_field_names = [input_field_names]
    if isinstance(target_field_names, str):
        target_field_names = [target_field_names]

    # build the source vocabulary
    for input_field_name in input_field_names:
        src_vocab = Vocabulary()
        src_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name],
                                field_name=input_field_name,
                                no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets()
                                                        if ('train' not in name) and (ds.has_field(input_field_name))]
                                )
        src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name)
        data_bundle.set_vocab(src_vocab, input_field_name)
  
    # build the target vocabulary
    for target_field_name in target_field_names:
        tgt_vocab = Vocabulary(unknown=None, padding=None)
        tgt_vocab.from_dataset(*[ds for name, ds in data_bundle.iter_datasets() if 'train' in name],
                                field_name=target_field_name,
                                no_create_entry_dataset=[ds for name, ds in data_bundle.iter_datasets()
                                                        if ('train' not in name) and (ds.has_field(target_field_name))]
                                )
        if len(tgt_vocab._no_create_word) > 0:
            warn_msg = f"There are {len(tgt_vocab._no_create_word)} `{target_field_name}` labels" \
                        f" in {[name for name in data_bundle.datasets.keys() if 'train' not in name]} " \
                        f"data set but not in train data set!.\n" \
                        f"These label(s) are {tgt_vocab._no_create_word}"
            warnings.warn(warn_msg)
            logger.warning(warn_msg)
        tgt_vocab.index_dataset(*[ds for ds in data_bundle.datasets.values() if ds.has_field(target_field_name)], field_name=target_field_name)
        data_bundle.set_vocab(tgt_vocab, target_field_name)

    input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names
    target_fields = [Const.TARGET, Const.INPUT_LEN]
        
    for name, dataset in data_bundle.datasets.items():
        dataset.add_seq_len(Const.CHAR_INPUT)
        
    data_bundle.set_input(*input_fields)
    data_bundle.set_target(*target_fields)
    '''
    '''
    from fastNLP.io import WeiboNERPipe
    data_bundle = WeiboNERPipe().process_from_file()
    '''
    from fastNLP.io.loader.conll import CNNERLoader
    data_bundle = CNNERLoader().load("data/")

    from fastNLP.io.pipe.conll import _CNNERPipe
    data_bundle = _CNNERPipe(encoding_type='bio').process(data_bundle)

    src_vocab = data_bundle.get_vocab('chars')
    tgt_vocab = data_bundle.get_vocab('target')

    # at this point the data processing is finished and both vocabularies are built
    # the data to extend is the fastNLP DataSet object data_bundle.get_dataset('test')
    # its fields are: raw_chars target chars seq_len

    from fastNLP import Instance

    my_raw_chars = []
    my_target = []
    my_words = []
    for i in range(0, len(sentence)):
        my_raw_chars.append(sentence[i])
        my_target.append(0)
        my_words.append(src_vocab.to_index(sentence[i]))

    my_seq_len = len(sentence)

    ins = Instance()

    ins.add_field('raw_chars', my_raw_chars)
    ins.add_field('target', my_target)
    ins.add_field('chars', my_words)
    ins.add_field('seq_len', my_seq_len)

    data_bundle.get_dataset('test').append(ins)

    # load the model
    from fastNLP.io import ModelLoader

    loader = ModelLoader()

    model = loader.load_pytorch_model("./save/bilstmcrf_sec_ner.pkl")

    data_bundle.rename_field(
        'chars',
        'words')  # BiLSTMCRF's forward takes `words` rather than `chars`, so this column must be renamed

    from fastNLP import SpanFPreRecMetric
    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))

    # run the test
    from fastNLP import Tester

    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)

    final = tester.get_pred()  # NOTE: get_pred is assumed to be a custom extension of fastNLP's Tester

    my_label = []
    # we want the last row of the last batch of final
    for i in final[len(final) - 1][len(final[len(final) - 1]) - 1]:
        i = i.cpu().item()
        my_label.append(tgt_vocab.to_word(i))

    output = ''
    for j in range(0, my_seq_len):
        output = output + sentence[j] + ' ' + my_label[j] + '\n'

    return output
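A hedged usage sketch for the function above; the sentence is made up, and the data/ folder and saved model must already exist:

print(my_model_single_sentence('我爱北京'))  # one "char label" pair per line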
Example #21
def my_model_passage(sentences):
    from fastNLP.io.loader.conll import CNNERLoader
    data_bundle = CNNERLoader().load("data/")

    from fastNLP.io.pipe.conll import _CNNERPipe
    data_bundle = _CNNERPipe(encoding_type='bio').process(data_bundle)

    src_vocab = data_bundle.get_vocab('chars')
    tgt_vocab = data_bundle.get_vocab('target')

    # clear out the original test instances (one is kept for now and removed below)
    for i in range(0, 27):
        data_bundle.get_dataset('test').delete_instance(0)

    # at this point the data processing is finished and both vocabularies are built
    # build the new dataset

    from fastNLP import Instance

    for i in range(0, len(sentences)):
        my_raw_chars = []
        my_target = []
        my_words = []
        for j in range(0, len(sentences[i])):
            my_raw_chars.append(sentences[i][j])
            my_target.append(0)
            my_words.append(src_vocab.to_index(sentences[i][j]))

        my_seq_len = len(sentences[i])

        ins = Instance()

        ins.add_field('raw_chars', my_raw_chars)
        ins.add_field('target', my_target)
        ins.add_field('chars', my_words)
        ins.add_field('seq_len', my_seq_len)

        data_bundle.get_dataset('test').append(ins)

    data_bundle.get_dataset('test').delete_instance(0)  # remove the last remaining original instance

    # load the model
    from fastNLP.io import ModelLoader

    loader = ModelLoader()

    model = loader.load_pytorch_model("./save/bilstmcrf_sec_ner.pkl")

    data_bundle.get_dataset('test').rename_field(
        'chars',
        'words')  # BiLSTMCRF's forward takes `words` rather than `chars`, so this column must be renamed

    from fastNLP import SpanFPreRecMetric
    metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))

    # run the test
    from fastNLP import Tester

    tester = Tester(data_bundle.get_dataset('test'), model, metrics=metric)

    final = tester.get_pred()  # NOTE: get_pred is assumed to be a custom extension of fastNLP's Tester

    output = ''
    labels = []
    # we want all of the contents of final
    # the original test set had two batches, of sizes 16 and 12
    for i in range(0, len(final)):
        for j in range(0, len(final[i])):
            my_label = []
            for item in final[i][j]:
                my_label.append(tgt_vocab.to_word(item.cpu().item()))
            labels.append(my_label)

    print(labels[0])
    print(final[0][0])

    for i in range(0, len(sentences)):
        for j in range(0, len(sentences[i])):
            output = output + sentences[i][j] + ' ' + labels[i][j] + '\n'

        output = output + '\n'
    return output
Example #22
def main():
    if args.do_eval:
        torch.multiprocessing.set_start_method('spawn', force=True)

    if args.model == 'bert':

        model = BertCRF(embed, [data_bundle.get_vocab('target')],
                        encoding_type='bioes')

    else:
        model = StackedTransformersCRF(
            tag_vocabs=[data_bundle.get_vocab('target')],
            embed=embed,
            num_layers=num_layers,
            d_model=d_model,
            n_head=n_heads,
            feedforward_dim=dim_feedforward,
            dropout=trans_dropout,
            after_norm=after_norm,
            attn_type=attn_type,
            bi_embed=None,
            fc_dropout=fc_dropout,
            pos_embed=pos_embed,
            scale=attn_type == 'transformer')
        model = torch.nn.DataParallel(model)

    if args.do_eval:
        if os.path.exists(os.path.expanduser(args.saved_model)):
            print("Load checkpoint from {}".format(
                os.path.expanduser(args.saved_model)))
            model = torch.load(args.saved_model)
            model.to('cuda')
            print('model to CUDA')

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)

    callbacks = []
    clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
    evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))
    checkpoint_callback = CheckPointCallback(os.path.join(
        directory, 'model.pth'),
                                             delete_when_train_finish=False,
                                             recovery_fitlog=True)

    if warmup_steps > 0:
        warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
        callbacks.append(warmup_callback)
    callbacks.extend([clip_callback, checkpoint_callback, evaluate_callback])

    if not args.do_eval:
        trainer = Trainer(data_bundle.get_dataset('train'),
                          model,
                          optimizer,
                          batch_size=batch_size,
                          sampler=BucketSampler(),
                          num_workers=no_cpu,
                          n_epochs=args.n_epochs,
                          dev_data=data_bundle.get_dataset('dev'),
                          metrics=SpanFPreRecMetric(
                              tag_vocab=data_bundle.get_vocab('target'),
                              encoding_type=encoding_type),
                          dev_batch_size=batch_size,
                          callbacks=callbacks,
                          device=args.device,
                          test_use_tqdm=True,
                          use_tqdm=True,
                          print_every=300,
                          save_path=os.path.join(directory, 'best'))

        trainer.train(load_best_model=True)

        predictor = Predictor(model)
        predict(os.path.join(directory, 'predictions_dev.tsv'), data_bundle,
                predictor, 'dev')
        predict(os.path.join(directory, 'predictions_test.tsv'), data_bundle,
                predictor, 'test')

    else:
        print('Predicting')
        # predictions of multiple files
        torch.multiprocessing.freeze_support()
        model.share_memory()
        predictor = Predictor(model)

        if len(files) > multiprocessing.cpu_count():
            with torch.multiprocessing.Pool(processes=no_cpu) as p:
                with tqdm(total=len(files)) as pbar:
                    for i, _ in enumerate(
                            p.imap_unordered(
                                partial(predict,
                                        data_bundle=data_bundle,
                                        predictor=predictor,
                                        predict_on='train',
                                        do_eval=args.do_eval), files)):
                        pbar.update()
        else:
            for file in tqdm(files):
                predict(file, data_bundle, predictor, 'train', args.do_eval)
Example #23
 save_serialize_obj(char_vocab, char_vocab_pkl_file)
 save_serialize_obj(target_vocab, target_vocab_pkl_file)
 logger.info('vocab serialized to: {}'.format(char_vocab_pkl_file))
 logger.warn('selecting pretrained word embeddings')
 # model_dir_or_name = 'cn-wwm'
 model_dir_or_name = './data/embed/ERNIE_1.0_max-len-512-pytorch'
 bert_embed = BertEmbedding(vocab=char_vocab, model_dir_or_name=model_dir_or_name, requires_grad=False)
 logger.warn('neural network model')
 model = BiLSTMCRF(embed=bert_embed, num_classes=len(target_vocab), num_layers=1, hidden_size=200, dropout=0.5,
                   target_vocab=target_vocab)
 logger.info(model)
 logger.warn('training hyperparameter settings')
 loss = LossInForward()
 optimizer = Adam([param for param in model.parameters() if param.requires_grad])
 # metric = AccuracyMetric()
 metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab(Const.TARGET), only_gross=False)  # with only_gross=False, per-label metric values are returned as well
 device = 'cuda' if torch.cuda.is_available() else 'cpu'  # run on the GPU if available; training is much faster
 logger.info('device:{}'.format(device))
 batch_size = 32
 n_epochs = 10
 early_stopping = 10
 trainer = Trainer(
     save_path=model_path,
     train_data=data_bundle.get_dataset('train'),
     model=model,
     loss=loss,
     optimizer=optimizer,
     batch_size=batch_size,
     n_epochs=n_epochs,
     dev_data=data_bundle.get_dataset('dev'),
     metrics=metric,
Example #24
                     hidden_size=1200,
                     num_layers=1,
                     tag_vocab=data.vocabs[Const.TARGET],
                     encoding_type=encoding_type,
                     dropout=dropout)

callbacks = [
    GradientClipCallback(clip_value=5, clip_type='value'),
    EvaluateCallback(data.datasets['test'])
]

optimizer = SGD(model.parameters(), lr=lr, momentum=0.9)
scheduler = LRScheduler(
    LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
callbacks.append(scheduler)

trainer = Trainer(train_data=data.get_dataset('train'),
                  model=model,
                  optimizer=optimizer,
                  sampler=BucketSampler(num_buckets=100),
                  device=0,
                  dev_data=data.get_dataset('dev'),
                  batch_size=batch_size,
                  metrics=SpanFPreRecMetric(
                      tag_vocab=data.vocabs[Const.TARGET],
                      encoding_type=encoding_type),
                  callbacks=callbacks,
                  num_workers=1,
                  n_epochs=100,
                  dev_batch_size=256)
trainer.train()
Example #25
class CommonSeqEvaluator(BaseSeqEvaluator):

    def __init__(self, tag_vocab, config):
        super(CommonSeqEvaluator, self).__init__()
        self._config = config
        self._vocab = Vocabulary()
        self._vocab.add_word_lst(tag_vocab.stoi.keys())
        self._evaluator = SpanFPreRecMetric(self._vocab, only_gross=False, f_type=config.evaluation.type)
        self._pad_index = tag_vocab.stoi['<pad>']

    def _change_type(self, pred, target):
        seq_len = torch.tensor([len(text) for text in pred])
        max_len = max(seq_len)
        for text in pred:
            if len(text) < max_len:
                text.extend([self._pad_index for i in range(max_len - len(text))])
        pred = torch.tensor(pred).to(self._config.device)
        return pred, target, seq_len

    def evaluate(self, pred, target):
        # feed in one batch of data
        pred, target, seq_len = self._change_type(pred, target)
        self._evaluator.evaluate(pred, target, seq_len)

    def _get_eval_result(self):
        # aggregate the results over all batches
        eval_dict = self._evaluator.get_metric()
        if self._config.data.chip_relation.use_chip_relation:
            names = list(set([label[2:] for label in self._vocab.word2idx.keys()][3:]))
            if '其他' in names:
                names.remove('其他')
        else:
            names = list(set([label[2:] for label in self._vocab.word2idx.keys()][3:]))
        head = ['label', '   precision', '   recall', '  F1_score']
        table = []
        table.append(head)
        for i in range(len(names)):
            ps = str(round(eval_dict['pre-' + names[i].lower()], 3))
            rs = str(round(eval_dict['rec-' + names[i].lower()], 3))
            f1s = str(round(eval_dict['f-' + names[i].lower()], 3))
            table.append([names[i], ps, rs, f1s])
        ps = str(round(eval_dict['pre'], 3))
        rs = str(round(eval_dict['rec'], 3))
        f1s = str(round(eval_dict['f'], 3))
        table.append(['{}_average'.format(self._config.evaluation.type), ps, rs, f1s])
        return eval_dict, table

    def get_eval_output(self):
        # external interface for fetching results; printing is configurable (eval results are always saved for now)
        result, table = self._get_eval_result()
        if self._config.evaluation.is_display:
            self._print_table(table)
        self._write_csv(table)
        return result

    def _print_table(self, List):
        # display
        k = len(List)
        v = len(List[0])
        for i in range(k):
            for j in range(v):
                print(List[i][j].rjust(14), end=' ')
            print()

    def _write_csv(self, table):
        wb = Workbook()
        ws = wb['Sheet']
        for line in range(1, len(table) + 1):
            for column in range(1, 5):
                ws.cell(line, column, table[line-1][column-1])
        save_path = self._config.learn.dir.saved + '/eval_result.xlsx'
        wb.save(save_path)
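To make _change_type concrete, a standalone re-creation of its padding step with made-up values:

import torch

pad = 0                                  # stands in for self._pad_index
pred = [[1, 2, 2], [1]]                  # ragged per-sentence tag ids
seq_len = torch.tensor([len(t) for t in pred])  # tensor([3, 1])
max_len = int(seq_len.max())
pred = torch.tensor([t + [pad] * (max_len - len(t)) for t in pred])
# pred is now rectangular, so SpanFPreRecMetric.evaluate(pred, target, seq_len)
# can run; positions beyond each sentence's seq_len never count toward P/R.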
Example #26
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

callbacks = []
clip_callback = GradientClipCallback(clip_type='value', clip_value=5)
evaluate_callback = EvaluateCallback(data_bundle.get_dataset('test'))

if warmup_steps > 0:
    warmup_callback = WarmupCallback(warmup_steps, schedule='linear')
    callbacks.append(warmup_callback)
callbacks.extend([clip_callback, evaluate_callback])

trainer = Trainer(data_bundle.get_dataset('train'),
                  model,
                  optimizer,
                  batch_size=batch_size,
                  sampler=BucketSampler(),
                  num_workers=2,
                  n_epochs=n_epochs,
                  dev_data=data_bundle.get_dataset('dev'),
                  metrics=SpanFPreRecMetric(
                      tag_vocab=data_bundle.get_vocab('target'),
                      encoding_type=encoding_type),
                  dev_batch_size=batch_size,
                  callbacks=callbacks,
                  device=device,
                  test_use_tqdm=False,
                  use_tqdm=True,
                  print_every=300,
                  save_path=None)
trainer.train(load_best_model=False)
Example #27
def train(config):
    train_data = pickle.load(
        open(os.path.join(config.data_path, config.train_name), "rb"))
    # debug
    train_data = train_data[0:100]
    dev_data = pickle.load(
        open(os.path.join(config.data_path, config.dev_name), "rb"))
    print(len(train_data), len(dev_data))
    # test_data = pickle.load(open(os.path.join(config.data_path, config.test_name), "rb"))
    # load w2v data
    # weight = pickle.load(open(os.path.join(config.data_path, config.weight_name), "rb"))

    word_vocab = pickle.load(
        open(os.path.join(config.data_path, config.word_vocab_name), "rb"))
    char_vocab = pickle.load(
        open(os.path.join(config.data_path, config.char_vocab_name), "rb"))
    pos_vocab = pickle.load(
        open(os.path.join(config.data_path, config.pos_vocab_name), "rb"))
    spo_vocab = pickle.load(
        open(os.path.join(config.data_path, config.spo_vocab_name), "rb"))
    tag_vocab = pickle.load(
        open(os.path.join(config.data_path, config.tag_vocab_name), "rb"))
    print('word vocab', len(word_vocab))
    print('char vocab', len(char_vocab))
    print('pos vocab', len(pos_vocab))
    print('spo vocab', len(spo_vocab))
    print('tag vocab', len(tag_vocab))

    model = BiLSTM_CRF(config.batch_size,
                       len(word_vocab),
                       len(char_vocab),
                       len(pos_vocab),
                       len(spo_vocab),
                       config.embed_dim,
                       config.hidden_dim,
                       tag_vocab.idx2word,
                       dropout=0.5)

    optimizer = SGD(lr=config.lr, momentum=config.momentum)
    timing = TimingCallback()
    early_stop = EarlyStopCallback(config.patience)
    loss = NLLLoss()
    metrics = SpanFPreRecMetric(tag_vocab)
    # accuracy = AccuracyMetric(pred='output', target='target')

    trainer = Trainer(train_data=train_data,
                      model=model,
                      loss=loss,
                      metrics=metrics,
                      batch_size=config.batch_size,
                      n_epochs=config.epoch,
                      dev_data=dev_data,
                      save_path=config.save_path,
                      check_code_level=-1,
                      print_every=100,
                      validate_every=0,
                      optimizer=optimizer,
                      use_tqdm=False,
                      device=config.device,
                      callbacks=[timing, early_stop])
    trainer.train()