Example No. 1
import time
import torch

def loop(device):
    """Busy-loop: keep `device` occupied with throwaway tensor math."""
    output("start looping...")
    while True:
        time.sleep(0.05)
        # two random tensors on the device, multiplied purely to generate load
        a = torch.rand(233, 233, 233).to(device)
        b = torch.rand(233, 233, 233).to(device)
        c = a * b
        a = c  # keep a reference so the result is not freed immediately
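
A minimal usage sketch: running loop in a background thread so the main program stays responsive. The device string and the threading setup are assumptions, not part of the original.

import threading
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
threading.Thread(target=loop, args=(device,), daemon=True).start()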
Example No. 2
def run_once(cfg, dataset, vocab, device):
    # relies on module-level `import numpy as np` and the project helpers
    # TransitionModel, build_optimizer, Trainer and output()
    model = TransitionModel(vocab=vocab, **cfg.model)
    para_num = sum(np.prod(list(p.size())) for p in model.parameters())
    output(f'param num: {para_num}, {para_num / 1000000:.4f}M')
    model.to(device=device)

    optimizer = build_optimizer(model, **cfg.optim)
    scheduler = None
    writer = None
    trainer = Trainer(vars(cfg), dataset, vocab, model, optimizer, None,
                      scheduler, writer, device, **cfg.trainer)

    # training loop
    trainer.train()

    return model.metric
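
The parameter count in run_once can be computed more directly with Tensor.numel(); a tiny equivalent sketch (the Linear model is only a stand-in):

import torch.nn as nn

model = nn.Linear(100, 300)  # stand-in model
para_num = sum(p.numel() for p in model.parameters())
print(f'param num: {para_num}, {para_num / 1000000:.4f}M')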
Example No. 3
    def process_one(self, one_set, name, device, batch_size, epoch=None):
        """Evaluate `one_set`; `epoch is None` means test stage.

        `another`, `tensor_op`, `vocab` and `table` are free variables
        captured from the enclosing scope (see Example No. 7).
        """
        loader = self.get_loader(one_set, batch_size)
        len_loader = len(loader)
        losses = torch.zeros(len_loader, device=device)

        for i, batch in enumerate(loader):
            batch = to_device(batch, device)
            model_output = self.model(**batch)
            losses[i] = model_output['loss'].item()
            if another:
                # ensemble prediction: combine the scores of both models
                scores = another(**batch)['scores']
                scores = tensor_op(scores, model_output['scores'])
                best_paths = self.model.crf.viterbi_tags(
                    scores, batch['mask'], 1)
                model_output['predicted_tags'] = cast(
                    List[List[int]], [x[0][0] for x in best_paths])

            # record test results
            for j, predicted in enumerate(model_output['predicted_tags']):
                sid = i * batch_size + j
                length = batch['seq_lens'][j]
                for n, word in enumerate(batch['sentences'][j]):
                    pos = vocab.index_to_token(batch['upostag'][j, n].item(),
                                               'upostag')
                    label = vocab.index_to_token(batch['labels'][j, n].item(),
                                                 'labels')
                    indicator = batch['indicator'][j, n].item()
                    prediction = vocab.index_to_token(predicted[n], 'labels')
                    table.append([
                        sid, length, n, word, pos, indicator, label, prediction
                    ])

        metric_counter = copy.deepcopy(self.model.metric.counter)
        metric = self.model.get_metrics(reset=True)
        if epoch is not None and self.writer is not None:
            metric['loss'] = losses.mean()
            self.add_scalars('Very_Detail', metric, epoch, name)
            self.writer.flush()
        elif epoch is None:
            output(f"Test {name} compete, {format_metric(metric)}")
        return metric_counter, metric, losses
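
process_one merges the two models' scores through tensor_op, which Example No. 7 binds to a tensor_avg helper that is not shown in these excerpts. A plausible minimal reconstruction, purely an assumption:

import torch

def tensor_avg(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # hypothetical: element-wise average of two score tensors
    return (a + b) / 2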
Example No. 4
def run_once(cfg: Config, vocab, dataset, device, parser):
    """A single training run."""
    model = build_model(vocab=vocab, depsawr=parser, **cfg['model'])
    para_num = sum(np.prod(list(p.size())) for p in model.parameters())
    output(f'param num: {para_num}, {para_num / 1000000:.4f}M')
    model.to(device=device)

    optimizer = build_optimizer(model, **cfg['optim'])
    scheduler = None

    if cfg['trainer']['tensorboard'] and _ARGS.debug:
        cfg['trainer']['tensorboard'] = False
    # cfg['trainer']['log_batch'] = _ARGS.debug
    trainer = Trainer(cfg, dataset, vocab, model, optimizer, None, scheduler,
                      device, **cfg['trainer'])
    # trainer.train_func = train_func  # for timing

    # training loop
    trainer.train()
    # load the checkpoint at the path given in the config;
    # only meaningful when the save mode is 'best'
    trainer.load()
    return trainer.test(dataset['test'], 64)
Example No. 5
import codecs
from collections import defaultdict

def select_vec(dataset, vec_path, new_path):
    """Keep only the pretrained vectors whose word occurs in `dataset`."""
    counter = defaultdict(int)
    for data in dataset.values():
        if isinstance(data, DataSet):
            data = [data]
        elif isinstance(data, dict):
            data = data.values()
        for one in data:
            for ins in one.data:
                for w in ins['words']:
                    counter[w] += 1

    new_vec = []
    with codecs.open(vec_path, mode='r', encoding='UTF-8') as file:
        for line in file:  # stream the file instead of readlines()
            if line.split()[0] in counter:
                new_vec.append(line)

    with codecs.open(new_path, mode='w', encoding='UTF-8') as file:
        file.write(f"{len(new_vec)} 300\n")  # header: count and hardcoded dim 300
        file.writelines(new_vec)

    output(f"save at <{new_path}>")
Example No. 6
import os
import random
import numpy as np
import torch

def set_seed(seed: int = 123):
    """Seed the Python, NumPy and PyTorch (all CUDA devices) RNGs."""
    output(f"Process id: {os.getpid()}, cuda: {_ARGS.cuda}, set seed {seed}")
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
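
set_seed covers the Python, NumPy and PyTorch RNGs. For fully reproducible GPU runs, cuDNN is usually pinned as well; an optional extension, not part of the original:

import torch

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False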
Example No. 7
def test_result(cfg: Config,
                vocab,
                dataset,
                device,
                parser,
                ensemble_path=None):
    model = build_model(vocab=vocab, depsawr=parser, **cfg['model'])
    para_num = sum(np.prod(list(p.size())) for p in model.parameters())
    output(f'param num: {para_num}, {para_num / 1000000:.4f}M')
    model.to(device=device)
    optimizer = build_optimizer(model, **cfg['optim'])
    trainer = Trainer(cfg, dataset, vocab, model, optimizer, None, None,
                      device, **cfg['trainer'])
    trainer.load()

    tensor_op = tensor_avg
    if ensemble_path and os.path.isfile(ensemble_path):
        # if ensemble_path is given, load that model for ensembling
        another = build_model(vocab=vocab, depsawr=parser, **cfg['model'])
        another.to(device=device)
        checkpoint = torch.load(ensemble_path, map_location=device)
        another.load_state_dict(checkpoint['model'])
        another.test_mode(device)
        print(f"===> model loaded from <{ensemble_path}>")

        # the CRF transition matrix is also combined with the same strategy
        tran = tensor_op(model.crf.transitions.data,
                         another.crf.transitions.data)
        model.crf.transitions.data = tran
    else:
        another = None

    table = [[
        'sentence_id', 'length', 'word_id', 'word', 'pos', 'indicator',
        'label', 'prediction'
    ]]

    def process_one(self, one_set, name, device, batch_size, epoch=None):
        """Evaluate `one_set`; `epoch is None` means test stage."""
        loader = self.get_loader(one_set, batch_size)
        len_loader = len(loader)
        losses = torch.zeros(len_loader, device=device)

        for i, batch in enumerate(loader):
            batch = to_device(batch, device)
            model_output = self.model(**batch)
            losses[i] = model_output['loss'].item()
            if another:
                # ensemble prediction: combine the scores of both models
                scores = another(**batch)['scores']
                scores = tensor_op(scores, model_output['scores'])
                best_paths = self.model.crf.viterbi_tags(
                    scores, batch['mask'], 1)
                model_output['predicted_tags'] = cast(
                    List[List[int]], [x[0][0] for x in best_paths])

            # record test results
            for j, predicted in enumerate(model_output['predicted_tags']):
                sid = i * batch_size + j
                length = batch['seq_lens'][j]
                for n, word in enumerate(batch['sentences'][j]):
                    pos = vocab.index_to_token(batch['upostag'][j, n].item(),
                                               'upostag')
                    label = vocab.index_to_token(batch['labels'][j, n].item(),
                                                 'labels')
                    indicator = batch['indicator'][j, n].item()
                    prediction = vocab.index_to_token(predicted[n], 'labels')
                    table.append([
                        sid, length, n, word, pos, indicator, label, prediction
                    ])

        metric_counter = copy.deepcopy(self.model.metric.counter)
        metric = self.model.get_metrics(reset=True)
        if epoch is not None and self.writer is not None:
            metric['loss'] = losses.mean()
            self.add_scalars('Very_Detail', metric, epoch, name)
            self.writer.flush()
        elif epoch is None:
            output(f"Test {name} compete, {format_metric(metric)}")
        return metric_counter, metric, losses

    # override the trainer's processing function; bind `trainer` as self,
    # since a bare function assigned to the instance would not receive it
    trainer.process_one = process_one.__get__(trainer)
    trainer.test(dataset['test'], 64)

    with codecs.open(f"./dev/result/{trainer.prefix}.csv",
                     mode='w',
                     encoding='UTF-8') as file:
        writer = csv.writer(file)
        writer.writerows(table)
    output(f"saved <./dev/result/{trainer.prefix}.csv>")
Example No. 8
import pickle

def read_data(name):
    # PATH and output() come from module scope
    with open(f"{PATH}data-{name}.bin", 'rb') as f:
        output(f"===> loading from <{PATH}data-{name}.bin>")
        return pickle.load(f)
Example No. 9
import pickle

def save_data(name, dataset, vocab, index=False):
    # PATH, output() and index_dataset() come from module scope
    if index:
        index_dataset(dataset, vocab)
    with open(f"{PATH}data-{name}.bin", 'wb') as f:
        pickle.dump((dataset, vocab), f)
    output(f"===> saved at <{PATH}data-{name}.bin>")