Example #1
0
def evaluate(data_source):
    """Return the average per-token loss of `model` over `data_source`.

    The data is sliced into windows stepped by ``args.validseqlen``; the
    leading ``eff_history`` positions of each window are context only and
    are not scored, mirroring the training loop.

    Relies on module-level globals: model, args, criterion, n_words,
    get_batch, torch.
    """
    model.eval()
    total_loss = 0
    processed_data_size = 0
    # Evaluation needs no gradients; no_grad saves memory and time.
    with torch.no_grad():
        for i in range(0, data_source.size(1) - 1, args.validseqlen):
            # Skip a trailing window too short to hold a full sequence.
            if i + args.seq_len - args.validseqlen >= data_source.size(1) - 1:
                continue
            data, targets = get_batch(data_source, i, args, evaluation=True)

            # Discard the effective history, just like in training
            eff_history = args.seq_len - args.validseqlen
            final_target = targets[:, eff_history:].contiguous().view(-1)
            if args.causal_stack:
                # Build one row of `eff_history` context per predicted token
                # and score only the last output position of each row.
                # NOTE(review): assumes data is (batch, seq) token ids —
                # confirm against get_batch.
                batchsize = data.shape[0]
                valseqlen = min(args.validseqlen, data.shape[1] - eff_history)
                causal_stack = torch.vstack([
                    data[:, j:j + eff_history]
                    for j in range(1, valseqlen + 1)
                ]).reshape(valseqlen, batchsize, eff_history)
                final_output = model(
                    causal_stack.permute(1, 0, 2).reshape(
                        len(final_target), eff_history))[:, -1].contiguous()
            else:
                output = model(data)
                final_output = output[:, eff_history:].contiguous().view(
                    -1, n_words)
            loss = criterion(final_output, final_target)

            # Note that we don't add TAR loss here
            # Weight each window's mean loss by its number of scored positions.
            total_loss += (data.size(1) - eff_history) * loss.item()
            processed_data_size += data.size(1) - eff_history

        return total_loss / processed_data_size
Example #2
0
def evaluate(data_source):
    """Return the average per-token loss of `model` over `data_source`.

    Relies on module-level globals: model, args, criterion, n_words,
    get_batch, torch.
    """
    # model.eval() (vs model.train()) switches layers such as BatchNorm and
    # Dropout into evaluation mode: the full network runs without dropout
    # during testing.
    model.eval()
    total_loss = 0
    processed_data_size = 0
    # torch.no_grad() is a context manager: code wrapped in it does not
    # track gradients, since the network is not being updated here.
    with torch.no_grad():
        for i in range(0, data_source.size(1) - 1, args.validseqlen):
            # Skip a trailing window too short to hold a full sequence.
            if i + args.seq_len - args.validseqlen >= data_source.size(1) - 1:
                continue
            data, targets = get_batch(data_source, i, args, evaluation=True)
            output = model(data)

            # Discard the effective history, just like in training
            eff_history = args.seq_len - args.validseqlen
            # contiguous() is required before view(), which reshapes a tensor.
            final_output = output[:, eff_history:].contiguous().view(-1, n_words)
            final_target = targets[:, eff_history:].contiguous().view(-1)

            loss = criterion(final_output, final_target)

            # Note that we don't add TAR loss here
            total_loss += (data.size(1) - eff_history) * loss.item()
            processed_data_size += data.size(1) - eff_history
        return total_loss / processed_data_size
Example #3
0
def train():
    """Run one training epoch over `train_data`, logging to TensorBoard.

    Relies on module-level globals: model, optimizer, criterion, args,
    epoch, lr, n_words, writer, train_data, write_graph, get_batch, torch.
    """
    # Turn on training mode which enables dropout.
    global writer
    global train_data
    global write_graph
    model.train()
    total_loss = 0
    start_time = time.time()
    for batch_idx, i in enumerate(
            range(0,
                  train_data.size(1) - 1, args.validseqlen)):
        # Skip a trailing window too short to hold a full sequence.
        if i + args.seq_len - args.validseqlen >= train_data.size(1) - 1:
            continue
        data, targets = get_batch(train_data, i, args)
        optimizer.zero_grad()
        if write_graph:
            # Log the model graph once per run.
            writer.add_graph(model, data)
            write_graph = False

        output = model(data)

        # Discard the effective history part
        eff_history = args.seq_len - args.validseqlen
        if eff_history < 0:
            raise ValueError(
                "Valid sequence length must be smaller than sequence length!")
        final_target = targets[:, eff_history:].contiguous().view(-1)
        final_output = output[:, eff_history:].contiguous().view(-1, n_words)
        loss = criterion(final_output, final_target)

        loss.backward()
        if args.clip > 0:
            # clip_grad_norm was deprecated (and later removed); use the
            # supported in-place variant clip_grad_norm_.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # Accumulate as a detached Python float rather than a tensor.
        total_loss += loss.item()

        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch_idx,
                    train_data.size(1) // args.validseqlen, lr,
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            writer.add_scalar('loss', cur_loss, batch_idx + 1)
            writer.add_scalar('perplexity', math.exp(cur_loss), batch_idx + 1)
            # Supply global_step, consistent with the scalars above;
            # without it every LR point lands on the same step.
            writer.add_scalar('learning rate', lr, batch_idx + 1)
            total_loss = 0
            start_time = time.time()
Example #4
0
def evaluate(data_source):
    """Return the average per-token loss of `model` over `data_source`.

    Relies on module-level globals: model, args, criterion, n_words,
    get_batch, torch.
    """
    model.eval()
    total_loss = 0
    processed_data_size = 0
    # Disable autograd during evaluation: no gradients are needed, and
    # tracking them wastes memory.
    with torch.no_grad():
        for i in range(0, data_source.size(1) - 1, args.validseqlen):
            # Skip a trailing window too short to hold a full sequence.
            if i + args.seq_len - args.validseqlen >= data_source.size(1) - 1:
                continue
            data, targets = get_batch(data_source, i, args, evaluation=True)
            output = model(data)

            # Discard the effective history, just like in training
            eff_history = args.seq_len - args.validseqlen
            final_output = output[:, eff_history:].contiguous().view(-1, n_words)
            final_target = targets[:, eff_history:].contiguous().view(-1)

            loss = criterion(final_output, final_target)

            # Note that we don't add TAR loss here
            # Use .item(): indexing a 0-dim loss tensor (total_loss[0])
            # raises IndexError on PyTorch >= 0.4.
            total_loss += (data.size(1) - eff_history) * loss.item()
            processed_data_size += data.size(1) - eff_history
    return total_loss / processed_data_size
Example #5
0
def evaluate(data_source):
    """Return the average per-token loss of `model` over `data_source`.

    Relies on module-level globals: model, args, criterion, n_words,
    get_batch, torch.
    """
    model.eval()
    total_loss = 0
    processed_data_size = 0
    # Disable autograd during evaluation: no gradients are needed, and
    # tracking them wastes memory.
    with torch.no_grad():
        for i in range(0, data_source.size(1) - 1, args.validseqlen):
            # Skip a trailing window too short to hold a full sequence.
            if i + args.seq_len - args.validseqlen >= data_source.size(1) - 1:
                continue
            data, targets = get_batch(data_source, i, args, evaluation=True)
            output = model(data)

            # Discard the effective history, just like in training
            eff_history = args.seq_len - args.validseqlen
            final_output = output[:, eff_history:].contiguous().view(-1, n_words)
            final_target = targets[:, eff_history:].contiguous().view(-1)

            loss = criterion(final_output, final_target)

            # Note that we don't add TAR loss here
            # Use .item(): indexing a 0-dim loss tensor (total_loss[0])
            # raises IndexError on PyTorch >= 0.4.
            total_loss += (data.size(1) - eff_history) * loss.item()
            processed_data_size += data.size(1) - eff_history
    return total_loss / processed_data_size
Example #6
0
def train():
    """Run one training epoch over `train_data`, printing loss/ppl periodically.

    Relies on module-level globals: model, optimizer, criterion, args,
    epoch, lr, n_words, train_data, get_batch, torch.
    """
    # Turn on training mode which enables dropout.
    global train_data
    model.train()
    total_loss = 0
    start_time = time.time()
    for batch_idx, i in enumerate(range(0, train_data.size(1) - 1, args.validseqlen)):
        # Skip a trailing window too short to hold a full sequence.
        if i + args.seq_len - args.validseqlen >= train_data.size(1) - 1:
            continue
        data, targets = get_batch(train_data, i, args)
        optimizer.zero_grad()
        output = model(data)

        # Discard the effective history part
        eff_history = args.seq_len - args.validseqlen
        if eff_history < 0:
            raise ValueError("Valid sequence length must be smaller than sequence length!")
        final_target = targets[:, eff_history:].contiguous().view(-1)
        final_output = output[:, eff_history:].contiguous().view(-1, n_words)
        loss = criterion(final_output, final_target)

        loss.backward()
        if args.clip > 0:
            # clip_grad_norm was deprecated (and later removed); use the
            # supported in-place variant clip_grad_norm_.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()

        # Accumulate as a Python float: indexing a 0-dim loss tensor
        # (total_loss[0]) raises IndexError on PyTorch >= 0.4.
        total_loss += loss.item()

        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch_idx, train_data.size(1) // args.validseqlen, lr,
                elapsed * 1000 / args.log_interval, cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()
Example #7
0
def train(ep):
    """Run one training epoch over `train_data` for epoch index `ep`.

    Supports two modes: the normal path feeds whole windows to the model;
    the `args.causal_stack` path builds one `eff_history`-long context row
    per predicted token and processes those rows in
    `args.accumulation_rounds` gradient-accumulation chunks.

    Relies on module-level globals: model, optimizer, criterion, args,
    epoch, n_words, writer, train_data, get_batch, torch.
    """
    # Turn on training mode which enables dropout.
    global train_data
    model.train()
    # train_loss resets at each log interval; total_loss spans the epoch.
    train_loss, total_loss = 0, 0
    start_time = time.time()
    for batch_idx, i in enumerate(
            range(0,
                  train_data.size(1) - 1, args.validseqlen)):
        # Skip a trailing window too short to hold a full sequence.
        if i + args.seq_len - args.validseqlen >= train_data.size(1) - 1:
            continue
        data, targets = get_batch(train_data, i, args)
        optimizer.zero_grad()

        # Discard the effective history part
        eff_history = args.seq_len - args.validseqlen
        if eff_history < 0:
            raise ValueError(
                "Valid sequence length must be smaller than sequence length!")
        final_target = targets[:, eff_history:].contiguous().view(-1)
        if args.causal_stack:
            valseqlen = min(args.validseqlen, data.shape[1] - eff_history)
            # Stack one context row per predicted token, then flatten to
            # (len(final_target), eff_history) in target order.
            # NOTE(review): assumes data is (batch, seq) token ids with
            # data.shape[0] == args.batch_size — confirm against get_batch.
            causal_stack = torch.vstack([
                data[:, j:j + eff_history] for j in range(1, valseqlen + 1)
            ]).reshape(valseqlen, args.batch_size,
                       eff_history).permute(1, 0,
                                            2).reshape(len(final_target),
                                                       eff_history)
            # Split the rows into accumulation_rounds chunks; the remainder
            # is merged into the last interval.
            # NOTE(review): span == 0 when len(final_target) <
            # accumulation_rounds would make range() raise — unguarded here.
            span = len(final_target) // args.accumulation_rounds
            intervals = [(offset, offset + span)
                         for offset in range(0, len(final_target), span)]
            if len(intervals) > args.accumulation_rounds:
                intervals = intervals[:-1]
                intervals[-1] = (intervals[-1][0], len(final_target))
            for a, b in intervals:
                # Scale each chunk's loss so the accumulated gradient equals
                # that of a single full-batch backward pass.
                loss = criterion(
                    model(causal_stack[a:b])[:, -1].contiguous(),
                    final_target[a:b]) / args.accumulation_rounds
                loss.backward()
                train_loss += loss.item()
                total_loss += loss.item()
        else:
            output = model(data)
            final_output = output[:,
                                  eff_history:].contiguous().view(-1, n_words)
            loss = criterion(final_output, final_target)
            loss.backward()
            train_loss += loss.item()
            total_loss += loss.item()

        if args.clip > 0:
            # NOTE(review): clips model.model_weights() — presumably a
            # project-specific subset of parameters; verify its definition.
            torch.nn.utils.clip_grad_norm_(model.model_weights(), args.clip)
        optimizer.step()

        if batch_idx % args.log_interval == 0 and batch_idx > 0:
            cur_loss = train_loss / args.log_interval
            elapsed = time.time() - start_time
            print(
                '| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.5f} | ms/batch {:5.5f} | '
                'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch_idx,
                    train_data.size(1) // args.validseqlen,
                    optimizer.optimizers[0].param_groups[0]['lr'],
                    elapsed * 1000 / args.log_interval, cur_loss,
                    math.exp(cur_loss)))
            train_loss = 0
            start_time = time.time()

    # Log the epoch-mean loss; batch_idx is the last seen loop index.
    writer.add_scalar('train/loss', total_loss / (batch_idx + 1.), ep)