Example #1
def load_config_from_file(obj, config_format, folder_name, file_name):
    try:
        try:
            f = open(here(file_name))
        except OSError:
            loc = user_config_location(folder_name, file_name)
            f = open(loc)
        cfg = f.read()
        f.close()
        load_config(obj, config_format, cfg)
        return True
    except (OSError, IOError):
        return False
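
The `here` and `user_config_location` helpers are not shown in this snippet. A minimal sketch of what they might look like, assuming `here` resolves paths relative to the module and `user_config_location` builds a per-user path (both layouts are assumptions, not the project's actual implementation):

    import os

    def here(file_name):
        # resolve a path relative to this module's directory (assumed behaviour)
        return os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name)

    def user_config_location(folder_name, file_name):
        # hypothetical per-user config path, e.g. ~/.folder_name/file_name
        return os.path.join(os.path.expanduser('~'), '.' + folder_name, file_name)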
Example #2
def save_config_to_file(obj, config_format, folder_name, file_name):
    try:
        cfg = save_config(obj, config_format)
        try:
            f = open(here(file_name), 'w')
        except OSError:
            loc = user_config_location(folder_name, file_name)
            # assuming os.makedirs semantics: create the parent folder, not the file path itself
            makedirs(os.path.dirname(loc), exist_ok=True)
            f = open(loc, 'w')
        f.write(cfg)
        f.close()
        return True
    except (OSError, IOError):
        return False
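
A hypothetical round trip with these two functions. The `settings` object, the 'toml' format tag, and the file names are made up for illustration; `save_config`/`load_config` are assumed to handle the serialization:

    settings = MyAppSettings()  # hypothetical config object
    if save_config_to_file(settings, 'toml', 'myapp', 'settings.toml'):
        load_config_from_file(settings, 'toml', 'myapp', 'settings.toml')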
Example #3
def go(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # seed the RNG with the drawn value
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data
    arg.path = here('data') if arg.path is None else arg.path
    data_train, data_val, data_test = read_dataset(arg.path, arg.dataset)

    # create the model
    model = GTransformer(emb=arg.embedding_size,
                         heads=arg.num_heads,
                         depth=arg.depth,
                         seq_length=arg.context,
                         num_tokens=NUM_TOKENS,
                         wide=arg.wide)

    if torch.cuda.is_available():
        model.cuda()

    print("Model parameters = %d" % sum(p.numel() for p in model.parameters()))

    if not arg.radam:
        opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())
        # linear learning rate warmup
        sch = torch.optim.lr_scheduler.LambdaLR(
            opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))
    else:
        opt = RAdam(model.parameters(), lr=arg.lr)

    if USE_APEX:
        model, opt = amp.initialize(model, opt, opt_level="O1", verbosity=0)

    best_bpb = np.inf
    best_step = 0

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size, ),
                               low=0,
                               high=data_train.size(0) - arg.context - 1)
        seqs_source = [
            data_train[start:start + arg.context] for start in starts
        ]
        seqs_target = [
            data_train[start + 1:start + arg.context + 1] for start in starts
        ]
        source = torch.cat([s[None, :] for s in seqs_source],
                           dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target],
                           dim=0).to(torch.long)
        # - target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')
        #tbw.add_scalar('transformer/train-loss', float(loss.item()) * LOG2E, i * arg.batch_size)

        if not USE_APEX:
            loss.backward()
        else:
            with amp.scale_loss(loss, opt) as scaled_loss:
                scaled_loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()

        if not arg.radam:
            sch.step()

        # - validate every {arg.test_every} steps. First we compute the
        #   compression on the validation (or a subset)
        #   then we generate some random text to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            upto = arg.test_subset if arg.test_subset else data_val.size(0)
            data_sub = data_val[:upto]

            bits_per_byte = calculate_bpb(arg, model, data_sub)

            # print validation performance. 1 bit per byte is (currently) state of the art.
            print(f'step {i}: {bits_per_byte:.4} bits per byte')

            tag_scalar_dict = {
                'train-loss': float(loss.item()) * LOG2E,
                'eval-loss': bits_per_byte
            }
            tbw.add_scalars('transformer/loss', tag_scalar_dict,
                            i * arg.batch_size)

            if bits_per_byte < best_bpb:
                best_bpb = bits_per_byte
                best_step = i
                torch.save(model.state_dict(),
                           os.path.join(arg.tb_dir, 'best_model.pt'))

            print(f'best step {best_step}: {best_bpb:.4} bits per byte')

            generate_sequence(arg, model, data_val)

    # load the best model, calculate bpb of the test data and generate some random text
    finalize(arg, model, data_test)
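
The random-subsequence sampling above is written inline here, but later examples call a `sample_batch` helper for the same step. A minimal sketch consistent with the slicing in this loop (the helper's real implementation may differ):

    import torch

    def sample_batch(data, length, batch_size):
        # pick random start positions, then slice (source, target) pairs where
        # the target is the source shifted one token ahead (next-token prediction)
        starts = torch.randint(low=0, high=data.size(0) - length - 1,
                               size=(batch_size,))
        source = torch.stack([data[s:s + length] for s in starts]).to(torch.long)
        target = torch.stack([data[s + 1:s + length + 1] for s in starts]).to(torch.long)
        return source, target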
Example #4
File: data.py Project: pbloem/embed
def load(name, limit=None):
    """
    Loads a knowledge graph dataset for link prediction purposes.

    :param name: Dataset name. "fb" for FB15k-237, "wn" for WN18RR, "toy" for a small toy dataset for testing.
    :param limit: If set, the total number of triples per set will be limited to this value. Useful for debugging.
    :return: Three lists of integer triples (train, val, test), a pair of dicts to map entity strings from and to their
        integer ids, and a similar pair of dicts for the relations.
    """

    if name == 'fb':  # Freebase 15k 237
        train_file = util.here('data/fb15k237/train.txt')
        val_file = util.here('data/fb15k237/valid.txt')
        test_file = util.here('data/fb15k237/test.txt')

    elif name == 'wn':
        train_file = util.here('data/wn18rr/train.txt')
        val_file = util.here('data/wn18rr/valid.txt')
        test_file = util.here('data/wn18rr/test.txt')

    else:
        if os.path.isdir(util.here('data' + os.sep + name)):
            train_file = util.here(f'data/{name}/train.txt')
            val_file = util.here(f'data/{name}/valid.txt')
            test_file = util.here(f'data/{name}/test.txt')

        else:
            raise Exception(
                f'Could not find dataset with name {name} at location {util.here("data" + os.sep + name)}.'
            )

    train = load_strings(train_file)
    val = load_strings(val_file)
    test = load_strings(test_file)

    if limit:
        train = train[:limit]
        val = val[:limit]
        test = test[:limit]

    # mappings for nodes (n) and relations (r)
    nodes, rels = set(), set()
    for triple in train + val + test:
        nodes.add(triple[0])
        rels.add(triple[1])
        nodes.add(triple[2])

    i2n, i2r = list(nodes), list(rels)
    n2i = {n: i for i, n in enumerate(i2n)}
    r2i = {r: i for i, r in enumerate(i2r)}

    traini, vali, testi = [], [], []

    for s, p, o in train:
        traini.append([n2i[s], r2i[p], n2i[o]])

    for s, p, o in val:
        vali.append([n2i[s], r2i[p], n2i[o]])

    for s, p, o in test:
        testi.append([n2i[s], r2i[p], n2i[o]])

    train, val, test = torch.tensor(traini), torch.tensor(vali), torch.tensor(testi)

    return train, val, test, (n2i, i2n), (r2i, i2r)
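
Hypothetical usage of `load`, assuming the dataset files are present under `data/`:

    train, val, test, (n2i, i2n), (r2i, i2r) = load('fb', limit=1000)

    print(train.shape)             # (num_triples, 3): (subject, relation, object) ids
    s, p, o = train[0].tolist()
    print(i2n[s], i2r[p], i2n[o])  # map the first triple back to its strings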
Example #5
def go_pods(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # seed the RNG with the drawn value
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    df = pd.read_csv(here('./data/df_popular_podcasts.csv'))

    with open(here('./data/genre_IDs.txt')) as file:
        glist = eval(file.read())  # the file stores a Python literal; ast.literal_eval would be safer
        glist = {int(idstr): name for (idstr, name) in glist}
        rlist = {name: gid for (gid, name) in glist.items()}  # reverse lookup: genre name -> id

    gs = set()
    for genres in df['Genre IDs']:
        genres = eval(genres)
        for genre in genres:
            g = int(genre)
            gs.add(g)

    i2g = list(gs)
    g2i = {g: i for i, g in enumerate(i2g)}

    train, val, test = df.iloc[:8000], df.iloc[8000:9000], df.iloc[9000:]

    # create the model
    model = GPT2Wrapper(iblocks=arg.iblocks,
                        csize=len(i2g),
                        gptname=arg.gpt_name)

    if arg.checkpoint is not None:
        model.load_state_dict(
            torch.load(arg.checkpoint, map_location=torch.device('cpu')))

    if torch.cuda.is_available():
        model.to('cuda')
        model.model.mod[0].to('cuda')

    tok = model.tokenizer
    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())
    # sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))
    # -- linear learning rate warmup

    # training loop
    # -- note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    seen = 0
    for e in range(arg.epochs):

        if e % arg.print_every == 0:
            with torch.no_grad():

                # Generate 10 titles from the seed
                genres = torch.zeros(1, len(i2g))
                for genre in PD_GENRES:
                    # print(glist[genre])
                    genres[0, g2i[genre]] = 1.0

                for i in range(10):

                    # generate and print some random text
                    seed = PD_SEED
                    input = torch.tensor(tok.encode(seed))

                    if torch.cuda.is_available():
                        input, genres = input.to('cuda'), genres.to('cuda')

                    outseq = []
                    for _ in range(PD_TITLE_LENTGH):
                        output = model(input[None, :], cond=genres)
                        c = sample(output[0, -1, :], arg.sampling_temp)
                        outseq.append(c)

                        input = torch.cat([input, c], dim=0)

                    outseq = torch.cat(outseq, dim=0)
                    outseq = model.tokenizer.decode(outseq)

                    with open(f'pd.e{e:03}i{i:02}.txt', 'w') as file:
                        print(outseq[len(PD_SEED):], file=file)
                        print(
                            '---------------------------------------------\n',
                            file=file)

                        print(PD_SEED + outseq, file=file)

                # Generate 10 random podcasts
                for i in range(10):
                    # generate a random genre
                    random_genre = random.choice(list(glist.keys()))

                    genres = torch.zeros(1, len(i2g))
                    genres[0, g2i[random_genre]] = 1.0

                    # generate and print some random text
                    seed = 'description: '
                    input = torch.tensor(tok.encode(seed))

                    if torch.cuda.is_available():
                        input, genres = input.to('cuda'), genres.to('cuda')

                    outseq = []
                    for _ in range(arg.print_size):
                        output = model(input[None, :], cond=genres)
                        c = sample(output[0, -1, :], arg.sampling_temp)
                        outseq.append(c)

                        input = torch.cat([input, c], dim=0)

                    outseq = torch.cat(outseq, dim=0)
                    outseq = model.tokenizer.decode(outseq)

                    with open(f'random.e{e:03}i{i:02}.txt', 'w') as file:
                        print('chosen genre ', glist[random_genre], file=file)
                        print('---------------------------------------------',
                              file=file)
                        print(seed, file=file)
                        print(outseq, flush=True, file=file)

        for fr in tqdm.trange(0, len(train), arg.batch_size):

            to = min(len(train), fr + arg.batch_size)

            dfbatch = train.iloc[fr:to]  # slice from the training split, not the full frame
            texts, genres = tobatch(dfbatch,
                                    tok,
                                    g2i,
                                    limit=arg.desc_clip,
                                    glist=glist)

            b = texts.size(0)
            source = torch.cat(
                [torch.empty(b, 1, dtype=torch.long).fill_(0), texts], dim=1)
            target = torch.cat(
                [texts, torch.empty(b, 1, dtype=torch.long).fill_(0)], dim=1)

            seen += b

            opt.zero_grad()

            if arg.dropout > 0.0:
                # word dropout on the input (helps the model use the conditionals):
                # keep each token with probability (1 - dropout), zero out the rest
                source = source * torch.empty_like(source).bernoulli_(
                    1.0 - arg.dropout)

            if torch.cuda.is_available():
                source, target, genres = source.to('cuda'), target.to(
                    'cuda'), genres.to('cuda')

            output = model(source, cond=genres)

            loss = F.cross_entropy(output.transpose(2, 1),
                                   target,
                                   reduction='mean')
            tbw.add_scalar('podcasts/train-loss',
                           float(loss.item()) * LOG2E, seen)

            loss.backward()

            # clip gradients
            # - If the total gradient vector has a length > 1, we clip it back down to 1.
            if arg.gradient_clipping > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(),
                                         arg.gradient_clipping)

            opt.step()
            # sch.step()

            del loss, source, target, genres
            model.clear()

            # for obj in gc.get_objects():
            #     try:
            #         if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            #             if obj.size(0) == b:
            #                 print(type(obj), obj.size())
            #     except:
            #         pass

        torch.save(model.state_dict(), './checkpoint.model')

        # - validate every {arg.test_every} steps. First we compute the
        #   compression on the validation (or a subset)
        #   then we generate some random text to monitor progress
        # if e != 0 and (e % arg.print_every == 0 or e == arg.epochs - 1):

        print('multipliers:')
        for block in model.iblocks:
            print('    ', block.mult)
        print()
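
The word-dropout step above zeroes out input tokens at random so that the model is pushed to rely on the genre conditionals. A minimal standalone sketch of the masking, assuming token id 0 acts as the blank/padding token as in the `source`/`target` construction above:

    import torch

    x = torch.randint(1, 100, (2, 8))  # a toy batch of token ids
    p_drop = 0.1
    keep = torch.empty_like(x).bernoulli_(1.0 - p_drop)  # 1 = keep, 0 = drop
    x_dropped = x * keep  # dropped positions become token id 0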
Example #6
def go(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # seed the RNG with the drawn value
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data

    str_train, str_val, str_test = load_text(arg.data)
    str_train, str_test = (str_train + str_val, str_test) \
                            if arg.final else (str_train, str_val)

    # create the model
    model = GPT2Wrapper(iblocks=arg.iblocks)

    if torch.cuda.is_available():
        model.to('cuda')
        model.model.mod[0].to('cuda')

    # tokenize the data
    data_train, data_val, data_test = \
        torch.tensor(model.tokenizer.encode(str_train)), \
        torch.tensor(model.tokenizer.encode(str_val)), \
        torch.tensor(model.tokenizer.encode(str_test))

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())
    # sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))
    # -- linear learning rate warmup

    # training loop
    # -- note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size, ),
                               low=0,
                               high=data_train.size(0) - model.ctx - 1)
        seqs_source = [data_train[start:start + model.ctx] for start in starts]
        seqs_target = [
            data_train[start + 1:start + model.ctx + 1] for start in starts
        ]

        source = torch.cat([s[None, :] for s in seqs_source],
                           dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target],
                           dim=0).to(torch.long)
        # -- target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.to('cuda'), target.to('cuda')

        output = model(source)

        loss = F.cross_entropy(output.transpose(2, 1),
                               target,
                               reduction='mean')
        tbw.add_scalar('podcasts/train-loss',
                       float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()
        # sch.step()

        model.clear()

        # - validate every {arg.test_every} steps. First we compute the
        #   compression on the validation (or a subset)
        #   then we generate some random text to monitor progress
        if i != 0 and (i % arg.print_every == 0 or i == arg.num_batches - 1):

            with torch.no_grad():

                # generate and print some random text
                seedfr = random.randint(
                    0,
                    data_test.size(0) - arg.print_seed_size)
                input = data_test[seedfr:seedfr + arg.print_seed_size].to(
                    torch.long)

                if torch.cuda.is_available():
                    input = input.cuda()

                # print the seed
                strinput = model.tokenizer.decode(input)
                print(f'[{strinput}]', end='')

                outseq = []
                for _ in range(arg.print_size):
                    output = model(input[None, :])
                    c = sample(output[0, -1, :], arg.sampling_temp)
                    outseq.append(c[None])

                    input = torch.cat([input[1:], c[None]], dim=0)

                outseq = torch.cat(outseq, dim=0)
                outseq = model.tokenizer.decode(outseq)

                print(outseq)

        # val
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            with torch.no_grad():

                upto = data_test.size(
                    0) if i == arg.num_batches - 1 else arg.test_subset
                data_sub = data_test[:upto]

                bits = 0.0
                batch = []  # buffer; every time it fills up, we run it through the model

                for current in range(data_sub.size(0)):

                    fr = max(0, current - model.ctx)
                    to = current + 1

                    context = data_sub[fr:to].to(torch.long)
                    if context.size(0) < model.ctx + 1:
                        pad = torch.zeros(size=(model.ctx + 1 -
                                                context.size(0), ),
                                          dtype=torch.long)
                        context = torch.cat([pad, context], dim=0)

                        assert context.size(0) == model.ctx + 1

                    if torch.cuda.is_available():
                        context = context.cuda()

                    batch.append(context[None, :])

                    if len(batch) == arg.test_batchsize or current == data_sub.size(0) - 1:

                        # batch is full, run it through the model
                        b = len(batch)

                        alltokens = torch.cat(batch, dim=0)
                        source = alltokens[:, :-1]  # input
                        target = alltokens[:, -1]  # target values

                        output = model(source)

                        lnprobs = output[torch.arange(b, device=d()), -1,
                                         target]
                        log2probs = lnprobs * LOG2E  # convert from nats to bits

                        bits += -log2probs.sum()
                        batch = []  # empty buffer

                bits_per_byte = bits / data_sub.size(0)

                # print validation performance. 0.92 bit per byte is (currently) state of the art.
                print(f'step {i}: {bits_per_byte:.4} bits per byte')
                tbw.add_scalar('podcasts/eval-loss', bits_per_byte,
                               i * arg.batch_size)
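
Several of these loops decode with a `sample(lnprobs, temperature)` helper that is not shown. A minimal sketch of temperature sampling from a vector of log-probabilities (an assumption about the helper, not its actual implementation):

    import torch
    import torch.nn.functional as F
    import torch.distributions as dist

    def sample(lnprobs, temperature=1.0):
        # temperature 0 means greedy decoding; higher values flatten the distribution
        if temperature == 0.0:
            return lnprobs.argmax()
        p = F.softmax(lnprobs / temperature, dim=0)
        return dist.Categorical(p).sample()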
Example #7
def go(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # seed the RNG with the drawn value
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data

    data_train, data_val, data_test = enwik8(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
                            if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size,
                         heads=arg.num_heads,
                         depth=arg.depth,
                         seq_length=arg.context,
                         num_tokens=NUM_TOKENS)
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        # learning rate warmup
        # - we linearly increase the learning rate from 1e-10 to arg.lr over the first
        #   few thousand batches
        if arg.lr_warmup > 0 and i < arg.lr_warmup:
            lr = max((arg.lr / arg.lr_warmup) * i, 1e-10)
            for g in opt.param_groups:  # assigning to opt.lr would have no effect
                g['lr'] = lr

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size, ),
                               low=0,
                               high=data_train.size(0) - arg.context - 1)
        seqs_source = [
            data_train[start:start + arg.context] for start in starts
        ]
        seqs_target = [
            data_train[start + 1:start + arg.context + 1] for start in starts
        ]
        source = torch.cat([s[None, :] for s in seqs_source],
                           dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target],
                           dim=0).to(torch.long)
        # - target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')
        tbw.add_scalar('transformer/train-loss',
                       float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()

        # - validate every {arg.test_every} steps. First we compute the
        #   compression on the validation (or a subset)
        #   then we generate some random text to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            upto = data_test.size(
                0) if i == arg.num_batches - 1 else arg.test_subset
            data_sub = data_test[:upto]

            with torch.no_grad():
                bits = 0.0
                batch = []  # buffer; every time it fills up, we run it through the model

                for current in range(data_sub.size(0)):

                    fr = max(0, current - arg.context)
                    to = current + 1

                    context = data_sub[fr:to].to(torch.long)
                    if context.size(0) < arg.context + 1:
                        pad = torch.zeros(size=(arg.context + 1 -
                                                context.size(0), ),
                                          dtype=torch.long)
                        context = torch.cat([pad, context], dim=0)

                        assert context.size(0) == arg.context + 1

                    if torch.cuda.is_available():
                        context = context.cuda()

                    batch.append(context[None, :])

                    if len(batch) == arg.test_batchsize or current == data_sub.size(0) - 1:

                        # batch is full, run it through the model
                        b = len(batch)

                        alltokens = torch.cat(batch, dim=0)
                        source = alltokens[:, :-1]  # input
                        target = alltokens[:, -1]  # target values

                        output = model(source)

                        lnprobs = output[torch.arange(b, device=d()), -1,
                                         target]
                        log2probs = lnprobs * LOG2E  # convert from nats to bits

                        bits += -log2probs.sum()
                        batch = []  # empty buffer

                bits_per_byte = bits / data_sub.size(0)

                # print validation performance. 1 bit per byte is (currently) state of the art.
                print(f'step {i}: {bits_per_byte:.4} bits per byte')
                tbw.add_scalar('transformer/eval-loss', bits_per_byte,
                               i * arg.batch_size)

                # generate some random text
                GENSIZE = 600
                TEMP = 0.5
                seedfr = random.randint(0, data_test.size(0) - arg.context)
                input = data_test[seedfr:seedfr + arg.context].to(torch.long)

                if torch.cuda.is_available():
                    input = input.cuda()

                input = Variable(input)

                print('[', end='', flush=True)
                for c in input:
                    print(str(chr(c)), end='', flush=True)
                print(']', end='', flush=True)

                for _ in range(GENSIZE):
                    output = model(input[None, :])
                    c = sample(output[0, -1, :], TEMP)
                    print(str(chr(max(32, c))), end='', flush=True)

                    input = torch.cat([input[1:], c[None]], dim=0)

                print()
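
The bookkeeping in the validation loop reduces to one formula: sum the model's log-probabilities of the true next bytes, negate, convert from nats to bits, and divide by the number of bytes scored. As a minimal sketch:

    import math

    LOG2E = math.log2(math.e)  # multiply nats by this to get bits

    def bits_per_byte(lnprob_sum_nats, num_bytes):
        # negative log-likelihood in bits, averaged over the scored positions
        return -lnprob_sum_nats * LOG2E / num_bytes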
Example #8
def go(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # seed the RNG with the drawn value
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data

    data_train, data_val, data_test = enwik8(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
                            if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size,
                         heads=arg.num_heads,
                         depth=arg.depth,
                         seq_length=arg.context,
                         num_tokens=NUM_TOKENS,
                         attention_type=arg.attention_type)
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # Linear learning rate warmup
    sch = torch.optim.lr_scheduler.LambdaLR(
        opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))

    # Training loop
    # -- We don't loop over the data, instead we sample a batch of random subsequences each time. This is not strictly
    #    better or worse as a training method, it's just a little simpler.
    #
    instances_seen = 0
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        source, target = sample_batch(data_train,
                                      length=arg.context,
                                      batch_size=arg.batch_size)
        instances_seen += source.size(0)

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()

        tic()
        output = model(source)  # forward pass
        t = toc()

        # Compute the loss
        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')

        tbw.add_scalar('transformer/train-loss',
                       float(loss.item()) * LOG2E, instances_seen)
        tbw.add_scalar('transformer/time-forward', t, instances_seen)

        loss.backward()  # backward pass

        # clip gradients
        # -- If the total gradient vector has a length > x, we clip it back down to x.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()  # stochastic gradient descent step
        sch.step()  # update the learning rate

        # Validate every `arg.test_every` steps. First we compute the
        # compression on the validation data (or a subset),
        # then we generate some random text to monitor progress.
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):
            with torch.no_grad():

                ## Sample and print a random sequence

                # Slice a random seed from the test data, and sample a continuation from the model.
                seedfr = random.randint(0, data_test.size(0) - arg.context)
                seed = data_test[seedfr:seedfr + arg.context].to(torch.long)

                if torch.cuda.is_available():
                    seed = seed.cuda()

                sample_sequence(model,
                                seed=seed,
                                max_context=arg.context,
                                verbose=True,
                                length=arg.sample_length)

                ## Compute validation bits per byte

                upto = data_test.size(
                    0) if i == arg.num_batches - 1 else arg.test_subset
                data_sub = data_test[:upto]

                bits_per_byte = compute_compression(
                    model,
                    data_sub,
                    context=arg.context,
                    batch_size=arg.test_batchsize)
                # -- Since we're not computing gradients, we can increase the batch size a little from what we used in
                #    training.

                print(f'step {i}: {bits_per_byte:.4} bits per byte')
                tbw.add_scalar('transformer/eval-loss', bits_per_byte,
                               instances_seen)
Example #9
def load(name, final=False, limit=None, bidir=False, prune=False):
    """
    Loads a knowledge graph dataset. Self connections are automatically added as a special relation

    :param name: Dataset name ('aifb', 'am', 'mutag' or 'bgs' at the moment)
    :param final: If true, load the canonical test set, otherwise split a validation set off from the training data.
    :param limit: If set, the number of unique relations will be limited to this value, plus one for the self-connections,
                  plus one for the remaining connections combined into a single, new relation.
    :param bidir: Whether to include inverse links for each relation
    :param prune: Whether to prune edges that are further than two steps from the target labels
    :return: A tuple containing the graph data, and the classification train and test sets:
              - edges: dictionary of edges (relation -> pair of lists containing the subject and object indices respectively)
    """
    # -- Check if the data has been cached for quick loading.
    cachefile = util.here(
        f'data{S}{name}{S}cache_{"fin" if final else "val"}_{"pruned" if prune else "unpruned"}.pkl'
    )
    if os.path.isfile(cachefile) and limit is None:
        print('Using cached data.')
        with open(cachefile, 'rb') as file:
            data = pickle.load(file)
            print('Loaded.')
            return data

    print(
        'No cache found (or relation limit is set). Loading data from scratch.'
    )

    if name == 'aifb':
        # AIFB data (academics, affiliations, publications, etc. About 8k nodes)
        file = util.here('data/aifb/aifb_stripped.nt.gz')

        train_file = util.here('data/aifb/trainingSet.tsv')
        test_file = util.here('data/aifb/testSet.tsv')
        label_header = 'label_affiliation'
        nodes_header = 'person'

    elif name == 'am':
        # Collection of the Amsterdam Museum. Data is downloaded on first load.
        data_url = 'https://www.dropbox.com/s/1mp9aot4d9j01h9/am_stripped.nt.gz?dl=1'
        file = util.here('data/am/am_stripped.nt.gz')

        print('dataset file exists: ', os.path.isfile(file))
        if not os.path.isfile(file):
            print('Downloading AM data.')
            wget.download(data_url, file)

        train_file = util.here('data/am/trainingSet.tsv')
        test_file = util.here('data/am/testSet.tsv')
        label_header = 'label_cateogory'  # sic: the dataset's own column header contains this typo
        nodes_header = 'proxy'

    elif name == 'mutag':
        data_url = 'https://www.dropbox.com/s/qy8j3p8eacvm4ir/mutag_stripped.nt.gz?dl=1'
        file = util.here('data/mutag/mutag_stripped.nt.gz')
        if not os.path.isfile(file):
            print('Downloading MUTAG data.')
            wget.download(data_url, file)

        # task_file = util.here('data/mutag/completeDataset.tsv')
        train_file = util.here('data/mutag/trainingSet.tsv')
        test_file = util.here('data/mutag/testSet.tsv')
        label_header = 'label_mutagenic'
        nodes_header = 'bond'

    elif name == 'bgs':
        file = util.here('data/bgs/bgs_stripped.nt.gz')
        train_file = util.here('data/bgs/trainingSet(lith).tsv')
        test_file = util.here('data/bgs/testSet(lith).tsv')
        label_header = 'label_lithogenesis'
        nodes_header = 'rock'

    else:
        raise Exception(f'Data {name} not recognized')

    # -- Load the classification task
    labels_train = pd.read_csv(train_file, sep='\t', encoding='utf8')
    if final:
        labels_test = pd.read_csv(test_file, sep='\t', encoding='utf8')
    else:  # split the training data into train and validation
        ltr = labels_train
        pivot = int(len(ltr) * VALPROP)

        labels_test = ltr[:pivot]
        labels_train = ltr[pivot:]

    labels = labels_train[label_header].astype('category').cat.codes
    train = {}
    for nod, lab in zip(labels_train[nodes_header].values, labels):
        train[nod] = lab

    labels = labels_test[label_header].astype('category').cat.codes
    test = {}
    for nod, lab in zip(labels_test[nodes_header].values, labels):
        test[nod] = lab

    print('Labels loaded.')

    # -- Parse the data with RDFLib
    graph = rdf.Graph()

    if file.endswith('nt.gz'):
        with gzip.open(file, 'rb') as f:
            graph.parse(file=f, format='nt')
    else:
        graph.parse(file, format=rdf.util.guess_format(file))

    print('RDF loaded.')

    # -- Collect all node and relation labels
    if prune:
        triples = set()
        for node in list(train.keys()) + list(test.keys()):
            add_neighbors(triples, graph, URIRef(node), depth=2)

    else:
        triples = graph

    nodes = set()
    relations = Counter()

    for s, p, o in triples:
        nodes.add(st(s))
        nodes.add(st(o))

        relations[st(p)] += 1

        if bidir:
            relations[INV + st(p)] += 1

    i2n = list(nodes)  # maps indices to labels
    n2i = {n: i for i, n in enumerate(i2n)}  # maps labels to indices

    # Truncate the list of relations if necessary
    if limit is not None:
        i2r = [r[0] for r in relations.most_common(limit)] + [REST, INV + REST]
        # the 'limit' most frequent labels are maintained, the rest are combined into label REST to save memory
    else:
        i2r = list(relations.keys())

    r2i = {r: i for i, r in enumerate(i2r)}

    edges = {}

    # -- Collect all edges into a dictionary: relation -> (from, to)
    #    (only storing integer indices)
    for s, p, o in tqdm.tqdm(triples):
        s, p, o = n2i[st(s)], st(p), n2i[st(o)]

        pf = r2i[p] if (p in r2i) else r2i[REST]

        if pf not in edges:
            edges[pf] = [], []

        edges[pf][0].append(s)
        edges[pf][1].append(o)

        if bidir:
            pi = r2i[INV + p] if (INV + p in r2i) else r2i[INV + REST]

            if pi not in edges:
                edges[pi] = [], []

            edges[pi][0].append(o)
            edges[pi][1].append(s)

    # Add self connections explicitly
    edges[len(i2r)] = list(range(len(i2n))), list(range(len(i2n)))

    # for i in range(len(i2n)):
    #     edges[len(i2r)][0].append(i)
    #     edges[len(i2r)][1].append(i)

    print('Graph loaded.')

    # -- Cache the results for fast loading next time
    if limit is None:
        with open(cachefile, 'wb') as file:
            pickle.dump([edges, (n2i, i2n), (r2i, i2r), train, test], file)

    return edges, (n2i, i2n), (r2i, i2r), train, test
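
Hypothetical usage of the returned structures, assuming the AIFB files are present under `data/aifb/`:

    edges, (n2i, i2n), (r2i, i2r), train, test = load('aifb')

    froms, tos = edges[0]  # parallel lists of subject/object node ids for relation 0
    print(len(i2n), 'nodes,', len(i2r), 'relations (plus one for self-loops)')
    print(len(train), 'training labels,', len(test), 'test labels')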
Example #10
def load_lp(name, limit=None, bidir=False, prune=False):
    """
    Loads a knowledge graph dataset. Self connections are NOT automatically added

    :param name: Dataset name ('fb', 'wn', 'toy' or 'random' at the moment)
    :param limit: If set, the total number of triples per set will be limited to this value. Useful for debugging.
    :return: Three tensors of integer triples (train, val, test), and two pairs of dicts mapping nodes and relations
             to and from their integer ids
    """

    if name == 'random':
        return load_lp_random()

    if name == 'fb':  # Freebase 15k 237
        train_file = util.here('data/fb15k237/train.txt')
        val_file = util.here('data/fb15k237/valid.txt')
        test_file = util.here('data/fb15k237/test.txt')

    elif name == 'wn':
        train_file = util.here('data/wn18rr/train.txt')
        val_file = util.here('data/wn18rr/valid.txt')
        test_file = util.here('data/wn18rr/test.txt')

    elif name == 'toy':
        train_file = util.here('data/toy/train.txt')
        val_file = util.here('data/toy/valid.txt')
        test_file = util.here('data/toy/test.txt')

    else:
        raise Exception(f'Data {name} not recognized')

    train = load_strings(train_file)
    val = load_strings(val_file)
    test = load_strings(test_file)

    if limit:
        train = train[:limit]
        val = val[:limit]
        test = test[:limit]

    # mappings for nodes (n) and relations (r)
    nodes, rels = set(), set()
    for triple in train + val + test:
        nodes.add(triple[0])
        rels.add(triple[1])
        nodes.add(triple[2])

    i2n, i2r = list(nodes), list(rels)
    n2i = {n: i for i, n in enumerate(i2n)}
    r2i = {r: i for i, r in enumerate(i2r)}

    traini, vali, testi = [], [], []

    for s, p, o in train:
        traini.append([n2i[s], r2i[p], n2i[o]])

    for s, p, o in val:
        vali.append([n2i[s], r2i[p], n2i[o]])

    for s, p, o in test:
        testi.append([n2i[s], r2i[p], n2i[o]])

    train, val, test = torch.tensor(traini), torch.tensor(vali), torch.tensor(testi)

    return train, val, test, (n2i, i2n), (r2i, i2r)
Example #11
def go(arg):
    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # seed the RNG with the drawn value
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('../wiki_uk.txt') if arg.data is None else arg.data

    data_train, data_val, data_test = ukwiki(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
        if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size,
                         heads=arg.num_heads,
                         depth=arg.depth,
                         seq_length=arg.context,
                         num_tokens=NUM_TOKENS,
                         wide=arg.wide)
    if os.path.exists(MODEL_PATH):
        model.load_state_dict(torch.load(MODEL_PATH))
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())
    # linear learning rate warmup
    sch = torch.optim.lr_scheduler.LambdaLR(
        opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size, ),
                               low=0,
                               high=data_train.size(0) - arg.context - 1)
        if arg.masked:
            # clone the slices (the source copies are mutated by the masking below)
            seqs_source = [
                data_train[start:start + arg.context].clone()
                for start in starts
            ]
            seqs_target = [
                data_train[start:start + arg.context].clone()
                for start in starts
            ]
            for ss, st in zip(seqs_source, seqs_target):
                mask_indexes = torch.randint(1, arg.context,
                                             (arg.error_count, ))
                for ind in mask_indexes:
                    ss[ind] = torch.tensor(char_to_id['$'])
                # print(''.join([id_to_char[s.item()] for s in ss]))
                # print(''.join([id_to_char[t.item()] for t in st]))
        else:
            seqs_source = [
                data_train[start:start + arg.context] for start in starts
            ]
            seqs_target = [
                data_train[start + 1:start + arg.context + 1]
                for start in starts
            ]

        source = torch.cat([s[None, :] for s in seqs_source],
                           dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target],
                           dim=0).to(torch.long)
        # - in causal mode, target is the source shifted one character ahead;
        #   in masked mode, it is the uncorrupted sequence

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')
        tbw.add_scalar('transformer/train-loss',
                       float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()
        sch.step()

        # - validate every {arg.test_every} steps. First we compute the
        #   compression on the validation (or a subset)
        #   then we generate some random text to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            upto = data_test.size(
                0) if i == arg.num_batches - 1 else arg.test_subset
            data_sub = data_test[:upto]

            with torch.no_grad():
                bits, tot = 0.0, 0
                batch = [
                ]  # buffer, every time it fills up, we run it through the model

                # for current in range(data_sub.size(0)):

                #     fr = max(0, current - arg.context)
                #     to = current + 1

                #     context = data_sub[fr:to].to(torch.long)
                #     if context.size(0) < arg.context + 1:
                #         pad = torch.zeros(size=(arg.context + 1 - context.size(0),), dtype=torch.long)
                #         context = torch.cat([pad, context], dim=0)

                #         assert context.size(0) == arg.context + 1

                #     if torch.cuda.is_available():
                #         context = context.cuda()

                #     batch.append(context[None, :])

                #     if len(batch) == arg.test_batchsize or current == data_sub.size(0) - 1:

                #         # batch is full, run it through the model
                #         b = len(batch)

                #         all = torch.cat(batch, dim=0)
                #         source = all[:, :-1] # input
                #         target = all[:, -1]  # target values

                #         output = model(source)

                #         lnprobs = output[torch.arange(b, device=d()), -1, target]
                #         log2probs = lnprobs * LOG2E # convert from nats to bits

                #         bits += - log2probs.sum()
                #         batch = [] # empty buffer

                # bits_per_byte = bits / data_sub.size(0)

                # # print validation performance. 1 bit per byte is (currently) state of the art.
                # print(f'epoch{i}: {bits_per_byte:.4} bits per byte')
                # tbw.add_scalar(f'transformer/eval-loss', bits_per_byte, i * arg.batch_size)

                # generate some random text
                GENSIZE = 600
                TEMP = 0.5
                seedfr = random.randint(0, data_test.size(0) - arg.context)
                # input = data_test[seedfr:seedfr + arg.context].to(torch.long)
                test_msgs = [
                    "купила м$ма коника, а коник і шо",
                    "як тебе не лю$ити Києве мій коли",
                    "у л$сі лісі темному де ходить як"
                ]
                for test_msg in test_msgs:
                    test_data = np.zeros(arg.context)
                    test_data.fill(110)
                    test_data[0:len(test_msg)] = np.array(
                        [char_to_id[ch] for ch in test_msg])
                    input = torch.from_numpy(test_data).to(torch.long)

                    if torch.cuda.is_available():
                        input = input.cuda()

                    input = Variable(input)

                    print('[', end='', flush=True)
                    for c in input:
                        print(str(id_to_char[c.item()]), end='', flush=True)
                    print(']', end='', flush=True)

                    output = model(input[None, :])
                    out_string = ''.join([
                        id_to_char[ind.item()]
                        for ind in output[0].max(axis=1).indices
                    ])
                    # c = sample(output[0].max(axis=1), TEMP)
                    print("PRED: " + out_string)
                    print()

        # Save model
        torch.save(model.state_dict(), MODEL_PATH)