def load_config_from_file(obj, config_format, folder_name, file_name):
    """
    Load a configuration for `obj` from a file next to the script, falling back to the
    per-user config location. Returns True on success, False if no readable config was found.
    """
    try:
        try:
            f = open(here(file_name))
        except OSError:
            # fall back to the per-user config location
            loc = user_config_location(folder_name, file_name)
            f = open(loc)
        cfg = f.read()
        f.close()

        load_config(obj, config_format, cfg)
        return True
    except (OSError, IOError):
        return False
def save_config_to_file(obj, config_format, folder_name, file_name):
    """
    Serialize the configuration of `obj` and write it to a file next to the script, falling
    back to the per-user config location. Returns True on success, False if nothing could be written.
    """
    try:
        cfg = save_config(obj, config_format)
        try:
            f = open(here(file_name), 'w')
        except OSError:
            # fall back to the per-user config location
            loc = user_config_location(folder_name, file_name)
            makedirs(loc)
            f = open(loc, 'w')
        f.write(cfg)
        f.close()
        return True
    except (OSError, IOError):
        return False
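# A minimal usage sketch for the two helpers above. The config object, format string and file
# names are hypothetical placeholders; what `load_config` / `save_config` actually expect
# depends on the rest of this codebase, so the sketch is kept as comments.
#
# settings = MySettings()                                           # hypothetical config object
# if not load_config_from_file(settings, 'json', 'myapp', 'settings.json'):
#     print('no saved config found, using defaults')
# ...
# save_config_to_file(settings, 'json', 'myapp', 'settings.json')   # returns False if nothing could be written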
def go(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # apply the generated seed so the printed value can reproduce the run
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data
    arg.path = here('data') if arg.path is None else arg.path
    data_train, data_val, data_test = read_dataset(arg.path, arg.dataset)

    # create the model
    model = GTransformer(emb=arg.embedding_size, heads=arg.num_heads, depth=arg.depth,
                         seq_length=arg.context, num_tokens=NUM_TOKENS, wide=arg.wide)
    if torch.cuda.is_available():
        model.cuda()

    print("Model parameters = %d" % sum(p.numel() for p in model.parameters()))

    if not arg.radam:
        opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())
        # linear learning rate warmup
        sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))
    else:
        opt = RAdam(model.parameters(), lr=arg.lr)

    if USE_APEX:
        model, opt = amp.initialize(model, opt, opt_level="O1", verbosity=0)

    best_bpb = np.inf
    best_step = 0

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size,), low=0, high=data_train.size(0) - arg.context - 1)
        seqs_source = [data_train[start:start + arg.context] for start in starts]
        seqs_target = [data_train[start + 1:start + arg.context + 1] for start in starts]
        source = torch.cat([s[None, :] for s in seqs_source], dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target], dim=0).to(torch.long)
        # - target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')
        # tbw.add_scalar('transformer/train-loss', float(loss.item()) * LOG2E, i * arg.batch_size)

        if not USE_APEX:
            loss.backward()
        else:
            with amp.scale_loss(loss, opt) as scaled_loss:
                scaled_loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()

        if not arg.radam:
            sch.step()

        # - validate every {arg.test_every} steps. First we compute the compression on the validation data
        #   (or a subset), then we generate some random text to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            upto = arg.test_subset if arg.test_subset else data_val.size(0)
            data_sub = data_val[:upto]

            bits_per_byte = calculate_bpb(arg, model, data_sub)

            # print validation performance. 1 bit per byte is (currently) state of the art.
            print(f'epoch{i}: {bits_per_byte:.4} bits per byte')

            tag_scalar_dict = {'train-loss': float(loss.item()) * LOG2E, 'eval-loss': bits_per_byte}
            tbw.add_scalars('transformer/loss', tag_scalar_dict, i * arg.batch_size)

            if bits_per_byte < best_bpb:
                best_bpb = bits_per_byte
                best_step = i
                torch.save(model.state_dict(), os.path.join(arg.tb_dir, 'best_model.pt'))
            print(f'best step {best_step}: {best_bpb:.4} bits per byte')

            generate_sequence(arg, model, data_val)

    # load the best model, calculate bpb of the test data and generate some random text
    finalize(arg, model, data_test)
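# The training loss above is an average negative log-likelihood in nats (natural log). It is
# logged as `loss * LOG2E`, which converts nats to bits and makes the number comparable to the
# bits-per-byte validation metric. A minimal sketch of the constant, assuming it is defined at
# module level elsewhere in this codebase:
#
# import math
# LOG2E = math.log2(math.e)   # ~1.4427
# bits = nats * LOG2E         # log_2(x) = log_e(x) * log_2(e)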
def load(name, limit=None):
    """
    Loads a knowledge graph dataset for link prediction purposes.

    :param name: Dataset name. 'fb' for FB15k-237, 'wn' for WN18RR, 'toy' for a small toy dataset for testing.
    :param limit: If set, the total number of triples per set will be limited to this value. Useful for debugging.
    :return: Three (n, 3) integer tensors of triples (train, val, test), a pair of dicts to map entity strings
        to and from their integer ids, and a similar pair of dicts for the relations.
    """

    if name == 'fb':  # Freebase 15k-237
        train_file = util.here('data/fb15k237/train.txt')
        val_file = util.here('data/fb15k237/valid.txt')
        test_file = util.here('data/fb15k237/test.txt')
    elif name == 'wn':
        train_file = util.here('data/wn18rr/train.txt')
        val_file = util.here('data/wn18rr/valid.txt')
        test_file = util.here('data/wn18rr/test.txt')
    elif os.path.isdir(util.here('data' + os.sep + name)):
        train_file = util.here(f'data/{name}/train.txt')
        val_file = util.here(f'data/{name}/valid.txt')
        test_file = util.here(f'data/{name}/test.txt')
    else:
        raise Exception(f'Could not find dataset with name {name} at location {util.here("data" + os.sep + name)}.')

    train = load_strings(train_file)
    val = load_strings(val_file)
    test = load_strings(test_file)

    if limit:
        train = train[:limit]
        val = val[:limit]
        test = test[:limit]

    # mappings for nodes (n) and relations (r)
    nodes, rels = set(), set()
    for triple in train + val + test:
        nodes.add(triple[0])
        rels.add(triple[1])
        nodes.add(triple[2])

    i2n, i2r = list(nodes), list(rels)
    n2i, r2i = {n: i for i, n in enumerate(nodes)}, {r: i for i, r in enumerate(rels)}

    traini, vali, testi = [], [], []
    for s, p, o in train:
        traini.append([n2i[s], r2i[p], n2i[o]])
    for s, p, o in val:
        vali.append([n2i[s], r2i[p], n2i[o]])
    for s, p, o in test:
        testi.append([n2i[s], r2i[p], n2i[o]])

    train, val, test = torch.tensor(traini), torch.tensor(vali), torch.tensor(testi)

    return train, val, test, (n2i, i2n), (r2i, i2r)
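# `load_strings` is used above but not defined in this file. A minimal sketch of what it presumably
# does, assuming the standard FB15k-237 / WN18RR format of one tab-separated (subject, relation,
# object) triple per line; it is named `_load_strings_sketch` here to avoid shadowing the real helper.
def _load_strings_sketch(filename):
    """Read a file of tab-separated triples into a list of [s, p, o] string lists."""
    with open(filename, 'r', encoding='utf8') as f:
        return [line.strip().split('\t') for line in f if line.strip()]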
def go_pods(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # apply the generated seed so the printed value can reproduce the run
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    df = pd.read_csv(here('./data/df_popular_podcasts.csv'))

    with open(here('./data/genre_IDs.txt')) as file:
        glist = eval(file.read())

    glist = {int(idstr): name for (idstr, name) in glist}
    rlist = {name: id for (id, name) in glist.items()}

    gs = set()
    for genres in df['Genre IDs']:
        genres = eval(genres)
        for genre in genres:
            g = int(genre)
            gs.add(g)

    i2g = list(gs)
    g2i = {g: i for i, g in enumerate(i2g)}

    train, val, test = df.iloc[:8000], df.iloc[8000:9000], df.iloc[9000:]

    # create the model
    model = GPT2Wrapper(iblocks=arg.iblocks, csize=len(i2g), gptname=arg.gpt_name)

    if arg.checkpoint is not None:
        model.load_state_dict(torch.load(arg.checkpoint, map_location=torch.device('cpu')))

    if torch.cuda.is_available():
        model.to('cuda')
        model.model.mod[0].to('cuda')

    tok = model.tokenizer

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))
    # -- linear learning rate warmup

    # training loop
    # -- note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    seen = 0
    for e in range(arg.epochs):

        if e % arg.print_every == 0:
            with torch.no_grad():

                # Generate 10 titles from the seed
                genres = torch.zeros(1, len(i2g))
                for genre in PD_GENRES:
                    # print(glist[genre])
                    genres[0, g2i[genre]] = 1.0

                for i in range(10):
                    # generate and print some random text
                    seed = PD_SEED
                    input = torch.tensor(tok.encode(seed))

                    if torch.cuda.is_available():
                        input, genres = input.to('cuda'), genres.to('cuda')

                    outseq = []
                    for _ in range(PD_TITLE_LENTGH):
                        output = model(input[None, :], cond=genres)
                        c = sample(output[0, -1, :], arg.sampling_temp)
                        outseq.append(c)

                        input = torch.cat([input, c], dim=0)

                    outseq = torch.cat(outseq, dim=0)
                    outseq = model.tokenizer.decode(outseq)

                    with open(f'pd.e{e:03}i{i:02}.txt', 'w') as file:
                        print(outseq[len(PD_SEED):], file=file)
                        print('---------------------------------------------\n', file=file)
                        print(PD_SEED + outseq, file=file)

                # Generate 10 random podcasts
                for i in range(10):
                    # generate a random genre
                    random_genre = random.choice(list(glist.keys()))
                    genres = torch.zeros(1, len(i2g))
                    genres[0, g2i[random_genre]] = 1.0

                    # generate and print some random text
                    seed = 'description: '
                    input = torch.tensor(tok.encode(seed))

                    if torch.cuda.is_available():
                        input, genres = input.to('cuda'), genres.to('cuda')

                    outseq = []
                    for _ in range(arg.print_size):
                        output = model(input[None, :], cond=genres)
                        c = sample(output[0, -1, :], arg.sampling_temp)
                        outseq.append(c)

                        input = torch.cat([input, c], dim=0)

                    outseq = torch.cat(outseq, dim=0)
                    outseq = model.tokenizer.decode(outseq)

                    with open(f'random.e{e:03}i{i:02}.txt', 'w') as file:
                        print('chosen genre ', glist[random_genre], file=file)
                        print('---------------------------------------------', file=file)
                        print(seed, file=file)
                        print(outseq, flush=True, file=file)

        for fr in tqdm.trange(0, len(train), arg.batch_size):

            to = min(len(train), fr + arg.batch_size)

            dfbatch = df.iloc[fr:to]
            texts, genres = tobatch(dfbatch, tok, g2i, limit=arg.desc_clip, glist=glist)

            b = texts.size(0)

            source = torch.cat([torch.empty(b, 1, dtype=torch.long).fill_(0), texts], dim=1)
            target = torch.cat([texts, torch.empty(b, 1, dtype=torch.long).fill_(0)], dim=1)

            seen += b

            opt.zero_grad()

            if arg.dropout > 0.0:
                source = source * torch.empty_like(source).bernoulli_(arg.dropout)
                # -- word dropout on the input (helps the model use the conditionals)

            if torch.cuda.is_available():
                source, target, genres = source.to('cuda'), target.to('cuda'), genres.to('cuda')

            output = model(source, cond=genres)

            loss = F.cross_entropy(output.transpose(2, 1), target, reduction='mean')
            tbw.add_scalar('podcasts/train-loss', float(loss.item()) * LOG2E, seen)

            loss.backward()

            # clip gradients
            # - If the total gradient vector has a length > 1, we clip it back down to 1.
            if arg.gradient_clipping > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

            opt.step()
            # sch.step()

            del loss, source, target, genres
            model.clear()

            # for obj in gc.get_objects():
            #     try:
            #         if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
            #             if obj.size(0) == b:
            #                 print(type(obj), obj.size())
            #     except:
            #         pass

        torch.save(model.state_dict(), './checkpoint.model')

        # - validate every {arg.test_every} steps. First we compute the compression on the validation data
        #   (or a subset), then we generate some random text to monitor progress
        # if e != 0 and (e % arg.print_every == 0 or e == arg.epochs - 1):

        print('multipliers:')
        for block in model.iblocks:
            print(' ', block.mult)
        print()
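# A small helper sketch of the genre conditioning used above: the conditional input is a multi-hot
# vector over the genre vocabulary. This mirrors the inline code in `go_pods`; the helper name is
# hypothetical and not part of the original codebase.
def _genre_onehot_sketch(genre_ids, g2i):
    """Build a (1, num_genres) multi-hot conditioning vector from a list of genre ids."""
    cond = torch.zeros(1, len(g2i))
    for g in genre_ids:
        cond[0, g2i[g]] = 1.0
    return cond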
def go(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # apply the generated seed so the printed value can reproduce the run
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data
    str_train, str_val, str_test = load_text(arg.data)
    str_train, str_test = (str_train + str_val, str_test) \
        if arg.final else (str_train, str_val)

    # create the model
    model = GPT2Wrapper(iblocks=arg.iblocks)

    if torch.cuda.is_available():
        model.to('cuda')
        model.model.mod[0].to('cuda')

    # tokenize the data
    data_train, data_val, data_test = \
        torch.tensor(model.tokenizer.encode(str_train)), \
        torch.tensor(model.tokenizer.encode(str_val)), \
        torch.tensor(model.tokenizer.encode(str_test))

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))
    # -- linear learning rate warmup

    # training loop
    # -- note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size,), low=0, high=data_train.size(0) - model.ctx - 1)
        seqs_source = [data_train[start:start + model.ctx] for start in starts]
        seqs_target = [data_train[start + 1:start + model.ctx + 1] for start in starts]
        source = torch.cat([s[None, :] for s in seqs_source], dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target], dim=0).to(torch.long)
        # -- target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.to('cuda'), target.to('cuda')

        output = model(source)

        loss = F.cross_entropy(output.transpose(2, 1), target, reduction='mean')
        tbw.add_scalar('podcasts/train-loss', float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()
        # sch.step()

        model.clear()

        # - every {arg.print_every} steps, generate some random text to monitor progress
        if i != 0 and (i % arg.print_every == 0 or i == arg.num_batches - 1):
            with torch.no_grad():

                # generate and print some random text
                seedfr = random.randint(0, data_test.size(0) - arg.print_seed_size)
                input = data_test[seedfr:seedfr + arg.print_seed_size].to(torch.long)

                if torch.cuda.is_available():
                    input = input.cuda()

                # print the seed
                strinput = model.tokenizer.decode(input)
                print(f'[{strinput}]', end='')

                outseq = []
                for _ in range(arg.print_size):
                    output = model(input[None, :])
                    c = sample(output[0, -1, :], arg.sampling_temp)
                    outseq.append(c[None])

                    input = torch.cat([input[1:], c[None]], dim=0)

                outseq = torch.cat(outseq, dim=0)
                outseq = model.tokenizer.decode(outseq)

                print(outseq)

        # - validate every {arg.test_every} steps: compute the compression on the validation data (or a subset)
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):
            with torch.no_grad():

                upto = data_test.size(0) if i == arg.num_batches - 1 else arg.test_subset
                data_sub = data_test[:upto]

                bits, tot = 0.0, 0
                batch = []  # buffer, every time it fills up, we run it through the model

                for current in range(data_sub.size(0)):

                    fr = max(0, current - model.ctx)
                    to = current + 1

                    context = data_sub[fr:to].to(torch.long)
                    if context.size(0) < model.ctx + 1:
                        pad = torch.zeros(size=(model.ctx + 1 - context.size(0),), dtype=torch.long)
                        context = torch.cat([pad, context], dim=0)
                        assert context.size(0) == model.ctx + 1

                    if torch.cuda.is_available():
                        context = context.cuda()

                    batch.append(context[None, :])

                    if len(batch) == arg.test_batchsize or current == data_sub.size(0) - 1:
                        # batch is full, run it through the model
                        b = len(batch)

                        all = torch.cat(batch, dim=0)
                        source = all[:, :-1]  # input
                        target = all[:, -1]   # target values

                        output = model(source)

                        lnprobs = output[torch.arange(b, device=d()), -1, target]
                        log2probs = lnprobs * LOG2E  # convert from nats to bits

                        bits += -log2probs.sum()
                        batch = []  # empty buffer

                bits_per_byte = bits / data_sub.size(0)

                # print validation performance. 0.92 bits per byte is (currently) state of the art.
                print(f'epoch{i}: {bits_per_byte:.4} bits per byte')
                tbw.add_scalar('podcasts/eval-loss', bits_per_byte, i * arg.batch_size)
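# `sample` and `d` are used above but not defined in this file. Below are minimal sketches of what
# they presumably do, named with a `_sketch` suffix to avoid shadowing the real utilities; the real
# helpers may differ (e.g. in the return shape of the sampled token).
def _sample_sketch(lnprobs, temperature=1.0):
    """Sample a token index from a 1D tensor of (log-)probabilities, with temperature scaling."""
    if temperature == 0.0:
        return lnprobs.argmax()                  # greedy decoding
    p = F.softmax(lnprobs / temperature, dim=0)  # higher temperature -> flatter distribution
    return torch.distributions.Categorical(p).sample()


def _d_sketch(tensor=None):
    """Return the best available device, or the device of the given tensor."""
    if tensor is None:
        return 'cuda' if torch.cuda.is_available() else 'cpu'
    return 'cuda' if tensor.is_cuda else 'cpu'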
def go(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # apply the generated seed so the printed value can reproduce the run
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data
    data_train, data_val, data_test = enwik8(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
        if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size, heads=arg.num_heads, depth=arg.depth,
                         seq_length=arg.context, num_tokens=NUM_TOKENS)
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        # learning rate warmup
        # - we linearly increase the learning rate from 1e-10 to arg.lr over the first few thousand batches
        if arg.lr_warmup > 0 and i < arg.lr_warmup:
            lr = max((arg.lr / arg.lr_warmup) * i, 1e-10)
            for g in opt.param_groups:
                g['lr'] = lr  # note: assigning `opt.lr` directly would have no effect on Adam

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size,), low=0, high=data_train.size(0) - arg.context - 1)
        seqs_source = [data_train[start:start + arg.context] for start in starts]
        seqs_target = [data_train[start + 1:start + arg.context + 1] for start in starts]
        source = torch.cat([s[None, :] for s in seqs_source], dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target], dim=0).to(torch.long)
        # - target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')
        tbw.add_scalar('transformer/train-loss', float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()

        # - validate every {arg.test_every} steps. First we compute the compression on the validation data
        #   (or a subset), then we generate some random text to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            upto = data_test.size(0) if i == arg.num_batches - 1 else arg.test_subset
            data_sub = data_test[:upto]

            with torch.no_grad():
                bits, tot = 0.0, 0
                batch = []  # buffer, every time it fills up, we run it through the model

                for current in range(data_sub.size(0)):

                    fr = max(0, current - arg.context)
                    to = current + 1

                    context = data_sub[fr:to].to(torch.long)
                    if context.size(0) < arg.context + 1:
                        pad = torch.zeros(size=(arg.context + 1 - context.size(0),), dtype=torch.long)
                        context = torch.cat([pad, context], dim=0)
                        assert context.size(0) == arg.context + 1

                    if torch.cuda.is_available():
                        context = context.cuda()

                    batch.append(context[None, :])

                    if len(batch) == arg.test_batchsize or current == data_sub.size(0) - 1:
                        # batch is full, run it through the model
                        b = len(batch)

                        all = torch.cat(batch, dim=0)
                        source = all[:, :-1]  # input
                        target = all[:, -1]   # target values

                        output = model(source)

                        lnprobs = output[torch.arange(b, device=d()), -1, target]
                        log2probs = lnprobs * LOG2E  # convert from nats to bits

                        bits += -log2probs.sum()
                        batch = []  # empty buffer

                bits_per_byte = bits / data_sub.size(0)

                # print validation performance. 1 bit per byte is (currently) state of the art.
                print(f'epoch{i}: {bits_per_byte:.4} bits per byte')
                tbw.add_scalar('transformer/eval-loss', bits_per_byte, i * arg.batch_size)

                # generate some random text
                GENSIZE = 600
                TEMP = 0.5
                seedfr = random.randint(0, data_test.size(0) - arg.context)
                input = data_test[seedfr:seedfr + arg.context].to(torch.long)

                if torch.cuda.is_available():
                    input = input.cuda()
                input = Variable(input)

                print('[', end='', flush=True)
                for c in input:
                    print(str(chr(c)), end='', flush=True)
                print(']', end='', flush=True)

                for _ in range(GENSIZE):
                    output = model(input[None, :])
                    c = sample(output[0, -1, :], TEMP)
                    print(str(chr(max(32, c))), end='', flush=True)

                    input = torch.cat([input[1:], c[None]], dim=0)
                print()
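# `enwik8` is used above but not defined in this file. A minimal sketch of a loader for the gzipped
# enwik8 byte-level corpus, assuming the conventional 90M/5M/5M train/val/test split and that numpy
# is imported as `np`; it is named `_enwik8_sketch` to avoid shadowing the real loader.
def _enwik8_sketch(path, n_train=int(90e6), n_valid=int(5e6), n_test=int(5e6)):
    """Load the enwik8 dataset from a gzip file and split it into three byte-valued tensors."""
    import gzip
    with gzip.open(path, 'rb') as f:
        raw = f.read(n_train + n_valid + n_test)
    data = torch.from_numpy(np.frombuffer(raw, dtype=np.uint8).copy())
    return data[:n_train], data[n_train:n_train + n_valid], data[n_train + n_valid:]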
def go(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # apply the generated seed so the printed value can reproduce the run
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('data/enwik8.gz') if arg.data is None else arg.data
    data_train, data_val, data_test = enwik8(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
        if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size, heads=arg.num_heads, depth=arg.depth,
                         seq_length=arg.context, num_tokens=NUM_TOKENS, attention_type=arg.attention_type)
    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # Linear learning rate warmup
    sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))

    # Training loop
    # -- We don't loop over the data, instead we sample a batch of random subsequences each time. This is not strictly
    #    better or worse as a training method, it's just a little simpler.
    instances_seen = 0
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        source, target = sample_batch(data_train, length=arg.context, batch_size=arg.batch_size)
        instances_seen += source.size(0)

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()

        tic()
        output = model(source)  # forward pass
        t = toc()

        # Compute the loss
        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')

        tbw.add_scalar('transformer/train-loss', float(loss.item()) * LOG2E, i * arg.batch_size, instances_seen)
        tbw.add_scalar('transformer/time-forward', t, instances_seen)

        loss.backward()  # backward pass

        # clip gradients
        # -- If the total gradient vector has a length > x, we clip it back down to x.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()  # stochastic gradient descent step
        sch.step()  # update the learning rate

        # Validate every `arg.test_every` steps. First we compute the compression on the validation data
        # (or a subset), then we generate some random text to monitor progress.
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):
            with torch.no_grad():

                ## Sample and print a random sequence

                # Slice a random seed from the test data, and sample a continuation from the model.
                seedfr = random.randint(0, data_test.size(0) - arg.context)
                seed = data_test[seedfr:seedfr + arg.context].to(torch.long)

                if torch.cuda.is_available():
                    seed = seed.cuda()

                sample_sequence(model, seed=seed, max_context=arg.context, verbose=True, length=arg.sample_length)

                ## Compute validation bits per byte

                upto = data_test.size(0) if i == arg.num_batches - 1 else arg.test_subset
                data_sub = data_test[:upto]

                bits_per_byte = compute_compression(model, data_sub, context=arg.context, batch_size=arg.test_batchsize)
                # -- Since we're not computing gradients, we can increase the batch size a little from what we used in
                #    training.

                print(f'epoch{i}: {bits_per_byte:.4} bits per byte')
                tbw.add_scalar('transformer/eval-loss', bits_per_byte, i * arg.batch_size, instances_seen)
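# `sample_batch` is used above but not defined in this file. A minimal sketch that mirrors the
# explicit subsequence sampling done in the other training loops in this file (source is a batch of
# random subsequences, target is the same batch shifted one token ahead); it is named
# `_sample_batch_sketch` to avoid shadowing the real helper.
def _sample_batch_sketch(data, length, batch_size):
    """Sample a batch of random (source, target) subsequence pairs from a 1D token tensor."""
    starts = torch.randint(size=(batch_size,), low=0, high=data.size(0) - length - 1)
    source = torch.stack([data[s:s + length] for s in starts]).to(torch.long)
    target = torch.stack([data[s + 1:s + length + 1] for s in starts]).to(torch.long)
    return source, target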
def load(name, final=False, limit=None, bidir=False, prune=False):
    """
    Loads a knowledge graph dataset. Self connections are automatically added as a special relation.

    :param name: Dataset name ('aifb' or 'am' at the moment)
    :param final: If true, load the canonical test set, otherwise split a validation set off from the training data.
    :param limit: If set, the number of unique relations will be limited to this value, plus one for the
        self-connections, plus one for the remaining connections combined into a single, new relation.
    :param bidir: Whether to include inverse links for each relation
    :param prune: Whether to prune edges that are further than two steps from the target labels
    :return: A tuple containing the graph data and the classification train and test sets:
        - edges: dictionary of edges (relation -> pair of lists containing subject and object indices respectively)
        - (n2i, i2n): mappings between node labels and integer indices
        - (r2i, i2r): mappings between relation labels and integer indices
        - train, test: dictionaries mapping labeled nodes to their class
    """

    # -- Check if the data has been cached for quick loading.
    cachefile = util.here(f'data{S}{name}{S}cache_{"fin" if final else "val"}_{"pruned" if prune else "unpruned"}.pkl')
    if os.path.isfile(cachefile) and limit is None:
        print('Using cached data.')
        with open(cachefile, 'rb') as file:
            data = pickle.load(file)
            print('Loaded.')
            return data

    print('No cache found (or relation limit is set). Loading data from scratch.')

    if name == 'aifb':
        # AIFB data (academics, affiliations, publications, etc. About 8k nodes)
        file = util.here('data/aifb/aifb_stripped.nt.gz')

        train_file = util.here('data/aifb/trainingSet.tsv')
        test_file = util.here('data/aifb/testSet.tsv')
        label_header = 'label_affiliation'
        nodes_header = 'person'

    elif name == 'am':
        # Collection of the Amsterdam Museum. Data is downloaded on first load.
        data_url = 'https://www.dropbox.com/s/1mp9aot4d9j01h9/am_stripped.nt.gz?dl=1'
        file = util.here('data/am/am_stripped.nt.gz')
        print('dataset file exists: ', os.path.isfile(file))

        if not os.path.isfile(file):
            print('Downloading AM data.')
            wget.download(data_url, file)

        train_file = util.here('data/am/trainingSet.tsv')
        test_file = util.here('data/am/testSet.tsv')
        label_header = 'label_cateogory'  # (sic) must match the column name as it appears in the dataset's TSV
        nodes_header = 'proxy'

    elif name == 'mutag':
        data_url = 'https://www.dropbox.com/s/qy8j3p8eacvm4ir/mutag_stripped.nt.gz?dl=1'
        file = util.here('data/mutag/mutag_stripped.nt.gz')

        if not os.path.isfile(file):
            print('Downloading MUTAG data.')
            wget.download(data_url, file)

        # task_file = util.here('data/mutag/completeDataset.tsv')
        train_file = util.here('data/mutag/trainingSet.tsv')
        test_file = util.here('data/mutag/testSet.tsv')
        label_header = 'label_mutagenic'
        nodes_header = 'bond'

    elif name == 'bgs':
        file = util.here('data/bgs/bgs_stripped.nt.gz')
        train_file = util.here('data/bgs/trainingSet(lith).tsv')
        test_file = util.here('data/bgs/testSet(lith).tsv')
        label_header = 'label_lithogenesis'
        nodes_header = 'rock'

    else:
        raise Exception(f'Data {name} not recognized')

    # -- Load the classification task
    labels_train = pd.read_csv(train_file, sep='\t', encoding='utf8')
    if final:
        labels_test = pd.read_csv(test_file, sep='\t', encoding='utf8')
    else:
        # split the training data into train and validation
        ltr = labels_train
        pivot = int(len(ltr) * VALPROP)

        labels_test = ltr[:pivot]
        labels_train = ltr[pivot:]

    labels = labels_train[label_header].astype('category').cat.codes
    train = {}
    for nod, lab in zip(labels_train[nodes_header].values, labels):
        train[nod] = lab

    labels = labels_test[label_header].astype('category').cat.codes
    test = {}
    for nod, lab in zip(labels_test[nodes_header].values, labels):
        test[nod] = lab

    print('Labels loaded.')

    # -- Parse the data with RDFLib
    graph = rdf.Graph()

    if file.endswith('nt.gz'):
        with gzip.open(file, 'rb') as f:
            graph.parse(file=f, format='nt')
    else:
        graph.parse(file, format=rdf.util.guess_format(file))

    print('RDF loaded.')

    # -- Collect all node and relation labels
    if prune:
        triples = set()
        for node in list(train.keys()) + list(test.keys()):
            add_neighbors(triples, graph, URIRef(node), depth=2)
    else:
        triples = graph

    nodes = set()
    relations = Counter()

    for s, p, o in triples:
        nodes.add(st(s))
        nodes.add(st(o))

        relations[st(p)] += 1

        if bidir:
            relations[INV + str(p)] += 1

    # print(len(nodes))
    # print(len(nodes_uri))
    # print('\n'.join(list(nodes)[:1000]))
    # sys.exit()

    i2n = list(nodes)                        # maps indices to labels
    n2i = {n: i for i, n in enumerate(i2n)}  # maps labels to indices

    # Truncate the list of relations if necessary
    if limit is not None:
        i2r = [r[0] for r in relations.most_common(limit)] + [REST, INV + REST]
        # the 'limit' most frequent labels are maintained, the rest are combined into label REST to save memory
    else:
        i2r = list(relations.keys())

    r2i = {r: i for i, r in enumerate(i2r)}

    edges = {}

    # -- Collect all edges into a dictionary: relation -> (from, to)
    #    (only storing integer indices)
    for s, p, o in tqdm.tqdm(triples):
        s, p, o = n2i[st(s)], st(p), n2i[st(o)]

        pf = r2i[p] if (p in r2i) else r2i[REST]

        if pf not in edges:
            edges[pf] = [], []

        edges[pf][0].append(s)
        edges[pf][1].append(o)

        if bidir:
            pi = r2i[INV + p] if (INV + p in r2i) else r2i[INV + REST]
            if pi not in edges:
                edges[pi] = [], []

            edges[pi][0].append(o)
            edges[pi][1].append(s)

    # Add self connections explicitly
    edges[len(i2r)] = list(range(len(i2n))), list(range(len(i2n)))
    # for i in range(len(i2n)):
    #     edges[len(i2r)][0].append(i)
    #     edges[len(i2r)][1].append(i)

    print('Graph loaded.')

    # -- Cache the results for fast loading next time
    if limit is None:
        with open(cachefile, 'wb') as file:
            pickle.dump([edges, (n2i, i2n), (r2i, i2r), train, test], file)

    return edges, (n2i, i2n), (r2i, i2r), train, test
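# `add_neighbors` is used above (when prune=True) but not defined in this file. A minimal sketch of a
# depth-limited neighborhood collector over an rdflib graph, named `_add_neighbors_sketch` to avoid
# shadowing the real helper; the real implementation may handle edge directions or revisits differently.
def _add_neighbors_sketch(triples, graph, node, depth=2):
    """Add all triples within `depth` hops of `node` (in either direction) to the set `triples`."""
    if depth == 0:
        return
    for s, p, o in graph.triples((node, None, None)):  # outgoing edges
        triples.add((s, p, o))
        _add_neighbors_sketch(triples, graph, o, depth - 1)
    for s, p, o in graph.triples((None, None, node)):  # incoming edges
        triples.add((s, p, o))
        _add_neighbors_sketch(triples, graph, s, depth - 1)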
def load_lp(name, limit=None, bidir=False, prune=False):
    """
    Loads a knowledge graph dataset for link prediction. Self connections are NOT automatically added.

    :param name: Dataset name ('fb' or 'wn' at the moment)
    :param limit: If set, the number of unique relations will be limited to this value, plus one for the
        self-connections, plus one for the remaining connections combined into a single, new relation.
    :return: Three (n, 3) integer tensors of triples (train, val, test), a pair of dicts for the nodes and
        a pair for the relations. (The `bidir` and `prune` arguments are accepted but not used here.)
    """

    if name == 'random':
        return load_lp_random()

    if name == 'fb':  # Freebase 15k-237
        train_file = util.here('data/fb15k237/train.txt')
        val_file = util.here('data/fb15k237/valid.txt')
        test_file = util.here('data/fb15k237/test.txt')
    elif name == 'wn':
        train_file = util.here('data/wn18rr/train.txt')
        val_file = util.here('data/wn18rr/valid.txt')
        test_file = util.here('data/wn18rr/test.txt')
    elif name == 'toy':
        train_file = util.here('data/toy/train.txt')
        val_file = util.here('data/toy/valid.txt')
        test_file = util.here('data/toy/test.txt')
    else:
        raise Exception(f'Data {name} not recognized')

    train = load_strings(train_file)
    val = load_strings(val_file)
    test = load_strings(test_file)

    if limit:
        train = train[:limit]
        val = val[:limit]
        test = test[:limit]

    # mappings for nodes (n) and relations (r)
    nodes, rels = set(), set()
    for triple in train + val + test:
        nodes.add(triple[0])
        rels.add(triple[1])
        nodes.add(triple[2])

    i2n, i2r = list(nodes), list(rels)
    n2i, r2i = {n: i for i, n in enumerate(nodes)}, {r: i for i, r in enumerate(rels)}

    traini, vali, testi = [], [], []
    for s, p, o in train:
        traini.append([n2i[s], r2i[p], n2i[o]])
    for s, p, o in val:
        vali.append([n2i[s], r2i[p], n2i[o]])
    for s, p, o in test:
        testi.append([n2i[s], r2i[p], n2i[o]])

    train, val, test = torch.tensor(traini), torch.tensor(vali), torch.tensor(testi)

    return train, val, test, (n2i, i2n), (r2i, i2r)
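# A small usage sketch for `load_lp`, showing how the returned index mappings relate to the integer
# triples (the dataset name 'fb' is one of the options handled above). Kept as comments since this
# module only defines loaders.
#
# train, val, test, (n2i, i2n), (r2i, i2r) = load_lp('fb')
# s, p, o = train[0].tolist()
# print(i2n[s], i2r[p], i2n[o])   # decode the first training triple back to strings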
def go(arg):

    if arg.seed < 0:
        seed = random.randint(0, 1000000)
        print('random seed: ', seed)
        torch.manual_seed(seed)  # apply the generated seed so the printed value can reproduce the run
    else:
        torch.manual_seed(arg.seed)

    tbw = SummaryWriter(log_dir=arg.tb_dir)  # Tensorboard logging

    # load the data (validation unless arg.final is true, then test)
    arg.data = here('../wiki_uk.txt') if arg.data is None else arg.data
    data_train, data_val, data_test = ukwiki(arg.data)
    data_train, data_test = (torch.cat([data_train, data_val], dim=0), data_test) \
        if arg.final else (data_train, data_val)

    # create the model
    model = GTransformer(emb=arg.embedding_size, heads=arg.num_heads, depth=arg.depth,
                         seq_length=arg.context, num_tokens=NUM_TOKENS, wide=arg.wide)
    if os.path.exists(MODEL_PATH):
        model.load_state_dict(torch.load(MODEL_PATH))

    if torch.cuda.is_available():
        model.cuda()

    opt = torch.optim.Adam(lr=arg.lr, params=model.parameters())

    # linear learning rate warmup
    sch = torch.optim.lr_scheduler.LambdaLR(opt, lambda i: min(i / (arg.lr_warmup / arg.batch_size), 1.0))

    # training loop
    # - note: we don't loop over the data, instead we sample a batch of random subsequences each time.
    for i in tqdm.trange(arg.num_batches):

        opt.zero_grad()

        # sample a batch of random subsequences
        starts = torch.randint(size=(arg.batch_size,), low=0, high=data_train.size(0) - arg.context - 1)

        if arg.masked:
            seqs_source = [data_train.detach().clone()[start:start + arg.context] for start in starts]
            seqs_target = [data_train.detach().clone()[start:start + arg.context] for start in starts]
            for ss, st in zip(seqs_source, seqs_target):
                mask_indexes = torch.randint(1, arg.context, (arg.error_count,))
                for ind in mask_indexes:
                    ss[ind] = torch.tensor(char_to_id['$'])
                # print(''.join([id_to_char[s.item()] for s in ss]))
                # print(''.join([id_to_char[t.item()] for t in st]))
        else:
            seqs_source = [data_train[start:start + arg.context] for start in starts]
            seqs_target = [data_train[start + 1:start + arg.context + 1] for start in starts]

        source = torch.cat([s[None, :] for s in seqs_source], dim=0).to(torch.long)
        target = torch.cat([s[None, :] for s in seqs_target], dim=0).to(torch.long)
        # - target is the same sequence as source, except one character ahead

        if torch.cuda.is_available():
            source, target = source.cuda(), target.cuda()
        source, target = Variable(source), Variable(target)

        output = model(source)

        loss = F.nll_loss(output.transpose(2, 1), target, reduction='mean')
        tbw.add_scalar('transformer/train-loss', float(loss.item()) * LOG2E, i * arg.batch_size)

        loss.backward()

        # clip gradients
        # - If the total gradient vector has a length > 1, we clip it back down to 1.
        if arg.gradient_clipping > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), arg.gradient_clipping)

        opt.step()
        sch.step()

        # - validate every {arg.test_every} steps. First we compute the compression on the validation data
        #   (or a subset), then we generate some random text to monitor progress
        if i != 0 and (i % arg.test_every == 0 or i == arg.num_batches - 1):

            upto = data_test.size(0) if i == arg.num_batches - 1 else arg.test_subset
            data_sub = data_test[:upto]

            with torch.no_grad():
                bits, tot = 0.0, 0
                batch = []  # buffer, every time it fills up, we run it through the model

                # for current in range(data_sub.size(0)):
                #     fr = max(0, current - arg.context)
                #     to = current + 1
                #     context = data_sub[fr:to].to(torch.long)
                #     if context.size(0) < arg.context + 1:
                #         pad = torch.zeros(size=(arg.context + 1 - context.size(0),), dtype=torch.long)
                #         context = torch.cat([pad, context], dim=0)
                #         assert context.size(0) == arg.context + 1
                #     if torch.cuda.is_available():
                #         context = context.cuda()
                #     batch.append(context[None, :])
                #     if len(batch) == arg.test_batchsize or current == data_sub.size(0) - 1:
                #         # batch is full, run it through the model
                #         b = len(batch)
                #         all = torch.cat(batch, dim=0)
                #         source = all[:, :-1]  # input
                #         target = all[:, -1]   # target values
                #         output = model(source)
                #         lnprobs = output[torch.arange(b, device=d()), -1, target]
                #         log2probs = lnprobs * LOG2E  # convert from nats to bits
                #         bits += - log2probs.sum()
                #         batch = []  # empty buffer
                # bits_per_byte = bits / data_sub.size(0)
                # # print validation performance. 1 bit per byte is (currently) state of the art.
                # print(f'epoch{i}: {bits_per_byte:.4} bits per byte')
                # tbw.add_scalar('transformer/eval-loss', bits_per_byte, i * arg.batch_size)

                # generate some random text
                GENSIZE = 600
                TEMP = 0.5
                seedfr = random.randint(0, data_test.size(0) - arg.context)
                # input = data_test[seedfr:seedfr + arg.context].to(torch.long)

                # Ukrainian test phrases, with '$' marking a masked character for the model to fill in
                test_msgs = [
                    "купила м$ма коника, а коник і шо",
                    "як тебе не лю$ити Києве мій коли",
                    "у л$сі лісі темному де ходить як"
                ]

                for test_msg in test_msgs:
                    test_data = np.zeros(arg.context)
                    test_data.fill(110)
                    test_data[0:len(test_msg)] = np.array([char_to_id[ch] for ch in test_msg])

                    input = torch.from_numpy(test_data).to(torch.long)

                    if torch.cuda.is_available():
                        input = input.cuda()
                    input = Variable(input)

                    print('[', end='', flush=True)
                    for c in input:
                        print(str(id_to_char[c.item()]), end='', flush=True)
                    print(']', end='', flush=True)

                    output = model(input[None, :])
                    out_string = ''.join([id_to_char[ind.item()] for ind in output[0].max(axis=1).indices])
                    # c = sample(output[0].max(axis=1), TEMP)

                    print("Foo1")
                    print("PRED: " + out_string)
                    print("Foo2")
                    print()

    # Save model
    torch.save(model.state_dict(), MODEL_PATH)
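# `char_to_id` and `id_to_char` are used above but not defined in this file. A minimal sketch of how
# such character/index mappings are typically built from a character-level corpus; the function name
# is hypothetical and the real vocabulary order may differ.
def _build_char_vocab_sketch(text):
    """Build character<->index mappings from a corpus string."""
    id_to_char = sorted(set(text))  # deterministic ordering of the character vocabulary
    char_to_id = {ch: i for i, ch in enumerate(id_to_char)}
    return char_to_id, id_to_char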