def __init__(self, embedding_method='glove'):
    """Initialize configuration for the 'lang8v1' dataset.

    :param embedding_method: word-embedding scheme to use (default: 'glove').
    """
    Dataset.__init__(self)
    # Text-length bounds — presumably used to filter examples; TODO confirm at call site.
    self.max_text_length = 40
    self.min_text_length = 3
    self.embedding_method = embedding_method
    # Left unset here; presumably filled in when a character-level embedding is configured.
    self.character_dim = None
    self.name = 'lang8v1'
    # Vocabulary-size caps for the source and target sides.
    self.src_vocab_size = 30000
    self.tgt_vocab_size = 30000
FLAGS = parser.parse_args()
# Pin this process to the GPU requested on the command line.
os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu


class Config(BaseConfig):
    # Hyper-parameters taken directly from CLI flags.
    filename = FLAGS.dataset
    embed_size = FLAGS.embedding
    batch_size = FLAGS.batch_size
    l2 = FLAGS.l2
    # Placeholders; overwritten from the loaded dataset below.
    user_count = -1
    item_count = -1
    optimizer = 'adam'
    neg_count = FLAGS.neg
    learning_rate = 0.001


config = Config()
dataset = Dataset(config.filename)
config.item_count = dataset.item_count
config.user_count = dataset.user_count
tf.logging.info("\n\n%s\n\n" % config)

model = PairwiseGMF(config)
# logdir=None and zero save intervals: Supervisor is used for session
# management only — no checkpoints or summaries are written.
sv = tf.train.Supervisor(logdir=None, save_model_secs=0, save_summaries_secs=0)
sess = sv.prepare_or_wait_for_session(
    config=tf.ConfigProto(gpu_options=tf.GPUOptions(
        per_process_gpu_memory_fraction=0.1,
        allow_growth=True)))

for i in range(FLAGS.iters):
    if sv.should_stop():
        break
    # NOTE(review): call truncated in this view; remaining tqdm arguments and
    # the loop body continue beyond this chunk.
    progress = tqdm(enumerate(dataset.get_data(FLAGS.batch_size, False, FLAGS.neg)),
class Config(BaseConfig):
    # Hyper-parameters taken directly from CLI flags.
    filename = FLAGS.dataset
    embed_size = FLAGS.embedding
    batch_size = FLAGS.batch_size
    l2 = FLAGS.l2
    # Placeholders; overwritten from the loaded dataset below.
    user_count = -1
    item_count = -1
    optimizer = 'adam'
    neg_count = FLAGS.neg
    learning_rate = 0.001


config = Config()
dataset = Dataset(config.filename)
config.item_count = dataset.item_count
config.user_count = dataset.user_count
tf.logging.info("\n\n%s\n\n" % config)

model = PairwiseGMF(config)
# logdir=None and zero save intervals: Supervisor is used for session
# management only — no checkpoints or summaries are written.
sv = tf.train.Supervisor(logdir=None, save_model_secs=0, save_summaries_secs=0)
sess = sv.prepare_or_wait_for_session(config=tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.1,
                              allow_growth=True)))

for i in range(FLAGS.iters):
    if sv.should_stop():
        break
    # NOTE(review): call truncated in this view; remaining tqdm arguments and
    # the loop body continue beyond this chunk.
    progress = tqdm(enumerate(
        dataset.get_data(FLAGS.batch_size, False, FLAGS.neg)),
# NOTE(review): stray tail of a generator whose `def` lies outside this chunk.
yield t


def store(triples, full_path):
    """Write triples to *full_path*, one tab-separated (s, p, o) per line.

    :param triples: iterable of (subject, predicate, object) string triples.
    :param full_path: destination text-file path (overwritten if present).
    """
    with open(full_path, 'w') as writer:
        for t in triples:
            s, p, o = t
            t_str = s + '\t' + p + '\t' + o + '\n'
            writer.write(t_str)


kg = ['WN18RR', 'FB15k-237', 'YAGO3-10']
for i in kg:
    data_dir = f'KGs/{i}/'
    # Cleaned copies are written to a sibling '<name>*' directory.
    clean_data_dir = f'KGs/{i}*/'
    dataset = Dataset(data_dir=data_dir)
    print(data_dir)
    # clean_dataset is given the entities/relations of the train split —
    # presumably it drops valid/test triples outside that vocabulary; confirm
    # against its definition.
    clean_valid_set = clean_dataset(
        dataset.valid_data,
        entities=dataset.get_entities(dataset.train_data),
        relations=dataset.get_relations(dataset.train_data))
    clean_test_set = clean_dataset(
        dataset.test_data,
        entities=dataset.get_entities(dataset.train_data),
        relations=dataset.get_relations(dataset.train_data))
    store(dataset.train_data, clean_data_dir + 'train.txt')  # Train set
    store(clean_valid_set, clean_data_dir + 'valid.txt')  # Cleaned valid set
    store(clean_test_set, clean_data_dir + 'test.txt')  # Clean test set
# 3. Create mappings. # 3.1 Entity index mapping. entity_idxs = { e: e_idx for e, e_idx in zip(model.dataset.entity_ids(), range(len(model.dataset.entity_ids()))) } # 3.2 Relation index mapping. relation_idxs = { r: r_idx for r, r_idx in zip(model.dataset.relation_ids(), range(len(model.dataset.relation_ids()))) } # 2. Load Dataset dataset = Dataset(data_dir=f'KGs/FB15K-237*/') # 4. Subject-Predicate to Object mapping and Predicate-Object to Subject mapping. This will be used at computing filtering ranks. sp_vocab, so_vocab, po_vocab = dataset.get_mappings( dataset.train_data + dataset.valid_data + dataset.test_data, entity_idxs=entity_idxs, relation_idxs=relation_idxs) ev = Evaluator(entity_idxs=entity_idxs, relation_idxs=relation_idxs, sp_vocab=sp_vocab, so_vocab=so_vocab, po_vocab=po_vocab) lp_results = ev.filtered_relation_prediction(dataset.test_data, model) with open(f'fb15k-237_*_{m}_relation_prediction_results.json', 'w') as file_descriptor:
from util.data import Dataset

# Report out-of-vocabulary statistics for every knowledge graph, first for the
# raw datasets and then for their cleaned '<name>*' counterparts.
for suffix in ('', '*'):
    kg = [base + suffix for base in ['WN18RR', 'FB15k-237', 'YAGO3-10']]
    for i in kg:
        dataset = Dataset(data_dir=f'KGs/{i}/')
        # Entities observed in the training split.
        entities = set(dataset.get_entities(dataset.train_data))
        dataset.describe_oov(dataset.test_data, entities, info=f'{i}-Test set')
        dataset.describe_oov(dataset.valid_data, entities, info=f'{i}-Val set')
class Config(BaseConfig):
    # Hyper-parameters taken directly from CLI flags.
    filename = FLAGS.dataset
    embed_size = FLAGS.embedding
    batch_size = FLAGS.batch_size
    l2 = FLAGS.l2
    # Placeholders; presumably filled in from the dataset later — TODO confirm.
    user_count = -1
    item_count = -1
    optimizer = 'adam'
    neg_count = FLAGS.neg
    learning_rate = 0.001


config = Config()
dataset = Dataset(config.filename, limit=limit)
# Configure the popularity-aware loss/metrics globals from this dataset.
# NOTE(review): call truncated in this view; remaining keyword arguments lie
# beyond this chunk.
set_parameters(
    normalized_popularity=dataset.normalized_popularity,
    loss_alpha=loss_alpha,
    loss_beta=loss_beta,
    loss_scale=loss_scale,
    loss_percentile=get_percentile(dataset.normalized_popularity, 45),
    metrics_alpha=metrics_alpha,
    metrics_beta=metrics_beta,
    metrics_gamma=metrics_gamma,
    metrics_scale=metrics_scale,
    metrics_percentile=metrics_percentile,
    loss_type=loss_type,
    k=k,
    k_trainable=k_trainable,
    low_popularity_threshold=dataset.thresholds[0],
def train(env, datasets):
    """
    Trains a semantic parser that translates natural language expressions
    to program code based on the language data provided.

    :param env: environment including model and language data.
    :param datasets: training and validation datasets (dict with 'train' and
        optionally 'dev' keys).
    """
    model = env['model']
    lang = env['lang']
    # Zero is padding token and no alignment.
    crit = nn.NLLLoss(ignore_index=0, reduction='sum')
    opt = optim.SGD(
        model.parameters(),
        lr=args.learning_rate,
        momentum=0.9
    )
    train_data = datasets['train']
    train_set = Dataset(
        train_data,
        model.device,
        args.mask_ratio
    )
    # The dev split is optional; only built when provided.
    if 'dev' in datasets:
        dev_data = datasets['dev']
        dev_set = Dataset(
            dev_data,
            model.device,
            args.mask_ratio
        )
    best_dev_acc = 0
    best_epoch = 0
    logger['log'].log(
        f'[INFO {datetime.now()}] commencing '
        f'training for {args.epochs} epochs'
    )
    # space
    print('')
    early_stop = 0
    for epoch in range(1, args.epochs+1):
        since = time.time()
        statistics = train_epoch(
            env, train_set, opt, crit, epoch
        )
        duration = elapsed(since)
        loss = statistics.loss
        accuracy = statistics.accuracy
        gold_acc = statistics.gold_accuracy
        logger['log'].log(
            f'[INFO {datetime.now()}] EPOCH {epoch} > '
            f'{"elapsed time: ":<25}{duration:.3f}s'
        )
        logger['log'].log(
            f'[INFO {datetime.now()}] EPOCH {epoch} > '
            f'{"train loss: ":<25}{loss:.5f}'
        )
        logger['log'].log(
            f'[INFO {datetime.now()}] EPOCH {epoch} > '
            f'{"train accuracy: ":<25}{accuracy*100:0>6.3f}%'
        )
        logger['log'].log(
            f'[INFO {datetime.now()}] EPOCH {epoch} > '
            f'{"train gold acc.: ":<25}{gold_acc*100:0>6.3f}%'
        )
        # NOTE(review): function truncated in this view; the validation branch
        # continues in the following chunk.
        if 'dev' in datasets and args.validate:
            # Validate model.
# NOTE(review): fragment — continues the epoch loop of train() from the
# previous chunk; the indentation below reflects the inferred enclosing scopes
# (def -> for epoch -> `if 'dev' in datasets and args.validate:` branch).
            statistics = validate(env, dev_set, crit)
            dev_loss = statistics.loss
            accuracy = statistics.accuracy
            gold_acc = statistics.gold_accuracy
            logger['log'].log(
                f'[INFO {datetime.now()}] EPOCH {epoch} > '
                f'{"dev loss: ":<25}{dev_loss:.5f}'
            )
            logger['log'].log(
                f'[INFO {datetime.now()}] EPOCH {epoch} > '
                f'{"dev accuracy: ":<25}{accuracy*100:0>6.3f}%'
            )
            logger['log'].log(
                f'[INFO {datetime.now()}] EPOCH {epoch} > '
                f'{"dev gold acc.: ":<25}{gold_acc*100:0>6.3f}%'
            )
            logger['log'].log(
                f'[INFO {datetime.now()}] EPOCH {epoch} > '
                f'{"best dev accuracy: ":<25}{best_dev_acc*100:0>6.3f}%'
            )
            # Save model if new best exact match accuracy on
            # development set.
            if args.best_gold and gold_acc > best_dev_acc:
                best_dev_acc = gold_acc
                __save_model(model, args, lang, epoch, best_epoch)
                best_epoch = epoch
                early_stop = 0
                logger['log'].log(
                    f'[INFO {datetime.now()}] EPOCH {epoch} > '
                    f'new best dev split gold accuracy, saving model'
                )
            # Save model if new best accuracy on development set.
            elif not args.best_gold and accuracy > best_dev_acc:
                best_dev_acc = accuracy
                __save_model(model, args, lang, epoch, best_epoch)
                best_epoch = epoch
                early_stop = 0
                logger['log'].log(
                    f'[INFO {datetime.now()}] EPOCH {epoch} > '
                    f'new best dev split accuracy, saving model'
                )
            else:
                # No improvement this epoch; count towards early stopping.
                early_stop = early_stop + 1
        else:  # if not validating
            # Save model each epoch if not validating.
            __save_model(model, args, lang, epoch, epoch-1)
        # space
        print('')
        if early_stop == args.early_stop:
            logger['log'].log(
                f'[INFO {datetime.now()}] no dev set improvement '
                f'since {args.early_stop} epochs, stop training'
            )
            break
    logger['log'].log(
        f'[INFO {datetime.now()}] training concluded'
    )
    logger['log'].close()
def y_position(x, cutoff):
    """Positional weight: sigmoid of the negated, gamma-scaled position, shifted by 0.5."""
    return sigmoid(-x * Settings.metrics_gamma / cutoff) + 0.5


def y_custom(popularity, position, cutoff):
    """Combined weight: popularity weight multiplied by the positional weight."""
    return y_popularity(popularity) * y_position(position, cutoff)


if __name__ == "__main__":
    # Smoke-test the settings module against the Pinterest dataset.
    print("Testing settings")
    dataset = Dataset('data/pinterest.npz')
    set_parameters(
        normalized_popularity=dataset.normalized_popularity,
        loss_alpha=200,
        loss_beta=0.02,
        loss_scale=1,
        loss_percentile=get_percentile(dataset.normalized_popularity, 45),
        metrics_alpha=100,
        metrics_beta=0.03,
        metrics_gamma=5,
        metrics_scale=1 / 15,
        metrics_percentile=0.45,
        loss_type=2
    )
max_neighbors = -1
config = Config()
# print('FLAGS.resume:', FLAGS.resume)
# print('config.logdir:', config.logdir)
if FLAGS.resume:
    # Resuming: reload the persisted config from the previous run's log dir.
    config.save_directory = config.logdir
    config.load()
dictConfig(get_logging_config(config.logdir))
dataset = Dataset(config.filename, limit=limit,
                  rebuild=rebuild, use_preprocess=use_preprocess)
# Configure the popularity-aware loss/metrics globals from this dataset.
# NOTE(review): call truncated in this view; remaining keyword arguments lie
# beyond this chunk.
set_parameters(normalized_popularity=dataset.normalized_popularity,
               loss_alpha=loss_alpha,
               loss_beta=loss_beta,
               loss_scale=loss_scale,
               loss_percentile=get_percentile(dataset.normalized_popularity, 45),
               metrics_alpha=metrics_alpha,
               metrics_beta=metrics_beta,
               metrics_gamma=metrics_gamma,
               metrics_scale=metrics_scale,
               metrics_percentile=metrics_percentile,
               loss_type=loss_type,
               k=k,
               k_trainable=k_trainable,
from util.data import Dataset

# Print descriptive statistics for every split of each knowledge graph.
kg = ['WN18RR', 'FB15K-237']  # , 'YAGO3-10']
for i in kg:
    dataset = Dataset(data_dir=f'KGs/{i}/')
    for split, label in ((dataset.train_data, 'Train'),
                         (dataset.valid_data, 'Valid'),
                         (dataset.test_data, 'Test')):
        dataset.descriptive_statistics(split, info=f'{i}-{label} set')
pretrain = FLAGS.pretrain
max_neighbors = -1
config = Config()
# print('FLAGS.resume:', FLAGS.resume)
# print('config.logdir:', config.logdir)
if FLAGS.resume:
    # Resuming: reload the persisted config from the previous run's log dir.
    config.save_directory = config.logdir
    config.load()
dictConfig(get_logging_config(config.logdir))
dataset = Dataset(config.filename, limit=limit)
# Configure the popularity-aware loss/metrics globals from this dataset.
# NOTE(review): call truncated in this view; remaining keyword arguments lie
# beyond this chunk.
set_parameters(
    normalized_popularity=dataset.normalized_popularity,
    loss_alpha=loss_alpha,
    loss_beta=loss_beta,
    loss_scale=loss_scale,
    loss_percentile=get_percentile(dataset.normalized_popularity, 45),
    metrics_alpha=metrics_alpha,
    metrics_beta=metrics_beta,
    metrics_gamma=metrics_gamma,
    metrics_scale=metrics_scale,
    metrics_percentile=metrics_percentile,
    loss_type=loss_type,
    k=k,
    k_trainable=k_trainable,
    low_popularity_threshold=dataset.thresholds[0],