예제 #1
0
 def __init__(self, embedding_method='glove'):
     """Configure the lang8v1 dataset: text-length bounds and vocab caps."""
     Dataset.__init__(self)
     self.name = 'lang8v1'
     self.embedding_method = embedding_method
     # Character-level embedding dimension is disabled by default.
     self.character_dim = None
     # Sentences shorter than 3 or longer than 40 tokens are out of range.
     self.min_text_length = 3
     self.max_text_length = 40
     # Source and target vocabularies share the same size cap.
     self.src_vocab_size = 30000
     self.tgt_vocab_size = 30000
# Parse the command-line flags declared on `parser` earlier in the file.
FLAGS = parser.parse_args()
# Restrict TensorFlow to the requested GPU; must be set before any CUDA
# context is created.
os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

class Config(BaseConfig):
    """Hyper-parameters for the pairwise model, mostly sourced from FLAGS."""

    # Data and embedding settings.
    filename = FLAGS.dataset
    embed_size = FLAGS.embedding
    batch_size = FLAGS.batch_size

    # Filled in from the dataset after it has been loaded.
    user_count = -1
    item_count = -1

    # Optimization settings.
    optimizer = 'adam'
    learning_rate = 0.001
    l2 = FLAGS.l2
    neg_count = FLAGS.neg

# Build the config, attach the real corpus sizes, and log the settings.
config = Config()
dataset = Dataset(config.filename)
config.item_count = dataset.item_count
config.user_count = dataset.user_count
tf.logging.info("\n\n%s\n\n" % config)

# Supervisor with logdir=None and zero save intervals: manages session
# creation/initialization without writing checkpoints or summaries.
model = PairwiseGMF(config)
sv = tf.train.Supervisor(logdir=None, save_model_secs=0, save_summaries_secs=0)
# Start with at most 10% of GPU memory; allow_growth lets TF expand lazily.
sess = sv.prepare_or_wait_for_session(
    config=tf.ConfigProto(gpu_options=tf.GPUOptions(
        per_process_gpu_memory_fraction=0.1,
        allow_growth=True)))

for i in range(FLAGS.iters):
    if sv.should_stop():
        break
    progress = tqdm(enumerate(dataset.get_data(FLAGS.batch_size, False, FLAGS.neg)),
예제 #3
0

# Model/training hyper-parameters; user/item counts are placeholders (-1)
# until the dataset has been loaded.
class Config(BaseConfig):
    filename = FLAGS.dataset        # path to the ratings/interactions file
    embed_size = FLAGS.embedding    # latent embedding dimension
    batch_size = FLAGS.batch_size
    l2 = FLAGS.l2                   # L2 regularization strength
    user_count = -1
    item_count = -1
    optimizer = 'adam'
    neg_count = FLAGS.neg           # negative samples per positive
    learning_rate = 0.001


config = Config()
dataset = Dataset(config.filename)
# Fill in the real corpus sizes discovered while loading the data.
config.item_count = dataset.item_count
config.user_count = dataset.user_count
tf.logging.info("\n\n%s\n\n" % config)

# Supervisor with logdir=None: no checkpointing or summary writing; it only
# manages session creation and variable initialization.
model = PairwiseGMF(config)
sv = tf.train.Supervisor(logdir=None, save_model_secs=0, save_summaries_secs=0)
# Cap initial GPU memory at 10%, growing lazily as needed.
sess = sv.prepare_or_wait_for_session(config=tf.ConfigProto(
    gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.1,
                              allow_growth=True)))

for i in range(FLAGS.iters):
    if sv.should_stop():
        break
    progress = tqdm(enumerate(
        dataset.get_data(FLAGS.batch_size, False, FLAGS.neg)),
            yield t


def store(triples, full_path):
    """Serialize (subject, predicate, object) triples to *full_path* as
    tab-separated lines, one triple per line."""
    with open(full_path, 'w') as writer:
        writer.writelines('\t'.join(triple) + '\n' for triple in triples)


# Drop valid/test triples whose entities or relations never occur in the
# train split, then write the cleaned splits to a sibling 'KGs/<name>*/' dir.
kg = ['WN18RR', 'FB15k-237', 'YAGO3-10']
for i in kg:
    data_dir = f'KGs/{i}/'
    # Cleaned copies live in a directory suffixed with '*'.
    clean_data_dir = f'KGs/{i}*/'
    dataset = Dataset(data_dir=data_dir)
    print(data_dir)

    # Keep only triples fully covered by the training vocabulary.
    clean_valid_set = clean_dataset(
        dataset.valid_data,
        entities=dataset.get_entities(dataset.train_data),
        relations=dataset.get_relations(dataset.train_data))
    clean_test_set = clean_dataset(
        dataset.test_data,
        entities=dataset.get_entities(dataset.train_data),
        relations=dataset.get_relations(dataset.train_data))

    store(dataset.train_data, clean_data_dir + 'train.txt')  # Train set
    store(clean_valid_set, clean_data_dir + 'valid.txt')  # Cleaned valid set
    store(clean_test_set, clean_data_dir + 'test.txt')  # Clean test set
        # 3. Create mappings.
        # enumerate replaces the manual zip(ids, range(len(ids))) pattern and
        # calls entity_ids()/relation_ids() only once each.
        # 3.1 Entity index mapping: entity id -> contiguous integer index.
        entity_idxs = {
            e: e_idx
            for e_idx, e in enumerate(model.dataset.entity_ids())
        }
        # 3.2 Relation index mapping: relation id -> contiguous integer index.
        relation_idxs = {
            r: r_idx
            for r_idx, r in enumerate(model.dataset.relation_ids())
        }
    # 2. Load Dataset (the '*' directory holds the cleaned split).
    dataset = Dataset(data_dir=f'KGs/FB15K-237*/')

    # 4. Subject-Predicate to Object mapping and Predicate-Object to Subject mapping. This will be used at computing filtering ranks.
    # NOTE(review): entity_idxs/relation_idxs must already be in scope here.
    sp_vocab, so_vocab, po_vocab = dataset.get_mappings(
        dataset.train_data + dataset.valid_data + dataset.test_data,
        entity_idxs=entity_idxs,
        relation_idxs=relation_idxs)
    # Evaluator uses the vocabularies to compute filtered ranks.
    ev = Evaluator(entity_idxs=entity_idxs,
                   relation_idxs=relation_idxs,
                   sp_vocab=sp_vocab,
                   so_vocab=so_vocab,
                   po_vocab=po_vocab)

    lp_results = ev.filtered_relation_prediction(dataset.test_data, model)
    with open(f'fb15k-237_*_{m}_relation_prediction_results.json',
              'w') as file_descriptor:
from util.data import Dataset

# Report out-of-vocabulary statistics: how many test/valid triples mention
# entities that never appear in the training split of each knowledge graph.
kg = ['WN18RR', 'FB15k-237', 'YAGO3-10']
for i in kg:
    dataset = Dataset(data_dir=f'KGs/{i}/')
    # Get all entities from train set.
    entities = set(dataset.get_entities(dataset.train_data))
    dataset.describe_oov(dataset.test_data, entities, info=f'{i}-Test set')
    dataset.describe_oov(dataset.valid_data, entities, info=f'{i}-Val set')

# Cleaned datasets (the '*' directories) — presumably these report zero OOV.
kg = ['WN18RR*', 'FB15k-237*', 'YAGO3-10*']
for i in kg:
    dataset = Dataset(data_dir=f'KGs/{i}/')
    # Get all entities from train set.
    entities = set(dataset.get_entities(dataset.train_data))
    dataset.describe_oov(dataset.test_data, entities, info=f'{i}-Test set')
    dataset.describe_oov(dataset.valid_data, entities, info=f'{i}-Val set')
        # Hyper-parameters sourced from command-line FLAGS; user/item counts
        # are placeholders (-1) until the dataset has been loaded.
        class Config(BaseConfig):
            filename = FLAGS.dataset
            embed_size = FLAGS.embedding
            batch_size = FLAGS.batch_size
            l2 = FLAGS.l2
            user_count = -1
            item_count = -1
            optimizer = 'adam'
            neg_count = FLAGS.neg
            learning_rate = 0.001


        config = Config()

        dataset = Dataset(config.filename, limit=limit)
        set_parameters(
            normalized_popularity=dataset.normalized_popularity,
            loss_alpha=loss_alpha,
            loss_beta=loss_beta,
            loss_scale=loss_scale,
            loss_percentile=get_percentile(dataset.normalized_popularity, 45),
            metrics_alpha=metrics_alpha,
            metrics_beta=metrics_beta,
            metrics_gamma=metrics_gamma,
            metrics_scale=metrics_scale,
            metrics_percentile=metrics_percentile,
            loss_type=loss_type,
            k=k,
            k_trainable=k_trainable,
            low_popularity_threshold=dataset.thresholds[0],
예제 #8
0
def train(env, datasets):
    """
    Trains a semantic parser that translates natural
    language expressions to program code based on the
    language data provided.

    :param env:         environment including model and
                        language data.
    :param datasets:    training and validation datasets.
    """

    model = env['model']
    lang = env['lang']

    # Zero is padding token and no alignment.
    crit = nn.NLLLoss(ignore_index=0, reduction='sum')
    opt = optim.SGD(
        model.parameters(),
        lr=args.learning_rate,
        momentum=0.9
    )

    train_data = datasets['train']
    train_set = Dataset(
        train_data,
        model.device,
        args.mask_ratio
    )

    # Dev split is optional; best-accuracy tracking only exists when present.
    if 'dev' in datasets:
        dev_data = datasets['dev']
        dev_set = Dataset(
            dev_data,
            model.device,
            args.mask_ratio
        )

        best_dev_acc = 0
        best_epoch = 0

    logger['log'].log(
        f'[INFO {datetime.now()}]    commencing '
        f'training for {args.epochs} epochs'
    )

    # space
    print('')
    # Consecutive epochs without a dev-set improvement.
    early_stop = 0
    for epoch in range(1, args.epochs+1):
        since = time.time()
        statistics = train_epoch(
            env, train_set,
            opt, crit, epoch
        )

        duration = elapsed(since)
        loss = statistics.loss
        accuracy = statistics.accuracy
        gold_acc = statistics.gold_accuracy

        logger['log'].log(
            f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
            f'{"elapsed time: ":<25}{duration:.3f}s'
        )

        logger['log'].log(
            f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
            f'{"train loss: ":<25}{loss:.5f}'
        )

        logger['log'].log(
            f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
            f'{"train accuracy: ":<25}{accuracy*100:0>6.3f}%'
        )

        logger['log'].log(
            f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
            f'{"train gold acc.: ":<25}{gold_acc*100:0>6.3f}%'
        )

        if 'dev' in datasets and args.validate:
            # Validate model.
            statistics = validate(env, dev_set, crit)

            dev_loss = statistics.loss
            accuracy = statistics.accuracy
            gold_acc = statistics.gold_accuracy

            logger['log'].log(
                f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
                f'{"dev loss: ":<25}{dev_loss:.5f}'
            )

            logger['log'].log(
                f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
                f'{"dev accuracy: ":<25}{accuracy*100:0>6.3f}%'
            )

            logger['log'].log(
                f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
                f'{"dev gold acc.: ":<25}{gold_acc*100:0>6.3f}%'
            )

            logger['log'].log(
                f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
                f'{"best dev accuracy: ":<25}{best_dev_acc*100:0>6.3f}%'
            )

            # Save model if new best exact match accuracy on
            # development set.
            # NOTE: the *previous* best_epoch is passed to __save_model
            # before best_epoch is updated to the current epoch.
            if args.best_gold and gold_acc > best_dev_acc:
                best_dev_acc = gold_acc
                __save_model(model, args, lang, epoch, best_epoch)
                best_epoch = epoch
                early_stop = 0

                logger['log'].log(
                    f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
                    f'new best dev split gold accuracy, saving model'
                )

            # Save model if new best accuracy on development set.
            elif not args.best_gold and accuracy > best_dev_acc:
                best_dev_acc = accuracy
                __save_model(model, args, lang, epoch, best_epoch)
                best_epoch = epoch
                early_stop = 0

                logger['log'].log(
                    f'[INFO {datetime.now()}]    EPOCH {epoch} >   '
                    f'new best dev split accuracy, saving model'
                )

            else:
                early_stop = early_stop + 1

        else:  # if not validating

            # Save model each epoch if not validating.
            __save_model(model, args, lang, epoch, epoch-1)

        # space
        print('')

        # NOTE(review): '==' means early stopping fires only exactly at the
        # threshold; assumes args.early_stop is a positive int — confirm.
        if early_stop == args.early_stop:
            logger['log'].log(
                f'[INFO {datetime.now()}]    no dev set improvement '
                f'since {args.early_stop} epochs, stop training'
            )
            break

    logger['log'].log(
        f'[INFO {datetime.now()}]    training concluded'
    )

    logger['log'].close()

def y_position(x, cutoff):
    """Position weight: shifted sigmoid of the scaled, negated position."""
    return sigmoid(-x * Settings.metrics_gamma / cutoff) + 0.5


def y_custom(popularity, position, cutoff):
    """Combined weight: popularity term multiplied by the position term."""
    return y_popularity(popularity) * y_position(position, cutoff)


if __name__ == "__main__":
    print("Testing settings")

    dataset = Dataset('data/pinterest.npz')

    set_parameters(
        normalized_popularity=dataset.normalized_popularity,
        loss_alpha=200,
        loss_beta=0.02,
        loss_scale=1,
        loss_percentile=get_percentile(dataset.normalized_popularity, 45),
        metrics_alpha=100,
        metrics_beta=0.03,
        metrics_gamma=5,
        metrics_scale=1 / 15,
        metrics_percentile=0.45,
        loss_type=2
    )
예제 #10
0
    max_neighbors = -1


config = Config()

# print('FLAGS.resume:', FLAGS.resume)
# print('config.logdir:', config.logdir)

# When resuming, reload the previously saved config from the log directory.
if FLAGS.resume:
    config.save_directory = config.logdir
    config.load()

# Route all logging to the run's log directory.
dictConfig(get_logging_config(config.logdir))

dataset = Dataset(config.filename,
                  limit=limit,
                  rebuild=rebuild,
                  use_preprocess=use_preprocess)
set_parameters(normalized_popularity=dataset.normalized_popularity,
               loss_alpha=loss_alpha,
               loss_beta=loss_beta,
               loss_scale=loss_scale,
               loss_percentile=get_percentile(dataset.normalized_popularity,
                                              45),
               metrics_alpha=metrics_alpha,
               metrics_beta=metrics_beta,
               metrics_gamma=metrics_gamma,
               metrics_scale=metrics_scale,
               metrics_percentile=metrics_percentile,
               loss_type=loss_type,
               k=k,
               k_trainable=k_trainable,
예제 #11
0
from util.data import Dataset

# Print descriptive statistics for each split of the selected knowledge
# graphs (YAGO3-10 is currently commented out).
kg = ['WN18RR', 'FB15K-237']  #, 'YAGO3-10']
for i in kg:
    dataset = Dataset(data_dir=f'KGs/{i}/')
    dataset.descriptive_statistics(dataset.train_data, info=f'{i}-Train set')
    dataset.descriptive_statistics(dataset.valid_data, info=f'{i}-Valid set')
    dataset.descriptive_statistics(dataset.test_data, info=f'{i}-Test set')
    pretrain = FLAGS.pretrain
    max_neighbors = -1


config = Config()

# print('FLAGS.resume:', FLAGS.resume)
# print('config.logdir:', config.logdir)

# When resuming, reload the previously saved config from the log directory.
if FLAGS.resume:
    config.save_directory = config.logdir
    config.load()

# Route all logging to the run's log directory.
dictConfig(get_logging_config(config.logdir))

dataset = Dataset(config.filename, limit=limit)
set_parameters(
    normalized_popularity=dataset.normalized_popularity,
    loss_alpha=loss_alpha,
    loss_beta=loss_beta,
    loss_scale=loss_scale,
    loss_percentile=get_percentile(dataset.normalized_popularity, 45),
    metrics_alpha=metrics_alpha,
    metrics_beta=metrics_beta,
    metrics_gamma=metrics_gamma,
    metrics_scale=metrics_scale,
    metrics_percentile=metrics_percentile,
    loss_type=loss_type,
    k=k,
    k_trainable=k_trainable,
    low_popularity_threshold=dataset.thresholds[0],