def main(args):
    """Emit one MMP tuning YAML config per point of the hyper-parameter grid.

    Files are written to config/<args.path>/ as mmp-part<N>.yml, N starting at 1.
    """

    # Swept axes (kept as strings; they are substituted verbatim into YAML).
    key_dims = ['3', '4', '5']
    lambdas = ['0.001', '0.0001', '0.00001', '0.000001']
    rates = ['0.00005', '0.0001', '0.0005']
    mode_dims = ['1', '3']
    rank_values = ['60', '80', '100']

    # Template with slots {0}-{4} for the five swept hyper-parameters.
    template = ("parameters:\n"
                "    models: MMP\n"
                "    similarity: Cosine\n"
                "    alpha: [0, 0.1, 1]\n"
                "    batch_size: [32]\n"
                "    corruption: [0.2, 0.3, 0.4]\n"
                "    epoch: [300]\n"
                "    iteration: [10]\n"
                "    key_dimension: [{0}]\n"
                "    lambda: [{1}]\n"
                "    learning_rate: [{2}]\n"
                "    mode_dimension: [{3}]\n"
                "    rank: [{4}]\n"
                "    root: [1.0]\n"
                "    topK: [5, 10, 15, 20, 50]\n"
                "    metric: [R-Precision, NDCG, Precision, Recall]")

    index = 1
    for kd in key_dims:
        for lam in lambdas:
            for rate in rates:
                for md in mode_dims:
                    for rk in rank_values:
                        write_file('config/' + args.path + '/',
                                   'mmp-part' + str(index) + '.yml',
                                   template.format(kd, lam, rate, md, rk),
                                   exe=False)
                        index += 1
Пример #2
0
def main(args):
    """Write one sbatch-ready tuning bash script per YAML config, plus a
    master launcher script that submits them all via sbatch."""
    bash_path = load_yaml('config/global.yml', key='path')['bashes']

    yaml_files = get_file_names('config/{}'.format(args.dataset_name),
                                extension='.yml')
    project_path = "~/DeepCritiquingForRecSys"

    # Per-config script: activate the virtualenv, cd into the project, run tuning.
    script_template = ("#!/usr/bin/env bash\n"
                       "source {0}\n"
                       "cd {1}\n"
                       "python tune_parameters.py --data_dir {2} "
                       "--save_path {3}/{4}.csv --parameters config/{3}/{4}.yml\n")

    for yaml_file in yaml_files:
        name = os.path.splitext(yaml_file)[0]
        write_file(bash_path + args.dataset_name,
                   args.dataset_name + '-' + name + '.sh',
                   script_template.format(args.virtualenv_path, project_path,
                                          args.data_dir, args.dataset_name,
                                          name),
                   exe=True)

    bash_files = sorted(
        get_file_names(bash_path + args.dataset_name, extension='.sh'))

    # Optional SLURM GPU request.
    gpu_flag = '--gres=gpu:1 ' if args.gpu else ''
    commands = [
        'sbatch --nodes=1 --time={0}:00:00 --mem={1} --cpus=4 {2}{3}'.format(
            args.max_time, args.memory, gpu_flag, bash)
        for bash in bash_files
    ]
    write_file(bash_path + args.dataset_name,
               'run_' + args.dataset_name + '.sh', "\n".join(commands))
Пример #3
0
def main(args):
    """Generate a tuning bash script for every YAML under config/, plus a
    master launcher that sbatch-submits all of them."""
    bash_path = load_yaml('config/global.yml', key='path')['bashes']
    yaml_files = get_file_names('config', extension='.yml')
    # Hard-coded cluster-side checkout (instead of os.path.dirname(__file__)).
    project_path = "~/IF-VAE-Recommendation"

    # Per-config script: activate the virtualenv, cd into the project, run tuning.
    script_template = ("#!/usr/bin/env bash\n"
                       "source {0}\n"
                       "cd {1}\n"
                       "python tune_parameters.py -d {2} -n {3}/{4}.csv "
                       "-y config/{4}.yml\n")

    for yaml_file in yaml_files:
        name = os.path.splitext(yaml_file)[0]
        write_file(bash_path + args.problem,
                   args.problem + '-' + name + '.sh',
                   script_template.format(args.virtualenv_path, project_path,
                                          args.data_path, args.problem, name),
                   exe=True)

    bash_files = sorted(
        get_file_names(bash_path + args.problem, extension='.sh'))

    # Optional SLURM GPU request.
    gpu_flag = '--gres=gpu:1 ' if args.gpu else ''
    commands = [
        'sbatch --nodes=1 --time={0}:00:00 --mem={1} --cpus=4 {2}{3}'.format(
            args.max_time, args.memory, gpu_flag, bash)
        for bash in bash_files
    ]
    write_file(bash_path + args.problem, 'run_' + args.problem + '.sh',
               "\n".join(commands))
Пример #4
0
                if line != '' and line != '</doc>':
                    doc += line + '\n'

    logging.info(f'Cut WIKI to sentence')
    knowledge_collect = []
    collect_num = 0
    for p in tqdm(passage):
        k = p.split('\n')
        topic = k[0]
        article = ' '.join(k[1:])
        sentences = sent_tokenize(article, language=language)
        for s in sentences:
            knowledge = topic + sep + s
            knowledge_collect.append(knowledge)
            if len(knowledge_collect) == size:
                write_file(knowledge_collect,
                           f'{output_path}/{collect_num}.txt')
                knowledge_collect = []
                collect_num += 1

    write_file(knowledge_collect, f'{output_path}/{collect_num}.txt')

    logging.info(f'Index WIKI in Solr')

    knowledge = []
    for i in range(1000):
        if os.path.exists(f'{output_path}/{i}.txt'):
            logging.info('Load file', f'{output_path}/{i}.txt', len(knowledge))
            knowledge.extend([
                line[:-1]
                for line in open(f'{output_path}/{i}.txt', encoding='utf-8')
            ])
Пример #5
0
def main():
    """Curriculum evaluation/checkpoint loop for the generator on two dialog corpora.

    Loads two (context, response) dialog sets plus a knowledge pool, builds the
    generator, optionally restores a pretrained checkpoint, then walks the
    curriculum in chunks of ``10 * batch_size`` steps, evaluating on the CKGC
    test set and checkpointing after each chunk.

    NOTE(review): the train_generator call is commented out below, so this
    script currently only evaluates/saves while advancing the curriculum
    offset — confirm whether training was disabled intentionally.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('-dialog', type=str)      # dir holding context.pkl / response.pkl
    parser.add_argument('-dialog2', type=str)     # second dialog dir, same layout
    parser.add_argument('-k', type=str)           # knowledge shard dir, or the sentinel 'redis'
    parser.add_argument('-pool', type=str)        # pickled knowledge pool
    parser.add_argument('-m', type=int)           # max curriculum step
    parser.add_argument('-save_path', type=str)   # output root for predictions/checkpoints
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    args = parser.parse_args()

    dialog_path = args.dialog
    dialog2_path = args.dialog2
    knowledge_path = args.k
    pool_path = args.pool
    max_step = args.m
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    save_path = args.save_path
    language = args.language

    # 'redis' is a sentinel: pass the string through so the data layer fetches
    # knowledge from redis instead of local pickle shards.
    if knowledge_path != 'redis':
        knowledge = []
        for i in range(200):
            if os.path.exists(f'{knowledge_path}/{i}.pkl'):
                knowledge.extend(read_pkl(f'{knowledge_path}/{i}.pkl'))
    else:
        knowledge = knowledge_path
    knowledge_pool = read_pkl(pool_path)

    dataset = DuoData(read_pkl(f'{dialog_path}/context.pkl'),
                      read_pkl(f'{dialog_path}/response.pkl'),
                      read_pkl(f'{dialog2_path}/context.pkl'),
                      # Fix: load the second corpus's responses. The original
                      # read context.pkl twice (copy-paste slip) instead of
                      # mirroring the (context, response) pair above.
                      read_pkl(f'{dialog2_path}/response.pkl'),
                      knowledge_pool, pool_size=pool_size, knowledge=knowledge, order=None,
                      max_len=max_len, lang_code=lang_code, curriculum=max_step)

    test_dataset = CKGCTestData(args.language, pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len, lang_code=lang_code)

    tokenizer = get_tokenizer('mbart')
    tokenizer.lang_code_to_id = mbart_lang_to_id

    logging.info('Build generator')
    generator = Generator()
    if torch.cuda.is_available():
        generator = generator.cuda()

    optimizer = AdamW(generator.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        # Strip a possible DDP 'module.' prefix so a checkpoint saved from a
        # distributed run loads into this single-process model.
        generator.load_state_dict({k.replace("module.", ""): v for k, v in torch.load(pretrained_path).items()})

    cur_step = 0
    while cur_step < max_step:
        dataset.set_offset(cur_step)
        logging.info(f'Training step {cur_step} / max step {max_step}')
        # train_generator(generator, optimizer, dataset,
        #                 pad_idx=1, batch_size=batch_size, step=10)
        cur_step += 10 * batch_size
        predict, true = test_generator(generator, test_dataset, language, tokenizer,
                                       pad_idx=1, batch_size=batch_size, epoch=0, word_mask=None)
        logging.info(eval_all(predict, true))
        write_file(predict, f'{save_path}/predict/{cur_step}.txt')
        torch.save(generator.state_dict(), f'{save_path}/generator/{cur_step}.pt')
Пример #6
0
def main():
    """Epoch-based (optionally distributed) training loop for the generator.

    Builds the dialog/knowledge datasets, wraps the generator in DDP when
    --dist is set, optionally restores a pretrained checkpoint, then trains
    for up to 100 epochs.  Epochs whose checkpoint already exists are loaded
    and skipped, so interrupted runs resume.  Rank 0 evaluates on the CKGC
    test set and saves a checkpoint after every epoch.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format=
        '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
    )

    parser = argparse.ArgumentParser()
    parser.add_argument('-dialog', type=str)      # dir holding context.pkl / response.pkl
    parser.add_argument('-k', type=str)           # knowledge shard dir, or the sentinel 'redis'
    parser.add_argument('-pool', type=str)        # pickled knowledge pool
    parser.add_argument('-save_path', type=str)   # output root for predictions/checkpoints
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=1)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    parser.add_argument('--dist', type=int, default=1)
    args = parser.parse_args()

    dialog_path = args.dialog
    knowledge_path = args.k
    pool_path = args.pool
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    distributed = args.dist
    save_path = args.save_path
    language = args.language

    if distributed:
        dist_init()
    local_rank = dist.get_rank() if distributed else 0

    # 'redis' is a sentinel: pass the string through so the data layer fetches
    # knowledge from redis instead of local pickle shards.
    if knowledge_path != 'redis':
        knowledge = []
        for i in range(200):
            if os.path.exists(f'{knowledge_path}/{i}.pkl'):
                knowledge.extend(read_pkl(f'{knowledge_path}/{i}.pkl'))
    else:
        knowledge = knowledge_path
    knowledge_pool = read_pkl(pool_path)

    dataset = Data(read_pkl(f'{dialog_path}/context.pkl'),
                   read_pkl(f'{dialog_path}/response.pkl'),
                   knowledge_pool,
                   pool_size=pool_size,
                   knowledge=knowledge,
                   order=None,
                   max_len=max_len,
                   lang_code=lang_code)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len,
                                lang_code=lang_code)

    tokenizer = get_tokenizer('mbart')
    tokenizer.lang_code_to_id = mbart_lang_to_id

    logging.info('Build generator')
    generator = Generator()
    if torch.cuda.is_available():
        generator = generator.cuda()
    if distributed:
        generator = torch.nn.parallel.DistributedDataParallel(
            generator,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)
    optimizer = AdamW(generator.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        if distributed:
            dist.barrier()
            # Remap tensors saved from cuda:0 onto this rank's device.
            map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
            generator.load_state_dict(
                torch.load(pretrained_path, map_location=map_location))
            dist.barrier()
        else:
            # Strip a possible DDP 'module.' prefix from a distributed checkpoint.
            generator.load_state_dict({
                k.replace("module.", ""): v
                for k, v in torch.load(pretrained_path).items()
            })

    for epoch in range(100):
        # Resume support: if this epoch's checkpoint exists, load it and skip.
        if os.path.exists(f'{save_path}/generator/{epoch}.pt'):
            if distributed:
                dist.barrier()
                map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
                generator.load_state_dict(
                    torch.load(f'{save_path}/generator/{epoch}.pt',
                               map_location=map_location))
                dist.barrier()
            else:
                # Fix: read the same file checked and saved in this loop
                # ({save_path}/generator/{epoch}.pt).  The original loaded
                # save_path + f'_{epoch}.pt', a path never written, so
                # non-distributed resume always failed.
                generator.load_state_dict({
                    k.replace("module.", ""): v
                    for k, v in torch.load(
                        f'{save_path}/generator/{epoch}.pt').items()
                })
            continue

        if distributed:
            dist.barrier()
        logging.info(f'Training epoch {epoch}')
        train_generator(generator,
                        optimizer,
                        dataset,
                        pad_idx=1,
                        batch_size=batch_size,
                        epoch=epoch,
                        distributed=distributed)

        if distributed:
            dist.barrier()
        if local_rank == 0:
            # Only rank 0 evaluates and writes artifacts.
            predict, true = test_generator(generator,
                                           test_dataset,
                                           language,
                                           tokenizer,
                                           pad_idx=1,
                                           batch_size=batch_size,
                                           epoch=epoch,
                                           word_mask=None)
            logging.info(eval_all(predict, true))
            write_file(predict, f'{save_path}/predict/{epoch}.txt')
            torch.save(generator.state_dict(),
                       f'{save_path}/generator/{epoch}.pt')
        if distributed:
            dist.barrier()
Пример #7
0
                            knowledge.append(ks)
                        if dohash(k.strip()) == dohash(
                                turn['selected_knowledge'].strip()):
                            pool = [knowledge2id[ks]] + pool
                            fg = 0
                        else:
                            pool.append(knowledge2id[ks])
                if fg:
                    pool = [0] + pool
                if 0 not in pool:
                    pool.append(0)
                context.append(prefix)
                response.append(turn['text'])
                pools.append(pool)
            prefix = prefix + ' </s> ' + turn['text']
    write_file(context, f'dataset/ckgc/{lang}/context.txt')
    write_file(response, f'dataset/ckgc/{lang}/response.txt')
    write_file(knowledge, f'dataset/ckgc/{lang}/knowledge.txt')
    write_file(pools, f'dataset/ckgc/{lang}/pool.txt')

# input('>>>>')
# for reddit-english
# Load the English Reddit corpus: concatenate the three topical splits in order.
data = []
for split in ('train', 'dev', 'test'):
    data.extend(
        read_file(
            f'dataset/reddit_en/reddit_conversations.3turns.{split}.topical.txt'))
Пример #8
0
def main():
    """Epoch-based (optionally distributed) training loop for the retriever.

    Builds query/document datasets, wraps the retriever in DDP when --dist is
    set, optionally restores a pretrained checkpoint, then trains for up to
    100 epochs.  Epochs whose checkpoint already exists are loaded and
    skipped, so interrupted runs resume.  Rank 0 evaluates retrieval ranks on
    the CKGC test set and saves a checkpoint after every epoch.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format=
        '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s'
    )

    parser = argparse.ArgumentParser()
    parser.add_argument('-q', type=str)           # pickled query set
    parser.add_argument('-d', type=str)           # document shard dir, or the sentinel 'redis'
    parser.add_argument('-pool', type=str)        # pickled knowledge pool
    parser.add_argument('-save_path', type=str)   # output root for ranks/checkpoints
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    parser.add_argument('--dist', type=int, default=1)
    args = parser.parse_args()

    query_path = args.q
    document_path = args.d
    pool_path = args.pool
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    distributed = args.dist
    save_path = args.save_path

    if distributed:
        dist_init()
    local_rank = dist.get_rank() if distributed else 0

    logging.info(
        f'Load query from {query_path} and document from {document_path}')

    query = read_pkl(query_path)
    # 'redis' is a sentinel: pass the string through so the data layer fetches
    # documents from redis instead of local pickle shards.
    if document_path != 'redis':
        document = []
        for i in range(200):
            if os.path.exists(f'{document_path}/{i}.pkl'):
                document.extend(read_pkl(f'{document_path}/{i}.pkl'))
    else:
        document = document_path
    knowledge_pool = read_pkl(pool_path)

    # The query set is passed for both slots (context and response positions).
    dataset = Data(query,
                   query,
                   knowledge_pool,
                   pool_size=pool_size,
                   knowledge=document,
                   order=None,
                   max_len=max_len,
                   lang_code=lang_code)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len,
                                lang_code=lang_code)

    logging.info('Build retriever')
    retriever = Retriever()
    if torch.cuda.is_available():
        retriever = retriever.cuda()
    if distributed:
        retriever = torch.nn.parallel.DistributedDataParallel(
            retriever,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)
    optimizer = AdamW(retriever.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        if distributed:
            dist.barrier()
            # Remap tensors saved from cuda:0 onto this rank's device.
            map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
            retriever.load_state_dict(
                torch.load(pretrained_path, map_location=map_location))
            dist.barrier()
        else:
            # Strip a possible DDP 'module.' prefix from a distributed checkpoint.
            retriever.load_state_dict({
                k.replace("module.", ""): v
                for k, v in torch.load(pretrained_path).items()
            })

    for epoch in range(100):
        # Resume support: if this epoch's checkpoint exists, load it and skip.
        if os.path.exists(f'{save_path}/retriever/{epoch}.pt'):
            if distributed:
                dist.barrier()
                map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
                retriever.load_state_dict(
                    torch.load(f'{save_path}/retriever/{epoch}.pt',
                               map_location=map_location))
                dist.barrier()
            else:
                # Fix: read the same file checked and saved in this loop
                # ({save_path}/retriever/{epoch}.pt).  The original loaded
                # save_path + f'_{epoch}.pt', a path never written, so
                # non-distributed resume always failed.
                retriever.load_state_dict({
                    k.replace("module.", ""): v
                    for k, v in torch.load(
                        f'{save_path}/retriever/{epoch}.pt').items()
                })
            continue

        if distributed:
            dist.barrier()
        logging.info(f'Training epoch {epoch}')
        train_retriever(retriever,
                        optimizer,
                        dataset,
                        pad_idx=1,
                        batch_size=batch_size,
                        epoch=epoch,
                        distributed=distributed)

        if distributed:
            dist.barrier()
        if local_rank == 0:
            # Only rank 0 evaluates and writes artifacts.
            ranks = test_retriever(retriever,
                                   test_dataset,
                                   pad_idx=1,
                                   batch_size=batch_size,
                                   epoch=epoch)
            write_file(ranks, f'{save_path}/ranks/{epoch}.txt')
            torch.save(retriever.state_dict(),
                       f'{save_path}/retriever/{epoch}.pt')
        if distributed:
            dist.barrier()
0
def main():
    """Curriculum training loop for the retriever on two query sets.

    Trains in chunks of ``10 * batch_size`` curriculum steps, evaluating
    retrieval ranks on the CKGC test set and checkpointing after each chunk.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('-q1', type=str)          # first pickled query set
    parser.add_argument('-q2', type=str)          # second pickled query set
    parser.add_argument('-d', type=str)           # document shard dir, or 'redis'
    parser.add_argument('-pool', type=str)        # pickled knowledge pool
    parser.add_argument('-m', type=int)           # max curriculum step
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    args = parser.parse_args()

    query1_path, query2_path = args.q1, args.q2
    document_path = args.d
    pool_path = args.pool
    max_step = args.m
    batch_size = args.bc_size
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    save_path = args.save_path

    logging.info(f'Load query from {query1_path}-{query2_path} and document from {document_path}')

    query1 = read_pkl(query1_path)
    query2 = read_pkl(query2_path)
    if document_path == 'redis':
        # Sentinel: defer document lookup to the redis backend.
        document = document_path
    else:
        document = []
        for shard in range(200):
            shard_file = f'{document_path}/{shard}.pkl'
            if os.path.exists(shard_file):
                document.extend(read_pkl(shard_file))
    knowledge_pool = read_pkl(pool_path)

    # Each query set fills both slots (context and response positions).
    dataset = DuoData(query1, query1, query2, query2, knowledge_pool,
                      pool_size=pool_size, knowledge=document, order=None,
                      max_len=max_len, lang_code=lang_code,
                      curriculum=max_step)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len, lang_code=lang_code)

    logging.info('Build retriever')
    retriever = Retriever()
    if torch.cuda.is_available():
        retriever = retriever.cuda()

    optimizer = AdamW(retriever.parameters(), args.lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        # Strip a possible DDP 'module.' prefix from a distributed checkpoint.
        pretrained_state = torch.load(pretrained_path)
        retriever.load_state_dict(
            {k.replace("module.", ""): v for k, v in pretrained_state.items()})

    cur_step = 0
    while cur_step < max_step:
        dataset.set_offset(cur_step)
        logging.info(f'Training step {cur_step} / max step {max_step}')
        train_retriever(retriever, optimizer, dataset, pad_idx=1,
                        batch_size=batch_size, step=10)
        cur_step += 10 * batch_size
        ranks = test_retriever(retriever, test_dataset, pad_idx=1,
                               batch_size=batch_size, epoch=0)
        write_file(ranks, f'{save_path}/ranks/{cur_step}.txt')
        torch.save(retriever.state_dict(), f'{save_path}/retriever/{cur_step}.pt')