def main(args):
    key_dimensions = ['3', '4', '5']
    lambs = ['0.001', '0.0001', '0.00001', '0.000001']
    learning_rates = ['0.00005', '0.0001', '0.0005']
    mode_dimensions = ['1', '3']
    ranks = ['60', '80', '100']

    pattern = "parameters:\n" \
              " models: MMP\n" \
              " similarity: Cosine\n" \
              " alpha: [0, 0.1, 1]\n" \
              " batch_size: [32]\n" \
              " corruption: [0.2, 0.3, 0.4]\n" \
              " epoch: [300]\n" \
              " iteration: [10]\n" \
              " key_dimension: [{0}]\n" \
              " lambda: [{1}]\n" \
              " learning_rate: [{2}]\n" \
              " mode_dimension: [{3}]\n" \
              " rank: [{4}]\n" \
              " root: [1.0]\n" \
              " topK: [5, 10, 15, 20, 50]\n" \
              " metric: [R-Precision, NDCG, Precision, Recall]"

    i = 1
    for key_dimension in key_dimensions:
        for lamb in lambs:
            for learning_rate in learning_rates:
                for mode_dimension in mode_dimensions:
                    for rank in ranks:
                        content = pattern.format(key_dimension, lamb, learning_rate,
                                                 mode_dimension, rank)
                        write_file('config/' + args.path + '/',
                                   'mmp-part' + str(i) + '.yml',
                                   content, exe=False)
                        i += 1
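
# The grid generator above only reads args.path. A minimal entry-point sketch is given
# below; the flag name '--path' is an assumption inferred from that single attribute and
# is not confirmed by the original script.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Generate MMP hyper-parameter config files')
    parser.add_argument('--path', type=str, required=True,
                        help='sub-directory under config/ that receives the mmp-part*.yml files')
    main(parser.parse_args())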
def main(args):
    bash_path = load_yaml('config/global.yml', key='path')['bashes']
    config_path = 'config/{}'.format(args.dataset_name)
    yaml_files = get_file_names(config_path, extension='.yml')

    project_path = "~/DeepCritiquingForRecSys"

    pattern = "#!/usr/bin/env bash\n" \
              "source {0}\n" \
              "cd {1}\n" \
              "python tune_parameters.py --data_dir {2} --save_path {3}/{4}.csv --parameters config/{3}/{4}.yml\n"

    for setting in yaml_files:
        name, extension = os.path.splitext(setting)
        content = pattern.format(args.virtualenv_path, project_path, args.data_dir,
                                 args.dataset_name, name)
        write_file(bash_path + args.dataset_name,
                   args.dataset_name + '-' + name + '.sh',
                   content, exe=True)

    bash_files = sorted(get_file_names(bash_path + args.dataset_name, extension='.sh'))

    commands = []
    command_pattern = 'sbatch --nodes=1 --time={0}:00:00 --mem={1} --cpus=4 '
    if args.gpu:
        command_pattern = command_pattern + '--gres=gpu:1 '
    command_pattern = command_pattern + '{2}'

    for bash in bash_files:
        commands.append(command_pattern.format(args.max_time, args.memory, bash))

    content = "\n".join(commands)
    write_file(bash_path + args.dataset_name, 'run_' + args.dataset_name + '.sh', content)
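
# The config/bash generators above call a write_file(folder, name, content, exe=False)
# helper that is not shown here. Below is a minimal sketch consistent with those calls
# (the real helper may differ): create the folder if needed, write the text, and mark
# the file executable when exe=True.
import os
import stat


def write_file_sketch(folder_path, file_name, content, exe=False):
    os.makedirs(folder_path, exist_ok=True)
    target = os.path.join(folder_path, file_name)
    with open(target, 'w') as f:
        f.write(content)
    if exe:
        mode = os.stat(target).st_mode
        os.chmod(target, mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)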
def main(args):
    bash_path = load_yaml('config/global.yml', key='path')['bashes']
    yaml_files = get_file_names('config', extension='.yml')

    # project_path = os.path.dirname(os.path.abspath(__file__))
    project_path = "~/IF-VAE-Recommendation"

    pattern = "#!/usr/bin/env bash\n" \
              "source {0}\n" \
              "cd {1}\n" \
              "python tune_parameters.py -d {2} -n {3}/{4}.csv -y config/{4}.yml\n"

    for setting in yaml_files:
        name, extension = os.path.splitext(setting)
        content = pattern.format(args.virtualenv_path, project_path, args.data_path,
                                 args.problem, name)
        write_file(bash_path + args.problem,
                   args.problem + '-' + name + '.sh',
                   content, exe=True)

    bash_files = sorted(get_file_names(bash_path + args.problem, extension='.sh'))

    commands = []
    command_pattern = 'sbatch --nodes=1 --time={0}:00:00 --mem={1} --cpus=4 '
    if args.gpu:
        command_pattern = command_pattern + '--gres=gpu:1 '
    command_pattern = command_pattern + '{2}'

    for bash in bash_files:
        commands.append(command_pattern.format(args.max_time, args.memory, bash))

    content = "\n".join(commands)
    write_file(bash_path + args.problem, 'run_' + args.problem + '.sh', content)
        if line != '' and line != '</doc>':
            doc += line + '\n'

    logging.info(f'Cut WIKI to sentence')
    knowledge_collect = []
    collect_num = 0
    for p in tqdm(passage):
        k = p.split('\n')
        topic = k[0]
        article = ' '.join(k[1:])
        sentences = sent_tokenize(article, language=language)
        for s in sentences:
            knowledge = topic + sep + s
            knowledge_collect.append(knowledge)
            if len(knowledge_collect) == size:
                write_file(knowledge_collect, f'{output_path}/{collect_num}.txt')
                knowledge_collect = []
                collect_num += 1
    # Flush the last, partially filled chunk.
    write_file(knowledge_collect, f'{output_path}/{collect_num}.txt')

    logging.info(f'Index WIKI in Solr')
    knowledge = []
    for i in range(1000):
        if os.path.exists(f'{output_path}/{i}.txt'):
            logging.info(f'Load file {output_path}/{i}.txt, collected {len(knowledge)} so far')
            knowledge.extend([
                line[:-1]
                for line in open(f'{output_path}/{i}.txt', encoding='utf-8')
            ])
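
# Note that write_file in this script has a different signature from the config-generator
# scripts: it takes a list of items and a target path and writes one item per line (the
# reader above strips the trailing newline with line[:-1]). A minimal sketch of the
# helpers this file assumes; the real implementations may differ.
import pickle


def write_lines_sketch(items, path):
    with open(path, 'w', encoding='utf-8') as f:
        for item in items:
            f.write(f'{item}\n')


def read_pkl_sketch(path):
    with open(path, 'rb') as f:
        return pickle.load(f)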
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('-dialog', type=str)
    parser.add_argument('-dialog2', type=str)
    parser.add_argument('-k', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-m', type=int)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    args = parser.parse_args()

    dialog_path = args.dialog
    dialog2_path = args.dialog2
    knowledge_path = args.k
    pool_path = args.pool
    max_step = args.m
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    save_path = args.save_path
    language = args.language

    # Knowledge is either loaded from sharded pickle files or served from redis.
    if knowledge_path != 'redis':
        knowledge = []
        for i in range(200):
            if os.path.exists(f'{knowledge_path}/{i}.pkl'):
                knowledge.extend(read_pkl(f'{knowledge_path}/{i}.pkl'))
    else:
        knowledge = knowledge_path

    knowledge_pool = read_pkl(pool_path)
    dataset = DuoData(read_pkl(f'{dialog_path}/context.pkl'),
                      read_pkl(f'{dialog_path}/response.pkl'),
                      read_pkl(f'{dialog2_path}/context.pkl'),
                      read_pkl(f'{dialog2_path}/context.pkl'),
                      knowledge_pool, pool_size=pool_size, knowledge=knowledge,
                      order=None, max_len=max_len, lang_code=lang_code,
                      curriculum=max_step)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len, lang_code=lang_code)
    tokenizer = get_tokenizer('mbart')
    tokenizer.lang_code_to_id = mbart_lang_to_id

    logging.info('Build generator')
    generator = Generator()
    if torch.cuda.is_available():
        generator = generator.cuda()
    optimizer = AdamW(generator.parameters(), lr)

    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        # Strip the "module." prefix left by DataParallel/DistributedDataParallel checkpoints.
        generator.load_state_dict({k.replace("module.", ""): v
                                   for k, v in torch.load(pretrained_path).items()})

    cur_step = 0
    while cur_step < max_step:
        dataset.set_offset(cur_step)
        logging.info(f'Training step {cur_step} / max step {max_step}')
        train_generator(generator, optimizer, dataset,
                        pad_idx=1, batch_size=batch_size, step=10)
        cur_step += 10 * batch_size
        predict, true = test_generator(generator, test_dataset, language, tokenizer,
                                       pad_idx=1, batch_size=batch_size, epoch=0,
                                       word_mask=None)
        logging.info(eval_all(predict, true))
        write_file(predict, f'{save_path}/predict/{cur_step}.txt')
        torch.save(generator.state_dict(), f'{save_path}/generator/{cur_step}.pt')
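
# The dict comprehension above removes the "module." prefix that DataParallel /
# DistributedDataParallel adds to parameter names, so a checkpoint saved from a wrapped
# model can be loaded into a plain module. A self-contained toy illustration; the file
# name 'toy_ddp_ckpt.pt' exists only for this example.
import torch
import torch.nn as nn

layer = nn.Linear(4, 2)
wrapped_state = {'module.' + k: v for k, v in layer.state_dict().items()}  # as DDP would save it
torch.save(wrapped_state, 'toy_ddp_ckpt.pt')

plain = nn.Linear(4, 2)
plain.load_state_dict({k.replace('module.', ''): v
                       for k, v in torch.load('toy_ddp_ckpt.pt').items()})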
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('-dialog', type=str)
    parser.add_argument('-k', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=1)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    parser.add_argument('--dist', type=int, default=1)
    args = parser.parse_args()

    dialog_path = args.dialog
    knowledge_path = args.k
    pool_path = args.pool
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    distributed = args.dist
    save_path = args.save_path
    language = args.language

    if distributed:
        dist_init()
    local_rank = dist.get_rank() if distributed else 0

    # Knowledge is either loaded from sharded pickle files or served from redis.
    if knowledge_path != 'redis':
        knowledge = []
        for i in range(200):
            if os.path.exists(f'{knowledge_path}/{i}.pkl'):
                knowledge.extend(read_pkl(f'{knowledge_path}/{i}.pkl'))
    else:
        knowledge = knowledge_path

    knowledge_pool = read_pkl(pool_path)
    dataset = Data(read_pkl(f'{dialog_path}/context.pkl'),
                   read_pkl(f'{dialog_path}/response.pkl'),
                   knowledge_pool, pool_size=pool_size, knowledge=knowledge,
                   order=None, max_len=max_len, lang_code=lang_code)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len, lang_code=lang_code)
    tokenizer = get_tokenizer('mbart')
    tokenizer.lang_code_to_id = mbart_lang_to_id

    logging.info('Build generator')
    generator = Generator()
    if torch.cuda.is_available():
        generator = generator.cuda()
    if distributed:
        generator = torch.nn.parallel.DistributedDataParallel(
            generator, device_ids=[local_rank], output_device=local_rank,
            find_unused_parameters=True)
    optimizer = AdamW(generator.parameters(), lr)

    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        if distributed:
            dist.barrier()
            # Each rank maps the checkpoint (saved from cuda:0) onto its own device.
            map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
            generator.load_state_dict(
                torch.load(pretrained_path, map_location=map_location))
            dist.barrier()
        else:
            generator.load_state_dict({
                k.replace("module.", ""): v
                for k, v in torch.load(pretrained_path).items()
            })

    for epoch in range(100):
        # Resume support: skip epochs whose checkpoint already exists, loading its weights.
        if os.path.exists(f'{save_path}/generator/{epoch}.pt'):
            if distributed:
                dist.barrier()
                map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
                generator.load_state_dict(
                    torch.load(f'{save_path}/generator/{epoch}.pt',
                               map_location=map_location))
                dist.barrier()
            else:
                generator.load_state_dict({
                    k.replace("module.", ""): v
                    for k, v in torch.load(f'{save_path}/generator/{epoch}.pt').items()
                })
            continue

        if distributed:
            dist.barrier()
        logging.info(f'Training epoch {epoch}')
        train_generator(generator, optimizer, dataset, pad_idx=1,
                        batch_size=batch_size, epoch=epoch, distributed=distributed)
        if distributed:
            dist.barrier()

        if local_rank == 0:
            predict, true = test_generator(generator, test_dataset, language, tokenizer,
                                           pad_idx=1, batch_size=batch_size,
                                           epoch=epoch, word_mask=None)
            logging.info(eval_all(predict, true))
            write_file(predict, f'{save_path}/predict/{epoch}.txt')
            torch.save(generator.state_dict(), f'{save_path}/generator/{epoch}.pt')
        if distributed:
            dist.barrier()
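
# dist_init() is not defined in this file. Below is a plausible sketch of what it needs
# to do for the DistributedDataParallel setup above, assuming a torchrun /
# torch.distributed.launch style launch that sets RANK, WORLD_SIZE, MASTER_ADDR and
# LOCAL_RANK in the environment; the real helper may differ.
import os
import torch
import torch.distributed as dist


def dist_init_sketch():
    backend = 'nccl' if torch.cuda.is_available() else 'gloo'
    dist.init_process_group(backend=backend)
    if torch.cuda.is_available():
        torch.cuda.set_device(int(os.environ.get('LOCAL_RANK', dist.get_rank())))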
                knowledge.append(ks)
                if dohash(k.strip()) == dohash(turn['selected_knowledge'].strip()):
                    pool = [knowledge2id[ks]] + pool
                    fg = 0
                else:
                    pool.append(knowledge2id[ks])
            if fg:
                pool = [0] + pool
            if 0 not in pool:
                pool.append(0)
            context.append(prefix)
            response.append(turn['text'])
            pools.append(pool)
            prefix = prefix + ' </s> ' + turn['text']

    write_file(context, f'dataset/ckgc/{lang}/context.txt')
    write_file(response, f'dataset/ckgc/{lang}/response.txt')
    write_file(knowledge, f'dataset/ckgc/{lang}/knowledge.txt')
    write_file(pools, f'dataset/ckgc/{lang}/pool.txt')
    # input('>>>>')

# for reddit-english
data = []
data.extend(
    read_file('dataset/reddit_en/reddit_conversations.3turns.train.topical.txt'))
data.extend(
    read_file('dataset/reddit_en/reddit_conversations.3turns.dev.topical.txt'))
data.extend(
    read_file('dataset/reddit_en/reddit_conversations.3turns.test.topical.txt'))
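
# dohash() is defined elsewhere; the code above only uses it to compare normalized
# knowledge strings for equality, so any stable string fingerprint would serve. A
# hypothetical stand-in, not the project's actual implementation:
import hashlib


def dohash_sketch(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()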
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('-q', type=str)
    parser.add_argument('-d', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    parser.add_argument('--dist', type=int, default=1)
    args = parser.parse_args()

    query_path = args.q
    document_path = args.d
    pool_path = args.pool
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    distributed = args.dist
    save_path = args.save_path

    if distributed:
        dist_init()
    local_rank = dist.get_rank() if distributed else 0

    logging.info(
        f'Load query from {query_path} and document from {document_path}')
    query = read_pkl(query_path)

    # Documents are either loaded from sharded pickle files or served from redis.
    if document_path != 'redis':
        document = []
        for i in range(200):
            if os.path.exists(f'{document_path}/{i}.pkl'):
                document.extend(read_pkl(f'{document_path}/{i}.pkl'))
    else:
        document = document_path

    knowledge_pool = read_pkl(pool_path)
    dataset = Data(query, query, knowledge_pool, pool_size=pool_size,
                   knowledge=document, order=None, max_len=max_len,
                   lang_code=lang_code)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len, lang_code=lang_code)

    logging.info('Build retriever')
    retriever = Retriever()
    if torch.cuda.is_available():
        retriever = retriever.cuda()
    if distributed:
        retriever = torch.nn.parallel.DistributedDataParallel(
            retriever, device_ids=[local_rank], output_device=local_rank,
            find_unused_parameters=True)
    optimizer = AdamW(retriever.parameters(), lr)

    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        if distributed:
            dist.barrier()
            map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
            retriever.load_state_dict(
                torch.load(pretrained_path, map_location=map_location))
            dist.barrier()
        else:
            retriever.load_state_dict({
                k.replace("module.", ""): v
                for k, v in torch.load(pretrained_path).items()
            })

    for epoch in range(100):
        # Resume support: skip epochs whose checkpoint already exists, loading its weights.
        if os.path.exists(f'{save_path}/retriever/{epoch}.pt'):
            if distributed:
                dist.barrier()
                map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
                retriever.load_state_dict(
                    torch.load(f'{save_path}/retriever/{epoch}.pt',
                               map_location=map_location))
                dist.barrier()
            else:
                retriever.load_state_dict({
                    k.replace("module.", ""): v
                    for k, v in torch.load(f'{save_path}/retriever/{epoch}.pt').items()
                })
            continue

        if distributed:
            dist.barrier()
        logging.info(f'Training epoch {epoch}')
        train_retriever(retriever, optimizer, dataset, pad_idx=1,
                        batch_size=batch_size, epoch=epoch, distributed=distributed)
        if distributed:
            dist.barrier()

        if local_rank == 0:
            ranks = test_retriever(retriever, test_dataset, pad_idx=1,
                                   batch_size=batch_size, epoch=epoch)
            write_file(ranks, f'{save_path}/ranks/{epoch}.txt')
            torch.save(retriever.state_dict(), f'{save_path}/retriever/{epoch}.pt')
        if distributed:
            dist.barrier()
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

    parser = argparse.ArgumentParser()
    parser.add_argument('-q1', type=str)
    parser.add_argument('-q2', type=str)
    parser.add_argument('-d', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-m', type=int)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    args = parser.parse_args()

    query1_path = args.q1
    query2_path = args.q2
    document_path = args.d
    pool_path = args.pool
    max_step = args.m
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    save_path = args.save_path

    logging.info(f'Load query from {query1_path}-{query2_path} and document from {document_path}')
    query1 = read_pkl(query1_path)
    query2 = read_pkl(query2_path)

    if document_path != 'redis':
        document = []
        for i in range(200):
            if os.path.exists(f'{document_path}/{i}.pkl'):
                document.extend(read_pkl(f'{document_path}/{i}.pkl'))
    else:
        document = document_path

    knowledge_pool = read_pkl(pool_path)
    dataset = DuoData(query1, query1, query2, query2, knowledge_pool,
                      pool_size=pool_size, knowledge=document, order=None,
                      max_len=max_len, lang_code=lang_code, curriculum=max_step)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len, lang_code=lang_code)

    logging.info('Build retriever')
    retriever = Retriever()
    if torch.cuda.is_available():
        retriever = retriever.cuda()
    optimizer = AdamW(retriever.parameters(), lr)

    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        retriever.load_state_dict({k.replace("module.", ""): v
                                   for k, v in torch.load(pretrained_path).items()})

    cur_step = 0
    while cur_step < max_step:
        dataset.set_offset(cur_step)
        logging.info(f'Training step {cur_step} / max step {max_step}')
        train_retriever(retriever, optimizer, dataset,
                        pad_idx=1, batch_size=batch_size, step=10)
        cur_step += 10 * batch_size
        ranks = test_retriever(retriever, test_dataset, pad_idx=1,
                               batch_size=batch_size, epoch=0)
        write_file(ranks, f'{save_path}/ranks/{cur_step}.txt')
        torch.save(retriever.state_dict(), f'{save_path}/retriever/{cur_step}.pt')