def __init__(self, lang, pool='pool', max_len=64, lang_code=250004):
    self.context = read_pkl(f'dataset/ckgc/{lang}/context.pkl')
    self.response = read_pkl(f'dataset/ckgc/{lang}/response.pkl')
    self.knowledge = read_pkl(f'dataset/ckgc/{lang}/knowledge.pkl')
    # Each pool line is a bracketed id list such as "[3, 17, 42]"; strip the
    # brackets and parse the comma-separated ints.
    self.pool = [[int(item) for item in line[1:-1].split(',')]
                 for line in read_file(pool)]
    self.max_len = max_len
    self.lang_code = lang_code
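# A minimal usage sketch (hypothetical ids; assumes this is the __init__ of
# CKGCTestData, the class the training scripts instantiate with exactly these
# parameters): each line of pool.txt must look like "[3, 17, 42]" for the
# line[1:-1].split(',') parse above to work.
#
#   dataset = CKGCTestData('en', pool='dataset/ckgc/en/pool.txt',
#                          max_len=64, lang_code=250004)
#   print(dataset.pool[0])  # -> e.g. [3, 17, 42]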
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    parser = argparse.ArgumentParser()
    parser.add_argument('-dialog', type=str)
    parser.add_argument('-dialog2', type=str)
    parser.add_argument('-k', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-m', type=int)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    args = parser.parse_args()
    dialog_path = args.dialog
    dialog2_path = args.dialog2
    knowledge_path = args.k
    pool_path = args.pool
    max_step = args.m
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    save_path = args.save_path
    language = args.language
    if knowledge_path != 'redis':
        # Knowledge is sharded into numbered pickles; gather every shard that exists.
        knowledge = []
        for i in range(200):
            if os.path.exists(f'{knowledge_path}/{i}.pkl'):
                knowledge.extend(read_pkl(f'{knowledge_path}/{i}.pkl'))
    else:
        knowledge = knowledge_path
    knowledge_pool = read_pkl(pool_path)
    dataset = DuoData(read_pkl(f'{dialog_path}/context.pkl'),
                      read_pkl(f'{dialog_path}/response.pkl'),
                      read_pkl(f'{dialog2_path}/context.pkl'),
                      # The original read context.pkl twice here; response.pkl
                      # is the presumed intent, mirroring the first pair.
                      read_pkl(f'{dialog2_path}/response.pkl'),
                      knowledge_pool,
                      pool_size=pool_size,
                      knowledge=knowledge,
                      order=None,
                      max_len=max_len,
                      lang_code=lang_code,
                      curriculum=max_step)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len,
                                lang_code=lang_code)
    tokenizer = get_tokenizer('mbart')
    tokenizer.lang_code_to_id = mbart_lang_to_id
    logging.info('Build generator')
    generator = Generator()
    if torch.cuda.is_available():
        generator = generator.cuda()
    optimizer = AdamW(generator.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        # Strip the "module." prefix left by DistributedDataParallel checkpoints.
        generator.load_state_dict({
            k.replace('module.', ''): v
            for k, v in torch.load(pretrained_path).items()
        })
    cur_step = 0
    while cur_step < max_step:
        dataset.set_offset(cur_step)
        logging.info(f'Training step {cur_step} / max step {max_step}')
        # train_generator(generator, optimizer, dataset,
        #                 pad_idx=1, batch_size=batch_size, step=10)
        cur_step += 10 * batch_size
        predict, true = test_generator(generator, test_dataset, language,
                                       tokenizer, pad_idx=1,
                                       batch_size=batch_size, epoch=0,
                                       word_mask=None)
        logging.info(eval_all(predict, true))
        write_file(predict, f'{save_path}/predict/{cur_step}.txt')
        torch.save(generator.state_dict(), f'{save_path}/generator/{cur_step}.pt')
#!/usr/bin/env python3
import numpy as np

from utils.io import read_pkl, save_pkl

if __name__ == '__main__':
    loc = read_pkl('tmp/location.pkl')
    user_miss = read_pkl('tmp/user_miss_pair.pkl')
    user_miss_loc = {}
    with open('raw/checkins_missing.txt', 'r') as f:
        for line in f:
            user, checkins = line.rstrip('\n').split(':')
            checkins = checkins.split(',')
            # Check-ins arrive as a flat (timestamp, place) sequence.
            checkins = [(int(checkins[i]), checkins[i + 1])
                        for i in range(0, len(checkins), 2)]
            # checkins = [el for el in checkins if el[1] == '?' or loc[el[1]]['country'] == 'US']
            for i, checkin in enumerate(checkins):
                if checkin[1] != '?':
                    continue
                if user not in user_miss_loc:
                    user_miss_loc[user] = []
                # For a missing place, record the coordinates of the known
                # check-in just before it. The snippet breaks off mid-tuple;
                # the matching longitude is the presumed completion.
                if i != 0 and checkins[i - 1][1] != '?':
                    user_miss_loc[user].append(
                        (loc[checkins[i - 1][1]]['lat'],
                         loc[checkins[i - 1][1]]['lon']))
    # Presumed final step (save_pkl is imported, and tmp/user_miss_loc.pkl is
    # read by the evaluation script): persist the neighbour coordinates.
    save_pkl('tmp/user_miss_loc.pkl', user_miss_loc)
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    parser = argparse.ArgumentParser()
    parser.add_argument('-dialog', type=str)
    parser.add_argument('-k', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=1)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    parser.add_argument('--dist', type=int, default=1)
    args = parser.parse_args()
    dialog_path = args.dialog
    knowledge_path = args.k
    pool_path = args.pool
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    distributed = args.dist
    save_path = args.save_path
    language = args.language
    if distributed:
        dist_init()
    local_rank = dist.get_rank() if distributed else 0
    if knowledge_path != 'redis':
        # Knowledge is sharded into numbered pickles; gather every shard that exists.
        knowledge = []
        for i in range(200):
            if os.path.exists(f'{knowledge_path}/{i}.pkl'):
                knowledge.extend(read_pkl(f'{knowledge_path}/{i}.pkl'))
    else:
        knowledge = knowledge_path
    knowledge_pool = read_pkl(pool_path)
    dataset = Data(read_pkl(f'{dialog_path}/context.pkl'),
                   read_pkl(f'{dialog_path}/response.pkl'),
                   knowledge_pool,
                   pool_size=pool_size,
                   knowledge=knowledge,
                   order=None,
                   max_len=max_len,
                   lang_code=lang_code)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len,
                                lang_code=lang_code)
    tokenizer = get_tokenizer('mbart')
    tokenizer.lang_code_to_id = mbart_lang_to_id
    logging.info('Build generator')
    generator = Generator()
    if torch.cuda.is_available():
        generator = generator.cuda()
    if distributed:
        generator = torch.nn.parallel.DistributedDataParallel(
            generator,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)
    optimizer = AdamW(generator.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        if distributed:
            dist.barrier()
            # Map the rank-0 checkpoint onto this process's device.
            map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
            generator.load_state_dict(
                torch.load(pretrained_path, map_location=map_location))
            dist.barrier()
        else:
            generator.load_state_dict({
                k.replace('module.', ''): v
                for k, v in torch.load(pretrained_path).items()
            })
    for epoch in range(100):
        # Resume: if this epoch's checkpoint already exists, load it and skip ahead.
        if os.path.exists(f'{save_path}/generator/{epoch}.pt'):
            if distributed:
                dist.barrier()
                map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
                generator.load_state_dict(
                    torch.load(f'{save_path}/generator/{epoch}.pt',
                               map_location=map_location))
                dist.barrier()
            else:
                # The original loaded save_path + f'_{epoch}.pt' here, which
                # never matches the path saved below; loading the checkpoint
                # the guard just found is the presumed intent.
                generator.load_state_dict({
                    k.replace('module.', ''): v
                    for k, v in torch.load(
                        f'{save_path}/generator/{epoch}.pt').items()
                })
            continue
        if distributed:
            dist.barrier()
        logging.info(f'Training epoch {epoch}')
        train_generator(generator,
                        optimizer,
                        dataset,
                        pad_idx=1,
                        batch_size=batch_size,
                        epoch=epoch,
                        distributed=distributed)
        if distributed:
            dist.barrier()
        # Only rank 0 evaluates and writes checkpoints.
        if local_rank == 0:
            predict, true = test_generator(generator,
                                           test_dataset,
                                           language,
                                           tokenizer,
                                           pad_idx=1,
                                           batch_size=batch_size,
                                           epoch=epoch,
                                           word_mask=None)
            logging.info(eval_all(predict, true))
            write_file(predict, f'{save_path}/predict/{epoch}.txt')
            torch.save(generator.state_dict(), f'{save_path}/generator/{epoch}.pt')
        if distributed:
            dist.barrier()
#!/usr/bin/env python3
import numpy as np
from sklearn.preprocessing import normalize

from utils.io import read_pkl, save_pkl

if __name__ == '__main__':
    user_checkins = read_pkl('tmp/user_checkins.pkl')
    loc_db = read_pkl('tmp/location.pkl')
    nodes = read_pkl('tmp/nodes.pkl')
    node_features = read_pkl('tmp/features.pkl')
    for i, node in enumerate(nodes):
        # Only user-miss nodes (names ending in '?') get group features.
        if node[-1] != '?':
            continue
        # Skip nodes whose group slice (columns 24 onward) is already filled.
        if np.sum(node_features[i][24:]) > 0:
            continue
        user = node[:-2]
        group_features = np.zeros((6, 1))
        for checkin in user_checkins[user]:
            if checkin in loc_db:
                g = loc_db[checkin]['group']
                group_features[g][0] += 1
        # L2-normalize the 6-bin group histogram.
        group_features = normalize(group_features, axis=0)
        for j in range(6):
            node_features[i][j + 24] = group_features[j][0]
    # Presumed final step (save_pkl is imported but the snippet ends here):
    # write the augmented feature matrix back.
    save_pkl('tmp/features.pkl', node_features)
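# For reference, sklearn's normalize on a (6, 1) column with axis=0 divides by
# that column's L2 norm (hypothetical numbers, just to illustrate the call above):
#
#   from sklearn.preprocessing import normalize
#   v = np.array([[3.0], [4.0], [0.0], [0.0], [0.0], [0.0]])
#   normalize(v, axis=0)  # -> [[0.6], [0.8], [0.], [0.], [0.], [0.]]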
#!/usr/bin/env python3
import numpy as np
from sklearn.cluster import KMeans

from utils.io import read_pkl, distance

if __name__ == '__main__':
    loc_db = read_pkl('tmp/location.pkl')
    loc_in_checkins = {}
    with open('raw/checkins_missing.txt', 'r') as f:
        for line in f:
            user, checkins = line.rstrip('\n').split(':')
            checkins = checkins.split(',')
            checkins = [(int(checkins[i]), checkins[i + 1])
                        for i in range(0, len(checkins), 2)]
            checkins = [
                el for el in checkins
                if el[1] != '?' and loc_db[el[1]]['country'] == 'US'
            ]
            for checkin in checkins:
                if checkin[1] not in loc_in_checkins:
                    loc_in_checkins[checkin[1]] = [
                        loc_db[checkin[1]]['lat'],
                        loc_db[checkin[1]]['lon']
                    ]
    candidate = read_pkl('tmp/candidate.pkl')
def main():
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', type=str)
    parser.add_argument('-o', type=str)
    parser.add_argument('-t', type=str)
    parser.add_argument('-m', type=str)
    parser.add_argument('--p', type=int, default=1)
    parser.add_argument('--redis', type=int, default=1)
    args = parser.parse_args()
    input_path = args.i
    output_path = args.o
    task = args.t
    processes = args.p
    method = args.m
    use_redis = args.redis
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    logging.info(f'Tokenize data, processes={processes}')
    if task == 'wiki':
        # Tokenize the wiki dump in parallel, one slice of files per worker.
        pool = multiprocessing.Pool(processes=processes)
        results = []
        file_num = len(all_file(input_path))
        step = file_num // processes
        for i in range(0, file_num, step):
            results.append(
                pool.apply_async(tokenize_wiki,
                                 (i, step, method, input_path, output_path)))
        pool.close()
        pool.join()
        if use_redis:
            import redis
            logging.info('Now build redis')
            data = []
            for i in range(1000):
                if os.path.exists(f'{output_path}/{i}.pkl'):
                    batch = read_pkl(f'{output_path}/{i}.pkl')
                    data.extend(batch)
            r = redis.StrictRedis(host='localhost', port=6379, db=0)
            pipe = r.pipeline()
            # Flush the pipeline roughly every 10% of the data.
            step = len(data) // 10
            for j, line in enumerate(data):
                key = str(j)
                value = pickle.dumps(line)
                pipe.set(key, value)
                if j % step == 0 and j != 0:
                    # print(j / len(data), 'execute')
                    pipe.execute()
            pipe.execute()
            # print('final execute done')
            # print('DONE!')
    else:
        context = [
            line[:-1].lower()
            for line in open(f'{input_path}/context.txt', encoding='utf-8')
        ]
        context_ids = do_multiprocessing(tokenize, context, processes)
        write_pkl(context_ids, f'{output_path}/context.pkl')
        response = [
            line[:-1].lower()
            for line in open(f'{input_path}/response.txt', encoding='utf-8')
        ]
        response_ids = do_multiprocessing(tokenize, response, processes)
        write_pkl(response_ids, f'{output_path}/response.pkl')
        if os.path.exists(f'{input_path}/knowledge.txt'):
            knowledge = [
                line[:-1].lower()
                for line in open(f'{input_path}/knowledge.txt', encoding='utf-8')
            ]
            knowledge_ids = do_multiprocessing(tokenize, knowledge, processes)
            write_pkl(knowledge_ids, f'{output_path}/knowledge.pkl')
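# Read-back counterpart for the redis layout built above (a minimal sketch;
# assumes the same localhost:6379 db=0 instance): entries are pickled lines
# keyed by their stringified index.
#
#   import pickle
#   import redis
#   r = redis.StrictRedis(host='localhost', port=6379, db=0)
#   first_entry = pickle.loads(r.get('0'))  # the first tokenized entry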
def get_test_mask():
    u_m_pair = read_pkl('tmp/user_miss_pair.pkl')
    nodes = read_pkl('tmp/nodes.pkl')
    return [nodes.index(el) for el in u_m_pair]
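# nodes.index(el) is an O(len(nodes)) scan per pair, so get_test_mask is
# quadratic overall. A minimal linear-time sketch (hypothetical helper, not
# part of the original code; same read_pkl inputs):
def get_test_mask_fast():
    u_m_pair = read_pkl('tmp/user_miss_pair.pkl')
    nodes = read_pkl('tmp/nodes.pkl')
    index = {node: i for i, node in enumerate(nodes)}  # one pass builds the lookup
    return [index[el] for el in u_m_pair]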
def get_test_mask():
    u_m_pair = read_pkl('tmp/user_miss_pair.pkl')
    nodes = read_pkl('tmp/nodes.pkl')
    return [nodes.index(el) for el in u_m_pair]


if __name__ == '__main__':
    root_path = sys.argv[1]
    if not os.path.isdir(root_path):
        os.makedirs(root_path)
    node_features = read_pkl('tmp/features.pkl')
    node_labels = read_pkl('tmp/labels.pkl')
    train_mask = read_pkl('tmp/train_mask.pkl')
    adj_matrix = sparse.load_npz('tmp/graph.npz')
    masks = get_k_fold_mask(idx_list=train_mask, folds=5)
    perf = []
    for i in range(5):
        model_path = root_path + '/' + str(i)
        os.makedirs(model_path)
        train_mask = [np.array(masks[j]) for j in range(5) if j != i]
        train_mask = np.concatenate(train_mask, axis=0)
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', type=str)
    parser.add_argument('-d', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    parser.add_argument('--dist', type=int, default=1)
    args = parser.parse_args()
    query_path = args.q
    document_path = args.d
    pool_path = args.pool
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    distributed = args.dist
    save_path = args.save_path
    if distributed:
        dist_init()
    local_rank = dist.get_rank() if distributed else 0
    logging.info(
        f'Load query from {query_path} and document from {document_path}')
    query = read_pkl(query_path)
    if document_path != 'redis':
        # Documents are sharded into numbered pickles; gather every shard that exists.
        document = []
        for i in range(200):
            if os.path.exists(f'{document_path}/{i}.pkl'):
                document.extend(read_pkl(f'{document_path}/{i}.pkl'))
    else:
        document = document_path
    knowledge_pool = read_pkl(pool_path)
    # The query fills both text slots of Data for retriever training.
    dataset = Data(query,
                   query,
                   knowledge_pool,
                   pool_size=pool_size,
                   knowledge=document,
                   order=None,
                   max_len=max_len,
                   lang_code=lang_code)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len,
                                lang_code=lang_code)
    logging.info('Build retriever')
    retriever = Retriever()
    if torch.cuda.is_available():
        retriever = retriever.cuda()
    if distributed:
        retriever = torch.nn.parallel.DistributedDataParallel(
            retriever,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)
    optimizer = AdamW(retriever.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        if distributed:
            dist.barrier()
            # Map the rank-0 checkpoint onto this process's device.
            map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
            retriever.load_state_dict(
                torch.load(pretrained_path, map_location=map_location))
            dist.barrier()
        else:
            retriever.load_state_dict({
                k.replace('module.', ''): v
                for k, v in torch.load(pretrained_path).items()
            })
    for epoch in range(100):
        # Resume: if this epoch's checkpoint already exists, load it and skip ahead.
        if os.path.exists(f'{save_path}/retriever/{epoch}.pt'):
            if distributed:
                dist.barrier()
                map_location = {'cuda:%d' % 0: 'cuda:%d' % dist.get_rank()}
                retriever.load_state_dict(
                    torch.load(f'{save_path}/retriever/{epoch}.pt',
                               map_location=map_location))
                dist.barrier()
            else:
                # The original loaded save_path + f'_{epoch}.pt' here, which
                # never matches the path saved below; loading the checkpoint
                # the guard just found is the presumed intent.
                retriever.load_state_dict({
                    k.replace('module.', ''): v
                    for k, v in torch.load(
                        f'{save_path}/retriever/{epoch}.pt').items()
                })
            continue
        if distributed:
            dist.barrier()
        logging.info(f'Training epoch {epoch}')
        train_retriever(retriever,
                        optimizer,
                        dataset,
                        pad_idx=1,
                        batch_size=batch_size,
                        epoch=epoch,
                        distributed=distributed)
        if distributed:
            dist.barrier()
        # Only rank 0 evaluates and writes checkpoints.
        if local_rank == 0:
            ranks = test_retriever(retriever,
                                   test_dataset,
                                   pad_idx=1,
                                   batch_size=batch_size,
                                   epoch=epoch)
            write_file(ranks, f'{save_path}/ranks/{epoch}.txt')
            torch.save(retriever.state_dict(), f'{save_path}/retriever/{epoch}.pt')
        if distributed:
            dist.barrier()
#!/usr/bin/env python3
from pprint import pprint

import numpy as np
from scipy import sparse

from utils.io import read_pkl, save_pkl

loc_db = read_pkl('tmp/location.pkl')
candidate = read_pkl('tmp/candidate.pkl')
nodes = read_pkl('tmp/nodes.pkl')
tag2class = read_pkl('tmp/tag2class.pkl')


def get_train_class_weight():
    counter = {}
    for node in nodes:
        if node[-1] == '?':
            continue
        c = loc_db[node]['tag']
        if c not in counter:
            counter[c] = 0
        counter[c] += 1
    return counter
#!/usr/bin/env python3
import sys
import os

import numpy as np
from sklearn.metrics import accuracy_score

from utils.tfpkg.models import Evaluator
from utils.io import read_pkl
from utils.location import distance

model_path = sys.argv[1]
nodes = read_pkl('tmp/nodes.pkl')
loc_db = read_pkl('tmp/location.pkl')
candidate = read_pkl('tmp/candidate.pkl')
node_features = read_pkl('tmp/features.pkl')
node_labels = read_pkl('tmp/labels.pkl')
user_checkins = read_pkl('tmp/user_checkins.pkl')
user_miss_loc = read_pkl('tmp/user_miss_loc.pkl')
u_m_pair = read_pkl('tmp/user_miss_pair.pkl')
categorical = read_pkl('tmp/categorical.pkl')


def top_k_accuracy(y_true, y_pred, k):
    total = y_true.shape[0]
    p = 0
    top_k_indices = np.argsort(y_pred, axis=1)[:, -k:]
    ground_truth = np.argmax(y_true, axis=1)
    # Presumed completion (the original breaks off here): count samples whose
    # true class appears among the k highest-scoring predictions.
    for i in range(total):
        if ground_truth[i] in top_k_indices[i]:
            p += 1
    return p / total
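# A tiny sanity check for top_k_accuracy (hypothetical values, assuming the
# reconstructed return value above): with k=2 the top-2 index sets are {1, 2}
# and {0, 2}, the true classes are 2 and 1, so one of two samples is a hit.
#
#   y_true = np.array([[0, 0, 1], [0, 1, 0]])
#   y_pred = np.array([[0.1, 0.3, 0.6], [0.5, 0.1, 0.4]])
#   assert top_k_accuracy(y_true, y_pred, k=2) == 0.5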
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')
    parser = argparse.ArgumentParser()
    parser.add_argument('-q1', type=str)
    parser.add_argument('-q2', type=str)
    parser.add_argument('-d', type=str)
    parser.add_argument('-pool', type=str)
    parser.add_argument('-m', type=int)
    parser.add_argument('-save_path', type=str)
    parser.add_argument('--bc_size', type=int, default=32)
    parser.add_argument('--lr', type=float, default=5e-5)
    parser.add_argument('--pool_size', type=int, default=10)
    parser.add_argument('--max_len', type=int, default=64)
    parser.add_argument('--language', type=str, default='en')
    parser.add_argument('--pt_path', type=str, default='none')
    args = parser.parse_args()
    query1_path = args.q1
    query2_path = args.q2
    document_path = args.d
    pool_path = args.pool
    max_step = args.m
    batch_size = args.bc_size
    lr = args.lr
    pool_size = args.pool_size
    max_len = args.max_len
    lang_code = mbart_lang_to_id[args.language]
    save_path = args.save_path
    logging.info(
        f'Load query from {query1_path}-{query2_path} and document from {document_path}')
    query1 = read_pkl(query1_path)
    query2 = read_pkl(query2_path)
    if document_path != 'redis':
        document = []
        for i in range(200):
            if os.path.exists(f'{document_path}/{i}.pkl'):
                document.extend(read_pkl(f'{document_path}/{i}.pkl'))
    else:
        document = document_path
    knowledge_pool = read_pkl(pool_path)
    dataset = DuoData(query1,
                      query1,
                      query2,
                      query2,
                      knowledge_pool,
                      pool_size=pool_size,
                      knowledge=document,
                      order=None,
                      max_len=max_len,
                      lang_code=lang_code,
                      curriculum=max_step)
    test_dataset = CKGCTestData(args.language,
                                pool=f'dataset/ckgc/{args.language}/pool.txt',
                                max_len=max_len,
                                lang_code=lang_code)
    logging.info('Build retriever')
    retriever = Retriever()
    if torch.cuda.is_available():
        retriever = retriever.cuda()
    optimizer = AdamW(retriever.parameters(), lr)
    pretrained_path = args.pt_path
    if os.path.exists(pretrained_path):
        logging.info(f'Load pretrained model from {pretrained_path}')
        retriever.load_state_dict({
            k.replace('module.', ''): v
            for k, v in torch.load(pretrained_path).items()
        })
    cur_step = 0
    while cur_step < max_step:
        dataset.set_offset(cur_step)
        logging.info(f'Training step {cur_step} / max step {max_step}')
        train_retriever(retriever, optimizer, dataset,
                        pad_idx=1, batch_size=batch_size, step=10)
        cur_step += 10 * batch_size
        ranks = test_retriever(retriever, test_dataset, pad_idx=1,
                               batch_size=batch_size, epoch=0)
        write_file(ranks, f'{save_path}/ranks/{cur_step}.txt')
        torch.save(retriever.state_dict(), f'{save_path}/retriever/{cur_step}.pt')
#!/usr/bin/env python3
from pprint import pprint

import numpy as np

from utils.io import read_pkl

loc_db = read_pkl('tmp/location.pkl')
categorical = read_pkl('tmp/categorical.pkl')
labels = read_pkl('tmp/labels.pkl')
nodes = read_pkl('tmp/nodes.pkl')
u_m_pair = read_pkl('tmp/user_miss_pair.pkl')


def get_test_mask():
    return [nodes.index(el) for el in u_m_pair]


if __name__ == '__main__':
    labels = np.argmax(labels, axis=1)
    test_mask = get_test_mask()
    counter = {}
    for place in nodes:
        if place[-1] == '?':
            continue
        c = loc_db[place]['tag']
        # Presumed completion of the truncated tally, mirroring
        # get_train_class_weight elsewhere in the repo.
        if c not in counter:
            counter[c] = 0
        counter[c] += 1
#!/usr/bin/env python3
import numpy as np

from utils.io import read_pkl, save_pkl

if __name__ == '__main__':
    loc_db = read_pkl('tmp/location.pkl')
    tag2class = read_pkl('tmp/tag2class.pkl')
    candidate = {}
    with open('raw/candidate_100_places.txt', 'r') as f:
        lines = f.readlines()
        lines = [el.rstrip('\n') for el in lines]
        for place in lines:
            candidate[place] = loc_db[place]
            tag = candidate[place]['tag']
            label = tag if tag not in tag2class else tag2class[tag]
            loc_db[place]['class'] = label
    with open('raw/checkins_missing.txt', 'r') as f:
        for line in f:
            user, checkins = line.rstrip('\n').split(':')
            checkins = checkins.split(',')
            checkins = [(int(checkins[i]), checkins[i + 1])
                        for i in range(0, len(checkins), 2)]
            for checkin in checkins:
                if checkin[1] == '?':
                    # Presumed continuation: missing places carry no tag, so
                    # skip them (the original snippet breaks off here).
                    continue
#!/usr/bin/env python3
import numpy as np

from utils.io import read_pkl, save_pkl

if __name__ == '__main__':
    loc_db = read_pkl('tmp/location.pkl')
    user_checkins = {}
    c = 0
    with open('raw/checkins_missing.txt', 'r') as f:
        for line in f:
            user, checkins = line.rstrip('\n').split(':')
            checkins = checkins.split(',')
            checkins = [(int(checkins[i]), checkins[i + 1])
                        for i in range(0, len(checkins), 2)]
            checkins = [el for el in checkins if el[1] in loc_db]
            # checkins = [el for el in checkins if el[1] in loc_db and loc_db[el[1]]['country'] == 'US']
            if user not in user_checkins:
                user_checkins[user] = set()
            for checkin in checkins:
                user_checkins[user].add(checkin[1])
    save_pkl('tmp/user_checkins.pkl', user_checkins)