import json
import os

import torch
from tqdm import tqdm


def save_in_steps(json_train, model, split_num=200, start=0, end=1000):
    """Preprocess HotpotQA items in chunks and dump each question graph to JSON."""
    hotpotQA_preprocess_cls = []
    handle_dirs(args.save_dir)
    for index, item in enumerate(tqdm(json_train[start:end])):
        # Skip items that have already been preprocessed and saved.
        if os.path.exists(f"{args.save_dir}/{item['_id']}.json"):
            continue
        i = (index + start + 1,
             gen_nodes_feat(ques_item=construct_graph(item), model=model))
        if i[1] is None:
            print(f"err id: {item['_id']}")
            continue
        hotpotQA_preprocess_cls.append(i[1])
        assert None not in hotpotQA_preprocess_cls
        # Every split_num items, serialize the accumulated graphs and flush them to disk.
        if i[0] % split_num == 0:
            for ques_item in tqdm(hotpotQA_preprocess_cls, desc=f'{i[0]}'):
                ques_item['node_list'] = [node.to_serializable()
                                          for node in ques_item['node_list']]
                ques_item['sp_adj'] = ques_item['sp_adj'].to_serializable()
                if not os.path.exists(args.save_dir):
                    os.mkdir(args.save_dir)
                with open(f"{args.save_dir}/{ques_item['id']}.json", 'w',
                          encoding='utf-8') as fp:
                    json.dump(ques_item, fp)
            hotpotQA_preprocess_cls = []
            torch.cuda.empty_cache()
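# The snippets here all call a handle_dirs helper that is not shown; a minimal
# sketch of what it presumably does (create the target directory if it is
# missing) follows. Treat it as an assumption, not the original implementation.
import os


def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)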
def set_envs(args):
    if not torch.cuda.is_available():
        args.cuda = False
        args.fp16 = False
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        # Parse the visible device list, e.g. "0,1" -> [0, 1].
        args.device_ids = [int(d) for d in os.environ['CUDA_VISIBLE_DEVICES'].split(',')]
    # Distributed (DDP) setup: bind this process to its local GPU.
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    torch.backends.cudnn.benchmark = True
    if not args.device:
        args.device = torch.device("cuda" if torch.cuda.is_available() and args.cuda else "cpu")
    if args.expand_filepaths_to_save_dir:
        args.model_state_file = os.path.join(args.save_dir, args.model_state_file)
    set_seed_everywhere(args.seed, args.cuda)
    handle_dirs(args.save_dir)
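# set_envs also calls a set_seed_everywhere helper that is not shown; a minimal
# sketch, assuming it seeds NumPy and PyTorch (and every CUDA device when
# cuda=True):
import numpy as np
import torch


def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)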
    seed=1337,
    # Runtime options
    catch_keyboard_interrupt=True,
    cuda=False,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
    train=True,  # Flag to train your network
    # If embedding layer is used
    max_len=15,
    vector_type='embedding',   # or 'one_hot'
    embedding_type='train',    # or 'pre-trained'
    embedding_file_name='../input/glove.6B.50d.txt',
    embedding_dim=20
)

# handle dirs
handle_dirs(args.save_dir)

vectorizer_pth = os.path.join(args.save_dir, args.vectorizer_file)

if args.reload_from_files:
    # training from a checkpoint
    print("Loading dataset and vectorizer")
    dataset = SurnameDataset.load_dataset_and_load_vectorizer(args.data_csv,
                                                              vectorizer_pth)
else:
    print("Loading dataset and creating vectorizer")
    # create dataset and vectorizer
    dataset = SurnameDataset.load_dataset_and_make_vectorizer(args.data_csv,
                                                              args.vector_type,
                                                              args.max_len)
    dataset.save_vectorizer(vectorizer_pth)
def set_envs(args):
    if not args.device:
        args.device = torch.device(f"cuda:{args.cuda_id}"
                                   if torch.cuda.is_available() and args.cuda else "cpu")
    args.dev_features_folder = f"dev_feats/{args.model_path.split('/')[-1]}"
    handle_dirs(args.dev_features_folder)
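# Example invocation of this set_envs variant (a hedged sketch; the Namespace
# fields and their values are assumptions chosen purely for illustration):
from argparse import Namespace

example_args = Namespace(device=None, cuda=True, cuda_id=0,
                         model_path='outputs/checkpoint-best')
set_envs(example_args)
print(example_args.device, example_args.dev_features_folder)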