def save_label(train_files, label_file, task=2, firstline=True):
    datasets = Tokenizer.load_file(train_files, firstline=firstline, task=task)
    # data = []
    label_set = set()
    for dataset in datasets:
        for nl, label in dataset:
            # data.append(d)
            label_set.update(set(label.split()))
    # label_set.update([NULL])
    TXT.write(label_set, label_file)
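# A minimal usage sketch for save_label, kept commented out so importing this
# module has no side effects. The paths reuse the Yelp CSVs referenced later in
# this file; the task/firstline values are illustrative, not prescribed here.
# save_label(train_files=["/media/data/classification/datasets/yelp_review_full_csv/train.csv"],
#            label_file="/media/data/classification/datasets/yelp_review_full_csv/labels.txt",
#            task=1, firstline=False)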
def build_data(args):
    if not args.tl:
        if not os.path.exists(args.model_dir):
            os.mkdir(args.model_dir)
        if args.timestamped_subdir:
            sub_folder = datetime.now().isoformat(sep='-', timespec='minutes').replace(":", "-").replace("-", "_")
        else:
            sub_folder = ''
        if not os.path.exists(os.path.join(args.model_dir, sub_folder)):
            os.mkdir(os.path.join(args.model_dir, sub_folder))
        args.model_dir = os.path.join(args.model_dir, sub_folder)
        args.log_file = os.path.join(args.model_dir, args.log_file)
        if args.tokenize_type != "bpe":
            s_paras = [args.wl_th, args.wcutoff]
            t_paras = [args.wl_th, args.wcutoff]
            print("INFO: - Build vocabulary...")
            tokenizer = Tokenizer(s_paras, t_paras)
            files = [args.train_file]
            if args.train_file != args.dev_file:
                files.append(args.dev_file)
            # Load datasets to build the vocabulary
            data = Tokenizer.load_file(files, task=2)
            tokenizer.build(datasets=data)
            sw2i = tokenizer.sw2i
            tw2i = tokenizer.tw2i
            print("INFO: - Save vocabulary...")
            Tokenizer.save(tokenizer, os.path.join(args.model_dir, "tokenizer.vocab"))
        else:
            print("INFO: - Load vocabulary...")
            tokenizer = BPE.load(args.vocab_file)
            tokenizer.add_tokens(sys_tokens)
            sw2i = tokenizer.get_vocab()
            tw2i = tokenizer.get_vocab()
        # args.tokenizer = tokenizer
        # Source-language pretrained embeddings
        args.swd_pretrained = None
        args.twd_pretrained = None
        if len(args.swd_embfile) != 0:
            scale = np.sqrt(3.0 / args.swd_dim)
            emb_reader = Embeddings(args.swd_embfile)
            args.swd_pretrained = emb_reader.get_W(args.swd_dim, sw2i, scale)
            if args.twd_embfile == args.swd_embfile:
                scale = np.sqrt(3.0 / args.twd_dim)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)
        # Target-language pretrained embeddings
        if len(args.twd_embfile) != 0:
            scale = np.sqrt(3.0 / args.twd_dim)
            if args.twd_pretrained is None:
                emb_reader = Embeddings(args.twd_embfile)
                args.twd_pretrained = emb_reader.get_W(args.twd_dim, tw2i, scale)
        # directly integrate transfer learning if no new words need updating
        SaveloadHP.save(args, os.path.join(args.model_dir, args.model_args))
        return args
    else:
        print("INFO: - Use transfer learning technique")
        assert os.path.exists(args.tlargs), "\t - There is no pre-trained argument file"
        # Load the pre-trained argument file from a previous training folder
        margs = SaveloadHP.load(args.tlargs)
        # margs.tl = args.tl
        # margs.log_file = args.log_file
        # TODO: update the new vocab and all other new arguments used for the new training
        # 0. Read vocab
        # 1. Update schema
        # 2. Update vocab
        # args.tokenizer = margs.tokenizer
        # 3. Reuse the model file directory of the previous training run
        args.model_dir = margs.model_dir
        args.seq2seq_file = margs.seq2seq_file
        # 4. Keep the remaining current arguments
        # Add a constraint at loading time: if any model fails to load, just skip it
        args.swd_pretrained = margs.swd_pretrained
        args.twd_pretrained = margs.twd_pretrained
        return args
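# A hedged usage sketch for build_data, commented out to avoid side effects at
# import time. Every value below is hypothetical and only illustrates the fields
# the function reads; the real entry point builds `args` with argparse elsewhere.
# from argparse import Namespace
# args = Namespace(tl=False, tlargs="", model_dir="./runs", timestamped_subdir=True,
#                  log_file="train.log", tokenize_type="bpe", vocab_file="./bpe.vocab",
#                  wl_th=-1, wcutoff=1, train_file="train.csv", dev_file="dev.csv",
#                  swd_embfile="", twd_embfile="", swd_dim=300, twd_dim=300,
#                  model_args="model_args.pkl", seq2seq_file="seq2seq.pt")
# args = build_data(args)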
Data2tensor.set_randseed(12345)
device = torch.device("cpu")
dtype = torch.long
use_cuda = False
filename = "/media/data/classification/datasets/yelp_review_full_csv/train.csv"
label_file = "/media/data/classification/datasets/yelp_review_full_csv/labels.txt"
labels_list = TXT.read(label_file, firstline=False)
lb2id_dict = Tokenizer.list2dict(sys_tokens + labels_list)
id2lb_dict = Tokenizer.reversed_dict(lb2id_dict)
lb2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=lb2id_dict,
                           unk_words=False, sos=False, eos=False)

tokenize_type = "bpe"
if tokenize_type != "bpe":
    # Load datasets to build the vocabulary
    data = Tokenizer.load_file([filename], task=1)
    s_paras = [-1, 1]
    t_paras = [-1, 1]
    tokenizer = Tokenizer(s_paras, t_paras)
    tokenizer.build(data)
    nl2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_nl, vocab_words=tokenizer.sw2i,
                               unk_words=True, sos=False, eos=False)
    tokenizer.tw2i = lb2id_dict
    tokenizer.i2tw = id2lb_dict
    tg2ids = Tokenizer.lst2idx(tokenizer=Tokenizer.process_target, vocab_words=tokenizer.tw2i,
                               unk_words=False, sos=False, eos=False)
    pad_id = tokenizer.sw2i.get(PAD, 0)
    sw_size = len(tokenizer.sw2i)
    tw_size = len(tokenizer.tw2i)
    collate_fn = Tokenizer.collate_fn(pad_id, True)
else: