def __init__(self, index_fp, catalog, term_ids_map, doc_ids_map, doc_len_map,
             decompressor=None, exclude=None, stemmer=None):
    self._index_fp = index_fp
    self._catalog = catalog
    self._term_ids_map = term_ids_map
    self._doc_ids_map = doc_ids_map
    self._doc_len_map = doc_len_map
    # total term frequency over the whole collection
    self._sum_ttf = sum(self._doc_len_map.values())
    self._tokenizer = tokenizer.build_tokenizer(exclude=exclude, stemmer=stemmer)
    self._cached_inverted_list = dict()
    self._decoder = indexer.TextProcessor.decoder(decompressor)
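# Illustrative construction of the reader above (a sketch; the class name
# IndexReader and the zlib decompressor are assumptions, not project names):
#
#   import zlib
#   reader = IndexReader(index_fp, catalog, term_ids_map, doc_ids_map,
#                        doc_len_map, decompressor=zlib.decompress)
#   # _sum_ttf is the collection's total term count, so the average document
#   # length needed by BM25-style scoring would be:
#   avg_doc_len = reader._sum_ttf / len(doc_len_map)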
def main():
    name = sys.argv.pop(0)
    usage = '''{} -trn FILE [-tst FILE]* [-tok FILE]
   -tok FILE : options for tokenizer
   -trn FILE : train file
   -tst FILE : test file

The script needs pyonmttok installed (pip install pyonmttok)
'''.format(name)

    ftok = None
    ftrn = None
    ftsts = []
    while len(sys.argv):
        tok = sys.argv.pop(0)
        if tok == "-tok" and len(sys.argv):
            ftok = sys.argv.pop(0)
        elif tok == "-trn" and len(sys.argv):
            ftrn = sys.argv.pop(0)
        elif tok == "-tst" and len(sys.argv):
            ftsts.append(sys.argv.pop(0))
        elif tok == "-h":
            sys.stderr.write("{}".format(usage))
            sys.exit()
        else:
            sys.stderr.write('error: unparsed {} option\n'.format(tok))
            sys.stderr.write("{}".format(usage))
            sys.exit()

    token = None
    if ftok is not None:
        with open(ftok) as yamlfile:
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
            token = build_tokenizer(opts)

    if ftrn is not None:
        trn = File(ftrn, None, token)
        for ftst in ftsts:
            tst = File(ftst, trn, token)
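# Example invocation (a sketch; the script and file names are illustrative):
#
#   python compare.py -tok tok_options.yml -trn train.txt -tst test1.txt -tst test2.txt
#
# Each -tst file is wrapped in a File built against the training File, which
# is where any train/test comparison would happen.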
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_dataset", type=str, default="data/corpus.small", help="train dataset")
    parser.add_argument("--test_dataset", type=str, default="data/corpus.small", help="test set for evaluation")
    parser.add_argument("--vocab_file", default="gpt2-vocab.json", type=str)
    parser.add_argument("--merges_file", default="gpt2-merges.txt", type=str)
    parser.add_argument("--output_path", default="output/", type=str, help="save path")
    parser.add_argument("--restore_file", default=None, type=str, help="the path for pretrained model")
    parser.add_argument("--seq_len", type=int, default=128, help="maximum sequence length")
    parser.add_argument("--batch_size", type=int, default=8, help="batch size")
    parser.add_argument("--epochs", type=int, default=5, help="number of epochs")
    parser.add_argument("--num_workers", type=int, default=0, help="dataloader worker size")
    parser.add_argument("--lr", type=float, default=3e-4, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.98, help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")
    parser.add_argument("--warmup_steps", type=int, default=1000, help="warmup steps")
    parser.add_argument("--accumulate_gradient_steps", type=int, default=1, help="accumulate gradient steps")
    args = parser.parse_args()

    print("building tokenizer")
    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )

    print("building train dataset")
    train_dataset = GPTDataset(args.train_dataset, tokenizer, args.seq_len)
    print("building test dataset")
    test_dataset = GPTDataset(args.test_dataset, tokenizer, args.seq_len)

    print("building train dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
    print("building test dataloader")
    test_data_loader = DataLoader(test_dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers)

    print("building model")
    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    if args.restore_file is not None:
        model.load_state_dict(flow.load(args.restore_file))
    # tie the LM head to the input embedding
    model.lm_head.weight = model.transformer.wte.weight

    trainer = Trainer(
        model,
        train_dataloader=train_data_loader,
        test_dataloader=test_data_loader,
        epoch=args.epochs,
        lr=args.lr,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
        warmup_steps=args.warmup_steps,
        accumulate_gradient_steps=args.accumulate_gradient_steps,
        output_path=args.output_path,
    )

    print("begin training")
    trainer.train()
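# Example invocation (a sketch; the script name and paths are illustrative,
# the hyper-parameters shown are simply the parser defaults above):
#
#   python train.py \
#       --train_dataset data/corpus.small --test_dataset data/corpus.small \
#       --vocab_file gpt2-vocab.json --merges_file gpt2-merges.txt \
#       --batch_size 8 --epochs 5 --lr 3e-4 --warmup_steps 1000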
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--train_dataset", required=False, type=str, default="data/corpus.small", help="train dataset")
    parser.add_argument("--test_dataset", type=str, default="data/corpus.small", help="test set for evaluation")
    parser.add_argument("--vocab_file", required=False, default="vocab.json", type=str)
    parser.add_argument("--merges_file", required=False, default="merge.txt", type=str)
    parser.add_argument("--output_path", required=False, default="output/model", type=str, help="save path")
    parser.add_argument("--seq_len", type=int, default=128, help="maximum sequence length")
    parser.add_argument("--batch_size", type=int, default=4, help="batch size")
    parser.add_argument("--epochs", type=int, default=50, help="number of epochs")
    parser.add_argument("--num_workers", type=int, default=0, help="dataloader worker size")
    # note: argparse's type=bool treats any non-empty string as True; the flag
    # is kept as-is since the script moves tensors to CUDA unconditionally
    parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false")
    parser.add_argument("--lr", type=float, default=1e-4, help="learning rate of adam")
    parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam second beta value")
    args = parser.parse_args()

    print("building tokenizer")
    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )

    print("building train dataset")
    train_dataset = GPTDataset(args.train_dataset, tokenizer, args.seq_len)

    print("building train dataloader")
    train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)

    # grab a single fixed batch so both frameworks train on identical data
    for i, b in enumerate(train_data_loader):
        if i == 2:
            batch = b
            break
    of_batch = batch.cuda()

    print("building model")
    config = GPT2Config()
    pt_batch = torch.from_numpy(batch.numpy()).long().cuda()
    model = pt_GPT2LMHeadModel(config)
    model.load_state_dict(torch.load("gpt2_model.pt"))
    model.lm_head.weight = model.transformer.wte.weight
    model.cuda()
    model.eval()  # disable dropout so the two frameworks see the same computation

    pt_optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=0.0001,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
    )

    for_time = 0.0
    bp_time = 0.0
    update_time = 0.0
    pt_loss = list()
    loss = None

    print("start pytorch training loop....")
    start_t = time.time()
    for epoch in range(args.epochs):
        s_t = time.time()
        loss = model(pt_batch, labels=pt_batch)[0]
        for_time += time.time() - s_t
        pt_loss.append(loss.item())

        s_t = time.time()
        loss.backward()
        bp_time += time.time() - s_t

        s_t = time.time()
        pt_optimizer.step()
        pt_optimizer.zero_grad()
        update_time += time.time() - s_t
    end_t = time.time()

    print("pytorch training loop avg time : {}".format((end_t - start_t) / args.epochs))
    print("forward avg time : {}".format(for_time / args.epochs))
    print("backward avg time : {}".format(bp_time / args.epochs))
    print("update parameters avg time : {}".format(update_time / args.epochs))

    pt_parameters_names = []
    pt_parameters_value = []
    for name, param in model.named_parameters():
        pt_parameters_names.append(name)
        pt_parameters_value.append(param.cpu().detach().numpy())

    model = GPT2LMHeadModel(config)
    model.load_state_dict(flow.load("gpt2_oneflow_model"))
    model.lm_head.weight = model.transformer.wte.weight
    model.cuda()
    model.eval()

    optimizer = flow.optim.AdamW(
        model.parameters(),
        lr=0.0001,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
    )

    for_time = 0.0
    bp_time = 0.0
    update_time = 0.0
    of_loss = list()

    print("start oneflow training loop....")
    start_t = time.time()
    for epoch in range(args.epochs):
        s_t = time.time()
        loss = model(of_batch, labels=of_batch)[0]
        for_time += time.time() - s_t
        of_loss.append(loss.numpy())

        s_t = time.time()
        loss.backward()
        bp_time += time.time() - s_t

        s_t = time.time()
        optimizer.step()
        optimizer.zero_grad()
        update_time += time.time() - s_t
    end_t = time.time()

    print("oneflow training loop avg time : {}".format((end_t - start_t) / args.epochs))
    print("forward avg time : {}".format(for_time / args.epochs))
    print("backward avg time : {}".format(bp_time / args.epochs))
    print("update parameters avg time : {}".format(update_time / args.epochs))

    for i in range(args.epochs):
        print(i, of_loss[i], pt_loss[i])

    import matplotlib.pyplot as plt
    plt.switch_backend("agg")
    epochs = np.arange(1, args.epochs + 1)
    plt.plot(epochs, of_loss, label="oneflow")
    plt.plot(epochs, pt_loss, label="pytorch")
    plt.legend()
    plt.savefig("./1.jpg")
    plt.show()
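# Quick numeric check of the two loss curves (a sketch to run after both
# loops; assumes of_loss entries are 0-d arrays, as produced by loss.numpy()):
#
#   gaps = [abs(float(a) - float(b)) for a, b in zip(of_loss, pt_loss)]
#   print("max |oneflow - pytorch| loss gap:", max(gaps))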
def __init__(self, filepath, voc_src, tok_src, voc_tgt, tok_tgt, seq_size, max_sents, do_shuffle):
    if filepath is None:
        return
    self.voc_src = voc_src
    self.voc_tgt = voc_tgt
    self.files = filepath.split(",")
    self.seq_size = seq_size
    self.max_sents = max_sents
    self.do_shuffle = do_shuffle
    self.annotated = False
    self.data = []
    ### length of the data set to be used (not necessarily the whole set)
    self.length = 0

    src_tokenizer = None
    tgt_tokenizer = None
    if tok_src:
        src_tokenizer = build_tokenizer(tok_src)
    if tok_tgt:
        tgt_tokenizer = build_tokenizer(tok_tgt)

    # file handlers (text mode, so columns split/join as str)
    fhs = []
    for file in self.files:
        if file.endswith('.gz'):
            fhs.append(gzip.open(file, 'rt'))
        else:
            fhs.append(open(file, 'r'))

    firstline = True
    count_column = None
    idx = 0
    for line in fhs[0]:
        idx += 1
        if len(fhs) > 1:
            # read one line from each of the parallel files
            lsplit = [line.strip()]
            for fh in fhs[1:]:
                lsplit.append(fh.readline().strip())
        else:
            # or read all columns from one single tab-separated file
            lsplit = line.split('\t')
        if firstline:
            assert len(lsplit) >= 2 and len(lsplit) <= 4, "invalid column count in {}".format(filepath)
            count_column = len(lsplit)
            if len(lsplit) == 4:
                self.annotated = True
            firstline = False
        else:
            assert len(lsplit) == count_column, "invalid column count in {}, line {}".format(filepath, idx)
        if src_tokenizer:
            tokens, _ = src_tokenizer.tokenize(str(lsplit[0]))
            lsplit[0] = " ".join(tokens)
        if tgt_tokenizer:
            tokens, _ = tgt_tokenizer.tokenize(str(lsplit[1]))
            lsplit[1] = " ".join(tokens)
        self.data.append("\t".join(lsplit))
        self.length += 1

    if self.max_sents > 0:
        self.length = min(self.length, self.max_sents)
    sys.stderr.write('({} contains {} examples)\n'.format(filepath, len(self.data)))
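# Illustrative construction (a sketch; file names, vocabularies, and the
# tokenizer option dicts are assumptions):
#
#   data = Dataset("train.src,train.tgt", voc_src, tok_src_opts,
#                  voc_tgt, tok_tgt_opts, seq_size=100, max_sents=0, do_shuffle=True)
#   # two comma-separated files are read in parallel, one column per file;
#   # a single file is instead split on tabs (2 to 4 columns, 4 = annotated)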
def main():
    name = sys.argv.pop(0)
    usage = '''{} [-data FILE] ( -save FILE | -load FILE )
   -tok FILE  : options for tokenizer
   -data FILE : file used to learn/inference
   -save FILE : save tfidf model after building it with data file (LEARNING)
   -load FILE : load tfidf model and use it for inference on data file (INFERENCE)

The script needs pyonmttok installed (pip install pyonmttok)
'''.format(name)

    ftok = None
    fsave = None
    fload = None
    fdata = []
    while len(sys.argv):
        tok = sys.argv.pop(0)
        if tok == "-tok" and len(sys.argv):
            ftok = sys.argv.pop(0)
        elif tok == "-save" and len(sys.argv):
            fsave = sys.argv.pop(0)
        elif tok == "-load" and len(sys.argv):
            fload = sys.argv.pop(0)
        elif tok == "-data" and len(sys.argv):
            fdata.append(sys.argv.pop(0))
        elif tok == "-h":
            sys.stderr.write("{}".format(usage))
            sys.exit()
        else:
            sys.stderr.write('error: unparsed {} option\n'.format(tok))
            sys.stderr.write("{}".format(usage))
            sys.exit()

    token = None
    if ftok is not None:
        with open(ftok) as yamlfile:
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
            token = build_tokenizer(opts)

    ### learning ###
    if fsave is not None and len(fdata):
        sys.stderr.write('Learning mode\n')
        sentIdf = SentIdf()
        for f in fdata:
            sys.stderr.write('\treading {}\n'.format(f))
            sentIdf.add(f, token)
        sentIdf.save(fsave)
        sys.stderr.write('Model saved in {}\n'.format(fsave))

    ### inference ###
    if fload is not None and len(fdata):
        sys.stderr.write('Inference mode. Model in {}\n'.format(fload))
        sentIdf = SentIdf(fload)
        for file in fdata:
            with open(file) as f:
                for line in f:
                    line = line.strip('\n')
                    if token is not None:
                        toks, _ = token.tokenize(str(line))
                    else:
                        toks = line.split(' ')
                    tfidf = sentIdf.tfidf(toks, use_tf=False)
                    sys.stdout.write(" ".join(toks) + '\n')
                    for i in range(len(toks)):
                        sys.stdout.write("{:.8f}\t{}\n".format(tfidf[i], toks[i]))
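# Minimal sketch of the per-token weighting SentIdf.tfidf is assumed to
# compute (the class's exact smoothing lives elsewhere; doc_freq and n_docs
# are hypothetical stand-ins for its learned statistics):
import math

def tfidf_example(tokens, doc_freq, n_docs, use_tf=True):
    counts = {}
    for t in tokens:
        counts[t] = counts.get(t, 0) + 1
    weights = []
    for t in tokens:
        tf = counts[t] / len(tokens) if use_tf else 1.0       # term frequency in the sentence
        idf = math.log(n_docs / (1.0 + doc_freq.get(t, 0)))   # rarity across documents
        weights.append(tf * idf)
    return weights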
def main():
    name = sys.argv.pop(0)
    usage = '''{} -tok FILE -mod FILE ([-trn STRING]+ | -tst FILE [-snt])
   -tok FILE   : options for tokenizer
   -mod FILE   : tfidf model file (to create/save)
   -tst FILE   : file used for inference
   -trn STRING : file:tag used for the given domain
   -max N      : max vocabulary size (default 0: use all)
   -snt        : compute tfidf values for each sentence rather than the entire tst file

The script needs pyonmttok installed (pip install pyonmttok)
'''.format(name)

    ftok = None
    fmod = None
    vtrn = []
    ftst = None
    max_voc_size = 0
    snt = False  #### compute inference over whole test-set
    while len(sys.argv):
        tok = sys.argv.pop(0)
        if tok == "-tok" and len(sys.argv):
            ftok = sys.argv.pop(0)
        elif tok == "-mod" and len(sys.argv):
            fmod = sys.argv.pop(0)
        elif tok == "-trn" and len(sys.argv):
            vtrn.append(sys.argv.pop(0))
        elif tok == "-tst" and len(sys.argv):
            ftst = sys.argv.pop(0)
        elif tok == "-max" and len(sys.argv):
            max_voc_size = int(sys.argv.pop(0))
        elif tok == "-snt":
            snt = True
        elif tok == "-h":
            sys.stderr.write("{}".format(usage))
            sys.exit()
        else:
            sys.stderr.write('error: unparsed {} option\n'.format(tok))
            sys.stderr.write("{}".format(usage))
            sys.exit()

    token = None
    if ftok is not None:
        with open(ftok) as yamlfile:
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
            token = build_tokenizer(opts)

    tfidf = TfIdf()

    #############################
    ### create/read the model ###
    #############################
    if len(vtrn):
        if os.path.exists(fmod):
            sys.stderr.write('error: the path {} already exists\n'.format(fmod))
            sys.exit()
        tfidf.learn(vtrn, fmod, max_voc_size, token)
        with open(fmod, 'wb') as fout:  # pickle requires binary mode
            pickle.dump(tfidf, fout)
        #tfidf.debug()
        sys.stderr.write('Wrote model (V, D) = {}\n'.format(tfidf.TfIdf.shape))
    else:
        with open(fmod, 'rb') as fin:
            tfidf = pickle.load(fin)
        sys.stderr.write('Read model (V, D) = {}\n'.format(tfidf.TfIdf.shape))

    #################
    ### inference ###
    #################
    if ftst is not None:
        tfidf.inference(ftst, snt, token)

    sys.stderr.write('Done\n')
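# Example invocations (a sketch; the script and file names are illustrative,
# the options are those documented in the usage string above):
#
#   # learn a model from two tagged domains, keeping the 50k most frequent terms
#   python tfidf_domains.py -tok tok.yml -mod model.pkl -max 50000 \
#       -trn news.txt:news -trn law.txt:law
#   # score a test file per sentence against the saved model
#   python tfidf_domains.py -tok tok.yml -mod model.pkl -tst input.txt -snt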
        sys.stderr.write('error: -trn and/or -tst options must be set\n')
        sys.stderr.write("{}".format(usage))
        sys.exit()

    sys.stderr.write('Nbest : {}\n'.format(Nbest))
    sys.stderr.write('minNgram : {}\n'.format(minNgram))
    sys.stderr.write('maxNgram : {}\n'.format(maxNgram))
    sys.stderr.write('testSet : {}\n'.format(testSet))
    sys.stderr.write('sortByEDist : {}\n'.format(sortByEDist))
    sys.stderr.write('{} Start\n'.format(str_time()))

    token = None
    if ftok is not None:
        with open(ftok) as yamlfile:
            opts = yaml.load(yamlfile, Loader=yaml.FullLoader)
            token = build_tokenizer(opts)

    sa = None
    if ftrn is not None:
        sa = SuffixArray(ftrn, token)
        if fmod is not None:
            with open(fmod, 'wb') as f:
                pickle.dump(sa, f)

    if ftst is not None:
        if sa is None and fmod is not None:
            with open(fmod, 'rb') as f:
                sa = pickle.load(f)
            sys.stderr.write('{} Read model from: {}\n'.format(str_time(), fmod))
        sa.queryfile(ftst, token, minNgram, maxNgram, Nbest, sortByEDist, testSet)

    sys.stderr.write('{} End\n'.format(str_time()))
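# Minimal sketch of the suffix-array lookup SuffixArray is assumed to build on:
# sort every suffix of the token stream so any n-gram can be found by binary
# search (the project's class and its queryfile logic are defined elsewhere).
def build_suffix_array(tokens):
    # O(n^2 log n) toy construction; fine for an illustration
    return sorted(range(len(tokens)), key=lambda i: tokens[i:])

def find_ngram(tokens, sa, ngram):
    n = len(ngram)
    lo, hi = 0, len(sa)
    while lo < hi:  # leftmost suffix whose first n tokens are >= ngram
        mid = (lo + hi) // 2
        if tokens[sa[mid]:sa[mid] + n] < ngram:
            lo = mid + 1
        else:
            hi = mid
    matches = []
    while lo < len(sa) and tokens[sa[lo]:sa[lo] + n] == ngram:
        matches.append(sa[lo])
        lo += 1
    return matches

# e.g. toks = "the cat sat on the mat".split()
#      find_ngram(toks, build_suffix_array(toks), ["the"])  ->  [0, 4]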
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--vocab_file", default="gpt2-vocab.json", type=str)
    parser.add_argument("--merges_file", default="gpt2-merges.txt", type=str)
    parser.add_argument("--restore_file", default="gpt2_oneflow_model", type=str, help="Path to pre-trained model")
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=1)
    parser.add_argument("--top_p", type=float, default=0.9)
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
    args = parser.parse_args()

    args.device = flow.device("cuda" if not args.no_cuda else "cpu")
    set_seed(args)

    tokenizer = build_tokenizer(
        vocab_file=args.vocab_file,
        merges_file=args.merges_file,
        tokenizer_type="GPT2BPETokenizer",
    )

    config = GPT2Config()
    model = GPT2LMHeadModel(config)
    if args.restore_file is not None:
        model.load_state_dict(flow.load(args.restore_file))
    model.lm_head.weight = model.transformer.wte.weight
    model.to(args.device)
    model.eval()

    if args.length < 0 and config.max_position_embeddings > 0:
        args.length = config.max_position_embeddings
    elif 0 < config.max_position_embeddings < args.length:
        args.length = config.max_position_embeddings  # no generation bigger than model size
    elif args.length < 0:
        args.length = MAX_LENGTH  # avoid infinite loop

    print(args)
    while True:
        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
        context_tokens = tokenizer.tokenize(raw_text)
        out = sample_sequence(
            model=model,
            context=context_tokens,
            length=args.length,
            temperature=args.temperature,
            top_k=args.top_k,
            top_p=args.top_p,
            device=args.device,
        )
        out = out[0, len(context_tokens):].tolist()
        text = tokenizer.detokenize(out)
        print(text)
        if args.prompt:
            break
    return text
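# Sketch of the top-k / top-p filtering that sample_sequence is assumed to
# apply at each step (standard nucleus sampling; the project's actual helper
# and its tensor types may differ, this version works on a NumPy logits vector):
import numpy as np

def filter_logits(logits, top_k=0, top_p=1.0):
    logits = np.asarray(logits, dtype=np.float64).copy()
    if top_k > 0:
        kth = np.sort(logits)[-top_k]            # k-th largest logit
        logits[logits < kth] = -np.inf           # drop everything below it
    if top_p < 1.0:
        order = np.argsort(-logits)              # tokens by decreasing logit
        probs = np.exp(logits[order] - logits[order][0])
        probs /= probs.sum()
        keep = np.searchsorted(np.cumsum(probs), top_p) + 1  # smallest nucleus
        logits[order[keep:]] = -np.inf
    return logits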