def main(args):
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    with jsonl.open(args.original_file, gzip=True) as test_file:
        data = test_file.read()
    with jsonl.open(args.out_file, gzip=True) as out_file:
        out_file.write(data[-args.n:])

def main(args):
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    train_split, val_split, test_split = load_splits(args.splits_file)
    summaries = os.listdir(args.summary_dir)
    num_summaries = 0
    train_data, val_data, test_data = [], [], []
    for file_name in tqdm(summaries):
        summary_data = load_summary(os.path.join(args.summary_dir, file_name))
        if len(summary_data["summary"]) == 0 or len(summary_data["text"]) == 0:
            continue
        summary_data["summary"] = encode_line(summary_data["summary"], text_encoder)
        summary_data["text"] = encode_line(summary_data["text"], text_encoder)
        file_id = file_name.split(".")[0]
        if file_id in train_split:
            train_data.append(summary_data)
            num_summaries += 1
        elif file_id in val_split:
            val_data.append(summary_data)
            num_summaries += 1
        elif file_id in test_split:
            test_data.append(summary_data)
            num_summaries += 1
    with jsonl.open(args.train_file, gzip=True) as train_file:
        train_file.write(train_data)
    with jsonl.open(args.val_file, gzip=True) as val_file:
        val_file.write(val_data)
    with jsonl.open(args.test_file, gzip=True) as test_file:
        test_file.write(test_data)
    print("Number of successful conversions: {}".format(num_summaries))

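# `encode_line` is used throughout these conversion scripts but not defined in
# them; a minimal sketch of what it presumably does, assuming TextEncoder.encode
# returns one list of BPE ids per input string:
def encode_line(line, text_encoder):
    return text_encoder.encode([line])[0]
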
def encode(encoder=None):
    if encoder is None:
        ENCODER_PATH = 'model/encoder_bpe_40000.json'
        BPE_PATH = 'model/vocab_40000.bpe'
        encoder = TextEncoder(ENCODER_PATH, BPE_PATH)
    # TextEncoder is not callable; use its encode method.
    tokens = encoder.encode(get_paragraphs(), verbose=False)
    with open('Data/tokens.pkl', 'wb') as pkl:
        pickle.dump(tokens, pkl)

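# Round-trip sketch for the pickle written above (path as in the snippet):
import pickle
with open('Data/tokens.pkl', 'rb') as pkl:
    tokens = pickle.load(pkl)  # one list of BPE ids per paragraph
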
def __init__(self):
    # Initialize the language model, the text encoder, and everything else.
    # Set up the encoder to turn words into indices.
    encoder_path = 'model/encoder_bpe_40000.json'
    bpe_path = 'model/vocab_40000.bpe'
    self.text_encoder = TextEncoder(encoder_path, bpe_path)
    self.nvocab = len(self.text_encoder.encoder)
    nctx = 512  # number of positional embeddings (nctx = context size)
    vocab = self.nvocab + nctx
    # Set up the pretrained OpenAI model.
    args = DEFAULT_CONFIG
    self.lm_model = LMModel(args, vocab, nctx, return_probs=True)
    load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
    self.lm_model.eval()  # eval mode, so we don't do dropout
    # Set up spaCy for POS tagging.
    self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])

def __init__(self, cfg, vocab=40990, n_ctx=512, return_probs=True,
             encoder_path='./model/encoder_bpe_40000.json',
             bpe_path='./model/vocab_40000.bpe'):
    super(CustomLMModel, self).__init__()
    self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
    self.lm_head = LMHead(self.transformer, cfg, trunc_and_reshape=False)
    self.return_probs = return_probs
    self.text_encoder = TextEncoder(encoder_path, bpe_path)
    if self.return_probs:
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        self.register_buffer('pos_emb_mask', pos_emb_mask)

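# Illustrative sketch (not part of the original class) of why pos_emb_mask works:
# the position ids live in the last n_ctx rows of the shared embedding matrix, so
# the LM head also produces logits for them; adding -1e12 before softmax drives
# their probability to ~0. The toy sizes below are assumptions for the demo.
import torch
toy_vocab, toy_n_ctx = 10, 4
logits = torch.randn(1, 1, toy_vocab)
mask = torch.zeros(1, 1, toy_vocab)
mask[:, :, -toy_n_ctx:] = -1e12               # same masking trick as above
probs = torch.softmax(logits + mask, dim=-1)
assert probs[0, 0, -toy_n_ctx:].sum().item() < 1e-6  # positional slots get ~zero mass
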
def transformer_predict(input_file: str, text_encoder: TextEncoder, device: int):
    if device > -1:
        device_name = "cuda"
    else:
        device_name = "cpu"
    print(input_file)
    n_ctx = 512
    transformer = TransformerModel(DEFAULT_CONFIG, n_ctx=n_ctx, requires_grad=False)
    load_openai_pretrained_model(transformer, n_ctx=n_ctx)
    with open(input_file) as f:
        sentences = f.readlines()
    encoded_sentences = text_encoder.encode(sentences)
    # Note: computed but not used below.
    masks = [
        np.concatenate((np.ones(len(s)), np.zeros(n_ctx - len(s))))
        for s in encoded_sentences
    ]
    input_tensor = torch.LongTensor([
        pad_sequence_to_length(s, desired_length=512)
        for s in encoded_sentences
    ])
    if device_name == "cuda":
        input_tensor = input_tensor.cuda()
    batch_size, num_timesteps = input_tensor.size()
    positional_encodings = get_range_vector(num_timesteps, device) + n_ctx
    batch_tensor = torch.stack(
        [input_tensor, positional_encodings.expand(batch_size, num_timesteps)],
        dim=-1)
    if device_name == "cuda":
        transformer = transformer.cuda()
    transformer_embeddings = transformer(batch_tensor)
    np.save("openai_transformer_test_input.npy", batch_tensor.data.cpu().numpy())
    np.save("openai_transformer_test_output.npy",
            transformer_embeddings.data.cpu().numpy())

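# The two .npy files written above look like regression fixtures; a minimal
# consumer sketch (the tolerance is an assumption):
import numpy as np
batch = np.load("openai_transformer_test_input.npy")      # (batch, n_ctx, 2) token + position ids
expected = np.load("openai_transformer_test_output.npy")  # transformer embeddings
# Recompute the embeddings with the same weights, then:
# np.testing.assert_allclose(recomputed, expected, rtol=1e-4)
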
def load_openai_gpt(n_special=1, n_ctx=512):
    text_encoder = TextEncoder(
        "pytorch-openai-transformer-lm/model/encoder_bpe_40000.json",
        "pytorch-openai-transformer-lm/model/vocab_40000.bpe")
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    vocab = n_vocab + n_special + n_ctx
    args = DEFAULT_CONFIG
    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                 path="pytorch-openai-transformer-lm/model/",
                                 path_names="pytorch-openai-transformer-lm/")
    # lm_model.to(device)
    lm_model.return_probs = False
    lm_model.eval()
    return lm_model, text_encoder

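# Hedged usage sketch for load_openai_gpt (the prompt text and scoring logic are
# assumptions; position ids start after the BPE vocab and the n_special slot,
# matching vocab = n_vocab + n_special + n_ctx above):
import numpy as np
import torch

lm_model, text_encoder = load_openai_gpt()
ids = text_encoder.encode(["The cat sat on the"])[0]
n_vocab = len(text_encoder.encoder)
X = np.array(ids)[None, :]
positions = np.arange(n_vocab + 1, n_vocab + 1 + X.shape[1])[None, :]  # n_special == 1
batch = torch.tensor(np.stack([X, positions], axis=-1), dtype=torch.long)
with torch.no_grad():
    logits = lm_model(batch)  # raw logits, since return_probs was reset to False
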
def main(args):
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    num_summaries = 0
    out_data = []
    with jsonl.open(args.in_file, gzip=True) as in_file:
        data = in_file.read()
    for entry in tqdm(data):
        if entry["summary"] is None or entry["text"] is None:
            continue
        entry["summary"] = encode_line(entry["summary"], text_encoder)
        entry["text"] = encode_line(entry["text"], text_encoder)
        num_summaries += 1
        out_data.append(entry)
    with jsonl.open(args.out_file, gzip=True) as out_file:
        out_file.write(out_data)
    print("Number of successful conversions: {}".format(num_summaries))

def encode_dataset(*splits: Tuple[
                       # Each split holds four lists: first_four_sentences,
                       # first_choice, second_choice, and true_choice.
                       Tuple[List[str], List[str], List[str], ndarray],  # train instances, each list of len 1497
                       Tuple[List, List, List, List],                    # val instances, each list of len 374
                       Tuple[List, List, List, List]],                   # test instances, each list of len 1871
                   encoder: TextEncoder):
    encoded_splits = []
    for split in splits:  # loop over the train, val, and test instances
        fields = []
        for field in split:  # a field is one list of str (sentences) or int (true answers)
            if isinstance(field[0], str):  # check the first element to see whether the field holds strings
                # Each str in the field is encoded as a list of ints, so the field
                # becomes List[List[int]]; only sentences are encoded, not the
                # true answers (ints with choice in {0, 1}).
                field = encoder.encode(field)
            fields.append(field)
        encoded_splits.append(fields)
    return encoded_splits

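# Tiny illustration of the branching above (values hypothetical): string fields
# are BPE-encoded into List[List[int]], integer answer fields pass through.
#   sents  = ["A short sentence .", "Another one ."]
#   labels = [0, 1]
#   ((enc_sents, enc_labels),) = encode_dataset((sents, labels), encoder=text_encoder)
#   # enc_sents  -> [[ids...], [ids...]]   (encoded)
#   # enc_labels -> [0, 1]                 (unchanged)
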
def __init__(self, args):
    globals().update(args.__dict__)
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)
    self.text_encoder = TextEncoder(encoder_path)
    self.encoder = self.text_encoder.encoder
    self.n_vocab = len(self.text_encoder.encoder)
    self.n_y = 2
    self.encoder['_start_'] = len(self.encoder)
    self.encoder['_delimiter_'] = len(self.encoder)
    self.encoder['_end_'] = len(self.encoder)
    self.clf_token = self.encoder['_end_']
    self.n_special = 3
    self.n_batch_train = n_batch * n_gpu
    self.n_updates_total = n_iter

def main(args):
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    num_summaries = 0
    out_data = []
    with open(args.src_file) as src_file, open(args.tgt_file) as tgt_file:
        src_lines = src_file.readlines()
        tgt_lines = tgt_file.readlines()
    for i in tqdm(range(len(src_lines))):
        num_summaries += 1
        out_data.append({
            "summary": encode_line(tgt_lines[i].strip(), text_encoder),
            "text": encode_line(src_lines[i].strip(), text_encoder)
        })
    with jsonl.open(args.out_file, gzip=True) as out_file:
        out_file.write(out_data)
    print("Number of successful conversions: {}".format(num_summaries))

def __init__(self, args):
    # globals().update(args.__dict__)
    random.seed(args.seed)
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)
    # self.ps_hosts = ps_hosts.split(',')
    # self.worker_hosts = worker_hosts.split(',')
    # self.logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
    self.text_encoder = TextEncoder(args.vocab_path)
    self.encoder = self.text_encoder.encoder
    self.n_vocab = len(self.text_encoder.encoder)
    self.encoder['_start_'] = len(self.encoder)
    self.encoder['_delimiter_'] = len(self.encoder)
    self.encoder['_end_'] = len(self.encoder)
    self.clf_token = self.encoder['_end_']
    self.n_special = 3
    self.n_batch_train = args.n_batch * args.n_gpu
    self.n_updates_total = args.n_step * 10000
    self.n_ctx = args.n_ctx

def __init__(self, cfg, vocab=40990, n_ctx=512, return_probs=True,
             encoder_path='./model/encoder_bpe_40000.json',
             bpe_path='./model/vocab_40000.bpe'):
    super(CustomLMModel, self).__init__()
    # Transformer block
    self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
    # Language modeling head to convert transformer output to word probabilities
    self.lm_head = LMHead(self.transformer, cfg, trunc_and_reshape=False)
    # Whether the model returns probabilities or raw logits (no softmax)
    self.return_probs = return_probs
    # Text encoder to convert words to indices
    self.text_encoder = TextEncoder(encoder_path, bpe_path)
    if self.return_probs:
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12
        self.register_buffer('pos_emb_mask', pos_emb_mask)

data_dir = args.data_dir
log_dir = args.log_dir
submission_dir = args.submission_dir
test_path = args.test_path
pred_path = args.pred_path
out_path = args.out_path
topic = args.topic

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)
logger = ResultLogger(path=os.path.join(log_dir, '{}log.json'.format(desc)), **args.__dict__)
text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)

print("Encoding dataset...")
dataLoader = DataLoader()
((trX, trY), (vaX, vaY), (teX,)) = encode_dataset(*dataLoader.veracity(data_dir, topic=topic),
                                                  encoder=text_encoder)
encoder['_start_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 2
max_len = n_ctx - 2  # leave room for the _start_ and _classify_ tokens within the n_ctx window

# Constants
submit = args.submit
dataset = args.dataset
n_ctx = args.n_ctx
save_dir = args.save_dir
desc = args.desc
data_dir = args.data_dir  # location of the vocabulary and data files
log_dir = args.log_dir
submission_dir = args.submission_dir

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print("device", device, "n_gpu", n_gpu)

text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)
n_special = 0  # no special tokens are needed for the language modeling task
vocab = n_vocab + n_special + n_ctx  # total embedding rows: BPE vocabulary plus positional slots

lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
lm_model.to(device)
lm_model.eval()
# At this point the pretrained model and its vocabulary are loaded.

text = input('Input some beginning words:')  # prompt to condition generation on
create_dictionary(text_encoder)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', required=True)
    parser.add_argument('-o', '--output_file', required=True)
    parser.add_argument('--n_batch', type=int, default=8)
    parser.add_argument('--skip_preprocess', action='store_true')
    parser.add_argument('--sentence_pair', action='store_true')
    parser.add_argument('--force_delimiter', action='store_true')
    parser.add_argument('--encoder_path', type=str, default='model/encoder_bpe_40000.json')
    parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--mc_dropout_iter', type=int, default=0)
    args = parser.parse_args()

    meta = json.load(open(os.path.join(args.model_dir, 'meta.json'), 'r', encoding='utf8'))
    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    if args.sentence_pair or args.force_delimiter:
        encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 2 + int('_delimiter_' in encoder)
    n_ctx = meta['dh_model']['n_ctx']
    max_len = meta['encoder']['max_len']
    if args.sentence_pair:
        max_len = min(max_len, n_ctx // 2 - 2)

    texts, labels = load_headerless_tsv(args.input_file, sentence_pair=args.sentence_pair)
    ((X, Y),) = encode_dataset(*[(texts, labels)], encoder=text_encoder,
                               skip_preprocess=args.skip_preprocess)
    X, M = transform_classification(X, max_len, encoder['_start_'], clf_token,
                                    n_vocab, n_special, n_ctx, encoder.get('_delimiter_'))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    n_batch_train = args.n_batch * max(n_gpu, 1)

    meta['dh_model']['cfg'] = dotdict(meta['dh_model']['cfg'])
    dh_model = DoubleHeadModel(**meta['dh_model'])
    dh_model.to(device)
    dh_model = torch.nn.DataParallel(dh_model)
    path = os.path.join(args.model_dir, 'best_params')
    if device == torch.device('cpu'):
        map_location = lambda storage, loc: storage
    else:
        map_location = None
    dh_model.load_state_dict(torch.load(path, map_location=map_location))

    prediction_output = predict(X=X, submission_dir=None, filename=None,
                                pred_fn=lambda x: x, label_decoder=None,
                                dh_model=dh_model, n_batch_train=n_batch_train,
                                device=device)
    predictions = np.argmax(prediction_output, axis=1)
    if type(texts) is tuple:
        df = pd.DataFrame({'question': texts[0], 'text': texts[1],
                           'label': labels, 'prediction': predictions})
    else:
        df = pd.DataFrame({'text': texts, 'label': labels, 'prediction': predictions})
    df.to_csv(args.output_file, index=False, sep='\t', header=False,
              columns=['text', 'label', 'prediction'], float_format='%.0f')
    accuracy = accuracy_score(Y, predictions) * 100.
    print('Accuracy: {}%'.format(accuracy))
    basename = os.path.splitext(args.output_file)[0]
    prediction_output_file = basename + '_output.npy'
    np.savetxt(prediction_output_file, prediction_output)
    prediction_probs = np_softmax(prediction_output)
    prediction_probs_file = basename + '_probs.npy'
    np.savetxt(prediction_probs_file, prediction_probs)

    mc_dropout_prediction_output = []
    for _ in tqdm(range(args.mc_dropout_iter)):
        prediction_output = predict(X=X, submission_dir=None, filename=None,
                                    pred_fn=lambda x: x, label_decoder=None,
                                    dh_model=dh_model, n_batch_train=n_batch_train,
                                    device=device, enable_dropout=True)
        mc_dropout_prediction_output.append(prediction_output)
    if mc_dropout_prediction_output:
        mc_dropout_prediction_output = np.asarray(mc_dropout_prediction_output)
        mc_dropout_prediction_probs = np.zeros(mc_dropout_prediction_output.shape)
        for i in range(mc_dropout_prediction_output.shape[0]):
            mc_dropout_prediction_probs[i, ...] = np_softmax(mc_dropout_prediction_output[i, ...])
        transpose_dims = (2, 1, 0)
        mc_dropout_prediction_output = mc_dropout_prediction_output.transpose(transpose_dims)
        mc_dropout_prediction_probs = mc_dropout_prediction_probs.transpose(transpose_dims)
        for i in range(mc_dropout_prediction_output.shape[0]):
            prediction_output_file = '{}_class{}_{}'.format(basename, i, 'output.npy')
            np.savetxt(prediction_output_file, mc_dropout_prediction_output[i, ...])
            prediction_probs_file = '{}_class{}_{}'.format(basename, i, 'probs.npy')
            np.savetxt(prediction_probs_file, mc_dropout_prediction_probs[i, ...])

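# np_softmax is called above but not defined in this snippet; a minimal,
# numerically stable sketch of what it presumably computes:
import numpy as np

def np_softmax(logits, axis=-1):
    z = logits - np.max(logits, axis=axis, keepdims=True)  # guard against overflow
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)
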
class SurprisalAnalyzer:

    def __init__(self):
        # Initialize the language model, the text encoder, and everything else.
        # Set up the encoder to turn words into indices.
        encoder_path = 'model/encoder_bpe_40000.json'
        bpe_path = 'model/vocab_40000.bpe'
        self.text_encoder = TextEncoder(encoder_path, bpe_path)
        self.nvocab = len(self.text_encoder.encoder)
        nctx = 512  # number of positional embeddings (nctx = context size)
        vocab = self.nvocab + nctx
        # Set up the pretrained OpenAI model.
        args = DEFAULT_CONFIG
        self.lm_model = LMModel(args, vocab, nctx, return_probs=True)
        load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
        self.lm_model.eval()  # eval mode, so we don't do dropout
        # Set up spaCy for POS tagging.
        self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])

    def make_batch(self, X):
        X = np.array(X)
        assert X.ndim in [1, 2]
        if X.ndim == 1:
            X = np.expand_dims(X, axis=0)
        # Add positional encodings: a second channel that says which word is where.
        pos_enc = np.arange(self.nvocab, self.nvocab + X.shape[-1])
        pos_enc = np.expand_dims(pos_enc, axis=0)
        batch = np.stack([X, pos_enc], axis=-1)
        batch = torch.tensor(batch, dtype=torch.long)
        return batch

    def _get_continuation_tensor(self, sent_vec):
        """Deals strictly with tensors."""
        sent_batch = self.make_batch(sent_vec)
        sent_res = self.lm_model(sent_batch)
        return sent_res

    def tensor_to_probs(self, tensor):
        """Converts a torch tensor to a clean numpy array of probabilities.
        (Basically just hides some nasty code.)"""
        return tensor[:, -1, :].flatten().detach().numpy()

    def get_continuation_probs(self, sentence):
        sent_vec = self.text_encoder.encode([sentence])
        tensor = self._get_continuation_tensor(sent_vec)
        return self.tensor_to_probs(tensor)

    def _get_continuations(self, sent_res, k=10, verbose=False):
        """Helper for the `get_continuations` wrapper that separates the actual
        processing of the sentence from getting the top continuations. (Private,
        though it is also reached from outside the class, which is awkward.)"""
        probs, decode = sent_res[:, -1, :].topk(k)
        if verbose:
            for p, d in zip(probs.flatten(), decode.flatten()):
                print("\t...%s (%.4f)" % (self.text_encoder.decoder[d.item()], p.item()))
        words = [self.text_encoder.decoder[d.item()] for d in decode.flatten()]
        # Strip off the word-ending tags where present. (If it's not a full
        # continuation, it is unclear what to do.)
        for i in range(len(words)):
            if words[i][-4:] == "</w>":
                words[i] = words[i][:-4]
        probs = probs.flatten().detach().numpy()  # convert probs from tensor to numpy array
        return words, probs

    def get_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: a string that you want to get next words for
        k: how many next words you want to get
        verbose: whether to print the output
        """
        sent_vec = self.text_encoder.encode([sentence])
        sent_res = self._get_continuation_tensor(sent_vec)
        if verbose:
            print(sentence)
        return self._get_continuations(sent_res, k, verbose)

    def _get_pos_continuations(self, sentence, words, probs):
        """Helper for `get_pos_continuations` that takes the lists of words and
        probabilities and computes the most common POS tags, independently of
        processing an individual sentence."""
        # Get the POS of each of the k continuations.
        pos_counter = Counter()
        for word, prob in zip(words, probs):
            sentence_continuation = "{} {}".format(sentence, word)
            encoded = self.nlp(sentence_continuation)
            pos_counter[encoded[-1].pos_] += prob
        # Format the counter's most_common output as two lists: POS tags and probabilities.
        pos_counter_list = list(zip(*pos_counter.most_common()))
        pos_tags = list(pos_counter_list[0])
        pos_tag_probs = np.array(pos_counter_list[1], dtype=np.float32)
        return pos_tags, pos_tag_probs

    def get_pos_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: string you want next parts of speech for
        k: how many top words to analyze
        NOTE: unlike in `get_continuations`, k is NOT how many unique POS tags
        to look at; it is how many words to consider.
        """
        # Get likely next words.
        words, probs = self.get_continuations(sentence, k, verbose=False)
        return self._get_pos_continuations(sentence, words, probs)

    ############################################################################
    # The following functions calculate entropy/surprisal of a SINGLE word
    ############################################################################

    def _get_surprisal(self, distribution, index):
        word_prob = distribution[index]
        return -np.log2(word_prob)

    def get_surprisal(self, sentence, word):
        """Get the -log2 probability of the word following the sentence."""
        all_probs = self.get_continuation_probs(sentence)
        # If the word is not in the vocabulary in full, represent its probability
        # by the probability of the first piece of its encoding (the 0 index).
        word_index = self.text_encoder.encode([word])[0]
        return self._get_surprisal(all_probs, word_index)

    def _get_entropy(self, distribution):
        return -np.sum([p * np.log2(p) if p > 0 else 0 for p in distribution])

    def get_entropy(self, sentence):
        """Find the Shannon entropy of predicting the word following the sentence."""
        all_probs = self.get_continuation_probs(sentence)
        return self._get_entropy(all_probs)

    def get_surprisal_entropy_ratio(self, sentence, word):
        """Get the ratio between surprisal and entropy at the end of the
        sentence for a given word."""
        all_probs = self.get_continuation_probs(sentence)
        word_index = self.text_encoder.encode([word])[0]
        entropy = self._get_entropy(all_probs)
        surprisal = self._get_surprisal(all_probs, word_index)
        return surprisal / entropy

    ####################################################################
    # Same as above but for part of speech
    ####################################################################

    def get_surprisal_pos(self, sentence, pos, k=1000):
        """
        Because the language model is not a POS tagger, we cannot calculate the
        surprisal of the POS directly from a full probability distribution;
        instead we use the degenerate distribution computed from the top k most
        probable POS continuations.

        sentence: the full sentence
        pos: the POS tag we want the surprisal of
        k: how many possible continuations to check
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        pos_index = pos_tags.index(pos)  # assumes the POS we want is in the list somewhere
        return self._get_surprisal(pos_tag_probs, pos_index)

    def get_entropy_pos(self, sentence, k=1000):
        """Same disclaimer about the degenerate distribution as above."""
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        return self._get_entropy(pos_tag_probs)

    #####################################################################
    # Gets all of the above metrics for every word in a single sentence #
    #####################################################################

    def get_surprisal_sentence(self, sentence, prepend=None, start=1):
        """A little uglier, but perhaps faster."""
        surprisals = []
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]
        # Running the language model over the whole sentence gives, at each
        # position, the probabilities for the next word.
        sent_batch = self.make_batch([sent_enc])
        sent_tensor = self.lm_model(sent_batch)
        for i in range(start, len(sent_enc)):
            surprisals.append(-np.log2(sent_tensor[:, i - 1, sent_enc[i]].item()))
        return surprisals, sent_dec

    def get_s_h_shr_sentence(self, sentence, prepend=None, start=1):
        """Calculates the surprisal, entropy, and surprisal-entropy ratio at each
        word (as defined by BPE) in the sentence. Returns, in order:
        1. The list of surprisals (len(sentence) - 1)
        2. The list of entropies (len(sentence) - 1)
        3. The list of ratios between surprisals and entropies (len(sentence) - 1)
        4. The decoded tokens used by the BPE encoder wrapper
        """
        surprisals, entropies, surprisal_entropy_ratios = [], [], []
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]
        # start = max(0, min(1, start)) does not work because the language model
        # needs to condition on something, so start is pinned to 1.
        start = 1
        for i in range(start, len(sent_enc)):
            partial_sent_enc = [sent_enc[:i]]
            cont_tensor = self._get_continuation_tensor(partial_sent_enc)
            partial_probs = self.tensor_to_probs(cont_tensor)
            surprisals.append(self._get_surprisal(partial_probs, sent_enc[i]))
            entropies.append(self._get_entropy(partial_probs))
            surprisal_entropy_ratios.append(surprisals[-1] / entropies[-1])
        return surprisals, entropies, surprisal_entropy_ratios, sent_dec

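# Hedged usage sketch for SurprisalAnalyzer (sentences and outputs illustrative;
# requires the model files under model/ and the spaCy 'en' model):
analyzer = SurprisalAnalyzer()
words, probs = analyzer.get_continuations("The dog chased the", k=5)
pos_tags, pos_probs = analyzer.get_pos_continuations("The dog chased the", k=100)
s = analyzer.get_surprisal("The dog chased the", "cat")
surprisals, entropies, ratios, toks = analyzer.get_s_h_shr_sentence("The dog chased the cat .")
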
def main(args):
    init(args)
    # Constants
    n_ctx = args.n_ctx
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    text_encoder = TextEncoder(args.encoder_path, args.vocab_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    train_loader = get_loader(os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch,
                              encoder, num_workers=3, shuffle=True)
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu,
                            encoder, num_workers=0, shuffle=False,
                            max_size=args.num_val_examples)
    print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))

    vocab = n_vocab + n_special + n_ctx
    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs_dat + args.num_epochs_ft)

    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)
    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr, schedule=args.lr_schedule,
                           warmup=args.lr_warmup, t_total=n_updates_total,
                           b1=args.b1, b2=args.b2, e=args.e,
                           l2=args.l2, vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    lm_loss = LMLoss(criterion)
    summary_loss = SummaryLoss(criterion)

    print("Loading Model")
    if args.use_pretrain:
        load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx,
                                     n_special=n_special, path="./model/", path_names="./")
    start_iter, running_loss = load_checkpoint(args.checkpoint, dh_model, model_opt, vocab, n_ctx)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    lm_loss = DataParallelCriterion(lm_loss)
    summary_loss = DataParallelCriterion(summary_loss)

    for i in range(args.num_epochs_dat):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, lm_loss,
                                             model_opt, train_loader, val_loader,
                                             train_log_interval, val_log_interval, device,
                                             beam, gen_len, k, decoding_strategy, accum_iter,
                                             "DAT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_dat),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress,
                                             summary_loss=summary_loss)
    for i in range(args.num_epochs_ft):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, summary_loss,
                                             model_opt, train_loader, val_loader,
                                             train_log_interval, val_log_interval, device,
                                             beam, gen_len, k, decoding_strategy, accum_iter,
                                             "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_ft),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress)

N_EMBD = 768
N_HEAD = 12
N_LAYER = 12
EMBD_PDROP = 0.1
ATTN_PDROP = 0.1
RESID_PDROP = 0.1
AFN = 'gelu'
ENCODER_PATH = 'model/encoder_bpe_40000.json'
BPE_PATH = 'model/vocab_40000.bpe'
N_TRANSFER = 12

random.seed(SEED)
np.random.seed(SEED)
tf.set_random_seed(SEED)

TEXT_ENCODER = TextEncoder(ENCODER_PATH, BPE_PATH)
ENCODER = TEXT_ENCODER.encoder
N_VOCAB = len(TEXT_ENCODER.encoder)

# parser.add_argument('--n_batch', type=int, default=8)
# parser.add_argument('--n_gpu', type=int, default=4)
# parser.add_argument('--lm_coef', type=float, default=0.5)


def transform_texts(list_of_texts):
    tokens = TEXT_ENCODER.encode(list_of_texts, verbose=False)
    n_batch = len(tokens)
    xmb = np.zeros((n_batch, N_CTX, 2), dtype=np.int32)
    mmb = np.zeros((n_batch, N_CTX), dtype=np.float32)
    for i, x in enumerate(tokens):
        x1 = x[:N_CTX]

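# transform_texts is cut off above after `x1 = x[:N_CTX]`; a plausible completion,
# modeled on the transform functions in OpenAI's finetune-transformer-lm (an
# assumption, not the original code):
#         l1 = len(x1)
#         xmb[i, :l1, 0] = x1                             # BPE ids in channel 0
#         mmb[i, :l1] = 1.0                               # mask marks real tokens
#     xmb[:, :, 1] = np.arange(N_VOCAB, N_VOCAB + N_CTX)  # position ids in channel 1
#     return xmb, mmb
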
def fever_app(caller):
    global db, tokenizer, text_encoder, encoder, X_train, M_train, X, M, Y_train, Y, params, sess, \
        n_batch_train, db_file, drqa_index, max_page, max_sent, encoder_path, bpe_path, n_ctx, \
        n_batch, model_file
    global n_vocab, n_special, n_y, max_len, clf_token, eval_lm_losses, eval_clf_losses, \
        eval_mgpu_clf_losses, eval_logits, eval_mgpu_logits

    LogHelper.setup()
    logger = LogHelper.get_logger("papelo")

    logger.info("Load config")
    config = json.load(open(os.getenv("CONFIG_FILE", "configs/config-docker.json")))
    globals().update(config)
    print(globals())

    logger.info("Set Seeds")
    random.seed(42)
    np.random.seed(42)
    tf.set_random_seed(42)

    logger.info("Load FEVER DB")
    db = FeverDocDB(db_file)
    retrieval = TopNDocsTopNSents(db, max_page, max_sent, True, False, drqa_index)

    logger.info("Init word tokenizer")
    tokenizer = SimpleWordSplitter()

    # Prepare text encoder
    logger.info("Load BPE Text Encoder")
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    n_y = 3
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2
    n_batch_train = n_batch

    logger.info("Create TF Placeholders")
    X_train = tf.placeholder(tf.int32, [n_batch, 1, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [n_batch, 1, n_ctx])
    X = tf.placeholder(tf.int32, [None, 1, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 1, n_ctx])
    Y_train = tf.placeholder(tf.int32, [n_batch])
    Y = tf.placeholder(tf.int32, [None])

    logger.info("Model Setup")
    eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y, train=False, reuse=None)
    eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train, Y_train)

    logger.info("Create TF Session")
    params = find_trainable_variables('model')
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=float(os.getenv("TF_GPU_MEMORY_FRACTION", "0.5")))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options))
    sess.run(tf.global_variables_initializer())
    sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(model_file))])
    logger.info("Ready")

    def predict(instances):
        predictions = []
        for instance in tqdm(instances):
            sents = retrieval.get_sentences_for_claim(instance["claim"])
            found_evidence = resolve_evidence(sents)
            instance["tokenized_claim"] = " ".join(
                map(lambda x: x.text, tokenizer.split_words(instance["claim"])))
            sub_instances = make_instances(instance, found_evidence)
            sub_predictions = predict_sub_instances(text_encoder, sub_instances)

            refute_evidence = [i for i, x in enumerate(sub_predictions) if x == 2]
            support_evidence = [i for i, x in enumerate(sub_predictions) if x == 0]

            if len(support_evidence):
                predicted_label = "SUPPORTS"
                predicted_evidence = [[found_evidence[i]["title"], found_evidence[i]["line_number"]]
                                      for i in support_evidence]
            elif len(refute_evidence):
                predicted_label = "REFUTES"
                predicted_evidence = [[found_evidence[i]["title"], found_evidence[i]["line_number"]]
                                      for i in refute_evidence]
            else:
                predicted_label = "NOT ENOUGH INFO"
                predicted_evidence = []
            predictions.append({"predicted_label": predicted_label,
                                "predicted_evidence": predicted_evidence})
        return predictions

    return caller(predict)

def main(args):
    # Constants
    n_ctx = args.n_ctx
    desc = args.desc

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    test_loader = get_loader(args.data_file, args.n_batch, encoder,
                             num_workers=1, shuffle=False, subset=args.subset)

    vocab = n_vocab + n_special + n_ctx
    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    print("Loading model...")
    load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx,
                                 n_special=n_special, path="./model/", path_names="./")
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        # Strip the "module." prefix added by DataParallel when the checkpoint was saved.
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12  # suppress positional-embedding logits before softmax
        state_dict['pos_emb_mask'] = pos_emb_mask
        dh_model.load_state_dict(state_dict)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)

    stop_words = []
    if args.stop_words is not None:
        with open(args.stop_words) as f:
            for line in f:
                stop_words.append(line)
    evaluate_model(dh_model, test_loader, text_encoder, device,
                   args.beam, args.gen_len, args.k, args.decoding_strategy,
                   args.save_file, args.gen_dir, args.tgt_dir, args.max_len,
                   stop_words, args)

parser.add_argument('--b1', type=float, default=0.9)
parser.add_argument('--b2', type=float, default=0.999)
parser.add_argument('--e', type=float, default=1e-8)
args = parser.parse_args()
print(args)
globals().update(args.__dict__)  # https://thepythonguru.com/python-builtin-functions/globals/

random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)

logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)  # logging utility
text_encoder = TextEncoder(encoder_path, bpe_path)  # build the BPE encoder
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)

# Build the train, valid, and test sets with the BPE TextEncoder.
(trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(
    rocstories(data_dir), encoder=text_encoder)  # load the train, valid, and test data
n_y = 2
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
max_len = n_ctx // 2 - 2

parser.add_argument('--encoder_path', type=str,
                    default=pretrained_model_path + '/encoder_bpe_40000.json')
parser.add_argument('--bpe_path', type=str,
                    default=pretrained_model_path + '/vocab_40000.bpe')
args = parser.parse_args()
print(args)

# Constants
n_ctx = args.n_ctx
text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
# encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)

tokens_regular = n_vocab
token_start = text_encoder.encoder['_start_'] = len(text_encoder.encoder)      # last id (increments)
token_delim = text_encoder.encoder['_delimiter_'] = len(text_encoder.encoder)  # last id (increments)
token_clf = text_encoder.encoder['_classify_'] = len(text_encoder.encoder)     # last id (increments)
tokens_special = len(text_encoder.encoder) - tokens_regular  # number of extra tokens
vocab_count = tokens_regular + tokens_special

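# Worked example of the bookkeeping above, assuming the standard 40,000-merge BPE
# files (n_vocab == 40478 for those files):
#   token_start    == 40478   # id for '_start_'
#   token_delim    == 40479   # id for '_delimiter_'
#   token_clf      == 40480   # id for '_classify_'
#   tokens_special == 3, vocab_count == 40481
# Adding n_ctx positional slots on top of the bare BPE vocab gives the 40990 used
# as the default `vocab` in the CustomLMModel snippets above
# (40478 + 512 == 40990, i.e. n_special == 0 there).
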
    'embd_pdrop': 0.1,
    'attn_pdrop': 0.1,
    'resid_pdrop': 0.1,
    'afn': 'gelu',
    'clf_pdrop': 0.1
})
args = DEFAULT_CONFIG

encoder = pickle.load(open('vect.p', 'rb')).vocabulary_
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# print("device", device, "n_gpu", n_gpu)
text_encoder = TextEncoder()
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)

x = pd.read_csv('../notes_small.csv').iloc[:200]
x['NOTE_TEXT'] = x['NOTE_TEXT'].apply(u2.cleanNotes)
seq = text_encoder.encode(x['NOTE_TEXT'])
seq = [s[:64] if len(s) > 64 else s for s in seq]  # truncate each note to 64 BPE tokens
seq = sorted(seq, key=lambda x: len(x))

# Setup Model
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']

parser.add_argument('--bpe_path', type=str, default='model/vocab_40000.bpe')
parser.add_argument('--n_transfer', type=int, default=12)
parser.add_argument('--lm_coef', type=float, default=0.5)
parser.add_argument('--b1', type=float, default=0.9)
parser.add_argument('--b2', type=float, default=0.999)
parser.add_argument('--e', type=float, default=1e-8)
args = parser.parse_args()
print(args)
globals().update(args.__dict__)

random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)

logger = ResultLogger(path=os.path.join(log_dir, '{}.jsonl'.format(desc)), **args.__dict__)
text_encoder = TextEncoder(encoder_path, bpe_path)
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)

# (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = encode_dataset(rocstories(data_dir), encoder=text_encoder)
# enco_ry = ruoyao(data_dir)
# (trX1, trX2, trY), (vaX1, vaX2, vaY), (teX1, teX2) = ruoyao(data_dir)
# print(trX1[0])
(trX1, trX2, trY), (vaX1, vaX2, vaY), (teX1, teX2, teY) = encode_dataset(
    ruoyao(data_dir), encoder=text_encoder)
n_y = 2
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
max_len = n_ctx // 2 - 2

def main(args):
    init(args)
    # Constants
    n_ctx = args.n_ctx
    data_dir = args.data_dir

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder(args.encoder_path, args.bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    text_encoder.decoder[len(encoder)] = '_start_'
    encoder['_start_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_delimiter_'
    encoder['_delimiter_'] = len(encoder)
    text_encoder.decoder[len(encoder)] = '_classify_'
    encoder['_classify_'] = len(encoder)
    n_special = 3  # not used by the language modeling task itself
    vocab = n_vocab + n_special + n_ctx

    lm_model = LMModel(args, vocab, n_ctx, return_probs=True, doc_embed=args.doc_model)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
    if args.checkpoint != "none":
        checkpoint = torch.load(args.checkpoint, map_location='cpu')
        state_dict = checkpoint["state_dict"]
        # Strip the "module." prefix added by DataParallel when the checkpoint was saved.
        for key in list(state_dict.keys()):
            state_dict[key[7:]] = state_dict[key]
            del state_dict[key]
        pos_emb_mask = torch.zeros(1, 1, vocab)
        pos_emb_mask[:, :, -n_ctx:] = -1e12  # suppress positional-embedding logits before softmax
        state_dict['pos_emb_mask'] = pos_emb_mask
        lm_model.load_state_dict(state_dict)

    lm_model.to(device)
    lm_model = DataParallelModel(lm_model)

    train_bar = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, encoder,
                           num_workers=1, shuffle=True, max_size=args.n_iter)
    srcs, hyps, refs = [], [], []
    with torch.no_grad():
        lm_model.eval()
        for i, (pad_output, mask_output) in enumerate(tqdm(train_bar), 1):
            src_strs, tgt_strs, gen_strs = generate_outputs(
                lm_model, pad_output, mask_output, text_encoder, device,
                args.beam, args.gen_len, args.k, args.decoding_strategy)
            srcs.extend(src_strs)
            hyps.extend(gen_strs)
            refs.extend(tgt_strs)
    for i in range(len(hyps)):
        print("*" * 50)
        print("Source: {}".format(srcs[i]))
        print('Hypothesis: {}'.format(hyps[i]))
        print("Reference: {}".format(refs[i]))