def transformer_predict(input_file: str, text_encoder: TextEncoder, device: int):
    if device > -1:
        device_name = "cuda"
    else:
        device_name = "cpu"
    print(input_file)
    n_ctx = 512
    transformer = TransformerModel(DEFAULT_CONFIG, n_ctx=n_ctx, requires_grad=False)
    load_openai_pretrained_model(transformer, n_ctx=n_ctx)

    with open(input_file) as f:
        sentences = f.readlines()
    encoded_sentences = text_encoder.encode(sentences)
    masks = [
        np.concatenate((np.ones(len(s)), np.zeros(n_ctx - len(s))))
        for s in encoded_sentences
    ]
    input_tensor = torch.LongTensor([
        pad_sequence_to_length(s, desired_length=512) for s in encoded_sentences
    ])
    if device_name == "cuda":
        input_tensor = input_tensor.cuda()

    batch_size, num_timesteps = input_tensor.size()
    positional_encodings = get_range_vector(num_timesteps, device) + n_ctx
    batch_tensor = torch.stack(
        [input_tensor, positional_encodings.expand(batch_size, num_timesteps)],
        dim=-1)
    if device_name == "cuda":
        transformer = transformer.cuda()
    transformer_embeddings = transformer(batch_tensor)

    np.save("openai_transformer_test_input.npy", batch_tensor.data.cpu().numpy())
    np.save("openai_transformer_test_output.npy",
            transformer_embeddings.data.cpu().numpy())
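
# A hypothetical usage sketch, not part of the original script: the BPE file paths are borrowed
# from the SurprisalAnalyzer snippet below, "sentences.txt" is a placeholder input file, and
# device=-1 selects the CPU branch above.
if __name__ == "__main__":
    text_encoder = TextEncoder("model/encoder_bpe_40000.json", "model/vocab_40000.bpe")
    transformer_predict("sentences.txt", text_encoder, device=-1)
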
def encode_dataset(*splits: Tuple[
        # the four lists are first_four_sentences (len=1497), first_choice (len=1497),
        # second_choice (len=1497), true_choice (len=1497)
        Tuple[List[str], List[str], List[str], ndarray],  # each list of len 1497, train instances
        Tuple[List, List, List, List],                    # each list of len 374, val instances
        Tuple[List, List, List, List]                     # each list of len 1871, test instances
    ], encoder: TextEncoder):
    encoded_splits = []
    for split in splits:  # loop over trainInstances, valInstances and testInstances
        fields = []
        for field in split:  # a field is one list of str (sentences) or int (true answers)
            if isinstance(field[0], str):  # check the first element in the field to see whether it is a str
                # each str element in the field list is encoded as a list of int,
                # hence field becomes List[List[int]]
                field = encoder.encode(field)  # only encode sentences, not the true answers (int choice: {0, 1})
            fields.append(field)
        encoded_splits.append(fields)
    return encoded_splits
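
# A toy illustration (hypothetical data, not the real ROCStories splits) of the expected input:
# each split is four parallel lists, and only the string fields get BPE-encoded; text_encoder is
# assumed to be a TextEncoder built as in the other snippets.
toy_split = (
    ["Tom was hungry . He made a sandwich . He ate it . He felt better ."],  # first four sentences
    ["He was still hungry ."],                                               # first candidate ending
    ["He felt full ."],                                                      # second candidate ending
    np.array([1]),                                                           # index of the true ending
)
(encoded_toy,) = encode_dataset(toy_split, encoder=text_encoder)
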
n_special = 0  # XD: useless for the language modeling task
vocab = n_vocab + n_special + n_ctx  # the size of the vocabulary - in this case it is letters, so I don't think it's what we need
lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
lm_model.to(device)
lm_model.eval()  # up to this point the pretrained model and the vocabulary to be used have been loaded

text = input('Input some beginning words:')  # why do we need this?
create_dictionary(text_encoder)
while text != 'q':
    X = text_encoder.encode([text, ])
    XMB = make_batch(X)
    for _ in range(args.gen_len):
        lm_probs = lm_model(XMB)  # the probability of each word in the vocabulary?
        if args.topk == 0:
            next_idx = torch.multinomial(lm_probs[:, -1, :], 1)
        else:
            # prob = 0
            # choosen_word = 0
            tmp = []
            for index in encoded_words:
                tmp += [lm_probs[:, -1, :][:, index].item()]
            # TODO
            # if tmp >= prob:
            #     prob = tmp
            #     choosen_word = index  - I think we can delete this part
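            # A hedged sketch of one way to finish the TODO above (not necessarily the author's
            # intent): greedily pick the candidate from `encoded_words` whose probability under
            # the model is highest, and keep the same [1, 1] shape as the topk == 0 branch.
            best = tmp.index(max(tmp))  # position of the most probable candidate
            next_idx = torch.tensor([[encoded_words[best]]], dtype=torch.long)
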
class SurprisalAnalyzer:

    def __init__(self):
        # initialize the language model, the text encoder and everything else
        # set up the encoder to turn words into indices
        encoder_path = 'model/encoder_bpe_40000.json'
        bpe_path = 'model/vocab_40000.bpe'
        self.text_encoder = TextEncoder(encoder_path, bpe_path)
        self.nvocab = len(self.text_encoder.encoder)
        nctx = 512  # number of positional embeddings (nctx = length of the context window)
        vocab = self.nvocab + nctx

        # set up the pretrained OpenAI model
        args = DEFAULT_CONFIG
        self.lm_model = LMModel(args, vocab, nctx, return_probs=True)
        load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
        self.lm_model.eval()  # this line puts the model in eval mode so we don't do dropout :)

        # set up spacy for POS tagging
        self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])

    def make_batch(self, X):
        X = np.array(X)
        assert X.ndim in [1, 2]
        if X.ndim == 1:
            X = np.expand_dims(X, axis=0)
        # add positional encodings - just a second dimension that says which word is where
        pos_enc = np.arange(self.nvocab, self.nvocab + X.shape[-1])
        pos_enc = np.expand_dims(pos_enc, axis=0)
        batch = np.stack([X, pos_enc], axis=-1)
        batch = torch.tensor(batch, dtype=torch.long)
        return batch

    def _get_continuation_tensor(self, sent_vec):
        """
        Deals strictly with tensors
        """
        sent_batch = self.make_batch(sent_vec)
        sent_res = self.lm_model(sent_batch)
        return sent_res

    def tensor_to_probs(self, tensor):
        """
        Converts a torch tensor to a clean numpy array holding probabilities
        (basically just hides some nasty code)
        """
        return tensor[:, -1, :].flatten().detach().numpy()

    def get_continuation_probs(self, sentence):
        sent_vec = self.text_encoder.encode([sentence])
        tensor = self._get_continuation_tensor(sent_vec)
        return self.tensor_to_probs(tensor)

    def _get_continuations(self, sent_res, k=10, verbose=False):
        """
        Making this private so I can access it externally... that's awful
        This is a helper function for the `get_continuations` wrapper that separates the actual
        processing of the sentence from getting the top continuations
        """
        probs, decode = sent_res[:, -1, :].topk(k)
        if verbose:
            for p, d in zip(probs.flatten(), decode.flatten()):
                print("\t...%s (%.4f)" % (self.text_encoder.decoder[d.item()], p.item()))
        words = [self.text_encoder.decoder[d.item()] for d in decode.flatten()]
        # strip off the word-ending tags if there are any - if it's not a full continuation, what to do?
        for i in range(len(words)):
            if words[i][-4:] == "</w>":
                words[i] = words[i][:-4]
        probs = probs.flatten().detach().numpy()  # convert probs from a tensor to a numpy array
        return words, probs

    def get_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: a string that you want to get next words for
        k: how many next words you want to get
        verbose: do you want to print the output
        """
        sent_vec = self.text_encoder.encode([sentence])
        sent_res = self._get_continuation_tensor(sent_vec)
        if verbose:
            print(sentence)
        return self._get_continuations(sent_res, k, verbose)

    def _get_pos_continuations(self, sentence, words, probs):
        """
        Helper function for `get_pos_continuations` that takes the lists of words and
        probabilities and performs all the computation to get the most common POS tags,
        independently of processing an individual sentence
        """
        # get the POS of all k continuations
        pos_counter = Counter()
        for word, prob in zip(words, probs):
            sentence_continuation = "{} {}".format(sentence, word)
            encoded = self.nlp(sentence_continuation)
            pos_counter[encoded[-1].pos_] += prob

        # format pos_counter's most_common output as two lists, one of POS tags and one of probs
        pos_counter_list = list(zip(*pos_counter.most_common()))
        pos_tags, pos_tag_probs = list(pos_counter_list[0]), np.array(pos_counter_list[1], dtype=np.float32)
        return pos_tags, pos_tag_probs

    def get_pos_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: string you want next parts of speech for
        k: how many top words to analyze
        NOTE: unlike in the `get_continuations` function, k is NOT how many unique POS tags
        you want to look at, it's how many words you want to consider
        """
        # get likely next words
        words, probs = self.get_continuations(sentence, k, verbose=False)
        return self._get_pos_continuations(sentence, words, probs)

    ################################################################################
    # The following three functions calculate entropy/surprisal of a SINGLE word
    ################################################################################
    def _get_surprisal(self, distribution, index):
        word_prob = distribution[index]
        return -np.log2(word_prob)

    def get_surprisal(self, sentence, word):
        """
        Get the -log2 probability of the word following the sentence
        """
        all_probs = self.get_continuation_probs(sentence)
        # if the word is not in the vocabulary in full, represent its probability by the
        # probability of the first part of its encoding (the 0 index)
        word_index = self.text_encoder.encode([word])[0]
        return self._get_surprisal(all_probs, word_index)

    def _get_entropy(self, distribution):
        return -np.sum([p * np.log2(p) if p > 0 else 0 for p in distribution])

    def get_entropy(self, sentence):
        """
        Finds the Shannon entropy of predicting the word following the sentence
        """
        all_probs = self.get_continuation_probs(sentence)
        return self._get_entropy(all_probs)

    def get_surprisal_entropy_ratio(self, sentence, word):
        """
        Gets the ratio between surprisal and entropy at the end of the sentence for a given word
        """
        all_probs = self.get_continuation_probs(sentence)
        word_index = self.text_encoder.encode([word])[0]
        entropy = self._get_entropy(all_probs)
        surprisal = self._get_surprisal(all_probs, word_index)
        return surprisal / entropy

    ####################################################################
    # Same as above but for part of speech
    ####################################################################
    def get_surprisal_pos(self, sentence, pos,
                           k=1000):
        """
        Because the language model is not a POS tagger, we cannot directly calculate the
        surprisal of the POS from a full probability distribution; instead we have to use the
        degenerate distribution computed from the top k most probable POS continuations

        sentence is the full sentence
        pos is the POS we want to get the surprisal of
        k is how many possible continuations to check
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        pos_index = pos_tags.index(pos)  # assume the POS we want is in the list somewhere...
        return self._get_surprisal(pos_tag_probs, pos_index)

    def get_entropy_pos(self, sentence, k=1000):
        """
        Disclaimer about the degenerate distribution same as above
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        return self._get_entropy(pos_tag_probs)

    #####################################################################
    # Gets all of the above metrics for every word in a single sentence #
    #####################################################################
    def get_surprisal_sentence(self, sentence, prepend=None, start=1):
        """
        A little uglier, but perhaps faster
        """
        surprisals = []
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of indices in the encoder
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]

        # if you run the language model on the whole sentence, the outputs at each position
        # are the probabilities for the next word!
        sent_batch = self.make_batch([sent_enc])
        sent_tensor = self.lm_model(sent_batch)
        for i in range(start, len(sent_enc)):
            surprisals.append(-np.log2(sent_tensor[:, i - 1, sent_enc[i]].item()))
        return surprisals, sent_dec

    def get_s_h_shr_sentence(self, sentence, prepend=None, start=1):
        """
        Calculates the surprisal, entropy, and surprisal-entropy ratio at each word
        (as defined by the BPE) in the sentence

        Returns, in order:
        1. The list of surprisals (len(sentence) - 1)
        2. The list of entropies (len(sentence) - 1)
        3. The list of ratios between surprisals and entropies (len(sentence) - 1)
        4. The decoded tokens that are used by the BPE encoder wrapper
        """
        surprisals, entropies, surprisal_entropy_ratios = [], [], []
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of indices in the encoder
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]
        # start = max(0, min(1, start))  # doesn't work because the language model needs to condition on something
        start = 1
        for i in range(start, len(sent_enc)):
            partial_sent_enc = [sent_enc[:i]]
            cont_tensor = self._get_continuation_tensor(partial_sent_enc)
            partial_probs = self.tensor_to_probs(cont_tensor)
            surprisals.append(self._get_surprisal(partial_probs, sent_enc[i]))
            entropies.append(self._get_entropy(partial_probs))
            surprisal_entropy_ratios.append(surprisals[-1] / entropies[-1])
        return surprisals, entropies, surprisal_entropy_ratios, sent_dec
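
# A hypothetical usage sketch (assumes the pretrained weights under model/ and the spacy 'en'
# model are installed; the sentences are just examples, not from the original code):
if __name__ == "__main__":
    analyzer = SurprisalAnalyzer()
    words, probs = analyzer.get_continuations("The children went outside to", k=5, verbose=True)
    surprisal = analyzer.get_surprisal("The children went outside to", "play")
    s, h, shr, tokens = analyzer.get_s_h_shr_sentence("The children went outside to play .")
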
args = DEFAULT_CONFIG
encoder = pickle.load(open('vect.p', 'rb')).vocabulary_
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# n_gpu = torch.cuda.device_count()
# print("device", device, "n_gpu", n_gpu)

text_encoder = TextEncoder()
encoder = text_encoder.encoder
n_vocab = len(text_encoder.encoder)

x = pd.read_csv('../notes_small.csv').iloc[:200]
x['NOTE_TEXT'] = x['NOTE_TEXT'].apply(u2.cleanNotes)
seq = text_encoder.encode(x['NOTE_TEXT'])
seq = [s[:64] if len(s) > 64 else s for s in seq]
seq = sorted(seq, key=len)

# Setup model
encoder['_start_'] = len(encoder)
encoder['_delimiter_'] = len(encoder)
encoder['_classify_'] = len(encoder)
clf_token = encoder['_classify_']
n_special = 3
n_ctx = np.array([len(t) for t in seq]).max() + 2
n_ctx = int(n_ctx)
print(n_ctx)
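
# A hedged sketch (hypothetical helper, not in the original script) of how the special tokens
# prepared above are commonly assembled into a classification input of length n_ctx:
# [_start_] + truncated note tokens + [_classify_], padded with zeros; the +2 in n_ctx above
# leaves room for exactly these two special tokens.
def build_clf_input(tokens, start_token, clf_token, n_ctx):
    x = [start_token] + tokens[:n_ctx - 2] + [clf_token]
    return x + [0] * (n_ctx - len(x))  # zero-pad up to the context length

# e.g. xs = [build_clf_input(t, encoder['_start_'], clf_token, n_ctx) for t in seq]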