Example #1
def transformer_predict(input_file: str, text_encoder: TextEncoder,
                        device: int):
    # A non-negative device id selects the GPU; anything below 0 means CPU.
    if device > -1:
        device_name = "cuda"
    else:
        device_name = "cpu"

    print(input_file)
    n_ctx = 512

    transformer = TransformerModel(DEFAULT_CONFIG,
                                   n_ctx=n_ctx,
                                   requires_grad=False)
    load_openai_pretrained_model(transformer, n_ctx=n_ctx)

    with open(input_file) as f:
        sentences = f.readlines()

    encoded_sentences = text_encoder.encode(sentences)

    # Attention masks: 1 for real tokens, 0 for padding (computed here but
    # not used further down in this snippet).
    masks = [
        np.concatenate((np.ones(len(s)), np.zeros(n_ctx - len(s))))
        for s in encoded_sentences
    ]

    input_tensor = torch.LongTensor([
        pad_sequence_to_length(s, desired_length=n_ctx)
        for s in encoded_sentences
    ])
    if device_name == "cuda":
        input_tensor = input_tensor.cuda()

    batch_size, num_timesteps = input_tensor.size()

    positional_encodings = get_range_vector(num_timesteps, device) + n_ctx

    # Stack token ids and position ids into the (batch, time, 2) layout the
    # transformer expects.
    batch_tensor = torch.stack(
        [input_tensor,
         positional_encodings.expand(batch_size, num_timesteps)],
        dim=-1)

    if device_name == "cuda":
        transformer = transformer.cuda()

    transformer_embeddings = transformer(batch_tensor)

    np.save("openai_transformer_test_input.npy",
            batch_tensor.data.cpu().numpy())
    np.save("openai_transformer_test_output.npy",
            transformer_embeddings.data.cpu().numpy())
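
A brief usage sketch, assuming the standard BPE encoder files from the OpenAI release and a plain-text input file with one sentence per line (the paths here are illustrative):

text_encoder = TextEncoder('model/encoder_bpe_40000.json', 'model/vocab_40000.bpe')
transformer_predict('sentences.txt', text_encoder, device=-1)  # -1 runs on CPU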
Example #2
def encode_dataset(*splits: Tuple[
    # Each split holds four parallel lists: first_four_sentences,
    # first_choice, second_choice and true_choice.
    Tuple[List[str], List[str], List[str], ndarray],  # train instances, each list of len 1497
    Tuple[List, List, List, List],  # val instances, each list of len 374
    Tuple[List, List, List, List]  # test instances, each list of len 1871
], encoder: TextEncoder):
    encoded_splits = []
    for split in splits:  # loop over the train, val and test instances
        fields = []
        for field in split:  # a field is one list of str (sentences) or int (true answers)
            if isinstance(field[0], str):  # check the first element to see whether the field holds strings
                # Each str in the field is encoded as a list of ints, so the
                # field becomes List[List[int]]. Only sentences are encoded;
                # the true answers (ints in {0, 1}) pass through unchanged.
                field = encoder.encode(field)
            fields.append(field)
        encoded_splits.append(fields)
    return encoded_splits
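
A minimal usage sketch with a made-up single-instance split; the real ROCStories loading is not shown, so the data below is purely illustrative:

text_encoder = TextEncoder('model/encoder_bpe_40000.json', 'model/vocab_40000.bpe')
train = (["She opened the door."], ["It was a cat."], ["It was a dog."], np.array([0]))
(tr_x, tr_c1, tr_c2, tr_y), = encode_dataset(train, encoder=text_encoder)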
Example #3
    n_special = 0   # XD: not needed for the language modeling task
    vocab = n_vocab + n_special + n_ctx  # total embedding size: token vocabulary plus special tokens plus positional slots


    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
    lm_model.to(device)

    lm_model.eval()
    # At this point the pretrained model and its vocabulary have been loaded.
    text = input('Input some beginning words:')  # prompt for the seed text to continue
    create_dictionary(text_encoder)

    while text != 'q':
        X = text_encoder.encode([text,])
        XMB = make_batch(X)

        for _ in range(args.gen_len):
            lm_probs = lm_model(XMB)  # probability of every vocabulary word at each position
            if args.topk == 0:
                # Sample the next token from the full distribution at the last position.
                next_idx = torch.multinomial(lm_probs[:, -1, :], 1)
            else:
                # Gather the probability of each candidate word at the last
                # position (see the completion sketch below).
                tmp = []
                for index in encoded_words:
                    tmp += [lm_probs[:, -1, :][:, index].item()]  # TODO: choose the next word from these
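
The loop ends mid-computation at the TODO. A minimal completion sketch, greedily picking the candidate with the highest probability (an assumption about the intended behavior, not code from the original):

                next_idx = torch.tensor([[encoded_words[int(np.argmax(tmp))]]])  # greedy choice over the restricted dictionary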
Example #4
class SurprisalAnalyzer:

    def __init__(self):
        # initialize lm and text encoder and everything

        # set up the encoder to turn words into indices
        encoder_path = 'model/encoder_bpe_40000.json'
        bpe_path = 'model/vocab_40000.bpe'
        self.text_encoder = TextEncoder(encoder_path, bpe_path)

        self.nvocab = len(self.text_encoder.encoder)
        nctx = 512  # number of positional embeddings, i.e. the maximum context length
        vocab = self.nvocab + nctx

        # set up pretrained openai model
        args = DEFAULT_CONFIG
        self.lm_model = LMModel(args, vocab, nctx, return_probs=True)
        load_openai_pretrained_model(self.lm_model.transformer, n_ctx=nctx, n_special=0)
        self.lm_model.eval()  # eval mode, so dropout is disabled


        # set up spacy for pos tagging
        self.nlp = spacy.load('en', disable=['ner', 'textcat', 'parser'])

    def make_batch(self, X):
        X = np.array(X)
        assert X.ndim in [1, 2]
        if X.ndim == 1:
            X = np.expand_dims(X, axis=0)
        # Add positional encodings: a parallel channel of position ids, which
        # start right after the vocabulary ids in the embedding matrix.
        pos_enc = np.arange(self.nvocab, self.nvocab + X.shape[-1])
        pos_enc = np.expand_dims(pos_enc, axis=0)
        batch = np.stack([X, pos_enc], axis=-1)
        batch = torch.tensor(batch, dtype=torch.long)
        return batch
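
    # Shape note (illustration, not from the original): for an encoded
    # sentence of n tokens, make_batch returns a LongTensor of shape
    # (1, n, 2) whose last axis pairs each token id with its position id
    # (self.nvocab + t), matching the embedding layout the LM expects.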

    def _get_continuation_tensor(self, sent_vec):
        """
        Deals strictly with tensors
        """
        sent_batch = self.make_batch(sent_vec)
        sent_res = self.lm_model(sent_batch)
        return sent_res

    def tensor_to_probs(self, tensor):
        """
        converts torch tensor to clean numpy array holding probabilities
        (Basically just hides some nasty code)
        """
        return tensor[:, -1, :].flatten().detach().numpy()

    def get_continuation_probs(self, sentence):
        sent_vec = self.text_encoder.encode([sentence])
        tensor = self._get_continuation_tensor(sent_vec)
        return self.tensor_to_probs(tensor)

    def _get_continuations(self, sent_res, k=10, verbose=False):
        """
        Helper for the `get_continuations` wrapper: separates fetching the
        top continuations from the processing of the sentence itself.
        (Nominally private, though it is also called externally.)
        """
        probs, decode = sent_res[:,-1,:].topk(k)
        if verbose:
            for p, d in zip(probs.flatten(), decode.flatten()):
                print("\t...%s (%.4f)"%(self.text_encoder.decoder[d.item()], p.item()))
        words = [self.text_encoder.decoder[d.item()] for d in decode.flatten()]
        # Strip off the </w> word-ending tags where present. (Open question:
        # how to handle a continuation that is not a full word?)
        for i in range(len(words)):
            if words[i][-4:] == "</w>":
                words[i] = words[i][:-4]
        probs = probs.flatten().detach().numpy() # convert probs from tensor to numpy array
        return words, probs

    def get_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: a string that you want to get next words for
        k: how many next words you want to get
        verbose: do you want to print the output
        """
        sent_vec = self.text_encoder.encode([sentence])
        sent_res = self._get_continuation_tensor(sent_vec)
        if verbose:
            print(sentence)

        return self._get_continuations(sent_res, k, verbose)


    def _get_pos_continuations(self, sentence, words, probs):
        """
        helper function for `get_pos_continuations` that takes the lists of words and
        probabilities and performs all the computation to get the most common pos
        tags independently of processing an individual sentence
        """
        # get POS of all of k continuations
        pos_counter = Counter()

        for word, prob in zip(words, probs):
            sentence_continuation = "{} {}".format(sentence, word)
            encoded = self.nlp(sentence_continuation)
            pos_counter[encoded[-1].pos_] += prob

        # format pos_counter most common output as two lists, one of probs and one of pos tags
        pos_counter_list = list(zip(*pos_counter.most_common()))
        pos_tags, pos_tag_probs = list(pos_counter_list[0]), np.array((pos_counter_list[1]), dtype=np.float32)
        return pos_tags, pos_tag_probs

    def get_pos_continuations(self, sentence, k=10, verbose=False):
        """
        sentence: string you want next parts of speech for
        k: how many top words to analyze 
        NOTE: unlike in the `get_continuations` function, k is NOT how many
        unique POS tags to look at; it is how many words to consider
        """
        # get likely next words
        words, probs = self.get_continuations(sentence, k, verbose=False)
        return self._get_pos_continuations(sentence, words, probs)



    ################################################################################
    # The following three functions calculate entropy/surprisal of a SINGLE word
    ################################################################################
    def _get_surprisal(self, distribution, index):
        word_prob = distribution[index]
        return -np.log2(word_prob)
    
    def get_surprisal(self, sentence, word):
        """
        get the -log2 probability of the word following the sentence
        """
        all_probs = self.get_continuation_probs(sentence)
        # If the word is not in the vocabulary as a single unit, its
        # probability is represented by the first piece of its BPE encoding
        # (the 0 index).
        word_index = self.text_encoder.encode([word])[0]
        return self._get_surprisal(all_probs, word_index)

    def _get_entropy(self, distribution):
        return -np.sum([p*np.log2(p) if p > 0 else 0 for p in distribution])

    def get_entropy(self, sentence):
        """
        finds the shannon entropy of predicting the word following sentence
        """
        all_probs = self.get_continuation_probs(sentence)
        return self._get_entropy(all_probs)

    def get_surprisal_entropy_ratio(self, sentence, word):
        "gets ratio betwen surprisal and entropy at the end of the sentence for a given word"
        all_probs = self.get_continuation_probs(sentence)
        word_index = self.text_encoder.encode([word])[0]
        entropy = self._get_entropy(all_probs)
        surprisal = self._get_surprisal(all_probs, word_index)
        return surprisal/entropy
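
    # In formulas (illustration): surprisal(w) = -log2 p(w | sentence),
    # entropy H = -sum_w p(w) log2 p(w), and the ratio is surprisal / H,
    # i.e. how surprising the word is relative to the overall uncertainty.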

    ####################################################################
    # Same as above but for part of speech
    ####################################################################
    def get_surprisal_pos(self, sentence, pos, k=1000):
        """
        Because the language model is not a POS tagger, we cannot directly
        calculate the surprisal of the POS from a full probability
        distribution; instead we have to use the degenerate distribution
        computed from the top k most probable POS continuations

        sentence is full sentence
        pos is pos we want to get surprisal of
        k is how many possible continuations to check
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        pos_index = pos_tags.index(pos) # assume the POS we want is in the list somewhere...
        return self._get_surprisal(pos_tag_probs, pos_index)

        
    def get_entropy_pos(self, sentence, k=1000):
        """
        Disclaimer about degenerate distribution same as above
        """
        pos_tags, pos_tag_probs = self.get_pos_continuations(sentence, k)
        return self._get_entropy(pos_tag_probs)


    
    #####################################################################
    # Gets all of the above metrics for every word in a single sentence #
    #####################################################################
    def get_surprisal_sentence(self, sentence, prepend=None, start=1):
        """
        A little uglier, but perhaps faster

        """
        surprisals = []
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]

        # if you run the language model with the whole sentence the outputs for each
        # word are the probabilities for the next word!
        sent_batch = self.make_batch([sent_enc])
        sent_tensor = self.lm_model(sent_batch)
        for i in range(start, len(sent_enc)):
            surprisals.append(-np.log2(sent_tensor[:,i-1,sent_enc[i]].item()))
        return surprisals, sent_dec
        
    def get_s_h_shr_sentence(self, sentence, prepend=None, start=1):
        """
        calculates the surprisal, entropy, and surprisal-entropy-ratio at each word (defined by bpe)
        in the sentence

        returns, in order
        1. The list of surprisals (len(sentence) - 1)
        2. The list of entropies  (len(sentence) - 1)
        3. The list of ratios between surprisals and entropies (len(sentence) - 1)
        4. The decoded tokens that are used by the BPE encoder wrapper
        """
        surprisals, entropies, surprisal_entropy_ratios = [],[],[]
        sent_enc = self.text_encoder.encode([sentence])[0]  # 1-d list of encoder indices
        if prepend is not None:
            sent_enc = prepend + sent_enc
        sent_dec = [self.text_encoder.decoder[ind] for ind in sent_enc]

        # start = max(0, min(1, start)) does not work here: the language model
        # needs at least one token to condition on, so start is pinned to 1.
        start = 1

        for i in range(start, len(sent_enc)):
            partial_sent_enc = [sent_enc[:i]]
            cont_tensor = self._get_continuation_tensor(partial_sent_enc)
            partial_probs = self.tensor_to_probs(cont_tensor)

            surprisals.append(self._get_surprisal(partial_probs, sent_enc[i]))
            entropies.append(self._get_entropy(partial_probs))
            surprisal_entropy_ratios.append(surprisals[-1]/entropies[-1])

        return surprisals, entropies, surprisal_entropy_ratios, sent_dec
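
A short usage sketch, assuming the model/ directory holds the standard encoder files and pretrained weights; the sentence and word are made up:

analyzer = SurprisalAnalyzer()
words, probs = analyzer.get_continuations("The cat sat on the", k=5, verbose=True)
s = analyzer.get_surprisal("The cat sat on the", "mat")
h = analyzer.get_entropy("The cat sat on the")
print(s, h, s / h)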
Example #5
    args = DEFAULT_CONFIG

    encoder = pickle.load(open('vect.p', 'rb')).vocabulary_  # overwritten below by the BPE encoder

    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #n_gpu = torch.cuda.device_count()
    #print("device", device, "n_gpu", n_gpu)

    text_encoder = TextEncoder()
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    x = pd.read_csv('../notes_small.csv').iloc[:200]
    x['NOTE_TEXT'] = x['NOTE_TEXT'].apply(u2.cleanNotes)

    seq = text_encoder.encode(x['NOTE_TEXT'])
    seq = [s[:64] if len(s) > 64 else s for s in seq]  # truncate each note to 64 BPE tokens
    seq = sorted(seq, key=lambda s: len(s))  # sort notes by length

    # Set up the model: append the three special tokens to the encoder.
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']

    n_special = 3
    # Longest sequence plus two, leaving room for the _start_ and _classify_ tokens.
    n_ctx = np.array([len(t) for t in seq]).max() + 2
    n_ctx = int(n_ctx)

    print(n_ctx)
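
The snippet stops after computing n_ctx. A plausible next step, mirroring the model setup in Examples #2 and #4 (a sketch, not part of the original):

    vocab = n_vocab + n_special + n_ctx  # token vocab + special tokens + positional slots
    lm_model = LMModel(args, vocab, n_ctx, return_probs=True)
    load_openai_pretrained_model(lm_model.transformer, n_ctx=n_ctx, n_special=n_special)
    lm_model.eval()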