def evaluate_model(model, data, corpus, word_to_index, cuda, eval_func):
    rrs = []
    for query in data.keys():
        positives, candidates = data[query]
        embeddings = []
        embeddings.append(
            pad(merge_title_and_body(corpus[query]), len(word_to_index)))
        for candidate in candidates:
            embeddings.append(
                pad(merge_title_and_body(corpus[candidate]), len(word_to_index)))
        embeddings = Variable(torch.from_numpy(np.array(embeddings)))
        if cuda:
            embeddings = embeddings.cuda()
        encodings = model(embeddings)
        similarities = F.cosine_similarity(
            encodings[1:],
            encodings[0].repeat(len(encodings) - 1, 1),
            dim=1)
        _, candidates_ranked = zip(
            *sorted(zip(similarities.data, candidates), reverse=True))
        rrs.append(eval_func(positives, candidates_ranked))
    return np.mean(rrs)
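# A minimal sketch of an `eval_func` compatible with the evaluator above,
# assuming mean reciprocal rank is the intended metric (the
# `positives`/`candidates_ranked` signature is taken from the call site;
# the helper name `reciprocal_rank` is hypothetical).
def reciprocal_rank(positives, candidates_ranked):
    # Reciprocal of the 1-based rank of the first relevant candidate;
    # 0.0 if no positive appears in the ranking.
    positives = set(positives)
    for rank, candidate in enumerate(candidates_ranked, start=1):
        if candidate in positives:
            return 1.0 / rank
    return 0.0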
def __init__(self, trees, minibatch_size=128, wordmap=None, shuffle=True):
    self.wordmap = wordmap or self._build_wordmap(trees)

    is_leafs = self._is_leafs(trees)
    word_indices = self._word_indices(trees, self.wordmap)
    child_indices = self._child_indices(trees)
    targets = self._targets(trees)

    # Apply padding
    is_leafs, lengths = helpers.pad(is_leafs, 0)
    word_indices, _ = helpers.pad(word_indices, 0)
    child_indices, _ = helpers.pad(child_indices, 0)
    targets, _ = helpers.pad(targets, 0)

    if shuffle:
        # Shuffle
        lengths, \
            is_leafs, \
            word_indices, \
            child_indices, \
            targets = helpers.parallel_shuffle(
                (lengths, is_leafs, word_indices, child_indices, targets)
            )

    # Now sort in order of increasing input length
    lengths, \
        is_leafs, \
        word_indices, \
        child_indices, \
        targets = helpers.sort_by(
            lengths,
            (lengths, is_leafs, word_indices, child_indices, targets)
        )

    # Transpose data from example -> sequence to sequence -> example
    is_leafs = is_leafs.transpose(1, 0)
    word_indices = word_indices.transpose(1, 0)
    child_indices = child_indices.transpose(1, 2, 0)  # seq -> l/r -> ex
    targets = targets.transpose(1, 0)

    # Load into theano shared vars
    self.is_leafs = theano.shared(is_leafs, borrow=True)
    self.word_indices = theano.shared(word_indices, borrow=True)
    self.child_indices = theano.shared(child_indices, borrow=True)
    self.targets = theano.shared(targets, borrow=True)
    # these need to be stored on the GPU as floats and casted when needed
    self.lengths = theano.shared(
        lengths.astype(theano.config.floatX), borrow=True)

    self.minibatch_size = minibatch_size
    self.minibatch_count = int(
        numpy.ceil(float(len(trees)) / minibatch_size))
def __init__(self, inputs, targets, minibatch_size=128,
             input_padding_val=0, target_padding_val=ctc.PADDING):
    """Load a dataset into Theano shared variables."""
    # Apply padding
    inputs, lengths = helpers.pad(inputs, input_padding_val)
    targets, _ = helpers.pad(targets, target_padding_val)

    # Shuffle
    inputs, \
        targets, \
        lengths = helpers.parallel_shuffle(
            (inputs, targets, lengths)
        )

    # Now sort in order of increasing input length
    inputs, \
        targets, \
        lengths = helpers.sort_by(
            lengths,
            (inputs, targets, lengths)
        )

    # Transpose data from example -> sequence to sequence -> example
    inputs = inputs.transpose(1, 0, 2)
    # Does each example have a target class (instead of an output vector)?
    classification = (len(targets.shape) == 2)
    if classification:
        targets = targets.transpose(1, 0)
    else:
        targets = targets.transpose(1, 0, 2)

    # Load into theano shared vars
    self.inputs = theano.shared(inputs, borrow=True)
    self.targets = theano.shared(targets, borrow=True)
    # this needs to be stored on the GPU as floats and casted when needed
    self.lengths = theano.shared(
        lengths.astype(theano.config.floatX), borrow=True)

    self.minibatch_size = minibatch_size
    self.minibatch_count = int(
        numpy.ceil(float(inputs.shape[1]) / minibatch_size))

    self.ttargets = ctc.transform_targets(self.targets)
def just_encrypt(msg, key, decrease_unknown=0):
    msg = msg + unknown[decrease_unknown:]
    # padchars = (random_bytes(5, 10), random_bytes(5, 10))
    # msg = padchars[0] + msg + padchars[1]
    msg = pad(msg, 16)
    encrypted = encrypt_ecb(msg, key)
    return encrypted
def ising_energy(model, states):
    """Compute local energies of the Ising model.

    Parameters
    ----------
    states : Tensor of shape (N, system_size)

    Returns
    -------
    energies : tensor of shape (N,)
    """
    batch_size = tf.shape(states)[0]
    states_shaped = tf.reshape(states, (batch_size,) + SYSTEM_SHAPE)
    states_padded = pad(states_shaped, SYSTEM_SHAPE, [(K - 1) // 2] * N_DIMS)
    factors = tf.reshape(model.factors(states_padded), (batch_size, -1))
    factor_windows = all_windows(factors, SYSTEM_SHAPE, HALF_WINDOW_SHAPE)
    spin_windows = all_windows(states, SYSTEM_SHAPE, FULL_WINDOW_SHAPE)

    # Flip the centre spin of every window
    flipper = np.ones(FULL_WINDOW_SIZE, dtype=np.int32)
    flipper[(FULL_WINDOW_SIZE - 1) // 2] = -1
    spins_flipped = spin_windows * flipper

    factors_flipped = tf.reshape(
        model.factors(tf.reshape(
            spins_flipped,
            (batch_size * NUM_SPINS,) + FULL_WINDOW_SHAPE)),
        (batch_size, NUM_SPINS, HALF_WINDOW_SIZE))
    log_pop = tf.reduce_sum(factors_flipped - factor_windows, 2)

    alignedness = tf.reduce_sum(interactions(states, SYSTEM_SHAPE), [1, 2])
    energy = -H * tf.reduce_sum(tf.exp(log_pop), 1) - \
        tf.cast(alignedness, tf.complex64)
    return energy / NUM_SPINS
def train(self, data, labels, epochs=1, record_epochs=False, validation_set=None):
    """
    Runs the aggressive-margin version of the perceptron algorithm on the
    given data.

    :param data: numpy array of data points to be used for training. The
        array is padded with a 1's column here to ensure a bias weight is
        included.
    :param labels: numpy array specifying the labels {-1, 1}
    :return: None
    """
    # Pad the data with an all ones vector.
    p_data = pad(data)

    # Initialize the weights.
    self.weights = np.random.uniform(low=-0.01, high=0.01, size=p_data.shape[1])

    for epoch in range(epochs):
        # Go through each data point.
        for x, y in zip(*shuffle(p_data, labels)):
            # If (w^t*x + b)*y < margin make an update.
            if np.dot(self.weights, x) * y < self.margin:
                # Calculate the aggressive learning rate.
                aggressive_learning_rate = \
                    (self.margin - (y * np.dot(self.weights, x))) / (np.dot(x, x) + 1)
                # Update the weights.
                self.weights = self.weights + (aggressive_learning_rate * y * x)
                # Record update count.
                self.update_count += 1

        # Record epoch-specific information if specified.
        if record_epochs:
            val_x, val_y = validation_set[0], validation_set[1]
            self.epoch_records[epoch + 1] = {
                'accuracy': accuracy(self.predict(val_x), val_y),
                'weights': self.weights,
            }
def get_dis_batch(batch_size, corpus, word_to_index):
    keys = list(corpus.keys())
    random.shuffle(keys)
    keys = keys[:batch_size]
    query_index_sequences = [merge_title_and_body(corpus[k]) for k in keys]
    embedded = [pad(seq, len(word_to_index)) for seq in query_index_sequences]
    batch = torch.from_numpy(np.array(embedded))
    return batch
def process_batch_pairs(pairs, data, corpus, word_to_index):
    batch_querys = []
    batch_positives = []
    batch_negatives = []
    for query, positive in pairs:
        query_index_sequence = merge_title_and_body(corpus[query])
        batch_querys.append(pad(query_index_sequence, len(word_to_index)))
        positive_index_sequence = merge_title_and_body(corpus[positive])
        batch_positives.append(pad(positive_index_sequence, len(word_to_index)))
        negatives = [merge_title_and_body(corpus[neg])
                     for neg in random.sample(data[(query, positive)],
                                              NEGATIVE_QUERYS_PER_SAMPLE)]
        negatives = [pad(neg, len(word_to_index)) for neg in negatives]
        batch_negatives.append(negatives)
    batch_querys = torch.from_numpy(np.array(batch_querys))
    batch_positives = torch.from_numpy(np.array(batch_positives))
    batch_negatives = torch.from_numpy(np.array(batch_negatives))
    return batch_querys, batch_positives, batch_negatives
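# The retrieval snippets above and below all call a two-argument
# `pad(seq, pad_index)` that fixes each index sequence to a common length,
# using `len(word_to_index)` (one past the last real vocabulary id) as the
# padding index. Its definition is not shown in this section, and other
# snippets here use different `pad` helpers entirely; a minimal sketch
# consistent with these call sites, with a hypothetical `MAX_LENGTH`
# constant, might look like:
MAX_LENGTH = 100  # hypothetical; the real constant is not shown here

def pad(sequence, pad_index, width=MAX_LENGTH):
    # Truncate to `width`, then right-pad with `pad_index` so every
    # sequence in a batch has the same length.
    sequence = sequence[:width]
    return sequence + [pad_index] * (width - len(sequence))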
def evaluate_model(model, data, corpus, word_to_index, cuda):
    auc = AUCMeter()
    for query in data.keys():
        positives = set(data[query][0])
        candidates = data[query][1]
        embeddings = [pad(merge_title_and_body(corpus[query]), len(word_to_index))]
        targets = []
        for candidate in candidates:
            embeddings.append(
                pad(merge_title_and_body(corpus[candidate]), len(word_to_index)))
            targets.append(
                IS_SIMMILAR_LABEL if candidate in positives else NOT_SIMMILAR_LABEL)
        embeddings = Variable(torch.from_numpy(np.array(embeddings)))
        targets = torch.from_numpy(np.array(targets))
        if cuda:
            embeddings = embeddings.cuda()
        encodings = model(embeddings)
        query_encoding = encodings[0]
        candidate_encodings = encodings[1:]
        similarities = F.cosine_similarity(
            candidate_encodings,
            query_encoding.repeat(len(encodings) - 1, 1),
            dim=1)
        auc.add(similarities.data, targets)
    return auc.value(MAXIMUM_FALSE_POSITIVE_RATIO)
def encrypt_cbc(msg, key, iv):
    # CBC built on ECB: XOR each plaintext block with the previous
    # ciphertext block (the IV for the first block) before encrypting.
    cipher = AES.new(key, AES.MODE_ECB)
    msg = pad(msg, 16)
    msg = str2hex(msg)
    blocks = [msg[i:i + 32] for i in range(0, len(msg), 32)]
    encrypted = ''
    en = str2hex(iv)
    for block in blocks:
        ip = xor(en, block)
        en = cipher.encrypt(ip.decode('hex')).encode('hex')  # Python 2 hex codecs
        encrypted += en
    return encrypted
def heisenberg_energy(model, states):
    """Compute local energies of the antiferromagnetic Heisenberg model.

    Parameters
    ----------
    states : Tensor of shape (N, system_size)

    Returns
    -------
    energies : tensor of shape (N,)
    """
    FULL_WINDOW_SHAPE = (K * 2 - 1 + 2,) * N_DIMS
    FULL_WINDOW_SIZE = np.prod(FULL_WINDOW_SHAPE)
    HALF_WINDOW_SHAPE = (K + 2,) * N_DIMS
    HALF_WINDOW_SIZE = np.prod(HALF_WINDOW_SHAPE)

    batch_size = tf.shape(states)[0]
    states_shaped = tf.reshape(states, (batch_size,) + SYSTEM_SHAPE)
    states_padded = pad(states_shaped, SYSTEM_SHAPE, [(K - 1) // 2] * N_DIMS)
    factors = tf.reshape(model.factors(states_padded), (batch_size, -1))
    factor_windows = all_windows(factors, SYSTEM_SHAPE, HALF_WINDOW_SHAPE)
    spin_windows = all_windows(states, SYSTEM_SHAPE, FULL_WINDOW_SHAPE)

    # One flipper per lattice direction: flip the centre spin and its
    # neighbour along dimension d.
    flippers = np.zeros((N_DIMS, FULL_WINDOW_SIZE), dtype=np.int32)
    for d in range(N_DIMS):
        flipper = np.ones(FULL_WINDOW_SHAPE, dtype=np.int32)
        halfpoint = tuple((s - 1) // 2 for s in FULL_WINDOW_SHAPE)
        neighbour = tuple(x + 1 if i == d else x
                          for i, x in enumerate(halfpoint))
        flipper[halfpoint] = -1
        flipper[neighbour] = -1
        flippers[d, :] = flipper.flatten()

    # [N, flip_direction=1..N_DIMS, spin_window=1..SYSTEM_SIZE, window_size]
    spins_flipped = spin_windows[:, None, :, :] * flippers[None, :, None, :]
    factors_flipped = tf.reshape(
        model.factors(tf.reshape(
            spins_flipped,
            (batch_size * N_DIMS * NUM_SPINS,) + FULL_WINDOW_SHAPE)),
        (batch_size, N_DIMS, NUM_SPINS, HALF_WINDOW_SIZE))
    log_pop = tf.reduce_sum(factors_flipped - factor_windows[:, None, :, :], 3)

    ints = tf.cast(interactions(states, SYSTEM_SHAPE), tf.complex64)
    terms = -(1 - ints) * tf.exp(log_pop) + ints
    energy = tf.reduce_sum(terms, [1, 2])
    return energy / NUM_SPINS
def encrypt(message, key, IV=None):
    if (message is None) or (len(message) == 0):
        raise ValueError('message cannot be null or empty')
    if IV is None:
        IV = generateIV(blockSize)
    cipherText = bytes(IV)
    paddedMessage = pad(message, blockSize)
    blocks = chunkMessage(paddedMessage, blockSize)
    for block in blocks:
        # update the IV to be the newly encrypted ciphertext.
        IV = encryptBlock(block, key, IV)
        cipherText += IV
    return cipherText
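# A hedged usage sketch of the CBC `encrypt` above. It assumes
# `blockSize = 16` and that the helpers `generateIV`, `pad`, `chunkMessage`,
# and `encryptBlock` exist as used in the function; the key below is a toy
# value for illustration only.
key = b'0123456789abcdef'                 # 16-byte toy key, not for real use
ciphertext = encrypt(b'attack at dawn', key)
# The first blockSize bytes of `ciphertext` are the IV, so a matching
# decrypt routine strips them off before reversing the chaining.
assert len(ciphertext) % 16 == 0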
def train(self, data, labels, epochs=1, record_epochs=False, validation_set=None):
    """
    Runs the averaged version of the perceptron algorithm on the given data.

    :param data: numpy array of data points to be used for training. The
        array is padded with a 1's column here to ensure a bias weight is
        included.
    :param labels: numpy array specifying the labels {-1, 1}
    :return: None
    """
    # Pad the data with an all ones vector.
    p_data = pad(data)

    # Initialize the weights and average weights.
    self.weights = np.random.uniform(low=-0.01, high=0.01, size=p_data.shape[1])
    self.average_weights = self.weights

    for epoch in range(epochs):
        # Go through each data point.
        for x, y in zip(*shuffle(p_data, labels)):
            # If (w^t*x + b)*y < 0 make an update.
            if np.dot(self.weights, x) * y < 0:
                # Update the weights.
                self.weights = self.weights + self.learning_rate * y * x
                # Record update count.
                self.update_count += 1
            # Accumulate the average weights even if no misprediction happens.
            self.average_weights = self.average_weights + self.weights

        # Record epoch-specific information if specified.
        if record_epochs:
            val_x, val_y = validation_set[0], validation_set[1]
            # Temporarily swap in the averaged weights so predict uses them.
            # Divide by the number of examples seen so far.
            temp_weights = self.weights
            self.weights = self.average_weights / (len(data) * (epoch + 1))
            self.epoch_records[epoch + 1] = {
                'accuracy': accuracy(self.predict(val_x), val_y),
                'weights': self.weights,
            }
            # Swap back to resume normal algorithm operation.
            self.weights = temp_weights

    # Divide by the total number of examples seen.
    self.average_weights = self.average_weights / (len(data) * epochs)
    # Finally, use the averaged weights for all future predictions.
    self.weights = self.average_weights
def encrypt_cbc(msg, key, iv='\x00' * 16, is_base64=False):
    cipher = AES.new(key, AES.MODE_ECB)
    if is_base64:
        msg = base64.b64decode(msg)
    msg = pad(msg, 16)
    blocks = [msg[i:i + 16] for i in range(0, len(msg), 16)]
    encrypted = ''
    en = iv
    for block in blocks:
        ip = xor(en, block)
        en = cipher.encrypt(ip)
        encrypted += en
    return encrypted
def just_encrypt(msg):
    key = random_bytes(16, 16)
    padchars = (random_bytes(5, 10), random_bytes(5, 10))
    msg = padchars[0] + msg + padchars[1]
    msg = pad(msg, 16)
    choice = random.randint(0, 1)
    if choice:
        encrypted = encrypt_ecb(msg, key)
        mode = 'ECB'
    else:
        iv = random_bytes(16, 16)
        encrypted = encrypt_cbc(msg, key, iv)
        mode = 'CBC'
    return (mode, key, encrypted)
def predict(self, data):
    """
    Makes a prediction for each data point.

    :param data: numpy array of data points to make predictions for. A 1's
        column is appended here so that the bias weight is taken into
        consideration; the number of columns after padding must match the
        number of weights.
    :return: numpy array containing a prediction for each data point given
        in the data array.
    """
    # Pad the data with an all ones vector.
    data = pad(data)

    # For each data point make a prediction and store it.
    preds = np.ndarray((len(data), 1))
    for ix, x in enumerate(data):
        preds[ix] = np.sign(np.dot(self.weights, x))
    return preds
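# A short usage sketch tying `train` and `predict` together. The class name
# `Perceptron` and its construction are hypothetical; only the two methods
# above are taken from the source.
import numpy as np

X = np.array([[1.0, 2.0], [2.0, 1.0], [-1.0, -2.0], [-2.0, -1.0]])
y = np.array([1, 1, -1, -1])

clf = Perceptron()            # assumed class exposing train/predict above
clf.train(X, y, epochs=10)
print(clf.predict(X))         # column vector of signs in {-1, 0, 1}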
def train(self, data, labels, epochs=1, record_epochs=False, validation_set=None):
    """
    Runs the decaying-learning-rate version of the perceptron algorithm on
    the given data.

    :param data: numpy array of data points to be used for training. The
        array is padded with a 1's column here to ensure a bias weight is
        included.
    :param labels: numpy array specifying the labels {-1, 1}
    :param epochs: number of epochs to run.
    :param record_epochs: if set to True, records weights and accuracy
        after each epoch.
    :return: None
    """
    # Pad the data with an all ones vector.
    p_data = pad(data)

    # Initialize the weights.
    self.weights = np.random.uniform(low=-0.01, high=0.01, size=p_data.shape[1])

    # Decaying learning rate counter.
    t = 0
    for epoch in range(epochs):
        # Go through each data point.
        for x, y in zip(*shuffle(p_data, labels)):
            # If (w^t*x + b)*y < 0 make an update.
            if np.dot(self.weights, x) * y < 0:
                # Calculate the decayed learning rate.
                decayed_learning_rate = self.learning_rate / (1 + t)
                # Update the weights.
                self.weights = self.weights + (decayed_learning_rate * y * x)
                # Record update count.
                self.update_count += 1
            # Increment t after each example, not just mispredictions.
            t += 1

        # Record epoch-specific information if specified.
        if record_epochs:
            val_x, val_y = validation_set[0], validation_set[1]
            self.epoch_records[epoch + 1] = {
                'accuracy': accuracy(self.predict(val_x), val_y),
                'weights': self.weights,
            }
def optimize_op(sampler, model, energy_fn):
    """Perform an optimization iteration.

    Returns
    -------
    energies : tensor of shape (N,)
        Energy of MCMC samples
    train_op : Optimization tensorflow op
    """
    with tf.device('/cpu:0'):
        samples = tf.stop_gradient(sampler.mcmc_op())
        energies = energy_fn(samples)
        energies = tf.stop_gradient(energies)
    with tf.device('/gpu:0'):
        samples_shaped = tf.reshape(samples, (NUM_SAMPLES,) + SYSTEM_SHAPE)
        samples_padded = pad(samples_shaped, SYSTEM_SHAPE, [(K - 1) // 2] * N_DIMS)
        loss = loss_op(model.factors(samples_padded), energies)
        optimizer = tf.train.AdamOptimizer(LEARNING_RATE)
        train_op = optimizer.minimize(loss)
    return energies, train_op
def mcmc_reset(self):
    """Reset MCMC variables."""
    uniform_states = tf.random_uniform(
        (self.num_samplers,) + self.system_shape, 0, 2, dtype=tf.int32) * 2 - 1
    uniform_states_padded = pad(uniform_states, self.system_shape,
                                [(self.r - 1) // 2] * self.n_dims)
    uniform_states_flattened = tf.reshape(uniform_states_padded,
                                          (self.num_samplers, -1))
    states = tf.cond(self.new_samples,
                     lambda: uniform_states_flattened,
                     lambda: self.current_samples_var)
    factors = tf.reshape(
        self.model.factors(
            tf.reshape(states, (self.num_samplers,) + self.padded_shape)),
        (self.num_samplers, -1))
    return tf.group(
        tf.assign(self.current_samples_var, states),
        tf.assign(self.current_factors_var, factors),
        tf.assign(self.samples_var,
                  tf.zeros_like(self.samples_var, dtype=tf.int32)),
        tf.assign(
            self.flip_positions_var,
            tf.random_uniform(
                [self.sample_its, self.num_samplers, self.num_flips],
                0, self.num_spins, dtype=tf.int32)),
        tf.assign(
            self.accept_sample_var,
            tf.random_uniform([self.sample_its, self.num_samplers],
                              0., 1., dtype=tf.float32)),
    )
def handle_requests(self, data, client_address):
    usr_hash, length, start_from, finish_at = helpers.get_request_data(data)
    user_word, ack = helpers.scan_and_compare(start_from, finish_at, usr_hash)
    if user_word:
        # Found a matching word: send it back, padded to the declared length.
        self.udp_socket.sendto(
            TEAM_NAME + ack + usr_hash + bytes([length]) +
            user_word.encode() + helpers.pad(length).encode(),
            client_address)
    else:
        # No match: reply with a zero length and no word.
        self.udp_socket.sendto(TEAM_NAME + ack + usr_hash + bytes([0]),
                               client_address)
import tensorflow as tf
from tensorflow import keras
from helpers import encode_review, decode_review, pad
import numpy as np

# Train and test datasets
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))

# Pad every review to a common length
train_data = pad(train_data)
test_data = pad(test_data)

# Build the model
vocab_size = 10000

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.summary()
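# The script above stops at `model.summary()`. A minimal continuation for
# actually training and evaluating it, using the standard Keras API; the
# epoch count and batch size are illustrative, and `pad` is assumed to
# return rectangular arrays that Keras can consume.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(train_data, train_labels,
                    epochs=10, batch_size=512,
                    validation_split=0.2)

results = model.evaluate(test_data, test_labels)
print("test loss, test accuracy:", results)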
def convert_to_features(example_batch):
    max_length = self.tokenizer.model_max_length

    articles = example_batch[self.hparams.data_example_column]

    articles_encoded_step = []
    for idx, article in enumerate(articles):
        article = article.strip()
        try:
            article_encoded = self.tokenizer(
                article,
                padding="max_length",
                truncation=True,
            )
            articles_encoded_step.append(article_encoded)
        except Exception:  # skipcq: FLK-E722
            print("Failed to tokenize article: {}".format(article))
            sys.exit(1)

        if idx != 0:
            current_length = len(article_encoded["input_ids"])
            first_length = len(articles_encoded_step[0]["input_ids"])
            assert current_length == first_length, (
                "The length of the current input, {}, does not match the "
                "length of the first input, {}.".format(
                    current_length, first_length)
            )

    articles_encoded = {
        "input_ids": [i["input_ids"] for i in articles_encoded_step],
        "attention_mask": [i["attention_mask"] for i in articles_encoded_step],
    }

    # articles_encoded = self.tokenizer.batch_encode_plus(
    #     articles, pad_to_max_length=True, truncation=True,
    # )

    highlights = example_batch[self.hparams.data_summarized_column]

    # Tokenize highlights using spacy to split them into sentences if they
    # were not already split in the dataset (use `hparams.split_char` to
    # specify the sentence boundary character)
    if not self.hparams.split_char:
        highlights = tokenize(spacy_nlp, highlights, disable_progress_bar=True)

    sep_token = self.tokenizer.sep_token
    highlights_input_ids = []
    highlights_attention_masks = []

    # For each ground-truth summary
    for highlight in highlights:
        if self.hparams.split_char:
            # simply split into sentences if `hparams.split_char` is specified
            sents = highlight.split(self.hparams.split_char)
        else:
            # `highlight` is a list of sentences where each sentence is a
            # list of tokens. Combine those tokens to create a list of
            # sentences.
            sents = [" ".join(list_of_ids) for list_of_ids in highlight]

        assert type(sents) is list
        assert len(sents) > 0

        # Tokenize each sentence and append the `sep_token`
        sents_tokenized = []
        for sent in sents:
            assert type(sent) is str
            assert len(sent) > 0
            sent = self.tokenizer.tokenize(sent)
            sent.append(sep_token)
            sents_tokenized.append(sent)

        # Delete the last `sep_token` from the last sentence
        assert type(sents_tokenized[-1][-1]) is str
        del sents_tokenized[-1][-1]

        # Flatten `sents_tokenized` (a list of sentences where each sentence
        # is a list of tokens) to a list of tokens
        sents_tokenized_flat = list(itertools.chain.from_iterable(sents_tokenized))
        assert type(sents_tokenized_flat[0]) is str
        assert len(sents_tokenized_flat) > 0

        # Convert the tokens to `input_ids`. `max_length` is the max length
        # minus 2 because we need to add the beginning and ending tokens to
        # the target.
        sents_input_ids = self.tokenizer.encode_plus(
            sents_tokenized_flat,
            truncation=True,
            is_split_into_words=True,
            add_special_tokens=False,
            max_length=(max_length - 2),
            return_attention_mask=False,
            return_token_type_ids=False,
        )["input_ids"]

        # Insert beginning of sequence token and append end of sequence token.
        sents_input_ids.insert(0, self.target_boseq_token_id)
        sents_input_ids.append(self.target_eoseq_token_id)

        # Create attention mask
        attention_mask = [1] * len(sents_input_ids)

        # Append the `input_ids` and `attention_mask`
        highlights_input_ids.append(sents_input_ids)
        highlights_attention_masks.append(attention_mask)

    # Pad the highlight input ids and attention masks to `tokenizer.max_len`.
    # The articles have already been padded because they do not need the
    # extra `boseq` and `eoseq` tokens.
    highlights_input_ids = pad(
        highlights_input_ids,
        self.tokenizer.pad_token_id,
        width=max_length,
    )
    highlights_attention_masks = pad(highlights_attention_masks, 0,
                                     width=max_length)

    return {
        "source": articles_encoded["input_ids"],
        "target": highlights_input_ids,
        "source_mask": articles_encoded["attention_mask"],
        "target_mask": highlights_attention_masks,
    }
def send_offer_message(self, client_address):
    self.udp_socket.sendto(
        TEAM_NAME + OFFER + (helpers.pad(40)).encode() + bytes([0]),
        client_address)
def pad_batch_collate(batch, modifier=None):
    r"""
    Collate function to be passed to ``DataLoaders``. PyTorch Docs:
    `https://pytorch.org/docs/stable/data.html#dataloader-collate-fn
    <https://pytorch.org/docs/stable/data.html#dataloader-collate-fn>`__

    Calculates padding (per batch for efficiency) of ``labels`` and
    ``token_type_ids`` if they exist within the batch from the ``Dataset``.
    Also, pads ``sent_rep_token_ids`` and creates the ``sent_rep_mask`` to
    indicate which numbers in the ``sent_rep_token_ids`` list are actually
    the locations of sentence representation ids and which are padding.
    Finally, calculates the ``attention_mask`` for each set of ``input_ids``
    and pads both the ``attention_mask`` and the ``input_ids``. Converts all
    inputs to tensors.

    If ``sent_lengths`` are found then they will also automatically be
    padded. However, the padding for sentence lengths is complicated. Each
    list of sentence lengths needs to be the length of the longest list of
    sentence lengths and the sum of all the lengths in each list needs to
    add to the length of the input_ids width (the length of each input_id).
    The second requirement exists because ``torch.split()`` (which is used
    in the ``mean_tokens`` pooling algorithm to convert word vectors to
    sentence embeddings in ``pooling.py``) will split a tensor into the
    lengths requested but will error instead of returning any extra.
    However, ``torch.split()`` will split a tensor into zero length
    segments. Thus, to solve this, zeros are added to each sentence length
    list for each example until one more padding value is needed to get the
    maximum number of sentences. Once only one more value is needed, the
    total value needed to reach the width of the ``input_ids`` is added.

    ``source`` and ``target``, if present, are simply passed on without any
    processing. Therefore, the standard ``collate_fn`` function for
    ``DataLoader``\ s will not work if these are present since they cannot
    be converted to tensors without padding. This ``collate_fn`` must be
    used if ``source`` or ``target`` is present in the loaded dataset.

    The ``modifier`` argument accepts a function that takes the
    ``final_dictionary`` and returns a modified ``final_dictionary``. The
    ``modifier`` function will be called directly before
    ``final_dictionary`` is returned in :meth:`~data.pad_batch_collate`.
    This allows for easy extendability.
    """
    elem = batch[0]
    final_dictionary = {}

    for key in elem:
        # don't process `sent_lengths`
        if key == "sent_lengths":
            continue

        feature_list = [d[key] for d in batch]
        if key == "sent_rep_token_ids":
            feature_list = pad(feature_list, -1)
            sent_rep_token_ids = torch.tensor(feature_list)

            sent_rep_mask = ~(sent_rep_token_ids == -1)
            sent_rep_token_ids[sent_rep_token_ids == -1] = 0

            final_dictionary["sent_rep_token_ids"] = sent_rep_token_ids
            final_dictionary["sent_rep_mask"] = sent_rep_mask
            continue  # go to next key

        if key == "input_ids":
            input_ids = feature_list

            # Attention mask: 1 for real tokens and 0 for padding tokens.
            # Only real tokens are attended to.
            attention_mask = [[1] * len(ids) for ids in input_ids]

            input_ids_width = max(len(ids) for ids in input_ids)
            input_ids = pad(input_ids, 0, width=input_ids_width)
            input_ids = torch.tensor(input_ids)
            attention_mask = pad(attention_mask, 0)
            attention_mask = torch.tensor(attention_mask)

            if "sent_lengths" in elem:
                sent_lengths = []
                sent_lengths_mask = []
                sent_lengths_width = max(len(d["sent_lengths"]) + 1 for d in batch)
                for d in batch:
                    current_sent_lens = d["sent_lengths"]
                    current_sent_lengths_mask = [True] * len(current_sent_lens)
                    num_to_add = sent_lengths_width - len(current_sent_lens)
                    total_value_to_add = input_ids_width - sum(current_sent_lens)
                    while num_to_add > 1:
                        num_to_add -= 1
                        # total_value_to_add -= 1
                        current_sent_lens.append(0)
                        current_sent_lengths_mask.append(False)
                    # If a value needs to be added to make
                    # `sum(current_sent_lens)` the total input sequence
                    # length OR there is one more number to add (this can
                    # happen if the input sequence exactly ends with a
                    # sentence, making the total of the lengths the length
                    # of the sequence, or if there is one sentence that
                    # takes up the entire sequence).
                    if total_value_to_add > 0 or num_to_add == 1:
                        current_sent_lens.append(total_value_to_add)
                        current_sent_lengths_mask.append(False)

                    sent_lengths.append(current_sent_lens)
                    sent_lengths_mask.append(current_sent_lengths_mask)

                final_dictionary["sent_lengths"] = sent_lengths
                final_dictionary["sent_lengths_mask"] = torch.tensor(sent_lengths_mask)

            final_dictionary["input_ids"] = input_ids
            final_dictionary["attention_mask"] = attention_mask
            continue

        if key in ("source", "target"):
            final_dictionary[key] = feature_list
            continue

        if key in ("labels", "token_type_ids"):
            feature_list = pad(feature_list, 0)
            feature_list = torch.tensor(feature_list)
            final_dictionary[key] = feature_list

    if modifier:
        final_dictionary = modifier(final_dictionary)

    return final_dictionary
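# A hedged usage sketch showing how `pad_batch_collate` might be wired into
# a PyTorch `DataLoader`. `train_dataset` is hypothetical: any dataset
# yielding dicts with the keys handled above (`input_ids`, `labels`,
# optionally `sent_lengths`, `source`/`target`, ...).
from torch.utils.data import DataLoader

loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=pad_batch_collate,  # pads per batch instead of per dataset
)

for batch in loader:
    input_ids = batch["input_ids"]            # (batch, padded_seq_len)
    attention_mask = batch["attention_mask"]  # 1 = real token, 0 = padding
    break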
def get_features(
    self,
    tokenizer,
    bert_compatible_cls=True,
    create_sent_rep_token_ids=True,
    sent_rep_token_id=None,
    create_sent_lengths=True,
    create_segment_ids="binary",
    segment_token_id=None,
    create_source=False,
    n_process=2,
    max_length=None,
    pad_on_left=False,
    pad_token=0,
    mask_padding_with_zero=True,
    create_attention_mask=True,
    pad_ids_and_attention=True,
    return_type=None,
    save_to_path=None,
    save_to_name=None,
    save_as_type="txt",
):
    r"""Convert the examples stored by the ``SentencesProcessor`` to features
    that can be used by a model. The following processes can be performed:
    tokenization, token type ids (to separate sentences), sentence
    representation token ids (the locations of each sentence representation
    token), sentence lengths, and the attention mask. Padding can be applied
    to the tokenized examples and the attention masks but it is recommended
    to instead use the :meth:`data.pad_batch_collate` function so each batch
    is padded individually for efficiency (less zeros passed through model).

    Arguments:
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer used to
            tokenize the examples.
        bert_compatible_cls (bool, optional): Adds '[CLS]' tokens in front of
            each sentence. This is useful so that the '[CLS]' token can be
            used to obtain sentence embeddings. This only works if the chosen
            model has the '[CLS]' token in its vocabulary. Default is True.
        create_sent_rep_token_ids (bool, optional): Option to create sentence
            representation token ids. This will store a list of the indexes
            of all the ``sent_rep_token_id``\ s in the tokenized example.
            Default is True.
        sent_rep_token_id ([type], optional): The token id that should be
            captured for each sentence (should have one per sentence and each
            should represent that sentence). Default is
            ``'[CLS]' token if bert_compatible_cls else '[SEP]' token``.
        create_sent_lengths (bool, optional): Option to create a list of
            sentence lengths where each index in the list corresponds to the
            respective sentence in the example. Default is True.
        create_segment_ids (str, optional): Option to create segment ids (aka
            token type ids). See
            https://huggingface.co/transformers/glossary.html#token-type-ids
            for more info. Set to either "binary", "sequential", or False.

            * ``binary`` alternates between 0 and 1 for each sentence.
            * ``sequential`` starts at 0 and increments by 1 for each
              sentence.
            * ``False`` does not create any segment ids.

            Note: Many pretrained models that accept token type ids use them
            for question answering and related tasks where the model receives
            two inputs. Therefore, most models have a token type id
            vocabulary size of 2, which means they only have learned 2 token
            type ids. The "binary" mode exists so that these pretrained
            models can easily be used. Default is "binary".
        segment_token_id (str, optional): The token id to be used when
            creating segment ids. Can be set to 'period' to treat periods as
            sentence separation tokens, but this is a terrible idea for
            obvious reasons. Default is '[SEP]' token id.
        create_source (bool, optional): Option to save the source text
            (non-tokenized) as a string. Default is False.
        n_process (int, optional): How many processes to use for
            multithreading for running get_features_process(). Set higher to
            run faster and set lower if you experience OOM issues. Default
            is 2.
        max_length (int, optional): If ``pad_ids_and_attention`` is True then
            pad to this amount. Default is ``tokenizer.max_len``.
        pad_on_left (bool, optional): Optionally, pad on the left instead of
            right. Default is False.
        pad_token (int, optional): Which token to use for padding the
            ``input_ids``. Default is 0.
        mask_padding_with_zero (bool, optional): Use zeros to pad the
            attention. Uses ones otherwise. Default is True.
        create_attention_mask (bool, optional): Option to create the
            attention mask. It is recommended to use the
            :meth:`data.pad_batch_collate` function, which will automatically
            create attention masks and pad them on a per batch level. Default
            is ``False if return_type == "lists" else True``.
        pad_ids_and_attention (bool, optional): Pad the ``input_ids`` with
            ``pad_token`` and attention masks with 0s or 1s depending on
            ``mask_padding_with_zero``. Pad both to ``max_length``. Default
            is ``False if return_type == "lists" else True``.
        return_type (str, optional): Either "tensors", "lists", or None. See
            "Returns" section below. Default is None.
        save_to_path (str, optional): The folder/directory to save the data
            to OR None to not save. Will save the data specified by
            ``return_type`` to disk. Default is None.
        save_to_name (str, optional): The name of the file to save. The file
            extension (based on ``save_as_type``) is automatically appended.
            Default is ``'dataset_' + self.name``.
        save_as_type (str, optional): The file extension of the saved file if
            ``save_to_path`` is set. Supports "pt" (PyTorch) and "txt"
            (Text). Saving as "txt" requires the ``return_type`` to be
            ``lists``. If ``return_type`` is ``tensors`` the only
            ``save_as_type`` available is "pt". Defaults to "txt".

    Returns:
        list or torch.TensorDataset: If ``return_type is None`` return the
        list of calculated features. If ``return_type == "tensors"`` return
        the features converted to tensors and stacked such that features are
        grouped together into individual tensors. If
        ``return_type == "lists"``, which is the recommended option, then
        exports each ``InputFeatures`` object in the exported ``features``
        list as a dictionary and appends each dictionary to a list. Returns
        that list.
    """
    assert return_type in ["tensors", "lists"] or return_type is None
    assert save_as_type in ["txt", "pt"] or save_to_path is None
    if save_as_type == "txt":
        assert return_type == "lists"
    if return_type == "tensors":
        assert save_as_type == "pt" or save_to_path is None

    if return_type == "lists":
        create_attention_mask = False
        pad_ids_and_attention = False
    else:  # if `return_type` is None or "tensors"
        create_attention_mask = True
        pad_ids_and_attention = True

    if max_length is None:
        max_length = tokenizer.model_max_length

    # batch_length = max(len(input_ids) for input_ids in all_input_ids)

    if create_sent_rep_token_ids:
        if sent_rep_token_id == "sep":  # get the sep token id
            sent_rep_token_id = tokenizer.sep_token_id
        elif sent_rep_token_id == "cls":  # get the cls token id
            sent_rep_token_id = tokenizer.cls_token_id
        elif not sent_rep_token_id:  # if the `sent_rep_token_id` is not set
            # if using `bert_compatible_cls` then default to the `cls_token_id`
            if bert_compatible_cls:
                sent_rep_token_id = tokenizer.cls_token_id
            else:  # otherwise, get the `sep_token_id`
                sent_rep_token_id = tokenizer.sep_token_id

    if create_segment_ids:
        if segment_token_id == "period":  # get the token id for a "."
            segment_token_id = tokenizer.convert_tokens_to_ids(["."])[0]
        elif not segment_token_id:
            # default to trying to get the `sep_token_id` if the
            # `segment_token_id` is not set
            segment_token_id = tokenizer.sep_token_id

    features = []
    pool = Pool(n_process)
    _get_features_process = partial(
        self.get_features_process,
        num_examples=len(self.labels),
        tokenizer=tokenizer,
        bert_compatible_cls=bert_compatible_cls,
        sep_token=tokenizer.sep_token,
        cls_token=tokenizer.cls_token,
        create_sent_rep_token_ids=create_sent_rep_token_ids,
        sent_rep_token_id=sent_rep_token_id,
        create_sent_lengths=create_sent_lengths,
        create_segment_ids=create_segment_ids,
        segment_token_id=segment_token_id,
        create_source=create_source,
        max_length=max_length,
        pad_on_left=pad_on_left,
        pad_token=pad_token,
        mask_padding_with_zero=mask_padding_with_zero,
        create_attention_mask=create_attention_mask,
        pad_ids_and_attention=pad_ids_and_attention,
    )

    for rtn_features in pool.map(
        _get_features_process,
        zip(range(len(self.labels)), self.examples, self.labels),
    ):
        features.append(rtn_features)

    pool.close()
    pool.join()

    if not return_type:
        return features
    elif return_type == "tensors":
        final_tensors = []

        all_input_ids = torch.tensor(
            [f.input_ids for f in features], dtype=torch.long
        )
        final_tensors.append(all_input_ids)
        all_attention_masks = torch.tensor(
            [f.attention_mask for f in features], dtype=torch.long
        )
        final_tensors.append(all_attention_masks)
        all_labels = torch.tensor(
            pad([f.labels for f in features], 0), dtype=torch.long
        )
        final_tensors.append(all_labels)

        if create_segment_ids:
            all_token_type_ids = torch.tensor(
                pad([f.token_type_ids for f in features], 0), dtype=torch.long
            )
            final_tensors.append(all_token_type_ids)

        # Pad sentence representation token ids (`sent_rep_token_ids`)
        if create_sent_rep_token_ids:
            all_sent_rep_token_ids = torch.tensor(
                pad([f.sent_rep_token_ids for f in features], -1),
                dtype=torch.long,
            )
            all_sent_rep_token_ids_masks = ~(all_sent_rep_token_ids == -1)
            all_sent_rep_token_ids[all_sent_rep_token_ids == -1] = 0
            final_tensors.append(all_sent_rep_token_ids)
            final_tensors.append(all_sent_rep_token_ids_masks)

        if create_sent_lengths:
            all_sent_lengths = torch.tensor(
                pad([f.sent_lengths for f in features], 0), dtype=torch.long
            )
            final_tensors.append(all_sent_lengths)

        dataset = torch.utils.data.TensorDataset(*final_tensors)

    elif return_type == "lists":
        dataset = [example.to_dict() for example in features]

    if save_to_path:
        final_save_name = save_to_name if save_to_name else ("dataset_" + self.name)
        dataset_path = os.path.join(
            save_to_path, (final_save_name + "." + save_as_type)
        )
        logger.info("Saving dataset into cached file %s", dataset_path)
        if save_as_type == "txt":
            with open(dataset_path, "w+") as file:
                # Need to replace single with double quotes so it can be
                # loaded as JSON
                file.write("\n".join([json.dumps(x) for x in dataset]) + "\n")
        elif save_as_type == "pt":
            torch.save(dataset, dataset_path)
        else:
            logger.error("'%s' is an invalid save type.", save_as_type)

    return dataset
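# A hedged usage sketch of `get_features`. The processor construction is
# hypothetical (an already-populated `SentencesProcessor` instance named
# `processor`); only the method signature above is taken from the source.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

dataset = processor.get_features(
    tokenizer,
    return_type="lists",       # recommended; pairs with `pad_batch_collate`
    save_to_path="./data",
    save_as_type="txt",
)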
def start_activity(self):
    self.udp_socket.sendto(
        TEAM_NAME + DISCOVER + (helpers.pad(40)).encode() + bytes([0]),
        (IP_BROADCAST, SERVER_PORT))  # TODO: Missing fields
    self.wait_for_servers()