def __init__(self, nl_threshold, nl_embedding_size, nl_token_counter,
             code_threshold, code_embedding_size, code_token_counter,
             dropout_rate, load_pretrained_embeddings=False):
    """Keeps track of the NL and code vocabularies and embeddings."""
    super(EmbeddingStore, self).__init__()
    edit_keywords = get_edit_keywords()

    def make_vocabulary(token_counter, count_threshold):
        # Seed the vocabulary with the edit keywords, then fold in the
        # corpus token counts under the caller-supplied threshold.
        vocabulary = Vocabulary.create_vocabulary(tokens=edit_keywords,
                                                  max_size=MAX_VOCAB_SIZE,
                                                  count_threshold=1,
                                                  add_pad=True)
        vocabulary.update(token_counter, MAX_VOCAB_SIZE, count_threshold)
        return vocabulary

    def make_embedding(vocabulary, embedding_dim):
        # One row per vocabulary entry; the pad row is zeroed/frozen by
        # nn.Embedding via padding_idx.
        pad_id = vocabulary.get_id_or_unk(Vocabulary.get_pad())
        return nn.Embedding(num_embeddings=len(vocabulary),
                            embedding_dim=embedding_dim,
                            padding_idx=pad_id)

    self.__nl_vocabulary = make_vocabulary(nl_token_counter, nl_threshold)
    self.__nl_embedding_layer = make_embedding(self.__nl_vocabulary, nl_embedding_size)
    self.nl_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

    self.__code_vocabulary = make_vocabulary(code_token_counter, code_threshold)
    self.__code_embedding_layer = make_embedding(self.__code_vocabulary, code_embedding_size)
    self.code_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

    print('NL vocabulary size: {}'.format(len(self.__nl_vocabulary)))
    print('Code vocabulary size: {}'.format(len(self.__code_vocabulary)))

    if load_pretrained_embeddings:
        self.initialize_embeddings()
def get_padded_nl_ids(self, nl_sequence, pad_length):
    """Map an NL token sequence to vocabulary ids, padded/truncated to pad_length."""
    pad_id = self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad())
    return self.__nl_vocabulary.get_id_or_unk_multiple(nl_sequence,
                                                       pad_to_size=pad_length,
                                                       padding_element=pad_id)
def pad_length(self, sequence, target_length):
    """Truncate or right-pad a sequence of NL ids to exactly target_length.

    Returns sequence[:target_length] when the sequence is long enough;
    otherwise appends the NL pad id until the length matches.
    """
    if len(sequence) >= target_length:
        return sequence[:target_length]
    # Look up the pad id once, instead of once per padding element as the
    # previous per-iteration comprehension did; list multiplication then
    # builds the padding in a single step.
    pad_id = self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad())
    return sequence + [pad_id] * (target_length - len(sequence))
def get_extended_padded_nl_ids(self, nl_sequence, pad_length, inp_ids, inp_tokens):
    # Derived from: https://github.com/microsoft/dpu-utils/blob/master/python/dpu_utils/mlutils/vocabulary.py
    """Convert NL tokens to ids using a copy-extended vocabulary.

    A token that is UNK in the NL vocabulary but present in inp_tokens is
    mapped to the matching id from inp_ids (the copy mechanism); the result
    is truncated, or right-padded with the pad id, to pad_length.
    """
    extended_ids = []
    for token in nl_sequence:
        token_id = self.get_nl_id(token)
        if self.is_nl_unk(token_id) and token in inp_tokens:
            # Copy: take the id of the first occurrence in the input.
            token_id = inp_ids[inp_tokens.index(token)]
        extended_ids.append(token_id)

    if len(extended_ids) > pad_length:
        return extended_ids[:pad_length]
    pad_id = self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad())
    return extended_ids + [pad_id] * (pad_length - len(extended_ids))
def evaluate_f1(model: keras.Model,
                vocab: Vocabulary,
                input_method_body_subtokens: np.ndarray,
                target_method_names: np.ndarray,
                hyperparameters: Dict[str, object],
                visualise_prediction=True):
    """Predict method names with the model and score them against targets with F1.

    Runs beam search over the model's output distribution (width and number of
    kept paths come from hyperparameters['beam_width'] and
    hyperparameters['beam_top_paths']) and computes F1 via _evaluate_f1.

    NOTE(review): the value annotation was the builtin ``any`` (a function,
    not a type); replaced with ``object`` so the signature is a valid type
    hint without requiring a new ``typing`` import.

    Returns (f1_evaluation, visualised_input) when visualise_prediction is
    truthy, otherwise just f1_evaluation.
    """
    padding_id = vocab.get_id_or_unk(vocab.get_pad())
    begin_of_sentence_id = vocab.get_id_or_unk(SENTENCE_START_TOKEN)
    end_of_sentence_id = vocab.get_id_or_unk(SENTENCE_END_TOKEN)

    if input_method_body_subtokens.ndim != 3:
        # model prediction expects 3 dimensions; a single input won't have
        # the batch dimension, so manually add it.
        input_method_body_subtokens = np.expand_dims(
            input_method_body_subtokens, 0)

    predictions = model.predict(input_method_body_subtokens, batch_size=1)
    best_predictions, best_predictions_probs = beam_search(
        predictions,
        padding_id,
        begin_of_sentence_id,
        end_of_sentence_id,
        hyperparameters['beam_width'],
        hyperparameters['beam_top_paths'],
    )
    f1_evaluation = _evaluate_f1(best_predictions, best_predictions_probs,
                                 vocab, target_method_names)
    if visualise_prediction:
        # Only show a small prefix of the batch to keep the output readable.
        max_results = 10
        visualised_input = visualise_beam_predictions_to_targets(
            vocab,
            best_predictions[:max_results],
            best_predictions_probs[:max_results],
            input_method_body_subtokens[:max_results],
            target_method_names[:max_results])
        return f1_evaluation, visualised_input
    return f1_evaluation
def translate_tokenized_array_to_list_words(vocab: Vocabulary, token: np.ndarray) -> List[str]:
    """Helper function to translate numpy array tokens back to words."""
    pad_id = vocab.get_id_or_unk(vocab.get_pad())
    non_pad_tokens = token[np.nonzero(token != pad_id)]
    return [vocab.get_name_for_id(token_id) for token_id in non_pad_tokens]
def get_code_pad_id(self):
    """Return the id of the padding token in the code vocabulary."""
    pad_token = Vocabulary.get_pad()
    return self.__code_vocabulary.get_id_or_unk(pad_token)