import torch.nn as nn
from dpu_utils.mlutils import Vocabulary

# get_edit_keywords and MAX_VOCAB_SIZE are defined elsewhere in this project.
def __init__(self, nl_threshold, nl_embedding_size, nl_token_counter,
             code_threshold, code_embedding_size, code_token_counter,
             dropout_rate, load_pretrained_embeddings=False):
    """Keeps track of the NL and code vocabularies and embeddings."""
    super(EmbeddingStore, self).__init__()
    edit_keywords = get_edit_keywords()
    self.__nl_vocabulary = Vocabulary.create_vocabulary(tokens=edit_keywords,
                                                        max_size=MAX_VOCAB_SIZE,
                                                        count_threshold=1,
                                                        add_pad=True)
    self.__nl_vocabulary.update(nl_token_counter, MAX_VOCAB_SIZE, nl_threshold)
    self.__nl_embedding_layer = nn.Embedding(num_embeddings=len(self.__nl_vocabulary),
                                             embedding_dim=nl_embedding_size,
                                             padding_idx=self.__nl_vocabulary.get_id_or_unk(
                                                 Vocabulary.get_pad()))
    self.nl_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

    self.__code_vocabulary = Vocabulary.create_vocabulary(tokens=edit_keywords,
                                                          max_size=MAX_VOCAB_SIZE,
                                                          count_threshold=1,
                                                          add_pad=True)
    self.__code_vocabulary.update(code_token_counter, MAX_VOCAB_SIZE, code_threshold)
    self.__code_embedding_layer = nn.Embedding(num_embeddings=len(self.__code_vocabulary),
                                               embedding_dim=code_embedding_size,
                                               padding_idx=self.__code_vocabulary.get_id_or_unk(
                                                   Vocabulary.get_pad()))
    self.code_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

    print('NL vocabulary size: {}'.format(len(self.__nl_vocabulary)))
    print('Code vocabulary size: {}'.format(len(self.__code_vocabulary)))

    if load_pretrained_embeddings:
        self.initialize_embeddings()
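# A minimal construction sketch, not from the original source: EmbeddingStore
# and its project-level dependencies are assumed importable, and the counters
# and sizes below are made-up illustrative values.
from collections import Counter

nl_counter = Counter({'returns': 12, 'the': 40, 'index': 7})    # hypothetical NL token counts
code_counter = Counter({'def': 30, 'return': 25, 'self': 50})   # hypothetical code token counts
store = EmbeddingStore(nl_threshold=1, nl_embedding_size=64, nl_token_counter=nl_counter,
                       code_threshold=1, code_embedding_size=64, code_token_counter=code_counter,
                       dropout_rate=0.2)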
def get_padded_nl_ids(self, nl_sequence, pad_length):
    return self.__nl_vocabulary.get_id_or_unk_multiple(
        nl_sequence,
        pad_to_size=pad_length,
        padding_element=self.__nl_vocabulary.get_id_or_unk(
            Vocabulary.get_pad()),
    )
def pad_length(self, sequence, target_length):
    if len(sequence) >= target_length:
        return sequence[:target_length]
    else:
        return sequence + [
            self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad())
            for _ in range(target_length - len(sequence))
        ]
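# Hedged illustration of the truncate-or-pad contract above (`store` is the
# EmbeddingStore sketched earlier; the token ids are made up):
ids = store.pad_length([5, 9, 2, 7], target_length=2)   # -> [5, 9] (truncated)
ids = store.pad_length([5, 9, 2], target_length=5)      # -> [5, 9, 2, pad, pad], where pad is the NL pad id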
def get_extended_padded_nl_ids(self, nl_sequence, pad_length, inp_ids, inp_tokens):
    # Derived from: https://github.com/microsoft/dpu-utils/blob/master/python/dpu_utils/mlutils/vocabulary.py
    nl_ids = []
    for token in nl_sequence:
        nl_id = self.get_nl_id(token)
        # Out-of-vocabulary tokens that occur in the input are mapped to the
        # input token's (extended) id so the decoder can copy them.
        if self.is_nl_unk(nl_id) and token in inp_tokens:
            copy_idx = inp_tokens.index(token)
            nl_id = inp_ids[copy_idx]
        nl_ids.append(nl_id)

    if len(nl_ids) > pad_length:
        return nl_ids[:pad_length]
    else:
        padding = [self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad())] * (pad_length - len(nl_ids))
        return nl_ids + padding
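# Copy-mechanism sketch (ids are hypothetical): 'fooBar' is assumed to be
# out-of-vocabulary, but because it occurs in inp_tokens at position 3 it is
# mapped to inp_ids[3] (207) instead of the UNK id, letting a decoder with a
# copy/pointer head refer back to the source token.
ext_ids = store.get_extended_padded_nl_ids(
    ['returns', 'fooBar'],
    pad_length=4,
    inp_ids=[11, 12, 13, 207],
    inp_tokens=['def', 'get', 'foo', 'fooBar'])
# -> [<id of 'returns'>, 207, <pad id>, <pad id>]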
Example #5
from typing import Any, Dict, List

import keras
import numpy as np

# beam_search, SENTENCE_START_TOKEN, SENTENCE_END_TOKEN, _evaluate_f1 and
# visualise_beam_predictions_to_targets are defined elsewhere in this project.
def evaluate_f1(model: keras.Model,
                vocab: Vocabulary,
                input_method_body_subtokens: np.ndarray,
                target_method_names: np.ndarray,
                hyperparameters: Dict[str, Any],
                visualise_prediction=True):
    padding_id = vocab.get_id_or_unk(vocab.get_pad())
    begin_of_sentence_id = vocab.get_id_or_unk(SENTENCE_START_TOKEN)
    end_of_sentence_id = vocab.get_id_or_unk(SENTENCE_END_TOKEN)

    if input_method_body_subtokens.ndim != 3:
        # model prediction expects 3 dimensions, a single input won't have the batch dimension, manually add it
        input_method_body_subtokens = np.expand_dims(
            input_method_body_subtokens, 0)

    predictions = model.predict(input_method_body_subtokens, batch_size=1)

    best_predictions, best_predictions_probs = beam_search(
        predictions,
        padding_id,
        begin_of_sentence_id,
        end_of_sentence_id,
        hyperparameters['beam_width'],
        hyperparameters['beam_top_paths'],
    )
    f1_evaluation = _evaluate_f1(best_predictions, best_predictions_probs,
                                 vocab, target_method_names)
    if visualise_prediction:
        max_results = 10
        visualised_input = visualise_beam_predictions_to_targets(
            vocab, best_predictions[:max_results],
            best_predictions_probs[:max_results],
            input_method_body_subtokens[:max_results],
            target_method_names[:max_results])

        # return best_predictions, best_predictions_probs
        return f1_evaluation, visualised_input
    return f1_evaluation
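# Hedged usage sketch: `model`, `vocab`, and the two evaluation arrays would
# come from the surrounding training pipeline; the beam settings are
# illustrative, not the project's defaults.
hyperparameters = {'beam_width': 5, 'beam_top_paths': 3}
f1_scores, visualisation = evaluate_f1(
    model, vocab,
    input_method_body_subtokens,
    target_method_names,
    hyperparameters,
    visualise_prediction=True)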
Example #6
def translate_tokenized_array_to_list_words(vocab: Vocabulary, token: np.ndarray) -> List[str]:
    """Helper function to translate numpy array tokens back to words."""
    pad_id = vocab.get_id_or_unk(vocab.get_pad())
    return [vocab.get_name_for_id(n) for n in token[np.nonzero(token != pad_id)]]
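# Round-trip sketch (assumes `vocab` from above; the ids are hypothetical):
# padding positions are filtered out before the id-to-word lookup.
pad = vocab.get_id_or_unk(vocab.get_pad())
arr = np.array([42, 7, pad, pad])
translate_tokenized_array_to_list_words(vocab, arr)   # -> e.g. ['get', 'name']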
def get_code_pad_id(self):
    return self.__code_vocabulary.get_id_or_unk(Vocabulary.get_pad())