def __init__(self, nl_threshold, nl_embedding_size, nl_token_counter,
             code_threshold, code_embedding_size, code_token_counter,
             dropout_rate, load_pretrained_embeddings=False):
    """Keeps track of the NL and code vocabularies and embeddings."""
    super(EmbeddingStore, self).__init__()
    edit_keywords = get_edit_keywords()
    self.__nl_vocabulary = Vocabulary.create_vocabulary(tokens=edit_keywords,
                                                        max_size=MAX_VOCAB_SIZE,
                                                        count_threshold=1,
                                                        add_pad=True)
    self.__nl_vocabulary.update(nl_token_counter, MAX_VOCAB_SIZE, nl_threshold)
    self.__nl_embedding_layer = nn.Embedding(
        num_embeddings=len(self.__nl_vocabulary),
        embedding_dim=nl_embedding_size,
        padding_idx=self.__nl_vocabulary.get_id_or_unk(Vocabulary.get_pad()))
    self.nl_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

    self.__code_vocabulary = Vocabulary.create_vocabulary(tokens=edit_keywords,
                                                          max_size=MAX_VOCAB_SIZE,
                                                          count_threshold=1,
                                                          add_pad=True)
    self.__code_vocabulary.update(code_token_counter, MAX_VOCAB_SIZE, code_threshold)
    self.__code_embedding_layer = nn.Embedding(
        num_embeddings=len(self.__code_vocabulary),
        embedding_dim=code_embedding_size,
        padding_idx=self.__code_vocabulary.get_id_or_unk(Vocabulary.get_pad()))
    self.code_embedding_dropout_layer = nn.Dropout(p=dropout_rate)

    print('NL vocabulary size: {}'.format(len(self.__nl_vocabulary)))
    print('Code vocabulary size: {}'.format(len(self.__code_vocabulary)))

    if load_pretrained_embeddings:
        self.initialize_embeddings()
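# --- Usage sketch (not from the source) of why `padding_idx` matters in the
# embedding layers above: nn.Embedding keeps the pad row as all zeros, so
# padded positions contribute nothing downstream. The tiny vocabulary here is
# hypothetical.
import torch
import torch.nn as nn
from collections import Counter
from dpu_utils.mlutils import Vocabulary

vocab = Vocabulary.create_vocabulary(Counter({'foo': 5, 'bar': 2}),
                                     max_size=10, count_threshold=1,
                                     add_pad=True)
pad_id = vocab.get_id_or_unk(Vocabulary.get_pad())
embedding = nn.Embedding(num_embeddings=len(vocab), embedding_dim=8,
                         padding_idx=pad_id)
assert torch.all(embedding(torch.tensor([pad_id])) == 0)  # pad row stays zero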
def finalise_metadata(name: str, raw_metadata_list: List[Dict[str, Any]],
                      final_metadata: Dict[str, Any],
                      hyperparameters: Dict[str, Any]) -> None:
    label_embedding_style = hyperparameters[f'{name}_embedding_style'].lower()
    merged_node_label_counter = Counter()
    for raw_metadata in raw_metadata_list:
        if label_embedding_style == 'token':
            merged_node_label_counter += raw_metadata[f'{name}_counter']
        elif label_embedding_style == 'subtoken':
            merged_node_label_counter += raw_metadata[f'{name}_subtoken_counter']

    def add_special_literals(vocab: Vocabulary) -> None:
        vocab.add_or_get_id(TokenEmbedder.STRING_LITERAL)
        vocab.add_or_get_id(TokenEmbedder.FLOAT_LITERAL)
        vocab.add_or_get_id(TokenEmbedder.INT_LITERAL)

    if label_embedding_style == 'token':
        # Store the token-level label vocabulary:
        final_metadata[f'{name}_vocab'] = Vocabulary.create_vocabulary(
            merged_node_label_counter,
            max_size=hyperparameters[f'{name}_vocab_size'])
        add_special_literals(final_metadata[f'{name}_vocab'])
    elif label_embedding_style == 'subtoken':
        final_metadata[f'{name}_subtoken_vocab'] = Vocabulary.create_vocabulary(
            merged_node_label_counter,
            max_size=hyperparameters[f'{name}_vocab_size'])
        add_special_literals(final_metadata[f'{name}_subtoken_vocab'])
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters,
                                               raw_metadata_list)
    merged_token_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['token_counter']

    if hyperparameters['%s_use_bpe' % encoder_label]:
        token_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
        token_vocabulary.fit(merged_token_counter)
    else:
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_token_counter,
            max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])

    final_metadata['token_vocab'] = token_vocabulary
    # Save the most common tokens for use in data augmentation:
    final_metadata['common_tokens'] = merged_token_counter.most_common(50)
    return final_metadata
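# Hypothetical hyperparameter dict for the BPE-vs-token switch above, assuming
# encoder_label == 'code'. The key names come straight from the lookups in the
# snippet; the values are invented:
hyperparameters = {
    'code_use_bpe': False,                    # take the Vocabulary branch
    'code_token_vocab_size': 10000,
    'code_pct_bpe': 0.5,                      # only read when '*_use_bpe' is True
    'code_token_vocab_count_threshold': 10,
}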
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    print("Finalising metadata")
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters,
                                               raw_metadata_list)
    merged_token_counter = collections.Counter()
    merged_edge_types = set()
    token_counts = []
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['token_counter']
        merged_edge_types = merged_edge_types.union(raw_metadata['edge_types'])
        # token_counts.extend(raw_metadata['nodes_by_tokens'])

    if hyperparameters[f'{encoder_label}_token_use_bpe']:
        token_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe'])
        token_vocabulary.fit(merged_token_counter)
        print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
        print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
    else:
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_token_counter,
            max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
        print('Total token vocabulary words:', len(token_vocabulary.id_to_token))

    final_metadata['token_vocab'] = token_vocabulary
    # NOTE: iterating a set is not order-stable across runs; sorting
    # merged_edge_types would make the edge-type ids deterministic.
    final_metadata['edge_type_mapping'] = {edge_type: i
                                           for i, edge_type in enumerate(merged_edge_types)}
    print('Edge type mapping:', final_metadata['edge_type_mapping'])
    # print("Percentiles:")
    # for p in [0, 1, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99, 99.9, 100]:
    #     print(p, np.percentile(token_counts, p))
    return final_metadata
def __create_voc_from_tokens(all_sub_tokens):
    vocabulary = Vocabulary.create_vocabulary(all_sub_tokens,
                                              max_size=100000,
                                              count_threshold=1,
                                              add_unk=True,
                                              add_pad=True)
    return vocabulary
def finalize_metadata(self) -> None:
    # Give START/END artificially large counts so they always survive the
    # frequency threshold and are guaranteed a slot in the vocabulary.
    self.__token_counter[self.START] = 1000000
    self.__token_counter[self.END] = 1000000
    self.__output_vocabulary = Vocabulary.create_vocabulary(
        self.__token_counter,
        max_size=self.vocabulary_max_size,
        count_threshold=self.vocabulary_count_threshold,
    )
    self.LOGGER.info("Output vocabulary size: %s", len(self.__output_vocabulary))
    del self.__token_counter
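# Toy illustration (hypothetical counter and marker tokens) of the
# force-include trick above: bumping a token's count far above the threshold
# guarantees it survives the frequency filter.
from collections import Counter
from dpu_utils.mlutils import Vocabulary

counter = Counter({'<s>': 1, '</s>': 1, 'common': 500})
counter['<s>'] = 1000000   # without this, count_threshold=2 would drop it
counter['</s>'] = 1000000
vocab = Vocabulary.create_vocabulary(counter, max_size=100, count_threshold=2)
assert vocab.get_id_or_unk('<s>') != vocab.get_id_or_unk(Vocabulary.get_unk())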
def _finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]],
                       final_metadata: Dict[str, Any]):
    # Merge counters
    merged_type_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_type_counter.update(raw_metadata["type_occurences_counter"])
    final_metadata['annotation_vocab'] = Vocabulary.create_vocabulary(
        merged_type_counter,
        max_size=self.__model.hyperparameters['max_type_annotation_vocab_size'])
    return final_metadata
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]],
                      language=None) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters,
                                               raw_metadata_list)
    merged_token_counter = Counter()
    print(encoder_label, language)
    if encoder_label == 'query':
        final_metadata_path = '_'.join([encoder_label, 'final_metadata'])
    else:
        assert encoder_label == 'code' and language
        final_metadata_path = '_'.join([encoder_label, language, 'final_metadata'])

    if os.path.isfile(final_metadata_path):
        # Reuse cached metadata if it was computed on a previous run.
        with open(final_metadata_path, 'rb') as final_metadata_file:
            final_metadata = pickle.load(final_metadata_file)
    else:
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']

        if hyperparameters['%s_use_bpe' % encoder_label]:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
                pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
            token_vocabulary.fit(merged_token_counter)
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
                count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])

        final_metadata['token_vocab'] = token_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata['common_tokens'] = merged_token_counter.most_common(50)
        with open(final_metadata_path, 'wb') as final_metadata_file:
            pickle.dump(final_metadata, final_metadata_file)
    return final_metadata
def _finalise_metadata(self, raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super()._finalise_metadata(raw_metadata_list)
    TokenEmbedder.finalise_metadata('leaf_label', raw_metadata_list,
                                    final_metadata, self.hyperparameters)
    # First, merge all needed information:
    merged_non_terminals = set()
    for raw_metadata in raw_metadata_list:
        merged_non_terminals.update(raw_metadata["path_elements"])
    final_metadata['non_terminal_dict'] = Vocabulary.create_vocabulary(
        merged_non_terminals, max_size=10000, count_threshold=0)
    return final_metadata
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters,
                                               raw_metadata_list)
    merged_type_counter = collections.Counter()
    for raw_metadata in raw_metadata_list:
        merged_type_counter += raw_metadata['type_counter']
    type_vocabulary = Vocabulary.create_vocabulary(
        tokens=merged_type_counter,
        max_size=hyperparameters[f'{encoder_label}_type_vocab_size'],
        count_threshold=hyperparameters[f'{encoder_label}_type_vocab_count_threshold'])
    final_metadata['type_vocab'] = type_vocabulary
    print('Total type vocabulary words:', len(final_metadata['type_vocab'].id_to_token))
    return final_metadata
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters,
                                               raw_metadata_list)
    # JGD ****** leaf_nodes start ******
    merged_identifier_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_identifier_counter += raw_metadata['identifier_counter']
    if hyperparameters['%s_use_bpe' % encoder_label]:
        identifier_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
        identifier_vocabulary.fit(merged_identifier_counter)
    else:
        identifier_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_identifier_counter,
            max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])
    final_metadata['identifier_vocab'] = identifier_vocabulary
    # Save the most common tokens for use in data augmentation:
    final_metadata['common_identifiers'] = merged_identifier_counter.most_common(50)
    # JGD ****** leaf_nodes end ******
    # JGD ****** tree_paths start ******
    # merged_context_filenames = list()
    # merged_terminal_counter = Counter()
    # merged_nonterminal_counter = Counter()
    # for raw_metadata in raw_metadata_list:
    #     merged_context_filenames.extend(raw_metadata['context_filenames'])
    #     merged_terminal_counter += raw_metadata['terminal_counter']
    #     merged_nonterminal_counter += raw_metadata['nonterminal_counter']
    #
    # final_metadata['context_filenames'] = merged_context_filenames
    # final_metadata['terminal_counter'] = merged_terminal_counter
    # final_metadata['nonterminal_counter'] = merged_nonterminal_counter
    # JGD ****** tree_paths end ******
    return final_metadata
def finalize_metadata(self) -> None:
    if self.splitting_kind in {"token", "subtoken"}:
        self.__vocabulary = Vocabulary.create_vocabulary(
            self.__tok_counter,
            max_size=self.max_vocabulary_size,
            count_threshold=self.min_freq_threshold,
            add_pad=True)
    elif self.splitting_kind == "bpe":
        self.__vocabulary = BpeVocabulary(self.max_vocabulary_size,
                                          unk_token=UNK_TOKEN,
                                          pad_token=PAD_TOKEN,
                                          eos_token=EOS_TOKEN,
                                          bos_token=INIT_TOKEN)
        self.__vocabulary.create_vocabulary(self.__tok_counter)
    else:
        raise ValueError(f'Unrecognized token splitting method "{self.splitting_kind}"')
    del self.__tok_counter
def finalize_metadata(self) -> None:
    if self.splitting_kind in {"token", "subtoken"}:
        self.__vocabulary = Vocabulary.create_vocabulary(
            self.__tok_counter,
            max_size=self.max_vocabulary_size,
            count_threshold=self.min_freq_threshold,
        )
    elif self.splitting_kind == "bpe":
        self.__vocabulary = BpeVocabulary(self.max_vocabulary_size)
        self.__vocabulary.create_vocabulary(self.__tok_counter)
    elif self.splitting_kind == "char":
        self.__vocabulary = CharTensorizer(max_num_chars=self.max_num_chars,
                                           lower_case_all=False,
                                           include_space=False)
    else:
        raise ValueError(f'Unrecognized token splitting method "{self.splitting_kind}"')
    del self.__tok_counter
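# Context for the "subtoken" branch above (an assumption about the splitter,
# which is not shown in the source): dpu_utils ships
# split_identifier_into_parts, which breaks camelCase/snake_case identifiers
# into lowercase subtokens before they are counted.
from dpu_utils.codeutils import split_identifier_into_parts

assert split_identifier_into_parts('getFileName') == ['get', 'file', 'name']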
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters,
                                               raw_metadata_list)
    merged_token_counter = collections.Counter()
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['token_counter']

    if hyperparameters[f'{encoder_label}_token_use_bpe']:
        token_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe'])
        token_vocabulary.fit(merged_token_counter)
        print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
        print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
    else:
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_token_counter,
            max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
        print('Total token vocabulary words:', len(token_vocabulary.id_to_token))

    final_metadata['token_vocab'] = token_vocabulary
    return final_metadata
def load_vocabulary(self) -> Vocabulary:
    """Build and return the token vocabulary for the corpus methods."""
    max_size = self.config['vocabulary_max_size']
    count_threshold = self.config['vocabulary_count_threshold']
    # Count occurrences of the method name and body tokens
    tokens_counter = Counter()
    for method_token in self.corpus_methods_token:
        for (name, body) in method_token:
            tokens_counter.update(body)
            tokens_counter.update(name)
    token_vocab = Vocabulary.create_vocabulary(tokens_counter,
                                               count_threshold=count_threshold,
                                               max_size=max_size,
                                               add_unk=True,
                                               add_pad=True)
    self.logger.info('Vocabulary created with {} tokens'.format(len(token_vocab)))
    return token_vocab
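# Hypothetical follow-on to load_vocabulary showing how the returned
# vocabulary is typically consumed; get_id_or_unk_multiple is part of the same
# dpu_utils Vocabulary API and pads or truncates to a fixed length. The
# counter contents are invented.
from collections import Counter
from dpu_utils.mlutils import Vocabulary

token_vocab = Vocabulary.create_vocabulary(Counter({'int': 9, 'return': 7}),
                                           max_size=100, count_threshold=1,
                                           add_unk=True, add_pad=True)
ids = token_vocab.get_id_or_unk_multiple(['int', 'unseen_name'], pad_to_size=4)
assert len(ids) == 4  # two token ids (UNK for 'unseen_name') + two pad slots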
def __init__(self, train_dir, valid_dir, max_seq_length, max_vocab_size):
    # Dictionary which stores raw training data
    self.train_data = {
        METHOD_NAMES: load_data_file(train_dir + METHOD_NAME_FILE_NAME),
        METHOD_APIS: load_data_file(train_dir + METHOD_API_FILE_NAME),
        METHOD_TOKENS: load_data_file(train_dir + METHOD_TOKENS_FILE_NAME),
        JAVADOC: load_data_file(train_dir + JAVADOC_FILE_NAME),
    }

    # Dictionary which stores raw validation data
    self.valid_data = {
        METHOD_NAMES: load_data_file(valid_dir + METHOD_NAME_FILE_NAME),
        METHOD_APIS: load_data_file(valid_dir + METHOD_API_FILE_NAME),
        METHOD_TOKENS: load_data_file(valid_dir + METHOD_TOKENS_FILE_NAME),
        JAVADOC: load_data_file(valid_dir + JAVADOC_FILE_NAME),
    }

    # Token lists are flattened to prepare for vocabulary creation
    methods_list = [
        self.train_data[METHOD_NAMES],
        self.train_data[METHOD_APIS],
        self.train_data[METHOD_TOKENS],
    ]
    javadoc_list = [self.train_data[JAVADOC]]
    all_tokens = flatten(methods_list + javadoc_list)
    self.vocabulary = Vocabulary.create_vocabulary(all_tokens,
                                                   max_vocab_size,
                                                   count_threshold=1,
                                                   add_pad=True)

    self.max_seq_length = max_seq_length
    self.max_vocab_size = max_vocab_size

    # Create training and validation tensors
    self.train_tensors = self._tensorize_data(self.train_data)
    self.valid_tensors = self._tensorize_data(self.valid_data)
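# Design note (inferred from the flatten() call above, not stated explicitly
# in the source): method names, API calls, body tokens, and javadoc are merged
# into a single token stream, so the encoder and decoder sides of this dataset
# share one vocabulary and one id space.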
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    hypers = cls.get_default_hyperparameters()
    resource = hypers['resource']
    vocabulary_path = f'resources/embeddings/{resource}/token_to_index.pickle'
    with open(vocabulary_path, 'rb') as fin:
        token_to_index = pickle.load(fin)

    # Fictive counts so that the ordering in the internal vocabulary will be
    # the same as the indices in the dict.
    token_to_count = {}
    for token, index in token_to_index.items():
        token_to_count[token] = len(token_to_index) - index
    token_counter = Counter(token_to_count)

    token_vocabulary = Vocabulary.create_vocabulary(
        tokens=token_counter,
        max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
        count_threshold=0)
    print('token_to_index', token_to_index)
    print('token_vocabulary.id_to_token', token_vocabulary.id_to_token)

    final_metadata = {}
    final_metadata['token_vocab'] = token_vocabulary
    # Save the most common tokens for use in data augmentation:
    final_metadata['common_tokens'] = token_counter.most_common(50)
    return final_metadata
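# Why the fictive counts preserve ordering: count = len(token_to_index) - index
# is strictly decreasing in index, so ranking tokens by count reproduces the
# pretrained index order when create_vocabulary assigns ids. A toy check with
# an invented dict:
token_to_index = {'the': 0, 'of': 1, 'and': 2}
token_to_count = {t: len(token_to_index) - i for t, i in token_to_index.items()}
assert token_to_count == {'the': 3, 'of': 2, 'and': 1}  # descending with index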
def finalize_metadata(self) -> None:
    self.__target_vocab = Vocabulary.create_vocabulary(
        self.__target_class_counter,
        max_size=self.max_num_classes + 1,
    )
    del self.__target_class_counter
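# Design note (an inference, not documented in the source): dpu_utils'
# create_vocabulary adds an UNK symbol by default (add_unk=True), so the
# "max_num_classes + 1" above appears to leave headroom for every real class
# label plus that extra UNK entry.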