@classmethod
def finalise_metadata(
        cls, encoder_label: str, hyperparameters: Dict[str, Any],
        raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    merged_token_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['token_counter']
    if hyperparameters['%s_use_bpe' % encoder_label]:
        token_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
        token_vocabulary.fit(merged_token_counter)
    else:
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_token_counter,
            max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])
    final_metadata['token_vocab'] = token_vocabulary
    # Save the most common tokens for use in data augmentation:
    final_metadata['common_tokens'] = merged_token_counter.most_common(50)
    return final_metadata

@classmethod
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    print("Finalising metadata")
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    merged_token_counter = collections.Counter()
    merged_edge_types = set()
    token_counts = []
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['token_counter']
        merged_edge_types = merged_edge_types.union(raw_metadata['edge_types'])
        # token_counts.extend(raw_metadata['nodes_by_tokens'])

    if hyperparameters[f'{encoder_label}_token_use_bpe']:
        token_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe'])
        token_vocabulary.fit(merged_token_counter)
        print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
        print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
    else:
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_token_counter,
            max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
        print('Total token vocabulary words:', len(token_vocabulary.id_to_token))

    final_metadata['token_vocab'] = token_vocabulary
    final_metadata['edge_type_mapping'] = {edge_type: i for i, edge_type in enumerate(merged_edge_types)}
    print('Edge type mapping:', final_metadata['edge_type_mapping'])

    # print("Percentiles:")
    # for p in [0, 1, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99, 99.9, 100]:
    #     print(p, np.percentile(token_counts, p))

    return final_metadata

@classmethod
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]],
                      language=None) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    merged_token_counter = Counter()
    print(encoder_label, language)
    if encoder_label == 'query':
        final_metadata_path = '_'.join([encoder_label, 'final_metadata'])
    else:
        assert encoder_label == 'code' and language
        final_metadata_path = '_'.join([encoder_label, language, 'final_metadata'])
    if os.path.isfile(final_metadata_path):
        # Reuse previously finalised metadata if a cached pickle exists.
        with open(final_metadata_path, 'rb') as final_metadata_file:
            final_metadata = pickle.load(final_metadata_file)
    else:
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']
        if hyperparameters['%s_use_bpe' % encoder_label]:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
                pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
            token_vocabulary.fit(merged_token_counter)
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
                count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])
        final_metadata['token_vocab'] = token_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata['common_tokens'] = merged_token_counter.most_common(50)
        # Cache the result so later runs can skip the merge.
        with open(final_metadata_path, 'wb') as final_metadata_file:
            pickle.dump(final_metadata, final_metadata_file)
    return final_metadata

@classmethod
def finalise_metadata(
        cls, encoder_label: str, hyperparameters: Dict[str, Any],
        raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    # JGD ****** leaf_nodes start ******
    merged_identifier_counter = Counter()
    for raw_metadata in raw_metadata_list:
        merged_identifier_counter += raw_metadata['identifier_counter']
    if hyperparameters['%s_use_bpe' % encoder_label]:
        identifier_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
        identifier_vocabulary.fit(merged_identifier_counter)
    else:
        identifier_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_identifier_counter,
            max_size=hyperparameters['%s_token_vocab_size' % encoder_label],
            count_threshold=hyperparameters['%s_token_vocab_count_threshold' % encoder_label])
    final_metadata['identifier_vocab'] = identifier_vocabulary
    # Save the most common tokens for use in data augmentation:
    final_metadata['common_identifiers'] = merged_identifier_counter.most_common(50)
    # JGD ****** leaf_nodes end ******
    # JGD ****** tree_paths start ******
    # merged_context_filenames = list()
    # merged_terminal_counter = Counter()
    # merged_nonterminal_counter = Counter()
    # for raw_metadata in raw_metadata_list:
    #     merged_context_filenames.extend(raw_metadata['context_filenames'])
    #     merged_terminal_counter += raw_metadata['terminal_counter']
    #     merged_nonterminal_counter += raw_metadata['nonterminal_counter']
    #
    # final_metadata['context_filenames'] = merged_context_filenames
    # final_metadata['terminal_counter'] = merged_terminal_counter
    # final_metadata['nonterminal_counter'] = merged_nonterminal_counter
    # JGD ****** tree_paths end ******
    return final_metadata

@classmethod
def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                      raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
    final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
    merged_token_counter = collections.Counter()
    for raw_metadata in raw_metadata_list:
        merged_token_counter += raw_metadata['token_counter']
    if hyperparameters[f'{encoder_label}_token_use_bpe']:
        token_vocabulary = BpeVocabulary(
            vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe'])
        token_vocabulary.fit(merged_token_counter)
        print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
        print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
    else:
        token_vocabulary = Vocabulary.create_vocabulary(
            tokens=merged_token_counter,
            max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
            count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
        print('Total token vocabulary words:', len(token_vocabulary.id_to_token))
    final_metadata['token_vocab'] = token_vocabulary
    return final_metadata
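
# Usage sketch (illustrative only): how a variant like the first one above
# might be driven end to end. The encoder class name, the sample metadata,
# and the hyperparameter values below are assumptions for illustration;
# only `collections.Counter` and `typing` are standard-library facts.
from collections import Counter
from typing import Any, Dict, List

# Hypothetical per-shard metadata, as an earlier metadata-collection pass
# over the corpus might produce it (one dict per shard/worker).
raw_metadata_list: List[Dict[str, Any]] = [
    {'token_counter': Counter({'self': 310, 'def': 120, 'return': 95})},
    {'token_counter': Counter({'self': 150, 'def': 80, 'if': 60})},
]

hyperparameters: Dict[str, Any] = {
    'code_use_bpe': False,                  # take the plain-Vocabulary branch
    'code_token_vocab_size': 10000,
    'code_token_vocab_count_threshold': 10,
}

# With a concrete encoder subclass in scope (assumed name), the call would be:
# final_metadata = MyCodeEncoder.finalise_metadata('code', hyperparameters,
#                                                  raw_metadata_list)
# after which final_metadata['token_vocab'] holds the merged vocabulary and,
# in the first variant, final_metadata['common_tokens'] holds the 50 most
# frequent tokens kept for data augmentation.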