Example #1
    @classmethod
    def finalise_metadata(
            cls, encoder_label: str, hyperparameters: Dict[str, Any],
            raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label,
                                                   hyperparameters,
                                                   raw_metadata_list)
        # Merge the token counters from every raw metadata dict into a single
        # corpus-level counter.
        merged_token_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']

        if hyperparameters['%s_use_bpe' % encoder_label]:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters['%s_token_vocab_size' %
                                           encoder_label],
                pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
            token_vocabulary.fit(merged_token_counter)
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters['%s_token_vocab_size' %
                                         encoder_label],
                count_threshold=hyperparameters[
                    '%s_token_vocab_count_threshold' % encoder_label])

        final_metadata['token_vocab'] = token_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata['common_tokens'] = merged_token_counter.most_common(50)
        return final_metadata
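
This variant reads four settings from a flat hyperparameters dict, all keyed by the
encoder label. A minimal sketch of a call, assuming an encoder class named SeqEncoder
and illustrative values (both the class name and the numbers are assumptions, not
taken from the snippet):

    # Hypothetical hyperparameters for encoder_label='code'; only the four keys
    # the method actually reads are shown.
    hyperparameters = {
        'code_use_bpe': True,
        'code_token_vocab_size': 10000,
        'code_pct_bpe': 0.5,
        'code_token_vocab_count_threshold': 10,
    }
    final_metadata = SeqEncoder.finalise_metadata('code', hyperparameters,
                                                  raw_metadata_list)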
Example #2
    @classmethod
    def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                          raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        print("Finalising metadata")
        final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
        merged_token_counter = collections.Counter()
        merged_edge_types = set()
        token_counts = []
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']
            merged_edge_types = merged_edge_types.union(raw_metadata['edge_types'])
            # token_counts.extend(raw_metadata['nodes_by_tokens'])

        if hyperparameters[f'{encoder_label}_token_use_bpe']:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
                pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe']
            )
            token_vocabulary.fit(merged_token_counter)
            print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
            print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
                count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
            print('Total token vocabulary words:', len(token_vocabulary.id_to_token))

        final_metadata['token_vocab'] = token_vocabulary
        final_metadata['edge_type_mapping'] = {edge_type: i for i, edge_type in enumerate(merged_edge_types)}
        print('Edge type mapping:', final_metadata['edge_type_mapping'])
        # print("Percentiles:")
        # for p in [0, 1, 10, 20, 25, 30, 40, 50, 60, 70, 75, 80, 90, 95, 99, 99.9, 100]:
        #     print(p, np.percentile(token_counts, p))
        return final_metadata
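
One caveat in this variant: merged_edge_types is a set, so the enumeration order, and
therefore the edge-type-to-index mapping, can vary between runs. If a reproducible
mapping is wanted, sorting first is a minimal fix (an assumption about the desired
behaviour, not something the snippet does):

    final_metadata['edge_type_mapping'] = {
        edge_type: i for i, edge_type in enumerate(sorted(merged_edge_types))}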
Example #3
    @classmethod
    def finalise_metadata(cls,
                          encoder_label: str,
                          hyperparameters: Dict[str, Any],
                          raw_metadata_list: List[Dict[str, Any]],
                          language=None) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label,
                                                   hyperparameters,
                                                   raw_metadata_list)
        merged_token_counter = Counter()

        print(encoder_label, language)
        if encoder_label == 'query':
            final_metadata_path = '_'.join([encoder_label, 'final_metadata'])
        else:
            assert encoder_label == 'code' and language
            final_metadata_path = '_'.join(
                [encoder_label, language, 'final_metadata'])

        # Re-use previously pickled metadata if a cache file exists; otherwise
        # rebuild the vocabulary from scratch and cache the result.
        if os.path.isfile(final_metadata_path):
            with open(final_metadata_path, 'rb') as final_metadata_file:
                final_metadata = pickle.load(final_metadata_file)
        else:
            for raw_metadata in raw_metadata_list:
                merged_token_counter += raw_metadata['token_counter']

            if hyperparameters['%s_use_bpe' % encoder_label]:
                token_vocabulary = BpeVocabulary(
                    vocab_size=hyperparameters['%s_token_vocab_size' %
                                               encoder_label],
                    pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
                token_vocabulary.fit(merged_token_counter)
            else:
                token_vocabulary = Vocabulary.create_vocabulary(
                    tokens=merged_token_counter,
                    max_size=hyperparameters['%s_token_vocab_size' %
                                             encoder_label],
                    count_threshold=hyperparameters[
                        '%s_token_vocab_count_threshold' % encoder_label])

            final_metadata['token_vocab'] = token_vocabulary
            # Save the most common tokens for use in data augmentation:
            final_metadata['common_tokens'] = merged_token_counter.most_common(
                50)

            with open(final_metadata_path, 'wb') as final_metadata_file:
                pickle.dump(final_metadata, final_metadata_file)

        return final_metadata
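
The pickle cache above is keyed only on the encoder label (and language), so it will
happily serve stale metadata after the vocabulary hyperparameters change. A
hypothetical refinement, folding the relevant settings into the file name (the helper
below is an illustration, not part of the original code):

    import hashlib
    import json

    def metadata_cache_path(encoder_label, language, hyperparameters):
        # Hash the encoder's own hyperparameters into the cache file name so a
        # settings change automatically misses the old cache.
        relevant = {k: v for k, v in sorted(hyperparameters.items())
                    if k.startswith(encoder_label)}
        digest = hashlib.sha1(
            json.dumps(relevant, sort_keys=True).encode('utf-8')).hexdigest()[:8]
        parts = [encoder_label] + ([language] if language else [])
        return '_'.join(parts + [digest, 'final_metadata'])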
Example #4
    @classmethod
    def finalise_metadata(
            cls, encoder_label: str, hyperparameters: Dict[str, Any],
            raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label,
                                                   hyperparameters,
                                                   raw_metadata_list)
        # JGD ****** leaf_nodes start ******
        merged_identifier_counter = Counter()
        for raw_metadata in raw_metadata_list:
            merged_identifier_counter += raw_metadata['identifier_counter']

        if hyperparameters['%s_use_bpe' % encoder_label]:
            identifier_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters['%s_token_vocab_size' %
                                           encoder_label],
                pct_bpe=hyperparameters['%s_pct_bpe' % encoder_label])
            identifier_vocabulary.fit(merged_identifier_counter)
        else:
            identifier_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_identifier_counter,
                max_size=hyperparameters['%s_token_vocab_size' %
                                         encoder_label],
                count_threshold=hyperparameters[
                    '%s_token_vocab_count_threshold' % encoder_label])

        final_metadata['identifier_vocab'] = identifier_vocabulary
        # Save the most common tokens for use in data augmentation:
        final_metadata[
            'common_identifiers'] = merged_identifier_counter.most_common(50)
        # JGD ****** leaf_nodes end ******
        # JGD ****** tree_paths start ******
        # merged_context_filenames = list()
        # merged_terminal_counter = Counter()
        # merged_nonterminal_counter = Counter()
        # for raw_metadata in raw_metadata_list:
        #     merged_context_filenames.extend(raw_metadata['context_filenames'])
        #     merged_terminal_counter += raw_metadata['terminal_counter']
        #     merged_nonterminal_counter += raw_metadata['nonterminal_counter']
        #
        # final_metadata['context_filenames'] = merged_context_filenames
        # final_metadata['terminal_counter'] = merged_terminal_counter
        # final_metadata['nonterminal_counter'] = merged_nonterminal_counter
        # JGD ****** tree_paths end ******
        return final_metadata
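
The common_identifiers list is stored "for use in data augmentation", but the snippet
does not show that use. One plausible reading, purely as an illustration (the
augment_tokens helper and the swap probability are hypothetical):

    import random

    def augment_tokens(tokens, common_identifiers, swap_prob=0.1):
        # Occasionally swap a token for one of the most common identifiers
        # collected in finalise_metadata.
        pool = [identifier for identifier, _count in common_identifiers]
        return [random.choice(pool) if random.random() < swap_prob else token
                for token in tokens]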
Example #5
    @classmethod
    def finalise_metadata(cls, encoder_label: str, hyperparameters: Dict[str, Any],
                          raw_metadata_list: List[Dict[str, Any]]) -> Dict[str, Any]:
        final_metadata = super().finalise_metadata(encoder_label, hyperparameters, raw_metadata_list)
        merged_token_counter = collections.Counter()
        for raw_metadata in raw_metadata_list:
            merged_token_counter += raw_metadata['token_counter']
        if hyperparameters[f'{encoder_label}_token_use_bpe']:
            token_vocabulary = BpeVocabulary(
                vocab_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
                pct_bpe=hyperparameters[f'{encoder_label}_token_pct_bpe'])
            token_vocabulary.fit(merged_token_counter)
            print('Total token word vocabulary words:', len(token_vocabulary.word_vocab))
            print('Total token bpe vocabulary words:', len(token_vocabulary.bpe_vocab))
        else:
            token_vocabulary = Vocabulary.create_vocabulary(
                tokens=merged_token_counter,
                max_size=hyperparameters[f'{encoder_label}_token_vocab_size'],
                count_threshold=hyperparameters[f'{encoder_label}_token_vocab_count_threshold'])
            print('Total token vocabulary words:', len(token_vocabulary.id_to_token))
        final_metadata['token_vocab'] = token_vocabulary
        return final_metadata
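
Note that the five variants disagree on hyperparameter key names: examples #1, #3 and
#4 read '%s_use_bpe' and '%s_pct_bpe', while #2 and #5 read the '_token_'-prefixed
forms f'{encoder_label}_token_use_bpe' and f'{encoder_label}_token_pct_bpe'. If one
configuration has to drive both families, a small lookup shim is enough (hypothetical
helper, not from any of the snippets):

    def get_hyper(hyperparameters, encoder_label, *suffixes):
        # Try each candidate suffix in order, e.g. '_token_use_bpe', '_use_bpe'.
        for suffix in suffixes:
            key = encoder_label + suffix
            if key in hyperparameters:
                return hyperparameters[key]
        raise KeyError(f'no {suffixes} key for encoder {encoder_label!r}')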