Example #1
    def load_data_from_sample(cls,
                              encoder_label: str,
                              hyperparameters: Dict[str, Any],
                              metadata: Dict[str, Any],
                              data_to_load: Any,
                              function_name: Optional[str],
                              result_holder: Dict[str, Any],
                              is_test: bool = True) -> bool:
        """
        Saves two versions of both the code and the query: one using the docstring as the query and the other using the
        function-name as the query, and replacing the function name in the code with an out-of-vocab token.
        Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
        """
        # Save the two versions of the code and query:
        data_holder = {
            QueryType.DOCSTRING.value: data_to_load,
            QueryType.FUNCTION_NAME.value: None
        }
        # Skip samples where the function name is very short, because it probably has too little information
        # to be a good search query.
        if not is_test and hyperparameters['fraction_using_func_name'] > 0. and function_name and \
                len(function_name) >= hyperparameters['min_len_func_name_for_query']:
            if encoder_label == 'query':
                # Set the query tokens to the function name, broken up into its sub-tokens:
                data_holder[QueryType.FUNCTION_NAME.value] = \
                    split_identifier_into_parts(function_name)
            elif encoder_label == 'code':
                # In the code, replace the function name with the out-of-vocab token everywhere it appears:
                data_holder[QueryType.FUNCTION_NAME.value] = [
                    Vocabulary.get_unk() if token == function_name else token
                    for token in data_to_load
                ]

        # Sub-tokenize, convert, and pad both versions:
        for key, data in data_holder.items():
            if not data:
                result_holder[f'{encoder_label}_tokens_{key}'] = None
                result_holder[f'{encoder_label}_tokens_mask_{key}'] = None
                result_holder[f'{encoder_label}_tokens_length_{key}'] = None
                continue
            if hyperparameters[f'{encoder_label}_use_subtokens']:
                data = cls._to_subtoken_stream(
                    data,
                    mark_subtoken_end=hyperparameters[
                        f'{encoder_label}_mark_subtoken_end'])
            tokens, tokens_mask = \
                convert_and_pad_token_sequence(metadata['token_vocab'], list(data),
                                               hyperparameters[f'{encoder_label}_max_num_tokens'])
            # Note that we share the result_holder with different encoders, and so we need to make our identifiers
            # unique-ish
            result_holder[f'{encoder_label}_tokens_{key}'] = tokens
            result_holder[f'{encoder_label}_tokens_mask_{key}'] = tokens_mask
            result_holder[f'{encoder_label}_tokens_length_{key}'] = int(
                np.sum(tokens_mask))

        if result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'] is None or \
                int(np.sum(result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'])) == 0:
            return False

        return True
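
All of these examples rely on the same contract from convert_and_pad_token_sequence: the tokens are mapped to vocabulary ids and padded or truncated to a fixed length, and a 0/1 mask is returned alongside them, so that the number of real tokens can be recovered with np.sum on the mask (as Example #1 does for the tokens_length entries). The snippet below is only a minimal sketch of that assumed contract, using a plain dict toy_vocab and a made-up toy_convert_and_pad helper instead of the real Vocabulary object and library function.

import numpy as np

UNK = '%UNK%'  # hypothetical out-of-vocab marker, standing in for Vocabulary.get_unk()

def toy_convert_and_pad(vocab, tokens, max_len):
    # Sketch of the assumed contract: ids padded/truncated to max_len, plus a 0/1 mask.
    ids = [vocab.get(tok, vocab[UNK]) for tok in tokens[:max_len]]
    mask = [1] * len(ids) + [0] * (max_len - len(ids))
    ids = ids + [0] * (max_len - len(ids))
    return np.array(ids, dtype=np.int32), np.array(mask, dtype=np.int32)

toy_vocab = {UNK: 1, 'load': 2, 'data': 3, 'from': 4, 'sample': 5}
tokens, mask = toy_convert_and_pad(toy_vocab, ['load', 'data', 'xyzzy'], max_len=5)
# tokens -> [2, 3, 1, 0, 0], mask -> [1, 1, 1, 0, 0], int(np.sum(mask)) == 3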
Example #2
def convert_and_pad(nodes_):
    # Map each node's type to a vocabulary id. The target length equals the
    # number of nodes, so no padding is added and the mask is all ones.
    n = len(nodes_)
    node_types = [node['type'] for node in nodes_]
    node_type_ids, mask = tfutils.convert_and_pad_token_sequence(
        metadata['type_vocab'], node_types, n)
    assert len(node_type_ids) == n
    assert np.all(mask == 1)
    return list(node_type_ids)
Example #3
def load_data_from_sample(cls, encoder_label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any],
                          data_to_load: Any, function_name: Optional[str], result_holder: Dict[str, Any],
                          is_test: bool = True) -> bool:
    # Extract the sequence of node tokens from the tree, capped at the configured maximum number of nodes:
    _, node_tokens = _get_tree_elements_seq(data_to_load, hyperparameters[f'{encoder_label}_max_num_nodes'])
    # Convert the node tokens to vocabulary ids and pad/truncate to the maximum token count:
    node_token_ids, mask = tfutils.convert_and_pad_token_sequence(
        metadata['token_vocab'],
        node_tokens,
        hyperparameters[f'{encoder_label}_max_num_tokens'])
    result_holder[f'{encoder_label}_node_masks'] = list(mask)
    result_holder[f'{encoder_label}_tokens'] = list(node_token_ids)
    return True
Example #4
def load_data_from_sample(cls,
                          encoder_label: str,
                          hyperparameters: Dict[str, Any],
                          metadata: Dict[str, Any],
                          data_to_load: Any,
                          function_name: Optional[str],
                          result_holder: Dict[str, Any],
                          is_test: bool = True) -> bool:
    # Linearize the tree in BFS order, capped at the configured maximum number of nodes:
    nodes, children = _linearize_tree_bfs(
        data_to_load, hyperparameters[f'{encoder_label}_max_num_nodes'])
    n = len(nodes)
    node_types = [node['type'] for node in nodes]
    # The target length equals the number of nodes, so no padding is added and
    # the mask must be all ones; the assertions make that invariant explicit.
    node_type_ids, mask = tfutils.convert_and_pad_token_sequence(
        metadata['type_vocab'], node_types, n)
    assert len(node_type_ids) == n
    assert np.all(mask == 1)
    result_holder[f'{encoder_label}_node_type_ids'] = list(node_type_ids)
    result_holder[f'{encoder_label}_children'] = children
    return True
Example #5
    def load_data_from_sample(cls, encoder_label: str, hyperparameters: Dict[str, Any], metadata: Dict[str, Any],
                              data_to_load: Any, function_name: Optional[str], result_holder: Dict[str, Any],
                              is_test: bool = True) -> bool:

        assert 'nodes' in data_to_load
        assert 'edges' in data_to_load

        node_tokens = data_to_load['nodes'] if not cls.encoder_hypers['is_raw'] else data_to_load['tokens']
        # seq_tokens = data_to_load['sequence']
        # print(seq_tokens)
        # Flatten the per-type edge lists into (edge_type_id, v, u) triples:
        edges = np.array([
            (metadata['edge_type_mapping'][edge_type], v, u)
            for edge_type, edges_of_type in data_to_load['edges'].items()
            for v, u in edges_of_type
        ], dtype=int)

        node_token_ids, mask = (
            tfutils.convert_and_pad_token_sequence(
                metadata['token_vocab'],
                node_tokens,
                hyperparameters[f'{encoder_label}_max_num_tokens']
            )
        )
        # seq_token_ids, seq_mask = (
        #     tfutils.convert_and_pad_token_sequence(
        #         metadata['token_vocab'],
        #         seq_tokens,
        #         hyperparameters[f'{encoder_label}_max_num_tokens']
        #     )
        # )
        result_holder[f'{encoder_label}_node_masks'] = mask
        result_holder[f'{encoder_label}_node_token_ids'] = node_token_ids
        result_holder[f'{encoder_label}_edges'] = edges
        # result_holder[f'{encoder_label}_seq_token_masks'] = seq_mask
        # result_holder[f'{encoder_label}_seq_token_ids'] = seq_token_ids
        return True
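
To make the edge encoding in Example #5 concrete, here is a minimal sketch with a hypothetical edge-type mapping and edge dictionary (the names edge_type_mapping and edges_by_type are made up for illustration); it only shows the shape of the resulting array, one (edge_type_id, v, u) row per edge.

import numpy as np

# Hypothetical inputs, shaped like the ones Example #5 expects:
edge_type_mapping = {'child': 0, 'next_token': 1}
edges_by_type = {'child': [(0, 1), (0, 2)], 'next_token': [(1, 2)]}

edges = np.array([
    (edge_type_mapping[edge_type], v, u)
    for edge_type, edges_of_type in edges_by_type.items()
    for v, u in edges_of_type
], dtype=int)
# edges -> [[0, 0, 1], [0, 0, 2], [1, 1, 2]]: one (edge_type_id, v, u) row per edge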