def load_data_from_sample(cls,
                          encoder_label: str,
                          hyperparameters: Dict[str, Any],
                          metadata: Dict[str, Any],
                          data_to_load: Any,
                          function_name: Optional[str],
                          result_holder: Dict[str, Any],
                          is_test: bool = True) -> bool:
    """
    Saves two versions of both the code and the query: one using the docstring as the query and the
    other using the function-name as the query, and replacing the function name in the code with an
    out-of-vocab token. Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
    """
    # Save the two versions of the code and query:
    data_holder = {QueryType.DOCSTRING.value: data_to_load,
                   QueryType.FUNCTION_NAME.value: None}
    # Skip samples where the function name is very short, because it probably has too little
    # information to be a good search query.
    if not is_test and hyperparameters['fraction_using_func_name'] > 0. and function_name and \
            len(function_name) >= hyperparameters['min_len_func_name_for_query']:
        if encoder_label == 'query':
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
        elif encoder_label == 'code':
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [Vocabulary.get_unk() if token == function_name else token
                                                          for token in data_to_load]

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        if not data:
            result_holder[f'{encoder_label}_tokens_{key}'] = None
            result_holder[f'{encoder_label}_tokens_mask_{key}'] = None
            result_holder[f'{encoder_label}_tokens_length_{key}'] = None
            continue
        if hyperparameters[f'{encoder_label}_use_subtokens']:
            data = cls._to_subtoken_stream(data,
                                           mark_subtoken_end=hyperparameters[f'{encoder_label}_mark_subtoken_end'])
        tokens, tokens_mask = \
            convert_and_pad_token_sequence(metadata['token_vocab'], list(data),
                                           hyperparameters[f'{encoder_label}_max_num_tokens'])
        # Note that we share the result_holder with different encoders, and so we need to make our
        # identifiers unique-ish.
        result_holder[f'{encoder_label}_tokens_{key}'] = tokens
        result_holder[f'{encoder_label}_tokens_mask_{key}'] = tokens_mask
        result_holder[f'{encoder_label}_tokens_length_{key}'] = int(np.sum(tokens_mask))

    if result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'] is None or \
            int(np.sum(result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'])) == 0:
        return False

    return True
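# Illustrative only: a minimal sketch of how load_data_from_sample might be driven for one raw
# sample on the 'code' side. The class name SeqEncoder, the hyperparameter values, and the shape
# of `raw_sample` are assumptions for this example, not taken from this repository's configuration.
def example_encode_code_sample(metadata, raw_sample):
    hypers = {'fraction_using_func_name': 0.1,
              'min_len_func_name_for_query': 12,
              'code_use_subtokens': False,
              'code_mark_subtoken_end': False,
              'code_max_num_tokens': 200}
    result_holder = {}
    ok = SeqEncoder.load_data_from_sample(encoder_label='code',
                                          hyperparameters=hypers,
                                          metadata=metadata,
                                          data_to_load=raw_sample['code_tokens'],
                                          function_name=raw_sample.get('func_name'),
                                          result_holder=result_holder,
                                          is_test=False)
    return result_holder if ok else None  # a False return signals the sample should be dropped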
def convert_and_pad(nodes_):
    n = len(nodes_)
    node_types = [node['type'] for node in nodes_]
    node_type_ids, mask = tfutils.convert_and_pad_token_sequence(metadata['type_vocab'], node_types, n)
    assert len(node_type_ids) == n
    assert np.all(mask == 1)
    return list(node_type_ids)
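# For orientation, a sketch of the contract the code above assumes for
# tfutils.convert_and_pad_token_sequence (not the project's implementation): look up each token's
# vocabulary id, truncate or zero-pad to `output_length`, and return the ids together with a 0/1
# mask marking real (non-padding) positions. `get_id_or_unk` is the assumed vocabulary lookup helper.
def convert_and_pad_token_sequence_sketch(vocab, tokens, output_length):
    token_ids = np.zeros(output_length, dtype=np.int32)
    mask = np.zeros(output_length, dtype=np.float32)
    for i, token in enumerate(tokens[:output_length]):
        token_ids[i] = vocab.get_id_or_unk(token)
        mask[i] = 1.0
    return token_ids, mask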
def load_data_from_sample(cls,
                          encoder_label: str,
                          hyperparameters: Dict[str, Any],
                          metadata: Dict[str, Any],
                          data_to_load: Any,
                          function_name: Optional[str],
                          result_holder: Dict[str, Any],
                          is_test: bool = True) -> bool:
    # Linearize the tree, keep only the node tokens, and convert/pad them to a fixed length.
    _, node_tokens = _get_tree_elements_seq(data_to_load,
                                            hyperparameters[f'{encoder_label}_max_num_nodes'])
    node_token_ids, mask = tfutils.convert_and_pad_token_sequence(
        metadata['token_vocab'], node_tokens, hyperparameters[f'{encoder_label}_max_num_tokens'])
    result_holder[f'{encoder_label}_node_masks'] = list(mask)
    result_holder[f'{encoder_label}_tokens'] = list(node_token_ids)
    return True
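# Hypothetical sketch of the _get_tree_elements_seq helper used above: a pre-order walk that
# collects up to `max_num_nodes` node types and node tokens from a nested
# {'type': ..., 'token': ..., 'children': [...]} tree. The field names are assumptions.
def get_tree_elements_seq_sketch(root, max_num_nodes):
    node_types, node_tokens = [], []
    stack = [root]
    while stack and len(node_types) < max_num_nodes:
        node = stack.pop()
        node_types.append(node['type'])
        node_tokens.append(node.get('token', node['type']))
        # Push children in reverse so the left-most child is visited first.
        stack.extend(reversed(node.get('children', [])))
    return node_types, node_tokens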
def load_data_from_sample(cls,
                          encoder_label: str,
                          hyperparameters: Dict[str, Any],
                          metadata: Dict[str, Any],
                          data_to_load: Any,
                          function_name: Optional[str],
                          result_holder: Dict[str, Any],
                          is_test: bool = True) -> bool:
    # Linearize the tree breadth-first, then convert every node type to its vocabulary id.
    nodes, children = _linearize_tree_bfs(data_to_load,
                                          hyperparameters[f'{encoder_label}_max_num_nodes'])
    n = len(nodes)
    node_types = [node['type'] for node in nodes]
    node_type_ids, mask = tfutils.convert_and_pad_token_sequence(metadata['type_vocab'], node_types, n)
    # Padding to exactly n ids means nothing is actually padded, so the mask must be all ones.
    assert len(node_type_ids) == n
    assert np.all(mask == 1)
    result_holder[f'{encoder_label}_node_type_ids'] = list(node_type_ids)
    result_holder[f'{encoder_label}_children'] = children
    return True
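# Hypothetical sketch of _linearize_tree_bfs: visit the tree breadth-first, keep at most
# `max_num_nodes` nodes, and record for each kept node the indices of its kept children in the
# same BFS order. The 'children' field name is an assumption.
from collections import deque

def linearize_tree_bfs_sketch(root, max_num_nodes):
    nodes = []
    queue = deque([root])
    while queue and len(nodes) < max_num_nodes:
        node = queue.popleft()
        nodes.append(node)
        queue.extend(node.get('children', []))
    # Map each kept node back to its BFS index, then express children as index lists.
    index_of = {id(node): i for i, node in enumerate(nodes)}
    children = [[index_of[id(c)] for c in node.get('children', []) if id(c) in index_of]
                for node in nodes]
    return nodes, children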
def load_data_from_sample(cls,
                          encoder_label: str,
                          hyperparameters: Dict[str, Any],
                          metadata: Dict[str, Any],
                          data_to_load: Any,
                          function_name: Optional[str],
                          result_holder: Dict[str, Any],
                          is_test: bool = True) -> bool:
    assert 'nodes' in data_to_load
    assert 'edges' in data_to_load
    node_tokens = data_to_load['nodes'] if not cls.encoder_hypers['is_raw'] else data_to_load['tokens']
    # seq_tokens = data_to_load['sequence']

    # Flatten the per-type edge lists into rows of (edge_type_id, source_node, target_node).
    edges = np.array([(metadata['edge_type_mapping'][edge_type], v, u)
                      for edge_type, edges_of_type in data_to_load['edges'].items()
                      for v, u in edges_of_type],
                     dtype=int)

    node_token_ids, mask = tfutils.convert_and_pad_token_sequence(
        metadata['token_vocab'], node_tokens, hyperparameters[f'{encoder_label}_max_num_tokens'])

    # seq_token_ids, seq_mask = tfutils.convert_and_pad_token_sequence(
    #     metadata['token_vocab'], seq_tokens, hyperparameters[f'{encoder_label}_max_num_tokens'])

    result_holder[f'{encoder_label}_node_masks'] = mask
    result_holder[f'{encoder_label}_node_token_ids'] = node_token_ids
    result_holder[f'{encoder_label}_edges'] = edges
    # result_holder[f'{encoder_label}_seq_token_masks'] = seq_mask
    # result_holder[f'{encoder_label}_seq_token_ids'] = seq_token_ids
    return True
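# Illustrative only: the shape this loader assumes for a graph sample and for
# metadata['edge_type_mapping']. Each edge-type name maps to a list of (source, target) node-index
# pairs, and the mapping assigns every edge type a dense integer id, so `edges` above ends up with
# rows of the form (edge_type_id, source_node, target_node). The concrete values are made up.
example_graph_sample = {
    'nodes': ['def', 'foo', '(', 'x', ')'],
    'edges': {
        'Child': [(0, 1), (0, 3)],
        'NextToken': [(0, 1), (1, 2), (2, 3), (3, 4)],
    },
}
example_edge_type_mapping = {edge_type: i
                             for i, edge_type in enumerate(sorted(example_graph_sample['edges']))}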