def get_dataset_from(data_dirs: List[RichPath],
                     use_func_names: bool = False,
                     max_files_per_dir: Optional[int] = None) -> List[Dict[str, Any]]:
    data_files = sorted(get_data_files_from_directory(data_dirs, max_files_per_dir))
    data = list(chain(*chain(list(f.read_by_file_suffix()) for f in data_files)))

    if use_func_names:
        # This task tries to match the function name to the code, by setting the function name as the query.
        for sample in data:
            # Replace the query tokens with the function name, broken up into its sub-tokens:
            sample['docstring_tokens'] = split_identifier_into_parts(sample['func_name'])

            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            sample['code_tokens'] = [Vocabulary.get_unk() if token == sample['func_name'] else token
                                     for token in sample['code_tokens']]

    return data
def load_data_from_sample(name: str,
                          metadata: Dict[str, Any],
                          data: List[str],
                          result_holder: Dict[str, Any],
                          hyperparameters: Dict[str, Any],
                          is_train: bool = True) -> bool:
    label_embedding_style = hyperparameters[f'{name}_embedding_style'].lower()
    num_nodes = len(data)

    if label_embedding_style == 'token':
        # Translate node labels using the token vocabulary:
        node_labels = np.zeros((num_nodes,), dtype=np.uint16)
        for (node, label) in enumerate(data):
            if metadata[f'{name}_vocab'].is_unk(label):
                # UNKs that are literals will be converted to special symbols.
                label = TokenEmbedder.filter_literals(label)
            node_labels[node] = metadata[f'{name}_vocab'].get_id_or_unk(label)
        result_holder[f'{name}_token_ids'] = node_labels

    elif label_embedding_style == 'subtoken':
        max_num_subtokens = hyperparameters[f'{name}_max_subtokens']
        node_subtokens = np.zeros((num_nodes, max_num_subtokens), dtype=np.uint16)
        node_subtoken_length = np.zeros(num_nodes, dtype=np.uint8)
        for (node, label) in enumerate(data):
            filtered_label = TokenEmbedder.filter_literals(label)
            if filtered_label == label:
                subtoken_ids = metadata[f'{name}_subtoken_vocab'].get_id_or_unk_multiple(
                    split_identifier_into_parts(label))[:max_num_subtokens]
            elif metadata[f'{name}_subtoken_vocab'].is_unk(label):
                subtoken_ids = metadata[f'{name}_subtoken_vocab'].get_id_or_unk_multiple([filtered_label])
            else:
                subtoken_ids = metadata[f'{name}_subtoken_vocab'].get_id_or_unk_multiple([label])
            node_subtokens[node, :len(subtoken_ids)] = subtoken_ids
            node_subtoken_length[node] = len(subtoken_ids)
        result_holder[f'{name}_subtoken_ids'] = node_subtokens
        result_holder[f'{name}_subtoken_lengths'] = node_subtoken_length

    elif label_embedding_style == 'charcnn':
        # Translate node labels into a character-based representation, and make them unique per context graph:
        node_label_chars = np.zeros(shape=(num_nodes, hyperparameters[f'{name}_char_length']), dtype=np.uint8)
        for (node, label) in enumerate(data):
            for (char_idx, label_char) in enumerate(label[:hyperparameters[f'{name}_char_length']]):
                node_label_chars[int(node), char_idx] = ALPHABET_DICT.get(label_char, 1)
        unique_chars, node_label_unique_indices = np.unique(node_label_chars, axis=0, return_inverse=True)
        result_holder[f'{name}_unique_chars'] = unique_chars
        result_holder[f'{name}_unique_indices'] = node_label_unique_indices

    else:
        raise Exception("Unknown node label embedding style '%s'!" % label_embedding_style)

    return True
def load_data_from_sample(cls,
                          encoder_label: str,
                          hyperparameters: Dict[str, Any],
                          metadata: Dict[str, Any],
                          data_to_load: Any,
                          function_name: Optional[str],
                          result_holder: Dict[str, Any],
                          is_test: bool = True) -> bool:
    """
    Saves two versions of both the code and the query: one using the docstring as the query and the other
    using the function name as the query, and replacing the function name in the code with an out-of-vocab
    token. Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
    """
    # Save the two versions of the code and query:
    data_holder = {QueryType.DOCSTRING.value: data_to_load, QueryType.FUNCTION_NAME.value: None}

    # Skip samples where the function name is very short, because it probably has too little information
    # to be a good search query.
    if not is_test and hyperparameters['fraction_using_func_name'] > 0. and function_name and \
            len(function_name) >= hyperparameters['min_len_func_name_for_query']:
        if encoder_label == 'query':
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
        elif encoder_label == 'code':
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [Vocabulary.get_unk() if token == function_name else token
                                                          for token in data_to_load]

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        if not data:
            result_holder[f'{encoder_label}_tokens_{key}'] = None
            result_holder[f'{encoder_label}_tokens_mask_{key}'] = None
            result_holder[f'{encoder_label}_tokens_length_{key}'] = None
            result_holder[f'{encoder_label}_tokens_str_{key}'] = None
            continue
        if hyperparameters[f'{encoder_label}_use_subtokens']:
            data = cls._to_subtoken_stream(
                data, mark_subtoken_end=hyperparameters[f'{encoder_label}_mark_subtoken_end'])
        tokens, tokens_mask = \
            convert_and_pad_token_sequence(metadata['token_vocab'], list(data),
                                           hyperparameters[f'{encoder_label}_max_num_tokens'])
        # Note that we share the result_holder with different encoders, and so we need to make our identifiers
        # unique-ish.
        result_holder[f'{encoder_label}_tokens_{key}'] = tokens
        result_holder[f'{encoder_label}_tokens_mask_{key}'] = tokens_mask
        result_holder[f'{encoder_label}_tokens_length_{key}'] = int(np.sum(tokens_mask))
        result_holder[f'{encoder_label}_tokens_str_{key}'] = list(data)

    if result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'] is None or \
            int(np.sum(result_holder[f'{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}'])) == 0:
        return False

    return True
def get_log_samples(graph, seq_length, pad_token, vocabulary, rnn_len):
    node_table = {}
    edge_table = defaultdict(list)
    token_pointer = 0
    token_table = []
    sample_contents = []
    semi_node_ids = []

    # Index nodes by id and remember the first token node as the starting point of the token stream.
    for node in graph.node:
        node_table[node.id] = node
        if node.type in [FeatureNode.TOKEN, FeatureNode.IDENTIFIER_TOKEN] and token_pointer == 0:
            token_pointer = node.id
            token_table.append(vocabulary.get_id_or_unk_multiple(
                split_identifier_into_parts(node.contents), seq_length, pad_token))

    for edge in graph.edge:
        edge_table[edge.sourceId].append(edge)

    # Follow NEXT_TOKEN edges to walk the token stream, recording positions of "SEMI" tokens.
    while True:
        term_flag = True
        if len(edge_table[token_pointer]) > 0:
            for edge in edge_table[token_pointer]:
                if edge.type == FeatureEdge.NEXT_TOKEN:
                    term_flag = False
                    if node_table[token_pointer].type == FeatureNode.TOKEN and \
                            node_table[token_pointer].contents == "SEMI":
                        semi_node_ids.append(len(token_table))
                    token_pointer = edge.destinationId
                    token_table.append(vocabulary.get_id_or_unk_multiple(
                        split_identifier_into_parts(node_table[token_pointer].contents), seq_length, pad_token))
                    break
            if term_flag:
                break
        else:
            # print("warning: unable to find next node")
            break

    # Build one fixed-length window of token representations per "SEMI" position, left-padded if needed.
    for semi_node_id in semi_node_ids:
        if semi_node_id < rnn_len:
            sample_content = [vocabulary.get_id_or_unk_multiple(
                split_identifier_into_parts(" "), seq_length, pad_token)] * rnn_len
            sample_content[-semi_node_id:] = token_table[:semi_node_id]
        else:
            sample_content = token_table[semi_node_id - rnn_len:semi_node_id]
        sample_contents.append(np.array([sample_content]))

    return sample_contents
def load_data_from_sample_siamese(
    language: str,
    encoder_label: str,
    data_to_load: Any,
    function_name: Optional[str],
    tokenizer: TokenizerRecordable,
    fraction_using_func_name: float,
    min_len_func_name_for_query: int,
    use_subtokens: bool,
    mark_subtoken_end: bool,
    max_num_tokens: int,
    lang_token: str,
    query_token: str,
) -> Optional[Dict[str, np.ndarray]]:
    """
    Save two versions of both the code and the query: one using the docstring as the query and the other
    using the function name as the query, and replacing the function name in the code with an out-of-vocab
    token. Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
    """
    result_holder: Dict[str, Any] = {}

    # Save the two versions of the code and query:
    data_holder = {QueryType.DOCSTRING.value: data_to_load, QueryType.FUNCTION_NAME.value: None}

    # Skip samples where the function name is very short, because it probably has too little information
    # to be a good search query.
    if fraction_using_func_name > 0.0 and function_name and len(function_name) >= min_len_func_name_for_query:
        if encoder_label == "query":
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
        elif encoder_label == "code":
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [tokenizer.unk_token() if token == function_name else token
                                                          for token in data_to_load]
        else:
            return None

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        if use_subtokens:
            data = _to_subtoken_stream(data, mark_subtoken_end=mark_subtoken_end)

        if encoder_label == "code":
            tokens, tokens_mask = convert_and_pad_token_sequence(
                tokenizer=tokenizer,
                token_sequence=list(data),
                output_tensor_size=max_num_tokens,
                token=lang_token,
                prefix=language,
            )
        elif encoder_label == "query":
            tokens, tokens_mask = convert_and_pad_token_sequence(
                tokenizer=tokenizer,
                token_sequence=list(data),
                output_tensor_size=max_num_tokens,
                token=query_token,
                prefix=None,
            )

        # Note that we share the result_holder with different encoders, and so we need to make our identifiers
        # unique-ish.
        result_holder[f"{encoder_label}_tokens_{key}"] = tokens
        result_holder[f"{encoder_label}_tokens_mask_{key}"] = tokens_mask

    if (result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"] is None
            or int(np.sum(result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"])) == 0):
        return None

    return result_holder
def parse_data_file_ast_tokenizer(
    data_file: Path,
    data_params: DatasetParams,
    tokenizer: TokenizerRecordable,
    ast_parser: TreeSitterParser,
    query_token: str,
    pickle_path: Path,
) -> Tuple[str, pd.DataFrame]:
    logger.info(f"Reading samples from {data_file}")
    filename = os.path.basename(data_file)
    file_language = filename.split("_")[0]
    file_id = filename.split(".")[0]
    pickle_file = pickle_path / f"{file_id}.p"

    # Return the cached DataFrame if this file was already processed.
    if pickle_file.exists():
        df = pd.read_pickle(pickle_file)
        return (file_language, df)

    samples = list(read_file_samples(data_file))

    codes: List[List[str]] = []
    funcs: List[List[str]] = []
    docstrings: List[List[str]] = []
    for idx, raw_sample in enumerate(tqdm(samples)):
        language = raw_sample["language"]
        if language.startswith("python"):  # In some datasets, we use 'python-2.7' and 'python-3'
            language = "python"

        if language != file_language:
            logger.error(f"file with different language {language} from filename {file_language}")
            sys.exit(f"file with different language {language} from filename {file_language}")

        function_name = raw_sample.get("func_name")
        code: List[str] = ast_parser.parse(language, raw_sample["code"],
                                           max_tokens=data_params.code_max_num_tokens)

        # Skip samples where the function name is very short, because it probably has too little information
        # to be a good search query.
        if (data_params.fraction_using_func_name > 0.0 and function_name
                and len(function_name) >= data_params.min_len_func_name_for_query):
            func = [query_token] + split_identifier_into_parts(function_name)
            code = [tokenizer.unk_token() if token == function_name else token for token in code]
            docstring = [query_token] + [d.lower() for d in raw_sample["docstring_tokens"]]

            codes.append(code)
            funcs.append(func)
            docstrings.append(docstring)

    code_toks: List[List[int]] = []
    code_masks: List[List[int]] = []
    func_toks: List[List[int]] = []
    func_masks: List[List[int]] = []
    docstring_toks: List[List[int]] = []
    docstring_masks: List[List[int]] = []

    for batch in batch_iter(codes, batch_size=100):
        toks, masks = tokenizer.encode_tokens(batch, max_length=data_params.code_max_num_tokens)
        code_toks.extend(toks)
        code_masks.extend(masks)

    for batch in batch_iter(funcs, batch_size=100):
        toks, masks = tokenizer.encode_tokens(batch, max_length=data_params.query_max_num_tokens)
        func_toks.extend(toks)
        func_masks.extend(masks)

    for batch in batch_iter(docstrings, batch_size=100):
        toks, masks = tokenizer.encode_tokens(batch, max_length=data_params.query_max_num_tokens)
        docstring_toks.extend(toks)
        docstring_masks.extend(masks)

    langs = [data_params.lang_ids[file_language]] * len(func_toks)
    similarities = [1] * len(func_toks)

    logger.debug(f"func_toks {func_toks[:2]}")
    logger.debug(f"docstring_toks {docstring_toks[:2]}")
    logger.debug(f"code_toks {code_toks[:2]}")
    logger.debug(f"langs {langs[:2]}")
    logger.debug(f"similarities {similarities[:2]}")

    df = pd.DataFrame({
        "lang": langs,
        "similarity": similarities,
        "func_tokens": func_toks,
        "func_masks": func_masks,
        "docstring_tokens": docstring_toks,
        "docstring_masks": docstring_masks,
        "code_tokens": code_toks,
        "code_masks": code_masks,
    })
    df.to_pickle(pickle_file)
    logger.debug(f"Saved file {data_file}: language {file_language} [{df.shape}] to {pickle_file}")
    return (file_language, df)
def load_data_from_sample_ast(
    language: str,
    encoder_label: str,
    data_to_load: List[str],
    function_name: Optional[str],
    tokenizer: TokenizerRecordable,
    data_params: DatasetParams,
    query_token: str,
) -> Optional[Dict[str, np.ndarray]]:
    """
    Save two versions of both the code and the query: one using the docstring as the query and the other
    using the function name as the query, and replacing the function name in the code with an out-of-vocab
    token. Sub-tokenizes, converts, and pads both versions, and rejects empty samples.
    """
    result_holder: Dict[str, Any] = {}

    # Save the two versions of the code and query:
    data_holder = {QueryType.DOCSTRING.value: data_to_load, QueryType.FUNCTION_NAME.value: None}

    # Skip samples where the function name is very short, because it probably has too little information
    # to be a good search query.
    if (data_params.fraction_using_func_name > 0.0 and function_name
            and len(function_name) >= data_params.min_len_func_name_for_query):
        if encoder_label == "query":
            # Set the query tokens to the function name, broken up into its sub-tokens:
            data_holder[QueryType.FUNCTION_NAME.value] = split_identifier_into_parts(function_name)
        elif encoder_label == "code":
            # In the code, replace the function name with the out-of-vocab token everywhere it appears:
            data_holder[QueryType.FUNCTION_NAME.value] = [tokenizer.unk_token() if token == function_name else token
                                                          for token in data_to_load]
        else:
            return None

    # Sub-tokenize, convert, and pad both versions:
    for key, data in data_holder.items():
        if data is not None:
            data_l: List[str] = list(data)
            if data_params.use_subtokens:
                data_l = list(_to_subtoken_stream(data_l, mark_subtoken_end=data_params.mark_subtoken_end))

            if encoder_label == "code":
                token_ids, token_mask = tokenizer.encode_tokens(
                    [data_l], max_length=data_params.code_max_num_tokens)
            elif encoder_label == "query":
                token_sequence = [query_token] + data_l
                token_ids, token_mask = tokenizer.encode_tokens(
                    [token_sequence], max_length=data_params.query_max_num_tokens)

            result_holder[f"{encoder_label}_tokens_{key}"] = token_ids[0]
            result_holder[f"{encoder_label}_tokens_mask_{key}"] = token_mask[0]

    if (result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"] is None
            or int(np.sum(result_holder[f"{encoder_label}_tokens_mask_{QueryType.DOCSTRING.value}"])) == 0):
        return None

    return result_holder
def _convert_ast_into_simpler_tree_format(self, root, binary_data):
    num_nodes = 0
    queue = [root]

    root_token = ""
    root_sub_tokens = []
    # Leaf nodes carry the source token; extract and sub-tokenize it.
    if len(root.children) == 0:
        root_token = binary_data[root.start_byte:root.end_byte]
        root_token_raw = root_token.decode("utf-8")
        root_token = self.process_token(root_token_raw)

        root_sub_tokens = split_identifier_into_parts(root_token_raw)
        root_sub_tokens = self.process_list_of_sub_tokens(root_sub_tokens)

    root_sub_token_ids = []
    for sub_token in root_sub_tokens:
        root_sub_token_ids.append(self.look_up_for_id_from_token(sub_token))

    root_json = {
        "node_type": str(root.type),
        "node_type_id": self.look_up_for_id_from_node_type(str(root.type)),
        "node_token": root_token,
        "node_sub_tokens": root_sub_tokens,
        "node_sub_tokens_id": root_sub_token_ids,
        "children": []
    }

    tree_tokens = []
    tree_tokens.extend(root_sub_tokens)

    # Breadth-first traversal over the tree-sitter AST, mirroring the structure in the JSON tree.
    queue_json = [root_json]
    while queue:
        current_node = queue.pop(0)
        current_node_json = queue_json.pop(0)
        num_nodes += 1

        children = [x for x in current_node.children]
        queue.extend(children)

        if len(children) > 0:
            current_node_json['children'] = []

        for child_node in children:
            child_token = ""
            child_sub_tokens = []
            if len(child_node.children) == 0:
                child_token = binary_data[child_node.start_byte:child_node.end_byte]
                child_token_raw = child_token.decode("utf-8")
                child_token = self.process_token(child_token_raw)

                child_sub_tokens = split_identifier_into_parts(str(child_token_raw))
                child_sub_tokens = self.process_list_of_sub_tokens(child_sub_tokens)

            children_sub_token_ids = []
            for sub_token in child_sub_tokens:
                sub_token = self.process_token(sub_token)
                sub_token_id = self.look_up_for_id_from_token(sub_token)
                children_sub_token_ids.append(sub_token_id)

            if len(children_sub_token_ids) == 0:
                children_sub_token_ids.append(0)

            child_json = {
                "node_type": str(child_node.type),
                "node_type_id": self.look_up_for_id_from_node_type(str(child_node.type)),
                "node_token": child_token,
                "node_sub_tokens": child_sub_tokens,
                "node_sub_tokens_id": children_sub_token_ids,
                "children": []
            }

            tree_tokens.extend(child_sub_tokens)

            current_node_json['children'].append(child_json)
            queue_json.append(child_json)

    tree_tokens = list(set(tree_tokens))
    return root_json, tree_tokens, num_nodes
from dpu_utils.codeutils import split_identifier_into_parts

print(split_identifier_into_parts("eatRelationalExpression"))
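For context, a minimal sketch of what the helper does; the expected outputs shown as comments reflect my understanding that dpu_utils splits identifiers on camelCase and snake_case boundaries, and are not captured program output:

from dpu_utils.codeutils import split_identifier_into_parts

# camelCase identifiers are split at case boundaries,
# e.g. roughly ['eat', 'relational', 'expression'] for the call above.

# snake_case identifiers are split on underscores as well:
print(split_identifier_into_parts("get_log_samples"))  # expected roughly: ['get', 'log', 'samples']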
def func_name_tokenizer(tokens):
    tokens = ujson.loads(tokens)
    return split_identifier_into_parts(tokens)
def compute_sample_data(sub_graph,
                        identifier_token_node_ids,
                        seq_length,
                        pad_token,
                        slot_token,
                        vocabulary,
                        exception_node_ids=[]):
    used_node_types = get_used_nodes_type()
    used_edge_types = get_used_edges_type()

    node_representations = []
    id_to_index_map = {}
    ind = 0

    (sub_nodes, sub_edges) = sub_graph

    # Build a fixed-length token-id representation per node; exception nodes get a slot token instead.
    for node in sub_nodes:
        if node.type in used_node_types:
            if node.id in exception_node_ids:
                node_representation = [pad_token for _ in range(seq_length)]
                node_representation[0] = slot_token
            else:
                node_representation = vocabulary.get_id_or_unk_multiple(
                    split_identifier_into_parts(node.contents), seq_length, pad_token)

            node_representations.append(node_representation)
            id_to_index_map[node.id] = ind
            ind += 1

    n_nodes = len(node_representations)
    n_types = len(used_edge_types)

    node_representations = np.array(node_representations)
    num_incoming_edges_per_type = np.zeros((n_nodes, n_types))
    num_outgoing_edges_per_type = np.zeros((n_nodes, n_types))
    adj_lists = defaultdict(list)

    # Collect per-type adjacency lists and in/out edge counts.
    for edge in sub_edges:
        if edge.type in used_edge_types \
                and edge.sourceId in id_to_index_map \
                and edge.destinationId in id_to_index_map:
            type_id = used_edge_types.index(edge.type)
            adj_lists[type_id].append([id_to_index_map[edge.sourceId], id_to_index_map[edge.destinationId]])
            num_incoming_edges_per_type[id_to_index_map[edge.destinationId], type_id] += 1
            num_outgoing_edges_per_type[id_to_index_map[edge.sourceId], type_id] += 1

    final_adj_lists = {edge_type: np.array(sorted(adj_list), dtype=np.int32)
                       for edge_type, adj_list in adj_lists.items()}

    # Add empty entries for edge types with no adjacency lists
    for i in range(len(used_edge_types)):
        if i not in final_adj_lists:
            final_adj_lists[i] = np.zeros((0, 2), dtype=np.int32)

    identifier_nodes = [id_to_index_map[node_id] for node_id in identifier_token_node_ids]

    return (identifier_nodes, node_representations, final_adj_lists,
            num_incoming_edges_per_type, num_outgoing_edges_per_type)
def func_name_tokenizer(tokens, **kwargs):
    tokens = ujson.loads(tokens)
    tokens = split_identifier_into_parts(tokens)[:kwargs['min_func_len']]
    return tokens
def string_sub_tokenizer(tokens: list):
    """code from https://github.com/github/CodeSearchNet/blob/e792e1caea20fbd4fba439565fe20c10d4798435/src/encoders/seq_encoder.py#L84-L92"""
    # Wrap non-identifier tokens in a list so that chaining flattens the sub-token lists
    # without splitting plain strings into individual characters.
    tokens = [split_identifier_into_parts(tok) if IDENTIFIER_TOKEN_REGEX.match(tok) else [tok]
              for tok in tokens]
    tokens = list(itertools.chain(*tokens))
    return tokens
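A minimal, hypothetical usage sketch for the snippet above; the imports, the IDENTIFIER_TOKEN_REGEX definition (modelled on the CodeSearchNet source the docstring cites), and the expected output are my assumptions, not part of the original snippet:

import itertools
import re

from dpu_utils.codeutils import split_identifier_into_parts

# Assumed identifier pattern, mirroring CodeSearchNet's seq_encoder.py; treat as illustrative only.
IDENTIFIER_TOKEN_REGEX = re.compile('[_a-zA-Z][_a-zA-Z0-9]*')

# Identifiers are split into sub-tokens; punctuation and other tokens pass through unchanged.
print(string_sub_tokenizer(["getFileName", "(", "file_path", ")"]))
# expected roughly: ['get', 'file', 'name', '(', 'file', 'path', ')']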