import os
from typing import List


def process_lines(lines):
    # Mask variable names in every path-context and re-serialize each
    # example as "<label> <ctx> <ctx> ...", one example per output line.
    results = []
    for label, contexts in path_iterator(lines):
        masked_contexts = mask_variables_in_contexts(contexts)
        masked_contexts = [
            ",".join(masked_context) for masked_context in masked_contexts
        ]
        contexts_str = " ".join(masked_contexts)
        results.append(f"{label} {contexts_str}\n")
    return results

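# A minimal sketch of the `path_iterator` helper these functions assume:
# each input line is "<label> <start,path,end> <start,path,end> ...", the
# code2seq-style raw path-context format. This is an illustrative
# assumption, not the project's actual implementation.
def path_iterator(lines):
    for line in lines:
        parts = line.strip().split(" ")
        if not parts or not parts[0]:
            continue
        label, raw_contexts = parts[0], parts[1:]
        contexts = [tuple(ctx.split(",")) for ctx in raw_contexts if ctx]
        yield label, contexts
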
def extract_targets_and_subtokens(lines):
    subtokens = []
    targets = []
    for target, contexts in path_iterator(lines):
        targets.append(" ".join(target.split("|")))
        for start, _, end in contexts:
            subtokens.append(" ".join(start.split("|")))
            subtokens.append(" ".join(end.split("|")))
    return {"subtokens": subtokens, "targets": targets}

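# Worked example, assuming the line format sketched above: one method whose
# label is "get|name" and a single path-context between the terminals
# "my|var" and "foo|bar" (the path string is a placeholder).
# >>> extract_targets_and_subtokens(["get|name my|var,Nm^Mc_Nm,foo|bar"])
# {'subtokens': ['my var', 'foo bar'], 'targets': ['get name']}
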
def compute_lengths(lines, subtoken_tokenizer, target_tokenizer):
    # Collect encoded lengths of targets and terminal subtokens; both
    # tokenizers are expected to accept a list of subtokens in `encode`.
    target_lengths = []
    subtoken_lengths = []
    for label, contexts in path_iterator(lines):
        target_lengths.append(len(target_tokenizer.encode(label.split("|"))))
        for start, path, end in contexts:
            subtoken_lengths.append(
                len(subtoken_tokenizer.encode(start.split("|"))))
            subtoken_lengths.append(
                len(subtoken_tokenizer.encode(end.split("|"))))
    return target_lengths, subtoken_lengths

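# A stub matching the tokenizer interface compute_lengths expects: `encode`
# takes a list of subtokens and returns a list of token ids. Hypothetical,
# for illustration; the project presumably uses a trained tokenizer here.
class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab  # subtoken -> id

    def encode(self, subtokens):
        # Unknown subtokens map to id 0.
        return [self.vocab.get(tok, 0) for tok in subtokens]
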
def process_chunk(lines):
    target_count = {}
    subtoken_count = {}
    node_count = {}
    for label, contexts in path_iterator(lines):
        add_subtokens_to_dict(label, target_count, split_subtokens=True)
        for start, path, end in contexts:
            add_subtokens_to_dict(start, subtoken_count, split_subtokens=True)
            add_subtokens_to_dict(end, subtoken_count, split_subtokens=True)
            add_subtokens_to_dict(path, node_count)
    return target_count, subtoken_count, node_count

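# Sketch of the counting helper the chunk processors assume: optionally
# split a token on "|" and bump per-subtoken frequencies in place. An
# illustrative assumption about the helper's behavior.
def add_subtokens_to_dict(token, counts, split_subtokens=False):
    parts = token.split("|") if split_subtokens else [token]
    for part in parts:
        counts[part] = counts.get(part, 0) + 1
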
def process_chunk_serial(lines: List[str]):
    masked_entries = []
    for label, contexts in path_iterator(lines):
        variables = get_variables_from_contexts(contexts)
        for variable in variables:
            new_entries = create_path_for_variable(variable, variables, contexts)
            new_entries = new_entries.strip()
            assert "\n" not in new_entries, "Newline found in new_entries"
            if len(new_entries) > 0:
                masked_entries.append(new_entries)
    return masked_entries

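# process_chunk_serial reads like the per-chunk worker of a chunked
# pipeline; this is a hedged sketch of how a parallel driver might call it.
# The pool layout and chunk size are assumptions, not the project's driver.
from multiprocessing import Pool


def process_file_parallel(lines, n_workers=4, chunk_size=10_000):
    chunks = [lines[i:i + chunk_size] for i in range(0, len(lines), chunk_size)]
    with Pool(n_workers) as pool:
        chunk_results = pool.map(process_chunk_serial, chunks)
    return [entry for chunk in chunk_results for entry in chunk]
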
def mask_variables_for_method_name(src_folder, out_folder):
    raw_src_files = os.listdir(src_folder)
    raw_src_files = map(lambda p: os.path.join(src_folder, p), raw_src_files)
    for raw_src_file in raw_src_files:
        with open(raw_src_file, "r") as f:
            lines = map(lambda s: s.strip(), f.readlines())
        masked_paths = []
        for label, contexts in path_iterator(lines):
            masked_contexts = mask_variables_in_contexts(contexts)
            masked_contexts = [
                ",".join(masked_context) for masked_context in masked_contexts
            ]
            contexts_str = " ".join(masked_contexts)
            masked_paths.append(f"{label} {contexts_str}")
        out_file = os.path.join(out_folder, os.path.basename(raw_src_file))
        with open(out_file, "w") as out:
            for path in masked_paths:
                out.write(f"{path}\n")

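# Illustrative usage of the masking pass: ensure the output folder exists
# before writing. The folder names are placeholders, not the project's
# actual data layout.
def mask_folder_example(src_folder="data/raw", out_folder="data/masked"):
    os.makedirs(out_folder, exist_ok=True)
    mask_variables_for_method_name(src_folder, out_folder)
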
def verify_variable_dataset(lines):
    for variable, contexts in path_iterator(lines):
        # Make sure that if the variable name appears in any context, the
        # matching terminal node is not a VDID or a Nm (those occurrences
        # should have been masked).
        for context in contexts:
            start, path, end = map(lambda x: x.strip(), context)
            if start == variable:
                start_node = get_start_node(path)
                assert not (
                    is_name_expr(start_node) or is_variable(start_node)
                ), f"Verification failed: variable appears in start node as a variable or a name expr:\n{context}"
            if end == variable:
                end_node = get_end_node(path)
                assert not (
                    is_name_expr(end_node) or is_variable(end_node)
                ), f"Verification failed: variable appears in end node as a variable or a name expr:\n{context}"

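# Hedged sketches of the helpers the verifier relies on. The assumed path
# encoding (node labels joined by "^" up-steps and "_" down-steps, in the
# code2seq style) and the "VDID"/"Nm" label prefixes are illustrative
# assumptions, not the project's actual definitions.
def get_start_node(path):
    return path.split("^")[0]


def get_end_node(path):
    return path.split("_")[-1]


def is_variable(node):
    return node.startswith("VDID")


def is_name_expr(node):
    return node.startswith("Nm")
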
def process_variables(lines):
    target_count, subtoken_count, node_count = {}, {}, {}
    for label, contexts in path_iterator(lines):
        variables = get_variables_from_contexts(contexts)
        for variable in variables:
            add_subtokens_to_dict(variable, target_count)
        for start, path, end in contexts:
            start_node, end_node = [
                f(path) for f in [get_start_node, get_end_node]
            ]
            # Terminals that will be masked are excluded from the subtoken
            # vocabulary; both terminals get the same treatment.
            if not should_mask(start, start_node, variables):
                add_subtokens_to_dict(start, subtoken_count)
            if not should_mask(end, end_node, variables):
                add_subtokens_to_dict(end, subtoken_count)
            add_subtokens_to_dict(path, node_count)
    return target_count, subtoken_count, node_count
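
# The per-chunk count dicts presumably get merged downstream; a minimal
# sketch of that reduction using collections.Counter (an assumption about
# the surrounding pipeline, not code from the project).
from collections import Counter


def merge_counts(dicts):
    total = Counter()
    for d in dicts:
        total.update(d)
    return dict(total)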