def unpack_annotations(body, labels):
    """
    Use information from the ast package to strip variable type annotations
    from a function body.

    :param body: function source text
    :param labels: DataFrame with information about type annotations
    :return: Tuple (variable records, cut spans). Each variable record is
        (variable offset start, cut start, preprocessed annotation text);
        each cut span is a (start, end) character range to remove from body.
    """
    if labels is None:
        return [], []
    # module-level flag controlling whether default values are cut as well
    global remove_default
    variables = []
    annotations = []
    for ind, row in labels.iterrows():
        if row['name'] == "annotation":
            variables.append(
                (row['var_line'], row['var_end_line'], row['var_col_offset'],
                 row['var_end_col_offset'], 'variable'))
            annotations.append(
                (row['line'], row['end_line'], row['col_offset'],
                 row['end_col_offset'], 'annotation '))
    # most likely do not need to use as_bytes here, because non-unicode usually
    # appears in strings, but type annotations usually appear at the end of a
    # signature and at the beginning of a line
    variables = to_offsets(body, variables, as_bytes=True)
    annotations = to_offsets(body, annotations, as_bytes=True)
    defaults_spans = get_defaults_spans(body)
    cuts = []
    var_records = []  # renamed from `vars`, which shadowed the builtin
    for offset_ann, offset_var in zip(annotations, variables):
        beginning = offset_ann[0]
        end = offset_ann[1]
        head = body[:offset_ann[0]]
        orig_len = len(head)
        head = head.rstrip()
        stripped_len = len(head)
        annsymbol = ":"
        # the annotation must be preceded by ":" (possibly with whitespace)
        assert head.endswith(annsymbol)
        # extend the cut backwards to cover the ":" and trailing whitespace
        beginning = beginning - (orig_len - stripped_len) - len(annsymbol)
        cuts.append((beginning, end))
        assert offset_var[0] != len(head)
        var_records.append((offset_var[0], beginning,
                            preprocess(body[offset_ann[0]:offset_ann[1]])))
    if remove_default:
        cuts.extend(defaults_spans)
    return var_records, cuts
def get_descendants(function, children):
    """
    Collect assignment targets that are plain names or attributes.

    :param function: function source text
    :param children: list of assignment target AST nodes
    :return: list of (target text, offset) pairs for ast.Attribute / ast.Name
        targets. Subscript, Tuple and List targets are skipped for now.
    :raises Exception: for any other (unexpected) target node type
    """
    descendants = []
    for node in children:
        if isinstance(node, (ast.Attribute, ast.Name)):
            offset = to_offsets(
                function,
                [(node.lineno - 1, node.end_lineno - 1, node.col_offset,
                  node.end_col_offset, "new_var")],
                as_bytes=True)
            descendants.append(
                (function[offset[-1][0]:offset[-1][1]], offset[-1]))
        elif isinstance(node, (ast.Subscript, ast.Tuple, ast.List)):
            pass  # skip for now
        else:
            # previously raised with an empty message, which made failures
            # impossible to diagnose
            raise Exception(
                f"Unexpected assignment target: {type(node).__name__}")
    return descendants
def get_docstring(body: str):
    """
    Get character-offset ranges of all docstrings in the given source.

    :param body: source code string (must be parseable by ast)
    :return: list of (start, end, "docstring") offset tuples
    """
    body_lines = body.split("\n")
    docstring_ranges = []
    for node in ast.walk(ast.parse(body)):
        try:
            docstring = ast.get_docstring(node)
        except TypeError:
            # ast.get_docstring raises TypeError for node types that cannot
            # carry a docstring; previously a bare `except:` swallowed every
            # error here, including KeyboardInterrupt
            continue
        if docstring is not None:
            docstring_ranges.append((
                node.body[0].lineno - 1,
                node.body[0].end_lineno - 1,  # first line, last line
                0,
                # beginning of first line, end of last line
                len(body_lines[node.body[0].end_lineno - 1]),
                "docstring"))
    # as_bytes is not needed because the offsets are created using len and not
    # the ast package
    return to_offsets(body, docstring_ranges, as_bytes=False)
def get_mentions(function, root, mention):
    """
    Find all mentions of a variable in a function's body.

    :param function: string that contains the function's body
    :param root: the body parsed with the ast package
    :param mention: name of the variable to look for
    :return: list of offsets where the variable is mentioned
    """
    found = []
    for node in ast.walk(root):
        # only plain names count as mentions of a variable
        if isinstance(node, ast.Name) and node.id == mention:
            found.extend(
                to_offsets(
                    function,
                    [(node.lineno - 1, node.end_lineno - 1,
                      node.col_offset, node.end_col_offset, "mention")],
                    as_bytes=True))
    # hack for deduplication: the origin of duplicates is still unknown, and
    # the raw mentions appear to contain false alarms
    return resolve_self_collision(found)
def parse_as_expression(self, node, *args, **kwargs):
    """
    Represent an AST node as an "Expression" mention and emit the
    local_mention edge that links its source text to that mention.

    :param node: AST node covering the expression
    :return: (list with the single edge dict, the expression GNode)
    """
    span = to_offsets(
        self.full_source,
        [(node.lineno - 1, node.end_lineno - 1, node.col_offset,
          node.end_col_offset, "expression")],
        as_bytes=True)
    only_offset, = span  # exactly one offset is expected
    source_text = self.full_source[only_offset[0]:only_offset[1]].replace(
        "@", "##at##")
    name_node = GNode(name=source_text, type="Name")
    # time_ns gives the expression node a (practically) unique name
    expression_node = GNode(name="Expression" + "_" + str(hex(int(time_ns()))),
                            type="mention")
    edge = {
        "scope": copy(self.scope[-1]),
        "src": name_node,
        "dst": expression_node,
        "type": "local_mention",
        "line": node.lineno - 1,
        "end_line": node.end_lineno - 1,
        "col_offset": node.col_offset,
        "end_col_offset": node.end_col_offset
    }
    return [edge], expression_node
def into_offset(range):
    """
    Convert a (line, end_line, col, end_col) range into a (start, end)
    character-offset pair using the enclosing scope's `body`, `cum_lens`
    and `byte2char`.

    NOTE(review): the parameter name shadows the builtin `range`; kept for
    interface compatibility.

    :param range: tuple (line, end_line, col_offset, end_col_offset)
    :return: (start, end) offsets, or None if the conversion fails
    """
    try:
        return to_offsets(body, [(*range, None)],
                          cum_lens=cum_lens,
                          b2c=byte2char,
                          as_bytes=True)[-1][:2]
    except Exception:
        # was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; best-effort behaviour is kept, but only for
        # ordinary exceptions now
        return None
def get_defaults_spans(body):
    """
    Find (start, end) character spans covering "=<default value>" for every
    default argument of the first definition in the given source.

    :param body: source text whose first statement is a function definition
    :return: list of (start, end) offset pairs, each starting at the "=" sign
    """
    signature_args = ast.parse(body).body[0].args
    raw_spans = to_offsets(
        body,
        [(default.lineno - 1, default.end_lineno - 1, default.col_offset,
          default.end_col_offset, "default")
         for default in signature_args.defaults],
        as_bytes=True)
    spans = []
    for begin, finish, _label in raw_spans:
        # walk left from the default value until the "=" sign is included
        while body[begin] != "=":
            begin -= 1
        spans.append((begin, finish))
    return spans
def get_declarations(function_):
    """
    Map variable declarations in a function to the offsets of their mentions.

    :param function_: function source text (leading whitespace allowed)
    :return: dict mapping a declaration offset to the list of mention offsets
        for that variable, all adjusted back to `function_` coordinates
    """
    # parse the left-stripped text, then shift offsets back at the end
    function = function_.lstrip()
    initial_strip = function_[:len(function_) - len(function)]
    root = ast.parse(function)
    declarations = {}
    added = set()  # variable names already attributed to a declaration
    for node in ast.walk(root):
        if isinstance(node, ast.arg):  # function argument
            # TODO
            # not quite sure why this if statement was needed, but there
            # should be no annotations in the code
            if node.annotation is None:
                offset = to_offsets(
                    function,
                    [(node.lineno - 1, node.end_lineno - 1, node.col_offset,
                      node.end_col_offset, "arg")],
                    as_bytes=True)
                # sanity check: the offset must point exactly at the arg name
                assert function[offset[-1][0]:offset[-1][
                    1]] == node.arg, f"{function[offset[-1][0]:offset[-1][1]]} != {node.arg}"
                declarations[offset[-1]] = get_mentions(
                    function, root, node.arg)
                added.add(node.arg)  # mark variable name as seen
        elif isinstance(node, ast.Assign):
            desc = get_descendants(function, node.targets)
            for d in desc:
                if d[0] not in added:
                    # only keep mentions at or after the declaration offset
                    mentions = get_mentions(function, root, d[0])
                    valid_mentions = list(
                        filter(lambda mention: mention[0] >= d[1][0],
                               mentions))
                    declarations[d[1]] = valid_mentions
                    added.add(d[0])
    # shift all offsets back to account for the stripped leading whitespace
    initial_strip_len = len(initial_strip)
    declarations = {
        adjust_offsets2([key], initial_strip_len)[0]:
        adjust_offsets2(val, initial_strip_len)
        for key, val in declarations.items()
    }
    return declarations
def process_body(body, local_occurrences, nodeid2name, f_id, f_start):
    """
    Build the replacement list for a single function body.

    :param body: function source text
    :param local_occurrences: occurrences belonging to this function
    :param nodeid2name: mapping from node ids to names
    :param f_id: function id
    :param f_start: line on which the function starts in the file
    :return: dict with the body, its docstring and the replacement list
    """
    lines = body.split("\n")
    ordered = sort_occurrences(local_occurrences)

    replacements = []
    for _, occurrence in ordered.iterrows():
        # occurrences spanning several lines are skipped
        if occurrence.start_line != occurrence.end_line:
            continue
        line_idx = occurrence.start_line - 1 - f_start
        if line_idx >= len(lines):
            continue
        col_from = occurrence.start_column - 1
        col_to = occurrence.end_column
        extended_range, sourcetrail_name = get_range_for_replacement(
            occurrence, col_from, col_to, lines[line_idx], nodeid2name)
        if extended_range is None:
            continue
        repl_start, repl_end = extended_range
        # (start_line, end_line, start_col, end_col)
        replacements.append(
            (line_idx, line_idx, repl_start, repl_end, sourcetrail_name))

    # deduplicate before converting (line, col) ranges to character offsets
    unique_replacements = list(set(replacements))
    offset_replacements = to_offsets(body, unique_replacements,
                                     as_bytes=False)

    return {
        "id": f_id,
        "body": body,
        "docstring": get_docstring_ast(body),
        "replacement_list": offset_replacements,
    }
def get_function_body(file_content, file_id, start, end, s_col, e_col) -> str:
    """
    Extract a function's body from a file using line/column offsets.

    :param file_content: mapping from file id to file text
    :param file_id: id of the file to extract from
    :param start: first line of the function (0-based)
    :param end: last line (0-based; exclusive when slicing multiple lines)
    :param s_col: start column
    :param e_col: end column
    :return: function body with the first line's leading indentation restored
    """
    # need to extract using offsets because the last line can have extra
    # content in it
    offsets = [(start, end, s_col, e_col, "body")]
    offsets = to_offsets(file_content[file_id], offsets)
    source_lines = file_content[file_id].split("\n")
    if start == end:
        # the entire function takes only one line; wrap it in a list so that
        # body_lines[0] is the whole line (previously body_lines was the bare
        # string, body_lines[0] was its first character, and at most ONE
        # character of indentation could be recovered)
        body_lines = [source_lines[start]]
    else:
        body_lines = source_lines[start:end]
    first_line = body_lines[0]
    initial_strip = first_line[0:len(first_line) - len(first_line.lstrip())]
    body = initial_strip + file_content[file_id][offsets[0][0]:offsets[0][1]]
    return body
def unpack_returns(body: str, labels: pd.DataFrame):
    """
    Use information from the ast package to strip the return type annotation
    from a function body.

    :param body: function source text
    :param labels: DataFrame with information about the return type annotation
    :return: (list of preprocessed return types (normally one),
              list of (start, end) spans to cut from the body)
    """
    if labels is None:
        return [], []

    spans = [
        (row['line'], row['end_line'], row['col_offset'],
         row['end_col_offset'], "returns")
        for _, row in labels.iterrows() if row['name'] == "returns"
    ]

    # most likely do not need to use as_bytes here, because non-unicode
    # usually appears in strings, but type annotations usually appear at the
    # end of a signature and at the beginning of a line
    return_offsets = to_offsets(body, spans, as_bytes=True)

    cuts = []
    ret = []
    fannsymbol = "->"
    for offset in return_offsets:
        begin = offset[0]
        finish = offset[1]
        prefix = body[:begin]
        full_len = len(prefix)
        # walk back over whitespace and line continuations to reach the "->"
        prefix = prefix.rstrip().rstrip("\\").rstrip()
        assert prefix.endswith(fannsymbol)
        cut_from = begin - (full_len - len(prefix)) - len(fannsymbol)
        cuts.append((cut_from, finish))
        ret.append(preprocess(body[begin:finish]))
    return ret, cuts
def _get_from_ast(bodies, node_resolver, bpe_tokenizer_path=None,
                  create_subword_instances=True, connect_subwords=False):
    """
    Extract AST nodes and edges for every function body in `bodies`.

    :param bodies: DataFrame with (at least) columns 'id',
        'body_with_random_replacements', 'random_2_srctrl', 'replacement_list'
    :param node_resolver: resolver used to map GNodes to global node ids
    :param bpe_tokenizer_path: optional path to a BPE model for subwords
    :param create_subword_instances: forwarded to subword replacement
    :param connect_subwords: forwarded to subword replacement
    :return: (ast_nodes, ast_edges, bodies) where bodies gains a
        'graph_node_replacements' column
    """
    ast_edges = None
    bodies_with_replacements = {}
    subword_tokenizer = make_tokenizer(load_bpe_model((bpe_tokenizer_path))) \
        if bpe_tokenizer_path else None
    # NOTE(review): pattern should be a raw string (r"\w+|[^\w\s]") to avoid
    # invalid-escape DeprecationWarnings; left unchanged here
    tokenizer = RegexpTokenizer("\w+|[^\w\s]")
    for ind_bodies, (_, row) in custom_tqdm(
            enumerate(bodies.iterrows()), message="Extracting AST edges",
            total=len(bodies)):
        orig_body = row['body_with_random_replacements']
        if not isinstance(orig_body, str):
            continue
        srctrl2original = get_srctrl2original_replacements(row)
        # parse the left-stripped body; remember how much was stripped so that
        # node offsets can be shifted back later
        c = orig_body.lstrip()
        strip_len = len(orig_body) - len(c)
        try:
            ast.parse(c)
        except SyntaxError as e:
            print(e)
            continue
        replacements = row['random_2_srctrl']
        g = AstGraphGenerator(c)
        edges = g.get_edges()
        if len(edges) == 0:
            continue
        # replacements_lookup = lambda x: complex_replacement_lookup(x, replacements)
        # undo the random replacements in node names; mentions carry a "@"
        # suffix that must be preserved through the lookup
        replacements_lookup = lambda x: \
            GNode(name=random_replacement_lookup(x.name, x.type,
                                                 replacements, tokenizer),
                  type=x.type) if "@" not in x.name else \
            GNode(name=random_replacement_lookup(x.name.split("@")[0], x.type,
                                                 replacements, tokenizer)
                  + "@" + x.name.split("@")[1],
                  type=x.type)
        edges['src'] = edges['src'].apply(replacements_lookup)
        edges['dst'] = edges['dst'].apply(replacements_lookup)
        # resolve nodes against the sourcetrail originals
        resolve = lambda node: node_resolver.resolve(node, srctrl2original)
        edges['src'] = edges['src'].apply(resolve)
        edges['dst'] = edges['dst'].apply(resolve)
        edges = replace_mentions_with_subword_instances(
            edges, subword_tokenizer,
            create_subword_instances=create_subword_instances,
            connect_subwords=connect_subwords)
        # replace GNode objects with numeric node ids
        resolve_node_id = lambda node: node_resolver.resolve_node_id(
            node, row['id'])
        edges['src'] = edges['src'].apply(resolve_node_id)
        edges['dst'] = edges['dst'].apply(resolve_node_id)
        extract_id = lambda node: node.id
        edges['src'] = edges['src'].apply(extract_id)
        edges['dst'] = edges['dst'].apply(extract_id)
        # edges = edges.append(node_resolver.get_mention_edges())
        edges = edges.drop_duplicates(subset=["src", "dst", "type"])
        edges['id'] = 0
        # offsets of AST nodes, shifted back to the unstripped body
        ast_nodes = resolve_self_collision(
            filter_nodes(
                adjust_offsets(
                    to_offsets(c, get_ast_nodes(edges), as_bytes=True),
                    -strip_len), orig_body))
        # offsets of sourcetrail replacement nodes with their global ids
        srctrl_nodes = list(
            map(
                lambda x: (x[0], x[1],
                           node_resolver.resolve(GNode(name=x[2], type="Name"),
                                                 srctrl2original).global_id),
                to_offsets(row['body_with_random_replacements'],
                           format_replacement_offsets(
                               row['replacement_list']))))
        all_offsets = join_offsets(sorted(ast_nodes, key=lambda x: x[0]),
                                   sorted(srctrl_nodes, key=lambda x: x[0]))
        bodies_with_replacements[row['id']] = all_offsets
        # append_edges(path=edges_with_ast_name, edges=edges)
        edges['mentioned_in'] = row['id']
        ast_edges = append_edges(ast_edges=ast_edges, new_edges=edges)
        # print("\r%d/%d" % (ind_bodies, len(bodies['body_normalized'])), end="")
    # print(" " * 30, end="\r")
    bodies['graph_node_replacements'] = bodies['id'].apply(
        lambda id_: bodies_with_replacements.get(id_, None))
    # write_nodes(path=nodes_with_ast_name, node_resolver=node_resolver)
    # ast_nodes = pd.DataFrame(node_resolver.new_nodes)[['id', 'type', 'serialized_name', 'mentioned_in']].astype(
    #     {'mentioned_in': 'Int32'}
    # )
    # NOTE(review): DataFrame.append is deprecated/removed in modern pandas;
    # pd.concat([...]) is the replacement — confirm the pinned pandas version
    ast_edges = ast_edges.append(node_resolver.get_mention_edges())
    ast_edges['id'] = 0
    ast_nodes = node_resolver.new_nodes_for_write()
    ast_edges = ast_edges.rename(
        {
            'src': 'source_node_id',
            'dst': 'target_node_id'
        }, axis=1).astype({'mentioned_in': 'Int32'})
    # assert leaf_nodes_are_leaf_types(ast_nodes, ast_edges)
    leaf_nodes_are_leaf_types(ast_nodes, ast_edges)
    return ast_nodes, ast_edges, bodies