import hashlib

import bblfsh


def hash_node(node: bblfsh.Node, ignore_sideness: bool = True) -> hashlib._hashlib.HASH:
    """Hash a node, ignoring positional information."""
    lroles = [str(i) for i in node.roles
              if i not in (bblfsh.role_id("LEFT"), bblfsh.role_id("RIGHT"))]
    _hash = hashlib.md5()
    stuff = [node.internal_type, node.token] + lroles
    for prop, value in sorted(node.properties.items()):
        # Parenthesize the "or": without the parentheses, values containing "right"
        # were skipped even when ignore_sideness was False, because "and" binds
        # tighter than "or".
        if ignore_sideness and ("left" in value.lower() or "right" in value.lower()):
            continue
        stuff.append(prop)
        stuff.append(value)
    child_hashes = []
    for child in node.children:
        child_hashes.append(hash_node(child, ignore_sideness).hexdigest().encode("utf-8"))
    stuff.extend(sorted(child_hashes))
    for s in stuff:
        _hash.update(str(s).encode("utf-8"))
    return _hash
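# Hedged usage sketch for hash_node above: group identifier subtrees by digest to
# show that positions do not affect the hash. The XPath query and the helper name
# are assumptions made for this example, not part of the original code.
import collections

import bblfsh


def group_identical_subtrees(uast: bblfsh.Node) -> dict:
    by_digest = collections.defaultdict(list)
    for node in bblfsh.filter(uast, "//*[@roleIdentifier]"):
        # structurally identical subtrees on different lines share one digest
        by_digest[hash_node(node).hexdigest()].append(node.start_position.line)
    return {d: lines for d, lines in by_digest.items() if len(lines) > 1}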
import bblfsh

import utils


def check(uast):
    findings = []
    binexpr_nodes = bblfsh.filter(uast, "//InfixExpression[@roleBinary and @roleExpression]")
    for node in binexpr_nodes:
        left = None
        right = None
        for c in node.children:
            if bblfsh.role_id("LEFT") in c.roles:
                left = c
            elif bblfsh.role_id("RIGHT") in c.roles:
                right = c
            elif c.token in ["=", "*", "+"]:
                # operators for which equal operands are common and not
                # necessarily a bug; skip this expression entirely
                left = None
                right = None
                break
            if left and right:
                break
        if not left or not right:
            continue
        if utils.hash_node(left).hexdigest() == utils.hash_node(right).hexdigest():
            findings.append({"msg": "Equal terms on both sides of binary expression",
                             "pos": node.start_position})
    return findings
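# Hedged driver sketch: run the check above over a single file. The bblfshd
# endpoint and the file name are placeholders, not part of the original snippet.
import bblfsh

client = bblfsh.BblfshClient("0.0.0.0:9432")
uast = client.parse("Example.java").uast
for finding in check(uast):
    print("%s at line %d" % (finding["msg"], finding["pos"].line))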
def generate_typos_fixes(self, changes: Sequence[Change]) -> Iterator[TypoFix]:
    """
    Generate all the data about typo fixes required for any type of further processing.

    The processing can be comment generation or performance report generation.

    :param changes: The list of changes in the pointed state.
    :return: Iterator with unrendered data per comment.
    """
    base_files_by_lang = files_by_language(c.base for c in changes)
    head_files_by_lang = files_by_language(c.head for c in changes)
    for lang, head_files in head_files_by_lang.items():
        for file in filter_files(files=head_files,
                                 line_length_limit=self.config["line_length_limit"],
                                 overall_size_limit=self.config["overall_size_limit"],
                                 log=self._log):
            try:
                prev_file = base_files_by_lang[lang][file.path]
            except KeyError:
                lines = []
                old_identifiers = set()
            else:
                lines = find_new_lines(prev_file, file)
                old_identifiers = {
                    node.token for node in uast2sequence(prev_file.uast)
                    if bblfsh.role_id("IDENTIFIER") in node.roles
                    and bblfsh.role_id("IMPORT") not in node.roles and node.token
                }
            changed_nodes = extract_changed_nodes(file.uast, lines)
            new_identifiers = [
                node for node in changed_nodes
                if bblfsh.role_id("IDENTIFIER") in node.roles
                and bblfsh.role_id("IMPORT") not in node.roles
                and node.token and node.token not in old_identifiers
            ]
            if not new_identifiers:
                continue
            suggestions = self.check_identifiers([n.token for n in new_identifiers])
            for index, corrections in suggestions.items():
                for token in corrections:
                    yield TypoFix(
                        head_file=file,
                        token=new_identifiers[index].token,
                        candidates=[Candidate(*c[:2]) for c in corrections[token]],
                        line_number=new_identifiers[index].start_position.line,
                    )
import bblfsh


def rule_chk(uast):
    findings = []
    # alternative queries that were tried:
    # query = "//WhileStatement//InfixExpression"
    # query = "//*[@roleWhile and @roleBinary and @roleCondition and @roleExpression]"
    # query = "//*[@roleWhile and @roleStatement and not(@roleBody)]"
    query = ("//*[@roleWhile and @roleStatement and not(@roleBody)]"
             "//*[@roleRelational and @roleExpression and @roleBinary and @roleOperator]")
    print(query)
    # filter_uast is presumably a local helper wrapping bblfsh.filter
    nodes = filter_uast(uast, query)
    for i, n in enumerate(nodes, 1):
        print("Node: {0}".format(i))
        is_left_literal = False
        is_right_literal = False
        left_node_pos = None
        right_node_pos = None
        for j, child in enumerate(n.children, 1):
            print("Iteration {0} for node {1}".format(j, i))
            if bblfsh.role_id("NUMBER") in child.roles and bblfsh.role_id("LEFT") in child.roles:
                is_left_literal = True
                left_node_pos = child.start_position.line
            if bblfsh.role_id("NUMBER") in child.roles and bblfsh.role_id("RIGHT") in child.roles:
                is_right_literal = True
                right_node_pos = child.start_position.line
        if is_left_literal and is_right_literal:
            findings.append({
                "msg": "Number literals found in while condition",
                "left literal at line": left_node_pos,
                "right literal at line": right_node_pos,
            })
    return findings
import bblfsh


def check(uast):
    findings = []
    switches = bblfsh.filter(uast, "//SwitchStatement")
    for i in switches:
        cases = bblfsh.filter(i, "//SwitchCase")
        for c in cases:
            if bblfsh.role_id("DEFAULT") in c.roles:
                break
        else:
            # for/else: the else branch runs only when no case carried the DEFAULT role
            findings.append({"msg": "Switch without default case",
                             "pos": i.start_position})
    return findings
import bblfsh


def check(uast):
    findings = []
    switches = bblfsh.filter(uast, "//SwitchStatement")
    for i in switches:
        cases = list(bblfsh.filter(i, "//SwitchCase"))
        if not cases:
            continue
        for r, c in enumerate(cases):
            if bblfsh.role_id("DEFAULT") in c.roles and r != len(cases) - 1:
                findings.append({"msg": "'default' should be the last switch case",
                                 "pos": c.start_position})
    return findings
import re

import bblfsh


def check(uast):
    findings = []
    format_calls = bblfsh.filter(
        uast,
        "//MethodInvocation/"
        "Identifier[@roleCall and @roleReceiver and @Name='String']/parent::MethodInvocation/"
        "Identifier[@roleCall and @roleCallee and @Name='format']/parent::MethodInvocation")
    for fcall in format_calls:
        args = list(bblfsh.filter(fcall, "//*[@internalRole='arguments']"))
        if len(args) == 0:
            continue
        format_str = args[0]
        if format_str.internal_type != "String":
            # Validating format strings assigned elsewhere in the same file is possible,
            # but we won't do it here for brevity's sake
            continue
        # For the reason stated above, we only validate %d
        str_val = format_str.properties["Value"]
        re_res = re.findall(r"[^%]%d", str_val)
        # Validate the number of args
        if len(re_res) != len(args[1:]):
            findings.append({"msg": "Format string doesn't match number of args",
                             "pos": format_str.start_position})
        # Validate the type of args (for %d the argument should carry the NUMBER role)
        for arg in args[1:]:
            if bblfsh.role_id("NUMBER") not in arg.roles:
                findings.append({"msg": "Format string argument is not numeric",
                                 "pos": arg.start_position})
    return findings
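# Hedged aside on the regex above: r"[^%]%d" misses a "%d" at the very start of
# the format string, after an escaped "%%" run, and in adjacent "%d%d" pairs
# (the consumed preceding character prevents overlapping matches). A stricter
# count, offered as an alternative rather than the original author's code:
import re


def count_format_d(fmt: str) -> int:
    # drop escaped percent signs first, then count the remaining %d directives
    return len(re.findall(r"%d", fmt.replace("%%", "")))


assert count_format_d("%d items, 100%% done, %d left") == 2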
def testRoleIdName(self) -> None:
    self.assertEqual(role_id(role_name(1)), 1)
    self.assertEqual(role_name(role_id("IDENTIFIER")), "IDENTIFIER")
def analyze(self, ptr_from: ReferencePointer, ptr_to: ReferencePointer,
            data_service: DataService, **data) -> List[Comment]:
    """
    Return the list of `Comment`-s - found typo corrections.

    :param ptr_from: The Git revision of the fork point. Exists in both the original and \
                     the forked repositories.
    :param ptr_to: The Git revision to analyze. Exists only in the forked repository.
    :param data_service: The channel to the data service in Lookout server to query for \
                         UASTs, file contents, etc.
    :param data: Extra data passed into the method. Used by the decorators to simplify \
                 the data retrieval.
    :return: List of found review suggestions. Refer to \
             lookout/core/server/sdk/service_analyzer.proto.
    """
    log = self.log
    comments = []
    changes = list(data["changes"])
    base_files_by_lang = files_by_language(c.base for c in changes)
    head_files_by_lang = files_by_language(c.head for c in changes)
    line_length = self.config.get("line_length_limit", self.DEFAULT_LINE_LENGTH_LIMIT)
    for lang, head_files in head_files_by_lang.items():
        for file in filter_files(head_files, line_length, log):
            try:
                prev_file = base_files_by_lang[lang][file.path]
            except KeyError:
                lines = []
                old_identifiers = set()
            else:
                lines = find_new_lines(prev_file, file)
                old_identifiers = {
                    node.token for node in uast2sequence(prev_file.uast)
                    if bblfsh.role_id("IDENTIFIER") in node.roles
                    and bblfsh.role_id("IMPORT") not in node.roles and node.token
                }
            changed_nodes = extract_changed_nodes(file.uast, lines)
            new_identifiers = [
                node for node in changed_nodes
                if bblfsh.role_id("IDENTIFIER") in node.roles
                and bblfsh.role_id("IMPORT") not in node.roles
                and node.token and node.token not in old_identifiers
            ]
            if not new_identifiers:
                continue
            suggestions = self.check_identifiers([n.token for n in new_identifiers])
            for index, corrections in suggestions.items():
                for token in corrections:
                    comment = Comment()
                    comment.file = file.path
                    corrections_line = " " + ", ".join(
                        "%s (%d%%)" % (candidate[0], int(candidate[1] * 100))
                        for candidate in corrections[token])
                    comment.text = "Possible typo in \"%s\". Suggestions:" \
                                   % new_identifiers[index].token + corrections_line
                    comment.line = new_identifiers[index].start_position.line
                    comment.confidence = int(corrections[token][0][1] * 100)
                    comments.append(comment)
    return comments
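# Hedged rendering example for the comment text built above, with made-up data:
# if corrections[token] == [("tokens", 0.93), ("token", 0.05)] for the identifier
# "tokn", the posted comment reads:
#   Possible typo in "tokn". Suggestions: tokens (93%), token (5%)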
from lookout.style.typos.utils import Candidate, Columns, flatten_df_by_column, TEMPLATE_DIR

# TODO(zurk): Split TypoFix to FileFixes and TypoFix. content, path and identifiers_number
# should be in the FileFixes.
TypoFix = NamedTuple("TypoFix", (
    ("content", str),                     # file content from the head revision
    ("path", str),                        # file path from the head revision
    ("line_number", int),                 # line number for the comment
    ("identifier", str),                  # identifier where the typo is found
    ("candidates", Iterable[Candidate]),  # suggested identifiers
    ("identifiers_number", int),          # number of unique analyzed identifiers
))

IDENTIFIER = bblfsh.role_id("IDENTIFIER")
IMPORT = bblfsh.role_id("IMPORT")

IDENTIFIER_INDEX_COLUMN = "identifier_index"


class IdTyposAnalyzer(Analyzer):
    """Identifier typos analyzer."""

    _log = logging.getLogger("IdTyposAnalyzer")
    model_type = IdTyposModel
    name = "lookout.style.typos"
    vendor = "source{d}"
    version = 1
    description = "Corrector of typos in source code identifiers."
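# Hedged construction example for the TypoFix tuple above; every field value is
# made up, and Candidate is assumed to take (token, confidence) positionally.
example_fix = TypoFix(
    content="def tokn():\n    pass\n",
    path="example.py",
    line_number=1,
    identifier="tokn",
    candidates=[Candidate("token", 0.9)],
    identifiers_number=1,
)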
import logging
from pathlib import Path
import re

import bblfsh


def analyze_uast(path: str, root: bblfsh.Node, roles: set, reserved: set):
    contents = Path(path).read_text()

    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if node.token or node.start_position and node.end_position and not node.children:
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = contents.count("\n")
    node_tokens.append(sentinel)

    # scan `node_tokens` and analyze the gaps and the token prefixes and suffixes
    pos = 0
    ws = re.compile(r"\s+")
    alpha = re.compile("[a-zA-Z]+")
    IDENTIFIER = bblfsh.role_id("IDENTIFIER")
    log = logging.getLogger("analyze_uast")

    def ccheck(char: str) -> bool:
        return not char.isspace() and not char.isalnum() and ord(char) < 128

    for node in node_tokens:
        token = node.token if node.token else \
            contents[node.start_position.offset:node.end_position.offset]
        if node.start_position.offset > pos:
            diff = contents[pos:node.start_position.offset]
            parts = ws.split(diff)
            for part in parts:
                if len(part) >= 8:
                    continue
                # for keyword in alpha.finditer(part):
                #     reserved.add(keyword.group())
                for nonalpha in alpha.split(part):
                    for char in nonalpha:
                        if ccheck(char):
                            reserved.add(char)
        if node is sentinel:
            break
        pos = node.end_position.offset
        if IDENTIFIER not in node.roles:
            continue
        outer = contents[node.start_position.offset:node.end_position.offset]
        if outer == token:
            continue
        # use a separate variable here: reusing `pos` would clobber the file-offset
        # cursor that the gap analysis above depends on
        idx = outer.find(token)
        if idx < 0:
            log.warning("skipped %s, token offset corruption \"%s\" vs. \"%s\"",
                        path, token, outer)
            break
        if idx > 0:
            for char in outer[:idx]:
                if ccheck(char):
                    reserved.add(char)
        if idx + len(token) < len(outer):
            for char in outer[idx + len(token):]:
                if ccheck(char):
                    reserved.add(char)
import bblfsh

IDENTIFIER = bblfsh.role_id("IDENTIFIER")
QUALIFIED = bblfsh.role_id("QUALIFIED")
LITERAL = bblfsh.role_id("LITERAL")
OPERATOR = bblfsh.role_id("OPERATOR")
EXPRESSION = bblfsh.role_id("EXPRESSION")
LEFT = bblfsh.role_id("LEFT")
BINARY = bblfsh.role_id("BINARY")
ASSIGNMENT = bblfsh.role_id("ASSIGNMENT")
import bblfsh

IDENTIFIER = bblfsh.role_id("IDENTIFIER")
QUALIFIED = bblfsh.role_id("QUALIFIED")
LITERAL = bblfsh.role_id("LITERAL")
OPERATOR = bblfsh.role_id("OPERATOR")
EXPRESSION = bblfsh.role_id("EXPRESSION")
LEFT = bblfsh.role_id("LEFT")
BINARY = bblfsh.role_id("BINARY")
ASSIGNMENT = bblfsh.role_id("ASSIGNMENT")
FUNCTION = bblfsh.role_id("FUNCTION")
DECLARATION = bblfsh.role_id("DECLARATION")
NAME = bblfsh.role_id("NAME")
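# Hedged usage sketch for the role constants above: yield the names of declared
# functions. The traversal helper and the exact role combination carried by the
# name node are assumptions; role sets vary between language drivers.
import bblfsh


def iter_nodes(root: bblfsh.Node):
    queue = [root]
    while queue:
        node = queue.pop()
        queue.extend(node.children)
        yield node


def function_names(uast: bblfsh.Node):
    for node in iter_nodes(uast):
        if {FUNCTION, DECLARATION, NAME} <= set(node.roles) and node.token:
            yield node.token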
import bblfsh

import utils


def hash_condition(if_node):
    for child in if_node.children:
        if bblfsh.role_id("CONDITION") in child.roles:
            return utils.hash_node(child).hexdigest()
    return None
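# Hedged sketch building on hash_condition above: flag if-statements whose
# condition duplicates an earlier one in the same file. The XPath query and the
# message wording are illustrative assumptions.
import bblfsh


def check_duplicate_conditions(uast):
    findings = []
    seen = {}
    for if_node in bblfsh.filter(uast, "//*[@roleIf and @roleStatement]"):
        digest = hash_condition(if_node)
        if digest is None:
            continue
        if digest in seen:
            findings.append({"msg": "Condition duplicates the one at line %d" % seen[digest],
                             "pos": if_node.start_position})
        else:
            seen[digest] = if_node.start_position.line
    return findings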
import logging
import re

import bblfsh


def analyze_uast(path: str, content: str, root: bblfsh.Node, internal_types: dict,
                 roles: dict, reserved: set):
    """
    Fill the internal types, roles and reserved dictionaries with statistics computed \
    from a UAST.

    :param path: Path of the analyzed file.
    :param content: Content of the analyzed file.
    :param root: UAST of the analyzed file.
    :param internal_types: Dictionary containing the internal types statistics.
    :param roles: Dictionary containing the roles statistics.
    :param reserved: Set collecting the reserved tokens.
    """
    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        internal_types[node.internal_type] += 1
        for role in node.roles:
            roles[role] += 1
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if node.token or node.start_position and node.end_position and not node.children:
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(content)
    sentinel.start_position.line = content.count("\n")
    node_tokens.append(sentinel)

    # scan `node_tokens` and analyze the gaps and the token prefixes and suffixes
    pos = 0
    ws = re.compile(r"\s+")
    alpha = re.compile("[a-zA-Z]+")
    IDENTIFIER = bblfsh.role_id("IDENTIFIER")
    log = logging.getLogger("analyze_uast")

    def ccheck(char: str) -> bool:
        return not char.isspace() and not char.isalnum() and ord(char) < 128

    for node in node_tokens:
        token = node.token if node.token else \
            content[node.start_position.offset:node.end_position.offset]
        if node.start_position.offset > pos:
            diff = content[pos:node.start_position.offset]
            parts = ws.split(diff)
            for part in parts:
                if len(part) >= 8:
                    log.debug("Skipping weird part in code: %s. Path: %s", diff, path)
                    continue
                for nonalpha in alpha.split(part):
                    for char in nonalpha:
                        if ccheck(char):
                            reserved.add(char)
        if node is sentinel:
            break
        pos = node.end_position.offset
        if IDENTIFIER not in node.roles:
            continue
        outer = content[node.start_position.offset:node.end_position.offset]
        if outer == token:
            continue
        # use a separate variable here: reusing `pos` would clobber the file-offset
        # cursor that the gap analysis above depends on
        idx = outer.find(token)
        if idx < 0:
            log.warning("skipped %s, token offset corruption \"%s\" vs. \"%s\"",
                        path, token, outer)
            break
        if idx > 0:
            for char in outer[:idx]:
                if ccheck(char):
                    reserved.add(char)
        if idx + len(token) < len(outer):
            for char in outer[idx + len(token):]:
                if ccheck(char):
                    reserved.add(char)
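# Hedged driver sketch for analyze_uast above: Counter objects satisfy the
# "+= 1 on a missing key" pattern used for the statistics dictionaries. The
# endpoint and the file list are assumptions made for the example.
from collections import Counter
from pathlib import Path

import bblfsh

client = bblfsh.BblfshClient("0.0.0.0:9432")
internal_types, roles, reserved = Counter(), Counter(), set()
for path in ("a.py", "b.py"):
    content = Path(path).read_text()
    analyze_uast(path, content, client.parse(path).uast, internal_types, roles, reserved)
print(internal_types.most_common(5))
print(sorted(reserved))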
def testRoleIdName(self):
    assert role_id(role_name(1)) == 1
    assert role_name(role_id("IDENTIFIER")) == "IDENTIFIER"
``` """ import difflib from itertools import islice import random from typing import Iterator, Optional, List, Sequence, Union import bblfsh import numpy from tokenizer.tokenizer import CodeTokenizer from tokenizer.virtual_node import Position, VirtualNode INDENTATIOS = (" ", "\n", "\t") QUOTES = ("'", '"') LITERAL_ID = bblfsh.role_id("LITERAL") STRING_ID = bblfsh.role_id("STRING") def is_indentation(node: VirtualNode): """ Check if input node is indentation. """ for ch in node.value: if ch not in INDENTATIOS: return False return True def is_literal_string(token): """