def java_is_parsed():
    """Flask endpoint: report whether a Java project has been parsed.

    Expects a JSON POST body with 'github_short_url' and 'parsing_type'
    ('all' or 'packageclassonly'). Returns {'is-parsed': <state>} on
    success, 400 on bad input, 500 on Neo4j failures.
    """
    if not request.json or not 'github_short_url' in request.json or not 'parsing_type' in request.json:
        return jsonify({'Error': 'Must POST JSON request with github_short_url and parsing_type fields'}), 400
    if (request.json['parsing_type'] != 'all'
            and request.json['parsing_type'] != 'packageclassonly'):
        return jsonify({'Error': "Parsing type must either be 'all' or 'packageclassonly'"}), 400
    task = {
        'github_short_url': request.json['github_short_url'],
        'parsing_type': request.json['parsing_type'],
    }
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                # Fixed: was `tasks.get(...)` (undefined name) and passed a
                # stray third argument `parsing_type` that was never defined,
                # so every request raised NameError and returned 500.
                # NOTE(review): other callers invoke
                # neo4j_queries.is_package_parsed(group, artifact); confirm
                # this unqualified transaction function expects
                # (github_short_url, parsing_type).
                result = session.read_transaction(
                    is_package_parsed, task.get('github_short_url'),
                    task.get('parsing_type'))
                return jsonify({'is-parsed': result}), 200
        except Exception:
            traceback.print_exc()
            return jsonify({'Error': 'Error occurred connecting to neo4j'}), 500
        finally:
            driver.close()
    except Exception:
        traceback.print_exc()
        return jsonify({'Error': 'Error occurred fetching neo4j driver'}), 500
def fetch_project(github_short_url):
    """Look a project up in the pom-search-service and record it in Neo4j.

    Queries the depends service for project metadata, creates the project
    vertex, marks it as searched, and parses each returned pom (which in
    turn kicks off dependent searches). All failures are logged and
    swallowed; the function always returns None.
    """
    try:
        driver = utils.get_neo4j()
        depends_service_url = utils.get_depends_service()
        try:
            # Ask the pom-search-service about this repository.
            response = requests.get("{}/java/project/{}?remote=true".format(
                depends_service_url, github_short_url))
            if response.status_code == 200:
                # Decode the JSON payload and pull out the repository name.
                payload = json.loads(response.content)
                repository = payload.get('github_repo_name')
                if repository is not None:
                    with driver.session() as session:
                        # Create the project vertex and flag it as searched.
                        session.write_transaction(
                            neo4j_queries.add_project_node, repository)
                        session.write_transaction(
                            neo4j_queries.add_attribute_to_project,
                            repository, "projectsearch", "True")
                        # Each pom triggers dependent searches for the
                        # artifacts this repository produces.
                        for pom in payload.get('pom'):
                            parse_pom(session, pom, repository, 'high')
        except Exception:
            traceback.print_exc()
        finally:
            driver.close()
    except Exception:
        traceback.print_exc()
    return
def create_parse_project_task():
    """Flask endpoint: queue an AST-parsing job for a GitHub project.

    Expects a JSON POST body with 'github_short_url' and 'parsing_type'
    ('all' or 'packageclassonly'). Marks the project 'queued' in Neo4j and
    enqueues parse_repo on the rq queue. Returns the accepted task as JSON.
    """
    if not request.json or not 'github_short_url' in request.json or not 'parsing_type' in request.json:
        return jsonify({'Error': 'Must POST JSON request with github_short_url and parsing_type fields'}), 400
    if (request.json['parsing_type'] != 'all'
            and request.json['parsing_type'] != 'packageclassonly'):
        return jsonify({'Error': "Parsing type must either be 'all' or 'packageclassonly'"}), 400
    task = {
        'github_short_url': request.json['github_short_url'],
        'parsing_type': request.json['parsing_type'],
    }
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                # check if the project has already been parsed, or if parsing
                # is in progress. If it is, then don't requeue the project
                # for parsing
                result = session.write_transaction(
                    neo4j_queries.retrieve_attribute_value,
                    request.json['github_short_url'], 'ast-parsed')
                if (result == request.json['parsing_type']
                        or result == 'in-progress' or result == 'all'
                        or result == 'queued'):
                    print("AST tree already parsed for project {}".format(
                        request.json['github_short_url']))
                    # Fixed: a bare `return` here made the Flask view return
                    # None (an invalid response). The request is still
                    # idempotently "accepted", so echo the task back.
                    return jsonify({'task': task}), 200
                session.write_transaction(
                    neo4j_queries.add_attribute_to_project,
                    request.json['github_short_url'], 'ast-parsed', 'queued')
        except Exception:
            traceback.print_exc()
        finally:
            driver.close()
    except Exception:
        traceback.print_exc()
        # Fixed: bare `return` (None) is not a valid Flask response.
        return jsonify({'Error': 'Error occurred fetching neo4j driver'}), 500
    job = q.enqueue(parse_repo, task.get('github_short_url'),
                    task.get('parsing_type'), job_timeout=7200)
    return jsonify({'task': task}), 200
def is_ast_parsed(group, project):
    """Flask endpoint: report a project's 'projectsearch' state from Neo4j.

    Returns {'status': 'ok', 'state': <value>} (200), a 400 if the project
    vertex does not exist, or a 500 on Neo4j errors.
    """
    print(group)
    print(project)
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                if session.read_transaction(neo4j_queries.project_exists,
                                            group, project):
                    # NOTE(review): despite the function name, this reads the
                    # 'projectsearch' attribute, not 'ast-parsed' — confirm
                    # that is intentional.
                    ast_parse_state = session.read_transaction(
                        neo4j_queries.retrieve_project_attribute_value,
                        "{}/{}".format(group, project), 'projectsearch')
                    print(ast_parse_state)
                    return jsonify({
                        'status': 'ok',
                        'state': ast_parse_state
                    }), 200
                else:
                    return jsonify({
                        'status': 'ERROR',
                        'reason':
                        'Project does not exist in Neo4j. Have you submitted a parse job to /init/dependents-search/pom yet?'
                    }), 400
        except Exception:
            traceback.print_exc()
            return jsonify({'status': 'SERVER_ERROR'}), 500
        finally:
            # Fixed: was driver.session().close(), which opened (and
            # immediately closed) a brand-new session and leaked the driver.
            driver.close()
    except Exception:
        traceback.print_exc()
        return jsonify({'status': 'SERVER_ERROR'}), 500
def retrieve_project_calls(group, project):
    """Flask endpoint: return every recorded dependency of a project.

    Returns {'status': 'ok', 'data': <rows>} (200), a 400 if the project
    vertex does not exist, or a 500 on Neo4j errors.
    """
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                if session.read_transaction(neo4j_queries.project_exists,
                                            group, project):
                    result = session.read_transaction(
                        neo4j_queries.all_project_dependencies, group,
                        project)
                    return jsonify({'status': 'ok', 'data': result}), 200
                else:
                    return jsonify({
                        'status': 'ERROR',
                        'reason':
                        'Project does not exist in Neo4j. Have you submitted a parse job to /init/dependents-search/pom yet?'
                    }), 400
        except Exception:
            traceback.print_exc()
            return jsonify({'status': 'SERVER_ERROR'}), 500
        finally:
            # Fixed: was driver.session().close() — that created a new
            # session and never closed the driver itself.
            driver.close()
    except Exception:
        traceback.print_exc()
        return jsonify({'status': 'SERVER_ERROR'}), 500
def retrieve_dependents_of_node(group, project):
    """Flask endpoint: return dependents of one AST node within a project.

    Query-string parameters 'label' and 'id' select the node. Returns
    {'status': 'ok', 'data': <rows>} (200), 400 if the project is missing,
    500 on Neo4j errors.
    """
    node_label = request.args.get('label')
    node_id = request.args.get('id')
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                if session.read_transaction(neo4j_queries.project_exists,
                                            group, project):
                    result = session.read_transaction(
                        neo4j_queries.dependents_from_node, group, project,
                        node_label, node_id)
                    return jsonify({'status': 'ok', 'data': result}), 200
                else:
                    return jsonify({
                        'status': 'ERROR',
                        'reason':
                        'Project does not exist in Neo4j. Have you submitted a parse job to /init/dependents-search/pom yet?'
                    }), 400
        except Exception:
            traceback.print_exc()
            return jsonify({'status': 'SERVER_ERROR'}), 500
        finally:
            # Fixed: was driver.session().close() (driver leak).
            driver.close()
    except Exception:
        traceback.print_exc()
        return jsonify({'status': 'SERVER_ERROR'}), 500
def retrieve_hierarchy(group, project):
    """
    This endpoint retrieves all children of the project that are called by
    at least one dependent project.

    Returns {'status': 'ok', 'data': <rows>} (200), 400 if the project is
    missing from Neo4j, 500 on Neo4j errors.
    """
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                if session.read_transaction(neo4j_queries.project_exists,
                                            group, project):
                    result = session.read_transaction(
                        neo4j_queries.project_hierarchy, group, project)
                    return jsonify({'status': 'ok', 'data': result}), 200
                else:
                    return jsonify({
                        'status': 'ERROR',
                        'reason':
                        'Project does not exist in Neo4j. Have you submitted a parse job to /init/dependents-search/pom yet?'
                    }), 400
        except Exception:
            traceback.print_exc()
            return jsonify({'status': 'SERVER_ERROR'}), 500
        finally:
            # Fixed: was driver.session().close() (driver leak).
            driver.close()
    except Exception:
        traceback.print_exc()
        return jsonify({'status': 'SERVER_ERROR'}), 500
def fetch_package_transitive_dependents(group, project):
    """Flask endpoint: transitive dependent statistics for an artifact.

    Combines the searched transitive dependents, the cached dependents
    total, and the searched dependents count into an extrapolated estimate
    of the total transitive dependents. Returns 400 if the artifact is
    missing, 500 on Neo4j errors.
    """
    print(group)
    print(project)
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                if session.read_transaction(neo4j_queries.artifact_exists,
                                            group, project):
                    transitive_artifacts_search = session.read_transaction(
                        neo4j_queries.get_transitive_artifact_dependents,
                        group, project)
                    artifacts_cache_count = session.read_transaction(
                        neo4j_queries.get_artifact_dependents_total_cached,
                        group, project)
                    artifacts_search_count = session.read_transaction(
                        neo4j_queries.get_artifact_dependents_count, group,
                        project)
                    # Extrapolate: (transitive found / searched) * cached
                    # total. NOTE(review): a zero search count or a missing
                    # 'count' key raises here and surfaces as a 500 below —
                    # confirm that is acceptable.
                    estimated_transitive_artifacts_count = int(
                        float(
                            transitive_artifacts_search.get('count') /
                            artifacts_search_count) *
                        float(artifacts_cache_count))
                    return jsonify({
                        'status': 'ok',
                        'transitive-artifacts-search':
                        transitive_artifacts_search,
                        'artifacts-cache': {
                            'count': artifacts_cache_count
                        },
                        'artifacts-search': {
                            'count': artifacts_search_count
                        },
                        'predictions': {
                            'estimated-transitive-artifacts-count':
                            estimated_transitive_artifacts_count
                        }
                    }), 200
                else:
                    return jsonify({
                        'status': 'ERROR',
                        'reason': 'Artifact does not exist in Neo4j'
                    }), 400
        except Exception:
            traceback.print_exc()
            return jsonify({'status': 'SERVER_ERROR'}), 500
        finally:
            # Fixed: was driver.session().close() (driver leak).
            driver.close()
    except Exception:
        traceback.print_exc()
        return jsonify({'status': 'SERVER_ERROR'}), 500
def fetch_dependent_ast(group, project):
    """Flask endpoint: AST subtree of a dependent project that uses a node.

    Query-string parameters: 'group'/'repo' identify the dependent project,
    'label'/'id' identify the sub-node in this project. Returns the AST
    rows (200), 400 if either project is missing, 500 on Neo4j errors.
    """
    print(group)
    print(project)
    dependent_group = request.args.get('group')
    print(dependent_group)
    dependent_repo = request.args.get('repo')
    print(dependent_repo)
    sub_node_label = request.args.get('label')
    sub_node_id = request.args.get('id')
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                # Both the parent project and the dependent must exist.
                if session.read_transaction(
                        neo4j_queries.project_exists, group,
                        project) and session.read_transaction(
                            neo4j_queries.project_exists, dependent_group,
                            dependent_repo):
                    ast_result = session.read_transaction(
                        neo4j_queries.ast_tree_dependent, group, project,
                        dependent_group, dependent_repo, sub_node_label,
                        sub_node_id)
                    return jsonify({
                        'status': 'ok',
                        'ast': ast_result,
                    }), 200
                else:
                    return jsonify({
                        'status': 'ERROR',
                        'reason': 'Artifact does not exist in Neo4j'
                    }), 400
        except Exception:
            traceback.print_exc()
            return jsonify({'status': 'SERVER_ERROR'}), 500
        finally:
            # Fixed: was driver.session().close() (driver leak).
            driver.close()
    except Exception:
        traceback.print_exc()
        return jsonify({'status': 'SERVER_ERROR'}), 500
def are_artifacts_parsed(group, project):
    """Flask endpoint: dependent-search state of every artifact of a project.

    Annotates each package of the project with a 'search-state' taken from
    its 'dependentsearch' attribute ('not-searched' when absent). Returns
    400 if the project is missing, 500 on Neo4j errors.
    """
    print(group)
    print(project)
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                if session.read_transaction(neo4j_queries.project_exists,
                                            group, project):
                    artifacts = session.read_transaction(
                        neo4j_queries.get_project_packages, group, project)
                    for artifact in artifacts:
                        ast_parse_state = session.read_transaction(
                            neo4j_queries.retrieve_artifact_attribute_value,
                            artifact.get("group"), artifact.get("artifact"),
                            'dependentsearch')
                        if ast_parse_state is None:
                            artifact["search-state"] = "not-searched"
                        else:
                            artifact["search-state"] = ast_parse_state
                    return jsonify({
                        'status': 'ok',
                        'artifacts': artifacts
                    }), 200
                else:
                    return jsonify({
                        'status': 'ERROR',
                        'reason':
                        'Project does not exist in Neo4j. Have you submitted a parse job to /init/dependents-search/pom yet?'
                    }), 400
        except Exception:
            traceback.print_exc()
            return jsonify({'status': 'SERVER_ERROR'}), 500
        finally:
            # Fixed: was driver.session().close() (driver leak).
            driver.close()
    except Exception:
        traceback.print_exc()
        return jsonify({'status': 'SERVER_ERROR'}), 500
def fetch_project_dependents(group, project):
    """Flask endpoint: dependents of a project plus the cached total count.

    Returns {'projects-search': <rows>, 'projects-cache': {'count': n}}
    (200), 400 if the project is missing, 500 on Neo4j errors.
    """
    print(group)
    print(project)
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                if session.read_transaction(neo4j_queries.project_exists,
                                            group, project):
                    projects_search = session.read_transaction(
                        neo4j_queries.get_project_dependents, group, project)
                    projects_search_count = session.read_transaction(
                        neo4j_queries.get_project_dependents_total_cached,
                        group, project)
                    return jsonify({
                        'status': 'ok',
                        'projects-search': projects_search,
                        'projects-cache': {
                            'count': projects_search_count
                        }
                    }), 200
                else:
                    return jsonify({
                        'status': 'ERROR',
                        'reason':
                        'Project does not exist in Neo4j. Have you submitted a parse job to /init/dependents-search/pom yet?'
                    }), 400
        except Exception:
            traceback.print_exc()
            return jsonify({'status': 'SERVER_ERROR'}), 500
        finally:
            # Fixed: was driver.session().close() (driver leak).
            driver.close()
    except Exception:
        traceback.print_exc()
        return jsonify({'status': 'SERVER_ERROR'}), 500
def create_parse_package_task():
    """Flask endpoint: queue a dependent-search job for a Maven artifact.

    Expects a JSON POST body with 'group', 'artifact', 'start', 'end' and
    'parent' (and optionally 'priority'). Guards against re-searching and
    enqueues fetch_package on the priority-appropriate rq queue.
    """
    # Fixed: the original returned a bare `400` int, which is not a valid
    # Flask response and raised a server-side TypeError.
    if not request.json:
        return jsonify({'Error': 'Must POST JSON request'}), 400
    if not 'group' in request.json or not 'artifact' in request.json or not 'start' in request.json or not 'end' in request.json or not 'parent' in request.json:
        return jsonify({'Error': 'Must POST JSON request with group, artifact, start, end and parent fields'}), 400
    task = {
        'group': request.json['group'],
        'artifact': request.json['artifact'],
        'start': request.json['start'],
        'end': request.json['end'],
        'priority': request.json.get('priority'),
        'parent': request.json['parent']
    }
    driver = utils.get_neo4j()
    try:
        with driver.session() as session:
            # Guard against re-searching the same repository. This is an
            # imperfect solution, as it only guards against the parsing
            # being initiated again with a start value for the records to be
            # retrieved during search <= 1. It doesn't guard against a
            # search which overlaps with a previous search. For example, it
            # doesn't guard against the following:
            #   1. search request received to search identified repositories
            #      1 to 100
            #   2. search request received to search identified repositories
            #      50 to 100.
            # In this instance, the repositories 50 to 100 will be searched
            # twice.
            parsed = session.read_transaction(neo4j_queries.is_package_parsed,
                                              task.get('group'),
                                              task.get('artifact'))
            if (parsed == "completed"):
                print("Parsing already completed on repository")
                return jsonify({'state': "completed"}), 202
            if (parsed == "in-progress" and task.get("start") <= 1):
                print("Parsing already in progress on repository")
                return jsonify({'state': "in-progress"}), 201
    except Exception:
        traceback.print_exc()
    finally:
        driver.close()
    # Fixed: use job_timeout (as the project-parse endpoint does) rather
    # than the deprecated rq `timeout` kwarg.
    if (task.get('priority') == 'high'):
        job = q_medium.enqueue(fetch_package, task.get('group'),
                               task.get('artifact'), task.get('start'),
                               task.get('end'), task.get('parent'),
                               task.get('priority'), job_timeout=3600)
    else:
        job = q_low.enqueue(fetch_package, task.get('group'),
                            task.get('artifact'), task.get('start'),
                            task.get('end'), task.get('parent'),
                            task.get('priority'), job_timeout=3600)
    return jsonify({'task': task}), 200
def parse_repo(git_short_url, parsing_type):
    """Worker job: clone a GitHub repo, parse its Java AST, load into Neo4j.

    Args:
        git_short_url: GitHub "owner/repo" short URL of the project.
        parsing_type: 'all' or 'packageclassonly' — passed through to the
            java parser CLI's -t flag.

    The project's 'ast-parsed' attribute in Neo4j tracks progress:
    'in-progress' -> 'failed' or the parsing_type value on success.
    All errors are printed and swallowed; the function returns None.
    """
    print("Parsing project: " + git_short_url)
    if (parsing_type != 'all' and parsing_type != 'packageclassonly'):
        print("ERROR: parsing type must either be 'all' or 'packageclassonly'")
        print(parsing_type)
        return
    # add an attribute to the project, describing the parsing that has taken place
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                # TODO: a potential race condition exists here. If two jobs are
                # submitted to parse a repo, and both are accepted by two separate
                # parsing workers at the same time, then both may query the parsing
                # state at the same time, and both may carry out parsing. This
                # won't actually cause any errors, but it does waste resources
                result = session.write_transaction(
                    neo4j_queries.retrieve_attribute_value, git_short_url,
                    'ast-parsed')
                if (result == parsing_type or result == 'in-progress'
                        or result == 'all'):
                    print("AST tree already parsed for project {}".format(
                        git_short_url))
                    return
                result = session.write_transaction(
                    neo4j_queries.update_attribute_of_project, git_short_url,
                    'ast-parsed', 'in-progress')
        except Exception:
            traceback.print_exc()
        finally:
            driver.close()
    except:
        traceback.print_exc()
        return
    # create a new temporary directory (tempdir is a context manager defined
    # elsewhere in this project; it yields the directory path)
    with tempdir() as dirpath:
        print("using dir " + dirpath)
        os.chdir(dirpath)
        clone_path = "git clone https://github.com/{}.git".format(
            git_short_url)
        print(clone_path)
        os.system(clone_path)  # clone the git repo
        cloned_dir = os.listdir(dirpath)
        # expect exactly one entry: the freshly cloned repo directory
        if (len(cloned_dir) == 1):
            os.chdir(dirpath + "/" + cloned_dir[0])
            os.system("mvn dependency:copy-dependencies"
                      )  # download all dependencies to ./target/dependencies
            # Run parser
            print("Parsing Java Project to produce AST tree")
            result = os.system(
                "java -Xmx1g -jar /java-parser/target/java_parser_cli.jar-jar-with-dependencies.jar -i {} -j {}/{} -s {}/{} -l cypher -t {} -o {}/output.cypher"
                .format(git_short_url, dirpath, cloned_dir[0], dirpath,
                        cloned_dir[0], parsing_type, dirpath))
            if (result != 0):
                print("Error occurred parsing AST tree")
                # mark the project's parse as failed before bailing out
                try:
                    driver = utils.get_neo4j()
                    try:
                        with driver.session() as session:
                            session.write_transaction(
                                neo4j_queries.update_attribute_of_project,
                                git_short_url, 'ast-parsed', 'failed')
                    except Exception:
                        traceback.print_exc()
                    finally:
                        driver.close()
                except:
                    traceback.print_exc()
                    return
                return
            print("Exporting to neo4j")
            # Export output to neo4j, subprocess.call returns the status code
            # of the call
            export_to_neo4j_output = subprocess.call(
                "set -eo pipefail; cat {}/output.cypher | cypher-shell -a $NEO4J_IP -u $NEO4J_USER -p $NEO4J_PASS"
                .format(dirpath),
                shell=True,
                executable='/bin/bash')
            if (export_to_neo4j_output != 0):
                # export failed: record the failure state before returning
                try:
                    driver = utils.get_neo4j()
                    try:
                        with driver.session() as session:
                            session.write_transaction(
                                neo4j_queries.update_attribute_of_project,
                                git_short_url, 'ast-parsed', 'failed')
                    except Exception:
                        traceback.print_exc()
                    finally:
                        driver.close()
                except:
                    traceback.print_exc()
                    return
                print("Error occurred adding call graph to Neo4j")
                return
    # add an attribute to the project, describing the parsing that has taken place
    try:
        driver = utils.get_neo4j()
        try:
            with driver.session() as session:
                print("Completed parsing of project")
                session.write_transaction(
                    neo4j_queries.update_attribute_of_project, git_short_url,
                    'ast-parsed', parsing_type)
        except Exception:
            traceback.print_exc()
        finally:
            driver.close()
    except:
        traceback.print_exc()
        return
    return
def fetch_package(package_group, package_artifact, search_start, search_end,
                  parent_project, continuation_priority):
    """Worker job: search for dependents of a Maven artifact and store them.

    Args:
        package_group: Maven groupId of the artifact being searched.
        package_artifact: Maven artifactId of the artifact being searched.
        search_start: index of the first dependent record to request.
        search_end: index past the last dependent record to request.
        parent_project: project node that dependents are linked to in Neo4j.
        continuation_priority: priority forwarded to the queued continuation
            request for the next page of results.

    The artifact's 'dependentsearch' attribute moves through
    'in-progress' -> 'completed' or 'failed'. Results are paginated: if
    more dependents remain, a follow-up task is POSTed to the queue
    manager. Returns None in every case.
    """
    # validate that all arguments are correct
    if (package_group == None or package_artifact == None or search_start < 0
            or search_end < search_start):
        print("Cannot parse package " + str(package_group or '') + "." +
              str(package_artifact or ''))
        return
    try:
        driver = utils.get_neo4j()
        depends_service_url = utils.get_depends_service()
        queue_manager = utils.get_queue_manager()
        try:
            with driver.session() as session:
                try:
                    # don't carry out parsing if the search is already completed
                    parsed = session.read_transaction(
                        neo4j_queries.is_package_parsed, package_group,
                        package_artifact)
                    if (parsed == "completed"):
                        print("Package " + package_group + "." +
                              package_artifact + " at parsing state " + parsed)
                        return
                    print("Parsing package " + package_group + "." +
                          package_artifact)
                    session.write_transaction(
                        neo4j_queries.add_attribute_to_artifact,
                        package_group, package_artifact, "dependentsearch",
                        "in-progress")
                    # search for dependents using the pom-search-service
                    response = requests.get(
                        "{}/java/package/{}/{}/dependents/local?pom=true&start={}&end={}"
                        .format(depends_service_url, package_group,
                                package_artifact, search_start,
                                search_end - search_start))
                    if (response.status_code != 200):
                        print("Couldn't retrieve dependents from dependents service")
                        return
                    parsed_response = json.loads(response.content)
                    print("Received packages from dependents service " +
                          package_group + "." + package_artifact)
                    total_count = parsed_response.get("total_count")
                    # returned repositories are paginated - meaning that only a
                    # subset of all are returned at any one time. If the current
                    # end to the paginated search is less than the total count,
                    # then send a request to the queue manager to carry out the
                    # search with the next set of paginated results
                    if (search_end < total_count):
                        # Identify the new end position for pagination,
                        # maintaining the same number of returned results as in
                        # this search
                        new_search_end = search_end + (search_end -
                                                       search_start)
                        if (new_search_end > total_count):
                            new_search_end = total_count
                        # request the next search
                        next_parse_response = requests.post(
                            "http://{}/dependents/package".format(
                                queue_manager),
                            json={
                                'group': package_group,
                                'artifact': package_artifact,
                                'start': search_end,
                                'end': new_search_end,
                                'parent': parent_project,
                                'priority': continuation_priority
                            })
                        if (next_parse_response.status_code != 200):
                            print("Error occurred trying to request parsing of next section")
                    # store the total count of dependents to the artifact
                    session.write_transaction(
                        neo4j_queries.add_attribute_to_artifact,
                        package_group, package_artifact, "total_count",
                        total_count)
                    # for each identified dependent project, add it to Neo4j
                    project_count = search_start
                    for project in parsed_response.get("projects"):
                        repo_name = project.get('github_repo_name')
                        session.write_transaction(
                            neo4j_queries.add_project_node, repo_name)
                        session.write_transaction(
                            neo4j_queries.add_project_node, parent_project)
                        session.write_transaction(
                            neo4j_queries.add_project_depends_project_edge,
                            repo_name, parent_project)
                        for pom in project.get("pom"):
                            parse_pom(session, pom, repo_name, 'low')
                        project_count = project_count + 1
                    # if search_end >= total_count, then add attribute to
                    # package stating it has been parsed.
                    if (search_end >= total_count):
                        print("adding")
                        session.write_transaction(
                            neo4j_queries.update_attribute_of_artifact,
                            package_group, package_artifact,
                            "dependentsearch", "completed")
                        print("added")
                except Exception:
                    # any failure inside the search marks the artifact failed
                    traceback.print_exc()
                    session.write_transaction(
                        neo4j_queries.update_attribute_of_artifact,
                        package_group, package_artifact, "dependentsearch",
                        "failed")
        except Exception:
            traceback.print_exc()
        finally:
            driver.close()
        return
    except:
        print("Error occurred parsing package " + str(package_group or '') +
              "." + str(package_artifact or ''))
        return
def createTreeFromEdges(edges, vertices, group, project, sub_node_label,
                        sub_node_id):
    """Build a usage-annotated forest (tree) from AST edges and vertices.

    Args:
        edges: iterable of (parent_id, child_id) pairs.
        vertices: mapping of vertex id -> neo4j Node (internal _labels /
            _properties are read).
        group, project: identify the project whose usage data is queried.
        sub_node_label, sub_node_id: select the sub-node for the
            dependent_method_usage query.

    Returns:
        A list of root nodes; each node dict carries usage counts and a
        'children' list, forming the hierarchy.
    """
    nodes = {}
    forest = []
    driver = utils.get_neo4j()
    with driver.session() as session:
        result = session.read_transaction(dependent_method_usage, group,
                                          project, sub_node_label,
                                          sub_node_id)
        node_usages = {}
        # Flatten each usage record into a dict keyed by the node's 'id'
        # property.
        for record in result:
            node = record.get('node')
            object_to_return = {}
            object_to_return['label'] = list(getattr(node, '_labels'))[0]
            object_to_return['id'] = getattr(node, '_properties').get('id')
            object_to_return['usage'] = record.get("usage")
            object_to_return['project'] = getattr(record.get("proj"),
                                                  '_properties').get('id')
            object_to_return['distinct_usage'] = record.get("usage_dist")
            object_to_return['properties'] = getattr(node, '_properties')
            object_to_return['name'] = "{}: {}".format(
                object_to_return.get('label'), object_to_return.get('id'))
            node_usages[object_to_return['id']] = object_to_return
    # Fixed: was driver.session().close(), which opened a brand-new session
    # and closed it, leaving the driver itself open.
    driver.close()
    # Build a node dict per vertex, annotated with its usage counts.
    for node_id in vertices.keys():
        nodes[node_id] = {
            'id': node_id,
            "name": getattr(vertices[node_id], '_properties').get('id'),
            "properties": getattr(vertices[node_id], '_properties'),
            "label": list(getattr(vertices[node_id], '_labels'))[0],
            "size": 1,
            "children": []
        }
        # renamed from `id` (shadowed the builtin)
        prop_id = nodes[node_id]["properties"]["id"]
        nodes[node_id]["id"] = getattr(vertices[node_id],
                                       '_properties').get('id')
        nodes[node_id]["name"] = nodes[node_id]["id"].split('.')[-1]
        nodes[node_id]["size"] = node_usages[prop_id]["usage"]
        nodes[node_id]["value"] = node_usages[prop_id]["usage"]
        nodes[node_id]["usage"] = node_usages[prop_id]["usage"]
        nodes[node_id]["distinct_usage"] = node_usages[prop_id][
            "distinct_usage"]
        nodes[node_id]["label"] = node_usages[prop_id]["label"]
        nodes[node_id]["project"] = node_usages[prop_id]["project"]
        forest.append(nodes[node_id])
    # Must remove shortest paths to ensure there is no duplication. Java
    # hierarchy naming can be used to ensure only direct children are linked.
    # The issue is that project names at the top level don't follow this
    # pattern, so extra work must be done to identify which children are
    # direct children of the project level.
    roots = []
    for i in edges:
        parent_id, child_id = i
        if nodes[parent_id]["project"] == nodes[parent_id]["id"]:
            roots.append(nodes[child_id]["id"])
    # Drop any root that is nested under another root (prefix match).
    tmp = roots.copy()
    for outer_root in tmp:
        roots = [
            root for root in roots
            if not root.startswith(outer_root) or root == outer_root
        ]
    # Link each child under its parent: either a direct child of the project
    # level (must be a surviving root) or a direct child by dotted-name
    # hierarchy. NOTE(review): `and` binds tighter than `or` here — the
    # original relied on that precedence, preserved as-is.
    for i in edges:
        parent_id, child_id = i
        if nodes[parent_id]["project"] == nodes[parent_id]["id"] and nodes[
                child_id]["id"] in roots or nodes[parent_id][
                    "id"] + '.' + nodes[child_id]["id"].split(
                        ".")[-1] == nodes[child_id]["id"]:
            node = nodes[child_id]
            parent = nodes[parent_id]
            parent['children'].append(node)
            if (node in forest):
                forest.remove(node)
    # forest is now a graph, with a single root vertex
    return forest