def crawl(self, repository_id, pull_request_id):
    """
    Entry point for this class
    """
    if (repository_id is None) or (pull_request_id is None):
        print("Could not get work items; one of the ids was None")
        print(repository_id)
        print(pull_request_id)
        return
    graph = GraphBuilder().GetNewGraph()
    pull_request = PullRequest.select(graph, pull_request_id).first()
    if pull_request is None:
        print("Could not continue, pull request was not in db")
        return
    url = self.pull_request_workitems_url(repository_id, pull_request.Id)
    data = self.get_data(url)
    if data is None:
        return
    if "value" not in data:
        logging.info("no work items linked")
        return
    for raw in data["value"]:
        work_item = self.make_work_item(raw)
        if work_item is not None:
            self.link_to_pull_request(work_item, pull_request)
            self.fill_in_the_rest(work_item, graph)
            transaction = graph.begin()
            transaction.merge(work_item)
            transaction.graph.push(work_item)
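# For reference, a minimal sketch of what the pull_request_workitems_url helper
# used above could look like. The url shape follows the VSTS "pull request work
# items" REST endpoint; the instance and api_version attributes are assumptions
# borrowed from the repository crawler elsewhere in this repo, not the project's
# confirmed implementation.
def pull_request_workitems_url(self, repository_id, pull_request_id):
    """ Hypothetical helper: url listing the work items linked to a pull request. """
    return ("%s/DefaultCollection/_apis/git/repositories/%s/pullRequests/%s/workitems?api-version=%s"
            % (self.instance, repository_id, pull_request_id, self.api_version))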
def __init__(self):
    """ class init """
    self.graph = GraphBuilder().GetNewGraph()
    self.config = configparser.ConfigParser()
    self.config.read_file(open('default.cfg'))
def crawl(self, project_name, url=None):
    """
    This method is recursive: each response ends with a continuation url
    that has to be followed to get the next batch.
    """
    if project_name is None:
        print("ProjectId is needed to link work items")
        return
    if url is None:
        url = self.get_url(project_name)
    data = self.vsts.make_request(url)
    if data is None:
        return
    if "values" not in data:
        logging.info("no work items linked")
        return
    for raw in data["values"]:
        graph = GraphBuilder().GetNewGraph()
        relationship = self.build_relationship(graph, raw)
        print("adding work item and relationships")
        graph.create(relationship)
    if data.get("nextLink"):
        if data.get("isLastBatch"):
            print("reached the end of linked work items for project " + project_name)
            return
        next_url = data["nextLink"]
        self.crawl(project_name, next_url)
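# A minimal sketch of what get_url might return for the first page. It assumes
# the VSTS "reporting/workItemLinks" endpoint, whose responses carry the
# values / nextLink / isLastBatch fields the crawl above relies on; the
# instance and api_version attributes are assumptions borrowed from the other
# crawlers in this repo.
def get_url(self, project_name):
    """ Hypothetical helper: first page of the work item link feed for a project. """
    return ("%s/DefaultCollection/%s/_apis/wit/reporting/workItemLinks?api-version=%s"
            % (self.instance, project_name, self.api_version))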
def get_pull_request_ids(self, repository_id):
    """
    Get the list of pull request ids for a repository
    """
    graph = GraphBuilder().GetNewGraph()
    qry = "MATCH (n:Repository{{Id:'{}'}})-[]-(r:PullRequest) RETURN r.Id as Id".format(
        repository_id)
    pull_reqs = list(graph.run(qry))
    ids = []
    for record in pull_reqs:
        ids.append(record.get("Id"))
    return ids
def get_repository_ids(self, project_name):
    """
    Get the list of repository ids for a project
    """
    graph = GraphBuilder().GetNewGraph()
    repo_qry = "MATCH (n:Repository)-[]-(p:Project{{Name:'{}'}}) RETURN n.Id as Id".format(
        project_name)
    repo_ids = list(graph.run(repo_qry))
    ids = []
    for record in repo_ids:
        ids.append(record.get("Id"))
    return ids
def __init__(self):
    self.ctpn = CTPN()
    self.parser = OutputParser()
    self.graph_builder = GraphBuilder()
    # load pre-trained weights if a saved model is present
    if exists(join('model', 'ctpn.h5')):
        self.ctpn = tf.keras.models.load_model(join('model', 'ctpn.h5'), compile=False)
def get_pull_request_ids(self, project_name):
    """
    From Neo4j, get all of the pull request ids for a given project.
    """
    graph = GraphBuilder().GetNewGraph()
    qry = '''MATCH (pr:PullRequest)-[]-
             (r:Repository)-[]-(p:Project{{Name:"{}"}})
             RETURN pr.Id as Id'''.format(project_name)
    print(qry)
    raw_pull_request_ids = list(graph.run(qry))
    ids = []
    for item in raw_pull_request_ids:
        ids.append(item.get("Id"))
    # release the raw result list before returning; not strictly necessary in Python
    raw_pull_request_ids = None
    return ids
def crawl(self, raw_data):
    """
    Starts doing the crawling work
    """
    graph = GraphBuilder().GetNewGraph()
    proj = self.map_and_save_project(raw_data, graph)
    if proj is not None:
        self.add_teams_to_repo(proj, graph)
    print("Finished Adding Projects Teams and Users")
def crawl(self, pull_request_id):
    '''
    Crawls the comments and puts them in Neo4j
    '''
    graph = GraphBuilder().GetNewGraph()
    pull_request = PullRequest.select(graph, pull_request_id).first()
    if pull_request is None:
        logging.warning("pull request %s was not in the database", pull_request_id)
        return
    for repo in pull_request.ForRepository:
        self.copy_over_comments(repo.Id, pull_request)
    print("finished adding comments")
def copy_over_comments(self, repository_id, pull_request):
    '''
    Copy VSTS comment threads for a pull request into Neo4j
    '''
    print("adding comments for pull_request_id " + str(pull_request.Id))
    url = self.generate_vsts_url(repository_id, pull_request.Id)
    data = self.get_vsts_comments(url)
    if data is None:
        logging.warning("no comments from vsts for pull request " + str(pull_request.Id))
        return
    for item in data["value"]:
        graph = GraphBuilder().GetNewGraph()
        # a VSTS comment thread, not a Python thread
        thread = self.make_thread_node(item, graph)
        print("working thread " + str(thread.Id))
        for raw_comment in item.get("comments"):
            if self.exclude_system_comments and not self.is_user_comment(raw_comment):
                continue
            thread.PartOf.add(pull_request)
            comment = self.make_comment_node(raw_comment, thread.Id, graph, url)
            print("saving comment " + str(comment.Id))
            graph.merge(comment)
            print("saved comment " + str(comment.Id))
            # pushing the comment should save the thread too
            comment.PartOf.add(thread)
            self.link_to_parent_comment(comment, raw_comment, thread.Id, graph)
            self.link_to_author(comment, raw_comment, graph)
            graph.push(comment)
            print("added links for comment " + str(comment.Id))
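# A minimal sketch of the is_user_comment check assumed above. VSTS marks
# auto-generated thread entries (votes, pushes, policy updates) with a
# commentType of 'system'; anything else is treated here as a human comment.
# The field name and values are assumptions based on the VSTS pull request
# threads payload, not a confirmed part of this project.
def is_user_comment(self, raw_comment):
    """ Hypothetical helper: True when the comment was written by a person. """
    return raw_comment.get("commentType") != "system"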
def crawl(self, project_name):
    """
    Gets repositories for a given project
    """
    url = ("%s/DefaultCollection/%s/_apis/git/repositories?api-version=%s"
           % (self.instance, project_name, self.api_version))
    data = self.vsts.make_request(url)
    for raw_repo in data["value"]:
        graph = GraphBuilder().GetNewGraph()
        repo = Repository()
        repo.Id = raw_repo.get("id")
        repo.Name = raw_repo.get("name")
        repo.Url = raw_repo.get("url")

        raw_proj = raw_repo.get("project")
        proj = Project()
        proj.Id = raw_proj.get("id")
        proj.Name = raw_proj.get("name")
        proj.Url = raw_proj.get("url")

        # todo: may not need to do this. Only create the project node when it is
        # not already in the graph; Project.select(...) returns a selection
        # object, so check .first() for an actual match.
        if Project.select(graph, proj.Id).first() is None:
            proj_tx = graph.begin()
            proj_tx.create(proj)
            proj_tx.commit()

        repo.BelongsTo.add(proj)
        print("Adding Repo: ")
        print(repo.Name)
        transaction = graph.begin()
        transaction.merge(repo)
        transaction.graph.push(repo)
    print("Finished mapping repos")
def crawl(self, project_name):
    '''
    For a single project, gets the pull requests from VSTS and saves them
    to a Neo4j database instance.
    The list of repositories comes from Neo4j, so that import must be done first.
    :param project_name:
    '''
    graph = GraphBuilder().GetNewGraph()
    repo_ids = self.get_repo_ids(graph, project_name)
    for repo_id in repo_ids:
        skip = 0  # part of vsts pagination
        while True:
            url = self.get_vsts_pull_request_url(project_name, repo_id, skip)
            raw_pulls = self.vsts.make_request(url)
            if not self.has_data_to_parse(raw_pulls):
                break
            skip = skip + self.num_per_request  # increment pagination for vsts api call
            for raw_pull_req in raw_pulls["value"]:
                self.map_and_save_pull_request(graph, raw_pull_req)
    print("Ending PullRequest Crawl for Project " + project_name)
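# For reference, a minimal sketch of what get_vsts_pull_request_url might look
# like. It assumes the same instance, api_version, and num_per_request
# attributes used elsewhere in this class; the exact query string (the status
# filter and the $top/$skip paging parameters) is an assumption, not the
# project's confirmed implementation.
def get_vsts_pull_request_url(self, project_name, repository_id, skip):
    """ Hypothetical helper: builds the pull request list url for one page. """
    return ("%s/DefaultCollection/%s/_apis/git/repositories/%s/pullRequests"
            "?api-version=%s&status=all&$top=%d&$skip=%d"
            % (self.instance, project_name, repository_id,
               self.api_version, self.num_per_request, skip))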
class PostProcessingCommands(object):
    """
    Adds extra goodness to the Neo4j data model after the data has been imported.
    """

    def __init__(self):
        """ class init """
        self.graph = GraphBuilder().GetNewGraph()
        self.config = configparser.ConfigParser()
        self.config.read_file(open('default.cfg'))

    @property
    def developer_names(self):
        """ List of developers to add a label for in Neo4j """
        devs = self.config['DEFAULT']['developer_names'].replace(
            '"', '').replace("\r", '').replace("\n", '').split(",")
        return devs

    @property
    def data_developers(self):
        """ List of database developers to add a label for in Neo4j """
        devs = self.config['DEFAULT']['database_developers'].replace(
            '"', '').replace("\r", '').replace("\n", '').split(",")
        return devs

    def add_bug_label(self):
        """
        Finds work items of type Bug and adds the label Bug.
        This makes it easier to query and visualize the various work item types.
        """
        qry = """MATCH (b:WorkItem{WorkItemType:'Bug'})
                 SET b :Bug
                 RETURN count(b)"""
        self.graph.run(qry)
        print("Added Bug label to work items")

    def add_user_story_label(self):
        """
        Finds work items of type User Story and adds the label UserStory.
        This makes it easier to query and visualize the various work item types.
        """
        qry = """MATCH (n:WorkItem{WorkItemType:'User Story'})
                 SET n :UserStory
                 RETURN count(n)"""
        self.graph.run(qry)
        print("Added User Story label to work items")

    def add_tasks_label(self):
        """
        Finds work items of type Task and adds the label Task.
        This makes it easier to query and visualize the various work item types.
        """
        qry = """MATCH (n:WorkItem{WorkItemType:'Task'})
                 SET n :Task
                 RETURN count(n)"""
        self.graph.run(qry)
        print("Added Task label to work items")

    def add_created_timestamp(self):
        """
        Finds all nodes with a CreatedDate and adds a CreatedTimestamp
        """
        qry = """MATCH (n) WHERE exists(n.CreatedDate)
                 SET n.CreatedTimestamp = apoc.date.parse(left(replace(n.CreatedDate,"T"," "),19),"ms","yyyy-MM-dd HH:mm:ss")
                 RETURN count(n) as n"""
        result = self.graph.evaluate(qry)
        print("Added CreatedTimestamps: Records Changed: {}".format(result))

    def add_creation_timestamp(self):
        """
        CreationDate instead of CreatedDate, but sticks with the CreatedTimestamp
        property name. Finds all nodes with a CreationDate and adds a CreatedTimestamp.
        """
        qry = """MATCH (n) WHERE exists(n.CreationDate)
                 SET n.CreatedTimestamp = apoc.date.parse(left(replace(n.CreationDate,"T"," "),19),"ms","yyyy-MM-dd HH:mm:ss")
                 RETURN count(n) as n"""
        result = self.graph.evaluate(qry)
        print("Added CreatedTimestamps for CreationDate: Records Changed: {}".format(result))

    def add_closed_timestamp(self):
        """
        Finds all nodes with a ClosedDate and adds a ClosedTimestamp
        """
        qry = """MATCH (n) WHERE exists(n.ClosedDate)
                 SET n.ClosedTimestamp = apoc.date.parse(left(replace(n.ClosedDate,"T"," "),19),"ms","yyyy-MM-dd HH:mm:ss")
                 RETURN count(n)"""
        result = self.graph.evaluate(qry)
        print("Added ClosedTimestamps: Records Changed: {}".format(result))

    def add_published_timestamp(self):
        """
        Finds all nodes with a PublishedDate and adds a PublishedTimestamp
        """
        qry = """MATCH (n) WHERE exists(n.PublishedDate)
                 SET n.PublishedTimestamp = apoc.date.parse(left(replace(n.PublishedDate,"T"," "),19),"ms","yyyy-MM-dd HH:mm:ss")
                 RETURN count(n)"""
        result = self.graph.evaluate(qry)
        print("Added PublishedTimestamps: Records Changed: {}".format(result))

    def add_developer_label(self):
        """ Given a list of names, adds a label of Developer """
        for dev in self.developer_names:
            qry = """MATCH (n:Person{{Name:"{}"}}) SET n :Developer""".format(dev)
            self.graph.run(qry)
        print("Added Developer labels to dev list")

    def add_database_developer_label(self):
        """ Given a list of names, adds a label of DatabaseDev """
        for dev in self.data_developers:
            qry = """MATCH (n:Person{{Name:"{}"}}) SET n :DatabaseDev""".format(dev)
            self.graph.run(qry)
        print("Added DatabaseDev labels to dev list")

    def run_all_commands(self):
        """ Runs all the commands """
        print("Executing post processing commands")
        self.add_developer_label()
        self.add_database_developer_label()
        self.add_bug_label()
        self.add_user_story_label()
        self.add_tasks_label()
        self.add_created_timestamp()
        self.add_creation_timestamp()
        self.add_closed_timestamp()
        self.add_published_timestamp()
        print("Finished running post processing commands")
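# A minimal usage sketch, an assumption about how PostProcessingCommands is
# meant to be invoked (mirroring the __main__ pattern used elsewhere in this
# repo); it requires default.cfg and the Neo4j instance configured by
# GraphBuilder to be reachable.
if __name__ == '__main__':
    COMMANDS = PostProcessingCommands()
    COMMANDS.run_all_commands()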
def crawl(self, raw_data):
    """
    Starts doing the crawling work
    """
    graph = GraphBuilder().GetNewGraph()
    proj = self.map_and_save_project(raw_data, graph)
    if proj is not None:
        self.add_teams_to_repo(proj, graph)
    print("Finished Adding Projects Teams and Users")


if __name__ == '__main__':
    print("starting Projects Teams and Users")
    # set to False for easier debugging, but it is slower
    RUN_MULTITHREADED = True

    GRAPH = GraphBuilder()
    GRAPH.create_unique_constraints()

    VSTS = VstsInfo(None, None, ignore_cache=True)
    # todo: clean up this signature mess and just pass in VSTS
    WORKER = ProjectsTeamsUsersWorker(VSTS.get_request_settings(), VSTS.project_whitelist, VSTS)
    PROJECTS_URL = WORKER.get_vsts_projects_url()
    RAW = WORKER.vsts.make_request(PROJECTS_URL)
    PROJECTS = RAW["value"]

    if RUN_MULTITHREADED:
        with Pool(5) as p:
            p.map(WORKER.crawl, PROJECTS)
    else:
        for PROJ in PROJECTS:
            WORKER.crawl(PROJ)