def extract_link_in_body(nodes, node, links):
    """Extract links referenced in a node's body text and merge them into links.

    Scans the cleaned body of *node* for two kinds of references to other
    issues/PRs of the same repository:
      1. full GitHub URLs: https://github.com/<owner>/<repo>/pull/<n>
         or .../issues/<n>
      2. shorthand quotes of the form '#<n>'

    For every referenced number that matches a node in *nodes*, a link record
    is built and folded into *links* via detect_dup.

    :param nodes: iterable of node dicts (need 'number', 'url', 'body', 'createdAt')
    :param node:  source node whose body is scanned
    :param links: accumulated link records
    :return: the updated links collection
    """
    cleaned_body = preprocess.clear_body(node['body'])
    # Owner/repo come from the node URL; escape them so characters such as
    # '.' in repository names are matched literally.
    owner = re.escape(node['url'].split('/')[-4])
    repo = re.escape(node['url'].split('/')[-3])

    target_number_list = []
    # BUG FIX: the original patterns used '/+pull+/+[0-9]+', whose stray '+'
    # quantifiers also matched malformed URLs such as '//pullll//123'.
    # Match the exact '/pull/' and '/issues/' path segments instead.
    for kind in ('pull', 'issues'):
        url_pattern = re.compile(
            r'https://github.com/%s/%s/%s/[0-9]+' % (owner, repo, kind))
        for url in url_pattern.findall(cleaned_body):
            target_number_list.append(url.split('/')[-1])

    # Shorthand references such as '#123'.
    for quote in re.findall(r'#[0-9]+', cleaned_body):
        target_number_list.append(quote.replace("#", ""))

    for target_number in target_number_list:
        for target_node in nodes:
            if target_node['number'] != int(target_number):
                continue
            create_time_interval = (
                datetime.strptime(target_node['createdAt'], "%Y-%m-%dT%H:%M:%SZ")
                - datetime.strptime(node['createdAt'], "%Y-%m-%dT%H:%M:%SZ")
            ).total_seconds()
            # Links found in the body are created together with the node
            # itself, so the link time interval is defined as 1 second.
            link_time_interval = 1
            link_type = determine_link_type(
                extract_type_in_url(node['url']),
                extract_type_in_url(target_node['url']))
            link = {
                'source': {
                    'number': node['number'],
                    'url': node['url'],
                    'createdAt': node['createdAt'],
                    'files': get_file(node)
                },
                'target': {
                    # NOTE(review): kept as a string (as in the original) even
                    # though source 'number' is an int — detect_dup may rely
                    # on the existing representation; confirm before changing.
                    'number': target_number,
                    'url': target_node['url'],
                    'createdAt': target_node['createdAt'],
                    'create_time_interval': create_time_interval,
                    'link_time_interval': link_time_interval,
                    'type': link_type,
                    'location': "body",
                    'files': get_file(target_node)
                }
            }
            links = detect_dup(links, link)
    return links
def extract_relations_in_comment(pr_list, issue_list, node, develop_unit):
    """Create graph entities and relations for every comment of *node*.

    For each comment: merges a comment entity linked to *develop_unit*, its
    author (when present) and creation time, then scans the cleaned comment
    body for references to other PRs/issues of the same repository (full
    GitHub URLs or '#<n>' shorthand) and merges a 'linkTo' relationship for
    every number found in *pr_list* / *issue_list*.

    :param pr_list:    known pull-request numbers of the repository
    :param issue_list: known issue numbers of the repository
    :param node:       node whose comments are processed
    :param develop_unit: graph node the comments belong to
    """
    owner = re.escape(node['url'].split('/')[-4])
    repo = re.escape(node['url'].split('/')[-3])
    # BUG FIX: the original '/+pull+/+' patterns also matched malformed URLs
    # such as '//pullll//123'; use exact path segments and escape owner/repo.
    # Patterns are hoisted out of the comment loop (they only depend on node).
    pull_pattern = re.compile(
        r'https://github.com/%s/%s/pull/[0-9]+' % (owner, repo))
    issue_pattern = re.compile(
        r'https://github.com/%s/%s/issues/[0-9]+' % (owner, repo))

    for comment in node['comments']['nodes']:
        comment_node = Node("comment", content=comment['body'])  # comment entity
        g.merge(comment_node, 'comment', "content")
        g.merge(Relationship(develop_unit, "comment", comment_node))
        if comment['author'] is not None:
            author = Node("author", name=comment['author']['login'])  # user entity
            g.merge(author, 'author', "name")
            g.merge(Relationship(author, "create", comment_node))
        time = Node("time", time=comment['createdAt'])  # time entity
        g.merge(time, 'time', "time")
        g.merge(Relationship(comment_node, "time", time))

        cleaned = preprocess.clear_body(comment['body'])
        comment_link = [url.split('/')[-1]
                        for url in pull_pattern.findall(cleaned)
                        + issue_pattern.findall(cleaned)]
        comment_link += [quote.replace("#", "")
                         for quote in re.findall(r'#[0-9]+', cleaned)]

        for target_number in comment_link:
            if target_number == develop_unit['number']:
                continue  # do not link a unit to itself
            for item in pr_list:
                if str(item) == str(target_number):
                    target_node = Node("pullRequest",
                                       number=str(target_number))  # unit entity
                    g.merge(target_node, "pullRequest", 'number')
                    g.merge(Relationship(comment_node, "linkTo", target_node))
                    # BUG FIX: the original 'continue' here was a no-op that
                    # kept scanning the inner list; stop after the match.
                    break
            for item in issue_list:
                if str(item) == str(target_number):
                    target_node = Node("issue",
                                       number=str(target_number))  # unit entity
                    g.merge(target_node, "issue", 'number')
                    g.merge(Relationship(comment_node, "linkTo", target_node))
                    break  # same no-op 'continue' fix as above
def extract_relations_in_body(pr_list, issue_list, node, develop_unit):
    """Create graph entities and relations for the body text of *node*.

    Merges a body entity attached to *develop_unit*, then scans the cleaned
    body for references (full GitHub URLs or '#<n>' shorthand) to other
    PRs/issues of the same repository and merges a 'linkTo' relationship for
    every number found in *pr_list* / *issue_list*.

    :param pr_list:    known pull-request numbers of the repository
    :param issue_list: known issue numbers of the repository
    :param node:       node whose body is processed
    :param develop_unit: graph node the body belongs to
    """
    body = Node("body", content=node['body'])  # body entity
    g.merge(body, 'body', "content")
    g.merge(Relationship(develop_unit, "body", body))

    owner = re.escape(node['url'].split('/')[-4])
    repo = re.escape(node['url'].split('/')[-3])
    cleaned = preprocess.clear_body(node['body'])

    # BUG FIX: the original '/+pull+/+[0-9]+' patterns also matched malformed
    # URLs such as '//pullll//123'; match the exact '/pull/' and '/issues/'
    # path segments and escape owner/repo.
    body_link = []
    for kind in ('pull', 'issues'):
        pattern = re.compile(
            r'https://github.com/%s/%s/%s/[0-9]+' % (owner, repo, kind))
        body_link += [url.split('/')[-1] for url in pattern.findall(cleaned)]
    body_link += [quote.replace("#", "")
                  for quote in re.findall(r'#[0-9]+', cleaned)]

    for target_number in body_link:
        for item in pr_list:
            if str(item) == str(target_number):
                target_node = Node("pullRequest",
                                   number=str(target_number))  # unit entity
                g.merge(target_node, "pullRequest", 'number')
                g.merge(Relationship(body, "linkTo", target_node))
                # BUG FIX: the original 'continue' here was a no-op that kept
                # scanning the inner list; stop after the match.
                break
        for item in issue_list:
            if str(item) == str(target_number):
                target_node = Node("issue",
                                   number=str(target_number))  # unit entity
                g.merge(target_node, "issue", 'number')
                g.merge(Relationship(body, "linkTo", target_node))
                break  # same no-op 'continue' fix as above
def _cross_reference_link(item, location, source_files, target_files,
                          create_time_interval, link_time_interval, link_type):
    """Build one link record for a crossReference timeline item."""
    return {
        'source': {
            'number': item['source']['number'],
            'url': item['source']['url'],
            'createdAt': item['source']['createdAt'],
            'files': source_files
        },
        'target': {
            'number': item['target']['number'],
            'url': item['target']['url'],
            'createdAt': item['target']['createdAt'],
            'create_time_interval': create_time_interval,
            'link_time_interval': link_time_interval,
            'type': link_type,
            'location': location,
            'files': target_files
        }
    }


def extract_link_in_crossReference(nodes, node, links):
    """Handle crossReference timeline items: find the source given the target.

    For every crossReference event on *node* (the target), locates the source
    node in *nodes*, determines whether the '#<n>' reference appears in the
    source's body or in one of its comments, and merges the resulting link
    record into *links* via detect_dup.

    :param nodes: iterable of node dicts to search for the source
    :param node:  target node carrying the timelineItems
    :param links: accumulated link records
    :return: the updated links collection
    """
    for item in node['timelineItems']['nodes']:
        if not item:
            continue
        if "source" not in item:
            # Exclude ReferencedAt events, which carry no source.
            continue
        source_number = item['source']['number']
        target_number = item['target']['number']
        source_created = datetime.strptime(item['source']['createdAt'],
                                           "%Y-%m-%dT%H:%M:%SZ")
        target_created = datetime.strptime(item['target']['createdAt'],
                                           "%Y-%m-%dT%H:%M:%SZ")
        create_time_interval = (target_created - source_created).total_seconds()
        # Seconds from the creation of the younger endpoint until the
        # reference event itself.
        link_time_interval = (
            datetime.strptime(item['referencedAt'], "%Y-%m-%dT%H:%M:%SZ")
            - max(source_created, target_created)
        ).total_seconds()
        target_file_path = get_file(node)
        link_type = determine_link_type(
            extract_type_in_url(item['source']['url']),
            extract_type_in_url(item['target']['url']))
        # BUG FIX: the original pattern r'#+%s' had no trailing boundary, so
        # searching for '#12' also matched the prefix of '#123'; the (?![0-9])
        # lookahead rejects a following digit. The leading '#+' (one or more
        # '#') is kept as in the original.
        quote_pattern = re.compile(r'#+%s(?![0-9])' % str(target_number))

        # Find the source node; location is either "body" or "comment",
        # decided by where the quote pattern matches.
        for source_node in nodes:
            if source_node['number'] != source_number:
                continue
            source_file_path = get_file(source_node)
            # Reference appears in the source body.
            if quote_pattern.search(preprocess.clear_body(source_node['body'])):
                links = detect_dup(links, _cross_reference_link(
                    item, "body", source_file_path, target_file_path,
                    create_time_interval, link_time_interval, link_type))
            # Reference appears in a source comment.
            # TODO(review): original note says no comment record was ever
            # produced here — investigate why.
            for comment in source_node['comments']['nodes']:
                if quote_pattern.search(preprocess.clear_body(comment['body'])):
                    links = detect_dup(links, _cross_reference_link(
                        item, "comment", source_file_path, target_file_path,
                        create_time_interval, link_time_interval, link_type))
    return links