Example #1
File: RQ123.py  Project: SuShu19/GNN_RS
import re
from datetime import datetime

# preprocess, get_file, determine_link_type, extract_type_in_url and
# detect_dup are project-level helpers defined elsewhere in this repository.


def extract_link_in_body(nodes, node, links):
    target_number_list = []
    body_url = []
    # Match only references that stay inside the same repository: take the
    # owner and repo segments from this node's own URL.
    owner, repo = node['url'].split('/')[-4], node['url'].split('/')[-3]
    cleared_body = preprocess.clear_body(node['body'])
    # The original patterns used '/+pull+/+', whose '+' quantifiers also
    # match repeated slashes and letters; '/pull/' is the intended form.
    body_url += re.findall(
        re.compile(r'https://github.com/%s/%s/pull/[0-9]+'
                   % (re.escape(owner), re.escape(repo))), cleared_body)
    body_url += re.findall(
        re.compile(r'https://github.com/%s/%s/issues/[0-9]+'
                   % (re.escape(owner), re.escape(repo))), cleared_body)
    for url in body_url:  # an explicit emptiness check is redundant here
        target_number_list.append(url.split('/')[-1])
    body_quote = re.findall(re.compile(r'#[0-9]+'), cleared_body)
    for quote in body_quote:
        target_number_list.append(quote.replace("#", ""))
    for target_number in target_number_list:
        for target_node in nodes:
            if target_node['number'] == int(target_number):
                source_number = node['number']
                source_url = node['url']
                target_url = target_node['url']
                source_file = get_file(node)
                target_file = get_file(target_node)
                create_time_interval = (
                    datetime.strptime(target_node['createdAt'],
                                      "%Y-%m-%dT%H:%M:%SZ") -
                    datetime.strptime(node['createdAt'],
                                      "%Y-%m-%dT%H:%M:%SZ")).total_seconds()
                link_time_interval = 1  # links in the title are already in place when the node is created, so the link time interval is defined as 1 second here
                link_type = determine_link_type(
                    extract_type_in_url(source_url),
                    extract_type_in_url(target_url))
                link = {
                    'source': {
                        'number': source_number,
                        'url': source_url,
                        'createdAt': node['createdAt'],
                        'files': source_file
                    },
                    'target': {
                        'number': target_number,
                        'url': target_url,
                        'createdAt': target_node['createdAt'],
                        'create_time_interval': create_time_interval,
                        'link_time_interval': link_time_interval,
                        'type': link_type,
                        'location': "body",
                        'files': target_file
                    }
                }
                links = detect_dup(links, link)
    return links
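The two same-repository URL patterns plus the bare '#NN' pattern are the whole extraction step. Below is a minimal, self-contained sketch of that matching, with a hypothetical identity stub standing in for preprocess.clear_body (its definition is not shown in the snippet):

import re

def clear_body(text):  # stand-in for preprocess.clear_body (assumption)
    return text

url = 'https://github.com/SuShu19/GNN_RS/issues/42'
owner, repo = url.split('/')[-4], url.split('/')[-3]
body = clear_body('Fixes https://github.com/SuShu19/GNN_RS/pull/7, see also #13.')

pulls = re.findall(r'https://github.com/%s/%s/pull/[0-9]+'
                   % (re.escape(owner), re.escape(repo)), body)
issues = re.findall(r'https://github.com/%s/%s/issues/[0-9]+'
                    % (re.escape(owner), re.escape(repo)), body)
quotes = re.findall(r'#[0-9]+', body)

numbers = [u.split('/')[-1] for u in pulls + issues]
numbers += [q.replace('#', '') for q in quotes]
print(numbers)  # ['7', '13']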
Example #2
def extract_relations_in_comment(pr_list, issue_list, node, develop_unit):
    # g is assumed to be a module-level py2neo Graph instance, with Node and
    # Relationship imported from py2neo.
    for comment in node['comments']['nodes']:

        comment_node = Node("comment", content=comment['body'])  # create the comment entity
        g.merge(comment_node, 'comment', "content")
        g.merge(Relationship(develop_unit, "comment", comment_node))

        if comment['author'] is not None:
            author = Node("author", name=comment['author']['login'])  # create the author entity
            g.merge(author, 'author', "name")
            g.merge(Relationship(author, "create", comment_node))

        time = Node("time", time=comment['createdAt'])  # create the time entity
        g.merge(time, 'time', "time")
        g.merge(Relationship(comment_node, "time", time))

        comment_link, comment_url = [], []
        owner, repo = node['url'].split('/')[-4], node['url'].split('/')[-3]
        cleared_comment = preprocess.clear_body(comment['body'])
        # Same fix as in Example #1: '/pull/' and '/issues/' replace the
        # original '/+pull+/+' and '/+issues+/+' quantifier patterns.
        comment_url += re.findall(
            re.compile(r'https://github.com/%s/%s/pull/[0-9]+'
                       % (re.escape(owner), re.escape(repo))), cleared_comment)
        comment_url += re.findall(
            re.compile(r'https://github.com/%s/%s/issues/[0-9]+'
                       % (re.escape(owner), re.escape(repo))), cleared_comment)
        for url in comment_url:
            comment_link.append(url.split('/')[-1])
        comment_quote = re.findall(re.compile(r'#[0-9]+'), cleared_comment)
        for quote in comment_quote:
            comment_link.append(quote.replace("#", ""))

        for target_number in comment_link:
            if target_number == develop_unit['number']:
                continue
            for item in pr_list:
                if str(item) == str(target_number):
                    target_node = Node("pullRequest",
                                       number=str(target_number))  # create the PR entity
                    g.merge(target_node, "pullRequest", 'number')
                    g.merge(Relationship(comment_node, "linkTo", target_node))
                    break  # the trailing 'continue' in the original was a no-op; one match is enough
            for item in issue_list:
                if str(item) == str(target_number):
                    target_node = Node("issue",
                                       number=str(target_number))  # create the issue entity
                    g.merge(target_node, "issue", 'number')
                    g.merge(Relationship(comment_node, "linkTo", target_node))
                    break  # same fix as above
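The Node/Relationship/merge calls above follow py2neo's API, where Graph.merge(entity, label, key) matches on the primary label and key before creating, so repeated runs do not duplicate nodes. A minimal sketch of the same pattern against a local Neo4j instance (the bolt URI and credentials are placeholders):

from py2neo import Graph, Node, Relationship

g = Graph("bolt://localhost:7687", auth=("neo4j", "password"))  # placeholder connection

comment_node = Node("comment", content="Closes #13")
g.merge(comment_node, "comment", "content")  # match-or-create on (label, key)

author = Node("author", name="octocat")
g.merge(author, "author", "name")
g.merge(Relationship(author, "create", comment_node))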
Example #3
def extract_relations_in_body(pr_list, issue_list, node, develop_unit):

    body = Node("body", content=node['body'])  # create the body entity
    g.merge(body, 'body', "content")
    g.merge(Relationship(develop_unit, "body", body))

    body_link, body_url = [], []
    owner, repo = node['url'].split('/')[-4], node['url'].split('/')[-3]
    cleared_body = preprocess.clear_body(node['body'])
    body_url += re.findall(
        re.compile(r'https://github.com/%s/%s/pull/[0-9]+'
                   % (re.escape(owner), re.escape(repo))), cleared_body)
    body_url += re.findall(
        re.compile(r'https://github.com/%s/%s/issues/[0-9]+'
                   % (re.escape(owner), re.escape(repo))), cleared_body)
    for url in body_url:
        body_link.append(url.split('/')[-1])
    body_quote = re.findall(re.compile(r'#[0-9]+'), cleared_body)
    for quote in body_quote:
        body_link.append(quote.replace("#", ""))

    for target_number in body_link:  # an explicit '!= []' guard is redundant
        for item in pr_list:
            if str(item) == str(target_number):
                target_node = Node("pullRequest",
                                   number=str(target_number))  # create the PR entity
                g.merge(target_node, "pullRequest", 'number')
                g.merge(Relationship(body, "linkTo", target_node))
                break  # the trailing 'continue' was a no-op
        for item in issue_list:
            if str(item) == str(target_number):
                target_node = Node("issue",
                                   number=str(target_number))  # create the issue entity
                g.merge(target_node, "issue", 'number')
                g.merge(Relationship(body, "linkTo", target_node))
                break  # same fix
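Both inner loops are linear membership tests: all they decide is whether a referenced number belongs to a PR or an issue. Precomputing string sets gives the same result more directly. A self-contained sketch of that equivalent lookup, using sample data and the same placeholder connection as above:

from py2neo import Graph, Node, Relationship

g = Graph("bolt://localhost:7687", auth=("neo4j", "password"))  # placeholder connection
body = Node("body", content="see #7 and #13")
g.merge(body, "body", "content")

pr_list, issue_list = [7, 42], [13]  # sample data (assumption)
body_link = ["7", "13"]

pr_set = {str(n) for n in pr_list}
issue_set = {str(n) for n in issue_list}

for target_number in body_link:
    if target_number in pr_set:
        label = "pullRequest"
    elif target_number in issue_set:
        label = "issue"
    else:
        continue  # reference to something outside this repository
    target_node = Node(label, number=target_number)
    g.merge(target_node, label, "number")
    g.merge(Relationship(body, "linkTo", target_node))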
Example #4
File: RQ123.py  Project: SuShu19/GNN_RS
def extract_link_in_crossReference(nodes, node, links):
    # Handle crossReference timeline items: work from the target back to the source.
    for item in node['timelineItems']['nodes']:
        if item:
            if "source" not in item:  # 排除ReferencedAt的情况
                continue
            source_number = item['source']['number']
            source_url = item['source']['url']
            target_number = item['target']['number']
            target_url = item['target']['url']
            create_time_interval = (
                datetime.strptime(item['target']['createdAt'],
                                  "%Y-%m-%dT%H:%M:%SZ") -
                datetime.strptime(item['source']['createdAt'],
                                  "%Y-%m-%dT%H:%M:%SZ")).total_seconds()
            link_time_interval = (
                datetime.strptime(item['referencedAt'], "%Y-%m-%dT%H:%M:%SZ") -
                max(datetime.strptime(item['source']['createdAt'],
                                      "%Y-%m-%dT%H:%M:%SZ"),
                    datetime.strptime(item['target']['createdAt'],
                                      "%Y-%m-%dT%H:%M:%SZ"))).total_seconds()
            target_file_path = get_file(node)
            link_type = determine_link_type(extract_type_in_url(source_url),
                                            extract_type_in_url(target_url))
            # locate the source node
            for source_node in nodes:
                if source_node['number'] == source_number:
                    # extract the source node's files
                    source_file_path = get_file(source_node)
                    # Determine the location: only body and comment are
                    # possible here, so a regex match decides between them.
                    # body
                    # A word boundary prevents '#12' from matching inside
                    # '#123'; the original '#+%s' pattern had no boundary.
                    link_text = re.findall(
                        re.compile(r'#%s\b' % target_number),
                        preprocess.clear_body(source_node['body']))
                    if len(link_text) != 0:
                        location = "body"
                        link = {
                            'source': {
                                'number': source_number,
                                'url': source_url,
                                'createdAt': item['source']['createdAt'],
                                'files': source_file_path
                            },
                            'target': {
                                'number': target_number,
                                'url': target_url,
                                'createdAt': item['target']['createdAt'],
                                'create_time_interval': create_time_interval,
                                'link_time_interval': link_time_interval,
                                'type': link_type,
                                'location': location,
                                'files': target_file_path
                            }
                        }
                        links = detect_dup(links, link)
                    # comment
                    # TODO: no comment records appear here; investigate
                    for comment in source_node['comments']['nodes']:
                        link_text = re.findall(
                            re.compile(r'#%s\b' % target_number),
                            preprocess.clear_body(comment['body']))
                        if len(link_text) != 0:
                            location = "comment"
                            link = {
                                'source': {
                                    'number': source_number,
                                    'url': source_url,
                                    'createdAt': item['source']['createdAt'],
                                    'files': source_file_path
                                },
                                'target': {
                                    'number': target_number,
                                    'url': target_url,
                                    'createdAt': item['target']['createdAt'],
                                    'create_time_interval':
                                    create_time_interval,
                                    'link_time_interval': link_time_interval,
                                    'type': link_type,
                                    'location': location,
                                    'files': target_file_path
                                }
                            }
                            links = detect_dup(links, link)
    return links
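Both interval fields reduce to timestamp arithmetic on GitHub's ISO-8601 createdAt/referencedAt strings: create_time_interval is target minus source, and link_time_interval measures from the later of the two creation times, since the link can only appear once both endpoints exist. A standalone sketch with sample timestamps:

from datetime import datetime

FMT = "%Y-%m-%dT%H:%M:%SZ"

source_created = datetime.strptime("2020-01-01T00:00:00Z", FMT)
target_created = datetime.strptime("2020-01-03T12:00:00Z", FMT)
referenced_at = datetime.strptime("2020-01-04T00:00:00Z", FMT)

create_time_interval = (target_created - source_created).total_seconds()
link_time_interval = (referenced_at -
                      max(source_created, target_created)).total_seconds()

print(create_time_interval)  # 216000.0 (2.5 days)
print(link_time_interval)    # 43200.0  (12 hours)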