示例#1
0
def traverse_all_bugs(all_bugids, new_bugids, sign, train=True, weaks=None):
    '''
    Compute and persist the developer active sequence of every bug in new_bugids.

    For each bug id the sequence is obtained from
    get_developer_active_sequence, its length is recorded, and the sequence
    is handed to write_active_sequence_to_file. Once all bugs are processed,
    the per-bug lengths are written to '../data/active_counts_<timestamp>.txt'
    as one tab-separated "<bug id> <count>" line per bug.

    :param all_bugids: sequence of bug ids passed through to
        get_developer_active_sequence; in training mode every id of
        new_bugids must occur in it.
    :param new_bugids: iterable of (hashable) bug ids to process.
    :param sign: identifies the experiment round; forwarded to
        write_active_sequence_to_file (per the original note, the extracted
        train/test activity data is stored in a folder named after it).
    :param train: if True, the index forwarded for a bug is its own position
        in all_bugids minus one; if False, it is len(all_bugids) - 1.
    :param weaks: forwarded unchanged to get_developer_active_sequence.
    :return: None
    :raises ValueError: in training mode, when a bug id of new_bugids does
        not occur in all_bugids.
    '''
    start_time = time.time()
    print('start_time:', start_time)
    bugs_counts = {}
    bug_msg_all, _ = data_helper.get_msg_all()

    first_position = {}
    if train:
        # Build the id -> index map once instead of calling
        # all_bugids.index() for every bug (which made the loop quadratic).
        # setdefault keeps the FIRST occurrence, matching list.index().
        # NOTE(review): assumes all_bugids is not modified while the bugs
        # are being processed -- confirm against the helpers.
        for position, known_id in enumerate(all_bugids):
            first_position.setdefault(known_id, position)

    for count, bug_id in enumerate(new_bugids):
        print(count)
        if train:
            if bug_id not in first_position:
                # Same exception type list.index() raised originally.
                raise ValueError('{!r} is not in list'.format(bug_id))
            last_index = first_position[bug_id] - 1
        else:
            last_index = len(all_bugids) - 1
        actives = get_developer_active_sequence(bug_msg_all, all_bugids,
                                                bug_id, last_index, train,
                                                weaks)
        bugs_counts[bug_id] = len(actives)
        write_active_sequence_to_file(actives, sign, bug_id)

    print('计算消耗时间:', time.time() - start_time)
    with open('../data/active_counts_{}.txt'.format(time.time()), 'w') as writer:
        for key, active_count in bugs_counts.items():
            writer.write(str(key) + '\t' + str(active_count) + '\n')
    print('最终结束时间:', time.time())
def _writer_little_bug_msg_all(little_bugids):
    '''
    Testing helper used to check whether the training split is correct.

    Writes one "<bug id><TAB><first field of the bug's record>" line per bug
    id to '../data/eval_little_bug_msg_all.txt'.

    :param little_bugids: mapping whose values are iterables of bug ids;
        the keys are not used.
    :return: None
    '''
    bug_msg_all, _ = get_msg_all()
    with open('../data/eval_little_bug_msg_all.txt', 'w') as writer:
        for bugids in little_bugids.values():
            for bugid in bugids:
                first_field = bug_msg_all.get(bugid)[0]
                writer.write('{}\t{}\n'.format(bugid, first_field))
def count_bug_fixed_by_each_developer():
    '''
    Group the bug ids returned by get_msg_all() by their fixer.

    :return: dict mapping a developer name to the list of bug ids whose
        record has that developer as its first element.
    '''
    records, _ = get_msg_all()
    bugid_each_developer = {}  # developer name -> [bug ids]
    for bugid, record in records.items():
        fixer = record[0]  # the developer who fixed the current sample
        bugid_each_developer.setdefault(fixer, []).append(bugid)
    return bugid_each_developer
示例#4
0
import numpy as np
import util.PATH as PATH
from util.data_helper import get_msg_all
from util.data_helper import split_dataset_by_time_windows

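'''
Extract the vocabulary of time window 0.
'''
# bug_msg_all is a dict with key=bugid; the value layout was left unspecified in the original note.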
# Collect the distinct stripped lines of every corpus file belonging to time
# window 0, in first-seen order, and write them one per line to
# '../data/windows/window_0_vocabulary.txt'.
bug_msg_all, _ = get_msg_all()
windows = split_dataset_by_time_windows(bug_msg_all)

vocabulary = []  # distinct words in first-seen order (this is what gets written)
seen_words = set()  # same words; O(1) membership instead of scanning the list
for i, bug_id in enumerate(windows[0]):
    print(i)
    with open(PATH.path_corpus + str(bug_id), 'r') as reader:
        for line in reader:
            word = line.strip()
            if word not in seen_words:
                seen_words.add(word)
                vocabulary.append(word)

with open('../data/windows/window_0_vocabulary.txt', 'w') as writer:
    for word in vocabulary:
        writer.write('{}\n'.format(word))
示例#5
0
            writer.write('{}\n'.format(word))


def del_invalid_developers(bug_msg_all):
    '''
    Check whether the data set still contains bugs of invalid developers.

    Despite its name this function deletes nothing: it only collects the
    matching bugs, prints how many bugs and how many distinct developers
    were found, and returns the bug keys.

    A developer counts as invalid when its name (first element of the bug's
    record), compared case-insensitively, contains any of the placeholder
    substrings listed in ``invalids``.

    :param bug_msg_all: mapping of bug key -> record whose first element is
        the developer name (a string).
    :return: list of the keys of all bugs assigned to an invalid developer,
        in the iteration order of bug_msg_all.
    '''
    invalids = (
        "nobody", "inbox", "webmaster", "platform", "unassigned", "issues",
        "needsconfirm", "swneedsconfirm",
    )
    invalid_bugs = []
    invalid_devs = set()
    for key, value in bug_msg_all.items():
        developer = value[0]
        lowered = developer.lower()  # lower-case once, not once per marker
        if any(marker in lowered for marker in invalids):
            # This bug was fixed by an invalid developer.
            invalid_bugs.append(key)
            invalid_devs.add(developer)
    print('无效的bug条数={}'.format(len(invalid_bugs)))
    print('无效的开发者数目={}'.format(len(invalid_devs)))
    return invalid_bugs


if __name__ == '__main__':
    # Script entry point: load every bug record, then report the bugs that
    # are assigned to invalid developers (del_invalid_developers prints the
    # counts; its return value is not used here).
    bug_msg_all, _ = data_helper.get_msg_all(
    )  # key=bug_id, value={assign_to, creation_ts, delta_ts, product, component} -- per the original note; confirm against data_helper
    # extract_vocabulary(bug_msg_all, PATH.path_vocabulary)
    del_invalid_developers(bug_msg_all)
示例#6
0
            actives = get_developer_active_sequence(bug_msg_all, all_bugids,
                                                    id,
                                                    len(all_bugids) - 1, train)
        bugs_counts[id] = len(actives)
        write_active_sequence_to_file(actives, sign, id)

    print('计算消耗时间:', time.time() - start_time)
    with open('../data/active_counts_{}.txt'.format(time.time()),
              'w') as writer:
        for key in bugs_counts.keys():
            writer.write(str(key) + '\t' + str(bugs_counts[key]) + '\n')
    print('最终结束时间:', time.time())


if __name__ == '__main__':
    # Script entry point: split the bug records into time windows and extract
    # the active sequences for the selected window(s).

    # traverse_all_bugs()
    bug_msg_all, _ = data_helper.get_msg_all()
    time_windows = data_helper.split_dataset_by_time_windows(bug_msg_all)
    # Only window 0 is processed; window i is the history and window i + 1
    # supplies the bugs of the test run.
    for i in [0]:
        traverse_all_bugs(time_windows[i], time_windows[i], sign=i,
                          train=True)  # training set
        traverse_all_bugs(time_windows[i],
                          time_windows[i + 1],
                          sign=i,
                          train=False)  # test set
    #
    # bugids = sorted(bug_msg_all.keys())  # sorted in ascending order
    # actives = get_developer_active_sequence(bug_msg_all, bugids, 6533)
    # for i in actives:
    # 	print(i)