def generateContributionFile():
    """Write every known user's mean share of change-log activity to CSV.

    Reads ``changelog.csv`` (``;``-separated), the pickled ``rp`` object and
    ``trainissuekey.csv`` through ``filepathhelper.path`` for the module-level
    ``dataset``. Only change-log rows whose ``issuekey`` occurs in the
    training keys are considered.

    For each user found in any ``rp`` entry that has at least one change-log
    row, the share on one issue is

        rows written by the user on the issue
        / rows with a username on the issue

    and the user's contribution is the plain mean of these shares over the
    issues the user touched. Users without change-log rows are left out.

    Returns:
        None. The table is written to ``contribution.csv`` with the columns
        ``username`` and ``contribution`` (plus pandas' default row index).
    """
    changelog = pd.read_csv(filepathhelper.path(dataset, "changelog.csv"),
                            quotechar='"',
                            sep=';')
    # NOTE(review): pickle.load can execute arbitrary code; this is only safe
    # while 'rp' is a file produced by this project itself.
    with open(filepathhelper.path(dataset, 'rp'), 'rb') as f:
        rp = pickle.load(f)
    train = pd.read_csv(filepathhelper.path(dataset, "trainissuekey.csv"))

    # Select only the change-log rows that belong to training issues.
    in_train = changelog.issuekey.isin(train.issuekey)
    changelog = changelog[in_train][['issuekey', 'username']]

    users = set()
    for key in rp:
        users.update(rp[key])

    # One grouped pass replaces the former per-user scan of the whole frame.
    # count() ignores rows without a username, exactly like the old lookup.
    rows_per_issue = changelog.groupby('issuekey')['username'].count()
    rows_per_pair = changelog.groupby(['username', 'issuekey']).size()

    issue_of_pair = rows_per_pair.index.get_level_values('issuekey')
    issue_totals = rows_per_issue.reindex(issue_of_pair).to_numpy()
    shares = rows_per_pair / issue_totals

    # Mean share per user, summed in issue-key order as before.
    mean_share = {
        name: sum(user_shares) / len(user_shares)
        for name, user_shares in shares.groupby(level='username')
    }

    # Keep the iteration order of the user set so the row order is unchanged.
    username = [name for name in users if name in mean_share]
    contribution = [mean_share[name] for name in username]

    df = pd.DataFrame(data={'username': username,
                            'contribution': contribution})
    df.to_csv(filepathhelper.path(dataset, 'contribution.csv'))
def calculatePairScore(dataset, tagpair, commentscore, train, outfile):
    """Average the comment scores of every (tagger, taggee) pair into a CSV.

    Args:
        dataset: Dataset identifier handed to ``filepathhelper.path``.
        tagpair: File name of the tagger/taggee CSV (read as ISO-8859-1). The
            code uses its ``tagger``, ``taggee`` and ``commentid`` columns.
        commentscore: DataFrame joined on ``commentid``; its
            ``positivescore`` and ``negativescore`` columns are averaged.
        train: Not used by this function (the train-set filter is disabled
            here); kept so existing callers keep working.
        outfile: Destination passed to ``DataFrame.to_csv``.

    Returns:
        None. The per-pair means are written to ``outfile``, indexed and
        sorted by ``tagger`` and ``taggee``.
    """
    tagcomment = pd.read_csv(filepathhelper.path(dataset, tagpair),
                             encoding='iso-8859-1')

    # Remove white space at the beginning of both user columns.
    tagcomment['tagger'] = tagcomment.tagger.str.lstrip()
    tagcomment['taggee'] = tagcomment.taggee.str.lstrip()

    # Remove blank users. After lstrip() a blank name is '' and can never be
    # ' ', so the former comparison against ' ' filtered nothing.
    tagcomment = tagcomment[tagcomment['tagger'] != '']
    tagcomment = tagcomment[tagcomment['taggee'] != '']

    # Remove users that contain '/' (error from extracting tagger/taggee).
    tagcomment = tagcomment[~tagcomment.tagger.str.contains('/', na=False)]
    tagcomment = tagcomment[~tagcomment.taggee.str.contains('/', na=False)]

    # Remove taggers that consist of digits only. The same filter for the
    # taggee column was already disabled in the original code.
    check = ~tagcomment.tagger.str.isdigit()
    tagcomment = tagcomment[check]

    # Inner join: pairs whose comment has no score are dropped.
    tagcomment = pd.merge(tagcomment,
                          commentscore,
                          left_on='commentid',
                          right_on='commentid',
                          how='inner')
    pair = tagcomment.groupby(['tagger', 'taggee']).agg({
        'positivescore': 'mean',
        'negativescore': 'mean'
    })
    pair.reset_index().set_index(['tagger', 'taggee'
                                  ]).sort_index(level=0).to_csv(outfile)
def calculatePairScoreFilter(dataset, tagpair, commentscore, train, outfile):
    """Write mean comment scores per (tagger, taggee) pair, train set only.

    Args:
        dataset: Dataset identifier handed to ``filepathhelper.path``.
        tagpair: File name of the tagger/taggee CSV (read as ISO-8859-1).
        commentscore: Comment scores; passed through ``filterTrainSet`` and
            de-duplicated before being joined on ``commentid``.
        train: Second argument forwarded to ``filterTrainSet``.
        outfile: Destination passed to ``DataFrame.to_csv``.

    Returns:
        None. The result is written to ``outfile``.
    """
    pair_file = filepathhelper.path(dataset, tagpair)
    tag_pairs = pd.read_csv(pair_file, encoding='iso-8859-1')

    # Restrict the scores to the training set, then drop repeated rows.
    train_scores = filterTrainSet(commentscore, train)
    train_scores = train_scores.drop_duplicates()

    # Inner join keeps only tag pairs whose comment has a score.
    scored_pairs = pd.merge(tag_pairs,
                            train_scores,
                            left_on='commentid',
                            right_on='commentid',
                            how='inner')

    aggregations = {'positivescore': 'mean', 'negativescore': 'mean'}
    mean_scores = scored_pairs.groupby(['tagger', 'taggee']).agg(aggregations)

    flat = mean_scores.reset_index()
    indexed = flat.set_index(['tagger', 'taggee'])
    ordered = indexed.sort_index(level=0)
    ordered.to_csv(outfile)
def IssueFamiliarity(team, issuekey='', descript='', fortest=False):
    """Return the team's average familiarity with one issue.

    The issue's topic distribution comes from one of two sources:

    * ``descript`` non-empty: the text is written to ``temp.txt``,
      ``topicSim.jar`` is run on it, and the first line of the ``topicDist``
      file it leaves behind is parsed as the distribution.
    * otherwise, ``issuekey`` non-empty and ``fortest`` false: the ``topdist``
      and ``squaresum`` values of that row of the module-level ``topdists``.

    For every distinct team member present in the module-level ``PtoI``, the
    best score ``dot(a, b) / (squaresum_a * squaresum_b)`` against the other
    issues the member took part in is added up. This is a cosine similarity
    provided ``squaresum`` holds the Euclidean norm, which is what the
    removed CPU code computed. The total is divided by the number of distinct
    members. With ``fortest`` true, ``maxcosim(member, issuekey)`` is summed
    instead.

    Args:
        team: Mapping with the keys ``developer``, ``integrator``,
            ``tester``, ``reviewer`` and ``assignee``; the values are
            concatenated with ``+`` (presumably lists of usernames).
        issuekey: Key of the issue; it is excluded from each member's history.
        descript: Free-text description; takes precedence over ``issuekey``.
        fortest: Use the ``maxcosim`` helper instead of the GPU computation.

    Returns:
        float: summed per-member maximum divided by ``len(set(members))``.

    Raises:
        ValueError: a member has history but neither ``descript`` nor
            ``issuekey`` supplied a topic distribution.
        ZeroDivisionError: the team has no members at all.
    """
    member = set(team['developer'] + team['integrator'] + team['tester'] +
                 team['reviewer'] + team['assignee'])

    tdin = None
    tdin_squaresum = None
    if descript != '':
        try:
            with open("temp.txt", "w") as f:
                f.write(descript)
            subprocess.call([
                'java', '-jar', 'topicSim.jar',
                filepathhelper.path(dataset, 'model_nonLabel_500_9_1'),
                'temp.txt'
            ])
            with open("topicDist", "r") as f:
                s = f.read().split('\n')[:-1]
            tdin = ast.literal_eval(s[0])
        finally:
            # Always clean up the scratch files, also when the jar failed.
            for scratch in ("temp.txt", "topicDist"):
                if os.path.exists(scratch):
                    os.remove(scratch)
        # This branch used to leave tdin as a plain list and never defined
        # tdin_squaresum, so the similarity below raised NameError.
        tdin = cp.array(tdin)  #gpu
        tdin_squaresum = (tdin**2).sum()**0.5
    elif issuekey != '' and not fortest:
        inissuerow = topdists.loc[issuekey]
        # Precomputed values are faster than recomputing the norm each call.
        tdin = cp.array(inissuerow['topdist'])  #gpu
        tdin_squaresum = cp.array(inissuerow['squaresum'])  #gpu

    issuefam = 0
    if fortest:
        issuefam = np.array([maxcosim(m, issuekey) for m in member]).sum()
    else:
        for m in member:
            if m not in PtoI:
                continue
            participated_issue = PtoI[m].copy()
            participated_issue.discard(issuekey)
            if not participated_issue:
                # No other issue to compare with: contributes nothing.
                continue
            if tdin is None:
                raise ValueError(
                    "IssueFamiliarity needs either descript or issuekey")
            # pandas no longer accepts a set as an indexer, so pass a list.
            participated_issue_topdist = topdists.loc[list(
                participated_issue)]
            todistmatrix = cp.array([
                cp.array(i)
                for i in participated_issue_topdist['topdist'].values
            ])
            squaresum = cp.array(
                participated_issue_topdist['squaresum'].values)
            maxfam = (todistmatrix.dot(tdin) /
                      (squaresum * tdin_squaresum)).max()
            # Accumulate only when a maximum was computed for this member.
            issuefam = issuefam + maxfam
    return float(issuefam) / len(member)
import pickle
# from tqdm import tqdm_notebook as tqdm
import subprocess
import ast
from neo4j import GraphDatabase
import neo4j
import multiprocessing as mp
from functools import lru_cache
import hashlib
import re
from networkx.algorithms.approximation import steiner_tree

# # File Need

# In[4]:


def _dataset_csv(filename, **read_options):
    """Read ``filename`` from the current dataset's directory with pandas."""
    return pd.read_csv(filepathhelper.path(dataset, filename), **read_options)


# Tables of the selected dataset; closeresolve and assignee are ';'-separated,
# component_title uses the multi-character separator ';;;' (python engine).
teams = _dataset_csv('team.csv')
closeresolve = _dataset_csv('closeresolve.csv', sep=';')
winissues = _dataset_csv('winissue.csv')
assignees = _dataset_csv('assignee.csv', sep=';')
trainset = _dataset_csv('trainissuekey.csv')
issuecomponent = _dataset_csv('component_title.csv',
                              sep=';;;',
                              engine='python')

#######################################################################################
# Global pair scores, with the two score columns pulled out as Series.
df = _dataset_csv('global_pair_score.csv', encoding='iso-8859-1')
pos = df['positivescore']
neg = df['negativescore']
# In[ ]:

from numpy import newaxis, minimum


def warshall(mat):
    """Relax ``mat`` through every index in turn and return the result.

    For each ``k`` the matrix is replaced by the element-wise minimum of
    itself and the broadcast sum ``mat[k, j] + mat[i, k]``, i.e. the
    Floyd-Warshall update carried out one pivot at a time on whole arrays.
    The argument is only rebound, never written to; a progress bar is shown
    through ``tqdm``.
    """
    for pivot in tqdm(range(len(mat))):
        pivot_row = mat[newaxis, pivot, :]
        pivot_col = mat[:, pivot, newaxis]
        mat = minimum(mat, pivot_row + pivot_col)
    return mat


# In[ ]:

tags_path = filepathhelper.path(dataset, 'tags.csv')
tags = pd.read_csv(tags_path, encoding='ISO-8859-1')

teams_path = filepathhelper.path(dataset, 'team.csv')
teams = pd.read_csv(teams_path)

closeresolve_path = filepathhelper.path(dataset, 'closeresolve.csv')
closeresolve = pd.read_csv(closeresolve_path, sep=';')

# Keep only the teams whose issue is listed in closeresolve.csv.
team_is_listed = teams['issuekey'].isin(closeresolve['issuekey'])
teams = teams[team_is_listed]

winissues_path = filepathhelper.path(dataset, 'winissue.csv')
winissues = pd.read_csv(winissues_path)

assignees_path = filepathhelper.path(dataset, 'assignee.csv')
assignees = pd.read_csv(assignees_path, sep=';')

# Keep only the assignees of issues that still have a team.
assignee_has_team = assignees['issuekey'].isin(teams['issuekey'])
assignees = assignees[assignee_has_team]

# # train on only train dataset
# Locate the project root: walk up from the working directory until a
# directory containing filepathhelper.py is found, then make it importable.
_search_start = os.getcwd()
curdir = _search_start
while 'filepathhelper.py' not in os.listdir(curdir):
    _parent = os.path.dirname(curdir)
    if _parent == curdir:
        # dirname() of the filesystem root is the root itself; without this
        # check the search would loop forever when the file does not exist.
        raise FileNotFoundError(
            'filepathhelper.py not found in %s or any parent directory' %
            _search_start)
    curdir = _parent
sys.path.append(curdir)
import filepathhelper

import math
import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
#from tqdm import tqdm_notebook as tqdm

# Raw tables of the selected dataset.
teams = pd.read_csv(filepathhelper.path(dataset, 'team.csv'))
closeresolve = pd.read_csv(filepathhelper.path(dataset, 'closeresolve.csv'),
                           sep=';')
winissues = pd.read_csv(filepathhelper.path(dataset, 'winissue.csv'))
assignees = pd.read_csv(filepathhelper.path(dataset, 'assignee.csv'), sep=';')
trainset = pd.read_csv(filepathhelper.path(dataset, 'trainissuekey.csv'))

# Keep teams whose issue is in closeresolve.csv, and the assignees of those
# teams' issues.
teams = teams[(teams['issuekey'].isin(closeresolve['issuekey']))]
assignees = assignees[(assignees['issuekey'].isin(teams['issuekey']))]

# Restrict every table to the issues listed in trainissuekey.csv.
teams = teams[teams['issuekey'].isin(trainset['issuekey'])]
assignees = assignees[assignees['issuekey'].isin(trainset['issuekey'])]
winissues = winissues[winissues['issuekey'].isin(trainset['issuekey'])]

# Index assignees by issue key for the code that follows.
assignees.set_index('issuekey', inplace=True)
username = set()
team = {}
# Script fragment: finds and imports filepathhelper, loads the dataset CSVs,
# keeps only issues that appear in closeresolve.csv and in trainissuekey.csv,
# then starts iterating over the unique values of teams['dev'] (the loop body
# is not part of this fragment).
# NOTE(review): the directory walk never terminates when no ancestor contains
# filepathhelper.py, because os.path.dirname() of the filesystem root is the
# root itself.
curdir = os.getcwd() while 'filepathhelper.py' not in os.listdir(curdir): curdir = os.path.dirname(curdir) sys.path.append(curdir) import filepathhelper import math import random import pandas as pd import numpy as np from tqdm import tqdm import pickle #from tqdm import tqdm_notebook as tqdm teams = pd.read_csv(filepathhelper.path(dataset,'team.csv')) closeresolve = pd.read_csv(filepathhelper.path(dataset,'closeresolve.csv'),sep=';') winissues = pd.read_csv(filepathhelper.path(dataset,'winissue.csv')) assignees = pd.read_csv(filepathhelper.path(dataset,'assignee.csv'),sep=';') trainset = pd.read_csv(filepathhelper.path(dataset,'trainissuekey.csv')) issuecomponent = pd.read_csv(filepathhelper.path(dataset,'component_title.csv'),sep=';;;',engine='python') teams = teams[(teams['issuekey'].isin(closeresolve['issuekey']))] assignees = assignees[(assignees['issuekey'].isin(teams['issuekey']))] teams = teams[teams['issuekey'].isin(trainset['issuekey'])] assignees = assignees[assignees['issuekey'].isin(trainset['issuekey'])] winissues = winissues[winissues['issuekey'].isin(trainset['issuekey'])] dev = list(teams['dev'].unique()) tmp = [] for i in dev:
# Script fragment: continues the upward search for filepathhelper.py and
# imports it, defines issueContribution() (currently a stub that only prints
# an empty line and ignores both arguments), and in the script entry point
# takes the dataset name from the single command-line argument and, for
# 'Moodle', loads the change log plus the developer, tester, peer reviewer and
# integrator CSVs and turns their 'username' columns into sets.
# NOTE(review): the directory walk never terminates when no ancestor contains
# filepathhelper.py (os.path.dirname() of the root is the root).
# NOTE(review): `integrator` is loaded but is not part of the `person` union
# below - confirm whether that omission is intended.
while 'filepathhelper.py' not in os.listdir(curdir): curdir = os.path.dirname(curdir) sys.path.append(curdir) import filepathhelper def issueContribution(userlog, changelog): print() if __name__ == "__main__": if len(sys.argv) == 2: dataset = sys.argv[1] if dataset == 'Moodle': changelog = pd.read_csv(filepathhelper.path( dataset, 'changelog.csv'), sep=';') developer = pd.read_csv( filepathhelper.path(dataset, 'developer.csv')) tester = pd.read_csv(filepathhelper.path(dataset, 'tester.csv')) reviewer = pd.read_csv( filepathhelper.path(dataset, 'peer_reviewer.csv')) integrator = pd.read_csv( filepathhelper.path(dataset, 'integrator.csv')) developer = set(developer['username']) tester = set(tester['username']) reviewer = set(reviewer['username']) integrator = set(integrator['username']) person = developer.union(tester, reviewer)
# Script fragment: reads the dataset name from the 'dataset' entry of
# config.json, finds and imports filepathhelper, and in the script entry point
# loads changelog.csv and the pickled `rp` object, collects every person
# listed in any rp entry, groups the change-log row labels by username and
# starts building `activity` for each person that has change-log rows (the
# innermost loop body is not part of this fragment).
# The `if 1 > 0:` guard is always true; it stands in for the command-line
# argument check that is commented out right after it.
# NOTE(review): the directory walk never terminates when no ancestor contains
# filepathhelper.py (os.path.dirname() of the root is the root).
# NOTE(review): pickle.load can execute arbitrary code; only safe while 'rp'
# is produced by this project itself.
curdir = os.path.dirname(curdir) with open(os.path.join(curdir, 'config.json'), 'r') as f: dataset = json.load(f)['dataset'] curdir = os.getcwd() while 'filepathhelper.py' not in os.listdir(curdir): curdir = os.path.dirname(curdir) sys.path.append(curdir) import filepathhelper if __name__ == "__main__": if 1 > 0: # if len(sys.argv) == 2: # dataset = sys.argv[1] # print(dataset) changelog = pd.read_csv(filepathhelper.path(dataset, 'changelog.csv'), sep=';') with open(filepathhelper.path(dataset, 'rp'), 'rb') as f: rp = pickle.load(f) person = set() for i in rp: person.update(rp[i]) loggroup = changelog.groupby(['username']).groups userlog = set(changelog['username']) print('finish reading files') activity = {} for p in tqdm(person): if p in userlog: acttime = set() for i in loggroup[p]:
# Fragment: the tail of a ranking routine (it appends tester entries to
# rankdict['team']['tester'], collects each rankdict in `rank` and returns it),
# then saveOutput(), then the beginning of the script entry point.
# saveOutput(outname) dumps the module-level `result` as JSON into the file
# named `outname`; inside the `with` statement the name `outfile` is rebound
# from the path to the open file object.
# Entry point: seeds `random` with 123, resets `result` to an empty list and
# loads the pickled `rp` object. With one command-line argument it is the
# output name and individual is False; with two, individual is True and
# missingrole is the second argument with '1' appended. The final `else:`
# branch is not part of this fragment.
# NOTE(review): pickle.load can execute arbitrary code; only safe while 'rp'
# is produced by this project itself.
elif r.startswith('tester'): rankdict['team']['tester'].append(team[r]) rank.append(rankdict) rankno=rankno+1 return rank def saveOutput(outname): outdata = result # outfile = 'out\\'+outname+'.json' outfile = outname with open(outfile, 'w') as outfile: json.dump(outdata, outfile) if __name__== "__main__": inputname = filepathhelper.path(dataset,'input_test.json') random.seed(123) outdata = {} RANK=100 result = [] with open(filepathhelper.path(dataset,'rp'),'rb') as f: rp = pickle.load(f) if len(sys.argv) == 2: outputname = sys.argv[1] individual = False elif len(sys.argv) == 3: outputname = sys.argv[1] individual = True missingrole = sys.argv[2]+str('1') else: