def buildCommitPart():
    """Write commit-message words and diff code tokens to two corpus files.

    Repositories come from ``linkOperator.selectRepoOver(5000)``.  For every
    commit of every repository:

    * the words returned by ``preprocessor.preprocessToWord`` for the commit
      message become one line of ``commitLog.dat``;
    * the tokens returned by ``preprocessor.processDiffCode`` for each diff
      become one line of ``commitCode.dat``.

    Every token is followed by a single space (so lines end with ``" \\n"``)
    and empty token lists produce no line.  A failure inside one repository is
    printed together with its traceback and the remaining repositories are
    still processed.
    """
    repos = linkOperator.selectRepoOver(5000)
    # ``with`` closes both corpora on every exit path; previously the handles
    # were never closed and the outer ``try`` had no handler at all.
    with open('commitLog.dat', "w") as logCorpus, \
            open('commitCode.dat', "w") as codeCorpus:
        print('start')
        for highRepo in repos:
            path = getPath(highRepo[1])
            try:
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print("%s : %s" % (path, len(commits)))
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if words:  # not an empty list
                        logCorpus.writelines(
                            word.encode('utf-8') + " " for word in words)
                        logCorpus.write("\n")
                    for diff in gitRe.getOneDiff(commit):
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        if diffCode:
                            # Diff tokens are written exactly as returned
                            # (no explicit encode), as before.
                            codeCorpus.writelines(
                                code + " " for code in diffCode)
                            codeCorpus.write("\n")
            except Exception as e:
                # Best effort per repository.  ``Exception`` rather than
                # ``BaseException`` so Ctrl-C / SystemExit still stop the run.
                print("*** %s : %s" % (path, e))
                print(traceback.format_exc())
        print('end')
def buildFromGit():
    """Write a labelled commit corpus for every high repository.

    For each commit of each repository returned by
    ``mysqlOperator.selectAllHighRepository()`` a record is appended to
    ``corpusLabel.dat``:

    * one line holding ``highRepo[0]``;
    * one line holding the commit's hex SHA;
    * one line per non-empty sentence returned by
      ``preprocessor.preprocess`` for the commit message, each word followed
      by a single space;
    * one empty line closing the record.

    NOTE(review): the source was collapsed onto one line, so the position of
    the closing empty line (once per commit) is reconstructed — confirm.

    A failure inside one repository is printed and the next repository is
    processed.  An ``IOError`` raised while opening the corpus is printed
    instead of propagating.
    """
    repos = mysqlOperator.selectAllHighRepository()
    try:
        # open() now lives inside the try: the IOError handler below was
        # written to report a failed open(), but the call used to sit outside
        # it.  ``with`` also closes the file, which was previously leaked.
        with open('corpusLabel.dat', "w") as corpus:
            for highRepo in repos:
                path = getPath(highRepo[1])
                try:
                    gitRe = gitResolver.GitResolver(path)
                    commits = gitRe.getCommits()
                    print("%s : %s" % (path, len(commits)))
                    repoLabel = str(highRepo[0]).encode("utf-8")
                    for commit in commits:
                        corpus.write(repoLabel)
                        corpus.write("\n")
                        corpus.write(commit.hexsha.encode("utf-8"))
                        corpus.write("\n")
                        sentences = preprocessor.preprocess(
                            commit.message.decode('utf-8'))
                        for sentence in sentences:
                            if sentence:  # not an empty list
                                corpus.writelines(
                                    word.encode('utf-8') + " "
                                    for word in sentence)
                                corpus.write("\n")
                        corpus.write("\n")
                except Exception as e:
                    # Best effort per repository; ``Exception`` keeps Ctrl-C
                    # and SystemExit working, unlike ``BaseException``.
                    print("*** %s : %s" % (path, e))
    except IOError as e:
        # Check whether open() failed; that is normally an IOError.
        print("*** %s" % (e,))
def buildByList(repoList, corpusName):
    """Build the combined, commit-only and issue-only text corpora.

    Three files are written under ``corpus/``: ``nocode<corpusName>.dat``
    (everything), ``commit<corpusName>.dat`` (commit messages) and
    ``issue<corpusName>.dat`` (issue titles and bodies).

    Args:
        repoList: indexable collection of entries with ``'id'`` and ``'path'``
            keys; entries are read as ``repoList[0] .. repoList[len - 1]``.
        corpusName: suffix interpolated into the three file names.

    For each repository, commit messages go through
    ``preprocessor.preprocessNoCamel``; issue titles (``issue[4]``) go through
    the same function and issue bodies (``issue[5]``, when truthy) through
    ``preprocessor.processHTMLNoCamel``.  Each non-empty result is written as
    one line, every word followed by a single space.

    NOTE(review): the source was collapsed onto one line; "one line per
    message/title/body" (newline after all sequences, not after each one) is a
    reconstruction — confirm against the corpus format consumers expect.

    A failure inside one repository is printed with its traceback and the
    remaining repositories are still processed.
    """

    def writeLine(seqs, *targets):
        # Write all words of all sequences as one line to every target file.
        if not seqs:  # empty result -> no line at all
            return
        for seq in seqs:
            for word in seq:
                token = word.encode('utf-8') + " "
                for target in targets:
                    target.write(token)
        for target in targets:
            target.write("\n")

    # ``with`` closes all three corpora on every exit path; previously the
    # handles were leaked and the outer ``try`` had no handler.
    with open('corpus/nocode%s.dat' % corpusName, "w") as corpus, \
            open('corpus/commit%s.dat' % corpusName, "w") as commitCorpus, \
            open('corpus/issue%s.dat' % corpusName, "w") as issueCorpus:
        print('start')
        # Index-based access is kept on purpose: it is the only access
        # pattern the original relied on for ``repoList``.
        for i in range(len(repoList)):
            repoId = repoList[i]['id']
            repoPath = repoList[i]['path']
            try:
                # commit part
                gitRe = gitResolver.GitResolver(repoPath)
                commits = gitRe.getCommits()
                print("%s : %s" % (repoPath, len(commits)))
                for commit in commits:
                    writeLine(
                        preprocessor.preprocessNoCamel(
                            commit.message.decode('utf-8')),
                        corpus, commitCorpus)

                # issue part
                issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
                print("%s : %s" % (repoId, len(issues)))
                for issue in issues:
                    writeLine(
                        preprocessor.preprocessNoCamel(
                            issue[4].decode('utf-8')),
                        corpus, issueCorpus)
                    if issue[5]:
                        writeLine(
                            preprocessor.processHTMLNoCamel(
                                issue[5].decode('utf-8')),
                            corpus, issueCorpus)
            except Exception as e:
                # Best effort per repository; ``Exception`` keeps Ctrl-C and
                # SystemExit working, unlike ``BaseException``.
                print("*** %s : %s" % (repoId, e))
                print(traceback.format_exc())
        print('end')
def buildLinks(repoId):
    """Insert true and false issue-commit links for one repository.

    The repository path is read from
    ``nocodeRepoInfo.REPO_MAP[nocodeRepoInfo.USE_REPO_INDEX]['path']``.  For
    every commit:

    * each distinct ``ci[0]`` returned by
      ``mysqlOperator.selectExistIssueOnCommit((repoId, sha))`` is inserted
      into ``true_link_<repoId>``;
    * if the commit has at least one such issue, every issue of the
      repository for which ``isUnlabeled(issue, commitDateTime)`` holds and
      whose ``issue[1]`` is not one of the true links is inserted into
      ``false_link_<repoId>``.

    Args:
        repoId: integer repository id (formatted with ``%d`` into the table
            names).

    Any ``Exception`` aborts the run; the repository path and the traceback
    are printed.
    """
    print('start')
    # Bound before the try so the error report below cannot itself raise a
    # NameError (hiding the real failure) when the REPO_MAP lookup fails.
    repoPath = None
    try:
        repoPath = nocodeRepoInfo.REPO_MAP[
            nocodeRepoInfo.USE_REPO_INDEX]['path']
        gitRepo = gitResolver.GitResolver(repoPath)
        issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
        commits = gitRepo.getCommits()
        trueTable = 'true_link_%d' % repoId
        falseTable = 'false_link_%d' % repoId
        print('============== %s Start' % (repoPath,))
        for commit in commits:
            commitSha = str(commit.hexsha.encode("utf-8"))
            print(commitSha)
            commitIssues = mysqlOperator.selectExistIssueOnCommit(
                (repoId, commitSha))

            # A set gives O(1) de-duplication and membership tests below.
            trueLinks = set()
            for ci in commitIssues:
                if ci[0] not in trueLinks:
                    trueLinks.add(ci[0])
                    linkOperator.insertLink(
                        (trueTable, repoId, commitSha, ci[0]))

            # False links are only generated for commits that have at least
            # one true link, so skip the issue scan entirely otherwise.
            if len(commitIssues) == 0:
                continue

            # Loop-invariant: fetch the commit time once, not once per issue.
            commitTime = gitRepo.getDateTime(commit)
            for issue in issues:
                if isUnlabeled(issue, commitTime) \
                        and issue[1] not in trueLinks:
                    linkOperator.insertLink(
                        (falseTable, repoId, commitSha, issue[1]))
        print('============== %s End' % (repoPath,))
    except Exception:
        print('Error: %s' % (repoPath,))
        print(traceback.format_exc())
def buildIssueAndCommitSeq(repoId, repoPath, corpusName):
    """Write the code-token corpus of one repository.

    Output goes to ``corpus/code<corpusName>.dat``.  One line is written for
    every diff whose ``preprocessor.processDiffCode`` result is non-empty and
    for every issue body (``issue[5]``, when truthy) whose
    ``preprocessor.getIssueCode`` result is non-empty.  Each token is UTF-8
    encoded and followed by a single space.

    Args:
        repoId: id passed to ``mysqlOperator.selectAllIssueInOneRepo``.
        repoPath: path handed to ``gitResolver.GitResolver``.
        corpusName: suffix interpolated into the output file name.

    Any ``Exception`` during processing is printed with its traceback; the
    function then finishes normally.
    """

    def writeLine(tokens):
        # One corpus line: every token followed by a space, then a newline.
        if not tokens:  # not an empty list
            return
        corpus.writelines(word.encode('utf-8') + " " for word in tokens)
        corpus.write("\n")

    # ``with`` closes the corpus on every exit path; previously the handle
    # was leaked and the outer ``try`` had no handler.
    with open('corpus/code%s.dat' % corpusName, "w") as corpus:
        print('start')
        try:
            # commit part
            gitRe = gitResolver.GitResolver(repoPath)
            commits = gitRe.getCommits()
            print("%s : %s" % (repoPath, len(commits)))
            for commit in commits:
                for diff in gitRe.getOneDiff(commit):
                    writeLine(preprocessor.processDiffCode(diff.diff))

            # issue part
            issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
            print("%s : %s" % (repoId, len(issues)))
            for issue in issues:
                if issue[5]:
                    writeLine(
                        preprocessor.getIssueCode(issue[5].decode('utf-8')))
        except Exception as e:
            # ``Exception`` rather than ``BaseException`` so Ctrl-C and
            # SystemExit are not swallowed.
            print("*** %s : %s" % (repoId, e))
            print(traceback.format_exc())
        print('end')
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap,
                  trueCount, falseCount):
    """Build FR-style training batches and the matching text/code corpora.

    Links are consumed in batches: true links via
    ``linkOperator.selectInScope((trueTable, start, start + trueCount))`` and
    false links via ``getRandomFalse(falseTable, start, start + falseGap,
    falseCount)``.  After each batch the start offsets advance by ``trueGap``
    / ``falseGap``; the loop ends when either list comes back empty.

    Each batch is dumped as JSON to
    ``./frtrain<repoId>/traincase<repoId>-<index>.dat`` and every record's
    commit and issue parts are passed to ``writeToCorpus`` with
    ``frcorpus/text<repoId>.dat`` and ``frcorpus/code<repoId>.dat``.

    A link is skipped when its issue is missing or when ``repo.getFiles``
    fails for its commit.  An ``IOError`` ends the loop and is printed with
    its traceback.

    NOTE: ``json.dumps(..., encoding=...)`` is the Python 2 signature.
    """
    trueStart = 1
    falseStart = 1
    index = 0
    # ``with`` closes both corpora on every exit path (previously leaked).
    with open('frcorpus/text%d.dat' % repoId, "w") as textCorpus, \
            open('frcorpus/code%d.dat' % repoId, "w") as codeCorpus:
        trueLinkList = linkOperator.selectInScope(
            (trueTable, trueStart, trueStart + trueCount))
        falseLinkList = getRandomFalse(falseTable, falseStart,
                                       falseStart + falseGap, falseCount)
        repo = gitResolver.GitResolver(repoPath)

        def buildRecord(link, label):
            # Build one training record for *link*; None means "skip it".
            commit = repo.getOneCommit(link[1])
            issue = mysqlOperator.selectOneIssue(link[2])
            if issue is None:
                return None
            comments = mysqlOperator.selectCommentInOneIssue(link[2])
            try:
                files = repo.getFiles(link[1])
            except Exception:
                print('File Fail %d: %s' % (label, link[1]))
                return None

            record = {'type': label, 'issueText': []}
            # issue body
            if issue[5]:
                bodyText = issue[5].decode('utf-8')
                record['issueCode'] = frpreprocesser.extractCode(bodyText)
                record['issueText'].append(
                    frpreprocesser.extractText(bodyText))  # body
            else:
                record['issueCode'] = []
            # NOTE(review): the collapsed source does not show whether the
            # title was appended only when the body is empty; it is appended
            # unconditionally here — confirm.
            record['issueText'].append(
                frpreprocesser.extractText(issue[4].decode('utf-8')))  # title
            for comment in comments:
                record['issueText'].append(
                    frpreprocesser.extractText(comment[4].decode('utf-8')))

            record['commitText'] = [
                frpreprocesser.extractText(commit.message.decode('utf-8'))]
            record['commitCode'] = []
            for changeFile in files:
                if not changeFile['path'].endswith('.java'):
                    try:
                        record['commitText'].append(
                            frpreprocesser.extractText(
                                changeFile['text'].decode('utf-8')))
                    except Exception:
                        # Reports the link being processed; the false-link
                        # loop used to print the stale ``trueLink`` here.
                        print('%s : %s' % (link[1], changeFile['path']))
                else:
                    codes = frpreprocesser.extractCode(
                        changeFile['text'].decode('utf-8'))
                    # Keep a .java file's code only if it shares at least one
                    # element with the issue's code.
                    if any(code in record['issueCode'] for code in codes):
                        record['commitCode'].extend(codes)
            return record

        try:
            while len(trueLinkList) > 0 and len(falseLinkList) > 0:
                print('true:  %s  to  %s' % (trueStart, trueStart + trueCount))
                print('false:  %s  to  %s'
                      % (falseStart, falseStart + falseCount))
                linkList = []
                # True links (type 1) first, then false links (type 0).
                for links, label in ((trueLinkList, 1), (falseLinkList, 0)):
                    for link in links:
                        record = buildRecord(link, label)
                        if record is None:
                            continue
                        linkList.append(record)
                        writeToCorpus(textCorpus, codeCorpus,
                                      record['commitText'],
                                      record['commitCode'])
                        writeToCorpus(textCorpus, codeCorpus,
                                      record['issueText'],
                                      record['issueCode'])

                index += 1
                payload = json.dumps(linkList, encoding="utf-8", indent=4)
                trainPath = './frtrain%d/traincase%d-%d.dat' % (
                    repoId, repoId, index)
                with open(trainPath, "w") as trainSet:
                    trainSet.write(payload)
                print('%s Over' % trainPath)

                trueStart += trueGap
                falseStart += falseGap
                trueLinkList = linkOperator.selectInScope(
                    (trueTable, trueStart, trueStart + trueCount))
                falseLinkList = getRandomFalse(
                    falseTable, falseStart, falseStart + falseGap, falseCount)
        except IOError as e:
            print("*** %s" % (e,))
            print(traceback.format_exc())
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap,
                  trueCount, falseCount):
    """Build code-oriented training batches for one repository.

    Links are consumed in batches: true links via
    ``linkOperator.selectInScope((trueTable, start, start + trueCount))`` and
    false links via ``getRandomFalse(falseTable, start, start + falseGap,
    falseCount)``.  After each batch the start offsets advance by ``trueGap``
    / ``falseGap``; the loop ends when either list comes back empty.

    Each batch is dumped as JSON to
    ``./codetrain<repoId>/codetrain<repoId>-<index>.dat``.  A record holds
    ``type`` (1 for true links, 0 for false links), ``commit``,
    ``issuetitle``, ``issue``, ``issuecode`` and ``commitcode``.  Links whose
    issue cannot be found are skipped.

    ``mysqlOperator`` and ``linkOperator`` are closed before returning.

    NOTE(review): the collapsed source does not show whether the two
    ``close()`` calls were inside this function; they are kept here, after
    the batch loop — confirm.
    NOTE: ``json.dumps(..., encoding=...)`` is the Python 2 signature.
    """
    trueStart = 1
    falseStart = 1
    index = 0

    def buildRecord(repo, link, label):
        # Build one training record for *link*; None means "skip it".
        commit = repo.getOneCommit(link[1])
        issue = mysqlOperator.selectOneIssue(link[2])
        if issue is None:
            return None
        record = {
            'type': label,
            'commit': commit.message.decode('utf-8'),
            'issuetitle': issue[4].decode('utf-8'),
        }
        # issue body
        if issue[5]:
            record['issue'] = issue[5].decode('utf-8')
            bodycode = preprocessor.getIssueCode(record['issue'])
            record['issuecode'] = [bodycode] if len(bodycode) else []
        else:
            record['issue'] = ''
            record['issuecode'] = []
        diffCodes = []
        for diff in repo.getOneDiff(commit):
            diffCode = preprocessor.processDiffCode(diff.diff)
            if len(diffCode):
                diffCodes.append(diffCode)
        record['commitcode'] = diffCodes
        return record

    try:
        trueLinkList = linkOperator.selectInScope(
            (trueTable, trueStart, trueStart + trueCount))
        falseLinkList = getRandomFalse(falseTable, falseStart,
                                       falseStart + falseGap, falseCount)
        repo = gitResolver.GitResolver(repoPath)
        while len(trueLinkList) > 0 and len(falseLinkList) > 0:
            print('true:  %s  to  %s' % (trueStart, trueStart + trueCount))
            print('false:  %s  to  %s' % (falseStart, falseStart + falseCount))
            linkList = []
            # True links (type 1) first, then false links (type 0).
            for links, label in ((trueLinkList, 1), (falseLinkList, 0)):
                for link in links:
                    record = buildRecord(repo, link, label)
                    if record is not None:
                        linkList.append(record)

            index += 1
            payload = json.dumps(linkList, encoding="utf-8", indent=4)
            trainPath = './codetrain%d/codetrain%d-%d.dat' % (
                repoId, repoId, index)
            # ``with`` closes the batch file even if the write fails.
            with open(trainPath, "w") as trainSet:
                trainSet.write(payload)
            print('%s Over' % trainPath)

            trueStart += trueGap
            falseStart += falseGap
            trueLinkList = linkOperator.selectInScope(
                (trueTable, trueStart, trueStart + trueCount))
            falseLinkList = getRandomFalse(
                falseTable, falseStart, falseStart + falseGap, falseCount)
    finally:
        # Release both operators even when a batch fails part-way.
        mysqlOperator.close()
        linkOperator.close()
# -*- coding: UTF-8 -*-
# Ad-hoc check: print what GitResolver.getFiles returns for one commit of a
# local checkout.
from gitresolver import gitResolver
import datetime

# ioHandler.buildCorpus("model.dat", "corpus.dat")

# Local checkout inspected by this script.
path = 'D:/github/checkstyle'
repo = gitResolver.GitResolver(path)

# Earlier probe, kept for reference:
# print repo.getFiles('76d6365018ec7688c8a8475b2f9aa496fbcfe88c')
print(repo.getFiles('eceaa8b65a982db58d31ac901cdd751c435b1362'))
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap,
                  trueCount, falseCount):
    """Build both the code-oriented and the FR-style training batches.

    Links are consumed in batches: true links via
    ``linkOperator.selectInScope((trueTable, start, start + trueCount))`` and
    false links via ``getRandomFalse(falseTable, start, start + falseGap,
    falseCount)``.  After each batch the start offsets advance by ``trueGap``
    / ``falseGap``; the loop ends when either list comes back empty.

    Per batch two JSON files are written:
    ``<my_folder>/codetrain<repoId>-<index>.dat`` and
    ``<fr_folder>/traincase<repoId>-<index>.dat`` (``my_folder`` and
    ``fr_folder`` are module-level names defined outside this function).
    Every FR record is also passed to ``writeToCorpus`` with
    ``frcorpus/frtext<repoId>.dat`` and ``frcorpus/frcode<repoId>.dat``.

    Links whose issue cannot be found are skipped.  A failure while reading a
    commit's files is printed and the FR record is kept with whatever was
    collected so far.  An ``IOError`` ends the loop and is printed with its
    traceback.

    NOTE: ``json.dumps(..., encoding=...)`` is the Python 2 signature.
    """
    trueStart = 1
    falseStart = 1
    index = 0
    # ``with`` closes both corpora on every exit path (previously leaked).
    with open('frcorpus/frtext%d.dat' % repoId, "w") as textCorpus, \
            open('frcorpus/frcode%d.dat' % repoId, "w") as codeCorpus:
        trueLinkList = linkOperator.selectInScope(
            (trueTable, trueStart, trueStart + trueCount))
        falseLinkList = getRandomFalse(falseTable, falseStart,
                                       falseStart + falseGap, falseCount)
        repo = gitResolver.GitResolver(repoPath)

        def codeRecord(commit, issue, label):
            # Record for the code-oriented training set.
            record = {
                'type': label,
                'commit': commit.message.decode('utf-8'),
                'issuetitle': issue[4].decode('utf-8'),
            }
            # issue body
            if issue[5]:
                record['issue'] = issue[5].decode('utf-8')
                bodycode = preprocessor.getIssueCode(record['issue'])
                record['issuecode'] = [bodycode] if len(bodycode) else []
            else:
                record['issue'] = ''
                record['issuecode'] = []
            diffCodes = []
            for diff in repo.getOneDiff(commit):
                diffCode = preprocessor.processDiffCode(diff.diff)
                if len(diffCode):
                    diffCodes.append(diffCode)
            record['commitcode'] = diffCodes
            return record

        def frRecord(link, commit, issue, label):
            # Record for the FR-style training set.
            record = {'type': label, 'issueText': []}
            # issue body
            if issue[5]:
                bodyText = issue[5].decode('utf-8')
                record['issueCode'] = frpreprocesser.extractCode(bodyText)
                record['issueText'].append(
                    frpreprocesser.extractText(bodyText))  # body
            else:
                record['issueCode'] = []
            # NOTE(review): the collapsed source does not show whether the
            # title was appended only when the body is empty; it is appended
            # unconditionally here — confirm.
            record['issueText'].append(
                frpreprocesser.extractText(issue[4].decode('utf-8')))  # title
            record['commitText'] = [
                frpreprocesser.extractText(commit.message.decode('utf-8'))]
            record['commitCode'] = []
            for comment in mysqlOperator.selectCommentInOneIssue(link[2]):
                record['issueText'].append(
                    frpreprocesser.extractText(comment[4].decode('utf-8')))
            try:
                for changeFile in repo.getFiles(link[1]):
                    if not changeFile['path'].endswith('.java'):
                        try:
                            record['commitText'].append(
                                frpreprocesser.extractText(
                                    changeFile['text'].decode('utf-8')))
                        except Exception:
                            # Reports the link being processed; the
                            # false-link loop used to print the stale
                            # ``trueLink`` here.
                            print('%s : %s' % (link[1], changeFile['path']))
                    else:
                        codes = frpreprocesser.extractCode(
                            changeFile['text'].decode('utf-8'))
                        # Keep a .java file's code only if it shares at least
                        # one element with the issue's code.
                        if any(code in record['issueCode'] for code in codes):
                            record['commitCode'].extend(codes)
            except Exception:
                # Deliberately best effort: the record is still emitted.
                print('File Fail %d: %s' % (label, link[1]))
            return record

        try:
            while len(trueLinkList) > 0 and len(falseLinkList) > 0:
                print('true:  %s  to  %s' % (trueStart, trueStart + trueCount))
                print('false:  %s  to  %s'
                      % (falseStart, falseStart + falseCount))
                my_linkList = []
                fr_linkList = []
                # True links (type 1) first, then false links (type 0).
                for links, label in ((trueLinkList, 1), (falseLinkList, 0)):
                    for link in links:
                        commit = repo.getOneCommit(link[1])
                        issue = mysqlOperator.selectOneIssue(link[2])
                        if issue is None:
                            continue
                        my_linkList.append(codeRecord(commit, issue, label))
                        fr_res = frRecord(link, commit, issue, label)
                        fr_linkList.append(fr_res)
                        writeToCorpus(textCorpus, codeCorpus,
                                      fr_res['commitText'],
                                      fr_res['commitCode'])
                        writeToCorpus(textCorpus, codeCorpus,
                                      fr_res['issueText'],
                                      fr_res['issueCode'])

                index += 1
                res = json.dumps(my_linkList, encoding="utf-8", indent=4)
                myPath = '%s/codetrain%d-%d.dat' % (my_folder, repoId, index)
                with open(myPath, "w") as trainSet:
                    trainSet.write(res)
                print('%s Over' % myPath)

                fres = json.dumps(fr_linkList, encoding="utf-8", indent=4)
                frPath = '%s/traincase%d-%d.dat' % (fr_folder, repoId, index)
                with open(frPath, "w") as ftrainSet:
                    ftrainSet.write(fres)
                print('%s Over' % frPath)

                trueStart += trueGap
                falseStart += falseGap
                trueLinkList = linkOperator.selectInScope(
                    (trueTable, trueStart, trueStart + trueCount))
                falseLinkList = getRandomFalse(
                    falseTable, falseStart, falseStart + falseGap, falseCount)
        except IOError as e:
            print("*** %s" % (e,))
            print(traceback.format_exc())
def buildIssueAndCommit(repoId=50904245):
    """Write the text and code corpora for one repository.

    Args:
        repoId: id passed to ``linkOperator.selectOneRepo``; it also names
            the output files ``text<repoId>.dat`` and ``code<repoId>.dat``.
            The default reproduces the previously hard-coded repository.

    Text corpus lines come from commit messages and issue titles
    (``preprocessor.preprocessToWord``) and from element ``[1]`` of
    ``preprocessor.processHTML`` for issue bodies and comments.  Code corpus
    lines come from ``preprocessor.processDiffCode`` /
    ``processPreDiffCode`` for every diff and from element ``[0]`` of
    ``processHTML``.  Every token is followed by a single space and empty
    token lists produce no line.

    A failure inside one repository is printed with its traceback and the
    remaining repositories are still processed.
    """

    def writeLine(out, tokens, encode=True):
        # One corpus line: every token followed by a space, then a newline.
        if not tokens:  # not an empty list
            return
        for token in tokens:
            out.write(token.encode('utf-8') if encode else token)
            out.write(" ")
        out.write("\n")

    repos = linkOperator.selectOneRepo(repoId)
    # Alternative source used previously: linkOperator.selectRepoOver(5000)

    # ``with`` closes both corpora on every exit path; previously the handles
    # were leaked and the outer ``try`` had no handler.
    with open('text%s.dat' % repoId, "w") as textCorpus, \
            open('code%s.dat' % repoId, "w") as codeCorpus:
        print('start')
        for highRepo in repos:
            try:
                # commit part
                path = getPath(highRepo[1])
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print("%s : %s" % (path, len(commits)))
                for commit in commits:
                    writeLine(
                        textCorpus,
                        preprocessor.preprocessToWord(
                            commit.message.decode('utf-8')))
                    for diff in gitRe.getOneDiff(commit):
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        preDiffCode = preprocessor.processPreDiffCode(
                            diff.diff)
                        # Diff tokens are written exactly as returned (no
                        # explicit encode), as before.
                        writeLine(codeCorpus, diffCode, encode=False)
                        writeLine(codeCorpus, preDiffCode, encode=False)

                # issue part
                issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0])
                print("%s : %s" % (highRepo[0], len(issues)))
                for issue in issues:
                    writeLine(
                        textCorpus,
                        preprocessor.preprocessToWord(
                            issue[4].decode('utf-8')))
                    if issue[5]:
                        body = preprocessor.processHTML(
                            issue[5].decode('utf-8'))
                        writeLine(textCorpus, body[1])
                        writeLine(codeCorpus, body[0])
                    # NOTE(review): comments are read for every issue, not
                    # only for issues with a body; the collapsed source is
                    # ambiguous here — confirm.
                    comments = mysqlOperator.selectCommentInOneIssue(issue[1])
                    for comment in comments:
                        temp = preprocessor.processHTML(
                            comment[4].decode('utf-8'))
                        writeLine(textCorpus, temp[1])
                        writeLine(codeCorpus, temp[0])
            except Exception as e:
                # Best effort per repository; ``Exception`` keeps Ctrl-C and
                # SystemExit working, unlike ``BaseException``.
                print("*** %s : %s" % (highRepo[0], e))
                print(traceback.format_exc())
        print('end')
comments = mysqlOperatorCopy.selectAllCommentInOneRepoDate(cou) issueSet = [] for issue in issues: if issue[1] not in issueSet: issueSet.append(issue[1]) for comment in comments: if comment[1] not in issueSet: issueSet.append(comment[1]) return issueSet projects = linkOperator.selectRepoOver(5000) print 'start' for repo in projects: try: gitRepo = gitResolver.GitResolver(getPath(repo[1])) commits = gitRepo.getCommits() print '==============', getPath(repo[1]), 'Start' for commit in commits: commitSha = str(commit.hexsha.encode("utf-8")) print commitSha commitIssues = mysqlOperatorCopy.selectExistIssueOnCommit( (repo[0], commitSha)) trueLinks = [] for ci in commitIssues: trueLinks.append(ci[0]) issueByDate = getIssueInDate( (repo[0], str(gitRepo.getDateTime(commit)), str(gitRepo.getDateTime(commit)))) for i in issueByDate: if len(commitIssues) > 0:
""" dot_val = 0.0 a_norm = 0.0 b_norm = 0.0 for a, b in zip(a_vect, b_vect): dot_val += a * b a_norm += a ** 2 b_norm += b ** 2 if a_norm == 0.0 or b_norm == 0.0: return -1 else: return dot_val / ((a_norm * b_norm) ** 0.5) repoMap = {} repoMap[12983151L] = gitResolver.GitResolver('/home/fdse/data/prior_repository/openhab/openhab1-addons') # repos = mysqlOperator.selectAllHighRepository() # for repo in repos: # print type(repo[0]) # try: # repoMap[repo[0]] = gitResolver.GitResolver(getPath(repo[1])) # except: # repoMap[repo[0]] = None # TRUE_LINK_TOTAL = linkOperator.count('true_link') # FALSE_LINK_TOTAL = linkOperator.count('false_link') TRUE_GAP = 559 FALSE_GAP = 15000 TRUE_COUNT = 559 FALSE_COUNT = 600