def buildCommitPart(): repos = linkOperator.selectRepoOver(5000) logCorpus = open('commitLog.dat', "w") codeCorpus = open('commitCode.dat', "w") try: print 'start' for highRepo in repos: path = getPath(highRepo[1]) try: gitRe = gitResolver.GitResolver(path) commits = gitRe.getCommits() print path, ":", len(commits) for commit in commits: words = preprocessor.preprocessToWord( commit.message.decode('utf-8')) if len(words): # 不是空列表 for word in words: logCorpus.write(word.encode('utf-8')) logCorpus.write(" ") logCorpus.write("\n") diffs = gitRe.getOneDiff(commit) for diff in diffs: diffCode = preprocessor.processDiffCode(diff.diff) if len(diffCode): for code in diffCode: codeCorpus.write(code) codeCorpus.write(" ") codeCorpus.write("\n") except BaseException, e: print "***", path, ":", e print traceback.format_exc() print 'end'
def buildIssueAndCommitSeq(repoId, repoPath, corpusName): corpus = open('corpus/code%s.dat' % corpusName, "w") try: print 'start' try: # commit part gitRe = gitResolver.GitResolver(repoPath) commits = gitRe.getCommits() print repoPath, ":", len(commits) for commit in commits: diffs = gitRe.getOneDiff(commit) for diff in diffs: diffCode = preprocessor.processDiffCode(diff.diff) if len(diffCode): for word in diffCode: corpus.write(word.encode('utf-8')) corpus.write(" ") corpus.write("\n") # issue part issues = mysqlOperator.selectAllIssueInOneRepo(repoId) print repoId, ":", len(issues) for issue in issues: if issue[5]: bodycode = preprocessor.getIssueCode( issue[5].decode('utf-8')) if len(bodycode): # 不是空列表 for word in bodycode: corpus.write(word.encode('utf-8')) corpus.write(" ") corpus.write("\n") except BaseException, e: print "***", repoId, ":", e print traceback.format_exc() print 'end'
# NOTE(review): the next three lines are the tail of a function whose header
# lies above this view; their indentation is reconstructed -- confirm against
# the full definition.
        return -1
    else:
        return dot_val / ((a_norm * b_norm)**0.5)


# Load the two Doc2Vec models from their saved files.
textModel = Doc2Vec.load("text12983151.model")
codeModel = Doc2Vec.load("code12983151.model")

# Smoke test: run the same fixed sentence through both models three times.
index = 0
while index < 3:
    linkList = []
    titleWords = preprocessor.preprocessToWord("test is for your parents")
    # Debug output: type of the inferred vector, then of one of its elements
    # after conversion with tolist().
    print type(textModel.infer_vector(titleWords))
    titelTextVec = textModel.infer_vector(titleWords).tolist()
    print type(titelTextVec[0])
    diffCode = preprocessor.processDiffCode("test is for your parents")
    commitCodeVec = codeModel.infer_vector(diffCode).tolist()
    # The same text/code pair is appended twice.
    linkList.append({'text': titelTextVec, 'code': commitCodeVec})
    linkList.append({'text': titelTextVec, 'code': commitCodeVec})
    index += 1
    # Disabled: dump linkList as JSON into a per-iteration file.
    # res = json.dumps(linkList, encoding="utf-8", indent=4)
    # trainSet = open('./train/traruanhincase%d.dat' % index, "w")
    # trainSet.write(res)
    # trainSet.close()

# Disabled: list the files under ./train.
# path = './train'
# filelist = os.listdir(path)
# for i in range(0, len(filelist)):
#     filepath = os.path.join(path, filelist[i])
#     print filepath
# -*- coding: UTF-8 -*- from preprocessor import preprocessor import re print preprocessor.processHTML(''' Examples shown in the javadoc for TESD_DSAFSA_DDS <code>ReplayingDecoder.addOption</code> seems to be wrong. In the document it shows <code>IntegerHeaderFrameDecoder, MyDecoder</code> taking multiple parameters where as in reality it can only accept one. I'm working with versions 4.0.0.CR3, 4.0.0.CR5. ''') print preprocessor.processDiffCode(''' @@ -349 +349 @@ public class JavadocUtilsTest { - "HTML_COMMENT", JavadocUtils.getTokenName(20077)); + "HTML_COMMENT", JavadocUtils.getTokenName(20078)); ''') print preprocessor.preprocessToWord(''' Examples shown in the javadoc for TESD_DSAFSA_DDS ReplayingDecoder.addOption seems to be wrong. In the document it shows IntegerHeaderFrameDecoder, MyDecoder taking multiple parameters where as in reality it can only accept one. I'm working with versions 4.0.0.CR3, 4.0.0.CR5. ''') if None: print 'none' else: print 'other' # camelCase1 = re.compile(r'^[A-Z]+[a-z]+.*[A-Z]+.*$') # 3 # camelCase2 = re.compile(r'^[a-z]+.*[A-Z]+.*$') # 12 # upperCase = re.compile(r'^[A-Z]+[0-9]*$') # 7 # upperExtCase = re.compile(r'^[A-Z]*(_+[A-Z]*)+[0-9]*$') # 6 # # print re.match(upperExtCase, 'aOption'), '1' # print re.match(upperExtCase, 'addOption'), '2' # print re.match(upperExtCase, 'AddToDeal'), '3'
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap, trueCount, falseCount): trueStart = 1 falseStart = 1 trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount)) falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount) index = 0 repo = gitResolver.GitResolver(repoPath) while len(trueLinkList) > 0 and len(falseLinkList) > 0: print 'true: ', trueStart, ' to ', trueStart + trueCount print 'false: ', falseStart, ' to ', falseStart + falseCount linkList = [] for trueLink in trueLinkList: commit = repo.getOneCommit(trueLink[1]) issue = mysqlOperator.selectOneIssue(trueLink[2]) if issue is None: continue res = {} res['type'] = 1 res['commit'] = commit.message.decode('utf-8') res['issuetitle'] = issue[4].decode('utf-8') # issue body if issue[5]: res['issue'] = issue[5].decode('utf-8') issueCodes = [] bodycode = preprocessor.getIssueCode(res['issue']) if len(bodycode): issueCodes.append(bodycode) res['issuecode'] = issueCodes else: res['issue'] = '' res['issuecode'] = [] diffs = repo.getOneDiff(commit) diffCodes = [] for diff in diffs: diffCode = preprocessor.processDiffCode(diff.diff) if len(diffCode): diffCodes.append(diffCode) res['commitcode'] = diffCodes linkList.append(res) for falseLink in falseLinkList: commit = repo.getOneCommit(falseLink[1]) issue = mysqlOperator.selectOneIssue(falseLink[2]) if issue is None: continue res = {} res['type'] = 0 res['commit'] = commit.message.decode('utf-8') res['issuetitle'] = issue[4].decode('utf-8') # issue body if issue[5]: res['issue'] = issue[5].decode('utf-8') issueCodes = [] bodycode = preprocessor.getIssueCode(res['issue']) if len(bodycode): issueCodes.append(bodycode) res['issuecode'] = issueCodes else: res['issue'] = '' res['issuecode'] = [] diffs = repo.getOneDiff(commit) diffCodes = [] for diff in diffs: diffCode = preprocessor.processDiffCode(diff.diff) if len(diffCode): diffCodes.append(diffCode) res['commitcode'] = diffCodes 
linkList.append(res) index += 1 res = json.dumps(linkList, encoding="utf-8", indent=4) trainSet = open('./codetrain%d/codetrain%d-%d.dat' % (repoId, repoId, index), "w") trainSet.write(res) trainSet.close() print './codetrain%d/codetrain%d-%d.dat' % (repoId, repoId, index), 'Over' trueStart += trueGap falseStart += falseGap trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount)) falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount) mysqlOperator.close() linkOperator.close()
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap, trueCount, falseCount): trueStart = 1 falseStart = 1 textCorpus = open('frcorpus/frtext%d.dat' % repoId, "w") codeCorpus = open('frcorpus/frcode%d.dat' % repoId, "w") trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount)) falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount) index = 0 repo = gitResolver.GitResolver(repoPath) try: while len(trueLinkList) > 0 and len(falseLinkList) > 0: print 'true: ', trueStart, ' to ', trueStart + trueCount print 'false: ', falseStart, ' to ', falseStart + falseCount my_linkList = [] fr_linkList = [] for trueLink in trueLinkList: commit = repo.getOneCommit(trueLink[1]) issue = mysqlOperator.selectOneIssue(trueLink[2]) if issue is None: continue my_res = {} my_res['type'] = 1 my_res['commit'] = commit.message.decode('utf-8') my_res['issuetitle'] = issue[4].decode('utf-8') # issue body if issue[5]: my_res['issue'] = issue[5].decode('utf-8') issueCodes = [] bodycode = preprocessor.getIssueCode(my_res['issue']) if len(bodycode): issueCodes.append(bodycode) my_res['issuecode'] = issueCodes else: my_res['issue'] = '' my_res['issuecode'] = [] diffs = repo.getOneDiff(commit) diffCodes = [] for diff in diffs: diffCode = preprocessor.processDiffCode(diff.diff) if len(diffCode): diffCodes.append(diffCode) my_res['commitcode'] = diffCodes my_linkList.append(my_res) fr_res = {} fr_res['type'] = 1 fr_res['issueText'] = [] # issue body if issue[5]: fr_res['issueCode'] = frpreprocesser.extractCode(issue[5].decode('utf-8')) fr_res['issueText'].append(frpreprocesser.extractText(issue[5].decode('utf-8'))) # body else: fr_res['issueCode'] = [] fr_res['issueText'].append(frpreprocesser.extractText(issue[4].decode('utf-8'))) # title fr_res['commitText'] = [] fr_res['commitCode'] = [] fr_res['commitText'].append(frpreprocesser.extractText(commit.message.decode('utf-8'))) comments = 
mysqlOperator.selectCommentInOneIssue(trueLink[2]) for comment in comments: fr_res['issueText'].append(frpreprocesser.extractText(comment[4].decode('utf-8'))) try: files = repo.getFiles(trueLink[1]) for changeFile in files: if not changeFile['path'].endswith('.java'): try: fr_res['commitText'].append( frpreprocesser.extractText(changeFile['text'].decode('utf-8'))) except: print trueLink[1], ':', changeFile['path'] else: codes = frpreprocesser.extractCode(changeFile['text'].decode('utf-8')) for code in codes: if code in fr_res['issueCode']: fr_res['commitCode'].extend(codes) break except: print 'File Fail 1:', trueLink[1] fr_linkList.append(fr_res) writeToCorpus(textCorpus, codeCorpus, fr_res['commitText'], fr_res['commitCode']) writeToCorpus(textCorpus, codeCorpus, fr_res['issueText'], fr_res['issueCode']) for falseLink in falseLinkList: commit = repo.getOneCommit(falseLink[1]) issue = mysqlOperator.selectOneIssue(falseLink[2]) if issue is None: continue my_res = {} my_res['type'] = 0 my_res['commit'] = commit.message.decode('utf-8') my_res['issuetitle'] = issue[4].decode('utf-8') # issue body if issue[5]: my_res['issue'] = issue[5].decode('utf-8') issueCodes = [] bodycode = preprocessor.getIssueCode(my_res['issue']) if len(bodycode): issueCodes.append(bodycode) my_res['issuecode'] = issueCodes else: my_res['issue'] = '' my_res['issuecode'] = [] diffs = repo.getOneDiff(commit) diffCodes = [] for diff in diffs: diffCode = preprocessor.processDiffCode(diff.diff) if len(diffCode): diffCodes.append(diffCode) my_res['commitcode'] = diffCodes my_linkList.append(my_res) fr_res = {} fr_res['type'] = 0 fr_res['issueText'] = [] # issue body if issue[5]: fr_res['issueCode'] = frpreprocesser.extractCode(issue[5].decode('utf-8')) fr_res['issueText'].append(frpreprocesser.extractText(issue[5].decode('utf-8'))) # body else: fr_res['issueCode'] = [] fr_res['issueText'].append(frpreprocesser.extractText(issue[4].decode('utf-8'))) # title fr_res['commitText'] = [] 
fr_res['commitCode'] = [] fr_res['commitText'].append(frpreprocesser.extractText(commit.message.decode('utf-8'))) comments = mysqlOperator.selectCommentInOneIssue(falseLink[2]) for comment in comments: fr_res['issueText'].append(frpreprocesser.extractText(comment[4].decode('utf-8'))) try: files = repo.getFiles(falseLink[1]) for changeFile in files: if not changeFile['path'].endswith('.java'): try: fr_res['commitText'].append( frpreprocesser.extractText(changeFile['text'].decode('utf-8'))) except: print trueLink[1], ':', changeFile['path'] else: codes = frpreprocesser.extractCode(changeFile['text'].decode('utf-8')) for code in codes: if code in fr_res['issueCode']: fr_res['commitCode'].extend(codes) break except: print 'File Fail 0:', falseLink[1] fr_linkList.append(fr_res) writeToCorpus(textCorpus, codeCorpus, fr_res['commitText'], fr_res['commitCode']) writeToCorpus(textCorpus, codeCorpus, fr_res['issueText'], fr_res['issueCode']) index += 1 res = json.dumps(my_linkList, encoding="utf-8", indent=4) trainSet = open('%s/codetrain%d-%d.dat' % (my_folder, repoId, index), "w") trainSet.write(res) trainSet.close() print '%s/codetrain%d-%d.dat' % (my_folder, repoId, index), 'Over' fres = json.dumps(fr_linkList, encoding="utf-8", indent=4) ftrainSet = open('%s/traincase%d-%d.dat' % (fr_folder, repoId, index), "w") ftrainSet.write(fres) ftrainSet.close() print '%s/traincase%d-%d.dat' % (fr_folder, repoId, index), 'Over' trueStart += trueGap falseStart += falseGap trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount)) falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount) except IOError, e: print "***", e print traceback.format_exc()
def buildIssueAndCommit(): repos = linkOperator.selectOneRepo(50904245) # repos = linkOperator.selectRepoOver(5000) textCorpus = open('text50904245.dat', "w") codeCorpus = open('code50904245.dat', "w") try: print 'start' for highRepo in repos: try: # commit part path = getPath(highRepo[1]) gitRe = gitResolver.GitResolver(path) commits = gitRe.getCommits() print path, ":", len(commits) for commit in commits: words = preprocessor.preprocessToWord( commit.message.decode('utf-8')) if len(words): # 不是空列表 for word in words: textCorpus.write(word.encode('utf-8')) textCorpus.write(" ") textCorpus.write("\n") diffs = gitRe.getOneDiff(commit) for diff in diffs: diffCode = preprocessor.processDiffCode(diff.diff) preDiffCode = preprocessor.processPreDiffCode( diff.diff) if len(diffCode): for code in diffCode: codeCorpus.write(code) codeCorpus.write(" ") codeCorpus.write("\n") if len(preDiffCode): for code in preDiffCode: codeCorpus.write(code) codeCorpus.write(" ") codeCorpus.write("\n") # issue part issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0]) print highRepo[0], ":", len(issues) for issue in issues: titleWords = preprocessor.preprocessToWord( issue[4].decode('utf-8')) if len(titleWords): # 不是空列表 for word in titleWords: textCorpus.write(word.encode('utf-8')) textCorpus.write(" ") textCorpus.write("\n") if issue[5]: body = preprocessor.processHTML( issue[5].decode('utf-8')) bodyWords = body[1] codeWords = body[0] if len(bodyWords): # 不是空列表 for word in bodyWords: textCorpus.write(word.encode('utf-8')) textCorpus.write(" ") textCorpus.write("\n") if len(codeWords): # 不是空列表 for word in codeWords: codeCorpus.write(word.encode('utf-8')) codeCorpus.write(" ") codeCorpus.write("\n") comments = mysqlOperator.selectCommentInOneIssue(issue[1]) for comment in comments: temp = preprocessor.processHTML( comment[4].decode('utf-8')) cBodyWords = temp[1] cCodeWords = temp[0] if len(cBodyWords): # 不是空列表 for word in cBodyWords: textCorpus.write(word.encode('utf-8')) 
textCorpus.write(" ") textCorpus.write("\n") if len(cCodeWords): # 不是空列表 for word in cCodeWords: codeCorpus.write(word.encode('utf-8')) codeCorpus.write(" ") codeCorpus.write("\n") except BaseException, e: print "***", highRepo[0], ":", e print traceback.format_exc() print 'end'
index = 0 while len(trueLinkList) > 0 and len(falseLinkList) > 0: print 'true: ', trueStart, ' to ', trueStart+TRUE_COUNT print 'false: ', falseStart, ' to ', falseStart+FALSE_COUNT linkList = [] for trueLink in trueLinkList: tempMap = {} tempMap['type'] = 1 repo = repoMap[trueLink[0]] commit = repo.getOneCommit(trueLink[1]) issue = mysqlOperator.selectOneIssue(trueLink[2]) comments = mysqlOperator.selectCommentInOneIssue(trueLink[2]) diffs = repo.getOneDiff(commit) diffCodeList = [] for diff in diffs: diffCode = preprocessor.processDiffCode(diff.diff) preDiffCode = preprocessor.processPreDiffCode(diff.diff) diffCodeList.append((codeModel.infer_vector(diffCode), codeModel.infer_vector(preDiffCode))) # code part init codeMax = -1 tempMap['commitCode'] = None tempMap['issueCode'] = None # text part init commitText = preprocessor.preprocessToWord(commit.message.decode('utf-8')) commitTextVec = textModel.infer_vector(commitText) tempMap['commitText'] = commitTextVec # 确定不变 titleWords = preprocessor.preprocessToWord(issue[4].decode('utf-8')) tempMap['issueText'] = textModel.infer_vector(titleWords) # 可能改变 textMax = similarity(commitTextVec, tempMap['issueText']) # issue body