示例#1
0
def buildCommitPart():
    # Dump commit-message words and diff code tokens into two corpus files
    # for every repository returned by linkOperator.selectRepoOver(5000).
    # NOTE(review): neither a close() for the two files nor a handler for the
    # outer `try` is visible in this excerpt -- confirm against the full file.
    repos = linkOperator.selectRepoOver(5000)
    logCorpus = open('commitLog.dat', "w")
    codeCorpus = open('commitCode.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            # getPath() maps highRepo[1] to the path handed to GitResolver.
            path = getPath(highRepo[1])
            try:
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if len(words):
                        # Non-empty word list: write one line for this commit
                        # message, every word followed by a space.
                        for word in words:
                            logCorpus.write(word.encode('utf-8'))
                            logCorpus.write(" ")
                        logCorpus.write("\n")
                    diffs = gitRe.getOneDiff(commit)
                    for diff in diffs:
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        if len(diffCode):
                            # One line per diff that yields any code tokens.
                            for code in diffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                # Any failure inside one repository is printed and the loop
                # continues with the next repository.
                print "***", path, ":", e
                print traceback.format_exc()
        print 'end'
示例#2
0
def buildFromGit():
    repos = mysqlOperator.selectAllHighRepository()
    corpus = open('corpusLabel.dat', "w")
    try:
        for highRepo in repos:
            path = getPath(highRepo[1])
            try:
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    corpus.write(str(highRepo[0]).encode("utf-8"))
                    corpus.write("\n")
                    corpus.write(commit.hexsha.encode("utf-8"))
                    corpus.write("\n")
                    sens = preprocessor.preprocess(
                        commit.message.decode('utf-8'))
                    for sentence in sens:
                        if len(sentence):  #不是空列表
                            for word in sentence:
                                corpus.write(word.encode('utf-8'))
                                corpus.write(" ")
                            corpus.write("\n")
                    corpus.write("\n")
            except BaseException, e:
                print "***", path, ":", e
    except IOError, e:  #检查open()是否失败,通常是IOError类型的错误
        print "***", e
示例#3
0
def buildByList(repoList, corpusName):
    # Build three corpus files named after corpusName under corpus/:
    #   nocode<name>.dat  - commit and issue text together,
    #   commit<name>.dat  - commit text only,
    #   issue<name>.dat   - issue title/body text only.
    # repoList is indexed positionally and every entry is read through its
    # 'id' and 'path' keys.
    # NOTE(review): neither a close() for the three files nor a handler for
    # the outer `try` is visible in this excerpt -- confirm against the full
    # file.
    corpus = open('corpus/nocode%s.dat' % corpusName, "w")
    commitCorpus = open('corpus/commit%s.dat' % corpusName, "w")
    issueCorpus = open('corpus/issue%s.dat' % corpusName, "w")
    try:
        print 'start'
        for i in range(len(repoList)):
            repoId = repoList[i]['id']
            repoPath = repoList[i]['path']
            try:
                # commit part
                gitRe = gitResolver.GitResolver(repoPath)
                commits = gitRe.getCommits()
                print repoPath, ":", len(commits)
                for commit in commits:
                    seqs = preprocessor.preprocessNoCamel(
                        commit.message.decode('utf-8'))
                    if len(seqs):
                        # Non-empty list: each sequence becomes one line in
                        # both the combined and the commit-only corpus.
                        for seq in seqs:
                            for word in seq:
                                corpus.write(word.encode('utf-8'))
                                corpus.write(" ")
                                commitCorpus.write(word.encode('utf-8'))
                                commitCorpus.write(" ")
                            corpus.write("\n")
                            commitCorpus.write("\n")
                # issue part
                issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
                print repoId, ":", len(issues)
                for issue in issues:
                    # issue[4] is always processed; issue[5] only when truthy
                    # (presumably title and body -- confirm against schema).
                    titleSeqs = preprocessor.preprocessNoCamel(
                        issue[4].decode('utf-8'))
                    if len(titleSeqs):
                        # Non-empty list: one line per title sequence.
                        for titleSeq in titleSeqs:
                            for word in titleSeq:
                                corpus.write(word.encode('utf-8'))
                                corpus.write(" ")
                                issueCorpus.write(word.encode('utf-8'))
                                issueCorpus.write(" ")
                            corpus.write("\n")
                            issueCorpus.write("\n")
                    if issue[5]:
                        body = preprocessor.processHTMLNoCamel(
                            issue[5].decode('utf-8'))
                        if len(body):
                            # Non-empty list: one line per body sequence.
                            for bodySeq in body:
                                for word in bodySeq:
                                    corpus.write(word.encode('utf-8'))
                                    corpus.write(" ")
                                    issueCorpus.write(word.encode('utf-8'))
                                    issueCorpus.write(" ")
                                corpus.write("\n")
                                issueCorpus.write("\n")
            except BaseException, e:
                # A failure in one repository is printed with its traceback
                # and the loop continues with the next entry.
                print "***", repoId, ":", e
                print traceback.format_exc()
        print 'end'
示例#4
0
def buildLinks(repoId):
    print 'start'
    try:
        repoPath = nocodeRepoInfo.REPO_MAP[
            nocodeRepoInfo.USE_REPO_INDEX]['path']
        gitRepo = gitResolver.GitResolver(repoPath)
        issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
        commits = gitRepo.getCommits()
        # repoName = re.sub(r'https://github.com/', '', repo[1], 0, re.I)
        print '==============', repoPath, 'Start'
        for commit in commits:
            commitSha = str(commit.hexsha.encode("utf-8"))
            print commitSha
            commitIssues = mysqlOperator.selectExistIssueOnCommit(
                (repoId, commitSha))
            trueLinks = []
            for ci in commitIssues:
                if ci[0] in trueLinks:
                    pass
                else:
                    trueLinks.append(ci[0])
                    linkOperator.insertLink(
                        ('true_link_%d' % repoId, repoId, commitSha, ci[0]))
            for issue in issues:
                if isUnlabeled(issue, gitRepo.getDateTime(commit)):
                    if len(commitIssues) > 0:
                        if issue[1] in trueLinks:
                            pass
                        else:
                            linkOperator.insertLink(
                                ('false_link_%d' % repoId, repoId, commitSha,
                                 issue[1]))
                    else:
                        pass
        print '==============', repoPath, 'End'
    except Exception, e:
        print 'Error:', repoPath
        print traceback.format_exc()
示例#5
0
def buildIssueAndCommitSeq(repoId, repoPath, corpusName):
    # Write the code tokens of one repository to corpus/code<corpusName>.dat:
    # first one line per non-empty processed commit diff, then one line per
    # issue whose issue[5] field yields code via preprocessor.getIssueCode().
    # NOTE(review): neither a close() for the file nor a handler for the
    # outer `try` is visible in this excerpt -- confirm against the full file.
    corpus = open('corpus/code%s.dat' % corpusName, "w")
    try:
        print 'start'
        try:
            # commit part
            gitRe = gitResolver.GitResolver(repoPath)
            commits = gitRe.getCommits()
            print repoPath, ":", len(commits)
            for commit in commits:
                diffs = gitRe.getOneDiff(commit)
                for diff in diffs:
                    diffCode = preprocessor.processDiffCode(diff.diff)
                    if len(diffCode):
                        # One line per diff, each token followed by a space.
                        for word in diffCode:
                            corpus.write(word.encode('utf-8'))
                            corpus.write(" ")
                        corpus.write("\n")

            # issue part
            issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
            print repoId, ":", len(issues)
            for issue in issues:
                # Issues with an empty issue[5] are skipped entirely.
                if issue[5]:
                    bodycode = preprocessor.getIssueCode(
                        issue[5].decode('utf-8'))
                    if len(bodycode):
                        # Non-empty list: one line per issue.
                        for word in bodycode:
                            corpus.write(word.encode('utf-8'))
                            corpus.write(" ")
                        corpus.write("\n")
        except BaseException, e:
            # Any failure aborts the remaining work for this repository and
            # is printed together with its traceback.
            print "***", repoId, ":", e
            print traceback.format_exc()
        print 'end'
示例#6
0
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap,
                  trueCount, falseCount):
    trueStart = 1
    falseStart = 1
    textCorpus = open('frcorpus/text%d.dat' % repoId, "w")
    codeCorpus = open('frcorpus/code%d.dat' % repoId, "w")
    trueLinkList = linkOperator.selectInScope(
        (trueTable, trueStart, trueStart + trueCount))
    falseLinkList = getRandomFalse(falseTable, falseStart,
                                   falseStart + falseGap, falseCount)

    index = 0
    repo = gitResolver.GitResolver(repoPath)
    try:
        while len(trueLinkList) > 0 and len(falseLinkList) > 0:
            print 'true: ', trueStart, ' to ', trueStart + trueCount
            print 'false: ', falseStart, ' to ', falseStart + falseCount
            linkList = []
            for trueLink in trueLinkList:
                commit = repo.getOneCommit(trueLink[1])
                issue = mysqlOperator.selectOneIssue(trueLink[2])
                if issue is None:
                    continue
                comments = mysqlOperator.selectCommentInOneIssue(trueLink[2])
                try:
                    files = repo.getFiles(trueLink[1])
                except:
                    print 'File Fail 1:', trueLink[1]
                    continue

                res = {}
                res['type'] = 1
                res['issueText'] = []
                # issue body
                if issue[5]:
                    res['issueCode'] = frpreprocesser.extractCode(
                        issue[5].decode('utf-8'))
                    res['issueText'].append(
                        frpreprocesser.extractText(
                            issue[5].decode('utf-8')))  # body
                else:
                    res['issueCode'] = []
                res['issueText'].append(
                    frpreprocesser.extractText(
                        issue[4].decode('utf-8')))  # title
                for comment in comments:
                    res['issueText'].append(
                        frpreprocesser.extractText(comment[4].decode('utf-8')))
                res['commitText'] = []
                res['commitCode'] = []
                res['commitText'].append(
                    frpreprocesser.extractText(commit.message.decode('utf-8')))
                for changeFile in files:
                    if not changeFile['path'].endswith('.java'):
                        try:
                            res['commitText'].append(
                                frpreprocesser.extractText(
                                    changeFile['text'].decode('utf-8')))
                        except:
                            print trueLink[1], ':', changeFile['path']
                    else:
                        codes = frpreprocesser.extractCode(
                            changeFile['text'].decode('utf-8'))
                        for code in codes:
                            if code in res['issueCode']:
                                res['commitCode'].extend(codes)
                                break
                linkList.append(res)
                writeToCorpus(textCorpus, codeCorpus, res['commitText'],
                              res['commitCode'])
                writeToCorpus(textCorpus, codeCorpus, res['issueText'],
                              res['issueCode'])

            for falseLink in falseLinkList:
                commit = repo.getOneCommit(falseLink[1])
                issue = mysqlOperator.selectOneIssue(falseLink[2])
                if issue is None:
                    continue
                comments = mysqlOperator.selectCommentInOneIssue(falseLink[2])
                try:
                    files = repo.getFiles(falseLink[1])
                except:
                    print 'File Fail 0:', falseLink[1]
                    continue

                res = {}
                res['type'] = 0
                res['issueText'] = []
                # issue body
                if issue[5]:
                    res['issueCode'] = frpreprocesser.extractCode(
                        issue[5].decode('utf-8'))
                    res['issueText'].append(
                        frpreprocesser.extractText(
                            issue[5].decode('utf-8')))  # body
                else:
                    res['issueCode'] = []
                res['issueText'].append(
                    frpreprocesser.extractText(
                        issue[4].decode('utf-8')))  # title
                for comment in comments:
                    res['issueText'].append(
                        frpreprocesser.extractText(comment[4].decode('utf-8')))
                res['commitText'] = []
                res['commitCode'] = []
                res['commitText'].append(
                    frpreprocesser.extractText(commit.message.decode('utf-8')))
                for changeFile in files:
                    if not changeFile['path'].endswith('.java'):
                        try:
                            res['commitText'].append(
                                frpreprocesser.extractText(
                                    changeFile['text'].decode('utf-8')))
                        except:
                            print trueLink[1], ':', changeFile['path']
                    else:
                        codes = frpreprocesser.extractCode(
                            changeFile['text'].decode('utf-8'))
                        for code in codes:
                            if code in res['issueCode']:
                                res['commitCode'].extend(codes)
                                break
                linkList.append(res)
                writeToCorpus(textCorpus, codeCorpus, res['commitText'],
                              res['commitCode'])
                writeToCorpus(textCorpus, codeCorpus, res['issueText'],
                              res['issueCode'])

            index += 1
            res = json.dumps(linkList, encoding="utf-8", indent=4)
            trainSet = open(
                './frtrain%d/traincase%d-%d.dat' % (repoId, repoId, index),
                "w")
            trainSet.write(res)
            trainSet.close()
            print './frtrain%d/traincase%d-%d.dat' % (repoId, repoId,
                                                      index), 'Over'

            trueStart += trueGap
            falseStart += falseGap
            trueLinkList = linkOperator.selectInScope(
                (trueTable, trueStart, trueStart + trueCount))
            falseLinkList = getRandomFalse(falseTable, falseStart,
                                           falseStart + falseGap, falseCount)
    except IOError, e:
        print "***", e
        print traceback.format_exc()
示例#7
0
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap, trueCount, falseCount):
    trueStart = 1
    falseStart = 1
    trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount))
    falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount)

    index = 0
    repo = gitResolver.GitResolver(repoPath)
    while len(trueLinkList) > 0 and len(falseLinkList) > 0:
        print 'true: ', trueStart, ' to ', trueStart + trueCount
        print 'false: ', falseStart, ' to ', falseStart + falseCount
        linkList = []
        for trueLink in trueLinkList:
            commit = repo.getOneCommit(trueLink[1])
            issue = mysqlOperator.selectOneIssue(trueLink[2])
            if issue is None:
                continue

            res = {}
            res['type'] = 1
            res['commit'] = commit.message.decode('utf-8')
            res['issuetitle'] = issue[4].decode('utf-8')
            # issue body
            if issue[5]:
                res['issue'] = issue[5].decode('utf-8')
                issueCodes = []
                bodycode = preprocessor.getIssueCode(res['issue'])
                if len(bodycode):
                    issueCodes.append(bodycode)
                res['issuecode'] = issueCodes
            else:
                res['issue'] = ''
                res['issuecode'] = []

            diffs = repo.getOneDiff(commit)
            diffCodes = []
            for diff in diffs:
                diffCode = preprocessor.processDiffCode(diff.diff)
                if len(diffCode):
                    diffCodes.append(diffCode)
            res['commitcode'] = diffCodes

            linkList.append(res)

        for falseLink in falseLinkList:
            commit = repo.getOneCommit(falseLink[1])
            issue = mysqlOperator.selectOneIssue(falseLink[2])
            if issue is None:
                continue

            res = {}
            res['type'] = 0
            res['commit'] = commit.message.decode('utf-8')
            res['issuetitle'] = issue[4].decode('utf-8')
            # issue body
            if issue[5]:
                res['issue'] = issue[5].decode('utf-8')
                issueCodes = []
                bodycode = preprocessor.getIssueCode(res['issue'])
                if len(bodycode):
                    issueCodes.append(bodycode)
                res['issuecode'] = issueCodes
            else:
                res['issue'] = ''
                res['issuecode'] = []

            diffs = repo.getOneDiff(commit)
            diffCodes = []
            for diff in diffs:
                diffCode = preprocessor.processDiffCode(diff.diff)
                if len(diffCode):
                    diffCodes.append(diffCode)
            res['commitcode'] = diffCodes
            linkList.append(res)

        index += 1
        res = json.dumps(linkList, encoding="utf-8", indent=4)
        trainSet = open('./codetrain%d/codetrain%d-%d.dat' % (repoId, repoId, index), "w")
        trainSet.write(res)
        trainSet.close()
        print './codetrain%d/codetrain%d-%d.dat' % (repoId, repoId, index), 'Over'

        trueStart += trueGap
        falseStart += falseGap
        trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount))
        falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount)
    mysqlOperator.close()
    linkOperator.close()
示例#8
0
# -*- coding: UTF-8 -*-

from gitresolver import gitResolver
import datetime

# ioHandler.buildCorpus("model.dat", "corpus.dat")
# Ad-hoc check: open the repository at a hard-coded local path and print
# whatever GitResolver.getFiles() returns for one fixed commit SHA.
path = 'D:/github/checkstyle'
repo = gitResolver.GitResolver(path)

# Alternative commit SHA kept for manual experiments:
# print repo.getFiles('76d6365018ec7688c8a8475b2f9aa496fbcfe88c')
print repo.getFiles('eceaa8b65a982db58d31ac901cdd751c435b1362')
示例#9
0
def buildTrainSet(trueTable, falseTable, repoId, repoPath, trueGap, falseGap, trueCount, falseCount):
    trueStart = 1
    falseStart = 1
    textCorpus = open('frcorpus/frtext%d.dat' % repoId, "w")
    codeCorpus = open('frcorpus/frcode%d.dat' % repoId, "w")
    trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount))
    falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount)

    index = 0
    repo = gitResolver.GitResolver(repoPath)
    try:
        while len(trueLinkList) > 0 and len(falseLinkList) > 0:
            print 'true: ', trueStart, ' to ', trueStart + trueCount
            print 'false: ', falseStart, ' to ', falseStart + falseCount
            my_linkList = []
            fr_linkList = []
            for trueLink in trueLinkList:
                commit = repo.getOneCommit(trueLink[1])
                issue = mysqlOperator.selectOneIssue(trueLink[2])
                if issue is None:
                    continue

                my_res = {}
                my_res['type'] = 1
                my_res['commit'] = commit.message.decode('utf-8')
                my_res['issuetitle'] = issue[4].decode('utf-8')
                # issue body
                if issue[5]:
                    my_res['issue'] = issue[5].decode('utf-8')
                    issueCodes = []
                    bodycode = preprocessor.getIssueCode(my_res['issue'])
                    if len(bodycode):
                        issueCodes.append(bodycode)
                    my_res['issuecode'] = issueCodes
                else:
                    my_res['issue'] = ''
                    my_res['issuecode'] = []

                diffs = repo.getOneDiff(commit)
                diffCodes = []
                for diff in diffs:
                    diffCode = preprocessor.processDiffCode(diff.diff)
                    if len(diffCode):
                        diffCodes.append(diffCode)
                my_res['commitcode'] = diffCodes
                my_linkList.append(my_res)

                fr_res = {}
                fr_res['type'] = 1
                fr_res['issueText'] = []
                # issue body
                if issue[5]:
                    fr_res['issueCode'] = frpreprocesser.extractCode(issue[5].decode('utf-8'))
                    fr_res['issueText'].append(frpreprocesser.extractText(issue[5].decode('utf-8')))  # body
                else:
                    fr_res['issueCode'] = []
                fr_res['issueText'].append(frpreprocesser.extractText(issue[4].decode('utf-8')))  # title
                fr_res['commitText'] = []
                fr_res['commitCode'] = []
                fr_res['commitText'].append(frpreprocesser.extractText(commit.message.decode('utf-8')))
                comments = mysqlOperator.selectCommentInOneIssue(trueLink[2])
                for comment in comments:
                    fr_res['issueText'].append(frpreprocesser.extractText(comment[4].decode('utf-8')))
                try:
                    files = repo.getFiles(trueLink[1])
                    for changeFile in files:
                        if not changeFile['path'].endswith('.java'):
                            try:
                                fr_res['commitText'].append(
                                    frpreprocesser.extractText(changeFile['text'].decode('utf-8')))
                            except:
                                print trueLink[1], ':', changeFile['path']
                        else:
                            codes = frpreprocesser.extractCode(changeFile['text'].decode('utf-8'))
                            for code in codes:
                                if code in fr_res['issueCode']:
                                    fr_res['commitCode'].extend(codes)
                                    break
                except:
                    print 'File Fail 1:', trueLink[1]
                fr_linkList.append(fr_res)
                writeToCorpus(textCorpus, codeCorpus, fr_res['commitText'], fr_res['commitCode'])
                writeToCorpus(textCorpus, codeCorpus, fr_res['issueText'], fr_res['issueCode'])

            for falseLink in falseLinkList:
                commit = repo.getOneCommit(falseLink[1])
                issue = mysqlOperator.selectOneIssue(falseLink[2])
                if issue is None:
                    continue

                my_res = {}
                my_res['type'] = 0
                my_res['commit'] = commit.message.decode('utf-8')
                my_res['issuetitle'] = issue[4].decode('utf-8')
                # issue body
                if issue[5]:
                    my_res['issue'] = issue[5].decode('utf-8')
                    issueCodes = []
                    bodycode = preprocessor.getIssueCode(my_res['issue'])
                    if len(bodycode):
                        issueCodes.append(bodycode)
                    my_res['issuecode'] = issueCodes
                else:
                    my_res['issue'] = ''
                    my_res['issuecode'] = []

                diffs = repo.getOneDiff(commit)
                diffCodes = []
                for diff in diffs:
                    diffCode = preprocessor.processDiffCode(diff.diff)
                    if len(diffCode):
                        diffCodes.append(diffCode)
                my_res['commitcode'] = diffCodes
                my_linkList.append(my_res)

                fr_res = {}
                fr_res['type'] = 0
                fr_res['issueText'] = []
                # issue body
                if issue[5]:
                    fr_res['issueCode'] = frpreprocesser.extractCode(issue[5].decode('utf-8'))
                    fr_res['issueText'].append(frpreprocesser.extractText(issue[5].decode('utf-8')))  # body
                else:
                    fr_res['issueCode'] = []
                fr_res['issueText'].append(frpreprocesser.extractText(issue[4].decode('utf-8')))  # title
                fr_res['commitText'] = []
                fr_res['commitCode'] = []
                fr_res['commitText'].append(frpreprocesser.extractText(commit.message.decode('utf-8')))
                comments = mysqlOperator.selectCommentInOneIssue(falseLink[2])
                for comment in comments:
                    fr_res['issueText'].append(frpreprocesser.extractText(comment[4].decode('utf-8')))
                try:
                    files = repo.getFiles(falseLink[1])
                    for changeFile in files:
                        if not changeFile['path'].endswith('.java'):
                            try:
                                fr_res['commitText'].append(
                                    frpreprocesser.extractText(changeFile['text'].decode('utf-8')))
                            except:
                                print trueLink[1], ':', changeFile['path']
                        else:
                            codes = frpreprocesser.extractCode(changeFile['text'].decode('utf-8'))
                            for code in codes:
                                if code in fr_res['issueCode']:
                                    fr_res['commitCode'].extend(codes)
                                    break
                except:
                    print 'File Fail 0:', falseLink[1]
                fr_linkList.append(fr_res)
                writeToCorpus(textCorpus, codeCorpus, fr_res['commitText'], fr_res['commitCode'])
                writeToCorpus(textCorpus, codeCorpus, fr_res['issueText'], fr_res['issueCode'])

            index += 1
            res = json.dumps(my_linkList, encoding="utf-8", indent=4)
            trainSet = open('%s/codetrain%d-%d.dat' % (my_folder, repoId, index), "w")
            trainSet.write(res)
            trainSet.close()
            print '%s/codetrain%d-%d.dat' % (my_folder, repoId, index), 'Over'
            fres = json.dumps(fr_linkList, encoding="utf-8", indent=4)
            ftrainSet = open('%s/traincase%d-%d.dat' % (fr_folder, repoId, index), "w")
            ftrainSet.write(fres)
            ftrainSet.close()
            print '%s/traincase%d-%d.dat' % (fr_folder, repoId, index), 'Over'

            trueStart += trueGap
            falseStart += falseGap
            trueLinkList = linkOperator.selectInScope((trueTable, trueStart, trueStart + trueCount))
            falseLinkList = getRandomFalse(falseTable, falseStart, falseStart + falseGap, falseCount)
    except IOError, e:
        print "***", e
        print traceback.format_exc()
示例#10
0
def buildIssueAndCommit():
    # Build a text corpus and a code corpus for the repositories returned by
    # linkOperator.selectOneRepo(50904245): commit messages, issue titles,
    # issue bodies and comments feed text50904245.dat; diff code and code
    # extracted from issue bodies/comments feed code50904245.dat.
    # NOTE(review): neither a close() for the two files nor a handler for the
    # outer `try` is visible in this excerpt -- confirm against the full file.
    repos = linkOperator.selectOneRepo(50904245)
    # repos = linkOperator.selectRepoOver(5000)
    textCorpus = open('text50904245.dat', "w")
    codeCorpus = open('code50904245.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            try:
                # commit part
                path = getPath(highRepo[1])
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if len(words):
                        # Non-empty list: one text line per commit message.
                        for word in words:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    diffs = gitRe.getOneDiff(commit)
                    for diff in diffs:
                        # Each diff is processed twice; both results, when
                        # non-empty, become separate lines of the code corpus.
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        preDiffCode = preprocessor.processPreDiffCode(
                            diff.diff)
                        if len(diffCode):
                            for code in diffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                        if len(preDiffCode):
                            for code in preDiffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                # issue part
                issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0])
                print highRepo[0], ":", len(issues)
                for issue in issues:
                    titleWords = preprocessor.preprocessToWord(
                        issue[4].decode('utf-8'))
                    if len(titleWords):
                        # Non-empty list: one text line per issue[4] value.
                        for word in titleWords:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    if issue[5]:
                        # processHTML() result is indexed as [0] = code words,
                        # [1] = text words.
                        body = preprocessor.processHTML(
                            issue[5].decode('utf-8'))
                        bodyWords = body[1]
                        codeWords = body[0]
                        if len(bodyWords):
                            # Non-empty list: one text line for the body.
                            for word in bodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(codeWords):
                            # Non-empty list: one code line for the body.
                            for word in codeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                    # Comments are looked up with issue[1] and handled like
                    # the issue body above.
                    comments = mysqlOperator.selectCommentInOneIssue(issue[1])
                    for comment in comments:
                        temp = preprocessor.processHTML(
                            comment[4].decode('utf-8'))
                        cBodyWords = temp[1]
                        cCodeWords = temp[0]
                        if len(cBodyWords):
                            # Non-empty list: one text line per comment.
                            for word in cBodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(cCodeWords):
                            # Non-empty list: one code line per comment.
                            for word in cCodeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                # A failure in one repository is printed with its traceback
                # and the loop continues with the next repository.
                print "***", highRepo[0], ":", e
                print traceback.format_exc()
        print 'end'
示例#11
0
    comments = mysqlOperatorCopy.selectAllCommentInOneRepoDate(cou)
    issueSet = []
    for issue in issues:
        if issue[1] not in issueSet:
            issueSet.append(issue[1])
    for comment in comments:
        if comment[1] not in issueSet:
            issueSet.append(comment[1])
    return issueSet


# Top-level driver: walk every commit of the repositories returned by
# linkOperator.selectRepoOver(5000) and look up the issues tied to it.
# NOTE(review): this excerpt ends inside the innermost `if`, and the handler
# of the `try` is not visible -- confirm against the full file.
projects = linkOperator.selectRepoOver(5000)
print 'start'
for repo in projects:
    try:
        gitRepo = gitResolver.GitResolver(getPath(repo[1]))
        commits = gitRepo.getCommits()
        print '==============', getPath(repo[1]), 'Start'
        for commit in commits:
            commitSha = str(commit.hexsha.encode("utf-8"))
            print commitSha
            commitIssues = mysqlOperatorCopy.selectExistIssueOnCommit(
                (repo[0], commitSha))
            # First column of every returned row is collected as a true link.
            trueLinks = []
            for ci in commitIssues:
                trueLinks.append(ci[0])
            # The commit's datetime string is passed twice in the tuple
            # (presumably as both ends of a date range -- confirm in
            # getIssueInDate).
            issueByDate = getIssueInDate(
                (repo[0], str(gitRepo.getDateTime(commit)),
                 str(gitRepo.getDateTime(commit))))
            for i in issueByDate:
                if len(commitIssues) > 0:
示例#12
0
    """
    dot_val = 0.0
    a_norm = 0.0
    b_norm = 0.0
    for a, b in zip(a_vect, b_vect):
        dot_val += a * b
        a_norm += a ** 2
        b_norm += b ** 2
    if a_norm == 0.0 or b_norm == 0.0:
        return -1
    else:
        return dot_val / ((a_norm * b_norm) ** 0.5)


# Map from a numeric key (presumably a repository id -- confirm) to an opened
# GitResolver; only one hard-coded repository is registered here.
repoMap = {}
repoMap[12983151L] = gitResolver.GitResolver('/home/fdse/data/prior_repository/openhab/openhab1-addons')

# Disabled: populate repoMap from every repository in the database instead.
# repos = mysqlOperator.selectAllHighRepository()
# for repo in repos:
#     print type(repo[0])
    # try:
    #     repoMap[repo[0]] = gitResolver.GitResolver(getPath(repo[1]))
    # except:
    #     repoMap[repo[0]] = None

# TRUE_LINK_TOTAL = linkOperator.count('true_link')
# FALSE_LINK_TOTAL = linkOperator.count('false_link')
# Batch constants. NOTE(review): the names suggest gap/count settings for
# stepping through true and false link tables, but their consumers are not
# visible in this excerpt -- confirm.
TRUE_GAP = 559
FALSE_GAP = 15000
TRUE_COUNT = 559
FALSE_COUNT = 600