Пример #1
0
def buildByList(repoList, corpusName):
    corpus = open('corpus/nocode%s.dat' % corpusName, "w")
    commitCorpus = open('corpus/commit%s.dat' % corpusName, "w")
    issueCorpus = open('corpus/issue%s.dat' % corpusName, "w")
    try:
        print 'start'
        for i in range(len(repoList)):
            repoId = repoList[i]['id']
            repoPath = repoList[i]['path']
            try:
                # commit part
                gitRe = gitResolver.GitResolver(repoPath)
                commits = gitRe.getCommits()
                print repoPath, ":", len(commits)
                for commit in commits:
                    seqs = preprocessor.preprocessNoCamel(
                        commit.message.decode('utf-8'))
                    if len(seqs):
                        # 不是空列表
                        for seq in seqs:
                            for word in seq:
                                corpus.write(word.encode('utf-8'))
                                corpus.write(" ")
                                commitCorpus.write(word.encode('utf-8'))
                                commitCorpus.write(" ")
                            corpus.write("\n")
                            commitCorpus.write("\n")
                # issue part
                issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
                print repoId, ":", len(issues)
                for issue in issues:
                    titleSeqs = preprocessor.preprocessNoCamel(
                        issue[4].decode('utf-8'))
                    if len(titleSeqs):
                        # 不是空列表
                        for titleSeq in titleSeqs:
                            for word in titleSeq:
                                corpus.write(word.encode('utf-8'))
                                corpus.write(" ")
                                issueCorpus.write(word.encode('utf-8'))
                                issueCorpus.write(" ")
                            corpus.write("\n")
                            issueCorpus.write("\n")
                    if issue[5]:
                        body = preprocessor.processHTMLNoCamel(
                            issue[5].decode('utf-8'))
                        if len(body):
                            # 不是空列表
                            for bodySeq in body:
                                for word in bodySeq:
                                    corpus.write(word.encode('utf-8'))
                                    corpus.write(" ")
                                    issueCorpus.write(word.encode('utf-8'))
                                    issueCorpus.write(" ")
                                corpus.write("\n")
                                issueCorpus.write("\n")
            except BaseException, e:
                print "***", repoId, ":", e
                print traceback.format_exc()
        print 'end'
Пример #2
0
def buildIssuePart():
    repos = linkOperator.selectRepoOver(5000)
    textCorpus = open('issueText.dat', "w")
    codeCorpus = open('issueCode.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            try:
                issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0])
                print highRepo[0], ":", len(issues)
                for issue in issues:
                    titleWords = preprocessor.preprocessToWord(
                        issue[4].decode('utf-8'))
                    if len(titleWords):
                        # 不是空列表
                        for word in titleWords:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    if issue[5]:
                        body = preprocessor.processHTML(
                            issue[5].decode('utf-8'))
                        bodyWords = body[1]
                        codeWords = body[0]
                        if len(bodyWords):
                            # 不是空列表
                            for word in bodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(codeWords):
                            # 不是空列表
                            for word in codeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                    comments = mysqlOperator.selectCommentInOneIssue(issue[1])
                    for comment in comments:
                        temp = preprocessor.processHTML(
                            comment[4].decode('utf-8'))
                        cBodyWords = temp[1]
                        cCodeWords = temp[0]
                        if len(cBodyWords):
                            # 不是空列表
                            for word in cBodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(cCodeWords):
                            # 不是空列表
                            for word in cCodeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                print "***", highRepo[0], ":", e
                print traceback.format_exc()
        print 'end'
Пример #3
0
def buildLinks(repoId):
    print 'start'
    try:
        repoPath = nocodeRepoInfo.REPO_MAP[
            nocodeRepoInfo.USE_REPO_INDEX]['path']
        gitRepo = gitResolver.GitResolver(repoPath)
        issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
        commits = gitRepo.getCommits()
        # repoName = re.sub(r'https://github.com/', '', repo[1], 0, re.I)
        print '==============', repoPath, 'Start'
        for commit in commits:
            commitSha = str(commit.hexsha.encode("utf-8"))
            print commitSha
            commitIssues = mysqlOperator.selectExistIssueOnCommit(
                (repoId, commitSha))
            trueLinks = []
            for ci in commitIssues:
                if ci[0] in trueLinks:
                    pass
                else:
                    trueLinks.append(ci[0])
                    linkOperator.insertLink(
                        ('true_link_%d' % repoId, repoId, commitSha, ci[0]))
            for issue in issues:
                if isUnlabeled(issue, gitRepo.getDateTime(commit)):
                    if len(commitIssues) > 0:
                        if issue[1] in trueLinks:
                            pass
                        else:
                            linkOperator.insertLink(
                                ('false_link_%d' % repoId, repoId, commitSha,
                                 issue[1]))
                    else:
                        pass
        print '==============', repoPath, 'End'
    except Exception, e:
        print 'Error:', repoPath
        print traceback.format_exc()
Пример #4
0
def buildIssueAndCommitSeq(repoId, repoPath, corpusName):
    corpus = open('corpus/code%s.dat' % corpusName, "w")
    try:
        print 'start'
        try:
            # commit part
            gitRe = gitResolver.GitResolver(repoPath)
            commits = gitRe.getCommits()
            print repoPath, ":", len(commits)
            for commit in commits:
                diffs = gitRe.getOneDiff(commit)
                for diff in diffs:
                    diffCode = preprocessor.processDiffCode(diff.diff)
                    if len(diffCode):
                        for word in diffCode:
                            corpus.write(word.encode('utf-8'))
                            corpus.write(" ")
                        corpus.write("\n")

            # issue part
            issues = mysqlOperator.selectAllIssueInOneRepo(repoId)
            print repoId, ":", len(issues)
            for issue in issues:
                if issue[5]:
                    bodycode = preprocessor.getIssueCode(
                        issue[5].decode('utf-8'))
                    if len(bodycode):
                        # 不是空列表
                        for word in bodycode:
                            corpus.write(word.encode('utf-8'))
                            corpus.write(" ")
                        corpus.write("\n")
        except BaseException, e:
            print "***", repoId, ":", e
            print traceback.format_exc()
        print 'end'
Пример #5
0
# -*- coding: UTF-8 -*-

from database import linkOperator, mysqlOperator

projects = mysqlOperator.selectAllHighRepository()
print 'start'
for repo in projects:
    issues = mysqlOperator.selectAllIssueInOneRepo(repo[0])
    issueLen = len(issues)
    linkLen = 0
    for issue in issues:
        links = mysqlOperator.selectTrueLinkInOneIssue(issue[1])
        linkLen = linkLen + len(links)
    turple = (repo[0], repo[1], issueLen, linkLen)
    print turple, '\n'
    linkOperator.insertOneRepo(turple)
print 'end'
linkOperator.close()
mysqlOperator.close()
Пример #6
0
def buildIssueAndCommit():
    """Write text and code corpora for the repositories of one fixed query.

    The repositories come from ``linkOperator.selectOneRepo(50904245)``;
    output goes to ``text50904245.dat`` and ``code50904245.dat``.

    Per repository, in this order:

    * commit part - each commit message is tokenised with
      ``preprocessor.preprocessToWord`` and written to the text file; each
      diff is run through ``preprocessor.processDiffCode`` and
      ``preprocessor.processPreDiffCode`` and written to the code file;
    * issue part - field 4 of each issue row goes to the text file; field 5
      (when truthy) and field 4 of every comment are split by
      ``preprocessor.processHTML``, whose index 1 is written to the text file
      and index 0 to the code file.

    Every non-empty word list becomes one line, each word followed by a
    space.  An exception raised while handling one repository is printed
    with its traceback and the loop continues with the next repository.
    """
    repos = linkOperator.selectOneRepo(50904245)
    # repos = linkOperator.selectRepoOver(5000)
    textCorpus = open('text50904245.dat', "w")
    codeCorpus = open('code50904245.dat', "w")
    try:
        print 'start'
        for highRepo in repos:
            try:
                # commit part
                # highRepo[1] is mapped to a local path by getPath (defined
                # outside this block).
                path = getPath(highRepo[1])
                gitRe = gitResolver.GitResolver(path)
                commits = gitRe.getCommits()
                print path, ":", len(commits)
                for commit in commits:
                    words = preprocessor.preprocessToWord(
                        commit.message.decode('utf-8'))
                    if len(words):
                        # not an empty list
                        for word in words:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    diffs = gitRe.getOneDiff(commit)
                    for diff in diffs:
                        diffCode = preprocessor.processDiffCode(diff.diff)
                        preDiffCode = preprocessor.processPreDiffCode(
                            diff.diff)
                        # NOTE(review): unlike the words written elsewhere in
                        # this function, these tokens are written without an
                        # explicit encode('utf-8') - confirm they are already
                        # byte strings.
                        if len(diffCode):
                            for code in diffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                        if len(preDiffCode):
                            for code in preDiffCode:
                                codeCorpus.write(code)
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                # issue part
                issues = mysqlOperator.selectAllIssueInOneRepo(highRepo[0])
                print highRepo[0], ":", len(issues)
                for issue in issues:
                    titleWords = preprocessor.preprocessToWord(
                        issue[4].decode('utf-8'))
                    if len(titleWords):
                        # not an empty list
                        for word in titleWords:
                            textCorpus.write(word.encode('utf-8'))
                            textCorpus.write(" ")
                        textCorpus.write("\n")
                    if issue[5]:
                        # processHTML's result is read as
                        # (code words, text words).
                        body = preprocessor.processHTML(
                            issue[5].decode('utf-8'))
                        bodyWords = body[1]
                        codeWords = body[0]
                        if len(bodyWords):
                            # not an empty list
                            for word in bodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(codeWords):
                            # not an empty list
                            for word in codeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
                    # Comments are looked up by issue[1] and handled like the
                    # issue body.
                    comments = mysqlOperator.selectCommentInOneIssue(issue[1])
                    for comment in comments:
                        temp = preprocessor.processHTML(
                            comment[4].decode('utf-8'))
                        cBodyWords = temp[1]
                        cCodeWords = temp[0]
                        if len(cBodyWords):
                            # not an empty list
                            for word in cBodyWords:
                                textCorpus.write(word.encode('utf-8'))
                                textCorpus.write(" ")
                            textCorpus.write("\n")
                        if len(cCodeWords):
                            # not an empty list
                            for word in cCodeWords:
                                codeCorpus.write(word.encode('utf-8'))
                                codeCorpus.write(" ")
                            codeCorpus.write("\n")
            except BaseException, e:
                # Report the failing repository and continue with the next.
                print "***", highRepo[0], ":", e
                print traceback.format_exc()
        print 'end'