git.setRepository("/home/wolfgang/git-repos/linux-2.6/.git")
git.setRevisionRange("v2.6.14", "v2.6.33")
#git.setRevisionRange("v2.6.23", "v2.6.26")
#git.setRepository("/home/wolfgang/git-repos/perl/.git")
#git.setRevisionRange("8d063cd8450e", "HEAD")
#git.setSubsysDescription(kerninfo.subsysDescrLinux)

git.extractCommitData()

###################################################
print("Shelving the git object")
import shelve
d = shelve.open("/home/wolfgang/linux-14-33")
d["git"] = git
d.close()

#print("The same object after unshelving:")
#k = shelve.open("/tmp/git-shelf")
#git2 = k["git"]
#k.close()

###################################################
#res = createCumulativeSeries(git, "__main__")
#res = createCumulativeSeries(git, "block")
res = createSeries(git, subsys="__main__", revrange=["v2.6.23", "v2.6.26"])
print("Obtained a list with {0} commits".format(len(res)))

for i in range(0, 10):
    print("{0}: {1}, {2}".format(res[i]["commit"].cdate,
                                 res[i]["value"][0],
                                 res[i]["commit"].getCommitMessageLines()))
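# A minimal sketch of the inverse operation: restoring the shelved git
# object in a later session so that the expensive extractCommitData()
# call need not be re-run. The shelf path matches the one written above;
# the helper name is hypothetical.
def load_shelved_git(path="/home/wolfgang/linux-14-33"):
    """Restore a previously shelved git object from disk."""
    shelf = shelve.open(path)
    try:
        return shelf["git"]
    finally:
        shelf.close()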
def doAnalysis(vcs, basedir, revrange=None):
    # TODO: This needs to include the subrange analysis
    # TODO: Use a temporary dir for data storage (unless the R
    # data exchange problem is solved)
    print("Creating raw series")
    res = createSeries(vcs, "__main__", revrange)
    writeToFile(res, "/home/wolfgang/raw.dat")
    duration = getSeriesDuration(res)

    # Emergency stop: If the cycle contains fewer than 200 commits,
    # no meaningful results can be expected.
    if len(res) < 200:
        print("!!! Not enough commits in list, skipping analysis")
        return

    print("Creating cumulative series")
    res = createCumulativeSeries(vcs, "__main__", revrange)
    writeToFile(res, "/home/wolfgang/cum.dat")

    # TODO: How is it possible to exchange the data directly between
    # Python and R? Writing to a file and then re-reading it is wasteful
    # (if all else fails, we could at least use a named pipe).
    runR('raw = as.xts(read.zoo(file="/home/wolfgang/raw.dat", '
         'FUN=tstamp_to_date))')
    raw = RtoPython(runR('raw'))

    # Use the average number of commits per quarter day as the basis
    # for the moving average ...
    secs_per_hour = 60 * 60
    smooth_commits = len(raw) / (duration / (6 * secs_per_hour))
    print("Length: {0}, duration: {1}".format(len(raw), duration))

    # ... but also ensure that we do not get excessively large or
    # small values.
    if smooth_commits < 20:
        smooth_commits = 20
    elif smooth_commits > 350:
        smooth_commits = 350

    print("Using {0} as smoothing factor".format(smooth_commits))

    if len(raw) < smooth_commits:
        print("Pathological case: Excessively short series with {0} commits "
              "detected, giving up.".format(len(raw)))
        return

    runR('reg = to.regts(raw[,1], {0})'.format(smooth_commits))
    runR('cum = as.xts(read.zoo(file="/home/wolfgang/cum.dat", '
         'FUN=tstamp_to_date))')
    reg = RtoPython(runR('reg'))
    cum = RtoPython(runR('cum'))

    # HARDCODED assumptions about the position of the data fields
    # TODO: These should get symbolic R labels. How is this possible?
    diff_sizes = RtoPython(runR('coredata(raw)[,1]'))
    descr_sizes = RtoPython(runR('coredata(raw)[,5]'))

    deltat = int(runR('deltat(reg)')[0])
    tstart = int(runR('start(reg)')[0])
    tend = int(runR('end(reg)')[0])

    timelist_reg = RtoPython(runR('unclass(index(reg))'))
    # Create a simplified time range starting at zero
    timelist_reg_simplified = range(0, tend - tstart + 1, deltat)
    timelist_cum = RtoPython(runR('unclass(index(cum))'))

    # Plot the cumulative and the averaged series.
    # TODO: Use different y axes for the components because they
    # scale vastly differently.
    # TODO: We need to re-initialise the plot object somehow, since
    # in the second run, the histogram of the previous run is
    # plotted here.
    status("Computing Time Series Graphs")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    _setupPythonGraphics(os.path.join(basedir, "timegraph"), "PDF")
    ax.plot(timelist_reg, RtoPython(runR('reg')))
    ax.set_xlabel("Time (TODO: Label with tags)")
    plt.show()
    _closePythonGraphics(os.path.join(basedir, "timegraph"), "PDF")

    _setupPythonGraphics(os.path.join(basedir, "timegraph_cum"), "PDF")
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(timelist_cum, RtoPython(runR('coredata(cum)[,1]')))
    ax.set_xlabel("Time (TODO: Label with tags)")
    plt.show()
    _closePythonGraphics(os.path.join(basedir, "timegraph_cum"), "PDF")

    # Compare the histograms of the commit size and description length
    # distributions.
    # TODO: The plots overlap so that information gets lost. This is
    # obviously bad.
status("Computing Histograms") _setupPythonGraphics(os.path.join(basedir, "histograms"), "PDF") fig = plt.figure() ax = fig.add_subplot(111) ax.hold(True) ax.hist(descr_sizes, 100, range=(0, 100), normed=True) ax.hist(diff_sizes, 100, range=(0, 100), normed=True, alpha=0.5) ax.set_xlabel("Commit/Diff size") ax.set_ylabel("Probability") ax.grid(True) ax.hold(False) plt.show() _closePythonGraphics(os.path.join(basedir, "histograms"), "PDF") # Let's look at some correlations: Between different diff approaches, # and the correlation between diff size and status("Computing Correlations") computeDiffsizeCommitlengthCorrelation("raw", filename=os.path.join( basedir, "diff_commit_corr"), backend="PDF") computeDifftypeCorrelation("raw", filename=os.path.join(basedir, "difftype_corr"), backend="PDF") # Determine the density. TODO: Find the best bandwidth. status("Computing Density") computeDensity("reg", bandwidth=10, filename=os.path.join(basedir, "density"), backend="PDF") # We could also use reg, but coredata gives more regular labels status("Computing Spectrum") computeSpectrum("coredata(reg)", filename=os.path.join(basedir, "spectrum"), backend="PDF") status("Computing ECDF") computeECDF("reg", filename=os.path.join(basedir, "ecdf"), backend="PDF") # Generate the recurrence diagram for a series # NOTE: When the number of considered data points exceeds a # certain threshold, we don't do the plot because it's # computationally too expensive if len(reg) < 5000: # We use PNG for plotting here because the PDF gets huge. # (we could also just pass reg, but extracting the coredata gives # "nicer" labels") status("Computing Recurrence Diagram") computeRecurrenceDiagram("coredata(reg)[,1]", filename=os.path.join(basedir, "recurrence"), backend="PNG") else: status("Skipping recurrence diagram: Too many data points")
status("Computing Histograms") _setupPythonGraphics(os.path.join(basedir, "histograms"), "PDF") fig = plt.figure() ax = fig.add_subplot(111) ax.hold(True) ax.hist(descr_sizes,100,range=(0,100),normed=True) ax.hist(diff_sizes,100,range=(0,100),normed=True,alpha=0.5) ax.set_xlabel("Commit/Diff size") ax.set_ylabel("Probability") ax.grid(True) ax.hold(False) plt.show() _closePythonGraphics(os.path.join(basedir, "histograms"), "PDF") # Let's look at some correlations: Between different diff approaches, # and the correlation between diff size and status("Computing Correlations") computeDiffsizeCommitlengthCorrelation("raw", filename=os.path.join(basedir, "diff_commit_corr"), backend="PDF") computeDifftypeCorrelation("raw", filename=os.path.join(basedir, "difftype_corr"), backend="PDF") # Determine the density. TODO: Find the best bandwidth. status("Computing Density") computeDensity("reg", bandwidth=10, filename=os.path.join(basedir, "density"), backend="PDF") # We could also use reg, but coredata gives more regular labels status("Computing Spectrum") computeSpectrum("coredata(reg)", filename=os.path.join(basedir, "spectrum"), backend="PDF") status("Computing ECDF") computeECDF("reg", filename=os.path.join(basedir, "ecdf"), backend="PDF") # Generate the recurrence diagram for a series # NOTE: When the number of considered data points exceeds a # certain threshold, we don't do the plot because it's # computationally too expensive if len(reg) < 5000: # We use PNG for plotting here because the PDF gets huge. # (we could also just pass reg, but extracting the coredata gives # "nicer" labels") status("Computing Recurrence Diagram") computeRecurrenceDiagram("coredata(reg)[,1]", filename=os.path.join(basedir, "recurrence"), backend="PNG") else: status("Skipping recurrence diagram: Too many data points")