class DatabaseGenerator(Experiment):
    """Tiny experiment that exercises the TableDict and Table output types,
    in particular the REPLACE conflict strategy on the (version, method) key."""

    inputs = {'mode': Integer(0)}
    outputs = {
        'keydict': TableDict(),
        "table2": Table([("version", "integer"),
                         ("method", "text"),
                         ("result", "text")],
                        keys=["version", "method"],
                        conflict_strategy="REPLACE"),
    }

    def run(self):
        """Fill both output tables; row count and content depend on mode."""
        # mode.value + 3 entries end up in the key/value dict.
        for idx in range(self.mode.value + 3):
            self.keydict[str(idx)] = "barfoo " + str(idx)

        if self.mode.value == 0:
            # Duplicate (version, method) keys trigger the REPLACE strategy.
            self.table2.insert(version=1, method="GET", result="404")   # 1
            self.table2.insert(version=1, method="GET", result="404")   # 2: replace 1
            self.table2.insert(version=1, method="POST", result="200")  # 3
        else:
            self.table2.insert(version=1, method="GET", result="200")   # 4: replace 2
            self.table2.insert(version=1, method="GET", result="300")   # 5: replace 4
            self.table2.insert(version=1, method="DROP", result="404")  # 6
            self.table2.insert(version=1, method="POST", result="200")  # 7: replace 3
class HistoricalCompilationGlobalEvaluation(Experiment):
    """Evaluate, over a project's git history, how often *global* hashes of
    program elements (functions, records, globals) change between each commit
    and its parent, based on data produced by a previous HistoricalCompilation
    data-collection run found in `dataset`."""

    inputs = {
        "clang_hash": GitArchive("/home/cip/2015/yb90ifym/clang-hash/"),
        "project": GitArchive("/home/cip/2015/yb90ifym/clang-hash/hash-projects/lua"),
        "commits": Integer(4744),
        "jobs": Integer(1),  # was 4
        "dataset": Directory("/home/cip/2015/yb90ifym/clang-hash/experiments/HistoricalCompilation-4e7c977077afea3d2ad77aeefe3b472c"),  # full lua
        "hot_threshold_percentage": Integer(10),  # minimal change percentage for commit to be classified as "hot"
    }
    outputs = {
        "stats": File("summary.dict"),
        "eval_data": File("eval.txt"),
        "hot_commits_histo": File("global_hot_commits.pdf"),
    }

    def project_name(self):
        """Return the project's name, derived from its clone URL."""
        return os.path.basename(self.metadata['project-clone-url'])

    def run(self):
        # Project name
        logging.info("Cloning project... %s", self.project_name())
        self.build_info = {"project-name": self.project_name(),
                           "commit-hash": self.metadata["project-hash"],
                           'builds': []}
        with self.project as src_path:
            time = 0
            os.chdir(self.dataset.path)
            # Read summary file from data collection run
            commits = None
            # NOTE(review): eval() on locally produced data files — trusted
            # input only; do not point `dataset` at untrusted data.
            with open("summary.dict") as sf:
                summary = eval(sf.read())
                commits = summary['builds']

            def read_chash_data(commit):
                """Load the recorded element hashes for one commit; returns
                an empty list if the data file is missing/unreadable."""
                element_hashes = []
                try:
                    with open(commit, 'r') as cf:
                        commit_data = eval(cf.read())
                        for ofile_data in commit_data:
                            element_hashes.extend(ofile_data['element-hashes'])
                except:
                    pass
                return element_hashes

            stats = {
                'data-empty': set(),  # commits with empty info files, e.g. failed to be collected, (first n commits -> missing makefile o.a.)
                'commits': {},
                'elements': {},  # symbol -> how often did this symbol change
            }

            total_changed_globals = 0  # How often was any global changed throughout the history?
            total_changed_records = 0  # How often was any record changed throughout the history?
            total_changed_static_funcs = 0  # How often was any static function changed throughout the history?
            total_changed_functions = 0  # without static functions
            total_insdel_globals = 0  # How often was any global introduced/removed throughout the history?
            total_insdel_records = 0  # How often was any record introduced/removed throughout the history?
            total_insdel_static_funcs = 0  # How often was any static function introduced/removed throughout the history?
            total_insdel_functions = 0  # without static functions

            # in-degree: how many SLOs depend on E?
            # out-degree: how many SLOs does E depend on?
            in_degrees = {}  # indegree -> nr of elements with that indegree
            out_degrees = {}  # outdegree -> nr of elements wirh that outdegree
            max_in_degree = (None, 0)  # (element, degree)
            max_out_degree = (None, 0)  # (element, degree)

            # State carried from the previous iteration so a linear history
            # can reuse the parent's data instead of re-reading it.
            prev_commit = None
            prev_hashes = None
            prev_used_definitions = None
            prev_global_hashes = None
            counter = 1
            for info in commits:
                print "\n%d/%d" % (counter, len(commits))
                counter += 1
                commit = info['commit']
                parent = info['parent']
                if not parent:  # first commit has no parent
                    print "No parent"
                    continue

                commit_data = read_chash_data(commit)
                if not commit_data:
                    # If the data does not exist, note and skip
                    #print "Data empty"
                    stats['data-empty'].add(commit)
                    continue

                local_hashes = {}
                used_definitions = {}
                # just 4 testing:
                # Function names are stripped of their 'function:'/'static
                # function:' prefix; only function->function uses are kept.
                for element in commit_data:
                    name = element[0]
                    if name.startswith('static function:') or name.startswith('function:'):
                        name = element[0].split(':')[1]
                    local_hashes[name] = element[1]
                    try:
                        used_definitions[name] = set()
                        for used_def in element[2]:
                            if used_def.startswith('static function:') or used_def.startswith('function:'):
                                used_definitions[name].add(used_def.split(':')[1])
                    except:
                        pass
                # prev:
                #for element in commit_data:
                #    local_hashes[element[0]] = element[1]
                #    try:
                #        used_definitions[element[0]] = element[2]
                #    except:
                #        pass

                parent_hashes = {}
                parent_global_hashes = {}
                parent_used_definitions = {}
                if parent == prev_commit and prev_global_hashes and prev_used_definitions and prev_hashes:
                    #print "Reuse prev_commit"
                    parent_hashes = prev_hashes
                    parent_used_definitions = prev_used_definitions
                    parent_global_hashes = prev_global_hashes
                else:
                    #print "Cannot reuse prev_commit"
                    parent_data = read_chash_data(parent)
                    # just 4 testing:
                    for element in parent_data:
                        name = element[0]
                        if name.startswith('static function:') or name.startswith('function:'):
                            name = element[0].split(':')[1]
                        parent_hashes[name] = element[1]
                        try:
                            parent_used_definitions[name] = set()
                            for used_def in element[2]:
                                if used_def.startswith('static function:') or used_def.startswith('function:'):
                                    parent_used_definitions[name].add(used_def.split(':')[1])
                        except:
                            pass
                    # prev:
                    #for element in parent_data:
                    #    parent_hashes[element[0]] = element[1]
                    #    try:
                    #        parent_used_definitions[element[0]] = element[2]
                    #    except:
                    #        pass

                if not parent_hashes:
                    # If the data does not exist, note and skip
                    stats['data-empty'].add(commit)
                    # Save data for reuse
                    prev_commit = commit
                    prev_hashes = local_hashes
                    prev_used_definitions = used_definitions
                    continue

                ##########################
                # GLOBAL HASH EVALUATION #
                ##########################

                commit_stats = {
                    'element-count' : len(local_hashes),
                    'changed-elements' : [],
                }

                elements = set(local_hashes.keys())
                parent_elements = set(parent_hashes.keys())

                # calculate in- and out-degree
                # reverse used_definitions
                out_use_defs = { s:0 for s in used_definitions.keys() }  # element -> nr of depending elements
                for element in elements:
                    for el in used_definitions[element]:
                        try:
                            out_use_defs[el] += 1
                        except:
                            pass

                for element in elements:
                    out_degree = len(used_definitions[element])
                    in_degree = out_use_defs[element]
                    if in_degree > max_in_degree[1]:
                        max_in_degree = (element, in_degree)
                    if out_degree > max_out_degree[1]:
                        max_out_degree = (element, out_degree)
                    if in_degree not in in_degrees:
                        in_degrees[in_degree] = 0
                    in_degrees[in_degree] += 1
                    if out_degree not in out_degrees:
                        out_degrees[out_degree] = 0
                    out_degrees[out_degree] += 1

                commit_stats['changed-elements'] = elements ^ parent_elements  # elements either added or removed
                for element in commit_stats['changed-elements']:
                    if element.startswith('record:'):  # do this here to get only insertions and deletions
                        total_insdel_records += 1
                    elif element.startswith('variable:') or element.startswith('static variable:'):
                        total_insdel_globals += 1
                    elif element.startswith('static function:'):
                        total_insdel_static_funcs += 1
                    else:
                        total_insdel_functions += 1

                # Compare hashes
                common_elements = elements & parent_elements
                global_hashes = {}
                for element in common_elements:
                    global_hash = get_global_hash(element, global_hashes, local_hashes, used_definitions)
                    parent_global_hash = get_global_hash(element, parent_global_hashes, parent_hashes, parent_used_definitions)
                    if global_hash != parent_global_hash:
                        commit_stats['changed-elements'].add(element)
                        if element.startswith('record:'):  # do this here to ignore insertions and deletions
                            total_changed_records += 1
                        elif element.startswith('variable:') or element.startswith('static variable:'):
                            total_changed_globals += 1
                        elif element.startswith('static function:'):
                            total_changed_static_funcs += 1
                        else:
                            total_changed_functions += 1

                commit_stats['changed-element-count'] = len(commit_stats['changed-elements']);
                stats['commits'][commit] = commit_stats

                # Count how often each element was changed over the whole history
                for element in commit_stats['changed-elements']:
                    if element not in stats['elements']:
                        stats['elements'][element] = 0;
                    stats['elements'][element] += 1

                # Save data for reuse
                prev_commit = commit
                prev_hashes = local_hashes
                prev_used_definitions = used_definitions
                prev_global_hashes = global_hashes

            self.build_info['stats'] = stats

            #in_degrees = {}  # indegree -> nr of elements with that indegree
            #out_degrees = {}  # outdegree -> nr of elements wirh that outdegree
            #max_in_degree = (None, 0)  # (element, degree)
            #max_out_degree = (None, 0)  # (element, degree)
            # NOTE(review): raises ZeroDivisionError when no commit data was
            # processed (in_degrees/out_degrees stay empty) — TODO confirm
            # that can't happen with a valid dataset.
            summed_in_degrees = sum([k*v for k,v in in_degrees.iteritems()])
            nr_of_elements = sum(in_degrees.values())
            avg_in_degree = summed_in_degrees/float(nr_of_elements)
            avg_out_degree = sum([k*v for k,v in out_degrees.iteritems()])/float(sum(out_degrees.values()))

            eval_info = {
                'nr-of-commits' : len(commits),
                'change-percentage' : {},  # change percentage -> nr of commits with change < percentage
                'hot-commits': {},
                'total-changed-globals': total_changed_globals,
                'total-changed-records': total_changed_records,
                'total-changed-static-funcs': total_changed_static_funcs,
                'total-changed-functions': total_changed_functions,
                'total-insdel-globals': total_insdel_globals,
                'total-insdel-records': total_insdel_records,
                'total-insdel-static-funcs': total_insdel_static_funcs,
                'total-insdel-functions': total_insdel_functions,
                'max_in_degree': max_in_degree,
                'max_out_degree': max_out_degree,
                'avg_in_degree': avg_in_degree,
                'avg_out_degree': avg_out_degree,
            }
            # Get most changed elements
            eval_info['most-changed-elements'] = {k:v for k,v in stats['elements'].iteritems() if v > 1000}  # arbitrary value (about 20% of commits)

            # Calc average nr and percentage of (changed) symbols per commit
            summed_avg_change_percentage = 0
            summed_changed_elements = 0
            summed_total_elements = 0
            commits = self.build_info['stats']['commits']
            for commit in commits:
                commit_stat = commits[commit]
                change_percentage = len(commit_stat['changed-elements'])/float(commit_stat['element-count'])
                summed_avg_change_percentage += change_percentage
                summed_changed_elements += len(commit_stat['changed-elements'])
                summed_total_elements += commit_stat['element-count']
                percentage = int(round(change_percentage * 100))
                if percentage not in eval_info['change-percentage']:
                    eval_info['change-percentage'][percentage] = 0
                eval_info['change-percentage'][percentage] += 1
                # Identify hot commits
                #if percentage > self.hot_threshold_percentage.value:
                    #eval_info['hot-commits'][commit] = percentage

            # NOTE(review): Python-2 integer division for the two averages
            # below (both operands are ints) — presumably intended.
            eval_info['avg-change-percentage'] = summed_avg_change_percentage / float(len(stats['commits']))
            eval_info['avg-changed-elements'] = summed_changed_elements / eval_info['nr-of-commits']
            eval_info['avg-total-elements'] = summed_total_elements / eval_info['nr-of-commits']
            eval_info['nr-hot-commits'] = len(eval_info['hot-commits'])

            with open(self.eval_data.path, "w+") as fd:
                fd.write(repr(eval_info))

            # Output the summary of this build into the statistics file.
            with open(self.stats.path, "w+") as fd:
                fd.write(repr(self.build_info))

            def plot_hash_count_histogram(hash_values, filename):
                """Bar-plot percentage-of-changed-elements vs commit count."""
                dictionary = plt.figure()
                fig, ax = plt.subplots()
                plt.xlabel('Prozentanteil geaenderter Elemente')
                plt.ylabel('Anzahl von Commits')
                axes = plt.gca()
                axes.set_xlim([-10,100])
                axes.set_ylim([0,1600])
                ax.bar(hash_values.keys(), hash_values.values(), align='center')
                fig.savefig(filename)

            # clean data for plotting
            data = {k:v for k,v in eval_info['change-percentage'].iteritems() if k <= 100}
            plot_hash_count_histogram(data, self.hot_commits_histo.path)

    def variant_name(self):
        """Return '<project>-<mode>' for this experiment variant."""
        return "%s-%s"%(self.project_name(), self.metadata['mode'])

    def symlink_name(self):
        """Return the symlink name '<title>-<variant>'."""
        return "%s-%s"%(self.title, self.variant_name())
class HistoricalCompilationCallGraphEvaluation(Experiment): inputs = { "clang_hash": GitArchive("/home/cip/2015/yb90ifym/clang-hash/"), "project": GitArchive("/home/cip/2015/yb90ifym/clang-hash/hash-projects/lua"), "commits": Integer(4744), "jobs": Integer(1), # was 4 "dataset": Directory( "/home/cip/2015/yb90ifym/clang-hash/experiments/HistoricalCompilation-4e7c977077afea3d2ad77aeefe3b472c" ), # full lua "hot_threshold_percentage": Integer( 10 ), # minimal change percentage for commit to be classified as "hot" } outputs = { "stats": File("summary.dict"), "eval_data": File("eval.txt"), "hot_commits_histo": File("cg_hot_commits.pdf"), } def project_name(self): return os.path.basename(self.metadata['project-clone-url']) def run(self): # Project name logging.info("Cloning project... %s", self.project_name()) self.build_info = { "project-name": self.project_name(), "commit-hash": self.metadata["project-hash"], 'builds': [] } with self.project as src_path: time = 0 os.chdir(self.dataset.path) # Read summary file from data collection run commits = None with open("summary.dict") as sf: summary = eval(sf.read()) commits = summary['builds'] def read_chash_data(commit): element_hashes = [] try: with open(commit, 'r') as cf: commit_data = eval(cf.read()) for ofile_data in commit_data: element_hashes.extend(ofile_data['element-hashes']) except: pass return element_hashes stats = { 'data-empty': set( ), # commits with empty info files, e.g. failed to be collected, (first n commits -> missing makefile o.a.) 'commits': {}, 'elements': {}, # symbol -> how often did this symbol change } total_changed_functions = 0 # How often was any function changed throughout the history? total_insdel_functions = 0 # How often was any function introduced/removed throughout the history? 
prev_commit = None prev_functions = None prev_used_definitions = None counter = 1 for info in commits: print "%d/%d" % (counter, len(commits)) counter += 1 commit = info['commit'] parent = info['parent'] if not parent: # first commit has no parent print "No parent" continue commit_data = read_chash_data(commit) if not commit_data: # If the data does not exist, note and skip #print "Data empty" stats['data-empty'].add(commit) continue functions = set() used_definitions = {} for element in commit_data: if element[0].startswith('static function:') or element[ 0].startswith('function:'): clean_name = element[0].split(':')[1] functions.add(clean_name) used_definitions[clean_name] = set() for used_def in element[2]: if used_def.startswith( 'static function:') or used_def.startswith( 'function:'): used_definitions[clean_name].add( used_def.split(':')[1]) parent_functions = {} parent_used_definitions = {} if parent == prev_commit and prev_functions and prev_used_definitions: #print "Reuse prev_commit" parent_functions = prev_functions parent_used_definitions = prev_used_definitions else: #print "Cannot reuse prev_commit" parent_data = read_chash_data(parent) for element in parent_data: if element[0].startswith( 'static function:') or element[0].startswith( 'function:'): clean_name = element[0].split(':')[1] parent_functions.insert(clean_name) parent_used_definitions[clean_name] = set() for used_def in element[2]: if used_def.startswith( 'static function:' ) or used_def.startswith('function:'): parent_used_definitions[clean_name].add( used_def.split(':')[1]) if not parent_functions: # If the data does not exist, note and skip stats['data-empty'].add(commit) # Save data for reuse prev_commit = commit prev_functions = functions prev_used_definitions = used_definitions continue ######################### # CALL GRAPH EVALUATION # ######################### commit_stats = { 'element-count': len(functions), 'changed-elements': [], # contains changed + impacted functions 
#'changed-not-impacted': set(), # contains directly changed functions only } elements = functions parent_elements = parent_functions commit_stats['changed-elements'] = set( ) #elements ^ parent_elements # elements either added or removed total_insdel_functions += len(commit_stats['changed-elements']) cwd = os.getcwd() os.chdir(src_path) changed_functions = get_changed_functions_from_commit( src_path, commit) os.chdir(cwd) commit_stats['changed-not-impacted'] = changed_functions.copy() # Get impacted functions changed_functions |= get_impacted_funcs_fake_hash( changed_functions, used_definitions) commit_stats['changed-elements'] |= changed_functions total_changed_functions += len(changed_functions) commit_stats['changed-element-count'] = len( commit_stats['changed-elements']) stats['commits'][commit] = commit_stats # Count how often each element was changed over the whole history for element in commit_stats['changed-elements']: if element not in stats['elements']: stats['elements'][element] = 0 stats['elements'][element] += 1 # Save data for reuse prev_commit = commit prev_functions = functions prev_used_definitions = used_definitions self.build_info['stats'] = stats eval_info = { 'nr-of-commits': len(commits), 'change-percentage': {}, # change percentage -> nr of commits with change < percentage 'hot-commits': {}, 'total-changed-functions': total_changed_functions, 'total-insdel-functions': total_insdel_functions, } # Get most changed elements eval_info['most-changed-elements'] = { k: v for k, v in stats['elements'].iteritems() if v > 400 } # arbitrary value (about 10% of commits) # Calc average nr and percentage of (changed) symbols per commit summed_avg_change_percentage = 0 summed_changed_elements = 0 summed_total_elements = 0 commits = self.build_info['stats']['commits'] for commit in commits: commit_stat = commits[commit] change_percentage = len(commit_stat['changed-elements']) / float( commit_stat['element-count']) summed_avg_change_percentage += 
change_percentage summed_changed_elements += len(commit_stat['changed-elements']) summed_total_elements += commit_stat['element-count'] percentage = int(round(change_percentage * 100)) if percentage not in eval_info['change-percentage']: eval_info['change-percentage'][percentage] = 0 eval_info['change-percentage'][percentage] += 1 # Identify hot commits #if percentage > self.hot_threshold_percentage.value: #eval_info['hot-commits'][commit] = percentage eval_info[ 'avg-change-percentage'] = summed_avg_change_percentage / float( len(stats['commits'])) eval_info[ 'avg-changed-elements'] = summed_changed_elements / eval_info[ 'nr-of-commits'] eval_info['avg-total-elements'] = summed_total_elements / eval_info[ 'nr-of-commits'] eval_info['nr-hot-commits'] = len(eval_info['hot-commits']) with open(self.eval_data.path, "w+") as fd: fd.write(repr(eval_info)) # Output the summary of this build into the statistics file. with open(self.stats.path, "w+") as fd: fd.write(repr(self.build_info)) def plot_hash_count_histogram(hash_values, filename): dictionary = plt.figure() fig, ax = plt.subplots() plt.xlabel('Prozentanteil geaenderter Elemente') plt.ylabel('Anzahl von Commits') axes = plt.gca() axes.set_xlim([-10, 100]) axes.set_ylim([0, 1600]) ax.bar(hash_values.keys(), hash_values.values(), align='center') fig.savefig(filename) # clean data for plotting data = { k: v for k, v in eval_info['change-percentage'].iteritems() if k <= 100 } plot_hash_count_histogram(data, self.hot_commits_histo.path) def variant_name(self): return "%s-%s" % (self.project_name(), self.metadata['mode']) def symlink_name(self): return "%s-%s" % (self.title, self.variant_name())
class HistoricalCompilationEvaluation(Experiment):
    """Evaluate, over a project's git history, how often *local* element
    hashes change between each commit and its parent, and compare against the
    git-diff-based approximation of changed functions."""

    inputs = {
        "clang_hash": GitArchive("/home/cip/2015/yb90ifym/clang-hash/"),
        "project": GitArchive("/home/cip/2015/yb90ifym/clang-hash/hash-projects/lua"),
        "commits": Integer(4744),
        "jobs": Integer(1),  # was 4
        "dataset": Directory("/home/cip/2015/yb90ifym/clang-hash/experiments/HistoricalCompilation-4e7c977077afea3d2ad77aeefe3b472c"),  # full lua
        "hot_threshold_percentage": Integer(50),  # minimal change percentage for commit to be classified as "hot"
    }
    outputs = {
        "stats": File("summary.dict"),
        "eval_data": File("eval.txt"),
        "hot_commits_histo": File("local_hot_commits.pdf"),
        "compare_approx_elem": File("local_compare_approx_elem.pdf"),
    }

    def project_name(self):
        """Return the project's name, derived from its clone URL."""
        return os.path.basename(self.metadata['project-clone-url'])

    def run(self):
        # Project name
        logging.info("Cloning project... %s", self.project_name())
        self.build_info = {"project-name": self.project_name(),
                           "commit-hash": self.metadata["project-hash"],
                           'builds': []}
        with self.project as src_path:
            time = 0
            os.chdir(self.dataset.path)
            # Read summary file from data collection run
            # NOTE(review): eval() on locally produced data files — trusted
            # input only.
            commits = None
            with open("summary.dict") as sf:
                summary = eval(sf.read())
                commits = summary['builds']

            def read_chash_data(commit):
                """Load recorded element hashes for one commit; empty list if
                the data file is missing or unreadable."""
                element_hashes = []
                try:
                    with open(commit, 'r') as cf:
                        commit_data = eval(cf.read())
                        for ofile_data in commit_data:
                            element_hashes.extend(ofile_data['element-hashes'])
                except:
                    pass
                return element_hashes

            stats = {
                'data-empty': set(),  # commits with empty info files, e.g. failed to be collected, (first n commits -> missing makefile o.a.)
                'commits': {},
                'elements': {},  # symbol -> how often did this symbol change
            }
            total_changed_globals = 0  # How often was any global changed/introduced throughout the history?
            total_changed_records = 0  # How often was any record changed/introduced throughout the history?
            total_changed_static_funcs = 0  # How often was any static function changed/introduced throughout the history?
            total_changed_functions = 0  # How often was any function changed/introduced throughout the history? (incl. static)

            # State carried over so a linear history reuses the parent's data.
            prev_commit = None
            prev_hashes = None
            counter = 1
            for info in commits:
                print "%d/%d" % (counter, len(commits))
                counter += 1
                commit = info['commit']
                parent = info['parent']
                if not parent:  # first commit has no parent
                    print "No parent"
                    continue

                commit_data = read_chash_data(commit)
                if not commit_data:
                    # If the data does not exist, note and skip
                    #print "Data empty"
                    stats['data-empty'].add(commit)
                    continue

                local_hashes = {}
                for element in commit_data:
                    local_hashes[element[0]] = element[1]

                parent_hashes = {}
                if parent == prev_commit:
                    #print "Reuse prev_commit"
                    parent_hashes = prev_hashes
                else:
                    #print "Cannot reuse prev_commit"
                    parent_data = read_chash_data(parent)
                    for element in parent_data:
                        parent_hashes[element[0]] = element[1]

                if not parent_hashes:
                    # If the data does not exist, note and skip
                    stats['data-empty'].add(commit)
                    # Save data for reuse
                    prev_commit = commit
                    prev_hashes = local_hashes
                    continue

                #########################
                # LOCAL HASH EVALUATION #
                #########################

                commit_stats = {
                    'element-count': len(local_hashes),
                    'changed-elements': [],
                    'changed-functions-approx': [],
                }

                # Get data from approximation
                cwd = os.getcwd()
                os.chdir(src_path)
                commit_stats['changed-functions-approx'] = get_changed_functions_from_commit(src_path, commit)
                os.chdir(cwd)

                elements = set(local_hashes.keys())
                parent_elements = set(parent_hashes.keys())

                commit_stats['changed-elements'] = set()  #TODO here elements ^ parent_elements # elements either added or removed: if this is initialized with the insdel items, causes weird data to show um in result. should perhaps include it and add explanation

                # Compare hashes
                common_elements = elements & parent_elements
                for element in common_elements:
                    if local_hashes[element] != parent_hashes[element]:
                        commit_stats['changed-elements'].add(element)
                        if element.startswith('record:'):  # do this here to ignore insertions and deletions
                            total_changed_records += 1
                        elif element.startswith('variable:') or element.startswith('static variable:'):
                            total_changed_globals += 1
                        elif element.startswith('static function:'):
                            # static functions count twice: once in their own
                            # bucket and once in the overall function counter.
                            total_changed_static_funcs += 1
                            total_changed_functions += 1
                        else:
                            total_changed_functions += 1

                commit_stats['changed-element-count'] = len(commit_stats['changed-elements'])
                stats['commits'][commit] = commit_stats

                # Count how often each element was changed over the whole history
                for element in commit_stats['changed-elements']:
                    if element not in stats['elements']:
                        stats['elements'][element] = 0
                    stats['elements'][element] += 1

                # Save data for reuse
                prev_commit = commit
                prev_hashes = local_hashes

            self.build_info['stats'] = stats

            eval_info = {
                'nr-of-commits': len(commits),
                'change-percentage': {},  # change percentage -> nr of commits with change < percentage
                'hot-commits': {},
                'total-changed-globals': total_changed_globals,
                'total-changed-records': total_changed_records,
                'total-changed-static-funcs': total_changed_static_funcs,
                'total-changed-functions': total_changed_functions,
                'total-changed-elements': total_changed_functions + total_changed_records + total_changed_globals,
            }
            # Get most changed elements
            eval_info['most-changed-elements'] = {k: v for k, v in stats['elements'].iteritems() if v > self.commits.value / 10}  # arbitrary value (about 10% of commits)

            # Calc average nr and percentage of (changed) symbols per commit
            summed_avg_change_percentage = 0
            summed_changed_elements = 0
            summed_total_elements = 0
            commits = self.build_info['stats']['commits']
            for commit in commits:
                commit_stat = commits[commit]
                change_percentage = len(commit_stat['changed-elements']) / float(commit_stat['element-count'])
                summed_avg_change_percentage += change_percentage
                summed_changed_elements += len(commit_stat['changed-elements'])
                summed_total_elements += commit_stat['element-count']
                percentage = int(round(change_percentage * 100))
                if percentage not in eval_info['change-percentage']:
                    eval_info['change-percentage'][percentage] = 0
                eval_info['change-percentage'][percentage] += 1
                # Identify hot commits
                if percentage > self.hot_threshold_percentage.value:
                    eval_info['hot-commits'][commit] = (percentage,
                                                        len(commit_stat['changed-elements']),
                                                        commit_stat['element-count'])

            # NOTE(review): Python-2 integer division for the two averages
            # below (both operands are ints) — presumably intended.
            eval_info['avg-change-percentage'] = summed_avg_change_percentage / float(len(stats['commits']))
            eval_info['avg-changed-elements'] = summed_changed_elements / eval_info['nr-of-commits']
            eval_info['avg-total-elements'] = summed_total_elements / eval_info['nr-of-commits']
            eval_info['nr-hot-commits'] = len(eval_info['hot-commits'])

            with open(self.eval_data.path, "w+") as fd:
                fd.write(repr(eval_info))

            # Output the summary of this build into the statistics file.
            with open(self.stats.path, "w+") as fd:
                fd.write(repr(self.build_info))

            # Disabled plotting code kept for reference (dead string literal).
            '''
            def plot_hash_count_histogram(hash_values, filename):
                dictionary = plt.figure()
                fig, ax = plt.subplots()
                plt.xlabel('Prozentanteil geaenderter Elemente')
                plt.ylabel('Anzahl von Commits')
                ax.bar(hash_values.keys(), hash_values.values(), align='center')
                fig.savefig(filename)

            # clean data for plotting
            data = {k:v for k,v in eval_info['change-percentage'].iteritems() if k <= 100}
            plot_hash_count_histogram(data, self.hot_commits_histo.path)

            changed_funcs_approx_list = []
            changed_elements_list = []
            for commit in commits:
                commit_stat = commits[commit]
                changed_functions_approx = commit_stat['changed-functions-approx']
                changed_elements = commit_stat['changed-elements']
                changed_funcs_approx_list.append(len(changed_functions_approx))
                changed_elements_list.append(len(changed_elements))

            #TODO plot changed elements vs approx. changed functions
            # and also changed functions vs approx changed functions
            fig, ax = plt.subplots()
            ax.plot(changed_elements_list, label='Geaenderte Elemente (lokal)')
            ax.plot(changed_funcs_approx_list, 'm', label='Geaenderte Funktionen (Approx)')
            lgd = ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))  # legend on the right
            plt.xlabel('Commits')
            plt.ylabel('Anzahl')
            fig.savefig(self.compare_approx_elem.path, bbox_extra_artists=(lgd,), bbox_inches='tight')
            '''

    def variant_name(self):
        """Return '<project>-<mode>' for this experiment variant."""
        return "%s-%s" % (self.project_name(), self.metadata['mode'])

    def symlink_name(self):
        """Return the symlink name '<title>-<variant>'."""
        return "%s-%s" % (self.title, self.variant_name())
class TimingInternal(Experiment, ClangHashHelper):
    """Measure per-phase compiler timings by building the project once through
    the gcc-time wrapper and averaging the reported phase durations."""

    inputs = {
        "clang_hash": GitArchive("/home/stettberger/w/clang-hash/"),
        "project": GitArchive("/home/stettberger/w/clang-hash/hash-projects/lua",
                              shallow=True),
        "cflags": String(""),
        "jobs": Integer(4),
        "mode": String("normal"),  # Unchangable
    }
    outputs = {
        "stats": File("summary.dict"),
        'tex': DatarefDict('data.dref'),
    }

    def save(self, path, value):
        """Store `value` under the dataref key '/'-joined from `path` and log it."""
        self.tex['/'.join(path)] = value
        logging.info("%s = %s", '/'.join(path), value)

    def run(self):
        with self.clang_hash as cl_path:
            logging.info("Cloning clang hash...")
            logging.info("Cloning project... %s", self.project_name())

            # First, we redirect all calls to the compiler to our
            # gcc wrapper
            CC = os.path.join(cl_path, "wrappers/gcc-time")
            os.environ["CC"] = CC
            os.environ["TIMING_REPORT"] = self.stats.path
            os.environ["CHASH_EXTRA_FLAGS"] = self.cflags.value

            with self.project as src_path:
                info = {}
                self.call_configure(src_path)
                self.rebuild(src_path, info, True)

                # The wrapper appends one dict-literal line per compiler
                # invocation to the timing report; collect all float-valued
                # fields per key.
                # NOTE(review): eval() on locally generated report lines —
                # trusted input only.
                collect = defaultdict(list)
                compiler_calls = 0
                with open(self.stats.path) as fd:
                    for line in fd.readlines():
                        data = eval(line)
                        if "name" in data:
                            compiler_calls += 1
                        for key in data:
                            if type(data[key]) is float:
                                collect[key].append(data[key])
                self.save([self.project_name(), "phase", self.cflags.value, "count"],
                          compiler_calls)
                for phase in collect:
                    # Only the three interesting compiler phases are exported.
                    if phase in ("preprocessing", "parser (global)", "phase opt and generate"):
                        self.save([self.project_name(), "phase", phase, self.cflags.value],
                                  np.average(collect[phase]))

    def symlink_name(self):
        """Return the symlink name '<title>-<project><cflags>'."""
        return "%s-%s%s" % (self.title, self.project_name(), self.cflags.value)
class HistoricalCompilation(Experiment, ClangHashHelper):
    """Walk a project's git history commit by commit, rebuild each commit on
    top of its (re)built parent, and record per-build statistics."""

    inputs = {
        "clang_hash": GitArchive("/home/stettberger/w/clang-hash/"),
        "project": GitArchive("/home/stettberger/w/clang-hash/hash-projects/lua"),
        "mode": String("normal"),
        "commits": Integer(500),
        "jobs": Integer(4),
    }
    outputs = {
        "stats": File("summary.dict"),
        "ccache_stats": File("ccache.stats"),
        "clang_hash_log": File("clang-hash.log"),
    }

    def build_parent(self, commit, from_scratch=False):
        """Bring the working tree to `commit`'s parent and build it.

        Returns True when the parent built successfully, False otherwise.
        With from_scratch=True the tree is cleaned (hash files kept) and fully
        reconfigured; otherwise a similar build directory is reused and, on
        failure, the build is retried from scratch once.
        """
        def eq_hash(a, b):
            # Abbreviated-hash-tolerant comparison; None when either is empty.
            if len(a) == 0 or len(b) == 0:
                return
            if len(a) > len(b):
                return a.startswith(b)
            else:
                return b.startswith(a)

        src_path = self.project.path
        if from_scratch:
            shell("cd %s; git clean -dfx -e '*.hash' -e '*.hash.copy'", src_path)
            logging.info("Parent [%s^]: clean build", commit)
            shell("cd %s; git reset --hard %s^", src_path, commit)
            info = {"commit": commit + "^"}
            self.call_configure(src_path)
            self.rebuild(src_path, info, True)
            # Did initial commit fail? Try again
            if info.get("failed"):
                logging.info("Parent[%s^]: failed", commit)
                return False
            return True
        else:
            (lines, _) = shell("cd %s; git rev-parse %s^", src_path, commit)
            parent_revision = lines[0].strip()
            if self.current_revision and eq_hash(self.current_revision, parent_revision):
                # Working tree already holds a good build of the parent.
                logging.info("Parent[%s^]: resuse good parent", commit)
                return True
            else:
                logging.info("Parent[%s^]: resuse similar build directory", commit)
                shell("cd %s; git reset --hard %s^", src_path, commit)
                info = {"commit": commit + "^"}
                self.call_reconfigure(src_path)
                self.rebuild(src_path, info, True)
                # Did initial commit fail? Try again
                if info.get("failed"):
                    return self.build_parent(commit, from_scratch=True)
                return True

    def run(self):
        # Determine the mode
        modes = ('normal', 'ccache', 'clang-hash', 'ccache-clang-hash')
        if not self.mode.value in modes:
            raise RuntimeError("Mode can only be one of: %s" % modes)

        logging.info("Build the Clang-Hash Plugin")
        with self.clang_hash as cl_path:
            shell("cd %s; mkdir build; cd build; cmake .. -DCMAKE_BUILD_TYPE=Release; make -j 4", cl_path)
            # BUGFIX: path said 'clang-plguin', so the glob never matched and
            # the plugin was shipped unstripped (the sibling class below uses
            # the correct 'clang-plugin').
            shell("strip %s/build/clang-plugin/*.so", cl_path)

            # Project name
            logging.info("Cloning project... %s", self.project_name())
            self.build_info = {"project-name": self.project_name(),
                               "commit-hash": self.metadata["project-hash"],
                               'builds': []}

            with self.project as src_path:
                (commits, _) = shell("cd %s; git log --no-merges --oneline --topo-order --format='%%H %%P %%s'", src_path)
                # [0] is hash. [1] is parent, [2] rest
                commits = [x.split(" ", 2) for x in reversed(commits)]
                commits = commits[-self.commits.value:]
                self.current_revision = None

                # First, we redirect all calls to the compiler to our
                # clang hash wrapper
                self.setup_compiler_paths(cl_path)

                time = 0
                last_failed = True
                while commits:
                    # Search for a child of the current revision
                    commit = None
                    if self.current_revision:
                        for idx in range(0, len(commits)):
                            if commits[idx][1] == self.current_revision:
                                commit = commits[idx]
                                del commits[idx]
                                break
                    # No Child found -> Take the first one.
                    if not commit:
                        commit = commits.pop(0)

                    # Bash initial commit
                    if commit[0] == "726f63884db0132f01745f1fb4465e6621088ccf":
                        continue

                    info = {"commit": commit[0],
                            "parent": commit[1],
                            "summary": commit[2]}

                    # Somehow this commit in musl is weird. It behaves
                    # totally different, if build with a fresh parent and
                    # a non-fresh parent. With this we are one the save side
                    if commit[0] == "416d1c7a711807384cc21a18163475cf757bbcb5":
                        last_failed = True

                    # First, we build the parent. In a total linear
                    # history, this is a NOP. Otherwise, we try to reset
                    # to the actual parent, and rebuild the project. This
                    # may fail, since the current commit might fix this.
                    ret = self.build_parent(commit[0], from_scratch=last_failed)
                    info['parent-ok'] = ret

                    # Change to the ACTUAL commit. Call reconfigure, and
                    # then go on building the commit.
                    shell("cd %s; git reset --hard %s", src_path, commit[0])
                    self.call_reconfigure(src_path)

                    if os.path.exists("/tmp/clang-hash.log"):
                        os.unlink("/tmp/clang-hash.log")

                    # Rebuild and Measure
                    self.rebuild(src_path, info, fail_ok=True)

                    if os.path.exists("/tmp/clang-hash.log") and not info.get("failed"):
                        with open("/tmp/clang-hash.log") as fd:
                            self.clang_hash_log.value += fd.read()

                    self.build_info["builds"].append(info)
                    if not info.get("failed"):
                        time += info['build-time'] / 1e9
                        # Build was good. Remember that.
                        self.current_revision = commit[0]
                        last_failed = False
                    else:
                        self.current_revision = None
                        last_failed = True

            logging.info("Rebuild for %d commits takes %f minutes",
                         self.commits.value, time / 60.)
            # Output the summary of this build into the statistics file.
            with open(self.stats.path, "w+") as fd:
                fd.write(repr(self.build_info))

    def variant_name(self):
        """Return '<project>-<mode>' for this experiment variant."""
        return "%s-%s" % (self.project_name(), self.metadata['mode'])

    def symlink_name(self):
        """Return the symlink name '<title>-<variant>'."""
        return "%s-%s" % (self.title, self.variant_name())
class HistoricalCompilation(Experiment, ClangHashHelper):
    """Walk a project's git history, rebuild every selected commit with
    the clang-hash compiler wrapper, and record which symbol hashes
    (local and global) changed between each commit and its parent.

    Results accumulate in self.build_info and are written to the
    `stats` output file as a repr()'d dictionary.
    """

    inputs = {
        "clang_hash": GitArchive("/home/cip/2015/yb90ifym/clang-hash/"),
        "project": GitArchive("/home/cip/2015/yb90ifym/lua"),
        "mode": String("normal"),
        "commits": Integer(500), # was 500
        "jobs": Integer(1), # was 4
    }
    outputs = {
        "stats": File("summary.dict"),
        "ccache_stats": File("ccache.stats"),
        "clang_hash_log": File("clang-hash.log"),
    }

    def build_parent(self, commit, from_scratch=False):
        """Bring the working tree to the state of `commit`'s parent and
        build it.

        commit       -- hash of the commit whose parent is to be built
        from_scratch -- when True, clean the tree (keeping *.hash files)
                        and do a full configure+build instead of an
                        incremental rebuild
        Returns True if the parent built successfully, else False.
        """
        def eq_hash(a, b):
            # Compare two (possibly abbreviated) revision hashes: the
            # shorter one must be a prefix of the longer one.
            # NOTE(review): returns None (falsy) when either is empty.
            if len(a) == 0 or len(b) == 0:
                return
            if len(a) > len(b):
                return a.startswith(b)
            else:
                return b.startswith(a)
        src_path = self.project.path
        if from_scratch:
            # Drop every untracked file except the recorded hashes, then
            # configure and build the parent from a pristine state.
            shell("cd %s; git clean -dfx -e '*.hash' -e '*.hash.copy'", src_path)
            logging.info("Parent [%s^]: clean build", commit)
            shell("cd %s; git reset --hard %s^", src_path, commit)
            info = {"commit": commit + "^"}
            self.call_configure(src_path)
            self.rebuild(src_path, info, True)
            # Did initial commit fail? Try again
            if info.get("failed"):
                logging.info("Parent[%s^]: failed", commit)
                return False
            return True
        else:
            (lines, _) = shell("cd %s; git rev-parse %s^", src_path, commit)
            parent_revision = lines[0].strip()
            if self.current_revision and eq_hash(self.current_revision, parent_revision):
                # The parent is exactly the revision we already built.
                logging.info("Parent[%s^]: resuse good parent", commit)
                return True
            else:
                # Reuse the build directory incrementally; fall back to a
                # from-scratch build if the incremental build fails.
                logging.info("Parent[%s^]: resuse similar build directory", commit)
                shell("cd %s; git reset --hard %s^", src_path, commit)
                info = {"commit": commit + "^"}
                self.call_reconfigure(src_path)
                self.rebuild(src_path, info, True)
                # Did initial commit fail? Try again
                if info.get("failed"):
                    return self.build_parent(commit, from_scratch=True)
                return True

    def run(self):
        """Build every selected commit and collect per-commit
        hash-change statistics."""
        # Determine the mode
        modes = ('normal', 'ccache', 'clang-hash', 'ccache-clang-hash')
        if not self.mode.value in modes:
            raise RuntimeError("Mode can only be one of: %s" % modes)

        logging.info("Build the Clang-Hash Plugin")
        with self.clang_hash as cl_path:
            shell("cd %s; mkdir build; cd build; cmake .. -DCMAKE_BUILD_TYPE=Release; make -j 4", cl_path)
            shell("strip %s/build/clang-plugin/*.so", cl_path)

            # Project name
            logging.info("Cloning project... %s", self.project_name())
            self.build_info = {"project-name": self.project_name(),
                               "commit-hash": self.metadata["project-hash"],
                               'builds': []}
            with self.project as src_path:
                # One line per commit: "<hash> <parent> <subject>".
                (commits, _) = shell("cd %s; git log --no-merges --oneline --topo-order --format='%%H %%P %%s'", src_path)
                # [0] is hash. [1] is parent, [2] rest
                commits = [x.split(" ", 2) for x in reversed(commits)]
                commits = commits[-self.commits.value:]
                self.current_revision = None
                # First, we redirect all calls to the compiler to our
                # clang hash wrapper
                self.setup_compiler_paths(cl_path)
                time = 0
                last_failed = True
                nr_of_commits = len(commits)
                original_commits = commits[:]
                occurred_errors = {} # map commit -> [error strings]

                def gather_local_hashes(src_path):
                    # Collect the per-element ("local") hashes that the
                    # clang-hash wrapper left in the build tree, as a
                    # dict: element name -> hash.
                    remove_keys = ['project', 'return-code', 'start-time',
                                   'run_id', 'compile-duration',
                                   'processed-bytes', 'hash-duration',
                                   'hash-start-time',
                                   'object-file-size'] # TODO: ofile-size useful?
                    hashes = read_hash_directory(src_path, remove_keys)
                    local_hashes = {}
                    for entry in hashes:
                        element_hashes = entry['element-hashes']
                        for element in element_hashes:
                            local_hashes[element[0]] = element[1]
                    return local_hashes

                def gather_global_hashes(local_hashes, occurred_errors):
                    # Invoke clang-hash-global for every symbol.
                    # NOTE(review): the shell output is discarded, so the
                    # returned dict is always empty -- only the error
                    # bookkeeping in occurred_errors has an effect.
                    # Closes over `commit` from the loop below.
                    global_hashes = {}
                    for symbol in local_hashes:
                        symbol = symbol.split(':')[1] # Remove the prefix ('function:' etc.)
                        try:
                            shell("cd %s; %s/clang-hash-global --definition %s", src_path, self.inputs.clang_hash.path, symbol)
                        except Exception as e:
                            occurred_errors[commit[0]] = e
                            # don't raise exception
                    return global_hashes

                def add_additional_commit_info_to(info):
                    # Run `git show | diffstat` for the currently
                    # checked-out revision and record insertion/deletion
                    # counts plus a per-file change map in `info`.
                    gitshow = subprocess.Popen(["git", "show"], stdout=subprocess.PIPE)
                    # NOTE(review): ('diffstat') is just the string
                    # 'diffstat', not a tuple; check_output accepts it.
                    dstat_out = subprocess.check_output(('diffstat'), stdin=gitshow.stdout)
                    gitshow.wait()
                    lines = dstat_out.split('\n')
                    # Find the last non-empty line: the diffstat summary,
                    # e.g. "3 files changed, 14 insertions(+), 2 deletions(-)"
                    index = -1
                    while lines[index] == '':
                        index -= 1
                    last_line = lines[index]
                    changedInsertionsDeletions = [int(s) for s in last_line.split() if s.isdigit()]
                    if "insertion" in last_line:
                        info['insertions'] = changedInsertionsDeletions[1]
                        if "deletion" in last_line:
                            info['deletions'] = changedInsertionsDeletions[2]
                    elif "deletion" in last_line:
                        info['deletions'] = changedInsertionsDeletions[1]

                    # Get changed files
                    changed_files = {}
                    for line in lines:
                        if '|' in line:
                            elems = line.split()
                            assert elems[1] == '|'
                            filename = elems[0]
                            nr_of_changes = int(elems[2])
                            changed_files[filename] = nr_of_changes
                    # Sanity check: diffstat's file count must match.
                    assert len(changed_files) == changedInsertionsDeletions[0]
                    info['changes'] = changed_files

                while commits:
                    # Search for a child of the current revision
                    commit = None
                    if self.current_revision:
                        for idx in range(0, len(commits)):
                            if commits[idx][1] == self.current_revision:
                                commit = commits[idx]
                                del commits[idx]
                                break
                    # No Child found -> Take the first one.
                    if not commit:
                        commit = commits.pop(0)

                    info = {"commit": commit[0],
                            "parent": commit[1],
                            "summary": commit[2]}

                    # First, we build the parent. In a total linear
                    # history, this is a NOP. Otherwise, we try to reset
                    # to the actual parent, and rebuild the project. This
                    # may fail, since the current commit might fix this.
                    ret = self.build_parent(commit[0], from_scratch=last_failed)
                    info['parent-ok'] = ret

                    parent_info = {}
                    add_additional_commit_info_to(parent_info)
                    info['parent-info'] = parent_info

                    # Gather hashes of parent
                    parent_local_hashes = gather_local_hashes(src_path)
                    parent_global_hashes = gather_global_hashes(parent_local_hashes, occurred_errors)
                    #info['parent-local-hashes'] = parent_local_hashes
                    #info['parent-global-hashes'] = parent_global_hashes

                    # Change to the ACTUAL commit.
                    shell("cd %s; git reset --hard %s", src_path, commit[0])
                    add_additional_commit_info_to(info)
                    # Call reconfigure, and then go on building the commit.
                    self.call_reconfigure(src_path)
                    # Start from an empty clang-hash log for this rebuild.
                    if os.path.exists("/tmp/clang-hash.log"):
                        os.unlink("/tmp/clang-hash.log")
                    # Rebuild and Measure
                    self.rebuild(src_path, info, fail_ok=True)

                    # Don't need those atm
                    del info['clang-hash-hits']
                    del info['clang-hash-misses']

                    # Gather hashes
                    local_hashes = gather_local_hashes(src_path)
                    global_hashes = gather_global_hashes(local_hashes, occurred_errors)
                    #info['local-hashes'] = local_hashes
                    #info['global-hashes'] = global_hashes

                    # Compare hashes/search for changed hashes
                    # The parent's global hashes are copied to find removed symbols
                    changed_symbols = {}
                    parent_hashes = deepcopy(parent_global_hashes)
                    for symbol, global_hash in global_hashes.iteritems():
                        parent_global_hash = parent_hashes.pop(symbol, None)
                        if global_hash != parent_global_hash:
                            # Store it as [before, after]
                            changed_symbols[symbol] = [parent_global_hash, global_hash]
                    # Add removed symbols
                    for symbol, parent_global_hash in parent_hashes.iteritems():
                        changed_symbols[symbol] = [parent_global_hash, None]

                    # Compare hashes/search for changed hashes
                    # The parent's local hashes are copied to find removed symbols
                    local_changed_symbols = {}
                    parent_hashes = deepcopy(parent_local_hashes)
                    for symbol, local_hash in local_hashes.iteritems():
                        parent_local_hash = parent_hashes.pop(symbol, None)
                        if local_hash != parent_local_hash:
                            # Store it as [before, after]
                            local_changed_symbols[symbol] = [parent_local_hash, local_hash]
                    # Add removed symbols
                    for symbol, parent_local_hash in parent_hashes.iteritems():
                        local_changed_symbols[symbol] = [parent_local_hash, None]

                    info['changed-symbols'] = changed_symbols
                    #info['local-changed-symbols'] = local_changed_symbols
                    info['local-changed-sym-count'] = len(local_changed_symbols)

                    # TODO: add more analysis
                    # TODO: for each changed local hash, the symbol's global hash should also change...
                    # check every symbol for changed global hash\
                    # also check the commits, if the correct ones are used...

                    # Preserve the clang-hash log of successful builds.
                    if os.path.exists("/tmp/clang-hash.log") and not info.get("failed"):
                        with open("/tmp/clang-hash.log") as fd:
                            self.clang_hash_log.value += fd.read()

                    self.build_info["builds"].append(info)
                    if not info.get("failed"):
                        # build-time is presumably nanoseconds (divided
                        # by 1e9 to get seconds) -- TODO confirm.
                        time += info['build-time'] / 1e9
                        # Build was good. Remember that.
                        self.current_revision = commit[0]
                        last_failed = False
                    else:
                        # Failed build: force the next parent build from scratch.
                        self.current_revision = None
                        last_failed = True

                logging.info("Rebuild for %d commits takes %f minutes",
                             self.commits.value, time / 60.)

                print "\n\noccurred errors:\n"
                print occurred_errors
                print "\n\nchanged symbols:\n"
                print changed_symbols
                print "\n\nlocal changed symbols:\n"
                print local_changed_symbols
                print "\n\n\n"
                if len(changed_symbols) or len(local_changed_symbols):
                    print "!!! success: found one !!!"

        # Output the summary of this build into the statistics file.
        with open(self.stats.path, "w+") as fd:
            fd.write(repr(self.build_info))

    def variant_name(self):
        # "<project>-<mode>", e.g. "lua-normal".
        return "%s-%s" % (self.project_name(), self.metadata['mode'])

    def symlink_name(self):
        # Symlink name combines the experiment title with the variant name.
        return "%s-%s" % (self.title, self.variant_name())
class IncrementalCompilation(Experiment, ClangHashHelper):
    """Measure incremental rebuild costs: build the project once from
    scratch, then mark each source file as modified in turn and rebuild,
    recording one info record per rebuild into the `stats` output file.
    """

    inputs = {
        "clang_hash": GitArchive("/home/stettberger/w/clang-hash/"),
        "project": GitArchive("/home/stettberger/w/clang-hash/hash-projects/musl",
                              shallow=True),
        # True: only bump mtimes; False: prepend '#line 1' so the file
        # content actually changes.
        "touch-only": Bool(False),
        "mode": String("normal"),
        "jobs": Integer(4),
    }
    outputs = {
        "stats": File("summary.dict"),
    }

    def get_sources(self, path):
        """Return the sorted list of .c/.h files below `path` that this
        experiment is allowed to touch."""
        ret = []
        for root, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                if filename.endswith(('.h', '.c')):
                    ret.append(os.path.join(root, filename))
        if self.project_name() == "musl":
            # We do not touch headers that are external, since they
            # are untouchable.
            ret = [x for x in ret if x.endswith(".c") or "internal" in x]
        return sorted(ret)

    def touch(self, path):
        """Mark `path` as modified.

        In touch-only mode just bump the file's mtime; otherwise prepend
        a '#line 1' directive, which changes the bytes on disk without
        changing the meaning of the compiled code."""
        if self.touch_only.value:
            os.utime(path, None)
        else:
            with open(path) as fd:
                content = fd.read()
            content = "#line 1\n" + content
            with open(path, "w") as fd:
                fd.write(content)

    def run(self):
        """Build the plugin, do a fresh project build, then rebuild once
        per touched source file, and write the summary to `stats`."""
        # Determine the mode
        modes = ('normal', 'ccache', 'clang-hash')
        if self.mode.value not in modes:
            raise RuntimeError("Mode can only be one of: %s" % modes)

        logging.info("Build the Clang-Hash Plugin")
        with self.clang_hash as cl_path:
            shell("cd %s; mkdir build; cd build; cmake ..; make -j 4", cl_path)

            # Project name
            logging.info("Cloning project... %s", self.project_name())
            self.build_info = {"project-name": self.project_name(),
                               "commit-hash": self.metadata["project-hash"],
                               'builds': []}
            with self.project as src_path:
                # First, we redirect all calls to the compiler to our
                # clang hash wrapper
                self.setup_compiler_paths(cl_path)

                # Count the number of files
                sources = list(self.get_sources(src_path))
                nr_files = len(sources)
                logging.info("#files: %d", nr_files)
                self.build_info['file-count'] = nr_files

                # Initial build of the given project
                self.call_configure(src_path)
                info = {"filename": "FRESH_BUILD"}
                self.rebuild(src_path, info)
                self.build_info["builds"].append(info)

                # Touch each source file and record the rebuild cost.
                for fn in sources:
                    self.touch(fn)
                    info = {"filename": fn}
                    self.rebuild(src_path, info)
                    self.build_info["builds"].append(info)

        # Output the summary of this build into the statistics file.
        with open(self.stats.path, "w+") as fd:
            fd.write(repr(self.build_info))

    def method_name(self):
        # "touch-<mode>" or "append-<mode>" depending on touch-only.
        mod = "touch" if self.metadata['touch-only'] else "append"
        return "%s-%s" % (mod, self.metadata['mode'])

    def variant_name(self):
        return "%s-%s" % (self.project_name(), self.method_name())

    def symlink_name(self):
        return "%s-%s" % (self.title, self.variant_name())