def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                 last_broken_rev):
  """Computes statistics describing the size and certainty of the regression.

  Args:
    rev_states: List of revision states, ordered by commit position.
    first_working_rev: Revision state of the first known good revision.
    last_broken_rev: Revision state of the last known bad revision.

  Returns:
    Dict with 'regression_size', 'regression_std_err', 'confidence' and
    'bad_greater_than_good' entries.
  """
  # TODO(sergiyb): We assume that value has "values" key, which may not be
  # the case for failure-bisects, where there is a single value only.
  broken_value_lists = []
  for state in rev_states[:last_broken_rev.index + 1]:
    if state.value:
      broken_value_lists.append(state.value['values'])
  working_value_lists = []
  for state in rev_states[first_working_rev.index:]:
    if state.value:
      working_value_lists.append(state.value['values'])

  # Flatten the lists to calculate mean of all values.
  working_mean = [v for values in working_value_lists for v in values]
  broken_mean = [v for values in broken_value_lists for v in values]

  # Calculate the approximate size of the regression.
  mean_of_bad_runs = math_utils.Mean(broken_mean)
  mean_of_good_runs = math_utils.Mean(working_mean)
  relative_change = math_utils.RelativeChange(mean_of_good_runs,
                                              mean_of_bad_runs)
  # NaN relative change means one side was zero (zero-to-nonzero change).
  if math.isnan(relative_change):
    regression_size = 'zero-to-nonzero'
  else:
    regression_size = 100 * relative_change

  pooled_std_err = math_utils.PooledStandardError(
      [working_mean, broken_mean])
  # Guard the denominator against (near-)zero means.
  denominator = max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))
  regression_std_err = math.fabs(pooled_std_err / denominator) * 100.0

  # Give a "confidence" in the bisect. Currently, we consider the values of
  # only the revisions at the breaking range (last known good and first known
  # bad); see the note in the docstring for FindBreakingRange.
  good_sample = sum([first_working_rev.value['values']], [])
  bad_sample = sum([last_broken_rev.value['values']], [])
  confidence = cls.ConfidenceScore(good_sample, bad_sample)

  return {
      'regression_size': regression_size,
      'regression_std_err': regression_std_err,
      'confidence': confidence,
      'bad_greater_than_good': mean_of_bad_runs > mean_of_good_runs
  }
def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                 last_broken_rev):
  """Summarizes how big and how certain the detected regression is.

  Args:
    rev_states: List of revision states ordered by commit position.
    first_working_rev: Revision state of the first good revision.
    last_broken_rev: Revision state of the last bad revision.

  Returns:
    Dict with 'regression_size', 'regression_std_err', 'confidence' and
    'bad_greater_than_good' entries.
  """
  # TODO(sergiyb): We assume that value has "values" key, which may not be
  # the case for failure-bisects, where there is a single value only.
  broken_means = [rs.value['values']
                  for rs in rev_states[:last_broken_rev.index + 1]
                  if rs.value]
  working_means = [rs.value['values']
                   for rs in rev_states[first_working_rev.index:]
                   if rs.value]

  # Flatten the lists to calculate mean of all values.
  working_mean = []
  for values in working_means:
    working_mean += values
  broken_mean = []
  for values in broken_means:
    broken_mean += values

  # Calculate the approximate size of the regression.
  mean_of_bad_runs = math_utils.Mean(broken_mean)
  mean_of_good_runs = math_utils.Mean(working_mean)
  regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                    mean_of_bad_runs)
  # NaN here indicates a zero-to-nonzero change.
  if math.isnan(regression_size):
    regression_size = 'zero-to-nonzero'

  regression_std_err = math.fabs(
      math_utils.PooledStandardError([working_mean, broken_mean]) /
      max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

  # Give a "confidence" in the bisect culprit by seeing whether the results
  # of the culprit revision and the revision before that appear to be
  # statistically significantly different.
  confidence = cls.ConfidenceScore(
      sum([first_working_rev.value['values']], []),
      sum([last_broken_rev.value['values']], []))

  return {
      'regression_size': regression_size,
      'regression_std_err': regression_std_err,
      'confidence': confidence,
      'bad_greater_than_good': mean_of_bad_runs > mean_of_good_runs
  }
def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
  """Compiles a list of other possible regressions from the revision data.

  Args:
    revision_states: Sorted list of RevisionState objects.
    bad_greater_than_good: Whether the result value at the "bad" revision
        is numerically greater than the result value at the "good"
        revision.

  Returns:
    A list of [current_rev, previous_rev, confidence] for other places
    where there may have been a regression.
  """
  candidates = []
  earlier_value_lists = []
  earlier_state = None
  for state in revision_states:
    # Skip revisions with no recorded result.
    if not state.value:
      continue
    values = state.value['values']
    if earlier_value_lists:
      confidence = cls.ConfidenceScore(
          sum(earlier_value_lists, []),
          sum([values], []),
          accept_single_bad_or_good=True)
      earlier_mean = math_utils.Mean(sum(earlier_value_lists, []))
      current_mean = math_utils.Mean(values)
      # Check that the potential regression is in the same direction as
      # the overall regression. If the mean of the previous runs < the
      # mean of the current runs, this local regression is in the same
      # direction.
      if bad_greater_than_good:
        is_same_direction = earlier_mean > current_mean
      else:
        is_same_direction = earlier_mean <= current_mean
      # Only report potential regressions with high confidence.
      if is_same_direction and confidence > 50:
        candidates.append([state, earlier_state, confidence])
    earlier_value_lists.append(values)
    earlier_state = state
  return candidates
def _FindOtherRegressions(revision_data_sorted, bad_greater_than_good):
  """Compiles a list of other possible regressions from the revision data.

  Args:
    revision_data_sorted: Sorted list of (revision, revision data) pairs.
    bad_greater_than_good: Whether the result value at the "bad" revision
        is numerically greater than the result value at the "good"
        revision.

  Returns:
    A list of [current_rev, previous_rev, confidence] for other places
    where there may have been a regression.
  """
  other_regressions = []
  earlier_values = []
  earlier_id = None
  for revision_id, data in revision_data_sorted:
    value_entry = data['value']
    # Revisions without a result don't participate in the comparison.
    if not value_entry:
      continue
    values = value_entry['values']
    if earlier_values:
      confidence = ConfidenceScore(earlier_values, [values])
      mean_before = math_utils.Mean(sum(earlier_values, []))
      mean_now = math_utils.Mean(values)
      # Check that the potential regression is in the same direction as
      # the overall regression. If the mean of the previous runs < the
      # mean of the current runs, this local regression is in the same
      # direction.
      moved_up = mean_before < mean_now
      is_same_direction = moved_up if bad_greater_than_good else not moved_up
      # Only report potential regressions with high confidence.
      if is_same_direction and confidence > 50:
        other_regressions.append([revision_id, earlier_id, confidence])
    earlier_values.append(values)
    earlier_id = revision_id
  return other_regressions
def testMeanCompareAlternateImplementation(self):
  """Tests Mean by comparing against an alternate implementation."""
  def ReferenceMean(values):
    """Simple arithmetic mean function."""
    return sum(values) / float(len(values))

  cases = [
      [1],
      [5, 6.5, 1.2, 3],
      [-3, 0, 1, 4],
      [-3, -1, 0.12, 0.752, 3.33, 8, 16, 32, 439],
  ]
  for values in cases:
    self.assertEqual(ReferenceMean(values), math_utils.Mean(values))
def WelchsTTest(sample1, sample2):
  """Performs Welch's t-test on the two samples.

  Welch's t-test is an adaptation of Student's t-test which is used when
  the two samples may have unequal variances. It is also an independent
  two-sample t-test.

  Args:
    sample1: A collection of numbers.
    sample2: Another collection of numbers.

  Returns:
    A 3-tuple (t-statistic, degrees of freedom, p-value).
  """
  variance1 = math_utils.Variance(sample1)
  variance2 = math_utils.Variance(sample2)
  size1 = len(sample1)
  size2 = len(sample2)
  t = _TValue(math_utils.Mean(sample1), math_utils.Mean(sample2),
              variance1, variance2, size1, size2)
  df = _DegreesOfFreedom(variance1, variance2, size1, size2)
  # The p-value is looked up from a table rather than computed exactly.
  return t, df, _LookupPValue(t, df)
def testMean_ShortList(self):
  """Checks Mean on a short list of mixed-sign integers."""
  values = [-3, 0, 1, 4]
  self.assertEqual(0.5, math_utils.Mean(values))
def testMean_OneValue(self):
  """Checks Mean on a single-element list."""
  single = [3]
  self.assertEqual(3.0, math_utils.Mean(single))
def testMeanShortList(self):
  """Tests the Mean function with a short list."""
  sample = [-3, 0, 1, 4]
  self.assertEqual(0.5, math_utils.Mean(sample))
def testMeanSingleNum(self):
  """Tests the Mean function with a single number."""
  result = math_utils.Mean([3])
  self.assertEqual(3.0, result)
def GetResultsDict(self):
  """Prepares and returns information about the final results as a dict.

  Returns:
    A dictionary with the following fields

    'first_working_revision': First good revision.
    'last_broken_revision': Last bad revision.
    'culprit_revisions': A list of revisions, which contain the bad change
        introducing the failure.
    'other_regressions': A list of tuples representing other regressions,
        which may have occurred.
    'regression_size': For performance bisects, this is a relative change of
        the mean metric value. For other bisects this field always contains
        'zero-to-nonzero'.
    'regression_std_err': For performance bisects, it is a pooled standard
        error for groups of good and bad runs. Not used for other bisects.
    'confidence': For performance bisects, it is a confidence that the good
        and bad runs are distinct groups. Not used for non-performance
        bisects.
    'revision_data_sorted': dict mapping revision ids to data about that
        revision. Each piece of revision data consists of a dict with the
        following keys:

        'passed': Represents whether the performance test was successful at
            that revision. Possible values include: 1 (passed), 0 (failed),
            '?' (skipped), 'F' (build failed).
        'depot': The depot that this revision is from (i.e. WebKit)
        'external': If the revision is a 'src' revision, 'external' contains
            the revisions of each of the external libraries.
        'sort': A sort value for sorting the dict in order of commits.

        For example:
        {
          'CL #1':
          {
            'passed': False,
            'depot': 'chromium',
            'external': None,
            'sort': 0
          }
        }
  """
  # Order revisions by commit position using the 'sort' key.
  revision_data_sorted = sorted(self.revision_data.iteritems(),
                                key=lambda x: x[1]['sort'])

  # Find range where it possibly broke.
  # Scan for the first revision that passed and the last one that failed
  # (in sorted order).
  # NOTE(review): `if not first_working_revision` would misbehave if a
  # revision id could be falsy (e.g. 0 or '') — presumably ids are
  # non-empty strings; verify against callers.
  first_working_revision = None
  first_working_revision_index = -1
  last_broken_revision = None
  last_broken_revision_index = -1

  culprit_revisions = []
  other_regressions = []
  regression_size = 0.0
  regression_std_err = 0.0
  confidence = 0.0

  for i in xrange(len(revision_data_sorted)):
    k, v = revision_data_sorted[i]
    if v['passed'] == 1:
      if not first_working_revision:
        first_working_revision = k
        first_working_revision_index = i
    # Only a 'passed' value of 0 is falsy here; '?' and 'F' are truthy
    # strings and therefore do not mark a revision as broken.
    if not v['passed']:
      last_broken_revision = k
      last_broken_revision_index = i

  # Statistics and culprit lookup are only meaningful when the bisect found
  # both a broken and a working revision.
  if last_broken_revision != None and first_working_revision != None:
    # Gather the per-revision value lists up to and including the last
    # broken revision, and from the first working revision onwards.
    broken_means = []
    for i in xrange(0, last_broken_revision_index + 1):
      if revision_data_sorted[i][1]['value']:
        broken_means.append(revision_data_sorted[i][1]['value']['values'])

    working_means = []
    for i in xrange(first_working_revision_index, len(revision_data_sorted)):
      if revision_data_sorted[i][1]['value']:
        working_means.append(revision_data_sorted[i][1]['value']['values'])

    # Flatten the lists to calculate mean of all values.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])

    # Calculate the approximate size of the regression
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    # RelativeChange returns NaN for a zero-to-nonzero change, which is
    # reported as a string instead of a percentage.
    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'

    # Pooled standard error, normalized by the smaller mean (floored at
    # 0.0001 to avoid division by zero) and expressed as a percentage.
    regression_std_err = math.fabs(math_utils.PooledStandardError(
        [working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect. At the moment we use how distinct
    # the values are before and after the last broken revision, and how
    # noisy the overall graph is.
    confidence = ConfidenceScore(working_means, broken_means)

    culprit_revisions = []

    # Remember the working directory; the depot/commit queries below chdir.
    cwd = os.getcwd()
    self._depot_registry.ChangeToDepotDir(
        self.revision_data[last_broken_revision]['depot'])

    if self.revision_data[last_broken_revision]['depot'] == 'cros':
      # Want to get a list of all the commits and what depots they belong
      # to so that we can grab info about each.
      cmd = ['repo', 'forall', '-c',
          'pwd ; git log --pretty=oneline --before=%d --after=%d' % (
          last_broken_revision, first_working_revision + 1)]
      output, return_code = bisect_utils.RunProcessAndRetrieveOutput(cmd)

      changes = []

      assert not return_code, ('An error occurred while running '
                               '"%s"' % ' '.join(cmd))

      last_depot = None
      cwd = os.getcwd()
      for l in output.split('\n'):
        if l:
          # Output will be in form:
          # /path_to_depot
          # /path_to_other_depot
          # <SHA1>
          # /path_again
          # <SHA1>
          # etc.
          if l[0] == '/':
            last_depot = l
          else:
            contents = l.split(' ')
            if len(contents) > 1:
              changes.append([last_depot, contents[0]])

      # Query commit info for each (depot, SHA1) pair discovered above.
      for c in changes:
        os.chdir(c[0])
        info = self._source_control.QueryRevisionInfo(c[1])
        culprit_revisions.append((c[1], info, None))
    else:
      # Non-cros: walk revisions from the last broken one up to (but not
      # including) the first working one, collecting commit info.
      for i in xrange(last_broken_revision_index, len(revision_data_sorted)):
        k, v = revision_data_sorted[i]
        if k == first_working_revision:
          break
        self._depot_registry.ChangeToDepotDir(v['depot'])
        info = self._source_control.QueryRevisionInfo(k)
        culprit_revisions.append((k, info, v['depot']))
    # Restore the original working directory.
    os.chdir(cwd)

    # Check for any other possible regression ranges.
    other_regressions = self._FindOtherRegressions(
        revision_data_sorted, mean_of_bad_runs > mean_of_good_runs)

  return {
      'first_working_revision': first_working_revision,
      'last_broken_revision': last_broken_revision,
      'culprit_revisions': culprit_revisions,
      'other_regressions': other_regressions,
      'regression_size': regression_size,
      'regression_std_err': regression_std_err,
      'confidence': confidence,
      'revision_data_sorted': revision_data_sorted
  }