Example #1
import pandas

# _AddDataFrameColumn, _IterTestResults, _RunLengthDecode and
# TEST_RESULTS_COLUMNS are module-level helpers defined elsewhere.


def TestResultsDataFrame(data):
  """Convert a test results request response into a data frame."""
  assert data['version'] == 4

  dfs = []
  for builder, builder_data in data.items():
    if builder == 'version':
      continue  # Skip, not a builder.
    builds = pandas.DataFrame()
    builds['timestamp'] = pandas.to_datetime(
        builder_data['secondsSinceEpoch'], unit='s')
    builds['builder'] = builder
    builds['build_number'] = builder_data['buildNumbers']
    _AddDataFrameColumn(builds, 'commit_pos', builder_data['chromeRevision'])
    for test_suite, test_case, test_results in _IterTestResults(
        builder_data['tests']):
      df = builds.copy()
      df['test_suite'] = test_suite
      df['test_case'] = test_case
      _AddDataFrameColumn(df, 'result', _RunLengthDecode(
          test_results['results']), fill_value='N')
      _AddDataFrameColumn(df, 'time', _RunLengthDecode(test_results['times']))
      dfs.append(df)

  if dfs:
    df = pandas.concat(dfs, ignore_index=True)
    assert tuple(df.columns) == TEST_RESULTS_COLUMNS
  else:
    # Return an empty data frame with the right column names otherwise.
    df = pandas.DataFrame(columns=TEST_RESULTS_COLUMNS)

  return df
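A minimal invocation sketch; the payload below is made up, but it only uses
the fields the function actually reads (a 'version' marker plus one entry per
builder), and it assumes _IterTestResults yields nothing for an empty 'tests'
dict:

fake_data = {
    'version': 4,
    'Linux Builder': {
        'secondsSinceEpoch': [1546300800, 1546304400],
        'buildNumbers': [100, 101],
        'chromeRevision': [612000, 612010],
        'tests': {},  # No tests reported, so the empty-frame branch is taken.
    },
}
df = TestResultsDataFrame(fake_data)
assert tuple(df.columns) == TEST_RESULTS_COLUMNS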
Example #2
    def testCollectPinpointResults(self):
        state = [
            StateItem('a100', job1='completed', job2='completed'),
            StateItem('a200', job3='completed', job4='running'),
            StateItem('a300', job5='running', job6='running')
        ]

        # Write some fake "previous" results for first revision.
        df = pd.DataFrame({'revision': ['a100']})
        df.to_csv(pinboard.RevisionResultsFile(state[0]), index=False)

        self.subprocess.check_output.side_effect = [
            'job4: completed\n', 'job5: running\njob6: failed\n',
            'getting csv data ...\n'
        ]
        expected_state = [
            StateItem('a100', job1='completed', job2='completed'),
            StateItem('a200', job3='completed', job4='completed'),
            StateItem('a300', job5='running', job6='failed')
        ]

        pinboard.CollectPinpointResults(state)

        self.assertEqual(state, expected_state)
        self.subprocess.check_output.assert_has_calls([
            mock.call(['vpython', pinboard.PINPOINT_CLI, 'status', 'job4'],
                      universal_newlines=True),
            mock.call(
                ['vpython', pinboard.PINPOINT_CLI, 'status', 'job5', 'job6'],
                universal_newlines=True),
            mock.call([
                'vpython', pinboard.PINPOINT_CLI, 'get-csv', '--output',
                pinboard.RevisionResultsFile(state[1]), '--', 'job3', 'job4'
            ])
        ])
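The test relies on a StateItem helper defined in the test module; a plausible
sketch of its shape, inferred from how the jobs are queried and updated above
(hypothetical, not the real definition):

def StateItem(revision, **jobs):
    # Hypothetical: pair a revision with a job_id -> status mapping.
    return {'revision': revision, 'jobs': jobs}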
Example #3
import pandas


def DataFrame(column_types, index=None, rows=None):
    """Create a DataFrame with the given column types and optional index.

    Unlike the usual pandas DataFrame constructors, this allows you to have
    explicitly typed columns, even when no rows of data are provided; and,
    when such data is available, values are explicitly cast instead of
    letting pandas guess a type.

    Args:
      column_types: A sequence of (name, dtype) pairs to define the columns.
      index: An optional column name or sequence of column names to use as
        the index of the frame.
      rows: An optional sequence of rows of data.
    """
    if rows:
        cols = list(zip(*rows))
        assert len(cols) == len(column_types)
        cols = (list(vs) for vs in cols)
    else:
        cols = (None for _ in column_types)
    df = pandas.DataFrame()
    for (column, dtype), values in zip(column_types, cols):
        df[column] = pandas.Series(values, dtype=dtype)
    if index is not None:
        index = [index] if isinstance(index, str) else list(index)
        df.set_index(index, inplace=True)
    return df
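A short usage sketch (the column names are made up) showing what the explicit
typing buys: dtypes survive even with zero rows, and provided values are cast
rather than inferred:

empty = DataFrame([('bot', 'object'), ('value', 'float64')])
assert str(empty.dtypes['value']) == 'float64'  # Typed even with no rows.

filled = DataFrame([('bot', 'object'), ('value', 'float64')],
                   index='bot', rows=[('linux', '1.5'), ('mac', '2')])
# The string values '1.5' and '2' were cast to float64, not kept as object.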
Example #4
import os

import pandas

# HIGHEST_VALID_NAN_RATIO is a module-level constant defined elsewhere.


def CalculateDistances(input_dataframe,
                       metric,
                       normalize=False,
                       output_path=None):
    """Calculates the distances of stories.

  If normalize flag is set the values are first normalized using min-max
  normalization. Then the similarity measure between every two stories is
  calculated using pearson correlation.

  Args:
    input_dataframe: A dataframe containing a list of records
    having (test_case, commit_pos, bot, value).
    metric: String containing name of the metric.
    normalize: A flag to determine if normalization is needed.
    output_path: Path to write the calculated distances.

  Returns:
    A dataframe containing the distance matrix of the stories.
  """
    input_by_story = input_dataframe.groupby('test_case')['value']
    total_values_per_story = input_by_story.size()
    nan_values_per_story = input_by_story.apply(lambda s: s.isna().sum())
    should_keep = nan_values_per_story < (total_values_per_story *
                                          HIGHEST_VALID_NAN_RATIO)
    valid_stories = total_values_per_story[should_keep].index

    filtered_dataframe = input_dataframe[input_dataframe['test_case'].isin(
        valid_stories)]

    temp_df = filtered_dataframe.copy()

    if normalize:
        # Min-max-style normalization; the +1 in the denominator below
        # guards against division by zero when max == min.
        grouped = temp_df.groupby(['bot', 'test_case'])['value']
        min_value = grouped.transform('min')
        max_value = grouped.transform('max')
        temp_df['value'] = temp_df['value'] / (1 + max_value - min_value)

    distances = pandas.DataFrame()
    grouped_temp = temp_df.groupby(temp_df['bot'])
    for _, group in grouped_temp:
        sample_df = group.pivot(index='commit_pos',
                                columns='test_case',
                                values='value')

        if distances.empty:
            distances = 1 - sample_df.corr(method='pearson')
        else:
            distances = distances.add(1 - sample_df.corr(method='pearson'),
                                      fill_value=0)

    if output_path is not None:
        if not os.path.isdir(output_path):
            os.makedirs(output_path)
        distances.to_csv(os.path.join(output_path, metric + '_distances.csv'))

    return distances
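A usage sketch with a made-up input frame; the records carry the (test_case,
commit_pos, bot, value) columns described in the docstring:

records = pandas.DataFrame({
    'test_case': ['story1', 'story1', 'story2', 'story2'],
    'commit_pos': [1, 2, 1, 2],
    'bot': ['linux'] * 4,
    'value': [1.0, 2.0, 2.0, 1.0],
})
distances = CalculateDistances(records, metric='dummy_metric')
# A story-by-story matrix: 0 on the diagonal, up to 2 for perfectly
# anti-correlated stories (as story1 and story2 are here).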
Example #5
import pandas as pd


def GetFakeResults(item):
    df = pd.DataFrame(index=[0])
    df['revision'] = item['revision']
    df['label'] = 'with_patch'
    df['benchmark'] = 'loading'
    df['name'] = 'Total:duration'
    df['timestamp'] = pd.Timestamp(item['timestamp'])
    df['count'] = 1 if item['revision'] != 'a400' else 0
    return df
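The fake mirrors the column layout produced by GetRevisionResults (see the
next example); for instance:

row = GetFakeResults({'revision': 'a100', 'timestamp': '2019-06-01'})
# A single-row frame; the 'a400' special case above presumably simulates a
# revision whose jobs produced no results (count == 0).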
Example #6
import numpy as np
import pandas as pd

# RevisionResultsFile, MEASUREMENTS and ACTIVE_STORIES are defined elsewhere
# in the module.


def GetRevisionResults(item):
    """Aggregate the results from jobs that ran on a particular revision."""
    # First load pinpoint csv results into a DataFrame. The dtype arg is
    # needed to ensure that job_id's are always read as strings (even if some
    # of them look like large numbers).
    df = pd.read_csv(RevisionResultsFile(item), dtype={'job_id': str})
    assert df['change'].str.contains(item['revision']).all(), (
        'Not all results match the expected git revision')

    # Filter out and keep only the measurements and stories that we want.
    df = df[df['name'].isin(MEASUREMENTS)]
    df = df[df['story'].isin(ACTIVE_STORIES)]

    if not df.empty:
        # Aggregate over the results of individual stories.
        df = df.groupby(['change', 'name', 'benchmark',
                         'unit'])['mean'].agg(['mean', 'count']).reset_index()
    else:
        # Otherwise build a single row with an "empty" aggregate for this revision.
        # This is needed so we can remember in the cache that this revision has
        # been processed.
        df = pd.DataFrame(index=[0])
        df['change'] = item['revision']
        df['name'] = '(missing)'
        df['benchmark'] = '(missing)'
        df['unit'] = ''
        df['mean'] = np.nan
        df['count'] = 0

    # Convert time units from milliseconds to seconds. This is what Data Studio
    # dashboards expect.
    is_ms_unit = df['unit'].str.startswith('ms_')
    df.loc[is_ms_unit, 'mean'] = df['mean'] / 1000

    # Distinguish jobs that ran with/without the tested patch.
    df['label'] = df['change'].str.contains(r'\+').map({
        False: 'without_patch',
        True: 'with_patch'
    })

    # Add timestamp and revision information. We snap the date to noon and make
    # it naive (i.e. no timezone), so the dashboard doesn't get confused with
    # dates close to the end of day.
    date = item['timestamp'].split('T')[0] + 'T12:00:00'
    df['timestamp'] = pd.Timestamp(date)
    df['revision'] = item['revision']

    # Fake the timestamp of jobs without the patch to appear as if they ran a
    # year ago; this makes it easier to visualize and compare timeseries from
    # runs with/without the patch in Data Studio dashboards.
    df.loc[df['label'] == 'without_patch',
           'timestamp'] = (df['timestamp'] - pd.DateOffset(years=1))

    return df[[
        'revision', 'timestamp', 'label', 'benchmark', 'name', 'mean', 'count'
    ]]
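A usage sketch: 'revision' and 'timestamp' are the only keys the function
reads from item, and RevisionResultsFile(item) must point at an existing
pinpoint csv for the read to succeed:

item = {'revision': 'a200', 'timestamp': '2019-06-01T18:34:00'}
df = GetRevisionResults(item)
# One aggregated row per (change, name, benchmark, unit) combination, with
# results from jobs without the patch timestamped a year earlier.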
Example #7
import pandas


def Process(item):
    # Add item to the database; `con` is a database connection provided by
    # the enclosing scope.
    df = pandas.DataFrame({'item': [item]})
    df.to_sql('items', con, index=False, if_exists='append')
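In the original code `con` comes from an enclosing scope; a minimal sketch
that makes the snippet runnable on its own, assuming an sqlite database:

import sqlite3

con = sqlite3.connect(':memory:')
Process('first item')
Process('second item')
print(pandas.read_sql('SELECT * FROM items', con))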