Exemplo n.º 1
0
 def __init__(self):
   """Create the counters and the distribution owned by this instance.

   All metrics are registered under this object's class as namespace.
   """
   super(WordExtractingDoFn, self).__init__()
   namespace = self.__class__
   self.words_counter = Metrics.counter(namespace, 'words')
   self.word_lengths_counter = Metrics.counter(namespace, 'word_lengths')
   self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
   self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
Exemplo n.º 2
0
 def __init__(self, pattern):
   """Remember the pattern and create the matched/unmatched word counters.

   Args:
     pattern: stored unchanged on ``self.pattern``.
   """
   super(FilterTextFn, self).__init__()
   self.pattern = pattern
   # Custom metrics track values while the pipeline runs and are exposed by
   # the monitoring system of whichever runner executes the pipeline. The two
   # counters below are namespaced by this class.
   namespace = self.__class__
   self.matched_words = Metrics.counter(namespace, 'matched_words')
   # NOTE(review): "umatched" (sic) is kept for both the attribute and the
   # metric name; code outside this constructor may reference either one.
   self.umatched_words = Metrics.counter(namespace, 'umatched_words')
Exemplo n.º 3
0
def select_split(cumulative_splits, kv, unused_num_partitions):
  """Select split for an `(id, _)` tuple using a hash of `id`.

  The MD5 digest of `id` is mapped to a fraction in `[0, 1)` and the first
  split whose cumulative bound exceeds that fraction is chosen, so the same
  `id` always lands in the same split.

  Args:
    cumulative_splits: sequence of `(name, p)` pairs; `p` is the bound the
      hash fraction is compared against, `name` is used as the counter name.
    kv: an `(id, _)` tuple. `id` may be bytes or text; text is UTF-8 encoded
      before hashing.
    unused_num_partitions: ignored.

  Returns:
    The index into `cumulative_splits` of the selected split.

  Raises:
    AssertionError: if no split bound exceeds the hash fraction.
  """
  key, _ = kv
  # hashlib only accepts bytes-like input; text keys would raise TypeError.
  if not isinstance(key, bytes):
    key = key.encode('utf-8')
  m = hashlib.md5(key)
  # Dividing by a float keeps this a true division regardless of whether the
  # module enables `from __future__ import division`.
  r = int(m.hexdigest(), 16) / float(2 ** (8 * m.digest_size))
  for i, (name, p) in enumerate(cumulative_splits):
    if r < p:
      Metrics.counter('select_split', name).inc()
      return i
  # Raised explicitly (instead of `assert False`) so the failure is not
  # stripped when Python runs with -O.
  raise AssertionError(
      'No split selected: hash fraction %r is not below any bound in %r' %
      (r, cumulative_splits))
Exemplo n.º 4
0
def filter_invalid_notes(min_pitch, max_pitch, kv):
  """Drop notes whose pitch lies outside `[min_pitch, max_pitch]`.

  Args:
    min_pitch: lowest pitch that is kept (inclusive).
    max_pitch: highest pitch that is kept (inclusive).
    kv: a `(key, serialized NoteSequence)` tuple.

  Returns:
    A `(key, serialized NoteSequence)` tuple. The sequence is re-serialized
    even when no note was removed.
  """
  key, serialized = kv
  sequence = music_pb2.NoteSequence.FromString(serialized)
  in_range = []
  for note in sequence.notes:
    if min_pitch <= note.pitch <= max_pitch:
      in_range.append(note)
  removed_any = len(in_range) < len(sequence.notes)
  if removed_any:
    # Rebuild the repeated field with only the surviving notes and record
    # that this sequence had at least one out-of-range pitch.
    del sequence.notes[:]
    sequence.notes.extend(in_range)
    Metrics.counter('filter_invalid_notes', 'out_of_range_pitch').inc()
  return key, sequence.SerializeToString()
 def __init__(self):
   """Create one counter, one distribution and one gauge for this class."""
   namespace = self.__class__
   self.total_metric = Metrics.counter(namespace, 'total_values')
   self.dist_metric = Metrics.distribution(namespace, 'distribution_values')
   # TODO(ajamato): Add a verifier for gauge once it is supported by the SDKs
   # and runners.
   self.latest_metric = Metrics.gauge(namespace, 'latest_value')
Exemplo n.º 6
0
 def __init__(self):
   """Create the transaction counter and the per-transaction distributions."""
   super(BitcoinTxnCountDoFn, self).__init__()
   namespace = self.__class__
   self.txn_counter = Metrics.counter(namespace, 'txns')
   # Distributions, all registered under this class as namespace.
   self.inputs_dist = Metrics.distribution(namespace, 'inputs_per_txn')
   self.outputs_dist = Metrics.distribution(namespace, 'outputs_per_txn')
   self.output_amts_dist = Metrics.distribution(namespace, 'output_amts')
   self.txn_amts_dist = Metrics.distribution(namespace, 'txn_amts')
Exemplo n.º 7
0
 def repl(*args):
   """Increment a counter once per index of the element value, then call `f`.

   `args[2]` is used as the metric namespace and `args[1]` is unpacked as a
   `(_, value)` pair; `f` and `counter_name` come from the enclosing scope.
   """
   namespace = args[2]
   counter = Metrics.counter(namespace, counter_name)
   _, value = args[1]
   # The counter is incremented by each index 0..len(value)-1, not by 1.
   for index in range(len(value)):
     counter.inc(index)
   return f(*args)
Exemplo n.º 8
0
 def __init__(self, pattern):
   """Store the pattern and create the metrics used by this class.

   Args:
     pattern: stored unchanged on ``self.pattern``.
   """
   self.pattern = pattern
   # Custom metrics track values in the pipeline as it runs: a counter for
   # unmatched words and a distribution of word lengths.
   namespace = self.__class__
   self.word_len_dist = Metrics.distribution(namespace, 'word_len_dist')
   self.unmatched_words = Metrics.counter(namespace, 'unmatched_words')
Exemplo n.º 9
0
def prepare_image_transforms(element, image_columns):
  """Replace each image path in a row with base64-encoded JPEG bytes.

  Args:
    element: one input row, as a dict. It is modified in place.
    image_columns: list of columns that are image paths.

  Returns:
    `element`, where each non-empty image path has been replaced by the
    URL-safe base64 encoding of the image re-saved as JPEG. Returns None as
    soon as one image cannot be read. Columns with an empty path are left
    untouched and only counted.
  """
  import base64
  import cStringIO
  from PIL import Image
  from tensorflow.python.lib.io import file_io as tf_file_io
  from apache_beam.metrics import Metrics

  missing_counter = Metrics.counter('main', 'ImgMissingCount')
  error_counter = Metrics.counter('main', 'ImgErrorCount')

  for column in image_columns:
    image_uri = element[column]
    if not image_uri:
      missing_counter.inc()
      continue

    try:
      with tf_file_io.FileIO(image_uri, 'r') as image_file:
        rgb_image = Image.open(image_file).convert('RGB')
    # The libraries involved raise many different exception types here; all
    # of them mean the file is unreadable, so they are handled the same way.
    # pylint: disable=broad-except
    except Exception as e:
      logging.exception('Error processing image %s: %s', image_uri, str(e))
      error_counter.inc()
      return None

    # Re-encode as JPEG in memory and store the base64 text in the row.
    jpeg_buffer = cStringIO.StringIO()
    rgb_image.save(jpeg_buffer, 'jpeg')
    element[column] = base64.urlsafe_b64encode(jpeg_buffer.getvalue())

  return element
Exemplo n.º 10
0
  def process(self, input_example):
    """Split one audio/sequence example and yield the resulting examples.

    Reads the 'id', 'audio' and 'sequence' bytes features of `input_example`.
    For the 'test' split the record is processed with `min_length=0` and
    `max_length=-1`; otherwise the configured min/max lengths are used.

    Args:
      input_example: example proto exposing `features.feature[...]`.

    Yields:
      Each example returned by `split_audio_and_label_data.process_record`.
    """
    features = input_example.features.feature
    tf.logging.info('Splitting %s', features['id'].bytes_list.value[0])

    wav_data = features['audio'].bytes_list.value[0]
    ns = music_pb2.NoteSequence.FromString(
        features['sequence'].bytes_list.value[0])

    Metrics.counter('split_wav', 'read_midi_wav_to_split').inc()

    if self._split != 'test':
      chunks = split_audio_and_label_data.process_record(
          wav_data, ns, ns.id, self._min_length, self._max_length,
          self._sample_rate)
      for chunk in chunks:
        Metrics.counter('split_wav', 'split_example').inc()
        yield chunk
      return

    # For the 'test' split, use the full length audio and midi.
    full_length = split_audio_and_label_data.process_record(
        wav_data,
        ns,
        ns.id,
        min_length=0,
        max_length=-1,
        sample_rate=self._sample_rate)
    for full_example in full_length:
      Metrics.counter('split_wav', 'full_example').inc()
      yield full_example
Exemplo n.º 11
0
 def __init__(self, project_id, instance_id, table_id):
   """Constructor of the Write connector of Bigtable.

   Args:
     project_id(str): GCP Project to write the Rows to
     instance_id(str): GCP Instance to write the Rows to
     table_id(str): GCP Table to write the `DirectRows` to
   """
   super(_BigTableWriteFn, self).__init__()
   self.beam_options = dict(
       project_id=project_id,
       instance_id=instance_id,
       table_id=table_id)
   # Both start unset; this constructor does not create them.
   self.table = None
   self.batcher = None
   self.written = Metrics.counter(self.__class__, 'Written Row')
Exemplo n.º 12
0
  def process(self, input_example):
    """Split one audio/sequence example and yield the resulting examples.

    Reads the 'id', 'audio' and 'sequence' bytes features of `input_example`.
    When `self._chunk_files` is falsy the record is processed with
    `min_length=0` and `max_length=-1`; otherwise the configured min/max
    lengths are used.

    Args:
      input_example: example proto exposing `features.feature[...]` and
        `SerializeToString()`.

    Yields:
      Each example returned by `split_audio_and_label_data.process_record`.

    Raises:
      AssertionError: re-raised from the chunked branch after the offending
        input example has been written to `self._output_directory`.
    """
    tf.logging.info('Splitting %s',
                    input_example.features.feature['id'].bytes_list.value[0])

    wav_data = input_example.features.feature['audio'].bytes_list.value[0]

    ns = music_pb2.NoteSequence.FromString(
        input_example.features.feature['sequence'].bytes_list.value[0])

    Metrics.counter('split_wav', 'read_midi_wav_to_split').inc()

    if not self._chunk_files:
      split_examples = split_audio_and_label_data.process_record(
          wav_data,
          ns,
          ns.id,
          min_length=0,
          max_length=-1,
          sample_rate=self._sample_rate)

      for example in split_examples:
        Metrics.counter('split_wav', 'full_example').inc()
        yield example
    else:
      try:
        split_examples = split_audio_and_label_data.process_record(
            wav_data, ns, ns.id, self._min_length, self._max_length,
            self._sample_rate)

        for example in split_examples:
          Metrics.counter('split_wav', 'split_example').inc()
          yield example
      except AssertionError:
        # hashlib needs bytes; a text id would raise TypeError here and hide
        # the AssertionError being handled, so encode it first.
        sequence_id = ns.id
        if not isinstance(sequence_id, bytes):
          sequence_id = sequence_id.encode('utf-8')
        output_file = (
            'badexample-' + hashlib.md5(sequence_id).hexdigest() + '.proto')
        output_path = os.path.join(self._output_directory, output_file)
        tf.logging.error('Exception processing %s. Writing file to %s',
                         ns.id, output_path)
        # Dump the failing input so it can be inspected, then propagate.
        with tf.gfile.Open(output_path, 'w') as f:
          f.write(input_example.SerializeToString())
        raise
Exemplo n.º 13
0
 def __init__(self):
     """Create the counter named by `counter_name` and log its metric name."""
     namespace = self.__class__
     self.counter = Metrics.counter(namespace, counter_name)
     message = 'counter: %s' % self.counter.metric_name
     _LOGGER.info(message)
Exemplo n.º 14
0
 def __init__(self, key_cols, val_col):
     """Store the key/value column selection and create the null-row counter.

     Args:
       key_cols: stored unchanged on ``self.key_cols``.
       val_col: stored unchanged on ``self.val_col``.
     """
     self.key_cols = key_cols
     self.val_col = val_col
     # Counts the rows with missing values.
     self.null_row_count = Metrics.counter(self.__class__, 'null_row')
Exemplo n.º 15
0
 def __init__(self):
   """Create the word metrics, all registered in the 'main' namespace."""
   namespace = 'main'
   self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
   self.word_length_counter = Metrics.counter(namespace, 'word_lengths')
   self.word_counter = Metrics.counter(namespace, 'total_words')
   self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
Exemplo n.º 16
0
 def __init__(self):
   """Create the counters and the distribution owned by this instance.

   All metrics are registered under this object's class as namespace.
   """
   namespace = self.__class__
   self.words_counter = Metrics.counter(namespace, 'words')
   self.word_lengths_counter = Metrics.counter(namespace, 'word_lengths')
   self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
   self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
Exemplo n.º 17
0
  def _process_ns(self, ns):
    """Filter one NoteSequence, convert it, and yield re-validated examples.

    When `self._filters` is set, the sequence is dropped (with a matching
    'filtered-*' counter) if it exceeds the total-time or note-count limits,
    cannot be quantized, or has too few distinct velocities / metric
    positions, or contains non-drum notes while 'drums_only' is requested.
    Surviving sequences are converted with `self._config.data_converter`;
    every extracted example is converted back to a NoteSequence and
    re-encoded to check that it can be reproduced.

    Args:
      ns: the NoteSequence to process.

    Yields:
      `(example_ns, ns.id)` tuples, one per extracted example that could be
      re-extracted.
    """
    if self._filters:
      # Cheap size-based filters first.
      if ns.total_time > self._filters['max_total_time']:
        logging.info('Skipping %s: total_time=%f', ns.id, ns.total_time)
        beam_metrics.counter('ExtractExamplesDoFn', 'filtered-too-long').inc()
        return
      if len(ns.notes) > self._filters['max_num_notes']:
        logging.info('Skipping %s: num_notes=%d', ns.id, len(ns.notes))
        beam_metrics.counter(
            'ExtractExamplesDoFn', 'filtered-too-many-notes').inc()
        return

      # The quantized sequence is only used to compute the statistics below;
      # the conversion further down works on the original `ns`.
      try:
        qns = note_seq.quantize_note_sequence(ns, steps_per_quarter=16)
      except (note_seq.BadTimeSignatureError,
              note_seq.NonIntegerStepsPerBarError, note_seq.NegativeTimeError):
        beam_metrics.counter('ExtractExamplesDoFn', 'quantize-failed').inc()
        return

      # Collect distinct velocities and distinct start positions (modulo 16
      # quantized steps) over the notes selected by the 'is_drum' filter, and
      # track whether every note in the sequence is a drum note.
      vels = set()
      metric_positions = set()
      drums_only = True
      for note in qns.notes:
        drums_only &= note.is_drum
        if ((self._filters['is_drum'] is None or
             note.is_drum == self._filters['is_drum'])
            and note.velocity > 0):
          vels.add(note.velocity)
          metric_positions.add(note.quantized_start_step % 16)

      if len(vels) < self._filters['min_velocities']:
        beam_metrics.counter(
            'ExtractExamplesDoFn', 'filtered-min-velocities').inc()
        return
      if len(metric_positions) < self._filters['min_metric_positions']:
        beam_metrics.counter(
            'ExtractExamplesDoFn', 'filtered-min-metric-positions').inc()
        return
      if self._filters['drums_only'] and not drums_only:
        beam_metrics.counter(
            'ExtractExamplesDoFn', 'filtered-drums-only').inc()
        return

    beam_metrics.counter('ExtractExamplesDoFn', 'unfiltered-sequences').inc()
    logging.info('Converting %s to tensors', ns.id)
    extracted_examples = self._config.data_converter.to_tensors(ns)
    if not extracted_examples.outputs:
      beam_metrics.counter('ExtractExamplesDoFn', 'empty-extractions').inc()
      return
    beam_metrics.counter('ExtractExamplesDoFn', 'extracted-examples').inc(
        len(extracted_examples.outputs))
    # `extracted_examples` is unpacked as four parallel sequences; only the
    # second (outputs) and third (controls) are used here.
    for _, outputs, controls, _ in zip(*extracted_examples):
      if controls.size:
        example_ns = self._config.data_converter.from_tensors(
            [outputs], [controls])[0]
      else:
        example_ns = self._config.data_converter.from_tensors([outputs])[0]
      # Try to re-encode.
      # TODO(adarob): For now we filter and count examples that cannot be
      # re-extracted, but ultimately the converter should filter these or avoid
      # producing them all together.
      reextracted_examples = self._config.data_converter.to_tensors(
          example_ns).inputs
      assert len(reextracted_examples) <= 1
      if not reextracted_examples:
        logging.warning(
            'Extracted example NoteSequence does not reproduce example. '
            'Skipping: %s', example_ns)
        beam_metrics.counter('ExtractExamplesDoFn', 'empty-reextraction').inc()
        continue
      # Extra checks if the code returns multiple segments.
      # TODO(fjord): should probably make this recursive for cases with more
      # than 1 level of hierarchy.
      if isinstance(outputs, list):
        if len(outputs) != len(reextracted_examples[0]):
          logging.warning(
              'Re-extracted example tensor has different number of segments. '
              'ID: %s. original %d, reextracted %d. Skipping.', ns.id,
              len(outputs), len(reextracted_examples[0]))
          beam_metrics.counter(
              'ExtractExamplesDoFn', 'different-reextraction-count').inc()
          continue
        # NOTE(review): a per-segment mismatch is only logged and counted;
        # the example is still yielded below. Confirm this is intended.
        for i in range(len(outputs)):
          if not np.array_equal(reextracted_examples[0][i], outputs[i]):
            logging.warning(
                'Re-extracted example tensor does not equal original example. '
                'ID: %s. Index %d. NoteSequence: %s', ns.id, i, example_ns)
            beam_metrics.counter(
                'ExtractExamplesDoFn', 'different-reextraction').inc()
      yield example_ns, ns.id
Exemplo n.º 18
0
 def __init__(self, vals):
   """Keep `vals` and create the 'outputs' counter in the 'main' namespace.

   Args:
     vals: stored unchanged on ``self._vals``.
   """
   self._output_counter = Metrics.counter('main', 'outputs')
   self._vals = vals
Exemplo n.º 19
0
  def process(self, kv):
    """Augment, encode and crop one performance, yielding training examples.

    The random generator is seeded from the MD5 of the key so that crop
    offsets and transposition amounts are deterministic per key. For every
    replication and augmentation function, the augmented sequence is encoded;
    encodings shorter than 2048 are dropped, longer ones are cropped to a
    random window of length 2048. When `self._melody` is set, a melody is
    inferred from the cropped performance and added to the example; with
    `self._noisy` the 'performance' entry is a randomly transposed version
    of the cropped performance.

    Args:
      kv: a `(key, serialized NoteSequence)` tuple. `key` may be bytes or
        text; text is UTF-8 encoded before hashing.

    Yields:
      The result of `generator_utils.to_example` for each kept crop.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    # hashlib only accepts bytes-like input; text keys would raise TypeError.
    if not isinstance(key, bytes):
      key = key.encode('utf-8')
    m = hashlib.md5(key)
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = music_pb2.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they are
    # no longer needed.
    del ns.control_changes[:]

    for _ in range(self._num_replications):
      for augment_fn in self._augment_fns:
        # Augment and encode the performance.
        try:
          augmented_performance_sequence = augment_fn(ns)
        except DataAugmentationError:
          Metrics.counter(
              'extract_examples', 'augment_performance_failed').inc()
          continue
        seq = self._encode_performance_fn(augmented_performance_sequence)
        # feed in performance as both input/output to music transformer
        # chopping sequence into length 2048 (throw out shorter sequences)
        if len(seq) >= 2048:
          max_offset = len(seq) - 2048
          offset = random.randrange(max_offset + 1)
          cropped_seq = seq[offset:offset + 2048]

          example_dict = {
              'inputs': cropped_seq,
              'targets': cropped_seq
          }

          if self._melody:
            # decode truncated performance sequence for melody inference
            decoded_midi = self._decode_performance_fn(cropped_seq)
            decoded_ns = mm.midi_io.midi_file_to_note_sequence(decoded_midi)

            # extract melody from cropped performance sequence
            melody_instrument = melody_inference.infer_melody_for_sequence(
                decoded_ns,
                melody_interval_scale=2.0,
                rest_prob=0.1,
                instantaneous_non_max_pitch_prob=1e-15,
                instantaneous_non_empty_rest_prob=0.0,
                instantaneous_missing_pitch_prob=1e-15)

            # remove non-melody notes from score
            score_sequence = copy.deepcopy(decoded_ns)
            score_notes = [note for note in score_sequence.notes
                           if note.instrument == melody_instrument]
            del score_sequence.notes[:]
            score_sequence.notes.extend(score_notes)

            # encode melody
            encode_score_fn = self._encode_score_fns['melody']
            example_dict['melody'] = encode_score_fn(score_sequence)
            # make sure performance input also matches targets; needed for
            # compatibility of both perf and (mel & perf) autoencoders

            if self._noisy:
              # randomly sample a pitch shift to construct noisy performance
              all_pitches = [x.pitch for x in decoded_ns.notes]
              min_val = min(all_pitches)
              max_val = max(all_pitches)
              # Build a real list without the zero shift so that a
              # transposition always happens. `range(...).remove(0)` only
              # works where range() returns a list; a Python 3 range object
              # has no remove() and raises AttributeError instead.
              transpose_range = [
                  shift
                  for shift in range(-(min_val - 21), 108 - max_val + 1)
                  if shift != 0]
              transpose_amount = random.choice(transpose_range)
              augmented_ns, _ = sequences_lib.transpose_note_sequence(
                  decoded_ns, transpose_amount, min_allowed_pitch=21,
                  max_allowed_pitch=108, in_place=False)
              aug_seq = self._encode_performance_fn(augmented_ns)
              example_dict['performance'] = aug_seq
            else:
              example_dict['performance'] = example_dict['targets']
            del example_dict['inputs']

          Metrics.counter('extract_examples', 'encoded_example').inc()
          Metrics.distribution(
              'extract_examples', 'performance_length_in_seconds').update(
                  int(augmented_performance_sequence.total_time))

          yield generator_utils.to_example(example_dict)
Exemplo n.º 20
0
import json
import logging
import os
import random
import sys
import apache_beam as beam
from apache_beam.metrics import Metrics
import six
import textwrap
from tensorflow.python.lib.io import file_io
from tensorflow_transform import coders
from tensorflow_transform.beam import impl as tft
from tensorflow_transform.beam import tft_beam_io
from tensorflow_transform.tf_metadata import metadata_io

# Module-level counter in the 'main' namespace, created once at import time.
img_error_count = Metrics.counter('main', 'ImgErrorCount')

# Files
SCHEMA_FILE = 'schema.json'
FEATURES_FILE = 'features.json'

# Directory names.
TRANSFORMED_METADATA_DIR = 'transformed_metadata'
RAW_METADATA_DIR = 'raw_metadata'
TRANSFORM_FN_DIR = 'transform_fn'

# Individual transforms
TARGET_TRANSFORM = 'target'
IMAGE_TRANSFORM = 'image_to_vec'


def parse_arguments(argv):
Exemplo n.º 21
0
 def __init__(self, image_uri_key: str):
     """Store the image URI key and create the good/bad image counters.

     Args:
       image_uri_key: stored unchanged on ``self.image_uri_key``.
     """
     super().__init__()
     self.image_uri_key = image_uri_key
     namespace = self.__class__
     self.image_good_counter = Metrics.counter(namespace, 'image_good')
     self.image_bad_counter = Metrics.counter(namespace, 'image_bad')
Exemplo n.º 22
0
 def __init__(self, vals):
     """Keep `vals` and create the 'outputs' counter in the 'main' namespace.

     Args:
       vals: stored unchanged on ``self._vals``.
     """
     self._output_counter = Metrics.counter('main', 'outputs')
     self._vals = vals
 def start_bundle(self):
   """Bind the 'elementsplusone' counter to ``self.count``."""
   namespace = self.__class__
   self.count = Metrics.counter(namespace, 'elementsplusone')
 def __init__(self, pattern):
     """Remember the pattern and create the matched/unmatched word counters.

     Args:
       pattern: stored unchanged on ``self.pattern``.
     """
     super(FilterTextFn, self).__init__()
     self.pattern = pattern
     # Both counters are namespaced by this object's class.
     self.matched_words = Metrics.counter(self.__class__, 'matched_words')
     self.unmatched_words = Metrics.counter(
         self.__class__, 'unmatched_words')
Exemplo n.º 25
0
def preprocess_data(input_example, hparams, process_for_training):
  """Preprocess example using data.preprocess_data.

  Args:
    input_example: example proto with 'audio', 'sequence', 'id' and
      'velocity_range' bytes features.
    hparams: passed through to `data.preprocess_data`.
    process_for_training: passed to `data.preprocess_data` as `is_training`.

  Returns:
    A `tf.train.Example` built from the fields of the preprocessed result.
  """

  def _raw_bytes(feature_name):
    # First value of the named bytes feature of the input example.
    return input_example.features.feature[feature_name].bytes_list.value[0]

  def _float_feature(array):
    return tf.train.Feature(
        float_list=tf.train.FloatList(value=array.flatten()))

  def _int64_feature(number):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[number]))

  def _bytes_feature(blob):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[blob]))

  # Build and run the preprocessing in a fresh graph.
  with tf.Graph().as_default():
    audio = tf.constant(_raw_bytes('audio'))
    sequence = tf.constant(_raw_bytes('sequence'))
    sequence_id = tf.constant(_raw_bytes('id'))
    velocity_range = tf.constant(_raw_bytes('velocity_range'))

    input_tensors = data.preprocess_data(
        sequence_id, sequence, audio, velocity_range, hparams,
        is_training=process_for_training)

    with tf.Session() as sess:
      preprocessed = sess.run(input_tensors)

  feature_map = {
      'spec': _float_feature(preprocessed.spec),
      'spectrogram_hash': _int64_feature(preprocessed.spectrogram_hash),
      'labels': _float_feature(preprocessed.labels),
      'label_weights': _float_feature(preprocessed.label_weights),
      'length': _int64_feature(preprocessed.length),
      'onsets': _float_feature(preprocessed.onsets),
      'offsets': _float_feature(preprocessed.offsets),
      'velocities': _float_feature(preprocessed.velocities),
      'sequence_id': _bytes_feature(preprocessed.sequence_id),
      'note_sequence': _bytes_feature(preprocessed.note_sequence),
  }
  example = tf.train.Example(features=tf.train.Features(feature=feature_map))
  Metrics.counter('preprocess_data', 'preprocess_example').inc()
  return example
Exemplo n.º 26
0
 def __init__(self):
   """Create the counters and the distribution owned by this instance.

   All metrics are registered under this object's class as namespace.
   """
   namespace = self.__class__
   self.words_counter = Metrics.counter(namespace, 'words')
   self.word_lengths_counter = Metrics.counter(namespace, 'word_lengths')
   self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
   self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
Exemplo n.º 27
0
 def __init__(self):
   """Create the counter named by `counter_name` and log its metric name."""
   namespace = self.__class__
   self.counter = Metrics.counter(namespace, counter_name)
   message = 'counter: %s' % self.counter.metric_name
   logging.info(message)
 def start_bundle(self):
     """Bind the 'elementsplusone' counter to ``self.count``."""
     namespace = self.__class__
     self.count = Metrics.counter(namespace, 'elementsplusone')
Exemplo n.º 29
0
  try:
    from apache_beam.options.pipeline_options import PipelineOptions
  except ImportError:
    from apache_beam.utils.pipeline_options import PipelineOptions
except ImportError:
  from apache_beam.utils.options import PipelineOptions
from PIL import Image
import tensorflow as tf

from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception
from tensorflow.python.framework import errors
from tensorflow.python.lib.io import file_io

# Short alias for the TF-Slim namespace.
slim = tf.contrib.slim

# Module-level counters, all registered in the 'main' namespace and created
# once at import time. Note that `invalid_uri` is published under the metric
# name 'invalid_file_name'.
error_count = Metrics.counter('main', 'errorCount')
missing_label_count = Metrics.counter('main', 'missingLabelCount')
csv_rows_count = Metrics.counter('main', 'csvRowsCount')
labels_count = Metrics.counter('main', 'labelsCount')
labels_without_ids = Metrics.counter('main', 'labelsWithoutIds')
existing_file = Metrics.counter('main', 'existingFile')
non_existing_file = Metrics.counter('main', 'nonExistingFile')
skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine')
embedding_good = Metrics.counter('main', 'embedding_good')
embedding_bad = Metrics.counter('main', 'embedding_bad')
incompatible_image = Metrics.counter('main', 'incompatible_image')
invalid_uri = Metrics.counter('main', 'invalid_file_name')
unlabeled_image = Metrics.counter('main', 'unlabeled_image')
unknown_label = Metrics.counter('main', 'unknown_label')

 def __init__(self, namespace):
     """Store the namespace and create this object's counter inside it.

     The counter name is taken from ``self.LABEL``.

     Args:
       namespace: metric namespace, stored on ``self.namespace``.
     """
     self.namespace = namespace
     self.counter = Metrics.counter(namespace, self.LABEL)
Exemplo n.º 31
0
def main():
    """Export Datastore Row entities, annotated with Histogram samples, to BigQuery.

    Reads `Row` and `Histogram` entities of the 'chromeperf' project from
    Datastore in 5-minute steps, converts both to dicts, joins them on
    `(test, revision)`, merges the histogram sample values into the row and
    writes the result to the partitioned BigQuery table
    `<project>:<dataset>.rows<suffix>`. Blocks until the pipeline finishes
    and then prints the pipeline counters.
    """
    project = 'chromeperf'
    options = PipelineOptions()
    options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
    options.view_as(GoogleCloudOptions).project = project
    bq_export_options = options.view_as(BqExportOptions)

    p = beam.Pipeline(options=options)
    entities_read = Metrics.counter('main', 'entities_read')
    failed_entity_transforms = Metrics.counter('main',
                                               'failed_entity_transforms')
    row_conflicts = Metrics.counter('main', 'row_conflicts')
    multiple_histograms_for_row = Metrics.counter(
        'main', 'multiple_histograms_for_row')
    orphaned_histogram = Metrics.counter('main', 'orphaned_histogram')

    """
  CREATE TABLE `chromeperf.chromeperf_dashboard_rows.<MASTER>`
  (revision INT64 NOT NULL,
   value FLOAT64 NOT NULL,
   std_error FLOAT64,
   `timestamp` TIMESTAMP NOT NULL,
   master STRING NOT NULL,
   bot STRING NOT NULL,
   measurement STRING,
   test STRING NOT NULL,
   properties STRING,
   sample_values ARRAY<FLOAT64>)
  PARTITION BY DATE(`timestamp`)
  CLUSTER BY master, bot, measurement;
  """  # pylint: disable=pointless-string-statement
    bq_row_schema = {
        'fields': [
            {
                'name': 'revision',
                'type': 'INT64',
                'mode': 'REQUIRED'
            },
            {
                'name': 'value',
                'type': 'FLOAT',
                'mode': 'REQUIRED'
            },
            {
                'name': 'std_error',
                'type': 'FLOAT',
                'mode': 'NULLABLE'
            },
            {
                'name': 'timestamp',
                'type': 'TIMESTAMP',
                'mode': 'REQUIRED'
            },
            {
                'name': 'master',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'bot',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'measurement',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'test',
                'type': 'STRING',
                'mode': 'REQUIRED'
            },
            {
                'name': 'properties',
                'type': 'STRING',
                'mode': 'NULLABLE'
            },
            {
                'name': 'sample_values',
                'type': 'FLOAT',
                'mode': 'REPEATED'
            },
        ]
    }

    def RowEntityToRowDict(entity):
        """Returns a one-element list with the row dict, or [] on KeyError."""
        entities_read.inc()
        try:
            d = {
                'revision': entity.key.id,
                'value': FloatHack(entity['value']),
                'std_error': FloatHack(entity.get('error')),
                'timestamp': entity['timestamp'].isoformat(),
                'test': entity.key.parent.name,
            }
            # Add the expando properties as a JSON-encoded dict.
            properties = {}
            for key, value in entity.items():
                if key in d or key in ['parent_test', 'error']:
                    # skip properties with dedicated columns.
                    continue
                if isinstance(value, float):
                    value = FloatHack(value)
                properties[key] = value
            d['properties'] = json.dumps(properties) if properties else None
            # Add columns derived from test: master, bot.
            test_path_parts = d['test'].split('/', 2)
            if len(test_path_parts) >= 3:
                d['master'] = test_path_parts[0]
                d['bot'] = test_path_parts[1]
                d['measurement'] = '/'.join(test_path_parts[2:])
            return [d]
        except KeyError:
            logging.getLogger().exception('Failed to convert Row')
            failed_entity_transforms.inc()
            return []

    row_query_params = dict(project=project, kind='Row')
    row_entities = (
        p
        | 'ReadFromDatastore(Row)' >> ReadTimestampRangeFromDatastore(
            row_query_params,
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            step=datetime.timedelta(minutes=5)))

    row_dicts = (row_entities
                 | 'ConvertEntityToDict(Row)' >> FlatMap(RowEntityToRowDict))

    # The sample_values are not found in the Row entity.  So we have to fetch all
    # the corresponding Histogram entities and join them with our collection of
    # Rows (by using test + revision as the join key).  We also need to unpack the
    # sample values arrays out of the zlib-compressed JSON stored in the
    # Histogram's "data" property.
    def HistogramEntityToDict(entity):
        """Returns dicts with keys: 'test', 'revision', 'sample_values'."""
        entities_read.inc()
        try:
            data = entity['data']
        except KeyError:
            logging.getLogger().exception('Histogram missing "data" field')
            failed_entity_transforms.inc()
            return []
        try:
            json_str = zlib.decompress(data)
        except zlib.error:
            logging.getLogger().exception('Histogram data not valid zlib: %r',
                                          data)
            failed_entity_transforms.inc()
            return []
        try:
            data_dict = json.loads(json_str)
        except json.JSONDecodeError:
            logging.getLogger().exception('Histogram data not valid json.')
            failed_entity_transforms.inc()
            return []
        sample_values = data_dict.get('sampleValues', [])
        if not isinstance(sample_values, list):
            # No exception is being handled here, so log with error() rather
            # than exception(), which would append an empty traceback.
            logging.getLogger().error(
                'Histogram data.sampleValues not valid list.')
            failed_entity_transforms.inc()
            return []
        count = len(sample_values)
        sample_values = [v for v in sample_values if v is not None]
        if len(sample_values) != count:
            # Logger.warn() is a deprecated alias that newer Python versions
            # no longer provide; warning() is the supported spelling.
            logging.getLogger().warning(
                'Histogram data.sampleValues contains null: %r', entity.key)
        for v in sample_values:
            if not isinstance(v, (int, float)):
                # Not inside an exception handler: use error(), see above.
                logging.getLogger().error(
                    'Histogram data.sampleValues contains non-numeric: %r', v)
                failed_entity_transforms.inc()
                return []
        try:
            return [{
                'test': entity['test'].name,
                'revision': entity['revision'],
                'sample_values': sample_values,
            }]
        except KeyError:
            logging.getLogger().exception(
                'Histogram missing test or revision field/s')
            failed_entity_transforms.inc()
            return []

    histogram_query_params = dict(project=project, kind='Histogram')
    histogram_entities = (
        p
        | 'ReadFromDatastore(Histogram)' >> ReadTimestampRangeFromDatastore(
            histogram_query_params,
            time_range_provider=bq_export_options.GetTimeRangeProvider(),
            step=datetime.timedelta(minutes=5)))

    histogram_dicts = (
        histogram_entities
        | 'ConvertEntityToDict(Histogram)' >> FlatMap(HistogramEntityToDict))

    def TestRevision(element):
        """Returns the (test, revision) join key of a row/histogram dict."""
        return (element['test'], element['revision'])

    rows_with_key = (row_dicts
                     | 'WithKeys(Row)' >> beam.WithKeys(TestRevision))
    histograms_with_key = (
        histogram_dicts | 'WithKeys(Histogram)' >> beam.WithKeys(TestRevision))

    def MergeRowAndSampleValues(element):
        """Returns the row(s) for one join key, annotated with sample values."""
        group_key, join_values = element
        rows, histograms = join_values
        if len(rows) == 0:
            orphaned_histogram.inc()
            logging.getLogger().error("No Row for Histogram(s) (%r)",
                                      group_key)
            return []
        elif len(rows) > 1:
            # Conflicting rows are passed through without sample values.
            row_conflicts.inc()
            logging.getLogger().error("Multiple rows (%d) for %r", len(rows),
                                      group_key)
            return rows
        row = rows[0]
        if len(histograms) > 1:
            # We'll merge these, so this isn't an error.
            multiple_histograms_for_row.inc()
        elif len(histograms) == 0:
            # No sample values to annotate the row with.  This is common.
            return [row]
        # Merge multiple histogram's values into a single row.
        row['sample_values'] = list(
            itertools.chain.from_iterable(h['sample_values']
                                          for h in histograms))
        return [row]

    joined_and_annotated = ((rows_with_key, histograms_with_key)
                            | beam.CoGroupByKey()
                            | beam.FlatMap(MergeRowAndSampleValues))

    def TableNameFn(unused_element):
        """Returns the destination table; the element is ignored."""
        return '{project}:{dataset}.rows{suffix}'.format(
            project=project,
            dataset=bq_export_options.dataset.get(),
            suffix=bq_export_options.table_suffix)

    _ = (joined_and_annotated
         | 'WriteToBigQuery(rows)' >> WriteToPartitionedBigQuery(
             TableNameFn,
             bq_row_schema,
             additional_bq_parameters={
                 'clustering': {
                     'fields': ['master', 'bot', 'measurement']
                 }
             }))

    result = p.run()
    result.wait_until_finish()
    PrintCounters(result)
 def __init__(self):
     """Create the 'word_lengths' counter in the 'main' namespace."""
     namespace, metric_name = 'main', 'word_lengths'
     self.word_length_counter = Metrics.counter(namespace, metric_name)
Exemplo n.º 33
0
def combine_matching_seqs(ns_ids):
  """Tag a sequence with the comma-joined ids that matched it.

  Args:
    ns_ids: A `(ns, ids)` pair, where `ids` is an iterable of strings.

  Returns:
    The same `ns` object, with its `id` attribute set to the joined ids.
  """
  sequence, matching_ids = ns_ids
  # One increment per call.
  beam_metrics.counter('ExtractExamplesDoFn', 'unique-examples').inc()
  sequence.id = ','.join(matching_ids)
  return sequence
 def __init__(self):
     """Register the 'total_words' counter in the 'main' namespace."""
     self.word_counter = Metrics.counter('main', 'total_words')
Exemplo n.º 35
0
 def __init__(self, number_of_counters, number_of_operations):
     """Create `number_of_counters` counters named 'name-0', 'name-1', ...

     Args:
         number_of_counters: How many counters to create in the
             'do-not-publish' namespace.
         number_of_operations: Stored unchanged on the instance.
     """
     self.number_of_operations = number_of_operations
     self.counters = [
         Metrics.counter('do-not-publish', 'name-{}'.format(index))
         for index in range(number_of_counters)
     ]
 def __init__(self):
     """Register the 'empty_lines' counter in the 'main' namespace."""
     self.empty_line_counter = Metrics.counter('main', 'empty_lines')
Exemplo n.º 37
0
import apache_beam as beam
from apache_beam.metrics import Metrics
try:
    from apache_beam.utils.pipeline_options import PipelineOptions
except ImportError:
    from apache_beam.utils.options import PipelineOptions
from PIL import Image
import tensorflow as tf

from tensorflow.contrib.slim.python.slim.nets import inception_v3 as inception
from tensorflow.python.framework import errors
from tensorflow.python.lib.io import file_io

slim = tf.contrib.slim

# Module-level Beam counters, all registered under the 'main' namespace.
error_count = Metrics.counter('main', 'errorCount')
missing_label_count = Metrics.counter('main', 'missingLabelCount')
csv_rows_count = Metrics.counter('main', 'csvRowsCount')
labels_count = Metrics.counter('main', 'labelsCount')
labels_without_ids = Metrics.counter('main', 'labelsWithoutIds')
existing_file = Metrics.counter('main', 'existingFile')
non_existing_file = Metrics.counter('main', 'nonExistingFile')
skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine')
embedding_good = Metrics.counter('main', 'embedding_good')
embedding_bad = Metrics.counter('main', 'embedding_bad')
incompatible_image = Metrics.counter('main', 'incompatible_image')
# NOTE: the published metric name is 'invalid_file_name', not 'invalid_uri'.
invalid_uri = Metrics.counter('main', 'invalid_file_name')
unlabeled_image = Metrics.counter('main', 'unlabeled_image')
unknown_label = Metrics.counter('main', 'unknown_label')

Exemplo n.º 38
0
def main():
  """Export Datastore `Row` entities to a partitioned BigQuery table.

  Builds a Beam pipeline that reads `Row` entities for the configured time
  range, converts each entity into a BigQuery row dict, writes the dicts to
  the `rows` table, then runs the pipeline to completion and prints its
  counters.
  """
  project = 'chromeperf'
  pipeline_options = PipelineOptions()
  pipeline_options.view_as(DebugOptions).add_experiment('use_beam_bq_sink')
  pipeline_options.view_as(GoogleCloudOptions).project = project
  bq_export_options = pipeline_options.view_as(BqExportOptions)

  pipeline = beam.Pipeline(options=pipeline_options)
  entities_read = Metrics.counter('main', 'entities_read')
  failed_entity_transforms = Metrics.counter('main', 'failed_entity_transforms')

  """
  CREATE TABLE `chromeperf.chromeperf_dashboard_data.rows_test`
  (revision INT64 NOT NULL,
   value FLOAT64 NOT NULL,
   std_error FLOAT64,
   `timestamp` TIMESTAMP NOT NULL,
   test STRING NOT NULL,
   master STRING,
   bot STRING,
   properties STRING)
  PARTITION BY DATE(`timestamp`);
  """  # pylint: disable=pointless-string-statement
  # (column name, BigQuery type, mode), in table order.
  schema_columns = [
      ('revision', 'INT64', 'REQUIRED'),
      ('value', 'FLOAT', 'REQUIRED'),
      ('std_error', 'FLOAT', 'NULLABLE'),
      ('timestamp', 'TIMESTAMP', 'REQUIRED'),
      ('test', 'STRING', 'REQUIRED'),
      ('master', 'STRING', 'NULLABLE'),
      ('bot', 'STRING', 'NULLABLE'),
      ('properties', 'STRING', 'NULLABLE'),
  ]
  bq_row_schema = {'fields': [
      {'name': col_name, 'type': col_type, 'mode': col_mode}
      for col_name, col_type, col_mode in schema_columns
  ]}

  def RowEntityToRowDict(entity):
    """Convert one `Row` entity into a list holding zero or one row dict."""
    entities_read.inc()
    try:
      row = {
          'revision': entity.key.id,
          'value': FloatHack(entity['value']),
          'std_error': FloatHack(entity.get('error')),
          'timestamp': entity['timestamp'].isoformat(),
          'test': entity.key.parent.name,
      }
      # Properties with dedicated columns are kept out of the JSON blob.
      dedicated_columns = set(row) | {'parent_test', 'error'}
      # Add the remaining (expando) properties as a JSON-encoded dict.
      properties = {
          key: FloatHack(value) if isinstance(value, float) else value
          for key, value in entity.items()
          if key not in dedicated_columns
      }
      row['properties'] = json.dumps(properties) if properties else None
      # Add columns derived from test: master, bot.
      path_parts = row['test'].split('/', 2)
      if len(path_parts) >= 3:
        row['master'], row['bot'] = path_parts[:2]
      return [row]
    except KeyError:
      logging.getLogger().exception('Failed to convert Row')
      failed_entity_transforms.inc()
      return []

  row_query_params = {'project': project, 'kind': 'Row'}
  read_rows = ReadTimestampRangeFromDatastore(
      row_query_params,
      time_range_provider=bq_export_options.GetTimeRangeProvider(),
      step=datetime.timedelta(minutes=5))
  row_entities = pipeline | 'ReadFromDatastore(Row)' >> read_rows

  row_dicts = (
      row_entities | 'ConvertEntityToRow(Row)' >> FlatMap(RowEntityToRowDict))

  table_name = '{}:chromeperf_dashboard_data.rows{}'.format(
      project, bq_export_options.table_suffix)
  write_rows = WriteToPartitionedBigQuery(table_name, bq_row_schema)
  _ = row_dicts | 'WriteToBigQuery(rows)' >> write_rows

  result = pipeline.run()
  result.wait_until_finish()
  PrintCounters(result)
Exemplo n.º 39
0
 def __init__(self, count):
     """Store `count` and register this class's 'recordsRead' counter."""
     self._count = count
     self.records_read = Metrics.counter(self.__class__, 'recordsRead')
Exemplo n.º 40
0
 def __setstate__(self, options):
   """Rebuild instance state from the pickled `options`.

   Only `options` is restored; `table` and `batcher` start out as None and
   the 'Written Row' counter is created afresh.
   """
   self.beam_options = options
   self.table = self.batcher = None
   self.written = Metrics.counter(self.__class__, 'Written Row')
Exemplo n.º 41
0
import uuid

from google.cloud.proto.datastore.v1 import entity_pb2
from google.cloud.proto.datastore.v1 import query_pb2
from googledatastore import helper as datastore_helper, PropertyFilter

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.google_cloud_platform.datastore.v1.datastoreio import ReadFromDatastore
from apache_beam.io.google_cloud_platform.datastore.v1.datastoreio import WriteToDatastore
from apache_beam.metrics import Metrics
from apache_beam.utils.pipeline_options import GoogleCloudOptions
from apache_beam.utils.pipeline_options import PipelineOptions
from apache_beam.utils.pipeline_options import SetupOptions

# Module-level Beam counters, all registered under the 'main' namespace.
empty_line_counter = Metrics.counter('main', 'empty_lines')
word_length_counter = Metrics.counter('main', 'word_lengths')
word_counter = Metrics.counter('main', 'total_words')


class WordExtractingDoFn(beam.DoFn):
    """Parse each line of input text into words."""
    def process(self, element):
        """Returns an iterator over words in contents of Cloud Datastore entity.

        Args:
          element: the input element to be processed; its `properties`
            mapping is read for the 'content' entry.

        Returns:
          The processed element.
        """
        # A missing 'content' entry yields None rather than raising.
        content_value = element.properties.get('content', None)
Exemplo n.º 42
0
 def __init__(self):
   """Initialise the base class, then register 'num_parse_errors'."""
   super(ParseEventFn, self).__init__()
   own_class = self.__class__
   self.num_parse_errors = Metrics.counter(own_class, 'num_parse_errors')
Exemplo n.º 43
0
"""


import apache_beam as beam
from apache_beam.io import tfrecordio
from apache_beam.metrics import Metrics
import cStringIO
import logging
import os
from PIL import Image

from . import _inceptionlib
from . import _util


# Module-level Beam counters, all registered under the 'main' namespace.
error_count = Metrics.counter('main', 'errorCount')
rows_count = Metrics.counter('main', 'rowsCount')
skipped_empty_line = Metrics.counter('main', 'skippedEmptyLine')
embedding_good = Metrics.counter('main', 'embedding_good')
embedding_bad = Metrics.counter('main', 'embedding_bad')
incompatible_image = Metrics.counter('main', 'incompatible_image')
# NOTE: the published metric name is 'invalid_file_name', not 'invalid_uri'.
invalid_uri = Metrics.counter('main', 'invalid_file_name')
unlabeled_image = Metrics.counter('main', 'unlabeled_image')


class ExtractLabelIdsDoFn(beam.DoFn):
  """Extracts (uri, label_ids) tuples from CSV rows.
  """

  def start_bundle(self, context=None):
    """Give the instance an empty `label_to_id_map`; `context` is unused here."""
    self.label_to_id_map = {}
Exemplo n.º 44
0
 def __init__(self, pattern):
   """Store `pattern` and register the matched/unmatched word counters.

   Args:
     pattern: Stored unchanged on the instance.
   """
   self.pattern = pattern
   # Custom metrics track values in the pipeline as it runs. The second
   # attribute and metric name are spelled 'umatched_words' (sic); both are
   # kept as-is because they are visible outside this method.
   own_class = self.__class__
   self.matched_words = Metrics.counter(own_class, 'matched_words')
   self.umatched_words = Metrics.counter(own_class, 'umatched_words')
Exemplo n.º 45
0
 def __init__(self, pattern):
   """Store `pattern` and register the matched/unmatched word counters.

   Args:
     pattern: Stored unchanged on the instance.
   """
   self.pattern = pattern
   # Custom metrics track values in the pipeline as it runs. The second
   # attribute and metric name are spelled 'umatched_words' (sic); both are
   # kept as-is because they are visible outside this method.
   own_class = self.__class__
   self.matched_words = Metrics.counter(own_class, 'matched_words')
   self.umatched_words = Metrics.counter(own_class, 'umatched_words')
Exemplo n.º 46
0
 def __init__(self, allow_errors):
   """Store `allow_errors` and register the feature-extraction counters.

   Args:
     allow_errors: Stored unchanged on the instance.
   """
   self._allow_errors = allow_errors
   own_class = self.__class__
   self._counter = Metrics.counter(own_class, 'ml-extract-features')
   self._error_counter = Metrics.counter(
       own_class, 'ml-extract-features-errors')
Exemplo n.º 47
0
 def __init__(self):
   """Register three counters and one distribution in the 'main' namespace."""
   namespace = 'main'
   self.empty_line_counter = Metrics.counter(namespace, 'empty_lines')
   self.word_length_counter = Metrics.counter(namespace, 'word_lengths')
   self.word_counter = Metrics.counter(namespace, 'total_words')
   self.word_lengths_dist = Metrics.distribution(namespace, 'word_len_dist')
Exemplo n.º 48
0
 def __init__(self):
   """Initialise the base class, then register 'num_parse_errors'."""
   super(ParseEventFn, self).__init__()
   own_class = self.__class__
   self.num_parse_errors = Metrics.counter(own_class, 'num_parse_errors')
Exemplo n.º 49
0
from google.cloud.proto.datastore.v1 import entity_pb2
from google.cloud.proto.datastore.v1 import query_pb2
from googledatastore import helper as datastore_helper, PropertyFilter

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.gcp.datastore.v1.datastoreio import ReadFromDatastore
from apache_beam.io.gcp.datastore.v1.datastoreio import WriteToDatastore
from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter
from apache_beam.utils.pipeline_options import GoogleCloudOptions
from apache_beam.utils.pipeline_options import PipelineOptions
from apache_beam.utils.pipeline_options import SetupOptions

# Module-level Beam counters, all registered under the 'main' namespace.
empty_line_counter = Metrics.counter('main', 'empty_lines')
word_length_counter = Metrics.counter('main', 'word_lengths')
word_counter = Metrics.counter('main', 'total_words')


class WordExtractingDoFn(beam.DoFn):
  """Parse each line of input text into words."""

  def process(self, element):
    """Returns an iterator over words in contents of Cloud Datastore entity.

    The element is a line of text.  If the line is blank, note that, too.

    Args:
      element: the input element to be processed

    Returns:
      The processed element.
    """
Exemplo n.º 50
0
 def __init__(self):
   """Register the 'total_bytes.count' counter in the 'pardo' namespace."""
   namespace, metric_name = 'pardo', 'total_bytes.count'
   self.counter = Metrics.counter(namespace, metric_name)
Exemplo n.º 51
0
 def __init__(self, count):
   """Store `count` and register this class's 'recordsRead' counter."""
   self._count = count
   self.records_read = Metrics.counter(self.__class__, 'recordsRead')
Exemplo n.º 52
0
 def __init__(self):
   """Initialise the base class, then register this class's three counters."""
   super(WordExtractingDoFn, self).__init__()
   own_class = self.__class__
   self.words_counter = Metrics.counter(own_class, 'words')
   self.word_lengths_counter = Metrics.counter(own_class, 'word_lengths')
   self.empty_line_counter = Metrics.counter(own_class, 'empty_lines')
Exemplo n.º 53
0
import logging
import os
import sys
from datetime import datetime
import numpy as np

import apache_beam as beam
from apache_beam.metrics import Metrics
from tensorflow_transform import coders

from trainer.config import PROJECT_ID, DATA_DIR, TFRECORD_DIR, NUM_LABELS
from trainer.util import schema, read_image

# Module-level statement: logged whenever this module is executed/imported.
logging.warning('running preprocess')

# Counters in the 'partition' namespace, plus one in the 'build' namespace.
partition_train = Metrics.counter('partition', 'train')
partition_validation = Metrics.counter('partition', 'validation')
partition_test = Metrics.counter('partition', 'test')
examples_failed = Metrics.counter('build', 'failed')


def build_example((key, label, img_bytes)):
    """Build a dictionary that contains all the features and label to store
    as TFRecord

    Args:
        raw_in: raw data to build the example from

    Returns:
        dict: A dictionary of features
Exemplo n.º 54
0
 def inc_counter(self, name):
     """Increment the counter `name` in the namespace of this class's name.

     Args:
         name: Metric name of the counter to increment.
     """
     namespace = self.__class__.__name__
     Metrics.counter(namespace, name).inc()
Exemplo n.º 55
0
    def process(self, kv):
        """Turn one serialized NoteSequence into zero or more encoded examples.

        The sequence is optionally split into hops, optionally separated into
        a melody "score" and the remaining "performance", augmented, encoded,
        and optionally cropped. Every rejection path increments a counter in
        the 'extract_examples' namespace and skips the candidate.

        Args:
            kv: A `(key, ns_str)` pair. `key` (text or bytes) seeds the random
                number generator; `ns_str` is a serialized NoteSequence proto.

        Yields:
            The result of `generator_utils.to_example` for each example dict
            that passes every check.
        """
        # Seed random number generator based on key so that hop times are
        # deterministic.
        key, ns_str = kv
        # hashlib only accepts bytes: encode text keys and hash keys that are
        # already bytes unchanged.
        key_bytes = key if isinstance(key, bytes) else key.encode('utf-8')
        m = hashlib.md5(key_bytes)
        random.seed(int(m.hexdigest(), 16))

        # Deserialize NoteSequence proto.
        ns = music_pb2.NoteSequence.FromString(ns_str)

        # Apply sustain pedal.
        ns = sequences_lib.apply_sustain_control_changes(ns)

        # Remove control changes as there are potentially a lot of them and they are
        # no longer needed.
        del ns.control_changes[:]

        if (self._min_hop_size_seconds
                and ns.total_time < self._min_hop_size_seconds):
            Metrics.counter('extract_examples', 'sequence_too_short').inc()
            return

        sequences = []
        for _ in range(self._num_replications):
            if self._max_hop_size_seconds:
                if self._max_hop_size_seconds == self._min_hop_size_seconds:
                    # Split using fixed hop size.
                    sequences += sequences_lib.split_note_sequence(
                        ns, self._max_hop_size_seconds)
                else:
                    # Sample random hop positions such that each segment size is within
                    # the specified range.
                    hop_times = [0.0]
                    while hop_times[
                            -1] <= ns.total_time - self._min_hop_size_seconds:
                        if hop_times[
                                -1] + self._max_hop_size_seconds < ns.total_time:
                            # It's important that we get a valid hop size here, since the
                            # remainder of the sequence is too long.
                            max_offset = min(
                                self._max_hop_size_seconds, ns.total_time -
                                self._min_hop_size_seconds - hop_times[-1])
                        else:
                            # It's okay if the next hop time is invalid (in which case we'll
                            # just stop).
                            max_offset = self._max_hop_size_seconds
                        offset = random.uniform(self._min_hop_size_seconds,
                                                max_offset)
                        hop_times.append(hop_times[-1] + offset)
                    # Split at the chosen hop times (ignoring zero and the final invalid
                    # time).
                    sequences += sequences_lib.split_note_sequence(
                        ns, hop_times[1:-1])
            else:
                sequences += [ns]

        for performance_sequence in sequences:
            if self._encode_score_fns:
                # We need to extract a score.
                if not self._absolute_timing:
                    # Beats are required to extract a score with metric timing.
                    beats = [
                        ta for ta in performance_sequence.text_annotations
                        if (ta.annotation_type ==
                            music_pb2.NoteSequence.TextAnnotation.BEAT)
                        and ta.time <= performance_sequence.total_time
                    ]
                    if len(beats) < 2:
                        Metrics.counter('extract_examples',
                                        'not_enough_beats').inc()
                        continue

                    # Ensure the sequence starts and ends on a beat.
                    performance_sequence = sequences_lib.extract_subsequence(
                        performance_sequence,
                        start_time=min(beat.time for beat in beats),
                        end_time=max(beat.time for beat in beats))

                    # Infer beat-aligned chords (only for relative timing).
                    try:
                        chord_inference.infer_chords_for_sequence(
                            performance_sequence,
                            chord_change_prob=0.25,
                            chord_note_concentration=50.0,
                            add_key_signatures=True)
                    except chord_inference.ChordInferenceError:
                        Metrics.counter('extract_examples',
                                        'chord_inference_failed').inc()
                        continue

                # Infer melody regardless of relative/absolute timing.
                try:
                    melody_instrument = melody_inference.infer_melody_for_sequence(
                        performance_sequence,
                        melody_interval_scale=2.0,
                        rest_prob=0.1,
                        instantaneous_non_max_pitch_prob=1e-15,
                        instantaneous_non_empty_rest_prob=0.0,
                        instantaneous_missing_pitch_prob=1e-15)
                except melody_inference.MelodyInferenceError:
                    Metrics.counter('extract_examples',
                                    'melody_inference_failed').inc()
                    continue

                if not self._absolute_timing:
                    # Now rectify detected beats to occur at fixed tempo.
                    # TODO(iansimon): also include the alignment
                    score_sequence, unused_alignment = sequences_lib.rectify_beats(
                        performance_sequence, beats_per_minute=SCORE_BPM)
                else:
                    # Score uses same timing as performance.
                    score_sequence = copy.deepcopy(performance_sequence)

                # Remove melody notes from performance.
                performance_notes = []
                for note in performance_sequence.notes:
                    if note.instrument != melody_instrument:
                        performance_notes.append(note)
                del performance_sequence.notes[:]
                performance_sequence.notes.extend(performance_notes)

                # Remove non-melody notes from score.
                score_notes = []
                for note in score_sequence.notes:
                    if note.instrument == melody_instrument:
                        score_notes.append(note)
                del score_sequence.notes[:]
                score_sequence.notes.extend(score_notes)

                # Remove key signatures and beat/chord annotations from performance.
                del performance_sequence.key_signatures[:]
                del performance_sequence.text_annotations[:]

                Metrics.counter('extract_examples', 'extracted_score').inc()

            for augment_fn in self._augment_fns:
                # Augment and encode the performance.
                try:
                    augmented_performance_sequence = augment_fn(
                        performance_sequence)
                except DataAugmentationError:
                    Metrics.counter('extract_examples',
                                    'augment_performance_failed').inc()
                    continue
                example_dict = {
                    'targets':
                    self._encode_performance_fn(augmented_performance_sequence)
                }
                if not example_dict['targets']:
                    Metrics.counter('extract_examples',
                                    'skipped_empty_targets').inc()
                    continue

                if (self._random_crop_length and len(example_dict['targets']) >
                        self._random_crop_length):
                    # Take a random crop of the encoded performance.
                    max_offset = len(
                        example_dict['targets']) - self._random_crop_length
                    offset = random.randrange(max_offset + 1)
                    example_dict['targets'] = example_dict['targets'][
                        offset:offset + self._random_crop_length]

                if self._encode_score_fns:
                    # Augment the extracted score.
                    try:
                        augmented_score_sequence = augment_fn(score_sequence)
                    except DataAugmentationError:
                        Metrics.counter('extract_examples',
                                        'augment_score_failed').inc()
                        continue

                    # Apply all score encoding functions.
                    skip = False
                    for name, encode_score_fn in self._encode_score_fns.items(
                    ):
                        example_dict[name] = encode_score_fn(
                            augmented_score_sequence)
                        if not example_dict[name]:
                            Metrics.counter('extract_examples',
                                            'skipped_empty_%s' % name).inc()
                            skip = True
                            break
                    if skip:
                        continue

                Metrics.counter('extract_examples', 'encoded_example').inc()
                Metrics.distribution(
                    'extract_examples',
                    'performance_length_in_seconds').update(
                        int(augmented_performance_sequence.total_time))

                yield generator_utils.to_example(example_dict)
Exemplo n.º 56
0
  def process(self, kv):
    """Turn one serialized NoteSequence into zero or more encoded examples.

    The sequence is optionally split into hops, optionally separated into a
    melody "score" and the remaining "performance", augmented and encoded.
    Every rejection path increments a counter in the 'extract_examples'
    namespace and skips the candidate.

    Args:
      kv: A `(key, ns_str)` pair. `key` (text or bytes) seeds the random
        number generator; `ns_str` is a serialized NoteSequence proto.

    Yields:
      The result of `generator_utils.to_example` for each example dict that
      passes every check.
    """
    # Seed random number generator based on key so that hop times are
    # deterministic.
    key, ns_str = kv
    # hashlib only accepts bytes: encode text keys (hashing a Python 3 `str`
    # raises TypeError) and hash keys that are already bytes unchanged.
    key_bytes = key if isinstance(key, bytes) else key.encode('utf-8')
    m = hashlib.md5(key_bytes)
    random.seed(int(m.hexdigest(), 16))

    # Deserialize NoteSequence proto.
    ns = music_pb2.NoteSequence.FromString(ns_str)

    # Apply sustain pedal.
    ns = sequences_lib.apply_sustain_control_changes(ns)

    # Remove control changes as there are potentially a lot of them and they are
    # no longer needed.
    del ns.control_changes[:]

    if (self._min_hop_size_seconds and
        ns.total_time < self._min_hop_size_seconds):
      Metrics.counter('extract_examples', 'sequence_too_short').inc()
      return

    sequences = []
    for _ in range(self._num_replications):
      if self._max_hop_size_seconds:
        if self._max_hop_size_seconds == self._min_hop_size_seconds:
          # Split using fixed hop size.
          sequences += sequences_lib.split_note_sequence(
              ns, self._max_hop_size_seconds)
        else:
          # Sample random hop positions such that each segment size is within
          # the specified range.
          hop_times = [0.0]
          while hop_times[-1] <= ns.total_time - self._min_hop_size_seconds:
            if hop_times[-1] + self._max_hop_size_seconds < ns.total_time:
              # It's important that we get a valid hop size here, since the
              # remainder of the sequence is too long.
              max_offset = min(
                  self._max_hop_size_seconds,
                  ns.total_time - self._min_hop_size_seconds - hop_times[-1])
            else:
              # It's okay if the next hop time is invalid (in which case we'll
              # just stop).
              max_offset = self._max_hop_size_seconds
            offset = random.uniform(self._min_hop_size_seconds, max_offset)
            hop_times.append(hop_times[-1] + offset)
          # Split at the chosen hop times (ignoring zero and the final invalid
          # time).
          sequences += sequences_lib.split_note_sequence(ns, hop_times[1:-1])
      else:
        sequences += [ns]

    for performance_sequence in sequences:
      if self._encode_score_fns:
        # We need to extract a score.
        if not self._absolute_timing:
          # Beats are required to extract a score with metric timing.
          beats = [
              ta for ta in performance_sequence.text_annotations
              if (ta.annotation_type ==
                  music_pb2.NoteSequence.TextAnnotation.BEAT)
              and ta.time <= performance_sequence.total_time
          ]
          if len(beats) < 2:
            Metrics.counter('extract_examples', 'not_enough_beats').inc()
            continue

          # Ensure the sequence starts and ends on a beat.
          performance_sequence = sequences_lib.extract_subsequence(
              performance_sequence,
              start_time=min(beat.time for beat in beats),
              end_time=max(beat.time for beat in beats)
          )

          # Infer beat-aligned chords (only for relative timing).
          try:
            chord_inference.infer_chords_for_sequence(
                performance_sequence,
                chord_change_prob=0.25,
                chord_note_concentration=50.0,
                add_key_signatures=True)
          except chord_inference.ChordInferenceError:
            Metrics.counter('extract_examples', 'chord_inference_failed').inc()
            continue

        # Infer melody regardless of relative/absolute timing.
        try:
          melody_instrument = melody_inference.infer_melody_for_sequence(
              performance_sequence,
              melody_interval_scale=2.0,
              rest_prob=0.1,
              instantaneous_non_max_pitch_prob=1e-15,
              instantaneous_non_empty_rest_prob=0.0,
              instantaneous_missing_pitch_prob=1e-15)
        except melody_inference.MelodyInferenceError:
          Metrics.counter('extract_examples', 'melody_inference_failed').inc()
          continue

        if not self._absolute_timing:
          # Now rectify detected beats to occur at fixed tempo.
          # TODO(iansimon): also include the alignment
          score_sequence, unused_alignment = sequences_lib.rectify_beats(
              performance_sequence, beats_per_minute=SCORE_BPM)
        else:
          # Score uses same timing as performance.
          score_sequence = copy.deepcopy(performance_sequence)

        # Remove melody notes from performance.
        performance_notes = []
        for note in performance_sequence.notes:
          if note.instrument != melody_instrument:
            performance_notes.append(note)
        del performance_sequence.notes[:]
        performance_sequence.notes.extend(performance_notes)

        # Remove non-melody notes from score.
        score_notes = []
        for note in score_sequence.notes:
          if note.instrument == melody_instrument:
            score_notes.append(note)
        del score_sequence.notes[:]
        score_sequence.notes.extend(score_notes)

        # Remove key signatures and beat/chord annotations from performance.
        del performance_sequence.key_signatures[:]
        del performance_sequence.text_annotations[:]

        Metrics.counter('extract_examples', 'extracted_score').inc()

      for augment_fn in self._augment_fns:
        # Augment and encode the performance.
        try:
          augmented_performance_sequence = augment_fn(performance_sequence)
        except DataAugmentationError:
          Metrics.counter(
              'extract_examples', 'augment_performance_failed').inc()
          continue
        example_dict = {
            'targets': self._encode_performance_fn(
                augmented_performance_sequence)
        }
        if not example_dict['targets']:
          Metrics.counter('extract_examples', 'skipped_empty_targets').inc()
          continue

        if self._encode_score_fns:
          # Augment the extracted score.
          try:
            augmented_score_sequence = augment_fn(score_sequence)
          except DataAugmentationError:
            Metrics.counter('extract_examples', 'augment_score_failed').inc()
            continue

          # Apply all score encoding functions.
          skip = False
          for name, encode_score_fn in self._encode_score_fns.items():
            example_dict[name] = encode_score_fn(augmented_score_sequence)
            if not example_dict[name]:
              Metrics.counter('extract_examples',
                              'skipped_empty_%s' % name).inc()
              skip = True
              break
          if skip:
            continue

        Metrics.counter('extract_examples', 'encoded_example').inc()
        Metrics.distribution(
            'extract_examples', 'performance_length_in_seconds').update(
                int(augmented_performance_sequence.total_time))

        yield generator_utils.to_example(example_dict)