Example #1
def generateStats(filename, statsInfo, maxSamples=None, filters=None, cache=True):
  """Generate the requested statistics for a dataset and cache them to a
  file (pass cache=False to skip caching)."""
  # Requires `os` and `pickle` from the standard library, plus the helpers
  # used below (findDataset, getStatsFilename, RecordSensor,
  # FileRecordStream, NumberStatsCollector, CategoryStatsCollector) from
  # the surrounding NuPIC module.
  if filters is None:  # avoid a mutable default argument
    filters = []

  # Sanity checking
  if not isinstance(statsInfo, dict):
    raise RuntimeError("statsInfo must be a dict -- "
                       "found '%s' instead" % type(statsInfo))

  filename = findDataset(filename)

  if cache:
    statsFilename = getStatsFilename(filename, statsInfo, filters)
    # Use the cached stats if the file exists AND it contains the right keys
    if os.path.exists(statsFilename):
      try:
        with open(statsFilename, "rb") as f:
          r = pickle.load(f)
      except Exception:
        # OK to ignore load errors -- we will just regenerate the file
        print "Warning: unable to load stats for %s -- " \
              "will regenerate" % filename
        r = dict()
      requestedKeys = set(statsInfo)
      availableKeys = set(r.keys())
      unavailableKeys = requestedKeys.difference(availableKeys)
      if not unavailableKeys:
        return r
      else:
        print "generateStats: re-generating stats file %s because " \
              "keys %s are not available" % \
              (statsFilename, str(unavailableKeys))
        # Remove the stale stats cache, not the dataset itself
        os.remove(statsFilename)

  print "Generating statistics for file '%s' with filters '%s'" % (filename, filters)
  sensor = RecordSensor()
  sensor.dataSource = FileRecordStream(filename)
  sensor.preEncodingFilters = filters

  # Convert each collector description (a type-name string) into a
  # collector object
  for field in statsInfo:
    if statsInfo[field] == "number":
      # A numeric field, e.g. "consumption" -- gather numeric statistics
      statsInfo[field] = NumberStatsCollector()
    elif statsInfo[field] == "category":
      statsInfo[field] = CategoryStatsCollector()
    else:
      raise RuntimeError("Unknown stats type '%s' for field '%s'"
                         % (statsInfo[field], field))

  # Now collect the stats, reading at most maxSamples records
  if maxSamples is None:
    maxSamples = 500000
  for _ in xrange(maxSamples):
    try:
      record = sensor.getNextRecord()
    except StopIteration:
      break
    for (name, collector) in statsInfo.items():
      collector.add(record[name])

  del sensor  # release the sensor and its open data source

  # Assemble the results and return. Each field appears only once in
  # statsInfo, so the per-field stats dicts never need to be merged.
  r = dict()
  for (field, collector) in statsInfo.items():
    r[field] = collector.getStats()

  if cache:
    with open(statsFilename, "wb") as f:
      pickle.dump(r, f)
    # The caller may need to know the name of the cached file
    r["_filename"] = statsFilename

  return r
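
A minimal usage sketch (the dataset path is hypothetical; the field names echo the "consumption" example in the comments above, and the call assumes the NuPIC helpers are importable). Note that generateStats mutates statsInfo in place, replacing each type string with its collector object:

stats = generateStats("mydata.csv",
                      statsInfo={"consumption": "number",
                                 "gym": "category"},
                      maxSamples=1000)
print "consumption stats:", stats["consumption"]  # per-field dict of statistics
print "cached to:", stats.get("_filename")        # present only when cache=True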
Example #2
  def testDeltaFilter(self):
    """
    Input data and the deltas the filter should generate ("r" marks a
    record that starts a new sequence, i.e. carries a reset; "X" means
    no delta is emitted for that record):

        "t"        "s"      "dt"      "ds"
        t          10       X
        t+1s       20       1s        10
        t+1d       50       86399s    30
      r t+1d+1s    60       X
        t+1d+3s    65       2s        5
    """
    # Assumes the test module imports numpy, datetime.datetime, and the
    # NuPIC helpers used below (RecordSensor, findDataset, FileRecordStream,
    # MultiEncoder, DeltaFilter).
    r = RecordSensor()
    filename = findDataset("extra/qa/delta.csv")
    datasource = FileRecordStream(filename)
    r.dataSource = datasource
    n = 50
    encoder = MultiEncoder({'blah': dict(fieldname="s", type='ScalarEncoder',
                                         n=n, w=11, minval=0, maxval=100)})

    r.encoder = encoder

    # Test #1 -- no delta filters yet.
    # Make sure we get a reset when a new sequence starts.
    resetOut = numpy.zeros((1,), dtype='float')
    sequenceIdOut = numpy.zeros((1,), dtype='float')
    dataOut = numpy.zeros((n,), dtype='float')
    sourceOut = numpy.zeros((1,), dtype='float')
    categoryOut = numpy.zeros((1,), dtype='float')

    outputs = dict(resetOut=resetOut,
                   sourceOut=sourceOut,
                   sequenceIdOut=sequenceIdOut,
                   dataOut=dataOut,
                   categoryOut=categoryOut)
    inputs = dict()
    r.verbosity = 0

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24, hour=16,
                                       minute=8, second=0))
    self.assertEqual(lr['s'], 10)
    self.assertEqual(lr['_reset'], 1)
    self.assertTrue('dt' not in lr)
    self.assertTrue('ds' not in lr)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24, hour=16,
                                       minute=8, second=1))
    self.assertEqual(lr['s'], 20)
    self.assertEqual(lr['_reset'], 0)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=0))
    self.assertEqual(lr['s'], 50)
    self.assertEqual(lr['_reset'], 0)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=1))
    self.assertEqual(lr['s'], 60)
    self.assertEqual(lr['_reset'], 1)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=3))
    self.assertEqual(lr['s'], 65)
    self.assertEqual(lr['_reset'], 0)

    # Add filters

    r.preEncodingFilters = [DeltaFilter("s", "ds"), DeltaFilter("t", "dt")]
    r.rewind()

    # The first record (which carries a reset) is skipped: there is no
    # previous record to diff against.

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=24, hour=16,
                                       minute=8, second=1))
    self.assertEqual(lr['s'], 20)
    # This record still carries a reset since it is the first of a sequence
    self.assertEqual(lr['_reset'], 1)
    self.assertEqual(lr['dt'], 1)
    self.assertEqual(lr['ds'], 10)

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=0))
    self.assertEqual(lr['s'], 50)
    self.assertEqual(lr['_reset'], 0)
    self.assertEqual(lr['dt'], 3600 * 24 - 1)
    self.assertEqual(lr['ds'], 30)

    # The next record (t+1d+1s, s=60) carries a reset and is skipped; the
    # record after it is emitted with deltas relative to the skipped one.

    r.compute(inputs, outputs)
    lr = r.lastRecord
    self.assertEqual(lr['t'], datetime(year=2011, month=2, day=25, hour=16,
                                       minute=8, second=3))
    self.assertEqual(lr['s'], 65)
    self.assertEqual(lr['_reset'], 1)
    self.assertEqual(lr['dt'], 2)
    self.assertEqual(lr['ds'], 5)
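
A minimal sketch of the delta computation this test expects, in plain Python rather than the real DeltaFilter class (sequence resets are left out for brevity; addDeltas and the sample records are made up for illustration):

from datetime import datetime

def addDeltas(records):
  """Yield each record with 'ds' and 'dt' added; the first record is
  skipped because it has no predecessor to diff against."""
  prev = None
  for rec in records:
    if prev is not None:
      row = dict(rec)
      row['ds'] = rec['s'] - prev['s']
      row['dt'] = int((rec['t'] - prev['t']).total_seconds())
      yield row
    prev = rec

records = [
    dict(t=datetime(2011, 2, 24, 16, 8, 0), s=10),
    dict(t=datetime(2011, 2, 24, 16, 8, 1), s=20),
    dict(t=datetime(2011, 2, 25, 16, 8, 0), s=50),
]
for row in addDeltas(records):
  print row['dt'], row['ds']   # prints "1 10", then "86399 30"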