示例#1
0
 def __init__(self, input_file, split_field):
     """Cache the per-row split assignments found in *input_file*.

     The file is loaded as a single shard (``shard_size=None``) and the
     values of *split_field* are stored for later consultation.
     """
     frame = next(load_data([input_file], shard_size=None))
     self.splits = frame[split_field].values
示例#2
0
  def featurize(self, input_files, data_dir, shard_size=8192,
                num_shards_per_batch=24, worker_pool=None,
                logging=True, debug=False):
    """Featurize provided files and write to specified location.

    Loads the raw input shard-by-shard, featurizes each batch of shards
    in a (possibly multiprocess) worker pool, and assembles the resulting
    metadata rows into a Dataset rooted at data_dir.

    Parameters
    ----------
    input_files: list or str
      Input file (or list of input files) to featurize.
    data_dir: str
      Directory the featurized shards are written to (created if absent).
    shard_size: int
      Number of rows loaded per shard.
    num_shards_per_batch: int
      Number of shards mapped over the worker pool per iteration.
    worker_pool: multiprocessing.Pool, optional
      Pool to reuse; a fresh single-process pool is created when None.
    logging: bool
      If True, route multiprocessing logs to stderr and use LoggingPool.
    debug: bool
      If True, featurize serially in-process instead of via the pool.

    Returns
    -------
    Dataset or None
      None when input_files is empty.
    """
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING
    log("Loading raw samples now.", self.verbosity)
    log("shard_size: %d" % shard_size, self.verbosity)
    log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)

    # Allow users to specify a single file for featurization
    if not isinstance(input_files, list):
      input_files = [input_files]

    if not os.path.exists(data_dir):
      os.makedirs(data_dir)

    # Nothing to featurize for an empty input list.
    if not len(input_files):
      return None
    input_type = get_input_type(input_files[0])

    if logging:
      mp.log_to_stderr()
    if worker_pool is None:
      if logging:
        worker_pool = LoggingPool(processes=1)
      else:
        worker_pool = mp.Pool(processes=1)
    log("Spawning workers now.", self.verbosity)
    metadata_rows = []
    data_iterator = it.izip(
        it.repeat((self, shard_size, input_type, data_dir)),
        enumerate(load_data(input_files, shard_size, self.verbosity)))
    # Turns out python map is terrible and exhausts the generator as given.
    # Solution seems to be to manually pull out N elements from iterator,
    # then to map on only those N elements. BLECH. Python should do a better
    # job here.
    num_batches = 0
    ############################################################## TIMING
    time2 = time.time()
    # BUGFIX: this log call was missing the verbosity argument that every
    # other log call in this method passes.
    log("TIMING: pre-map featurization took %0.3f s" % (time2-time1),
        self.verbosity)
    ############################################################## TIMING
    while True:
      log("About to start processing next batch of shards", self.verbosity)
      ############################################################## TIMING
      time1 = time.time()
      ############################################################## TIMING
      # BUGFIX: use the `it` alias consistently -- izip above already uses
      # it; referencing `itertools` here fails if only the alias is imported.
      iterator = it.islice(data_iterator, num_shards_per_batch)
      if not debug:
        batch_metadata = worker_pool.map(
            featurize_map_function, iterator)
      else:
        batch_metadata = []
        for elt in iterator:
          batch_metadata.append(featurize_map_function(elt))
      ############################################################## TIMING
      time2 = time.time()
      log("TIMING: map call on batch took %0.3f s" % (time2-time1),
           self.verbosity)
      ############################################################## TIMING
      if batch_metadata:
        metadata_rows.extend([elt for elt in batch_metadata if elt is not None])
        num_batches += 1
        log("Featurized %d datapoints\n"
            % (shard_size * num_shards_per_batch * num_batches), self.verbosity)
      else:
        break
    ############################################################## TIMING
    time1 = time.time()
    ############################################################## TIMING

    # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
    # creating a Dataset. Is there a more elegant solutions?
    # NOTE(review): `reload` here is the Python builtin, not a boolean flag;
    # this looks like a missing parameter -- confirm the intended value.
    dataset = Dataset(data_dir=data_dir,
                      metadata_rows=metadata_rows,
                      reload=reload, verbosity=self.verbosity)
    ############################################################## TIMING
    time2 = time.time()
    # BUGFIX: this was print(msg, self.verbosity), which printed the
    # verbosity value instead of honoring it; use log() like the other
    # timing messages.
    log("TIMING: dataset construction took %0.3f s" % (time2-time1),
        self.verbosity)
    ############################################################## TIMING
    return dataset
示例#3
0
 def __init__(self, input_file, split_field, verbose=False):
   """Cache the per-row split assignments found in *input_file*.

   The file is loaded as a single shard (``shard_size=None``) and the
   values of *split_field* are stored for later consultation.
   """
   frame = next(load_data([input_file], shard_size=None))
   self.splits = frame[split_field].values
   self.verbose = verbose
示例#4
0
    def featurize(self,
                  input_files,
                  data_dir,
                  shard_size=8192,
                  num_shards_per_batch=24,
                  worker_pool=None,
                  logging=True,
                  debug=False):
        """Featurize provided files and write to specified location.

        Loads the raw input shard-by-shard, featurizes each batch of
        shards in a (possibly multiprocess) worker pool, and assembles
        the resulting metadata rows into a Dataset rooted at data_dir.

        Parameters
        ----------
        input_files: list or str
          Input file (or list of input files) to featurize.
        data_dir: str
          Directory the featurized shards are written to (created if
          absent).
        shard_size: int
          Number of rows loaded per shard.
        num_shards_per_batch: int
          Number of shards mapped over the worker pool per iteration.
        worker_pool: multiprocessing.Pool, optional
          Pool to reuse; a fresh single-process pool is created when None.
        logging: bool
          If True, route multiprocessing logs to stderr and use
          LoggingPool.
        debug: bool
          If True, featurize serially in-process instead of via the pool.

        Returns
        -------
        Dataset or None
          None when input_files is empty.
        """
        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING
        log("Loading raw samples now.", self.verbosity)
        log("shard_size: %d" % shard_size, self.verbosity)
        log("num_shards_per_batch: %d" % num_shards_per_batch, self.verbosity)

        # Allow users to specify a single file for featurization
        if not isinstance(input_files, list):
            input_files = [input_files]

        if not os.path.exists(data_dir):
            os.makedirs(data_dir)

        # Nothing to featurize for an empty input list.
        if not len(input_files):
            return None
        input_type = get_input_type(input_files[0])

        if logging:
            mp.log_to_stderr()
        if worker_pool is None:
            if logging:
                worker_pool = LoggingPool(processes=1)
            else:
                worker_pool = mp.Pool(processes=1)
        log("Spawning workers now.", self.verbosity)
        metadata_rows = []
        data_iterator = it.izip(
            it.repeat((self, shard_size, input_type, data_dir)),
            enumerate(load_data(input_files, shard_size, self.verbosity)))
        # Turns out python map is terrible and exhausts the generator as given.
        # Solution seems to be to manually pull out N elements from iterator,
        # then to map on only those N elements. BLECH. Python should do a better
        # job here.
        num_batches = 0
        ############################################################## TIMING
        time2 = time.time()
        # BUGFIX: this log call was missing the verbosity argument that
        # every other log call in this method passes.
        log("TIMING: pre-map featurization took %0.3f s" % (time2 - time1),
            self.verbosity)
        ############################################################## TIMING
        while True:
            log("About to start processing next batch of shards",
                self.verbosity)
            ############################################################## TIMING
            time1 = time.time()
            ############################################################## TIMING
            # BUGFIX: use the `it` alias consistently -- izip above already
            # uses it; referencing `itertools` here fails if only the alias
            # is imported.
            iterator = it.islice(data_iterator, num_shards_per_batch)
            if not debug:
                batch_metadata = worker_pool.map(featurize_map_function,
                                                 iterator)
            else:
                batch_metadata = []
                for elt in iterator:
                    batch_metadata.append(featurize_map_function(elt))
            ############################################################## TIMING
            time2 = time.time()
            log("TIMING: map call on batch took %0.3f s" % (time2 - time1),
                self.verbosity)
            ############################################################## TIMING
            if batch_metadata:
                metadata_rows.extend(
                    [elt for elt in batch_metadata if elt is not None])
                num_batches += 1
                log(
                    "Featurized %d datapoints\n" %
                    (shard_size * num_shards_per_batch * num_batches),
                    self.verbosity)
            else:
                break
        ############################################################## TIMING
        time1 = time.time()
        ############################################################## TIMING

        # TODO(rbharath): This whole bit with metadata_rows is an awkward way of
        # creating a Dataset. Is there a more elegant solutions?
        # NOTE(review): `reload` here is the Python builtin, not a boolean
        # flag; this looks like a missing parameter -- confirm the intended
        # value.
        dataset = Dataset(data_dir=data_dir,
                          metadata_rows=metadata_rows,
                          reload=reload,
                          verbosity=self.verbosity)
        ############################################################## TIMING
        time2 = time.time()
        # BUGFIX: this was print(msg, self.verbosity), which printed the
        # verbosity value instead of honoring it; use log() like the other
        # timing messages.
        log("TIMING: dataset construction took %0.3f s" % (time2 - time1),
            self.verbosity)
        ############################################################## TIMING
        return dataset