def fit(self, x, y, validation_data=None, epochs=100, patience=0, verbose=None,
        min_delta=0, tensorboard=False, timeline=False, **keras_kwargs):
    if validation_data is None:
        validation_data = self.model.pipeline.encoded_validation_data

    # Keras models with multiple named inputs expect a dict of columns,
    # not a DataFrame.
    if isinstance(x, pandas.DataFrame):
        x = x.to_dict(orient='series')

    if isinstance(validation_data.x, pandas.DataFrame):
        validation_data = Observations(
            x=validation_data.x.to_dict(orient='series'),
            y=validation_data.y
        )

    if not self.keras or not self.optimizer:
        self.build()

    with self.session.as_default():
        # Collect per-step stats only when a timeline trace was requested.
        if timeline:
            run_metadata = tensorflow.RunMetadata()
        else:
            run_metadata = None
        self.keras.compile(
            loss=self.loss,
            optimizer=self.optimizer,
            options=tensorflow.RunOptions(
                trace_level=tensorflow.RunOptions.FULL_TRACE),
            run_metadata=run_metadata
        )
    if verbose is None:
        verbose = 1 if lore.env.name == lore.env.DEVELOPMENT else 0

    logger.info('\n'.join([
        '\n\n\n Fitting',
        '==============================',
        '| batch | learning |         |',
        '| size  | rate     |   decay |',
        '------------------------------',
        '| %5i | %8.6f | %7.5f |' % (
            self.batch_size,
            self.learning_rate,
            self.decay,
        ),
        '==============================\n\n'
    ]))

    # ReloadBest checkpoints the best epoch and restores those weights when
    # training stops, so early stopping does not leave a worse final model.
    reload_best = ReloadBest(
        filepath=self.model.checkpoint_path(),
        monitor=self.monitor,
        mode='auto',
    )

    callbacks = self.callbacks()
    callbacks += [
        reload_best,
        TerminateOnNaN(),
        EarlyStopping(
            monitor=self.monitor,
            min_delta=min_delta,
            patience=patience,
            verbose=verbose,
            mode='auto',
        ),
    ]
    if tensorboard:
        callbacks += [
            TensorBoard(
                log_dir=self.model.serializer.tensorboard_path,
                histogram_freq=1,
                batch_size=self.batch_size,
                write_graph=True,
                write_grads=True,
                write_images=True,
                embeddings_freq=1,
                embeddings_metadata=None
            )
        ]

    with self.session.as_default():
        # Each tower trains against its own copy of the labels.
        self.history = self.keras.fit(
            x=x,
            y=[y] * self.towers,
            validation_data=Observations(
                x=validation_data.x,
                y=[validation_data.y] * self.towers
            ),
            batch_size=self.batch_size,
            epochs=epochs,
            verbose=verbose,
            callbacks=callbacks,
            **keras_kwargs
        ).history

    if timeline:
        with open(self.model.timeline_path(), 'w') as f:
            f.write(Timeline(step_stats=run_metadata.step_stats).
                    generate_chrome_trace_format())

    return {
        'epochs': len(self.history['loss']),
        'train': reload_best.train_loss,
        'validate': reload_best.validate_loss,
    }
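# A hedged usage sketch for fit(): assumes a lore project whose model exposes
# this estimator; the names `my_app` and `Price` are illustrative, not part of
# this module.
#
#     import my_app.models
#
#     model = my_app.models.Price()
#     data = model.pipeline.encoded_training_data
#     stats = model.estimator.fit(
#         x=data.x,
#         y=data.y,
#         epochs=10,
#         patience=2,       # early stopping after 2 stagnant epochs
#         timeline=True,    # also writes a Chrome trace to model.timeline_path()
#     )
#     # stats -> {'epochs': ..., 'train': ..., 'validate': ...}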
def observations(self, data):
    """Encode a raw dataframe into (x, y) Observations for the estimator."""
    return Observations(x=self.encode_x(data), y=self.encode_y(data))
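# A minimal sketch of observations(): `pipeline` is an instance of this class
# and `raw` is a pandas DataFrame holding the unencoded source columns; both
# names are assumptions for illustration.
#
#     batch = pipeline.observations(raw)
#     batch.x  # encoded features, keyed by encoder name
#     batch.y  # encoded target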
def generator(self, table, orient='row', encoded=False, stratify=False, chunksize=None):
    if not self.loaded:
        self._split_data()

    if orient == 'column':
        if encoded:
            for encoder in self.encoders:
                transformed = encoder.transform(self.read_column(table, encoder.source_column))
                # Build one single-encoder frame per yield; a sequence encoder
                # contributes one column per position in the sequence. (Renamed
                # from `encoded` to avoid shadowing the boolean parameter.)
                encoded_columns = {}
                if hasattr(encoder, 'sequence_length'):
                    for i in range(encoder.sequence_length):
                        encoded_columns[encoder.sequence_name(i)] = transformed[:, i]
                else:
                    encoded_columns[encoder.name] = transformed
                yield Observations(x=pandas.DataFrame(encoded_columns), y=None)
        else:
            for column in self.columns:
                yield self.read_column(table, column)
    elif orient == 'row':
        if stratify:
            if not self.stratify:
                raise ValueError("Can't stratify a generator for a pipeline with no stratify")
            if chunksize is None:
                chunksize = 1
            # Bounds of the first chunk: the lowest `chunksize` stratify values.
            low, high = self.connection.execute(
                """
                SELECT min({stratify}), max({stratify})
                FROM (
                    SELECT {stratify}
                    FROM {table}
                    ORDER BY {stratify} ASC
                    LIMIT :chunksize
                )
                """.format(
                    stratify=self.quote(self.stratify),
                    table=self.quote(table),
                ),
                {'chunksize': chunksize}
            ).fetchone()
            # Compare against None so a falsy stratify value (e.g. 0) does not
            # end iteration early.
            while low is not None and high is not None:
                dataframe = pandas.read_sql(
                    """
                    SELECT *
                    FROM {table}
                    WHERE {stratify} BETWEEN :min AND :max
                    """.format(
                        stratify=self.quote(self.stratify),
                        table=self.quote(table),
                    ),
                    self.connection,
                    parse_dates=self.datetime_columns,
                    params={'min': low, 'max': high}
                )
                if encoded:
                    dataframe = Observations(x=self.encode_x(dataframe), y=self.encode_y(dataframe))
                yield dataframe
                # Advance to the next chunk of stratify values.
                low, high = self.connection.execute(
                    """
                    SELECT min({stratify}), max({stratify})
                    FROM (
                        SELECT {stratify}
                        FROM {table}
                        WHERE {stratify} > :max
                        ORDER BY {stratify} ASC
                        LIMIT :chunksize
                    )
                    """.format(
                        stratify=self.quote(self.stratify),
                        table=self.quote(table),
                    ),
                    {'max': high, 'chunksize': chunksize}
                ).fetchone()
        else:
            if chunksize is None:
                chunksize = self.chunksize
            for dataframe in pandas.read_sql(
                "SELECT * FROM {name}".format(name=self.quote(table)),
                self.connection,
                chunksize=chunksize,
                parse_dates=self.datetime_columns
            ):
                if encoded:
                    dataframe = Observations(x=self.encode_x(dataframe), y=self.encode_y(dataframe))
                yield dataframe
    else:
        raise ValueError('orient "%s" not in [row, column]' % orient)
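# A hedged sketch of streaming encoded rows with generator(): `pipeline`, the
# table name 'training', and the `consume` callback are illustrative
# assumptions. Each yielded chunk is an Observations(x=..., y=...) when
# encoded=True.
#
#     for chunk in pipeline.generator('training', orient='row', encoded=True,
#                                     chunksize=1024):
#         consume(chunk.x, chunk.y)  # e.g. incremental training or scoring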