class ModelTrainer(Component):
    """Trains the multi-task models: first finds the best epoch count via
    early stopping on randomly interleaved tasks, then refits fresh tasks
    for that many epochs."""

    # Injected collaborators (resolved by the DI container).
    task_builder = inject(TaskBuilder)
    args = inject(ConsoleArguments)

    def fit_tasks(self, all_data=False) -> [Task]:
        """Build tasks, determine the best total epoch count with early
        stopping, then rebuild the tasks and fit them for that many epochs.

        :param all_data: forwarded to TaskBuilder.build; presumably makes
            tasks train on the full data set — TODO confirm against Task.
        :return: the list of fitted Task objects.
        """
        config = self.args.get()

        def find_best_epoch(ts) -> int:
            # Repeatedly fits a randomly chosen task one early-stopping step
            # at a time until some target task's patience is exceeded.
            log.info('Finding best epoch')
            while True:
                task = random.choice(ts)
                epochs_without_improvement = task.epochs_without_improvement
                patience_exceeded = epochs_without_improvement > config.patience
                if task.target and patience_exceeded:
                    # Subtract the patience window (epochs that brought no
                    # improvement) and scale by the number of tasks, since
                    # each loop iteration advances only one random task.
                    total_epochs = (task.epoch - config.patience) * len(ts)
                    log.info(
                        'Patience exceeded on task %s. Found best epoch: %i',
                        task.name, total_epochs)
                    return total_epochs
                task.fit_early_stopping()

        def fit_on_all_data(epochs, ts):
            # Final fit: one randomly chosen task per epoch, mirroring the
            # random interleaving used during the early-stopping search.
            log.info('Fitting on all data')
            for epoch in range(epochs):
                log.info('Epoch: %i of %i', epoch + 1, epochs)
                task = random.choice(ts)
                task.fit()

        tasks = self.task_builder.build(all_data)
        best_epoch = find_best_epoch(tasks)
        # Rebuild to get fresh (uncompiled-state) tasks for the final fit.
        tasks = self.task_builder.build(all_data)
        fit_on_all_data(best_epoch, tasks)
        return tasks
class Evaluator(Component):
    """Collects per-task train/test metrics into a single-row DataFrame and
    appends it to the configured CSV file."""

    args = inject(ConsoleArguments)
    cross_validation_split = inject(CrossValidationSplit)

    def evaluate(self, tasks: [Task]):
        """Score every task on the current split and append one CSV row."""
        log.info('Saving validation metrics')
        config = self.args.get()
        # One row seeded with the hyper-parameters of this run.
        row = pandas.DataFrame([config.hyper_parameters])
        split = self.cross_validation_split
        for task in tasks:
            train_docs = task.filter_documents(split.train_documents)
            test_docs = task.filter_documents(split.test_documents)
            row[task.name + '_train_samples'] = len(train_docs)
            row[task.name + '_test_samples'] = len(test_docs)
            label = '{}_{}_{}'
            train_label = label.format(task.name, 'train',
                                       task.scoring_function)
            test_label = label.format(task.name, 'test',
                                      task.scoring_function)
            row[train_label] = task.train_score()
            row[test_label] = task.test_score()
        # All test documents of one fold share a language; record the first.
        row['test_language'] = split.test_documents[0].language
        target_dir = config.output_dir
        os.makedirs(target_dir, exist_ok=True)
        target_path = os.path.join(target_dir, config.output_file)
        # Append; write the header only when the file is created fresh.
        write_header = not os.path.isfile(target_path)
        row.to_csv(target_path, index=False, mode='a', header=write_header)
class ToDoApplication:
    """Reports already-stored to-do items, then reads new items from the
    injected reader and persists them to the database."""

    database = inject(Database)
    log = inject(Log)
    reader = inject(ItemReader)
    writer = inject(ItemWriter)

    def run(self):
        """Entry point: report existing items first, then ingest new ones."""
        self.report_items()
        self.read_items()

    def read_items(self):
        """Drain the reader, saving each item; unreadable items are logged
        and skipped (best-effort ingestion)."""
        self.log.info('Reading items...')
        while self.reader.more_items():
            try:
                item = self.read_item()
                self.database.save_item(item)
            except Exception:
                # Fix: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt. Keep best-effort behavior
                # for ordinary errors only.
                self.log.error('Could not read item')

    def read_item(self):
        """Read one item from the reader and log it."""
        item = self.reader.next_item()
        # Fix: original was 'Got item: '.format(str(item)) — a format call
        # on a string with no placeholder, so the item was never logged.
        self.log.info('Got item: {}'.format(item))
        return item

    def report_items(self):
        """Write every stored item through the injected writer."""
        self.log.info('Reporting Items...')
        for item in self.database.get_items():
            self.writer.write_item(item)
class CrossValidationSplit(Singleton):
    """Iterable over cross-validation folds. Subclasses supply the actual
    index splits via the abstract split(); iterating pops one
    (train_indices, test_indices) pair per fold, skipping folds whose test
    language is in the configured skip list."""

    reader = inject(DataSetReader)
    args = inject(ConsoleArguments)

    @property
    def train_documents(self) -> [HTMLDocument]:
        """Documents selected by the current fold's train indices.

        :raises ValueError: if no fold is active yet (iterate first).
        """
        if self._train_indices is None:
            raise ValueError('No current split')
        # Series allows fancy indexing with an ndarray of positions.
        doc_series = Series(self.documents)
        train_docs = doc_series[self._train_indices]
        return list(train_docs)

    @property
    def test_documents(self) -> [HTMLDocument]:
        """Documents selected by the current fold's test indices.

        :raises ValueError: if no fold is active yet (iterate first).
        """
        if self._test_indices is None:
            raise ValueError('No current split')
        doc_series = Series(self.documents)
        test_docs = doc_series[self._test_indices]
        return list(test_docs)

    @abstractmethod
    def split(self) -> [Tuple[ndarray]]:
        """Return the list of (train_indices, test_indices) pairs."""
        raise NotImplementedError()

    def __len__(self):
        # NOTE: __next__ pops from _splits, so the length shrinks as the
        # iteration progresses.
        return len(self._splits)

    @property
    def documents(self) -> [HTMLDocument]:
        """All documents, lazily loaded once from the reader."""
        if self._documents is None:
            self._documents = self.reader.read_documents()
        return self._documents

    def __next__(self):
        try:
            # Consume one fold; recursion advances past skipped folds.
            self._train_indices, self._test_indices = self._splits.pop()
            if self.skip():
                return next(self)
            return self._train_indices, self._test_indices
        except IndexError:
            # _splits exhausted -> end of iteration.
            raise StopIteration()

    def __iter__(self):
        return self

    def __init__(self):
        self._documents: [HTMLDocument] = None
        self._train_indices: ndarray = None
        self._test_indices: ndarray = None
        # Materialize all folds up front; __next__ pops them one by one.
        self._splits = self.split()

    def skip(self):
        """True when the current fold's test language is configured to be
        skipped (checks the first test document's language only)."""
        test_doc = self.test_documents[0]
        return test_doc.language in self.args.get().skip
class Experiment:
    """Top-level experiment driver: cross-validated evaluation followed by a
    final fit on all data, saving the resulting tasks."""

    args = inject(ConsoleArguments)
    cross_validation_split = inject(CrossValidationSplit)
    model_trainer = inject(ModelTrainer)
    evaluator = inject(Evaluator)

    def run(self):
        """Run validation folds (unless --skip_validation), then fit on all
        data and persist each task."""
        if not self.args.get().skip_validation:
            for i, _ in enumerate(self.cross_validation_split):
                # NOTE(review): len() shrinks as folds are popped during
                # iteration, so `len(...) + 2` does not obviously equal the
                # total fold count — looks off by a constant; confirm intent.
                log.info('starting fold %i of %i', i + 1,
                         len(self.cross_validation_split) + 2)
                tasks = self.model_trainer.fit_tasks()
                self.evaluator.evaluate(tasks)
        log.info('Fitting on all data')
        tasks = self.model_trainer.fit_tasks(all_data=True)
        for task in tasks:
            task.save()
class SimpleLog(Log):
    """Console logger whose verbosity is driven by the injected arguments:
    level > 0 enables errors, level > 1 additionally enables info."""

    arguments = inject(Arguments)

    def info(self, message: str):
        """Print an informational line when the log level exceeds 1."""
        verbose = self.arguments.log_level > 1
        if verbose:
            print('INFO:', message)

    def error(self, message: str):
        """Print an error line unless logging is fully silenced (level 0)."""
        errors_enabled = self.arguments.log_level > 0
        if errors_enabled:
            print('ERROR:', message)
class TaskBuilder(Component):
    """Builds one freshly compiled Task per target configured on the
    command line, all sharing the same shared layers."""

    args = inject(ConsoleArguments)
    shared_layers_builder = inject(SharedLayersBuilder)

    def build(self, all_data=False) -> [Task]:
        """Return the configured tasks.

        Fix: the original factory used a dict literal of *instances*, so
        every call instantiated all five tasks (each of which builds a
        model) just to pick one. Map target names to classes instead and
        instantiate only the requested task.

        :param all_data: forwarded to each Task constructor.
        :raises KeyError: for an unknown target name (same as before).
        """
        shared_layers = self.shared_layers_builder.build()
        task_classes = {
            'price': PriceTask,
            'vendor': VendorTask,
            'brand': BrandTask,
            'language': LanguageTask,
            'ean': EANTask,
        }
        config = self.args.get()
        log.info('recompiling models')
        return [
            task_classes[target](shared_layers, all_data=all_data)
            for target in config.targets
        ]
def test_mock_replaces_named_value(self):
    """A mock registered under a key replaces the named dependency, and
    inject() resolves the key to that same mock."""
    class Dependency:
        def method(self):
            pass

    with Environment(key=Dependency()):
        dep_mock = mock('key')
        dep_mock.method.return_value = 'value'
        # Auto-specced mock: attributes absent from Dependency must fail.
        with self.assertRaises(AttributeError):
            dep_mock.no_such_method()
        resolved = inject('key')
        self.assertEqual(resolved, dep_mock)
        self.assertEqual(dep_mock.method(), 'value')
class SharedLayersBuilder(Component):
    """Assembles the layers shared across all tasks: a frozen embedding
    layer plus one Conv1D layer per configured filter size."""

    vocabulary = inject(Vocabulary)
    args = inject(ConsoleArguments)

    def build(self) -> SharedLayers:
        """Create a fresh SharedLayers bundle."""
        log.info('Recompiling shared layers')
        return SharedLayers(embedding_layer=self.get_embedding_layer(),
                            convolutions=self.get_convolutional_layers())

    def get_embedding_layer(self) -> Layer:
        """Build a non-trainable embedding layer initialized with the
        vocabulary's pre-computed embedding matrix."""
        config = self.args.get()
        vocab = self.vocabulary
        weights = vocab.embedding
        embedding_dim = weights.shape[1]
        log.info(
            'Building embedding layer with vocab size: %i '
            'and embedding dimension: %i', vocab.size, embedding_dim)
        return Embedding(
            input_dim=vocab.size,
            output_dim=embedding_dim,
            trainable=False,
            mask_zero=False,
            name='embedding_layer',
            embeddings_initializer=RandomUniform(seed=config.seed),
            weights=[weights])

    def get_convolutional_layers(self) -> Tuple[Layer]:
        """Build one relu Conv1D per configured filter size."""
        config = self.args.get()
        log.info('Building CNN layers with %i filters each', config.filters)
        return tuple(
            Conv1D(filters=config.filters,
                   kernel_size=width,
                   kernel_initializer=glorot_uniform(config.seed),
                   activation='relu')
            for width in config.filter_sizes)
class Dependent:
    # Test fixture: declares dependencies on both a Singleton and a plain
    # Component to exercise injection of both kinds.
    some_singleton = inject(SomeSingleton)
    some_component = inject(SomeComponent)
class Dependent:
    # Test fixture: depends on an abstract type; the container is expected
    # to resolve it to a concrete implementation.
    a = inject(AbstractA)
class B(AbstractB):
    a = inject(AbstractA)

    def __init__(self):
        # Bare attribute access looks like a no-op, but presumably forces
        # the injected dependency to resolve at construction time —
        # exercising the circular dependency with class A. TODO confirm
        # against the injector's descriptor semantics.
        self.a
class A(AbstractA):
    b = inject(AbstractB)

    def __init__(self):
        # Bare attribute access presumably triggers resolution of the
        # injected dependency at construction time — the counterpart to
        # class B, forming a circular dependency. TODO confirm.
        self.b
class Chain(Component):
    # Test fixture: a Component that itself depends on another Component,
    # forming a dependency chain.
    some_component = inject(SomeComponent)
def test_decorate_nonclass_or_nonfunction():
    """inject() passes through values that are neither classes nor
    functions unchanged."""
    value = inject(1)
    assert value == 1
def test_inside_environment(self):
    """Within an active Environment, inject() resolves a Component class
    to an instance of that class."""
    with Environment():
        instance = inject(SomeComponent)
        self.assertIsInstance(instance, SomeComponent)
class DataSetReader(Component):
    """Loads documents from the database, discarding any without windows."""

    database = inject(DocumentDatabase)

    def read_documents(self) -> [HTMLDocument]:
        """Return all stored documents that have at least one window."""
        loaded = self.database.load_documents()
        return [doc for doc in loaded if len(doc.windows) > 0]
def test_inject_non_component_fails(self):
    """inject() must reject classes that are not Components."""
    class Test:
        pass

    with self.assertRaises(InvalidDependency):
        inject(Test)
class Dependent:
    # Test fixture: mixes a concrete Component, an abstract dependency and
    # a chained Component in one dependent class.
    some_component = inject(SomeComponent)
    abstract_component = inject(AbstractComponent)
    chain = inject(Chain)
def test_decorate_class_with_no_annotations():
    """A class with no annotations at all is returned unchanged."""
    class NoAnnotations:
        pass

    assert inject(NoAnnotations) is NoAnnotations
class DocumentDatabase(Component):
    """Pony-ORM/sqlite persistence for HTMLDocuments. Binary columns hold
    pickled Python objects (tokens, BIO labels, window arrays)."""

    args = inject(ConsoleArguments)

    class Document(_db.Entity):
        # ORM row schema mirroring HTMLDocument's fields.
        id = PrimaryKey(int, auto=True)
        html = Optional(str)
        brand = Optional(str)
        gtin13 = Optional(str)
        ean = Optional(str)
        asin = Optional(str)
        sku = Optional(str)
        price = Optional(float)
        currency = Optional(str)
        vendor = Optional(str)
        language = Optional(str)
        tokens = Optional(bytes)
        brand_bio_labels = Optional(bytes)
        ean_bio_labels = Optional(bytes)
        windows_5 = Optional(bytes)
        windows_11 = Optional(bytes)
        windows_21 = Optional(bytes)

    @db_session
    def save_documents(self, documents: [HTMLDocument], overwrite=True):
        """Persist documents; wipes all existing rows first when
        overwrite is True. Optional string fields are only set when
        present on the document."""
        if overwrite:
            self.Document.select().delete()
        for doc in documents:
            db_doc = self.Document(
                html=doc.html,
                brand=doc.brand,
                vendor=doc.vendor,
                language=doc.language,
                tokens=pickle.dumps(doc.tokens),
                brand_bio_labels=pickle.dumps(doc.brand_bio_labels),
                ean_bio_labels=pickle.dumps(doc.ean_bio_labels),
                windows_5=pickle.dumps(doc.windows_5),
                windows_11=pickle.dumps(doc.windows_11),
                windows_21=pickle.dumps(doc.windows_21))
            if doc.gtin13 is not None:
                db_doc.gtin13 = str(doc.gtin13)
            if doc.ean is not None:
                db_doc.ean = str(doc.ean)
            if doc.asin is not None:
                db_doc.asin = str(doc.asin)
            if doc.sku is not None:
                db_doc.sku = str(doc.sku)
            if doc.price is not None:
                try:
                    # Prices arrive as strings with ',' decimal separators.
                    db_doc.price = float(doc.price.replace(',', '.'))
                except (AttributeError, ValueError):
                    # Fix: was a bare `except:`. Unparseable prices are
                    # still skipped; unrelated errors now propagate.
                    pass
            if doc.currency is not None:
                db_doc.currency = doc.currency
        _db.commit()

    @db_session
    def load_documents(self) -> [HTMLDocument]:
        """Load up to n_samples documents (all rows when n_samples is
        falsy), unpickling the binary columns."""
        docs = []
        n_samples = self.args.get().n_samples
        n_samples = n_samples if n_samples else self.Document.select().count()
        for db_doc in self.Document.select().limit(n_samples):
            doc = HTMLDocument(
                html=db_doc.html,
                brand=db_doc.brand,
                gtin13=db_doc.gtin13,
                ean=db_doc.ean,
                asin=db_doc.asin,
                # Fix: was sku=db_doc.asin — copied the wrong column.
                sku=db_doc.sku,
                price=db_doc.price,
                currency=db_doc.currency,
                vendor=db_doc.vendor,
                language=db_doc.language,
                tokens=pickle.loads(db_doc.tokens),
                brand_bio_labels=pickle.loads(db_doc.brand_bio_labels),
                ean_bio_labels=pickle.loads(db_doc.ean_bio_labels)
                if db_doc.ean_bio_labels else None,
                windows_5=pickle.loads(db_doc.windows_5),
                windows_11=pickle.loads(db_doc.windows_11),
                windows_21=pickle.loads(db_doc.windows_21))
            docs.append(doc)
        return docs

    def __init__(self):
        # Bind and map at construction; the sqlite file must already exist
        # (create_db=False), though missing tables are created.
        path = self.args.get().database_path
        _db.bind(provider='sqlite', filename=path, create_db=False)
        _db.generate_mapping(create_tables=True)
def test_decorate_class_with_no_dependency_annotations():
    """Plain (non-dependency) annotations leave the class untouched."""
    class NoDependencyAnnotations:
        a: int

    assert inject(NoDependencyAnnotations) is NoDependencyAnnotations
class Vocabulary(Singleton):
    """Token-to-index vocabulary with a lazily built embedding matrix.
    Index layout: 0 is padding, 1..len(indices) are known tokens (per the
    pickled indices file), len(indices)+1 is out-of-vocabulary."""

    cross_validation_split = inject(CrossValidationSplit)
    args = inject(ConsoleArguments)

    @property
    def embedding(self) -> ndarray:
        """The (size, 300) embedding matrix, built once on first access."""
        if self._embedding is None:
            self._embedding = self._read_embedding()
        return self._embedding

    def __init__(self):
        self._embedding = None
        # Token -> index mapping, precomputed and pickled to disk.
        with open(self.args.get().indices_path, 'rb') as f:
            self._indices = pickle.load(f)

    def _read_embedding(self):
        """Assemble the embedding matrix from pickled fasttext vectors.

        Rows without a fasttext vector keep small random-uniform values;
        the OOV row is small gaussian noise and the padding row is zeros.
        """
        config = self.args.get()
        with open(config.embedding_path, 'rb') as embedding_file:
            fasttext_vectors = pickle.load(embedding_file)
        embeddings = random.uniform(low=.01, high=.1, size=(self.size, 300))
        indices = self.indices
        for word, index in indices.items():
            embeddings[index] = fasttext_vectors[word]
        embeddings[self.out_of_vocab_index] = random.normal(0, .01, 300)
        embeddings[self.padding_index] = zeros(300)
        return embeddings

    @property
    def indices(self) -> Dict[str, int]:
        """Token -> row-index mapping loaded in __init__."""
        return self._indices

    @property
    def unique_tokens(self) -> Set[Token]:
        return set(self.indices.keys())

    @property
    def languages(self):
        # Distinct non-empty languages across all documents.
        return {
            doc.language
            for doc in self.cross_validation_split.documents
            if doc.language
        }

    def __len__(self):
        return self.size

    def make_batch(self, batch: [HTMLDocument], labeller: Labeller):
        """Convert documents to padded index sequences plus labels.

        :param batch: documents to encode.
        :param labeller: callable producing labels for the batch.
        :return: (padded_sequences, labels); sequences are padded to the
            maximum document length over the whole data set, not the batch.
        """
        maxlen = preprocess.maxlen(self.cross_validation_split.documents)
        # Hoist lookups out of the per-token closure.
        indices = self.indices
        out_of_vocab_index = self.out_of_vocab_index

        def index(token: Token) -> int:
            return indices.get(token, out_of_vocab_index)

        def tokens2indices(doc: HTMLDocument):
            return [index(token) for token in doc.tokens]

        sequences = [tokens2indices(doc) for doc in batch]
        padded_sequences = immutable_array(
            pad_sequences(sequences, maxlen=maxlen, value=self.padding_index))
        labels = labeller(batch)
        return padded_sequences, labels

    @property
    def padding_index(self) -> int:
        return 0

    @property
    def out_of_vocab_index(self) -> int:
        return len(self.indices) + 1

    @property
    def size(self) -> int:
        # + 2 because of padding and out of vocab tokens
        return len(self.unique_tokens) + 2
class SequenceClassificationTask(Task):
    """Abstract base for binary window-classification tasks: stacks
    per-document windows into one matrix and trains a shared-layer model
    with a single sigmoid output."""

    config = inject(ConsoleArguments)

    def stack(self, documents):
        """Stack all documents' windows and labels into two flat arrays.

        :return: (windows, labels) as a vstacked matrix and a
            concatenated label vector.
        """
        windows = [d.windows for d in documents]
        labels = [self.label(d) for d in documents]
        windows = numpy.vstack(windows)
        labels = numpy.concatenate(labels)
        return windows, labels

    @property
    @abstractmethod
    def name(self) -> str:
        """Task name used in metric labels."""
        pass

    @abstractmethod
    def label(self, document):
        """Per-window labels for one document."""
        pass

    @property
    def scoring_function(self) -> str:
        return 'sigmoid'

    @abstractmethod
    def filter_documents(self, documents: [HTMLDocument]) -> [HTMLDocument]:
        """Subset of documents this task can train/evaluate on."""
        pass

    def compile_model(self) -> Model:
        """Build and compile the model: shared layers over a window-sized
        input, topped with a single sigmoid unit."""
        window_size = self.config.get().window_size
        input_layer = Input(shape=(window_size, ))
        shared_tensor = self._shared_layers(input_layer)
        output = Dense(1, activation='sigmoid')(shared_tensor)
        model = Model(inputs=input_layer, outputs=output)
        model.compile(optimizer='adam', loss='binary_crossentropy')
        return model

    def encode_labels(self, documents: [HTMLDocument]) -> ndarray:
        # NOTE(review): always returns an empty array — presumably labels
        # come from label()/stack() instead for this task family; confirm
        # against the Task base class contract.
        return numpy.array([])

    def fit(self):
        """Fit one epoch on train + early-stopping data (final training)."""
        windows, labels = self.stack(self._train_set + self._early_stopping_set)
        # epochs/initial_epoch bracket exactly one epoch of training.
        self._model.fit(windows,
                        labels,
                        epochs=self.epoch + 1,
                        batch_size=16,
                        initial_epoch=self.epoch)
        self.epoch += 1

    def train_score(self):
        windows, labels = self.stack(self._train_set)
        return self._score(windows, labels)

    def early_stopping_score(self):
        windows, labels = self.stack(self._early_stopping_set)
        return self._score(windows, labels)

    def _score(self, windows, labels) -> float:
        # Loss as reported by Keras evaluate().
        return self._model.evaluate(windows, labels)

    def test_score(self):
        # self.cross_validation_split is presumably provided by the Task
        # base class — not visible here; confirm.
        windows, labels = self.stack(
            self.cross_validation_split.test_documents)
        return self._score(windows, labels)

    def fit_early_stopping(self):
        """Fit one epoch on the train set only (early-stopping phase);
        epoch bookkeeping is delegated to update_epoch()."""
        windows, labels = self.stack(self._train_set)
        self._model.fit(windows,
                        labels,
                        epochs=self.epoch + 1,
                        batch_size=16,
                        initial_epoch=self.epoch)
        self.update_epoch()
def test_inject_string(self):
    """A string key registered on the Environment resolves to its value."""
    with Environment(key='value'):
        resolved = inject('key')
        self.assertEqual(resolved, 'value')
class Depenedent:
    # NOTE(review): class name is misspelled ('Depenedent'); kept as-is
    # because renaming would change the public interface.
    some_component = inject(SomeComponent)
class Depenedent:
    # Test fixture depending on both a plain and a callable Component.
    # NOTE(review): class name is misspelled ('Depenedent'); kept as-is
    # because renaming would change the public interface.
    some_component = inject(SomeComponent)
    some_callable_component = inject(SomeCallableComponent)