Example #1
class ModelTrainer(Component):
    task_builder = inject(TaskBuilder)
    args = inject(ConsoleArguments)

    def fit_tasks(self, all_data=False) -> [Task]:
        config = self.args.get()

        def find_best_epoch(ts) -> int:
            log.info('Finding best epoch')
            while True:
                task = random.choice(ts)
                epochs_without_improvement = task.epochs_without_improvement
                patience_exceeded = epochs_without_improvement > config.patience
                if task.target and patience_exceeded:
                    # Each pass through the loop trains one randomly chosen
                    # task for a single epoch, so the per-task best epoch
                    # (epoch count minus the patience overshoot) times the
                    # number of tasks gives the total number of iterations
                    # needed to reproduce it.
                    total_epochs = (task.epoch - config.patience) * len(ts)
                    log.info(
                        'Patience exceeded on task %s. Found best epoch: %i',
                        task.name, total_epochs)
                    return total_epochs
                task.fit_early_stopping()

        def fit_on_all_data(epochs, ts):
            log.info('Fitting on all data')
            for epoch in range(epochs):
                log.info('Epoch: %i of %i', epoch + 1, epochs)
                task = random.choice(ts)
                task.fit()

        tasks = self.task_builder.build(all_data)
        best_epoch = find_best_epoch(tasks)
        # Rebuild fresh tasks (and models) so the final fit starts from
        # scratch with the epoch budget found above.
        tasks = self.task_builder.build(all_data)
        fit_on_all_data(best_epoch, tasks)
        return tasks
Example #2
class Evaluator(Component):
    args = inject(ConsoleArguments)
    cross_validation_split = inject(CrossValidationSplit)

    def evaluate(self, tasks: [Task]):
        log.info('Saving validation metrics')
        config = self.args.get()
        data = pandas.DataFrame([config.hyper_parameters])
        for task in tasks:
            data[task.name + '_train_samples'] = len(
                task.filter_documents(
                    self.cross_validation_split.train_documents))
            data[task.name + '_test_samples'] = len(
                task.filter_documents(
                    self.cross_validation_split.test_documents))
            label = '{}_{}_{}'
            test_label = label.format(task.name, 'test', task.scoring_function)
            train_label = label.format(task.name, 'train',
                                       task.scoring_function)
            data[train_label] = task.train_score()
            data[test_label] = task.test_score()
        data['test_language'] = (
            self.cross_validation_split.test_documents[0].language)
        out_dir = config.output_dir
        output_file = config.output_file
        os.makedirs(out_dir, exist_ok=True)
        output_path = os.path.join(out_dir, output_file)
        exists = os.path.isfile(output_path)
        data.to_csv(output_path, index=False, mode='a', header=not exists)
Example #3
class ToDoApplication:
    database = inject(Database)
    log = inject(Log)
    reader = inject(ItemReader)
    writer = inject(ItemWriter)

    def run(self):
        self.report_items()
        self.read_items()

    def read_items(self):
        self.log.info('Reading items...')
        while self.reader.more_items():
            try:
                item = self.read_item()
                self.database.save_item(item)
            except Exception:
                self.log.error('Could not read item')

    def read_item(self):
        item = self.reader.next_item()
        self.log.info('Got item: {}'.format(item))
        return item

    def report_items(self):
        self.log.info('Reporting Items...')
        for item in self.database.get_items():
            self.writer.write_item(item)
Example #4
class CrossValidationSplit(Singleton):
    reader = inject(DataSetReader)
    args = inject(ConsoleArguments)

    @property
    def train_documents(self) -> [HTMLDocument]:
        if self._train_indices is None:
            raise ValueError('No current split')
        doc_series = Series(self.documents)
        train_docs = doc_series[self._train_indices]
        return list(train_docs)

    @property
    def test_documents(self) -> [HTMLDocument]:
        if self._test_indices is None:
            raise ValueError('No current split')
        doc_series = Series(self.documents)
        test_docs = doc_series[self._test_indices]
        return list(test_docs)

    @abstractmethod
    def split(self) -> [Tuple[ndarray]]:
        raise NotImplementedError()

    def __len__(self):
        # Note: shrinks as folds are consumed, since __next__ pops
        # from _splits.
        return len(self._splits)

    @property
    def documents(self) -> [HTMLDocument]:
        if self._documents is None:
            self._documents = self.reader.read_documents()
        return self._documents

    def __next__(self):
        try:
            self._train_indices, self._test_indices = self._splits.pop()
            if self.skip():
                return next(self)
            return self._train_indices, self._test_indices
        except IndexError:
            raise StopIteration()

    def __iter__(self):
        return self

    def __init__(self):
        self._documents: [HTMLDocument] = None
        self._train_indices: ndarray = None
        self._test_indices: ndarray = None
        self._splits = self.split()

    def skip(self):
        test_doc = self.test_documents[0]
        return test_doc.language in self.args.get().skip
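A hedged usage sketch of the iteration protocol above: each iteration pops one (train, test) index pair and refreshes the document properties, skipping folds whose test language appears in config.skip. The surrounding Environment and the concrete split subclass are assumptions.

# Hypothetical sketch, assuming an active Environment that provides a
# concrete CrossValidationSplit subclass (cf. Example #16 for inject()).
split = inject(CrossValidationSplit)
for train_indices, test_indices in split:
    print(len(split.train_documents), len(split.test_documents))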
Example #5
class Experiment:
    args = inject(ConsoleArguments)
    cross_validation_split = inject(CrossValidationSplit)
    model_trainer = inject(ModelTrainer)
    evaluator = inject(Evaluator)

    def run(self):
        if not self.args.get().skip_validation:
            # Capture the fold count up front: __len__ shrinks as folds
            # are consumed (see Example #4).
            total_folds = len(self.cross_validation_split)
            for i, _ in enumerate(self.cross_validation_split):
                log.info('Starting fold %i of %i', i + 1, total_folds)
                tasks = self.model_trainer.fit_tasks()
                self.evaluator.evaluate(tasks)
        log.info('Fitting on all data')
        tasks = self.model_trainer.fit_tasks(all_data=True)
        for task in tasks:
            task.save()
Example #6
class SimpleLog(Log):
    arguments = inject(Arguments)

    def info(self, message: str):
        if self.arguments.log_level > 1:
            print('INFO:', message)

    def error(self, message: str):
        if self.arguments.log_level > 0:
            print('ERROR:', message)
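Since SimpleLog implements Log, it can presumably be bound in an Environment so that injected Log attributes resolve to it. A minimal sketch, assuming Environment also accepts Component subclasses positionally (only the keyword form Environment(key=...) is shown in Example #8):

# Hypothetical wiring sketch: run the ToDoApplication from Example #3
# with SimpleLog bound as the Log implementation. The positional
# Environment(SimpleLog) form is an assumption.
with Environment(SimpleLog):
    ToDoApplication().run()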
Example #7
class TaskBuilder(Component):
    args = inject(ConsoleArguments)
    shared_layers_builder = inject(SharedLayersBuilder)

    def build(self, all_data=False) -> [Task]:
        shared_layers = self.shared_layers_builder.build()

        def task_factory(target: str) -> Task:
            # Map targets to classes so only the requested task is
            # instantiated, instead of eagerly constructing all five.
            task_classes = {
                'price': PriceTask,
                'vendor': VendorTask,
                'brand': BrandTask,
                'language': LanguageTask,
                'ean': EANTask
            }
            return task_classes[target](shared_layers, all_data=all_data)
        config = self.args.get()
        log.info('Recompiling models')

        return [task_factory(target) for target in config.targets]
Example #8
    def test_mock_replaces_named_value(self):
        class Dependency:
            def method(self):
                pass

        with Environment(key=Dependency()):
            mock_dependency = mock('key')
            mock_dependency.method.return_value = 'value'
            with self.assertRaises(AttributeError):
                mock_dependency.no_such_method()
            injected = inject('key')
            self.assertEqual(injected, mock_dependency)
            self.assertEqual(mock_dependency.method(), 'value')
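A hedged counterpart for component types rather than named values; mock(SomeComponent) is an assumption mirroring mock('key') above and inject(SomeComponent) from Example #16:

# Hypothetical sketch, assuming mock() also accepts component types.
with Environment():
    mock_component = mock(SomeComponent)
    mock_component.method.return_value = 'value'
    assert inject(SomeComponent) is mock_component
    assert mock_component.method() == 'value'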
Example #9
class SharedLayersBuilder(Component):
    vocabulary = inject(Vocabulary)
    args = inject(ConsoleArguments)

    def build(self) -> SharedLayers:
        log.info('Recompiling shared layers')
        embedding_layer = self.get_embedding_layer()
        convolutions = self.get_convolutional_layers()
        return SharedLayers(embedding_layer=embedding_layer,
                            convolutions=convolutions)

    def get_embedding_layer(self) -> Layer:
        vocab = self.vocabulary
        config = self.args.get()
        embedding = vocab.embedding
        _, embedding_dim = embedding.shape
        log.info(
            'Building embedding layer with vocab size: %i '
            'and embedding dimension: %i', vocab.size, embedding_dim)
        return Embedding(
            input_dim=vocab.size,
            output_dim=embedding_dim,
            trainable=False,
            mask_zero=False,
            name='embedding_layer',
            embeddings_initializer=RandomUniform(seed=config.seed),
            weights=[embedding])

    def get_convolutional_layers(self) -> Tuple[Layer]:
        config = self.args.get()
        log.info('Building CNN layers with %i filters each', config.filters)
        layers = ()
        for filter_size in config.filter_sizes:
            layer = Conv1D(filters=config.filters,
                           kernel_size=filter_size,
                           kernel_initializer=glorot_uniform(config.seed),
                           activation='relu')
            layers += (layer, )
        return layers
Example #10
class Dependent:
    some_singleton = inject(SomeSingleton)
    some_component = inject(SomeComponent)
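A minimal usage sketch for this pattern, based on the behaviour shown in Example #16; the singleton-identity assertion is an assumption about what Singleton guarantees:

# Hypothetical sketch: inside an Environment, class-level inject(...)
# attributes resolve to instances of the requested types.
with Environment():
    dependent = Dependent()
    assert isinstance(dependent.some_component, SomeComponent)
    # Assumed: Singleton components resolve to one shared instance.
    assert dependent.some_singleton is Dependent().some_singleton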
Example #11
class Dependent:
    a = inject(AbstractA)
Example #12
class B(AbstractB):
    a = inject(AbstractA)

    def __init__(self):
        # Touch the injected attribute so the dependency is resolved
        # at construction time.
        self.a
Example #13
class A(AbstractA):
    b = inject(AbstractB)

    def __init__(self):
        # Touch the injected attribute so the dependency is resolved
        # at construction time.
        self.b
Example #14
class Chain(Component):
    some_component = inject(SomeComponent)
Example #15
def test_decorate_nonclass_or_nonfunction():
    result = inject(1)
    assert result == 1
Example #16
    def test_inside_environment(self):
        with Environment():
            self.assertIsInstance(inject(SomeComponent), SomeComponent)
Example #17
class DataSetReader(Component):
    database = inject(DocumentDatabase)

    def read_documents(self) -> [HTMLDocument]:
        documents = self.database.load_documents()
        return [d for d in documents if len(d.windows) > 0]
Example #18
    def test_inject_non_component_fails(self):
        class Test:
            pass

        with self.assertRaises(InvalidDependency):
            inject(Test)
Example #19
class Dependent:
    some_component = inject(SomeComponent)
    abstract_component = inject(AbstractComponent)
    chain = inject(Chain)
Example #20
def test_decorate_class_with_no_annotations():
    class NoAnnotations:
        pass

    decorated = inject(NoAnnotations)
    assert decorated is NoAnnotations
Example #21
class DocumentDatabase(Component):
    args = inject(ConsoleArguments)

    class Document(_db.Entity):
        id = PrimaryKey(int, auto=True)
        html = Optional(str)
        brand = Optional(str)
        gtin13 = Optional(str)
        ean = Optional(str)
        asin = Optional(str)
        sku = Optional(str)
        price = Optional(float)
        currency = Optional(str)
        vendor = Optional(str)
        language = Optional(str)
        tokens = Optional(bytes)
        brand_bio_labels = Optional(bytes)
        ean_bio_labels = Optional(bytes)
        windows_5 = Optional(bytes)
        windows_11 = Optional(bytes)
        windows_21 = Optional(bytes)

    @db_session
    def save_documents(self, documents: [HTMLDocument], overwrite=True):
        if overwrite:
            self.Document.select().delete()
        for doc in documents:
            db_doc = self.Document(
                html=doc.html,
                brand=doc.brand,
                vendor=doc.vendor,
                language=doc.language,
                tokens=pickle.dumps(doc.tokens),
                brand_bio_labels=pickle.dumps(doc.brand_bio_labels),
                ean_bio_labels=pickle.dumps(doc.ean_bio_labels),
                windows_5=pickle.dumps(doc.windows_5),
                windows_11=pickle.dumps(doc.windows_11),
                windows_21=pickle.dumps(doc.windows_21))
            if doc.gtin13 is not None:
                db_doc.gtin13 = str(doc.gtin13)
            if doc.ean is not None:
                db_doc.ean = str(doc.ean)
            if doc.asin is not None:
                db_doc.asin = str(doc.asin)
            if doc.sku is not None:
                db_doc.sku = str(doc.sku)
            if doc.price is not None:
                try:
                    db_doc.price = float(doc.price.replace(',', '.'))
                except (ValueError, AttributeError):
                    # Skip prices that cannot be parsed as floats.
                    pass
            if doc.currency is not None:
                db_doc.currency = doc.currency
            _db.commit()

    @db_session
    def load_documents(self) -> [HTMLDocument]:
        docs = []
        n_samples = self.args.get().n_samples
        n_samples = n_samples if n_samples else self.Document.select().count()
        for db_doc in self.Document.select().limit(n_samples):
            doc = HTMLDocument(
                html=db_doc.html,
                brand=db_doc.brand,
                gtin13=db_doc.gtin13,
                ean=db_doc.ean,
                asin=db_doc.asin,
                sku=db_doc.sku,
                price=db_doc.price,
                currency=db_doc.currency,
                vendor=db_doc.vendor,
                language=db_doc.language,
                tokens=pickle.loads(db_doc.tokens),
                brand_bio_labels=pickle.loads(db_doc.brand_bio_labels),
                ean_bio_labels=pickle.loads(db_doc.ean_bio_labels)
                if db_doc.ean_bio_labels else None,
                windows_5=pickle.loads(db_doc.windows_5),
                windows_11=pickle.loads(db_doc.windows_11),
                windows_21=pickle.loads(db_doc.windows_21))
            docs.append(doc)
        return docs

    def __init__(self):
        path = self.args.get().database_path
        _db.bind(provider='sqlite', filename=path, create_db=False)
        _db.generate_mapping(create_tables=True)
Example #22
def test_decorate_class_with_no_dependency_annotations():
    class NoDependencyAnnotations:
        a: int

    decorated = inject(NoDependencyAnnotations)
    assert decorated is NoDependencyAnnotations
Example #23
class Vocabulary(Singleton):
    cross_validation_split = inject(CrossValidationSplit)
    args = inject(ConsoleArguments)

    @property
    def embedding(self) -> ndarray:
        if self._embedding is None:
            self._embedding = self._read_embedding()
        return self._embedding

    def __init__(self):
        self._embedding = None
        with open(self.args.get().indices_path, 'rb') as f:
            self._indices = pickle.load(f)

    def _read_embedding(self):
        config = self.args.get()
        with open(config.embedding_path, 'rb') as embedding_file:
            fasttext_vectors = pickle.load(embedding_file)
        embeddings = random.uniform(low=.01, high=.1, size=(self.size, 300))
        indices = self.indices
        for word, index in indices.items():
            embeddings[index] = fasttext_vectors[word]
        embeddings[self.out_of_vocab_index] = random.normal(0, .01, 300)
        embeddings[self.padding_index] = zeros(300)
        return embeddings

    @property
    def indices(self) -> Dict[str, int]:
        return self._indices

    @property
    def unique_tokens(self) -> Set[Token]:
        return set(self.indices.keys())

    @property
    def languages(self):
        return {
            doc.language
            for doc in self.cross_validation_split.documents if doc.language
        }

    def __len__(self):
        return self.size

    def make_batch(self, batch: [HTMLDocument], labeller: Labeller):
        maxlen = preprocess.maxlen(self.cross_validation_split.documents)
        indices = self.indices
        out_of_vocab_index = self.out_of_vocab_index

        def index(token: Token) -> int:
            return indices.get(token, out_of_vocab_index)

        def tokens2indices(doc: HTMLDocument):
            return [index(token) for token in doc.tokens]

        sequences = [tokens2indices(doc) for doc in batch]
        padded_sequences = immutable_array(
            pad_sequences(sequences, maxlen=maxlen, value=self.padding_index))
        labels = labeller(batch)
        return padded_sequences, labels

    @property
    def padding_index(self) -> int:
        return 0

    @property
    def out_of_vocab_index(self) -> int:
        return len(self.indices) + 1

    @property
    def size(self) -> int:
        # + 2 because of padding and out of vocab tokens
        return len(self.unique_tokens) + 2
Example #24
class SequenceClassificationTask(Task):
    config = inject(ConsoleArguments)

    def stack(self, documents):
        windows = [d.windows for d in documents]
        labels = [self.label(d) for d in documents]
        windows = numpy.vstack(windows)
        labels = numpy.concatenate(labels)
        return windows, labels

    @property
    @abstractmethod
    def name(self) -> str:
        pass

    @abstractmethod
    def label(self, document):
        pass

    @property
    def scoring_function(self) -> str:
        return 'sigmoid'

    @abstractmethod
    def filter_documents(self, documents: [HTMLDocument]) -> [HTMLDocument]:
        pass

    def compile_model(self) -> Model:
        window_size = self.config.get().window_size
        input_layer = Input(shape=(window_size, ))
        shared_tensor = self._shared_layers(input_layer)
        output = Dense(1, activation='sigmoid')(shared_tensor)
        model = Model(inputs=input_layer, outputs=output)
        model.compile(optimizer='adam', loss='binary_crossentropy')
        return model

    def encode_labels(self, documents: [HTMLDocument]) -> ndarray:
        return numpy.array([])

    def fit(self):
        windows, labels = self.stack(self._train_set +
                                     self._early_stopping_set)
        # Keras trains from initial_epoch up to epochs, so this fits
        # exactly one epoch per call.
        self._model.fit(windows,
                        labels,
                        epochs=self.epoch + 1,
                        batch_size=16,
                        initial_epoch=self.epoch)
        self.epoch += 1

    def train_score(self):
        windows, labels = self.stack(self._train_set)
        return self._score(windows, labels)

    def early_stopping_score(self):
        windows, labels = self.stack(self._early_stopping_set)
        return self._score(windows, labels)

    def _score(self, windows, labels) -> float:
        return self._model.evaluate(windows, labels)

    def test_score(self):
        windows, labels = self.stack(
            self.cross_validation_split.test_documents)
        return self._score(windows, labels)

    def fit_early_stopping(self):
        windows, labels = self.stack(self._train_set)
        self._model.fit(windows,
                        labels,
                        epochs=self.epoch + 1,
                        batch_size=16,
                        initial_epoch=self.epoch)
        self.update_epoch()
Example #25
    def test_inject_string(self):
        with Environment(key='value'):
            self.assertEqual(inject('key'), 'value')
Example #26
class Dependent:
    some_component = inject(SomeComponent)
Example #27
class Dependent:
    some_component = inject(SomeComponent)
    some_callable_component = inject(SomeCallableComponent)