Exemplo n.º 1
0
def main():
    """Train the model, then reload it from the bundle and sample text.

    Steps, in order:

    1. Print the configuration and the number of files per kind.
    2. Build the vectorizer, copy its vocabulary size into the graph
       configuration and save the configuration into the bundle.
    3. Build the trainer, print the model summary and its weights.
    4. Train and evaluate; a ``KeyboardInterrupt`` only stops training,
       the remaining steps still run.
    5. Reload ``model.h5`` from the bundle and run the text-generation
       callback once.
    6. When running on bolt, copy the bundle into its ``final`` entry.
    """
    util.print_box('Parameters', sorted(config.items(), key=itemgetter(0)))

    data_dir = io_util.Directory(args.data_dir)
    # args.data_dir is replaced by the same Directory wrapper.
    args.data_dir = data_dir
    path_generator = pg.FileFlatGenerator(data_dir)
    util.print_box('Files per kind',
                   [(k, path_generator.num_files(k)) for k in ALLOWED_KINDS])

    vectorizer = get_vectorizer(path_generator)
    # The output layer and the embedding are both sized by the vocabulary.
    config.update_recursively(vocab_size=vectorizer.vocab_size)
    config.graph.dense.units = config.vocab_size
    config.graph.embedding.input_dim = config.vocab_size
    util.save_yaml_config(config, bundle.join('config.yml'))

    tr = Trainer(get_model(), model_dir=bundle, params=config.trainer)
    tr.model.summary()
    util.print_box('Weights', [(w.name, w.shape) for w in tr.model.weights])

    try:
        tr.train_and_evaluate(
            train_input_fn=lambda: get_ds('train', vectorizer, path_generator),
            valid_input_fn=lambda: get_ds('valid', vectorizer, path_generator))
    except KeyboardInterrupt:
        # Interrupting training is allowed; continue with the saved model.
        logger.warning('Training terminated.')

    # Drop the trainer before loading the saved model from disk.
    del tr

    model = keras.models.load_model(bundle.get_file_path('model.h5'))
    cb = GenerateTextCallback(vectorizer, seeds=[('beyer', "'s")])
    cb.set_model(model)
    # Invoke the callback by hand, outside of any training loop.
    cb.on_epoch_end(None, None)

    if util.is_running_on_bolt():
        shutil.copytree(src=str(bundle), dst=bundle.join('final'))
Exemplo n.º 2
0
def main():
    """Validate the data files, report their counts and prepare each kind.

    The files under ``args.data_dir`` are split 97% / 2% / 1% into
    train / valid / test. Every path of every kind in ``ALLOWED_KINDS``
    must end in ``.json``; the first offending path fails an assertion.
    """
    source_dir = io_util.Directory(args.data_dir)
    split_fractions = {'train': 0.97, 'valid': 0.02, 'test': 0.01}
    paths = pg.FileFlatGenerator(source_dir, data_split=split_fractions)

    # Walk the paths of all kinds lazily, one kind after the other.
    every_path = (
        file_path
        for kind in ALLOWED_KINDS
        for file_path in paths.get_paths(kind))
    for file_path in every_path:
        assert file_path.endswith('.json'), file_path

    file_counts = []
    for kind in ALLOWED_KINDS:
        file_counts.append((kind, paths.num_files(kind)))
    util.print_box('Files per kind', file_counts)

    # The fake vectorizer stands in for get_vectorizer(path_generator).
    vectorizer = FakeVectorizerForMontrealIDs()

    for kind in ALLOWED_KINDS:
        prepare_data(kind, vectorizer, paths)
def get_model_dir(dirname: str) -> io_util.Directory:
    """Return the model directory called ``dirname``.

    Args:
        dirname: Name of a directory below the ``models`` resource
            directory. A name ending in ``*`` is treated as a glob
            pattern and the first match is used.

    Returns:
        The matching directory wrapped in ``io_util.Directory``.

    Raises:
        IndexError: If ``dirname`` is a pattern and nothing matches it.
    """
    base_models_dir = get_resource_dir('models')
    if not dirname.endswith('*'):
        return base_models_dir.subdir(dirname)

    try:
        first_match = base_models_dir.glob(dirname)[0]
    except IndexError:
        # Same exception type as plain indexing would raise, but the
        # message names the pattern and the directory searched.
        raise IndexError(
            f'No model directory matching {dirname!r} '
            f'in {base_models_dir}') from None
    return io_util.Directory(first_match)
Exemplo n.º 4
0
    '-d', '--data_dir', default='/Users/brandon/Downloads/foxnews.com')
# Streamer to use, given by name (presumably resolved to a class elsewhere
# -- confirm against the code that reads args.streamer).
parser.add_argument(
    '-s', '--streamer', default='VectorizedTruncatingJSONArticleStream')
args = parser.parse_args()

# Directory wrapper derived from this file's own path.
root_dir = io_util.Directory.of_file(__file__)

# Logger named 'project', requested as root logger with console level 'debug'.
logger = util.get_logger(
    'project',
    is_root_logger=True,
    console_level='debug')

# Without an explicit bundle, fall back to 'default' under root_dir's
# 'models' subdirectory (subdir is called with create=True).
if args.bundle is None:
    args.bundle = root_dir.subdir('models', create=True).join('default')

# Create the bundle the first time it is used.
bundle = io_util.Directory(args.bundle)
if not bundle.exists():
    logger.info(f'Creating bundle: {bundle}')
    bundle.create()


def get_vectorizer(path_generator=None) -> Vectorizer:
    vectorizer = Vectorizer(
        max_num_sents=None,
        max_seq_len=None,
        vocab_size=int(1e6))

    logger.trace('Training vectorizer.')
    vectorizer.update(map(
        str.lower,
        article_generator(path_generator.get_paths('train'))))
Exemplo n.º 5
0
    config_path = args.config
else:
    config_path = root_dir.subdir('config').join(args.config)
    config_path = util.force_extension(config_path, 'yml')

# Merge custom config file with command-line arguments.
config = util.parse_config(args=args, config_path=config_path)

# The console log level is taken from the parsed configuration.
logger = util.get_logger('project',
                         is_root_logger=True,
                         console_level=config.log_level)

# Without an explicit bundle, fall back to 'default' under root_dir's
# 'models' subdirectory (subdir is called with create=True).
if args.bundle is None:
    args.bundle = root_dir.subdir('models', create=True).join('default')

# A missing bundle is created; an existing one is emptied unless
# warm_start or load_vectorizer is set.
bundle = io_util.Directory(args.bundle)
if not bundle.exists():
    logger.info(f'Creating bundle: {bundle}')
    bundle.create()
elif not any((args.warm_start, args.load_vectorizer)):
    logger.info(f'Clearing previous bundle: {bundle}')
    bundle.delete_contents()

# NOTE(review): on bolt the bundle is re-pointed at the artifact directory
# only after the create/clear logic above has run against args.bundle --
# confirm that this ordering is intended.
if util.is_running_on_bolt():
    bundle = io_util.Directory(bolt.ARTIFACT_DIR).subdir('bundle', create=True)


def get_vectorizer(path_generator=None) -> Vectorizer:
    def get_kind(kind):
        return [
            a.lower()