예제 #1
0
    def test_embedding_trainer_multicore_local(self, mock_getcwd):
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            job_postings_generator = JobPostingCollectionSample()
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)
            trainer = EmbeddingTrainer(FastTextModel(size=10,
                                                     min_count=3,
                                                     iter=4,
                                                     window=6,
                                                     workers=3),
                                       FastTextModel(size=10,
                                                     min_count=3,
                                                     iter=4,
                                                     window=10,
                                                     workers=3),
                                       Word2VecModel(size=10,
                                                     workers=3,
                                                     window=6),
                                       Word2VecModel(size=10,
                                                     min_count=10,
                                                     window=10,
                                                     workers=3),
                                       model_storage=model_storage)
            trainer.train(corpus_generator, n_processes=4)
            trainer.save_model()

            assert set(os.listdir(os.getcwd())) == set(
                [model.model_name for model in trainer._models])
    def test_embedding_trainer_doc2vec_s3(self):
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = f"s3://fake-open-skills/model_cache/embedding"
        s3_storage = S3Store(path=s3_path)

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Doc2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        d2v = Doc2VecModel(storage=s3_storage,
                           size=10,
                           min_count=3,
                           iter=4,
                           window=6,
                           workers=3)

        trainer = EmbeddingTrainer(corpus_generator, d2v)
        trainer.train(lookup=True)
        trainer.save_model()

        vocab_size = len(d2v.wv.vocab.keys())
        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert d2v.model_name == trainer.model_name
        assert set(files) == set([trainer.model_name])
        self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

        # Save as different name
        d2v.save('other_name.model')

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set([trainer.model_name, 'other_name.model'])

        # Load
        d2v_loaded = Doc2VecModel.load(s3_storage, trainer.model_name)
        assert d2v_loaded.metadata['embedding_model']['hyperparameters'][
            'vector_size'] == trainer.metadata['embedding_model'][
                'hyperparameters']['vector_size']
        # Change the store directory
        new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
        trainer.save_model(S3Store(new_s3_path))
        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
        assert set(files) == set([trainer.model_name])
예제 #3
0
    def test_embedding_trainer_fasttext_local(self, mock_getcwd):
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            job_postings_generator = JobPostingCollectionSample(num_records=30)
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)
            fasttext = FastTextModel(size=10,
                                     min_count=3,
                                     iter=4,
                                     window=6,
                                     workers=3)

            trainer = EmbeddingTrainer(fasttext, model_storage=model_storage)
            trainer.train(corpus_generator)
            trainer.save_model()

            vocab_size = len(fasttext.wv.vocab.keys())

            assert fasttext.model_name == trainer._models[0].model_name
            assert set(os.listdir(os.getcwd())) == set(
                [trainer._models[0].model_name])

            # Test Online Training
            job_postings_generator = JobPostingCollectionSample(num_records=50)
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)

            fasttext_loaded = model_storage.load_model(fasttext.model_name)
            new_trainer = EmbeddingTrainer(fasttext_loaded,
                                           model_storage=model_storage)
            new_trainer.train(corpus_generator)
            new_trainer.save_model()

            new_vocab_size = len(fasttext_loaded.wv.vocab.keys())

            assert set(os.listdir(os.getcwd())) == set([
                trainer._models[0].model_name,
                new_trainer._models[0].model_name
            ])
            assert new_trainer.metadata['embedding_trainer'][
                'models'] != trainer.metadata['embedding_trainer']['models']
            assert vocab_size <= new_vocab_size
예제 #4
0
    def test_embedding_trainer_doc2vec_local(self, mock_getcwd):
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]

        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))

            job_postings_generator = JobPostingCollectionSample(num_records=30)
            corpus_generator = Doc2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)
            d2v = Doc2VecModel(size=10,
                               min_count=3,
                               iter=4,
                               window=6,
                               workers=3)

            trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
            trainer.train(corpus_generator, lookup=True)
            trainer.save_model()

            vocab_size = len(d2v.wv.vocab.keys())
            assert d2v.model_name == trainer._models[0].model_name
            assert set(os.listdir(os.getcwd())) == set(
                [trainer._models[0].model_name])
            self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

            # Save as different name
            model_storage.save_model(d2v, 'other_name.model')
            assert set(os.listdir(os.getcwd())) == set(
                [trainer._models[0].model_name, 'other_name.model'])

            # Load
            d2v_loaded = model_storage.load_model(
                trainer._models[0].model_name)
            assert d2v_loaded.metadata["embedding_model"][
                "model_type"] == list(
                    trainer.metadata["embedding_trainer"]
                    ['models'].values())[0]['embedding_model']['model_type']

            # Change the store directory
            new_path = os.path.join(td, 'other_directory')
            trainer.save_model(FSStore(new_path))
            assert set(os.listdir(new_path)) == set(
                [trainer._models[0].model_name])
예제 #5
0
def test_embedding_trainer():
    s3_conn = boto.connect_s3()
    bucket_name = 'fake-jb-bucket'
    bucket = s3_conn.create_bucket(bucket_name)

    job_posting_name = 'FAKE_jobposting'
    s3_prefix_jb = 'fake-jb-bucket/job_postings'
    s3_prefix_model = 'fake-jb-bucket/model_cache/embedding/'
    quarters = '2011Q1'

    with tempfile.TemporaryDirectory() as td:
        with open(os.path.join(td, job_posting_name), 'w') as handle:
            json.dump(sample_document, handle)
        upload(s3_conn, os.path.join(td, job_posting_name), os.path.join(s3_prefix_jb, quarters))


    # Doc2Vec
    trainer = EmbeddingTrainer(s3_conn=s3_conn, quarters=['2011Q1'], jp_s3_path=s3_prefix_jb, model_s3_path=s3_prefix_model, model_type='doc2vec')
    trainer.train()
    files = list_files(s3_conn, os.path.join(s3_prefix_model, 'doc2vec_gensim_' + trainer.training_time))
    assert len(files) == 3

    assert files == ['doc2vec_gensim_' + trainer.training_time + '.model',
                     'lookup_doc2vec_gensim_' + trainer.training_time + '.json',
                     'metadata_doc2vec_gensim_' + trainer.training_time + '.json']

    with tempfile.TemporaryDirectory() as td:
        trainer.save_model(td)
        assert set(os.listdir(td)) == set(['doc2vec_gensim_' + trainer.training_time + '.model',
                                           'lookup_doc2vec_gensim_' + trainer.training_time + '.json',
                                           'metadata_doc2vec_gensim_' + trainer.training_time + '.json'])

    # Word2Vec
    trainer = EmbeddingTrainer(s3_conn=s3_conn, quarters=['2011Q1'], jp_s3_path=s3_prefix_jb, model_s3_path=s3_prefix_model, model_type='word2vec')
    trainer.train()
    files = list_files(s3_conn, os.path.join(s3_prefix_model, 'word2vec_gensim_' + trainer.training_time))
    assert len(files) == 2
    assert files == ['metadata_word2vec_gensim_' + trainer.training_time + '.json',
                     'word2vec_gensim_' + trainer.training_time + '.model']

    new_trainer = EmbeddingTrainer(s3_conn=s3_conn, quarters=['2011Q1'], jp_s3_path=s3_prefix_jb, model_s3_path=s3_prefix_model, model_type='word2vec')
    new_trainer.load(trainer.modelname, s3_prefix_model)
    assert new_trainer.metadata['metadata']['hyperparameters'] == trainer.metadata['metadata']['hyperparameters']
예제 #6
0
    def test_embedding_trainer_word2vec_local(self, mock_getcwd):
        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]

        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            job_postings_generator = JobPostingCollectionSample(num_records=30)
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)
            w2v = Word2VecModel(size=10,
                                min_count=3,
                                iter=4,
                                window=6,
                                workers=3)

            trainer = EmbeddingTrainer(corpus_generator, w2v, model_storage)
            trainer.train()
            trainer.save_model()

            vocab_size = len(w2v.wv.vocab.keys())

            assert w2v.model_name == trainer.model_name
            assert set(os.listdir(os.getcwd())) == set([trainer.model_name])

            # Test Online Training
            job_postings_generator = JobPostingCollectionSample(num_records=50)
            corpus_generator = Word2VecGensimCorpusCreator(
                job_postings_generator,
                document_schema_fields=document_schema_fields)

            w2v_loaded = model_storage.load_model(w2v.model_name)

            new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded,
                                           model_storage)
            new_trainer.train()
            new_trainer.save_model()

            new_vocab_size = len(w2v_loaded.wv.vocab.keys())

            assert set(os.listdir(os.getcwd())) == set(
                [trainer.model_name, new_trainer.model_name])
            assert new_trainer.metadata['embedding_trainer'][
                'model_name'] != trainer.metadata['embedding_trainer'][
                    'model_name']
            assert vocab_size <= new_vocab_size

            # Save as different name
            model_storage.save_model(w2v, 'other_name.model')
            assert set(os.listdir(os.getcwd())) == set([
                trainer.model_name, new_trainer.model_name, 'other_name.model'
            ])

            # Change the store directory
            new_path = os.path.join(td, 'other_directory')
            new_trainer.save_model(FSStore(new_path))
            assert set(os.listdir(new_path)) == set([new_trainer.model_name])
예제 #7
0
    def test_embedding_trainer_multicore_s3(self):
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = f"s3://fake-open-skills/model_cache/embedding"
        s3_storage = S3Store(path=s3_path)
        model_storage = ModelStorage(s3_storage)

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        job_postings_generator = JobPostingCollectionSample()
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        trainer = EmbeddingTrainer(FastTextModel(size=10,
                                                 min_count=3,
                                                 iter=4,
                                                 window=6,
                                                 workers=3),
                                   FastTextModel(size=10,
                                                 min_count=3,
                                                 iter=4,
                                                 window=10,
                                                 workers=3),
                                   Word2VecModel(size=10, workers=3, window=6),
                                   Word2VecModel(size=10,
                                                 min_count=10,
                                                 window=10,
                                                 workers=3),
                                   model_storage=model_storage)
        trainer.train(corpus_generator)
        trainer.save_model()

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set(
            [model.model_name for model in trainer._models])
    def test_embedding_trainer_word2vec_s3(self):
        client = boto3.client('s3')
        client.create_bucket(Bucket='fake-open-skills',
                             ACL='public-read-write')
        s3_path = f"s3://fake-open-skills/model_cache/embedding"
        s3_storage = S3Store(path=s3_path)

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
        job_postings_generator = JobPostingCollectionSample(num_records=30)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)
        w2v = Word2VecModel(storage=s3_storage,
                            size=10,
                            min_count=3,
                            iter=4,
                            window=6,
                            workers=3)

        trainer = EmbeddingTrainer(corpus_generator, w2v)
        trainer.train()
        trainer.save_model()

        vocab_size = len(w2v.wv.vocab.keys())

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert w2v.model_name == trainer.model_name
        assert set(files) == set([trainer.model_name])

        # Test online training
        job_postings_generator = JobPostingCollectionSample(num_records=50)
        corpus_generator = Word2VecGensimCorpusCreator(
            job_postings_generator,
            document_schema_fields=document_schema_fields)

        w2v_loaded = Word2VecModel.load(s3_storage, w2v.model_name)

        new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded)
        new_trainer.train()
        new_trainer.save_model()

        new_vocab_size = len(w2v_loaded.wv.vocab.keys())

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set([new_trainer.model_name, trainer.model_name])
        assert new_trainer.metadata['embedding_trainer'][
            'model_name'] != trainer.metadata['embedding_trainer']['model_name']
        assert vocab_size <= new_vocab_size

        # Save as different name
        w2v.save('other_name.model')

        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(s3_path)]
        assert set(files) == set(
            [trainer.model_name, new_trainer.model_name, 'other_name.model'])

        # Change the store directory
        new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
        new_trainer.save_model(S3Store(new_s3_path))
        s3 = s3fs.S3FileSystem()
        files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
        assert set(files) == set([new_trainer.model_name])