def test_combined_cls_local(self, mock_getcwd):
    """End-to-end CombinedClassifier test on local filesystem storage.

    Trains a small word2vec embedding, builds a design matrix, fits a
    RandomForest behind a storage proxy, and checks that predict_soc
    returns two-element predictions.
    """
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        jobpostings = JobPostingCollectionSample()
        corpus_generator = Word2VecGensimCorpusCreator(jobpostings, raw=True)
        w2v = Word2VecModel(size=10, min_count=0, alpha=0.025, min_alpha=0.025)
        trainer = EmbeddingTrainer(w2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)

        matrix = DesignMatrix(jobpostings, self.major_group, self.pipe_x, self.pipe_y)
        matrix.build()
        X = matrix.X

        rf = ProxyObjectWithStorage(RandomForestClassifier(), None, None,
                                    matrix.target_variable)
        rf.fit(X, matrix.y)
        # Fix: dropped the unused `proxy_rf = ProxyObjectWithStorage(rf, ...)`
        # local — it was never referenced; the classifier used below is `rf`.

        # Remove the last step in pipe_x: the input of predict_soc should be
        # tokenized words, not the fully transformed features.
        # NOTE(review): .pop() mutates self.pipe_x in place — assumes no other
        # test reuses the original pipeline; confirm fixture scope.
        new_pipe_x = self.pipe_x
        new_pipe_x.generators.pop()

        new_matrix = DesignMatrix(JobPostingCollectionSample(),
                                  self.major_group, new_pipe_x)
        new_matrix.build()
        ccls = CombinedClassifier(w2v, rf)
        assert len(ccls.predict_soc([new_matrix.X[0]])[0]) == 2
def test_embedding_feature(self):
    """EmbeddingFeature rows align with tokenized sentences and have width 10."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=schema_fields,
        raw=True)
    embedding = Word2VecModel(size=10, min_count=0, iter=4, window=6, workers=3)
    EmbeddingTrainer(embedding).train(corpus)

    postings = RawCorpusCreator(JobPostingCollectionSample(num_records=50))
    stream_a, stream_b = tee(postings)
    creator = SequenceFeatureCreator(
        stream_a,
        sentence_tokenizer=sentence_tokenize,
        word_tokenizer=word_tokenize,
        embedding_model=embedding,
        features=["EmbeddingFeature"])
    feature_iter = iter(creator)

    # The first feature batch should have one row per token of the first
    # tokenized sentence from the same (tee'd) posting stream.
    first_sentence = next(iter(word_tokenizer_gen(sentence_tokenizer_gen(stream_b))))
    self.assertEqual(next(feature_iter).shape[0], np.array(first_sentence).shape[0])
    # Each embedding vector carries the model's dimensionality (10).
    self.assertEqual(next(feature_iter)[0].shape[0], 10)
def test_embedding_trainer_word2vec_local(self, mock_getcwd):
    """Train, persist, reload, and continue training a word2vec model locally."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))

        corpus_generator = Word2VecGensimCorpusCreator(
            JobPostingCollectionSample(num_records=30),
            document_schema_fields=schema_fields)
        w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)
        trainer = EmbeddingTrainer(corpus_generator, w2v, model_storage)
        trainer.train()
        trainer.save_model()

        vocab_size = len(w2v.wv.vocab.keys())
        assert w2v.model_name == trainer.model_name
        assert set(os.listdir(os.getcwd())) == {trainer.model_name}

        # Online training: reload the stored model and keep training on a
        # larger corpus.
        corpus_generator = Word2VecGensimCorpusCreator(
            JobPostingCollectionSample(num_records=50),
            document_schema_fields=schema_fields)
        w2v_loaded = model_storage.load_model(w2v.model_name)
        new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded, model_storage)
        new_trainer.train()
        new_trainer.save_model()

        new_vocab_size = len(w2v_loaded.wv.vocab.keys())
        assert set(os.listdir(os.getcwd())) == {trainer.model_name,
                                                new_trainer.model_name}
        assert (new_trainer.metadata['embedding_trainer']['model_name']
                != trainer.metadata['embedding_trainer']['model_name'])
        # Vocabulary can only grow (or stay equal) after more training data.
        assert vocab_size <= new_vocab_size

        # Saving under an explicit name adds a third file.
        model_storage.save_model(w2v, 'other_name.model')
        assert set(os.listdir(os.getcwd())) == {trainer.model_name,
                                                new_trainer.model_name,
                                                'other_name.model'}

        # Saving through a different store writes into that directory.
        new_path = os.path.join(td, 'other_directory')
        new_trainer.save_model(FSStore(new_path))
        assert set(os.listdir(new_path)) == {new_trainer.model_name}
def test_batch_generator(self):
    """Both batching helpers split 50 sample postings into 10 batches."""
    postings = JobPostingCollectionSample()
    batch_iter = batches_generator(postings, 10)
    # batches_generator() is lazy: the run of batches is itself an iterator.
    assert isinstance(batch_iter, collections.abc.Iterator)
    assert len(list(batch_iter)) == 10

    postings = JobPostingCollectionSample()
    batch_tuple = tuple(BatchGenerator(postings, 10))
    # BatchGenerator materializes into a tuple of batches.
    assert isinstance(batch_tuple, tuple)
    assert len(list(batch_tuple)) == 10
def test_embedding_trainer_fasttext_local(self, mock_getcwd):
    """Train, persist, reload, and continue training a fastText model locally."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        corpus = Word2VecGensimCorpusCreator(
            JobPostingCollectionSample(num_records=30),
            document_schema_fields=schema_fields)
        fasttext = FastTextModel(size=10, min_count=3, iter=4, window=6, workers=3)
        trainer = EmbeddingTrainer(fasttext, model_storage=model_storage)
        trainer.train(corpus)
        trainer.save_model()

        vocab_size = len(fasttext.wv.vocab.keys())
        assert fasttext.model_name == trainer._models[0].model_name
        assert set(os.listdir(os.getcwd())) == {trainer._models[0].model_name}

        # Online training: reload the stored model and continue on more data.
        corpus = Word2VecGensimCorpusCreator(
            JobPostingCollectionSample(num_records=50),
            document_schema_fields=schema_fields)
        fasttext_loaded = model_storage.load_model(fasttext.model_name)
        new_trainer = EmbeddingTrainer(fasttext_loaded, model_storage=model_storage)
        new_trainer.train(corpus)
        new_trainer.save_model()

        new_vocab_size = len(fasttext_loaded.wv.vocab.keys())
        assert set(os.listdir(os.getcwd())) == {
            trainer._models[0].model_name,
            new_trainer._models[0].model_name,
        }
        assert (new_trainer.metadata['embedding_trainer']['models']
                != trainer.metadata['embedding_trainer']['models'])
        # Vocabulary can only grow (or stay equal) after more training data.
        assert vocab_size <= new_vocab_size
def test_contextual_feature(self):
    """ContextualFeature output aligns with tokenized sentences; width is 17."""
    stream_a, stream_b = tee(RawCorpusCreator(JobPostingCollectionSample()))
    features = iter(SequenceFeatureCreator(stream_a, features=["ContextualFeature"]))
    # Row count of the first batch matches the token count of the first
    # tokenized sentence from the tee'd copy of the same stream.
    first_sentence = next(iter(word_tokenizer_gen(sentence_tokenizer_gen(stream_b))))
    self.assertEqual(next(features).shape[0], np.array(first_sentence).shape[0])
    self.assertEqual(next(features)[0].shape[0], 17)
def test_word2vec(self):
    """Word2Vec inference is deterministic and tolerant of out-of-vocab tokens."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=50),
        document_schema_fields=schema_fields)
    w2v = Word2VecModel(size=16, min_count=3, iter=4, window=6, workers=3)
    EmbeddingTrainer(w2v).train(corpus)

    # Repeated inference for the same in-vocab token is identical.
    assert_array_equal(w2v.infer_vector(["media"]), w2v.infer_vector(["media"]))

    # An out-of-vocab token still yields a vector of the model's width.
    assert w2v.infer_vector(["sports"]).shape[0] == 16

    # Unknown words in a sentence are ignored during inference.
    with_unseen = ["sports", "news", "and", "media"]
    without_unseen = ["news", "and", "media"]
    assert_array_equal(w2v.infer_vector(with_unseen),
                       w2v.infer_vector(without_unseen))
def test_doc2vec(self):
    """Doc2Vec inference is reproducible once the RNG seed is reset."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus = Doc2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=50),
        document_schema_fields=schema_fields)
    d2v = Doc2VecModel(size=16, min_count=1, dm=0, alpha=0.025, min_alpha=0.025)
    EmbeddingTrainer(d2v).train(corpus)

    # Doc2vec inference is non-deterministic, so reset the model's RNG seed
    # before each call to make the two runs comparable.
    d2v.random.seed(0)
    first = d2v.infer_vector(["media", "news"])
    d2v.random.seed(0)
    second = d2v.infer_vector(["media", "news"])
    assert_array_equal(first, second)

    # Direct vocabulary lookup of an unseen word raises KeyError.
    self.assertRaises(KeyError, lambda: d2v["sports"])

    # Inference over an unseen sentence still returns matching vectors.
    first = d2v.infer_vector(["sports"])
    second = d2v.infer_vector(["sports"])
    assert_array_equal(first, second)
def test_fasttext(self):
    """FastText resolves out-of-vocab words through character n-grams."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=50),
        document_schema_fields=schema_fields)
    fasttext = FastTextModel(size=16, min_count=3, window=6, iter=4)
    EmbeddingTrainer(fasttext).train(corpus)

    # Repeated inference for the same token is identical.
    assert_array_equal(fasttext.infer_vector(["media"]),
                       fasttext.infer_vector(["media"]))

    # OOV lookup works by summing the word's character n-gram vectors.
    assert "sports" not in fasttext.wv.vocab
    assert fasttext["sports"].shape == (16, )

    # When none of the word's character n-grams were seen in training,
    # lookup raises KeyError.
    self.assertRaises(KeyError, lambda: fasttext["axe"])

    # Words with no known n-grams are dropped from sentence inference.
    assert_array_equal(fasttext.infer_vector(["media", "sport", "axe"]),
                       fasttext.infer_vector(["media", "sport"]))
def test_embedding_trainer_multicore_local(self, mock_getcwd):
    """Training several models in parallel stores one file per model."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        storage = ModelStorage(FSStore(td))
        corpus = Word2VecGensimCorpusCreator(
            JobPostingCollectionSample(),
            document_schema_fields=schema_fields)
        models = [
            FastTextModel(size=10, min_count=3, iter=4, window=6, workers=3),
            FastTextModel(size=10, min_count=3, iter=4, window=10, workers=3),
            Word2VecModel(size=10, workers=3, window=6),
            Word2VecModel(size=10, min_count=10, window=10, workers=3),
        ]
        trainer = EmbeddingTrainer(*models, model_storage=storage)
        trainer.train(corpus, n_processes=4)
        trainer.save_model()
        # Every trained model ends up as exactly one file in the store.
        expected = {model.model_name for model in trainer._models}
        assert set(os.listdir(os.getcwd())) == expected
def train_embedding(self):
    """Fit a small word2vec model on the sample postings; stash both on self."""
    postings = list(JobPostingCollectionSample())
    corpus = Word2VecGensimCorpusCreator(postings, raw=True)
    w2v = Word2VecModel(size=10, min_count=0, alpha=0.025, min_alpha=0.025)
    EmbeddingTrainer(corpus, w2v).train(True)
    self.embedding_model = w2v
    self.jobpostings = postings
def test_JobPostingCollectionSample(self):
    """The bundled sample yields 50 schema.org JobPosting dicts plus metadata."""
    sample = JobPostingCollectionSample()
    postings = list(sample)
    self.assertEqual(len(postings), 50)
    for posting in postings:
        self.assertIsInstance(posting, dict)
        self.assertEqual(posting['@type'], 'JobPosting')
        self.assertIn('title', posting)
        self.assertIn('description', posting)
    self.assertIn('job postings', sample.metadata)
def test_embedding_trainer_doc2vec_with_other(self):
    """Mixing Doc2Vec with other model types in a single trainer is rejected."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus = Doc2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=schema_fields)
    trainer = EmbeddingTrainer(Doc2VecModel(), Word2VecModel(), FastTextModel())
    self.assertRaises(TypeError, lambda: trainer.train(corpus))
def test_embedding_trainer_doc2vec_s3(self):
    """Train a doc2vec model against S3 storage, persist, reload, and relocate."""
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills',
                         ACL='public-read-write')
    s3_path = f"s3://fake-open-skills/model_cache/embedding"
    s3_storage = S3Store(path=s3_path)

    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus_generator = Doc2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=schema_fields)
    d2v = Doc2VecModel(storage=s3_storage, size=10, min_count=3, iter=4,
                       window=6, workers=3)
    trainer = EmbeddingTrainer(corpus_generator, d2v)
    trainer.train(lookup=True)
    trainer.save_model()

    vocab_size = len(d2v.wv.vocab.keys())
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert d2v.model_name == trainer.model_name
    assert set(files) == {trainer.model_name}
    self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

    # Saving under an explicit name adds a second object.
    d2v.save('other_name.model')
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == {trainer.model_name, 'other_name.model'}

    # Reload by name and compare hyperparameter metadata.
    d2v_loaded = Doc2VecModel.load(s3_storage, trainer.model_name)
    assert (d2v_loaded.metadata['embedding_model']['hyperparameters']['vector_size']
            == trainer.metadata['embedding_model']['hyperparameters']['vector_size'])

    # Saving through a different store writes to that prefix.
    new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
    trainer.save_model(S3Store(new_s3_path))
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
    assert set(files) == {trainer.model_name}
def test_skill_feature(self):
    """Default feature selection combines all three feature types (width 29)."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=schema_fields,
        raw=True)
    w2v = Word2VecModel(size=10, min_count=0, iter=4, window=6, workers=3)
    EmbeddingTrainer(w2v).train(corpus)

    raw1, raw2 = tee(RawCorpusCreator(JobPostingCollectionSample()))

    # With no explicit feature list, every available feature is selected.
    fc = SequenceFeatureCreator(raw1, embedding_model=w2v)
    all_three = ["StructuralFeature", "ContextualFeature", "EmbeddingFeature"]
    self.assertEqual(fc.selected_features, all_three)
    self.assertEqual(fc.all_features, all_three)

    fc = iter(fc)
    first_sentence = next(iter(word_tokenizer_gen(sentence_tokenizer_gen(raw2))))
    self.assertEqual(next(fc).shape[0], np.array(first_sentence).shape[0])
    self.assertEqual(next(fc)[0].shape[0], 29)

    # Unknown feature names surface as a TypeError when iterated.
    fc = SequenceFeatureCreator(raw1, features=["FeatureNotSupported"])
    fc = iter(fc)
    self.assertRaises(TypeError, lambda: next(fc))
def test_tester(self):
    """Train a classifier on 30 postings and run the tester on the other 20.

    Checks that OccupationClassifierTester reports a length consistent with
    the results it yields (18 postings survive the empty-SOC-code filter).
    """
    document_schema_fields = ['description', 'experienceRequirements',
                              'qualifications', 'skills']
    corpus_generator = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=document_schema_fields)
    w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)
    trainer = EmbeddingTrainer(w2v)
    trainer.train(corpus_generator)

    jp = JobPostingCollectionSample()
    train_gen = islice(jp, 30)
    test_gen = islice(jp, 30, None)
    train_matrix = DesignMatrix(train_gen, self.fullsoc, self.pipe_x, self.pipe_y)
    train_matrix.build()

    occ_trainer = OccupationClassifierTrainer(train_matrix, 2,
                                              grid_config=self.grid_config)
    occ_trainer.train(save=False)
    cc = CombinedClassifier(w2v, occ_trainer.best_estimators[0])

    steps = self.pipe_x.generators[:-1]
    # Fix: compare string contents with `!=`, not object identity with
    # `is not` — identity tests against a literal are unreliable and raise
    # SyntaxWarning on Python >= 3.8.
    test_gen = (t for t in test_gen if t['onet_soc_code'] != '')

    tester = OccupationClassifierTester(test_data_generator=test_gen,
                                        preprocessing=steps,
                                        classifier=cc)
    result = list(tester)
    assert len(tester) == len(result) == 18
def test_combined_cls_local(self, mock_getcwd):
    """Each prediction returned by CombinedClassifier.predict_soc has two elements."""
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        postings = list(JobPostingCollectionSample())
        corpus = Word2VecGensimCorpusCreator(postings, raw=True)
        w2v = Word2VecModel(storage=FSStore(td), size=10, min_count=0,
                            alpha=0.025, min_alpha=0.025)
        EmbeddingTrainer(corpus, w2v).train(True)

        matrix = create_training_set(postings, SOCMajorGroup())
        features = EmbeddingTransformer(w2v).transform(matrix.X)
        rf = RandomForestClassifier()
        rf.fit(features, matrix.y)

        ccls = CombinedClassifier(w2v, rf, matrix.target_variable)
        assert len(ccls.predict_soc([matrix.X[0]])[0]) == 2
def test_embedding_trainer_doc2vec_local(self, mock_getcwd):
    """Train a doc2vec model locally, persist it, reload it, and relocate it."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        corpus = Doc2VecGensimCorpusCreator(
            JobPostingCollectionSample(num_records=30),
            document_schema_fields=schema_fields)
        d2v = Doc2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)
        trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
        trainer.train(corpus, lookup=True)
        trainer.save_model()

        vocab_size = len(d2v.wv.vocab.keys())
        assert d2v.model_name == trainer._models[0].model_name
        assert set(os.listdir(os.getcwd())) == {trainer._models[0].model_name}
        self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

        # Saving under an explicit name adds a second file.
        model_storage.save_model(d2v, 'other_name.model')
        assert set(os.listdir(os.getcwd())) == {trainer._models[0].model_name,
                                                'other_name.model'}

        # Reload by name and compare model-type metadata.
        d2v_loaded = model_storage.load_model(trainer._models[0].model_name)
        trained_meta = list(
            trainer.metadata["embedding_trainer"]['models'].values())[0]
        assert (d2v_loaded.metadata["embedding_model"]["model_type"]
                == trained_meta['embedding_model']['model_type'])

        # Saving through a different store writes into that directory.
        new_path = os.path.join(td, 'other_directory')
        trainer.save_model(FSStore(new_path))
        assert set(os.listdir(new_path)) == {trainer._models[0].model_name}
def test_visualize_in_tensorboard(self):
    """visualize_in_tensorboard writes 7 artifacts under <cwd>/<model-name-stem>."""
    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=50),
        document_schema_fields=schema_fields)
    w2v = Word2VecModel(size=16, min_count=3, iter=4, window=6, workers=3)
    EmbeddingTrainer(w2v).train(corpus)

    with tempfile.TemporaryDirectory() as td:
        with mock.patch('os.getcwd') as mock_getcwd:
            mock_getcwd.return_value = td
            visualize_in_tensorboard(w2v)
            # Output lands in a directory named after the model file's stem.
            output_dir = os.path.join(os.getcwd(),
                                      w2v.model_name.split('.')[0])
            assert len(set(os.listdir(output_dir))) == 7
def test_embedding_trainer_multicore_s3(self):
    """Training multiple models at once stores one S3 object per model."""
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills',
                         ACL='public-read-write')
    s3_path = f"s3://fake-open-skills/model_cache/embedding"
    model_storage = ModelStorage(S3Store(path=s3_path))

    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(),
        document_schema_fields=schema_fields)
    trainer = EmbeddingTrainer(
        FastTextModel(size=10, min_count=3, iter=4, window=6, workers=3),
        FastTextModel(size=10, min_count=3, iter=4, window=10, workers=3),
        Word2VecModel(size=10, workers=3, window=6),
        Word2VecModel(size=10, min_count=10, window=10, workers=3),
        model_storage=model_storage)
    trainer.train(corpus)
    trainer.save_model()

    s3 = s3fs.S3FileSystem()
    stored = {f.split('/')[-1] for f in s3.ls(s3_path)}
    assert stored == {model.model_name for model in trainer._models}
# Extracting skills using noun phrase endings
#
# Run a sample of job postings through the noun-phrase skill extractor and
# report the most commonly occurring noun phrases ending in 'skill'/'skills'.
from collections import Counter
import logging
from pprint import pformat

from skills_ml.job_postings.common_schema import JobPostingCollectionSample
from skills_ml.algorithms.skill_extractors.noun_phrase_ending import SkillEndingPatternExtractor

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':
    # Simplest possible input: 50 pre-downloaded job postings.
    job_postings = JobPostingCollectionSample()

    # VT job postings do not include line breaks, so the bulleted-line filter
    # would remove all possible matches — turn it off.
    pattern_extractor = SkillEndingPatternExtractor(only_bulleted_lines=False)

    skill_counts = Counter()
    for job_posting in job_postings:
        skill_counts += pattern_extractor.document_skill_counts(job_posting)
    logging.info('10 Most Common Skills in job descriptions:\n %s',
                 pformat(skill_counts.most_common(10)))
def setUp(self):
    # Materialize the 50-posting sample once so each test can index and
    # re-iterate it freely.
    self.jobpostings = list(JobPostingCollectionSample())
from skills_ml.algorithms.skill_extractors import ( FuzzyMatchSkillExtractor, ExactMatchSkillExtractor, SocScopedExactMatchSkillExtractor, SectionExtractSkillExtractor, SkillEndingPatternExtractor, AbilityEndingPatternExtractor) from skills_ml.ontologies.onet import Onet from skills_ml.evaluation.skill_extractors import candidate_skills_from_sample, metrics_for_candidate_skills from skills_ml.evaluation.skill_extraction_metrics import TotalOccurrences, TotalVocabularySize, OntologyCompetencyRecall from skills_ml.job_postings.common_schema import JobPostingCollectionSample from tests.utils import sample_factory sample = sample_factory(JobPostingCollectionSample()) print('Building ONET, may take a while to download') full_onet = Onet() skill_extractors = [ SectionExtractSkillExtractor(), SkillEndingPatternExtractor(only_bulleted_lines=False), AbilityEndingPatternExtractor(only_bulleted_lines=False), FuzzyMatchSkillExtractor(full_onet.competency_framework), ExactMatchSkillExtractor(full_onet.competency_framework), SocScopedExactMatchSkillExtractor(full_onet) ] print('Done building ONET! Now subsetting ONET into K,S,A') metric_ontologies = [ full_onet, full_onet.filter_by(lambda edge: 'Knowledge' in edge.competency.categories, competency_name='onet_knowledge', competency_description='ONET Knowledge'), full_onet.filter_by(lambda edge: 'Abilities' in edge.competency.categories, competency_name='onet_ability',
from skills_ml.algorithms import nlp
from skills_ml.algorithms.occupation_classifiers.train import OccupationClassifierTrainer
from skills_ml.algorithms.occupation_classifiers import FullSOC, DesignMatrix
import os
import json
import random
from functools import partial
import logging
# Log both to a file next to the working directory and to the console.
logging.basicConfig(level=logging.INFO,
                    filename=os.path.abspath("grid_search.log"))
logging.getLogger().addHandler(logging.StreamHandler())
import multiprocessing
num_of_worker = multiprocessing.cpu_count()

# Shuffle the sample postings and split them into train/test partitions,
# then serialize each partition to JSON bytes.
job_samples = JobPostingCollectionSample()
job_postings = list(job_samples)
random.shuffle(job_postings)
train_data = job_postings[:30]
test_data = job_postings[30:]
train_bytes = json.dumps(train_data).encode()
test_bytes = json.dumps(test_data).encode()

logging.info("Loading Embedding Model")
# NOTE(review): placeholder path and model name — replace with a real stored
# embedding model before running.
model_storage = ModelStorage(FSStore('/your/model/path'))
w2v = model_storage.load_model(model_name='your_model_name')
TitleCleanPhaseOne, PostingIdPresent, Geography, SkillCounts from skills_ml.job_postings.geography_queriers.state import JobStateQuerier from skills_ml.algorithms.skill_extractors.noun_phrase_ending import SkillEndingPatternExtractor from skills_ml.job_postings.computed_properties.aggregators import\ aggregate_properties from skills_ml.storage import FSStore from functools import partial import unicodecsv as csv import numpy from skills_ml.job_postings.aggregate.pandas import listy_n_most_common import os import tempfile logging.basicConfig(level=logging.INFO) job_postings = list(JobPostingCollectionSample()) with tempfile.TemporaryDirectory() as tmpdir: computed_properties_path = os.path.join(tmpdir, 'computed_properties') storage = FSStore(computed_properties_path) # Create properties. In this example, we are going to both compute and aggregate, # but this is not necessary! Computation and aggregation are entirely decoupled. # So it's entirely valid to just compute a bunch of properties and then later # figure out how you want to aggregate them. # We are only introducing the 'grouping' and 'aggregate' semantics this early in the # script so as to avoid defining these properties twice in the same script. # create properties to be grouped on. In this case, we want to group on cleaned job title grouping_properties = [ TitleCleanPhaseOne(storage=storage),
def test_embedding_trainer_word2vec_s3(self):
    """Train, persist, reload, continue training, and relocate a word2vec model on S3."""
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills',
                         ACL='public-read-write')
    s3_path = f"s3://fake-open-skills/model_cache/embedding"
    s3_storage = S3Store(path=s3_path)

    schema_fields = ['description', 'experienceRequirements',
                     'qualifications', 'skills']
    corpus_generator = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=schema_fields)
    w2v = Word2VecModel(storage=s3_storage, size=10, min_count=3, iter=4,
                        window=6, workers=3)
    trainer = EmbeddingTrainer(corpus_generator, w2v)
    trainer.train()
    trainer.save_model()

    vocab_size = len(w2v.wv.vocab.keys())
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert w2v.model_name == trainer.model_name
    assert set(files) == {trainer.model_name}

    # Online training: reload from S3 and continue on a larger corpus.
    corpus_generator = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=50),
        document_schema_fields=schema_fields)
    w2v_loaded = Word2VecModel.load(s3_storage, w2v.model_name)
    new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded)
    new_trainer.train()
    new_trainer.save_model()

    new_vocab_size = len(w2v_loaded.wv.vocab.keys())
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == {new_trainer.model_name, trainer.model_name}
    assert (new_trainer.metadata['embedding_trainer']['model_name']
            != trainer.metadata['embedding_trainer']['model_name'])
    # Vocabulary can only grow (or stay equal) after more training data.
    assert vocab_size <= new_vocab_size

    # Saving under an explicit name adds a third object.
    w2v.save('other_name.model')
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == {trainer.model_name, new_trainer.model_name,
                          'other_name.model'}

    # Saving through a different store writes to that prefix.
    new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
    new_trainer.save_model(S3Store(new_s3_path))
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
    assert set(files) == {new_trainer.model_name}