def test_embedding_trainer_doc2vec_s3(self):
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3_path = "s3://fake-open-skills/model_cache/embedding"
    s3_storage = S3Store(path=s3_path)

    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    job_postings_generator = JobPostingCollectionSample(num_records=30)
    corpus_generator = Doc2VecGensimCorpusCreator(
        job_postings_generator,
        document_schema_fields=document_schema_fields)

    d2v = Doc2VecModel(storage=s3_storage, size=10, min_count=3, iter=4, window=6, workers=3)

    trainer = EmbeddingTrainer(corpus_generator, d2v)
    trainer.train(lookup=True)
    trainer.save_model()

    vocab_size = len(d2v.wv.vocab.keys())

    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert d2v.model_name == trainer.model_name
    assert set(files) == set([trainer.model_name])
    self.assertDictEqual(trainer.lookup_dict, d2v.lookup_dict)

    # Save as different name
    d2v.save('other_name.model')
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set([trainer.model_name, 'other_name.model'])

    # Load
    d2v_loaded = Doc2VecModel.load(s3_storage, trainer.model_name)
    assert (d2v_loaded.metadata['embedding_model']['hyperparameters']['vector_size']
            == trainer.metadata['embedding_model']['hyperparameters']['vector_size'])

    # Change the store directory
    new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
    trainer.save_model(S3Store(new_s3_path))
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
    assert set(files) == set([trainer.model_name])
def test_geocode_cacher():
    with patch('time.sleep') as time_mock:
        with open('tests/sample_geocode_result.json') as f:
            sample_geocode_result = json.load(f)
        client = boto3.resource('s3')
        client.create_bucket(Bucket='geobucket')
        cache_storage = S3Store('geobucket')
        cache_fname = 'cbsas.json'
        geocode_result = namedtuple('GeocodeResult', ['json'])
        geocode_func = MagicMock(return_value=geocode_result(json=sample_geocode_result))
        geocoder = CachedGeocoder(
            cache_storage=cache_storage,
            cache_fname=cache_fname,
            geocode_func=geocode_func,
            sleep_time=1
        )
        geocoder.geocode('Canarsie, NY')
        geocoder.geocode('Poughkeepsie, NY')
        geocoder.geocode('Canarsie, NY')
        geocoder.save()

        # the second 'Canarsie, NY' call hits the cache, so only two lookups and two sleeps
        assert geocode_func.call_count == 2
        assert geocode_func.call_args_list == [
            call('Canarsie, NY'),
            call('Poughkeepsie, NY')
        ]
        assert time_mock.call_count == 2

        new_geocoder = CachedGeocoder(
            cache_storage=cache_storage,
            cache_fname=cache_fname,
            geocode_func=geocode_func,
            sleep_time=1
        )
        assert new_geocoder.all_cached_geocodes == {
            'Canarsie, NY': sample_geocode_result,
            'Poughkeepsie, NY': sample_geocode_result,
        }
def test_knn_doc2vec_cls_s3(self):
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3_path = "s3://fake-open-skills/model_cache/soc_classifiers"
    s3_storage = S3Store(path=s3_path)
    model_storage = ModelStorage(s3_storage)

    corpus_generator = FakeCorpusGenerator()

    # Embedding has no lookup_dict
    d2v = Doc2VecModel(size=10, min_count=1, dm=0, alpha=0.025, min_alpha=0.025)
    trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
    trainer.train(corpus_generator, lookup=False)
    self.assertRaises(ValueError, lambda: KNNDoc2VecClassifier(embedding_model=d2v))

    d2v = Doc2VecModel(size=10, min_count=1, dm=0, alpha=0.025, min_alpha=0.025)
    trainer = EmbeddingTrainer(d2v, model_storage=model_storage)
    trainer.train(corpus_generator, lookup=True)

    # KNNDoc2VecClassifier only supports doc2vec for now
    self.assertRaises(NotImplementedError, lambda: KNNDoc2VecClassifier(Word2VecModel()))

    doc = docs.split(',')[0].split()

    knn = KNNDoc2VecClassifier(embedding_model=d2v, k=0)
    self.assertRaises(ValueError, lambda: knn.predict_soc([doc]))

    knn = KNNDoc2VecClassifier(embedding_model=d2v, k=10)
    soc_cls = SocClassifier(knn)

    assert knn.predict_soc([doc])[0][0] == soc_cls.predict_soc([doc])[0][0]

    # Build Annoy index
    knn.build_ann_indexer(num_trees=5)
    assert isinstance(knn.indexer, AnnoyIndexer)

    # Save
    s3 = s3fs.S3FileSystem()
    model_storage.save_model(knn, knn.model_name)
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set([knn.model_name])

    # Load
    new_knn = model_storage.load_model(knn.model_name)
    assert new_knn.model_name == knn.model_name
    assert new_knn.predict_soc([doc])[0][0] == '29-2061.00'

    # The Annoy index has to be rebuilt whenever the knn model is loaded into memory
    assert new_knn.indexer is None
def test_with_iterable_pipeline(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills/models')
    model_storage = ModelStorage(storage=s3)

    fake = FakeModel('fake')
    model_storage.save_model(fake, fake.model_name)
    vectorize_for_pipeline = partial(
        nlp.vectorize,
        embedding_model=SerializedByStorage(
            storage=s3,
            model_name=fake.model_name,
            model=fake
        )
    )
    pipe = IterablePipeline(vectorize_for_pipeline)
    pipe_unpickled = pickle.loads(pickle.dumps(pipe))

    # make sure the fake model itself wasn't pickled, only the storage reference
    assert pipe_unpickled.functions[-1].keywords['embedding_model']._model is None
    assert pipe_unpickled.functions[-1].keywords['embedding_model'].storage.path == s3.path
    # The model will be loaded when it's needed
    assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
def test_s3store(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')

    s3 = s3fs.S3FileSystem()
    storage = S3Store(path="s3://fake-open-skills/model_cache")
    assert not s3.exists(storage.path)

    model = FakeModel('val')
    model_pickled = pickle.dumps(model)
    storage.write(model_pickled, 'for_testing.model')
    assert storage.exists("for_testing.model")

    model_loaded = storage.load('for_testing.model')
    model_loaded = pickle.loads(model_loaded)
    assert model_loaded.val == 'val'

    fake_lookup = {'1': 1, '2': 2, '3': 3}
    fake_lookup_bytes = json.dumps(fake_lookup).encode()
    storage.write(fake_lookup_bytes, 'for_testing.json')
    assert storage.exists("for_testing.json")
    fake_lookup_loaded = json.loads(storage.load('for_testing.json').decode())
    assert fake_lookup == fake_lookup_loaded

    storage.delete('for_testing.model')
    assert not storage.exists("for_testing.model")
def test_with_grid_search(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills')
    model_storage = ModelStorage(s3)

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV
    gs = GridSearchCV(RandomForestClassifier(), {})
    proxy_gs = ProxyObjectWithStorage(model_obj=gs, storage=s3, model_name='rf.grid')

    X = np.random.rand(20, 2)
    y = np.random.randint(2, size=20)
    proxy_gs.fit(X, y)

    model_storage.save_model(proxy_gs, 'rf.grid')
    loaded_proxy_gs = model_storage.load_model('rf.grid')

    assert loaded_proxy_gs.storage.path == s3.path
    assert proxy_gs.predict([[5, 6]]) == gs.predict([[5, 6]])
def test_aggregate_properties():
    client = boto3.resource('s3')
    client.create_bucket(Bucket='test-bucket')
    s3_storage = S3Store('s3://test-bucket/aggregations')
    aggregate_properties(
        out_filename='2015',
        grouping_properties=[
            FakeGroupingPropertyOne(),
            FakeGroupingPropertyTwo()
        ],
        aggregate_properties=[
            FakeAggregationPropertyOne(),
            FakeAggregationPropertyTwo()
        ],
        aggregate_functions={
            'aggregation_property_two': [numpy.sum],
            'aggregation_property_one': [partial(listy_n_most_common, 2)]
        },
        storage=s3_storage,
        aggregation_name='fake_agg'
    )
    s3 = s3fs.S3FileSystem()
    with s3.open('s3://test-bucket/aggregations/fake_agg/2015.csv', 'rb') as f:
        num_rows = len([row for row in f])
    assert num_rows == 5
def setUp(self):
    client = boto3.resource('s3')
    bucket = client.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/computed_properties')
    description = 'This is my description'

    class MockClassifier(object):
        def predict_soc(self, document):
            assert document.strip() == description.lower()
            return '11-1234.00'

        @property
        def name(self):
            return "MockClassifier"

        @property
        def description(self):
            return "fake algorithm"

    self.computed_property = SOCClassifyProperty(
        storage=storage,
        classifier_obj=MockClassifier(),
    )
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            description=description,
            skills='',
            qualifications='',
            experienceRequirements=''
        )
    ]
    self.computed_property.compute_on_collection(self.job_postings)
def setUp(self):
    self.client = boto3.resource('s3')
    self.client.create_bucket(Bucket='test-bucket')
    self.storage = S3Store('s3://test-bucket/computed_properties')
    self.computed_property = PostingIdPresent(self.storage)
    self.job_postings = [
        utils.job_posting_factory(datePosted=self.datestring)
    ]
    self.computed_property.compute_on_collection(self.job_postings)
def setUp(self):
    self.client = boto3.resource('s3')
    self.client.create_bucket(Bucket='test-bucket')
    self.storage = S3Store('s3://test-bucket/computed_properties')
    self.computed_property = TitleCleanPhaseOne(self.storage)
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            title='Software Engineer - Tulsa'
        )
    ]
    self.computed_property.compute_on_collection(self.job_postings)
def setUp(self):
    client = boto3.resource('s3')
    bucket = client.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/computed_properties')
    cache_storage = S3Store('s3://test-bucket')

    class SampleJobGeoQuerier(JobGeographyQuerier):
        name = 'blah'
        output_columns = (('city', 'the city'),)

        def _query(self, job_posting):
            return ['Fargo']

    self.computed_property = Geography(
        geo_querier=SampleJobGeoQuerier(),
        storage=storage,
    )
    self.job_postings = [
        utils.job_posting_factory(datePosted=self.datestring)
    ]
    self.computed_property.compute_on_collection(self.job_postings)
def test_s3store(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    storage = S3Store(path="s3://fake-open-skills/apath")

    # 1. Ensure that a new file is correctly created and saved to storage
    storage_one = PersistedJSONDict(storage, 'test.json')
    storage_one['key1'] = 'value1'
    storage_one['key2'] = {'nestedkey2': 'value2'}
    storage_one.save()
    loaded = json.loads(storage.load('test.json').decode())
    assert loaded == {'key1': 'value1', 'key2': {'nestedkey2': 'value2'}}

    # 2. Ensure that an existing file is correctly read, updated, and saved back to storage
    storage_two = PersistedJSONDict(storage, 'test.json')
    assert 'key1' in storage_two
    assert storage_two['key1'] == 'value1'
    storage_two['key3'] = 'value3'
    storage_two.save()
    loaded = json.loads(storage.load('test.json').decode())
    assert loaded == {
        'key1': 'value1',
        'key2': {'nestedkey2': 'value2'},
        'key3': 'value3'
    }

    # 3. Ensure that, in the same thread, updating and saving an older handle picks up the new changes too
    storage_one['key4'] = 'value4'
    storage_one.save()
    loaded = json.loads(storage.load('test.json').decode())
    assert loaded == {
        'key1': 'value1',
        'key2': {'nestedkey2': 'value2'},
        'key3': 'value3',
        'key4': 'value4'
    }

    # 4. Test autosave - this will be the fourth update of this object
    storage_one.SAVE_EVERY_N_UPDATES = 4
    storage_one['key5'] = 'value5'
    loaded = json.loads(storage.load('test.json').decode())
    assert loaded == {
        'key1': 'value1',
        'key2': {'nestedkey2': 'value2'},
        'key3': 'value3',
        'key4': 'value4',
        'key5': 'value5'
    }

    # 5. Test length checking
    assert len(storage_one) == 5

    # 6. Test iteration
    assert sorted(
        [(key, value) for key, value in storage_one.items()],
        key=lambda x: x[0]
    ) == [
        ('key1', 'value1'),
        ('key2', {'nestedkey2': 'value2'}),
        ('key3', 'value3'),
        ('key4', 'value4'),
        ('key5', 'value5')
    ]
def setUp(self):
    client = boto3.resource('s3')
    bucket = client.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/computed_properties')
    skill_extractor = ExactMatchSkillExtractor(utils.sample_framework())
    self.computed_property = SkillCounts(
        skill_extractor=skill_extractor,
        storage=storage,
    )
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            description='reading comprehension'
        )
    ]
    self.computed_property.compute_on_collection(self.job_postings)
def test_cbsa_finder_nohits():
    client = boto3.resource('s3')
    client.create_bucket(Bucket='geobucket')
    shapefile_name = 'tests/sample_cbsa_shapefile.shp'
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    finder = CachedCBSAFinder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
        shapefile_name=shapefile_name
    )
    sample_input = {
        "bbox": {
            "northeast": [65.2, 65.8],
            "southwest": [65.2, 65.8]
        },
    }
    assert finder.query(sample_input) is None
def setUp(self):
    self.client = boto3.resource('s3')
    self.client.create_bucket(Bucket='test-bucket')
    self.storage = S3Store('s3://test-bucket/computed_properties')
    self.computed_property = TitleCleanPhaseTwo(self.storage)
    self.job_postings = [
        utils.job_posting_factory(
            datePosted=self.datestring,
            title='Software Engineer Tulsa'
        )
    ]
    with patch(
        'skills_ml.algorithms.jobtitle_cleaner.clean.negative_positive_dict',
        return_value={
            'places': ['tulsa'],
            'states': [],
            'onetjobs': ['software engineer']
        }
    ):
        self.computed_property.compute_on_collection(self.job_postings)
def test_with_iterable_pipeline(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills')
    model_storage = ModelStorage(s3)

    proxy_fake = ProxyObjectWithStorage(model_obj=FakeModel('fake'), storage=s3, model_name='fake')
    model_storage.save_model(proxy_fake, proxy_fake.model_name)

    vectorize_for_pipeline = partial(
        nlp.vectorize,
        embedding_model=SerializedByStorage(model=proxy_fake, model_name=proxy_fake.model_name)
    )
    pipe = IterablePipeline(vectorize_for_pipeline)

    s3.write(pickle.dumps(pipe), 'fake.pipe')
    pipe_unpickled = pickle.loads(s3.load('fake.pipe'))

    assert list(pipe_unpickled([1])) == [[1, 2, 3, 4]]
def test_cbsa_finder_twohits():
    client = boto3.resource('s3')
    client.create_bucket(Bucket='geobucket')
    shapefile_name = 'tests/sample_cbsa_shapefile.shp'
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    finder = CachedCBSAFinder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
        shapefile_name=shapefile_name
    )
    sample_input = {
        "bbox": {
            "northeast": [38.00, -81.05],
            "southwest": [35.13, -88.18]
        },
    }
    assert finder.query(sample_input) == (
        '40080',
        'Richmond-Berea, KY Micro Area',
    )
def test_save_load(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills')
    model_storage = ModelStorage(storage=s3)

    fake = FakeModel('fake')
    model_storage.save_model(fake, fake.model_name)

    proxy_fake = ProxyObjectWithStorage(model_obj=fake, storage=s3, model_name=fake.model_name)
    assert proxy_fake.storage == s3

    proxy_fake_unpickled = pickle.loads(pickle.dumps(proxy_fake))
    assert proxy_fake_unpickled.val == proxy_fake.val

    model_storage.save_model(proxy_fake, 'proxy_' + proxy_fake.model_name)
    proxy_fake_loaded = model_storage.load_model('proxy_' + proxy_fake.model_name)

    assert proxy_fake_loaded.val == proxy_fake.val == fake.val
def test_cbsa_finder_empty_cache():
    client = boto3.resource('s3')
    geobucket = client.create_bucket(Bucket='geobucket')
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    cbsa_finder = CachedCBSAFinder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
        shapefile_name='tests/sample_cbsa_shapefile.shp'
    )
    # set the cache to something that JSON loads as None, not an empty dict
    geobucket.put_object(Body='', Key='cbsas.json')

    geocode_results = {
        'East of Charlotte, NC': {
            "bbox": {
                "northeast": [35.2268961, -80.8461711],
                "southwest": [35.2267961, -80.8462711]
            },
        },
        'Flushing, NY': {
            "bbox": {
                "northeast": [40.7654801, -73.8173791],
                "southwest": [40.7653801, -73.8174791]
            },
        }
    }
    cbsa_finder.find_all_cbsas_and_save(geocode_results)

    new_finder = CachedCBSAFinder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
        shapefile_name='tests/sample_cbsa_shapefile.shp'
    )
    assert new_finder.all_cached_cbsa_results == {
        'East of Charlotte, NC': [
            '16740',
            'Charlotte-Concord-Gastonia, NC-SC Metro Area',
        ],
        'Flushing, NY': None
    }
def test_geocode_search_strings():
    with open('tests/sample_geocode_result.json') as f:
        sample_geocode_result = json.load(f)
    client = boto3.resource('s3')
    client.create_bucket(Bucket='geobucket')
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    geocode_result = namedtuple('GeocodeResult', ['json'])
    geocode_func = MagicMock(return_value=geocode_result(json=sample_geocode_result))
    geocoder = CachedGeocoder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
        geocode_func=geocode_func,
        sleep_time=0
    )
    geocoder.geocode_search_strings_and_save(['string1', 'string2'])

    new_geocoder = CachedGeocoder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
    )
    assert next(iter(new_geocoder.all_cached_geocodes.values())) == sample_geocode_result
def setUp(self):
    self.client = boto3.resource('s3')
    self.client.create_bucket(Bucket='test-bucket')
    self.storage = S3Store('s3://test-bucket/computed_properties')
    self.computed_property = YearlyPay(self.storage)
    self.job_postings = [
        utils.job_posting_factory(
            id=5,
            datePosted=self.datestring,
            baseSalary={
                'salaryFrequency': 'yearly',
                'minValue': 5,
                'maxValue': ''
            }
        ),
        utils.job_posting_factory(
            id=6,
            datePosted=self.datestring,
            baseSalary={
                'salaryFrequency': 'yearly',
                'minValue': '6.25',
                'maxValue': '9.25'
            }
        )
    ]
    self.computed_property.compute_on_collection(self.job_postings)
def test_embedding_trainer_multicore_s3(self):
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3_path = "s3://fake-open-skills/model_cache/embedding"
    s3_storage = S3Store(path=s3_path)
    model_storage = ModelStorage(s3_storage)

    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    job_postings_generator = JobPostingCollectionSample()
    corpus_generator = Word2VecGensimCorpusCreator(
        job_postings_generator,
        document_schema_fields=document_schema_fields)

    trainer = EmbeddingTrainer(
        FastTextModel(size=10, min_count=3, iter=4, window=6, workers=3),
        FastTextModel(size=10, min_count=3, iter=4, window=10, workers=3),
        Word2VecModel(size=10, workers=3, window=6),
        Word2VecModel(size=10, min_count=10, window=10, workers=3),
        model_storage=model_storage)
    trainer.train(corpus_generator)
    trainer.save_model()

    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set([model.model_name for model in trainer._models])
def test_pickle_s3(self):
    import boto3
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3 = S3Store('fake-open-skills/models')
    model_storage = ModelStorage(storage=s3)

    fake = FakeModel('fake')
    model_storage.save_model(fake, fake.model_name)

    s_fake = SerializedByStorage(fake, s3, fake.model_name)
    s3.write(pickle.dumps(s_fake), 'fake.pickle')
    fake_unpickled = pickle.loads(s3.load('fake.pickle'))
    # make sure the fake model itself wasn't pickled, only the storage reference
    assert fake_unpickled._model is None
    assert fake_unpickled.storage.path == s3.path
    assert fake_unpickled.val == fake.val

    # if the object to be pickled doesn't have a storage attribute and no storage
    # was provided to SerializedByStorage, it will be serialized normally
    s_fake = SerializedByStorage(model=fake, model_name=fake.model_name)
    s3.write(pickle.dumps(s_fake), 'fake.pickle')
    fake_unpickled = pickle.loads(s3.load('fake.pickle'))
    assert fake_unpickled._model is not None
def test_cbsa_finder_onehit():
    client = boto3.resource('s3')
    client.create_bucket(Bucket='geobucket')
    shapefile_name = 'tests/sample_cbsa_shapefile.shp'
    cache_storage = S3Store('geobucket')
    cache_fname = 'cbsas.json'
    finder = CachedCBSAFinder(
        cache_storage=cache_storage,
        cache_fname=cache_fname,
        shapefile_name=shapefile_name
    )
    sample_input = {
        "lng": -80.8462211,
        "ok": True,
        "location": "East of Charlotte, NC",
        "provider": "osm",
        "country": "United States of America",
        "bbox": {
            "northeast": [35.2268961, -80.8461711],
            "southwest": [35.2267961, -80.8462711]
        },
        "importance": 0.325,
        "quality": "postcode",
        "accuracy": 0.325,
        "address": "NC 28202, United States of America",
        "confidence": 10,
        "lat": 35.2268461,
        "type": "postcode",
        "place_rank": "25",
        "status_code": 200,
        "status": "OK",
        "place_id": "210190423",
        "encoding": "utf-8",
        "postal": "NC 28202"
    }
    assert finder.query(sample_input) == (
        '16740',
        'Charlotte-Concord-Gastonia, NC-SC Metro Area',
    )
def test_BratExperiment_add_allocation():
    # given a user name
    # find the next allocation to use that the user has not annotated yet
    # create a directory with the user's name
    # record in metadata the fact that the user has been allocated this

    # setup: create a bucket for the brat config
    s3 = boto3.resource('s3')
    storage = S3Store('s3://test-bucket/samples')
    s3.create_bucket(Bucket='test-bucket')
    job_postings = [job_posting_factory(id=i, description=str(i)) for i in range(100, 200)]
    sample = sample_factory(job_postings, name='300_weighted', storage=storage)

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=sample,
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(
            ('c', 'Competency'),
        )
    )

    # initialize the experiment in this bucket
    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )

    username = '******'
    # should not be able to allocate without creating a user
    with pytest.raises(ValueError):
        experiment.add_allocation(username)

    # set up a user to allocate to
    experiment.user_pw_store[username] = 'password'
    experiment.user_pw_store.save()
    allocated_directory = experiment.add_allocation(username)

    allocations = experiment.metadata['allocations'][username]
    assert len(allocations) == 1

    s3 = s3fs.S3FileSystem()
    filenames = s3.ls(allocated_directory)
    # there should be two files for each job posting: the .txt and the .ann
    assert len(filenames) == len(experiment.metadata['units'][allocations[0]]) * 2

    # simulate continued allocation with more users
    user_two = 'user_two'
    user_three = 'user_three'
    experiment.add_user(user_two, 'pass')
    experiment.add_user(user_three, 'pass')
    for i in range(0, 4):
        experiment.add_allocation(user_two)
        experiment.add_allocation(user_three)

    # at this point, trying to re-allocate to either user two or three
    # should fail as they have now tagged everything
    with pytest.raises(ValueError):
        experiment.add_allocation(user_two)

    # user one should still work for now
    for i in range(0, 4):
        new_directory = experiment.add_allocation(username)
        assert new_directory != allocated_directory

    # once they have seen the whole thing, no more!
    with pytest.raises(ValueError):
        experiment.add_allocation(username)
def test_embedding_trainer_word2vec_s3(self):
    client = boto3.client('s3')
    client.create_bucket(Bucket='fake-open-skills', ACL='public-read-write')
    s3_path = "s3://fake-open-skills/model_cache/embedding"
    s3_storage = S3Store(path=s3_path)

    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills'
    ]
    job_postings_generator = JobPostingCollectionSample(num_records=30)
    corpus_generator = Word2VecGensimCorpusCreator(
        job_postings_generator,
        document_schema_fields=document_schema_fields)

    w2v = Word2VecModel(storage=s3_storage, size=10, min_count=3, iter=4, window=6, workers=3)

    trainer = EmbeddingTrainer(corpus_generator, w2v)
    trainer.train()
    trainer.save_model()

    vocab_size = len(w2v.wv.vocab.keys())

    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert w2v.model_name == trainer.model_name
    assert set(files) == set([trainer.model_name])

    # Test online training
    job_postings_generator = JobPostingCollectionSample(num_records=50)
    corpus_generator = Word2VecGensimCorpusCreator(
        job_postings_generator,
        document_schema_fields=document_schema_fields)

    w2v_loaded = Word2VecModel.load(s3_storage, w2v.model_name)

    new_trainer = EmbeddingTrainer(corpus_generator, w2v_loaded)
    new_trainer.train()
    new_trainer.save_model()

    new_vocab_size = len(w2v_loaded.wv.vocab.keys())

    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set([new_trainer.model_name, trainer.model_name])
    assert (new_trainer.metadata['embedding_trainer']['model_name']
            != trainer.metadata['embedding_trainer']['model_name'])
    # online training on more postings should not shrink the vocabulary
    assert vocab_size <= new_vocab_size

    # Save as different name
    w2v.save('other_name.model')
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(s3_path)]
    assert set(files) == set(
        [trainer.model_name, new_trainer.model_name, 'other_name.model'])

    # Change the store directory
    new_s3_path = "s3://fake-open-skills/model_cache/embedding/other_directory"
    new_trainer.save_model(S3Store(new_s3_path))
    s3 = s3fs.S3FileSystem()
    files = [f.split('/')[-1] for f in s3.ls(new_s3_path)]
    assert set(files) == set([new_trainer.model_name])
def test_BratExperiment_start():
    # create a bucket that will contain both the source samples and BRAT config
    s3 = boto3.resource('s3')
    bucket = s3.create_bucket(Bucket='test-bucket')
    storage = S3Store('s3://test-bucket/samples')

    # create a sample.
    # sample format is one file, one job posting per line, in common schema JSON format
    job_postings = [job_posting_factory(
        id=i,
        description=str(i),
        experienceRequirements='',
        qualifications='',
        skills=''
    ) for i in range(100, 200)]
    sample = sample_factory(job_postings, name='300_weighted', storage=storage)

    experiment = BratExperiment(
        experiment_name='initial_skills_tag',
        brat_s3_path='test-bucket/brat'
    )
    experiment.start(
        sample=sample,
        minimum_annotations_per_posting=2,
        max_postings_per_allocation=20,
        entities_with_shortcuts=(
            ('c', 'Competency'),
        )
    )

    # find metadata about what it created
    s3 = s3fs.S3FileSystem()

    # first assert that some shallow metadata was passed through
    assert experiment.metadata['sample_base_path'] == 's3://test-bucket/samples'
    assert experiment.metadata['sample_name'] == '300_weighted'
    assert experiment.metadata['entities_with_shortcuts'] == (('c', 'Competency'),)
    assert experiment.metadata['minimum_annotations_per_posting'] == 2
    assert experiment.metadata['max_postings_per_allocation'] == 20

    # next look at the posting texts themselves.
    # we expect all of them to be present but split across a number of units
    units = experiment.metadata['units']
    assert len(units) == 5  # 100/20
    retrieved_descriptions = []
    for unit_name, documents in units.items():
        for posting_key, original_job_id in documents:
            # we should not expose the original posting ids;
            # otherwise we don't care what the keys are, only that they exist where we expect them to
            assert posting_key is not original_job_id
            with s3.open('{data_path}/.{unit_name}/{posting_key}.txt'.format(
                data_path=experiment.data_path,
                unit_name=unit_name,
                posting_key=posting_key
            ), mode='rb') as f:
                posting = f.read().decode('utf-8')
                retrieved_descriptions.append(posting.strip())
            # make sure that the blank annotation file is there too
            with s3.open('{data_path}/.{unit_name}/{posting_key}.ann'.format(
                data_path=experiment.data_path,
                unit_name=unit_name,
                posting_key=posting_key
            ), mode='rb') as f:
                assert len(f.read().decode('utf-8')) == 0
    # our fake descriptions were just the string values for the range numbers,
    # so that's what should get written
    assert sorted(retrieved_descriptions) == sorted([str(i) for i in range(100, 200)])

    def assert_conf_contains(conf_name, expected):
        with s3.open('{path}/{conf_name}'.format(
            path=experiment.brat_config_path,
            conf_name=conf_name
        ), 'rb') as f:
            assert expected in f.read().decode('utf-8')

    assert_conf_contains('visual.conf', '[labels]\nCompetency\n')
    assert_conf_contains('annotation.conf', '[entities]\nCompetency\n')
    assert_conf_contains('kb_shortcuts.conf', 'c Competency\n')