def _load_classifier(self, **kwargs):
    """Instantiate the nearest-neighbors classifier for this object.

    For the 'ann' type, first ensures every index file is present locally
    (downloading any missing ones from S3 into self.temporary_directory),
    then loads the Annoy index and wraps it in a NearestNeighbors instance.
    For 'knn', returns an un-indexed NearestNeighbors.

    Args:
        **kwargs: extra keyword arguments forwarded to NearestNeighbors.

    Returns:
        NearestNeighbors: the configured classifier, or None when
        self.classifier_type is not a supported type.
    """
    if self.classifier_type == 'ann':
        for f in list_files(self.s3_conn, self.s3_path):
            filepath = os.path.join(self.temporary_directory, f)
            if not os.path.exists(filepath):
                # Log the same source path the download actually uses;
                # this is routine activity, so INFO (matches the sibling
                # download helpers), not WARNING.
                source = os.path.join(self.s3_path, f)
                logging.info('calling download from %s to %s', source, filepath)
                download(self.s3_conn, filepath, source)
        ann_index = AnnoyIndexer()
        ann_index.load(
            os.path.join(self.temporary_directory, self.classifier_id + '.index'))
        return NearestNeighbors(s3_conn=self.s3_conn, indexer=ann_index, **kwargs)
    elif self.classifier_type == 'knn':
        return NearestNeighbors(s3_conn=self.s3_conn, indexed=False, **kwargs)
    else:
        # Use logging (not print) for consistency with the rest of the module;
        # callers still receive None for unsupported types.
        logging.warning('classifier type %r not implemented yet', self.classifier_type)
        return None
def test_list_files():
    """list_files should return the bare key names found under an S3 prefix."""
    conn = boto.connect_s3()
    bucket = conn.create_bucket('test-bucket')
    stored_key = boto.s3.key.Key(bucket=bucket, name='apath/test.json')
    stored_key.set_contents_from_string('some contents')
    assert list_files(conn, 'test-bucket/apath/') == ['test.json']
def test_embedding_trainer():
    """End-to-end check of EmbeddingTrainer against a (mocked) S3 bucket.

    Uploads one sample job posting, trains a doc2vec and then a word2vec
    model, and verifies the set of artifact files written to S3 (and, for
    doc2vec, saved locally) plus metadata round-tripping via load().
    """
    s3_conn = boto.connect_s3()
    bucket_name = 'fake-jb-bucket'
    bucket = s3_conn.create_bucket(bucket_name)
    job_posting_name = 'FAKE_jobposting'
    s3_prefix_jb = 'fake-jb-bucket/job_postings'
    s3_prefix_model = 'fake-jb-bucket/model_cache/embedding/'
    quarters = '2011Q1'
    # Stage the sample document on S3 under the quarter's prefix.
    with tempfile.TemporaryDirectory() as td:
        with open(os.path.join(td, job_posting_name), 'w') as handle:
            json.dump(sample_document, handle)
        upload(s3_conn, os.path.join(td, job_posting_name), os.path.join(s3_prefix_jb, quarters))
    # Doc2Vec: expect model + lookup + metadata artifacts on S3.
    trainer = EmbeddingTrainer(s3_conn=s3_conn, quarters=['2011Q1'], jp_s3_path=s3_prefix_jb, model_s3_path=s3_prefix_model, model_type='doc2vec')
    trainer.train()
    files = list_files(s3_conn, os.path.join(s3_prefix_model, 'doc2vec_gensim_' + trainer.training_time))
    assert len(files) == 3
    assert files == ['doc2vec_gensim_' + trainer.training_time + '.model', 'lookup_doc2vec_gensim_' + trainer.training_time + '.json', 'metadata_doc2vec_gensim_' + trainer.training_time + '.json']
    # save_model should write the same three artifacts locally.
    with tempfile.TemporaryDirectory() as td:
        trainer.save_model(td)
        assert set(os.listdir(td)) == set(['doc2vec_gensim_' + trainer.training_time + '.model', 'lookup_doc2vec_gensim_' + trainer.training_time + '.json', 'metadata_doc2vec_gensim_' + trainer.training_time + '.json'])
    # Word2Vec: no lookup file, so only model + metadata on S3.
    trainer = EmbeddingTrainer(s3_conn=s3_conn, quarters=['2011Q1'], jp_s3_path=s3_prefix_jb, model_s3_path=s3_prefix_model, model_type='word2vec')
    trainer.train()
    files = list_files(s3_conn, os.path.join(s3_prefix_model, 'word2vec_gensim_' + trainer.training_time))
    assert len(files) == 2
    assert files == ['metadata_word2vec_gensim_' + trainer.training_time + '.json', 'word2vec_gensim_' + trainer.training_time + '.model']
    # Loading the saved model into a fresh trainer must round-trip metadata.
    new_trainer = EmbeddingTrainer(s3_conn=s3_conn, quarters=['2011Q1'], jp_s3_path=s3_prefix_jb, model_s3_path=s3_prefix_model, model_type='word2vec')
    new_trainer.load(trainer.modelname, s3_prefix_model)
    assert new_trainer.metadata['metadata']['hyperparameters'] == \
        trainer.metadata['metadata']['hyperparameters']
def download_ann_classifier_files(s3_prefix, classifier_id, download_directory, s3_conn):
    """Download every S3 file belonging to an ANN classifier into a local directory.

    A file lock serializes concurrent callers targeting the same directory so
    only one process downloads at a time; files already present locally are
    skipped.

    Args:
        s3_prefix (str): S3 path prefix under which classifiers are stored.
        classifier_id (str): identifier appended to the prefix to locate files.
        download_directory (str): local directory to download files into.
        s3_conn: boto S3 connection used for listing and downloading.
    """
    lock = filelock.FileLock(os.path.join(download_directory, 'ann_dl.lock'))
    with lock.acquire(timeout=1000):
        s3_path = s3_prefix + classifier_id
        files = list_files(s3_conn, s3_path)
        for f in files:
            filepath = os.path.join(download_directory, f)
            if not os.path.exists(filepath):
                # Build the source path once so the log message and the
                # actual download agree (previously the log concatenated
                # without a separator while download() used os.path.join).
                source = os.path.join(s3_path, f)
                logging.info('calling download from %s to %s', source, filepath)
                download(s3_conn, filepath, source)
            else:
                logging.info('%s already exists, not downloading', filepath)
def _load_model(self):
    """Download the model files from S3 and load the model into memory.

    Returns:
        gensim.models.doc2vec.Doc2Vec: The word-embedding model object.
    """
    files = list_files(self.s3_conn, self.s3_path)
    with tempfile.TemporaryDirectory() as td:
        for f in files:
            filepath = os.path.join(td, f)
            # td was just created, so the previous os.path.exists(filepath)
            # guard could never be True — every file is always downloaded.
            # Log the same joined source path the download actually uses.
            source = os.path.join(self.s3_path, f)
            logging.info('calling download from %s to %s', source, filepath)
            download(self.s3_conn, filepath, source)
        # Load while the temp dir still exists; gensim reads from disk here.
        model = Doc2Vec.load(os.path.join(td, self.model_name + ".model"))
        return model