예제 #1
0
    def _load_classifier(self, **kwargs):
        """Build the nearest-neighbour classifier chosen by ``self.classifier_type``.

        ``'ann'``: every file listed under ``self.s3_path`` that is not yet in
        ``self.temporary_directory`` is downloaded there, the file
        ``<classifier_id>.index`` is loaded into an ``AnnoyIndexer`` and a
        ``NearestNeighbors`` using that indexer is returned.

        ``'knn'``: a ``NearestNeighbors`` created with ``indexed=False`` is
        returned; nothing is downloaded.

        Args:
            **kwargs: Forwarded unchanged to ``NearestNeighbors``.

        Returns:
            The ``NearestNeighbors`` instance, or ``None`` when the classifier
            type is neither ``'ann'`` nor ``'knn'``.
        """
        if self.classifier_type == 'ann':
            for f in list_files(self.s3_conn, self.s3_path):
                filepath = os.path.join(self.temporary_directory, f)
                if not os.path.exists(filepath):
                    # Build the source key once so the log line shows exactly
                    # what is handed to download(); plain concatenation drops
                    # the separator when s3_path has no trailing slash.
                    s3_key = os.path.join(self.s3_path, f)
                    logging.info('calling download from %s to %s',
                                 s3_key, filepath)
                    download(self.s3_conn, filepath, s3_key)
            ann_index = AnnoyIndexer()
            ann_index.load(
                os.path.join(self.temporary_directory,
                             self.classifier_id + '.index'))
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexer=ann_index,
                                    **kwargs)

        if self.classifier_type == 'knn':
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexed=False,
                                    **kwargs)

        # Unknown type: report through logging (not stdout) and keep returning
        # None so existing callers that check for it keep working.
        logging.warning('Not implemented yet! (classifier_type=%r)',
                        self.classifier_type)
        return None
예제 #2
0
def test_list_files():
    """list_files returns bare file names for keys stored under an S3 path."""
    connection = boto.connect_s3()
    test_bucket = connection.create_bucket('test-bucket')

    # Store a single object below the 'apath/' prefix.
    stored_key = boto.s3.key.Key(bucket=test_bucket, name='apath/test.json')
    stored_key.set_contents_from_string('some contents')

    listed = list_files(connection, 'test-bucket/apath/')
    assert listed == ['test.json']
예제 #3
0
def test_embedding_trainer():
    """Train doc2vec and word2vec models from one uploaded posting and check their files.

    Checks the file names listed under the model S3 prefix after training, the
    file names written by ``save_model`` into a local directory, and that a
    second trainer loading the word2vec model reports the same hyperparameters.
    """
    s3_conn = boto.connect_s3()
    s3_conn.create_bucket('fake-jb-bucket')

    postings_prefix = 'fake-jb-bucket/job_postings'
    models_prefix = 'fake-jb-bucket/model_cache/embedding/'

    # Write the sample posting to a scratch file and upload it under the quarter.
    with tempfile.TemporaryDirectory() as scratch:
        local_posting = os.path.join(scratch, 'FAKE_jobposting')
        with open(local_posting, 'w') as out:
            json.dump(sample_document, out)
        upload(s3_conn, local_posting, os.path.join(postings_prefix, '2011Q1'))

    def make_trainer(model_type):
        # Each trainer gets its own quarters list.
        return EmbeddingTrainer(
            s3_conn=s3_conn,
            quarters=['2011Q1'],
            jp_s3_path=postings_prefix,
            model_s3_path=models_prefix,
            model_type=model_type,
        )

    def stored_files(stem):
        # Names of the files saved under the model prefix for one model stem.
        return list_files(s3_conn, os.path.join(models_prefix, stem))

    # Doc2Vec
    d2v = make_trainer('doc2vec')
    d2v.train()

    def d2v_artifacts():
        # Expected file names, derived from the trainer's current training_time.
        stem = 'doc2vec_gensim_' + d2v.training_time
        return [stem + '.model',
                'lookup_' + stem + '.json',
                'metadata_' + stem + '.json']

    listed = stored_files('doc2vec_gensim_' + d2v.training_time)
    assert len(listed) == 3
    assert listed == d2v_artifacts()

    with tempfile.TemporaryDirectory() as export_dir:
        d2v.save_model(export_dir)
        assert set(os.listdir(export_dir)) == set(d2v_artifacts())

    # Word2Vec
    w2v = make_trainer('word2vec')
    w2v.train()
    w2v_stem = 'word2vec_gensim_' + w2v.training_time
    listed = stored_files(w2v_stem)
    assert len(listed) == 2
    assert listed == ['metadata_' + w2v_stem + '.json',
                      w2v_stem + '.model']

    # A second trainer loading the saved model must see the same hyperparameters.
    reloaded = make_trainer('word2vec')
    reloaded.load(w2v.modelname, models_prefix)
    saved_params = w2v.metadata['metadata']['hyperparameters']
    assert reloaded.metadata['metadata']['hyperparameters'] == saved_params
예제 #4
0
def download_ann_classifier_files(s3_prefix, classifier_id, download_directory,
                                  s3_conn):
    """Download a classifier's files from S3 into ``download_directory``.

    The S3 path listed is ``s3_prefix + classifier_id``. A file lock on
    ``ann_dl.lock`` inside ``download_directory`` is held for the whole
    list-and-download pass (acquired with ``timeout=1000``). Files that
    already exist locally are skipped.

    Args:
        s3_prefix: String prepended to ``classifier_id`` to form the S3 path.
        classifier_id: Identifier appended to ``s3_prefix``.
        download_directory: Local directory that receives the files and holds
            the lock file.
        s3_conn: Connection object passed to ``list_files`` and ``download``.
    """
    lock = filelock.FileLock(os.path.join(download_directory, 'ann_dl.lock'))
    with lock.acquire(timeout=1000):
        s3_path = s3_prefix + classifier_id
        for f in list_files(s3_conn, s3_path):
            filepath = os.path.join(download_directory, f)
            if os.path.exists(filepath):
                logging.info('%s already exists, not downloading', filepath)
                continue
            # s3_path has no trailing separator, so the key must be joined;
            # log that same key rather than a bare concatenation.
            s3_key = os.path.join(s3_path, f)
            logging.info('calling download from %s to %s', s3_key, filepath)
            download(s3_conn, filepath, s3_key)
예제 #5
0
    def _load_model(self):
        """Download the model files from S3 and load the model into memory.

        Every file listed under ``self.s3_path`` is downloaded into a
        temporary directory, ``<model_name>.model`` is loaded from there with
        ``Doc2Vec.load``, and the temporary directory is removed on exit.

        Returns:
            gensim.models.doc2vec.Doc2Vec: The word-embedding model object.
        """
        files = list_files(self.s3_conn, self.s3_path)
        with tempfile.TemporaryDirectory() as td:
            for f in files:
                filepath = os.path.join(td, f)
                # The temporary directory starts empty, so this only skips a
                # name that was already fetched earlier in this loop.
                if not os.path.exists(filepath):
                    # Log the same key that is passed to download(); plain
                    # concatenation drops the separator when s3_path has no
                    # trailing slash.
                    s3_key = os.path.join(self.s3_path, f)
                    logging.info('calling download from %s to %s', s3_key, filepath)
                    download(self.s3_conn, filepath, s3_key)
            model = Doc2Vec.load(os.path.join(td, self.model_name + ".model"))

            return model