def test_filtering(self):
    """Check that filters given to ``SOCMajorGroup`` remove labels from the matrix.

    Three training sets are built from ``self.jobpostings``: one with no
    filter, one with a single filter rejecting codes starting with '27', and
    one with a list of filters rejecting '27' and '49'. The labels decoded
    from each matrix are checked for presence/absence of those prefixes.
    """

    def reject_prefix(prefix):
        # Predicate factory: keep a posting unless the first two characters
        # of its 'onet_soc_code' equal ``prefix``.
        return lambda job: job['onet_soc_code'][:2] != prefix

    def build(target):
        # Build a training matrix for ``target`` from the shared fixtures.
        return create_training_set(
            self.jobpostings,
            target,
            self.embedding_model,
        )

    def decoded_labels(built):
        # Map the encoded ``y`` values back to their original label strings.
        return built.target_variable.encoder.inverse_transform(built.y)

    without_27 = reject_prefix('27')
    without_49 = reject_prefix('49')

    # No filter: group '27' is expected among the labels.
    unfiltered = build(SOCMajorGroup())
    assert '27' in decoded_labels(unfiltered)

    # A single filter passed directly (not wrapped in a list).
    single_filtered = build(SOCMajorGroup(without_27))
    assert '27' not in decoded_labels(single_filtered)

    # Several filters passed as a list.
    double_filtered = build(SOCMajorGroup([without_27, without_49]))
    assert '27' not in decoded_labels(double_filtered)
    assert '49' not in decoded_labels(double_filtered)
def test_two_filters(self):
    """Check two filters when the pipelines are assembled by hand.

    Instead of calling ``create_training_set``, this test wires up the X and
    y ``IterablePipeline`` objects itself, builds a ``DesignMatrix`` from
    them, and asserts that neither '27' nor '49' appears among the decoded
    labels.
    """

    def reject_prefix(prefix):
        # Predicate factory: keep a posting unless the first two characters
        # of its 'onet_soc_code' equal ``prefix``.
        return lambda job: job['onet_soc_code'][:2] != prefix

    soc_target = SOCMajorGroup([reject_prefix('27'), reject_prefix('49')])

    def new_filter(doc):
        # Pass the document through when the target's filter accepts it;
        # otherwise hand back None.
        return doc if soc_target.filter_func(doc) else None

    document_schema_fields = [
        'description',
        'experienceRequirements',
        'qualifications',
        'skills',
    ]

    # Feature pipeline stages, in the order they are applied.
    join_fields = partial(
        nlp.fields_join,
        document_schema_fields=document_schema_fields,
    )
    vectorize = partial(nlp.vectorize, embedding_model=self.embedding_model)
    feature_pipeline = IterablePipeline(
        new_filter,
        join_fields,
        nlp.clean_str,
        nlp.word_tokenize,
        vectorize,
    )

    # Label pipeline: same filter, then the target's own transformer.
    label_pipeline = IterablePipeline(new_filter, soc_target.transformer)

    matrix = DesignMatrix(
        self.jobpostings,
        soc_target,
        feature_pipeline,
        label_pipeline,
    )
    matrix.build()

    encoder = matrix.target_variable.encoder
    assert '27' not in encoder.inverse_transform(matrix.y)
    assert '49' not in encoder.inverse_transform(matrix.y)
def test_create_training_set(self):
    """Check the basic shape and metadata of a matrix from ``create_training_set``.

    The postings are first passed through ``JobPostingFilterer`` with
    ``self.has_soc_filter`` and materialized into a list, so the number of
    rows in ``X`` and ``y`` can be compared against the list length.
    """
    filtered_postings = list(
        JobPostingFilterer(self.jobpostings, [self.has_soc_filter])
    )
    expected_rows = len(filtered_postings)

    matrix = create_training_set(
        filtered_postings,
        SOCMajorGroup(),
        self.embedding_model,
    )
    target = matrix.target_variable

    assert target.name == "major_group"
    # One feature row and one label per filtered posting.
    assert matrix.X.shape[0] == expected_rows
    assert matrix.y.shape[0] == expected_rows
    assert matrix.embedding_model == self.embedding_model
    # Encoded class 0 is expected to decode to '11'.
    assert target.encoder.inverse_transform([0]) == '11'
def test_training(self):
    """Train an ``OccupationClassifierTrainer`` and check its results.

    Builds a training matrix from the filtered postings, trains with
    ``k_folds=2`` using the ``grid`` configuration and accuracy scoring, then
    checks the classifier names recorded under 'accuracy' and that the
    trainer's matrix carries the same embedding model name as the fixture.
    """
    filtered_postings = JobPostingFilterer(
        self.jobpostings,
        [self.has_soc_filter],
    )
    matrix = create_training_set(
        filtered_postings,
        SOCMajorGroup(),
        self.embedding_model,
    )
    assert matrix.target_variable.name == "major_group"

    occ_trainer = OccupationClassifierTrainer(
        matrix,
        k_folds=2,
        grid_config=grid,
        scoring=['accuracy'],
    )
    occ_trainer.train()

    # Only the classifier named 'ExtraTreesClassifier' should have results.
    trained_names = list(occ_trainer.cls_cv_result['accuracy'].keys())
    assert trained_names == ['ExtraTreesClassifier']

    trainer_model_name = occ_trainer.matrix.embedding_model.model_name
    assert trainer_model_name == self.embedding_model.model_name
def test_combined_cls_local(self, mock_getcwd):
    """Run a ``CombinedClassifier`` end to end inside a temporary directory.

    Trains a small ``Word2VecModel`` on the sample postings, fits a
    ``RandomForestClassifier`` on the embedded features, combines the two,
    and asserts that the first prediction from ``predict_soc`` has length 2.

    ``mock_getcwd`` has its ``return_value`` set to the temporary directory.
    NOTE(review): it is presumably injected by a patch decorator that is
    outside this view -- confirm.
    """
    with tempfile.TemporaryDirectory() as workdir:
        mock_getcwd.return_value = workdir

        postings = list(JobPostingCollectionSample())
        corpus = Word2VecGensimCorpusCreator(postings, raw=True)
        embedding = Word2VecModel(
            storage=FSStore(workdir),
            size=10,
            min_count=0,
            alpha=0.025,
            min_alpha=0.025,
        )
        embedding_trainer = EmbeddingTrainer(corpus, embedding)
        embedding_trainer.train(True)

        # No embedding model is passed here; the features are transformed
        # explicitly with ``EmbeddingTransformer`` on the next line.
        matrix = create_training_set(postings, SOCMajorGroup())
        features = EmbeddingTransformer(embedding).transform(matrix.X)

        forest = RandomForestClassifier()
        forest.fit(features, matrix.y)

        combined = CombinedClassifier(embedding, forest, matrix.target_variable)
        first_prediction = combined.predict_soc([matrix.X[0]])[0]
        assert len(first_prediction) == 2
def major_group(self):
    """Return a new ``SOCMajorGroup`` constructed with no arguments."""
    target = SOCMajorGroup()
    return target