def test_two_filters(self):
    """Build a DesignMatrix whose target excludes SOC major groups 27 and
    49, then verify neither group survives into the encoded target.
    """
    # PEP 8 (E731): named predicates should be defs, not lambdas bound to names.
    def major_group_27_filter(job):
        return job['onet_soc_code'][:2] != '27'

    def major_group_49_filter(job):
        return job['onet_soc_code'][:2] != '49'

    soc_target = SOCMajorGroup(
        [major_group_27_filter, major_group_49_filter])

    def new_filter(doc):
        # Pass the document through only when it survives the SOC filter;
        # the pipeline treats None as "drop this record".
        return doc if soc_target.filter_func(doc) else None

    document_schema_fields = [
        'description',
        'experienceRequirements',
        'qualifications',
        'skills',
    ]
    pipe_x = IterablePipeline(
        new_filter,
        partial(nlp.fields_join, document_schema_fields=document_schema_fields),
        nlp.clean_str,
        nlp.word_tokenize,
        partial(nlp.vectorize, embedding_model=self.embedding_model))
    pipe_y = IterablePipeline(new_filter, soc_target.transformer)
    matrix = DesignMatrix(self.jobpostings, soc_target, pipe_x, pipe_y)
    matrix.build()
    # Decode once instead of running inverse_transform twice.
    decoded = matrix.target_variable.encoder.inverse_transform(matrix.y)
    assert '27' not in decoded
    assert '49' not in decoded
def test_combined_cls_local(self, mock_getcwd):
    """Train a Word2Vec embedding and a random forest on sample postings,
    then check CombinedClassifier.predict_soc returns a 2-element result
    (presumably (soc_code, probability) — TODO confirm against
    CombinedClassifier's contract).
    """
    with tempfile.TemporaryDirectory() as td:
        mock_getcwd.return_value = td
        model_storage = ModelStorage(FSStore(td))
        jobpostings = JobPostingCollectionSample()
        corpus_generator = Word2VecGensimCorpusCreator(jobpostings, raw=True)
        w2v = Word2VecModel(
            size=10, min_count=0, alpha=0.025, min_alpha=0.025)
        trainer = EmbeddingTrainer(w2v, model_storage=model_storage)
        trainer.train(corpus_generator, lookup=True)
        matrix = DesignMatrix(
            jobpostings, self.major_group, self.pipe_x, self.pipe_y)
        matrix.build()
        X = matrix.X
        rf = ProxyObjectWithStorage(
            RandomForestClassifier(), None, None, matrix.target_variable)
        rf.fit(X, matrix.y)
        # Drop the last pipe_x step (vectorize): predict_soc expects
        # tokenized words, not vectors. Build a fresh pipeline rather than
        # mutating the shared self.pipe_x fixture — the original aliased
        # self.pipe_x and popped from its generators in place, which would
        # leak the shortened pipeline into other tests on this instance.
        new_pipe_x = IterablePipeline(*self.pipe_x.generators[:-1])
        new_matrix = DesignMatrix(
            JobPostingCollectionSample(), self.major_group, new_pipe_x)
        new_matrix.build()
        ccls = CombinedClassifier(w2v, rf)
        assert len(ccls.predict_soc([new_matrix.X[0]])[0]) == 2
def test_tester(self):
    """Train a classifier on the first 30 postings, test on the rest, and
    verify the tester yields one result per SOC-labeled test posting (18).
    """
    document_schema_fields = [
        'description', 'experienceRequirements', 'qualifications', 'skills']
    corpus_generator = Word2VecGensimCorpusCreator(
        JobPostingCollectionSample(num_records=30),
        document_schema_fields=document_schema_fields)
    w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)
    trainer = EmbeddingTrainer(w2v)
    trainer.train(corpus_generator)
    jp = JobPostingCollectionSample()
    # Split the sample lazily: first 30 for training, remainder for testing.
    train_gen = islice(jp, 30)
    test_gen = islice(jp, 30, None)
    train_matrix = DesignMatrix(
        train_gen, self.fullsoc, self.pipe_x, self.pipe_y)
    train_matrix.build()
    occ_trainer = OccupationClassifierTrainer(
        train_matrix, 2, grid_config=self.grid_config)
    occ_trainer.train(save=False)
    cc = CombinedClassifier(w2v, occ_trainer.best_estimators[0])
    # All pipe_x steps except the final vectorize step.
    steps = self.pipe_x.generators[:-1]
    # Bug fix: the original used `is not ''`, which tests object identity
    # rather than string equality (and raises SyntaxWarning on CPython
    # 3.8+); `!=` is the correct comparison for filtering unlabeled rows.
    test_gen = (t for t in test_gen if t['onet_soc_code'] != '')
    tester = OccupationClassifierTester(
        test_data_generator=test_gen, preprocessing=steps, classifier=cc)
    result = list(tester)
    assert len(tester) == len(result) == 18
def matrix(self):
    """Return a built DesignMatrix over the SOC-labeled job postings.

    Postings without a SOC code are excluded via ``self.has_soc_filter``
    before the matrix is constructed.
    """
    labeled_postings = JobPostingFilterer(
        self.jobpostings, [self.has_soc_filter])
    design = DesignMatrix(
        labeled_postings, self.major_group, self.pipe_x, self.pipe_y)
    design.build()
    return design
document_schema_fields = ['description', 'experienceRequirements', 'qualifications', 'skills'] pipe_x = IterablePipeline( basic_filter, partial(nlp.fields_join, document_schema_fields=document_schema_fields), nlp.clean_str, nlp.word_tokenize, partial(nlp.vectorize, embedding_model=w2v) ) pipe_y = IterablePipeline( basic_filter, full_soc.transformer ) matrix = DesignMatrix( data_source_generator=JobGenerator(train_data), target_variable=full_soc, pipe_X=pipe_x, pipe_y=pipe_y) matrix.build() grid_config = { 'sklearn.ensemble.ExtraTreesClassifier': { 'n_estimators': [50, 100, 500, 1000], 'criterion': ['entropy'], 'max_depth': [20, 50], 'max_features': ['log2'], 'min_samples_split': [10, 20] }, 'sklearn.ensemble.RandomForestClassifier': { 'n_estimators': [50, 100, 500, 1000],