    def test_two_filters(self):
        major_group_27_filter = lambda job: job['onet_soc_code'][:2] != '27'
        major_group_49_filter = lambda job: job['onet_soc_code'][:2] != '49'
        soc_target = SOCMajorGroup(
            [major_group_27_filter, major_group_49_filter])

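        # Map postings whose SOC code falls in major group 27 or 49 to None
        # so they are excluded from both pipelines.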
        def new_filter(doc):
            if soc_target.filter_func(doc):
                return doc
            else:
                return None

        document_schema_fields = [
            'description', 'experienceRequirements', 'qualifications', 'skills'
        ]
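        # Feature pipeline: filter, join the schema fields into one string,
        # clean and tokenize the text, then vectorize with the embedding model.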
        pipe_x = IterablePipeline(
            new_filter,
            partial(nlp.fields_join,
                    document_schema_fields=document_schema_fields),
            nlp.clean_str, nlp.word_tokenize,
            partial(nlp.vectorize, embedding_model=self.embedding_model))

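        # Target pipeline: apply the same filter, then encode each posting's SOC major group.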
        pipe_y = IterablePipeline(new_filter, soc_target.transformer)

        matrix = DesignMatrix(self.jobpostings, soc_target, pipe_x, pipe_y)
        matrix.build()
        assert '27' not in matrix.target_variable.encoder.inverse_transform(
            matrix.y)
        assert '49' not in matrix.target_variable.encoder.inverse_transform(
            matrix.y)
    def test_combined_cls_local(self, mock_getcwd):
        with tempfile.TemporaryDirectory() as td:
            mock_getcwd.return_value = td
            model_storage = ModelStorage(FSStore(td))
            jobpostings = JobPostingCollectionSample()
            corpus_generator = Word2VecGensimCorpusCreator(jobpostings,
                                                           raw=True)
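            # Train a small word2vec embedding on the raw job posting corpus.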
            w2v = Word2VecModel(size=10,
                                min_count=0,
                                alpha=0.025,
                                min_alpha=0.025)
            trainer = EmbeddingTrainer(w2v, model_storage=model_storage)
            trainer.train(corpus_generator, lookup=True)

            matrix = DesignMatrix(jobpostings, self.major_group, self.pipe_x,
                                  self.pipe_y)
            matrix.build()

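            # Fit a random forest (wrapped in a storage proxy) on the design matrix features.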
            X = matrix.X
            rf = ProxyObjectWithStorage(RandomForestClassifier(), None, None,
                                        matrix.target_variable)
            rf.fit(X, matrix.y)

            proxy_rf = ProxyObjectWithStorage(rf, None, None,
                                              matrix.target_variable)
            # Remove the last step in the pipe_x
            # the input of predict_soc should be tokenized words
            new_pipe_x = self.pipe_x
            new_pipe_x.generators.pop()

            new_matrix = DesignMatrix(JobPostingCollectionSample(),
                                      self.major_group, new_pipe_x)
            new_matrix.build()
            ccls = CombinedClassifier(w2v, rf)
            assert len(ccls.predict_soc([new_matrix.X[0]])[0]) == 2

    def test_tester(self):
        document_schema_fields = ['description', 'experienceRequirements', 'qualifications', 'skills']
        corpus_generator = Word2VecGensimCorpusCreator(JobPostingCollectionSample(num_records=30), document_schema_fields=document_schema_fields)
        w2v = Word2VecModel(size=10, min_count=3, iter=4, window=6, workers=3)
        trainer = EmbeddingTrainer(w2v)
        trainer.train(corpus_generator)

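        # Use the first 30 postings for training and the remainder for testing.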
        jp = JobPostingCollectionSample()
        train_gen = islice(jp, 30)
        test_gen = islice(jp, 30, None)
        train_matrix = DesignMatrix(train_gen, self.fullsoc, self.pipe_x, self.pipe_y)
        train_matrix.build()
        occ_trainer = OccupationClassifierTrainer(train_matrix, 2, grid_config=self.grid_config)
        occ_trainer.train(save=False)
        cc = CombinedClassifier(w2v, occ_trainer.best_estimators[0])

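        # Reuse every preprocessing step except the final vectorize step;
        # the combined classifier expects tokenized words as input.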
        steps = self.pipe_x.generators[:-1]

        # Keep only test postings that have a non-empty SOC code.
        test_gen = (t for t in test_gen if t['onet_soc_code'] != '')

        tester = OccupationClassifierTester(test_data_generator=test_gen, preprocessing=steps, classifier=cc)
        result = list(tester)

        assert len(tester) == len(result) == 18

    def matrix(self):
        jp_f = JobPostingFilterer(self.jobpostings, [self.has_soc_filter])
        matrix = DesignMatrix(jp_f, self.major_group, self.pipe_x, self.pipe_y)
        matrix.build()
        return matrix

document_schema_fields = ['description', 'experienceRequirements', 'qualifications', 'skills']
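# Feature pipeline: filter postings, join the schema fields, clean, tokenize,
# and vectorize with the word2vec model; the target pipeline applies the same
# filter followed by the full-SOC transformer.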
pipe_x = IterablePipeline(
    basic_filter,
    partial(nlp.fields_join, document_schema_fields=document_schema_fields),
    nlp.clean_str,
    nlp.word_tokenize,
    partial(nlp.vectorize, embedding_model=w2v)
)
pipe_y = IterablePipeline(
    basic_filter,
    full_soc.transformer
)

matrix = DesignMatrix(
        data_source_generator=JobGenerator(train_data),
        target_variable=full_soc,
        pipe_X=pipe_x,
        pipe_y=pipe_y)

matrix.build()

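# Hyperparameter grid keyed by scikit-learn classifier path, in the format
# passed to OccupationClassifierTrainer via grid_config above.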
grid_config = {
                 'sklearn.ensemble.ExtraTreesClassifier': {
                     'n_estimators': [50, 100, 500, 1000],
                     'criterion': ['entropy'],
                     'max_depth': [20, 50],
                     'max_features': ['log2'],
                     'min_samples_split': [10, 20]
                      },
                 'sklearn.ensemble.RandomForestClassifier': {
                     'n_estimators': [50, 100, 500, 1000],