Example #1
def test_onet_skill_extractor_knowledge():
    with tempfile.NamedTemporaryFile(mode='w+') as outputfile:
        extractor = OnetSkillListProcessor(output_filename=outputfile.name,
                                           onet_source=MockOnetSkillCache(),
                                           hash_function=md5,
                                           ksa_types=['knowledge'])
        extractor.run()
        outputfile.seek(0)
        output = pd.read_csv(outputfile, sep='\t').T.to_dict().values()

        # +2 base rows in the input Knowledge file
        assert len(output) == 2

        assert len([row for row in output
                    if row['ksa_type'] == 'knowledge']) == 2

        # make sure uuid is hashed version of the KSA
        for row in output:
            assert row['skill_uuid'] == md5(row['ONET KSA'])
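MockOnetSkillCache is defined elsewhere in this test module; below is a minimal self-contained sketch of its contract, modeled on the inline mock shown in Example #5 (dataset file name in, path to a tab-separated temp file out), with abbreviated fake rows:

import contextlib
import csv
import tempfile

class MockOnetSkillCache(object):
    # Sketch: map O*NET dataset file names to fake rows and yield the path
    # of a tab-separated temp file containing them. The real fixture also
    # covers Skills.txt, Abilities.txt, etc., with the full column set.
    fake_data_lookup = {
        'Knowledge.txt': [
            ['O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID', 'Data Value'],
            ['11-1011.00', '2.C.1.a', 'Administration and Management', 'LV', '6.23'],
        ],
    }

    @contextlib.contextmanager
    def ensure_file(self, dataset):
        with tempfile.NamedTemporaryFile(mode='w+', suffix='.txt') as f:
            csv.writer(f, delimiter='\t').writerows(self.fake_data_lookup[dataset])
            f.flush()
            yield f.name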
Example #2
    @property
    def sequence_tagged_annotations(self):
        """Fetch sequence tagged annotations

        Expects these annotations to be produced by BRAT in CoNLL format.
        Returns: (dict), keys are tuples of (job posting id, tagger_id)
            and values are lists of (entity, token) tuples
        """
        annotations_by_posting_and_user = {}
        for user_name, unit_names in self.metadata['allocations'].items():
            for unit_name in unit_names:
                posting_id_lookup = dict(self.metadata['units'][unit_name])
                allocation_path = self.allocation_path(user_name, unit_name)
                for key in self.s3.ls(allocation_path + '/'):
                    # this will iterate through posting text (.txt), annotation (.ann),
                    # and CoNLL (.conll) files. In this case we only care about conll
                    if key.endswith('.conll'):
                        posting_key = key.split('/')[-1].replace('.conll', '')
                        with self.s3.open(key) as f:
                            logging.info('Reading conll file at %s', key)
                            job_posting_id = posting_id_lookup[int(posting_key)]
                            raw_tokens = csv.reader(f, delimiter='\t')

                            tokens = []
                            for token_line in raw_tokens:
                                logging.info('Found token line %s', token_line)
                                if len(token_line) == 0:
                                    tokens.append((None, None))
                                else:
                                    tag, _, _, token = token_line
                                    tokens.append((tag, token))

                            annotation_key = (job_posting_id, md5(user_name))
                            if any(token for token in tokens
                                   if token[0] not in {'O', None}):
                                annotations_by_posting_and_user[annotation_key] = tokens
                            else:
                                logging.warning(
                                    'No annotations found in file. Skipping')

        return annotations_by_posting_and_user
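A sketch of consuming the property; the return shape follows the docstring and the expected_tokens fixture in Example #4 (here `experiment` is assumed to be a configured BratExperiment):

for (job_posting_id, tagger_id), tokens in experiment.sequence_tagged_annotations.items():
    # tokens looks like [('O', 'this'), ('B-SKILL', 'is'), ('O', 'python')];
    # blank CoNLL lines (sentence boundaries) come through as (None, None)
    tagged = [(tag, token) for tag, token in tokens if tag not in {'O', None}]
    print(job_posting_id, tagger_id, tagged)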
Example #3
def test_onet_skill_extractor_all():
    with tempfile.NamedTemporaryFile(mode='w+') as outputfile:
        extractor = OnetSkillListProcessor(output_filename=outputfile.name,
                                           onet_source=MockOnetSkillCache(),
                                           hash_function=md5)
        extractor.run()
        outputfile.seek(0)
        output = pd.read_csv(outputfile, sep='\t').T.to_dict().values()

        # +17 base rows in input across the K,S,A,T files
        # -7 rows that don't have scale LV
        # -1 row that has 'Not Relevant' marked Y
        # -1 row that has 'Data Value' below 0
        # -1 row that is a dupe
        assert len(output) == 7

        assert len([row for row in output
                    if row['ksa_type'] == 'knowledge']) == 2
        assert len([row for row in output if row['ksa_type'] == 'skill']) == 1
        assert len([row for row in output
                    if row['ksa_type'] == 'ability']) == 1
        assert len([row for row in output if row['ksa_type'] == 'tool']) == 3

        # make sure uuid is hashed version of the KSA
        for row in output:
            assert row['skill_uuid'] == md5(row['ONET KSA'])

        # make sure nlp_a is cleaned version of skill
        assert next(
            row['nlp_a'] for row in output
            if row['ONET KSA'] == '10-key calculators') == '10key calculators'

        # make sure duplicate entries pick first SOC Code
        assert next(
            row['O*NET-SOC Code'] for row in output
            if row['ONET KSA'] == 'written comprehension') == '11-1011.00'

        # make sure duplicate entries pick first element id
        assert next(
            row['Element ID'] for row in output
            if row['ONET KSA'] == 'written comprehension') == '1.a.1.a.2'
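These tests pass a string-in, hex-digest-out md5 helper as the hash_function; a sketch of the assumed implementation (presumably the project's hashing utility):

import hashlib

def md5(value):
    # hex digest of the UTF-8 encoded string, matching the
    # row['skill_uuid'] == md5(row['ONET KSA']) assertions above
    return hashlib.md5(value.encode('utf-8')).hexdigest()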
Example #4
    def test_sequence_tagged_annotations(self):
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='test-bucket')

        experiment = BratExperiment(experiment_name='initial_skills_tag',
                                    brat_s3_path='test-bucket/brat')

        tags = {
            'user_1': {
                'unit_1/0': [
                    'O\t0\t4\tthis',
                    'B-SKILL\t5\t7\tis',
                    'O\t8\t14\tpython',
                ],
                'unit_1/1': [
                    'O\t0\t4\tthis',
                    'O\t5\t7\tis',
                    'B-SKILL\t8\t14\tpython',
                ]
            },
            'user_2': {
                'unit_1/0': [
                    'O\t0\t4\tthis',
                    'O\t5\t7\tis',
                    'B-SKILL\t8\t14\tpython',
                ],
                'unit_1/1': [
                    'O\t0\t4\tthis',
                    'O\t5\t7\tis',
                    'B-SKILL\t8\t14\tpython',
                ]
            },
        }
        experiment.metadata['units'] = {
            'unit_1': [
                (0, 'ABC_91238'),
                (1, 'ABC_4823943'),
            ]
        }
        experiment.metadata['allocations'] = {}
        for user_name, annotations in tags.items():
            experiment.metadata['allocations'][user_name] = []
            for key, token_lines in annotations.items():
                unit_name, num = key.split('/')
                user_allocations = experiment.metadata['allocations'][user_name]
                if unit_name not in user_allocations:
                    user_allocations.append(unit_name)

                base_path = '{}/{}'.format(
                    experiment.user_allocations_path(user_name), key)
                with experiment.s3.open('{}.txt'.format(base_path), 'wb') as f:
                    f.write(
                        'does not matter we are not reading'.encode('utf-8'))
                with experiment.s3.open('{}.ann'.format(base_path), 'wb') as f:
                    f.write(
                        'does not matter we are not reading'.encode('utf-8'))
                with experiment.s3.open('{}.conll'.format(base_path),
                                        'wb') as f:
                    f.write('\n'.join(token_lines).encode('utf-8'))
        experiment.metadata.save()

        self.maxDiff = None
        expected_tokens = {
            ('ABC_91238', md5('user_1')): [('O', 'this'), ('B-SKILL', 'is'),
                                           ('O', 'python')],
            ('ABC_91238', md5('user_2')): [('O', 'this'), ('O', 'is'),
                                           ('B-SKILL', 'python')],
            ('ABC_4823943', md5('user_1')): [('O', 'this'), ('O', 'is'),
                                             ('B-SKILL', 'python')],
            ('ABC_4823943', md5('user_2')): [('O', 'this'), ('O', 'is'),
                                             ('B-SKILL', 'python')],
        }
        self.assertDictEqual(experiment.sequence_tagged_annotations,
                             expected_tokens)
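This test talks to S3 through boto3 without credentials, so it presumably runs under a mocked backend; with moto, for instance, the setup would be wrapped roughly like this (an assumption — the decorator sits outside the excerpt):

from moto import mock_s3

@mock_s3
def test_sequence_tagged_annotations_sketch():
    import boto3
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='test-bucket')  # hits moto's in-memory S3, not AWS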
Example #5
def test_onet_skill_extractor():
    skills_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'IM', '4.12',
            '8', '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'LV', '4.75',
            '8', '0.16', '4.43', '5.07', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'IM', '4.12', '8',
            '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'LV', '-4.88', '8',
            '0.23', '4.43', '5.32', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]

    abilities_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'IM', '4.50', '8',
            '0.19', '4.13', '4.87', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'LV', '4.88', '8',
            '0.13', '4.63', '5.12', 'N', 'Y', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]

    knowledge_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'IM',
            '4.75', '27', '0.09', '4.56', '4.94', 'N', 'n/a', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'LV',
            '6.23', '27', '0.17', '5.88', '6.57', 'N', 'N', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'IM', '2.66', '27', '0.22',
            '2.21', '3.11', 'N', 'n/a', '07/2014', 'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'LV', '3.50', '27', '0.41',
            '2.66', '4.34', 'N', 'N', '07/2014', 'Incumbent'
        ],
    ]

    class MockOnetSkillCache(object):
        @contextlib.contextmanager
        def ensure_file(self, dataset):
            fake_data_lookup = {
                'Skills.txt': skills_content,
                'Abilities.txt': abilities_content,
                'Knowledge.txt': knowledge_content
            }
            with utils.makeNamedTemporaryCSV(fake_data_lookup[dataset],
                                             '\t') as temp:
                yield temp

    with tempfile.NamedTemporaryFile(mode='w+') as outputfile:
        extractor = OnetSkillImportanceExtractor(
            output_filename=outputfile.name,
            onet_source=MockOnetSkillCache(),
            hash_function=md5)
        extractor.run()
        outputfile.seek(0)
        output = pd.read_csv(outputfile, sep='\t').T.to_dict().values()

        # +14 base rows in input across the K,S,A files
        assert len(output) == 14

        # make sure uuid is hashed version of the KSA
        for row in output:
            assert row['skill_uuid'] == md5(row['ONET KSA'])
            # otherwise, this is a simple concat so not much to assert
            # we do use these rows though so make sure they're there
            assert 'Data Value' in row
            assert 'O*NET-SOC Code' in row
            assert 'ONET KSA' in row
            assert row['Scale ID'] in ['im', 'lv']
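The mocks above lean on utils.makeNamedTemporaryCSV; judging from how it is used (rows and a delimiter in, a readable file path yielded), a sketch might look like:

import contextlib
import csv
import tempfile

@contextlib.contextmanager
def makeNamedTemporaryCSV(content, separator=','):
    # write the given rows to a delimited temp file and yield its path;
    # the file is cleaned up when the context exits
    with tempfile.NamedTemporaryFile(mode='w+', suffix='.csv') as f:
        csv.writer(f, delimiter=separator).writerows(content)
        f.flush()
        yield f.name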
Example #6
def test_onet_skill_extractor():
    skills_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'IM', '4.12',
            '8', '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.a', 'Reading Comprehension', 'LV', '4.75',
            '8', '0.16', '4.43', '5.07', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'IM', '4.12', '8',
            '0.13', '3.88', '4.37', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '2.A.1.b', 'Active Listening', 'LV', '-4.88', '8',
            '0.23', '4.43', '5.32', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]

    abilities_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'IM', '4.50', '8',
            '0.19', '4.13', '4.87', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.1', 'Oral Comprehension', 'LV', '4.88', '8',
            '0.13', '4.63', '5.12', 'N', 'Y', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-1011.00', '1.A.1.a.2', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'IM', '4.25',
            '8', '0.16', '3.93', '4.57', 'N', 'n/a', '07/2014', 'Analyst'
        ],
        [
            '11-2031.00', '1.A.1.a.3', 'Written Comprehension', 'LV', '4.62',
            '8', '0.18', '4.27', '4.98', 'N', 'N', '07/2014', 'Analyst'
        ],
    ]

    knowledge_content = [
        [
            'O*NET-SOC Code', 'Element ID', 'Element Name', 'Scale ID',
            'Data Value', 'N', 'Standard Error', 'Lower CI Bound',
            'Upper CI Bound', 'Recommend Suppress', 'Not Relevant', 'Date',
            'Domain Source'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'IM',
            '4.75', '27', '0.09', '4.56', '4.94', 'N', 'n/a', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.a', 'Administration and Management', 'LV',
            '6.23', '27', '0.17', '5.88', '6.57', 'N', 'N', '07/2014',
            'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'IM', '2.66', '27', '0.22',
            '2.21', '3.11', 'N', 'n/a', '07/2014', 'Incumbent'
        ],
        [
            '11-1011.00', '2.C.1.b', 'Clerical', 'LV', '3.50', '27', '0.41',
            '2.66', '4.34', 'N', 'N', '07/2014', 'Incumbent'
        ],
    ]

    tools_content = [
        [
            'O*NET-SOC Code', 'T2 Type', 'T2 Example', 'Commodity Code',
            'Commodity Title'
        ],
        [
            '11-1011.00', 'Tools', '10-key calculators', '44101809',
            'Desktop calculator'
        ],
        [
            '11-1011.00', 'Tools', 'Desktop computers', '43211507',
            'Desktop computers'
        ],
        [
            '11-1011.00', 'Tools', 'Laptop computers', '43211503',
            'Notebook computers'
        ],
        [
            '11-1011.00', 'Tools', 'Personal computers', '43211508',
            'Personal computers'
        ],
        [
            '11-1011.00', 'Tools', 'Personal digital assistants PDA',
            '43211504', 'Personal digital assistant PDAs or organizers'
        ],
        ['11-1011.00', 'Tools', 'Smartphones', '43191501', 'Mobile phones'],
        [
            '11-1011.00', 'Tools', 'Universal serial bus USB flash drives',
            '43201813', 'High capacity removable media drives'
        ],
        [
            '11-1011.00', 'Technology', 'Adobe Systems Adobe Acrobat software',
            '43232202', 'Document management software'
        ],
        [
            '11-1011.00', 'Technology', 'AdSense Tracker', '43232306',
            'Data base user interface and query software'
        ],
        [
            '11-1011.00', 'Technology', 'Blackbaud The Raiser\'s Edge',
            '43232303', 'Customer relationship management CRM software'
        ],
    ]

    class MockOnetDownloader(object):
        def download(self, source_file):
            fake_data_lookup = {
                'Skills': skills_content,
                'Abilities': abilities_content,
                'Knowledge': knowledge_content,
                'Tools and Technology': tools_content,
            }

            with utils.makeNamedTemporaryCSV(fake_data_lookup[source_file],
                                             '\t') as tempname:
                with open(tempname) as fh:
                    return fh.read()

    with patch(
            'skills_ml.datasets.skill_importances.onet.OnetToMemoryDownloader',
            MockOnetDownloader):
        with tempfile.TemporaryDirectory() as output_dir:
            storage = FSStore(output_dir)
            extractor = OnetSkillImportanceExtractor(
                output_dataset_name='skills',
                storage=storage,
                hash_function=md5)
            extractor.run()
            pdin = io.StringIO(storage.load('skills.tsv').decode('utf-8'))
            output = pd.read_csv(pdin, sep='\t').T.to_dict().values()

            # +24 base rows in input across the K,S,A,T files
            assert len(output) == 24

            # make sure nlp_a is the hashed version of the KSA
            for row in output:
                assert row['nlp_a'] == md5(row['ONET KSA'])
                # otherwise, this is a simple concat so not much to assert
                # we do use these rows though so make sure they're there
                assert 'O*NET-SOC Code' in row
                assert 'ONET KSA' in row
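The only FSStore behavior this test depends on is that files land under the given root directory and that load() returns raw bytes (hence the decode); a minimal sketch of that contract, with a hypothetical name to avoid conflating it with the real class:

import os

class FSStoreSketch(object):
    def __init__(self, path):
        self.path = path

    def load(self, filename):
        # return raw bytes, so callers decode('utf-8') themselves
        with open(os.path.join(self.path, filename), 'rb') as f:
            return f.read()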
Example #7
def test_onet_title_extractor():
    occupation_content = [
        ['O*NET-SOC Code', 'Title', 'Description'],
        ['11-1011.00', 'Chief Executives', 'Not important'],
        ['11-1011.03', 'Chief Sustainability Officers', 'Not important'],
        ['11-1021.00', 'General and Operations Managers', 'Not important'],
        ['11-1031.00', 'Legislators', 'Not important'],
    ]

    alternate_title_content = [
        ['O*NET-SOC Code', 'Alternate Title', 'Short Title', 'Source(s)'],
        ['11-1011.00', 'Aeronautics Commission Director', 'n/a', '08'],
        ['11-1011.00', 'Agricultural Services Director', 'n/a', '08'],
        ['11-1011.00', 'Alcohol and Drug Abuse Assistance Admin', 'n/a', '08'],
    ]

    sample_content = [
        ['O*NET-SOC Code', 'Reported Job Title', 'Shown in My Next Move'],
        ['11-1011.00', 'Chief Diversity Officer (CDO)', 'N'],
        ['11-1011.00', 'Chief Executive Officer (CEO)', 'Y'],
        ['11-1011.00', 'Chief Financial Officer (CFO)', 'Y'],
    ]

    class MockOnetTitleCache(object):
        @contextlib.contextmanager
        def ensure_file(self, dataset):
            fake_data_lookup = {
                'Sample of Reported Titles.txt': sample_content,
                'Occupation Data.txt': occupation_content,
                'Alternate Titles.txt': alternate_title_content,
            }
            with utils.makeNamedTemporaryCSV(fake_data_lookup[dataset],
                                             '\t') as temp:
                yield temp

    with tempfile.NamedTemporaryFile(mode='w+') as outputfile:
        extractor = OnetTitleExtractor(output_filename=outputfile.name,
                                       onet_source=MockOnetTitleCache(),
                                       hash_function=md5)
        extractor.run()
        outputfile.seek(0)
        output = pd.read_csv(outputfile, sep='\t').T.to_dict().values()

        # the new file should be the three files concatenated
        assert len(output) == 10

        # for alternate (non-occupation) titles, Original Title should be the occupation
        assert next(row['Original Title'] for row in output if row['Title'] ==
                    'Aeronautics Commission Director') == 'Chief Executives'

        # for occupation titles, Original Title should be the occupation itself
        assert next(
            row['Original Title'] for row in output
            if row['Title'] == 'Chief Executives') == 'Chief Executives'

        # make sure uuid is hashed version of the title
        for row in output:
            assert row['job_uuid'] == md5(row['Original Title'])

        # make sure nlp_a is cleaned version of title
        assert next(
            row['nlp_a'] for row in output if row['Title'] ==
            'Chief Diversity Officer (CDO)') == 'chief diversity officer cdo'
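The nlp_a assertions here and in Example #3 imply the cleaning step strips punctuation and lowercases; a sketch of such a transform (an assumption — the real one lives inside the extractor):

import re

def clean_for_nlp_a(text):
    # 'Chief Diversity Officer (CDO)' -> 'chief diversity officer cdo'
    # '10-key calculators'            -> '10key calculators'
    return re.sub(r'[^\w\s]', '', text).lower()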