Example #1
def skill_aggregate(job_postings, aggregator_constructor, temp_dir,
                    processed_folder, phase_indices, download_folder):
    title_cleaner = partial(title_clean, phase_indices=phase_indices)

    skills_filename = '{}/skills_master_table.tsv'\
        .format(processed_folder)

    if not os.path.isfile(skills_filename):
        download(s3_conn=S3Hook().get_conn(),
                 out_filename=skills_filename,
                 s3_path=config['output_tables']['s3_path'] +
                 '/skills_master_table.tsv')
    corpus_creator = SimpleCorpusCreator()
    job_aggregators = {
        'onet_skills':
        OccupationScopedSkillAggregator(
            corpus_creator=corpus_creator,
            skill_extractor=OccupationScopedSkillExtractor(
                skills_filename=skills_filename),
            output_count=10)
    }
    aggregator = aggregator_constructor(job_aggregators=job_aggregators,
                                        title_cleaner=title_cleaner)
    aggregator.process_postings(job_postings)
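    # Detach the extractor and corpus creator before saving; presumably they
    # are heavy or unpicklable, and only the aggregated counts need to persist.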
    aggregator.job_aggregators['onet_skills'].skill_extractor = None
    aggregator.job_aggregators['onet_skills'].corpus_creator = None
    return save(
        aggregator,
        temp_dir,
    )
Example #2
    def _load_classifier(self, **kwargs):
        if self.classifier_type == 'ann':
            for f in list_files(self.s3_conn, self.s3_path):
                filepath = os.path.join(self.temporary_directory, f)
                if not os.path.exists(filepath):
                    logging.warning('calling download from %s to %s',
                                    self.s3_path + f, filepath)
                    download(self.s3_conn, filepath,
                             os.path.join(self.s3_path, f))
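            # Load the prebuilt Annoy approximate-nearest-neighbor index
            # from the files downloaded above.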
            ann_index = AnnoyIndexer()
            ann_index.load(
                os.path.join(self.temporary_directory,
                             self.classifier_id + '.index'))
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexer=ann_index,
                                    **kwargs)

        elif self.classifier_type == 'knn':
            return NearestNeighbors(s3_conn=self.s3_conn,
                                    indexed=False,
                                    **kwargs)

        else:
            raise NotImplementedError(
                'Unknown classifier_type: {}'.format(self.classifier_type))
Example #3
    def _load_model(self, modelname):
        if not os.path.isdir('tmp'):
            os.mkdir('tmp')
        filepath = 'tmp/' + modelname
        s3path = self.path + modelname
        if not os.path.exists(filepath):
            logging.warning('calling download from %s to %s', s3path, filepath)
            download(self.s3_conn, filepath, s3path)
        return Doc2Vec.load(filepath)
Example #4
@mock_s3  # moto's mock_s3 decorator (assumed imported) keeps this test off real S3
def test_download():
    s3_conn = boto.connect_s3()
    bucket_name = 'test-bucket'
    bucket = s3_conn.create_bucket(bucket_name)
    key = boto.s3.key.Key(bucket=bucket, name='apath/akey')
    key.set_contents_from_string('test')
    s3_path = 'test-bucket/apath/akey'

    with tempfile.NamedTemporaryFile(mode='w+') as f:
        download(s3_conn, f.name, s3_path)
        f.seek(0)
        assert f.read() == 'test'
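The test above pins down the contract of the download helper: a boto (v2) connection, a local output path, and an S3 path of the form 'bucket-name/key/path'. A minimal sketch of a helper satisfying that contract (an assumption for illustration, not the library's actual implementation):

def download(s3_conn, out_filename, s3_path):
    # Split 'bucket-name/key/path' into bucket and key parts.
    bucket_name, key_name = s3_path.split('/', 1)
    bucket = s3_conn.get_bucket(bucket_name)
    # Stream the key's contents straight into the local file.
    bucket.get_key(key_name).get_contents_to_filename(out_filename)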
Example #5
def download_ann_classifier_files(s3_prefix, classifier_id, download_directory,
                                  s3_conn):
    lock = filelock.FileLock(os.path.join(download_directory, 'ann_dl.lock'))
    with lock.acquire(timeout=1000):
        s3_path = s3_prefix + classifier_id
        files = list_files(s3_conn, s3_path)
        for f in files:
            filepath = os.path.join(download_directory, f)
            if not os.path.exists(filepath):
                logging.info('calling download from %s to %s', s3_path + f,
                             filepath)
                download(s3_conn, filepath, os.path.join(s3_path, f))
            else:
                logging.info('%s already exists, not downloading', filepath)
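A hypothetical invocation (the prefix, id, and directory below are illustrative, not from the source):

download_ann_classifier_files(
    s3_prefix='some-bucket/ann_indexes/',  # illustrative values only
    classifier_id='ann_0614',
    download_directory='/tmp/ann_cache',
    s3_conn=boto.connect_s3(),
)

The file lock lets several workers share one cache directory without racing on the same download.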
Example #6
    def _load_model(self):
        """The method to download the model from S3 and load to the memory.

        Returns:
            gensim.models.doc2vec.Doc2Vec: The word-embedding model object.
        """
        files = list_files(self.s3_conn, self.s3_path)
        with tempfile.TemporaryDirectory() as td:
            for f in files:
                filepath = os.path.join(td, f)
                if not os.path.exists(filepath):
                    logging.info('calling download from %s to %s', self.s3_path + f, filepath)
                    download(self.s3_conn, filepath, os.path.join(self.s3_path, f))
            model = Doc2Vec.load(os.path.join(td, self.model_name + '.model'))

            return model
Example #7
    def _load_lookup(self):
        """The method to download the lookup dictionary from S3 and load to the memory.

        Returns:
            dict: A lookup table mapping gensim indices to SOC codes.
        """
        with tempfile.TemporaryDirectory() as td:
            filepath = os.path.join(td, self.lookup_name)
            logging.info('calling download from %s to %s', self.s3_path + self.lookup_name, filepath)
            try:
                download(self.s3_conn, filepath, os.path.join(self.s3_path, self.lookup_name))
                with open(filepath, 'r') as handle:
                    lookup = json.load(handle)
            except boto.exception.S3ResponseError:
                lookup = None

            return lookup
Example #8
        def execute(self, context):
            table_config = config['output_tables']
            folder_readmes = {}
            folder_readmes[table_config['cleaned_geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation, and removing city and state names."""\
                .format(agg_info=COMMON_TITLE_AGG_INFO)

            folder_readmes[table_config['cleaned_title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation, and removing city and state names."""\
                .format(agg_info=COMMON_TITLE_AGG_INFO)

            folder_readmes[table_config['geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation."""\
                .format(agg_info=COMMON_TITLE_AGG_INFO)

            folder_readmes[table_config['title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation."""\
                .format(agg_info=COMMON_TITLE_AGG_INFO)

            folder_readmes[table_config['geo_soc_common_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code inferred by 'common match' method
            """

            folder_readmes[table_config['soc_common_count_dir']] = """
Job postings per SOC code

SOC code inferred by 'common match' method
            """

            folder_readmes[table_config['geo_soc_top_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code inferred by 'top match' method
            """

            folder_readmes[table_config['soc_top_count_dir']] = """
Job postings per SOC code

SOC code inferred by 'top match' method
            """

            folder_readmes[table_config['geo_soc_given_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code given by data source
            """

            folder_readmes[table_config['soc_given_count_dir']] = """
Job postings per SOC code

SOC code given by data source
            """

            local_folder = config.get('output_folder', 'output')
            if not os.path.isdir(local_folder):
                os.mkdir(local_folder)
            source_s3_path = config['output_tables']['s3_path']
            upload_s3_path = config['tabular_uploads']['s3_path']

            s3_conn = S3Hook().get_conn()
            quarter = datetime_to_quarter(context['execution_date'])

            for folder_name, readme_string in folder_readmes.items():
                full_folder = '{}/{}'.format(local_folder, folder_name)
                if not os.path.isdir(full_folder):
                    os.mkdir(full_folder)
                data_filename = '{}.csv'.format(quarter)
                data_filepath = os.path.join(full_folder, data_filename)
                readme_filepath = os.path.join(full_folder, 'README.txt')
                with open(readme_filepath, 'w') as readme_file:
                    readme_file.write(readme_string + "\n" + QUARTERLY_NOTE)
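                # Fetch this table's quarterly CSV from the source bucket, then
                # push both the README and the data file to the upload location.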
                download(
                    s3_conn, data_filepath,
                    os.path.join(source_s3_path, folder_name, data_filename))
                upload_s3_folder = os.path.join(upload_s3_path, folder_name)
                upload(s3_conn, readme_filepath, upload_s3_folder)
                upload(s3_conn, data_filepath, upload_s3_folder)

            # metadata
            stats_s3_path = config['partner_stats']['s3_path']
            total_jobs = GlobalStatsAggregator(s3_conn=s3_conn)\
                .saved_total(stats_s3_path)
            quarterly_stats = DatasetStatsCounter\
                .quarterly_posting_stats(s3_conn, stats_s3_path)
            partner_list = DatasetStatsAggregator\
                .partners(s3_conn, stats_s3_path)

            base_readme_filepath = os.path.join(local_folder, 'README.txt')
            with open(base_readme_filepath, 'w') as readme_file:
                readme_file.write("Open Skills Datasets\n\n")
                for folder_name, readme_string in folder_readmes.items():
                    readme_file.write("###" + folder_name + "###\n\n")
                    readme_file.write(readme_string + "\n\n\n")
                readme_file.write('Dataset Stats\n\n')
                readme_file.write('Total Job Postings: ' + str(total_jobs) +
                                  "\n")
                readme_file.write('Quarterly Counts\n')
                for stats_quarter in sorted(quarterly_stats.keys()):
                    readme_file.write(stats_quarter + ': ' +
                                      str(quarterly_stats[stats_quarter]) + '\n')
                readme_file.write('Partners: ' + ','.join(partner_list) + '\n')
            upload(s3_conn, base_readme_filepath, upload_s3_path)