def skill_aggregate(job_postings,
                    aggregator_constructor,
                    temp_dir,
                    processed_folder,
                    phase_indices,
                    download_folder):
    title_cleaner = partial(title_clean, phase_indices=phase_indices)
    skills_filename = '{}/skills_master_table.tsv'.format(processed_folder)
    if not os.path.isfile(skills_filename):
        download(
            s3_conn=S3Hook().get_conn(),
            out_filename=skills_filename,
            s3_path=config['output_tables']['s3_path'] + '/skills_master_table.tsv'
        )
    corpus_creator = SimpleCorpusCreator()
    job_aggregators = {
        'onet_skills': OccupationScopedSkillAggregator(
            corpus_creator=corpus_creator,
            skill_extractor=OccupationScopedSkillExtractor(
                skills_filename=skills_filename),
            output_count=10)
    }
    aggregator = aggregator_constructor(job_aggregators=job_aggregators,
                                        title_cleaner=title_cleaner)
    aggregator.process_postings(job_postings)
    # Detach the skill extractor and corpus creator before serializing,
    # so only the aggregated results are written out.
    aggregator.job_aggregators['onet_skills'].skill_extractor = None
    aggregator.job_aggregators['onet_skills'].corpus_creator = None
    return save(aggregator, temp_dir)

def _load_classifier(self, **kwargs):
    if self.classifier_type == 'ann':
        for f in list_files(self.s3_conn, self.s3_path):
            filepath = os.path.join(self.temporary_directory, f)
            if not os.path.exists(filepath):
                logging.warning('calling download from %s to %s',
                                self.s3_path + f, filepath)
                download(self.s3_conn, filepath, os.path.join(self.s3_path, f))
        ann_index = AnnoyIndexer()
        ann_index.load(os.path.join(self.temporary_directory,
                                    self.classifier_id + '.index'))
        return NearestNeighbors(s3_conn=self.s3_conn,
                                indexer=ann_index,
                                **kwargs)
    elif self.classifier_type == 'knn':
        return NearestNeighbors(s3_conn=self.s3_conn,
                                indexed=False,
                                **kwargs)
    else:
        print('Not implemented yet!')
        return None

def _load_model(self, modelname):
    if not os.path.isdir('tmp'):
        os.mkdir('tmp')
    filepath = 'tmp/' + modelname
    s3path = self.path + self.model_name
    if not os.path.exists(filepath):
        logging.warning('calling download from %s to %s', s3path, filepath)
        download(self.s3_conn, filepath, s3path)
    return Doc2Vec.load(filepath)

def test_download():
    s3_conn = boto.connect_s3()
    bucket_name = 'test-bucket'
    bucket = s3_conn.create_bucket(bucket_name)
    key = boto.s3.key.Key(bucket=bucket, name='apath/akey')
    key.set_contents_from_string('test')
    s3_path = 'test-bucket/apath/akey'
    with tempfile.NamedTemporaryFile(mode='w+') as f:
        download(s3_conn, f.name, s3_path)
        f.seek(0)
        assert f.read() == 'test'

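# The snippets in this section all call a shared download(s3_conn, out_filename,
# s3_path) helper, where s3_path is a 'bucket/key/...' string. The real utility
# lives elsewhere in the repo; the following is only a minimal sketch, consistent
# with the call sites and the boto 2 API used above, of what it could look like.
import boto.s3.key


def download(s3_conn, out_filename, s3_path):
    # Assumes the 'bucket/key' layout exercised by test_download above.
    bucket_name, key_name = s3_path.split('/', 1)
    bucket = s3_conn.get_bucket(bucket_name)
    key = boto.s3.key.Key(bucket=bucket, name=key_name)
    # boto 2 writes the object's contents straight to the local path.
    key.get_contents_to_filename(out_filename)
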
def download_ann_classifier_files(s3_prefix, classifier_id, download_directory, s3_conn):
    # A file lock serializes downloads across workers sharing the same
    # download directory, so the index files are only fetched once.
    lock = filelock.FileLock(os.path.join(download_directory, 'ann_dl.lock'))
    with lock.acquire(timeout=1000):
        s3_path = s3_prefix + classifier_id
        files = list_files(s3_conn, s3_path)
        for f in files:
            filepath = os.path.join(download_directory, f)
            if not os.path.exists(filepath):
                logging.info('calling download from %s to %s', s3_path + f, filepath)
                download(s3_conn, filepath, os.path.join(s3_path, f))
            else:
                logging.info('%s already exists, not downloading', filepath)

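# Hypothetical call to download_ann_classifier_files: the bucket name,
# classifier id, and cache directory below are invented for illustration;
# only the signature comes from the function above.
import boto

conn = boto.connect_s3()
download_ann_classifier_files(
    s3_prefix='some-bucket/ann_classifiers/',
    classifier_id='ann_0614',
    download_directory='/tmp/classifier_cache',
    s3_conn=conn,
)
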
def _load_model(self):
    """Download the model files from S3 and load the model into memory.

    Returns:
        gensim.models.doc2vec.Doc2Vec: The word-embedding model object.
    """
    files = list_files(self.s3_conn, self.s3_path)
    with tempfile.TemporaryDirectory() as td:
        for f in files:
            filepath = os.path.join(td, f)
            if not os.path.exists(filepath):
                logging.info('calling download from %s to %s',
                             self.s3_path + f, filepath)
                download(self.s3_conn, filepath, os.path.join(self.s3_path, f))
        model = Doc2Vec.load(os.path.join(td, self.model_name + ".model"))
    return model

def _load_lookup(self):
    """Download the lookup dictionary from S3 and load it into memory.

    Returns:
        dict: a lookup table mapping gensim document index to SOC code,
            or None if the lookup is not present on S3.
    """
    with tempfile.TemporaryDirectory() as td:
        filepath = os.path.join(td, self.lookup_name)
        logging.info('calling download from %s to %s',
                     self.s3_path + self.lookup_name, filepath)
        try:
            download(self.s3_conn, filepath,
                     os.path.join(self.s3_path, self.lookup_name))
            with open(filepath, 'r') as handle:
                lookup = json.load(handle)
        except boto.exception.S3ResponseError:
            lookup = None
        return lookup

def execute(self, context):
    table_config = config['output_tables']
    folder_readmes = {}
    folder_readmes[table_config['cleaned_geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation,
and removing city and state names.""".format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['cleaned_title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation,
and removing city and state names.""".format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation.""".format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation.""".format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['geo_soc_common_count_dir']] = """
Job postings per SOC code, by CBSA.
SOC code inferred by 'common match' method
"""

    folder_readmes[table_config['soc_common_count_dir']] = """
Job postings per SOC code
SOC code inferred by 'common match' method
"""

    folder_readmes[table_config['geo_soc_top_count_dir']] = """
Job postings per SOC code, by CBSA.
SOC code inferred by 'top match' method
"""

    folder_readmes[table_config['soc_top_count_dir']] = """
Job postings per SOC code
SOC code inferred by 'top match' method
"""

    folder_readmes[table_config['geo_soc_given_count_dir']] = """
Job postings per SOC code, by CBSA.
SOC code given by data source
"""

    folder_readmes[table_config['soc_given_count_dir']] = """
Job postings per SOC code
SOC code given by data source
"""

    local_folder = config.get('output_folder', 'output')
    if not os.path.isdir(local_folder):
        os.mkdir(local_folder)
    source_s3_path = config['output_tables']['s3_path']
    upload_s3_path = config['tabular_uploads']['s3_path']
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])

    for folder_name, readme_string in folder_readmes.items():
        full_folder = '{}/{}'.format(local_folder, folder_name)
        if not os.path.isdir(full_folder):
            os.mkdir(full_folder)
        data_filename = '{}.csv'.format(quarter)
        data_filepath = os.path.join(full_folder, data_filename)
        readme_filepath = os.path.join(full_folder, 'README.txt')
        with open(readme_filepath, 'w') as readme_file:
            readme_file.write(readme_string + "\n" + QUARTERLY_NOTE)
        download(
            s3_conn,
            data_filepath,
            os.path.join(source_s3_path, folder_name, data_filename)
        )
        upload_s3_folder = os.path.join(upload_s3_path, folder_name)
        upload(s3_conn, readme_filepath, upload_s3_folder)
        upload(s3_conn, data_filepath, upload_s3_folder)

    # metadata
    stats_s3_path = config['partner_stats']['s3_path']
    total_jobs = GlobalStatsAggregator(s3_conn=s3_conn)\
        .saved_total(stats_s3_path)
    quarterly_stats = DatasetStatsCounter\
        .quarterly_posting_stats(s3_conn, stats_s3_path)
    partner_list = DatasetStatsAggregator\
        .partners(s3_conn, stats_s3_path)

    base_readme_filepath = os.path.join(local_folder, 'README.txt')
    with open(base_readme_filepath, 'w') as readme_file:
        readme_file.write("Open Skills Datasets\n\n")
        for folder_name, readme_string in folder_readmes.items():
            readme_file.write("###" + folder_name + "###\n\n")
            readme_file.write(readme_string + "\n\n\n")
        readme_file.write('Dataset Stats\n\n')
        readme_file.write('Total Job Postings: ' + str(total_jobs) + "\n")
        readme_file.write('Quarterly Counts\n')
        for quarter in sorted(quarterly_stats.keys()):
            readme_file.write(quarter + ': ' + str(quarterly_stats[quarter]) + '\n')
        readme_file.write('Partners: ' + ','.join(partner_list) + '\n')
    upload(s3_conn, base_readme_filepath, upload_s3_path)
