def execute(self, context):
    conn = S3Hook().get_conn()
    execution_date = context['execution_date']
    quarter = datetime_to_quarter(execution_date)
    if quarter != datetime_to_quarter(datetime.now()):
        logging.warning('PartnerSnapshotOperator cannot be backfilled. Skipping')
        return
    updater = self.updater_class(**self.passthrough_kwargs)
    postings = updater.deduplicated_postings()
    upload_dict(
        s3_conn=conn,
        s3_prefix=self.s3_prefix + '/' + quarter,
        data_to_sync=postings
    )
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    with tempfile.TemporaryDirectory() as temp_dir:
        job_postings_generator = job_postings_highmem(
            s3_conn, quarter, config['job_postings']['s3_path'])
        geo_querier = JobCBSAFromGeocodeQuerier(
            cbsa_results=S3CachedCBSAFinder(
                s3_conn=s3_conn,
                cache_s3_path=config['cbsa_lookup']['s3_path']
            ).all_cached_cbsa_results)
        logging.basicConfig(
            format='%(asctime)s %(process)d %(levelname)s: %(message)s')
        # Fan the aggregation out over a worker pool, then combine the
        # partial results; any child failure is logged and re-raised.
        with Pool(processes=config['aggregation']['n_processes']) as pool:
            try:
                it = self.map(
                    pool=pool,
                    job_postings_generator=job_postings_generator,
                    geo_querier=geo_querier,
                    temp_dir=temp_dir)
                combined_agg = self.reduce(it)
            except Exception:
                logging.error('Child error: {}'.format(traceback.format_exc()))
                raise
        self.save(combined_agg, quarter, s3_conn)
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    output_folder = config.get('output_folder', 'output')
    if not os.path.isdir(output_folder):
        os.mkdir(output_folder)
    merge(s3_conn, self.group_config_key, quarter, output_folder)
    merge(s3_conn, self.rollup_config_key, quarter, output_folder)
def execute(self, context):
    conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    NormalizeTopNIndexer(
        quarter=quarter,
        job_postings_generator=partial(
            job_postings,
            s3_path=config['job_postings']['s3_path']),
        job_titles_index=config['normalizer']['titles_master_index_name'],
        alias_name=config['normalizer']['es_index_name'],
        s3_conn=conn,
        es_client=basic_client()
    ).append()
def execute(self, context):
    year, quarter = datetime_to_year_quarter(context['execution_date'])
    quarter_string = datetime_to_quarter(context['execution_date'])
    engine = get_db()
    load_title_counts(
        filename=full_path(table_files['title_count']).format(quarter_string),
        year=year,
        quarter=quarter,
        db_engine=engine,
    )
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    job_label_filename = 'tmp/job_label_train_' + quarter + '.csv'
    with open(job_label_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        job_postings_generator = job_postings(
            s3_conn, quarter, config['job_postings']['s3_path'])
        corpus_generator = JobCategoryCorpusCreator().label_corpora(
            job_postings_generator)
        for label in corpus_generator:
            writer.writerow([label])
    logging.info('Done labeling job categories to %s', job_label_filename)
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    job_postings_generator = job_postings_highmem(
        s3_conn, quarter, config['job_postings']['s3_path']
    )
    geocoder = S3CachedGeocoder(
        s3_conn=s3_conn,
        cache_s3_path=config['geocoder']['s3_path']
    )
    logging.info('Starting geocoding')
    geocoder.geocode_job_postings_and_save(job_postings_generator)
    logging.info('Done geocoding')
def execute(self, context):
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    job_vector_filename = 'tmp/job_features_train_' + quarter + '.csv'
    with open(job_vector_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        job_postings_generator = job_postings(
            s3_conn, quarter, config['job_postings']['s3_path'])
        corpus_generator = Doc2VecGensimCorpusCreator().array_corpora(
            job_postings_generator)
        vectorized_job_generator = Doc2Vectorizer(
            model_name='gensim_doc2vec',
            path=config['job_vectorizer_cache']['s3_path'],
            s3_conn=s3_conn
        ).vectorize(corpus_generator)
        for vector in vectorized_job_generator:
            writer.writerow(vector)
    logging.info('Done vectorizing job postings to %s', job_vector_filename)
def execute(self, context):
    conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    stats_counter = DatasetStatsCounter(
        quarter=quarter,
        dataset_id=self.partner_id
    )
    transformer = self.transformer_class(
        s3_conn=conn,
        partner_id=self.partner_id,
        onet_cache=OnetCache(
            s3_conn=conn,
            cache_dir=config['onet']['cache_dir'],
            s3_path=config['onet']['s3_path'],
        ),
        **self.passthrough_kwargs
    )
    self.clear_old_postings(conn, quarter)
    # Write transformed postings to S3 in batches, one newline-delimited
    # JSON file per batch.
    for batch in Batch(
        transformer.postings(quarter, stats_counter),
        self.postings_per_file
    ):
        logging.info('Processing new batch')
        with tempfile.TemporaryFile(mode='w+') as f:
            for posting in batch:
                f.write(json.dumps(posting))
                f.write('\n')
            logging.debug('New batch written, commencing upload')
            bucket = conn.get_bucket(self.output_bucket)
            key = boto.s3.key.Key(
                bucket=bucket,
                name='{}/{}/{}_{}'.format(
                    self.output_prefix,
                    quarter,
                    self.partner_id,
                    uuid.uuid4()
                )
            )
            f.seek(0)
            key.set_contents_from_string(f.read())
            logging.debug('Batch upload complete')
    stats_counter.save(
        s3_conn=conn,
        s3_prefix=config['partner_stats']['s3_path']
    )
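# A minimal sketch of the Batch helper used above, included for illustration
# only; the real implementation is imported from elsewhere in the codebase.
# The assumption is that it chunks an iterable into lists of at most
# `batch_size` items, so each chunk can be written to its own S3 key.
from itertools import islice


def batch_sketch(iterable, batch_size):
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, batch_size))
        if not chunk:
            return
        yield chunk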
def execute(self, context):
    conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    labeled_filename = 'labeled_corpora_a'
    with open(labeled_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        job_postings_generator = job_postings(
            conn, quarter, config['job_postings']['s3_path'])
        corpus_generator = SimpleCorpusCreator()\
            .raw_corpora(job_postings_generator)
        tagged_document_generator = SimpleSkillTagger(
            skills_filename=skills_filename,
            hash_function=md5
        ).tagged_documents(corpus_generator)
        for document in tagged_document_generator:
            writer.writerow([document])
    logging.info('Done tagging skills to %s', labeled_filename)
    upload(
        conn,
        labeled_filename,
        '{}/{}'.format(config['labeled_postings']['s3_path'], quarter))
def get_time_range(start='2011-01-01', freq='Q', periods=24):
    return [
        datetime_to_quarter(date)
        for date in pd.date_range(start=start, freq=freq, periods=periods)
    ]
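# A minimal sketch of the datetime_to_quarter helper that get_time_range and
# the operators above rely on, included for illustration only; the real
# implementation is imported from elsewhere. It assumes quarters are rendered
# as 'YYYYQN' strings, matching how the value is spliced into filenames and
# S3 prefixes above.
def datetime_to_quarter_sketch(dt):
    return '{}Q{}'.format(dt.year, (dt.month - 1) // 3 + 1)


# Under that assumption, get_time_range('2011-01-01', 'Q', 4) would yield
# ['2011Q1', '2011Q2', '2011Q3', '2011Q4'], since pandas date_range with
# freq='Q' emits quarter-end dates.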
def execute(self, context):
    # README text for each output table folder, keyed by folder name.
    table_config = config['output_tables']
    folder_readmes = {}
    folder_readmes[table_config['cleaned_geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.
{agg_info}
Job titles are cleaned by lowercasing, removing punctuation,
and removing city and state names.""".format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['cleaned_title_count_dir']] = """
Counts of job posting title occurrences.
{agg_info}
Job titles are cleaned by lowercasing, removing punctuation,
and removing city and state names.""".format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.
{agg_info}
Job titles are cleaned by lowercasing and removing punctuation.""".format(
        agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['title_count_dir']] = """
Counts of job posting title occurrences.
{agg_info}
Job titles are cleaned by lowercasing and removing punctuation.""".format(
        agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['geo_soc_common_count_dir']] = """
Job postings per SOC code, by CBSA.
SOC code inferred by 'common match' method"""

    folder_readmes[table_config['soc_common_count_dir']] = """
Job postings per SOC code.
SOC code inferred by 'common match' method"""

    folder_readmes[table_config['geo_soc_top_count_dir']] = """
Job postings per SOC code, by CBSA.
SOC code inferred by 'top match' method"""

    folder_readmes[table_config['soc_top_count_dir']] = """
Job postings per SOC code.
SOC code inferred by 'top match' method"""

    folder_readmes[table_config['geo_soc_given_count_dir']] = """
Job postings per SOC code, by CBSA.
SOC code given by data source"""

    folder_readmes[table_config['soc_given_count_dir']] = """
Job postings per SOC code.
SOC code given by data source"""

    local_folder = config.get('output_folder', 'output')
    if not os.path.isdir(local_folder):
        os.mkdir(local_folder)
    source_s3_path = config['output_tables']['s3_path']
    upload_s3_path = config['tabular_uploads']['s3_path']
    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])

    # Download each table's quarterly CSV, write its README, and upload both
    # to the tabular uploads location.
    for folder_name, readme_string in folder_readmes.items():
        full_folder = '{}/{}'.format(local_folder, folder_name)
        if not os.path.isdir(full_folder):
            os.mkdir(full_folder)
        data_filename = '{}.csv'.format(quarter)
        data_filepath = os.path.join(full_folder, data_filename)
        readme_filepath = os.path.join(full_folder, 'README.txt')
        with open(readme_filepath, 'w') as readme_file:
            readme_file.write(readme_string + "\n" + QUARTERLY_NOTE)
        download(
            s3_conn,
            data_filepath,
            os.path.join(source_s3_path, folder_name, data_filename))
        upload_s3_folder = os.path.join(upload_s3_path, folder_name)
        upload(s3_conn, readme_filepath, upload_s3_folder)
        upload(s3_conn, data_filepath, upload_s3_folder)

    # metadata
    stats_s3_path = config['partner_stats']['s3_path']
    total_jobs = GlobalStatsAggregator(s3_conn=s3_conn)\
        .saved_total(stats_s3_path)
    quarterly_stats = DatasetStatsCounter\
        .quarterly_posting_stats(s3_conn, stats_s3_path)
    partner_list = DatasetStatsAggregator\
        .partners(s3_conn, stats_s3_path)

    # Top-level README combining all folder descriptions and dataset stats.
    base_readme_filepath = os.path.join(local_folder, 'README.txt')
    with open(base_readme_filepath, 'w') as readme_file:
        readme_file.write("Open Skills Datasets\n\n")
        for folder_name, readme_string in folder_readmes.items():
            readme_file.write("###" + folder_name + "###\n\n")
            readme_file.write(readme_string + "\n\n\n")
        readme_file.write('Dataset Stats\n\n')
        readme_file.write('Total Job Postings: ' + str(total_jobs) + "\n")
        readme_file.write('Quarterly Counts\n')
        for quarter in sorted(quarterly_stats.keys()):
            readme_file.write(
                quarter + ': ' + str(quarterly_stats[quarter]) + '\n')
        readme_file.write('Partners: ' + ','.join(partner_list) + '\n')
    upload(s3_conn, base_readme_filepath, upload_s3_path)