Example #1
    # Snapshot a partner's current postings and upload them to S3 under the
    # quarter's prefix. The snapshot cannot be backfilled, so runs whose
    # execution_date falls outside the current quarter are skipped.
    def execute(self, context):
        conn = S3Hook().get_conn()
        execution_date = context['execution_date']
        quarter = datetime_to_quarter(execution_date)
        if quarter != datetime_to_quarter(datetime.now()):
            logging.warning('PartnerSnapshotOperator cannot be backfilled. Skipping')
            return
        updater = self.updater_class(**self.passthrough_kwargs)
        postings = updater.deduplicated_postings()
        upload_dict(
            s3_conn=conn,
            s3_prefix=self.s3_prefix + '/' + quarter,
            data_to_sync=postings
        )
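Nearly every operator in these examples derives a quarter label from the Airflow execution_date with datetime_to_quarter and uses it in S3 prefixes and local filenames. The helper itself is not shown here; the sketch below only illustrates what such a helper might do, and the exact label format ('2011Q1' here) is an assumption rather than the project's confirmed format.

from datetime import datetime

def datetime_to_quarter(dt):
    # Hypothetical sketch: map a datetime to a quarter label such as '2011Q1'.
    # The real helper used by these operators may format the label differently.
    quarter = (dt.month - 1) // 3 + 1
    return '{}Q{}'.format(dt.year, quarter)

print(datetime_to_quarter(datetime(2011, 5, 17)))  # -> 2011Q2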
Example #2
    # Aggregate the quarter's job postings: stream them from S3, attach CBSA
    # geography from a cached geocode lookup, fan the aggregation out over a
    # multiprocessing Pool, then reduce and save the combined result.
    def execute(self, context):
        s3_conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])

        with tempfile.TemporaryDirectory() as temp_dir:
            job_postings_generator = job_postings_highmem(
                s3_conn, quarter, config['job_postings']['s3_path'])
            geo_querier = JobCBSAFromGeocodeQuerier(
                cbsa_results=S3CachedCBSAFinder(
                    s3_conn=s3_conn,
                    cache_s3_path=config['cbsa_lookup']
                    ['s3_path']).all_cached_cbsa_results)

            logging.basicConfig(
                format='%(asctime)s %(process)d %(levelname)s: %(message)s')
            with Pool(processes=config['aggregation']['n_processes']) as pool:
                try:
                    it = self.map(
                        pool=pool,
                        job_postings_generator=job_postings_generator,
                        geo_querier=geo_querier,
                        temp_dir=temp_dir)
                    combined_agg = self.reduce(it)
                except Exception as e:
                    logging.error("Child error: {}".format(
                        traceback.format_exc()))
                    raise
            self.save(combined_agg, quarter, s3_conn)
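The self.map and self.reduce hooks used above are defined elsewhere on the operator and are not shown in this excerpt. As a rough, self-contained illustration of the fan-out/fold pattern they follow, here is a minimal sketch built directly on multiprocessing.Pool; the worker function and names are invented for the example and are not the project's code.

from multiprocessing import Pool

def count_words(text):
    # Worker: defined at module level so Pool can pickle and dispatch it.
    return len(text.split())

def aggregate(texts, n_processes=2):
    # Fan the per-item work out across processes, then fold ("reduce") the
    # partial results into a single combined value.
    with Pool(processes=n_processes) as pool:
        partial_results = pool.imap_unordered(count_words, texts)
        return sum(partial_results)

if __name__ == '__main__':
    print(aggregate(['software engineer', 'registered nurse practitioner']))  # -> 5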
Example #3
    # Run the merge step for this quarter's group and rollup table
    # configurations, writing the results into the local output folder.
    def execute(self, context):
        s3_conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])
        output_folder = config.get('output_folder', 'output')
        if not os.path.isdir(output_folder):
            os.mkdir(output_folder)

        merge(s3_conn, self.group_config_key, quarter, output_folder)
        merge(s3_conn, self.rollup_config_key, quarter, output_folder)
Example #4
    # Feed the quarter's job postings into the top-N title normalization
    # Elasticsearch indexer and append the results under the configured alias.
    def execute(self, context):
        conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])
        NormalizeTopNIndexer(
            quarter=quarter,
            job_postings_generator=partial(job_postings, s3_path=config['job_postings']['s3_path']),
            job_titles_index=config['normalizer']['titles_master_index_name'],
            alias_name=config['normalizer']['es_index_name'],
            s3_conn=conn,
            es_client=basic_client()
        ).append()
Example #5
    # Load the quarter's title count file into the database.
    def execute(self, context):
        year, quarter = datetime_to_year_quarter(context['execution_date'])
        quarter_string = datetime_to_quarter(context['execution_date'])
        engine = get_db()
        load_title_counts(
            filename=full_path(
                table_files['title_count']).format(quarter_string),
            year=year,
            quarter=quarter,
            db_engine=engine,
        )
Example #6
    # Write one job category label per posting for the quarter to a local
    # training CSV.
    def execute(self, context):
        s3_conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])
        job_label_filename = 'tmp/job_label_train_' + quarter + '.csv'
        with open(job_label_filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter=',')
            job_postings_generator = job_postings(
                s3_conn, quarter, config['job_postings']['s3_path'])
            corpus_generator = JobCategoryCorpusCreator().label_corpora(
                job_postings_generator)
            for label in corpus_generator:
                writer.writerow([label])
        logging.info('Done labeling job categories to %s',
                     job_label_filename)
Example #7
    # Geocode the quarter's job postings, saving the results to the S3-backed
    # geocoder cache.
    def execute(self, context):
        s3_conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])

        job_postings_generator = job_postings_highmem(
            s3_conn,
            quarter,
            config['job_postings']['s3_path']
        )

        geocoder = S3CachedGeocoder(
            s3_conn=s3_conn,
            cache_s3_path=config['geocoder']['s3_path']
        )
        logging.info('Starting geocoding')
        geocoder.geocode_job_postings_and_save(job_postings_generator)
        logging.info('Done geocoding')
Example #8
    # Vectorize the quarter's job postings with a doc2vec model and write one
    # feature vector per row to a local training CSV.
    def execute(self, context):
        s3_conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])
        job_vector_filename = 'tmp/job_features_train_' + quarter + '.csv'
        with open(job_vector_filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter=',')
            job_postings_generator = job_postings(
                s3_conn, quarter, config['job_postings']['s3_path'])
            corpus_generator = Doc2VecGensimCorpusCreator().array_corpora(
                job_postings_generator)
            vectorized_job_generator = Doc2Vectorizer(
                model_name='gensim_doc2vec',
                path=config['job_vectorizer_cache']['s3_path'],
                s3_conn=s3_conn).vectorize(corpus_generator)
            for vector in vectorized_job_generator:
                writer.writerow(vector)
        logging.info('Done vectorizing job postings to %s',
                     job_vector_filename)
Example #9
    # Transform a partner's raw postings for the quarter, upload them to S3 as
    # newline-delimited JSON in fixed-size batches, and record dataset
    # statistics.
    def execute(self, context):
        conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])
        stats_counter = DatasetStatsCounter(
            quarter=quarter,
            dataset_id=self.partner_id
        )
        transformer = self.transformer_class(
            s3_conn=conn,
            partner_id=self.partner_id,
            onet_cache=OnetCache(
                s3_conn=conn,
                cache_dir=config['onet']['cache_dir'],
                s3_path=config['onet']['s3_path'],
            ),
            **self.passthrough_kwargs
        )
        self.clear_old_postings(conn, quarter)
        for batch in Batch(
            transformer.postings(quarter, stats_counter),
            self.postings_per_file
        ):
            logging.info('Processing new batch')
            with tempfile.TemporaryFile(mode='w+') as f:
                for posting in batch:
                    f.write(json.dumps(posting))
                    f.write('\n')

                logging.debug('New batch written, commencing upload')
                bucket = conn.get_bucket(self.output_bucket)
                key = boto.s3.key.Key(
                    bucket=bucket,
                    name='{}/{}/{}_{}'.format(self.output_prefix, quarter,
                                              self.partner_id, uuid.uuid4())
                )
                f.seek(0)
                key.set_contents_from_string(f.read())
                logging.debug('Batch upload complete')
        stats_counter.save(
            s3_conn=conn,
            s3_prefix=config['partner_stats']['s3_path']
        )
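Batch(...) above wraps the transformer's posting stream so that each temporary file holds at most self.postings_per_file records before being uploaded. The project's Batch implementation is not shown here; a generic batching helper in the same spirit (name and details are illustrative only) could look like this:

from itertools import islice

def batch(iterable, size):
    # Yield the input iterable in chunks of at most `size` items.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk

print(list(batch([1, 2, 3, 4, 5], 2)))  # -> [[1, 2], [3, 4], [5]]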
Example #10
    # Tag skills in the quarter's job posting text and upload the tagged
    # documents to S3. Note that skills_filename comes from the surrounding
    # scope and is not shown in this excerpt.
    def execute(self, context):
        conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])
        labeled_filename = 'labeled_corpora_a'
        with open(labeled_filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            job_postings_generator = job_postings(
                conn, quarter, config['job_postings']['s3_path'])
            corpus_generator = SimpleCorpusCreator()\
                .raw_corpora(job_postings_generator)
            tagged_document_generator = \
                SimpleSkillTagger(
                    skills_filename=skills_filename,
                    hash_function=md5
                ).tagged_documents(corpus_generator)
            for document in tagged_document_generator:
                writer.writerow([document])
        logging.info('Done tagging skills to %s', labeled_filename)
        upload(
            conn, labeled_filename,
            '{}/{}'.format(config['labeled_postings']['s3_path'], quarter))
Example #11
def get_time_range(start='2011-01-01', freq='Q', periods=24):
    # Build a list of quarter labels: generate quarterly timestamps with pandas
    # and convert each one with datetime_to_quarter.
    return [datetime_to_quarter(timestamp)
            for timestamp in pd.date_range(start=start, freq=freq, periods=periods)]
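With the default arguments this yields 24 labels covering 2011 through 2016, since pd.date_range(..., freq='Q') emits quarter-end timestamps. A quick usage check (assuming pandas is available and datetime_to_quarter is importable):

quarters = get_time_range(start='2011-01-01', freq='Q', periods=24)
print(len(quarters))  # 24 labels, one per quarter from 2011 Q1 through 2016 Q4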
Example #12
        # Package the quarter's output tables for publication: write a README
        # for each table folder, download the table data from S3, upload the
        # data and README to the tabular uploads path, then build and upload a
        # top-level README with overall dataset statistics.
        def execute(self, context):

            table_config = config['output_tables']
            folder_readmes = {}
            folder_readmes[table_config['cleaned_geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation, and removing city and state names."""\
                .format(agg_info=COMMON_TITLE_AGG_INFO)

            folder_readmes[table_config['cleaned_title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation, and removing city and state names."""\
                .format(agg_info=COMMON_TITLE_AGG_INFO)

            folder_readmes[table_config['geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation."""\
                .format(agg_info=COMMON_TITLE_AGG_INFO)

            folder_readmes[table_config['title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation."""\
                .format(agg_info=COMMON_TITLE_AGG_INFO)

            folder_readmes[table_config['geo_soc_common_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code inferred by 'common match' method
            """

            folder_readmes[table_config['soc_common_count_dir']] = """
Job postings per SOC code

SOC code inferred by 'common match' method
            """

            folder_readmes[table_config['geo_soc_top_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code inferred by 'top match' method
            """

            folder_readmes[table_config['soc_top_count_dir']] = """
Job postings per SOC code

SOC code inferred by 'top match' method
            """

            folder_readmes[table_config['geo_soc_given_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code given by data source
            """

            folder_readmes[table_config['soc_given_count_dir']] = """
Job postings per SOC code

SOC code given by data source
            """

            local_folder = config.get('output_folder', 'output')
            if not os.path.isdir(local_folder):
                os.mkdir(local_folder)
            source_s3_path = config['output_tables']['s3_path']
            upload_s3_path = config['tabular_uploads']['s3_path']

            s3_conn = S3Hook().get_conn()
            quarter = datetime_to_quarter(context['execution_date'])

            for folder_name, readme_string in folder_readmes.items():
                full_folder = '{}/{}'.format(local_folder, folder_name)
                if not os.path.isdir(full_folder):
                    os.mkdir(full_folder)
                data_filename = '{}.csv'.format(quarter)
                data_filepath = os.path.join(full_folder, data_filename)
                readme_filepath = os.path.join(full_folder, 'README.txt')
                with open(readme_filepath, 'w') as readme_file:
                    readme_file.write(readme_string + "\n" + QUARTERLY_NOTE)
                download(
                    s3_conn, data_filepath,
                    os.path.join(source_s3_path, folder_name, data_filename))
                upload_s3_folder = os.path.join(upload_s3_path, folder_name)
                upload(s3_conn, readme_filepath, upload_s3_folder)
                upload(s3_conn, data_filepath, upload_s3_folder)

            # metadata
            stats_s3_path = config['partner_stats']['s3_path']
            total_jobs = GlobalStatsAggregator(s3_conn=s3_conn)\
                .saved_total(stats_s3_path)
            quarterly_stats = DatasetStatsCounter\
                .quarterly_posting_stats(s3_conn, stats_s3_path)
            partner_list = DatasetStatsAggregator\
                .partners(s3_conn, stats_s3_path)

            base_readme_filepath = os.path.join(local_folder, 'README.txt')
            with open(base_readme_filepath, 'w') as readme_file:
                readme_file.write("Open Skills Datasets\n\n")
                for folder_name, readme_string in folder_readmes.items():
                    readme_file.write("###" + folder_name + "###\n\n")
                    readme_file.write(readme_string + "\n\n\n")
                readme_file.write('Dataset Stats\n\n')
                readme_file.write('Total Job Postings: ' + str(total_jobs) +
                                  "\n")
                readme_file.write('Quarterly Counts\n')
                for quarter in sorted(quarterly_stats.keys()):
                    readme_file.write(quarter + ': ' +
                                      str(quarterly_stats[quarter]) + '\n')
                readme_file.write('Partners: ' + ','.join(partner_list) + '\n')
            upload(s3_conn, base_readme_filepath, upload_s3_path)
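All of the execute(self, context) methods above belong to custom Airflow operators; the scheduler supplies context['execution_date'] for each run. As a minimal, generic sketch of how such an operator gets attached to a quarterly schedule (the operator name, DAG id, and schedule below are assumptions, not taken from the project):

import logging
from datetime import datetime

from airflow import DAG
from airflow.models import BaseOperator


class QuarterlyExampleOperator(BaseOperator):
    # Hypothetical operator following the same execute(self, context) pattern
    # as the examples above.
    def execute(self, context):
        execution_date = context['execution_date']
        quarter = '{}Q{}'.format(execution_date.year,
                                 (execution_date.month - 1) // 3 + 1)
        logging.info('Running for quarter %s', quarter)


dag = DAG(
    'quarterly_example',
    start_date=datetime(2011, 1, 1),
    schedule_interval='0 0 1 */3 *',  # first day of every third month
)

task = QuarterlyExampleOperator(task_id='quarterly_task', dag=dag)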