def save(self, combined_aggregator, quarter, s3_conn):
    """Save the combined aggregator's group counts and rollup as quarterly CSVs,
    then upload both files to their configured S3 locations."""
    logging.info('Saving group counts and rollup')
    count_folder = '{}/{}'.format(
        self.output_folder(),
        config['output_tables'][self.group_config_key]
    )
    if not os.path.isdir(count_folder):
        os.makedirs(count_folder)
    count_filename = '{}/{}_{}.csv'.format(count_folder, quarter, self.func_name)

    rollup_folder = '{}/{}'.format(
        self.output_folder(),
        config['output_tables'][self.rollup_config_key],
    )
    if not os.path.isdir(rollup_folder):
        os.makedirs(rollup_folder)
    rollup_filename = '{}/{}_{}.csv'.format(rollup_folder, quarter, self.func_name)

    combined_aggregator.save_counts(count_filename)
    combined_aggregator.save_rollup(rollup_filename)

    logging.info('Uploading to s3')
    upload(
        s3_conn,
        count_filename,
        '{}/{}'.format(
            config['output_tables']['s3_path'],
            config['output_tables'][self.group_config_key]
        )
    )
    upload(
        s3_conn,
        rollup_filename,
        '{}/{}'.format(
            config['output_tables']['s3_path'],
            config['output_tables'][self.rollup_config_key]
        )
    )
def execute(self, context):
    """Run the ONET skill importance extractor against the cached ONET source
    and upload the output file to S3."""
    conn = S3Hook().get_conn()
    skill_extractor = OnetSkillImportanceExtractor(
        onet_source=OnetCache(
            conn,
            cache_dir=config['onet']['cache_dir'],
            s3_path=config['onet']['s3_path'],
        ),
        output_filename=skill_importance_filename,
        hash_function=md5
    )
    skill_extractor.run()
    upload(conn, skill_importance_filename, config['output_tables']['s3_path'])
def execute(self, context):
    """Run the ONET title extractor against the cached ONET source
    and upload the output file to S3."""
    conn = S3Hook().get_conn()
    title_extractor = OnetTitleExtractor(
        onet_source=OnetCache(
            conn,
            cache_dir=config['onet']['cache_dir'],
            s3_path=config['onet']['s3_path'],
        ),
        output_filename=titles_filename,
        hash_function=md5
    )
    title_extractor.run()
    upload(conn, titles_filename, config['output_tables']['s3_path'])
def merge(s3_conn, config_key, quarter, output_folder):
    """Download every file uploaded for this quarter and config key, merge them
    on their shared columns, and upload the merged CSV back to S3."""
    prefix = '{}{}/{}'.format(
        config['output_tables']['s3_path'],
        config['output_tables'][config_key],
        quarter
    )
    files = download_with_prefix(s3_conn, prefix + '_', output_folder)
    merge_df = pandas.read_csv(files[0])
    for other_file in files[1:]:
        merge_df = merge_df.merge(pandas.read_csv(other_file))
    merged_filename = os.path.join(output_folder, quarter + '.csv')
    merge_df.to_csv(merged_filename, index=False)
    upload(
        s3_conn,
        merged_filename,
        '{}/{}'.format(
            config['output_tables']['s3_path'],
            config['output_tables'][config_key]
        )
    )
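# Hedged illustration (not project code) of the pandas behavior the merge loop
# above relies on: DataFrame.merge with no 'on' argument performs an inner join
# on every column the two frames share, so the per-quarter CSV shards line up
# on their common key columns. Column names here are hypothetical.
import pandas

left = pandas.DataFrame({'quarter': ['2014Q1'], 'title_count': [10]})
right = pandas.DataFrame({'quarter': ['2014Q1'], 'soc_count': [7]})
combined = left.merge(right)  # inner join on the shared 'quarter' column
assert set(combined.columns) == {'quarter', 'title_count', 'soc_count'}
assert len(combined) == 1  # one row per matching key value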
def test_upload():
    s3_conn = boto.connect_s3()
    bucket_name = 'test-bucket'
    bucket = s3_conn.create_bucket(bucket_name)
    with tempfile.NamedTemporaryFile(mode='w+') as f:
        f.write('test')
        f.seek(0)
        s3_path = 'test-bucket/apath/akey'
        upload(s3_conn, f.name, s3_path)
        key = boto.s3.key.Key(
            bucket=bucket,
            name='apath/akey/{}'.format(os.path.basename(f.name))
        )
        s = key.get_contents_as_string()
        assert s.decode('utf-8') == 'test'
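# Minimal sketch of the upload() contract that test_upload exercises, assuming
# the boto 2 API used above; this is not the project's implementation. The
# first segment of s3_path is treated as the bucket, the rest as a key prefix,
# and the local file's basename is appended to form the final key.
import os
import boto

def upload_sketch(s3_conn, filename, s3_path):
    bucket_name, _, prefix = s3_path.partition('/')
    bucket = s3_conn.get_bucket(bucket_name)
    key = boto.s3.key.Key(
        bucket=bucket,
        name='{}/{}'.format(prefix, os.path.basename(filename))
    )
    key.set_contents_from_filename(filename)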
def test_embedding_trainer():
    s3_conn = boto.connect_s3()
    bucket_name = 'fake-jb-bucket'
    bucket = s3_conn.create_bucket(bucket_name)
    job_posting_name = 'FAKE_jobposting'
    s3_prefix_jb = 'fake-jb-bucket/job_postings'
    s3_prefix_model = 'fake-jb-bucket/model_cache/embedding/'
    quarters = '2011Q1'

    with tempfile.TemporaryDirectory() as td:
        with open(os.path.join(td, job_posting_name), 'w') as handle:
            json.dump(sample_document, handle)
        upload(s3_conn, os.path.join(td, job_posting_name),
               os.path.join(s3_prefix_jb, quarters))

    # Doc2Vec
    trainer = EmbeddingTrainer(
        s3_conn=s3_conn,
        quarters=['2011Q1'],
        jp_s3_path=s3_prefix_jb,
        model_s3_path=s3_prefix_model,
        model_type='doc2vec'
    )
    trainer.train()
    files = list_files(
        s3_conn,
        os.path.join(s3_prefix_model, 'doc2vec_gensim_' + trainer.training_time)
    )
    assert len(files) == 3
    assert files == [
        'doc2vec_gensim_' + trainer.training_time + '.model',
        'lookup_doc2vec_gensim_' + trainer.training_time + '.json',
        'metadata_doc2vec_gensim_' + trainer.training_time + '.json'
    ]

    with tempfile.TemporaryDirectory() as td:
        trainer.save_model(td)
        assert set(os.listdir(td)) == set([
            'doc2vec_gensim_' + trainer.training_time + '.model',
            'lookup_doc2vec_gensim_' + trainer.training_time + '.json',
            'metadata_doc2vec_gensim_' + trainer.training_time + '.json'
        ])

    # Word2Vec
    trainer = EmbeddingTrainer(
        s3_conn=s3_conn,
        quarters=['2011Q1'],
        jp_s3_path=s3_prefix_jb,
        model_s3_path=s3_prefix_model,
        model_type='word2vec'
    )
    trainer.train()
    files = list_files(
        s3_conn,
        os.path.join(s3_prefix_model, 'word2vec_gensim_' + trainer.training_time)
    )
    assert len(files) == 2
    assert files == [
        'metadata_word2vec_gensim_' + trainer.training_time + '.json',
        'word2vec_gensim_' + trainer.training_time + '.model'
    ]

    new_trainer = EmbeddingTrainer(
        s3_conn=s3_conn,
        quarters=['2011Q1'],
        jp_s3_path=s3_prefix_jb,
        model_s3_path=s3_prefix_model,
        model_type='word2vec'
    )
    new_trainer.load(trainer.modelname, s3_prefix_model)
    assert new_trainer.metadata['metadata']['hyperparameters'] == \
        trainer.metadata['metadata']['hyperparameters']
def execute(self, context):
    """Tag skills in the quarter's job postings and upload the labeled corpus to S3."""
    conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])
    labeled_filename = 'labeled_corpora_a'
    with open(labeled_filename, 'w') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        job_postings_generator = job_postings(
            conn,
            quarter,
            config['job_postings']['s3_path']
        )
        corpus_generator = SimpleCorpusCreator()\
            .raw_corpora(job_postings_generator)
        tagged_document_generator = \
            SimpleSkillTagger(
                skills_filename=skills_filename,
                hash_function=md5
            ).tagged_documents(corpus_generator)
        for document in tagged_document_generator:
            writer.writerow([document])
    logging.info('Done tagging skills to %s', labeled_filename)
    upload(
        conn,
        labeled_filename,
        '{}/{}'.format(config['labeled_postings']['s3_path'], quarter)
    )
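# Hedged sketch of the quarter-string format assumed for datetime_to_quarter,
# based on the '2011Q1'-style strings used elsewhere in this code; this is not
# the project's implementation, just year plus a 1-based quarter index.
from datetime import datetime

def to_quarter(dt):
    return '{}Q{}'.format(dt.year, (dt.month - 1) // 3 + 1)

assert to_quarter(datetime(2011, 2, 14)) == '2011Q1'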
def test_occupation_classifier():
    s3_conn = boto.connect_s3()
    bucket_name = 'fake-bucket'
    bucket = s3_conn.create_bucket(bucket_name)
    model_name = 'doc2vec_gensim_test'
    s3_prefix_model = 'fake-bucket/cache/embedding/'
    classifier_id = 'ann_0614'
    classifier_name = classifier_id + '.index'

    fake_corpus_train = FakeCorpusGenerator(num=10)
    model = gensim.models.Doc2Vec(size=500, min_count=1, iter=5, window=4)

    with tempfile.TemporaryDirectory() as td:
        model.build_vocab(fake_corpus_train)
        model.train(fake_corpus_train, total_examples=model.corpus_count, epochs=model.iter)
        model.save(os.path.join(td, model_name + '.model'))
        upload(s3_conn, os.path.join(td, model_name + '.model'),
               os.path.join(s3_prefix_model, model_name))

    with tempfile.TemporaryDirectory() as td:
        lookup = fake_corpus_train.lookup
        lookup_name = 'lookup_' + model_name + '.json'
        with open(os.path.join(td, lookup_name), 'w') as handle:
            json.dump(lookup, handle)
        upload(s3_conn, os.path.join(td, lookup_name),
               os.path.join(s3_prefix_model, model_name))

    nn_classifier = NearestNeighbors(
        model_name=model_name,
        s3_path=s3_prefix_model,
        s3_conn=s3_conn,
    )

    model = nn_classifier.model
    model.init_sims()
    ann_index = AnnoyIndexer(model, 10)

    ann_classifier = NearestNeighbors(
        model_name=model_name,
        s3_path=s3_prefix_model,
        s3_conn=s3_conn,
    )
    ann_classifier.indexer = ann_index

    clf_top = Classifier(
        classifier_id=classifier_id,
        s3_conn=s3_conn,
        s3_path=s3_prefix_model,
        classifier=ann_classifier,
        classify_kwargs={'mode': 'top'}
    )
    clf_common = Classifier(
        classifier_id=classifier_id,
        s3_conn=s3_conn,
        s3_path=s3_prefix_model,
        classifier=ann_classifier,
        classify_kwargs={'mode': 'common'}
    )

    assert nn_classifier.model_name == model_name
    assert nn_classifier.indexer != clf_top.classifier.indexer
    assert nn_classifier.predict_soc(docs, 'top')[0] == clf_top.classify(docs)[0]
    assert nn_classifier.predict_soc(docs, 'common')[0] == clf_common.classify(docs)[0]
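# Hedged sketch of how a gensim AnnoyIndexer is typically built and queried
# (gensim 3.x API, matching the size=/iter= parameters used above; requires the
# 'annoy' package). The corpus, vector size, and query words are illustrative,
# not taken from the test fixtures.
import gensim
from gensim.similarities.index import AnnoyIndexer

tagged_docs = [
    gensim.models.doc2vec.TaggedDocument(words=['fake', 'words', str(i)], tags=[str(i)])
    for i in range(10)
]
model = gensim.models.Doc2Vec(size=50, min_count=1, iter=5, window=4)
model.build_vocab(tagged_docs)
model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.iter)
model.init_sims()
indexer = AnnoyIndexer(model, 10)  # 10 Annoy trees, as in the test above
inferred = model.infer_vector(['fake', 'words'])
# Approximate nearest documents via the Annoy index rather than brute force
neighbors = model.docvecs.most_similar([inferred], topn=3, indexer=indexer)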
def execute(self, context):
    table_config = config['output_tables']
    folder_readmes = {}
    folder_readmes[table_config['cleaned_geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation,
and removing city and state names."""\
        .format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['cleaned_title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation,
and removing city and state names."""\
        .format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation."""\
        .format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation."""\
        .format(agg_info=COMMON_TITLE_AGG_INFO)

    folder_readmes[table_config['geo_soc_common_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code inferred by 'common match' method
"""

    folder_readmes[table_config['soc_common_count_dir']] = """
Job postings per SOC code

SOC code inferred by 'common match' method
"""

    folder_readmes[table_config['geo_soc_top_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code inferred by 'top match' method
"""

    folder_readmes[table_config['soc_top_count_dir']] = """
Job postings per SOC code

SOC code inferred by 'top match' method
"""

    folder_readmes[table_config['geo_soc_given_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code given by data source
"""

    folder_readmes[table_config['soc_given_count_dir']] = """
Job postings per SOC code

SOC code given by data source
"""

    local_folder = config.get('output_folder', 'output')
    if not os.path.isdir(local_folder):
        os.mkdir(local_folder)
    source_s3_path = config['output_tables']['s3_path']
    upload_s3_path = config['tabular_uploads']['s3_path']

    s3_conn = S3Hook().get_conn()
    quarter = datetime_to_quarter(context['execution_date'])

    for folder_name, readme_string in folder_readmes.items():
        full_folder = '{}/{}'.format(local_folder, folder_name)
        if not os.path.isdir(full_folder):
            os.mkdir(full_folder)
        data_filename = '{}.csv'.format(quarter)
        data_filepath = os.path.join(full_folder, data_filename)
        readme_filepath = os.path.join(full_folder, 'README.txt')
        with open(readme_filepath, 'w') as readme_file:
            readme_file.write(readme_string + "\n" + QUARTERLY_NOTE)
        download(
            s3_conn,
            data_filepath,
            os.path.join(source_s3_path, folder_name, data_filename)
        )
        upload_s3_folder = os.path.join(upload_s3_path, folder_name)
        upload(s3_conn, readme_filepath, upload_s3_folder)
        upload(s3_conn, data_filepath, upload_s3_folder)

    # metadata
    stats_s3_path = config['partner_stats']['s3_path']
    total_jobs = GlobalStatsAggregator(s3_conn=s3_conn)\
        .saved_total(stats_s3_path)
    quarterly_stats = DatasetStatsCounter\
        .quarterly_posting_stats(s3_conn, stats_s3_path)
    partner_list = DatasetStatsAggregator\
        .partners(s3_conn, stats_s3_path)

    base_readme_filepath = os.path.join(local_folder, 'README.txt')
    with open(base_readme_filepath, 'w') as readme_file:
        readme_file.write("Open Skills Datasets\n\n")
        for folder_name, readme_string in folder_readmes.items():
            readme_file.write("###" + folder_name + "###\n\n")
            readme_file.write(readme_string + "\n\n\n")
        readme_file.write('Dataset Stats\n\n')
        readme_file.write('Total Job Postings: ' + str(total_jobs) + "\n")
        readme_file.write('Quarterly Counts\n')
        for quarter in sorted(quarterly_stats.keys()):
            readme_file.write(quarter + ': ' + str(quarterly_stats[quarter]) + '\n')
        readme_file.write('Partners: ' + ','.join(partner_list) + '\n')
    upload(s3_conn, base_readme_filepath, upload_s3_path)
def _upload(self):
    """Save model artifacts from this training run to a temporary directory
    and upload each file to the model's S3 path."""
    with tempfile.TemporaryDirectory() as td:
        self.save_model(td)
        for f in glob(os.path.join(td, '*{}*'.format(self.training_time))):
            upload(
                self.s3_conn,
                f,
                os.path.join(self.model_s3_path, self.modelname)
            )