Example #1
    def save(self, combined_aggregator, quarter, s3_conn):
        """Write the aggregator's group counts and rollup to local CSVs,
        then upload both to their configured S3 locations."""
        logging.info('Saving group counts and rollup')
        count_folder = '{}/{}'.format(
            self.output_folder(),
            config['output_tables'][self.group_config_key])
        if not os.path.isdir(count_folder):
            os.makedirs(count_folder)

        count_filename = '{}/{}_{}.csv'.format(count_folder, quarter,
                                               self.func_name)

        rollup_folder = '{}/{}'.format(
            self.output_folder(),
            config['output_tables'][self.rollup_config_key],
        )

        if not os.path.isdir(rollup_folder):
            os.makedirs(rollup_folder)

        rollup_filename = '{}/{}_{}.csv'.format(rollup_folder, quarter,
                                                self.func_name)
        combined_aggregator.save_counts(count_filename)
        combined_aggregator.save_rollup(rollup_filename)

        logging.info('Uploading to s3')
        upload(
            s3_conn, count_filename,
            '{}/{}'.format(config['output_tables']['s3_path'],
                           config['output_tables'][self.group_config_key]))
        upload(
            s3_conn, rollup_filename,
            '{}/{}'.format(config['output_tables']['s3_path'],
                           config['output_tables'][self.rollup_config_key]))
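
Every example on this page funnels through the same upload() helper, which is not itself shown. The sketch below is a hypothetical reconstruction, inferred from the assertions in Example #5 further down (the first component of s3_path is the bucket name, the rest is a key prefix, and the local file's basename becomes the final key segment), written against the boto 2 API these snippets already rely on:

import os
import boto.s3.key

# Hypothetical upload(); assumed behavior only, not the library's actual code.
def upload(s3_conn, filename, s3_path):
    bucket_name, _, prefix = s3_path.partition('/')
    bucket = s3_conn.get_bucket(bucket_name)
    key = boto.s3.key.Key(
        bucket=bucket,
        name='{}/{}'.format(prefix, os.path.basename(filename)),
    )
    key.set_contents_from_filename(filename)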
Example #2
    def execute(self, context):
        conn = S3Hook().get_conn()
        # Pull the ONET skill importance table through an S3-backed cache,
        # write it locally, then upload the output.
        skill_extractor = OnetSkillImportanceExtractor(
            onet_source=OnetCache(
                conn,
                cache_dir=config['onet']['cache_dir'],
                s3_path=config['onet']['s3_path'],
            ),
            output_filename=skill_importance_filename,
            hash_function=md5,
        )
        skill_extractor.run()
        upload(conn, skill_importance_filename, config['output_tables']['s3_path'])
Example #3
    def execute(self, context):
        conn = S3Hook().get_conn()
        # Same pattern as Example #2, but for ONET occupation titles.
        title_extractor = OnetTitleExtractor(
            onet_source=OnetCache(
                conn,
                cache_dir=config['onet']['cache_dir'],
                s3_path=config['onet']['s3_path'],
            ),
            output_filename=titles_filename,
            hash_function=md5,
        )
        title_extractor.run()
        upload(conn, titles_filename, config['output_tables']['s3_path'])
Example #4
def merge(s3_conn, config_key, quarter, output_folder):
    """Download every per-group CSV for the quarter, merge them into a
    single DataFrame, and upload the combined file back to S3."""
    prefix = '{}{}/{}'.format(config['output_tables']['s3_path'],
                              config['output_tables'][config_key], quarter)
    files = download_with_prefix(s3_conn, prefix + '_', output_folder)
    merge_df = pandas.read_csv(files[0])
    for other_file in files[1:]:
        merge_df = merge_df.merge(pandas.read_csv(other_file))
    merged_filename = os.path.join(output_folder, quarter + '.csv')
    merge_df.to_csv(merged_filename, index=False)

    upload(
        s3_conn, merged_filename,
        '{}/{}'.format(config['output_tables']['s3_path'],
                       config['output_tables'][config_key]))
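
merge() depends on a download_with_prefix() helper that is not shown here. Judging only from how its return value is consumed above, it should fetch every key under the prefix into output_folder and return the local paths; a hedged boto 2 sketch:

import os

# Hypothetical download_with_prefix(); assumed behavior reconstructed from
# the call site in Example #4.
def download_with_prefix(s3_conn, prefix, output_folder):
    bucket_name, _, key_prefix = prefix.partition('/')
    bucket = s3_conn.get_bucket(bucket_name)
    local_paths = []
    for key in bucket.list(prefix=key_prefix):
        local_path = os.path.join(output_folder, os.path.basename(key.name))
        key.get_contents_to_filename(local_path)
        local_paths.append(local_path)
    return local_paths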
Example #5
def test_upload():
    s3_conn = boto.connect_s3()
    bucket_name = 'test-bucket'
    bucket = s3_conn.create_bucket(bucket_name)

    with tempfile.NamedTemporaryFile(mode='w+') as f:
        f.write('test')
        f.seek(0)
        s3_path = 'test-bucket/apath/akey'
        upload(s3_conn, f.name, s3_path)
        # upload() appends the local file's basename to the given S3 prefix,
        # so the stored key is 'apath/akey/<basename>'
        key = boto.s3.key.Key(bucket=bucket,
                              name='apath/akey/{}'.format(
                                  os.path.basename(f.name)))
        s = key.get_contents_as_string()
        assert s.decode('utf-8') == 'test'
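
The test creates a bucket without any credentials or setup, so it presumably runs against a mocked S3 backend. With the moto library (an assumption; the fixture is outside the snippet) the harness would look like:

from moto import mock_s3

# Assumed harness: mock_s3 intercepts boto's S3 calls so connect_s3() and
# create_bucket() never touch real AWS.
@mock_s3
def test_upload():
    ...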
Example #6
def test_embedding_trainer():
    s3_conn = boto.connect_s3()
    bucket_name = 'fake-jb-bucket'
    bucket = s3_conn.create_bucket(bucket_name)

    job_posting_name = 'FAKE_jobposting'
    s3_prefix_jb = 'fake-jb-bucket/job_postings'
    s3_prefix_model = 'fake-jb-bucket/model_cache/embedding/'
    quarter = '2011Q1'

    with tempfile.TemporaryDirectory() as td:
        # write one fake job posting and upload it under the quarter's prefix
        with open(os.path.join(td, job_posting_name), 'w') as handle:
            json.dump(sample_document, handle)
        upload(s3_conn, os.path.join(td, job_posting_name),
               os.path.join(s3_prefix_jb, quarter))

    # Doc2Vec
    trainer = EmbeddingTrainer(
        s3_conn=s3_conn,
        quarters=['2011Q1'],
        jp_s3_path=s3_prefix_jb,
        model_s3_path=s3_prefix_model,
        model_type='doc2vec',
    )
    trainer.train()
    files = list_files(s3_conn, os.path.join(s3_prefix_model, 'doc2vec_gensim_' + trainer.training_time))
    assert len(files) == 3

    assert files == ['doc2vec_gensim_' + trainer.training_time + '.model',
                     'lookup_doc2vec_gensim_' + trainer.training_time + '.json',
                     'metadata_doc2vec_gensim_' + trainer.training_time + '.json']

    with tempfile.TemporaryDirectory() as td:
        trainer.save_model(td)
        assert set(os.listdir(td)) == set(['doc2vec_gensim_' + trainer.training_time + '.model',
                                           'lookup_doc2vec_gensim_' + trainer.training_time + '.json',
                                           'metadata_doc2vec_gensim_' + trainer.training_time + '.json'])

    # Word2Vec
    trainer = EmbeddingTrainer(
        s3_conn=s3_conn,
        quarters=['2011Q1'],
        jp_s3_path=s3_prefix_jb,
        model_s3_path=s3_prefix_model,
        model_type='word2vec',
    )
    trainer.train()
    files = list_files(s3_conn, os.path.join(s3_prefix_model, 'word2vec_gensim_' + trainer.training_time))
    assert len(files) == 2
    assert files == ['metadata_word2vec_gensim_' + trainer.training_time + '.json',
                     'word2vec_gensim_' + trainer.training_time + '.model']

    new_trainer = EmbeddingTrainer(
        s3_conn=s3_conn,
        quarters=['2011Q1'],
        jp_s3_path=s3_prefix_jb,
        model_s3_path=s3_prefix_model,
        model_type='word2vec',
    )
    new_trainer.load(trainer.modelname, s3_prefix_model)
    assert new_trainer.metadata['metadata']['hyperparameters'] == trainer.metadata['metadata']['hyperparameters']
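
list_files() is not defined in the snippet. The equality assertions above only hold if it returns key basenames in sorted order, so a plausible (assumed) implementation is:

import os

# Hypothetical list_files(), consistent with the sorted-order assertions.
def list_files(s3_conn, s3_path):
    bucket_name, _, prefix = s3_path.partition('/')
    bucket = s3_conn.get_bucket(bucket_name)
    return sorted(os.path.basename(key.name) for key in bucket.list(prefix=prefix))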
Example #7
    def execute(self, context):
        conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])
        labeled_filename = 'labeled_corpora_a'
        # build a corpus from the quarter's job postings, tag skills in each
        # document, and write one tagged document per TSV row
        with open(labeled_filename, 'w') as outfile:
            writer = csv.writer(outfile, delimiter='\t')
            job_postings_generator = job_postings(
                conn, quarter, config['job_postings']['s3_path'])
            corpus_generator = SimpleCorpusCreator()\
                .raw_corpora(job_postings_generator)
            tagged_document_generator = \
                SimpleSkillTagger(
                    skills_filename=skills_filename,
                    hash_function=md5
                ).tagged_documents(corpus_generator)
            for document in tagged_document_generator:
                writer.writerow([document])
        logging.info('Done tagging skills to %s', labeled_filename)
        upload(
            conn, labeled_filename,
            '{}/{}'.format(config['labeled_postings']['s3_path'], quarter))
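
Examples #7 and #9 both turn an Airflow execution date into a quarter label with datetime_to_quarter(). A minimal version consistent with the '2011Q1'-style strings used throughout this page (assumed, not the library's actual code):

# Hypothetical datetime_to_quarter(); e.g. datetime(2011, 2, 14) -> '2011Q1'
def datetime_to_quarter(dt):
    return '{}Q{}'.format(dt.year, (dt.month - 1) // 3 + 1)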
Example #8
def test_occupation_classifier():
    s3_conn = boto.connect_s3()
    bucket_name = 'fake-bucket'
    bucket = s3_conn.create_bucket(bucket_name)

    model_name = 'doc2vec_gensim_test'
    s3_prefix_model = 'fake-bucket/cache/embedding/'

    classifier_id = 'ann_0614'
    classifier_name = classifier_id + '.index'

    fake_corpus_train = FakeCorpusGenerator(num=10)

    model = gensim.models.Doc2Vec(size=500, min_count=1, iter=5, window=4)

    with tempfile.TemporaryDirectory() as td:
        model.build_vocab(fake_corpus_train)
        model.train(fake_corpus_train,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        model.save(os.path.join(td, model_name + '.model'))
        upload(s3_conn, os.path.join(td, model_name + '.model'),
               os.path.join(s3_prefix_model, model_name))

    with tempfile.TemporaryDirectory() as td:
        lookup = fake_corpus_train.lookup
        lookup_name = 'lookup_' + model_name + '.json'
        with open(os.path.join(td, lookup_name), 'w') as handle:
            json.dump(lookup, handle)
        upload(s3_conn, os.path.join(td, lookup_name),
               os.path.join(s3_prefix_model, model_name))

    # exact nearest-neighbor classifier over the uploaded embedding model
    nn_classifier = NearestNeighbors(
        model_name=model_name,
        s3_path=s3_prefix_model,
        s3_conn=s3_conn,
    )
    model = nn_classifier.model
    model.init_sims()
    # approximate variant: the same classifier, but backed by an Annoy index
    ann_index = AnnoyIndexer(model, 10)
    ann_classifier = NearestNeighbors(
        model_name=model_name,
        s3_path=s3_prefix_model,
        s3_conn=s3_conn,
    )
    ann_classifier.indexer = ann_index

    clf_top = Classifier(classifier_id=classifier_id,
                         s3_conn=s3_conn,
                         s3_path=s3_prefix_model,
                         classifier=ann_classifier,
                         classify_kwargs={'mode': 'top'})
    clf_common = Classifier(classifier_id=classifier_id,
                            s3_conn=s3_conn,
                            s3_path=s3_prefix_model,
                            classifier=ann_classifier,
                            classify_kwargs={'mode': 'common'})

    assert nn_classifier.model_name == model_name
    assert nn_classifier.indexer != clf_top.classifier.indexer
    assert nn_classifier.predict_soc(docs,
                                     'top')[0] == clf_top.classify(docs)[0]
    assert nn_classifier.predict_soc(
        docs, 'common')[0] == clf_common.classify(docs)[0]
Example #9
    def execute(self, context):
        table_config = config['output_tables']
        folder_readmes = {}
        folder_readmes[table_config['cleaned_geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation, and removing city and state names."""\
            .format(agg_info=COMMON_TITLE_AGG_INFO)

        folder_readmes[table_config['cleaned_title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing, removing punctuation, and removing city and state names."""\
            .format(agg_info=COMMON_TITLE_AGG_INFO)

        folder_readmes[table_config['geo_title_count_dir']] = """
Counts of job posting title occurrences by CBSA.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation."""\
            .format(agg_info=COMMON_TITLE_AGG_INFO)

        folder_readmes[table_config['title_count_dir']] = """
Counts of job posting title occurrences.

{agg_info}

Job titles are cleaned by lowercasing and removing punctuation."""\
            .format(agg_info=COMMON_TITLE_AGG_INFO)

        folder_readmes[table_config['geo_soc_common_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code inferred by 'common match' method
        """

        folder_readmes[table_config['soc_common_count_dir']] = """
Job postings per SOC code

SOC code inferred by 'common match' method
        """

        folder_readmes[table_config['geo_soc_top_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code inferred by 'top match' method
        """

        folder_readmes[table_config['soc_top_count_dir']] = """
Job postings per SOC code

SOC code inferred by 'top match' method
        """

        folder_readmes[table_config['geo_soc_given_count_dir']] = """
Job postings per SOC code, by CBSA.

SOC code given by data source
        """

        folder_readmes[table_config['soc_given_count_dir']] = """
Job postings per SOC code

SOC code given by data source
        """

        local_folder = config.get('output_folder', 'output')
        if not os.path.isdir(local_folder):
            os.mkdir(local_folder)
        source_s3_path = config['output_tables']['s3_path']
        upload_s3_path = config['tabular_uploads']['s3_path']

        s3_conn = S3Hook().get_conn()
        quarter = datetime_to_quarter(context['execution_date'])

        # for each table: write its README, download the quarter's CSV from
        # the source path, and upload both to the tabular-uploads path
        for folder_name, readme_string in folder_readmes.items():
            full_folder = '{}/{}'.format(local_folder, folder_name)
            if not os.path.isdir(full_folder):
                os.mkdir(full_folder)
            data_filename = '{}.csv'.format(quarter)
            data_filepath = os.path.join(full_folder, data_filename)
            readme_filepath = os.path.join(full_folder, 'README.txt')
            with open(readme_filepath, 'w') as readme_file:
                readme_file.write(readme_string + "\n" + QUARTERLY_NOTE)
            download(
                s3_conn, data_filepath,
                os.path.join(source_s3_path, folder_name, data_filename))
            upload_s3_folder = os.path.join(upload_s3_path, folder_name)
            upload(s3_conn, readme_filepath, upload_s3_folder)
            upload(s3_conn, data_filepath, upload_s3_folder)

        # metadata: dataset-wide stats for the top-level README
        stats_s3_path = config['partner_stats']['s3_path']
        total_jobs = GlobalStatsAggregator(s3_conn=s3_conn)\
            .saved_total(stats_s3_path)
        quarterly_stats = DatasetStatsCounter\
            .quarterly_posting_stats(s3_conn, stats_s3_path)
        partner_list = DatasetStatsAggregator\
            .partners(s3_conn, stats_s3_path)

        base_readme_filepath = os.path.join(local_folder, 'README.txt')
        with open(base_readme_filepath, 'w') as readme_file:
            readme_file.write("Open Skills Datasets\n\n")
            for folder_name, readme_string in folder_readmes.items():
                readme_file.write("###" + folder_name + "###\n\n")
                readme_file.write(readme_string + "\n\n\n")
            readme_file.write('Dataset Stats\n\n')
            readme_file.write('Total Job Postings: ' + str(total_jobs) + "\n")
            readme_file.write('Quarterly Counts\n')
            for quarter in sorted(quarterly_stats.keys()):
                readme_file.write(quarter + ': ' +
                                  str(quarterly_stats[quarter]) + '\n')
            readme_file.write('Partners: ' + ','.join(partner_list) + '\n')
        upload(s3_conn, base_readme_filepath, upload_s3_path)
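
Example #9 also calls a download() helper, the mirror image of upload(). Assuming the same boto 2 conventions as the upload() sketch under Example #1, it might look like:

# Hypothetical single-key download(), matching the call shape in Example #9.
def download(s3_conn, out_filename, s3_path):
    bucket_name, _, key_name = s3_path.partition('/')
    bucket = s3_conn.get_bucket(bucket_name)
    bucket.get_key(key_name).get_contents_to_filename(out_filename)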
Example #10
    def _upload(self):
        with tempfile.TemporaryDirectory() as td:
            # save the model artifacts locally, then upload every file whose
            # name carries this run's training_time stamp
            self.save_model(td)
            for f in glob(os.path.join(td, '*{}*'.format(self.training_time))):
                upload(self.s3_conn, f, os.path.join(self.model_s3_path, self.modelname))