Example #1
def define_partner_quarterly(main_dag_name):
    dag = DAG(dag_id='{}.partner_quarterly'.format(main_dag_name),
              default_args=default_args,
              schedule_interval='0 0 1 */3 *',
              max_active_runs=1)

    raw_jobs = config.get('raw_jobs_s3_paths', {})
    if not raw_jobs:
        return dag
    if 'VA' not in raw_jobs:
        return dag
    va_bucket, va_prefix = split_s3_path(raw_jobs['VA'])
    PartnerUpdateOperator(
        task_id='va_jobs_update',
        dag=dag,
        sources=[
            'http://opendata.cs.vt.edu/dataset/2002e48d-363e-40d1-9d1b-a134301126a7/resource/d40efa75-ed86-4a01-9854-90d27539d477/download/joblistings.merged.parsed.unique.grpbyyear.2010-2015.01.json',
            'http://opendata.cs.vt.edu/dataset/2002e48d-363e-40d1-9d1b-a134301126a7/resource/c7d4d3e6-61fb-4985-920d-7ac20732083d/download/joblistings.merged.parsed.unique.grpbyyear.2010-2015.02.json',
            'http://opendata.cs.vt.edu/dataset/2002e48d-363e-40d1-9d1b-a134301126a7/resource/638255b0-cd2f-4b34-8abb-cf46f075bdfd/download/joblistings.merged.parsed.unique.grpbyyear.2010-2015.03.json',
            'http://opendata.cs.vt.edu/dataset/2002e48d-363e-40d1-9d1b-a134301126a7/resource/7e14bb60-2474-420b-ae7b-62b195051f1f/download/joblistings.merged.parsed.unique.grpbyyear.2010-2015.04.json',
            'http://opendata.cs.vt.edu/dataset/b67a5b8e-679a-4442-a8c8-4bb55a4618d6/resource/62da570a-6970-46de-b206-dab067ba51eb/download/joblistings.merged.parsed.unique.grpbyyear.2016.json'
        ],
        output_bucket=va_bucket,
        output_prefix=va_prefix,
        cache_headers=['Content-Range'],
    )

    return dag
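
Every example on this page calls split_s3_path, whose implementation is not shown. Below is a minimal sketch of the assumed behavior (splitting a 'bucket/prefix' or 's3://bucket/prefix' style path into a (bucket_name, prefix) pair), offered as an illustration rather than the library's actual code:

def split_s3_path(path):
    """Split an S3 path into (bucket_name, prefix).

    Illustrative sketch only: the examples assume a helper with this return
    shape, but the real implementation does not appear on this page.
    """
    # Drop an optional s3:// scheme, then split on the first slash
    stripped = path.replace('s3://', '', 1)
    bucket_name, _, prefix = stripped.partition('/')
    return bucket_name, prefix

# e.g. split_s3_path('my-bucket/quarterly/stats') returns ('my-bucket', 'quarterly/stats')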
Example #2
    def save(self, s3_conn, s3_prefix):
        """Save stats to S3, including percentages
        Args:
            s3_conn (boto.Connection): an s3 connection
            s3_prefix (str): s3 path (including bucket) at which to save dataset stats
        """
        bucket_name, prefix = split_s3_path(s3_prefix)
        bucket = s3_conn.get_bucket(bucket_name)
        for field_name, counts in self.accumulator.items():
            # csv.writer needs a text-mode stream under Python 3, so use StringIO rather than BytesIO
            output = StringIO()
            writer = csv.writer(output)
            for value, count in counts.most_common():
                writer.writerow([value, count])

            key = boto.s3.key.Key(
                bucket=bucket,
                name='{}/{}/{}/{}.csv'.format(
                    prefix,
                    self.directory,
                    self.quarter,
                    field_name
                )
            )
            logging.info('Writing stats to %s', key)
            output.seek(0)
            key.set_contents_from_string(output.getvalue())
Example #3
def generate_job_postings_from_s3(
    s3_conn,
    s3_prefix: Text,
) -> JobPostingGeneratorType:
    """
    Stream all job listings from s3
    Args:
        s3_conn: a boto s3 connection
        s3_prefix: path to the job listings.

    Yields:
        dict parsed from the JSON for the next job posting
            Refer to sample_job_listing.json for example structure
    """
    retrier = Retrying(retry_on_exception=retry_if_io_error,
                       wait_exponential_multiplier=100,
                       wait_exponential_max=100000)
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = s3_conn.get_bucket(bucket_name)
    keys = bucket.list(prefix=prefix)

    for key in keys:
        logging.info('Extracting job postings from key {}'.format(key.name))
        with BytesIO() as outfile:
            retrier.call(key.get_contents_to_file,
                         outfile,
                         cb=log_download_progress)
            outfile.seek(0)
            for line in outfile:
                yield json.loads(line.decode('utf-8'))
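
A minimal usage sketch for the generator above; the connection setup, bucket, and prefix are hypothetical placeholders and would need real credentials and data to run:

import boto

s3_conn = boto.connect_s3()  # hypothetical connection; any boto S3 connection works

# Each yielded item is a dict parsed from one JSON line under the prefix
count = 0
for posting in generate_job_postings_from_s3(s3_conn, 'my-bucket/job_postings/2015Q1'):
    count += 1
print('streamed {} postings'.format(count))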
Example #4
 def _save(self, s3_prefix):
     """Save stats to S3, including percentages
     """
     bucket_name, prefix = split_s3_path(s3_prefix)
     bucket = self.s3_conn.get_bucket(bucket_name)
     self._compute_percentages()
     self.stats['last_updated'] = datetime.now().isoformat()
     key = self._key(bucket, prefix)
     key.set_contents_from_string(json.dumps(self.stats))
Example #5
def define_partner_etl(main_dag_name):
    dag = QuarterlySubDAG(main_dag_name, 'partner_etl')

    raw_jobs = config.get('raw_jobs_s3_paths', {})
    if not raw_jobs:
        return dag
    bucket, prefix = split_s3_path(config['job_postings']['s3_path'])

    partner_stats = {}
    for partner_id, s3_path in raw_jobs.items():
        importer_class = importers.get(partner_id, None)
        if not importer_class:
            logging.warning('Importer for %s not found, skipping', partner_id)
            continue

        input_bucket, input_prefix = split_s3_path(s3_path)

        etl = PartnerETLOperator(
            task_id='{}_etl'.format(partner_id),
            dag=dag,
            transformer_class=importer_class,
            output_bucket=bucket,
            output_prefix=prefix,
            partner_id=partner_id,
            passthrough_kwargs={
                'bucket_name': input_bucket,
                'prefix': input_prefix,
            }
        )

        partner_stats[partner_id] = PartnerStatsAggregateOperator(
            task_id='{}_partner_agg'.format(partner_id),
            dag=dag,
            partner_id=partner_id
        )

        partner_stats[partner_id].set_upstream(etl)

    global_stats = GlobalStatsAggregateOperator(task_id='global_agg', dag=dag)
    for partner_stats_instance in partner_stats.values():
        global_stats.set_upstream(partner_stats_instance)

    return dag
Example #6
 def quarterly_posting_stats(s3_conn, stats_s3_path):
     bucket_name, prefix = split_s3_path(stats_s3_path)
     bucket = s3_conn.get_bucket(bucket_name)
     total = Counter()
     for key in bucket.list(
             prefix='{}/{}'.format(prefix, DatasetStatsCounter.directory)):
         quarter = key.name[-6:]
         stats = json.loads(key.get_contents_as_string().decode('utf-8'))
         total[quarter] += stats['total']
     return total
Example #7
def download_with_prefix(s3_conn, s3_prefix, out_directory):
    bucket_name, prefix = split_s3_path(s3_prefix)
    bucket = s3_conn.get_bucket(bucket_name)
    out_filenames = []
    for key in bucket.list(prefix=prefix):
        leaf_name = key.name.split('/')[-1]
        out_filename = os.path.join(out_directory, leaf_name)
        key.get_contents_to_filename(out_filename)
        out_filenames.append(out_filename)
    return out_filenames
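
A usage sketch, assuming an existing bucket and prefix; the path and connection below are placeholders:

import tempfile

import boto

s3_conn = boto.connect_s3()  # hypothetical connection
with tempfile.TemporaryDirectory() as tmpdir:
    filenames = download_with_prefix(s3_conn, 'my-bucket/onet/versions', tmpdir)
    # filenames holds the local path of every key that was downloaded
    print(filenames)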
Example #8
 def partners(s3_conn, s3_prefix):
     partners_list = []
     bucket_name, prefix = split_s3_path(s3_prefix)
     bucket = s3_conn.get_bucket(bucket_name)
     for key in bucket.list(prefix='{}/{}'.format(
             prefix, DatasetStatsAggregator.directory)):
         stats = json.loads(key.get_contents_as_string().decode('utf-8'))
         if stats['total'] > 0:
             partner_id = key.name.split('/')[-1].split('.')[0]
             partners_list.append(partner_id)
     return partners_list
Example #9
 def _save(self, s3_prefix):
     """Save stats to S3, including percentages
     """
     bucket_name, prefix = split_s3_path(s3_prefix)
     bucket = self.s3_conn.get_bucket(bucket_name)
     self._compute_percentages()
     self.stats['last_updated'] = datetime.now().isoformat()
     key = boto.s3.key.Key(bucket=bucket,
                           name='{}/{}/{}.json'.format(
                               prefix, self.directory, self.dataset_id))
     key.set_contents_from_string(json.dumps(self.stats))
Example #10
 def __init__(self, s3_conn, s3_path, cache_dir):
     """
     Args:
         s3_conn: a boto s3 connection
         s3_path: path to the onet directory
         cache_dir: directory to cache files
     """
     self.s3_conn = s3_conn
     self.cache_dir = cache_dir
     self.s3_path = s3_path
     self.bucket_name, self.prefix = split_s3_path(self.s3_path)
Example #11
    def test_iterate(self):
        """Test that records from all files are properly returned or excluded
        according to the given date range

        This is explicitly testing edge cases; under normal operation
        each file will just contain records for the specified quarter 
        """
        bucket_name, prefix = split_s3_path(self.s3_prefix)
        bucket = self.connection.create_bucket(bucket_name)
        mock_data = {
            '2014Q4.gz': [
                {
                    'dateacquired': '2014-12-15 00:00:00'
                },
                {
                    'dateacquired': '2014-11-15 00:00:00'
                },
                {
                    'dateacquired': '2015-01-15 00:00:00'
                },
            ],
            '2015Q1.gz': [
                {
                    'dateacquired': '2014-12-15 00:00:00'
                },
                {
                    'dateacquired': '2014-11-15 00:00:00'
                },
                {
                    'dateacquired': '2015-01-15 00:00:00'
                },
            ]
        }

        for keyname, rows in mock_data.items():
            key = boto.s3.key.Key(bucket=bucket,
                                  name='{}/{}'.format(prefix, keyname))
            stream = BytesIO()
            gzipfile = gzip.GzipFile(fileobj=stream, mode='w')
            gzipfile.write(b'dateacquired\n')
            for row in rows:
                gzipfile.write(row['dateacquired'].encode('utf-8'))
                gzipfile.write(b'\n')
            gzipfile.close()
            stream.seek(0)
            key.set_contents_from_file(stream)

        self.assert_num_postings_for_quarter('2015Q1', 1)
        self.assert_num_postings_for_quarter('2014Q4', 2)
        self.assert_num_postings_for_quarter('2014Q1', 0)
Example #12
 def execute(self, context):
     conn = S3Hook()
     input_bucket, input_prefix = split_s3_path(config['output_tables']['s3_path'])
     key = conn.get_key(
         '{}/{}'.format(input_prefix, titles_filename),
         bucket_name=input_bucket
     )
     text = key.get_contents_as_string().decode('utf-8')
     reader = csv.DictReader(io.StringIO(text), delimiter='\t')
     JobTitlesMasterIndexer(
         s3_conn=conn.get_conn(),
         es_client=basic_client(),
         job_title_generator=reader,
         alias_name=config['normalizer']['titles_master_index_name']
     ).replace()
Example #13
 def save(self, s3_conn, s3_prefix):
     """Save stats to S3, including percentages
     Args:
          s3_conn (boto.Connection): an s3 connection
          s3_prefix (str): s3 path (including bucket) at which to save dataset stats
     """
     bucket_name, prefix = split_s3_path(s3_prefix)
     bucket = s3_conn.get_bucket(bucket_name)
     self._compute_percentages()
     self.stats['last_updated'] = datetime.now().isoformat()
     key = boto.s3.key.Key(bucket=bucket,
                           name='{}/{}/{}_{}'.format(
                               prefix, self.directory, self.dataset_id,
                               self.quarter))
     key.set_contents_from_string(json.dumps(self.stats))
Example #14
def job_postings(s3_conn, quarter, s3_path, source="all"):
    """
    Stream all job listings from s3 for a given quarter
    Args:
        s3_conn: a boto s3 connection
        quarter: a string representing a quarter (2015Q1)
        s3_path: path to the job listings.
        source: a single source name ("nlx", "va", "cb"), "all", or a list of source names

    Yields:
        string in json format representing the next job listing
            Refer to sample_job_listing.json for example structure
    """
    retrier = Retrying(
        retry_on_exception=retry_if_io_error,
        wait_exponential_multiplier=100,
        wait_exponential_max=100000
    )
    bucket_name, prefix = split_s3_path(s3_path)
    bucket = s3_conn.get_bucket(bucket_name)
    if isinstance(source, str):
        if source.lower() == "all":
            keys = bucket.list(prefix='{}/{}'.format(prefix, quarter))
        else:
            keys = bucket.list(prefix='{}/{}/{}_'.format(prefix, quarter, source.upper()))
    elif isinstance(source, list):
        keys = []
        for s in source:
            keys.append(bucket.list(prefix='{}/{}/{}_'.format(prefix, quarter, s.upper())))
        keys = chain(*keys)
    else:
        raise ValueError('source must be "all", a single source name, or a list of source names')

    for key in keys:
        logging.info('Extracting job postings from key {}'.format(key.name))
        with BytesIO() as outfile:
            retrier.call(key.get_contents_to_file, outfile, cb=log_download_progress)
            outfile.seek(0)
            for line in outfile:
                yield line.decode('utf-8')
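
A usage sketch for the source parameter; the connection, bucket, and quarter are placeholders, and the key prefixes match the ones the function itself builds ('{prefix}/{quarter}/{SOURCE}_'):

import boto

s3_conn = boto.connect_s3()  # hypothetical connection

# All sources for the quarter
for raw_json in job_postings(s3_conn, '2015Q1', 'my-bucket/job_postings'):
    pass  # each item is the raw JSON string for one posting

# Only NLX and VA keys
for raw_json in job_postings(s3_conn, '2015Q1', 'my-bucket/job_postings', source=['nlx', 'va']):
    pass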
Example #15
def add_s3_content(s3_conn, key_data):
    for path, data in key_data.items():
        bucket_name, key_name = split_s3_path(path)
        bucket = s3_conn.create_bucket(bucket_name)
        key = boto.s3.key.Key(bucket=bucket, name=key_name)
        key.set_contents_from_string(data)
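
add_s3_content reads like a test helper; below is a sketch of how it might be used with moto's mock_s3 decorator. moto, the bucket, and the key names are assumptions not shown on this page, and only older moto releases patch the legacy boto client used here:

import boto
from moto import mock_s3  # assumes an older moto release that still mocks legacy boto


@mock_s3
def test_add_s3_content():
    s3_conn = boto.connect_s3()
    add_s3_content(s3_conn, {
        'test-bucket/postings/2015Q1/part-0': '{"jobtitle": "Engineer"}',
    })
    key = s3_conn.get_bucket('test-bucket').get_key('postings/2015Q1/part-0')
    assert key.get_contents_as_string().decode('utf-8') == '{"jobtitle": "Engineer"}'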
Example #16
 def _iterate_keys(self, s3_prefix):
     bucket_name, prefix = split_s3_path(s3_prefix)
     bucket = self.s3_conn.get_bucket(bucket_name)
     for key in bucket.list(
             prefix='{}/quarterly/{}_'.format(prefix, self.dataset_id)):
         yield key
Example #17
 def _iterate_keys(self, s3_prefix):
     bucket_name, prefix = split_s3_path(s3_prefix)
     bucket = self.s3_conn.get_bucket(bucket_name)
     for key in bucket.list(prefix='{}/dataset_summaries/'.format(prefix)):
         yield key
Example #18
 def _load(self, s3_prefix):
     bucket_name, prefix = split_s3_path(s3_prefix)
     bucket = self.s3_conn.get_bucket(bucket_name)
     key = self._key(bucket, prefix)
     self.stats = json.loads(key.get_contents_as_string().decode('utf-8'))