def aggregate_properties_for_quarter(quarter, grouping_properties,
                                     aggregate_properties, aggregate_functions,
                                     aggregations_path, aggregation_name):
    """Aggregate computed properties for a quarter and writes the resulting CSV to S3

    Args:
        quarter (string)
        grouping_properties (list of JobPostingComputedProperty)
            Properties to form the primary key of the aggregation
        aggregate_properties (list of JobPostingComputedProperty)
            Properties to be aggregated over the primary key
        aggregate_functions (dict) A lookup of aggregate functions
            to be applied for each aggregate column
        aggregations_path (string) The base s3 path to store aggregations
        aggregation_name (string) The name of this particular aggregation

    Returns: nothing
    """
    start_date, end_date = quarter_to_daterange(quarter)
    included_dates = list(dates_in_range(start_date, end_date))
    aggregation_df = aggregation_for_properties_and_dates(
        grouping_properties, aggregate_properties, aggregate_functions,
        included_dates)

    out_path = '/'.join(
        [aggregations_path, aggregation_name, quarter + '.csv'])
    fs = s3fs.S3FileSystem()
    # to_csv(None) returns the CSV as a string; encode it for the binary handle
    with fs.open(out_path, 'wb') as f:
        f.write(aggregation_df.to_csv(None).encode())
    return out_path
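# A minimal usage sketch of aggregate_properties_for_quarter. The property
# objects, column name, and S3 path below are hypothetical stand-ins; the
# real pipeline supplies its own JobPostingComputedProperty implementations.
title_prop = ...   # assumed JobPostingComputedProperty forming the group key
salary_prop = ...  # assumed JobPostingComputedProperty to aggregate
csv_path = aggregate_properties_for_quarter(
    quarter='2015Q1',
    grouping_properties=[title_prop],
    aggregate_properties=[salary_prop],
    aggregate_functions={'salary': ['mean', 'median']},  # assumed column name
    aggregations_path='s3://example-bucket/aggregations',  # hypothetical path
    aggregation_name='title_salary',
)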
Example #2
    def _iter_postings(self, quarter):
        """Iterate through raw Virginia postings for a given quarter

        Args:
            quarter (string): A quarter (in format 2015Q1)

        Yields:
            Untransformed job postings (dict)
        """
        logging.info("Finding Virginia postings for %s", quarter)
        quarter_start, quarter_end = quarter_to_daterange(quarter)
        bucket = self.s3_conn.get_bucket(self.bucket_name)
        keylist = list(bucket.list(prefix=self.prefix, delimiter=''))
        for key in keylist:
            # Skip cache files; only raw posting files are parsed
            if key.name.endswith('.cache.json'):
                continue

            logging.info("Processing key %s", key.name)
            with tempfile.NamedTemporaryFile() as local_file:
                key.get_contents_to_file(local_file)
                local_file.seek(0)
                for posting in stream_json_file(local_file):
                    # Skip postings with no posted date
                    if len(posting['datePosted']) == 0:
                        continue
                    listing_start = datetime.strptime(posting['datePosted'],
                                                      self.DATE_FORMAT)
                    if len(posting['dateExpires']) == 0:
                        # No expiration date; treat as a single-day listing
                        listing_end = listing_start
                    else:
                        listing_end = datetime.strptime(
                            posting['dateExpires'], self.DATE_FORMAT)
                    if overlaps(listing_start.date(), listing_end.date(),
                                quarter_start, quarter_end):
                        yield posting
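
# The iterators in these examples lean on two helpers, quarter_to_daterange
# and overlaps, that are not shown. A minimal sketch of plausible
# implementations, assuming quarter strings like '2015Q1' and inclusive date
# bounds (the real helpers may differ):
from datetime import date, timedelta

def quarter_to_daterange(quarter):
    """Assumed behavior: '2015Q1' -> (date(2015, 1, 1), date(2015, 3, 31))"""
    year, q = int(quarter[:4]), int(quarter[-1])
    start = date(year, 3 * q - 2, 1)
    # Last day of the quarter: first day of the next quarter, minus one day
    end_month = 3 * q
    first_of_next = date(year + end_month // 12, end_month % 12 + 1, 1)
    return start, first_of_next - timedelta(days=1)

def overlaps(start1, end1, start2, end2):
    # Two inclusive ranges overlap iff each starts on or before the other's end
    return start1 <= end2 and start2 <= end1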
Example #3
    def _iter_postings(self, quarter):
        """Iterate through raw postings for a given quarter

        Args:
            quarter (string): A quarter (in format 2015Q1)

        Yields:
            Untransformed job postings (dict)
        """
        logging.info("Finding USAJobs postings for %s", quarter)
        quarter_start, quarter_end = quarter_to_daterange(quarter)
        bucket = self.s3_conn.get_bucket(self.bucket_name)
        full_prefix = self.prefix + '/' + quarter
        keylist = list(bucket.list(prefix=full_prefix, delimiter=''))
        for key in keylist:
            logging.info("Processing key %s", key.name)
            contents = key.get_contents_as_string()
            posting = json.loads(contents.decode('utf-8'))
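            # The posting id is the second-to-last dot-separated piece of the
            # key name; a (hypothetical) key 'usajobs/2015Q1/batch.12345.json'
            # would yield '12345'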
            posting['id'] = key.name.split('.')[-2]
            # Skip postings with no start date
            if len(posting['PositionStartDate']) == 0:
                continue
            listing_start = datetime.strptime(posting['PositionStartDate'],
                                              self.DATE_FORMAT)
            if len(posting['PositionEndDate']) == 0:
                # No end date; treat as a single-day listing
                listing_end = listing_start
            else:
                listing_end = datetime.strptime(posting['PositionEndDate'],
                                                self.DATE_FORMAT)
            if overlaps(listing_start.date(), listing_end.date(),
                        quarter_start, quarter_end):
                yield posting
            else:
                logging.warning('Posting %s does not overlap with quarter %s',
                                posting['id'], quarter)
Example #4
    def _iter_postings(self, quarter):
        """Iterate through raw CareerBuilder postings for a given quarter

        Args:
            quarter (string): A quarter (in format 2015Q1)

        Yields:
            Untransformed job postings (dict)
        """
        logging.info("Finding CareerBuilder postings for %s", quarter)
        quarter_start, quarter_end = quarter_to_daterange(quarter)
        bucket = self.s3_conn.get_bucket(self.bucket_name)
        keylist = list(bucket.list(prefix=self.prefix, delimiter=''))
        for key in keylist:
            # Track how many postings in each file fall within the quarter
            in_file = 0
            overlapping = 0
            logging.info('Processing key %s', key.name)
            with tempfile.NamedTemporaryFile() as local_file:
                key.get_contents_to_file(local_file, cb=log_download_progress)
                logging.info('Downloaded key %s for processing', key.name)
                local_file.seek(0)
                for posting in stream_json_file(local_file):
                    in_file += 1
                    listing_start = datetime.strptime(posting['created'],
                                                      self.DATE_FORMAT)
                    listing_end = datetime.strptime(posting['modified'],
                                                    self.DATE_FORMAT)
                    if overlaps(listing_start.date(), listing_end.date(),
                                quarter_start, quarter_end):
                        overlapping += 1
                        yield posting
                logging.info('%s overlapping out of %s total in file',
                             overlapping, in_file)
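
# The Virginia and CareerBuilder iterators above both depend on
# stream_json_file, and the CareerBuilder one on log_download_progress;
# neither is shown. Minimal sketches, assuming newline-delimited JSON files
# and boto's two-argument progress callback (the real implementations may
# differ):
import json
import logging

def stream_json_file(local_file):
    """Yield one dict per line, assuming newline-delimited JSON"""
    for line in local_file:
        line = line.strip()
        if line:
            yield json.loads(line)

def log_download_progress(bytes_transmitted, total_bytes):
    # boto invokes cb with (bytes transmitted so far, total bytes expected)
    logging.info('%s/%s bytes downloaded', bytes_transmitted, total_bytes)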