Exemplo n.º 1
0
def generate_job_postings_from_s3(
    s3_conn,
    s3_prefix: Text,
) -> JobPostingGeneratorType:
    """Stream every job posting stored under an s3 prefix.

    Args:
        s3_conn: a boto s3 connection
        s3_prefix: path to the job listings

    Yields:
        The next job posting, parsed from its JSON line.
            Refer to sample_job_listing.json for example structure.
    """
    bucket_name, prefix = split_s3_path(s3_prefix)
    keys = s3_conn.get_bucket(bucket_name).list(prefix=prefix)
    # Download retries back off exponentially on I/O errors (0.1s .. 100s).
    retrier = Retrying(
        retry_on_exception=retry_if_io_error,
        wait_exponential_multiplier=100,
        wait_exponential_max=100000,
    )
    for key in keys:
        logging.info('Extracting job postings from key {}'.format(key.name))
        with BytesIO() as download_buffer:
            retrier.call(key.get_contents_to_file,
                         download_buffer,
                         cb=log_download_progress)
            download_buffer.seek(0)
            # Each line of the object is one JSON-encoded posting.
            for raw_line in download_buffer:
                yield json.loads(raw_line.decode('utf-8'))
def job_postings(s3_conn, quarter, s3_path, source="all"):
    """
    Stream all job listings from s3 for a given quarter
    Args:
        s3_conn: a boto s3 connection
        quarter: a string representing a quarter (2015Q1)
        s3_path: path to the job listings.
        source: either the string "all", a single source code
            ("nlx", "va", "cb"), or a list of such source codes

    Yields:
        string in json format representing the next job listing
            Refer to sample_job_listing.json for example structure

    Raises:
        TypeError: if source is neither a string nor a list
    """
    # Download retries back off exponentially on I/O errors (0.1s .. 100s).
    retrier = Retrying(
        retry_on_exception=retry_if_io_error,
        wait_exponential_multiplier=100,
        wait_exponential_max=100000
    )
    bucket_name, prefix = split_s3_path(s3_path)
    bucket = s3_conn.get_bucket(bucket_name)
    if isinstance(source, str):
        if source.lower() == "all":
            keys = bucket.list(prefix='{}/{}'.format(prefix, quarter))
        else:
            # Keys are laid out as <prefix>/<quarter>/<SOURCE>_...
            keys = bucket.list(prefix='{}/{}/{}_'.format(prefix, quarter, source.upper()))
    elif isinstance(source, list):
        # One key listing per requested source, chained lazily into one iterable.
        keys = chain.from_iterable(
            bucket.list(prefix='{}/{}/{}_'.format(prefix, quarter, s.upper()))
            for s in source
        )
    else:
        # Previously an unsupported source fell through to the loop below
        # and raised an opaque NameError on `keys`; fail fast instead.
        raise TypeError(
            'source must be a string or a list, got {}'.format(type(source))
        )

    for key in keys:
        logging.info('Extracting job postings from key {}'.format(key.name))
        with BytesIO() as outfile:
            retrier.call(key.get_contents_to_file, outfile, cb=log_download_progress)
            outfile.seek(0)
            for line in outfile:
                yield line.decode('utf-8')
Exemplo n.º 3
0
    def api_call(self, path, headers=None, retries=10, **kwargs):
        """Call an API endpoint and return the JSON-decoded response body.

        Args:
            path: path component formatted into self.API_URL
            headers: unused; kept for interface compatibility
            retries: maximum number of request attempts
            **kwargs: forwarded to urllib2.Request; a `timeout` key is
                popped and passed to the opener call instead

        Returns:
            The response body parsed as JSON.
        """
        request_timeout = kwargs.pop("timeout", 10.0)

        url_opener = urllib2.build_opener()
        url_opener.addheaders = [
            ("User-Agent", self.user_agent),
            ("region-id", str(self.region)),
        ]

        request = urllib2.Request(self.API_URL.format(path), **kwargs)

        # Retry transient HTTP errors with exponential backoff (0.5s .. 5s);
        # wrap_exception re-raises the final failure as a RetryError.
        backoff = Retrying(
            stop_max_attempt_number=retries,
            wait_exponential_multiplier=500,
            wait_exponential_max=5000,
            retry_on_exception=retry_if_http_error,
            wrap_exception=True,
        )
        response = backoff.call(url_opener.open, request, timeout=request_timeout)
        return json.loads(response.read())
Exemplo n.º 4
0
    def _get(self, url, package):
        """GET `url % package`, retrying on exceptions accepted by _retry_msg.

        Backs off exponentially between attempts (2s .. 120s).
        """
        backoff = Retrying(
            wait_exponential_multiplier=2000,
            wait_exponential_max=120000,
            retry_on_exception=_retry_msg,
        )
        return backoff.call(requests.get, url % package)
Exemplo n.º 5
0
    def _get(self, url, package):
        """Fetch the package URL with exponential-backoff retries.

        `_retry_msg` decides which exceptions are worth retrying;
        waits grow from 2s up to a 120s ceiling between attempts.
        """
        retrier = Retrying(wait_exponential_multiplier=2000,
                           wait_exponential_max=120000,
                           retry_on_exception=_retry_msg)
        result = retrier.call(requests.get, url % package)
        return result