def request_from_api(api_params, curr_index):
    """
    Make a POST API request and return response (if valid).

    On failure, the collected error messages are passed to has_fatal_error
    (defined elsewhere in the project; presumably it reports them and aborts)
    together with the exception describing the failure, when there is one.

    :param api_params: api params set in yaml config; this function reads
        ENDPOINT, BATCH_SIZE, EXPAND_FIELD_GROUPS and IO_MODE
    :param curr_index: current API poll start position
    :return: response object, when the status code equals requests.codes.ok
    """
    err_list = []
    # Exception handed to has_fatal_error; stays None for a non-ok status
    # that raise_for_status() does not consider an error.
    fatal_exception = None

    try:
        request_params = {
            'from': curr_index,
            'size': api_params['BATCH_SIZE'],
            'expand': api_params['EXPAND_FIELD_GROUPS']
        }

        # retrieve a "page" (batch) of case objects
        res = requests.post(url=api_params['ENDPOINT'], data=request_params)

        # return response if request was successful
        if res.status_code == requests.codes.ok:
            return res

        err_list.append('API request returned status code {}.'.format(str(res.status_code)))

        if api_params['IO_MODE'] == 'a':
            err_list.append(
                'Scripts is being run in "append" mode. '
                'To resume without data loss or duplication, set START_INDEX = {} in your YAML config file.'
                .format(curr_index))

        # Capture the HTTP error instead of letting it escape here, so the
        # messages gathered above are still passed to has_fatal_error.
        try:
            res.raise_for_status()
        except requests.exceptions.HTTPError as http_err:
            fatal_exception = http_err
    except requests.exceptions.MissingSchema as e:
        # No response object exists on this path; report the schema error itself.
        err_list.append(str(e) + '(Hint: check the ENDPOINT value supplied in yaml config.)')
        fatal_exception = e

    has_fatal_error(err_list, fatal_exception)
示例#2
0
def retrieve_and_output_cases(batch_size, endpoint, expand_fields):
    """
    Page through the API and collect every field name found on a case record.

    Each case containing a 'days_to_index' field is printed, and one progress
    line is printed per API call.
    NOTE(review): the collected field names are kept in a local set that is
    neither returned nor printed in the code visible here -- confirm whether
    the function is expected to expose them.

    :param batch_size: number of records requested per API call
    :param endpoint: API endpoint url
    :param expand_fields: value sent as the 'expand' request parameter
    """
    is_last_page = False
    curr_index = 0
    keys = set()

    while not is_last_page:
        res = request_from_api(curr_index, batch_size, expand_fields, endpoint)
        res_json = res.json()['data']

        # Currently, if response doesn't contain this metadata, it indicates
        # an invalid response or request. Check it before touching any other
        # key so this error is reported rather than an unrelated KeyError.
        if 'pagination' not in res_json:
            has_fatal_error("'pagination' not found in API response, exiting.")

        pagination = res_json['pagination']
        batch_record_count = pagination['count']
        curr_page = pagination['page']
        last_page = pagination['pages']

        for case in res_json['hits']:
            if 'days_to_index' in case:
                print("Found days_to_index!\n{}".format(case))
            # adds every field name (dict key) of this case
            keys.update(case)

        if curr_page == last_page:
            is_last_page = True

        print("API call {}".format(curr_page))
        # the next request starts right after the records just received
        curr_index += batch_record_count
示例#3
0
def request_from_api(start_index, batch_size, expand_fields, endpoint):
    """
    POST one paginated request to the API and return the response on success.

    :param start_index: value sent as the 'from' request parameter
    :param batch_size: value sent as the 'size' request parameter
    :param expand_fields: value sent as the 'expand' request parameter
    :param endpoint: url the request is posted to
    :return: response object when the status code equals requests.codes.ok;
        otherwise the status code is reported through has_fatal_error
    """
    # retrieve a "page" (batch) of case objects
    response = requests.post(
        url=endpoint,
        data={'from': start_index, 'size': batch_size, 'expand': expand_fields})

    if response.status_code != requests.codes.ok:
        has_fatal_error("API request returned result code {}, exiting.".format(
            response.status_code))
        return None

    # request was successful
    return response
def retrieve_and_output_cases(api_params, bq_params, data_fp):
    """
    Retrieves case records from API and outputs them to a JSONL file,
    which is later used to populate the clinical data BQ table.

    :param api_params: API and file output params, from YAML config; this
        function reads IO_MODE, START_INDEX, EXCLUDE_FIELDS and MAX_PAGES
    :param bq_params: BQ params, from YAML config (not used by this function)
    :param data_fp: absolute path to data output file
    """
    # The exclusion list is the same for every case, so split it only once.
    excluded_fields = api_params['EXCLUDE_FIELDS'].split(',')
    is_last_page = False

    with open(data_fp, api_params['IO_MODE']) as json_output_file:
        curr_index = api_params['START_INDEX']
        while not is_last_page:
            res = request_from_api(api_params, curr_index)
            res_json = res.json()['data']

            # Currently, if response doesn't contain this metadata, it indicates an invalid response or request.
            # Check it before touching any other key so this error is reported
            # rather than an unrelated KeyError.
            if 'pagination' not in res_json:
                has_fatal_error("'pagination' key not found in response json, exiting.", KeyError)

            pagination = res_json['pagination']
            batch_record_count = pagination['count']
            curr_page = pagination['page']
            last_page = pagination['pages']

            for case in res_json['hits']:
                # drop excluded fields; names absent from this case are ignored
                for field in excluded_fields:
                    case.pop(field, None)

                no_list_value_case = arrays_to_str_list(case)
                # writing in jsonlines format, as required by BQ
                json.dump(obj=no_list_value_case, fp=json_output_file)
                json_output_file.write('\n')

            # stop on the final page, or once MAX_PAGES is reached (a falsy MAX_PAGES disables the limit)
            if curr_page == last_page or (api_params['MAX_PAGES'] and curr_page == api_params['MAX_PAGES']):
                is_last_page = True

            print("Inserted page {} of {} ({} records) into jsonlines file"
                  .format(curr_page, last_page, batch_record_count))
            # the next request starts right after the records just written
            curr_index += batch_record_count
    def is_valid_idx_param(yaml_param):
        """
        Verifies that an index-type param is a non-negative integer value.

        api_params is not a parameter of this function; it is resolved from
        the enclosing scope. Problems are reported through has_fatal_error.

        :param yaml_param: name of the api_params entry to verify
        :return: True when the value is valid; False if has_fatal_error
            returns control after an invalid value was reported
        """
        supplied = api_params[yaml_param]
        e_list = []

        try:
            is_negative = int(supplied) < 0
        except (TypeError, ValueError) as e:
            # int() raises TypeError for unsupported types (e.g. None, list)
            # and ValueError for strings that are not integer literals.
            e_list.append('{} in yaml config should be of type int, not type {}).'
                          .format(yaml_param, type(supplied)))
            e_list.append(str(e))
            has_fatal_error(e_list, type(e))
            return False

        # Reported outside the try block so that an exception raised while
        # reporting is never mistaken for a conversion failure.
        if is_negative:
            e_list.append('Invalid value for {} in yaml config (supplied: {}).'
                          .format(yaml_param, supplied))
            e_list.append('Value should be a non-negative integer.')
            has_fatal_error(e_list, ValueError)
            return False

        return True
def main(args):
    """
    Script entry point: loads and validates the YAML config, then runs the
    steps it lists.

    Recognised step names are 'retrieve_and_output_cases',
    'create_bq_schema_obj' and 'build_bq_table'. Building the table requires
    the schema object created by 'create_bq_schema_obj' in the same run.

    :param args: argument list; must hold exactly two items, the second being
        the path to the YAML configuration file
    """
    if len(args) != 2:
        has_fatal_error('Usage : {} <configuration_yaml>'.format(args[0]), ValueError)

    # Load the YAML config file
    with open(args[1], mode='r') as yaml_file:
        try:
            api_params, bq_params, steps = load_config(yaml_file, YAML_HEADERS)
        except ValueError as e:
            has_fatal_error(str(e), ValueError)

    # Validate YAML config params
    validate_params(api_params, bq_params)

    data_fp = construct_filepath(api_params)
    schema = None

    if 'retrieve_and_output_cases' in steps:
        # Hits the GDC api endpoint, outputs data to jsonl file (newline-delineated json, required by BQ)
        print('Starting GDC API calls!')
        retrieve_and_output_cases(api_params, bq_params, data_fp)

    if 'create_bq_schema_obj' in steps:
        # Creates a BQ schema python object consisting of nested SchemaField objects
        print('Creating BQ schema object!')
        schema = create_bq_schema(api_params, data_fp)

    if 'build_bq_table' in steps:
        # Creates and populates BQ table
        if not schema:
            # the schema is only produced by the 'create_bq_schema_obj' step above
            has_fatal_error('Empty SchemaField object', UnboundLocalError)
        print('Building BQ Table!')

        # don't want the entire fp for 2nd param, just the file name
        create_and_load_table(bq_params, api_params['DATA_OUTPUT_FILE'], schema)
示例#7
0
def validate_params(api_params, bq_params):
    """
    Validates yaml parameters before beginning to execute the script. This checks for reasonable (though not necessarily
    correct) api request param types and values. It confirms all params are included in specified yaml file.

    The required param names are taken from the template config file
    ClinicalBQBuild.yaml; when that file cannot be found, a notice is printed
    and the completeness check is skipped. BATCH_SIZE, START_INDEX and
    MAX_PAGES are then each checked to be non-negative integers, and
    BATCH_SIZE must additionally be non-zero. Problems are reported through
    has_fatal_error.

    :param api_params: dict of api and file related params from user-provided yaml config
    :param bq_params: dict of bq related params from user-provided yaml config
    """

    def is_valid_idx_param(yaml_param):
        """
        Verifies that an index-type param is a non-negative integer value.
        :param yaml_param: name of the api_params entry to verify
        :return: True when the value is valid; False if has_fatal_error
            returns control after an invalid value was reported
        """
        supplied = api_params[yaml_param]
        e_list = []

        try:
            is_negative = int(supplied) < 0
        except (TypeError, ValueError) as e:
            # int() raises TypeError for unsupported types (e.g. None, list)
            # and ValueError for strings that are not integer literals.
            e_list.append('{} in yaml config should be of type int, not type {}).'
                          .format(yaml_param, type(supplied)))
            e_list.append(str(e))
            has_fatal_error(e_list, type(e))
            return False

        # Reported outside the try block so that an exception raised while
        # reporting is never mistaken for a conversion failure.
        if is_negative:
            e_list.append('Invalid value for {} in yaml config (supplied: {}).'
                          .format(yaml_param, supplied))
            e_list.append('Value should be a non-negative integer.')
            has_fatal_error(e_list, ValueError)
            return False

        return True

    try:
        if api_params['IS_LOCAL_MODE']:
            yaml_template_path = '../ConfigFiles/ClinicalBQBuild.yaml'
        else:
            home = expanduser('~')
            yaml_template_path = home + '/NextGenETL/ConfigFiles/ClinicalBQBuild.yaml'

        with open(yaml_template_path, mode='r') as yaml_file:
            default_api_params, default_bq_params, _ = load_config(yaml_file, YAML_HEADERS)

            # verify all required params exist in yaml config
            for supplied_params, default_params in ((api_params, default_api_params),
                                                    (bq_params, default_bq_params)):
                for param in default_params.keys():
                    if param not in supplied_params:
                        raise KeyError(param)
    except FileNotFoundError as e:
        print('Default yaml config file not found, unable to compare with supplied yaml config.\n' + str(e))
    except ValueError as e:
        has_fatal_error(str(e), e)
    except KeyError as e:
        has_fatal_error('Missing param from yaml config file.', e)

    # verify that api index-related params are set to non-negative integers;
    # each one is checked on its own so that none is skipped
    for idx_param in ('BATCH_SIZE', 'START_INDEX', 'MAX_PAGES'):
        is_valid_idx_param(idx_param)

    # BATCH_SIZE must also be positive
    if int(api_params['BATCH_SIZE']) == 0:
        has_fatal_error('BATCH_SIZE set to 0 in yaml_config, should be > 0.', ValueError)