Exemplo n.º 1
0
def print_query(feature_id, cohort_id_array, program_array, project_id_array,
                cohort_table_id):
    """
    Builds the query for a feature identifier and writes it to the log.

    The function validates its inputs first: when no programs or no cohort
    IDs are supplied, a message is logged and ``sys_exit(0)`` is called. An
    empty project ID array is replaced by None before the query is built.

    Args:
        feature_id: Feature identifier handed to FeatureProviderFactory
        cohort_id_array: Array of cohort identifiers
        program_array: Programs, converted through get_bq_program_set
        project_id_array: Array of project identifiers (may be empty)
        cohort_table_id: Cohort table identifier passed to build_query

    Returns:
        None. The query string is only logged.
    """
    programs = get_bq_program_set(program_array)
    logger.info("Selected programs: {}".format(programs))

    # At least one program is mandatory.
    if not len(programs):
        logger.info("No programs set. Please include at least one program.")
        sys_exit(0)

    # At least one cohort ID is mandatory.
    if not len(cohort_id_array):
        logger.info(
            "No cohort IDs set. Please include at least one cohort ID.")
        sys_exit(0)

    # Project IDs are optional; an empty array is passed on as None.
    logger.info("Selected projects: {}".format(project_id_array))
    projects = project_id_array
    if not len(projects):
        logger.info("No project IDs set. Using NULL.")
        projects = None

    sql_text = FeatureProviderFactory.from_feature_id(feature_id).build_query(
        programs, cohort_table_id, cohort_id_array, projects)

    logger.info("QUERY:\n\n{}".format(sql_text))
Exemplo n.º 2
0
def get_feature_vector(feature_id, cohort_id_array, cohort_settings):
    """
    Retrieves the data points of a feature for one or more stored cohorts.

    The provider matching the feature identifier is asked for its data, and
    every returned data point is reduced to a dict holding the 'case_id',
    'sample_id' and 'aliquot_id' entries of the data point plus a 'value'
    entry. The value comes from the provider's process_data_point(); a None
    value is replaced by the string 'NA'.

    Args:
        feature_id: Feature identifier
        cohort_id_array: Array of cohort identifiers
        cohort_settings: Object whose dataset_id and table_id attributes are
            passed to the provider's get_data()

    Returns:
        A tuple (value_type, items): the provider's value type and the list
        of data point dicts.
    """
    provider = FeatureProviderFactory.from_feature_id(feature_id)
    rows = provider.get_data(cohort_id_array, cohort_settings.dataset_id,
                             cohort_settings.table_id)

    barcode_keys = ('case_id', 'sample_id', 'aliquot_id')
    items = []
    for row in rows:
        entry = {}
        for barcode_key in barcode_keys:
            entry[barcode_key] = row[barcode_key]
        processed = provider.process_data_point(row)
        # TODO refactor missing value logic
        entry['value'] = 'NA' if processed is None else processed
        items.append(entry)

    value_type = provider.get_value_type()
    return value_type, items
Exemplo n.º 3
0
def is_valid_feature_identifier(feature_id):
    """
    Answers if given internal feature identifier is valid.

    Any Exception raised while looking up the provider class or validating
    the identifier is logged and reported as "not valid". Exceptions that do
    not derive from Exception (e.g. KeyboardInterrupt, SystemExit) propagate
    to the caller.

    Args:
        feature_id: Internal feature identifier

    Returns:
        True if feature id is valid, otherwise False.
    """
    is_valid = False
    try:
        provider_class = FeatureProviderFactory.get_provider_class_from_feature_id(
            feature_id)
        is_valid = provider_class.is_valid_feature_id(feature_id)
    except FeatureNotFoundException as e:
        # FeatureProviderFactory.get_provider_class_from_feature_id raises FeatureNotFoundException
        # if the feature identifier does not look valid. Nothing else needs to be done here,
        # since is_valid is already False.
        logger.exception(e)
    except Exception as e:
        logger.error("Unrecognized feature ID: '{}'".format(feature_id))
        logger.exception(e)

    # The return deliberately lives outside a ``finally`` clause: returning
    # from ``finally`` would silently discard any in-flight exception,
    # including KeyboardInterrupt and SystemExit.
    return is_valid
Exemplo n.º 4
0
def is_valid_feature_identifier(feature_id):
    """
    Answers if given internal feature identifier is valid.

    Any Exception raised while looking up the provider class or validating
    the identifier is reported (logged, or its traceback written to stdout)
    and treated as "not valid". Exceptions that do not derive from Exception
    (e.g. KeyboardInterrupt, SystemExit) propagate to the caller.

    Args:
        feature_id: Internal feature identifier

    Returns:
        True if feature id is valid, otherwise False.
    """
    is_valid = False
    try:
        provider_class = FeatureProviderFactory.get_provider_class_from_feature_id(
            feature_id)
        is_valid = provider_class.is_valid_feature_id(feature_id)
    except FeatureNotFoundException as e:
        # FeatureProviderFactory.get_provider_class_from_feature_id raises FeatureNotFoundException
        # if the feature identifier does not look valid. Nothing else needs to be done here,
        # since is_valid is already False.
        logging.exception(e)
    except Exception:
        # Written with sys.stdout.write so the line works on Python 2 and 3;
        # the Python 2 form ``print >> sys.stdout, ...`` raises TypeError on
        # Python 3. The trailing newline matches what ``print`` appended.
        sys.stdout.write(traceback.format_exc() + "\n")

    # The return deliberately lives outside a ``finally`` clause: returning
    # from ``finally`` would silently discard any in-flight exception,
    # including KeyboardInterrupt and SystemExit.
    return is_valid
Exemplo n.º 5
0
def submit_tcga_job(param_obj, project_id_number, bigquery_client,
                    cohort_settings):
    """
    Requests a data job for the feature described by param_obj.

    A query provider is created from param_obj and wrapped in a
    FeatureDataProvider, whose get_data_job_reference() is then called with
    the program set, cohort table, cohort IDs and project IDs.

    Args:
        param_obj: Object providing feature_id, cohort_id_array,
            project_id_array and program_set attributes
        project_id_number: Passed to FeatureDataProvider as project_id_number
        bigquery_client: Passed to FeatureDataProvider as bigquery_service
        cohort_settings: Object whose table_id attribute names the cohort table

    Returns:
        A dict with the keys 'run_query', 'feature_id', 'provider',
        'query_support', 'ready', 'job_reference' and 'tables_used'.
    """
    query_support = FeatureProviderFactory.from_parameters(param_obj)
    data_provider = FeatureDataProvider(query_support,
                                        bigquery_service=bigquery_client,
                                        project_id_number=project_id_number)

    feature_id = param_obj.feature_id
    cohorts = param_obj.cohort_id_array
    projects = param_obj.project_id_array
    logger.info("[STATUS] In submit_tcga_job, project_id_array: {}".format(
        str(projects)))
    programs = param_obj.program_set

    description = data_provider.get_data_job_reference(
        programs, cohort_settings.table_id, cohorts, projects)

    # Only log a submission when a query job was actually started.
    was_submitted = description['run_query']
    if was_submitted:
        logger.info("Submitted TCGA {job_id}: {fid} - {cohorts}".format(
            job_id=description['job_reference']['jobId'],
            fid=feature_id,
            cohorts=str(cohorts)))

    # NOTE(review): the submitted and not-submitted cases yield the same job
    # item, including 'ready': False -- confirm that this is intended.
    return {
        'run_query': was_submitted,
        'feature_id': feature_id,
        'provider': data_provider,
        'query_support': query_support,
        'ready': False,
        'job_reference': description['job_reference'],
        'tables_used': description['tables_used']
    }
def _feature_converter(feature_id):
    """
    Maps a feature ID onto its converted form when the provider supports it.

    User data feature requests can be mapped onto parent program feature IDs
    (the purpose of the shared_map_id column in the
    projects_user_feature_definitions table). Currently only used for
    case_barcodes, (which cannot be plotted?) but this capability was in the
    V1 code, so it is carried over.

    Returns None for a None input, the converted ID when the provider class
    can convert and yields a truthy result (prefixed with 'v2:' when the
    input started with 'v2' and the result does not), and the unchanged
    feature_id otherwise.
    """
    if feature_id is None:
        return None

    provider_class = FeatureProviderFactory.get_provider_class_from_feature_id(
        feature_id)
    if not provider_class.can_convert_feature_id():
        return feature_id

    mapped_id = provider_class.convert_feature_id(feature_id)
    if not mapped_id:
        return feature_id

    # Keep the 'v2' marker of the requested ID on the converted ID.
    lost_v2_marker = (feature_id.startswith('v2')
                      and not mapped_id.startswith('v2'))
    if lost_v2_marker:
        mapped_id = 'v2:{0}'.format(mapped_id)
    return mapped_id
def get_feature_vector(feature_id, cohort_id_array):
    """
    Collects the data points of a feature for one or more stored cohorts.

    The cohorts_samples table is queried per cohort to find out which
    projects the cohort's samples belong to. Depending on the outcome, data
    is fetched through the provider for the feature identifier and/or
    through a UserFeatureProvider. Each data point is reduced to a dict with
    'case_id', 'sample_id', 'aliquot_id' and 'value' entries; a None value is
    replaced by the string 'NA'.

    Args:
        feature_id: Feature identifier
        cohort_id_array: Array of cohort identifiers

    Returns:
        A tuple (type, items): the value type reported by a provider (None
        if no provider was queried) and the list of data point dicts.
    """
    include_tcga = False
    user_studies = ()
    # Inspect the projects referenced by each cohort: a row whose project_id
    # is None switches on include_tcga, any other project_id is collected in
    # user_studies.
    for cohort_id in cohort_id_array:
        try:
            db = get_sql_connection()
            cursor = db.cursor(DictCursor)

            cursor.execute(
                "SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id",
                (cohort_id, ))
            for row in cursor.fetchall():
                if row['project_id'] is None:
                    include_tcga = True
                else:
                    user_studies += (row['project_id'], )

        # NOTE(review): db and cursor are only assigned inside the try, so
        # they may be unbound here if get_sql_connection() or db.cursor()
        # raises on the first iteration. The visible code also does not close
        # the connection or cursor on the success path -- confirm.
        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e

    #  ex: feature_id 'CLIN:Disease_Code'
    user_feature_id = None
    # NOTE(review): the next line appears garbled/truncated in this copy of
    # the source; the original condition and the statements between it and
    # `include_tcga = False` cannot be recovered from here.
    if feature_id.startswith('USER:'******'t include TCGA
            include_tcga = False

    items = []
    # NOTE(review): `type` shadows the builtin of the same name.
    type = None
    result = []
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
    if include_tcga:
        provider = FeatureProviderFactory.from_feature_id(feature_id)
        result = provider.get_data(cohort_id_array, cohort_settings.dataset_id,
                                   cohort_settings.table_id)

        # ex: result[0]
        # {'aliquot_id': None, 'case_id': u'TCGA-BH-A0B1', 'sample_id': u'TCGA-BH-A0B1-10A', 'value': u'BRCA'}
        for data_point in result:
            data_item = {
                key: data_point[key]
                for key in ['case_id', 'sample_id', 'aliquot_id']
            }
            value = provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        type = provider.get_value_type()

    if len(user_studies) > 0:
        # Query User Data
        user_provider = UserFeatureProvider(feature_id,
                                            user_feature_id=user_feature_id)
        user_result = user_provider.get_data(cohort_id_array,
                                             cohort_settings.dataset_id,
                                             cohort_settings.table_id)
        # NOTE(review): result is not read again after this point in the
        # visible code.
        result.extend(user_result)

        for data_point in user_result:
            data_item = {
                key: data_point[key]
                for key in ['case_id', 'sample_id', 'aliquot_id']
            }
            # NOTE(review): this calls `provider`, which is only assigned
            # when include_tcga is true; `user_provider` may have been
            # intended -- confirm.
            value = provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        # Fall back to the user provider's value type when none was set above.
        if not type:
            type = user_provider.get_value_type()

    return type, items