Example #1
    def get_study_ids(self, cohort_id_array):
        """
        Returns: The user study identifiers associated with the samples in all given cohorts.
        """
        if self._study_ids is not None:
            return self._study_ids

        study_ids = ()
        for cohort_id in cohort_id_array:
            try:
                db = sql_connection()
                cursor = db.cursor(MySQLdb.cursors.DictCursor)

                cursor.execute(
                    "SELECT study_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY study_id", (cohort_id,)
                )
                for row in cursor.fetchall():
                    if row["study_id"] is not None:
                        study_ids += (row["study_id"],)

            except Exception as e:
                if db:
                    db.close()
                if cursor:
                    cursor.close()
                raise e

        self._study_ids = study_ids
        return self._study_ids
    def get_cohort_barcodes(cls, cohort_id_array):
        # Generate the 'IN' statement string: (%s, %s, ..., %s)
        cohort_id_stmt = ', '.join(['%s' for x in xrange(len(cohort_id_array))])
        query = 'SELECT sample_id AS barcode FROM {cohort_table} WHERE cohort_id IN ({cohort_id_stmt})'.format(
            cohort_table=DJANGO_COHORT_TABLE,
            cohort_id_stmt=cohort_id_stmt)
        values = cohort_id_array
        try:
            db = sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(values))

            result = cursor.fetchall()
            barcodes = []
            for row in result:
                barcodes.append(row['barcode'])
            cursor.close()
            db.close()

            # Return only unique barcodes
            return list(set(barcodes))

        except Exception as e:
            raise CohortException('get_cohort_barcodes CloudSQL error, cohort IDs {cohort_ids}: {message}'.format(
                cohort_ids=cohort_id_array,
                message=str(e.message)))
    def get_cohorts_for_datapoints(cls, cohort_id_array):
        # Generate the 'IN' statement string: (%s, %s, ..., %s)
        cohort_id_stmt = ', '.join(['%s' for x in xrange(len(cohort_id_array))])

        query = 'SELECT sample_id, cohort_id FROM {cohort_samples_table} WHERE cohort_id IN ({cohort_id_stmt})'.format(
            cohort_samples_table=DJANGO_COHORT_SAMPLES_TABLE,
            cohort_id_stmt=cohort_id_stmt)

        values = cohort_id_array
        try:
            db = sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(values))

            result = cursor.fetchall()
            cohort_per_samples = {}

            for row in result:
                cohort_id, sample_id = row['cohort_id'], row['sample_id']
                if sample_id not in cohort_per_samples:
                    cohort_per_samples[sample_id] = []
                cohort_per_samples[sample_id].append(cohort_id)

            cursor.close()
            db.close()

            return cohort_per_samples

        except Exception as e:
            raise CohortException('get_cohorts_for_datapoints CloudSQL error, cohort IDs {cohort_ids}: {message}'.format(
                cohort_ids=cohort_id_array,
                message=str(e.message)))
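
Both class methods above build the IN clause by repeating the %s placeholder once per cohort ID and then binding the IDs as query parameters, so no ID value is ever spliced into the SQL text. A minimal sketch of the same pattern follows; the table name my_cohort_table and the helper name are illustrative and not taken from the examples, and db is assumed to be a DB-API connection such as the one returned by sql_connection().

# Sketch of the parameterized IN-clause pattern used above.
# "my_cohort_table" is a placeholder table name for illustration.
def fetch_rows_for_cohorts(db, cohort_id_array):
    placeholders = ', '.join(['%s'] * len(cohort_id_array))
    query = 'SELECT sample_id, cohort_id FROM my_cohort_table WHERE cohort_id IN ({0})'.format(placeholders)
    cursor = db.cursor()
    try:
        cursor.execute(query, tuple(cohort_id_array))
        return cursor.fetchall()
    finally:
        cursor.close()
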
Example #4
def user_feature_handler(feature_id, cohort_id_array):
    include_tcga = False
    user_studies = ()
    for cohort_id in cohort_id_array:
        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)

            cursor.execute("SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id,))
            for row in cursor.fetchall():
                if row['project_id'] is None:
                    include_tcga = True
                else:
                    user_studies += (row['project_id'],)

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e

    user_feature_id = None
    if feature_id.startswith('USER:'):
        # Try to convert a shared user feature ID into a TCGA-derived feature ID
        user_feature_id = feature_id
        converted_feature_id = UserFeatureProvider.convert_user_feature_id(feature_id)
        if converted_feature_id is not None:
            feature_id = converted_feature_id
        else:
            # Querying a user feature, don't include TCGA
            include_tcga = False

    return {
        'converted_feature_id': feature_id,
        'include_tcga': include_tcga,
        'user_studies': user_studies,
        'user_feature_id': user_feature_id
    }
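
A hypothetical call to user_feature_handler, showing how the returned dictionary might be consumed; the feature ID and cohort IDs below are made up for illustration.

# Hypothetical usage of user_feature_handler; values are illustrative only.
result = user_feature_handler('CLIN:Disease_Code', [1, 2, 3])
if result['include_tcga']:
    pass  # query the TCGA-derived feature tables
for project_id in result['user_studies']:
    pass  # query the user's uploaded project data
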
Example #5
    def search(self, keyword, field):
        self.validate_search_field(keyword, field)

        query = (
            "SELECT mirna_name, platform, value_field, internal_feature_id "
            "FROM {table_name} WHERE {search_field} LIKE %s LIMIT %s".format(
                table_name=self.get_table_name(), search_field=field
            )
        )

        # Format the keyword for MySQL string matching
        sql_keyword = "%" + keyword + "%"
        query_args = [sql_keyword, FOUND_FEATURE_LIMIT]

        try:
            db = sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(query_args))
            items = []

            for row in cursor.fetchall():
                items.append(row)

            # Generate human readable labels
            for item in items:
                item["feature_type"] = MIRN_FEATURE_TYPE
                item["label"] = build_feature_label(item)

            return items

        except MySQLError:
            raise BackendException("database error", keyword, field)
Example #6
    def convert_user_feature_id(cls, feature_id):
        bq_id = None
        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(
                """
                SELECT feature_name, bq_map_id, shared_map_id
                FROM projects_user_feature_definitions
                WHERE id = %s
            """,
                (int(feature_id.split(":")[-1]),),
            )
            for row in cursor.fetchall():
                bq_id = row["shared_map_id"]

            cursor.close()
            db.close()

            logging.debug("UserFeatureProvider.convert_user_feature_id {0} -> {1}".format(feature_id, bq_id))
            return bq_id

        except Exception as e:
            if db:
                db.close()
            if cursor:
                cursor.close()
            raise e
    def cloud_storage_file_paths(self, request):
        """
        Takes a sample barcode as a required parameter and
        returns cloud storage paths to files associated with that sample.
        """
        cursor = None
        db = None

        sample_barcode = request.get_assigned_value('sample_barcode')
        platform = request.get_assigned_value('platform')
        pipeline = request.get_assigned_value('pipeline')

        if are_there_bad_keys(request):
            err_msg = construct_parameter_error_message(request, False)
            raise endpoints.BadRequestException(err_msg)

        query_str, query_tuple = CohortsSamplesFilesQueryBuilder().build_query(
            platform=platform, pipeline=pipeline, sample_barcode=sample_barcode)

        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(query_str, query_tuple)
            cursor_rows = cursor.fetchall()
            # add 'cloud_storage_path' to cursor_rows
            bad_repo_count, bad_repo_set = CohortsSamplesFilesMessageBuilder().get_GCS_file_paths_and_bad_repos(cursor_rows)
            cloud_storage_path_list = [row['cloud_storage_path'] for row in cursor_rows]
            if bad_repo_count > 0:
                logger.warn("not returning {count} row(s) in sample_details due to repositories: {bad_repo_list}"
                            .format(count=bad_repo_count, bad_repo_list=list(bad_repo_set)))
            return GCSFilePathList(cloud_storage_file_paths=cloud_storage_path_list, count=len(cloud_storage_path_list))

        except (IndexError, TypeError) as e:
            logger.warn(e)
            raise endpoints.NotFoundException("File paths for sample {} not found.".format(sample_barcode))
Example #8
    def field_value_search(self, keyword, field):
        self.validate_field_search_input(keyword, field)

        query = 'SELECT DISTINCT {search_field} FROM {table_name} WHERE {search_field} LIKE %s LIMIT %s'.format(
            table_name=self.get_table_name(),
            search_field=field
        )
        # Format the keyword for MySQL string matching
        sql_keyword = '%' + keyword + '%'
        query_args = [sql_keyword, FOUND_FEATURE_LIMIT]
        logging.debug("CLOUDSQL_QUERY_GEXP_FIELDS: {}".format(query))

        try:
            db = sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(query_args))
            items = []

            for row in cursor.fetchall():
                items.append(row[field])

            return items

        except MySQLError as mse:
            raise BackendException("MySQLError: {}".format(str(mse)))
Example #9
def get_binary_log_filenames():
    db = sql_connection()
    try:
        cursor = db.cursor()
        cursor.execute('SHOW BINARY LOGS;')
        filenames = []
        for row in cursor.fetchall():
            filenames.append(row[0])
        return filenames
    except (TypeError, IndexError) as e:
        logger.warn('Error in retrieving binary log filenames: {}'.format(e))
    finally:
        if cursor: cursor.close()
        if db: db.close()
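
The try/finally cleanup above can also be written with contextlib.closing, which closes the cursor and the connection even if the query raises. A sketch, assuming sql_connection() returns a standard DB-API connection; the _v2 name is hypothetical.

# Sketch of the same lookup using contextlib.closing for cleanup.
from contextlib import closing

def get_binary_log_filenames_v2():
    with closing(sql_connection()) as db, closing(db.cursor()) as cursor:
        cursor.execute('SHOW BINARY LOGS;')
        return [row[0] for row in cursor.fetchall()]
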
Example #10
    def from_feature_id(cls, feature_id, project_id=None):
        logging.debug("UserFeatureDef.from_feature_id: {0}".format(str([feature_id, project_id])))
        if feature_id is None:
            raise FeatureNotFoundException(feature_id)
        # ID breakdown: project ID:Feature ID
        # Example ID: USER:1:6
        regex = re_compile("^USER:"******"([0-9]+):"
                           # Feature ID
                           "([0-9]+)$"
                           )

        feature_fields = regex.findall(feature_id)
        if len(feature_fields) == 0:
            raise FeatureNotFoundException(feature_id)
        project_id, user_feature_id = feature_fields[0]


        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute("""
                SELECT bq_map_id, project_id, is_numeric
                FROM projects_user_feature_definitions
                WHERE id = %s
            """, (user_feature_id,))

            results = []
            for row in cursor.fetchall():
                bq_table, column_name, symbol = cls.get_table_and_field(row['bq_map_id'])
                filters = None
                if symbol is not None:
                    filters = {
                        'Symbol': symbol
                    }
                results.append(cls(bq_table, column_name, row['project_id'], row['is_numeric'] == 1, filters))

            cursor.close()
            db.close()

            return results

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e
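
For reference, the regex above accepts only IDs of the form USER:<project id>:<feature id> and yields the two numeric parts; the example ID from the comment behaves as follows.

# Illustrative check of the USER feature ID pattern used above.
from re import compile as re_compile

regex = re_compile("^USER:([0-9]+):([0-9]+)$")
print(regex.findall("USER:1:6"))           # [('1', '6')] -> project ID 1, feature ID 6
print(regex.findall("CLIN:Disease_Code"))  # [] -> not a user feature ID
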
Example #11
    def get(self, request):
        """
        Returns information about a specific patient,
        including a list of samples and aliquots derived from this patient.
        Takes a patient barcode (of length 12, *eg* TCGA-B9-7268) as a required parameter.
        User does not need to be authenticated.
        """

        cursor = None
        db = None

        patient_barcode = request.get_assigned_value('patient_barcode')
        query_tuple = (str(patient_barcode),)
        clinical_query_str, sample_query_str, aliquot_query_str = PatientsGetQueryBuilder().build_queries()

        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)

            # build clinical data message
            cursor.execute(clinical_query_str, query_tuple)
            row = cursor.fetchone()
            if row is None:
                cursor.close()
                db.close()
                logger.warn("Patient barcode {} not found in metadata_clinical table.".format(patient_barcode))
                raise endpoints.NotFoundException("Patient barcode {} not found".format(patient_barcode))
            constructor_dict = build_constructor_dict_for_message(MetadataItem(), row)
            clinical_data_item = MetadataItem(**constructor_dict)

            # get list of samples
            cursor.execute(sample_query_str, query_tuple)
            sample_list = [row['sample_barcode'] for row in cursor.fetchall()]

            # get list of aliquots
            cursor.execute(aliquot_query_str, query_tuple)
            aliquot_list = [row['AliquotBarcode'] for row in cursor.fetchall()]

            return PatientDetails(clinical_data=clinical_data_item, samples=sample_list, aliquots=aliquot_list)

        except (IndexError, TypeError) as e:
            logger.info("Patient {} not found. Error: {}".format(patient_barcode, e))
            raise endpoints.NotFoundException("Patient {} not found.".format(patient_barcode))
    def search(self, parameters):
        self.validate_feature_search_input(parameters)

        query = 'SELECT gene_name, probe_name, platform, relation_to_gene, relation_to_island, ' \
                       'value_field, internal_feature_id ' \
                'FROM {table_name} ' \
                'WHERE gene_name LIKE %s ' \
                'AND probe_name LIKE %s ' \
                'AND platform LIKE %s ' \
                'AND relation_to_gene LIKE %s ' \
                'AND relation_to_island LIKE %s ' \
                'LIMIT %s'.format(table_name=self.get_table_name())

        # Fills in '' for fields that were not specified in the parameters
        input = defaultdict(lambda: '', parameters)

        # Format the keyword for MySQL string matching
        query_args = ['%' + input['gene_name'] + '%',
                      '%' + input['probe_name'] + '%',
                      '%' + input['platform'] + '%',
                      '%' + input['relation_to_gene'] + '%',
                      '%' + input['relation_to_island'] + '%',
                      FOUND_FEATURE_LIMIT]

        try:
            db = sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(query_args))
            items = []

            for row in cursor.fetchall():
                items.append(row)

            # Generate human readable labels
            for item in items:
                item['feature_type'] = METH_FEATURE_TYPE
                item['label'] = self.build_feature_label(item)

            return items

        except MySQLError:
            raise BackendException('database error')
Example #13
    def get_cohort_info(cls, cohort_id_array):
        # Generate the 'IN' statement string: (%s, %s, ..., %s)
        cohort_id_stmt = ', '.join(['%s' for x in xrange(len(cohort_id_array))])

        query_template = ("SELECT ti.id AS cohort_id, ti.name, COUNT(ts.sample_barcode) AS size "
                          "FROM {cohort_info_table} ti "
                          "   LEFT JOIN {cohort_samples_table} ts ON ts.cohort_id = ti.id "
                          "WHERE ti.id IN ({cohort_id_stmt}) "
                          "GROUP BY ti.id, ti.name")

        query = query_template.format(
            cohort_info_table=DJANGO_COHORT_INFO_TABLE,
            cohort_samples_table=DJANGO_COHORT_SAMPLES_TABLE,
            cohort_id_stmt=cohort_id_stmt)

        try:
            db = sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(cohort_id_array))

            result = []

            for row in cursor.fetchall():
                result.append({
                    'id': row['cohort_id'],
                    'name': row['name'],
                    'size': row['size']
                })

            cursor.close()
            db.close()

            return result

        except Exception as e:
            print >> sys.stdout, "[ERROR] In get_cohort_info: "
            print >> sys.stdout, e
            print >> sys.stdout, traceback.format_exc()
            raise CohortException('get_cohort_info CloudSQL error, cohort IDs {cohort_ids}: {message}'.format(
                cohort_ids=cohort_id_array,
                message=str(e.message)))
Example #14
    def search(self, parameters):
        self.validate_feature_search_input(parameters)

        query = 'SELECT gene_name, platform, center, value_label, internal_feature_id' \
                ' FROM {table_name}' \
                ' WHERE gene_name LIKE %s'\
                ' AND platform LIKE %s' \
                ' AND center LIKE %s'\
                ' LIMIT %s'.format(table_name=self.get_table_name())
        logging.debug("CLOUDSQL_QUERY_GEXP_SEARCH: {}".format(query))

        # Fills in '' for fields that were not specified in the parameters
        input = defaultdict(lambda: '', parameters)

        # Format the keyword for MySQL string matching
        # sql_keyword = '%' + keyword + '%'
        query_args = ['%' + input['gene_name'] + '%',
                      '%' + input['platform'] + '%',
                      '%' + input['center'] + '%',
                      FOUND_FEATURE_LIMIT]

        try:
            db = sql_connection()
            cursor = db.cursor(DictCursor)
            cursor.execute(query, tuple(query_args))
            items = []

            for row in cursor.fetchall():
                items.append(row)

            # Generate human readable labels
            for item in items:
                item['feature_type'] = GEXP_FEATURE_TYPE
                item['label'] = build_feature_label(item['gene_name'], item)

            return items

        except MySQLError as mse:
            raise BackendException("MySQLError: {}".format(str(mse)))
Example #15
    def preview(self, request):
        """
        Takes a JSON object of filters in the request body and returns a "preview" of the cohort that would
        result from passing a similar request to the cohort **save** endpoint. This preview consists of
        two lists: the list of participant (aka patient) barcodes and the list of sample barcodes.
        Authentication is not required.
        """
        patient_cursor = None
        sample_cursor = None

        if are_there_bad_keys(request) or are_there_no_acceptable_keys(request):
            err_msg = construct_parameter_error_message(request, True)
            raise endpoints.BadRequestException(err_msg)

        query_dict, gte_query_dict, lte_query_dict = CohortsCreatePreviewQueryBuilder().build_query_dictionaries(
            request
        )

        patient_query_str, sample_query_str, value_tuple = CohortsCreatePreviewQueryBuilder().build_query(
            query_dict, gte_query_dict, lte_query_dict
        )

        patient_barcodes = []
        sample_barcodes = []

        try:
            db = sql_connection()
            patient_cursor = db.cursor(MySQLdb.cursors.DictCursor)
            patient_cursor.execute(patient_query_str, value_tuple)
            for row in patient_cursor.fetchall():
                patient_barcodes.append(row["case_barcode"])

            sample_cursor = db.cursor(MySQLdb.cursors.DictCursor)
            sample_cursor.execute(sample_query_str, value_tuple)
            for row in sample_cursor.fetchall():
                sample_barcodes.append(row["sample_barcode"])

        except (IndexError, TypeError) as e:
            logger.warn(e)
            raise endpoints.NotFoundException("Error retrieving samples or patients: {}".format(e))
Example #16
    def googlegenomics(self, request):
        """
        Takes a sample barcode as a required parameter and returns the Google Genomics dataset id
        and readgroupset id associated with the sample, if any.
        """

        cursor = None
        db = None
        sample_barcode = request.get_assigned_value('sample_barcode')

        query_str = 'SELECT SampleBarcode, GG_dataset_id, GG_readgroupset_id ' \
                    'FROM metadata_data ' \
                    'WHERE SampleBarcode=%s ' \
                    'AND GG_dataset_id !="" AND GG_readgroupset_id !="" ' \
                    'GROUP BY SampleBarcode, GG_dataset_id, GG_readgroupset_id;'

        query_tuple = (sample_barcode,)
        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(query_str, query_tuple)

            google_genomics_items = [
                GoogleGenomics(
                        SampleBarcode=row['SampleBarcode'],
                        GG_dataset_id=row['GG_dataset_id'],
                        GG_readgroupset_id=row['GG_readgroupset_id']
                    )
                for row in cursor.fetchall()
            ]
            return GoogleGenomicsList(items=google_genomics_items, count=len(google_genomics_items))

        except (IndexError, TypeError) as e:
            logger.warn(e)
            raise endpoints.NotFoundException(
                "Google Genomics dataset and readgroupset id's for sample {} not found."
                    .format(sample_barcode))
Example #17
    def from_feature_id(cls, feature_id, study_id=None):
        logging.debug("UserFeatureDef.from_feature_id: {0}".format(str([feature_id, study_id])))
        if feature_id is None:
            raise FeatureNotFoundException(feature_id)

        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(
                """
                SELECT bq_map_id, study_id, is_numeric
                FROM projects_user_feature_definitions
                WHERE shared_map_id = %s
            """,
                (feature_id,),
            )

            results = []
            for row in cursor.fetchall():
                bq_table, column_name, symbol = cls.get_table_and_field(row["bq_map_id"])
                filters = None
                if symbol is not None:
                    filters = {"Symbol": symbol}
                results.append(cls(bq_table, column_name, row["study_id"], row["is_numeric"] == 1, filters))

            cursor.close()
            db.close()

            return results

        except Exception as e:
            if db:
                db.close()
            if cursor:
                cursor.close()
            raise e
Example #18
def fetch_isbcgc_project_set():
    try:
        cursor = None
        db = sql_connection()
        if not ISB_CGC_PROJECTS['list'] or len(ISB_CGC_PROJECTS['list']) <= 0:
            cursor = db.cursor()
            cursor.execute("SELECT COUNT(SPECIFIC_NAME) FROM INFORMATION_SCHEMA.ROUTINES WHERE SPECIFIC_NAME = 'get_isbcgc_project_set';")
            # Only try to fetch the study set if the sproc exists
            if cursor.fetchall()[0][0] > 0:
                cursor.execute("CALL get_isbcgc_project_set();")
                ISB_CGC_PROJECTS['list'] = []
                for row in cursor.fetchall():
                    ISB_CGC_PROJECTS['list'].append(row[0])
            else:
                # Otherwise just warn
                logger.warn("[WARNING] Stored procedure get_isbcgc_project_set was not found!")

        return ISB_CGC_PROJECTS['list']
    except Exception as e:
        logger.error(e)
        logger.error(traceback.format_exc())
    finally:
        if cursor: cursor.close()
        if db and db.open: db.close()

        try:
            Cohort_Perms.objects.get(cohort_id=cohort_id, user_id=user_id)
        except (ObjectDoesNotExist, MultipleObjectsReturned) as e:
            logger.warn(e)
            err_msg = "Error retrieving cohort {} for user {}: {}".format(cohort_id, user_email, e)
            if 'Cohort_Perms' in e.message:
                err_msg = "User {} does not have permissions on cohort {}. " \
                          "Error: {}".format(user_email, cohort_id, e)
            raise endpoints.UnauthorizedException(err_msg)
        finally:
            request_finished.send(self)

        query_str, query_tuple = CohortsSamplesFilesQueryBuilder().build_query(
            platform=platform, pipeline=pipeline, limit=limit, cohort_id=cohort_id)

        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(query_str, query_tuple)
            cursor_rows = cursor.fetchall()
            bad_repo_count, bad_repo_set = CohortsSamplesFilesMessageBuilder().get_GCS_file_paths_and_bad_repos(cursor_rows)
            cloud_storage_path_list = [row['cloud_storage_path'] for row in cursor_rows]
            if bad_repo_count > 0:
                logger.warn("not returning {count} row(s) in sample_details due to repositories: {bad_repo_list}"
                            .format(count=bad_repo_count, bad_repo_list=list(bad_repo_set)))
            return GCSFilePathList(cloud_storage_file_paths=cloud_storage_path_list, count=len(cloud_storage_path_list))

        except (IndexError, TypeError) as e:
            logger.warn(e)
            raise endpoints.NotFoundException("File paths for cohort {} not found.".format(cohort_id))
        except MySQLdb.ProgrammingError as e:
            logger.warn("Error retrieving file paths. {}".format(e))
Example #20
    def seqpeek_view_data(self, request):
        try:
            hugo_symbol = request.hugo_symbol
            cohort_id_array = request.cohort_id

            gnab_feature_id = self.build_gnab_feature_id(hugo_symbol)
            logging.debug("GNAB feature ID for SeqPeke: {0}".format(gnab_feature_id))

            # Lifted from api/data_access.py line 509+
            # Get the study IDs these cohorts' samples come from
            cohort_vals = ()
            cohort_params = ""

            for cohort in cohort_id_array:
                cohort_params += "%s,"
                cohort_vals += (cohort,)

            cohort_params = cohort_params[:-1]

            db = sql_connection()
            cursor = db.cursor()

            tcga_studies = fetch_isbcgc_project_set()

            cursor.execute("SELECT DISTINCT study_id FROM cohorts_samples WHERE cohort_id IN (" + cohort_params + ");",
                           cohort_vals)

            # Only samples whose source studies are TCGA studies, or extended from them, should be used
            confirmed_study_ids = []
            unconfirmed_study_ids = []

            for row in cursor.fetchall():
                if row[0] in tcga_studies:
                    if row[0] not in confirmed_study_ids:
                        confirmed_study_ids.append(row[0])
                elif row[0] not in unconfirmed_study_ids:
                    unconfirmed_study_ids.append(row[0])

            if len(unconfirmed_study_ids) > 0:
                studies = Study.objects.filter(id__in=unconfirmed_study_ids)

                for study in studies:
                    if study.get_my_root_and_depth()['root'] in tcga_studies:
                        confirmed_study_ids.append(study.id)

            async_params = [ProviderClassQueryDescription(SeqPeekDataProvider, gnab_feature_id, cohort_id_array, confirmed_study_ids)]
            maf_data_result = get_feature_vectors_tcga_only(async_params, skip_formatting_for_plot=True)

            maf_data_vector = maf_data_result[gnab_feature_id]['data']

            if len(maf_data_vector) > 0:
                # Since the gene (hugo_symbol) parameter is part of the GNAB feature ID,
                # it will be sanity-checked in the SeqPeekMAFDataAccess instance.
                seqpeek_data = SeqPeekMAFDataFormatter().format_maf_vector_for_view(maf_data_vector, cohort_id_array)

                seqpeek_maf_vector = seqpeek_data.maf_vector
                seqpeek_cohort_info = seqpeek_data.cohort_info
                removed_row_statistics_dict = seqpeek_data.removed_row_statistics

                seqpeek_view_data = SeqPeekViewDataBuilder().build_view_data(hugo_symbol,
                                                                             seqpeek_maf_vector,
                                                                             seqpeek_cohort_info,
                                                                             cohort_id_array,
                                                                             removed_row_statistics_dict)

                response = self.create_response(seqpeek_view_data)
                return response
            else:
                # No data found
                return SeqPeekViewRecord(plot_data=SeqPeekViewPlotDataRecord(tracks=[], protein=None, regions=[]),
                                         hugo_symbol=hugo_symbol, cohort_id_list=[str(i) for i in cohort_id_array],
                                         removed_row_statistics=[])
        except Exception as e:
            logging.exception(e)
            raise InternalServerErrorException()
Example #21
def get_feature_vector(feature_id, cohort_id_array):
    include_tcga = False
    user_studies = ()
    for cohort_id in cohort_id_array:
        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)

            cursor.execute("SELECT project_id FROM cohorts_samples WHERE cohort_id = %s GROUP BY project_id", (cohort_id,))
            for row in cursor.fetchall():
                if row['project_id'] is None:
                    include_tcga = True
                else:
                    user_studies += (row['project_id'],)

        except Exception as e:
            if db: db.close()
            if cursor: cursor.close()
            raise e

    #  ex: feature_id 'CLIN:Disease_Code'
    user_feature_id = None
    if feature_id.startswith('USER:'):
        # Try to convert a shared user feature ID into a TCGA-derived feature ID
        user_feature_id = feature_id
        converted_feature_id = UserFeatureProvider.convert_user_feature_id(feature_id)
        if converted_feature_id is not None:
            feature_id = converted_feature_id
        else:
            # Querying a user feature, don't include TCGA
            include_tcga = False

    items = []
    type = None
    result = []
    cohort_settings = settings.GET_BQ_COHORT_SETTINGS()
    if include_tcga:
        provider = FeatureProviderFactory.from_feature_id(feature_id)
        result = provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id)

        # ex: result[0]
        # {'aliquot_id': None, 'case_id': u'TCGA-BH-A0B1', 'sample_id': u'TCGA-BH-A0B1-10A', 'value': u'BRCA'}
        for data_point in result:
            data_item = {key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id']}
            value = provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        type = provider.get_value_type()

    if len(user_studies) > 0:
        # Query User Data
        user_provider = UserFeatureProvider(feature_id, user_feature_id=user_feature_id)
        user_result = user_provider.get_data(cohort_id_array, cohort_settings.dataset_id, cohort_settings.table_id)
        result.extend(user_result)

        for data_point in user_result:
            data_item = {key: data_point[key] for key in ['case_id', 'sample_id', 'aliquot_id']}
            value = provider.process_data_point(data_point)
            # TODO refactor missing value logic
            if value is None:
                value = 'NA'
            data_item['value'] = value
            items.append(data_item)

        if not type:
            type = user_provider.get_value_type()

    return type, items
Example #22
    def from_user_feature_id(cls, feature_id):
        logging.debug("UserFeatureDef.from_user_feature_id {0}".format(str([feature_id])))
        # ID breakdown: Study ID:Feature ID
        # Example ID: USER:1:6
        regex = re_compile(
            "^USER:"******"([0-9]+):"
            # Feature ID
            "([0-9]+)$"
        )

        feature_fields = regex.findall(feature_id)
        if len(feature_fields) == 0:
            raise FeatureNotFoundException(feature_id)
        study_id, user_feature_id = feature_fields[0]
        bq_id = None
        shared_id = None
        is_numeric = False

        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)
            cursor.execute(
                """
                SELECT feature_name, bq_map_id, shared_map_id, is_numeric
                FROM projects_user_feature_definitions
                WHERE id = %s
            """,
                (user_feature_id,),
            )
            for row in cursor.fetchall():
                if row["shared_map_id"]:
                    shared_id = row["shared_map_id"]
                bq_id = row["bq_map_id"]
                is_numeric = row["is_numeric"] == 1

            cursor.close()
            db.close()

        except Exception as e:
            if db:
                db.close()
            if cursor:
                cursor.close()
            raise e

        if shared_id is not None:
            return cls.from_feature_id(bq_id, study_id)

        if bq_id is None:
            raise FeatureNotFoundException(feature_id)

        # Else we're querying a very specific feature from a specific study
        bq_table, column_name, symbol = cls.get_table_and_field(bq_id)
        if bq_table is None or column_name is None:
            raise FeatureNotFoundException(feature_id)

        logging.debug("{0} {1} {2}".format(bq_table, column_name, symbol))

        filters = None
        if symbol is not None:
            filters = {"Symbol": symbol}

        return [cls(bq_table, column_name, study_id, is_numeric, filters)]
Example #23
    def data_access_for_plot(self, request):
        """ Used by the web application."""
        try:
            x_id = request.x_id
            y_id = request.y_id
            c_id = request.c_id
            logTransform = json.loads(request.log_transform)
            cohort_id_array = request.cohort_id

            # Check that all requested feature identifiers are valid. Do not check for y_id if it is not
            # supplied in the request.
            feature_ids_to_check = [x_id]
            if c_id is not None:
                feature_ids_to_check.append(c_id)
            if y_id is not None:
                feature_ids_to_check.append(y_id)

            valid_features = self.get_feature_id_validity_for_array(feature_ids_to_check)

            for feature_id, is_valid in valid_features:
                logging.info((feature_id, is_valid))
                if not is_valid:
                    logging.error("Invalid internal feature ID '{}'".format(feature_id))
                    raise NotFoundException()

            # Get the project IDs these cohorts' samples come from
            cohort_vals = ()
            cohort_params = ""

            for cohort in cohort_id_array:
                cohort_params += "%s,"
                cohort_vals += (cohort,)

            cohort_params = cohort_params[:-1]

            db = sql_connection()
            cursor = db.cursor()

            tcga_studies = fetch_isbcgc_project_set()

            cursor.execute("SELECT DISTINCT project_id FROM cohorts_samples WHERE cohort_id IN ("+cohort_params+");",cohort_vals)

            # Only samples whose source studies are TCGA studies, or extended from them, should be used
            confirmed_study_ids = []
            unconfirmed_study_ids = []

            for row in cursor.fetchall():
                if row[0] in tcga_studies:
                    if row[0] not in confirmed_study_ids:
                        confirmed_study_ids.append(row[0])
                elif row[0] not in unconfirmed_study_ids:
                    unconfirmed_study_ids.append(row[0])

            if len(unconfirmed_study_ids) > 0:
                projects = Project.objects.filter(id__in=unconfirmed_study_ids)

                for project in projects:
                    if project.get_my_root_and_depth()['root'] in tcga_studies:
                        confirmed_study_ids.append(project.id)

            return self.get_merged_feature_vectors(x_id, y_id, c_id, cohort_id_array, logTransform, confirmed_study_ids)
        except NotFoundException as nfe:
            # Pass through NotFoundException so that it is not handled as Exception below.
            raise nfe
        except Exception as e:
            logger.exception(e)
            raise InternalServerErrorException()
Example #24
    def annotations(self, request):
        """
        Returns TCGA annotations about a specific aliquot.
        Takes an aliquot barcode (of length 28, *eg* TCGA-01-0628-11A-01D-0358-06) as a required parameter.
        User does not need to be authenticated.
        """

        cursor = None
        db = None

        aliquot_barcode = request.get_assigned_value('aliquot_barcode')
        query_tuple = (str(aliquot_barcode),)
        # check to make sure aliquot_barcode is in correct form
        try:
            parts = aliquot_barcode.split('-')
            assert len(parts) == 7
            assert len(parts[0]) == 4
            assert len(parts[1]) == 2
            assert len(parts[2]) == 4
            assert len(parts[3]) == 3
            assert len(parts[4]) in [2, 3]
            assert len(parts[5]) == 4
            assert len(parts[6]) == 2
        except AssertionError:
            raise endpoints.BadRequestException('{} is not the correct format for an aliquot barcode. '
                                                'Aliquot barcodes must be of the form XXXX-XX-XXXX-XXX-XXX-XXXX-XX '
                                                'or XXXX-XX-XXXX-XXX-XX-XXXX-XX.'.format(aliquot_barcode))

        item_type_name = request.get_assigned_value('item_type_name')

        # check to make sure each item_type_name is valid
        if len(item_type_name) > 0:
            for itm in item_type_name:
                itm = itm.strip()
                if itm.lower() not in ['patient', 'aliquot', 'analyte', 'shipped portion', 'portion', 'slide', 'sample']:
                    raise endpoints.BadRequestException("'{}' is not a valid entry for item_type_name. "
                                                        "Valid entries include 'Patient', 'Aliquot', 'Analyte', 'Shipped Portion', "
                                                        "'Portion', 'Slide', and 'Sample'".format(itm))
                query_tuple += (itm,)

        query_str = AliquotsAnnotationsQueryBuilder().build_query(item_type_name=item_type_name)
        metadata_samples_query_str = AliquotsAnnotationsQueryBuilder().build_metadata_samples_query()

        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)

            # build annotation message
            cursor.execute(query_str, query_tuple)
            rows = cursor.fetchall()
            cursor.execute(metadata_samples_query_str, (str(aliquot_barcode),))
            metadata_sample_rows = cursor.fetchall()
            if len(rows) == 0:
                cursor.close()
                db.close()
                if len(metadata_sample_rows) == 0:
                    msg = "Aliquot barcode {} not found in the database.".format(aliquot_barcode)
                    logger.info(msg)
                else:
                    msg = "No annotations found for aliquot barcode {}".format(aliquot_barcode)
                    if item_type_name is not None:
                        msg += " and item type name {}. Item type name must be one of the following: " \
                               "'Patient', 'Aliquot', 'Analyte', 'Shipped Portion', 'Portion', 'Slide', 'Sample'.".format(item_type_name)
                    logger.info(msg)
                raise endpoints.NotFoundException(msg)

            items = []
            for row in rows:
                constructor_dict = build_constructor_dict_for_message(MetadataAnnotationItem(), row)
                items.append(MetadataAnnotationItem(**constructor_dict))

            return MetadataAnnotationList(items=items, count=len(items))

        except (IndexError, TypeError) as e:
            logger.info("Aliquot {} not found. Error: {}".format(aliquot_barcode, e))
            raise endpoints.NotFoundException("Aliquot {} not found.".format(aliquot_barcode))
Example #25
    def get(self, request):
        """
        Given a sample barcode (of length 16, *eg* TCGA-B9-7268-01A), this endpoint returns
        all available "biospecimen" information about this sample,
        the associated patient barcode, a list of associated aliquots,
        and a list of "data_details" blocks describing each of the data files associated with this sample
        """

        cursor = None
        db = None

        sample_barcode = request.get_assigned_value('sample_barcode')
        pipeline = request.get_assigned_value('pipeline')
        platform = request.get_assigned_value('platform')

        aliquot_query_str = SamplesGetQueryBuilder().build_aliquot_query(platform=platform, pipeline=pipeline)
        biospecimen_query_str = SamplesGetQueryBuilder().build_biospecimen_query()
        data_query_str = SamplesGetQueryBuilder().build_data_query(platform=platform, pipeline=pipeline)
        patient_query_str = SamplesGetQueryBuilder().build_patient_query()

        query_tuple = (str(sample_barcode),)
        extra_query_tuple = query_tuple
        if pipeline is not None: extra_query_tuple += (pipeline,)
        if platform is not None: extra_query_tuple += (platform,)

        try:
            db = sql_connection()
            cursor = db.cursor(MySQLdb.cursors.DictCursor)

            # build biospecimen data message
            cursor.execute(biospecimen_query_str, query_tuple)
            row = cursor.fetchone()
            if row is None:
                cursor.close()
                db.close()
                error_message = "Sample barcode {} not found in metadata_biospecimen table.".format(sample_barcode)
                raise endpoints.NotFoundException(error_message)
            constructor_dict = build_constructor_dict_for_message(MetadataItem(), row)
            biospecimen_data_item = MetadataItem(**constructor_dict)

            # get list of aliquots
            cursor.execute(aliquot_query_str, extra_query_tuple)
            aliquot_list = [row['AliquotBarcode'] for row in cursor.fetchall()]

            # get patient barcode (superfluous?)
            cursor.execute(patient_query_str, query_tuple)
            row = cursor.fetchone()
            patient_barcode = str(row["case_barcode"])

            # prepare to build list of data details messages
            cursor.execute(data_query_str, extra_query_tuple)
            cursor_rows = cursor.fetchall()
            # update every dictionary in cursor_rows to contain the full cloud_storage_path for each sample
            bad_repo_count, bad_repo_set = \
                CohortsSamplesFilesMessageBuilder().get_GCS_file_paths_and_bad_repos(cursor_rows)
            if bad_repo_count > 0:
                logger.warn("not returning {count} row(s) in sample_details due to repositories: {bad_repo_list}"
                            .format(count=bad_repo_count, bad_repo_list=list(bad_repo_set)))

            # build a data details message for each row returned from metadata_data table
            data_details_list = []
            for row in cursor_rows:
                constructor_dict = build_constructor_dict_for_message(DataDetails(), row)
                data_details_item = DataDetails(**constructor_dict)
                data_details_list.append(data_details_item)

            return SampleDetails(aliquots=aliquot_list,
                                 biospecimen_data=biospecimen_data_item,
                                 data_details=data_details_list,
                                 data_details_count=len(data_details_list),
                                 patient=patient_barcode)

        except (IndexError, TypeError) as e:
            logger.info("Sample details for barcode {} not found. Error: {}".format(sample_barcode, e))
            raise endpoints.NotFoundException(
                "Sample details for barcode {} not found.".format(sample_barcode))
        except MySQLdb.ProgrammingError as e:
            logger.warn(e)
            raise endpoints.BadRequestException("Error retrieving biospecimen, patient, or other data. {}".format(e))
        finally:
            if cursor: cursor.close()
            if db and db.open: db.close()