Exemplo n.º 1
0
 def annotate(self, connection: Connection,
              genomic_interval: GenomicInterval,
              attrs: Optional[List[Vocabulary]], assembly) -> Selectable:
     """
     Build the statement that fetches the annotations overlapping a genomic interval.

     :param connection: connection stored on the instance and used when logging the statement.
     :param genomic_interval: interval (chrom, start, stop, strand) the annotations must overlap.
     :param attrs: a list of Vocabulary elements indicating the kind of annotation attributes desired
     :param assembly: 'hg19' selects the hg19 annotation item; any other value selects the GRCh38 one.
     :return: a statement that when executed returns the annotation data requested.
     """
     self.connection = connection
     # one output column per requested attribute, labelled with the attribute name
     labelled_columns = []
     for attr in attrs:
         labelled_columns.append(
             ann_table.c[self.col_map[attr]].label(attr.name))
     # an annotation overlaps when it starts before the interval ends and ends after the interval starts
     overlaps_interval = (
         (ann_table.c.start <= genomic_interval.stop)
         & (ann_table.c.stop >= genomic_interval.start)
         & (ann_table.c.chrom == genomic_interval.chrom))
     stmt = select(labelled_columns).where(overlaps_interval)
     # a strand of None or 0 means "do not filter by strand"
     if genomic_interval.strand is not None and genomic_interval.strand != 0:
         stmt = stmt.where(ann_table.c.strand == genomic_interval.strand)
     if assembly == 'hg19':
         item_id_for_assembly = item_id_assembly_hg19
     else:
         item_id_for_assembly = item_id_assembly_grch38
     stmt = stmt.where(ann_table.c.item_id == item_id_for_assembly)
     if self.log_sql_statements:
         log_title = 'GENCODE_V19_HG19: ANNOTATE REGION/VARIANT'
         utils.show_stmt(connection, stmt, self.logger.debug, log_title)
     return stmt
Exemplo n.º 2
0
 def view_of_variants_in_interval_or_type(self,
                                          select_columns: Optional[list]):
     """
     Create a database view over `regions` filtered by interval and/or variant type and return it as a Table.

     The filters are read from self.region_attrs.with_variants_in_reg (variants whose start falls inside the
     interval, on the same chromosome) and self.region_attrs.with_variants_of_type (variants whose mut_type is
     one of the given values). At least one of them must be set.

     :param select_columns: names of the columns of regions to expose in the view. If None, all the columns.
     :return: the created view, reflected from the database.
     :raises ValueError: if neither selection criterion is set.
     """
     interval = self.region_attrs.with_variants_in_reg
     variant_types = self.region_attrs.with_variants_of_type
     if interval is None and variant_types is None:
         raise ValueError(
             'you called this method without giving any selection criteria')

     if select_columns is None:
         columns = [regions]
     else:
         columns = [regions.c[c_name] for c_name in select_columns]
     stmt_as = select(columns)
     if interval is not None:
         same_chrom = regions.c.chrom == interval.chrom
         not_before_start = regions.c.start >= interval.start
         not_after_stop = regions.c.start <= interval.stop
         stmt_as = stmt_as.where(same_chrom & not_before_start & not_after_stop)
     if variant_types is not None:
         stmt_as = stmt_as.where(regions.c.mut_type.in_(variant_types))

     generated_view_name = utils.random_t_name_w_prefix(
         'mut_of_type_interval')
     stmt = utils.stmt_create_view_as(generated_view_name, stmt_as,
                                      default_schema_to_use_name)
     if self.log_sql_commands:
         log_title = 'VIEW OF REGIONS IN INTERVAL {} of types {}'.format(
             interval, variant_types)
         utils.show_stmt(self.connection, stmt, self.logger.debug, log_title)
     self.connection.execute(stmt)
     # reflect the freshly created view so that callers can query it like a table
     return Table(generated_view_name,
                  db_meta,
                  autoload=True,
                  autoload_with=self.connection,
                  schema=default_schema_to_use_name)
Exemplo n.º 3
0
 def _table_with_any_of_mutations(self, select_columns,
                                  only_item_id_in_table: Optional[Table],
                                  *mutations: Mutation):
     """Creates and returns a Table holding the rows of the table regions that match at least one of the
     variants in the argument mutations.
     :param select_columns selects only the column names in this collection. If None, selects all the columns from regions.
     :param only_item_id_in_table forwarded unchanged to _stmt_where_region_is_any_of_mutations; presumably it
     restricts the result to the individuals listed in that table (to be confirmed against that helper).
     :param mutations the variants to look for; at least one is required, otherwise ValueError is raised.
     """
     if not mutations:
         raise ValueError('function argument *mutations cannot be empty')

     # create table for the result
     result_t_name = utils.random_t_name_w_prefix('with_any_of_mut')
     if select_columns is None:
         columns = [regions]
     else:
         columns = [regions.c[c_name] for c_name in select_columns]
     stmt_as = self._stmt_where_region_is_any_of_mutations(
         *mutations,
         from_table=regions,
         select_expression=select(columns),
         only_item_id_in_table=only_item_id_in_table)
     stmt_create_table = utils.stmt_create_table_as(
         result_t_name, stmt_as, default_schema_to_use_name)
     if self.log_sql_commands:
         log_title = 'CREATE TABLE HAVING ANY OF THE {} MUTATIONS'.format(
             len(mutations))
         utils.show_stmt(self.connection, stmt_create_table,
                         self.logger.debug, log_title)
     self.connection.execute(stmt_create_table)
     # reflect the new table from the database
     return Table(result_t_name,
                  db_meta,
                  autoload=True,
                  autoload_with=self.connection,
                  schema=default_schema_to_use_name)
Exemplo n.º 4
0
 def _table_without_any_of_mutations(self):
     """
     Creates and returns a Table containing the item_id of the individuals in self.my_meta_t that own none of the
     variants listed in self.region_attrs.without_variants.
     The result is computed as: (item_id in my_meta_t) EXCEPT (item_id of the rows of regions matching any variant).
     Raises ValueError if the list of variants is empty.
     """
     mutations = self.region_attrs.without_variants
     if len(mutations) == 0:
         raise ValueError('function argument *mutations cannot be empty')

     # create table for the result
     result_t_name = utils.random_t_name_w_prefix('without_any_of_mut')
     # item_id of the individuals that DO have at least one of the variants
     item_ids_with_mutations = self._stmt_where_region_is_any_of_mutations(
         *mutations,
         from_table=regions,
         select_expression=select([regions.c.item_id]),
         only_item_id_in_table=self.my_meta_t)
     all_item_ids = select([self.my_meta_t.c.item_id])
     stmt_as = except_(all_item_ids, item_ids_with_mutations)
     stmt_create_table = utils.stmt_create_table_as(
         result_t_name, stmt_as, default_schema_to_use_name)
     if self.log_sql_commands:
         log_title = 'CREATE TABLE WITHOUT ANY OF THE {} MUTATIONS'.format(
             len(mutations))
         utils.show_stmt(self.connection, stmt_create_table,
                         self.logger.debug, log_title)
     self.connection.execute(stmt_create_table)
     return Table(result_t_name,
                  db_meta,
                  autoload=True,
                  autoload_with=self.connection,
                  schema=default_schema_to_use_name)
Exemplo n.º 5
0
 def table_with_variants_same_c_copy(self, select_columns: Optional[list]):
     """
      Creates and returns a table with the variants listed in RegionAttrs.with_variants_same_c_copy, taken only
      from the individuals that own all of them on the same chromosome copy.
      An intermediate table is created along the way and dropped before returning.
      :param select_columns: the list of column names to select from the result. If None, all the columns are taken.
      :raises ValueError: if fewer than two variants are given.
     """
     same_copy_variants = self.region_attrs.with_variants_same_c_copy
     if len(same_copy_variants) < 2:
         raise ValueError(
             'You must provide at least two Mutation instances in order to use this method.'
         )
     # Step 1: collect, from all the individuals, the rows matching any of the given variants. The columns
     # item_id, al1 and al2 are always needed by step 2, so they are added to the caller's selection.
     if select_columns is None:
         interm_select_column_names = None  # means all columns
     else:
         interm_select_column_names = set(select_columns)
         interm_select_column_names.update(['item_id', 'al1', 'al2'])
     intermediate_table = self._table_with_any_of_mutations(
         interm_select_column_names, self.my_meta_t, *same_copy_variants)

     # Step 2: group the rows by owner and keep the owners for which sum(al1) or sum(al2) equals the number of
     # the given variants; that condition implies the presence of all the variants in the same individual.
     if select_columns is None:
         result_columns = [intermediate_table]  # means all columns
     else:
         result_columns = [
             intermediate_table.c[col_name] for col_name in select_columns
         ]
     all_on_first_copy = \
         func.sum(intermediate_table.c.al1) == len(same_copy_variants)
     all_on_second_copy = \
         func.sum(func.coalesce(intermediate_table.c.al2, 0)) == len(same_copy_variants)
     owners_of_all = select([intermediate_table.c.item_id]) \
         .group_by(intermediate_table.c.item_id) \
         .having(all_on_first_copy | all_on_second_copy)
     stmt_as = select(result_columns) \
         .where(intermediate_table.c.item_id.in_(owners_of_all))

     target_t_name = utils.random_t_name_w_prefix('with_var_same_c_copy')
     stmt = utils.stmt_create_table_as(target_t_name, stmt_as,
                                       default_schema_to_use_name)
     if self.log_sql_commands:
         utils.show_stmt(
             self.connection, stmt, self.logger.debug,
             'INDIVIDUALS (+ THE GIVEN MUTATIONS) HAVING ALL THE SPECIFIED MUTATIONS ON THE SAME CHROMOSOME COPY'
         )
     self.connection.execute(stmt)
     # the intermediate table is no longer needed once the target table exists
     if self.log_sql_commands:
         self.logger.debug('DROP TABLE ' + intermediate_table.name)
     intermediate_table.drop(self.connection)
     return Table(target_t_name,
                  db_meta,
                  autoload=True,
                  autoload_with=self.connection,
                  schema=default_schema_to_use_name)
Exemplo n.º 6
0
    def variant_occurrence(self, connection: Connection, by_attributes: list,
                           meta_attrs: MetadataAttrs,
                           region_attrs: RegionAttrs,
                           variant: Mutation) -> Selectable:
        """
        Assembles a query statement that, after execution, returns one row for each individual matching the
        conditions in region_attrs and meta_attrs, carrying the attributes given in by_attributes and the number
        of times the given "variant" occurs in that individual (0 when the individual does not own it).
        The statement is returned without being executed; the supporting tables of metadata and regions are
        created as a side effect.
        """
        # init state
        self.connection = connection
        meta_column_names = []
        for attr in by_attributes:
            meta_column_names.append(self.meta_col_map[attr])
        self._set_meta_attributes(meta_attrs)
        self.create_table_of_meta(meta_column_names + ['item_id'])
        self._set_region_attributes(region_attrs)
        self.create_table_of_regions(['item_id'])

        # sample set: the target attributes + item_id from the table of metadata with meta_attrs ...
        sample_columns = [
            self.my_meta_t.c[self.meta_col_map[attr]] for attr in by_attributes
        ]
        sample_columns.append(self.my_meta_t.c.item_id)
        sample_set = select(sample_columns)
        # ... restricted to the individuals in the table of regions with region_attrs, when there is one
        if self.my_region_t is not None:
            region_item_ids = select([self.my_region_t.c.item_id]).distinct()
            sample_set = sample_set.where(
                self.my_meta_t.c.item_id.in_(region_item_ids))
        sample_set = sample_set.alias()

        # individuals with "variant" in table genomes, with the occurrence computed as al1 + al2 (null al2 counts 0)
        occurrence_name = Vocabulary.OCCURRENCE.name
        occurrence = (genomes.c.al1 + func.coalesce(genomes.c.al2, 0)) \
            .label(occurrence_name)
        samples_w_var = self._stmt_where_region_is_any_of_mutations(
            variant,
            from_table=genomes,
            select_expression=select([genomes.c.item_id, occurrence])
        ).alias('samples_w_var')

        # left join: every individual of the sample set is kept, and a missing occurrence becomes 0
        output_columns = [
            sample_set.c[self.meta_col_map[attr]].label(attr.name)
            for attr in by_attributes
        ]
        output_columns.append(
            func.coalesce(column(occurrence_name), 0).label(occurrence_name))
        sample_set_left_join_variant = sample_set.outerjoin(
            samples_w_var, sample_set.c.item_id == samples_w_var.c.item_id)
        stmt = select(output_columns).select_from(sample_set_left_join_variant)
        # TODO test what happens if sample set is empty and it is anyway used in the left join statement
        if self.log_sql_commands:
            utils.show_stmt(connection, stmt, self.logger.debug,
                            'KGENOMES: STMT VARIANT OCCURRENCE')
        return stmt
Exemplo n.º 7
0
 def table_with_all_of_mutations(self, select_columns: Optional[list]):
     """
      Creates and returns a table with the variants listed in RegionAttrs.with_variants, taken only from the
      individuals that own all of them.
      With a single variant the result of _table_with_any_of_mutations is returned directly. With more variants,
      a table with any of them is created first, the individuals having as many rows as the number of variants
      are selected from it, and then that first table is dropped.
      :param select_columns: the list of column names to select from the result. If None, all the columns are taken.
      :raises ValueError: if RegionAttrs.with_variants is empty or not set.
     """
     required_variants = self.region_attrs.with_variants
     if not required_variants:
         raise ValueError(
             'instance parameter self.with_variants not initialized')
     if len(required_variants) == 1:
         # "all of one" is the same as "any of one"
         return self._table_with_any_of_mutations(
             select_columns, self.my_meta_t, *required_variants)

     # item_id is needed for grouping, so it is added to the caller's selection
     if select_columns is None:
         union_select_column_names = None  # means all columns
     else:
         union_select_column_names = set(select_columns)
         union_select_column_names.add('item_id')
     union_table = self._table_with_any_of_mutations(
         union_select_column_names, self.my_meta_t, *required_variants)

     # extracts only the samples having all the mutations
     if select_columns is None:
         result_select_columns = [union_table]  # means all columns
     else:
         result_select_columns = [
             union_table.c[col_name] for col_name in select_columns
         ]
     owners_of_all = select([union_table.c.item_id]) \
         .group_by(union_table.c.item_id) \
         .having(func.count(union_table.c.item_id) == len(required_variants))
     stmt_as = select(result_select_columns) \
         .where(union_table.c.item_id.in_(owners_of_all))

     target_t_name = utils.random_t_name_w_prefix('with')
     stmt_create_table = utils.stmt_create_table_as(
         target_t_name, stmt_as, default_schema_to_use_name)
     if self.log_sql_commands:
         log_title = 'INDIVIDUALS HAVING "ALL" THE {} MUTATIONS (WITH DUPLICATE ITEM_ID)' \
             .format(len(required_variants))
         utils.show_stmt(self.connection, stmt_create_table,
                         self.logger.debug, log_title)
     self.connection.execute(stmt_create_table)
     # the table with any of the mutations is no longer needed
     if self.log_sql_commands:
         self.logger.debug('DROP TABLE ' + union_table.name)
     union_table.drop(self.connection)
     return Table(target_t_name,
                  db_meta,
                  autoload=True,
                  autoload_with=self.connection,
                  schema=default_schema_to_use_name)
Exemplo n.º 8
0
    def donors(self, connection, by_attributes: List[Vocabulary],
               meta_attrs: MetadataAttrs, region_attrs: RegionAttrs,
               with_download_urls: bool) -> Selectable:
        """
        Assembles a query statement that, when executed, returns one row for each individual matching the
        requirements in meta_attrs and region_attrs, carrying the attributes in "by_attributes" and, when
        with_download_urls is True, the local URL of the individual's public item.
        The statement is returned without being executed; the supporting tables of metadata and regions are
        created as a side effect.
        """
        # init state
        self.connection = connection
        meta_column_names = []
        for attr in by_attributes:
            meta_column_names.append(self.meta_col_map[attr])
        self._set_meta_attributes(meta_attrs)
        self.create_table_of_meta(meta_column_names)
        self._set_region_attributes(region_attrs)
        self.create_table_of_regions(['item_id'])

        # TCGA has 4 gender classes: males/females/not reported/<no gender at all>. This trick merges null gender with
        # not reported. Otherwise, when coordinator does group by cube(gender) we would get 2 times a null gender.
        # The gender column, if requested, is therefore built apart and placed after all the other columns.
        columns_of_interest = []
        for attr in by_attributes:
            if attr is Vocabulary.GENDER:
                continue
            columns_of_interest.append(
                self.my_meta_t.c[self.meta_col_map[attr]].label(attr.name))
        if Vocabulary.GENDER in by_attributes:
            gender_column = self.my_meta_t.c[self.meta_col_map[Vocabulary.GENDER]]
            columns_of_interest.append(
                func.coalesce(gender_column, 'not reported')
                .label(Vocabulary.GENDER.name))

        # compute statistics
        if with_download_urls:
            url_column = public_item.c.local_url
            columns_of_interest.append(
                url_column.label(Vocabulary.DOWNLOAD_REGION_URL.name))
        stmt = select(columns_of_interest)
        # keep only the individuals that also appear in the table of regions, when there is one
        if self.my_region_t is not None:
            region_item_ids = select([self.my_region_t.c.item_id]).distinct()
            stmt = stmt.where(self.my_meta_t.c.item_id.in_(region_item_ids))
        # join condition with the table of public items, which provides the URL
        if with_download_urls:
            stmt = stmt.where(
                self.my_meta_t.c.item_id == public_item.c.item_id)
        if self.log_sql_commands:
            utils.show_stmt(self.connection, stmt, self.logger.debug,
                            'TCGA: STMT DONORS WITH REQUIRED ATTRIBUTES')
        return stmt
Exemplo n.º 9
0
 def take_regions_of_common_individuals(self, tables: list):
     """
     Generates a table containing all the mutations from all the origin tables but only for those individuals that
     appear in all the origin tables.
     If each origin table represents a characteristic that the final sample set must have, this method puts those
     characteristics in AND relationship: only the regions of the individuals having all of them are kept.
     When a single table is given, that same table is returned and nothing is created.
     :param tables: The source tables which must have the same columns in the same order.
     """
     if len(tables) == 1:
         return tables[0]

     # join 1st with 2nd with 3rd ... with nth on item_id
     # TODO consider creating temporary tables selecting only the item_id before joining
     def join_on_item_id(joined_so_far, next_table):
         return joined_so_far.join(
             next_table, tables[0].c.item_id == next_table.c.item_id)

     stmt_join = reduce(join_on_item_id, tables)
     # union of tables
     # TODO consider selecting from union table only what is needed by the users of this method (parametric choice)
     select_all_from_each_table = [select([table_]) for table_ in tables]
     stmt_union = union(*select_all_from_each_table).alias()
     # select from the union table only the item_id that exists in the join
     item_id_is_in_join = exists(
         select()
         .select_from(stmt_join)
         .where(stmt_union.c.item_id == tables[0].c.item_id))
     stmt_as = select([stmt_union]).where(item_id_is_in_join)

     target_t_name = utils.random_t_name_w_prefix('intersect')
     stmt_create_table = utils.stmt_create_table_as(
         target_t_name, stmt_as, default_schema_to_use_name)
     if self.log_sql_commands:
         utils.show_stmt(
             self.connection, stmt_create_table, self.logger.debug,
             'SELECT ALL FROM SOURCE TABLES WHERE item_id IS IN ALL SOURCE TABLES'
         )
     self.connection.execute(stmt_create_table)
     # TODO drop partial tables ?
     return Table(target_t_name,
                  db_meta,
                  autoload=True,
                  autoload_with=self.connection,
                  schema=default_schema_to_use_name)
Exemplo n.º 10
0
 def find_gene_region(self, connection: Connection, gene: Gene,
                      output_attrs: List[Vocabulary], assembly):
     """
     Build the statement that looks up a gene in the annotation table.

     :param connection: connection stored on the instance and used when logging the statement.
     :param gene: the gene to find; its name is always matched, type_ and id_ only when they are not None.
     :param output_attrs: Vocabulary elements naming the columns to return; each column is labelled with the
         name of the element.
     :param assembly: 'hg19' selects the hg19 annotation item; any other value selects the GRCh38 one.
     :return: the statement; it is not executed here.
     """
     self.connection = connection
     labelled_columns = []
     for att in output_attrs:
         labelled_columns.append(
             ann_table.c[self.col_map[att]].label(att.name))
     stmt = select(labelled_columns)
     stmt = stmt.where(ann_table.c.gene_name == gene.name)
     # optional filters, applied only when the caller specified them
     if gene.type_ is not None:
         stmt = stmt.where(ann_table.c.gene_type == gene.type_)
     if gene.id_ is not None:
         stmt = stmt.where(ann_table.c.gene_id == gene.id_)
     if assembly == 'hg19':
         item_id_for_assembly = item_id_assembly_hg19
     else:
         item_id_for_assembly = item_id_assembly_grch38
     stmt = stmt.where(ann_table.c.item_id == item_id_for_assembly)
     if self.log_sql_statements:
         log_title = 'GENCODE_V19_HG19: FIND GENE'
         utils.show_stmt(connection, stmt, self.logger.debug, log_title)
     return stmt
Exemplo n.º 11
0
    def get_variant_details(self, connection: Connection, variant: Mutation,
                            which_details: List[Vocabulary], assembly) -> list:
        """
        Looks up a single variant in the table genomes and returns the requested details.

        The variant is matched on (chrom, start, ref, alt) when variant.chrom is not None, otherwise on its id,
        and only among the items whose metadata assembly equals the given assembly.

        :param connection: connection used to execute the query; it is also stored on the instance.
        :param variant: the variant to search for.
        :param which_details: Vocabulary elements to return. An element without a mapping in region_col_map
            yields the literal Vocabulary.unknown.name in its position.
        :param assembly: value compared with metadata.assembly.
        :return: the values of the first matching row, in the order of which_details, or an empty list if the
            variant was not found. If more than one distinct row matches, an error is logged and the first row
            is returned.
        """
        self.connection = connection
        select_columns = []
        for att in which_details:
            mapping = self.region_col_map.get(att)
            if mapping is not None:
                select_columns.append(genomes.c[mapping].label(att.name))
            else:
                # attribute not available in this source: return a placeholder instead of failing
                select_columns.append(
                    cast(literal(Vocabulary.unknown.name),
                         types.String).label(att.name))

        stmt = select(select_columns).distinct()
        if variant.chrom is not None:
            stmt = stmt.where((genomes.c.chrom == variant.chrom)
                              & (genomes.c.start == variant.start)
                              & (genomes.c.ref == variant.ref)
                              & (genomes.c.alt == variant.alt))
        else:
            stmt = stmt.where(genomes.c.id == variant.id)
        stmt = stmt.where(
            genomes.c.item_id.in_(
                select([metadata.c.item_id
                        ]).where(metadata.c.assembly == assembly)))
        if self.log_sql_commands:
            utils.show_stmt(connection, stmt, self.logger.debug,
                            'GET VARIANT DETAILS')
        result = connection.execute(stmt)
        # Count the fetched rows instead of reading result.rowcount: for SELECT statements rowcount is
        # driver-dependent. The result is closed on every path, including when fetching fails.
        try:
            rows = result.fetchall()
        finally:
            result.close()
        if not rows:
            return list()
        if len(rows) > 1:
            self.logger.error(
                f'user searched for variant: chrom {str(variant.chrom)}, start {str(variant.start)}, '
                f'ref {str(variant.ref)}, alt {str(variant.alt)}, id {str(variant.id)}, '
                f'but {len(rows)} results were found')
        return rows[0].values()
Exemplo n.º 12
0
    def donors(self, connection, by_attributes: List[Vocabulary],
               meta_attrs: MetadataAttrs, region_attrs: RegionAttrs,
               with_download_urls: bool) -> Selectable:
        """
        Assembles a query statement that, when executed, returns one row for each individual matching the
        requirements in meta_attrs and region_attrs, carrying the attributes in "by_attributes" and, when
        with_download_urls is True, the local URL of the individual's public item.
        The statement is returned without being executed; the supporting tables of metadata and regions are
        created as a side effect.
        """
        # init state
        self.connection = connection
        meta_column_names = []
        for attr in by_attributes:
            meta_column_names.append(self.meta_col_map[attr])
        self._set_meta_attributes(meta_attrs)
        self.create_table_of_meta(meta_column_names)
        self._set_region_attributes(region_attrs)
        self.create_table_of_regions(['item_id'])

        # compute statistics
        columns_of_interest = []
        for attr in by_attributes:
            meta_column = self.my_meta_t.c[self.meta_col_map[attr]]
            columns_of_interest.append(meta_column.label(attr.name))
        if with_download_urls:
            url_column = public_item.c.local_url
            columns_of_interest.append(
                url_column.label(Vocabulary.DOWNLOAD_REGION_URL.name))
        stmt = select(columns_of_interest)
        # keep only the individuals that also appear in the table of regions, when there is one
        if self.my_region_t is not None:
            region_item_ids = select([self.my_region_t.c.item_id]).distinct()
            stmt = stmt.where(self.my_meta_t.c.item_id.in_(region_item_ids))
        # join condition with the table of public items, which provides the URL
        if with_download_urls:
            stmt = stmt.where(
                self.my_meta_t.c.item_id == public_item.c.item_id)
        if self.log_sql_commands:
            utils.show_stmt(self.connection, stmt, self.logger.debug,
                            'KGENOMES: STMT DONORS WITH REQUIRED ATTRIBUTES')
        return stmt
Exemplo n.º 13
0
def try_stmt(what,
             log_function: Optional[Callable],
             log_title: Optional[str],
             num_attempts: int = 2) -> ResultProxy:
    """
    Execute a statement on a fresh autocommit connection, retrying after a database error.

    On a DatabaseError the engine's connection pool is disposed and, while attempts remain, the statement is
    tried again through a recursive call; when no attempt is left the error is re-raised.
    The connection opened by each attempt is closed before this function returns or raises.

    :param what: the statement to execute.
    :param log_function: if not None, the statement is shown through it before the execution.
    :param log_title: title passed along with log_function.
    :param num_attempts: total number of executions to try.
    :return: the result of the execution.
    """
    # the following instruction can raise OperationalError if the database is not reachable/not connected, but
    # that is caught elsewhere
    conn = db_engine.connect().execution_options(autocommit=True)
    try:
        num_attempts -= 1
        if log_function is not None:
            db_utils.show_stmt(conn, what, log_function, log_title)
        return conn.execute(what)
    except sqlalchemy_exceptions.DatabaseError as db_error:  # pooled database connection has been invalidated/restarted
        logger.debug('Connection has been reset. Invalidate connection pool.')
        db_engine.dispose()
        logger.debug(f'POOL STATUS {str(db_engine.pool.status())}')
        if num_attempts <= 0:
            raise db_error
        logger.debug('Attempt {} more time(s)'.format(num_attempts))
        return try_stmt(what, log_function, log_title, num_attempts)
    finally:
        conn.close()
0
    def variants_in_region(self, connection: Connection,
                           genomic_interval: GenomicInterval,
                           output_region_attrs: List[Vocabulary],
                           meta_attrs: MetadataAttrs,
                           region_attrs: Optional[RegionAttrs]) -> Selectable:
        """
        Assembles a query statement that, when executed, returns the distinct variants whose start falls inside
        genomic_interval (same chromosome), taken from the individuals matching meta_attrs and region_attrs.
        The statement is returned without being executed; the supporting tables of metadata and regions are
        created as a side effect.

        :param connection: connection stored on the instance and used when logging the statement.
        :param genomic_interval: the interval (chrom, start, stop) to look into.
        :param output_region_attrs: Vocabulary elements naming the columns of regions to return; each column is
            labelled with the name of the element.
        :param meta_attrs: requirements on the metadata of the individuals.
        :param region_attrs: requirements on the regions of the individuals.
        """
        # init state
        self.connection = connection
        self._set_meta_attributes(meta_attrs)
        self.create_table_of_meta(['item_id'])
        self._set_region_attributes(region_attrs)
        self.create_table_of_regions(['item_id'])

        # individuals to consider: those in the table of metadata and, when there is one, also in the table of regions
        meta_item_ids = select([self.my_meta_t.c.item_id])
        if self.my_region_t is None:
            only_from_samples = meta_item_ids
        else:
            only_from_samples = intersect(
                meta_item_ids, select([self.my_region_t.c.item_id]))
        only_from_samples = only_from_samples.alias('samples')

        select_columns = [
            regions.c[self.region_col_map[att]].label(att.name)
            for att in output_region_attrs
        ]
        regions_of_samples = regions.join(
            only_from_samples,
            only_from_samples.c.item_id == regions.c.item_id)
        starts_in_interval = (
            (regions.c.start >= genomic_interval.start)
            & (regions.c.start <= genomic_interval.stop)
            & (regions.c.chrom == genomic_interval.chrom))
        stmt = select(select_columns).distinct() \
            .select_from(regions_of_samples) \
            .where(starts_in_interval)

        if self.log_sql_commands:
            log_title = f'TCGA: VARIANTS IN REGION {genomic_interval.chrom}-{genomic_interval.start}-{genomic_interval.stop}'
            utils.show_stmt(connection, stmt, self.logger.debug, log_title)
        return stmt
Exemplo n.º 15
0
    def rank_variants_by_frequency(self, connection, meta_attrs: MetadataAttrs,
                                   region_attrs: RegionAttrs, ascending: bool,
                                   freq_threshold: float, limit_result: int,
                                   time_estimate_only: bool) -> FromClause:
        """
        Assembles a statement ranking by frequency the variants of the individuals matching meta_attrs and
        region_attrs.

        Unlike the other statement builders, this method also executes a query on the given connection, to
        count the selected individuals by gender before assembling the ranking statement.

        :param connection: connection used for the counting query and for logging; also stored on the instance.
        :param meta_attrs: requirements on the metadata of the individuals; its assembly picks the database
            function used to compute the frequency (the hg19 one for 'hg19', the GRCh38 one otherwise).
        :param region_attrs: requirements on the regions of the individuals.
        :param ascending: if True the result is ordered by ascending frequency and occurrence, otherwise descending.
        :param freq_threshold: if truthy, keeps only the variants with frequency >= threshold when ascending,
            or <= threshold when descending. A value of 0 or None disables the filter.
        :param limit_result: maximum number of rows of the result.
        :param time_estimate_only: if True, no ranking statement is built: estimates are sent through
            notify_message and EmptyResult is raised.
        :return: the ranking statement, aliased as 'TCGA_ranked'. It is not executed here.
        :raises EmptyResult: if no individual is selected, or always when time_estimate_only is True.
        """
        # init state
        self.connection = connection
        self._set_meta_attributes(meta_attrs)
        self.create_table_of_meta(['item_id', 'gender'])
        self._set_region_attributes(region_attrs)
        self.create_table_of_regions(['item_id'])

        # count the selected individuals grouped by gender (this query is executed immediately)
        females_and_males_stmt = select(
            [self.my_meta_t.c.gender, func.count()])
        if self.my_region_t is not None:
            females_and_males_stmt = females_and_males_stmt\
                .where(self.my_meta_t.c.item_id.in_(select([self.my_region_t.c.item_id])))
        females_and_males_stmt = females_and_males_stmt \
            .group_by(self.my_meta_t.c.gender)
        # each element is [gender, number of individuals]
        gender_of_individuals = [
            row.values()
            for row in connection.execute(females_and_males_stmt).fetchall()
        ]
        if len(gender_of_individuals) == 0:
            raise EmptyResult('TCGA ')
        females = next(
            (el[1] for el in gender_of_individuals if el[0] == 'female'), 0)
        males = next(
            (el[1] for el in gender_of_individuals if el[0] == 'male'), 0)
        # everybody whose gender is neither 'male' nor 'female': total minus males minus females
        other_genders = reduce(
            lambda x1, x2: x1 + x2,
            [el[1] for el in gender_of_individuals]) - males - females
        self.logger.debug(
            f'TCGA: request /rank_variants_by_frequency for a population of {males+females+other_genders} individuals'
        )

        if time_estimate_only:
            # only notify the estimates, then stop by raising EmptyResult
            approx_pop_size = males + females + other_genders
            # NOTE(review): 0.3 and 232 are per-individual estimation factors whose origin is not visible here
            self.notify_message(SourceMessage.Type.TIME_TO_FINISH,
                                str(int(0.3 * approx_pop_size)))
            self.notify_message(
                SourceMessage.Type.GENERAL_WARNING,
                f'Samples to analyze in TCGA: {approx_pop_size}')
            # the 'n' format below is locale-aware; note that setlocale changes the locale of the whole process
            locale.setlocale(locale.LC_ALL, '')
            estimated_n_variants = 232 * approx_pop_size
            self.notify_message(
                SourceMessage.Type.GENERAL_WARNING,
                f'Estimated number of variants to rank in TCGA: ~{estimated_n_variants:n}'
            )
            raise EmptyResult('TCGA')
        if other_genders > 0:
            self.notify_message(
                SourceMessage.Type.GENERAL_WARNING,
                'Note for TCGA data: Individuals with an undefined gender have been excluded from the population while '
                'calculating the frequency of variants in chromosomes 23 and 24'
            )

        # reduce size of the join with regions table
        genomes_red = select(
            [regions.c.item_id, regions.c.chrom, regions.c.start, regions.c.ref, regions.c.alt, regions.c.al1,
             regions.c.al2])\
            .alias('variants_few_columns')

        # custom functions
        # occurrence = sum(al1) + sum(al2), where a null al2 counts as 0
        func_occurrence = (func.sum(genomes_red.c.al1) + func.sum(
            func.coalesce(genomes_red.c.al2, 0))).label('occurrence_by_gender')
        func_positive_donors = func.count(
            genomes_red.c.item_id).label('positives_by_gender')

        # Actually, self.my_region_t already contains only the individuals compatible with meta_attrs, but it can contain
        # duplicated item_id. Since we want to join, it's better to remove them.
        sample_set_with_limit = select(
            [self.my_meta_t.c.item_id, self.my_meta_t.c.gender])
        if self.my_region_t is not None:
            sample_set_with_limit = sample_set_with_limit \
                .where(self.my_meta_t.c.item_id.in_(
                    select([self.my_region_t.c.item_id])
                ))
        sample_set_with_limit = sample_set_with_limit.alias('sample_set')

        # level 1: occurrence and positive donors of each variant, separately for each gender of the sample set
        stmt = select([genomes_red.c.chrom,
                       genomes_red.c.start,
                       genomes_red.c.ref,
                       genomes_red.c.alt,
                       func_occurrence,
                       func_positive_donors,
                       sample_set_with_limit.c.gender]) \
            .select_from(genomes_red.join(
                sample_set_with_limit,
                genomes_red.c.item_id == sample_set_with_limit.c.item_id)) \
            .group_by(genomes_red.c.chrom, genomes_red.c.start, genomes_red.c.ref, genomes_red.c.alt, sample_set_with_limit.c.gender) \
            .alias('stmt_1')

        # level 2: merge the per-gender rows of each variant.
        # The 'males' column carries males + other genders for the chromosomes other than 23/24, and only the
        # males for chromosomes 23/24.
        # do not count the occurrences and the positiveness of whom who aren't males/females for a variant in chrom 23/24
        outer_stmt = \
            select([stmt.c.chrom, stmt.c.start, stmt.c.ref, stmt.c.alt] +
                   [
                       case([
                           ((stmt.c.chrom < 23) | (stmt.c.chrom > 24), males+other_genders)
                            ], else_=males).label('males'),
                       cast(func.sum(column('positives_by_gender')), types.INTEGER).label('positives'),
                       cast(func.sum(column('occurrence_by_gender')), types.INTEGER).label('occurrence')
                   ]
                   ) \
            .where((stmt.c.gender.in_(['male', 'female'])) | (stmt.c.chrom < 23) | (stmt.c.chrom > 24)) \
            .group_by(stmt.c.chrom, stmt.c.start, stmt.c.ref, stmt.c.alt) \
            .alias('stmt_2')

        # other custom function
        # the frequency is computed by a database function (schema rr) specific to the assembly
        if meta_attrs.assembly == 'hg19':
            func_frequency_new = func.rr.mut_frequency_new_hg19(
                column('occurrence'), column('males'), females,
                outer_stmt.c.chrom, outer_stmt.c.start)
        else:
            func_frequency_new = func.rr.mut_frequency_new_grch38(
                column('occurrence'), column('males'), females,
                outer_stmt.c.chrom, outer_stmt.c.start)
        func_frequency_new = func_frequency_new.label(
            Vocabulary.FREQUENCY.name)

        # level 3: output columns labelled with the Vocabulary names, then filter, order and limit
        outer_outer_stmt = \
            select([
                outer_stmt.c.chrom.label(Vocabulary.CHROM.name),
                outer_stmt.c.start.label(Vocabulary.START.name),
                outer_stmt.c.ref.label(Vocabulary.REF.name),
                outer_stmt.c.alt.label(Vocabulary.ALT.name),
                (column('males') + females).label(Vocabulary.POPULATION_SIZE.name),
                column('positives').label(Vocabulary.POSITIVE_DONORS.name),
                column('occurrence').label(Vocabulary.OCCURRENCE.name),
                func_frequency_new
            ])
        # the threshold is a lower bound when ranking ascending and an upper bound when ranking descending
        if ascending:
            if freq_threshold:
                outer_outer_stmt = outer_outer_stmt.where(
                    func_frequency_new >= freq_threshold)
            outer_outer_stmt = outer_outer_stmt.order_by(
                asc(func_frequency_new), asc(column('occurrence')))
        else:
            if freq_threshold:
                outer_outer_stmt = outer_outer_stmt.where(
                    func_frequency_new <= freq_threshold)
            outer_outer_stmt = outer_outer_stmt.order_by(
                desc(func_frequency_new), desc(column('occurrence')))
        outer_outer_stmt = outer_outer_stmt.limit(limit_result) \
            .alias('TCGA_ranked')

        if self.log_sql_commands:
            utils.show_stmt(connection, outer_outer_stmt, self.logger.debug,
                            'TCGA: RANKING VARIANTS IN SAMPLE SET')
        return outer_outer_stmt