Exemplo n.º 1
0
    def column_values_at_locus(
            self,
            column_name,
            feature,
            contig,
            position,
            end=None,
            strand=None,
            distinct=False,
            sorted=False):
        """
        Collect the non-null entries of one column of a feature table for
        all rows whose [start, end] interval overlaps the queried interval.

        Parameters
        ----------
        column_name : str
            Column to read; must be a non-empty string and must exist in
            the `feature` table.

        feature : str
            Name of the table to query.

        contig : str
            Chromosome / contig name; passed through normalize_chromosome
            before being matched against the `seqname` column.

        position : int
            First position of the queried interval.

        end : int, optional
            Last position of the queried interval (defaults to `position`).

        strand : optional
            When truthy, only rows whose `strand` column equals this value
            are returned.

        distinct : bool
            Add DISTINCT to the SELECT clause.

        sorted : bool
            Sort the collected values before returning them.

        Returns a list of column values.

        Raises ValueError if the table lacks the requested column.
        """
        # TODO: combine with the query method, since they overlap
        # significantly
        require_string(column_name, "column_name", nonempty=True)
        contig = normalize_chromosome(contig)
        require_integer(position, "position")

        # a missing end means the query covers a single position
        last_position = position if end is None else end
        require_integer(last_position, "end")

        if not self.column_exists(feature, column_name):
            raise ValueError(
                "Table %s doesn't have column %s" % (feature, column_name))

        select_modifier = "DISTINCT " if distinct else ""

        sql = """
            SELECT %s%s
            FROM %s
            WHERE seqname = ?
            AND start <= ?
            AND end >= ?

        """ % (select_modifier, column_name, feature)

        # overlap test: row.start <= query end AND row.end >= query start
        sql_args = [contig, last_position, position]

        if strand:
            sql += " AND strand = ?"
            sql_args.append(strand)

        rows = self.connection.execute(sql, sql_args).fetchall()

        # every row is a one-element tuple; keep its value unless it's NULL
        values = []
        for row in rows:
            value = row[0]
            if value is not None:
                values.append(value)

        if sorted:
            values.sort()
        return values
Exemplo n.º 2
0
def test_require_integer():
    """
    require_integer accepts zero, positive and negative ints and raises
    TypeError for a string or None.
    """
    # values that must pass without raising
    for accepted in (0, 10, -10):
        require_integer(accepted)

    # values that must be rejected
    for rejected in ("", None):
        with assert_raises(TypeError):
            require_integer(rejected)
Exemplo n.º 3
0
 def from_interbase_coordinates(contig, start, end=None):
     '''
     Build a Locus from 0-based interbase coordinates.

     `contig` must be a string and `start` / `end` must be integers. When
     `end` is omitted the locus spans the single base after `start`,
     i.e. end = start + 1.
     '''
     typechecks.require_string(contig)
     typechecks.require_integer(start)
     # default to a one-base-wide interval
     stop = start + 1 if end is None else end
     typechecks.require_integer(stop)
     return Locus(contig, start, stop)
Exemplo n.º 4
0
 def from_interbase_coordinates(contig, start, end=None):
     '''
     Build a Locus from 0-based interbase coordinates.

     `contig` must be a string and `start` / `end` must be integers. When
     `end` is omitted the locus spans the single base after `start`,
     i.e. end = start + 1. The contig name is passed through
     pyensembl's normalize_chromosome before the Locus is created.
     '''
     typechecks.require_string(contig)
     typechecks.require_integer(start)
     # default to a one-base-wide interval
     stop = start + 1 if end is None else end
     typechecks.require_integer(stop)
     normalized_contig = pyensembl.locus.normalize_chromosome(contig)
     return Locus(normalized_contig, start, stop)
Exemplo n.º 5
0
 def from_inclusive_coordinates(contig, start, end=None):
     '''
     Build a Locus from 1-based coordinates that include both endpoints.

     Locus instances always hold 0-based "interbase" coordinates, so the
     start is shifted down by one while the end is kept as given. When
     `end` is omitted it defaults to `start` (a single base).
     '''
     typechecks.require_string(contig)
     typechecks.require_integer(start)
     last_base = start if end is None else end
     typechecks.require_integer(last_base)
     # 1-based inclusive [start, end] == interbase [start - 1, end]
     interbase_start = start - 1
     return Locus(contig, interbase_start, last_base)
Exemplo n.º 6
0
def _create_cached_db(db_path, tables, version=1):
    """
    Open the sqlite database at `db_path`, reusing its contents when they
    match what was requested and rebuilding them otherwise.

    Parameters
    --------
    db_path : str
        Path to sqlite3 database file

    tables : iterable of datacache.DatabaseTable
        Tables the database should contain; each one's `name` attribute
        is used to check for an existing copy.

    version : int, optional
        Version the existing database must report to be reused.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # only a file that this call brought into existence may be deleted
    # again if something goes wrong below
    created_by_this_call = not exists(db_path)

    db = Database(db_path)
    wanted_names = [table.name for table in tables]

    # any failure from here on must not leave an empty DB file behind
    try:
        # reusable only if all tables are present and the version matches
        cache_is_usable = (
            db.has_tables(wanted_names) and
            db.has_version() and
            db.version() == version)
        if not cache_is_usable:
            if len(db.table_names()) > 0:
                logger.info(
                    "Dropping tables from database %s: %s",
                    db_path,
                    ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info(
                "Creating database %s containing: %s",
                db_path,
                ", ".join(wanted_names))
            db.create(tables, version)
        else:
            logger.info("Found existing table in database %s", db_path)
    except:
        # clean up, then let the original error propagate unchanged
        logger.warning(
            "Failed to create tables %s in database %s",
            wanted_names,
            db_path)
        db.close()
        if created_by_this_call:
            remove(db_path)
        raise
    return db.connection
Exemplo n.º 7
0
 def from_inclusive_coordinates(contig, start, end=None):
     '''
     Build a Locus from 1-based coordinates that include both endpoints.

     Locus instances always hold 0-based "interbase" coordinates, so the
     start is shifted down by one while the end is kept as given. When
     `end` is omitted it defaults to `start` (a single base). The contig
     name is passed through pyensembl's normalize_chromosome before the
     Locus is created.
     '''
     typechecks.require_string(contig)
     typechecks.require_integer(start)
     last_base = start if end is None else end
     typechecks.require_integer(last_base)
     normalized_contig = pyensembl.locus.normalize_chromosome(contig)
     # 1-based inclusive [start, end] == interbase [start - 1, end]
     return Locus(normalized_contig, start - 1, last_base)
Exemplo n.º 8
0
    def _finalize_database(self, version):
        """
        Stamp the database with a version number by creating the metadata
        table and inserting a single row that holds `version`.

        Parameters
        ----------
        version : int
            Version number to record in the metadata table
        """
        require_integer(version, "version")

        # the table must exist before the version row can be inserted,
        # so the statements run strictly in this order
        statements = (
            "CREATE TABLE %s (version INT)" % METADATA_TABLE_NAME,
            "INSERT INTO %s VALUES (%s)" % (METADATA_TABLE_NAME, version),
        )
        for statement in statements:
            self.execute_sql(statement)
Exemplo n.º 9
0
    def _finalize_database(self, version):
        """
        Stamp the database with a version number by creating the metadata
        table and inserting a single row that holds `version`.

        Parameters
        ----------
        version : int
            Version number to record in the metadata table
        """
        require_integer(version, "version")

        # the table must exist before the version row can be inserted,
        # so the statements run strictly in this order
        statements = (
            "CREATE TABLE %s (version INT)" % METADATA_TABLE_NAME,
            "INSERT INTO %s VALUES (%s)" % (METADATA_TABLE_NAME, version),
        )
        for statement in statements:
            self.execute_sql(statement)
Exemplo n.º 10
0
def check_padding_around_mutation(given_padding, epitope_lengths):
    """
    Decide how many non-mutated residues to keep on each side of a
    mutation.

    The longest requested epitope needs (length - 1) flanking residues,
    so that is the minimum. A falsy `given_padding` (None or 0) selects
    that minimum; otherwise the given value must be an integer no smaller
    than the minimum and is returned unchanged.

    Raises ValueError when the given padding is below the minimum.
    """
    smallest_allowed = max(epitope_lengths) - 1

    # nothing supplied by the user: fall back to the bare minimum
    if not given_padding:
        return smallest_allowed

    require_integer(given_padding, "Padding around mutation")
    if given_padding < smallest_allowed:
        message = (
            "Padding around mutation %d cannot be less than %d "
            "for epitope lengths %s")
        raise ValueError(
            message % (given_padding, smallest_allowed, epitope_lengths))
    return given_padding
Exemplo n.º 11
0
def check_padding_around_mutation(given_padding, epitope_lengths):
    """
    Decide how many non-mutated residues to keep on each side of a
    mutation.

    The longest requested epitope needs (length - 1) flanking residues,
    so that is the minimum. A falsy `given_padding` (None or 0) selects
    that minimum; otherwise the given value must be an integer no smaller
    than the minimum and is returned unchanged.

    Raises ValueError when the given padding is below the minimum.
    """
    smallest_allowed = max(epitope_lengths) - 1

    # nothing supplied by the user: fall back to the bare minimum
    if not given_padding:
        return smallest_allowed

    require_integer(given_padding, "Padding around mutation")
    if given_padding < smallest_allowed:
        message = (
            "Padding around mutation %d cannot be less than %d "
            "for epitope lengths %s")
        raise ValueError(
            message % (given_padding, smallest_allowed, epitope_lengths))
    return given_padding
Exemplo n.º 12
0
def _create_cached_db(db_path, tables, version=1):
    """
    Open the sqlite database at `db_path`, reusing its contents when they
    match what was requested and rebuilding them otherwise.

    Parameters
    --------
    db_path : str
        Path to sqlite3 database file

    tables : iterable of datacache.DatabaseTable
        Tables the database should contain; each one's `name` attribute
        is used to check for an existing copy.

    version : int, optional
        Version the existing database must report to be reused.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # only a file that this call brought into existence may be deleted
    # again if something goes wrong below
    created_by_this_call = not exists(db_path)

    db = Database(db_path)
    wanted_names = [table.name for table in tables]

    # any failure from here on must not leave an empty DB file behind
    try:
        # reusable only if all tables are present and the version matches
        cache_is_usable = (
            db.has_tables(wanted_names) and
            db.has_version() and
            db.version() == version)
        if not cache_is_usable:
            if len(db.table_names()) > 0:
                logger.info(
                    "Dropping tables from database %s: %s",
                    db_path,
                    ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info(
                "Creating database %s containing: %s",
                db_path,
                ", ".join(wanted_names))
            db.create(tables, version)
        else:
            logger.info("Found existing table in database %s", db_path)
    except:
        # clean up, then let the original error propagate unchanged
        logger.warning(
            "Failed to create tables %s in database %s",
            wanted_names,
            db_path)
        db.close()
        if created_by_this_call:
            remove(db_path)
        raise
    return db.connection
Exemplo n.º 13
0
    def column_values_at_locus(
            self,
            column_name,
            feature,
            contig,
            position,
            end=None,
            strand=None,
            distinct=False,
            sorted=False):
        """
        Collect the non-null entries of one column of a feature table for
        all rows whose [start, end] interval overlaps the queried interval.

        Parameters
        ----------
        column_name : str
            Column to read; must be a non-empty string and must exist in
            the `feature` table.

        feature : str
            Name of the table to query.

        contig : str
            Chromosome / contig name; passed through normalize_chromosome
            before being matched against the `seqname` column.

        position : int
            First position of the queried interval.

        end : int, optional
            Last position of the queried interval (defaults to `position`).

        strand : optional
            When truthy, only rows whose `strand` column equals this value
            are returned.

        distinct : bool
            Add DISTINCT to the SELECT clause.

        sorted : bool
            Sort the collected values before returning them.

        Returns a list of column values.

        Raises ValueError if the table lacks the requested column.
        """
        # TODO: combine with the query method, since they overlap
        # significantly
        require_string(column_name, "column_name", nonempty=True)
        contig = normalize_chromosome(contig)
        require_integer(position, "position")

        # a missing end means the query covers a single position
        interval_end = position if end is None else end
        require_integer(interval_end, "end")

        if not self.column_exists(feature, column_name):
            raise ValueError(
                "Table %s doesn't have column %s" % (feature, column_name))

        select_modifier = "DISTINCT " if distinct else ""

        statement = """
            SELECT %s%s
            FROM %s
            WHERE seqname = ?
            AND start <= ?
            AND end >= ?

        """ % (select_modifier, column_name, feature)

        # overlap test: row.start <= query end AND row.end >= query start
        bound_values = [contig, interval_end, position]

        if strand:
            statement += " AND strand = ?"
            bound_values.append(strand)

        cursor = self.connection.execute(statement, bound_values)

        # every row is a one-element tuple; keep its value unless it's NULL
        non_null = []
        for row in cursor.fetchall():
            cell = row[0]
            if cell is not None:
                non_null.append(cell)

        if sorted:
            non_null.sort()
        return non_null
Exemplo n.º 14
0
    def __init__(
            self,
            program_name,
            alleles,
            epitope_lengths,
            parse_output_fn,
            supported_alleles_flag,
            input_fasta_flag,
            length_flag,
            allele_flag,
            tempdir_flag=None,
            extra_flags=[],
            max_file_records=None,
            process_limit=0):
        """
        Validate and store the command-line conventions of an external
        prediction program, check that the program can be run, and then
        initialize the BasePredictor part of the object.

        Parameters
        ----------
        program_name : str
            Name of prediction program to run
            (e.g. "netMHCcons" or "netMHCIIpan")

        alleles : list of str
            MHC alleles

        epitope_lengths : list of int
            Passed through unchanged to BasePredictor.__init__

        supported_alleles_flag : str
            Flag to pass to the predictor to get a list of supported alleles
            (e.g. "-A", "-list", "-listMHC"). May be None, in which case
            the program is only run once without arguments to confirm it
            can be executed.

        parse_output_fn : fn
            Takes the stdout string from the predictor and returns a collection
            of BindingPrediction objects

        input_fasta_flag : str
            How to specify the input FASTA file of source sequences (e.g. "-f")

        length_flag : str
            How to specify the desired predicted epitope length (e.g. "-length")

        allele_flag : str
            How to specify the allele we want predictions for (e.g. "-a")

        tempdir_flag : str, optional
            How to specify the predictor's temporary directory (e.g. "-tdir")

        extra_flags : list of str
            Extra flags to pass to the predictor. The instance keeps its
            own copy, so neither the caller's list nor the shared default
            is ever aliased.

        max_file_records : int, optional
            Maximum number of sequences per input FASTA file

        process_limit : int, optional
            Maximum number of parallel processes to start

        Raises
        ------
        SystemError
            If no supported-alleles flag is given and running the program
            by itself fails.

        UnsupportedAllele
            Re-raised from BasePredictor.__init__, extended with a hint on
            how to list valid alleles when a supported-alleles flag exists.
        """
        require_string(program_name, "Predictor program name")
        self.program_name = program_name

        if supported_alleles_flag is not None:
            require_string(supported_alleles_flag, "Supported alleles flag")
        self.supported_alleles_flag = supported_alleles_flag

        require_string(input_fasta_flag, "Input FASTA file flag")
        self.input_fasta_flag = input_fasta_flag

        require_string(allele_flag, "Allele flag")
        self.allele_flag = allele_flag

        require_string(length_flag, "Peptide length flag")
        self.length_flag = length_flag

        if tempdir_flag is not None:
            require_string(tempdir_flag, "Temporary directory flag")
        self.tempdir_flag = tempdir_flag

        # The default for extra_flags is a single list object created when
        # the function is defined; storing it directly would let a mutation
        # on one instance leak into every other instance. Keep a private
        # copy instead.
        self.extra_flags = [] if extra_flags is None else list(extra_flags)

        if max_file_records is not None:
            require_integer(
                    max_file_records,
                    "Maximum number of sequences per input files")
        self.max_file_records = max_file_records

        require_integer(process_limit, "Maximum number of processes")
        self.process_limit = process_limit

        self.parse_output_fn = parse_output_fn

        if self.supported_alleles_flag:
            valid_alleles = self._determine_supported_alleles(
                self.program_name,
                self.supported_alleles_flag)
        else:
            # if we're not running the tool to determine supported alleles
            # then at least try running it by itself to determine if
            # it's present
            try:
                run_command([self.program_name])
            except Exception:
                # `except Exception` (not a bare except) so Ctrl-C and
                # SystemExit are not disguised as a missing program
                raise SystemError("Failed to run %s" % self.program_name)
            valid_alleles = None

        try:
            BasePredictor.__init__(
                self,
                alleles,
                epitope_lengths,
                valid_alleles=valid_alleles)
        except UnsupportedAllele as e:
            # tell the user how to discover the alleles that are accepted
            if self.supported_alleles_flag:
                additional_message = (
                    "\nRun command %s %s to see a list of valid alleles" % (
                        self.program_name,
                        self.supported_alleles_flag))
            else:
                additional_message = ""
            raise UnsupportedAllele(str(e) + additional_message)
Exemplo n.º 15
0
    def __init__(
        self,
        program_name,
        alleles,
        parse_output_fn,
        supported_alleles_flag,
        input_file_flag,
        length_flag,
        allele_flag,
        peptide_mode_flags=["-p"],
        tempdir_flag=None,
        extra_flags=[],
        max_peptides_per_file=10**4,
        process_limit=-1,
        default_peptide_lengths=[9],
        group_peptides_by_length=False,
        min_peptide_length=8,
        max_peptide_length=None,
    ):
        """
        Validate and store the command-line conventions of an external
        prediction program, check that the program can be run, and then
        initialize the BasePredictor part of the object.

        Parameters
        ----------
        program_name : str
            Name of prediction program to run
            (e.g. "netMHCcons" or "netMHCIIpan")

        alleles : list of str
            MHC alleles

        supported_alleles_flag : str
            Flag to pass to the predictor to get a list of supported alleles
            (e.g. "-A", "-list", "-listMHC"). May be None, in which case
            the program is only run once without arguments to confirm it
            can be executed.

        parse_output_fn : fn
            Takes the stdout string from the predictor and returns a collection
            of BindingPrediction objects

        input_file_flag : str
            How to specify the input FASTA file of source sequences (e.g. "-f")

        length_flag : str
            How to specify the desired predicted peptide length (e.g. "-length")

        allele_flag : str
            How to specify the allele we want predictions for (e.g. "-a")

        peptide_mode_flags : list of str
            How to switch from the default FASTA subsequences input mode to
            where peptides are explicitly given one per line of a text file.
            The instance keeps its own copy of this list.

        tempdir_flag : str, optional
            How to specify the predictor's temporary directory (e.g. "-tdir")

        extra_flags : list of str
            Extra flags to pass to the predictor. The instance keeps its
            own copy of this list.

        max_peptides_per_file : int, optional
            Maximum number of lines per file when predicting peptides directly.

        process_limit : int, optional
            Maximum number of parallel processes to start
            (0 for no limit, -1 for use all available processors)

        default_peptide_lengths : int or list of int, optional
            When making predictions across subsequences of protein sequences,
            what peptide lengths to predict for. A single int is wrapped
            in a one-element list.

        group_peptides_by_length : bool
            Run commandline predictor on groups of peptides of equal length

        min_peptide_length : int
            Shortest peptide this predictor can handle

        max_peptide_length : int
            Longest peptide this predictor can handle

        Raises
        ------
        SystemError
            If no supported-alleles flag is given and running the program
            by itself fails.

        UnsupportedAllele
            Re-raised from BasePredictor.__init__, extended with a hint on
            how to list valid alleles when a supported-alleles flag exists.
        """
        require_string(program_name, "Predictor program name")
        self.program_name = program_name

        if supported_alleles_flag is not None:
            require_string(supported_alleles_flag, "Supported alleles flag")
        self.supported_alleles_flag = supported_alleles_flag

        require_string(input_file_flag, "Input file flag")
        self.input_file_flag = input_file_flag

        require_string(length_flag, "Peptide length flag")
        self.length_flag = length_flag

        require_string(allele_flag, "Allele flag")
        self.allele_flag = allele_flag

        # The list defaults in the signature are single objects created
        # when the function is defined; storing them directly would let a
        # mutation on one instance leak into every other instance. Keep
        # private copies of the validated flag lists instead.
        require_iterable_of(peptide_mode_flags, string_types)
        self.peptide_mode_flags = list(peptide_mode_flags)

        if tempdir_flag is not None:
            require_string(tempdir_flag, "Temporary directory flag")
        self.tempdir_flag = tempdir_flag

        require_iterable_of(extra_flags, string_types)
        self.extra_flags = list(extra_flags)

        require_integer(max_peptides_per_file,
                        "Maximum number of lines in a peptides input file")
        self.max_peptides_per_file = max_peptides_per_file

        require_integer(process_limit, "Maximum number of processes")
        self.process_limit = process_limit

        self.parse_output_fn = parse_output_fn

        # accept a bare int as shorthand for a single peptide length
        # NOTE(review): the default [9] is handed to BasePredictor as-is;
        # confirm BasePredictor does not mutate or retain it.
        if isinstance(default_peptide_lengths, int):
            default_peptide_lengths = [default_peptide_lengths]

        self.group_peptides_by_length = group_peptides_by_length

        if self.supported_alleles_flag:
            valid_alleles = self._determine_supported_alleles(
                self.program_name, self.supported_alleles_flag)
        else:
            # if we're not running the tool to determine supported alleles
            # then at least try running it by itself to determine if
            # it's present
            try:
                run_command([self.program_name])
            except Exception:
                # `except Exception` (not a bare except) so Ctrl-C and
                # SystemExit are not disguised as a missing program
                raise SystemError("Failed to run %s" % self.program_name)
            valid_alleles = None

        try:
            BasePredictor.__init__(
                self,
                alleles=alleles,
                valid_alleles=valid_alleles,
                default_peptide_lengths=default_peptide_lengths,
                min_peptide_length=min_peptide_length,
                max_peptide_length=max_peptide_length)
        except UnsupportedAllele as e:
            # tell the user how to discover the alleles that are accepted
            if self.supported_alleles_flag:
                additional_message = (
                    "\nRun command %s %s to see a list of valid alleles" %
                    (self.program_name, self.supported_alleles_flag))
            else:
                additional_message = ""
            raise UnsupportedAllele(str(e) + additional_message)