예제 #1
0
    def _check_hla_alleles(alleles, valid_alleles=None):
        """
        Normalize a list of HLA allele names and, when a collection of
        valid alleles is supplied, verify that every normalized allele is
        supported. Returns the unique normalized alleles as a list.
        """
        require_iterable_of(alleles, string_types, "HLA alleles")

        # Homozygous samples repeat the same allele; collapse to the
        # unique normalized names so the MHC predictor runs once per allele.
        unique_alleles = set()
        for allele in alleles:
            unique_alleles.add(normalize_allele_name(allele.strip().upper()))

        if valid_alleles:
            # For some reason netMHCpan drops the '*' in names, so
            # 'HLA-A*03:01' becomes 'HLA-A03:01'
            unsupported = []
            for allele in unique_alleles:
                if allele not in valid_alleles:
                    unsupported.append(allele)
            if unsupported:
                raise UnsupportedAllele("Unsupported HLA alleles: %s" %
                                        unsupported)

        return list(unique_alleles)
예제 #2
0
    def __init__(
            self,
            effects,
            path=None,
            distinct=False,
            sort_key=None):
        """Build an EffectCollection from MutationEffect instances.

        Parameters
        ----------
        effects : iterable
            MutationEffect objects

        path : str, optional
            Path of the variants file from which these effects were
            derived.

        distinct : bool
            When True, duplicate effects are dropped.

        sort_key : callable
        """
        require_iterable_of(effects, MutationEffect)
        # Forward everything to the generic Collection base class
        base_kwargs = dict(
            elements=effects,
            path=path,
            distinct=distinct,
            sort_key=sort_key)
        Collection.__init__(self, **base_kwargs)
예제 #3
0
 def _check_peptide_inputs(self, peptides):
     """
     Check peptide sequences to make sure they are valid for this predictor.

     Raises ValueError for non-alphabetic characters, a disallowed 'X'
     or lowercase letter, or a length outside the predictor's limits.
     """
     require_iterable_of(peptides, string_types)
     check_X = not self.allow_X_in_peptides
     check_lower = not self.allow_lowercase_in_peptides
     # hoist attribute lookups out of the loop; peptide lists can be large
     check_min_length = self.min_peptide_length is not None
     min_length = self.min_peptide_length
     check_max_length = self.max_peptide_length is not None
     max_length = self.max_peptide_length
     for p in peptides:
         if not p.isalpha():
             raise ValueError("Invalid characters in peptide '%s'" % p)
         elif check_X and "X" in p:
             raise ValueError("Invalid character 'X' in peptide '%s'" % p)
         elif check_lower and not p.isupper():
             raise ValueError("Invalid lowercase letters in peptide '%s'" %
                              p)
         elif check_min_length and len(p) < min_length:
             raise ValueError(
                 "Peptide '%s' too short (%d chars), must be at least %d" %
                 (p, len(p), min_length))
         elif check_max_length and len(p) > max_length:
             # fixed error message: previously said "must be at least"
             # for peptides exceeding the maximum length
             raise ValueError(
                 "Peptide '%s' too long (%d chars), must be at most %d" %
                 (p, len(p), max_length))
예제 #4
0
    def _check_peptide_lengths(self, peptide_lengths=None):
        """
        If peptide lengths not specified, then try using the default
        lengths associated with this predictor object. If those aren't
        a valid non-empty sequence of integers, then raise an exception.
        Otherwise return the peptide lengths.
        """
        if not peptide_lengths:
            peptide_lengths = self.default_peptide_lengths

        if not peptide_lengths:
            # fixed unbalanced quote around 'default_peptide_lengths'
            raise ValueError(
                "Must either provide 'peptide_lengths' argument "
                "or set 'default_peptide_lengths'")
        if isinstance(peptide_lengths, int):
            # allow a single int as a convenience
            peptide_lengths = [peptide_lengths]
        require_iterable_of(peptide_lengths, int)
        for peptide_length in peptide_lengths:
            if (self.min_peptide_length is not None
                    and peptide_length < self.min_peptide_length):
                raise ValueError(
                    "Invalid peptide length %d, shorter than min %d" %
                    (peptide_length, self.min_peptide_length))
            elif (self.max_peptide_length is not None
                  and peptide_length > self.max_peptide_length):
                raise ValueError(
                    "Invalid peptide length %d, longer than max %d" %
                    (peptide_length, self.max_peptide_length))
        return peptide_lengths
예제 #5
0
    def _create_table(self, table_name, column_types, primary=None, nullable=()):
        """Creates a sqlite3 table from the given metadata.

        Parameters
        ----------
        table_name : str
            Name of the table to create

        column_types : list of (str, str) pairs
            First element of each tuple is the column name, second element is the sqlite3 type

        primary : str, optional
            Which column is the primary key

        nullable : iterable, optional
            Names of columns which have null values
        """
        require_string(table_name, "table name")
        # fixed misleading name in the type-check error message
        # (previously reported the argument as "rows")
        require_iterable_of(column_types, tuple, name="column_types")
        if primary is not None:
            require_string(primary, "primary")
        require_iterable_of(nullable, str, name="nullable")

        column_decls = []
        for column_name, column_type in column_types:
            decl = "%s %s" % (column_name, column_type)
            if column_name == primary:
                decl += " UNIQUE PRIMARY KEY"
            if column_name not in nullable:
                decl += " NOT NULL"
            column_decls.append(decl)
        column_decl_str = ", ".join(column_decls)
        create_table_sql = \
            "CREATE TABLE %s (%s)" % (table_name, column_decl_str)
        self.execute_sql(create_table_sql)
예제 #6
0
    def _check_hla_alleles(
            alleles,
            valid_alleles=None):
        """
        Normalize a list of HLA allele names and optionally check them
        against the set of alleles supported by the MHC binding predictor.
        Returns the unique normalized alleles as a list.
        """
        require_iterable_of(alleles, str, "HLA alleles")
        normalized = []
        for allele in alleles:
            normalized.append(normalize_allele_name(allele.strip().upper()))
        if valid_alleles:
            # For some reason netMHCpan drops the '*' in names, so
            # 'HLA-A*03:01' becomes 'HLA-A03:01'
            missing_alleles = [a for a in normalized if a not in valid_alleles]
            if missing_alleles:
                raise UnsupportedAllele("Unsupported HLA alleles: %s" % missing_alleles)

        # Don't run the MHC predictor twice for homozygous alleles,
        # only run it for unique alleles
        return list(set(normalized))
예제 #7
0
 def __init__(self, sequences):
     """
     Store the given sequences as a numpy array and record their common
     length when every sequence has the same size.
     """
     typechecks.require_iterable_of(
         sequences, typechecks.string_types, "sequences")
     self.sequences = numpy.array(sequences)
     # cache for encoded representations of these sequences
     self.encoding_cache = {}
     # only set when all sequences share a single length
     self.fixed_sequence_length = None
     if len(self.sequences) > 0:
         first_length = len(self.sequences[0])
         if all(len(s) == first_length for s in self.sequences):
             self.fixed_sequence_length = first_length
예제 #8
0
def _create_cached_db(db_path, tables, version=1):
    """
    Either create or retrieve sqlite database.

    Parameters
    ----------
    db_path : str
        Path to sqlite3 database file

    tables : iterable of DatabaseTable
        datacache.DatabaseTable objects expected in the database.
        (Fixed docstring: previously described as a dict, but the type
        check below requires DatabaseTable elements, which a dict's
        string keys would fail.)

    version : int, optional
        Version acceptable as cached data.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # if the database file doesn't already exist and we encounter an error
    # later, delete the file before raising an exception
    delete_on_error = not exists(db_path)

    # if the database already exists, contains all the table
    # names and has the right version, then just return it
    db = Database(db_path)

    # make sure to delete the database file in case anything goes wrong
    # to avoid leaving behind an empty DB
    table_names = [table.name for table in tables]
    try:
        if db.has_tables(table_names) and \
                db.has_version() and \
                db.version() == version:
            logger.info("Found existing table in database %s", db_path)
        else:
            if len(db.table_names()) > 0:
                logger.info("Dropping tables from database %s: %s", db_path,
                            ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info("Creating database %s containing: %s", db_path,
                        ", ".join(table_names))
            db.create(tables, version)
    except BaseException:
        # explicit equivalent of the previous bare `except:`; the error
        # is always re-raised after cleanup
        logger.warning("Failed to create tables %s in database %s",
                       table_names, db_path)
        db.close()
        if delete_on_error:
            remove(db_path)
        raise
    return db.connection
예제 #9
0
    def __init__(self,
                 alleles,
                 valid_alleles=None,
                 default_peptide_lengths=None,
                 min_peptide_length=8,
                 max_peptide_length=None,
                 allow_X_in_peptides=False,
                 allow_lowercase_in_peptides=False):
        """
        Parameters
        ----------
        alleles : list
            List of strings containing names of HLA alleles we're
            making predictions for. Example:
                ["HLA-A*02:01", "HLA-B*07:02"]

        valid_alleles : list, optional
            If given, constrain HLA alleles to be contained within
            this set.

        default_peptide_lengths : list of int, optional
            When making predictions across subsequences of protein sequences,
            what peptide lengths to predict for.

        min_peptide_length : int
            Shortest peptide this predictor can handle

        max_peptide_length : int
            Longest peptide this predictor can handle

        allow_X_in_peptides : bool
            Allow unknown amino acids in peptide sequences

        allow_lowercase_in_peptides : bool
            Allow lowercase letters in peptide sequences
        """
        # I find myself often constructing a predictor with just one allele
        # so as a convenience, allow user to not wrap that allele as a list
        if isinstance(alleles, string_types):
            alleles = alleles.split(',')
        self.alleles = self._check_hla_alleles(alleles, valid_alleles)

        if isinstance(default_peptide_lengths, int):
            default_peptide_lengths = [default_peptide_lengths]
        # Only type-check when lengths were actually provided: previously
        # the check ran unconditionally and crashed on the None default.
        if default_peptide_lengths is not None:
            require_iterable_of(default_peptide_lengths, int)
        self.default_peptide_lengths = default_peptide_lengths
        self.min_peptide_length = min_peptide_length
        self.max_peptide_length = max_peptide_length
        self.allow_X_in_peptides = allow_X_in_peptides
        self.allow_lowercase_in_peptides = allow_lowercase_in_peptides
예제 #10
0
    def _create_indices(self, table_name, indices):
        """
        Create multiple indices (each spanning one or more columns) on
        the given table.

        Parameters
        ----------
        table_name : str

        indices : iterable of tuples
            Groups of column names; each group gets its own index.
        """
        require_string(table_name, "table_name")
        require_iterable_of(indices, (tuple, list))
        for column_group in indices:
            self._create_index(table_name, column_group)
예제 #11
0
    def _create_indices(self, table_name, indices):
        """
        Add one index per column group to a table.

        Parameters
        ----------
        table_name : str
            Table on which the indices are created.

        indices : iterable of tuples
            Each element lists the columns covered by one index.
        """
        require_string(table_name, "table_name")
        require_iterable_of(indices, (tuple, list))
        for columns in indices:
            self._create_index(table_name, columns)
예제 #12
0
    def drop_allele_peptide_pairs(self, allele_peptide_pairs):
        """
        Drop all allele-peptide tuple pairs in the given list.

        Parameters
        ----------
        allele_peptide_pairs : list of (str, str) tuples
        The two arguments are assumed to be the same length.

        Returns Dataset of equal or smaller size.
        """
        require_iterable_of(allele_peptide_pairs, tuple)
        pairs_to_drop = set(allele_peptide_pairs)
        # build the keep mask directly instead of negating a remove mask
        keep_mask = np.array([
            pair not in pairs_to_drop
            for pair in zip(self.alleles, self.peptides)
        ])
        return self.slice(keep_mask)
예제 #13
0
    def _fill_table(self, table_name, rows):
        """
        Insert the given rows into an existing table.

        Raises ValueError if the table doesn't exist, if rows is empty,
        or if the rows don't all have the same number of values.
        """
        require_string(table_name, "table_name")
        require_iterable_of(rows, tuple, "rows")

        if not self.has_table(table_name):
            raise ValueError("Table '%s' does not exist in database" %
                             (table_name, ))
        if len(rows) == 0:
            raise ValueError("Rows must be non-empty sequence")

        n_columns = len(rows[0])
        if any(len(row) != n_columns for row in rows):
            raise ValueError("Rows must all have %d values" % n_columns)
        # one "?" placeholder per column
        placeholders = ", ".join(["?"] * n_columns)
        logger.info("Inserting %d rows into table %s", len(rows), table_name)
        sql = "INSERT INTO %s VALUES (%s)" % (table_name, placeholders)
        self.connection.executemany(sql, rows)
예제 #14
0
    def _fill_table(self, table_name, rows):
        """
        Bulk-insert rows into a table that must already exist.

        Raises ValueError when the table is missing, the row sequence is
        empty, or the rows have inconsistent lengths.
        """
        require_string(table_name, "table_name")
        require_iterable_of(rows, tuple, "rows")

        if not self.has_table(table_name):
            raise ValueError(
                "Table '%s' does not exist in database" % (table_name,))
        if len(rows) == 0:
            raise ValueError("Rows must be non-empty sequence")

        expected_width = len(rows[0])
        for row in rows:
            if len(row) != expected_width:
                raise ValueError("Rows must all have %d values" % expected_width)
        blank_slots = ", ".join("?" for _ in range(expected_width))
        logging.info("Inserting %d rows into table %s", len(rows), table_name)
        sql = "INSERT INTO %s VALUES (%s)" % (table_name, blank_slots)
        self.connection.executemany(sql, rows)
예제 #15
0
    def drop_allele_peptide_pairs(self, allele_peptide_pairs):
        """
        Remove every (allele, peptide) pair found in the given list.

        Parameters
        ----------
        allele_peptide_pairs : list of (str, str) tuples
        The two arguments are assumed to be the same length.

        Returns Dataset of equal or smaller size.
        """
        require_iterable_of(allele_peptide_pairs, tuple)
        drop_set = set(allele_peptide_pairs)
        remove_flags = [
            pair in drop_set
            for pair in zip(self.alleles, self.peptides)
        ]
        keep_mask = ~np.array(remove_flags)
        return self.slice(keep_mask)
예제 #16
0
    def __init__(self, effects, path=None, distinct=False, sort_key=None):
        """Construct an EffectCollection from a sequence of MutationEffects.

        Parameters
        ----------
        effects : iterable
            MutationEffect objects

        path : str, optional
            Path of the variants file that these effects were derived
            from.

        distinct : bool
            When True, duplicate effects are dropped.

        sort_key : callable
        """
        require_iterable_of(effects, MutationEffect)
        # delegate storage and ordering to the Collection base class
        Collection.__init__(
            self,
            elements=effects,
            path=path,
            distinct=distinct,
            sort_key=sort_key)
예제 #17
0
    def _create_table(self,
                      table_name,
                      column_types,
                      primary=None,
                      nullable=()):
        """Creates a sqlite3 table from the given metadata.

        Parameters
        ----------
        table_name : str
            Name of the table to create

        column_types : list of (str, str) pairs
            First element of each tuple is the column name, second element is the sqlite3 type

        primary : str, optional
            Which column is the primary key

        nullable : iterable, optional
            Names of columns which have null values
        """
        require_string(table_name, "table name")
        # fixed misleading name in the type-check error message
        # (previously reported the argument as "rows")
        require_iterable_of(column_types, tuple, name="column_types")
        if primary is not None:
            require_string(primary, "primary")
        require_iterable_of(nullable, str, name="nullable")

        column_decls = []
        for column_name, column_type in column_types:
            decl = "%s %s" % (column_name, column_type)
            if column_name == primary:
                decl += " UNIQUE PRIMARY KEY"
            if column_name not in nullable:
                decl += " NOT NULL"
            column_decls.append(decl)
        column_decl_str = ", ".join(column_decls)
        create_table_sql = \
            "CREATE TABLE %s (%s)" % (table_name, column_decl_str)
        self.execute_sql(create_table_sql)
예제 #18
0
def _create_cached_db(
        db_path,
        tables,
        version=1):
    """
    Either create or retrieve sqlite database.

    Parameters
    ----------
    db_path : str
        Path to sqlite3 database file

    tables : iterable of DatabaseTable
        datacache.DatabaseTable objects expected in the database.
        (Fixed docstring: previously described as a dict, but the type
        check below requires DatabaseTable elements, which a dict's
        string keys would fail.)

    version : int, optional
        Version acceptable as cached data.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # if the database file doesn't already exist and we encounter an error
    # later, delete the file before raising an exception
    delete_on_error = not exists(db_path)

    # if the database already exists, contains all the table
    # names and has the right version, then just return it
    db = Database(db_path)

    # make sure to delete the database file in case anything goes wrong
    # to avoid leaving behind an empty DB
    table_names = [table.name for table in tables]
    try:
        if db.has_tables(table_names) and \
                db.has_version() and \
                db.version() == version:
            logger.info("Found existing table in database %s", db_path)
        else:
            if len(db.table_names()) > 0:
                logger.info("Dropping tables from database %s: %s",
                    db_path,
                    ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info(
                "Creating database %s containing: %s",
                db_path,
                ", ".join(table_names))
            db.create(tables, version)
    except BaseException:
        # explicit equivalent of the previous bare `except:`; the error
        # is always re-raised after cleanup
        logger.warning(
            "Failed to create tables %s in database %s",
            table_names,
            db_path)
        db.close()
        if delete_on_error:
            remove(db_path)
        raise
    return db.connection
예제 #19
0
    def __init__(
        self,
        program_name,
        alleles,
        parse_output_fn,
        supported_alleles_flag,
        input_file_flag,
        length_flag,
        allele_flag,
        peptide_mode_flags=None,
        tempdir_flag=None,
        extra_flags=None,
        max_peptides_per_file=10**4,
        process_limit=-1,
        default_peptide_lengths=None,
        group_peptides_by_length=False,
        min_peptide_length=8,
        max_peptide_length=None,
    ):
        """
        Parameters
        ----------
        program_name : str
            Name of prediction program to run
            (e.g. "netMHCcons" or "netMHCIIpan")

        alleles : list of str
            MHC alleles

        supported_alleles_flag : str
            Flag to pass to the predictor to get a list of supported alleles
            (e.g. "-A", "-list", "-listMHC")

        parse_output_fn : fn
            Takes the stdout string from the predictor and returns a collection
            of BindingPrediction objects

        input_file_flag : str
            How to specify the input FASTA file of source sequences (e.g. "-f")

        length_flag : str
            How to specify the desired predicted peptide length (e.g. "-length")

        allele_flag : str
            How to specify the allele we want predictions for (e.g. "-a")

        peptide_mode_flags : list of str, optional
            How to switch from the default FASTA subsequences input mode to
            where peptides are explicitly given one per line of a text file.
            Defaults to ["-p"].

        tempdir_flag : str, optional
            How to specify the predictor's temporary directory (e.g. "-tdir")

        extra_flags : list of str, optional
            Extra flags to pass to the predictor. Defaults to none.

        max_peptides_per_file : int, optional
            Maximum number of lines per file when predicting peptides directly.

        process_limit : int, optional
            Maximum number of parallel processes to start
            (0 for no limit, -1 for use all available processors)

        default_peptide_lengths : list of int, optional
            When making predictions across subsequences of protein sequences,
            what peptide lengths to predict for. Defaults to [9].

        group_peptides_by_length : bool
            Run commandline predictor on groups of peptides of equal length

        min_peptide_length : int
            Shortest peptide this predictor can handle

        max_peptide_length : int
            Longest peptide this predictor can handle
        """
        # Use None sentinels instead of mutable default arguments so the
        # shared default lists can never be mutated across instances.
        if peptide_mode_flags is None:
            peptide_mode_flags = ["-p"]
        if extra_flags is None:
            extra_flags = []
        if default_peptide_lengths is None:
            default_peptide_lengths = [9]

        require_string(program_name, "Predictor program name")
        self.program_name = program_name

        if supported_alleles_flag is not None:
            require_string(supported_alleles_flag, "Supported alleles flag")
        self.supported_alleles_flag = supported_alleles_flag

        require_string(input_file_flag, "Input file flag")
        self.input_file_flag = input_file_flag

        require_string(length_flag, "Peptide length flag")
        self.length_flag = length_flag

        require_string(allele_flag, "Allele flag")
        self.allele_flag = allele_flag

        require_iterable_of(peptide_mode_flags, string_types)
        self.peptide_mode_flags = peptide_mode_flags

        if tempdir_flag is not None:
            require_string(tempdir_flag, "Temporary directory flag")
        self.tempdir_flag = tempdir_flag

        require_iterable_of(extra_flags, string_types)
        self.extra_flags = extra_flags

        require_integer(max_peptides_per_file,
                        "Maximum number of lines in a peptides input file")
        self.max_peptides_per_file = max_peptides_per_file

        require_integer(process_limit, "Maximum number of processes")
        self.process_limit = process_limit

        self.parse_output_fn = parse_output_fn

        # allow a single int as a convenience
        if isinstance(default_peptide_lengths, int):
            default_peptide_lengths = [default_peptide_lengths]

        self.group_peptides_by_length = group_peptides_by_length

        if self.supported_alleles_flag:
            valid_alleles = self._determine_supported_alleles(
                self.program_name, self.supported_alleles_flag)
        else:
            # if we're not running the tool to determine supported alleles
            # then at least try running it by itself to determine if it's
            # present
            try:
                run_command([self.program_name])
            except Exception:
                # previously a bare `except:`, which would also convert
                # KeyboardInterrupt/SystemExit into SystemError
                raise SystemError("Failed to run %s" % self.program_name)
            valid_alleles = None

        try:
            BasePredictor.__init__(
                self,
                alleles=alleles,
                valid_alleles=valid_alleles,
                default_peptide_lengths=default_peptide_lengths,
                min_peptide_length=min_peptide_length,
                max_peptide_length=max_peptide_length)
        except UnsupportedAllele as e:
            # augment the error with a hint on how to list valid alleles
            if self.supported_alleles_flag:
                additional_message = (
                    "\nRun command %s %s to see a list of valid alleles" %
                    (self.program_name, self.supported_alleles_flag))
            else:
                additional_message = ""
            raise UnsupportedAllele(str(e) + additional_message)