def _check_hla_alleles(alleles, valid_alleles=None):
    """
    Given a list of HLA alleles and an optional list of valid HLA alleles,
    return a set of alleles that we will pass into the MHC binding predictor.
    """
    require_iterable_of(alleles, string_types, "HLA alleles")
    # Normalize the names and collapse duplicates so that homozygous
    # alleles are only run through the MHC predictor once.
    normalized_alleles = set()
    for allele in alleles:
        normalized_alleles.add(normalize_allele_name(allele.strip().upper()))
    if valid_alleles:
        # For some reason netMHCpan drops the '*' in names, so
        # 'HLA-A*03:01' becomes 'HLA-A03:01'
        missing_alleles = [
            allele
            for allele in normalized_alleles
            if allele not in valid_alleles
        ]
        if missing_alleles:
            raise UnsupportedAllele(
                "Unsupported HLA alleles: %s" % missing_alleles)
    return list(normalized_alleles)
def __init__(
        self,
        effects,
        path=None,
        distinct=False,
        sort_key=None):
    """Build an EffectCollection from an iterable of MutationEffect objects.

    Parameters
    ----------
    effects : iterable
        MutationEffect objects

    path : str, optional
        File path from which we loaded variants which gave rise to these
        effects.

    distinct : bool
        Don't keep repeated effects

    sort_key : callable
    """
    # Validate element types up front, then delegate storage to the
    # generic Collection base class.
    require_iterable_of(effects, MutationEffect)
    Collection.__init__(
        self,
        elements=effects,
        path=path,
        distinct=distinct,
        sort_key=sort_key)
def _check_peptide_inputs(self, peptides):
    """
    Check peptide sequences to make sure they are valid for this predictor.

    Parameters
    ----------
    peptides : iterable of str

    Raises
    ------
    ValueError
        If any peptide contains non-alphabetic characters, contains 'X'
        (when disallowed), contains lowercase letters (when disallowed),
        or falls outside this predictor's supported length range.
    """
    require_iterable_of(peptides, string_types)
    # Hoist the per-predictor configuration out of the loop
    check_X = not self.allow_X_in_peptides
    check_lower = not self.allow_lowercase_in_peptides
    check_min_length = self.min_peptide_length is not None
    min_length = self.min_peptide_length
    check_max_length = self.max_peptide_length is not None
    max_length = self.max_peptide_length
    for p in peptides:
        if not p.isalpha():
            raise ValueError("Invalid characters in peptide '%s'" % p)
        elif check_X and "X" in p:
            raise ValueError("Invalid character 'X' in peptide '%s'" % p)
        elif check_lower and not p.isupper():
            raise ValueError("Invalid lowercase letters in peptide '%s'" % p)
        elif check_min_length and len(p) < min_length:
            raise ValueError(
                "Peptide '%s' too short (%d chars), must be at least %d" % (
                    p, len(p), min_length))
        elif check_max_length and len(p) > max_length:
            # Fixed: this message previously said "must be at least %d"
            # even though it reports a peptide that is too LONG.
            raise ValueError(
                "Peptide '%s' too long (%d chars), must be at most %d" % (
                    p, len(p), max_length))
def _check_peptide_lengths(self, peptide_lengths=None):
    """
    If peptide lengths not specified, then try using the default lengths
    associated with this predictor object. If those aren't a valid non-empty
    sequence of integers, then raise an exception. Otherwise return the
    peptide lengths.

    Parameters
    ----------
    peptide_lengths : int or list of int, optional

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If no lengths were given or defaulted, or if any length falls
        outside this predictor's supported range.
    """
    if not peptide_lengths:
        peptide_lengths = self.default_peptide_lengths

    if not peptide_lengths:
        # Fixed: the closing quote around 'default_peptide_lengths' was
        # missing in the original error message.
        raise ValueError(
            ("Must either provide 'peptide_lengths' argument "
             "or set 'default_peptide_lengths'"))
    # Convenience: allow a single integer instead of a list
    if isinstance(peptide_lengths, int):
        peptide_lengths = [peptide_lengths]
    require_iterable_of(peptide_lengths, int)
    for peptide_length in peptide_lengths:
        if (self.min_peptide_length is not None and
                peptide_length < self.min_peptide_length):
            raise ValueError(
                "Invalid peptide length %d, shorter than min %d" % (
                    peptide_length,
                    self.min_peptide_length))
        elif (self.max_peptide_length is not None and
                peptide_length > self.max_peptide_length):
            raise ValueError(
                "Invalid peptide length %d, longer than max %d" % (
                    peptide_length,
                    self.max_peptide_length))
    return peptide_lengths
def _create_table(self, table_name, column_types, primary=None, nullable=()):
    """Creates a sqlite3 table from the given metadata.

    Parameters
    ----------
    column_types : list of (str, str) pairs
        First element of each tuple is the column name, second element
        is the sqlite3 type

    primary : str, optional
        Which column is the primary key

    nullable : iterable, optional
        Names of columns which have null values
    """
    require_string(table_name, "table name")
    require_iterable_of(column_types, tuple, name="rows")
    if primary is not None:
        require_string(primary, "primary")
    require_iterable_of(nullable, str, name="nullable")

    # Build one "<name> <type> [constraints]" declaration per column
    declarations = []
    for (name, sql_type) in column_types:
        constraints = ""
        if name == primary:
            constraints += " UNIQUE PRIMARY KEY"
        if name not in nullable:
            constraints += " NOT NULL"
        declarations.append("%s %s%s" % (name, sql_type, constraints))
    create_table_sql = "CREATE TABLE %s (%s)" % (
        table_name,
        ", ".join(declarations))
    self.execute_sql(create_table_sql)
def _check_hla_alleles(
        alleles,
        valid_alleles=None):
    """
    Given a list of HLA alleles and an optional list of valid HLA alleles,
    return a set of alleles that we will pass into the MHC binding predictor.
    """
    require_iterable_of(alleles, str, "HLA alleles")
    normalized = [
        normalize_allele_name(allele.strip().upper())
        for allele in alleles
    ]
    if valid_alleles:
        # For some reason netMHCpan drops the '*' in names, so
        # 'HLA-A*03:01' becomes 'HLA-A03:01'
        missing_alleles = [a for a in normalized if a not in valid_alleles]
        if missing_alleles:
            raise UnsupportedAllele("Unsupported HLA alleles: %s" % missing_alleles)
    # Don't run the MHC predictor twice for homozygous alleles,
    # only run it for unique alleles
    return list(set(normalized))
def __init__(self, sequences):
    typechecks.require_iterable_of(
        sequences, typechecks.string_types, "sequences")
    self.sequences = numpy.array(sequences)
    self.encoding_cache = {}
    # Record a shared length when every sequence is the same size;
    # None marks variable-length input.
    self.fixed_sequence_length = None
    if len(self.sequences) > 0:
        first_length = len(self.sequences[0])
        if all(len(s) == first_length for s in self.sequences):
            self.fixed_sequence_length = first_length
def _create_cached_db(db_path, tables, version=1):
    """
    Either create or retrieve sqlite database.

    Parameters
    --------
    db_path : str
        Path to sqlite3 database file

    tables : dict
        Dictionary mapping table names to datacache.DatabaseTable objects

    version : int, optional
        Version acceptable as cached data.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # if the database file doesn't already exist and we encounter an error
    # later, delete the file before raising an exception
    delete_on_error = not exists(db_path)

    # if the database already exists, contains all the table
    # names and has the right version, then just return it
    db = Database(db_path)

    # make sure to delete the database file in case anything goes wrong
    # to avoid leaving behind an empty DB
    table_names = [table.name for table in tables]
    try:
        if db.has_tables(table_names) and \
                db.has_version() and \
                db.version() == version:
            # cache hit: database already holds all tables at this version
            logger.info("Found existing table in database %s", db_path)
        else:
            # stale or partial cache: wipe any existing tables and rebuild
            if len(db.table_names()) > 0:
                logger.info("Dropping tables from database %s: %s",
                    db_path,
                    ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info("Creating database %s containing: %s",
                db_path,
                ", ".join(table_names))
            db.create(tables, version)
    # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit,
    # but it always re-raises after cleanup, so nothing is swallowed
    except:
        logger.warning("Failed to create tables %s in database %s",
            table_names, db_path)
        db.close()
        # only remove the file if WE created it during this call
        if delete_on_error:
            remove(db_path)
        raise
    return db.connection
def __init__(self,
        alleles,
        valid_alleles=None,
        default_peptide_lengths=None,
        min_peptide_length=8,
        max_peptide_length=None,
        allow_X_in_peptides=False,
        allow_lowercase_in_peptides=False):
    """
    Parameters
    ----------
    alleles : list
        List of strings containing names of HLA alleles we're
        making predictions for. Example: ["HLA-A*02:01", "HLA-B*07:02"]

    valid_alleles : list, optional
        If given, constrain HLA alleles to be contained within
        this set.

    default_peptide_lengths : list of int, optional
        When making predictions across subsequences of protein sequences,
        what peptide lengths to predict for.

    min_peptide_length : int
        Shortest peptide this predictor can handle

    max_peptide_length : int
        Longest peptide this predictor can handle

    allow_X_in_peptides : bool
        Allow unknown amino acids in peptide sequences

    allow_lowercase_in_peptides : bool
        Allow lowercase letters in peptide sequences
    """
    # I find myself often constructing a predictor with just one allele
    # so as a convenience, allow user to not wrap that allele as a list
    if isinstance(alleles, string_types):
        alleles = alleles.split(',')
    self.alleles = self._check_hla_alleles(alleles, valid_alleles)

    if isinstance(default_peptide_lengths, int):
        default_peptide_lengths = [default_peptide_lengths]
    # Fixed: default_peptide_lengths is documented as optional with a
    # default of None, but the original unconditionally called
    # require_iterable_of(None, int), which raised for the default case.
    # Only validate when a value was actually supplied.
    if default_peptide_lengths is not None:
        require_iterable_of(default_peptide_lengths, int)
    self.default_peptide_lengths = default_peptide_lengths
    self.min_peptide_length = min_peptide_length
    self.max_peptide_length = max_peptide_length
    self.allow_X_in_peptides = allow_X_in_peptides
    self.allow_lowercase_in_peptides = allow_lowercase_in_peptides
def _create_indices(self, table_name, indices):
    """
    Create multiple indices (each over multiple columns) on a given table.

    Parameters
    ----------
    table_name : str

    indices : iterable of tuples
        Multiple groups of columns, each of which should be indexed.
    """
    require_string(table_name, "table_name")
    require_iterable_of(indices, (tuple, list))
    # Delegate each column group to the single-index helper
    for column_group in indices:
        self._create_index(table_name, column_group)
def drop_allele_peptide_pairs(self, allele_peptide_pairs):
    """
    Drop all allele-peptide tuple pairs in the given list.

    Parameters
    ----------
    allele_peptide_pairs : list of (str, str) tuples

    The two arguments are assumed to be the same length.

    Returns Dataset of equal or smaller size.
    """
    require_iterable_of(allele_peptide_pairs, tuple)
    # Set membership makes each lookup O(1) instead of scanning the list
    pairs_to_drop = set(allele_peptide_pairs)
    remove_mask = np.array(
        [pair in pairs_to_drop
         for pair in zip(self.alleles, self.peptides)])
    return self.slice(~remove_mask)
def _fill_table(self, table_name, rows):
    """Bulk-insert rows (tuples) into an existing table."""
    require_string(table_name, "table_name")
    require_iterable_of(rows, tuple, "rows")

    if not self.has_table(table_name):
        raise ValueError("Table '%s' does not exist in database" % (table_name, ))
    if len(rows) == 0:
        raise ValueError("Rows must be non-empty sequence")

    # Every row must match the width of the first row
    n_columns = len(rows[0])
    if any(len(row) != n_columns for row in rows):
        raise ValueError("Rows must all have %d values" % n_columns)

    blank_slots = ", ".join("?" * n_columns)
    logger.info("Inserting %d rows into table %s", len(rows), table_name)
    sql = "INSERT INTO %s VALUES (%s)" % (table_name, blank_slots)
    self.connection.executemany(sql, rows)
def _fill_table(self, table_name, rows):
    """Bulk-insert a non-empty sequence of tuples into an existing table."""
    require_string(table_name, "table_name")
    require_iterable_of(rows, tuple, "rows")

    if not self.has_table(table_name):
        raise ValueError(
            "Table '%s' does not exist in database" % (table_name,))
    if len(rows) == 0:
        raise ValueError("Rows must be non-empty sequence")

    # All rows must have the same number of values as the first row
    expected_width = len(rows[0])
    if any(len(row) != expected_width for row in rows):
        raise ValueError("Rows must all have %d values" % expected_width)

    placeholders = ", ".join("?" * expected_width)
    # NOTE(review): logs through the root logger (logging.info) rather
    # than a module-level logger — confirm this is intentional
    logging.info("Inserting %d rows into table %s", len(rows), table_name)
    sql = "INSERT INTO %s VALUES (%s)" % (table_name, placeholders)
    self.connection.executemany(sql, rows)
def drop_allele_peptide_pairs(self, allele_peptide_pairs):
    """
    Drop all allele-peptide tuple pairs in the given list.

    Parameters
    ----------
    allele_peptide_pairs : list of (str, str) tuples

    The two arguments are assumed to be the same length.

    Returns Dataset of equal or smaller size.
    """
    require_iterable_of(allele_peptide_pairs, tuple)
    unwanted = set(allele_peptide_pairs)  # O(1) membership tests
    remove_mask = np.array([
        key in unwanted
        for key in zip(self.alleles, self.peptides)
    ])
    keep_mask = ~remove_mask
    return self.slice(keep_mask)
def __init__(self, effects, path=None, distinct=False, sort_key=None):
    """Construct an EffectCollection from a sequence of MutationEffects.

    Parameters
    ----------
    effects : iterable
        MutationEffect objects

    path : str, optional
        File path from which we loaded variants which gave rise to
        these effects.

    distinct : bool
        Don't keep repeated effects

    sort_key : callable
    """
    # Only element-type validation happens here; everything else is
    # handled by the Collection base class.
    require_iterable_of(effects, MutationEffect)
    Collection.__init__(
        self,
        elements=effects,
        path=path,
        distinct=distinct,
        sort_key=sort_key)
def _create_cached_db(
        db_path,
        tables,
        version=1):
    """
    Either create or retrieve sqlite database.

    Parameters
    --------
    db_path : str
        Path to sqlite3 database file

    tables : dict
        Dictionary mapping table names to datacache.DatabaseTable objects

    version : int, optional
        Version acceptable as cached data.

    Returns sqlite3 connection
    """
    require_string(db_path, "db_path")
    require_iterable_of(tables, DatabaseTable)
    require_integer(version, "version")

    # if the database file doesn't already exist and we encounter an error
    # later, delete the file before raising an exception
    delete_on_error = not exists(db_path)

    # if the database already exists, contains all the table
    # names and has the right version, then just return it
    db = Database(db_path)

    # make sure to delete the database file in case anything goes wrong
    # to avoid leaving behind an empty DB
    table_names = [table.name for table in tables]
    try:
        if db.has_tables(table_names) and \
                db.has_version() and \
                db.version() == version:
            # cache hit: every requested table exists at the right version
            logger.info("Found existing table in database %s", db_path)
        else:
            # stale/partial cache: drop whatever is there and rebuild
            if len(db.table_names()) > 0:
                logger.info("Dropping tables from database %s: %s",
                    db_path,
                    ", ".join(db.table_names()))
                db.drop_all_tables()
            logger.info(
                "Creating database %s containing: %s",
                db_path,
                ", ".join(table_names))
            db.create(tables, version)
    # NOTE(review): bare except also catches KeyboardInterrupt/SystemExit,
    # but cleanup always re-raises, so no exception is swallowed
    except:
        logger.warning(
            "Failed to create tables %s in database %s",
            table_names,
            db_path)
        db.close()
        # only delete the file if it didn't exist before this call
        if delete_on_error:
            remove(db_path)
        raise
    return db.connection
def __init__(
        self,
        program_name,
        alleles,
        parse_output_fn,
        supported_alleles_flag,
        input_file_flag,
        length_flag,
        allele_flag,
        peptide_mode_flags=None,
        tempdir_flag=None,
        extra_flags=None,
        max_peptides_per_file=10 ** 4,
        process_limit=-1,
        default_peptide_lengths=None,
        group_peptides_by_length=False,
        min_peptide_length=8,
        max_peptide_length=None):
    """
    Parameters
    ----------
    program_name : str
        Name of prediction program to run
        (e.g. "netMHCcons" or "netMHCIIpan")

    alleles : list of str
        MHC alleles

    supported_alleles_flag : str
        Flag to pass to the predictor to get a list of supported alleles
        (e.g. "-A", "-list", "-listMHC")

    parse_output_fn : fn
        Takes the stdout string from the predictor and returns a collection
        of BindingPrediction objects

    input_file_flag : str
        How to specify the input FASTA file of source sequences (e.g. "-f")

    length_flag : str
        How to specify the desired predicted peptide length (e.g. "-length")

    allele_flag : str
        How to specify the allele we want predictions for (e.g. "-a")

    peptide_mode_flags : list of str, optional
        How to switch from the default FASTA subsequences input mode to
        where peptides are explicitly given one per line of a text file.
        Defaults to ["-p"].

    tempdir_flag : str, optional
        How to specify the predictor's temporary directory (e.g. "-tdir")

    extra_flags : list of str, optional
        Extra flags to pass to the predictor. Defaults to no extra flags.

    max_peptides_per_file : int, optional
        Maximum number of lines per file when predicting peptides directly.

    process_limit : int, optional
        Maximum number of parallel processes to start
        (0 for no limit, -1 for use all available processors)

    default_peptide_lengths : list of int, optional
        When making predictions across subsequences of protein sequences,
        what peptide lengths to predict for. Defaults to [9].

    group_peptides_by_length : bool
        Run commandline predictor on groups of peptides of equal length

    min_peptide_length : int
        Shortest peptide this predictor can handle

    max_peptide_length : int
        Longest peptide this predictor can handle
    """
    # Fixed: the list defaults (["-p"], [], [9]) were mutable default
    # arguments shared across every call; use None sentinels and
    # substitute the documented defaults here instead.
    if peptide_mode_flags is None:
        peptide_mode_flags = ["-p"]
    if extra_flags is None:
        extra_flags = []
    if default_peptide_lengths is None:
        default_peptide_lengths = [9]

    require_string(program_name, "Predictor program name")
    self.program_name = program_name

    if supported_alleles_flag is not None:
        require_string(supported_alleles_flag, "Supported alleles flag")
    self.supported_alleles_flag = supported_alleles_flag

    require_string(input_file_flag, "Input file flag")
    self.input_file_flag = input_file_flag

    require_string(length_flag, "Peptide length flag")
    self.length_flag = length_flag

    require_string(allele_flag, "Allele flag")
    self.allele_flag = allele_flag

    require_iterable_of(peptide_mode_flags, string_types)
    self.peptide_mode_flags = peptide_mode_flags

    if tempdir_flag is not None:
        require_string(tempdir_flag, "Temporary directory flag")
    self.tempdir_flag = tempdir_flag

    require_iterable_of(extra_flags, string_types)
    self.extra_flags = extra_flags

    require_integer(
        max_peptides_per_file,
        "Maximum number of lines in a peptides input file")
    self.max_peptides_per_file = max_peptides_per_file

    require_integer(process_limit, "Maximum number of processes")
    self.process_limit = process_limit

    self.parse_output_fn = parse_output_fn

    # Convenience: allow a single integer peptide length
    if isinstance(default_peptide_lengths, int):
        default_peptide_lengths = [default_peptide_lengths]

    self.group_peptides_by_length = group_peptides_by_length

    if self.supported_alleles_flag:
        valid_alleles = self._determine_supported_alleles(
            self.program_name,
            self.supported_alleles_flag)
    else:
        # if we're not running the tool to determine supported alleles
        # then at least try running it by itself to determine if it's
        # present
        try:
            run_command([self.program_name])
        # Fixed: was a bare except, which also converted
        # KeyboardInterrupt/SystemExit into SystemError
        except Exception:
            raise SystemError("Failed to run %s" % self.program_name)
        valid_alleles = None
    try:
        BasePredictor.__init__(
            self,
            alleles=alleles,
            valid_alleles=valid_alleles,
            default_peptide_lengths=default_peptide_lengths,
            min_peptide_length=min_peptide_length,
            max_peptide_length=max_peptide_length)
    except UnsupportedAllele as e:
        # Augment the error with a hint about how to list valid alleles
        if self.supported_alleles_flag:
            additional_message = (
                "\nRun command %s %s to see a list of valid alleles" % (
                    self.program_name,
                    self.supported_alleles_flag))
        else:
            additional_message = ""
        raise UnsupportedAllele(str(e) + additional_message)