def test_netmhcii_pan_multiple_alleles():
    """NetMHCIIpan should report every requested allele and peptide length."""
    allele_names = [
        "HLA-DPA1*01:05-DPB1*100:01",
        "HLA-DQA1*05:11-DQB1*03:02",
        "HLA-DRA1*01:01-DRB1*01:01",
    ]
    predictor = NetMHCIIpan(
        alleles=[normalize_allele_name(name) for name in allele_names])
    sequences = {"TP53-001": "SQAMDDLMLSPDDIEQWFTED"}
    binding_predictions = predictor.predict_subsequences(
        sequence_dict=sequences,
        peptide_lengths=[15, 16])

    eq_({bp.length for bp in binding_predictions}, {15, 16})
    eq_({bp.allele for bp in binding_predictions}, set(allele_names))

    # The single input sequence has 21 residues, so each allele yields
    # (21 - 15 + 1) + (21 - 16 + 1) = 13 peptides; 3 alleles give 39.
    assert len(binding_predictions) == 39, \
        "Expected 39 epitopes from %s" % (binding_predictions,)
def normalize_allele_name(s):
    """
    Best-effort normalization of an allele name via ``mhcnames``.

    If ``s`` cannot be normalized directly, it is split on "-" into exactly
    three parts which are re-joined as "a-b*c" and normalized again. If that
    fails as well (for any reason), the string "UNKNOWN" is returned.
    """
    try:
        return mhcnames.normalize_allele_name(s)
    except Exception:
        # Fall through to the three-part re-formatting attempt below.
        pass

    try:
        prefix, gene, code = s.split("-")
        return mhcnames.normalize_allele_name(
            "%s-%s*%s" % (prefix, gene, code))
    except Exception:
        return "UNKNOWN"
def test_mouse_class1_alleles_H2_Kk():
    """Mouse allele H2-Kk parses and normalizes with or without "H-2"."""
    normalized = "H-2-Kk"
    # First the spelling without a hyphen in the prefix, then with one.
    for name in ("H2-Kk", normalized):
        eq_(parse_allele_name(name), AlleleName("H-2", "K", "", "k"))
        eq_(normalize_allele_name(name), normalized)
        # The compact form is checked on the hyphenated spelling both times.
        eq_(compact_allele_name(normalized), "Kk")
def test_mouse_class2_alleles():
    """Mouse class II allele H2-IAb parses with or without a hyphen in H-2."""
    for name in ("H2-IAb", "H-2-IAb"):
        eq_(parse_allele_name(name), AlleleName("H-2", "IA", "", "b"))
        eq_(normalize_allele_name(name), "H-2-IAb")
        eq_(compact_allele_name(name), "IAb")
def test_macaque_alleles():
    """Three-digit macaque families lose their leading zero when normalized."""
    cases = (
        ("Mamu-B*082:02", "Mamu-B*82:02", "B8202"),
        # The third zero in the family "007" is expected to be trimmed.
        ("Mamu-B*007:02", "Mamu-B*07:02", "B0702"),
    )
    for allele_name, normalized, compact in cases:
        eq_(normalize_allele_name(allele_name), normalized)
        eq_(compact_allele_name(allele_name), compact)
def test_mouse_class1_alleles_H2_Db():
    """Mouse allele H2-Db parses with or without a hyphen in "H-2"."""
    for name in ("H2-Db", "H-2-Db"):
        eq_(parse_allele_name(name), AlleleName("H-2", "D", "", "b"))
        eq_(normalize_allele_name(name), "H-2-Db")
        eq_(compact_allele_name(name), "Db")
def test_hla_with_3_digit_allele_code():
    """HLA alleles with a three-digit allele code round-trip correctly."""
    cases = (
        ("HLA-B*15:120", "B15120"),
        ("HLA-A*02:123", "A02123"),
    )
    for full_name, compact_name in cases:
        # Start from the fully-qualified spelling...
        eq_(normalize_allele_name(full_name), full_name)
        eq_(compact_allele_name(full_name), compact_name)
        # ...and then from the compact spelling.
        eq_(normalize_allele_name(compact_name), full_name)
        eq_(compact_allele_name(compact_name), compact_name)
def normalize(s, disallowed=("MIC", "HFE")):
    """
    Normalize an allele name, progressively dropping trailing ":" fields.

    Parameters
    ----------
    s : str
        Allele name to normalize.

    disallowed : iterable of str
        If any of these substrings occurs in ``s``, None is returned without
        attempting normalization.

    Returns
    -------
    The result of ``mhcnames.normalize_allele_name`` for the longest prefix
    of ``s`` (cut at ":" boundaries) that it accepts, or None if no prefix
    is accepted.
    """
    if any(item in s for item in disallowed):
        return None

    candidate = s
    while True:
        try:
            return mhcnames.normalize_allele_name(candidate)
        except Exception:
            # Only swallow ordinary errors; KeyboardInterrupt and SystemExit
            # must still propagate.
            pass
        if not candidate:
            # Even the empty prefix was rejected; nothing left to try.
            return None
        # Drop the last ":"-separated field (yields "" when no ":" remains).
        candidate = candidate.rpartition(":")[0]
def mhc_alleles_from_args(args):
    """
    Collect normalized MHC allele names from parsed commandline arguments.

    Alleles are read from the comma-separated string ``args.mhc_alleles``
    and, if ``args.mhc_alleles_file`` is set, from that file (one allele per
    line). Blank entries and blank lines are ignored.

    Returns a list of normalized allele names.

    Raises ValueError if no alleles were given by either source.
    """
    alleles = []

    # Tolerate a missing (None) value so that supplying only a file works.
    for name in (args.mhc_alleles or "").split(","):
        name = name.strip()
        if name:
            alleles.append(normalize_allele_name(name))

    if args.mhc_alleles_file:
        with open(args.mhc_alleles_file, 'r') as f:
            for line in f:
                line = line.strip()
                if line:
                    alleles.append(normalize_allele_name(line))

    if not alleles:
        raise ValueError(
            "MHC alleles required (use --mhc-alleles or --mhc-alleles-file)")
    return alleles
def predict_min_across_alleles(self, alleles, peptides):
    """
    Predict each peptide against every allele and keep the per-peptide minimum.

    Returns a dict keyed by ``<predictor_name>_affinity`` and, when percent
    rank transforms are available, ``<predictor_name>_percentile_rank``.
    Every value has one entry per peptide.
    """
    unique_alleles = list({normalize_allele_name(a) for a in alleles})
    peptides = EncodableSequences.create(peptides)

    # One column per allele, after the leading "peptide" column.
    df = pandas.DataFrame()
    df["peptide"] = peptides.sequences
    for allele in unique_alleles:
        df[allele] = self.predictor.predict(peptides, allele=allele)

    allele_columns = list(df.columns)[1:]
    result = {
        self.predictor_name + '_affinity': df[allele_columns].min(axis=1),
    }

    if self.percent_rank_transforms is not None:
        self.fit_percentile_rank_if_needed(unique_alleles)
        percentile_ranks = pandas.DataFrame(index=df.index)
        for allele in unique_alleles:
            transform = self.percent_rank_transforms[allele]
            percentile_ranks[allele] = transform.transform(df[allele].values)
        result[self.predictor_name + '_percentile_rank'] = (
            percentile_ranks.min(axis=1).values)

    n_peptides = len(peptides)
    for value in result.values():
        assert len(value) == n_peptides, (len(peptides), result)
    return result
def test_netmhc_cons_process_limits():
    """NetMHCcons gives the same predictions for any process limit."""
    alleles = [normalize_allele_name(DEFAULT_ALLELE)]
    sequence_dict = {
        "SMAD4-001": "ASIINFKELA",
        "TP53-001": "ASILLLVFYW",
        "SMAD4-002": "ASIINFKELS",
        "TP53-002": "ASILLLVFYS",
        "TP53-003": "ASILLLVFYT",
        "TP53-004": "ASILLLVFYG",
        "TP53-005": "ASILLLVFYG",
    }
    for process_limit in (1, 2, 10):
        predictor = NetMHCcons(
            alleles=alleles,
            default_peptide_lengths=[9],
            process_limit=process_limit)
        binding_predictions = predictor.predict_subsequences(
            sequence_dict=sequence_dict)

        n_predictions = len(binding_predictions)
        assert n_predictions == 14, \
            "Expected 14 binding predictions from but got %d: %s" % (
                n_predictions, binding_predictions)

        # Every input sequence should contribute exactly two predictions.
        source_names = [
            bp.source_sequence_name for bp in binding_predictions
        ]
        for fasta_key in sequence_dict:
            fasta_count = source_names.count(fasta_key)
            assert fasta_count == 2, \
                ("Expected each fasta key to appear twice, once for "
                 "each length, but saw %s %d time(s)" % (
                     fasta_key, fasta_count))
def _check_hla_alleles(alleles, valid_alleles=None):
    """
    Normalize and de-duplicate HLA alleles before running a predictor.

    Each allele is stripped, upper-cased and normalized. When
    ``valid_alleles`` is given and non-empty, every normalized allele must be
    a member of it, otherwise UnsupportedAllele is raised.

    Returns a list of the unique normalized alleles.
    """
    require_iterable_of(alleles, string_types, "HLA alleles")

    # Homozygous inputs collapse to one entry so the predictor runs once
    # per distinct allele.
    unique_alleles = set()
    for raw_name in alleles:
        unique_alleles.add(normalize_allele_name(raw_name.strip().upper()))

    if valid_alleles:
        # For some reason netMHCpan drops the '*' in names, so
        # 'HLA-A*03:01' becomes 'HLA-A03:01'
        unsupported = [
            name for name in unique_alleles if name not in valid_alleles
        ]
        if unsupported:
            raise UnsupportedAllele(
                "Unsupported HLA alleles: %s" % unsupported)

    return list(unique_alleles)
def parse_mixmhcpred_results(filename):
    """
    Read a MixMHCpred output file into BindingPrediction objects.

    The file is expected to be tab-separated, with "#" comment lines, e.g.:

    Peptide  Score_bestAllele BestAllele   %Rank_bestAllele Score_A0201  %Rank_A0201
    MLDDFSAGA    0.182093    A0201   0.3 0.182093    0.3
    SPEGEETII    -0.655341   A0201   51.0    -0.655341   51.0
    ILDRIITNA    0.203906    A0201   0.3 0.203906    0.3

    Only the peptide and the three "best allele" columns are used.

    Parameters
    ----------
    filename : str

    Returns list of BindingPrediction
    """
    df = pd.read_csv(filename, comment="#", sep="\t")
    rows = zip(
        df["Peptide"],
        df["BestAllele"],
        df["Score_bestAllele"],
        df["%Rank_bestAllele"])
    return [
        BindingPrediction(
            peptide=peptide,
            allele=normalize_allele_name(allele),
            score=score,
            percentile_rank=percentile_rank,
            prediction_method_name="mixmhcpred")
        for (peptide, allele, score, percentile_rank) in rows
    ]
def _determine_supported_alleles(command, supported_allele_flag):
    """
    Ask a commandline predictor (e.g. netMHCpan) which alleles it supports.

    Runs ``command supported_allele_flag`` and normalizes every non-blank
    output line that does not start with "#". Lines which fail to parse as
    alleles are logged and skipped.

    Returns a set of normalized allele names.

    Raises SystemError if anything goes wrong, including an empty output or
    no parseable alleles.
    """
    try:
        # convert to str since Python3 returns a `bytes` object
        raw_output = check_output([command, supported_allele_flag])
        output_text = raw_output.decode("ascii", "ignore")
        assert len(output_text) > 0, \
            '%s returned empty allele list' % command

        supported_alleles = set()
        for line in output_text.split("\n"):
            line = line.strip()
            if len(line) == 0 or line.startswith('#'):
                continue
            try:
                # Normalize the predictor's own spelling so that it can be
                # compared against the allele names we use elsewhere.
                supported_alleles.add(normalize_allele_name(line))
            except AlleleParseError as error:
                logger.info("Skipping allele %s: %s", line, error)

        if not supported_alleles:
            raise ValueError("Unable to determine supported alleles")
        return supported_alleles
    except Exception as e:
        logger.exception(e)
        raise SystemError(
            "Failed to run %s %s. Possibly an incorrect executable version?" %
            (command, supported_allele_flag))
def parse_allele_name(allele_name):
    """
    Return the normalized form of ``allele_name``, or None.

    None is returned when ``allele_name`` is not in the module-level
    ``allele_list`` or when ``mhcnames`` fails to normalize it.
    """
    if allele_name not in allele_list:
        return None
    try:
        return mhcnames.normalize_allele_name(allele_name)
    except Exception:
        # Catch ordinary failures only, so that KeyboardInterrupt and
        # SystemExit are not silently swallowed.
        return None
def parse_stdout(stdout,
                 prediction_method_name,
                 sequence_key_mapping,
                 key_index,
                 offset_index,
                 peptide_index,
                 allele_index,
                 ic50_index,
                 rank_index,
                 log_ic50_index,
                 ignored_value_indices=None,
                 transforms=None):
    """
    Generic function for parsing any NetMHC* output, given expected indices
    of values of interest.

    Parameters
    ----------
    stdout : str
        Raw predictor output, split into rows by split_stdout_lines.

    prediction_method_name : str
        Stored on every BindingPrediction.

    sequence_key_mapping : dict or None
        Map from the sequence key found in the output to the name used in the
        results. If empty or None the key is used unchanged.

    key_index, offset_index, peptide_index, allele_index, ic50_index,
    log_ic50_index : int
        Positions of the corresponding values within each cleaned row.

    rank_index : int or None
        Position of the percentile rank. If None, the rank is set to 0.0.

    ignored_value_indices : dict
        Map from values to the positions we'll ignore them at. See
        clean_fields.

    transforms : dict
        Map from field index to a transform function to be applied to values
        in that field. See clean_fields.

    Returns list of BindingPrediction
    """
    # Fresh dicts per call instead of shared mutable default arguments.
    if ignored_value_indices is None:
        ignored_value_indices = {}
    if transforms is None:
        transforms = {}

    binding_predictions = []
    for fields in split_stdout_lines(stdout):
        fields = clean_fields(fields, ignored_value_indices, transforms)
        offset = int(fields[offset_index])
        peptide = str(fields[peptide_index])
        allele = str(fields[allele_index])
        ic50 = float(fields[ic50_index])
        # Compare against None explicitly: a truthiness test would treat the
        # valid column index 0 as "no rank column".
        if rank_index is not None:
            rank = float(fields[rank_index])
        else:
            rank = 0.0
        log_ic50 = float(fields[log_ic50_index])
        key = str(fields[key_index])
        if sequence_key_mapping:
            original_key = sequence_key_mapping[key]
        else:
            # if sequence_key_mapping isn't provided then let's assume it's
            # the identity function
            original_key = key
        binding_predictions.append(
            BindingPrediction(
                source_sequence_name=original_key,
                offset=offset,
                peptide=peptide,
                allele=normalize_allele_name(allele),
                affinity=ic50,
                percentile_rank=rank,
                log_affinity=log_ic50,
                prediction_method_name=prediction_method_name))
    return binding_predictions
def test_netmhc_cons():
    """Two 10-residue sequences yield two 9-mers each from NetMHCcons."""
    predictor = NetMHCcons(
        alleles=[normalize_allele_name(DEFAULT_ALLELE)],
        default_peptide_lengths=[9])
    sequences = {
        "SMAD4-001": "ASIINFKELA",
        "TP53-001": "ASILLLVFYW",
    }
    binding_predictions = predictor.predict_subsequences(
        sequence_dict=sequences)
    assert len(binding_predictions) == 4, \
        "Expected 4 epitopes from %s" % (binding_predictions,)
def test_wrapper_function():
    """NetMHC() picks the wrapper class that matches the program name."""
    alleles = [normalize_allele_name("HLA-A*02:01")]
    expected_types = (
        ("netMHC", NetMHC4),
        ("netMHC-3.4", NetMHC3),
    )
    for program_name, expected_type in expected_types:
        wrapped = NetMHC(
            alleles=alleles,
            default_peptide_lengths=[9],
            program_name=program_name)
        eq_(type(wrapped), expected_type)
def test_human_class2_alpha_beta():
    """Alternative spellings of an alpha/beta pair normalize identically."""
    normalized = "HLA-DPA1*01:05-DPB1*100:01"
    compact = "DPA10105-DPB110001"
    spellings = (
        "DPA10105-DPB110001",
        "HLA-DPA1*01:05-DPB1*100:01",
        "hla-dpa1*0105-dpb1*10001",
        "dpa1*0105-dpb1*10001",
        "HLA-DPA1*01:05/DPB1*100:01",
    )
    for spelling in spellings:
        eq_(normalize_allele_name(spelling), normalized)
        eq_(compact_allele_name(spelling), compact)
def run_class1_presentation_predictor(args, peptides):
    """
    Calibrate percentile ranks of a presentation predictor on random genotypes.

    Loads the predictor from ``args.models_dir``, samples
    ``args.num_genotypes`` genotypes of 1 or 6 alleles, predicts the given
    peptides against them, calibrates percentile ranks from the resulting
    presentation scores and writes only the percent ranks back to
    ``args.models_dir``.

    Raises ValueError if ``args.alleles_per_genotype`` is neither 6 nor 1.
    """
    # This will trigger a Keras import - will break local parallelism.
    predictor = Class1PresentationPredictor.load(args.models_dir)

    # Alleles come from the commandline, a CSV file, or the predictor itself.
    if args.allele:
        alleles = [normalize_allele_name(name) for name in args.allele]
    elif args.alleles_file:
        alleles = pandas.read_csv(args.alleles_file).allele.unique()
    else:
        alleles = predictor.supported_alleles
    print("Num alleles", len(alleles))

    genotypes = {}
    if args.alleles_per_genotype == 6:
        # Group the alleles by gene, then draw two per gene.
        gene_to_alleles = collections.defaultdict(list)
        for name in alleles:
            for gene in ("A", "B", "C"):
                if name.startswith("HLA-%s" % gene):
                    gene_to_alleles[gene].append(name)
        for _ in range(args.num_genotypes):
            genotype = [
                numpy.random.choice(gene_to_alleles[gene])
                for gene in ("A", "A", "B", "B", "C", "C")
            ]
            genotypes[",".join(genotype)] = genotype
    elif args.alleles_per_genotype == 1:
        for _ in range(args.num_genotypes):
            genotype = [numpy.random.choice(alleles)]
            genotypes[",".join(genotype)] = genotype
    else:
        raise ValueError("Alleles per genotype must be 6 or 1")

    print("Sampled genotypes: ", list(genotypes))
    print("Num peptides: ", len(peptides))

    start = time.time()
    print("Generating predictions")
    predictions_df = predictor.predict(peptides=peptides, alleles=genotypes)
    print("Finished in %0.2f sec." % (time.time() - start))
    print(predictions_df)

    print("Calibrating ranks")
    predictor.calibrate_percentile_ranks(
        predictions_df.presentation_score.values)

    print("Done. Saving.")
    predictor.save(
        args.models_dir,
        write_affinity_predictor=False,
        write_processing_predictor=False,
        write_weights=False,
        write_percent_ranks=True,
        write_info=False,
        write_metdata=False)
    print("Wrote predictor to: %s" % args.models_dir)
def test_multiple_lengths_netmhc3():
    """Explicit peptide_lengths are used in place of the default length."""
    predictor = NetMHC3(
        alleles=[normalize_allele_name("H-2-Kb")],
        default_peptide_lengths=[9],
        program_name="netMHC-3.4")
    sequences = {'seq': 'AETDEIKILLEE'}
    binding_predictions = predictor.predict_subsequences(
        sequences,
        peptide_lengths=[10, 11])
    # A 12-residue sequence contains three 10-mers and two 11-mers.
    eq_(5, len(binding_predictions))
def test_human_class2_alpha_beta():
    """Alternative spellings of an alpha/beta pair normalize identically."""
    normalized = "HLA-DPA1*01:05-DPB1*100:01"
    compact = "DPA10105-DPB110001"
    spellings = (
        "DPA10105-DPB110001",
        "HLA-DPA1*01:05-DPB1*100:01",
        "hla-dpa1*0105-dpb1*10001",
        "dpa1*0105-dpb1*10001",
        "HLA-DPA1*01:05/DPB1*100:01",
        "DPA10105/DPB110001",
    )
    for spelling in spellings:
        eq_(normalize_allele_name(spelling), normalized)
        eq_(compact_allele_name(spelling), compact)
def test_human_class2():
    """Alternative spellings of DRB1*01:02 normalize to one canonical name."""
    normalized = "HLA-DRB1*01:02"
    compact = "DRB10102"
    spellings = (
        "DRB1_0102",
        "DRB101:02",
        "HLA-DRB1_0102",
        "DRB10102",
        "DRB1*0102",
        "HLA-DRB1*0102",
        "HLA-DRB1*01:02",
    )
    for spelling in spellings:
        eq_(normalize_allele_name(spelling), normalized)
        eq_(compact_allele_name(spelling), compact)
def test_human_class2():
    """Beta-only DRB1 spellings normalize to the DRA1/DRB1 pair name."""
    normalized = "HLA-DRA1*01:01-DRB1*01:02"
    compact = "DRB10102"
    spellings = (
        "DRB1_0102",
        "DRB101:02",
        "HLA-DRB1_0102",
        "DRB10102",
        "DRB1*0102",
        "HLA-DRB1*0102",
        "HLA-DRB1*01:02",
        "DRB0102",
    )
    for spelling in spellings:
        eq_(normalize_allele_name(spelling), normalized)
        eq_(compact_allele_name(spelling), compact)
def predict_peptides(self, peptides):
    """
    Run MixMHCpred on the given peptides once per allele in ``self.alleles``.

    Parameters
    ----------
    peptides : list of str

    Returns
    -------
    BindingPredictionCollection wrapping the predictions for all alleles

    Raises
    ------
    ValueError
        If the program did not produce its output file for an allele; the
        message includes whatever the program wrote to stdout.
    """
    self._check_peptide_inputs(peptides)
    results = []
    for allele in self.alleles:
        temp_dir = mkdtemp(prefix="mhctools", suffix="mixmhcpred")
        input_file_path = join(temp_dir, "mixmhcpred_inputs.txt")
        output_file_path = join(temp_dir, "mixmhcpred_outputs.txt")

        # One peptide per line, without a trailing newline.
        with open(input_file_path, "w") as f:
            f.write("\n".join(peptides))

        with CleanupFiles(
                filenames=[input_file_path, output_file_path],
                directories=[temp_dir]):
            stdout_file_name = None
            try:
                # delete=False because stdout is re-read after the file has
                # been closed; it is removed explicitly in `finally`.
                with NamedTemporaryFile(
                        prefix="MixMHCpred_stdout",
                        mode="w",
                        delete=False) as stdout_file:
                    stdout_file_name = stdout_file.name
                    run_command(
                        [
                            self.program_name,
                            "-i", input_file_path,
                            "-o", output_file_path,
                            "-a", normalize_allele_name(allele),
                        ] + self.extra_commandline_args,
                        suppress_stderr=False,
                        redirect_stdout_file=stdout_file)

                if exists(output_file_path):
                    results.extend(
                        parse_mixmhcpred_results(output_file_path))
                else:
                    with open(stdout_file_name, "r") as f:
                        stdout = f.read().strip()
                    raise ValueError(
                        "MixMHCpred failed on allele '%s' with stdout '%s'" %
                        (allele, stdout))
            finally:
                # Remove the stdout capture on failure as well as success so
                # that failed runs do not leak temporary files.
                if stdout_file_name is not None and exists(stdout_file_name):
                    remove(stdout_file_name)
    return BindingPredictionCollection(results)
def fit_allele(self, allele, hit_list, decoys_list):
    """
    Fit allele-specific predictors from lists of hit and decoy peptides.

    Hits are assigned ``self.hit_affinity`` and every other peptide is
    assigned ``self.decoy_affinity``; the resulting peptide/affinity pairs
    are passed to ``self.predictor.fit_allele_specific_predictors``.
    """
    allele = normalize_allele_name(allele)
    hits = set(hit_list)

    df = pandas.DataFrame({"peptide": sorted(hits.union(decoys_list))})
    df["allele"] = allele
    df["species"] = "human"

    # 1.0 for decoys and 0.0 for hits, so the expression below evaluates to
    # decoy_affinity for decoys and hit_affinity for hits.
    is_decoy = (~df.peptide.isin(hits)).astype(float)
    affinity_gap = self.decoy_affinity - self.hit_affinity
    df["affinity"] = is_decoy * affinity_gap + self.hit_affinity

    df["sample_weight"] = 1.0
    df["peptide_length"] = 9

    self.predictor.fit_allele_specific_predictors(
        n_models=self.ensemble_size,
        architecture_hyperparameters=self.mhcflurry_hyperparameters,
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.affinity.values,
    )
def test_netmhcii_pan_mouse():
    """NetMHCIIpan handles the mouse class II allele H2-IAb."""
    predictor = NetMHCIIpan(alleles=[normalize_allele_name("H2-IAb")])
    sequences = {
        "SMAD4-001": "PAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGT",
        "TP53-001": "SQAMDDLMLSPDDIEQWFTED",
    }
    binding_predictions = predictor.predict_subsequences(
        sequence_dict=sequences,
        peptide_lengths=[15, 16])

    eq_({bp.length for bp in binding_predictions}, {15, 16})
    eq_({bp.allele for bp in binding_predictions}, {"H-2-IAb"})

    # The sequences have 34 and 21 residues, which gives
    # (34-15+1) + (34-16+1) + (21-15+1) + (21-16+1) = 52 peptides.
    assert len(binding_predictions) == 52, \
        "Expected 52 epitopes from %s" % (binding_predictions,)
def predict_to_dataframe(
        self,
        peptides,
        alleles=None,
        allele=None,
        throw=True,
        include_individual_model_predictions=False,
        include_percentile_ranks=True,
        include_confidence_intervals=True,
        centrality_measure=DEFAULT_CENTRALITY_MEASURE):
    """
    Predict nM binding affinities. Gives more detailed output than `predict`
    method, including 5-95% prediction intervals.

    If multiple predictors are available for an allele, the predictions are
    the geometric means of the individual model predictions.

    One of 'allele' or 'alleles' must be specified. If 'allele' is specified
    all predictions will be for the given allele. If 'alleles' is specified
    it must be the same length as 'peptides' and give the allele
    corresponding to each peptide.

    Parameters
    ----------
    peptides : `EncodableSequences` or list of string
    alleles : list of string
    allele : string
    throw : boolean
        If True, a ValueError will be raised in the case of unsupported
        alleles or peptide lengths. If False, a warning will be logged and
        the predictions for the unsupported alleles or peptides will be NaN.
    include_individual_model_predictions : boolean
        If True, the predictions of each individual model are included as
        columns in the result dataframe.
    include_percentile_ranks : boolean, default True
        If True, a "prediction_percentile" column will be included giving
        the percentile ranks. If no percentile rank information is
        available, this will be ignored with a warning.
    include_confidence_intervals : boolean, default True
        If True, "prediction_low" and "prediction_high" columns are included,
        computed as the 5th and 95th percentiles across models.
    centrality_measure : string or callable
        Measure of central tendency to use to combine predictions in the
        ensemble. Options include: mean, median, robust_mean.

    Returns
    -------
    `pandas.DataFrame` of predictions
    """
    if isinstance(peptides, string_types):
        raise TypeError("peptides must be a list or array, not a string")
    if isinstance(alleles, string_types):
        raise TypeError("alleles must be a list or array, not a string")
    if allele is None and alleles is None:
        raise ValueError("Must specify 'allele' or 'alleles'.")

    peptides = EncodableSequences.create(peptides)
    df = pandas.DataFrame({'peptide': peptides.sequences}, copy=False)

    if allele is not None:
        if alleles is not None:
            raise ValueError("Specify exactly one of allele or alleles")
        df["allele"] = allele
        normalized_allele = mhcnames.normalize_allele_name(allele)
        df["normalized_allele"] = normalized_allele
        unique_alleles = [normalized_allele]
    else:
        df["allele"] = numpy.array(alleles)
        df["normalized_allele"] = df.allele.map(
            mhcnames.normalize_allele_name)
        unique_alleles = df.normalized_allele.unique()

    if len(df) == 0:
        # No predictions.
        logging.warning("Predicting for 0 peptides.")
        empty_result = pandas.DataFrame(columns=[
            'peptide',
            'allele',
            'prediction',
            'prediction_low',
            'prediction_high'
        ])
        return empty_result

    (min_peptide_length, max_peptide_length) = (
        self.supported_peptide_lengths)

    if (peptides.min_length < min_peptide_length or
            peptides.max_length > max_peptide_length):
        # Only compute this if needed
        all_peptide_lengths_supported = False
        sequence_length = df.peptide.str.len()
        df["supported_peptide_length"] = (
            (sequence_length >= min_peptide_length) &
            (sequence_length <= max_peptide_length))
        if (~df.supported_peptide_length).any():
            # `.loc` with a boolean mask replaces the removed `.ix` indexer.
            msg = (
                "%d peptides have lengths outside of supported range [%d, %d]: "
                "%s" % (
                    (~df.supported_peptide_length).sum(),
                    min_peptide_length,
                    max_peptide_length,
                    str(df.loc[~df.supported_peptide_length].peptide.unique())))
            logging.warning(msg)
            if throw:
                raise ValueError(msg)
    else:
        # Handle common case efficiently.
        df["supported_peptide_length"] = True
        all_peptide_lengths_supported = True

    # One column per pan-allele model, followed by one column per
    # allele-specific model (as many as the best-covered allele has).
    num_pan_models = len(self.class1_pan_allele_models)
    max_single_allele_models = max(
        len(self.allele_to_allele_specific_models.get(name, []))
        for name in unique_alleles)
    predictions_array = numpy.zeros(
        shape=(df.shape[0], num_pan_models + max_single_allele_models),
        dtype="float64")
    predictions_array[:] = numpy.nan

    if self.class1_pan_allele_models:
        unsupported_alleles = [
            name for name in df.normalized_allele.unique()
            if name not in self.allele_to_fixed_length_sequence
        ]
        if unsupported_alleles:
            msg = (
                "No sequences for allele(s): %s.\n"
                "Supported alleles: %s" % (
                    " ".join(unsupported_alleles),
                    " ".join(sorted(self.allele_to_fixed_length_sequence))))
            logging.warning(msg)
            if throw:
                raise ValueError(msg)
        mask = df.supported_peptide_length
        if mask.sum() > 0:
            masked_allele_encoding = AlleleEncoding(
                df.loc[mask].normalized_allele,
                allele_to_fixed_length_sequence=(
                    self.allele_to_fixed_length_sequence))
            masked_peptides = peptides.sequences[mask]
            for (i, model) in enumerate(self.class1_pan_allele_models):
                predictions_array[mask, i] = model.predict(
                    masked_peptides,
                    allele_encoding=masked_allele_encoding)

    if self.allele_to_allele_specific_models:
        unsupported_alleles = [
            name for name in unique_alleles
            if not self.allele_to_allele_specific_models.get(name)
        ]
        if unsupported_alleles:
            msg = (
                "No single-allele models for allele(s): %s.\n"
                "Supported alleles are: %s" % (
                    " ".join(unsupported_alleles),
                    " ".join(sorted(self.allele_to_allele_specific_models))))
            logging.warning(msg)
            if throw:
                raise ValueError(msg)
        for allele in unique_alleles:
            models = self.allele_to_allele_specific_models.get(allele, [])
            if len(unique_alleles) == 1 and all_peptide_lengths_supported:
                mask = None
            else:
                mask = (
                    (df.normalized_allele == allele) &
                    df.supported_peptide_length).values

            if mask is None or mask.all():
                # Common case optimization
                for (i, model) in enumerate(models):
                    predictions_array[:, num_pan_models + i] = (
                        model.predict(peptides))
            elif mask.sum() > 0:
                # `.loc` with a boolean mask replaces the removed `.ix`.
                peptides_for_allele = EncodableSequences.create(
                    df.loc[mask].peptide.values)
                for (i, model) in enumerate(models):
                    predictions_array[
                        mask,
                        num_pan_models + i,
                    ] = model.predict(peptides_for_allele)

    if callable(centrality_measure):
        centrality_function = centrality_measure
    else:
        centrality_function = CENTRALITY_MEASURES[centrality_measure]

    # Models are combined in log space.
    logs = numpy.log(predictions_array)
    log_centers = centrality_function(logs)
    df["prediction"] = numpy.exp(log_centers)

    if include_confidence_intervals:
        df["prediction_low"] = numpy.exp(
            numpy.nanpercentile(logs, 5.0, axis=1))
        df["prediction_high"] = numpy.exp(
            numpy.nanpercentile(logs, 95.0, axis=1))

    if include_individual_model_predictions:
        for i in range(num_pan_models):
            df["model_pan_%d" % i] = predictions_array[:, i]
        for i in range(max_single_allele_models):
            df["model_single_%d" % i] = (
                predictions_array[:, num_pan_models + i])

    if include_percentile_ranks:
        if self.allele_to_percent_rank_transform:
            df["prediction_percentile"] = self.percentile_ranks(
                df.prediction,
                alleles=df.normalized_allele.values,
                throw=throw)
        else:
            warnings.warn("No percentile rank information available.")

    # Drop the helper columns before returning.
    del df["supported_peptide_length"]
    del df["normalized_allele"]
    return df
def fit_allele_specific_predictors(
        self,
        n_models,
        architecture_hyperparameters_list,
        allele,
        peptides,
        affinities,
        inequalities=None,
        train_rounds=None,
        models_dir_for_save=None,
        verbose=0,
        progress_preamble="",
        progress_print_interval=5.0):
    """
    Fit one or more allele specific predictors for a single allele using one
    or more neural network architectures.

    The new predictors are stored on this instance, keyed by the normalized
    allele name, and recorded in the manifest.

    Parameters
    ----------
    n_models : int
        Number of neural networks to fit

    architecture_hyperparameters_list : list of dict
        List of hyperparameter sets.

    allele : string

    peptides : `EncodableSequences` or list of string

    affinities : list of float
        nM affinities

    inequalities : list of string, each element one of ">", "<", or "="
        See Class1NeuralNetwork.fit for details.

    train_rounds : sequence of int
        Each training point i will be used on training rounds r for which
        train_rounds[i] > r, r >= 0.

    models_dir_for_save : string, optional
        If specified, the Class1AffinityPredictor is (incrementally) written
        to the given models dir after each neural network is fit.

    verbose : int
        Keras verbosity

    progress_preamble : string
        Optional string of information to include in each progress update

    progress_print_interval : float
        How often (in seconds) to print progress. Set to None to disable.

    Returns
    -------
    list of `Class1NeuralNetwork`
    """
    allele = mhcnames.normalize_allele_name(allele)
    if allele not in self.allele_to_allele_specific_models:
        self.allele_to_allele_specific_models[allele] = []

    encodable_peptides = EncodableSequences.create(peptides)

    # The first entry always uses the full data; each later entry keeps only
    # the points whose train_rounds value exceeds the given threshold.
    data_per_round = [(encodable_peptides, affinities, inequalities)]
    if train_rounds is not None:
        for threshold in sorted(set(train_rounds)):
            selected = train_rounds > threshold
            if not selected.any():
                continue
            data_per_round.append((
                EncodableSequences.create(
                    encodable_peptides.sequences[selected]),
                affinities[selected],
                None if inequalities is None else inequalities[selected],
            ))
    n_rounds = len(data_per_round)
    n_architectures = len(architecture_hyperparameters_list)

    # Only mention models / architectures / rounds in the progress message
    # when there is more than one of them.
    optional_pieces = [
        (n_models > 1, "Model {model_num:2d} / {n_models:2d}"),
        (n_architectures > 1,
         "Architecture {architecture_num:2d} / {n_architectures:2d}"),
        (n_rounds > 1, "Round {round:2d} / {n_rounds:2d}"),
    ]
    pieces = [text for (wanted, text) in optional_pieces if wanted]
    pieces.append("{n_peptides:4d} peptides")
    progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
        ", ".join(pieces))

    models = []
    for model_num in range(n_models):
        for (architecture_num, architecture_hyperparameters) in enumerate(
                architecture_hyperparameters_list):
            model = Class1NeuralNetwork(**architecture_hyperparameters)
            for (round_num, round_data) in enumerate(data_per_round):
                (round_peptides, round_affinities, round_inequalities) = (
                    round_data)
                round_preamble = progress_preamble_template.format(
                    n_peptides=len(round_peptides),
                    round=round_num,
                    n_rounds=n_rounds,
                    user_progress_preamble=progress_preamble,
                    model_num=model_num + 1,
                    n_models=n_models,
                    architecture_num=architecture_num + 1,
                    n_architectures=n_architectures)
                model.fit(
                    round_peptides,
                    round_affinities,
                    inequalities=round_inequalities,
                    verbose=verbose,
                    progress_preamble=round_preamble,
                    progress_print_interval=progress_print_interval)

            # Record the fitted model in the manifest and per-allele list.
            model_name = self.model_name(allele, model_num)
            manifest_entry = collections.OrderedDict([
                ("model_name", model_name),
                ("allele", allele),
                ("config_json", json.dumps(model.get_config())),
                ("model", model),
            ])
            row = pandas.Series(manifest_entry).to_frame().T
            self._manifest_df = pandas.concat(
                [self.manifest_df, row], ignore_index=True)
            self.allele_to_allele_specific_models[allele].append(model)
            if models_dir_for_save:
                self.save(
                    models_dir_for_save,
                    model_names_to_write=[model_name])
            models.append(model)

    self.clear_cache()
    return models
def parse_stdout(
        stdout,
        prediction_method_name,
        sequence_key_mapping,
        key_index,
        offset_index,
        peptide_index,
        allele_index,
        score_index,
        rank_index=None,
        ic50_index=None,
        ignored_value_indices=None,
        transforms=None):
    """
    Generic function for parsing any NetMHC* output, given expected indices
    of values of interest.

    Parameters
    ----------
    stdout : str

    prediction_method_name : str

    key_index : int

    offset_index : int

    peptide_index : int

    allele_index : int

    score_index : int or None
        If None, the score of every prediction is None.

    rank_index : int or None
        If None, the percentile rank of every prediction is None.

    ic50_index : int or None
        If None, the affinity of every prediction is None.

    sequence_key_mapping : dict
        Dictionary mapping sequence names (which might be hashes or
        truncated) to the sequence names which should be used in the parsed
        BindingPrediction objects

    ignored_value_indices : dict
        Map from values to the positions we'll ignore them at. See
        clean_fields.

    transforms : dict
        Map from field index to a transform function to be applied to values
        in that field. See clean_fields.

    Returns list of BindingPrediction
    """
    # Fresh dicts per call instead of shared mutable default arguments.
    if ignored_value_indices is None:
        ignored_value_indices = {}
    if transforms is None:
        transforms = {}

    binding_predictions = []
    for fields in split_stdout_lines(stdout):
        fields = clean_fields(fields, ignored_value_indices, transforms)
        offset = int(fields[offset_index])
        peptide = str(fields[peptide_index])
        allele = str(fields[allele_index])

        score = None if score_index is None else float(fields[score_index])
        rank = None if rank_index is None else float(fields[rank_index])
        ic50 = None if ic50_index is None else float(fields[ic50_index])

        key = str(fields[key_index])
        if sequence_key_mapping:
            original_key = sequence_key_mapping[key]
        else:
            # if sequence_key_mapping isn't provided then let's assume it's
            # the identity function
            original_key = key

        # if we have a bad IC50 score we might still get a salvageable
        # log of the score. Strangely, this is necessary sometimes!
        # The score must be present: np.isfinite(None) raises TypeError.
        if (ic50_index is not None and
                score is not None and
                (not valid_affinity(ic50)) and
                np.isfinite(score)):
            # pylint: disable=invalid-unary-operand-type
            ic50 = 50000 ** (1 - score)

        binding_predictions.append(BindingPrediction(
            source_sequence_name=original_key,
            offset=offset,
            peptide=peptide,
            allele=normalize_allele_name(allele),
            score=score,
            affinity=ic50,
            percentile_rank=rank,
            prediction_method_name=prediction_method_name))
    return binding_predictions
def run(argv=sys.argv[1:]):
    """
    Command-line entry point: select models per allele from an existing
    Class1AffinityPredictor and save the resulting predictor.

    Steps, in order:

    1. Parse `argv` with the module-level `parser` and load the input
       predictor from `args.models_dir`.
    2. Optionally load measurement data (`args.data`), restrict it to 8-15mer
       peptides and the requested alleles, and drop rows that also appear in
       `args.exclude_data`.
    3. Build one model selector per entry of `args.scoring` and choose, for
       each allele, the first selector that reports itself usable.
    4. Run `model_select` for each allele (serially, or through a worker
       pool) and merge the selected models into a new predictor.
    5. Save the new predictor to `args.out_models_dir`.

    Parameters
    ----------
    argv : list of str
        Command-line arguments, excluding the program name.

    Raises
    ------
    ValueError
        If an unsupported scoring method is requested, if no selector is
        usable for some allele, or if unselected predictions are requested
        without any data being loaded.
    """
    global GLOBAL_DATA

    # On sigusr1 print stack trace
    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())

    args = parser.parse_args(argv)

    args.out_models_dir = os.path.abspath(args.out_models_dir)

    configure_logging(verbose=args.verbosity > 1)

    input_predictor = Class1AffinityPredictor.load(args.models_dir)
    print("Loaded: %s" % input_predictor)

    if args.allele:
        alleles = [normalize_allele_name(a) for a in args.allele]
    else:
        alleles = input_predictor.supported_alleles

    # Dataframes saved alongside the resulting predictor.
    metadata_dfs = {}

    if args.data:
        df = pandas.read_csv(args.data)
        print("Loaded data: %s" % (str(df.shape)))

        df = df.loc[
            (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)
        ]
        print("Subselected to 8-15mers: %s" % (str(df.shape)))

        # Allele names in data are assumed to be already normalized.
        df = df.loc[df.allele.isin(alleles)].dropna()
        print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))

        if args.exclude_data:
            exclude_df = pandas.read_csv(args.exclude_data)
            metadata_dfs["model_selection_exclude"] = exclude_df
            # Report the shape of the exclusion table itself (previously
            # this line printed the shape of the measurement data instead).
            print("Loaded exclude data: %s" % (str(exclude_df.shape)))

            # Rows are matched between the two tables on (allele, peptide).
            df["_key"] = df.allele + "__" + df.peptide
            exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide
            df["_excluded"] = df._key.isin(exclude_df._key.unique())
            print("Excluding measurements per allele (counts): ")
            print(df.groupby("allele")._excluded.sum())

            print("Excluding measurements per allele (fractions): ")
            print(df.groupby("allele")._excluded.mean())

            df = df.loc[~df._excluded]
            del df["_excluded"]
            del df["_key"]
            print("Reduced data to: %s" % (str(df.shape)))

        metadata_dfs["model_selection_data"] = df

        df["mass_spec"] = df.measurement_source.str.contains(
            args.mass_spec_regex)
    else:
        df = None

    if args.out_unselected_predictions:
        if df is None:
            # Without this check the code below fails with an opaque
            # TypeError from indexing into None.
            raise ValueError(
                "out_unselected_predictions requires data to be provided")
        df["unselected_prediction"] = input_predictor.predict(
            alleles=df.allele.values,
            peptides=df.peptide.values)
        df.to_csv(args.out_unselected_predictions)
        print("Wrote: %s" % args.out_unselected_predictions)

    # Caches filled in by the loop over args.scoring below; make_selector
    # consults them so that each scoring method is instantiated only once
    # after it has been registered.
    selectors = {}
    selector_to_model_selection_kwargs = {}

    def make_selector(
            scoring,
            combined_min_contribution_percent=(
                args.combined_min_contribution_percent)):
        # Return a (selector, model_selection_kwargs) pair for the given
        # scoring method name, reusing a registered selector if one exists.
        if scoring in selectors:
            return (
                selectors[scoring],
                selector_to_model_selection_kwargs[scoring])

        start = time.time()
        if scoring.startswith("combined:"):
            model_selection_kwargs = {
                'min_models': args.combined_min_models,
                'max_models': args.combined_max_models,
            }
            # Everything after "combined:" is a comma-separated list of
            # component scoring method names.
            component_selectors = []
            for component_selector in scoring.split(":", 1)[1].split(","):
                component_selectors.append(
                    make_selector(component_selector)[0])
            selector = CombinedModelSelector(
                component_selectors,
                min_contribution_percent=combined_min_contribution_percent)
        elif scoring == "mse":
            model_selection_kwargs = {
                'min_models': args.mse_min_models,
                'max_models': args.mse_max_models,
            }
            min_measurements = args.mse_min_measurements
            # Uses only the measurements not flagged as mass-spec.
            selector = MSEModelSelector(
                df=df.loc[~df.mass_spec],
                predictor=input_predictor,
                min_measurements=min_measurements)
        elif scoring == "mass-spec":
            mass_spec_df = df.loc[df.mass_spec]
            model_selection_kwargs = {
                'min_models': args.mass_spec_min_models,
                'max_models': args.mass_spec_max_models,
            }
            min_measurements = args.mass_spec_min_measurements
            selector = MassSpecModelSelector(
                df=mass_spec_df,
                predictor=input_predictor,
                min_measurements=min_measurements)
        elif scoring == "consensus":
            model_selection_kwargs = {
                'min_models': args.consensus_min_models,
                'max_models': args.consensus_max_models,
            }
            selector = ConsensusModelSelector(
                predictor=input_predictor,
                num_peptides_per_length=args.consensus_num_peptides_per_length)
        else:
            raise ValueError("Unsupported scoring method: %s" % scoring)
        print("Instantiated model selector %s in %0.2f sec." % (
            scoring, time.time() - start))
        return (selector, model_selection_kwargs)

    for scoring in args.scoring:
        (selector, model_selection_kwargs) = make_selector(scoring)
        selectors[scoring] = selector
        selector_to_model_selection_kwargs[scoring] = model_selection_kwargs

    unselected_accuracy_scorer = None
    if args.unselected_accuracy_scorer:
        # Force running all selectors by setting
        # combined_min_contribution_percent=0.
        unselected_accuracy_scorer = make_selector(
            args.unselected_accuracy_scorer,
            combined_min_contribution_percent=0.0)[0]
        print("Using unselected accuracy scorer: %s" %
              unselected_accuracy_scorer)
    GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer

    print("Selectors for alleles:")
    allele_to_selector = {}
    allele_to_model_selection_kwargs = {}
    for allele in alleles:
        selector = None
        # The first usable selector, in args.scoring order, is chosen.
        for possible_selector in args.scoring:
            if selectors[possible_selector].usable_for_allele(allele=allele):
                selector = selectors[possible_selector]
                print("%20s %s" % (allele, selector.plan_summary(allele)))
                break
        if selector is None:
            raise ValueError("No selectors usable for allele: %s" % allele)
        allele_to_selector[allele] = selector
        # possible_selector still names the selector chosen above because
        # the inner loop exited via `break`.
        allele_to_model_selection_kwargs[allele] = (
            selector_to_model_selection_kwargs[possible_selector])

    GLOBAL_DATA["args"] = args
    GLOBAL_DATA["input_predictor"] = input_predictor
    GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer
    GLOBAL_DATA["allele_to_selector"] = allele_to_selector
    GLOBAL_DATA["allele_to_model_selection_kwargs"] = (
        allele_to_model_selection_kwargs)

    if not os.path.exists(args.out_models_dir):
        print("Attempting to create directory: %s" % args.out_models_dir)
        os.mkdir(args.out_models_dir)
        print("Done.")

    result_predictor = Class1AffinityPredictor(
        metadata_dataframes=metadata_dfs)

    worker_pool = worker_pool_with_gpu_assignments_from_args(args)

    start = time.time()

    if worker_pool is None:
        # Serial run
        print("Running in serial.")
        results = (model_select(allele) for allele in alleles)
    else:
        # Parallel run
        random.shuffle(alleles)
        results = worker_pool.imap_unordered(
            partial(model_select, constant_data=GLOBAL_DATA),
            alleles,
            chunksize=1)

    unselected_summary = []
    model_selection_dfs = []
    for result in tqdm.tqdm(results, total=len(alleles)):
        pprint(result)

        # Copy of the result without the "selected" entry, plus a flag
        # recording whether anything was selected.
        summary_dict = dict(result)
        summary_dict["retained"] = result["selected"] is not None
        del summary_dict["selected"]

        unselected_summary.append(summary_dict)
        if result['selected'] is not None:
            model_selection_dfs.append(
                result['selected'].metadata_dataframes['model_selection'])
            result_predictor.merge_in_place([result['selected']])

    if model_selection_dfs:
        model_selection_df = pandas.concat(
            model_selection_dfs, ignore_index=True)
        model_selection_df["selector"] = model_selection_df.allele.map(
            allele_to_selector)
        result_predictor.metadata_dataframes["model_selection"] = (
            model_selection_df)

    result_predictor.metadata_dataframes["unselected_summary"] = (
        pandas.DataFrame(unselected_summary))

    print("Done model selecting for %d alleles." % len(alleles))
    result_predictor.save(args.out_models_dir)

    model_selection_time = time.time() - start

    if worker_pool:
        worker_pool.close()
        worker_pool.join()

    print("Model selection time %0.2f min." % (model_selection_time / 60.0))
    print("Predictor written to: %s" % args.out_models_dir)
def test_extra_text_after_allele():
    """
    Call normalize_allele_name on an allele name followed by extra text.

    The return value is not checked, so this test only fails if the call
    raises.
    NOTE(review): possibly meant to be paired with an expected-exception
    check -- confirm against the intended parser behaviour.
    """
    allele_with_trailing_text = "HLA-A*02:01 zipper"
    normalize_allele_name(allele_with_trailing_text)
def test_hla_long_names():
    """
    Every spelling listed in hla_02_01_names must normalize to "HLA-A*02:01".
    """
    for spelling in hla_02_01_names:
        # eq_ stops the test at the first spelling that normalizes
        # differently.
        eq_(normalize_allele_name(spelling), "HLA-A*02:01")