def predict_peptides(self, peptides):
    """
    Predict MHC affinity for peptides.

    The peptides are wrapped in a single ``EncodableSequences`` object that
    is passed to ``self.predictor.predict_to_dataframe`` once per allele in
    ``self.alleles``. Every row of each returned dataframe becomes one
    ``BindingPrediction``.

    Parameters
    ----------
    peptides
        Peptides handed to ``EncodableSequences.create``.

    Returns
    -------
    BindingPredictionCollection
        One entry per (allele, dataframe row) pair, in allele order.
    """
    # importing locally to avoid slowing down CLI applications which
    # don't use MHCflurry
    from mhcflurry.encodable_sequences import EncodableSequences

    encoded = EncodableSequences.create(peptides)
    collected = []
    for allele in self.alleles:
        frame = self.predictor.predict_to_dataframe(encoded, allele=allele)
        for _, record in frame.iterrows():
            # The percentile column is optional; fall back to nan when the
            # row does not carry it.
            if 'prediction_percentile' in record:
                rank = record.prediction_percentile
            else:
                rank = nan
            collected.append(
                BindingPrediction(
                    allele=allele,
                    peptide=record.peptide,
                    affinity=record.prediction,
                    percentile_rank=rank,
                    prediction_method_name="mhcflurry",
                )
            )
    return BindingPredictionCollection(collected)
def test_speed_allele_specific(profile=False, num=DEFAULT_NUM_PREDICTIONS):
    """
    Time ``ALLELE_SPECIFIC_PREDICTOR.predict`` on several workloads.

    Measured stages, in order: a single-peptide call, ``num`` random
    peptides, BLOSUM62 encoding of 10000 random 13-mers, one prediction on
    those already-encoded peptides, and 100 repeated predictions on them.
    The collected wall-clock timings are printed as a pandas Series.

    Parameters
    ----------
    profile : bool
        When true, each stage is additionally run under its own
        ``cProfile.Profile``.
    num : int
        Number of random peptides used for the second stage.

    Returns
    -------
    dict
        Stage label -> ``pstats.Stats``; empty when ``profile`` is false.
    """
    allele = "HLA-A*02:01"
    model = ALLELE_SPECIFIC_PREDICTOR

    began_at = collections.OrderedDict()
    timings = collections.OrderedDict()
    profilers = collections.OrderedDict()

    def begin(label):
        # Record the wall-clock start first, then (optionally) start
        # profiling.
        began_at[label] = time.time()
        if profile:
            profiler = cProfile.Profile()
            profilers[label] = profiler
            profiler.enable()

    def finish(label):
        timings[label] = time.time() - began_at[label]
        if profile:
            profilers[label].disable()

    # Stage 1: a single prediction.
    begin("first")
    model.predict(["SIINFEKL"], allele=allele)
    finish("first")

    # Stage 2: `num` random peptides passed as plain sequences.
    batch = random_peptides(num)
    label = "pred_%d" % num
    begin(label)
    model.predict(batch, allele=allele)
    finish(label)

    # Stage 3: encoding cost alone.
    NUM2 = 10000
    encoded = EncodableSequences.create(random_peptides(NUM2, length=13))
    label = "encode_blosum_%d" % NUM2
    begin(label)
    encoded.variable_length_to_fixed_length_vector_encoding("BLOSUM62")
    finish(label)

    # Stage 4: prediction on the peptides encoded above.
    label = "pred_already_encoded_%d" % NUM2
    begin(label)
    model.predict(encoded, allele=allele)
    finish(label)

    # Stage 5: the same prediction repeated many times.
    NUM_REPEATS = 100
    label = "pred_already_encoded_%d_%d_times" % (NUM2, NUM_REPEATS)
    begin(label)
    for _ in range(NUM_REPEATS):
        model.predict(encoded, allele=allele)
    finish(label)

    print("SPEED BENCHMARK")
    print("Results:\n%s" % str(pandas.Series(timings)))

    return {
        stage: pstats.Stats(profiler)
        for stage, profiler in profilers.items()
    }
def do_predictions_mhcflurry(work_item_dicts, constant_data=None):
    """
    Run mhcflurry affinity predictions for a batch of work items.

    Each dict of work items should have keys: work_item_num, peptides,
    alleles.

    Parameters
    ----------
    work_item_dicts
        Sequence of work-item dicts as described above.
    constant_data : dict, optional
        Must provide ``'args'`` and ``'cols'``; defaults to ``GLOBAL_DATA``.

    Returns
    -------
    list
        ``(work_item_num, result)`` tuples, where ``result`` maps
        ``"<allele> affinity"`` to the predictions cast to
        ``args.result_dtype``.
    """
    # This may run on the cluster in a way that misses all top level imports,
    # so we have to re-import everything here.
    import time
    from mhcflurry.encodable_sequences import EncodableSequences
    from mhcflurry import Class1AffinityPredictor

    if constant_data is None:
        constant_data = GLOBAL_DATA

    args = constant_data['args']

    assert args.predictor == "mhcflurry"
    assert constant_data['cols'] == ["affinity"]

    predictor = Class1AffinityPredictor.load(args.mhcflurry_models_dir)

    results = []
    for item_index, item in enumerate(work_item_dicts):
        work_item_num = item['work_item_num']
        raw_peptides = item['peptides']
        alleles = item['alleles']

        print(
            "Processing work item", item_index + 1, "of", len(work_item_dicts))

        # The dict is registered up front and filled in place below.
        result = {}
        results.append((work_item_num, result))

        started_at = time.time()
        encoded = EncodableSequences.create(raw_peptides)
        for allele_index, allele in enumerate(alleles):
            print("Processing allele %d / %d: %0.2f sec elapsed" % (
                allele_index + 1, len(alleles), time.time() - started_at))
            for column in ["affinity"]:
                predicted = predictor.predict(
                    peptides=encoded,
                    allele=allele,
                    throw=False,
                    model_kwargs={
                        'batch_size': args.mhcflurry_batch_size,
                    })
                result["%s %s" % (allele, column)] = predicted.astype(
                    args.result_dtype)
        print("Done predicting in", time.time() - started_at, "sec")
    return results
def test_correlation(
        alleles=None,
        num_peptides_per_length=1000,
        lengths=[8, 9, 10],
        debug=False):
    """
    Check that the predictors in ``PREDICTORS`` agree with each other.

    For every allele, each predictor's output on a shared set of random
    peptides is stored as a dataframe column, and the Pearson correlation
    between log10 of the "allele-specific" and "pan-allele" columns is
    recorded. The mean correlation over alleles must exceed 0.65.

    Parameters
    ----------
    alleles : iterable of str, optional
        Alleles to test; defaults to those supported by every predictor.
    num_peptides_per_length : int
        Number of random peptides generated for each length.
    lengths : list of int
        Peptide lengths to generate.
    debug : bool
        Drop into ``ipdb`` when an allele's correlation is below 0.6.

    Returns
    -------
    pandas.DataFrame
        Columns ``allele`` and ``correlation``, one row per allele.
    """
    generated = [
        peptide
        for length in lengths
        for peptide in random_peptides(num_peptides_per_length, length)
    ]

    # Deduplicate and wrap once so encodings are cached across predictions.
    peptides = EncodableSequences.create(list(set(generated)))

    if alleles is None:
        supported = [
            set(predictor.supported_alleles)
            for predictor in PREDICTORS.values()
        ]
        alleles = set.intersection(*supported)
    alleles = sorted(set(alleles))

    df = pandas.DataFrame(index=peptides.sequences)

    rows = []
    for allele in alleles:
        for name, predictor in PREDICTORS.items():
            df[name] = predictor.predict(peptides, allele=allele)

        log_allele_specific = numpy.log10(df["allele-specific"])
        log_pan_allele = numpy.log10(df["pan-allele"])
        correlation = numpy.corrcoef(
            log_allele_specific, log_pan_allele)[0, 1]

        rows.append((allele, correlation))
        print(len(rows), len(alleles), allele, correlation)

        if correlation < 0.6:
            print("Warning: low correlation", allele)
            # Temporary column used only to order the diagnostic printout.
            df["tightest"] = df.min(axis=1)
            print(df.sort_values("tightest").iloc[:, :-1])
            if debug:
                import ipdb
                ipdb.set_trace()
            del df["tightest"]

    results_df = pandas.DataFrame(rows, columns=["allele", "correlation"])
    print(results_df)

    mean_correlation = results_df.correlation.mean()
    print("Mean correlation", mean_correlation)
    assert_greater(mean_correlation, 0.65)

    return results_df