def test_allele_factory(self):
    """Check which class the ``Allele`` constructor hands back.

    A paired ``DPA1-DPB1`` name must produce a ``CombinedAllele``, a single
    locus name an ``Allele``; in both cases ``prob`` has to be preserved.
    """
    combined = Allele("HLA-DPA1*01:03-DPB1*01:01", prob=1)
    single = Allele("HLA-A*02:01", prob=2)

    checks = (
        (combined, CombinedAllele, 1),
        (single, Allele, 2),
    )
    for allele, expected_type, expected_prob in checks:
        self.assertIsInstance(allele, expected_type)
        self.assertEqual(allele.prob, expected_prob)
def setUp(self):
    """Create the peptide, allele and transcript fixtures used by the tests."""
    short_sequences = ("SYFPEITHI", "IHTIEPFYS")
    long_sequences = ("AAAAAASYFPEITHI", "IHTIEPFYSAAAAAA")
    class_one_names = ("HLA-B*07:02", "HLA-A*02:01")
    class_two_names = ("HLA-DRB1*07:01", "HLA-DRB1*15:01")
    combined_names = ("DPA1*01:03-DPB1*01:01", "DQA1*06:02-DQB1*06:31")

    self.peptides_mhcI = [Peptide(seq) for seq in short_sequences]
    self.peptides_mhcII = [Peptide(seq) for seq in long_sequences]
    self.mhcI = [Allele(name) for name in class_one_names]
    self.mhcII = [Allele(name) for name in class_two_names]
    self.mhcII_combined_alleles = [
        CombinedAllele(name) for name in combined_names
    ]
    self.transcript = Transcript("")
def parse_external_result(self, output):
    """
    Reads the ``.typing.txt`` result file and collects the allele pairs listed
    after the "Inferred Allelic Pairs" separator line

    If ``output`` is a directory the file ``<output>/.typing.txt`` is read,
    otherwise ``output`` is used as a prefix and ``<output>.typing.txt`` is read.
    Every non-blank line after the separator must consist of three
    whitespace-separated fields; the first two are allele names, which are
    truncated to their first two ``:``-separated fields.

    :param str output: The path to the output dir (or the output file prefix)
    :return: The predicted HLA genotype
    :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
    """
    section_marker = "------------ Inferred Allelic Pairs -------------"

    if os.path.isdir(output):
        typing_path = os.path.join(output, ".typing.txt")
    else:
        typing_path = output + ".typing.txt"

    genotype = []
    in_pairs_section = False
    with open(typing_path, "r") as handle:
        for line in handle:
            if in_pairs_section and line.strip() != "":
                first, second, _ = line.split()
                for name in (first, second):
                    four_digit = ":".join(name.split(":")[:2])
                    genotype.append(Allele("HLA-" + four_digit))
            # the separator itself only switches parsing on for later lines
            if section_marker in line:
                in_pairs_section = True
    return genotype
def affinities_from_csv(bindings_file, allele_data=None, peptide_coverage=None):
    '''
    Load binding affinities from a csv file into an EpitopePredictionResult.

    The csv must provide the columns ``Seq`` and ``Method``; every remaining
    column is turned into an Allele. When ``allele_data`` is given, the
    allele probability is taken from ``allele_data[column]['frequency'] / 100``.
    When ``peptide_coverage`` is given, rows whose peptide is not contained in
    it are discarded and the proteins listed for the kept peptides are
    registered in ``peptide.proteins``.
    '''
    table = pd.read_csv(bindings_file)
    table['Seq'] = table.Seq.apply(Peptide)

    if peptide_coverage is not None:
        row_mask = []
        for peptide in table.Seq:
            covered = peptide in peptide_coverage
            row_mask.append(covered)
            if covered:
                for protein in peptide_coverage[str(peptide)]:
                    peptide.proteins[protein] = protein
        table = table[row_mask]

    table = table.set_index(['Seq', 'Method'])

    if allele_data is None:
        table.columns = [Allele(column) for column in table.columns]
    else:
        table.columns = [
            Allele(column, allele_data[column]['frequency'] / 100)
            for column in table.columns
        ]

    return EpitopePredictionResult(table)
def parse_external_result(self, output):
    """
    Reads the file ``winner.hla.txt`` from the output dir and converts every
    line into two alleles

    Each non-blank line must consist of three whitespace-separated fields
    (after the substrings ``-n`` and ``-e`` have been removed): a label that
    is ignored and two allele identifiers of the form
    ``<prefix>_<gene>_<field1>_<field2>[_...]``, which are rewritten to
    ``HLA-<GENE>*<field1>:<field2>``.

    :param str output: The path to the output dir
    :return: The predicted HLA genotype
    :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
    :raises IOError: if ``winner.hla.txt`` cannot be opened or if one of its
                     lines does not have the expected format
    """
    result_file = os.path.join(output, "winner.hla.txt")

    # Only the open() call is guarded by the "not found" message, so that a
    # format error raised further down is not relabelled as a missing file.
    try:
        f = open(result_file, "r")
    except IOError:
        raise IOError(
            "File {out} could not be found. Please check your specified output folder"
            .format(out=result_file))

    alleles = []
    with f:
        for l in f:
            if not l.strip():
                # tolerate empty (e.g. trailing) lines
                continue
            try:
                _, a1, a2 = l.replace("-n", "").replace("-e", "").strip().split()
                a1 = a1.split("_")
                a2 = a2.split("_")
                alleles.extend([
                    Allele("HLA-" + a1[1].upper() + "*" + a1[2] + ":" + a1[3]),
                    Allele("HLA-" + a2[1].upper() + "*" + a2[2] + ":" + a2[3])
                ])
            except (ValueError, IndexError):
                # wrong number of columns or too few "_"-separated fields
                raise IOError(
                    "Output format seems incorrect:\n{line}\n. Please check if Polysolver ran correctly."
                    .format(line=l))
    return alleles
def parse_external_result(self, output):
    """
    Reads the tab-separated genotype files ``<output>-ClassI.HLAgenotype4digits``
    and ``<output>-ClassII.HLAgenotype4digits`` and collects the alleles found
    in their ``Allele 1`` and ``Allele 2`` columns

    A file that cannot be opened is reported with a warning and skipped, so
    the result may contain only class I alleles, only class II alleles, or be
    empty.

    :param str output: The output prefix the genotype files were written with
    :return: The predicted HLA genotype
    :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
    """
    alleles = []
    genotype_files = (
        ("-ClassI.HLAgenotype4digits", "HLA-I"),
        ("-ClassII.HLAgenotype4digits", "HLA-II"),
    )
    for suffix, hla_class in genotype_files:
        genotype_file = output + suffix
        try:
            with open(genotype_file) as handle:
                for row in csv.DictReader(handle, delimiter="\t"):
                    alleles.extend([Allele("HLA-" + row["Allele 1"]),
                                    Allele("HLA-" + row["Allele 2"])])
        except IOError as e:
            # error=e belongs to str.format(); handing it to warnings.warn()
            # made the formatting itself fail with a KeyError
            warnings.warn(
                "Output file {path} for {hla_class} could not be found. {error}".format(
                    path=genotype_file, hla_class=hla_class, error=e))
    return alleles
def test_consistency(self):
    """
    Exercises the special methods (``__init__``, ``__repr__``, ``__eq__``) of
    the fixture allele

    The assertions run in sequence: once one fails the remaining ones are
    not evaluated.
    """
    expected_name = "HLA-A*02:01"

    self.assertTrue(repr(self.simple) == expected_name)
    self.assertEqual(self.simple, Allele(expected_name))
    # NOTE(review): this compares the repr *string* with an Allele object -
    # confirm whether self.simple itself was meant to be compared.
    self.assertNotEqual(repr(self.simple), Allele(expected_name + ":666"))
def setUp(self):
    """Create the peptide and allele fixtures used by the tests."""
    # two 9-mers for class I; a 9-mer and a 15-mer for class II
    class_one_sequences = ("SYFPEITHI", "IHTIEPFYS")
    class_two_sequences = ("SYFPEITHI", "IHTIEPFYSAAAAAA")
    class_one_names = ("HLA-B*15:01", "HLA-A*02:01")
    class_two_names = ("HLA-DRB1*07:01", "HLA-DRB1*15:01")

    self.peptides_mhcI = [Peptide(seq) for seq in class_one_sequences]
    self.peptides_mhcII = [Peptide(seq) for seq in class_two_sequences]
    self.mhcI = [Allele(name) for name in class_one_names]
    self.mhcII = [Allele(name) for name in class_two_names]
def compute_affinities(input_alleles, input_peptides, output_affinities, processes, predictor):
    '''
    Compute the binding affinities between the given peptides and HLA alleles.

    Alleles are read through ``utilities.get_alleles_and_thresholds`` and
    peptides from the csv columns ``peptide`` and ``proteins`` (a
    ``;``-separated list). Peptides are processed in batches of 256, those
    occurring in most proteins first, and every batch result is appended to
    ``output_affinities`` (the header is only written with the first batch).
    '''
    allele_table = utilities.get_alleles_and_thresholds(input_alleles)
    alleles = [Allele(name.replace('HLA-', '')) for name in allele_table.index]
    LOGGER.info('Loaded %d alleles', len(alleles))

    peptides = []
    with open(input_peptides) as f:
        for record in csv.DictReader(f):
            protein_count = len(record['proteins'].split(';'))
            peptides.append((Peptide(record['peptide']), protein_count))
    peptides = sorted(peptides, key=lambda pair: pair[1], reverse=True)
    LOGGER.info('Loaded %d peptides', len(peptides))

    def jobs():
        # one (method, batch, alleles) task per batch of 256 peptides
        only_peptides = (peptide for peptide, _ in peptides)
        for batch in utilities.batches(only_peptides, bsize=256):
            yield (predictor.lower(), batch, alleles)

    results = utilities.parallel_apply(get_binding_affinity_process, jobs(), processes)

    processed = 0
    for bindings in results:
        nothing_written = processed == 0
        bindings.to_csv(output_affinities,
                        header=nothing_written,
                        mode='w' if nothing_written else 'a')
        processed += len(bindings)
        LOGGER.debug('Processed %d peptides (%.2f%%)...', processed,
                     100 * processed / len(peptides))
def read_hla_input(hla_file):
    """
    reads in the tab-separated hla file

    header are defined as:

    A1 - first HLA-A allele in 4-digit notation
    A2 - second HLA-A allele in 4-digit notation
    B1 - first HLA-B allele in 4-digit notation
    B2 - second HLA-B allele in 4-digit notation
    C1 - first HLA-C allele in 4-digit notation
    C2 - second HLA-C allele in 4-digit notation
    A_expression - expression of HLA A gene
    B_expression - expression of HLA B gene
    C_expression - expression of HLA C gene

    For every data row six alleles are created (A1, B1, C1, A2, B2, C2) and the
    expression of the corresponding gene is logged as metadata "abundance" on
    each of them.

    :param hla_file: path to the tab-separated HLA file
    :return: list(Allele)
    """
    alleles = []
    # Plain "r": the "U" flag was removed in Python 3.11 (open() raises
    # ValueError there) and universal newlines are the text-mode default
    # on Python 3 anyway.
    with open(hla_file, "r") as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            for n, hla in itr.product([1, 2], ["A", "B", "C"]):
                a = Allele(row[hla + str(n)])
                a.log_metadata("abundance", float(row[hla + "_expression"]))
                alleles.append(a)
    return alleles
def parse_external_result(self, output):
    """
    Reads the tab-separated genotype files ``<output>-ClassI.HLAgenotype4digits``
    and ``<output>-ClassII.HLAgenotype4digits`` and collects the alleles found
    in their ``Allele 1`` and ``Allele 2`` columns (single quotes are removed
    from the names)

    Class II rows mentioning ``DRB`` yield plain alleles. Rows mentioning
    ``DQA`` and all remaining rows are collected separately and every
    (DQA, other) combination is added as a combined allele.

    A file that cannot be opened is reported with a warning and skipped.

    :param str output: The output prefix the genotype files were written with
    :return: The predicted HLA genotype
    :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
    """
    alleles = []

    class_one_file = output + "-ClassI.HLAgenotype4digits"
    try:
        with open(class_one_file) as c1:
            for row in csv.DictReader(c1, delimiter="\t"):
                alleles.extend([
                    Allele("HLA-" + row["Allele 1"].replace("'", "")),
                    Allele("HLA-" + row["Allele 2"].replace("'", ""))
                ])
    except IOError as e:
        # error=e belongs to str.format(); handing it to warnings.warn()
        # made the formatting itself fail with a KeyError
        warnings.warn(
            "Output file {c1} for HLA-I could not be found. {error}".format(
                c1=class_one_file, error=e))

    class_two_file = output + "-ClassII.HLAgenotype4digits"
    try:
        with open(class_two_file) as c2:
            DQA = []
            DQB = []
            for row in csv.DictReader(c2, delimiter="\t"):
                a1, a2 = row["Allele 1"], row["Allele 2"]
                clean_pair = [a1.replace("'", ""), a2.replace("'", "")]
                if "DRB" in a1 or "DRB" in a2:
                    alleles.extend(Allele("HLA-" + name) for name in clean_pair)
                elif "DQA" in a1 or "DQA" in a2:
                    DQA.extend(clean_pair)
                else:
                    DQB.extend(clean_pair)
            for dq in itertools.product(DQA, DQB):
                alleles.append(CombinedAllele("HLA-" + "-".join(dq)))
    except IOError as e:
        warnings.warn(
            "Output file {c2} for HLA-II could not be found. {error}".format(
                c2=class_two_file, error=e))
    return alleles
def predict_peptide_effects(peptides, alleles=None):
    """
    Predict the peptide effect for all the available methods on the machine

    Every predictor returned by ``valid_predictors()`` is run in turn. A
    predictor that raises is reported on stdout and skipped, so that the
    remaining predictors still contribute to the result.

    Args:
        peptides (list of Peptides): Usually an output from read_fasta
        alleles (list of str): Allele names for which to run the predictors.
            Predictors supporting none of them are skipped. With ``None``
            every predictor is called with ``alleles=None``.

    Returns:
        pd.DataFrame: Tidy pd.DataFrame with the columns ``peptide``,
            ``method``, ``allele`` and ``score``. If the method is unable to
            predict for a particular value the rows are not present; if no
            predictor produced a result at all the frame is empty.

    Example:
        >>> peptides = [Peptide("SYFPEITHI"), Peptide("FIASNGVKL"), Peptide("LLGATCMFV")]
        >>> alleles = ['A*02:16', 'B*45:01']
        >>> predict_peptide_effects(peptides, alleles = alleles).head()
                               peptide    method   allele       score
        0  (F, I, A, S, N, G, V, K, L)       arb  A*02:16  594.691144
        1  (F, I, A, S, N, G, V, K, L)       smm  A*02:16  159.768074
        2  (F, I, A, S, N, G, V, K, L)  smmpmbec  A*02:16  211.977614
        4  (F, I, A, S, N, G, V, K, L)   unitope  A*02:16    0.527849
        5  (L, L, G, A, T, C, M, F, V)       arb  A*02:16    6.784222
    """
    dt = valid_predictors()
    results = []
    for _, predictor in dt.iterrows():
        # subset to valid alleles
        if alleles is not None:
            valid_alleles = predictor["supportedAlleles"].intersection(alleles)
            if len(valid_alleles) == 0:
                continue
            valid_alleles = [Allele(al) for al in valid_alleles]
        else:
            valid_alleles = None

        method = predictor["name"]
        print("method: ", method)
        t0 = time.time()
        try:
            results.append(
                EpitopePredictorFactory(method).predict(peptides,
                                                        alleles=valid_alleles))
        except Exception:
            # best effort: one failing predictor must not abort the others.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            print("Error! Unable to run ", method, ": ", sys.exc_info())
        t1 = time.time()
        print(" - runtime: ", str(t1 - t0))

    if not results:
        # nothing could be predicted (no predictor supports the alleles or
        # all of them failed); results[0] below would raise an IndexError
        return pd.DataFrame(columns=["peptide", "method", "allele", "score"])

    df = results[0].merge_results(results[1:]).reset_index()
    dfm = pd.melt(df,
                  id_vars=["Seq", "Method"],
                  var_name="allele",
                  value_name="score")
    dfm = dfm[dfm["score"].notnull()]
    dfm.rename(columns={'Seq': 'peptide', 'Method': 'method'}, inplace=True)
    return dfm
def generate_alleles(allele_file, generated=None):
    """
    generate allele objects from input

    Every non-blank line of ``allele_file`` must contain an allele name and a
    frequency, separated by whitespace, ``,`` or ``;``. Only alleles whose
    name (without an optional ``HLA-`` prefix) starts with ``A``, ``B`` or
    ``C`` are kept; blank lines are skipped.

    :param str allele_file: path to the allele/frequency file
    :param generated: unused, kept for backward compatibility
    :return: alleles with ``prob`` set to the frequency given in the file
    :rtype: list(Allele)
    """
    result = []
    with open(allele_file, "r") as f:
        for l in f:
            fields = l.replace(",", " ").replace(";", " ").split()
            if not fields:
                # blank (e.g. trailing) line: nothing to unpack
                continue
            al, freq = fields
            # startswith() also copes with an empty name after "HLA-"
            if al.split("HLA-")[-1].startswith(("A", "B", "C")):
                result.append(Allele(al, prob=float(freq)))
    return result
def test_pareto_front_assembly(self):
    """Run ``paretosolve`` on the fixture peptides and print its result.

    Uses the PCM cleavage predictor, the SMM epitope predictor, a single
    allele with a threshold of 10000 and the cbc solver. The test contains
    no assertions; it only fails if an exception is raised.
    """
    cleavage_predictor = CleavageSitePredictorFactory("PCM")
    epitope_predictor = EpitopePredictorFactory("SMM")
    alleles = [Allele("HLA-A*02:01")]
    thresholds = {}
    for allele in alleles:
        thresholds[allele.name] = 10000

    def at_most(a, b):
        return a <= b

    assembler = ParetoEpitopeAssembly(
        self.peptides,
        cleavage_predictor,
        epitope_predictor,
        alleles,
        thresholds,
        at_most,
        solver="cbc",
        verbosity=0,
    )
    pareto_front = assembler.paretosolve()
    print(pareto_front)
def run_predictor(pred, dataset):
    """Run predictor ``pred`` on ``dataset`` for the alleles in ``args.allele``.

    The prediction result and its ``describe()`` summary are printed. A
    ``ValueError`` raised while predicting or printing is ignored.

    Returns:
        tuple: ``(len(results), len(dataset))``, where ``results`` stays an
        empty tuple if the prediction itself failed with a ``ValueError``.
    """
    engine = EpitopePredictorFactory(pred)
    predictions = ()
    try:
        requested = [Allele(name) for name in args.allele]
        predictions = engine.predict(dataset, alleles=requested)
        print(predictions)
        print(predictions.describe())
    except ValueError:
        pass
    predicted_count = len(predictions)
    dataset_size = len(dataset)
    return (predicted_count, dataset_size)
def test_pareto_assembly(self):
    """Run ``solve(eps=1e10, order=(1, 0))`` on the fixture peptides.

    The raw epitope prediction and the assembly result are printed. The test
    contains no assertions; it only fails if an exception is raised.
    """
    cleavage_predictor = CleavageSitePredictorFactory("PCM")
    epitope_predictor = EpitopePredictorFactory("SMM")
    alleles = [Allele("HLA-A*02:01")]
    thresholds = {}
    for allele in alleles:
        thresholds[allele.name] = 10000

    def at_most(a, b):
        return a <= b

    print(epitope_predictor.predict(self.peptides, alleles=alleles))

    # positional order: cl_pred, ep_pred, alleles, threshold, comparator
    assembler = ParetoEpitopeAssembly(
        self.peptides,
        cleavage_predictor,
        epitope_predictor,
        alleles,
        thresholds,
        at_most,
        solver="cbc",
        verbosity=1,
    )
    solution = assembler.solve(eps=1e10, order=(1, 0))
    print(solution)
def design_spacers(input_epitopes, input_alleles, top_proteins, top_immunogen,
                   top_alleles, solver, pssm_cleavage, alpha, beta,
                   spacer_length, pssm_epitope, processes, output_spacers):
    '''
    Design spacers for every ordered pair of distinct epitopes and write them
    to a csv file with the columns ``from``, ``to``, ``score`` and ``spacer``.

    Epitopes containing the letter ``X`` are dropped before the design. Only
    ``PCM`` is accepted for ``pssm_cleavage`` and only ``BIMAS`` for
    ``pssm_epitope``; anything else raises ValueError.
    '''
    loaded = utilities.load_epitopes(input_epitopes, top_immunogen,
                                     top_alleles, top_proteins)
    all_epitopes = list(loaded.keys())
    epitopes = []
    for candidate in all_epitopes:
        if 'X' not in candidate:
            epitopes.append(candidate)
    LOGGER.debug('Removed %d epitopes with unknown amino acids',
                 len(all_epitopes) - len(epitopes))
    LOGGER.info('Loaded %d epitopes', len(epitopes))

    alleles_df = utilities.get_alleles_and_thresholds(input_alleles)
    allele_list = []
    threshold = {}
    for name, row in alleles_df.iterrows():
        short_name = name.replace('HLA-', '')
        allele_list.append(Allele(short_name, prob=row.frequency / 100))
        threshold[short_name] = row.threshold
    LOGGER.info('Loaded %d alleles', len(allele_list))

    if pssm_cleavage != 'PCM':
        raise ValueError('Only PCM supported as cleavage predictor')
    cleavage_predictor = PCM()  # TODO use factory when it works

    if pssm_epitope != 'BIMAS':
        raise ValueError('Only BIMAS supported as epitope predictor')
    epitope_predictor = BIMAS()  # TODO use factory when it works

    design = OptimalSpacerDesign(
        epitopes,
        cleavage_predictor,
        epitope_predictor,
        allele_list,
        threshold=threshold,
        solver=solver,
        k=spacer_length,
        alpha=alpha,
        beta=beta,
    )
    designer = design.solve(threads=processes)

    LOGGER.info('Writing results...')
    with open(output_spacers, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('from', 'to', 'score', 'spacer'))
        for ei in epitopes:
            for ej in epitopes:
                if ei == ej:
                    continue
                writer.writerow((ei, ej, designer.adj_matrix[ei, ej],
                                 designer.spacer[ei, ej]))
def parse_external_result(self, output):
    """
    Searches within the defined dir for the most recently modified sub dir
    and reads the prediction file ``<subdir>/<subdir name>_result.tsv`` from
    there

    Only the first data row of the tab-separated file is used; its columns
    ``A1``, ``A2``, ``B1``, ``B2``, ``C1`` and ``C2`` are converted to alleles
    in that order.

    :param str output: The path to the output dir
    :return: The predicted HLA genotype
    :rtype: list(:class:`~Fred2.Core.Allele.Allele`)
    :raises ValueError: if ``output`` contains no sub directory
    :raises IOError: if the result file cannot be opened or has no data row
    """
    all_subdirs = [os.path.join(output, d) for d in os.listdir(output)
                   if os.path.isdir(os.path.join(output, d))]
    latest_subdir = max(all_subdirs, key=os.path.getmtime)
    result_file = os.path.join(
        latest_subdir,
        os.path.basename(os.path.normpath(latest_subdir)) + "_result.tsv")

    with open(result_file, "r") as f:
        try:
            # next() works on Python 2 and 3; reader.next() is Python 2 only
            row = next(csv.DictReader(f, delimiter="\t"))
        except StopIteration:
            # do not leak StopIteration out of a regular function
            raise IOError("Result file {f} does not contain a typing result."
                          .format(f=result_file))

    # a real list (as documented), not a lazy map object
    return [Allele("HLA-" + row[k])
            for k in ["A1", "A2", "B1", "B2", "C1", "C2"]]
def generate_epitope_result(input, allele_file):
    """
    generates EpitopePredictionResult from output of epitopeprediction and
    neoepitopeprediction

    ``allele_file`` holds one ``<allele>\\t<frequency>`` pair per line.
    ``input`` is a tab-separated table; every column other than ``Sequence``,
    ``Method``, ``Antigen ID`` and ``Variant`` is treated as an allele column.
    Only allele columns that also occur in ``allele_file`` are used. If there
    are none, a message is written to stderr and the process exits with -1.

    :return: tuple of the prediction result (indexed by ``Seq`` and
             ``Method``) and the method name taken from the first table row
    """
    # first generate alleles in allele file
    population = {}
    with open(allele_file, "r") as handle:
        for line in handle:
            name, frequency = line.split("\t")
            population[name] = Allele(name, prob=float(frequency))

    raw = pandas.read_csv(input, sep="\t")
    method = raw.loc[0, "Method"]
    meta_columns = set(["Sequence", "Method", "Antigen ID", "Variant"])
    allele_columns = [c for c in raw.columns if c not in meta_columns]
    shared_columns = [c for c in allele_columns if c in population]

    scores = {}
    for _, record in raw.iterrows():
        sequence = record["Sequence"]
        try:
            positions = {
                Protein(p, gene_id=p, transcript_id=p): [0]
                for p in str(record["Antigen ID"]).split(",")
            }
        except KeyError:
            # no antigen column available: peptide without protein origin
            positions = collections.defaultdict(list)
        peptide = Peptide(sequence, protein_pos=positions)

        for column in shared_columns:
            allele = population[column]
            scores.setdefault(allele, {})[peptide] = float(record[column])

    if not scores:
        sys.stderr.write("HLA alleles of population and HLA used for prediction did not overlap.")
        sys.exit(-1)

    df_result = EpitopePredictionResult.from_dict(scores)
    index_tuples = [(entry, method) for entry in df_result.index]
    df_result.index = pandas.MultiIndex.from_tuples(index_tuples,
                                                    names=['Seq', 'Method'])
    return df_result, method
def string_of_beads(input_proteins, input_alleles, input_epitopes,
                    input_cleavages, output_vaccine, cocktail, greedy_subtour,
                    max_aminoacids, max_epitopes, min_alleles, min_proteins,
                    min_avg_prot_conservation, min_avg_alle_conservation):
    '''
    Selects epitopes with a TeamOrienteeringIlp model and writes the result to
    a csv file with the columns ``cocktail``, ``index`` and ``epitope``.

    Vertices are the epitopes mentioned in the cleavage file (columns ``from``,
    ``to``, ``score``) plus an artificial empty vertex at index 0; the reward
    of a vertex is the ``immunogen`` value of its epitope and the cost of an
    edge is the cleavage score of the pair (0.0 for edges touching the empty
    vertex). ``cocktail`` is passed as the number of teams, ``max_aminoacids``
    as the maximum edge cost and ``max_epitopes`` as the maximum number of
    vertices. Timing information is logged at the end.
    '''
    program_start_time = time.time()

    # load proteins
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    # load alleles
    alleles = [
        Allele(a)
        for a in utilities.get_alleles_and_thresholds(input_alleles).index
    ]
    LOGGER.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitopes = utilities.load_epitopes(input_epitopes)
    LOGGER.info('Loaded %d epitopes', len(epitopes))

    # read cleavage scores
    cleavage_epitopes = set()
    with open(input_cleavages) as f:
        cleavages = {}
        for row in csv.DictReader(f):
            cleavages[(row['from'], row['to'])] = float(row['score'])
            cleavage_epitopes.add(row['from'])
            cleavage_epitopes.add(row['to'])
    LOGGER.info('Loaded %d cleavage scores', len(cleavages))

    # compute edge cost
    edge_cost, vertices, vertices_rewards = [], [], []
    # index 0 is the artificial empty vertex, the epitopes follow
    vertex_to_epitope = [''] + list(cleavage_epitopes)
    for ep_from in vertex_to_epitope:
        vertices.append(ep_from)
        vertices_rewards.append(0 if ep_from == '' else epitopes[ep_from]['immunogen'])
        # NOTE(review): this looks up every ordered pair of epitopes, including
        # (e, e); a pair missing from the cleavage file raises KeyError -
        # confirm that the file always contains all of them.
        edge_cost.append([
            cleavages[(ep_from, ep_to)] if ep_from != '' and ep_to != '' else 0.0
            for ep_to in vertex_to_epitope
        ])
    LOGGER.info('Kept %d epitopes with available clevages', len(vertices) - 1)

    type_coverage, min_type_coverage, min_avg_type_conservation = utilities.compute_coverage_matrix(
        [epitopes[e] for e in vertex_to_epitope[1:]], min_alleles, min_proteins,
        min_avg_prot_conservation, min_avg_alle_conservation, len(proteins), len(alleles))

    # find optimal design
    solver_build_time = time.time()
    solver = TeamOrienteeringIlp(
        num_teams=cocktail, vertex_reward=vertices_rewards, edge_cost=edge_cost,
        type_coverage=type_coverage, min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
        max_edge_cost=max_aminoacids, max_vertices=max_epitopes,
        lazy_subtour_elimination=not greedy_subtour)
    solver.build_model()
    solver_start_time = time.time()
    result = solver.solve()
    solver_end_time = time.time()

    # print info and save
    with open(output_vaccine, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('cocktail', 'index', 'epitope'))
        for i, mosaic in enumerate(result):
            LOGGER.info('Mosaic #%d', i + 1)
            # the last element of every mosaic is skipped; the second item of
            # each remaining element is used as index into vertex_to_epitope
            for j, (_, vertex) in enumerate(mosaic[:-1]):
                epitope = epitopes[vertex_to_epitope[vertex]]
                writer.writerow((i, j, epitope['epitope']))
                LOGGER.info(' %s - IG: %.2f', epitope['epitope'], epitope['immunogen'])

    LOGGER.info('==== Stopwatch')
    LOGGER.info(' Total time : %.2f s', solver_end_time - program_start_time)
    LOGGER.info(' Pre-processing : %.2f s', solver_build_time - program_start_time)
    LOGGER.info(' Model creation time : %.2f s', solver_start_time - solver_build_time)
    LOGGER.info(' Solving time : %.2f s', solver_end_time - solver_start_time)
def run_sequential(input_epitopes, input_alleles, input_affinities,
                   output_vaccine, num_epitopes, min_alleles, min_proteins,
                   solver, **kwargs):
    '''
    Design a vaccine in two sequential steps and write it to a csv file.

    First OptiTope selects ``num_epitopes`` epitopes (optionally with allele
    and antigen coverage constraints when ``min_alleles`` / ``min_proteins``
    are not None), then EpitopeAssemblyWithSpacer orders them and designs the
    spacers. The output has one row with the columns ``immunogen``,
    ``vaccine``, ``spacers`` and ``cleavage``. Epitopes containing the letter
    ``X`` are ignored. Additional keyword arguments are accepted and ignored.
    '''
    epitope_data = {}
    for sequence, info in utilities.load_epitopes(input_epitopes).items():
        if 'X' not in sequence:
            epitope_data[sequence] = info
    LOGGER.info('Loaded %d epitopes', len(epitope_data))

    peptide_coverage = {}
    for info in epitope_data.values():
        # we don't really need the actual protein sequence, just fill it
        # with the id to make it unique
        proteins = set(Protein(gid, gene_id=gid) for gid in info['proteins'])
        peptide_coverage[Peptide(info['epitope'])] = proteins

    allele_table = utilities.get_alleles_and_thresholds(input_alleles)
    allele_data = allele_table.to_dict('index')
    alleles = []
    threshold = {}
    for name, data in allele_data.items():
        short_name = name.replace('HLA-', '')
        alleles.append(Allele(short_name, prob=data['frequency'] / 100))
        threshold[short_name] = data['threshold']
    LOGGER.info('Loaded %d alleles', len(threshold))

    affinities = affinities_from_csv(input_affinities, allele_data,
                                     peptide_coverage=peptide_coverage)
    LOGGER.info('Loaded %d affinities', len(affinities))

    LOGGER.info('Selecting epitopes...')
    model = OptiTope(affinities, threshold, k=num_epitopes, solver=solver)
    if min_alleles is not None:
        model.activate_allele_coverage_const(min_alleles)
    if min_proteins is not None:
        model.activate_antigen_coverage_const(min_proteins)
    selected_epitopes = model.solve()

    LOGGER.info('Creating spacers...')
    assembler = EpitopeAssemblyWithSpacer(selected_epitopes, PCM(), BIMAS(),
                                          alleles, threshold=threshold,
                                          solver=solver)
    vaccine = assembler.solve()

    # even positions of the solution are summed as epitopes, odd positions
    # are written out as spacers
    immunogen = sum(epitope_data[str(e)]['immunogen'] for e in vaccine[::2])
    sequence = ''.join(str(part) for part in vaccine)
    cleavage = pcm.DoennesKohlbacherPcm().cleavage_per_position(sequence)
    spacers = ';'.join(str(e) for e in vaccine[1::2])
    cleavage_scores = ';'.join('%.3f' % c for c in cleavage)

    with open(output_vaccine, 'w') as f:
        fieldnames = ('immunogen', 'vaccine', 'spacers', 'cleavage')
        writer = csv.DictWriter(f, fieldnames)
        writer.writeheader()
        writer.writerow({
            'immunogen': immunogen,
            'vaccine': sequence,
            'spacers': spacers,
            'cleavage': cleavage_scores,
        })
def get_mosaic_solver_instance(logger, input_proteins, input_alleles,
                               input_epitopes, input_overlaps, **kwargs):
    '''
    Loads the input data and builds a TeamOrienteeringIlp instance from it.

    The keyword arguments ``top_immunogen``, ``top_alleles`` and
    ``top_proteins`` are mandatory (KeyError if missing). Optional ones are
    ``min_overlap``, ``cocktail``, ``greedy_subtour``, ``max_epitopes``,
    ``max_aminoacids``, ``min_alleles``, ``min_proteins``,
    ``min_avg_prot_conservation`` and ``min_avg_alle_conservation``. The
    solver is created with ``max_edge_cost=0`` and ``max_vertices=0``; the
    limits are only updated when ``max_aminoacids`` / ``max_epitopes`` are
    numbers.

    Returns a tuple of the solver and a dict with the keys ``proteins``,
    ``alleles`` and ``epitope_data``.
    '''
    required = ('top_immunogen', 'top_alleles', 'top_proteins')
    top_immunogen, top_alleles, top_proteins = (kwargs.pop(key) for key in required)

    def option(name, default=None):
        return kwargs.get(name, default)

    min_overlap = option('min_overlap', 0)
    cocktail = option('cocktail', 1)
    greedy_subtour = option('greedy_subtour')
    max_epitopes = option('max_epitopes')
    max_aminoacids = option('max_aminoacids')
    min_alleles = option('min_alleles', 0)
    min_proteins = option('min_proteins', 0)
    min_avg_prot_conservation = option('min_avg_prot_conservation', 0)
    min_avg_alle_conservation = option('min_avg_alle_conservation', 0)

    # load proteins
    logger.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    logger.info('%d proteins read', len(proteins))

    # load alleles
    allele_names = get_alleles_and_thresholds(input_alleles).index
    alleles = [Allele(name) for name in allele_names]
    logger.info('Loaded %d alleles', len(alleles))

    # load epitopes
    loaded = load_epitopes(input_epitopes, top_immunogen, top_alleles, top_proteins)
    epitope_data = list(loaded.values())
    logger.info('Loaded %d epitopes', len(epitope_data))

    # load edge cost
    logger.info('Loading overlaps...')
    vertex_rewards = [0]
    vertex_rewards.extend(entry['immunogen'] for entry in epitope_data)
    sequences = [entry['epitope'] for entry in epitope_data]
    edges = load_edges_from_overlaps(input_overlaps, min_overlap, sequences)
    epitope_count = len(epitope_data)
    logger.info('Kept %d edges (from %d)', len(edges),
                epitope_count * (epitope_count + 1))

    # compute hla and protein coverage
    logger.info('Computing coverage matrix...')
    coverage = compute_coverage_matrix(
        epitope_data, min_alleles, min_proteins, min_avg_prot_conservation,
        min_avg_alle_conservation, len(proteins), len(alleles))
    type_coverage, min_type_coverage, min_avg_type_conservation = coverage

    # find optimal design
    solver = TeamOrienteeringIlp(
        num_teams=cocktail,
        vertex_reward=vertex_rewards,
        edge_cost=edges,
        max_edge_cost=0,
        max_vertices=0,
        lazy_subtour_elimination=not greedy_subtour,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
    )

    numeric = (int, float)
    if isinstance(max_epitopes, numeric):
        solver.update_max_vertices(max_epitopes)
    if isinstance(max_aminoacids, numeric):
        solver.update_max_edge_cost(max_aminoacids)

    loaded_data = {
        'proteins': proteins,
        'alleles': alleles,
        'epitope_data': epitope_data,
    }
    return solver, loaded_data
from Fred2.EpitopePrediction import EpitopePredictorFactory #################################################################################################### # Convert raw peptide sequences to Fred2.Core.Peptide objects all_peptides = [Peptide(row.strip()) for row in args.peptides] # Separate peptides by length peptides_by_length = {} for peptide in all_peptides: if not len(peptide) in peptides_by_length: peptides_by_length[len(peptide)] = [] peptides_by_length[len(peptide)].append(peptide) # Convert raw allele strings to Fred2.Core.Allele objects alleles = [Allele(allele) for allele in args.alleles] # Instatiate predictor predictor = EpitopePredictorFactory("Syfpeithi") def matrix_max(matrix): """Returns the maximum attainable score for a pssm""" return sum([max(value.values()) for _, value in matrix.items()]) def load_allele_model(allele, length): """Returns the SYFPEITHI pssm for a given allele""" allele_model = "%s_%i" % (allele, length) try: return matrix_max(
# Predict scores for all 9-mer peptides of the input table with every
# available predictor and append them as additional columns.
file_in = arguments["--input"]
if not file_in:
    file_in = "./data/binders.csv"
file_out = arguments["--output"]

dt = pd.read_csv(file_in)
dt = dt[dt["Sequence"].notnull()]
dt = dt[dt["Sequence"].str.len() == 9]
peptides = [Peptide(peptide) for peptide in dt["Sequence"]]

# Normalise the allele names, e.g. "HLA-A0201" -> "A*02:01".
# regex= is given explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently disable the regex step.
dt["allele"] = dt["allele"].str.replace("*", "", regex=False).\
    str.replace(r"(-[a-zA-Z]+)([0-9]{2})([0-9]{2})", r"\1*\2:\3", regex=True).\
    str.replace("w", "", regex=False).\
    str.replace("HLA-", "", regex=False)
dt.rename(columns={"Sequence": "peptide"}, inplace=True)

# Not used below; kept because constructing the Allele objects still runs
# on every allele name.
alleles = [Allele(allele) for allele in dt["allele"].unique().tolist()]

res = fred2wrap.predict_peptide_effects(
    peptides, alleles=dt["allele"].unique().tolist())
# str() instead of the Seq.tostring() method, which newer Biopython
# releases no longer provide (assumes the values are Peptide objects)
res["peptide"] = [str(peptide) for peptide in res["peptide"]]
res["allele"] = [str(allele) for allele in res["allele"]]
res = res.pivot_table(index=["peptide", "allele"],
                      columns='method',
                      values='score').reset_index(None)

dt_merge = pd.merge(dt, res, on=["peptide", "allele"], how="left")
dt_merge.to_csv(file_out, index=False)
all_peptides = dt["mutant_sequence"].append(dt["wt_sequence"]).unique() peptides = [Peptide(peptide) for peptide in all_peptides] dt["allele"] = dt["allele"].str.replace("\*", "").\ str.replace(":", "").\ str.replace("(-[a-zA-Z]+)([0-9]{2})([0-9]{2})", "\\1*\\2:\\3").\ str.replace("w", "").\ str.replace("HLA-", "") # TODO # dt.rename(columns = {"Sequence": "peptide"}, inplace = True) alleles = [] valid_alleles = [] for allele in dt["allele"].tolist(): try: a = Allele(allele) valid_alleles.append(True) except: a = None valid_alleles.append(False) alleles.append(a) # subset invalid allele names dt = dt[pd.Series(valid_alleles)] res = fred2wrap.predict_peptide_effects(peptides, alleles=dt["allele"].unique().tolist()) res["peptide"] = [str(peptide) for peptide in res["peptide"]] res["allele"] = [str(allele) for allele in res["allele"]] # TODO - change melt order res = res.pivot_table(index=["peptide", "allele"],
def setUp(self):
    """Create the single allele fixture shared by the tests."""
    allele_name = "HLA-A*02:01"
    self.simple = Allele(allele_name)