def test_single_peptide_input_mhcII(self):
    for m in EpitopePredictorFactory.available_methods():
        model = EpitopePredictorFactory(m)
        if not isinstance(model, AExternalEpitopePrediction):
            if all(a.name in model.supportedAlleles for a in self.mhcII):
                res = model.predict(self.peptides_mhcII[0], alleles=self.mhcII[1])
def test_single_allele_input(self):
    for m in EpitopePredictorFactory.available_methods():
        for v in EpitopePredictorFactory.available_methods()[m]:
            mo = EpitopePredictorFactory(m, version=v)
            if isinstance(mo, AExternalEpitopePrediction) and not (mo.version == "0.1" and mo.name == "netmhc"):
                print "Testing", mo.name, "version", mo.version
                try:
                    if any(a.name in mo.supportedAlleles for a in self.mhcII):
                        mo.predict(self.peptides_mhcII, alleles=self.mhcII[0])
                    if any(a.name in mo.supportedAlleles for a in self.mhcII_combined_alleles):
                        mo.predict(self.peptides_mhcII, alleles=self.mhcII_combined_alleles[0])
                    if any(a.name in mo.supportedAlleles for a in self.mhcI):
                        mo.predict(self.peptides_mhcI, alleles=self.mhcI[0])
                    print "Success"
                except RuntimeError as e:  # catch only errors stemming from binary unavailability
                    if "could not be found in PATH" not in e.message:
                        raise e  # all other errors are real failures
            else:
                print mo.name, "not available"
def test_multiple_peptide_input_mhcI(self):
    for m in EpitopePredictorFactory.available_methods():
        model = EpitopePredictorFactory(m)
        if all(a.name in model.supportedAlleles for a in self.mhcI):
            res = model.predict(self.peptides_mhcI, alleles=self.mhcI)
def test_path_option_and_optional_parameters(self):
    netmhc = EpitopePredictorFactory("NetMHC")
    exe = netmhc.command.split()[0]
    for try_path in os.environ["PATH"].split(os.pathsep):
        try_path = try_path.strip('"')
        exe_try = os.path.join(try_path, exe).strip()
        if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
            netmhc.predict(self.peptides_mhcI, alleles=self.mhcI, path=exe_try, options="--sort")
def test_path_and_optional_parameters_netctl(self):
    netctlpan = EpitopePredictorFactory("NetCTLpan")
    exe = netctlpan.command.split()[0]
    for try_path in os.environ["PATH"].split(os.pathsep):
        try_path = try_path.strip('"')
        exe_try = os.path.join(try_path, exe).strip()
        if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
            print netctlpan.predict(self.peptides_mhcI, alleles=self.mhcI,
                                    command=exe_try,
                                    options="-wt 0.05 -wc 0.225 -ethr 0.5")
def test_path_option_and_optional_parameters_netmhc(self):
    netmhc = EpitopePredictorFactory("NetMHC")
    exe = netmhc.command.split()[0]
    for try_path in os.environ["PATH"].split(os.pathsep):
        try_path = try_path.strip('"')
        exe_try = os.path.join(try_path, exe).strip()
        if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
            r = netmhc.predict(self.peptides_mhcI, alleles=self.mhcI, command=exe_try, options="--sort", chunksize=1)
            self.assertTrue(len(r) == len(self.peptides_mhcI))
            self.assertAlmostEqual(r["A*02:01"]["SYFPEITHI"]["netmhc"], 0.150579105869, places=7)
            self.assertAlmostEqual(r["A*02:01"]["IHTIEPFYS"]["netmhc"], 0.0619540879359, places=7)
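# The three PATH-scanning tests above repeat the same lookup loop. A minimal
# reusable sketch of that pattern, using only the standard library; the helper
# name is ours and not part of Fred2 or the test suite.
import os

def find_executable_on_path(exe):
    """Return the first executable named `exe` found on $PATH, or None."""
    for try_path in os.environ["PATH"].split(os.pathsep):
        exe_try = os.path.join(try_path.strip('"'), exe).strip()
        if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
            return exe_try
    return None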
def run_predictor(pred, dataset):
    predictor = EpitopePredictorFactory(pred)
    results = ()
    try:
        results = predictor.predict(dataset, alleles=[Allele(a) for a in args.allele])
        print(results)
        print(results.describe())
    except ValueError:
        pass
    return (len(results), len(dataset))
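# A minimal, hypothetical driver for run_predictor above. run_predictor reads
# the module-level `args`, so a stub is created here; the allele and peptide
# values are illustrative assumptions, not part of the original script.
from argparse import Namespace

args = Namespace(allele=["HLA-A*02:01"])
dataset = [Peptide("SYFPEITHI"), Peptide("FIASNGVKL")]
n_rows, n_peptides = run_predictor("syfpeithi", dataset)
print("{} result rows from {} input peptides".format(n_rows, n_peptides))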
def test_pareto_assembly(self):
    cl_pred = CleavageSitePredictorFactory("PCM")
    ep_pred = EpitopePredictorFactory("SMM")
    allele = [Allele("HLA-A*02:01")]
    thresh = {a.name: 10000 for a in allele}
    comp = lambda a, b: a <= b
    print(ep_pred.predict(self.peptides, alleles=allele))
    # cl_pred, ep_pred, alleles, threshold, comparator, length=9
    assembler = ParetoEpitopeAssembly(self.peptides, cl_pred, ep_pred, allele, thresh, comp, solver="cbc", verbosity=1)
    r = assembler.solve(eps=1e10, order=(1, 0))
    print(r)
def setUp(self):
    self.proteins = []
    self.alleles = [Allele("HLA-A*01:01"), Allele("HLA-B*07:02"), Allele("HLA-C*03:01")]
    self.peptides = [Peptide(p) for p in
                     """SFSIFLLAL GHRMAWDMM VYEADDVIL CFTPSPVVV FLLLADARV GPADGMVSK
                        YLYDHLAPM GLRDLAVAV GPTPLLYRL TWVLVGGVL IELGGKPAL LAGGVLAAV
                        QYLAGLSTL NFVSGIQYL VLSDFKTWL ARPDYNPPL KLLPRLPGV RHTPVNSWL
                        GLYLFNWAV ALYDVVSTL RRCRASGVL WPLLLLLLA VTYSLTGLW YFVIFFVAA""".split()]
    self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
    self.thresh = {"A*01:01": 10, "B*07:02": 10, "C*03:01": 10}
def valid_predictors(supported_length=9, exclude_predictors=["epidemix", "unitope", "netctlpan"]):
    """
    Get the information for all predictors and keep only the relevant ones.

    Args:
        supported_length (int): Supported peptide input length.
        exclude_predictors (list of str): Names of methods to remove in addition.
    """
    methods = EpitopePredictorFactory.available_methods().keys()
    dt = pd.DataFrame([predictor_info(method) for method in methods])
    n_init = len(dt)
    dt = dt[[supported_length in elems for elems in dt["supportedLength"]]]
    dt = dt[dt["type"].notnull()]  # we should know where it was trained
    dt = dt[dt["is_in_path"].isnull() | dt["is_in_path"]]
    for excl_predictor in exclude_predictors:
        dt = dt[dt["name"] != excl_predictor]
    print("removed {0} methods from Fred2. {1} remain".format(n_init - len(dt), len(dt)))
    return dt
def predictor_info(method):
    """
    Get all the information about a particular predictor/method from Fred2.
    """
    predictor = EpitopePredictorFactory(method)
    try:
        is_in_path = predictor.is_in_path()
    except:
        is_in_path = None
    try:
        command = predictor.command
    except:
        command = None
    method_hash = {
        "syfpeithi": "T-cell epitope",
        "bimas": "MHC-I binding",
        "svmhc": "MHC-I binding",
        "arb": "MHC-I binding",
        "smm": "MHC-I binding",
        "smmpmbec": "MHC-I binding",
        "epidemix": "MHC-I binding",
        "comblib": "MHC-I binding",
        "comblibsidney": "MHC-I binding",
        "pickpocket": "MHC-I binding",
        "netmhc": "MHC-I binding",
        "netmhcpan": "MHC-I binding",
        "hammer": "MHC-II binding",
        "tepitopepan": "MHC-II binding",
        "netmhcii": "MHC-II binding",
        "netmhciipan": "MHC-II binding",
        "unitope": "T-cell epitope",
        "netctlpan": "T-cell epitope",
    }
    retdict = {
        "is_in_path": is_in_path,
        "name": method,
        "supportedAlleles": predictor.supportedAlleles,
        "supportedLength": predictor.supportedLength,
        "command": command,
        "version": predictor.version,
        "type": method_hash.get(method),
    }
    return retdict
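# Usage sketch for predictor_info/valid_predictors above: keep only MHC-II
# binding predictors that support 15-mers. A sketch assuming the column names
# produced by predictor_info; the variable names are illustrative.
dt = valid_predictors(supported_length=15)
mhc2 = dt[dt["type"] == "MHC-II binding"]
print(mhc2[["name", "version", "is_in_path"]])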
def predict_peptide_effects(peptides, alleles=None):
    """
    Predict the peptide effect for all the available methods on the machine.

    Args:
        peptides (list of Peptide): Usually an output from read_fasta.
        alleles (list of str): Alleles for which to run the predictors.

    Returns:
        pd.DataFrame: Tidy pd.DataFrame. If the method is unable to predict
        for a particular value the rows are not present.

    Example:
        >>> peptides = [Peptide("SYFPEITHI"), Peptide("FIASNGVKL"), Peptide("LLGATCMFV")]
        >>> alleles = ['A*02:16', 'B*45:01']
        >>> predict_peptide_effects(peptides, alleles=alleles).head()
                                   Seq    Method   allele       score
        0  (F, I, A, S, N, G, V, K, L)       arb  A*02:16  594.691144
        1  (F, I, A, S, N, G, V, K, L)       smm  A*02:16  159.768074
        2  (F, I, A, S, N, G, V, K, L)  smmpmbec  A*02:16  211.977614
        4  (F, I, A, S, N, G, V, K, L)   unitope  A*02:16    0.527849
        5  (L, L, G, A, T, C, M, F, V)       arb  A*02:16    6.784222
    """
    dt = valid_predictors()
    results = []
    for i in range(len(dt)):
        # subset to valid alleles
        if alleles is not None:
            valid_alleles = dt.iloc[i]["supportedAlleles"].intersection(alleles)
            if len(valid_alleles) == 0:
                continue
            valid_alleles = [Allele(al) for al in valid_alleles]
        else:
            valid_alleles = None
        method = dt.iloc[i]["name"]
        print("method: ", method)
        t0 = time.time()
        try:
            results.append(EpitopePredictorFactory(method).predict(peptides, alleles=valid_alleles))
        except:
            print("Error! Unable to run ", method, ": ", sys.exc_info())
        t1 = time.time()
        print(" - runtime: ", str(t1 - t0))
    df = results[0].merge_results(results[1:]).reset_index()
    dfm = pd.melt(df, id_vars=["Seq", "Method"], var_name="allele", value_name="score")
    dfm = dfm[dfm["score"].notnull()]
    dfm.rename(columns={'Seq': 'peptide', 'Method': 'method'}, inplace=True)
    return dfm
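# Worked example for predict_peptide_effects above, taken from its docstring;
# it runs as-is only if the corresponding predictors are installed locally.
peptides = [Peptide("SYFPEITHI"), Peptide("FIASNGVKL"), Peptide("LLGATCMFV")]
effects = predict_peptide_effects(peptides, alleles=['A*02:16', 'B*45:01'])
print(effects.head())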
def test_epitope_conservation_constraint(self):
    import random
    self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
    conservation = {}
    for e in self.result.index.levels[0]:
        conservation[str(e)] = random.random()
    pt = OptiTope(self.result, self.thresh, k=3, solver="cbc", verbosity=0)
    pt.activate_epitope_conservation_const(0.5, conservation=conservation)
    for e in pt.solve():
        print e, conservation[e]
def test_single_allele_input(self):
    for m in EpitopePredictorFactory.available_methods():
        for v in EpitopePredictorFactory.available_methods()[m]:
            mo = EpitopePredictorFactory(m, version=v)
            if isinstance(mo, AExternalEpitopePrediction) and not (mo.version == "0.1" and mo.name == "netmhc"):
                if any(a.name in mo.supportedAlleles for a in self.mhcII):
                    mo.predict(self.peptides_mhcII, alleles=self.mhcII[0])
                else:
                    mo.predict(self.peptides_mhcI, alleles=self.mhcI[0])
def __main__():
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for available prediction tool versions.")
    parser.add_argument('-v', '--versions', help='File with used software versions.', required=True)
    args = parser.parse_args()

    # NOTE this needs to be updated manually, if other methods should be used in the future
    available_methods = ['syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2']

    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0].split()[0], str(row[1])) for row in csv.reader(versions_file, delimiter=":")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each method the corresponding tool version
        methods = {method.strip(): version.strip() for tool, version in tool_version
                   for method in available_methods if tool.lower() in method.lower()}

    for method, version in methods.items():
        if version not in EpitopePredictorFactory.available_methods()[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")
        predictor = EpitopePredictorFactory(method, version=version)
        with open(method + ".v" + str(version) + ".supported_alleles.txt", 'w') as output:
            for a in sorted(predictor.supportedAlleles):
                output.write(convert_allele_back(a) + "\n")
        with open(method + ".v" + str(version) + ".supported_lengths.txt", 'w') as output:
            for l in sorted(predictor.supportedLength):
                output.write(str(l) + "\n")
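# The versions file parsed above is expected to hold colon-separated
# "tool: version" pairs, one per line (csv.reader with delimiter=":").
# A hypothetical example; tool names and versions are illustrative:
#
#   mhcflurry: 1.4.3
#   mhcnuggets-class-1: 2.3.2
#   mhcnuggets-class-2: 2.3.2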
def test_pareto_front_assembly(self):
    cl_pred = CleavageSitePredictorFactory("PCM")
    ep_pred = EpitopePredictorFactory("SMM")
    allele = [Allele("HLA-A*02:01")]
    thresh = {a.name: 10000 for a in allele}
    comp = lambda a, b: a <= b
    assembler = ParetoEpitopeAssembly(self.peptides, cl_pred, ep_pred, allele, thresh, comp, solver="cbc", verbosity=0)
    r = assembler.paretosolve()
    print(r)
    # print assembler.solve(eps=2.0)
def test_standart_functions(self):
    """
    Tests default functions; needs GLPK installed.

    :return:
    """
    epi_pred = EpitopePredictorFactory("Syfpeithi")
    cl_pred = CleavageSitePredictorFactory("PCM")
    sbws = EpitopeAssemblyWithSpacer(self.epis, cl_pred, epi_pred, self.alleles)
    sol = sbws.solve()
    print sol
    assert all(i == str(j) for i, j in zip(["GHRMAWDMM", "HH", "VYEADDVIL"], sol))
def test_allele_cov_constraint(self):
    """
    Tests the allele coverage constraints.

    :return:
    """
    # self.alleles.extend([Allele("HLA-A*02:01"), Allele("HLA-B*15:01")])
    # self.thresh.update({"A*02:01": 0, "B*15:01": 0})
    self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
    opt = OptiTope(self.result, self.thresh, k=3, solver="cbc", verbosity=0)
    opt.activate_allele_coverage_const(0.99)
    r = opt.solve()
    self.assertTrue(len(set(str(p) for p in r) - set(["GPTPLLYRL", "QYLAGLSTL", "ALYDVVSTL"])) == 0)
def test_unsupported_allele_length_combination_exception(self):
    """
    Tests default functions; needs GLPK installed.

    :return:
    """
    epi_pred = EpitopePredictorFactory("Syfpeithi")
    cl_pred = CleavageSitePredictorFactory("PCM")
    alleles = [Allele("HLA-A*26:01", prob=0.5)]
    sbws = EpitopeAssemblyWithSpacer(self.epis, cl_pred, epi_pred, alleles, solver="cbc")
    self.assertRaises(ValueError, sbws.solve)
def test_unsupported_allele_length_combination(self):
    """
    Tests default functions; needs GLPK installed.

    :return:
    """
    epi_pred = EpitopePredictorFactory("Syfpeithi")
    cl_pred = CleavageSitePredictorFactory("PCM")
    alleles = [Allele("HLA-A*02:01", prob=0.5), Allele("HLA-A*26:01", prob=0.5)]
    sbws = EpitopeAssemblyWithSpacer(self.epis, cl_pred, epi_pred, alleles, solver="cbc")
    sol = sbws.solve()
    print sol
    assert all(i == str(j) for i, j in zip(["GHRMAWDMM", "HH", "VYEADDVIL"], sol))
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float, help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.")
    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+', level=logging.DEBUG)  # , format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    # ... look at Theo's filter, ligandoqc, fasta-distributions, lica and the morgenstellen server content scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?
#    MyObject = type('MyObject', (object,), {})
#    options = MyObject()
#    setattr(options, "var_file", "/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
#
#    vt = os.path.splitext(options.var_file)[-1]
#    if ".vcf" == vt:
#        vcfvars, accessions = FileReader.read_vcf(options.var_file)
#
#    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
#
#    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
#    transcripts = [x for x in transcript_gen if x.vars]
#    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
#    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
#    proteins = [x for x in protein_gen if x.vars]
#    for p in proteins:
#        p.gene_id = p.vars.values()[0][0].gene
#
#    for t in transcripts:
#        t.gene_id = t.vars.values()[0].gene

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values()[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [x for x in peptides if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.var_file, options.predict_with, "what:", str(e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    # PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            # PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
def main():
    model = argparse.ArgumentParser(description='Neoepitope prediction for TargetInspector.')
    model.add_argument('-m', '--method', type=str, choices=EpitopePredictorFactory.available_methods().keys(), default="bimas", help='The name of the prediction method')
    model.add_argument('-v', '--vcf', type=str, default=None, help='Path to the vcf input file')
    model.add_argument('-t', '--type', type=str, choices=["VEP", "ANNOVAR", "SNPEFF"], default="VEP", help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)')
    model.add_argument('-p', '--proteins', type=str, default=None, help='Path to the protein ID input file (in HGNC-ID)')
    model.add_argument('-l', '--length', choices=range(8, 18), type=int, default=9, help='The length of peptides')
    model.add_argument('-a', '--alleles', type=str, required=True, help='Path to the allele file (one per line in new nomenclature)')
    model.add_argument('-r', '--reference', type=str, default='GRCh38', help='The reference genome used for variant annotation and calling.')
    model.add_argument('-fINDEL', '--filterINDEL', action="store_true", help='Filter insertions and deletions (including frameshifts)')
    model.add_argument('-fFS', '--filterFSINDEL', action="store_true", help='Filter frameshift INDELs')
    model.add_argument('-fSNP', '--filterSNP', action="store_true", help='Filter SNPs')
    model.add_argument('-o', '--output', type=str, required=True, help='Path to the output file')
    model.add_argument('-etk', '--etk', action="store_true", help=argparse.SUPPRESS)
    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    transcript_to_genes = {}

    if args.vcf is None and args.proteins is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for l in f:
                    l = l.strip()
                    if l != "":
                        protein_ids.append(l)
        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        variants = filter(lambda x: x.type != VariationType.UNKNOWN, variants)

        if args.filterSNP:
            variants = filter(lambda x: x.type != VariationType.SNP, variants)
        if args.filterINDEL:
            variants = filter(lambda x: x.type not in [VariationType.INS, VariationType.DEL, VariationType.FSDEL, VariationType.FSINS], variants)
        if args.filterFSINDEL:
            variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants)

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        epitopes = filter(lambda x: any(x.get_variants_by_protein(tid) for tid in x.proteins.iterkeys()),
                          generate_peptides_from_variants(variants, int(args.length), martDB, EIdentifierTypes.ENSEMBL))

        for v in variants:
            for trans_id, coding in v.coding.iteritems():
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for l in f:
                ensembl_ids = martDB.get_ensembl_ids_from_id(l.strip(), type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_to_genes[ensembl_ids[EAdapterFields.TRANSID]] = l.strip()
                    proteins.append(Protein(protein_seq, gene_id=l.strip(), transcript_id=ensembl_ids[EAdapterFields.TRANSID]))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles)

    with open(args.output, "w") as f:
        alleles = result.columns
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) + "\tAntigen ID\t" + var_column + "\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = ",".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]] for prot in p.get_all_proteins()]))
            vars_str = ""
            if args.vcf is not None:
                vars_str = "\t" + "|".join(set(prot_id.split(":FRED2")[0] + ":" + ",".join(repr(v) for v in set(p.get_variants_by_protein(prot_id)))
                                               for prot_id in p.proteins.iterkeys() if p.get_variants_by_protein(prot_id)))
            f.write(str(p) + "\t" + method + "\t" + "\t".join("%.3f" % row[a] for a in alleles) + "\t" + proteins + vars_str + "\n")

    if args.etk:
        with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t" + "\t".join(a.name for a in alleles) + "\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]] for prot in p.get_all_proteins()]))
                g.write(str(p) + "\t" + "\t".join("%.3f" % row[a] for a in alleles) + "\t" + proteins + "\n")
    return 0
def make_predictions_from_peptides(peptides, methods, alleles, protein_db, identifier, metadata):
    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    # filter out self peptides if specified
    selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
    peptides_filtered = [p for p in peptides if str(p) not in selfies]

    # sort peptides by length (for predictions)
    sorted_peptides = {}
    for p in peptides_filtered:
        length = len(str(p))
        if length in sorted_peptides:
            sorted_peptides[length].append(p)
        else:
            sorted_peptides[length] = [p]

    for peplen in sorted_peptides:
        all_peptides_filtered = sorted_peptides[peplen]
        results = []
        for m in methods:
            try:
                results.extend([EpitopePredictorFactory(m.split('-')[0], version=m.split('-')[1]).predict(all_peptides_filtered, alleles=alleles)])
            except:
                logging.warning("Prediction for length {length} and allele {allele} not possible with {method}. No model available.".format(
                    length=peplen, allele=','.join([str(a) for a in alleles]), method=m))

        # merge dataframes of the performed predictions
        if len(results) == 0:
            continue
        df = results[0].merge_results(results[1:])
        df.insert(0, 'length', df.index.map(create_length_column_value))

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele, peplen)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        mandatory_columns = ['chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'synonymous',
                             'homozygous', 'variant details (genomic)', 'variant details (protein)']
        for header in mandatory_columns:
            if header not in metadata:
                df[header] = np.nan
            else:
                df[header] = df.apply(lambda row: row[0].get_metadata(header)[0], axis=1)

        for c in list(set(metadata) - set(mandatory_columns)):
            df[c] = df.apply(lambda row: row[0].get_metadata(c)[0], axis=1)

        for c in df.columns:
            if '*' in str(c):
                idx = df.columns.get_loc(c)
                df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1))
                df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method']), axis=1))
                df = df.rename(columns={c: '%s score' % c})

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    # write prediction statistics
    statistics = {'prediction_methods': methods,
                  'number_of_variants': '-',
                  'number_of_peptides': len(peptides),
                  'number_of_peptides_after_filtering': len(peptides_filtered)}

    return pred_dataframes, statistics
def __main__():
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for installed predictor tool versions.")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length", type=int)
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length", type=int)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True, type=str)
    parser.add_argument('-t', '--tools', help='Tools requested for peptide predictions', required=True, type=str)
    parser.add_argument('-v', '--versions', help='<Required> File with used software versions.', required=True)
    args = parser.parse_args()

    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0].split()[0], str(row[1])) for row in csv.reader(versions_file, delimiter=":")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))  # how to handle this?
        # get for each method the corresponding tool version
        methods = {method.strip(): version.strip() for tool, version in tool_version
                   for method in selected_methods if tool.lower() in method.lower()}

    # get the alleles
    alleles = [Allele(a) for a in args.alleles.split(";")]

    peptide_lengths = []
    if args.peptides:
        peptides = read_peptide_input(args.peptides)
        peptide_lengths = set([len(pep) for pep in peptides])
    else:
        peptide_lengths = range(args.min_length, args.max_length + 1)

    with open("model_report.txt", 'w') as output:
        # check if requested tool versions are supported
        for method, version in methods.items():
            if version not in EpitopePredictorFactory.available_methods()[method.lower()]:
                raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

        # check if requested alleles are supported
        support_all_alleles = True
        no_allele_support = True
        for a in alleles:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)
                if a not in sorted(predictor.supportedAlleles):
                    output.write("Allele " + convert_allele_back(a) + " is not supported by " + method + " " + version + ".\n")
                else:
                    supported = True
            if not supported:
                output.write("Allele " + convert_allele_back(a) + " is not supported by any of the requested tools.\n")
                logger.warning("Allele " + convert_allele_back(a) + " is not supported by any of the requested tools.")
                support_all_alleles = False
            else:
                no_allele_support = False
        if support_all_alleles:
            output.write("All selected alleles are supported by at least one of the requested tools.\n")
        if no_allele_support:
            output.write("None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n")
            raise ValueError("None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.")

        output.write("\n")

        # check if requested lengths are supported
        support_all_lengths = True
        no_length_support = True
        for l in peptide_lengths:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)
                if l not in sorted(predictor.supportedLength):
                    output.write("Peptide length " + str(l) + " is not supported by " + method + " " + version + ".\n")
                else:
                    supported = True
            if not supported:
                output.write("Peptide length " + str(l) + " is not supported by any of the requested tools.\n")
                logger.warning("Peptide length " + str(l) + " is not supported by any of the requested tools.")
                support_all_lengths = False
            else:
                no_length_support = False
        if support_all_lengths:
            output.write("All selected or provided peptide lengths are supported by at least one of the requested tools.\n")
        if no_length_support:
            output.write("None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n")
            raise ValueError("None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.")
def __main__():
    parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""", version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str)
    parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteome", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")
    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'}

    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith('.GSvar') or args.somatic_mutations.endswith('.tsv'):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts, metadata = read_vcf(args.somatic_mutations)
        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')
        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith(".fasta") or filename.endswith(".fsa"):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0], str(row[1][1:])) for row in csv.reader(versions_file, delimiter="\t")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each selected method the corresponding tool version
        methods = {method: version for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower()}

    for method, version in methods.items():
        if version not in EpitopePredictorFactory.available_methods()[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

    # MHC class I or II predictions
    if int(args.mhcclass) == 1:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
    else:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except:
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")

    # replace method names with method names with version
    # complete_df.replace({'method': methods}, inplace=True)
    complete_df['method'] = complete_df['method'].apply(lambda x: x + '-' + methods[x])

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    # Change the order (the index) of the columns
    else:
        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []

    for i, r in complete_df.iterrows():
        binder = False
        for c in binder_cols:
            if r[c] is True:
                binder = True
                continue
        if binder:
            binders.append(str(r['sequence']))
            pos_predictions.append(str(r['sequence']))
        else:
            neg_predictions.append(str(r['sequence']))
            if str(r['sequence']) not in binders:
                non_binders.append(str(r['sequence']))

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = protein_quant[protein_quant.keys()[0]]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'
        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                gene_id_in_df = complete_df.iloc[1]['gene']
                if 'ENSG' in gene_id_in_df:
                    gene_id_lengths[ids[0]] = float(ids[2].strip())
                else:
                    gene_id_lengths[ids[1]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1)
        if args.wild_type:
            complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
            for p in proteins:
                variants = []
                for v in p.vars:
                    variants = variants + p.vars[v]
                c = [x.coding.values() for x in variants]
                cf = list(itertools.chain.from_iterable(c))
                cds = ','.join([y.cdsMutationSyntax for y in set(cf)])
                aas = ','.join([y.aaMutationSyntax for y in set(cf)])
                protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv
    complete_df = complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), '\t', index=False)

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
def make_predictions_from_variants(variants_all, methods, alleles, minlength, maxlength, martsadapter, protein_db, identifier, metadata, transcriptProteinMap):
    # list for all peptides and filtered peptides
    all_peptides = []
    all_peptides_filtered = []

    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    prots = [p for p in generator.generate_proteins_from_transcripts(
        generator.generate_transcripts_from_variants(variants_all, martsadapter, ID_SYSTEM_USED))]

    for peplen in range(minlength, maxlength):
        peptide_gen = generator.generate_peptides_from_proteins(prots, peplen)
        peptides_var = [x for x in peptide_gen]

        # remove peptides which are not 'variant relevant'
        peptides = [x for x in peptides_var if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]

        # filter out self peptides
        selfies = [str(p) for p in peptides if protein_db.exists(str(p))]
        filtered_peptides = [p for p in peptides if str(p) not in selfies]

        all_peptides = all_peptides + peptides
        all_peptides_filtered = all_peptides_filtered + filtered_peptides

        results = []
        if len(filtered_peptides) > 0:
            for m in methods:
                try:
                    results.extend([EpitopePredictorFactory(m.split('-')[0], version=m.split('-')[1]).predict(filtered_peptides, alleles=alleles)])
                except:
                    logging.warning("Prediction for length {length} and allele {allele} not possible with {method}.".format(
                        length=peplen, allele=','.join([str(a) for a in alleles]), method=m))

        if len(results) == 0:
            continue

        df = results[0].merge_results(results[1:])

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele, peplen)

        df.insert(0, 'length', df.index.map(create_length_column_value))
        df['chr'] = df.index.map(create_variant_chr_column_value)
        df['pos'] = df.index.map(create_variant_pos_column_value)
        df['gene'] = df.index.map(create_gene_column_value)
        df['transcripts'] = df.index.map(create_transcript_column_value)
        df['proteins'] = df.index.map(create_protein_column_value)
        df['variant type'] = df.index.map(create_variant_type_column_value)
        df['synonymous'] = df.index.map(create_variant_syn_column_value)
        df['homozygous'] = df.index.map(create_variant_hom_column_value)
        df['variant details (genomic)'] = df.index.map(create_mutationsyntax_genome_column_value)
        df['variant details (protein)'] = df.index.map(create_mutationsyntax_column_value)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        for c in df.columns:
            if '*' in str(c):
                idx = df.columns.get_loc(c)
                df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1))
                df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method']), axis=1))
                df = df.rename(columns={c: '%s score' % c})
                df['%s score' % c] = df['%s score' % c].map(lambda x: round(x, 4))

        for c in metadata:
            df[c] = df.apply(lambda row: create_metadata_column_value(row, c), axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    statistics = {'prediction_methods': methods,
                  'number_of_variants': len(variants_all),
                  'number_of_peptides': len(all_peptides),
                  'number_of_peptides_after_filtering': len(all_peptides_filtered)}

    return pred_dataframes, statistics, all_peptides_filtered, prots
def main():
    # Specify CTD interface
    # Every CTD Model has to have at least a name and a version, plus any of the optional attributes below them.
    model = argparse.ArgumentParser(description='Process some integers.')
    model.add_argument('-m', '--method', type=str, choices=EpitopePredictorFactory.available_methods().keys(), default="bimas", help='The name of the prediction method')
    model.add_argument('-v', '--version', type=str, default="", help='The version of the prediction method')
    model.add_argument('-i', '--input', type=str, required=True, help='Path to the input file')
    model.add_argument('-t', '--type', choices=["fasta", "peptide"], type=str, default="fasta", help='The data type of the input (fasta, peptide list)')
    model.add_argument('-l', '--length', choices=range(8, 18), type=int, default=9, help='The length of peptides')
    model.add_argument('-a', '--alleles', type=str, required=True, help='Path to the allele file (one per line in new nomenclature)')
    model.add_argument('-op', '--options', type=str, default="", help="Additional options that get directly passed to the tool")
    model.add_argument('-o', '--output', type=str, required=True, help='Path to the output file')
    args = model.parse_args()

    # fasta protein
    if args.type == "fasta":
        with open(args.input, 'r') as f:
            first_line = f.readline()
        sep_pos = 1 if first_line.count("|") else 0
        proteins = read_fasta(args.input, in_type=Protein, id_position=sep_pos)
        peptides = generate_peptides_from_proteins(proteins, args.length)
    elif args.type == "peptide":
        peptides = read_lines(args.input, in_type=Peptide)
    else:
        sys.stderr.write('Input type not known\n')
        return -1

    # read in alleles
    alleles = read_lines(args.alleles, in_type=Allele)

    if args.version == "":
        result = EpitopePredictorFactory(args.method).predict(peptides, alleles, options=args.options)
    else:
        result = EpitopePredictorFactory(args.method, version=args.version).predict(peptides, alleles, options=args.options)

    # write to TSV: columns are sequence, method, allele scores..., protein-id/transcript-id
    with open(args.output, "w") as f:
        proteins = "\tAntigen ID" if args.type == "fasta" else ""
        alleles = result.columns
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) + proteins + "\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = "\t" + ",".join(prot.transcript_id for prot in p.get_all_proteins()) if args.type == "fasta" else ""
            f.write(str(p) + "\t" + method + "\t" + "\t".join("%.3f" % row[a] for a in alleles) + proteins + "\n")
    return 0
def test_wrong_internal_to_external_version(self):
    with self.assertRaises(RuntimeError):
        EpitopePredictorFactory("NetMHC", version="0.1").predict(self.peptides_mhcI, alleles=self.mhcI)
class OptiTopeTestCase(unittest.TestCase):
    """
    Unittest for OptiTope
    """

    def setUp(self):
        self.proteins = []
        self.alleles = [Allele("HLA-A*01:01"), Allele("HLA-B*07:02"), Allele("HLA-C*03:01")]
        self.peptides = [Peptide(p) for p in
                         """SFSIFLLAL GHRMAWDMM VYEADDVIL CFTPSPVVV FLLLADARV GPADGMVSK
                            YLYDHLAPM GLRDLAVAV GPTPLLYRL TWVLVGGVL IELGGKPAL LAGGVLAAV
                            QYLAGLSTL NFVSGIQYL VLSDFKTWL ARPDYNPPL KLLPRLPGV RHTPVNSWL
                            GLYLFNWAV ALYDVVSTL RRCRASGVL WPLLLLLLA VTYSLTGLW YFVIFFVAA""".split()]
        self.result = EpitopePredictorFactory("NetMHC").predict(self.peptides, self.alleles)
        self.thresh = {"A*01:01": 0, "B*07:02": 0, "C*03:01": 0}

    def test_selection_without_constraints(self):
        """
        Tests if minimal selection without additional constraints (except the knapsack capacity) works.

        # peptides obtained by performing optimization with same input and parameters by
        # etk.informatik.uni-tuebingen.de/optitope

        :return:
        """
        opt = OptiTope(self.result, self.thresh, k=3, solver="cplex", verbosity=0)
        r = opt.solve()
        self.assertTrue(len(set(str(p) for p in r) - set(["GPTPLLYRL", "QYLAGLSTL", "ALYDVVSTL"])) == 0)

    def test_allele_cov_constraint(self):
        """
        Tests the allele coverage constraints.

        :return:
        """
        # self.alleles.extend([Allele("HLA-A*02:01"), Allele("HLA-B*15:01")])
        # self.thresh.update({"A*02:01": 0, "B*15:01": 0})
        self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
        print self.result[self.alleles[0:2]]
        opt = OptiTope(self.result, self.thresh, k=3, solver="cplex", verbosity=1)
        opt.activate_allele_coverage_const(0.8)
        r = opt.solve()

        res_df = self.result.xs(self.result.index.values[0][1], level="Method")
        peps = [str(p) for p in r]
        probs = {"A*01:01": 1, "A*02:01": 1, "B*07:02": 1, "B*15:01": 1, "C*03:01": 1.0}
        res_df = res_df.loc[peps, :]
        res_df = res_df[[a for a in self.alleles]]
        res_df = res_df[res_df.apply(lambda x: any(x[a] > self.thresh[a.name] for a in self.alleles), axis=1)]
        print res_df.apply(lambda x: sum(x[c] * probs[c.name] for c in res_df.columns), axis=1)
        self.assertTrue(len(set(str(p) for p in r) - set(["ALYDVVSTL", "KLLPRLPGV", "GPTPLLYRL"])) == 0)

    def test_epitope_conservation_constraint(self):
        import random
        self.result = EpitopePredictorFactory("BIMAS").predict(self.peptides, self.alleles)
        conservation = {}
        print self.result.index.levels[0]
        for e in self.result.index.levels[0]:
            conservation[str(e)] = random.random()
        pt = OptiTope(self.result, self.thresh, k=3, solver="cplex", verbosity=1)
        pt.activate_epitope_conservation_const(0.5, conservation=conservation)
        for e in pt.solve():
            print e, conservation[e]
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-in', dest="inf", help='<Required> full path to the input file', required=True)
    parser.add_argument('-out', dest="out", help="<Required> full path to the output file", required=True)
    parser.add_argument('-allele', dest="allele", help="<Required> full path to an allele file, if 'in', allele file will be deduced from in file name", required=True)
    parser.add_argument('-dirallele', dest="dirallele", help="for use with '-allele in', describes full base path to the allele files")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if not (options.inf or options.out or options.allele):
        parser.print_help()
        sys.exit(1)

    target_alleles_set = set()
    # Fred2.FileReader.read_lines is broken
    # alleles = FileReader.read_lines(options.allele, type=Allele)
    if options.allele == "in" and options.dirallele:
        if "_W_" not in options.inf:
            print "No class 1 type run detected."
            sys.exit(0)
        af = None
        for sp in options.inf.split("_"):
            if sp.startswith("BD"):
                af = join(options.dirallele, sp.split("-")[1] + ".allele")
        with open(af, 'r') as handle:
            for line in handle:
                target_alleles_set.add(Allele(line.strip().upper()))
    else:
        with open(options.allele, 'r') as handle:
            for line in handle:
                target_alleles_set.add(Allele(line.strip().upper()))

    if not target_alleles_set:
        parser.print_help()
        sys.exit(1)

    ttn = EpitopePredictorFactory('netmhc')

    pros = list()
    peps = list()
    f = oms.IdXMLFile()
    f.load(options.inf, pros, peps)

    pepstr = set()
    for pep in peps:
        for h in pep.getHits():
            # if "decoy" not in h.getMetaValue("target_decoy"):
            unmod = h.getSequence().toUnmodifiedString()
            if 7 < len(unmod) < 12 \
                    and 'U' not in unmod and 'B' not in unmod and 'X' not in unmod and 'Z' not in unmod:
                pepstr.add(h.getSequence().toUnmodifiedString())

    es = [Peptide(x) for x in pepstr]

    try:
        preds_n = ttn.predict(es, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the netMHC prediction", options.inf, "what:", str(e)
        sys.exit(1)

    # only max
    preds = dict()
    for index, row in preds_n.iterrows():
        score = row.max()  # bigger_is_better
        allele = str(row.idxmax())
        categ = categorize(score)
        seq = row.name[0].tostring()
        if categ:
            preds[seq] = (allele, categ, score)

    npeps = list()
    for pep in peps:
        hits = pep.getHits()
        nhits = list()
        for h in hits:
            if h.getSequence().toUnmodifiedString() in preds:
                x = preds[h.getSequence().toUnmodifiedString()]
                h.setMetaValue('binder', x[0])
                h.setMetaValue(str(x[1]), x[2])
                nhits.append(h)
            else:
                nhits.append(h)
        pep.setHits(nhits)
    f.store(options.out, pros, peps)
def test_epitope_prediction_unsupported_version(self):
    print EpitopePredictorFactory("BIMAS", version="4.0").predict(self.peptides_mhcI, self.mhcI)
def test_epitope_prediction_available_methods(self):
    print EpitopePredictorFactory.available_methods()
def toplevel_predictor(x):
    predictor = EpitopePredictorFactory("netMHC", version="3.4")
    peps = [Peptide(i) for i in x]
    return predictor.predict(peps)
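The function above lives at module top level, which suggests it is meant to be picklable for use with multiprocessing workers. A minimal sketch of that usage, assuming NetMHC 3.4 is installed and on PATH and that this code runs in the same module as toplevel_predictor; the sequences are illustrative.

import multiprocessing as mp

if __name__ == "__main__":
    # Two illustrative chunks of peptide sequences; each worker builds
    # its own predictor instance inside toplevel_predictor.
    chunks = [["SYFPEITHI", "IHTIEPFYS"], ["ALYDVVSTL", "GPTPLLYRL"]]
    pool = mp.Pool(processes=2)
    results = pool.map(toplevel_predictor, chunks)
    pool.close()
    pool.join()
    for r in results:
        print r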
def main():
    parser = argparse.ArgumentParser(
        description="The software is a novel approach to construct epitope-based string-of-beads "
                    "vaccines in optimal order and with sequence-optimized spacers of flexible length "
                    "such that the recovery of contained epitopes is maximized and immunogenicity of "
                    "arising neo-epitopes is reduced.")
    parser.add_argument('-i', "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)",
                        type=str)
    parser.add_argument('-a', "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)",
                        type=str)

    #parameters of the model
    parser.add_argument('-l', "--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument('-al', "--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument('-be', "--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")
    parser.add_argument('-cp', "--cleavage_prediction",
                        default="pcm",
                        choices=["pcm", "proteasmm_c", "proteasmm_i"],
                        help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_I]",
                        type=str)
    parser.add_argument('-ep', "--epitope_prediction",
                        default="syfpeithi",
                        choices=["syfpeithi", "smm", "smmpmbec", "bimas"],
                        help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]",
                        type=str)
    parser.add_argument('-t', "--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")
    parser.add_argument('-o', "--output",
                        required=True,
                        type=str,
                        help="Specifies the output file.")
    parser.add_argument('-p', "--threads",
                        type=int,
                        default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")
    parser.add_argument('-apx', "--approximate",
                        action="store_true",
                        help="If set, the assembly problem is solved approximately instead of to optimality.")
    args = parser.parse_args()

    #parse input
    peptides = read_lines(args.input)
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_I"]:
        sys.stderr.write("Specified cleavage predictor is currently not supported. "
                         "Please choose either PCM, PROTEASMM_C, or PROTEASMM_I")
        sys.exit(-1)

    if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]:
        sys.stderr.write("Specified epitope predictor is currently not supported. "
                         "Please choose either Syfpeithi, BIMAS, SMM, or SMMPMBEC")
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides, cl_pred, epi_pred, alleles,
                                       k=args.max_length, en=9, threshold=thr,
                                       solver="cbc", alpha=args.alpha, beta=args.beta,
                                       verbosity=1)

    #solve
    #pre-processing has to be disabled, otherwise many solvers will destroy the symmetry of the problem
    #how to do this is dependent on the solver used; for CPLEX it is preprocessing_presolve=n
    #TODO: CBC should be shipped with the node
    #TODO: has to be tested with CBC
    #TODO: LKH has to be shipped as well -> only academic license!
#"preprocess":"off", "threads":1} threads = mp.cpu_count() if args.threads is None else args.threads if args.approximate: svbws = solver.approximate(threads=threads, options={ "preprocess": "off", "threads": 1 }) if not svbws: svbws = solver.solve(threads=threads, options={ "preprocess": "off", "threads": 1 }) else: svbws = solver.solve(threads=threads, options={ "preprocess": "off", "threads": 1 }) with open(args.output, "w") as f: f.write(">assembled_spacer_design\n") f.write("".join(map(str, svbws))) return 0
def main():
    model = argparse.ArgumentParser(description='Neoepitope prediction for TargetInspector.')

    model.add_argument('-m', '--method',
                       type=str,
                       choices=EpitopePredictorFactory.available_methods().keys(),
                       default="bimas",
                       help='The name of the prediction method')
    model.add_argument('-v', '--vcf',
                       type=str,
                       default=None,
                       help='Path to the vcf input file')
    model.add_argument('-t', '--type',
                       type=str,
                       choices=["VEP", "ANNOVAR", "SNPEFF"],
                       default="VEP",
                       help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)')
    model.add_argument('-p', '--proteins',
                       type=str,
                       default=None,
                       help='Path to the protein ID input file (in HGNC-ID)')
    model.add_argument('-l', '--length',
                       choices=range(8, 18),
                       type=int,
                       default=9,
                       help='The length of peptides')
    model.add_argument('-a', '--alleles',
                       type=str,
                       required=True,
                       help='Path to the allele file (one per line in new nomenclature)')
    model.add_argument('-r', '--reference',
                       type=str,
                       default='GRCh38',
                       help='The reference genome used for variant annotation and calling.')
    model.add_argument('-fINDEL', '--filterINDEL',
                       action="store_true",
                       help='Filter insertions and deletions (including frameshifts)')
    model.add_argument('-fFS', '--filterFSINDEL',
                       action="store_true",
                       help='Filter frameshift INDELs')
    model.add_argument('-fSNP', '--filterSNP',
                       action="store_true",
                       help='Filter SNPs')
    model.add_argument('-o', '--output',
                       type=str,
                       required=True,
                       help='Path to the output file')
    model.add_argument('-etk', '--etk',
                       action="store_true",
                       help=argparse.SUPPRESS)

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    transcript_to_genes = {}

    if args.vcf is None and args.proteins is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if a vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for l in f:
                    l = l.strip()
                    if l != "":
                        protein_ids.append(l)
        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        variants = filter(lambda x: x.type != VariationType.UNKNOWN, variants)

        if args.filterSNP:
            variants = filter(lambda x: x.type != VariationType.SNP, variants)
        if args.filterINDEL:
            variants = filter(lambda x: x.type not in [VariationType.INS,
                                                       VariationType.DEL,
                                                       VariationType.FSDEL,
                                                       VariationType.FSINS], variants)
        if args.filterFSINDEL:
            variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants)

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        epitopes = filter(lambda x: any(x.get_variants_by_protein(tid) for tid in x.proteins.iterkeys()),
                          generate_peptides_from_variants(variants, int(args.length),
                                                          martDB, EIdentifierTypes.ENSEMBL))

        for v in variants:
            for trans_id, coding in v.coding.iteritems():
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from the given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for l in f:
                ensembl_ids = martDB.get_ensembl_ids_from_id(l.strip(), type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_to_genes[ensembl_ids[EAdapterFields.TRANSID]] = l.strip()
                    proteins.append(Protein(protein_seq, gene_id=l.strip(),
                                            transcript_id=ensembl_ids[EAdapterFields.TRANSID]))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    #read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles)

    with open(args.output, "w") as f:
        alleles = result.columns
        var_column = "\tVariants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) + "\tAntigen ID" + var_column + "\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = ",".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                     for prot in p.get_all_proteins()]))
            vars_str = ""
            if args.vcf is not None:
                vars_str = "\t" + "|".join(
                    set(prot_id.split(":FRED2")[0] + ":"
                        + ",".join(repr(v) for v in set(p.get_variants_by_protein(prot_id)))
                        for prot_id in p.proteins.iterkeys()
                        if p.get_variants_by_protein(prot_id)))
            f.write(str(p) + "\t" + method + "\t" + "\t".join("%.3f" % row[a] for a in alleles)
                    + "\t" + proteins + vars_str + "\n")

    if args.etk:
        with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t" + "\t".join(a.name for a in alleles) + "\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                         for prot in p.get_all_proteins()]))
                g.write(str(p) + "\t" + "\t".join("%.3f" % row[a] for a in alleles) + "\t" + proteins + "\n")
    return 0
def test_wrong_allele_input(self):
    with self.assertRaises(ValueError):
        EpitopePredictorFactory("NetMHC").predict(self.mhcI, alleles=self.transcript)
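For contrast with the failure case above, a minimal sketch of the call shape the predictor expects; the peptide and allele values are illustrative, and NetMHC is assumed to be installed.

peptides = [Peptide("SYFPEITHI")]
alleles = [Allele("HLA-A*02:01")]
# Peptide objects go in as the first argument, Allele objects via `alleles=`;
# passing anything else (e.g. a transcript, as tested above) raises ValueError.
df = EpitopePredictorFactory("NetMHC").predict(peptides, alleles=alleles)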
def main():
    parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of
arising neo-epitopes is reduced.""")
    parser.add_argument("-i", "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)")
    parser.add_argument("-a", "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)")

    #parameters of the model
    parser.add_argument("-k", "--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument("-al", "--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument("-be", "--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")
    parser.add_argument("-cp", "--cleavage_prediction",
                        default="PCM",
                        help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]")
    parser.add_argument("-ep", "--epitope_prediction",
                        default="Syfpeithi",
                        help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]")
    parser.add_argument("-thr", "--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")
    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument("-t", "--threads",
                        type=int,
                        default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")
    args = parser.parse_args()

    #parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_S"]:
        print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S"
        sys.exit(-1)

    if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]:
        print "Specified epitope predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, or SMMPMBEC"
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides, cl_pred, epi_pred, alleles,
                                       k=args.max_length, en=9, threshold=thr,
                                       solver="cplex", alpha=args.alpha, beta=args.beta,
                                       verbosity=0)

    #solve
    #pre-processing has to be disabled, otherwise many solvers will destroy the symmetry of the problem
    #how to do this is dependent on the solver used; for CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads
    svbws = solver.approximate(threads=threads, options={"preprocessing_presolve": "n", "threads": 1})

    print
    print "Resulting String-of-Beads: ", "-".join(map(str, svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str, svbws)))
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file",
                        help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path",
                        help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int,
                        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with",
                        help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float,
                        help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true',
                        help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path",
                        help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need a prediction method (--predict) with the filter option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0]
                                 + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+', level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path
                 + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server content scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?
    # MyObject = type('MyObject', (object,), {})
    # options = MyObject()
    # setattr(options, "var_file", "/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
    #
    # vt = os.path.splitext(options.var_file)[-1]
    # if ".vcf" == vt:
    #     vcfvars, accessions = FileReader.read_vcf(options.var_file)
    #
    # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
    #
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # transcripts = [x for x in transcript_gen if x.vars]
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    # proteins = [x for x in protein_gen if x.vars]
    # for p in proteins:
    #     p.gene_id = p.vars.values()[0][0].gene
    #
    # for t in transcripts:
    #     t.gene_id = t.vars.values()[0].gene

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried
    for p in proteins:
        p.gene_id = p.vars.values()[0][0].gene  # assume gene name from first variant
    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [x for x in peptides
                    if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.var_file, options.predict_with, "what:", str(e)
        sys.exit(1)

    # punch prediction results into the peptide metadata (inside the pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinders
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta, binders only, plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
def main():
    parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of
arising neo-epitopes is reduced.""")
    parser.add_argument("-i", "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)")
    parser.add_argument("-a", "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)")

    #parameters of the model
    parser.add_argument("-k", "--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument("-al", "--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument("-be", "--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")
    parser.add_argument("-cp", "--cleavage_prediction",
                        default="PCM",
                        help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]")
    parser.add_argument("-ep", "--epitope_prediction",
                        default="Syfpeithi",
                        help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]")
    parser.add_argument("-thr", "--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")
    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument("-t", "--threads",
                        type=int,
                        default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")
    parser.add_argument("--ips-solver",
                        default="cplex",
                        choices=["cplex", "cbc"],
                        help="Executable name of the IPS solver. Executable needs to be available in PATH.")
    parser.add_argument("--tsp-solution",
                        default="approximate",
                        choices=["approximate", "optimal"],
                        help="Type of solution of the TSP")
    parser.add_argument("--random-order",
                        action="store_true",
                        help="Indicate whether to generate a randomly ordered string-of-beads polypeptide")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        help="Seed for random ordering of string-of-beads polypeptide")
    args = parser.parse_args()

    #parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_S"]:
        print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S"
        sys.exit(-1)

    if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]:
        print "Specified epitope predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, or SMMPMBEC"
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides, cl_pred, epi_pred, alleles,
                                       k=args.max_length, en=9, threshold=thr,
                                       solver=args.ips_solver, alpha=args.alpha, beta=args.beta,
                                       verbosity=0)

    #solve
    #pre-processing has to be disabled, otherwise many solvers will destroy the symmetry of the problem
    #how to do this is dependent on the solver used; for CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads

    if args.tsp_solution == "approximate":
        svbws = solver.approximate(threads=threads,
                                   options={"preprocessing_presolve": "n", "threads": 1})
    else:
        svbws = solver.solve(threads=threads,
                             options={"preprocessing_presolve": "n", "threads": 1})

    # Generate a randomly ordered string-of-beads, but still use the optimal
    # spacers determined by the solve/approximate call above.
    if args.random_order:
        print "Generating a randomly ordered polypeptide"
        random.seed(args.seed)
        random_order_sob = []
        random.shuffle(peptides)
        for i in range(len(peptides)):
            # Break from the loop once we hit the last peptide
            if i == len(peptides) - 1:
                random_order_sob.extend([Peptide(str(peptides[i]))])
                break
            left_peptide = str(peptides[i])
            right_peptide = str(peptides[i + 1])
            opt_spacer = solver.spacer[(left_peptide, right_peptide)]
            # The right peptide gets added in the next iteration
            random_order_sob.extend([Peptide(left_peptide), Peptide(opt_spacer)])
        svbws = random_order_sob

    print
    print "Resulting String-of-Beads: ", "-".join(map(str, svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str, svbws)))
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)

parser = argparse.ArgumentParser(description='Call epitope predictors on data.')
requiredNamed = parser.add_argument_group('required arguments')
requiredNamed.add_argument('--predictor', type=str,
                           help='Epitope predictors [see all with --predictor=list]', required=True)
requiredNamed.add_argument('--dataset', type=str,
                           help='Immunogenic dataset [see all with --dataset=list]', required=True)
parser.add_argument('-n', type=int, help='Number of rows to take from dataset')
parser.add_argument('--allele', type=str, help='HLA Type',
                    default=["HLA-A*01:01", "HLA-A*02:01", "HLA-B*15:01"])
args = parser.parse_args()

all_predictors = [name for name, version in EpitopePredictorFactory.available_methods().iteritems()]
all_predictors.remove("netmhcstabpan")
all_predictors.remove("netmhc")

if args.predictor == 'list':
    print("Set one of the predictors with --predictor:")
    print(all_predictors)
    print("""
    Details from https://bioinformatics.oxfordjournals.org/content/suppl/2016/02/26/btw113.DC1/S1.pdf
    SYFPEITHI   T-cell epitope   (Rammensee, et al., 1999)
    BIMAS       MHC-I binding    (Parker, et al., 1994)
    SVMHC       MHC-I binding    (Dönnes and Elofsson, 2002)
    ARB         MHC-I binding    (Bui, et al., 2005)
    SMM         MHC-I binding    (Peters and Sette, 2005)
    SMMPMBEC    MHC-I binding    (Kim, et al., 2009)