def test_path_option_and_optional_parameters(self):
    """Run NetMHC with every matching executable found on PATH.

    The executable name is the first whitespace-separated token of the
    predictor's ``command`` attribute. Every PATH entry is probed for an
    executable file of that name; for each hit a prediction is run with the
    located file passed as ``path`` and the extra option string ``--sort``.
    Nothing is asserted, so the test only fails if ``predict`` raises.

    NOTE(review): confirm that ``path`` is the keyword ``predict`` expects
    for the executable location.
    """
    predictor = EpitopePredictorFactory("NetMHC")
    binary_name = predictor.command.split()[0]
    search_dirs = os.environ["PATH"].split(os.pathsep)
    for directory in search_dirs:
        # PATH entries may be wrapped in double quotes
        candidate = os.path.join(directory.strip('"'), binary_name).strip()
        if not os.path.isfile(candidate):
            continue
        if not os.access(candidate, os.X_OK):
            continue
        predictor.predict(self.peptides_mhcI, alleles=self.mhcI,
                          path=candidate, options="--sort")
def test_single_allele_input(self):
    """Call every external predictor with one allele instead of a list.

    All method/version pairs reported by the factory are instantiated.
    Only instances of ``AExternalEpitopePrediction`` are exercised, and
    netmhc version 0.1 is left out. A predictor that supports at least one
    allele of ``self.mhcII`` gets the MHC-II peptides, all others get the
    MHC-I peptides. Nothing is asserted; the test fails only on an
    exception.
    """
    for method in EpitopePredictorFactory.available_methods():
        for version in EpitopePredictorFactory.available_methods()[method]:
            predictor = EpitopePredictorFactory(method, version=version)
            if not isinstance(predictor, AExternalEpitopePrediction):
                continue
            if predictor.version == "0.1" and predictor.name == "netmhc":
                continue
            handles_class_two = any(
                a.name in predictor.supportedAlleles for a in self.mhcII)
            if handles_class_two:
                predictor.predict(self.peptides_mhcII,
                                  alleles=self.mhcII[0])
            else:
                predictor.predict(self.peptides_mhcI,
                                  alleles=self.mhcI[0])
def test_path_and_optional_parameters_netctl(self): netctlpan = EpitopePredictorFactory("NetCTLpan") exe = netctlpan.command.split()[0] for try_path in os.environ["PATH"].split(os.pathsep): try_path = try_path.strip('"') exe_try = os.path.join(try_path, exe).strip() if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK): print netctlpan.predict(self.peptides_mhcI, alleles=self.mhcI, commad=exe_try, options="-wt 0.05 -wc 0.225 -ethr 0.5")
def test_path_and_optional_parameters_netctl(self): netctlpan = EpitopePredictorFactory("NetCTLpan") exe = netctlpan.command.split()[0] for try_path in os.environ["PATH"].split(os.pathsep): try_path = try_path.strip('"') exe_try = os.path.join(try_path, exe).strip() if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK): print netctlpan.predict(self.peptides_mhcI, alleles=self.mhcI, commad=exe_try, options="-wt 0.05 -wc 0.225 -ethr 0.5")
def test_pareto_assembly(self): cl_pred = CleavageSitePredictorFactory("PCM") ep_pred = EpitopePredictorFactory("SMM") allele = [Allele("HLA-A*02:01")] thresh = {a.name:10000 for a in allele} comp = lambda a,b: a <= b print ep_pred.predict(self.peptides,alleles=allele) #cl_pred, ep_pred, alleles, threshold, comparator, length=9 assembler = ParetoEpitopeAssembly(self.peptides,cl_pred, ep_pred, allele, thresh, comp, solver="cbc", verbosity=1) r = assembler.solve(eps=1e10, order=(1,0)) print r
def test_single_peptide_input_mhcII(self):
    """Call every non-external predictor with one peptide and one allele.

    Each available method is instantiated with its default version.
    External predictors are skipped, as are predictors that do not support
    every allele in ``self.mhcII``. The remaining ones predict for the
    first MHC-II peptide against the second MHC-II allele. Nothing is
    asserted; the test fails only on an exception.
    """
    for method in EpitopePredictorFactory.available_methods():
        predictor = EpitopePredictorFactory(method)
        if isinstance(predictor, AExternalEpitopePrediction):
            continue
        if not all(a.name in predictor.supportedAlleles
                   for a in self.mhcII):
            continue
        predictor.predict(self.peptides_mhcII[0], alleles=self.mhcII[1])
def est_multiple_peptide_input_mhcI(self):
    """Predict all MHC-I peptides with every predictor supporting all MHC-I alleles.

    Each available method is instantiated with its default version; those
    whose ``supportedAlleles`` contain every allele in ``self.mhcI`` are
    run on the full peptide list. Nothing is asserted.

    NOTE(review): the name lacks the ``test`` prefix, so the default
    unittest loader will not collect it - presumably disabled on purpose;
    confirm before renaming.
    """
    for method in EpitopePredictorFactory.available_methods():
        predictor = EpitopePredictorFactory(method)
        supports_all = all(a.name in predictor.supportedAlleles
                           for a in self.mhcI)
        if not supports_all:
            continue
        predictor.predict(self.peptides_mhcI, alleles=self.mhcI)
def test_path_option_and_optional_parameters_netmhc(self):
    """Run NetMHC via an explicit executable path and check two scores.

    The executable name is the first token of the predictor's ``command``
    attribute. For each PATH entry holding an executable file of that name
    a prediction is run with ``command`` set to that file, the option
    string ``--sort`` and ``chunksize=1``. The result must have as many
    entries as there are input peptides, and two reference scores for
    A*02:01 must match to 7 decimal places.

    If no matching executable is found on PATH, the loop body never runs
    and the test passes without checking anything.
    """
    netmhc = EpitopePredictorFactory("NetMHC")
    exe = netmhc.command.split()[0]
    for try_path in os.environ["PATH"].split(os.pathsep):
        # PATH entries may be wrapped in double quotes
        try_path = try_path.strip('"')
        exe_try = os.path.join(try_path, exe).strip()
        if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
            r = netmhc.predict(self.peptides_mhcI, alleles=self.mhcI,
                               command=exe_try, options="--sort",
                               chunksize=1)
            # assertEqual reports both lengths on failure, unlike
            # assertTrue(a == b), which only says "False is not true"
            self.assertEqual(len(r), len(self.peptides_mhcI))
            self.assertAlmostEqual(r["A*02:01"]["SYFPEITHI"]["netmhc"],
                                   0.150579105869, places=7)
            self.assertAlmostEqual(r["A*02:01"]["IHTIEPFYS"]["netmhc"],
                                   0.0619540879359, places=7)
def test_path_option_and_optional_parameters_netmhc(self):
    """Run NetMHC via an explicit executable path and check two scores.

    The executable name is the first token of the predictor's ``command``
    attribute. For each PATH entry holding an executable file of that name
    a prediction is run with ``command`` set to that file, the option
    string ``--sort`` and ``chunksize=1``. The result must have as many
    entries as there are input peptides, and two reference scores for
    A*02:01 must match to 7 decimal places.

    If no matching executable is found on PATH, the loop body never runs
    and the test passes without checking anything.
    """
    netmhc = EpitopePredictorFactory("NetMHC")
    exe = netmhc.command.split()[0]
    for try_path in os.environ["PATH"].split(os.pathsep):
        # PATH entries may be wrapped in double quotes
        try_path = try_path.strip('"')
        exe_try = os.path.join(try_path, exe).strip()
        if os.path.isfile(exe_try) and os.access(exe_try, os.X_OK):
            r = netmhc.predict(self.peptides_mhcI, alleles=self.mhcI,
                               command=exe_try, options="--sort",
                               chunksize=1)
            # assertEqual reports both lengths on failure, unlike
            # assertTrue(a == b), which only says "False is not true"
            self.assertEqual(len(r), len(self.peptides_mhcI))
            self.assertAlmostEqual(r["A*02:01"]["SYFPEITHI"]["netmhc"],
                                   0.150579105869, places=7)
            self.assertAlmostEqual(r["A*02:01"]["IHTIEPFYS"]["netmhc"],
                                   0.0619540879359, places=7)
def test_pareto_assembly(self): cl_pred = CleavageSitePredictorFactory("PCM") ep_pred = EpitopePredictorFactory("SMM") allele = [Allele("HLA-A*02:01")] thresh = {a.name: 10000 for a in allele} comp = lambda a, b: a <= b print ep_pred.predict(self.peptides, alleles=allele) #cl_pred, ep_pred, alleles, threshold, comparator, length=9 assembler = ParetoEpitopeAssembly(self.peptides, cl_pred, ep_pred, allele, thresh, comp, solver="glpk", verbosity=1) r = assembler.solve(eps=1e10, order=(1, 0)) print r
def run_predictor(pred, dataset):
    """Run one prediction method on a dataset and report how much came back.

    :param pred: method name handed to ``EpitopePredictorFactory``
    :param dataset: sized collection of inputs handed to ``predict``
    :return: tuple ``(number of results, number of inputs)``; the first
             entry is 0 when a ``ValueError`` occurred before ``predict``
             returned

    The alleles come from the module-level ``args.allele``. The results
    and their ``describe()`` summary are printed.
    """
    model = EpitopePredictorFactory(pred)
    predictions = ()
    try:
        wanted = [Allele(name) for name in args.allele]
        predictions = model.predict(dataset, alleles=wanted)
        print(predictions)
        print(predictions.describe())
    except ValueError:
        # best effort: a ValueError anywhere above is tolerated and
        # whatever was obtained so far is counted
        pass
    n_predicted = len(predictions)
    n_input = len(dataset)
    return (n_predicted, n_input)
def run_predictor(pred, dataset):
    """Run one prediction method on a dataset and report how much came back.

    :param pred: method name handed to ``EpitopePredictorFactory``
    :param dataset: sized collection of inputs handed to ``predict``
    :return: tuple ``(number of results, number of inputs)``; the first
             entry is 0 when a ``ValueError`` occurred before ``predict``
             returned

    The alleles come from the module-level ``args.allele``. The results
    and their ``describe()`` summary are printed.
    """
    model = EpitopePredictorFactory(pred)
    predictions = ()
    try:
        wanted = [Allele(name) for name in args.allele]
        predictions = model.predict(dataset, alleles=wanted)
        print(predictions)
        print(predictions.describe())
    except ValueError:
        # best effort: a ValueError anywhere above is tolerated and
        # whatever was obtained so far is counted
        pass
    n_predicted = len(predictions)
    n_input = len(dataset)
    return (n_predicted, n_input)
def test_single_allele_input(self): for m in EpitopePredictorFactory.available_methods(): for v in EpitopePredictorFactory.available_methods()[m]: mo = EpitopePredictorFactory(m, version=v) if isinstance(mo, AExternalEpitopePrediction) and not ( mo.version == "0.1" and mo.name == "netmhc"): print "Testing", mo.name, "version", mo.version try: if any(a.name in mo.supportedAlleles for a in self.mhcII): mo.predict(self.peptides_mhcII, alleles=self.mhcII[0]) if any(a.name in mo.supportedAlleles for a in self.mhcII_combined_alleles): mo.predict(self.peptides_mhcII, alleles=self.mhcII_combined_alleles[0]) if any(a.name in mo.supportedAlleles for a in self.mhcI): mo.predict(self.peptides_mhcI, alleles=self.mhcI[0]) print "Success" except RuntimeError as e: #catch only those stemming from binary unavailability if "could not be found in PATH" not in e.message: raise e #all others do not except else: print mo.name, "not available"
def test_single_allele_input(self): for m in EpitopePredictorFactory.available_methods(): for v in EpitopePredictorFactory.available_methods()[m]: mo = EpitopePredictorFactory(m, version=v) if isinstance(mo, AExternalEpitopePrediction) and not (mo.version=="0.1" and mo.name=="netmhc"): print "Testing", mo.name, "version", mo.version try: if any(a.name in mo.supportedAlleles for a in self.mhcII): mo.predict(self.peptides_mhcII, alleles=self.mhcII[0]) if any(a.name in mo.supportedAlleles for a in self.mhcII_combined_alleles): mo.predict(self.peptides_mhcII, alleles=self.mhcII_combined_alleles[0]) if any(a.name in mo.supportedAlleles for a in self.mhcI): mo.predict(self.peptides_mhcI, alleles=self.mhcI[0]) print "Success" except RuntimeError as e: #catch only those stemming from binary unavailability if "could not be found in PATH" not in e.message: raise e #all others do not except else: print mo.name, "not available"
def make_predictions_from_variants(variants_all, methods, alleles, minlength, maxlength, martsadapter, protein_db, identifier, metadata, transcriptProteinMap):
    """Predict epitopes for variant-carrying peptides and build result tables.

    Proteins are generated from the variants once. For every peptide
    length in ``range(minlength, maxlength)`` the proteins are cut into
    peptides, peptides without variants are dropped, peptides found in
    ``protein_db`` are filtered out, and every requested method is run on
    the remainder. The per-method results are concatenated and annotated.

    :param variants_all: variants handed to the transcript generator
    :param methods: dict mapping method name to version string
    :param alleles: alleles to predict for; each must expose ``locus``,
                    ``supertype`` and ``subtype``
    :param minlength: first peptide length (inclusive)
    :param maxlength: end of the length range (exclusive, as in ``range``)
    :param martsadapter: adapter handed to the transcript generator
    :param protein_db: object whose ``exists(sequence)`` flags self peptides
    :param identifier: not used by this function
    :param metadata: names of additional metadata columns to create
    :param transcriptProteinMap: not used by this function
    :return: tuple ``(pred_dataframes, statistics, all_peptides_filtered,
             prots)`` with one data frame per peptide length that yielded
             results
    """
    # list for all peptides and filtered peptides
    all_peptides = []
    all_peptides_filtered = []

    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    prots = list(generator.generate_proteins_from_transcripts(
        generator.generate_transcripts_from_variants(variants_all, martsadapter, ID_SYSTEM_USED)))

    for peplen in range(minlength, maxlength):
        peptides_var = list(generator.generate_peptides_from_proteins(prots, peplen))

        # remove peptides which are not 'variant relevant'
        peptides = [x for x in peptides_var if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]

        # filter out self peptides; a set keeps the membership test O(1)
        selfies = set(str(p) for p in peptides if protein_db.exists(str(p)))
        filtered_peptides = [p for p in peptides if str(p) not in selfies]

        all_peptides = all_peptides + peptides
        all_peptides_filtered = all_peptides_filtered + filtered_peptides

        results = []

        if len(filtered_peptides) > 0:
            for method, version in methods.items():
                try:
                    predictor = EpitopePredictorFactory(method, version=version)
                    results.extend([predictor.predict(filtered_peptides, alleles=alleles)])
                except Exception:
                    # Best effort: a method that cannot handle this
                    # length/allele combination is skipped. Exception
                    # (not a bare except) lets Ctrl-C and sys.exit pass.
                    logger.warning("Prediction for length {length} and allele {allele} not possible with {method} version {version}.".format(length=peplen, allele=','.join([str(a) for a in alleles]), method=method, version=version))

        if len(results) == 0:
            continue

        df = pd.concat(results)

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele, peplen)

        # annotation columns are derived from the (not yet reset) index
        df.insert(0, 'length', df.index.map(create_length_column_value))
        df['chr'] = df.index.map(create_variant_chr_column_value)
        df['pos'] = df.index.map(create_variant_pos_column_value)
        df['gene'] = df.index.map(create_gene_column_value)
        df['transcripts'] = df.index.map(create_transcript_column_value)
        df['proteins'] = df.index.map(create_protein_column_value)
        df['variant type'] = df.index.map(create_variant_type_column_value)
        df['synonymous'] = df.index.map(create_variant_syn_column_value)
        df['homozygous'] = df.index.map(create_variant_hom_column_value)
        df['variant details (genomic)'] = df.index.map(create_mutationsyntax_genome_column_value)
        df['variant details (protein)'] = df.index.map(create_mutationsyntax_column_value)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        # For every allele column add "<allele> affinity" and
        # "<allele> binder" right after it and turn the column itself into
        # "<allele> score".
        # NOTE(review): columns are inserted and renamed while iterating
        # df.columns; this relies on the loop walking the column index
        # fetched when the loop started - confirm.
        for c in df.columns:
            if ('HLA-' in str(c)) or ('H-2-' in str(c)):
                idx = df.columns.get_loc(c)
                df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1))
                df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method']), axis=1))
                df = df.rename(columns={c: '%s score' % c})
                df['%s score' % c] = df.apply(lambda x: create_score_values(float(x['%s score' % c]), x['Method']), axis=1)

        for c in metadata:
            df[c] = df.apply(lambda row: create_metadata_column_value(row, c), axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    # Despite their names, the two "number_of_unique_peptides*" entries
    # hold lists of peptide strings, not counts.
    statistics = {'prediction_methods': [method + "-" + version for method, version in methods.items()],
                  'number_of_variants': len(variants_all),
                  'number_of_unique_peptides': [str(p) for p in all_peptides],
                  'number_of_unique_peptides_after_filtering': [str(p) for p in all_peptides_filtered]}

    return pred_dataframes, statistics, all_peptides_filtered, prots
def __main__(): parser = argparse.ArgumentParser(version=VERSION) parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True) parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True) parser.add_argument( '-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.") parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction") parser.add_argument( '-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]". format(m=PRED_METH)) parser.add_argument( '-f', "--filter", dest="filter", type=float, help= "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict" ) parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.") parser.add_argument( '-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.") options = parser.parse_args() if len(sys.argv) <= 1: parser.print_help() sys.exit(1) if options.filter and not options.predict_with: parser.print_help() print "Need alleles with predict option, aborting!" sys.exit(1) if options.predict_with and not options.alleles: parser.print_help() print "Need alleles with predict option, aborting!" sys.exit(1) temp_dir = "/tmp/" logging.basicConfig( filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log', filemode='w+', level=logging.DEBUG) #, format='%(levelname)s:%(message)s' logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now())) logging.warning("verbosity turned on") #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts # complete proteins? # only containing binders? 
# k-mers? # binders only? # FastaSlicer.py? # remove original if homozygous (needs fasta input)? # add germline variant option? or expect all to be in one vcf? # MyObject = type('MyObject', (object,), {}) # options = MyObject() # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf") # # vt = os.path.splitext(options.var_file)[-1] # if ".vcf" == vt: # vcfvars, accessions = FileReader.read_vcf(options.var_file) # # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org") # # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) # transcripts = [x for x in transcript_gen if x.vars] # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) # protein_gen = g.generate_proteins_from_transcripts(transcript_gen) # proteins = [x for x in protein_gen if x.vars] # for p in proteins: # p.gene_id = p.vars.values()[0][0].gene # # # for t in transcripts: # t.gene_id = t.vars.values()[0].gene # vt = os.path.splitext(options.var_file)[-1] if ".vcf" == vt: vcfvars, accessions = FileReader.read_vcf(options.var_file) elif ".GSvar" == vt: pass # vcfvars = FileReader.read_GSvar(options.var_file) else: m = "Could not read variants {f}, aborting.".format(f=options.var_file) logging.error(m) print m sys.exit(1) mart_db = MartsAdapter(biomart="http://grch37.ensembl.org" ) # TODO guess id_type for mart_db from accessions transcript_gen = g.generate_transcripts_from_variants( vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) protein_gen = g.generate_proteins_from_transcripts(transcript_gen) proteins = [x for x in protein_gen if x.vars] # removing unvaried for p in proteins: p.gene_id = p.vars.values( )[0][0].gene # assume gene name from first variant proteins = [p for p in proteins if not is_stop_gain(p)] # kick out stop gains # First exit option if not (options.predict_with or options.filter) and options.only_proteins: if options.basefasta_path: 
# TODO - replace from base fasta print "N/A" sys.exit(0) else: e = proteins_to_fasta(proteins) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) # From now on, digestion must be set somehow if not options.digest: digest = 9 else: digest = options.digest peptide_gen = g.generate_peptides_from_proteins(proteins, digest) peptides = [x for x in peptide_gen] peptides_var = [ x for x in peptides if any( x.get_variants_by_protein(y) for y in x.proteins.keys()) ] # removing unvaried # Second exit option if not (options.predict_with or options.filter): e = peptides_to_fasta(peptides_var) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) # From now on, predictions are needed try: target_alleles_set = set( FileReader.read_lines(options.alleles, in_type=Allele)) except Exception as e: m = "Could not read alleles file {f}, aborting.".format( f=options.alleles) logging.error(m) print m, "what:", str(e) sys.exit(1) try: ttn = EpitopePredictorFactory(options.predict_with) except Exception as e: m = "Could not initialize prediction method {f}, aborting.".format( f=options.predict_with) logging.error(m) print m sys.exit(1) try: preds = ttn.predict(peptides_var, alleles=target_alleles_set) except Exception as e: print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str( e) sys.exit(1) # punch prediction results in peptide metadata (inside pandas dataframe) #PRED_METH = set() for i, row in preds.iterrows(): for j in i[1:]: i[0].log_metadata(j, dict(zip(row.index, row.values))) #PRED_METH.add(j) # need that later # Third exit option if not options.filter: if options.only_proteins: if options.basefasta_path: # TODO - replace from base fasta plus prediction annotation print "N/A" sys.exit(0) else: prs = annotate_protein_from_peptides(preds) e = proteins_to_fasta(prs) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) else: e = peptides_to_fasta(preds) with open(options.outfile_path, 'w') as f: 
f.write(e) sys.exit(0) # kick out nonbinder preds_f = preds[(preds > options.filter).any(axis=1)] # Fourth exit option if options.only_proteins: if options.basefasta_path: # TODO - replace from base fasta binders only plus prediction annotation print "N/A" sys.exit(0) else: prs = annotate_protein_from_peptides(preds_f) e = proteins_to_fasta(prs) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) else: e = peptides_to_fasta(preds_f) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0)
def toplevel_predictor(x):
    """Predict with netMHC 3.4 for an iterable of peptide sequences.

    :param x: iterable whose items are each wrapped in ``Peptide``
    :return: whatever ``predict`` returns when called with the peptides
             only (no alleles are passed)
    """
    wrapped = list(map(Peptide, x))
    netmhc = EpitopePredictorFactory("netMHC", version="3.4")
    return netmhc.predict(wrapped)
def __main__(): parser = argparse.ArgumentParser(version=VERSION) parser.add_argument('-c', dest="mhcclass", help='<Required> MHC class', required=True) parser.add_argument('-in', dest="inf", help='<Required> full path to the input file', required=True) parser.add_argument('-out', dest="out", help="<Required> full path to the output file", required=True) parser.add_argument( '-allele', dest="allele", help= "<Required> full path to an allele file, if 'in', allele file will be deduced from in file name", required=True) parser.add_argument( '-dirallele', dest="dirallele", help= "for use with '-allele in', describes full base path to the allele files" ) options = parser.parse_args() if len(sys.argv) <= 1: parser.print_help() sys.exit(1) if not (options.inf or options.out or options.allele): parser.print_help() sys.exit(1) target_alleles_set = set() #Fred2.FileReader.read_lines is broken #alleles = FileReader.read_lines(options.allele, type=Allele) if options.allele == "in" and options.dirallele: if "_W_" not in options.inf: print "No class 1 type run detected." 
sys.exit(0) af = None for sp in options.inf.split("_"): if sp.startswith("BD"): af = join(options.dirallele, sp.split("-")[1] + ".allele") with open(af, 'r') as handle: for line in handle: target_alleles_set.add(Allele(line.strip().upper())) else: with open(options.allele, 'r') as handle: for line in handle: target_alleles_set.add(Allele(line.strip().upper())) if not target_alleles_set: parser.print_help() sys.exit(1) if options.mhcclass == "I": ttn = EpitopePredictorFactory('netmhcpan', version='3.0') lowerBound = 8 upperBound = 12 elif options.mhcclass == "II": ttn = EpitopePredictorFactory('netmhcIIpan', version='3.1') lowerBound = 15 upperBound = 25 pros = list() peps = list() f = oms.IdXMLFile() f.load(options.inf, pros, peps) pepstr = set() for pep in peps: for h in pep.getHits(): #if "decoy" not in h.getMetaValue("target_decoy"): unmod = h.getSequence().toUnmodifiedString() if lowerBound <= len(unmod) <= upperBound \ and 'U' not in unmod and 'B' not in unmod and 'X' not in unmod and 'Z' not in unmod: pepstr.add(h.getSequence().toUnmodifiedString()) es = [Peptide(x) for x in pepstr] try: preds_n = ttn.predict(es, alleles=target_alleles_set) except Exception as e: print "something went wrong with the netMHC prediction", options.inf, "what:", str( e) sys.exit(1) #only max preds = dict() for index, row in preds_n.iterrows(): score = row.max() #bigger_is_better allele = str(row.idxmax()) categ = categorize(score) seq = row.name[0].tostring() if categ: preds[seq] = (allele, categ, score) npeps = list() for pep in peps: hits = pep.getHits() nhits = list() for h in hits: if h.getSequence().toUnmodifiedString() in preds: x = preds[h.getSequence().toUnmodifiedString()] h.setMetaValue('binder', x[0]) h.setMetaValue(str(x[1]), x[2]) nhits.append(h) else: nhits.append(h) pep.setHits(nhits) f.store(options.out, pros, peps)
def toplevel_predictor(x):
    """Predict with netMHC 3.4 for an iterable of peptide sequences.

    :param x: iterable whose items are each wrapped in ``Peptide``
    :return: whatever ``predict`` returns when called with the peptides
             only (no alleles are passed)
    """
    wrapped = list(map(Peptide, x))
    netmhc = EpitopePredictorFactory("netMHC", version="3.4")
    return netmhc.predict(wrapped)
return matrix_max( getattr( __import__("Fred2.Data.pssms.syfpeithi" + ".mat." + allele_model, fromlist=[allele_model]), allele_model)) except ImportError: return None # Calculate the maximum attainable score for each allele converted_alleles = dict(zip(alleles, predictor.convert_alleles(alleles))) max_score_by_allele = {(allele, length): load_allele_model(converted_alleles[allele], length) for length in predictor.supportedLength for allele in alleles} # Run predictions and output results print 'Peptide\tAllele\tSyfpeithiRawScore\tSyfpeithiNormScore' for pep_len, peptides in peptides_by_length.items(): for allele in alleles: if (allele, pep_len ) in max_score_by_allele and max_score_by_allele[allele, pep_len]: results = predictor.predict(peptides, alleles=[allele]) for index, row in results.iterrows(): print '{}\t{}\t{}\t{}'.format( str(index[0]), allele, row[allele], float(row[allele]) / max_score_by_allele[allele, pep_len]) else: for peptide in peptides: print '{}\t{}\t{}\t{}'.format(peptide, allele, 'NA', 'NA')
def make_predictions_from_peptides(peptides, methods, alleles, protein_db, identifier, metadata):
    """Predict epitopes for given peptides and build annotated result tables.

    Peptides found in ``protein_db`` are filtered out, the rest is grouped
    by sequence length, and every requested method is run per length
    group. The per-method results of one length are concatenated and
    annotated with metadata, affinity, binder and score columns.

    :param peptides: peptides to predict for; ``str(p)`` is their
                     sequence, and they must offer ``get_metadata`` for
                     the requested metadata columns
    :param methods: dict mapping method name to version string
    :param alleles: alleles to predict for; each must expose ``locus``,
                    ``supertype`` and ``subtype``
    :param protein_db: object whose ``exists(sequence)`` flags self peptides
    :param identifier: not used by this function
    :param metadata: names of metadata columns available on the peptides
    :return: tuple ``(pred_dataframes, statistics)`` with one data frame
             per peptide length that yielded results
    """
    # dictionaries for syfpeithi matrices max values and allele mapping
    max_values_matrices = {}
    allele_string_map = {}

    # list to hold dataframes for all predictions
    pred_dataframes = []

    # filter out self peptides; a set keeps the membership test O(1)
    selfies = set(str(p) for p in peptides if protein_db.exists(str(p)))
    peptides_filtered = [p for p in peptides if str(p) not in selfies]

    # group peptides by length (predictions are run per length)
    sorted_peptides = {}
    for p in peptides_filtered:
        sorted_peptides.setdefault(len(str(p)), []).append(p)

    for peplen in sorted_peptides:
        all_peptides_filtered = sorted_peptides[peplen]
        results = []
        for method, version in methods.items():
            try:
                predictor = EpitopePredictorFactory(method, version=version)
                results.extend([predictor.predict(all_peptides_filtered, alleles=alleles)])
            except Exception:
                # Best effort: a method that cannot handle this
                # length/allele combination is skipped. Exception (not a
                # bare except) lets Ctrl-C and sys.exit pass.
                logger.warning("Prediction for length {length} and allele {allele} not possible with {method} version {version}. No model available.".format(length=peplen, allele=','.join([str(a) for a in alleles]), method=method, version=version))

        # merge dataframes of the performed predictions
        if len(results) == 0:
            continue

        df = pd.concat(results)
        df.insert(0, 'length', df.index.map(create_length_column_value))

        for a in alleles:
            conv_allele = "%s_%s%s" % (a.locus, a.supertype, a.subtype)
            allele_string_map['%s_%s' % (a, peplen)] = '%s_%i' % (conv_allele, peplen)
            max_values_matrices['%s_%i' % (conv_allele, peplen)] = get_matrix_max_score(conv_allele, peplen)

        # reset index to have index as columns
        df.reset_index(inplace=True)

        # These columns always exist; they are NaN when the peptides carry
        # no metadata of that name. row[0] is the first column after the
        # index reset - presumably the peptide object; confirm.
        mandatory_columns = ['chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'synonymous', 'homozygous', 'variant details (genomic)', 'variant details (protein)']
        for header in mandatory_columns:
            if header not in metadata:
                df[header] = np.nan
            else:
                df[header] = df.apply(lambda row: row[0].get_metadata(header)[0], axis=1)

        for c in list(set(metadata) - set(mandatory_columns)):
            df[c] = df.apply(lambda row: row[0].get_metadata(c)[0], axis=1)

        # For every allele column add "<allele> affinity" and
        # "<allele> binder" right after it and turn the column itself into
        # "<allele> score".
        # NOTE(review): columns are inserted and renamed while iterating
        # df.columns; this relies on the loop walking the column index
        # fetched when the loop started - confirm.
        for c in df.columns:
            if ('HLA-' in str(c)) or ('H-2-' in str(c)):
                idx = df.columns.get_loc(c)
                df.insert(idx + 1, '%s affinity' % c, df.apply(lambda x: create_affinity_values(str(c), int(x['length']), float(x[c]), x['Method'], max_values_matrices, allele_string_map), axis=1))
                df.insert(idx + 2, '%s binder' % c, df.apply(lambda x: create_binder_values(float(x['%s affinity' % c]), x['Method']), axis=1))
                df = df.rename(columns={c: '%s score' % c})
                df['%s score' % c] = df.apply(lambda x: create_score_values(float(x['%s score' % c]), x['Method']), axis=1)

        df = df.rename(columns={'Seq': 'sequence'})
        df = df.rename(columns={'Method': 'method'})
        pred_dataframes.append(df)

    # Prediction statistics. Despite their names, the two
    # "number_of_unique_peptides*" entries hold lists of peptide strings.
    statistics = {'prediction_methods': [method + "-" + version for method, version in methods.items()],
                  'number_of_variants': '-',
                  'number_of_unique_peptides': [str(p) for p in peptides],
                  'number_of_unique_peptides_after_filtering': [str(p) for p in peptides_filtered]}

    return pred_dataframes, statistics
def __main__(): parser = argparse.ArgumentParser(version=VERSION) parser.add_argument('-in', dest="inf", help='<Required> full path to the input file', required=True) parser.add_argument('-out', dest="out", help="<Required> full path to the output file", required=True) parser.add_argument('-allele', dest="allele", help="<Required> full path to an allele file, if 'in', allele file will be deduced from in file name", required=True) parser.add_argument('-dirallele', dest="dirallele", help="for use with '-allele in', describes full base path to the allele files") options = parser.parse_args() if len(sys.argv) <= 1: parser.print_help() sys.exit(1) if not (options.inf or options.out or options.allele): parser.print_help() sys.exit(1) target_alleles_set = set() #Fred2.FileReader.read_lines is broken #alleles = FileReader.read_lines(options.allele, type=Allele) if options.allele == "in" and options.dirallele: if "_W_" not in options.inf: print "No class 1 type run detected." sys.exit(0) af = None for sp in options.inf.split("_"): if sp.startswith("BD"): af = join(options.dirallele, sp.split("-")[1] + ".allele") with open(af, 'r') as handle: for line in handle: target_alleles_set.add(Allele(line.strip().upper())) else: with open(options.allele, 'r') as handle: for line in handle: target_alleles_set.add(Allele(line.strip().upper())) if not target_alleles_set: parser.print_help() sys.exit(1) ttn = EpitopePredictorFactory('netmhc') pros = list() peps = list() f = oms.IdXMLFile() f.load(options.inf, pros, peps) pepstr = set() for pep in peps: for h in pep.getHits(): #if "decoy" not in h.getMetaValue("target_decoy"): unmod = h.getSequence().toUnmodifiedString() if 7 < len(unmod) < 12 \ and 'U' not in unmod and 'B' not in unmod and 'X' not in unmod and 'Z' not in unmod: pepstr.add(h.getSequence().toUnmodifiedString()) es = [Peptide(x) for x in pepstr] try: preds_n = ttn.predict(es, alleles=target_alleles_set) except Exception as e: print "something went wrong with the netMHC 
prediction", options.inf, "what:", str(e) sys.exit(1) #only max preds = dict() for index, row in preds_n.iterrows(): score = row.max() #bigger_is_better allele = str(row.idxmax()) categ = categorize(score) seq = row.name[0].tostring() if categ: preds[seq] = (allele, categ, score) npeps = list() for pep in peps: hits = pep.getHits() nhits = list() for h in hits: if h.getSequence().toUnmodifiedString() in preds: x = preds[h.getSequence().toUnmodifiedString()] h.setMetaValue('binder', x[0]) h.setMetaValue(str(x[1]), x[2]) nhits.append(h) else: nhits.append(h) pep.setHits(nhits) f.store(options.out, pros, peps)
def __main__(): parser = argparse.ArgumentParser(version=VERSION) parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True) parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True) parser.add_argument('-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.") parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction") parser.add_argument('-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH)) parser.add_argument('-f', "--filter", dest="filter", type=float, help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict") parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.") parser.add_argument('-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.") options = parser.parse_args() if len(sys.argv) <= 1: parser.print_help() sys.exit(1) if options.filter and not options.predict_with: parser.print_help() print "Need alleles with predict option, aborting!" sys.exit(1) if options.predict_with and not options.alleles: parser.print_help() print "Need alleles with predict option, aborting!" sys.exit(1) temp_dir = "/tmp/" logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log', filemode='w+', level=logging.DEBUG) #, format='%(levelname)s:%(message)s' logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now())) logging.warning("verbosity turned on") #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts # complete proteins? # only containing binders? # k-mers? 
# binders only? # FastaSlicer.py? # remove original if homozygous (needs fasta input)? # add germline variant option? or expect all to be in one vcf? # MyObject = type('MyObject', (object,), {}) # options = MyObject() # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf") # # vt = os.path.splitext(options.var_file)[-1] # if ".vcf" == vt: # vcfvars, accessions = FileReader.read_vcf(options.var_file) # # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org") # # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) # transcripts = [x for x in transcript_gen if x.vars] # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) # protein_gen = g.generate_proteins_from_transcripts(transcript_gen) # proteins = [x for x in protein_gen if x.vars] # for p in proteins: # p.gene_id = p.vars.values()[0][0].gene # # # for t in transcripts: # t.gene_id = t.vars.values()[0].gene # vt = os.path.splitext(options.var_file)[-1] if ".vcf" == vt: vcfvars, accessions = FileReader.read_vcf(options.var_file) elif ".GSvar" == vt: pass # vcfvars = FileReader.read_GSvar(options.var_file) else: m = "Could not read variants {f}, aborting.".format(f=options.var_file) logging.error(m) print m sys.exit(1) mart_db = MartsAdapter(biomart="http://grch37.ensembl.org") # TODO guess id_type for mart_db from accessions transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) protein_gen = g.generate_proteins_from_transcripts(transcript_gen) proteins = [x for x in protein_gen if x.vars] # removing unvaried for p in proteins: p.gene_id = p.vars.values()[0][0].gene # assume gene name from first variant proteins = [p for p in proteins if not is_stop_gain(p)] # kick out stop gains # First exit option if not (options.predict_with or options.filter) and options.only_proteins: if options.basefasta_path: # TODO - 
replace from base fasta print "N/A" sys.exit(0) else: e = proteins_to_fasta(proteins) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) # From now on, digestion must be set somehow if not options.digest: digest = 9 else: digest = options.digest peptide_gen = g.generate_peptides_from_proteins(proteins, digest) peptides = [x for x in peptide_gen] peptides_var = [x for x in peptides if any(x.get_variants_by_protein(y) for y in x.proteins.keys())] # removing unvaried # Second exit option if not (options.predict_with or options.filter): e = peptides_to_fasta(peptides_var) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) # From now on, predictions are needed try: target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele)) except Exception as e: m = "Could not read alleles file {f}, aborting.".format(f=options.alleles) logging.error(m) print m, "what:", str(e) sys.exit(1) try: ttn = EpitopePredictorFactory(options.predict_with) except Exception as e: m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with) logging.error(m) print m sys.exit(1) try: preds = ttn.predict(peptides_var, alleles=target_alleles_set) except Exception as e: print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(e) sys.exit(1) # punch prediction results in peptide metadata (inside pandas dataframe) #PRED_METH = set() for i, row in preds.iterrows(): for j in i[1:]: i[0].log_metadata(j, dict(zip(row.index, row.values))) #PRED_METH.add(j) # need that later # Third exit option if not options.filter: if options.only_proteins: if options.basefasta_path: # TODO - replace from base fasta plus prediction annotation print "N/A" sys.exit(0) else: prs = annotate_protein_from_peptides(preds) e = proteins_to_fasta(prs) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) else: e = peptides_to_fasta(preds) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) # 
kick out nonbinder preds_f = preds[(preds > options.filter).any(axis=1)] # Fourth exit option if options.only_proteins: if options.basefasta_path: # TODO - replace from base fasta binders only plus prediction annotation print "N/A" sys.exit(0) else: prs = annotate_protein_from_peptides(preds_f) e = proteins_to_fasta(prs) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) else: e = peptides_to_fasta(preds_f) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0)
def test_single_peptide_input_mhcII(self):
    """Call every non-external predictor with one peptide and one allele.

    Each available method is instantiated with its default version.
    External predictors are skipped, as are predictors that do not support
    every allele in ``self.mhcII``. The remaining ones predict for the
    first MHC-II peptide against the second MHC-II allele. Nothing is
    asserted; the test fails only on an exception.
    """
    for method in EpitopePredictorFactory.available_methods():
        predictor = EpitopePredictorFactory(method)
        if isinstance(predictor, AExternalEpitopePrediction):
            continue
        if not all(a.name in predictor.supportedAlleles
                   for a in self.mhcII):
            continue
        predictor.predict(self.peptides_mhcII[0], alleles=self.mhcII[1])