def test_read_vcf(self):
    # Checks the number of variants FileReader.read_vcf returns for the test
    # VCF files and the CDS mutation syntax of selected transcripts.

    # general
    variants, _ = FileReader.read_vcf(self.vcf_path1)
    self.assertEqual(len(variants), 17)

    # no annotations
    self.assertWarnings(UserWarning, FileReader.read_vcf, self.vcf_path2)

    # variation types
    variants, _ = FileReader.read_vcf(self.vcf_path3)
    # stopgaininsertion will yield no variant
    self.assertEqual(len(variants), 5)

    # (variant index, transcript id, expected CDS mutation syntax)
    expected_syntax = (
        (0, 'NM_014675', 'c.6026C>G'),
        (1, 'NM_015237', 'c.1225C>T'),
        (2, 'NM_014826', 'c.2091delG'),
        (2, 'NM_003607', 'c.2334delG'),
        (3, 'NM_002318', 'c.142_162delCAGGCCCCCGCCAACGTGGCC'),
        (4, 'NM_152888', 'c.2086G>T'),
    )
    for index, transcript, syntax in expected_syntax:
        self.assertEqual(
            variants[index].coding[transcript].cdsMutationSyntax, syntax)
def main(input_peptides, input_vaccine_sequences, output_vaccine_epitopes, verbose, log_file):
    '''
    Reads the vaccine produced by Fischer's online tool and converts it into epitopes
    '''
    # input_peptides: CSV file with a 'peptide' column listing the known peptides.
    # input_vaccine_sequences: FASTA file with the mosaic sequences of the vaccine.
    # output_vaccine_epitopes: CSV file written with the columns
    #     cocktail (0-based mosaic number), index (running number of the
    #     recognized 9-mer within its mosaic) and epitope.
    # verbose, log_file: forwarded to utilities.init_logging.
    global LOGGER
    LOGGER = utilities.init_logging(verbose, log_file, log_append=False)

    LOGGER.info('Reading peptides...')
    with open(input_peptides) as peptide_file:
        known_peptides = {record['peptide'] for record in csv.DictReader(peptide_file)}
    LOGGER.info('Read %d peptides', len(known_peptides))

    LOGGER.info('Reading vaccine...')
    mosaics = FileReader.read_fasta(input_vaccine_sequences, in_type=Protein)
    LOGGER.info('Vaccine has %d mosaic(s)', len(mosaics))

    with open(output_vaccine_epitopes, 'w') as out_file:
        epitope_writer = csv.writer(out_file)
        epitope_writer.writerow(('cocktail', 'index', 'epitope'))

        for cocktail_idx, mosaic in enumerate(mosaics):
            recognized = unknown = 0

            # slide a window of nine residues over the mosaic
            for start in range(len(mosaic) - 8):
                window = mosaic[start:start + 9]
                assert len(window) == 9

                if window not in known_peptides:
                    unknown += 1
                    continue

                epitope_writer.writerow((cocktail_idx, recognized, window))
                recognized += 1

            LOGGER.info('Mosaic %d - Recognized: %d Unknown %d',
                        cocktail_idx + 1, recognized, unknown)
def read_annotated_proteins(proteins_file):
    '''
    Load the proteins of a fasta file and reduce every transcript id to the
    text after its last dot.

    The header layout is, according to the original note, the one used by
    the HIV database.

    :param proteins_file: path of the fasta file, handed to FileReader.read_fasta
    :return: the proteins returned by FileReader.read_fasta, with their
             transcript_id attribute rewritten in place
    '''
    annotated = FileReader.read_fasta(proteins_file, in_type=Protein)
    for entry in annotated:
        # keep only the last dot-separated field of the header id
        entry.transcript_id = entry.transcript_id.rsplit('.', 1)[-1]
    return annotated
def test_read_lines(self):
    # The allele file at ale_path has to yield two entries.
    parsed = FileReader.read_lines(self.ale_path, in_type=Allele)
    self.assertEqual(len(parsed), 2)

    # ale_no_path has to fail with IOError, ale_zonk_path with ValueError.
    failing_inputs = (
        (IOError, self.ale_no_path),
        (ValueError, self.ale_zonk_path),
    )
    for expected_error, bad_path in failing_inputs:
        self.assertRaises(expected_error, FileReader.read_lines,
                          bad_path, in_type=Allele)
def main():
    """Command line entry point for the string-of-beads spacer design.

    Parses the command line, reads the epitopes and HLA alleles, builds an
    ``EpitopeAssemblyWithSpacer`` model and solves it either approximately or
    optimally (``--tsp-solution``).  With ``--random-order`` the epitopes are
    shuffled (seeded by ``--seed``) and joined with the spacers computed by
    the solver for each neighbouring pair.  The resulting string-of-beads is
    printed and written to the output file as a ``-``-joined string.

    Exits with status -1 when an unsupported cleavage or epitope prediction
    method is requested.
    """
    parser = argparse.ArgumentParser(
        description="""The software is a novel approach to construct epitope-based string-of-beads
        vaccines in optimal order and with sequence-optimized spacers of flexible length
        such that the recovery of contained epitopes is maximized and immunogenicity of
        arising neo-epitopes is reduced. """)
    parser.add_argument("-i", "--input", required=True,
                        help="File containing epitopes (one peptide per line)")
    parser.add_argument(
        "-a", "--alleles", required=True,
        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)")

    # parameters of the model
    parser.add_argument(
        "-k", "--max_length", default=6, type=int,
        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument(
        "-al", "--alpha", default=0.99, type=float,
        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument(
        "-be", "--beta", default=0.0, type=float,
        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")
    parser.add_argument(
        "-cp", "--cleavage_prediction", default="PCM",
        help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]")
    parser.add_argument(
        "-ep", "--epitope_prediction", default="Syfpeithi",
        help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]")
    parser.add_argument(
        "-thr", "--threshold", default=20, type=float,
        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")
    parser.add_argument("-o", "--output", required=True,
                        help="Specifies the output file.")
    parser.add_argument(
        "-t", "--threads", type=int, default=None,
        help="Specifies number of threads. If not specified all available logical cpus are used.")
    parser.add_argument(
        "--ips-solver", default="cplex", choices=["cplex", "cbc"],
        help="Executable name of the IPS solver. Executable needs to be available in PATH.")
    parser.add_argument("--tsp-solution", default="approximate",
                        choices=["approximate", "optimal"],
                        help="Type of solution of the TSP")
    parser.add_argument(
        "--random-order", action="store_true",
        help="Indicate whether to generate a random ordered string-of-beads polypeptide")
    parser.add_argument(
        "--seed", type=int, default=1,
        help="Seed for random ordering of string-of-beads polypeptide")
    args = parser.parse_args()

    # parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    # read in alleles
    alleles = generate_alleles(args.alleles)

    # Method names are matched case-insensitively, hence no argparse choices.
    if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_S"]:
        print("Specified cleavage predictor is currently not supported. "
              "Please choose either PCM, PROTEASMM_C, or PROTEASMM_S")
        sys.exit(-1)

    if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]:
        # This message used to complain about the *cleavage* predictor.
        print("Specified epitope predictor is currently not supported. "
              "Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC")
        sys.exit(-1)

    # set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    # the same threshold is used for every allele
    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides, cl_pred, epi_pred, alleles,
                                       k=args.max_length, en=9, threshold=thr,
                                       solver=args.ips_solver, alpha=args.alpha,
                                       beta=args.beta, verbosity=0)

    # solve
    # pre-processing has to be disabled, otherwise many solvers will destroy
    # the symmetry of the problem. How to do this depends on the solver used.
    # For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads
    solver_options = {"preprocessing_presolve": "n", "threads": 1}
    if args.tsp_solution == "approximate":
        svbws = solver.approximate(threads=threads, options=solver_options)
    else:
        svbws = solver.solve(threads=threads, options=solver_options)

    # Generate a randomly ordered string-of-beads, but still use the optimal
    # spacers determined by the solve call above.
    if args.random_order:
        print("Generating a randomly ordered polypeptide")
        random.seed(args.seed)
        random.shuffle(peptides)

        random_order_sob = []
        # Each neighbouring pair contributes the left peptide and its spacer;
        # the right peptide is added as the left one of the next pair.
        for left, right in zip(peptides, peptides[1:]):
            left_peptide = str(left)
            opt_spacer = solver.spacer[(left_peptide, str(right))]
            random_order_sob.extend([Peptide(left_peptide), Peptide(opt_spacer)])
        # the last peptide has no spacer after it
        if peptides:
            random_order_sob.append(Peptide(str(peptides[-1])))
        svbws = random_order_sob

    string_of_beads = "-".join(map(str, svbws))
    print("")
    # two blanks after the colon reproduce the former two-argument print
    print("Resulting String-of-Beads:  " + string_of_beads)
    print("")
    with open(args.output, "w") as f:
        f.write(string_of_beads)
def test_read_annovar_exonic(self):
    # The annovar file at ano_path has to yield five entries.
    parsed = FileReader.read_annovar_exonic(self.ano_path)
    entry_count = len(parsed)
    self.assertEqual(entry_count, 5)
def test_read_fasta(self):
    # (fasta path, expected number of sequences); the second file has
    # headers without "|"
    cases = (
        (self.fa_path, 2),
        (self.fa_unconventional_path, 174),
    )
    for fasta_path, expected_count in cases:
        sequences = FileReader.read_fasta(fasta_path)
        self.assertEqual(len(sequences), expected_count)
def __main__():
    """Create a fasta file of varied proteins or peptides from a variant file.

    Depending on the options the function stops at one of four points and
    writes the fasta text to ``--outfile``:

    1. ``--Proteins`` without prediction: all varied proteins.
    2. no prediction: all peptides of length ``--digest`` (default 9) that
       carry a variant.
    3. ``--predict`` without ``--filter``: the prediction results, as peptides
       or (with ``--Proteins``) as annotated proteins.
    4. ``--predict`` with ``--filter``: as 3., restricted to rows with at
       least one prediction above the threshold.

    Replacing entries of a base fasta (``--base``) is not implemented; the
    branches that would need it print "N/A" and exit with status 0.
    All error paths exit with status 1.
    """
    parser = argparse.ArgumentParser()
    # Equivalent to the Python-2-only ArgumentParser(version=...) keyword.
    parser.add_argument('-v', '--version', action='version', version=VERSION,
                        help="show program's version number and exit")
    parser.add_argument('-V', '--variations', dest="var_file",
                        help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path",
                        help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int,
                        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with",
                        help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float,
                        help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true',
                        help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path",
                        help="If given, entries are replaced by the variation.")

    # Must run before parse_args(): with required options missing,
    # parse_args() exits on its own and the help would never be shown.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    if options.filter and not options.predict_with:
        parser.print_help()
        print("Need predict option with filter, aborting!")
        sys.exit(1)
    if options.predict_with and not options.alleles:
        parser.print_help()
        print("Need alleles with predict option, aborting!")
        sys.exit(1)

    # The log file is placed next to the output file, with a time stamp.
    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0]
                        + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+',
                        level=logging.DEBUG)  # , format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path
                 + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    # Open questions kept from the original notes:
    #   complete proteins? only containing binders? k-mers? binders only?
    #   FastaSlicer.py? remove original if homozygous (needs fasta input)?
    #   add germline variant option? or expect all to be in one vcf?

    def _write_and_exit(content):
        # Write the fasta text to the output file and end the program.
        with open(options.outfile_path, 'w') as f:
            f.write(content)
        sys.exit(0)

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, _ = FileReader.read_vcf(options.var_file)
    else:
        if ".GSvar" == vt:
            # TODO vcfvars = FileReader.read_GSvar(options.var_file)
            # Falling through without variants only produced a NameError.
            m = "GSvar input is not supported yet ({f}), aborting.".format(f=options.var_file)
        else:
            m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        # assume gene name from first variant
        p.gene_id = next(iter(p.vars.values()))[0].gene

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print("N/A")
            sys.exit(0)
        _write_and_exit(proteins_to_fasta(proteins))

    # From now on, digestion must be set somehow
    digest = options.digest or 9
    peptides = list(g.generate_peptides_from_proteins(proteins, digest))
    peptides_var = [x for x in peptides
                    if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        _write_and_exit(peptides_to_fasta(peptides_var))

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print("%s what: %s" % (m, str(e)))
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print(m)
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        # The former message read the non-existent attribute options.inf.
        m = "something went wrong with the prediction {f} {p}".format(
            f=options.var_file, p=options.predict_with)
        logging.error(m)
        print("%s what: %s" % (m, str(e)))
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe);
    # the index tuple is read as (peptide, further labels...)
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))

    # Third exit option uses all predictions, the fourth kicks out nonbinders.
    if not options.filter:
        selected = preds
    else:
        selected = preds[(preds > options.filter).any(axis=1)]

    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta (binders only) plus prediction annotation
            print("N/A")
            sys.exit(0)
        _write_and_exit(proteins_to_fasta(annotate_protein_from_peptides(selected)))
    _write_and_exit(peptides_to_fasta(selected))
def __main__():
    """Run the EPAA epitope prediction and annotation pipeline.

    Reads either peptides (``--peptides``) or somatic variants
    (``--somatic_mutations``; ``.GSvar``, ``.tsv`` or ``.vcf``), runs the
    selected prediction tools for the given alleles and optionally annotates
    the result with wild type sequences, protein quantification, gene
    expression, differential expression and ligandomics values.

    Files written, all prefixed with ``--identifier``:
    ``<id>_prediction.log``, ``<id>_prediction_results.tsv``,
    ``<id>_report.json`` and, with ``--fasta_output``,
    ``<id>_prediction_proteins.fasta``.

    Raises ValueError when a tool version from the versions file is not
    among ``EpitopePredictorFactory.available_methods()``.
    """
    parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles.
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""")
    # Equivalent to the Python-2-only ArgumentParser(version=...) keyword.
    parser.add_argument('-v', '--version', action='version', version=VERSION,
                        help="show program's version number and exit")
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str)
    parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version",
                        required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")

    # Must run before parse_args(): with required options missing,
    # parse_args() exits on its own and the help would never be shown.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # Reject inputs that formerly failed late with an unrelated exception.
    if args.peptides:
        if args.wild_type or args.fasta_output:
            # both options need data that only the variant branch produces
            parser.error("--wild_type and --fasta_output require variant input (--somatic_mutations).")
    elif not args.somatic_mutations:
        parser.error("Either --peptides or --somatic_mutations has to be specified.")

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'}

    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith(('.GSvar', '.tsv')):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts, metadata = read_vcf(args.somatic_mutations)
        else:
            parser.error("Unsupported variant file {} (expected .GSvar, .tsv or .vcf).".format(args.somatic_mutations))

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(
            ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith((".fasta", ".fsa")):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    selected_methods = args.tools.split(',')
    with open(args.versions, 'r') as versions_file:
        # the first character of the version column is dropped
        # (presumably a leading "v" - confirm against the versions file)
        tool_version = [(row[0], str(row[1][1:])) for row in csv.reader(versions_file, delimiter="\t")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each selected method the corresponding tool version
        methods = {method: version
                   for tool, version in tool_version
                   for method in selected_methods
                   if tool.lower() in method.lower()}

    for method, version in methods.items():
        if version not in EpitopePredictorFactory.available_methods()[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

    # MHC class I and II predictions were issued by two identical branches
    # selected with `args.mhcclass is 1`; the class does not change the call.
    if args.peptides:
        pred_dataframes, statistics = make_predictions_from_peptides(
            peptides, methods, alleles, up_db, args.identifier, metadata)
    else:
        pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(
            vl, methods, alleles, int(args.min_length), int(args.max_length) + 1,
            ma, up_db, args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except (ValueError, TypeError):
        # pd.concat raises ValueError when there is nothing to concatenate
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")

    # replace method names with method names with version
    complete_df['method'] = complete_df['method'].apply(lambda x: x + '-' + methods[x])

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(
            lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene',
                         'transcripts', 'proteins', 'variant type', 'method']
    else:
        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene',
                         'transcripts', 'proteins', 'variant type', 'method']
    # Change the order (the index) of the columns: fixed columns first, all
    # remaining ones in their current order.
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    # A row counts as binder when any "binder" column holds True.
    binder_cols = [col for col in complete_df.columns if 'binder' in col]
    pos_predictions = []
    neg_predictions = []
    for _, r in complete_df.iterrows():
        sequence = str(r['sequence'])
        if any(r[c] is True for c in binder_cols):
            pos_predictions.append(sequence)
        else:
            neg_predictions.append(sequence)
    unique_binders = set(pos_predictions)
    # a sequence that binds in any row is never reported as non-binder
    unique_nonbinders = set(neg_predictions) - unique_binders

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = next(iter(protein_quant.values()))
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(
                lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        # One row decides which id column of the gene list is used as key.
        # The second row is probed as before; a one-row frame falls back to
        # its only row instead of raising IndexError.
        probe_row = min(1, len(complete_df) - 1)
        id_column = 0 if 'ENSG' in complete_df.iloc[probe_row]['gene'] else 1
        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                gene_id_lengths[ids[id_column]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1)

        # wild_type is a store_true flag and therefore never None; the former
        # `!= None` test added the wt columns unconditionally.
        if args.wild_type:
            complete_df['wt ligand score'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
            for p in proteins:
                variants = [var for var_list in p.vars.values() for var in var_list]
                codings = set(itertools.chain.from_iterable(x.coding.values() for x in variants))
                cds = ','.join(y.cdsMutationSyntax for y in codings)
                aas = ','.join(y.aaMutationSyntax for y in codings)
                protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv; fillna returns a new frame, so keep the result
    complete_df = complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), sep='\t', index=False)

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    # NOTE(review): despite the key names these two entries hold the
    # sequences themselves, not counts - kept as is; confirm with whatever
    # consumes the report.
    statistics['number_of_unique_binders'] = list(unique_binders)
    statistics['number_of_unique_nonbinders'] = list(unique_nonbinders)

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)

    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
def __main__():
    """Create a fasta file of varied proteins or peptides from a variant file.

    Depending on the options the function stops at one of four points and
    writes the fasta text to ``--outfile``:

    1. ``--Proteins`` without prediction: all varied proteins.
    2. no prediction: all peptides of length ``--digest`` (default 9) that
       carry a variant.
    3. ``--predict`` without ``--filter``: the prediction results, as peptides
       or (with ``--Proteins``) as annotated proteins.
    4. ``--predict`` with ``--filter``: as 3., restricted to rows with at
       least one prediction above the threshold.

    Replacing entries of a base fasta (``--base``) is not implemented; the
    branches that would need it print "N/A" and exit with status 0.
    All error paths exit with status 1.
    """
    parser = argparse.ArgumentParser()
    # Equivalent to the Python-2-only ArgumentParser(version=...) keyword.
    parser.add_argument('-v', '--version', action='version', version=VERSION,
                        help="show program's version number and exit")
    parser.add_argument('-V', '--variations', dest="var_file",
                        help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path",
                        help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int,
                        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with",
                        help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float,
                        help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true',
                        help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path",
                        help="If given, entries are replaced by the variation.")

    # Must run before parse_args(): with required options missing,
    # parse_args() exits on its own and the help would never be shown.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    if options.filter and not options.predict_with:
        parser.print_help()
        print("Need predict option with filter, aborting!")
        sys.exit(1)
    if options.predict_with and not options.alleles:
        parser.print_help()
        print("Need alleles with predict option, aborting!")
        sys.exit(1)

    # The log file is placed next to the output file, with a time stamp.
    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0]
                        + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+',
                        level=logging.DEBUG)  # , format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path
                 + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    # Open questions kept from the original notes:
    #   complete proteins? only containing binders? k-mers? binders only?
    #   FastaSlicer.py? remove original if homozygous (needs fasta input)?
    #   add germline variant option? or expect all to be in one vcf?

    def _write_and_exit(content):
        # Write the fasta text to the output file and end the program.
        with open(options.outfile_path, 'w') as f:
            f.write(content)
        sys.exit(0)

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, _ = FileReader.read_vcf(options.var_file)
    else:
        if ".GSvar" == vt:
            # TODO vcfvars = FileReader.read_GSvar(options.var_file)
            # Falling through without variants only produced a NameError.
            m = "GSvar input is not supported yet ({f}), aborting.".format(f=options.var_file)
        else:
            m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        # assume gene name from first variant
        p.gene_id = next(iter(p.vars.values()))[0].gene

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print("N/A")
            sys.exit(0)
        _write_and_exit(proteins_to_fasta(proteins))

    # From now on, digestion must be set somehow
    digest = options.digest or 9
    peptides = list(g.generate_peptides_from_proteins(proteins, digest))
    peptides_var = [x for x in peptides
                    if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        _write_and_exit(peptides_to_fasta(peptides_var))

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print("%s what: %s" % (m, str(e)))
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print(m)
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        # The former message read the non-existent attribute options.inf.
        m = "something went wrong with the prediction {f} {p}".format(
            f=options.var_file, p=options.predict_with)
        logging.error(m)
        print("%s what: %s" % (m, str(e)))
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe);
    # the index tuple is read as (peptide, further labels...)
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))

    # Third exit option uses all predictions, the fourth kicks out nonbinders.
    if not options.filter:
        selected = preds
    else:
        selected = preds[(preds > options.filter).any(axis=1)]

    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta (binders only) plus prediction annotation
            print("N/A")
            sys.exit(0)
        _write_and_exit(proteins_to_fasta(annotate_protein_from_peptides(selected)))
    _write_and_exit(peptides_to_fasta(selected))
def vaccine(input_sequences, input_peptides, input_alleles, input_epitopes,
            input_vaccine, output_summary, verbose):
    '''
    Evaluates a designed vaccine and writes its statistics to a csv file.

    The vaccine file is read as rows of (cocktail, index, epitope); epitopes of
    the same cocktail are ordered by their integer index to form one mosaic.
    Every mosaic is evaluated (and logged) on its own, then all epitopes of all
    mosaics are evaluated together and that result is written to
    `output_summary` as a header line followed by one row.

    `verbose` is accepted but not used inside this function.
    '''
    # load vaccine: cocktail id -> {position -> epitope}
    epitopes_by_cocktail = {}
    with open(input_vaccine) as handle:
        for record in csv.DictReader(handle):
            positions = epitopes_by_cocktail.setdefault(record['cocktail'], {})
            positions[int(record['index'])] = record['epitope']

    # one list of epitopes per mosaic, ordered by position
    cocktail = [
        [epitope for _, epitope in sorted(positions.items(), key=lambda item: item[0])]
        for positions in epitopes_by_cocktail.values()
    ]
    LOGGER.info('Vaccine loaded')

    # load alleles
    thresholds = utilities.get_alleles_and_thresholds(input_alleles)
    allele_data = thresholds.to_dict('index')
    LOGGER.info('Loaded %d alleles', len(allele_data))

    # load peptides coverage
    with open(input_peptides) as handle:
        peptides = {
            record['peptide']: record['proteins'].split(';')
            for record in csv.DictReader(handle)
        }
    LOGGER.info('Loaded %d peptides with coverage', len(peptides))

    # load epitopes; every peptide gets a neutral default entry first, since
    # some design methods do not use epitopes
    epitope_data = {}
    for pep, prots in peptides.items():
        epitope_data[pep] = {'immunogen': 0.0, 'alleles': [], 'proteins': prots}

    with open(input_epitopes) as handle:
        for record in csv.DictReader(handle):
            record['immunogen'] = float(record['immunogen'])
            raw_alleles = record['alleles']
            if raw_alleles:
                record['alleles'] = raw_alleles.split(';')
            else:
                record['alleles'] = []
            record['proteins'] = record['proteins'].split(';')

            # only epitopes with strictly positive immunogenicity replace the default
            if not record['immunogen'] > 0:
                continue
            epitope_data[record['epitope']] = record
    LOGGER.info('Loaded %d epitopes', len(epitope_data))

    # load sequences
    proteins = FileReader.read_fasta(input_sequences, in_type=Protein)
    LOGGER.info('Loaded %d proteins', len(proteins))

    # print stats for each mosaic
    for number, mosaic in enumerate(cocktail, 1):
        LOGGER.info('---')
        LOGGER.info('Mosaic #%d - %d epitopes', number, len(mosaic))
        for epi in mosaic:
            LOGGER.info(' %s', epi)
        evaluate_epitopes(mosaic, epitope_data, allele_data, len(proteins))

    # write csv with the statistics of the whole cocktail
    LOGGER.info('---')
    all_epitopes = []
    for mosaic in cocktail:
        all_epitopes.extend(mosaic)
    vaccine_stats = evaluate_epitopes(all_epitopes, epitope_data, allele_data, len(proteins))

    with open(output_summary, 'w') as handle:
        summary_writer = csv.DictWriter(handle, vaccine_stats.keys())
        summary_writer.writeheader()
        summary_writer.writerow(vaccine_stats)
def get_mosaic_solver_instance(logger, input_proteins, input_alleles, input_epitopes,
                               input_overlaps, **kwargs):
    '''
    Loads the design inputs and builds a TeamOrienteeringIlp solver for them.

    Required keyword arguments: top_immunogen, top_alleles, top_proteins.
    Optional keyword arguments (default in parentheses): min_overlap (0),
    cocktail (1), greedy_subtour (None), max_epitopes (None),
    max_aminoacids (None), min_alleles (0), min_proteins (0),
    min_avg_prot_conservation (0), min_avg_alle_conservation (0).

    Returns a tuple (solver, data) where data is a dictionary holding the
    loaded 'proteins', 'alleles' and 'epitope_data'.
    '''
    # mandatory settings: a missing one raises KeyError
    top_immunogen = kwargs.pop('top_immunogen')
    top_alleles = kwargs.pop('top_alleles')
    top_proteins = kwargs.pop('top_proteins')

    # optional settings
    min_overlap = kwargs.get('min_overlap', 0)
    num_teams = kwargs.get('cocktail', 1)
    greedy_subtour = kwargs.get('greedy_subtour')
    max_epitopes = kwargs.get('max_epitopes')
    max_aminoacids = kwargs.get('max_aminoacids')
    min_alleles = kwargs.get('min_alleles', 0)
    min_proteins = kwargs.get('min_proteins', 0)
    min_avg_prot_conservation = kwargs.get('min_avg_prot_conservation', 0)
    min_avg_alle_conservation = kwargs.get('min_avg_alle_conservation', 0)

    # load proteins
    logger.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    logger.info('%d proteins read', len(proteins))

    # load alleles
    alleles = []
    for allele_name in get_alleles_and_thresholds(input_alleles).index:
        alleles.append(Allele(allele_name))
    logger.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitopes_by_key = load_epitopes(input_epitopes, top_immunogen, top_alleles, top_proteins)
    epitope_data = list(epitopes_by_key.values())
    logger.info('Loaded %d epitopes', len(epitope_data))

    # load edge cost; the first vertex gets a reward of zero
    logger.info('Loading overlaps...')
    vertex_rewards = [0]
    vertex_rewards.extend(entry['immunogen'] for entry in epitope_data)
    epitope_sequences = [entry['epitope'] for entry in epitope_data]
    edges = load_edges_from_overlaps(input_overlaps, min_overlap, epitope_sequences)
    epitope_count = len(epitope_data)
    logger.info('Kept %d edges (from %d)', len(edges), epitope_count * (epitope_count + 1))

    # compute hla and protein coverage
    logger.info('Computing coverage matrix...')
    coverage = compute_coverage_matrix(
        epitope_data, min_alleles, min_proteins, min_avg_prot_conservation,
        min_avg_alle_conservation, len(proteins), len(alleles))
    type_coverage, min_type_coverage, min_avg_type_conservation = coverage

    # find optimal design; the size limits start at zero and are only
    # updated below when a numeric limit was given
    solver_settings = {
        'num_teams': num_teams,
        'vertex_reward': vertex_rewards,
        'edge_cost': edges,
        'max_edge_cost': 0,
        'max_vertices': 0,
        'lazy_subtour_elimination': not greedy_subtour,
        'type_coverage': type_coverage,
        'min_type_coverage': min_type_coverage,
        'min_avg_type_conservation': min_avg_type_conservation,
    }
    solver = TeamOrienteeringIlp(**solver_settings)

    numeric = (int, float)
    if isinstance(max_epitopes, numeric):
        solver.update_max_vertices(max_epitopes)
    if isinstance(max_aminoacids, numeric):
        solver.update_max_edge_cost(max_aminoacids)

    loaded = {
        'proteins': proteins,
        'alleles': alleles,
        'epitope_data': epitope_data,
    }
    return solver, loaded
def main(): parser = argparse.ArgumentParser(description="Reads protein or peptide sequences and predicts peptides "+ "for a specified prediction method and HLA alleles.") parser.add_argument("-i", "--input", nargs="+", required=True, help="Input data can be RefSeq ID, UniProt ID, fasta file, peptide file (one peptide per line)," +" or peptide sequences as sequences (max 50)" ) input_types = parser.add_mutually_exclusive_group(required=True) input_types.add_argument("-r","--refseq", action="store_true", help= "Specifies the input as RefSeq IDs") input_types.add_argument("-u","--uniprot", action="store_true", help= "Specifies the input as UniProt IDs") input_types.add_argument("-f","--fasta", action="store_true", help= "Specifies the input as protein (multi-)Fasta file") input_types.add_argument("-pf","--pepfile", action="store_true", help= "Specifies the input as peptide file") input_types.add_argument("-p","--peptide", action="store_true", help= "Specifies the input as peptide sequences") parser.add_argument("-a", "--alleles", nargs="+", required=True, help="Specifies for which alleles prediction should be made. " + "Input either can be alleles as string (new nomenclature), or a file with one allele per line.") allele_types = parser.add_mutually_exclusive_group(required=True) allele_types.add_argument("-af", "--allelefile", action="store_true", help="Specifies the allele input as allele file.") allele_types.add_argument("-as", "--allelestring", action="store_true", help="Specifies the allele input as allele string.") parser.add_argument("-m", "--method", required=True, nargs="+", help="Specifies the method used for prediction.") parser.add_argument("-l", "--length", required=False, type=int, default=9, help="Specifies the length of the peptides (default=9).") parser.add_argument("-o", "--output", required=True, help="Specifies the output path. 
Results will be written to CSV") parser.add_argument("-am", "--available", required=False, action="store_true", help="Returns all available methods and their allele models.") #COMMENT: These options are hidden and only used for ETK2 parser.add_argument("-html", "--html", required=False, action="store_true", help=argparse.SUPPRESS) parser.add_argument("-od", "--outdir", required=False, default="", help=argparse.SUPPRESS) args = parser.parse_args() if args.available: for pred, obj in AEpitopePrediction.registry.iteritems(): if pred not in ["AEpitopePrediction", "APSSMEpitopePredictor", "ANetMHC", "ASVMEpitopePrediction"]: print "Method: ",pred print "Supported Alleles: ", " ".join(getattr(obj, "_"+pred+"__alleles" )) print "Supported Length: ", " ".join(map(str, getattr(obj, "_"+pred+"__supported_length"))) print sys.exit(0) ''' Parser Input ''' #RefSeq if args.refseq: pass #UniProt elif args.uniprot: pass #fasta protein elif args.fasta: proteins = FileReader.read_fasta(args.input, type="Protein") peptides = generate_peptides_from_protein(proteins, args.length) elif args.pepfile: peptides = FileReader.read_lines(args.input, type="Peptide") elif args.peptide: peptides = [Peptide(s) for s in args.input] #read in alleles if args.allelefile: alleles = FileReader.read_lines(args.alleles, type="Allele") else: alleles = [Allele(a.upper()) for a in args.alleles] result = [EpitopePredictorFactory(m).predict(peptides, alleles) for m in args.method] r_df = result.pop() for r in result: r_df_a, r_a = r_df.align(r, fill_value=0) r_df = r_df_a + r_a output = args.output if args.outdir == "" else args.outdir + os.path.basename(args.output) with open(output, "w") as out: r_df.to_csv(out) #generate Galaxy HTML output if args.html: begin_html = """<?xml version="1.0" encoding="utf-8" ?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head> 
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <link rel="stylesheet" href="/static/style/blue/etk.css" type="text/css" /> <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.js"></script> <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.tablesorter.js"></script> <script type="text/javascript" src="/static/scripts/libs/etk.js"></script> </head> <body> <div class="document">""" setting = """ <h2 class="etk-heading">Epitope Prediction Results</h2> <table class="etk-parameterT"> <tr> <th class ="etk-innerHeading" colspan="2"> Parameters </th></tr> <tr> <th>Prediction Method:</th> <td>%s</td> </tr> </table>"""%args.method table=""" <input id="etk-search" placeholder=" filter"> <table class="etk-sortT etk-resultsT etk-filterT"> <thead> <tr> <th>Peptide</th>"""+"".join("<th>%s</th>"%str(a) for a in result.columns) \ +""" </tr> </thead>"""+"".join("<tr><td>%s<td>%s</tr>"%(r[0] ,"".join("<td align='right'>%s</td>"%str(result.loc[r, c]))) for r in result.index for c in result.columns)+"</table>" end_html = "</div></body></html>" html_out = ".".join(output.split(".")[:-1])+".html" with open(html_out, "w") as html_o: html_o.write(begin_html+setting+table+end_html)
def main(): parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads vaccines in optimal order and with sequence-optimized spacers of flexible length such that the recovery of contained epitopes is maximized and immunogenicity of arising neo-epitopes is reduced. """) parser.add_argument("-i", "--input", required=True, help="File containing epitopes (one peptide per line)" ) parser.add_argument("-a", "--alleles", required=True, help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)" ) #parameters of the model parser.add_argument("-k","--max_length", default=6, type=int, help="Specifies the max. length of the spacers (default 6)") parser.add_argument("-al","--alpha", default=0.99, type=float, help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)") parser.add_argument("-be","--beta", default=0.0, type=float, help="Specifies the second-order preference of the user in the model [0,1] (default 0).") parser.add_argument("-thr","--threshold", default=20, type=float, help="Specifies epitope prediction threshold for SYFPEITHI (default 20).") parser.add_argument("-o", "--output", required=True, help="Specifies the output file.") parser.add_argument("-t", "--threads", type=int, default=None, help="Specifies number of threads. 
If not specified all available logical cpus are used.") args = parser.parse_args() #parse input peptides = FileReader.read_lines(args.input, type="Peptide") #read in alleles alleles = generate_alleles(args.alleles) #set-up model cl_pred = CleavageSitePredictorFactory("PCM") epi_pred = EpitopePredictorFactory("Syfpeithi") thr = {a.name:args.threshold for a in alleles} solver = EpitopeAssemblyWithSpacer(peptides,cl_pred,epi_pred,alleles, k=args.max_length,en=9,threshold=thr, solver="cplex", alpha=args.alpha, beta=args.beta, verbosity=0) #solve #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n threads = mp.cpu_count() if args.threads is None else args.threads svbws = solver.approximate(threads=threads,options="preprocessing_presolve=n,threads=1") print print "Resulting String-of-Beads: ","-".join(map(str,svbws)) print with open(args.output, "w") as f: f.write("-".join(map(str,svbws)))
def __main__():
    '''
    Checks whether the requested tools support the given alleles and lengths.

    The tool versions are read from the tab separated versions file (first
    column: tool name, second column: version with a one-character prefix
    that is dropped). For every requested method, every allele and every
    peptide length, a line is written to "model_report.txt" whenever the
    combination is not supported.

    Peptide lengths are taken from the peptides file when --peptides is given,
    otherwise from the inclusive range --min_length .. --max_length.

    Raises ValueError if a requested tool version is not supported, if none of
    the alleles is supported by any tool, or if none of the peptide lengths is
    supported by any tool.
    '''
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for installed predictor tool versions."
    )
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    # the lengths are used as range() bounds and therefore must be integers
    parser.add_argument('-l', "--max_length", type=int, help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", type=int, help="Minimum peptide length")
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-t', '--tools',
                        help='Tools requested for peptide predictions',
                        required=True,
                        type=str)
    parser.add_argument('-v', '--versions',
                        help='<Required> File with used software versions.',
                        required=True)
    args = parser.parse_args()

    # without a peptides file the lengths are the only source of peptide lengths
    if not args.peptides and (args.min_length is None or args.max_length is None):
        parser.error("Provide either --peptides or both --min_length and --max_length.")

    selected_methods = args.tools.split(',')
    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0], str(row[1][1:]))
                        for row in csv.reader(versions_file, delimiter="\t")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))  # how to handle this?
        # get for each method the corresponding tool version
        # (a tool matches a method when its name is contained in the method name)
        methods = {
            method: version
            for tool, version in tool_version
            for method in selected_methods
            if tool.lower() in method.lower()
        }

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    if args.peptides:
        peptides = read_peptide_input(args.peptides)
        peptide_lengths = set(len(pep) for pep in peptides)
    else:
        peptide_lengths = range(args.min_length, args.max_length + 1)

    with open("model_report.txt", 'w') as output:
        # check if requested tool versions are supported
        for method, version in methods.items():
            if version not in EpitopePredictorFactory.available_methods()[method]:
                raise ValueError("The specified version " + version + " for " + method +
                                 " is not supported by Fred2.")

        # build every predictor once instead of once per allele and per length
        predictors = [
            (method, version, EpitopePredictorFactory(method, version=version))
            for method, version in methods.items()
        ]

        # check if requested alleles are supported
        support_all_alleles = True
        no_allele_support = True
        for a in alleles:
            supported = False
            for method, version, predictor in predictors:
                if a not in sorted(predictor.supportedAlleles):
                    output.write("Allele " + convert_allele_back(a) + " is not supported by " +
                                 method + " " + version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write("Allele " + convert_allele_back(a) +
                             " is not supported by any of the requested tools.\n")
                logger.warning("Allele " + convert_allele_back(a) +
                               " is not supported by any of the requested tools.")
                support_all_alleles = False
            else:
                no_allele_support = False

        if support_all_alleles:
            output.write(
                "All selected alleles are supported by at least one of the requested tools.\n"
            )
        if no_allele_support:
            output.write(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )

        output.write("\n")

        # check if requested lengths are supported
        support_all_lengths = True
        no_length_support = True
        for l in peptide_lengths:
            supported = False
            for method, version, predictor in predictors:
                if l not in sorted(predictor.supportedLength):
                    output.write("Peptide length " + str(l) + " is not supported by " +
                                 method + " " + version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write("Peptide length " + str(l) +
                             " is not supported by any of the requested tools.\n")
                logger.warning("Peptide length " + str(l) +
                               " is not supported by any of the requested tools.")
                support_all_lengths = False
            else:
                no_length_support = False

        if support_all_lengths:
            output.write(
                "All selected or provided peptide lengths are supported by at least one of the requested tools.\n"
            )
        if no_length_support:
            output.write(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )
def extract_peptides(input_sequences, max_edits, output_peptides, top_n):
    '''
    Extract peptides from the given sequences and computes protein coverage for each peptide.
    Coverage can be computed allowing for inexact matching.

    In other words, it first generates all peptides (of nine aminoacids) that appear
    in the input proteins, and stores which proteins each peptide appears in. Then,
    for every peptide, it finds all peptides that can be obtained by changing at
    most max-edits aminoacids, and counts the proteins that contain the edited peptides.

    The result is written to `output_peptides` as csv with the columns
    'peptide' and 'proteins', the latter being the indices of the covered
    proteins joined by ';'. If `top_n` is positive, only the `top_n` peptides
    covering the most proteins are written, otherwise all peptides are.
    '''
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_sequences, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    LOGGER.info('Extracting protein coverage for each peptide...')
    all_peptides = utilities.Trie()
    proteins_by_peptide = {}
    for i, prot in enumerate(proteins):
        # remove non-aminoacids from alignment
        aminoacids = ''.join(c for c in prot._data if c.isalpha())
        # make sure we only count peptides once per protein
        peptides_in_this_protein = set()
        for j in range(len(aminoacids) - 8):
            seq = str(aminoacids[j:j + 9])
            if seq not in peptides_in_this_protein:
                peptides_in_this_protein.add(seq)
                all_peptides.insert(seq)
                if seq not in proteins_by_peptide:
                    proteins_by_peptide[seq] = set()
                proteins_by_peptide[seq].add(i)

        if utilities.is_percent_barrier(i, len(proteins), 5):
            LOGGER.debug(
                '%d proteins analyzed (%.2f%%) and %d peptides extracted...',
                i + 1, 100.0 * (i + 1) / len(proteins), len(proteins_by_peptide))

    LOGGER.info('Computing reachability...')
    # min-heap of (coverage, peptide, covered proteins) keeping the top_n best
    top_peptides = []
    with open(output_peptides, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('peptide', 'proteins'))

        for i, peptide in enumerate(proteins_by_peptide):
            # find reachable peptides and which proteins they belong to
            reachable_proteins = set()
            for reachable, edits in all_peptides.reachable_strings(peptide, max_edits):
                reachable_proteins.update(proteins_by_peptide[reachable])

            # now either update the top N or save reachability to file
            if top_n > 0:
                heapq.heappush(top_peptides,
                               (len(reachable_proteins), peptide, reachable_proteins))
                if len(top_peptides) > top_n:
                    # drop the peptide with the smallest coverage
                    heapq.heappop(top_peptides)
            else:
                writer.writerow((peptide, ';'.join(map(str, reachable_proteins))))

            if utilities.is_percent_barrier(i, len(proteins_by_peptide), 2.5):
                LOGGER.debug('%d peptides analyzed (%.2f%%)...', i + 1,
                             100.0 * (i + 1) / len(proteins_by_peptide))

        # save the top N to file
        if top_n > 0:
            LOGGER.info('Saving top peptides to file')
            for _, peptide, covered_proteins in top_peptides:
                # Write the proteins stored with this heap entry (not the set
                # left over from the last loop iteration), using the same ';'
                # separator as the branch that writes every peptide.
                writer.writerow((peptide, ';'.join(map(str, covered_proteins))))
def __main__():
    '''
    Command line entry point of EPAA (Epitope Prediction And Annotation).

    Reads either peptides or somatic variants, predicts epitopes for the given
    alleles, optionally annotates the predictions with wild type sequences,
    protein quantification, (differential) gene expression and ligandomics
    results, and writes
      * "<identifier>_prediction_results.tsv" with all predictions,
      * "<identifier>_report.json" with summary statistics,
      * "<identifier>_prediction.log" with the log messages.
    When --output_dir is given the process changes into that directory first.
    '''
    parser = argparse.ArgumentParser(
        description=(
            "EPAA - Epitope Prediction And Annotation \n"
            " Pipeline for prediction of MHC class I and II epitopes from variants or peptides"
            " for a list of specified alleles. Additionally predicted epitopes can be annotated"
            " with protein quantification values for the corresponding proteins, identified"
            " ligands, or differential expression values for the corresponding transcripts."
        ),
        version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default="I", help="MHC class I or II")
    parser.add_argument('-l', "--length", help="Maximum peptide length")
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument(
        '-r', "--reference",
        help="Reference, retrieved information will be based on this ensembl version",
        required=False,
        default='GRCh37',
        choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self",
                        help="Filter peptides against human proteom",
                        required=False,
                        action='store_true')
    parser.add_argument(
        '-wt', "--wild_type",
        help="Add wild type sequences of mutated peptides to output",
        required=False,
        action='store_true')
    parser.add_argument('-rp', "--reference_proteome",
                        help="Reference proteome for self-filtering",
                        required=False)
    parser.add_argument('-gr', "--gene_reference",
                        help="List of gene IDs for ID mapping.",
                        required=False)
    parser.add_argument('-pq', "--protein_quantification",
                        help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression",
                        help="File with expression analysis results")
    parser.add_argument(
        '-de', "--diff_gene_expression",
        help="File with differential expression analysis results (DESeq2)")
    parser.add_argument(
        '-li', "--ligandomics_id",
        help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run."
    )
    parser.add_argument('-o', "--output_dir",
                        help="All files written will be put in this directory")

    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    log_name = '{}_prediction.log'.format(args.identifier)
    if args.output_dir is not None:
        try:
            # Resolve the directory before changing into it; joining a relative
            # output_dir again after chdir would point to a nested directory.
            output_dir = os.path.abspath(args.output_dir)
            os.chdir(output_dir)
            logging.basicConfig(filename=os.path.join(output_dir, log_name),
                                filemode='w+',
                                level=logging.DEBUG)
            logging.info("Using provided data directory: {}".format(
                str(args.output_dir)))
        except Exception:
            # best effort: keep working in the current directory
            logging.info("No such directory, using current.")
    else:
        logging.basicConfig(filename=log_name, filemode='w+', level=logging.DEBUG)
        logging.info("Using current data directory.")

    logging.info("Starting predictions at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }
    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith('.GSvar') or args.somatic_mutations.endswith('.tsv'):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(
            ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logging.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith(".fasta") or filename.endswith(".fsa"):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    # MHC class I or II predictions; the fourth argument of the variant based
    # prediction differs between the classes (8 vs 15)
    if args.mhcclass == "I":
        #methods = ['netmhc-4.0', 'syfpeithi-1.0', 'netmhcpan-3.0']
        methods = ['syfpeithi-1.0']
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(
                peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(
                vl, methods, alleles, 8, int(args.length) + 1, ma, up_db,
                args.identifier, metadata, transcriptProteinMap)
    else:
        methods = ['netmhcII-2.2', 'syfpeithi-1.0', 'netmhcIIpan-3.1']
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(
                peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(
                vl, methods, alleles, 15, int(args.length) + 1, ma, up_db,
                args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except Exception:
        complete_df = pd.DataFrame()
        logging.error("No predictions available.")

    # store version of used methods
    method_map = {}
    for m in methods:
        method_map[m.split('-')[0]] = m

    # replace method names with method names with version
    complete_df.replace({'method': method_map}, inplace=True)

    # include wild type sequences to dataframe if specified
    # NOTE(review): all_peptides_filtered is only assigned on the variant path;
    # --wild_type together with --peptides looks unsupported - confirm.
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(
            lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = [
            'sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene',
            'transcripts', 'proteins', 'variant type', 'method'
        ]
    else:
        columns_tiles = [
            'sequence', 'length', 'chr', 'pos', 'gene', 'transcripts',
            'proteins', 'variant type', 'method'
        ]
    # Change the order (the index) of the columns: the fixed columns first,
    # followed by all remaining ones in their current order
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []

    for i, r in complete_df.iterrows():
        binder = False
        for c in binder_cols:
            if r[c] is True:
                binder = True
                # one positive column is enough
                break
        if binder:
            binders.append(str(r['sequence']))
            pos_predictions.append(str(r['sequence']))
        else:
            neg_predictions.append(str(r['sequence']))
            if str(r['sequence']) not in binders:
                non_binders.append(str(r['sequence']))

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = protein_quant[protein_quant.keys()[0]]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(
                lambda row: create_quant_column_value_for_result(
                    row, protein_quant, transcriptSwissProtMap, k),
                axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                # the id column used as key depends on the kind of gene id in the results
                gene_id_in_df = complete_df.iloc[1]['gene']
                if 'ENSG' in gene_id_in_df:
                    gene_id_lengths[ids[0]] = float(ids[2].strip())
                else:
                    gene_id_lengths[ids[1]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 0, False),
            axis=1)
        complete_df['ligand intensity'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 1, False),
            axis=1)

        # --wild_type is a store_true flag and therefore never None; test its
        # value so the wild type columns are only added when requested
        if args.wild_type:
            complete_df['wt ligand score'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 0, True),
                axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 1, True),
                axis=1)

    # write dataframe to tsv; fillna returns a new frame, so keep the result
    complete_df = complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier),
                       '\t',
                       index=False)

    statistics['number_of_predictions'] = complete_df.shape[0]
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    # despite their names, the two "unique" entries hold the sequences themselves
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(
        set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
def main(): parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads vaccines in optimal order and with sequence-optimized spacers of flexible length such that the recovery of contained epitopes is maximized and immunogenicity of arising neo-epitopes is reduced. """) parser.add_argument("-i", "--input", required=True, help="File containing epitopes (one peptide per line)" ) parser.add_argument("-a", "--alleles", required=True, help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)" ) #parameters of the model parser.add_argument("-k","--max_length", default=6, type=int, help="Specifies the max. length of the spacers (default 6)") parser.add_argument("-al","--alpha", default=0.99, type=float, help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)") parser.add_argument("-be","--beta", default=0.0, type=float, help="Specifies the second-order preference of the user in the model [0,1] (default 0).") parser.add_argument("-cp","--cleavage_prediction", default="PCM", help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]" ) parser.add_argument("-ep","--epitope_prediction", default="Syfpeithi", help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]" ) parser.add_argument("-thr","--threshold", default=20, type=float, help="Specifies epitope prediction threshold for SYFPEITHI (default 20).") parser.add_argument("-o", "--output", required=True, help="Specifies the output file.") parser.add_argument("-t", "--threads", type=int, default=None, help="Specifies number of threads. 
If not specified all available logical cpus are used.") args = parser.parse_args() #parse input peptides = list(FileReader.read_lines(args.input, in_type=Peptide)) #read in alleles alleles = generate_alleles(args.alleles) if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_S"]: print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S" sys.exit(-1) if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]: print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC" sys.exit(-1) #set-up model cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction) epi_pred = EpitopePredictorFactory(args.epitope_prediction) thr = {a.name:args.threshold for a in alleles} solver = EpitopeAssemblyWithSpacer(peptides,cl_pred,epi_pred,alleles, k=args.max_length,en=9,threshold=thr, solver="cplex", alpha=args.alpha, beta=args.beta, verbosity=0) #solve #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n threads = mp.cpu_count() if args.threads is None else args.threads svbws = solver.approximate(threads=threads,options={"preprocessing_presolve":"n","threads":1}) print print "Resulting String-of-Beads: ","-".join(map(str,svbws)) print with open(args.output, "w") as f: f.write("-".join(map(str,svbws)))
def string_of_beads(input_proteins, input_alleles, input_epitopes, input_cleavages, output_vaccine,
                    cocktail, greedy_subtour, max_aminoacids, max_epitopes, min_alleles, min_proteins,
                    min_avg_prot_conservation, min_avg_alle_conservation):
    '''
    Designs a string-of-beads vaccine by solving a team orienteering ILP
    over the epitopes that have cleavage scores, then writes the chosen
    epitopes to a CSV file.

    Parameters:
      input_proteins: fasta file read with FileReader.read_fasta; only the
        number of proteins is used here.
      input_alleles: passed to utilities.get_alleles_and_thresholds; only
        the number of alleles in its index is used here.
      input_epitopes: passed to utilities.load_epitopes; the result is
        indexed by epitope string and each entry must provide the
        'epitope' and 'immunogen' keys.
      input_cleavages: CSV file with the columns 'from', 'to' and 'score'.
      output_vaccine: path of the CSV file to write, with the columns
        'cocktail', 'index' and 'epitope'.
      cocktail: forwarded to the solver as num_teams.
      greedy_subtour: when false, the solver is asked to use lazy subtour
        elimination.
      max_aminoacids: forwarded to the solver as max_edge_cost.
      max_epitopes: forwarded to the solver as max_vertices.
      min_alleles, min_proteins, min_avg_prot_conservation,
      min_avg_alle_conservation: forwarded to
        utilities.compute_coverage_matrix to derive coverage constraints.

    Raises KeyError if an epitope in the cleavage file is missing from the
    loaded epitopes, or if the score for a pair of such epitopes is missing.
    '''
    program_start_time = time.time()

    # load proteins
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    # load alleles
    alleles = [Allele(a) for a in utilities.get_alleles_and_thresholds(input_alleles).index]
    LOGGER.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitopes = utilities.load_epitopes(input_epitopes)
    LOGGER.info('Loaded %d epitopes', len(epitopes))

    # read cleavage scores
    cleavages = {}
    cleavage_epitopes = set()
    with open(input_cleavages) as f:
        for row in csv.DictReader(f):
            cleavages[(row['from'], row['to'])] = float(row['score'])
            cleavage_epitopes.add(row['from'])
            cleavage_epitopes.add(row['to'])
    LOGGER.info('Loaded %d cleavage scores', len(cleavages))

    # compute edge cost
    # Vertex 0 is the empty string, an artificial vertex with zero reward and
    # zero-cost edges. The remaining vertices are sorted because set iteration
    # order over strings can change between interpreter runs, which would
    # renumber the vertices and make the model (and possibly the selected
    # design) non-reproducible.
    vertex_to_epitope = [''] + sorted(cleavage_epitopes)
    edge_cost, vertices_rewards = [], []
    for ep_from in vertex_to_epitope:
        vertices_rewards.append(0 if ep_from == '' else epitopes[ep_from]['immunogen'])
        edge_cost.append([
            cleavages[(ep_from, ep_to)] if ep_from != '' and ep_to != '' else 0.0
            for ep_to in vertex_to_epitope
        ])
    LOGGER.info('Kept %d epitopes with available clevages', len(vertex_to_epitope) - 1)

    type_coverage, min_type_coverage, min_avg_type_conservation = utilities.compute_coverage_matrix(
        [epitopes[e] for e in vertex_to_epitope[1:]],
        min_alleles, min_proteins,
        min_avg_prot_conservation, min_avg_alle_conservation,
        len(proteins), len(alleles))

    # find optimal design
    solver_build_time = time.time()
    solver = TeamOrienteeringIlp(
        num_teams=cocktail,
        vertex_reward=vertices_rewards,
        edge_cost=edge_cost,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
        max_edge_cost=max_aminoacids,
        max_vertices=max_epitopes,
        lazy_subtour_elimination=not greedy_subtour)
    solver.build_model()
    solver_start_time = time.time()
    result = solver.solve()
    solver_end_time = time.time()

    # print info and save
    with open(output_vaccine, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('cocktail', 'index', 'epitope'))
        for i, mosaic in enumerate(result):
            LOGGER.info('Mosaic #%d', i + 1)
            # Each mosaic entry is unpacked as a pair whose second item is a
            # vertex index. The last entry is skipped - presumably it is the
            # edge leading back to the artificial vertex 0; confirm against
            # TeamOrienteeringIlp.solve.
            for j, (_, vertex) in enumerate(mosaic[:-1]):
                epitope = epitopes[vertex_to_epitope[vertex]]
                writer.writerow((i, j, epitope['epitope']))
                LOGGER.info('    %s - IG: %.2f', epitope['epitope'], epitope['immunogen'])

    # Timings end when the solver returns, so writing the output file is not
    # included in any of the figures below.
    LOGGER.info('==== Stopwatch')
    LOGGER.info('          Total time : %.2f s', solver_end_time - program_start_time)
    LOGGER.info('      Pre-processing : %.2f s', solver_build_time - program_start_time)
    LOGGER.info(' Model creation time : %.2f s', solver_start_time - solver_build_time)
    LOGGER.info('        Solving time : %.2f s', solver_end_time - solver_start_time)