def test_single_allele_input(self):
    for m in EpitopePredictorFactory.available_methods():
        for v in EpitopePredictorFactory.available_methods()[m]:
            mo = EpitopePredictorFactory(m, version=v)
            if isinstance(mo, AExternalEpitopePrediction) and not (mo.version == "0.1" and mo.name == "netmhc"):
                print "Testing", mo.name, "version", mo.version
                try:
                    if any(a.name in mo.supportedAlleles for a in self.mhcII):
                        mo.predict(self.peptides_mhcII, alleles=self.mhcII[0])
                    if any(a.name in mo.supportedAlleles for a in self.mhcII_combined_alleles):
                        mo.predict(self.peptides_mhcII, alleles=self.mhcII_combined_alleles[0])
                    if any(a.name in mo.supportedAlleles for a in self.mhcI):
                        mo.predict(self.peptides_mhcI, alleles=self.mhcI[0])
                    print "Success"
                except RuntimeError as e:  # catch only those stemming from binary unavailability
                    if "could not be found in PATH" not in e.message:
                        raise e  # all others do not except
            else:
                print mo.name, "not available"
def test_single_allele_input(self):
    for m in EpitopePredictorFactory.available_methods():
        for v in EpitopePredictorFactory.available_methods()[m]:
            mo = EpitopePredictorFactory(m, version=v)
            if isinstance(mo, AExternalEpitopePrediction) and not (mo.version == "0.1" and mo.name == "netmhc"):
                if any(a.name in mo.supportedAlleles for a in self.mhcII):
                    mo.predict(self.peptides_mhcII, alleles=self.mhcII[0])
                else:
                    mo.predict(self.peptides_mhcI, alleles=self.mhcI[0])
def test_single_peptide_input_mhcII(self):
    for m in EpitopePredictorFactory.available_methods():
        model = EpitopePredictorFactory(m)
        if not isinstance(model, AExternalEpitopePrediction):
            if all(a.name in model.supportedAlleles for a in self.mhcII):
                res = model.predict(self.peptides_mhcII[0], alleles=self.mhcII[1])
def est_multiple_peptide_input_mhcI(self):
    for m in EpitopePredictorFactory.available_methods():
        model = EpitopePredictorFactory(m)
        if all(a.name in model.supportedAlleles for a in self.mhcI):
            res = model.predict(self.peptides_mhcI, alleles=self.mhcI)
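The test methods above rely on fixtures (self.peptides_mhcI, self.peptides_mhcII, self.mhcI, self.mhcII, self.mhcII_combined_alleles) created in the test case's setUp, which is not shown here. A minimal sketch of what such a setUp might look like, assuming Fred2's Peptide, Allele and CombinedAllele classes; the sequences, allele names and the combined-allele string format are illustrative guesses, not the actual fixtures:

from Fred2.Core import Allele, Peptide
from Fred2.Core.Allele import CombinedAllele


def setUp(self):
    # Illustrative fixtures only; the real test suite may use different values.
    self.peptides_mhcI = [Peptide("SYFPEITHI"), Peptide("IHTIEPFYS")]
    self.peptides_mhcII = [Peptide("AAAAAASYFPEITHIAAAAAA"), Peptide("IHTIEPFYSAAAAAA")]
    self.mhcI = [Allele("HLA-B*15:01"), Allele("HLA-A*02:01")]
    self.mhcII = [Allele("HLA-DRB1*07:01"), Allele("HLA-DRB1*15:01")]
    # Combined alpha/beta chain MHC-II allele; exact constructor string format assumed.
    self.mhcII_combined_alleles = [CombinedAllele("HLA-DPA1*01:03-DPB1*01:01")]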
def valid_predictors(supported_length=9, exclude_predictors=["epidemix", "unitope", "netctlpan"]):
    """
    Get the information for all predictors and keep only the relevant ones.

    Args:
      supported_length (int): Supported peptide input length.
      exclude_predictors (list of str): List of method names to exclude in addition.
    """
    methods = EpitopePredictorFactory.available_methods().keys()
    dt = pd.DataFrame([predictor_info(method) for method in methods])
    n_init = len(dt)
    dt = dt[[supported_length in elems for elems in dt["supportedLength"]]]
    dt = dt[dt["type"].notnull()]  # we should know where it was trained
    dt = dt[dt["is_in_path"].isnull() | dt["is_in_path"]]
    for excl_predictor in exclude_predictors:
        dt = dt[dt["name"] != excl_predictor]
    print("removed {0} methods from Fred2. {1} remain".format(n_init - len(dt), len(dt)))
    return dt
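A sketch of how valid_predictors might be called; predictor_info is assumed to be a helper in the same module that returns, per method, a record containing at least the name, type, supportedLength and is_in_path fields filtered on above:

# Hypothetical usage of valid_predictors; the columns printed follow the fields used in the function.
dt = valid_predictors(supported_length=9, exclude_predictors=["epidemix", "unitope", "netctlpan"])
print(dt[["name", "type", "supportedLength"]].head())
runnable = list(dt["name"])  # method names that passed the length/type/PATH filters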
def __main__():
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for available prediction tool versions.")
    parser.add_argument('-v', '--versions', help='File with used software versions.', required=True)
    args = parser.parse_args()

    # NOTE this needs to be updated manually, if other methods should be used in the future
    available_methods = ['syfpeithi', 'mhcflurry', 'mhcnuggets-class-1', 'mhcnuggets-class-2']

    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0].split()[0], str(row[1])) for row in csv.reader(versions_file, delimiter=":")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each method the corresponding tool version
        methods = {method.strip(): version.strip() for tool, version in tool_version for method in available_methods if tool.lower() in method.lower()}

    for method, version in methods.items():
        if version not in EpitopePredictorFactory.available_methods()[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

        predictor = EpitopePredictorFactory(method, version=version)

        with open(method + ".v" + str(version) + ".supported_alleles.txt", 'w') as output:
            for a in sorted(predictor.supportedAlleles):
                output.write(convert_allele_back(a) + "\n")
        with open(method + ".v" + str(version) + ".supported_lengths.txt", 'w') as output:
            for l in sorted(predictor.supportedLength):
                output.write(str(l) + "\n")
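For orientation, available_methods() maps each method name to the list of versions Fred2 knows about, and an instantiated predictor exposes supportedAlleles and supportedLength, which the script above writes out. A small sketch of inspecting these directly, assuming the usual Fred2 import path and using syfpeithi 1.0 as in the snippet above:

from Fred2.EpitopePrediction import EpitopePredictorFactory

methods = EpitopePredictorFactory.available_methods()  # e.g. {'syfpeithi': ['1.0'], ...}
for name, versions in methods.items():
    print("{}: {}".format(name, versions))

predictor = EpitopePredictorFactory("syfpeithi", version="1.0")
print(sorted(predictor.supportedLength))   # peptide lengths the model accepts
print(len(predictor.supportedAlleles))     # number of allele names it can score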
def main():
    model = argparse.ArgumentParser(description='Neoepitope prediction for TargetInspector.')
    model.add_argument('-m', '--method', type=str,
                       choices=EpitopePredictorFactory.available_methods().keys(),
                       default="bimas", help='The name of the prediction method')
    model.add_argument('-v', '--vcf', type=str, default=None, help='Path to the vcf input file')
    model.add_argument('-t', '--type', type=str, choices=["VEP", "ANNOVAR", "SNPEFF"], default="VEP",
                       help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)')
    model.add_argument('-p', '--proteins', type=str, default=None,
                       help='Path to the protein ID input file (in HGNC-ID)')
    model.add_argument('-l', '--length', choices=range(8, 18), type=int, default=9,
                       help='The length of peptides')
    model.add_argument('-a', '--alleles', type=str, required=True,
                       help='Path to the allele file (one per line in new nomenclature)')
    model.add_argument('-r', '--reference', type=str, default='GRCh38',
                       help='The reference genome used for variant annotation and calling.')
    model.add_argument('-fINDEL', '--filterINDEL', action="store_true",
                       help='Filter insertions and deletions (including frameshifts)')
    model.add_argument('-fFS', '--filterFSINDEL', action="store_true", help='Filter frameshift INDELs')
    model.add_argument('-fSNP', '--filterSNP', action="store_true", help='Filter SNPs')
    model.add_argument('-o', '--output', type=str, required=True, help='Path to the output file')
    model.add_argument('-etk', '--etk', action="store_true", help=argparse.SUPPRESS)

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    transcript_to_genes = {}

    if args.vcf is None and args.proteins is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if a vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for l in f:
                    l = l.strip()
                    if l != "":
                        protein_ids.append(l)

        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        variants = filter(lambda x: x.type != VariationType.UNKNOWN, variants)

        if args.filterSNP:
            variants = filter(lambda x: x.type != VariationType.SNP, variants)

        if args.filterINDEL:
            variants = filter(lambda x: x.type not in [VariationType.INS,
                                                       VariationType.DEL,
                                                       VariationType.FSDEL,
                                                       VariationType.FSINS], variants)

        if args.filterFSINDEL:
            variants = filter(lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], variants)

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        epitopes = filter(lambda x: any(x.get_variants_by_protein(tid) for tid in x.proteins.iterkeys()),
                          generate_peptides_from_variants(variants, int(args.length), martDB,
                                                          EIdentifierTypes.ENSEMBL))

        for v in variants:
            for trans_id, coding in v.coding.iteritems():
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for l in f:
                ensembl_ids = martDB.get_ensembl_ids_from_id(l.strip(), type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_to_genes[ensembl_ids[EAdapterFields.TRANSID]] = l.strip()
                    proteins.append(Protein(protein_seq, gene_id=l.strip(),
                                            transcript_id=ensembl_ids[EAdapterFields.TRANSID]))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles)

    with open(args.output, "w") as f:
        alleles = result.columns
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) + "\tAntigen ID\t" + var_column + "\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = ",".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                     for prot in p.get_all_proteins()]))
            vars_str = ""

            if args.vcf is not None:
                vars_str = "\t" + "|".join(set(prot_id.split(":FRED2")[0] + ":" + ",".join(
                    repr(v) for v in set(p.get_variants_by_protein(prot_id)))
                    for prot_id in p.proteins.iterkeys()
                    if p.get_variants_by_protein(prot_id)))

            f.write(str(p) + "\t" + method + "\t" + "\t".join("%.3f" % row[a] for a in alleles)
                    + "\t" + proteins + vars_str + "\n")

    if args.etk:
        with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t" + "\t".join(a.name for a in alleles) + "\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                         for prot in p.get_all_proteins()]))
                g.write(str(p) + "\t" + "\t".join("%.3f" % row[a] for a in alleles) + "\t" + proteins + "\n")
    return 0
def __main__():
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for installed predictor tool versions.")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length", type=int)
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length", type=int)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True, type=str)
    parser.add_argument('-t', '--tools', help='Tools requested for peptide predictions', required=True, type=str)
    parser.add_argument('-v', '--versions', help='<Required> File with used software versions.', required=True)
    args = parser.parse_args()

    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0].split()[0], str(row[1])) for row in csv.reader(versions_file, delimiter=":")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))  # how to handle this?
        # get for each method the corresponding tool version
        methods = {method.strip(): version.strip() for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower()}

    # get the alleles
    alleles = [Allele(a) for a in args.alleles.split(";")]

    peptide_lengths = []
    if args.peptides:
        peptides = read_peptide_input(args.peptides)
        peptide_lengths = set([len(pep) for pep in peptides])
    else:
        peptide_lengths = range(args.min_length, args.max_length + 1)

    with open("model_report.txt", 'w') as output:
        # check if requested tool versions are supported
        for method, version in methods.items():
            if version not in EpitopePredictorFactory.available_methods()[method.lower()]:
                raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

        # check if requested alleles are supported
        support_all_alleles = True
        no_allele_support = True
        for a in alleles:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)
                if a not in sorted(predictor.supportedAlleles):
                    output.write("Allele " + convert_allele_back(a) + " is not supported by " + method + " " + version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write("Allele " + convert_allele_back(a) + " is not supported by any of the requested tools.\n")
                logger.warning("Allele " + convert_allele_back(a) + " is not supported by any of the requested tools.")
                support_all_alleles = False
            else:
                no_allele_support = False

        if support_all_alleles:
            output.write("All selected alleles are supported by at least one of the requested tools.\n")
        if no_allele_support:
            output.write("None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n")
            raise ValueError("None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.")

        output.write("\n")

        # check if requested lengths are supported
        support_all_lengths = True
        no_length_support = True
        for l in peptide_lengths:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)
                if l not in sorted(predictor.supportedLength):
                    output.write("Peptide length " + str(l) + " is not supported by " + method + " " + version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write("Peptide length " + str(l) + " is not supported by any of the requested tools.\n")
                logger.warning("Peptide length " + str(l) + " is not supported by any of the requested tools.")
                support_all_lengths = False
            else:
                no_length_support = False

        if support_all_lengths:
            output.write("All selected or provided peptide lengths are supported by at least one of the requested tools.\n")
        if no_length_support:
            output.write("None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n")
            raise ValueError("None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.")
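Both version-checking scripts above read the --versions file with csv.reader(..., delimiter=":") and take row[0].split()[0] as the tool name. A short sketch of the file layout this parsing implies; the tool names and version numbers shown are placeholders, not required values:

# Hypothetical contents of a versions file parsed with delimiter=":".
versions_txt = """MHCFLURRY: 1.4.3
MHCNUGGETS: 2.3.2
"""

tool_version = []
for line in versions_txt.splitlines():
    tool, version = line.split(":")
    tool_version.append((tool.split()[0], version.strip()))
print(tool_version)  # [('MHCFLURRY', '1.4.3'), ('MHCNUGGETS', '2.3.2')]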
def __main__():
    parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles.
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""", version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str)
    parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteome", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")
    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'}
    global transcriptProteinMap
    global transcriptSwissProtMap

    '''read in variants or peptides'''
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith('.GSvar') or args.somatic_mutations.endswith('.tsv'):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts, metadata = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')
        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith(".fasta") or filename.endswith(".fsa"):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0], str(row[1][1:])) for row in csv.reader(versions_file, delimiter="\t")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each selected method the corresponding tool version
        methods = {method: version for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower()}

    for method, version in methods.items():
        if version not in EpitopePredictorFactory.available_methods()[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

    # MHC class I or II predictions
    if int(args.mhcclass) == 1:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
    else:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except:
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")

    # replace method names with method names with version
    # complete_df.replace({'method': methods}, inplace=True)
    complete_df['method'] = complete_df['method'].apply(lambda x: x + '-' + methods[x])

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    # change the order (the index) of the columns
    else:
        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []
    for i, r in complete_df.iterrows():
        binder = False
        for c in binder_cols:
            if r[c] is True:
                binder = True
                continue
        if binder:
            binders.append(str(r['sequence']))
            pos_predictions.append(str(r['sequence']))
        else:
            neg_predictions.append(str(r['sequence']))
            if str(r['sequence']) not in binders:
                non_binders.append(str(r['sequence']))

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = protein_quant[protein_quant.keys()[0]]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'
        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                gene_id_in_df = complete_df.iloc[1]['gene']
                if 'ENSG' in gene_id_in_df:
                    gene_id_lengths[ids[0]] = float(ids[2].strip())
                else:
                    gene_id_lengths[ids[1]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1)

        if args.wild_type:
            complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
            for p in proteins:
                variants = []
                for v in p.vars:
                    variants = variants + p.vars[v]
                c = [x.coding.values() for x in variants]
                cf = list(itertools.chain.from_iterable(c))
                cds = ','.join([y.cdsMutationSyntax for y in set(cf)])
                aas = ','.join([y.aaMutationSyntax for y in set(cf)])
                protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv
    complete_df = complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), '\t', index=False)

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
def main():
    # Specify CTD interface
    # Every CTD Model has to have at least a name and a version, plus any of the optional attributes below them.
    model = argparse.ArgumentParser(description='Process some integers.')
    model.add_argument('-m', '--method',
                       type=str,
                       choices=EpitopePredictorFactory.available_methods().keys(),
                       default="bimas",
                       help='The name of the prediction method')
    model.add_argument('-v', '--version',
                       type=str,
                       default="",
                       help='The version of the prediction method')
    model.add_argument('-i', '--input',
                       type=str,
                       required=True,
                       help='Path to the input file')
    model.add_argument('-t', '--type',
                       choices=["fasta", "peptide"],
                       type=str,
                       default="fasta",
                       help='The data type of the input (fasta, peptide list)')
    model.add_argument('-l', '--length',
                       choices=range(8, 18),
                       type=int,
                       default=9,
                       help='The length of peptides')
    model.add_argument('-a', '--alleles',
                       type=str,
                       required=True,
                       help='Path to the allele file (one per line in new nomenclature)')
    model.add_argument('-op', '--options',
                       type=str,
                       default="",
                       help="Additional options that get directly passed to the tool")
    model.add_argument('-o', '--output',
                       type=str,
                       required=True,
                       help='Path to the output file')
    args = model.parse_args()

    # fasta protein
    if args.type == "fasta":
        with open(args.input, 'r') as f:
            first_line = f.readline()
        sep_pos = 1 if first_line.count("|") else 0
        proteins = read_fasta(args.input, in_type=Protein, id_position=sep_pos)
        peptides = generate_peptides_from_proteins(proteins, args.length)
    elif args.type == "peptide":
        peptides = read_lines(args.input, in_type=Peptide)
    else:
        sys.stderr.write('Input type not known\n')
        return -1

    # read in alleles
    alleles = read_lines(args.alleles, in_type=Allele)

    if args.version == "":
        result = EpitopePredictorFactory(args.method).predict(peptides, alleles, options=args.options)
    else:
        result = EpitopePredictorFactory(args.method, version=args.version).predict(peptides, alleles, options=args.options)

    # write to TSV: columns are sequence, method, allele scores..., protein-id/transcript-id
    with open(args.output, "w") as f:
        proteins = "\tAntigen ID" if args.type == "fasta" else ""
        alleles = result.columns
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) + proteins + "\n")
        for index, row in result.iterrows():
            p = index[0]
            method = index[1]
            proteins = "\t" + ",".join(prot.transcript_id for prot in p.get_all_proteins()) if args.type == "fasta" else ""
            f.write(str(p) + "\t" + method + "\t" + "\t".join("%.3f" % row[a] for a in alleles) + proteins + "\n")
    return 0
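Stripped of argument parsing and file I/O, the prediction core shared by the scripts above is only a few lines of Fred2. A minimal sketch, assuming Fred2 is installed; the method name, peptides and alleles below are chosen purely for illustration:

from Fred2.Core import Allele, Peptide
from Fred2.EpitopePrediction import EpitopePredictorFactory

peptides = [Peptide("SYFPEITHI"), Peptide("FIASNGVKL")]   # illustrative 9-mers
alleles = [Allele("HLA-A*02:01"), Allele("HLA-B*15:01")]  # illustrative alleles

result = EpitopePredictorFactory("syfpeithi").predict(peptides, alleles=alleles)

# The result is indexed by (peptide, method) with one column per allele,
# which is what the TSV-writing loops above iterate over.
for (peptide, method), row in result.iterrows():
    scores = "\t".join("%.3f" % row[a] for a in result.columns)
    print("{}\t{}\t{}".format(peptide, method, scores))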
def test_epitope_prediction_available_methods(self):
    print EpitopePredictorFactory.available_methods()
pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 200)

parser = argparse.ArgumentParser(description='Call epitope predictors on data.')
requiredNamed = parser.add_argument_group('required arguments')
requiredNamed.add_argument('--predictor', type=str,
                           help='Epitope predictors [see all with --predictor=list]', required=True)
requiredNamed.add_argument('--dataset', type=str,
                           help='Immunogenic dataset [see all with --dataset=list]', required=True)
parser.add_argument('-n', type=int, help='Number of rows to take from dataset')
parser.add_argument('--allele', type=str, help='HLA Type',
                    default=["HLA-A*01:01", "HLA-A*02:01", "HLA-B*15:01"])
args = parser.parse_args()

all_predictors = [name for name, version in EpitopePredictorFactory.available_methods().iteritems()]
all_predictors.remove("netmhcstabpan")
all_predictors.remove("netmhc")

if args.predictor == 'list':
    print("Set one of the predictors with --predictor:")
    print(all_predictors)
    print("""
    Details from https://bioinformatics.oxfordjournals.org/content/suppl/2016/02/26/btw113.DC1/S1.pdf

    SYFPEITHI    T-cell epitope (Rammensee, et al., 1999)
    BIMAS        MHC-I binding  (Parker, et al., 1994)
    SVMHC        MHC-I binding  (Dönnes and Elofsson, 2002)
    ARB          MHC-I binding  (Bui, et al., 2005)
    SMM          MHC-I binding  (Peters and Sette, 2005)
    SMMPMBEC     MHC-I binding  (Kim, et al., 2009)