def test_peptides_from_varaints(self):
    """Smoke-test peptide generation for a SNP plus an insertion on NM_080751.

    Peptides are generated twice: directly from the variants, and step by
    step via transcripts -> proteins -> peptides. The test only prints the
    sizes of the intermediate results; it makes no assertions, so it fails
    only if one of the generators raises.
    """
    coding = {}
    coding['NM_080751'] = MutationSyntax(
        'NM_080751', 2629, 876, 'c.2630C>T', 'p.Pro877Leu')
    var = Variant('line0', 0, 20, 2621905, 'C', 'T', coding, True, False)
    var.gene = 'TMC2'

    ma = MartsAdapter(biomart="http://ensembl.org")

    # Local is named ``variants`` so the builtin ``vars`` is not shadowed.
    variants = [
        var,
        Variant(
            "testInsertion", 2, 20, 2621899, "", "AAAAAA",
            {'NM_080751': MutationSyntax(
                'NM_080751', 2625, 876, 'c.2630C>T', 'p.Pro877Leu')},
            True, False),
    ]

    # Direct route: variants -> peptides.
    test = Generator.generate_peptides_from_variants(
        variants, 9, ma, id_type=EIdentifierTypes.REFSEQ, peptides=None)
    test2 = list(test)
    print(len(test2))

    # using a tweaked generator that takes another sequence source if the
    # sequence is too short in respect to the given variants
    # in this case a newer/older sequence from mart in respect to what was
    # given as reference in the annotation process
    t = Generator.generate_transcripts_from_variants(
        variants, ma, id_type=EIdentifierTypes.REFSEQ)
    ts = list(t)
    print(len(ts[0]))

    # Step-wise route: transcripts -> proteins -> peptides.
    p = Generator.generate_proteins_from_transcripts(ts, to_stop=True)
    ps = list(p)
    e = Generator.generate_peptides_from_proteins(ps, 9)
    es = list(e)
    print(len(es))

    # Call form of print (was a Python 2 print statement) to match the
    # other prints in this test and to parse on Python 3 as well.
    print(len(variants))
def setUp(self):
    """Create the shared fixtures: three variants on one transcript and a mart adapter.

    Variant positional arguments, in order: id, type, chrom, genomePos, ref,
    obs, coding, isHomozygous, isSynonymous (metadata is left at its default).
    """
    transcript = "NM_001114377"  # FOXP3
    self.trid = transcript

    def coding_for(trans_pos, prot_pos):
        # Single-entry coding dict keyed by the transcript; the two trailing
        # string arguments of MutationSyntax are left empty.
        # NOTE(review): parameter names assume (transcript pos, protein pos)
        # order for MutationSyntax - confirm against its signature.
        return {
            transcript: MutationSyntax(transcript, trans_pos, prot_pos, "", "")
        }

    self.non_syn_hetero_snp = Variant(
        "COSM1122493", VariationType.SNP, "X", 49111949, "G", "T",
        coding_for(756, 217), False, False)

    self.non_frame_shift_del = Variant(
        "COSM1122495", VariationType.DEL, "X", 49113232, "CTT", "",
        coding_for(615, 205), True, False)

    self.syn_homo_snp = Variant(
        "COSM1122494", VariationType.SNP, "X", 49112257, "C", "T",
        coding_for(653, 217), False, True)

    self.db_adapter = MartsAdapter(
        biomart="http://grch37.ensembl.org/biomart/martservice?query=")
def test_real_life_test(self):
    """Check that both peptide-generation routes agree on the ANNOVAR example.

    The 9-mers generated directly from the variants must be exactly the
    9-mers obtained by going variants -> transcripts -> proteins -> peptides
    (compared by their string form).
    """
    mart = MartsAdapter(
        biomart="http://grch37.ensembl.org/biomart/martservice?query=")

    # The example file ships inside the installed Fred2 package.
    package_dir = os.path.dirname(inspect.getfile(Fred2))
    ano_path = os.path.join(package_dir, "Data/examples/test_annovar.out")
    variants = read_annovar_exonic(ano_path)

    # Route 1: peptides straight from the variants.
    direct = Generator.generate_peptides_from_variants(
        variants, 9, mart, EIdentifierTypes.REFSEQ)
    peps = {str(pep) for pep in direct}

    # Route 2: the explicit three-stage pipeline.
    transcripts = Generator.generate_transcripts_from_variants(
        variants, mart, EIdentifierTypes.REFSEQ)
    prots = Generator.generate_proteins_from_transcripts(transcripts)
    staged = Generator.generate_peptides_from_proteins(prots, 9)
    peps_from_prot = {str(pep) for pep in staged}

    # Mutual inclusion, i.e. neither route yields a peptide the other lacks.
    self.assertTrue(peps.issubset(peps_from_prot))
    self.assertTrue(peps_from_prot.issubset(peps))
def main():
    """Run epitope prediction from the command line and write the result as TSV.

    Peptides are generated either from the variants of an annotated VCF
    (``--vcf``; the IDs listed in ``--proteins`` are handed to the VEP and
    ANNOVAR readers as ``gene_filter``) or, when no VCF is given, from the
    product sequences of the HGNC IDs listed in ``--proteins``. Predictions
    for the alleles in ``--alleles`` are written to ``--output``; with the
    hidden ``--etk`` flag a second table ``<output stem>_etk.tsv`` is
    written as well.

    Returns:
        0 on success; -1 if neither input was given or no variant is left
        after filtering.
    """
    model = argparse.ArgumentParser(
        description='Neoepitope prediction for TargetInsepctor.')
    model.add_argument(
        '-m', '--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method')
    model.add_argument(
        '-v', '--vcf',
        type=str,
        default=None,
        help='Path to the vcf input file')
    model.add_argument(
        '-t', '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)')
    model.add_argument(
        '-p', '--proteins',
        type=str,
        default=None,
        help='Path to the protein ID input file (in HGNC-ID)')
    model.add_argument(
        '-l', '--length',
        choices=range(8, 18),
        type=int,
        default=9,
        help='The length of peptides')
    model.add_argument(
        '-a', '--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)')
    model.add_argument(
        '-r', '--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for varinat annotation and calling.')
    model.add_argument(
        '-fINDEL', '--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)')
    model.add_argument(
        '-fFS', '--filterFSINDEL',
        action="store_true",
        help='Filter frameshift INDELs')
    model.add_argument(
        '-fSNP', '--filterSNP',
        action="store_true",
        help='Filter SNPs')
    model.add_argument(
        '-o', '--output',
        type=str,
        required=True,
        help='Path to the output file')
    model.add_argument(
        '-etk', '--etk',
        action="store_true",
        help=argparse.SUPPRESS)

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    # Maps transcript ID -> gene ID; used for the "Antigen ID" output column.
    transcript_to_genes = {}

    if args.vcf is None and args.proteins is None:
        sys.stderr.write(
            "At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for line in f:
                    line = line.strip()
                    if line != "":
                        protein_ids.append(line)

        if args.type == "VEP":
            variants = read_variant_effect_predictor(
                args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # List comprehensions instead of filter(): a lazy filter object is
        # always truthy (so the emptiness check below would never fire) and
        # can be consumed only once, while ``variants`` is iterated twice.
        variants = [
            v for v in variants if v.type != VariationType.UNKNOWN
        ]

        if args.filterSNP:
            variants = [v for v in variants if v.type != VariationType.SNP]

        if args.filterINDEL:
            indel_types = [
                VariationType.INS, VariationType.DEL,
                VariationType.FSDEL, VariationType.FSINS
            ]
            variants = [v for v in variants if v.type not in indel_types]

        if args.filterFSINDEL:
            frameshift_types = [VariationType.FSDEL, VariationType.FSINS]
            variants = [v for v in variants if v.type not in frameshift_types]

        if not variants:
            sys.stderr.write(
                "No variants left after filtering. Please refine your filtering criteria.\n"
            )
            return -1

        # Keep only peptides that carry a variant in at least one of their
        # source proteins.
        epitopes = [
            pep
            for pep in generate_peptides_from_variants(
                variants, int(args.length), martDB, EIdentifierTypes.ENSEMBL)
            if any(pep.get_variants_by_protein(tid)
                   for tid in pep.proteins.keys())
        ]

        for v in variants:
            for trans_id, coding in v.coding.items():
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for line in f:
                gene_id = line.strip()
                # Skip blank lines (as the VCF branch does) instead of
                # querying the mart with an empty identifier.
                if not gene_id:
                    continue
                ensembl_ids = martDB.get_ensembl_ids_from_id(
                    gene_id, type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(
                    ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_to_genes[
                        ensembl_ids[EAdapterFields.TRANSID]] = gene_id
                    proteins.append(
                        Protein(
                            protein_seq,
                            gene_id=gene_id,
                            transcript_id=ensembl_ids[EAdapterFields.TRANSID]))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes,
                                                          alleles=alleles)

    with open(args.output, "w") as f:
        # From here on ``alleles`` are the result's column labels.
        alleles = result.columns
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) +
                "\tAntigen ID\t" + var_column + "\n")
        for index, row in result.iterrows():
            # The row index is used as (peptide, method).
            p = index[0]
            method = index[1]
            # Transcript IDs may carry a ":FRED2..." suffix; strip it before
            # the gene lookup.
            proteins = ",".join(
                set([
                    transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                    for prot in p.get_all_proteins()
                ]))
            vars_str = ""

            if args.vcf is not None:
                vars_str = "\t" + "|".join(
                    set(
                        prot_id.split(":FRED2")[0] + ":" + ",".join(
                            repr(v)
                            for v in set(p.get_variants_by_protein(prot_id)))
                        for prot_id in p.proteins.keys()
                        if p.get_variants_by_protein(prot_id)))

            f.write(
                str(p) + "\t" + method + "\t" +
                "\t".join("%.3f" % row[a] for a in alleles) + "\t" +
                proteins + vars_str + "\n")

    if args.etk:
        with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t" + "\t".join(a.name for a in alleles) + "\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(
                    set([
                        transcript_to_genes[prot.transcript_id.split(
                            ":FRED2")[0]] for prot in p.get_all_proteins()
                    ]))
                g.write(
                    str(p) + "\t" +
                    "\t".join("%.3f" % row[a] for a in alleles) + "\t" +
                    proteins + "\n")
    return 0
def main():
    """Write a FASTA file of variant-carrying proteins derived from an annotated VCF.

    Variants are read from ``--vcf`` with the reader selected by ``--type``,
    optionally filtered by variation type, turned into transcripts and
    proteins, and every protein for which ``get_variants_by_protein`` reports
    at least one variant is written to ``--output``.

    Returns:
        0 on success; -1 if no VCF was given or no variant is left after
        filtering.
    """
    model = argparse.ArgumentParser(
        description='Neoepitope protein fasta generation from variant vcf')
    model.add_argument(
        '-v', '--vcf',
        type=str,
        default=None,
        help='Path to the vcf input file')
    model.add_argument(
        '-t', '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SnpEff"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)')
    model.add_argument(
        '-p', '--proteins',
        type=str,
        default=None,
        help='Path to the protein ID input file (in HGNC-ID)')
    model.add_argument(
        '-r', '--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for varinat annotation and calling.')
    model.add_argument(
        '-fINDEL', '--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)')
    model.add_argument(
        '-fFS', '--filterFSINDEL',
        action="store_true",
        help='Filter frameshift INDELs')
    model.add_argument(
        '-fSNP', '--filterSNP',
        action="store_true",
        help='Filter SNPs')
    model.add_argument(
        '-o', '--output',
        type=str,
        required=True,
        help='Path to the output file')

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])

    # This tool needs a VCF; the protein ID file only acts as a gene filter,
    # so the message no longer suggests a protein file alone would suffice.
    # (The unreachable trailing ``else`` repeating this check was dropped.)
    if args.vcf is None:
        sys.stderr.write("A vcf file has to be provided.\n")
        return -1

    # Optional gene filter, one ID per line; blank lines are ignored.
    protein_ids = []
    if args.proteins is not None:
        with open(args.proteins, "r") as f:
            for line in f:
                line = line.strip()
                if line != "":
                    protein_ids.append(line)

    # Compare case-insensitively: the offered choice is "SnpEff", so the
    # former exact comparison against "SNPEFF" never matched and SnpEff
    # input fell through to the ANNOVAR reader.
    annotation_type = args.type.upper()
    if annotation_type == "VEP":
        variants = read_variant_effect_predictor(
            args.vcf, gene_filter=protein_ids)
    elif annotation_type == "SNPEFF":
        variants = read_vcf(args.vcf)[0]
    else:
        variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

    # Drop variants of unknown type before the emptiness check, so an input
    # consisting only of unknown variants is reported instead of silently
    # producing an empty FASTA file. Lists (not lazy filter objects) keep
    # the emptiness check meaningful.
    variants = [v for v in variants if v.type != VariationType.UNKNOWN]

    if args.filterSNP:
        variants = [v for v in variants if v.type != VariationType.SNP]

    if args.filterINDEL:
        indel_types = [
            VariationType.INS, VariationType.DEL,
            VariationType.FSDEL, VariationType.FSINS
        ]
        variants = [v for v in variants if v.type not in indel_types]

    if args.filterFSINDEL:
        frameshift_types = [VariationType.FSDEL, VariationType.FSINS]
        variants = [v for v in variants if v.type not in frameshift_types]

    if not variants:
        sys.stderr.write(
            "No variants left after filtering. Please refine your filtering criteria.\n"
        )
        return -1

    # generate transcripts
    transcripts = generate_transcripts_from_variants(
        variants, martDB, EIdentifierTypes.ENSEMBL)

    # generate proteins; materialised before the output file is opened so a
    # failure during generation does not leave a truncated file behind
    proteins = [
        prot
        for prot in generate_proteins_from_transcripts(transcripts)
        if any(prot.get_variants_by_protein(tid)
               for tid in prot.proteins.keys())
    ]

    # write fasta file (was ``open(output, ...)``: ``output`` is undefined)
    with open(args.output, "w") as f:
        for p in proteins:
            f.write('>' + str(p.transcript_id) + '|' + str(p.vars) +
                    '_var_' + '\n')
            f.write(str(p) + '\n')

    return 0
def main():
    """Command-line entry point for epitope prediction; writes a TSV report.

    Input is either an annotated VCF (``--vcf``), whose variants are filtered
    and turned into peptides, or - if no VCF is given - a file of HGNC IDs
    (``--proteins``) whose product sequences are cut into peptides. When both
    are given, the IDs are passed as ``gene_filter`` to the VEP/ANNOVAR
    readers. Predictions for the alleles read from ``--alleles`` go to
    ``--output``; the hidden ``--etk`` switch additionally writes
    ``<output stem>_etk.tsv``.

    Returns:
        0 on success; -1 when no input file was supplied or filtering removed
        every variant.
    """
    model = argparse.ArgumentParser(description='Neoepitope prediction for TargetInsepctor.')

    model.add_argument(
        '-m', '--method',
        type=str,
        choices=EpitopePredictorFactory.available_methods().keys(),
        default="bimas",
        help='The name of the prediction method'
    )

    model.add_argument(
        '-v', '--vcf',
        type=str,
        default=None,
        help='Path to the vcf input file'
    )

    model.add_argument(
        '-t', '--type',
        type=str,
        choices=["VEP", "ANNOVAR", "SNPEFF"],
        default="VEP",
        help='Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)'
    )

    model.add_argument(
        '-p', '--proteins',
        type=str,
        default=None,
        help='Path to the protein ID input file (in HGNC-ID)'
    )

    model.add_argument(
        '-l', '--length',
        choices=range(8, 18),
        type=int,
        default=9,
        help='The length of peptides'
    )

    model.add_argument(
        '-a', '--alleles',
        type=str,
        required=True,
        help='Path to the allele file (one per line in new nomenclature)'
    )

    model.add_argument(
        '-r', '--reference',
        type=str,
        default='GRCh38',
        help='The reference genome used for varinat annotation and calling.'
    )

    model.add_argument(
        '-fINDEL', '--filterINDEL',
        action="store_true",
        help='Filter insertions and deletions (including frameshifts)'
    )

    model.add_argument(
        '-fFS', '--filterFSINDEL',
        action="store_true",
        help='Filter frameshift INDELs'
    )

    model.add_argument(
        '-fSNP', '--filterSNP',
        action="store_true",
        help='Filter SNPs'
    )

    model.add_argument(
        '-o', '--output',
        type=str,
        required=True,
        help='Path to the output file'
    )

    model.add_argument(
        '-etk', '--etk',
        action="store_true",
        help=argparse.SUPPRESS
    )

    args = model.parse_args()

    martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()])
    # transcript ID -> gene ID, consulted when writing the antigen column
    transcript_to_genes = {}

    if args.vcf is None and args.proteins is None:
        sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n")
        return -1

    # if vcf file is given: generate variants and filter them if HGNC IDs are given
    if args.vcf is not None:
        protein_ids = []
        if args.proteins is not None:
            with open(args.proteins, "r") as f:
                for raw in f:
                    entry = raw.strip()
                    if entry != "":
                        protein_ids.append(entry)

        if args.type == "VEP":
            variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids)
        elif args.type == "SNPEFF":
            variants = read_vcf(args.vcf)[0]
        else:
            variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids)

        # Variation types to discard. Built as real lists rather than chained
        # filter() calls: a lazy filter object is always truthy, which would
        # disable the "no variants left" check, and it can be iterated only
        # once although ``variants`` is needed twice below.
        variants = [var for var in variants if var.type != VariationType.UNKNOWN]

        if args.filterSNP:
            variants = [var for var in variants if var.type != VariationType.SNP]

        if args.filterINDEL:
            variants = [var for var in variants
                        if var.type not in [VariationType.INS,
                                            VariationType.DEL,
                                            VariationType.FSDEL,
                                            VariationType.FSINS]]

        if args.filterFSINDEL:
            variants = [var for var in variants
                        if var.type not in [VariationType.FSDEL, VariationType.FSINS]]

        if not variants:
            sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n")
            return -1

        # Only peptides with a variant in at least one source protein are kept.
        candidate_peptides = generate_peptides_from_variants(
            variants, int(args.length), martDB, EIdentifierTypes.ENSEMBL)
        epitopes = [pep for pep in candidate_peptides
                    if any(pep.get_variants_by_protein(tid) for tid in pep.proteins.keys())]

        for var in variants:
            for trans_id, coding in var.coding.items():
                # identity test instead of ``!= None``
                if coding.geneID is not None:
                    transcript_to_genes[trans_id] = coding.geneID
                else:
                    transcript_to_genes[trans_id] = 'None'

    # else: generate protein sequences from given HGNC IDs and then epitopes
    else:
        proteins = []
        with open(args.proteins, "r") as f:
            for raw in f:
                hgnc_id = raw.strip()
                if hgnc_id == "":
                    # Blank lines are skipped here too (the VCF branch already
                    # did); previously an empty ID was sent to the mart.
                    continue
                ensembl_ids = martDB.get_ensembl_ids_from_id(hgnc_id, type=EIdentifierTypes.HGNC)[0]
                protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID])
                if protein_seq is not None:
                    transcript_to_genes[ensembl_ids[EAdapterFields.TRANSID]] = hgnc_id
                    proteins.append(Protein(protein_seq, gene_id=hgnc_id,
                                            transcript_id=ensembl_ids[EAdapterFields.TRANSID]))
        epitopes = generate_peptides_from_proteins(proteins, int(args.length))

    # read in allele list
    alleles = read_lines(args.alleles, in_type=Allele)

    result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles)

    with open(args.output, "w") as f:
        # ``alleles`` now refers to the column labels of the result table.
        alleles = result.columns
        var_column = " Variants" if args.vcf is not None else ""
        f.write("Sequence\tMethod\t"+"\t".join(a.name for a in alleles)+"\tAntigen ID\t"+var_column+"\n")
        for index, row in result.iterrows():
            # index is used as (peptide, method)
            p = index[0]
            method = index[1]
            # ":FRED2..." suffixes are cut off the transcript ID before lookup
            proteins = ",".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                     for prot in p.get_all_proteins()]))
            vars_str = ""

            if args.vcf is not None:
                vars_str = "\t"+"|".join(set(
                    prot_id.split(":FRED2")[0]+":"+",".join(
                        repr(v) for v in set(p.get_variants_by_protein(prot_id)))
                    for prot_id in p.proteins.keys()
                    if p.get_variants_by_protein(prot_id)))

            f.write(str(p)+"\t"+method+"\t"+"\t".join("%.3f"%row[a] for a in alleles)+"\t"+proteins+vars_str+"\n")

    if args.etk:
        with open(args.output.rsplit(".",1)[0]+"_etk.tsv", "w") as g:
            alleles = result.columns
            g.write("Alleles:\t"+"\t".join(a.name for a in alleles)+"\n")
            for index, row in result.iterrows():
                p = index[0]
                proteins = " ".join(set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]]
                                         for prot in p.get_all_proteins()]))
                g.write(str(p)+"\t"+"\t".join("%.3f"%row[a] for a in alleles)+"\t"+proteins+"\n")
    return 0