def test_read_lines(self):
    """Exercise FileReader.read_lines with the three allele fixtures of the test case.

    The file at ``self.ale_path`` must yield exactly two entries, while
    ``self.ale_no_path`` and ``self.ale_zonk_path`` must raise ``IOError``
    and ``ValueError`` respectively.
    """
    parsed = FileReader.read_lines(self.ale_path, in_type=Allele)
    self.assertEqual(len(parsed), 2)

    with self.assertRaises(IOError):
        FileReader.read_lines(self.ale_no_path, in_type=Allele)

    with self.assertRaises(ValueError):
        FileReader.read_lines(self.ale_zonk_path, in_type=Allele)
def main():
    """Command-line entry point for the string-of-beads design with spacers.

    Parses the command line, reads the peptides and alleles, validates the
    requested cleavage and epitope predictors, builds an
    ``EpitopeAssemblyWithSpacer`` model, runs its ``approximate`` method and
    both prints the resulting string-of-beads and writes it to the output
    file.

    Exits with status -1 if an unsupported predictor name was given.
    """
    parser = argparse.ArgumentParser(
        description="The software is a novel approach to construct epitope-based string-of-beads "
                    "vaccines in optimal order and with sequence-optimized spacers of flexible length "
                    "such that the recovery of contained epitopes is maximized and immunogenicity of "
                    "arising neo-epitopes is reduced. ")
    parser.add_argument("-i", "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)")
    parser.add_argument("-a", "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)")

    # parameters of the model
    parser.add_argument("-k", "--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument("-al", "--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument("-be", "--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")
    parser.add_argument("-cp", "--cleavage_prediction",
                        default="PCM",
                        help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]")
    parser.add_argument("-ep", "--epitope_prediction",
                        default="Syfpeithi",
                        help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]")
    parser.add_argument("-thr", "--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")
    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument("-t", "--threads",
                        type=int,
                        default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")

    args = parser.parse_args()

    # parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    # read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_S"]:
        print("Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S")
        sys.exit(-1)

    if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]:
        # This message used to be a copy of the cleavage-predictor one.
        print("Specified epitope predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC")
        sys.exit(-1)

    # set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    # the same threshold is applied to every allele
    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides, cl_pred, epi_pred, alleles,
                                       k=args.max_length, en=9, threshold=thr,
                                       solver="cplex", alpha=args.alpha, beta=args.beta,
                                       verbosity=0)

    # solve
    # Pre-processing has to be disabled, otherwise many solvers will destroy the
    # symmetry of the problem. How to do this depends on the solver used;
    # for CPLEX it is preprocessing_presolve=n.
    threads = mp.cpu_count() if args.threads is None else args.threads
    svbws = solver.approximate(threads=threads,
                               options={"preprocessing_presolve": "n", "threads": 1})
    bead_string = "-".join(map(str, svbws))

    # Single-argument print(...) gives the same output under Python 2 and 3;
    # the double blank reproduces the separator of the former print statement.
    print("")
    print("Resulting String-of-Beads:  " + bead_string)
    print("")
    with open(args.output, "w") as f:
        f.write(bead_string)
def __main__():
    """Create a FASTA file with variant proteins or peptides from a variant file.

    The function always terminates the process through ``sys.exit``: status 1
    for invalid option combinations or unreadable input, status 0 after one of
    four output stages:

    1. proteins only, no prediction requested;
    2. variant-carrying peptides, no prediction requested;
    3. predicted peptides (or proteins annotated from them) without filter;
    4. as 3, restricted to rows with at least one value above ``--filter``.

    Combinations involving ``--base`` are not implemented yet and only print
    "N/A".
    """
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file",
                        help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path",
                        help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int,
                        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with",
                        help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float,
                        help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true',
                        help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path",
                        help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        # The filter works on prediction scores, so a prediction method is required.
        print("Need predict option with filter, aborting!")
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print("Need alleles with predict option, aborting!")
        sys.exit(1)

    # The log file sits next to the output file and carries a timestamp.
    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0] +
                        "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+', level=logging.DEBUG)  # , format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path +
                 " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    # Open questions / ideas (look at theos filter, ligandoqc, fasta-distributions,
    # lica and the morgenstellen server content scripts):
    #   complete proteins? only containing binders? k-mers? binders only?
    #   FastaSlicer.py?
    #   remove original if homozygous (needs fasta input)?
    #   add germline variant option? or expect all to be in one vcf?

    def write_and_exit(fasta_text):
        """Write the FASTA text to the requested output file and end the run."""
        with open(options.outfile_path, 'w') as f:
            f.write(fasta_text)
        sys.exit(0)

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        # Reading GSvar is not wired up (FileReader.read_GSvar is disabled), so
        # stop with a clear message instead of a NameError on vcfvars below.
        m = "GSvar input is not supported yet, could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions
    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried
    for p in proteins:
        # assume gene name from first variant
        p.gene_id = list(p.vars.values())[0][0].gene
    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    prediction_requested = bool(options.predict_with or options.filter)

    # First exit option
    if not prediction_requested and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print("N/A")
            sys.exit(0)
        write_and_exit(proteins_to_fasta(proteins))

    # From now on, digestion must be set somehow
    digest = options.digest or 9
    peptides = list(g.generate_peptides_from_proteins(proteins, digest))
    # removing unvaried
    peptides_var = [x for x in peptides
                    if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]

    # Second exit option
    if not prediction_requested:
        write_and_exit(peptides_to_fasta(peptides_var))

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print(m + " what: " + str(e))
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print(m)
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        # The handler used to read the nonexistent attribute options.inf and
        # therefore failed itself; report the variant file instead.
        m = " ".join(["something went wrong with the prediction", str(options.var_file),
                      str(options.predict_with), "what:", str(e)])
        logging.error(m)
        print(m)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    # NOTE(review): the row index is treated as a tuple whose first element offers
    # log_metadata and whose remaining elements are the metadata keys - confirm
    # against the predictor's result layout.
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print("N/A")
                sys.exit(0)
            write_and_exit(proteins_to_fasta(annotate_protein_from_peptides(preds)))
        write_and_exit(peptides_to_fasta(preds))

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print("N/A")
            sys.exit(0)
        write_and_exit(proteins_to_fasta(annotate_protein_from_peptides(preds_f)))
    write_and_exit(peptides_to_fasta(preds_f))
def __main__():
    """Run the EPAA pipeline: predict epitopes and annotate the results.

    Input is either a peptide file (``--peptides``) or a somatic variant file
    (``--somatic_mutations``; ``.GSvar``, ``.tsv`` or ``.vcf``). Predictions
    for all peptide lengths are concatenated into one dataframe, optionally
    extended with wild type sequences, protein quantification, (differential)
    gene expression and ligandomics columns, and written to
    ``<identifier>_prediction_results.tsv``. Summary statistics are written to
    ``<identifier>_report.json``.

    Side effects: may change the working directory to ``--output_dir``,
    configures the root logger to write ``<identifier>_prediction.log`` and
    assigns the module globals ``transcriptProteinMap`` and
    ``transcriptSwissProtMap`` when variants are used.
    """
    parser = argparse.ArgumentParser(
        description="EPAA - Epitope Prediction And Annotation \n "
                    "Pipeline for prediction of MHC class I and II epitopes from variants or peptides "
                    "for a list of specified alleles. Additionally predicted epitopes can be annotated "
                    "with protein quantification values for the corresponding proteins, identified "
                    "ligands, or differential expression values for the corresponding transcripts.",
        version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default="I", help="MHC class I or II")
    parser.add_argument('-l', "--length", help="Maximum peptide length")
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference",
                        help="Reference, retrieved information will be based on this ensembl version",
                        required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom",
                        required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type",
                        help="Add wild type sequences of mutated peptides to output",
                        required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome",
                        help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference",
                        help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification",
                        help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression",
                        help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression",
                        help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id",
                        help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")
    parser.add_argument('-o', "--output_dir",
                        help="All files written will be put in this directory")

    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    # All outputs use paths relative to the working directory. After a
    # successful chdir the log name must therefore NOT be joined with
    # output_dir again (that pointed to a nested, nonexistent directory for
    # relative paths).
    if args.output_dir is not None:
        try:
            os.chdir(args.output_dir)
            directory_note = "Using provided data directory: {}".format(str(args.output_dir))
        except OSError:
            directory_note = "No such directory, using current."
    else:
        directory_note = "Using current data directory."
    logging.basicConfig(filename='{}_prediction.log'.format(args.identifier),
                        filemode='w+', level=logging.DEBUG)
    logging.info(directory_note)

    logging.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }

    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations is None:
            parser.error("Either --peptides or --somatic_mutations has to be specified.")
        if args.somatic_mutations.endswith(('.GSvar', '.tsv')):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts = read_vcf(args.somatic_mutations)
        else:
            parser.error("Unsupported variant file type: {}".format(args.somatic_mutations))

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(
            ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logging.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith((".fasta", ".fsa")):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    # MHC class I or II predictions: the classes differ only in the methods
    # and in the minimum peptide length handed to the variant prediction.
    if args.mhcclass == "I":
        # methods = ['netmhc-4.0', 'syfpeithi-1.0', 'netmhcpan-3.0']
        methods = ['syfpeithi-1.0']
        min_length = 8
    else:
        methods = ['netmhcII-2.2', 'syfpeithi-1.0', 'netmhcIIpan-3.1']
        min_length = 15

    if args.peptides:
        pred_dataframes, statistics = make_predictions_from_peptides(
            peptides, methods, alleles, up_db, args.identifier, metadata)
    else:
        pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(
            vl, methods, alleles, min_length, int(args.length) + 1, ma, up_db,
            args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except Exception:
        # best effort: continue with an empty result table
        complete_df = pd.DataFrame()
        logging.error("No predictions available.")

    # store version of used methods
    method_map = {m.split('-')[0]: m for m in methods}

    # replace method names with method names with version
    complete_df.replace({'method': method_map}, inplace=True)

    # include wild type sequences to dataframe if specified
    columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts',
                     'proteins', 'variant type', 'method']
    if args.wild_type:
        # NOTE(review): all_peptides_filtered is only bound for variant input;
        # --wild_type together with --peptides fails here - confirm intended use.
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(
            lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles.insert(1, 'wt sequence')

    # Change the order (the index) of the columns: fixed columns first, then the rest
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []
    seen_binders = set()  # O(1) membership test instead of scanning the list

    for _, r in complete_df.iterrows():
        sequence = str(r['sequence'])
        # a row counts as binder as soon as one binder column is True
        if any(r[c] is True for c in binder_cols):
            binders.append(sequence)
            seen_binders.add(sequence)
            pos_predictions.append(sequence)
        else:
            neg_predictions.append(sequence)
            if sequence not in seen_binders:
                non_binders.append(sequence)

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        # the keys of the first entry define the columns to add
        first_entry = next(iter(protein_quant.values()))
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(
                lambda row: create_quant_column_value_for_result(
                    row, protein_quant, transcriptSwissProtMap, k),
                axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        # The id column to use depends on the id type found in the result
        # table; it is the same for every line, so look it up once.
        # NOTE(review): this inspects the SECOND row (iloc[1]) and needs at
        # least two result rows - confirm whether iloc[0] was intended.
        uses_ensembl_ids = 'ENSG' in complete_df.iloc[1]['gene']
        id_column = 0 if uses_ensembl_ids else 1
        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                gene_id_lengths[ids[id_column]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False),
            axis=1)
        complete_df['ligand intensity'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False),
            axis=1)

        # --wild_type is a store_true flag and therefore never None; test its
        # truth value so the wt columns are only added on request.
        if args.wild_type:
            complete_df['wt ligand score'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True),
                axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True),
                axis=1)

    # write dataframe to tsv
    # fillna returns a new frame; the result has to be kept
    complete_df = complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier),
                       sep='\t', index=False)

    statistics['number_of_predictions'] = complete_df.shape[0]
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    # NOTE: despite their names the two "unique" entries store the sequences
    # themselves, not counts; kept as is for consumers of the report.
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
def __main__():
    """Run the EPAA pipeline with user-selected prediction tools.

    Input is either a peptide file (``--peptides``) or a somatic variant file
    (``--somatic_mutations``; ``.GSvar``, ``.tsv`` or ``.vcf``). The tools
    named in ``--tools`` are combined with the versions read from the
    ``--versions`` file, checked against
    ``EpitopePredictorFactory.available_methods()`` and used for prediction.
    Results are optionally annotated (wild type sequences, protein
    quantification, gene expression, ligandomics) and written to
    ``<identifier>_prediction_results.tsv``; statistics go to
    ``<identifier>_report.json`` and, with ``--fasta_output``, the mutated
    proteins to ``<identifier>_prediction_proteins.fasta``.

    Raises:
        ValueError: if a selected tool version is not offered by Fred2.
    """
    parser = argparse.ArgumentParser(
        description="EPAA - Epitope Prediction And Annotation \n "
                    "Pipeline for prediction of MHC class I and II epitopes from variants or peptides "
                    "for a list of specified alleles. Additionally predicted epitopes can be annotated "
                    "with protein quantification values for the corresponding proteins, identified "
                    "ligands, or differential expression values for the corresponding transcripts.",
        version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions",
                        required=True, type=str)
    parser.add_argument('-sv', "--versions",
                        help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference",
                        help="Reference, retrieved information will be based on this ensembl version",
                        required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom",
                        required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type",
                        help="Add wild type sequences of mutated peptides to output",
                        required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output",
                        help="Create FASTA file with protein sequences",
                        required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome",
                        help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference",
                        help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification",
                        help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression",
                        help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression",
                        help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id",
                        help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")

    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    # only filled by the variant based prediction; stays empty for peptide input
    proteins = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org',
                  'GRCh38': 'http://dec2016.archive.ensembl.org'}

    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations is None:
            parser.error("Either --peptides or --somatic_mutations has to be specified.")
        if args.somatic_mutations.endswith(('.GSvar', '.tsv')):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts, metadata = read_vcf(args.somatic_mutations)
        else:
            parser.error("Unsupported variant file type: {}".format(args.somatic_mutations))

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(
            ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith((".fasta", ".fsa")):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    selected_methods = args.tools.split(',')
    with open(args.versions, 'r') as versions_file:
        # second column holds the version with one leading character that is dropped
        tool_version = [(row[0], str(row[1][1:]))
                        for row in csv.reader(versions_file, delimiter="\t")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each selected method the corresponding tool version
        methods = {method: version
                   for tool, version in tool_version
                   for method in selected_methods
                   if tool.lower() in method.lower()}

    for method, version in methods.items():
        if version not in EpitopePredictorFactory.available_methods()[method]:
            raise ValueError("The specified version " + version + " for " + method +
                             " is not supported by Fred2.")

    # MHC class I or II predictions
    # The former `args.mhcclass is 1` test selected between two identical
    # branches (and compared identity, not value), so it is dropped.
    if args.peptides:
        pred_dataframes, statistics = make_predictions_from_peptides(
            peptides, methods, alleles, up_db, args.identifier, metadata)
    else:
        pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(
            vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db,
            args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except Exception:
        # best effort: continue with an empty result table
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")

    # replace method names with method names with version
    # (skipped for the empty fallback frame, which has no 'method' column)
    if 'method' in complete_df.columns:
        complete_df['method'] = complete_df['method'].apply(lambda x: x + '-' + methods[x])

    # include wild type sequences to dataframe if specified
    columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts',
                     'proteins', 'variant type', 'method']
    if args.wild_type:
        # NOTE(review): all_peptides_filtered is only bound for variant input;
        # --wild_type together with --peptides fails here - confirm intended use.
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(
            lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles.insert(1, 'wt sequence')

    # Change the order (the index) of the columns: fixed columns first, then the rest
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []
    seen_binders = set()  # O(1) membership test instead of scanning the list

    for _, r in complete_df.iterrows():
        sequence = str(r['sequence'])
        # a row counts as binder as soon as one binder column is True
        if any(r[c] is True for c in binder_cols):
            binders.append(sequence)
            seen_binders.add(sequence)
            pos_predictions.append(sequence)
        else:
            neg_predictions.append(sequence)
            if sequence not in seen_binders:
                non_binders.append(sequence)

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        # the keys of the first entry define the columns to add
        first_entry = next(iter(protein_quant.values()))
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(
                lambda row: create_quant_column_value_for_result(
                    row, protein_quant, transcriptSwissProtMap, k),
                axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        # The id column to use depends on the id type found in the result
        # table; it is the same for every line, so look it up once.
        # NOTE(review): this inspects the SECOND row (iloc[1]) and needs at
        # least two result rows - confirm whether iloc[0] was intended.
        uses_ensembl_ids = 'ENSG' in complete_df.iloc[1]['gene']
        id_column = 0 if uses_ensembl_ids else 1
        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                gene_id_lengths[ids[id_column]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False),
            axis=1)
        complete_df['ligand intensity'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False),
            axis=1)

        # --wild_type is a store_true flag and therefore never None; test its
        # truth value so the wt columns are only added on request.
        if args.wild_type:
            complete_df['wt ligand score'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True),
                axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True),
                axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
            for p in proteins:
                # flatten the per-key variant lists of the protein
                variants = list(itertools.chain.from_iterable(p.vars[v] for v in p.vars))
                coding_entries = set(itertools.chain.from_iterable(
                    x.coding.values() for x in variants))
                cds = ','.join([y.cdsMutationSyntax for y in coding_entries])
                aas = ','.join([y.aaMutationSyntax for y in coding_entries])
                protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv
    # fillna returns a new frame; the result has to be kept
    complete_df = complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier),
                       sep='\t', index=False)

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    # NOTE: despite their names the two "unique" entries store the sequences
    # themselves, not counts; kept as is for consumers of the report.
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)

    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
def __main__():
    """Report which requested alleles and peptide lengths the selected tools support.

    The tools named in ``--tools`` are combined with the versions read from the
    ``--versions`` file. For every allele and every peptide length the report
    ``model_report.txt`` lists the tools that do not support it, followed by a
    summary line.

    Raises:
        ValueError: if a tool version is not offered by Fred2, if none of the
            alleles is supported by any tool, or if none of the peptide
            lengths is supported by any tool.
    """
    # The text has to be passed as description; given positionally it would
    # become the program name shown in the usage line.
    parser = argparse.ArgumentParser(
        description="Write out information about supported models by Fred2 for installed predictor tool versions.")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-t', '--tools', help='Tools requested for peptide predictions',
                        required=True, type=str)
    parser.add_argument('-v', '--versions', help='<Required> File with used software versions.',
                        required=True)

    args = parser.parse_args()

    selected_methods = args.tools.split(',')
    with open(args.versions, 'r') as versions_file:
        # second column holds the version with one leading character that is dropped
        tool_version = [(row[0], str(row[1][1:]))
                        for row in csv.reader(versions_file, delimiter="\t")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))  # how to handle this?
        # get for each method the corresponding tool version
        methods = {method: version
                   for tool, version in tool_version
                   for method in selected_methods
                   if tool.lower() in method.lower()}

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    if args.peptides:
        # NOTE(review): every element returned by read_peptide_input is measured
        # with len(); confirm it returns the peptides themselves and not a
        # (peptides, metadata) pair.
        peptides = read_peptide_input(args.peptides)
        peptide_lengths = set(len(pep) for pep in peptides)
    else:
        if args.min_length is None or args.max_length is None:
            parser.error("Without --peptides both --min_length and --max_length have to be specified.")
        # the options arrive as strings and have to be converted first
        peptide_lengths = range(int(args.min_length), int(args.max_length) + 1)

    with open("model_report.txt", 'w') as output:
        # check if requested tool versions are supported
        for method, version in methods.items():
            if version not in EpitopePredictorFactory.available_methods()[method]:
                raise ValueError("The specified version " + version + " for " + method +
                                 " is not supported by Fred2.")

        # create every predictor once instead of once per allele and length
        predictors = [(method, version, EpitopePredictorFactory(method, version=version))
                      for method, version in methods.items()]

        # check if requested alleles are supported
        support_all_alleles = True
        no_allele_support = True
        for a in alleles:
            supported = False
            for method, version, predictor in predictors:
                if a not in sorted(predictor.supportedAlleles):
                    output.write("Allele " + convert_allele_back(a) + " is not supported by " +
                                 method + " " + version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write("Allele " + convert_allele_back(a) +
                             " is not supported by any of the requested tools.\n")
                logger.warning("Allele " + convert_allele_back(a) +
                               " is not supported by any of the requested tools.")
                support_all_alleles = False
            else:
                no_allele_support = False

        if support_all_alleles:
            output.write("All selected alleles are supported by at least one of the requested tools.\n")
        if no_allele_support:
            output.write("None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n")
            raise ValueError("None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.")

        output.write("\n")

        # check if requested lengths are supported
        support_all_lengths = True
        no_length_support = True
        for l in peptide_lengths:
            supported = False
            for method, version, predictor in predictors:
                if l not in sorted(predictor.supportedLength):
                    output.write("Peptide length " + str(l) + " is not supported by " +
                                 method + " " + version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write("Peptide length " + str(l) +
                             " is not supported by any of the requested tools.\n")
                logger.warning("Peptide length " + str(l) +
                               " is not supported by any of the requested tools.")
                support_all_lengths = False
            else:
                no_length_support = False

        if support_all_lengths:
            output.write("All selected or provided peptide lengths are supported by at least one of the requested tools.\n")
        if no_length_support:
            output.write("None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n")
            raise ValueError("None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.")
def __main__(): parser = argparse.ArgumentParser(version=VERSION) parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True) parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True) parser.add_argument( '-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.") parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction") parser.add_argument( '-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]". format(m=PRED_METH)) parser.add_argument( '-f', "--filter", dest="filter", type=float, help= "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict" ) parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.") parser.add_argument( '-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.") options = parser.parse_args() if len(sys.argv) <= 1: parser.print_help() sys.exit(1) if options.filter and not options.predict_with: parser.print_help() print "Need alleles with predict option, aborting!" sys.exit(1) if options.predict_with and not options.alleles: parser.print_help() print "Need alleles with predict option, aborting!" sys.exit(1) temp_dir = "/tmp/" logging.basicConfig( filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log', filemode='w+', level=logging.DEBUG) #, format='%(levelname)s:%(message)s' logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now())) logging.warning("verbosity turned on") #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts # complete proteins? # only containing binders? 
# k-mers? # binders only? # FastaSlicer.py? # remove original if homozygous (needs fasta input)? # add germline variant option? or expect all to be in one vcf? # MyObject = type('MyObject', (object,), {}) # options = MyObject() # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf") # # vt = os.path.splitext(options.var_file)[-1] # if ".vcf" == vt: # vcfvars, accessions = FileReader.read_vcf(options.var_file) # # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org") # # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) # transcripts = [x for x in transcript_gen if x.vars] # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) # protein_gen = g.generate_proteins_from_transcripts(transcript_gen) # proteins = [x for x in protein_gen if x.vars] # for p in proteins: # p.gene_id = p.vars.values()[0][0].gene # # # for t in transcripts: # t.gene_id = t.vars.values()[0].gene # vt = os.path.splitext(options.var_file)[-1] if ".vcf" == vt: vcfvars, accessions = FileReader.read_vcf(options.var_file) elif ".GSvar" == vt: pass # vcfvars = FileReader.read_GSvar(options.var_file) else: m = "Could not read variants {f}, aborting.".format(f=options.var_file) logging.error(m) print m sys.exit(1) mart_db = MartsAdapter(biomart="http://grch37.ensembl.org" ) # TODO guess id_type for mart_db from accessions transcript_gen = g.generate_transcripts_from_variants( vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ) protein_gen = g.generate_proteins_from_transcripts(transcript_gen) proteins = [x for x in protein_gen if x.vars] # removing unvaried for p in proteins: p.gene_id = p.vars.values( )[0][0].gene # assume gene name from first variant proteins = [p for p in proteins if not is_stop_gain(p)] # kick out stop gains # First exit option if not (options.predict_with or options.filter) and options.only_proteins: if options.basefasta_path: 
# TODO - replace from base fasta print "N/A" sys.exit(0) else: e = proteins_to_fasta(proteins) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) # From now on, digestion must be set somehow if not options.digest: digest = 9 else: digest = options.digest peptide_gen = g.generate_peptides_from_proteins(proteins, digest) peptides = [x for x in peptide_gen] peptides_var = [ x for x in peptides if any( x.get_variants_by_protein(y) for y in x.proteins.keys()) ] # removing unvaried # Second exit option if not (options.predict_with or options.filter): e = peptides_to_fasta(peptides_var) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) # From now on, predictions are needed try: target_alleles_set = set( FileReader.read_lines(options.alleles, in_type=Allele)) except Exception as e: m = "Could not read alleles file {f}, aborting.".format( f=options.alleles) logging.error(m) print m, "what:", str(e) sys.exit(1) try: ttn = EpitopePredictorFactory(options.predict_with) except Exception as e: m = "Could not initialize prediction method {f}, aborting.".format( f=options.predict_with) logging.error(m) print m sys.exit(1) try: preds = ttn.predict(peptides_var, alleles=target_alleles_set) except Exception as e: print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str( e) sys.exit(1) # punch prediction results in peptide metadata (inside pandas dataframe) #PRED_METH = set() for i, row in preds.iterrows(): for j in i[1:]: i[0].log_metadata(j, dict(zip(row.index, row.values))) #PRED_METH.add(j) # need that later # Third exit option if not options.filter: if options.only_proteins: if options.basefasta_path: # TODO - replace from base fasta plus prediction annotation print "N/A" sys.exit(0) else: prs = annotate_protein_from_peptides(preds) e = proteins_to_fasta(prs) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) else: e = peptides_to_fasta(preds) with open(options.outfile_path, 'w') as f: 
f.write(e) sys.exit(0) # kick out nonbinder preds_f = preds[(preds > options.filter).any(axis=1)] # Fourth exit option if options.only_proteins: if options.basefasta_path: # TODO - replace from base fasta binders only plus prediction annotation print "N/A" sys.exit(0) else: prs = annotate_protein_from_peptides(preds_f) e = proteins_to_fasta(prs) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0) else: e = peptides_to_fasta(preds_f) with open(options.outfile_path, 'w') as f: f.write(e) sys.exit(0)
def main(): parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads vaccines in optimal order and with sequence-optimized spacers of flexible length such that the recovery of contained epitopes is maximized and immunogenicity of arising neo-epitopes is reduced. """) parser.add_argument("-i", "--input", required=True, help="File containing epitopes (one peptide per line)" ) parser.add_argument("-a", "--alleles", required=True, help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)" ) #parameters of the model parser.add_argument("-k","--max_length", default=6, type=int, help="Specifies the max. length of the spacers (default 6)") parser.add_argument("-al","--alpha", default=0.99, type=float, help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)") parser.add_argument("-be","--beta", default=0.0, type=float, help="Specifies the second-order preference of the user in the model [0,1] (default 0).") parser.add_argument("-thr","--threshold", default=20, type=float, help="Specifies epitope prediction threshold for SYFPEITHI (default 20).") parser.add_argument("-o", "--output", required=True, help="Specifies the output file.") parser.add_argument("-t", "--threads", type=int, default=None, help="Specifies number of threads. 
If not specified all available logical cpus are used.") args = parser.parse_args() #parse input peptides = FileReader.read_lines(args.input, type="Peptide") #read in alleles alleles = generate_alleles(args.alleles) #set-up model cl_pred = CleavageSitePredictorFactory("PCM") epi_pred = EpitopePredictorFactory("Syfpeithi") thr = {a.name:args.threshold for a in alleles} solver = EpitopeAssemblyWithSpacer(peptides,cl_pred,epi_pred,alleles, k=args.max_length,en=9,threshold=thr, solver="cplex", alpha=args.alpha, beta=args.beta, verbosity=0) #solve #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n threads = mp.cpu_count() if args.threads is None else args.threads svbws = solver.approximate(threads=threads,options="preprocessing_presolve=n,threads=1") print print "Resulting String-of-Beads: ","-".join(map(str,svbws)) print with open(args.output, "w") as f: f.write("-".join(map(str,svbws)))
def main(): parser = argparse.ArgumentParser(description="Reads protein or peptide sequences and predicts peptides "+ "for a specified prediction method and HLA alleles.") parser.add_argument("-i", "--input", nargs="+", required=True, help="Input data can be RefSeq ID, UniProt ID, fasta file, peptide file (one peptide per line)," +" or peptide sequences as sequences (max 50)" ) input_types = parser.add_mutually_exclusive_group(required=True) input_types.add_argument("-r","--refseq", action="store_true", help= "Specifies the input as RefSeq IDs") input_types.add_argument("-u","--uniprot", action="store_true", help= "Specifies the input as UniProt IDs") input_types.add_argument("-f","--fasta", action="store_true", help= "Specifies the input as protein (multi-)Fasta file") input_types.add_argument("-pf","--pepfile", action="store_true", help= "Specifies the input as peptide file") input_types.add_argument("-p","--peptide", action="store_true", help= "Specifies the input as peptide sequences") parser.add_argument("-a", "--alleles", nargs="+", required=True, help="Specifies for which alleles prediction should be made. " + "Input either can be alleles as string (new nomenclature), or a file with one allele per line.") allele_types = parser.add_mutually_exclusive_group(required=True) allele_types.add_argument("-af", "--allelefile", action="store_true", help="Specifies the allele input as allele file.") allele_types.add_argument("-as", "--allelestring", action="store_true", help="Specifies the allele input as allele string.") parser.add_argument("-m", "--method", required=True, nargs="+", help="Specifies the method used for prediction.") parser.add_argument("-l", "--length", required=False, type=int, default=9, help="Specifies the length of the peptides (default=9).") parser.add_argument("-o", "--output", required=True, help="Specifies the output path. 
Results will be written to CSV") parser.add_argument("-am", "--available", required=False, action="store_true", help="Returns all available methods and their allele models.") #COMMENT: These options are hidden and only used for ETK2 parser.add_argument("-html", "--html", required=False, action="store_true", help=argparse.SUPPRESS) parser.add_argument("-od", "--outdir", required=False, default="", help=argparse.SUPPRESS) args = parser.parse_args() if args.available: for pred, obj in AEpitopePrediction.registry.iteritems(): if pred not in ["AEpitopePrediction", "APSSMEpitopePredictor", "ANetMHC", "ASVMEpitopePrediction"]: print "Method: ",pred print "Supported Alleles: ", " ".join(getattr(obj, "_"+pred+"__alleles" )) print "Supported Length: ", " ".join(map(str, getattr(obj, "_"+pred+"__supported_length"))) print sys.exit(0) ''' Parser Input ''' #RefSeq if args.refseq: pass #UniProt elif args.uniprot: pass #fasta protein elif args.fasta: proteins = FileReader.read_fasta(args.input, type="Protein") peptides = generate_peptides_from_protein(proteins, args.length) elif args.pepfile: peptides = FileReader.read_lines(args.input, type="Peptide") elif args.peptide: peptides = [Peptide(s) for s in args.input] #read in alleles if args.allelefile: alleles = FileReader.read_lines(args.alleles, type="Allele") else: alleles = [Allele(a.upper()) for a in args.alleles] result = [EpitopePredictorFactory(m).predict(peptides, alleles) for m in args.method] r_df = result.pop() for r in result: r_df_a, r_a = r_df.align(r, fill_value=0) r_df = r_df_a + r_a output = args.output if args.outdir == "" else args.outdir + os.path.basename(args.output) with open(output, "w") as out: r_df.to_csv(out) #generate Galaxy HTML output if args.html: begin_html = """<?xml version="1.0" encoding="utf-8" ?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> <head> 
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <link rel="stylesheet" href="/static/style/blue/etk.css" type="text/css" /> <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.js"></script> <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.tablesorter.js"></script> <script type="text/javascript" src="/static/scripts/libs/etk.js"></script> </head> <body> <div class="document">""" setting = """ <h2 class="etk-heading">Epitope Prediction Results</h2> <table class="etk-parameterT"> <tr> <th class ="etk-innerHeading" colspan="2"> Parameters </th></tr> <tr> <th>Prediction Method:</th> <td>%s</td> </tr> </table>"""%args.method table=""" <input id="etk-search" placeholder=" filter"> <table class="etk-sortT etk-resultsT etk-filterT"> <thead> <tr> <th>Peptide</th>"""+"".join("<th>%s</th>"%str(a) for a in result.columns) \ +""" </tr> </thead>"""+"".join("<tr><td>%s<td>%s</tr>"%(r[0] ,"".join("<td align='right'>%s</td>"%str(result.loc[r, c]))) for r in result.index for c in result.columns)+"</table>" end_html = "</div></body></html>" html_out = ".".join(output.split(".")[:-1])+".html" with open(html_out, "w") as html_o: html_o.write(begin_html+setting+table+end_html)
def main(): parser = argparse.ArgumentParser( description= """The software is a novel approach to construct epitope-based string-of-beads vaccines in optimal order and with sequence-optimized spacers of flexible length such that the recovery of contained epitopes is maximized and immunogenicity of arising neo-epitopes is reduced. """) parser.add_argument("-i", "--input", required=True, help="File containing epitopes (one peptide per line)") parser.add_argument( "-a", "--alleles", required=True, help= "Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)" ) #parameters of the model parser.add_argument( "-k", "--max_length", default=6, type=int, help="Specifies the max. length of the spacers (default 6)") parser.add_argument( "-al", "--alpha", default=0.99, type=float, help= "Specifies the first-order preference of the user in the model [0,1] (default 0.99)" ) parser.add_argument( "-be", "--beta", default=0.0, type=float, help= "Specifies the second-order preference of the user in the model [0,1] (default 0)." ) parser.add_argument( "-cp", "--cleavage_prediction", default="PCM", help= "Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]" ) parser.add_argument( "-ep", "--epitope_prediction", default="Syfpeithi", help= "Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]" ) parser.add_argument( "-thr", "--threshold", default=20, type=float, help= "Specifies epitope prediction threshold for SYFPEITHI (default 20).") parser.add_argument("-o", "--output", required=True, help="Specifies the output file.") parser.add_argument( "-t", "--threads", type=int, default=None, help= "Specifies number of threads. If not specified all available logical cpus are used." ) parser.add_argument( "--ips-solver", default="cplex", choices=["cplex", "cbc"], help= "Executable name of the IPS solver. Executable needs to be available in PATH." 
) parser.add_argument("--tsp-solution", default="approximate", choices=["approximate", "optimal"], help="Type of solution of the TSP") parser.add_argument( "--random-order", action="store_true", help= "Indicate whether to generate a random ordered string-of-beads polypeptide" ) parser.add_argument( "--seed", type=int, default=1, help="Seed for random ordering of string-of-beads polypeptide") args = parser.parse_args() #parse input peptides = list(FileReader.read_lines(args.input, in_type=Peptide)) #read in alleles alleles = generate_alleles(args.alleles) if args.cleavage_prediction.upper() not in [ "PCM", "PROTEASMM_C", "PROTEASMM_S" ]: print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S" sys.exit(-1) if args.epitope_prediction.upper() not in [ "SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC" ]: print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC" sys.exit(-1) #set-up model cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction) epi_pred = EpitopePredictorFactory(args.epitope_prediction) thr = {a.name: args.threshold for a in alleles} solver = EpitopeAssemblyWithSpacer(peptides, cl_pred, epi_pred, alleles, k=args.max_length, en=9, threshold=thr, solver=args.ips_solver, alpha=args.alpha, beta=args.beta, verbosity=0) #solve #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n threads = mp.cpu_count() if args.threads is None else args.threads if args.tsp_solution == "approximate": svbws = solver.approximate(threads=threads, options={ "preprocessing_presolve": "n", "threads": 1 }) else: svbws = solver.solve(threads=threads, options={ "preprocessing_presolve": "n", "threads": 1 }) # Generate random ordered string-of-breads, but still uses optimal spacers # determined from the above solve function. 
if args.random_order: print "Generating a randomly ordered polypeptide" random.seed(args.seed) random_order_sob = [] random.shuffle(peptides) for i in range(len(peptides)): # Break from loop once we hit the last peptide if i == len(peptides) - 1: random_order_sob.extend([Peptide(str(peptides[i]))]) break left_peptide = str(peptides[i]) right_peptide = str(peptides[i + 1]) opt_spacer = solver.spacer[(left_peptide, right_peptide)] # Right peptide gets added in the next iteration random_order_sob.extend( [Peptide(left_peptide), Peptide(opt_spacer)]) svbws = random_order_sob print print "Resulting String-of-Beads: ", "-".join(map(str, svbws)) print with open(args.output, "w") as f: f.write("-".join(map(str, svbws)))