示例#1
0
文件: TestIO.py 项目: FRED-2/Fred2
 def test_read_vcf(self):
     #general
     vcfvars, accessions = FileReader.read_vcf(self.vcf_path1)
     self.assertEqual(len(vcfvars), 17)
     #no annotations
     self.assertWarnings(UserWarning, FileReader.read_vcf, self.vcf_path2)
     #variation types
     vcfvars, accessions = FileReader.read_vcf(self.vcf_path3)
     self.assertEqual(len(vcfvars), 5) #stopgaininsertion will yield no variant
     self.assertEqual(vcfvars[0].coding['NM_014675'].cdsMutationSyntax, 'c.6026C>G')
     self.assertEqual(vcfvars[1].coding['NM_015237'].cdsMutationSyntax, 'c.1225C>T')
     self.assertEqual(vcfvars[2].coding['NM_014826'].cdsMutationSyntax, 'c.2091delG')
     self.assertEqual(vcfvars[2].coding['NM_003607'].cdsMutationSyntax, 'c.2334delG')
     self.assertEqual(vcfvars[3].coding['NM_002318'].cdsMutationSyntax, 'c.142_162delCAGGCCCCCGCCAACGTGGCC')
     self.assertEqual(vcfvars[4].coding['NM_152888'].cdsMutationSyntax, 'c.2086G>T')
def main(input_peptides, input_vaccine_sequences, output_vaccine_epitopes,
         verbose, log_file):
    ''' Reads the vaccine produced by Fischer's online tool and converts it into epitopes
    '''
    global LOGGER
    LOGGER = utilities.init_logging(verbose, log_file, log_append=False)

    LOGGER.info('Reading peptides...')
    with open(input_peptides) as f:
        peptides = set(r['peptide'] for r in csv.DictReader(f))
    LOGGER.info('Read %d peptides', len(peptides))

    LOGGER.info('Reading vaccine...')
    mosaics = FileReader.read_fasta(input_vaccine_sequences, in_type=Protein)
    LOGGER.info('Vaccine has %d mosaic(s)', len(mosaics))

    with open(output_vaccine_epitopes, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('cocktail', 'index', 'epitope'))

        for c, mos in enumerate(mosaics):
            pep_count = unk_count = 0
            for i in range(0, len(mos) - 8):
                pep = mos[i:i + 9]
                assert len(pep) == 9

                if pep in peptides:
                    writer.writerow((c, pep_count, pep))
                    pep_count += 1
                else:
                    unk_count += 1

            LOGGER.info('Mosaic %d - Recognized: %d Unknown %d', c + 1,
                        pep_count, unk_count)
示例#3
0
def read_annotated_proteins(proteins_file):
    ''' Reads proteins from a fasta file and extracts their metadata from the header.
        Currently follows the format of the HIV database
    '''
    proteins = FileReader.read_fasta(proteins_file, in_type=Protein)
    for prot in proteins:
        parts = prot.transcript_id.split('.')
        prot.transcript_id = parts[-1]
    return proteins
示例#4
0
 def test_read_lines(self):
     alleles = FileReader.read_lines(self.ale_path, in_type=Allele)
     self.assertEqual(len(alleles), 2)
     self.assertRaises(IOError,
                       FileReader.read_lines,
                       self.ale_no_path,
                       in_type=Allele)
     self.assertRaises(ValueError,
                       FileReader.read_lines,
                       self.ale_zonk_path,
                       in_type=Allele)
示例#5
0
 def test_read_vcf(self):
     #general
     vcfvars, accessions = FileReader.read_vcf(self.vcf_path1)
     self.assertEqual(len(vcfvars), 17)
     #no annotations
     self.assertWarnings(UserWarning, FileReader.read_vcf, self.vcf_path2)
     #variation types
     vcfvars, accessions = FileReader.read_vcf(self.vcf_path3)
     self.assertEqual(len(vcfvars),
                      5)  #stopgaininsertion will yield no variant
     self.assertEqual(vcfvars[0].coding['NM_014675'].cdsMutationSyntax,
                      'c.6026C>G')
     self.assertEqual(vcfvars[1].coding['NM_015237'].cdsMutationSyntax,
                      'c.1225C>T')
     self.assertEqual(vcfvars[2].coding['NM_014826'].cdsMutationSyntax,
                      'c.2091delG')
     self.assertEqual(vcfvars[2].coding['NM_003607'].cdsMutationSyntax,
                      'c.2334delG')
     self.assertEqual(vcfvars[3].coding['NM_002318'].cdsMutationSyntax,
                      'c.142_162delCAGGCCCCCGCCAACGTGGCC')
     self.assertEqual(vcfvars[4].coding['NM_152888'].cdsMutationSyntax,
                      'c.2086G>T')
示例#6
0
def main():
    parser = argparse.ArgumentParser(
        description=
        """The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of 
arising neo-epitopes is reduced. """)
    parser.add_argument("-i",
                        "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)")
    parser.add_argument(
        "-a",
        "--alleles",
        required=True,
        help=
        "Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)"
    )

    #parameters of the model
    parser.add_argument(
        "-k",
        "--max_length",
        default=6,
        type=int,
        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument(
        "-al",
        "--alpha",
        default=0.99,
        type=float,
        help=
        "Specifies the first-order preference of the user in the model [0,1] (default 0.99)"
    )
    parser.add_argument(
        "-be",
        "--beta",
        default=0.0,
        type=float,
        help=
        "Specifies the second-order preference of the user in the model [0,1] (default 0)."
    )

    parser.add_argument(
        "-cp",
        "--cleavage_prediction",
        default="PCM",
        help=
        "Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]"
    )
    parser.add_argument(
        "-ep",
        "--epitope_prediction",
        default="Syfpeithi",
        help=
        "Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]"
    )
    parser.add_argument(
        "-thr",
        "--threshold",
        default=20,
        type=float,
        help=
        "Specifies epitope prediction threshold for SYFPEITHI (default 20).")

    parser.add_argument("-o",
                        "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument(
        "-t",
        "--threads",
        type=int,
        default=None,
        help=
        "Specifies number of threads. If not specified all available logical cpus are used."
    )

    parser.add_argument(
        "--ips-solver",
        default="cplex",
        choices=["cplex", "cbc"],
        help=
        "Executable name of the IPS solver. Executable needs to be available in PATH."
    )

    parser.add_argument("--tsp-solution",
                        default="approximate",
                        choices=["approximate", "optimal"],
                        help="Type of solution of the TSP")

    parser.add_argument(
        "--random-order",
        action="store_true",
        help=
        "Indicate whether to generate a random ordered string-of-beads polypeptide"
    )

    parser.add_argument(
        "--seed",
        type=int,
        default=1,
        help="Seed for random ordering of string-of-beads polypeptide")

    args = parser.parse_args()

    #parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in [
            "PCM", "PROTEASMM_C", "PROTEASMM_S"
    ]:
        print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S"
        sys.exit(-1)

    if args.epitope_prediction.upper() not in [
            "SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"
    ]:
        print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC"
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name: args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,
                                       cl_pred,
                                       epi_pred,
                                       alleles,
                                       k=args.max_length,
                                       en=9,
                                       threshold=thr,
                                       solver=args.ips_solver,
                                       alpha=args.alpha,
                                       beta=args.beta,
                                       verbosity=0)

    #solve
    #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem
    #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads

    if args.tsp_solution == "approximate":
        svbws = solver.approximate(threads=threads,
                                   options={
                                       "preprocessing_presolve": "n",
                                       "threads": 1
                                   })
    else:
        svbws = solver.solve(threads=threads,
                             options={
                                 "preprocessing_presolve": "n",
                                 "threads": 1
                             })

    # Generate random ordered string-of-breads, but still uses optimal spacers
    # determined from the above solve function.
    if args.random_order:
        print "Generating a randomly ordered polypeptide"
        random.seed(args.seed)
        random_order_sob = []
        random.shuffle(peptides)
        for i in range(len(peptides)):

            # Break from loop once we hit the last peptide
            if i == len(peptides) - 1:
                random_order_sob.extend([Peptide(str(peptides[i]))])
                break

            left_peptide = str(peptides[i])
            right_peptide = str(peptides[i + 1])
            opt_spacer = solver.spacer[(left_peptide, right_peptide)]

            # Right peptide gets added in the next iteration
            random_order_sob.extend(
                [Peptide(left_peptide),
                 Peptide(opt_spacer)])

        svbws = random_order_sob

    print
    print "Resulting String-of-Beads: ", "-".join(map(str, svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str, svbws)))
示例#7
0
文件: TestIO.py 项目: Al3n70rn/Fred2
 def test_read_annovar_exonic(self):
     ano = FileReader.read_annovar_exonic(self.ano_path)
     self.assertEqual(len(ano), 5)
示例#8
0
文件: TestIO.py 项目: Al3n70rn/Fred2
 def test_read_fasta(self):
     seqs = FileReader.read_fasta(self.fa_path)
     self.assertEqual(len(seqs), 2)
     seqs = FileReader.read_fasta(self.fa_unconventional_path)  # no "|"
     self.assertEqual(len(seqs), 174)
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V', '--variations', dest="var_file", help='<Required> full path to the input variations', required=True)
    parser.add_argument('-o', "--outfile", dest="outfile_path", help="Created fasta file", required=True)
    parser.add_argument('-d', "--digest", dest="digest", type=int, help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a', "--alleles", dest="alleles", help="Input alleles for prediction")
    parser.add_argument('-p', "--predict", dest="predict_with", help="Method of prediction, needs alleles & length, allowed:[{m}]".format(m=PRED_METH))
    parser.add_argument('-f', "--filter", dest="filter", type=float, help="Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict")
    parser.add_argument('-P', "--Proteins", dest="only_proteins", action='store_true', help="Will write only proteins.")
    parser.add_argument('-b', "--base", dest="basefasta_path", help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(filename=os.path.splitext(options.outfile_path)[0] + "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
                        filemode='w+', level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path + " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

# MyObject = type('MyObject', (object,), {})
# options = MyObject()
# setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
#
# vt = os.path.splitext(options.var_file)[-1]
# if ".vcf" == vt:
#     vcfvars, accessions = FileReader.read_vcf(options.var_file)
#
# mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
#
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# transcripts = [x for x in transcript_gen if x.vars]
# transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
# protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
# proteins = [x for x in protein_gen if x.vars]
# for p in proteins:
#     p.gene_id = p.vars.values()[0][0].gene
#
#
# for t in transcripts:
#     t.gene_id = t.vars.values()[0].gene
#

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values()[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [x for x in peptides if any(x.get_variants_by_protein(y) for y in x.proteins.keys())]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
示例#10
0
def __main__():
    parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. 
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""", version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str)
    parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")
    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'}
    global transcriptProteinMap
    global transcriptSwissProtMap

    '''read in variants or peptides'''
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith('.GSvar') or args.somatic_mutations.endswith('.tsv'):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts, metadata = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith(".fasta") or filename.endswith(".fsa"): 
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [ (row[0], str(row[1][1:])) for row in csv.reader(versions_file, delimiter = "\t") ]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))
        # get for each selected method the corresponding tool version
        methods = { method:version for tool, version in tool_version for method in selected_methods if tool.lower() in method.lower() }

    for method, version in methods.items():
        if version not in EpitopePredictorFactory.available_methods()[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

    # MHC class I or II predictions
    if args.mhcclass is 1:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)
    else:
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except:
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")

    # replace method names with method names with version
    # complete_df.replace({'method': methods}, inplace=True)
    complete_df['method'] = complete_df['method'].apply(lambda x : x + '-' + methods[x] )

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    # Change the order (the index) of the columns
    else:
        columns_tiles = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []

    for i, r in complete_df.iterrows():
        binder = False
        for c in binder_cols:
            if r[c] is True:
                binder = True
                continue
        if binder:
            binders.append(str(r['sequence']))
            pos_predictions.append(str(r['sequence']))
        else:
            neg_predictions.append(str(r['sequence']))
            if str(r['sequence']) not in binders:
                non_binders.append(str(r['sequence']))
    
    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = protein_quant[protein_quant.keys()[0]]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)
        
    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                gene_id_in_df = complete_df.iloc[1]['gene']
                if 'ENSG' in gene_id_in_df:
                    gene_id_lengths[ids[0]] = float(ids[2].strip())
                else:
                    gene_id_lengths[ids[1]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1)

        if args.wild_type != None:
            complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
            for p in proteins:
                variants = []
                for v in p.vars:
                    variants = variants + p.vars[v]
                c = [x.coding.values() for x in variants]
                cf = list(itertools.chain.from_iterable(c))
                cds = ','.join([y.cdsMutationSyntax for y in set(cf)])
                aas = ','.join([y.aaMutationSyntax for y in set(cf)])
                protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv
    complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), '\t', index=False)

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
    
    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
示例#11
0
def __main__():
    parser = argparse.ArgumentParser(version=VERSION)
    parser.add_argument('-V',
                        '--variations',
                        dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o',
                        "--outfile",
                        dest="outfile_path",
                        help="Created fasta file",
                        required=True)
    parser.add_argument(
        '-d',
        "--digest",
        dest="digest",
        type=int,
        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a',
                        "--alleles",
                        dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument(
        '-p',
        "--predict",
        dest="predict_with",
        help="Method of prediction, needs alleles & length, allowed:[{m}]".
        format(m=PRED_METH))
    parser.add_argument(
        '-f',
        "--filter",
        dest="filter",
        type=float,
        help=
        "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict"
    )
    parser.add_argument('-P',
                        "--Proteins",
                        dest="only_proteins",
                        action='store_true',
                        help="Will write only proteins.")
    parser.add_argument(
        '-b',
        "--base",
        dest="basefasta_path",
        help="If given, entries are replaced by the variation.")

    options = parser.parse_args()
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if options.filter and not options.predict_with:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print "Need alleles with predict option, aborting!"
        sys.exit(1)

    temp_dir = "/tmp/"

    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0] +
        "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path +
                 " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    #... look at theos filter, ligandoqc, fasta-distributions, lica and the morgenstellen server conten scripts
    # complete proteins?
    # only containing binders?
    # k-mers?
    # binders only?
    # FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

    # MyObject = type('MyObject', (object,), {})
    # options = MyObject()
    # setattr(options,"var_file","/home/walzer/immuno-tools/Fred2/Fred2/Data/examples/vcftestfile3.vcf")
    #
    # vt = os.path.splitext(options.var_file)[-1]
    # if ".vcf" == vt:
    #     vcfvars, accessions = FileReader.read_vcf(options.var_file)
    #
    # mart_db = MartsAdapter(biomart="http://grch37.ensembl.org")
    #
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # transcripts = [x for x in transcript_gen if x.vars]
    # transcript_gen = g.generate_transcripts_from_variants(vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)
    # protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    # proteins = [x for x in protein_gen if x.vars]
    # for p in proteins:
    #     p.gene_id = p.vars.values()[0][0].gene
    #
    #
    # for t in transcripts:
    #     t.gene_id = t.vars.values()[0].gene
    #

    vt = os.path.splitext(options.var_file)[-1]
    if ".vcf" == vt:
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    elif ".GSvar" == vt:
        pass
        # vcfvars = FileReader.read_GSvar(options.var_file)
    else:
        m = "Could not read variants {f}, aborting.".format(f=options.var_file)
        logging.error(m)
        print m
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org"
                           )  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        p.gene_id = p.vars.values(
        )[0][0].gene  # assume gene name from first variant

    proteins = [p for p in proteins
                if not is_stop_gain(p)]  # kick out stop gains

    # First exit option
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta
            print "N/A"
            sys.exit(0)
        else:
            e = proteins_to_fasta(proteins)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # From now on, digestion must be set somehow
    if not options.digest:
        digest = 9
    else:
        digest = options.digest
    peptide_gen = g.generate_peptides_from_proteins(proteins, digest)
    peptides = [x for x in peptide_gen]
    peptides_var = [
        x for x in peptides if any(
            x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option
    if not (options.predict_with or options.filter):
        e = peptides_to_fasta(peptides_var)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(
            f=options.alleles)
        logging.error(m)
        print m, "what:", str(e)
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(
            f=options.predict_with)
        logging.error(m)
        print m
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        print "something went wrong with the prediction", options.inf, options.predict_with, "what:", str(
            e)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    #PRED_METH = set()
    for i, row in preds.iterrows():
        for j in i[1:]:
            i[0].log_metadata(j, dict(zip(row.index, row.values)))
            #PRED_METH.add(j)  # need that later

    # Third exit option
    if not options.filter:
        if options.only_proteins:
            if options.basefasta_path:
                # TODO - replace from base fasta plus prediction annotation
                print "N/A"
                sys.exit(0)
            else:
                prs = annotate_protein_from_peptides(preds)
                e = proteins_to_fasta(prs)
                with open(options.outfile_path, 'w') as f:
                    f.write(e)
                sys.exit(0)
        else:
            e = peptides_to_fasta(preds)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)

    # kick out nonbinder
    preds_f = preds[(preds > options.filter).any(axis=1)]

    # Fourth exit option
    if options.only_proteins:
        if options.basefasta_path:
            # TODO - replace from base fasta binders only plus prediction annotation
            print "N/A"
            sys.exit(0)
        else:
            prs = annotate_protein_from_peptides(preds_f)
            e = proteins_to_fasta(prs)
            with open(options.outfile_path, 'w') as f:
                f.write(e)
            sys.exit(0)
    else:
        e = peptides_to_fasta(preds_f)
        with open(options.outfile_path, 'w') as f:
            f.write(e)
        sys.exit(0)
示例#12
0
 def test_read_fasta(self):
     seqs = FileReader.read_fasta(self.fa_path)
     self.assertEqual(len(seqs), 2)
     seqs = FileReader.read_fasta(self.fa_unconventional_path)  # no "|"
     self.assertEqual(len(seqs), 174)
def vaccine(input_sequences, input_peptides, input_alleles, input_epitopes,
            input_vaccine, output_summary, verbose):
    # load vaccine
    with open(input_vaccine) as f:
        vaccine = {}
        for row in csv.DictReader(f):
            if row['cocktail'] not in vaccine:
                vaccine[row['cocktail']] = {}
            vaccine[row['cocktail']][int(row['index'])] = row['epitope']

        cocktail = []
        for mosaic in vaccine.values():
            ordered = sorted(mosaic.items(), key=lambda x: x[0])
            cocktail.append([e for _, e in ordered])
    LOGGER.info('Vaccine loaded')

    # load alleles
    allele_data = utilities.get_alleles_and_thresholds(input_alleles).to_dict(
        'index')
    LOGGER.info('Loaded %d alleles', len(allele_data))

    # load peptides coverage
    peptides = {}
    with open(input_peptides) as f:
        for row in csv.DictReader(f):
            peptides[row['peptide']] = row['proteins'].split(';')
    LOGGER.info('Loaded %d peptides with coverage', len(peptides))

    # load epitopes (also fill peptides since some design methods do not use epitopes)
    epitope_data = {
        pep: {
            'immunogen': 0.0,
            'alleles': [],
            'proteins': prots
        }
        for pep, prots in peptides.items()
    }
    with open(input_epitopes) as f:
        for row in csv.DictReader(f):
            row['immunogen'] = float(row['immunogen'])
            row['alleles'] = row['alleles'].split(
                ';') if row['alleles'] else []
            row['proteins'] = row['proteins'].split(';')
            if row['immunogen'] > 0:
                epitope_data[row['epitope']] = row
    LOGGER.info('Loaded %d epitopes', len(epitope_data))

    # load sequences
    proteins = FileReader.read_fasta(input_sequences, in_type=Protein)
    LOGGER.info('Loaded %d proteins', len(proteins))

    # print stats for each mosaic
    for i, mosaic in enumerate(cocktail):
        LOGGER.info('---')
        LOGGER.info('Mosaic #%d - %d epitopes', i + 1, len(mosaic))
        for epi in mosaic:
            LOGGER.info('    %s', epi)
        evaluate_epitopes(mosaic, epitope_data, allele_data, len(proteins))

    # write csv
    LOGGER.info('---')
    vaccine_stats = evaluate_epitopes(
        [epi for mosaic in cocktail for epi in mosaic], epitope_data,
        allele_data, len(proteins))
    with open(output_summary, 'w') as f:
        writer = csv.DictWriter(f, vaccine_stats.keys())
        writer.writeheader()
        writer.writerow(vaccine_stats)
示例#14
0
def get_mosaic_solver_instance(logger, input_proteins, input_alleles,
                               input_epitopes, input_overlaps, **kwargs):
    top_immunogen = kwargs.pop('top_immunogen')
    top_alleles = kwargs.pop('top_alleles')
    top_proteins = kwargs.pop('top_proteins')
    min_overlap = kwargs.get('min_overlap', 0)
    cocktail = kwargs.get('cocktail', 1)
    greedy_subtour = kwargs.get('greedy_subtour')
    max_epitopes = kwargs.get('max_epitopes')
    max_aminoacids = kwargs.get('max_aminoacids')
    min_alleles = kwargs.get('min_alleles', 0)
    min_proteins = kwargs.get('min_proteins', 0)
    min_avg_prot_conservation = kwargs.get('min_avg_prot_conservation', 0)
    min_avg_alle_conservation = kwargs.get('min_avg_alle_conservation', 0)

    # load proteins
    logger.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    logger.info('%d proteins read', len(proteins))

    # load alleles
    alleles = [
        Allele(a) for a in get_alleles_and_thresholds(input_alleles).index
    ]
    logger.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitope_data = list(
        load_epitopes(input_epitopes, top_immunogen, top_alleles,
                      top_proteins).values())
    logger.info('Loaded %d epitopes', len(epitope_data))

    # load edge cost
    logger.info('Loading overlaps...')
    vertex_rewards = [0] + [b['immunogen'] for b in epitope_data]
    edges = load_edges_from_overlaps(input_overlaps, min_overlap,
                                     [b['epitope'] for b in epitope_data])
    logger.info('Kept %d edges (from %d)', len(edges),
                len(epitope_data) * (len(epitope_data) + 1))

    # compute hla and protein coverage
    logger.info('Computing coverage matrix...')
    type_coverage, min_type_coverage, min_avg_type_conservation = compute_coverage_matrix(
        epitope_data, min_alleles, min_proteins, min_avg_prot_conservation,
        min_avg_alle_conservation, len(proteins), len(alleles))

    # find optimal design
    solver = TeamOrienteeringIlp(
        num_teams=cocktail,
        vertex_reward=vertex_rewards,
        edge_cost=edges,
        max_edge_cost=0,
        max_vertices=0,
        lazy_subtour_elimination=not greedy_subtour,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
    )

    if isinstance(max_epitopes, (int, float)):
        solver.update_max_vertices(max_epitopes)

    if isinstance(max_aminoacids, (int, float)):
        solver.update_max_edge_cost(max_aminoacids)

    return solver, {
        'proteins': proteins,
        'alleles': alleles,
        'epitope_data': epitope_data,
    }
示例#15
0
def main():
    parser = argparse.ArgumentParser(description="Reads protein or peptide sequences and predicts peptides "+
                                                 "for a specified prediction method and HLA alleles.")
    parser.add_argument("-i", "--input",
                        nargs="+",
                        required=True,
                        help="Input data can be RefSeq ID, UniProt ID, fasta file, peptide file (one peptide per line),"
                             +" or peptide sequences as sequences (max 50)"
                        )
    input_types = parser.add_mutually_exclusive_group(required=True)
    input_types.add_argument("-r","--refseq",
                             action="store_true",
                             help= "Specifies the input as RefSeq IDs")
    input_types.add_argument("-u","--uniprot",
                             action="store_true",
                             help= "Specifies the input as UniProt IDs")
    input_types.add_argument("-f","--fasta",
                             action="store_true",
                             help= "Specifies the input as protein (multi-)Fasta file")
    input_types.add_argument("-pf","--pepfile",
                             action="store_true",
                             help= "Specifies the input as peptide file")
    input_types.add_argument("-p","--peptide",
                             action="store_true",
                             help= "Specifies the input as peptide sequences")
    parser.add_argument("-a", "--alleles",
                        nargs="+",
                        required=True,
                        help="Specifies for which alleles prediction should be made. " +
                             "Input either can be alleles as string (new nomenclature), or a file with one allele per line.")
    allele_types = parser.add_mutually_exclusive_group(required=True)
    allele_types.add_argument("-af", "--allelefile",
                               action="store_true",
                               help="Specifies the allele input as allele file.")
    allele_types.add_argument("-as", "--allelestring",
                               action="store_true",
                               help="Specifies the allele input as allele string.")
    parser.add_argument("-m", "--method",
                       required=True,
                       nargs="+",
                       help="Specifies the method used for prediction.")
    parser.add_argument("-l", "--length",
                        required=False,
                        type=int,
                        default=9,
                        help="Specifies the length of the peptides (default=9).")
    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output path. Results will be written to CSV")
    parser.add_argument("-am", "--available",
                        required=False,
                        action="store_true",
                        help="Returns all available methods and their allele models.")

    #COMMENT: These options are hidden and only used for ETK2
    parser.add_argument("-html", "--html",
                        required=False,
                        action="store_true",
                        help=argparse.SUPPRESS)
    parser.add_argument("-od", "--outdir",
                        required=False,
                        default="",
                        help=argparse.SUPPRESS)
    args = parser.parse_args()

    if args.available:
        for pred, obj in AEpitopePrediction.registry.iteritems():
            if pred not in ["AEpitopePrediction", "APSSMEpitopePredictor", "ANetMHC", "ASVMEpitopePrediction"]:
                print "Method: ",pred
                print "Supported Alleles: ", " ".join(getattr(obj, "_"+pred+"__alleles" ))
                print "Supported Length: ", " ".join(map(str, getattr(obj,  "_"+pred+"__supported_length")))
                print
        sys.exit(0)


    '''
    Parser Input
    '''
    #RefSeq
    if args.refseq:
        pass

    #UniProt
    elif args.uniprot:
        pass

    #fasta protein
    elif args.fasta:
        proteins = FileReader.read_fasta(args.input, type="Protein")
        peptides = generate_peptides_from_protein(proteins, args.length)

    elif args.pepfile:
        peptides = FileReader.read_lines(args.input, type="Peptide")

    elif args.peptide:
        peptides = [Peptide(s) for s in args.input]

    #read in alleles
    if args.allelefile:
        alleles = FileReader.read_lines(args.alleles, type="Allele")
    else:
        alleles = [Allele(a.upper()) for a in args.alleles]

    result = [EpitopePredictorFactory(m).predict(peptides, alleles) for m in args.method]
    r_df = result.pop()
    for r in result:
        r_df_a, r_a = r_df.align(r, fill_value=0)
        r_df = r_df_a + r_a

    output = args.output if args.outdir == "" else args.outdir + os.path.basename(args.output)
    with open(output, "w") as out:
        r_df.to_csv(out)



    #generate Galaxy HTML output
    if args.html:
        begin_html = """<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <link rel="stylesheet" href="/static/style/blue/etk.css" type="text/css" />
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.js"></script>
    <script type="text/javascript" src="/static/scripts/packed/libs/jquery/jquery.tablesorter.js"></script>
    <script type="text/javascript" src="/static/scripts/libs/etk.js"></script>
</head>

<body>
    <div class="document">"""

        setting = """  <h2 class="etk-heading">Epitope Prediction Results</h2>

        <table class="etk-parameterT">
            <tr> <th class ="etk-innerHeading" colspan="2"> Parameters </th></tr>
            <tr>
                <th>Prediction Method:</th>
                <td>%s</td>
            </tr>
        </table>"""%args.method



        table="""

        <input id="etk-search" placeholder="  filter">
        <table class="etk-sortT etk-resultsT etk-filterT">

            <thead>
                <tr>
                    <th>Peptide</th>"""+"".join("<th>%s</th>"%str(a) for a in result.columns) \
            +"""
                </tr>
            </thead>"""+"".join("<tr><td>%s<td>%s</tr>"%(r[0] ,"".join("<td align='right'>%s</td>"%str(result.loc[r, c])))
                                for r in result.index for c in result.columns)+"</table>"

        end_html = "</div></body></html>"

        html_out = ".".join(output.split(".")[:-1])+".html"
        with open(html_out, "w") as html_o:
            html_o.write(begin_html+setting+table+end_html)
示例#16
0
def main():
    parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of 
arising neo-epitopes is reduced. """)
    parser.add_argument("-i", "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)"
                        )
    parser.add_argument("-a", "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)"
                        )

    #parameters of the model
    parser.add_argument("-k","--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument("-al","--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument("-be","--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")
    parser.add_argument("-thr","--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")


    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument("-t", "--threads",
    					type=int,
    					default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")


    args = parser.parse_args()


  	#parse input
    peptides = FileReader.read_lines(args.input, type="Peptide")
    #read in alleles
    alleles = generate_alleles(args.alleles)


    #set-up model
    cl_pred = CleavageSitePredictorFactory("PCM")
    epi_pred = EpitopePredictorFactory("Syfpeithi")

    thr = {a.name:args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,cl_pred,epi_pred,alleles,
                                      k=args.max_length,en=9,threshold=thr,
                                      solver="cplex", alpha=args.alpha, beta=args.beta,
                                      verbosity=0)

    #solve
    #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem
    #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads
    svbws = solver.approximate(threads=threads,options="preprocessing_presolve=n,threads=1")

    print
    print "Resulting String-of-Beads: ","-".join(map(str,svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str,svbws)))
示例#17
0
 def test_read_annovar_exonic(self):
     ano = FileReader.read_annovar_exonic(self.ano_path)
     self.assertEqual(len(ano), 5)
示例#18
0
def __main__():
    parser = argparse.ArgumentParser(
        "Write out information about supported models by Fred2 for installed predictor tool versions."
    )
    parser.add_argument('-p',
                        "--peptides",
                        help="File with one peptide per line")
    parser.add_argument('-c',
                        "--mhcclass",
                        default=1,
                        help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-a',
                        "--alleles",
                        help="<Required> MHC Alleles",
                        required=True)
    parser.add_argument('-t',
                        '--tools',
                        help='Tools requested for peptide predictions',
                        required=True,
                        type=str)
    parser.add_argument('-v',
                        '--versions',
                        help='<Required> File with used software versions.',
                        required=True)
    args = parser.parse_args()

    selected_methods = [item for item in args.tools.split(',')]
    with open(args.versions, 'r') as versions_file:
        tool_version = [(row[0], str(row[1][1:]))
                        for row in csv.reader(versions_file, delimiter="\t")]
        # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
        tool_version.append(('syfpeithi', '1.0'))  # how to handle this?
        # get for each method the corresponding tool version
        methods = {
            method: version
            for tool, version in tool_version for method in selected_methods
            if tool.lower() in method.lower()
        }

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    peptide_lengths = []
    if (args.peptides):
        peptides = read_peptide_input(args.peptides)
        peptide_lengths = set([len(pep) for pep in peptides])
    else:
        peptide_lengths = range(args.min_length, args.max_length + 1)

    with open("model_report.txt", 'w') as output:
        # check if requested tool versions are supported
        for method, version in methods.items():
            if version not in EpitopePredictorFactory.available_methods(
            )[method]:
                raise ValueError("The specified version " + version + " for " +
                                 method + " is not supported by Fred2.")

        # check if reuested alleles are supported
        support_all_alleles = True
        no_allele_support = True
        for a in alleles:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)

                if a not in sorted(predictor.supportedAlleles):
                    output.write("Allele " + convert_allele_back(a) +
                                 " is not supported by " + method + " " +
                                 version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write(
                    "Allele " + convert_allele_back(a) +
                    " is not supported by any of the requested tools.\n")
                logger.warning(
                    "Allele " + convert_allele_back(a) +
                    " is not supported by any of the requested tools.")
                support_all_alleles = False
            else:
                no_allele_support = False
        if support_all_alleles:
            output.write(
                "All selected alleles are supported by at least one of the requested tools.\n"
            )
        if no_allele_support:
            output.write(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the specified alleles is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )

        output.write("\n")
        # check if reuested lengths are supported
        support_all_lengths = True
        no_length_support = True
        for l in peptide_lengths:
            supported = False
            for method, version in methods.items():
                predictor = EpitopePredictorFactory(method, version=version)

                if l not in sorted(predictor.supportedLength):
                    output.write("Peptide length " + str(l) +
                                 " is not supported by " + method + " " +
                                 version + ".\n")
                else:
                    supported = True

            if not supported:
                output.write(
                    "Peptide length " + str(l) +
                    " is not supported by any of the requested tools.\n")
                logger.warning(
                    "Peptide length " + str(l) +
                    " is not supported by any of the requested tools.")
                support_all_lengths = False
            else:
                no_length_support = False
        if support_all_lengths:
            output.write(
                "All selected or provided peptide lengths are supported by at least one of the requested tools.\n"
            )
        if no_length_support:
            output.write(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models.\n"
            )
            raise ValueError(
                "None of the peptide lengths is supported by any of the requested tools. Specify '--show_supported_models' to write out all supported models."
            )
def extract_peptides(input_sequences, max_edits, output_peptides, top_n):
    ''' Extract peptides from the given sequences and computes protein coverage for each peptide.
        Coverage can be computed allowing for inexact matching.

        In other words, it first generates all peptides that appear in the input proteins,
        and stores which proteins each peptide appears in. Then, for every peptide, it
        finds all peptides that can be obtained by changing at most max-edits aminoacids,
        and counts the proteins that contain the edited peptides.
    '''
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_sequences, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    LOGGER.info('Extracting protein coverage for each peptide...')
    all_peptides = utilities.Trie()
    proteins_by_peptide = {}

    for i, prot in enumerate(proteins):
        aminoacids = ''.join(
            c for c in prot._data
            if c.isalpha())  # remove non-aminoacids from alignment
        peptides_in_this_protein = set(
        )  # make sure we only count peptides once per protein
        for j in range(len(aminoacids) - 8):
            seq = str(aminoacids[j:j + 9])
            if seq not in peptides_in_this_protein:
                peptides_in_this_protein.add(seq)
                all_peptides.insert(seq)
                if seq not in proteins_by_peptide:
                    proteins_by_peptide[seq] = set()
                proteins_by_peptide[seq].add(i)

        if utilities.is_percent_barrier(i, len(proteins), 5):
            LOGGER.debug(
                '%d proteins analyzed (%.2f%%) and %d peptides extracted...',
                i + 1, 100 * (i + 1) / len(proteins), len(proteins_by_peptide))

    LOGGER.info('Computing reachability...')
    top_peptides = []
    with open(output_peptides, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('peptide', 'proteins'))

        for i, peptide in enumerate(proteins_by_peptide):
            # find reachable peptides and which proteins they belong to
            reachable_proteins = set()
            for reachable, edits in all_peptides.reachable_strings(
                    peptide, max_edits):
                reachable_proteins.update(proteins_by_peptide[reachable])

            # now either update the top N or save reachability to file
            if top_n > 0:
                heapq.heappush(
                    top_peptides,
                    (len(reachable_proteins), peptide, reachable_proteins))
                if len(top_peptides) > top_n:
                    heapq.heappop(top_peptides)
            else:
                writer.writerow(
                    (peptide, ';'.join(list(map(str, reachable_proteins)))))

            if utilities.is_percent_barrier(i, len(proteins_by_peptide), 2.5):
                LOGGER.debug('%d peptides analyzed (%.2f%%)...', i + 1,
                             100 * (i + 1) / len(proteins_by_peptide))

        # save the top N to file
        if top_n > 0:
            LOGGER.info('Saving top peptides to file')
            for _, peptide, proteins in top_peptides:
                writer.writerow(
                    (peptide, ','.join(list(map(str, reachable_proteins)))))
示例#20
0
def __main__():
    parser = argparse.ArgumentParser(
        description=
        """EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. 
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""",
        version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p',
                        "--peptides",
                        help="File with one peptide per line")
    parser.add_argument('-c',
                        "--mhcclass",
                        default="I",
                        help="MHC class I or II")
    parser.add_argument('-l', "--length", help="Maximum peptide length")
    parser.add_argument('-a',
                        "--alleles",
                        help="<Required> MHC Alleles",
                        required=True)
    parser.add_argument(
        '-r',
        "--reference",
        help=
        "Reference, retrieved information will be based on this ensembl version",
        required=False,
        default='GRCh37',
        choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f',
                        "--filter_self",
                        help="Filter peptides against human proteom",
                        required=False,
                        action='store_true')
    parser.add_argument(
        '-wt',
        "--wild_type",
        help="Add wild type sequences of mutated peptides to output",
        required=False,
        action='store_true')
    parser.add_argument('-rp',
                        "--reference_proteome",
                        help="Reference proteome for self-filtering",
                        required=False)
    parser.add_argument('-gr',
                        "--gene_reference",
                        help="List of gene IDs for ID mapping.",
                        required=False)
    parser.add_argument('-pq',
                        "--protein_quantification",
                        help="File with protein quantification values")
    parser.add_argument('-ge',
                        "--gene_expression",
                        help="File with expression analysis results")
    parser.add_argument(
        '-de',
        "--diff_gene_expression",
        help="File with differential expression analysis results (DESeq2)")
    parser.add_argument(
        '-li',
        "--ligandomics_id",
        help=
        "Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run."
    )
    parser.add_argument('-o',
                        "--output_dir",
                        help="All files written will be put in this directory")

    args = parser.parse_args()

    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    if args.output_dir is not None:
        try:
            os.chdir(args.output_dir)
            logging.basicConfig(filename=os.path.join(
                args.output_dir, '{}_prediction.log'.format(args.identifier)),
                                filemode='w+',
                                level=logging.DEBUG)
            logging.info("Using provided data directory: {}".format(
                str(args.output_dir)))
        except:
            logging.info("No such directory, using current.")
    else:
        logging.basicConfig(filename='{}_prediction.log'.format(
            args.identifier),
                            filemode='w+',
                            level=logging.DEBUG)
        logging.info("Using current data directory.")

    logging.info("Starting predictions at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }
    global transcriptProteinMap
    global transcriptSwissProtMap
    '''read in variants or peptides'''
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith(
                '.GSvar') or args.somatic_mutations.endswith('.tsv'):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        elif args.somatic_mutations.endswith('.vcf'):
            vl, transcripts = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(
            ID_SYSTEM_USED, transcripts, references[args.reference],
            args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logging.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith(".fasta") or filename.endswith(".fsa"):
                    up_db.read_seqs(
                        os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    # MHC class I or II predictions
    if args.mhcclass == "I":
        #methods = ['netmhc-4.0', 'syfpeithi-1.0', 'netmhcpan-3.0']
        methods = ['syfpeithi-1.0']
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(
                peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(
                vl, methods, alleles, 8,
                int(args.length) + 1, ma, up_db, args.identifier, metadata,
                transcriptProteinMap)
    else:
        methods = ['netmhcII-2.2', 'syfpeithi-1.0', 'netmhcIIpan-3.1']
        if args.peptides:
            pred_dataframes, statistics = make_predictions_from_peptides(
                peptides, methods, alleles, up_db, args.identifier, metadata)
        else:
            pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(
                vl, methods, alleles, 15,
                int(args.length) + 1, ma, up_db, args.identifier, metadata,
                transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except:
        complete_df = pd.DataFrame()
        logging.error("No predictions available.")

    # store version of used methods
    method_map = {}
    for m in methods:
        method_map[m.split('-')[0]] = m

    # replace method names with method names with version
    complete_df.replace({'method': method_map}, inplace=True)

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(
            lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        columns_tiles = [
            'sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene',
            'transcripts', 'proteins', 'variant type', 'method'
        ]
    # Change the order (the index) of the columns
    else:
        columns_tiles = [
            'sequence', 'length', 'chr', 'pos', 'gene', 'transcripts',
            'proteins', 'variant type', 'method'
        ]
    for c in complete_df.columns:
        if c not in columns_tiles:
            columns_tiles.append(c)
    complete_df = complete_df.reindex(columns=columns_tiles)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    binders = []
    non_binders = []
    pos_predictions = []
    neg_predictions = []

    for i, r in complete_df.iterrows():
        binder = False
        for c in binder_cols:
            if r[c] is True:
                binder = True
                continue
        if binder:
            binders.append(str(r['sequence']))
            pos_predictions.append(str(r['sequence']))
        else:
            neg_predictions.append(str(r['sequence']))
            if str(r['sequence']) not in binders:
                non_binders.append(str(r['sequence']))

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        first_entry = protein_quant[protein_quant.keys()[0]]
        for k in first_entry.keys():
            complete_df['{} log2 protein LFQ intensity'.format(
                k)] = complete_df.apply(
                    lambda row: create_quant_column_value_for_result(
                        row, protein_quant, transcriptSwissProtMap, k),
                    axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        with open(args.gene_reference, 'r') as gene_list:
            for l in gene_list:
                ids = l.split('\t')
                gene_id_in_df = complete_df.iloc[1]['gene']
                if 'ENSG' in gene_id_in_df:
                    gene_id_lengths[ids[0]] = float(ids[2].strip())
                else:
                    gene_id_lengths[ids[1]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 0, False),
            axis=1)
        complete_df['ligand intensity'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 1, False),
            axis=1)

        if args.wild_type != None:
            complete_df['wt ligand score'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 0, True),
                axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 1, True),
                axis=1)

    # write dataframe to tsv
    complete_df.fillna('')
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier),
                       '\t',
                       index=False)

    statistics['number_of_predictions'] = complete_df.shape[0]
    statistics['number_of_binders'] = len(pos_predictions)
    statistics['number_of_nonbinders'] = len(neg_predictions)
    statistics['number_of_unique_binders'] = list(set(binders))
    statistics['number_of_unique_nonbinders'] = list(
        set(non_binders) - set(binders))

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
示例#21
0
文件: TestIO.py 项目: Al3n70rn/Fred2
 def test_read_lines(self):
     alleles = FileReader.read_lines(self.ale_path, in_type=Allele)
     self.assertEqual(len(alleles), 2)
     self.assertRaises(IOError, FileReader.read_lines, self.ale_no_path, in_type=Allele)
     self.assertRaises(ValueError, FileReader.read_lines, self.ale_zonk_path, in_type=Allele)
示例#22
0
def main():
    parser = argparse.ArgumentParser(description="""The software is a novel approach to construct epitope-based string-of-beads
vaccines in optimal order and with sequence-optimized spacers of flexible length
such that the recovery of contained epitopes is maximized and immunogenicity of 
arising neo-epitopes is reduced. """)
    parser.add_argument("-i", "--input",
                        required=True,
                        help="File containing epitopes (one peptide per line)"
    )
    parser.add_argument("-a", "--alleles",
                        required=True,
                        help="Specifies file containing HLA alleles with corresponding HLA probabilities (one HLA per line)"
    )

    #parameters of the model
    parser.add_argument("-k","--max_length",
                        default=6,
                        type=int,
                        help="Specifies the max. length of the spacers (default 6)")
    parser.add_argument("-al","--alpha",
                        default=0.99,
                        type=float,
                        help="Specifies the first-order preference of the user in the model [0,1] (default 0.99)")
    parser.add_argument("-be","--beta",
                        default=0.0,
                        type=float,
                        help="Specifies the second-order preference of the user in the model [0,1] (default 0).")

    parser.add_argument("-cp","--cleavage_prediction",
                        default="PCM",
                        help="Specifies the used cleavage prediction method (default PCM) [available: PCM, PROTEASMM_C, PROTEASMM_S]"
    )
    parser.add_argument("-ep","--epitope_prediction",
                        default="Syfpeithi",
                        help="Specifies the used epitope prediction method (default Syfpeithi) [available: Syfpeithi, BIMAS, SMM, SMMPMBEC]"
    )
    parser.add_argument("-thr","--threshold",
                        default=20,
                        type=float,
                        help="Specifies epitope prediction threshold for SYFPEITHI (default 20).")

    parser.add_argument("-o", "--output",
                        required=True,
                        help="Specifies the output file.")
    parser.add_argument("-t", "--threads",
                        type=int,
                        default=None,
                        help="Specifies number of threads. If not specified all available logical cpus are used.")


    args = parser.parse_args()

    #parse input
    peptides = list(FileReader.read_lines(args.input, in_type=Peptide))
    #read in alleles
    alleles = generate_alleles(args.alleles)

    if args.cleavage_prediction.upper() not in ["PCM", "PROTEASMM_C", "PROTEASMM_S"]:
        print "Specified cleavage predictor is currently not supported. Please choose either PCM, PROTEASMM_C, or PROTEASMM_S"
        sys.exit(-1)

    if args.epitope_prediction.upper() not in ["SYFPEITHI", "BIMAS", "SMM", "SMMPMBEC"]:
        print "Specified cleavage predictor is currently not supported. Please choose either Syfpeithi, BIMAS, SMM, SMMPMBEC"
        sys.exit(-1)

    #set-up model
    cl_pred = CleavageSitePredictorFactory(args.cleavage_prediction)
    epi_pred = EpitopePredictorFactory(args.epitope_prediction)

    thr = {a.name:args.threshold for a in alleles}

    solver = EpitopeAssemblyWithSpacer(peptides,cl_pred,epi_pred,alleles,
                                       k=args.max_length,en=9,threshold=thr,
                                       solver="cplex", alpha=args.alpha, beta=args.beta,
                                       verbosity=0)

    #solve
    #pre-processing has to be disable otherwise many solver will destroy the symmetry of the problem
    #how to do this is dependent on the solver used. For CPLEX it is preprocessing_presolve=n
    threads = mp.cpu_count() if args.threads is None else args.threads
    svbws = solver.approximate(threads=threads,options={"preprocessing_presolve":"n","threads":1})

    print
    print "Resulting String-of-Beads: ","-".join(map(str,svbws))
    print
    with open(args.output, "w") as f:
        f.write("-".join(map(str,svbws)))
示例#23
0
def string_of_beads(input_proteins, input_alleles, input_epitopes,
                    input_cleavages, output_vaccine, cocktail, greedy_subtour,
                    max_aminoacids, max_epitopes, min_alleles, min_proteins,
                    min_avg_prot_conservation, min_avg_alle_conservation):
    program_start_time = time.time()

    # load proteins
    LOGGER.info('Reading sequences...')
    proteins = FileReader.read_fasta(input_proteins, in_type=Protein)
    LOGGER.info('%d proteins read', len(proteins))

    # load alleles
    alleles = [
        Allele(a)
        for a in utilities.get_alleles_and_thresholds(input_alleles).index
    ]
    LOGGER.info('Loaded %d alleles', len(alleles))

    # load epitopes
    epitopes = utilities.load_epitopes(input_epitopes)
    LOGGER.info('Loaded %d epitopes', len(epitopes))

    # read cleavage scores
    cleavage_epitopes = set()
    with open(input_cleavages) as f:
        cleavages = {}
        for row in csv.DictReader(f):
            cleavages[(row['from'], row['to'])] = float(row['score'])
            cleavage_epitopes.add(row['from'])
            cleavage_epitopes.add(row['to'])
    LOGGER.info('Loaded %d cleavage scores', len(cleavages))

    # compute edge cost
    edge_cost, vertices, vertices_rewards = [], [], []
    vertex_to_epitope = [''] + list(cleavage_epitopes)
    for ep_from in vertex_to_epitope:
        vertices.append(ep_from)
        vertices_rewards.append(0 if ep_from ==
                                '' else epitopes[ep_from]['immunogen'])
        edge_cost.append([
            cleavages[(ep_from,
                       ep_to)] if ep_from != '' and ep_to != '' else 0.0
            for ep_to in vertex_to_epitope
        ])
    LOGGER.info('Kept %d epitopes with available clevages', len(vertices) - 1)

    type_coverage, min_type_coverage, min_avg_type_conservation = utilities.compute_coverage_matrix(
        [epitopes[e] for e in vertex_to_epitope[1:]], min_alleles,
        min_proteins, min_avg_prot_conservation, min_avg_alle_conservation,
        len(proteins), len(alleles))

    # find optimal design
    solver_build_time = time.time()
    solver = TeamOrienteeringIlp(
        num_teams=cocktail,
        vertex_reward=vertices_rewards,
        edge_cost=edge_cost,
        type_coverage=type_coverage,
        min_type_coverage=min_type_coverage,
        min_avg_type_conservation=min_avg_type_conservation,
        max_edge_cost=max_aminoacids,
        max_vertices=max_epitopes,
        lazy_subtour_elimination=not greedy_subtour)
    solver.build_model()
    solver_start_time = time.time()
    result = solver.solve()
    solver_end_time = time.time()

    # print info and save
    with open(output_vaccine, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(('cocktail', 'index', 'epitope'))
        for i, mosaic in enumerate(result):
            LOGGER.info('Mosaic #%d', i + 1)
            for j, (_, vertex) in enumerate(mosaic[:-1]):
                epitope = epitopes[vertex_to_epitope[vertex]]
                writer.writerow((i, j, epitope['epitope']))
                LOGGER.info('    %s - IG: %.2f', epitope['epitope'],
                            epitope['immunogen'])

    LOGGER.info('==== Stopwatch')
    LOGGER.info('          Total time : %.2f s',
                solver_end_time - program_start_time)
    LOGGER.info('      Pre-processing : %.2f s',
                solver_build_time - program_start_time)
    LOGGER.info(' Model creation time : %.2f s',
                solver_start_time - solver_build_time)
    LOGGER.info('        Solving time : %.2f s',
                solver_end_time - solver_start_time)