Exemplo n.º 1
0
    def test_MartsAdapter(self):
        """Check the MartsAdapter query methods against the GRCh37 BioMart.

        Each adapter method is called with an identifier for which a concrete
        result is expected and, for the transcript and product lookups, with
        one for which ``None`` is expected.
        """
        adapter = MartsAdapter(biomart="http://grch37.ensembl.org")
        tx_id = 'ENST00000361221'
        tx_id_expect_none = "ENST00000614237"

        # get_transcript_position
        position = adapter.get_transcript_position(
            tx_id, '17953929', '17953943', type=EIdentifierTypes.ENSEMBL)
        self.assertEqual((1515, 1529), position)

        position = adapter.get_transcript_position(
            tx_id_expect_none, 7566927, 7566927,
            type=EIdentifierTypes.ENSEMBL)
        self.assertIsNone(position)

        # get_gene_by_position
        gene_name = adapter.get_gene_by_position(17, 7566927, 7566927)
        self.assertEqual("TP53", gene_name)

        # get_product_sequence
        product = adapter.get_product_sequence(
            "Q15942", type=EIdentifierTypes.UNIPROT)
        self.assertIsNone(product)
        self.assertEqual(
            self.NP_001005353,
            adapter.get_product_sequence(
                "NP_001005353", type=EIdentifierTypes.REFSEQ))
        self.assertEqual(
            self.ENSP00000369497,
            adapter.get_product_sequence(
                "ENSP00000369497", type=EIdentifierTypes.ENSEMBL))

        # get_transcript_sequence
        self.assertEqual(
            self.ENST00000361221[2],
            adapter.get_transcript_sequence(
                tx_id, type=EIdentifierTypes.ENSEMBL))
        sequence = adapter.get_transcript_sequence(
            tx_id_expect_none, type=EIdentifierTypes.ENSEMBL)
        self.assertIsNone(sequence)

        # get_transcript_information
        self.assertDictEqual(
            self.ENST00000361221,
            adapter.get_transcript_information(
                tx_id, type=EIdentifierTypes.ENSEMBL))
        information = adapter.get_transcript_information(
            tx_id_expect_none, type=EIdentifierTypes.ENSEMBL)
        self.assertIsNone(information)

        # get_ensembl_ids_from_id: compared through its string form, one
        # expected entry per source line (adjacent literals are concatenated)
        tp53_ids = adapter.get_ensembl_ids_from_id(
            'TP53', type=EIdentifierTypes.GENENAME)
        self.assertEqual(
            str(tp53_ids),
            "[{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000413465', 4: 'ENSP00000410739'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000359597', 4: 'ENSP00000352610'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000504290', 4: ''}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000510385', 4: ''}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000504937', 4: ''}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000269305', 4: 'ENSP00000269305'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000455263', 4: 'ENSP00000398846'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000420246', 4: 'ENSP00000391127'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000445888', 4: 'ENSP00000391478'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000576024', 4: 'ENSP00000458393'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000509690', 4: 'ENSP00000425104'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000514944', 4: 'ENSP00000423862'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000574684', 4: ''}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000505014', 4: ''}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000508793', 4: 'ENSP00000424104'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000604348', 4: 'ENSP00000473895'}, "
            "{0: 'ENSG00000141510', 1: '-', 3: 'ENST00000503591', 4: 'ENSP00000426252'}, "
            "{0: 'LRG_321', 1: '+', 3: 'LRG_321t8', 4: 'LRG_321p8'}, "
            "{0: 'LRG_321', 1: '+', 3: 'LRG_321t7', 4: 'LRG_321p13'}, "
            "{0: 'LRG_321', 1: '+', 3: 'LRG_321t6', 4: 'LRG_321p12'}, "
            "{0: 'LRG_321', 1: '+', 3: 'LRG_321t5', 4: 'LRG_321p11'}, "
            "{0: 'LRG_321', 1: '+', 3: 'LRG_321t4', 4: 'LRG_321p10'}, "
            "{0: 'LRG_321', 1: '+', 3: 'LRG_321t3', 4: 'LRG_321p3'}, "
            "{0: 'LRG_321', 1: '+', 3: 'LRG_321t2', 4: 'LRG_321p2'}, "
            "{0: 'LRG_321', 1: '+', 3: 'LRG_321t1', 4: 'LRG_321p1'}]"
        )
Exemplo n.º 2
0
    def test_MartsAdapter(self):
        """Check the MartsAdapter query methods against the GRCh37 BioMart.

        Every adapter method is called once with an identifier for which a
        concrete result is expected; the transcript and product lookups are
        additionally called with one for which ``None`` is expected. Expected
        values are literals or fixtures stored on the test case.
        """
        adapter = MartsAdapter(biomart="http://grch37.ensembl.org")
        tx_id = 'ENST00000361221'
        tx_id_expect_none = "ENST00000614237"

        # get_transcript_position
        position = adapter.get_transcript_position(
            tx_id, '17953929', '17953943', type=EIdentifierTypes.ENSEMBL)
        self.assertEqual((1515, 1529), position)

        position = adapter.get_transcript_position(
            tx_id_expect_none, 7566927, 7566927,
            type=EIdentifierTypes.ENSEMBL)
        self.assertIsNone(position)

        # get_gene_by_position
        gene_name = adapter.get_gene_by_position(17, 7566927, 7566927)
        self.assertEqual("TP53", gene_name)

        # get_product_sequence
        product = adapter.get_product_sequence(
            "Q15942", type=EIdentifierTypes.UNIPROT)
        self.assertIsNone(product)
        self.assertEqual(
            self.NP_001005353,
            adapter.get_product_sequence(
                "NP_001005353", type=EIdentifierTypes.REFSEQ))
        self.assertEqual(
            self.ENSP00000369497,
            adapter.get_product_sequence(
                "ENSP00000369497", type=EIdentifierTypes.ENSEMBL))

        # get_transcript_sequence
        self.assertEqual(
            self.ENST00000361221[2],
            adapter.get_transcript_sequence(
                tx_id, type=EIdentifierTypes.ENSEMBL))
        sequence = adapter.get_transcript_sequence(
            tx_id_expect_none, type=EIdentifierTypes.ENSEMBL)
        self.assertIsNone(sequence)

        # get_transcript_information
        self.assertDictEqual(
            self.ENST00000361221,
            adapter.get_transcript_information(
                tx_id, type=EIdentifierTypes.ENSEMBL))
        information = adapter.get_transcript_information(
            tx_id_expect_none, type=EIdentifierTypes.ENSEMBL)
        self.assertIsNone(information)

        # get_ensembl_ids_from_id, compared element-wise with the fixture
        tp53_ids = adapter.get_ensembl_ids_from_id(
            'TP53', type=EIdentifierTypes.GENENAME)
        self.assertListEqual(tp53_ids, self.tp53_ensembl_biomart)
Exemplo n.º 3
0
def __main__():
    """Command line entry point of the EPAA epitope prediction pipeline.

    Steps, in order:

    1. Build the argument parser. When the script is started without any
       argument the help text is printed and the process exits with status 1.
    2. Parse and validate the options. Combinations that cannot work (no
       input file, unsupported variant file extension, missing companion
       option) are reported through ``parser.error`` (exit status 2) before
       any work is done.
    3. Optionally change into ``--output_dir`` and configure file logging to
       ``<identifier>_prediction.log`` in the working directory.
    4. Read the peptides or the somatic variants, the alleles and, if
       requested, the reference proteome.
    5. Run the predictions and concatenate the per-length result tables.
    6. Optionally annotate the table with wild type sequences, protein
       quantification, expression and ligandomics values.
    7. Write ``<identifier>_prediction_results.tsv`` and
       ``<identifier>_report.json`` to the working directory.
    """
    parser = argparse.ArgumentParser(
        description=
        """EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. 
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts."""
    )
    # ``ArgumentParser(version=...)`` is a deprecated Python 2 only keyword;
    # the explicit "version" action provides the same -v/--version flags and
    # is accepted by every argparse release.
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p',
                        "--peptides",
                        help="File with one peptide per line")
    parser.add_argument('-c',
                        "--mhcclass",
                        default="I",
                        help="MHC class I or II")
    parser.add_argument('-l', "--length", help="Maximum peptide length")
    parser.add_argument('-a',
                        "--alleles",
                        help="<Required> MHC Alleles",
                        required=True)
    parser.add_argument(
        '-r',
        "--reference",
        help=
        "Reference, retrieved information will be based on this ensembl version",
        required=False,
        default='GRCh37',
        choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f',
                        "--filter_self",
                        help="Filter peptides against human proteom",
                        required=False,
                        action='store_true')
    parser.add_argument(
        '-wt',
        "--wild_type",
        help="Add wild type sequences of mutated peptides to output",
        required=False,
        action='store_true')
    parser.add_argument('-rp',
                        "--reference_proteome",
                        help="Reference proteome for self-filtering",
                        required=False)
    parser.add_argument('-gr',
                        "--gene_reference",
                        help="List of gene IDs for ID mapping.",
                        required=False)
    parser.add_argument('-pq',
                        "--protein_quantification",
                        help="File with protein quantification values")
    parser.add_argument('-ge',
                        "--gene_expression",
                        help="File with expression analysis results")
    parser.add_argument(
        '-de',
        "--diff_gene_expression",
        help="File with differential expression analysis results (DESeq2)")
    parser.add_argument(
        '-li',
        "--ligandomics_id",
        help=
        "Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run."
    )
    parser.add_argument('-o',
                        "--output_dir",
                        help="All files written will be put in this directory")

    # This check has to come before parse_args(): --alleles is required, so
    # parsing an empty command line aborts with a usage error and the help
    # text would never be printed.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()

    # Reject option combinations up front that would otherwise only fail
    # later with an AttributeError / UnboundLocalError / TypeError.
    if not args.peptides:
        if args.somatic_mutations is None:
            parser.error(
                "either --peptides or --somatic_mutations is required")
        if not args.somatic_mutations.endswith(('.GSvar', '.tsv', '.vcf')):
            parser.error(
                "--somatic_mutations must be a .GSvar, .tsv or .vcf file")
        if args.length is None:
            parser.error("--length is required for variant input")
    elif args.wild_type:
        # wild type sequences are derived from the variant predictions only
        parser.error("--wild_type is only available for variant input")
    if args.filter_self and args.reference_proteome is None:
        parser.error("--filter_self requires --reference_proteome")
    if args.gene_expression is not None and args.gene_reference is None:
        parser.error("--gene_expression requires --gene_reference")

    # Switch to the output directory first and configure logging afterwards.
    # The log file is opened relative to the (possibly new) working
    # directory; joining it with output_dir again after the chdir would point
    # to a nested, non-existing directory for relative paths.
    directory_note = "Using current data directory."
    if args.output_dir is not None:
        try:
            os.chdir(args.output_dir)
        except OSError:
            directory_note = "No such directory, using current."
        else:
            directory_note = "Using provided data directory: {}".format(
                str(args.output_dir))
    logging.basicConfig(filename='{}_prediction.log'.format(args.identifier),
                        filemode='w+',
                        level=logging.DEBUG)
    logging.info(directory_note)

    logging.info("Starting predictions at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }
    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith(('.GSvar', '.tsv')):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        else:
            # only .vcf is left after the validation above
            vl, transcripts = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(
            ID_SYSTEM_USED, transcripts, references[args.reference],
            args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logging.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith((".fasta", ".fsa")):
                    up_db.read_seqs(
                        os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    # MHC class I or II predictions: the two classes differ only in the
    # prediction methods and in the minimum peptide length.
    if args.mhcclass == "I":
        # only SYFPEITHI is enabled; 'netmhc-4.0' and 'netmhcpan-3.0' were
        # part of an earlier, disabled method list
        methods = ['syfpeithi-1.0']
        min_length = 8
    else:
        methods = ['netmhcII-2.2', 'syfpeithi-1.0', 'netmhcIIpan-3.1']
        min_length = 15

    if args.peptides:
        pred_dataframes, statistics = make_predictions_from_peptides(
            peptides, methods, alleles, up_db, args.identifier, metadata)
    else:
        pred_dataframes, statistics, all_peptides_filtered = make_predictions_from_variants(
            vl, methods, alleles, min_length,
            int(args.length) + 1, ma, up_db, args.identifier, metadata,
            transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except Exception:
        # pd.concat raises when there is nothing to concatenate; continue
        # with an empty table so that the (empty) outputs are still written
        complete_df = pd.DataFrame()
        logging.error("No predictions available.")

    # map the bare method name to the method name with version
    method_map = {m.split('-')[0]: m for m in methods}
    complete_df.replace({'method': method_map}, inplace=True)

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(
            lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        column_order = [
            'sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene',
            'transcripts', 'proteins', 'variant type', 'method'
        ]
    else:
        column_order = [
            'sequence', 'length', 'chr', 'pos', 'gene', 'transcripts',
            'proteins', 'variant type', 'method'
        ]
    # fixed columns first, all remaining columns in their existing order
    for c in complete_df.columns:
        if c not in column_order:
            column_order.append(c)
    complete_df = complete_df.reindex(columns=column_order)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    # Sets give O(1) membership and already hold the unique sequences that
    # the report needs; the counters replace lists that were only measured.
    binders = set()
    non_binders = set()
    number_of_binders = 0
    number_of_nonbinders = 0

    for _, row in complete_df.iterrows():
        sequence = str(row['sequence'])
        # identity test on purpose: missing values (NaN) are truthy and must
        # not count as a binder
        if any(row[c] is True for c in binder_cols):
            binders.add(sequence)
            number_of_binders += 1
        else:
            non_binders.add(sequence)
            number_of_nonbinders += 1

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        # list(...) also works where keys() returns a view instead of a list
        quantified = list(protein_quant.keys())
        if quantified:
            entry_keys = protein_quant[quantified[0]].keys()
        else:
            logging.warning("Protein quantification file has no entries.")
            entry_keys = []
        for k in entry_keys:
            complete_df['{} log2 protein LFQ intensity'.format(
                k)] = complete_df.apply(
                    lambda row: create_quant_column_value_for_result(
                        row, protein_quant, transcriptSwissProtMap, k),
                    axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        # Decide once, from the first result row, which reference column
        # holds the kind of gene ID used in the table. Indexing row 1 for
        # every reference line failed on tables with a single row.
        uses_ensg_ids = (not complete_df.empty
                         and 'ENSG' in str(complete_df.iloc[0]['gene']))
        id_column = 0 if uses_ensg_ids else 1

        with open(args.gene_reference, 'r') as gene_list:
            for line in gene_list:
                ids = line.split('\t')
                if len(ids) < 3:
                    # blank or truncated line, e.g. a trailing newline
                    continue
                gene_id_lengths[ids[id_column]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(
            lambda row: create_expression_column_value_for_result(
                row, fold_changes, deseq, gene_id_lengths),
            axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 0, False),
            axis=1)
        complete_df['ligand intensity'] = complete_df.apply(
            lambda row: create_ligandomics_column_value_for_result(
                row, lig_id, 1, False),
            axis=1)

        # --wild_type is a store_true flag (never None), so it has to be
        # tested for truth; "!= None" added the wt columns unconditionally
        if args.wild_type:
            complete_df['wt ligand score'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 0, True),
                axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(
                lambda row: create_ligandomics_column_value_for_result(
                    row, lig_id, 1, True),
                axis=1)

    # write dataframe to tsv; missing values are written as empty fields
    # (the former ``complete_df.fillna('')`` discarded its result)
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier),
                       sep='\t',
                       na_rep='',
                       index=False)

    statistics['number_of_predictions'] = complete_df.shape[0]
    statistics['number_of_binders'] = number_of_binders
    statistics['number_of_nonbinders'] = number_of_nonbinders
    # NOTE: despite their names the two "unique" entries hold the sequences
    # themselves, not counts; kept that way for consumers of the report.
    # Sorted so that the report is reproducible between runs.
    statistics['number_of_unique_binders'] = sorted(binders)
    statistics['number_of_unique_nonbinders'] = sorted(non_binders - binders)

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)
Exemplo n.º 4
0
def __main__():
    """Command line entry point of the EPAA epitope prediction pipeline.

    Steps, in order:

    1. Build the argument parser. When the script is started without any
       argument the help text is printed and the process exits with status 1.
    2. Parse and validate the options. Combinations that cannot work (no
       input file, unsupported variant file extension, missing companion
       option) are reported through ``parser.error`` (exit status 2) before
       any work is done.
    3. Attach a file handler for ``<identifier>_prediction.log`` to the
       module logger.
    4. Read the peptides or the somatic variants, the alleles and, if
       requested, the reference proteome.
    5. Resolve the version of every selected prediction tool from the
       ``--versions`` file and check it against the versions offered by
       ``EpitopePredictorFactory``.
    6. Run the predictions and concatenate the per-length result tables.
    7. Optionally annotate the table with wild type sequences, protein
       quantification, expression and ligandomics values.
    8. Write ``<identifier>_prediction_results.tsv``,
       ``<identifier>_report.json`` and, with ``--fasta_output``,
       ``<identifier>_prediction_proteins.fasta``.

    Raises:
        ValueError: if a resolved tool version is not among the versions
            reported by ``EpitopePredictorFactory.available_methods()``.
    """
    parser = argparse.ArgumentParser(description="""EPAA - Epitope Prediction And Annotation \n Pipeline for prediction of MHC class I and II epitopes from variants or peptides for a list of specified alleles. 
        Additionally predicted epitopes can be annotated with protein quantification values for the corresponding proteins, identified ligands, or differential expression values for the corresponding transcripts.""")
    # ``ArgumentParser(version=...)`` is a deprecated Python 2 only keyword;
    # the explicit "version" action provides the same -v/--version flags and
    # is accepted by every argparse release.
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument('-i', "--identifier", help="Dataset identifier")
    parser.add_argument('-p', "--peptides", help="File with one peptide per line")
    parser.add_argument('-c', "--mhcclass", default=1, help="MHC class I or II")
    parser.add_argument('-l', "--max_length", help="Maximum peptide length")
    parser.add_argument('-ml', "--min_length", help="Minimum peptide length")
    parser.add_argument('-t', "--tools", help="Tools used for peptide predictions", required=True, type=str)
    parser.add_argument('-sv', "--versions", help="File containing parsed software version numbers.", required=True)
    parser.add_argument('-a', "--alleles", help="<Required> MHC Alleles", required=True)
    parser.add_argument('-r', "--reference", help="Reference, retrieved information will be based on this ensembl version", required=False, default='GRCh37', choices=['GRCh37', 'GRCh38'])
    parser.add_argument('-f', "--filter_self", help="Filter peptides against human proteom", required=False, action='store_true')
    parser.add_argument('-wt', "--wild_type", help="Add wild type sequences of mutated peptides to output", required=False, action='store_true')
    parser.add_argument('-fo', "--fasta_output", help="Create FASTA file with protein sequences", required=False, action='store_true')
    parser.add_argument('-rp', "--reference_proteome", help="Reference proteome for self-filtering", required=False)
    parser.add_argument('-gr', "--gene_reference", help="List of gene IDs for ID mapping.", required=False)
    parser.add_argument('-pq', "--protein_quantification", help="File with protein quantification values")
    parser.add_argument('-ge', "--gene_expression", help="File with expression analysis results")
    parser.add_argument('-de', "--diff_gene_expression", help="File with differential expression analysis results (DESeq2)")
    parser.add_argument('-li', "--ligandomics_id", help="Comma separated file with peptide sequence, score and median intensity of a ligandomics identification run.")

    # This check has to come before parse_args(): several options are
    # required, so parsing an empty command line aborts with a usage error
    # and the help text would never be printed.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)

    args = parser.parse_args()

    # Reject option combinations up front that would otherwise only fail
    # later with an AttributeError / UnboundLocalError / TypeError.
    if not args.peptides:
        if args.somatic_mutations is None:
            parser.error("either --peptides or --somatic_mutations is required")
        if not args.somatic_mutations.endswith(('.GSvar', '.tsv', '.vcf')):
            parser.error("--somatic_mutations must be a .GSvar, .tsv or .vcf file")
        if args.min_length is None or args.max_length is None:
            parser.error("--min_length and --max_length are required for variant input")
    else:
        # both outputs are derived from the variant predictions only
        if args.wild_type:
            parser.error("--wild_type is only available for variant input")
        if args.fasta_output:
            parser.error("--fasta_output is only available for variant input")
    if args.filter_self and args.reference_proteome is None:
        parser.error("--filter_self requires --reference_proteome")
    if args.gene_expression is not None and args.gene_reference is None:
        parser.error("--gene_expression requires --gene_reference")

    logger.addHandler(logging.FileHandler('{}_prediction.log'.format(args.identifier)))
    logger.info("Starting predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    metadata = []
    references = {'GRCh37': 'http://feb2014.archive.ensembl.org', 'GRCh38': 'http://dec2016.archive.ensembl.org'}
    global transcriptProteinMap
    global transcriptSwissProtMap

    # read in variants or peptides
    if args.peptides:
        peptides, metadata = read_peptide_input(args.peptides)
    else:
        if args.somatic_mutations.endswith(('.GSvar', '.tsv')):
            vl, transcripts, metadata = read_GSvar(args.somatic_mutations)
        else:
            # only .vcf is left after the validation above
            vl, transcripts, metadata = read_vcf(args.somatic_mutations)

        transcripts = list(set(transcripts))
        transcriptProteinMap, transcriptSwissProtMap = get_protein_ids_for_transcripts(ID_SYSTEM_USED, transcripts, references[args.reference], args.reference)

    # get the alleles
    alleles = FileReader.read_lines(args.alleles, in_type=Allele)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    # create protein db instance for filtering self-peptides
    up_db = UniProtDB('sp')
    if args.filter_self:
        logger.info('Reading human proteome')

        if os.path.isdir(args.reference_proteome):
            for filename in os.listdir(args.reference_proteome):
                if filename.endswith((".fasta", ".fsa")):
                    up_db.read_seqs(os.path.join(args.reference_proteome, filename))
        else:
            up_db.read_seqs(args.reference_proteome)

    # tolerate blanks around the commas and empty entries ("a, b,")
    selected_methods = [tool.strip() for tool in args.tools.split(',') if tool.strip()]
    with open(args.versions, 'r') as versions_file:
        # row[1][1:] drops the first character of the version field
        # (presumably a leading "v" -- TODO confirm against the versions
        # file); rows with fewer than two fields, e.g. blank lines, are skipped
        tool_version = [(row[0], str(row[1][1:]))
                        for row in csv.reader(versions_file, delimiter="\t")
                        if len(row) >= 2]
    # NOTE this needs to be updated, if a newer version will be available via Fred2 and should be used in the future
    tool_version.append(('syfpeithi', '1.0'))
    # get for each selected method the corresponding tool version
    methods = {method: version
               for tool, version in tool_version
               for method in selected_methods
               if tool.lower() in method.lower()}

    available_methods = EpitopePredictorFactory.available_methods()
    for method, version in methods.items():
        if version not in available_methods[method]:
            raise ValueError("The specified version " + version + " for " + method + " is not supported by Fred2.")

    # MHC class I and II are handled identically: methods and peptide lengths
    # come from the command line. The former ``args.mhcclass is 1`` switch
    # selected between two identical branches and has been merged.
    if args.peptides:
        pred_dataframes, statistics = make_predictions_from_peptides(peptides, methods, alleles, up_db, args.identifier, metadata)
    else:
        pred_dataframes, statistics, all_peptides_filtered, proteins = make_predictions_from_variants(vl, methods, alleles, int(args.min_length), int(args.max_length) + 1, ma, up_db, args.identifier, metadata, transcriptProteinMap)

    # concat dataframes for all peptide lengths
    try:
        complete_df = pd.concat(pred_dataframes)
    except Exception:
        # pd.concat raises when there is nothing to concatenate; continue
        # with an empty table so that the (empty) outputs are still written
        complete_df = pd.DataFrame()
        logger.error("No predictions available.")

    # replace method names with method names with version; the empty
    # fallback table has no 'method' column to rewrite
    if 'method' in complete_df.columns:
        complete_df['method'] = complete_df['method'].apply(lambda x: x + '-' + methods[x])

    # include wild type sequences to dataframe if specified
    if args.wild_type:
        wt_sequences = generate_wt_seqs(all_peptides_filtered)
        complete_df['wt sequence'] = complete_df.apply(lambda row: create_wt_seq_column_value(row, wt_sequences), axis=1)
        column_order = ['sequence', 'wt sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    else:
        column_order = ['sequence', 'length', 'chr', 'pos', 'gene', 'transcripts', 'proteins', 'variant type', 'method']
    # fixed columns first, all remaining columns in their existing order
    for c in complete_df.columns:
        if c not in column_order:
            column_order.append(c)
    complete_df = complete_df.reindex(columns=column_order)

    binder_cols = [col for col in complete_df.columns if 'binder' in col]

    # Sets give O(1) membership and already hold the unique sequences that
    # the report needs; the counters replace lists that were only measured.
    binders = set()
    non_binders = set()
    number_of_binders = 0
    number_of_nonbinders = 0

    for _, row in complete_df.iterrows():
        sequence = str(row['sequence'])
        # identity test on purpose: missing values (NaN) are truthy and must
        # not count as a binder
        if any(row[c] is True for c in binder_cols):
            binders.add(sequence)
            number_of_binders += 1
        else:
            non_binders.add(sequence)
            number_of_nonbinders += 1

    # parse protein quantification results, annotate proteins for samples
    if args.protein_quantification is not None:
        protein_quant = read_protein_quant(args.protein_quantification)
        # list(...) also works where keys() returns a view instead of a list
        quantified = list(protein_quant.keys())
        if quantified:
            entry_keys = protein_quant[quantified[0]].keys()
        else:
            logger.warning("Protein quantification file has no entries.")
            entry_keys = []
        for k in entry_keys:
            complete_df['{} log2 protein LFQ intensity'.format(k)] = complete_df.apply(lambda row: create_quant_column_value_for_result(row, protein_quant, transcriptSwissProtMap, k), axis=1)

    # parse (differential) expression analysis results, annotate features (genes/transcripts)
    if args.gene_expression is not None:
        fold_changes = read_diff_expression_values(args.gene_expression)
        gene_id_lengths = {}
        col_name = 'RNA expression (RPKM)'

        # Decide once, from the first result row, which reference column
        # holds the kind of gene ID used in the table. Indexing row 1 for
        # every reference line failed on tables with a single row.
        uses_ensg_ids = (not complete_df.empty and 'ENSG' in str(complete_df.iloc[0]['gene']))
        id_column = 0 if uses_ensg_ids else 1

        with open(args.gene_reference, 'r') as gene_list:
            for line in gene_list:
                ids = line.split('\t')
                if len(ids) < 3:
                    # blank or truncated line, e.g. a trailing newline
                    continue
                gene_id_lengths[ids[id_column]] = float(ids[2].strip())
        deseq = False
        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    if args.diff_gene_expression is not None:
        gene_id_lengths = {}
        fold_changes = read_diff_expression_values(args.diff_gene_expression)
        col_name = 'RNA normal_vs_tumor.log2FoldChange'
        deseq = True

        # add column to result dataframe
        complete_df[col_name] = complete_df.apply(lambda row: create_expression_column_value_for_result(row, fold_changes, deseq, gene_id_lengths), axis=1)

    # parse ligandomics identification results, annotate peptides for samples
    if args.ligandomics_id is not None:
        lig_id = read_lig_ID_values(args.ligandomics_id)
        # add columns to result dataframe
        complete_df['ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, False), axis=1)
        complete_df['ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, False), axis=1)

        # --wild_type is a store_true flag (never None), so it has to be
        # tested for truth; "!= None" added the wt columns unconditionally
        if args.wild_type:
            complete_df['wt ligand score'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 0, True), axis=1)
            complete_df['wt ligand intensity'] = complete_df.apply(lambda row: create_ligandomics_column_value_for_result(row, lig_id, 1, True), axis=1)

    # write mutated protein sequences to fasta file
    if args.fasta_output:
        with open('{}_prediction_proteins.fasta'.format(args.identifier), 'w') as protein_outfile:
            for p in proteins:
                variants = itertools.chain.from_iterable(p.vars[v] for v in p.vars)
                # build the set once so that both header fields list the
                # mutations in the same order
                mutation_syntaxes = set(itertools.chain.from_iterable(x.coding.values() for x in variants))
                cds = ','.join([y.cdsMutationSyntax for y in mutation_syntaxes])
                aas = ','.join([y.aaMutationSyntax for y in mutation_syntaxes])
                protein_outfile.write('>{}:{}:{}\n'.format(p.transcript_id, aas, cds))
                protein_outfile.write('{}\n'.format(str(p)))

    # write dataframe to tsv; missing values are written as empty fields
    # (the former ``complete_df.fillna('')`` discarded its result)
    complete_df.to_csv("{}_prediction_results.tsv".format(args.identifier), sep='\t', na_rep='', index=False)

    statistics['number_of_predictions'] = len(complete_df)
    statistics['number_of_binders'] = number_of_binders
    statistics['number_of_nonbinders'] = number_of_nonbinders
    # NOTE: despite their names the two "unique" entries hold the sequences
    # themselves, not counts; kept that way for consumers of the report.
    # Sorted so that the report is reproducible between runs.
    statistics['number_of_unique_binders'] = sorted(binders)
    statistics['number_of_unique_nonbinders'] = sorted(non_binders - binders)

    with open('{}_report.json'.format(args.identifier), 'w') as json_out:
        json.dump(statistics, json_out)

    logger.info("Finished predictions at " + str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
Exemplo n.º 5
0
def __main__():
    """Build a FASTA file of variant proteins or peptides from a variant file.

    Command-line entry point. Variants are read from the ``--variations``
    file (only ``.vcf`` input is handled), turned into transcripts and
    proteins through the generator module ``g`` and a ``MartsAdapter``, and
    proteins flagged by ``is_stop_gain`` are dropped. Depending on the
    options, one of the following is written to ``--outfile``:

    * whole proteins (``--Proteins`` without ``--predict``/``--filter``),
    * variant-containing peptides of length ``--digest`` (default 9),
    * prediction results (``--predict`` plus ``--alleles``), optionally
      reduced to rows with at least one value above ``--filter``.

    Every regular path through this function ends in ``sys.exit``: status 0
    after the output was written (or after printing ``N/A`` for the not yet
    implemented ``--base`` mode), status 1 on usage or processing errors.
    """
    parser = argparse.ArgumentParser()
    # Explicit version action instead of the ``version=`` constructor
    # argument, which is deprecated in Python 2.7 and absent in Python 3.
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    parser.add_argument('-V',
                        '--variations',
                        dest="var_file",
                        help='<Required> full path to the input variations',
                        required=True)
    parser.add_argument('-o',
                        "--outfile",
                        dest="outfile_path",
                        help="Created fasta file",
                        required=True)
    parser.add_argument(
        '-d',
        "--digest",
        dest="digest",
        type=int,
        help="Length of peptides for predigestion and prediction, default 9.")
    parser.add_argument('-a',
                        "--alleles",
                        dest="alleles",
                        help="Input alleles for prediction")
    parser.add_argument(
        '-p',
        "--predict",
        dest="predict_with",
        help="Method of prediction, needs alleles & length, allowed:[{m}]".
        format(m=PRED_METH))
    parser.add_argument(
        '-f',
        "--filter",
        dest="filter",
        type=float,
        help=
        "Only include sequences with predictions above the given threshold (e.g. 0.4256 for at least weak binder), needs predict"
    )
    parser.add_argument('-P',
                        "--Proteins",
                        dest="only_proteins",
                        action='store_true',
                        help="Will write only proteins.")
    parser.add_argument(
        '-b',
        "--base",
        dest="basefasta_path",
        help="If given, entries are replaced by the variation.")

    # This check has to run before parse_args(): with the required options
    # missing, parse_args() exits on its own and the help is never printed.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    options = parser.parse_args()

    if options.filter and not options.predict_with:
        parser.print_help()
        print("Need predict option with filter, aborting!")
        sys.exit(1)

    if options.predict_with and not options.alleles:
        parser.print_help()
        print("Need alleles with predict option, aborting!")
        sys.exit(1)

    def write_and_exit(fasta_text):
        """Write *fasta_text* to the requested output file and exit with 0."""
        with open(options.outfile_path, 'w') as out:
            out.write(fasta_text)
        sys.exit(0)

    def base_fasta_not_available():
        """Placeholder for the unimplemented base-fasta replacement mode."""
        # TODO - replace from base fasta (plus prediction annotation)
        print("N/A")
        sys.exit(0)

    logging.basicConfig(
        filename=os.path.splitext(options.outfile_path)[0] +
        "_{:%d-%m-%Y_%H-%M-%S}".format(datetime.datetime.now()) + '.log',
        filemode='w+',
        level=logging.DEBUG)  #, format='%(levelname)s:%(message)s'
    logging.info("Starting variant fasta creation " + options.outfile_path +
                 " at " + str(datetime.datetime.now()))
    logging.warning("verbosity turned on")

    # Open design questions kept from the original author:
    # ... look at theos filter, ligandoqc, fasta-distributions, lica and the
    # morgenstellen server content scripts
    # complete proteins? only containing binders? k-mers? FastaSlicer.py?
    # remove original if homozygous (needs fasta input)?
    # add germline variant option? or expect all to be in one vcf?

    extension = os.path.splitext(options.var_file)[-1]
    if extension == ".vcf":
        vcfvars, accessions = FileReader.read_vcf(options.var_file)
    else:
        if extension == ".GSvar":
            # Reading GSvar is not wired up here; fail explicitly instead of
            # continuing with no variants bound.
            m = "GSvar input is not supported yet ({f}), aborting.".format(
                f=options.var_file)
        else:
            m = "Could not read variants {f}, aborting.".format(
                f=options.var_file)
        logging.error(m)
        print(m)
        sys.exit(1)

    mart_db = MartsAdapter(biomart="http://grch37.ensembl.org"
                           )  # TODO guess id_type for mart_db from accessions

    transcript_gen = g.generate_transcripts_from_variants(
        vcfvars, mart_db, id_type=EIdentifierTypes.REFSEQ)

    protein_gen = g.generate_proteins_from_transcripts(transcript_gen)
    proteins = [x for x in protein_gen if x.vars]  # removing unvaried

    for p in proteins:
        # assume gene name from first variant
        p.gene_id = next(iter(p.vars.values()))[0].gene

    proteins = [p for p in proteins
                if not is_stop_gain(p)]  # kick out stop gains

    # First exit option: plain proteins, no prediction involved
    if not (options.predict_with or options.filter) and options.only_proteins:
        if options.basefasta_path:
            base_fasta_not_available()
        write_and_exit(proteins_to_fasta(proteins))

    # From now on, digestion must be set somehow
    digest = options.digest or 9
    peptides = list(g.generate_peptides_from_proteins(proteins, digest))
    peptides_var = [
        x for x in peptides if any(
            x.get_variants_by_protein(y) for y in x.proteins.keys())
    ]  # removing unvaried

    # Second exit option: variant peptides, no prediction involved
    if not (options.predict_with or options.filter):
        write_and_exit(peptides_to_fasta(peptides_var))

    # From now on, predictions are needed
    try:
        target_alleles_set = set(
            FileReader.read_lines(options.alleles, in_type=Allele))
    except Exception as e:
        m = "Could not read alleles file {f}, aborting.".format(
            f=options.alleles)
        logging.error(m)
        print("{} what: {}".format(m, e))
        sys.exit(1)

    try:
        ttn = EpitopePredictorFactory(options.predict_with)
    except Exception as e:
        m = "Could not initialize prediction method {f}, aborting.".format(
            f=options.predict_with)
        logging.error("{} what: {}".format(m, e))
        print(m)
        sys.exit(1)

    try:
        preds = ttn.predict(peptides_var, alleles=target_alleles_set)
    except Exception as e:
        # Report the input file; the parser defines no 'inf' option.
        m = "something went wrong with the prediction {} {} what: {}".format(
            options.var_file, options.predict_with, e)
        logging.error(m)
        print(m)
        sys.exit(1)

    # punch prediction results in peptide metadata (inside pandas dataframe)
    # NOTE(review): the row index is treated as a tuple whose first element
    # is the peptide object and whose remaining elements are used as
    # metadata keys -- confirm against the predictor's result layout.
    for index_key, row in preds.iterrows():
        row_values = dict(zip(row.index, row.values))
        for label in index_key[1:]:
            index_key[0].log_metadata(label, row_values)

    # Third/fourth exit option: all predictions, or binders only when a
    # filter threshold was given (rows with any value above the threshold).
    if options.filter:
        preds = preds[(preds > options.filter).any(axis=1)]

    if options.only_proteins:
        if options.basefasta_path:
            base_fasta_not_available()
        write_and_exit(proteins_to_fasta(annotate_protein_from_peptides(preds)))
    write_and_exit(peptides_to_fasta(preds))
Exemplo n.º 6
0
def _read_variants(path):
    """Return the variants read from *path*, or ``None`` for an unknown type.

    Files ending in ``.GSvar`` or ``.tsv`` are read with ``read_GSvar``,
    files ending in ``.vcf`` with ``read_vcf``. Only the variant list (first
    element of either result) is returned; the other elements are not needed
    by the caller.
    """
    if path.endswith(('.GSvar', '.tsv')):
        variants, _transcripts, _metadata = read_GSvar(path)
        return variants
    if path.endswith('.vcf'):
        variants, _transcripts = read_vcf(path)
        return variants
    return None


def __main__():
    """Append variant-containing protein sequences to a copy of a reference DB.

    Command-line entry point. Somatic and/or germline variants are read,
    transcripts carrying variants are generated through a ``MartsAdapter``
    for the chosen reference, and the proteins generated from them are
    appended to a copy of ``--database`` named
    ``<database>_<identifier>_individualized_protein_DB.fasta`` (derived by
    replacing ``.fasta`` in the database file name). If ``--output_dir`` is
    given and can be entered, it becomes the working directory, so the log
    file and the output are written there.

    Exits with status 1 when called without arguments, when a variant file
    has an unsupported extension, or when copying the reference database
    fails; argument errors are reported through ``parser.error``.
    """
    parser = argparse.ArgumentParser(
        description=
        """Individualized Proteins 2.0 \n Script for generation of protein sequences which contain provided variants."""
    )
    # Explicit version action instead of the ``version=`` constructor
    # argument, which is deprecated in Python 2.7 and absent in Python 3.
    parser.add_argument('-v', '--version', action='version', version=VERSION)
    parser.add_argument('-s', "--somatic_mutations", help='Somatic variants')
    parser.add_argument('-g', "--germline_mutations", help="Germline variants")
    parser.add_argument(
        '-i',
        "--identifier",
        help="<Required> Predictions will be written with this name prefix",
        required=True)
    parser.add_argument(
        '-r',
        "--reference",
        help=
        "Reference, retrieved information will be based on this ensembl version",
        required=False,
        default='GRCh37',
        choices=['GRCh37', 'GRCh38'])
    parser.add_argument(
        '-db',
        "--database",
        help=
        "Proteome sequence reference database to be attached to individualized sequences",
        required=True)
    parser.add_argument('-o',
                        "--output_dir",
                        help="All files written will be put in this directory")

    # This check has to run before parse_args(): with the required options
    # missing, parse_args() exits on its own and the help is never printed.
    if len(sys.argv) <= 1:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()

    # Somatic variants first, germline second (same order as when both are
    # combined); either one may be omitted, but not both.
    variant_files = [
        path for path in (args.somatic_mutations, args.germline_mutations)
        if path is not None
    ]
    if not variant_files:
        parser.error("Need somatic and/or germline variants (-s / -g).")

    # Enter the output directory before configuring logging, so a missing
    # or omitted directory falls back to the current one instead of failing
    # while the log file is opened.
    if args.output_dir is None:
        directory_note = "Using current data directory."
    else:
        try:
            os.chdir(args.output_dir)
        except OSError:
            directory_note = "No such directory, using current."
        else:
            directory_note = "Using provided data directory: {}".format(
                str(args.output_dir))

    logging.basicConfig(
        filename='{}_indproteinsDB.log'.format(args.identifier),
        filemode='w+',
        level=logging.DEBUG)
    logging.info("Starting generation of protein sequences at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
    logging.info(directory_note)

    # start the actual IRMA functions
    #references = {'GRCh37': 'http://grch37.ensembl.org', 'GRCh38': 'http://ensembl.org'}
    references = {
        'GRCh37': 'http://feb2014.archive.ensembl.org',
        'GRCh38': 'http://dec2016.archive.ensembl.org'
    }

    # read in variants
    vl = []
    for path in variant_files:
        variants_read = _read_variants(path)
        if variants_read is None:
            m = "Could not read variants {f}, aborting.".format(f=path)
            logging.error(m)
            print(m)
            sys.exit(1)
        vl.extend(variants_read)

    # initialize MartsAdapter, GRCh37 or GRCh38 based
    ma = MartsAdapter(biomart=references[args.reference])

    #generate transcripts containing variants, filter for unmutated sequences
    transcripts = [
        t for t in generator.generate_transcripts_from_variants(
            vl, ma, ID_SYSTEM_USED) if t.vars
    ]
    #generate proteins from transcripts, table='Standard', stop_symbol='*', to_stop=True, cds=False
    proteins = generator.generate_proteins_from_transcripts(transcripts)

    out_ref = args.database.split('/')[-1].replace(
        '.fasta',
        '_{}_individualized_protein_DB.fasta'.format(args.identifier))

    # Pass the arguments as a list so paths containing whitespace survive,
    # and stop if the copy failed: appending below would otherwise create a
    # database that lacks the reference sequences.
    if subprocess.call(['cp', args.database, out_ref]) != 0:
        m = "Could not copy reference database {f} to {o}, aborting.".format(
            f=args.database, o=out_ref)
        logging.error(m)
        print(m)
        sys.exit(1)

    with open(out_ref, 'a') as outfile:
        for p in proteins:
            # all variants of this protein, flattened over the keys of p.vars
            protein_variants = itertools.chain.from_iterable(
                p.vars[key] for key in p.vars)
            # distinct coding annotations across those variants
            codings = set(
                itertools.chain.from_iterable(
                    v.coding.values() for v in protein_variants))
            aas = ','.join([c.aaMutationSyntax for c in codings])

            outfile.write('>{}:{}\n'.format(p.transcript_id, aas))
            outfile.write('{}\n'.format(str(p)))

    logging.info("Finished generation of protein sequences at " +
                 str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))