def run(cancer_types, long_names, out_dir):
    """Write one metrics-reference Dataset JSON document per cancer type.

    Args:
        cancer_types: iterable of cancer acronyms; one file is written for
            each of them.
        long_names: mapping from cancer acronym to the full name inserted in
            the "name" and "description" fields.
        out_dir: output location. It is concatenated with the file name as
            is, so it has to end with a path separator already.

    Files are named ``Dataset_Metrics_Ref_<cancer>_<id>.json``; ``<id>`` is
    requested from ``id_generator.IDGenerator.get_new_OEB_id`` with the seed
    "000007S".
    """
    generator = id_generator.IDGenerator()
    previous_id = "000007S"

    for cancer in cancer_types:
        # the generated id is only used in the output file name
        data_id, previous_id = generator.get_new_OEB_id(
            "002", "D", previous_id)

        challenge_ref = "TCGA:2018-04-05_" + cancer
        full_name = long_names[cancer]

        datalink = {
            "uri": "https://portal.gdc.cancer.gov/",
            "attrs": ["archive"],
            "validation_date": "2018-04-05T00:00:00Z",
            "status": "ok",
        }
        dates = {
            "creation": "2018-04-05T00:00:00Z",
            "modification": "2018-04-05T14:00:00Z",
        }
        # the only parent of a reference dataset is the common input dataset
        parents = {
            "rel_dataset_ids": [{"dataset_id": "TCGA:2018-04-05_input"}],
        }

        # key order is irrelevant: the dump below sorts the keys
        info = {
            "_id": challenge_ref + "_M",
            "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Dataset",
            "type": "metrics_reference",
            "visibility": "public",
            "version": "unknown",
            "name": "Metrics Reference Dataset for " + full_name,
            "description": "List of genes (described by TCGA community) that can be used as 'gold standard' in "
                           + full_name + " benchmark ",
            "challenge_ids": [challenge_ref],
            "community_ids": ["OEBC001"],
            "dataset_contact_ids": ["Matthew.Bailey", "Eduard.Porta", "Collin.Tokheim"],
            "datalink": datalink,
            "dates": dates,
            "depends_on": parents,
        }

        filename = "Dataset_Metrics_Ref_" + cancer + "_" + data_id + ".json"
        with open(out_dir + filename, 'w') as handle:
            json.dump(info, handle, sort_keys=True, indent=4,
                      separators=(',', ': '))
def run(cancer_types, mongo_ids, out_dir,
        participants_dir="/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/participants"):
    """Write one TestEvent JSON document per (cancer type, participant) pair.

    Args:
        cancer_types: iterable of cancer acronyms.
        mongo_ids: mapping from participant name to an already known tool id;
            participants missing from it get a temporary id generated.
        out_dir: output location. It is concatenated with the file name as
            is, so it has to end with a path separator already.
        participants_dir: directory whose entries are the participant names.
            Defaults to the path that used to be hard-coded here, so existing
            callers are unaffected.

    Files are named ``TestEvent_<cancer>_<participant>_<id>.json``; ``<id>``
    comes from ``id_generator.IDGenerator.get_new_OEB_id``.
    """
    last_challenge = "0000000"
    last_test_event = "0000000"
    last_participant_dataset = "0000000"
    last_tool = "0000008"

    generator = id_generator.IDGenerator()

    for cancer in cancer_types:

        # NOTE: challenge_id, tool_id and participant_data_id are not used in
        # the emitted document. The calls are kept because the internals of
        # IDGenerator are not visible from here.
        challenge_id, last_challenge = generator.get_new_OEB_id(
            "002", "X", last_challenge)

        challenge_ref = "TCGA:2018-04-05_" + cancer

        for participant in os.listdir(participants_dir):

            # if participant is not in mongo, assign a new temporary id
            if participant in mongo_ids:
                tool_id = mongo_ids[participant]
            else:
                tool_id, last_tool = generator.get_new_OEB_id(
                    "002", "T", last_tool)

            # test event id: the only generated id that reaches the output
            # (it is part of the file name)
            event_id, last_test_event = generator.get_new_OEB_id(
                "002", "A", last_test_event)

            participant_data_id, last_participant_dataset = generator.get_new_OEB_id(
                "002", "D", last_participant_dataset)

            info = {
                "_id": challenge_ref + "_testEvent_" + participant,
                "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/TestAction",
                "action_type": "TestEvent",
                "tool_id": "TCGA:" + participant,
                "involved_datasets": [
                    {
                        "dataset_id": "TCGA:2018-04-05_input",
                        "role": "incoming",
                    },
                    {
                        "dataset_id": challenge_ref + "_P_" + participant,
                        "role": "outgoing",
                    },
                ],
                "challenge_id": challenge_ref,
                "dates": {
                    "creation": "2018-04-05T00:00:00Z",
                    "reception": "2018-04-05T00:00:00Z",
                },
                "test_contact_ids": ["Matthew.Bailey", "Eduard.Porta", "Collin.Tokheim"],
            }

            filename = "TestEvent_" + cancer + "_" + participant + "_" + event_id + ".json"

            with open(out_dir + filename, 'w') as f:
                json.dump(info,
                          f,
                          sort_keys=True,
                          indent=4,
                          separators=(',', ': '))
def _module_fallback(name, value):
    """Return ``value`` unless it is None, else the module-level object ``name``.

    ``get_metrics_across_all_cancers`` used to read some of its inputs as
    implicit globals. This keeps that working for callers that do not pass
    them, and raises the same NameError when the global does not exist.
    """
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        raise NameError("name '" + name + "' is not defined")


def _write_assessment_dataset(cancer, participant, long_name, contact, value,
                              metric_tag, metric_title, metric_label,
                              data_id, out_dir):
    """Write one assessment Dataset JSON document and print its file name.

    Args:
        cancer: cancer acronym (or "ALL") used in ids and in the file name.
        participant: participant name used in ids and in the file name.
        long_name: text appended to the description after "predictions in ".
        contact: single entry stored in "dataset_contact_ids".
        value: metric value stored under datalink/inline_data/value.
        metric_tag: short metric key ("TPR" or "precision") used in the
            document id, the metrics id and the file name.
        metric_title: quoted metric name used in the description.
        metric_label: metric label used in the "name" field.
        data_id: generated dataset id, used only in the file name.
        out_dir: output location, concatenated with the file name as is.
    """
    challenge_ref = "TCGA:2018-04-05_" + cancer

    info = {
        "_id": challenge_ref + "_A_" + metric_tag + "_" + participant,
        "description": "Assessment dataset for applying Metric '" + metric_title
                       + "' to " + participant + " predictions in " + long_name,
        "dates": {
            "creation": "2018-04-05T00:00:00Z",
            "modification": "2018-04-05T14:00:00Z",
        },
        "type": "assessment",
        "visibility": "public",
        "datalink": {
            "inline_data": {"value": value},
        },
        "depends_on": {
            "tool_id": "TCGA:" + participant,
            "metrics_id": "TCGA:" + metric_tag,
            "rel_dataset_ids": [
                {"dataset_id": challenge_ref + "_P_" + participant},
                {"dataset_id": challenge_ref + "_M"},
            ],
        },
        "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Dataset",
        "community_ids": ["OEBC001"],
        "challenge_ids": [challenge_ref],
        "version": "1",
        "name": "Assesment of Metric " + metric_label + " in " + participant,
        "dataset_contact_ids": [contact],
    }

    filename = ("Dataset_assessment_" + cancer + "_" + participant + "_"
                + metric_tag + "_" + data_id + ".json")
    # print() with a single argument behaves the same on Python 2 and 3
    print(filename)

    with open(out_dir + filename, 'w') as f:
        json.dump(info, f, sort_keys=True, indent=4, separators=(',', ': '))


def run(cancer_types, long_names, mongo_tool_ids, tool_contact, mongo_datRef_ids, out_dir,
        input_dir="/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/"):
    """Write the TPR and precision assessment datasets for every cancer type.

    For each cancer type the gold standard is read from
    ``input_dir + <cancer> + ".txt"`` (first column, "#" lines are comments),
    ``compute_metrics`` is called, and two assessment documents are written
    per participant. Afterwards ``get_metrics_across_all_cancers`` writes the
    same pair of documents for the combined "ALL" challenge.

    Args:
        cancer_types: iterable of cancer acronyms.
        long_names: mapping from cancer acronym to full name; the "ALL" entry
            is needed by the final combined step.
        mongo_tool_ids: mapping from participant name to a known tool id.
        tool_contact: mapping from participant name to its contact id.
        mongo_datRef_ids: accepted for interface compatibility; not used.
        out_dir: output location. It is concatenated with the file name as
            is, so it has to end with a path separator already.
        input_dir: directory (with trailing separator) holding the
            ``<cancer>.txt`` files and the ``participants`` sub-directory.
            Defaults to the path that used to be hard-coded here.
    """
    participants_dir = input_dir + "participants"

    # accumulates, per participant, the genes predicted over all cancer types
    # (filled in by compute_metrics)
    all_cancer_genes = {}
    for participant in os.listdir(participants_dir):
        all_cancer_genes[participant] = []

    last_challenge = "0000000"
    last_participant_dataset = "0000000"
    last_ref_dataset = "000007S"
    last_tool = "0000008"
    last_assessment_dataset = "000008R"

    generator = id_generator.IDGenerator()

    for cancer in cancer_types:

        # NOTE: challenge_id, ref_data_id, tool_id and participant_data_id are
        # not used in the emitted documents. The calls are kept so the id
        # sequences (some of which are handed on below) stay as they were.
        challenge_id, last_challenge = generator.get_new_OEB_id("002", "X", last_challenge)
        ref_data_id, last_ref_dataset = generator.get_new_OEB_id("002", "D", last_ref_dataset)

        data = pandas.read_csv(input_dir + cancer + ".txt",
                               comment="#", header=None)
        gold_standard = data.iloc[:, 0].values

        participants_datasets, all_cancer_genes = compute_metrics(
            input_dir, gold_standard, cancer, all_cancer_genes)

        for participant in os.listdir(participants_dir):

            # if participant is not in mongo, assign a new temporary id
            if participant in mongo_tool_ids:
                tool_id = mongo_tool_ids[participant]
            else:
                tool_id, last_tool = generator.get_new_OEB_id("002", "T", last_tool)

            participant_data_id, last_participant_dataset = generator.get_new_OEB_id(
                "002", "D", last_participant_dataset)

            # compute_metrics result: index 0 is written as TPR, index 1 as precision
            tpr_value = participants_datasets[participant][0]
            precision_value = participants_datasets[participant][1]

            # metric 1: true positive rate
            tpr_data_id, last_assessment_dataset = generator.get_new_OEB_id(
                "002", "D", last_assessment_dataset)
            _write_assessment_dataset(
                cancer, participant, long_names[cancer], tool_contact[participant],
                tpr_value, "TPR", "True Positive Rate", "TPR",
                tpr_data_id, out_dir)

            # metric 2: precision / positive predictive value
            precision_data_id, last_assessment_dataset = generator.get_new_OEB_id(
                "002", "D", last_assessment_dataset)
            _write_assessment_dataset(
                cancer, participant, long_names[cancer], tool_contact[participant],
                precision_value, "precision", "Positive Predictive Value", "precision-PPV",
                precision_data_id, out_dir)

    # pass the lookup tables explicitly instead of relying on module globals
    get_metrics_across_all_cancers(
        all_cancer_genes, last_assessment_dataset, last_participant_dataset, last_tool, out_dir,
        long_names=long_names, mongo_tool_ids=mongo_tool_ids, tool_contact=tool_contact,
        input_dir=input_dir)


def get_metrics_across_all_cancers(all_cancer_genes, last_assessment_dataset, last_participant_dataset, last_tool, out_dir,
                                   long_names=None, mongo_tool_ids=None, tool_contact=None,
                                   input_dir="/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/"):
    """Write TPR and precision assessment datasets for the combined "ALL" challenge.

    The gold standard is the first column of ``input_dir + "ALL.txt"``. For
    every participant the genes collected in ``all_cancer_genes`` are compared
    against it:

    * TPR = |predicted intersect gold| / |gold|
    * precision = |predicted intersect gold| / |predicted| (0 when nothing
      was predicted)

    Args:
        all_cancer_genes: mapping from participant name to the collection of
            genes it predicted over all cancer types.
        last_assessment_dataset, last_participant_dataset, last_tool: last
            ids handed out so far; id generation continues from them.
        out_dir: output location, concatenated with the file name as is.
        long_names, mongo_tool_ids, tool_contact: lookup tables as in
            ``run``. When omitted, the module-level objects of the same name
            are used (the former implicit behaviour); NameError is raised if
            those do not exist.
        input_dir: directory (with trailing separator) holding ``ALL.txt``
            and the ``participants`` sub-directory.
    """
    long_names = _module_fallback("long_names", long_names)
    mongo_tool_ids = _module_fallback("mongo_tool_ids", mongo_tool_ids)
    tool_contact = _module_fallback("tool_contact", tool_contact)

    generator = id_generator.IDGenerator()

    data = pandas.read_csv(input_dir + "ALL.txt",
                           comment="#", header=None)
    gold_standard = data.iloc[:, 0].values

    cancer = "ALL"

    for participant in os.listdir(input_dir + "participants"):

        predicted_genes = all_cancer_genes[participant]
        overlapping_genes = set(predicted_genes).intersection(gold_standard)

        # float() forces true division; on Python 2 a plain "/" between two
        # ints would truncate every ratio to 0 or 1
        tpr_value = float(len(overlapping_genes)) / len(gold_standard)

        if len(predicted_genes) == 0:
            precision_value = 0
        else:
            precision_value = float(len(overlapping_genes)) / len(predicted_genes)

        # NOTE: tool_id and participant_data_id are not used in the emitted
        # documents. The calls are kept because the internals of IDGenerator
        # are not visible from here.
        if participant in mongo_tool_ids:
            tool_id = mongo_tool_ids[participant]
        else:
            tool_id, last_tool = generator.get_new_OEB_id("002", "T", last_tool)

        participant_data_id, last_participant_dataset = generator.get_new_OEB_id(
            "002", "D", last_participant_dataset)

        # metric 1: true positive rate
        tpr_data_id, last_assessment_dataset = generator.get_new_OEB_id(
            "002", "D", last_assessment_dataset)
        _write_assessment_dataset(
            cancer, participant, long_names[cancer], tool_contact[participant],
            tpr_value, "TPR", "True Positive Rate", "TPR",
            tpr_data_id, out_dir)

        # metric 2: precision / positive predictive value
        precision_data_id, last_assessment_dataset = generator.get_new_OEB_id(
            "002", "D", last_assessment_dataset)
        _write_assessment_dataset(
            cancer, participant, long_names[cancer], tool_contact[participant],
            precision_value, "precision", "Positive Predictive Value", "precision-PPV",
            precision_data_id, out_dir)
def run(cancer_types, long_names, urls, out_dir):
    """Write one Challenge JSON document per cancer type.

    Args:
        cancer_types: iterable of cancer acronyms; one file is written for
            each of them.
        long_names: mapping from cancer acronym to the full name used in the
            "name" field and in the metrics category descriptions.
        urls: mapping from cancer acronym to the value of the "url" field.
        out_dir: output location. It is concatenated with the file name as
            is, so it has to end with a path separator already.

    Files are named ``Challenge_<cancer>_<id>.json``; ``<id>`` is requested
    from ``id_generator.IDGenerator.get_new_OEB_id`` with the seed "0000000".
    """
    generator = id_generator.IDGenerator()
    previous_id = "0000000"

    for cancer in cancer_types:
        # the generated id is only used in the output file name
        challenge_id, previous_id = generator.get_new_OEB_id(
            "002", "X", previous_id)

        full_name = long_names[cancer]
        challenge_url = urls[cancer]

        assessment_category = {
            "category": "assessment",
            "description": "metrics used to benchmark the performance of cancer genes predictors in Challenge "
                           + full_name + ", generating the assessment datatseta",
            "metrics": [
                {"metrics_id": "TCGA:TPR", "tool_id": "TCGA:compute_TPR"},
                {"metrics_id": "TCGA:precision", "tool_id": "TCGA:compute_precision"},
            ],
        }
        aggregation_category = {
            "category": "aggregation",
            "description": "metrics used to aggregate the assessment data of all cancer genes predictors participating in challenge "
                           + full_name + " in a consolidated Aggregation dataset",
            "metrics": [
                {"metrics_id": "TCGA:aggregation", "tool_id": "TCGA:aggregate_benchmark"},
            ],
        }

        # NOTE(review): benchmark_stop is earlier than benchmark_start; the
        # values are reproduced unchanged - confirm against the event data.
        dates = {
            "creation": "2018-04-05T00:00:00Z",
            "modification": "2018-04-05T14:00:00Z",
            "benchmark_start": "2018-04-05T05:00:00Z",
            "benchmark_stop": "2018-04-05T02:00:00Z",
        }

        # key order is irrelevant: the dump below sorts the keys
        info = {
            "_id": "TCGA:2018-04-05_" + cancer,
            "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Challenge",
            "acronym": cancer,
            "name": "Cancer Driver Genes Prediction Benchmark in " + full_name,
            "benchmarking_event_id": "TCGA:2018-04-05",
            "is_automated": False,
            "dates": dates,
            "metrics_categories": [assessment_category, aggregation_category],
            "url": challenge_url,
            "challenge_contact_ids": ["Matthew.Bailey", "Eduard.Porta", "Collin.Tokheim"],
            "references": ["doi:10.1016/j.cell.2018.02.060"],
        }

        filename = "Challenge_" + cancer + "_" + challenge_id + ".json"
        with open(out_dir + filename, 'w') as handle:
            json.dump(info, handle, sort_keys=True, indent=4,
                      separators=(',', ': '))
def _read_inline_value(path):
    """Return datalink/inline_data/value from the JSON document at ``path``."""
    with io.open(path, mode='r', encoding="utf-8") as f:
        return json.load(f)["datalink"]["inline_data"]["value"]


def run(cancer_types, long_names, mongo_tool_ids, tool_contact, out_dir,
        participants_dir="/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/participants",
        assessment_dir="out/assessment_datasets/"):
    """Write one aggregation Dataset JSON document per cancer type.

    For every participant the previously written TPR and precision
    assessment documents are read back from ``assessment_dir`` and their
    values collected as ``metric_x`` / ``metric_y``. The file names of those
    documents contain generated dataset ids, so this function regenerates the
    same id sequence (seed "000008R", two ids per participant, participants
    in ``os.listdir`` order) to find them.

    Args:
        cancer_types: iterable of cancer acronyms.
        long_names: mapping from cancer acronym to full name.
        mongo_tool_ids: mapping from participant name to a known tool id.
        tool_contact: accepted for interface compatibility; not used.
        out_dir: output location. It is concatenated with the file name as
            is, so it has to end with a path separator already.
        participants_dir: directory whose entries are the participant names.
            Defaults to the path that used to be hard-coded here.
        assessment_dir: location (with trailing separator) of the assessment
            documents to read. Defaults to the relative path that used to be
            hard-coded here.
    """
    last_challenge = "0000000"
    last_tool = "0000008"
    last_assessment_dataset = "000008R"
    last_challenge_dataset = "00000OB"
    last_ref_dataset = "000007S"

    generator = id_generator.IDGenerator()

    for cancer in cancer_types:

        # NOTE: challenge_id, ref_data_id and tool_id are not used in the
        # emitted document. The calls are kept because the internals of
        # IDGenerator are not visible from here.
        challenge_id, last_challenge = generator.get_new_OEB_id("002", "X", last_challenge)
        ref_data_id, last_ref_dataset = generator.get_new_OEB_id("002", "D", last_ref_dataset)

        # id of the aggregation dataset itself; only used in the file name
        challenge_data_id, last_challenge_dataset = generator.get_new_OEB_id(
            "002", "D", last_challenge_dataset)

        challenge_ref = "TCGA:2018-04-05_" + cancer

        # datasets this aggregation depends on, and the per-participant values
        involved_datasets = []
        inline_data = {
            "visualization": {
                "type": "2D-plot",
                "x_axis": "OEBM0020000002",
                "y_axis": "OEBM0020000001",
            },
            "challenge_participants": [],
        }

        for participant in os.listdir(participants_dir):

            # if participant is not in mongo, assign a new temporary id
            if participant in mongo_tool_ids:
                tool_id = mongo_tool_ids[participant]
            else:
                tool_id, last_tool = generator.get_new_OEB_id("002", "T", last_tool)

            # regenerate the ids embedded in the assessment file names
            tpr_data_id, last_assessment_dataset = generator.get_new_OEB_id(
                "002", "D", last_assessment_dataset)
            precision_data_id, last_assessment_dataset = generator.get_new_OEB_id(
                "002", "D", last_assessment_dataset)

            file_stem = assessment_dir + "Dataset_assessment_" + cancer + "_" + participant
            metric1 = _read_inline_value(file_stem + "_TPR_" + tpr_data_id + ".json")
            metric2 = _read_inline_value(file_stem + "_precision_" + precision_data_id + ".json")

            inline_data["challenge_participants"].append({
                "tool_id": "TCGA:" + participant,
                "metric_x": metric1,
                "metric_y": metric2,
            })

            involved_datasets.append({
                "dataset_id": challenge_ref + "_A_TPR_" + participant,
            })
            involved_datasets.append({
                "dataset_id": challenge_ref + "_A_precision_" + participant,
            })

        # append input and reference datasets
        involved_datasets.append({"dataset_id": "TCGA:2018-04-05_input"})
        involved_datasets.append({"dataset_id": challenge_ref + "_M"})

        info = {
            "_id": challenge_ref + "_Aggregation",
            "datalink": {
                "inline_data": inline_data,
                "schema_url": "https://raw.githubusercontent.com/inab/OpenEBench_scientific_visualizer/js/benchmarking_data_model/inline_data_visualizer.json",
            },
            "type": "aggregation",
            "visibility": "public",
            "version": "unknown",
            "name": "Summary dataset for challenge: " + long_names[cancer],
            # The description used to end with " in participant <name>",
            # where <name> was whatever the loop above iterated last (and a
            # NameError when the directory was empty). An aggregation covers
            # all participants, so that suffix is dropped.
            "description": "Summary dataset with information about challenge " + long_names[cancer]
                           + " (e.g. input/output datasets, metrics...)",
            "dates": {
                "creation": "2018-04-05T00:00:00Z",
                "modification": "2018-04-05T14:00:00Z",
            },
            "depends_on": {
                "tool_id": "TCGA:aggregate_benchmark",
                "rel_dataset_ids": involved_datasets,
            },
            "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Dataset",
            "community_ids": ["OEBC001"],
            "challenge_ids": [challenge_ref],
            "dataset_contact_ids": [
                "Eduard.Porta",
                "Matthew.Bailey",
                "Collin.Tokheim",
                "Loris.Mularoni",
                "Juri.Reimand",
                "David.Tamborero",
                "Nathan.Dees",
            ],
        }

        filename = "Dataset_Aggregation_" + cancer + "_" + challenge_data_id + ".json"

        with open(out_dir + filename, 'w') as f:
            json.dump(info, f, sort_keys=True, indent=4, separators=(',', ': '))
# Example #7
# 0
def run(cancer_types, long_names, mongo_ids, tool_contact, download_urls,
        out_dir,
        participants_dir="/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/participants"):
    """Write one participant Dataset JSON document per (cancer type, participant).

    Args:
        cancer_types: iterable of cancer acronyms.
        long_names: mapping from cancer acronym to full name.
        mongo_ids: mapping from participant name to an already known tool id;
            participants missing from it get a temporary id generated.
        tool_contact: mapping from participant name to its contact id.
        download_urls: mapping from participant name to the datalink URI.
        out_dir: output location. It is concatenated with the file name as
            is, so it has to end with a path separator already.
        participants_dir: directory whose entries are the participant names.
            Defaults to the path that used to be hard-coded here, so existing
            callers are unaffected.

    Files are named ``Dataset_participant_<cancer>_<participant>_<id>.json``;
    ``<id>`` comes from ``id_generator.IDGenerator.get_new_OEB_id``.
    """
    last_challenge = "0000000"
    last_participant_dataset = "0000000"
    last_tool = "0000008"

    generator = id_generator.IDGenerator()

    for cancer in cancer_types:

        # NOTE: challenge_id and tool_id are not used in the emitted
        # document. The calls are kept because the internals of IDGenerator
        # are not visible from here.
        challenge_id, last_challenge = generator.get_new_OEB_id(
            "002", "X", last_challenge)

        challenge_ref = "TCGA:2018-04-05_" + cancer

        for participant in os.listdir(participants_dir):

            # if participant is not in mongo, assign a new temporary id
            if participant in mongo_ids:
                tool_id = mongo_ids[participant]
            else:
                tool_id, last_tool = generator.get_new_OEB_id(
                    "002", "T", last_tool)

            # participant dataset id; only used in the output file name
            participant_data_id, last_participant_dataset = generator.get_new_OEB_id(
                "002", "D", last_participant_dataset)

            info = {
                "_id": challenge_ref + "_P_" + participant,
                "name": "Cancer Driver Genes in " + long_names[cancer],
                "description": "List of Cancer Driver Genes predicted by tool "
                               + participant + " in " + long_names[cancer],
                "dates": {
                    "creation": "2018-04-05T00:00:00Z",
                    "modification": "2018-04-05T14:00:00Z",
                },
                "datalink": {
                    "uri": download_urls[participant],
                    "attrs": ["archive"],
                    "validation_date": "2018-04-05T00:00:00Z",
                    "status": "ok",
                },
                "type": "participant",
                "visibility": "public",
                "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/Dataset",
                "community_ids": ["OEBC001"],
                "challenge_ids": [challenge_ref],
                "depends_on": {
                    "tool_id": "TCGA:" + participant,
                    "rel_dataset_ids": [{
                        "dataset_id": "TCGA:2018-04-05_input",
                    }],
                },
                "version": "unknown",
                "dataset_contact_ids": [tool_contact[participant]],
            }

            filename = "Dataset_participant_" + cancer + "_" + participant + "_" + participant_data_id + ".json"

            with open(out_dir + filename, 'w') as f:
                json.dump(info,
                          f,
                          sort_keys=True,
                          indent=4,
                          separators=(',', ': '))
# Example #8
# 0
def run(cancer_types, out_dir,
        participants_dir="/home/jgarrayo/PycharmProjects/TCGA_benchmark/input/participants"):
    """Write one AggregationEvent JSON document per cancer type.

    Each document lists the TPR and precision assessment datasets of every
    participant as "incoming" and the challenge's aggregation dataset as
    "outgoing".

    Args:
        cancer_types: iterable of cancer acronyms; one file is written for
            each of them.
        out_dir: output location. It is concatenated with the file name as
            is, so it has to end with a path separator already.
        participants_dir: directory whose entries are the participant names.
            Defaults to the path that used to be hard-coded here, so existing
            callers are unaffected.

    Files are named ``AggregationEvent_<cancer>_<id>.json``; ``<id>`` comes
    from ``id_generator.IDGenerator.get_new_OEB_id`` with the seed "00000NC".
    """
    last_challenge = "0000000"
    last_event = "00000NC"

    generator = id_generator.IDGenerator()

    for cancer in cancer_types:

        # NOTE: challenge_id is not used in the emitted document. The call is
        # kept because the internals of IDGenerator are not visible from here.
        challenge_id, last_challenge = generator.get_new_OEB_id(
            "002", "X", last_challenge)

        # aggregation event id; only used in the output file name
        event_id, last_event = generator.get_new_OEB_id(
            "002", "A", last_event)

        challenge_ref = "TCGA:2018-04-05_" + cancer

        # all incoming assessment datasets, followed by the outgoing
        # aggregation dataset
        involved_datasets = []

        for participant in os.listdir(participants_dir):
            involved_datasets.append({
                "dataset_id": challenge_ref + "_A_TPR_" + participant,
                "role": "incoming",
            })
            involved_datasets.append({
                "dataset_id": challenge_ref + "_A_precision_" + participant,
                "role": "incoming",
            })

        involved_datasets.append({
            "dataset_id": challenge_ref + "_Aggregation",
            "role": "outgoing",
        })

        info = {
            "_id": challenge_ref + "_do_aggregation",
            "_schema": "https://www.elixir-europe.org/excelerate/WP2/json-schemas/1.0/TestAction",
            "tool_id": "TCGA:aggregate_benchmark",
            "action_type": "AggregationEvent",
            "involved_datasets": involved_datasets,
            "challenge_id": challenge_ref,
            "dates": {
                "creation": "2018-04-05T00:00:00Z",
                "reception": "2018-04-05T00:00:00Z",
            },
            "test_contact_ids": ["Matthew.Bailey", "Eduard.Porta", "Collin.Tokheim"],
        }

        filename = "AggregationEvent_" + cancer + "_" + event_id + ".json"

        with open(out_dir + filename, 'w') as f:
            json.dump(info,
                      f,
                      sort_keys=True,
                      indent=4,
                      separators=(',', ': '))