Пример #1
0
def main():
    '''
    Command-line entry point for sample retraction.

    Reads the --pemFile, --test and --debug options, logs in to Synapse
    through process_functions.synLogin and hands the session plus the
    test flag to retract.
    '''
    cli = argparse.ArgumentParser(description='Sample retraction')
    cli.add_argument(
        "--pemFile", type=str, help="Path to PEM file (genie.pem)")
    cli.add_argument("--test", help="Run test", action='store_true')
    cli.add_argument(
        "--debug", help="Synapse Debug Feature", action='store_true')
    options = cli.parse_args()

    synapse = process_functions.synLogin(options.pemFile,
                                         debug=options.debug)
    retract(synapse, options.test)
Пример #2
0
def main():
    """Command-line entry point for writing invalid reasons.

    Parses --pemFile / --debug, logs in to Synapse, loads the rows of the
    center mapping table that have an input folder and are flagged for
    release, and passes them to write_invalid_reasons together with the
    error tracker table id.
    """
    cli = argparse.ArgumentParser(description='Write invalid reasons')
    cli.add_argument(
        "--pemFile", type=str, help="Path to PEM file (genie.pem)")
    cli.add_argument(
        "--debug", help="Synapse Debug Feature", action='store_true')
    options = cli.parse_args()

    synapse = process_functions.synLogin(options.pemFile,
                                         debug=options.debug)

    # Restrict to centers with an input folder that are marked for release
    releasable_centers_df = synapse.tableQuery(
        'SELECT * FROM syn10061452 where inputSynId is not null and release is true'
    ).asDataFrame()

    write_invalid_reasons(synapse, releasable_centers_df, "syn10153306")
def main():
    """Command-line entry point for updating the dashboard tables.

    Parses the release / login / environment options, logs in to Synapse,
    loads the database-to-Synapse-id mapping table (staging or production)
    and calls run_dashboard with it.
    """
    cli = argparse.ArgumentParser(description="Update dashboard tables")
    cli.add_argument(
        "--release",
        default=None,
        help="GENIE release number (ie. 5.3-consortium)")
    cli.add_argument(
        "--pem_file", type=str, help="Path to PEM file (genie.pem)")
    cli.add_argument(
        "--staging", help="Using staging directory files",
        action="store_true")
    cli.add_argument(
        "--debug", help="Synapse debugging flag", action="store_true")
    cli.add_argument(
        "--public", help="Set true if releasing public release",
        action="store_true")
    options = cli.parse_args()

    # NOTE(review): the whole namespace is handed to synLogin here; confirm
    # that this matches the synLogin signature in use for this script.
    synapse = process_functions.synLogin(options)

    # Database to Synapse Id mapping table: staging copy vs. production
    mapping_table_synid = ("syn12094210" if options.staging
                           else "syn10967259")
    mapping_df = synapse.tableQuery(
        "select * from %s" % mapping_table_synid).asDataFrame()

    run_dashboard(synapse,
                  mapping_df,
                  options.release,
                  staging=options.staging,
                  public=options.public)
Пример #4
0
def main():
    """Command-line entry point: validate / process GENIE center inputs.

    Workflow:
      1. Parse the command-line options and log in to Synapse.
      2. For the vcf / maf / mafSP processes (unless --onlyValidate),
         check that the vcf2maf, VEP and VEP data paths exist.
      3. Load the database-to-Synapse-id mapping table (test or production)
         and the center mapping table.
      4. Pick the centers to run: the one given by --center, or every
         center with an input folder that is flagged for release.
      5. Resolve and check the oncotree link.
      6. Set the 'isProcessing' annotation on the center mapping table,
         run input_to_database for each center, then reset the annotation.
      7. When no center was specified and --onlyValidate is set, write the
         invalid reasons to the center staging directories.

    Raises:
        AssertionError: if a required tool path does not exist or --center
            is not one of the centers in the center mapping table.
        Exception: if the 'isProcessing' annotation is 'True' (or absent,
            which is treated the same way).
    """
    parser = argparse.ArgumentParser(
        description='GENIE center inputs to database')
    parser.add_argument("process",
                        choices=['vcf', 'maf', 'main', 'mafSP'],
                        help='Process vcf, maf or the rest of the files')
    parser.add_argument('--center', help='The centers')
    parser.add_argument("--pemFile",
                        type=str,
                        help="Path to PEM file (genie.pem)")
    parser.add_argument("--deleteOld",
                        action='store_true',
                        help="Delete all old processed and temp files")
    parser.add_argument("--onlyValidate",
                        action='store_true',
                        help="Only validate the files, don't process")
    parser.add_argument("--oncotreeLink",
                        type=str,
                        help="Link to oncotree code")
    parser.add_argument("--createNewMafDatabase",
                        action='store_true',
                        help="Creates a new maf database")
    parser.add_argument("--testing",
                        action='store_true',
                        help="Testing the infrastructure!")
    parser.add_argument("--debug",
                        action='store_true',
                        help="Add debug mode to synapse")
    parser.add_argument("--reference",
                        type=str,
                        help="Path to VCF reference file")

    # DEFAULT PARAMS
    parser.add_argument("--vcf2mafPath",
                        type=str,
                        help="Path to vcf2maf",
                        default="~/vcf2maf-1.6.14")
    parser.add_argument("--vepPath",
                        type=str,
                        help="Path to VEP",
                        default="~/vep")
    parser.add_argument("--vepData",
                        type=str,
                        help="Path to VEP data",
                        default="~/.vep")
    parser.add_argument('--thread',
                        type=int,
                        help="Number of threads to use for validation",
                        default=1)

    args = parser.parse_args()
    syn = process_functions.synLogin(args.pemFile, debug=args.debug)

    # The defaults start with "~", which os.path.exists() never expands
    # (only the shell does, and only for values typed on the command line).
    # Resolve the home directory once so the checks below and everything
    # downstream work with real paths.
    args.vcf2mafPath = os.path.expanduser(args.vcf2mafPath)
    args.vepPath = os.path.expanduser(args.vepPath)
    args.vepData = os.path.expanduser(args.vepData)

    # Must specify path to vcf2maf, VEP and VEP data if these process types
    # are requested
    if args.process in ['vcf', 'maf', 'mafSP'] and not args.onlyValidate:
        required_paths = (
            (args.vcf2mafPath, "Path to vcf2maf (--vcf2mafPath)"),
            (args.vepPath, "Path to VEP (--vepPath)"),
            (args.vepData, "Path to VEP data (--vepData)"),
        )
        for required_path, label in required_paths:
            assert os.path.exists(required_path), (
                "%s must be specified if "
                "`--process {vcf,maf,mafSP}` is used" % label)

    # Test and production use different mapping tables
    if args.testing:
        databaseToSynIdMapping = syn.tableQuery('SELECT * FROM syn11600968')
    else:
        databaseToSynIdMapping = syn.tableQuery('SELECT * FROM syn10967259')

    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

    center_mapping_id = process_functions.getDatabaseSynId(
        syn,
        "centerMapping",
        databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    center_mapping = syn.tableQuery('SELECT * FROM %s' % center_mapping_id)
    center_mapping_df = center_mapping.asDataFrame()

    if args.center is not None:
        assert args.center in center_mapping_df.center.tolist(
        ), "Must specify one of these centers: %s" % ", ".join(
            center_mapping_df.center)
        centers = [args.center]
    else:
        # Every center that has an input folder and is flagged for release
        center_mapping_df = center_mapping_df[~center_mapping_df['inputSynId'].
                                              isnull()]
        center_mapping_df = center_mapping_df[center_mapping_df['release'] ==
                                              True]
        centers = center_mapping_df.center

    if args.oncotreeLink is None:
        # Fall back to the link entity registered in the mapping table
        onco_link = databaseToSynIdMappingDf['Id'][
            databaseToSynIdMappingDf['Database'] == 'oncotreeLink'].values[0]
        onco_link_ent = syn.get(onco_link)
        args.oncotreeLink = onco_link_ent.externalURL
    # Check if you can connect to oncotree link, if not then don't run
    # validation / processing
    process_functions.checkUrl(args.oncotreeLink)

    # The 'isProcessing' annotation acts as a lock; a missing annotation
    # defaults to ['True'] and therefore also blocks the run.
    center_mapping_ent = syn.get(center_mapping_id)
    if center_mapping_ent.get('isProcessing', ['True'])[0] == 'True':
        raise Exception(
            "Processing/validation is currently happening.  Please change/add the 'isProcessing' annotation on %s to False to enable processing"
            % center_mapping_id)
    else:
        center_mapping_ent.isProcessing = "True"
        center_mapping_ent = syn.store(center_mapping_ent)
    # remove this query timeout and see what happens
    # syn.table_query_timeout = 50000

    # Create new maf database, should only happen once if its specified
    if args.createNewMafDatabase:
        createMafDatabase(syn, databaseToSynIdMappingDf, testing=args.testing)

    for center in centers:
        input_to_database(syn,
                          center,
                          args.process,
                          args.testing,
                          args.onlyValidate,
                          args.vcf2mafPath,
                          args.vepPath,
                          args.vepData,
                          databaseToSynIdMappingDf,
                          center_mapping_df,
                          reference=args.reference,
                          delete_old=args.deleteOld,
                          oncotree_link=args.oncotreeLink,
                          thread=args.thread)

    # Re-fetch to ensure that this is the new entity before releasing the
    # lock
    center_mapping_ent = syn.get(center_mapping_id)
    center_mapping_ent.isProcessing = "False"
    center_mapping_ent = syn.store(center_mapping_ent)

    error_tracker_synid = process_functions.getDatabaseSynId(
        syn, "errorTracker", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    # Only write out invalid reasons if the center isnt specified and if
    # only validate
    if args.center is None and args.onlyValidate:
        logging.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
        write_invalid_reasons.write_invalid_reasons(syn, center_mapping_df,
                                                    error_tracker_synid)
def _stamp_process_tracker(syn, tracker_synid, column):
    """Store the current time (epoch milliseconds, as a string) in ``column``
    of the SAGE / public row of the process tracker table."""
    tracker = syn.tableQuery(
        "SELECT %s FROM %s where center = 'SAGE' and "
        "processingType = 'public'" % (column, tracker_synid)
    )
    tracker_df = tracker.asDataFrame()
    # Assign through a single positional indexer on the frame itself.
    # Chained assignment (df[col].iloc[0] = ...) may write to a temporary
    # copy and is a no-op under pandas copy-on-write.
    tracker_df.iloc[0, tracker_df.columns.get_loc(column)] = str(
        int(time.time() * 1000)
    )
    syn.store(synapseclient.Table(tracker_synid, tracker_df))


def main(args):
    """Run the consortium-to-public release workflow.

    Steps: check parameters, log in to Synapse, load the database mapping
    table for the selected environment, record the processing start time,
    run the consortium-to-public conversion, revise metadata files, run the
    cBioPortal validator, create the link version, record the processing
    end time, update the dashboard (skipped in test mode) and generate and
    store the data guide.

    Args:
        args: Namespace-like object.  Attributes read here: cbioportalPath,
            test, staging, processingDate, pemFile, debug, genieVersion and
            publicReleaseCutOff.  ``args.genieVersion`` is overwritten with
            "TESTpublic" in test mode and ``args.releaseId`` is set from
            the public-to-consortium mapping.

    Raises:
        AssertionError: if the cBioPortal validator script is not found,
            if both test and staging are set, or if genieVersion is not a
            known public release.
        ValueError: if processingDate is not in ``%b-%Y`` format.
    """
    cbioValidatorPath = os.path.join(
        args.cbioportalPath, "core/src/main/scripts/importer/validateData.py"
    )
    assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath"
    assert not (
        args.test and args.staging
    ), "You can only specify --test or --staging, not both"
    try:
        processingDate = datetime.datetime.strptime(args.processingDate, "%b-%Y")
    except ValueError:
        raise ValueError(
            "Process date must be in the format " "abbreviated_month-YEAR ie. Oct-2017"
        )

    syn = process_functions.synLogin(args.pemFile, debug=args.debug)
    genie_user = os.environ.get("GENIE_USER")
    if args.pemFile is not None:
        genie_pass = process_functions.get_password(args.pemFile)
    else:
        genie_pass = None

    # Get all the possible public releases
    # Get configuration
    if args.test:
        databaseSynIdMappingId = "syn11600968"
        args.genieVersion = "TESTpublic"
    elif args.staging:
        databaseSynIdMappingId = "syn12094210"
    else:
        databaseSynIdMappingId = "syn10967259"
    databaseSynIdMapping = syn.tableQuery("select * from %s" % databaseSynIdMappingId)
    databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame()
    public_synid = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "public"
    ].values[0]

    releaseSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "release"
    ].values[0]

    officialPublic = consortium_to_public.get_public_to_consortium_synid_mapping(
        syn, releaseSynId, test=args.test
    )

    assert (
        args.genieVersion in officialPublic.keys()
    ), "genieVersion must be one of these: {}.".format(", ".join(officialPublic.keys()))

    args.releaseId = officialPublic[args.genieVersion]

    # The process tracker is only touched for real (non-test, non-staging)
    # runs
    update_tracker = not args.test and not args.staging
    if update_tracker:
        processTrackerSynId = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "processTracker"
        ].values[0]
        _stamp_process_tracker(syn, processTrackerSynId, "timeStartProcessing")

    caseListEntities, genePanelEntities = consortium_to_public.consortiumToPublic(
        syn,
        processingDate,
        args.genieVersion,
        args.releaseId,
        databaseSynIdMappingDf,
        publicReleaseCutOff=args.publicReleaseCutOff,
    )

    database_to_staging.revise_metadata_files(
        syn, args.staging, public_synid, args.genieVersion
    )

    logger.info("CBIO VALIDATION")
    # Must be exit 0 because the validator sometimes fails,
    # but we still want to capture the output
    command = [
        cbioValidatorPath,
        "-s",
        database_to_staging.GENIE_RELEASE_DIR,
        "-n",
        "; exit 0",
    ]
    cbio_output = subprocess.check_output(" ".join(command), shell=True)
    cbio_decoded_output = cbio_output.decode("utf-8")
    logger.info(cbio_decoded_output)
    if not args.test and not args.staging:
        log_folder_synid = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "logs"
        ].values[0]
        # The log is written to the working directory, uploaded, then removed
        cbio_log_file = "cbioValidatorLogsPublic_{}.txt".format(args.genieVersion)
        with open(cbio_log_file, "w") as cbioLog:
            cbioLog.write(cbio_decoded_output)
        syn.store(synapseclient.File(cbio_log_file, parentId=log_folder_synid))
        os.remove(cbio_log_file)
    logger.info("REMOVING OLD FILES")
    process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
    seg_meta_file = "{}/genie_public_meta_cna_hg19_seg.txt".format(
        database_to_staging.GENIE_RELEASE_DIR
    )
    if os.path.exists(seg_meta_file):
        os.unlink(seg_meta_file)

    logger.info("CREATING LINK VERSION")
    folders = database_to_staging.create_link_version(
        syn,
        args.genieVersion,
        caseListEntities,
        genePanelEntities,
        databaseSynIdMappingDf,
        release_type="public",
    )
    # Don't update process tracker if testing or staging
    if update_tracker:
        _stamp_process_tracker(syn, processTrackerSynId, "timeEndProcessing")

    if not args.test:
        logger.info("DASHBOARD UPDATE")
        dashboard_table_updater.run_dashboard(
            syn, databaseSynIdMappingDf, args.genieVersion, staging=args.staging
        )
        generate_dashboard_html(
            args.genieVersion,
            staging=args.staging,
            genie_user=genie_user,
            genie_pass=genie_pass,
        )
        logger.info("DASHBOARD UPDATE COMPLETE")

    # The data guide is generated for every run (test mode included), so
    # the announcement lives outside the dashboard-only branch.
    logger.info("AUTO GENERATE DATA GUIDE")
    onco_link = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "oncotreeLink"
    ].values[0]
    onco_link_ent = syn.get(onco_link)
    oncotree_link = onco_link_ent.externalURL
    # Take the text after the first "=" of the link as the oncotree version
    oncotree_version = oncotree_link.split("=")[1]

    data_guide_pdf = generate_data_guide(
        args.genieVersion,
        oncotree_version=oncotree_version,
        database_mapping=databaseSynIdMappingId,
        genie_user=genie_user,
        genie_pass=genie_pass,
    )
    data_guide_ent = synapseclient.File(
        data_guide_pdf, parent=folders["release_folder"]
    )
    syn.store(data_guide_ent)
    logger.info("COMPLETED CONSORTIUM TO PUBLIC")
Пример #6
0
def main(
    process,
    project_id,
    center=None,
    pemfile=None,
    delete_old=False,
    only_validate=False,
    oncotree_link=None,
    genie_annotation_pkg=None,
    create_new_maf_database=False,
    debug=False,
    format_registry=None,
):
    """Validate / process center input files into the GENIE database.

    Logs in to Synapse, reads the database mapping table referenced by the
    project's ``dbMapping`` annotation, selects the centers to run, checks
    the oncotree link, sets the 'isProcessing' annotation on the center
    mapping table while ``center_input_to_database`` runs for every
    selected center, resets it afterwards and, when no center was given and
    ``only_validate`` is set, writes the invalid reasons.

    Args:
        process: Process type, forwarded to center_input_to_database.
        project_id: Synapse id of the project holding the data; its
            ``dbMapping`` annotation points at the database mapping table.
        center: Single center to run; None runs every center that has an
            input folder and is flagged for release.
        pemfile: Path to the PEM file used for login.
        delete_old: Forwarded to center_input_to_database.
        only_validate: Forwarded to center_input_to_database; also gates
            the writing of invalid reasons.
        oncotree_link: Oncotree URL; None reads it from the mapping table.
        genie_annotation_pkg: Forwarded to center_input_to_database.
        create_new_maf_database: Create a new narrow maf table first.
        debug: Synapse debug flag.
        format_registry: Value handed to ``config.collect_format_types``
            (presumably the package name(s) to collect file formats from;
            confirm against the caller).  When None, the module-level
            ``args.format_registry_packages`` is used as before.

    Raises:
        AssertionError: if ``center`` is not in the center mapping table.
        Exception: if the 'isProcessing' annotation is 'True' or absent.
    """

    syn = process_functions.synLogin(pemfile, debug=debug)

    # Get the Synapse Project where data is stored
    # Should have annotations to find the table lookup
    project = syn.get(project_id)
    database_to_synid_mapping_synid = project.annotations.get("dbMapping", "")

    databaseToSynIdMapping = syn.tableQuery(
        "SELECT * FROM {}".format(database_to_synid_mapping_synid[0])
    )
    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

    center_mapping_id = process_functions.getDatabaseSynId(
        syn, "centerMapping", databaseToSynIdMappingDf=databaseToSynIdMappingDf
    )

    center_mapping = syn.tableQuery("SELECT * FROM %s" % center_mapping_id)
    center_mapping_df = center_mapping.asDataFrame()

    if center is not None:
        assert (
            center in center_mapping_df.center.tolist()
        ), "Must specify one of these centers: {}".format(
            ", ".join(center_mapping_df.center)
        )
        centers = [center]
    else:
        # Keep only centers with an input folder ...
        center_mapping_df = center_mapping_df[~center_mapping_df["inputSynId"].isnull()]
        # ... that are flagged for release (release is a bool column)
        center_mapping_df = center_mapping_df[center_mapping_df["release"]]
        centers = center_mapping_df.center

    if oncotree_link is None:
        onco_link = databaseToSynIdMappingDf["Id"][
            databaseToSynIdMappingDf["Database"] == "oncotreeLink"
        ].values[0]
        onco_link_ent = syn.get(onco_link)
        oncotree_link = onco_link_ent.externalURL
    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)

    # The 'isProcessing' annotation acts as a lock; a missing annotation
    # defaults to ["True"] and therefore also blocks the run.
    center_mapping_ent = syn.get(center_mapping_id)
    if center_mapping_ent.get("isProcessing", ["True"])[0] == "True":
        raise Exception(
            "Processing/validation is currently happening.  "
            "Please change/add the 'isProcessing' annotation on {} "
            "to False to enable processing".format(center_mapping_id)
        )
    else:
        center_mapping_ent.isProcessing = "True"
        center_mapping_ent = syn.store(center_mapping_ent)
    # remove this query timeout and see what happens
    # syn.table_query_timeout = 50000

    # Create new maf database, should only happen once if its specified
    if create_new_maf_database:
        today = date.today()
        table_name = f"Narrow MAF Database - {today}"
        # filetype = "vcf2maf"
        # syn7208886 is the GENIE staging project to archive maf table
        new_tables = process_functions.create_new_fileformat_table(
            syn, "vcf2maf", table_name, project_id, "syn7208886"
        )
        syn.setPermissions(new_tables["newdb_ent"].id, 3326313, [])
        databaseToSynIdMappingDf = new_tables["newdb_mappingdf"]

    # Honor the `format_registry` argument.  Previously it was always
    # overwritten from a module-level `args`, which raises NameError when
    # this function is imported rather than run as a script.
    registry_source = format_registry
    if registry_source is None:
        # NOTE(review): legacy fallback to the script-level CLI namespace;
        # still fails when no module-level `args` exists.
        registry_source = args.format_registry_packages
    format_registry = config.collect_format_types(registry_source)
    for process_center in centers:
        input_to_database.center_input_to_database(
            syn,
            project_id,
            process_center,
            process,
            only_validate,
            databaseToSynIdMappingDf,
            center_mapping_df,
            delete_old=delete_old,
            oncotree_link=oncotree_link,
            format_registry=format_registry,
            genie_annotation_pkg=genie_annotation_pkg,
        )

    # Re-fetch to ensure that this is the new entity before releasing the
    # lock
    center_mapping_ent = syn.get(center_mapping_id)
    center_mapping_ent.isProcessing = "False"
    center_mapping_ent = syn.store(center_mapping_ent)

    error_tracker_synid = process_functions.getDatabaseSynId(
        syn, "errorTracker", databaseToSynIdMappingDf=databaseToSynIdMappingDf
    )
    # Only write out invalid reasons if the center
    # isnt specified and if only validate
    if center is None and only_validate:
        logger.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
        write_invalid_reasons.write(syn, center_mapping_df, error_tracker_synid)
    logger.info("INPUT TO DATABASE COMPLETE")
Пример #7
0
def main(genie_version,
         processing_date,
         cbioportal_path,
         oncotree_link=None,
         consortium_release_cutoff=184,
         pemfile=None,
         test=False,
         staging=False,
         debug=False,
         skip_mutationsincis=False):
    '''
    Run the database-to-staging (consortium release) workflow:

    - log in and pick the database mapping table for the environment
    - check the oncotree link and the cBioPortal validator path
    - record the processing start (skipped for staging)
    - stage the database contents into release files
    - rebuild and upload the case lists
    - prune files that do not belong to this release
    - revise metadata files and run the cBioPortal validator
    - create the link version and record the processing end
    - update the dashboard tables and render the dashboard (skipped in test)

    Args:
        genie_version: GENIE version; replaced by "TESTING" in test mode
        processing_date: processing date in '%b-%Y' format
        cbioportal_path: Path to cbioportal validator
        oncotree_link: Link to oncotree codes; read from the mapping table
            when None
        consortium_release_cutoff: release cut off value in days
        pemfile: Path to private key file
        test: Test flag, uses test databases
        staging: Staging flag, uses staging databases and forces
            skip_mutationsincis on
        debug:  Synapse debug flag
        skip_mutationsincis: Skip mutation in cis filter
    '''
    syn = process_functions.synLogin(pemfile, debug=debug)
    genie_user = os.environ.get('GENIE_USER')
    genie_pass = (process_functions.get_password(pemfile)
                  if pemfile is not None else None)

    # Select the database/folder mapping table for the environment
    if test:
        genie_version = "TESTING"
        mapping_table_synid = 'syn11600968'
    elif staging:
        mapping_table_synid = 'syn12094210'
        skip_mutationsincis = True
    else:
        mapping_table_synid = 'syn10967259'
    mapping_df = syn.tableQuery(
        'select * from {}'.format(mapping_table_synid)).asDataFrame()

    def lookup_synid(database_name):
        """Return the first 'Id' whose 'Database' equals database_name."""
        matching_ids = mapping_df['Id'][
            mapping_df['Database'] == database_name]
        return matching_ids.values[0]

    if oncotree_link is None:
        oncotree_link = syn.get(lookup_synid('oncotreeLink')).externalURL

    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)

    validator_script = os.path.join(
        cbioportal_path, "core/src/main/scripts/importer/validateData.py")
    assert os.path.exists(validator_script),\
        "Please specify correct cbioportalPath"
    syn.table_query_timeout = 50000

    consortium_synid = lookup_synid('consortium')
    tracker_synid = lookup_synid('processTracker')
    # Syn id of the case list folder in the consortium release
    case_list_folder_synid, _ = database_to_staging.search_and_create_folder(
        syn, consortium_synid, "case_lists")

    if not staging:
        database_to_staging.update_process_trackingdf(
            syn, tracker_synid, 'SAGE', 'dbToStage', start=True)

    # Only release files where release is true
    center_mapping_synid = lookup_synid('centerMapping')
    center_mappingdf = syn.tableQuery(
        'SELECT * FROM {} where release is true'.format(center_mapping_synid)
    ).asDataFrame()
    release_date = datetime.datetime.strptime(processing_date, '%b-%Y')

    logger.info("STAGING TO CONSORTIUM")
    gene_panel_entities = database_to_staging.stagingToCbio(
        syn,
        release_date,
        genie_version,
        center_mappingdf,
        mapping_df,
        oncotree_url=oncotree_link,
        consortiumReleaseCutOff=consortium_release_cutoff,
        current_release_staging=staging,
        skipMutationsInCis=skip_mutationsincis,
        test=test,
        genie_user=genie_user,
        genie_pass=genie_pass)

    # Create case lists files
    logger.info("CREATE CASE LIST FILES")
    case_list_dir = database_to_staging.CASE_LIST_PATH
    release_dir = database_to_staging.GENIE_RELEASE_DIR
    # Start from an empty case list directory
    if not os.path.exists(case_list_dir):
        os.mkdir(case_list_dir)
    for stale_name in os.listdir(case_list_dir):
        os.remove(os.path.join(case_list_dir, stale_name))
    clinical_path = os.path.join(
        release_dir, 'data_clinical_{}.txt'.format(genie_version))
    gene_matrix_path = os.path.join(
        release_dir, "data_gene_matrix_{}.txt".format(genie_version))
    create_case_lists.main(
        clinical_path, gene_matrix_path, case_list_dir, "genie_private")
    case_list_entities = [
        database_to_staging.store_file(
            syn,
            os.path.join(case_list_dir, case_name),
            parent=case_list_folder_synid,
            genieVersion=genie_version)
        for case_name in os.listdir(case_list_dir)
    ]

    logger.info("REMOVING UNNECESSARY FILES")
    for release_file in os.listdir(release_dir):
        # Keep anything tagged with this version, meta files and case lists
        keep = (genie_version in release_file
                or "meta" in release_file
                or "case_lists" in release_file)
        if not keep:
            os.remove(os.path.join(release_dir, release_file))
    os.remove(clinical_path)

    logger.info("REVISE METADATA FILES")
    database_to_staging.revise_metadata_files(
        syn, staging, consortium_synid, genie_version)

    logger.info("CBIO VALIDATION")
    # Must be exit 0 because the validator sometimes fails,
    # but we still want to capture the output
    validator_command = " ".join(
        [validator_script, '-s', release_dir, '-n', '; exit 0'])
    validator_output = subprocess.check_output(
        validator_command, shell=True).decode("utf-8")
    logger.info(validator_output)

    cbio_validator_log = \
        "cbioValidatorLogsConsortium_{}.txt".format(genie_version)
    if not (test or staging):
        # Upload the validator log, then drop the local copy
        log_folder_synid = lookup_synid('logs')
        with open(cbio_validator_log, "w") as log_handle:
            log_handle.write(validator_output)
        syn.store(synapseclient.File(
            cbio_validator_log, parentId=log_folder_synid))
        os.remove(cbio_validator_log)
    logger.info("REMOVING OLD FILES")

    process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
    private_cna_meta_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "genie_private_meta_cna_hg19_seg.txt")
    if os.path.exists(private_cna_meta_path):
        os.unlink(private_cna_meta_path)

    logger.info("CREATING LINK VERSION")
    database_to_staging.create_link_version(
        syn, genie_version, case_list_entities, gene_panel_entities,
        mapping_df)

    if not staging:
        database_to_staging.update_process_trackingdf(
            syn, tracker_synid, 'SAGE', 'dbToStage', start=False)

    logger.info("COMPLETED DATABASE TO STAGING")

    if test:
        # No dashboard update in test mode
        return

    logger.info("DASHBOARD UPDATE")
    dashboard_table_updater.run_dashboard(
        syn, mapping_df, genie_version, staging=staging)

    script_dir = os.path.dirname(os.path.abspath(__file__))
    rscript_command = [
        'Rscript',
        os.path.join(script_dir, 'dashboard_markdown_generator.R'),
        genie_version,
    ]
    # Credentials are only passed along when both are available
    if not (genie_user is None or genie_pass is None):
        rscript_command += [
            '--syn_user', genie_user, '--syn_pass', genie_pass]
    if staging:
        rscript_command.append('--staging')
    subprocess.check_call(rscript_command)
    logger.info("DASHBOARD UPDATE COMPLETE")
Пример #8
0
def main(
    genie_version,
    processing_date,
    cbioportal_path,
    oncotree_link=None,
    consortium_release_cutoff=184,
    pemfile=None,
    test=False,
    staging=False,
    debug=False,
    skip_mutationsincis=False,
):
    """Run the database-to-staging (consortium release) workflow.

    Steps, in order:
    - Does parameter checks
    - Updates process tracking start
    - initiates database to staging
    - create case lists
    - revise meta files
    - run cBioPortal validation
    - create link versions
    - update process tracking end
    - Create dashboard tables and plots (skipped when ``test`` is set)
    - Generate the data guide and store it in the release folder

    Args:
        genie_version: GENIE version. Overridden with "TESTING" when
            ``test`` is set.
        processing_date: processing date, parsed with the "%b-%Y" format
        cbioportal_path: Path to cbioportal validator
        oncotree_link: Link to oncotree codes. When None, it is read from
            the "oncotreeLink" entry of the database mapping table.
        consortium_release_cutoff: release cut off value in days
        pemfile: Path to private key file
        test: Test flag, uses test databases
        staging: Staging flag, uses staging databases. Also forces
            ``skip_mutationsincis`` to True (unless ``test`` is set, which
            takes precedence).
        debug:  Synapse debug flag
        skip_mutationsincis: Skip mutation in cis filter

    Raises:
        AssertionError: If the cBioPortal validator script is not found
            under ``cbioportal_path``.
        ValueError: If ``processing_date`` does not match "%b-%Y".
    """
    syn = process_functions.synLogin(pemfile, debug=debug)
    genie_user = os.environ.get("GENIE_USER")
    if pemfile is not None:
        genie_pass = process_functions.get_password(pemfile)
    else:
        genie_pass = None

    # Pick the database mapping table for the requested mode;
    # ``test`` wins over ``staging`` when both are set.
    if test:
        databaseSynIdMappingId = "syn11600968"
        genie_version = "TESTING"
    elif staging:
        skip_mutationsincis = True
        databaseSynIdMappingId = "syn12094210"
    else:
        databaseSynIdMappingId = "syn10967259"
    # Database/folder syn id mapping
    databaseSynIdMapping = syn.tableQuery(
        "select * from {}".format(databaseSynIdMappingId))
    databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame()

    if oncotree_link is None:
        oncoLink = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "oncotreeLink"].values[0]
        oncoLinkEnt = syn.get(oncoLink)
        oncotree_link = oncoLinkEnt.externalURL

    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)

    cbioValidatorPath = os.path.join(
        cbioportal_path, "core/src/main/scripts/importer/validateData.py")
    assert os.path.exists(
        cbioValidatorPath), "Please specify correct cbioportalPath"
    syn.table_query_timeout = 50000

    consortiumSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "consortium"].values[0]
    processTrackerSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "processTracker"].values[0]
    # get syn id of case list folder in consortium release
    caseListSynId, _ = database_to_staging.search_and_create_folder(
        syn, consortiumSynId, "case_lists")

    # Process tracking is only updated for non-staging runs.
    if not staging:
        database_to_staging.update_process_trackingdf(syn,
                                                      processTrackerSynId,
                                                      "SAGE",
                                                      "dbToStage",
                                                      start=True)

    centerMappingSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "centerMapping"].values[0]
    # Only release files where release is true
    center_mapping = syn.tableQuery(
        "SELECT * FROM {} where release is true".format(centerMappingSynId))
    center_mappingdf = center_mapping.asDataFrame()
    processingDate = datetime.datetime.strptime(processing_date, "%b-%Y")

    logger.info("STAGING TO CONSORTIUM")
    genePanelEntities = database_to_staging.stagingToCbio(
        syn,
        processingDate,
        genie_version,
        center_mappingdf,
        databaseSynIdMappingDf,
        oncotree_url=oncotree_link,
        consortiumReleaseCutOff=consortium_release_cutoff,
        current_release_staging=staging,
        skipMutationsInCis=skip_mutationsincis,
        test=test,
        genie_user=genie_user,
        genie_pass=genie_pass,
    )

    # Create case lists files
    logger.info("CREATE CASE LIST FILES")
    # Remove old caselists first
    if not os.path.exists(database_to_staging.CASE_LIST_PATH):
        os.mkdir(database_to_staging.CASE_LIST_PATH)
    for caselist in os.listdir(database_to_staging.CASE_LIST_PATH):
        os.remove(os.path.join(database_to_staging.CASE_LIST_PATH, caselist))
    clinical_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "data_clinical_{}.txt".format(genie_version),
    )
    assay_information_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "assay_information_{}.txt".format(genie_version),
    )
    create_case_lists.main(
        clinical_path,
        assay_information_path,
        database_to_staging.CASE_LIST_PATH,
        "genie_private",
    )
    # Store every generated case list file in the case list folder.
    caseListEntities = []
    for case_file in os.listdir(database_to_staging.CASE_LIST_PATH):
        case_path = os.path.join(database_to_staging.CASE_LIST_PATH,
                                 case_file)
        caseListEntities.append(
            database_to_staging.store_file(syn,
                                           case_path,
                                           parent=caseListSynId,
                                           genieVersion=genie_version))

    logger.info("REMOVING UNNECESSARY FILES")
    # Keep only files for this version, meta files and case lists.
    for genie_file in os.listdir(database_to_staging.GENIE_RELEASE_DIR):
        if (genie_version not in genie_file and "meta" not in genie_file
                and "case_lists" not in genie_file):
            os.remove(
                os.path.join(database_to_staging.GENIE_RELEASE_DIR,
                             genie_file))
    # The merged clinical file carries the version in its name, so the
    # loop above keeps it; it is removed explicitly here.
    os.remove(clinical_path)

    logger.info("REVISE METADATA FILES")
    database_to_staging.revise_metadata_files(syn, staging, consortiumSynId,
                                              genie_version)

    logger.info("CBIO VALIDATION")

    # Must be exit 0 because the validator sometimes fails,
    # but we still want to capture the output
    # NOTE(review): the command is joined into a single shell string, so a
    # validator path or release directory containing spaces would be split
    # by the shell -- confirm these paths never contain whitespace.
    command = [
        cbioValidatorPath,
        "-s",
        database_to_staging.GENIE_RELEASE_DIR,
        "-n",
        "; exit 0",
    ]
    cbioOutput = subprocess.check_output(" ".join(command), shell=True)
    # Decode once and reuse for both the log and the uploaded log file.
    cbio_output_text = cbioOutput.decode("utf-8")
    logger.info(cbio_output_text)

    cbio_validator_log = f"cbioValidatorLogsConsortium_{genie_version}.txt"
    # Validator logs are only uploaded for real (non-test, non-staging) runs.
    if not test and not staging:
        log_folder_synid = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "logs"].values[0]
        with open(cbio_validator_log, "w") as cbio_log:
            cbio_log.write(cbio_output_text)
        syn.store(
            synapseclient.File(cbio_validator_log, parentId=log_folder_synid))
        os.remove(cbio_validator_log)
    logger.info("REMOVING OLD FILES")

    process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
    private_cna_meta_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "genie_private_meta_cna_hg19_seg.txt")
    if os.path.exists(private_cna_meta_path):
        os.unlink(private_cna_meta_path)

    logger.info("CREATING LINK VERSION")
    # Returns release and case list folder
    folders = database_to_staging.create_link_version(syn, genie_version,
                                                      caseListEntities,
                                                      genePanelEntities,
                                                      databaseSynIdMappingDf)

    if not staging:
        database_to_staging.update_process_trackingdf(syn,
                                                      processTrackerSynId,
                                                      "SAGE",
                                                      "dbToStage",
                                                      start=False)

    if not test:
        logger.info("DASHBOARD UPDATE")
        dashboard_table_updater.run_dashboard(syn,
                                              databaseSynIdMappingDf,
                                              genie_version,
                                              staging=staging)
        generate_dashboard_html(genie_version,
                                staging=staging,
                                genie_user=genie_user,
                                genie_pass=genie_pass)
        logger.info("DASHBOARD UPDATE COMPLETE")

    # The data guide is generated on every run (test runs included), so the
    # step is announced here rather than inside the dashboard-only branch.
    logger.info("AUTO GENERATE DATA GUIDE")
    # The oncotree version is taken as the text after the first "=" in the
    # link; a link without "=" raises IndexError here.
    oncotree_version = oncotree_link.split("=")[1]
    data_guide_pdf = generate_data_guide(
        genie_version,
        oncotree_version=oncotree_version,
        database_mapping=databaseSynIdMappingId,
        genie_user=genie_user,
        genie_pass=genie_pass,
    )
    database_to_staging.store_file(
        syn,
        data_guide_pdf,
        genieVersion=genie_version,
        parent=folders["release_folder"],
    )
    logger.info("COMPLETED DATABASE TO STAGING")
Пример #9
0
def main(process, project_config=None, center=None, pemfile=None,
         delete_old=False, only_validate=False, oncotree_link=None,
         create_new_maf_database=False, testing=False, debug=False,
         reference=None, vcf2maf_path=None, vep_path=None,
         vep_data=None, thread=1, format_registry=config.PROCESS_FILES):
    """Validate and/or process center input files into the database.

    Logs in to Synapse, resolves the centers to work on, takes the
    processing lock on the center mapping table, runs
    ``input_to_database.center_input_to_database`` for each center and
    finally releases the lock again.

    Args:
        process: File type to process. 'vcf', 'maf' and 'mafSP' require
            the vcf2maf / VEP paths unless ``only_validate`` is set.
        project_config: Mapping read with ``.get`` for the
            'database_to_synid_mapping' table id. Must be provided in
            practice; the ``None`` default fails on that lookup.
        center: Single center to process. When None, every center with a
            non-null ``inputSynId`` and ``release`` set is processed.
        pemfile: Path to private key file, passed to the Synapse login.
        delete_old: Passed through to ``center_input_to_database``.
        only_validate: Only validate; also enables writing invalid
            reasons when no single ``center`` was requested.
        oncotree_link: Link to oncotree codes. When None, it is read from
            the "oncotreeLink" entry of the database mapping table.
        create_new_maf_database: Create and archive the maf database
            before processing.
        testing: Passed through to ``center_input_to_database``.
        debug: Synapse debug flag.
        reference: Passed through to ``center_input_to_database``.
        vcf2maf_path: Path to vcf2maf.
        vep_path: Path to VEP.
        vep_data: Path to VEP data.
        thread: Passed through to ``center_input_to_database``.
        format_registry: File format registry handed to
            ``center_input_to_database``.

    Raises:
        AssertionError: If a required tool path is missing or does not
            exist, or if ``center`` is not a known center.
        SystemExit: If processing/validation is already in progress.
    """
    syn = process_functions.synLogin(pemfile, debug=debug)

    # Only release the processing lock if this run is the one that took
    # it.  Otherwise an early failure (or the "already processing" exit
    # below) would clear a lock that belongs to another run.
    lock_acquired = False
    center_mapping_id = None

    try:
        # Must specify correct paths to vcf2maf, VEP and VEP data
        # if trying to process vcf, maf and mafSP
        if process in ['vcf', 'maf', 'mafSP'] and not only_validate:
            # os.path.exists(None) raises TypeError, so check for None
            # first to make sure the helpful message is what surfaces.
            assert (vcf2maf_path is not None
                    and os.path.exists(vcf2maf_path)), (
                "Path to vcf2maf (--vcf2mafPath) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")
            assert vep_path is not None and os.path.exists(vep_path), (
                "Path to VEP (--vepPath) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")
            assert vep_data is not None and os.path.exists(vep_data), (
                "Path to VEP data (--vepData) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")

        databaseToSynIdMapping = syn.tableQuery(
            'SELECT * FROM {}'.format(
                project_config.get('database_to_synid_mapping')))
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

        center_mapping_id = process_functions.getDatabaseSynId(
            syn, "centerMapping",
            databaseToSynIdMappingDf=databaseToSynIdMappingDf)

        center_mapping = syn.tableQuery('SELECT * FROM %s' % center_mapping_id)
        center_mapping_df = center_mapping.asDataFrame()

        if center is not None:
            assert center in center_mapping_df.center.tolist(), (
                "Must specify one of these centers: {}".format(
                    ", ".join(center_mapping_df.center)))
            centers = [center]
        else:
            center_mapping_df = \
                center_mapping_df[~center_mapping_df['inputSynId'].isnull()]
            # release is a bool column
            center_mapping_df = center_mapping_df[center_mapping_df['release']]
            centers = center_mapping_df.center

        if oncotree_link is None:
            onco_link = databaseToSynIdMappingDf['Id'][
                databaseToSynIdMappingDf['Database'] == 'oncotreeLink'
            ].values[0]
            onco_link_ent = syn.get(onco_link)
            oncotree_link = onco_link_ent.externalURL
        # Check if you can connect to oncotree link,
        # if not then don't run validation / processing
        process_functions.checkUrl(oncotree_link)

        currently_processing = get_processing_status(syn, center_mapping_id)

        if currently_processing:
            logger.error(
                "Processing/validation is currently happening.  "
                "Please change/add the 'isProcessing' annotation on {} "
                "to False to enable processing".format(center_mapping_id))
            sys.exit(1)

        set_processing_status(syn, center_mapping_id, status=True)
        lock_acquired = True
        # remove this query timeout and see what happens
        # syn.table_query_timeout = 50000

        # Create new maf database, should only happen once if its specified
        if create_new_maf_database:
            databaseToSynIdMappingDf = \
                input_to_database.create_and_archive_maf_database(
                    syn, databaseToSynIdMappingDf)

        # Use the registry passed by the caller.  The previous code
        # rebuilt it from ``args``, which is not defined in this function,
        # so a custom registry must now be passed in explicitly.
        logger.debug("Using {format_registry} file formats.".format(
            format_registry=format_registry))

        # Use a distinct loop variable: rebinding ``center`` here would
        # make the ``center is None`` check below always False.
        for center_name in centers:
            input_to_database.center_input_to_database(
                syn, center_name, process,
                testing, only_validate,
                vcf2maf_path, vep_path,
                vep_data, databaseToSynIdMappingDf,
                center_mapping_df, reference=reference,
                delete_old=delete_old,
                oncotree_link=oncotree_link,
                thread=thread, format_registry=format_registry)

        # To ensure that this is the new entity
        center_mapping_ent = syn.get(center_mapping_id)
        center_mapping_ent.isProcessing = "False"
        center_mapping_ent = syn.store(center_mapping_ent)

        error_tracker_synid = process_functions.getDatabaseSynId(
            syn, "errorTracker",
            databaseToSynIdMappingDf=databaseToSynIdMappingDf)
        # Only write out invalid reasons if the center
        # isnt specified and if only validate
        if center is None and only_validate:
            logger.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
            write_invalid_reasons.write_invalid_reasons(
                syn, center_mapping_df, error_tracker_synid)
    finally:
        if lock_acquired:
            set_processing_status(syn, center_mapping_id, status=False)