Example #1 (score: 0)
File: maf.py — Project: sgosline/Genie
    def process_helper(self, filePath, path_to_GENIE, mafSynId, centerMafSynId,
                       vcf2mafPath, veppath, vepdata, reference=None):
        """Re-annotate a center MAF with maf2maf/VEP and store the results.

        Runs vcf2maf's ``maf2maf.pl`` on ``filePath`` to produce a re-annotated
        MAF and a "narrow" MAF restricted to the columns of the Synapse table
        ``mafSynId``, then stores both via ``storeProcessedMaf``.

        Args:
            filePath: Path to the input MAF file.
            path_to_GENIE: Working directory root; outputs are written under
                ``<path_to_GENIE>/<center>/staging``.
            mafSynId: Synapse table id whose columns define the narrow MAF.
            centerMafSynId: Synapse id used when storing the flat MAF file.
            vcf2mafPath: Directory containing ``maf2maf.pl``.
            veppath: Path passed to ``--vep-path``.
            vepdata: Path passed to ``--vep-data``.
            reference: Optional reference FASTA passed as ``--ref-fasta``.

        Returns:
            The input ``filePath``, or the string ``"NOTPROCESSED"`` when
            maf2maf produced no output file.

        Raises:
            subprocess.CalledProcessError: If dos2unix or maf2maf exits
                non-zero.
        """
        logger.info('MAF2MAF %s' % filePath)
        fileName = "data_mutations_extended_%s_MAF.txt" % self.center
        newMafPath = os.path.join(path_to_GENIE, self.center,
                                  "staging", fileName)
        narrowMafPath = os.path.join(
            path_to_GENIE, self.center, "staging",
            "data_mutations_extended_%s_MAF_narrow.txt" % self.center)
        # Columns of the narrow MAF table; 'inBED' is excluded because it is
        # computed downstream, not by maf2maf.
        narrowMafColumns = [col['name']
                            for col in self.syn.getTableColumns(mafSynId)
                            if col['name'] != 'inBED']
        # Strip out Windows line endings (\r) before handing to maf2maf.
        subprocess.check_call(['dos2unix', filePath])
        tempdir = os.path.join(path_to_GENIE, self.center)
        commandCall = ["perl", os.path.join(vcf2mafPath, "maf2maf.pl"),
                       "--input-maf", filePath,
                       "--output-maf", newMafPath,
                       "--vep-fork", '8',
                       "--tmp-dir", tempdir,
                       '--vep-path', veppath,
                       '--vep-data', vepdata,
                       "--custom-enst",
                       os.path.join(vcf2mafPath,
                                    "data/isoform_overrides_uniprot")]
        if reference is not None:
            commandCall.extend(["--ref-fasta", reference])
        # check_call raises on non-zero exit; its return value (always 0) was
        # previously bound to an unused variable, so drop the assignment.
        subprocess.check_call(commandCall)

        process_functions.rmFiles(tempdir, recursive=False)
        # Truncate/create the narrow MAF so downstream steps always see a file
        # even when maf2maf produced no output.
        open(narrowMafPath, "w").close()
        if os.path.exists(newMafPath):
            # This needs to switch to streaming at some point
            mafDf = pd.read_csv(newMafPath, sep="\t", comment="#")
            mafDf = self.formatMAF(mafDf)
            self.createFinalMaf(mafDf, newMafPath, maf=True)
            narrowMafDf = mafDf[narrowMafColumns]
            self.createFinalMaf(narrowMafDf, narrowMafPath, maf=True)
            # The two store calls must stay adjacent: no modifications may
            # happen between writing the narrow MAF and uploading it.
            # Store narrow MAF into db
            if self._fileType == "maf":
                self.storeProcessedMaf(narrowMafPath, mafSynId,
                                       centerMafSynId, isNarrow=True)
            # Store MAF flat file into Synapse
            self.storeProcessedMaf(newMafPath, mafSynId, centerMafSynId)
        else:
            logger.error('ERROR PROCESSING %s' % filePath)
            filePath = "NOTPROCESSED"
        return filePath
Example #2 (score: 0)
def input_to_database(syn,
                      center,
                      process,
                      testing,
                      only_validate,
                      vcf2maf_path,
                      vep_path,
                      vep_data,
                      database_to_synid_mappingdf,
                      center_mapping_df,
                      reference=None,
                      delete_old=False,
                      oncotree_link=None,
                      thread=1):
    """Validate one center's input files and, unless only validating,
    process the valid ones into staging.

    Sets up a per-center log file, validates the center's files, and then
    (when not ``only_validate``) processes them -- BED files first -- while
    recording start/end timestamps in the processTracker table. The log file
    is uploaded to Synapse and removed locally at the end.

    Args:
        syn: Logged-in Synapse client.
        center: Center name; used in paths, queries and file-name matching.
        process: Processing type (e.g. 'mafSP'); selects which MAF set to use.
        testing: If True, announce testing mode in the log.
        only_validate: If True, skip processing and only validate.
        vcf2maf_path: Path to the vcf2maf installation, forwarded to
            processFiles.
        vep_path: Path to VEP, forwarded to processFiles.
        vep_data: Path to the VEP cache, forwarded to processFiles.
        database_to_synid_mappingdf: Database name -> Synapse id mapping
            DataFrame.
        center_mapping_df: Center mapping table as a DataFrame.
        reference: Optional reference FASTA path, forwarded to processFiles.
        delete_old: If True, remove the center's existing working files first.
        oncotree_link: Optional oncotree URL used during validation.
        thread: Thread count forwarded to validation/processing.
    """
    if only_validate:
        log_path = os.path.join(process_functions.SCRIPT_DIR,
                                "%s_validation_log.txt" % center)
    else:
        log_path = os.path.join(process_functions.SCRIPT_DIR,
                                "%s_%s_log.txt" % (center, process))

    # NOTE(review): a new FileHandler is added on every call and never
    # removed; repeated calls in one process accumulate handlers -- verify.
    logFormatter = logging.Formatter(
        "%(asctime)s [%(name)s][%(levelname)s] %(message)s")
    fileHandler = logging.FileHandler(log_path, mode='w')
    fileHandler.setFormatter(logFormatter)
    logger.addHandler(fileHandler)

    if testing:
        logger.info("###########################################")
        logger.info("############NOW IN TESTING MODE############")
        logger.info("###########################################")

    # ----------------------------------------
    # Start input to staging process
    # ----------------------------------------

    #path_to_genie = os.path.realpath(os.path.join(process_functions.SCRIPT_DIR,"../"))
    #Make the synapsecache dir the genie input folder for now
    #The main reason for this is because the .synaspecache dir is mounted by batch
    path_to_genie = os.path.expanduser("~/.synapseCache")
    #Create input and staging folders
    if not os.path.exists(os.path.join(path_to_genie, center, "input")):
        os.makedirs(os.path.join(path_to_genie, center, "input"))
    if not os.path.exists(os.path.join(path_to_genie, center, "staging")):
        os.makedirs(os.path.join(path_to_genie, center, "staging"))

    if delete_old:
        process_functions.rmFiles(os.path.join(path_to_genie, center))

    validFiles = validation(syn, center, process, center_mapping_df,
                            database_to_synid_mappingdf, thread, testing,
                            oncotree_link)

    if len(validFiles) > 0 and not only_validate:
        # Reorganize so BED files are always validated and processed first:
        # prepend the BED rows, then drop the resulting duplicates.
        validBED = [
            os.path.basename(i).endswith('.bed') for i in validFiles['path']
        ]
        beds = validFiles[validBED]
        # NOTE(review): DataFrame.append is deprecated/removed in newer
        # pandas (use pd.concat) -- confirm the pinned pandas version.
        validFiles = beds.append(validFiles)
        validFiles.drop_duplicates(inplace=True)
        #Valid maf, mafsp, vcf and cbs files
        validMAF = [
            i for i in validFiles['path']
            if os.path.basename(i) == "data_mutations_extended_%s.txt" % center
        ]
        validMAFSP = [
            i for i in validFiles['path'] if os.path.basename(i) ==
            "nonGENIE_data_mutations_extended_%s.txt" % center
        ]
        validVCF = [
            i for i in validFiles['path']
            if os.path.basename(i).endswith('.vcf')
        ]
        #validCBS = [i for i in validFiles['path'] if os.path.basename(i).endswith('.cbs')]
        # 'mafSP' runs use the sponsored-project (nonGENIE) MAFs instead.
        if process == 'mafSP':
            validMAFs = validMAFSP
        else:
            validMAFs = validMAF

        processTrackerSynId = process_functions.getDatabaseSynId(
            syn,
            "processTracker",
            databaseToSynIdMappingDf=database_to_synid_mappingdf)
        #Add process tracker for time start
        processTracker = syn.tableQuery(
            "SELECT timeStartProcessing FROM %s where center = '%s' and processingType = '%s'"
            % (processTrackerSynId, center, process))
        processTrackerDf = processTracker.asDataFrame()
        if len(processTrackerDf) == 0:
            # No tracker row yet: insert one with start == end == now (ms).
            new_rows = [[
                center,
                str(int(time.time() * 1000)),
                str(int(time.time() * 1000)), process
            ]]
            table = syn.store(
                synapseclient.Table(processTrackerSynId, new_rows))
        else:
            # NOTE(review): chained assignment (df[col][0] = ...) may write to
            # a copy and emit SettingWithCopyWarning on newer pandas -- verify.
            processTrackerDf['timeStartProcessing'][0] = str(
                int(time.time() * 1000))
            syn.store(
                synapseclient.Table(processTrackerSynId, processTrackerDf))

        processFiles(syn,
                     validFiles,
                     center,
                     path_to_genie,
                     thread,
                     center_mapping_df,
                     oncotree_link,
                     database_to_synid_mappingdf,
                     validVCF=validVCF,
                     validMAFs=validMAFs,
                     vcf2mafPath=vcf2maf_path,
                     veppath=vep_path,
                     vepdata=vep_data,
                     test=testing,
                     processing=process,
                     reference=reference)

        #Should add in this process end tracking before the deletion of samples
        processTracker = syn.tableQuery(
            "SELECT timeEndProcessing FROM %s where center = '%s' and processingType = '%s'"
            % (processTrackerSynId, center, process))
        processTrackerDf = processTracker.asDataFrame()
        processTrackerDf['timeEndProcessing'][0] = str(int(time.time() * 1000))
        syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

        logger.info("SAMPLE/PATIENT RETRACTION")
        toRetract.retract(syn, testing)

    else:
        messageOut = "%s does not have any valid files" if not only_validate else "ONLY VALIDATION OCCURED FOR %s"
        logger.info(messageOut % center)

    #Store log file
    # NOTE(review): parentId is hard-coded to syn10155804; the file handler is
    # still open when log_path is removed below -- confirm this is intended.
    syn.store(synapseclient.File(log_path, parentId="syn10155804"))
    os.remove(log_path)
    logger.info("ALL PROCESSES COMPLETE")
def main(args):
    """Run the GENIE consortium-to-public release pipeline.

    - Checks the cBioPortal validator path and the --test/--staging flags
    - Parses the processing date (abbreviated month-year, e.g. Oct-2017)
    - Updates processTracker start time (production runs only)
    - Runs consortiumToPublic, revises metadata, runs cBioPortal validation
    - Creates link versions, updates processTracker end time
    - Updates the dashboard (non-test runs) and stores the data guide PDF

    Args:
        args: Parsed CLI namespace; reads cbioportalPath, test, staging,
            processingDate, pemFile, debug, genieVersion and
            publicReleaseCutOff, and writes genieVersion/releaseId back.
    """
    cbioValidatorPath = os.path.join(
        args.cbioportalPath, "core/src/main/scripts/importer/validateData.py"
    )
    assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath"
    assert not (
        args.test and args.staging
    ), "You can only specify --test or --staging, not both"
    try:
        processingDate = datetime.datetime.strptime(args.processingDate, "%b-%Y")
    except ValueError:
        raise ValueError(
            "Process date must be in the format " "abbreviated_month-YEAR ie. Oct-2017"
        )

    syn = process_functions.synLogin(args.pemFile, debug=args.debug)
    genie_user = os.environ.get("GENIE_USER")
    if args.pemFile is not None:
        genie_pass = process_functions.get_password(args.pemFile)
    else:
        genie_pass = None

    # Get all the possible public releases
    # Get configuration: test/staging/production each has its own mapping table
    if args.test:
        databaseSynIdMappingId = "syn11600968"
        args.genieVersion = "TESTpublic"
    elif args.staging:
        databaseSynIdMappingId = "syn12094210"
    else:
        databaseSynIdMappingId = "syn10967259"
    databaseSynIdMapping = syn.tableQuery("select * from %s" % databaseSynIdMappingId)
    databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame()
    public_synid = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "public"
    ].values[0]

    releaseSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "release"
    ].values[0]

    officialPublic = consortium_to_public.get_public_to_consortium_synid_mapping(
        syn, releaseSynId, test=args.test
    )

    assert (
        args.genieVersion in officialPublic.keys()
    ), "genieVersion must be one of these: {}.".format(", ".join(officialPublic.keys()))

    args.releaseId = officialPublic[args.genieVersion]
    # Record processing start time; skipped for test and staging runs.
    if not args.test and not args.staging:
        processTrackerSynId = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "processTracker"
        ].values[0]
        processTracker = syn.tableQuery(
            "SELECT timeStartProcessing FROM %s where center = 'SAGE' "
            "and processingType = 'public'" % processTrackerSynId
        )
        processTrackerDf = processTracker.asDataFrame()
        # NOTE(review): chained assignment via .iloc on a column may write to
        # a copy on newer pandas -- verify against the pinned version.
        processTrackerDf["timeStartProcessing"].iloc[0] = str(int(time.time() * 1000))
        syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

    caseListEntities, genePanelEntities = consortium_to_public.consortiumToPublic(
        syn,
        processingDate,
        args.genieVersion,
        args.releaseId,
        databaseSynIdMappingDf,
        publicReleaseCutOff=args.publicReleaseCutOff,
    )

    database_to_staging.revise_metadata_files(
        syn, args.staging, public_synid, args.genieVersion
    )

    logger.info("CBIO VALIDATION")
    # Must be exit 0 because the validator sometimes fails,
    # but we still want to capture the output
    # (hence the joined shell string with "; exit 0" and shell=True).
    command = [
        cbioValidatorPath,
        "-s",
        database_to_staging.GENIE_RELEASE_DIR,
        "-n",
        "; exit 0",
    ]
    cbio_output = subprocess.check_output(" ".join(command), shell=True)
    cbio_decoded_output = cbio_output.decode("utf-8")
    logger.info(cbio_decoded_output)
    # Upload the validator log for production runs only.
    if not args.test and not args.staging:
        log_folder_synid = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "logs"
        ].values[0]
        # Use tempfiles
        cbio_log_file = "cbioValidatorLogsPublic_{}.txt".format(args.genieVersion)
        with open(cbio_log_file, "w") as cbioLog:
            cbioLog.write(cbio_decoded_output)
        syn.store(synapseclient.File(cbio_log_file, parentId=log_folder_synid))
        os.remove(cbio_log_file)
    logger.info("REMOVING OLD FILES")
    process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
    seg_meta_file = "{}/genie_public_meta_cna_hg19_seg.txt".format(
        database_to_staging.GENIE_RELEASE_DIR
    )
    if os.path.exists(seg_meta_file):
        os.unlink(seg_meta_file)

    logger.info("CREATING LINK VERSION")
    folders = database_to_staging.create_link_version(
        syn,
        args.genieVersion,
        caseListEntities,
        genePanelEntities,
        databaseSynIdMappingDf,
        release_type="public",
    )
    # Don't update the process tracker when testing or staging
    if not args.test and not args.staging:
        processTracker = syn.tableQuery(
            "SELECT timeEndProcessing FROM %s where center = 'SAGE' and "
            "processingType = 'public'" % processTrackerSynId
        )
        processTrackerDf = processTracker.asDataFrame()
        processTrackerDf["timeEndProcessing"].iloc[0] = str(int(time.time() * 1000))
        syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))

    if not args.test:
        logger.info("DASHBOARD UPDATE")
        dashboard_table_updater.run_dashboard(
            syn, databaseSynIdMappingDf, args.genieVersion, staging=args.staging
        )
        generate_dashboard_html(
            args.genieVersion,
            staging=args.staging,
            genie_user=genie_user,
            genie_pass=genie_pass,
        )
        logger.info("DASHBOARD UPDATE COMPLETE")
        logger.info("AUTO GENERATE DATA GUIDE")

    # Resolve the oncotree version from the stored oncotree link entity.
    onco_link = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "oncotreeLink"
    ].values[0]
    onco_link_ent = syn.get(onco_link)
    oncotree_link = onco_link_ent.externalURL
    # assumes the URL has exactly one '=' with the version after it -- verify
    oncotree_version = oncotree_link.split("=")[1]

    data_guide_pdf = generate_data_guide(
        args.genieVersion,
        oncotree_version=oncotree_version,
        database_mapping=databaseSynIdMappingId,
        genie_user=genie_user,
        genie_pass=genie_pass,
    )
    data_guide_ent = synapseclient.File(
        data_guide_pdf, parent=folders["release_folder"]
    )
    syn.store(data_guide_ent)
    logger.info("COMPLETED CONSORTIUM TO PUBLIC")
Example #4 (score: 0)
def main(genie_version,
         processing_date,
         cbioportal_path,
         oncotree_link=None,
         consortium_release_cutoff=184,
         pemfile=None,
         test=False,
         staging=False,
         debug=False,
         skip_mutationsincis=False):
    '''
    - Does parameter checks
    - Updates process tracking start
    - initiates database to staging
    - create case lists
    - revise meta files
    - run cBioPortal validation
    - create link versions
    - update process tracking end
    - Create dashboard tables and plots

    Args:
        genie_version: GENIE version,
        processing_date: processing date
        cbioportal_path: Path to cbioportal validator
        oncotree_link: Link to oncotree codes
        consortium_release_cutoff: release cut off value in days
        pemfile: Path to private key file
        test: Test flag, uses test databases
        staging: Staging flag, uses staging databases
        debug:  Synapse debug flag
        skip_mutationsincis: Skip mutation in cis filter
    '''
    syn = process_functions.synLogin(pemfile, debug=debug)
    genie_user = os.environ.get('GENIE_USER')
    if pemfile is not None:
        genie_pass = process_functions.get_password(pemfile)
    else:
        genie_pass = None

    # Test/staging/production runs each use their own syn-id mapping table;
    # staging additionally forces the mutations-in-cis filter off.
    if test:
        databaseSynIdMappingId = 'syn11600968'
        genie_version = "TESTING"
    elif staging:
        skip_mutationsincis = True
        databaseSynIdMappingId = 'syn12094210'
    else:
        databaseSynIdMappingId = 'syn10967259'
    # Database/folder syn id mapping
    databaseSynIdMapping = syn.tableQuery(
        'select * from {}'.format(databaseSynIdMappingId))
    databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame()
    # databaseSynIdMappingDf.index = databaseSynIdMappingDf.Database
    # del databaseSynIdMappingDf['Database']
    # databaseSynIdMappingDf.to_dict()

    # Fall back to the oncotree URL stored in Synapse when none was given.
    if oncotree_link is None:
        oncoLink = databaseSynIdMappingDf['Id'][
            databaseSynIdMappingDf['Database'] == 'oncotreeLink'].values[0]
        oncoLinkEnt = syn.get(oncoLink)
        oncotree_link = oncoLinkEnt.externalURL

    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)

    cbioValidatorPath = os.path.join(
        cbioportal_path, "core/src/main/scripts/importer/validateData.py")
    assert os.path.exists(cbioValidatorPath),\
        "Please specify correct cbioportalPath"
    # Long timeout: release queries against large tables can be slow.
    syn.table_query_timeout = 50000

    consortiumSynId = databaseSynIdMappingDf['Id'][
        databaseSynIdMappingDf['Database'] == 'consortium'].values[0]
    processTrackerSynId = databaseSynIdMappingDf['Id'][
        databaseSynIdMappingDf['Database'] == 'processTracker'].values[0]
    # get syn id of case list folder in consortium release
    # caseListSynId = findCaseListId(syn, consortiumSynId)
    caseListSynId, _ = database_to_staging.search_and_create_folder(
            syn, consortiumSynId, "case_lists")

    if not staging:
        database_to_staging.update_process_trackingdf(syn,
                                                      processTrackerSynId,
                                                      'SAGE', 'dbToStage',
                                                      start=True)

    centerMappingSynId = databaseSynIdMappingDf['Id'][
        databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0]
    # Only release files where release is true
    center_mapping = syn.tableQuery(
        'SELECT * FROM {} where release is true'.format(centerMappingSynId))
    center_mappingdf = center_mapping.asDataFrame()
    processingDate = datetime.datetime.strptime(processing_date, '%b-%Y')

    logger.info("STAGING TO CONSORTIUM")
    genePanelEntities = database_to_staging.stagingToCbio(
        syn,
        processingDate,
        genie_version,
        center_mappingdf,
        databaseSynIdMappingDf,
        oncotree_url=oncotree_link,
        consortiumReleaseCutOff=consortium_release_cutoff,
        current_release_staging=staging,
        skipMutationsInCis=skip_mutationsincis,
        test=test,
        genie_user=genie_user,
        genie_pass=genie_pass)

    # Create case lists files
    logger.info("CREATE CASE LIST FILES")
    # Remove old caselists first
    if not os.path.exists(database_to_staging.CASE_LIST_PATH):
        os.mkdir(database_to_staging.CASE_LIST_PATH)
    caselists = os.listdir(database_to_staging.CASE_LIST_PATH)
    for caselist in caselists:
        os.remove(os.path.join(database_to_staging.CASE_LIST_PATH, caselist))
    clinical_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        'data_clinical_{}.txt'.format(genie_version))
    gene_matrix_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "data_gene_matrix_{}.txt".format(genie_version))
    create_case_lists.main(
        clinical_path,
        gene_matrix_path,
        database_to_staging.CASE_LIST_PATH,
        "genie_private")
    # Upload every generated case list file to the consortium folder.
    caseListFiles = os.listdir(database_to_staging.CASE_LIST_PATH)
    caseListEntities = []
    for casePath in caseListFiles:
        casePath = os.path.join(database_to_staging.CASE_LIST_PATH, casePath)
        caseListEntities.append(database_to_staging.store_file(
            syn,
            casePath,
            parent=caseListSynId,
            genieVersion=genie_version))

    logger.info("REMOVING UNNECESSARY FILES")
    # Keep only files tagged with this version, meta files and case lists.
    genie_files = os.listdir(database_to_staging.GENIE_RELEASE_DIR)
    for genie_file in genie_files:
        if genie_version not in genie_file and \
             "meta" not in genie_file and "case_lists" not in genie_file:
            os.remove(os.path.join(database_to_staging.GENIE_RELEASE_DIR,
                                   genie_file))
    os.remove(clinical_path)

    logger.info("REVISE METADATA FILES")
    database_to_staging.revise_metadata_files(syn, staging,
                                              consortiumSynId,
                                              genie_version)

    logger.info("CBIO VALIDATION")
    '''
    Must be exit 0 because the validator sometimes fails,
    but we still want to capture the output
    '''
    # shell=True with a joined string is what makes the "; exit 0" suffix
    # mask validator failures; the command parts are trusted local paths.
    command = [cbioValidatorPath, '-s', database_to_staging.GENIE_RELEASE_DIR,
               '-n', '; exit 0']
    cbioOutput = subprocess.check_output(" ".join(command), shell=True)
    logger.info(cbioOutput.decode("utf-8"))

    # Persist the validator log to Synapse for production runs only.
    cbio_validator_log = \
        "cbioValidatorLogsConsortium_{}.txt".format(genie_version)
    if not test and not staging:
        log_folder_synid = databaseSynIdMappingDf['Id'][
            databaseSynIdMappingDf['Database'] == 'logs'].values[0]
        with open(cbio_validator_log, "w") as cbioLog:
            cbioLog.write(cbioOutput.decode("utf-8"))
        syn.store(synapseclient.File(
            cbio_validator_log, parentId=log_folder_synid))
        os.remove(cbio_validator_log)
    logger.info("REMOVING OLD FILES")

    process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
    private_cna_meta_path = os.path.join(database_to_staging.GENIE_RELEASE_DIR,
                                         "genie_private_meta_cna_hg19_seg.txt")
    if os.path.exists(private_cna_meta_path):
        os.unlink(private_cna_meta_path)

    logger.info("CREATING LINK VERSION")
    database_to_staging.create_link_version(syn, genie_version,
                                            caseListEntities,
                                            genePanelEntities,
                                            databaseSynIdMappingDf)

    if not staging:
        database_to_staging.update_process_trackingdf(
            syn, processTrackerSynId, 'SAGE', 'dbToStage', start=False)

    logger.info("COMPLETED DATABASE TO STAGING")

    if not test:
        logger.info("DASHBOARD UPDATE")
        dashboard_table_updater.run_dashboard(
            syn,
            databaseSynIdMappingDf,
            genie_version,
            staging=staging)
        # Render the dashboard HTML via an external R markdown script.
        dashboard_markdown_html_commands = [
            'Rscript',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'dashboard_markdown_generator.R'),
            genie_version]

        if genie_user is not None and genie_pass is not None:
            dashboard_markdown_html_commands.extend(
                ['--syn_user', genie_user, '--syn_pass', genie_pass])
        if staging:
            dashboard_markdown_html_commands.append('--staging')
        subprocess.check_call(dashboard_markdown_html_commands)
        logger.info("DASHBOARD UPDATE COMPLETE")
Example #5 (score: 0)
def main(
    genie_version,
    processing_date,
    cbioportal_path,
    oncotree_link=None,
    consortium_release_cutoff=184,
    pemfile=None,
    test=False,
    staging=False,
    debug=False,
    skip_mutationsincis=False,
):
    """
    - Does parameter checks
    - Updates process tracking start
    - initiates database to staging
    - create case lists
    - revise meta files
    - run cBioPortal validation
    - create link versions
    - update process tracking end
    - Create dashboard tables and plots

    Args:
        genie_version: GENIE version,
        processing_date: processing date
        cbioportal_path: Path to cbioportal validator
        oncotree_link: Link to oncotree codes
        consortium_release_cutoff: release cut off value in days
        pemfile: Path to private key file
        test: Test flag, uses test databases
        staging: Staging flag, uses staging databases
        debug:  Synapse debug flag
        skip_mutationsincis: Skip mutation in cis filter
    """
    syn = process_functions.synLogin(pemfile, debug=debug)
    genie_user = os.environ.get("GENIE_USER")
    if pemfile is not None:
        genie_pass = process_functions.get_password(pemfile)
    else:
        genie_pass = None

    # Test/staging/production runs each use their own syn-id mapping table;
    # staging additionally forces the mutations-in-cis filter off.
    if test:
        databaseSynIdMappingId = "syn11600968"
        genie_version = "TESTING"
    elif staging:
        skip_mutationsincis = True
        databaseSynIdMappingId = "syn12094210"
    else:
        databaseSynIdMappingId = "syn10967259"
    # Database/folder syn id mapping
    databaseSynIdMapping = syn.tableQuery(
        "select * from {}".format(databaseSynIdMappingId))
    databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame()
    # databaseSynIdMappingDf.index = databaseSynIdMappingDf.Database
    # del databaseSynIdMappingDf['Database']
    # databaseSynIdMappingDf.to_dict()

    # Fall back to the oncotree URL stored in Synapse when none was given.
    if oncotree_link is None:
        oncoLink = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "oncotreeLink"].values[0]
        oncoLinkEnt = syn.get(oncoLink)
        oncotree_link = oncoLinkEnt.externalURL

    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)

    cbioValidatorPath = os.path.join(
        cbioportal_path, "core/src/main/scripts/importer/validateData.py")
    assert os.path.exists(
        cbioValidatorPath), "Please specify correct cbioportalPath"
    # Long timeout: release queries against large tables can be slow.
    syn.table_query_timeout = 50000

    consortiumSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "consortium"].values[0]
    processTrackerSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "processTracker"].values[0]
    # get syn id of case list folder in consortium release
    # caseListSynId = findCaseListId(syn, consortiumSynId)
    caseListSynId, _ = database_to_staging.search_and_create_folder(
        syn, consortiumSynId, "case_lists")

    if not staging:
        database_to_staging.update_process_trackingdf(syn,
                                                      processTrackerSynId,
                                                      "SAGE",
                                                      "dbToStage",
                                                      start=True)

    centerMappingSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "centerMapping"].values[0]
    # Only release files where release is true
    center_mapping = syn.tableQuery(
        "SELECT * FROM {} where release is true".format(centerMappingSynId))
    center_mappingdf = center_mapping.asDataFrame()
    processingDate = datetime.datetime.strptime(processing_date, "%b-%Y")

    logger.info("STAGING TO CONSORTIUM")
    genePanelEntities = database_to_staging.stagingToCbio(
        syn,
        processingDate,
        genie_version,
        center_mappingdf,
        databaseSynIdMappingDf,
        oncotree_url=oncotree_link,
        consortiumReleaseCutOff=consortium_release_cutoff,
        current_release_staging=staging,
        skipMutationsInCis=skip_mutationsincis,
        test=test,
        genie_user=genie_user,
        genie_pass=genie_pass,
    )

    # Create case lists files
    logger.info("CREATE CASE LIST FILES")
    # Remove old caselists first
    if not os.path.exists(database_to_staging.CASE_LIST_PATH):
        os.mkdir(database_to_staging.CASE_LIST_PATH)
    caselists = os.listdir(database_to_staging.CASE_LIST_PATH)
    for caselist in caselists:
        os.remove(os.path.join(database_to_staging.CASE_LIST_PATH, caselist))
    clinical_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "data_clinical_{}.txt".format(genie_version),
    )
    assay_information_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "assay_information_{}.txt".format(genie_version),
    )
    create_case_lists.main(
        clinical_path,
        assay_information_path,
        database_to_staging.CASE_LIST_PATH,
        "genie_private",
    )
    # Upload every generated case list file to the consortium folder.
    caseListFiles = os.listdir(database_to_staging.CASE_LIST_PATH)
    caseListEntities = []
    for casePath in caseListFiles:
        casePath = os.path.join(database_to_staging.CASE_LIST_PATH, casePath)
        caseListEntities.append(
            database_to_staging.store_file(syn,
                                           casePath,
                                           parent=caseListSynId,
                                           genieVersion=genie_version))

    logger.info("REMOVING UNNECESSARY FILES")
    # Keep only files tagged with this version, meta files and case lists.
    genie_files = os.listdir(database_to_staging.GENIE_RELEASE_DIR)
    for genie_file in genie_files:
        if (genie_version not in genie_file and "meta" not in genie_file
                and "case_lists" not in genie_file):
            os.remove(
                os.path.join(database_to_staging.GENIE_RELEASE_DIR,
                             genie_file))
    os.remove(clinical_path)

    logger.info("REVISE METADATA FILES")
    database_to_staging.revise_metadata_files(syn, staging, consortiumSynId,
                                              genie_version)

    logger.info("CBIO VALIDATION")

    # Must be exit 0 because the validator sometimes fails,
    # but we still want to capture the output
    # (hence the joined shell string with "; exit 0" and shell=True).

    command = [
        cbioValidatorPath,
        "-s",
        database_to_staging.GENIE_RELEASE_DIR,
        "-n",
        "; exit 0",
    ]
    cbioOutput = subprocess.check_output(" ".join(command), shell=True)
    logger.info(cbioOutput.decode("utf-8"))

    # Persist the validator log to Synapse for production runs only.
    cbio_validator_log = f"cbioValidatorLogsConsortium_{genie_version}.txt"
    if not test and not staging:
        log_folder_synid = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "logs"].values[0]
        with open(cbio_validator_log, "w") as cbio_log:
            cbio_log.write(cbioOutput.decode("utf-8"))
        syn.store(
            synapseclient.File(cbio_validator_log, parentId=log_folder_synid))
        os.remove(cbio_validator_log)
    logger.info("REMOVING OLD FILES")

    process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
    private_cna_meta_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "genie_private_meta_cna_hg19_seg.txt")
    if os.path.exists(private_cna_meta_path):
        os.unlink(private_cna_meta_path)

    logger.info("CREATING LINK VERSION")
    # Returns release and case list folder
    folders = database_to_staging.create_link_version(syn, genie_version,
                                                      caseListEntities,
                                                      genePanelEntities,
                                                      databaseSynIdMappingDf)

    if not staging:
        database_to_staging.update_process_trackingdf(syn,
                                                      processTrackerSynId,
                                                      "SAGE",
                                                      "dbToStage",
                                                      start=False)

    if not test:
        logger.info("DASHBOARD UPDATE")
        dashboard_table_updater.run_dashboard(syn,
                                              databaseSynIdMappingDf,
                                              genie_version,
                                              staging=staging)
        generate_dashboard_html(genie_version,
                                staging=staging,
                                genie_user=genie_user,
                                genie_pass=genie_pass)
        logger.info("DASHBOARD UPDATE COMPLETE")
        logger.info("AUTO GENERATE DATA GUIDE")

    # assumes the URL has exactly one '=' with the version after it -- verify
    oncotree_version = oncotree_link.split("=")[1]
    data_guide_pdf = generate_data_guide(
        genie_version,
        oncotree_version=oncotree_version,
        database_mapping=databaseSynIdMappingId,
        genie_user=genie_user,
        genie_pass=genie_pass,
    )
    database_to_staging.store_file(
        syn,
        data_guide_pdf,
        genieVersion=genie_version,
        parent=folders["release_folder"],
    )
    logger.info("COMPLETED DATABASE TO STAGING")